Example 1
    def parseGLTextFile(self, infoDict):
        """
        parse a GreenLogger text file
        """
        dataRecItemCt = 0
        dataRecsAdded = 0
        dataRecsDupSkipped = 0

        #        print "About to put lines into temp table"
        self.putTextLinesIntoTmpTable(infoDict)
        #        print "Finished putting lines into temp table"
        # get the metadata header
        stSQL = """SELECT Line FROM tmpLines WHERE Line LIKE '{"Instrument identifier":%' GROUP BY Line;"""
        mtDat = scidb.curT.execute(stSQL).fetchone()
        dictHd = ast.literal_eval(mtDat['Line'])
        # somewhat redundant to do 'LoggerSerialNumber' here (fn 'assureChannelIsInDB' would fill in
        # later), but allows putting Model into the DB
        iInstSpecID = scidb.assureItemIsInTableField(dictHd['Model'],
                                                     "InstrumentSpecs",
                                                     "InstrumentSpec")
        iLoggerID = scidb.assureItemIsInTableField(
            dictHd['Instrument identifier'], "Loggers", "LoggerSerialNumber")
        scidb.curD.execute(
            "UPDATE Loggers SET InstrumentSpecID = ? WHERE ID = ?;",
            (iInstSpecID, iLoggerID))

        # get the hour offset(s); most files will have only one
        stSQL = "SELECT substr(Line, 21, 3) as TZ FROM tmpLines " \
                "WHERE Line LIKE '____-__-__ __:__:__ ___%' GROUP BY TZ;"
        hrOffsets = scidb.curT.execute(stSQL).fetchall()
        for hrOffset in hrOffsets:
            iTimeZoneOffset = int(hrOffset['TZ'])
            # make a dictionary of channel IDs for data lines that have this hour offset
            # would be slightly different for different hour offsets
            lCols = dictHd['Columns']  # get col metadata; this is a list of dictionaries
            #            # it is indexed by the columns list, and is zero-based
            #            lCh = [0 for i in range(len(lCols))] # initially, fill with all zeroes
            dictChannels = {}  # this is the dictionary we will build
            for dictCol in lCols:
                # somewhat redundant to fill these in before calling function 'assureChannelIsInDB', but
                #  allows assigning sensor device types
                iDeviceTypeID = scidb.assureItemIsInTableField(
                    dictCol['Device'], "DeviceSpecs", "DeviceSpec")
                iSensorID = scidb.assureItemIsInTableField(
                    dictCol['Identifier'], "Sensors", "SensorSerialNumber")
                scidb.curD.execute(
                    "UPDATE Sensors SET DeviceSpecID = ? WHERE ID = ?;",
                    (iDeviceTypeID, iSensorID))
                #                iDataTypeID = scidb.assureItemIsInTableField(dictCol['DataType'], "DataTypes", "TypeText")
                #                iDataUnitsID = scidb.assureItemIsInTableField(dictCol['DataUnits'], "DataUnits", "UnitsText")
                # build list to create the channel
                # list items are: ChannelID, originalCol, Logger, Sensor, dataType, dataUnits, hrOffset, new
                lChannel = [
                    0, dictCol['Order'], dictHd['Instrument identifier'],
                    dictCol['Identifier'], dictCol['DataType'],
                    dictCol['DataUnits'], iTimeZoneOffset, ''
                ]
                dictChannels[dictCol['Order']] = (
                    lChannel[:])  # the key is the column number
                scidb.assureChannelIsInDB(dictChannels[
                    dictCol['Order']])  # get or create the channel
                iChannelID = dictChannels[dictCol['Order']][0]
                # store the column name as a Series
                iSeriesID = scidb.assureItemIsInTableField(
                    dictCol['Name'], "DataSeries", "DataSeriesDescription")
                # tie it to this Channel, to offer later
                stSQLcs = 'INSERT INTO tmpChanSegSeries(ChannelID, SeriesID) VALUES (?, ?);'
                try:
                    scidb.curD.execute(stSQLcs, (iChannelID, iSeriesID))
                except sqlite3.IntegrityError:
                    pass  # silently ignore duplicates

#            print 'Before Channel function'
#            for ky in dictChannels.keys():
#                print ky, dictChannels[ky][:]
#            for ky in dictChannels.keys():
#                scidb.assureChannelIsInDB(dictChannels[ky])
#            print 'After Channel function'
#            for ky in dictChannels.keys():
#                print ky, dictChannels[ky][:]

            # make a list of channel IDs for the set of lines with this HrOffset, for quick lookup
            # it is indexed by the columns list, and is zero-based
            lCh = []
            for iCol in range(len(lCols)):
                iNomCol = iCol + 1
                if iNomCol in dictChannels:
                    lChanSet = dictChannels[iNomCol][:]
                    lCh.append(lChanSet[0])
            else:  # does not correspond to a data column
                lCh.append(0)  # placeholder, to make list indexes work right

            # done setting up channels, get data lines
            stSQL = "SELECT ID, Line FROM tmpLines WHERE substr(Line, 21, 3) = ? " \
                "AND Line LIKE '____-__-__ __:__:__ ___%' ORDER BY ID;"
            recs = scidb.curT.execute(stSQL, (hrOffset['TZ'], )).fetchall()
            for rec in recs:
                lData = rec['Line'].split('\t')
                # item zero is the timestamp followed by the timezone offset
                sTimeStamp = lData[0][:-4]  # drop timezone offset, we already have it
                tsAsTime = datetime.datetime.strptime(sTimeStamp,
                                                      "%Y-%m-%d %H:%M:%S")
                tsAsTime = tsAsTime.replace(tzinfo=None)  # reassign: replace() returns a new object; strips any timezone info
                tsAsTime = tsAsTime + datetime.timedelta(
                    hours=-iTimeZoneOffset)
                tsAsDate = tsAsTime.date()
                stSQL = "INSERT INTO Data (UTTimestamp, ChannelID, Value) VALUES (?, ?, ?)"
                for iCol in range(len(lData)):
                    if iCol > 0:  # an item of data
                        # give some progress diagnostics
                        dataRecItemCt += 1
                        if dataRecItemCt % 100 == 0:
                            self.msgArea.ChangeValue("Line " + str(rec['ID']) +
                                                     " of " +
                                                     str(infoDict['lineCt']) +
                                                     "; " +
                                                     str(dataRecsAdded) +
                                                     " records added, " +
                                                     str(dataRecsDupSkipped) +
                                                     " duplicates skipped.")
                            wx.Yield()
                        try:  # much faster to try and fail than to test first
                            scidb.curD.execute(
                                stSQL, (tsAsTime, lCh[iCol], lData[iCol]))
                            dataRecsAdded += 1  # count it
                        except sqlite3.IntegrityError:  # item is already in Data table
                            dataRecsDupSkipped += 1  # count but otherwise ignore
                        finally:
                            wx.Yield()

        # finished parsing lines
        infoDict['numNewDataRecsAdded'] = dataRecsAdded
        infoDict['numDupDataRecsSkipped'] = dataRecsDupSkipped
        infoDict['stParseMsg'] = str(infoDict['lineCt']) + " lines processed; " + \
            str(dataRecsAdded) + " data records added to database, " + \
            str(dataRecsDupSkipped) + " duplicates skipped."
        self.msgArea.ChangeValue(infoDict['stParseMsg'])
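
A minimal sketch of the timestamp normalization this parser performs, assuming data lines start with "YYYY-MM-DD HH:MM:SS" followed by a signed two-digit hour offset (the same fixed-width layout the substr/LIKE patterns above key on); the sample line is invented:

import datetime

def to_utc(sLine):
    # assumption: the first 19 characters are the local timestamp and
    # characters 21-23 (1-based, matching substr(Line, 21, 3)) are the offset
    sStamp = sLine[:19]            # e.g. "2014-06-01 13:00:00"
    iOffset = int(sLine[20:23])    # "+05" -> 5, "-08" -> -8
    tsLocal = datetime.datetime.strptime(sStamp, "%Y-%m-%d %H:%M:%S")
    # subtracting the offset converts local time to universal time,
    # exactly as the parser does before inserting into Data
    return tsLocal + datetime.timedelta(hours=-iOffset)

print(to_utc("2014-06-01 13:00:00 +05\t21.5"))  # 2014-06-01 08:00:00
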
Example 2
    def parseHoboWareTextFile(self, infoDict):
        """
        parse a data file exported as text by HoboWare
        """
        sStrip = '" \x0a\x0d'  # characters to strip from parsed items
        # regular expression pattern to find logger number
        pLogger = re.compile(r'LGR S/N: (?P<nLogger>\d+)')
        # regular expression pattern to find sensor number
        pSensor = re.compile(r'SEN S/N: (?P<nSensor>\d+)')
        # regular expression pattern to find hour offset
        pHrOffset = re.compile(r'Time, GMT(?P<sHrOffset>.+)')
        dataRecItemCt = 0
        dataRecsAdded = 0
        dataRecsDupSkipped = 0

        print "About to put lines into temp table"
        self.putTextLinesIntoTmpTable(infoDict)
        print "Finished putting lines into temp table"
        #parse file, start with header line; 1st line in this file format
        scidb.curT.execute("SELECT * FROM tmpLines ORDER BY ID;")
        for rec in scidb.curT:
            if rec['ID'] == 1:
                """
                Build a dictionary of the channels.
                The indexes will be the column numbers because all data files have at least that.
                The value will be an 8-item list.
                The first item in each list will be the primary key in the Channels table.
                The list is first created with this = 0.
                The rest of the list is built up, then the list is sent to a function
                 that looks in the database.
                The function fills in the primary key, either existing or newly created.
                List item 7 (the last) will be "new" if the record is new, otherwise "existing".
                The list contains the text values of logger serial number, sensor serial number,
                 data type, and data units.
                The function takes care of filling in all these in their respective tables.
                When the dictionary is complete, the calling procedure can quickly insert data values
                 into the data table by just pulling list item [0] for the dictionary key, which key
                 is the column number in the source file.
                This loose structure allows some kludgy workarounds for bugs that were in some versions
                 of the data files.
                """
                lHdrs = rec['Line'].split('\t')
                # ignore item zero, just a pound sign and possibly three junk characters
                # item 1 is the hour offset, and a clue to export bugs we need to work around
                sHd = lHdrs[1].strip(sStrip)
                m = pHrOffset.search(sHd)
                if m:
                    sTimeOffset = m.group('sHrOffset')
                    lTimeOffsetComponents = sTimeOffset.split(':')
                    sTimeOffsetHrs = lTimeOffsetComponents[0]
                    iHrOffset = int(sTimeOffsetHrs)
                else:
                    iHrOffset = 0

                dictChannels = {}
                # list items are: ChannelID, originalCol, Logger, Sensor, dataType, dataUnits, hrOffset, new
                lChannel = [0, 0, '', '', '', '', iHrOffset, '']

                for iCol in range(len(lHdrs)):
                    # skip items 0 & 1
                    if iCol > 1:  # a header for a data column
                        lChannel[1] = iCol + 1  # stored columns are 1-based
                        sHd = lHdrs[iCol].strip(sStrip)
                        # get the type and units
                        lTypeUnits = sHd.split('(', 2)
                        sTypeUnits = lTypeUnits[0].strip(' ')
                        lTypeUnits = sTypeUnits.split(',')

                        ## TNP_MOD
                        print("TNP debug1")
                        print(lTypeUnits)
                        ## If no units were specified, default to 'NA'
                        if len(lTypeUnits) == 1:
                            lTypeUnits.append('NA')

                        if lTypeUnits[0]:
                            sType = lTypeUnits[0].strip(' ')
                        else:
                            sType = '(na)'
                        lChannel[4] = sType
                        if lTypeUnits[1]:
                            sUnits = lTypeUnits[1].strip(' ')
                        else:
                            sUnits = '(na)'
                        lChannel[5] = sUnits
                        # get the logger ID and sensor ID
                        m = pLogger.search(sHd)
                        if m:
                            sLoggerID = m.group('nLogger')
                        else:
                            sLoggerID = '(na)'
                        lChannel[2] = sLoggerID
                        m = pSensor.search(sHd)
                        if m:
                            sSensorID = m.group('nSensor')
                        else:
                            sSensorID = "(na)"
                        lChannel[3] = sSensorID
                        dictChannels[iCol + 1] = (lChannel[:])
                # gone through all the headers, apply bug workarounds here

                print 'Before Channel function'
                for ky in dictChannels.keys():
                    print ky, dictChannels[ky][:]
                for ky in dictChannels.keys():
                    scidb.assureChannelIsInDB(dictChannels[ky])
                print 'After Channel function'
                for ky in dictChannels.keys():
                    print ky, dictChannels[ky][:]
                # make a list of channel IDs for the rest of this file, for quick lookup
                # it is indexed by the columns list, and is zero-based
                lCh = []
                for iCol in range(len(lHdrs)):
                    iNomCol = iCol + 1
                    if iNomCol in dictChannels:
                        lChanSet = dictChannels[iNomCol][:]
                        lCh.append(lChanSet[0])
                    else:  # does not correspond to a data column
                        lCh.append(0)  # placeholder, to make list indexes work right

            else:  # not the 1st (header) line, but a line of data
                lData = rec['Line'].split('\t')
                # ignore item zero, a line number, not used
                sTimeStamp = lData[1]
                tsAsTime = datetime.datetime.strptime(sTimeStamp,
                                                      "%Y-%m-%d %H:%M:%S")
                tsAsTime = tsAsTime.replace(tzinfo=None)  # reassign: replace() returns a new object; strips any timezone info
                tsAsTime = tsAsTime + datetime.timedelta(hours=-iHrOffset)
                tsAsDate = tsAsTime.date()
                stSQL = "INSERT INTO Data (UTTimestamp, ChannelID, Value) VALUES (?, ?, ?)"
                for iCol in range(len(lData)):
                    if iCol > 1:  # an item of data
                        # give some progress diagnostics
                        dataRecItemCt += 1
                        if dataRecItemCt % 100 == 0:
                            self.msgArea.ChangeValue("Line " + str(rec['ID']) +
                                                     " of " +
                                                     str(infoDict['lineCt']) +
                                                     "; " +
                                                     str(dataRecsAdded) +
                                                     " records added, " +
                                                     str(dataRecsDupSkipped) +
                                                     " duplicates skipped.")
                            wx.Yield()
                        try:  # much faster to try and fail than to test first
                            scidb.curD.execute(
                                stSQL, (tsAsTime, lCh[iCol], lData[iCol]))
                            dataRecsAdded += 1  # count it
                        except sqlite3.IntegrityError:  # item is already in Data table
                            dataRecsDupSkipped += 1  # count but otherwise ignore
                        finally:
                            wx.Yield()

        # finished parsing lines
        infoDict['numNewDataRecsAdded'] = dataRecsAdded
        infoDict['numDupDataRecsSkipped'] = dataRecsDupSkipped
        infoDict['stParseMsg'] = str(infoDict['lineCt']) + " lines processed; " + \
            str(dataRecsAdded) + " data records added to database, " + \
            str(dataRecsDupSkipped) + " duplicates skipped."
        self.msgArea.ChangeValue(infoDict['stParseMsg'])
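
The docstring's channel-dictionary layout is easier to see on a concrete toy: keys are 1-based column numbers, each value is the 8-item list, and the lookup function stamps item [0] with the Channels primary key. A hedged sketch with invented serial numbers and a stand-in for scidb.assureChannelIsInDB:

# list items: ChannelID, originalCol, Logger, Sensor, dataType, dataUnits, hrOffset, new
dictChannels = {
    3: [0, 3, '10234567', '9876543', 'Temp', 'C', -8, ''],
    4: [0, 4, '10234567', '9876544', 'RH', '%', -8, ''],
}

def fake_assureChannelIsInDB(lChannel):
    # stand-in: the real function looks the channel up in the Channels
    # table, inserts it if absent, and writes the primary key into item 0
    lChannel[0] = 40 + lChannel[1]  # invented primary key
    lChannel[7] = 'new'

for ky in sorted(dictChannels.keys()):
    fake_assureChannelIsInDB(dictChannels[ky])

# the quick-lookup list built above: zero-based over the source columns,
# with 0 padding the columns that carry no data
lCh = []
for iCol in range(5):
    iNomCol = iCol + 1
    lCh.append(dictChannels[iNomCol][0] if iNomCol in dictChannels else 0)
print(lCh)  # [0, 0, 43, 44, 0]
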
Example 3
    def parseBlueTermFile(self, infoDict):
        """
        a BlueTerm log file can contain captured data from a series of daily
        GL text files, possibly even from different loggers
        """
        dataRecItemCt = 0
        dataRecsAdded = 0
        dataRecsDupSkipped = 0

        self.putTextLinesIntoTmpTable(infoDict)

        # find the different data dump segments
        # within each one, data will be from only one logger, and so we
        #  can use the same metadata header
        # make a list of 2-tuples; tuple item 0 = the starting ID of the
        # dump segment and tuple item 1 = the ending ID
        lDmpSegs = []
        stSQL = """SELECT ID, Line FROM tmpLines
            WHERE Line = '{"datadump":"begin"}' OR Line = '{"datadump":"end"}'
            ORDER BY ID;"""
        dmSegs = scidb.curT.execute(stSQL).fetchall()
        # begin or end may be missing, due to log interruption
        idSegStart = None
        idSegEnd = None
        # only use segments that have valid begin/end
        for dmSegItm in dmSegs:
            if dmSegItm['Line'] == '{"datadump":"begin"}':
                idSegStart = dmSegItm['ID']
            elif dmSegItm['Line'] == '{"datadump":"end"}':
                if idSegStart is not None:
                    # make the tuple
                    tSeg = (idSegStart, dmSegItm['ID'])
                    # append to list
                    lDmpSegs.append(tSeg)
                    # null the vars
                    idSegStart = None
                    idSegEnd = None
            else:  # this should not happen, but reset state if it does
                idSegStart = None
                idSegEnd = None

        print "dump segments:", lDmpSegs

        # being worked on >>>>
        iNumSegs = len(lDmpSegs)
        if iNumSegs == 0:
            infoDict['stParseMsg'] = " No valid data segments in file."
            # will skip following loop
        iCtSeg = 0
        for tSeg in lDmpSegs:
            iCtSeg += 1
            idSegStart, idSegEnd = tSeg
            # get the metadata header; duplicate copies within one dump segment are all identical
            stSQL = """SELECT Line 
                    FROM tmpLines 
                    WHERE ID > ? AND ID < ? 
                    AND Line LIKE '{"Instrument identifier":%'
                    GROUP BY Line;"""
            mtDat = scidb.curT.execute(stSQL, tSeg).fetchone()
            dictHd = ast.literal_eval(mtDat['Line'])
            # somewhat redundant to do 'LoggerSerialNumber' here (fn 'assureChannelIsInDB' would fill in
            # later), but allows putting Model into the DB
            iInstSpecID = scidb.assureItemIsInTableField(
                dictHd['Model'], "InstrumentSpecs", "InstrumentSpec")
            iLoggerID = scidb.assureItemIsInTableField(
                dictHd['Instrument identifier'], "Loggers",
                "LoggerSerialNumber")
            scidb.curD.execute(
                "UPDATE Loggers SET InstrumentSpecID = ? WHERE ID = ?;",
                (iInstSpecID, iLoggerID))

            # get the hour offset(s); most files will have only one
            stSQL = "SELECT substr(Line, 21, 3) as TZ FROM tmpLines " \
                    "WHERE ID > ? AND ID < ? " \
                    "AND Line LIKE '____-__-__ __:__:__ ___%' GROUP BY TZ;"
            hrOffsets = scidb.curT.execute(stSQL, tSeg).fetchall()
            for hrOffset in hrOffsets:
                iTimeZoneOffset = int(hrOffset['TZ'])
                # make a dictionary of channel IDs for data lines that have this hour offset
                # would be slightly different for different hour offsets
                lCols = dictHd['Columns']  # get col metadata; this is a list of dictionaries
                #            # it is indexed by the columns list, and is zero-based
                #            lCh = [0 for i in range(len(lCols))] # initially, fill with all zeroes
                dictChannels = {}  # this is the dictionary we will build
                for dictCol in lCols:
                    # somewhat redundant to fill these in before calling function 'assureChannelIsInDB', but
                    #  allows assigning sensor device types
                    iDeviceTypeID = scidb.assureItemIsInTableField(
                        dictCol['Device'], "DeviceSpecs", "DeviceSpec")
                    iSensorID = scidb.assureItemIsInTableField(
                        dictCol['Identifier'], "Sensors", "SensorSerialNumber")
                    scidb.curD.execute(
                        "UPDATE Sensors SET DeviceSpecID = ? WHERE ID = ?;",
                        (iDeviceTypeID, iSensorID))
                    #                iDataTypeID = scidb.assureItemIsInTableField(dictCol['DataType'], "DataTypes", "TypeText")
                    #                iDataUnitsID = scidb.assureItemIsInTableField(dictCol['DataUnits'], "DataUnits", "UnitsText")
                    # build list to create the channel
                    # list items are: ChannelID, originalCol, Logger, Sensor, dataType, dataUnits, hrOffset, new
                    lChannel = [
                        0, dictCol['Order'], dictHd['Instrument identifier'],
                        dictCol['Identifier'], dictCol['DataType'],
                        dictCol['DataUnits'], iTimeZoneOffset, ''
                    ]
                    dictChannels[dictCol['Order']] = (
                        lChannel[:])  # the key is the column number
                    scidb.assureChannelIsInDB(dictChannels[
                        dictCol['Order']])  # get or create the channel
                    iChannelID = dictChannels[dictCol['Order']][0]
                    # store the column name as a Series
                    iSeriesID = scidb.assureItemIsInTableField(
                        dictCol['Name'], "DataSeries", "DataSeriesDescription")
                    # tie it to this Channel, to offer later
                    stSQLcs = 'INSERT INTO tmpChanSegSeries(ChannelID, SeriesID) VALUES (?, ?);'
                    try:
                        scidb.curD.execute(stSQLcs, (iChannelID, iSeriesID))
                    except sqlite3.IntegrityError:
                        pass  # silently ignore duplicates

    #            print 'Before Channel function'
    #            for ky in dictChannels.keys():
    #                print ky, dictChannels[ky][:]
    #            for ky in dictChannels.keys():
    #                scidb.assureChannelIsInDB(dictChannels[ky])
    #            print 'After Channel function'
    #            for ky in dictChannels.keys():
    #                print ky, dictChannels[ky][:]

                # make a list of channel IDs for the set of lines with this HrOffset, for quick lookup
                # it is indexed by the columns list, and is zero-based
                lCh = []
                for iCol in range(len(lCols)):
                    iNomCol = iCol + 1
                    if iNomCol in dictChannels:
                        lChanSet = dictChannels[iNomCol][:]
                        lCh.append(lChanSet[0])
                    else:  # does not correspond to a data column
                        lCh.append(0)  # placeholder, to make list indexes work right

                # done setting up channels, get data lines
                stSQL = "SELECT ID, Line FROM tmpLines " \
                    "WHERE ID > ? AND ID < ? " \
                    "AND substr(Line, 21, 3) = ? " \
                    "AND Line LIKE '____-__-__ __:__:__ ___%' ORDER BY ID;"
                recs = scidb.curT.execute(stSQL, (
                    idSegStart,
                    idSegEnd,
                    hrOffset['TZ'],
                )).fetchall()
                iNumSegLines = len(recs)
                iCtSegLines = 0
                for rec in recs:
                    iCtSegLines += 1
                    lData = rec['Line'].split('\t')
                    # item zero is the timestamp followed by the timezone offset
                    sTimeStamp = lData[0][:-4]  # drop timezone offset, we already have it
                    tsAsTime = datetime.datetime.strptime(
                        sTimeStamp, "%Y-%m-%d %H:%M:%S")
                    tsAsTime = tsAsTime.replace(tzinfo=None)  # reassign: replace() returns a new object; strips any timezone info
                    tsAsTime = tsAsTime + datetime.timedelta(
                        hours=-iTimeZoneOffset)
                    tsAsDate = tsAsTime.date()
                    stSQL = "INSERT INTO Data (UTTimestamp, ChannelID, Value) VALUES (?, ?, ?)"
                    for iCol in range(len(lData)):
                        if iCol > 0:  # an item of data
                            # give some progress diagnostics
                            dataRecItemCt += 1
                            if dataRecItemCt % 100 == 0:
                                self.msgArea.ChangeValue(
                                    "Segment " + str(iCtSeg) + " of " +
                                    str(iNumSegs) + ", HrOffset " +
                                    str(iTimeZoneOffset) + ", Line " +
                                    str(iCtSegLines) + " of " +
                                    str(iNumSegLines) + "; " +
                                    str(dataRecsAdded) + " records added, " +
                                    str(dataRecsDupSkipped) +
                                    " duplicates skipped.")
                                wx.Yield()
                            try:  # much faster to try and fail than to test first
                                scidb.curD.execute(
                                    stSQL, (tsAsTime, lCh[iCol], lData[iCol]))
                                dataRecsAdded += 1  # count it
                            except sqlite3.IntegrityError:  # item is already in Data table
                                dataRecsDupSkipped += 1  # count but otherwise ignore
                            finally:
                                wx.Yield()
        # <<<<< being worked on
        # finished parsing lines
        infoDict['numNewDataRecsAdded'] = dataRecsAdded
        infoDict['numDupDataRecsSkipped'] = dataRecsDupSkipped
        infoDict['stParseMsg'] = str(infoDict['lineCt']) + " lines processed; " + \
            str(dataRecsAdded) + " data records added to database, " + \
            str(dataRecsDupSkipped) + " duplicates skipped."
        self.msgArea.ChangeValue(infoDict['stParseMsg'])
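
The begin/end pairing above tolerates interrupted captures: an "end" with no pending "begin" is ignored, and a later "begin" simply supersedes a stray one. A self-contained sketch of that pairing logic, on invented row IDs:

rows = [(5, '{"datadump":"begin"}'), (90, '{"datadump":"end"}'),
        (120, '{"datadump":"begin"}'),   # its "end" was lost to interruption
        (130, '{"datadump":"begin"}'), (210, '{"datadump":"end"}')]
lDmpSegs = []
idSegStart = None
for rid, line in rows:
    if line == '{"datadump":"begin"}':
        idSegStart = rid              # a later begin supersedes a stray one
    elif line == '{"datadump":"end"}' and idSegStart is not None:
        lDmpSegs.append((idSegStart, rid))
        idSegStart = None
print(lDmpSegs)  # [(5, 90), (130, 210)]
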
Example 4
    def parseHoboWareTextFile(self, infoDict):
        """
        parse a data file exported as text by HoboWare
        """
        sStrip = '" \x0a\x0d' # characters to strip from parsed items
        # regular expression pattern to find logger number
        pLogger = re.compile(r'LGR S/N: (?P<nLogger>\d+)')
        # regular expression pattern to find sensor number
        pSensor = re.compile(r'SEN S/N: (?P<nSensor>\d+)')
        # regular expression pattern to find hour offset
        pHrOffset = re.compile(r'Time, GMT(?P<sHrOffset>.+)')
        dataRecItemCt = 0
        dataRecsAdded = 0
        dataRecsDupSkipped = 0

        print "About to put lines into temp table"
        self.putTextLinesIntoTmpTable(infoDict)
        print "Finished putting lines into temp table"
        #parse file, start with header line; 1st line in this file format
        scidb.curT.execute("SELECT * FROM tmpLines ORDER BY ID;")
        for rec in scidb.curT:
            if rec['ID'] == 1:
                """
                Build a dictionary of the channels.
                The indexes will be the column numbers because all data files have at least that.
                The value will be an 8-item list.
                The first item in each list will be the primary key in the Channels table.
                The list is first created with this = 0.
                The rest of the list is built up, then the list is sent to a function
                 that looks in the database.
                The function fills in the primary key, either existing or newly created.
                List item 7 (the last) will be "new" if the record is new, otherwise "existing".
                The list contains the text values of logger serial number, sensor serial number,
                 data type, and data units.
                The function takes care of filling in all these in their respective tables.
                When the dictionary is complete, the calling procedure can quickly insert data values
                 into the data table by just pulling list item [0] for the dictionary key, which key
                 is the column number in the source file.
                This loose structure allows some kludgy workarounds for bugs that were in some versions
                 of the data files.
                """
                lHdrs = rec['Line'].split('\t')
                # ignore item zero, just a pound sign and possibly three junk characters
                # item 1 is the hour offset, and a clue to export bugs we need to work around
                sHd = lHdrs[1].strip(sStrip)
                m = pHrOffset.search(sHd)
                if m:
                    sTimeOffset = m.group('sHrOffset')
                    lTimeOffsetComponents = sTimeOffset.split(':')
                    sTimeOffsetHrs = lTimeOffsetComponents[0]
                    iHrOffset = int(sTimeOffsetHrs)
                else:
                    iHrOffset = 0
               
                dictChannels = {}
                # list items are: ChannelID, originalCol, Logger, Sensor, dataType, dataUnits, hrOffset, new
                lChannel = [0, 0, '', '', '', '', iHrOffset, '']

                for iCol in range(len(lHdrs)):
                    # skip items 0 & 1
                    if iCol > 1: # a header for a data column
                        lChannel[1] = iCol + 1 # stored columns are 1-based
                        sHd = lHdrs[iCol].strip(sStrip)
                        # get the type and units
                        lTypeUnits = sHd.split('(',2)
                        sTypeUnits = lTypeUnits[0].strip(' ')
                        lTypeUnits = sTypeUnits.split(',')
                        
                        ## TNP_MOD
                        ## If no units were specified, default to 'NA'
                        if len(lTypeUnits) == 1:
                            lTypeUnits.append('NA')
                        
                        if lTypeUnits[0]:
                            sType = lTypeUnits[0].strip(' ')
                        else:
                            sType = '(na)'
                        lChannel[4] = sType
                        if lTypeUnits[1]:
                            sUnits = lTypeUnits[1].strip(' ')
                        else:
                            sUnits = '(na)'
                        lChannel[5] = sUnits
                        # get the logger ID and sensor ID
                        m = pLogger.search(sHd)
                        if m:
                            sLoggerID = m.group('nLogger')
                        else:
                            sLoggerID = '(na)'
                        lChannel[2] = sLoggerID
                        m = pSensor.search(sHd)
                        if m:
                            sSensorID = m.group('nSensor')
                        else:
                            sSensorID = "(na)"
                        lChannel[3] = sSensorID
                        dictChannels[iCol + 1] = (lChannel[:])
                # gone through all the headers, apply bug workarounds here
                
                print 'Before Channel function'
                for ky in dictChannels.keys():
                    print ky, dictChannels[ky][:]
                for ky in dictChannels.keys():
                    scidb.assureChannelIsInDB(dictChannels[ky])
                print 'After Channel function'
                for ky in dictChannels.keys():
                    print ky, dictChannels[ky][:]
                # make a list of channel IDs for the rest of this file, for quick lookup
                # it is indexed by the columns list, and is zero-based
                lCh = []
                for iCol in range(len(lHdrs)):
                    iNomCol = iCol + 1
                    if iNomCol in dictChannels:
                        lChanSet = dictChannels[iNomCol][:]
                        lCh.append(lChanSet[0])
                    else: # does not correspond to a data column
                        lCh.append(0) # placeholder, to make list indexes work right
                
            else: # not the 1st (header) line, but a line of data
                lData = rec['Line'].split('\t')
                # ignore item zero, a line number, not used
                sTimeStamp = lData[1]
                tsAsTime = datetime.datetime.strptime(sTimeStamp, "%Y-%m-%d %H:%M:%S")
                tsAsTime = tsAsTime.replace(tzinfo=None) # reassign: replace() returns a new object; strips any timezone info
                tsAsTime = tsAsTime + datetime.timedelta(hours = -iHrOffset)
                tsAsDate = tsAsTime.date()
                stSQL = "INSERT INTO Data (UTTimestamp, ChannelID, Value) VALUES (?, ?, ?)"
                for iCol in range(len(lData)):
                    if iCol > 1: # an item of data
                        # give some progress diagnostics
                        dataRecItemCt += 1
                        if dataRecItemCt % 100 == 0:
                            self.msgArea.ChangeValue("Line " + str(rec['ID']) +
                                " of " + str(infoDict['lineCt']) + "; " +
                                str(dataRecsAdded) + " records added, " +
                                str(dataRecsDupSkipped) + " duplicates skipped.")
                            wx.Yield()
                        try: # much faster to try and fail than to test first
                            scidb.curD.execute(stSQL, (tsAsTime, lCh[iCol], lData[iCol]))
                            dataRecsAdded += 1 # count it
                        except sqlite3.IntegrityError: # item is already in Data table
                            dataRecsDupSkipped += 1 # count but otherwise ignore
                        finally:
                            wx.Yield()

        # finished parsing lines
        infoDict['numNewDataRecsAdded'] = dataRecsAdded
        infoDict['numDupDataRecsSkipped'] = dataRecsDupSkipped
        infoDict['stParseMsg'] = str(infoDict['lineCt']) + " lines processed; " + \
            str(dataRecsAdded) + " data records added to database, " + \
            str(dataRecsDupSkipped) + " duplicates skipped."
        self.msgArea.ChangeValue(infoDict['stParseMsg'])        
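
The header split in this variant is worth isolating: everything before the first "(" is "Type, Units", and the TNP_MOD pads a missing units field so flag columns such as "Bad Battery" do not break the indexing. A hedged sketch of that parsing, with invented header strings (real exports may vary):

import re

pLogger = re.compile(r'LGR S/N: (?P<nLogger>\d+)')
pSensor = re.compile(r'SEN S/N: (?P<nSensor>\d+)')

def split_header(sHd):
    lTypeUnits = sHd.split('(', 2)[0].strip(' ').split(',')
    if len(lTypeUnits) == 1:   # no units: a flag column like "Bad Battery"
        lTypeUnits.append('NA')
    sType = lTypeUnits[0].strip(' ') or '(na)'
    sUnits = lTypeUnits[1].strip(' ') or '(na)'
    mL, mS = pLogger.search(sHd), pSensor.search(sHd)
    return (sType, sUnits,
            mL.group('nLogger') if mL else '(na)',
            mS.group('nSensor') if mS else '(na)')

print(split_header("Temp, C (LGR S/N: 10234567, SEN S/N: 9876543)"))
# ('Temp', 'C', '10234567', '9876543')
print(split_header("Bad Battery (LGR S/N: 10234567)"))
# ('Bad Battery', 'NA', '10234567', '(na)')
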
Example 5
    def parseGLTextFile(self, infoDict):
        """
        parse a GreenLogger text file
        """
        dataRecItemCt = 0
        dataRecsAdded = 0
        dataRecsDupSkipped = 0

#        print "About to put lines into temp table"
        self.putTextLinesIntoTmpTable(infoDict)
#        print "Finished putting lines into temp table"
        # get the metadata header
        stSQL = """SELECT Line FROM tmpLines WHERE Line LIKE '{"Instrument identifier":%' GROUP BY Line;"""
        mtDat = scidb.curT.execute(stSQL).fetchone()
        dictHd = ast.literal_eval(mtDat['Line'])
        # somewhat redundant to do 'LoggerSerialNumber' here (fn 'assureChannelIsInDB' would fill in
        # later), but allows putting Model into the DB
        iInstSpecID = scidb.assureItemIsInTableField(dictHd['Model'], "InstrumentSpecs", "InstrumentSpec")
        iLoggerID = scidb.assureItemIsInTableField(dictHd['Instrument identifier'], "Loggers", "LoggerSerialNumber")
        scidb.curD.execute("UPDATE Loggers SET InstrumentSpecID = ? WHERE ID = ?;", (iInstSpecID, iLoggerID))
        
        # get the hour offset(s); most files will have only one
        stSQL = "SELECT substr(Line, 21, 3) as TZ FROM tmpLines " \
                "WHERE Line LIKE '____-__-__ __:__:__ ___%' GROUP BY TZ;"
        hrOffsets = scidb.curT.execute(stSQL).fetchall()
        for hrOffset in hrOffsets:
            iTimeZoneOffset = int(hrOffset['TZ'])
            # make a dictionary of channel IDs for data lines that have this hour offset
            # would be slightly different for different hour offsets
            lCols = dictHd['Columns'] # get col metadata; this is a list of dictionaries
#            # it is indexed by the columns list, and is zero-based
#            lCh = [0 for i in range(len(lCols))] # initially, fill with all zeroes
            dictChannels = {} # this is the dictionary we will build
            for dictCol in lCols:
                # somewhat redundant to fill these in before calling function 'assureChannelIsInDB', but
                #  allows assigning sensor device types
                iDeviceTypeID = scidb.assureItemIsInTableField(dictCol['Device'], "DeviceSpecs", "DeviceSpec")
                iSensorID = scidb.assureItemIsInTableField(dictCol['Identifier'], "Sensors", "SensorSerialNumber")
                scidb.curD.execute("UPDATE Sensors SET DeviceSpecID = ? WHERE ID = ?;", (iDeviceTypeID, iSensorID))
#                iDataTypeID = scidb.assureItemIsInTableField(dictCol['DataType'], "DataTypes", "TypeText")
#                iDataUnitsID = scidb.assureItemIsInTableField(dictCol['DataUnits'], "DataUnits", "UnitsText")
                # build list to create the channel
                # list items are: ChannelID, originalCol, Logger, Sensor, dataType, dataUnits, hrOffset, new
                lChannel = [0, dictCol['Order'], dictHd['Instrument identifier'], dictCol['Identifier'],
                            dictCol['DataType'], dictCol['DataUnits'], iTimeZoneOffset, '']
                dictChannels[dictCol['Order']] = (lChannel[:]) # the key is the column number
                scidb.assureChannelIsInDB(dictChannels[dictCol['Order']]) # get or create the channel
                iChannelID = dictChannels[dictCol['Order']][0]
                # store the column name as a Series
                iSeriesID = scidb.assureItemIsInTableField(dictCol['Name'], "DataSeries", "DataSeriesDescription")
                # tie it to this Channel, to offer later
                stSQLcs = 'INSERT INTO tmpChanSegSeries(ChannelID, SeriesID) VALUES (?, ?);'
                try:
                    scidb.curD.execute(stSQLcs, (iChannelID, iSeriesID))
                except sqlite3.IntegrityError:
                    pass # silently ignore duplicates

#            print 'Before Channel function'
#            for ky in dictChannels.keys():
#                print ky, dictChannels[ky][:]
#            for ky in dictChannels.keys():
#                scidb.assureChannelIsInDB(dictChannels[ky])
#            print 'After Channel function'
#            for ky in dictChannels.keys():
#                print ky, dictChannels[ky][:]

            # make a list of channel IDs for the set of lines with this HrOffset, for quick lookup
            # it is indexed by the columns list, and is zero-based
            lCh = []
            for iCol in range(len(lCols)):
                iNomCol = iCol + 1
                if iNomCol in dictChannels:
                    lChanSet = dictChannels[iNomCol][:]
                    lCh.append(lChanSet[0])
                else: # does not correspond to a data column
                    lCh.append(0) # placeholder, to make list indexes work right
        
            # done setting up channels, get data lines
            stSQL = "SELECT ID, Line FROM tmpLines WHERE substr(Line, 21, 3) = ? " \
                "AND Line LIKE '____-__-__ __:__:__ ___%' ORDER BY ID;"
            recs = scidb.curT.execute(stSQL, (hrOffset['TZ'],)).fetchall()
            for rec in recs:
                lData = rec['Line'].split('\t')
                # item zero is the timestamp followed by the timezone offset
                sTimeStamp = lData[0][:-4] # drop timezone offset, we already have it
                tsAsTime = datetime.datetime.strptime(sTimeStamp, "%Y-%m-%d %H:%M:%S")
                tsAsTime = tsAsTime.replace(tzinfo=None) # reassign: replace() returns a new object; strips any timezone info
                tsAsTime = tsAsTime + datetime.timedelta(hours = -iTimeZoneOffset)
                tsAsDate = tsAsTime.date()
                stSQL = "INSERT INTO Data (UTTimestamp, ChannelID, Value) VALUES (?, ?, ?)"
                for iCol in range(len(lData)):
                    if iCol > 0: # an item of data
                        # give some progress diagnostics
                        dataRecItemCt += 1
                        if dataRecItemCt % 100 == 0:
                            self.msgArea.ChangeValue("Line " + str(rec['ID']) +
                                " of " + str(infoDict['lineCt']) + "; " +
                                str(dataRecsAdded) + " records added, " +
                                str(dataRecsDupSkipped) + " duplicates skipped.")
                            wx.Yield()
                        try: # much faster to try and fail than to test first
                            scidb.curD.execute(stSQL, (tsAsTime, lCh[iCol], lData[iCol]))
                            dataRecsAdded += 1 # count it
                        except sqlite3.IntegrityError: # item is already in Data table
                            dataRecsDupSkipped += 1 # count but otherwise ignore
                        finally:
                            wx.Yield()

        # finished parsing lines
        infoDict['numNewDataRecsAdded'] = dataRecsAdded
        infoDict['numDupDataRecsSkipped'] = dataRecsDupSkipped
        infoDict['stParseMsg'] = str(infoDict['lineCt']) + " lines processed; " + \
            str(dataRecsAdded) + " data records added to database, " + \
            str(dataRecsDupSkipped) + " duplicates skipped."
        self.msgArea.ChangeValue(infoDict['stParseMsg'])
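
The hour-offset query can be exercised standalone: an in-memory stand-in for tmpLines shows how the fixed-width LIKE pattern selects only data lines and how the 1-based substr(Line, 21, 3) pulls out the signed offset. The sample rows are invented:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row
cur = conn.cursor()
cur.execute("CREATE TABLE tmpLines (ID INTEGER PRIMARY KEY, Line TEXT);")
cur.executemany("INSERT INTO tmpLines (Line) VALUES (?);",
                [('{"Instrument identifier":"ABC123"}',),
                 ("2014-06-01 13:00:00 +05\t21.5\t3.3",),
                 ("2014-06-01 13:10:00 +05\t21.6\t3.3",),
                 ("not a data line",)])
# each '_' matches one character, so the pattern keys on the fixed-width
# "YYYY-MM-DD HH:MM:SS ___" prefix; substr() is 1-based
stSQL = ("SELECT substr(Line, 21, 3) AS TZ FROM tmpLines "
         "WHERE Line LIKE '____-__-__ __:__:__ ___%' GROUP BY TZ;")
print([row["TZ"] for row in cur.execute(stSQL)])  # ['+05']
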
Example 6
    def parseBlueTermFile(self, infoDict):
        """
        a BlueTerm log file can contain captured data from a series of daily
        GL text files, possibly even from different loggers
        """
        dataRecItemCt = 0
        dataRecsAdded = 0
        dataRecsDupSkipped = 0

        self.putTextLinesIntoTmpTable(infoDict)

        # find the different data dump segments
        # within each one, data will be from only one logger, and so we
        #  can use the same metadata header
        # make a list of 2-tuples; tuple item 0 = the starting ID of the
        # dump segment and tuple item 1 = the ending ID
        lDmpSegs = []
        stSQL = """SELECT ID, Line FROM tmpLines
            WHERE Line = '{"datadump":"begin"}' OR Line = '{"datadump":"end"}'
            ORDER BY ID;"""
        dmSegs = scidb.curT.execute(stSQL).fetchall()
        # begin or end may be missing, due to log interruption
        idSegStart = None
        idSegEnd = None
        # only use segments that have valid begin/end
        for dmSegItm in dmSegs:
            if dmSegItm['Line'] == '{"datadump":"begin"}':
                idSegStart = dmSegItm['ID']
            elif dmSegItm['Line'] == '{"datadump":"end"}':
                if idSegStart is not None:
                    # make the tuple
                    tSeg = (idSegStart, dmSegItm['ID'])
                    # append to list
                    lDmpSegs.append(tSeg)
                    # null the vars
                    idSegStart = None
                    idSegEnd = None
            else: # this should not happen, but reset state if it does
                idSegStart = None
                idSegEnd = None
                    
        print "dump segments:", lDmpSegs
        
        # being worked on >>>>
        iNumSegs = len(lDmpSegs)
        if iNumSegs == 0:
            infoDict['stParseMsg'] = " No valid data segments in file."
            # will skip following loop
        iCtSeg = 0
        for tSeg in lDmpSegs:
            iCtSeg += 1
            idSegStart, idSegEnd = tSeg
            # get the metadata header; duplicate copies within one dump segment are all identical
            stSQL = """SELECT Line 
                    FROM tmpLines 
                    WHERE ID > ? AND ID < ? 
                    AND Line LIKE '{"Instrument identifier":%'
                    GROUP BY Line;"""
            mtDat = scidb.curT.execute(stSQL, tSeg).fetchone()
            dictHd = ast.literal_eval(mtDat['Line'])
            # somewhat redundant to do 'LoggerSerialNumber' here (fn 'assureChannelIsInDB' would fill in
            # later), but allows putting Model into the DB
            iInstSpecID = scidb.assureItemIsInTableField(dictHd['Model'], "InstrumentSpecs", "InstrumentSpec")
            iLoggerID = scidb.assureItemIsInTableField(dictHd['Instrument identifier'], "Loggers", "LoggerSerialNumber")
            scidb.curD.execute("UPDATE Loggers SET InstrumentSpecID = ? WHERE ID = ?;", (iInstSpecID, iLoggerID))
            
            # get the hour offset(s); most files will have only one
            stSQL = "SELECT substr(Line, 21, 3) as TZ FROM tmpLines " \
                    "WHERE ID > ? AND ID < ? " \
                    "AND Line LIKE '____-__-__ __:__:__ ___%' GROUP BY TZ;"
            hrOffsets = scidb.curT.execute(stSQL, tSeg).fetchall()
            for hrOffset in hrOffsets:
                iTimeZoneOffset = int(hrOffset['TZ'])
                # make a dictionary of channel IDs for data lines that have this hour offset
                # would be slightly different for different hour offsets
                lCols = dictHd['Columns'] # get col metadata; this is a list of dictionaries
    #            # it is indexed by the columns list, and is zero-based
    #            lCh = [0 for i in range(len(lCols))] # initially, fill with all zeroes
                dictChannels = {} # this is the dictionary we will build
                for dictCol in lCols:
                    # somewhat redundant to fill these in before calling function 'assureChannelIsInDB', but
                    #  allows assigning sensor device types
                    iDeviceTypeID = scidb.assureItemIsInTableField(dictCol['Device'], "DeviceSpecs", "DeviceSpec")
                    iSensorID = scidb.assureItemIsInTableField(dictCol['Identifier'], "Sensors", "SensorSerialNumber")
                    scidb.curD.execute("UPDATE Sensors SET DeviceSpecID = ? WHERE ID = ?;", (iDeviceTypeID, iSensorID))
    #                iDataTypeID = scidb.assureItemIsInTableField(dictCol['DataType'], "DataTypes", "TypeText")
    #                iDataUnitsID = scidb.assureItemIsInTableField(dictCol['DataUnits'], "DataUnits", "UnitsText")
                    # build list to create the channel
                    # list items are: ChannelID, originalCol, Logger, Sensor, dataType, dataUnits, hrOffset, new
                    lChannel = [0, dictCol['Order'], dictHd['Instrument identifier'], dictCol['Identifier'],
                                dictCol['DataType'], dictCol['DataUnits'], iTimeZoneOffset, '']
                    dictChannels[dictCol['Order']] = (lChannel[:]) # the key is the column number
                    scidb.assureChannelIsInDB(dictChannels[dictCol['Order']]) # get or create the channel
                    iChannelID = dictChannels[dictCol['Order']][0]
                    # store the column name as a Series
                    iSeriesID = scidb.assureItemIsInTableField(dictCol['Name'], "DataSeries", "DataSeriesDescription")
                    # tie it to this Channel, to offer later
                    stSQLcs = 'INSERT INTO tmpChanSegSeries(ChannelID, SeriesID) VALUES (?, ?);'
                    try:
                        scidb.curD.execute(stSQLcs, (iChannelID, iSeriesID))
                    except sqlite3.IntegrityError:
                        pass # silently ignore duplicates

    #            print 'Before Channel function'
    #            for ky in dictChannels.keys():
    #                print ky, dictChannels[ky][:]
    #            for ky in dictChannels.keys():
    #                scidb.assureChannelIsInDB(dictChannels[ky])
    #            print 'After Channel function'
    #            for ky in dictChannels.keys():
    #                print ky, dictChannels[ky][:]

                # make a list of channel IDs for the set of lines with this HrOffset, for quick lookup
                # it is indexed by the columns list, and is zero-based
                lCh = []
                for iCol in range(len(lCols)):
                    iNomCol = iCol + 1
                    if iNomCol in dictChannels:
                        lChanSet = dictChannels[iNomCol][:]
                        lCh.append(lChanSet[0])
                    else: # does not correspond to a data column
                        lCh.append(0) # placeholder, to make list indexes work right
            
                # done setting up channels, get data lines
                stSQL = "SELECT ID, Line FROM tmpLines " \
                    "WHERE ID > ? AND ID < ? " \
                    "AND substr(Line, 21, 3) = ? " \
                    "AND Line LIKE '____-__-__ __:__:__ ___%' ORDER BY ID;"
                recs = scidb.curT.execute(stSQL, (idSegStart, idSegEnd, hrOffset['TZ'],)).fetchall()
                iNumSegLines = len(recs)
                iCtSegLines = 0
                for rec in recs:
                    iCtSegLines += 1
                    lData = rec['Line'].split('\t')
                    # item zero is the timestamp followed by the timezone offset
                    sTimeStamp = lData[0][:-4] # drop timezone offset, we already have it
                    tsAsTime = datetime.datetime.strptime(sTimeStamp, "%Y-%m-%d %H:%M:%S")
                    tsAsTime = tsAsTime.replace(tzinfo=None) # reassign: replace() returns a new object; strips any timezone info
                    tsAsTime = tsAsTime + datetime.timedelta(hours = -iTimeZoneOffset)
                    tsAsDate = tsAsTime.date()
                    stSQL = "INSERT INTO Data (UTTimestamp, ChannelID, Value) VALUES (?, ?, ?)"
                    for iCol in range(len(lData)):
                        if iCol > 0: # an item of data
                            # give some progress diagnostics
                            dataRecItemCt += 1
                            if dataRecItemCt % 100 == 0:
                                self.msgArea.ChangeValue("Segment " + str(iCtSeg) +
                                    " of " + str(iNumSegs) +
                                    ", HrOffset " + str(iTimeZoneOffset) +
                                    ", Line " + str(iCtSegLines) +
                                    " of " + str(iNumSegLines) + "; " +
                                    str(dataRecsAdded) + " records added, " +
                                    str(dataRecsDupSkipped) + " duplicates skipped.")
                                wx.Yield()
                            try: # much faster to try and fail than to test first
                                scidb.curD.execute(stSQL, (tsAsTime, lCh[iCol], lData[iCol]))
                                dataRecsAdded += 1 # count it
                            except sqlite3.IntegrityError: # item is already in Data table
                                dataRecsDupSkipped += 1 # count but otherwise ignore
                            finally:
                                wx.Yield()
        # <<<<< being worked on
        # finished parsing lines
        infoDict['numNewDataRecsAdded'] = dataRecsAdded
        infoDict['numDupDataRecsSkipped'] = dataRecsDupSkipped
        infoDict['stParseMsg'] = str(infoDict['lineCt']) + " lines processed; " + \
            str(dataRecsAdded) + " data records added to database, " + \
            str(dataRecsDupSkipped) + " duplicates skipped."
        self.msgArea.ChangeValue(infoDict['stParseMsg'])
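
The insert loop's try/except is the dedup trick all three parsers share: a uniqueness constraint makes the database itself reject repeats, which is cheaper than querying first. A minimal sketch, assuming a UNIQUE (UTTimestamp, ChannelID) constraint on the Data table (the code above relies on it but never shows the schema):

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE Data (UTTimestamp TEXT, ChannelID INTEGER, "
            "Value REAL, UNIQUE (UTTimestamp, ChannelID));")
stSQL = "INSERT INTO Data (UTTimestamp, ChannelID, Value) VALUES (?, ?, ?)"
added = skipped = 0
ts = "2014-06-01 08:00:00"
for row in [(ts, 44, 21.5), (ts, 45, 63.0), (ts, 44, 21.5)]:  # last is a dup
    try:  # much faster to try and fail than to test first
        cur.execute(stSQL, row)
        added += 1
    except sqlite3.IntegrityError:  # unique constraint: already present
        skipped += 1
print((added, skipped))  # (2, 1)
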
Example 7
    def parseHoboWareTextFile(self, infoDict):
        """
        parse a data file exported as text by HoboWare
        """
        sStrip = '" \x0a\x0d'  # characters to strip from parsed items
        # regular expression pattern to find logger number
        pLogger = re.compile(r"LGR S/N: (?P<nLogger>\d+)")
        # regular expression pattern to find sensor number
        pSensor = re.compile(r"SEN S/N: (?P<nSensor>\d+)")
        # regular expression pattern to find hour offset
        pHrOffset = re.compile(r"Time, GMT(?P<sHrOffset>.+)")
        dataRecItemCt = 0
        dataRecsAdded = 0
        dataRecsDupSkipped = 0

        print "About to put lines into temp table"
        self.putTextLinesIntoTmpTable(infoDict)
        print "Finished putting lines into temp table"
        # parse file, start with header line; 1st line in this file format
        scidb.curT.execute("SELECT * FROM tmpLines ORDER BY ID;")
        for rec in scidb.curT:
            if rec["ID"] == 1:
                """
                Build a dictionary of the channels.
                The indexes will be the column numbers because all data files have at least that.
                The value will be an 8-item list.
                The first item in each list will be the primary key in the Channels table.
                The list is first created with this = 0.
                The rest of the list is built up, then the list is sent to a function
                 that looks in the database.
                The function fills in the primary key, either existing or newly created.
                List item 7 (the last) will be "new" if the record is new, otherwise "existing".
                The list contains the text values of logger serial number, sensor serial number,
                 data type, and data units.
                The function takes care of filling in all these in their respective tables.
                When the dictionary is complete, the calling procedure can quickly insert data values
                 into the data table by just pulling list item [0] for the dictionary key, which key
                 is the column number in the source file.
                This loose structure allows some kludgy workarounds for bugs that were in some versions
                 of the data files.
                """
                lHdrs = rec["Line"].split("\t")
                # ignore item zero, just a pound sign and possibly three junk characters
                # item 1 is the hour offset, and a clue to export bugs we need to work around
                sHd = lHdrs[1].strip(sStrip)
                m = pHrOffset.search(sHd)
                if m:
                    sTimeOffset = m.group("sHrOffset")
                    lTimeOffsetComponents = sTimeOffset.split(":")
                    sTimeOffsetHrs = lTimeOffsetComponents[0]
                    iHrOffset = int(sTimeOffsetHrs)
                else:
                    iHrOffset = 0

                dictChannels = {}
                # list items are: ChannelID, originalCol, Logger, Sensor, dataType, dataUnits, hrOffset, new
                lChannel = [0, 0, "", "", "", "", iHrOffset, ""]

                for iCol in range(len(lHdrs)):
                    # skip items 0 & 1
                    if iCol > 1:  # a header for a data column
                        lChannel[1] = iCol + 1  # stored columns are 1-based
                        sHd = lHdrs[iCol].strip(sStrip)
                        # get the type and units
                        lTypeUnits = sHd.split("(", 2)
                        sTypeUnits = lTypeUnits[0].strip(" ")
                        lTypeUnits = sTypeUnits.split(",")
                        # check validity
                        #                        print "lTypeUnits (before fix):", lTypeUnits
                        # If there are no units, this is a non-data column like "Bad Battery"
                        if len(lTypeUnits) == 1:
                            lTypeUnits.append("non-data")
                        #                        print "lTypeUnits (after fix):", lTypeUnits
                        if lTypeUnits[0]:
                            sType = lTypeUnits[0].strip(" ")
                        else:
                            sType = "(na)"
                        lChannel[4] = sType
                        if lTypeUnits[1]:
                            sUnits = lTypeUnits[1].strip(" ")
                        else:
                            sUnits = "(na)"
                        lChannel[5] = sUnits
                        # get the logger ID and sensor ID
                        m = pLogger.search(sHd)
                        if m:
                            sLoggerID = m.group("nLogger")
                        else:
                            sLoggerID = "(na)"
                        lChannel[2] = sLoggerID
                        m = pSensor.search(sHd)
                        if m:
                            sSensorID = m.group("nSensor")
                        else:
                            sSensorID = "(na)"
                        lChannel[3] = sSensorID
                        dictChannels[iCol + 1] = lChannel[:]
                # gone through all the headers, apply bug workarounds here

                print "Before Channel function"
                for ky in dictChannels.keys():
                    print ky, dictChannels[ky][:]
                for ky in dictChannels.keys():
                    scidb.assureChannelIsInDB(dictChannels[ky])
                print "After Channel function"
                for ky in dictChannels.keys():
                    print ky, dictChannels[ky][:]
                # make a list of channel IDs for the rest of this file, for quick lookup
                # it is indexed by the columns list, and is zero-based
                lCh = []
                for iCol in range(len(lHdrs)):
                    iNomCol = iCol + 1
                    if iNomCol in dictChannels:
                        lChanSet = dictChannels[iNomCol][:]
                        lCh.append(lChanSet[0])
                    else:  # does not correspond to a data column
                        lCh.append(0)  # placeholder, to make list indexes work right

            else:  # not the 1st (header) line, but a line of data
                lData = rec["Line"].split("\t")
                # ignore item zero, a line number, not used
                sTimeStamp = lData[1]
                try:
                    tsAsTime = datetime.datetime.strptime(sTimeStamp, "%Y-%m-%d %H:%M:%S")
                except ValueError:  # time format is nonstandard, give a try to wx datetime parsing
                    # does not work, perhaps it's a wx/Python conversion problem
                    # but string e.g. "05/16/10 12:00:00 PM" gives dates in 2106
                    #                    dt = wx.DateTime() # Uninitialized datetime
                    #                    DateTimeValid = dt.ParseDateTime(sTimeStamp)
                    #                    if DateTimeValid != -1: # valid datetime
                    #                        tsAsTime = datetime.datetime.fromtimestamp(dt.GetTicks())
                    #                    else:
                    print "unresolvable timestamp:", sTimeStamp
                    self.msgArea.ChangeValue("unresolvable timestamp: " + TimeStamp)
                    return
                tsAsTime = tsAsTime.replace(tzinfo=None)  # reassign: replace() returns a new object; strips any timezone info
                tsAsTime = tsAsTime + datetime.timedelta(hours=-iHrOffset)
                tsAsDate = tsAsTime.date()
                stSQL = "INSERT INTO Data (UTTimestamp, ChannelID, Value) VALUES (?, ?, ?)"
                for iCol in range(len(lData)):
                    if iCol > 1:  # an item of data
                        # give some progress diagnostics
                        dataRecItemCt += 1
                        if dataRecItemCt % 100 == 0:
                            self.msgArea.ChangeValue(
                                "Line "
                                + str(rec["ID"])
                                + " of "
                                + str(infoDict["lineCt"])
                                + "; "
                                + str(dataRecsAdded)
                                + " records added, "
                                + str(dataRecsDupSkipped)
                                + " duplicates skipped."
                            )
                            wx.Yield()
                        try:  # much faster to try and fail than to test first
                            # stripping commas from floats is a hack, but had to get this working
                            #                            if ',' in lData[iCol]:
                            #                                print lData[iCol], lData[iCol].replace(',',''), tsAsTime, lCh[iCol]
                            scidb.curD.execute(stSQL, (tsAsTime, lCh[iCol], lData[iCol].replace(",", "")))
                            #                            print "record added"
                            dataRecsAdded += 1  # count it
                        except sqlite3.IntegrityError as err_value:  # error adding item
                            # distinguish a duplicate error from an invalid-Value error
                            # (binding the exception avoids sys.exc_info() and a sys import)
                            # if Value is non-numeric, we get 'constraint failed'; silently ignore
                            if "not unique" in repr(err_value):  # only count these
                                dataRecsDupSkipped += 1  # count but otherwise ignore
                        #                            if ',' in lData[iCol]:
                        #                                print "record not added:", repr(err_value)
                        #                                print tsAsTime, lCh[iCol], lData[iCol]
                        finally:
                            wx.Yield()

        # finished parsing lines
        infoDict["numNewDataRecsAdded"] = dataRecsAdded
        infoDict["numDupDataRecsSkipped"] = dataRecsDupSkipped
        infoDict["stParseMsg"] = (
            str(infoDict["lineCt"])
            + " lines processed; "
            + str(dataRecsAdded)
            + " data records added to database, "
            + str(dataRecsDupSkipped)
            + " duplicates skipped."
        )
        self.msgArea.ChangeValue(infoDict["stParseMsg"])
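
One fragility in this last variant: it classifies duplicates by matching "not unique" in the exception text, but SQLite's wording changed over time (older builds say "columns ... are not unique", newer ones "UNIQUE constraint failed: ..."), so on a newer SQLite every duplicate would silently escape the count. A hedged sketch of a check that accepts both wordings:

import sqlite3

def is_duplicate_error(exc):
    # accept the old and the new SQLite phrasings; anything else (e.g. a
    # CHECK or NOT NULL failure on a bad Value) is not counted as a dup
    msg = str(exc)
    return "not unique" in msg or "UNIQUE constraint failed" in msg

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE Data (UTTimestamp TEXT, ChannelID INTEGER, "
            "Value REAL, UNIQUE (UTTimestamp, ChannelID));")
row = ("2014-06-01 08:00:00", 44, 21.5)
cur.execute("INSERT INTO Data VALUES (?, ?, ?)", row)
try:
    cur.execute("INSERT INTO Data VALUES (?, ?, ?)", row)
except sqlite3.IntegrityError as e:
    print(is_duplicate_error(e))  # True
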