def checkPointMatchesFocal(dataLines):
    '''
    Finds "point" lines that do not belong to the focal sample in which
    they were recorded.

    A "point" line is reported when any of the following holds:
        1) no focal header line has been read before it,
        2) sameActor() says its individual differs from the one in the
           most recently read focal header, or
        3) sameDate() says its date differs from that header's date.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    Returns a list of list of strings: the offending "point" lines, in the
    order in which they occur in dataLines.
    '''
    from constants import focalAbbrev, pntAbbrev

    currentFocal = []   # Copy of the latest focal header; empty until one is read
    mismatches = []

    for row in dataLines:
        if isType(row, focalAbbrev):
            currentFocal = row[:]
            continue
        if not isType(row, pntAbbrev):
            continue
        # A point with no header yet is always reported.  Otherwise, it
        # must agree with the header on both individual and date.
        if currentFocal == [] or not (sameActor(currentFocal, row)
                                      and sameDate(currentFocal, row)):
            mismatches.append(row)

    return mismatches
def countPointsPerFocal(dataLines):
    '''
    Counts how many "point" lines follow each focal header line before the
    next focal header line.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split. They are also presumed to be in chronological
    order.

    Returns a list of two-item lists: [focal header, number of points].
    The focal header is a list of strings, and the number of points is an
    integer. The list is sorted by the tab-joined text of the header.

    Points that occur before any header are counted under a placeholder
    header, ['NONE YET'], which is always present in the output (even when
    its count is zero).  Headers with identical text share a single entry,
    and the count restarts each time that header is read.
    '''
    from constants import focalAbbrev, pntAbbrev

    noFocalKey = 'NONE YET'
    tally = {noFocalKey: 0}     # Key: tab-joined header.  Value: point count
    currentKey = noFocalKey

    for row in dataLines:
        if isType(row, focalAbbrev):
            currentKey = '\t'.join(row)
            tally[currentKey] = 0
        elif isType(row, pntAbbrev):
            tally[currentKey] += 1

    # Counting done.  Turn each key back into a list of strings, keeping
    # the entries sorted by key.
    return [[headerText.split('\t'), tally[headerText]]
            for headerText in sorted(tally)]
def checkNeighborsPerPoint(dataLines):
    '''
    Counts how many neighbor lines follow each "point" line before the next
    "point" line.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    Returns a list of (point, number of neighbors) tuples, sorted by point.
    The "point" is a string: the items from a "point" list of strings joined
    and tab-delimited. The "number of neighbors" is an integer.

    Neighbor lines that occur before any point are counted under the
    placeholder point 'NONE YET', which is always present in the output.
    '''
    from constants import pntAbbrev, neighborAbbrev

    noPointKey = 'NONE YET'
    tally = {noPointKey: 0}     # Key: tab-joined point.  Value: neighbor count
    currentKey = noPointKey

    for row in dataLines:
        if isType(row, pntAbbrev):
            currentKey = '\t'.join(row)
            tally[currentKey] = 0
        elif isType(row, neighborAbbrev):
            tally[currentKey] += 1

    return [(pointText, tally[pointText]) for pointText in sorted(tally)]
def checkActorIsActee(dataLines):
    '''
    Finds ad-lib and neighbor lines in which the two indicated individuals
    (the strings at [5] and [7]) are the same.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    Returns a list of lists of strings: the lines where this is true, in
    their original order.
    '''
    from constants import adlibAbbrev, neighborAbbrev

    selfDirected = []
    for row in dataLines:
        if not (isType(row, adlibAbbrev) or isType(row, neighborAbbrev)):
            continue
        if row[5] == row[7]:
            selfDirected.append(row)
    return selfDirected
def checkDuplicateFocals(dataLines):
    '''
    Checks for cases where the same individual has more than one focal
    sample on the same day.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    The following are assumed true about the lists of strings:
        1) The line "type" is indicated by the string at [0]
        2) The date for event is at [2]
        3) For lines indicating a new focal sample, the focal's ID is at [5]

    Returns a list of (date, name) tuples (date and name are both strings),
    one per duplicated focal, sorted by date and then by name.  If none
    found, returns an empty list.
    '''
    from collections import Counter
    from constants import focalAbbrev

    # Tally each (date, name) pair in a single pass over the data.
    focalTally = Counter((line[2], line[5]) for line in dataLines
                         if isType(line, focalAbbrev))

    # Sort on the whole tuple, not only the date, so that duplicates on the
    # same date always come out in the same order.
    return sorted(focal for (focal, count) in focalTally.items() if count > 1)
def countFocalTypes(dataLines):
    '''
    Counts the focal header lines in dataLines, grouped by focal sample
    type (the string at [6]): adult female, juvenile, or other.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    Returns a dictionary whose keys are focal sample types and whose values
    are the number of focal headers of that type.  The adult female and
    juvenile types are always present, even when their count is zero.  The
    "other" type is only present when its count is greater than zero.
    '''
    from constants import focalAbbrev, stypeAdultFem, stypeJuv, stypeOther

    tally = {stypeAdultFem: 0, stypeJuv: 0, stypeOther: 0}

    for row in dataLines:
        if not isType(row, focalAbbrev):
            continue
        stype = row[6]
        if stype == stypeAdultFem or stype == stypeJuv:
            tally[stype] += 1
        else:
            tally[stypeOther] += 1

    # "Other" SHOULD always be zero, so it's only worth reporting when it
    # isn't.
    if tally[stypeOther] == 0:
        del tally[stypeOther]

    return tally
def checkNeighborNotReal(dataLines):
    '''
    Finds neighbor lines where either individual (the strings at [5] and
    [7]) is a placeholder such as "INF" (a not-yet-named infant), or is not
    exactly 3 characters long.

    All legitimate names used as a neighbor should be exactly 3 characters,
    so any other length is flagged whether or not the value was listed as a
    known "placeholder" beforehand.

    This function is different from checkActorActeeNotReal in that it uses a
    different (smaller) set of "placeholder" values.  Some of values used as
    neighbors are not allowed for use in ad-libs.  See Babase documentation.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    Returns a list of lists of strings: the lines where this is true.
    '''
    from constants import unnamedCodes, neighborAbbrev

    knownPlaceholders = set(unnamedCodes)

    def looksUnreal(row):
        # Placeholder checks first, then the name-length checks
        if row[5] in knownPlaceholders or row[7] in knownPlaceholders:
            return True
        return len(row[5]) != 3 or len(row[7]) != 3

    neighborRows = [row for row in dataLines if isType(row, neighborAbbrev)]
    return [row for row in neighborRows if looksUnreal(row)]
def checkMountsConsortsInvolvedFocal(dataLines):
    '''
    Checks data for cases where a mount, ejaculation, or consort was
    recorded, and makes sure either the actor or actee was the focal
    individual (the individual at [5] of the most recently read focal
    header).

    Checks both "note" lines and "ad-lib" lines for these behaviors.
        Notes:   the text at [-1] is split on whitespace and is expected to
                 read "actor act actee".  Notes with fewer than three words,
                 or whose second word isn't one of the behaviors, cannot be
                 parsed and are not reported.
        Ad-libs: actor, act and actee are at [5], [6] and [7].

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split. [0] in each list of strings is the "type" of
    data recorded in that line.

    Returns a list of list of strings: the lines where the focal individual
    was NOT the actor or actee, plus any such lines that occur before the
    first focal header.
    '''
    from constants import (focalAbbrev, noteAbbrev, adlibAbbrev,
                           bb_consort, bb_mount, bb_ejaculation)

    mountsEtc = [bb_consort, bb_mount, bb_ejaculation]
    outLines = []
    focalIndiv = ''     # Empty until the first focal header is read

    for line in dataLines:
        if isType(line, focalAbbrev):
            focalIndiv = line[5].upper()

        elif isType(line, noteAbbrev) and behaviorsInNote(line, mountsEtc):
            if focalIndiv == '':    # no focal yet
                outLines.append(line)
                continue
            interaction = line[-1].split()  # SHOULD be [actor, act, actee]
            # This is an admittedly poor attempt to parse actor/actee from a
            # note.  Don't index into notes that are too short to hold all
            # three words; treat them like any other unparseable note.
            if len(interaction) < 3 or interaction[1].upper() not in mountsEtc:
                continue
            actor = interaction[0].upper()
            actee = interaction[2].upper()
            if focalIndiv not in [actor, actee]:
                outLines.append(line)

        elif isType(line, adlibAbbrev) and line[6] in mountsEtc:
            if focalIndiv == '':    # no focal yet
                outLines.append(line)
            elif focalIndiv not in [line[5], line[7]]:
                outLines.append(line)

    return outLines
def checkMountsConsortsDuringFocal(dataLines):
    '''
    Checks that mounts, ejaculations, and consorts were recorded during a
    focal sample.

    Checks both "note" lines (using behaviorsInNote) and "ad-lib" lines
    (where the behavior is at [6]).  A line is reported when no focal header
    has been read yet, or when duringFocal() says it falls outside the most
    recently read focal.  That focal's end is built from the header's date
    at [2] and end time at [7].

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split. [0] in each list of strings is the "type" of
    data recorded in that line.

    Returns a list of list of strings: all the cases that were outside a
    focal sample.
    '''
    from constants import (focalAbbrev, noteAbbrev, adlibAbbrev,
                           bb_consort, bb_mount, bb_ejaculation)
    from datetime import datetime

    behaviors = [bb_consort, bb_mount, bb_ejaculation]
    outsideFocal = []
    currentFocal = []   # Copy of the latest focal header; empty until one is read
    focalEnd = ''       # Becomes a datetime once a focal header is read

    for row in dataLines:
        if isType(row, focalAbbrev):
            currentFocal = row[:]
            endText = currentFocal[2] + ' ' + currentFocal[7]
            focalEnd = datetime.strptime(endText, '%Y-%m-%d %H:%M:%S')
            continue

        # Notes and ad-libs get the same treatment once we know the line
        # records one of the behaviors.
        isBehavior = ((isType(row, noteAbbrev) and behaviorsInNote(row, behaviors))
                      or (isType(row, adlibAbbrev) and row[6] in behaviors))
        if not isBehavior:
            continue

        if len(currentFocal) == 0 or not duringFocal(row, focalEnd):
            outsideFocal.append(row)

    return outsideFocal
def checkFocalInfantStatus(dataLines, moms):
    '''
    Finds focal "point" lines whose recorded infant status disagrees with
    what hasInfant() reports for that line, given "moms".

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    The following are assumed true about the lists of strings:
        1) The line "type" is indicated by the string at [0]
        2) In lines indicating a focal point, the focal's
           activity/position/etc. are in [6].
        3) The string at [6] only says something about an infant when it is
           at least 4 characters long.  Its third character is
           pntActNoInfant when she has no infant, and anything else when
           she does.  Shorter strings are ignored.

    moms is passed, untouched, to hasInfant() along with each point line.

    Returns a list of lists of strings: a copy of each discrepant point
    line, with one extra string appended that gives her status according to
    hasInfant: "(HAS INFANT)" or "(NO INFANT)".  If there are no
    discrepancies, returns an empty list.
    '''
    from constants import pntAbbrev, pntActNoInfant

    # Explanation of the infant status reported by hasInfant
    statusNote = {True: '(HAS INFANT)', False: '(NO INFANT)'}

    discrepancies = []
    for row in dataLines:
        if not isType(row, pntAbbrev):
            continue

        activity = row[6]
        if len(activity) < 4:
            # This point does not say anything about infants.  Ignore.
            continue

        # What the observer recorded vs. what the other data say
        recordedInfant = (activity[2] != pntActNoInfant)
        actualInfant = hasInfant(row, moms)

        if recordedInfant != actualInfant:
            discrepancies.append(row[:] + [statusNote[actualInfant]])

    return discrepancies
def checkNotesNoFocals(dataLines):
    '''
    Finds "note" lines recorded on days that have no focal samples.  This
    is important, because these notes will not be recorded in Babase.

    The date of every line is taken from [2].

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    Returns a list of list of strings: the "note" lines on days with no
    focals, in their original order.
    '''
    from constants import focalAbbrev, noteAbbrev

    # Every date with one or more focal headers
    datesWithFocals = set(row[2] for row in dataLines
                          if isType(row, focalAbbrev))

    return [row for row in dataLines
            if not isType(row, focalAbbrev)
            and isType(row, noteAbbrev)
            and row[2] not in datesWithFocals]
def checkBehavsInNotes(dataLines, criteriaBehavs):
    '''
    Finds the "note" lines for which behaviorsInNote() reports that any of
    the behaviors in criteriaBehavs occur.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    criteriaBehavs is passed, untouched, to behaviorsInNote().

    Returns a list of lists of strings: the lines where this is true.
    '''
    from constants import noteAbbrev

    matches = []
    for row in dataLines:
        if isType(row, noteAbbrev) and behaviorsInNote(row, criteriaBehavs):
            matches.append(row)
    return matches
def getPointsPerFocal(dataLines):
    '''
    Groups the "point" lines by the focal header line that precedes them.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split. They are also presumed to be in chronological
    order.

    Returns a dictionary whose keys are focal headers (each a single,
    tab-joined string), and whose values are lists of the associated points
    (each its own list of strings).

    Points that occur before any header are listed under the key
    'NONE YET', which is always present.  If two headers have identical
    text, the list restarts when the second one is read.
    '''
    from constants import focalAbbrev, pntAbbrev

    currentKey = 'NONE YET'
    pointsByFocal = {currentKey: []}

    for row in dataLines:
        if isType(row, focalAbbrev):
            currentKey = '\t'.join(row)
            pointsByFocal[currentKey] = []
        elif isType(row, pntAbbrev):
            pointsByFocal[currentKey].append(row)

    return pointsByFocal
def countLines(dataLines, sampleType=''):
    '''
    Counts the lines in dataLines that are of "type" sampleType.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    sampleType is a string indicating which "type" of data to count.  If
    it is not provided or is the empty string, all lines are counted.

    Returns an integer, the number of lines.
    '''
    if sampleType != '':
        return sum(1 for row in dataLines if isType(row, sampleType))
    return len(dataLines)
def checkInvalidFocalTypes(dataLines):
    '''
    Finds focal header lines whose focal sample type (the string at [6]) is
    neither the adult female type nor the juvenile type.  These will occur,
    for example, if an observer accidentally starts a focal on an adult
    male.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    Returns a list of lists of strings: the lines where this is true.
    '''
    from constants import focalAbbrev, stypeAdultFem, stypeJuv

    validTypes = [stypeAdultFem, stypeJuv]

    invalid = []
    for row in dataLines:
        if isType(row, focalAbbrev) and row[6] not in validTypes:
            invalid.append(row)
    return invalid
def pointsOutOfSight(dataLines):
    '''
    Gathers all the "out of sight" points from the data.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split. [0] in each list of strings is the "type" of
    data recorded in that line.  In "point" lines, [6] is the point's
    "activity", or "out of sight".

    Returns a list of lists of strings: every "point" line whose [6] is the
    "out of sight" value, in their original order.
    '''
    from constants import pntAbbrev, outOfSightValue

    hidden = []
    for row in dataLines:
        if isType(row, pntAbbrev) and row[6] == outOfSightValue:
            hidden.append(row)
    return hidden
def kenyaLinesPerDay(dataLines, sampleType='', typeName=''):
    '''
    Just like the "countLinesPerDay" function in errorCheckingHelpers, but
    this one adjusts the formatting for the sake of our Kenyan observers:
    each date is passed through kenyaDateTime() before it is written.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.  Each line's date should be at [2].

    sampleType is a string indicating which "type" of lines to count.  If
    not provided or the empty string, all lines are counted.

    typeName is a more-common word/phrase to use in the output instead of
    sampleType.  e.g. if sampleType = focalAbbrev, typeName might be
    "Focal Sample".  When it is empty, sampleType is used.  When both are
    empty, "Line" is used.

    Returns a single string: a title line, then one line per distinct date
    in dataLines (sorted by date), with the number of matching lines on that
    date.  Dates with zero matching lines are included.
    '''
    # Start every date in the data at zero, so that dates with no lines of
    # the requested type still get reported.
    dateCounts = dict.fromkeys((row[2] for row in dataLines), 0)

    for row in dataLines:
        if sampleType == '' or isType(row, sampleType):
            dateCounts[row[2]] += 1

    label = typeName or sampleType or 'Line'

    report = [label + 's Collected Per Day:']
    for date in sorted(dateCounts):
        report.append('\t' + kenyaDateTime(date, False) + ':\t'
                      + str(dateCounts[date]))

    return '\n'.join(report)
def countLinesPerDay(dataLines, sampleType=''):
    '''
    Reports how many lines of "type" sampleType occurred on each date found
    in dataLines.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.  Each line's date should be at [2].

    sampleType is a string indicating which "type" of lines to count.  If
    not provided or the empty string, all lines are counted.

    Returns a single string that will include several line breaks: a title
    line, then one line per distinct date in dataLines (sorted by date),
    with the number of matching lines on that date.  Dates with zero
    matching lines are included.
    '''
    # Start every date in the data at zero, so that dates with no lines of
    # the requested type still get reported.
    dateCounts = dict.fromkeys((row[2] for row in dataLines), 0)

    for row in dataLines:
        if sampleType == '' or isType(row, sampleType):
            dateCounts[row[2]] += 1

    report = [sampleType + ' Lines Collected Per Day:']
    for date in sorted(dateCounts):
        report.append('\t' + date + ':\t' + str(dateCounts[date]))

    return '\n'.join(report)
def checkFocalOverlaps(dataLines):
    '''
    Checks for overlapping focals in dataLines.

    Every focal header line is passed to findOverlaps(), along with the list
    of all focal header lines.  Each overlap it returns is paired with the
    header that was being checked.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split. [0] in each list of strings is the "type" of
    data recorded in that line.

    Returns a list of (focal1, focal2) tuples.  focal1 is a focal "header"
    line (a list of strings), and focal2 is one of the items that
    findOverlaps() returned for it.
    '''
    from constants import focalAbbrev

    headers = [row for row in dataLines if isType(row, focalAbbrev)]

    return [(header, overlap)
            for header in headers
            for overlap in findOverlaps(header, headers)]
def checkActorActeeNotReal(dataLines):
    '''
    Finds ad-lib lines where either the actor (at [5]) or actee (at [7]) is
    noted as "NULL" or some other placeholder-type value.

    The placeholder values are the keys of constants.unknSnames, plus the
    values in constants.unnamedCodes.

    This function is different from checkNeighborNotReal in that it uses a
    different (larger) set of "placeholder" values.  Some of these values
    are okay for use as neighbors.  See Babase documentation.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    Returns a list of lists of strings: the lines where this is true.
    '''
    from constants import unknSnames, unnamedCodes, adlibAbbrev

    knownPlaceholders = set(unknSnames) | set(unnamedCodes)

    flagged = []
    for row in dataLines:
        if not isType(row, adlibAbbrev):
            continue
        if row[5] in knownPlaceholders or row[7] in knownPlaceholders:
            flagged.append(row)
    return flagged
def checkNeighborNotReal(dataLines):
    '''
    Finds neighbor lines where either individual (the strings at [5] and
    [7]) is noted as "INF" (a not-yet-named infant) or some other
    placeholder-type value listed in constants.unnamedCodes.

    This function is different from checkActorActeeNotReal in that it uses a
    different (smaller) set of "placeholder" values.  Some of values used as
    neighbors are not allowed for use in ad-libs.  See Babase documentation.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    Returns a list of lists of strings: the lines where this is true.
    '''
    from constants import unnamedCodes, neighborAbbrev

    knownPlaceholders = set(unnamedCodes)

    flagged = []
    for row in dataLines:
        if not isType(row, neighborAbbrev):
            continue
        if row[5] in knownPlaceholders or row[7] in knownPlaceholders:
            flagged.append(row)
    return flagged
def checkDuplicateGroups(dataLines):
    '''
    Checks for cases where focal samples were recorded in more than one
    group on a single day.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split.

    The following are assumed true about the lists of strings:
        1) The line "type" is indicated by the string at [0]
        2) The date for event is at [2]
        3) For lines indicating a new focal sample, the focal group is at [4]

    Returns a list of (date, groups) tuples, sorted by date, with one tuple
    for each date that has more than one group.  Both items are strings:
    "groups" is the str() of the sorted list of that date's distinct groups.
    If none found, returns an empty list.
    '''
    from constants import focalAbbrev

    # Distinct (date, group) pairs from all the focal headers
    focalInfoSet = set((line[2], line[4]) for line in dataLines
                       if isType(line, focalAbbrev))

    # Dictionary of dates (keys) and list of group(s) (values) sampled on
    # those dates.  A set has no dependable order, so go through the pairs
    # in sorted order.  Otherwise, the order of the groups in the output
    # could change from one run to the next.
    datesGroups = {}
    for (focalDate, focalGrp) in sorted(focalInfoSet):
        datesGroups.setdefault(focalDate, []).append(focalGrp)

    return [(focalDate, str(groups))
            for (focalDate, groups) in sorted(datesGroups.items())
            if len(groups) > 1]
def importDataForCompares(dataLines):
    '''
    Collects all "header" rows (beginning rows for a focal sample) and
    rearranges each one for comparison against the log file.

    dataLines is a list of lists of strings: the data from a processed prim8
    data file, stripped and split.

    A "Y" or "N" is determined for each header, indicating if the focal was
    completed.  "N" is for samples that had no data, no points, or only
    out-of-sight points.  "Y" is used for all others.

    Before:
        HDR OBS yyyy-mm-dd HH:MM:SS GGG NNN TTT hh:mm:ss Y/N

        HDR is the code used to indicate that this is a header.
        OBS is the observer's initials.
        yyyy-mm-dd is the date.
        HH:MM:SS is the start time
        GGG is the group's abbreviation
        NNN is the focal individual's sname
        TTT is the focal sample type
        hh:mm:ss is the end time
        Y/N is the one-character code added, described above

    After:
        HH:MM:SS yyyy-mm-dd GGG OBS NNN Y/N FULLTEXT NumPoints NumIS

        (Same abbreviations as above, except...)
        FULLTEXT is the full text of the line in "before" (without the
            Y/N), tab-joined as a single string.  Useful for output.
        NumPoints is the number of points that countPointsPerFocal reports
            for this header, as a string.
        NumIS is the same count, taken after all out-of-sight points have
            been removed from the data.

    NOTE(review): whether points recorded for a different focal individual
    are left out of NumPoints and NumIS depends entirely on
    countPointsPerFocal.  Confirm against that function.

    This order is used because it's similar to that used in the log file.
    It makes comparisons easier to write.

    Returns a list of lists of strings, one for each focal sample, in the
    "After" format shown above.
    '''
    from constants import focalAbbrev, pntAbbrev

    # List of all the header rows, i.e. the focal samples
    theFocals = [line[:] for line in dataLines if isType(line, focalAbbrev)]

    # Get the out-of-sight points ONCE, not once per line of data.  Kept as
    # a set of tuples so that each lookup is quick.
    oosLines = set(tuple(line) for line in pointsOutOfSight(dataLines))
    inSightData = [line[:] for line in dataLines
                   if tuple(line) not in oosLines]

    # Focals that were not "completed": no points, or only out-of-sight
    # points.  With the out-of-sight points already removed, gathering
    # the headers with no points gets all of these at once.
    incompletes = set(tuple(line) for line in
                      theseWithoutThose(inSightData, focalAbbrev, [pntAbbrev]))

    # Add a "Y" or "N" to each row in theFocals to indicate how "complete"
    # the sample was
    for line in theFocals:
        line.append("N" if tuple(line) in incompletes else "Y")

    def countsByFullText(focalCounts):
        # Dictionary of counts, keyed by the header's tab-joined text.  A
        # header given as a list of strings can't be a dictionary key, so
        # join it first.
        counts = {}
        for (focal, points) in focalCounts:
            if isinstance(focal, (list, tuple)):
                focal = "\t".join(focal)
            counts[focal] = points
        return counts

    # Points per focal, then in-sight points per focal.  For the latter,
    # count from the data that already has the out-of-sight points removed.
    pointsPerFocal = countsByFullText(countPointsPerFocal(dataLines))
    pointsISPerFocal = countsByFullText(countPointsPerFocal(inSightData))

    # Now rearrange the columns into their desired output format
    outData = []
    for line in theFocals:
        fullText = "\t".join(line[:-1])     # Omit the "complete" value
        outData.append([
            line[3],                            # Time
            line[2],                            # Date
            line[4],                            # Grp
            line[1],                            # Observer
            line[5],                            # Sname
            line[-1],                           # Complete
            fullText,                           # Fulltext
            str(pointsPerFocal[fullText]),      # NumPoints
            str(pointsISPerFocal[fullText]),    # NumIS
        ])

    return outData
def checkUniqueNeighbors(dataLines, sampleProtocols):
    '''
    Checks the neighbors recorded for each point collected during
    sampleProtocols samples, to make sure that no neighbor is listed more
    than once.  Ideally, only check juvenile samples because the adult
    female protocol allows for some redundancy.

    When considering uniqueness of neighbors, placeholder names (the keys of
    constants.unknSnames) are ignored.  The neighbor's name is read from
    [7] of each neighbor line.

    If a point has more than three neighbors, the neighbors are likely not
    unique, but neighbor uniqueness really isn't _the_ problem.  Those cases
    are not returned by this function.

    dataLines is a list of lists of strings, presumed to be all the data
    from a file, stripped and split.

    sampleProtocols is a list of strings: the focal sample types (at [6] of
    a header line) whose points should be checked.

    Returns a list of lists of strings: each point line that has non-unique
    neighbors, followed by all of that point's neighbor lines.  An empty
    list separates one point's group of lines from the next.  Neighbors
    that came before any point in their focal are grouped under the
    one-item line ['(MISSING POINT LINE)'].
    '''
    from constants import unknSnames, focalAbbrev, pntAbbrev, neighborAbbrev

    relevantTypes = [focalAbbrev, pntAbbrev, neighborAbbrev]

    # Key: the point line, joined as a string
    # Value: list of that point's neighbor lines (as lists of strings)
    orphanKey = '(MISSING POINT LINE)'  # For neighbors with no point line
    neighborsByPoint = {orphanKey: []}

    activeHeader = []   # Latest header, if its sample type is of interest
    activePoint = []    # Latest point accepted within activeHeader

    for row in dataLines:
        if row[0] not in relevantTypes:
            continue

        if isType(row, focalAbbrev):
            # A new focal always clears the point.  The header is only kept
            # when the focal's sample type is one we care about.
            if row[6] in sampleProtocols:
                activeHeader = row[:]
            else:
                activeHeader = []
            activePoint = []
            continue

        if activeHeader == []:
            # Not in a focal of interest
            continue

        if isType(row, pntAbbrev):
            # Should always be the same date, but check just in case
            if sameDate(row, activeHeader):
                activePoint = row[:]
                neighborsByPoint['\t'.join(activePoint)] = []
            continue

        if not isType(row, neighborAbbrev):
            continue

        if activePoint == []:
            # Should only happen if the observer messed up somewhere else
            neighborsByPoint[orphanKey].append(row)
        elif sameDate(row, activePoint):
            neighborsByPoint['\t'.join(activePoint)].append(row)

    # Names that are allowed to be nonunique
    placeholderNames = unknSnames.keys()

    flagged = []
    for pointText in sorted(neighborsByPoint):
        neighbors = neighborsByPoint[pointText]
        if len(neighbors) > 3:
            # Too many neighbors.  This point has bigger issues.
            continue

        realNames = [ngh[7] for ngh in neighbors
                     if ngh[7] not in placeholderNames]
        if len(realNames) <= len(set(realNames)):
            # No name occurs more than once
            continue

        if flagged:
            # Blank line between groups.  When this is output to file, a
            # newline will be added.
            flagged.append([])
        flagged.append(pointText.split('\t'))
        flagged.extend(neighbors)

    return flagged
def theseWithoutThose(dataLines, thisType, notThose, butYesThem=(), beforeThem=()):
    '''
    Checks the data in dataLines and collects lines of "type" thisType that
    don't have any notThose lines after them before the next thisType line.

    For example, in the data below:
        1   HDR datadatadatadata
        2   PNT datadatadatadata
        3   HDR datadatadatadata
        4   HDR datadatadatadata
        5   PNT datadatadatadata
        6   NGH datadatadatadata

    theseWithoutThose(theData, HDR, [PNT]) would not return line 1 or 4
    because a PNT line occurs before the next HDR or before the end of data.
    Only line 3 would be returned.

    theseWithoutThose(theData, PNT, [NGH]) would return line 2 but not 5.

    If the optional "butYesThem" is given, then also add the qualification
    that at least one line of at least one type in butYesThem must also
    occur before the next "thisType" line.  For example given "theData"
    above:
        theseWithoutThose(theData, HDR, [PNT], [NGH]) would return no lines.
        theseWithoutThose(theData, HDR, [NGH], [PNT]) would return only
            line 1.

    If the optional "beforeThem" is given, then add the qualification that
    there also mustn't be any "notThose" lines before lines of the type(s)
    included in "beforeThem".  For example, given "theData" below (slightly
    different from above):
        1   HDR datadatadatadata
        2   PNT datadatadatadata
        3   HDR datadatadatadata
        4   NGH datadatadatadata
        5   PNT datadatadatadata
        6   NGH datadatadatadata
        7   HDR datadatadatadata

        theseWithoutThose(theData, 'PNT', ['NGH'], beforeThem = ['HDR'])
            would return line 2
        theseWithoutThose(theData, 'PNT', ['NGH']) would return nothing
        theseWithoutThose(theData, 'HDR', ['PNT'], ['NGH'], ['PNT']) would
            return line 3 (it's essentially asking for HDR with NGH before
            any PNT)

    dataLines is a list of lists of strings, presumed to be all the data
    from a file, stripped and split. [0] in each list of strings is the
    "type" of data recorded in that line.

    thisType is a string, indicating which line types to check for.

    notThose is a list of strings indicating which line types can't follow
    thisType lines.

    butYesThem and beforeThem are also lists (or tuples) of strings.
    thisType and any items in butYesThem cannot be in notThose.

    Returns a list of lists of strings: a copy of each qualifying line.  If
    thisType or an item in butYesThem is in notThose, returns a list with a
    single string that begins with 'ERROR'.
    '''
    if thisType in notThose:
        return ['ERROR (' + thisType + '): Cannot exclude an item type from itself']
    for this in butYesThem:
        if this in notThose:
            return ['ERROR (' + this + '): Cannot require and forbid an item type']

    checkYes = len(butYesThem) > 0

    these = []
    maybeThis = []      # The current candidate for "these"; empty when none
    yesFound = False    # Has the candidate had a "butYesThem" line yet?

    for line in dataLines:
        isThisType = isType(line, thisType)

        if maybeThis != []:
            # There's a candidate.  See what this line means for it.
            if checkYes:
                if line[0] in beforeThem:
                    # When checkYes, notThose can be in beforeThem, so check
                    # beforeThem first
                    if yesFound:    # Hooray!
                        these.append(maybeThis)
                    maybeThis = []
                    yesFound = False
                elif line[0] in notThose:
                    # maybeThis is a "these" but not "without those"
                    maybeThis = []
                    yesFound = False
                elif line[0] in butYesThem:
                    yesFound = True
                elif isThisType and yesFound:   # Winner!
                    these.append(maybeThis)
            elif line[0] in notThose:
                # maybeThis is a "these" but not "without those"
                maybeThis = []
            elif isThisType or line[0] in beforeThem:
                # maybeThis didn't have any of "notThose"
                these.append(maybeThis)
                maybeThis = []

        if isThisType:
            # This line is the new candidate.  It starts with a clean
            # slate: a "butYesThem" line that was found for the previous
            # candidate must not be credited to this one.
            maybeThis = line[:]
            yesFound = False

    # If there's still a candidate, it got to the end of the data without
    # finding any of "those".  Add it, unless a "butYesThem" line was
    # required and never found.
    if maybeThis != [] and (yesFound or not checkYes):
        these.append(maybeThis)

    return these
def checkUniqueNeighbors(dataLines, sampleProtocols):
    '''
    Checks all the recorded neighbors for each point collected during
    sampleProtocols samples to make sure that the list of neighbors is unique.
    Ideally, only check juvenile samples because the adult female protocol
    allows for some redundancy.

    Returns a list of lists of strings: the point lines that have non-unique
    neighbors, and all of the point's associated neighbor lines. Each point
    after the first is preceded by an empty list, which becomes a blank line
    when the result is written to file. Points are reported in the order they
    were first read from dataLines.

    When considering uniqueness of neighbors, placeholder names (any names in
    constants.unknSnames) are ignored.

    Neighbor lines read during a focal of interest before any point line are
    grouped under a '(MISSING POINT LINE)' placeholder.

    dataLines is a list of lists of strings, presumed to be all the data from
    a file, stripped and split. Focal header lines are checked at [6] against
    sampleProtocols, and neighbor lines are compared by the name at [7].

    sampleProtocols is a list of strings.
    '''
    from collections import OrderedDict
    from constants import unknSnames, focalAbbrev, pntAbbrev, neighborAbbrev

    relevantTypes = [focalAbbrev, pntAbbrev, neighborAbbrev]

    # Holds the last-read "header" line, but only for focals of the type(s)
    # allowed by sampleProtocols
    currentHeader = []
    # Holds the last-read "point" line, but only if it was in an allowed focal
    currentPoint = []

    # Key for neighbors that are missing a point line
    missingPntKey = '(MISSING POINT LINE)'

    # Points and their neighbors.
    #   Key: the point line--joined as a string
    #   Value: list of the neighbor lines (as lists of strings) for the point
    # Ordered so that the report follows the order of the data on every
    # Python version.
    myPnts = OrderedDict()
    myPnts[missingPntKey] = []

    for line in dataLines:
        if line[0] not in relevantTypes:
            # Then we don't care about it for this question
            continue

        if isType(line, focalAbbrev):
            # Only remember headers for focal samples of interest. Either
            # way, a new header means there's no current point yet.
            currentHeader = line[:] if line[6] in sampleProtocols else []
            currentPoint = []
            continue

        # All that's left are points and neighbors. If currentHeader is
        # empty, they aren't in a focal of interest.
        if currentHeader == []:
            continue

        if isType(line, pntAbbrev):
            # The date should always match, but check just in case
            if sameDate(line, currentHeader):
                currentPoint = line[:]
                myPnts['\t'.join(currentPoint)] = []
        elif isType(line, neighborAbbrev):
            if currentPoint == []:
                # This should only happen if the observer messed up
                # somewhere else
                myPnts[missingPntKey].append(line)
            elif sameDate(line, currentPoint):
                myPnts['\t'.join(currentPoint)].append(line)

    # Names that are allowed to be nonunique
    fakeNames = set(unknSnames.keys())

    # Holds the point and neighbor lines with nonunique neighbors
    nonUniqueNeighbors = []

    # .items() (not the Python-2-only .iteritems()) works on Python 2 and 3
    for (point, neighbors) in myPnts.items():
        nghNames = [neighbor[7] for neighbor in neighbors if neighbor[7] not in fakeNames]

        if len(nghNames) > len(set(nghNames)):
            # Then 1 or more neighbors is redundant
            if nonUniqueNeighbors:
                # For every instance after the first, add an empty line
                # first. When this is output to file, it becomes a newline.
                nonUniqueNeighbors.append([])
            nonUniqueNeighbors.append(point.split('\t'))
            nonUniqueNeighbors.extend(neighbors)

    return nonUniqueNeighbors
def errorAlertSummary(dataLines):
    '''
    Runs every alert/error check on dataLines and gathers the results into
    one block of text.

    dataLines is a list of list of strings, presumed to be all the data from
    a file, stripped and split. Each list of strings should have a
    'yyyy-mm-dd' date at [2], and a hh:mm:ss time at [3]. A code indicating
    the sample's "type" should be at [0].

    The checks cover apparent errors in the data, and also things that may
    not be "wrong" but may indicate problems:
        -- Same focal collected more than once/day
        -- More than 1 group sampled in a single day
        -- Focals with overlapping times
        -- Focals with no data at all
        -- Focals with no points
        -- Focals with >10 points
        -- Points w/ no neighbors (exclude out of sight points)
        -- Neighbors w/o a preceding PNT
        -- Points w/ >3 neighbors
        -- Points from juvenile samples with non-unique neighbors
        -- Neighbors w/o an N0/N1/N2 code
        -- Notes on days w/o any focals
        -- Actor == Actee
        -- Actor or Actee is a non-sname placeholder (NULL, XXX, 998, etc.)
        -- Neighbor is a non-sname placeholder ('IMM', 'INF')
        -- Notes lines possibly containing mounts, ejaculations, or consorts
        -- Non-note lines that recorded mounts, ejaculations, or consorts
        -- Mounts/Ejaculations/Consorts not during a focal
        -- Mounts/Ejaculations/Consorts not involving the focal individual

    Not implemented, but maybe worth adding:
        -- JM's AS/OS/DSing AF's
        -- Actor/actee in different groups

    Each check's results are handed to writeAlert along with a short
    description, and the text that comes back is added to the summary in the
    order listed above.

    Returns a single string that will include several line breaks.
    '''
    from constants import focalAbbrev, pntAbbrev, neighborAbbrev, noteAbbrev, adlibAbbrev, outOfSightValue, stypeJuv, p8_nghcodes, bb_mount, bb_ejaculation, bb_consort, bb_mount_long, bb_ejaculation_long, bb_consort_long, bb_consort_long2

    # The summary header comes first
    summary = ['------Alerts and Errors:\n']

    def tabJoined(rows):
        # Turn each list of strings back into one tab-delimited string
        return ['\t'.join(row) for row in rows]

    def addAlert(description, entries):
        # Every alert is followed by an extra line break
        summary.append(writeAlert(description, entries) + '\n')

    # Individuals sampled >1x/day
    addAlert('duplicate (date, sname) pairs',
             tabJoined(checkDuplicateFocals(dataLines)))

    # >1 group sampled in one day
    addAlert('>1 group sampled in a day',
             tabJoined(checkDuplicateGroups(dataLines)))

    # Overlapping focals, each shown as "date time ID"
    overlapPairs = []
    for (firstFocal, secondFocal) in checkFocalOverlaps(dataLines):
        firstText = ' '.join([firstFocal[2], firstFocal[3], firstFocal[5]])
        secondText = ' '.join([secondFocal[2], secondFocal[3], secondFocal[5]])
        overlapPairs.append(', '.join([firstText, secondText]))
    addAlert('overlapping focals', overlapPairs)

    # Focal samples with no data
    anyDataTypes = [pntAbbrev, neighborAbbrev, noteAbbrev, adlibAbbrev]
    addAlert('focal samples with no data',
             tabJoined(theseWithoutThose(dataLines, focalAbbrev, anyDataTypes)))

    # Focal samples with no points
    addAlert('focal samples without points',
             tabJoined(theseWithoutThose(dataLines, focalAbbrev, [pntAbbrev])))

    # Focal samples with >10 points
    crowdedFocals = []
    for (focal, count) in countPointsPerFocal(dataLines):
        if count > 10:
            crowdedFocals.append('\t'.join(focal) + '; ' + str(count) + ' points')
    addAlert('focal samples with > 10 points', crowdedFocals)

    # In-sight points with no neighbors
    lonelyPoints = theseWithoutThose(dataLines, pntAbbrev, [neighborAbbrev], beforeThem = [focalAbbrev])
    addAlert('in-sight points w/o neighbors',
             tabJoined(point for point in lonelyPoints if point[6] != outOfSightValue))

    # Neighbors without a preceding point. This occurs in two different
    # ways: neighbor lines occur just after a focal starts and before any
    # points, or a point is followed by >3 neighbors.
    # First, neighbors after focals
    headerThenNeighbor = theseWithoutThose(dataLines, focalAbbrev, [pntAbbrev], [neighborAbbrev], [pntAbbrev])
    addAlert('header-then-neighbor, with no ' + pntAbbrev,
             tabJoined(headerThenNeighbor))

    # Second, points with >3 neighbors
    addAlert('points with >3 neighbors',
             [point for (point, count) in checkNeighborsPerPoint(dataLines) if count > 3])

    # Non-unique neighbors in juvenile samples
    addAlert('non-unique neighbors in juvenile samples',
             tabJoined(checkUniqueNeighbors(dataLines, [stypeJuv])))

    # Neighbors without appropriate neighbor codes
    addAlert('neighbors w/o neighbor codes',
             tabJoined(line for line in dataLines
                       if isType(line, neighborAbbrev) and line[-1] not in p8_nghcodes))

    # Notes on days without any focals
    addAlert('notes on days without any focals',
             tabJoined(checkNotesNoFocals(dataLines)))

    # Data where actor is actee, or focal is neighbor
    addAlert('lines where actor is actee, or focal is neighbor',
             tabJoined(checkActorIsActee(dataLines)))

    # Data where actor or actee is a non-sname placeholder
    addAlert('lines where actor or actee is a non-sname placeholder',
             tabJoined(checkActorActeeNotReal(dataLines)))

    # Lines where neighbor is a non-sname placeholder (different placeholders
    # from ad-libs)
    addAlert('lines where neighbor is a non-sname placeholder',
             tabJoined(checkNeighborNotReal(dataLines)))

    # Behavior codes for mounts, ejaculations, and consorts
    mountEjacConsort = [bb_mount, bb_ejaculation, bb_consort, bb_mount_long, bb_ejaculation_long, bb_consort_long, bb_consort_long2]

    # Notes that appear to contain mounts, ejaculations, or consorts
    addAlert('notes that appear to contain mounts, ejaculations, or consorts',
             tabJoined(checkBehavsInNotes(dataLines, mountEjacConsort)))

    # Mounts, ejaculations, or consorts recorded as regular, legit behaviors
    addAlert('lines with mounts, ejaculations, or consorts recorded as regular, legit behaviors',
             tabJoined(checkSpecificBehavior(dataLines, mountEjacConsort)))

    # Mounts, ejaculations, or consorts recorded outside of a focal sample
    addAlert('lines with mounts, ejaculations, or consorts recorded outside of a focal sample',
             tabJoined(checkMountsConsortsDuringFocal(dataLines)))

    # Mounts, ejaculations, or consorts not involving the focal individual
    addAlert('lines with mounts, ejaculations, or consorts not involving the focal individual',
             tabJoined(checkMountsConsortsInvolvedFocal(dataLines)))

    return '\n'.join(summary)