def merge_data(data, infile, outfile=None):
    """
    Merge the data generated from parsing ELAN files with an existing CSV file

    Every row of the existing file is copied to the output. A row whose
    COLUMN_SUBJECT and COLUMN_STIMULUS values, concatenated, form a key of
    `data` is updated with that entry first; any other row is copied
    unchanged with the new columns left blank. Columns named in the
    module-level `headers` that the existing file lacks are appended to the
    output header.

    The result is written to a scratch file beside the destination and only
    then moved over it, so a failure while merging leaves the destination
    untouched.

    -Arguments:
        * data -- data dictionary to be added to the existing file
        * infile -- filename of the existing CSV file
        * outfile -- filename for the new file.
                     Default: None -- will update the infile in place
    -Return: nothing
    """
    if VERBOSE:
        # "%s %s" reproduces the spacing of the two-argument print statement.
        print("%s %s" % ("Merging with existing file: ", infile))

    target = infile if outfile is None else outfile
    # Keep the scratch file in the destination directory: os.rename cannot
    # move a file across filesystems.
    tmpfile = os.path.join(os.path.dirname(os.path.abspath(target)),
                           ".elanfmt-tmp")
    csvformat = dict(delimiter=",", quotechar='"', quoting=csv.QUOTE_ALL,
                     lineterminator="\n")

    try:
        # Binary mode is what the Python 2 csv module expects.
        with open(infile, "rb") as fin:
            with open(tmpfile, "wb") as fout:
                #initialize the input file reader
                csvin = csv.DictReader(fin, **csvformat)

                #generate the complete field list for output file
                #(fieldnames is None when the input file is empty)
                fields = list(csvin.fieldnames or [])
                for field in headers:
                    if field not in fields:
                        fields.append(field)

                #initialize the output file writer and write the header row
                csvout = csv.DictWriter(fout, fields, **csvformat)
                csvout.writerow(dict(zip(fields, fields)))

                #for each line in the input file, update with the
                #corresponding line of data, then write it out
                for line in csvin:
                    try:
                        update = data[line[COLUMN_SUBJECT] + line[COLUMN_STIMULUS]]
                    except KeyError:
                        #no data for this line: copy the existing values,
                        #leaving the new columns blank
                        row = filldict(fields, "")
                        row.update(line)
                    else:
                        line.update(update)
                        row = line
                    csvout.writerow(row)
    except BaseException:
        # Do not leave a half-written scratch file behind.
        if os.path.isfile(tmpfile):
            os.remove(tmpfile)
        raise

    if VERBOSE:
        if outfile is not None:
            print("%s %s" % ("Outputting to new file: ", outfile))
        else:
            print("%s %s" % ("Outputting to same file: ", infile))

    # The scratch file is deliberately NOT removed if this step fails: once
    # the old destination has been deleted it holds the only copy of the data.
    if os.path.isfile(target):
        os.remove(target)
    os.rename(tmpfile, target)
def parse_elan(elanfile, subjectID, stimID, filterdic=None, checkTranscript=False, includeNonWord=False):
    """
    Parse an ELAN generated annotation file

    -Arguments:
        * elanfile -- the filename of the ELAN (XML) file to parse
        * subjectID -- identifying info for the current participant
        * stimID -- identifier for the current stimulus; together with
                    subjectID identifies a row of output
        * filterdic -- filter to apply to determine what info to extract.
                    Looked up by upper-cased TIER_ID; each entry is tested
                    with `in` against the lower-cased annotation text and
                    against the item's 1-based position given as a string.
                    Default=None : return only latency information
        * checkTranscript -- if true, consult an external transcript file to
                    supply new information. Default=False
                    (accepted but not used by this function)
        * includeNonWord -- if true, include non word sounds (++SOUND++
                    entries) as words (for purposes of indexing). When
                    false they are skipped when counting positions, but can
                    still be matched by keyword.
    -Return
        * data -- dictionary containing an entry for each of the desired
                  fields given by filterdic: "Onset<Tier><Match>" and
                  "Duration<Tier><Match>" as ints, in addition to the
                  subject, stimulus and latency columns.
    """
    #Parse the XML ELAN file and gather the pertinent parts
    doc = minidom.parse(elanfile)
    times = doc.getElementsByTagName("TIME_SLOT")
    tiers = doc.getElementsByTagName("TIER")

    #write row identifying information
    dataline = filldict(headers, "-")  #(make sure all datalines have same fields
    dataline.update({COLUMN_SUBJECT: subjectID,
                     COLUMN_STIMULUS: stimID,
                     COLUMN_LATENCY: times[0].attributes["TIME_VALUE"].value})

    #Default behavior, return only latency
    if filterdic is None:
        return dataline

    for tier in tiers:
        tier_id = tier.attributes["TIER_ID"].value
        key = tier_id.upper()
        #only process tiers the filter has entries for
        if key not in filterdic:
            continue
        wanted = filterdic[key]

        nwOffset = 0  #number of non-word entries seen so far in this tier
        for i, item in enumerate(tier.getElementsByTagName("ALIGNABLE_ANNOTATION")):
            #childNodes[1] is taken to be the element holding the annotation
            #text (presumably preceded by a whitespace node) -- TODO confirm.
            #An empty annotation has no text node; treat it as "".
            text_node = item.childNodes[1].firstChild
            itemdata = text_node.data.lower() if text_node is not None else ""

            #++SOUND++ entries do not count as words unless includeNonWord
            is_nonword = (not includeNonWord
                          and itemdata == "++" + itemdata[2:-2] + "++")
            position = str(i + 1 - nwOffset)
            if is_nonword:
                #Count EVERY non-word entry, matched or not; otherwise the
                #positions of all later words in the tier are shifted.
                nwOffset += 1

            #Remember that both keyword and index may match at the same time
            keyword_hit = itemdata in wanted
            index_hit = (not is_nonword) and (position in wanted)
            if not (keyword_hit or index_hit):
                continue

            try:
                #extract timing information
                #slot references are read as a two-character prefix followed
                #by a 1-based slot number, used as an index into `times`
                timeindex1 = int(item.attributes["TIME_SLOT_REF1"].value[2:]) - 1
                timeindex2 = int(item.attributes["TIME_SLOT_REF2"].value[2:]) - 1
                onset = int(times[timeindex1].attributes["TIME_VALUE"].value)
                duration = int(times[timeindex2].attributes["TIME_VALUE"].value) - onset
            except (ValueError, IndexError, KeyError):
                #non-numeric value, dangling slot reference, or a slot
                #without a TIME_VALUE: report and skip this item
                print("WARNING: bad timing info at: \n\tsubject %s,\n\tstimulus %s,\n\ttier %s,\n\titem# %s" % (subjectID, stimID, tier_id, i+1))
                continue

            #for keyword search
            if keyword_hit:
                filterpair = key.capitalize() + itemdata.capitalize()
                dataline["Onset" + filterpair] = onset
                dataline["Duration" + filterpair] = duration

            #for index search
            if index_hit:
                filterpair = key.capitalize() + position
                dataline["Onset" + filterpair] = onset
                dataline["Duration" + filterpair] = duration

    return dataline