Ejemplo n.º 1
0
def merge_data(data, infile, outfile=None):
    """
    Merge the data generated from parsing ELAN files with an existing CSV file
    -Arguments:
     * data -- data dictionary to be added to the existing file
     * infile -- filename of the existing CSV file
     * outfile -- filename for the new file.
      Default: None -- will update the infile in place
    -Return: nothing
    """
    if VERBOSE:
        print "Merging with existing file: ", infile
    tmpfile = ".elanfmt-tmp"

    with open(infile, "rb") as fin:
        with open(tmpfile, "wb") as fout:
            #initialize the input file reader
            csvin = csv.DictReader(fin, delimiter=",", quotechar='"',
                                   quoting=csv.QUOTE_ALL, lineterminator="\n")

            #generate the complete field list for output file
            fields = copy.deepcopy(csvin.fieldnames)
            for field in headers:
                if field not in fields:
                    fields.append(field)

            #initialize the output file writer and output file
            csvout = csv.DictWriter(fout, fields, delimiter=",", quotechar='"',
                                    quoting=csv.QUOTE_ALL, lineterminator="\n")
            csvout.writerow(dict(zip(fields,fields)))

            #for each line in the input file, update with the corresponding line
            #of data then, write to Excel-readable CSV file
            for line in csvin:
                #if there is data for a line, update the line and output.
                try:
                    line.update(data[line[COLUMN_SUBJECT]+line[COLUMN_STIMULUS]])
                    csvout.writerow(line)
                #if not, copy over the existing data, leaving the new columns blank
                except KeyError:
                    datum = filldict(fields, "")
                    for key, value in line.iteritems():
                        datum[key] = value
                    csvout.writerow(datum)

    if outfile is not None:
        if VERBOSE:
            print "Outputting to new file: ", outfile
        if os.path.isfile(outfile):
            os.remove(outfile)
        os.rename(tmpfile, outfile)
    else:
        if VERBOSE:
            print "Outputting to same file: ", infile
        if os.path.isfile(infile):
            os.remove(infile)
        os.rename(tmpfile, infile)
Ejemplo n.º 2
0
def parse_elan(elanfile, subjectID, stimID, filterdic=None, checkTranscript=False,
              includeNonWord=False):
    """
    Parse an ELAN generated annotation file
    -Arguments:
      * elanfile -- the filename of the ELAN (XML) file to parse
      * subjectID -- identifying info for the current participant
      * stimID -- identifier for the current stimulus; together with subjectID
        identifies a row of output
      * filterdic -- filter to apply to determine what info to extract. Keyed
        on upper-cased tier ID; each value holds the annotation texts (lower
        case) and/or 1-based annotation indices (as strings) wanted from
        that tier.
         Default=None : return only latency information
      * checkTranscript -- accepted for compatibility; not used by the
        current implementation
         Default=False
      * includeNonWord -- if true, include non word sounds (++SOUND++
        entries) as words (for purposes of indexing)
         Default=False
    -Return
      * dataline -- dictionary containing the row identifiers, the latency and
        an "Onset..."/"Duration..." pair for each match of the filter.
    """

    #Parse the XML ELAN file and gather the pertinent parts
    doc = minidom.parse(elanfile)
    times = doc.getElementsByTagName("TIME_SLOT")
    tiers = doc.getElementsByTagName("TIER")

    #annotations refer to time slots by ID, so look them up by ID rather than
    #by position in the file (a missing TIME_VALUE is stored as "")
    slotvalues = dict((slot.getAttribute("TIME_SLOT_ID"),
                       slot.getAttribute("TIME_VALUE")) for slot in times)

    #write row identifying information
    dataline = filldict(headers, "-") #(make sure all datalines have same fields
    dataline.update({COLUMN_SUBJECT: subjectID,
                     COLUMN_STIMULUS: stimID,
                     COLUMN_LATENCY: times[0].attributes["TIME_VALUE"].value})

    #Default behavior, return only latency
    if filterdic is None:
        return dataline

    for tier in tiers:
        #only process tiers the filter has entries for
        tierID = tier.attributes["TIER_ID"].value
        key = tierID.upper()
        if key not in filterdic:
            continue
        wanted = filterdic[key]
        prefix = key.capitalize()

        #number of non word entries seen so far in this tier
        nwOffset = 0
        for i, item in enumerate(tier.getElementsByTagName("ALIGNABLE_ANNOTATION")):
            #an annotation without any text yields an empty string
            values = item.getElementsByTagName("ANNOTATION_VALUE")
            textnode = values[0].firstChild if values else None
            itemdata = textnode.data.lower() if textnode is not None else ""

            #++SOUND++ entries never take up an index unless includeNonWord
            #is set; count them before matching, so that the numbering of the
            #following words does not depend on what the filter asks for
            if not includeNonWord and itemdata == "++" + itemdata[2:-2] + "++":
                nwOffset += 1
                index = None
            else:
                index = str(i+1-nwOffset)

            #Determine if either the word/phoneme/region/index/etc is a match
            #(both keyword and index may match at the same time)
            keymatch = itemdata != "" and itemdata in wanted
            indexmatch = index is not None and index in wanted
            if not (keymatch or indexmatch):
                continue

            try:
                #extract timing information: onset and end time of the item
                onset = int(slotvalues[item.getAttribute("TIME_SLOT_REF1")])
                offset = int(slotvalues[item.getAttribute("TIME_SLOT_REF2")])
            except (KeyError, ValueError):
                #unknown time slot, or a slot without a usable time value
                print("WARNING: bad timing info at: \n\tsubject %s,\n\tstimulus %s,\n\ttier %s,\n\titem# %s" % (subjectID, stimID, tierID, i+1))
                continue
            duration = offset - onset

            #for keyword search
            if keymatch:
                filterpair = prefix + itemdata.capitalize()
                dataline["Onset"+filterpair] = onset
                dataline["Duration"+filterpair] = duration
            #for index search
            if indexmatch:
                filterpair = prefix + index
                dataline["Onset"+filterpair] = onset
                dataline["Duration"+filterpair] = duration

    return dataline