def __init__(self, ns=u'http://purl.org/linguistics/data/'):
    """
    Initialize the reader.

    @type ns: unicode
    @param ns: user-defined namespace for generated data; defaults to
        the purl.org linguistics data namespace
    """
    # Namespace under which data items will be minted.
    self.ns = ns

    # A single Praat-to-unicode converter serves this input file.
    # NOTE(review): could become a shared/global instance if several
    # files are ever processed in one batch.
    self.converter = CharConverter('praat', 'uni')

    # Tier names discovered in the file, used later for interactive
    # tier description.
    self.user_defined_types = []

    # Time points hashed for fast lookup.
    self.time_points = {}

    # Parsed linguistic units, grouped by kind.
    self.segments = []
    self.morphemes = []
    self.translations = []
    self.notes = []
def __init__(self):
    """Set up per-file state for reading a single input file."""
    # One converter per input file; the input is assumed to be in
    # XSAMPA for now.  (Might become global if files are ever read
    # in a batch.)
    self.converter = CharConverter('xsampa', 'uni')

    # Tier types named by the user; used later for interactive tier
    # description.
    self.user_defined_types = []

    # Hash of time points for quick lookup.
    self.time_points = {}
def toLatex(self, out_file):
    """
    Write this collection of IGTs to *out_file* as a LaTeX document,
    converting phonetic material from IPA to TIPA on the way.

    out_file
        name of the .tex file to create (written as UTF-8)
    """
    ipa2tipa = CharConverter('ipa', 'tipa')
    header = u'%This document was autogenerated using from a Python script.\n\documentclass[letter,12pt]{article}\n\\usepackage{linguex}\n\n\\usepackage[tone,extra]{tipa}\n\n\\begin{document}\n\n'

    tex = codecs.open(out_file, 'w', 'utf-8')
    tex.write(header)

    for igt in self.igts:
        # one linguex example environment per IGT
        tex.write(u'\exg.')
        tex.write(u'\t')

        # phones: IPA -> TIPA, then clean up for the tipa package
        phones = printTipaList(igt.phons)
        phones = ipa2tipa.convert(phones)
        phones = tipaClean(phones)
        tex.write(phones + u'\\\\\n')

        # glosses: escape underscores, a LaTeX control character
        gloss_line = u'\t' + printList(igt.glosses) + u'\\\\\n'
        gloss_line = gloss_line.replace('_', '\_')
        tex.write(gloss_line)

        tex.write(u'\t' + igt.translation + u'\n\n')

    tex.write(u'\n\n\end{document}')
    tex.close()
def __init__(self):
    """Initialize reader state for one input file."""
    # Exactly one character converter per input file is created;
    # the file is assumed to be XSAMPA-encoded for now.
    # NOTE(review): may want a global converter for batch reads.
    self.converter = CharConverter("xsampa", "uni")

    # Filled in while reading; supports interactive tier description.
    self.user_defined_types = []

    # Time-point lookup table.
    self.time_points = {}
class PraatReader(): """ A class for parsing and processing Praat TextGrid files """ #from Termset # def __init__(self,ns=u'http://purl.org/linguistics/data/termset/'): # """ # Init the termset and set its namespace # # @type ns: unicode # @param ns: a unicode obj representing a namespace # """ # self.namespace = Namespace(ns) # # self.ontology = Ontology() def __init__(self,ns=u'http://purl.org/linguistics/data/'): #assign user defined namespace, or simply use default self.ns=ns #one converter per input file is read #(maybe change this to global if multiple files are to be read in a batch? self.converter=CharConverter('praat','uni') #the universe to populate #self.universe=LingUniverse() #self.universe=linguniv #will be used for interactive tier description self.user_defined_types=[] #for hashing time points, for speed self.time_points={} #init basic data members self.segments=[] self.morphemes=[] self.translations=[] self.notes=[] #def setNS(self,ns): # """ # Set the namespace for the ontology. # @type ns: str # @param ns: a namespace string # @rtype: # @return: # """ def setURI(self,uri): """ Ses the URI for the ontology. @type uri: str @param uri: a string representing a URI """ self.uri=uri self.ontology=Ontology(uri) def readPraat(self,filename): """ Reads and parses a Praat TextGrid file. """ try: self.input_file=open(filename, 'r') print('Trying to open '+filename+'...') except IOError: print error print('Make sure the path is correct.') print 'Reading Praat file...' #test input file for proper type try: file_lines=self.input_file.readlines(8) if file_lines[1]!='Object class = "TextGrid"\n': raise IOError() print 'This Praat file is good.' 
#go to beginning of file self.input_file.seek(0,0) #temp variables current_type='' #get user defined ling types for line in self.input_file: #find the name of the tier if line.find('name = "')!=-1: self.user_defined_types.append(findQuoted(line)) self.input_file.seek(0,0) #BEGIN MAIN ALGORITHM print 'Processing contents...' #process line by line for line in self.input_file: #reset tier type as different types are encountered if line.find('IntervalTier')!=-1: current_type='Interval' elif line.find('PointTier')!=-1: current_type='Point' else: current_type=current_type #find the name of the tier if line.find('name = "')!=-1: dstr_role = findQuoted(line) #for handling individual intervals if line.find('intervals [')!=-1: try: time_interval=[] data='' time_interval.append(findNum(self.input_file.next())) time_interval.append(findNum(self.input_file.next())) data=findQuoted(self.input_file.next()) #only build an interval if the text has content if (data.isspace()!=1 and data!=''): self.handleData(data,dstr_role,time_interval) except StopIteration: pass except IOError: print "Input file is not well formed or not of type TextGrid." print file_lines[1] def getUniverse(self): """ A method for returning the ling. universe """ return self.universe def handleData(self,data,dstr_role,time_interval): """ Decide on the linguistic unit to instantiate data string repr. of linguistic form or grammar unit label dstr_role how the data is used in the Praat file (gloss, translation, etc) time_interval time alignment Instantiates units and add them to ling universe """ start=time_interval[0] end=time_interval[1] #build data obj's and add to universe if dstr_role=='segment': data=self.converter.convert(data) # #deal w. 
linguniv #self.universe.addData(FormUnit('Munken',data,start,end)) #self.segments.append(FormUnit('Munken',data,start,end)) pass #print data, 'is a segment' elif dstr_role=='morpheme': #self.universe.addData(Morpheme('Munken',data,start,end)) #elif dstr_role=='Checked tone': #self.universe.addData(FormUnit(data)????) #self.morphemes.append(Morpheme('Munken',data,start,end)) print data, ' is a Morpheme' elif dstr_role=='translation': #self.universe.addData(FormUnit('English',data,start,end)) #self.translations.append(FormUnit('English',data,start,end)) print data, ' is a translation' #bug here: notes show up as forms of morphemes #elif dstr_role=='notes': # self.universe.addData(FormUnit('English',data,start,end)) def printIGT(self,out_file): """ build IGT objects based on number of translations (since trans are common to the set of morphemes and segments ); assumes intervals are ordered (segment, syllable, morpheme, trans, notes) """ igts=[] for t in self.translations: temp_morphemes=[] temp_segments=[] #loop over morphemes to find those that are assoc'd with a translation for g in self.morphemes: if g.start>=t.start and g.end<=t.end: temp_morphemes.append(g.label) #loop over segment transcriptions to find those associated with each morpheme for p in self.segments: if p.start==g.start and p.end==g.end: #print p.text temp_segments.append(p.getString()) igts.append(IGT(temp_segments, temp_morphemes, t.segments)) c=IGTCollection(igts) print 'there are', len(self.segments),'segment transcriptions' #print 'there are', len(self.syllables),'syllables' print 'there are', len(self.morphemes),'morphemes' print 'there are ', len(self.translations),' translations' print 'there are ', len(self.notes),' notes' for t in self.translations: print t.segments+' ', #c.toStndOut() c.toLatex(out_file)
def parseBibtex(self, bibfile): """ Parse a bibtext file and extract entries @type bibfile: string @param bibfile: the filename to be read """ f = open(bibfile, 'r') file_lines = f.readlines() #use the character converter from eltk.utils converter = CharConverter('latex', 'uni') #loop over input file for line in file_lines: #replace latex special character codes w. unicode # #only works for this latex style: \x{y} # #this won't work: \{xy} # line = converter.convert(line) #if 'author' in line: print line #get rid of leading and trailing spaces line = line.strip() #skip over blank lines if line == '': continue #begin entry if line[0] == '@': if '@preamble' in line: pass #get the name for the entry entry_name = line[line.find("{") + 1:line.find(",")] #create a new entry self.entries[entry_name] = Entry() #assign id using original bib files entry name self.entries[entry_name].id = entry_name #find the entry's type type = line[1:line.find('{')].lower() #normalize the type to camel-case, eg 'InCollection', not 'Incollection' type = type.capitalize() type = type.replace('Inproceedings', 'InProceedings') type = type.replace('Incollection', 'InCollection') type = type.replace('Inbook', 'InBook') type = type.replace('Phdthesis', 'PhdThesis') type = type.replace('Mastersthesis', 'MastersThesis') type = type.replace('Techreport', 'TechReport') self.entries[entry_name].type = type #if field uses " and also contains internal {}'s if line.find('\"') < line.find('{') and line.find('{') != -1: #delete internal braces line.replace('{', '') line.replace('}', '') #normalize in favor of {}'s, not quotes #but leave internal quotes line = line.replace('\"', '{', 1) if line.count('\"') == 1: line = line.replace('\"', '}') else: line = rreplace(line, '\"', '}') #process fields, line by line #non-integer fields if 'author' in line: #pick out string containing authors authors_string = line[line.find('{') + 1:line.rfind('}')] #reformat the string and add to object 
self.entries[entry_name].authors = findAuthors(authors_string) #set authors_string in Entry self.entries[entry_name].authors_string = authors_string if 'editor' in line: #pick out string containing editors editors_string = line[line.find('{') + 1:line.rfind('}')] #reformat the string and add to object (use same as authors) self.entries[entry_name].editors = findAuthors(editors_string) #set editors_string in Entry self.entries[entry_name].editors_string = editors_string if 'title' in line: #don't use title case due to bug in title() function (ie if there's a non-ascii char, then the next char get capitalized regardless of whether it's at the beginning of a word.) self.entries[entry_name].title = line[line.find('{') + 1:line.rfind('}')] #.title() if 'booktitle' in line: #don't use title case as per note above self.entries[entry_name].booktitle = line[ line.find('{') + 1:line.rfind('}')] #.title() if 'journal' in line: self.entries[entry_name].journal = line[line.find('{') + 1:line.rfind('}')] if 'pages' in line: self.entries[entry_name].pages = line[line.find('{') + 1:line.rfind('}')] if 'publisher' in line: self.entries[entry_name].publisher = line[line.find('{') + 1:line.rfind('}')] if 'address' in line: self.entries[entry_name].address = line[line.find('{') + 1:line.rfind('}')] if 'location' in line: self.entries[entry_name].location = line[line.find('{') + 1:line.rfind('}')] if 'school' in line: self.entries[entry_name].school = line[line.find('{') + 1:line.rfind('}')] if 'organization' in line: self.entries[entry_name].organization = line[line.find('{') + 1:line.rfind('}')] if 'institution' in line: self.entries[entry_name].institution = line[line.find('{') + 1:line.rfind('}')] if 'series' in line: self.entries[entry_name].series = line[line.find('{') + 1:line.rfind('}')] if 'edition' in line: self.entries[entry_name].edition = line[line.find('{') + 1:line.rfind('}')] if 'howpublished' in line: self.entries[entry_name].howpublished = line[line.find('{') + 
1:line.rfind('}')] if 'month' in line: self.entries[entry_name].month = line[line.find('{') + 1:line.rfind('}')] if 'note' in line: self.entries[entry_name].note = line[line.find('{') + 1:line.rfind('}')] #various identifiers if 'doi' in line: self.entries[entry_name].doi = line[line.find('{') + 1:line.rfind('}')] if 'isbn' in line: self.entries[entry_name].isbn = line[line.find('{') + 1:line.rfind('}')] if 'issn' in line: self.entries[entry_name].issn = line[line.find('{') + 1:line.rfind('}')] if 'lccn' in line: self.entries[entry_name].lccn = line[line.find('{') + 1:line.rfind('}')] #integer fields if 'year' in line: self.entries[entry_name].year = findInt(line) if 'volume' in line: self.entries[entry_name].volume = findInt(line) if 'number' in line: self.entries[entry_name].number = findInt(line) if 'chapter' in line: self.entries[entry_name].chapter = findInt(line) print 'Found ' + str(len(self.entries)) + ' entries' #post processing to fix entries w. editor and no author and #to replace id with the convention form makeID() #loop through entries keys = self.entries.keys() for k in keys: new_id = '' #make the new ID if self.entries[k].authors_string == '': new_id = makeID(self.entries[k].editors_string, self.entries[k].year) else: new_id = makeID(self.entries[k].authors_string, self.entries[k].year) self.entries[new_id] = self.entries.pop(k) self.entries[new_id].id = new_id
class ElanReader(object): """ A class for reading Elan eaf files """ def __init__(self): # one converter per input file is read # (maybe change this to global if multiple files are to be read in a batch? # assume file is in XSAMPA, for now self.converter = CharConverter("xsampa", "uni") # the universe to populate # self.universe=LingUniverse() # will be used for interactive tier description self.user_defined_types = [] # for hashing time points self.time_points = {} def setNS(self, ns): """ Set the namespace for the ontology. @type ns: str @param ns: a namespace string @rtype: @return: """ self.ns = ns def setOntologyURI(self, uri): """ Ses the URI for the ontology. @type uri: str @param uri: a string representing a URI """ self.ontology = Ontology(uri) def readElan(self, f): """Reads and parses an Elan file and populates the ling universe @type f:string @param f:an eaf file name """ print "Reading Elan file..." self.file_in = open(f, "r") # return IOError if root of eaf is not ANNOTATION_DOCUMENT try: file_lines = self.file_in.readlines(2) if file_lines[1][1:20] != "ANNOTATION_DOCUMENT": raise IOError print "This Elan file is good." except IOError: print "Input file is not well formed or not of type 'eaf'." # go to beginning of file self.file_in.seek(0, 0) # begin xml processing self.dom = parse(self.file_in) # store time points in a dict. for later use for t in self.dom.getElementsByTagName("TIME_SLOT"): self.time_points[t.getAttribute("TIME_SLOT_ID")] = t.getAttribute("TIME_VALUE") # store alignable elements in a dict. for later use self.alignable_elems = self.dom.getElementsByTagName("ALIGNABLE_ANNOTATION") # BEGIN MAIN ALGORITHM: print "Processing contents..." 
# process tier by tier, based on whether they contain alignable or reference annotations tier_elems = self.dom.getElementsByTagName("TIER") # get user defined ling types for tier in tier_elems: self.user_defined_types.append(tier.getAttribute("LINGUISTIC_TYPE_REF")) # print self.user_defined_types for tier in tier_elems: # dstr_role='data structure role' # used to decide on what type of linguistic unit to instantiate dstr_role = tier.getAttribute("LINGUISTIC_TYPE_REF") alignable_elems = tier.getElementsByTagName("ALIGNABLE_ANNOTATION") if len(alignable_elems) > 0: self.handleAnnotation(alignable_elems, dstr_role) ref_elems = tier.getElementsByTagName("REF_ANNOTATION") if len(ref_elems) > 0: self.handleAnnotation(ref_elems, dstr_role) # BEGIN OTHER CLASS METHODS def handleAnnotation(self, elems, dstr_role): """ Process ALIGNABLE_ANNOTATION and REF_ANNOTATION elements @type elems: @param elems: the elements to start from @type dstr_role: @param dstr_role: the element type """ # print 'There are ',len(elems),' annotation elements.' for e in elems: annot_val_elems = e.getElementsByTagName("ANNOTATION_VALUE") if annot_val_elems > 0: self.handleAnnotationValue(annot_val_elems, dstr_role, self.findTimeInterval(e)) def handleAnnotationValue(self, annot_val_elems, dstr_role, time_interval): """ Process CDATA associated with ANNOTATION_VALUE elements. annot_val_elems the elements to be processed dstr_role element type to be passed on time_interval time interval list to be passed on Calls handleData(...) 
""" for v in annot_val_elems: for c in v.childNodes: if c.nodeType == 3: self.handleData(c.data, dstr_role, time_interval) def findTimeInterval(self, elem): """Build time interval list elem the element to start from Return a list containing the start and end times """ if elem.tagName == "ALIGNABLE_ANNOTATION": start = elem.getAttribute("TIME_SLOT_REF1") end = elem.getAttribute("TIME_SLOT_REF2") # look up time refs in hash return [self.time_points[start], self.time_points[end]] elif elem.tagName == "REF_ANNOTATION": ref = elem.getAttribute("ANNOTATION_REF") for e in self.alignable_elems: if e.getAttribute("ANNOTATION_ID") == ref: # recursive call return self.findTimeInterval(e) def handleData(self, data, dstr_role, time_interval): """Decide on which linguistic units to instantiate. data string repr. of linguistic form or grammar unit label dstr_role how the data is used in the Elan file (gloss, translation, etc) time_interval time alignment Instantiates units and adds to ling. universe """ start = float(time_interval[0]) end = float(time_interval[1]) # build data obj's and add to universe if dstr_role == "Sentence-level Transcription": # convert unicode to string (nec. 
for character converter) if type(data) == unicode: data = str(data) data = self.converter.convert(data) print data, " is a Clause or Phrase" # generalize this later data = data.split() for d in data: print d, " is a SyntacticWord" w = gold.SyntacticWord(self.ns + makeID(d), []) # self.universe.addData(FormUnit('Koshin',d,start,end)) start = start + 0.00001 # self.universe.addData(FormUnit(data,start,end)) elif dstr_role == "Morpheme": # generalize this later ############################################### print data, "###data" data = data.split() for d in data: print d, "####d" # self.universe.addData(Morpheme('Koshin',d,start,end)) start = start + 0.00001 ################################################ # self.universe.addData(Morpheme(data,start,end)) # elif dstr_role=='Checked tone': # self.universe.addData(FormUnit(data,start,end)) elif dstr_role == "Sentence-level Translation": # print 'trans' pass # self.universe.addData(FormUnit('English',data,start,end)) # elif dstr_role=='Notes': # self.universe.addData(FormUnit(data,start,end) def writeData(self, outfile): self.ontology.save(outfile)
def toLatex(self, out_file): """ Outputs a tex file, converting from IPA to TIPA for each entry. out_file The name of the tex output file This requires the following tex ackages: tipa, linguex, supertabular """ print 'Printing to LeTex file...' converter = CharConverter('ipa', 'tipa') latex_header = u'%This document was autogenerated using from a Python script.\n\documentclass[letter,12pt]{article}\n\\usepackage{linguex}\n\n\\usepackage[tone,extra]{tipa}\n\\usepackage{supertabular}\n\n\\begin{document}\n\n' outfile = codecs.open(out_file, 'w', 'utf-8') outfile.write(latex_header) ############begin first section of bilingual dictionary###################3 outfile.write(u'\\section*{' + self.obj_lang + u'--' + self.meta_lang + u' Dictionary}\n\n') outfile.write(u'\\begin{supertabular}{lll}\n\n') keys = self.entries_1.keys() keys.sort() for k in keys: #headword head_word = converter.convert(k) head_word = tipaClean(head_word) #alt forms keys2 = self.entries_1[k].alt_forms.keys() alts = '(' for k2 in keys2: alts = alts + k2 + ': ' for i in self.entries_1[k].alt_forms[k2]: if i is not None: #################START HERE with tipa issue #print i+' ', i = converter.convert(i) i = tipaClean(i) #print i alts = alts + '\\textipa{' + i + '}, ' alts = alts + ')' alts = alts.replace(', )', ')') if alts == '()': alts = '' #translation translation = '' for t in self.entries_1[k].translation: translation = translation + t + ', ' translation = translation[:len(translation) - 2] translation = translation.replace('_', '\_') outfile.write(u'\\textbf{\\textipa{' + head_word + u'}}&' + alts + u'&' + translation + u'\\\\\\\\\n') outfile.write(u'\end{supertabular}\n\n') ############begin next section of bilingual dictionary###################3 outfile.write(u'\\section*{' + self.meta_lang + u'--' + self.obj_lang + u' Dictionary}\n\n') outfile.write(u'\\begin{supertabular}{lll}\n\n') keys_2 = self.entries_2.keys() keys_2.sort() for k in keys_2: #head word head_word = k head_word = 
head_word.replace('_', '\_') #unpack translations trans = '' for t in self.entries_2[k].translation: if t is not None: t = converter.convert(t) t = tipaClean(t) trans = trans + t + ', ' else: trans = trans trans = trans[:len(trans) - 2] outfile.write(u'\\textbf{' + head_word + u'}&' + '' + u'&\\textipa{' + trans + u'}\\\\\\\\\n') outfile.write(u'\end{supertabular}\n\n\end{document}') outfile.close()
def parseBibtex(self,bibfile): """ Parse a bibtext file and extract entries @type bibfile: string @param bibfile: the filename to be read """ f=open(bibfile,'r') file_lines=f.readlines() #use the character converter from eltk.utils converter=CharConverter('latex','uni') #loop over input file for line in file_lines: #replace latex special character codes w. unicode # #only works for this latex style: \x{y} # #this won't work: \{xy} # line=converter.convert(line) #if 'author' in line: print line #get rid of leading and trailing spaces line=line.strip() #skip over blank lines if line=='': continue #begin entry if line[0]=='@': if '@preamble' in line: pass #get the name for the entry entry_name=line[line.find("{")+1:line.find(",")] #create a new entry self.entries[entry_name]=Entry() #assign id using original bib files entry name self.entries[entry_name].id=entry_name #find the entry's type type=line[1:line.find('{')].lower() #normalize the type to camel-case, eg 'InCollection', not 'Incollection' type=type.capitalize() type=type.replace('Inproceedings','InProceedings') type=type.replace('Incollection','InCollection') type=type.replace('Inbook','InBook') type=type.replace('Phdthesis','PhdThesis') type=type.replace('Mastersthesis','MastersThesis') type=type.replace('Techreport','TechReport') self.entries[entry_name].type=type #if field uses " and also contains internal {}'s if line.find('\"')<line.find('{') and line.find('{')!=-1: #delete internal braces line.replace('{','') line.replace('}','') #normalize in favor of {}'s, not quotes #but leave internal quotes line=line.replace('\"','{',1) if line.count('\"')==1: line=line.replace('\"','}') else: line=rreplace(line,'\"','}') #process fields, line by line #non-integer fields if 'author' in line: #pick out string containing authors authors_string=line[line.find('{')+1:line.rfind('}')] #reformat the string and add to object self.entries[entry_name].authors=findAuthors(authors_string) #set authors_string in Entry 
self.entries[entry_name].authors_string=authors_string if 'editor' in line: #pick out string containing editors editors_string=line[line.find('{')+1:line.rfind('}')] #reformat the string and add to object (use same as authors) self.entries[entry_name].editors=findAuthors(editors_string) #set editors_string in Entry self.entries[entry_name].editors_string=editors_string if 'title' in line: #don't use title case due to bug in title() function (ie if there's a non-ascii char, then the next char get capitalized regardless of whether it's at the beginning of a word.) self.entries[entry_name].title=line[line.find('{')+1:line.rfind('}')] #.title() if 'booktitle' in line: #don't use title case as per note above self.entries[entry_name].booktitle=line[line.find('{')+1:line.rfind('}')]#.title() if 'journal' in line: self.entries[entry_name].journal=line[line.find('{')+1:line.rfind('}')] if 'pages' in line: self.entries[entry_name].pages=line[line.find('{')+1:line.rfind('}')] if 'publisher' in line: self.entries[entry_name].publisher=line[line.find('{')+1:line.rfind('}')] if 'address' in line: self.entries[entry_name].address=line[line.find('{')+1:line.rfind('}')] if 'location' in line: self.entries[entry_name].location=line[line.find('{')+1:line.rfind('}')] if 'school' in line: self.entries[entry_name].school=line[line.find('{')+1:line.rfind('}')] if 'organization' in line: self.entries[entry_name].organization=line[line.find('{')+1:line.rfind('}')] if 'institution' in line: self.entries[entry_name].institution=line[line.find('{')+1:line.rfind('}')] if 'series' in line: self.entries[entry_name].series=line[line.find('{')+1:line.rfind('}')] if 'edition' in line: self.entries[entry_name].edition=line[line.find('{')+1:line.rfind('}')] if 'howpublished' in line: self.entries[entry_name].howpublished=line[line.find('{')+1:line.rfind('}')] if 'month' in line: self.entries[entry_name].month=line[line.find('{')+1:line.rfind('}')] if 'note' in line: 
self.entries[entry_name].note=line[line.find('{')+1:line.rfind('}')] #various identifiers if 'doi' in line: self.entries[entry_name].doi=line[line.find('{')+1:line.rfind('}')] if 'isbn' in line: self.entries[entry_name].isbn=line[line.find('{')+1:line.rfind('}')] if 'issn' in line: self.entries[entry_name].issn=line[line.find('{')+1:line.rfind('}')] if 'lccn' in line: self.entries[entry_name].lccn=line[line.find('{')+1:line.rfind('}')] #integer fields if 'year' in line: self.entries[entry_name].year=findInt(line) if 'volume' in line: self.entries[entry_name].volume=findInt(line) if 'number' in line: self.entries[entry_name].number=findInt(line) if 'chapter' in line: self.entries[entry_name].chapter=findInt(line) print 'Found '+str(len(self.entries))+' entries' #post processing to fix entries w. editor and no author and #to replace id with the convention form makeID() #loop through entries keys=self.entries.keys() for k in keys: new_id='' #make the new ID if self.entries[k].authors_string=='': new_id=makeID(self.entries[k].editors_string,self.entries[k].year) else: new_id=makeID(self.entries[k].authors_string,self.entries[k].year) self.entries[new_id]=self.entries.pop(k) self.entries[new_id].id=new_id
from itertools import combinations
from random import randint

# model parameters
derivWeight = .90
sampleSize = 100
minmean = 10
iters = 100

# retention rates per Swadesh-list meaning, read from XML
# (value is a percentage, normalized here to 0..1)
retention = {}
xmldoc = minidom.parse('./swadesh-retent.xml')
itemlist = xmldoc.getElementsByTagName('word')
for s in itemlist:
    retention[s.attributes['sem'].value] = float(float(s.attributes['retention'].value) / float(100))

# converter used later for LaTeX/TIPA output
latex_converter = CharConverter('uni', 'tipa')

# per-language word data, keyed by language name (from the file name)
lgg = {}
xml_list = listdir('./LanguageXML')

widgets = ['Reading Language XML: ', Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', ETA()]
pbar = ProgressBar(widgets=widgets, maxval=len(xml_list)).start()

for j in range(len(xml_list)):
    name = xml_list[j]
    lang = name.split('.')[0]
    lgg[lang] = {}
    xmlname = './LanguageXML/' + name
    xmldoc = minidom.parse(xmlname)
    itemlist = xmldoc.getElementsByTagName('word')
    for s in itemlist:
        # NOTE(review): 'sem' attribute presumably identifies the
        # meaning slot -- confirm against the XML schema
        mean = s.attributes['sem'].value
# -*- coding: UTF-8 -*- from eltk.utils.CharConverter import * """ This script demostrates the use of CharConverter """ if __name__ == '__main__': #latex to unicode test c0 = CharConverter('latex', 'uni') print c0.convert('author = {S\\\'{a}ndor Hervey},') #unicode to latex test c1 = CharConverter('uni', 'latex') print c1.convert('ä') #praat to uni test c2 = CharConverter('praat', 'uni') print c2.convert('\\t.o') print 'If you do not see IPA characters, you may not have the required fonts installed.'
def toLatex(self,out_file): """ Outputs a tex file, converting from IPA to TIPA for each entry. out_file The name of the tex output file This requires the following tex ackages: tipa, linguex, supertabular """ print 'Printing to LeTex file...' converter=CharConverter('ipa','tipa') latex_header=u'%This document was autogenerated using from a Python script.\n\documentclass[letter,12pt]{article}\n\\usepackage{linguex}\n\n\\usepackage[tone,extra]{tipa}\n\\usepackage{supertabular}\n\n\\begin{document}\n\n' outfile=codecs.open(out_file,'w','utf-8') outfile.write(latex_header) ############begin first section of bilingual dictionary###################3 outfile.write(u'\\section*{'+self.obj_lang+u'--'+self.meta_lang+u' Dictionary}\n\n') outfile.write(u'\\begin{supertabular}{lll}\n\n') keys=self.entries_1.keys() keys.sort() for k in keys: #headword head_word=converter.convert(k) head_word=tipaClean(head_word) #alt forms keys2=self.entries_1[k].alt_forms.keys() alts='(' for k2 in keys2: alts=alts+k2+': ' for i in self.entries_1[k].alt_forms[k2]: if i is not None: #################START HERE with tipa issue #print i+' ', i=converter.convert(i) i=tipaClean(i) #print i alts=alts+'\\textipa{'+i+'}, ' alts=alts+')' alts=alts.replace(', )',')') if alts=='()': alts='' #translation translation='' for t in self.entries_1[k].translation: translation=translation+t+', ' translation=translation[:len(translation)-2] translation=translation.replace('_','\_') outfile.write(u'\\textbf{\\textipa{'+head_word+u'}}&'+alts+u'&'+translation+u'\\\\\\\\\n') outfile.write(u'\end{supertabular}\n\n') ############begin next section of bilingual dictionary###################3 outfile.write(u'\\section*{'+self.meta_lang+u'--'+self.obj_lang+u' Dictionary}\n\n') outfile.write(u'\\begin{supertabular}{lll}\n\n') keys_2=self.entries_2.keys() keys_2.sort() for k in keys_2: #head word head_word=k head_word=head_word.replace('_','\_') #unpack translations trans='' for t in self.entries_2[k].translation: if t is 
not None: t=converter.convert(t) t=tipaClean(t) trans=trans+t+', ' else: trans=trans trans=trans[:len(trans)-2] outfile.write(u'\\textbf{'+head_word+u'}&'+''+u'&\\textipa{'+trans+u'}\\\\\\\\\n') outfile.write(u'\end{supertabular}\n\n\end{document}') outfile.close()
# -*- coding: UTF-8 -*- from eltk.utils.CharConverter import * """ This script demostrates the use of CharConverter """ if __name__=='__main__': #latex to unicode test c0=CharConverter('latex','uni') print c0.convert('author = {S\\\'{a}ndor Hervey},') #unicode to latex test c1=CharConverter('uni','latex') print c1.convert('ä') #praat to uni test c2=CharConverter('praat','uni') print c2.convert('\\t.o') print 'If you do not see IPA characters, you may not have the required fonts installed.'
class ElanReader(object): """ A class for reading Elan eaf files """ def __init__(self): #one converter per input file is read #(maybe change this to global if multiple files are to be read in a batch? #assume file is in XSAMPA, for now self.converter = CharConverter('xsampa', 'uni') #the universe to populate #self.universe=LingUniverse() #will be used for interactive tier description self.user_defined_types = [] #for hashing time points self.time_points = {} def setNS(self, ns): """ Set the namespace for the ontology. @type ns: str @param ns: a namespace string @rtype: @return: """ self.ns = ns def setOntologyURI(self, uri): """ Ses the URI for the ontology. @type uri: str @param uri: a string representing a URI """ self.ontology = Ontology(uri) def readElan(self, f): """Reads and parses an Elan file and populates the ling universe @type f:string @param f:an eaf file name """ print 'Reading Elan file...' self.file_in = open(f, 'r') #return IOError if root of eaf is not ANNOTATION_DOCUMENT try: file_lines = self.file_in.readlines(2) if file_lines[1][1:20] != 'ANNOTATION_DOCUMENT': raise IOError print 'This Elan file is good.' except IOError: print "Input file is not well formed or not of type 'eaf'." #go to beginning of file self.file_in.seek(0, 0) #begin xml processing self.dom = parse(self.file_in) #store time points in a dict. for later use for t in self.dom.getElementsByTagName('TIME_SLOT'): self.time_points[t.getAttribute('TIME_SLOT_ID')] = t.getAttribute( 'TIME_VALUE') #store alignable elements in a dict. for later use self.alignable_elems = self.dom.getElementsByTagName( 'ALIGNABLE_ANNOTATION') #BEGIN MAIN ALGORITHM: print 'Processing contents...' 
#process tier by tier, based on whether they contain alignable or reference annotations tier_elems = self.dom.getElementsByTagName('TIER') #get user defined ling types for tier in tier_elems: self.user_defined_types.append( tier.getAttribute('LINGUISTIC_TYPE_REF')) #print self.user_defined_types for tier in tier_elems: #dstr_role='data structure role' #used to decide on what type of linguistic unit to instantiate dstr_role = tier.getAttribute('LINGUISTIC_TYPE_REF') alignable_elems = tier.getElementsByTagName('ALIGNABLE_ANNOTATION') if len(alignable_elems) > 0: self.handleAnnotation(alignable_elems, dstr_role) ref_elems = tier.getElementsByTagName('REF_ANNOTATION') if len(ref_elems) > 0: self.handleAnnotation(ref_elems, dstr_role) #BEGIN OTHER CLASS METHODS def handleAnnotation(self, elems, dstr_role): """ Process ALIGNABLE_ANNOTATION and REF_ANNOTATION elements @type elems: @param elems: the elements to start from @type dstr_role: @param dstr_role: the element type """ #print 'There are ',len(elems),' annotation elements.' for e in elems: annot_val_elems = e.getElementsByTagName('ANNOTATION_VALUE') if annot_val_elems > 0: self.handleAnnotationValue(annot_val_elems, dstr_role, self.findTimeInterval(e)) def handleAnnotationValue(self, annot_val_elems, dstr_role, time_interval): """ Process CDATA associated with ANNOTATION_VALUE elements. annot_val_elems the elements to be processed dstr_role element type to be passed on time_interval time interval list to be passed on Calls handleData(...) 
""" for v in annot_val_elems: for c in v.childNodes: if c.nodeType == 3: self.handleData(c.data, dstr_role, time_interval) def findTimeInterval(self, elem): """Build time interval list elem the element to start from Return a list containing the start and end times """ if elem.tagName == 'ALIGNABLE_ANNOTATION': start = elem.getAttribute('TIME_SLOT_REF1') end = elem.getAttribute('TIME_SLOT_REF2') #look up time refs in hash return [self.time_points[start], self.time_points[end]] elif elem.tagName == 'REF_ANNOTATION': ref = elem.getAttribute('ANNOTATION_REF') for e in self.alignable_elems: if e.getAttribute('ANNOTATION_ID') == ref: #recursive call return self.findTimeInterval(e) def handleData(self, data, dstr_role, time_interval): """Decide on which linguistic units to instantiate. data string repr. of linguistic form or grammar unit label dstr_role how the data is used in the Elan file (gloss, translation, etc) time_interval time alignment Instantiates units and adds to ling. universe """ start = float(time_interval[0]) end = float(time_interval[1]) #build data obj's and add to universe if dstr_role == 'Sentence-level Transcription': #convert unicode to string (nec. 
for character converter) if type(data) == unicode: data = str(data) data = self.converter.convert(data) print data, ' is a Clause or Phrase' #generalize this later data = data.split() for d in data: print d, ' is a SyntacticWord' w = gold.SyntacticWord(self.ns + makeID(d), []) #self.universe.addData(FormUnit('Koshin',d,start,end)) start = start + .00001 #self.universe.addData(FormUnit(data,start,end)) elif dstr_role == 'Morpheme': #generalize this later ############################################### print data, '###data' data = data.split() for d in data: print d, '####d' #self.universe.addData(Morpheme('Koshin',d,start,end)) start = start + .00001 ################################################ #self.universe.addData(Morpheme(data,start,end)) #elif dstr_role=='Checked tone': # self.universe.addData(FormUnit(data,start,end)) elif dstr_role == 'Sentence-level Translation': #print 'trans' pass #self.universe.addData(FormUnit('English',data,start,end)) #elif dstr_role=='Notes': # self.universe.addData(FormUnit(data,start,end) def writeData(self, outfile): self.ontology.save(outfile)