Example #1
    def __init__(self,ns=u'http://purl.org/linguistics/data/'):


        #assign user defined namespace, or simply use default
        self.ns=ns 

        #one converter is created per input file
        #(maybe change this to global if multiple files are to be read in a batch?)
        self.converter=CharConverter('praat','uni')
        
        #the universe to populate
        #self.universe=LingUniverse()
        #self.universe=linguniv

        #will be used for interactive tier description
        self.user_defined_types=[]
    
        #for hashing time points, for speed
        self.time_points={}   

       
        #init basic data members 
        self.segments=[]
        self.morphemes=[]
        self.translations=[]
        self.notes=[]
Example #2
    def __init__(self):

        #one converter is created per input file
        #(maybe change this to global if multiple files are to be read in a batch?)
        #assume file is in XSAMPA, for now
        self.converter = CharConverter('xsampa', 'uni')

        #the universe to populate
        #self.universe=LingUniverse()

        #will be used for interactive tier description
        self.user_defined_types = []

        #for hashing time points
        self.time_points = {}
Example #3
    def __init__(self,ns=u'http://purl.org/linguistics/data/'):


        #assign user defined namespace, or simply use default
        self.ns=ns 

        #one converter is created per input file
        #(maybe change this to global if multiple files are to be read in a batch?)
        self.converter=CharConverter('praat','uni')
        
        #the universe to populate
        #self.universe=LingUniverse()
        #self.universe=linguniv

        #will be used for interactive tier description
        self.user_defined_types=[]
    
        #for hashing time points, for speed
        self.time_points={}   

       
        #init basic data members 
        self.segments=[]
        self.morphemes=[]
        self.translations=[]
        self.notes=[]
Example #4
    def toLatex(self,out_file):
       
        #r=TableReader()
        converter=CharConverter('ipa','tipa')
        #map=ipa_tipa.getMap(0,4)
        


        
        latex_header=u'%This document was autogenerated from a Python script.\n\documentclass[letter,12pt]{article}\n\\usepackage{linguex}\n\n\\usepackage[tone,extra]{tipa}\n\n\\begin{document}\n\n'
        outfile=codecs.open(out_file,'w','utf-8')
        outfile.write(latex_header)

        for i in self.igts:
                

            outfile.write(u'\exg.')
            outfile.write(u'\t')

            p=printTipaList(i.phons)
            #print p
            #convert from ipa to tipa 
            #p=stringReplace(map,p)
            p=converter.convert(p)
            #print p
            p=tipaClean(p)
            #print p+'\n'
            outfile.write(p+u'\\\\\n')

            


            #take care of ctrl char's in latex
            g=u'\t'+printList(i.glosses)+u'\\\\\n'
            g=g.replace('_','\_')
            outfile.write(g)
            outfile.write(u'\t'+i.translation+u'\n\n')
        outfile.write(u'\n\n\end{document}')
        outfile.close()
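
A minimal usage sketch (not part of the original source): printIGT() in the PraatReader example below builds an IGTCollection and calls this same toLatex() on it, so typical use looks like the following; the variable names and output file are illustrative.

c = IGTCollection(igts)        #igts: a list of IGT objects, as built in printIGT()
c.toLatex('igt_examples.tex')  #writes a linguex/tipa LaTeX document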
Example #5
    def __init__(self):

        # one converter is created per input file
        # (maybe change this to global if multiple files are to be read in a batch?)
        # assume file is in XSAMPA, for now
        self.converter = CharConverter("xsampa", "uni")

        # the universe to populate
        # self.universe=LingUniverse()

        # will be used for interactive tier description
        self.user_defined_types = []

        # for hashing time points
        self.time_points = {}
Example #6
class PraatReader():

    """
    A class for parsing and processing Praat TextGrid files
    
    """

#from Termset
#    def __init__(self,ns=u'http://purl.org/linguistics/data/termset/'):
        
#        """
#        Init the termset and set its namespace 
#
#        @type ns: unicode
#        @param ns: a unicode obj representing a namespace
#        """       
#        self.namespace = Namespace(ns)
#        
#        self.ontology = Ontology()




    def __init__(self,ns=u'http://purl.org/linguistics/data/'):


        #assign user defined namespace, or simply use default
        self.ns=ns 

        #one converter is created per input file
        #(maybe change this to global if multiple files are to be read in a batch?)
        self.converter=CharConverter('praat','uni')
        
        #the universe to populate
        #self.universe=LingUniverse()
        #self.universe=linguniv

        #will be used for interactive tier description
        self.user_defined_types=[]
    
        #for hashing time points, for speed
        self.time_points={}   

       
        #init basic data members 
        self.segments=[]
        self.morphemes=[]
        self.translations=[]
        self.notes=[]

    #def setNS(self,ns):
    #    """
    #    Set the namespace for the ontology.

    #    @type ns: str
    #    @param ns: a namespace string
    #    @rtype:
    #    @return:
    #    """

    def setURI(self,uri):
        """
        Sets the URI for the ontology.

        @type uri: str 
        @param uri: a string representing a URI
        """

        self.uri=uri
        self.ontology=Ontology(uri)

 

    def readPraat(self,filename):
        """
        Reads and parses a Praat TextGrid file.
        """

        try:
            print 'Trying to open '+filename+'...'
            self.input_file=open(filename, 'r')

        except IOError:
            print 'Could not open '+filename+'.'
            print 'Make sure the path is correct.'
            return

        print 'Reading Praat file...'

        #test input file for proper type
        try:

            file_lines=self.input_file.readlines(8)

            if file_lines[1]!='Object class = "TextGrid"\n':
                raise IOError()

            print 'This Praat file is good.'

            #go to beginning of file
            self.input_file.seek(0,0)

            #temp variables
            current_type=''
            dstr_role=''

            #get user defined ling types
            for line in self.input_file:

                #find the name of the tier
                if line.find('name = "')!=-1:
                    self.user_defined_types.append(findQuoted(line))

            self.input_file.seek(0,0)

            #BEGIN MAIN ALGORITHM
            print 'Processing contents...'

            #process line by line
            for line in self.input_file:

                #reset tier type as different types are encountered
                if line.find('IntervalTier')!=-1:
                    current_type='Interval'

                elif line.find('PointTier')!=-1:
                    current_type='Point'

                #find the name of the tier
                if line.find('name = "')!=-1:
                    dstr_role = findQuoted(line)

                #for handling individual intervals
                if line.find('intervals [')!=-1:
                    try:
                        time_interval=[]
                        time_interval.append(findNum(self.input_file.next()))
                        time_interval.append(findNum(self.input_file.next()))
                        data=findQuoted(self.input_file.next())

                        #only build an interval if the text has content
                        if not data.isspace() and data!='':
                            self.handleData(data,dstr_role,time_interval)

                    except StopIteration:
                        pass

        except IOError:

            print "Input file is not well formed or not of type TextGrid."
            print file_lines[1]
 

    def getUniverse(self):

        """
        A method for returning the ling. universe
        """

        return self.universe



    def handleData(self,data,dstr_role,time_interval):
        
        """
        Decide on the linguistic unit to instantiate 
        
        data            string repr. of linguistic form or grammar unit label
        dstr_role       how the data is used in the Praat file (gloss, translation, etc)
        time_interval   time alignment

        Instantiates units and adds them to the ling universe
        """

        start=time_interval[0]
        end=time_interval[1]


        #build data obj's and add to universe
        if dstr_role=='segment':
            data=self.converter.convert(data)
            #
            #deal w. linguniv
            #self.universe.addData(FormUnit('Munken',data,start,end))
            #self.segments.append(FormUnit('Munken',data,start,end))
            
            pass
            #print data, 'is a segment'

        elif dstr_role=='morpheme':
            
            #self.universe.addData(Morpheme('Munken',data,start,end))
                    #elif dstr_role=='Checked tone':
            #self.universe.addData(FormUnit(data)????)

            #self.morphemes.append(Morpheme('Munken',data,start,end))
            print data, ' is a Morpheme'

        elif dstr_role=='translation':
            #self.universe.addData(FormUnit('English',data,start,end))
            #self.translations.append(FormUnit('English',data,start,end))

            print data, ' is a translation'

        #bug here: notes show up as forms of morphemes
        #elif dstr_role=='notes':
        #    self.universe.addData(FormUnit('English',data,start,end))
     



    

    def printIGT(self,out_file):
        """
        build IGT objects based on number of translations
        (since trans are common to the set of morphemes and segments ); 
        assumes intervals are ordered (segment, syllable, morpheme, trans, notes)
        """

        igts=[]

        for t in self.translations:

            temp_morphemes=[]
            temp_segments=[]

            #loop over morphemes to find those that are assoc'd with a translation
            for g in self.morphemes:
            
                if g.start>=t.start and g.end<=t.end:
                
                    temp_morphemes.append(g.label)
               
                
                    #loop over segment transcriptions to find those associated with each morpheme 
                    for p in self.segments:

                        if p.start==g.start and p.end==g.end:
                            #print p.text
                            temp_segments.append(p.getString())
 

            igts.append(IGT(temp_segments,  temp_morphemes, t.segments))
            
        c=IGTCollection(igts)	

        print 'there are', len(self.segments), 'segment transcriptions'
        #print 'there are', len(self.syllables), 'syllables'
        print 'there are', len(self.morphemes), 'morphemes'
        print 'there are', len(self.translations), 'translations'
        print 'there are', len(self.notes), 'notes'

        for t in self.translations:
            print t.segments+' ',


        #c.toStndOut()    
        c.toLatex(out_file) 
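
readPraat() above relies on two helpers, findQuoted() and findNum(), that are not shown in this listing. A minimal sketch of their apparent contracts, inferred only from the call sites (the real eltk implementations may differ):

import re

def findQuoted(line):
    """Return the text between the first pair of double quotes, or '' if none."""
    m = re.search(r'"([^"]*)"', line)
    return m.group(1) if m else ''

def findNum(line):
    """Return the first numeric token in the line as a float, or None."""
    m = re.search(r'-?\d+(?:\.\d+)?', line)
    return float(m.group()) if m else None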
Example #7
    def parseBibtex(self, bibfile):
        """ 
        Parse a BibTeX file and extract its entries

        @type   bibfile: string
        @param  bibfile: the filename to be read
        """
        f = open(bibfile, 'r')

        file_lines = f.readlines()

        #use the character converter from eltk.utils
        converter = CharConverter('latex', 'uni')

        #loop over input file
        for line in file_lines:

            #replace latex special character codes w. unicode
            #
            #only works for this latex style: \x{y}
            #
            #this won't work: \{xy}
            #
            line = converter.convert(line)

            #if 'author' in line: print line

            #get rid of leading and trailing spaces
            line = line.strip()

            #skip over blank lines
            if line == '':
                continue

            #begin entry
            if line[0] == '@':

                #skip over @preamble lines
                if '@preamble' in line: continue

                #get the name for the entry
                entry_name = line[line.find("{") + 1:line.find(",")]

                #create a new entry
                self.entries[entry_name] = Entry()

                #assign id using original bib files entry name
                self.entries[entry_name].id = entry_name

                #find the entry's type
                entry_type = line[1:line.find('{')].lower()

                #normalize the type to camel-case, eg 'InCollection', not 'Incollection'
                entry_type = entry_type.capitalize()
                entry_type = entry_type.replace('Inproceedings', 'InProceedings')
                entry_type = entry_type.replace('Incollection', 'InCollection')
                entry_type = entry_type.replace('Inbook', 'InBook')
                entry_type = entry_type.replace('Phdthesis', 'PhdThesis')
                entry_type = entry_type.replace('Mastersthesis', 'MastersThesis')
                entry_type = entry_type.replace('Techreport', 'TechReport')

                self.entries[entry_name].type = entry_type

            #if field uses " and also contains internal {}'s
            if line.find('\"') < line.find('{') and line.find('{') != -1:
                #delete internal braces
                line.replace('{', '')
                line.replace('}', '')

                #normalize in favor of {}'s, not quotes
                #but leave internal quotes
                line = line.replace('\"', '{', 1)
                if line.count('\"') == 1:
                    line = line.replace('\"', '}')
                else:
                    line = rreplace(line, '\"', '}')

            #process fields, line by line

            #non-integer fields
            if 'author' in line:

                #pick out string containing authors
                authors_string = line[line.find('{') + 1:line.rfind('}')]

                #reformat the string and add to object
                self.entries[entry_name].authors = findAuthors(authors_string)

                #set authors_string in Entry
                self.entries[entry_name].authors_string = authors_string

            if 'editor' in line:

                #pick out string containing editors
                editors_string = line[line.find('{') + 1:line.rfind('}')]

                #reformat the string and add to object (use same as authors)
                self.entries[entry_name].editors = findAuthors(editors_string)

                #set editors_string in Entry
                self.entries[entry_name].editors_string = editors_string

            if 'title' in line:
                #don't use title case due to a bug in the title() function (i.e., if there's a non-ascii char, the next char gets capitalized regardless of whether it's at the beginning of a word)

                self.entries[entry_name].title = line[line.find('{') +
                                                      1:line.rfind('}')]
                #.title()

            if 'booktitle' in line:
                #don't use title case as per note above
                self.entries[entry_name].booktitle = line[
                    line.find('{') + 1:line.rfind('}')]  #.title()

            if 'journal' in line:
                self.entries[entry_name].journal = line[line.find('{') +
                                                        1:line.rfind('}')]

            if 'pages' in line:
                self.entries[entry_name].pages = line[line.find('{') +
                                                      1:line.rfind('}')]

            if 'publisher' in line:
                self.entries[entry_name].publisher = line[line.find('{') +
                                                          1:line.rfind('}')]

            if 'address' in line:
                self.entries[entry_name].address = line[line.find('{') +
                                                        1:line.rfind('}')]

            if 'location' in line:
                self.entries[entry_name].location = line[line.find('{') +
                                                         1:line.rfind('}')]

            if 'school' in line:
                self.entries[entry_name].school = line[line.find('{') +
                                                       1:line.rfind('}')]

            if 'organization' in line:
                self.entries[entry_name].organization = line[line.find('{') +
                                                             1:line.rfind('}')]

            if 'institution' in line:
                self.entries[entry_name].institution = line[line.find('{') +
                                                            1:line.rfind('}')]

            if 'series' in line:
                self.entries[entry_name].series = line[line.find('{') +
                                                       1:line.rfind('}')]

            if 'edition' in line:
                self.entries[entry_name].edition = line[line.find('{') +
                                                        1:line.rfind('}')]

            if 'howpublished' in line:
                self.entries[entry_name].howpublished = line[line.find('{') +
                                                             1:line.rfind('}')]

            if 'month' in line:
                self.entries[entry_name].month = line[line.find('{') +
                                                      1:line.rfind('}')]

            if 'note' in line:
                self.entries[entry_name].note = line[line.find('{') +
                                                     1:line.rfind('}')]

            #various identifiers
            if 'doi' in line:
                self.entries[entry_name].doi = line[line.find('{') +
                                                    1:line.rfind('}')]

            if 'isbn' in line:
                self.entries[entry_name].isbn = line[line.find('{') +
                                                     1:line.rfind('}')]

            if 'issn' in line:
                self.entries[entry_name].issn = line[line.find('{') +
                                                     1:line.rfind('}')]

            if 'lccn' in line:
                self.entries[entry_name].lccn = line[line.find('{') +
                                                     1:line.rfind('}')]

            #integer fields
            if 'year' in line:

                self.entries[entry_name].year = findInt(line)

            if 'volume' in line:
                self.entries[entry_name].volume = findInt(line)

            if 'number' in line:
                self.entries[entry_name].number = findInt(line)

            if 'chapter' in line:
                self.entries[entry_name].chapter = findInt(line)

        print 'Found ' + str(len(self.entries)) + ' entries'

        #post processing to fix entries w. editor and no author and
        #to replace id with the convention form makeID()

        #loop through entries
        keys = self.entries.keys()
        for k in keys:
            new_id = ''

            #make the new ID
            if self.entries[k].authors_string == '':
                new_id = makeID(self.entries[k].editors_string,
                                self.entries[k].year)
            else:
                new_id = makeID(self.entries[k].authors_string,
                                self.entries[k].year)

            self.entries[new_id] = self.entries.pop(k)
            self.entries[new_id].id = new_id
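
A minimal usage sketch, assuming parseBibtex() belongs to a reader class whose __init__ sets self.entries to an empty dict; BibtexReader and the file name below are hypothetical.

reader = BibtexReader()               #hypothetical class name
reader.parseBibtex('references.bib')  #hypothetical input file
for entry_id in reader.entries:
    print entry_id, reader.entries[entry_id].type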
Example #8
class ElanReader(object):

    """
    A class for reading Elan eaf files

    """

    def __init__(self):

        # one converter is created per input file
        # (maybe change this to global if multiple files are to be read in a batch?)
        # assume file is in XSAMPA, for now
        self.converter = CharConverter("xsampa", "uni")

        # the universe to populate
        # self.universe=LingUniverse()

        # will be used for interactive tier description
        self.user_defined_types = []

        # for hashing time points
        self.time_points = {}

    def setNS(self, ns):
        """
        Set the namespace for the ontology.

        @type ns: str
        @param ns: a namespace string
        @rtype:
        @return:
        """
        self.ns = ns

    def setOntologyURI(self, uri):
        """
        Sets the URI for the ontology.

        @type uri: str 
        @param uri: a string representing a URI
        """

        self.ontology = Ontology(uri)

    def readElan(self, f):

        """Reads and parses an Elan file and populates the ling universe
        
        @type  f: string
        @param f: an eaf file name
        """

        print "Reading Elan file..."

        self.file_in = open(f, "r")

        # return IOError if root of eaf is not ANNOTATION_DOCUMENT
        try:

            file_lines = self.file_in.readlines(2)

            if file_lines[1][1:20] != "ANNOTATION_DOCUMENT":
                raise IOError
            print "This Elan file is good."

        except IOError:
            print "Input file is not well formed or not of type 'eaf'."
            return

        # go to beginning of file
        self.file_in.seek(0, 0)

        # begin xml processing
        self.dom = parse(self.file_in)

        # store time points in a dict. for later use
        for t in self.dom.getElementsByTagName("TIME_SLOT"):
            self.time_points[t.getAttribute("TIME_SLOT_ID")] = t.getAttribute("TIME_VALUE")

        # store alignable elements in a dict. for later use
        self.alignable_elems = self.dom.getElementsByTagName("ALIGNABLE_ANNOTATION")

        # BEGIN MAIN ALGORITHM:

        print "Processing contents..."

        # process tier by tier, based on whether they contain alignable or reference annotations
        tier_elems = self.dom.getElementsByTagName("TIER")

        # get user defined ling types
        for tier in tier_elems:
            self.user_defined_types.append(tier.getAttribute("LINGUISTIC_TYPE_REF"))

        # print self.user_defined_types

        for tier in tier_elems:

            # dstr_role='data structure role'
            # used to decide on what type of linguistic unit to instantiate
            dstr_role = tier.getAttribute("LINGUISTIC_TYPE_REF")

            alignable_elems = tier.getElementsByTagName("ALIGNABLE_ANNOTATION")
            if len(alignable_elems) > 0:
                self.handleAnnotation(alignable_elems, dstr_role)

            ref_elems = tier.getElementsByTagName("REF_ANNOTATION")
            if len(ref_elems) > 0:
                self.handleAnnotation(ref_elems, dstr_role)

    # BEGIN OTHER CLASS METHODS

    def handleAnnotation(self, elems, dstr_role):

        """
        Process ALIGNABLE_ANNOTATION and REF_ANNOTATION  elements
        
        @type  elems: 
        @param elems: the elements to start from

        @type  dstr_role:
        @param dstr_role: the element type
        """

        # print 'There are ',len(elems),' annotation elements.'
        for e in elems:
            annot_val_elems = e.getElementsByTagName("ANNOTATION_VALUE")
            if len(annot_val_elems) > 0:
                self.handleAnnotationValue(annot_val_elems, dstr_role, self.findTimeInterval(e))

    def handleAnnotationValue(self, annot_val_elems, dstr_role, time_interval):

        """ Process CDATA associated with ANNOTATION_VALUE elements.
        
        annot_val_elems         the elements to be processed
        dstr_role               element type to be passed on
        time_interval           time interval list to be passed on

        Calls handleData(...)
        """

        for v in annot_val_elems:
            for c in v.childNodes:
                if c.nodeType == 3:
                    self.handleData(c.data, dstr_role, time_interval)

    def findTimeInterval(self, elem):

        """Build time interval list

        elem    the element to start from
        
        Return a list containing the start and end times 
        """

        if elem.tagName == "ALIGNABLE_ANNOTATION":

            start = elem.getAttribute("TIME_SLOT_REF1")
            end = elem.getAttribute("TIME_SLOT_REF2")

            # look up time refs in hash
            return [self.time_points[start], self.time_points[end]]

        elif elem.tagName == "REF_ANNOTATION":

            ref = elem.getAttribute("ANNOTATION_REF")

            for e in self.alignable_elems:

                if e.getAttribute("ANNOTATION_ID") == ref:

                    # recursive call
                    return self.findTimeInterval(e)

    def handleData(self, data, dstr_role, time_interval):

        """Decide on which linguistic units to instantiate.
        
        data            string repr. of linguistic form or grammar unit label
        dstr_role       how the data is used in the Elan file (gloss, translation, etc)
        time_interval   time alignment
        
        Instantiates units and adds to ling. universe
        """

        start = float(time_interval[0])
        end = float(time_interval[1])

        # build data obj's and add to universe
        if dstr_role == "Sentence-level Transcription":

            # convert unicode to string (nec. for character converter)
            if type(data) == unicode:
                data = str(data)

            data = self.converter.convert(data)

            print data, " is a Clause or Phrase"

            # generalize this later
            data = data.split()
            for d in data:
                print d, " is a SyntacticWord"
                w = gold.SyntacticWord(self.ns + makeID(d), [])
                # self.universe.addData(FormUnit('Koshin',d,start,end))
                start = start + 0.00001

            # self.universe.addData(FormUnit(data,start,end))

        elif dstr_role == "Morpheme":

            # generalize this later
            ###############################################
            print data, "###data"
            data = data.split()
            for d in data:

                print d, "####d"
                # self.universe.addData(Morpheme('Koshin',d,start,end))
                start = start + 0.00001
            ################################################

            # self.universe.addData(Morpheme(data,start,end))

        # elif dstr_role=='Checked tone':
        #    self.universe.addData(FormUnit(data,start,end))
        elif dstr_role == "Sentence-level Translation":
            # print 'trans'
            pass
            # self.universe.addData(FormUnit('English',data,start,end))
        # elif dstr_role=='Notes':
        #    self.universe.addData(FormUnit(data,start,end)

    def writeData(self, outfile):
        self.ontology.save(outfile)
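
A minimal usage sketch for the class above; the URI and file names are illustrative. Note that setOntologyURI() must be called before writeData(), since __init__ never creates self.ontology.

reader = ElanReader()
reader.setOntologyURI('http://purl.org/linguistics/data/')  #illustrative URI
reader.readElan('recording.eaf')                            #illustrative file name
reader.writeData('recording.owl')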
Example #9
    def toLatex(self, out_file):
        """ 
        Outputs a tex file, converting from IPA to TIPA for each entry.
        
        out_file        The name of the tex output file
        
        This requires the following TeX packages: tipa, linguex, supertabular
        """

        print 'Printing to LaTeX file...'

        converter = CharConverter('ipa', 'tipa')

        latex_header = u'%This document was autogenerated from a Python script.\n\documentclass[letter,12pt]{article}\n\\usepackage{linguex}\n\n\\usepackage[tone,extra]{tipa}\n\\usepackage{supertabular}\n\n\\begin{document}\n\n'

        outfile = codecs.open(out_file, 'w', 'utf-8')
        outfile.write(latex_header)

        ############begin first section of bilingual dictionary###################

        outfile.write(u'\\section*{' + self.obj_lang + u'--' + self.meta_lang +
                      u' Dictionary}\n\n')

        outfile.write(u'\\begin{supertabular}{lll}\n\n')

        keys = self.entries_1.keys()

        keys.sort()

        for k in keys:

            #headword
            head_word = converter.convert(k)
            head_word = tipaClean(head_word)

            #alt forms
            keys2 = self.entries_1[k].alt_forms.keys()
            alts = '('
            for k2 in keys2:
                alts = alts + k2 + ': '

                for i in self.entries_1[k].alt_forms[k2]:
                    if i is not None:
                        #################START HERE with tipa issue
                        #print i+' ',
                        i = converter.convert(i)
                        i = tipaClean(i)
                        #print i
                        alts = alts + '\\textipa{' + i + '}, '

            alts = alts + ')'
            alts = alts.replace(', )', ')')
            if alts == '()': alts = ''

            #translation
            translation = ''
            for t in self.entries_1[k].translation:
                translation = translation + t + ', '
            translation = translation[:len(translation) - 2]
            translation = translation.replace('_', '\_')

            outfile.write(u'\\textbf{\\textipa{' + head_word + u'}}&' + alts +
                          u'&' + translation + u'\\\\\\\\\n')

        outfile.write(u'\end{supertabular}\n\n')

        ############begin next section of bilingual dictionary###################

        outfile.write(u'\\section*{' + self.meta_lang + u'--' + self.obj_lang +
                      u' Dictionary}\n\n')

        outfile.write(u'\\begin{supertabular}{lll}\n\n')

        keys_2 = self.entries_2.keys()

        keys_2.sort()

        for k in keys_2:

            #head word
            head_word = k
            head_word = head_word.replace('_', '\_')

            #unpack translations
            trans = ''
            for t in self.entries_2[k].translation:
                if t is not None:
                    t = converter.convert(t)
                    t = tipaClean(t)
                    trans = trans + t + ', '
                else:
                    trans = trans
            trans = trans[:len(trans) - 2]

            outfile.write(u'\\textbf{' + head_word + u'}&' + '' +
                          u'&\\textipa{' + trans + u'}\\\\\\\\\n')

        outfile.write(u'\end{supertabular}\n\n\end{document}')
        outfile.close()
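
A usage sketch: this toLatex() reads self.entries_1, self.entries_2, self.obj_lang, and self.meta_lang, so it belongs to a bilingual dictionary object, called lexicon below purely for illustration. The generated file needs the tipa, linguex, and supertabular packages to compile.

lexicon.toLatex('dictionary.tex')  #then compile with: latex dictionary.tex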
Example #10
    def parseBibtex(self,bibfile):

        """ 
        Parse a BibTeX file and extract its entries

        @type   bibfile: string
        @param  bibfile: the filename to be read
        """
        f=open(bibfile,'r') 
    
        file_lines=f.readlines()
    
        #use the character converter from eltk.utils
        converter=CharConverter('latex','uni')    

        #loop over input file
        for line in file_lines:
         
            #replace latex special character codes w. unicode
            #
            #only works for this latex style: \x{y}
            #
            #this won't work: \{xy}
            #
            line=converter.convert(line)
            
            #if 'author' in line: print line
            
            #get rid of leading and trailing spaces
            line=line.strip()

            #skip over blank lines
            if line=='':
                continue
   

            #begin entry
            if line[0]=='@':
                
                #skip over @preamble lines
                if '@preamble' in line: continue

                #get the name for the entry
                entry_name=line[line.find("{")+1:line.find(",")]
                
                #create a new entry
                self.entries[entry_name]=Entry()
                
                #assign id using original bib files entry name
                self.entries[entry_name].id=entry_name
          
                #find the entry's type
                entry_type=line[1:line.find('{')].lower()

                #normalize the type to camel-case, eg 'InCollection', not 'Incollection'
                entry_type=entry_type.capitalize()
                entry_type=entry_type.replace('Inproceedings','InProceedings')
                entry_type=entry_type.replace('Incollection','InCollection')
                entry_type=entry_type.replace('Inbook','InBook')
                entry_type=entry_type.replace('Phdthesis','PhdThesis')
                entry_type=entry_type.replace('Mastersthesis','MastersThesis')
                entry_type=entry_type.replace('Techreport','TechReport')

                self.entries[entry_name].type=entry_type


            
            #if field uses " and also contains internal {}'s
            if line.find('\"')<line.find('{') and line.find('{')!=-1:
                #delete internal braces
                line.replace('{','')
                line.replace('}','')
            
                #normalize in favor of {}'s, not quotes
                #but leave internal quotes
                line=line.replace('\"','{',1)
                if line.count('\"')==1:
                    line=line.replace('\"','}')
                else:
                    line=rreplace(line,'\"','}')
 


            #process fields, line by line

            #non-integer fields
            if 'author' in line:

                #pick out string containing authors
                authors_string=line[line.find('{')+1:line.rfind('}')]
                
                #reformat the string and add to object
                self.entries[entry_name].authors=findAuthors(authors_string)
                
                #set authors_string in Entry
                self.entries[entry_name].authors_string=authors_string

            if 'editor' in line:
            
                #pick out string containing editors
                editors_string=line[line.find('{')+1:line.rfind('}')]
            
                #reformat the string and add to object (use same as authors)
                self.entries[entry_name].editors=findAuthors(editors_string)

                #set editors_string in Entry
                self.entries[entry_name].editors_string=editors_string

            if 'title' in line:
                #don't use title case due to a bug in the title() function (i.e., if there's a non-ascii char, the next char gets capitalized regardless of whether it's at the beginning of a word)
                
                self.entries[entry_name].title=line[line.find('{')+1:line.rfind('}')]
                #.title()

            if 'booktitle' in line:
                #don't use title case as per note above
                self.entries[entry_name].booktitle=line[line.find('{')+1:line.rfind('}')]#.title()



            if 'journal' in line:
                self.entries[entry_name].journal=line[line.find('{')+1:line.rfind('}')]


            if 'pages' in line:
                self.entries[entry_name].pages=line[line.find('{')+1:line.rfind('}')]

            if 'publisher' in line:
                self.entries[entry_name].publisher=line[line.find('{')+1:line.rfind('}')]

            if 'address' in line:
                self.entries[entry_name].address=line[line.find('{')+1:line.rfind('}')]

            if 'location' in line:
                self.entries[entry_name].location=line[line.find('{')+1:line.rfind('}')]



            if 'school' in line:
                self.entries[entry_name].school=line[line.find('{')+1:line.rfind('}')]

            if 'organization' in line:
                self.entries[entry_name].organization=line[line.find('{')+1:line.rfind('}')]

            if 'institution' in line:
                self.entries[entry_name].institution=line[line.find('{')+1:line.rfind('}')]


            if 'series' in line:
                self.entries[entry_name].series=line[line.find('{')+1:line.rfind('}')]


            if 'edition' in line:
                self.entries[entry_name].edition=line[line.find('{')+1:line.rfind('}')]


            if 'howpublished' in line:
                self.entries[entry_name].howpublished=line[line.find('{')+1:line.rfind('}')]

            if 'month' in line:
                self.entries[entry_name].month=line[line.find('{')+1:line.rfind('}')]


            if 'note' in line:
                self.entries[entry_name].note=line[line.find('{')+1:line.rfind('}')]

            #various identifiers
            if 'doi' in line:
                self.entries[entry_name].doi=line[line.find('{')+1:line.rfind('}')]

            if 'isbn' in line:
                self.entries[entry_name].isbn=line[line.find('{')+1:line.rfind('}')]

            if 'issn' in line:
                self.entries[entry_name].issn=line[line.find('{')+1:line.rfind('}')]

            if 'lccn' in line:
                self.entries[entry_name].lccn=line[line.find('{')+1:line.rfind('}')]




            #integer fields
            if 'year' in line:

                self.entries[entry_name].year=findInt(line)

            if 'volume' in line:
                self.entries[entry_name].volume=findInt(line)

            if 'number' in line:
                self.entries[entry_name].number=findInt(line)


            if 'chapter' in line:
                self.entries[entry_name].chapter=findInt(line)

        print 'Found '+str(len(self.entries))+' entries'


        #post processing to fix entries w. editor and no author and
        #to replace id with the convention form makeID()

        #loop through entries
        keys=self.entries.keys()
        for k in keys:
            new_id=''
            
            #make the new ID
            if self.entries[k].authors_string=='':
                new_id=makeID(self.entries[k].editors_string,self.entries[k].year)
            else:
                new_id=makeID(self.entries[k].authors_string,self.entries[k].year)

             
            self.entries[new_id]=self.entries.pop(k)
            self.entries[new_id].id=new_id
Example #11
from itertools import combinations
from random import randint
from os import listdir
from xml.dom import minidom
#ProgressBar and its widgets come from the progressbar package
from progressbar import ProgressBar, Percentage, Bar, RotatingMarker, ETA
from eltk.utils.CharConverter import *

derivWeight = .90
sampleSize = 100
minmean = 10
iters = 100

retention = {}
xmldoc = minidom.parse('./swadesh-retent.xml')
itemlist = xmldoc.getElementsByTagName('word')

for s in itemlist:
    retention[s.attributes['sem'].value] = float(s.attributes['retention'].value) / 100.0

latex_converter=CharConverter('uni','tipa')

lgg = {}
xml_list = listdir('./LanguageXML')
widgets = ['Reading Language XML: ', Percentage(), ' ', Bar(marker=RotatingMarker()),
                   ' ', ETA()]
pbar = ProgressBar(widgets=widgets,maxval=len(xml_list)).start()
for j in range(len(xml_list)):
    name = xml_list[j]
    lang = name.split('.')[0]
    lgg[lang] = {}
    xmlname = './LanguageXML/' + name
    xmldoc = minidom.parse(xmlname)
    itemlist = xmldoc.getElementsByTagName('word') 
    for s in itemlist:
        mean = s.attributes['sem'].value
Example #12
# -*- coding: UTF-8 -*-
"""
This script demonstrates the use of CharConverter
"""
from eltk.utils.CharConverter import *

if __name__ == '__main__':

    #latex to unicode test
    c0 = CharConverter('latex', 'uni')
    print c0.convert('author = {S\\\'{a}ndor Hervey},')

    #unicode to latex test
    c1 = CharConverter('uni', 'latex')
    print c1.convert('ä')

    #praat to uni test
    c2 = CharConverter('praat', 'uni')
    print c2.convert('\\t.o')

    print 'If you do not see IPA characters, you may not have the required fonts installed.'
Example #13
    def toLatex(self,out_file):
           
        """ 
        Outputs a tex file, converting from IPA to TIPA for each entry.
        
        out_file        The name of the tex output file
        
        This requires the following TeX packages: tipa, linguex, supertabular
        """
        
        print 'Printing to LaTeX file...'

        converter=CharConverter('ipa','tipa')
        
        latex_header=u'%This document was autogenerated from a Python script.\n\documentclass[letter,12pt]{article}\n\\usepackage{linguex}\n\n\\usepackage[tone,extra]{tipa}\n\\usepackage{supertabular}\n\n\\begin{document}\n\n'
        
        outfile=codecs.open(out_file,'w','utf-8')
        outfile.write(latex_header)

        ############begin first section of bilingual dictionary###################

        outfile.write(u'\\section*{'+self.obj_lang+u'--'+self.meta_lang+u' Dictionary}\n\n')

        outfile.write(u'\\begin{supertabular}{lll}\n\n')   


        keys=self.entries_1.keys()
        
        keys.sort()
        

        for k in keys:
            
            #headword      
            head_word=converter.convert(k)
            head_word=tipaClean(head_word)
            
            #alt forms
            keys2=self.entries_1[k].alt_forms.keys()
            alts='('
            for k2 in keys2:
                alts=alts+k2+': '
            
                for i in self.entries_1[k].alt_forms[k2]:
                    if i is not None:
                        #################START HERE with tipa issue
                        #print i+' ',
                        i=converter.convert(i)
                        i=tipaClean(i) 
                        #print i
                        alts=alts+'\\textipa{'+i+'}, '

            alts=alts+')'
            alts=alts.replace(', )',')')
            if alts=='()': alts=''
 
            #translation
            translation=''
            for t in self.entries_1[k].translation:
                translation=translation+t+', '
            translation=translation[:len(translation)-2]
            translation=translation.replace('_','\_')

             
            outfile.write(u'\\textbf{\\textipa{'+head_word+u'}}&'+alts+u'&'+translation+u'\\\\\\\\\n')

        outfile.write(u'\end{supertabular}\n\n')
        
        
        
        ############begin next section of bilingual dictionary###################

        outfile.write(u'\\section*{'+self.meta_lang+u'--'+self.obj_lang+u' Dictionary}\n\n')

        outfile.write(u'\\begin{supertabular}{lll}\n\n') 
        




        keys_2=self.entries_2.keys()

        keys_2.sort()

        for k in keys_2:
            
            #head word
            head_word=k
            head_word=head_word.replace('_','\_')



            #unpack translations
            trans=''
            for t in self.entries_2[k].translation:
                if t is not None:
                    t=converter.convert(t)
                    t=tipaClean(t)
                    trans=trans+t+', '
                else: trans=trans
            trans=trans[:len(trans)-2]
           
            outfile.write(u'\\textbf{'+head_word+u'}&'+''+u'&\\textipa{'+trans+u'}\\\\\\\\\n')

         
        outfile.write(u'\end{supertabular}\n\n\end{document}')
        outfile.close()
Example #14
# -*- coding: UTF-8 -*-
"""
This script demonstrates the use of CharConverter
"""
from eltk.utils.CharConverter import *

if __name__=='__main__':

    #latex to unicode test
    c0=CharConverter('latex','uni')
    print c0.convert('author = {S\\\'{a}ndor Hervey},')

    #unicode to latex test
    c1=CharConverter('uni','latex')
    print c1.convert('ä')


    #praat to uni test
    c2=CharConverter('praat','uni')
    print c2.convert('\\t.o')

    print 'If you do not see IPA characters, you may not have the required fonts installed.'
Example #15
class PraatReader():

    """
    A class for parsing and processing Praat TextGrid files
    
    """

#from Termset
#    def __init__(self,ns=u'http://purl.org/linguistics/data/termset/'):
        
#        """
#        Init the termset and set its namespace 
#
#        @type ns: unicode
#        @param ns: a unicode obj representing a namespace
#        """       
#        self.namespace = Namespace(ns)
#        
#        self.ontology = Ontology()




    def __init__(self,ns=u'http://purl.org/linguistics/data/'):


        #assign user defined namespace, or simply use default
        self.ns=ns 

        #one converter is created per input file
        #(maybe change this to global if multiple files are to be read in a batch?)
        self.converter=CharConverter('praat','uni')
        
        #the universe to populate
        #self.universe=LingUniverse()
        #self.universe=linguniv

        #will be used for interactive tier description
        self.user_defined_types=[]
    
        #for hashing time points, for speed
        self.time_points={}   

       
        #init basic data members 
        self.segments=[]
        self.morphemes=[]
        self.translations=[]
        self.notes=[]

    #def setNS(self,ns):
    #    """
    #    Set the namespace for the ontology.

    #    @type ns: str
    #    @param ns: a namespace string
    #    @rtype:
    #    @return:
    #    """

    def setURI(self,uri):
        """
        Sets the URI for the ontology.

        @type uri: str 
        @param uri: a string representing a URI
        """

        self.uri=uri
        self.ontology=Ontology(uri)

 

    def readPraat(self,filename):
        """
        Reads and parses a Praat TextGrid file.
        """

        try:
            print 'Trying to open '+filename+'...'
            self.input_file=open(filename, 'r')

        except IOError:
            print 'Could not open '+filename+'.'
            print 'Make sure the path is correct.'
            return

        print 'Reading Praat file...'

        #test input file for proper type
        try:

            file_lines=self.input_file.readlines(8)

            if file_lines[1]!='Object class = "TextGrid"\n':
                raise IOError()

            print 'This Praat file is good.'

            #go to beginning of file
            self.input_file.seek(0,0)

            #temp variables
            current_type=''
            dstr_role=''

            #get user defined ling types
            for line in self.input_file:

                #find the name of the tier
                if line.find('name = "')!=-1:
                    self.user_defined_types.append(findQuoted(line))

            self.input_file.seek(0,0)

            #BEGIN MAIN ALGORITHM
            print 'Processing contents...'

            #process line by line
            for line in self.input_file:

                #reset tier type as different types are encountered
                if line.find('IntervalTier')!=-1:
                    current_type='Interval'

                elif line.find('PointTier')!=-1:
                    current_type='Point'

                #find the name of the tier
                if line.find('name = "')!=-1:
                    dstr_role = findQuoted(line)

                #for handling individual intervals
                if line.find('intervals [')!=-1:
                    try:
                        time_interval=[]
                        time_interval.append(findNum(self.input_file.next()))
                        time_interval.append(findNum(self.input_file.next()))
                        data=findQuoted(self.input_file.next())

                        #only build an interval if the text has content
                        if not data.isspace() and data!='':
                            self.handleData(data,dstr_role,time_interval)

                    except StopIteration:
                        pass

        except IOError:

            print "Input file is not well formed or not of type TextGrid."
            print file_lines[1]
 

    def getUniverse(self):

        """
        A method for returning the ling. universe
        """

        return self.universe



    def handleData(self,data,dstr_role,time_interval):
        
        """
        Decide on the linguistic unit to instantiate 
        
        data            string repr. of linguistic form or grammar unit label
        dstr_role       how the data is used in the Praat file (gloss, translation, etc)
        time_interval   time alignment

        Instantiates units and adds them to the ling universe
        """

        start=time_interval[0]
        end=time_interval[1]


        #build data obj's and add to universe
        if dstr_role=='segment':
            data=self.converter.convert(data)
            #
            #deal w. linguniv
            #self.universe.addData(FormUnit('Munken',data,start,end))
            #self.segments.append(FormUnit('Munken',data,start,end))
            
            pass
            #print data, 'is a segment'

        elif dstr_role=='morpheme':
            
            #self.universe.addData(Morpheme('Munken',data,start,end))
                    #elif dstr_role=='Checked tone':
            #self.universe.addData(FormUnit(data)????)

            #self.morphemes.append(Morpheme('Munken',data,start,end))
            print data, ' is a Morpheme'

        elif dstr_role=='translation':
            #self.universe.addData(FormUnit('English',data,start,end))
            #self.translations.append(FormUnit('English',data,start,end))

            print data, ' is a translation'

        #bug here: notes show up as forms of morphemes
        #elif dstr_role=='notes':
        #    self.universe.addData(FormUnit('English',data,start,end))
     



    

    def printIGT(self,out_file):
        """
        build IGT objects based on number of translations
        (since trans are common to the set of morphemes and segments ); 
        assumes intervals are ordered (segment, syllable, morpheme, trans, notes)
        """

        igts=[]

        for t in self.translations:

            temp_morphemes=[]
            temp_segments=[]

            #loop over morphemes to find those that are assoc'd with a translation
            for g in self.morphemes:
            
                if g.start>=t.start and g.end<=t.end:
                
                    temp_morphemes.append(g.label)
               
                
                    #loop over segment transcriptions to find those associated with each morpheme 
                    for p in self.segments:

                        if p.start==g.start and p.end==g.end:
                            #print p.text
                            temp_segments.append(p.getString())
 

            igts.append(IGT(temp_segments,  temp_morphemes, t.segments))
            
        c=IGTCollection(igts)	

        print 'there are', len(self.segments), 'segment transcriptions'
        #print 'there are', len(self.syllables), 'syllables'
        print 'there are', len(self.morphemes), 'morphemes'
        print 'there are', len(self.translations), 'translations'
        print 'there are', len(self.notes), 'notes'

        for t in self.translations:
            print t.segments+' ',


        #c.toStndOut()    
        c.toLatex(out_file) 
Example #16
class ElanReader(object):
    """
    A class for reading Elan eaf files

    """
    def __init__(self):

        #one converter is created per input file
        #(maybe change this to global if multiple files are to be read in a batch?)
        #assume file is in XSAMPA, for now
        self.converter = CharConverter('xsampa', 'uni')

        #the universe to populate
        #self.universe=LingUniverse()

        #will be used for interactive tier description
        self.user_defined_types = []

        #for hashing time points
        self.time_points = {}

    def setNS(self, ns):
        """
        Set the namespace for the ontology.

        @type ns: str
        @param ns: a namespace string
        @rtype:
        @return:
        """
        self.ns = ns

    def setOntologyURI(self, uri):
        """
        Sets the URI for the ontology.

        @type uri: str 
        @param uri: a string representing a URI
        """

        self.ontology = Ontology(uri)

    def readElan(self, f):
        """Reads and parses an Elan file and populates the ling universe
        
        @type  f: string
        @param f: an eaf file name
        """

        print 'Reading Elan file...'

        self.file_in = open(f, 'r')

        #return IOError if root of eaf is not ANNOTATION_DOCUMENT
        try:

            file_lines = self.file_in.readlines(2)

            if file_lines[1][1:20] != 'ANNOTATION_DOCUMENT':
                raise IOError
            print 'This Elan file is good.'

        except IOError:
            print "Input file is not well formed or not of type 'eaf'."
            return

        #go to beginning of file
        self.file_in.seek(0, 0)

        #begin xml processing
        self.dom = parse(self.file_in)

        #store time points in a dict. for later use
        for t in self.dom.getElementsByTagName('TIME_SLOT'):
            self.time_points[t.getAttribute('TIME_SLOT_ID')] = t.getAttribute(
                'TIME_VALUE')

        #store alignable elements in a dict. for later use
        self.alignable_elems = self.dom.getElementsByTagName(
            'ALIGNABLE_ANNOTATION')

        #BEGIN MAIN ALGORITHM:

        print 'Processing contents...'

        #process tier by tier, based on whether they contain alignable or reference annotations
        tier_elems = self.dom.getElementsByTagName('TIER')

        #get user defined ling types
        for tier in tier_elems:
            self.user_defined_types.append(
                tier.getAttribute('LINGUISTIC_TYPE_REF'))

        #print self.user_defined_types

        for tier in tier_elems:

            #dstr_role='data structure role'
            #used to decide on what type of linguistic unit to instantiate
            dstr_role = tier.getAttribute('LINGUISTIC_TYPE_REF')

            alignable_elems = tier.getElementsByTagName('ALIGNABLE_ANNOTATION')
            if len(alignable_elems) > 0:
                self.handleAnnotation(alignable_elems, dstr_role)

            ref_elems = tier.getElementsByTagName('REF_ANNOTATION')
            if len(ref_elems) > 0:
                self.handleAnnotation(ref_elems, dstr_role)

    #BEGIN OTHER CLASS METHODS

    def handleAnnotation(self, elems, dstr_role):
        """
        Process ALIGNABLE_ANNOTATION and REF_ANNOTATION  elements
        
        @type  elems: 
        @param elems: the elements to start from

        @type  dstr_role:
        @param dstr_role: the element type
        """

        #print 'There are ',len(elems),' annotation elements.'
        for e in elems:
            annot_val_elems = e.getElementsByTagName('ANNOTATION_VALUE')
            if len(annot_val_elems) > 0:
                self.handleAnnotationValue(annot_val_elems, dstr_role,
                                           self.findTimeInterval(e))

    def handleAnnotationValue(self, annot_val_elems, dstr_role, time_interval):
        """ Process CDATA associated with ANNOTATION_VALUE elements.
        
        annot_val_elems         the elements to be processed
        dstr_role               element type to be passed on
        time_interval           time interval list to be passed on

        Calls handleData(...)
        """

        for v in annot_val_elems:
            for c in v.childNodes:
                if c.nodeType == 3:
                    self.handleData(c.data, dstr_role, time_interval)

    def findTimeInterval(self, elem):
        """Build time interval list

        elem    the element to start from
        
        Return a list containing the start and end times 
        """

        if elem.tagName == 'ALIGNABLE_ANNOTATION':

            start = elem.getAttribute('TIME_SLOT_REF1')
            end = elem.getAttribute('TIME_SLOT_REF2')

            #look up time refs in hash
            return [self.time_points[start], self.time_points[end]]

        elif elem.tagName == 'REF_ANNOTATION':

            ref = elem.getAttribute('ANNOTATION_REF')

            for e in self.alignable_elems:

                if e.getAttribute('ANNOTATION_ID') == ref:

                    #recursive call
                    return self.findTimeInterval(e)

    def handleData(self, data, dstr_role, time_interval):
        """Decide on which linguistic units to instantiate.
        
        data            string repr. of linguistic form or grammar unit label
        dstr_role       how the data is used in the Elan file (gloss, translation, etc)
        time_interval   time alignment
        
        Instantiates units and adds to ling. universe
        """

        start = float(time_interval[0])
        end = float(time_interval[1])

        #build data obj's and add to universe
        if dstr_role == 'Sentence-level Transcription':

            #convert unicode to string (nec. for character converter)
            if type(data) == unicode: data = str(data)

            data = self.converter.convert(data)

            print data, ' is a Clause or Phrase'

            #generalize this later
            data = data.split()
            for d in data:
                print d, ' is a SyntacticWord'
                w = gold.SyntacticWord(self.ns + makeID(d), [])
                #self.universe.addData(FormUnit('Koshin',d,start,end))
                start = start + .00001

            #self.universe.addData(FormUnit(data,start,end))

        elif dstr_role == 'Morpheme':

            #generalize this later
            ###############################################
            print data, '###data'
            data = data.split()
            for d in data:

                print d, '####d'
                #self.universe.addData(Morpheme('Koshin',d,start,end))
                start = start + .00001
            ################################################

            #self.universe.addData(Morpheme(data,start,end))

        #elif dstr_role=='Checked tone':
        #    self.universe.addData(FormUnit(data,start,end))
        elif dstr_role == 'Sentence-level Translation':
            #print 'trans'
            pass
            #self.universe.addData(FormUnit('English',data,start,end))
        #elif dstr_role=='Notes':
        #    self.universe.addData(FormUnit(data,start,end)

    def writeData(self, outfile):
        self.ontology.save(outfile)