"clause": {},
}

# NOTE(review): this file is Python 2 (print statements, ur'' literals).
#for xmlfilename in xmlfiles:
#for xmlfilename in glob.glob('/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellCheckerCorrected/entrycorrected_*/0/0/0/entrycorrected_*.E*.dep.xml'):
# Driver loop: analyse every dependency-parse XML for corpus entry 394 only
# (the two commented-out globs above are earlier, corpus-wide runs).
for xmlfilename in glob.glob('/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellCheckerCorrected/entrycorrected_394/0/0/0/entrycorrected_*.E*.dep.xml'):

    #xmlfilename = "/home/nparslow/Documents/AutoCorrige/Corpora/" + xmlfilename
    print "analysing", xmlfilename
    # tok2finalforms / tok2lemmacats map token positions to analysed forms and
    # lemma/category info; verb2info carries per-verb features; the last pair
    # is the parse-weight information.
    tok2finalforms, tok2lemmacats, verb2info, trees, (weight, maxweight) = getFinalTokenFormsAndTreesAndWeight(xmlfilename)
    print
    print "verb2info"
    print verb2info
    print

    # Aggregate the per-verb features into summary counts.
    info  = classifyVerbs.classifyVerbs(verb2info)

    print "info"
    print info
    print
    # NOTE(review): the triple-quoted literal below is dead (commented-out)
    # code.  Its closing ''' is not visible before the next def in this chunk,
    # so the string appears unterminated here — verify against the full file.
    '''
    for verb, info in verb2info.items():

        if len(info) == 1:
            tense = "notense"
            if "tense" in info[0]:
                tense = info[0]["tense"]
            mood = "infinitive" # sometimes mood is not there
            if "mode" in info[0]:
                mood = info[0]["mode"]
            if mood == "indicative" and tense == "notense":
    def __init__(self, processedDepXMLFile, processedTokenFile, processedLogFile, processedMEltFile, debug=False):
        """Build a Sentence by loading one sentence's analysis files.

        processedDepXMLFile -- dependency-parse XML; read for wordforms,
            lemma/cat maps, parse weights, tree counts and verb info.
        processedTokenFile  -- raw tokeniser output; read first, and the only
            token source when the parse produced no wordforms.
        processedLogFile    -- parser log; only echoed in debug output here.
        processedMEltFile   -- MElt tagger output; aligned with the raw
            tokens for per-token tag confidences and diffs.
        debug -- when True, print intermediate structures to stdout.
        """

        if debug:
            print "new Sentence"
            print processedDepXMLFile
            print processedTokenFile
            print processedLogFile
            print processedMEltFile

        self.debug = debug

        # All analysis fields default to None so "not computed" stays
        # distinguishable from a computed zero/empty value.
        self.tokens = None
        self.rawtokens = None
        self.matchregex = None
        self.uniquetokens = None
        self.lemmaCats = None
        self.spellingcorrections = None
        self.weightperword = None
        self.minweight = None
        self.meltconfidences = None
        self.meltdiffs = None
        self.wordsbeforemainverb = None
        self.hasnomainverb = None
        # parse info
        self.parsed = None
        # verb info:
        self.vgroups = None # should be sum of verb entries for a particular axis projection, != no. of verbs as compounds count as one verb group
        self.vanalysis = None # will be a dictionary with property -> count
        self.vsingle = None
        self.vaux = None
        self.vcompound = None
        self.vindicative = None
        self.vconditional = None
        self.vsubjunctive = None
        self.vimperfect = None
        self.vfuture = None
        self.vpresent = None
        self.vnotense = None
        # clause info:
        self.crel = None
        self.cnom = None
        self.cacc = None
        self.cloc = None
        # tree info
        self.trees = None

        self.wordforms = []
        self.verbAnalysis = None
        self.totalverbgroups = None

        if os.path.isfile(processedTokenFile):
            self.rawtokens = readDepXMLFile.getTokensFromFile(processedTokenFile)

        # melttoks is a list of (tok, tag, prob)
        # NOTE(review): called even when the token file was missing, in which
        # case self.rawtokens is still None — confirm loadAndAlign copes.
        meltToks = getMEltInfo.loadAndAlign(processedMEltFile, self.rawtokens )

        if os.path.isfile(processedDepXMLFile):
            xmlinfo = readDepXMLFile.readDepXMLFile(processedDepXMLFile, self.debug)
            self.wordforms = xmlinfo["wordforms"]

            if debug:
                print "xmlinfo, tok to final forms:"
                print xmlinfo["tok2finalforms"]
                print "xmlinfo, tok to lemmacats:"
                print xmlinfo["tok2lemmacats"]
            self.setLemmaCats(xmlinfo["tok2lemmacats"])
            if debug:
                print "xmlinfo, lemmacats:"
                print self.lemmaCats
            self.weightperword = xmlinfo["weightperword"]
            self.minweight = xmlinfo["minweight"]
            self.wordsbeforemainverb = xmlinfo["wordsbeforemainverb"]
            # -1 is treated as the "no main verb found" sentinel
            self.hasnomainverb = 1 if xmlinfo["wordsbeforemainverb"] == -1 else 0
            # keep only the per-key tree counts, not the trees themselves
            self.trees = dict( (x,len(xmlinfo["trees"][x])) for x in xmlinfo["trees"])
            self.parsed = xmlinfo["parsemode"]
            #print "sent parsed", self.parsed

            self.tok2finalforms = xmlinfo["tok2finalforms"]

            #print "sent", xmlinfo["verb2info"]
            # classifyVerbs returns a (analysis, totalverbgroups) pair here
            self.verbAnalysis, self.totalverbgroups = classifyVerbs.classifyVerbs(xmlinfo["verb2info"])
            #print "sent", self.verbAnalysis

            self.calcSpellingCorrections()

            self.meltdiffs = getMEltInfo.getMEltDiffs(xmlinfo["tok2lemmacats"], meltToks, debug)

        if len(self.wordforms) > 0:
            # sentence was at least partially parsed
            #print "sorted word forms:"
            #print sorted(wordsforms, key=lambda x: x[2].split('_')[1])
            # sort by the start point of the token
            #re.sub(ur'E\d+F\d+\|', u'', x[3], flags=re.UNICODE)
            # Each wordform looks like (lemma, form, cluster, lex); the lex
            # string appears to encode token spans as ...F<pos>|<form>..., and
            # the two regexes pull out the observed forms and their integer
            # positions — TODO confirm the lex format against the dep XML.
            self.tokens = [Token(x[0], x[1], re.findall(ur'(?<=[F\d]\d\|)[^ ]+', x[3], flags=re.UNICODE),
                                     [int(x) for x in re.findall(ur'(?<=F)\d+(?=\|)', x[3], flags=re.UNICODE) ])
                               for x in sorted(self.wordforms, key=lambda x: int(x[2].split('_')[1]))]
            #sentence.forms = [x[1] for x in sorted(wordsforms, key=lambda x: x[2].split('_')[1])]
        else:
            # sentence wasn't parsed, so use the tokens file:
            #tokens = getTokensFromFile(processedTokenFile)
            print "using tokens***"
            self.tokens = [Token(None, None, self.rawtokens[i], i) for i in range(len(self.rawtokens))]

        if debug: print "sent tokens:", self.tokens

        # regex built from the raw tokens (presumably for matching this
        # sentence's text elsewhere — see makeRegexFromTokens)
        self.matchregex = makeRegexFromTokens(self.rawtokens, debug)
        if debug:
            print "obstokens:", self.rawtokens
            print "regex", self.matchregex



        # a missing or 'None' MElt probability is treated as full confidence
        self.meltconfidences = [float(x[2]) if (x[2] is not None and x[2] != 'None') else 1.0 for x in meltToks]
        if debug: print "sent melt confs:", self.meltconfidences
def getNextSentenceFromFiles( processedDepXMLFile, processedTokenFile, processedLogFile, debug=False ):
    """Build and return a Sentence from one sentence's analysis files;
    returns None when the dep XML file does not exist.

    NOTE(review): Sentence() is invoked below with no arguments, while the
    __init__ defined earlier in this file requires four file-path arguments —
    one of the two is likely stale; confirm which Sentence this resolves to.
    """
    #print processedSentenceFile
    sentence = None
    if os.path.isfile(processedDepXMLFile):
        sentence = Sentence()

        tree = ET.parse(processedDepXMLFile)
        # 'W' nodes are 'words' which can include multiple tokens, e.g. 'bien que' is one word
        # .iter for recursive, .findall for depth of 1
        # id the cluster then get the lex element from the cluster (we'll process it later)
        # Collect (lemma, form, cluster, lex) for every non-epsilon node.
        wordsforms = [(x.attrib['lemma'], x.attrib['form'], x.attrib['cluster'],
                       fixMixedEncodings(tree.findall("cluster[@id='" + x.attrib['cluster']+"']")[0].attrib["lex"]))
                      for x in tree.iter('node') if len(x.get('lemma'))>0 and x.get('lemma') != "_EPSILON"]
        print "wordsforms"
        print wordsforms
        # correct the encodings and remove epsilons

        tokens = getTokensFromFile(processedTokenFile)
        if len(wordsforms) > 0:
            # sentence was at least partially parsed
            #print "sorted word forms:"
            #print sorted(wordsforms, key=lambda x: x[2].split('_')[1])
            # sort by the start point of the token
            #re.sub(ur'E\d+F\d+\|', u'', x[3], flags=re.UNICODE)
            # The lex string appears to encode token spans as
            # ...F<pos>|<form>...; pull out the observed forms and their
            # integer positions — TODO confirm the lex format.
            sentence.tokens = [Token(x[0], x[1], re.findall(ur'(?<=[F\d]\d\|)[^ ]+', x[3], flags=re.UNICODE),
                                     [int(x) for x in re.findall(ur'(?<=F)\d+(?=\|)', x[3], flags=re.UNICODE) ])
                               for x in sorted(wordsforms, key=lambda x: int(x[2].split('_')[1]))]
            #sentence.forms = [x[1] for x in sorted(wordsforms, key=lambda x: x[2].split('_')[1])]
        else:
            # sentence wasn't parsed, so use the tokens file:
            #tokens = getTokensFromFile(processedTokenFile)
            print "using tokens***"
            sentence.tokens = [Token(None, None, tokens[i], i) for i in range(len(tokens))]

        #print sentence.tokens
        #print "obs forms:", [x.observedform for x in sentence.tokens]

        # NOTE(review): this re-reads the same dep XML a second time via
        # compareCorrectedCorpus instead of reusing 'tree' parsed above.
        tok2finalforms, tok2lemmacats, verb2info, trees, (weight, minweight) = \
            compareCorrectedCorpus.getFinalTokenFormsAndTreesAndWeight(processedDepXMLFile)
        sentence.setLemmaCats(tok2lemmacats)
        sentence.weightperword = weight
        sentence.minweight = minweight

        # NOTE(review): verbAnalysis is used below as a dict of counts, but the
        # __init__ earlier in this file unpacks classifyVerbs.classifyVerbs(...)
        # as a 2-tuple (analysis, totalverbgroups) — one call site is stale.
        verbAnalysis = classifyVerbs.classifyVerbs(verb2info)
        # Copy each aggregate count onto the sentence, defaulting to 0 when the
        # key is absent.  ("subjonctive" is presumably the exact key spelling
        # emitted by classifyVerbs — verify before "fixing" it.)
        sentence.vsingle = 0
        if "single" in verbAnalysis: sentence.vsingle += verbAnalysis["single"]
        sentence.vaux = 0
        if "aux" in verbAnalysis: sentence.vaux += verbAnalysis["aux"]
        sentence.vcompound = 0
        if "compound" in verbAnalysis: sentence.vcompound += verbAnalysis["compound"]
        sentence.vindicative = 0
        if "indicative" in verbAnalysis: sentence.vindicative += verbAnalysis["indicative"]
        sentence.vconditional = 0
        if "conditional" in verbAnalysis: sentence.vconditional += verbAnalysis["conditional"]
        sentence.vsubjunctive = 0
        if "subjonctive" in verbAnalysis: sentence.vsubjunctive += verbAnalysis["subjonctive"]
        sentence.vimperfect = 0
        if "imperfect" in verbAnalysis: sentence.vimperfect += verbAnalysis["imperfect"]
        sentence.vfuture = 0
        if "future" in verbAnalysis: sentence.vfuture += verbAnalysis["future"]
        sentence.vpresent = 0
        if "present" in verbAnalysis: sentence.vpresent += verbAnalysis["present"]
        sentence.vnotense = 0
        if "notense" in verbAnalysis: sentence.vnotense += verbAnalysis["notense"]
        # clause info:
        sentence.crel = 0
        if "rel" in verbAnalysis: sentence.crel += verbAnalysis["rel"]
        sentence.cnom = 0
        if "nom" in verbAnalysis: sentence.cnom += verbAnalysis["nom"]
        sentence.cacc = 0
        if "acc" in verbAnalysis: sentence.cacc += verbAnalysis["acc"]
        sentence.cloc = 0
        if "loc" in verbAnalysis: sentence.cloc += verbAnalysis["loc"]

        # Count sxpipe-level spelling changes: single-token final forms that
        # differ (case-insensitively) from the corresponding raw token.
        # tok2finalforms appears to be keyed from 1, hence the i+1 below.
        sxpipeSpellingChanges = 0
        for i in range(len(tokens)):
            # skip multitoken elements, too hard
            if len(tok2finalforms[i+1]) > 1 or len(tok2finalforms[i+1][0].split(' ')) > 1: continue
            if tok2finalforms[i+1][0][0] == "_": continue
            t = tokens[i].lower()
            f = tok2finalforms[i+1][0].lower()

            if t != f:
                print "spelling?", t, f, tok2finalforms[i+1]
                sxpipeSpellingChanges += 1
        sentence.spellingcorrections=sxpipeSpellingChanges
        # Dead code below: an earlier multi-token spelling comparison kept as
        # an unassigned string literal (note it contains garbled lines).
        '''
            t = joinTokens([tokens[i].lower()])
            f = joinTokens(tok2finalforms[i+1]).lower()

            print t, f
            if i > 0:
                # tok2finalforms first token is no. 1
                if tok2finalforms[i+1] == tok2finalforms[i]: continue # don't look if two same wordforms in a row
            if t != f:
                # check its not a multiwoprint "spelling?", t, f, tok2finalforms[i+1]
                    sxpipeSpellingChanges += 1rd thing
                tmpi = i+1
                isDouble = False
                while f.startswith(t) and tmpi < len(tokens) and len(t) < len(f):
                    t = joinTokens([t, tokens[tmpi]])
                    if t == f:
                        isDouble = True
                        break
                    tmpi += 1

                if not isDouble:
                    print "spelling?", t, f, tok2finalforms[i+1]
                    sxpipeSpellingChanges += 1
        '''


        # More dead experimental code retained as a string literal.
        '''
        for s_token, observedtoken in zip([sentence.tokens[0]] + [sentence.tokens[i] for i in range(1,len(sentence.tokens))
                                                                  if sentence.tokens[i-1].parseposition != \
                                                                     sentence.tokens[i].parseposition],
                                          getTokensFromFile(processedTokenFile)):
            # the tokens file overrules the depxml, e.g. in depxml you have \?
            if s_token.observedform != observedtoken:
                print "combining:", s_token.frmgform, s_token.observedform, observedtoken
                s_token.observedform = observedtoken
        '''
        #print words
        #print forms

        # we remove double entries from amalgams
        if debug: print "pre make regex:"
        '''
        print [(sentence.tokens[i].parseposition[0],
                sentence.tokens[i-1].parseposition[-1],
                sentence.tokens[i].observedform) for i in range(1, len(sentence.tokens))]
        print [sentence.tokens[0].observedform] +[sentence.tokens[i].observedform
                                   for i in range(1,len(sentence.tokens))
                                   if sentence.tokens[i].parseposition[0] > sentence.tokens[i-1].parseposition[-1]]
        '''
        # (two further dead-code string literals follow)
        '''
        obstokens = []
        obstokenpositions = []
        for token in sentence.tokens:
            for obstoken, obstokenposition in zip(token.observedform, token.parseposition):
                if obstokenposition not in obstokenpositions:
                    obstokens.append(obstoken)
                    obstokenpositions.append(obstokenposition)
        '''
        # deduplicated observed tokens drive the sentence-matching regex
        obstokens = sentence.setAndGetUniqueTokens()
        #print "obstokens", obstokens
        sentence.matchregex = makeRegexFromTokens(obstokens)
        if debug: print "obstokens:", obstokens
        '''
        sentence.matchregex = makeRegexFromTokens(
            [sentence.tokens[0].observedform] +[sentence.tokens[i].observedform
                                   for i in range(1,len(sentence.tokens))
                                   if sentence.tokens[i].parseposition[0] > sentence.tokens[i-1].parseposition[-1]])
        '''
    return sentence