"clause": {}, } #for xmlfilename in xmlfiles: #for xmlfilename in glob.glob('/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellCheckerCorrected/entrycorrected_*/0/0/0/entrycorrected_*.E*.dep.xml'): for xmlfilename in glob.glob('/home/nparslow/Documents/AutoCorrige/Corpora/analysed_SpellCheckerCorrected/entrycorrected_394/0/0/0/entrycorrected_*.E*.dep.xml'): #xmlfilename = "/home/nparslow/Documents/AutoCorrige/Corpora/" + xmlfilename print "analysing", xmlfilename tok2finalforms, tok2lemmacats, verb2info, trees, (weight, maxweight) = getFinalTokenFormsAndTreesAndWeight(xmlfilename) print print "verb2info" print verb2info print info = classifyVerbs.classifyVerbs(verb2info) print "info" print info print ''' for verb, info in verb2info.items(): if len(info) == 1: tense = "notense" if "tense" in info[0]: tense = info[0]["tense"] mood = "infinitive" # sometimes mood is not there if "mode" in info[0]: mood = info[0]["mode"] if mood == "indicative" and tense == "notense":
import os
import re

# project-local modules used below, assumed importable alongside this file:
import classifyVerbs
import getMEltInfo
import readDepXMLFile


# constructor of the Sentence class
def __init__(self, processedDepXMLFile, processedTokenFile, processedLogFile, processedMEltFile, debug=False):
    if debug:
        print "new Sentence"
        print processedDepXMLFile
        print processedTokenFile
        print processedLogFile
        print processedMEltFile
    self.debug = debug
    self.tokens = None
    self.rawtokens = None
    self.matchregex = None
    self.uniquetokens = None
    self.lemmaCats = None
    self.spellingcorrections = None
    self.weightperword = None
    self.minweight = None
    self.meltconfidences = None
    self.meltdiffs = None
    self.wordsbeforemainverb = None
    self.hasnomainverb = None
    # parse info
    self.parsed = None
    # verb info:
    # vgroups should be the sum of verb entries for a particular axis projection;
    # it is not the number of verbs, as compounds count as one verb group.
    self.vgroups = None
    self.vanalysis = None  # will be a dictionary with property -> count
    self.vsingle = None
    self.vaux = None
    self.vcompound = None
    self.vindicative = None
    self.vconditional = None
    self.vsubjunctive = None
    self.vimperfect = None
    self.vfuture = None
    self.vpresent = None
    self.vnotense = None
    # clause info:
    self.crel = None
    self.cnom = None
    self.cacc = None
    self.cloc = None
    # tree info
    self.trees = None
    self.wordforms = []
    self.verbAnalysis = None
    self.totalverbgroups = None

    if os.path.isfile(processedTokenFile):
        self.rawtokens = readDepXMLFile.getTokensFromFile(processedTokenFile)

    # meltToks is a list of (tok, tag, prob)
    meltToks = getMEltInfo.loadAndAlign(processedMEltFile, self.rawtokens)

    if os.path.isfile(processedDepXMLFile):
        xmlinfo = readDepXMLFile.readDepXMLFile(processedDepXMLFile, self.debug)
        self.wordforms = xmlinfo["wordforms"]
        if debug:
            print "xmlinfo, tok to final forms:"
            print xmlinfo["tok2finalforms"]
            print "xmlinfo, tok to lemmacats:"
            print xmlinfo["tok2lemmacats"]
        self.setLemmaCats(xmlinfo["tok2lemmacats"])
        if debug:
            print "xmlinfo, lemmacats:"
            print self.lemmaCats
        self.weightperword = xmlinfo["weightperword"]
        self.minweight = xmlinfo["minweight"]
        self.wordsbeforemainverb = xmlinfo["wordsbeforemainverb"]
        self.hasnomainverb = 1 if xmlinfo["wordsbeforemainverb"] == -1 else 0
        self.trees = dict((x, len(xmlinfo["trees"][x])) for x in xmlinfo["trees"])
        self.parsed = xmlinfo["parsemode"]
        #print "sent parsed", self.parsed
        self.tok2finalforms = xmlinfo["tok2finalforms"]
        #print "sent", xmlinfo["verb2info"]
        self.verbAnalysis, self.totalverbgroups = classifyVerbs.classifyVerbs(xmlinfo["verb2info"])
        #print "sent", self.verbAnalysis
        self.calcSpellingCorrections()
        self.meltdiffs = getMEltInfo.getMEltDiffs(xmlinfo["tok2lemmacats"], meltToks, debug)

    if len(self.wordforms) > 0:
        # sentence was at least partially parsed;
        # sort by the start point of the token
        #re.sub(ur'E\d+F\d+\|', u'', x[3], flags=re.UNICODE)
        self.tokens = [Token(x[0], x[1],
                             re.findall(ur'(?<=[F\d]\d\|)[^ ]+', x[3], flags=re.UNICODE),
                             [int(p) for p in re.findall(ur'(?<=F)\d+(?=\|)', x[3], flags=re.UNICODE)])
                       for x in sorted(self.wordforms, key=lambda x: int(x[2].split('_')[1]))]
        #self.forms = [x[1] for x in sorted(self.wordforms, key=lambda x: x[2].split('_')[1])]
    else:
        # sentence wasn't parsed, so use the tokens file:
        print "using tokens***"
        self.tokens = [Token(None, None, self.rawtokens[i], i) for i in range(len(self.rawtokens))]
    if debug:
        print "sent tokens:", self.tokens

    self.matchregex = makeRegexFromTokens(self.rawtokens, debug)
    if debug:
        print "obstokens:", self.rawtokens
        print "regex", self.matchregex

    # MElt tagger confidences, defaulting to 1.0 when no probability is given
    self.meltconfidences = [float(x[2]) if (x[2] is not None and x[2] != 'None') else 1.0
                            for x in meltToks]
    if debug:
        print "sent melt confs:", self.meltconfidences
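# A self-contained sketch of how the two re.findall patterns above split a
# cluster "lex" string into observed forms and token positions. The
# "E<sentence>F<token>|form" layout is inferred from the regexes and from the
# commented re.sub above; the sample string is hypothetical, not real parser
# output.
def sketch_split_lex(lex):
    # observed forms: the non-space run after each "F<digit>|" (or "<digit><digit>|") marker
    forms = re.findall(ur'(?<=[F\d]\d\|)[^ ]+', lex, flags=re.UNICODE)
    # token positions: the digits between "F" and "|"
    positions = [int(p) for p in re.findall(ur'(?<=F)\d+(?=\|)', lex, flags=re.UNICODE)]
    return forms, positions

#print sketch_split_lex(u"E1F3|le E1F4|chat")  # -> ([u'le', u'chat'], [3, 4])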
def getNextSentenceFromFiles(processedDepXMLFile, processedTokenFile, processedLogFile, debug=False):
    #print processedSentenceFile
    sentence = None
    if os.path.isfile(processedDepXMLFile):
        sentence = Sentence()
        tree = ET.parse(processedDepXMLFile)
        # 'W' nodes are 'words', which can include multiple tokens, e.g. 'bien que' is one word.
        # .iter is recursive, .findall has a depth of 1.
        # Identify the cluster, then get the lex element from the cluster (we'll process it later).
        # Correct the encodings and remove epsilons.
        wordsforms = [(x.attrib['lemma'], x.attrib['form'], x.attrib['cluster'],
                       fixMixedEncodings(tree.findall("cluster[@id='" + x.attrib['cluster'] + "']")[0].attrib["lex"]))
                      for x in tree.iter('node')
                      if x.get('lemma') and x.get('lemma') != "_EPSILON"]
        print "wordsforms"
        print wordsforms

        tokens = getTokensFromFile(processedTokenFile)
        if len(wordsforms) > 0:
            # sentence was at least partially parsed;
            # sort by the start point of the token
            #re.sub(ur'E\d+F\d+\|', u'', x[3], flags=re.UNICODE)
            sentence.tokens = [Token(x[0], x[1],
                                     re.findall(ur'(?<=[F\d]\d\|)[^ ]+', x[3], flags=re.UNICODE),
                                     [int(p) for p in re.findall(ur'(?<=F)\d+(?=\|)', x[3], flags=re.UNICODE)])
                               for x in sorted(wordsforms, key=lambda x: int(x[2].split('_')[1]))]
            #sentence.forms = [x[1] for x in sorted(wordsforms, key=lambda x: x[2].split('_')[1])]
        else:
            # sentence wasn't parsed, so use the tokens file:
            print "using tokens***"
            sentence.tokens = [Token(None, None, tokens[i], i) for i in range(len(tokens))]
        #print sentence.tokens
        #print "obs forms:", [x.observedform for x in sentence.tokens]

        tok2finalforms, tok2lemmacats, verb2info, trees, (weight, minweight) = \
            compareCorrectedCorpus.getFinalTokenFormsAndTreesAndWeight(processedDepXMLFile)
        sentence.setLemmaCats(tok2lemmacats)
        sentence.weightperword = weight
        sentence.minweight = minweight

        verbAnalysis = classifyVerbs.classifyVerbs(verb2info)
        # verb info (absent keys count as zero):
        sentence.vsingle = verbAnalysis.get("single", 0)
        sentence.vaux = verbAnalysis.get("aux", 0)
        sentence.vcompound = verbAnalysis.get("compound", 0)
        sentence.vindicative = verbAnalysis.get("indicative", 0)
        sentence.vconditional = verbAnalysis.get("conditional", 0)
        sentence.vsubjunctive = verbAnalysis.get("subjonctive", 0)  # sic: key spelled "subjonctive"
        sentence.vimperfect = verbAnalysis.get("imperfect", 0)
        sentence.vfuture = verbAnalysis.get("future", 0)
        sentence.vpresent = verbAnalysis.get("present", 0)
        sentence.vnotense = verbAnalysis.get("notense", 0)
        # clause info:
        sentence.crel = verbAnalysis.get("rel", 0)
        sentence.cnom = verbAnalysis.get("nom", 0)
        sentence.cacc = verbAnalysis.get("acc", 0)
        sentence.cloc = verbAnalysis.get("loc", 0)

        # count tokens whose sxpipe final form differs from the observed token
        sxpipeSpellingChanges = 0
        for i in range(len(tokens)):
            # skip multitoken elements, too hard
            if len(tok2finalforms[i+1]) > 1 or len(tok2finalforms[i+1][0].split(' ')) > 1:
                continue
            if tok2finalforms[i+1][0][0] == "_":
                continue
            t = tokens[i].lower()
            f = tok2finalforms[i+1][0].lower()
            if t != f:
                print "spelling?", t, f, tok2finalforms[i+1]
                sxpipeSpellingChanges += 1
        sentence.spellingcorrections = sxpipeSpellingChanges
        '''
        t = joinTokens([tokens[i].lower()])
        f = joinTokens(tok2finalforms[i+1]).lower()
        print t, f
        if i > 0:  # tok2finalforms first token is no. 1
            if tok2finalforms[i+1] == tok2finalforms[i]:
                continue  # don't look if two same wordforms in a row
        if t != f:
            # check it's not a multiword thing
            tmpi = i + 1
            isDouble = False
            while f.startswith(t) and tmpi < len(tokens) and len(t) < len(f):
                t = joinTokens([t, tokens[tmpi]])
                if t == f:
                    isDouble = True
                    break
                tmpi += 1
            if not isDouble:
                print "spelling?", t, f, tok2finalforms[i+1]
                sxpipeSpellingChanges += 1
        '''
        '''
        for s_token, observedtoken in zip([sentence.tokens[0]] +
                                          [sentence.tokens[i] for i in range(1, len(sentence.tokens))
                                           if sentence.tokens[i-1].parseposition != sentence.tokens[i].parseposition],
                                          getTokensFromFile(processedTokenFile)):
            # the tokens file overrules the depxml, e.g. in depxml you have \?
            if s_token.observedform != observedtoken:
                print "combining:", s_token.frmgform, s_token.observedform, observedtoken
                s_token.observedform = observedtoken
        '''
        #print words
        #print forms
        # we remove double entries from amalgams
        if debug:
            print "pre make regex:"
            '''
            print [(sentence.tokens[i].parseposition[0], sentence.tokens[i-1].parseposition[-1],
                    sentence.tokens[i].observedform) for i in range(1, len(sentence.tokens))]
            print [sentence.tokens[0].observedform] + \
                  [sentence.tokens[i].observedform for i in range(1, len(sentence.tokens))
                   if sentence.tokens[i].parseposition[0] > sentence.tokens[i-1].parseposition[-1]]
            '''
        '''
        obstokens = []
        obstokenpositions = []
        for token in sentence.tokens:
            for obstoken, obstokenposition in zip(token.observedform, token.parseposition):
                if obstokenposition not in obstokenpositions:
                    obstokens.append(obstoken)
                    obstokenpositions.append(obstokenposition)
        '''
        obstokens = sentence.setAndGetUniqueTokens()
        #print "obstokens", obstokens
        sentence.matchregex = makeRegexFromTokens(obstokens)
        if debug:
            print "obstokens:", obstokens
        '''
        sentence.matchregex = makeRegexFromTokens(
            [sentence.tokens[0].observedform] +
            [sentence.tokens[i].observedform for i in range(1, len(sentence.tokens))
             if sentence.tokens[i].parseposition[0] > sentence.tokens[i-1].parseposition[-1]])
        '''
    return sentence
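# A self-contained sketch of the sxpipe spelling-change count above, run on
# hypothetical data. tok2finalforms is 1-indexed ("first token is no. 1", per
# the comment in the disabled block) and maps each token index to a list of
# final forms; the French tokens in the usage comment are made up.
def sketch_count_spelling_changes(tokens, tok2finalforms):
    changes = 0
    for i in range(len(tokens)):
        finals = tok2finalforms[i + 1]
        # skip multitoken elements and special "_" entries, as above
        if len(finals) > 1 or len(finals[0].split(' ')) > 1 or finals[0][0] == "_":
            continue
        if tokens[i].lower() != finals[0].lower():
            changes += 1
    return changes

#print sketch_count_spelling_changes([u"chein", u"noir"], {1: [u"chien"], 2: [u"noir"]})  # -> 1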