def convert_compounds(self, sentence, sid):
    """Fuse token ``sid`` into token ``sid + 1`` and blank out token ``sid``.

    ``sentence`` is indexed by stringified token ids; each entry is a row
    indexed by the column attributes of ``self`` (``lex``, ``lemma``,
    ``pos``, ``relname``).  The row at ``sid + 1`` receives, in the column
    selected by ``self.uselemma``, the string
    ``<dependent>|<relation>-<dependent POS>|<head>``.  When a lemma column
    exists (``self.lemma > -1``) but is not the column in use, it is
    rewritten as ``<dependent lemma>|<relation>|<head lemma>``.  The entry
    for ``sid`` is then replaced by ``self.erased``.

    The mapping is modified in place and also returned.
    """
    dependent = sentence[str(sid)]

    # The POS of the dependent comes from the POS column for conll7/nyt
    # input, and from the token itself otherwise.
    if self.ptype in ("conll7", "nyt"):
        dependent_pos = self.getPosTag(dependent[self.pos])
    else:
        dependent_pos = getPos(dependent[self.lex])

    column = self.lemma if self.uselemma else self.lex
    head_key = str(sid + 1)

    fused = (dependent[column] + "|" + dependent[self.relname] + "-"
             + dependent_pos + "|" + sentence[head_key][column])
    sentence[head_key][column] = fused

    if self.lemma > -1 and not self.uselemma:
        head = sentence[head_key]
        head[self.lemma] = (dependent[self.lemma] + "|"
                            + dependent[self.relname] + "|"
                            + head[self.lemma])

    # Tokens that depended on the erased token are deliberately NOT
    # re-attached to the head: they should not count as its dependents.
    sentence[str(sid)] = self.erased
    return sentence
def aptTransform(self, arc, outputformat="conll4"):
    """Convert one arc to the three-column ``[token/POS, head, rel]`` form.

    For ``ptype == "conll7"`` the input arc is
    ``[form, lemma, POS, NER, head, rel]``; the APT input format wants
    ``[form/POS, head, rel]``.

    If ``outputformat`` equals ``self.ptype`` the arc is returned untouched.
    Otherwise the token is read from the lemma column when
    ``self.uselemma`` is set (else from the lexeme column), and the head
    index is shifted by ``1 - self.firstindex``.
    """
    if outputformat == self.ptype:
        return arc

    column = self.lemma if self.uselemma else self.lex
    head_shift = 1 - self.firstindex

    if self.ptype in ("conll7", "nyt"):
        # Separate POS column: lower-case the token and append the tag.
        tagged = arc[column].lower() + "/" + self.getPosTag(arc[self.pos])
    else:
        # Both the lexeme and the POS are extracted from the token itself.
        tagged = getLex(arc[column], tdelim='|') + "/" + getPos(arc[column])

    return [tagged, int(arc[self.headpos]) + head_shift, arc[self.relname]]
def process_sentence(self, sentence):
    """Collect relation/POS evidence for known compounds in one sentence.

    The relation name and POS tags of a compound are not known in advance,
    so every arc whose head is the immediately following token is checked:
    if "<dependent> <head>" (lower-cased) is a key of ``self.compounds``,
    the relation and the POS of both words are passed to that entry's
    ``match`` method and ``self.counts`` is incremented for the key.

    Arcs whose length differs from ``self.arclength`` and relations listed
    in ``CompoundFinder.blacklist`` are skipped.
    """
    for i, arc in enumerate(sentence.values()):
        if len(arc) != self.arclength:
            continue

        canddep = getLex(arc[self.lex], postagged=self.postagged).lower()
        candrel = arc[self.relname]

        # NOTE(review): ``i + 2`` equals "next token" only if token ids are
        # 1-based and ``sentence.values()`` iterates in id order - confirm
        # that ``sentence`` is an ordered mapping.
        if int(arc[self.headpos]) != i + 2:
            continue
        if candrel in CompoundFinder.blacklist:
            continue

        # NOTE(review): column 0 is hard-coded here rather than self.lex.
        candhead = getLex(sentence[arc[self.headpos]][0]).lower()
        candkey = canddep + " " + candhead

        # Direct membership test: ``in self.compounds.keys()`` builds a new
        # list of all keys on every call under Python 2.
        if candkey not in self.compounds:
            continue

        if self.ptype == "nyt" or self.ptype == "conll7":
            # Separate POS column: use the first character of each tag.
            evidence = (candrel,
                        arc[self.pos][0],
                        sentence[arc[self.headpos]][self.pos][0])
        else:
            # POS is read from the token itself; columns 0 (token) and
            # 1 (head) are hard-coded for this format.
            evidence = (candrel,
                        getPos(arc[0]),
                        getPos(sentence[arc[1]][0]))
        self.compounds[candkey].match(canddep, candhead, evidence)
        self.counts[candkey] += 1
def process_sentence_contiguous(self, sentence):
    """Find known compounds among adjacent token pairs and optionally merge them.

    Only contiguity matters for detection; relations are inspected
    afterwards for statistics and to decide whether to convert.

    For every token id ``sid`` below the last one, the pair
    (token ``sid``, token ``sid + 1``) is looked up in ``self.compounds``
    by surface form and, when ``self.uselemma`` is set, by lemma/lemma and
    form/lemma first.  For each hit the counters ``self.counts``,
    ``self.cont``, ``self.countpos``, ``self.rels``, ``self.countrel``,
    ``self.cont_revdep`` and ``self.cont_nodep`` are updated as applicable,
    and, if ``self.convert`` is set, noun-noun pairs are merged with
    ``self.convert_compounds`` unless the dependency runs from the second
    token to the first.

    The sentence is written to ``self.cleanstream`` before processing when
    ``self.clean`` is set, and to ``self.outstream`` /
    ``self.convert7stream`` afterwards when ``self.convert`` /
    ``self.convert7`` are set.  Nothing is returned.
    """
    if self.clean:
        self.output_sentence(sentence, self.cleanstream)

    compounds = self.compounds
    mxindex = len(sentence) - 1 + self.firstindex

    # Iterate over a snapshot of the keys: entries are replaced while
    # looping when compounds are converted.
    for i in list(sentence):
        try:
            sid = int(i)
            if sid >= mxindex:
                continue

            arc = sentence[i]
            canddep = getLex(arc[self.lex]).lower()
            nxt = sentence[str(sid + 1)]
            candhead = getLex(nxt[self.lex]).lower()
            candkey = canddep + " " + candhead

            if self.uselemma:
                head_lemma = getLex(nxt[self.lemma]).lower()
                lemma_key = getLex(arc[self.lemma]).lower() + " " + head_lemma
                mixed_key = getLex(arc[self.lex].lower()) + " " + head_lemma
                # Prefer lemma/lemma, then form/lemma, then the surface key.
                if lemma_key in compounds:
                    candkey = lemma_key
                elif mixed_key in compounds:
                    candkey = mixed_key

            if candkey not in compounds:
                continue

            self.counts[candkey] += 1
            self.cont += 1
            if self.counts[candkey] == 1:
                print("First found compound: line " + str(self.lines))
                # NOTE(review): looking up the head of ``arc`` can raise
                # (e.g. a head id with no entry), which aborts the rest of
                # the handling for this pair - confirm this is intended.
                print("%s %s %s %s"
                      % (i, arc, nxt, sentence[arc[self.headpos]]))

            if self.postagged:
                deppos = getPos(arc[self.lex])
                dephead = getPos(nxt[self.lex])
            else:
                deppos = arc[self.pos][0]
                dephead = nxt[self.pos][0]

            if not (deppos == "N" and dephead == "N"):
                continue
            self.countpos[candkey] += 1

            if arc[self.headpos] == str(sid + 1):
                # First token depends on the second: a real dependency.
                rel = arc[self.relname]
                self.rels[rel] = self.rels.get(rel, 0) + 1
                self.countrel[candkey] += 1
                if self.convert:
                    sentence = self.convert_compounds(sentence, sid)
            elif nxt[self.headpos] == str(sid):
                # Dependency in the other direction: leave it alone so as
                # not to create cyclic dependencies.
                print("Not converting reverse dependency into compound")
                self.cont_revdep += 1
            else:
                print("%s %s %s %s %s %s"
                      % (candkey, " :contiguous but no dependency: ", i, arc,
                         nxt, sentence[arc[self.headpos]]))
                self.cont_nodep += 1
                if self.convert:
                    sentence = self.convert_compounds(sentence, sid)
        except Exception as err:
            # Best effort per token: report what went wrong and carry on.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            print("Warning: error ignored: %r" % (err,))

    if self.convert:
        self.output_sentence(sentence, self.outstream)
    if self.convert7:
        self.output_sentence(sentence, self.convert7stream,
                             outputformat="conll7")