コード例 #1
0
    def process_sentence(self, sentence):
        """Record every known compound formed by a token and its adjacent head.

        Neither the relation name nor the POS tags of a compound are known in
        advance, so any arc whose relation is not blacklisted and whose head
        is the immediately following position is treated as a candidate.
        The candidate key is ``"<dependent> <head>"`` in lower case.

        :param sentence: mapping from token id to arc (an indexable record).
            Arcs are visited in ``sentence.values()`` order and the arc at
            zero-based position ``i`` is compared against head id ``i + 2``.
            NOTE(review): this presumably assumes one-based token ids and
            values returned in token order -- confirm against the caller.
        :return: None. Each hit is reported through
            ``self.compounds[key].match(...)`` and counted in
            ``self.counts[key]``.
        """
        for i, arc in enumerate(sentence.values()):
            # Arcs that do not have the expected number of fields are skipped.
            if len(arc) != self.arclength:
                continue

            canddep = getLex(arc[self.lex], postagged=self.postagged).lower()
            candrel = arc[self.relname]

            # Only arcs pointing at the next position, with an allowed
            # relation, can form a compound.
            if int(arc[self.headpos]) != i + 2 or candrel in CompoundFinder.blacklist:
                continue

            headarc = sentence[arc[self.headpos]]
            candhead = getLex(headarc[0]).lower()
            candkey = canddep + " " + candhead

            # Direct membership test: avoids building and scanning the key
            # list that ``.keys()`` produces on every arc.
            if candkey not in self.compounds:
                continue

            if self.ptype in ("nyt", "conll7"):
                # These layouts carry the POS tag in its own field; only its
                # first character is passed on.
                evidence = (candrel, arc[self.pos][0], headarc[self.pos][0])
            else:
                # Other layouts: the POS is extracted from field 0 of the arc
                # and of the arc that field 1 points to (presumably a
                # "form/POS" token and the head id -- TODO confirm).
                evidence = (candrel, getPos(arc[0]), getPos(sentence[arc[1]][0]))

            self.compounds[candkey].match(canddep, candhead, evidence)
            self.counts[candkey] += 1
コード例 #2
0
    def aptTransform(self, arc, outputformat="conll4"):
        """Convert ``arc`` into the three-field layout used as apt input.

        With ``ptype == "conll7"`` an arc is laid out as
        ``[form, lemma, POS, NER, head, rel]``; the apt input layout is
        ``[form/POS, head, rel]``.

        :param arc: indexable token record in this object's ``ptype`` layout.
        :param outputformat: requested layout. When it equals ``self.ptype``
            the arc is handed back unchanged.
        :return: ``arc`` itself, or a new list ``[token/POS, head, rel]`` whose
            head index has been shifted by ``1 - self.firstindex``.
        """
        # Nothing to do when the arc is already in the requested layout.
        if outputformat == self.ptype:
            return arc

        # Use the lemma or the surface form, depending on configuration.
        source_field = self.lemma if self.uselemma else self.lex
        head_shift = 1 - self.firstindex

        if self.ptype in ("conll7", "nyt"):
            # The POS tag lives in its own field for these layouts.
            tagged = arc[source_field].lower() + "/" + self.getPosTag(arc[self.pos])
        else:
            # Otherwise both the word and the POS are extracted from the
            # token field itself.
            raw_token = arc[source_field]
            tagged = getLex(raw_token, tdelim='|') + "/" + getPos(raw_token)

        head = int(arc[self.headpos]) + head_shift
        return [tagged, head, arc[self.relname]]
コード例 #3
0
    def process_sentence_contiguous(self,sentence):
        #just interested in contiguous nouns don't care about relations

        #print sentence
        if self.clean:
            self.output_sentence(sentence,self.cleanstream)
        mxindex=len(sentence.values())-1+self.firstindex
        lemmamatch=False
        for i in sentence.keys():
            try:
                sid =int (i)
                if sid<mxindex:
                    arc=sentence[i]
                    canddep=getLex(arc[self.lex]).lower()
                    candhead=getLex(sentence[str(sid+1)][self.lex]).lower()

                    candkey=canddep+" "+candhead
                    if self.uselemma:
                        candkey2=getLex(arc[self.lemma]).lower()+" "+getLex(sentence[str(sid+1)][self.lemma]).lower()
                        candkey3=getLex(arc[self.lex].lower())+" "+getLex(sentence[str(sid+1)][self.lemma]).lower()
                        if candkey2 in self.compounds.keys():
                            lemmamatch=True
                            candkey=candkey2

                        elif candkey3 in self.compounds.keys():
                            lemmamatch=True
                            candkey=candkey3
                        else:
                            lemmamatch=False


                    #print candkey
                    if lemmamatch or (candkey in self.compounds.keys()):
                        #print "Found",candkey
                        self.counts[candkey]+=1
                        self.cont+=1
                        if self.counts[candkey]==1:
                            print "First found compound: line "+str(self.lines)
                            print i, arc, sentence[str(sid+1)],sentence[arc[self.headpos]]

                        if self.postagged:
                            deppos=getPos(arc[self.lex])
                            dephead=getPos(sentence[str(sid+1)][self.lex])
                        else:
                            deppos=arc[self.pos][0]
                            dephead=sentence[str(sid+1)][self.pos][0]

                        if deppos=="N" and dephead=="N":
                            self.countpos[candkey]+=1
                        if arc[self.headpos]==str(sid+1): #dependency relationship
                            sofar=self.rels.get(arc[self.relname],0)
                            self.rels[arc[self.relname]]=sofar+1

                            self.countrel[candkey]+=1
                            if self.convert: sentence=self.convert_compounds(sentence,sid)

                        else:
                            if sentence[str(sid+1)][self.headpos]==str(sid): #dependency relationship in other direction - ignore this so as to not create cyclic dependencies
                            #if False:
                                print "Not converting reverse dependency into compound"
                                self.cont_revdep+=1
                            else:
                                print candkey, " :contiguous but no dependency: ",i,arc,sentence[str(sid+1)],sentence[arc[self.headpos]]
                                self.cont_nodep+=1
                                if self.convert: sentence=self.convert_compounds(sentence,sid)
            except:
                print "Warning: error ignored"
                pass

        if self.convert: self.output_sentence(sentence, self.outstream)
        if self.convert7: self.output_sentence(sentence, self.convert7stream, outputformat="conll7")