# Python 2 source. Imports inferred from usage in this file; WordVector,
# SentencePair and update_params are assumed to come from local project
# modules, so the module names below are placeholders.
import re
import sys
import glob
import random
import operator

import numpy
from scipy import sparse, stats
import matplotlib.pyplot as plt

from wordvector import WordVector, update_params
from sentencepair import SentencePair

class STSData:
    sidPATT = re.compile('.*<document>')
    sidendPATT = re.compile('.*</document>')
    wordPATT = re.compile('.*<word>(.*)</word>')
    lemmaPATT = re.compile('.*<lemma>(.*)</lemma>')
    posPATT = re.compile('.*<POS>(.).*</POS>') #only first char of POS
    fileidPATT= re.compile('.*input(.*)\.pair(.*)(.)\.tagged') #literal dots escaped
    gssetPATT = re.compile('.*\.gs\.(.*)\.txt')
    wordposPATT = re.compile('(.*)/(.)') #only first char of POS
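    # Illustrative (assumed) filenames/tokens these patterns are meant to match:
    #   fileidPATT : .../input<setid>.pair<fileid><A|B>.tagged
    #   gssetPATT  : .../STS.gs.<setid>.txt
    #   wordposPATT: dog/N  (a word plus the first character of its POS tag)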
    methods = ["additive","multiplicative"]
    setmethods = ["avg_max","geo_max"]
    simthreshold = 1.0
    minsim = 0.001
    threshtype="nonbin"
    seed = 666


    def __init__(self,graphson,testing,windows,threshold,threshtype,verbose,adja,adjb):
        self.pairset={} #label is setid_fileid
        self.vectordict={} #mapping from (word,POS) tuples to wordvectors
        self.wordcounts={} #count the number of times each (word,POS) tuple occurs in data for analysis
        self.uncovered={} #store words and counts not in thesaurus
        self.sid=0
        self.filesread=0
        self.setid=""
        self.sentid=""
        self.label=""
        self.fileid=0
        self.simaverage={} #average similarities for different functions and subsets
        self.nosplits=-1 #number of cross-validation splits
        self.show=graphson
        self.updated=0
        self.testing=testing
        self.comp=""
        self.metric=""
        self.setsim=""
        self.allfeatures={} #dictionary of all feature dimensions
        self.fkeys=[] #list (to be sorted) of all feature keys
        self.fk_idx={} #feature --> dimension
        self.dim=0
        WordVector.windows=windows
        STSData.simthreshold=threshold
        STSData.threshtype=threshtype
        self.verbose = verbose
        self.adja=adja
        self.adjb=adjb

    def setseed(self):
        random.seed(STSData.seed)#for reproducible results

    def readdata(self,parentname):
        dirlist = glob.glob(parentname+'/*')
        if len(dirlist)==0:
            print "Aborting due to empty data directory "+parentname
            exit(1)
        for d in dirlist:
            print "Reading "+d
            filelist = glob.glob(d+'/*')
            for f in filelist:
                #print "Reading "+f
                matchobj = STSData.fileidPATT.match(f)
                if matchobj:
                    self.setid=matchobj.group(1)
                    self.fileid=matchobj.group(2)
                    self.sentid=matchobj.group(3)
                    self.label=self.setid+"_"+self.fileid
                    #print "Self.label = "+self.label
                    #print "Setid: "+self.setid+" fileid: "+self.fileid
                else:
                    print "Error with filename, should contain id number "+f

                self.readdatafile(f)
            if self.testing == True: break
#        self.removeduplicates()
        self.vectordict_init()


    def readdatafile(self,filename):
        #print "Opening "+filename
        dstream = open(filename,'r')
        self.validfile=True
        for line in dstream:
            self.processline(line.rstrip())
        dstream.close()
        self.filesread+=1


    def processline(self,line):
        #dispatch on the tag type of this line of tagged input
        matchobj=STSData.sidPATT.match(line)
        if matchobj: #start of a document: fetch or create the sentence pair
            if self.label in self.pairset:
                self.currentpair=self.pairset[self.label]
            else:
                self.currentpair=SentencePair(self.fileid,self.setid,self.testing)
            return
        matchobj=STSData.sidendPATT.match(line)
        if matchobj: #end of a document: store the completed pair
            self.pairset[self.label]=self.currentpair
            return
        matchobj = STSData.wordPATT.match(line)
        if matchobj:
            self.currentpair.addword(matchobj.group(1),self.sentid)
            return
        matchobj = STSData.lemmaPATT.match(line)
        if matchobj:
            self.currentpair.addlemma(matchobj.group(1),self.sentid)
            return
        matchobj = STSData.posPATT.match(line)
        if matchobj:
            self.currentpair.addpos(matchobj.group(1),self.sentid)
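    # Illustrative fragment of the tagged input processline() expects
    # (layout inferred from the patterns above; real files may differ):
    #   <document>
    #     <word>dogs</word>
    #     <lemma>dog</lemma>
    #     <POS>NNS</POS>
    #   </document>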


    def removeduplicates(self):
        #remove pairs from pairset where the two sentences are identical
        #however, this needs work as the system assumes consecutive numbering of pairs
        total={}
        dups={}
        for key in self.pairset.keys():
            pair=self.pairset[key]
            fileid=pair.fid
            total[fileid]=total.get(fileid,0)+1
            if pair.isidentical():
                dups[fileid]=dups.get(fileid,0)+1
                print "Removing duplicate:"
                pair.display()
                del self.pairset[key]
        for fileid in total.keys():
            top=dups.get(fileid,0)
            percent = top*100.0/total[fileid]
            print "For "+fileid+" removed "+str(top)+" duplicates out of "+str(total[fileid])+" pairs: "+str(percent)+"%"


    def averagesim(self,type,subset):
        label=type+"_"+subset
        #print label
        if label in self.simaverage:
            average = self.simaverage[label]
        else:
            total=0
            count=0
            if subset=='all':
                for p in self.pairset.values():
                    total+=p.sim(type)
                    count+=1
            else:
                for p in self.pairset.values():
                    if p.fid == subset:
                        total+=p.sim(type)
                        count+=1
            average = total/count
            self.simaverage[label]=average
        return average

    def readgs(self,dirname):
        #read in gs scores and associate with sentence pairs
        filelist = glob.glob(dirname+'/*')
        for f in filelist:
            print "Reading "+f
            self.readgsfile(f)
            if self.testing == True: break

    def readgsfile(self,filename):
        matchobj=STSData.gssetPATT.match(filename)
        if matchobj:
            gsid=matchobj.group(1)
        else:
            print "Error with gs file "+filename
            exit(1)
        pid=0
        instream=open(filename,'r')
        for line in instream:
            pid +=1
            self.processgsline(line.rstrip(),gsid,pid)
        instream.close()

    def processgsline(self,line,gsid,pid):
        label = gsid+"_"+str(pid)
        if label in self.pairset.keys():
            self.pairset[label].gs=float(line)
        else:
            print "WARNING "+label+" not found in pairset"


    def split(self,num):
        #print "Splitting data into subsets for cross-validation ..."
        self.nosplits=num
        for pair in self.pairset.values():
            pair.cvsplit = random.randint(1,num)
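        #note: random.randint(1,num) is inclusive at both ends, so each pair is
        #independently assigned a fold in 1..num and fold sizes will vary slightly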

    def fitpoly(self,subset,excl,type):
        #subset is a setid and excl is cv_split to exclude and type is similarity to correlate
        carryon=True
        fileid = 1
        correlationx=[]
        correlationy=[]
        while carryon:
            label = subset+"_"+str(fileid)
            if label in self.pairset.keys():
                if self.pairset[label].cvsplit!=excl: #skip the held-out split
                    correlationy.append(self.pairset[label].gs)
                    correlationx.append(self.pairset[label].sim(type))
                fileid+=1
            else:
                carryon=False #assumes pairs are consecutively numbered
        #print len(correlationx),len(correlationy)
        x=numpy.array(correlationx)
        y=numpy.array(correlationy)
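        #numpy.polyfit(x,y,1) is a least-squares straight-line fit; numpy.poly1d
        #wraps the returned coefficients so the fitted line can be called as a function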
        thispoly= numpy.poly1d(numpy.polyfit(x,y,1))

        if excl==1 and self.show == True:
            pr=stats.spearmanr(x,y)
            mytitle="Regression line for: "+subset+":"+str(excl)+":"+type
            self.showpoly(x,y,thispoly,mytitle,pr,1,5)

        return thispoly

    def showpoly(self,x,y,poly,title,pr,xl,yl):
        xp=numpy.linspace(0,xl,100)
        plt.plot(x,y,'.',xp,poly(xp),'-')
        plt.ylim(0,yl)
        plt.title(title)
        mytext1="srcc = "+str(pr[0])
        mytext2="p = "+str(pr[1])
        plt.text(0.05,yl*0.9,mytext1)
        plt.text(0.05,yl*0.8,mytext2)
        plt.show()

    def testpoly(self,subset,excl,type):

        #to generate and test regression line

        thispoly = self.fitpoly(subset,excl,type)
        #print thispoly

        fileid=1
        predictions=[]
        gs=[]
        carryon=True
        while carryon:
            label = subset+"_"+str(fileid)
            if label in self.pairset.keys():
                if self.pairset[label].cvsplit==excl: #test on the held-out split
                    predictions.append(thispoly(self.pairset[label].sim(type)))
                    gs.append(self.pairset[label].gs)
                fileid+=1
            else:
                carryon=False #assumes pairs are consecutively numbered

        #compute the Spearman correlation coefficient between gs and predictions
        x=numpy.array(predictions)
        y=numpy.array(gs)
        pr=stats.spearmanr(x,y)
        if excl==1 and self.show==True:
            mytitle="Correlation for: "+subset+": "+str(excl)+": "+type
            self.showpoly(x,y,numpy.poly1d(numpy.polyfit(x,y,1)),mytitle,pr,5,5)
            #print pr
        return pr

    def testpoly2(self,subset,excl,type1, type2):

        #to generate and compare 2 regression lines

        thispoly1 = self.fitpoly(subset,excl,type1)
        thispoly2 = self.fitpoly(subset,excl,type2)
        #print thispoly

        fileid=1
        predictions=[]
        gs=[]
        carryon=True
        while carryon:
            label = subset+"_"+str(fileid)
            if label in self.pairset.keys():
                if self.pairset[label].cvsplit==excl: #test on the held-out split
                    predictions.append(thispoly2(self.pairset[label].sim(type2)))
                    gs.append(self.pairset[label].gs)
                    #accumulate the difference in squared error between the two fits
                    error1 = thispoly1(self.pairset[label].sim(type1))-self.pairset[label].gs
                    error2 = thispoly2(self.pairset[label].sim(type2))-self.pairset[label].gs
                    self.pairset[label].totaldiff+=pow(error2,2)-pow(error1,2)
                fileid+=1
            else:
                carryon=False #assumes pairs are consecutively numbered
        #compute the Spearman correlation coefficient between gs and predictions

        x=numpy.array(predictions)
        y=numpy.array(gs)
        pr = stats.spearmanr(x,y)
        if excl==1 and self.show==True:
            mytitle="Correlation for: "+subset+": "+str(excl)+": "+type1+" vs "+type2
            self.showpoly(x,y,numpy.poly1d(numpy.polyfit(x,y,1)),mytitle,pr,5,5)
            #print pr
        return pr



    def testread(self,sim,dataset):

        #print "Testing"
        #print "Files read = "+str(self.filesread)
        if self.testing:
            print "Pairs stored = "+str(len(self.pairset))
            for p in self.pairset.values():
                p.display()
        #else:
        #    for p in self.pairset.values():
        #        p.display()


        #print "Average lemma overlap of content words is "+str(self.averagesim("lemma_content","all"))
        print "Average "+sim+" for "+dataset+" data is "+str(self.averagesim(sim,dataset))


    def vectordict_init(self):
        for pair in self.pairset.values():
            for sent in ['A','B']:
                for item in pair.returncontentlemmas(sent):
                    if item in self.vectordict:
                        #self.check=True
                        self.wordcounts[item]+=1 #count how many times each item occurs for analysis
                    else:
                        self.vectordict[item]=WordVector(item)
                        self.wordcounts[item]=1
        print "Vector dictionary initialised with "+str(len(self.vectordict.keys()))+" words"

    def compute_token_coverage(self):
        total=0
        covered=0
        for word in self.wordcounts.keys():
            freq=self.wordcounts[word]
            total+=freq
            if len(self.vectordict[word].vector)>0:
                covered+=freq
            else:
                self.uncovered[word]=freq
        coverage=covered*100.0/total
        self.analyse_uncovered()
        return coverage
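    # Note: readvectors reports *type* coverage (distinct (word,POS) keys that
    # received a vector); compute_token_coverage above weights each key by its
    # frequency in the data.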

    def analyse_uncovered(self):
        #outlog=open('logfile','w')
        poscounts={} #count how many uncovered in each POS
        for tuple in self.uncovered.keys():
            (word,pos)=tuple
            #outlog.write(word+"/"+pos+"\t"+str(self.uncovered[tuple])+"\n")
            if pos in poscounts.keys():
                poscounts[pos]+=1
            else:
                poscounts[pos]=1

        #outlog.close()
        total=0
        print "Uncovered by POS:-"
        for pos in poscounts.keys():
            print pos+" : " + str(poscounts[pos])
            total+=poscounts[pos]
        print "Total "+str(total)

    def readvectors(self,vectorfilename,cachename):
        print "Reading vector file "+vectorfilename
        linesread=0
        instream=open(vectorfilename,'r')
        for line in instream:
            self.processvectorline(line.rstrip())
            linesread+=1
            #if (self.testing==True and linesread>1000):
             #   break
            if (linesread%10000 == 0):
                print "Read "+str(linesread)+" lines and updated "+str(self.updated)+" vectors"
                sys.stdout.flush()

        print "Read "+str(linesread)+" lines and updated "+str(self.updated)+" vectors"
        coverage=self.updated*100.0/len(self.vectordict.keys())
        print "Vector dictionary type coverage is "+str(coverage)+"%"
        print "Token coverage is "+str(self.compute_token_coverage())+"%"
        instream.close()
        if cachename==vectorfilename:
            print "Vector cache up to date"
        else:
            print "Writing vector cache"
            self.makecache(cachename)
        print "Compressing vector dictionary representation"
        self.makematrix()
        print "Finished sparse array generation"

    def processvectorline(self,line):
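        #assumed layout of each vector-file line, inferred from the parsing below:
        #a word/POS key followed by alternating feature/score fields, tab-separated,
        #e.g. dog/N<TAB>amod:big<TAB>0.53<TAB>dobj:walk<TAB>1.27 (feature names hypothetical)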
        featurelist=line.split('\t')
        matchobj = STSData.wordposPATT.match(featurelist[0])
        if matchobj:
            wordpos=(matchobj.group(1),matchobj.group(2))
        else:
            print "Error with vector file matching "+featurelist[0]
            #this could be "__FILTERED" so ignore line and carry on
            return

        #if len(featurelist)>WordVector.dim:
         #   WordVector.dim=len(featurelist)
        if wordpos in self.vectordict.keys():
            featurelist.reverse()
            featurelist.pop()
            self.updatevector(wordpos,featurelist)
            self.updated+=1

    def updatevector(self,wordpos,featurelist):
        #featurelist arrives reversed with the word/POS header already popped
        #off by processvectorline, so each pop() yields feature then score in order
        while featurelist:
            f=featurelist.pop()
            sc=featurelist.pop()
            added=self.vectordict[wordpos].addfeature(f,sc)
            if added:
                self.allfeatures[f]=1
        self.vectordict[wordpos].length=pow(self.vectordict[wordpos].length2,0.5)

    def makematrix(self):
        self.fkeys =self.allfeatures.keys()
        self.fkeys.sort()
        for i in range(len(self.fkeys)):
            self.fk_idx[self.fkeys[i]] = i
        del self.fkeys
        del self.allfeatures
        self.dim=len(self.fk_idx)
        update_params(self.dim,self.adja,self.adjb)
        print "Dimensionality is "+ str(self.dim)
        self.makearrays()

    def makearrays(self):
        #need to convert a word vector which stores a dictionary of features into a sparse array based on fk_idx
        for wordvector in self.vectordict.values():
            temparray = numpy.zeros(self.dim)
            for feature in wordvector.vector.keys():
                col=self.fk_idx[feature]
                temparray[col]=wordvector.vector[feature]
            wordvector.array = sparse.csr_matrix(temparray)
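        #e.g. with fk_idx = {'amod:big':0,'dobj:walk':1} and dim 2, the vector
        #{'dobj:walk':1.27} becomes the 1x2 sparse row [0.0, 1.27] (names hypothetical)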


#    def composeall(self,method,metric):
#        for pair in self.pairset.values():
#            pair.compose(self.vectordict,method,metric) # compose and sentence sim each pair of sentences
#            sys.stdout.flush()


    def composeall_faster(self,method,metric): #compose each sentence in each pair and compute similarity of pair
        self.comp=method
        self.metric=metric
        if method in STSData.methods:
            donepairs=0
            for pair in self.pairset.values():
                self.compose_faster(pair)
                sys.stdout.flush()
                pair.getsentsim()
                donepairs+=1
                if donepairs%500==0:
                    print "Completed composition and similarity calculations for "+str(donepairs)+" pairs"
        else:
            print "Unknown method of composition "+method


    def compose_faster(self,pair):
        pair.comp=self.comp
        pair.metric=self.metric
        for sent in ['A','B']:
            lemmalist=pair.returncontentlemmas(sent) #get all lemmas in sentence
            pair.sentvector[sent]=WordVector((sent,'S'))
            if pair.comp == "multiplicative":
                pair.sentvector[sent].array=sparse.csr_matrix(numpy.ones(self.dim)) #initialise sentence array as ones
            else:  #assume additive
                pair.sentvector[sent].array=sparse.csr_matrix(numpy.zeros(self.dim)) #initialise sentence array as zeroes

            #print lemmalist
            for tuple in lemmalist:
                if tuple in self.vectordict:
                    if len(self.vectordict[tuple].vector)>0:  #only compose non-zero vectors
                    #     print tuple, "yes"
                        if pair.comp == "multiplicative":
                            #pair.sentvector[sent].mult_array(self.vectordict[tuple])
                            pair.sentvector[sent].array=pair.sentvector[sent].array.multiply(self.vectordict[tuple].array)
                        else: #assume additive
                            #pair.sentvector[sent].add_array(self.vectordict[tuple])
                            pair.sentvector[sent].array=pair.sentvector[sent].array + self.vectordict[tuple].array
            #pair.sentvector[sent].display()

    def set_simall(self,method,metric):
        self.metric=metric
        self.setsim=method
        if self.setsim in STSData.setmethods:
            donepairs=0
            for pair in self.pairset.values():
                self.set_sim(pair)
                sys.stdout.flush()
                donepairs+=1
                if donepairs%500==0:
                    print "Completed set similarity calculations for "+str(donepairs)+" pairs"
        else:
            print "Unknown method of set similarity "+self.setsim

    def set_sim(self,pair):
        pair.metric=self.metric
        pair.setsim=self.setsim
        label="set_"+pair.metric+"_"+pair.setsim
        if label in pair.sentsim.keys():
            sim = pair.sentsim[label]
        else:
            lemmalistA=pair.returncontentlemmas('A') #get all content lemmas in sentence A
            lemmalistB=pair.returncontentlemmas('B') #get all content lemmas in sentence B

            #compute asymmetric set sim A->B and B->A
            (total1,count1)= self.set_sim1(lemmalistA,lemmalistB)
            (total2,count2)= self.set_sim1(lemmalistB,lemmalistA)
            #combine: geometric mean of the maxsims for geo_max, arithmetic mean otherwise
            if self.setsim=="geo_max":
                sim=pow(total1*total2,1.0/(count1+count2))
            else:
                sim=(total1+total2)/(count1+count2)
            pair.sentsim[label]=sim
        if self.verbose:
            print (pair.toString("sent_set"))
        return sim

    def set_sim1(self,lemmalistA,lemmalistB): #asymmetric set sim from A to B

        if self.setsim=="geo_max":
            total =1.0
        else:
            total=0.0
        count=0.0
        for lemmaA in lemmalistA:
            maxsim=STSData.minsim #smoothing - if no lemmas in B have entry or any similarity to this lemma
            maxlemma=("$!","$!")
            if lemmaA in self.vectordict:
                if lemmaA in lemmalistB: #check if word is actually in the other sentence
                    maxsim=1.0
                    maxlemma=lemmaA
                    #print lemmaA, lemmaB, thissim
                else:
                    if len(self.vectordict[lemmaA].vector)>0: #only consider non-zero vectors

                        for lemmaB in lemmalistB: #find maximally similar lemma in B
                            if lemmaB in self.vectordict:
                                if len(self.vectordict[lemmaB].vector)>0:
                                    thissim=self.vectordict[lemmaA].findsim(self.vectordict[lemmaB],self.metric)
                                    if thissim<0:
                                        print lemmaA,lemmaB,thissim
                                        self.vectordict[lemmaA].debug=True
                                        thissim=self.vectordict[lemmaA].findsim(self.vectordict[lemmaB],self.metric)
                                    if thissim>1:
                                        print lemmaA, lemmaB, thissim
                                        self.vectordict[lemmaA].debug=True
                                        thissim=self.vectordict[lemmaA].findsim(self.vectordict[lemmaB],self.metric)
                                    if(thissim>maxsim):
                                        maxsim=thissim
                                        maxlemma=lemmaB


            else:
                print "Vector dictionary error for ", lemmaA
            if maxsim < STSData.simthreshold: #similarity threshold
                if STSData.threshtype=="weighted":
                     maxsim = maxsim/STSData.simthreshold #weighted thresholding
                else:
                    maxsim = STSData.minsim # minimum similarity i.e., ignore in binary or non-binary case
                    maxlemma=("$!",maxlemma)
            else:
                if STSData.threshtype=="nonbin":
                    maxsim = maxsim * 1.0 #leave similarity as it is for the non-binary threshold
                else:
                    maxsim = 1.0 #clamp to 1.0 for weighted or binary thresholding

            if self.setsim=="geo_max":
                total = total * maxsim
            else:
                total = total + maxsim
            count+=1
            if self.verbose:
                (wordA,posA)=lemmaA
                (wordB,posB)=maxlemma
                print wordA+"/"+posA+ " : "+wordB+"/"+posB+" : "+str(maxsim)

#        if self.setsim=="geo_max":
#            sim = pow(total,(1.0/count))
#        else:
#            sim = total/count
        return (total,count)
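    # Worked example (hypothetical numbers): if the lemmas in A receive maxsims
    # [1.0, 0.4, 0.001] then set_sim1 returns total=1.401, count=3 for avg_max,
    # or total=1.0*0.4*0.001, count=3 for geo_max; set_sim then combines the two
    # directions by arithmetic mean or by the (count1+count2)-th root respectively.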

    def ranksent(self,f,type,repeats,outstream):
        ranking=[]
        for key in self.pairset.keys():#create unordered list of (key,score) pairs
            if self.pairset[key].fid == f:
                score = self.pairset[key].totaldiff/repeats
                ranking.append((key,score))
        #sort list based on score
        ranking.sort(key=operator.itemgetter(1))
        rank=1
        for (key,score) in ranking:
            outstream.write(str(rank)+" : "+str(score)+"\n")
            outstream.write(self.pairset[key].toString(type))
            rank+=1

    def makecache(self,filename):
        outstream = open(filename,'w')
        for vector in self.vectordict.values():
            if len(vector.vector)>0:
                vector.makecache(outstream)
        outstream.close()

    def inspect(self):
        print "Pairs stored = "+str(len(self.pairset))
        #for p in self.pairset.values():
         #   print(p.toString("sent_set"))
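
# Minimal usage sketch (hypothetical paths, dataset names and argument values;
# the real driver script is not part of this file):
#
#   data = STSData(graphson=False, testing=True, windows=5, threshold=1.0,
#                  threshtype="nonbin", verbose=False, adja=1.0, adjb=1.0)
#   data.setseed()
#   data.readdata('data/train')       # directories of input*.pair*.tagged files
#   data.readgs('data/train-gs')      # matching *.gs.*.txt gold-standard files
#   data.readvectors('vectors.txt','vectors.txt')  # second arg is the cache filename
#   data.composeall_faster("additive","cosine")    # metric name is assumed
#   data.split(10)                    # assign each pair to one of 10 CV folds
#   print data.testpoly("MSRpar",1,"additive_cosine")  # similarity label assumed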