Example #1
0
class Thesaurus:
    """Collection of word vectors keyed by (word, POS) with all-pairs similarities.

    Vectors are either read from a tab-separated vector file and compared
    pairwise (writing the result to the sim cache file), or the similarities
    are read back from a previously written sim cache file.

    The code is written so that it runs unchanged on Python 2 and Python 3:
    every print call takes exactly one string argument.
    """

    wordposPATT = re.compile('(.*)/(.)') #only first char of POS
    byblo = False # byblo neighbours file or appthes generated from vector file

    def __init__(self,vectorfilename,simcachefile,simcache,windows,k,adja,adjb,compress):
        """Store the configuration; no file is read here.

        vectorfilename -- path of the vector file read by readvectors()
        simcachefile   -- path of the similarity cache (read or written by allpairssims())
        simcache       -- if true, use the cache instead of reading vectors / computing sims
        windows        -- assigned to the class attribute WordVector.windows
        k              -- number of neighbours kept per word (passed to topk / outputtopk)
        adja, adjb     -- forwarded to update_params() together with the dimensionality
        compress       -- if true, readvectors() also builds sparse arrays via makematrix()
        """
        self.vectorfilename=vectorfilename
        self.simcachefile=simcachefile
        self.simcache=simcache
        self.thisvector=""
        self.vectordict={} #dictionary of vectors
        self.allfeatures={} #dictionary of all feature dimensions
        self.updated=0
        self.fkeys=[] #list (to be sorted) of all features to
        self.fk_idx={} #feature --> dimension
        self.dim=0
        WordVector.windows=windows #NOTE: class-level, so shared by every WordVector
        self.k=k
        self.adja=adja
        self.adjb=adjb
        self.filter=False
        self.filterwords=[]
        self.compress=compress #whether to generate sparse vector representation for efficient sim calcs

    def readvectors(self):
        """Read every line of the vector file into self.vectordict.

        Does nothing when self.simcache is set, because only the cached
        similarities are needed in that mode.
        """
        if self.simcache:
            #don't bother reading in vectors - just need simcache
            return

        print("Reading vector file "+self.vectorfilename)
        linesread=0
        #the with-block guarantees the file is closed even if a line fails to parse
        with open(self.vectorfilename,'r') as instream:
            for line in instream:
                self.processvectorline(line.rstrip())
                linesread+=1
                if linesread%10000 == 0:
                    print("Read "+str(linesread)+" lines and updated "+str(self.updated)+" vectors")
                    sys.stdout.flush()

        print("Read "+str(linesread)+" lines and updated "+str(self.updated)+" vectors")
        if self.compress:
            print("Compressing vector dictionary representation")
            self.makematrix()
            print("Finished sparse array generation")

    def processvectorline(self,line):
        """Parse one vector-file line: 'word/POS<TAB>feature<TAB>score<TAB>...'.

        Lines whose first field does not match wordposPATT are reported and
        skipped.
        """
        featurelist=line.split('\t')
        matchobj = Thesaurus.wordposPATT.match(featurelist[0])
        if not matchobj:
            print("Error with vector file matching "+featurelist[0])
            #this could be "__FILTERED" so ignore line and carry on
            return
        wordpos=(matchobj.group(1),matchobj.group(2))

        self.vectordict[wordpos]=WordVector(wordpos) #initialise WordVector in vector dictionary

        featurelist.reverse() #reverse list so can pop features and scores off
        featurelist.pop() #take off last item which is word itself
        self.updatevector(wordpos,featurelist)
        self.updated+=1

    def updatevector(self,wordpos,featurelist):
        """Add (feature, score) pairs to the vector stored under wordpos.

        featurelist must be the reversed field list, i.e. popping yields a
        feature followed by its score.  The list is consumed.  An odd number
        of entries raises IndexError on the final pop.
        """
        vector=self.vectordict[wordpos]
        while featurelist:
            f=featurelist.pop()
            sc=featurelist.pop()
            if vector.addfeature(f,sc):
                self.allfeatures[f]=1
        vector.length=pow(vector.length2,0.5)

    def readsims(self):
        """Read the similarity cache file into self.vectordict."""
        print("Reading sim file "+self.simcachefile)
        linesread=0
        #the with-block guarantees the file is closed even if a line fails to parse
        with open(self.simcachefile,'r') as instream:
            for line in instream:
                self.processsimline(line.rstrip())
                linesread+=1
                if linesread%1000 == 0:
                    print("Read "+str(linesread)+" lines and updated "+str(self.updated)+" similarity vectors")
                    sys.stdout.flush()
        self.topk(self.k)
        print("Read "+str(linesread)+" lines and updated "+str(self.updated)+" vectors")

    def processsimline(self,line):
        """Parse one sim-cache line into a WordVector holding its neighbours.

        When self.filter is set, only words listed (as 'word/POS') in
        self.filterwords are kept.  Unless Thesaurus.byblo is set, the two
        fields after the word are read as the vector's width and length.
        """
        featurelist=line.split('\t')
        matchobj = Thesaurus.wordposPATT.match(featurelist[0])
        if not matchobj:
            print("Error with vector file matching "+featurelist[0])
            return
        wordpos=(matchobj.group(1),matchobj.group(2))

        (word,pos)=wordpos
        if self.filter and word+"/"+pos not in self.filterwords:
            return

        self.thisvector=WordVector(wordpos)

        featurelist.reverse() #reverse list so can pop features and scores off
        featurelist.pop() #take off last item which is word itself
        if not Thesaurus.byblo:
            #byblo files have no extra fields; otherwise width and length come first
            self.thisvector.width=float(featurelist.pop())
            self.thisvector.length=float(featurelist.pop())
        self.updatesimvector(wordpos,featurelist)
        self.thisvector.topk(self.k)
        self.vectordict[wordpos]=self.thisvector
        self.updated+=1

    def updatesimvector(self,wordpos,featurelist):
        """Store (neighbour, score) pairs in self.thisvector.allsims.

        featurelist must be the reversed field list and is consumed.
        wordpos is accepted for symmetry with updatevector but is not used.
        """
        while featurelist:
            f=featurelist.pop()
            sc=featurelist.pop()
            self.thisvector.allsims[f]=float(sc)

    def makematrix(self):
        """Assign a column index to every feature and build the sparse arrays."""
        #sorted() works on both Python 2 lists and Python 3 key views
        self.fkeys=sorted(self.allfeatures)
        for idx,feature in enumerate(self.fkeys):
            self.fk_idx[feature]=idx
        #release the large temporaries but keep the attributes defined,
        #so later attribute access does not raise AttributeError
        self.fkeys=[]
        self.allfeatures={}
        self.dim=len(self.fk_idx)
        print("Dimensionality is "+ str(self.dim))
        update_params(self.dim,self.adja,self.adjb)
        self.makearrays()

    def makearrays(self):
        """Give every word vector a 1 x dim scipy CSR matrix in its .array attribute."""
        #need to convert a word vector which stores a dictionary of features into a sparse array based on fk_idx
        for wordvector in self.vectordict.values():
            temparray = numpy.zeros(self.dim)
            for feature,score in wordvector.vector.items():
                temparray[self.fk_idx[feature]]=score
            wordvector.array = sparse.csr_matrix(temparray)

    def allpairssims(self,metric):
        """Populate neighbour similarities for every word.

        With self.simcache set the similarities are read from the cache file.
        Otherwise every ordered pair of distinct vectors is compared using
        metric and each word's top k neighbours are written to the cache file.
        """
        if self.simcache:
            #read in from sim cache
            self.readsims()
            return

        #compute all pairs sims and write sim cache
        done=0
        #the with-block closes (and therefore flushes) the cache file when done
        with open(self.simcachefile,'w') as outstream:
            for wordvectorA in self.vectordict.values():
                wordvectorA.allsims={}
                for wordvectorB in self.vectordict.values():
                    if wordvectorA.equals(wordvectorB):
                        continue #no self-similarity
                    label = wordvectorB.word+"/"+wordvectorB.pos

                    sim=wordvectorA.findsim(wordvectorB,metric)
                    if sim<0 or sim>1:
                        #out-of-range score: repeat the calculation with debugging switched on
                        wordvectorA.debug=True
                        wordvectorA.findsim(wordvectorB,metric)
                    wordvectorA.allsims[label]=sim
                wordvectorA.outputtopk(outstream,self.k)

                done+=1
                if done%100==0:
                    print("Completed similarity calculations for "+str(done)+" words")

    def outputsim(self,wordA,wordB,metric):
        """Print the similarity between two (word, POS) tuples and return it.

        In sim-cache mode the value is looked up among wordA's stored
        neighbours; otherwise it is computed from the two vectors with metric.
        Returns -1 when a word (or the neighbour entry) is missing.
        """
        sim=-1
        if self.simcache:
            (wa,pa)=wordA
            if wordA in self.vectordict:
                (wb,pb)=wordB
                label=wb+"/"+pb
                neighbours=self.vectordict[wordA].allsims
                if label in neighbours:
                    sim=neighbours[label]
                    print("Similarity between "+wa+"/"+pa+" and "+wb +"/"+pb+" is "+str(sim))
                else:
                    print(label + " not in neighbour set")
            else:
                print(wa+"/"+pa+" not in dictionary")
        elif wordA not in self.vectordict:
            (word,pos)=wordA
            print(word+"/"+pos +" not in dictionary")
        elif wordB not in self.vectordict:
            (word,pos)=wordB
            print(word+"/"+pos +" not in dictionary")
        else:
            vectorA=self.vectordict[wordA]
            vectorB=self.vectordict[wordB]
            sim=vectorA.findsim(vectorB,metric)
            print("Similarity between "+vectorA.word+"/"+vectorA.pos+" and "+vectorB.word +"/"+vectorB.pos+" is "+str(sim))
            print("("+str(vectorA.width) + ", "+str(vectorB.width)+")")
        return sim

    def topk(self,k):
        """Ask every vector to retain only its top k neighbours."""
        #retain top k neighbours for each word
        for thisvector in self.vectordict.values():
            thisvector.topk(k)

    def topsim(self,sim):
        """Ask every vector to retain only similarities over the threshold sim."""
        #retain similarities over sim threshold
        for thisvector in self.vectordict.values():
            thisvector.keeptopsim(sim)

    def displayneighs(self,word,k):
        """Display the top k neighbours of the (word, POS) tuple word."""
        if word in self.vectordict:
            vector=self.vectordict[word]
            vector.topk(k)
            vector.displaysims()
        else:
            (token,pos)=word
            print(token+"/"+pos + " not in dictionary")

    def analyse(self):
        """Print summary statistics of the per-word similarity figures.

        Reports mean and standard deviation of each vector's topsim, avgsim
        and sd attributes, then a degree-1 fit and Spearman correlation of
        width against topsim and against avgsim.
        """
        vectors=list(self.vectordict.values())
        count=len(vectors)
        if count==0:
            #nothing to average; avoids a ZeroDivisionError
            print("No vectors to analyse")
            return

        totaltop=0.0
        totalavg=0.0
        totalsd=0.0
        squaretop=0.0
        squareavg=0.0
        squaresd=0.0
        correlationx=[]
        correlationy1=[]
        correlationy2=[]

        for wordvectorA in vectors:
            totaltop+=wordvectorA.topsim
            squaretop+=wordvectorA.topsim*wordvectorA.topsim
            totalavg+=wordvectorA.avgsim
            squareavg+=wordvectorA.avgsim*wordvectorA.avgsim
            totalsd+=wordvectorA.sd
            squaresd+=wordvectorA.sd * wordvectorA.sd
            correlationx.append(float(wordvectorA.width))
            correlationy1.append(float(wordvectorA.topsim))
            correlationy2.append(float(wordvectorA.avgsim))

        (avgtop,sdtop)=self._mean_sd(totaltop,squaretop,count)
        (avgavg,sdavg)=self._mean_sd(totalavg,squareavg,count)
        (avgsd,sdsd)=self._mean_sd(totalsd,squaresd,count)

        print("Top similarity: average = "+str(avgtop)+" sd = "+str(sdtop))
        print("average similarity: average = "+str(avgavg)+" sd = "+str(sdavg))
        print("SD similarity: average = "+str(avgsd)+" sd = "+str(sdsd))

        self._report_width_correlation(correlationx,correlationy1,"top similarity")
        self._report_width_correlation(correlationx,correlationy2,"average similarity")

    def _mean_sd(self,total,squaretotal,count):
        """Return (mean, population standard deviation) from a sum and a sum of squares."""
        mean=total/count
        variance=squaretotal/count - mean*mean
        #rounding can push a zero variance slightly negative, which would make
        #pow() fail (Python 2) or return a complex number (Python 3)
        return (mean,pow(max(variance,0.0),0.5))

    def _report_width_correlation(self,widths,sims,description):
        """Print the Spearman correlation and degree-1 fit of sims against widths."""
        x=numpy.array(widths)
        y=numpy.array(sims)
        thispoly=numpy.poly1d(numpy.polyfit(x,y,1))
        pr=stats.spearmanr(x,y)
        print("SRCC for width and "+description+" is "+str(pr[0])+" ("+str(pr[1])+")")
        print(thispoly)

    def showpoly(self,x,y,poly,title,pr,xl,yl):
        """Plot the points (x, y) with the fitted polynomial and annotate with pr.

        xl and yl are the upper limits used for the fitted curve's x range and
        the y axis; pr[0] and pr[1] are shown as 'srcc' and 'p'.
        """
        xp=numpy.linspace(0,xl,100)
        plt.plot(x,y,'.',xp,poly(xp),'-')
        plt.ylim(0,yl)
        plt.title(title)
        mytext1="srcc = "+str(pr[0])
        mytext2="p = "+str(pr[1])
        plt.text(0.05,yl*0.9,mytext1)
        plt.text(0.05,yl*0.8,mytext2)
        plt.show()