def adjustsims(self, myBless, meanpoly, sdpoly):
    """Rescale every neighbour similarity to a cumulative probability.

    For each ``(sim, neigh)`` pair in ``self.tuplelist`` the joint width of
    this word and the neighbour is computed with ``widthfunction``; the
    polynomials are evaluated at that width to obtain a mean and a standard
    deviation, and ``normal(mean, sd).cdf(sim)`` is stored in
    ``self.allsims[neigh]``.  Afterwards ``self.tuplelist`` is emptied and
    rebuilt through ``self.topk(self.getk())``.

    myBless  -- object whose ``countdict`` maps a word to a 2-tuple; only the
                second element (used here as the width) is read, and words
                that are missing default to ``(0, 0)``.
    meanpoly -- callable mapping a joint width to the expected similarity.
    sdpoly   -- callable mapping a joint width to the standard deviation.

    NOTE(review): ``normal`` is presumably a frozen-distribution factory such
    as scipy.stats.norm -- confirm against the file's imports.
    """
    counts = myBless.countdict
    (_unused, own_width) = counts.get(self.word, (0, 0))
    for (score, neighbour) in self.tuplelist:
        # The neighbour key carries a '/'-separated tag; countdict is keyed
        # by whatever untag() returns for it.
        (_unused, neighbour_width) = counts.get(untag(neighbour, '/'), (0, 0))
        combined_width = widthfunction(own_width, neighbour_width)
        expected = meanpoly(combined_width)
        spread = sdpoly(combined_width)
        self.allsims[neighbour] = normal(expected, spread).cdf(score)
    # Drop the stale ranking, then rebuild it from the adjusted similarities.
    self.tuplelist = []
    self.topk(self.getk())
def get_topsim(self, concept, wordlist, maxrank=1000):
    """Return ``(rank, sim)`` of the best neighbour of ``concept`` in ``wordlist``.

    ``self.vectordict[concept].tuplelist`` is walked in order (it is expected
    to be sorted best-first) and the first neighbour whose untagged form is
    in ``wordlist`` wins.  Ranks are 1-based.

    concept  -- key into ``self.vectordict``.
    wordlist -- container of untagged words to search for.
    maxrank  -- rank reported when nothing is found (default 1000, the value
                that used to be hard-coded).

    Returns ``(maxrank, 0)`` when ``concept`` has no vector (a warning is
    printed in that case) or when none of its neighbours is in ``wordlist``.
    A match found beyond position ``maxrank`` is still returned with its
    true rank.
    """
    vector = self.vectordict.get(concept)
    if vector is None:
        # Two spaces on purpose: this reproduces the output of the former
        # Python 2 statement ``print "Warning: No vector for: ", concept``.
        print("Warning: No vector for:  %s" % (concept,))
        return (maxrank, 0)
    for rank, (sim, word) in enumerate(vector.tuplelist, 1):
        if untag(word, '/') in wordlist:
            # First hit in a best-first list is the nearest neighbour.
            return (rank, sim)
    return (maxrank, 0)
def readsomesims(self, entrylist):
    """Load similarity lines for the words in ``entrylist`` from the sim file.

    Reads ``self.simcachefile`` line by line.  A line is handed to
    ``self.processsimline`` (with trailing whitespace stripped) when the
    untagged form of its first tab-separated field is in ``entrylist``.
    Reading stops early as soon as ``self.updated`` equals
    ``len(entrylist)``, i.e. every requested entry has been found.

    entrylist -- container of untagged words whose vectors are wanted.

    Prints progress messages to stdout; returns None.  The file is opened in
    a ``with`` block so it is closed even if ``processsimline`` raises.
    """
    print("Reading sim file " + self.simcachefile)
    linesread = 0
    with open(self.simcachefile, 'r') as instream:
        for line in instream:
            if untag(line.split('\t')[0], '/') in entrylist:
                self.processsimline(line.rstrip())
            linesread += 1
            if self.updated == len(entrylist):
                # All requested entries found; no need to scan the rest.
                break
            if linesread % 1000 == 0:
                sys.stdout.flush()
    print("Read " + str(linesread) + " lines and updated " + str(self.updated) + " vectors")
def correlate(self, myBless, displaylist=(0, 2, 3)):
    """Fit and display pairwise relationships between per-concept statistics.

    For every concept in ``myBless.entrydict`` four values are collected:

        0  width           (``widthfunction`` of the count-dict widths)
        1  log frequency   (``math.log`` of the count-dict frequency)
        2  similarity      (top similarity if ``self._do_top`` else average)
        3  similarity sd

    Then, for each unordered pair of column indices in ``displaylist`` (in
    the order they appear), the pair of labels is printed and ``showpoly``
    is called on the two columns; element 0 of its result is collected.

    myBless     -- object providing ``entrydict`` (concepts) and
                   ``countdict`` (word -> ``(freq, width)``).
    displaylist -- column indices (0-3) to compare pairwise.

    Returns the list of collected ``showpoly`` results, one per pair.

    Raises ValueError (from ``math.log``) if a concept has frequency 0 or
    is missing from ``countdict``, and KeyError if ``(concept, 'N')`` is not
    in ``self.vectordict``.
    """
    from itertools import combinations

    labels = ['Log Width', 'Log Frequency', 'Average Similarity', 'Sd similarity']
    widths, freqs, sims, sds = [], [], [], []
    columns = [widths, freqs, sims, sds]  # indexable by displaylist entries

    for concept in myBless.entrydict:
        vector = self.vectordict[(concept, 'N')]
        vector.analyse()
        (freq, width) = myBless.countdict.get(concept, (0, 0))
        freq = math.log(float(freq))
        if self._do_top:
            sim = vector.topsim
            nn = untag(vector.nearestneighbour, '/')
            # Only the neighbour's width is needed.  The log of its
            # frequency used to be computed here but was never used and
            # raised ValueError for neighbours absent from countdict.
            (_, w2) = myBless.countdict.get(nn, (0, 0))
            width = widthfunction(width, w2)
            labels[2] = 'Top Similarity'
        else:
            sim = float(vector.avgsim)
            width = widthfunction(width, width)
        sd = float(vector.sd)
        widths.append(width)
        freqs.append(freq)
        sims.append(sim)
        sds.append(sd)

    polys = []
    for first, second in combinations(displaylist, 2):
        print(labels[first] + " " + labels[second])
        xs = np.array(columns[first])
        ys = np.array(columns[second])
        # Element 0 of showpoly's result is the one kept.
        polys.append(showpoly(xs, ys, labels[first], labels[second])[0])
    return polys
def allsims(self, entrylist):
    """Load all similarity lines for the words in ``entrylist``.

    When ``self.blesscache`` is set, the reduced cache file
    (``self.simcachefile`` + ".blesscache") is read instead of the full
    similarity file, and ``self.simcachefile`` is updated to that name.
    Otherwise the full file is read and ``self.writecache()`` is called
    afterwards.

    A line is passed to ``self.processsimline`` (trailing whitespace
    stripped) when its first tab-separated field passes
    ``self.poscheck(word, '/')`` and its untagged form is in ``entrylist``.

    entrylist -- container of untagged words whose vectors are wanted.

    Prints progress every 1000 lines, a final summary and the keys of
    ``self.vectordict``; returns None.  The file is opened in a ``with``
    block so it is closed even if processing a line raises.
    """
    cache_suffix = ".blesscache"
    # Append the suffix only once: previously every call appended it again,
    # so a second call looked for "<file>.blesscache.blesscache".
    if self.blesscache and not self.simcachefile.endswith(cache_suffix):
        self.simcachefile = self.simcachefile + cache_suffix
    print("Reading sim file " + self.simcachefile)
    linesread = 0
    with open(self.simcachefile, 'r') as instream:
        for line in instream:
            word = line.split('\t')[0]
            # Keep only words with an accepted tag that are also requested.
            if self.poscheck(word, '/') and untag(word, '/') in entrylist:
                self.processsimline(line.rstrip())
            linesread += 1
            if linesread % 1000 == 0:
                print("Read " + str(linesread) + " lines and updated " + str(self.updated) + " similarity vectors")
                sys.stdout.flush()
    print("Read " + str(linesread) + " lines and updated " + str(self.updated) + " vectors")
    print(self.vectordict.keys())
    if not self.blesscache:
        # Full file was read: write the reduced cache for next time.
        self.writecache()