def loadsims(self, simsfile, use_cache=False, make_cache=True):
    """Load neighbour similarities for the words in the evaluation pairs.

    simsfile   -- path of a tab-separated file; each line holds a tagged
                  word followed by alternating (tagged neighbour, score)
                  fields.
    use_cache  -- if true, skip simsfile and call self.loadcachedsims().
    make_cache -- if true (and the cache was not used), call
                  self.makesimcache() once the file has been read.

    Sets self.simsfile. On the non-cache path every pair in
    self.pairmatrix is first registered in both directions so that a
    similarity will be stored if it is found in simsfile.
    """
    self.simsfile = simsfile
    if use_cache:
        self.loadcachedsims()
        return

    # For each word (in each word pair) put the other word in its
    # dictionary so a similarity will be stored if found in simsfile.
    for [w1, w2, _r] in self.pairmatrix:
        self.entrydict[w1].addwordtodicts(w2)
        self.entrydict[w2].addwordtodicts(w1)

    linesread = 0
    added = 0
    ignored = 0
    # 'with' guarantees the stream is closed even if a line fails to parse.
    with open(simsfile, 'r') as simstream:
        print("Reading " + simsfile)
        for line in simstream:
            linesread += 1
            # rstrip() returns a new string; the result must be kept,
            # otherwise the newline (or a trailing tab) stays in the last
            # field and can leave an unpaired field for the loop below.
            line = line.rstrip()
            fields = line.split('\t')
            fields.reverse()  # reversed so that pop() yields fields in file order
            (w1, _) = untag(fields.pop())
            if len(self.entrydict[w1].simdict) > 0:
                # Neighbours are ranked by their position on the line.
                rank = 1
                while len(fields) > 0:
                    (w2, _) = untag(fields.pop())
                    score = float(fields.pop())
                    # Per the original note: only adds if the pair is initialised.
                    added += self.entrydict[w1].addscorestodicts(w2, score, rank)
                    rank += 1
            else:
                # Don't care about sims for words not in the evaluation.
                ignored += 1
            if self.verbose and linesread % 100 == 0:
                print("Read " + str(linesread) + " lines and ignored " +
                      str(ignored) + " lines and stored " + str(added) +
                      " similarities")
        print("Read " + str(linesread) + " lines and ignored " +
              str(ignored) + " lines and stored " + str(added) +
              " similarities")

    if make_cache:
        self.makesimcache()
def loadsims(self, simsfile, use_cache=False, make_cache=True):
    """Read similarity scores for the evaluation word pairs.

    If use_cache is true the similarities come from self.loadcachedsims()
    and simsfile is not read. Otherwise the pairs in self.pairmatrix are
    registered (in both directions) on their entries, simsfile is read,
    and, when make_cache is true, self.makesimcache() is called.

    simsfile is tab-separated: a tagged word followed by alternating
    (tagged neighbour, score) fields. Always sets self.simsfile.
    """
    self.simsfile = simsfile
    if use_cache:
        self.loadcachedsims()
    else:
        # Each word of a pair gets the other word put in its dictionaries
        # so that a similarity will be stored if found in simsfile.
        for [w1, w2, _r] in self.pairmatrix:
            self.entrydict[w1].addwordtodicts(w2)
            self.entrydict[w2].addwordtodicts(w1)

        linesread = 0
        added = 0
        ignored = 0
        # Context manager closes the file even when parsing raises.
        with open(simsfile, 'r') as simstream:
            print("Reading " + simsfile)
            for line in simstream:
                linesread += 1
                # Keep the stripped string: a bare line.rstrip() is a no-op
                # and leaves the newline / trailing tab in the last field.
                line = line.rstrip()
                fields = line.split('\t')
                fields.reverse()  # pop() from the end now walks the line left to right
                (w1, _) = untag(fields.pop())
                if len(self.entrydict[w1].simdict) > 0:
                    rank = 1  # rank is the neighbour's position on the line
                    while len(fields) > 0:
                        (w2, _) = untag(fields.pop())
                        score = float(fields.pop())
                        # Per the original note: will only add if pair is initialised.
                        added += self.entrydict[w1].addscorestodicts(
                            w2, score, rank)
                        rank += 1
                else:
                    # Words not in the evaluation are skipped.
                    ignored += 1
                if self.verbose and linesread % 100 == 0:
                    print("Read " + str(linesread) + " lines and ignored " +
                          str(ignored) + " lines and stored " + str(added) +
                          " similarities")
            print("Read " + str(linesread) + " lines and ignored " +
                  str(ignored) + " lines and stored " + str(added) +
                  " similarities")

        if make_cache:
            self.makesimcache()
def __init__(self, fields):
    """Initialise an entry from a sequence of fields.

    fields -- either [tagged_word, freq, width] or [tagged_word, freq];
              the tagged word is split into (word, pos) with untag().
              With two fields, width defaults to 0. freq and width are
              stored exactly as given (no type conversion).

    Any other length prints a warning; in that case word, pos, freq and
    width are left unset, but the dictionaries below are still created.
    """
    if len(fields) == 3:
        (self.word, self.pos) = untag(fields[0])
        self.freq = fields[1]
        self.width = fields[2]
    elif len(fields) == 2:
        (self.word, self.pos) = untag(fields[0])
        self.freq = fields[1]
        self.width = 0
    else:
        # str() is needed: concatenating a str with a list raises
        # TypeError, which turned the intended warning into a crash.
        print("Warning: invalid entry " + str(fields))

    self.simdict = {}  # mapping from word to similarity score
    self.rankdict = {}  # mapping from word to rank in neighbour list
    self.paircount = 0  # number of evaluation pairs involved in
    self.featdict = {}  # dict of features and scores
    self.precisiondict = {}  # to store values of WeedsPrecision
    self.min_precisiondict = {}  # to store values of ClarkeDE
    self.invCLdict = {}  # to store values of invCL (lenci)
def loadvectors(self, vectorfile, use_cache=False, make_cache=True):
    """Load feature vectors for the words that occur in evaluation pairs.

    vectorfile -- path of a tab-separated file; each line holds a tagged
                  word followed by alternating (feature, score) fields.
    use_cache  -- if true, read vectorfile + ".cached" instead and do not
                  write a cache (self.vectorfile is set to the cached path).
    make_cache -- if true, every line that is stored is also copied to
                  self.vectorfile + ".cached".

    Increments paircount on both entries of every pair in self.pairmatrix,
    then stores features only for words whose paircount is positive.
    """
    self.vectorfile = vectorfile
    if use_cache:
        self.vectorfile = self.vectorfile + ".cached"
        make_cache = False

    for [w1, w2, _r] in self.pairmatrix:
        self.entrydict[w1].paircount += 1
        self.entrydict[w2].paircount += 1

    linesread = 0
    lineswritten = 0
    outstream = None
    # 'with' / 'finally' close both streams even if a line fails to parse.
    with open(self.vectorfile, 'r') as instream:
        print("Reading " + self.vectorfile)
        try:
            if make_cache:
                outstream = open(self.vectorfile + ".cached", 'w')
                print("Writing " + self.vectorfile + ".cached")
            for line in instream:
                linesread += 1
                line = line.rstrip()
                fields = line.split('\t')
                fields.reverse()  # reversed so that pop() yields fields in file order
                (w1, _) = untag(fields.pop())
                if w1 != "" and self.entrydict[w1].paircount > 0:
                    # Store this vector.
                    if make_cache:
                        # The line was rstripped above, so the newline must
                        # be restored; without it the cache collapses into
                        # a single line and cannot be read back.
                        outstream.write(line + "\n")
                        lineswritten += 1
                    while len(fields) > 0:
                        w2 = fields.pop()
                        sc = float(fields.pop())
                        self.entrydict[w1].addfeature(w2, sc)
                if self.verbose and linesread % 1000 == 0:
                    print("Read " + str(linesread) + " lines and written " +
                          str(lineswritten) + " lines")
        finally:
            if outstream is not None:
                outstream.close()
        if make_cache:
            print("Written " + str(lineswritten) + " lines")
    print("Read " + str(linesread) + " lines")
def loadvectors(self, vectorfile, use_cache=False, make_cache=True):
    """Read feature vectors for words involved in at least one evaluation pair.

    The input is tab-separated: a tagged word followed by alternating
    (feature, score) fields. With use_cache the file read is
    vectorfile + ".cached" (self.vectorfile is updated to that path) and
    no cache is written. With make_cache each stored line is also written
    to self.vectorfile + ".cached".

    Side effects: bumps paircount for both words of every pair in
    self.pairmatrix and calls addfeature() on the matching entries.
    """
    self.vectorfile = vectorfile
    if use_cache:
        self.vectorfile = self.vectorfile + ".cached"
        make_cache = False

    for [w1, w2, _r] in self.pairmatrix:
        self.entrydict[w1].paircount += 1
        self.entrydict[w2].paircount += 1

    linesread = 0
    lineswritten = 0
    outstream = None
    # Both streams are closed on every exit path, including parse errors.
    with open(self.vectorfile, 'r') as instream:
        print("Reading " + self.vectorfile)
        try:
            if make_cache:
                outstream = open(self.vectorfile + ".cached", 'w')
                print("Writing " + self.vectorfile + ".cached")
            for line in instream:
                linesread += 1
                line = line.rstrip()
                fields = line.split('\t')
                fields.reverse()  # pop() from the end now walks the line left to right
                (w1, _) = untag(fields.pop())
                if w1 != "" and self.entrydict[w1].paircount > 0:
                    # Store this vector.
                    if make_cache:
                        # Re-append the newline removed by rstrip(); writing
                        # the bare line would run all cached records together.
                        outstream.write(line + "\n")
                        lineswritten += 1
                    while len(fields) > 0:
                        w2 = fields.pop()
                        sc = float(fields.pop())
                        self.entrydict[w1].addfeature(w2, sc)
                if self.verbose and linesread % 1000 == 0:
                    print("Read " + str(linesread) + " lines and written " +
                          str(lineswritten) + " lines")
        finally:
            if outstream is not None:
                outstream.close()
        if make_cache:
            print("Written " + str(lineswritten) + " lines")
    print("Read " + str(linesread) + " lines")