def loadsims(self, simsfile, use_cache=False, make_cache=True):
        """Load neighbour similarities for the evaluation word pairs.

        When ``use_cache`` is true the work is delegated to
        ``self.loadcachedsims()``.  Otherwise every pair in
        ``self.pairmatrix`` is registered in ``self.entrydict`` and
        ``simsfile`` is scanned for matching similarities.

        Each line of ``simsfile`` is parsed as tab-separated fields: a
        (tagged) word followed by alternating neighbour / score fields.

        :param simsfile: path of the similarity file; also stored on
            ``self.simsfile``.
        :param use_cache: load via ``self.loadcachedsims()`` instead of
            reading ``simsfile``.
        :param make_cache: call ``self.makesimcache()`` after reading
            (only relevant when ``use_cache`` is false).
        """
        self.simsfile = simsfile
        if use_cache:
            self.loadcachedsims()
            return

        # For each word of each evaluation pair, put the other word in its
        # dictionaries so a similarity will be stored if found in simsfile.
        for [w1, w2, _r] in self.pairmatrix:
            self.entrydict[w1].addwordtodicts(w2)
            self.entrydict[w2].addwordtodicts(w1)

        progress = "Read %s lines and ignored %s lines and stored %s similarities"
        linesread = 0
        added = 0
        ignored = 0
        # The with-block guarantees the file is closed even if a malformed
        # line makes the parsing below raise.
        with open(simsfile, 'r') as simstream:
            print("Reading " + simsfile)
            for line in simstream:
                linesread += 1
                # str.rstrip() returns a new string; the result must be used,
                # otherwise the line terminator stays glued to the last field.
                fields = line.rstrip().split('\t')
                fields.reverse()  # so that pop() yields fields left to right
                (w1, _) = untag(fields.pop())
                if len(self.entrydict[w1].simdict) > 0:
                    # Only words that take part in the evaluation have a
                    # non-empty simdict; neighbours are ranked from 1 in
                    # file order.
                    rank = 1
                    while len(fields) > 0:
                        (w2, _) = untag(fields.pop())
                        score = float(fields.pop())
                        # The return value is accumulated as the number of
                        # stored similarities (will only add if the pair
                        # is initialised).
                        added += self.entrydict[w1].addscorestodicts(
                            w2, score, rank)
                        rank += 1
                else:
                    # don't care about sims for words not in evaluation
                    ignored += 1

                if self.verbose and linesread % 100 == 0:
                    print(progress % (linesread, ignored, added))

        print(progress % (linesread, ignored, added))
        if make_cache:
            self.makesimcache()
    def loadsims(self, simsfile, use_cache=False, make_cache=True):
        """Read similarity scores for the words in the evaluation pairs.

        With ``use_cache`` set, ``self.loadcachedsims()`` does the loading.
        Otherwise the pairs in ``self.pairmatrix`` are registered in
        ``self.entrydict``, ``simsfile`` is read line by line, and (when
        ``make_cache`` is set) ``self.makesimcache()`` is called afterwards.

        Lines of ``simsfile`` are split on tabs into a (tagged) head word
        followed by alternating neighbour and score fields.

        :param simsfile: path to the similarity file, remembered as
            ``self.simsfile``.
        :param use_cache: if true, skip ``simsfile`` and load the cache.
        :param make_cache: if true (and ``use_cache`` is false), write the
            cache once the file has been read.
        """
        self.simsfile = simsfile
        if use_cache:
            self.loadcachedsims()
            return

        # Each word needs its partner registered in its dictionaries so that
        # a similarity is stored when it is found in simsfile.
        for [w1, w2, _r] in self.pairmatrix:
            self.entrydict[w1].addwordtodicts(w2)
            self.entrydict[w2].addwordtodicts(w1)

        linesread = 0
        added = 0
        ignored = 0
        report = "Read %s lines and ignored %s lines and stored %s similarities"

        # Context manager: the stream is closed even when a bad line raises.
        with open(simsfile, 'r') as simstream:
            print("Reading " + simsfile)
            for line in simstream:
                linesread += 1
                # rstrip() does not modify the string in place; its result
                # has to be kept or the trailing newline pollutes the fields.
                line = line.rstrip()
                fields = line.split('\t')
                fields.reverse()  # pop() now consumes fields in file order
                (w1, _) = untag(fields.pop())
                if len(self.entrydict[w1].simdict) > 0:
                    rank = 1  # position of the neighbour within this line
                    while len(fields) > 0:
                        (w2, _) = untag(fields.pop())
                        score = float(fields.pop())
                        # will only add if pair is initialised
                        added += self.entrydict[w1].addscorestodicts(w2, score, rank)
                        rank += 1
                else:
                    # don't care about sims for words not in evaluation
                    ignored += 1

                if self.verbose and linesread % 100 == 0:
                    print(report % (linesread, ignored, added))

        print(report % (linesread, ignored, added))
        if make_cache:
            self.makesimcache()
Exemple #3
0
 def __init__(self, fields):
     """Build an entry from the fields of one input record.

     :param fields: sequence of 2 or 3 items: the tagged word (split into
         word and part of speech by ``untag``), its frequency and,
         optionally, its width.  The width defaults to 0 when absent.
         Any other length is reported as invalid.
     """
     if len(fields) == 3:
         (self.word, self.pos) = untag(fields[0])
         self.freq = fields[1]
         self.width = fields[2]
     elif len(fields) == 2:
         (self.word, self.pos) = untag(fields[0])
         self.freq = fields[1]
         self.width = 0
     else:
         # str() is needed here: concatenating a str with a list raises
         # TypeError, which used to turn this warning into a crash.
         print("Warning: invalid entry " + str(fields))
         # Placeholders so that later attribute access on an invalid
         # entry does not fail with AttributeError.
         self.word = None
         self.pos = None
         self.freq = None
         self.width = 0
     self.simdict = {}  # mapping from word to similarity score
     self.rankdict = {}  # mapping from word to rank in neighbour list
     self.paircount = 0  # number of evaluation pairs involved in
     self.featdict = {}  # dict of features and scores
     self.precisiondict = {}  # to store values of WeedsPrecision
     self.min_precisiondict = {}  # to store values of ClarkeDE
     self.invCLdict = {}  # to store values of invCL (lenci)
    def loadvectors(self, vectorfile, use_cache=False, make_cache=True):
        """Load feature vectors for the words that occur in evaluation pairs.

        Each line of the vector file is parsed as tab-separated fields: a
        (tagged) word followed by alternating feature / score fields.  Only
        words whose ``paircount`` is positive are stored.

        :param vectorfile: path of the vector file.  With ``use_cache`` the
            file ``vectorfile + ".cached"`` is read instead; the path
            actually read is kept in ``self.vectorfile``.
        :param use_cache: read the ``.cached`` file; this also disables
            ``make_cache``.
        :param make_cache: while reading, copy every stored line to
            ``self.vectorfile + ".cached"``.
        """
        self.vectorfile = vectorfile
        if use_cache:
            # The cache has the same format as the original file, so it is
            # enough to redirect the read; never rewrite a cache from itself.
            self.vectorfile = self.vectorfile + ".cached"
            make_cache = False

        # Count, for every word, the evaluation pairs it takes part in.
        for [w1, w2, _r] in self.pairmatrix:
            self.entrydict[w1].paircount += 1
            self.entrydict[w2].paircount += 1

        linesread = 0
        lineswritten = 0
        outstream = None
        instream = open(self.vectorfile, 'r')
        # try/finally so that both streams are closed even if a malformed
        # line makes the parsing raise.
        try:
            print("Reading " + self.vectorfile)
            if make_cache:
                outstream = open(self.vectorfile + ".cached", 'w')
                print("Writing " + self.vectorfile + ".cached")
            for line in instream:
                linesread += 1
                line = line.rstrip()
                fields = line.split('\t')
                fields.reverse()  # so that pop() yields fields left to right
                (w1, _) = untag(fields.pop())
                if w1 != "" and self.entrydict[w1].paircount > 0:
                    # store this vector
                    if make_cache:
                        # The line terminator was stripped above and write()
                        # does not add one; restore it so the cache keeps
                        # one vector per line and can be read back.
                        outstream.write(line + "\n")
                        lineswritten += 1
                    while len(fields) > 0:
                        w2 = fields.pop()
                        sc = float(fields.pop())
                        self.entrydict[w1].addfeature(w2, sc)
                if self.verbose and linesread % 1000 == 0:
                    print("Read " + str(linesread) + " lines and written " +
                          str(lineswritten) + " lines")
        finally:
            if outstream is not None:
                outstream.close()
            instream.close()

        if make_cache:
            print("Written " + str(lineswritten) + " lines")
        print("Read " + str(linesread) + " lines")
    def loadvectors(self, vectorfile, use_cache=False, make_cache=True):
        """Read the feature vectors of all words used in evaluation pairs.

        The vector file holds one word per line: the (tagged) word, then
        alternating feature and score fields, all separated by tabs.  A
        vector is kept only if the word's ``paircount`` is positive.

        :param vectorfile: path to the vector file; when ``use_cache`` is
            set, ``".cached"`` is appended and that file is read instead.
            ``self.vectorfile`` records the path that was read.
        :param use_cache: read from the ``.cached`` file (forces
            ``make_cache`` off).
        :param make_cache: write each kept line to
            ``self.vectorfile + ".cached"`` while reading.
        """
        self.vectorfile = vectorfile
        if use_cache:
            # Reading the cache is just reading a smaller file of the same
            # format; it must not be regenerated from itself.
            self.vectorfile = self.vectorfile + ".cached"
            make_cache = False

        # Mark the words we care about by counting their evaluation pairs.
        for [w1, w2, _r] in self.pairmatrix:
            self.entrydict[w1].paircount += 1
            self.entrydict[w2].paircount += 1

        cachefile = self.vectorfile + ".cached"
        linesread = 0
        lineswritten = 0
        outstream = None
        instream = open(self.vectorfile, 'r')
        # Guarantee that both files are closed if parsing fails part-way.
        try:
            print("Reading " + self.vectorfile)
            if make_cache:
                outstream = open(cachefile, 'w')
                print("Writing " + cachefile)
            for line in instream:
                linesread += 1
                line = line.rstrip()
                fields = line.split('\t')
                fields.reverse()  # pop() now consumes fields in file order
                (w1, _) = untag(fields.pop())
                if w1 != "" and self.entrydict[w1].paircount > 0:
                    # store this vector
                    if outstream is not None:
                        # write() adds no newline and the original one was
                        # stripped; without it every cached vector would be
                        # concatenated onto a single unreadable line.
                        outstream.write(line + "\n")
                        lineswritten += 1
                    while len(fields) > 0:
                        w2 = fields.pop()
                        sc = float(fields.pop())
                        self.entrydict[w1].addfeature(w2, sc)
                if self.verbose and linesread % 1000 == 0:
                    print("Read " + str(linesread) + " lines and written " + str(lineswritten) + " lines")
        finally:
            if outstream is not None:
                outstream.close()
            instream.close()

        if make_cache:
            print("Written " + str(lineswritten) + " lines")
        print("Read " + str(linesread) + " lines")