Example #1
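Unit test for WordVector.analogy: asks for the two words (num=2) that best complete the analogy 'the' is to 'fox' as 'quick' is to ?, using the Euclidean metric, and expects 'jumped' followed by 'over'.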
 def test_analogy(self):
     import numpy as np
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.6, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.7, 0.1, 1.1, 1.1, 1.1, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     d = word_embedding.analogy('the',
                                'fox',
                                'quick',
                                num=2,
                                metric='euclidean')
     self.assertEqual(2, len(d), 'wrong number of analogies returned')
     self.assertEqual('jumped', d[0], 'wrong most likely analogy returned')
     self.assertEqual('over', d[1],
                      'wrong 2nd most likely analogy returned')
Example #2
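Unit test for WordVector.get_vector_by_num: each call should return the embedding-matrix row at the given index.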
 def test_get_vector_by_num(self):
     import numpy as np
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.1, 0.1, 0.9, 0.9, 0.9, 0.1],
                              [1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     self.assertTrue(
         np.sum(
             np.abs(
                 np.array([1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1]) -
                 word_embedding.get_vector_by_num(3))) < 0.1,
         'incorrect vector returned')
     self.assertTrue(
         np.sum(
             np.abs(
                 np.array([1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]) -
                 word_embedding.get_vector_by_num(5))) < 0.1,
         'incorrect vector returned')
     self.assertTrue(
         np.sum(
             np.abs(
                 np.array([1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]) -
                 word_embedding.get_vector_by_num(0))) < 0.1,
         'incorrect vector returned')
Example #3
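Unit test for WordVector.closest_row_indices: checks the indices of the embedding rows nearest a query vector under both the Euclidean and cosine metrics.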
 def test_closest_row_indices(self):
     import numpy as np
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 1.01], [2.0, 2.0], [2.0, 2.1],
                              [1.0, 0.0], [0, 1.01], [-1.0, 0.0]])
     word_embedding = WordVector(embed_matrix, dictionary)
     dist_list = word_embedding.closest_row_indices(np.array([[2.0, 2.0]]),
                                                    3, 'euclidean')
     self.assertTrue(
         np.sum(np.abs(np.array([1, 2, 0]) - dist_list)) < 0.1,
         'incorrect closest indices')
     dist_list = word_embedding.closest_row_indices(np.array([[2.0, 2.0]]),
                                                    3, 'cosine')
     self.assertTrue(
         np.sum(np.abs(np.array([1, 0, 2]) - dist_list)) < 0.1,
         'incorrect closest indices')
     dist_list = word_embedding.closest_row_indices(np.array([[1.0, 1.0]]),
                                                    6, 'euclidean')
     self.assertTrue(
         np.sum(np.abs(np.array([0, 3, 4, 1, 2, 5]) - dist_list)) < 0.1,
         'incorrect closest indices')
Example #4
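Unit test for the WordVector accessors: get_dict, get_reverse_dict, and get_embed should return copies, so mutating a returned object must not change the instance's internal state.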
 def test_gets(self):
     import numpy as np
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 1.01], [2.0, 2.0], [2.0, 2.1],
                              [1.0, 0.0], [0, 1.01], [-1.0, 0.0]])
     word_embedding = WordVector(embed_matrix, dictionary)
     d = word_embedding.get_dict()
     dr = word_embedding.get_reverse_dict()
     em = word_embedding.get_embed()
     d.pop('the')  # mutate, check that copies were returned
     dr.pop(1)
     em[0, 0] = 10
     d = word_embedding.get_dict()
     dr = word_embedding.get_reverse_dict()
     em = word_embedding.get_embed()
     self.assertEqual(6, len(d), 'wrong dictionary length')
     self.assertEqual(6, len(dr), 'wrong dictionary length')
     self.assertEqual(1.0, em[0, 0], 'wrong value in embed matrix')
     self.assertEqual(3, d['fox'], 'wrong value from dictionary')
     self.assertEqual('jumped', dr[4],
                      'wrong value from reverse dictionary')
Example #5
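Unit test for WordVector.num_words: the reported word count should match the six-entry dictionary.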
 def test_num_words(self):
     import numpy as np
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.1, 0.1, 0.9, 0.9, 0.9, 0.1],
                              [1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     self.assertEqual(6, word_embedding.num_words(),
                      'incorrect number of words')
Example #6
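Unit test for WordVector.n_closest: requests the n words nearest 'quick' under the Euclidean and cosine metrics.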
 def test_n_closest(self):
     import numpy as np
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 1.01], [2.0, 2.0], [2.0, 2.1],
                              [1.0, 0.0], [0, 1.01], [-1.0, 0.0]])
     word_embedding = WordVector(embed_matrix, dictionary)
     nc_list = word_embedding.n_closest('quick', 3, metric='euclidean')
     self.assertEqual(['quick', 'brown', 'the'], nc_list,
                      'wrong n-closest words returned')
     nc_list = word_embedding.n_closest('quick', 2, metric='cosine')
     self.assertEqual(['the', 'fox'], nc_list,
                      'wrong n-closest words returned')
Example #7
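Unit test for WordVector.most_common: expects the words at the lowest dictionary indices, most common first.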
 def test_most_common(self):
     import numpy as np
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.6, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.7, 0.1, 1.1, 1.1, 1.1, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     mc_list = word_embedding.most_common(3)
     self.assertEqual(['the', 'quick', 'brown'], mc_list,
                      'wrong most common words returned')
     mc_list = word_embedding.most_common(1)
     self.assertEqual(['the'], mc_list, 'wrong most common words returned')
Example #8
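Unit test for WordVector.project_2d: projecting rows 0 through 6 should yield a (6, 2) array together with the corresponding words.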
 def test_project_2D_2(self):
     import numpy as np
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.1, 0.1, 0.9, 0.9, 0.9, 0.1],
                              [1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     proj, words = word_embedding.project_2d(0, 6)
     self.assertEqual((6, 2), proj.shape,
                      'incorrect projection array size returned')
     self.assertEqual('the', words[0], 'incorrect word at index 0')
     self.assertEqual('fox', words[3], 'incorrect word at index 3')
Example #9
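End-to-end driver: loads three Sherlock Holmes texts, builds a training set, trains a WindowModel, and wraps the learned embedding and NCE weight matrices in WordVector objects.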
import numpy as np
import sklearn.utils

import docload  # project-local module providing build_word_array
from windowmodel import WindowModel  # project-local module (module path assumed)
from wordvector import WordVector


def load():
    files = ['../data/adventures_of_sherlock_holmes.txt',
             '../data/hound_of_the_baskervilles.txt',
             '../data/sign_of_the_four.txt']
    word_array, dictionary, num_lines, num_words = docload.build_word_array(
        files, vocab_size=50000, gutenberg=True)

    print('Document loaded and processed: {} lines, {} words.'
          .format(num_lines, num_words))

    print('Building training set ...')
    x, y = WindowModel.build_training_set(word_array)

    # shuffle and split off 10% validation data
    x_shuf, y_shuf = sklearn.utils.shuffle(x, y, random_state=0)
    split = round(x_shuf.shape[0] * 0.9)
    x_val, y_val = (x_shuf[split:, :], y_shuf[split:, :])
    x_train, y_train = (x_shuf[:split, :], y_shuf[:split, :])  # slice the shuffled arrays, or train and val overlap

    print('Training set built.')
    graph_params = {'batch_size': 32,
                    'vocab_size': np.max(x) + 1,
                    'embed_size': 64,
                    'hid_size': 64,
                    'neg_samples': 64,
                    'learn_rate': 0.01,
                    'momentum': 0.9,
                    'embed_noise': 0.1,
                    'hid_noise': 0.3,
                    'optimizer': 'Momentum'}
    model = WindowModel(graph_params)
    print('Model built. Vocab size = {}. Document length = {} words.'
          .format(np.max(x) + 1, len(word_array)))

    print('Training ...')
    results = model.train(x_train, y_train, x_val, y_val, epochs=120, verbose=False)

    word_vector_embed = WordVector(results['embed_weights'], dictionary)
    word_vector_nce = WordVector(results['nce_weights'], dictionary)
Example #10
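Unit test for WordVector.words_in_range: returns the words whose dictionary indices fall in the half-open range [start, stop).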
 def test_words_in_range(self):
     import numpy as np
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.6, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.7, 0.1, 1.1, 1.1, 1.1, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     range_list = word_embedding.words_in_range(3, 6)
     self.assertEqual(['fox', 'jumped', 'over'], range_list,
                      'wrong words in range returned')
     range_list = word_embedding.words_in_range(0, 2)
     self.assertEqual(['the', 'quick'], range_list,
                      'wrong words in range returned')
Example #11
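A distributional-thesaurus class: it reads tab-separated word vectors (or a cache of precomputed similarities), computes all-pairs similarities, retains the top-k neighbours per word, and analyses how vector width correlates with top and average similarity.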
import re
import sys

import numpy
import matplotlib.pyplot as plt
from scipy import sparse, stats

from wordvector import WordVector  # project-local module (name assumed from the examples above)
# update_params() is also project-local; its defining module is not shown here.


class Thesaurus:

    wordposPATT = re.compile('(.*)/(.)')  # only first char of POS
    byblo = False  # byblo neighbours file or appthes generated from vector file

    def __init__(self, vectorfilename, simcachefile, simcache, windows, k, adja, adjb, compress):
        self.vectorfilename = vectorfilename
        self.simcachefile = simcachefile
        self.simcache = simcache
        self.thisvector = None  # WordVector currently being built
        self.vectordict = {}  # dictionary of vectors
        self.allfeatures = {}  # dictionary of all feature dimensions
        self.updated = 0
        self.fkeys = []  # list (to be sorted) of all features
        self.fk_idx = {}  # feature --> dimension
        self.dim = 0
        WordVector.windows = windows
        self.k = k
        self.adja = adja
        self.adjb = adjb
        self.filter = False
        self.filterwords = []
        self.compress = compress  # whether to generate sparse vector representation for efficient sim calcs

    def readvectors(self):
        if self.simcache:
            # don't bother reading in vectors - just the simcache is needed
            return
        print("Reading vector file " + self.vectorfilename)
        linesread = 0
        with open(self.vectorfilename, 'r') as instream:
            for line in instream:
                self.processvectorline(line.rstrip())
                linesread += 1
                if linesread % 10000 == 0:
                    print("Read " + str(linesread) + " lines and updated " + str(self.updated) + " vectors")
                    sys.stdout.flush()
        print("Read " + str(linesread) + " lines and updated " + str(self.updated) + " vectors")
        if self.compress:
            print("Compressing vector dictionary representation")
            self.makematrix()
            print("Finished sparse array generation")

    def processvectorline(self, line):
        featurelist = line.split('\t')
        matchobj = Thesaurus.wordposPATT.match(featurelist[0])
        if matchobj:
            wordpos = (matchobj.group(1), matchobj.group(2))
        else:
            # the entry could be "__FILTERED", so report, ignore the line and carry on
            print("Error with vector file matching " + featurelist[0])
            return

        # initialise WordVector in vector dictionary
        self.vectordict[wordpos] = WordVector(wordpos)

        featurelist.reverse()  # reverse list so features and scores can be popped off
        featurelist.pop()  # take off last item, which is the word itself
        self.updatevector(wordpos, featurelist)
        self.updated += 1

    def updatevector(self, wordpos, featurelist):
        # features and scores alternate on the (reversed) list
        while featurelist:
            f = featurelist.pop()
            sc = featurelist.pop()
            added = self.vectordict[wordpos].addfeature(f, sc)
            if added:
                self.allfeatures[f] = 1
        self.vectordict[wordpos].length = pow(self.vectordict[wordpos].length2, 0.5)

    def readsims(self):
        print("Reading sim file " + self.simcachefile)
        linesread = 0
        with open(self.simcachefile, 'r') as instream:
            for line in instream:
                self.processsimline(line.rstrip())
                linesread += 1
                if linesread % 1000 == 0:
                    print("Read " + str(linesread) + " lines and updated " + str(self.updated) + " similarity vectors")
                    sys.stdout.flush()
        self.topk(self.k)
        print("Read " + str(linesread) + " lines and updated " + str(self.updated) + " vectors")


    def processsimline(self, line):
        featurelist = line.split('\t')
        matchobj = Thesaurus.wordposPATT.match(featurelist[0])
        if matchobj:
            wordpos = (matchobj.group(1), matchobj.group(2))
        else:
            print("Error with vector file matching " + featurelist[0])
            return

        (word, pos) = wordpos
        # accept every word unless filtering is on, in which case only filterwords
        add = (not self.filter) or (word + "/" + pos in self.filterwords)

        if add:
            self.thisvector = WordVector(wordpos)

            featurelist.reverse()  # reverse list so neighbours and scores can be popped off
            featurelist.pop()  # take off last item, which is the word itself
            if not Thesaurus.byblo:
                # appthes-generated files carry two extra fields; byblo files have none
                self.thisvector.width = float(featurelist.pop())
                self.thisvector.length = float(featurelist.pop())
            self.updatesimvector(wordpos, featurelist)
            self.thisvector.topk(self.k)
            self.vectordict[wordpos] = self.thisvector
            self.updated += 1

    def updatesimvector(self, wordpos, featurelist):
        # neighbour labels and scores alternate on the (reversed) list
        while featurelist:
            f = featurelist.pop()
            sc = featurelist.pop()
            self.thisvector.allsims[f] = float(sc)


    def makematrix(self):
        self.fkeys = sorted(self.allfeatures.keys())
        for i, f in enumerate(self.fkeys):
            self.fk_idx[f] = i
        del self.fkeys
        del self.allfeatures
        self.dim = len(self.fk_idx)
        print("Dimensionality is " + str(self.dim))
        update_params(self.dim, self.adja, self.adjb)
        self.makearrays()

    def makearrays(self):
        # convert each word vector's dictionary of features into a sparse array based on fk_idx
        for wordvector in self.vectordict.values():
            temparray = numpy.zeros(self.dim)
            for feature in wordvector.vector.keys():
                col = self.fk_idx[feature]
                temparray[col] = wordvector.vector[feature]
            wordvector.array = sparse.csr_matrix(temparray)

    def allpairssims(self, metric):
        if self.simcache:
            # read precomputed similarities from the sim cache
            self.readsims()
        else:
            # compute all-pairs sims and write the sim cache
            with open(self.simcachefile, 'w') as outstream:
                done = 0
                for wordvectorA in self.vectordict.values():
                    wordvectorA.allsims = {}
                    for wordvectorB in self.vectordict.values():
                        if wordvectorA.equals(wordvectorB):
                            continue  # skip self-similarity
                        label = wordvectorB.word + "/" + wordvectorB.pos
                        sim = wordvectorA.findsim(wordvectorB, metric)
                        if sim < 0 or sim > 1:
                            # out-of-range similarity: repeat the calculation with debugging on
                            wordvectorA.debug = True
                            wordvectorA.findsim(wordvectorB, metric)
                        wordvectorA.allsims[label] = sim
                    wordvectorA.outputtopk(outstream, self.k)

                    done += 1
                    if done % 100 == 0:
                        print("Completed similarity calculations for " + str(done) + " words")

    def outputsim(self, wordA, wordB, metric):
        sim = -1
        if self.simcache:
            (wa, pa) = wordA
            if wordA in self.vectordict:
                (wb, pb) = wordB
                label = wb + "/" + pb
                if label in self.vectordict[wordA].allsims:
                    sim = self.vectordict[wordA].allsims[label]
                    print("Similarity between " + wa + "/" + pa + " and " + wb + "/" + pb + " is " + str(sim))
                else:
                    print(label + " not in neighbour set")
            else:
                print(wa + "/" + pa + " not in dictionary")
        else:
            if wordA in self.vectordict:
                vectorA = self.vectordict[wordA]
                if wordB in self.vectordict:
                    vectorB = self.vectordict[wordB]
                    sim = vectorA.findsim(vectorB, metric)
                    print("Similarity between " + vectorA.word + "/" + vectorA.pos + " and " + vectorB.word + "/" + vectorB.pos + " is " + str(sim))
                    print("(" + str(vectorA.width) + ", " + str(vectorB.width) + ")")
                else:
                    (word, pos) = wordB
                    print(word + "/" + pos + " not in dictionary")
            else:
                (word, pos) = wordA
                print(word + "/" + pos + " not in dictionary")


    def topk(self, k):
        # retain top k neighbours for each word
        for thisvector in self.vectordict.values():
            thisvector.topk(k)

    def topsim(self, sim):
        # retain similarities over the sim threshold
        for thisvector in self.vectordict.values():
            thisvector.keeptopsim(sim)

    def displayneighs(self, word, k):
        if word in self.vectordict:
            vector = self.vectordict[word]
            vector.topk(k)
            vector.displaysims()
        else:
            (word, pos) = word
            print(word + "/" + pos + " not in dictionary")

    def analyse(self):
        totaltop = 0.0
        totalavg = 0.0
        squaretop = 0.0
        squareavg = 0.0
        count = 0
        correlationx = []
        correlationy1 = []
        correlationy2 = []
        totalsd = 0.0
        squaresd = 0.0

        for wordvectorA in self.vectordict.values():
            count += 1
            totaltop += wordvectorA.topsim
            squaretop += wordvectorA.topsim * wordvectorA.topsim
            totalavg += wordvectorA.avgsim
            squareavg += wordvectorA.avgsim * wordvectorA.avgsim
            totalsd += wordvectorA.sd
            squaresd += wordvectorA.sd * wordvectorA.sd
            correlationx.append(float(wordvectorA.width))
            correlationy1.append(float(wordvectorA.topsim))
            correlationy2.append(float(wordvectorA.avgsim))

        # mean and standard deviation of the top, average, and sd similarities
        avgtop = totaltop / count
        sdtop = pow(squaretop / count - avgtop * avgtop, 0.5)
        avgavg = totalavg / count
        sdavg = pow(squareavg / count - avgavg * avgavg, 0.5)
        avgsd = totalsd / count
        sdsd = pow(squaresd / count - avgsd * avgsd, 0.5)

        print("Top similarity: average = " + str(avgtop) + " sd = " + str(sdtop))
        print("Average similarity: average = " + str(avgavg) + " sd = " + str(sdavg))
        print("SD similarity: average = " + str(avgsd) + " sd = " + str(sdsd))

        # regression and rank correlation between vector width and top similarity
        x = numpy.array(correlationx)
        y = numpy.array(correlationy1)
        thispoly = numpy.poly1d(numpy.polyfit(x, y, 1))
        pr = stats.spearmanr(x, y)
        # self.showpoly(x, y, thispoly, "Regression line for width and top similarity", pr, 1, 1)
        print("SRCC for width and top similarity is " + str(pr[0]) + " (" + str(pr[1]) + ")")
        print(thispoly)

        # the same for vector width and average similarity
        y = numpy.array(correlationy2)
        thispoly = numpy.poly1d(numpy.polyfit(x, y, 1))
        pr = stats.spearmanr(x, y)
        # self.showpoly(x, y, thispoly, "Regression line for width and average similarity", pr, 1, 1)
        print("SRCC for width and average similarity is " + str(pr[0]) + " (" + str(pr[1]) + ")")
        print(thispoly)

    def showpoly(self, x, y, poly, title, pr, xl, yl):
        # scatter the data with the fitted regression line, annotated with the SRCC
        xp = numpy.linspace(0, xl, 100)
        plt.plot(x, y, '.', xp, poly(xp), '-')
        plt.ylim(0, yl)
        plt.title(title)
        plt.text(0.05, yl * 0.9, "srcc = " + str(pr[0]))
        plt.text(0.05, yl * 0.8, "p = " + str(pr[1]))
        plt.show()
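
For orientation, here is a minimal usage sketch of the class above. The file names and the ('dog', 'N') query are hypothetical, and the 'cosine' metric string assumes WordVector.findsim accepts that name:

# Minimal usage sketch (hypothetical file names and query word).
thes = Thesaurus('vectors.txt',  # tab-separated word/POS vector file
                 'sims.txt',     # similarity cache to write (or read back)
                 False,          # simcache=False: compute sims instead of reading the cache
                 5,              # windows, stored on WordVector.windows
                 10,             # k: neighbours to retain per word
                 1.0, 0.0,       # adja, adjb: forwarded to update_params
                 True)           # compress: build sparse arrays for fast sim calcs
thes.readvectors()
thes.allpairssims('cosine')      # computes all pairs and writes the top-k per word to 'sims.txt'
thes.displayneighs(('dog', 'N'), 5)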