def __init__(self, lp, PROGRAM_Halt, wv_fileaddress, max_rank):
    self.NOT_FOUND = 1
    self.lp = lp
    self.PROGRAM_Halt = PROGRAM_Halt

    import numpy
    try:
        from wvlib import wvlib
    except ImportError:
        import wvlib
    try:
        self.lp("Loading word-embeddings matrix from file:" + wv_fileaddress + "\nmax_rank: " + str(max_rank))
        self.__w2v_model = wvlib.load(wv_fileaddress, max_rank=max_rank)
    except Exception as E:
        self.PROGRAM_Halt("Error loading word2vec model.\nError:" + str(E))

    self.lp("Normalizing word-embeddings matrix.")
    self.__w2v_model.normalize()
    # Index 0 is reserved for the mask token, index 1 for out-of-vocabulary words.
    self.__words = [u'_<_MASK__>_', u'_<_OOV_>_'] + [w.decode('utf-8') for w in self.__w2v_model.words()]
    # If decoding fails on some entries, use: w.decode('utf-8', 'ignore')
    self.__word_index_dict = {w: i for i, w in enumerate(self.__words)}
    latent_dim = self.__w2v_model._vectors.vectors[0].shape[0]
    # Row 0: zeros for the mask; row 1: a random vector for OOV; remaining rows: the pretrained vectors.
    self.__weights = numpy.concatenate([numpy.zeros((1, latent_dim)),
                                        numpy.random.randn(1, latent_dim),
                                        numpy.asarray(self.__w2v_model.vectors())])
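A note on the layout above: row 0 of the weight matrix is all zeros (the mask token) and row 1 is a random vector (out-of-vocabulary), which is presumably why self.NOT_FOUND is set to 1. A minimal sketch of how token sequences could be mapped onto those indices before an embedding lookup; the function name encode_tokens and the max_len parameter are illustrative, not part of the original class:

import numpy

def encode_tokens(tokens, word_index_dict, max_len, mask_index=0, oov_index=1):
    """Map tokens to a fixed-length index vector: unknown words get the OOV row,
    and the sequence is padded with the mask row up to max_len."""
    indices = [word_index_dict.get(t, oov_index) for t in tokens[:max_len]]
    indices += [mask_index] * (max_len - len(indices))
    return numpy.asarray(indices, dtype='int32')

# Usage sketch: weights[encode_tokens(tokens, word_index_dict, max_len=50)] gathers the vectors.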
Example #2
                 os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/RW-STANFORD', 'EN-RW-STANFORD.txt'),
                 os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/VERB-143', 'EN-VERB-143.txt'),
                 os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/WS-353-REL', 'EN-WS-353-REL.txt'),
                 os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/WS-353-SIM', 'EN-WS-353-SIM.txt'),
                 os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/WS-353-ALL', 'EN-WS-353-ALL.txt'),
                 os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/YP-130', 'EN-YP-130.txt'),
                 os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/MayoSRS', 'MayoSRS.txt'),
                 os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/UMNSRS', 'UMNSRS-sim.txt'),
                 os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/UMNSRS', 'UMNSRS-rel.txt')]

    from tools import utilities as util
    config = CommandLine()
    #from word2Vec import tools as util
    if os.path.isfile(config.inputFile):
        try:
            wv = wvlib.load(config.inputFile).normalize()
            #references = [(r, eva.read_referenceSingleWords(r)) for r in evafilePath]
            references = [(r, eva.read_reference(r)) for r in evafilePath]
            print '%20s\trho\tmissed\ttotal\tratio' % 'dataset'
            for name, ref in references:
                #rho, count = eva.evaluateTest(newWordVecs, ref,wordList)
                rho, count = eva.evaluate(wv, ref)
                total, miss = len(ref), len(ref) - count
                print '%20s\t%.4f\t%d\t%d\t(%.2f%%)' % \
                (eva.baseroot(name), rho, miss, total, 100.*miss/total)
        except FormatError:
            print "skip", config.inputFile
    else:
        folderList = util.get_filepaths(config.inputFile)
        for i, item in enumerate(folderList):
            filename, file_extension = os.path.splitext(item)
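eva.read_reference and eva.evaluate are not shown in these snippets. Judging from how they are called (one reference file per dataset; Spearman's rho plus the number of scored pairs comes back), a rough, self-contained approximation is sketched below. The whitespace-separated "word1 word2 score" file format and the use of scipy are assumptions; only wv.word_to_vector is taken from the snippets themselves.

import numpy
from scipy.stats import spearmanr

def read_reference_pairs(path):
    """Read 'word1 word2 gold_score' lines from a word-similarity dataset (assumed format)."""
    pairs = []
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 3:
                pairs.append((parts[0].lower(), parts[1].lower(), float(parts[2])))
    return pairs

def evaluate_pairs(wv, pairs):
    """Spearman's rho between model cosine similarities and gold scores, plus the
    number of pairs whose words were both found in the vocabulary."""
    gold, predicted = [], []
    for w1, w2, score in pairs:
        try:
            v1 = wv.word_to_vector(w1)
            v2 = wv.word_to_vector(w2)
        except Exception:
            continue  # at least one word is out of vocabulary
        predicted.append(numpy.dot(v1, v2))  # cosine similarity, since wv is normalized
        gold.append(score)
    rho, _ = spearmanr(gold, predicted)
    return rho, len(gold)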
Example #3
    
if __name__ == '__main__':

    
    #filePath=os.path.join(os.path.dirname(__file__), 'w2vData', 'PubMed15_Dependancy1.txt') #PubMed-w2v.bin #PubMed-and-PMC-w2v.bin
    
    evafilePath=[os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/MayoSRS', 'MayoSRS.txt'),
                 os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/UMNSRS', 'UMNSRS-sim.txt'),
                 os.path.join(os.path.dirname(__file__), 'wvlib/word-similarities/UMNSRS', 'UMNSRS-rel.txt')]
    
    from tools import utilities as util
    config = CommandLine()
    #from word2Vec import tools as util
    if os.path.isfile(config.inputFile):
        try:
            wv = wvlib.load(config.inputFile).normalize()
            #references = [(r, eva.read_referenceSingleWords(r)) for r in evafilePath]
            references = [(r, eva.read_reference(r)) for r in evafilePath]
            print '%20s\trho\tmissed\ttotal\tratio' % 'dataset'
            for name, ref in references:
                #rho, count = eva.evaluateTest(newWordVecs, ref,wordList)
                rho, count = eva.evaluate(wv, ref)
                total, miss = len(ref), len(ref) - count
                print '%20s\t%.4f\t%d\t%d\t(%.2f%%)' % \
                (eva.baseroot(name), rho, miss, total, 100.*miss/total)
        except FormatError:
            print "skip",config.inputFile
    else:
            folderList=util.get_filepaths(config.inputFile)
            for i,item in enumerate(folderList):
                filename, file_extension = os.path.splitext(item)
Example #4
            
            
        if '-f' in opts:
            self.fname = True

    def printHelp(self):
        help_text = __doc__.replace('<PROGNAME>', sys.argv[0], 1)
        print >> sys.stderr, help_text
        exit()
        
if __name__ == "__main__":
    config = CommandLine()

    #from gensim.models import word2vec

    w2v1 = wv.load(config.inputFile).normalize()  # original w2v
    #print w2v1['his']
    w2v1 = dict(w2v1)
    print len(w2v1.keys())

    w2v2 = wv.load(config.retroVector).normalize()  # symmetric w2v
    w2v2 = dict(w2v2)
   
    #print "size of w2v 2:",len(w2v2)
            
    seta = set(w2v1.keys())
    setb = set(w2v2.keys())

    intersection = seta.intersection(setb)

    newWordvector = w2v1.copy()
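    # The example is cut off right after newWordvector = w2v1.copy(). Purely as an
    # illustration (not part of the original snippet), a continuation consistent with
    # the intersection computed above would overwrite the shared entries with the
    # second set of vectors:
    for word in intersection:
        newWordvector[word] = w2v2[word]  # prefer the second (retro) vector where both exist
    print len(newWordvector)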
Example #5
import numpy
import wvlib  # assumed importable, as in the examples above


def get_data(limit=100000):

    # The point of this little program: read the data, fetch the word vectors,
    # and then build both the embedding matrix and the index-encoded examples.

    #wv = wvlib.load("/usr/share/ParseBank/vector-space-models/FIN/w2v_pbv3_lm.rev01.bin",max_rank=1000000)
    wv = wvlib.load("/home/ginter/w2v/pb34_lemma_200_v2.bin").normalize()  # remember to normalize!
    lines2 = open('./example_harvest/the_res', 'rt').readlines()[:50]
    lines = open('test_sent_2.txt', 'rt').readlines()

    lines += lines2

    # The vocabulary here is small enough that a separate vocab_set is unnecessary.
    vocab_list = []
    vecs = []

    examples = []
    corrupt_examples = []

    labels = []
    incomplete = []

    triplets = set()

    for line in lines[1:]:
        if len(line) > 4:
            exp = line.strip()
            indexes = []
            for w in exp.split()[:-1]:
                if w not in vocab_list:
                    vocab_list.append(w)
                    try:
                        vecs.append(wv.word_to_vector(w.decode('utf8')))
                    except Exception:
                        # Word not in the embedding vocabulary: fall back to a zero
                        # vector and remember it so the example can be flagged later.
                        vecs.append(numpy.zeros(200,))
                        incomplete.append(w)
                indexes.append(vocab_list.index(w))

            destination_ok = True
            for w in exp.split()[:-1]:
                if w in incomplete:
                    #print '!', w
                    destination_ok = False

            if destination_ok:
                try:
                    key = '.'.join([str(indexes[0]), str(indexes[1]), str(indexes[2])])
                    if key not in triplets:
                        examples.append((indexes, exp.split()[-1]))
                        print len(examples)
                        triplets.add(key)
                    if len(examples) > limit:
                        break
                except IndexError:
                    # Fewer than three tokens in this example; just report it.
                    print indexes

            else:
                corrupt_examples.append((indexes, exp.split()[-1]))

    return examples, vocab_list, vecs
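Example #8 below unpickles an (examples, vocab, vecs) triple from 'dataset_xm.list', and its commented-out lines dump exactly such a triple; so the output of get_data is presumably serialized along these lines (the wrapper below is an illustrative sketch, reusing the file name from Example #8):

import pickle

if __name__ == '__main__':
    examples, vocab_list, vecs = get_data(limit=100000)
    out = open('dataset_xm.list', 'wb')
    pickle.dump((examples, vocab_list, vecs), out)
    out.close()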
Example #6
def train_vectors(data, vec_size=50, window_size=2):


    wv = wvlib.load("/usr/share/ParseBank/vector-space-models/FIN/w2v_pbv3_lm.rev01.bin",max_rank=400000)

    #Let's not normalize, just for kicks
    #wv = wv.normalize()
    rng = np.random.RandomState(1234)
    vt = VectorThing(rng, vec_size, data)
    minibatch_size = 10

    parameters = []
    input_vectors = []

    input = T.dmatrix()
    ref = T.dmatrix()

    cls = Classifier(rng, window_size, vec_size, input, ref)
    #functions
    #cost, updates = cls.get_cost_and_updates(learning_rate=0.1)
    #train = theano.function([input, ref], cost, updates=updates)
    
    train = cls.get_training_function()


    #Make batches for sentence, cut them into parts of 10 or so,
    #train maximum of that amount at once

    for epoch in range(0,50):
        print epoch
        epoch_cost = []
        for i, sentence in enumerate(data):
            #Create training material for this sentence
            sentence_refs = []
            sentence_inputs = []
            #print i
            if i%100 == 0 and i > 0:
                print np.mean(epoch_cost), i
            for win in window(sentence, n=window_size):
                try:

                    ref_vector = wv.word_to_vector(win[-1])
                    w_vectors = []
                    for w in win[:-1]:
                        w_vectors.append(wv.word_to_vector(w))

                    sentence_refs.append(ref_vector)
                    sentence_inputs.append(np.concatenate(w_vectors))

                except Exception:
                    # A word in this window is missing from the vector model; skip it.
                    pass

            batches_ref = []
            for b_ref in chunks(sentence_refs, minibatch_size):
                batches_ref.append(b_ref)
            batches_input = []
            for b_inp in chunks(sentence_inputs, minibatch_size):
                batches_input.append(b_inp)

            #insert paragraph vector
            cls.paragraph.set_value(vt.sentence_vecs[i].eval())
            #before = vt.sentence_vecs[i].eval()
            #Train them
            for rf, inpt in zip(batches_ref, batches_input):
                #import pdb;pdb.set_trace()
                batch_cost = train(inpt, rf)
                epoch_cost.append(batch_cost)
                #print batch_cost, len(sentence_refs)
            #recover the new vector
            vt.sentence_vecs[i].set_value(cls.paragraph.eval())
            #after = vt.sentence_vecs[i].eval()
        print 'mean_cost', np.mean(epoch_cost)
        save_model(vt, 'model_epoch_' + str(epoch))


    ####AGAIN! Now with only paragraph vectors#####

    cls.learning_rate = 0.1
    train_p = cls.get_training_function_only_paragraph()

    for epoch in range(0,100):
        print epoch
        epoch_cost = []
        for i, sentence in enumerate(data):
            #Create training material for this sentence
            sentence_refs = []
            sentence_inputs = []
            #print i
            if i%100 == 0 and i > 0:
                print np.mean(epoch_cost), i
            for win in window(sentence, n=window_size):
                try:

                    ref_vector = wv.word_to_vector(win[-1])
                    w_vectors = []
                    for w in win[:-1]:
                        w_vectors.append(wv.word_to_vector(w))

                    sentence_refs.append(ref_vector)
                    sentence_inputs.append(np.concatenate(w_vectors))

                except Exception:
                    # A word in this window is missing from the vector model; skip it.
                    pass

            batches_ref = []
            for b_ref in chunks(sentence_refs, minibatch_size):
                batches_ref.append(b_ref)
            batches_input = []
            for b_inp in chunks(sentence_inputs, minibatch_size):
                batches_input.append(b_inp)

            #insert paragraph vector
            cls.paragraph.set_value(vt.sentence_vecs[i].eval())
            #before = vt.sentence_vecs[i].eval()
            #Train them
            for rf, inpt in zip(batches_ref, batches_input):
                #import pdb;pdb.set_trace()
                batch_cost = train_p(inpt, rf)  # paragraph-vector-only update in this second pass
                epoch_cost.append(batch_cost)
                #print batch_cost, len(sentence_refs)
            #recover the new vector
            vt.sentence_vecs[i].set_value(cls.paragraph.eval())
            #after = vt.sentence_vecs[i].eval()
        print 'mean_cost', np.mean(epoch_cost)
        save_model(vt, 'model_epoch_' + str(epoch))
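window and chunks are used above but not defined in the snippet. Minimal versions consistent with how they are called (sliding windows of window_size tokens over a sentence; minibatch-sized slices suitable for the Theano training function) could look like this:

import numpy as np

def window(seq, n=2):
    """Yield successive sliding windows of n consecutive items."""
    for i in range(len(seq) - n + 1):
        yield seq[i:i + n]

def chunks(seq, size):
    """Yield successive slices of at most `size` items as arrays."""
    for i in range(0, len(seq), size):
        yield np.asarray(seq[i:i + size])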
Example #7
import gzip
import math
import sys
import numpy
import wvlib as wv  # assumed alias; the snippet calls wv.load(...)

from tools import utilities

def read_word_vectors(filename):
  ''' Read all the word vectors and normalize them. '''
  word_vecs = {}
  if filename.endswith('.gz'):
    file_object = gzip.open(filename, 'r')
  else:
    file_object = open(filename, 'r')

  for line_num, line in enumerate(file_object):
    line = line.strip().lower()
    word = line.split()[0]
    word_vecs[word] = numpy.zeros(len(line.split())-1, dtype=float)
    for index, vec_val in enumerate(line.split()[1:]):
      word_vecs[word][index] = float(vec_val)      
    # normalize the weight vector (L2 norm; the 1e-6 guards against division by zero)
    word_vecs[word] /= math.sqrt((word_vecs[word]**2).sum() + 1e-6)        

  sys.stderr.write("Vectors read from: "+filename+" \n")
  return word_vecs
  
if __name__ == "__main__":

    
    inputw2vFile = sys.argv[1]
    outputFile = sys.argv[2]
    
    util = utilities()
    w2v1 = wv.load(inputw2vFile)  # original w2v
    w2v1 = dict(w2v1)
    util.write_word_vecs(w2v1, outputFile)  # output file
Example #8
def main():

    '''
    made_up_data = []
    for i in range(100):
        made_up_data.append([random.randint(1,6) / 10.0 for y in range(3)] + [random.randint(6,9) / 10.0 for y in range(3)])

    #Let's make a simple ae
    split_data = numpy.array(made_up_data)[:,:3], numpy.array(made_up_data)[:,3:]
    #Let's test the concatenation layer thing!
    '''
    #Config stuff
    n_in_vecs = 2
    vec_size = 200
    hidden_size = 200
    minibatch_size = 1000

    #Let us load the data
    inf = open('dataset_xm.list', 'rb')
    examples, vocab, vecs = pickle.load(inf)  # alternatively: vector_grabber.get_data()
    inf.close()
    #examples, vocab, vecs = vector_grabber_mini.get_data()
    print len(examples)
    #import pdb;pdb.set_trace()
    #out = open('the_clean_data_mini','wb')
    #pickle.dump((examples, vocab, vecs), out)
    #out.close()
    #import pdb;pdb.set_trace()
    #vec table made!

    #import pdb;pdb.set_trace()

    n_vecs = []
    for v in vecs:
        if len(v) < 300:
            n_vecs.append(v)
        else:
            n_vecs.append(numpy.zeros(200))

    vec_table = theano.shared(value=numpy.array(n_vecs), name='W', borrow=True)

    n_examples = []
    for e in examples:
        if e[-1].startswith('X'):
            n_examples.append(e[:-1])

    eval_cong = []
    eval_incong = []
    for e in examples:
        if e[-1].startswith('c'):
            eval_cong.append(e[:-1])
        elif e[-1].startswith('i'):
            eval_incong.append(e[:-1])

    #minibatches
    minibatches = []
    for i in range(0,len(n_examples), minibatch_size):
        minibatches.append(numpy.array(n_examples[i:i+minibatch_size]))

    #Let's create input variables:
    input_variables = [T.lvector() for i in range(n_in_vecs)]
    inputs = [vec_table[input_variables[0]], vec_table[input_variables[1]]]
    conc_layer = concatenation_layer(inputs)

    o_input = T.lvector()
    rng = numpy.random.RandomState(1234)

    #SV_ae
    sv_ae = simple_ae_layer(conc_layer.output, vec_size*n_in_vecs, vec_size*n_in_vecs, hidden_size, rng)
    sv_sp_layer = split_layer(sv_ae.output, n_in_vecs, vec_size)
    sv_res_f = theano.function(input_variables, sv_sp_layer.output)
    sv_cost = T.mean((conc_layer.output - sv_ae.output) ** 2)
    sv_cost_f = theano.function(input_variables, [sv_cost])

    learning_rate = theano.shared(0.8)

    sv_gparams = [T.grad(sv_cost, param) for param in sv_ae.params]
    updates = [(param, param - learning_rate * gparam) for param, gparam in zip(sv_ae.params, sv_gparams)]
    sv_train_f = theano.function(input_variables, sv_cost, updates=updates)

    #Mapping
    mapl = linear_mapping_layer(sv_ae.hidden_output, vec_size, hidden_size, rng)
    m_cost = T.mean((vec_table[o_input] - mapl.output) ** 2)
    #learning_rate = theano.shared(0.5)
    gparams = [T.grad(m_cost, param) for param in mapl.params]
    das_parameters = [mapl.W, sv_ae.W]
    updates = [(param, param - learning_rate * gparam) for param, gparam in zip(das_parameters, gparams)]
    m_train_f = theano.function([input_variables[0], input_variables[1], o_input], m_cost, updates=updates)
    m_cost_f = theano.function([input_variables[0], input_variables[1], o_input], m_cost)
    #
    res_f = theano.function(inputs, mapl.output)
    vec_f = theano.function([o_input], vec_table[o_input])
    gres_f = theano.function(input_variables, mapl.output)
    vres_f = theano.function(inputs, sv_ae.hidden_output)

    #Now let us do evaluation related stuffs!
    #Let us load up a model
    #model__latest_200_ae_map680
    # Other checkpoints tried: model_200_ae_map7880, model_200_ae_map620.
    inf = open('./models_ae/model__latest_200_ae_map680', 'rb')
    params = pickle.load(inf)
    #pickle.dump([sv_ae.params, mapl.params], outf)
    for p,a in zip(params[0], sv_ae.params):
        a.set_value(p.get_value())
    for p,a in zip(params[1], mapl.params):
        a.set_value(p.get_value())
    inf.close()

    xc = len(eval_cong)
    c_results = gres_f(numpy.array(eval_cong).reshape(xc,3)[:,0], numpy.array(eval_cong).reshape(xc,3)[:,1])

    xi = len(eval_incong)
    ic_results = gres_f(numpy.array(eval_incong).reshape(xi,3)[:,0], numpy.array(eval_incong).reshape(xi,3)[:,1])

    c_objs = vec_f(numpy.array(eval_cong).reshape(xc,3)[:,2])
    ic_objs = vec_f(numpy.array(eval_incong).reshape(xi,3)[:,2])

    #Find the pairs
    the_pairs_c = []
    the_pairs_ic = []
    for trp in eval_cong:
        the_pairs_c.append(str(trp[0][0]) + '+' + str(trp[0][1]))
    for trp in eval_incong:
        the_pairs_ic.append(str(trp[0][0]) + '+' + str(trp[0][1]))

    pair_map = []
    for trp in eval_cong:
        if str(trp[0][0]) + '+' + str(trp[0][1]) in the_pairs_c and str(trp[0][0]) + '+' + str(trp[0][1]) in the_pairs_ic:
            pair_map.append((the_pairs_c.index(str(trp[0][0]) + '+' + str(trp[0][1])),  the_pairs_ic.index(str(trp[0][0]) + '+' + str(trp[0][1]))))

    # Count how often the prediction is closer (in cosine distance) to the congruent
    # object than to the incongruent one.
    count = 0

    for tpl in pair_map:
        cd = cosine(c_objs[tpl[0]], c_results[tpl[0]])
        icd = cosine(ic_objs[tpl[1]], ic_results[tpl[1]])
        print cd, icd
        if cd < icd:
            count += 1

    print count/float(len(pair_map))
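cosine is used in the loop above without being defined or imported in the snippet; it behaves like a distance (smaller means the prediction is closer to the congruent object), so it is presumably scipy.spatial.distance.cosine. A self-contained near-equivalent for reference:

import numpy

def cosine(u, v):
    """Cosine distance: 1 - cos(u, v). 0 for identical directions, 2 for opposite."""
    u = numpy.asarray(u, dtype=float)
    v = numpy.asarray(v, dtype=float)
    return 1.0 - numpy.dot(u, v) / (numpy.linalg.norm(u) * numpy.linalg.norm(v))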



    #The fun loop
    #wv = wvlib.load("/usr/share/ParseBank/vector-space-models/FIN/w2v_pbv3_lm.rev01.bin",max_rank=100000)
    #import pdb;pdb.set_trace()
    wv = wvlib.load("/home/ginter/w2v/pb34_lemma_200_v2.bin",max_rank=50000).normalize()
    #import pdb;pdb.set_trace()
    #The loop
    while True:
        try:
            xs = raw_input('Subject ')
            xv = raw_input('Verb ')
            xsv = wv.word_to_vector(xs.decode('utf8'))
            xvv = wv.word_to_vector(xv.decode('utf8'))

            if xs.startswith('exit') or xv.startswith('exit'):
                break

            # alternative: for t in wv.nearest(vres_f(numpy.array([xsv,]), numpy.array([xvv]))[0]):
            for t in wv.approximate_nearest(res_f([xsv], [xvv])[0]):
                print t
            print
        except Exception:
            # Out-of-vocabulary input (or a decoding problem): just prompt again.
            pass



    '''