def get_w2v_labels(y_original, dim=200):
    y_new = np.zeros((y_original.shape[0], dim))
    if dim == 200:
        model = word2vec.load(root + 'word2vec/vectors.bin')
    elif dim in [100, 50, 25, 10]:
        model = word2vec.load(root + 'semantic-network/data/text8-%s.bin'%dim)
    else:
        raise NotImplementedError
    for i, label in enumerate(y_original):
        y_new[i,:] = model[classes[label]]

    return y_new
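# Hedged usage sketch (not from the original source): assumes the module-level
# `root` and `classes` used by get_w2v_labels are defined, and that every
# classes[label] is a word in the loaded model's vocabulary.
def _demo_get_w2v_labels():
    import numpy as np
    y_train = np.array([0, 2, 1])             # hypothetical integer class labels
    y_w2v = get_w2v_labels(y_train, dim=200)  # one 200-d vector per label
    print(y_w2v.shape)                        # -> (3, 200)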
def get_char_embedding():
    """提取字向量,并保存至 ../data/char_embedding.npy"""
    print('getting the char_embedding.npy')
    wv = word2vec.load('../raw_data/char_embedding.txt')
    char_embedding = wv.vectors
    chars = wv.vocab
    n_special_sym = len(SPECIAL_SYMBOL)
    sr_id2char = pd.Series(chars, index=range(n_special_sym, n_special_sym + len(chars)))
    sr_char2id = pd.Series(range(n_special_sym, n_special_sym + len(chars)), index=chars)

    # Add the special symbols: <PAD>:0, <UNK>:1
    embedding_size = 256

    vec_special_sym = np.random.randn(n_special_sym, embedding_size)
    for i in range(n_special_sym):
        sr_id2char[i] = SPECIAL_SYMBOL[i]
        sr_char2id[SPECIAL_SYMBOL[i]] = i
    char_embedding = np.vstack([vec_special_sym, char_embedding])
    # Save the character embeddings
    save_path = '../data/'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    np.save(save_path + 'char_embedding.npy', char_embedding)
    # Save the char <-> id mappings
    with open(save_path + 'sr_char2id.pkl', 'wb') as outp:
        pickle.dump(sr_id2char, outp)
        pickle.dump(sr_char2id, outp)
    print('Saved char_embedding.npy to ../data/char_embedding.npy')
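# Hedged round-trip check (not from the original source): reload the files written
# by get_char_embedding above, using the same paths and the same pickle.dump order.
def _demo_reload_char_embedding():
    import pickle
    import numpy as np
    char_embedding = np.load('../data/char_embedding.npy')
    with open('../data/sr_char2id.pkl', 'rb') as inp:
        sr_id2char = pickle.load(inp)   # dumped first
        sr_char2id = pickle.load(inp)   # dumped second
    print(char_embedding.shape, len(sr_id2char), len(sr_char2id))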
Example #3
def predict():
    model = word2vec.load('./latents.bin')
    predictions = []
    with open('MSRParaphraseCorpus/MSR_easy.txt') as f:
        data = f.readlines()
    block = []
    for each in data:
        block.append(flex(getWords(each.lower())))
    i = 1
    while i+1 < len(block):
        if abs(int(block[i][0]) - int(block[i+1][0])) < 200:
            t1 = block[i][1:]
            t2 = block[i+1][1:]
            t = union(t1, t2)
            # -------------- semantic similarity between two sentences ------- #
            similarity_ssv = ssv(t, t1, t2, model)
            #print 'ssv ', similarity_ssv

            # ----------------- word similarity between sentences ------------ #
            similarity_wo = wo(t, t1, t2, model)
            #print 'wo ', similarity_wo

            alpha = 0.8
            similarity = alpha*similarity_ssv + (1-alpha)*similarity_wo
            print similarity, str(block[i][0]), str(block[i+1][0])
            predictions.append([similarity, str(block[i][0]), str(block[i+1][0])])
            i = i + 2
        else:
            i = i + 1
    return predictions
def loadArg1():
    model=word2vec.load("/mnt/mint_share/text8.bin")
    data=np.empty((17572,1,100,100),dtype='float64')
    label=np.empty((17572,),dtype='uint8')
    with codecs.open("/mnt/mint_share/train_pdtb.json","rU","utf-8") as f:
        for i,line in  enumerate(f):
            unit=json.loads(line)
            len1 = len(unit['Arg1']['Word'])
            if(len1 <100):
                for j in range(len1):
                    try:
                        j_ = model[unit['Arg1']['Word'][j]]
                    except KeyError:
                        j_ = model['fillin']
                    data[i,:,j,:]= j_
                for j in range(100- len1):
                    data[i,:,len1+j,:]=model['fillin']
            else:
                for j in range(100):
                    try:
                        j_ = model[unit['Arg1']['Word'][j]]
                    except KeyError:
                        j_ = model['fillin']
                    data[i,:,j,:]= j_
    with open("arg1_image_100","wb") as f1:
        # dill.dump(data,f1)
        cPickle.dump(data,f1,protocol=2)
Example #5
def test():
    # ------------ common between two measurments ---------------------------- #
    t1 = "a quick brown dog jumps over the lazy fox"
    t2 = "a quick brown fox jumps over the lazy dog"
    t2 = "jumps over the lazy fox is a quick brown dog"
    #t1 = "Amrozi accused his brother, whom he called the witness, of deliberately distorting his evidence.".lower()
    #t2 = "Referring to him as only the witness, Amrozi accused his brother of deliberately distorting his evidence.".lower()
    #t1 = "i have to find you, tell me you need me."
    #t2 = "don't wanna know who is taking you home"
    t1 = getWords(t1)
    t2 = getWords(t2)
    t1 = flex(t1)
    t2 = flex(t2)
    t = union(t1, t2)
    #t = ["a", "brown", "jumps", "the", "fox", "dog", "quick", "over", "lazy"]
    print t

    model = word2vec.load('./latents.bin')
    # -------------- semantic similarity between two sentences --------------- #
    similarity_ssv = ssv(t, t1, t2, model)
    print 'ssv ', similarity_ssv

    # ----------------- word similarity between sentences -------------------- #
    similarity_wo = wo(t, t1, t2, model)
    print 'wo ', similarity_wo

    alpha = 0.8
    print alpha*similarity_ssv + (1-alpha)*similarity_wo
def embed(sentences):
    model = word2vec.load('~/word2vec_models/GoogleNews-vectors-negative300.bin')
    embedded_sentences = []
    tokenized_sentences = []

    max_len = 0
    for sentence in sentences:
        tokenized_sentence = sent_tokenize(sentence)
        tokenized_sentences.append(tokenized_sentence)
        if len(tokenized_sentence) > max_len:
            max_len = len(tokenized_sentence)


    for sentence in sentences:
        tokenized_sentence = sent_tokenize(sentence)
        embedded_words = []
        
        for word in tokenized_sentence:
            try:
                vec = model[word]  # look up the token itself, not the literal string 'word'
            except KeyError:
                vec = np.zeros(300)  # out-of-vocabulary tokens get a zero vector
            embedded_words.append(vec)

        #padding    
        for i in range(max_len - len(embedded_words)):
            embedded_words.append(np.zeros(300))

        embedded_sentences.append(embedded_words)

    embedded_sentences = np.array(embedded_sentences)

    return embedded_sentences
Example #7
def save_latent_features_of_tagsjson():
    model = word2vec.load('../lib/word2vec/vectors.bin')
    all_tags = []
    with open('tags.json', 'r') as f:
        data = json.load(f)

    i=0
    while i < len(data['item']):
        all_tags = all_tags + data['item'][i]['tag_text'].replace('"','').lower().split('|')
        all_tags = all_tags + data['item'][i]['tag_query'].replace('"','').lower().split('|')
        i=i+1
    i=0
    while i < len(all_tags):
        if all_tags[i][0] == ' ':
            all_tags[i] = all_tags[i][1:]
            i=i-1
        i=i+1
    print all_tags
    latent_tags=[]
    latent_model=[]
    for i in all_tags:
        try:
            a=model[str(i)]
            latent_tags.append(str(i))
            latent_model.append(a)
        except Exception, e:
            print i
            print e
def assign_pretrained_word_embedding(sess,vocabulary_index2word,vocab_size,model,word2vec_model_path=None):
    print("using pre-trained word emebedding.started.word2vec_model_path:",word2vec_model_path)
    # word2vecc=word2vec.load('word_embedding.txt') #load vocab-vector fiel.word2vecc['w91874']
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    word2vec_dict = {}
    for word, vector in zip(word2vec_model.vocab, word2vec_model.vectors):
        word2vec_dict[word] = vector
    word_embedding_2dlist = [[]] * vocab_size  # create an empty word_embedding list.
    word_embedding_2dlist[0] = np.zeros(FLAGS.embed_size)  # assign empty for first word:'PAD'
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for random variables.
    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):  # loop each word
        word = vocabulary_index2word[i]  # get a word
        embedding = None
        try:
            embedding = word2vec_dict[word]  # try to get vector:it is an array.
        except Exception:
            embedding = None
        if embedding is not None:  # the word has a pre-trained embedding
            word_embedding_2dlist[i] = embedding  # assign the array to this word
            count_exist = count_exist + 1
        else:  # no embedding for this word
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, FLAGS.embed_size)  # init a random vector for the word
            count_not_exist = count_not_exist + 1
    word_embedding_final = np.array(word_embedding_2dlist)  # convert to a 2d array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)  # convert to a tensor
    t_assign_embedding = tf.assign(model.Embedding, word_embedding)  # assign this value to the embedding variable of the model
    sess.run(t_assign_embedding)
    print("words with an existing embedding:", count_exist, "; words without an embedding:", count_not_exist)
    print("using pre-trained word embedding. ended...")
Example #9
def test_distance():
    model = word2vec.load(output_txt)
    metrics = model.distance("the", "the", "the")
    assert len(metrics) == 3
    for item in metrics:
        # There should be 3 items per record
        assert len(item) == 3
def create_voabulary(simple=None,word2vec_model_path='zhihu-word2vec-title-desc.bin-100',name_scope=''): #zhihu-word2vec-multilabel.bin-100
    cache_path ='cache_vocabulary_label_pik/'+ name_scope + "_word_voabulary.pik"
    print("cache_path:",cache_path,"file_exists:",os.path.exists(cache_path))
    if os.path.exists(cache_path):  # if the cache file exists, load it directly
        with open(cache_path, 'rb') as data_f:
            vocabulary_word2index, vocabulary_index2word=pickle.load(data_f)
            return vocabulary_word2index, vocabulary_index2word
    else:
        vocabulary_word2index={}
        vocabulary_index2word={}
        if simple is not None:
            word2vec_model_path='zhihu-word2vec.bin-100'
        print("create vocabulary. word2vec_model_path:",word2vec_model_path)
        model=word2vec.load(word2vec_model_path,kind='bin')
        vocabulary_word2index['PAD_ID']=0
        vocabulary_index2word[0]='PAD_ID'
        special_index=0
        if 'biLstmTextRelation' in name_scope:
            vocabulary_word2index['EOS']=1 # a special token for the biLstmTextRelation model, used between two sentences
            vocabulary_index2word[1]='EOS'
            special_index=1
        for i,vocab in enumerate(model.vocab):
            vocabulary_word2index[vocab]=i+1+special_index
            vocabulary_index2word[i+1+special_index]=vocab

        # save the vocabulary to the file system if it does not exist yet
        if not os.path.exists(cache_path):  # if the cache file does not exist, write it
            with open(cache_path, 'wb') as data_f:
                pickle.dump((vocabulary_word2index,vocabulary_index2word), data_f)
    return vocabulary_word2index,vocabulary_index2word
Example #11
def test_load_txt():
    model = word2vec.load(output_txt)
    vocab = model.vocab
    vectors = model.vectors

    assert vectors.shape[0] == vocab.shape[0]
    assert vectors.shape[0] > 3000
    assert vectors.shape[1] == 10
Example #12
def test_closest():
    model = word2vec.load(output_txt)
    indexes, metrics = model.closest(model["the"], n=30)
    assert indexes.shape == (30, )
    assert indexes.shape == metrics.shape

    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 30
    assert len(py_response[0]) == 2
Example #13
def load(modelpath):

    model = word2vec.load(modelpath)

    nvocab = [ unicode(i,'utf-8') for i in model.vocab ]
    index = { v:n for n,v in enumerate(nvocab) }
    l2norm = model.l2norm
    
    return (index,l2norm)
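# Hedged usage sketch (not from the original source): assumes the older word2vec
# API used above, where model.l2norm is the row-normalised vector matrix, so a
# word's vector is looked up by its row index.
def _demo_lookup(modelpath, word=u'the'):
    index, l2norm = load(modelpath)
    vec = l2norm[index[word]]   # unit-length vector for `word`
    return vec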
Example #14
def get_feats(seqs, train=False):
    print "get_feats"
    vec_model, dim = word2vec.load(vecfile)
    zero_vec = data_util.zero_vec(dim)
    feats = []
    labels = []
    global label_set
    label_set = set([])
    for s in seqs:
        s_feats = []
        s_labels = []
        for pair in s:
            word = pair[0]
            vector = word2vec.get(word, vec_model)
            s_feats.append(vector)
            s_labels.append(pair[1])
            label_set.add(pair[1])
        feats.append(s_feats)
        labels.append(s_labels)
    if train:
        num_labels = len(list(label_set))
        create_labelencoder(list(label_set), num_labels)
        global max_seq_len
        #max_seq_len = max([len(txt) for txt in feats])
    print "max_seq_len: " + str(max_seq_len)

    # Pad sequences
    #feats = pad_sequences(numpy.array(feats), maxlen=max_seq_len, dtype='float32', padding="pre")
    #labels = pad_sequences(numpy.array(labels), maxlen=max_seq_len, dtype='str', padding="pre", value='O')

    padded_feats = []
    padded_labels = []
    for feat in feats:
        #print "seq len: " + str(len(feat))
        while len(feat) > max_seq_len:
            feat_part = feat[0:max_seq_len]
            padded_feats.append(pad_feat(feat_part, max_seq_len, zero_vec))
            feat = feat[max_seq_len:]
        new_feat = pad_feat(feat, max_seq_len, zero_vec)
        padded_feats.append(new_feat)
    for labs in labels:
        while len(labs) > max_seq_len:
            labs_part = labs[0:max_seq_len]
            padded_labels.append(pad_feat(labs_part, max_seq_len, 'O'))
            labs = labs[max_seq_len:]
        padded_labels.append(pad_feat(labs, max_seq_len, 'O'))
    feats = padded_feats
    labels = padded_labels

    # Encode labels
    encoded_labels = encode_labels(labels, max_len=max_seq_len)
    print "labels[0]: " + str(encoded_labels[0])
    #for row in labels:
    #    encoded_row = encode_labels(row)
    #    encoded_labels.append(encoded_row)
    print "feats: " + str(len(feats)) + " labels: " + str(len(encoded_labels))
    return feats, encoded_labels
def load_matrix(bin_path, input2idx):
    model = word2vec.load(bin_path)
    vector_dim = model.vectors.shape[1]
    matrix = np.zeros((len(input2idx), vector_dim))
    for word, i in input2idx.items():
        embedding_vector = model[word]
        if embedding_vector is not None:
            matrix[i] = embedding_vector
    return matrix
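# Hedged usage sketch (not from the original source): builds a tiny hand-made
# input2idx and fills an embedding matrix from it. Every key must exist in the
# model's vocabulary, otherwise model[word] raises a KeyError.
def _demo_load_matrix(bin_path='vectors.bin'):   # path is illustrative
    input2idx = {'the': 0, 'quick': 1, 'fox': 2}
    matrix = load_matrix(bin_path, input2idx)
    print(matrix.shape)   # -> (3, vector_dim)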
Example #16
def get_embed(csv, col, embed_file):
    if os.path.exists(embed_file):
        return word2vec.load(embed_file)

    def csv2txt(text, voc):
        text = ' '.join([item for item in text.split() if item in voc])
        text += '\n'
        with open('word_token.txt', 'a') as f:
            f.write(text)
    os.system('rm word_token.txt')

    vec = TfidfVectorizer(max_df=0.9, min_df=3, smooth_idf=1, sublinear_tf=1)
    vec.fit(csv[col])
    voc = vec.vocabulary_

    csv[col].apply(csv2txt, args=[voc])
    word2vec.word2vec('word_token.txt', embed_file, 256, verbose=1)
    return word2vec.load(embed_file)
Example #17
def test_prediction():
    model = word2vec.load(output_bin)
    indexes, metrics = model.cosine('the')
    assert indexes.shape == (10,)
    assert indexes.shape == metrics.shape

    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 10
    assert len(py_response[0]) == 2
Example #18
def save():
    model = word2vec.load('data/vec_google.bin')

    logging.info('loading word vectors to redis')

    for index, word in enumerate(model.vocab):
        save_word_vector(word, model[word])

        if index % 1000 == 0: logging.info(index)
Example #19
def test_prediction():
    model = word2vec.load(output_bin)
    indexes, metrics = model.cosine('the')
    assert indexes.shape == (10, )
    assert indexes.shape == metrics.shape

    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 10
    assert len(py_response[0]) == 2
Example #20
 def __init__(self, file_path):
     # w2v_file = os.path.join(base_path,  "vectors_poem.bin")
     self.model = word2vec.load(file_path)
     if 'unknown' not in self.model.vocab_hash:
         unknown_vec = np.random.uniform(-0.1, 0.1, size=128)  # generate 128 random values between -0.1 and 0.1
         self.model.vocab_hash['unknown'] = len(self.model.vocab)
         self.model.vectors = np.row_stack((self.model.vectors, unknown_vec))  # np.row_stack stacks the two arrays vertically
Example #21
def test_analogy():
    model = word2vec.load(output_txt)
    indexes, metrics = model.analogy(pos=["the", "the"], neg=["the"], n=20)
    assert indexes.shape == (20,)
    assert indexes.shape == metrics.shape

    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 20
    assert len(py_response[0]) == 2
Example #22
def test_closest():
    model = word2vec.load(output_txt)
    indexes, metrics = model.closest(model["the"], n=30)
    assert indexes.shape == (30,)
    assert indexes.shape == metrics.shape

    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 30
    assert len(py_response[0]) == 2
Example #23
def test_analogy():
    model = word2vec.load(output_txt)
    indexes, metrics = model.analogy(pos=['the', 'the'], neg=['the'], n=20)
    assert indexes.shape == (20, )
    assert indexes.shape == metrics.shape

    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 20
    assert len(py_response[0]) == 2
Example #24
    def __init__(self,\
                patchlength=3,\
                maxlength=700,\
                embedding_size=100,\
                num_verbs=2,\
                allinclude=False,\
                shorten=False,\
                shorten_front=False,\
                testflag=False):   # shorten / shorten_front: whether the preceding sentences are shortened and whether to output word-only sentences without tags

        # patchlength: number of extra preceding sentences fed in with each input.
        # maxlength: maximum length of each sentence (including the extra preceding sentences); longer sentences are dropped.
        # embedding_size: dimensionality of the word vectors.
        self.url = 'http://166.111.139.15:9000'
        self.shorten = shorten
        self.shorten_front = shorten_front  # whether the preceding sentences are shortened and whether to output word-only sentences without tags
        self.patchlength = patchlength
        self.maxlength = maxlength
        self.embedding_size = embedding_size
        self.num_verbs = num_verbs
        self.allinclude = allinclude
        self.verbtags = ['VB', 'VBZ', 'VBP', 'VBD', 'VBN', 'VBG']  # all verb POS tags
        self.model = word2vec.load('tense/combine100.bin')  # load the word-vector model
        self.tagdict = {')': 0}
        print('loaded model')
        self.oldqueue = Queue()
        self.testflag = testflag
        if testflag == False:
            self.resp = open(r'tense/resp2').readlines()
            self.readlength = len(self.resp)
            print('readlength', self.readlength)
            #            self.pointer=random.randint(0,self.readlength-1)
            self.pointer = 0
            print('pointer', self.pointer)
            for _ in range(self.patchlength):
                self.oldqueue.put(self.resp[self.pointer])
                self.pointer += 1
        else:
            for _ in range(self.patchlength):
                if shorten_front == True:
                    self.oldqueue.put(input())
                else:
                    self.oldqueue.put(self.parse(input()))

        self.cldict = dict()
        # load the text

        # load the lemma dictionary (maps verbs to their base forms)
        with open('tense/ldict2', 'rb') as f:
            self.ldict = pickle.load(f)
        with open('tense/tagdict', 'rb') as f:
            self.tagdict = pickle.load(f)
        with open('tense/cldict', 'rb') as f:
            self.cldictori = pickle.load(f)

        print('loaded lemma')
Example #25
def load_wv_model(word_vector_file, word_vector_type):
    if word_vector_type == WordVectorTypes.glove.name:
        #from glove import Glove
        glove_model = GloveWrapper.load(word_vector_file)
        wv_model = GloveWrapper(glove_model)
    else:
        import word2vec
        w2v_model = word2vec.load(word_vector_file)
        wv_model = W2VWrapper(w2v_model)
    return wv_model
Example #26
def test_similar():
    model = word2vec.load(output_bin)
    indexes, metrics = model.similar("the")
    assert indexes.shape == (10, )
    assert indexes.shape == metrics.shape

    py_response = model.generate_response(indexes, metrics).tolist()
    print(py_response)
    assert len(py_response) == 10
    assert len(py_response[0]) == 2
Example #27
def sentiment(test):
    model = CreateModel()
    #FitModel(model)
    Vector = word2vec.load("vectors.bin")
    print()
    vec=Vector[test]
    print(vec)
    t = model.predict(vec)
    print(t)
    return t
Example #28
 def __init__(self):
     self.word2vec_model = None
     self.cosine_similarity_map = {}
     self.word_vectors_map = {}
     #
     print 'Loading word vectors into the python model ...'
     start_time = time.time()
     self.word2vec_model = wv.load(cap.absolute_path+'./wordvectors/pubmed.bin')
     print 'The execution time for the loading was ', time.time()-start_time
     print 'word2vec_model.vocab', self.word2vec_model.vocab
Example #29
def main(em_file, em_result):
    '''
    Convert a word2vec embedding file to numpy arrays and save them as a compressed .npz file.
    '''
    em = word2vec.load(em_file)
    vec = em.vectors
    word2id = em.vocab_hash
    # d = dict(vector = vec, word2id = word2id)
    # t.save(d,em_result)
    np.savez_compressed(em_result, vector=vec, word2id=word2id)
Example #30
 def train(self):
     if not os.path.isfile(self.trained_fname):
         print("Previous training '" + self.trained_fname + "' not found. Begin training on input '" +
               self.input_fname + "' into " + str(self.train_dimensions) + " dimensions ...")
         self.trained_fname = 'src/resources/output' + str(self.train_dimensions)
         word2vec.word2vec(self.input_fname, self.trained_fname, size=self.train_dimensions)
     else:
         print("Trained data seems to exist at '" + self.trained_fname + "'")
     print("Loading training results...")
     self.model = word2vec.load(self.trained_fname, kind='bin')
Example #31
 def salt(self):
     print '\nfrom salt !!!!!', '\n'
     model = word2vec.load('./ActionsA/latents.bin')
     with open(self.conversation_filepath + 'conversation.csv') as fh:
         f = map(lambda x: x.split(","),
                 filter(lambda x: (x != ""),
                        fh.read().split("\n")))
     for each in f:
         print distance(each[0],
                        'very well said i bet but i need more beer', model)
 def __init__(self, originData=None, w2vModelPath="vectors.w2v", vectorSize=100):
     self.__model = None
     self.__vectorSize = vectorSize
     if type(originData) is str:
         word2vec.word2vec(
             originData, 
             w2vModelPath, 
             size=vectorSize, 
             verbose=True)
         self.__model = word2vec.load(w2vModelPath)
Example #33
 def __init__(self, coefficient: float = 0.4):
     self.coefficient = coefficient
     print("Starting loading model for word2vec...")
     self.model = load(filename_start)
     print("Successfully loaded!")
     self.tags = [
         "VERB", "NOUN", "ADV", "DET", "ADJ", "SCONJ", "INTJ", "X", "NUM",
         "PART", "ADP", "PRON", "X"
     ]
     self.commands = []
def extract(dim, data, trained):
    if not trained:
        word2vec.word2phrase(data, data+'-phrases', verbose=True)
        word2vec.word2vec(data+'-phrases', data+'.bin', size=dim, verbose=True)
    model = word2vec.load(data+'.bin')
    keys = model.vocab
    features = model.vectors
    dic = dict(zip(keys,features))
    print(len(dic))
    return dic
 def __init__(self, embeddings_path=None):
     file_path = Path + '/conf/system.properties'
     self.props = propertyUtil.parse(file_path)
     if embeddings_path is None:
         embeddings_path = self.props.get("EMBEDDING_PATH")
     model = word2vec.load(Path + '/' + embeddings_path)
     self.model = model
     self.embeddings = model.vectors.tolist()
     self.vocab = model.vocab.tolist()
     self.wordsMap = self._build(self.vocab)
Example #36
def load_wv_model(word_vector_file, word_vector_type):
    if word_vector_type == WordVectorTypes.glove.name:
        #from glove import Glove
        glove_model = GloveWrapper.load(word_vector_file)
        wv_model = GloveWrapper(glove_model)
    else: 
        import word2vec
        w2v_model = word2vec.load(word_vector_file)
        wv_model = W2VWrapper(w2v_model)
    return wv_model
Example #37
def test_similar():
    model = word2vec.load(output_bin)
    indexes, metrics = model.similar("the")
    assert indexes.shape == (10,)
    assert indexes.shape == metrics.shape

    py_response = model.generate_response(indexes, metrics).tolist()
    print(py_response)
    assert len(py_response) == 10
    assert len(py_response[0]) == 2
Example #38
def emb2npz(emb_file_path, emb_dict_path):
    """将txt格式的embedding转为字典格式, 并将<PAD>和<UNK>加入"""
    emb = word2vec.load(emb_file_path)
    vec = emb.vectors
    word2id = emb.vocab_hash
    word2id['<PAD>'] = len(word2id)
    pad_row = [0] * vec.shape[1]
    vec = np.row_stack((vec, pad_row))
    np.savez_compressed(emb_dict_path, vec=vec, word2id=word2id)
    print('word size: {}'.format(len(word2id)))
    print('emb shape: {}'.format(vec.shape))
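# Hedged round-trip sketch (not from the original source): read back the archive
# written by emb2npz. savez_compressed stores the dict as a 0-d object array, so
# it needs allow_pickle=True and .item() to get a plain dict again.
def _demo_load_emb_npz(emb_dict_path='emb.npz'):   # path is illustrative
    import numpy as np
    data = np.load(emb_dict_path, allow_pickle=True)
    vec = data['vec']                  # (vocab_size + 1, dim), last row is <PAD>
    word2id = data['word2id'].item()   # back to a plain dict
    return vec, word2id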
Example #39
def load_embedding(path):
    wv = word2vec.load(path)
    vocab = wv.vocab
    word2idx = {}
    word_embedding = wv.vectors
    for i in range(1, len(vocab) + 1):
        word2idx[vocab[i-1]] = i
    word2idx['<0>'] = 0
    word_zero = np.zeros(len(word_embedding[0]))
    word_embedding = np.vstack([word_zero, word_embedding])
    return word2idx, word_embedding
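# Hedged usage sketch (not from the original source): map a token list to ids with
# the returned dictionary; index 0 is the zero vector reserved above, so unknown
# tokens fall back to it. Path and tokens are illustrative.
def _demo_index_tokens(path='vectors.bin', tokens=('hello', 'world')):
    word2idx, word_embedding = load_embedding(path)
    ids = [word2idx.get(tok, 0) for tok in tokens]
    return word_embedding[ids]   # (len(tokens), dim)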
def build_glove_dic():
    glove_path = 'glove.6B.50d.txt'
    wv = word2vec.load(glove_path)
    vocab = wv.vocab
    sr_word2id = pd.Series(range(1, len(vocab) + 1), index=vocab)
    sr_word2id['<unk>'] = 0
    word_embedding = wv.vectors
    word_mean = np.mean(word_embedding, axis=0)
    word_embedding = np.vstack([word_mean, word_embedding])

    return sr_word2id, word_embedding
def word_to_vec(config_path: str, dimension: int, T = ""):
    folder = "data/word2vec"
    words_file = os.path.join(folder, f"{T}words-noisefiltered-{dimension}")
    phrases_file = os.path.join(folder, f"{T}phrases-noisefiltered-{dimension}")
    w2v_file = os.path.join(folder, f"{T}noisefiltered-{dimension}.bin")
    import word2vec

    word2vec.word2phrase(words_file, phrases_file, verbose=True)
    word2vec.word2vec(phrases_file, w2v_file, size=dimension)
    logging.info("wrote to " + w2v_file)
    return word2vec.load(w2v_file)
Example #42
def tran(path):
    model = word2vec.load(path)
    vocab, vectors = model.vocab, model.vectors
    print(path)
    print('shape of word embeddings: ', vectors.shape)

    new_path = path.split('.')[0] + '.txt'
    print('transform start...')
    with open(new_path, "w") as f:
        for word, vector in tqdm(zip(vocab, vectors)):
            f.write(str(word) + ' ' + ' '.join(map(str, vector)) + '\n')
    print('Transform Complete!\n')
def train():
    movie_set = cornell_movie_set.MovieSet()
    movie_set.parse_movie_set('train')
    word2vec.word2phrase('cornell_movie_train.txt',
                         'movie_phrases_train.txt',
                         verbose=True)
    word2vec.word2vec('movie_phrases_train.txt',
                      'movie_train.bin',
                      size=100,
                      verbose=True)
    model = word2vec.load('movie_train.bin')
    return model
Example #44
def create_model():
	in_file = open(sys.argv[1])
	out_file = open(sys.argv[2],"w")
	json_data = json.load(in_file)
	final_hash = {}
	model = word2vec.load(sys.argv[3])
	clusters = word2vec.load_clusters(sys.argv[4])

	for loc in json_data:
		count = 0
		keywords = []
		final_hash[loc] = {}
		final_hash[loc]["doc_length"] = json_data[loc]["len"]	
		final_hash[loc]["keywords"] = []
		final_hash[loc]["centroids"] = []
		word_vectors = {}	#"word" => [vector]
		word_clusters = {}	#"cluster_no" => [words]
		cluster_centroids = {}
		for word in json_data[loc]["keywords"]:
			if len(word.split()) > 1:
				continue
			count += 1
			try:
				vec = model[word]
				cluster_no = clusters[word]
			except KeyError:
				#print("No entry in word2vec for " + word)
				continue
			word_vectors[word] = vec
			
			if cluster_no not in word_clusters:
				word_clusters[cluster_no] = []
				cluster_centroids[cluster_no] = len(vec)*[0.0]
			word_clusters[cluster_no].append(word)
			for i in range(len(vec)):
				cluster_centroids[cluster_no][i] += word_vectors[word][i]
		for cluster_no in word_clusters:
			cluster_len = len(word_clusters[cluster_no])
			for i in range(len(cluster_centroids[cluster_no])):
				cluster_centroids[cluster_no][i] = cluster_centroids[cluster_no][i] / cluster_len
	
		for cluster_no in word_clusters:
			keys = []
			for word in word_clusters[cluster_no]:
				keys.append((word,json_data[loc]["keywords"][word]))
			final_hash[loc]["keywords"].append(keys)
			final_hash[loc]["centroids"].append(cluster_centroids[cluster_no])		
		#print(" Total keywords in " + loc + " : " + str(count))
		#print(" Total word vectors in " + loc + " : " + str(len(word_vectors)))	
		

	
	json.dump(final_hash,out_file)
Example #45
def load_data():
    papers = []
    filename = sys.argv[1]
    with open(filename, 'r') as f:
        for line in f:
            dic = json.loads(line)
            papers.append(dic)

    word_embedding = word2vec.load(
        'word2vec_result.bin')  # load the pre-trained word2vec embedding
    print('load data finished')
    return papers, word_embedding
Example #46
def test_model_with_clusters():
    clusters = word2vec.load_clusters(output_clusters)
    model = word2vec.load(output_txt)
    assert clusters.vocab.shape == model.vocab.shape

    model.clusters = clusters
    indexes, metrics = model.analogy(pos=["the", "the"], neg=["the"], n=30)
    assert indexes.shape == (30, )
    assert indexes.shape == metrics.shape

    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 30
    assert len(py_response[0]) == 3
Example #47
def init(path_to_we_model, path_to_relations):
    st = time.time()
    we_model = word2vec.load(path_to_we_model)
    et = time.time()
    we_loading_time = et - st
    st = time.time()
    relational_embedding = composition.compose_dataset(path_to_relations, we_model)
    et = time.time()
    relemb_build_time = et - st
    api = Fabric(we_model, relational_embedding, path_to_relations)
    print("Time to load WE model: " + str(we_loading_time))
    print("Time to build relemb: " + str(relemb_build_time))
    return api
Example #48
def getanology(second, first, third):
	import word2vec
	# Import the word2vec binary file: dataset
	model = word2vec.load('/export/home/sysadmin/text8.bin')

	# We can do simple queries to retrieve words related to "word"

	indexes, metrics = model.analogy(pos=[first, third], neg=[second], n=10)

	#model.vocab[indexes]
	related_word = model.vocab[indexes[0]]

	return related_word
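# Hedged usage sketch (not from the original source): the classic analogy
# first - second + third, e.g. king - man + woman, assuming all three words are
# in the text8 model's vocabulary. Note the argument order (second, first, third).
def _demo_getanology():
    print(getanology('man', 'king', 'woman'))   # expected to be close to 'queen'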
Example #49
def embedding_transform(emb_file):
    model = word2vec.load(emb_file)
    vocab, vectors = model.vocab, model.vectors
    print(emb_file)
    print('shape of word embeddings: {0}'.format(vectors.shape))

    new_file = emb_file.split('.')[0] + '_.txt'
    print('Transforming.....')

    with open(new_file, 'w') as f:
        for word, vec in zip(vocab, vectors):
            f.write(str(word) + ' ' + ' '.join(map(str, vec)) + '\n')
    print('Transform finished.')
Example #50
def test_model_with_clusters():
    clusters = word2vec.load_clusters(output_clusters)
    model = word2vec.load(output_bin)
    assert clusters.vocab.shape == model.vocab.shape

    model.clusters = clusters
    indexes, metrics = model.analogy(pos=["the", "the"], neg=["the"], n=30)
    assert indexes.shape == (30,)
    assert indexes.shape == metrics.shape

    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 30
    assert len(py_response[0]) == 3
Example #51
def cosine_all():
    '''
        Use the model to compute cosine-similarity neighbours for every name in the book.
    '''
    _name_list = name_list[:]
    _name_list = map(lambda _: _.encode('utf-8'), _name_list)
    model = word2vec.load('../tmp/book.bin')
    _ret = {}
    for _ in _name_list:
        try:
            _ret.update(model.cosine(_, n=10))
        except:
            print _ + ' not found'
    return _ret
def create_voabulary_labelO():
    model = word2vec.load('zhihu-word2vec-multilabel.bin-100', kind='bin') #zhihu-word2vec.bin-100
    count=0
    vocabulary_word2index_label={}
    vocabulary_index2word_label={}
    label_unique={}
    for i,vocab in enumerate(model.vocab):
        if '__label__' in vocab:  #'__label__-2051131023989903826
            label=vocab[vocab.index('__label__')+len('__label__'):]
            if label_unique.get(label,None) is None: # if this label has not been seen before, add it to the dict
                vocabulary_word2index_label[label]=count
                vocabulary_index2word_label[count]=label #ADD
                count=count+1
                label_unique[label]=label
    return vocabulary_word2index_label,vocabulary_index2word_label
Example #53
def matcher(line, context):
    model = word2vec.load('../lib/word2vec/vectors.bin')
    #clusters = word2vec.load_clusters('../lib/word2vec/text8-clusters.txt')
    a = numpy.loadtxt('latent_model.txt')
    with open('latent_tags.txt') as f:
        b = f.readlines()


    a = model['sports']
    print a
    b = model['sporting']
    print b
    result = 1 - spatial.distance.cosine(a, b)
    print result
    return 'jankiap50'
Example #54
def genDB():
  global model
  con, cur = createDB()
  for corp in corpus:
    f, TBNAME = corp
    print f, TBNAME
    model = word2vec.load(word2vec_model[TBNAME])
    d, m = readPatterns(f)
    json.dump(m, open('syn_%s.json'%TBNAME,'w+'))

    for word, data in d.iteritems():
      #print '.', word, TBNAME
      cur.execute("""INSERT INTO %s (word, data) VALUES('%s','%s');"""%(TBNAME, word.replace("'","''"), json.dumps(data).replace("'","''")))
      con.commit()
    #cur.execute("SELECT * FROM test;")
  cur.close()
  con.close()
def calculate_similarity(query, text, model_path):
    embeddingSize = 300  
    query_embedding =np.zeros((1,embeddingSize))  
    stop = stopwords.get_stopwords('english')
    model = word2vec.load(model_path)
    query_embedding = get_embedding(query, model, stop, query_embedding)
    
    nword=0
    score = 0.0
    for word in nltk.tokenize.word_tokenize(text.decode('utf8')):
        if word in model and word not in stop:
            nword += 1
            wordNorm = linalg.norm(model[word])
            score += np.dot(query_embedding, model[word]) / wordNorm
    
    if nword!=0:
        score = score / nword
    print score[0]
    return score[0]
def inject_word2vec_embeddings_old(session, word2vec_path, input_size, dict_dir, source_vocab_size, target_vocab_size):
    # (100000, 300)
    word2vec_model = word2vec.load(word2vec_path, encoding="latin-1")  # automatically detects format
    print("w2v model created!")

    source_vocab_path, target_vocab_path = get_source_target_vocab_path(dict_dir, source_vocab_size, target_vocab_size)
    w2v_vectors_source = get_w2v_pretrained_vectors(word2vec_model, source_vocab_path, source_vocab_size, input_size)
    w2v_vectors_target = get_w2v_pretrained_vectors(word2vec_model, target_vocab_path, target_vocab_size, input_size)

    print("pre-trained source shape " + str(w2v_vectors_source.shape))
    print(w2v_vectors_source)
    print(w2v_vectors_source.shape)  # (vocab_size, embedding_dim)
    with tf.variable_scope("embedding_attention_seq2seq"):
        with tf.variable_scope("RNN"):
            with tf.variable_scope("EmbeddingWrapper", reuse=True):
                # 1) getting Variable containing embeddings
                embedding = vs.get_variable("embedding", w2v_vectors_source.shape, trainable=False)

                # 2) using placeholder to assign embedding
                X = tf.placeholder(tf.float32, shape=w2v_vectors_source.shape)  # model.vectors.shape
                set_x = embedding.assign(X)
                session.run(tf.initialize_all_variables())
                session.run(set_x, feed_dict={X: w2v_vectors_source})

                v = session.run(embedding)
                print("After pre-trained")
                print(v)

    # embedding_attention_decoder  | embedding_attention_seq2seq/embedding_attention_decoder/embedding:0
    with tf.variable_scope("embedding_attention_seq2seq"):
        with tf.variable_scope("embedding_attention_decoder", reuse=True):
            decoder_embedding = vs.get_variable("embedding", w2v_vectors_target.shape, trainable=False)
            # 2) using placeholder to assign embedding
            X = tf.placeholder(tf.float32, shape=w2v_vectors_target.shape)  # model.vectors.shape
            set_x = decoder_embedding.assign(X)

            session.run(tf.initialize_all_variables())
            session.run(set_x, feed_dict={X: w2v_vectors_target})

            v = session.run(decoder_embedding)
            print("After pre-trained")
            print(v)
def load_model(desc, tfidf_doc='split_plot', tfidf_wthr=1):
    """Load appropriate model based on descriptor type.
    """

    model = None
    if desc.startswith('tfidf'):
        model = encode_tfidf_model(tfidf_doc, tfidf_wthr)
        desc = desc + '-' + tfidf_doc + '-' + str(tfidf_wthr)

    elif desc == 'word2vec':
        model = w2v.load('models/movie_plots_1364.d-300.mc1.w2v', kind='bin')

    elif desc == 'skipthought':
        model = skipthoughts.load_model()

    elif desc == 'vis-text-embed':
        raise ValueError('Visual-Text embeddings are not yet supported.')
    #     model = VisTextEncoder()

    return model, desc
def main():
    # model = word2vec.load('/home/lr/sasano/corpus/word2vec/jawiki-mecab.bin')
    model = word2vec.load('/home/lr/tsakaki/work/word2vec/w2v_jawiki_latest_mecab_baseform_s300_w500.bin')

    for line in sys.stdin:
        line = line.rstrip()
        lst = line.split('\t')
        arg_pred0 = lst[0]
        arg_pred1 = lst[1]
        clas = lst[2]
        clas_detail = lst[3] if len(lst) >= 4 else ""

        arg = cut_arg_pred(arg_pred0)[0]
        pred0 = cut_arg_pred(arg_pred0)[1]
        pred1 = cut_arg_pred(arg_pred1)[1]
        ans_vector = []
        try:
            ans_vector.extend(model[arg])
            pred0_vec = model[pred0]
            pred1_vec = model[pred1]
            diff_vec = []
            for i in xrange(0, len(pred0_vec)):
                diff_vec.append(pred0_vec[i] - pred1_vec[i])

            ans_vector.extend(diff_vec)
        except:
            continue

        label = 1 if (clas == "反義" and clas_detail == "属性反義") else -1

        # print arg
        # print pred0
        # print pred1
        # print label
        # sys.exit(1)
        sys.stdout.write("%d " % label)
        for ind, val in enumerate(ans_vector):
            sys.stdout.write("%d:%f " % (ind+1, val))
        print

    return
def active_learn_main(engine, initial_term, user_id, concept_id=False):
    '''
        engine is the SQLAlchemy engine used to query the concept tables
        initial_term is the seed term selected by the user
        user_id is the id of the user running the active-learning session
        concept_id is the id of an existing concept, or False to start from scratch
    '''
    
    #user will select a term and then the term will be run through the word2vec model to come up with similar terms
    #if it is an existing concept pull the existing data from db else start from scratch
    if concept_id:
        term_list = engine.execute(select([ConceptTerms.c.term]).where(ConceptTerms.c.concept_id
                                                            == concept_id))
        term_exc = engine.execute(select([ConceptTerms_reject.c.term]).where(ConceptTerms_reject.c.concept_id
                                                            == concept_id))
        pred_list = engine.execute(select([ConceptPredictors.c.predictor]).where(ConceptPredictors.c.concept_id
                                                            == concept_id))
        pred_exc = engine.execute(select([ConceptPredictorsReject.c.predictor]).where(ConceptPredictorsReject.c.concept_id
                                                            == concept_id))
    else:
        term_list = set([initial_term])
        term_exc = set()
        pred_list = set()
        pred_exc = set()


    #load in model
    #model = word2vec.load('/groups/clinicaltrials/clinicaltrials/data/criteria.bin')
    #clusters = word2vec.load_clusters('/groups/clinicaltrials/clinicaltrials/data/criteria-clusters.txt')
    model = word2vec.load('../data/criteria.bin')
    clusters = word2vec.load_clusters('../data/criteria-clusters.txt')

    # add clusters to model
    model.clusters = clusters
    
    #add skip terms to term_exc and pred_exc
    skip_term, skip_pred = skip_terms()
    term_exc.update(skip_term)
    pred_exc.update(skip_pred)

    term_list, pred_list = run_active_learning(term_list, term_exc, pred_list, pred_exc, engine, concept_id, user_id, model)