Code example #1
File: w2v_util.py  Project: huji-nlp/ucca
def load_word2vec(w2v):
    if isinstance(w2v, str):
        print("Loading word vectors from '%s'..." % w2v, flush=True)
        try:
            w2v = Word2Vec.load_word2vec_format(w2v)
        except ValueError:
            w2v = Word2Vec.load_word2vec_format(w2v, binary=True)
    return w2v
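Usage note (not part of the project above): load_word2vec accepts either a path or an already-loaded model. As a minimal sketch, assuming gensim is installed and a hypothetical text-format file vectors.txt, note that since gensim 1.0 the loader lives on KeyedVectors rather than on the Word2Vec class:

from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format("vectors.txt")  # hypothetical path; pass binary=True for .bin files
print(vectors.most_similar("king", topn=3))  # nearest neighbours of an (assumed) in-vocabulary word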
Code example #2
class GunicornApplication(BaseApplication):

    parser = SafeConfigParser()
    with codecs.open('config.ini', 'r', encoding='utf-8') as f:
        parser.readfp(f)

    #Mandatory Loading for standard disambiguation
    wiki_w2v_embeddings_file = parser.get('Word2VecRest',
                                          'embeddings_w2v_wikipedia')
    w2vmodel_dbpedia = Word2Vec.load_word2vec_format(wiki_w2v_embeddings_file,
                                                     binary=True)

    #If no doc2vec embeddings are loaded (due to memory constraints), we always return 0 as cosine similarity
    wiki_d2v_embeddings_file = parser.get('Word2VecRest',
                                          'embeddings_d2v_wikipedia')
    if os.path.isfile(wiki_d2v_embeddings_file):
        d2vmodel = Doc2Vec.load(wiki_d2v_embeddings_file)
    else:
        d2vmodel = None

    #Optional Embeddings
    biomed_w2v_embedings_file = parser.get('Word2VecRest',
                                           'embeddings_w2v_calbc')
    if os.path.isfile(biomed_w2v_embedings_file):
        w2vmodel_biomed = Word2Vec.load_word2vec_format(
            biomed_w2v_embedings_file, binary=True)

    wiki_d2v_german_embeddings = parser.get('Word2VecRest',
                                            'embeddings_d2v_wikipedia_german')
    if os.path.isfile(wiki_d2v_german_embeddings):
        d2vmodel_german = Doc2Vec.load(wiki_d2v_german_embeddings)

    def __init__(self, wsgi_app, port=5000):
        self.options = {
            'bind': "127.0.0.1:{port}".format(port=port),
            'workers': 3,
            'preload_app': True,
            'timeout': 200,
        }
        self.application = wsgi_app

        super(GunicornApplication, self).__init__()

    def load_config(self):
        config = dict([(key, value) for key, value in self.options.iteritems()
                       if key in self.cfg.settings and value is not None])
        for key, value in config.iteritems():
            self.cfg.set(key.lower(), value)

    def load(self):
        return self.application
Code example #3
    def calculate_sim_without_tag(self, load_model, ofname, write_flag=True):
        # Load the specified w2v model
        w2v_model = Word2Vec.load_word2vec_format(r'%s/%s' % (macro.MODELS_DIR, load_model), binary=True)  # C format
        # Read in the evaluation word-pair corpus
        id_list, word1_list, word2_list, headline = utils.read2wordlist(self.f_tuple_list, mode='no_tag')
        # New header line
        new_headline = headline.strip() + '\tPrediction\n'
        # Compute similarities
        auto_sim_list = []
        for w1, w2 in zip(word1_list, word2_list):
            try:
                auto_sim = w2v_model.similarity(w1, w2)  # cosine similarity of the vectors, in [-1, 1]
                auto_sim = utils.convert_sim(auto_sim)  # map the cosine similarity onto a 1-10 score
                print '%-10s\t%-10s\t%-10s' % (w1, w2, auto_sim)
            except:
                auto_sim = 1  # out-of-vocabulary word; assign the int 1 to distinguish it from a real score of 1.0
                print '%-10s\t%-10s\t%-10s' % (w1, w2, '______Not Found______')
            auto_sim_list.append(auto_sim)

        # Optionally write the similarity results to a file
        if write_flag:
            print 'write result to file...'
            with open('%s/%s' % (macro.RESULTS_DIR, ofname), 'w') as fw:
                fw.write(new_headline)
                for w1, w2, auto_sim in zip(word1_list, word2_list, auto_sim_list):
                    fw.write('%s\t%s\t%s\n' % (w1, w2, auto_sim))

        return word1_list, word2_list, auto_sim_list, new_headline
Code example #4
File: wordserve.py  Project: ravenscroftj/sapienta3
def main():
    global _wv
    
    import argparse
    
    ap = argparse.ArgumentParser()
    
    ap.add_argument("vector_file", action="store", help="Path to the word2vec file to serve")
    ap.add_argument("-t", "--testclient", dest="test", action="store_true", help="If set, runs the test client against an existing server instance")
    ap.add_argument("--host", dest="host", action="store", default="localhost", help="The host to bind to, defaults to localhost")
    ap.add_argument("-p", "--port", dest="port", action="store", help="The port to serve on, defaults to 5000", default=5000)
    ap.add_argument("-d", "--debug", dest="debug", action="store_true", help="If true, provides debug output")
    
    args = ap.parse_args()
    
    if args.test:
        print("Running test client against http://{}:{}".format(args.host,args.port))
        
        tc = WordservClient(args.host,args.port)
        
        for i in range(0,500):
            vecs = tc.vector(["hello","world"])
        
        sys.exit(1)
    
    print("Loading word vector - this might take a while...")
    
    _wv = Word2Vec.load_word2vec_format(args.vector_file, binary=True)
    
    app.run(port=args.port, debug=args.debug)
Code example #5
File: features.py  Project: kennyjoseph/twitter_dm
def get_init_data(model_file, ark_file):
    from gensim.models.word2vec import Word2Vec

    model = Word2Vec.load_word2vec_format(model_file, binary=False)
    ark_clusters = get_ark_clusters(ark_file)

    files = [resource_filename('twitter_dm', 'data/identity_dictionaries/identity/'+x) for x in
             resource_listdir('twitter_dm', 'data/identity_dictionaries/identity/')]

    files += [resource_filename('twitter_dm', 'data/identity_dictionaries/non_identity_words/'+x) for x in
             resource_listdir('twitter_dm', 'data/identity_dictionaries/non_identity_words/')]

    all_dictionaries = Dictionaries(list_of_files=files)
    twit_sets = []
    stopwords = get_stopwords()

    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(None)

    for v in [10, 100, 1000, 10000,50000]:
        twit_id = set(tw_distant_supervision_identity_dat[
                      (tw_distant_supervision_identity_dat.tot > v)].term.values)
        twit_id = {t for t in twit_id if t not in stopwords and t.replace(" person","") not in stopwords}
        twit_sets.append([twit_id,"twit_identities_"+str(v)])

    twit_sets.append([EXPERT_NON_IDENTITIES,"expert_non"])
    twit_sets.append([stopwords,"stopword"])

    return model, all_dictionaries, ark_clusters, [t[0] for t in twit_sets],[t[1] for t in twit_sets]
Code example #6
def return_data(data_type, embed_dim=50): 
    """Return the data specified by the inputted `data_type`.

    This function is built to allow for easier calls for the data from scripts
    external to this one. 

    Args: 
    ----
        data_type: str
        embed_dim (optional): int

    Return: varied
    """

    if data_type == "word_embedding": 
        embedding_fp = 'data/word_embeddings/glove.6B.{}d.txt'.format(embed_dim)
        wrd_embedding = Word2Vec.load_word2vec_format(embedding_fp, binary=False)
        return wrd_embedding
    elif data_type == "articles": 
        body_fp = 'data/articles/twenty_newsgroups/bodies.pkl'
        headline_fp = 'data/articles/twenty_newsgroups/headlines.pkl'

        with open(body_fp, 'rb') as f: 
            bodies = pickle.load(f)
        with open(headline_fp, 'rb') as f: 
            headlines = pickle.load(f)
        return bodies, headlines
    else: 
        raise Exception('Invalid data type requested!')
Code example #7
File: word2vec.py  Project: PenKeyBoy/comm-code
    def __init__(self,args):
        self.sentences = []
        self.sentence = []
        self.vocab = set()
        if args.restore is None:
            with codecs.open(args.file,'r',encoding='utf-8') as fr:
                for count,line in enumerate(fr):
                    if count > 1 and line.startswith('-DOC'):
                        self.sentences.append(self.sentence)
                        self.sentence = []
                    else:
                        try:
                            word,tag = line.rstrip('\r\n').split()
                        except Exception as e:
                            print("no enough element to unpack line {} is: {} with error:{}".format(count+1,line,e))
                        else:
                            for char in word:
                                self.sentence.append(char)
                                self.vocab.add(char)
            #pdb.set_trace()
            self.wordvec = Word2Vec(sentences=self.sentences,size=args.vector_size,window=args.window,min_count=args.min_count,max_vocab_size=len(self.vocab),
                               workers=args.workers,sg=args.sg,batch_words=args.batch_size)

        else:
            self.wordvec = Word2Vec.load_word2vec_format(args.restore)
        self.randvec = RandomVec(args.vector_size)
Code example #8
def return_data(data_type, embed_dim=50):
    """Return the data specified by the inputted `data_type`.

    This function is built to allow for easier calls for the data from scripts
    external to this one. 

    Args: 
    ----
        data_type: str
        embed_dim (optional): int

    Return: varied
    """

    if data_type == "word_embedding":
        embedding_fp = 'data/word_embeddings/glove.6B.{}d.txt'.format(
            embed_dim)
        wrd_embedding = Word2Vec.load_word2vec_format(embedding_fp,
                                                      binary=False)
        return wrd_embedding
    elif data_type == "articles":
        body_fp = 'data/articles/twenty_newsgroups/bodies.pkl'
        headline_fp = 'data/articles/twenty_newsgroups/headlines.pkl'

        with open(body_fp, 'rb') as f:
            bodies = pickle.load(f)
        with open(headline_fp, 'rb') as f:
            headlines = pickle.load(f)
        return bodies, headlines
    else:
        raise Exception('Invalid data type requested!')
Code example #9
 def loadW2VFloat(self, emb_path, type="text"):
     #print("Loading W2V data...")
     num_keys = 0
     if type=="textgz":
         # this seems faster than gensim non-binary load
         for line in gzip.open(emb_path):
             l = line.strip().split()
             st=l[0].lower()
             self.pre_emb[st]=np.asarray(l[1:], dtype=np.float32)
         num_keys=len(self.pre_emb)
     if type=="text":
         # this seems faster than gensim non-binary load
         for line in open(emb_path):
             l = line.strip().split()
             st,emb=l[0].lower(),[]
             for val in l[1:]:
                 try:
                     v = float(val)
                     emb.append(v)
                 except:
                     emb.append(0)
             self.pre_emb[st]=np.asarray(emb)
         num_keys=len(self.pre_emb)
     else:
         self.pre_emb = Word2Vec.load_word2vec_format(emb_path,binary=True)
         self.pre_emb.init_sims(replace=True)
         num_keys=len(self.pre_emb.vocab)
     #print("loaded word2vec len ", num_keys)
     gc.collect()
Code example #10
 def mergeW2V(self, emb_path, type="bin"):
     print("Loading W2V data...")
     num_keys = 0
     if type == "textgz":
         # this seems faster than gensim non-binary load
         for line in gzip.open(emb_path):
             l = line.strip().split()
             st = l[0].lower()
             self.pre_emb[st] = np.asarray(l[1:])
         num_keys = len(self.pre_emb)
     if type == "text":
         # this seems faster than gensim non-binary load
         i = 0
         for line in open(emb_path):
             l = line.strip().split()
             st = l[0].lower()
             if st in self.pre_emb:
                 continue
             else:
                 i += 1
                 #if i % 10000 == 0:
                 #    print st
                 self.pre_emb[st] = np.asarray(l[1:])
         num_keys = len(self.pre_emb)
         print (num_keys)
     else:
         self.pre_emb = Word2Vec.load_word2vec_format(emb_path, binary=True)
         self.pre_emb.init_sims(replace=True)
         num_keys = len(self.pre_emb.vocab)
     print("loaded word2vec len ", num_keys)
     gc.collect()
Code example #11
    def from_word2vec_model(cls, word2vec_model):
        """
        WARNING: `gensim` is required to use this function!

        Load a word2vec vector model.
        :param word2vec_model: path to word2vec model or a fitted word2vec model
        :return: a `Vectors` object
        """
        try:
            import gensim # gensim version hack
            if (int(gensim.__version__.split('.')[0]) < 1):
                from gensim.models.word2vec import Word2Vec as Word2VecLoader
            else:
                from gensim.models import KeyedVectors as Word2VecLoader
        except ImportError as ex:
            logging.error('Gensim is required to use this method!')
            raise ex

        if (isinstance(word2vec_model, str)):
            model = Word2VecLoader.load_word2vec_format(word2vec_model, binary=word2vec_model.endswith('bin'))
        else:
            model = word2vec_model

        vocab = model.vocab.keys()

        vectors = {}

        dims = len(model[next(iter(vocab))])  # vector dimensionality

        dimension_names = ['f%02d' % i for i in range(dims)]
        for word in vocab:
            vectors[word] = zip(dimension_names, model[word])

        return Vectors(vectors)
Code example #12
def load_word2vec(filename):
    global sym
    np.random.seed(1337)
    sym = 2 * (np.random.rand(300) - 0.5)
    embedding = w2v.load_word2vec_format(filename, binary=True)
    print('Loaded word embedding')
    return embedding
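Usage sketch (not from the project above; path and token are illustrative only): the module-level sym vector appears to serve as a fixed random fallback for out-of-vocabulary tokens, e.g.

embedding = load_word2vec("GoogleNews-vectors-negative300.bin")  # hypothetical path
vec = embedding["dog"] if "dog" in embedding else sym  # fall back to the fixed random 300-d vector for OOV tokens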
Code example #13
 def __init__(self, args):
     print('processing corpus')
     if args.restore is None:
         corpus = open(args.corpus, 'r').read().lower().split()
         sentences = []
         sentence = []
         length = 0
         for word in corpus:
             sentence.append(word)
             length += 1
             if length == args.sentence_length:
                 sentences.append(sentence)
                 sentence = []
                 length = 0
         if length != 0:
             sentences.append(sentence)
         print('training')
         self.wvec_model = Word2Vec(sentences=sentences,
                                    size=args.dimension,
                                    window=args.window,
                                    workers=args.workers,
                                    sg=args.sg,
                                    batch_words=args.batch_size,
                                    min_count=1,
                                    max_vocab_size=args.vocab_size)
     else:
         self.wvec_model = Word2Vec.load_word2vec_format(args.restore,
                                                         binary=True)
     self.rand_model = RandomVec(args.dimension)
Code example #14
 def __init__(self, args):
     print('processing corpus')
     if args.restore is None:
         sentences = []
         with codecs.open(args.corpus, 'r', 'utf-8') as file:
             for line in tqdm(file):
                 line = line.strip().lower()
                 sentence = line.split(u' ')
                 #print(sentence)
                 sentences.append(sentence)
         #bigram_transformer = Phrases(sentences)
         #print(bigram_transformer[sentences])
         print('start to train word2vec embeddings')
         self.wvec_model = Word2Vec(sentences=sentences,
                                    size=args.dimension,
                                    window=args.window,
                                    workers=args.workers,
                                    sg=args.sg,
                                    batch_words=args.batch_size,
                                    min_count=1
                                    #max_vocab_size=args.vocab_size
                                    )
     else:
         self.wvec_model = Word2Vec.load_word2vec_format(args.restore,
                                                         binary=True)
     self.rand_model = RandomVec(args.dimension)
Code example #15
def pretrained_embedding(vocab_processor):
    """Creates word embedding matrix from GoogleNews w2v.

    Requires the Google News w2v model downloaded from https://code.google.com/archive/p/word2vec/ and placed in ./data
    """
    if not os.path.exists('data/GoogleNews-vectors-negative300.bin'):
        print(
            'You need to have google news w2v downloaded (from https://code.google.com/archive/p/word2vec/) and placed in ./data/GoogleNews-vectors-negative300.bin'
        )
        sys.exit()
    w2v = Word2Vec.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin', binary=True)
    w2v.init_sims(replace=True)
    gc.collect()
    words = [
        vocab_processor.vocabulary_.reverse(i)
        for i in range(vocab_processor.vocabulary_.__len__())
    ]

    W_embeddings = []
    for w in words:
        try:
            W_embeddings.append(w2v.__getitem__(w))
        except KeyError:
            W_embeddings.append(np.random.uniform(
                -0.1, 0.1,
                300))  # Bounds chosen so the variance roughly matches that of the Google vectors
    del w2v
    gc.collect()
    W_embeddings = np.array(W_embeddings)
    return W_embeddings
Code example #16
def return_data(data_type, embed_dim=50): 
    """Return the data specified by the inputted `data_type`.

    This function is built to allow for easier calls for the data from scripts
    external to this one. 

    Args: 
    ----
        data_type: str
        embed_dim (optional): int

    Return: varied
    """

    if data_type == "word_embedding": 
        embedding_fp = 'data/word_embeddings/glove.6B.{}d.txt'.format(embed_dim)
        wrd_embedding = Word2Vec.load_word2vec_format(embedding_fp, binary=False)
        return wrd_embedding
    elif data_type == "reviews": 
        reviews_fp = 'work/reviews/amazon/filtered_tokenized_reviews.pkl'
        ratios_fp = 'work/reviews/amazon/filtered_ratios.npy'

        with open(reviews_fp, 'rb') as f: 
            reviews = pickle.load(f)
        ratios = np.load(ratios_fp)
        return reviews, ratios 
    else: 
        raise Exception('Invalid data type requested!')
Code example #17
    def loadEmbeddings(self, filepath, data_path, vocab_size, binary_val):
        if not os.path.exists(data_path):
            os.makedirs(data_path)
        embed_short = os.path.normpath("%s/embed.dat" % data_path)
        if not os.path.exists(embed_short):
            print("Caching word embeddings in memmapped format...")
            print(binary_val, filepath)
            wv = Word2Vec.load_word2vec_format("%s" % (filepath),
                                               binary=binary_val)
            fp = np.memmap(embed_short,
                           dtype=np.double,
                           mode='w+',
                           shape=wv.syn0.shape)
            fp[:] = wv.syn0[:]
            with open(os.path.normpath("%s/embed.vocab" % data_path),
                      "w",
                      encoding='utf-8') as fp:
                for _, w in sorted(
                    (voc.index, word) for word, voc in wv.vocab.items()):
                    fp.write('%s\n' % w)
            del fp, wv

        self.W = np.memmap(os.path.normpath("%s/embed.dat" % data_path),
                           dtype=np.double,
                           mode="r",
                           shape=(vocab_size, self.embedding_size))
        with codecs.open(os.path.normpath("%s/embed.vocab" % data_path), 'r',
                         'utf-8') as f:
            vocab_list = [x.strip() for x in f.readlines()]
        self.vocab_dict = {w: k for k, w in enumerate(vocab_list)}
Code example #18
    def _update_word_vec_dict(self):
        '''Updates the word vector dictionary

		'''
        glove_file = 'data/glove_data/glove.6B.' + str(self.W2V_DIM) + 'd.txt'
        glove = Word2Vec.load_word2vec_format(glove_file)
        self.glove_dict = glove
Code example #19
def get_init_data(model_file, ark_file, dict_filepath, twit_dict_file):

    model = Word2Vec.load_word2vec_format(model_file, binary=False)
    ark_clusters = get_ark_clusters(ark_file)
    all_dictionaries = Dictionaries(dict_filepath)
    twit_sets = []
    stopwords = get_stopwords()
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(
        twit_dict_file)

    for v in [10, 100, 1000, 10000, 50000]:
        twit_id = set(tw_distant_supervision_identity_dat[(
            tw_distant_supervision_identity_dat.tot > v)].term.values)
        twit_id = {
            t
            for t in twit_id
            if t not in stopwords and t.replace(" person", "") not in stopwords
        }
        twit_sets.append([twit_id, "twit_identities_" + str(v)])

    twit_sets.append([EXPERT_NON_IDENTITIES, "expert_non"])
    twit_sets.append([stopwords, "stopword"])

    return model, all_dictionaries, ark_clusters, [t[0] for t in twit_sets], [
        t[1] for t in twit_sets
    ]
Code example #20
def load_model(model_type, json_dir="jsons"):

    config = load_config(json_dir)
    model_dir = config["model_dir"][model_type]

    if model_type == 'english1000':
        print('\n Loaded english1000! \n')
        return SemanticModel.load(os.path.join(model_dir, "english1000sm.hf5"))
    elif model_type == 'word2vec':
        modelfile = os.path.join(model_dir,
                                 "GoogleNews-vectors-negative300.bin")
        norm = False
        from gensim.models.word2vec import Word2Vec
        model = Word2Vec.load_word2vec_format(modelfile,
                                              binary=True,
                                              max_vocab_size=10000)
        usevocab = set(
            cPickle.load(
                open(
                    "/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat-vocab"
                )))
        vocab, vocinds = zip(*[(w, model.vocab[w].index) for w in model.vocab])
        #w2v_usevocab = [(w,val.index) for w,val in w2v.vocab.items() if w in usevocab]
        #srtvocab = [w for w,voc in sorted(w2v.vocab.items(), key=lambda item:item[1].index)]
        #srtvocab,srtinds = zip(*sorted(w2v_usevocab, key=lambda item:item[1]))
        if norm:
            data = model.syn0norm[list(vocinds)]
        else:
            data = model.syn0[list(vocinds)]
        return SemanticModel(data.T, vocab)
    else:
        raise ValueError('Unknown model type: %s' % model_type)
Code example #21
File: word2vec_tool.py  Project: hongleifu/zhihu
 def load(self):
     self.word_vect_modle = Word2Vec.load_word2vec_format(
         self.vec_model_file, binary=False)
     print('load word2vector model done!')
     self.max_word_len = len(self.word_vect_modle.wv.vocab)
     print('all word num is:', self.max_word_len)
     self.init_data()
Code example #22
def load_word_vec(path, vocab):
    model = Word2Vec.load_word2vec_format(path, binary=True)
    word_vecs = {}
    for word in vocab:
        if word in model:
            word_vecs[word] = model[word]
    return word_vecs, model.vector_size
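Usage sketch (the vector file path and the vocabulary are hypothetical): the helper keeps only the vectors for words that actually occur in the task vocabulary and also reports the embedding dimensionality:

vocab = {"movie", "film", "actor"}  # illustrative task vocabulary
word_vecs, dim = load_word_vec("GoogleNews-vectors-negative300.bin", vocab)  # hypothetical path
print(len(word_vecs), dim)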
Code example #23
File: test.py  Project: ZheweiMedia/DL_experiments
def wordEmbedding():
    """
    This code is from 
    http://vene.ro/blog/word-movers-distance-in-python.html
    """
    if not os.path.exists("data/embed.dat"):
	    print ("Caching word embeddings in memmapped format...")
	    from gensim.models.word2vec import Word2Vec
	    wv = Word2Vec.load_word2vec_format("/home/medialab/NLP_data/GoogleNews-vectors-negative300.bin.gz", binary = True)
	    fp = numpy.memmap("data/embed.dat", dtype=numpy.double, mode='w+', shape=wv.syn0.shape)
	    fp[:] = wv.syn0[:]
	    with open("data/embed.vocab", "w") as f:
		    for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
			    print >> f, unidecode(w)
			    pass
	    del fp, wv
	

    W = numpy.memmap("data/embed.dat", dtype=numpy.double, mode="r", shape=(3000000, 300))
    with open("data/embed.vocab") as f:
	    vocab_list = map(str.strip, f.readlines())
    
  
    vocab_dict = {w:k for k,w in enumerate(vocab_list)}
    return W, vocab_dict
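A short sketch of how the returned memmap and vocabulary dictionary might be consumed (the lookup word is illustrative; any in-vocabulary token works):

W, vocab_dict = wordEmbedding()
idx = vocab_dict["computer"]  # hypothetical in-vocabulary word
vec = W[idx]                  # one 300-dimensional row of the memmapped matrix
print(vec.shape)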
Code example #24
File: Analogy_test.py  Project: LiyuanZHOU/word2vec
def Analogy_test(embedding='vectors.txt', analogy_test='analogy-test.txt'):
    model_vec = Word2Vec.load_word2vec_format(embedding, binary=False)
    vec_sim = 0
    count = 0
    sum_line = 0
    with open(analogy_test, 'r') as f:
        for line in f:
            wordsArray = line.split()
            if len(wordsArray) < 4:
                count += 1
                continue
            word1 = wordsArray[0].lower()
            word2 = wordsArray[1].lower()
            word3 = wordsArray[2].lower()
            word4 = wordsArray[3].lower()
            try:
                tuple1 = model_vec.most_similar(positive=[word3, word2],
                                                negative=[word1],
                                                topn=1)
                if tuple1[0][0] == word4:
                    vec_sim += 1
            except KeyError:
                count += 1
                continue
            sum_line += 1
    print "ignored lines is " + str(count)
    print "precision is " + str(float(vec_sim) / sum_line)
Code example #25
File: SheffNLP.py  Project: cmoralesmx/COM6513_NLP
    def __init__(self, tokenWeights = True, extraFeatures = True, EXTRA_WEIGHTS_LABELS = [
    'bleuScore', 'similarityScore', 'wordMoversDistance', 'crossUnigramsRatio']):
        self.words = {}
        self.words2 = {}  # hypothesis words
        self.wordId = 0
        self.wordId2 = 0  # hypothesis
        self.extraFeatures = {} # for our new features
        self.docId = 0
        self.documents = {}
        self.tokenWeights = tokenWeights
        self.extraFeatures = extraFeatures
        self.EXTRA_WEIGHTS_LABELS = EXTRA_WEIGHTS_LABELS
        #####################
        if not os.path.exists("data/embed.dat"):
            print("Caching word embeddings in memmapped format...")
            #from gensim import models
            from gensim.models.word2vec import Word2Vec
            wv = Word2Vec.load_word2vec_format("data/GoogleNews-vectors-negative300.bin.gz",
                binary=True)
            wv.init_sims(replace=True) # recommended new step?
            fp = np.memmap("data/embed.dat", dtype=np.double, mode='w+', shape=wv.syn0.shape)
            fp[:] = wv.syn0[:]
            with open("data/embed.vocab", "w") as f:
                for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                    f.write(w.encode('utf-8'))
                    f.write('\n'.encode('utf-8'))
                    #print(w, file=f)
                    pass
            del wv

        self.W = np.memmap("data/embed.dat", dtype=np.double, mode="r", shape=(3000000, 300))
        with open("data/embed.vocab") as f:
            self.vocab_list = map(str.strip, f.readlines())

        self.vocab_dict = {w: k for k, w in enumerate(self.vocab_list)}
Code example #26
File: word2vec_tool.py  Project: hongleifu/zhihu
    def read_extend_file(self):
        print('now read extend file')
        self.extend_word_vect_modle = Word2Vec.load_word2vec_format(
            self.extend_vect_file, binary=False)
        max_extend_word_len = len(self.extend_word_vect_modle.wv.vocab)
        for index in range(max_extend_word_len):
            word = self.extend_word_vect_modle.index2word[index]
            self.words_vect_matrix.append(self.extend_word_vect_modle[word])
            print('type self.extend_word_vect_modle[word] ',
                  type(self.extend_word_vect_modle[word]))
            self.extend_words_vect_matrix.append(
                self.extend_word_vect_modle[word])
            self.extend_words_vect[word] = self.extend_word_vect_modle[
                word].tolist()
            self.extend_words_index[word] = self.max_word_len + index
            self.extend_index_words[self.max_word_len + index] = word

    # inf = open(self.extend_vect_file, 'r')
    # print('now read extend file')
    # for index, line in enumerate(inf.readlines()):
    #     word_vect = line.strip().split(' ')
    #     word = word_vect[0]
    #     vects = word_vect[1:]
    #     vects_f=[float(item) for item in vects]
    #     self.extend_words_vect[word] = vects_f
    #     self.extend_words_vect_matrix.append(np.array(vects_f))
    #     self.extend_words_index[word] = self.max_word_len+index
    #     self.extend_index_words[self.max_word_len+index] = word
    #     self.words_vect_matrix.append(np.array(vects_f))
    #     print('word:%s,index:%s'%(word,str(self.max_word_len+index)))
    # inf.close()
        print('now read extend file done!')
Code example #27
class MyView(View):

    model = Word2Vec.load_word2vec_format(MODEL_FILE, binary=True)

    @classmethod
    def n_similarity(cls, s1, s2):
        # TODO: preprocesses of s1, s2 goes here
        s1, s2 = cls.__removeStopwords(s1), cls.__removeStopwords(s2)
        if not s1 or not s2:
            return 0.0
        return cls.model.n_similarity(s1, s2)

    @classmethod
    def __removeStopwords(cls, tokens):
        withoutstops = []
        for word in tokens:
            if not word or (word not in cls.model):
                continue
            withoutstops.append(word)
        return withoutstops

    @classmethod
    def get(cls, request, *args, **kwargs):
        s1, s2 = request.GET.get('s1', '[]'), request.GET.get('s2', '[]')
        s1, s2 = json.loads(s1), json.loads(s2)
        return JsonResponse({'n_similarity': cls.n_similarity(s1, s2)})
Code example #28
def word2vec():
    print('Loading word2vec model...')
    w2v = Word2Vec.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin', binary=True)

    print('Creating listener...')
    address = ('localhost', 6000)
    with Listener(address, authkey=b'password') as listener:
        while True:
            with listener.accept() as conn:
                print('connection accepted from {0}'.format(
                    listener.last_accepted))
                while True:
                    try:
                        msg = conn.recv()
                        try:
                            if msg[0] == 'vocab':
                                conn.send(msg[1] in w2v.vocab)
                            elif isinstance(msg[0], list):
                                conn.send(w2v.n_similarity(*msg))
                            else:
                                conn.send(w2v.similarity(*msg))
                        except KeyError:
                            conn.send(0.)
                    except (EOFError, ConnectionResetError):
                        break
Code example #29
def load_vectors():
    print("loading word2vec vectors...")
    t0 = time()
    model = Word2Vec.load_word2vec_format('/Volumes/Seagate Backup Plus Drive/MacFilesThatICantFit/GoogleNews-vectors-negative300.bin', binary = True)
    loadTime = time() - t0
    print("word2vec vectors loaded in %0.3f seconds" % loadTime)
    print()

    # done "training" the model; we can do the following to trim unneeded memory
    t0 = time()
    print("trimming model memory...")
    model.init_sims(replace=True)
    trimTime = time() - t0
    print("trimmed memory in %0.3f seconds" % trimTime)
    print()

    vec = model['hello']

    print('type of vector')
    print(type(vec))
    print('vector')
    print(vec)

    sys.exit(1)

    return model
Code example #30
def load_word_embedding_dict(embedding, embedding_path):
    """
    Read word vectors from a file.
    :param embedding: embedding type
    :param embedding_path: path to the embedding file
    :return: word-vector dictionary (or model), embedding dimensionality, and whether the result is a plain dict
    """
    if embedding == 'word2vec':
        word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True)
        embed_dim = word2vec.vector_size
        return word2vec, embed_dim, False
    elif embedding == 'glove':
        embed_dim = -1
        embed_dict = dict()
        with open(embedding_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split()
                if embed_dim < 0:
                    embed_dim = len(tokens) - 1
                else:
                    assert (embed_dim + 1 == len(tokens))
                embed = np.empty([1, embed_dim], dtype=np.float64)
                embed[:] = tokens[1:]
                embed_dict[tokens[0]] = embed
        return embed_dict, embed_dim, True
    else:
        raise ValueError("embedding must be one of [word2vec, glove]")
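Usage sketch (file paths are hypothetical), exercising both branches; the third return value flags whether the result is a plain dict (GloVe) rather than a gensim model (word2vec):

glove_dict, glove_dim, is_dict = load_word_embedding_dict('glove', 'data/glove.6B.100d.txt')
w2v_model, w2v_dim, _ = load_word_embedding_dict('word2vec', 'data/GoogleNews-vectors-negative300.bin')
print(glove_dim, w2v_dim, is_dict)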
Code example #31
File: recommend.py  Project: DevinJeon/soma0612
 def __init__(self, wmodel, sentiment_tagger, opinion_extractor, clusterer):
     if isinstance(wmodel, Word2Vec):
         self.wmodel = wmodel
     else:
         self.wmodel = Word2Vec.load_word2vec_format(wmodel, binary=True)
     self.tagger = sentiment_tagger
     self.extractor = opinion_extractor
     self.clusterer = clusterer
Code example #32
File: github.py  Project: wenhuazang/NLP
def loadGoogleVector():
	t1 = time.clock()
	vector_bin = "/home/paul/Data/GoogleNews-vectors-negative300.bin"
	vector_bin2 = "/home/paul/Data/"
	model = Word2Vec.load_word2vec_format(vector_bin, binary=True)
	t2 = time.clock()
	print ("loading GoogleVector time : %.2f" % (t2 - t1))
	return model
Code example #33
    def from_file(cls, filepath, binary, stemmer=None, pos_tagger=None):
        assert (isinstance(binary, bool))
        assert (isinstance(stemmer, Stemmer) or stemmer is None)
        assert (isinstance(pos_tagger, POSTagger) or pos_tagger is None)

        w2v_model = Word2Vec.load_word2vec_format(filepath, binary=binary)

        return cls(w2v_model, stemmer, pos_tagger)
Code example #34
    def check(self, model):
        assert model.contains(['topics_term', 'sentences_term'])

        with ElapsedTimeIndicator('load ' + self._word2vec_model +
                                  ' [{elapsed}]') as indicator:
            self._word2vec = Word2Vec.load_word2vec_format(
                self._word2vec_model, binary=True)
            self._word2vec.init_sims(replace=True)
Code example #35
File: lookup.py  Project: hunterhector/FinestTune
 def __init__(self, word2vec_model, use_binary=True):
     """
     :param word2vec_model: The word2vec model path.
     :param use_binary: Whether the word2vec model is binary.
     :return:
     """
     logger.info("Loading word2vec ...")
     self.model = Word2Vec.load_word2vec_format(word2vec_model, binary=use_binary)
     print("Loading done...")
Code example #36
def get_model(model_num, model_names):
    
    
    if model_num < 10:
        model = Word2Vec.load(model_path + model_names)
    elif model_num < 99:
        model = Doc2Vec.load(model_path + model_names)
    else:
        model = Word2Vec.load_word2vec_format(model_path + model_names, binary=True)  # C text format
    return model
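Usage sketch (file names are hypothetical): the numeric id selects the loader, with ids below 10 mapping to a native gensim Word2Vec model, ids below 99 to a Doc2Vec model, and anything else to a C-format binary vector file:

w2v_native = get_model(1, "wiki_w2v.model")
d2v_native = get_model(50, "wiki_d2v.model")
google_bin = get_model(100, "GoogleNews-vectors-negative300.bin")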
Code example #37
 def __train__(self,):
   if self.restore == None:
     print('start to train word2vec models ... ')
     self.wvec_model = Word2Vec(sentences=self.corpuss, size=args.dimension, window=args.window,
                                workers=args.workers,
                                sg=args.sg,
                                batch_words=args.batch_size, min_count=3#, max_vocab_size=args.vocab_size
                                )
   else:
     self.wvec_model = Word2Vec.load_word2vec_format(args.restore, binary=True)
   #self.rand_model = RandomVec(args.dimension)
Code example #38
    def __init__(self):

        # ATTENTION ------------------------------------
        # if changing filepath of vectors.bin do so here
        home = expanduser("~")
        filename = home + "/trunk/vectors.bin"
        # ----------------------------------------------

        try:
            self.model = Word2Vec.load_word2vec_format(filename, binary=True)
        except IOError:
            self.foundFile = False
Code example #39
def word2vec_features(data_matrix, stemming=False, stop_words=None, TFIDF=False, ngram_range=(1, 1), max_features=None,
                      length=False, number_in_tweet=False, words_present=[], policy='sum'):
    print '\n------------------'
    print 'Creating feature vector matrix...\n'
    if stemming:
        print '\n------------------'
        print 'Stemming...'
        stemmer = SnowballStemmer("english")
        tweets = [" ".join([stemmer.stem(word) for word in word_tokenize(data_point[2].lower().decode("utf8"))]) for data_point in data_matrix]
    else:
        tweets = [data_point[2].lower() for data_point in data_matrix]

    print '\n------------------'
    print 'Loading word2vec model...'

    model = Word2Vec.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)  # C binary format

    # determine the policy on how to build vectors
    if policy == 'sum':
        policy = _build_sent_vec_as_sum
    else:
        policy = _build_sent_vec_as_average

    print 'Applying word2vec model...'

    # create a len(tweets) x 300 dimensional matrix
    dataset = np.squeeze(np.array([policy(sent, model) for sent in tweets]))

    print "Done"

    if length:
        lengths = np.array([[len(word_tokenize(data_point[2].decode("utf8")))] for data_point in data_matrix])
        dataset = np.concatenate((dataset, lengths), axis=1)

    if number_in_tweet:
        numbers = []
        for data_point in data_matrix:
            number_list = list_of_ints_from_string(data_point[2])
            filtered_number_list = [number for number in number_list if abs(number) < 10]
            if len(filtered_number_list) == 0:
                numbers.append([0])
            else:
                numbers.append([np.mean(filtered_number_list)])
        dataset = np.concatenate((dataset, numbers), axis=1)

    for word in words_present:
        word_present = np.array([[int(word.lower() in word_tokenize(data_point[2].lower().decode("utf8")))] for data_point in data_matrix])
        dataset = np.concatenate((dataset, word_present), axis=1)

    print '\n------------------'
    print 'Feature vector constructed.'
    return dataset
Code example #40
File: tegen.py  Project: shishih/testgensim
def teword():
    # model=Word2Vec.load_word2vec_format('vectorseg.bin',binary=False)
    # sim=model.most_similar(positive=[u'好',u'开心'],negative=[u'下雨'],topn=2)
    # print sim
    documents=[u"今天 天气 真是 好 啊",u"明天 就要 下雨 了,伐 开心"]
    model=Word2Vec(documents,size=20,window=5,min_count=1)
    sim=model.most_similar(positive=[u"好"],topn=2)
    # model.save('./tmp/tevec')
    print sim

    model=Word2Vec.load_word2vec_format('vectorseg.bin',binary=False)
    Word2Vec.intersect_word2vec_format(model,'fieldvec.bin',binary=False)
    Word2Vec.train_batch_sg(model, sentences, alpha, work=None)
Code example #41
File: w2vpruning.py  Project: molybdaen/word2bit
def getEmbeddingsAndVocab(w2vModelFilename, rebuild=False):
    if path.exists(w2vModelFilename):
        p, f = path.split(w2vModelFilename)
        fName = f.split('.')[0]
        matFile = path.join(p, fName + "-mat.npy")
        vocFile = path.join(p, fName + "-voc.pkl")
        if not path.exists(matFile) or not path.exists(vocFile):
            model = Word2Vec.load_word2vec_format(w2vModelFilename, binary=False)
            np.save(matFile, model.syn0)
            cPickle.dump(model.vocab, open(vocFile, "w"))
        m = np.load(matFile)
        v = cPickle.load(open(vocFile, "r"))
        return m, v
Code example #42
def make_model(type='gensim'):
    if type=='google':
        model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    else:
        nlu = spacy.en.English()
        model = Word2Vec(size=300)

        for i, lex in enumerate(nlu.vocab):
            model.vocab[lex.orth_] = Vocab(index=i, count=None)
            model.index2word.append(lex.orth_)

        model.syn0norm = np.asarray(map(lambda x: x.repvec, nlu.vocab))
        model.syn0 = np.asarray(map(lambda x: x.repvec, nlu.vocab))
    return model
Code example #43
File: process_embed.py  Project: makrai/efnilex-vect
 def main(self):
     self.in_filen = sys.argv[1]
     self.file_pref, ext = os.path.splitext(self.in_filen)
     # TODO skip words with control characters and decrease vocab size
     # in header
     if ext == '.pkl':
         # this branch is for embeddings from
         # https://sites.google.com/site/rmyeid/projects/polyglot
         logging.warning(
             'There is a version of this function in the multiwsi repo ' 
             'that writes the embedding with fewer digits (using st.format)')
         with open(self.file_pref+'.w2v', mode='w') as out_file:
             with open(self.in_filen, mode='rb') as in_file:
                 words, vecs = pickle.load(in_file)
             out_file.write('{} {}\n'.format(*vecs.shape))
             for word, vec in zip(words, vecs):
                 out_file.write('{}  {}\n'.format(
                     word.encode('utf8'), 
                     ' '.join(str(coord) for coord in vec.tolist())))
     elif ext == '.w2v':
         m = Word2Vec.load_word2vec_format(self.in_filen)
         m.save(self.file_pref+'.gensim')
     elif ext == '.txt':
         self.read_txt()
     elif ext == '.bin':
         if 'glove' in self.file_pref:
             raise NotImplementedError(
                 'glove binaries are not supported')
         else:
             m = Word2Vec.load_word2vec_format(self.in_filen, binary=True)
             logging.info("Saving {}".format(self.file_pref+'.gensim'))
             m.save(self.file_pref+'.gensim')
             logging.info("Saving {}".format(self.file_pref+'.w2v'))
             m.save_word2vec_format(self.file_pref+'.w2v')
     else:
         raise NotImplementedError('unknown extension')
Code example #44
File: embedding.py  Project: borgr/ucca
 def __init__(self, feature_extractor, **kwargs):
     self.feature_extractor = feature_extractor
     self.sizes = {}
     self.embedding = {}
     for suffix, dims in kwargs.items():
         dim = dims[0]
         if isinstance(dim, int):
             self.sizes[suffix] = dim
             self.embedding[suffix] = defaultdict(lambda s=dim: Config().random.normal(size=s))
         else:
             print("Loading word vectors from '%s'..." % dim)
             w2v = Word2Vec.load_word2vec_format(dim)
             unk = Config().random.normal(size=w2v.vector_size)
             self.sizes[suffix] = w2v.vector_size
             self.embedding[suffix] = Word2VecWrapper(w2v, unk)
Code example #45
File: processors.py  Project: mfomicheva/metric-dev
    def get(self, config, from_file=False):

        print("Getting sentence vectors")

        lines_ref = codecs.open(os.path.expanduser(config.get('Data', 'ref')) + '.' + 'token', 'r', 'utf-8').readlines()
        lines_tgt = codecs.open(os.path.expanduser(config.get('Data', 'tgt')) + '.' + 'token', 'r', 'utf-8').readlines()

        fvectors = os.path.expanduser(config.get('Vectors', 'path'))
        wv = Word2Vec.load_word2vec_format(fvectors, binary=False)

        AbstractProcessor.set_result_tgt(self, self.sents2vec(lines_tgt, wv))
        AbstractProcessor.set_result_ref(self, self.sents2vec(lines_ref, wv))

        wv = None
        print("Finished getting sentence vectors")
Code example #46
File: indexer.py  Project: borgr/ucca
 def __init__(self, feature_extractor, **kwargs):
     self.feature_extractor = feature_extractor
     self.feature_types = {"numeric": FeatureInformation(feature_extractor.num_features_numeric())}
     for suffix, (dim, size) in kwargs.items():
         if isinstance(dim, int):
             init = None
             indices = self.auto_increment_dict(size)
         else:
             print("Loading word vectors from '%s'..." % dim)
             w2v = Word2Vec.load_word2vec_format(dim)
             size = len(w2v.vocab) + 1
             dim = w2v.vector_size
             init = (w2v,)
             indices = self.auto_increment_dict(size, w2v.vocab)
         self.feature_types[suffix] = FeatureInformation(
             feature_extractor.num_features_non_numeric(suffix), dim, size, init, indices)
Code example #47
File: vectorize.py  Project: neutronest/sentinet
    def train_google_model(self, google_file):
        """
        using the google word vector dataset to extract the word feature

        Parameters:
        -----------
        google_file: the location of google.bin / G.bin
                     type: string

        Return:
        -------
        None

        """
        self.google_modopel = Word2Vec.load_word2vec_format(google_file, binary=True)
        return
Code example #48
File: knowledge.py  Project: frnsys/geiger
    def __init__(self, remote):
        global _w2v
        global _w2v_conn

        self.remote = remote
        if not remote and _w2v is None:
            print('Loading word2vec model...')
            _w2v = Word2Vec.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)
            self.vocab = Vocab(remote, None)
            print('Done loading word2vec')
        elif _w2v_conn is None:
            print('Connecting to word2vec process...')
            address = ('localhost', 6000)
            _w2v_conn = Client(address, authkey=b'password')
            self.vocab = Vocab(remote, _w2v_conn)
            print('Done connecting to word2vec')
        self.conn = _w2v_conn
Code example #49
File: wmd.py  Project: eipiplusun/educe
def create_cache(filepath="data"):
    if ((not os.path.exists(filepath+"/"+"embed.dat") or
         not os.path.exists(filepath+"/"+"embed.vocab"))):
        print("Cache of word embeddings...",
              file=sys.stderr)
        from gensim.models.word2vec import Word2Vec
        wv = Word2Vec.load_word2vec_format(
            filepath+"/"+"GoogleNews-vectors-negative300.bin.gz",
            binary=True)
        fp = np.memmap(filepath+"/"+"embed.dat", dtype=np.double, mode='w+',
                       shape=wv.syn0.shape)
        fp[:] = wv.syn0[:]
        with open(filepath+"/"+"embed.vocab", "w", encoding="utf8") as f:
            for _, w in sorted((voc.index, word) for word, voc
                               in wv.vocab.items()):
                print(w, file=f)
        del fp, wv
        print('done', file=sys.stderr)
Code example #50
def get_init_data(model_file, ark_file, dict_filepath, twit_dict_file):

    model = Word2Vec.load_word2vec_format(model_file, binary=False)
    ark_clusters = get_ark_clusters(ark_file)
    all_dictionaries = Dictionaries(dict_filepath)
    twit_sets = []
    stopwords = get_stopwords()
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(twit_dict_file)

    for v in [10, 100, 1000, 10000,50000]:
        twit_id = set(tw_distant_supervision_identity_dat[
                      (tw_distant_supervision_identity_dat.tot > v)].term.values)
        twit_id = {t for t in twit_id if t not in stopwords and t.replace(" person","") not in stopwords}
        twit_sets.append([twit_id,"twit_identities_"+str(v)])

    twit_sets.append([EXPERT_NON_IDENTITIES,"expert_non"])
    twit_sets.append([stopwords,"stopword"])

    return model, all_dictionaries, ark_clusters, [t[0] for t in twit_sets],[t[1] for t in twit_sets]
Code example #51
 def loadEmbeddings(self, filepath, data_path, vocab_size, binary_val):
     if not os.path.exists(data_path):
         os.makedirs(data_path)
     embed_short = os.path.normpath("%s/embed.dat" % data_path)
     if not os.path.exists(embed_short):
         print("Caching word embeddings in memmapped format...")
         print(binary_val, filepath)
         wv =  Word2Vec.load_word2vec_format("%s" % (filepath), binary=binary_val)
         fp = np.memmap(embed_short, dtype=np.double, mode='w+', shape=wv.syn0.shape)
         fp[:] = wv.syn0[:]
         with open(os.path.normpath("%s/embed.vocab" % data_path), "w", encoding='utf-8') as fp:
             for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                 fp.write('%s\n' % w)
         del fp, wv
         
     self.W = np.memmap(os.path.normpath("%s/embed.dat" % data_path), dtype=np.double, mode="r", shape=(vocab_size, self.embedding_size))
     with codecs.open(os.path.normpath("%s/embed.vocab" % data_path), 'r', 'utf-8') as f:
         vocab_list = [x.strip() for x in f.readlines()]
     self.vocab_dict = {w: k for k, w in enumerate(vocab_list)}
Code example #52
File: model.py  Project: vm/lessandmore
    def __init__(self, fn='models/GoogleNews-vectors-negative300.bin', threshold=0.4):
        """creates a Tranformer

        :param fn: location of the model to load
        :type fn: str
        """

        url = 'https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM'
        download_msg = 'Download the Google News model here: ' + url

        if not os.path.isfile(fn):
            raise ValueError('File {} not found!\n'.format(fn) + download_msg)

        self.model = Word2Vec.load_word2vec_format(fn, binary=True)
        self.threshold = threshold

        # dumb caching
        self._last_less = None
        self._last_more = None
Code example #53
File: lstm.py  Project: danebell/this-before-that
    def create_embeddings_weights(self):
        config = self.config
        tk = self.tokenizer
        word2index = tk.word_index
        # reverse index
        index2word = {i:w for (w,i) in tk.word_index.items()}
        max_size = len(index2word) + 1
        # load w2v model
        w2v_vectors_file = config["w2v_data"]
        w2v = Word2Vec.load_word2vec_format(w2v_vectors_file, binary=True)
        word_vector_dims = w2v.vector_size
        embedding_weights = np.zeros((max_size, word_vector_dims))

        for i,w in index2word.items():
            try:
                embedding_weights[i,:] = w2v[w]
            except:
                print("{} not found".format(w))
        return (w2v, embedding_weights)
Code example #54
File: processors.py  Project: mfomicheva/metric-dev
    def get(self, config, from_file=False):

        lines_ref = codecs.open(os.path.expanduser(config.get('Data', 'ref')) + '.' + 'token', 'r', 'utf-8').readlines()
        lines_tgt = codecs.open(os.path.expanduser(config.get('Data', 'tgt')) + '.' + 'token', 'r', 'utf-8').readlines()

        fvectors = os.path.expanduser(config.get('Vectors', 'path'))

        print("Loading word vectors from " + fvectors)
        wv = Word2Vec.load_word2vec_format(fvectors, binary=False)

        print("Finished loading word vectors from " + fvectors)

        print("Building sentence vectors for target...")
        AbstractProcessor.set_result_tgt(self, self.words2vec(lines_tgt, wv))
        print("Finished building sentence vectors for target")
        print("Building sentence vectors for reference...")
        AbstractProcessor.set_result_ref(self, self.words2vec(lines_ref, wv))
        print("Finished building sentence vectors for reference")

        wv = None
        print("Finished getting word vectors")