Example #1
def load_saved_word_embeddings(w2v, fasttext):
    global wv_model
    if w2v:
        wv_model = KeyedVectors.load(word2vec_data(w2v))
        global vector_size
        vector_size = w2v
    elif fasttext:
        wv_model = KeyedVectors.load(fasttext_data)
    return wv_model
Example #2
def train_translation(model_source_path, model_target_path, transmat_outpath):
    word_pairs = []
    source_model = KeyedVectors.load(model_source_path)
    target_model = KeyedVectors.load(model_target_path)

    for word in target_model.wv.vocab:
        if word in source_model.wv.vocab:
            word_pairs.append((word, word))

    trans_model = TranslationMatrix(source_model.wv,
                                    target_model.wv,
                                    word_pairs=word_pairs)

    trans_model.save(transmat_outpath)
    return trans_model
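The trained matrix can then be queried with TranslationMatrix.translate; a minimal usage sketch (the file paths and the query word "system" are hypothetical placeholders, and the query word must exist in the source model's vocabulary):
trans_model = train_translation("source_model.kv", "target_model.kv", "transmat.bin")
# translate() returns an OrderedDict mapping each source word to its top-n
# candidate translations in the target embedding space.
candidates = trans_model.translate(["system"], topn=3)
print(candidates["system"])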
Example #3
def plot_data(data_path, index, save_path, filenames_get, embedding_size):
    path_model_load = data_path + "/" + filenames_get
    print("########")
    print(path_model_load)
    model = KeyedVectors.load(path_model_load)
    length = len(model.wv.vocab.keys())
    words = model.wv.vocab.keys()
    print(filenames_get[1])
    X = npx.empty((length, embedding_size))
    count = 0
    un_seen_node = 0
    for node in words:
        try:
            vec_one = model[node]
        except KeyError:  # node missing from the embedding vocabulary
            vec_one = npx.random.rand(embedding_size)
            un_seen_node += 1

        #print vec_one.shape
        X[count, :] = vec_one
        count += 1
    y_pred = KMeans(n_clusters=6, random_state=2017,
                    max_iter=15).fit_predict(X)
    X_embedded = None
    #kmeans.labels_
    #     print "do the TSNE"
    #     flag = exists(save_path)
    #     if not flag:
    #         X_embedded = TSNE(n_components=3).fit_transform(X)
    #         save_pickle(save_path, X_embedded)
    #     else:
    #         X_embedded = load_pickle(save_path)
    return X, y_pred, X_embedded
Example #4
def main():
    print("good")
    model = models.Word2Vec.load("D:\\NTUST\\人工智慧\\final\\csv\\code250.model.bin")
    # print(len(model.wv.vocab))
    #print(model.wv.vocab[0])
    word_vectors = KeyedVectors.load("D:\\NTUST\\人工智慧\\final\\csv\\code250.model.bin")
    print(word_vectors["市長"])
    '''
    output = open('D:\\NTUST\\人工智慧\\final\\csv\\wordvec.csv', 'w')
    count=0;
    for word in model.wv.vocab:
        print(word+"~~", end='')
        output.write(word)
        print(len(word_vectors[word]),"~", end='')
        for item in word_vectors[word]:
            print(item, end='')
            output.write(",")
            output.write(str(item))



        output.write("\n")
        print()
        #count+=1
        #if(count==20):
        #    break
    output.close()
    '''
Example #5
def get_embeddings():
    # build index mapping words in the embeddings set
    # to their embedding vector

    #glove vectors
    glove_embeddings_index = {}
    f = open(os.path.join(EMBEDDING_DIR, 'glove.6B.100d.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_embeddings_index[word] = coefs
    f.close()

    #word2vec model
    bible_embeddings_index = {}
    bible_model = KeyedVectors.load(EMBEDDING_DIR + 'bible_word2vec')
    bible_vocab = bible_model.vocab

    for word in bible_vocab:
        bible_embeddings_index[word] = bible_model[word]
        #print("WORD: ", word)
        #print("VECTOR: ", bible_model[word])

    combined_index = combine_embeddings(glove_embeddings_index,
                                        bible_embeddings_index)

    print('Found word vectors: ', len(combined_index))
    print('dimensions: ', len(combined_index["god"]))
    print('dimensions: ', len(combined_index["bottle"]))

    return combined_index
Example #6
    def _load_gensim_word2vec_model(self,
                                    model_uri=None,
                                    max_lru_cache_size=1024):
        """
        Loads pre-trained Gensim word2vec keyed vector model from either local or Redis

        >>> from textpipe.doc import Doc
        >>> model = Doc('')._load_gensim_word2vec_model('tests/models/gensim_test_nl.kv')
        >>> type(model)
        <class 'gensim.models.keyedvectors.Word2VecKeyedVectors'>
        """
        lang = self.language if self.is_reliable_language else self.hint_language
        if not self._gensim_vectors or lang not in self._gensim_vectors:
            if urlparse(model_uri).scheme == 'redis':
                vectors = RedisKeyedVectors(model_uri, lang,
                                            max_lru_cache_size)
                if not vectors.exists:
                    raise TextpipeMissingModelException(
                        f'Redis does not contain a model '
                        f'for language {lang}. The model '
                        f'needs to be loaded before use '
                        f'(see load_keyed_vectors_into_redis).')
            elif model_uri:
                try:
                    vectors = KeyedVectors.load(model_uri, mmap='r')
                except FileNotFoundError:
                    raise TextpipeMissingModelException(
                        f'Gensim keyed vector file {model_uri} is not available.'
                    )
            else:
                raise TextpipeMissingModelException(
                    'Either specify model filename or redis URI')
            self._gensim_vectors[lang] = vectors
        return self._gensim_vectors[lang]
Example #7
def load_full_data():
    train_data = pd.read_excel(url_full_train_data, 'Sheet1')

    test_data = train_data[4415:4914]
    train_data = train_data.drop(test_data.index)

    texts = train_data.text

    tokenizer = Tokenizer(num_words=NUM_WORDS,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
                          lower=True)
    tokenizer.fit_on_texts(texts)

    sequences_train = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index

    X_train = pad_sequences(sequences_train, maxlen=max_length, padding=pad[0])

    word_vectors = KeyedVectors.load(url_word2vec_full, mmap='r')

    vocabulary_size = min(len(word_index) + 1, NUM_WORDS)
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

    for word, i in word_index.items():
        if i >= NUM_WORDS:
            continue
        try:
            embedding_vector = word_vectors[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25),
                                                   EMBEDDING_DIM)

    del word_vectors
    return tokenizer, sequences_train, X_train
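A matrix like embedding_matrix above is typically used to initialize a Keras Embedding layer; the sketch below is illustrative only (the layer arguments are assumptions, not taken from this project), reusing the names defined inside load_full_data():
from keras.layers import Embedding

# Wire the pre-built embedding matrix into a (frozen) Embedding layer.
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)  # keep the pre-trained vectors fixed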
Example #8
    def __init__(self,
                 emb_dim,
                 hid_dim,
                 z_dim,
                 word2vec_file='data/word2vec_recipes.bin',
                 with_attention=True):
        super(IngredientsEncoderRNN, self).__init__()

        wv = KeyedVectors.load(word2vec_file, mmap='r')
        vec = torch.from_numpy(np.copy(wv.vectors)).float()
        # first two index has special meaning, see load_dict() in utils.py
        emb = nn.Embedding(vec.shape[0] + 2, vec.shape[1], padding_idx=0)
        emb.weight.data[2:].copy_(vec)
        # for p in emb.parameters():
        #     p.requires_grad = False
        self.embed_layer = emb
        print('IngredientsEncoderRNN:', emb)

        self.rnn = nn.GRU(input_size=emb_dim,
                          hidden_size=hid_dim,
                          bidirectional=True,
                          batch_first=True)

        self.with_attention = with_attention
        if with_attention:
            self.atten_layer = AttentionLayer(2 * hid_dim, with_attention)
Example #9
def main(args):
    print('Loading datasets...')
    X, y = load_data_and_labels(args.data_path)
    x_train, x_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.1,
                                                          random_state=42)
    embeddings = KeyedVectors.load(args.embedding_path).wv

    print('Transforming datasets...')
    p = IndexTransformer()
    p.fit(X, y)
    embeddings = filter_embeddings(embeddings, p._word_vocab,
                                   embeddings.vector_size)

    print('Building a model...')
    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      embeddings=embeddings,
                      char_embedding_dim=50)
    model.build()

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
Example #10
def word2vec(request):
    model_path = '/home/hasher/Documents/textbook_trained'
    w2v_model = KeyedVectors.load(model_path)

    ds = DocSim(w2v_model)

    df = open(MEDIA_ROOT + "/texts/blueprint.txt", "r")
    blueprint = df.read()
    df.close()

    # target_docs = [source_doc,'Beggar was so poor that he had no money' ,'Beggar was very rich and had lots of money', 'Beggar had nothing to buy slippers']

    df = open(MEDIA_ROOT + "/texts/answersheet.txt", "r")
    answer = df.read()
    df.close()

    answer = nltk.sent_tokenize(answer)
    print(answer)

    answer.append(blueprint)

    # This will return 3 target docs with similarity score
    sim_scores = ds.calculate_similarity(blueprint, answer)

    output = dict()
    output.update({
        'model': 'Word2vec',
        'blueprint': blueprint,
        'output': sim_scores
    })
    request = HttpRequest()
    request.output = output
    return generate_model_report(request)
Example #11
def build_w2v(
        path_train_union_test,
        path_words_vectors,
        w2v_model_path='/home/penglu/LewPeng/TranSummary/lcsts_data/word2vec/embedding/w2v.model',
        min_count=10):
    w2v = Word2Vec(sentences=LineSentence(path_train_union_test),
                   size=512,
                   window=5,
                   min_count=min_count,
                   iter=5)
    w2v.save(w2v_model_path)

    model = Word2Vec.load(w2v_model_path)
    model.wv.save_word2vec_format(
        '/home/penglu/LewPeng/TranSummary/lcsts_data/word2vec/embedding/w2v.vector'
    )

    logging.info('corpus count: {}'.format(model.corpus_count))
    logging.info('vocabulary size: {}'.format(len(model.wv.vocab)))

    model = KeyedVectors.load(w2v_model_path)

    words_vectors = {}
    for word in model.wv.vocab:
        words_vectors[word] = model[word]

    dump_pkl(words_vectors, path_words_vectors, overwrite=True)
Example #12
def trim_wemb(conf, econf):
    '''
    Trim embeddings to the minimally required size and load them.
    '''
    # Load the embeddings from disk.
    fn = econf.embedding_fn
    if fn.endswith('.kv'):
        wv = KeyedVectors.load(fn, mmap='r')
    else:
        wv = KeyedVectors.load_word2vec_format(fn, binary=fn.endswith('.bin'))

    # Account for mapping changes due to preprocessing (eg. stemming).
    vocab = _adapt_mapping(econf, wv)
    vocab = {w: e.index for w, e in vocab.items()}

    # Reduce the matrix to the actual vocabulary of the dataset.
    used = _get_dataset_vocab(conf, econf, vocab)
    mapping = {old: new
               for new, old in enumerate(sorted(used))}  # preserve order
    ds_vocab = [None] * len(mapping)
    # Add two rows in the beginning: one for padding and one for unknown words.
    shape = len(mapping) + 2, wv.vectors.shape[1]
    matrix = np.zeros(shape, dtype=wv.vectors.dtype)
    matrix[1] = np.random.standard_normal(shape[1])  # unknown words
    for w, i in vocab.items():
        n = mapping.get(i)
        if n is not None:
            ds_vocab[n] = w
            matrix[n + 2] = wv.vectors[i]

    return ds_vocab, matrix
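The matrix returned by trim_wemb reserves row 0 for padding and row 1 for unknown words, so a token's vector sits at its vocabulary position plus 2. A small illustrative helper (not part of the original module) that follows this layout:
def token_to_row(token, ds_vocab):
    # ds_vocab is the ordered word list returned by trim_wemb() above.
    try:
        return ds_vocab.index(token) + 2   # rows 2.. hold the dataset vocabulary
    except ValueError:
        return 1                           # row 1 is the unknown-word vector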
Example #13
 def __init__(self, path_out: str) -> None:
     self.path_out = path_out
     self.classifier = HypernymClassifier(path_out)
     self.classifier.load()
     self.path_emb = os.path.join(
         path_out, 'embeddings/embs_token_global_Word2Vec.vec')
     self.term_id_to_emb = KeyedVectors.load(self.path_emb)
Example #14
def gensim_model2txt():
    from gensim.models.keyedvectors import KeyedVectors
    # https://bit.ly/3pXwIDx
    model = KeyedVectors.load(
        './trained_model/wiki_model/word2vec_wiki_zh.model.bin')
    model.wv.save_word2vec_format(
        './trained_model/wiki_model/word2vec_wiki_zh.model.txt', binary=False)
Example #15
    def __init__(self):
        # self.model = Word2Vec.load(GENSIM_MODEL)
        self.model = KeyedVectors.load_word2vec_format(GENSIM_MODEL,
                                                       binary=False)
        self.gloveModel = KeyedVectors.load(GLOVE_MODEL)
        self.wordLemmas = pd.read_csv(INPUT_FILE, index_col=0)

        self.data = pd.read_csv(FILE_TO_UPDATE, index_col=0)
Example #16
def open_word2vec(w2v_bin_path, binary=True):
    # model = None
    try:
        model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=binary)
    except Exception:  # fall back to gensim's native save format
        print('Loading original word2vec format failed. Trying Gensim format.')
        model = KeyedVectors.load(w2v_bin_path)  # KeyedVectors.load() takes no binary flag
    return model
Example #17
    def create_embedding_matrix(self):
        model_ug_cbow = KeyedVectors.load('w2v_model_ug_cbow.word2vec')
        model_ug_sg = KeyedVectors.load('w2v_model_ug_sg.word2vec')
        embeddings_index = {}
        for w in model_ug_cbow.wv.vocab.keys():
            embeddings_index[w] = np.append(model_ug_cbow.wv[w],
                                            model_ug_sg.wv[w])
        print('Found %s word vectors.' % len(embeddings_index))
        embedding_matrix = np.zeros((self.vocabulary_size, 200))
        for word, i in self.tokenizer.word_index.items():
            if i > self.vocabulary_size - 1:
                continue
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        return embedding_matrix
Example #18
 def load_gensim_kv(filename=None, path=None, mmap=None):
     if path is not None:
         return KeyedVectors.load(path, mmap=mmap)
     elif filename is not None:
         for dir_path in ASSET_SEARCH_DIRS:
             try:
                 path = os.path.join(dir_path, filename)
                 return KeyedVectors.load(path, mmap=mmap)
             except FileNotFoundError:
                 continue
         raise FileNotFoundError("Please make sure that 'filename' \
                                 specifies the word vector binary name \
                                 in default search paths or 'path' \
                                 specifies file path of the binary")
     else:
         raise TypeError(
             "load_gensim_kv() requires either 'filename' or 'path' to be set."
         )
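Either a bare filename (resolved against ASSET_SEARCH_DIRS) or an explicit path can be passed; a short usage sketch with hypothetical file names:
kv = load_gensim_kv(filename="vectors_100d.kv", mmap="r")   # searched in ASSET_SEARCH_DIRS
kv = load_gensim_kv(path="/data/models/vectors_100d.kv")    # loaded from an explicit path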
Example #19
def test_word_2vec():
    from gensim.models.keyedvectors import KeyedVectors

    id = "honeypot_clean_model_revised"
    mymodel = KeyedVectors.load(id)
    n_dim = mymodel.wv.syn0.shape[1]
    print(n_dim)

    print(mymodel.wv.most_similar("today"))
Example #20
def loadModel(model):
    #loadedModel = gensim.models.KeyedVectors.load_word2vec_format(model)
    #loadedModel = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True )
    #loadedModel.save("2ndmodel.txt")
    print("SAVED")
    loadedModel = KeyedVectors.load(model)
    print("LOADED")
    #return word_vectors
    return loadedModel
Example #21
def obtainModel(path):
    #    file = basename(normpath(path))
    file = path + '.prep'  # saving in the same folder
    if exists(file):
        return KeyedVectors.load(file)
    else:
        model = KeyedVectors.load_word2vec_format(path, binary=False)
        model.save(file)
        return model
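The first call parses the slow word2vec text format and caches the vectors next to the input as "<path>.prep"; subsequent calls load the cached native file directly. A usage sketch with a hypothetical path:
model = obtainModel("vectors.txt")   # parses the text format and writes vectors.txt.prep
model = obtainModel("vectors.txt")   # reloads the cached .prep file, much faster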
Example #22
def create_db_fasttext(file_name, path_to_db):
    documents1 = []
    documents2 = []
    related = {}

    morph = pymorphy2.MorphAnalyzer()
    df = pd.read_csv("quora.csv")

    for i, row in df[:5000].iterrows():
        id1 = "q" + str(i)
        id2 = "d" + str(i)

        doc1 = str(row["question1"]).lower()
        doc1 = re.split(r"[^а-яё]+", doc1)
        doc1 = [morph.parse(word)[0].normal_form for word in doc1]
        documents1.append(doc1)

        doc2 = str(row["question2"]).lower()
        doc2 = re.split(r"[^а-яё]+", doc2)
        doc2 = [morph.parse(word)[0].normal_form for word in doc2]
        documents2.append(doc2)

        if row["is_duplicate"] == 1:
            if id1 not in related:
                related[id1] = []
            related[id1].append(id2)

    model_file = './fasttext/model.model'
    model = KeyedVectors.load(model_file)

    dimensions = model.vector_size

    data = []

    for document in documents2:
        doc_embedding = np.zeros(dimensions, dtype="float64")
        for word in document:
            if word in model:
                doc_embedding += model[word]
        doc_embedding = doc_embedding / len(document)
        data.append(doc_embedding)

    data = np.array(data)
    data = data.reshape((5000, dimensions))

    path = os.path.join(path_to_db, "fasttext")

    if not os.path.exists(path):
        os.mkdir(path)

    with open(os.path.join(path, "documents.pickle"), "wb") as pickle_file:
        pickle.dump(documents2, pickle_file)

    with open(os.path.join(path, "data.pickle"), "wb") as pickle_file:
        pickle.dump(data, pickle_file)
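The pickled document matrix can then be searched with plain cosine similarity; the sketch below is purely illustrative (query lemmatisation is omitted, and the paths mirror the files written above):
import os
import pickle
import numpy as np

def query_fasttext_db(query_tokens, model, path_to_db, top_n=5):
    # Load the (n_documents, dimensions) matrix written by create_db_fasttext().
    with open(os.path.join(path_to_db, "fasttext", "data.pickle"), "rb") as f:
        data = pickle.load(f)
    # Average the fastText vectors of the query tokens, as done for the documents.
    q = np.zeros(model.vector_size, dtype="float64")
    for word in query_tokens:
        if word in model:
            q += model[word]
    q /= max(len(query_tokens), 1)
    # Cosine similarity against every stored document embedding.
    sims = data @ q / (np.linalg.norm(data, axis=1) * np.linalg.norm(q) + 1e-9)
    return np.argsort(-sims)[:top_n]     # indices of the closest documents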
Example #23
    def __init__(self, indexing=False):
        if indexing:
            self.model = KeyedVectors.load(fast_model)
            log.info("Indexing starts")
            with open("Lemmatized_corpus.pickle", 'rb') as f:
                texts = pickle.load(f)

            indexed = []
            for doc in texts:
                indexed.append(lookup(doc.split(' '), self.model.wv))

            with open("Fasttext_matrix.pickle", 'wb') as f:
                pickle.dump(indexed, f)
            self.indexed_corpora = indexed

        self.model = KeyedVectors.load(fast_model)
        if not getattr(self, "indexed_corpora", None):  # not set when indexing=False
            with open("Fasttext_matrix.pickle", 'rb') as f:
                self.indexed_corpora = pickle.load(f)
Example #24
def load_model(filepath, keyed_vec=False):
    """
    Instantiate a pre-trained model located at `filepath`. If read-only model vectors 
    were trained by another application, set `keyed_vec=True`. Otherwise, word2vec model 
    is assumed. 
    """
    if keyed_vec:
        model = KeyedVectors.load(filepath)
    else:
        model = Word2Vec.load(filepath)
    return model
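A short usage sketch for both branches (the file names are placeholders):
kv = load_model("vectors.kv", keyed_vec=True)   # read-only KeyedVectors file
w2v = load_model("word2vec.model")              # full Word2Vec model (the default)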
Example #25
    def __init__(self, filename):
        super().__init__()
        self.fasttext_model = KeyedVectors.load(filename)

        self.vocab = self.fasttext_model.wv.vocab
        self.wv = self.fasttext_model.wv
        self.vector_size = self.fasttext_model.vector_size

        self.Z = 0
        for k in self.vocab:
            self.Z += self.vocab[k].count
Example #27
def get_init_parameters(path, ext=None):
    if ext == 'vec':
        word_model = KeyedVectors.load_word2vec_format(path).wv
    else:
        word_model = KeyedVectors.load(path).wv
    n_words = len(word_model.vocab)
    vocab_dim = word_model[word_model.index2word[0]].shape[0]
    index_dict = dict()
    for i in range(n_words):
        index_dict[word_model.index2word[i]] = i+1
    return word_model, index_dict, n_words, vocab_dim
Example #28
    def load_embedding(embd_file_path):

        from gensim.models.keyedvectors import KeyedVectors
        model = KeyedVectors.load(embd_file_path)

        vocab = model.vocab
        embd = []
        for word in vocab:
            embd.append(model[word])

        return vocab, embd
Example #30
    def __init__(self, **kwargs):
        """
        :param model_file: path of model file. If not supplied, will be downloaded.
        """
        super().__init__(**kwargs)

        self._model_file = self._options.get("model_file")

        self._model = KeyedVectors.load(str(self._model_file), mmap="r")
        self._vector_size = 512
        self._zero_vector = np.zeros(self._vector_size, dtype=np.float32)
        self._window_size = 3
Example #31
    def test_ft_kv_backward_compat_w_360(self):
        kv = EuclideanKeyedVectors.load(datapath("ft_kv_3.6.0.model.gz"))
        ft_kv = FastTextKeyedVectors.load(datapath("ft_kv_3.6.0.model.gz"))

        expected = ['trees', 'survey', 'system', 'graph', 'interface']
        actual = [word for (word, similarity) in kv.most_similar("human", topn=5)]

        self.assertEqual(actual, expected)

        actual = [word for (word, similarity) in ft_kv.most_similar("human", topn=5)]

        self.assertEqual(actual, expected)
Example #32
def build_w2v(train_union_test_path, path_words_vectors, w2v_model_path='w2v.model', min_count=5):
    w2v = Word2Vec(sentences=LineSentence(train_union_test_path), size=128, window=5, min_count=min_count, iter=5)
    w2v.save(w2v_model_path)

    model = Word2Vec.load(w2v_model_path)
    model = KeyedVectors.load(w2v_model_path)

    words_vectors = {}
    for word in model.wv.vocab:
        words_vectors[word] = model[word]
    print(len(words_vectors))
    dump_pkl(words_vectors, path_words_vectors, overwrite=True)