Example #1
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def build_av_tf_idf_dv(docs, doc_num, model, save, save_file):
    docs = list(docs)

    # Build a tf-idf matrix over the corpus: one row per document, one
    # column per vocabulary word (util is a project-local helper module;
    # see the word2sentence sketch after this example).
    vectorizer = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    count_fv = vectorizer.fit_transform(util.word2sentence(docs))
    tfidf_fv = tfidf_transformer.fit_transform(count_fv)

    # Mapping {word: column index} into the tf-idf matrix.
    vocabulary = vectorizer.vocabulary_

    # Embedding dimensionality and vocabulary of the word2vec model
    # (model.syn0 and model.index2word are the pre-1.0 gensim API).
    num_features = model.syn0.shape[1]
    doc_vector = np.zeros((doc_num, num_features), dtype="float32")
    word_set = set(model.index2word)

    for index, words in enumerate(docs):
        # Dense 1 x vocab_size row of tf-idf weights for this document.
        vec = tfidf_fv[index].toarray()
        count = 0
        for word in words:
            # Only words known to both the word2vec model and the tf-idf
            # vocabulary contribute.
            if word in word_set and word in vocabulary:
                doc_vector[index] += model[word] * vec[0][vocabulary[word]]
                count += 1
        # Average the weighted vectors; the +1 avoids division by zero for
        # documents with no in-vocabulary words.
        doc_vector[index] /= (count + 1)

    if save:
        # np.save appends ".npy" if save_file does not already end in it.
        np.save(save_file, doc_vector)

    return doc_vector
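
util.word2sentence is not shown here. Since CountVectorizer.fit_transform
expects raw strings rather than token lists, it presumably joins each
document's tokens back into one string. A minimal sketch under that
assumption (the helper's behavior is inferred, not from the source):

def word2sentence(docs):
    # Hypothetical stand-in for util.word2sentence: one whitespace-joined
    # string per tokenized document, the input CountVectorizer expects.
    return [" ".join(words) for words in docs]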
Example #2
import multiprocessing as mp
from multiprocessing import Manager

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def build_av_tf_idf_dv(docs, doc_num, model, save=True, save_file="doc_vector_tfidf.bin"):
    docs = list(docs)

    # Build the tf-idf matrix exactly as in Example #1.
    vectorizer = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    count_fv = vectorizer.fit_transform(util.word2sentence(docs))
    tfidf_fv = tfidf_transformer.fit_transform(count_fv)

    num_features = model.syn0.shape[1]

    # Share the word vectors and the vocabulary with the worker processes,
    # and allocate one flat, lock-free double array for all document vectors.
    manager = Manager()
    global_word_set = manager.dict(util.get_word_vec_dict(model))
    global_vocabulary = manager.dict(vectorizer.vocabulary_)
    global_doc_vector = mp.Array('d', doc_num * num_features, lock=False)

    # Each worker receives the shared array through the pool initializer
    # (initprocess and single_av_tf_idf_dv are sketched after this example).
    pool = mp.Pool(initializer=initprocess, initargs=[global_doc_vector])

    # Dispatch one task per document; each worker writes only its own row
    # of the shared array, so no lock is needed.
    for index, words in enumerate(docs):
        pool.apply_async(single_av_tf_idf_dv,
                         [words, global_word_set, index, doc_num,
                          global_vocabulary, tfidf_fv[index]])

    pool.close()
    pool.join()

    # View the flat shared buffer as a (doc_num, num_features) float64 matrix.
    doc_vector = np.frombuffer(global_doc_vector).reshape((doc_num, num_features))
    if save:
        np.save(save_file, doc_vector)
    return doc_vector
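
The helpers initprocess and single_av_tf_idf_dv are defined elsewhere in the
source and not shown here. A hypothetical sketch of what they would need to
do, matching the call sites above (an assumption, not the original
implementation):

import numpy as np

def initprocess(shared_doc_vector):
    # Hypothetical pool initializer: expose the shared, lock-free array as a
    # module-level global so each worker process can write into it.
    global global_doc_vector
    global_doc_vector = shared_doc_vector

def single_av_tf_idf_dv(words, word_vec_dict, index, doc_num, vocabulary, tfidf_row):
    # Hypothetical worker: build one document's tf-idf-weighted average
    # vector and write it into row `index` of the shared flat array.
    # doc_num is accepted only to match the apply_async call site.
    vec = tfidf_row.toarray()
    num_features = len(next(iter(word_vec_dict.values())))
    doc_vector = np.zeros(num_features)
    count = 0
    for word in words:
        if word in word_vec_dict and word in vocabulary:
            doc_vector += np.asarray(word_vec_dict[word]) * vec[0][vocabulary[word]]
            count += 1
    doc_vector /= (count + 1)  # same +1 smoothing as Example #1
    start = index * num_features
    global_doc_vector[start:start + num_features] = doc_vector.tolist()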
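
For context, a minimal usage sketch for either variant, assuming gensim 3.x
(where model.wv still exposes syn0 and index2word) and a pre-tokenized
corpus; the toy data below is illustrative, not from the source:

from gensim.models import Word2Vec

docs = [["the", "cat", "sat"], ["dogs", "bark", "loudly"]]
model = Word2Vec(docs, min_count=1)  # min_count=1 so every token gets a vector

# model.wv carries syn0, index2word and __getitem__, which is all that
# build_av_tf_idf_dv touches.
vectors = build_av_tf_idf_dv(docs, len(docs), model.wv, save=False, save_file=None)
print(vectors.shape)  # (2, 100) with gensim's default vector size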