def build_average_dv(docs, doc_num, model,
                     save=True, save_file="doc_vector_ave.bin"):
    """Build per-document vectors by averaging word vectors, in parallel.

    Args:
        docs: iterable of tokenized documents (each an iterable of words).
        doc_num: number of documents; fixes the row count of the output.
        model: word2vec-style model; vectors are read from ``model.syn0``.
        save: when True, persist the result with ``np.save``.
        save_file: destination path (NOTE: ``np.save`` appends ".npy").

    Returns:
        ndarray of shape (doc_num, num_features) with one averaged
        vector per document.
    """
    num_features = model.syn0.shape[1]
    manager = Manager()
    # Lock-free shared buffer: each worker is expected to write only its
    # own document's slice, so no synchronization is needed.
    global_doc_vector = mp.Array('d', doc_num * num_features, lock=False)
    global_word_set = manager.dict(util.get_word_vec_dict(model))
    pool = mp.Pool(initializer=initprocess, initargs=[global_doc_vector])
    # Keep the AsyncResult handles: apply_async swallows worker exceptions
    # unless .get() is called on each result afterwards.
    results = [
        pool.apply_async(single_average_dv,
                         [words, global_word_set, index, doc_num])
        for index, words in enumerate(docs)
    ]
    pool.close()
    pool.join()
    for res in results:
        res.get()  # re-raise any exception that occurred in a worker
    doc_vector = np.frombuffer(global_doc_vector).reshape(
        (doc_num, num_features))
    if save:
        np.save(save_file, doc_vector)
    return doc_vector
def build_av_tf_idf_dv(docs, doc_num, model,
                       save=True, save_file="doc_vector_tfidf.bin"):
    """Build per-document vectors as tf-idf-weighted word-vector averages.

    Args:
        docs: iterable of tokenized documents (each an iterable of words).
        doc_num: number of documents; fixes the row count of the output.
        model: word2vec-style model; vectors are read from ``model.syn0``.
        save: when True, persist the result with ``np.save``.
        save_file: destination path (NOTE: ``np.save`` appends ".npy").

    Returns:
        ndarray of shape (doc_num, num_features), one weighted vector
        per document.
    """
    docs = list(docs)  # materialize: iterated once here, indexed below
    vectorizer = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    count_fv = vectorizer.fit_transform(util.word2sentence(docs))
    tfidf_fv = tfidf_transformer.fit_transform(count_fv)
    num_features = model.syn0.shape[1]
    manager = Manager()
    global_word_set = manager.dict(util.get_word_vec_dict(model))
    global_vocabulary = manager.dict(vectorizer.vocabulary_)
    # Lock-free shared buffer: each worker is expected to write only its
    # own document's slice, so no synchronization is needed.
    global_doc_vector = mp.Array('d', doc_num * num_features, lock=False)
    pool = mp.Pool(initializer=initprocess, initargs=[global_doc_vector])
    # Keep the AsyncResult handles: apply_async swallows worker exceptions
    # unless .get() is called on each result afterwards.
    results = [
        pool.apply_async(single_av_tf_idf_dv,
                         [words, global_word_set, index, doc_num,
                          global_vocabulary, tfidf_fv[index]])
        for index, words in enumerate(docs)
    ]
    pool.close()
    pool.join()
    for res in results:
        res.get()  # re-raise any exception that occurred in a worker
    doc_vector = np.frombuffer(global_doc_vector).reshape(
        (doc_num, num_features))
    if save:
        np.save(save_file, doc_vector)
    return doc_vector
def build_average_dv(docs, doc_num, model,
                     save=True, save_file="doc_vector_ave.bin"):
    """Compute one averaged word-vector per document using a process pool.

    Args:
        docs: iterable of tokenized documents (each an iterable of words).
        doc_num: number of documents (output row count).
        model: word2vec-style model exposing ``model.syn0``.
        save: when True, write the matrix via ``np.save``.
        save_file: output path (NOTE: ``np.save`` appends ".npy").

    Returns:
        ndarray of shape (doc_num, num_features).
    """
    num_features = model.syn0.shape[1]
    manager = Manager()
    # Shared, lock-free flat buffer; workers write disjoint row slices.
    global_doc_vector = mp.Array('d', doc_num * num_features, lock=False)
    global_word_set = manager.dict(util.get_word_vec_dict(model))
    pool = mp.Pool(initializer=initprocess, initargs=[global_doc_vector])
    # Retain AsyncResults — without .get(), apply_async silently discards
    # any exception raised inside a worker process.
    pending = [
        pool.apply_async(single_average_dv,
                         [words, global_word_set, idx, doc_num])
        for idx, words in enumerate(docs)
    ]
    pool.close()
    pool.join()
    for task in pending:
        task.get()  # surface worker-side failures
    doc_vector = np.frombuffer(global_doc_vector).reshape(
        (doc_num, num_features))
    if save:
        np.save(save_file, doc_vector)
    return doc_vector
def build_av_tf_idf_dv(docs, doc_num, model,
                       save=True, save_file="doc_vector_tfidf.bin"):
    """Compute tf-idf-weighted average word-vectors per document in parallel.

    Args:
        docs: iterable of tokenized documents (each an iterable of words).
        doc_num: number of documents (output row count).
        model: word2vec-style model exposing ``model.syn0``.
        save: when True, write the matrix via ``np.save``.
        save_file: output path (NOTE: ``np.save`` appends ".npy").

    Returns:
        ndarray of shape (doc_num, num_features).
    """
    docs = list(docs)  # materialize so the iterable can be indexed
    vectorizer = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    count_fv = vectorizer.fit_transform(util.word2sentence(docs))
    tfidf_fv = tfidf_transformer.fit_transform(count_fv)
    num_features = model.syn0.shape[1]
    manager = Manager()
    global_word_set = manager.dict(util.get_word_vec_dict(model))
    global_vocabulary = manager.dict(vectorizer.vocabulary_)
    # Shared, lock-free flat buffer; workers write disjoint row slices.
    global_doc_vector = mp.Array('d', doc_num * num_features, lock=False)
    pool = mp.Pool(initializer=initprocess, initargs=[global_doc_vector])
    # Retain AsyncResults — without .get(), apply_async silently discards
    # any exception raised inside a worker process.
    pending = [
        pool.apply_async(single_av_tf_idf_dv,
                         [words, global_word_set, idx, doc_num,
                          global_vocabulary, tfidf_fv[idx]])
        for idx, words in enumerate(docs)
    ]
    pool.close()
    pool.join()
    for task in pending:
        task.get()  # surface worker-side failures
    doc_vector = np.frombuffer(global_doc_vector).reshape(
        (doc_num, num_features))
    if save:
        np.save(save_file, doc_vector)
    return doc_vector