def vectorizeShortDoc(raw_docs, word_vectors, is_refine=False, word_limit=100):
    """
    Build a word-vector representation for each short doc.
    """
    print("vectorize short docs...")
    # tokenize each raw document
    docs = []
    for raw_doc in raw_docs:
        docs.append(preprocess.tokenizeText(raw_doc))
    #docs = preprocess.tokenizeText(raw_docs)
    # optionally keep only the top tf-idf words of each doc
    if is_refine:
        docs = tfidf_helper.extract(docs, word_limit)
    docs_vecs = match_helper.findWordVectors(docs, word_vectors)
    return docs_vecs
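
# Usage sketch for vectorizeShortDoc (not part of the original pipeline).
# Assumptions: the embeddings load as gensim KeyedVectors (the project may
# load them differently), the preprocess / tfidf_helper / match_helper modules
# are importable from this package, and "embeddings.bin" is a hypothetical path.
def _demoVectorizeShortDoc():
    from gensim.models import KeyedVectors
    word_vectors = KeyedVectors.load_word2vec_format("embeddings.bin", binary=True)
    abstracts = [
        "Neural topic models for short document representation.",
        "Keyword extraction with tf-idf weighting.",
    ]
    # keep only each abstract's top-50 tf-idf words before the vector lookup
    vecs = vectorizeShortDoc(abstracts, word_vectors, is_refine=True, word_limit=50)
    print("short doc vectors:", len(vecs))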

def vectorizeLongDoc(raw_docs, word_vectors, topic_num=10, is_refine=False, word_limit=100):
    """
    Build a topic-based vector space for each long doc.

    raw_docs: a list where each element is the concatenation of one
    reviewer's works.
    """
    print("vectorize long docs...")
    # tokenize each raw document
    docs = []
    for raw_doc in raw_docs:
        docs.append(preprocess.tokenizeText(raw_doc))
    #docs = preprocess.tokenizeText(raw_docs)
    # optionally refine each doc to its top tf-idf words
    if is_refine:
        docs = tfidf_helper.extract(docs, word_limit)
    docs_topics, topic_weights = match_helper.findHiddenTopics(docs, word_vectors, topic_num)
    return docs_topics, topic_weights
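
# Usage sketch for vectorizeLongDoc (not part of the original pipeline).
# Same assumptions as above: gensim KeyedVectors embeddings at a hypothetical
# path and importable helper modules. Each raw doc concatenates one reviewer's
# works; topic_num is forwarded to match_helper.findHiddenTopics, which is
# assumed to return one topic set and one weight set per reviewer.
def _demoVectorizeLongDoc():
    from gensim.models import KeyedVectors
    word_vectors = KeyedVectors.load_word2vec_format("embeddings.bin", binary=True)
    reviewer_profiles = [
        "First reviewer: abstracts of all published papers joined into one string.",
        "Second reviewer: abstracts of all published papers joined into one string.",
    ]
    # refine to the top-200 tf-idf words per reviewer, then extract 5 topics each
    docs_topics, topic_weights = vectorizeLongDoc(
        reviewer_profiles, word_vectors, topic_num=5, is_refine=True, word_limit=200)
    print("topic sets:", len(docs_topics), "weight sets:", len(topic_weights))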