def vectorizeShortDoc(raw_docs, word_vectors, is_refine=False, word_limit=100):
    """
    word vectors for each short doc
    """
    # tokenize
    print("vectorize short docs...")
    docs = []
    for raw_doc in raw_docs:
        docs.append(preprocess.tokenizeText(raw_doc))
    # optionally keep only the top word_limit terms per doc (tf-idf refinement)
    if is_refine:
        docs = tfidf_helper.extract(docs, word_limit)
    # look up the word vector for each remaining token in each doc
    docs_vecs = match_helper.findWordVectors(docs, word_vectors)
    return docs_vecs
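
# Example usage (a minimal sketch): raw_docs is a list of raw text strings and
# word_vectors is assumed to be a word -> vector mapping (e.g. gensim
# KeyedVectors) loaded elsewhere in the project.
#   short_vecs = vectorizeShortDoc(["great camera, weak battery"], word_vectors,
#                                  is_refine=True, word_limit=50)
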
def vectorizeLongDoc(raw_docs, word_vectors, topic_num=10, is_refine=False, word_limit=100):
    """
    raw_docs: a list of the concateation of reviewers' works
    vector space for each long doc
    """
    # tokenize
    print("vectorize long docs...")
    docs = []
    for raw_doc in raw_docs:
        docs.append(preprocess.tokenizeText(raw_doc))
    # optionally keep only the top word_limit terms per doc (tf-idf refinement)
    if is_refine:
        docs = tfidf_helper.extract(docs, word_limit)
    # extract topic_num hidden topics and their weights for each long doc
    docs_topics, topic_weights = match_helper.findHiddenTopics(docs, word_vectors, topic_num)
    return docs_topics, topic_weights
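
# Example usage (a minimal sketch; reviewer_papers is a hypothetical list of
# per-reviewer paper texts, and word_vectors comes from the surrounding project):
#   long_docs = [" ".join(papers) for papers in reviewer_papers]
#   doc_topics, topic_weights = vectorizeLongDoc(long_docs, word_vectors, topic_num=10)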