Example #1
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import normalize, scale

def build_doc_vector(dir_name, model, build_option, process_option=ProcessOption(),
                     save=True, save_file="doc_vector.bin",
                     to_normalize=False, to_scale=False,
                     cluster_factor=20, num_cpus=-2):
    sentences = Sentences(dir_name)
    docs = sentences.paragraph_iterator()
    doc_num = sentences.doc_num
    stop_words = set(stopwords.words("english"))
    tknzr = TweetTokenizer(preserve_case=False)
    post_docs = util.process_sentences(docs, tknzr, process_option, stop_words)
    if build_option == 1:  # average of word vectors
        doc_vector = build_average_dv(post_docs, doc_num, model, save,
                                      save_file)
    elif build_option == 2:  # tf-idf weighted average
        doc_vector = build_av_tf_idf_dv(post_docs, doc_num, model, save,
                                        save_file)
    else:  # bag of cluster centroids
        doc_vector = build_cluster_dv(post_docs, doc_num, model,
                                      cluster_factor, num_cpus, save,
                                      save_file)

    if to_normalize:
        doc_vector = normalize(doc_vector, copy=False)
    if to_scale:
        doc_vector = scale(doc_vector, copy=True)

    return doc_vector
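
A minimal usage sketch, assuming Sentences, ProcessOption, util and the build_* helpers come from the surrounding project (none are shown on this page) and that model is a trained gensim word2vec model; the file names and option values below are hypothetical:

from gensim.models import Word2Vec

model = Word2Vec.load("word2vec.model")  # hypothetical model file
doc_vector = build_doc_vector(
    "dataset",          # directory read by Sentences
    model,
    build_option=1,     # 1 = average, 2 = tf-idf weighted average, else cluster
    to_normalize=True,  # L2-normalize each document vector
)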


# "/Users/Crazyconv/Conv/DEVELOPMENT/GitFolder/Word2Vec2NLP/dataset"
Beispiel #2
0
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import normalize, scale

def build_doc_vector(dir_name, model, build_option, process_option=ProcessOption(),
                     save=True, save_file="doc_vector.bin",
                     to_normalize=False, to_scale=False,
                     cluster_factor=20, num_cpus=-2):
    sentences = Sentences(dir_name)
    docs = sentences.paragraph_iterator()
    doc_num = sentences.doc_num
    stop_words = set(stopwords.words("english"))
    tknzr = TweetTokenizer(preserve_case=False)
    post_docs = util.process_sentences(docs, tknzr, process_option, stop_words)
    if build_option == 1:  # average of word vectors
        doc_vector = build_average_dv(post_docs, doc_num, model, save, save_file)
    elif build_option == 2:  # tf-idf weighted average
        doc_vector = build_av_tf_idf_dv(post_docs, doc_num, model, save, save_file)
    else:  # bag of cluster centroids
        doc_vector = build_cluster_dv(post_docs, doc_num, model, cluster_factor, num_cpus, save, save_file)

    if to_normalize:
        doc_vector = normalize(doc_vector, copy=False)
    if to_scale:
        doc_vector = scale(doc_vector, copy=True)

    return doc_vector




# "/Users/Crazyconv/Conv/DEVELOPMENT/GitFolder/Word2Vec2NLP/dataset"    
from nltk.corpus import stopwords

def build_doc_vector(dir_name, model, build_option, process_option=ProcessOption(),
                     cluster_factor=20, num_cpus=-2):
    sentences = Sentences(dir_name)
    docs = sentences.paragraph_iterator()
    doc_num = sentences.doc_num
    stop_words = set(stopwords.words("english"))
    post_docs = util.process_sentences(docs, process_option, stop_words)
    if build_option == 1:  # average of word vectors
        doc_vector = build_average_dv(post_docs, doc_num, model)
    elif build_option == 2:  # tf-idf weighted average
        doc_vector = build_av_tf_idf_dv(post_docs, doc_num, model)
    else:  # bag of cluster centroids
        doc_vector = build_cluster_dv(post_docs, doc_num, model, cluster_factor, num_cpus)

    return doc_vector
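
The build_* helpers are not shown on this page. As a rough sketch of what the averaging branch typically computes, assuming a gensim-style model that supports "word in model", model[word] and model.vector_size (the body below is an assumption, not the project's code):

import numpy as np

def build_average_dv(post_docs, doc_num, model):
    # one row per document: the mean of the vectors of all
    # words the model knows; unknown words are skipped
    doc_vector = np.zeros((doc_num, model.vector_size))
    for i, doc in enumerate(post_docs):
        known = [model[w] for w in doc if w in model]
        if known:
            doc_vector[i] = np.mean(known, axis=0)
    return doc_vector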
Example #4
from nltk.corpus import stopwords

def build_doc_vector(dir_name,
                     model,
                     build_option,
                     process_option=ProcessOption(),
                     cluster_factor=20,
                     num_cpus=-2):
    sentences = Sentences(dir_name)
    docs = sentences.paragraph_iterator()
    doc_num = sentences.doc_num
    stop_words = set(stopwords.words("english"))
    post_docs = util.process_sentences(docs, process_option, stop_words)
    if build_option == 1:  # average of word vectors
        doc_vector = build_average_dv(post_docs, doc_num, model)
    elif build_option == 2:  # tf-idf weighted average
        doc_vector = build_av_tf_idf_dv(post_docs, doc_num, model)
    else:  # bag of cluster centroids
        doc_vector = build_cluster_dv(post_docs, doc_num, model,
                                      cluster_factor, num_cpus)

    return doc_vector
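
build_cluster_dv is likewise not shown. A sketch of the usual bag-of-centroids construction, under one plausible reading of cluster_factor (roughly one cluster per cluster_factor vocabulary words; num_cpus presumably feeds the clustering backend, where -2 conventionally means all cores but one): cluster the word vectors with k-means, then count, per document, how many words fall into each cluster.

import numpy as np
from sklearn.cluster import KMeans

def build_cluster_dv(post_docs, doc_num, model, cluster_factor, num_cpus):
    # assumed sketch, not the project's code; post_docs must be a
    # list of token lists since it is traversed twice
    vocab = sorted({w for doc in post_docs for w in doc if w in model})
    k = max(1, len(vocab) // cluster_factor)
    labels = KMeans(n_clusters=k).fit_predict(
        np.array([model[w] for w in vocab]))  # num_cpus handling omitted
    word2cluster = dict(zip(vocab, labels))
    doc_vector = np.zeros((doc_num, k))
    for i, doc in enumerate(post_docs):
        for w in doc:
            if w in word2cluster:
                doc_vector[i, word2cluster[w]] += 1
    return doc_vector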