# Imports assumed by the functions below: pickle/random from the standard
# library, gensim for the topic models and embeddings, and the `hlda`
# package for hierarchical LDA. Helper functions such as
# extract_important_words_tfidf, remove_low_high_frequent_words, get_tfidf
# and sent_vectorizer_word2vec are defined elsewhere in this project.
import pickle
import random

from gensim import models
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, Doc2Vec, Word2Vec
from gensim.models.doc2vec import TaggedDocument
from hlda.sampler import HierarchicalLDA  # assumed import path for the hlda package


def save_train_and_test(data):
    with open(data, 'rb') as file:
        # read the data as binary data stream
        print("... Reading the pre-processed data from local binary file...")
        documents = pickle.load(file)

    documents = extract_important_words_tfidf(
        documents, 0.60)  # extracting top 60% (TF-IDF) terms per document
    documents = remove_low_high_frequent_words(documents, 0.03, 1.0)

    tfidf = get_tfidf(documents)  # compute TF-IDF once and reuse both results
    corpus = tfidf["corpus_tfidf"]
    dictionary = tfidf["index2word"]

    corpus = list(corpus)

    random.shuffle(corpus)
    train = corpus[:18000]  # first 18,000 shuffled documents for training
    test = corpus[18000:]   # remaining documents for testing

    with open('data/train_corpus.data', 'wb') as file:
        print("...Saving training corpus into local binary file...")
        pickle.dump(train, file)

    with open('data/test_corpus.data', 'wb') as file:
        print("...Saving test corpus into local binary file...")
        pickle.dump(test, file)

    with open('data/common_dictionary.data', 'wb') as file:
        print(
            "...Saving common dictionary for train and test corpus into binary file..."
        )
        pickle.dump(dictionary, file)
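

# Usage sketch for save_train_and_test: the input path is a placeholder for
# the pickled, pre-processed list of tokenised documents, while the output
# paths are the ones hard-coded in the function above.
def demo_save_and_reload_split():
    save_train_and_test('data/preprocessed_docs.data')  # placeholder path

    # the train/test TF-IDF corpora and the shared dictionary can then be
    # reloaded from the files written above
    with open('data/train_corpus.data', 'rb') as file:
        train = pickle.load(file)
    with open('data/test_corpus.data', 'rb') as file:
        test = pickle.load(file)
    with open('data/common_dictionary.data', 'rb') as file:
        dictionary = pickle.load(file)
    print(len(train), "training docs,", len(test), "test docs,",
          len(dictionary), "dictionary entries")
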
def get_corpus_words(data):
    with open(data, 'rb') as file:
        # read the data as binary data stream
        print("... Reading the pre-processed data from local binary file...")
        documents = pickle.load(file)

    documents = extract_important_words_tfidf(
        documents, 0.60)  # extracting top 60% (TF-IDF) terms per document
    documents = remove_low_high_frequent_words(documents, 0.03, 1.0)
    dictionary = Dictionary(documents)

    return dictionary.values()  # returning all words in the data corpus
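

# Small sketch showing what get_corpus_words returns: the token strings of a
# gensim Dictionary built from the filtered documents (the path is again a
# placeholder for the pickled pre-processed data).
def demo_corpus_vocabulary():
    words = list(get_corpus_words('data/preprocessed_docs.data'))  # placeholder path
    print(len(words), "unique words in the corpus, e.g.:", words[:10])
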
def words_to_self_trained_word2vec(train_data, words):
    with open(train_data, 'rb') as file:
        # read the data as binary data stream
        print("... Reading the pre-processed data from local binary file...")
        documents = pickle.load(file)

    documents = extract_important_words_tfidf(
        documents, 0.60)  # extracting top 60% (TF-IDF) terms per document
    documents = remove_low_high_frequent_words(documents, 0.03, 1.0)

    # train a Word2Vec model on the filtered documents
    # (note: the `size` keyword was renamed to `vector_size` in gensim 4.x)
    word2vec_model = Word2Vec(documents,
                              size=100,      # embedding dimensionality
                              window=10,     # context window size
                              min_count=1,   # keep every token, even singletons
                              workers=10)

    # look up the embedding of each requested word; every word is in the
    # vocabulary because min_count=1 and the words come from the same corpus
    vectors = [word2vec_model.wv[word] for word in words]
    return vectors
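

# Sketch tying the two helpers above together: the returned vectors are
# aligned index-for-index with the word list passed in, so they can be zipped
# back into a word -> embedding mapping (the path is a placeholder).
def demo_word_embeddings():
    data_path = 'data/preprocessed_docs.data'  # placeholder path
    words = list(get_corpus_words(data_path))
    vectors = words_to_self_trained_word2vec(data_path, words)
    word_to_vec = dict(zip(words, vectors))
    print("embedding dimensionality:", len(word_to_vec[words[0]]))
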
def train_hlda_model(data):
    with open(data, 'rb') as file:
        # read the data as binary data stream
        print("... Reading the pre-processed data from local binary file...")
        documents = pickle.load(file)

    documents = extract_important_words_tfidf(
        documents, 0.60)  # extracting top 60% (TF-IDF) terms per document
    documents = remove_low_high_frequent_words(documents, 0.03, 1.0)

    # ++++++ preparing dictionary and corpus for the h-LDA library ++++++ #
    # h-LDA expects a sorted vocabulary list plus each document expressed as
    # a list of integer word indices into that vocabulary
    vocab = sorted({word for doc in documents for word in doc})
    vocab_index = {w: i for i, w in enumerate(vocab)}
    new_corpus = [[vocab_index[word] for word in doc] for doc in documents]
    # --------------------------------------------------------------- #

    hlda_model = HierarchicalLDA(
        corpus=new_corpus,
        vocab=vocab,
        alpha=10.0,  # default = 10.0
        gamma=1.0,  # default = 1.0
        eta=1.0,  # default = 0.1 ; ** taking non-default value **
        seed=0,  # default = 0
        verbose=True,  # default = True
        num_levels=3)  # default = 3

    return hlda_model, new_corpus, vocab_index
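

# Sketch showing how the returned structures fit together: new_corpus holds
# integer word indices and vocab_index maps each word to its index, so any
# document can be mapped back to words for inspection. The hlda_model itself
# is left untouched here; sampling is run via the hlda package's own API.
# The input path is a placeholder.
def demo_hlda_corpus_mapping():
    hlda_model, new_corpus, vocab_index = train_hlda_model(
        'data/preprocessed_docs.data')  # placeholder path
    index_to_word = {i: w for w, i in vocab_index.items()}
    first_doc_words = [index_to_word[idx] for idx in new_corpus[0]]
    print("first document as words:", first_doc_words[:10])
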
def compute_coherence_values_topic_num(data, limit, start=2, step=1):
    """
    Compute u_mass coherence for various number of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    with open(data, 'rb') as file:
        # read the data as binary data stream
        print("... Reading the pre-processed data from local binary file...")
        documents = pickle.load(file)

    documents = extract_important_words_tfidf(
        documents, 0.60)  # extracting top 60% (TF-IDF) terms per document
    documents = remove_low_high_frequent_words(documents, 0.03, 1.0)

    tfidf = get_tfidf(documents)  # compute TF-IDF once and reuse both results
    corpus = tfidf["corpus_tfidf"]
    dictionary = tfidf["index2word"]

    coherence_values = []
    model_list = []

    for num_topics in range(start, limit, step):  # limit is an exclusive upper bound
        model = models.ldamodel.LdaModel(corpus=corpus,
                                         id2word=dictionary,
                                         num_topics=num_topics,
                                         eta=0.3)
        model_list.append(model)

        coherencemodel = CoherenceModel(model=model,
                                        dictionary=dictionary,
                                        corpus=corpus,
                                        coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values
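

# Sketch for choosing the number of topics from the coherence values above:
# plot u_mass coherence against the topic count and look for a plateau or
# peak (matplotlib is assumed to be available; the data path is a placeholder).
def demo_plot_coherence(data='data/preprocessed_docs.data', limit=20, start=2, step=1):
    import matplotlib.pyplot as plt

    _, coherence_values = compute_coherence_values_topic_num(data, limit, start, step)
    topic_counts = list(range(start, limit, step))
    plt.plot(topic_counts, coherence_values)
    plt.xlabel("number of topics")
    plt.ylabel("u_mass coherence")
    plt.show()
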
def self_trained_word2vec(training_corpus, topic_words):
    with open(training_corpus, 'rb') as file:
        # read the data as binary data stream
        print("... Reading the pre-processed data from local binary file...")
        documents = pickle.load(file)

    documents = extract_important_words_tfidf(
        documents, 0.60)  # extracting top 60% (TF-IDF) terms per document
    documents = remove_low_high_frequent_words(documents, 0.03, 1.0)

    # train word2vec model with the documents
    word2vec_model = Word2Vec(documents,
                              size=100,
                              window=10,
                              min_count=1,
                              workers=10)

    topics_in_vector = []
    for terms_list in topic_words:
        topics_in_vector.append(
            sent_vectorizer_word2vec(terms_list, word2vec_model))

    return topics_in_vector  # list of vectors (vector per topic)
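

# Sketch showing one way to use the per-topic vectors returned above, e.g. to
# measure how semantically close two topics are via cosine similarity.
# Assumptions: numpy is available, topic_words contains at least two topics,
# and topic_words would typically come from a trained topic model (see
# train_lda_model below).
def demo_topic_similarity(training_corpus, topic_words):
    import numpy as np

    topic_vectors = self_trained_word2vec(training_corpus, topic_words)
    a, b = np.asarray(topic_vectors[0]), np.asarray(topic_vectors[1])
    cosine = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    print("cosine similarity between topic 0 and topic 1:", cosine)
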
def self_trained_doc2vec(training_corpus, topic_words):
    with open(training_corpus, 'rb') as file:
        # read the data as binary data stream
        print("... Reading the pre-processed data from local binary file...")
        documents = pickle.load(file)

    documents = extract_important_words_tfidf(
        documents, 0.60)  # extracting top 60% (TF-IDF) terms per document
    documents = remove_low_high_frequent_words(documents, 0.03, 1.0)
    documents = [" ".join(doc) for doc in documents]

    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
    doc2vec_model = Doc2Vec(documents,
                            vector_size=10,
                            window=2,
                            min_count=1,
                            workers=5)

    topics_in_vector = []
    for terms_list in topic_words:
        topics_in_vector.append(doc2vec_model.infer_vector(terms_list))

    return topics_in_vector
def train_lda_model(data, num_topics):
    with open(data, 'rb') as file:
        # read the data as binary data stream
        print("... Reading the pre-processed data from local binary file...")
        documents = pickle.load(file)

    documents = extract_important_words_tfidf(
        documents, 0.60)  # extracting top 60% (TF-IDF) terms per document
    documents = remove_low_high_frequent_words(documents, 0.03, 1.0)

    tfidf = get_tfidf(documents)  # compute TF-IDF once and reuse both results
    corpus = tfidf["corpus_tfidf"]
    in2word = tfidf["index2word"]

    lda_model = models.ldamodel.LdaModel(
        corpus=corpus,
        num_topics=num_topics,
        id2word=in2word,
        distributed=False,  # default: False
        chunksize=2000,  # default: 2000
        passes=1,  # default: 1
        update_every=1,  # default: 1
        alpha='symmetric',  # default: 'symmetric'
        eta=0.3,  # default: None ; ** taking non-default value **
        decay=0.5,  # default: 0.5
        offset=1.0,  # default: 1.0
        eval_every=10,  # default: 10
        iterations=50,  # default: 50
        gamma_threshold=0.001,  # default: 0.001
        minimum_probability=0.01,  # default: 0.01
        random_state=None,  # default: None
        ns_conf=None,  # default: None
        minimum_phi_value=0.01,  # default: 0.01
        per_word_topics=False,  # default: False
        callbacks=None  # default: None
    )

    return lda_model, corpus, in2word
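

# Sketch extracting the top words per topic from the trained model; the
# resulting lists of words have the shape expected by self_trained_word2vec
# and self_trained_doc2vec above. gensim's show_topics with formatted=False
# returns (topic_id, [(word, weight), ...]) pairs; the data path is a placeholder.
def demo_lda_topic_words(data='data/preprocessed_docs.data', num_topics=10):
    lda_model, corpus, in2word = train_lda_model(data, num_topics)
    topic_words = [[word for word, _ in words]
                   for _, words in lda_model.show_topics(num_topics=num_topics,
                                                         num_words=10,
                                                         formatted=False)]
    print(topic_words[0])
    return topic_words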