Ejemplo n.º 1
0
def lda_tuner(ingroup_otu, best_models):
    """Grid-search LDA priors and record every evaluated model.

    Fits one LDA per (n_topics, doc_topic_prior, topic_word_prior)
    combination on a fresh 50/50 train/test split of ``ingroup_otu`` and
    appends a result dict to ``best_models``.  After the last prior pair of
    each topic count, one extra model with symmetric ``1/n_topics`` priors
    is evaluated as well.

    Fixes over the original: Python 2 ``print`` statements (a syntax error
    under Python 3) and a 25-line block duplicated verbatim, now factored
    into ``_evaluate``.

    Parameters
    ----------
    ingroup_otu : pandas.DataFrame
        Count table; ``.values`` is used as the document-term matrix.
    best_models : list
        Mutated in place (one dict per evaluation); also returned.
    """
    best_score = -np.inf
    dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    topic_series = [3]
    X = ingroup_otu.values
    eval_counter = 0

    def _evaluate(topics, dtp, twp, counter, current_best):
        # Fit one LDA on a fresh split and report its held-out likelihood.
        X_train, X_test = train_test_split(X, test_size=0.5)
        lda = LatentDirichletAllocation(n_topics=topics,
                                        doc_topic_prior=dtp,
                                        topic_word_prior=twp,
                                        learning_method='batch',
                                        random_state=42,
                                        max_iter=20)
        lda.fit(X_train)
        score = lda.score(X_test)
        perplexity = lda.perplexity(X_test)
        if score > current_best:
            current_best = score
            print("New Max Likelihood: {}".format(current_best))
        print("#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}".format(
            counter, topics, dtp, twp, score, perplexity))
        best_models.append({'n': topics, 'dtp': dtp, 'twp': twp,
                            'score': score, 'perp': perplexity})
        return current_best

    for topics in topic_series:
        for dtp in dtp_series:
            for twp in twp_series:
                eval_counter += 1
                best_score = _evaluate(topics, dtp, twp,
                                       eval_counter, best_score)
                if (dtp == dtp_series[-1]) and (twp == twp_series[-1]):
                    # Extra run with the symmetric 1/n_topics default priors.
                    eval_counter += 1
                    best_score = _evaluate(topics, 1. / topics, 1. / topics,
                                           eval_counter, best_score)
    return best_models
Ejemplo n.º 2
0
def test_lda_score():
    # Batch/online LDA: the training-data log-likelihood must not decrease
    # when the model is trained for more iterations.
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        short_run = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
                                              learning_method=method,
                                              total_samples=100,
                                              random_state=0)
        long_run = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                             learning_method=method,
                                             total_samples=100,
                                             random_state=0)
        short_run.fit_transform(X)
        score_short = short_run.score(X)

        long_run.fit_transform(X)
        score_long = long_run.score(X)
        assert_greater_equal(score_long, score_short)
Ejemplo n.º 3
0
def get_score(filepath, min_word_count, num_topics, max_df_, min_df_):
    """Fit an LDA model on one corpus and return its per-token score.

    Parameters
    ----------
    filepath : str
        Corpus location, forwarded to ``load_corpus``.
    min_word_count : int
        Minimum word count, forwarded to ``load_corpus``.
    num_topics : int
        Number of LDA components to fit.
    max_df_, min_df_ :
        Document-frequency cutoffs for the CountVectorizer.

    Returns
    -------
    (score, (num_segs, raw_corpus_len), num_features) where ``score`` is the
    corpus log-likelihood normalized by the total token count.
    """
    text_corpus, raw_corpus, filepath = load_corpus(min_word_count, filepath)

    num_segs = len(text_corpus)

    # Document-term matrix via a lemmatizing CountVectorizer.
    vectorizer = CountVectorizer(stop_words='english',
                                 lowercase=True,
                                 max_df=max_df_,
                                 min_df=min_df_,
                                 tokenizer=LemmaTokenizer())
    dt_matrix = vectorizer.fit_transform(text_corpus)

    feature_names = vectorizer.get_feature_names()
    num_features = len(feature_names)

    # BUG FIX: the original used the global NUM_TOPICS here, silently
    # ignoring the ``num_topics`` argument.
    lda = LatentDirichletAllocation(n_components=num_topics,
                                    max_iter=5,
                                    learning_method='batch')
    lda.fit_transform(dt_matrix)

    # Normalize the log-likelihood by the corpus token count.
    score = lda.score(dt_matrix) / get_num_tokens(dt_matrix)

    return score, (num_segs, len(raw_corpus)), num_features
Ejemplo n.º 4
0
def computeLDA(analyzer, xCol: str, nWords: int, n_topics: int, file: str):
    """ Compute LDA process for 1 file."""
    print(f'processing LDA for {file} and [{n_topics}] topics...')

    # Results land in a per-file / per-topic-count output directory.
    outputDir = createOutputDirectory(file, n_topics)
    rawCorpus = readCorpus(file)
    # Only X_train is needed; y_train (the second element) is discarded.
    X_train = prepareXyTrain(rawCorpus, xCol, rawCorpus.columns[0])[0]

    vectorizer = getVectorizer('lda', analyzer)
    sparseX = vectorizer.fit_transform(X_train)
    # LDA with default parameters; dfVectorized holds topic probabilities,
    # one row per document and one column per topic.
    model = LatentDirichletAllocation(n_components=n_topics, n_jobs=-1)
    dfVectorized = pd.DataFrame(model.fit_transform(sparseX))

    # Two dataframes to visualize frequencies and percentages.
    dfCounts, dfProbasNormalized = prepareDfs(dfVectorized)
    prepareBarPLots(dfCounts, dfProbasNormalized, outputDir)

    # Row-normalize and round the components (shape: n_topics x n_features).
    rowTotals = model.components_.sum(axis=1)[:, np.newaxis]
    probas = model.components_ / rowTotals
    rProbas = np.apply_along_axis(lambda row: np.round(row, 4), 1, probas)

    # Produce and save a dataframe holding the top features per topic.
    features = vectorizer.get_feature_names()
    headers = ["topic_" + str(c) for c in dfVectorized.columns]
    topDf = dfTopFeatures(features, headers, rProbas, ['word', 'proba'],
                          'topic', nWords)
    topDf.to_csv(f'{outputDir}topFeatures.csv')

    return [model.score(sparseX), model.perplexity(sparseX)]
Ejemplo n.º 5
0
def test_lda_score(method):
    # More training iterations must yield an equal-or-higher log-likelihood
    # on the training data.
    n_components, X = _build_sparse_mtx()
    one_iter = LatentDirichletAllocation(n_components=n_components,
                                         max_iter=1, learning_method=method,
                                         total_samples=100, random_state=0)
    ten_iter = LatentDirichletAllocation(n_components=n_components,
                                         max_iter=10, learning_method=method,
                                         total_samples=100, random_state=0)
    one_iter.fit_transform(X)
    score_one = one_iter.score(X)

    ten_iter.fit_transform(X)
    score_ten = ten_iter.score(X)
    assert score_ten >= score_one
def main():
    """Run LDA concept detection over the loaded corpus and print diagnostics."""
    print("\n-----LDA CONCEPT DETECITON-----")
    text_corpus, text_corpus_ids, raw_corpus, raw_corpus_ids, filepath = load_corpus(
        'v')

    concepts_raw = load_document(CONCEPTS_PATH)
    concepts = parse_concepts(concepts_raw)
    num_segs = len(text_corpus)
    print("MAX_DF: " + str(MAX_DF))
    print("MIN_DF: " + str(MIN_DF))
    print("Number of Segs: %d/%d" % (len(text_corpus), len(raw_corpus)))

    # Build the document-term matrix with a lemmatizing CountVectorizer.
    vectorizer = CountVectorizer(stop_words='english',
                                 lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 tokenizer=LemmaTokenizer())
    dt_matrix = vectorizer.fit_transform(text_corpus)

    feature_names = vectorizer.get_feature_names()
    print("Number of Features: " + str(len(feature_names)))

    print("initialize model")
    lda = LatentDirichletAllocation(n_components=NUM_TOPICS,
                                    max_iter=400,
                                    learning_method='batch')

    print('fit model to corpus')
    doc_topic_matrix = lda.fit_transform(dt_matrix)
    topic_term_matrix = lda.components_

    # Per-token log-likelihood of the corpus under the fitted model.
    print("Score: " + str(lda.score(dt_matrix) / get_num_tokens(dt_matrix)))

    print("running elbow")
    # 10 = number of words shown per topic distribution.
    topic_str_list = print_topics(lda, feature_names, 10)
    run_elbow(lda, feature_names)

    return 0
class LDATopicGen:
    """Small wrapper around sklearn's LatentDirichletAllocation: fit a
    document-term matrix, report perplexity/likelihood, and plot topics."""

    def __init__(self, data, topics=5):
        # data: document-term matrix; topics: number of LDA components.
        self.data = data
        self.components = topics
        self.model = None

    def fit_predict(self):
        """Fit LDA on the stored data and return the document-topic matrix."""
        self.model = LatentDirichletAllocation(n_components=self.components,
                                               random_state=0)
        doc_topics = self.model.fit_transform(self.data)

        print("LDA Perplexity Score %s" % self.model.perplexity(self.data))
        print("LDA Log Likelihood Score %s" % self.model.score(self.data))

        return doc_topics

    def plot(self):
        """Clustermap of the row-normalized topic-word distributions."""
        norm = matplotlib.colors.Normalize(-1, 1)
        anchors = [[norm(-1.0), "midnightblue"], [norm(-0.5), "seagreen"],
                   [norm(0.5), "mediumspringgreen"], [norm(1.0), "yellow"]]
        cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", anchors)

        row_totals = self.model.components_.sum(axis=1)[:, np.newaxis]
        sns.clustermap(self.model.components_ / row_totals,
                       linewidth=0.5,
                       cmap=cmap)
        plt.show()
Ejemplo n.º 8
0
def get_lda_model(X, y):
    """Fit an online-learning LDA model on X and print its diagnostics."""
    from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
    from pprint import pprint

    lda_model = LatentDirichletAllocation(
        n_components=20,    # number of topics
        max_iter=10,        # max learning iterations
        learning_method='online',
        random_state=100,
        batch_size=128,     # docs per online learning step
        evaluate_every=-1,  # never compute perplexity during fit
        n_jobs=-1,          # all available CPUs
    )
    lda_output = lda_model.fit_transform(X, y)

    print(lda_model)  # model attributes

    # Log-likelihood: higher is better (score() ignores y).
    print("Log Likelihood: ", lda_model.score(X, y))

    # Perplexity: lower is better; perplexity = exp(-1 * log-likelihood/word).
    # print("Perplexity: ", lda_model.perplexity(X, y))  # was raising an error

    pprint(lda_model.get_params())
    return lda_model
def generate_topics():
    """Fit, evaluate, persist and store one LDA model per configured domain.

    For each domain in ``c.domains``: load its corpus from the database, fit
    an online LDA model with the configured topic count, print evaluation
    metrics, save the model, and write its topic/paper distributions back to
    the database.

    Fix: the original used Python 2 ``print`` statements, which are syntax
    errors under Python 3; they are now ``print()`` calls with identical
    space-separated output.
    """
    db, cursor = dbConnect()
    for domain in c.domains:
        start_time = time.time()
        papers, tf, feature_names = load_corpus(domain, db)
        lda = LatentDirichletAllocation(n_topics=c.domain_topics[domain],
                                        max_iter=5,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
        lda.fit(tf)
        # ---------- model evaluation ----------
        perplexity1 = lda.perplexity(tf)
        # NOTE(review): _e_step is a private sklearn API and may break
        # across sklearn versions.
        perplexity2 = lda.perplexity(tf, lda._e_step(tf, False, False)[0])
        score = lda.score(tf, lda._e_step(tf, False, False)[0])
        topic_paper_dist = lda.transform(tf)
        print("for", c.domain_topics[domain], domain,
              "topics ==> perplexity:", perplexity2,
              "log likelihood:", score)

        save_model(lda, domain, c.domain_topics[domain], feature_names)
        store_in_db(db, lda, topic_paper_dist, papers, feature_names, domain)
        print("--- time for " + domain + ": " +
              str((time.time() - start_time) / 60) + " minutes ---")
 def train_model(self,
                 n_components,
                 learning_offset=10.0,
                 learning_decay=0.7,
                 max_doc_update_iter=100,
                 n_jobs=-1):
     '''
     Train an LDA model on this object's term-frequency vector.
     :param n_components: number of LDA topics
     :param learning_offset: down-weights early iterations in online learning
     :param learning_decay: learning-rate decay for online learning
     :param max_doc_update_iter: max E-step iterations per document
     :param n_jobs: number of worker processes (-1 = all CPUs)
     :return: lda model
     '''
     ldaModel = LatentDirichletAllocation(
         n_components=n_components,
         learning_decay=learning_decay,
         learning_offset=learning_offset,
         max_doc_update_iter=max_doc_update_iter,
         n_jobs=n_jobs)
     # Fit on self.tfVector but score/perplexity on self._get_tf_vector();
     # presumably the same matrix -- TODO confirm in the owning class.
     ldaModel.fit(self.tfVector)
     print('The Log Likelihood Score:{}'.format(
         np.round(ldaModel.score(self._get_tf_vector()), 3)))
     print('The Perplexity:{}'.format(
         np.round(ldaModel.perplexity(self._get_tf_vector()), 3)))
     return ldaModel
Ejemplo n.º 11
0
def cluster_sk_latent_dirichlet_allocation(content):
    """ SK LDA """
    model = LatentDirichletAllocation(
        n_components=content['n_components'],
        doc_topic_prior=None,
        topic_word_prior=None,
        learning_method=content['learning_method'],
        learning_decay=content['learning_decay'],
        learning_offset=content['learning_offset'],
        max_iter=10,
        batch_size=128,
        mean_change_tol=content['mean_change_tol'],
        n_jobs=-1)
    data = content['data']
    transformed = model.fit(data).transform(data)
    payload = {
        'result': transformed.tolist(),
        'components': model.components_.tolist(),
        'batchIter': model.n_batch_iter_,
        'nIter': model.n_iter_,
        'perplexity': model.perplexity(data),
        'score': model.score(data)
    }
    # NOTE(review): ``ignore_nan`` is a simplejson extension, not stdlib json.
    return httpWrapper(json.dumps(payload, ignore_nan=True))
Ejemplo n.º 12
0
def topicmodel(comments):
    """Pick the best LDA topic count (2-9) for a comment corpus.

    Each candidate model is fit on the full corpus and keyed by its
    log-likelihood; the highest-likelihood model assigns every comment to
    its dominant topic.

    Fixes over the original: Python 2 ``print`` statements (syntax errors
    under Python 3), and ``.encode('utf8')`` calls that return ``bytes``
    under Python 3 and would make ``" ".join`` raise TypeError.

    Parameters
    ----------
    comments : iterable of dict
        Each dict must carry the comment body under the ``'text'`` key.

    Returns
    -------
    collections.defaultdict(list)
        Maps topic index -> list of comment texts assigned to that topic.
    """
    raw_texts = []
    texts = []
    for comment in comments:
        body = comment['text']
        raw_texts.append(body)
        texts.append(body)

    tf_vectorizer = CountVectorizer(
        max_df=.20,
        min_df=10,
        stop_words=stopwords)
    dtm = tf_vectorizer.fit_transform(texts)

    # Fit one model per candidate k, keyed by its log-likelihood.
    candidates = {}
    for k in range(2, 10):
        print("Testing", k)
        model = LatentDirichletAllocation(
            n_topics=k,
            max_iter=5,
            learning_method='batch',
            learning_offset=50.,
            random_state=0)
        model.fit(dtm)
        candidates[model.score(dtm)] = model

    best_ll = max(candidates.keys())
    model = candidates[best_ll]

    ret = collections.defaultdict(list)

    # Debug: print the top words of each topic of the chosen model.
    feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join(feature_names[i]
                       for i in topic.argsort()[:-5 - 1:-1]))
        print()

    for i, doc_topics in enumerate(model.transform(dtm)):
        best_topic = numpy.argmax(doc_topics)
        ret[best_topic].append(raw_texts[i])

    return ret
Ejemplo n.º 13
0
def test_lda_score():
    # Training longer (max_iter 10 vs 1) must not lower the training-data
    # log-likelihood, for both learning methods.
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        baseline = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
                                             learning_method=method,
                                             total_samples=100,
                                             random_state=0)
        trained = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                            learning_method=method,
                                            total_samples=100,
                                            random_state=0)
        baseline.fit_transform(X)
        baseline_score = baseline.score(X)

        trained.fit_transform(X)
        trained_score = trained.score(X)
        assert_greater_equal(trained_score, baseline_score)
def wybor(topics, data):
    """Evaluate LDA models over candidate topic counts for each dataset.

    Fixes over the original: it shadowed both parameters inside its loops
    (so a second dataset iterated over an int and raised TypeError), it
    never called ``fit`` before ``score``/``perplexity`` (NotFittedError),
    and it discarded all results.

    Parameters
    ----------
    topics : iterable of int
        Candidate numbers of topics.
    data : iterable
        Document-term matrices to evaluate.

    Returns
    -------
    list of (loglikelihood, perplexity) tuples of lists, one per dataset,
    each list holding one value per candidate topic count.
    """
    results = []
    for matrix in data:
        loglikelihood = []
        perplexity = []
        for n in topics:
            lda = LatentDirichletAllocation(n_topics=n,
                                            learning_method="batch",
                                            max_iter=25, random_state=0)
            lda.fit(matrix)  # was missing: score on an unfitted model raises
            loglikelihood.append(lda.score(matrix))
            perplexity.append(lda.perplexity(matrix))
        results.append((loglikelihood, perplexity))
    return results
Ejemplo n.º 15
0
def main():
    """LDA concept detection: fit LDA, print topics, then keyword segments."""
    print("\n-----LDA CONCEPT DETECITON-----")
    text_corpus, raw_corpus = load_corpus()

    print("MAX_DF: " + str(MAX_DF))
    print("MIN_DF: " + str(MIN_DF))
    print("Number of Segs: %d/%d" % (len(text_corpus), len(raw_corpus)))

    # Document-term matrix from a lemmatizing CountVectorizer.
    vectorizer = CountVectorizer(stop_words='english',
                                 lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 tokenizer=LemmaTokenizer())
    dt_matrix = vectorizer.fit_transform(text_corpus)

    feature_names = vectorizer.get_feature_names()
    print("Number of Features: " + str(len(feature_names)))

    lda = LatentDirichletAllocation(n_components=NUM_TOPICS,
                                    max_iter=1000,
                                    learning_method='batch')
    doc_topic_matrix = lda.fit_transform(dt_matrix)
    topic_term_matrix = lda.components_

    # Per-token log-likelihood of the corpus under the fitted model.
    print("Score: " + str(lda.score(dt_matrix) / get_num_tokens(dt_matrix)))

    # How prevalent each topic is across the corpus, then the topics.
    topic_prev = get_topic_prevelance(doc_topic_matrix, NUM_TOPICS,
                                      len(text_corpus))
    print_topics(lda, feature_names, 10, topic_prev)

    # Segments strongly associated with each topic, filtered by keywords.
    top_segs = get_top_segs_threshold(NUM_TOPICS, doc_topic_matrix,
                                      TOPIC_PRESSENCE_THRESHOLD)
    kw_per_topic = get_key_words(NUM_TOPICS)
    kw_segs = get_kw_segs(kw_per_topic, top_segs, text_corpus)

    print("--------------SEGMENTS CONTAINING KW--------------")
    for topic_idx, segs in enumerate(kw_segs):
        print("\nTOPIC: %d\n" % (topic_idx))
        for seg in segs:
            print("--------------")
            print("Seg: " + str(seg))

    return 0
Ejemplo n.º 16
0
def test_lda_score_perplexity():
    # perplexity(X) must equal exp(-score(X) / total token count).
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                    random_state=0)
    lda.fit(X)
    reported = lda.perplexity(X, sub_sampling=False)
    derived = np.exp(-1. * (lda.score(X) / np.sum(X.data)))
    assert_almost_equal(reported, derived)
def test_lda_score_perplexity():
    # NOTE(review): an identical test with this name also appears earlier in
    # this file; the later definition wins at import time.
    n_components, X = _build_sparse_mtx()
    model = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                      random_state=0)
    model.fit(X)
    perp_direct = model.perplexity(X, sub_sampling=False)

    log_lik = model.score(X)
    perp_from_score = np.exp(-1. * (log_lik / np.sum(X.data)))
    assert_almost_equal(perp_direct, perp_from_score)
Ejemplo n.º 18
0
def lda_operation(data_samples, num_features: int=400, num_topics: int=6)-> Tuple: 
    """Perform Latent Dirichlet Allocation on a list of text samples.

    Args:
        data_samples: list of strings, one per Piazza post.
        num_features (int): cap on vocabulary size for term frequency.
        num_topics (int): number of LDA components.

    Returns:
        tuple: the fitted LDA model and the fitted CountVectorizer.
    """
    vectorizer = CountVectorizer(
        max_df=.85,
        min_df=.05,
        max_features=num_features,
        stop_words='english',
        token_pattern=u'(?ui)\\b\\w\w*[a-z]+\\w*\\b')
    term_counts = vectorizer.fit_transform(data_samples)
    # Feature names were fetched but unused in the original; kept for parity.
    _ = vectorizer.get_feature_names()

    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=100,
                                    learning_method='online',
                                    learning_offset=10., random_state=1)
    lda.fit(term_counts)
    lda.score(term_counts)  # log-likelihood; value discarded, as in original

    return lda, vectorizer
Ejemplo n.º 19
0
def LDA_sklearn(text_data, num_topics, iterations, visualization = False, gridsearch = False ):
    """Fit an online LDA model, grid-search topic count / learning decay,
    plot grid-search likelihoods, and optionally visualize with pyLDAvis.

    Fixes over the original:
      * Python 2 ``print`` statements (syntax errors under Python 3).
      * ``visualize`` was referenced but the parameter is ``visualization``
        (NameError at runtime).
      * The grid-search log-likelihood extraction referenced the undefined
        names ``gscore``/``g.score`` and indexed ``parameters`` with a
        boolean; it now reads ``cv_results_`` correctly.

    Returns the first row of the LDA output when ``visualization`` is falsy.
    """
    vectorizer = OwnCountVectorizer(max_df = 0.95, min_df = 2, stop_words = 'english', lowercase = True,
                                    token_pattern = '[a-zA-Z\-][a-zA-Z\-]{2,}', ngram_range = (2, 3),
                                    decode_error = 'ignore')
    vectorized_text_data = vectorizer.fit_transform(text_data)
    lda_model = LatentDirichletAllocation(n_topics = num_topics, max_iter = iterations, learning_method = 'online',
                                          random_state = 100, batch_size = 120, evaluate_every = -1, n_jobs = -1)
    lda_output = lda_model.fit_transform(vectorized_text_data)
    print(lda_model)  # model attributes
    print('Log likelihood: ', lda_model.score(vectorized_text_data))  # higher is better
    print('Perplexity: ', lda_model.perplexity(vectorized_text_data))  # lower is better
    pprint(lda_model.get_params())  # see model parameters

    # GridSearch the best model.
    search_params = {'n_components': [41, 45, 50, 55, 60], 'learning_decay': [.5, .7, .9]}
    model = GridSearchCV(LatentDirichletAllocation(), param_grid = search_params)
    model.fit(vectorized_text_data)

    best_lda_model = model.best_estimator_
    print('Best parameters: ', model.best_params_)
    print('Best Log-likelihood score: ', model.best_score_)
    print('Model perplexity: ', best_lda_model.perplexity(vectorized_text_data))

    # Mean CV log-likelihood per topic count, one series per learning_decay.
    n_topics = [41, 45, 50, 55, 60]
    cv_results = model.cv_results_

    def _series(decay):
        # Scores for one learning_decay value, in n_components order.
        return [round(score)
                for score, params in zip(cv_results['mean_test_score'],
                                         cv_results['params'])
                if params['learning_decay'] == decay]

    log_likelihoods_5 = _series(0.5)
    log_likelihoods_7 = _series(0.7)
    log_likelihoods_9 = _series(0.9)

    # Show graph
    plt.figure(figsize = (10, 8))
    plt.plot(n_topics, log_likelihoods_5, label = '0.5')
    plt.plot(n_topics, log_likelihoods_7, label = '0.7')
    plt.plot(n_topics, log_likelihoods_9, label = '0.9')
    plt.title('Gridsearch output on choosing optimal LDA model')
    plt.xlabel('Number of topics')
    plt.ylabel('Log likelihood scores')
    plt.legend(title = 'Learning decay', loc = 'best')
    plt.show()

    if visualization:
        panel = pyLDAvis.sklearn.prepare(lda_model, vectorized_text_data, vectorizer, mds = 'tsne')
        pyLDAvis.show(panel)
    else:
        return lda_output[0] # for verification that it works
Ejemplo n.º 20
0
def RandScore(CountsMatrix, K, no_iter):
    """
    Calculates score for observed data with LDA model fitted to randomized matrix

    CountsMatrix - numpy array. Counts matrix (Document x terms matrix) for our real data
    K - number of clusters
    """
    # Fit LDA on a permuted copy of the counts, then score the *real* data
    # under that null model.
    shuffled = RandomizeMatrix(CountsMatrix)
    null_model = LatentDirichletAllocation(n_topics=K,
                                           learning_method='online',
                                           max_iter=no_iter)
    null_model.fit(shuffled)
    return null_model.score(CountsMatrix)
Ejemplo n.º 21
0
def _test_LDA(data_samples=[], term=7, random_state=1, max_iter=100, **l):
    """Fit an online LDA on shuffled samples; return the log-likelihood.

    ``term`` is accepted for interface compatibility but unused here.
    Extra keyword arguments are forwarded to LatentDirichletAllocation.

    BUG FIX: ``shuffle`` mutates its argument in place.  With the mutable
    default list this leaked state between calls, and with a caller-supplied
    list it silently reordered the caller's data.  A copy is shuffled now.
    """
    samples = list(data_samples)
    shuffle(samples)

    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(samples)

    lda1 = LatentDirichletAllocation(max_iter=max_iter,
                                     learning_method='online',
                                     random_state=random_state,
                                     **l)

    lda1.fit_transform(tf)
    return lda1.score(tf)
Ejemplo n.º 22
0
def elbowplotlda(listoftopics, vectorizedcorpus):
    """Fit one LDA per candidate topic count and plot the log-likelihoods."""
    scores = []
    perplexities = []
    for n_topics in listoftopics:
        model = LatentDirichletAllocation(n_jobs=-1, n_components=n_topics)
        model.fit(vectorizedcorpus)
        scores.append(model.score(vectorizedcorpus))
        perplexities.append(model.perplexity(vectorizedcorpus))
    print(scores)
    print(perplexities)
    plt.plot(listoftopics, scores, '-', label="log_likelihood")
    plt.legend()
    plt.savefig("../images/elbowplot")
    plt.show()
Ejemplo n.º 23
0
def perform_lda_analysis(txtDir='', numOfTxts=None, numOfTopics=5, maxIter=20,
                         learningMode='online', randomState=100, batchSize=128,
                         evaluateEvery=-1, nJobs=-1):
    """Vectorize cleaned text files and fit an LDA topic model.

    :param txtDir: directory whose file names are enumerated
    :param numOfTxts: an integer or None for selecting all files
    :param numOfTopics: number of topics
    :param maxIter: maximum training iterations
    :param learningMode: 'online' or 'batch'
    :param randomState: RNG seed
    :param batchSize: docs per online update
    :param evaluateEvery: perplexity-evaluation cadence (-1 = never)
    :param nJobs: worker processes (-1 = all CPUs)
    :return: dict with the LDA output, log-likelihood, perplexity and params
    """
    warnings.simplefilter("ignore", DeprecationWarning)

    # NOTE(review): file names come from ``txtDir`` but contents are read
    # from ``cfg.pwc['cleanTxtDir']`` -- confirm this cross-directory read
    # is intentional.
    txtLst = []
    for fname in os.listdir(txtDir)[:numOfTxts]:
        path = os.path.join(cfg.pwc['cleanTxtDir'], fname)
        with codecs.open(path, 'r', 'utf-8-sig') as fh:
            txtLst.append(get_content_words(fh.read()))

    vectorizer = CountVectorizer(analyzer='word', min_df=4, lowercase=True,
                                 token_pattern='[a-zA-Z0-9]{3,}')
    dataVector = vectorizer.fit_transform(txtLst)
    dataDense = dataVector.todense()
    print("Sparsicity: ", ((dataDense > 0).sum() / dataDense.size) * 100, "%")

    lda_model = LatentDirichletAllocation(n_topics=numOfTopics,
                                          max_iter=maxIter,
                                          learning_method=learningMode,
                                          random_state=randomState,
                                          batch_size=batchSize,
                                          evaluate_every=evaluateEvery,
                                          n_jobs=nJobs)
    lda_result = lda_model.fit_transform(dataVector)

    results = {'result': lda_result,
               'logLikelyhood': lda_model.score(dataVector),    # higher is better
               'perplexity': lda_model.perplexity(dataVector),  # lower is better
               'params': lda_model.get_params()}
    pprint(results)
    return results
Ejemplo n.º 24
0
def main():
    """LDA concept detection with verbose batch training; prints diagnostics."""
    print("\n-----LDA CONCEPT DETECITON-----")
    text_corpus, text_corpus_ids, raw_corpus, raw_corpus_ids, filepath = load_corpus(
        'v')

    concepts_raw = load_document(CONCEPTS_PATH)
    concepts = parse_concepts(concepts_raw)
    num_segs = len(text_corpus)
    print("MAX_DF: " + str(MAX_DF))
    print("MIN_DF: " + str(MIN_DF))
    print("Number of Segs: %d/%d" % (len(text_corpus), len(raw_corpus)))

    # Document-term matrix from a lemmatizing CountVectorizer.
    vectorizer = CountVectorizer(stop_words='english',
                                 lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 tokenizer=LemmaTokenizer())
    dt_matrix = vectorizer.fit_transform(text_corpus)

    feature_names = vectorizer.get_feature_names()
    print("Number of Features: " + str(len(feature_names)))

    # Verbose batch LDA, evaluating perplexity every 5 iterations.
    num_iter = 200
    lda = LatentDirichletAllocation(n_components=NUM_TOPICS,
                                    max_iter=num_iter,
                                    learning_method='batch',
                                    verbose=1,
                                    random_state=55,
                                    evaluate_every=5)
    doc_topic_matrix = lda.fit_transform(dt_matrix)
    topic_term_matrix = lda.components_

    print("Number of Iterations: ", lda.n_iter_)
    print("Score: " + str(lda.score(dt_matrix) / get_num_tokens(dt_matrix)))

    print_topics(lda, feature_names, 10)

    return 0
 def __init__(self, X, features, Klist=list(range(1, 10)), random_state=0):
     """Fit one LDA per K in Klist; record each model with its perplexity
     and log-likelihood, printing a summary line per K."""
     self.Klist = Klist
     self.features = features
     self.random_state = random_state
     self.X = X
     self.lda = []
     self.perplex = []
     self.score = []
     for n_topics in Klist:
         model = LatentDirichletAllocation(n_components=n_topics,
                                           random_state=random_state)
         model.fit(X)
         perplexity = model.perplexity(X)
         loglik = model.score(X)
         self.lda.append(model)
         self.perplex.append(perplexity)
         self.score.append(loglik)
         print('K = %i, perplex = %f, log-like = %f' %
               (n_topics, perplexity, loglik))
Ejemplo n.º 26
0
def test_topic_ks(text, ck = 80):
    """Sweep candidate topic counts (40 .. ck-1) over a document list and
    print each model's log-likelihood and perplexity."""
    count_vectorizer = CountVectorizer(stop_words='english')
    count_data = count_vectorizer.fit_transform(text)

    print("testing Ks...")
    candidate_ks = range(ck)[40:]
    for number_topics in candidate_ks:
        print("K =", number_topics)
        model = LDA(n_components=number_topics, n_jobs=-1)
        model.fit(count_data)

        # Log-likelihood: higher is better.
        print("---> Log Likelihood: ", model.score(count_data))

        # Perplexity: lower is better; exp(-1 * log-likelihood per word).
        print("---> Perplexity: ", model.perplexity(count_data))
Ejemplo n.º 27
0
def get_model_metrics(model: LatentDirichletAllocation, doc_mat: np.array):
    """Print diagnostics for a fitted LDA model.

    Args:
        model (): fitted LatentDirichletAllocation instance
        doc_mat (): document-term matrix the model is evaluated on

    Returns:
        None -- the shape, perplexity, log-likelihood and parameters are
        printed only.
    """
    print(doc_mat.shape)
    print('Perplexity: ', model.perplexity(doc_mat))
    print('Log likelihood', model.score(doc_mat))
    print('Params', model.get_params())
Ejemplo n.º 28
0
def k_grid_search(X, test_size=0.25, gridval=(10, 100, 10), n_iter=30, seed=23):
    """Grid-search the number of LDA topics on a held-out split.

    Args:
        X: document-term matrix.
        test_size: fraction of X held out for scoring.
        gridval: (start, stop, step) for the topic-count grid.
            (Changed from a mutable list default to a tuple — same values,
            but no longer a shared mutable default argument.)
        n_iter: max EM iterations per model.
        seed: random seed for both the split and the models.

    Returns:
        (grid, loglik, perplex): the topic counts tried, and the held-out
        log-likelihood and perplexity for each.
    """
    X_train, X_test = train_test_split(
        X, test_size=test_size, random_state=seed
        )
    grid = range(gridval[0], gridval[1], gridval[2])
    loglik = list()
    perplex = list()
    for k in grid:
        print("Estimating model at k: {}".format(k))
        lda = LatentDirichletAllocation(
            n_components=k, max_iter=n_iter, learning_method='online',
            learning_offset=50., random_state=seed, n_jobs=6
            )
        lda.fit(X_train)
        loglik.append(lda.score(X_test))
        perplex.append(lda.perplexity(X_test))
        # Drop the reference so the fitted model can be collected before
        # the next (potentially large) fit.
        lda = None

    return list(grid), loglik, perplex
Ejemplo n.º 29
0
def objective(space):
    """Hyperopt objective: fit LDA with the sampled hyper-parameters.

    Args:
        space: dict with 'n_topics' and 'learning_decay' sampled by hyperopt.

    Returns:
        dict with 'loss' (negative log-likelihood on the global
        ``data_vectorized``) and hyperopt 'status'.
    """
    print(space)
    global data_vectorized
    lda_model = LatentDirichletAllocation(
        n_components=int(space['n_topics']),  # number of topics
        learning_decay=space[
            'learning_decay'],  # control learning rate in the online learning method
        max_iter=10,  # max learning iterations
        learning_method='online',  # use mini-batch of training data
        batch_size=128,  # n docs in each learning iter
        n_jobs=-1,  # use all available CPUs
    )

    # fit() suffices here: the transform() half of the original
    # fit_transform() call was computed and then discarded.
    lda_model.fit(data_vectorized)

    score = lda_model.score(data_vectorized)
    print("SCORE:", score)
    return {
        'loss': -score,
        'status': STATUS_OK
    }  # minnimizing negative log-likelihood is equivalent to maximing log-likelihood
Ejemplo n.º 30
0
def lda_decomp(t,
               n_components,
               learning_method="online",
               learning_offset=10.0,
               max_iter=20,
               random_state=1):
    """Fit an LDA decomposition and transform the input matrix.

    Args:
        t: document-term (e.g. TF-IDF) matrix.
        n_components: number of topics.
        learning_method, learning_offset, max_iter, random_state:
            forwarded to LatentDirichletAllocation.

    Returns:
        (lda, t_lda): the fitted model and the transformed matrix.
    """
    lda = LatentDirichletAllocation(n_components=n_components,
                                    max_iter=max_iter,
                                    learning_method=learning_method,
                                    learning_offset=learning_offset,
                                    random_state=random_state).fit(t)
    t_lda = lda.transform(t)
    # Removed dead work: score()/perplexity() were computed here (each a
    # full pass over the data) but never used or returned.
    return (lda, t_lda)
def LDA_SK(data_vectorized, vectorized):
    """Fit a 10-topic online LDA model and report its fit quality.

    Args:
        data_vectorized: document-term matrix to fit on.
        vectorized: unused here; kept for interface compatibility.

    Returns:
        The document-topic matrix from fit_transform.
    """
    # NOTE: the deprecated alias ``n_topics=10`` was removed from this call.
    # It duplicated ``n_components=10`` and was dropped from scikit-learn
    # in 0.21, where passing it raises TypeError.
    lda_model = LatentDirichletAllocation(batch_size=128,
                                          doc_topic_prior=None,
                                          evaluate_every=-1,
                                          learning_decay=0.7,
                                          learning_method='online',
                                          learning_offset=10.0,
                                          max_doc_update_iter=100,
                                          max_iter=10,
                                          mean_change_tol=0.001,
                                          n_components=10,
                                          n_jobs=-1,
                                          perp_tol=0.1,
                                          random_state=100,
                                          topic_word_prior=None,
                                          total_samples=1000000.0,
                                          verbose=0)

    lda_output = lda_model.fit_transform(data_vectorized)

    # Log Likelyhood: Higher the better
    print("Log Likelihood: ", lda_model.score(data_vectorized))

    # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
    print("Perplexity: ", lda_model.perplexity(data_vectorized))
    return lda_output
Ejemplo n.º 32
0
Archivo: lda.py Proyecto: dpakpdl/NLP
def analyser(data):
    """Vectorize *data*, fit a 20-topic online LDA model and print
    the document-topic matrix, fit diagnostics and model parameters."""
    _, doc_term = get_vectorized_data(data)

    # Configure the topic model: 20 topics, online (mini-batch) learning.
    topic_model = LatentDirichletAllocation(
        n_components=20,  # Number of topics
        max_iter=10,  # Max learning iterations
        learning_method='online',
        random_state=100,  # Random state
        batch_size=128,  # n docs in each learning iter
        evaluate_every=-1,  # compute perplexity every n iters, default: Don't
        n_jobs=-1,  # Use all available CPUs
    )

    # Fit the model and obtain the per-document topic distribution.
    doc_topics = topic_model.fit_transform(doc_term)
    print(doc_topics)

    # Higher log-likelihood and lower perplexity indicate a better fit.
    print("Log Likelihood: ", topic_model.score(doc_term))
    print("Perplexity: ", topic_model.perplexity(doc_term))

    # Dump the full parameter set for reproducibility.
    pprint(topic_model.get_params())
Ejemplo n.º 33
0
# Build LDA model
lda_model = LatentDirichletAllocation(
    n_components=10,  # Number or topics
    max_iter=10,  # Max learning iterations
    random_state=100,  # Random state (seed)
    learning_method='online',
    batch_size=128,  # No of docs in each iter
    evaluate_every=-1,  # Compute perplexity every n iters
    n_jobs=-1)  # Use all available CPUs

lda_output = lda_model.fit_transform(samples)
print(lda_model)

# Diagnose model performance with perplexity and log-likelihood
# Log Likelyhood: Higher the better
# (fixed: this was a Python 2 print statement — a syntax error in Python 3)
print("Log Likelihood: ", lda_model.score(samples))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(samples))

# See model parameters
pprint(lda_model.get_params())

# Perform GridSearch for the best LDA model
# Define Search Param
search_params = {
    'n_components': [6, 7, 8, 9],  # take 10 topics
    'learning_decay': [0.5, 0.7, 0.9],
    'max_iter': [6, 7, 8, 9],
    'random_state': [2018]
}
Ejemplo n.º 34
0
print(X[1], data_lemmatized[1])

# ## Latent Dirichlet Allocation

# In[11]:

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification

lda = LatentDirichletAllocation(n_jobs=10, n_components=20, random_state=0)
lda.fit(X)

matriz_topics = lda.transform(X)
# NOTE(review): LatentDirichletAllocation has no ``decision_function`` method,
# so this line raises AttributeError; ``transform`` or ``score`` was likely
# intended — confirm against the original notebook.
samples = lda.decision_function(X)
acurracy = lda.score(X)
columna_nueva_fecha = np.array(df['FECHA SCRAPING'])
X_final = np.column_stack((columna_nueva_fecha, matriz_topics))

# fixed: ``print(score)`` referenced an undefined name (NameError);
# the log-likelihood was stored in ``acurracy`` above.
print(acurracy)
print(samples)
print(lda.transform(X[:17]))

# In[65]:

#pd.DataFrame(X_final).to_csv('MatrizFrecuencia_LDA_index_fecha.csv',sep=',',header=None)

df_final = pd.DataFrame(matriz_topics, dtype='float')
df_final['FECHA'] = columna_nueva_fecha
Ejemplo n.º 35
0
X = vectorizer.fit_transform(df.text)
vectorizer.get_feature_names()

vect_df = pd.DataFrame(X.toarray(), columns=[vectorizer.get_feature_names()])
vect_df.shape
vect_df.head()

# Sweep topic counts 1..19 and record fit diagnostics for each.
lda_range= range(1,20)
lda_eval = []

for n in lda_range:
    lda = LatentDirichletAllocation(n_topics=n, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    lda.fit(vect_df)
    score = lda.score(vect_df)
    perplexity = lda.perplexity(vect_df)
    # fixed: Python 2 print statements converted to print() calls
    print(n, score, perplexity)
    lda_eval.append({'topics':n,'score':score,'perplexity':perplexity})

for item in lda_eval:
    print(item)

# Final model with the chosen topic count.
lda = LatentDirichletAllocation(n_topics=5, n_jobs=-1)


topics = lda.fit_transform(vect_df)
lda.perplexity(vect_df)
lda.score(vect_df)
topics[2545]
# fixed: ``.ix`` was removed from pandas — use label-based ``.loc``
df.loc[2545].text
Ejemplo n.º 36
0
# Hyper-parameters for the topic-model runs below.
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

# NOTE(review): ``n_topics`` was deprecated in scikit-learn 0.19 and removed
# in 0.21 in favour of ``n_components`` — this call fails on modern versions.
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)

lda.fit(corpusVect)

# Show the top words per topic using the vectorizer's vocabulary.
tf_feature_names = vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


# Fit diagnostics: higher score / lower perplexity is better.
# Both return values are discarded here (interactive/notebook style).
lda.score(corpusVect)
lda.perplexity(corpusVect)

#### Titles

# Clean and vectorize the article titles, then fit a second LDA model on them.
corp2 = dataWeek.title
CleanTextTransformer().fit(corp2)
corpCTT2 = CleanTextTransformer().transform(corp2)

corpCTTvect = vectorizer.fit_transform(corpCTT2)
corpusTitlesVect = pd.DataFrame(corpCTTvect.todense(),columns=vectorizer.get_feature_names())

lda2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
0
                                stop_words='english')

tf = tf_vectorizer.fit_transform(blogs.article_body)



# Sweep a wide range of topic counts and record fit diagnostics for each.
lda_eval2 = []

ldaRANGE = [9,10,11,12,13,14,15,16,17,18,19,20,30,40,50,60,70,80,90,100,150,200,300]

for n in ldaRANGE:
    lda = LatentDirichletAllocation(n_topics=n, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)
    score = lda.score(tf)
    perplexity = lda.perplexity(tf)
    # fixed: Python 2 print statements converted to print() calls
    print(n, score, perplexity)
    lda_eval2.append({'topics':n,'score':score,'perplexity':perplexity})

for item in lda_eval2:
    print(item)

lda_eval22 = pd.DataFrame(lda_eval2)

lda_eval22

import matplotlib.pyplot as plt

lda_eval22
plt.style.use('ggplot')
        test_perplexities = []  # size: (max_iter / valid_iter) * (n_splits)


        for i in range(int(max_iter / valid_iter)):
            train_s = []
            test_s = []
            train_p = []
            test_p = []

            print '\ntraining ', i * valid_iter + 1, '-th iteration'

            for train_index, test_index in splited_index:
                train_data, test_data = dataset[train_index], dataset[test_index]
                lda_model.partial_fit(train_data)

                train_s.append(lda_model.score(train_data))
                test_s.append(lda_model.score(test_data))

                train_p.append(lda_model.perplexity(train_data))
                test_p.append(lda_model.perplexity(test_data))

            train_scores.append(train_s)
            test_scores.append(test_s)
            train_perplexities.append(train_p)
            test_perplexities.append(test_p)

            print "train_scores: ", train_scores[i], " test_scores: ", test_scores[i], " train_perplexities: ", train_perplexities[i], " test_perplexities: ", test_perplexities[i]


        dict_num_topic[str(n_component) + '_topics'] = {
            "max_iter": max_iter, "valid_iter": valid_iter,