def train(links):
    """Train a boosted ensemble of single-word classifiers and pickle it.

    One ``PredicateClassifier(HasWordsPredicate([w]))`` candidate is built for
    every word returned by ``most_frequent_words()``.  Each boosting round

    1. re-trains every remaining candidate on the current sample weights,
    2. keeps the candidate with the lowest weighted error,
    3. re-weights the samples with ``exp(-prediction * label)`` (normalised),
    4. records ``(candidate, alpha)`` and removes the candidate from the pool.

    The loop ends when the best weighted error is 0.5 or more, which also
    covers the case where no candidates are left.  The selected pairs are
    printed and dumped to ``adaboost.pck`` in the current directory.

    Args:
        links: sized, iterable collection of objects exposing an
            ``evaluation`` attribute (truthy means label +1, otherwise -1)
            and accepted by ``mash_post``.

    Raises:
        ValueError: if ``links`` is empty; there is nothing to fit and the
            weighted errors would all be a meaningless zero.
    """
    import sys
    from math import exp, fabs, log
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle  # Python 3

    if len(links) == 0:
        raise ValueError("train() needs at least one link to train on")

    fwords = most_frequent_words()
    classifiers = [PredicateClassifier(HasWordsPredicate([w])) for w in fwords]
    #classifiers.extend(PredicateClassifier(HasWordsPredicate(duo)) for duo in most_frequent_duos(fwords))
    titles = [mash_post(link) for link in links]
    evaluations = [1. if link.evaluation else -1. for link in links]
    weights = [1. / len(links) for _ in links]
    trained = []
    print("Training on %d features..." % len(classifiers))
    while True:
        # Progress marker on the current line (no newline).
        sys.stdout.write(". ")
        # Sentinel well above any reachable weighted error; it is also what
        # terminates the loop once the candidate pool is exhausted.
        min_error = 1e6
        best = None
        for c in classifiers:
            c.train(titles, weights, evaluations)
            error = sum(
                w * 0.5 * fabs(c.predict(t) - e)
                for w, t, e in zip(weights, titles, evaluations)
            )
            if error < min_error:
                best = c
                min_error = error
        if min_error >= 0.5:
            print(min_error)
            break
        # Evaluate the winner once per sample and reuse the result for both
        # the normaliser and the new weights.
        scaled = [
            w * exp(-best.predict(t) * e)
            for w, t, e in zip(weights, titles, evaluations)
        ]
        Zt = sum(scaled)
        weights = [s / Zt for s in scaled]
        # A candidate with zero weighted error would make the ratio below
        # divide by zero; clamp so it gets a large but finite alpha instead.
        bounded_error = max(min_error, 1e-10)
        alphat = 0.5 * log((1 - bounded_error) / bounded_error)
        trained.append((best, alphat))
        classifiers.remove(best)
    for c, alpha in trained:
        print("%s %s %s" % (c.predicate, c.wordgood, alpha))
    # Context manager so the file is flushed and closed even if dump() fails.
    with open("adaboost.pck", "wb") as out:
        pickle.dump(trained, out, -1)
Beispiel #2
0
def _show_sample(title, data_df):
    """Print *title* followed by the head of *data_df* and a blank line."""
    print(title)
    print(data_df.head())
    print()


def _rank_words(texts, language, ngram_range):
    """Vectorize *texts* by raw count and by tf-idf and rank both vocabularies.

    Returns ``(count_vectorizer, count_data, count_pairs, tfidf_vectorizer,
    tfidf_data, tfidf_pairs)`` where the ``*_pairs`` values are whatever
    ``most_frequent_words`` returns (consumed by callers as ``(word, score)``
    pairs).
    """
    count_vectorizer, count_data = get_count_vectorizer_and_transformed_data(
        texts, language, ngram_range
    )
    # NOTE(review): the limit passed here is "row count + 1"; presumably an
    # upper bound on the number of returned words - confirm against
    # most_frequent_words.
    count_pairs = most_frequent_words(
        count_data, count_vectorizer, count_data.shape[0] + 1
    )
    tfidf_vectorizer, tfidf_data = get_tfidf_vectorizer_and_transformed_data(
        texts, language, ngram_range
    )
    tfidf_pairs = most_frequent_words(
        tfidf_data, tfidf_vectorizer, tfidf_data.shape[0] + 1
    )
    return (count_vectorizer, count_data, count_pairs,
            tfidf_vectorizer, tfidf_data, tfidf_pairs)


def _save_word_list(pairs, filename, label):
    """Save *pairs* with ``save_words`` and log it; *label* is e.g. 'frequent'."""
    print("Saving {} words...".format(label))
    save_words(pairs, filename)
    print("{} words saved to:".format(label.capitalize()), filename)
    print()


def _plot_word_list(pairs, filename, label):
    """Plot *pairs* with ``plot_top_words`` and log it; *label* as above."""
    print("Generating {} word plot...".format(label))
    plot_top_words(pairs, filename)
    print("{} word plot saved to:".format(label.capitalize()), filename)
    print()


def _upload_section(db_client, collection, column, payload, description):
    """Upload ``{column: payload}`` to *collection* and log progress."""
    print("Uploading {} to db...".format(description))
    upload_db(db_client, collection, {column: payload})
    print('Done')
    print()


def _build_group_splits(data_df, groups):
    """Return a dict of boolean row masks keyed by group/value combinations.

    Starts with one ``(group, value)`` mask per unique value of each group
    column, then applies ``concat_splits`` ``len(groups) - 1`` times.
    """
    splits = {}
    for group in groups:
        for val in data_df[group].unique():
            splits[(group, val)] = data_df[group] == val
    for _ in range(len(groups) - 1):
        splits = concat_splits(splits)
    return splits


def _dump_grouped_words(grouped, groups, base_filename, label):
    """Remap *grouped* with ``remap_keys``, write it as JSON, return the remap."""
    print("Saving grouped {} words...".format(label))
    filename = add_prefix_to_filename(base_filename, groups)
    remapped = remap_keys(grouped, groups)
    with open(filename, 'w', encoding="utf8") as f:
        json.dump(remapped, f, ensure_ascii=False)
    print("{} words saved to:".format(label.capitalize()), filename)
    print()
    return remapped


def text_analysis(
        data_path,
        column,
        groups,
        language,
        lemmatize,
        ngram_range,
        num_topics,
        num_words,
        manual_mappings,
        generate_word_cloud,
        word_cloud_filename,
        frequent_words_filename,
        frequent_words_plot_filename,
        top_tfidf_words_filename,
        top_tfidf_words_plot_filename,
        predict_topics,
        topics_filename,
        predicted_topics_filename,
        ldavis_filename_prefix,
        predict_sentiment,
        predicted_sentiment_filename,
        should_upload_db,
        account_key_path
):
    """Run the full text-analysis pipeline on one text column of a dataset.

    Steps, in order: load, clean, remove stop words, optionally lemmatize and
    apply manual mappings, optionally draw a word cloud, rank words by count
    and by tf-idf (saved, plotted and optionally uploaded), repeat the
    ranking per group split when ``groups`` is given, optionally fit and
    save a topic model, and optionally predict sentiment.  Progress is
    reported with ``print``; nothing is returned.

    Args:
        data_path: passed to ``load_data``.
        column: name of the text column to analyse; also used as the key of
            every uploaded document.
        groups: column names to split the data by; falsy disables the
            per-group analysis.
        language: passed to the stop-word, lemmatizer, vectorizer and word
            cloud helpers; sentiment is only run for ``'it'`` and ``'en'``.
        lemmatize: truthy to run ``lemmatize_text``.
        ngram_range: passed to the vectorizer helpers.
        num_topics: passed to ``learn_topic_model`` and the LDA visualizer.
        num_words: number of top-ranked words that are plotted, uploaded and
            printed per topic (the saved word files contain the full list).
        manual_mappings: truthy to run ``apply_manual_mappings`` with it.
        generate_word_cloud: truthy to draw word clouds.
        word_cloud_filename, frequent_words_filename,
        frequent_words_plot_filename, top_tfidf_words_filename,
        top_tfidf_words_plot_filename: output paths; per-group outputs use
            ``add_prefix_to_filename`` on these.
        predict_topics: truthy to fit, save and visualize the topic model.
        topics_filename, predicted_topics_filename, ldavis_filename_prefix:
            topic-model output paths.
        predict_sentiment: truthy to run sentiment prediction.
        predicted_sentiment_filename: sentiment output path.
        should_upload_db: truthy to connect with ``connect_db`` and upload
            each result.
        account_key_path: passed to ``connect_db``.
    """
    print("Loading data...")
    data_df = load_data(data_path, column, groups)
    _show_sample("Loaded data sample", data_df)

    print("Cleaning data...")
    data_df[column] = clean_data(data_df[column])
    _show_sample("Clean data sample", data_df)

    print("Removing stop words from data...")
    data_df[column] = remove_stopwords(data_df[column], language)
    _show_sample("Data sample", data_df)

    if lemmatize:
        print("Lemmatizing data...")
        data_df[column] = lemmatize_text(data_df[column], language)
        _show_sample("Lemmatized data sample", data_df)

    if manual_mappings:
        print("Applying manual mappings...")
        data_df[column] = apply_manual_mappings(data_df[column], manual_mappings)
        _show_sample("Manually mapped data sample", data_df)

    if generate_word_cloud:
        print("Generating word cloud...")
        plot_word_cloud(data_df[column], word_cloud_filename, language)
        print("word_cloud saved to:", word_cloud_filename)
        print()

    # Whole-dataset matrices.  These names are deliberately never rebound
    # below: the topic model at the end must see the full dataset, not the
    # last group split.
    (_, count_data, all_word_count_pair_list,
     tfidf_vectorizer, tfidf_data, all_tfidf_pair_list) = _rank_words(
        data_df[column], language, ngram_range
    )
    word_count_pair_list = all_word_count_pair_list[:num_words]
    tfidf_pair_list = all_tfidf_pair_list[:num_words]

    _save_word_list(all_word_count_pair_list, frequent_words_filename, "frequent")

    db_client = connect_db(account_key_path) if should_upload_db else None

    if should_upload_db:
        _upload_section(
            db_client, 'frequent_words', column,
            {w: int(c) for w, c in word_count_pair_list},
            "frequent words"
        )

    _plot_word_list(word_count_pair_list, frequent_words_plot_filename, "frequent")

    _save_word_list(all_tfidf_pair_list, top_tfidf_words_filename, "top tfidf")

    if should_upload_db:
        # NOTE(review): int() truncates; if the tf-idf scores are fractional
        # they lose their decimals here - confirm this is intended.
        _upload_section(
            db_client, 'top_tfidf', column,
            {w: int(c) for w, c in tfidf_pair_list},
            "top tfidf words"
        )

    _plot_word_list(tfidf_pair_list, top_tfidf_words_plot_filename, "top tfidf")

    if groups:
        splits = _build_group_splits(data_df, groups)

        grouped_words_counts = {}
        grouped_words_tfidf = {}

        for key, split_idcs in splits.items():
            split_texts = data_df[split_idcs][column]

            # Skip splits with no rows or with only empty texts.
            if not (len(split_texts) > 0 and any(split_texts.str.len() > 0)):
                continue

            word_cloud_filename_val = add_prefix_to_filename(
                word_cloud_filename, key
            )
            frequent_words_filename_val = add_prefix_to_filename(
                frequent_words_filename, key
            )
            frequent_words_plot_filename_val = add_prefix_to_filename(
                frequent_words_plot_filename, key
            )
            top_tfidf_words_filename_val = add_prefix_to_filename(
                top_tfidf_words_filename, key
            )
            top_tfidf_words_plot_filename_val = add_prefix_to_filename(
                top_tfidf_words_plot_filename, key
            )

            if generate_word_cloud:
                print("Generating word cloud...")
                plot_word_cloud(split_texts, word_cloud_filename_val, language)
                print("word_cloud saved to:", word_cloud_filename_val)
                print()

            # Best effort per split: a failing split is reported and skipped.
            # Exception (not a bare except) so Ctrl-C and SystemExit still
            # propagate.
            try:
                (_, _, split_count_pairs,
                 _, _, split_tfidf_pairs) = _rank_words(
                    split_texts, language, ngram_range
                )
                split_top_counts = split_count_pairs[:num_words]
                split_top_tfidf = split_tfidf_pairs[:num_words]

                _save_word_list(
                    split_count_pairs, frequent_words_filename_val, "frequent"
                )
                _plot_word_list(
                    split_top_counts, frequent_words_plot_filename_val, "frequent"
                )
                _save_word_list(
                    split_tfidf_pairs, top_tfidf_words_filename_val, "top tfidf"
                )
                _plot_word_list(
                    split_top_tfidf, top_tfidf_words_plot_filename_val, "top tfidf"
                )

                # key[1::2] keeps every second element of the split key,
                # i.e. the values of a (group, value, ...) tuple.
                grouped_words_counts[key[1::2]] = {
                    w: int(c) for w, c in split_count_pairs
                }
                grouped_words_tfidf[key[1::2]] = {
                    w: int(c) for w, c in split_tfidf_pairs
                }
            except Exception as exc:
                print("Error processing", key,
                      "skipping it. texts are probably all stopwords")
                print("Reason:", repr(exc))

        remapped_grouped_words_counts = _dump_grouped_words(
            grouped_words_counts, groups, frequent_words_filename, "frequent"
        )

        if should_upload_db:
            _upload_section(
                db_client, 'grouped_words_counts', column,
                remap_to_dict(remapped_grouped_words_counts),
                "grouped_words_counts"
            )

        remapped_grouped_words_tfidf = _dump_grouped_words(
            grouped_words_tfidf, groups, top_tfidf_words_filename, "top tfidf"
        )

        if should_upload_db:
            _upload_section(
                db_client, 'grouped_words_tfidf', column,
                remap_to_dict(remapped_grouped_words_tfidf),
                "grouped_words_tfidf"
            )

    if predict_topics:
        print("Calculating topic model...")
        lda, predicted_topics = learn_topic_model(tfidf_data, num_topics)
        print("Topics found via LDA:")
        print_topics(lda, tfidf_vectorizer, num_words)
        print("Saving topics...")
        save_topics(lda, tfidf_vectorizer, topics_filename)
        print("Topics saved to:", topics_filename)
        print()

        print("Saving predicted topics...")
        save_predicted_topics(predicted_topics, predicted_topics_filename)
        print("Predicted topics saved to:", predicted_topics_filename)
        print()

        if should_upload_db:
            _upload_section(
                db_client, 'predicted_topics', column,
                json.loads(pd.DataFrame(predicted_topics).to_json(
                    orient='index', force_ascii=False
                )),
                "predicted topics"
            )

        print("Generating LDA visualization...")
        visualize_topic_model(lda, count_data, tfidf_vectorizer,
                              num_topics, ldavis_filename_prefix)
        print("LDA visualization saved to:", ldavis_filename_prefix)
        print()

    if predict_sentiment:
        if language not in ('it', 'en'):
            print("Sentiment analysis on {} language is not supported".format(language))
            print()
        else:
            print("Predict sentiment...")
            if language == 'it':
                predicted_sentiment = predict_sentiment_with_sentita(data_df[column])
            else:
                # The English predictor receives the whole frame, not just
                # the text column.
                predicted_sentiment = predict_sentiment_with_paralleldots(data_df)
            save_sentiment(predicted_sentiment, predicted_sentiment_filename)
            print("Predict sentiment saved to:", predicted_sentiment_filename)
            print()

            if should_upload_db:
                _upload_section(
                    db_client, 'predicted_sentiment', column,
                    json.loads(pd.DataFrame(predicted_sentiment).to_json(
                        orient='index', force_ascii=False
                    )),
                    "predicted sentiment"
                )
Beispiel #3
0
import utils
from datamodel import *
import database as db

if __name__ == '__main__':
    # Report how many links are good / bad / hidden and what share of the
    # titles contains at least one of the most frequent words.
    s = db.Session()
    # Materialise the query once: it is needed both for the counters and for
    # the titles, and iterating the query object twice would run it twice.
    links = list(s.query(Link))
    total = len(links)
    good = 0
    bad = 0
    hidden = 0
    for l in links:
        if l.evaluation:
            good += 1
        elif l.evaluation == False:
            # "== False" on purpose: a None evaluation is neither good nor bad.
            bad += 1
        if l.hidden == True:
            hidden += 1
    titles = [utils.tokenize(l.title) for l in links]
    fwords = utils.most_frequent_words()
    print(fwords)
    # Number of titles containing at least one frequent word.
    not_null = sum(1 for t in titles if any(w in t for w in fwords))
    # Guard the percentage against an empty link table.
    coverage = float(not_null) / len(titles) * 100. if titles else 0.
    print("%s %s" % (coverage, len(fwords)))
    print("%d links, %d good, %d bad, %d hidden" % (total, good, bad, hidden))

    #for d in utils.most_frequent_duos(fwords):
    #    print d