def feature_pred(features, chik, ldak):
    """Fit and evaluate a scaled PCA + SVM classifier on per-fact features.

    When the module-level flag ``NEW_DATA`` is set, the feature table is
    rebuilt from the users, transactions and fact topics found under ``DIR``
    and cached in ``model_data/feature_data``; otherwise the cached table is
    loaded from that file. The table must provide the requested feature
    columns plus a ``'y'`` label column.

    Args:
        features: Iterable of column names used as model inputs.
        chik: Unused; kept so existing callers keep working.
        ldak: Passed to ``PCA`` as ``n_components``.

    Returns:
        Mean accuracy of a 3-fold cross validation over the whole data set.

    Side effects:
        Prints summary statistics and scores; rebinds the global ``users``
        and rewrites the cache file when ``NEW_DATA`` is set.
    """
    global users
    wn.ensure_loaded()
    facts = gt.get_fact_topics(DIR)

    if NEW_DATA:
        users = gt.get_users(DIR)
        transactions = gt.get_transactions(DIR)
        print(transactions.describe())

        # if castillo: drop the `is_labelled` condition below
        has_transactions = facts['hash'].isin(transactions['fact'].values)
        # Each comparison needs its own parentheses: `|` binds tighter than
        # `==`, so the unparenthesised form is a chained comparison that
        # raises on a Series instead of building a mask.
        # NOTE(review): this matches integer labels only; confirm the 'true'
        # column is not stored as strings such as '1' / '0'.
        is_labelled = (facts['true'] == 1) | (facts['true'] == 0)
        labelled_facts = facts[has_transactions & is_labelled]

        def feature_jobs():
            """Yield one delayed get_features call per labelled fact."""
            for _, fact in labelled_facts.iterrows():
                # Filter once per fact and use a set so the per-user
                # membership test does not rescan the transactions.
                fact_transactions = transactions[
                    transactions['fact'] == fact['hash']]
                user_ids = set(fact_transactions['user_id'].values)
                fact_users = [
                    u for u in users if int(u.user_id) in user_ids
                ]
                yield delayed(get_features)(
                    fact, fact_transactions, fact_users)

        facts = pd.DataFrame(Parallel(n_jobs=num_jobs)(feature_jobs()))
        with open('model_data/feature_data', 'wb') as tmpfile:
            pickle.dump(facts, tmpfile)
    else:
        # SECURITY: unpickling runs arbitrary code from the file; only load
        # cache files this project wrote itself.
        with open('model_data/feature_data', 'rb') as tmpfile:
            facts = pickle.load(tmpfile)

    feature_frame = facts[list(features)]
    print(feature_frame.describe())
    X = feature_frame.values
    y = facts['y'].values

    # Single random 80/20 hold-out evaluation (unseeded, so it varies
    # between runs).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), PCA(n_components=ldak),
                            SVC(C=1, gamma=1))
    std_clf.fit(X_train, y_train)
    pred_test_std = std_clf.predict(X_test)
    precision, recall, fscore, _ = precision_recall_fscore_support(
        y_test, pred_test_std, average='macro')
    score = metrics.accuracy_score(y_test, pred_test_std)
    print("Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" %
          (score, precision, recall, fscore))

    # 3-fold cross validation on the full data set, one run per metric.
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    cv_results = [("Accuracy", acc_scores)]
    for label, scoring in (("Precision", 'precision'),
                           ("Recall", 'recall'),
                           ("F1", 'f1')):
        cv_results.append(
            (label, cross_val_score(std_clf, X, y, scoring=scoring, cv=3)))
    for label, scores in cv_results:
        print("\t Cross validated %s: %0.3f (+/- %0.3f)" %
              (label, scores.mean(), scores.std() * 2))

    return acc_scores.mean()
Ejemplo n.º 2
0
def main():
    """Print user statistics, then run the corpus and clustering analyses.

    Builds (or loads, depending on the module-level ``NEW_CORPUS`` flag) the
    bag-of-words corpus, derives the word/index lookup tables from it, prints
    ``describe()`` summaries of the users split by stance, fact truth value
    and correctness, and finally calls ``corpus_analysis`` and
    ``cluster_users_on_tweets``.

    Side effects:
        Rebinds the globals ``bow_corpus`` and ``word_to_idx``; saves the
        corpus when it is rebuilt; prints to stdout.
    """
    global bow_corpus
    global word_to_idx
    wn.ensure_loaded()
    if NEW_CORPUS:
        bow_corpus = build_bow_corpus(get_users())
        save_corpus(bow_corpus)
    else:
        bow_corpus = get_corpus()

    # Keep only entries whose corpus value exceeds 2 (presumably an
    # occurrence count -- confirm against build_bow_corpus).
    frequent_words = [word for word, value in bow_corpus.items() if value > 2]
    word_to_idx = {word: idx for idx, word in enumerate(frequent_words)}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}

    users = get_users()
    facts = gt.get_fact_topics()
    transactions = gt.get_transactions()
    users_df = pd.DataFrame([vars(u) for u in users])
    print(users_df.describe())
    for stance in range(4):
        print(users_df[users_df['stance'] == stance].describe())

    # Attach to every user the truth value of the fact they refer to.
    users_df['f_t'] = users_df['fact'].map(
        lambda x: facts[facts['hash'] == x]['true'].values[0])
    truth = users_df['f_t']
    # Accept both the string and the integer spelling of each label so that
    # true facts are matched the same way false facts are.
    is_true = (truth == '1') | (truth == 1)
    is_false = (truth == '0') | (truth == 0)
    denies = users_df['stance'] == 0
    supports = users_df['stance'] == 1
    # Combine masks with `&` rather than chaining `df[mask_a][mask_b]`, which
    # applies a full-length mask to an already filtered frame.
    print(users_df[is_true & supports].describe())
    print(users_df[is_false & denies].describe())
    print(users_df[is_false & supports].describe())
    print(users_df[is_true & denies].describe())
    print(users_df[users_df['was_correct'] == 1].describe())
    print(users_df[users_df['was_correct'] == 0].describe())
    # The None guard must come before the inner loop; otherwise iterating a
    # user whose tweets are None raises before the guard is evaluated.
    print(sum(1 for u in users if u.tweets is not None for _ in u.tweets))

    corpus_analysis(bow_corpus, word_to_idx, idx_to_word)
    # temporal_analysis(get_users())

    cluster_users_on_tweets(users, word_to_idx, idx_to_word)
Ejemplo n.º 3
0
def main():
    """Build or load the per-fact feature table and evaluate a decision tree.

    When the module-level flag ``NEW_DATA`` is set, features are computed
    with ``get_features`` for every fact that has transactions and whose
    ``'true'`` value is not ``'unknown'``, and the table is cached in
    ``model_data/castillo_data``; otherwise the cached table is loaded. One
    box plot per feature (grouped by ``'y'``) is drawn on a figure that is
    not shown, then a scaled decision tree is scored on a random 80/20 split
    and with 3-fold cross validation.

    Side effects:
        Rebinds the global ``users`` and rewrites the cache file when
        ``NEW_DATA`` is set; prints statistics and scores to stdout.
    """
    global users
    wn.ensure_loaded()
    facts = gt.get_fact_topics(DIR)
    features = [
        'avg_mentions', 'avg_emoticons', 'avg_links', 'avg_questionM',
        'avg_personal_pronoun_first', 'avg_sent_pos', 'avg_sent_neg',
        'avg_sentiment', 'fr_has_url', 'share_most_freq_author', 'lvl_size',
        'avg_followers', 'avg_friends', 'avg_status_cnt', 'avg_reg_age'
    ]
    if NEW_DATA:
        users = gt.get_users(DIR)
        transactions = gt.get_transactions(DIR)
        print(transactions.describe())

        # Only facts that appear in at least one transaction are considered.
        has_transactions = facts['hash'].isin(transactions['fact'].values)
        facts = facts[has_transactions]
        facts = pd.DataFrame([
            get_features(fact, transactions, users)
            for _, fact in facts.iterrows() if fact['true'] != 'unknown'
        ])
        with open('model_data/castillo_data', 'wb') as tmpfile:
            pickle.dump(facts, tmpfile)
    else:
        # SECURITY: unpickling runs arbitrary code from the file; only load
        # cache files this project wrote itself.
        with open('model_data/castillo_data', 'rb') as tmpfile:
            facts = pickle.load(tmpfile)
    print(facts.describe())
    X = facts[list(features)].values
    y = facts['y'].values

    # One box plot per feature on a 5-column grid; the row count follows the
    # number of features (3 rows for the 15 features above).
    n_cols = 5
    n_rows = -(-len(features) // n_cols)  # ceiling division
    fig = plt.figure()
    fig.subplots_adjust(hspace=0.4, wspace=0.4)
    for position, feature in enumerate(features, start=1):
        ax = fig.add_subplot(n_rows, n_cols, position)
        # Draw on the new axes explicitly instead of relying on pyplot's
        # implicit "current axes" state.
        sns.boxplot(x="y", y=feature, data=facts, palette="Set3", ax=ax)
    # plt.show()

    # Single random 80/20 hold-out evaluation (the split itself is unseeded).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(),
                            DecisionTreeClassifier(random_state=42))
    std_clf.fit(X_train, y_train)
    pred_test_std = std_clf.predict(X_test)
    precision, recall, fscore, _ = precision_recall_fscore_support(
        y_test, pred_test_std, average='macro')
    score = metrics.accuracy_score(y_test, pred_test_std)
    print(
        "Random split: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f"
        % (score, precision, recall, fscore))

    # 3-fold cross validation on the full data set, one run per metric;
    # scoring=None uses the estimator's default score, as in a call without
    # the argument.
    cv_runs = (("Accuracy", None), ("Precision", 'precision'),
               ("Recall", 'recall'), ("F1", 'f1'))
    cv_results = [
        (label, cross_val_score(std_clf, X, y, scoring=scoring, cv=3))
        for label, scoring in cv_runs
    ]
    for label, scores in cv_results:
        print("\t Cross validated %s: %0.3f (+/- %0.3f)" %
              (label, scores.mean(), scores.std() * 2))