def _fact_inputs(fact, transactions, users):
    """Return the ``(fact, transactions, users)`` triple passed to get_features.

    The transaction slice for ``fact['hash']`` and the set of user ids in it
    are computed once per fact instead of once per user.
    """
    fact_transactions = transactions[transactions['fact'] == fact['hash']]
    fact_user_ids = set(fact_transactions['user_id'].tolist())
    fact_users = [u for u in users if int(u.user_id) in fact_user_ids]
    return fact, fact_transactions, fact_users


def feature_pred(features, chik, ldak):
    """Fit a scaler + PCA + SVC pipeline on per-fact features and report scores.

    When the module-level flag ``NEW_DATA`` is truthy, the per-fact feature
    rows are rebuilt with ``get_features`` (in parallel via joblib) and
    written to ``model_data/feature_data``; otherwise that file is unpickled.

    Args:
        features: iterable of column names selected from the feature frame
            as model inputs.
        chik: not referenced in this function; kept so the signature stays
            compatible with existing callers.
        ldak: passed to ``PCA(n_components=...)``.

    Returns:
        Mean of the 3-fold cross-validated accuracy scores.

    Side effects:
        Rebinds the module-level ``users`` when ``NEW_DATA`` is truthy,
        prints summary statistics and scores, and may write the cache file.
    """
    global users
    wn.ensure_loaded()
    facts = gt.get_fact_topics(DIR)

    if NEW_DATA:
        users = gt.get_users(DIR)
        transactions = gt.get_transactions(DIR)
        print(transactions.describe())

        tr_hsh = transactions['fact'].values
        has_transactions = facts['hash'].isin(tr_hsh)
        # if castillo: comment the label filter out
        # Parentheses are required: `|` binds tighter than `==`, so the
        # unparenthesised form is a chained comparison that raises instead of
        # producing a boolean mask.
        has_binary_label = (facts['true'] == 1) | (facts['true'] == 0)
        facts = facts[has_transactions & has_binary_label]

        facts = Parallel(n_jobs=num_jobs)(
            delayed(get_features)(*_fact_inputs(fact, transactions, users))
            for idx, fact in facts.iterrows())
        facts = pd.DataFrame(facts)

        with open('model_data/feature_data', 'wb') as tmpfile:
            pickle.dump(facts, tmpfile)
    else:
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # produced by this script.
        with open('model_data/feature_data', 'rb') as tmpfile:
            facts = pickle.load(tmpfile)

    feature_columns = list(features)
    print(facts[feature_columns].describe())
    X = facts[feature_columns].values
    y = facts['y'].values

    # Single random 80/20 hold-out evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), PCA(n_components=ldak),
                            SVC(C=1, gamma=1))
    std_clf.fit(X_train, y_train)
    pred_test_std = std_clf.predict(X_test)
    precision, recall, fscore, sup = precision_recall_fscore_support(
        y_test, pred_test_std, average='macro')
    score = metrics.accuracy_score(y_test, pred_test_std)
    print("Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f"
          % (score, precision, recall, fscore))

    # 3-fold cross-validation on the full data set, one run per metric.
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))
    return acc_scores.mean()
def main():
    """Print descriptive statistics about users and run the corpus analyses.

    Builds (or loads, depending on ``NEW_CORPUS``) the bag-of-words corpus,
    derives the word/index lookups from words whose corpus value exceeds 2,
    prints ``describe()`` summaries of the user frame under several
    stance / fact-label filters, prints the total number of tweets, and
    finally calls ``corpus_analysis`` and ``cluster_users_on_tweets``.

    Side effects:
        Rebinds the module-level ``bow_corpus`` and ``word_to_idx``; saves
        the corpus when ``NEW_CORPUS`` is truthy; prints to stdout.
    """
    global bow_corpus
    global word_to_idx
    wn.ensure_loaded()

    if NEW_CORPUS:
        bow_corpus = build_bow_corpus(get_users())
        save_corpus(bow_corpus)
    else:
        bow_corpus = get_corpus()

    # Keep only entries whose value is greater than 2.
    frequent_words = [word for word, count in bow_corpus.items() if count > 2]
    word_to_idx = {word: idx for idx, word in enumerate(frequent_words)}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}

    users = get_users()
    facts = gt.get_fact_topics()
    # The result is not used below; the call is kept in case loading has
    # side effects.
    transactions = gt.get_transactions()

    users_df = pd.DataFrame([vars(u) for u in users])
    print(users_df.describe())
    for stance in range(4):
        print(users_df[users_df['stance'] == stance].describe())

    # Attach to every user the 'true' value of the fact they refer to.
    users_df['f_t'] = users_df['fact'].map(
        lambda x: facts[facts['hash'] == x]['true'].values[0])

    # NOTE(review): the "true" mask matches only the string '1', while the
    # "false" mask matches both '0' and 0 -- confirm whether integer 1 can
    # occur in this column.
    c_true = users_df['f_t'] == '1'
    c_false = (users_df['f_t'] == '0') | (users_df['f_t'] == 0)
    c_den = users_df['stance'] == 0
    c_sup = users_df['stance'] == 1

    # Masks are combined before indexing so pandas does not have to reindex
    # a full-length mask against an already filtered frame.
    print(users_df[c_true & c_sup].describe())
    print(users_df[c_false & c_den].describe())
    print(users_df[c_false & c_sup].describe())
    print(users_df[c_true & c_den].describe())

    print(users_df[users_df['was_correct'] == 1].describe())
    print(users_df[users_df['was_correct'] == 0].describe())

    # The None guard must precede the inner loop; otherwise a user whose
    # tweets attribute is None raises TypeError before the filter is reached.
    print(sum(1 for u in users if u.tweets is not None for _ in u.tweets))

    corpus_analysis(bow_corpus, word_to_idx, idx_to_word)
    # temporal_analysis(get_users())
    cluster_users_on_tweets(users, word_to_idx, idx_to_word)
def main():
    """Evaluate a decision tree on a fixed list of per-fact features.

    When ``NEW_DATA`` is truthy the feature rows are computed with
    ``get_features`` for every fact that has transactions and whose 'true'
    value is not 'unknown', then pickled to ``model_data/castillo_data``;
    otherwise that file is unpickled. A box plot per feature (grouped by
    ``y``) is drawn onto a 3x5 subplot grid, and a scaler + decision-tree
    pipeline is scored on a random 80/20 split and with 3-fold
    cross-validation. All scores are printed.

    Side effects:
        Rebinds the module-level ``users`` when ``NEW_DATA`` is truthy,
        creates a matplotlib figure, prints to stdout, and may write the
        cache file.
    """
    global users
    wn.ensure_loaded()
    facts = gt.get_fact_topics(DIR)
    feature_names = [
        'avg_mentions', 'avg_emoticons', 'avg_links', 'avg_questionM',
        'avg_personal_pronoun_first', 'avg_sent_pos', 'avg_sent_neg',
        'avg_sentiment', 'fr_has_url', 'share_most_freq_author', 'lvl_size',
        'avg_followers', 'avg_friends', 'avg_status_cnt', 'avg_reg_age'
    ]

    if not NEW_DATA:
        with open('model_data/castillo_data', 'rb') as cache_file:
            facts = pickle.load(cache_file)
    else:
        users = gt.get_users(DIR)
        transactions = gt.get_transactions(DIR)
        print(transactions.describe())

        # Restrict to facts that appear in at least one transaction.
        facts = facts[facts['hash'].isin(transactions['fact'].values)]

        feature_rows = []
        for _, fact in facts.iterrows():
            if fact['true'] == 'unknown':
                continue
            feature_rows.append(get_features(fact, transactions, users))
        facts = pd.DataFrame(feature_rows)

        with open('model_data/castillo_data', 'wb') as cache_file:
            pickle.dump(facts, cache_file)

    print(facts.describe())
    X = facts[list(feature_names)].values
    y = facts['y'].values

    # One box plot per feature; the figure is built but not shown.
    figure = plt.figure()
    figure.subplots_adjust(hspace=0.4, wspace=0.4)
    for position, feature_name in enumerate(feature_names, start=1):
        figure.add_subplot(3, 5, position)
        sns.boxplot(x="y", y=feature_name, data=facts, palette="Set3")
    #plt.show()

    # Random 80/20 hold-out evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(),
                            DecisionTreeClassifier(random_state=42))
    std_clf.fit(X_train, y_train)
    predictions = std_clf.predict(X_test)
    precision, recall, fscore, _support = precision_recall_fscore_support(
        y_test, predictions, average='macro')
    accuracy = metrics.accuracy_score(y_test, predictions)
    print(
        "Random split: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f"
        % (accuracy, precision, recall, fscore))

    # 3-fold cross-validation, one run per metric, all computed before printing.
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)

    acc_mean, acc_spread = acc_scores.mean(), acc_scores.std() * 2
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_mean, acc_spread))
    pr_mean, pr_spread = pr_scores.mean(), pr_scores.std() * 2
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_mean, pr_spread))
    re_mean, re_spread = re_scores.mean(), re_scores.std() * 2
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_mean, re_spread))
    f1_mean, f1_spread = f1_scores.mean(), f1_scores.std() * 2
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_mean, f1_spread))