def feature_pred(features, chik, ldak):
    # NOTE: chik is unused in this version of the function; ldak sets the PCA size.
    global users
    wn.ensure_loaded()
    facts = gt.get_fact_topics(DIR)
    if NEW_DATA:
        users = gt.get_users(DIR)
        transactions = gt.get_transactions(DIR)
        print(transactions.describe())
        tr_hsh = transactions['fact'].values
        # if castillo: comment cond2 out
        cond = facts['hash'].isin(tr_hsh)
        # Parentheses are required: == binds more loosely than |, so the
        # unparenthesized version does not compare against 1 and 0 at all.
        cond2 = (facts['true'] == 1) | (facts['true'] == 0)
        facts = facts[cond & cond2]
        facts = Parallel(n_jobs=num_jobs)(delayed(get_features)(
            fact, transactions[transactions['fact'] == fact['hash']],
            [u for u in users if int(u.user_id) in list(
                transactions[transactions['fact'] == fact['hash']]['user_id'].values)])
            for idx, fact in facts.iterrows())
        facts = pd.DataFrame(facts)
        with open('model_data/feature_data', 'wb') as tmpfile:
            pickle.dump(facts, tmpfile)
    else:
        with open('model_data/feature_data', 'rb') as tmpfile:
            facts = pickle.load(tmpfile)

    print(facts[list(features)].describe())
    X = facts[list(features)].values
    y = facts['y'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), PCA(n_components=ldak),
                            SVC(C=1, gamma=1))
    std_clf.fit(X_train, y_train)
    pred_test_std = std_clf.predict(X_test)
    precision, recall, fscore, sup = precision_recall_fscore_support(
        y_test, pred_test_std, average='macro')
    score = metrics.accuracy_score(y_test, pred_test_std)
    print("Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" %
          (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))
    return acc_scores.mean()
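
# The four-score cross-validation report above is repeated verbatim after
# every classifier in this file. A minimal sketch (not part of the original
# code) of a shared helper that could replace those blocks:
def print_cv_scores(clf, X, y, cv=3):
    """Print mean +/- 2*std for accuracy/precision/recall/F1 under k-fold CV."""
    for name, scoring in [('Accuracy', None), ('Precision', 'precision'),
                          ('Recall', 'recall'), ('F1', 'f1')]:
        # scoring=None falls back to the classifier's default score (accuracy).
        scores = cross_val_score(clf, X, y, scoring=scoring, cv=cv)
        print("\t Cross validated %s: %0.3f (+/- %0.3f)" %
              (name, scores.mean(), scores.std() * 2))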
def main():
    global bow_corpus
    global word_to_idx
    wn.ensure_loaded()
    if NEW_CORPUS:
        bow_corpus = build_bow_corpus(get_users())
        save_corpus(bow_corpus)
    else:
        bow_corpus = get_corpus()

    # Keep only tokens occurring more than twice, then build both directions
    # of the vocabulary mapping.
    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}

    users = get_users()
    facts = gt.get_fact_topics()
    transactions = gt.get_transactions()

    users_df = pd.DataFrame([vars(u) for u in users])
    print(users_df.describe())
    print(users_df[users_df['stance'] == 0].describe())
    print(users_df[users_df['stance'] == 1].describe())
    print(users_df[users_df['stance'] == 2].describe())
    print(users_df[users_df['stance'] == 3].describe())
    users_df['f_t'] = users_df['fact'].map(
        lambda x: facts[facts['hash'] == x]['true'].values[0])
    c_true = users_df['f_t'] == '1'
    c_fal = users_df['f_t'] == '0'
    c_fal1 = users_df['f_t'] == 0
    c_den = users_df['stance'] == 0
    c_sup = users_df['stance'] == 1
    print(users_df[c_true & c_sup].describe())
    # Combine masks with & instead of chained indexing, which pandas would
    # otherwise have to reindex (and warns about).
    print(users_df[(c_fal | c_fal1) & c_den].describe())
    print(users_df[(c_fal | c_fal1) & c_sup].describe())
    print(users_df[c_true & c_den].describe())
    print(users_df[users_df['was_correct'] == 1].describe())
    print(users_df[users_df['was_correct'] == 0].describe())
    # Guard on u.tweets before iterating it; users without tweets would
    # otherwise raise a TypeError before the filter applies.
    print(len([t for u in users if u.tweets is not None for t in u.tweets]))

    corpus_analysis(bow_corpus, word_to_idx, idx_to_word)
    # temporal_analysis(get_users())
    cluster_users_on_tweets(users, word_to_idx, idx_to_word)
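
# A toy illustration (hypothetical data, separate names so the real globals
# above stay untouched) of the count-filtered vocabulary built in main():
toy_corpus = {'rumor': 5, 'hoax': 3, 'rare': 1}           # token -> corpus count
toy_vocab = [w for w, c in toy_corpus.items() if c > 2]   # drops 'rare'
toy_word_to_idx = {w: i for i, w in enumerate(toy_vocab)}  # {'rumor': 0, 'hoax': 1}
toy_idx_to_word = {i: w for w, i in toy_word_to_idx.items()}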
def main():
    global bow_corpus
    global word_to_idx, idx_to_word, fact_to_words
    global bow_corpus_top_n
    wn.ensure_loaded()
    print('Grabbing Data')
    bow_corpus = gt.get_corpus()
    facts = gt.get_fact_topics()
    facts = facts[facts['true'] != 'unknown']

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}
    fact_to_words = {
        r['hash']: [w for w in r['fact_terms']]
        for index, r in facts[['hash', 'fact_terms']].iterrows()
    }

    if NEW_MODEL:
        users = gt.get_users()
        # Prepping lstm model
        top_words = 50000
        X, y, user_order = lstm_cred.get_prebuilt_data()
        X, y, user_order = lstm_cred.balance_classes(X, y, user_order)
        # X_train, X_test, y_train, y_test = train_test_split_every_user(X, y, user_order)
        # X_train, X_test, y_train, y_test = train_test_split_on_facts(X, y, user_order, facts_train.values, users)
        # X_train, X_test, y_train, y_test = lstm_cred.train_test_split_on_users(X, y, user_order, users, 100)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
        X_train, X_test, word_to_idx = lstm_cred.keep_n_best_words(
            X_train, y_train, X_test, y_test, idx_to_word, top_words)
        max_tweet_length = 12
        X_train = sequence.pad_sequences(X_train, maxlen=max_tweet_length)
        X_test = sequence.pad_sequences(X_test, maxlen=max_tweet_length)

        # Training lstm model
        embedding_vector_length = 32
        model = Sequential()
        model.add(Embedding(top_words, embedding_vector_length,
                            input_length=max_tweet_length))
        model.add(Dropout(0.2))
        model.add(LSTM(100))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        print(model.summary())
        model.fit(X_train, y_train, validation_data=(X_test, y_test),
                  epochs=5, batch_size=64)
        model.save('model_data/cred_model.h5')
        scores = model.evaluate(X_test, y_test, verbose=0)
        print("Accuracy: %.2f%%" % (scores[1] * 100))

        if NEW_REL_TWEETS:
            print('Building new relevant tweets')
            users = Parallel(n_jobs=num_jobs)(
                delayed(get_relevant_tweets)(user) for user in users)
            # users = Parallel(n_jobs=num_jobs)(delayed(get_relevant_tweets_test_set)(user, X_test) for user in users)
            user_to_rel_tweet = {
                user.user_id: user.features['relevant_tweets']
                for user in users if 'relevant_tweets' in user.features
            }
            with open('model_data/relevant_tweets.pkl', 'wb') as tmpfile:
                pickle.dump(user_to_rel_tweet, tmpfile)
        else:
            with open('model_data/relevant_tweets.pkl', 'rb') as tmpfile:
                user_to_rel_tweet = pickle.load(tmpfile)
            for user in users:
                if 'relevant_tweets' in user.features:
                    user.features['relevant_tweets'] = user_to_rel_tweet[user.user_id]

        # Build credibility scores for all users on their topic
        print('Computing credibility')
        users = [prebuild_cred(model, u) for u in users]
        users_df = pd.DataFrame([vars(u) for u in users])
        [store_result(u) for u in users]
        with open('model_data/cred_pred_data', 'wb') as tmpfile:
            pickle.dump({'users': users_df, 'map': word_to_idx}, tmpfile)
    else:
        print('Loading users & model')
        with open('model_data/cred_pred_data', 'rb') as tmpfile:
            construct = pickle.load(tmpfile)
        users_df = construct['users']
        word_to_idx = construct['map']

    print('Making cred*sent predictions')
    X = []
    y = []
    for idx, hsh in enumerate(facts['hash'].values):
        this_users = users_df[users_df['fact'] == hsh]
        this_x = cred_stance_prediction(this_users)
        this_y = facts['true'].iloc[idx]
        # Collapse the per-user scores for this rumor into a (mean, std) pair.
        X.append((np.average(this_x), np.std(this_x)))
        y.append(int(this_y))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    # LinearSVC must be instantiated; passing the bare class breaks the pipeline.
    std_clf = make_pipeline(StandardScaler(), LinearSVC())
    std_clf.fit(X_train, y_train)
    pred = std_clf.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
        y_test, pred, average='macro')
    print("Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" %
          (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))

    print('Making cred*stance predictions')
    X = []
    y = []
    all_evidence = []
    for idx, hsh in enumerate(facts['hash'].values):
        this_users = users_df[users_df['fact'] == hsh]
        this_x, evidence = only_cred_support_deny_pred(this_users)
        this_y = facts['true'].iloc[idx]
        evidence = sorted(evidence, reverse=True, key=lambda x: x[0])
        # print(facts[facts['hash']==hsh]['text'].values, int(this_y), this_x[-1])
        # print(evidence if len(evidence) < 3 else evidence[:3])
        X.append((np.average(this_x), np.std(this_x)))
        y.append(int(this_y))
    print(X[:20])
    print(y[:20])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), LinearSVC())
    std_clf.fit(X_train, y_train)
    pred = std_clf.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
        y_test, pred, average='macro')
    print("Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" %
          (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print(acc_scores)
    print(pr_scores)
    print(re_scores)
    print(f1_scores)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))
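
# Worked toy example (hypothetical numbers) of the rumor-level aggregation
# used in both loops above: the per-user scores of one rumor collapse into a
# single (mean, std) feature pair before classification.
toy_scores = np.array([0.9, 0.7, 0.2])                      # per-user scores, one rumor
toy_feature = (np.average(toy_scores), np.std(toy_scores))  # -> (0.6, ~0.294)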
def main():
    global bow_corpus
    global word_to_idx, idx_to_word, fact_to_words
    global bow_corpus_top_n
    wn.ensure_loaded()
    print('Grabbing Data')
    bow_corpus = gt.get_corpus()
    facts = gt.get_fact_topics()
    facts = facts[facts['true'] != 'unknown']

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}
    fact_to_words = {
        r['hash']: [w for w in r['fact_terms']]
        for index, r in facts[['hash', 'fact_terms']].iterrows()
    }

    # Credibility data
    print('Loading users & model')
    with open('model_data/cred_pred_data', 'rb') as tmpfile:
        construct = pickle.load(tmpfile)
    users_df = construct['users']
    word_to_idx = construct['map']

    # Feature data
    with open('model_data/feature_data', 'rb') as tmpfile:
        fact_features = pickle.load(tmpfile)
    features = [
        'avg_links', 'avg_sent_neg', 'avg_sentiment', 'fr_has_url', 'lvl_size',
        'avg_len', 'avg_special_symbol', 'avg_time_retweet',
        'avg_count_distinct_words', 'avg_sent_pos', 'cred_pred', 'cred_pred_std'
    ]

    print('Making cred*stance + best features predictions')
    # only_cred_support_deny_pred returns (scores, evidence) at its other call
    # sites, so keep only the scores here before taking std and the last value.
    facts['cred_pred'] = facts['hash'].map(
        lambda x: only_cred_support_deny_pred(users_df[users_df['fact'] == x])[0])
    facts['cred_pred_std'] = facts['cred_pred'].map(lambda x: np.std(x))
    facts['cred_pred'] = facts['cred_pred'].map(lambda x: x[-1])
    facts = facts.set_index('hash').join(fact_features.set_index('hash'),
                                         rsuffix='_other')
    X = facts[features].values
    y = facts['y'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), SVC(C=1, gamma=1))
    std_clf.fit(X_train, y_train)
    pred = std_clf.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
        y_test, pred, average='macro')
    print("Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" %
          (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))
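
# Toy illustration (hypothetical frames) of the 'hash' join above: both sides
# are indexed by 'hash', and rsuffix='_other' disambiguates columns present
# in both frames.
toy_left = pd.DataFrame({'hash': ['a'], 'y': [1], 'cred_pred': [0.8]})
toy_right = pd.DataFrame({'hash': ['a'], 'y': [1], 'avg_links': [0.3]})
toy_joined = toy_left.set_index('hash').join(toy_right.set_index('hash'),
                                             rsuffix='_other')
# -> columns: y, cred_pred, y_other, avg_links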
def main(k_tweets):
    global bow_corpus
    global word_to_idx, idx_to_word, fact_to_words
    global bow_corpus_top_n
    wn.ensure_loaded()
    print('Grabbing Data')
    bow_corpus = gt.get_corpus()
    facts = gt.get_fact_topics()
    facts = facts[facts['true'] != 'unknown']

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}
    fact_to_words = {
        r['hash']: [w for w in r['fact_terms']]
        for index, r in facts[['hash', 'fact_terms']].iterrows()
    }
    users = gt.get_users()

    if NEW_MODEL:
        # Prepping lstm model
        top_words = 50000
        X, y, user_order = lstm_cred.get_prebuilt_data()
        X, y, user_order = lstm_cred.balance_classes(X, y, user_order)
        X_train, X_test, y_train, y_test = train_test_split_every_user(
            X, y, user_order)
        # X_train, X_test, y_train, y_test = train_test_split_on_facts(X, y, user_order, facts_train.values, users)
        # X_train, X_test, y_train, y_test = lstm_cred.train_test_split_on_users(X, y, user_order, users, 100)
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
        X_train, X_test, word_to_idx = lstm_cred.keep_n_best_words(
            X_train, y_train, X_test, y_test, idx_to_word, top_words)
        max_tweet_length = 12
        X_train = sequence.pad_sequences(X_train, maxlen=max_tweet_length)
        X_test = sequence.pad_sequences(X_test, maxlen=max_tweet_length)

        # Training lstm model
        embedding_vector_length = 32
        model = Sequential()
        model.add(Embedding(top_words, embedding_vector_length,
                            input_length=max_tweet_length))
        model.add(Dropout(0.2))
        model.add(LSTM(100))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        # print(model.summary())
        model.fit(X_train, y_train, validation_data=(X_test, y_test),
                  epochs=5, batch_size=64, verbose=0)
        model.save('model_data/cred_model.h5')
        # scores = model.evaluate(X_test, y_test, verbose=0)
        # print("Accuracy: %.2f%%" % (scores[1] * 100))
    else:
        model = load_model('model_data/cred_model.h5')

    if NEW_REL_TWEETS:
        print('Building new relevant tweets')
        users = Parallel(n_jobs=num_jobs)(
            delayed(get_relevant_tweets)(user) for user in users)
        # users = Parallel(n_jobs=num_jobs)(delayed(get_relevant_tweets_test_set)(user, X_test) for user in users)
        user_to_rel_tweet = {
            user.user_id: user.features['relevant_tweets']
            for user in users if 'relevant_tweets' in user.features
        }
        with open('model_data/relevant_tweets.pkl', 'wb') as tmpfile:
            pickle.dump(user_to_rel_tweet, tmpfile)
    else:
        with open('model_data/relevant_tweets.pkl', 'rb') as tmpfile:
            user_to_rel_tweet = pickle.load(tmpfile)
        for user in users:
            # Fall back to an empty list for users without stored tweets.
            user.features['relevant_tweets'] = (
                user_to_rel_tweet[user.user_id]
                if 'relevant_tweets' in user.features
                and user.user_id in user_to_rel_tweet else [])

    if NEW_CRED:
        # Build credibility scores for all users on their topic
        print('Computing credibility')
        users = [prebuild_cred(model, u, k_tweets) for u in users]
        users_df = pd.DataFrame([vars(u) for u in users])
        [store_result(u) for u in users]
        with open('model_data/cred_pred_data', 'wb') as tmpfile:
            pickle.dump({'users': users_df, 'map': word_to_idx}, tmpfile)
    else:
        print('Loading users & model')
        with open('model_data/cred_pred_data', 'rb') as tmpfile:
            construct = pickle.load(tmpfile)
        users_df = construct['users']
        word_to_idx = construct['map']

    print('Making cred*stance predictions')
    X = []
    y = []
    all_evidence = []
    for idx, hsh in enumerate(facts['hash'].values):
        this_users = users_df[users_df['fact'] == hsh]
        this_x, evidence = only_cred_support_deny_pred(this_users)
        this_y = facts['true'].iloc[idx]
        evidence = sorted(evidence, reverse=True, key=lambda x: x[0])
        # print(facts[facts['hash']==hsh]['text'].values, int(this_y), this_x[-1])
        # print(evidence if len(evidence) < 3 else evidence[:3])
        X.append((this_x[-1], np.std(this_x)))
        y.append(int(this_y))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), SVC(C=1, gamma=1))
    std_clf.fit(X_train, y_train)
    pred = std_clf.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
        y_test, pred, average='macro')
    print("Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" %
          (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))
    if EXP1:
        return

    # Pred with faulty stance
    print('Making cred * faulty stance predictions')
    X = []
    y = []
    all_evidence = []
    with open('model_data/faulty_stances.json', 'rb') as tmpfile:
        f_stances_raw = json.load(tmpfile)
    # Remap the raw stance labels; unmapped values (e.g. v == 2) default to 0.
    f_stances = {}
    for k, v in f_stances_raw.items():
        this_val = 0
        if v == 0:
            this_val = 1
        elif v == 1:
            this_val = 2
        elif v == 3:
            this_val = 3
        f_stances[k] = this_val
    # print(sum([1 for x in users_df['tweet_id'].values if str(x) not in f_stances]))
    users_df['true_stance'] = users_df['stance']
    users_df['stance'] = users_df['tweet_id'].map(
        lambda x: f_stances[str(x)] if str(x) in f_stances
        else users_df[users_df['tweet_id'] == x]['true_stance'].values[0])
    # print(users_df[['stance', 'true_stance']])
    for idx, hsh in enumerate(facts['hash'].values):
        this_users = users_df[users_df['fact'] == hsh]
        this_x, evidence = only_cred_support_deny_pred(this_users)
        this_y = facts['true'].iloc[idx]
        X.append((this_x[-1], np.std(this_x)))
        y.append(int(this_y))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), SVC(C=1, gamma=1))
    std_clf.fit(X_train, y_train)
    pred = std_clf.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
        y_test, pred, average='macro')
    print("Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" %
          (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))

    # Pred with cred and standard features
    print('Making cred * stance plus standard feature predictions')
    with open('model_data/feature_data', 'rb') as tmpfile:
        fact_features = pickle.load(tmpfile)
    features = [
        'avg_links', 'avg_sent_neg', 'avg_sentiment', 'fr_has_url', 'lvl_size',
        'avg_len', 'avg_special_symbol', 'avg_time_retweet',
        'avg_count_distinct_words', 'avg_sent_pos'
    ]
    X = []
    y = []
    users_df['stance'] = users_df['true_stance']
    # print(fact_features['hash'])
    for idx, hsh in enumerate(facts['hash'].values):
        this_users = users_df[users_df['fact'] == hsh]
        this_x, evidence = only_cred_support_deny_pred(this_users)
        this_y = facts['true'].iloc[idx]
        this_fact_features = [0] * len(features)
        if hsh in fact_features['hash'].values:
            this_fact_features = fact_features[
                fact_features['hash'] == hsh][list(features)].values
        # axis=None flattens, so the cred pair and the content features end up
        # in one flat vector per rumor.
        X.append(np.concatenate(
            ([this_x[-1], np.std(this_x)], this_fact_features), axis=None))
        # X.append([this_x[-1], np.std(this_x)] + this_fact_features)
        y.append(int(this_y))

    from sklearn.model_selection import KFold
    kf = KFold(n_splits=3)
    for train_index, test_index in kf.split(X):
        X_train, X_test = np.asarray(X)[train_index], np.asarray(X)[test_index]
        y_train, y_test = np.asarray(y)[train_index], np.asarray(y)[test_index]
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
        # One SVM on the two credibility features...
        X_train_cred = np.asarray(X_train)[:, :2]
        X_test_cred = np.asarray(X_test)[:, :2]
        std_clf = make_pipeline(StandardScaler(),
                                SVC(C=1, gamma=1, probability=True))
        std_clf.fit(X_train_cred, y_train)
        pred_cred = std_clf.predict_proba(X_test_cred)
        # ...and one on the content features; fuse by averaging probabilities.
        X_train_feat = np.asarray(X_train)[:, 2:]
        X_test_feat = np.asarray(X_test)[:, 2:]
        std_clf = make_pipeline(StandardScaler(), PCA(n_components=8),
                                SVC(C=1, gamma=1, probability=True))
        std_clf.fit(X_train_feat, y_train)
        pred_feat = std_clf.predict_proba(X_test_feat)
        # print(pred_feat)
        pred_proba = np.add(pred_cred, pred_feat)
        # print(pred_proba)
        pred = [np.argmax(x) for x in np.divide(pred_proba, 2)]
        print(pred)
        print(y_test)
        score = metrics.accuracy_score(y_test, pred)
        precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
            y_test, pred, average='macro')
        print("Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" %
              (score, precision, recall, fscore))
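
# The manual probability averaging in the fold loop above is soft voting. A
# minimal sketch (not part of the original code) of the same late fusion via
# sklearn's VotingClassifier; the column slices [:, :2] / [:, 2:] mirror the
# cred/content split above:
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import FunctionTransformer

cred_clf = make_pipeline(FunctionTransformer(lambda X: X[:, :2]),
                         StandardScaler(),
                         SVC(C=1, gamma=1, probability=True))
feat_clf = make_pipeline(FunctionTransformer(lambda X: X[:, 2:]),
                         StandardScaler(), PCA(n_components=8),
                         SVC(C=1, gamma=1, probability=True))
fused = VotingClassifier([('cred', cred_clf), ('feat', feat_clf)],
                         voting='soft')
# fused.fit(X_train, y_train); fused.predict(X_test) reproduces the averaged
# predict_proba decision computed manually in the loop above.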
def main():
    global users
    wn.ensure_loaded()
    facts = gt.get_fact_topics(DIR)
    features = [
        'avg_mentions', 'avg_emoticons', 'avg_links', 'avg_questionM',
        'avg_personal_pronoun_first', 'avg_sent_pos', 'avg_sent_neg',
        'avg_sentiment', 'fr_has_url', 'share_most_freq_author', 'lvl_size',
        'avg_followers', 'avg_friends', 'avg_status_cnt', 'avg_reg_age'
    ]
    if NEW_DATA:
        users = gt.get_users(DIR)
        transactions = gt.get_transactions(DIR)
        print(transactions.describe())
        tr_hsh = transactions['fact'].values
        cond = facts['hash'].isin(tr_hsh)
        facts = facts[cond]
        facts = pd.DataFrame([
            get_features(fact, transactions, users)
            for idx, fact in facts.iterrows() if fact['true'] != 'unknown'
        ])
        with open('model_data/castillo_data', 'wb') as tmpfile:
            pickle.dump(facts, tmpfile)
    else:
        with open('model_data/castillo_data', 'rb') as tmpfile:
            facts = pickle.load(tmpfile)

    print(facts.describe())
    X = facts[list(features)].values
    y = facts['y'].values

    # One boxplot per feature, split by class, on a 3x5 grid (15 features).
    fig = plt.figure()
    fig.subplots_adjust(hspace=0.4, wspace=0.4)
    for i in range(1, len(features) + 1):
        ax = fig.add_subplot(3, 5, i)
        sns.boxplot(x="y", y=features[i - 1], data=facts, palette="Set3", ax=ax)
    # plt.show()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(),
                            DecisionTreeClassifier(random_state=42))
    std_clf.fit(X_train, y_train)
    pred_test_std = std_clf.predict(X_test)
    precision, recall, fscore, sup = precision_recall_fscore_support(
        y_test, pred_test_std, average='macro')
    score = metrics.accuracy_score(y_test, pred_test_std)
    print("Random split: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" %
          (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))
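
# With an integer cv, cross_val_score already stratifies classification folds,
# but it does not shuffle; on small rumor sets a shuffled StratifiedKFold can
# give more stable estimates. A minimal sketch (not in the original code):
from sklearn.model_selection import StratifiedKFold
shuffled_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# acc_scores = cross_val_score(std_clf, X, y, cv=shuffled_cv)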