def train():
    """Train the image sub-task detector and pickle it with its scaler.

    Labels are read from the tab-separated ``devset_subtask/data.txt``: the
    header row is skipped, column 0 is the image file name (``.jpg`` is
    removed to obtain the id) and column 2 is the label text.  Labels are
    encoded as ``1`` for ``'non-tampered'`` and ``-1`` for anything else.

    Feature rows come from ``output/devset_subtask_forensic_features.dat``
    and are paired, row by row, with the image ids listed in
    ``output/devset_subtask_eff_images.dat``.

    The estimator is chosen by the module-level ``classifier`` setting
    (``'logis'``, ``'svm'`` or ``'randomforest'``).  The fitted
    ``StandardScaler`` is written to ``output/RUN_1_subtask_scaler.pickle``
    and the fitted detector to ``output/RUN_1_subtask_classifier.pickle``.

    Raises:
        ValueError: if ``classifier`` is not a supported name, or if the
            number of feature rows differs from the number of listed images.
        KeyError: if a listed image has no entry in the label file.
    """
    training_file_detail = 'devset_subtask/data.txt'

    # Build the estimator first so a misconfigured ``classifier`` fails with a
    # clear message before anything is written to disk.
    if classifier == 'logis':
        detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier == 'svm':
        detector = svm.SVC()
    elif classifier == 'randomforest':
        # min_samples_split=1 is rejected by current scikit-learn; 2 is the
        # smallest meaningful value (a one-sample node cannot be split).
        detector = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                        min_samples_split=2, random_state=0)
    else:
        raise ValueError('unknown classifier: %r' % (classifier,))

    # image id -> +1 (non-tampered) / -1 (everything else)
    img_dict = {}
    with open(training_file_detail, 'r') as f:
        next(f, None)  # skip the header row
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines
            parts = line.split('\t')
            im_id = parts[0].strip('\n\t ').replace('.jpg', '')
            label = parts[2].strip('\n\t ')
            img_dict[im_id] = 1 if label == 'non-tampered' else -1

    # Ids of the images that have a feature row, in feature-row order.
    with open('output/devset_subtask_eff_images.dat') as f:
        forensic_eff_images = [ln.strip('\n\t ') for ln in f if ln.strip()]

    labels = np.zeros((len(forensic_eff_images),), dtype=int)
    for ind, im in enumerate(forensic_eff_images):
        try:
            labels[ind] = img_dict[im]
        except KeyError:
            raise KeyError('no label for image %r in %s'
                           % (im, training_file_detail))

    # ndmin=2 keeps a single-row file two-dimensional.
    X_train = np.loadtxt('output/devset_subtask_forensic_features.dat',
                         delimiter=',', dtype=float, ndmin=2)
    y_train = labels
    if X_train.shape[0] != y_train.shape[0]:
        raise ValueError('%d feature rows but %d listed images'
                         % (X_train.shape[0], y_train.shape[0]))

    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    with open('output/RUN_1_subtask_scaler.pickle', 'wb') as handle:
        pickle.dump(scaler, handle)

    detector.fit(X_train, y_train)
    with open('output/RUN_1_subtask_classifier.pickle', 'wb') as handle:
        pickle.dump(detector, handle)
def train():
    """Train the image sub-task detector and pickle it with its scaler.

    Labels are read from the tab-separated ``devset_subtask/data.txt``: the
    header row is skipped, column 0 is the image file name (``.jpg`` is
    removed to obtain the id) and column 2 is the label text.  Labels are
    encoded as ``1`` for ``'non-tampered'`` and ``-1`` for anything else.

    Feature rows come from ``output/devset_subtask_forensic_features.dat``
    and are paired, row by row, with the image ids listed in
    ``output/devset_subtask_eff_images.dat``.

    The estimator is chosen by the module-level ``classifier`` setting
    (``'logis'``, ``'svm'`` or ``'randomforest'``).  The fitted
    ``StandardScaler`` is written to ``output/RUN_1_subtask_scaler.pickle``
    and the fitted detector to ``output/RUN_1_subtask_classifier.pickle``.

    Raises:
        ValueError: if ``classifier`` is not a supported name, or if the
            number of feature rows differs from the number of listed images.
        KeyError: if a listed image has no entry in the label file.
    """
    training_file_detail = 'devset_subtask/data.txt'

    # Build the estimator first so a misconfigured ``classifier`` fails with a
    # clear message before anything is written to disk.
    if classifier == 'logis':
        detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier == 'svm':
        detector = svm.SVC()
    elif classifier == 'randomforest':
        # min_samples_split=1 is rejected by current scikit-learn; 2 is the
        # smallest meaningful value (a one-sample node cannot be split).
        detector = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                        min_samples_split=2, random_state=0)
    else:
        raise ValueError('unknown classifier: %r' % (classifier,))

    # image id -> +1 (non-tampered) / -1 (everything else)
    img_dict = {}
    with open(training_file_detail, 'r') as f:
        next(f, None)  # skip the header row
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines
            parts = line.split('\t')
            im_id = parts[0].strip('\n\t ').replace('.jpg', '')
            label = parts[2].strip('\n\t ')
            img_dict[im_id] = 1 if label == 'non-tampered' else -1

    # Ids of the images that have a feature row, in feature-row order.
    with open('output/devset_subtask_eff_images.dat') as f:
        forensic_eff_images = [ln.strip('\n\t ') for ln in f if ln.strip()]

    labels = np.zeros((len(forensic_eff_images),), dtype=int)
    for ind, im in enumerate(forensic_eff_images):
        try:
            labels[ind] = img_dict[im]
        except KeyError:
            raise KeyError('no label for image %r in %s'
                           % (im, training_file_detail))

    # ndmin=2 keeps a single-row file two-dimensional.
    X_train = np.loadtxt('output/devset_subtask_forensic_features.dat',
                         delimiter=',', dtype=float, ndmin=2)
    y_train = labels
    if X_train.shape[0] != y_train.shape[0]:
        raise ValueError('%d feature rows but %d listed images'
                         % (X_train.shape[0], y_train.shape[0]))

    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    with open('output/RUN_1_subtask_scaler.pickle', 'wb') as handle:
        pickle.dump(scaler, handle)

    detector.fit(X_train, y_train)
    with open('output/RUN_1_subtask_classifier.pickle', 'wb') as handle:
        pickle.dump(detector, handle)
def train():
    """Train the RUN_1 tweet-level classifier and pickle it with its scaler.

    Tweet features are read from ``output/devset_tweet_features.dat``
    (comma-separated, last column is the integer label).  Only rows whose
    post id, taken row by row from ``output/devset_eff_posts.dat``, appears
    in ``dataset_for_training/real_tweet_id.data`` or
    ``dataset_for_training/fake_tweet_id.data`` are kept for training.

    The estimator is chosen by the module-level ``classifier1`` setting
    (``'logis'``, ``'svm'`` or ``'randomforest'``).  The fitted detector is
    written to ``output/RUN_1_classifier_1.pickle`` and the fitted
    ``StandardScaler`` to ``output/RUN_1_scaler_1.pickle``.  The number of
    training rows labelled ``1`` (real) and ``-1`` (fake) is printed.

    Raises:
        ValueError: if ``classifier1`` is not a supported name.
    """
    # Build the estimator first so a bad setting fails with a clear message
    # instead of an AttributeError on ``None`` at fit time.
    if classifier1 == 'logis':
        detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier1 == 'svm':
        detector = svm.SVC()
    elif classifier1 == 'randomforest':
        # min_samples_split=1 is rejected by current scikit-learn; 2 is the
        # smallest meaningful value (a one-sample node cannot be split).
        detector = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                        min_samples_split=2, random_state=0)
    else:
        raise ValueError('unknown classifier: %r' % (classifier1,))

    #
    # load tweet features (ndmin=2 keeps a single-row file two-dimensional)
    #
    tweet_features = np.loadtxt('output/devset_tweet_features.dat',
                                delimiter=',', ndmin=2)
    tweet_labels = np.array(tweet_features[:, -1], dtype=int)
    tweet_features = tweet_features[:, :-1]

    # Restrict training to the posts listed in the curated id files
    # (the original comment calls this "making the training set balanced").
    # A set gives O(1) membership tests instead of rescanning a list per post.
    training_posts = set(read_list('dataset_for_training/real_tweet_id.data'))
    training_posts.update(read_list('dataset_for_training/fake_tweet_id.data'))
    all_posts = read_list('output/devset_eff_posts.dat')
    used_ind = np.array([p in training_posts for p in all_posts], dtype=bool)
    tweet_features = tweet_features[used_ind, :]
    tweet_labels = tweet_labels[used_ind]

    #
    # training classifier 1
    #
    scaler_1 = preprocessing.StandardScaler().fit(tweet_features)
    tweet_features = scaler_1.transform(tweet_features)
    detector.fit(tweet_features, tweet_labels)

    with open('output/RUN_1_classifier_1.pickle', 'wb') as handle:
        pickle.dump(detector, handle)
    with open('output/RUN_1_scaler_1.pickle', 'wb') as handle:
        pickle.dump(scaler_1, handle)

    print('Training statistics\n')
    print('Number of real tweets: ', np.sum(tweet_labels == 1))
    print('Number of fake tweets: ', np.sum(tweet_labels == -1))
def train():
    """Train and pickle the RUN_2 tweet-level and topic-level classifiers.

    Stage 1 (tweets): features are read from
    ``output/devset_tweet_features.dat`` (comma-separated, last column is the
    integer label).  Only rows whose post id, taken row by row from
    ``output/devset_eff_posts.dat``, appears in the real/fake tweet id files
    under ``dataset_for_training/`` are used.  The estimator named by the
    module-level ``classifier1`` is fitted on standardized features and
    written to ``output/RUN_2_classifier_1.pickle`` (scaler:
    ``output/RUN_2_scaler_1.pickle``).

    Stage 2 (topics): for every id in the real/fake image id files, the
    forensic and textual feature rows are concatenated; a missing modality is
    left as zeros and ids with neither modality are dropped.  Ids in the fake
    list are labelled ``-1``, all others ``1``.  The estimator named by
    ``classifier2`` is fitted and written to
    ``output/RUN_2_classifier_2.pickle`` (scaler:
    ``output/RUN_2_scaler_2.pickle``).

    Finally the per-class training counts are printed.

    Raises:
        ValueError: if ``classifier1`` or ``classifier2`` is not one of
            ``'logis'``, ``'svm'`` or ``'randomforest'``.
    """
    def _make_detector(name):
        # Map a configured classifier name to a fresh, unfitted estimator.
        if name == 'logis':
            return logis(C=1e5, solver='liblinear', multi_class='ovr')
        if name == 'svm':
            return svm.SVC()
        if name == 'randomforest':
            # min_samples_split=1 is rejected by current scikit-learn; 2 is
            # the smallest meaningful value.
            return ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                        min_samples_split=2, random_state=0)
        raise ValueError('unknown classifier: %r' % (name,))

    def _first_index(items):
        # item -> index of its first occurrence (same result as list.index).
        index = {}
        for pos, item in enumerate(items):
            index.setdefault(item, pos)
        return index

    # Validate both settings before any training or file output happens.
    detector = _make_detector(classifier1)
    detector_2 = _make_detector(classifier2)

    #
    # load tweet features (ndmin=2 keeps a single-row file two-dimensional)
    #
    tweet_features = np.loadtxt('output/devset_tweet_features.dat',
                                delimiter=',', ndmin=2)
    tweet_labels = np.array(tweet_features[:, -1], dtype=int)
    tweet_features = tweet_features[:, :-1]

    # Restrict training to the posts listed in the curated id files
    # (the original comment calls this "making the training set balanced").
    training_posts = set(read_list('dataset_for_training/real_tweet_id.data'))
    training_posts.update(read_list('dataset_for_training/fake_tweet_id.data'))
    all_posts = read_list('output/devset_eff_posts.dat')
    used_ind = np.array([p in training_posts for p in all_posts], dtype=bool)
    tweet_features = tweet_features[used_ind, :]
    tweet_labels = tweet_labels[used_ind]

    #
    # training classifier 1
    #
    scaler_1 = preprocessing.StandardScaler().fit(tweet_features)
    tweet_features = scaler_1.transform(tweet_features)
    detector.fit(tweet_features, tweet_labels)

    with open('output/RUN_2_classifier_1.pickle', 'wb') as handle:
        pickle.dump(detector, handle)
    with open('output/RUN_2_scaler_1.pickle', 'wb') as handle:
        pickle.dump(scaler_1, handle)

    #
    # load textual and forensic features
    #
    forensic_features = np.loadtxt('output/devset_forensic_features.dat',
                                   delimiter=',', dtype=float, ndmin=2)
    eff_forensic_topics = read_list('output/devset_eff_forensic_topics.dat')
    textual_features = np.loadtxt('output/devset_textual_features.dat',
                                  delimiter=',', dtype=float, ndmin=2)
    eff_textual_topics = read_list('output/devset_eff_textual_topics.dat')

    real_mul_list = read_list('dataset_for_training/real_image_id.data')
    fake_mul_list = read_list('dataset_for_training/fake_image_id.data')
    mul_list = list(real_mul_list)
    mul_list.extend(fake_mul_list)

    # Hash-based lookups replace repeated list scans inside the loop.
    fake_ids = set(fake_mul_list)
    forensic_row = _first_index(eff_forensic_topics)
    textual_row = _first_index(eff_textual_topics)

    n_forensic = forensic_features.shape[1]
    topic_features = np.zeros(
        (len(mul_list), n_forensic + textual_features.shape[1]), dtype=float)
    topic_labels = np.ones((len(mul_list),), dtype=int)
    used_ind = np.zeros((len(mul_list),), dtype=bool)
    for ind, m in enumerate(mul_list):
        f_row = forensic_row.get(m)
        t_row = textual_row.get(m)
        if f_row is not None:
            topic_features[ind, :n_forensic] = forensic_features[f_row]
        if t_row is not None:
            topic_features[ind, n_forensic:] = textual_features[t_row]
        # keep the topic when at least one modality is available
        used_ind[ind] = f_row is not None or t_row is not None
        if m in fake_ids:
            topic_labels[ind] = -1

    # remove unused topic features
    topic_features = topic_features[used_ind, :]
    topic_labels = topic_labels[used_ind]

    #
    # training classifier 2
    #
    scaler_2 = preprocessing.StandardScaler().fit(topic_features)
    topic_features = scaler_2.transform(topic_features)
    detector_2.fit(topic_features, topic_labels)

    with open('output/RUN_2_classifier_2.pickle', 'wb') as handle:
        pickle.dump(detector_2, handle)
    with open('output/RUN_2_scaler_2.pickle', 'wb') as handle:
        pickle.dump(scaler_2, handle)

    print('Training statistics\n')
    print('Number of real tweets: ', np.sum(tweet_labels == 1))
    print('Number of fake tweets: ', np.sum(tweet_labels == -1))
    print('Number of real topics: ', np.sum(topic_labels == 1))
    print('Number of fake topics: ', np.sum(topic_labels == -1))
labels = np.array(data[:, -1], dtype=int) data = data[:, :-1] # cross-validation 10 fold n_folds = 10 kf = KFold(data.shape[0], n_folds=n_folds, shuffle=True) avg_score = 0 avg_acc = 0 for train_index, test_index in kf: X_train, X_test = data[train_index], data[test_index] y_train, y_test = labels[train_index], labels[test_index] detector = None if classifier == 'logis': detector = logis(C=1e5, solver='lbfgs', multi_class='ovr') elif classifier == 'svm': detector = svm.SVC() elif classifier == 'randomforest': detector = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0) scaler = preprocessing.StandardScaler().fit(X_train) X_train = scaler.transform(X_train) detector.fit(X_train, y_train) X_test = scaler.transform(X_test) pr_labels = detector.predict(X_test)
def train():
    """Train and pickle the RUN_3 tweet-level and topic-level classifiers.

    Stage 1 (tweets): features are read from
    ``output/devset_tweet_features.dat`` (comma-separated, last column is the
    integer label).  Only rows whose post id, taken row by row from
    ``output/devset_eff_posts.dat``, appears in the real/fake tweet id files
    under ``dataset_for_training/`` are used.  The estimator named by the
    module-level ``classifier1`` is fitted on standardized features and
    written to ``output/RUN_3_classifier_1.pickle`` (scaler:
    ``output/RUN_3_scaler_1.pickle``).

    Stage 2 (topics): for every id in the real/fake image id files, the
    forensic and textual feature rows are concatenated; a missing modality is
    left as zeros and ids with neither modality are dropped.  Ids in the fake
    list are labelled ``-1``, all others ``1``.  The estimator named by
    ``classifier2`` is fitted and written to
    ``output/RUN_3_classifier_2.pickle`` (scaler:
    ``output/RUN_3_scaler_2.pickle``).

    Finally the per-class training counts are printed.

    Raises:
        ValueError: if ``classifier1`` or ``classifier2`` is not one of
            ``'logis'``, ``'svm'`` or ``'randomforest'``.
    """
    def _make_detector(name):
        # Map a configured classifier name to a fresh, unfitted estimator.
        if name == 'logis':
            return logis(C=1e5, solver='liblinear', multi_class='ovr')
        if name == 'svm':
            return svm.SVC()
        if name == 'randomforest':
            # min_samples_split=1 is rejected by current scikit-learn; 2 is
            # the smallest meaningful value.
            return ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                        min_samples_split=2, random_state=0)
        raise ValueError('unknown classifier: %r' % (name,))

    def _first_index(items):
        # item -> index of its first occurrence (same result as list.index).
        index = {}
        for pos, item in enumerate(items):
            index.setdefault(item, pos)
        return index

    # Validate both settings before any training or file output happens.
    detector = _make_detector(classifier1)
    detector_2 = _make_detector(classifier2)

    #
    # load tweet features (ndmin=2 keeps a single-row file two-dimensional)
    #
    tweet_features = np.loadtxt('output/devset_tweet_features.dat',
                                delimiter=',', ndmin=2)
    tweet_labels = np.array(tweet_features[:, -1], dtype=int)
    tweet_features = tweet_features[:, :-1]

    # Restrict training to the posts listed in the curated id files
    # (the original comment calls this "making the training set balanced").
    training_posts = set(read_list('dataset_for_training/real_tweet_id.data'))
    training_posts.update(read_list('dataset_for_training/fake_tweet_id.data'))
    all_posts = read_list('output/devset_eff_posts.dat')
    used_ind = np.array([p in training_posts for p in all_posts], dtype=bool)
    tweet_features = tweet_features[used_ind, :]
    tweet_labels = tweet_labels[used_ind]

    #
    # training classifier 1
    #
    scaler_1 = preprocessing.StandardScaler().fit(tweet_features)
    tweet_features = scaler_1.transform(tweet_features)
    detector.fit(tweet_features, tweet_labels)

    with open('output/RUN_3_classifier_1.pickle', 'wb') as handle:
        pickle.dump(detector, handle)
    with open('output/RUN_3_scaler_1.pickle', 'wb') as handle:
        pickle.dump(scaler_1, handle)

    #
    # load textual and forensic features
    #
    forensic_features = np.loadtxt('output/devset_forensic_features.dat',
                                   delimiter=',', dtype=float, ndmin=2)
    eff_forensic_topics = read_list('output/devset_eff_forensic_topics.dat')
    textual_features = np.loadtxt('output/devset_textual_features.dat',
                                  delimiter=',', dtype=float, ndmin=2)
    eff_textual_topics = read_list('output/devset_eff_textual_topics.dat')

    real_mul_list = read_list('dataset_for_training/real_image_id.data')
    fake_mul_list = read_list('dataset_for_training/fake_image_id.data')
    mul_list = list(real_mul_list)
    mul_list.extend(fake_mul_list)

    # Hash-based lookups replace repeated list scans inside the loop.
    fake_ids = set(fake_mul_list)
    forensic_row = _first_index(eff_forensic_topics)
    textual_row = _first_index(eff_textual_topics)

    n_forensic = forensic_features.shape[1]
    topic_features = np.zeros(
        (len(mul_list), n_forensic + textual_features.shape[1]), dtype=float)
    topic_labels = np.ones((len(mul_list),), dtype=int)
    used_ind = np.zeros((len(mul_list),), dtype=bool)
    for ind, m in enumerate(mul_list):
        f_row = forensic_row.get(m)
        t_row = textual_row.get(m)
        if f_row is not None:
            topic_features[ind, :n_forensic] = forensic_features[f_row]
        if t_row is not None:
            topic_features[ind, n_forensic:] = textual_features[t_row]
        # keep the topic when at least one modality is available
        used_ind[ind] = f_row is not None or t_row is not None
        if m in fake_ids:
            topic_labels[ind] = -1

    # remove unused topic features
    topic_features = topic_features[used_ind, :]
    topic_labels = topic_labels[used_ind]

    #
    # training classifier 2
    #
    scaler_2 = preprocessing.StandardScaler().fit(topic_features)
    topic_features = scaler_2.transform(topic_features)
    detector_2.fit(topic_features, topic_labels)

    with open('output/RUN_3_classifier_2.pickle', 'wb') as handle:
        pickle.dump(detector_2, handle)
    with open('output/RUN_3_scaler_2.pickle', 'wb') as handle:
        pickle.dump(scaler_2, handle)

    print('Training statistics\n')
    print('Number of real tweets: ', np.sum(tweet_labels == 1))
    print('Number of fake tweets: ', np.sum(tweet_labels == -1))
    print('Number of real topics: ', np.sum(topic_labels == 1))
    print('Number of fake topics: ', np.sum(topic_labels == -1))
labels = np.array(data[:,-1],dtype=int) data = data[:,:-1] # cross-validation 10 fold n_folds = 10 kf = KFold(data.shape[0], n_folds=n_folds, shuffle = True) avg_score = 0 avg_acc = 0 for train_index, test_index in kf: X_train, X_test = data[train_index], data[test_index] y_train, y_test = labels[train_index], labels[test_index] detector = None if classifier == 'logis': detector = logis(C=1e5, solver='lbfgs', multi_class='ovr') elif classifier == 'svm': detector = svm.SVC() elif classifier == 'randomforest': detector = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0) scaler = preprocessing.StandardScaler().fit(X_train) X_train = scaler.transform(X_train) detector.fit(X_train, y_train) X_test = scaler.transform(X_test) pr_labels = detector.predict(X_test) acc = sum(y_test == pr_labels) / y_test.shape[0]
def main():
    """Evaluate tweet classification alone and with topic-level late fusion.

    Steps:

    1. Call ``selectively_split_data()`` and load
       ``output/training_data.dat`` / ``output/testing_data.dat``
       (whitespace-separated, last column is the label; ``-1`` is remapped
       to ``0``).
    2. Fit the tweet-level estimator named by the module-level
       ``classifier1`` on standardized features and print F1 and accuracy
       on the test split.
    3. Fit the topic-level estimator named by ``classifier2`` on the output
       of ``extract_topic_feature_train()`` (last column is the label).
    4. For every test post whose topic has features, replace the tweet
       probabilities with ``0.8 * topic + 0.2 * tweet``; a post is predicted
       as class ``1`` when that class has the larger probability.  Print F1
       and accuracy of the fused predictions.

    Raises:
        ValueError: if ``classifier1`` or ``classifier2`` is not one of
            ``'logis'``, ``'svm'`` or ``'randomforest'``.
    """
    topic_weight = 0.8
    tweet_weight = 0.2

    def _make_detector(name):
        # Map a configured classifier name to a fresh, unfitted estimator.
        if name == 'logis':
            return logis(C=1e5, solver='liblinear', multi_class='ovr')
        if name == 'svm':
            # predict_proba() is used below; SVC only offers it when
            # probability estimates are enabled at construction time.
            return svm.SVC(probability=True)
        if name == 'randomforest':
            # min_samples_split=1 is rejected by current scikit-learn; 2 is
            # the smallest meaningful value.
            return ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                        min_samples_split=2, random_state=0)
        raise ValueError('unknown classifier: %r' % (name,))

    # Validate both settings before doing any work.
    detector = _make_detector(classifier1)
    topic_detector = _make_detector(classifier2)

    # presumably (re)writes the output/training_* and output/testing_* files
    # read below -- confirm against its definition
    selectively_split_data()

    # load tweet features (ndmin=2 keeps a single-row file two-dimensional)
    X_train = np.loadtxt('output/training_data.dat', dtype=float, ndmin=2)
    y_train = np.array(X_train[:, -1], dtype=int)
    y_train[y_train == -1] = 0
    X_train = X_train[:, :-1]
    print('number of real training samples: ', np.sum(y_train == 1), '/',
          X_train.shape[0])

    X_test = np.loadtxt('output/testing_data.dat', dtype=float, ndmin=2)
    y_test = np.array(X_test[:, -1], dtype=int)
    y_test[y_test == -1] = 0
    X_test = X_test[:, :-1]

    #
    # stage 1: tweet-level classifier only
    #
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    detector.fit(X_train_scaled, y_train)
    pr_labels = detector.predict(X_test_scaled)
    pr_proba = detector.predict_proba(X_test_scaled)

    score = f1_score(y_test, pr_labels, average='binary')
    acc = np.sum(y_test == pr_labels) * 100 / X_test_scaled.shape[0]
    print('number of real testing samples: ', np.sum(y_test == 1), '/',
          y_test.shape[0])
    print('\nwithout forensic and textual features: \n')
    print('F1 score: ', score * 100, '%')
    print('Acc: ', acc, '%')

    print('\nwith forensic and textual features: \n')

    # An earlier variant concatenated topic features onto the tweet features
    # and re-fitted ``detector``; the late-fusion approach below replaced it.
    with open('output/' + dataset + '_posts_dict.pickle', 'rb') as handle:
        posts_dict = pickle.load(handle)

    #
    # stage 2: topic-level classifier
    #
    extract_multimedia_labels()
    topic_features_train, _ = extract_topic_feature_train()
    y_topic_train = np.array(topic_features_train[:, -1], dtype=int)
    # Remap the *topic* labels; the original remapped ``y_train`` a second
    # time here (copy-paste slip), leaving the topic labels untouched.
    y_topic_train[y_topic_train == -1] = 0
    X_topic_train = topic_features_train[:, :-1]
    topic_scaler = preprocessing.StandardScaler().fit(X_topic_train)
    X_topic_train = topic_scaler.transform(X_topic_train)
    topic_detector.fit(X_topic_train, y_topic_train)

    # refine results
    # assumes testing_posts is in the same row order as testing_data.dat --
    # TODO confirm against selectively_split_data()
    testing_posts = read_list('output/testing_posts.dat')
    post_topics = [posts_dict[p][0].strip('\n\t ') for p in testing_posts]
    mul_list_test = list(dict.fromkeys(post_topics))  # unique, order kept
    topic_features_test, eff_topics_test = extract_topic_feature_test(
        mul_list_test)
    X_topic_test = topic_scaler.transform(topic_features_test)
    topic_pr_probas = topic_detector.predict_proba(X_topic_test)

    # topic id -> row of its first occurrence (same result as list.index),
    # built once instead of scanning the list for every post
    topic_row = {}
    for pos, topic in enumerate(eff_topics_test):
        topic_row.setdefault(topic, pos)

    new_results = np.zeros((len(testing_posts),), dtype=int)
    for ind, mul_id in enumerate(post_topics):
        new_proba = pr_proba[ind, :]
        row = topic_row.get(mul_id)
        if row is not None:
            new_proba = (topic_weight * topic_pr_probas[row, :]
                         + tweet_weight * new_proba)
        new_results[ind] = new_proba[1] > new_proba[0]

    score = f1_score(y_test, new_results, average='binary')
    acc = np.sum(y_test == new_results) * 100 / y_test.shape[0]

    print('F1 score: ', score * 100)
    print('Acc: ', acc, '%')
def main(use_topic_feature=False, n_folds=20):
    """Cross-validate the tweet-level classifier and print per-fold scores.

    Tweet features are read from ``output/<dataset>_tweet_features.dat``
    (comma-separated, last column is the integer label).  For every fold a
    ``StandardScaler`` is fitted on the training part only, the estimator
    named by the module-level ``classifier`` is fitted, and the binary F1
    score and accuracy on the held-out part are printed.  The averages over
    all folds are printed at the end.

    Args:
        use_topic_feature: when true, the topic features returned by
            ``extract_topic_feature`` are appended to each post's tweet
            features (posts whose topic has no features get zeros).
            Defaults to the previously hard-coded "off".
        n_folds: number of shuffled folds.  Defaults to the previously
            hard-coded 20 (the old comment said 10).

    Raises:
        ValueError: if ``classifier`` is not one of ``'logis'``, ``'svm'``
            or ``'randomforest'``.
    """
    def _make_detector(name):
        # Map the configured classifier name to a fresh, unfitted estimator.
        if name == 'logis':
            return logis(C=1e5, solver='liblinear', multi_class='ovr')
        if name == 'svm':
            return svm.SVC()
        if name == 'randomforest':
            # min_samples_split=1 is rejected by current scikit-learn; 2 is
            # the smallest meaningful value.
            return ExtraTreesClassifier(n_estimators=100, max_depth=None,
                                        min_samples_split=2, random_state=0)
        raise ValueError('unknown classifier: %r' % (name,))

    # load tweet features (ndmin=2 keeps a single-row file two-dimensional)
    data = np.loadtxt('output/' + dataset + '_tweet_features.dat',
                      delimiter=',', ndmin=2)
    labels = np.array(data[:, -1], dtype=int)
    data = data[:, :-1]

    if use_topic_feature:
        # load effective posts (one post id per feature row)
        with open('output/' + dataset + '_eff_posts.dat') as f:
            eff_posts = [line.strip('\n\t ') for line in f]

        # load posts dict
        with open('output/' + dataset + '_posts_dict.pickle', 'rb') as handle:
            posts_dict = pickle.load(handle)

        # concatenate features together
        post_topics = [posts_dict[p][0].strip('\n\t ') for p in eff_posts]
        mul_list = list(dict.fromkeys(post_topics))  # unique, order kept
        topic_features, eff_topics = extract_topic_feature(mul_list)

        # topic id -> row of its first occurrence (same result as list.index)
        topic_row = {}
        for pos, topic in enumerate(eff_topics):
            topic_row.setdefault(topic, pos)

        n_tweet = data.shape[1]
        concat_data = np.zeros(
            (data.shape[0], n_tweet + topic_features.shape[1]))
        concat_data[:, :n_tweet] = data
        for ind, mul_id in enumerate(post_topics):
            row = topic_row.get(mul_id)
            if row is not None:
                concat_data[ind, n_tweet:] = topic_features[row]
    else:
        concat_data = data

    # shuffled k-fold cross-validation
    # NOTE(review): this is the legacy ``KFold(n, n_folds=...)`` call
    # signature with direct iteration -- confirm which KFold is imported
    # before upgrading scikit-learn.
    kf = KFold(data.shape[0], n_folds=n_folds, shuffle=True)
    avg_score = 0
    avg_acc = 0
    for count, (train_index, test_index) in enumerate(kf, 1):
        X_train, X_test = concat_data[train_index], concat_data[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        # fit the scaler on the training part only to avoid leakage
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        detector = _make_detector(classifier)
        detector.fit(X_train, y_train)
        pr_labels = detector.predict(X_test)

        score = f1_score(y_test, pr_labels, average='binary')
        avg_score += score
        acc = np.sum(y_test == pr_labels) / X_test.shape[0]
        avg_acc += acc

        print('\nLoop ', count, '\n')
        print('F1 score: ', score * 100, '%')
        # acc is a fraction; scale it so the printed '%' is truthful
        print('Acc: ', acc * 100, '%')

    print('Average F1 score: ', avg_score * 100 / n_folds, '%')
    print('Average accuracy: ', avg_acc * 100 / n_folds, '%')
def main():
    """Evaluate tweet classification alone and with topic-level late fusion.

    Steps:

    1. Call ``selectively_split_data()`` and load
       ``output/training_data.dat`` / ``output/testing_data.dat``
       (whitespace-separated, last column is the label; ``-1`` is remapped
       to ``0``).
    2. Fit the tweet-level estimator named by the module-level
       ``classifier1`` on standardized features and print F1 and accuracy
       on the test split.
    3. Fit the topic-level estimator named by ``classifier2`` on the output
       of ``extract_topic_feature_train()`` (last column is the label).
    4. For every test post whose topic has features, replace the tweet
       probabilities with ``0.8 * topic + 0.2 * tweet``; a post is predicted
       as class ``1`` when that class has the larger probability.  Print F1
       and accuracy of the fused predictions.

    Raises:
        ValueError: if ``classifier1`` or ``classifier2`` is not one of
            ``'logis'``, ``'svm'`` or ``'randomforest'``.
    """
    topic_weight = 0.8
    tweet_weight = 0.2

    def _make_detector(name):
        # Map a configured classifier name to a fresh, unfitted estimator.
        if name == 'logis':
            return logis(C=1e5, solver='liblinear', multi_class='ovr')
        if name == 'svm':
            # predict_proba() is used below; SVC only offers it when
            # probability estimates are enabled at construction time.
            return svm.SVC(probability=True)
        if name == 'randomforest':
            # min_samples_split=1 is rejected by current scikit-learn; 2 is
            # the smallest meaningful value.
            return ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                        min_samples_split=2, random_state=0)
        raise ValueError('unknown classifier: %r' % (name,))

    # Validate both settings before doing any work.
    detector = _make_detector(classifier1)
    topic_detector = _make_detector(classifier2)

    # presumably (re)writes the output/training_* and output/testing_* files
    # read below -- confirm against its definition
    selectively_split_data()

    # load tweet features (ndmin=2 keeps a single-row file two-dimensional)
    X_train = np.loadtxt('output/training_data.dat', dtype=float, ndmin=2)
    y_train = np.array(X_train[:, -1], dtype=int)
    y_train[y_train == -1] = 0
    X_train = X_train[:, :-1]
    print('number of real training samples: ', np.sum(y_train == 1), '/',
          X_train.shape[0])

    X_test = np.loadtxt('output/testing_data.dat', dtype=float, ndmin=2)
    y_test = np.array(X_test[:, -1], dtype=int)
    y_test[y_test == -1] = 0
    X_test = X_test[:, :-1]

    #
    # stage 1: tweet-level classifier only
    #
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    detector.fit(X_train_scaled, y_train)
    pr_labels = detector.predict(X_test_scaled)
    pr_proba = detector.predict_proba(X_test_scaled)

    score = f1_score(y_test, pr_labels, average='binary')
    acc = np.sum(y_test == pr_labels) * 100 / X_test_scaled.shape[0]
    print('number of real testing samples: ', np.sum(y_test == 1), '/',
          y_test.shape[0])
    print('\nwithout forensic and textual features: \n')
    print('F1 score: ', score * 100, '%')
    print('Acc: ', acc, '%')

    print('\nwith forensic and textual features: \n')

    # An earlier variant concatenated topic features onto the tweet features
    # and re-fitted ``detector``; the late-fusion approach below replaced it.
    with open('output/' + dataset + '_posts_dict.pickle', 'rb') as handle:
        posts_dict = pickle.load(handle)

    #
    # stage 2: topic-level classifier
    #
    extract_multimedia_labels()
    topic_features_train, _ = extract_topic_feature_train()
    y_topic_train = np.array(topic_features_train[:, -1], dtype=int)
    # Remap the *topic* labels; the original remapped ``y_train`` a second
    # time here (copy-paste slip), leaving the topic labels untouched.
    y_topic_train[y_topic_train == -1] = 0
    X_topic_train = topic_features_train[:, :-1]
    topic_scaler = preprocessing.StandardScaler().fit(X_topic_train)
    X_topic_train = topic_scaler.transform(X_topic_train)
    topic_detector.fit(X_topic_train, y_topic_train)

    # refine results
    # assumes testing_posts is in the same row order as testing_data.dat --
    # TODO confirm against selectively_split_data()
    testing_posts = read_list('output/testing_posts.dat')
    post_topics = [posts_dict[p][0].strip('\n\t ') for p in testing_posts]
    mul_list_test = list(dict.fromkeys(post_topics))  # unique, order kept
    topic_features_test, eff_topics_test = extract_topic_feature_test(
        mul_list_test)
    X_topic_test = topic_scaler.transform(topic_features_test)
    topic_pr_probas = topic_detector.predict_proba(X_topic_test)

    # topic id -> row of its first occurrence (same result as list.index),
    # built once instead of scanning the list for every post
    topic_row = {}
    for pos, topic in enumerate(eff_topics_test):
        topic_row.setdefault(topic, pos)

    new_results = np.zeros((len(testing_posts),), dtype=int)
    for ind, mul_id in enumerate(post_topics):
        new_proba = pr_proba[ind, :]
        row = topic_row.get(mul_id)
        if row is not None:
            new_proba = (topic_weight * topic_pr_probas[row, :]
                         + tweet_weight * new_proba)
        new_results[ind] = new_proba[1] > new_proba[0]

    score = f1_score(y_test, new_results, average='binary')
    acc = np.sum(y_test == new_results) * 100 / y_test.shape[0]

    print('F1 score: ', score * 100)
    print('Acc: ', acc, '%')
# cross-validation 10 fold n_folds = 10 kf = KFold(data.shape[0], n_folds=n_folds, shuffle=True) avg_score = 0 avg_acc = 0 for train_index, test_index in kf: X_train, X_test = data[train_index], data[test_index] y_train, y_test = labels[train_index], labels[test_index] # normalize data scaler = preprocessing.StandardScaler().fit(X_train) X_train = scaler.transform(X_train) detector = None if classifier == 'logis': detector = logis(C=1e5, solver='lbfgs', multi_class='multinomial') elif classifier == 'svm': detector = svm.SVC() elif classifier == 'randomforest': detector = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0) detector.fit(X_train, y_train) X_test = scaler.transform(X_test) pr_labels = detector.predict(X_test) acc = sum(y_test == pr_labels) / y_test.shape[0] score = f1_score(y_test, pr_labels, average='binary') avg_acc += acc
# cross-validation 10 fold n_folds = 10 kf = KFold(data.shape[0], n_folds=n_folds, shuffle=True) avg_score = 0 avg_acc = 0 for train_index, test_index in kf: X_train, X_test = data[train_index], data[test_index] y_train, y_test = labels[train_index], labels[test_index] # normalize data scaler = preprocessing.StandardScaler().fit(X_train) X_train = scaler.transform(X_train) detector = None if classifier == 'logis': detector = logis(C=1e5, solver='lbfgs', multi_class='multinomial') elif classifier == 'svm': detector = svm.SVC() elif classifier == 'randomforest': detector = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0) detector.fit(X_train, y_train) X_test = scaler.transform(X_test) pr_labels = detector.predict(X_test) acc = sum(y_test == pr_labels) / y_test.shape[0] score = f1_score(y_test, pr_labels, average='binary')
def main(use_topic_feature=False, n_folds=20):
    """Cross-validate the tweet-level classifier and print per-fold scores.

    Tweet features are read from ``output/<dataset>_tweet_features.dat``
    (comma-separated, last column is the integer label).  For every fold a
    ``StandardScaler`` is fitted on the training part only, the estimator
    named by the module-level ``classifier`` is fitted, and the binary F1
    score and accuracy on the held-out part are printed.  The averages over
    all folds are printed at the end.

    Args:
        use_topic_feature: when true, the topic features returned by
            ``extract_topic_feature`` are appended to each post's tweet
            features (posts whose topic has no features get zeros).
            Defaults to the previously hard-coded "off".
        n_folds: number of shuffled folds.  Defaults to the previously
            hard-coded 20 (the old comment said 10).

    Raises:
        ValueError: if ``classifier`` is not one of ``'logis'``, ``'svm'``
            or ``'randomforest'``.
    """
    def _make_detector(name):
        # Map the configured classifier name to a fresh, unfitted estimator.
        if name == 'logis':
            return logis(C=1e5, solver='liblinear', multi_class='ovr')
        if name == 'svm':
            return svm.SVC()
        if name == 'randomforest':
            # min_samples_split=1 is rejected by current scikit-learn; 2 is
            # the smallest meaningful value.
            return ExtraTreesClassifier(n_estimators=100, max_depth=None,
                                        min_samples_split=2, random_state=0)
        raise ValueError('unknown classifier: %r' % (name,))

    # load tweet features (ndmin=2 keeps a single-row file two-dimensional)
    data = np.loadtxt('output/' + dataset + '_tweet_features.dat',
                      delimiter=',', ndmin=2)
    labels = np.array(data[:, -1], dtype=int)
    data = data[:, :-1]

    if use_topic_feature:
        # load effective posts (one post id per feature row)
        with open('output/' + dataset + '_eff_posts.dat') as f:
            eff_posts = [line.strip('\n\t ') for line in f]

        # load posts dict
        with open('output/' + dataset + '_posts_dict.pickle', 'rb') as handle:
            posts_dict = pickle.load(handle)

        # concatenate features together
        post_topics = [posts_dict[p][0].strip('\n\t ') for p in eff_posts]
        mul_list = list(dict.fromkeys(post_topics))  # unique, order kept
        topic_features, eff_topics = extract_topic_feature(mul_list)

        # topic id -> row of its first occurrence (same result as list.index)
        topic_row = {}
        for pos, topic in enumerate(eff_topics):
            topic_row.setdefault(topic, pos)

        n_tweet = data.shape[1]
        concat_data = np.zeros(
            (data.shape[0], n_tweet + topic_features.shape[1]))
        concat_data[:, :n_tweet] = data
        for ind, mul_id in enumerate(post_topics):
            row = topic_row.get(mul_id)
            if row is not None:
                concat_data[ind, n_tweet:] = topic_features[row]
    else:
        concat_data = data

    # shuffled k-fold cross-validation
    # NOTE(review): this is the legacy ``KFold(n, n_folds=...)`` call
    # signature with direct iteration -- confirm which KFold is imported
    # before upgrading scikit-learn.
    kf = KFold(data.shape[0], n_folds=n_folds, shuffle=True)
    avg_score = 0
    avg_acc = 0
    for count, (train_index, test_index) in enumerate(kf, 1):
        X_train, X_test = concat_data[train_index], concat_data[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        # fit the scaler on the training part only to avoid leakage
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        detector = _make_detector(classifier)
        detector.fit(X_train, y_train)
        pr_labels = detector.predict(X_test)

        score = f1_score(y_test, pr_labels, average='binary')
        avg_score += score
        acc = np.sum(y_test == pr_labels) / X_test.shape[0]
        avg_acc += acc

        print('\nLoop ', count, '\n')
        print('F1 score: ', score * 100, '%')
        # acc is a fraction; scale it so the printed '%' is truthful
        print('Acc: ', acc * 100, '%')

    print('Average F1 score: ', avg_score * 100 / n_folds, '%')
    print('Average accuracy: ', avg_acc * 100 / n_folds, '%')