Ejemplo n.º 1
0
def train():
    """Train the subtask image classifier and pickle it with its scaler.

    Labels are read from ``devset_subtask/data.txt`` (tab separated, one
    header line; image id in column 0, label in column 2).  ``non-tampered``
    maps to ``1`` and every other label to ``-1``.  The labels are aligned
    with the image ids listed in ``output/devset_subtask_eff_images.dat``
    and paired row by row with the features loaded from
    ``output/devset_subtask_forensic_features.dat``.

    The fitted ``StandardScaler`` is written to
    ``output/RUN_1_subtask_scaler.pickle`` and the fitted classifier to
    ``output/RUN_1_subtask_classifier.pickle``.

    Raises:
        ValueError: if the module-level ``classifier`` is not one of
            ``'logis'``, ``'svm'`` or ``'randomforest'``.
        KeyError: if a listed image id has no entry in the label file.
    """
    training_file_detail = 'devset_subtask/data.txt'
    img_dict = {}

    with open(training_file_detail, 'r') as f:
        f.readline()  # skip the header line
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            parts = line.split('\t')
            im_id = parts[0].strip('\n\t ').replace('.jpg', '')
            label = parts[2].strip('\n\t ')
            img_dict[im_id] = 1 if label == 'non-tampered' else -1

    with open('output/devset_subtask_eff_images.dat') as f:
        forensic_eff_images = [im.strip('\n\t ') for im in f]

    # One label per feature row, in the order of the image list.
    labels = np.array([img_dict[im] for im in forensic_eff_images], dtype=int)

    data = np.loadtxt('output/devset_subtask_forensic_features.dat',
                      delimiter=',',
                      dtype=float)

    scaler = preprocessing.StandardScaler().fit(data)
    X_train = scaler.transform(data)
    y_train = labels

    with open('output/RUN_1_subtask_scaler.pickle', 'wb') as handle:
        pickle.dump(scaler, handle)

    if classifier == 'logis':
        detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier == 'svm':
        detector = svm.SVC()
    elif classifier == 'randomforest':
        # scikit-learn requires min_samples_split >= 2; 2 lets every node
        # holding more than one sample be split.
        detector = ExtraTreesClassifier(n_estimators=200,
                                        max_depth=None,
                                        min_samples_split=2,
                                        random_state=0)
    else:
        raise ValueError('unknown classifier: %r' % (classifier,))

    detector.fit(X_train, y_train)

    with open('output/RUN_1_subtask_classifier.pickle', 'wb') as handle:
        pickle.dump(detector, handle)
def train():
    """Fit and persist the scaler and classifier for the image subtask.

    Ground truth comes from ``devset_subtask/data.txt``: a tab-separated
    file with one header line, the image id in the first column and the
    label in the third.  ``non-tampered`` becomes ``1``; any other label
    becomes ``-1``.  Labels are ordered like the ids in
    ``output/devset_subtask_eff_images.dat`` and used together with the
    rows of ``output/devset_subtask_forensic_features.dat``.

    Outputs ``output/RUN_1_subtask_scaler.pickle`` and
    ``output/RUN_1_subtask_classifier.pickle``.

    Raises:
        ValueError: if the module-level ``classifier`` is not one of
            ``'logis'``, ``'svm'`` or ``'randomforest'``.
        KeyError: if a listed image id is missing from the label file.
    """
    training_file_detail = 'devset_subtask/data.txt'
    img_dict = {}

    with open(training_file_detail, 'r') as f:
        next(f, None)  # discard the header line
        for raw in f:
            if not raw.strip():
                # A blank line carries no label; skip it instead of failing.
                continue
            parts = raw.split('\t')
            im_id = parts[0].strip('\n\t ').replace('.jpg', '')
            label = parts[2].strip('\n\t ')
            img_dict[im_id] = 1 if label == 'non-tampered' else -1

    with open('output/devset_subtask_eff_images.dat') as f:
        forensic_eff_images = f.readlines()

    # Label vector aligned with the rows of the feature matrix.
    labels = np.zeros((len(forensic_eff_images),), dtype=int)
    for ind, im in enumerate(forensic_eff_images):
        labels[ind] = img_dict[im.strip('\n\t ')]

    data = np.loadtxt('output/devset_subtask_forensic_features.dat',
                      delimiter=',', dtype=float)

    scaler = preprocessing.StandardScaler().fit(data)
    X_train = scaler.transform(data)
    y_train = labels

    with open('output/RUN_1_subtask_scaler.pickle', 'wb') as handle:
        pickle.dump(scaler, handle)

    if classifier == 'logis':
        detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier == 'svm':
        detector = svm.SVC()
    elif classifier == 'randomforest':
        # scikit-learn requires min_samples_split >= 2; 2 lets every node
        # holding more than one sample be split.
        detector = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                        min_samples_split=2, random_state=0)
    else:
        raise ValueError('unknown classifier: %r' % (classifier,))

    detector.fit(X_train, y_train)

    with open('output/RUN_1_subtask_classifier.pickle', 'wb') as handle:
        pickle.dump(detector, handle)
Ejemplo n.º 3
0
def train():
    """Train the tweet-level classifier (run 1) and pickle it with its scaler.

    Tweet features are loaded from ``output/devset_tweet_features.dat``; the
    last column is the label.  Only rows whose post id (taken from
    ``output/devset_eff_posts.dat``, row-aligned with the features) appears
    in the real/fake training id lists are kept.

    Outputs ``output/RUN_1_classifier_1.pickle`` and
    ``output/RUN_1_scaler_1.pickle`` and prints the class counts.

    Raises:
        ValueError: if the module-level ``classifier1`` is not one of
            ``'logis'``, ``'svm'`` or ``'randomforest'``.
    """
    #
    # load tweet features
    #
    tweet_features = np.loadtxt('output/devset_tweet_features.dat', delimiter=',')
    tweet_labels = np.array(tweet_features[:, -1], dtype=int)
    tweet_features = tweet_features[:, :-1]

    # Restrict to the posts selected for training.  A set gives O(1)
    # membership tests instead of scanning a list for every post.
    training_posts = set(read_list('dataset_for_training/real_tweet_id.data'))
    training_posts.update(read_list('dataset_for_training/fake_tweet_id.data'))
    all_posts = read_list('output/devset_eff_posts.dat')
    used_ind = np.array([p in training_posts for p in all_posts], dtype=bool)

    tweet_features = tweet_features[used_ind, :]
    tweet_labels = tweet_labels[used_ind]

    #
    # training classifier 1
    #
    if classifier1 == 'logis':
        detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier1 == 'svm':
        detector = svm.SVC()
    elif classifier1 == 'randomforest':
        # scikit-learn requires min_samples_split >= 2; 2 lets every node
        # holding more than one sample be split.
        detector = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                        min_samples_split=2, random_state=0)
    else:
        raise ValueError('unknown classifier1: %r' % (classifier1,))

    scaler_1 = preprocessing.StandardScaler().fit(tweet_features)
    tweet_features = scaler_1.transform(tweet_features)
    detector.fit(tweet_features, tweet_labels)
    with open('output/RUN_1_classifier_1.pickle', 'wb') as handle:
        pickle.dump(detector, handle)
    with open('output/RUN_1_scaler_1.pickle', 'wb') as handle:
        pickle.dump(scaler_1, handle)

    print('Training statistics\n')
    print('Number of real tweets: ', sum(tweet_labels == 1))
    print('Number of fake tweets: ', sum(tweet_labels == -1))
Ejemplo n.º 4
0
def train():
    """Train the tweet-level and topic-level classifiers for run 2.

    Stage 1 fits a classifier on the tweet features in
    ``output/devset_tweet_features.dat`` (label in the last column),
    restricted to the posts named in the real/fake training id lists.

    Stage 2 builds one row per multimedia id from the real/fake image id
    lists by concatenating its forensic and textual feature vectors (zeros
    where one of the two is missing), drops ids that have neither, and fits
    a second classifier.  Ids from the fake list are labelled ``-1``, all
    others ``1``.

    Both classifiers and their scalers are pickled under
    ``output/RUN_2_*.pickle`` and the class counts are printed.

    Raises:
        ValueError: if ``classifier1`` or ``classifier2`` (module level) is
            not one of ``'logis'``, ``'svm'`` or ``'randomforest'``.
    """
    #
    # load tweet features
    #
    tweet_features = np.loadtxt('output/devset_tweet_features.dat', delimiter=',')
    tweet_labels = np.array(tweet_features[:, -1], dtype=int)
    tweet_features = tweet_features[:, :-1]

    # Restrict to the posts selected for training.  A set gives O(1)
    # membership tests instead of scanning a list for every post.
    training_posts = set(read_list('dataset_for_training/real_tweet_id.data'))
    training_posts.update(read_list('dataset_for_training/fake_tweet_id.data'))
    all_posts = read_list('output/devset_eff_posts.dat')
    used_ind = np.array([p in training_posts for p in all_posts], dtype=bool)

    tweet_features = tweet_features[used_ind, :]
    tweet_labels = tweet_labels[used_ind]

    #
    # training classifier 1
    #
    if classifier1 == 'logis':
        detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier1 == 'svm':
        detector = svm.SVC()
    elif classifier1 == 'randomforest':
        # scikit-learn requires min_samples_split >= 2; 2 lets every node
        # holding more than one sample be split.
        detector = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                        min_samples_split=2, random_state=0)
    else:
        raise ValueError('unknown classifier1: %r' % (classifier1,))

    scaler_1 = preprocessing.StandardScaler().fit(tweet_features)
    tweet_features = scaler_1.transform(tweet_features)
    detector.fit(tweet_features, tweet_labels)
    with open('output/RUN_2_classifier_1.pickle', 'wb') as handle:
        pickle.dump(detector, handle)
    with open('output/RUN_2_scaler_1.pickle', 'wb') as handle:
        pickle.dump(scaler_1, handle)

    #
    # load textual and forensic features
    #
    forensic_features = np.loadtxt('output/devset_forensic_features.dat',
                                   delimiter=',', dtype=float)
    eff_forensic_topics = read_list('output/devset_eff_forensic_topics.dat')
    textual_features = np.loadtxt('output/devset_textual_features.dat',
                                  delimiter=',', dtype=float)
    eff_textual_topics = read_list('output/devset_eff_textual_topics.dat')

    real_mul_list = read_list('dataset_for_training/real_image_id.data')
    fake_mul_list = read_list('dataset_for_training/fake_image_id.data')
    mul_list = list(real_mul_list)
    mul_list.extend(fake_mul_list)

    # Topic id -> row of its first occurrence (same row list.index() would
    # return), so each lookup is O(1).
    forensic_row = {}
    for row, topic in enumerate(eff_forensic_topics):
        forensic_row.setdefault(topic, row)
    textual_row = {}
    for row, topic in enumerate(eff_textual_topics):
        textual_row.setdefault(topic, row)
    fake_ids = set(fake_mul_list)

    n_forensic = forensic_features.shape[1]
    topic_features = np.zeros((len(mul_list),
                               n_forensic + textual_features.shape[1]),
                              dtype=float)
    topic_labels = np.zeros((len(mul_list),), dtype=int)
    used_ind = np.ones((len(mul_list),), dtype=bool)
    for ind, m in enumerate(mul_list):
        ind1 = forensic_row.get(m)
        ind2 = textual_row.get(m)
        if ind1 is not None:
            topic_features[ind, :n_forensic] = forensic_features[ind1]
        if ind2 is not None:
            topic_features[ind, n_forensic:] = textual_features[ind2]
        if ind1 is None and ind2 is None:
            # No feature of either kind: exclude this id from training.
            used_ind[ind] = False

        topic_labels[ind] = -1 if m in fake_ids else 1

    # remove unused topic features
    topic_features = topic_features[used_ind, :]
    topic_labels = topic_labels[used_ind]

    if classifier2 == 'logis':
        detector_2 = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier2 == 'svm':
        detector_2 = svm.SVC()
    elif classifier2 == 'randomforest':
        detector_2 = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                          min_samples_split=2, random_state=0)
    else:
        raise ValueError('unknown classifier2: %r' % (classifier2,))

    scaler_2 = preprocessing.StandardScaler().fit(topic_features)
    topic_features = scaler_2.transform(topic_features)
    detector_2.fit(topic_features, topic_labels)
    with open('output/RUN_2_classifier_2.pickle', 'wb') as handle:
        pickle.dump(detector_2, handle)
    with open('output/RUN_2_scaler_2.pickle', 'wb') as handle:
        pickle.dump(scaler_2, handle)

    print('Training statistics\n')
    print('Number of real tweets: ', sum(tweet_labels == 1))
    print('Number of fake tweets: ', sum(tweet_labels == -1))
    print('Number of real topics: ', sum(topic_labels == 1))
    print('Number of fake topics: ', sum(topic_labels == -1))
Ejemplo n.º 5
0
# The last column of `data` is the label; the rest are features.
labels = np.array(data[:, -1], dtype=int)
data = data[:, :-1]

# cross-validation 10 fold
# NOTE(review): KFold is called with the sample count as first argument and
# iterated directly -- confirm which KFold implementation the file imports.
n_folds = 10
kf = KFold(data.shape[0], n_folds=n_folds, shuffle=True)
avg_score = 0
avg_acc = 0

for train_index, test_index in kf:
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # A fresh, unfitted model for every fold.
    if classifier == 'logis':
        detector = logis(C=1e5, solver='lbfgs', multi_class='ovr')
    elif classifier == 'svm':
        detector = svm.SVC()
    elif classifier == 'randomforest':
        # scikit-learn requires min_samples_split >= 2; 2 lets every node
        # holding more than one sample be split.
        detector = ExtraTreesClassifier(n_estimators=100,
                                        max_depth=None,
                                        min_samples_split=2,
                                        random_state=0)
    else:
        raise ValueError('unknown classifier: %r' % (classifier,))

    # Fit the scaler on the training fold only, then reuse it for the test
    # fold so no test statistics leak into training.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    detector.fit(X_train, y_train)

    X_test = scaler.transform(X_test)
    pr_labels = detector.predict(X_test)
Ejemplo n.º 6
0
def train():
    """Train the tweet-level and topic-level classifiers for run 3.

    Stage 1 fits a classifier on the tweet features in
    ``output/devset_tweet_features.dat`` (label in the last column),
    restricted to the posts named in the real/fake training id lists.

    Stage 2 builds one row per multimedia id from the real/fake image id
    lists by concatenating its forensic and textual feature vectors (zeros
    where one of the two is missing), drops ids that have neither, and fits
    a second classifier.  Ids from the fake list are labelled ``-1``, all
    others ``1``.

    Both classifiers and their scalers are pickled under
    ``output/RUN_3_*.pickle`` and the class counts are printed.

    Raises:
        ValueError: if ``classifier1`` or ``classifier2`` (module level) is
            not one of ``'logis'``, ``'svm'`` or ``'randomforest'``.
    """
    #
    # load tweet features
    #
    tweet_features = np.loadtxt('output/devset_tweet_features.dat', delimiter=',')
    tweet_labels = np.array(tweet_features[:, -1], dtype=int)
    tweet_features = tweet_features[:, :-1]

    # Restrict to the posts selected for training.  A set gives O(1)
    # membership tests instead of scanning a list for every post.
    training_posts = set(read_list('dataset_for_training/real_tweet_id.data'))
    training_posts.update(read_list('dataset_for_training/fake_tweet_id.data'))
    all_posts = read_list('output/devset_eff_posts.dat')
    used_ind = np.array([p in training_posts for p in all_posts], dtype=bool)

    tweet_features = tweet_features[used_ind, :]
    tweet_labels = tweet_labels[used_ind]

    #
    # training classifier 1
    #
    if classifier1 == 'logis':
        detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier1 == 'svm':
        detector = svm.SVC()
    elif classifier1 == 'randomforest':
        # scikit-learn requires min_samples_split >= 2; 2 lets every node
        # holding more than one sample be split.
        detector = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                        min_samples_split=2, random_state=0)
    else:
        raise ValueError('unknown classifier1: %r' % (classifier1,))

    scaler_1 = preprocessing.StandardScaler().fit(tweet_features)
    tweet_features = scaler_1.transform(tweet_features)
    detector.fit(tweet_features, tweet_labels)
    with open('output/RUN_3_classifier_1.pickle', 'wb') as handle:
        pickle.dump(detector, handle)
    with open('output/RUN_3_scaler_1.pickle', 'wb') as handle:
        pickle.dump(scaler_1, handle)

    #
    # load textual and forensic features
    #
    forensic_features = np.loadtxt('output/devset_forensic_features.dat',
                                   delimiter=',', dtype=float)
    eff_forensic_topics = read_list('output/devset_eff_forensic_topics.dat')
    textual_features = np.loadtxt('output/devset_textual_features.dat',
                                  delimiter=',', dtype=float)
    eff_textual_topics = read_list('output/devset_eff_textual_topics.dat')

    real_mul_list = read_list('dataset_for_training/real_image_id.data')
    fake_mul_list = read_list('dataset_for_training/fake_image_id.data')
    mul_list = list(real_mul_list)
    mul_list.extend(fake_mul_list)

    # Topic id -> row of its first occurrence (same row list.index() would
    # return), so each lookup is O(1).
    forensic_row = {}
    for row, topic in enumerate(eff_forensic_topics):
        forensic_row.setdefault(topic, row)
    textual_row = {}
    for row, topic in enumerate(eff_textual_topics):
        textual_row.setdefault(topic, row)
    fake_ids = set(fake_mul_list)

    n_forensic = forensic_features.shape[1]
    topic_features = np.zeros((len(mul_list),
                               n_forensic + textual_features.shape[1]),
                              dtype=float)
    topic_labels = np.zeros((len(mul_list),), dtype=int)
    used_ind = np.ones((len(mul_list),), dtype=bool)
    for ind, m in enumerate(mul_list):
        ind1 = forensic_row.get(m)
        ind2 = textual_row.get(m)
        if ind1 is not None:
            topic_features[ind, :n_forensic] = forensic_features[ind1]
        if ind2 is not None:
            topic_features[ind, n_forensic:] = textual_features[ind2]
        if ind1 is None and ind2 is None:
            # No feature of either kind: exclude this id from training.
            used_ind[ind] = False

        topic_labels[ind] = -1 if m in fake_ids else 1

    # remove unused topic features
    topic_features = topic_features[used_ind, :]
    topic_labels = topic_labels[used_ind]

    if classifier2 == 'logis':
        detector_2 = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier2 == 'svm':
        detector_2 = svm.SVC()
    elif classifier2 == 'randomforest':
        detector_2 = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                          min_samples_split=2, random_state=0)
    else:
        raise ValueError('unknown classifier2: %r' % (classifier2,))

    scaler_2 = preprocessing.StandardScaler().fit(topic_features)
    topic_features = scaler_2.transform(topic_features)
    detector_2.fit(topic_features, topic_labels)
    with open('output/RUN_3_classifier_2.pickle', 'wb') as handle:
        pickle.dump(detector_2, handle)
    with open('output/RUN_3_scaler_2.pickle', 'wb') as handle:
        pickle.dump(scaler_2, handle)

    print('Training statistics\n')
    print('Number of real tweets: ', sum(tweet_labels == 1))
    print('Number of fake tweets: ', sum(tweet_labels == -1))
    print('Number of real topics: ', sum(topic_labels == 1))
    print('Number of fake topics: ', sum(topic_labels == -1))
Ejemplo n.º 7
0
# The last column of `data` is the label; the rest are features.
labels = np.array(data[:, -1], dtype=int)
data = data[:, :-1]

# cross-validation 10 fold
# NOTE(review): KFold is called with the sample count as first argument and
# iterated directly -- confirm which KFold implementation the file imports.
n_folds = 10
kf = KFold(data.shape[0], n_folds=n_folds, shuffle=True)
avg_score = 0
avg_acc = 0

for train_index, test_index in kf:
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # A fresh, unfitted model for every fold.
    if classifier == 'logis':
        detector = logis(C=1e5, solver='lbfgs', multi_class='ovr')
    elif classifier == 'svm':
        detector = svm.SVC()
    elif classifier == 'randomforest':
        # scikit-learn requires min_samples_split >= 2; 2 lets every node
        # holding more than one sample be split.
        detector = ExtraTreesClassifier(n_estimators=100, max_depth=None,
                                        min_samples_split=2, random_state=0)
    else:
        raise ValueError('unknown classifier: %r' % (classifier,))

    # Scaler statistics come from the training fold only.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    detector.fit(X_train, y_train)

    X_test = scaler.transform(X_test)
    pr_labels = detector.predict(X_test)

    # Fraction (0..1) of correctly predicted test samples in this fold.
    acc = sum(y_test == pr_labels) / y_test.shape[0]
Ejemplo n.º 8
0
def main():
    """Evaluate tweet-level detection alone and fused with topic-level scores.

    Steps:
      1. ``selectively_split_data()`` is called, then the train/test tweet
         features are loaded from ``output/training_data.dat`` and
         ``output/testing_data.dat`` (label in the last column, ``-1``
         remapped to ``0``).
      2. The classifier named by the module-level ``classifier1`` is fitted
         on the standardized tweet features; F1 and accuracy are printed.
      3. The classifier named by ``classifier2`` is fitted on the topic
         features returned by ``extract_topic_feature_train()`` (label in
         the last column, ``-1`` remapped to ``0``).
      4. For every test post whose multimedia id has topic features, the
         final probability is ``0.8 * topic + 0.2 * tweet``; otherwise the
         tweet-level probability is used.  F1 and accuracy are printed.

    Raises:
        ValueError: if ``classifier1`` or ``classifier2`` is not one of
            ``'logis'``, ``'svm'`` or ``'randomforest'``.
    """
    selectively_split_data()

    # load tweet features
    X_train = np.loadtxt('output/training_data.dat', dtype=float)
    y_train = np.array(X_train[:, -1], dtype=int)
    y_train[y_train == -1] = 0
    X_train = X_train[:, :-1]

    print('number of real training samples: ', sum(y_train == 1), '/',
          X_train.shape[0])

    X_test = np.loadtxt('output/testing_data.dat', dtype=float)
    y_test = np.array(X_test[:, -1], dtype=int)
    y_test[y_test == -1] = 0
    X_test = X_test[:, :-1]

    if classifier1 == 'logis':
        detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier1 == 'svm':
        # predict_proba() is used below; SVC only provides it when
        # probability estimates are enabled at construction time.
        detector = svm.SVC(probability=True)
    elif classifier1 == 'randomforest':
        # scikit-learn requires min_samples_split >= 2; 2 lets every node
        # holding more than one sample be split.
        detector = ExtraTreesClassifier(n_estimators=200,
                                        max_depth=None,
                                        min_samples_split=2,
                                        random_state=0)
    else:
        raise ValueError('unknown classifier1: %r' % (classifier1,))

    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    detector.fit(X_train_scaled, y_train)

    pr_labels = detector.predict(X_test_scaled)
    pr_proba = detector.predict_proba(X_test_scaled)

    score = f1_score(y_test, pr_labels, average='binary')
    acc = sum(y_test == pr_labels) * 100 / X_test_scaled.shape[0]

    print('number of real testing samples: ', sum(y_test == 1), '/',
          y_test.shape[0])

    print('\nwithout forensic and textual features: \n')

    print('F1 score: ', score * 100, '%')
    print('Acc: ', acc, '%')

    print('\nwith forensic and textual features: \n')

    if classifier2 == 'logis':
        topic_detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier2 == 'svm':
        topic_detector = svm.SVC(probability=True)
    elif classifier2 == 'randomforest':
        topic_detector = ExtraTreesClassifier(n_estimators=200,
                                              max_depth=None,
                                              min_samples_split=2,
                                              random_state=0)
    else:
        raise ValueError('unknown classifier2: %r' % (classifier2,))

    # NOTE(review): pickle.load must only be used on trusted files.
    with open('output/' + dataset + '_posts_dict.pickle', 'rb') as handle:
        posts_dict = pickle.load(handle)

    extract_multimedia_labels()

    topic_features_train, _ = extract_topic_feature_train()
    y_topic_train = np.array(topic_features_train[:, -1], dtype=int)
    # Remap the topic labels (not the tweet labels) so both classifiers use
    # the same 0/1 encoding and predict_proba columns line up.
    y_topic_train[y_topic_train == -1] = 0
    X_topic_train = topic_features_train[:, :-1]

    # A separate scaler for the topic feature space.
    scaler = preprocessing.StandardScaler().fit(X_topic_train)
    X_topic_train = scaler.transform(X_topic_train)

    topic_detector.fit(X_topic_train, y_topic_train)

    # refine results
    testing_posts = read_list('output/testing_posts.dat')

    # Unique multimedia ids of the test posts, in first-seen order.
    mul_list_test = list(dict.fromkeys(
        posts_dict[p][0].strip('\n\t ') for p in testing_posts))

    topic_features_test, eff_topics_test = extract_topic_feature_test(
        mul_list_test)

    X_topic_test = scaler.transform(topic_features_test)
    topic_pr_probas = topic_detector.predict_proba(X_topic_test)

    # Topic id -> row of its first occurrence (what list.index() returns).
    topic_row = {}
    for row, topic in enumerate(eff_topics_test):
        topic_row.setdefault(topic, row)

    new_results = np.zeros((len(testing_posts), ), dtype=int)
    for ind, p in enumerate(testing_posts):
        new_proba = pr_proba[ind, :]
        mul_id = posts_dict[p][0].strip('\n\t ')

        row = topic_row.get(mul_id)
        if row is not None:
            # Late fusion: weight the topic-level score more heavily.
            new_proba = 0.8 * topic_pr_probas[row, :] + 0.2 * new_proba

        new_results[ind] = new_proba[1] > new_proba[0]

    score = f1_score(y_test, new_results, average='binary')
    acc = sum(y_test == new_results) * 100 / y_test.shape[0]

    print('F1 score: ', score * 100)
    print('Acc: ', acc, '%')
def main():
    """Cross-validate the tweet-level classifier and print per-fold scores.

    Tweet features are loaded from ``output/<dataset>_tweet_features.dat``
    (label in the last column).  When ``use_topic_feature`` is enabled the
    topic features of each post's multimedia id are appended to its tweet
    features; it is currently switched off, so only tweet features are used.

    For each of the ``n_folds`` folds the scaler is fitted on the training
    part, the classifier named by the module-level ``classifier`` is
    trained, and F1 and accuracy (both in percent) are printed, followed by
    their averages.

    Raises:
        ValueError: if ``classifier`` is not one of ``'logis'``, ``'svm'``
            or ``'randomforest'``.
    """
    # load tweet features
    data = np.loadtxt('output/' + dataset + '_tweet_features.dat', delimiter=',')
    labels = np.array(data[:, -1], dtype=int)
    data = data[:, :-1]

    # load effective posts
    with open('output/' + dataset + '_eff_posts.dat') as f:
        eff_posts = [line.strip('\n\t ') for line in f]

    use_topic_feature = 0

    if use_topic_feature == 1:
        # load posts dict
        # NOTE(review): pickle.load must only be used on trusted files.
        with open('output/' + dataset + '_posts_dict.pickle', 'rb') as handle:
            posts_dict = pickle.load(handle)

        # Unique multimedia ids of the posts, in first-seen order.
        post_mul_ids = [posts_dict[p][0].strip('\n\t ') for p in eff_posts]
        mul_list = list(dict.fromkeys(post_mul_ids))

        topic_features, eff_topics = extract_topic_feature(mul_list)

        # Topic id -> row of its first occurrence (what list.index() returns).
        topic_row = {}
        for row, topic in enumerate(eff_topics):
            topic_row.setdefault(topic, row)

        # concatenate features together; posts without topic features keep
        # zeros in the topic columns
        n_tweet = data.shape[1]
        concat_data = np.zeros((data.shape[0], n_tweet + topic_features.shape[1]))
        for ind, mul_id in enumerate(post_mul_ids):
            concat_data[ind, :n_tweet] = data[ind, :]
            row = topic_row.get(mul_id)
            if row is not None:
                concat_data[ind, n_tweet:] = topic_features[row]
    else:
        concat_data = data

    # cross-validation, n_folds folds
    # NOTE(review): KFold is called with the sample count as first argument
    # and iterated directly -- confirm which KFold the file imports.
    n_folds = 20
    kf = KFold(data.shape[0], n_folds=n_folds, shuffle=True)
    avg_score = 0
    avg_acc = 0

    for count, (train_index, test_index) in enumerate(kf, start=1):
        X_train, X_test = concat_data[train_index], concat_data[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        # Scaler statistics come from the training fold only.
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        if classifier == 'logis':
            detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
        elif classifier == 'svm':
            detector = svm.SVC()
        elif classifier == 'randomforest':
            # scikit-learn requires min_samples_split >= 2; 2 lets every
            # node holding more than one sample be split.
            detector = ExtraTreesClassifier(n_estimators=100, max_depth=None,
                                            min_samples_split=2, random_state=0)
        else:
            raise ValueError('unknown classifier: %r' % (classifier,))

        detector.fit(X_train, y_train)

        pr_labels = detector.predict(X_test)

        score = f1_score(y_test, pr_labels, average='binary')
        avg_score += score

        # Fraction in [0, 1]; scaled to percent only when printed.
        acc = sum(y_test == pr_labels) / X_test.shape[0]
        avg_acc += acc

        print('\nLoop ', count, '\n')
        print('F1 score: ', score*100, '%')
        print('Acc: ', acc * 100, '%')

    print('Average F1 score: ', avg_score * 100 / n_folds, '%')
    print('Average accuracy: ', avg_acc * 100 / n_folds, '%')
Ejemplo n.º 10
0
def main():
    """Score the tweet classifier, then re-score after topic-level fusion.

    The function:
      * calls ``selectively_split_data()`` and loads the train/test tweet
        features from ``output/training_data.dat`` and
        ``output/testing_data.dat`` (last column is the label; ``-1`` is
        remapped to ``0``);
      * fits the classifier selected by the module-level ``classifier1`` on
        standardized tweet features and prints F1 / accuracy;
      * fits the classifier selected by ``classifier2`` on the output of
        ``extract_topic_feature_train()`` (last column is the label; ``-1``
        is remapped to ``0``);
      * combines both probability outputs per test post as
        ``0.8 * topic + 0.2 * tweet`` when the post's multimedia id has
        topic features, and prints F1 / accuracy of the fused decision.

    Raises:
        ValueError: if ``classifier1`` or ``classifier2`` is not one of
            ``'logis'``, ``'svm'`` or ``'randomforest'``.
    """
    selectively_split_data()

    # load tweet features
    X_train = np.loadtxt('output/training_data.dat', dtype=float)
    y_train = np.array(X_train[:, -1], dtype=int)
    y_train[y_train == -1] = 0
    X_train = X_train[:, :-1]

    print('number of real training samples: ', sum(y_train == 1), '/', X_train.shape[0])

    X_test = np.loadtxt('output/testing_data.dat', dtype=float)
    y_test = np.array(X_test[:, -1], dtype=int)
    y_test[y_test == -1] = 0
    X_test = X_test[:, :-1]

    if classifier1 == 'logis':
        detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier1 == 'svm':
        # predict_proba() is used below; SVC only provides it when
        # probability estimates are enabled at construction time.
        detector = svm.SVC(probability=True)
    elif classifier1 == 'randomforest':
        # scikit-learn requires min_samples_split >= 2; 2 lets every node
        # holding more than one sample be split.
        detector = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                        min_samples_split=2, random_state=0)
    else:
        raise ValueError('unknown classifier1: %r' % (classifier1,))

    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    detector.fit(X_train_scaled, y_train)

    pr_labels = detector.predict(X_test_scaled)
    pr_proba = detector.predict_proba(X_test_scaled)

    score = f1_score(y_test, pr_labels, average='binary')
    acc = sum(y_test == pr_labels) * 100 / X_test_scaled.shape[0]

    print('number of real testing samples: ', sum(y_test == 1), '/', y_test.shape[0])

    print('\nwithout forensic and textual features: \n')

    print('F1 score: ', score*100, '%')
    print('Acc: ', acc, '%')

    print('\nwith forensic and textual features: \n')

    if classifier2 == 'logis':
        topic_detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
    elif classifier2 == 'svm':
        topic_detector = svm.SVC(probability=True)
    elif classifier2 == 'randomforest':
        topic_detector = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                              min_samples_split=2, random_state=0)
    else:
        raise ValueError('unknown classifier2: %r' % (classifier2,))

    # NOTE(review): pickle.load must only be used on trusted files.
    with open('output/' + dataset + '_posts_dict.pickle', 'rb') as handle:
        posts_dict = pickle.load(handle)

    extract_multimedia_labels()

    topic_features_train, _ = extract_topic_feature_train()
    y_topic_train = np.array(topic_features_train[:, -1], dtype=int)
    # Remap the topic labels (not the tweet labels) so both classifiers use
    # the same 0/1 encoding and predict_proba columns line up.
    y_topic_train[y_topic_train == -1] = 0
    X_topic_train = topic_features_train[:, :-1]

    # A separate scaler for the topic feature space.
    scaler = preprocessing.StandardScaler().fit(X_topic_train)
    X_topic_train = scaler.transform(X_topic_train)

    topic_detector.fit(X_topic_train, y_topic_train)

    # refine results
    testing_posts = read_list('output/testing_posts.dat')

    # Unique multimedia ids of the test posts, in first-seen order.
    mul_list_test = list(dict.fromkeys(
        posts_dict[p][0].strip('\n\t ') for p in testing_posts))

    topic_features_test, eff_topics_test = extract_topic_feature_test(mul_list_test)

    X_topic_test = scaler.transform(topic_features_test)
    topic_pr_probas = topic_detector.predict_proba(X_topic_test)

    # Topic id -> row of its first occurrence (what list.index() returns).
    topic_row = {}
    for row, topic in enumerate(eff_topics_test):
        topic_row.setdefault(topic, row)

    new_results = np.zeros((len(testing_posts),), dtype=int)
    for ind, p in enumerate(testing_posts):
        new_proba = pr_proba[ind, :]
        mul_id = posts_dict[p][0].strip('\n\t ')

        row = topic_row.get(mul_id)
        if row is not None:
            # Late fusion: weight the topic-level score more heavily.
            new_proba = 0.8*topic_pr_probas[row, :] + 0.2*new_proba

        new_results[ind] = new_proba[1] > new_proba[0]

    score = f1_score(y_test, new_results, average='binary')
    acc = sum(y_test == new_results) * 100 / y_test.shape[0]

    print('F1 score: ', score*100)
    print('Acc: ', acc, '%')
Ejemplo n.º 11
0
# cross-validation 10 fold
# NOTE(review): KFold is called with the sample count as first argument and
# iterated directly -- confirm which KFold implementation the file imports.
n_folds = 10
kf = KFold(data.shape[0], n_folds=n_folds, shuffle=True)
avg_score = 0
avg_acc = 0
for train_index, test_index in kf:
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # normalize data (scaler statistics come from the training fold only)
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    # A fresh, unfitted model for every fold.
    if classifier == 'logis':
        detector = logis(C=1e5, solver='lbfgs', multi_class='multinomial')
    elif classifier == 'svm':
        detector = svm.SVC()
    elif classifier == 'randomforest':
        # scikit-learn requires min_samples_split >= 2; 2 lets every node
        # holding more than one sample be split.
        detector = ExtraTreesClassifier(n_estimators=100, max_depth=None,
                                        min_samples_split=2, random_state=0)
    else:
        raise ValueError('unknown classifier: %r' % (classifier,))

    detector.fit(X_train, y_train)

    X_test = scaler.transform(X_test)
    pr_labels = detector.predict(X_test)

    # Accuracy is a fraction in [0, 1]; F1 treats label 1 as positive
    # (f1_score's default pos_label).
    acc = sum(y_test == pr_labels) / y_test.shape[0]
    score = f1_score(y_test, pr_labels, average='binary')

    avg_acc += acc
Ejemplo n.º 12
0
# cross-validation 10 fold
# NOTE(review): KFold is called with the sample count as first argument and
# iterated directly -- confirm which KFold implementation the file imports.
n_folds = 10
kf = KFold(data.shape[0], n_folds=n_folds, shuffle=True)
avg_score = 0
avg_acc = 0
for train_index, test_index in kf:
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # normalize data (scaler statistics come from the training fold only)
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    # A fresh, unfitted model for every fold.
    if classifier == 'logis':
        detector = logis(C=1e5, solver='lbfgs', multi_class='multinomial')
    elif classifier == 'svm':
        detector = svm.SVC()
    elif classifier == 'randomforest':
        # scikit-learn requires min_samples_split >= 2; 2 lets every node
        # holding more than one sample be split.
        detector = ExtraTreesClassifier(n_estimators=100,
                                        max_depth=None,
                                        min_samples_split=2,
                                        random_state=0)
    else:
        raise ValueError('unknown classifier: %r' % (classifier,))

    detector.fit(X_train, y_train)

    X_test = scaler.transform(X_test)
    pr_labels = detector.predict(X_test)

    # Accuracy is a fraction in [0, 1]; F1 treats label 1 as positive
    # (f1_score's default pos_label).
    acc = sum(y_test == pr_labels) / y_test.shape[0]
    score = f1_score(y_test, pr_labels, average='binary')
def main():
    """Cross-validate the tweet-feature classifier and print the scores.

    Loads ``output/<dataset>_tweet_features.dat`` (comma-separated; the last
    column is used as the integer label, the remaining columns as features)
    and reads ``output/<dataset>_eff_posts.dat``, then runs a shuffled
    20-fold cross-validation with the estimator selected by the module-level
    ``classifier`` name.  F1 score and accuracy are printed for each fold,
    followed by their averages over all folds.

    Raises:
        ValueError: if ``classifier`` is not ``'logis'``, ``'svm'`` or
            ``'randomforest'``.
    """
    # load tweet features; the last column holds the label
    data = np.loadtxt('output/' + dataset + '_tweet_features.dat',
                      delimiter=',')
    labels = np.array(data[:, -1], dtype=int)
    data = data[:, :-1]

    # load effective posts (one identifier per line)
    with open('output/' + dataset + '_eff_posts.dat') as f:
        eff_posts = [line.strip('\n\t ') for line in f]

    # Hard-coded switch: set to 1 to append topic features to each row.
    use_topic_feature = 0

    if use_topic_feature == 1:
        # load posts dict
        with open('output/' + dataset + '_posts_dict.pickle', 'rb') as handle:
            posts_dict = pickle.load(handle)

        # unique ids taken from the first field of each post's dict entry
        mul_list = list(dict.fromkeys(
            posts_dict[p][0].strip('\n\t ') for p in eff_posts))

        topic_features, eff_topics = extract_topic_feature(mul_list)

        # concatenate features together: original columns first, topic
        # columns after; rows without a topic entry keep zeros there
        concat_data = np.zeros(
            (data.shape[0], data.shape[1] + topic_features.shape[1]))

        for ind, p in enumerate(eff_posts):
            mul_id = posts_dict[p][0].strip('\n\t ')
            concat_data[ind, :data.shape[1]] = data[ind, :]
            if mul_id in eff_topics:
                concat_data[ind, data.shape[1]:] = topic_features[
                    eff_topics.index(mul_id)]
    else:
        concat_data = data

    # cross-validation 20 folds
    # NOTE(review): KFold is called with the sample count and an n_folds
    # keyword, which looks like the legacy sklearn.cross_validation
    # signature -- confirm against the installed scikit-learn version.
    n_folds = 20
    kf = KFold(data.shape[0], n_folds=n_folds, shuffle=True)
    # Running sums over the folds; divided by n_folds when printed.
    avg_score = 0
    avg_acc = 0

    for fold_no, (train_index, test_index) in enumerate(kf, start=1):
        X_train, X_test = concat_data[train_index], concat_data[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        # Fit the scaler on the training fold only, then apply it to both.
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        if classifier == 'logis':
            detector = logis(C=1e5, solver='liblinear', multi_class='ovr')
        elif classifier == 'svm':
            detector = svm.SVC()
        elif classifier == 'randomforest':
            # min_samples_split must be >= 2: recent scikit-learn releases
            # reject the value 1, and a single-sample node cannot be split.
            detector = ExtraTreesClassifier(n_estimators=100,
                                            max_depth=None,
                                            min_samples_split=2,
                                            random_state=0)
        else:
            # Fail with a clear message instead of calling .fit on None.
            raise ValueError('Unknown classifier: %r' % (classifier,))

        detector.fit(X_train, y_train)

        pr_labels = detector.predict(X_test)

        score = f1_score(y_test, pr_labels, average='binary')
        avg_score += score

        # Fraction of correctly predicted samples in the held-out fold.
        acc = np.mean(y_test == pr_labels)
        avg_acc += acc

        print('\nLoop ', fold_no, '\n')
        print('F1 score: ', score * 100, '%')
        # acc is a fraction; scale it so the '%' suffix is accurate and
        # matches the F1 and average lines.
        print('Acc: ', acc * 100, '%')

    print('Average F1 score: ', avg_score * 100 / n_folds, '%')
    print('Average accuracy: ', avg_acc * 100 / n_folds, '%')