/
jaccard_ngrams_word2vec_combined.py
132 lines (105 loc) · 4.82 KB
/
jaccard_ngrams_word2vec_combined.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# -*- coding: utf-8 -*-
from __future__ import print_function

from csv import DictReader
import os
import pdb
import string
import sys
import time

import nltk
from gensim.models import word2vec
from nltk.classify import NaiveBayesClassifier

from libs.dataset import DataSet
from libs.gen_jaccard_sims import JaccardGenerator
from libs.gen_ngrams import NgramsGenerator
from libs.gen_wordvectors_text8 import WordVector
from libs.generate_test_splits import generate_hold_out_split, kfold_split, get_stances_for_folds
from libs.score import score_submission
class StanceClassifier:
    """K-fold validation of a Naive Bayes stance classifier.

    Features per (headline, body) pair combine Jaccard similarities,
    common n-gram counts, and word2vec-based vectors (model trained on
    the text8 corpus). The best-scoring fold's classifier is then
    evaluated on the hold-out split.
    """

    def __init__(self):
        self._labeled_feature_set = []
        self._test_feature_set = []
        self.dataset = DataSet()
        # Length of the n-grams used by NgramsGenerator (bigrams).
        self._ngram_len = 2

    def _gen_features(self, bodies, stances, model):
        """Build one feature dict per stance for the given body ids.

        Returns a list parallel to `stances`, each entry holding the
        Jaccard average/max similarity, common n-gram feature, and
        word-vector feature for that stance.
        """
        avg_sims, max_sims = JaccardGenerator().gen_jaccard_sims(
            self.dataset, bodies, stances)
        common_ngrams = NgramsGenerator().gen_common_ngrams(
            self.dataset, bodies, stances, self._ngram_len)
        wordvectors = WordVector().gen_wordvectors(
            self.dataset, bodies, stances, model)
        return [{'avg_sims': avg_sims[i],
                 'max_sims': max_sims[i],
                 'common_ngrams': common_ngrams[i],
                 'word_vectors': wordvectors[i]}
                for i in range(len(stances))]

    def do_validation(self):
        """Run 10-fold cross-validation, then score the best fold's
        classifier on the hold-out split."""
        # Each fold is a list of body ids.
        folds, hold_out = kfold_split(self.dataset, n_folds=10)
        # fold_stances: dict keyed by fold number (0-9);
        # hold_out_stances: flat list of stances.
        fold_stances, hold_out_stances = get_stances_for_folds(
            self.dataset, folds, hold_out)

        # text8 corpus: https://cs.fit.edu/~mmahoney/compression/textdata.html
        sentences = word2vec.Text8Corpus('text8')
        model = word2vec.Word2Vec(sentences, size=200)

        labeled_feat_dict = {}
        print("Generating features for each fold")
        for fold_id in fold_stances:
            print("Generating features for fold ", fold_id)
            stances = fold_stances[fold_id]
            features = self._gen_features(folds[fold_id], stances, model)
            labeled_feat_dict[fold_id] = [
                (features[i], self._process_stance(stances[i]['Stance']))
                for i in range(len(stances))]

        print("Generating features for hold out fold")
        h_unlabeled_features = self._gen_features(
            hold_out, hold_out_stances, model)
        h_labels = [self._process_stance(s['Stance'])
                    for s in hold_out_stances]

        best_fold_accuracy = 0.0
        best_fold_cls = None  # guard against no fold ever winning (all 0.0)
        classifiers = []
        print("Validating using each fold as testing set")
        for fold_id in fold_stances:
            # Train on every fold except `fold_id`, which is the test set.
            training_set = [feat
                            for fid in fold_stances if fid != fold_id
                            for feat in labeled_feat_dict[fid]]
            testing_set = [feat for feat, _ in labeled_feat_dict[fold_id]]
            testing_labels = [label for _, label in labeled_feat_dict[fold_id]]

            classifier = NaiveBayesClassifier.train(training_set)
            classifiers.append(classifier)
            pred = classifier.classify_many(testing_set)
            accuracy = self._score(pred, testing_labels)
            print("Fold ", fold_id, "accuracy: ", accuracy)
            if best_fold_cls is None or accuracy > best_fold_accuracy:
                best_fold_accuracy = accuracy
                best_fold_cls = classifier

        h_res = best_fold_cls.classify_many(h_unlabeled_features)
        print('holdout score:', self._score(h_res, h_labels))

    def _score(self, predicted, actual):
        """Return the fraction of `predicted` labels equal to `actual`.

        Returns 0.0 for empty input instead of raising ZeroDivisionError.
        """
        if not predicted:
            return 0.0
        num_correct = sum(1 for p, a in zip(predicted, actual) if p == a)
        return num_correct / float(len(predicted))

    def _process_stance(self, stance):
        """Identity hook for mapping raw stance labels; kept as an
        extension point (e.g. collapsing labels into fewer classes)."""
        return stance
if __name__ == "__main__":
    # Script entry point: run the full k-fold validation pipeline.
    classifier = StanceClassifier()
    classifier.do_validation()