#!/usr/bin/python3
# quasar_pipeline.py
import sys
import json
from sklearn.externals import joblib
import os
from datetime import datetime
from Retrieval import Retrieval
from Featurizer import Featurizer
from CountFeaturizer import CountFeaturizer
from TfidfFeaturizer import TfidfFeaturizer
from Classifier import Classifier
from MultinomialNaiveBayes import MultinomialNaiveBayes
from SVM import SVM
from MLP import MLP
from Evaluator import Evaluator
import pdb
import numpy as np
class Pipeline(object):
    """Run one question-answering experiment for a featurizer/classifier pair.

    Constructing an instance loads the train and validation JSON files and
    immediately runs the full experiment via ``question_answering``: snippet
    retrieval, feature extraction, classifier training, prediction on the
    validation set, evaluation, and writing a JSON results file.
    """

    # Upper bound on the number of training questions used for fitting.
    # (The original author noted the train file holds 31049 questions.)
    MAX_TRAIN_QUESTIONS = 30000

    def __init__(self, trainFilePath, valFilePath, retrievalInstance, featurizerInstance, classifierInstance, resultsPATH):
        """Load the datasets and run the experiment.

        Args:
            trainFilePath: Path to the training JSON file.
            valFilePath: Path to the validation JSON file.
            retrievalInstance: Object providing ``getLongSnippets(question)``
                and ``getShortSnippets(question)``.
            featurizerInstance: Object providing
                ``getFeatureRepresentation(X_train, X_val)``.
            classifierInstance: Object providing
                ``buildClassifier(X_features_train, Y_train)``.
            resultsPATH: Existing directory the results file is written to.
        """
        self.retrievalInstance = retrievalInstance
        self.featurizerInstance = featurizerInstance
        self.classifierInstance = classifierInstance
        self.trainData = self._load_json(trainFilePath)
        self.valData = self._load_json(valFilePath)
        self.PATH = resultsPATH
        self.question_answering()

    @staticmethod
    def _load_json(path):
        """Parse the JSON file at ``path``, closing it even if parsing fails."""
        # Read as UTF-8 explicitly so loading matches how results are written
        # and does not depend on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    def makeXY(self, dataQuestions):
        """Build parallel input/label lists from an iterable of questions.

        Args:
            dataQuestions: Iterable of question dicts; each must have a
                non-empty ``'answers'`` sequence.

        Returns:
            A tuple ``(X, Y)`` where ``X[i]`` is the short-snippet retrieval
            result for question ``i`` and ``Y[i]`` is its first answer.
        """
        X = []
        Y = []
        for question in dataQuestions:
            # NOTE(review): the long snippets are never used below. The call is
            # kept only in case getLongSnippets has side effects the short
            # retrieval relies on -- confirm and remove if it is pure.
            self.retrievalInstance.getLongSnippets(question)
            X.append(self.retrievalInstance.getShortSnippets(question))
            Y.append(question['answers'][0])
        return X, Y

    def question_answering(self):
        """Train on the training questions, evaluate on validation, save results.

        Prints progress and the accuracy / precision / recall / F-measure
        reported by ``Evaluator``, stores the fitted classifier on
        ``self.clf``, and writes a JSON summary into ``self.PATH``.
        """
        print('Loading data...')
        X_train, Y_train = self.makeXY(
            self.trainData['questions'][0:self.MAX_TRAIN_QUESTIONS])
        X_val, Y_val_true = self.makeXY(self.valData['questions'])

        # Featurization
        print('Feature Extraction...')
        X_features_train, X_features_val = self.featurizerInstance.getFeatureRepresentation(
            X_train, X_val)
        self.clf = self.classifierInstance.buildClassifier(X_features_train, Y_train)

        # Prediction
        print('Prediction...')
        Y_val_pred = self.clf.predict(X_features_val)

        # Evaluation
        self.evaluatorInstance = Evaluator()
        a = self.evaluatorInstance.getAccuracy(Y_val_true, Y_val_pred)
        p, r, f = self.evaluatorInstance.getPRF(Y_val_true, Y_val_pred)
        print("Accuracy: " + str(a))
        print("Precision: " + str(p))
        print("Recall: " + str(r))
        print("F-measure: " + str(f))

        # Save predictions in json
        results = {'feature': self.featurizerInstance.__class__.__name__,
                   'classifier': self.classifierInstance.__class__.__name__,
                   'training size': len(X_train),
                   'accuracy': a,
                   'precision': p,
                   'recall': r,
                   'F-measure': f,
                   # asarray makes this work whether predict() returned an
                   # ndarray or a plain sequence.
                   'predictions': np.asarray(Y_val_pred).tolist()}
        self._save_results(results)

    def _save_results(self, results):
        """Write ``results`` as JSON to ``<PATH>/<FeaturizerClass><ClassifierClass>``."""
        out_name = (self.featurizerInstance.__class__.__name__ +
                    self.classifierInstance.__class__.__name__)
        # The context manager guarantees the file is flushed and closed; the
        # previous code left the handle open.
        with open(os.path.join(self.PATH, out_name), 'w', encoding='utf-8') as out_file:
            json.dump(results, out_file, ensure_ascii=False)
if __name__ == '__main__':
    # Script entry point: run every featurizer x classifier combination on the
    # given train/validation files and write one results file per combination
    # into a fresh timestamped directory under ./Results.

    # Both dataset paths are required; fail with a usage message instead of a
    # bare IndexError when they are missing.
    if len(sys.argv) < 3:
        sys.exit('usage: ' + sys.argv[0] + ' <train_json_path> <val_json_path>')

    trainFilePath = sys.argv[1]  # path to the reformatted quasar-s json train file
    valFilePath = sys.argv[2]  # path to the val file

    retrievalInstance = Retrieval()

    # One results directory per run, named after the start time.
    resultsPATH = os.path.join('./Results', datetime.now().strftime("%Y%m%d-%H%M%S"))
    os.makedirs(resultsPATH)

    # Featurizers
    countfeaturizerInstance = CountFeaturizer()
    tfidffeaturizerInstance = TfidfFeaturizer()
    featurizerInstances = [countfeaturizerInstance, tfidffeaturizerInstance]

    # Classifiers
    MNBclassifierInstance = MultinomialNaiveBayes()
    SVMclassifierInstance = SVM()
    MLPclassifierInstance = MLP()
    classifierInstances = [MNBclassifierInstance, SVMclassifierInstance, MLPclassifierInstance]

    # All 2 x 3 combinations. Constructing a Pipeline runs the whole
    # experiment, so the instance itself does not need to be kept.
    for featurizer in featurizerInstances:
        for classifier in classifierInstances:
            Pipeline(trainFilePath, valFilePath, retrievalInstance,
                     featurizer, classifier, resultsPATH)