class Pipeline(object):
    """Question-answering run: load data, featurize, train, predict, evaluate.

    Constructing an instance immediately runs the whole pipeline (see
    ``question_answering``): it prints the evaluation metrics and writes a
    per-question comparison file to ``predFilePath``.
    """

    # Only the first MAX_TRAIN_QUESTIONS training questions are used.
    MAX_TRAIN_QUESTIONS = 6000

    def __init__(self, trainFilePath, valFilePath, retrievalInstance, featurizerInstance, classifierInstance, predFilePath):
        """Load the train/validation JSON files and run the pipeline.

        Args:
            trainFilePath: Path to the training JSON file.
            valFilePath: Path to the validation JSON file.
            retrievalInstance: Object providing ``getLongSnippets(question)``
                and ``getShortSnippets(question)``.
            featurizerInstance: Object providing
                ``getFeatureRepresentation(X_train, X_val)``.
            classifierInstance: Object providing ``buildClassifier(X, Y)``.
            predFilePath: Path of the comparison file written after prediction.
        """
        self.retrievalInstance = retrievalInstance
        self.featurizerInstance = featurizerInstance
        self.classifierInstance = classifierInstance
        self.predFile = predFilePath
        # Context managers guarantee the handles are released even when the
        # JSON is malformed and json.load raises.
        with open(trainFilePath, 'r') as trainfile:
            self.trainData = json.load(trainfile)
        with open(valFilePath, 'r') as valfile:
            self.valData = json.load(valfile)
        self.question_answering()

    def makeXY(self, dataQuestions):
        """Build parallel input/label lists from an iterable of questions.

        Args:
            dataQuestions: Iterable of question dicts, each with a non-empty
                ``'answers'`` list.

        Returns:
            Tuple ``(X, Y)``: ``X`` holds the short snippets returned by the
            retrieval instance, ``Y`` the first answer of each question.
        """
        X = []
        Y = []
        for question in dataQuestions:
            # The long snippets are not used below; the call is kept because
            # the retrieval object may rely on it -- TODO confirm and drop.
            self.retrievalInstance.getLongSnippets(question)
            X.append(self.retrievalInstance.getShortSnippets(question))
            Y.append(question['answers'][0])
        return X, Y

    def question_answering(self):
        """Train on the training split, predict the validation split, report.

        Writes one line per validation question to ``self.predFile`` in the
        form ``"<1|0> <predicted> <true>"`` (1 when the prediction matches)
        and prints accuracy, precision, recall and F-measure.
        """
        train_questions = self.trainData['questions'][0:self.MAX_TRAIN_QUESTIONS]
        X_train, Y_train = self.makeXY(train_questions)
        X_val, Y_val_true = self.makeXY(self.valData['questions'])

        # Featurization and training.
        X_features_train, X_features_val = self.featurizerInstance.getFeatureRepresentation(X_train, X_val)
        self.clf = self.classifierInstance.buildClassifier(X_features_train, Y_train)

        # Prediction.
        Y_val_pred = self.clf.predict(X_features_val)

        self.evaluatorInstance = Evaluator()
        a = self.evaluatorInstance.getAccuracy(Y_val_true, Y_val_pred)
        p, r, f = self.evaluatorInstance.getPRF(Y_val_true, Y_val_pred)

        # Write to file: comparison flag, prediction, ground truth.
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(self.predFile)
        with open(self.predFile, 'w') as predfile:
            for truth, pred in zip(Y_val_true, Y_val_pred):
                flag = "1" if truth == pred else "0"
                predfile.write(flag + " " + str(pred) + " " + str(truth) + '\n')

        print("Accuracy: " + str(a))
        print("Precision: " + str(p))
        print("Recall: " + str(r))
        print("F-measure: " + str(f))
class Pipeline(object):
    """Question-answering run that also saves (true, predicted) pairs as CSV.

    Constructing an instance immediately runs the whole pipeline (see
    ``question_answering``).
    """

    def __init__(self, trainFilePath, valFilePath, saveFilePath, retrievalInstance,
                 featurizerInstance, classifierInstance):
        """Load the train/validation JSON files and run the pipeline.

        Args:
            trainFilePath: Path to the training JSON file.
            valFilePath: Path to the validation JSON file.
            saveFilePath: Path of the CSV file that receives one
                ``(true, predicted)`` row per validation question.
            retrievalInstance: Object providing ``getLongSnippets(question)``
                and ``getShortSnippets(question)``.
            featurizerInstance: Object providing
                ``getFeatureRepresentation(X_train, X_val)``.
            classifierInstance: Object providing ``buildClassifier(X, Y)``.
        """
        self.saveFilePath = saveFilePath
        self.retrievalInstance = retrievalInstance
        self.featurizerInstance = featurizerInstance
        self.classifierInstance = classifierInstance
        # Context managers release the handles even if json.load raises.
        with open(trainFilePath, 'r') as trainfile:
            self.trainData = json.load(trainfile)
        with open(valFilePath, 'r') as valfile:
            self.valData = json.load(valfile)
        self.question_answering()

    def makeXY(self, dataQuestions):
        """Build parallel input/label lists from an iterable of questions.

        Args:
            dataQuestions: Iterable of question dicts, each with a non-empty
                ``'answers'`` list.

        Returns:
            Tuple ``(X, Y)``: ``X`` holds the short snippets returned by the
            retrieval instance, ``Y`` the first answer of each question.
        """
        X = []
        Y = []
        for question in dataQuestions:
            # The long snippets are not used below; the call is kept because
            # the retrieval object may rely on it -- TODO confirm and drop.
            self.retrievalInstance.getLongSnippets(question)
            X.append(self.retrievalInstance.getShortSnippets(question))
            Y.append(question['answers'][0])
        return X, Y

    def question_answering(self):
        """Train, predict the validation split, print metrics and save a CSV.

        Returns:
            The predictions produced by the classifier for the validation
            questions.
        """
        X_train, Y_train = self.makeXY(self.trainData['questions'])
        X_val, Y_val_true = self.makeXY(self.valData['questions'])

        # Featurization and training.
        X_features_train, X_features_val = self.featurizerInstance.getFeatureRepresentation(
            X_train, X_val)
        self.clf = self.classifierInstance.buildClassifier(
            X_features_train, Y_train)

        # Prediction.
        Y_val_pred = self.clf.predict(X_features_val)

        self.evaluatorInstance = Evaluator()
        a = self.evaluatorInstance.getAccuracy(Y_val_true, Y_val_pred)
        p, r, f = self.evaluatorInstance.getPRF(Y_val_true, Y_val_pred)
        print("Accuracy: " + str(a))
        print("Precision: " + str(p))
        print("Recall: " + str(r))
        print("F-measure: " + str(f))

        # NOTE(review): on Python 3 the csv docs recommend opening the file
        # with newline=''; left unchanged because the target interpreter
        # version is not visible from here.
        with open(self.saveFilePath, 'w') as fout:
            csv.writer(fout).writerows(zip(Y_val_true, Y_val_pred))
        return Y_val_pred
# Example #3
# 0
class Pipeline(object):
    """Question-answering run over every featurizer/classifier combination.

    Constructing an instance immediately runs the whole grid (see
    ``question_answering``) and prints the metrics of each combination.
    """

    # Only the first MAX_TRAIN_QUESTIONS training questions are used.
    MAX_TRAIN_QUESTIONS = 1000

    def __init__(self, trainFilePath, valFilePath, retrievalInstance,
                 featurizerInstances, classifierInstances):
        """Load the train/validation JSON files and run the grid.

        Args:
            trainFilePath: Path to the training JSON file.
            valFilePath: Path to the validation JSON file.
            retrievalInstance: Object providing ``getLongSnippets(question)``
                and ``getShortSnippets(question)``.
            featurizerInstances: Iterable of objects providing
                ``getFeatureRepresentation(X_train, X_val)``.
            classifierInstances: Iterable of objects providing
                ``buildClassifier(X, Y)``.
        """
        self.retrievalInstance = retrievalInstance
        self.featurizerInstances = featurizerInstances
        self.classifierInstances = classifierInstances
        # Context managers release the handles even if json.load raises.
        with open(trainFilePath, 'r') as trainfile:
            self.trainData = json.load(trainfile)
        with open(valFilePath, 'r') as valfile:
            self.valData = json.load(valfile)
        self.question_answering()

    def makeXY(self, dataQuestions):
        """Build parallel input/label lists from an iterable of questions.

        Args:
            dataQuestions: Iterable of question dicts, each with a non-empty
                ``'answers'`` list.

        Returns:
            Tuple ``(X, Y)``: ``X`` holds the short snippets returned by the
            retrieval instance, ``Y`` the first answer of each question.
        """
        X = []
        Y = []
        for question in dataQuestions:
            # The long snippets are not used below; the call is kept because
            # the retrieval object may rely on it -- TODO confirm and drop.
            self.retrievalInstance.getLongSnippets(question)
            X.append(self.retrievalInstance.getShortSnippets(question))
            Y.append(question['answers'][0])
        return X, Y

    def question_answering(self):
        """Train and evaluate every featurizer/classifier pair.

        For each pair this featurizes the data, builds the classifier,
        predicts the validation split and prints accuracy, precision, recall
        and F-measure. ``self.clf`` and ``self.evaluatorInstance`` hold the
        objects of the last pair that was run.
        """
        train_questions = self.trainData['questions'][0:self.MAX_TRAIN_QUESTIONS]
        X_train, Y_train = self.makeXY(train_questions)
        X_val, Y_val_true = self.makeXY(self.valData['questions'])

        for featurizer in self.featurizerInstances:
            for classifier in self.classifierInstances:
                # Built as one string so the output is identical on Python 2
                # and 3 (spacing mirrors the original comma-separated print).
                print("Running pipeline with featurizer:  " + str(featurizer) +
                      "  and classifier  " + str(classifier))

                # Featurization and training.
                X_features_train, X_features_val = featurizer.getFeatureRepresentation(
                    X_train, X_val)
                self.clf = classifier.buildClassifier(X_features_train,
                                                      Y_train)

                # Prediction.
                Y_val_pred = self.clf.predict(X_features_val)

                # Evaluation.
                self.evaluatorInstance = Evaluator()
                a = self.evaluatorInstance.getAccuracy(Y_val_true, Y_val_pred)
                p, r, f = self.evaluatorInstance.getPRF(Y_val_true, Y_val_pred)
                # Each metric prints its own value; previously all four lines
                # printed the accuracy.
                print("Accuracy: " + str(a))
                print("Precision: " + str(p))
                print("Recall: " + str(r))
                print("F-measure: " + str(f))
                print('\n')
# Example #4
# 0
class Pipeline(object):
    """Staged question-answering pipeline.

    The constructor loads the data and prepares the train/validation inputs
    and features; training, prediction and evaluation happen in ``qa``.
    """

    def __init__(self, trainFilePath, valFilePath, retrievalInstance, featurizerInstance, classifierInstance):
        """Load the JSON files, truncate the training set and build features.

        Args:
            trainFilePath: Path to the training JSON file.
            valFilePath: Path to the validation JSON file.
            retrievalInstance: Object providing ``getLongSnippets(question)``
                and ``getShortSnippets(question)``.
            featurizerInstance: Object providing
                ``getFeatureRepresentation(X_train, X_val)``.
            classifierInstance: Object providing ``buildClassifier(X, Y)``.
        """
        self.retrievalInstance = retrievalInstance
        self.featurizerInstance = featurizerInstance
        self.classifierInstance = classifierInstance
        self.evaluatorInstance = Evaluator()
        # Context managers release the handles even if json.load raises.
        with open(trainFilePath, 'r') as trainfile:
            self.trainData = json.load(trainfile)
        # Keep only the first N training questions (N is defined outside
        # this class).
        self.trainData['questions'] = self.trainData['questions'][0:N]

        with open(valFilePath, 'r') as valfile:
            self.valData = json.load(valfile)
        self.prepare_data()
        self.prepare_features()

    def makeXY(self, dataQuestions):
        """Build parallel input/label lists from an iterable of questions.

        Args:
            dataQuestions: Iterable of question dicts, each with a non-empty
                ``'answers'`` list.

        Returns:
            Tuple ``(X, Y)``: ``X`` holds the short snippets returned by the
            retrieval instance, ``Y`` the first answer of each question.
        """
        X = []
        Y = []
        for question in dataQuestions:
            # The long snippets are not used below; the call is kept because
            # the retrieval object may rely on it -- TODO confirm and drop.
            self.retrievalInstance.getLongSnippets(question)
            X.append(self.retrievalInstance.getShortSnippets(question))
            Y.append(question['answers'][0])
        return X, Y

    def get_data(self):
        """Return ``(X, Y)`` built from the (truncated) training questions."""
        return self.makeXY(self.trainData['questions'])

    def prepare_data(self):
        """Populate ``X_train``/``Y_train`` and ``X_val``/``Y_val_true``."""
        self.X_train, self.Y_train = self.makeXY(self.trainData['questions'])
        self.X_val, self.Y_val_true = self.makeXY(self.valData['questions'])

    def prepare_features(self):
        """Featurize the prepared train/validation inputs.

        Requires ``prepare_data`` to have been called first.
        """
        self.X_features_train, self.X_features_val = self.featurizerInstance.getFeatureRepresentation(self.X_train, self.X_val)

    def qa(self):
        """Train the classifier, predict the validation split, print metrics."""
        self.clf = self.classifierInstance.buildClassifier(self.X_features_train, self.Y_train)
        # Prediction.
        Y_val_pred = self.clf.predict(self.X_features_val)

        a = self.evaluatorInstance.getAccuracy(self.Y_val_true, Y_val_pred)
        p, r, f = self.evaluatorInstance.getPRF(self.Y_val_true, Y_val_pred)

        print("Accuracy: " + str(a))
        print("Precision: " + str(p))
        print("Recall: " + str(r))
        print("F-measure: " + str(f))
class Pipeline(object):
    """Question-answering run that stores metrics and predictions as JSON.

    Constructing an instance immediately runs the whole pipeline (see
    ``question_answering``).
    """

    # Only the first MAX_TRAIN_QUESTIONS training questions are used
    # (the original author noted 31049 questions in the data set).
    MAX_TRAIN_QUESTIONS = 30000

    def __init__(self, trainFilePath, valFilePath, retrievalInstance,
                 featurizerInstance, classifierInstance, resultsPATH):
        """Load the train/validation JSON files and run the pipeline.

        Args:
            trainFilePath: Path to the training JSON file.
            valFilePath: Path to the validation JSON file.
            retrievalInstance: Object providing ``getLongSnippets(question)``
                and ``getShortSnippets(question)``.
            featurizerInstance: Object providing
                ``getFeatureRepresentation(X_train, X_val)``.
            classifierInstance: Object providing ``buildClassifier(X, Y)``.
            resultsPATH: Directory in which the results file is written.
        """
        self.retrievalInstance = retrievalInstance
        self.featurizerInstance = featurizerInstance
        self.classifierInstance = classifierInstance
        # Context managers release the handles even if json.load raises.
        with open(trainFilePath, 'r') as trainfile:
            self.trainData = json.load(trainfile)
        with open(valFilePath, 'r') as valfile:
            self.valData = json.load(valfile)
        self.PATH = resultsPATH
        self.question_answering()

    def makeXY(self, dataQuestions):
        """Build parallel input/label lists from an iterable of questions.

        Args:
            dataQuestions: Iterable of question dicts, each with a non-empty
                ``'answers'`` list.

        Returns:
            Tuple ``(X, Y)``: ``X`` holds the short snippets returned by the
            retrieval instance, ``Y`` the first answer of each question.
        """
        X = []
        Y = []
        for question in dataQuestions:
            # The long snippets are not used below; the call is kept because
            # the retrieval object may rely on it -- TODO confirm and drop.
            self.retrievalInstance.getLongSnippets(question)
            X.append(self.retrievalInstance.getShortSnippets(question))
            Y.append(question['answers'][0])
        return X, Y

    def question_answering(self):
        """Train, predict the validation split, print and persist the results.

        The results file is named ``<FeaturizerClass><ClassifierClass>`` and
        is written inside ``self.PATH`` as UTF-8 JSON.
        """
        print('Loading data...')
        train_questions = self.trainData['questions'][0:self.MAX_TRAIN_QUESTIONS]
        X_train, Y_train = self.makeXY(train_questions)
        X_val, Y_val_true = self.makeXY(self.valData['questions'])

        # Featurization and training.
        print('Feature Extraction...')
        X_features_train, X_features_val = self.featurizerInstance.getFeatureRepresentation(
            X_train, X_val)
        self.clf = self.classifierInstance.buildClassifier(
            X_features_train, Y_train)

        # Prediction.
        print('Prediction...')
        Y_val_pred = self.clf.predict(X_features_val)

        self.evaluatorInstance = Evaluator()
        a = self.evaluatorInstance.getAccuracy(Y_val_true, Y_val_pred)
        p, r, f = self.evaluatorInstance.getPRF(Y_val_true, Y_val_pred)

        print("Accuracy: " + str(a))
        print("Precision: " + str(p))
        print("Recall: " + str(r))
        print("F-measure: " + str(f))

        featurizer_name = self.featurizerInstance.__class__.__name__
        classifier_name = self.classifierInstance.__class__.__name__

        # Save metrics and predictions as JSON.
        results = {
            'feature': featurizer_name,
            'classifier': classifier_name,
            'training size': len(X_train),
            'accuracy': a,
            'precision': p,
            'recall': r,
            'F-measure': f,
            # The prediction object must expose .tolist()
            # (presumably a numpy array -- confirm against the classifier).
            'predictions': Y_val_pred.tolist(),
        }
        results_path = os.path.join(self.PATH, featurizer_name + classifier_name)
        # The context manager closes (and therefore flushes) the file; the
        # handle was previously left open, so the JSON could be truncated.
        with open(results_path, 'w', encoding='utf-8') as results_file:
            json.dump(results, results_file, ensure_ascii=False)