def createData(self, output, numDataPoints):
    # Load precomputed merged features and the matching gold annotations.
    allFeatures = getMergedFeatures(
        "C:/MissingWord/mergedFeatures" + str(self.sentenceLength) + ".txt",
        numDataPoints)
    print("Loaded Features")
    golds = loadGolds("C:/MissingWord/" + str(self.sentenceLength) + "Gold.txt",
                      numGolds=len(allFeatures), length=self.sentenceLength)
    print(len(allFeatures))

    # Pair each gold annotation with its feature set.
    sentences = []
    for i in range(len(allFeatures)):
        sentences.append(Sentence(golds[i], allFeatures[i]))

    # Attach the intermediate random-forest labels to each sentence.
    appendLabels(str(self.sentenceLength) + "rf15.txt", sentences)
    appendLabels(str(self.sentenceLength) + "rf110.txt", sentences)

    # Pickle (features, removed-word index) pairs for later training runs.
    data = [(sentence.getAllFeatures(), sentence.getGold().getRemovedIndex())
            for sentence in sentences]
    with open(output, "wb") as f:
        pickle.dump(data, f)
    return data
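# --- Hedged sketch (not in the original source): one way the pickled output of
# --- createData might be read back and split for a training run. The filename
# --- and the 90/10 split ratio are assumptions; only the
# --- (features, removedIndex) tuple layout comes from the code above, and the
# --- variable names mirror the training excerpt further below.
import pickle

with open("16data.pkl", "rb") as f:  # hypothetical output path
    data = pickle.load(f)
cutoff = int(0.9 * len(data))  # assumed split; matches the data[cutoff:] slicing below
trainFeatures = [datum[0] for datum in data[:cutoff]]
trainLabels = [datum[1] for datum in data[:cutoff]]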
def makeData(self):
    allFeatures = getMergedFeatures(
        "C:/MissingWord/mergedFeatures" + str(self.sentenceLength) + ".txt",
        self.numData)
    golds = loadGolds("C:/MissingWord/" + str(self.sentenceLength) + "Gold.txt",
                      numGolds=len(allFeatures), length=self.sentenceLength)
# (Excerpt: continuation of the NPositionModel training method; trainFeatures
# and trainLabels come from data[:cutoff] earlier in the method.)
testFeatures = [datum[0] for datum in data[cutoff:]]
testLabels = [datum[1] for datum in data[cutoff:]]

clf = RandomForestClassifier(n_estimators=500, n_jobs=7)
clf.fit(trainFeatures, trainLabels)
trainingPred = clf.predict(trainFeatures)
print(classification_report(trainLabels, trainingPred))
print(classification_report(testLabels, clf.predict(testFeatures)))
with open(modelSystem.nPosModelFile(self.sentenceLength), "wb") as f:
    pickle.dump(clf, f)

def test(self, predictions, predictionsMade, golds):
    print("Total Sentences " + str(len(predictions)))
    toVerifyPrediction = []
    toVerifyLabel = []
    for predictionIndex in predictionsMade:
        toVerifyPrediction.append(predictions[predictionIndex])
        toVerifyLabel.append(golds[predictionIndex].getRemovedIndex())
    print("Predicted " + str(len(toVerifyPrediction)))
    # classification_report expects the gold labels first, predictions second
    print(classification_report(toVerifyLabel, toVerifyPrediction))

if __name__ == "__main__":
    sentenceLength = 16
    model = NPositionModel(sentenceLength)
    predictions, predictionsMade = model.predict(
        bridge.loadMergedFeatures(sentenceLength, start=100000, end=110000))
    model.test(predictions, predictionsMade,
               gold.loadGolds("C:/MissingWord/" + str(sentenceLength) + "Gold.txt",
                              numGolds=110000)[100000:110000])
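# --- Hedged sketch (not in the original source): reloading the forest saved by
# --- the training code above. pickle round-trips sklearn estimators;
# --- modelSystem.nPosModelFile is the same path helper used at save time, and
# --- testFeatures is assumed to be the held-out split from above.
import pickle

with open(modelSystem.nPosModelFile(16), "rb") as f:
    clf = pickle.load(f)
# predict_proba yields one probability per candidate removal position
probs = clf.predict_proba(testFeatures)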
# (Excerpt: tail of the PositionModelStructure training method.)
# classification_report expects the gold labels first, predictions second
print(classification_report(trainLabels, self.clf.predict(trainFeatures)))
print(classification_report(testLabels, self.clf.predict(testFeatures)))

def predict(self, features):
    # per-class probabilities, one row per candidate position
    return self.clf.predict_proba(features)

# Module-level helper (takes no self): yield successive n-sized slices of l.
def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

def export(self, filename):
    if self.clf is not None:
        with open(filename, "wb") as f:
            pickle.dump(self.clf, f)

if __name__ == "__main__":
    # output(15)
    pms = PositionModelStructure(15, [5, 10])
    # pms.trainIntermediateModels()
    # pms.trainPredictiveModel()
    frameFeatures = bridge.loadMergedFeatures(15, 100000, 110000)
    sentenceFeatures = modelSystem.rawToSentenceFeatures(frameFeatures)
    pms.predict(sentenceFeatures)
    pms.validatePrediction(
        gold.loadGolds("C:/MissingWord/" + str(15) + "Gold.txt",
                       numGolds=110000)[100000:110000])
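# --- Hedged usage example (not in the original source): chunks() regroups a
# --- flat list into fixed-size windows, e.g. per-frame rows back into
# --- per-sentence groups. The group size 5 here is illustrative only.
rows = list(range(15))
for group in chunks(rows, 5):
    print(group)  # [0..4], [5..9], [10..14]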
# (Excerpt: reading per-frame labels from an open file handle f.)
for line in f:
    chunks = parseSuperList(line.strip())
    labelSet = []
    for elem in chunks:
        labelSet.append(round(float(parseArrayList(elem)[1])))
    frameLabels.append(labelSet)

# Collapse each per-frame label vector to the index of the first predicted
# position (first 1), defaulting to 0 when no frame fired.
condensedFrameLabels = []
for label in frameLabels:
    if 1 in label:
        condensedFrameLabels.append(label.index(1))
    else:
        condensedFrameLabels.append(0)

golds = gold.loadGolds("C:/MissingWord/" + str(sentenceLength) + "gold.txt",
                       numGolds=100000)
# use `g`, not `gold`, so the loop variable does not shadow the gold module
goldLabels = [g.getRemovedIndex() for g in golds]

allPredictions = []
with open("C:/MissingWord/frame/15-10Combined.txt", "r") as f:
    for line in f:
        if len(line) > 0:
            # strip numpy repr noise ("array(...)") and the outer brackets
            line = line.replace("array(", "")
            line = line.replace(")", "")
            line = line.strip()[1:-1]
            predictionSet = []
            for elem in line.split("], ["):
                elem = elem.replace("]", "")
                elem = elem.replace("[", "")
                elems = [float(e) for e in elem.split(", ")]
                if len(elems) == 1 or elems[0] > 0.5:
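# --- Hedged sketch (not in the original source): parseSuperList and
# --- parseArrayList are referenced above but not defined in this excerpt.
# --- Judging from how they are called (a line holds a list of
# --- "[label, score]" chunks), minimal versions might look like this; the
# --- exact text format is an assumption.
def parseSuperList(line):
    # assumed format: "[[a, b], [c, d]]" -> ["a, b", "c, d"]
    return line.strip()[2:-2].split("], [")

def parseArrayList(chunk):
    # assumed format: "a, b" -> ["a", "b"]
    return [e.strip() for e in chunk.split(",")]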