def createData(self, output, numDataPoints):
    """Build (features, removed-word-index) training pairs and pickle them.

    Loads merged features and the matching gold sentences for this
    model's sentence length, attaches the intermediate rf labels, dumps
    the resulting list to *output*, and returns it.
    """
    allFeatures = getMergedFeatures(
        "C:/MissingWord/mergedFeatures" + str(self.sentenceLength) + ".txt",
        numDataPoints)
    print("Loaded Features")

    golds = loadGolds(
        "C:/MissingWord/" + str(self.sentenceLength) + "Gold.txt",
        numGolds=len(allFeatures),
        length=self.sentenceLength)
    print(len(allFeatures))

    # Pair each gold sentence with its feature set, preserving order.
    sentences = [Sentence(golds[i], allFeatures[i])
                 for i in range(len(allFeatures))]

    # Attach labels produced by the intermediate rf models.
    appendLabels(str(self.sentenceLength) + "rf15.txt", sentences)
    appendLabels(str(self.sentenceLength) + "rf110.txt", sentences)

    data = [(s.getAllFeatures(), s.getGold().getRemovedIndex())
            for s in sentences]

    with open(output, "wb") as f:
        pickle.dump(data, f)
    return data
# Exemple #2
# 0
 def makeData(self):
     # NOTE(review): truncated snippet -- the remainder of the method is
     # not visible here.  Loads the merged feature sets and the matching
     # gold sentences for this model's sentence length.
     allFeatures = getMergedFeatures(
         "C:/MissingWord/mergedFeatures" + str(self.sentenceLength) +
         ".txt", self.numData)
     golds = loadGolds("C:/MissingWord/" + str(self.sentenceLength) +
                       "Gold.txt",
                       numGolds=len(allFeatures),
                       length=self.sentenceLength)
    def createData(self, output, numDataPoints):
        """Build (features, removed-word-index) pairs and pickle to *output*."""
        featurePath = ("C:/MissingWord/mergedFeatures" +
                       str(self.sentenceLength) + ".txt")
        allFeatures = getMergedFeatures(featurePath, numDataPoints)
        print("Loaded Features")

        goldPath = "C:/MissingWord/" + str(self.sentenceLength) + "Gold.txt"
        golds = loadGolds(goldPath,
                          numGolds=len(allFeatures),
                          length=self.sentenceLength)
        print(len(allFeatures))

        # Pair each gold sentence with its feature set, in order.
        sentences = [Sentence(golds[i], allFeatures[i])
                     for i in range(len(allFeatures))]

        # Attach labels from the intermediate rf models.
        appendLabels(str(self.sentenceLength) + "rf15.txt", sentences)
        appendLabels(str(self.sentenceLength) + "rf110.txt", sentences)

        data = [(sentence.getAllFeatures(),
                 sentence.getGold().getRemovedIndex())
                for sentence in sentences]

        with open(output, "wb") as f:
            pickle.dump(data, f)
        return data
# Exemple #4
# 0
        # NOTE(review): fragment of a larger training method -- `data`,
        # `cutoff`, `trainFeatures` and `trainLabels` come from code not
        # visible here.
        # Everything from index `cutoff` on is the held-out test split.
        testFeatures = [datum[0] for datum in data[cutoff:]]
        testLabels = [datum[1] for datum in data[cutoff:]]

        # Fit a 500-tree random forest using 7 worker processes, then
        # report quality on both splits.
        clf = RandomForestClassifier(n_estimators = 500, n_jobs = 7)
        clf.fit(trainFeatures, trainLabels)
        trainingPred = clf.predict(trainFeatures)
        print(classification_report(trainLabels, trainingPred))
        print(classification_report(testLabels, clf.predict(testFeatures)))

        # Persist the fitted model for this sentence length.
        with open(modelSystem.nPosModelFile(self.sentenceLength), "wb") as f:
            pickle.dump(clf, f)

    def test(self, predictions, predictionsMade, golds):
        """Print a classification report for the predictions actually made."""
        print("Total Sentences " + str(len(predictions)))
        # Keep only the indices a prediction was emitted for, paired
        # with the gold removed-word index at the same position.
        toVerifyPrediction = [predictions[idx] for idx in predictionsMade]
        toVerifyLabel = [golds[idx].getRemovedIndex()
                         for idx in predictionsMade]

        print("Predicted " + str(len(toVerifyPrediction)))

        print(classification_report(toVerifyPrediction, toVerifyLabel))

if __name__ == "__main__":
    # Evaluate the 16-word position model on held-out rows
    # 100000-110000.
    sentenceLength = 16
    model = NPositionModel(sentenceLength)
    features = bridge.loadMergedFeatures(sentenceLength,
                                         start=100000, end=110000)
    predictions, predictionsMade = model.predict(features)
    heldOutGolds = gold.loadGolds(
        "C:/MissingWord/" + str(sentenceLength) + "Gold.txt",
        numGolds=110000)[100000:110000]
    model.test(predictions, predictionsMade, heldOutGolds)
# Exemple #5
# 0
 def makeData(self):
     # NOTE(review): truncated snippet -- the remainder of the method is
     # not visible here.  Loads the merged feature sets and the matching
     # gold sentences for this model's sentence length.
     allFeatures = getMergedFeatures("C:/MissingWord/mergedFeatures"+str(self.sentenceLength)+".txt", self.numData)
     golds = loadGolds("C:/MissingWord/"+str(self.sentenceLength)+"Gold.txt", numGolds = len(allFeatures), length = self.sentenceLength)
# Exemple #6
# 0
            # NOTE(review): fragment of a larger training method -- its
            # beginning is not visible here.  Reports fit quality on the
            # train and test splits.
            print(classification_report(self.clf.predict(trainFeatures), trainLabels))
            print(classification_report(self.clf.predict(testFeatures), testLabels))

    def predict(self, features):
        """Return per-class probability estimates from the trained classifier."""
        classifier = self.clf
        return classifier.predict_proba(features)

    def chunks(l, n):
        # Yield successive n-sized slices of l; the final slice may be
        # shorter.  NOTE(review): declared at method indentation but
        # takes no `self` -- calling it as self.chunks(...) would shift
        # the arguments by one.  It likely should be a @staticmethod;
        # confirm how callers invoke it before changing.
        for i in range(0, len(l), n):
            yield l[i:i+n]

    def export(self, filename):
        """Pickle the trained classifier to *filename*; no-op when untrained."""
        if self.clf is None:
            return
        with open(filename, "wb") as f:
            pickle.dump(self.clf, f)

if __name__ == "__main__":
    #output(15)
    # Build the 15-word position-model structure with intermediate
    # models at offsets 5 and 10.
    pms = PositionModelStructure(15, [5, 10])
    # NOTE(review): the triple-quoted block below is a bare string
    # expression used to disable a call -- i.e. commented-out code.
    '''
    pms.trainIntermediateModels()
'''

    #pms.trainPredictiveModel()


    # Predict on held-out frames 100000-110000 and validate the result
    # against the gold sentences.
    frameFeatures = bridge.loadMergedFeatures(15, 100000, 110000)
    sentenceFeatures = modelSystem.rawToSentenceFeatures(frameFeatures)
    pms.predict(sentenceFeatures)

    pms.validatePrediction(gold.loadGolds("C:/MissingWord/"+str(15)+"Gold.txt", numGolds = 110000)[100000:110000])
    # NOTE(review): fragment -- `f`, `frameLabels`, `sentenceLength` and
    # the parse helpers are defined in code not visible here, and the
    # snippet is truncated at the bottom.  Parses per-frame label lists.
    for line in f:
        chunks = parseSuperList(line.strip())
        labelSet = []
        for elem in chunks:
            labelSet.append(round(float(parseArrayList(elem)[1])))
        frameLabels.append(labelSet)

# Collapse each per-frame label list to the index of the first 1 label,
# or 0 when no frame was marked.
condensedFrameLabels = []
for label in frameLabels:
    if 1 in label:
        condensedFrameLabels.append(label.index(1))
    else:
        condensedFrameLabels.append(0)


# Gold sentences and their removed-word indices, for comparison.
golds = gold.loadGolds("C:/MissingWord/"+str(sentenceLength)+"gold.txt", numGolds = 100000)
goldLabels = [gold.getRemovedIndex() for gold in golds]

# Parse the combined prediction dump: strip the numpy "array(...)"
# wrappers and split the nested-list syntax by hand.
allPredictions = []
with open("C:/MissingWord/frame/15-10Combined.txt", "r") as f:
    for line in f:
        if len(line) > 0:
            line = line.replace("array(", "")
            line = line.replace(")", "")
            line = line.strip()[1: -1]
            predictionSet = []
            for elem in line.split("], ["):
                elem = elem.replace("]", "")
                elem = elem.replace("[", "")
                elems = [float(e) for e in elem.split(", ")]
                # NOTE(review): snippet truncated here -- the body of
                # this condition is not visible.
                if len(elems) == 1 or elems[0] > 0.5:
# Exemple #8
# 0
        # NOTE(review): fragment of a larger training method --
        # trainFeatures/trainLabels and testFeatures/testLabels come
        # from code not visible here.
        # Fit a 500-tree random forest using 7 worker processes, then
        # report quality on both splits.
        clf = RandomForestClassifier(n_estimators=500, n_jobs=7)
        clf.fit(trainFeatures, trainLabels)
        trainingPred = clf.predict(trainFeatures)
        print(classification_report(trainLabels, trainingPred))
        print(classification_report(testLabels, clf.predict(testFeatures)))

        # Persist the fitted model for this sentence length.
        with open(modelSystem.nPosModelFile(self.sentenceLength), "wb") as f:
            pickle.dump(clf, f)

    def test(self, predictions, predictionsMade, golds):
        """Print a classification report for the predictions actually made."""
        print("Total Sentences " + str(len(predictions)))
        # Keep only the entries a prediction was emitted for, paired
        # with the gold removed-word index at the same position.
        verifyPreds = [predictions[index] for index in predictionsMade]
        verifyLabels = [golds[index].getRemovedIndex()
                        for index in predictionsMade]

        print("Predicted " + str(len(verifyPreds)))

        print(classification_report(verifyPreds, verifyLabels))


if __name__ == "__main__":
    # Evaluate the 16-word position model on held-out rows
    # 100000-110000.
    sentenceLength = 16
    model = NPositionModel(sentenceLength)
    mergedFeatures = bridge.loadMergedFeatures(sentenceLength,
                                               start=100000, end=110000)
    predictions, predictionsMade = model.predict(mergedFeatures)
    heldOut = gold.loadGolds(
        "C:/MissingWord/" + str(sentenceLength) + "Gold.txt",
        numGolds=110000)[100000:110000]
    model.test(predictions, predictionsMade, heldOut)