Ejemplo n.º 1
0
def _parse_flags(argv):
    """Return (do_retrain, do_rebuildValidation, do_test) parsed from argv.

    A flag is switched on by any argument that contains both the flag name
    and the substring "yes" (for example ``--retrain=yes``). Every flag
    defaults to False.
    """
    flag_names = ("--retrain", "--rebuildValidation", "--test")
    enabled = dict.fromkeys(flag_names, False)
    for arg in argv:
        if "yes" not in arg:
            continue
        for name in flag_names:
            if name in arg:
                enabled[name] = True
    return tuple(enabled[name] for name in flag_names)


def _latest_model_path(models_dir):
    """Return the path of the most recently named model file, or None.

    None is returned when models_dir does not exist or is empty. Models are
    saved by main() as "trainedModel<millis>", so sorting the names puts the
    newest one last (as long as the timestamps have the same digit count).
    os.listdir() itself returns entries in arbitrary order, so the listing
    must be sorted before taking the last element.
    """
    if not os.path.isdir(models_dir):
        return None
    filenames = sorted(os.listdir(models_dir))
    if not filenames:
        return None
    return models_dir + filenames[-1]


def main():
    """Preprocess, extract features, learn a classifier, then evaluate it.

    Command-line flags (each enabled by passing e.g. ``--retrain=yes``):
        --retrain            train a new model even if a saved one exists
        --rebuildValidation  parsed, but only used by the disabled
                             organizeTrainWithValidation path below
        --test               evaluate on "data/test/" instead of
                             "data/validation/"

    A new model is also trained when no saved model is found in "models/".
    Newly trained models are written to "models/trainedModel<millis>".
    """
    models_dir = "models/"

    # process flags
    do_retrain, do_rebuildValidation, do_test = _parse_flags(sys.argv[1:])

    # preprocessing
    do = DataOrganizer()

    # __________________________________ TRAINING ________________________ #

    # use BoG to convert to frequency vector
    fe = FeatureExtractor(FeatureExtractor.ModelType.BagOfClusters)

    # get the latest trained model (None when nothing has been saved yet)
    clf_file = _latest_model_path(models_dir)

    # get sets of tweets as training data
    # trainData0, trainData1, validation0, validation1 \
    #     = do.organizeTrainWithValidation("data/trainValidate/", do_rebuildValidation)

    trainData0, trainData1 = do.organizeTrain("data/train/")

    if do_retrain or not clf_file:
        # learn a fresh classifier from the two training sets
        X0, X1 = fe.extractTrainFeatureVectors((trainData0, trainData1))
        clf = learn(X0, X1)

        # millisecond timestamp keeps model names unique and sortable
        millis = int(round(time.time() * 1000))
        clf_file = models_dir + "trainedModel" + str(millis)
        print("Saving model to file...")

        # make sure the target directory exists before dumping into it
        if not os.path.isdir(models_dir):
            os.makedirs(models_dir)
        joblib.dump(clf, clf_file, compress=1)
    else:
        print("Using trained model and BoG...")
        fe.bog = BagOfWords()
        fe.bog.getLatestBoG()
        clf = joblib.load(clf_file)

    # we're either validating or testing based on the passed flag

    # NOTE(review): the two branches below unpack the results of
    # organizeTest() / extractTestFeatureVectors() differently (labels come
    # from the extractor in one and from the organizer in the other).
    # Confirm against those helpers which shape is correct.

    # ____________________________________VALIDATION__________________________#
    if not do_test:
        # feed in the validation sets as one set
        validationData = do.organizeTest("data/validation/")
        validationFeatures, validationLabels = fe.extractTestFeatureVectors(
            validationData)
        test("Validation", clf, validationFeatures, validationLabels)
    else:
        # ____________________________________TESTING _______________________ #

        # extract test features and test
        print("Using testing")
        testData, testLabels = do.organizeTest("data/test/")
        testFeatures = fe.extractTestFeatureVectors(testData)
        test("Testing, Global Protests With Background Subtraction", clf,
             testFeatures, testLabels)