Example no. 1

import pickle

import pandas as pd

# plot_zipf, word2vec_explore, train_w2v_model, get_w2v_array, NaiveBayes, SVM,
# and RandomForest are assumed to be defined elsewhere in this project.

def main():
    plot_zipf()
    word2vec_explore()
    pickle_in = open("processed_text_list.pickle", "rb")
    processed_text_list = pickle.load(pickle_in)
    #preprocess_text(train_text)
    train_w2v_model(processed_text_list)

    #shuffle and partition dataset
    from sklearn.utils import shuffle
    # 'polarity' (the per-tweet sentiment labels) is assumed to be loaded earlier
    # in the original script; it must be the same length as processed_text_list.
    data = pd.DataFrame({'text': processed_text_list, 'labels': polarity})
    data = shuffle(data)
    get_w2v_array(data[:400000])
    w2v_array = pickle.load(open('w2v_features.pickle', 'rb'))
    num_tweets = 400000  # number of tweets to consider
    w2v_array = w2v_array[:num_tweets]
    split_ratio = int(num_tweets * .8)  # row index of the 80/20 train/test split point

    w2v_train = w2v_array[:split_ratio]  # w2v averages for each tweet
    w2v_test = w2v_array[split_ratio:]

    # NOTE: do not reshuffle here. The w2v features above were computed from the
    # first shuffle, so a second shuffle would misalign features and labels.
    simple_train = data['text'][:split_ratio]  # preprocessed text
    simple_test = data['text'][split_ratio:]

    labels_list = data['labels'].tolist()[:num_tweets]
    train_labels = labels_list[:split_ratio]  # list of labels
    test_labels = labels_list[split_ratio:]

    # get_w2v_array(data=data)
    # pickle_in = open("w2v_features.pickle", "rb")
    # w2v_features = pickle.load(pickle_in)

    # naive_bayes = NaiveBayes(simple_train.tolist(), simple_test.tolist(), labels_list)
    # accuracy = naive_bayes.evaluate()
    # print("Naive Bayes accuracy: " + str(accuracy)) #.499

    # svm = SVM(simple_train, train_labels, simple_test, test_labels, 3000, .0000001)
    # accuracy = svm.predict()
    # print("SVM accuracy: " + str(accuracy)) #.744 with a=.0000001 and 3000 epochs

    random_forest = RandomForest(w2v_train,
                                 w2v_test,
                                 train_labels,
                                 test_labels,
                                 'sqrt',  # consider sqrt(n_features) candidate features per split
                                 max_depth=25,
                                 min_leaf=2,
                                 n_trees=500,
                                 model_type='scikit')
    accuracy = random_forest.evaluate()
    print("Random Forest accuracy: " + str(accuracy))
Example no. 2

from argparse import ArgumentParser
from datetime import datetime
from importlib import import_module
import csv
import os

# RandomForest, Sklearn_RF, and HockeyPP (the default hockey-dataset
# preprocessor) are assumed to be defined in the project's own modules.


def main():
    argument_parser = ArgumentParser(
        description="Script to run the RandomForest program.", add_help=False)
    mutually_exclusive_group = argument_parser.add_mutually_exclusive_group()
    mutually_exclusive_group.add_argument(
        '--use_gini',
        action='store_true',
        help="Use the Gini index for attribute splitting in the decision trees."
    )
    mutually_exclusive_group.add_argument(
        '--use_entropy',
        action='store_true',
        help="Use entropy for attribute splitting in the decision trees.")
    mutually_exclusive_group.add_argument(
        '--use_variance',
        action='store_true',
        help="Use entropy for attribute splitting in the decision trees.")
    mutually_exclusive_group2 = argument_parser.add_mutually_exclusive_group()
    mutually_exclusive_group2.add_argument(
        '--use_hockey_preprocessor',
        action='store_true',
        help=
        "Use hockey dataset preprocessing logic on the given dataset. (default)"
    )
    mutually_exclusive_group2.add_argument(
        '--use_custom_preprocessor',
        help=
        "Use custom dataset preprocessing logic on the given dataset, where USE_CUSTOM_PREPROCESSOR is the "
        "filename of the preprocessor file in the preprocessors directory to use, e.g. TemplateDataSetPreprocessor."
    )
    argument_parser.add_argument('-d',
                                 '--data_file',
                                 required=True,
                                 help="File containing the dataset.")
    argument_parser.add_argument(
        '-t',
        '--number_of_trees',
        type=int,
        default=4,
        help="The number of trees to create for the random forest.")
    argument_parser.add_argument(
        '-m',
        '--max_depth',
        type=int,
        help=
        "The maximum depth of all trees in the random forest.  (default: None)."
    )
    argument_parser.add_argument(
        '-s',
        '--min_split_size',
        type=int,
        default=1,
        help=
        "The threshold number of samples required at a node to stop further splitting.  (default: 1)."
    )
    argument_parser.add_argument(
        '-f',
        '--n_features',
        type=int,
        help=
        "The number of features to use when building each tree in the random forest.  Specifying None will use all"
        " the features (default: None).")
    argument_parser.add_argument('-c',
                                 '--target_label',
                                 type=str,
                                 required=True,
                                 help="Target label that we want to predict.")
    argument_parser.add_argument(
        '-k',
        '--sklearn_rf',
        action='store_true',
        help='Train and test dataset on SKlearn Random Forest')
    argument_parser.add_argument(
        '-w',
        '--number_of_workers',
        type=int,
        help=
        "The number of workers to spawn during training of the random forest.  Specifying None will disable this "
        "feature. (default: None).")
    argument_parser.add_argument(
        '-o',
        '--output_file',
        help=
        "Output file of the results.  If the file exists already, new entries will be appended to the end. (default: None)."
    )
    argument_parser.add_argument('-h',
                                 '--help',
                                 action='help',
                                 help="Show this message and exit.")
    arguments = argument_parser.parse_args()
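    # Example invocation (the script and dataset file names here are assumed;
    # 'sum_7yr_GP' is the regression target mentioned below):
    #   python run_random_forest.py -d draft_data.csv -c sum_7yr_GP --use_variance -t 50 -m 10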

    dataset_file = arguments.data_file
    output_file = arguments.output_file

    if arguments.use_custom_preprocessor:
        preprocessor = import_module("preprocessors." +
                                     arguments.use_custom_preprocessor)
    else:
        preprocessor = HockeyPP

    # select the target class name
    class_name = arguments.target_label

    # select the splitting cost function
    split_function = 'gini'

    if arguments.use_entropy:
        split_function = 'entropy'

    elif arguments.use_variance:
        split_function = 'variance'

    # Test regression with 'sum_7yr_GP'
    train_data, test_data = preprocessor.process(dataset_file, class_name)

    random_forest = RandomForest(arguments.number_of_trees,
                                 arguments.max_depth, arguments.min_split_size,
                                 arguments.n_features,
                                 arguments.number_of_workers, split_function)

    t0 = datetime.now()
    random_forest.train(train_data, class_name)
    diff = datetime.now() - t0
    t = divmod(diff.days * 86400 + diff.seconds, 60)
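    # divmod(total_seconds, 60) splits the elapsed time into (minutes, seconds),
    # e.g. divmod(125, 60) == (2, 5).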
    train_results = random_forest.bagging_predict(train_data)
    t0 = datetime.now()
    test_results = random_forest.bagging_predict(test_data)
    diff = datetime.now() - t0
    tp = divmod(diff.days * 86400 + diff.seconds, 60)

    if arguments.use_variance:
        train_accuracy = random_forest.mse(train_results, train_data[:, -1])
        print("\nTrain Mean squared error: {}".format(train_accuracy))

        test_accuracy = random_forest.mse(test_results, test_data[:, -1])
        print("Test Mean squared error: {}\n".format(test_accuracy))
    else:
        train_accuracy = random_forest.evaluate(train_results, train_data[:, -1])
        print("\nTrain Percent Correct: {}".format(train_accuracy))

        test_accuracy = random_forest.evaluate(test_results, test_data[:, -1])
        print("Test Percent Correct: {}\n".format(test_accuracy))

    print("\nTime for train: {}min {}sec".format(t[0], t[1]))
    print("Time for prediction: {}min {}sec\n".format(tp[0], tp[1]))

    if arguments.sklearn_rf:
        sk_rf = Sklearn_RF(arguments.number_of_trees, arguments.max_depth,
                           arguments.min_split_size, arguments.n_features)

        # TODO: Need a sklearn_regression tree as well
        sk_rf.train(train_data, class_name)

        accuracy_sk = sk_rf.evaluate(
            test_data,
            tree_type='regressor' if arguments.use_variance else 'classifier')

        if arguments.use_variance:
            print('sklearn rf MSE: {}'.format(accuracy_sk))
        else:
            print('sklearn rf Percent correct: {}'.format(accuracy_sk * 100))

    # Write out the results to a file, if one is specified, for downstream processing.
    if output_file:
        creating_new_file = True
        if os.path.isfile(output_file):
            creating_new_file = False

        headers = [
            "Features", "MaxDepth", "MinSplitThreshold", "Trees",
            "SplitCriteria", "Target", "TrainAccuracy", "TestAccuracy"
        ]

        with open(output_file, "a") as csv_file:
            writer = csv.DictWriter(csv_file,
                                    fieldnames=headers,
                                    lineterminator="\n")
            if creating_new_file:
                print("Creating a new file {}!\n".format(output_file))
                writer.writeheader()
            else:
                print("Appending to the file {}!\n".format(output_file))
            writer.writerow({
                "Features": arguments.n_features if arguments.n_features else "ALL",
                "MaxDepth": arguments.max_depth if arguments.max_depth else "NOLIMIT",
                "MinSplitThreshold": arguments.min_split_size,
                "Trees": arguments.number_of_trees,
                "SplitCriteria": split_function,
                "Target": class_name,
                "TrainAccuracy": train_accuracy,
                "TestAccuracy": test_accuracy
            })
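
For reference, here is a minimal sketch of what the Sklearn_RF wrapper invoked above might look like, assuming it delegates to scikit-learn's RandomForestClassifier and that the label sits in the last column (as the train_data[:, -1] indexing elsewhere suggests). This is an illustration of the mapping, not the project's actual implementation; the regressor path is omitted.

from sklearn.ensemble import RandomForestClassifier


class Sklearn_RF:
    def __init__(self, n_trees, max_depth, min_split_size, n_features):
        # scikit-learn requires min_samples_split >= 2, hence the guard.
        self.model = RandomForestClassifier(n_estimators=n_trees,
                                            max_depth=max_depth,
                                            min_samples_split=max(2, min_split_size),
                                            max_features=n_features)

    def train(self, train_data, class_name):
        # Assumes the label is the last column, matching the example's
        # train_data[:, -1] indexing; class_name is kept for interface parity.
        self.model.fit(train_data[:, :-1], train_data[:, -1])

    def evaluate(self, test_data, tree_type='classifier'):
        # The real wrapper also supports tree_type='regressor'; this sketch
        # shows only the classifier path, where score() is mean accuracy.
        return self.model.score(test_data[:, :-1], test_data[:, -1])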