# ----- Beispiel #1 (example-snippet separator from scrape; trailing "0" is an artifact) -----
    else:
        raise ValueError("Unknown tag: " + test_type)

    # Load rows from CSV and keep only sentences with a usable annotation for
    # the current tag: drop empty labels and "u" (presumably "unknown" --
    # TODO confirm against the annotation guidelines).
    # NOTE(review): data_utils.read_from_csv / split_data are project helpers
    # whose exact contracts are not visible in this chunk.
    data = data_utils.read_from_csv(data_file)
    filtered_data = [x for x in data if getattr(x, tag_attr) != "" and getattr(x, tag_attr) != "u"]
    filtered_data = filtered_data[:2500]  # put a limit on the size for performance

    # Binary labels: 1.0 when the tag equals TARGET_POSITIVE, else 0.0.
    labels = [np.float32(getattr(x, tag_attr) == TARGET_POSITIVE) for x in filtered_data]
    report_ids = [x.report_id for x in filtered_data]
    sentences = [x.processed_sentence for x in filtered_data]

    # Split into train/test; report_ids are passed in, presumably so sentences
    # from one report stay in the same fold -- TODO confirm in data_utils.
    train_data, train_labels, test_data, test_labels = data_utils.split_data(sentences, labels, report_ids, split_value)

    # Create transformation pipeline (count vectorizer -> LSI -> classifier);
    # USE_RF toggles random forest vs. SVM as the final estimator.
    if USE_RF:
        pipe = pipelines.get_count_lsi_randomforest()
    else:
        pipe = pipelines.get_count_lsi_SVM()

    # set pipe parameters and train model
    pipe.set_params(**model_params)
    pipe.fit(train_data, train_labels)

    # Class-distribution summary "<size> [<negatives>, <positives>]" for the
    # full, train and test sets (Python 2 print statements).
    print "Total = " + str(len(filtered_data)) + " [" + str(labels.count(0)) + ", " + str(labels.count(1)) + "]"
    print "Train = " + str(len(train_data)) + " [" + str(train_labels.count(0)) + ", " + str(
        train_labels.count(1)
    ) + "]"
    print "Test = " + str(len(test_data)) + " [" + str(test_labels.count(0)) + ", " + str(test_labels.count(1)) + "]"

    # Training performance data
    y_true_train = train_labels
# ----- Beispiel #2 (example-snippet separator from scrape; trailing "0" is an artifact) -----
    # change these parameters for the grid search
    # parameters = {'lsi__n_components': [100],
    #               'classifier__C': [3, 4, 5, 6, 7, 8, 9, 10],
    #               'classifier__kernel': ["rbf"]
    #               }

    parameters = {'lsi__n_components': [100],
                  'classifier__n_estimators': [1000],
                  'classifier__max_depth': [5, 10],
                  'classifier__min_samples_split': [5, 10],
                  'classifier__min_samples_leaf': [5, 10],
                  }

    # clf = GridSearchCV(pipelines.get_count_lsi_SVM(), parameters)
    clf = GridSearchCV(pipelines.get_count_lsi_randomforest(), parameters)
    clf.fit(train_data, train_labels)
    print "Best parameters set found o      n development set:"
    print ""
    print clf.best_params_
    print ""
    print "Grid scores on development set:"
    print ""
    for params, mean_score, scores in clf.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params)
    print ""

    print "Detailed classification report:"
    print ""
    print "The model is trained on the full development set."
    print "The scores are computed on the full evaluation set."