else: raise ValueError("Unknown tag: " + test_type) data = data_utils.read_from_csv(data_file) filtered_data = [x for x in data if getattr(x, tag_attr) != "" and getattr(x, tag_attr) != "u"] filtered_data = filtered_data[:2500] # put a limit on the size for performance labels = [np.float32(getattr(x, tag_attr) == TARGET_POSITIVE) for x in filtered_data] report_ids = [x.report_id for x in filtered_data] sentences = [x.processed_sentence for x in filtered_data] train_data, train_labels, test_data, test_labels = data_utils.split_data(sentences, labels, report_ids, split_value) # Create transformation pipeline if USE_RF: pipe = pipelines.get_count_lsi_randomforest() else: pipe = pipelines.get_count_lsi_SVM() # set pipe parameters and train model pipe.set_params(**model_params) pipe.fit(train_data, train_labels) print "Total = " + str(len(filtered_data)) + " [" + str(labels.count(0)) + ", " + str(labels.count(1)) + "]" print "Train = " + str(len(train_data)) + " [" + str(train_labels.count(0)) + ", " + str( train_labels.count(1) ) + "]" print "Test = " + str(len(test_data)) + " [" + str(test_labels.count(0)) + ", " + str(test_labels.count(1)) + "]" # Training performance data y_true_train = train_labels
# Exhaustive grid search over the random-forest pipeline's hyper-parameters,
# followed by a printed report of the best setting and every grid point's score.
#
# change these parameters for the grid search
# Alternative grid for the SVM pipeline (pair it with get_count_lsi_SVM below):
# parameters = {'lsi__n_components': [100],
#               'classifier__C': [3, 4, 5, 6, 7, 8, 9, 10],
#               'classifier__kernel': ["rbf"]
#               }
parameters = {
    'lsi__n_components': [100],
    'classifier__n_estimators': [1000],
    'classifier__max_depth': [5, 10],
    'classifier__min_samples_split': [5, 10],
    'classifier__min_samples_leaf': [5, 10],
}

# clf = GridSearchCV(pipelines.get_count_lsi_SVM(), parameters)
clf = GridSearchCV(pipelines.get_count_lsi_randomforest(), parameters)
clf.fit(train_data, train_labels)

# NOTE: every print below takes exactly one parenthesised argument, so the
# output is the same whether `print` is a statement (Python 2) or a function.
print("Best parameters set found on development set:")
print("")
print(clf.best_params_)
print("")
print("Grid scores on development set:")
print("")

# `grid_scores_` was removed in scikit-learn 0.20.  Use it when the installed
# version still provides it (unchanged behaviour), otherwise read the same
# per-candidate mean / standard deviation from `cv_results_`.
legacy_scores = getattr(clf, "grid_scores_", None)
if legacy_scores is not None:
    score_rows = [
        (mean_score, scores.std(), params)
        for params, mean_score, scores in legacy_scores
    ]
else:
    cv_results = clf.cv_results_
    score_rows = zip(
        cv_results["mean_test_score"],
        cv_results["std_test_score"],
        cv_results["params"],
    )

# The "+/-" figure is two standard deviations of the cross-validation scores.
for mean_score, score_std, params in score_rows:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, score_std * 2, params))

print("")
print("Detailed classification report:")
print("")
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")