if "classifier__kernel" in model_params: model_params["classifier__kernel"] = str(model_params["classifier__kernel"]) if test_type == "diagnostic": tag_attr = "diag_tag" TARGET_POSITIVE = "p" elif test_type == "sentiment": tag_attr = "sent_tag" TARGET_POSITIVE = "p" elif test_type == "class": tag_attr = "report_class" TARGET_POSITIVE = TARGET_CLASS else: raise ValueError("Unknown tag: " + test_type) data = data_utils.read_from_csv(data_file) filtered_data = [x for x in data if getattr(x, tag_attr) != "" and getattr(x, tag_attr) != "u"] filtered_data = filtered_data[:2500] # put a limit on the size for performance labels = [np.float32(getattr(x, tag_attr) == TARGET_POSITIVE) for x in filtered_data] report_ids = [x.report_id for x in filtered_data] sentences = [x.processed_sentence for x in filtered_data] train_data, train_labels, test_data, test_labels = data_utils.split_data(sentences, labels, report_ids, split_value) # Create transformation pipeline if USE_RF: pipe = pipelines.get_count_lsi_randomforest() else: pipe = pipelines.get_count_lsi_SVM()
import sys
import json

import numpy as np
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.grid_search import GridSearchCV

import data_utils
import pipelines

if __name__ == "__main__":
    # Expect exactly two arguments: the input CSV and the output model path.
    if len(sys.argv) != 3:
        print "USAGE: " + sys.argv[0] + " input_file output_model_file"
        sys.exit(1)

    input_file = sys.argv[1]
    output_model_file = sys.argv[2]

    # Drop untagged ("") and unknown ("u") examples.
    data = data_utils.read_from_csv(input_file)
    filtered_data = [x for x in data if x.diag_tag != "" and x.diag_tag != "u"]

    # Binary labels: 1.0 for a positive diagnostic tag, 0.0 otherwise.
    labels = [np.float32(x.diag_tag == "p") for x in filtered_data]
    data = [x.processed_sentence for x in filtered_data]
    report_ids = [x.report_id for x in filtered_data]

    train_data, train_labels, test_data, test_labels = data_utils.split_data(data, labels, report_ids, split=0.7)

    # change these parameters for the grid search
    # parameters = {'lsi__n_components': [100],
    #               'classifier__C': [3, 4, 5, 6, 7, 8, 9, 10],
    #               'classifier__kernel': ["rbf"]
    #               }
    parameters = {'lsi__n_components': [100],
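    # The `parameters` dict above is cut off in this excerpt and is left
    # truncated. For illustration only, a typical way to finish this script
    # would be to run the grid search over the pipeline and persist the best
    # parameter set; the JSON output format is an assumption based on the
    # `json` import and the `output_model_file` argument:
    pipe = pipelines.get_count_lsi_SVM()

    # Exhaustive cross-validated search over the parameter grid.
    grid_search = GridSearchCV(pipe, parameters, n_jobs=-1, verbose=1)
    grid_search.fit(train_data, train_labels)
    print "Best parameters: " + str(grid_search.best_params_)

    # Evaluate the refit best estimator on the held-out split.
    predictions = grid_search.predict(test_data)
    print classification_report(test_labels, predictions)

    # Persist the winning parameter set.
    with open(output_model_file, "w") as f:
        json.dump(grid_search.best_params_, f)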