def data_sparse(cfg, disease, judgement, use_svd=False):
    """Return bag-of-cuis tf-idf data for sparse evaluation.

    Fits a TfidfVectorizer on the raw training texts, applies the fitted
    vectorizer to the test texts, and dumps the training matrix to a
    libsvm-format file named '<disease>_train.libsvm'.

    Args:
      cfg: parsed config with a [data] section holding the four data paths
        (joined against the DATA_ROOT environment variable).
      disease: disease name, forwarded to DatasetProvider.
      judgement: judgement type, forwarded to DatasetProvider.
      use_svd: unused here; kept for interface compatibility with callers
        that pass it (see run_evaluation_sparse, where it is honored).

    Returns:
      (x_train, y_train, x_test, y_test) where the x arrays are dense
      tf-idf feature matrices.
    """
    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # handle training data first
    train_data_provider = DatasetProvider(train_data, train_annot, disease, judgement)
    x_train, y_train = train_data_provider.load_raw()
    print('train examples:', len(x_train))

    vectorizer = TfidfVectorizer(ngram_range=NGRAM_RANGE,
                                 stop_words='english',
                                 min_df=MIN_DF,
                                 vocabulary=None,
                                 binary=False)
    x_train = vectorizer.fit_transform(x_train)
    # BUG FIX: the original passed the undefined name 'train_tfidf_matrix'
    # here (a NameError at runtime); the fitted matrix is bound to x_train.
    dump_svmlight_file(x_train, y_train, disease + "_train.libsvm")

    # now handle the test set
    test_data_provider = DatasetProvider(test_data, test_annot, disease, judgement)
    x_test, y_test = test_data_provider.load_raw()
    print('test examples:', len(x_test))
    x_test = vectorizer.transform(x_test)

    return x_train.toarray(), y_train, x_test.toarray(), y_test
def run_evaluation_svd(disease, judgement):
    """Train on train set and evaluate on test set.

    Loads a pre-fit tf-idf vectorizer and a pre-fit SVD model from
    ../Svd/Model/, maps train and test texts into the low-dimensional
    space, fits a class-balanced linear SVM, and prints/returns macro
    precision, recall, and F1 on the test set.

    Args:
      disease: disease name, forwarded to DatasetProvider.
      judgement: judgement type, forwarded to DatasetProvider.

    Returns:
      (p, r, f1) macro-averaged scores on the test set.
    """
    print('disease:', disease)
    print('judgement:', judgement)

    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])  # config path comes from the command line
    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # handle training data first
    train_data_provider = DatasetProvider(train_data, train_annot, disease, judgement)
    x_train, y_train = train_data_provider.load_raw()
    print('train examples:', len(x_train))

    # load tfidf vectorizer model and transform xs into it.
    # FIX: use context managers so the pickle files are closed (the
    # original leaked both file handles).
    # NOTE(review): pickle.load on these model files assumes they are
    # trusted local artifacts — do not point these paths at external data.
    with open('../Svd/Model/tfidf.p', 'rb') as f:
        vectorizer = pickle.load(f)
    train_tfidf_matrix = vectorizer.transform(x_train)

    # now handle the test set
    test_data_provider = DatasetProvider(test_data, test_annot, disease, judgement)
    x_test, y_test = test_data_provider.load_raw()
    print('test examples:', len(x_test))
    test_tfidf_matrix = vectorizer.transform(x_test)

    # load svd model and map train/test to low dimensions
    print('input shape:', train_tfidf_matrix.shape)
    with open('../Svd/Model/svd.p', 'rb') as f:
        svd = pickle.load(f)
    train_tfidf_matrix = svd.transform(train_tfidf_matrix)
    test_tfidf_matrix = svd.transform(test_tfidf_matrix)
    print('output shape:', train_tfidf_matrix.shape)

    classifier = LinearSVC(class_weight='balanced')
    classifier.fit(train_tfidf_matrix, y_train)
    predictions = classifier.predict(test_tfidf_matrix)

    p = precision_score(y_test, predictions, average='macro')
    r = recall_score(y_test, predictions, average='macro')
    f1 = f1_score(y_test, predictions, average='macro')
    print('unique labels in train:', len(set(y_train)))
    print('p = %.3f' % p)
    print('r = %.3f' % r)
    print('f1 = %.3f\n' % f1)
    print('%.3f & %.3f & %.3f\n' % (p, r, f1))

    return p, r, f1
def run_evaluation_sparse(disease, judgement, use_svd=False):
    """Train on train set and evaluate on test set.

    Builds count -> tf-idf features on the training set, optionally
    reduces them to 300 dimensions with truncated SVD, fits a
    class-balanced linear SVM, and prints/returns macro precision,
    recall, and F1 on the test set.

    FIX: modernized from Python 2 (print statements, the old
    ConfigParser module) to Python 3 for consistency with the other
    evaluation functions in this file; the truncated trailing print
    statements were also repaired.

    Args:
      disease: disease name, forwarded to DatasetProvider.
      judgement: judgement type, forwarded to DatasetProvider.
      use_svd: when True, project the tf-idf matrices down to 300
        dimensions with TruncatedSVD before training.

    Returns:
      (p, r, f1) macro-averaged scores on the test set.
    """
    print('disease:', disease)
    print('judgement:', judgement)

    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])  # config path comes from the command line
    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # handle training data first; the training run writes the alphabet pickle
    train_data_provider = DatasetProvider(
        train_data,
        train_annot,
        disease,
        judgement,
        use_pickled_alphabet=False,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'))
    x_train, y_train = train_data_provider.load_raw()
    print('train examples:', len(x_train))

    vectorizer = CountVectorizer(
        ngram_range=NGRAM_RANGE,
        stop_words='english',
        min_df=MIN_DF,
        vocabulary=None,
        binary=False)
    train_count_matrix = vectorizer.fit_transform(x_train)

    tf = TfidfTransformer()
    train_tfidf_matrix = tf.fit_transform(train_count_matrix)

    # now handle the test set; it reuses the alphabet pickled above
    test_data_provider = DatasetProvider(
        test_data,
        test_annot,
        disease,
        judgement,
        use_pickled_alphabet=True,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'))
    x_test, y_test = test_data_provider.load_raw()
    print('test examples:', len(x_test))

    test_count_matrix = vectorizer.transform(x_test)
    test_tfidf_matrix = tf.transform(test_count_matrix)

    if use_svd:
        # reduce sparse vector to 300 dimensions
        svd = TruncatedSVD(n_components=300)
        train_tfidf_matrix = svd.fit_transform(train_tfidf_matrix)
        test_tfidf_matrix = svd.transform(test_tfidf_matrix)

    classifier = LinearSVC(class_weight='balanced')
    classifier.fit(train_tfidf_matrix, y_train)
    predictions = classifier.predict(test_tfidf_matrix)

    p = precision_score(y_test, predictions, average='macro')
    r = recall_score(y_test, predictions, average='macro')
    f1 = f1_score(y_test, predictions, average='macro')
    print('unique labels in train:', len(set(y_train)))
    print('p = %.3f' % p)
    print('r = %.3f' % r)
    print('f1 = %.3f\n' % f1)

    return p, r, f1