Example #1
0
def data_sparse(cfg, disease, judgement, use_svd=False):
    """Bag-of-cuis data for sparse evaluation.

    Loads train/test sets via DatasetProvider, fits a TF-IDF vectorizer
    on the training documents, dumps the training matrix to a libsvm
    file, and returns dense train/test feature matrices with labels.

    Args:
      cfg: a configparser.ConfigParser with a 'data' section holding
        train_data/train_annot/test_data/test_annot paths (relative to
        the DATA_ROOT environment variable).
      disease: disease name used to select annotations.
      judgement: judgement type passed through to DatasetProvider.
      use_svd: unused; kept for interface compatibility with callers.

    Returns:
      (x_train, y_train, x_test, y_test) where the x's are dense
      numpy arrays of TF-IDF features.
    """

    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # handle training data first
    train_data_provider = DatasetProvider(train_data, train_annot, disease,
                                          judgement)
    x_train, y_train = train_data_provider.load_raw()
    print('train examples:', len(x_train))

    vectorizer = TfidfVectorizer(ngram_range=NGRAM_RANGE,
                                 stop_words='english',
                                 min_df=MIN_DF,
                                 vocabulary=None,
                                 binary=False)
    x_train = vectorizer.fit_transform(x_train)

    # BUG FIX: original referenced an undefined name (train_tfidf_matrix);
    # the fitted TF-IDF matrix is x_train at this point.
    dump_svmlight_file(x_train, y_train, disease + "_train.libsvm")

    # now handle the test set (transform only -- no refitting)
    test_data_provider = DatasetProvider(test_data, test_annot, disease,
                                         judgement)
    x_test, y_test = test_data_provider.load_raw()
    print('test examples:', len(x_test))
    x_test = vectorizer.transform(x_test)

    return x_train.toarray(), y_train, x_test.toarray(), y_test
Example #2
0
def run_evaluation_svd(disease, judgement):
    """Train on train set and evaluate on test set"""

    print('disease:', disease)
    print('judgement:', judgement)

    # resolve dataset paths from the config file named on the command line
    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    data_root = os.environ['DATA_ROOT']
    paths = {
        key: os.path.join(data_root, cfg.get('data', key))
        for key in ('train_data', 'train_annot', 'test_data', 'test_annot')
    }

    # load raw train examples
    train_provider = DatasetProvider(paths['train_data'], paths['train_annot'],
                                     disease, judgement)
    x_train, y_train = train_provider.load_raw()
    print('train examples:', len(x_train))

    # map documents through a previously-fitted TF-IDF vectorizer
    with open('../Svd/Model/tfidf.p', 'rb') as pickled:
        vectorizer = pickle.load(pickled)
    train_features = vectorizer.transform(x_train)

    # load raw test examples and vectorize them the same way
    test_provider = DatasetProvider(paths['test_data'], paths['test_annot'],
                                    disease, judgement)
    x_test, y_test = test_provider.load_raw()
    print('test examples:', len(x_test))
    test_features = vectorizer.transform(x_test)

    # project both matrices into the low-dimensional SVD space
    print('input shape:', train_features.shape)
    with open('../Svd/Model/svd.p', 'rb') as pickled:
        svd = pickle.load(pickled)
    train_features = svd.transform(train_features)
    test_features = svd.transform(test_features)
    print('output shape:', train_features.shape)

    # fit a linear SVM and predict on the held-out set
    classifier = LinearSVC(class_weight='balanced')
    classifier.fit(train_features, y_train)
    predictions = classifier.predict(test_features)

    # macro-averaged metrics
    p = precision_score(y_test, predictions, average='macro')
    r = recall_score(y_test, predictions, average='macro')
    f1 = f1_score(y_test, predictions, average='macro')
    print('unique labels in train:', len(set(y_train)))
    print('p = %.3f' % p)
    print('r = %.3f' % r)
    print('f1 = %.3f\n' % f1)

    print('%.3f & %.3f & %.3f\n' % (p, r, f1))

    return p, r, f1
def run_evaluation_sparse(disease, judgement, use_svd=False):
    """Train on train set and evaluate on test set.

    Builds count + TF-IDF features from raw train data, optionally
    reduces them to 300 dimensions with truncated SVD, fits a linear
    SVM, and reports macro-averaged precision/recall/F1 on the test set.

    Args:
      disease: disease name used to select annotations.
      judgement: judgement type passed through to DatasetProvider.
      use_svd: when True, reduce sparse TF-IDF vectors to 300 dims.

    Returns:
      (p, r, f1) macro-averaged scores on the test set.
    """

    print('disease:', disease)
    print('judgement:', judgement)

    # FIX: this block was Python 2 (print statements, ConfigParser module)
    # while the rest of the file is Python 3; converted for consistency.
    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train_data'))
    train_annot = os.path.join(base, cfg.get('data', 'train_annot'))
    test_data = os.path.join(base, cfg.get('data', 'test_data'))
    test_annot = os.path.join(base, cfg.get('data', 'test_annot'))

    # handle training data first; build the alphabet from scratch
    train_data_provider = DatasetProvider(
        train_data,
        train_annot,
        disease,
        judgement,
        use_pickled_alphabet=False,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'))
    x_train, y_train = train_data_provider.load_raw()
    print('train examples:', len(x_train))

    vectorizer = CountVectorizer(
        ngram_range=NGRAM_RANGE,
        stop_words='english',
        min_df=MIN_DF,
        vocabulary=None,
        binary=False)
    train_count_matrix = vectorizer.fit_transform(x_train)

    tf = TfidfTransformer()
    train_tfidf_matrix = tf.fit_transform(train_count_matrix)

    # now handle the test set; reuse the alphabet pickled above
    test_data_provider = DatasetProvider(
        test_data,
        test_annot,
        disease,
        judgement,
        use_pickled_alphabet=True,
        alphabet_pickle=cfg.get('data', 'alphabet_pickle'))
    x_test, y_test = test_data_provider.load_raw()
    print('test examples:', len(x_test))

    test_count_matrix = vectorizer.transform(x_test)
    test_tfidf_matrix = tf.transform(test_count_matrix)

    if use_svd:
        # reduce sparse vector to 300 dimensions
        svd = TruncatedSVD(n_components=300)
        train_tfidf_matrix = svd.fit_transform(train_tfidf_matrix)
        test_tfidf_matrix = svd.transform(test_tfidf_matrix)

    classifier = LinearSVC(class_weight='balanced')
    classifier.fit(train_tfidf_matrix, y_train)
    predictions = classifier.predict(test_tfidf_matrix)

    # macro-averaged metrics over all label classes
    p = precision_score(y_test, predictions, average='macro')
    r = recall_score(y_test, predictions, average='macro')
    f1 = f1_score(y_test, predictions, average='macro')
    print('unique labels in train:', len(set(y_train)))
    print('p = %.3f' % p)
    print('r = %.3f' % r)
    print('f1 = %.3f\n' % f1)

    return p, r, f1