Example #1
import pickle

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# set_group_splits is a project-local helper that adds a 'split' column,
# assigning whole hadm_id groups to either the train or test partition.

def run_100(task, task_df, clf_model, params, args, threshold):
  """Train and evaluate clf_model over 100 seeded group splits, saving each fitted model."""
  preds = []
  targs = []
  probs = []

  seeds = list(range(args.start_seed, args.start_seed + 100))
  for seed in tqdm(seeds, desc=f'{task} Runs'):
    df = set_group_splits(task_df.copy(), group_col='hadm_id', seed=seed)
    vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), binary=True, max_features=60_000)

    x_train = vectorizer.fit_transform(df.loc[(df['split'] == 'train')]['processed_note'])
    x_test = vectorizer.transform(df.loc[(df['split'] == 'test')]['processed_note'])

    y_train = df.loc[(df['split'] == 'train')][f'{task}_label'].to_numpy()
    y_test = df.loc[(df['split'] == 'test')][f'{task}_label'].to_numpy()
    targs.append(y_test)

    clf = clf_model(**params)
    clf.fit(x_train, y_train)
    with open(args.modeldir/f'{task}_seed_{seed}.pkl', 'wb') as f:
      pickle.dump(clf, f)

    pos_prob = clf.predict_proba(x_test)[:, 1]
    probs.append(pos_prob)

    y_pred = (pos_prob > threshold).astype(np.int64)
    preds.append(y_pred)

  # Persist targets, predictions, and probabilities (in that order) for later aggregation
  with open(args.workdir/f'{task}_preds.pkl', 'wb') as f:
    pickle.dump(targs, f)
    pickle.dump(preds, f)
    pickle.dump(probs, f)
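For context, a hypothetical driver for the function above might look like the following; the args namespace, paths, and classifier settings are illustrative assumptions, not part of the original snippet.

# Hypothetical invocation; every concrete value here is an assumption.
from pathlib import Path
from types import SimpleNamespace

from sklearn.linear_model import LogisticRegression

args = SimpleNamespace(
  start_seed=127,           # assumed first seed
  modeldir=Path('models'),  # assumed directory for fitted models
  workdir=Path('work'),     # assumed directory for prediction pickles
)
args.modeldir.mkdir(exist_ok=True)
args.workdir.mkdir(exist_ok=True)

# task_df is assumed to hold 'hadm_id', 'processed_note', and 'ia_label' columns
run_100('ia', task_df, LogisticRegression, {'solver': 'liblinear'}, args, threshold=0.5)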
Example #2
import logging

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from skorch import NeuralNetBinaryClassifier
from skorch.callbacks import Checkpoint, EarlyStopping, LRScheduler, ProgressBar
from skorch.dataset import CVSplit
from skorch.toy import MLPModule
from torch import optim
from tqdm import tqdm

logger = logging.getLogger(__name__)

def run_100(task, task_df, args, threshold):
    """Train an MLP on TF-IDF features over 100 seeded group splits, checkpointing each run."""
    # Halve the learning rate whenever the validation loss plateaus
    reduce_lr = LRScheduler(
        policy='ReduceLROnPlateau',
        mode='min',
        factor=0.5,
        patience=1,
    )

    seeds = list(range(args.start_seed, args.start_seed + 100))
    for seed in tqdm(seeds, desc=f'{task} Runs'):
        logger.info(f"Spliting with seed {seed}")
        checkpoint = Checkpoint(dirname=args.modeldir /
                                f'{task}_seed_{seed}', )
        df = set_group_splits(task_df.copy(), group_col='hadm_id', seed=seed)
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     ngram_range=(1, 2),
                                     binary=True,
                                     max_features=60_000)

        x_train = vectorizer.fit_transform(
            df.loc[(df['split'] == 'train')]['processed_note']).astype(
                np.float32)
        x_test = vectorizer.transform(
            df.loc[(df['split'] == 'test')]['processed_note']).astype(
                np.float32)

        # The skorch net expects dense float32 arrays, so densify the sparse matrices
        x_train = x_train.toarray()
        x_test = x_test.toarray()
        vocab_sz = len(vectorizer.vocabulary_)

        y_train = df.loc[(df['split'] == 'train')][f'{task}_label'].to_numpy()
        y_test = df.loc[(df['split'] == 'test')][f'{task}_label'].to_numpy()

        clf = MLPModule(input_units=vocab_sz,
                        output_units=1,
                        hidden_units=args.hidden_dim,
                        num_hidden=1,
                        dropout=args.dropout_p,
                        squeeze_output=True)

        net = NeuralNetBinaryClassifier(
            clf,
            max_epochs=args.max_epochs,
            lr=args.lr,
            device=args.device,
            optimizer=optim.Adam,
            optimizer__weight_decay=args.wd,
            batch_size=args.batch_size,
            verbose=1,
            callbacks=[EarlyStopping(), ProgressBar(), checkpoint, reduce_lr],
            train_split=CVSplit(cv=0.15, stratified=True),
            iterator_train__shuffle=True,
            threshold=threshold,
        )
        # Drop the default valid_acc scoring callback; early stopping monitors valid_loss
        net.set_params(callbacks__valid_acc=None)
        net.fit(x_train, y_train.astype(np.float32))
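Each run above only persists weights through the Checkpoint callback, so evaluation has to reload them. A minimal sketch of restoring one seed's best checkpoint and scoring the held-out notes, assuming the same hyperparameters and directory layout as above:

# Illustrative only: rebuild the net for one (task, seed) pair and load its checkpoint.
checkpoint = Checkpoint(dirname=args.modeldir / f'{task}_seed_{seed}')
net = NeuralNetBinaryClassifier(
    MLPModule(input_units=vocab_sz, output_units=1,
              hidden_units=args.hidden_dim, num_hidden=1,
              dropout=args.dropout_p, squeeze_output=True),
    device=args.device,
    threshold=threshold,
)
net.initialize()                        # build the module before loading weights
net.load_params(checkpoint=checkpoint)  # restore the best epoch's parameters

pos_prob = net.predict_proba(x_test)[:, 1]        # positive-class probabilities
y_pred = (pos_prob > threshold).astype(np.int64)  # apply the decision threshold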
Example #3
  if len(sys.argv) != 2:
    logger.error(f"Usage: {sys.argv[0]} task_name (ia|ps)")
    sys.exit(1)

  task = sys.argv[1]
  ori_df = pd.read_csv(args.dataset_csv, usecols=args.cols, parse_dates=args.dates)
  if task == 'ia':
    logger.info("Running hyperparameter search for Imminent Admission Prediction task")
    task_df = ori_df.loc[(ori_df['imminent_adm_label'] != -1)][args.imminent_adm_cols].reset_index(drop=True)
    label = 'imminent_adm_label'
  elif task == 'ps':
    logger.info("Running hyperparameter search for Prolonged Stay Prediction task")
    task_df = ori_df[args.prolonged_stay_cols].copy()
    label = 'prolonged_stay_label'
  else:
    logger.error(f"Unknown task '{task}'; expected 'ia' or 'ps'")
    sys.exit(1)

  df = set_group_splits(task_df.copy(), group_col='hadm_id', seed=42)
  vectorizer = TfidfVectorizer(min_df=args.min_freq, analyzer=str.split, sublinear_tf=True, ngram_range=(2,2))

  x_train = vectorizer.fit_transform(df.loc[(df['split'] == 'train')]['processed_note'])
  x_test = vectorizer.transform(df.loc[(df['split'] == 'test')]['processed_note'])
  y_train = df.loc[(df['split'] == 'train')][label].to_numpy()
  y_test = df.loc[(df['split'] == 'test')][label].to_numpy()

  clf_params = {
    'solver': 'liblinear',
    'multi_class': 'ovr',
  }

  clf = LogisticRegression(**clf_params)

  param_space = {
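The snippet breaks off just as param_space is declared, so the original search space is not shown. Purely as a hedged sketch, a grid search over the LogisticRegression above might continue along these lines; the grid values and scoring metric below are assumptions:

# Hypothetical continuation; the real param_space is not visible in the snippet.
from sklearn.model_selection import GridSearchCV

param_space = {
  'C': [0.01, 0.1, 1.0, 10.0],         # assumed regularization strengths
  'class_weight': [None, 'balanced'],  # assumed class-weighting options
}

search = GridSearchCV(clf, param_space, scoring='roc_auc', cv=5, n_jobs=-1)
search.fit(x_train, y_train)
logger.info(f"Best params: {search.best_params_} (CV AUC {search.best_score_:.3f})")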