Example 1
def init(args):
    """
        Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    #need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    #load vocab and other lookups
    desc_embed = args.lmbda > 0
    print("loading lookups...")
    dicts = datasets.load_lookups(args, desc_embed=desc_embed)

    model = tools.pick_model(args, dicts)
    print(model)

    if not args.test_model:
        optimizer = optim.Adam(model.parameters(),
                               weight_decay=args.weight_decay,
                               lr=args.lr)
    else:
        optimizer = None

    params = tools.make_param_dict(args)

    return args, model, optimizer, params, dicts
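
init() expects an argparse-style namespace. A minimal sketch of how it might be driven from a training entry point; the flag names below are assumed for illustration, and real scripts take many more options (data_path, vocab, model, Y, ...):

import argparse

parser = argparse.ArgumentParser()
# Only the fields init() touches directly in this example.
parser.add_argument("--lmbda", type=float, default=0.0)
parser.add_argument("--lr", type=float, default=1e-4)
parser.add_argument("--weight-decay", type=float, default=0.0)
parser.add_argument("--test-model", default=None)
args = parser.parse_args()

args, model, optimizer, params, dicts = init(args)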
Example 2
def setup():
    """
        Load data, build model, create optimizer, create vars to hold metrics, etc.
    """

    args = {
        "data_path": "train_full.csv",
        "vocab": "./datafiles/vocab.csv",
        "model": "conv_attn",
        "filter_size": 10,
        "num_filter_maps": 50,
        "dropout": .2,
        "lr": .0001,
        "gpu": False,
        "test_model": "model.pth",
        "public_model": "true",
        "Y": "full",
        "n_epochs": 200
    }

    dicts = datasets.load_lookups(args)
    dicts["code_descs"] = datasets.load_code_descriptions()
    model = tools.pick_model(args, dicts)

    # with open('./datafiles/example_note.txt', 'r') as notefile:
    #     note = notefile.read()
    #     test(model, False, note, dicts) #testing

    @app.route("/", methods=['POST'])
    def hello():
        # Pull the clinical note out of the JSON body before doing anything with it.
        note = request.get_json().get("note")
        if note is None:
            return "Note not found"
        print(note)
        results = test(model, False, note, dicts)
        return jsonify(results)
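
The route above expects a JSON body with a "note" field and returns the model's predictions as JSON. A minimal client-side sketch, assuming the Flask app runs on its default local port 5000:

import requests

resp = requests.post("http://localhost:5000/",
                     json={"note": "Patient admitted with chest pain ..."})
print(resp.json())  # predictions produced by test(model, False, note, dicts)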
Example 3
def init(args):
    """
        Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    #need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    freq_params = None
    if args.samples or args.lmbda > 0:
        print("loading code frequencies...")
        code_freqs, n = datasets.load_code_freqs(args.data_path)
        print(
            "code_freqs:",
            sorted(code_freqs.items(),
                   key=operator.itemgetter(1),
                   reverse=True)[:10], "n:", n)
        freq_params = (code_freqs, n)

    #load vocab and other lookups
    desc_embed = args.lmbda > 0
    dicts = datasets.load_lookups(args.data_path,
                                  args.vocab,
                                  desc_embed=desc_embed)

    model = tools.pick_model(args, dicts)
    print(model)

    optimizer = optim.Adam(model.params_to_optimize(),
                           weight_decay=args.weight_decay,
                           lr=args.lr)

    params = tools.make_param_dict(args)

    return args, model, optimizer, params, freq_params, dicts
Example 4
def init(args):
    """
        Load data, build model, create optimizer, create vars to hold metrics, etc.
    """

    #load vocab and other lookups
    print("loading lookups...")

    dicts = datasets.load_lookups(args, hier=args.hier)

    model, optimizer = init_model(args, dicts)

    print(model)

    params = vars(args)

    return args, model, optimizer, params, dicts
Example 5
def init(args):
    """
        Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    #need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    #load vocab and other lookups
    desc_embed = args.lmbda > 0
    print("loading lookups...")
    dicts = datasets.load_lookups(args, desc_embed=desc_embed)

    META_TEST = args.test_model is not None
    model, start_epoch, optimizer = tools.pick_model(args, dicts, META_TEST)
    print(model)

    params = tools.make_param_dict(args)

    return args, model, optimizer, params, dicts, start_epoch
Example 6
def init(args):
    """
        Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    # need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    freq_params = None

    # load vocab and other lookups
    dicts = datasets.load_lookups(args.data_path, args.vocab)

    model = tools.pick_model(args, dicts)
    print(model)

    optimizer = optim.Adam(model.params_to_optimize(),
                           weight_decay=args.weight_decay,
                           lr=args.lr)
    # optimizer = optim.Adam(model.module.params_to_optimize(), weight_decay=args.weight_decay, lr=args.lr)

    params = tools.make_param_dict(args)

    return args, model, optimizer, params, freq_params, dicts
Example 7
def init(args):
    """
        Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    # need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    # load vocab and other lookups
    desc_embed = args.lmbda > 0
    print("loading lookups...")
    dicts = datasets.load_lookups(args, desc_embed=desc_embed)

    model = transformer.TransformerAttn(args.Y, args.embed_file, dicts,
                                        args.lmbda, args.gpu, args.embed_size,
                                        args.num_layers, args.heads, args.d_ff,
                                        args.dropout,
                                        args.max_relative_positions)
    if args.gpu:
        model.cuda()
    print(model)

    if not args.test_model:
        # Adam wrapped in the Noam warmup/decay learning-rate schedule used for
        # transformer training.
        optimizer = NoamOpt(
            100, 2, 4000,
            torch.optim.Adam(model.parameters(),
                             lr=0,
                             betas=(0.9, 0.98),
                             eps=1e-9))
    else:
        optimizer = None

    params = tools.make_param_dict(args)

    return args, model, optimizer, params, dicts
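
NoamOpt wraps Adam in the warmup-then-inverse-square-root learning-rate schedule from the Transformer paper, here with model_size=100, factor=2 and warmup=4000. A minimal sketch of that rate rule, assuming NoamOpt follows the Annotated Transformer implementation:

def noam_rate(step, model_size=100, factor=2, warmup=4000):
    # Learning rate grows linearly for `warmup` steps, then decays as step**-0.5,
    # scaled by factor * model_size**-0.5.
    return factor * (model_size ** -0.5) * min(step ** -0.5, step * warmup ** -1.5)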
Example 8
def main(Y, train_fname, dev_fname, vocab_file, version, n):
    n = int(n)

    #need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    #get lookups from non-BOW data
    data_path = train_fname.replace(
        '_bows', '') if "_bows" in train_fname else train_fname
    dicts = datasets.load_lookups(data_path,
                                  vocab_file=vocab_file,
                                  Y=Y,
                                  version=version)
    w2ind, ind2c, c2ind = dicts['w2ind'], dicts['ind2c'], dicts['c2ind']

    X, yy_tr, hids_tr = read_bows(Y, train_fname, c2ind, version)
    X_dv, yy_dv, hids_dv = read_bows(Y, dev_fname, c2ind, version)

    print("X.shape: " + str(X.shape))
    print("yy_tr.shape: " + str(yy_tr.shape))
    print("X_dv.shape: " + str(X_dv.shape))
    print("yy_dv.shape: " + str(yy_dv.shape))

    #deal with labels that don't have any positive examples
    #drop empty columns from yy. keep track of which columns kept
    #predict on test data with those columns. guess 0 on the others
    labels_with_examples = yy_tr.sum(axis=0).nonzero()[0]
    yy = yy_tr[:, labels_with_examples]

    # build the classifier
    clf = OneVsRestClassifier(LogisticRegression(C=C,
                                                 max_iter=MAX_ITER,
                                                 solver='sag'),
                              n_jobs=-1)

    # train
    print("training...")
    clf.fit(X, yy)

    #predict
    print("predicting...")
    yhat = clf.predict(X_dv)
    yhat_raw = clf.predict_proba(X_dv)

    #deal with labels that don't have positive training examples
    print("reshaping output to deal with labels missing from train set")
    labels_with_examples = set(labels_with_examples)
    yhat_full = np.zeros(yy_dv.shape)
    yhat_full_raw = np.zeros(yy_dv.shape)
    j = 0
    for i in range(yhat_full.shape[1]):
        if i in labels_with_examples:
            yhat_full[:, i] = yhat[:, j]
            yhat_full_raw[:, i] = yhat_raw[:, j]
            j += 1

    #evaluate
    metrics, fpr, tpr = evaluation.all_metrics(yhat_full,
                                               yy_dv,
                                               k=[8, 15],
                                               yhat_raw=yhat_full_raw)
    evaluation.print_metrics(metrics)

    #save metric history, model, params
    print("saving predictions")
    model_dir = os.path.join(
        MODEL_DIR,
        '_'.join(["log_reg",
                  time.strftime('%b_%d_%H:%M', time.localtime())]))
    os.mkdir(model_dir)
    preds_file = tools.write_preds(yhat_full, model_dir, hids_dv, 'test',
                                   yhat_full_raw)

    print("sanity check on train")
    yhat_tr = clf.predict(X)
    yhat_tr_raw = clf.predict_proba(X)

    #reshape output again
    yhat_tr_full = np.zeros(yy_tr.shape)
    yhat_tr_full_raw = np.zeros(yy_tr.shape)
    j = 0
    for i in range(yhat_tr_full.shape[1]):
        if i in labels_with_examples:
            yhat_tr_full[:, i] = yhat_tr[:, j]
            yhat_tr_full_raw[:, i] = yhat_tr_raw[:, j]
            j += 1

    #evaluate
    metrics_tr, fpr_tr, tpr_tr = evaluation.all_metrics(
        yhat_tr_full, yy_tr, k=[8, 15], yhat_raw=yhat_tr_full_raw)
    evaluation.print_metrics(metrics_tr)

    if n > 0:
        print("generating top important ngrams")
        if 'bows' in dev_fname:
            dev_fname = dev_fname.replace('_bows', '')
        print("calculating top ngrams using file %s" % dev_fname)
        calculate_top_ngrams(dev_fname, clf, c2ind, w2ind,
                             labels_with_examples, n)

    #Commenting this out because the models are huge (11G for mimic3 full)
    #print("saving model")
    #with open("%s/model.pkl" % model_dir, 'wb') as f:
    #    pickle.dump(clf, f)

    print("saving metrics")
    metrics_hist = defaultdict(lambda: [])
    metrics_hist_tr = defaultdict(lambda: [])
    for name in metrics.keys():
        metrics_hist[name].append(metrics[name])
    for name in metrics_tr.keys():
        metrics_hist_tr[name].append(metrics_tr[name])
    metrics_hist_all = (metrics_hist, metrics_hist, metrics_hist_tr)
    persistence.save_metrics(metrics_hist_all, model_dir)
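
The column bookkeeping in this script (drop label columns with no positive training examples, then widen predictions back to the full label space, guessing 0 for the dropped columns) can be illustrated on a toy array. A standalone sketch, not part of the original script:

import numpy as np

yy_tr = np.array([[1, 0, 0, 1],
                  [0, 0, 0, 1]])                        # columns 1 and 2 have no positives
labels_with_examples = yy_tr.sum(axis=0).nonzero()[0]   # array([0, 3])
yy = yy_tr[:, labels_with_examples]                      # train only on columns 0 and 3

yhat = np.array([[1, 1],
                 [0, 1]])                                # predictions over the kept columns
yhat_full = np.zeros(yy_tr.shape)
yhat_full[:, labels_with_examples] = yhat                # dropped columns stay all-zero

The fancy-index assignment on the last line performs the same padding as the explicit j-indexed loop used above.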
Example 9

# Set parameters:
maxlen = 200
embedding_dims = 200
nb_filter = 500
filter_length = 4
batch_size = 8
nb_epoch = 10
nb_labels = 50
train_data_path = "../mimicdata/mimic3/train_50.csv"
dev_data_path = "../mimicdata/mimic3/dev_50.csv"
test_data_path = "../mimicdata/mimic3/test_50.csv"
vocab = "../mimicdata/mimic3/vocab.csv"
embed_file = "../mimicdata/mimic3/processed_full.embed"
dicts = datasets.load_lookups(train_data_path, vocab, Y=nb_labels)
vocab_size = len(dicts[0])
embed_weight = extract_wvs.load_embeddings(embed_file)

# Load data
print('Loading data...')


def slim_data_generator(data_path):
    while 1:
        for batch_idx, tup in enumerate(
                datasets.data_generator(data_path,
                                        dicts,
                                        batch_size=batch_size,
                                        num_labels=nb_labels)):
            X, y, _, code_set, descs = tup
            # Assumed continuation: hand the batch to Keras fit_generator.
            yield X, y
Example 10
def init(args):
    """
        Load data, build model, create optimizer, create vars to hold metrics, etc.
    """
    #need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    #load vocab and other lookups
    desc_embed = args.lmbda > 0
    print("loading lookups...")
    dicts = datasets.load_lookups(args, desc_embed=desc_embed)

    model = tools.pick_model(args, dicts)
    print(model)

    if not args.test_model:
        if args.model in BERT_MODEL_LIST:
            # Apply weight decay only to parameters that are not biases or LayerNorm terms.
            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {
                    'params': [p for n, p in param_optimizer
                               if not any(nd in n for nd in no_decay)],
                    'weight_decay': args.weight_decay,
                },
                {
                    'params': [p for n, p in param_optimizer
                               if any(nd in n for nd in no_decay)],
                    'weight_decay': 0.0,
                },
            ]
            optimizer = optim.Adam(optimizer_grouped_parameters, lr=args.lr)
            length = datasets.data_length(args.data_path, args.version)
            t_total = length // args.batch_size * args.n_epochs
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total)

            def get_label_distribution(filename, dicts):
                c2ind = dicts['c2ind']
                if args.Y == 'full':
                    labels_idx = [1e-15] * 8921
                else:
                    labels_idx = [1e-15] * int(args.Y)
                with open(filename, 'r') as infile:
                    r = csv.reader(infile)
                    # skip the header row
                    next(r)
                    for row in r:
                        for l in row[3].split(';'):
                            if l in c2ind:
                                labels_idx[int(c2ind[l])] += 1
                max_val = max(labels_idx)
                return max_val / np.array(labels_idx)

            if args.pos:
                labels_weight = get_label_distribution(args.data_path, dicts)
            else:
                labels_weight = None
        else:
            optimizer = optim.Adam(model.parameters(),
                                   weight_decay=args.weight_decay,
                                   lr=args.lr)
            scheduler = None
            labels_weight = None
    else:
        optimizer = None
        scheduler = None
        labels_weight = None

    params = tools.make_param_dict(args)

    return args, model, optimizer, params, dicts, scheduler, labels_weight
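
get_label_distribution() weights each label by max_count / count, so the most frequent code gets weight 1 and rarer codes get proportionally larger weights. A small standalone illustration of the same arithmetic, with invented counts:

import numpy as np

counts = np.array([100.0, 25.0, 4.0, 1e-15])  # last code never appears in training
labels_weight = counts.max() / counts
# -> [1.0, 4.0, 25.0, 1e17]; the 1e-15 floor only prevents division by zero,
#    so codes absent from training receive an extremely large weight.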