Ejemplo n.º 1
0
def test():
    """Evaluate a trained Seq2Seq model on the MIMIC test set.

    Loads vocabularies and the train/test splits, restores the model,
    then prints per-example and running-average precision/recall
    (against the golden mapping) plus BLEU/ROUGE scores.
    """
    input_vocab = load("diag_vocab.pkl")
    output_vocab = load("drug_vocab.pkl")
    test_set = load("mimic_episodes_index_test.pkl")
    train_set = load("mimic_episodes_index_train.pkl")
    seq2seq = Seq2Seq()
    seq2seq.load_data(input_vocab, output_vocab, train_set, test_set)
    seq2seq.load()
    evaluator = Evaluator()
    precisions = []
    recalls = []
    bleus = []
    rogues = []
    # Special tokens are excluded from predictions before scoring.
    special_ids = {seq2seq.PAD_ID, seq2seq.GO_ID, seq2seq.EOS_ID, seq2seq.UNK_ID}
    for t in test_set:
        outputs = seq2seq.predict(t[0])
        print(outputs, len(t[1]))
        # Build the de-duplicated, special-token-free prediction once
        # instead of recomputing it for each metric (was duplicated).
        filtered = list(set(outputs) - special_ids)
        precision, recall = evaluator.get_golden_eval(t[0], filtered)
        precisions.append(precision)
        recalls.append(recall)
        bleu, rogue = evaluator.get_result(t[1], filtered)
        bleus.append(bleu)
        rogues.append(rogue)
        print("inputs: ", t[0])
        print("target: ", t[1])
        print("output: ", outputs)
        print("pre", precision, "rec", recall)
        print("average", np.mean(precisions), np.mean(recalls))
        print("bleu", bleu, "rogue", rogue)
        print("average", np.mean(bleus), np.mean(rogues))
Ejemplo n.º 2
0
def convert_ground_truth():
    """Translate the ICD->NDC golden mapping from raw codes to vocabulary
    indices and persist both directions of the index mapping."""
    gt = load("icd_to_ndc.pkl")
    diag_vocab = load("diag_vocab.pkl")
    drug_vocab = load("drug_vocab.pkl")
    gt_index = {}
    for raw_icd in gt:
        normalized = normalize_icd(raw_icd)
        if normalized not in diag_vocab:
            continue
        diag_idx = diag_vocab[normalized]
        # Keep only drugs that exist in the drug vocabulary.
        gt_index[diag_idx] = [drug_vocab[ndc] for ndc in gt[raw_icd]
                              if ndc in drug_vocab]
    for diag_idx in gt_index:
        gt_index[diag_idx] = list(set(gt_index[diag_idx]))
    dump(gt_index, "icd_to_ndc_index.pkl")

    # Invert to get the drug-index -> diagnosis-indices direction.
    ndc_to_icd = dd(list)
    for diag_idx, drug_idxs in gt_index.items():
        for drug_idx in drug_idxs:
            ndc_to_icd[drug_idx].append(diag_idx)
    for drug_idx in ndc_to_icd:
        ndc_to_icd[drug_idx] = list(set(ndc_to_icd[drug_idx]))
    dump(dict(ndc_to_icd), "ndc_to_icd_index.pkl")
Ejemplo n.º 3
0
def train():
    """Train the Seq2Seq model on the MIMIC episode splits."""
    input_vocab = load("diag_vocab.pkl")
    output_vocab = load("drug_vocab.pkl")
    test_set = load("mimic_episodes_index_test.pkl")
    train_set = load("mimic_episodes_index_train.pkl")
    seq2seq = Seq2Seq()
    seq2seq.load_data(input_vocab, output_vocab, train_set, test_set)
    # create_model(False) builds the graph on the seq2seq object; the
    # returned model was never used, so the unused local was dropped.
    seq2seq.create_model(False)
    seq2seq.fit()
Ejemplo n.º 4
0
def sort_encounter():
    """Sort the drug list of every encounter in the MIMIC GPI train/dev
    splits and save the sorted copies."""
    train = load("mimic_encounter_gpi.train.pkl")
    test = load("mimic_encounter_gpi.dev.pkl")
    sorted_train = [(enc[0], sorted(enc[1])) for enc in train]
    sorted_test = [(enc[0], sorted(enc[1])) for enc in test]
    dump(sorted_train, "mimic_encounter_gpi_sorted.train.pkl")
    dump(sorted_test, "mimic_encounter_gpi_sorted.dev.pkl")
Ejemplo n.º 5
0
def train_sutter():
    """Train the Seq2Seq model on the Sutter encounters.

    The first 1M encounters form the training split; the remainder are
    the test split.  Codes are mapped to vocabulary indices first.
    """
    input_vocab = load("sutter_diag_vocab.pkl")
    output_vocab = load("sutter_drug_vocab_3.pkl")
    encounters = load("sutter_encounters_3.pkl")
    train_set = []
    for enc in encounters[:1000000]:
        train_set.append(([input_vocab[code] for code in enc[0]], [output_vocab[code] for code in enc[1]]))
    test_set = []
    for enc in encounters[1000000:]:
        test_set.append(([input_vocab[code] for code in enc[0]], [output_vocab[code] for code in enc[1]]))
    seq2seq = Seq2Seq()
    seq2seq.load_data(input_vocab, output_vocab, train_set, test_set)
    # The graph lives on the seq2seq object; the previously unused
    # `model = ...` local was dropped.
    seq2seq.create_model(False)
    seq2seq.fit()
Ejemplo n.º 6
0
def get_icd_to_gpi_map():
    """Compose ICD->NDC with NDC->GPI to build ICD<->GPI mappings, then
    persist both directions in a single pickle."""
    icd_to_ndc = load("icd_to_ndc.pkl")
    ndc_to_gpi = load("ndc_to_gpi_6.pkl")
    icd_to_gpi = dd(set)
    for icd, ndcs in icd_to_ndc.items():
        for ndc in ndcs:
            # NDCs without a GPI translation are skipped.
            if ndc in ndc_to_gpi:
                icd_to_gpi[icd].add(ndc_to_gpi[ndc])
    # Invert to obtain the GPI -> ICD direction.
    gpi_to_icd = dd(set)
    for icd, gpis in icd_to_gpi.items():
        for gpi in gpis:
            gpi_to_icd[gpi].add(icd)

    dump((dict(icd_to_gpi), dict(gpi_to_icd)), "icd_gpi_map.pkl")
Ejemplo n.º 7
0
def train_mimic():
    """Fit the most-frequent-match baseline on the MIMIC GPI train split
    and dump (input, truth, prediction) triples for the dev split."""
    train_set = load("mimic_encounter_gpi.train.pkl")
    test_set = load("mimic_encounter_gpi.dev.pkl")
    mfm = MostFreqMatch(3, "mimic")
    mfm.fit(train_set)
    results = []
    prediction_list = []
    truth_list = []
    for pair in test_set:
        predicted = mfm.predict(pair[0])
        prediction_list.append(predicted)
        truth_list.append(pair[1])
        results.append((pair[0], pair[1], predicted))
    dump(results, "mimic_result_freq.pkl")
Ejemplo n.º 8
0
def eval_sutter():
    """Run the rule-based baseline over the Sutter dev split and dump
    (input, truth, prediction) triples."""
    # Vocabulary loads preserved from the original (values unused here).
    input_vocab = load("sutter_diag_vocab.pkl")
    output_vocab = load("sutter_drug_vocab_3.pkl")
    test_set = load("sutter_encounter.dev.pkl")
    rb = RuleBased(1)
    rb.load()
    results = []
    prediction_list = []
    truth_list = []
    for pair in test_set:
        predicted = rb.predict(pair[0])
        prediction_list.append(predicted)
        truth_list.append(pair[1])
        results.append((pair[0], pair[1], predicted))
    dump(results, "sutter_result_rule.pkl")
Ejemplo n.º 9
0
def evaluate(data_test,
             path_images,
             model_file,
             file_ingredients,
             output_file_accuracy='test_accuracy.npy'):
    """Evaluate our final model with the test data.

    (Only execute on the final model.)

    Args:
        data_test: test-split descriptor understood by ``data.load``.
        path_images: directory containing the input images.
        model_file: path to the saved Keras model (.h5).
        file_ingredients: ingredients vocabulary file for ``data.load``.
        output_file_accuracy: destination for the [loss, accuracy] array.
    """
    # Fixed: the original used Python-2 print statements in an otherwise
    # Python-3 file.
    print('Evaluating test data...')
    input_images_test, input_ingredients_test = data.load(
        data_test,
        path_images,
        img_width=C.IMG_WIDTH,
        img_height=C.IMG_HEIGHT,
        file_ingredients=file_ingredients)
    # Returns a compiled model identical to model.h5
    assert os.path.exists(
        model_file), 'File for the model <{}> not found.'.format(model_file)

    model = load_model(model_file)
    score = model.evaluate(x=input_images_test,
                           y=input_ingredients_test,
                           batch_size=32,
                           verbose=1,
                           sample_weight=None)

    # np.save writes binary data: the file must be opened in 'wb' (text
    # mode raises TypeError on Python 3), and the context manager
    # guarantees it is closed.
    with open(output_file_accuracy, 'wb') as f_out:
        np.save(f_out, score)

    print('loss={}, accuracy={}'.format(score[0], score[1] * 100))
    print(score)
Ejemplo n.º 10
0
def build_ground_truth():
    """Construct ICD->NDC and NDC->ICD golden mappings from the MEDI CSV
    and the RxNorm->NDC pickle, and persist both directions."""
    rx_to_ndc = load("rx_to_ndc.pkl")
    rx_to_icd = dd(list)
    icd_to_ndc = dd(list)

    with open("./data/MEDI_11242015.csv") as f_in:
        for line_no, line in enumerate(f_in):
            if line_no == 0:  # skip the header row
                continue
            fields = line.strip().split(",")
            rx_to_icd[fields[0]].append(fields[5])

    # Only RxNorm codes present in both tables can be bridged.
    for rx in set(rx_to_icd) & set(rx_to_ndc):
        for ndc in rx_to_ndc[rx]:
            for icd in rx_to_icd[rx]:
                # Expand each ICD code to its children so more specific
                # diagnosis codes inherit the drug mapping.
                for child in icd9.get_children(icd):
                    icd_to_ndc[child].append(ndc)

    for icd in icd_to_ndc:
        icd_to_ndc[icd] = list(set(icd_to_ndc[icd]))
    dump(dict(icd_to_ndc), "icd_to_ndc.pkl")

    # Invert to obtain the NDC -> ICD direction.
    ndc_to_icd = dd(list)
    for icd, ndcs in icd_to_ndc.items():
        for ndc in ndcs:
            ndc_to_icd[ndc].append(icd)
    for ndc in ndc_to_icd:
        ndc_to_icd[ndc] = list(set(ndc_to_icd[ndc]))
    dump(dict(ndc_to_icd), "ndc_to_icd.pkl")
Ejemplo n.º 11
0
def order_encounters(name):
    """Write four copies of a dataset, each with the drug list (enc[1])
    ordered differently: vocabulary order, random, most-frequent-first,
    and rarest-first."""
    import random
    print(name)
    encounters = load(name + '.pkl')
    orders = ["voc", "random", "freq", 'rare']
    ordered = [[] for _ in orders]
    # Count global code frequencies across all encounters.
    counters = dd(int)
    for enc in encounters:
        for code in enc[1]:
            counters[code] += 1
    vocab = sorted(counters)
    code_to_vocab_index = {code: idx for idx, code in enumerate(vocab)}
    for enc in encounters:
        if not enc[1]:
            continue
        for i, order in enumerate(orders):
            codes = list(set(enc[1]))
            if order == "voc":
                codes.sort(key=lambda c: code_to_vocab_index[c])
            elif order == "random":
                random.shuffle(codes)
            elif order == "freq":
                codes.sort(key=lambda c: counters[c], reverse=True)
            elif order == "rare":
                codes.sort(key=lambda c: counters[c])
            # Every name in `orders` matches a branch above, so the
            # unconditional append mirrors the original per-branch appends.
            ordered[i].append((enc[0], codes))
    for i, order in enumerate(orders):
        dump(ordered[i], name + "_" + order + ".pkl")
Ejemplo n.º 12
0
def train_mimicq():
    """Train an MLP on index-encoded MIMIC encounters and run it on the
    dev split."""
    input_vocab = load("mimic_diag_vocab.pkl")
    output_vocab = load("mimic_drug_vocab.pkl")
    train_encounters = load("mimic_encounter_gpi.train.pkl")
    test_encounters = load("mimic_encounter_gpi.dev.pkl")

    def to_indices(enc):
        # Map raw codes to vocabulary indices for one encounter.
        return ([input_vocab[c] for c in enc[0]],
                [output_vocab[c] for c in enc[1]])

    train_set = [to_indices(enc) for enc in train_encounters]
    test_set = [to_indices(enc) for enc in test_encounters]
    mlp = MLP("mimic")
    mlp.load_data(train_set, test_set, len(input_vocab), len(output_vocab))
    mlp.build_model()
    mlp.fit(5)
    mlp.predict(test_set)
Ejemplo n.º 13
0
def evaluate(name):
    """Load a saved results pickle and split it into parallel input,
    ground-truth, and prediction lists."""
    results = load(name)
    input_list = [entry[0] for entry in results]
    truth_list = [entry[1] for entry in results]
    prediction_list = [entry[2] for entry in results]
    return input_list, truth_list, prediction_list
def get_Data(labels_test):
    """Load the train/validation split.

    labels_test == 4 toggles OSM roads; labels_test == 3 toggles
    structure splitting.  Test split and normalizer are discarded.
    """
    train_size = int(19386625 * 0.2)
    X_train, y_train, _X_test, _y_test, X_val, y_val, _norm = data.load(
        train_size,
        normalize=False,
        balance=False,
        osm_roads=(labels_test == 4),
        split_struct=(labels_test == 3))
    return X_train, y_train, X_val, y_val
Ejemplo n.º 15
0
def gen_parallel_text():
    """Dump the Sutter encounters as two aligned plain-text files: one
    diagnosis-code line per encounter in sutter_diag.txt and the matching
    drug-code line in sutter_drug.txt."""
    encounters = load("sutter_encounters_3.pkl")
    # Context managers guarantee both files are closed (and flushed) even
    # if an encounter is malformed and raises mid-loop; the originals
    # were only closed on the happy path.
    with open("sutter_diag.txt", "w") as f_diag, \
            open("sutter_drug.txt", "w") as f_drug:
        for enc in encounters:
            f_diag.write(" ".join(enc[0]) + "\n")
            f_drug.write(" ".join(enc[1]) + "\n")
Ejemplo n.º 16
0
    def decode(self, test_set):
        """Greedily decode an output sequence for each input pair in
        *test_set*, printing one line of decoded tokens per pair.

        Each element of *test_set* is a (input_tokens, target_tokens)
        pair; only pair[0] is consumed here.
        """
        # Create model and load parameters.
        model = self.model
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        input_vocab = load("diag_vocab.pkl")
        output_vocab = load("drug_vocab.pkl")

        # Invert the output vocabulary (token -> id) into id -> token so
        # decoded ids can be printed as tokens.
        output_id_to_token = {}
        for token in output_vocab:
            output_id_to_token[output_vocab[token]] = token

        # Print the prompt once up front (mirrors interactive decoding).
        sys.stdout.write("> ")
        sys.stdout.flush()
        for pair in test_set:
            # Get token-ids for the input sentence.
            token_ids = [input_vocab[token] for token in pair[0]]
            # Which bucket does it belong to?  Pick the smallest bucket
            # that fits; the for/else falls back to the largest bucket
            # (set before the loop) and warns when no bucket fits.
            bucket_id = len(self.buckets) - 1
            for i, bucket in enumerate(self.buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", pair[0])

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = self.get_batch(
                    {bucket_id: [(token_ids, [])]}, bucket_id
            )
            # Get output logits for the sentence.
            _, _, output_logits = model.step(self.session, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if self.EOS_ID in outputs:
                outputs = outputs[:outputs.index(self.EOS_ID)]
            # Print the decoded tokens corresponding to the output ids.
            print(" ".join([output_id_to_token[output] for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
Ejemplo n.º 17
0
def get_data():
    """Collect per-epoch Jaccard scores from result-file names, cache
    them as a pickle, and plot one training trajectory per ordering
    strategy."""
    dataset = "sutter"
    level = 2
    import os
    results = dd(list)
    xs = {}
    # Result filenames encode (strategy, ..., epoch-major, epoch-minor,
    # jaccard) in underscore-separated fields.
    for file in os.listdir("/home/yzhang3151/project/AutoPrescribe2/data"):
        if file.endswith(".pkl") and file.startswith("%s_%s" % (dataset, level)):
            parts = file.split("_")
            results[parts[2]].append(
                (int(parts[5][1:]), int(parts[6][1:]),
                 float(parts[7].replace("jacc", ""))))

    for key in results:
        xs[key] = [entry[2]
                   for entry in sorted(results[key], key=lambda e: (e[0], e[1]))]

    dump(xs, "traj_%s_%s.pkl" % (dataset, level))

    xs = load("traj_%s_%s.pkl" % (dataset, level))

    name_mapping = {
        "voc": "Vocabulary",
        "random": "Random",
        "freq": "Frequent first",
        "rare": "Rare first"
    }
    line_type = {
        "random": "-",
        "freq": "--",
        "rare": "s",
        "voc": "^"
    }

    fig, ax = plt.subplots(figsize=(8, 4))

    # Truncate every trajectory to the length of the "random" run so all
    # curves share the same x-range.
    for key in name_mapping:
        ax.plot(xs[key][:len(xs["random"])], line_type[key], linewidth=2,
                label=name_mapping[key])

    ax.set_xlabel("Epochs", fontsize=20)
    ax.set_ylabel("Jaccard Coefficient", fontsize=20)
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(15)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(15)
    ax.legend(loc='lower right', fontsize=20)
    fig.tight_layout()
    plt.savefig("traj_%s_%s.pdf" % (dataset, level))
Ejemplo n.º 18
0
def model(dfs):
    """Randomized hyper-parameter search for a random-forest classifier;
    prints best CV parameters, held-out kappa, and a classification
    report."""
    start = time.time()
    train_size = 100_000
    X_train, y_train, X_test, y_test, _, _, _ = data.load(train_size,
                                                          datafiles=dfs,
                                                          normalize=False,
                                                          osm_roads=False,
                                                          split_struct=False)

    # Search space for the randomized CV sweep.
    tuning_params = {
        'max_depth': [75, 80, 130, None],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 3, 6, 8],
        'n_estimators': [500, 1000, 1500, 2000],
        'n_jobs': [-1],
        'class_weight': ['balanced', None, 'balanced_subsample']
    }

    print(
        f'# Tuning hyper-parameters for Random forest on { X_train.shape[0] } samples'
    )
    print()
    kappa_scorer = make_scorer(cohen_kappa_score)
    gs = RandomizedSearchCV(RandomForestClassifier(),
                            tuning_params,
                            cv=3,
                            scoring={'kappa': kappa_scorer},
                            refit='kappa',
                            return_train_score=True,
                            n_iter=5,
                            verbose=2)
    gs.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(gs.best_params_)
    print()
    print(gs.best_score_)
    print()

    best = gs.best_estimator_
    y_pred = best.predict(X_test)

    kappa = cohen_kappa_score(y_test, y_pred)
    # Computed as in the original (which assigned it but never printed).
    confusion_matrix(y_test, y_pred)
    print(f'Kappa: {kappa}')
    print(classification_report(y_test, y_pred))

    elapsed = time.time() - start
    print("Run time: " + str(timedelta(seconds=elapsed)))
Ejemplo n.º 19
0
def run():
    """Train an attention seq2seq model on the Sutter encounters.

    NOTE(review): train_x/train_y/test_x/test_y are allocated as all-zero
    matrices but never populated from train_set/test_set before fitting —
    as written, the model trains on zeros.  Confirm whether a multi-hot
    filling step is missing here.
    """
    from utils.data import get_model_path, load
    from utils.eval import Evaluator
    input_vocab = load("sutter_diag_vocab.pkl")
    output_vocab = load("sutter_drug_vocab_3.pkl")
    encounters = load("sutter_encounters_3.pkl")
    # First 1M encounters are the training split, the rest the test split.
    train_set = []
    for enc in encounters[:1000000]:
        train_set.append(([input_vocab[code] for code in enc[0]], [output_vocab[code] for code in enc[1]]))
    test_set = []
    for enc in encounters[1000000:]:
        test_set.append(([input_vocab[code] for code in enc[0]], [output_vocab[code] for code in enc[1]]))
    train_x = np.zeros((len(train_set), len(input_vocab)))
    train_y = np.zeros((len(train_set), len(output_vocab)))
    test_x = np.zeros((len(test_set), len(input_vocab)))
    test_y = np.zeros((len(test_set), len(output_vocab)))
    model = AttentionSeq2Seq(input_dim=len(input_vocab), input_length=10, hidden_dim=100, output_length=10, output_dim=len(output_vocab), depth=4)
    # NOTE(review): string metrics 'precision'/'recall' were removed in
    # Keras 2 — presumably this targets Keras 1.x; verify the environment.
    model.compile(loss='mse', optimizer='rmsprop', metrics=['accuracy', 'precision', 'recall'])
    model.fit(train_x, train_y, validation_data=(test_x[:1000], test_y[:1000]))
Ejemplo n.º 20
0
def train():
    """Fit a most-frequent-match baseline on the MIMIC encounter splits
    and dump (input, truth, prediction) triples for the test split."""
    import numpy as np
    from sklearn import metrics
    level = 2
    data = "mimic"
    input_vocab = load("%s_diag_vocab.pkl" % (data))
    output_vocab = load("%s_drug_vocab_%s.pkl" % (data, level))
    train_encounters = load("%s_encounters_%s.train.pkl" % (data, level))
    test_encounters = load("%s_encounters_%s.test.pkl" % (data, level))

    mfm = MostFreqMatch(3, "mimic")
    mfm.fit(train_encounters)

    results = []
    prediction_list = []
    truth_list = []
    for enc in test_encounters:
        predicted = mfm.predict(enc[0])
        prediction_list.append(predicted)
        truth_list.append(enc[1])
        results.append((enc[0], enc[1], predicted))
    # NOTE(review): the output filename says "sutter" while data == "mimic"
    # — confirm this is intentional.
    dump(results, "sutter_result_freq_%s.pkl" % level)
Ejemplo n.º 21
0
def main(argv):
    """Train an RBF SVM with fixed hyper-parameters, print train/test
    kappa and classification reports, and return 0.

    NOTE(review): everything after the early ``return 0`` below (model
    persistence, full-raster prediction, GeoTIFF export) is unreachable
    dead code — confirm whether the early return is intentional.
    """
    real_start = time.time()

    split_struct=False
    osm_roads=False

    # train_size = int(100_000)
    train_size = int(19_386_625*0.2)
    X_train, y_train, X_test, y_test,_,_,_ = data.load(train_size, normalize=True, osm_roads=osm_roads, split_struct=split_struct)

    start = time.time()
    # Build a sv and compute the feature importances
    # C/gamma presumably come from an earlier hyper-parameter search —
    # TODO confirm.
    sv = svm.SVC(C=6.685338321430641, gamma=6.507029881541734)

    print("Fitting data...")
    sv.fit(X_train, y_train)

    end = time.time()
    elapsed = end-start
    print("Training time: " + str(timedelta(seconds=elapsed)))

    # Training-set performance (optimistic estimate).
    yt_pred = sv.predict(X_train)
    kappa = cohen_kappa_score(y_train, yt_pred)
    print(f'Train Kappa: {kappa}')
    print(classification_report(y_train, yt_pred))

    # Held-out test performance.
    y_pred = sv.predict(X_test)
    kappa = cohen_kappa_score(y_test, y_pred)
    print(f'Kappa: {kappa}')
    print(classification_report(y_test, y_pred))
    return 0

    # --- unreachable below this point (see NOTE in the docstring) ---
    dump(sv, '../sensing_data/models/svm_static_group3.joblib')
    print("Saved model to disk")
    # Testing trash
    X, y, shape = data.load_prediction(
        ratio=1, normalize=True, osm_roads=osm_roads, split_struct=split_struct)

    start_pred = time.time()
    y_pred = sv.predict(X)
    print("Predict time: " + str(timedelta(seconds=time.time()-start_pred)))

    kappa = cohen_kappa_score(y, y_pred)
    print(f'Kappa: {kappa}')
    print(classification_report(y, y_pred))

    yr = y_pred.reshape(shape)

    viz.createGeotiff(OUT_RASTER, yr, DS_FOLDER +
                      "clipped_sentinel2_B08.vrt", gdal.GDT_Byte)

    end = time.time()
    elapsed = end-real_start
    print("Total run time: " + str(timedelta(seconds=elapsed)))
Ejemplo n.º 22
0
def eval_freq():
    """Evaluate the saved most-frequent-match baseline on the Sutter test
    split, dump (input, truth, prediction) triples, and run the standard
    and golden evaluations."""
    level = 2
    # Vocabulary loads preserved from the original (values unused here).
    input_vocab = load("sutter_diag_vocab.pkl")
    output_vocab = load("sutter_drug_vocab_%s.pkl" % level)
    test_set = load("sutter_encounters.test_%s.pkl" % level)
    mfm = MostFreqMatch(1)
    mfm.load("sutter_freq.pkl")
    results = []
    prediction_list = []
    truth_list = []
    for pair in test_set:
        predicted = mfm.predict(pair[0])
        prediction_list.append(predicted)
        truth_list.append(pair[1])
        results.append((pair[0], pair[1], predicted))
    dump(results, "sutter_result_freq_%s.pkl" % level)

    evaluator = Evaluator()
    evaluator.eval(mfm)
    evaluator.eval_golden(mfm)
Ejemplo n.º 23
0
def load_mapping():
    """Strip dots from the ICD codes in the diagnosis<->drug mapping and
    save the result in MIMIC's dot-free code format."""
    diag_drug_mapping = load("diag_drug_mapping.pkl")
    diag_to_drug = {diag.replace(".", ""): drugs
                    for diag, drugs in diag_drug_mapping[0].items()}
    drug_to_diag = {drug: [diag.replace(".", "") for diag in diags]
                    for drug, diags in diag_drug_mapping[1].items()}
    dump((diag_to_drug, drug_to_diag), "mimic_diag_drug_mapping.pkl")
Ejemplo n.º 24
0
def get_average_golden_eval(input_list, prediction_list):
    """Average golden-rule precision/recall over all (input, prediction)
    pairs, printing a running average every 1000 items.

    Args:
        input_list: list of diagnosis-code lists (codes may contain dots,
            which are stripped before lookup).
        prediction_list: parallel list of predicted drug-code lists.

    Returns:
        (mean_precision, mean_recall); (0.0, 0.0) when input_list is
        empty.  (The original accumulated the totals but discarded them;
        callers that ignored the old ``None`` return are unaffected.)
    """
    diag_to_drug, drug_to_diag = load("icd_gpi_map.pkl")
    ave_precision, ave_recall = 0.0, 0.0
    for i, kk in enumerate(input_list):
        item = [k.replace(".", "") for k in kk]
        precision, recall = get_golden_eval(item, prediction_list[i],
                                            diag_to_drug, drug_to_diag)
        ave_precision += precision
        ave_recall += recall
        if i % 1000 == 0:
            print(ave_precision / (i + 1), ave_recall / (i + 1), precision,
                  recall)
    n = len(input_list)
    if n == 0:
        return 0.0, 0.0
    return ave_precision / n, ave_recall / n
def model(dfs):
    """Randomized hyper-parameter sweep for an XGBoost classifier;
    reports the best CV parameters and held-out kappa, classification
    report, and confusion matrix."""
    train_size = int(19386625*0.05)
    X_train, y_train, X_test, y_test, _, _, _ = data.load(train_size, map_classes=False, normalize=False, osm_roads=False, split_struct=False, gt_raster='cos_new_gt_2015t.tiff')

    start = time.time()
    print(f'Tuning on {X_train.shape}')
    xgb_model = xgb.XGBClassifier()
    # Search-space notes: max_depth is usually 6-8; small learning-rate
    # changes can matter; min_child_weight / colsample_bytree help fight
    # overfitting; n_estimators is the number of boosting rounds.
    parameters = {
        'tree_method': ['hist'],
        'predictor': ['cpu_predictor'],
        'learning_rate': uniform(0.001, 0.3),  # `eta` value
        'max_depth': [3, 5, 6, 8],
        'min_child_weight': [1, 3, 5],
        "gamma": [0, 1, 5],
        'colsample_bytree': uniform(0.7, 0.2),
        'n_estimators': [500, 1000, 1500],
        'max_delta_step': uniform(1, 9)}

    kappa_scorer = make_scorer(cohen_kappa_score)
    gs = RandomizedSearchCV(xgb_model, parameters, cv=3, scoring={'kappa': kappa_scorer}, refit='kappa', return_train_score=False, n_iter=200, verbose=1, n_jobs=10)
    gs.fit(X_train, y_train)

    print("Best parameters set found on development set: ")
    print()
    print(gs.best_params_)
    print()
    print(gs.best_score_)
    print()

    best = gs.best_estimator_
    y_pred = best.predict(X_test)

    kappa = cohen_kappa_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    print(f'Kappa: {kappa}')
    print(classification_report(y_test, y_pred))
    print(matrix)

    elapsed = time.time() - start
    print("Run time: " + str(timedelta(seconds=elapsed)))
Ejemplo n.º 26
0
 def __init__(self, ds="mimic"):
     """Load the test set, golden rule, and index->code lookup tables for
     the chosen dataset.

     Args:
         ds: dataset name, "mimic" or "sutter".

     Raises:
         ValueError: if *ds* is not a known dataset (previously an
             unknown name fell through and crashed later with a
             NameError on diag_vocab).
     """
     if ds == "mimic":
         self.test_set = load("mimic_episodes_index_test.pkl")
         self.golden_rule = load("icd_to_ndc_index.pkl")
         diag_vocab = load("diag_vocab.pkl")
         drug_vocab = load("drug_vocab.pkl")
     elif ds == "sutter":
         self.test_set = load("sutter_encounters_3.pkl")
         self.golden_rule = load("icd_to_ndc_index.pkl")
         diag_vocab = load("sutter_diag_vocab.pkl")
         drug_vocab = load("sutter_drug_vocab_3.pkl")
     else:
         raise ValueError("unknown dataset %r (expected 'mimic' or 'sutter')" % ds)
     self.index_to_diag = {}
     self.index_to_drug = {}
     for diag in diag_vocab:
         self.index_to_diag[diag_vocab[diag]] = diag
     for drug in drug_vocab:
         self.index_to_drug[drug_vocab[drug]] = drug
Ejemplo n.º 27
0
def main():
    """Select a book-specific config, then run the full pipeline:
    load -> preprocess -> build -> train -> generate."""
    args = parse_arg()
    if args.book == 'wonderland':
        from config import DefaultConfig as Config
    elif args.book == 'copperfield':
        from config import Copperfield as Config
    else:
        raise NotImplementedError

    config = Config()
    corpus = load(config)
    X, y, chars, dataX, dataY = preprocessing(corpus, config)
    model, callbacks = build_model(X, y, config)
    train(model, X, y, callbacks, config)
    generate(model, dataX, chars, config)
def model(dfs):
    """Randomized hyper-parameter sweep for an SGD (hinge-loss SVM)
    classifier; reports best CV parameters and held-out kappa,
    classification report, and confusion matrix."""
    train_size = int(19386625*0.05)
    X_train, y_train, X_test, y_test = data.load(
        train_size, datafiles=dfs, normalize=True, balance=False, osm_roads=True)

    start = time.time()
    print(f'Tuning on {X_train.shape}')
    # Search space for the randomized sweep.
    tuning_params = {
        'loss': ['hinge'],
        'penalty': ['elasticnet', 'l2', 'l1', 'none'],
        'alpha': 10.0**-np.arange(1,7),
        'l1_ratio': uniform(0, 1),
        'early_stopping': [True],
        'class_weight': ['balanced'],
        'tol': [1e-3],
        'max_iter': [1000, 500, 1500]
    }

    print(
        f'# Tuning hyper-parameters for Stochastic gradient descent (SVM) on { X_train.shape[0] } samples')
    print()
    kappa_scorer = make_scorer(cohen_kappa_score)
    gs = RandomizedSearchCV(SGDClassifier(), tuning_params, cv=3, scoring={
                            'kappa': kappa_scorer}, refit='kappa', return_train_score=False, n_iter=200, verbose=2, n_jobs=-1)
    gs.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(gs.best_params_)
    print()
    print(gs.best_score_)
    print()

    best = gs.best_estimator_
    y_pred = best.predict(X_test)

    kappa = cohen_kappa_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    print(f'Kappa: {kappa}')
    print(classification_report(y_test, y_pred))
    print(matrix)

    elapsed = time.time() - start
    print("Run time: " + str(timedelta(seconds=elapsed)))
def model(dfs):
    """Randomized C/gamma search for an RBF SVM; prints best parameters,
    kappa, a classification report, and plots the confusion matrix."""
    start = time.time()
    train_size = 100_000

    X_train, y_train, X_test, y_test = data.load(train_size,
                                                 normalize=True,
                                                 balance=False,
                                                 osm_roads=False)
    # C and gamma are sampled uniformly from (0, 8].
    tuning_params = {
        'C': uniform(loc=0, scale=8),
        'gamma': uniform(loc=0, scale=8),
        'class_weight': ['balanced'],
    }

    kappa_scorer = make_scorer(cohen_kappa_score)
    gs = RandomizedSearchCV(svm.SVC(),
                            tuning_params,
                            cv=3,
                            scoring={'kappa': kappa_scorer},
                            refit='kappa',
                            return_train_score=False,
                            n_iter=50,
                            verbose=1,
                            n_jobs=-1)
    gs.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(gs.best_params_)
    print()
    print(gs.best_score_)
    print()

    best = gs.best_estimator_
    y_pred = best.predict(X_test)

    kappa = cohen_kappa_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    print(f'Kappa: {kappa}')
    print(classification_report(y_test, y_pred))

    elapsed = time.time() - start
    print("Run time: " + str(timedelta(seconds=elapsed)))

    viz.plot_confusionmx(matrix)
Ejemplo n.º 30
0
def model(dfs):
    """Train a small dense softmax network on the loaded split and report
    kappa, a classification report, and the confusion matrix."""
    start = time.time()
    train_size = 100_000
    X_train, y_train, X_test, y_test = data.load(train_size,
                                                 normalize=True,
                                                 balance=False)

    n_features = X_train.shape[1]
    n_classes = 4

    # One-hot encode labels for categorical cross-entropy.
    y_train_onehot = tf.keras.utils.to_categorical(y_train, num_classes=n_classes)

    dnn = Sequential()
    # Network structure: one hidden ReLU layer, softmax output.
    dnn.add(Dense(32, input_dim=n_features, activation='relu'))
    dnn.add(Dense(units=n_classes, activation='softmax'))

    dnn.compile(loss='categorical_crossentropy',
                optimizer='Adadelta',
                metrics=['accuracy'])
    dnn.summary()

    history = dnn.fit(X_train,
                      y_train_onehot,
                      epochs=10,
                      validation_split=0.1)

    # plot the accuracy
    gen_graph(history, "ResNet50 RMSprop")

    y_pred_onehot = dnn.predict(X_test)
    # Convert softmax probabilities back to class ids.
    y_pred = [np.argmax(probs) for probs in y_pred_onehot]

    kappa = cohen_kappa_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    print(f'Kappa: {kappa}')
    print(classification_report(y_test, y_pred))

    elapsed = time.time() - start
    print("Run time: " + str(timedelta(seconds=elapsed)))

    viz.plot_confusionmx(matrix)
def get_segregation(
        file_path="/computed/sentence_analysis_reg.pkl.gz",
        error_metric=regression_metric):
    """Group per-sample prediction errors by their true rating (0-4).

    Args:
        file_path: pickle (relative to *root*) with (module, test data,
            test labels).
        error_metric: callable(prediction, truth) -> scalar error.

    Returns:
        dict mapping rating -> ascending-sorted list of
        (error, review_index) pairs.
    """
    # Loading necessary data.
    best_module, testData, Y_test = data.load(root + file_path)
    n_samples = len(Y_test)

    # Computing error and sorting and grouping errors by rating groups.
    Y_pred = ModuleValidator.calculateModuleOutput(best_module, testData)
    # Python-3 fixes: range() replaces xrange, and zip() is materialized
    # with list() so the pairs can be indexed below.
    error = [error_metric(Y_pred[i], Y_test[i]) for i in range(n_samples)]
    err_and_revidx = list(zip(error, range(n_samples)))

    sorted_err = {0: [], 1: [], 2: [], 3: [], 4: []}
    # For some reason the last n_samples/2 are corrupted and are not
    # aligned to the reviews, so only the first half is used.
    # // preserves the integer division Python 2's `/` performed here.
    for idx in range(n_samples // 2):
        sorted_err[Y_test[idx]].append(err_and_revidx[idx])
    for idx in range(5):
        sorted_err[idx] = sorted(sorted_err[idx])

    return sorted_err
def extract_reviews_in_category(category="Pizza"):
    """Filter the randomized Yelp review dump down to the reviews whose
    business belongs to *category*, writing them to a JSON-lines file.

    Args:
        category: business category name; must be a key of the
            precomputed business_by_category mapping.
    """
    filepath = root + "/dataset/yelp_academic_dataset_review_randomize.json"
    savepath = root + "/dataset/yelp_academic_dataset_review_" + category.lower() + ".json"

    business_by_category = data.load("business_by_category.pkl.gz")
    # TODO check that the input category is present in the list!
    valid_businesses = business_by_category[category]

    # Context managers close both files even on error; streaming the
    # input avoids reading the whole dump into memory (readlines()).
    with open(filepath) as reviews_file, open(savepath, "w") as save_file:
        for line_json in reviews_file:
            review_dict = json.loads(line_json)
            business_id = review_dict["business_id"]
            line_json = line_json.rstrip("\n")
            if business_id in valid_businesses:
                print(line_json, file=save_file)
Ejemplo n.º 33
0
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure import TanhLayer
from pybrain.structure.modules   import SoftmaxLayer
from pybrain.datasets import ClassificationDataSet
from pybrain.supervised.trainers import BackpropTrainer

import numpy as np

# Resolve the project root from this file's location.
root = data.getParent(__file__)

#training_filename = root + '/computed/prototypes_tfidf.pkl.gz'
#weights_filename = root + '/computed/bestweights_classification_noregul.pkl.gz'
training_filename = root + '/computed/prototypes_sentence_regul_tfidf.pkl.gz'
bestweights_filename = root + '/computed/proto_final_sentence_regul_tfidf.pkl.gz'

# (train, valid, test) each unpack into an (X, Y) pair below.
train, valid, test = data.load(training_filename)
bestweights = data.load(bestweights_filename)
nunits = 50  # input feature dimensionality and hidden-layer width

X_train, Y_train = train
X_valid, Y_valid = valid
X_test, Y_test = test

# 5-class classifier: nunits inputs -> nunits tanh hidden -> 5 softmax.
net = buildNetwork(nunits, nunits, 5, bias=True, hiddenclass=TanhLayer, outclass=SoftmaxLayer)
# fast requires arac which is a pain in the butt to install but doable

def createDataset(X, Y):
  """Wrap feature matrix X and integer labels Y in a PyBrain dataset.

  Targets are attached as a column vector and then expanded to one-hot
  ('one of many') encoding, which the softmax classification trainer
  expects.
  """
  ds = ClassificationDataSet(nunits, 1, nb_classes=5)
  ds.setField('input', X)
  ds.setField('target', np.asmatrix(Y).T)
  ds._convertToOneOfMany()
  # Bug fix: the dataset was built but never returned, making the helper
  # useless to callers (the identical helper later in this project does
  # return it).
  return ds
Ejemplo n.º 34
0
def load_data(dataset, cast=True):
    ''' Loads the dataset into Theano shared variables.

    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)
    :param cast: when True, label tensors are cast to int32 so they can be
        used as indices; when False the raw floatX labels are returned.
    :return: list of (x, y) shared-variable pairs for the train,
        validation and test splits, in that order.
    '''

    #############
    # LOAD DATA #
    #############

    # If only a bare filename was given and it does not exist in the
    # working directory, fall back to the project's ../dataset directory.
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        candidate = os.path.join(os.path.split(__file__)[0], "..", "dataset", dataset)
        if os.path.isfile(candidate):
            dataset = candidate

    # Each split is a tuple (input, target): input is a 2-D ndarray with
    # one example per row; target is a 1-D ndarray of the same length
    # giving the label of the example at the same row index.
    train_set, valid_set, test_set = data.load(dataset)

    def _as_shared(split, cast=True):
        """Copy one (input, target) split into Theano shared variables.

        Shared variables let Theano move the data into GPU memory once
        (when running on GPU) instead of copying every minibatch, which
        would be a large performance hit. GPU storage must be floatX, so
        the labels are stored as floats and, when *cast* is requested,
        cast back to int32 on the fly so they can serve as indices.
        """
        xs, ys = split
        shared_x = theano.shared(np.asarray(xs, dtype=theano.config.floatX),
                                 borrow=True)
        shared_y = theano.shared(np.asarray(ys, dtype=theano.config.floatX),
                                 borrow=True)
        if cast:
            return shared_x, T.cast(shared_y, 'int32')
        return shared_x, shared_y

    return [_as_shared(split, cast=cast)
            for split in (train_set, valid_set, test_set)]
Ejemplo n.º 35
0
# yelp data
dataset_train_filename = root + "/dataset/yelp_academic_dataset_review_training.json"
dataset_holdout_filename = root + "/dataset/holdout/yelp_academic_dataset_review_holdout.json"

# Precomputed gensim artifacts: bag-of-words corpus, its dictionary, the
# fitted tf-idf model, the learned prototype weights, and the output file.
corpus_filename = root + '/computed/corpustrain.mm'
dict_filename = root + '/computed/corpustrain.dict'
tfidf_filename = root + '/computed/tfidf.model'
weights_filename = root + '/computed/proto_tfidf_noregul_weights.counter'
training_filename = root + '/computed/prototypes_sentence_cos_noregul_tfidf.pkl.gz'

corpus_train = corpora.MmCorpus(corpus_filename)
dictionary_train = corpora.Dictionary.load(dict_filename)
tfidf_model = tfidfmodel.TfidfModel.load(tfidf_filename)
corpus_train_tfidf = tfidf_model[corpus_train]  # tf-idf-weighted view of the corpus
weights = data.load(weights_filename)


# Word-representation MATLAB file; 'We' holds the prototype vectors.
# NOTE(review): wordrep_filename is defined elsewhere in this module.
wordreps = io.loadmat(wordrep_filename)
prototypes = wordreps['We']

numbercosines = 5  # presumably the number of cosine features kept -- TODO confirm

def getValues(weights, filename):
  X = []
  Y = []
  leftouts = 0

  for sentence_tokens, stars in generateYelpSentenceExample(filename):
    sentence_proto = []
    sentence_weights = []
# vectors libs
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure import TanhLayer
from pybrain.structure.modules   import SoftmaxLayer
from pybrain.datasets import ClassificationDataSet
from pybrain.supervised.trainers import BackpropTrainer

import numpy as np

# Project root, resolved relative to this source file.
root = data.getParent(__file__)

# Sentence-level cosine prototypes and the best weights found for them.
training_filename = root + '/computed/prototypes_sentence_cos_noregul_tfidf.pkl.gz'
bestweights_filename = root + '/computed/proto_final_sentence_cos_noregul_tfidf.pkl.gz'
nunits = 55  # input feature dimensionality; also reused as hidden-layer size

# Each split is an (X, Y) pair: feature matrix and labels.
train, valid, test = data.load(training_filename)

X_train, Y_train = train
X_valid, Y_valid = valid
X_test, Y_test = test

# nunits-in / nunits-hidden (tanh) / 5-out (softmax) network; fast=True
# uses the arac native backend.
net = buildNetwork(nunits, nunits, 5, bias=True, hiddenclass=TanhLayer, outclass=SoftmaxLayer, fast=True)
# fast requires arac which is a pain in the butt to install but doable

def createDataset(X, Y):
  """Build a PyBrain ClassificationDataSet from features X and labels Y.

  The labels are attached as a column vector and converted to a one-hot
  ('one of many') encoding, as the softmax classification trainer expects.
  """
  dataset = ClassificationDataSet(nunits, 1, nb_classes=5)
  dataset.setField('input', X)
  dataset.setField('target', np.asmatrix(Y).T)
  dataset._convertToOneOfMany()
  return dataset
from collections import Counter
from utils import data

# Tally, for each LDA topic, how often every word was assigned to it.
n_topics = 20
# One word-frequency Counter per topic index.
dict_topics = {i: Counter() for i in range(n_topics)}

print("idx2word")
# Invert the word -> index vocabulary so assignments can be decoded.
word2idx = data.load('slda_word2idx.pkl.gz')
idx2word = dict()
for word in word2idx:
    idx2word[word2idx[word]] = word

print("topics distr")
# Each line is: <doc length> followed by "wordIdx:topic" pairs.
# Bug fix: the file handle was opened at module top and never closed;
# 'with' now guarantees it is released.
with open('word-assignments.dat') as f_word_assign:
    for line in f_word_assign:
        word_assigns = line.split()
        for word_assign in word_assigns[1:]:
            idx, topic = word_assign.split(':')
            word = idx2word[int(idx)]
            dict_topics[int(topic)][word] += 1
# Fit/evaluate a linear regression on the sLDA features and plot the errors.
slda_RMSE = linear_regression(slda_x_train,
                                slda_y_train,
                                slda_x_test,
                                slda_y_test,
                                ["sLDA - Regression","Rating","Error"],
                                "slda_linReg_error.eps");

print "SLDA - RMSE : %1.4e" %slda_RMSE


"""
 ##### LDA Regression #####
"""

print "Loading LDA data"
# Precomputed bag-of-words corpora and the shared word -> index vocabulary.
lda_corpus_train = data.load("lda_corpus_train.pkl.gz");
lda_corpus_test = data.load("lda_corpus_test.pkl.gz");
word2id = data.load("slda_word2idx.pkl.gz");

print "Generating id2word"
# Invert the vocabulary: gensim's LdaModel wants an index -> word mapping.
id2word = dict();
for key in word2id:
    id2word[ word2id[key] ] = key;

print "lda training"
# Unsupervised LDA with 20 topics on the training corpus.
lda = gensim.models.ldamodel.LdaModel( lda_corpus_train, num_topics=20, id2word=id2word);

print "lda inference"
# LdaModel.inference returns a (gamma, sstats) tuple; keep gamma, the
# per-document topic weights, as the regression feature matrix.
lda_x_train = lda.inference( lda_corpus_train );
lda_x_train = lda_x_train[0];
lda_y_train = slda_y_train;  # same rating targets as the sLDA run above