Example #1
def train_svm(train_processed_path):
    # Load abbr index
    abbr_idx_mapper = pickle_reader(train_processed_path +
                                    '/abbr_idx_mapper.pkl')
    abbr_cui2idx_inventory = {}
    os.makedirs(train_processed_path + '/svm_models/', exist_ok=True)
    # generate training data & train model
    for abbr, abbr_idx in tqdm.tqdm(abbr_idx_mapper['abbr2idx'].items()):
        content_vector = pickle_reader(train_processed_path +
                                       '/content_vectors/%d_vector.pkl' %
                                       abbr_idx)
        label2idx = {}
        label_idx = 0
        x = []
        y = []
        for global_instance_idx, doc_id, pos, content_pos, content_vec, content, label in content_vector:
            if label not in label2idx:
                label2idx[label] = label_idx
                label_idx += 1
            x.append(content_vec)
            y.append(label2idx[label])

        abbr_cui2idx_inventory[abbr] = label2idx
        # no need to train if the abbreviation has only one CUI
        if len(label2idx) > 1:
            x_train, y_train = train_sample(x, y, 2000)
            # train svm model
            model = SVC(kernel='rbf', gamma=0.01, C=100).fit(x_train, y_train)
            pickle_writer(
                model,
                train_processed_path + '/svm_models/%d_svm.pkl' % abbr_idx)
    pickle_writer(abbr_cui2idx_inventory,
                  train_processed_path + '/abbr_cui_idx_inventory.pkl')
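A minimal usage sketch (the path is hypothetical, and pickle_reader, pickle_writer and train_sample are project helpers assumed to be on the import path):

train_svm('/data/wsd/mimic/train_processed')
# After this call, svm_models/<abbr_idx>_svm.pkl holds one RBF-kernel SVC per
# ambiguous abbreviation, and abbr_cui_idx_inventory.pkl maps each
# abbreviation to its {CUI: label_idx} dictionary.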
Example #2
def evaluate_score_svm(test_processed_path, train_processed_path):
    # Load abbr index
    abbr_idx_mapper = pickle_reader(test_processed_path +
                                    '/abbr_idx_mapper.pkl')
    abbr_idx_mapper_train = pickle_reader(train_processed_path +
                                          '/abbr_idx_mapper.pkl')
    abbr2train_idx = abbr_idx_mapper_train['abbr2idx']
    abbr_cui2idx_inventory = pickle_reader(train_processed_path +
                                           '/abbr_cui_idx_inventory.pkl')

    count_correct = 0
    count_all = 0
    count_model_correct = 0
    count_model_all = 0
    count_no_label = 0
    count_correct_without_predict = 0
    # generate testing data
    for abbr, abbr_idx in tqdm.tqdm(abbr_idx_mapper['abbr2idx'].items()):
        content_vector = pickle_reader(test_processed_path +
                                       '/content_vectors/%d_vector.pkl' %
                                       abbr_idx)
        if abbr not in abbr_cui2idx_inventory:
            count_all += len(content_vector)
            count_no_label += len(content_vector)
        else:
            label2idx = abbr_cui2idx_inventory[abbr]
            count_all += len(content_vector)
            x = []
            y = []
            for _, _, _, _, content_vec, _, label in content_vector:
                # if the true label was never seen in training
                if label not in label2idx:
                    count_no_label += 1
                # if the abbreviation has only one CUI, the instance is trivially correct
                elif len(label2idx) == 1:
                    count_correct += 1
                    count_correct_without_predict += 1
                # otherwise the model has to predict
                else:
                    x.append(content_vec)
                    y.append(label2idx[label])
            # predict
            if len(y) > 0:
                count_model_all += len(y)
                model = pickle_reader(train_processed_path +
                                      '/svm_models/%d_svm.pkl' %
                                      abbr2train_idx[abbr])
                y_pred = model.predict(np.vstack(x))
                temp_correct = int(np.sum(np.asarray(y) == y_pred))
                count_correct += temp_correct
                count_model_correct += temp_correct

    print("DataSet Accuracy (all instances): ", count_correct / count_all)
    print("Model Accuracy (only ambiguous instances): ",
          count_model_correct / count_model_all)
    print("Num.instances: ", count_all)
    print("Num.gt abbr-CUI mapping not found: ", count_no_label)
    print("Num.correct without predict", count_correct_without_predict)
    print()
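Note that the three counters partition the test set: every instance is either unseen at train time (count_no_label), trivially resolved because its abbreviation maps to a single CUI (count_correct_without_predict), or sent to a per-abbreviation model (count_model_all). A sanity check that could be dropped in just before the print statements:

    assert count_all == count_no_label + count_correct_without_predict + count_model_all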
Example #3
def predict_fasttext_classifier(train_processed_path,
                                test_processed_path,
                                use_pretrain=False,
                                use_softmax=False):
    train_path = train_processed_path + '/fasttext'
    if use_pretrain:
        model_path = train_path + '/model/pre_train'
    else:
        model_path = train_path + '/model'
    # Load abbr index
    train_abbr_idx_mapper = pickle_reader(train_path + '/abbr_idx_mapper.pkl')
    train_abbr2idx = train_abbr_idx_mapper['abbr2idx']
    test_abbr_idx_mapper = pickle_reader(test_processed_path +
                                         '/fasttext/abbr_idx_mapper.pkl')
    test_abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')

    # Load model
    if use_softmax:
        model_file = model_path + '/all_softmax.bin'
    else:
        model_file = model_path + '/all.bin'
    model = load_model(model_file)
    # note: str.lstrip removes a set of characters, not a prefix, so slice
    # off the "__label__" prefix instead
    label_set = set(map(lambda x: x[len("__label__"):], model.get_labels()))

    instance_collection = []
    # generate testing data
    for abbr, test_abbr_idx in tqdm(test_abbr_idx_mapper['abbr2idx'].items()):
        # if abbr not in train_abbr2idx:
        #     for doc_id, pos_list in test_abbr_index[abbr].items():
        #         for global_instance_idx, pos, label in pos_list:
        #             instance_collection.append(InstancePred(index=global_instance_idx, abbr=abbr, sense_pred=None))
        # else:
        eval_abbr_instance_list = txt_reader(test_processed_path +
                                             '/fasttext/dataset/%d.txt' %
                                             test_abbr_idx)
        abbr_instance_idx = 0
        for doc_id, pos_list in test_abbr_index[abbr].items():
            for global_instance_idx, pos, label in pos_list:
                if label not in label_set:
                    instance_collection.append(
                        InstancePred(index=global_instance_idx,
                                     abbr=abbr,
                                     sense_pred=None))
                else:
                    # get instance
                    tokens = eval_abbr_instance_list[abbr_instance_idx].split()
                    label_in_txt = tokens[0][len("__label__"):]
                    assert label == label_in_txt
                    context = " ".join(tokens[1:])
                    pred_label = model.predict(context)[0][0]
                    instance_collection.append(
                        InstancePred(index=global_instance_idx,
                                     abbr=abbr,
                                     sense_pred=pred_label[len("__label__"):]))
                abbr_instance_idx += 1
    # sort collection list based on global instance idx
    instance_collection = sorted(instance_collection, key=lambda x: x.index)
    return instance_collection
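A hypothetical downstream use of the returned collection; sense_pred is None for instances whose label is outside the trained label set, so coverage is worth reporting alongside accuracy:

preds = predict_fasttext_classifier('/data/wsd/mimic/train_processed',
                                    '/data/wsd/mimic/test_processed')
covered = [p for p in preds if p.sense_pred is not None]
print("coverage: %.4f" % (len(covered) / len(preds)))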
Example #4
def svm_cross_validation(train_processed_path, abbr_idx=0):
    """
    Tuning the parameters on largest abbr in train set.

    :param train_processed_path:
    :param abbr_idx:
    :return:
    """
    content_vector = pickle_reader(train_processed_path +
                                   '/content_vectors/%d_vector.pkl' % abbr_idx)
    label2idx = {}
    label_idx = 0
    x = []
    y = []
    for instance_id, doc_id, pos, content_pos, content_vec, content, label in content_vector:
        if label not in label2idx:
            label2idx[label] = label_idx
            label_idx += 1
        x.append(content_vec)
        y.append(label2idx[label])

    x_train, y_train = train_sample(x, y, 500)
    parameters = {'gamma': [1e-4, 1e-3, 1e-2], 'C': [1e-1, 1, 10, 100, 1000]}
    model = SVC(kernel='rbf')
    model_cv = GridSearchCV(model, parameters, cv=5).fit(x_train, y_train)
    print(model_cv.best_params_)
    print(model_cv.best_score_)
    return model_cv
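A usage sketch (hypothetical path). The grid searched here is what justifies the fixed gamma=0.01, C=100 used by train_svm above:

model_cv = svm_cross_validation('/data/wsd/mimic/train_processed', abbr_idx=0)
best_params = model_cv.best_params_  # feed these back into train_svm's SVC(...)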
Example #5
def generate_whole_dataset(processed_path, shuffle=False):
    abbr_idx_mapper = pickle_reader(processed_path +
                                    '/fasttext/abbr_idx_mapper.pkl')
    with open(processed_path + '/fasttext/dataset/all.txt', 'w') as f:
        total_dataset = []
        for abbr, abbr_idx in tqdm(abbr_idx_mapper['abbr2idx'].items()):
            total_dataset.extend(
                txt_reader(processed_path +
                           '/fasttext/dataset/%d.txt' % abbr_idx))
        if shuffle:
            random.shuffle(total_dataset)
        f.write("\n".join(total_dataset))
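A usage sketch (hypothetical path). Shuffling avoids handing fastText long runs of a single abbreviation's examples, since the per-abbreviation files are concatenated in order:

generate_whole_dataset('/data/wsd/mimic/train_processed', shuffle=True)
# writes fasttext/dataset/all.txt, consumed by train_fasttext_classifier below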
Example #6
def train_fasttext_classifier_multi_model(train_processed_path,
                                          use_pretrain=False,
                                          use_softmax=False):
    train_path = train_processed_path + '/fasttext'
    if use_pretrain:
        model_path = train_path + '/model/pre_train'
    else:
        model_path = train_path + '/model'
    if use_softmax:
        loss = 'softmax'
    else:
        loss = 'hs'
    os.makedirs(model_path, exist_ok=True)
    # Load abbr index
    abbr_idx_mapper = pickle_reader(train_path + '/abbr_idx_mapper.pkl')
    abbr_index = AbbrIndex(train_processed_path + '/abbr_index_data.pkl')
    abbr_label_set = {}
    # Load training data & train model
    for abbr, abbr_idx in tqdm(abbr_idx_mapper['abbr2idx'].items()):
        input_file = train_path + '/dataset/%d.txt' % abbr_idx
        model_file = model_path + '/%d.bin' % abbr_idx
        # load label list
        label_set = set()
        for doc_id, pos_list in abbr_index[abbr].items():
            for global_instance_idx, pos, label in pos_list:
                label_set.add(label)
        abbr_label_set[abbr] = label_set
        # no need to train if the abbreviation has only one CUI
        if len(label_set) > 1:
            model_config = {
                'input': input_file,
                'epoch': 50,
                'lr': 0.1,
                'lrUpdateRate': 100,
                'dim': 100,
                'ws': 5,
                'wordNgrams': 2,
                'loss': loss,
                'thread': 60,
            }
            if use_pretrain:
                model_config[
                    'pretrainedVectors'] = train_processed_path + '/fasttext.vec'
            model = train_supervised(**model_config)
            model.save_model(model_file)
    pickle_writer(abbr_label_set, train_path + '/abbr_label_set.pkl')
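A usage sketch (hypothetical path; when use_pretrain=True, a fasttext.vec file of pretrained vectors must already sit under the processed folder):

train_fasttext_classifier_multi_model('/data/wsd/mimic/train_processed',
                                      use_pretrain=True)
# writes one fasttext/model/pre_train/<abbr_idx>.bin per ambiguous
# abbreviation, plus fasttext/abbr_label_set.pkl for use at prediction time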
Example #7
def train_fasttext_classifier(train_processed_path,
                              abbr=None,
                              use_pretrain=False,
                              use_softmax=False):
    train_path = train_processed_path + '/fasttext'
    if use_pretrain:
        model_path = train_path + '/model/pre_train'
    else:
        model_path = train_path + '/model'
    os.makedirs(model_path, exist_ok=True)
    if abbr is None:
        # train on whole dataset
        input_file = train_path + '/dataset/all.txt'
        if use_softmax:
            model_file = model_path + '/all_softmax.bin'
        else:
            model_file = model_path + '/all.bin'
    else:
        # Load abbr index
        abbr_idx_mapper = pickle_reader(train_path + '/abbr_idx_mapper.pkl')
        abbr_idx = abbr_idx_mapper['abbr2idx'][abbr]
        input_file = train_path + '/dataset/%d.txt' % abbr_idx
        model_file = model_path + '/%d.bin' % abbr_idx

    if use_softmax:
        loss = 'softmax'
    else:
        loss = 'hs'
    model_config = {
        'input': input_file,
        'epoch': 50,
        'lr': 0.1,
        'lrUpdateRate': 100,
        'dim': 100,
        'ws': 5,
        'wordNgrams': 2,
        'loss': loss,
        'thread': 60,
    }
    if use_pretrain:
        model_config[
            'pretrainedVectors'] = train_processed_path + '/fasttext.vec'
    model = train_supervised(**model_config)
    model.save_model(model_file)
    return model
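A sketch of the global call (hypothetical path), which assumes generate_whole_dataset above has already written dataset/all.txt:

model = train_fasttext_classifier('/data/wsd/mimic/train_processed',
                                  use_softmax=True)
# saves fasttext/model/all_softmax.bin and returns the trained model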
Example #8
    # tail of a helper: each item of txt_list_processed is an (index, text)
    # pair; restore the original document order, then drop the indices
    txt_list_processed_sorted = sorted(txt_list_processed, key=operator.itemgetter(0))
    return [txt for _, txt in txt_list_processed_sorted]


if __name__ == '__main__':

    ######################################
    # Read texts from dataset
    ######################################
    # BASE_FOLDER = '/home/mengr/Project/wsd/wsd_data/'
    dataset_paths = DataSetPaths(environment='luoz3_x1')
    DATASET_PATH = dataset_paths.upmc_all_no_mark_txt
    OUTPUT_PATH = dataset_paths.upmc_all_no_mark_folder

    PATH_PROCESSED_INVENTORY_PKL = dataset_paths.sense_inventory_pkl

    # Get pickle generated from mimic_inventory.py
    inventory = pickle_reader(PATH_PROCESSED_INVENTORY_PKL)
    inventory_rmapper = inventory['longform-abbr_cui']

    ######################################
    # Processing
    ######################################
    with open(DATASET_PATH, 'r') as f:
        txt_list = f.readlines()
    print("Loaded %d docs from %s" % (len(txt_list), DATASET_PATH))
    # Replace Long forms to abbrs
    mimic_txt_processed = longform_replacer(txt_list, inventory_rmapper, n_jobs=50)
    # Save to file
    txt_writer(mimic_txt_processed, OUTPUT_PATH + 'train_no_mark_longform_replaced.txt')
Example #9
def predict_fasttext_classifier_multi_model(train_processed_path,
                                            test_processed_path,
                                            use_pretrain=False):
    train_path = train_processed_path + '/fasttext'
    if use_pretrain:
        model_path = train_path + '/model/pre_train'
    else:
        model_path = train_path + '/model'
    # Load abbr index
    test_abbr_idx_mapper = pickle_reader(test_processed_path +
                                         '/fasttext/abbr_idx_mapper.pkl')
    test_abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')
    train_abbr_idx_mapper = pickle_reader(train_processed_path +
                                          '/fasttext/abbr_idx_mapper.pkl')
    train_abbr2idx = train_abbr_idx_mapper['abbr2idx']
    train_abbr_label_set = pickle_reader(train_processed_path +
                                         '/fasttext/abbr_label_set.pkl')

    instance_collection = []
    # generate testing data
    for abbr, test_abbr_idx in tqdm(test_abbr_idx_mapper['abbr2idx'].items()):
        if abbr not in train_abbr_label_set:
            for doc_id, pos_list in test_abbr_index[abbr].items():
                for global_instance_idx, pos, label in pos_list:
                    instance_collection.append(
                        InstancePred(index=global_instance_idx,
                                     abbr=abbr,
                                     sense_pred=None))
        else:
            train_label_set = train_abbr_label_set[abbr]
            eval_abbr_instance_list = txt_reader(test_processed_path +
                                                 '/fasttext/dataset/%d.txt' %
                                                 test_abbr_idx)

            abbr_instance_idx = 0
            context_list, global_idx_list = [], []
            for doc_id, pos_list in test_abbr_index[abbr].items():
                for global_instance_idx, pos, label in pos_list:
                    # if the true label was never seen in training
                    if label not in train_label_set:
                        instance_collection.append(
                            InstancePred(index=global_instance_idx,
                                         abbr=abbr,
                                         sense_pred=None))
                    # if the abbreviation has only one CUI
                    elif len(train_label_set) == 1:
                        instance_collection.append(
                            InstancePred(index=global_instance_idx,
                                         abbr=abbr,
                                         sense_pred=label))
                    # otherwise the model has to predict
                    else:
                        # get instance
                        tokens = eval_abbr_instance_list[
                            abbr_instance_idx].split()
                        label_in_txt = tokens[0][len("__label__"):]
                        assert label == label_in_txt
                        context = " ".join(tokens[1:])
                        context_list.append(context)
                        global_idx_list.append(global_instance_idx)
                    abbr_instance_idx += 1
            # predict
            if len(context_list) > 0:
                # Load model
                model_file = model_path + '/%d.bin' % train_abbr2idx[abbr]
                model = load_model(model_file)
                predict_list = model.predict(context_list)[0]
                for idx, predict in zip(global_idx_list, predict_list):
                    instance_collection.append(
                        InstancePred(
                            index=idx,
                            abbr=abbr,
                            sense_pred=predict[0][len("__label__"):]))

    # sort collection list based on global instance idx
    instance_collection = sorted(instance_collection, key=lambda x: x.index)
    return instance_collection
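The multi-model variant is called like the single-model predictor in Example #3 and returns the same kind of InstancePred collection, so the two can share downstream scoring code:

preds = predict_fasttext_classifier_multi_model('/data/wsd/mimic/train_processed',
                                                '/data/wsd/mimic/test_processed',
                                                use_pretrain=True)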
Example #10
def predict_svm(test_processed_path, train_processed_path):
    # Load abbr index
    abbr_idx_mapper = pickle_reader(test_processed_path +
                                    '/abbr_idx_mapper.pkl')
    abbr_idx_mapper_train = pickle_reader(train_processed_path +
                                          '/abbr_idx_mapper.pkl')
    abbr2train_idx = abbr_idx_mapper_train['abbr2idx']
    abbr_cui2idx_inventory = pickle_reader(train_processed_path +
                                           '/abbr_cui_idx_inventory.pkl')

    instance_collection = []
    # generate testing data
    for abbr, abbr_idx in tqdm.tqdm(abbr_idx_mapper['abbr2idx'].items()):
        content_vector = pickle_reader(test_processed_path +
                                       '/content_vectors/%d_vector.pkl' %
                                       abbr_idx)
        if abbr not in abbr_cui2idx_inventory:
            for global_instance_idx, _, _, _, _, _, _ in content_vector:
                instance_collection.append(
                    InstancePred(index=global_instance_idx,
                                 abbr=abbr,
                                 sense_pred=None))
        else:
            label2idx = abbr_cui2idx_inventory[abbr]
            x = []
            y = []
            global_idx_list = []
            for global_instance_idx, _, _, _, content_vec, _, label in content_vector:
                # if the true label was never seen in training
                if label not in label2idx:
                    instance_collection.append(
                        InstancePred(index=global_instance_idx,
                                     abbr=abbr,
                                     sense_pred=None))
                # if the abbreviation has only one CUI
                elif len(label2idx) == 1:
                    instance_collection.append(
                        InstancePred(index=global_instance_idx,
                                     abbr=abbr,
                                     sense_pred=label))
                # otherwise the model has to predict
                else:
                    x.append(content_vec)
                    y.append(label2idx[label])
                    global_idx_list.append(global_instance_idx)
            # predict
            if len(y) > 0:
                model = pickle_reader(train_processed_path +
                                      '/svm_models/%d_svm.pkl' %
                                      abbr2train_idx[abbr])
                y_pred = model.predict(np.vstack(x))

                # get idx2label
                idx2label = {}
                for label, idx in label2idx.items():
                    idx2label[idx] = label

                for idx_pred, global_instance_idx in zip(
                        y_pred, global_idx_list):
                    instance_collection.append(
                        InstancePred(index=global_instance_idx,
                                     abbr=abbr,
                                     sense_pred=idx2label[idx_pred]))

    # sort collection list based on global instance idx
    instance_collection = sorted(instance_collection, key=lambda x: x.index)
    return instance_collection
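A hypothetical end-to-end run of the SVM pipeline, chaining the functions above:

train_svm('/data/wsd/mimic/train_processed')
preds = predict_svm('/data/wsd/mimic/test_processed',
                    '/data/wsd/mimic/train_processed')
# preds is sorted by global instance index; each sense_pred is a predicted CUI,
# the single training CUI for unambiguous abbreviations, or None when no
# prediction could be made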