Example 1
def metamap_ner_re(d):
    print("load umls ...")
    UMLS_dict, _ = umls.load_umls_MRCONSO(d.config['norm_dict'])

    predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
    annotation_dir = os.path.join(opt.test_file, 'bioc')
    corpus_dir = os.path.join(opt.test_file, 'txt')

    annotation_files = [
        f for f in listdir(annotation_dir) if isfile(join(annotation_dir, f))
    ]

    ct_ner_predict = 0
    ct_ner_gold = 0
    ct_ner_correct = 0

    ct_norm_predict = 0
    ct_norm_gold = 0
    ct_norm_correct = 0

    for gold_file_name in annotation_files:

        gold_document = parse_one_gold_file(annotation_dir, corpus_dir,
                                            gold_file_name)

        predict_document = metamap.load_metamap_result_from_file(
            join(predict_dir,
                 gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

        ct_ner_gold += len(gold_document.entities)
        ct_ner_predict += len(predict_document.entities)

        for predict_entity in predict_document.entities:

            for gold_entity in gold_document.entities:

                if predict_entity.equals_span(gold_entity):

                    ct_ner_correct += 1

                    break

        p1, p2, p3 = evaluate_for_ehr(gold_document.entities,
                                      predict_document.entities, UMLS_dict)

        ct_norm_gold += p1
        ct_norm_predict += p2
        ct_norm_correct += p3

    p = ct_ner_correct * 1.0 / ct_ner_predict
    r = ct_ner_correct * 1.0 / ct_ner_gold
    f1 = 2.0 * p * r / (p + r)
    print("NER p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))

    p = ct_norm_correct * 1.0 / ct_norm_predict
    r = ct_norm_correct * 1.0 / ct_norm_gold
    f1 = 2.0 * p * r / (p + r)
    print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))
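
A minimal sketch of the micro-averaged precision/recall/F1 computed above, with a guard for empty counts (the helper name micro_prf is illustrative, not part of the original script):

def micro_prf(correct, predicted, gold):
    # fall back to 0.0 instead of raising ZeroDivisionError on empty input
    p = correct * 1.0 / predicted if predicted > 0 else 0.0
    r = correct * 1.0 / gold if gold > 0 else 0.0
    f1 = 2.0 * p * r / (p + r) if (p + r) > 0 else 0.0
    return p, r, f1

# e.g. micro_prf(ct_ner_correct, ct_ner_predict, ct_ner_gold) reproduces the NER
# scores printed above while tolerating documents with no predicted entities
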
Example 2
    def metamap_ner_my_norm(d):
        print("load umls ...")

        UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(d.config['norm_dict'])

        predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
        annotation_dir = os.path.join(opt.test_file, 'bioc')
        corpus_dir = os.path.join(opt.test_file, 'txt')
        annotation_files = [f for f in os.listdir(annotation_dir) if os.path.isfile(os.path.join(annotation_dir, f))]

        if opt.test_in_cpu:
            model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
        else:
            model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
        model.eval()


        ct_norm_predict = 0
        ct_norm_gold = 0
        ct_norm_correct = 0

        correct_counter = Counter()
        wrong_counter = Counter()

        for gold_file_name in annotation_files:
            print("# begin {}".format(gold_file_name))
            gold_document = parse_one_gold_file(annotation_dir, corpus_dir, gold_file_name)

            predict_document = metamap.load_metamap_result_from_file(
                os.path.join(predict_dir, gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

            # copy entities from the MetaMap predictions
            pred_entities = []
            for metamap_entity in predict_document.entities:
                pred = Entity()
                pred.id = metamap_entity.id
                pred.type = metamap_entity.type
                pred.spans = metamap_entity.spans
                pred.section = metamap_entity.section
                pred.name = metamap_entity.name
                pred_entities.append(pred)

            model.process_one_doc(gold_document, pred_entities, UMLS_dict, UMLS_dict_reverse)

            p1, p2, p3 = evaluate_for_ehr(gold_document.entities, pred_entities, UMLS_dict,
                                          predict_document.entities, correct_counter, wrong_counter)

            ct_norm_gold += p1
            ct_norm_predict += p2
            ct_norm_correct += p3

        sorted_correct_entities = OrderedDict(correct_counter.most_common())
        sorted_correct_entities = json.dumps(sorted_correct_entities, indent=4)
        with codecs.open("sorted_correct_entities.txt", 'w', 'UTF-8') as fp:
            fp.write(sorted_correct_entities)

        sorted_wrong_entities = OrderedDict(wrong_counter.most_common())
        sorted_wrong_entities = json.dumps(sorted_wrong_entities, indent=4)
        with codecs.open("sorted_wrong_entities.txt", 'w', 'UTF-8') as fp:
            fp.write(sorted_wrong_entities)

        p = ct_norm_correct * 1.0 / ct_norm_predict
        r = ct_norm_correct * 1.0 / ct_norm_gold
        f1 = 2.0 * p * r / (p + r)
        print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))
Example 3

    logging.info(d.config)

    makedir_and_clear(opt.output)

    logging.info("load data ...")
    train_data = data.loadData(opt.train_file, True, opt.types, opt.type_filter)
    dev_data = data.loadData(opt.dev_file, True, opt.types, opt.type_filter)
    if opt.test_file:
        test_data = data.loadData(opt.test_file, False, opt.types, opt.type_filter)
    else:
        test_data = None

    logging.info("load dict ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(d.config['norm_dict'])
    logging.info("dict concept number {}".format(len(UMLS_dict)))

    train(train_data, dev_data, test_data, d, UMLS_dict, UMLS_dict_reverse, opt, None, False)

elif opt.whattodo == 2:

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    from collections import OrderedDict
    import json
    def metamap_ner_my_norm(d):
        print("load umls ...")
Example 4
def metamap_ner_my_norm(d):
    print("load umls ...")

    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(
        d.config['norm_dict'])

    predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
    annotation_dir = os.path.join(opt.test_file, 'bioc')
    corpus_dir = os.path.join(opt.test_file, 'txt')
    annotation_files = [
        f for f in listdir(annotation_dir) if isfile(join(annotation_dir, f))
    ]

    if opt.norm_rule:
        multi_sieve.init(opt, None, d, UMLS_dict, UMLS_dict_reverse, False)
    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output,
                                                   'norm_neural.pkl'),
                                      map_location='cpu')
        else:
            neural_model = torch.load(
                os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'),
                                   map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()

    ct_norm_predict = 0
    ct_norm_gold = 0
    ct_norm_correct = 0

    for gold_file_name in annotation_files:
        print("# begin {}".format(gold_file_name))
        gold_document = parse_one_gold_file(annotation_dir, corpus_dir,
                                            gold_file_name)

        predict_document = metamap.load_metamap_result_from_file(
            join(predict_dir,
                 gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

        # copy entities from the MetaMap predictions
        pred_entities = []
        for metamap_entity in predict_document.entities:
            pred = Entity()
            pred.id = metamap_entity.id
            pred.type = metamap_entity.type
            pred.spans = metamap_entity.spans
            pred.section = metamap_entity.section
            pred.name = metamap_entity.name
            pred_entities.append(pred)

        if opt.norm_rule:
            multi_sieve.runMultiPassSieve(gold_document, pred_entities,
                                          UMLS_dict, False)
        elif opt.norm_neural:
            neural_model.process_one_doc(gold_document, pred_entities,
                                         UMLS_dict, UMLS_dict_reverse, False)
        elif opt.norm_vsm:
            vsm_model.process_one_doc(gold_document, pred_entities, UMLS_dict,
                                      UMLS_dict_reverse, False)
        else:
            raise RuntimeError("wrong configuration")

        p1, p2, p3 = evaluate_for_ehr(gold_document.entities, pred_entities,
                                      UMLS_dict)

        ct_norm_gold += p1
        ct_norm_predict += p2
        ct_norm_correct += p3

    p = ct_norm_correct * 1.0 / ct_norm_predict
    r = ct_norm_correct * 1.0 / ct_norm_gold
    f1 = 2.0 * p * r / (p + r)
    print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))
Example 5
def pretrain(opt):

    samples_per_epoch = []
    pregenerated_data = Path(opt.instance_dir)
    for i in range(opt.iter):

        epoch_file = pregenerated_data / f"epoch_{i}.json"
        metrics_file = pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({opt.iter})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
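        # for/else: this branch runs only if the loop above never hit "break",
        # i.e. pregenerated files exist for every requested training epoch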
        num_data_epochs = opt.iter

    if opt.gpu >= 0 and torch.cuda.is_available():
        if opt.multi_gpu:
            device = torch.device("cuda")
            n_gpu = torch.cuda.device_count()
        else:
            device = torch.device('cuda', opt.gpu)
            n_gpu = 1
    else:
        device = torch.device("cpu")
        n_gpu = 0

    logging.info("device: {} n_gpu: {}".format(device, n_gpu))

    if opt.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(opt.gradient_accumulation_steps))

    # shrink the per-step batch so that, after gradient_accumulation_steps
    # accumulated steps, one optimizer update still covers the requested batch size
    opt.batch_size = opt.batch_size // opt.gradient_accumulation_steps

    makedir_and_clear(opt.save)

    tokenizer = BertTokenizer.from_pretrained(opt.bert_dir,
                                              do_lower_case=opt.do_lower_case)

    total_train_examples = 0
    for i in range(opt.iter):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples / opt.batch_size /
                                       opt.gradient_accumulation_steps)

    logging.info("load dict ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(opt.norm_dict)
    logging.info("dict concept number {}".format(len(UMLS_dict)))
    dict_alphabet = Alphabet('dict')
    init_dict_alphabet(dict_alphabet, UMLS_dict)
    dict_alphabet.close()
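    # dict_alphabet now indexes every UMLS concept (CUI); its size is used below
    # as the number of normalization labels for the pretraining model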

    # Prepare model
    model, _ = BertForPreTraining.from_pretrained(
        opt.bert_dir, num_norm_labels=get_dict_size(dict_alphabet))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01,
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=opt.lr,
                         warmup=opt.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", opt.batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(opt.iter):
        epoch_dataset = PregeneratedDataset(epoch=epoch,
                                            training_path=pregenerated_data,
                                            tokenizer=tokenizer,
                                            num_data_epochs=num_data_epochs,
                                            dict_alphabet=dict_alphabet)
        train_sampler = RandomSampler(epoch_dataset)

        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=opt.batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        epoch_start = time.time()
        sum_loss = 0
        sum_original_loss = 0
        num_iter = len(train_dataloader)

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next, input_ids_ent, input_mask_ent, norm_label_ids = batch
                loss, original_loss = model(input_ids, segment_ids, input_mask,
                                            lm_label_ids, input_ids_ent,
                                            input_mask_ent, is_next,
                                            norm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                    original_loss = original_loss.mean()
                if opt.gradient_accumulation_steps > 1:
                    loss = loss / opt.gradient_accumulation_steps
                    original_loss = original_loss / opt.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * opt.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")

                if (step + 1) % opt.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                sum_loss += loss.item()
                sum_original_loss += original_loss.item()

        epoch_finish = time.time()
        logging.info(
            "epoch: %s training finished. Time: %.2fs. loss: %.4f, original_loss %.4f"
            % (epoch, epoch_finish - epoch_start, sum_loss / num_iter,
               sum_original_loss / num_iter))

        # Save a trained model
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(
            opt.save, "pytorch_model_{}.bin".format(str(epoch + 1)))
        torch.save(model_to_save.state_dict(), str(output_model_file))
Example 6
def test(data, opt):
    # corpus_dir = join(opt.test_file, 'corpus')
    # corpus_dir = join(opt.test_file, 'txt')
    corpus_dir = opt.test_file

    if opt.nlp_tool == "spacy":
        nlp_tool = spacy.load('en')
    elif opt.nlp_tool == "nltk":
        nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')
    elif opt.nlp_tool == "stanford":
        nlp_tool = StanfordCoreNLP('http://localhost:{0}'.format(9000))
    else:
        raise RuntimeError("invalid nlp tool")

    corpus_files = [
        f for f in listdir(corpus_dir) if isfile(join(corpus_dir, f))
    ]

    model = SeqModel(data, opt)
    if opt.test_in_cpu:
        model.load_state_dict(
            torch.load(os.path.join(opt.output, 'model.pkl'),
                       map_location='cpu'))
    else:
        model.load_state_dict(torch.load(os.path.join(opt.output,
                                                      'model.pkl')))

    dictionary, dictionary_reverse = umls.load_umls_MRCONSO(
        data.config['norm_dict'])
    isMeddra_dict = False
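    # the loaded dictionary is UMLS rather than MedDRA, so the normalizers below
    # are invoked with isMeddra_dict=False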

    # initialize norm models
    if opt.norm_rule and opt.norm_vsm and opt.norm_neural:  # ensemble
        logging.info("use ensemble normer")
        multi_sieve.init(opt, None, data, dictionary, dictionary_reverse,
                         False)
        if opt.ensemble == 'learn':
            if opt.test_in_cpu:
                ensemble_model = torch.load(os.path.join(
                    opt.output, 'ensemble.pkl'),
                                            map_location='cpu')
            else:
                ensemble_model = torch.load(
                    os.path.join(opt.output, 'ensemble.pkl'))
            ensemble_model.eval()
        else:
            if opt.test_in_cpu:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'),
                                       map_location='cpu')
                neural_model = torch.load(os.path.join(opt.output,
                                                       'norm_neural.pkl'),
                                          map_location='cpu')
            else:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
                neural_model = torch.load(
                    os.path.join(opt.output, 'norm_neural.pkl'))

            vsm_model.eval()
            neural_model.eval()

    elif opt.norm_rule:
        logging.info("use rule-based normer")
        multi_sieve.init(opt, None, data, dictionary, dictionary_reverse,
                         False)

    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'),
                                   map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()

    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output,
                                                   'norm_neural.pkl'),
                                      map_location='cpu')
        else:
            neural_model = torch.load(
                os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    else:
        logging.info("no normalization is performed.")

    makedir_and_clear(opt.predict)

    ct_success = 0
    ct_error = 0

    for fileName in corpus_files:
        try:
            start = time.time()
            document, _, _, _ = processOneFile(fileName, None, corpus_dir,
                                               nlp_tool, False, opt.types,
                                               opt.type_filter)

            data.test_texts = []
            data.test_Ids = []
            read_instance_from_one_document(document, data.word_alphabet,
                                            data.char_alphabet,
                                            data.label_alphabet,
                                            data.test_texts, data.test_Ids,
                                            data)

            _, _, _, _, _, pred_results, _ = evaluate(data, opt, model, 'test',
                                                      False, opt.nbest)

            entities = translateResultsintoEntities(document.sentences,
                                                    pred_results)

            if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
                if opt.ensemble == 'learn':
                    ensemble_model.process_one_doc(document, entities,
                                                   dictionary,
                                                   dictionary_reverse,
                                                   isMeddra_dict)
                else:
                    pred_entities1 = copy.deepcopy(entities)
                    pred_entities2 = copy.deepcopy(entities)
                    pred_entities3 = copy.deepcopy(entities)
                    multi_sieve.runMultiPassSieve(document, pred_entities1,
                                                  dictionary, isMeddra_dict)
                    vsm_model.process_one_doc(document, pred_entities2,
                                              dictionary, dictionary_reverse,
                                              isMeddra_dict)
                    neural_model.process_one_doc(document, pred_entities3,
                                                 dictionary,
                                                 dictionary_reverse,
                                                 isMeddra_dict)

                    # merge pred_entities1, pred_entities2, pred_entities3 into entities
                    ensemble.merge_result(pred_entities1, pred_entities2,
                                          pred_entities3, entities, dictionary,
                                          isMeddra_dict,
                                          vsm_model.dict_alphabet, data)

            elif opt.norm_rule:
                multi_sieve.runMultiPassSieve(document, entities, dictionary,
                                              isMeddra_dict)
            elif opt.norm_vsm:
                vsm_model.process_one_doc(document, entities, dictionary,
                                          dictionary_reverse, isMeddra_dict)
            elif opt.norm_neural:
                neural_model.process_one_doc(document, entities, dictionary,
                                             dictionary_reverse, isMeddra_dict)

            dump_results(fileName, entities, opt)

            end = time.time()
            logging.info("processed %s in %.2fs" %
                         (fileName, end - start))

            ct_success += 1
        except Exception as e:
            logging.error("process file {} error: {}".format(fileName, e))
            ct_error += 1

    logging.info("test finished, total {}, error {}".format(
        ct_success + ct_error, ct_error))
Example 7
def make_dictionary(d):
    logging.info("load dict ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(
        d.config['norm_dict'])
    logging.info("dict concept number {}".format(len(UMLS_dict)))

    fp = codecs.open("dictionary.txt", 'w', 'UTF-8')
    fp1 = codecs.open("dictionary_full.txt", 'w', 'UTF-8')

    for cui, concept in UMLS_dict.items():
        new_names = set()
        new_names_full = set()
        write_str = cui + '|'
        write_str1 = cui + '|'
        for i, code in enumerate(concept.codes):
            if i == len(concept.codes) - 1:
                write_str += code + '|'
                write_str1 += code + '|'
            else:
                write_str += code + ','
                write_str1 += code + ','

        for name in concept.names:
            # replace qualifiers such as '(finding)' and 'NOS' with whitespace
            name = dict_refine(name)

            # given a name, output its token set
            new_name, new_name_full = preprocess(name, True, False)
            if len(new_name) == 0 or len(new_name_full) == 0:
                raise RuntimeError("empty after preprocess: {}".format(name))
            # merge all synonyms into one set
            new_names = new_names | new_name
            new_names_full = new_names_full | new_name_full

        for i, name in enumerate(new_names):
            if i == len(new_names) - 1:
                write_str += name
            else:
                write_str += name + ','

        for i, name in enumerate(new_names_full):
            if i == len(new_names_full) - 1:
                write_str1 += name
            else:
                write_str1 += name + ','

        fp.write(write_str + "\n")
        fp1.write(write_str1 + "\n")

    fp.close()
    fp1.close()

    fp = codecs.open("dictionary_reverse.txt", 'w', 'UTF-8')

    for code, cui_list in UMLS_dict_reverse.items():
        write_str = code + '|'
        for i, cui in enumerate(cui_list):
            if i == len(cui_list) - 1:
                write_str += cui
            else:
                write_str += cui + ','

        fp.write(write_str + "\n")

    fp.close()
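
For reference, each line written to dictionary.txt (and dictionary_full.txt) follows the pipe-delimited layout assembled above; the identifiers below are made up purely to illustrate the format:

    C0000000|12345678,87654321|token1,token2,token3

and each line of dictionary_reverse.txt maps a source code back to its CUIs:

    12345678|C0000000,C1111111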