def bio2relation():
    """Build BERT relation-classification test inputs from typed entities.

    Pairs every FAMILYMEMBER entity with every non-FAMILYMEMBER entity of
    the same document whose sentences lie within GLOBAL_CUTOFF of each
    other, producing two alternative input plans plus a mapping back to the
    original entity pair for each row.

    Writes (side effects):
        REL_TEST/test.tsv   - plan 2 rows (single-text form).
        REL_TESTa/test.tsv  - plan 1 rows (tagged sentence pair form).
        relation_mappings.tsv in both dirs - pickled (doc_id, en1, en2) list.
    """
    TAG = "pred"
    pred_relations_plan1 = []
    pred_relations_plan2 = []
    mapping = []

    typed_entities = pkl_load(Path(NER_TYPING_ROOT) / "typed_entities_ens.pkl")
    for doc_id, ens in typed_entities.items():
        sents = pkl_load(Path(TEXT_AS_SENT_DIR) / f"{doc_id}.sents.pkl")
        sent_bound = creat_sent_altered_boundary(sents)

        # Ordered pairs: the head must be a FAMILYMEMBER, the tail must not.
        for eid1, eid2 in permutations(range(len(ens)), 2):
            en1, en2 = ens[eid1], ens[eid2]
            # e.g. (('son', 'FAMILYMEMBER', (334, 337), (342, 345)), ['NA', 'Son'], 'FAMILYMEMBER')
            if en1[-1].upper() != "FAMILYMEMBER" \
                    or en2[-1].upper() == "FAMILYMEMBER":
                continue

            sie1 = get_sent_idx(en1[0][3], sent_bound)
            sie2 = get_sent_idx(en2[0][3], sent_bound)
            sent_dist = abs(sie1 - sie2)
            if sent_dist > GLOBAL_CUTOFF:
                continue

            tagged_s1, tagged_s2, pure_text1, pure_text2 = \
                insert_tags_for_relation(sents[sie1], sents[sie2],
                                         en1[0], en2[0])
            pred_relations_plan1.append([
                TAG, tagged_s1, tagged_s2, pure_text1, pure_text2,
                f"{sent_dist}", ""
            ])
            tp = generate_bert_relation_without_extra_sentence(
                sents[sie1], sents[sie2], en1[0], en2[0], sents, sie1, sie2)
            pred_relations_plan2.append([TAG, tp, f"{sent_dist}"])
            mapping.append((doc_id, en1, en2))

    # Plan 2 goes to REL_TEST, plan 1 to REL_TESTa; both get the same mapping.
    for out_dir, rels in ((Path(REL_TEST), pred_relations_plan2),
                          (Path(REL_TESTa), pred_relations_plan1)):
        out_dir.mkdir(parents=True, exist_ok=True)
        pkl_save(mapping, out_dir / "relation_mappings.tsv")
        to_tsv(rels, out_dir / "test.tsv")
# --- Ejemplo n.º 2 ---
def read_word_embedding(replace=False):
    """Load (or rebuild) the word-embedding matrix and word->index map.

    The cached files are rebuilt when `replace` is True or either cache file
    is missing; otherwise only the cached word2idx mapping is read back.

    Returns:
        dict: word -> index mapping; index 1 is reserved for "UNKNOWN".
    """
    unknown_dir = "embedding/unknown.npy"
    vectors_dir = "embedding/vectors.npy"
    words_dir = "embedding/words.json"
    word_embeddings_dir = "embedding/word_embeddings.npy"
    word2idx_dir = "embedding/word2idx.json"

    print("read word embedding")
    cache_present = (os.path.exists(word_embeddings_dir)
                     and os.path.exists(word2idx_dir))
    if cache_present and not replace:
        word2idx = utils.json_load(word2idx_dir)
    else:
        vectors = np.load(vectors_dir)
        unknown = np.load(unknown_dir)
        # Vocabulary may be stored as JSON or pickle, selected by extension.
        extension = utils.get_file_extension(words_dir)[1:]
        assert extension in ["json", "pl"]
        if extension == "json":
            words = utils.json_load(words_dir)
        else:
            words = utils.pkl_load(words_dir)
        word2idx = {"UNKNOWN": 1}
        for offset, word in enumerate(words):
            word2idx[word] = offset + 2
        # NOTE(review): word2idx indices start at 1 ("UNKNOWN") while the
        # saved matrix puts the unknown vector at row 0 and words from row 1,
        # so lookups appear shifted by one unless a pad row is prepended at
        # load time — confirm against the embedding consumer.
        vectors = [unknown] + list(vectors)
        np.save(word_embeddings_dir, vectors)
        utils.json_dump(word2idx, word2idx_dir)
    print("vocab: %d words" % (len(word2idx) - 1))
    return word2idx
def bio2typing(res_dir, test_fids, tag=0):
    """Convert BIO NER results into typing-classifier test inputs.

    Extracts entities from the (non-integrated) NER results, then for each
    entity writes one tagged-sentence row into the test set of the matching
    typing classifier: family member, observation, or living status.

    Args:
        res_dir: directory holding the raw NER results.
        test_fids: document ids to process.
        tag: ensemble-run index used to suffix the output files.
    """
    res = non_integrated_results(res_dir, test_fids)
    merged_entities = extract_entities(res, test_fids)
    ner_typing_root = Path(NER_TYPING_ROOT)
    ner_typing_root.mkdir(parents=True, exist_ok=True)
    pkl_save(merged_entities, ner_typing_root / f"merged_entities_{tag}.pkl")
    for test_fid in test_fids:
        # Preprocessed text, split so line index == sentence index below.
        pre_txt = load_text(
            Path(PREPROCESSED_TEXT_DIR) /
            f"{test_fid}.preprocessed.txt").split("\n")
        sents = pkl_load(Path(TEXT_AS_SENT_DIR) / f"{test_fid}.sents.pkl")
        sent_bound = creat_sent_altered_boundary(sents)

        ens = merged_entities[test_fid]
        # Per-file buckets: family members, living statuses, observations.
        fm, ls, ob = [], [], []
        for en_idx, en in enumerate(ens):
            # ('son', 'FAMILYMEMBER', (334, 337), (342, 345))
            en_span = en[-1]
            en_type = en[1].lower()
            sidx = get_sent_idx(en_span, sent_bound)
            en_loc_sent = sents[sidx]
            pure_text = pre_txt[sidx]
            tagged_sent = insert_token_and_creat_text_for_testing(
                en_loc_sent, en_span)

            # Surface suspicious tagged sentences for manual inspection.
            if valida_by_sent(tagged_sent):
                print(test_fid, en, tagged_sent)

            if en_type == "familymember":
                # Family-member rows carry the id twice (the side and the
                # relation classifiers share this input file).
                fm.append([
                    f"{test_fid}@{en_idx}", f"{test_fid}@{en_idx}", pure_text,
                    tagged_sent
                ])
            elif en_type == "observation":
                ob.append([f"{test_fid}@{en_idx}", pure_text, tagged_sent])
            elif en_type == "livingstatus":
                ls.append([f"{test_fid}@{en_idx}", pure_text, tagged_sent])
            else:
                raise RuntimeError(f"{en_type} is not recognized for {en}")

        # # fms, fmr share the same dir
        # NOTE(review): fm/ob/ls are rebuilt for every test_fid but written
        # to the same paths on every loop iteration, so only the last file's
        # rows survive — confirm whether these saves should accumulate or
        # sit outside the for-loop.
        pfm = Path(FMS_TEST.format(tag))
        pfm.mkdir(exist_ok=True, parents=True)
        to_tsv(fm, pfm / "test.tsv")

        pfo = Path(OBN_TEST.format(tag))
        pfo.mkdir(exist_ok=True, parents=True)
        to_tsv(ob, pfo / "test.tsv")

        pfl = Path(LSS_TEST.format(tag))
        pfl.mkdir(exist_ok=True, parents=True)
        to_tsv(ls, pfl / "test.tsv")

        pkl_save(fm, ner_typing_root / f"fm_{tag}.pkl")
        pkl_save(ob, ner_typing_root / f"ob_{tag}.pkl")
        pkl_save(ls, ner_typing_root / f"ls_{tag}.pkl")
# --- Ejemplo n.º 4 ---
def get_neighbors(args):
    """Interactively look up precomputed neighbor words.

    Loads the neighbor dictionary built for `args.perturb_type` under
    `args.save_root`, then repeatedly prompts for a word and prints its
    neighbors (or a notice when the word was not preprocessed).
    Entering an empty line exits the loop.
    """
    # NOTE(review): args.save_root is interpolated into the middle of the
    # file name — confirm this matches how the pickle was written.
    neighbor_path = 'ed2_neighbors{}pt{}.pkl'.format(args.save_root,
                                                     args.perturb_type)
    neighbor_dict = pkl_load(neighbor_path)
    while True:
        # The old debug line `print('broad' in neighbor_dict['bold'])` was
        # leftover scaffolding and has been removed.
        inpt = input("Enter a word: ").strip().lower()
        if not inpt:  # empty input ends the session
            break
        if inpt not in neighbor_dict:
            print("Word not preprocessed...")
        else:
            print("Neighbors for {}:".format(inpt))
            print(neighbor_dict[inpt])
# --- Ejemplo n.º 5 ---
    def __getitem__(self, index):
        """
        Args:
            index (int): Index in range [0, self.__len__ - 1]

        Returns:
            image: torch.Tensor of size [3,H,W].
            bboxes: torch.Tensor of size [n_bbox, 4] i.e. n bboxes each of [top_left_x, top_left_y, bottom_right_x, bottom_right_y]
            context_indices: torch.Tensor of size [n_bbox, 2*context_size] i.e. bbox indices (0-indexed) of contexts for all n bboxes.
                If not enough found, rest are -1
            labels: torch.Tensor of size [n_bbox] i.e. each value is label of the corresponding bbox
        """
        img_id = self.ids[index]

        img = Image.open('%s/imgs/%s.png' % (self.root, img_id)).convert('RGB')
        img = self.img_transform(img)

        bboxes = pkl_load('%s/bboxes/%s.pkl' % (self.root, img_id))
        if self.max_bg_boxes > 0:
            # Split background (label 0) from positive boxes.
            bg_boxes = bboxes[bboxes[:, -1] == 0]
            pos_boxes = bboxes[bboxes[:, -1] != 0]

            # Sample up to max_bg_boxes background boxes; sorting the sampled
            # indices keeps the sampled boxes in their original order
            # (resolves the old "make sure order is preserved" TODO).
            indices = np.sort(
                np.random.permutation(len(bg_boxes))[:self.max_bg_boxes])
            bg_boxes = bg_boxes[indices]

            bboxes = np.concatenate((pos_boxes, bg_boxes), axis=0)

        labels = torch.LongTensor(bboxes[:, -1])

        bboxes = torch.Tensor(bboxes[:, :-1])
        bboxes[:, 2:] += bboxes[:, :2]  # convert from [x,y,w,h] to [x1,y1,x2,y2]

        # For each box, up to context_size neighbor indices on either side,
        # right-padded with -1 near the sequence ends.
        context_indices = []
        for i in range(bboxes.shape[0]):
            context = list(range(max(0, i - self.context_size), i)) \
                + list(range(i + 1,
                             min(bboxes.shape[0], i + self.context_size + 1)))
            context_indices.append(
                context + [-1] * (2 * self.context_size - len(context)))
        context_indices = torch.LongTensor(context_indices)

        return img, bboxes, context_indices, labels
# --- Ejemplo n.º 6 ---
def get_results(test_ds, type):
    """Evaluate the global `model` on `test_ds` and persist the report.

    Creates a fresh `{type}_eval_results` directory under the model folder,
    pretty-prints the metrics, and pickles them to `{type}_report.dict`.
    (`type` shadows the builtin but is kept for caller compatibility.)
    """
    dl = DataLoader(test_ds, batch_size=len(test_ds), num_workers=5)
    retdict = train.eval_model(model, dl, args.OP_tgt, 3)
    model_dir = os.path.join('models', args.modelname)
    eval_dir = os.path.join(model_dir, f'{type}_eval_results')
    # Always start from an empty results directory.  The original wrapped
    # makedirs in a bare `except:` which silently swallowed any failure;
    # this version handles the only expected case (dir exists) explicitly.
    if os.path.exists(eval_dir):
        shutil.rmtree(eval_dir)
    os.makedirs(eval_dir)

    retdict['modelname'] = args.modelname
    # Print the metrics for the operator, then persist them.
    pprint.pprint(retdict)
    utils.pkl_dump(
        retdict, os.path.join('models', args.modelname, f'{type}_report.dict'))


# Module-level evaluation driver: builds the datasets, restores the trained
# model from its checkpoint, and runs the test-set evaluation.
valid_ds, test_ds = SeqDataset(args.dataset,
                               'valid'), SeqDataset(args.dataset, 'test')
X_Mean = utils.pkl_load(os.path.join('data', args.dataset, 'x_mean.pkl'))
input_size = X_Mean.size
# NOTE: this rebinding shadows the imported `model` module from here on.
model = model.StackedGRUDClassifier(input_size, args.output_dim, X_Mean, [])
model.load_state_dict(
    torch.load(os.path.join('models', args.modelname,
                            'checkpoint.pt'))['state_dict'])

# Remove any stale report before evaluating.
if os.path.exists(os.path.join('models', args.modelname, 'report.txt')):
    os.remove(os.path.join('models', args.modelname, 'report.txt'))

# NOTE(review): get_results returns None, so this prints "None"; the call is
# presumably for its side effects (report files) — confirm intent.
print(get_results(test_ds, 'test'))
def gen_res_for_subtask2():
    """Convert BERT relation predictions into the subtask-2 output file."""
    # Mapping from prediction rows back to (doc_id, entity1, entity2).
    mapping_path = os.path.join(REL_TEST, "relation_mappings.tsv")
    entity_mapping = pkl_load(mapping_path)
    raw_preds = load_bert_results(Path(REL_OUTPUT_ROOT) / "test_results.txt")
    formatted = format_bert_output(raw_preds, entity_mapping)
    to_task2_output(formatted, PRED_SUBTASK_2)
def gen_res_for_subtask1(test_fids):
    """Ensemble the five NER-typing runs and write the subtask-1 predictions.

    For each of the 5 tagged runs, merges the typing-classifier outputs back
    onto the merged entities, then keeps only the (doc, entity, types, type)
    tuples that appear in more than ENSEMBLE_THRESHOLD runs.  The ensembled
    typed entities are also pickled for downstream relation extraction.

    Args:
        test_fids: document ids to assemble predictions for.
    """
    typed_entities_en = []

    for tag in range(5):
        ner_typing_root = Path(NER_TYPING_ROOT)

        # Inputs produced by bio2typing for this run.
        merged_entities = pkl_load(ner_typing_root /
                                   f"merged_entities_{tag}.pkl")
        fm_test_input = pkl_load(ner_typing_root / f"fm_{tag}.pkl")
        ob_test_input = pkl_load(ner_typing_root / f"ob_{tag}.pkl")
        ls_test_input = pkl_load(ner_typing_root / f"ls_{tag}.pkl")

        # Classifier outputs for this run (fms/fmr: the two family-member
        # classifiers; obn: observation; lss: living status).
        fms_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("fms", tag))
        fmr_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("fmr", tag))
        obn_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("obn", tag))
        lss_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("lss", tag))

        fm_merged_res = merge_test_with_result(fm_test_input, fms_test,
                                               fmr_test)
        ob_merged_res = merge_test_with_result(ob_test_input, obn_test)
        ls_merged_res = merge_test_with_result(ls_test_input, lss_test)

        # Attach the predicted ner_types to each original entity per doc.
        typed_entities = defaultdict(list)
        for tfid in test_fids:
            entities = merged_entities[tfid]
            mrs = fm_merged_res[tfid] + ob_merged_res[tfid] + ls_merged_res[
                tfid]
            for each in mrs:
                en_id, ner_types = each
                en = entities[en_id]
                typed_entities[tfid].append((en, ner_types, en[1]))

        # Flatten to hashable tuples so identical predictions from different
        # runs can be counted together by Counter below.
        nd = []
        for k, v in typed_entities.items():
            # (('sister', 'FamilyMember', (576, 582), (597, 603)), ['NA', 'Sister'], 'FamilyMember')
            for each in v:
                nd.append((k, each[0], tuple(each[1]), each[2]))
        typed_entities_en.extend(nd)

    # Majority vote across runs: keep predictions that occurred in more than
    # ENSEMBLE_THRESHOLD of the 5 runs.
    final_res = [
        e[0] for e in Counter(typed_entities_en).most_common()
        if e[1] > ENSEMBLE_THRESHOLD
    ]

    # Regroup the surviving predictions by document id.
    typed_entities_f = defaultdict(list)
    for each in final_res:
        typed_entities_f[each[0]].append((each[1], list(each[2]), each[3]))

    task1_ens = []
    for doc_id, ens in typed_entities_f.items():
        for en in ens:
            new_en = [doc_id]
            en_type = en[-1]
            if en_type.upper() == 'FAMILYMEMBER':
                # en[1] is e.g. ['NA', 'Sister']: index 1 then index 0 are
                # written after the mapped type.
                new_en.append(en_map[en_type])
                new_en.append(en[1][1])
                new_en.append(en[1][0])
                task1_ens.append(new_en)
            elif en_type.upper() == "OBSERVATION":
                # Observations emit the entity's surface text (en[0][0]).
                new_en.append(en_map[en_type])
                new_en.append(en[0][0])
                task1_ens.append(new_en)
            # NOTE(review): LivingStatus entities are not written to the
            # subtask-1 output — confirm this matches the task specification.

    # Persist the ensembled entities for the relation-extraction stage.
    pkl_save(typed_entities_f,
             Path(NER_TYPING_ROOT) / "typed_entities_ens.pkl")

    # Deduplicate and order rows by the numeric suffix of the doc id.
    task1_ens = sorted({tuple(e)
                        for e in task1_ens},
                       key=lambda x: int(x[0].split("_")[-1]))

    with open(PRED_SUBTASK_1, "w") as f:
        for each in task1_ens:
            cont = "\t".join(each)
            f.write(f"{cont}\n")
# --- Ejemplo n.º 9 ---
def main(args):
    """Train the IMN model and record the best test F1 scores.

    Loads preprocessed features and embeddings from ../data/pkl/IMN/, trains
    for args.epochs epochs with scheduled sampling of gold opinion labels,
    evaluates after every epoch, saves the model with the best ABSA F1 to
    args.save/model.pt, and writes a summary to "{args.iters}.log".
    """
    # define location to save the model
    if args.save == "__":
        # NOTE(review): "%d" truncates args.lr if it is a float — confirm
        # the intended save-path format.
        args.save = "save/IMN_%d_%d" % \
                    (args.lr, args.batch_size)
    # Make sure the folder to save models exists.  makedirs (vs the previous
    # bare os.mkdir) also creates the missing "save/" parent directory.
    os.makedirs(args.save, exist_ok=True)

    in_dir = "../data/pkl/IMN/"

    datasets = utils.pkl_load(in_dir + "features.pkl")
    sent_train_index = datasets["train_sent"]["index"]
    sent_test_index = datasets["test_sent"]["index"]

    # Tag sequences: opinion targets (ae) and polarity targets (as).
    ae_tag_train = datasets["train_sent"]["target_opinion"]
    as_tag_train = datasets["train_sent"]["target_polarity"]

    ae_tag_test = datasets["test_sent"]["target_opinion"]
    as_tag_test = datasets["test_sent"]["target_polarity"]

    op_label_input_train = datasets["train_sent"]["opinion_ex"]
    op_label_input_test = datasets["test_sent"]["opinion_ex"]

    ap_label_input_train = datasets["train_sent"]["aspect_ex"]
    ap_label_input_test = datasets["test_sent"]["aspect_ex"]

    train_batch = args.batch_size
    test_batch = args.batch_size

    train_sent_set = utils.SentDataset(sent_train_index, ae_tag_train,
                                       as_tag_train, op_label_input_train,
                                       ap_label_input_train, args.max_len)
    test_set = utils.SentDataset(sent_test_index, ae_tag_test, as_tag_test,
                                 op_label_input_test, ap_label_input_test,
                                 args.max_len)
    test_set_loader = DataLoader(dataset=test_set,
                                 batch_size=test_batch,
                                 shuffle=False)
    train_sent_loader = DataLoader(dataset=train_sent_set,
                                   batch_size=train_batch,
                                   shuffle=True)
    general_embeddings = utils.pkl_load(in_dir + "general_embeddings.pkl")
    domain_embeddings = utils.pkl_load(in_dir + "domain_embeddings.pkl")
    general_embeddings = torch.from_numpy(general_embeddings).float()
    domain_embeddings = torch.from_numpy(domain_embeddings).float()
    model = NewImn(general_embeddings,
                   domain_embeddings,
                   ae_nums=args.ae_nums,
                   as_nums=args.as_nums,
                   ds_nums=args.ds_nums,
                   iters=args.iters,
                   dropout=args.dropout,
                   use_transission=args.use_transission)
    # Split parameters into weight/bias groups (both currently use zero
    # weight decay; the split allows tuning them independently later).
    weight_p, bias_p = [], []
    for name, p in model.named_parameters():
        if 'bias' in name:
            bias_p += [p]
        else:
            weight_p += [p]
    optimizer = torch.optim.Adam([{
        'params': weight_p,
        'weight_decay': 0
    }, {
        'params': bias_p,
        'weight_decay': 0
    }],
                                 lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                       0.95,
                                                       last_epoch=-1)
    model.cuda()
    f_aspect_best, f_opinion_best, f_absa_best = -np.inf, -np.inf, -np.inf
    tic = time.time()
    print("-----------------------------", args.epochs, len(train_sent_set),
          args.batch_size)
    for i in range(args.epochs):
        gold_prob = get_prob(i)
        rnd = np.random.uniform()
        # as epoch increasing, the probability of using gold opinion label descreases.
        if rnd < gold_prob:
            p_gold_op_train = np.ones((1, args.max_len))
        else:
            p_gold_op_train = np.zeros((1, args.max_len))
        p_gold_op_train = torch.from_numpy(p_gold_op_train).float()
        print("--------------\nEpoch %d begins!" % (i))
        loss = train(model, optimizer, train_sent_loader, p_gold_op_train)
        print("loss=", loss)
        print("  using %.5f seconds" % (time.time() - tic))
        tic = time.time()
        print("\n  Begin to predict the results on Validation")
        # Gold opinion labels are never fed to the model at evaluation time.
        p_gold_op_test = torch.from_numpy(np.zeros((1, args.max_len))).float()
        f_aspect, f_opinion, f_absa = test(model,
                                           test_set_loader,
                                           p_gold_op_test,
                                           name="train")
        print("  ---%f   %f   %f---" % (f_aspect, f_opinion, f_absa))
        print("  ----Old best aspect f1 score on test is %f" % f_aspect_best)
        print("  ----Old best opinion f1 score on test is %f" % f_opinion_best)
        print("  ----Old best ABSA f1 score on test is %f" % f_absa_best)
        if f_aspect > f_aspect_best:
            print("  ----New best aspect f1 score on test is %f" % f_aspect)
            f_aspect_best = f_aspect
        if f_opinion > f_opinion_best:
            print("  ----New best opinion f1 score on test is %f" % f_opinion)
            f_opinion_best = f_opinion
        if f_absa > f_absa_best:
            print("  ----New best ABSA f1 score on test is %f" % f_absa)
            f_absa_best = f_absa
            # Checkpoint only on ABSA improvement (the headline metric).
            with open(args.save + "/model.pt", 'wb') as to_save:
                torch.save(model, to_save)
        scheduler.step()
        print("lr=", optimizer.param_groups[0]['lr'])
    print("best ABSA f1 score on test is %f" % f_absa_best)
    print("best Aspect f1 score on test is %f" % f_aspect_best)
    print("best Opinion f1 score on test is %f" % f_opinion_best)
    log = f"{args.iters}.log"
    with open(log, "w") as f:
        # Fix: the original writes lacked "\n", concatenating all three
        # summary lines into one.
        f.write("best ABSA f1 score on test is %f\n" % f_absa_best)
        f.write("best Aspect f1 score on test is %f\n" % f_aspect_best)
        f.write("best Opinion f1 score on test is %f\n" % f_opinion_best)
# --- Ejemplo n.º 10 ---
 def __getitem__(self, index):
     """Return the (sequence, label) pair for the sample at `index`."""
     sample_id = self.list_IDs[index]
     # NOTE(review): pkl_load on a '.npy' path — confirm the files really
     # are pickles despite the numpy extension.
     sequence = utils.pkl_load(os.path.join(self.datadir, sample_id + '.npy'))
     label = self.label_dict[sample_id]
     return sequence, label
# --- Ejemplo n.º 11 ---
 def __init__(self, datadir, type):
     """Locate the split directory and load its id -> label mapping."""
     self.datadir = os.path.join('data', datadir, type)
     label_path = os.path.join(self.datadir, 'label_dict.pkl')
     self.label_dict = utils.pkl_load(label_path)
     self.list_IDs = list(self.label_dict.keys())
# --- Ejemplo n.º 12 ---
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# filename: rander_prob_emit.py
"""Render the pickled emission-probability table to text and gzip files."""

import gzip
from utils import pkl_load

emitP_FILE = "../src/main/resources/prob_emit.p"
OUTPUT_TXT_FILE = "../src/main/resources/prob_emit.txt"
OUTPUT_GZ_FILE = "../src/main/resources/prob_emit.gz"

# Build the output with a list + join instead of the previous quadratic
# string "+=" in a nested loop.
parts = []
for k1, v1 in pkl_load(emitP_FILE).items():
    parts.append(f"\n{k1}\n")
    for k2, v2 in v1.items():
        # "D" suffix: presumably a Java double-literal marker for the
        # downstream consumer — confirm against the reader.
        parts.append(f"{k2} {v2}D\n")
output = "".join(parts)

with open(OUTPUT_TXT_FILE, "w", encoding="utf-8") as fp:
    fp.write(output)

with gzip.open(OUTPUT_GZ_FILE, "wb") as fp:
    fp.write(output.encode("utf-8"))
# --- Ejemplo n.º 13 ---
import java.util.HashMap;
import java.util.Map;

class Model {""" + "\n"
# Emit the generated Java source to stdout.  NOTE(review): `header`, `tab`,
# `startP_FILE`, and `pkl_load` are defined earlier in this script, outside
# this excerpt.
print(header)
### END header ###

### startP ###
# Render the pickled start probabilities as a Java
# Map<Character, Double> static initializer (double-brace init idiom).
map_startP = ""
map_startP += tab(
    1) + r"protected static final Map<Character, Double> startP;" + "\n"
map_startP += "\n"
map_startP += tab(1) + r"static {" + "\n"
map_startP += tab(2) + r"startP = new HashMap<Character, Double>() {{" + "\n"
for k1, v1 in pkl_load(startP_FILE).items():
    # "D" suffix makes the value an explicit Java double literal.
    map_startP += tab(3) + f"put('{k1}', {v1}D);" + "\n"
map_startP += tab(2) + r"}};" + "\n"
map_startP += tab(1) + r"}" + "\n"
print(map_startP)
### END startP ###

### transP ###
# Render the transition probabilities as a nested
# Map<Character, Map<Character, Double>> (the loop filling it continues
# beyond this excerpt).
map_transP = ""
map_transP += tab(
    1
) + r"protected static final Map<Character, Map<Character, Double>> transP;" + "\n"
map_transP += "\n"
map_transP += tab(1) + r"static {" + "\n"
map_transP += tab(
    2) + r"transP = new HashMap<Character, Map<Character, Double>>() {{" + "\n"
# --- Ejemplo n.º 14 ---
def load_data(filename):
    """Load the preprocessed pickle named `filename` from the configured folder."""
    # Fix: the original path contained a literal "(unknown)" placeholder and
    # never used the `filename` parameter; interpolate it instead.
    return utils.pkl_load(f'{config.preprocessedfolder}/{filename}')