def read_parse_write(elmo: ElmoEmbedder, infile: str, outfile: str, mode: str = "average", batch_size=0) -> None:
    """
    Read the input files and write the vectors to the output files
    :param elmo: ELMo embedder
    :param infile: input files for the sentences
    :param outfile: output vector files
    :param mode: the mode of elmo vectors
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    f = open(outfile, 'wb')
    all_vecs = []
    all_sents = []
    for inst in insts:
        all_sents.append(inst.input.words)
    if batch_size < 1:  # not using batched prediction
        for sent in tqdm(all_sents, desc="Elmo Embedding"):
            elmo_vecs = elmo.embed_sentence(sent)
            vec = parse_sentence(elmo_vecs, mode=mode)
            all_vecs.append(vec)
    else:  # batched prediction
        for elmo_vecs in tqdm(elmo.embed_sentences(all_sents, batch_size=batch_size), desc="Elmo Embedding", total=len(all_sents)):
            vec = parse_sentence(elmo_vecs, mode=mode)
            all_vecs.append(vec)

    print("Finishing embedding ELMo sequences, saving the vector files.")
    pickle.dump(all_vecs, f)
    f.close()
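
A minimal usage sketch for the batched ELMo embedder above, assuming AllenNLP's ElmoEmbedder with its default pretrained weights; the data paths are placeholders:

from allennlp.commands.elmo import ElmoEmbedder

elmo = ElmoEmbedder(cuda_device=-1)  # default ELMo weights on CPU; pass 0 for GPU
read_parse_write(elmo, "data/train.txt", "data/train.elmo.vec",
                 mode="average", batch_size=32)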
def read_parse_write(tokenizer: BertTokenizer, bert_client: BertClient,
                     infile: str, outfile: str, mode) -> None:
    """
    Read the input files and write the vectors to the output files
    :param bert_client: BertClient
    :param infile: input files for the sentences
    :param outfile: output vector files
    :param mode: the mode of bert word piece
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    f = open(outfile, 'wb')
    all_vecs = []
    all_sents = []
    for inst in insts:
        all_sents.append(inst.input.words)
    for sent in tqdm(all_sents, desc="BERT encoding"):
        word_piece_tokens, word_to_piece_index = bert_tokenize_words(tokenizer,
                                                                     sent,
                                                                     mode=mode)
        bert_vec = np.squeeze(bert_client.encode([word_piece_tokens],
                                                 is_tokenized=True),
                              axis=0)[1:-1, :]  ## exclude the [CLS] and [SEP]
        # Map the word-piece vectors back to one vector per original word.
        bert_vec = bert_vec[word_to_piece_index, :]
        all_vecs.append(bert_vec)

    print("Finishing embedding BERT sequences, saving the vector files.")
    pickle.dump(all_vecs, f)
    f.close()
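
A minimal usage sketch, assuming a bert-serving-server is already running locally; the paths and the mode value "first" are placeholders:

from transformers import BertTokenizer
from bert_serving.client import BertClient

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
bert_client = BertClient()  # connects to a local bert-as-service server
read_parse_write(tokenizer, bert_client,
                 "data/train.txt", "data/train.bert.vec", mode="first")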
Example #3
def main():
    parser = argparse.ArgumentParser(
        description="Dependency-Guided LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_conll(conf.train_file, -1, True)
    devs = reader.read_conll(conf.dev_file, conf.dev_num, False)
    tests = reader.read_conll(conf.test_file, conf.test_num, False)

    if conf.context_emb != ContextEmb.none:
        print('Loading the {} vectors for all datasets.'.format(
            conf.context_emb.name))

        def vec_file(path: str) -> str:
            # Strip the dependency-annotation suffix from the data file name
            # and point to the matching context-embedding vector file.
            for suffix in [".sd", ".ud", ".sud", ".predsd", ".predud",
                           ".stud", ".ssd"]:
                path = path.replace(suffix, "")
            return path + "." + conf.context_emb.name + ".vec"

        conf.context_emb_size = reader.load_elmo_vec(vec_file(conf.train_file), trains)
        reader.load_elmo_vec(vec_file(conf.dev_file), devs)
        reader.load_elmo_vec(vec_file(conf.test_file), tests)

    conf.use_iobes(trains + devs + tests)
    conf.build_label_idx(trains)

    conf.build_deplabel_idx(trains + devs + tests)
    print("# deplabels: ", len(conf.deplabels))
    print("dep label 2idx: ", conf.deplabel2idx)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(trains + devs + tests)

    print("num chars: " + str(conf.num_char))
    # print(str(config.char2idx))

    print("num words: " + str(len(conf.word2idx)))
    # print(config.word2idx)
    if opt.mode == "train":
        if conf.train_num != -1:
            random.shuffle(trains)
            trains = trains[:conf.train_num]
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)
    else:
        ## Load the trained model.
        test_model(conf, tests)
        # pass

    print(opt.mode)
def read_parse_write(elmo, infile, outfile, mode):
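    """Embed each CoNLL sentence with ELMo and pickle the list of per-sentence vectors."""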
    reader = Reader()
    insts = reader.read_conll(infile, -1, True)
    f = open(outfile, 'wb')
    all_vecs = []
    for inst in insts:
        vec = parse_sentence(elmo, inst.input.words, mode=mode)
        all_vecs.append(vec)
    pickle.dump(all_vecs, f)
    f.close()
def read_parse_write(elmo, infile, outfile):
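    """Embed each sentence with a flair-style embedder and pickle the stacked per-token vectors."""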
    reader = Reader()
    insts = reader.read_conll(infile, -1, True)
    f = open(outfile, 'wb')
    all_vecs = []
    for inst in insts:
        sent = embed_sent(elmo, inst.input.words)
        # Stack the per-token embeddings into a single (seq_len, dim) array.
        arr = []
        for token in sent:
            arr.append(np.expand_dims(token.embedding.numpy(), axis=0))
        all_vecs.append(np.concatenate(arr))
    pickle.dump(all_vecs, f)
    f.close()
Example #6
def read_parse_write(elmo: ElmoEmbedder, infile: str, outfile: str, mode: str = "average") -> None:
    """
    Read the input files and write the vectors to the output files
    :param elmo: ELMo embedder
    :param infile: input files for the sentences
    :param outfile: output vector files
    :param mode: the mode of elmo vectors
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    f = open(outfile, 'wb')
    all_vecs = []
    for inst in tqdm(insts):
        # Remove the pos_tags argument for a model without the additional embedding for materials.
        vec = parse_sentence(elmo, inst.input.words, mode=mode)
        all_vecs.append(vec)
    print("Finishing embedding ELMo sequences, saving the vector files.")
    pickle.dump(all_vecs, f)
    f.close()
Example #7
def read_parse_write(bert: DistilBertModel,
                     bert_path: str,
                     infile: str,
                     outfile: str,
                     mode: str = "average",
                     batch_size=0) -> None:
    """
    Read the input files and write the vectors to the output files
    :param bert: Bert embedder
    :param infile: input files for the sentences
    :param outfile: output vector files
    :param mode: the mode of elmo vectors
    :return:
    """
    reader = Reader()
    insts = reader.read_txt(infile, -1)
    f = open(outfile, 'wb')
    all_vecs = []
    all_sents = []
    for inst in insts:
        all_sents.append(inst.input.words)

    dataset = CustomDataset(all_sents, bert_path)

    batch_size = max(1, batch_size)  # ensure the batch size is at least 1
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=4)
    for batch, n_pads in tqdm(dataloader):
        with torch.no_grad():
            batch = batch.cuda() if CUDA else batch
            bert = bert.cuda() if CUDA else bert

            bert_batch_vecs = bert(batch)[0].cpu().numpy()
            vectors = parse_sentence(bert_batch_vecs, mode=mode)
            for j in range(vectors.shape[0]):
                # Drop the padding positions; note vectors[j, :-0, :] would be
                # empty, so compute the unpadded length explicitly.
                seq_len = vectors.shape[1] - int(n_pads[j])
                all_vecs.append(vectors[j, :seq_len, :])

    print("Finishing embedding Bert sequences, saving the vector files.")
    pickle.dump(all_vecs, f)
    f.close()
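
A minimal usage sketch; the model name and data paths are placeholders, and CUDA is the module-level flag used inside the function:

from transformers import DistilBertModel

bert_path = "distilbert-base-uncased"
bert = DistilBertModel.from_pretrained(bert_path)
bert.eval()
read_parse_write(bert, bert_path, "data/train.txt",
                 "data/train.distilbert.vec", mode="average", batch_size=16)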
Example #8
def main():
    parser = argparse.ArgumentParser(
        description="Dependency-Guided LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_conll(conf.train_file, -1, True)
    devs = reader.read_conll(conf.dev_file, conf.dev_num, False)
    tests = reader.read_conll(conf.test_file, conf.test_num, False)

    conf.use_iobes(trains)
    conf.build_label_idx(trains)

    conf.build_deplabel_idx(trains + devs + tests)
    print("# deplabels: ", len(conf.deplabels))
    print("dep label 2idx: ", conf.deplabel2idx)

    conf.build_word_idx(trains + devs + tests)
    conf.build_emb_table()
    conf.map_insts_ids(trains)

    print("num chars: " + str(conf.num_char))
    # print(str(config.char2idx))

    print("num words: " + str(len(conf.word2idx)))
    # print(config.word2idx)
    if opt.mode == "train":
        if conf.train_num != -1:
            random.shuffle(trains)
            trains = trains[:conf.train_num]
        learn_from_insts(conf, conf.num_epochs, trains)

    print(opt.mode)
Example #9
def main():
    TASKS = ['ner_german', 'ner']
    USE_DEV = True

    char_set = set()
    for task in TASKS:

        t = __import__(task)
        data_list = [t.TRAIN_DATA, t.DEV_DATA, t.TEST_DATA]
        char_index, _ = t.create_char_index(data_list)
        for k, v in char_index.items():
            char_set.add(k)
    char_index, char_cnt = {}, 0
    for char in char_set:
        char_index[char] = char_cnt
        char_cnt += 1

    for i, task in enumerate(TASKS):
        t = __import__(task)
        word_index, word_cnt = t.create_word_index(
            [t.TRAIN_DATA, t.DEV_DATA, t.TEST_DATA])
        wx, y, m = t.read_data(t.TRAIN_DATA, word_index)
        if USE_DEV and task == 'ner':
            dev_wx, dev_y, dev_m = t.read_data(t.TEST_DATA, word_index)
            wx, y, m = np.vstack((wx, dev_wx)), np.vstack(
                (y, dev_y)), np.vstack((m, dev_m))
        twx, ty, tm = t.read_data(t.DEV_DATA, word_index)
        x, cm = t.read_char_data(t.TRAIN_DATA, char_index)
        if USE_DEV and task == 'ner':
            dev_x, dev_cm = t.read_char_data(t.TEST_DATA, char_index)
            x, cm = np.vstack((x, dev_x)), np.vstack((cm, dev_cm))
        tx, tcm = t.read_char_data(t.DEV_DATA, char_index)
        if task == 'ner':
            list_prefix = t.read_list()
            gaze = t.read_list_data(t.TRAIN_DATA, list_prefix)
            tgaze = t.read_list_data(t.DEV_DATA, list_prefix)
            if USE_DEV:
                dev_gaze = t.read_list_data(t.TEST_DATA, list_prefix)
                gaze = np.vstack((gaze, dev_gaze))
        else:
            gaze, tgaze = None, None

    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_txt(conf.train_file, conf.train_num, True)
    devs = reader.read_txt(conf.dev_file, conf.dev_num, False)
    tests = reader.read_txt(conf.test_file, conf.test_num, False)
    trains_target = reader.read_txt(conf.train_target_file,
                                    conf.train_num, True)

    if conf.context_emb != ContextEmb.none:
        print('Loading the elmo vectors for all datasets.')
        conf.context_emb_size = reader.load_elmo_vec(
            conf.train_file + "." + conf.context_emb.name + ".vec", trains)
        reader.load_elmo_vec(
            conf.dev_file + "." + conf.context_emb.name + ".vec", devs)
        reader.load_elmo_vec(
            conf.test_file + "." + conf.context_emb.name + ".vec", tests)
    conf.use_iobes(trains)
    conf.use_iobes(devs)
    conf.use_iobes(tests)
    conf.build_label_idx(trains)
    conf.use_iobes(trains_target)
    conf.build_label_idx_target(trains_target)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()

    ids_train = conf.map_insts_ids(trains)
    ids_dev = conf.map_insts_ids(devs)
    ids_test = conf.map_insts_ids(tests)

    print("num chars: " + str(conf.num_char))
    # print(str(config.char2idx))

    print("num words: " + str(len(conf.word2idx)))
    # print(config.word2idx)
    if opt.mode == "train":
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)
    else:
        ## Load the trained model.
        test_model(conf, tests)
        # pass

    print(opt.mode)
# 
# @author: Allan
#

from config.reader import Reader

file = "data/ontonotes/train.sd.conllx"
digit2zero = False
reader = Reader(digit2zero)

insts = reader.read_conll(file, -1, True)
# devs = reader.read_conll(conf.dev_file, conf.dev_num, False)
# tests = reader.read_conll(conf.test_file, conf.test_num, False)

out_dep_label2num = {}
out_doubledep2num = {}
out_word2num = {}
label2idx = {}

def is_entity(label: str) -> bool:
    return label.startswith("B-") or label.startswith("I-")


def not_entity(label: str) -> bool:
    return not is_entity(label)
Example #11
def main():
    print('Reading arguments')
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)
    conf_conll = Config_conll(opt)
    conf_ontonotes = Config_ontonotes(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains_0 = reader.read_conll(conf.train_file_1, 0, conf.train_num, True)
    devs_0 = reader.read_conll(conf.dev_file_1, 0, conf.dev_num, False)
    tests_0 = reader.read_conll(conf.test_file_1, 0, conf.test_num, False)

    trains_1 = reader.read_conll(conf.train_file_2, 1, conf.train_num, True)
    devs_1 = reader.read_conll(conf.dev_file_2, 1, conf.dev_num, False)
    tests_1 = reader.read_conll(conf.test_file_2, 1, conf.test_num, False)

    trains_all = trains_0 + trains_1
    devs_all = devs_0 + devs_1
    tests_all = tests_0 + tests_1

    if conf.context_emb != ContextEmb.none:
        print('Loading the elmo vectors for all datasets.')
        conf.context_emb_size = reader.load_elmo_vec(
            conf.train_file_1 + "." + conf.context_emb.name + ".vec", trains_1)
        reader.load_elmo_vec(
            conf.dev_file_1 + "." + conf.context_emb.name + ".vec", devs_1)
        reader.load_elmo_vec(
            conf.test_file_1 + "." + conf.context_emb.name + ".vec", tests_1)

    conf.use_iobes(trains_all)
    conf.use_iobes(devs_all)
    conf.use_iobes(tests_all)
    conf.build_label_idx(trains_all)

    conf.build_word_idx(trains_all, devs_all, tests_all)
    conf.build_emb_table()

    ids_train = conf.map_insts_ids(trains_all)
    ids_dev = conf.map_insts_ids(devs_all)
    ids_test = conf.map_insts_ids(tests_all)

    conf_conll.label_size = conf.label_size_0
    conf_conll.label2idx = conf.label2idx_0
    conf_conll.idx2labels = conf.idx2labels_0
    conf_ontonotes.label_size = conf.label_size_1
    conf_ontonotes.label2idx = conf.label2idx_1
    conf_ontonotes.idx2labels = conf.idx2labels_1

    print("num chars: " + str(conf.num_char))
    # print(str(config.char2idx))

    print("num words: " + str(len(conf.word2idx)))
    # print(config.word2idx)
    if opt.mode == "train":
        learn_from_insts(conf, conf_conll, conf_ontonotes, conf.num_epochs,
                         trains_all, devs_all, tests_all)
    else:
        ## Load the trained model.
        test_model(conf, tests_all)
        # pass

    print(opt.mode)
Example #12
def use_iobes(insts):
    # Convert IOB labels (inst.output) to IOBES in place: B- becomes S- and
    # I- becomes E- wherever the entity ends at the current position.
    for inst in insts:
        output = inst.output
        for pos, curr_entity in enumerate(output):
            is_last = pos == len(output) - 1
            next_entity = "O" if is_last else output[pos + 1]
            if next_entity.startswith("O") or next_entity.startswith("B-"):
                if curr_entity.startswith("B-"):
                    output[pos] = curr_entity.replace("B-", "S-")
                elif curr_entity.startswith("I-"):
                    output[pos] = curr_entity.replace("I-", "E-")


dataset = "ontonotes_chinese"
train = "../data/"+dataset+"/train.sd.conllx"
dev = "../data/"+dataset+"/dev.sd.conllx"
test = "../data/"+dataset+"/test.sd.conllx"
digit2zero = False
reader = Reader(digit2zero)

insts = reader.read_conll(train, -1, True)
insts += reader.read_conll(dev, -1, False)
insts += reader.read_conll(test, -1, False)
use_iobes(insts)
L = 3


def get_spans(output):
    # Collect the spans from an IOBES label sequence as (start, end, label)
    # tuples (an assumption; adapt to the project's own Span type if any).
    output_spans = set()
    start = -1
    for i in range(len(output)):
        if output[i].startswith("B-"):
            start = i
        if output[i].startswith("E-"):
            output_spans.add((start, i, output[i][2:]))
        if output[i].startswith("S-"):
            output_spans.add((i, i, output[i][2:]))
    return output_spans
Example #13
    def nonterms(self):
        return list(self.nonterms_iter())

    def __eq__(self, other):
        return other is not None and self.pos == other.pos and self.children == other.children

    def __hash__(self):
        return hash((self.pos, self.children))


if __name__ == "__main__":
    # Read the trees from the CoNLL file and print their leaves.

    from config.reader import Reader
    reader = Reader()
    insts = reader.read_conll("../data/abc/train.conllx", number=1)

    for inst in insts:
        nodes = [Tree(pos) for pos in range(len(inst.input.words))]
        root = Tree(-1)
        for pos, head in enumerate(inst.input.heads):
            if head != -1:
                nodes[head].add_child(nodes[pos])
            else:
                root.add_child(nodes[pos])
        inst.nodes = nodes
        for node in nodes:
            node.sort_children()
        print(root.leaves())
        for pos, node in enumerate(nodes):