Beispiel #1
0
_DEP_SUFFIXES = (".sd", ".ud", ".sud", ".predsd", ".predud", ".stud", ".ssd")


def _context_vec_path(data_file, emb_name):
    """Return the context-embedding vector file that belongs to *data_file*.

    Every substring listed in ``_DEP_SUFFIXES`` is removed from the path
    (in that order) and ``.<emb_name>.vec`` is appended to the result.
    """
    for suffix in _DEP_SUFFIXES:
        data_file = data_file.replace(suffix, "")
    return data_file + "." + emb_name + ".vec"


def main():
    """Parse the command line, prepare the datasets and train or test.

    The train/dev/test CoNLL files named in the configuration are read,
    optional pre-computed context embeddings are attached to the instances,
    label / dependency-label / word indices are built, and finally either
    ``learn_from_insts`` (``opt.mode == "train"``) or ``test_model`` (any
    other mode) is invoked.
    """
    parser = argparse.ArgumentParser(
        description="Dependency-Guided LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_conll(conf.train_file, -1, True)
    devs = reader.read_conll(conf.dev_file, conf.dev_num, False)
    tests = reader.read_conll(conf.test_file, conf.test_num, False)

    if conf.context_emb != ContextEmb.none:
        emb_name = conf.context_emb.name
        print('Loading the {} vectors for all datasets.'.format(emb_name))
        # Only the training call's return value is kept as the embedding size.
        conf.context_emb_size = reader.load_elmo_vec(
            _context_vec_path(conf.train_file, emb_name), trains)
        reader.load_elmo_vec(_context_vec_path(conf.dev_file, emb_name), devs)
        reader.load_elmo_vec(_context_vec_path(conf.test_file, emb_name), tests)

    conf.use_iobes(trains + devs + tests)
    conf.build_label_idx(trains)

    conf.build_deplabel_idx(trains + devs + tests)
    print("# deplabels: ", len(conf.deplabels))
    print("dep label 2idx: ", conf.deplabel2idx)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(trains + devs + tests)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        # A train_num of -1 means "use every training instance"; otherwise a
        # random subset of that size is taken.
        if conf.train_num != -1:
            random.shuffle(trains)
            trains = trains[:conf.train_num]
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)
    else:
        # Load the trained model.
        test_model(conf, tests)

    print(opt.mode)
def read_parse_write(elmo, infile, outfile, mode):
    """Embed every sentence of a CoNLL file and pickle the list of vectors.

    Args:
        elmo: Embedder object, forwarded unchanged to ``parse_sentence``.
        infile: Path of the CoNLL file to read.
        outfile: Path of the pickle file to write (opened in binary mode).
        mode: Forwarded to ``parse_sentence`` as its ``mode`` keyword.
    """
    reader = Reader()
    insts = reader.read_conll(infile, -1, True)
    # The context manager closes the file even when embedding or pickling
    # raises, which the previous open()/close() pair did not guarantee.
    with open(outfile, 'wb') as f:
        all_vecs = [
            parse_sentence(elmo, inst.input.words, mode=mode)
            for inst in insts
        ]
        pickle.dump(all_vecs, f)
Beispiel #3
0
def main():
    """Parse the command line, index the datasets and optionally train.

    Only the training instances are converted with ``use_iobes`` and mapped
    to ids; the dev and test instances contribute to the dependency-label
    and word indices only. Training runs when ``opt.mode`` is ``"train"``.
    """
    arg_parser = argparse.ArgumentParser(
        description="Dependency-Guided LSTM CRF implementation")
    opt = parse_arguments(arg_parser)
    conf = Config(opt)

    conll_reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    train_insts = conll_reader.read_conll(conf.train_file, -1, True)
    dev_insts = conll_reader.read_conll(conf.dev_file, conf.dev_num, False)
    test_insts = conll_reader.read_conll(conf.test_file, conf.test_num, False)

    # Label handling is driven by the training split alone.
    conf.use_iobes(train_insts)
    conf.build_label_idx(train_insts)

    conf.build_deplabel_idx(train_insts + dev_insts + test_insts)
    print("# deplabels: ", len(conf.deplabels))
    print("dep label 2idx: ", conf.deplabel2idx)

    conf.build_word_idx(train_insts + dev_insts + test_insts)
    conf.build_emb_table()
    conf.map_insts_ids(train_insts)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    is_training = opt.mode == "train"
    if is_training and conf.train_num != -1:
        # Keep a random subset of the requested size.
        random.shuffle(train_insts)
        train_insts = train_insts[:conf.train_num]
    if is_training:
        learn_from_insts(conf, conf.num_epochs, train_insts)

    print(opt.mode)
def read_parse_write(elmo, infile, outfile,):
    """Embed every sentence of a CoNLL file and pickle one array per sentence.

    For each instance the tokens returned by ``embed_sent`` are converted to
    NumPy arrays and joined along a new leading axis, so each sentence array
    has one row per token. The list of sentence arrays is pickled to
    ``outfile``.

    Args:
        elmo: Embedder object, forwarded unchanged to ``embed_sent``.
        infile: Path of the CoNLL file to read.
        outfile: Path of the pickle file to write (opened in binary mode).
    """
    reader = Reader()
    insts = reader.read_conll(infile, -1, True)
    # The context manager closes the file even when embedding or pickling
    # raises, which the previous open()/close() pair did not guarantee.
    with open(outfile, 'wb') as f:
        all_vecs = []
        for inst in insts:
            sent = embed_sent(elmo, inst.input.words)
            token_rows = [
                np.expand_dims(token.embedding.numpy(), axis=0)
                for token in sent
            ]
            all_vecs.append(np.concatenate(token_rows))
        pickle.dump(all_vecs, f)
# 
# @author: Allan
#

from config.reader import Reader

# Script setup: read one CoNLL file at import time and create the empty
# dictionaries used by the rest of the script.
file = "data/ontonotes/train.sd.conllx"
# Passed to Reader; False presumably leaves digits in the text untouched
# (TODO confirm against Reader).
digit2zero = False
reader = Reader(digit2zero)

# Second argument -1 matches how the full training set is requested elsewhere;
# the meaning of the trailing True is defined by Reader.read_conll.
insts = reader.read_conll(file, -1, True)
# devs = reader.read_conll(conf.dev_file, conf.dev_num, False)
# tests = reader.read_conll(conf.test_file, conf.test_num, False)

# The dictionaries below start empty; judging by their names they are
# presumably filled with counts / indices further down (not visible here).
out_dep_label2num = {}

out_doubledep2num = {}

out_word2num = {}

label2idx = {}

def not_entity(label:str):
    """Return True unless *label* starts with the ``B-`` or ``I-`` prefix."""
    return not label.startswith(("B-", "I-"))

def is_entity(label:str):
    """Return True when *label* starts with the ``B-`` or ``I-`` prefix."""
    return label.startswith(("B-", "I-"))
Beispiel #6
0
def main():
    """Parse the command line, prepare two datasets jointly, train or test.

    Two train/dev/test triples are read (tagged 0 and 1 in the second
    argument of ``read_conll``), merged per split, indexed through the shared
    ``conf``, and the per-dataset label tables are copied onto
    ``conf_conll`` / ``conf_ontonotes`` before training or testing.
    """
    print('Reading arguments')
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(arg_parser)
    conf = Config(opt)
    conf_conll = Config_conll(opt)
    conf_ontonotes = Config_ontonotes(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    # First dataset (tag 0).
    trains_0 = reader.read_conll(conf.train_file_1, 0, conf.train_num, True)
    devs_0 = reader.read_conll(conf.dev_file_1, 0, conf.dev_num, False)
    tests_0 = reader.read_conll(conf.test_file_1, 0, conf.test_num, False)

    # Second dataset (tag 1).
    trains_1 = reader.read_conll(conf.train_file_2, 1, conf.train_num, True)
    devs_1 = reader.read_conll(conf.dev_file_2, 1, conf.dev_num, False)
    tests_1 = reader.read_conll(conf.test_file_2, 1, conf.test_num, False)

    trains_all = trains_0 + trains_1
    devs_all = devs_0 + devs_1
    tests_all = tests_0 + tests_1

    if conf.context_emb != ContextEmb.none:
        print('Loading the elmo vectors for all datasets.')
        vec_suffix = "." + conf.context_emb.name + ".vec"
        # NOTE(review): the vector files are derived from the *_file_1 paths
        # but attached to the *_1 instance lists, which were read from the
        # *_file_2 paths -- confirm this pairing is intended.
        conf.context_emb_size = reader.load_elmo_vec(
            conf.train_file_1 + vec_suffix, trains_1)
        reader.load_elmo_vec(conf.dev_file_1 + vec_suffix, devs_1)
        reader.load_elmo_vec(conf.test_file_1 + vec_suffix, tests_1)

    for split in (trains_all, devs_all, tests_all):
        conf.use_iobes(split)
    conf.build_label_idx(trains_all)

    conf.build_word_idx(trains_all, devs_all, tests_all)
    conf.build_emb_table()

    # Return values are not used afterwards; the calls are kept for their
    # effect on the instances.
    for split in (trains_all, devs_all, tests_all):
        conf.map_insts_ids(split)

    # Hand each dataset-specific config its own label tables.
    conf_conll.label_size = conf.label_size_0
    conf_conll.label2idx = conf.label2idx_0
    conf_conll.idx2labels = conf.idx2labels_0
    conf_ontonotes.label_size = conf.label_size_1
    conf_ontonotes.label2idx = conf.label2idx_1
    conf_ontonotes.idx2labels = conf.idx2labels_1

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode != "train":
        # Load the trained model.
        test_model(conf, tests_all)
    else:
        learn_from_insts(conf, conf_conll, conf_ontonotes, conf.num_epochs,
                         trains_all, devs_all, tests_all)

    print(opt.mode)
Beispiel #7
0
                if curr_entity.startswith("B-"):
                    if next_entity.startswith("O") or next_entity.startswith("B-"):
                        output[pos] = curr_entity.replace("B-", "S-")
                elif curr_entity.startswith("I-"):
                    if next_entity.startswith("O") or next_entity.startswith("B-"):
                        output[pos] = curr_entity.replace("I-", "E-")


# Script setup: read the train, dev and test splits of one dataset into a
# single instance list and convert it with use_iobes.
dataset = "ontonotes_chinese"
# Paths are relative to the parent directory of the working directory.
train = "../data/"+dataset+"/train.sd.conllx"
dev = "../data/"+dataset+"/dev.sd.conllx"
test = "../data/"+dataset+"/test.sd.conllx"
digit2zero = False
reader = Reader(digit2zero)

# -1 as the second argument matches how the full split is requested elsewhere;
# the trailing boolean differs between train and dev/test (semantics defined
# by Reader.read_conll).
insts = reader.read_conll(train, -1, True)
insts += reader.read_conll(dev, -1, False)
insts += reader.read_conll(test, -1, False)
use_iobes(insts)
# NOTE(review): the purpose of L is not visible in this snippet -- presumably
# a length/window limit used further down; confirm against the full script.
L = 3


def get_spans(output):
    """Collect entity spans from a sequence of tag strings.

    A tag starting with ``B-`` records the start index; a tag starting with
    ``E-`` closes the span and adds ``Span(start, end, label)`` to the result
    set, where ``label`` is the tag without its two-character prefix.

    NOTE(review): the visible snippet ends without a ``return`` statement and
    without handling other prefixes (e.g. single-token tags), so it is
    presumably truncated -- verify against the complete source.
    """
    output_spans = set()
    # -1 marks "no span opened yet"; it is not reset after a span is closed.
    start = -1
    for i in range(len(output)):
        if output[i].startswith("B-"):
            start = i
        if output[i].startswith("E-"):
            end = i
            # output[i][2:] strips the "E-" prefix to leave the entity label.
            output_spans.add(Span(start, end, output[i][2:]))
Beispiel #8
0
        return list(self.nonterms_iter())

    def __eq__(self, other):
        """Return True when *other* has the same ``pos`` and ``children``.

        A falsy *other* (e.g. ``None``) compares unequal and yields ``False``
        rather than the operand itself. An object that lacks ``pos`` or
        ``children`` yields ``NotImplemented`` so Python can fall back to its
        default comparison instead of raising ``AttributeError``.
        """
        if not other:
            return False
        if not (hasattr(other, "pos") and hasattr(other, "children")):
            return NotImplemented
        return bool(self.pos == other.pos and self.children == other.children)

    def __hash__(self):
        """Return a hash built from ``pos`` and the current children.

        ``children`` is converted to a tuple first: it is presumably a list
        (the class offers ``sort_children``), and hashing a list raises
        ``TypeError``. If it is already a tuple the hash value is unchanged.
        The hash reflects the children at call time, so it changes when
        children are added or reordered.
        """
        return hash((self.pos, tuple(self.children)))


if __name__ == "__main__":
    '''###read the tree
    '''

    from config.reader import Reader
    reader = Reader()
    insts = reader.read_conll("../data/abc/train.conllx", number=1)

    for inst in insts:
        nodes = [Tree(pos) for pos in range(len(inst.input.words))]
        root = Tree(-1)
        for pos, head in enumerate(inst.input.heads):
            if head != -1:
                nodes[head].add_child(nodes[pos])
            else:
                root.add_child(nodes[pos])
        inst.nodes = nodes
        for node in nodes:
            node.sort_children()
        print(root.leaves())
        for pos, node in enumerate(nodes):
            if node.is_leaf():