    def __init__(self, fpath, IsTest=False):

        directory = pathlib.Path(fpath)
        dataset = make_dataset(directory)
        if IsTest:
            words, tags, ids = make_bert_testset(dataset)
        else:
            words, tags, ids = make_bert_dataset(dataset)
        flat_words, flat_tags, flat_ids = [], [], []
        for article_w, article_t, article_id in zip(words, tags, ids):
            for sentence, tag, id in zip(article_w, article_t, article_id):
                flat_words.append(sentence)
                flat_tags.append(tag)
                flat_ids.append(id)

        sents, ids = [], []
        tags_li = [[] for _ in range(num_task)]

        for word, tag, id in zip(flat_words, flat_tags, flat_ids):
            words = word
            tags = tag

            ids.append([id])
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tmp_tags = []

            if num_task != 2:  # token-level tagging only (no sentence-level task)
                for i in range(num_task):
                    tmp_tags.append(['O'] * len(tags))
                    for j, tag in enumerate(tags):
                        if tag != 'O' and tag in VOCAB[i]:
                            tmp_tags[i][j] = tag
                    tags_li[i].append(["<PAD>"] + tmp_tags[i] + ["<PAD>"])
            else:  # num_task == 2: token tags plus a sentence-level Prop/Non-prop task
                tmp_tags.append(['O'] * len(tags))
                tmp_tags.append(['Non-prop'])
                for j, tag in enumerate(tags):
                    if tag != 'O' and tag in VOCAB[0]:
                        tmp_tags[0][j] = tag
                        tmp_tags[1] = ['Prop']
                for i in range(num_task):
                    tags_li[i].append(["<PAD>"] + tmp_tags[i] + ["<PAD>"])

        self.sents, self.ids, self.tags_li = sents, ids, tags_li
        assert len(sents) == len(ids) == len(tags_li[0])
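
# The __init__ above (and the variants that follow) lean on names defined at
# module level rather than inside the snippet: pathlib, make_dataset,
# make_bert_dataset, make_bert_testset, num_task (spelled NUM_TASK in the
# second variant), VOCAB, and, further below, tag2idx/idx2tag and params.
# A minimal sketch of how the tag-related globals might be wired up is shown
# here; the concrete tag inventory is an assumption for illustration, not the
# project's actual label set.
import pathlib  # used by pathlib.Path(fpath) above

num_task = 2  # 1 = token-level tagging only, 2 = add a sentence-level task
VOCAB = (
    ("<PAD>", "O", "CD", "ST"),       # token-level technique tags (assumed)
    ("<PAD>", "Non-prop", "Prop"),    # sentence-level tags
)
tag2idx = [{tag: i for i, tag in enumerate(v)} for v in VOCAB]
idx2tag = [{i: tag for i, tag in enumerate(v)} for v in VOCAB]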
Example #2
    def __init__(self, directory, is_test=False, verbose=False):
        dataset = make_dataset(directory)
        words, tags, ids = make_bert_dataset(dataset,
                                             is_test=is_test,
                                             verbose=verbose)
        # (
        #     [ [ ['first_word', 'second_word', ...], ... ], ... ],
        #     [ [ ['label1', 'label2', ...], ... ], ... ],
        #     [ [ id1, id2, ... ], ... ]
        # )

        flat_ids, flat_sents = [], []
        tags_li = [[] for _ in range(NUM_TASK)]
        for article_words, article_tags, article_ids in zip(words, tags, ids):
            for inner_words, inner_tags, id_ in zip(article_words,
                                                    article_tags, article_ids):
                flat_sents.append(["[CLS]"] + inner_words + ["[SEP]"])
                flat_ids.append(id_)

                tmp_tags = []
                if NUM_TASK == 1:  # technique classification
                    tmp_tags.append(['O'] * len(inner_tags))
                    for j, inner_tag in enumerate(inner_tags):
                        if inner_tag != 'O' and inner_tag in VOCAB[0]:
                            tmp_tags[0][j] = inner_tag
                    tags_li[0].append(["<PAD>"] + tmp_tags[0] + ["<PAD>"])
                else:  # sentence classification
                    tmp_tags.append(['O'] * len(inner_tags))
                    tmp_tags.append(['Non-prop'])
                    for j, inner_tag in enumerate(inner_tags):
                        if inner_tag != 'O' and inner_tag in VOCAB[0]:
                            tmp_tags[0][j] = inner_tag
                            tmp_tags[1] = ['Prop']
                    for i in range(NUM_TASK):
                        tags_li[i].append(["<PAD>"] + tmp_tags[i] + ["<PAD>"])

        self.sents, self.ids, self.tags_li = flat_sents, flat_ids, tags_li
        assert len(self.sents) == len(self.ids) == len(self.tags_li[0])
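
    # A dataset built this way is usually consumed through __len__/__getitem__;
    # a minimal sketch of what that pair could look like, based only on the
    # attributes set above (self.sents, self.ids, self.tags_li). This is an
    # illustration, not the project's actual implementation.
    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        words = self.sents[idx]                                 # tokens incl. [CLS]/[SEP]
        tags = [task_tags[idx] for task_tags in self.tags_li]   # one tag list per task
        return words, tags, self.ids[idx]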
Example #3

    def __init__(self, fpath, IsTest=False):

        directory = fpath
        dataset = make_dataset(directory)
        if IsTest:
            words, tags, ids = make_bert_testset(dataset)
        else:
            words, tags, ids = make_bert_dataset(dataset)
        flat_words, flat_tags, flat_ids, changed_ids = [], [], [], []

        count = 0
        not_dropped = 0

        for article_w, article_t, article_id in zip(words, tags, ids):
            for sentence, tag, id in zip(article_w, article_t, article_id):

                # Normalize each tag through tag2idx/idx2tag, flattening the
                # article-wise groupings into plain per-sentence lists.
                changed = [idx2tag[0][tag2idx[0][temp_tag]]
                           for temp_tag in tag]

                # Drop sentences whose tags are all 'O' (no propaganda spans).
                if set(changed) == {'O'}:
                    count += 1
                    continue

                else:
                    not_dropped += 1
                    flat_words.append(sentence)
                    changed_ids.append(changed)
                    flat_tags.append(changed)
                    flat_ids.append(id)
        print("{} sentences dropped".format(count))
        print("{} sentences NOT dropped".format(not_dropped))

        # print("sentence is {} \n tag is {} \n id is {} \n changed_ids is {}".format(
        #     flat_words[:2], flat_tags[:2], flat_ids[:2], changed_ids[:2]))

        sents, ids = [], []
        tags_li = [[] for _ in range(num_task)]

        if params.dummy_run:
            # Keep only the first sentence for a quick dummy/debug run.
            flat_words = [flat_words[0]]
            flat_tags = [flat_tags[0]]
            flat_ids = [flat_ids[0]]

        for word, tag, id in zip(flat_words, flat_tags, flat_ids):
            words = word
            tags = tag

            ids.append([id])
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tmp_tags = []

            # Build the per-task tag sequences, adding <PAD> entries for the
            # [CLS] and [SEP] tokens.

            if num_task != 2:  # token-level tagging only (no sentence-level task)
                for i in range(num_task):
                    tmp_tags.append(['O'] * len(tags))
                    for j, tag in enumerate(tags):
                        # Note: unlike the other variants, only the "CD" and
                        # "ST" tags are kept here; everything else stays 'O'.
                        if tag != 'O' and tag in ["CD", "ST"]:
                            tmp_tags[i][j] = tag
                    tags_li[i].append(["<PAD>"] + tmp_tags[i] + ["<PAD>"])
            else:  # num_task == 2: token tags plus a sentence-level Prop/Non-prop task
                tmp_tags.append(['O'] * len(tags))
                tmp_tags.append(['Non-prop'])
                for j, tag in enumerate(tags):
                    if tag != 'O' and tag in VOCAB[0]:
                        tmp_tags[0][j] = tag
                        tmp_tags[1] = ['Prop']
                for i in range(num_task):
                    tags_li[i].append(["<PAD>"] + tmp_tags[i] + ["<PAD>"])

        self.sents, self.ids, self.tags_li = sents, ids, tags_li
        assert len(sents) == len(ids) == len(tags_li[0])
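
# Rough usage sketch. The enclosing class name is not shown in this snippet,
# so "PropDataset" is a placeholder, and the data path is illustrative; the
# attributes accessed (sents, ids, tags_li) are the ones set in __init__ above.
train_set = PropDataset("data/train", IsTest=False)  # hypothetical name and path
print(len(train_set.sents), "sentences kept after dropping all-'O' ones")
print(train_set.sents[0][:5])       # e.g. ['[CLS]', <first tokens of the sentence>, ...]
print(train_set.tags_li[0][0][:5])  # matching token-level tags, <PAD>-padded at both ends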