Example #1
    def collate_fn(self, samples):
        tokenizer, TAGS = self.tokenizer, self.tags

        CLS, SEP, PAD = tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id

        ## text tokenized, label vectorized (labels are not built in this variant)
        b_docs, b_indexs, b_ids, _, b_masks = [], [], [], [], []  # `_` keeps the unused label slot
        for sample in samples:
            #text, p_text, tags, values = sample['text'],sample['p_text'],sample['tags'],sample['values']
            doc, index = sample['doc'], sample['index']
            text, p_text = sample['text'], sample['p_text']

            # string cleaning
            text = clean_str(text)
            p_text = clean_str(p_text)
            #tags = unicodedata.normalize("NFKC", re.sub('*|\*|\s+', '', tags)) if tags is not np.nan else tags
            #values = clean_str(values)

            # text to tokens
            text_ids = tokenizer.encode(text)[1:-1]
            p_text_ids = tokenizer.encode(p_text)[1:-1]

            # input, output, mask
            ids = [CLS] + text_ids + [SEP] + p_text_ids + [SEP]
            masks = [[False]] + [[True] for i in range(len(text_ids))] + [
                [False] for i in range(len(ids) - len(text_ids) - 1)
            ]

            b_docs.append(doc)
            b_indexs.append(index)
            b_ids.append(ids)
            b_masks.append(masks)

        ## pad to same length
        max_len = min([max([len(s) for s in b_ids]), 512])
        for i, (ids, masks) in enumerate(zip(b_ids, b_masks)):
            ids = ids[:max_len]
            ids += [PAD] * (max_len - len(ids))
            b_ids[i] = ids

            #             labels = labels[:max_len]
            #             labels += [[0 for j in range(len(TAGS))] for k in range(max_len-len(labels))]
            #             b_labels[i] = labels

            masks = masks[:max_len]
            masks += [[False]] * (max_len - len(masks))
            b_masks[i] = masks

        return b_docs, b_indexs, torch.tensor(b_ids), _, torch.tensor(
            b_masks), samples
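
For context, a minimal usage sketch of how a collate_fn like the one above is typically handed to a torch DataLoader. The dataset class name and its constructor arguments are assumptions for illustration and are not part of the original excerpt; only the DataLoader wiring and the unpacked return values reflect the code above.

# Hypothetical usage sketch: `TaggingDataset`, `samples`, `tokenizer`, and `TAGS`
# are assumed names, not taken from the original excerpt.
from torch.utils.data import DataLoader

dataset = TaggingDataset(samples, tokenizer=tokenizer, tags=TAGS)  # assumed class
loader = DataLoader(dataset, batch_size=16, shuffle=False,
                    collate_fn=dataset.collate_fn)

for b_docs, b_indexs, b_ids, _, b_masks, raw_samples in loader:
    # b_ids:   (batch, max_len) LongTensor of token ids
    # b_masks: (batch, max_len, 1) BoolTensor marking first-segment tokens
    pass
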
Example #2
import sys

import nltk
from nltk.corpus import stopwords

# `dataset`, `datasets`, and clean_str() are defined earlier in the original script
if dataset not in datasets:
    sys.exit("wrong dataset name")

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

doc_content_list = []

with open('../data/corpus/' + dataset + '.txt', 'rb') as f:
    for line in f.readlines():
        doc_content_list.append(line.strip().decode('latin1'))

word_freq = {}  # to remove rare words

for doc_content in doc_content_list:
    temp = clean_str(doc_content)
    words = temp.split()
    for word in words:
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1

clean_docs = []
for doc_content in doc_content_list:
    temp = clean_str(doc_content)
    words = temp.split()
    doc_words = []
    for word in words:
        # word not in stop_words and word_freq[word] >= 5
        if dataset == 'mr':
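
The excerpt ends inside the filtering loop. Based on the comment above ("word not in stop_words and word_freq[word] >= 5"), a hedged sketch of how such a loop is commonly completed (not the original code) follows: keep every word for the 'mr' dataset, otherwise drop stop words and words seen fewer than five times.

# Hedged sketch, not the original code: completes the filtering loop along the
# lines suggested by the comment above.
clean_docs = []
for doc_content in doc_content_list:
    words = clean_str(doc_content).split()
    doc_words = []
    for word in words:
        if dataset == 'mr':
            doc_words.append(word)  # keep every word for 'mr'
        elif word not in stop_words and word_freq[word] >= 5:
            doc_words.append(word)  # drop stop words and rare words
    clean_docs.append(' '.join(doc_words))
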
Example #3
    def collate_fn(self, samples):
        tokenizer, TAGS = self.tokenizer, self.tags

        CLS, SEP, PAD = tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id

        ## text tokenized, label vectorized
        b_ids, b_labels, b_masks = [], [], []
        for sample in samples:
            text, p_text = sample['text'], sample['p_text']
            tags, values = sample['tags'], sample['values']

            # string cleaning
            text = clean_str(text)
            p_text = clean_str(p_text)
            tags = unicodedata.normalize("NFKC", re.sub(
                '*|\*|\s+', '', tags)) if tags is not np.nan else tags
            values = clean_str(values)

            # text to tokens
            text_ids = tokenizer.encode(text)[1:-1]
            p_text_ids = tokenizer.encode(p_text)[1:-1]

            # input, output, mask
            ids = [CLS] + text_ids + [SEP] + p_text_ids + [SEP]
            labels = [[0 for i in range(len(TAGS))] for j in range(len(ids))]
            masks = [[False]] + [[True] for i in range(len(text_ids))] + [
                [False] for i in range(len(ids) - len(text_ids) - 1)
            ]

            # assign label
            if isinstance(tags, str):
                for tag, value in zip(tags.split(';'), str(values).split(';')):
                    tag = unicodedata.normalize("NFKC",
                                                re.sub('*|\*|\s+', '', tag))

                    value_ids = tokenizer.encode(value)[1:-1]
                    pivote = sub_idx_finder(text_ids, value_ids, tag)
                    if pivote is not None:
                        for k in range(len(value_ids)):
                            labels[1 + pivote + k][TAGS.index(tag)] = 1
                    else:
                        print("\t[ERROR] pivote not found ")
            b_ids.append(ids)
            b_labels.append(labels)
            b_masks.append(masks)

        ## pad to same length
        max_len = min([max([len(s) for s in b_ids]), 512])
        for i, (ids, labels, masks) in enumerate(zip(b_ids, b_labels,
                                                     b_masks)):
            ids = ids[:max_len]
            ids += [PAD] * (max_len - len(ids))
            b_ids[i] = ids

            labels = labels[:max_len]
            labels += [[0 for j in range(len(TAGS))]
                       for k in range(max_len - len(labels))]
            b_labels[i] = labels

            masks = masks[:max_len]
            masks += [[False]] * (max_len - len(masks))
            b_masks[i] = masks

        return torch.tensor(b_ids), torch.tensor(b_labels), torch.tensor(
            b_masks)
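
The label-assignment step relies on sub_idx_finder, which is not shown in this excerpt. An illustrative sketch only (the original implementation may differ): a helper with this call signature that returns the start position of value_ids inside text_ids, or None when no match is found, which would explain the 1 + pivote + k offset above (the +1 skips the leading [CLS] token).

# Illustrative sketch of a sub_idx_finder-style helper; not the original code.
# The `tag` argument is kept only to match the call site above; its actual role
# in the original implementation is unknown.
def sub_idx_finder(text_ids, value_ids, tag=None):
    n, m = len(text_ids), len(value_ids)
    if m == 0 or m > n:
        return None
    for start in range(n - m + 1):
        # compare the token-id subsequence starting at `start`
        if text_ids[start:start + m] == value_ids:
            return start
    return None
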