# required imports (module level)
import torch


def collate_fn(self, samples):
    tokenizer, TAGS = self.tokenizer, self.tags
    CLS, SEP, PAD = tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id

    ## text tokenized (no labels in this inference-time variant)
    b_docs, b_indices, b_ids, b_labels, b_masks = [], [], [], [], []  # b_labels stays empty
    for sample in samples:
        doc, index = sample['doc'], sample['index']
        text, p_text = sample['text'], sample['p_text']

        # string cleaning
        text = clean_str(text)
        p_text = clean_str(p_text)

        # text to tokens (drop the tokenizer's own [CLS]/[SEP])
        text_ids = tokenizer.encode(text)[1:-1]
        p_text_ids = tokenizer.encode(p_text)[1:-1]

        # input ids: [CLS] text [SEP] p_text [SEP]; mask is True only on text tokens
        ids = [CLS] + text_ids + [SEP] + p_text_ids + [SEP]
        masks = ([[False]]
                 + [[True] for _ in range(len(text_ids))]
                 + [[False] for _ in range(len(ids) - len(text_ids) - 1)])

        b_docs.append(doc)
        b_indices.append(index)
        b_ids.append(ids)
        b_masks.append(masks)

    ## pad to the same length (truncate at the 512-token model limit)
    max_len = min(max(len(s) for s in b_ids), 512)
    for i, (ids, masks) in enumerate(zip(b_ids, b_masks)):
        ids = ids[:max_len]
        ids += [PAD] * (max_len - len(ids))
        b_ids[i] = ids
        masks = masks[:max_len]
        masks += [[False]] * (max_len - len(masks))
        b_masks[i] = masks

    return b_docs, b_indices, torch.tensor(b_ids), b_labels, torch.tensor(b_masks), samples
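# For context, a collate_fn like the one above is normally wired into a
# torch.utils.data.DataLoader. A minimal usage sketch, assuming `ds` is an
# instance of the class that defines collate_fn (the names `ds` and the
# batch size are illustrative, not from the repo):
from torch.utils.data import DataLoader

loader = DataLoader(ds, batch_size=16, shuffle=False, collate_fn=ds.collate_fn)
for b_docs, b_indices, b_ids, b_labels, b_masks, raw_samples in loader:
    # b_ids is a (batch, max_len) tensor; b_masks is (batch, max_len, 1)
    print(b_ids.shape, b_masks.shape)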
# required imports (module level)
import sys

import nltk
from nltk.corpus import stopwords

# `dataset` (the corpus name) and `datasets` (the list of supported corpora)
# are assumed to be defined earlier in the script
if dataset not in datasets:
    sys.exit("wrong dataset name")

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

doc_content_list = []
with open('../data/corpus/' + dataset + '.txt', 'rb') as f:
    for line in f.readlines():
        doc_content_list.append(line.strip().decode('latin1'))

word_freq = {}  # word frequencies, used to remove rare words
for doc_content in doc_content_list:
    temp = clean_str(doc_content)
    words = temp.split()
    for word in words:
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1

clean_docs = []
for doc_content in doc_content_list:
    temp = clean_str(doc_content)
    words = temp.split()
    doc_words = []
    for word in words:
        # keep every word for 'mr'; otherwise drop stop words and words seen < 5 times
        if dataset == 'mr':
            doc_words.append(word)
        elif word not in stop_words and word_freq[word] >= 5:
            doc_words.append(word)
    clean_docs.append(' '.join(doc_words).strip())
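# clean_str is called above but not defined in this excerpt. A minimal sketch
# of a compatible normalizer (an assumption about its behavior, not
# necessarily the repo's exact implementation): strip unusual characters,
# separate basic punctuation from words, collapse whitespace, lowercase.
import re


def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)
    string = re.sub(r"'s", " 's", string)
    string = re.sub(r"'ve", " 've", string)
    string = re.sub(r"n't", " n't", string)
    string = re.sub(r"([(),!?])", r" \1 ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()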
# required imports (module level)
import re
import unicodedata

import numpy as np
import torch


def collate_fn(self, samples):
    tokenizer, TAGS = self.tokenizer, self.tags
    CLS, SEP, PAD = tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id

    ## text tokenized, labels vectorized
    b_ids, b_labels, b_masks = [], [], []
    for sample in samples:
        text, p_text, tags, values = (sample['text'], sample['p_text'],
                                      sample['tags'], sample['values'])

        # string cleaning; tags may be NaN for unlabeled rows
        text = clean_str(text)
        p_text = clean_str(p_text)
        tags = (unicodedata.normalize("NFKC", re.sub(r'\*|\s+', '', tags))
                if tags is not np.nan else tags)
        values = clean_str(values)

        # text to tokens (drop the tokenizer's own [CLS]/[SEP])
        text_ids = tokenizer.encode(text)[1:-1]
        p_text_ids = tokenizer.encode(p_text)[1:-1]

        # input ids: [CLS] text [SEP] p_text [SEP]
        ids = [CLS] + text_ids + [SEP] + p_text_ids + [SEP]
        # one multi-hot label row per token; mask is True only on text tokens
        labels = [[0] * len(TAGS) for _ in range(len(ids))]
        masks = ([[False]]
                 + [[True] for _ in range(len(text_ids))]
                 + [[False] for _ in range(len(ids) - len(text_ids) - 1)])

        # assign labels: mark each value's token span under its tag
        if isinstance(tags, str):
            for tag, value in zip(tags.split(';'), str(values).split(';')):
                tag = unicodedata.normalize("NFKC", re.sub(r'\*|\s+', '', tag))
                value_ids = tokenizer.encode(value)[1:-1]
                pivot = sub_idx_finder(text_ids, value_ids, tag)
                if pivot is not None:
                    for k in range(len(value_ids)):
                        labels[1 + pivot + k][TAGS.index(tag)] = 1  # +1 skips [CLS]
                else:
                    print("\t[ERROR] pivot not found")

        b_ids.append(ids)
        b_labels.append(labels)
        b_masks.append(masks)

    ## pad to the same length (truncate at the 512-token model limit)
    max_len = min(max(len(s) for s in b_ids), 512)
    for i, (ids, labels, masks) in enumerate(zip(b_ids, b_labels, b_masks)):
        ids = ids[:max_len]
        ids += [PAD] * (max_len - len(ids))
        b_ids[i] = ids
        labels = labels[:max_len]
        labels += [[0] * len(TAGS) for _ in range(max_len - len(labels))]
        b_labels[i] = labels
        masks = masks[:max_len]
        masks += [[False]] * (max_len - len(masks))
        b_masks[i] = masks

    return torch.tensor(b_ids), torch.tensor(b_labels), torch.tensor(b_masks)
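# sub_idx_finder is not shown in this excerpt. From the call site it should
# return the start offset of value_ids inside text_ids, or None when the span
# is absent. A sketch under that assumption; the `tag` argument is kept only
# for parity with the call (it may be used for logging in the real helper):
def sub_idx_finder(seq, sub, tag=None):
    # naive contiguous-subsequence search: first index i such that
    # seq[i:i+len(sub)] == sub, or None if no such window exists
    if not sub:
        return None
    for i in range(len(seq) - len(sub) + 1):
        if seq[i:i + len(sub)] == sub:
            return i
    return None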