import torch

# Shared helpers from PyTorch-NLP (torchnlp) used by the snippets below.
from torchnlp.encoders.text import (StaticTokenizerEncoder, pad_tensor,
                                    stack_and_pad_tensors)


def collate_fn_rnn(batch):
    # PyTorch RNNs expect batches to be transposed for speed and integration
    # with CUDA (the helper below is defined but not applied in this collate).
    transpose = lambda b: b.t_().squeeze(0).contiguous()

    # Shape tensors into the right format: pad every sentence to the longest
    # sentence in the batch, then pad documents to the same number of sentences.
    sents_len_batch = [[len(sent) for sent in doc["sents"]] for doc in batch]
    max_sent_len = max(max(s) for s in sents_len_batch)
    sents_batch, doc_lens_batch = stack_and_pad_tensors([
        torch.stack([pad_tensor(sent, max_sent_len) for sent in doc["sents"]])
        for doc in batch
    ])
    tags_batch, _ = stack_and_pad_tensors([doc["tags"] for doc in batch])

    # Move to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sents_batch = sents_batch.to(device)
    tags_batch = tags_batch.to(device)

    if "encoding" in batch[0]:  # add document encoding if applicable
        encoding_batch = torch.stack([doc["encoding"] for doc in batch]).to(device)
        return (sents_batch, tags_batch, encoding_batch)

    # return (word_ids_batch, seq_len_batch, label_batch)
    return (sents_batch, tags_batch, None)
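# Minimal illustrative sketch (not part of the original code): build a toy
# two-document batch by hand and run it through collate_fn_rnn to inspect the
# padded shapes. Each document is assumed to be a dict with "sents" (a list of
# LongTensor word-id sequences) and "tags" (a LongTensor of labels).
def _demo_collate_fn_rnn():
    doc_a = {
        "sents": [torch.LongTensor([5, 6, 7]), torch.LongTensor([8, 9])],
        "tags": torch.LongTensor([1, 0]),
    }
    doc_b = {
        "sents": [torch.LongTensor([4, 3])],
        "tags": torch.LongTensor([2]),
    }
    sents, tags, encoding = collate_fn_rnn([doc_a, doc_b])
    # sents: (num_docs, max_sents_per_doc, max_sent_len) -> (2, 2, 3)
    # tags:  (num_docs, max_sents_per_doc)               -> (2, 2)
    print(sents.shape, tags.shape, encoding)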
def parse_dataset(path, label_to_idx, word_to_idx, pos_target=False,
                  pad_len=None, encoding='latin-1', max_len=100):
    sentences = []
    UNK = 3
    PAD = 1
    target_index = 1 if pos_target else 3
    nr_long = 0
    max_sus = 0

    with open(path, encoding=encoding) as f:
        sample = {'word_ids': [], 'labels': []}
        max_len_token = 0
        for line in f.read().splitlines():
            if line in ['\n', '\r\n', '']:
                # end of sequence
                if len(sample['labels']) > 100:
                    nr_long += 1
                if (len(sample['labels']) > 0) and (len(sample['word_ids']) < max_len):
                    max_sus = max(max_sus, len(sample['word_ids']))
                    sample['labels'] = torch.LongTensor(sample['labels'])
                    sentences.append(sample)
                sample = {'word_ids': [], 'labels': []}
                continue
            else:
                ls = line.split()
                max_len_token = max(max_len_token, len(ls[4:]))
                word = ls[4:]
                label = ls[target_index]
                if len(word) > 0:
                    # unknown subword units map to index 3 -> <unk>
                    word_ids = [
                        word_to_idx[w] if w in word_to_idx else UNK for w in word
                    ]
                    sample['word_ids'].append(torch.LongTensor(word_ids))
                    sample['labels'].append(label_to_idx[label])
                    if len(word_ids) > 20:
                        print(line)

    # pad all BPE encodings to the maximum length in the dataset
    if pad_len is not None:
        max_len_token = max(pad_len, max_len_token)
    for s in range(len(sentences)):
        sen = sentences[s]
        for i in range(len(sen['word_ids'])):
            sen['word_ids'][i] = pad_tensor(sen['word_ids'][i],
                                            length=max_len_token,
                                            padding_index=PAD)
        # stack word ids back together
        sen['word_ids'] = torch.stack(sen['word_ids'], dim=0).view(-1)

    print('max nr of SUs in sentence: {}'.format(max_sus))
    print('Number of long sentences: {}'.format(nr_long))
    # `Dataset` is an external wrapper class provided elsewhere in the project.
    return Dataset(sentences), max_len_token
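# Standalone sketch of the per-sentence step above (illustrative only): each
# sentence is a list of variable-length BPE-id tensors; every token is padded
# to the same subword length and the sentence is flattened into one id vector.
def _demo_bpe_pad_and_flatten():
    word_ids = [torch.LongTensor([12, 47]),        # token split into 2 BPE units
                torch.LongTensor([8]),             # single BPE unit
                torch.LongTensor([31, 5, 99])]     # token split into 3 BPE units
    max_len_token = max(len(t) for t in word_ids)
    padded = [pad_tensor(t, length=max_len_token, padding_index=1) for t in word_ids]
    flat = torch.stack(padded, dim=0).view(-1)
    print(flat)  # tensor([12, 47,  1,  8,  1,  1, 31,  5, 99])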
import pandas as pd


def generate_encodings(self, data, labels):
    # Build a whitespace tokenizer over the corpus, encode every document,
    # and pad each encoding to a fixed length of 10000 tokens.
    encoder = StaticTokenizerEncoder(data,
                                     tokenize=lambda s: s.split(),
                                     min_occurrences=3)
    encoded_data = [encoder.encode(document) for document in data]
    encoded_data = [pad_tensor(x, length=10000) for x in encoded_data]
    data = {'labels': labels, 'inputs': encoded_data}
    return pd.DataFrame(data=data)
def tokenize_pos_tags(X_tags, tag_to_index, max_sen_len=800):
    """One-hot encodes POS tags.

    :param X_tags: list of per-sentence lists of POS-tag indices
    :param tag_to_index: mapping from POS tag to integer index
    :param max_sen_len: length every sentence is padded to
    :return: one-hot encoded tensor of shape (n_sentences, max_sen_len, n_tags)
    """
    return torch.nn.functional.one_hot(
        torch.stack([pad_tensor(torch.LongTensor(lst), max_sen_len) for lst in X_tags]),
        num_classes=max(tag_to_index.values()) + 1,
    )
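# Illustrative check (not in the original code): with a toy tag_to_index the
# padded one-hot tensor has shape (n_sentences, max_sen_len, n_tags). Padding
# positions get the default index 0, which collides with a real tag unless
# index 0 is reserved for padding, as it is in this toy mapping.
def _demo_tokenize_pos_tags():
    tag_to_index = {'<pad>': 0, 'NOUN': 1, 'VERB': 2, 'DET': 3}
    X_tags = [[3, 1, 2], [1, 2]]          # two sentences of tag indices
    one_hot = tokenize_pos_tags(X_tags, tag_to_index, max_sen_len=4)
    print(one_hot.shape)                   # torch.Size([2, 4, 4])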
import jieba


def preprocess_request(sentence, start_sign, end_sign, token, max_length):
    # Segment the request with jieba, add the start/end markers via
    # `preprocess_sentence` (defined elsewhere in the project), then map tokens
    # to ids (3 is the out-of-vocabulary id), truncate, and pad to max_length.
    sentence = " ".join(jieba.cut(sentence))
    sentence = preprocess_sentence(start_sign, end_sign, sentence)
    inputs = [token.get(i, 3) for i in sentence.split(' ')]
    inputs = torch.tensor(inputs)
    inputs = [
        pad_tensor(tensor=inputs[:max_length], length=max_length, padding_index=0)
    ]
    inputs = stack_and_pad_tensors(inputs)[0]
    dec_input = torch.unsqueeze(torch.tensor([token[start_sign]]), 0)
    return inputs, dec_input
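# Isolated sketch of the id-mapping and truncate-then-pad step above
# (illustrative; the vocabulary and request are made up). Unknown tokens fall
# back to id 3 and the padded batch has shape (1, max_length).
def _demo_preprocess_request_padding(max_length=8):
    token = {'<start>': 1, '<end>': 2, 'hello': 10, 'world': 11}
    words = ['<start>', 'hello', 'there', 'world', '<end>']
    ids = torch.tensor([token.get(w, 3) for w in words])
    padded = pad_tensor(tensor=ids[:max_length], length=max_length, padding_index=0)
    batch = stack_and_pad_tensors([padded])[0]
    print(batch)   # tensor([[ 1, 10,  3, 11,  2,  0,  0,  0]])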
def pad_and_stack_list_of_list(list_of_list: list,
                               max_sentence_len=800,
                               pad_value=0,
                               tensor_type=torch.FloatTensor):
    """Pads every inner sequence to max_sentence_len and stacks the result.

    :param list_of_list: list of list of sequence values
    :param max_sentence_len: defaults to 800
    :param pad_value: defaults to 0
    :param tensor_type: defaults to torch.FloatTensor
    :return: stacked tensor of shape (len(list_of_list), max_sentence_len)
    """
    padded = [
        pad_tensor(tensor_type(lst), length=max_sentence_len, padding_index=pad_value)
        for lst in list_of_list
    ]
    stacked = torch.stack(padded)
    return stacked
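# Toy illustration (not part of the original code): ragged per-token feature
# lists are padded with -1 and stacked into a (n_sentences, max_sentence_len)
# float tensor, mirroring how the enriched features are built further below.
def _demo_pad_and_stack_list_of_list():
    is_title = [[1.0, 0.0, 0.0], [0.0, 1.0]]   # two sentences, ragged lengths
    stacked = pad_and_stack_list_of_list(is_title,
                                         max_sentence_len=4,
                                         pad_value=-1,
                                         tensor_type=torch.FloatTensor)
    print(stacked)
    # tensor([[ 1.,  0.,  0., -1.],
    #         [ 0.,  1., -1., -1.]])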
def collate_fn_transformer(batch):
    # Multi-GPU implementation: same padding logic as collate_fn_rnn above,
    # without the RNN-specific transpose helper.
    # Shape tensors into the right format.
    sents_len_batch = [[len(sent) for sent in doc["sents"]] for doc in batch]
    max_sent_len = max(max(s) for s in sents_len_batch)
    sents_batch, doc_lens_batch = stack_and_pad_tensors([
        torch.stack([pad_tensor(sent, max_sent_len) for sent in doc["sents"]])
        for doc in batch
    ])
    tags_batch, _ = stack_and_pad_tensors([doc["tags"] for doc in batch])

    # Move to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sents_batch = sents_batch.to(device)
    tags_batch = tags_batch.to(device)

    if "encoding" in batch[0]:  # add document encoding if applicable
        encoding_batch = torch.stack([doc["encoding"] for doc in batch]).to(device)
        return (sents_batch, tags_batch, encoding_batch)

    # return (word_ids_batch, seq_len_batch, label_batch)
    return (sents_batch, tags_batch, None)
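# Hypothetical wiring sketch (none of these names are defined in the original
# code): any map-style dataset that yields the document dicts described above
# can be batched with these collate functions through torch's DataLoader.
def _demo_collate_fn_in_dataloader():
    from torch.utils.data import DataLoader

    doc_dataset = [
        {"sents": [torch.LongTensor([5, 6, 7])], "tags": torch.LongTensor([1])},
        {"sents": [torch.LongTensor([8, 9])], "tags": torch.LongTensor([0])},
    ]
    loader = DataLoader(doc_dataset, batch_size=2, shuffle=False,
                        collate_fn=collate_fn_transformer)
    for sents, tags, encoding in loader:
        print(sents.shape, tags.shape, encoding)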
import json

import numpy as np
from torch.utils.data import DataLoader


def load_data(dict_fn, data_fn, batch_size, start_sign, end_sign, checkpoint_dir,
              max_length, max_train_data_size=0):
    """Loads the pre-tokenized training data, saving the vocabulary dictionary
    along the way so that other components can reuse it, and returns the
    prepared DataLoader together with the number of steps per epoch.

    Args:
        dict_fn: path where the training-data dictionary is saved for later use
        data_fn: path to the pre-tokenized training data
        batch_size: batch size
        start_sign: start-of-sequence marker
        end_sign: end-of-sequence marker
        checkpoint_dir: checkpoint directory (currently unused here)
        max_length: maximum sentence length
        max_train_data_size: maximum number of training samples (0 = no limit)

    Returns:
        loader: a PyTorch DataLoader over the padded input/target pairs
        steps_per_epoch: number of steps per epoch
    """
    print("Reading training data...")
    (input_lang, target_lang), diag_weight = read_tokenized_data(
        data_fn, start_sign, end_sign, max_train_data_size)
    diag_weight = torch.tensor(diag_weight, dtype=torch.float32)
    # Merge input and target to build a single shared vocabulary.
    lang = np.hstack((input_lang, target_lang))
    print("Done reading, formatting training data...")
    tokenizer = StaticTokenizerEncoder(sample=lang, tokenize=lambda x: x.split())
    # Convert the text sequences to token ids, truncate, and pad.
    input_data = [
        pad_tensor(tensor=tokenizer.encode(example)[:max_length],
                   length=max_length,
                   padding_index=0) for example in input_lang
    ]
    target_data = [
        pad_tensor(tensor=tokenizer.encode(example)[:max_length],
                   length=max_length,
                   padding_index=0) for example in target_lang
    ]
    input_tensor = stack_and_pad_tensors(input_data)[0]
    target_tensor = stack_and_pad_tensors(target_data)[0]

    print("Formatting done, assembling training data and saving the dictionary")
    # Store both directions (word -> id and id -> word) in one dictionary.
    word_index = {}
    vocab_list = tokenizer.vocab
    for i in range(tokenizer.vocab_size):
        word_index[vocab_list[i]] = i
        word_index[i] = vocab_list[i]

    with open(dict_fn, 'w', encoding='utf-8') as file:
        file.write(json.dumps(word_index, indent=4, ensure_ascii=False))
    print("Data dictionary saved!")

    dataset = PairDataset(input_tensor, target_tensor, diag_weight)
    loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True,
                        num_workers=2)
    steps_per_epoch = len(input_tensor) // batch_size

    return loader, steps_per_epoch
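# Toy walk-through of the encode -> truncate -> pad -> stack pipeline used in
# load_data (illustrative; the corpus files, read_tokenized_data and
# PairDataset are replaced with inline toy data).
def _demo_encode_pad_stack(max_length=5):
    corpus = ['<start> hello world <end>', '<start> hi <end>']
    tokenizer = StaticTokenizerEncoder(sample=corpus, tokenize=lambda x: x.split())
    data = [
        pad_tensor(tensor=tokenizer.encode(example)[:max_length],
                   length=max_length,
                   padding_index=0) for example in corpus
    ]
    batch = stack_and_pad_tensors(data)[0]
    print(batch.shape)                       # torch.Size([2, 5])
    print(tokenizer.vocab_size, tokenizer.vocab[:8])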
import pytest

from torchnlp.encoders.text import DEFAULT_PADDING_INDEX


def test_pad_tensor():
    padded = pad_tensor(torch.LongTensor([1, 2, 3]), 5, DEFAULT_PADDING_INDEX)
    assert padded.tolist() == [
        1, 2, 3, DEFAULT_PADDING_INDEX, DEFAULT_PADDING_INDEX
    ]
def test_pad_tensor_multiple_dim_float_tensor():
    padded = pad_tensor(torch.FloatTensor(778, 80), 804, DEFAULT_PADDING_INDEX)
    assert padded.size() == (804, 80)
    assert padded[-1].sum().item() == pytest.approx(0)
    assert padded.type() == 'torch.FloatTensor'
def test_pad_tensor_multiple_dim():
    padded = pad_tensor(torch.LongTensor(1, 2, 3), 5, DEFAULT_PADDING_INDEX)
    assert padded.size() == (5, 2, 3)
    assert padded[1].sum().item() == pytest.approx(0)
# (continuation of a feature-building block: the lines below are the trailing
# arguments of the previous pad_and_stack_list_of_list call)
    max_sentence_len=max_sentence_len,
    pad_value=-1,
    tensor_type=torch.FloatTensor,
)
ascii = pad_and_stack_list_of_list(
    ascii,
    max_sentence_len=max_sentence_len,
    pad_value=-1,
    tensor_type=torch.FloatTensor,
)
x_enriched_features = torch.stack(
    (alnum, numeric, alpha, digit, lower, title, ascii), dim=2)

# Word-level ids: encode each text and pad to the maximum sentence length.
x_encoded = [x_encoder.encode(text) for text in X_text_list]
x_padded = [pad_tensor(tensor, max_sentence_len) for tensor in x_encoded]
x_padded = torch.stack(x_padded).type(torch.LongTensor)

# Character-level ids: pad each token to max_word_length characters, then pad
# each sentence to max_sentence_len tokens.
x_char_padded = [[
    pad_tensor(x_char_encoder.encode(char[:max_word_length]), max_word_length)
    for char in word
] for word in X_text_list_as_is]
x_char_padded = [
    pad_tensor(torch.stack(lst), max_sentence_len) for lst in x_char_padded
]
x_char_padded = torch.stack(x_char_padded).type(torch.LongTensor)

# One-hot POS-tag features.
x_postag_padded = tokenize_pos_tags(X_tags,
                                    tag_to_index=tag_to_index,
                                    max_sen_len=max_sentence_len)
from tqdm import tqdm
from sklearn.model_selection import train_test_split

reviews = []
labels = []

# Discard reviews that are longer than the padding length.
for i in tqdm(range(len(encoded_texts))):
    if len(encoded_texts[i]) < max_pad_length:
        reviews.append(encoded_texts[i])
        labels.append(1 if labels_as_list[i] == "positive" else 0)

assert len(reviews) == len(labels), \
    "The labels and feature lists should have the same length"

# Pad every single sequence to the maximum sequence length.
padded_dataset = []
for i in tqdm(range(len(reviews))):
    padded_dataset.append(pad_tensor(reviews[i], int(max_pad_length)))

# Prepare the final dataset.
X = torch.stack(padded_dataset)
y = torch.tensor(labels)
(y == 1).float().mean()  # share of positive labels

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=.25,
                                                    random_state=42)
X_train, y_train = torch.tensor(X_train), torch.tensor(y_train)
X_test, y_test = torch.tensor(X_test), torch.tensor(y_test)