def align_process_2(align):
    align = [y[y != IGNORE_ID] for y in align]
    left_trun = []
    right_trun = []
    ys_trunc = []
    for k in range(len(align)):
        lens = len(align[k])
        lid = 0
        left = []
        right = []
        ys = []
        for i in range(1, lens):
            if align[k][i - 1] != align[k][i] and align[k][i - 1] != 0:
                left.append(lid)
                right.append(i)
                ys.append(align[k][i - 1])
                lid = i
            if i == lens - 1 and align[k][i] != 0:
                left.append(lid)
                right.append(lens)
                ys.append(align[k][i])
        left_trun.append(left)
        right_trun.append(right)
        ys_trunc.append(ys)
    left_pad = pad_list([torch.from_numpy(np.asarray(x)) for x in left_trun], IGNORE_ID)
    right_pad = pad_list([torch.from_numpy(np.asarray(x)) for x in right_trun], IGNORE_ID)
    ys_pad = pad_list([torch.from_numpy(np.asarray(x)) for x in ys_trunc], IGNORE_ID)
    return ys_pad, left_pad, right_pad
def _collate_fn(batch):
    """
    Args:
        batch: list, len(batch) = 1. See AudioDataset.__getitem__()
    Returns:
        xs_pad: N x Ti x D, torch.Tensor
        ilens : N, torch.Tensor
        ys_pad: N x To, torch.Tensor
    """
    # batch should be located in list
    assert len(batch) == 1
    batch = load_inputs_and_targets(batch[0])
    xs, ys = batch
    # TODO: perform subsampling
    # get batch of lengths of input and output sequences
    ilens = np.array([x.shape[0] for x in xs])
    olens = np.array([y.shape[0] for y in ys])
    # perform padding and convert to tensor
    xs_pad = pad_list([torch.from_numpy(x).float() for x in xs], 0)
    ilens = torch.from_numpy(ilens)
    ys_pad = pad_list([torch.from_numpy(y).long() for y in ys], 1)
    olens = torch.from_numpy(olens)
    return xs_pad, ilens, ys_pad, olens
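# The collate functions in this file assume a pad_list helper that right-pads a list of
# variable-length torch tensors into one batch tensor. The helper itself is not shown
# here; the sketch below is a minimal assumption-based version inferred from the call
# sites pad_list(list_of_tensors, pad_value), not the original project's implementation.
def pad_list(xs, pad_value):
    """Pad a list of tensors of shape (T_i, ...) into one (N, T_max, ...) tensor."""
    n_batch = len(xs)
    max_len = max(x.size(0) for x in xs)
    # allocate the output filled with pad_value, keeping dtype/device of the inputs
    pad = xs[0].new_full((n_batch, max_len, *xs[0].size()[1:]), pad_value)
    for i, x in enumerate(xs):
        pad[i, :x.size(0)] = x  # copy the real frames, leave the tail as pad_value
    return pad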
def __getitem__(self, index):
    group = list(self.user_group.groups)[index]
    df = self.user_group.get_group(group)
    sample = self._generate_sample(df)
    target_sample = sample['movieId'].tolist()
    source_sample, mask = generate_random_mask(target_sample, self.mode,
                                               self.valid_sample_size,
                                               self.masking_rate,
                                               len(self.item2idx), MASK_INDEX)
    if self.mode == 'train':
        padding_mode = random.choice(['left', 'right'])
    else:
        padding_mode = 'left'
    padded_source = pad_list(source_sample, padding_mode, self.seq_len, PADDING_INDEX)
    padded_target = pad_list(target_sample, padding_mode, self.seq_len, PADDING_INDEX)
    padded_mask = pad_list(mask, padding_mode, self.seq_len, False)
    source_tensor = torch.tensor(padded_source, dtype=torch.long)
    target_tensor = torch.tensor(padded_target, dtype=torch.long)
    mask_tensor = torch.tensor(padded_mask, dtype=torch.bool)
    return source_tensor, target_tensor, mask_tensor
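# This dataset snippet (and the prediction helper further down) calls pad_list with a
# (sequence, padding_mode, seq_len, pad_value) signature operating on plain Python
# lists. A hedged sketch of what such a variant could look like, inferred only from
# these call sites; the truncation behaviour is an assumption, not the original code.
def pad_list(seq, padding_mode, seq_len, pad_value):
    """Left- or right-pad (and, if needed, truncate) a Python list to exactly seq_len items."""
    seq = seq[-seq_len:] if padding_mode == 'left' else seq[:seq_len]
    pad = [pad_value] * (seq_len - len(seq))
    return pad + seq if padding_mode == 'left' else seq + pad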
def align_process(align):
    align = [y[y != IGNORE_ID] for y in align]
    truns = []
    ys_truns = []
    for k in range(len(align)):
        lens = len(align[k])
        trun = [0]
        ys = []
        for i in range(1, lens):
            if align[k][i - 1] != align[k][i]:
                if int(align[k][i - 1]) != 0:
                    trun.append(i)
                    ys.append(align[k][i - 1])
                lid = i
            if i == lens - 1 and int(align[k][i]) != 0:
                trun.append(lens)
                ys.append(align[k][i])
        truns.append(trun)
        ys_truns.append(ys)
    olens = np.array([len(y) for y in ys_truns])
    aligns_pad = pad_list([torch.from_numpy(np.asarray(x)) for x in truns], IGNORE_ID)
    ys_pad = pad_list([torch.from_numpy(np.asarray(x)) for x in ys_truns], IGNORE_ID)
    return ys_pad, aligns_pad, olens
def _collate_fn(batch):
    # the minibatch is already built in the dataset, so the batch size here is 1
    assert len(batch) == 1
    ys, xs = zip(*batch[0])
    ys_pad, ys_mask = pad_list([torch.from_numpy(y) for y in ys], 0)
    xs_pad, xs_mask = pad_list([torch.from_numpy(np.array(x)) for x in xs], 0)
    return ys_pad, xs_pad, ys_mask, xs_mask
def set_inputs(self, input_word, sentiment_label, next_word):
    # input_word = (batch_size, seq_len)
    batch_size = self.batch_size
    seq_len = self.seq_len
    # truncate sentences to make them shorter
    input_word = [trunc_list(input_word[i], seq_len)
                  for i in xrange(len(input_word))]
    next_word = [trunc_list(next_word[i], seq_len)
                 for i in xrange(len(next_word))]
    self._sentence_lengths = sentence_lengths = map(len, input_word)
    # pad sentences
    padded_input = np.array([pad_list(input_word[i], self.seq_len)
                             for i in xrange(len(input_word))])
    padded_next_word = np.array([pad_list(next_word[i], self.seq_len)
                                 for i in xrange(len(next_word))])
    label = np.array(sentiment_label)
    # bind input
    for seqidx in xrange(self.seq_len):
        x = padded_input[:, seqidx]
        # fixed sentiment
        '''
        mx.nd.onehot_encode(mx.nd.array(sentiment_label,
                                        ctx=self._seq_data[seqidx].context),
                            out=self._seq_senti[seqidx])
        '''
        mx.nd.onehot_encode(
            mx.nd.array(x, ctx=self._seq_data[seqidx].context),
            out=self._seq_senti[seqidx])
        mx.nd.onehot_encode(
            mx.nd.array(x, ctx=self._seq_data[seqidx].context),
            out=self._seq_data[seqidx])
    for i in xrange(batch_size):
        self._senti_labels[i*seq_len : (i+1)*seq_len] = 0
        self._senti_mask[i*seq_len : (i+1)*seq_len] = 0
        self._lm_labels[i*seq_len : (i+1)*seq_len] = 0
        self._lm_mask[i*seq_len : (i+1)*seq_len] = 0
    # bind sentiment label
    for i in xrange(batch_size):
        pos_eos = (sentence_lengths[i]-1)*batch_size + i
        self._senti_labels[pos_eos : pos_eos+1] = label[i]
        self._senti_mask[pos_eos : pos_eos+1] = 1
    # bind language model label
    for i in xrange(batch_size):
        for j in xrange(sentence_lengths[i]-1):
            pos = (j+1)*batch_size + i
            self._lm_labels[pos : pos+1] = padded_next_word[i][j]
            self._lm_mask[pos : pos+1] = 1
def preprocess(self, padded_input):
    """Generate decoder input and output label from padded_input.
    Add <sos> to decoder input, and add <eos> to decoder output label.
    """
    ys = [y[y != IGNORE_ID] for y in padded_input]
    # prepare input and output word sequences with sos/eos IDs
    eos = ys[0].new_ones([1]).fill_(self.eos_id)
    sos = ys[0].new_ones([1]).fill_(self.sos_id)
    ys_in = [flow.cat([sos, y], dim=0) for y in ys]
    ys_out = [flow.cat([y, eos], dim=0) for y in ys]
    ys_in_pad = pad_list(ys_in, self.eos_id)
    ys_out_pad = pad_list(ys_out, IGNORE_ID)
    assert ys_in_pad.size() == ys_out_pad.size()
    return ys_in_pad, ys_out_pad
def forward(self, spec: torch.Tensor, spec_length: torch.Tensor):
    """Apply mask along the time direction.

    Args:
        spec: (batch, length, freq) or (batch, channel, length, freq)
        spec_length: (batch,) lengths of each utterance
    """
    if all(le == spec_length[0] for le in spec_length):
        out = self.mask_by_batch(spec)
    else:
        org_size = spec.size()
        batch = spec.size(0)
        if spec.dim() == 4:
            ch = spec.size(1)
            # spec: (Batch, Channel, Length, Freq) -> (Batch*Channel, Length, Freq)
            spec = spec.view(-1, org_size[2], org_size[3])
        else:
            ch = 1
        outs = []
        for i in range(batch):
            for j in range(ch):
                _out = self.mask_by_batch(
                    spec[i * ch + j][None, :spec_length[i], :])
                outs.append(_out)
        out = utils.pad_list(outs, 0.0, dim=1)
        out = out.view(*org_size)
    return out
def preprocess(self, padded_input):
    """Generate decoder input and output label from padded_input.
    Add <sos> to decoder input, and add <eos> to decoder output label.
    """
    ys = [y[y != IGNORE_ID] for y in padded_input]  # parse padded ys
    # prepare input and output word sequences with sos/eos IDs
    eos = ys[0].new([self.eos_id])
    sos = ys[0].new([self.sos_id])
    ys_in = [torch.cat([sos, y], dim=0) for y in ys]
    ys_out = [torch.cat([y, eos], dim=0) for y in ys]
    # padding for ys with -1
    # pys: utt x olen
    ys_in_pad = pad_list(ys_in, self.eos_id)
    ys_out_pad = pad_list(ys_out, IGNORE_ID)
    assert ys_in_pad.size() == ys_out_pad.size()
    return ys_in_pad, ys_out_pad
def __call__(self, batch):
    """Collect data into a batch in descending length order and add padding.

    Args:
        batch : list of (mat, label, weight)
            mat : torch.FloatTensor
            label : torch.IntTensor
            weight : torch.FloatTensor
    Return:
        (logits, input_lengths, labels, label_lengths, weights)
    """
    batches = [(mat, label, weight, mat.size(0))
               for mat, label, weight in batch]
    batch_sorted = sorted(batches, key=lambda item: item[3], reverse=True)
    mats = utils.pad_list([x[0] for x in batch_sorted])
    labels = torch.cat([x[1] for x in batch_sorted])
    input_lengths = torch.LongTensor([x[3] for x in batch_sorted])
    label_lengths = torch.IntTensor([x[1].size(0) for x in batch_sorted])
    weights = torch.cat([x[2] for x in batch_sorted])
    return mats, input_lengths, labels, label_lengths, weights
def load_deploy(self, dataset_filepath, parameters, annotator):
    _, tokens, _, _ = self._parse_dataset(
        dataset_filepath, annotator,
        force_preprocessing=parameters['do_split'],
        limit=self.max_tokens)
    self.tokens['deploy'] = tokens
    # Map tokens and labels to their indices
    self.token_indices['deploy'] = []
    self.token_lengths['deploy'] = []
    self.token_indices_padded['deploy'] = []
    # Tokens
    for token_sequence in tokens:
        self.token_indices['deploy'].append(
            [self.token_to_index[token] for token in token_sequence])
        self.token_lengths['deploy'].append(len(token_sequence))
    # Pad tokens
    self.token_indices_padded['deploy'] = [
        utils.pad_list(temp_token_indices, self.max_tokens,
                       self.PADDING_TOKEN_INDEX)
        for temp_token_indices in self.token_indices['deploy']
    ]
    self.labels['deploy'] = []
    self.label_vector_indices['deploy'] = []
def forward(self, enc_pad, enc_len, dec_z, att_prev, scaling=2.0):
    batch_size = enc_pad.size(0)
    if self.pre_compute_enc_h is None:
        self.enc_h = enc_pad
        self.enc_length = self.enc_h.size(1)
        self.pre_compute_enc_h = self.mlp_enc(self.enc_h)
    if dec_z is None:
        dec_z = enc_pad.new_zeros(batch_size, self.decoder_dim)
    else:
        dec_z = dec_z.view(batch_size, self.decoder_dim)
    if att_prev is None:
        # initialize attention weights to uniform
        att_prev = pad_list([self.enc_h.new(l).fill_(1.0 / l) for l in enc_len], 0)
    # att_prev: batch_size x frame
    att_conv = self.loc_conv(att_prev.view(batch_size, 1, 1, self.enc_length))
    # att_conv: batch_size x channel x 1 x frame -> batch_size x frame x channel
    att_conv = att_conv.squeeze(2).transpose(1, 2)
    # att_conv: batch_size x frame x channel -> batch_size x frame x att_dim
    att_conv = self.mlp_att(att_conv)
    # dec_z_tiled: batch_size x 1 x att_dim
    dec_z_tiled = self.mlp_dec(dec_z).view(batch_size, 1, self.att_dim)
    att_state = torch.tanh(self.pre_compute_enc_h + dec_z_tiled + att_conv)
    e = self.gvec(att_state).squeeze(2)
    # w: batch_size x frame
    w = F.softmax(scaling * e, dim=1)
    # w_expanded: batch_size x 1 x frame
    w_expanded = w.unsqueeze(1)
    # c = torch.sum(self.enc_h * w_expanded, dim=1)
    c = torch.bmm(w_expanded, self.enc_h).squeeze(1)
    c = self.mlp_o(c)
    return c, w
def _convert_to_indices(self, dataset_types):
    tokens = self.tokens
    labels = self.labels
    token_to_index = self.token_to_index
    character_to_index = self.character_to_index
    label_to_index = self.label_to_index
    index_to_label = self.index_to_label

    # Map tokens and labels to their indices
    token_indices = {}
    label_indices = {}
    characters = {}
    token_lengths = {}
    character_indices = {}
    character_indices_padded = {}
    for dataset_type in dataset_types:
        token_indices[dataset_type] = []
        characters[dataset_type] = []
        character_indices[dataset_type] = []
        token_lengths[dataset_type] = []
        character_indices_padded[dataset_type] = []
        for token_sequence in tokens[dataset_type]:
            token_indices[dataset_type].append(
                [token_to_index.get(token, self.UNK_TOKEN_INDEX)
                 for token in token_sequence])
            characters[dataset_type].append(
                [list(token) for token in token_sequence])
            character_indices[dataset_type].append(
                [[character_to_index.get(character,
                                         random.randint(1, max(self.index_to_character.keys())))
                  for character in token]
                 for token in token_sequence])
            token_lengths[dataset_type].append(
                [len(token) for token in token_sequence])
            longest_token_length_in_sequence = max(token_lengths[dataset_type][-1])
            character_indices_padded[dataset_type].append(
                [utils.pad_list(temp_token_indices,
                                longest_token_length_in_sequence,
                                self.PADDING_CHARACTER_INDEX)
                 for temp_token_indices in character_indices[dataset_type][-1]])
        label_indices[dataset_type] = []
        for label_sequence in labels[dataset_type]:
            label_indices[dataset_type].append(
                [label_to_index[label] for label in label_sequence])

    if self.verbose:
        print('token_lengths[\'train\'][0][0:10]: {0}'.format(token_lengths['train'][0][0:10]))
        print('characters[\'train\'][0][0:10]: {0}'.format(characters['train'][0][0:10]))
        print('token_indices[\'train\'][0:10]: {0}'.format(token_indices['train'][0:10]))
        print('label_indices[\'train\'][0:10]: {0}'.format(label_indices['train'][0:10]))
        print('character_indices[\'train\'][0][0:10]: {0}'.format(character_indices['train'][0][0:10]))
        print('character_indices_padded[\'train\'][0][0:10]: {0}'.format(character_indices_padded['train'][0][0:10]))

    # Vectorize the labels
    # [Numpy 1-hot array](http://stackoverflow.com/a/42263603/395857)
    label_binarizer = sklearn.preprocessing.LabelBinarizer()
    label_binarizer.fit(range(max(index_to_label.keys()) + 1))
    label_vector_indices = {}
    for dataset_type in dataset_types:
        label_vector_indices[dataset_type] = []
        for label_indices_sequence in label_indices[dataset_type]:
            label_vector_indices[dataset_type].append(
                label_binarizer.transform(label_indices_sequence))

    if self.verbose:
        print('label_vector_indices[\'train\'][0:2]: {0}'.format(label_vector_indices['train'][0:2]))
        print('len(label_vector_indices[\'train\']): {0}'.format(len(label_vector_indices['train'])))

    return (token_indices, label_indices, character_indices_padded,
            character_indices, token_lengths, characters, label_vector_indices)
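# The NER-style snippets above and below call utils.pad_list(sequence, target_length,
# pad_index) on plain Python lists of character indices. A hedged sketch of such a
# helper, inferred only from these call sites; the name pad_list_to_length is
# hypothetical and this is not the original utils module's implementation.
def pad_list_to_length(old_list, target_length, pad_value):
    """Right-pad (or truncate) a Python list to exactly target_length items."""
    if len(old_list) >= target_length:
        return old_list[:target_length]
    return old_list + [pad_value] * (target_length - len(old_list))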
def recognize_align(self, input, input_length, char_list, align, args):
    """Sequence-to-sequence beam search; decode one utterance for now.

    Args:
        input: T x D
        char_list: list of characters
        args: args.beam
    Returns:
        nbest_hyps:
    """
    encoder_outputs, _, _ = self.encoder(input.unsqueeze(0), input_length)
    if args.ctc_weight > 0 or args.trun:
        lpz = self.ctc.log_softmax(encoder_outputs)[0]
    else:
        lpz = None
    aligns_pad = []
    aligns = [0]
    for i in range(1, len(align)):
        if int(align[i - 1]) != int(align[i]):
            if int(align[i - 1]) != 0:
                aligns.append(i)
        if i == len(align) - 1 and int(align[i]) != 0:
            aligns.append(len(align))
    aligns_pad.append(aligns)
    aligns_pad = pad_list([torch.Tensor(y).long() for y in aligns_pad], IGNORE_ID)
    nbest_hyps = self.decoder.recognize_beam(encoder_outputs[0], char_list,
                                             lpz, aligns_pad, args)
    return nbest_hyps
def _collate_fn(batch):
    batch = sorted(batch, key=lambda sample: sample[0].size(0), reverse=True)
    inputs = []
    targets = []
    input_sizes = torch.IntTensor(len(batch))
    target_sizes = torch.IntTensor(len(batch))
    filenames = []
    for i, sample in enumerate(batch):
        spect, target, filename = sample
        inputs.append(spect)
        targets.append(target)
        input_sizes[i] = spect.size(0)
        target_sizes[i] = len(target)
        filenames.append(filename)
    inputs = pad_list(inputs, 0)
    targets = pad_list(targets, IGNORE_ID)
    return inputs, targets, input_sizes, target_sizes, filenames
def _collate_fn(batch, LFR_m, LFR_n):
    xs, ys = load_inputs_and_targets(batch, LFR_m, LFR_n)
    ilens = np.array([x.shape[0] for x in xs])
    # perform padding and convert to tensor
    xs_pad = pad_list([torch.from_numpy(x).float() for x in xs], 0)
    ilens = torch.from_numpy(ilens)
    ys = torch.tensor(ys, dtype=torch.long)
    return xs_pad, ilens, ys
def transform(self, tokens, labels=None):
    pattern = [[utils_re.get_pattern(token, self.expressions) for token in sequence]
               for sequence in tokens]
    token_indices = []
    characters = []
    character_indices = []
    token_lengths = []
    character_indices_padded = []
    for token_sequence in tokens:
        token_indices.append(
            [self.token2index.get(token.lower(), self.UNK_INDEX)
             for token in token_sequence])
        characters.append([list(token) for token in token_sequence])
        character_indices.append(
            [[self.character2index.get(character, 0) for character in token]
             for token in token_sequence])
        token_lengths.append([len(token) for token in token_sequence])
        longest_token_length_in_sequence = max(token_lengths[-1])
        character_indices_padded.append(
            [utils.pad_list(temp_token_indices,
                            longest_token_length_in_sequence,
                            self.PADDING_INDEX)
             for temp_token_indices in character_indices[-1]])
    if labels is None:
        return token_indices, character_indices_padded, token_lengths, pattern

    label_indices = []
    for label_sequence in labels:
        label_indices.append(
            [self.label2index.get(label, self.label2index['O'])
             for label in label_sequence])

    if self.verbose:
        print('token_lengths[0][0:10]: {0}'.format(token_lengths[0][0:10]))
        print('characters[0][0:10]: {0}'.format(characters[0][0:10]))
        print('token_indices[0:10]: {0}'.format(token_indices[0:10]))
        print('label_indices[0:10]: {0}'.format(label_indices[0:10]))
        print('character_indices[0][0:10]: {0}'.format(character_indices[0][0:10]))
        print('character_indices_padded[0][0:10]: {0}'.format(character_indices_padded[0][0:10]))

    # Vectorize the labels
    # [Numpy 1-hot array](http://stackoverflow.com/a/42263603/395857)
    label_binarizer = sklearn.preprocessing.LabelBinarizer()
    label_binarizer.fit(range(len(self.labels) + 1))
    label_vector_indices = []
    for label_indices_sequence in label_indices:
        label_vector_indices.append(label_binarizer.transform(label_indices_sequence))
    # self.number_of_classes = len(self.labels) + 1

    if self.verbose:
        print('label_vector_indices[0:2]: {0}'.format(label_vector_indices[0:2]))
        print('len(label_vector_indices): {0}'.format(len(label_vector_indices)))

    return (token_indices, character_indices_padded, token_lengths, pattern,
            label_indices, label_vector_indices)
def _collate_fn(batch, LFR_m=1, LFR_n=1, align_trun=0):
    """
    Args:
        batch: list, len(batch) = 1. See AudioDataset.__getitem__()
    Returns:
        xs_pad: N x Ti x D, torch.Tensor
        ilens : N, torch.Tensor
        ys_pad: N x To, torch.Tensor
    """
    # batch should be located in list
    assert len(batch) == 1
    batch = load_inputs_and_targets(batch[0], LFR_m=LFR_m, LFR_n=LFR_n,
                                    align_trun=align_trun)
    xs, ys, aligns = batch
    # TODO: perform subsampling
    # get batch of lengths of input and output sequences
    ilens = np.array([x.shape[0] for x in xs])
    olens = np.array([y.shape[0] for y in ys])
    # perform padding and convert to tensor
    xs_pad = pad_list([torch.from_numpy(x).float() for x in xs], 0)
    ilens = torch.from_numpy(ilens)
    ys_pad = pad_list([torch.from_numpy(y).long() for y in ys], IGNORE_ID)
    if aligns:
        ys_pad, aligns_pad, olens = align_process(aligns)
    else:
        aligns_pad = torch.from_numpy(np.asarray(aligns))
    olens = torch.from_numpy(olens)
    return xs_pad, ilens, ys_pad, olens, aligns_pad
def _collate_fn(batch, LFR_m=1, LFR_n=1):
    """
    Args:
        batch: list, len(batch) = 1. See AudioDataset.__getitem__()
    Returns:
        xs_pad: N x Ti x D, torch.Tensor
        ilens : N, torch.Tensor
        ys_pad: N x To, torch.Tensor
    """
    # batch should be located in list
    assert len(batch) == 1
    batch = load_inputs_and_targets(batch[0], LFR_m=LFR_m, LFR_n=LFR_n)
    xs, ys = batch
    # get batch of lengths of input sequences
    ilens = np.array([x.shape[0] for x in xs])
    # perform padding and convert to tensor
    xs_pad = pad_list([flow.tensor(x).to(dtype=flow.float32) for x in xs], 0)
    ilens = flow.tensor(ilens)
    ys_pad = pad_list([flow.tensor(y) for y in ys], IGNORE_ID)
    return xs_pad, ilens, ys_pad
def align_truncate(align, padded_target, encoder_padded_outputs, blank=0):
    align = [y[y != IGNORE_ID] for y in align]
    xs_trunc = []
    ys_trunc = []
    for k in range(len(align)):
        lens = len(align[k])
        lid = 0
        for i in range(1, lens):
            if align[k][i - 1] != align[k][i]:
                if align[k][i - 1] != 0:
                    xs_trunc.append(encoder_padded_outputs[k][lid:i])
                    ys_trunc.append(align[k][i - 1])
                lid = i
            if i == lens - 1 and align[k][i] != 0:
                xs_trunc.append(encoder_padded_outputs[k][lid:lens])
                ys_trunc.append(align[k][i])
    xs_pad = pad_list(xs_trunc, 0)
    ys_pad = pad_list(
        (torch.from_numpy(np.asarray(ys_trunc)).unsqueeze(-1).cuda()), IGNORE_ID)
    # return xs_trunc, ys_trunc
    return xs_pad, ys_pad
def prediction(model, movies, item2idx, idx2item, seq_len, k=30):
    model.eval()
    input = pad_list([item2idx[i] for i in movies] + [1], 'left', seq_len, 0)
    input = torch.tensor(input, dtype=torch.long).unsqueeze(0)
    with torch.no_grad():
        out = model(input)
    out = out[0, -1].numpy()
    out = np.argsort(out).tolist()[::-1]
    out = [idx2item[i] for i in out if i in idx2item]
    out = out[:k]
    print(out)
    return out[:k]
def forward(self, enc_pad, enc_len, dec_z, att_prev, scaling=2.0):
    batch_size = enc_pad.size(0)
    if self.pre_compute_enc_h is None:
        self.enc_h = enc_pad
        self.enc_length = self.enc_h.size(1)
        self.pre_compute_enc_h = [
            self.mlp_enc[h](self.enc_h) for h in range(self.heads)
        ]
    if dec_z is None:
        dec_z = enc_pad.new_zeros(batch_size, self.decoder_dim)
    else:
        dec_z = dec_z.view(batch_size, self.decoder_dim)
    # initialize attention weights to uniform
    if att_prev is None:
        att_prev = []
        for h in range(self.heads):
            att_prev += [
                pad_list([self.enc_h.new(l).fill_(1.0 / l) for l in enc_len], 0)
            ]
    cs, ws = [], []
    for h in range(self.heads):
        # att_prev: batch_size x frame
        att_conv = self.loc_conv[h](att_prev[h].view(
            batch_size, 1, 1, self.enc_length))
        # att_conv: batch_size x channel x 1 x frame -> batch_size x frame x channel
        att_conv = att_conv.squeeze(2).transpose(1, 2)
        # att_conv: batch_size x frame x channel -> batch_size x frame x att_dim
        att_conv = self.mlp_att[h](att_conv)
        # dec_z_tiled: batch_size x 1 x att_dim
        dec_z_tiled = self.mlp_dec[h](dec_z).view(batch_size, 1, self.att_dim)
        att_state = torch.tanh(self.pre_compute_enc_h[h] + dec_z_tiled + att_conv)
        e = self.gvec[h](att_state).squeeze(2)
        # w: batch_size x frame
        w = F.softmax(scaling * e, dim=1)
        ws.append(w)
        # w_expanded: batch_size x 1 x frame
        w_expanded = w.unsqueeze(1)
        # c = torch.sum(self.enc_h * w_expanded, dim=1)
        c = torch.bmm(w_expanded, self.enc_h).squeeze(1)
        cs.append(c)
    c = self.mlp_o(torch.cat(cs, dim=1))
    return c, ws
def forward(self, enc_pad, enc_len, dec_h, att_prev, scaling=2.0):
    '''
    enc_pad: (batch, enc_length, enc_dim)
    enc_len: (batch) of int
    dec_h: (batch, 1, dec_dim)
    att_prev: (batch, enc_length)
    '''
    batch_size = enc_pad.size(0)
    enc_h = self.mlp_enc(enc_pad)  # batch_size x enc_length x att_dim
    if dec_h is None:
        dec_h = enc_pad.new_zeros(batch_size, self.decoder_dim)
    else:
        dec_h = dec_h.view(batch_size, self.decoder_dim)
    # initialize attention weights to uniform
    if att_prev is None:
        att_prev = pad_list(
            [enc_pad.new(l).fill_(1.0 / l) for l in enc_len], 0)
    att_conv = self.loc_conv(att_prev.view(batch_size, 1, 1, enc_pad.size(1)))
    # att_conv: batch_size x channel x 1 x frame -> batch_size x frame x channel
    att_conv = att_conv.squeeze(2).transpose(1, 2)
    # att_conv: batch_size x frame x channel -> batch_size x frame x att_dim
    att_conv = self.mlp_att(att_conv)
    dec_h_tiled = self.mlp_dec(dec_h).view(batch_size, 1, self.att_dim)
    att_state = torch.tanh(enc_h + dec_h_tiled + att_conv)
    e = self.gvec(att_state).squeeze(2)
    if enc_len is not None:
        mask = []
        for b in range(batch_size):
            mask.append([0] * enc_len[b] + [1] * (enc_pad.size(1) - enc_len[b]))
        mask = cc(torch.ByteTensor(mask))
        e = e.masked_fill_(mask, -1e15)
    attn = F.softmax(scaling * e, dim=1)
    # w_expanded: batch_size x 1 x frame
    w_expanded = attn.unsqueeze(1)
    # batch x 1 x frame * batch x enc_length x enc_dim => batch x 1 x enc_dim
    c = torch.bmm(w_expanded, enc_pad).squeeze(1)
    c = self.mlp_o(c)  # batch x enc_dim
    return c, attn
def forward(self, padded_input, encoder_padded_outputs, aligns_pad):
    """
    Args:
        padded_input: N x To
        # encoder_hidden: (num_layers * num_directions) x N x H
        encoder_padded_outputs: N x Ti x H
    Returns:
    """
    # ********* Get Input and Output
    # from espnet/Decoder.forward()
    # TODO: need to make more smart way
    ys = [y[y != IGNORE_ID] for y in padded_input]  # parse padded ys
    aligns = [y[y != IGNORE_ID] for y in aligns_pad]
    # prepare input and output word sequences with sos/eos IDs
    eos = ys[0].new([self.eos_id])
    sos = ys[0].new([self.sos_id])
    ys_in = [torch.cat([sos, y], dim=0) for y in ys]
    aligns_in = [torch.cat([sos, y], dim=0) for y in aligns]
    ys_out = [torch.cat([y, eos], dim=0) for y in ys]
    # padding for ys with -1
    # pys: utt x olen
    ys_in_pad = pad_list(ys_in, self.eos_id).cuda()
    ys_out_pad = pad_list(ys_out, IGNORE_ID).cuda()
    aligns_in_pad = pad_list(aligns_in, IGNORE_ID).cuda()
    assert ys_in_pad.size() == ys_out_pad.size()
    batch_size = ys_in_pad.size(0)
    output_length = ys_in_pad.size(1)
    # max_length = ys_in_pad.size(1) - 1  # TODO: should minus 1 (sos)?

    # ********* Init decoder rnn
    h_list = [self.zero_state(encoder_padded_outputs)]
    c_list = [self.zero_state(encoder_padded_outputs)]
    for l in range(1, self.num_layers):
        h_list.append(self.zero_state(encoder_padded_outputs))
        c_list.append(self.zero_state(encoder_padded_outputs))
    att_c = self.zero_state(encoder_padded_outputs,
                            H=encoder_padded_outputs.size(2))
    y_all = []

    # ********* LAS: 1. decoder rnn  2. attention  3. concat and MLP
    embedded = self.embedding(ys_in_pad)
    for t in range(output_length):
        # step 1. decoder RNN: s_i = RNN(s_i-1, y_i-1, c_i-1)
        rnn_input = torch.cat((embedded[:, t, :], att_c), dim=1)
        h_list[0], c_list[0] = self.rnn[0](rnn_input, (h_list[0], c_list[0]))
        for l in range(1, self.num_layers):
            h_list[l], c_list[l] = self.rnn[l](h_list[l - 1], (h_list[l], c_list[l]))
        rnn_output = h_list[-1]
        # below unsqueeze: (N x H) -> (N x 1 x H)
        # step 2. attention: c_i = AttentionContext(s_i, h)
        mask = torch.ones(encoder_padded_outputs.size(0),
                          encoder_padded_outputs.size(1),
                          dtype=torch.uint8).cuda()
        if t + 1 < aligns_pad.size(1):
            for m in range(mask.size(0)):
                left_bound = min(aligns_in_pad[m][t] + self.offset, rnn_output.size(1))
                right_bound = min(aligns_in_pad[m][t + 1] + self.offset, rnn_output.size(1))
                if self.TA:
                    mask[m][0:right_bound] = 0
                else:
                    mask[m][left_bound:right_bound] = 0
        att_c, att_w = self.attention(rnn_output.unsqueeze(dim=1),
                                      encoder_padded_outputs, mask)
        att_c = att_c.squeeze(dim=1)
        # step 3. concat s_i and c_i, and input to MLP
        mlp_input = torch.cat((rnn_output, att_c), dim=1)
        predicted_y_t = self.mlp(mlp_input)
        y_all.append(predicted_y_t)

    y_all = torch.stack(y_all, dim=1)  # N x To x C
    # ********* Cross Entropy Loss
    # F.cross_entropy = NLL(log_softmax(input), target)
    y_all = y_all.view(batch_size * output_length, self.vocab_size)
    ce_loss = F.cross_entropy(y_all, ys_out_pad.view(-1),
                              ignore_index=IGNORE_ID,
                              reduction='elementwise_mean')
    # TODO: should minus 1 here ?
    # ce_loss *= (np.mean([len(y) for y in ys_in]) - 1)
    return ce_loss
def recognize_beam(self, encoder_outputs, char_list, lpz, aligns_pad, args):
    """Beam search; decode one utterance for now.

    Args:
        encoder_outputs: T x H
        char_list: list of characters
        args: args.beam
    Returns:
        nbest_hyps:
    """
    # search params
    beam = args.beam_size
    nbest = args.nbest
    ctc_weight = args.ctc_weight
    CTC_SCORING_RATIO = 1.5
    if args.decode_max_len != 0:
        maxlen = args.decode_max_len
    elif lpz is not None:
        maxlen = int(len(torch.nonzero(torch.max(lpz, dim=-1)[1])) * 1.5)
    elif args.align_trun:
        maxlen = int(aligns_pad.size(1) * 1.5)

    # ********* Init decoder rnn
    h_list = [self.zero_state(encoder_outputs.unsqueeze(0))]
    c_list = [self.zero_state(encoder_outputs.unsqueeze(0))]
    for l in range(1, self.num_layers):
        h_list.append(self.zero_state(encoder_outputs.unsqueeze(0)))
        c_list.append(self.zero_state(encoder_outputs.unsqueeze(0)))
    att_c = self.zero_state(encoder_outputs.unsqueeze(0),
                            H=encoder_outputs.unsqueeze(0).size(2))

    # prepare sos
    y = self.sos_id
    vy = encoder_outputs.new_zeros(1).long()
    hyp = {'score': 0.0, 'yseq': [y], 'c_prev': c_list,
           'h_prev': h_list, 'a_prev': att_c}
    if lpz is not None:
        ctc_prefix_score = CTCPrefixScore(lpz.detach().cpu().numpy(), 0, self.eos_id, np)
        hyp['ctc_state_prev'] = ctc_prefix_score.initial_state()
        hyp['ctc_score_prev'] = 0.0
        if ctc_weight != 1.0:
            ctc_beam = min(lpz.shape[-1], int(beam * CTC_SCORING_RATIO))
        else:
            ctc_beam = lpz.shape[-1]
    if args.trun:
        ctc_greedy = torch.max(lpz, dim=-1)[1].unsqueeze(dim=0)
        aligns = []
        for k in range(ctc_greedy.size()[0]):
            align = (torch.nonzero(ctc_greedy[k]) + 1).reshape(-1).cpu().numpy().tolist()
            align.insert(0, 0)
            aligns.append(align)
        aligns_pad = pad_list([torch.Tensor(y).long() for y in aligns], IGNORE_ID)

    hyps = [hyp]
    ended_hyps = []
    for i in range(maxlen):
        hyps_best_kept = []
        for hyp in hyps:
            # vy.unsqueeze(1)
            vy[0] = hyp['yseq'][i]
            embedded = self.embedding(vy)
            # embedded.unsqueeze(0)
            # step 1. decoder RNN: s_i = RNN(s_i-1, y_i-1, c_i-1)
            rnn_input = torch.cat((embedded, hyp['a_prev']), dim=1)
            h_list[0], c_list[0] = self.rnn[0](
                rnn_input, (hyp['h_prev'][0], hyp['c_prev'][0]))
            for l in range(1, self.num_layers):
                h_list[l], c_list[l] = self.rnn[l](
                    h_list[l - 1], (hyp['h_prev'][l], hyp['c_prev'][l]))
            rnn_output = h_list[-1]
            # step 2. attention: c_i = AttentionContext(s_i, h)
            # below unsqueeze: (N x H) -> (N x 1 x H)
            mask = None
            if args.trun or args.align_trun:
                mask = torch.ones(encoder_outputs.unsqueeze(0).size(0),
                                  encoder_outputs.unsqueeze(0).size(1),
                                  dtype=torch.uint8).cuda()
                if i + 1 < aligns_pad.size(1):
                    for m in range(mask.size(0)):
                        if self.peak_left != 0:
                            left_id = max(i - self.peak_left + 1, 0)
                        else:
                            left_id = 0
                        right_id = min(i + 1 + self.peak_right, aligns_pad.size(1) - 1)
                        left_bound = min(aligns_pad[m][left_id] + self.offset,
                                         rnn_output.size(1))
                        right_bound = max(min(aligns_pad[m][right_id] + self.offset,
                                              rnn_output.size(1)), 0)
                        mask[m][left_bound:right_bound] = 0
            att_c, att_w = self.attention(rnn_output.unsqueeze(dim=1),
                                          encoder_outputs.unsqueeze(0), mask)
            att_c = att_c.squeeze(dim=1)
            # step 3. concat s_i and c_i, and input to MLP
            mlp_input = torch.cat((rnn_output, att_c), dim=1)
            predicted_y_t = self.mlp(mlp_input)
            local_att_scores = F.log_softmax(predicted_y_t, dim=1)
            local_scores = local_att_scores
            if args.ctc_weight > 0:
                local_best_scores, local_best_ids = torch.topk(
                    local_att_scores, ctc_beam, dim=1)
                ctc_scores, ctc_states = ctc_prefix_score(
                    hyp['yseq'], local_best_ids[0], hyp['ctc_state_prev'])
                local_scores = (
                    (1.0 - ctc_weight) * local_att_scores[:, local_best_ids[0]]
                    + ctc_weight * torch.from_numpy(
                        ctc_scores - hyp['ctc_score_prev']).cuda())
                local_best_scores, joint_best_ids = torch.topk(local_scores, beam, dim=1)
                local_best_ids = local_best_ids[:, joint_best_ids[0]]
            else:
                # topk scores
                local_best_scores, local_best_ids = torch.topk(local_scores, beam, dim=1)

            for j in range(beam):
                new_hyp = {}
                new_hyp['h_prev'] = h_list[:]
                new_hyp['c_prev'] = c_list[:]
                new_hyp['a_prev'] = att_c[:]
                new_hyp['score'] = hyp['score'] + local_best_scores[0, j]
                new_hyp['yseq'] = [0] * (1 + len(hyp['yseq']))
                new_hyp['yseq'][:len(hyp['yseq'])] = hyp['yseq']
                new_hyp['yseq'][len(hyp['yseq'])] = int(local_best_ids[0, j])
                # will be (2 x beam) hyps at most
                if args.ctc_weight > 0:
                    new_hyp['ctc_state_prev'] = ctc_states[joint_best_ids[0, j]]
                    new_hyp['ctc_score_prev'] = ctc_scores[joint_best_ids[0, j]]
                hyps_best_kept.append(new_hyp)

            hyps_best_kept = sorted(hyps_best_kept,
                                    key=lambda x: x['score'],
                                    reverse=True)[:beam]
        # end for hyp in hyps
        hyps = hyps_best_kept
        # add eos in the final loop to avoid that there are no ended hyps
        if i == maxlen - 1:
            for hyp in hyps:
                hyp['yseq'].append(self.eos_id)
        # add ended hypotheses to a final list, and remove them from the current hypotheses
        # (this will be a problem, number of hyps < beam)
        remained_hyps = []
        for hyp in hyps:
            if hyp['yseq'][-1] == self.eos_id:
                # hyp['score'] += (i + 1) * penalty
                ended_hyps.append(hyp)
            else:
                remained_hyps.append(hyp)
        hyps = remained_hyps
        if len(hyps) > 0:
            print('remained hypotheses: ' + str(len(hyps)))
        else:
            print('no hypothesis. Finish decoding.')
            break
        for hyp in hyps:
            print('hypo: ' + ' '.join([char_list[int(x)] for x in hyp['yseq'][1:]]))
    # end for i in range(maxlen)
    nbest_hyps = sorted(ended_hyps, key=lambda x: x['score'],
                        reverse=True)[:min(len(ended_hyps), nbest)]
    return nbest_hyps
def forward(self, padded_input, encoder_padded_outputs, aligns, trun, epoch):
    """
    Args:
        padded_input: N x To
        # encoder_hidden: (num_layers * num_directions) x N x H
        encoder_padded_outputs: N x Ti x H
    Returns:
    """
    # ********* Get Input and Output
    # from espnet/Decoder.forward()
    # TODO: need to make more smart way
    ys = [y[y != IGNORE_ID] for y in padded_input]  # parse padded ys
    if aligns is not None:
        aligns = [y[y != IGNORE_ID] for y in aligns]
    # prepare input and output word sequences with sos/eos IDs
    eos = ys[0].new([self.eos_id])
    sos = ys[0].new([self.sos_id])
    ys_in = [torch.cat([sos, y], dim=0) for y in ys]
    ys_out = [torch.cat([y, eos], dim=0) for y in ys]
    # if len(aligns) != 0:
    #     aligns = [torch.cat([sos, y], dim=0) for y in aligns]
    # padding for ys with -1
    # pys: utt x olen
    ys_in_pad = pad_list(ys_in, self.eos_id)
    ys_out_pad = pad_list(ys_out, IGNORE_ID)
    if aligns is not None:
        aligns_pad = pad_list(aligns, 0)
        # if aligns_pad.size(1) < ys_in_pad.size(1):
        #     aligns_pad_end = aligns_pad.new_full((1, int(ys_in_pad.size(1) - aligns_pad.size(1))), 0)
        #     aligns_pad = [torch.cat([y, aligns_pad_end], dim=0) for y in aligns_pad]
    assert ys_in_pad.size() == ys_out_pad.size()
    batch_size = ys_in_pad.size(0)
    output_length = ys_in_pad.size(1)
    # max_length = ys_in_pad.size(1) - 1  # TODO: should minus 1 (sos)?

    # ********* Init decoder rnn
    h_list = [self.zero_state(encoder_padded_outputs)]
    c_list = [self.zero_state(encoder_padded_outputs)]
    for l in range(1, self.num_layers):
        h_list.append(self.zero_state(encoder_padded_outputs))
        c_list.append(self.zero_state(encoder_padded_outputs))
    att_c = self.zero_state(encoder_padded_outputs,
                            H=encoder_padded_outputs.size(2))
    y_all = []
    z_all = []

    # ********* LAS: 1. decoder rnn  2. attention  3. concat and MLP
    if self.sampling_probability:
        if epoch <= 5:
            sp = 0
        else:
            sp = self.sampling_probability + 0.01 * epoch
    embedded = self.dropout_emb(self.embedding(ys_in_pad))
    for t in range(output_length):
        # step 1. decoder RNN: s_i = RNN(s_i-1, y_i-1, c_i-1)
        if t > 0 and self.sampling_probability and random.random() < sp:
            y_out = self.mlp(z_all[-1])
            y_out = np.argmax(y_out.detach().cpu(), axis=1)
            rnn_input = torch.cat(
                (self.dropout_emb(self.embedding(y_out.cuda())), att_c), dim=1)
        else:
            rnn_input = torch.cat((embedded[:, t, :], att_c), dim=1)
        h_list, c_list = self.rnn_forward(rnn_input, h_list, c_list, h_list, c_list)
        rnn_output = h_list[-1]
        # below unsqueeze: (N x H) -> (N x 1 x H)
        # step 2. attention: c_i = AttentionContext(s_i, h)
        mask = None
        if aligns:
            mask = torch.ones(encoder_padded_outputs.size(0),
                              encoder_padded_outputs.size(1),
                              dtype=torch.uint8).cuda()
            if t + 1 < aligns_pad.size(1):
                for m in range(mask.size(0)):
                    if self.peak_left != 0:
                        left_id = max(t - self.peak_left + 1, 0)
                    else:
                        left_id = 0
                    right_id = min(t + 1 + self.peak_right, aligns_pad.size(1) - 1)
                    left_bound = min(aligns_pad[m][left_id] + self.offset,
                                     rnn_output.size(1))
                    right_bound = max(min(aligns_pad[m][right_id] + self.offset,
                                          rnn_output.size(1)), 0)
                    mask[m][left_bound:right_bound] = 0
        att_c, att_w = self.attention(rnn_output.unsqueeze(dim=1),
                                      encoder_padded_outputs, mask)
        att_c = att_c.squeeze(dim=1)
        # step 3. concat s_i and c_i, and input to MLP
        z_all.append(
            torch.cat((self.dropout_dec[-1](rnn_output), att_c), dim=-1))  # utt x (zdim + hdim)

    z_all = torch.stack(z_all, dim=1).view(batch_size * output_length, -1)
    y_all = self.mlp(z_all)
    # ********* Cross Entropy Loss
    # F.cross_entropy = NLL(log_softmax(input), target)
    if self.lsm_weight:
        ce_loss = self.criterion(y_all, ys_out_pad) / (
            1.0 / (np.mean([len(y) for y in ys_in]) - 1)
            * np.sum([len(y) for y in ys_in]))
    else:
        y_all = y_all.view(batch_size * output_length, self.vocab_size)
        ce_loss = F.cross_entropy(y_all, ys_out_pad.view(-1),
                                  ignore_index=IGNORE_ID,
                                  reduction='elementwise_mean')
        # TODO: should minus 1 here ?
        ce_loss *= (np.mean([len(y) for y in ys_in]) - 1)
    return ce_loss
def load_dataset(self, dataset_filepaths, parameters):
    '''
    args:
        dataset_filepaths : dictionary with keys 'train', 'valid', 'test'
    http://stackoverflow.com/questions/27416164/what-is-conll-data-format
    '''
    all_pretrained_tokens = None
    if parameters['token_pretrained_embedding_filepath'] != '':
        all_pretrained_tokens = utils_nlp.load_tokens_from_pretrained_token_embeddings(parameters)
        if self.verbose:
            print("len(all_pretrained_tokens): {0}".format(len(all_pretrained_tokens)))
    remap_to_unk_count_threshold = 1
    # if ['train'] not in dataset_filepaths.keys(): raise ValueError('')
    UNK_TOKEN_INDEX = 0
    PADDING_CHARACTER_INDEX = 0
    self.UNK = 'UNK'
    self.unique_labels = []
    labels = {}
    tokens = {}
    characters = {}
    token_lengths = {}
    label_count = {}
    token_count = {}
    character_count = {}
    for dataset_type in ['train', 'valid', 'test']:
        labels[dataset_type], tokens[dataset_type], token_count[dataset_type], \
            label_count[dataset_type], character_count[dataset_type] = \
            self._parse_dataset(dataset_filepaths[dataset_type], dataset_type)
        if self.verbose:
            print("dataset_type: {0}".format(dataset_type))
            print("len(token_count[dataset_type]): {0}".format(len(token_count[dataset_type])))

    token_count['all'] = {}  # utils.merge_dictionaries()
    for token in list(token_count['train'].keys()) + list(token_count['valid'].keys()) + list(token_count['test'].keys()):
        token_count['all'][token] = token_count['train'][token] + token_count['valid'][token] + token_count['test'][token]

    for dataset_type in ['train', 'valid', 'test']:
        if self.verbose:
            print("dataset_type: {0}".format(dataset_type))
            print("len(token_count[dataset_type]): {0}".format(len(token_count[dataset_type])))

    character_count['all'] = {}  # utils.merge_dictionaries()
    for character in list(character_count['train'].keys()) + list(character_count['valid'].keys()) + list(character_count['test'].keys()):
        character_count['all'][character] = character_count['train'][character] + character_count['valid'][character] + character_count['test'][character]

    label_count['all'] = {}  # utils.merge_dictionaries()
    for character in list(label_count['train'].keys()) + list(label_count['valid'].keys()) + list(label_count['test'].keys()):
        label_count['all'][character] = label_count['train'][character] + label_count['valid'][character] + label_count['test'][character]

    token_count['all'] = utils.order_dictionary(token_count['all'], 'value', reverse=True)
    label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse=False)
    label_count['train'] = utils.order_dictionary(label_count['train'], 'key', reverse=False)
    character_count['all'] = utils.order_dictionary(character_count['all'], 'value', reverse=True)
    if self.verbose:
        print('character_count[\'all\']: {0}'.format(character_count['all']))

    token_to_index = {}
    token_to_index[self.UNK] = UNK_TOKEN_INDEX
    iteration_number = 0
    number_of_unknown_tokens = 0
    for token, count in token_count['all'].items():
        if iteration_number == UNK_TOKEN_INDEX:
            iteration_number += 1
        if parameters['remove_unknown_tokens'] == 1 and \
                token_count['train'][token] == 0 and \
                (all_pretrained_tokens is None or
                 (token not in all_pretrained_tokens and
                  token.lower() not in all_pretrained_tokens and
                  re.sub('\d', '0', token.lower()) not in all_pretrained_tokens)):
            token_to_index[token] = UNK_TOKEN_INDEX
            number_of_unknown_tokens += 1
        else:
            token_to_index[token] = iteration_number
            iteration_number += 1
    if self.verbose:
        print("number_of_unknown_tokens: {0}".format(number_of_unknown_tokens))

    infrequent_token_indices = []
    for token, count in token_count['train'].items():
        if 0 < count <= remap_to_unk_count_threshold:
            infrequent_token_indices.append(token_to_index[token])
    if self.verbose:
        print("len(token_count['train']): {0}".format(len(token_count['train'])))
        print("len(infrequent_token_indices): {0}".format(len(infrequent_token_indices)))

    label_to_index = {}
    iteration_number = 0
    for label, count in label_count['all'].items():
        label_to_index[label] = iteration_number
        iteration_number += 1
        self.unique_labels.append(label)
    if self.verbose:
        print('self.unique_labels: {0}'.format(self.unique_labels))

    character_to_index = {}
    iteration_number = 0
    for character, count in character_count['all'].items():
        if iteration_number == PADDING_CHARACTER_INDEX:
            iteration_number += 1
        character_to_index[character] = iteration_number
        iteration_number += 1

    if self.verbose:
        print('token_count[\'train\'][0:10]: {0}'.format(list(token_count['train'].items())[0:10]))
    token_to_index = utils.order_dictionary(token_to_index, 'value', reverse=False)
    index_to_token = utils.reverse_dictionary(token_to_index)
    if parameters['remove_unknown_tokens'] == 1:
        index_to_token[UNK_TOKEN_INDEX] = self.UNK
    label_to_index = utils.order_dictionary(label_to_index, 'value', reverse=False)
    if self.verbose:
        print('label_to_index: {0}'.format(label_to_index))
    index_to_label = utils.reverse_dictionary(label_to_index)
    if self.verbose:
        print('index_to_label: {0}'.format(index_to_label))
    index_to_character = utils.reverse_dictionary(character_to_index)
    if self.verbose:
        print('character_to_index: {0}'.format(character_to_index))
        print('index_to_character: {0}'.format(index_to_character))
        print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
        print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))

    # Map tokens and labels to their indices
    token_indices = {}
    label_indices = {}
    character_indices = {}
    character_indices_padded = {}
    for dataset_type in ['train', 'valid', 'test']:
        token_indices[dataset_type] = []
        characters[dataset_type] = []
        character_indices[dataset_type] = []
        token_lengths[dataset_type] = []
        character_indices_padded[dataset_type] = []
        for token_sequence in tokens[dataset_type]:
            token_indices[dataset_type].append(
                [token_to_index[token] for token in token_sequence])
            characters[dataset_type].append(
                [list(token) for token in token_sequence])
            character_indices[dataset_type].append(
                [[character_to_index[character] for character in token]
                 for token in token_sequence])
            token_lengths[dataset_type].append(
                [len(token) for token in token_sequence])
            longest_token_length_in_sequence = max(token_lengths[dataset_type][-1])
            character_indices_padded[dataset_type].append([
                utils.pad_list(temp_token_indices,
                               longest_token_length_in_sequence,
                               PADDING_CHARACTER_INDEX)
                for temp_token_indices in character_indices[dataset_type][-1]
            ])
        label_indices[dataset_type] = []
        for label_sequence in labels[dataset_type]:
            label_indices[dataset_type].append(
                [label_to_index[label] for label in label_sequence])

    if self.verbose:
        print('token_lengths[\'train\'][0][0:10]: {0}'.format(token_lengths['train'][0][0:10]))
        print('characters[\'train\'][0][0:10]: {0}'.format(characters['train'][0][0:10]))
        print('token_indices[\'train\'][0:10]: {0}'.format(token_indices['train'][0:10]))
        print('label_indices[\'train\'][0:10]: {0}'.format(label_indices['train'][0:10]))
        print('character_indices[\'train\'][0][0:10]: {0}'.format(character_indices['train'][0][0:10]))
        print('character_indices_padded[\'train\'][0][0:10]: {0}'.format(character_indices_padded['train'][0][0:10]))

    # Vectorize the labels
    # [Numpy 1-hot array](http://stackoverflow.com/a/42263603/395857)
    label_binarizer = sklearn.preprocessing.LabelBinarizer()
    label_binarizer.fit(range(max(index_to_label.keys()) + 1))
    label_vector_indices = {}
    for dataset_type in ['train', 'valid', 'test']:
        label_vector_indices[dataset_type] = []
        for label_indices_sequence in label_indices[dataset_type]:
            label_vector_indices[dataset_type].append(
                label_binarizer.transform(label_indices_sequence))

    if self.verbose:
        print('label_vector_indices[\'train\'][0:2]: {0}'.format(label_vector_indices['train'][0:2]))
        print('len(label_vector_indices[\'train\']): {0}'.format(len(label_vector_indices['train'])))

    self.token_to_index = token_to_index
    self.index_to_token = index_to_token
    self.token_indices = token_indices
    self.label_indices = label_indices
    self.character_indices_padded = character_indices_padded
    self.index_to_character = index_to_character
    self.character_to_index = character_to_index
    self.character_indices = character_indices
    self.token_lengths = token_lengths
    self.characters = characters
    self.tokens = tokens
    self.labels = labels
    self.label_vector_indices = label_vector_indices
    self.index_to_label = index_to_label
    self.label_to_index = label_to_index
    if self.verbose:
        print("len(self.token_to_index): {0}".format(len(self.token_to_index)))
        print("len(self.index_to_token): {0}".format(len(self.index_to_token)))
    self.number_of_classes = max(self.index_to_label.keys()) + 1
    self.vocabulary_size = max(self.index_to_token.keys()) + 1
    self.alphabet_size = max(self.index_to_character.keys()) + 1
    if self.verbose:
        print("self.number_of_classes: {0}".format(self.number_of_classes))
        print("self.alphabet_size: {0}".format(self.alphabet_size))
        print("self.vocabulary_size: {0}".format(self.vocabulary_size))

    # unique_labels_of_interest is used to compute F1-scores.
    self.unique_labels_of_interest = list(self.unique_labels)
    self.unique_labels_of_interest.remove('O')
    self.unique_label_indices_of_interest = []
    for lab in self.unique_labels_of_interest:
        self.unique_label_indices_of_interest.append(label_to_index[lab])
    self.infrequent_token_indices = infrequent_token_indices
    if self.verbose:
        print('self.unique_labels_of_interest: {0}'.format(self.unique_labels_of_interest))
        print('self.unique_label_indices_of_interest: {0}'.format(self.unique_label_indices_of_interest))
    print('Dataset formatting completed')
def forward(self, enc_pad, enc_len, ys=None, tf_rate=1.0, max_dec_timesteps=500,
            sample=False, smooth=False, scaling=1.0, label_smoothing=True):
    batch_size = enc_pad.size(0)
    if ys is not None:
        # prepare input and output sequences
        bos = ys[0].data.new([self.bos])
        eos = ys[0].data.new([self.eos])
        ys_in = [torch.cat([bos, y], dim=0) for y in ys]
        ys_out = [torch.cat([y, eos], dim=0) for y in ys]
        pad_ys_in = pad_list(ys_in, pad_value=self.eos)
        pad_ys_out = pad_list(ys_out, pad_value=self.eos)
        # get length info
        batch_size, olength = pad_ys_out.size(0), pad_ys_out.size(1)
        # map idx to embedding
        eys = self.embedding(pad_ys_in)

    # initialization
    dec_c = self.zero_state(enc_pad)
    dec_z = self.zero_state(enc_pad)
    c = self.zero_state(enc_pad, dim=self.att_odim)
    w = None
    logits, prediction, ws = [], [], []
    # reset the attention module
    self.attention.reset()

    # loop for each timestep
    olength = max_dec_timesteps if not ys else olength
    for t in range(olength):
        # supervised learning: use teacher forcing
        if ys is not None:
            tf = True if np.random.random_sample() <= tf_rate else False
            emb = eys[:, t, :] if tf or t == 0 else self.embedding(prediction[-1])
        # otherwise, label the data greedily
        else:
            if t == 0:
                bos = cc(torch.Tensor([self.bos for _ in range(batch_size)]).type(torch.LongTensor))
                emb = self.embedding(bos)
            else:
                # using argmax
                if not smooth:
                    emb = self.embedding(prediction[-1])
                # smooth approximation of embedding
                else:
                    emb = F.softmax(logit * scaling, dim=-1) @ self.embedding.weight
        logit, dec_z, dec_c, c, w = \
            self.forward_step(emb, dec_z, dec_c, c, w, enc_pad, enc_len)
        ws.append(w)
        logits.append(logit)
        if not sample:
            prediction.append(torch.argmax(logit, dim=-1))
        else:
            sampled_indices = Categorical(logits=logit).sample()
            prediction.append(sampled_indices)

    logits = torch.stack(logits, dim=1)
    log_probs = F.log_softmax(logits, dim=2)
    prediction = torch.stack(prediction, dim=1)
    ws = torch.stack(ws, dim=1)
    if ys:
        ys_log_probs = torch.gather(log_probs, dim=2, index=pad_ys_out.unsqueeze(2)).squeeze(2)
    else:
        ys_log_probs = torch.gather(log_probs, dim=2, index=prediction.unsqueeze(2)).squeeze(2)
    # label smoothing
    if label_smoothing and self.ls_weight > 0 and self.training:
        loss_reg = torch.sum(log_probs * self.vlabeldist, dim=2)
        ys_log_probs = (1 - self.ls_weight) * ys_log_probs + self.ls_weight * loss_reg
    return logits, ys_log_probs, prediction, ws
def forward(self, xs_pad, ilens, ys_pad, iter, epoch, aligns_pad=None):
    """
    Args:
        padded_input: N x Ti x D
        input_lengths: N
        padded_targets: N x To
    """
    hs_pad, hlens, _ = self.encoder(xs_pad, ilens)
    if self.mode == 0:
        loss_ctc = 0
    else:
        loss_ctc = self.ctc(hs_pad, hlens, ys_pad)
    if self.trun:
        lpz = self.ctc.log_softmax(hs_pad)
        ctc_greedy = torch.max(lpz, dim=-1)[1]
        aligns = []
        for k in range(ctc_greedy.size()[0]):
            align = (torch.nonzero(ctc_greedy[k]) + 1).reshape(-1).cpu().numpy().tolist()
            align.insert(0, 0)
            aligns.append(align)
        aligns_pad = pad_list([torch.Tensor(y).long() for y in aligns], IGNORE_ID)
    if self.mode == 1:
        loss_att = 0
    else:
        loss_att = self.decoder(ys_pad, hs_pad, aligns_pad, self.trun, epoch)
    if self.mode == 0:
        cer_ctc = None
    else:
        cers = []
        cer_ctc = 0
        y_hats = self.ctc.argmax(hs_pad).data
        show_detail = 0
        if iter % 100 == 0:
            for i, y in enumerate(y_hats):
                y_hat = [x[0] for x in groupby(y)]
                y_true = ys_pad[i]
                seq_hat = [self.char_list[int(idx)] for idx in y_hat if int(idx) != -1]
                seq_true = [self.char_list[int(idx)] for idx in y_true if int(idx) != -1]
                seq_hat_text = "".join(seq_hat).replace(self.space, ' ')
                seq_hat_text = seq_hat_text.replace(self.blank, '')
                seq_true_text = "".join(seq_true).replace(self.space, ' ')
                hyp_chars = seq_hat_text.replace(' ', '')
                ref_chars = seq_true_text.replace(' ', '')
                if len(ref_chars) > 0:
                    cers.append(editdistance.eval(hyp_chars, ref_chars) / len(ref_chars))
                if i == (y_hats.size(0) - 1):
                    print(hyp_chars)
                    print(ref_chars)
                    if self.trun:
                        print(aligns_pad[-1].numpy().tolist())
        cer_ctc = sum(cers) / len(cers) if cers else None
    # if self.report_wer:
    #     if self.ctc_weight > 0.0:
    #         lpz = self.ctc.log_softmax(hs_pad).data
    #     else:
    #         lpz = None
    #     wers, cers = [], []
    #     nbest_hyps = self.decoder.recognize_beam(encoder_outputs[0], self.char_list, args)
    alpha = self.mode
    if alpha == 0:
        self.loss = loss_att
    elif alpha == 1:
        self.loss = loss_ctc
    else:
        self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
    if self.mode == 0:
        cer_ctc = 0
    print("ctc loss {0} | att loss {1} | loss {2} | cer {3}".format(
        float(loss_ctc), float(loss_att), float(self.loss), float(cer_ctc)))
    return self.loss
def _train_epoch(self, epoch):
    """
    Training logic for an epoch

    :param epoch: Current training epoch.
    :return: A log that contains all information you want to save.

    Note:
        If you have additional information to record, for example:
            > additional_log = {"x": x, "y": y}
        merge it with log before return, i.e.
            > log = {**log, **additional_log}
            > return log
        The metrics in log must have the key 'metrics'.
    """
    self.model.train()
    total_loss = 0.
    # begin train
    self.data_loader.train_iter.device = self.device
    for batch_idx, data in enumerate(self.data_loader.train_iter):
        # single answer or multi-answers
        if self.config["arch"]["type"] == "BiDAFMultiParasOrigin":
            p1, p2 = self.model(data)
            self.optimizer.zero_grad()
            # compute the absolute s_idx / e_idx after the paragraphs are concatenated
            max_p_len = data.paras_word[0].shape[2]
            s_idx = data.s_idx + data.answer_para_idx * max_p_len
            e_idx = data.e_idx + data.answer_para_idx * max_p_len
            all_loss = self.loss(p1, s_idx) + self.loss(p2, e_idx)
        else:
            input_data, label = self.build_data(data)
            p1, p2, score = self.model(input_data)
            self.optimizer.zero_grad()
            batch_size = p1.shape[0]
            max_ans_num = data.s_idxs.shape[1]
            max_p_len = input_data['paras_word'].shape[2]
            max_p_num = input_data['paras_word'].shape[1]
            match_scores = F.softmax(
                torch.Tensor(pad_list(label['match_scores'], pad=-1e12)).to(self.device), dim=1)
            reshape_s_idxs = data.s_idxs.reshape(-1)
            reshape_e_idxs = data.e_idxs.reshape(-1)
            reshape_match_scores = match_scores.reshape(-1)
            reshape_answer_para_idxs = data.answer_para_idxs.reshape(-1)
            dup_p1 = p1.unsqueeze(1).expand(-1, max_ans_num, -1).reshape(batch_size * max_ans_num, -1)
            dup_p2 = p2.unsqueeze(1).expand(-1, max_ans_num, -1).reshape(batch_size * max_ans_num, -1)
            dup_score = score.unsqueeze(1).expand(-1, max_ans_num, -1).reshape(batch_size * max_ans_num, -1)
            # compute the offset of each answer span inside the concatenated paragraphs
            reshape_s_idxs = reshape_s_idxs + reshape_answer_para_idxs * max_p_len
            reshape_e_idxs = reshape_e_idxs + reshape_answer_para_idxs * max_p_len
            # with match score
            lamda = self.config["loss"]["lamda"]
            ans_span_loss = (self.loss(dup_p1, reshape_s_idxs)
                             + self.loss(dup_p2, reshape_e_idxs)) * reshape_match_scores
            pr_loss = self.loss(dup_score, reshape_answer_para_idxs) * reshape_match_scores
            # all_loss = torch.mean((1 - lamda) * ans_span_loss + lamda * pr_loss)
            all_loss = torch.mean(ans_span_loss)
            # without match score:
            # lamda = self.config["loss"]["lamda"]
            # ans_span_loss = self.loss(dup_p1, reshape_s_idxs) + self.loss(dup_p2, reshape_e_idxs)
            # pr_loss = self.loss(dup_score, reshape_answer_para_idxs)
            # all_loss = torch.mean((1 - lamda) * ans_span_loss + lamda * pr_loss)
        all_loss.backward()
        self.optimizer.step()
        total_loss += all_loss.item() * p1.size()[0]
        if batch_idx % self.log_step == 0:
            self.logger.info('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                epoch, batch_idx, len(self.data_loader.train_iter),
                100.0 * batch_idx / len(self.data_loader.train_iter), all_loss.item()))
        # add scalar to writer
        global_step = (epoch - 1) * len(self.data_loader.train) + batch_idx
        self.writer.add_scalar('train_loss', all_loss.item(), global_step=global_step)

    # training average loss
    avg_loss = total_loss / (len(self.data_loader.train) + 0.)
    metrics = np.array([avg_loss])
    result = {"train_metrics": metrics}

    # evaluate
    if self.do_validation:
        result = self._valid_epoch(epoch)
        self.logger.info("Training epoch {} done, avg loss: {}, ROUGE-L: {}, BLUE-4: {}".format(
            epoch, avg_loss, result["ROUGE-L"], result["BLUE-4"]))
        self.writer.add_scalar("eval_ROUGE-L", result["ROUGE-L"],
                               global_step=epoch * len(self.data_loader.train))
        self.writer.add_scalar("eval_BLUE-4", result["BLUE-4"],
                               global_step=epoch * len(self.data_loader.train))
    return result
def construct_rels(self, batch):
    batch = self.__class__.vectorizer.forward(
        pad_list(list(map(self.tokenizer, batch)), pad=lambda: '&'))
    return batch