def forward(self, insts: Dict[str, List[Union[List[str], InternalParseNode]]], return_charts: bool = False)\
        -> Union[torch.Tensor, Tuple[List[InternalParseNode], List[np.ndarray]], List[np.ndarray]]:
    """Forward pass of the model.

    Args:
        insts: input instances, including 'pos_tags', 'snts',
            'joint_gold_trees' and 'parsing_gold_trees'
        return_charts: if True, return the raw label charts (used for ensembling)

    Returns:
        the loss tensor in training mode, (trees, scores) in evaluation mode,
        or a list of per-sentence charts when return_charts is True
    """
    pos_tags, snts = insts['pos_tags'], insts['snts']
    snts_len = [len(pos_tag) for pos_tag in pos_tags]
    batch_size, seq_len = len(snts_len), max(snts_len)
    embeddings, mask = self.embeddings(pos_tags, snts)
    assert (batch_size, seq_len + 2) == embeddings.shape[0:2] == mask.shape[0:2]
    words_repr = self.encoder(embeddings, mask)
    assert (batch_size, seq_len + 1) == words_repr.shape[0:2]
    # span (i, j) is represented by the difference of its two fencepost vectors
    spans_repr = words_repr.unsqueeze(1) - words_repr.unsqueeze(2)  # [batch_size, seq_len+1, seq_len+1, dim]
    assert (batch_size, seq_len + 1, seq_len + 1) == spans_repr.shape[0:3]
    joint_repr = self.joint_label_classifier(spans_repr)
    parsing_repr = self.parsing_label_classifier(spans_repr)
    # the NER representation is the residual between the joint and parsing views
    ner_repr = joint_repr - parsing_repr
    joint_labels_score = self.joint_classifier(joint_repr)
    parsing_labels_score = self.parsing_classifier(parsing_repr)
    # ner_labels_score = self.ner_classifier(ner_repr[:, :-1, :-1, :])
    ner_labels_score = self.ner_classifier(ner_repr[:, :-1, 1:, :])
    empty_label_score = torch.zeros((batch_size, seq_len + 1, seq_len + 1, 1), device=self.device)
    joint_charts = torch.cat([empty_label_score, joint_labels_score], dim=3)
    parsing_charts = torch.cat([empty_label_score, parsing_labels_score], dim=3)
    # append a constant zero score for the non-entity label (index len(NER_LABELS))
    ner_labels_score = torch.cat([ner_labels_score, empty_label_score[:, :-1, :-1, :]], dim=3)
    joint_charts_np = joint_charts.cpu().detach().numpy()
    parsing_charts_np = parsing_charts.cpu().detach().numpy()

    # compute loss and generate tree
    # just return the charts, for ensembling
    if return_charts:
        ret_charts = []
        for i, snt_len in enumerate(snts_len):
            ret_charts.append(joint_charts[i, :snt_len + 1, :snt_len + 1, :].cpu().numpy())
        return ret_charts

    # at test time, just return trees and scores
    if not self.training:
        trees = []
        scores = []
        for i, snt_len in enumerate(snts_len):
            chart_np = joint_charts_np[i, :snt_len + 1, :snt_len + 1, :]
            score, p_i, p_j, p_label, _ = self.parse_from_chart(snt_len, chart_np, self.joint_labels_vocab)
            pos_tag, snt = pos_tags[i], snts[i]
            tree = self.generate_tree(p_i, p_j, p_label, pos_tag, snt)
            trees.append(tree)
            scores.append(score)
        return trees, scores

    # at training time, return the loss.
    # During training, the forward pass needs to be computed for every cell of
    # the chart, but the backward pass only needs to be computed for cells in
    # either the predicted or the gold parse tree. It's slightly faster to
    # duplicate the forward pass for a subset of the chart than it is to
    # perform a backward pass that doesn't take advantage of sparsity. Since
    # this code is not undergoing algorithmic changes, it makes sense to
    # include the optimization even though it may only be a 10% speedup.
    # Note that no dropout occurs in the label portion of the network.
    # cross_loss = torch.tensor(0., device=self.device)
    joint_golds = insts['joint_gold_trees']
    parsing_golds = insts['parsing_gold_trees']
    p_is, p_js, g_is, g_js, p_labels, g_labels, batch_ids, paugment_total_joint = [], [], [], [], [], [], [], 0.0
    p_is_parsing, p_js_parsing, g_is_parsing, g_js_parsing, p_labels_parsing, g_labels_parsing, batch_ids_parsing,\
        paugment_total_parsing = [], [], [], [], [], [], [], 0.0
    # ner_is, ner_js, ner_labels, ner_batch_ids = [], [], [], []
    for i, snt_len in enumerate(snts_len):
        # joint parser
        chart_np = joint_charts_np[i, :snt_len + 1, :snt_len + 1, :]
        p_i, p_j, p_label, p_augment, g_i, g_j, g_label = \
            self.parse_from_chart(snt_len, chart_np, self.joint_labels_vocab, joint_golds[i])
        paugment_total_joint += p_augment
        p_is.extend(p_i.tolist())
        p_js.extend(p_j.tolist())
        p_labels.extend(p_label.tolist())
        g_is.extend(g_i.tolist())
        g_js.extend(g_j.tolist())
        g_labels.extend(g_label.tolist())
        batch_ids.extend([i for _ in range(len(p_i))])
        # parsing parser
        chart_np = parsing_charts_np[i, :snt_len + 1, :snt_len + 1, :]
        p_i, p_j, p_label, p_augment, g_i, g_j, g_label = \
            self.parse_from_chart(snt_len, chart_np, self.parsing_labels_vocab, parsing_golds[i])
        paugment_total_parsing += p_augment
        p_is_parsing.extend(p_i.tolist())
        p_js_parsing.extend(p_j.tolist())
        p_labels_parsing.extend(p_label.tolist())
        g_is_parsing.extend(g_i.tolist())
        g_js_parsing.extend(g_j.tolist())
        g_labels_parsing.extend(g_label.tolist())
        batch_ids_parsing.extend([i for _ in range(len(p_i))])
        # ner idx
        # ner_i, ner_j, ner_label = self.generate_ner_spans(joint_golds[i])
        # ner_is.extend(ner_i)
        # ner_js.extend([j - 1 for j in ner_j])
        # # ner_js.extend([j for j in ner_j])
        # ner_labels.extend(ner_label)
        # ner_batch_ids.extend([i for _ in range(len(ner_i))])
    p_scores_joint = torch.sum(joint_charts[batch_ids, p_is, p_js, p_labels])
    g_scores_joint = torch.sum(joint_charts[batch_ids, g_is, g_js, g_labels])
    loss_joint = p_scores_joint - g_scores_joint + paugment_total_joint
    p_scores_parsing = torch.sum(parsing_charts[batch_ids_parsing, p_is_parsing, p_js_parsing, p_labels_parsing])
    g_scores_parsing = torch.sum(parsing_charts[batch_ids_parsing, g_is_parsing, g_js_parsing, g_labels_parsing])
    loss_parsing = p_scores_parsing - g_scores_parsing + paugment_total_parsing
    # ner loss: classify every in-sentence span, with non-entity spans as the default class
    spans_mask = [[[0] * i + [1] * (snt_len - i) + [0] * (seq_len - snt_len) if i < snt_len else [0] * seq_len
                   for i in range(seq_len)]
                  for snt_len in snts_len]
    spans_mask = np.array(spans_mask, dtype=bool)
    # spans_mask = np.array(spans_mask, dtype=bool) * (np.random.rand(batch_size, seq_len, seq_len) < 1.0)
    spans_label_idx = []
    for idx, gold_tree in enumerate(joint_golds):
        label_idx_np = np.full((snts_len[idx], snts_len[idx]), len(NER_LABELS), dtype=np.int64)
        ner_i, ner_j, ner_label = self.generate_ner_spans(gold_tree)
        for label_idx, start_i, end_j in zip(ner_label, ner_i, ner_j):
            label_idx_np[start_i, end_j - 1] = label_idx
            spans_mask[idx, start_i, end_j - 1] = True
        spans_label_idx.extend(label_idx_np[spans_mask[idx, :snts_len[idx], :snts_len[idx]]].tolist())
    assert np.sum(spans_mask) == len(spans_label_idx)
    target = torch.tensor(spans_label_idx, dtype=torch.long, device=self.device)
    spans_mask_tensor = torch.tensor(spans_mask, dtype=torch.bool, device=self.device).unsqueeze(3)
    ner_loss = self.criterion_ner(
        torch.masked_select(ner_labels_score, spans_mask_tensor).view(-1, len(NER_LABELS) + 1),
        target)
    # ner_score: torch.Tensor = ner_labels_score[ner_batch_ids, ner_is, ner_js, :]
    # assert ner_score.shape[0] == len(ner_labels)
    # ner_loss = self.criterion_ner(ner_score, torch.tensor(ner_labels, dtype=torch.long, device=self.device))
    loss = loss_joint + self.lambda_scaler * (
        (self.alpha_scaler + 1.) * loss_parsing + (1. - self.alpha_scaler) * ner_loss)
    return loss
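
# A minimal, self-contained sketch (not part of the model) of the fencepost
# span representation used in the forward passes above: broadcasting
# unsqueeze(1) against unsqueeze(2) materializes all O(n^2) span vectors at
# once, with spans_repr[b, i, j] == words_repr[b, j] - words_repr[b, i].
# The function name below is hypothetical.
def _demo_span_repr() -> None:
    import torch
    batch_size, n_fenceposts, dim = 2, 5, 4
    words_repr = torch.randn(batch_size, n_fenceposts, dim)
    # [batch, 1, n, dim] - [batch, n, 1, dim] -> [batch, n, n, dim]
    spans_repr = words_repr.unsqueeze(1) - words_repr.unsqueeze(2)
    assert spans_repr.shape == (batch_size, n_fenceposts, n_fenceposts, dim)
    # span (1, 3) is the difference of its right and left boundary vectors
    assert torch.allclose(spans_repr[0, 1, 3], words_repr[0, 3] - words_repr[0, 1])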
def forward(self, insts: Dict[str, List[Union[List[str], InternalParseNode]]], return_charts: bool = False)\
        -> Union[torch.Tensor, Tuple[List[InternalParseNode], List[np.ndarray]], List[np.ndarray]]:
    """Forward pass of the model.

    Args:
        insts: input instances, including 'pos_tags', 'snts',
            'joint_gold_trees' and 'parsing_gold_trees'
        return_charts: if True, return the raw label charts (used for ensembling)

    Returns:
        the loss tensor in training mode, (trees, scores) in evaluation mode,
        or a list of per-sentence charts when return_charts is True
    """
    pos_tags, snts = insts['pos_tags'], insts['snts']
    snts_len = [len(pos_tag) for pos_tag in pos_tags]
    batch_size, seq_len = len(snts_len), max(snts_len)
    embeddings, mask = self.embeddings(pos_tags, snts)
    assert (batch_size, seq_len + 2) == embeddings.shape[0:2] == mask.shape[0:2]
    words_repr = self.encoder(embeddings, mask)
    assert (batch_size, seq_len + 1) == words_repr.shape[0:2]
    # span (i, j) is represented by the difference of its two fencepost vectors
    spans_repr = words_repr.unsqueeze(1) - words_repr.unsqueeze(2)  # [batch_size, seq_len+1, seq_len+1, dim]
    assert (batch_size, seq_len + 1, seq_len + 1) == spans_repr.shape[0:3]
    joint_labels_score = self.joint_label_classifier(spans_repr)
    parsing_labels_score = self.parsing_label_classifier(spans_repr)
    ner_labels_score = self.ner_label_classifier(spans_repr)
    empty_label_score = torch.zeros((batch_size, seq_len + 1, seq_len + 1, 1), device=self.device)
    joint_charts = torch.cat([empty_label_score, joint_labels_score], dim=3)
    parsing_charts = torch.cat([empty_label_score, parsing_labels_score], dim=3)
    # append a constant zero score as the last (empty) NER label
    ner_labels_score = torch.cat([ner_labels_score, empty_label_score], dim=3)
    joint_charts_np = joint_charts.cpu().detach().numpy()
    parsing_charts_np = parsing_charts.cpu().detach().numpy()

    # compute loss and generate tree
    # just return the charts, for ensembling
    if return_charts:
        ret_charts = []
        for i, snt_len in enumerate(snts_len):
            ret_charts.append(joint_charts[i, :snt_len + 1, :snt_len + 1, :].cpu().numpy())
        return ret_charts

    # at test time, just return trees and scores
    if not self.training:
        trees = []
        scores = []
        for i, snt_len in enumerate(snts_len):
            chart_np = joint_charts_np[i, :snt_len + 1, :snt_len + 1, :]
            score, p_i, p_j, p_label, _ = self.parse_from_chart(snt_len, chart_np, self.joint_labels_vocab)
            pos_tag, snt = pos_tags[i], snts[i]
            tree = self.generate_tree(p_i, p_j, p_label, pos_tag, snt)
            trees.append(tree)
            scores.append(score)
        return trees, scores

    # at training time, return the loss.
    # During training, the forward pass needs to be computed for every cell of
    # the chart, but the backward pass only needs to be computed for cells in
    # either the predicted or the gold parse tree. It's slightly faster to
    # duplicate the forward pass for a subset of the chart than it is to
    # perform a backward pass that doesn't take advantage of sparsity. Since
    # this code is not undergoing algorithmic changes, it makes sense to
    # include the optimization even though it may only be a 10% speedup.
    # Note that no dropout occurs in the label portion of the network.
    # cross_loss = torch.tensor(0., device=self.device)
    joint_golds = insts['joint_gold_trees']
    parsing_golds = insts['parsing_gold_trees']
    p_is, p_js, g_is, g_js, p_labels, g_labels, batch_ids, paugment_total_joint = [], [], [], [], [], [], [], 0.0
    p_is_parsing, p_js_parsing, g_is_parsing, g_js_parsing, p_labels_parsing, g_labels_parsing, batch_ids_parsing,\
        paugment_total_parsing = [], [], [], [], [], [], [], 0.0
    ner_is, ner_js, ner_labels, ner_batch_ids = [], [], [], []
    for i, snt_len in enumerate(snts_len):
        # joint parser
        chart_np = joint_charts_np[i, :snt_len + 1, :snt_len + 1, :]
        p_i, p_j, p_label, p_augment, g_i, g_j, g_label = \
            self.parse_from_chart(snt_len, chart_np, self.joint_labels_vocab, joint_golds[i])
        paugment_total_joint += p_augment
        p_is.extend(p_i.tolist())
        p_js.extend(p_j.tolist())
        p_labels.extend(p_label.tolist())
        g_is.extend(g_i.tolist())
        g_js.extend(g_j.tolist())
        g_labels.extend(g_label.tolist())
        batch_ids.extend([i for _ in range(len(p_i))])
        # parsing parser
        chart_np = parsing_charts_np[i, :snt_len + 1, :snt_len + 1, :]
        p_i, p_j, p_label, p_augment, g_i, g_j, g_label = \
            self.parse_from_chart(snt_len, chart_np, self.parsing_labels_vocab, parsing_golds[i])
        paugment_total_parsing += p_augment
        p_is_parsing.extend(p_i.tolist())
        p_js_parsing.extend(p_j.tolist())
        p_labels_parsing.extend(p_label.tolist())
        g_is_parsing.extend(g_i.tolist())
        g_js_parsing.extend(g_j.tolist())
        g_labels_parsing.extend(g_label.tolist())
        batch_ids_parsing.extend([i for _ in range(len(p_i))])
        # cross loss
        # cross_spans = self.generate_cross_label_spans(golds[i])
        # for constit, constit_gold, ner, ner_gold, span_start, span_end in cross_spans:
        #     constit_idx = self.cross_label_idx[constit]
        #     ner_idx = self.cross_label_idx[ner]
        #     cross_constit_loss = self.log_softmax(charts[i, span_start, span_end, constit_idx])[constit_gold]
        #     cross_ner_loss = self.log_softmax(charts[i, span_start, span_end, ner_idx])[ner_gold]
        #     cross_loss = cross_loss - cross_constit_loss - cross_ner_loss
        # ner idx
        ner_i, ner_j, ner_label = self.generate_ner_spans(joint_golds[i])
        ner_is.extend(ner_i)
        ner_js.extend(ner_j)
        ner_labels.extend(ner_label)
        ner_batch_ids.extend([i for _ in range(len(ner_i))])
    p_scores_joint = torch.sum(joint_charts[batch_ids, p_is, p_js, p_labels])
    g_scores_joint = torch.sum(joint_charts[batch_ids, g_is, g_js, g_labels])
    loss_joint = p_scores_joint - g_scores_joint + paugment_total_joint
    p_scores_parsing = torch.sum(parsing_charts[batch_ids_parsing, p_is_parsing, p_js_parsing, p_labels_parsing])
    g_scores_parsing = torch.sum(parsing_charts[batch_ids_parsing, g_is_parsing, g_js_parsing, g_labels_parsing])
    loss_parsing = p_scores_parsing - g_scores_parsing + paugment_total_parsing
    # gather the scores of the gold NER spans and apply cross-entropy
    ner_score: torch.Tensor = ner_labels_score[ner_batch_ids, ner_is, ner_js, :]
    assert ner_score.shape[0] == len(ner_labels)
    # ner_loss = torch.sum(torch.log2(self.softmax(ner_score)[[i for i in range(len(ner_labels))], ner_labels]))
    ner_loss = self.criterion_ner(
        ner_score, torch.tensor(ner_labels, dtype=torch.long, device=self.device))
    loss = loss_joint + self.lambda_scaler * (
        self.alpha_scaler * loss_parsing + (1. - self.alpha_scaler) * ner_loss)
    return loss
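
# A minimal sketch (not part of the model) of the NER loss used just above:
# gold entity spans are pulled out of the dense score tensor with advanced
# indexing, then scored with a standard cross-entropy criterion. All sizes
# and names below are made up for illustration.
def _demo_ner_span_loss() -> None:
    import torch
    num_ner_labels = 3
    # [batch, n+1, n+1, num_ner_labels + 1]: one extra zero-scored empty label
    ner_labels_score = torch.randn(2, 5, 5, num_ner_labels + 1)
    # one gold span per sentence: parallel lists of (batch_id, start, end)
    ner_batch_ids, ner_is, ner_js = [0, 1], [0, 2], [2, 4]
    ner_labels = torch.tensor([1, 2], dtype=torch.long)
    # advanced indexing gathers one score row per gold span
    ner_score = ner_labels_score[ner_batch_ids, ner_is, ner_js, :]  # [2, num_ner_labels + 1]
    loss = torch.nn.CrossEntropyLoss()(ner_score, ner_labels)
    assert loss.dim() == 0  # a scalar, ready to be mixed into the joint loss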
def forward(self, insts: Dict[str, List[Union[List[str], InternalParseNode]]], return_charts: bool = False)\
        -> Union[torch.Tensor, Tuple[List[InternalParseNode], List[np.ndarray]], List[np.ndarray]]:
    """Forward pass of the model.

    Args:
        insts: input instances, including 'pos_tags', 'snts' and 'gold_trees'
        return_charts: if True, return the raw label charts (used for ensembling)

    Returns:
        the loss tensor in training mode, (trees, scores) in evaluation mode,
        or a list of per-sentence charts when return_charts is True
    """
    pos_tags, snts = insts['pos_tags'], insts['snts']
    snts_len = [len(pos_tag) for pos_tag in pos_tags]
    batch_size, seq_len = len(snts_len), max(snts_len)
    embeddings, mask = self.embeddings(pos_tags, snts)
    assert (batch_size, seq_len + 2) == embeddings.shape[0:2] == mask.shape[0:2]
    words_repr = self.encoder(embeddings, mask)
    assert (batch_size, seq_len + 1) == words_repr.shape[0:2]
    # span (i, j) is represented by the difference of its two fencepost vectors
    spans_repr = words_repr.unsqueeze(1) - words_repr.unsqueeze(2)  # [batch_size, seq_len+1, seq_len+1, dim]
    assert (batch_size, seq_len + 1, seq_len + 1) == spans_repr.shape[0:3]
    labels_score = self.label_classifier(spans_repr)
    # prepend a constant zero score for the empty label
    charts = torch.cat([
        torch.zeros((batch_size, seq_len + 1, seq_len + 1, 1), device=self.device),
        labels_score
    ], dim=3)
    charts_np = charts.cpu().detach().numpy()

    # compute loss and generate tree
    # just return the charts, for ensembling
    if return_charts:
        ret_charts = []
        for i, snt_len in enumerate(snts_len):
            ret_charts.append(charts[i, :snt_len + 1, :snt_len + 1, :].cpu().numpy())
        return ret_charts

    # at test time, just return trees and scores
    if not self.training:
        trees = []
        scores = []
        for i, snt_len in enumerate(snts_len):
            chart_np = charts_np[i, :snt_len + 1, :snt_len + 1, :]
            score, p_i, p_j, p_label, _ = self.parse_from_chart(snt_len, chart_np)
            pos_tag, snt = pos_tags[i], snts[i]
            tree = self.generate_tree(p_i, p_j, p_label, pos_tag, snt)
            trees.append(tree)
            scores.append(score)
        return trees, scores

    # at training time, return the loss.
    # During training, the forward pass needs to be computed for every cell of
    # the chart, but the backward pass only needs to be computed for cells in
    # either the predicted or the gold parse tree. It's slightly faster to
    # duplicate the forward pass for a subset of the chart than it is to
    # perform a backward pass that doesn't take advantage of sparsity. Since
    # this code is not undergoing algorithmic changes, it makes sense to
    # include the optimization even though it may only be a 10% speedup.
    # Note that no dropout occurs in the label portion of the network.
    golds = insts['gold_trees']
    p_is, p_js, g_is, g_js, p_labels, g_labels, batch_ids, paugment_total = [], [], [], [], [], [], [], 0.0
    for i, snt_len in enumerate(snts_len):
        chart_np = charts_np[i, :snt_len + 1, :snt_len + 1, :]
        p_i, p_j, p_label, p_augment, g_i, g_j, g_label = self.parse_from_chart(snt_len, chart_np, golds[i])
        paugment_total += p_augment
        p_is.extend(p_i.tolist())
        p_js.extend(p_j.tolist())
        p_labels.extend(p_label.tolist())
        g_is.extend(g_i.tolist())
        g_js.extend(g_j.tolist())
        g_labels.extend(g_label.tolist())
        batch_ids.extend([i for _ in range(len(p_i))])
    p_scores = torch.sum(charts[batch_ids, p_is, p_js, p_labels])
    g_scores = torch.sum(charts[batch_ids, g_is, g_js, g_labels])
    # max-margin loss with the Hamming augment from loss-augmented decoding
    loss = p_scores - g_scores + paugment_total
    return loss
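
# A minimal sketch (not part of the model) of the structured max-margin loss
# computed at the end of each forward pass: sum the chart scores of the
# (loss-augmented) predicted tree's spans, subtract the gold tree's span
# scores, and add the accumulated Hamming augment. The loss is non-negative
# and reaches zero once the gold tree outscores the augmented prediction.
# All index values below are made up for illustration.
def _demo_margin_loss() -> None:
    import torch
    charts = torch.randn(1, 4, 4, 3, requires_grad=True)  # [batch, n+1, n+1, labels]
    # spans of the predicted and gold trees as parallel (batch, i, j, label) indices
    p_ids = ([0, 0], [0, 1], [3, 3], [1, 2])
    g_ids = ([0, 0], [0, 2], [3, 3], [1, 1])
    paugment = 1.0  # Hamming augment accumulated during loss-augmented decoding
    loss = torch.sum(charts[p_ids]) - torch.sum(charts[g_ids]) + paugment
    # gradients flow only through cells of the predicted and gold trees,
    # which is why the backward pass can exploit sparsity
    loss.backward()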