def extract_features(self, inputs):
    """Extract flat feature views and a matching sentence mask from a parse.

    Args:
        inputs: Parse dict; must contain a "sentences" list (each sentence a
            dict with a "tokens" list — assumed from the sibling variants of
            this method; generate_flat_feats_and_mask consumes the full dict).

    Returns:
        Tuple ``(src_feats, sent_mask)`` of int32 numpy arrays of identical
        shape, with feature views stacked along ``self._views_axis``.

    Raises:
        ValueError: If "sentences" is missing from ``inputs``.
    """
    if "sentences" not in inputs:
        # Fixed: message previously referred to a `tokens` field although the
        # check is for `sentences`.
        raise ValueError(
            "inputs must be a parse containing `sentences` field!")

    # NOTE: a loop counting tokens was removed here — its result was never
    # used in this method.

    src_feats, sent_mask = generate_flat_feats_and_mask(
        inputs, self._skip, self._labels_start_id)
    src_feats = [
        np.expand_dims(np.asarray(x, dtype=np.int32), axis=self._views_axis)
        for x in src_feats
    ]
    sent_mask = [
        np.expand_dims(np.asarray(x, dtype=np.int32), axis=self._views_axis)
        for x in sent_mask
    ]

    # Keep at most _max_views feature views.
    src_feats = list(trim_feats_list(src_feats, self._max_views))
    if self._use_features_as_mask:
        # In this setting attention is computed between equal labels, so the
        # (already trimmed) features themselves serve as the mask.
        sent_mask = list(trim_feats_list(src_feats, self._max_views))
    else:
        sent_mask = list(trim_feats_list(sent_mask, self._max_views))

    if len(src_feats) > 1:
        src_feats = np.concatenate(src_feats, axis=self._views_axis)
        sent_mask = np.concatenate(sent_mask, axis=self._views_axis)
    else:
        src_feats = src_feats[0]
        sent_mask = sent_mask[0]

    if not self._use_mask:
        # Masking disabled: use an all-ones mask of matching shape.
        sent_mask = np.ones_like(src_feats)

    assert src_feats.shape == sent_mask.shape, (
        "src_feats.shape={0} and sent_mask.shape={1} should be the "
        "same.".format(src_feats.shape, sent_mask.shape))

    return src_feats, sent_mask
# Example #2
# 0
    def extract_features(self, inputs):
        """Extract a coreference feature view and a coref-cluster token mask.

        Args:
            inputs: Parse dict; must contain "sentences" (each with a
                "tokens" list). May contain "coref_clusters", a list of
                cluster dicts whose "mentions" carry token-level
                "start"/"end" offsets (end exclusive, as used below).

        Returns:
            Tuple ``(src_feats, sent_ids_mask)`` of int32 numpy arrays of
            identical shape; the mask holds 1-based cluster ids per token
            (0 = token belongs to no cluster).

        Raises:
            ValueError: If "sentences" is missing from ``inputs``.
        """
        if "sentences" not in inputs:
            # Fixed: message previously referred to a `tokens` field although
            # the check is for `sentences`.
            raise ValueError(
                "inputs must be a parse containing `sentences` field!")

        all_tokens_cnt = sum(
            len(sent["tokens"]) for sent in inputs["sentences"])

        # NOTE(review): cluster labels below are written only to
        # sent_ids_mask; coref_feats stays all-zero. Confirm whether the
        # features were also meant to carry the cluster labels.
        coref_feats = [np.zeros(all_tokens_cnt, dtype=np.int32)]
        sent_ids_mask = np.zeros(all_tokens_cnt, dtype=np.int32)

        if "coref_clusters" in inputs:
            for coref_id, coref_cluster in enumerate(inputs["coref_clusters"]):
                cluster_label = coref_id + 1  # 0 is reserved for "no cluster"
                for mention in coref_cluster.get("mentions", []):
                    # Vectorized equivalent of the original per-token loop
                    # over range(start, end).
                    sent_ids_mask[mention["start"]:mention["end"]] = \
                        cluster_label

        # Removed dead branch: coref_feats is created with exactly one view
        # above and never emptied, so the `if len(coref_feats) == 0` re-init
        # could never fire.

        src_feats = [
            np.expand_dims(x, axis=self._views_axis)
            for x in trim_feats_list(coref_feats, self._max_views)
        ]

        if len(src_feats) > 1:
            sent_ids_mask = np.expand_dims(
                np.array(sent_ids_mask),
                axis=self._views_axis).repeat(len(src_feats),
                                              self._views_axis)
            src_feats = np.concatenate(src_feats, axis=self._views_axis)
        else:
            src_feats = src_feats[0]
            # NOTE(review): this wraps along axis 0 regardless of
            # _views_axis — presumably _views_axis is 0 here; verify.
            sent_ids_mask = np.array([sent_ids_mask])

        if self._pad_views and self._max_views < src_feats.shape[1]:
            src_feats = src_feats.repeat(self._max_views, self._views_axis)
            sent_ids_mask = sent_ids_mask.repeat(self._max_views,
                                                 self._views_axis)

        assert src_feats.shape == sent_ids_mask.shape, (
            "src_feats.shape={0} and sent_ids_mask.shape={1} should be the "
            "same.".format(src_feats.shape, sent_ids_mask.shape))

        return src_feats, sent_ids_mask
    def _fill_dr_annotation(self, annotation, feat_view, mask_view,
                            mask_value):
        """Write the label id and mask value for every non-empty argument
        span ("Arg1", "Arg2", "Conn") of one discourse-relation annotation
        into the given flat feature/mask views.

        This replaces two previously duplicated copies of the same loop
        (one for implicit, one for explicit relations).
        """
        dr_type_to_label = {"Implicit": "NONEXP", "Explicit": "EXP"}
        for arg_name in ["Arg1", "Arg2", "Conn"]:
            span = annotation[arg_name]["Span"]
            if not span:
                continue

            dr_sense_label = "__" + annotation[
                "Sense"] if self._use_senses_for_tags else ""
            label = "DR_{0}{1}__{2}".format(
                dr_type_to_label[annotation["Type"]], dr_sense_label,
                arg_name)
            # Unknown labels map to id 0.
            label_id = self._vocab_feat_name2id.get(label, 0)
            fill_span(feat_view, span, label_id)
            fill_span(mask_view, span, mask_value)

    def extract_features(self, inputs):
        """Extract discourse-relation ("sdp") feature views and a mask.

        Builds up to two views for implicit relations (split by Arg1
        sentence parity) and up to ``_max_explicit_views`` views for
        explicit relations (one per connective per sentence), then stacks
        them along ``self._views_axis``.

        Args:
            inputs: Parse dict; must contain "sentences". May contain
                "sdp", a list of annotations with "Type", "Sense", and
                "Arg1"/"Arg2"/"Conn" dicts carrying "Span" (token offsets)
                and "Sent" (sentence index, on Arg1).

        Returns:
            Tuple ``(src_feats, sent_ids_mask)`` of int32 numpy arrays of
            identical shape.

        Raises:
            ValueError: If "sentences" is missing from ``inputs``.
        """
        if "sentences" not in inputs:
            # Fixed: message previously referred to a `tokens` field
            # although the check is for `sentences`.
            raise ValueError(
                "inputs must be a parse containing `sentences` field!")

        all_tokens_cnt = sum(
            len(sent["tokens"]) for sent in inputs["sentences"])

        feats = []
        sent_ids_mask = []

        if "sdp" in inputs:
            if self._use_nonexplicit:
                # Two implicit views: index 0 collects relations whose Arg1
                # sentence id is odd, index 1 the even ones (feat_id below).
                for _ in range(2):
                    feats.append(np.zeros(all_tokens_cnt, dtype=np.int32))
                    sent_ids_mask.append(
                        np.zeros(all_tokens_cnt, dtype=np.int32))

                for annotation in inputs["sdp"]:
                    if annotation["Type"] != "Implicit":
                        continue

                    arg1_sent_id = annotation["Arg1"]["Sent"]
                    feat_id = (arg1_sent_id + 1) % 2
                    self._fill_dr_annotation(annotation, feats[feat_id],
                                             sent_ids_mask[feat_id],
                                             arg1_sent_id + 1)

            if self._use_explicit:
                mask_by_sent_ids = get_sent_ids_mask(inputs["sentences"])

                # Maps per-sentence view number -> position in feats.
                actual_exp_view_id = []
                for _ in range(self._max_explicit_views):
                    actual_exp_view_id.append(len(feats))
                    feats.append(np.zeros(all_tokens_cnt, dtype=np.int32))
                    sent_ids_mask.append(
                        np.asarray(mask_by_sent_ids, dtype=np.int32))

                # Each sentence gets at most _max_explicit_views explicit
                # views (one per connective); extra connectives are dropped.
                sentid_to_viewid = {}
                for annotation in inputs["sdp"]:
                    if annotation["Type"] != "Explicit":
                        continue

                    arg1_sent_id = annotation["Arg1"]["Sent"]
                    view_id = sentid_to_viewid.get(arg1_sent_id, -1) + 1
                    sentid_to_viewid[arg1_sent_id] = view_id
                    if view_id > (self._max_explicit_views - 1):
                        continue

                    feat_id = actual_exp_view_id[view_id]
                    self._fill_dr_annotation(annotation, feats[feat_id],
                                             sent_ids_mask[feat_id],
                                             arg1_sent_id + 1)

        # Fall back to a single empty view / all-ones mask when no
        # annotations produced any views.
        if not feats:
            feats = [np.zeros(all_tokens_cnt, dtype=np.int32)]
        if not sent_ids_mask:
            sent_ids_mask = [np.ones(all_tokens_cnt, dtype=np.int32)]

        src_feats = [
            np.expand_dims(x, axis=self._views_axis)
            for x in trim_feats_list(feats, self._max_views)
        ]
        if self._use_features_as_mask:
            # Attention between equal labels: the features double as mask.
            sent_ids_mask = [
                np.expand_dims(x, axis=self._views_axis)
                for x in trim_feats_list(feats, self._max_views)
            ]
        else:
            sent_ids_mask = [
                np.expand_dims(x, axis=self._views_axis)
                for x in trim_feats_list(sent_ids_mask, self._max_views)
            ]

        if len(src_feats) > 1:
            src_feats = np.concatenate(src_feats, axis=self._views_axis)
            sent_ids_mask = np.concatenate(sent_ids_mask,
                                           axis=self._views_axis)
        else:
            src_feats = src_feats[0]
            sent_ids_mask = sent_ids_mask[0]

        if not self._use_mask:
            sent_ids_mask = np.ones_like(src_feats)

        assert src_feats.shape == sent_ids_mask.shape, (
            "src_feats.shape={0} and sent_ids_mask.shape={1} should be the "
            "same.".format(src_feats.shape, sent_ids_mask.shape))

        return src_feats, sent_ids_mask