    def test_get_padding_lengths_correctly_returns_ordered_shape(self):
        shape = [3, 4, 5, 6]
        array = numpy.zeros(shape)
        array_field = ArrayField(array)
        lengths = array_field.get_padding_lengths()
        for i in range(len(lengths)):
            assert lengths["dimension_{}".format(i)] == shape[i]
Example #2
    def text_to_instance(
            self,
            tokens: List[str],
            wl_feats: List[float],
            sl_feats: List[float],
            labels: List[int],
            id: str,
            oov: Dict[str, Dict[str, List[float]]] = None) -> Instance:

        fields = {}

        if oov is None:

            sentence_field = MemoryOptimizedTextField(
                self.token_extender(tokens), self.token_indexers)

        else:

            sentence_field = MemoryOptimizedTextField(
                self.token_extender(self.oov_token_swapper.swap(tokens, oov)),
                self.token_indexers)

        fields["tokens"] = sentence_field

        fields["word_level_features"] = ArrayField(array=np.array(wl_feats))

        fields["sentence_level_features"] = ArrayField(
            array=np.array(sl_feats))

        label_field = ArrayField(array=np.array(labels))
        fields["label"] = label_field

        fields["id"] = MetadataField(id)

        return Instance(fields)
Example #3
    def text_to_instance(self, example) -> Instance:  # type: ignore
        """
        Parameters
        ----------
        vec : ``np.array``, required.
            The text to classify

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence or phrase.
            label : ``LabelField``
                The label label of the sentence or phrase.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        fields['doc'] = ArrayField(example["text"])
        if len(example["entities"]) == 0:
            return
        stacked_entities = np.stack([
            np.asarray(entity["text"].sum(0)).squeeze(0)
            for entity in example["entities"]
        ])
        fields['entities'] = ArrayField(stacked_entities)
        return Instance(fields)
Example #4
    def test_alternative_dtypes(self):
        shape = [3, 4, 5, 6]
        array = numpy.zeros(shape)

        # Setting dtype to numpy.int64 should produce a torch.LongTensor when field is converted to
        # a tensor
        array_field1 = ArrayField(array, dtype=numpy.int64)
        returned_tensor1 = array_field1.as_tensor(
            array_field1.get_padding_lengths())
        assert returned_tensor1.dtype == torch.int64

        # Setting dtype to numpy.uint8 should produce a torch.ByteTensor when field is converted to
        # a tensor
        array_field2 = ArrayField(array, dtype=numpy.uint8)
        returned_tensor2 = array_field2.as_tensor(
            array_field2.get_padding_lengths())
        assert returned_tensor2.dtype == torch.uint8

        # Padding should not affect dtype
        padding_lengths = {
            "dimension_" + str(i): 10
            for i, _ in enumerate(shape)
        }
        padded_tensor = array_field2.as_tensor(padding_lengths)
        assert padded_tensor.dtype == torch.uint8

        # Empty fields should have the same dtype
        empty_field = array_field2.empty_field()
        assert empty_field.dtype == array_field2.dtype
Example #5
def get_token_mapping_field(
        token_to_document_indices: Dict[str, List[int]],
        summary: List[Token]) -> Tuple[ListField, ListField]:
    """
    Creates an ``ArrayField`` that, for each token in the summary, contains
    the list of document indices for which that token appears, plus the
    corresponding mask.

    Parameters
    ----------
    token_to_document_indices:
        The mapping from each token to the list of indices in the document it appears.
    summary:
        The summary tokens.

    Returns
    -------
    ``ArrayField``: (num_summary_tokens, max_num_matches)
        The mapping field.
    ``ArrayField``: (num_summary_tokens, max_num_matches)
        The corresponding mask.
    """
    summary_token_document_indices = []
    mask = []
    for token in summary:
        indices = token_to_document_indices[str(token)]
        summary_token_document_indices.append(ArrayField(np.array(indices)))
        mask.append(ArrayField(np.ones(len(indices))))

    # Convert these into fields
    summary_token_document_indices_field = ListField(
        summary_token_document_indices)
    mask_field = ListField(mask)
    return summary_token_document_indices_field, mask_field
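A minimal usage sketch for the helper above (the inputs are hypothetical; it assumes the same ``Token``, ``ArrayField``, ``ListField``, and ``numpy`` imports as the snippet):

token_to_document_indices = {"the": [0, 7], "cat": [3]}
summary = [Token("the"), Token("cat")]
indices_field, mask_field = get_token_mapping_field(
    token_to_document_indices, summary)
# Each entry of indices_field is an ArrayField over one summary token's
# document indices; when batched, shorter rows are zero-padded and
# mask_field marks the real entries with ones.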
Example #6
    def text_to_instance(self,
                         headline,
                         body,
                         headline_sentiment,
                         body_sentiment,
                         tfidf,
                         stance=None):
        headline_tokens = self._tokenizer.tokenize(headline)
        body_tokens = self._tokenizer.tokenize(body)
        headline_field = TextField(headline_tokens, self._token_indexers)
        body_field = TextField(body_tokens, self._token_indexers)
        headline_sentiment_field = ArrayField(headline_sentiment)
        body_sentiment_field = ArrayField(body_sentiment)
        tfidf_field = ArrayField(tfidf)
        fields = {
            'headline': headline_field,
            'body': body_field,
            'headline_sentiment': headline_sentiment_field,
            'body_sentiment': body_sentiment_field,
            'tfidf': tfidf_field
        }
        if stance is not None:
            fields['stance'] = LabelField(stance)
        metadata = {
            "headline_tokens": [x.text for x in headline_tokens],
            "body_tokens": [x.text for x in body_tokens]
        }
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields)
Example #7
    def prepare_instance(self, article):
        sample = self.prepare_sample(article)

        context = '\n'.join(sample['paragraphs']).strip()

        context_tokens = self.tokenizer.tokenize(context)

        # proper_infos = self._get_context_names(context)

        fields = {
            # 'context': CopyTextField(context_tokens, self.token_indexers, proper_infos, proper_infos, 'context'),
            'context': TextField(context_tokens, self.token_indexers),
            'image': ImageField(sample['image'], self.preprocess),
            'face_embeds': ArrayField(sample['face_embeds'],
                                      padding_value=np.nan),
            'obj_embeds': ArrayField(sample['obj_embeds'],
                                     padding_value=np.nan),
        }

        metadata = {
            'title': sample['title'],
            'start': '\n'.join(sample['start']).strip(),
            'before': '\n'.join(sample['before']).strip(),
            'after': '\n'.join(sample['after']).strip(),
            'image': CenterCrop(224)(Resize(256)(sample['image']))
        }
        fields['metadata'] = MetadataField(metadata)

        return Instance(fields)
Example #8
    def text_to_instance(
        self,  # type: ignore
        image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
        hypothesis: str,
        label: Optional[str] = None,
        *,
        use_cache: bool = True,
    ) -> Instance:

        tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)
        hypothesis_field = TextField(tokenized_hypothesis, None)

        fields: Dict[str, Field] = {"hypothesis": hypothesis_field}

        if image is not None:
            if isinstance(image, str):
                features, coords, _, _ = next(
                    self._process_image_paths([image], use_cache=use_cache)
                )
            else:
                features, coords, _, _ = image

            fields["box_features"] = ArrayField(features)
            fields["box_coordinates"] = ArrayField(coords)
            fields["box_mask"] = ArrayField(
                features.new_ones((features.shape[0],), dtype=torch.bool),
                padding_value=False,
                dtype=torch.bool,
            )

        if label:
            fields["labels"] = LabelField(label)

        return Instance(fields)
Example #9
    def test_get_padding_lengths_correctly_returns_ordered_shape(self):
        shape = [3, 4, 5, 6]
        array = numpy.zeros(shape)
        array_field = ArrayField(array)
        lengths = array_field.get_padding_lengths()
        for i in range(len(lengths)):
            assert lengths["dimension_{}".format(i)] == shape[i]
Example #10
    def text_to_instance(self,
                         game_id,
                         qas,
                         image_id,
                         target_object,
                         dialogue_features,
                         target_attributes=None) -> Instance:
        metadata = {
            "game_id": game_id,
            "image_id": image_id,
            "target_object": target_object,
            "qas": qas
        }

        instance = {
            "metadata": MetadataField(metadata),
            "dialogue_states": ArrayField(dialogue_features)
        }

        if target_attributes is not None:
            instance["target_attributes"] = ArrayField(np.concatenate(target_attributes)) if isinstance(
                target_attributes, tuple) else \
                ArrayField(target_attributes)

        return Instance(instance)
Example #11
    def text_to_instance(self,
                         sentence: str,
                         target: str,
                         polarity_label: str = None) -> Instance:
        example: Example = text_to_example(sentence, target, polarity_label)
        tokens = [Token(token.text) for token in example.spacy_document]
        if len(tokens) == 0:
            print(sentence)
            raise RuntimeError("no sentence")
        text_field = TextField(tokens, self._token_indexers)
        adj_in_field = ArrayField(example.adj_in)
        adj_out_field = ArrayField(example.adj_out)
        transformer_indices = MetadataField(example.transformer_indices)
        span_indices = MetadataField(example.span_indices)

        fields = {
            "tokens": text_field,
            "adj_in": adj_in_field,
            "adj_out": adj_out_field,
            "transformer_indices": transformer_indices,
            "span_indices": span_indices
        }
        if example.polarity_label:
            label_field = LabelField(polarity_label, label_namespace="labels")
            fields["label"] = label_field
        return Instance(fields)
Example #12
    def text_to_instance(self,
                         text: str,
                         one_array: Tuple[np.ndarray, np.ndarray] = None,
                         one_position: Tuple[int, int] = None,
                         many_array: Tuple[np.ndarray, np.ndarray] = None) -> Instance:
        """At training time these inputs are used to train the model; for
        validation a separate validation dataset reader should be written."""
        length = len(text)
        if self.pretrained_tokenizer is not None:
            tokens = get_word_from_pretrained(self.pretrained_tokenizer, text)
        else:
            tokens = [Token(w) for w in text]
        text_field = TextField(tokens, self._token_indexers)

        span = SpanField(one_position[0], one_position[1], text_field)
        dtype: np.dtype = np.dtype(np.float32)
        one_s = ArrayField(one_array[0], dtype=dtype)
        one_e = ArrayField(one_array[1], dtype=dtype)
        many_s = ArrayField(many_array[0], dtype=dtype)
        many_e = ArrayField(many_array[1], dtype=dtype)
        fields = {
            "tokens": text_field,
            "span": span,
            "one_s": one_s,
            "one_e": one_e,
            "many_s": many_s,
            "many_e": many_e,
            # not needed during training; the validation reader needs it, so keep the slot
            "metadata": MetadataField(None)
        }
        return Instance(fields)
Example #13
    def line_to_instance(
            self, query: List[Token], *docs: List[Tuple[List[Token], float,
                                                        int]]) -> Instance:
        query_field = TextField(query, self.q_token_indexers)
        doc_fields = [TextField(doc[0], self.d_token_indexers) for doc in docs]

        fields = {
            'query': query_field,
            'docs': ListField(doc_fields),
        }

        if self.scores:
            lex_fields = [ArrayField(np.array([doc[1]])) for doc in docs]
            fields['scores'] = ListField(lex_fields)

        label_fields = [ArrayField(np.array([doc[2]])) for doc in docs]
        fields['labels'] = ListField(label_fields)

        # used to compute full AQWV and MAP scores from partial data
        # (relevant_ignored and irrelevant_ignored are assumed to be counts
        # kept by the reader)
        relevant_ignored_field = ArrayField(np.array([relevant_ignored]))
        fields['relevant_ignored'] = relevant_ignored_field
        irrelevant_ignored_field = ArrayField(np.array([irrelevant_ignored]))
        fields['irrelevant_ignored'] = irrelevant_ignored_field

        return Instance(fields)
Example #14
    def text_to_instance(
        self,
        prefix: str,
        suffix_a: str,
        suffix_b: str,
    ) -> Instance:

        # HuggingFace's tokenizers require leading whitespace.
        prefix_tokens = self._tokenizer.tokenize(' ' + prefix)
        suffix_a_tokens = self._tokenizer.tokenize(' ' + suffix_a)
        suffix_b_tokens = self._tokenizer.tokenize(' ' + suffix_b)

        tokens_a = prefix_tokens + suffix_a_tokens
        tokens_b = prefix_tokens + suffix_b_tokens

        eval_mask_a = np.array([0] * len(prefix_tokens) + [1] * len(suffix_a_tokens))
        eval_mask_b = np.array([0] * len(prefix_tokens) + [1] * len(suffix_b_tokens))

        metadata = {
            'prefix': [t.text for t in prefix_tokens],
            'suffix_a': [t.text for t in suffix_a_tokens],
            'suffix_b': [t.text for t in suffix_b_tokens],
        }

        fields = {
            'tokens_a': TextField(tokens_a, token_indexers=self._token_indexers),
            'tokens_b': TextField(tokens_b, token_indexers=self._token_indexers),
            'eval_mask_a': ArrayField(eval_mask_a, dtype=bool),
            'eval_mask_b': ArrayField(eval_mask_b, dtype=bool),
            'metadata': MetadataField(metadata),
        }

        return Instance(fields)
Example #15
    def get_common_field(self,
                         context_flat: List[TokenAdd],
                         query: List[TokenAdd],
                         rewrite: Optional[List[TokenAdd]] = None):
        fields: Dict[str, Field] = {}
        # inspect the oov words in the context and query
        # and get the extend ids with oov words
        extend_context_ids, oovs = self.context2ids(context_words=context_flat)
        extend_query_ids, oovs = self.query2ids(query_words=query, oovs=oovs)
        oovs_len = LabelField(label=len(oovs),
                              label_namespace="len_tags",
                              skip_indexing=True)
        context_len_field = LabelField(label=len(context_flat),
                                       label_namespace="len_tags",
                                       skip_indexing=True)
        query_len_field = LabelField(label=len(query),
                                     label_namespace="len_tags",
                                     skip_indexing=True)
        fields['extend_context_ids'] = ArrayField(
            np.array(extend_context_ids, dtype=np.int32))
        fields['extend_query_ids'] = ArrayField(
            np.array(extend_query_ids, dtype=np.int32))
        # preserve the length info in order to get the mask
        fields['oovs_len'] = oovs_len
        fields['context_len'] = context_len_field
        fields['query_len'] = query_len_field
        # preserve the original text
        metadata = {
            "context_words":
            "".join([token.text for token in context_flat[1:-1]]),  # str
            "query_words": [token.text for token in query][:-1],  # List[str]
            "oovs": oovs  # List[str]
        }

        if rewrite is not None:
            rewrite_input_tokens, rewrite_targ_tokens = self.get_dec_inp_targ_seqs(
                rewrite)
            # get the extend rewrite ids
            extend_rewrite_ids = self.rewrite2ids(
                rewrite_words=rewrite_targ_tokens, oovs=oovs)
            rewrite_len_field = LabelField(label=len(rewrite_input_tokens),
                                           label_namespace="len_tags",
                                           skip_indexing=True)

            rewrite_input_tokens_field = TextField(rewrite_input_tokens,
                                                   self._token_indexers)
            rewrite_targ_tokens_field = TextField(rewrite_targ_tokens,
                                                  self._token_indexers)

            fields['rewrite_input_ids'] = rewrite_input_tokens_field
            fields['rewrite_target_ids'] = rewrite_targ_tokens_field
            fields['extend_rewrite_ids'] = ArrayField(
                np.array(extend_rewrite_ids, dtype=np.int32))
            fields['rewrite_len'] = rewrite_len_field
            metadata["rewrite"] = [token.text
                                   for token in rewrite]  # List[str]

        fields['metadata'] = MetadataField(metadata)
        return fields
Example #16
File: wordnet.py Project: zxlzr/kb
    def text_to_instance(self,
                         tokens: List[str],
                         candidate_entities: List[List[str]],
                         candidate_spans: List[List[int]],
                         candidate_entity_prior: List[List[float]],
                         gold_entities: List[str] = None,
                         gold_data_ids: List[str] = None):

        # the prior needs to be 2D and fully padded; before padding it can look
        # like [[0.2, 0.8], [1.0]] if there are two candidates for the first
        # candidate span and one candidate for the second
        max_cands = max(len(p) for p in candidate_entity_prior)
        for p in candidate_entity_prior:
            if len(p) < max_cands:
                p.extend([0.0] * (max_cands - len(p)))
        np_prior = np.array(candidate_entity_prior)

        fields = {
            "tokens":
            TextField([Token(t) for t in tokens],
                      token_indexers=self.token_indexers),

            # join by space, then retokenize in the "character indexer"
            "candidate_entities":
            TextField([
                Token(" ".join(candidate_list))
                for candidate_list in candidate_entities
            ],
                      token_indexers=self.entity_indexer),
            "candidate_entity_prior":
            ArrayField(np.array(np_prior)),
            # only one sentence
            "candidate_segment_ids":
            ArrayField(np.array([0] * len(candidate_entities)), dtype=np.int64)
        }

        if gold_entities is not None:
            fields["gold_entities"] = TextField(
                [Token(entity) for entity in gold_entities],
                token_indexers=self.entity_indexer)
        if gold_data_ids is not None:
            fields["gold_data_ids"] = MetadataField(gold_data_ids)

        span_fields = []
        for span in candidate_spans:
            span_fields.append(SpanField(span[0], span[1], fields['tokens']))
        fields['candidate_spans'] = ListField(span_fields)

        if self.extra_candidate_generators:
            tokens = " ".join(tokens)
            extra_candidates = {
                key: generator.get_mentions_raw_text(tokens,
                                                     whitespace_tokenize=True)
                for key, generator in self.extra_candidate_generators.items()
            }
            fields['extra_candidates'] = MetadataField(extra_candidates)

        return Instance(
            fields, should_remap_span_indices=self.should_remap_span_indices)
Example #17
    def text_to_instance(self,
                         label,
                         response=None,
                         original_post=None,
                         weakpoints=None,
                         op_features=None,
                         response_features=None,
                         op_doc_features=None,
                         response_doc_features=None,
                         goodpoints=None) -> Instance:

        fields: Dict[str, Field] = {}

        if original_post is not None:
            fields['original_post'] = ListField([
                TextField(
                    self._tokenizer.tokenize(s)[:self.max_sentence_len],
                    self._token_indexers)
                for s in original_post[:self.max_post_len]
            ])
            if weakpoints is not None:
                fields['weakpoints'] = ListField([
                    IndexField(wp, fields['original_post'])
                    for wp in weakpoints
                ])

        if response is not None:
            fields['response'] = ListField([
                TextField(
                    self._tokenizer.tokenize(s)[:self.max_sentence_len],
                    self._token_indexers) for s in response[:self.max_post_len]
            ])

            if goodpoints is not None:
                fields['goodpoints'] = ListField(
                    [IndexField(gp, fields['response']) for gp in goodpoints])

        if op_features is not None:
            fields['op_features'] = ListField([
                ArrayField(np.array(f))
                for f in op_features[:self.max_post_len]
            ])

        if response_features is not None:
            fields['response_features'] = ListField([
                ArrayField(np.array(f))
                for f in response_features[:self.max_post_len]
            ])

        if op_doc_features is not None:
            fields['op_doc_features'] = ArrayField(np.array(op_doc_features))

        if response_doc_features is not None:
            fields['response_doc_features'] = ArrayField(
                np.array(response_doc_features))

        fields['label'] = LabelField(label, skip_indexing=True)

        return Instance(fields)
Example #18
    def test_as_tensor_handles_larger_padding_dimensions(self):
        shape = [3, 4]
        array = numpy.ones(shape)
        array_field = ArrayField(array)

        padded_tensor = array_field.as_tensor({"dimension_0": 5, "dimension_1": 6}).detach().cpu().numpy()
        numpy.testing.assert_array_equal(padded_tensor[:3, :4], array)
        numpy.testing.assert_array_equal(padded_tensor[3:, 4:], 0.)
Example #19
    def samples_to_instance(self, sample: SampleT, label: int) -> Instance:
        head = ArrayField(np.array(sample[0], dtype=np.int64), dtype=np.int64)
        relation = ArrayField(np.array(sample[2], dtype=np.int64), dtype=np.int64)
        tail = ArrayField(np.array(sample[1], dtype=np.int64), dtype=np.int64)
        label_f = LabelField(label, skip_indexing=True)
        fields = {'h': head, 't': tail, 'r': relation, 'label': label_f}

        return Instance(fields)
Example #20
    def samples_to_instance(
            self, sample: Tuple[int, List[int], List[int]]) -> Instance:
        node = ArrayField(np.array(sample[0], dtype=np.int64), dtype=np.int64)
        parents = ArrayField(np.array(sample[1], dtype=np.int64), dtype=np.int64)
        children = ArrayField(np.array(sample[2], dtype=np.int64), dtype=np.int64)
        fields = {'node': node, 'gt_parent': parents, 'gt_child': children}

        return Instance(fields)
Example #21
    def test_eq(self):
        array1 = ArrayField(numpy.asarray([1, 1, 1]))
        array2 = ArrayField(numpy.asarray([[1, 1, 1], [1, 1, 1]]))
        array3 = ArrayField(numpy.asarray([1, 1, 2]))
        array4 = ArrayField(numpy.asarray([1, 1, 1]))
        assert array1 != array2
        assert array1 != array3
        assert array1 == array4
Example #22
    def sample_to_instance(self, sample: Tuple[int, int, int,
                                               int]) -> Instance:
        head = ArrayField(np.array(sample[0], dtype=np.int64), dtype=np.int64)
        tail = ArrayField(np.array(sample[1], dtype=np.int64), dtype=np.int64)
        relation = ArrayField(np.array(sample[2], dtype=np.int64), dtype=np.int64)
        label = ArrayField(np.array(sample[3], dtype=np.int64), dtype=np.int64)

        return Instance({'h': head, 't': tail, 'r': relation, 'label': label})
Example #23
    def __init__(self,
                 array: np.ndarray,
                 dtype: np.dtype,
                 sequence_dim: int = 0,
                 padding_value: int = 0) -> None:
        ArrayField.__init__(self, array=array, padding_value=padding_value)
        self._dtype = dtype
        self._sequence_dim = sequence_dim
Example #24
    def test_as_tensor_handles_larger_padding_dimensions(self):
        shape = [3, 4]
        array = numpy.ones(shape)
        array_field = ArrayField(array)

        padded_tensor = array_field.as_tensor({"dimension_0": 5, "dimension_1": 6}).detach().cpu().numpy()
        numpy.testing.assert_array_equal(padded_tensor[:3, :4], array)
        numpy.testing.assert_array_equal(padded_tensor[3:, 4:], 0.)
Example #25
    def text_to_instance(self, graph) -> Instance:
        """
        Does bulk of work converting a graph to an Instance of Fields 
        """
        # pylint: disable=arguments-differ

        fields: Dict[str, Field] = {}

        max_tgt_length = None if self.eval else 60
        d = UDGraph(graph)
        list_data = d.get_list_data(
             bert_tokenizer = self._tokenizer)
        if list_data is None:
            return None

        # These four fields are used for seq2seq model and target side self copy
        fields["source_tokens"] = TextField(
            tokens=[Token(x) for x in list_data["src_tokens"]],
            token_indexers=self._source_token_indexers
        )

        if list_data['src_token_ids'] is not None:
            fields['source_subtoken_ids'] = ArrayField(list_data['src_token_ids'])
            self._number_bert_ids += len(list_data['src_token_ids'])
            self._number_bert_oov_ids += len(
                [bert_id for bert_id in list_data['src_token_ids'] if bert_id == 100])

        if list_data['src_token_subword_index'] is not None:
            fields['source_token_recovery_matrix'] = ArrayField(list_data['src_token_subword_index'])

        fields["source_pos_tags"] = SequenceLabelField(
            labels=list_data["src_pos_tags"],
            sequence_field=fields["source_tokens"],
            label_namespace="pos_tags"
        )
        fields["syn_edge_types"] = TextField(
            tokens=[Token(x) for x in list_data["syn_head_tags"]],
            token_indexers=self._syntax_edge_type_indexers,
        )

        fields["syn_edge_heads"] = SequenceLabelField(
            labels=list_data["syn_head_indices"],
            sequence_field=fields["syn_edge_types"],
            label_namespace="syn_edge_heads"
        )

        fields['syn_edge_head_mask'] = ArrayField(list_data['syn_edge_mask'])
        fields['syn_valid_node_mask'] = ArrayField(list_data['syn_node_mask'])

        fields["syn_node_name_list"] = MetadataField(
                list_data["syn_node_name_list"])

        # Metadata fields, good for debugging
        fields["src_tokens_str"] = MetadataField(
            list_data["src_tokens"]
        )
        
        return Instance(fields)
Example #26
    def _read(self, file_path: str):
        # pylint: disable=logging-fstring-interpolation

        instances: List[Instance] = []
        with open(file_path) as dataset_file:
            dataset = json.load(dataset_file)
        logger.info(f"Reading the dataset from: {file_path}")

        for passage_id, passage_info in dataset.items():
            passage_text = passage_info[constants.tokenized_passage]
            passage_length = len(passage_text.split(' '))

            for question_answer in passage_info[constants.qa_pairs]:
                fields = {}

                answer_passage_spans = question_answer[
                    constants.answer_passage_spans]

                if len(answer_passage_spans) == 0:
                    print("NO PASSAGE SPAN AS ANS")
                    continue

                # TODO(nitish): Only using first span as answer
                answer_span = answer_passage_spans[0]

                start_position = answer_span[0]
                end_position = answer_span[1]

                span_length = end_position - start_position + 1

                attention = [0.0 for _ in range(passage_length)]

                attention[start_position:end_position +
                          1] = [1.0] * span_length

                if self._withnoise:
                    attention = [
                        x + abs(random.gauss(0, 0.001)) for x in attention
                    ]

                if self._normalized:
                    attention_sum = sum(attention)
                    attention = [float(x) / attention_sum for x in attention]

                passage_span_fields = ArrayField(np.array(
                    [[start_position, end_position]]),
                                                 padding_value=-1)

                fields["passage_attention"] = ArrayField(np.array(attention),
                                                         padding_value=0.0)

                fields["passage_lengths"] = MetadataField(passage_length)

                fields["answer_as_passage_spans"] = passage_span_fields

                instances.append(Instance(fields))

        return instances
Example #27
    def text_to_instance(self, sample: list) -> Instance:
        fields = {}

        text: str = sample[0].strip()

        words = self.tokenizer(text)
        if 'max_word_len' in self.configuration:
            words = words[:self.configuration['max_word_len']]
        sample.append(words)

        graph = self._build_graph(text)
        sample.append(graph)

        tokens = [Token(word) for word in words]

        sentence_field = TextField(tokens, self.token_indexers)
        fields['tokens'] = sentence_field

        position = [Token(str(i)) for i in range(len(tokens))]
        position_field = TextField(position, self.position_indexers)
        fields['position'] = position_field

        aspects = [Token(category) for category in self.categories]
        aspect_field = TextField(aspects, self.aspect_indexers)
        fields['aspects'] = aspect_field

        category_labels = [0] * len(self.categories)
        polarity_labels = [-100] * len(self.categories)
        total_labels = []
        if len(sample) > 1:
            labels: list = sample[1]
            for label in labels:
                category_labels[label[0]] = 1
                polarity_labels[label[0]] = label[1]
        for i in range(len(self.categories)):
            if polarity_labels[i] == -100:
                total_labels.append(0)
            else:
                total_labels.append(polarity_labels[i] + category_labels[i])

        label_field = ArrayField(
            np.array(category_labels + polarity_labels + total_labels))
        fields["label"] = label_field
        polarity_mask = [
            1 if polarity_labels[i] != -100 else 0
            for i in range(len(self.categories))
        ]
        polarity_mask_field = ArrayField(np.array(polarity_mask))
        fields['polarity_mask'] = polarity_mask_field

        # stop_word_labels = [1 if word in english_stop_words else 0 for word in words]
        # stop_word_num = sum(stop_word_labels)
        # stop_word_labels = [label / stop_word_num for label in stop_word_labels]
        # sample.append(stop_word_labels)

        sample_field = MetadataField(sample)
        fields["sample"] = sample_field
        return Instance(fields)
Example #28
    def text_to_instance(self,
                         tokenized_text: List[str],
                         candidate_entities: List[List[str]],
                         candidate_spans: List[List[int]],
                         candidate_entity_prior: List[List[float]],
                         gold_entities: List[str] = None,
                         doc_id: str = None):

        #assert doc_id is not None

        token_field = TextField([Token(x) for x in tokenized_text],
                                self.token_indexers)
        span_fields = ListField(
            [SpanField(*span, token_field) for span in candidate_spans])

        candidate_entities = TextField([
            Token(" ".join(candidate_list))
            for candidate_list in candidate_entities
        ],
                                       token_indexers=self.entity_indexer)

        max_cands = max(len(p) for p in candidate_entity_prior)
        for p in candidate_entity_prior:
            if len(p) < max_cands:
                p.extend([0.0] * (max_cands - len(p)))
        np_prior = np.array(candidate_entity_prior)
        prior_field = ArrayField(np_prior)

        # only one segment
        candidate_segment_ids = ArrayField(
            np.array([0] * len(candidate_entities)), dtype=np.int64)

        fields = {
            "tokens": token_field,
            "candidate_spans": span_fields,
            "candidate_entities": candidate_entities,
            "candidate_entity_prior": prior_field,
            "candidate_segment_ids": candidate_segment_ids
        }
        if gold_entities:
            labels = TextField([Token(entity) for entity in gold_entities],
                               token_indexers=self.entity_indexer)
            fields["gold_entities"] = labels

        #fields["doc_id"] = MetadataField(doc_id)

        if self.extra_candidate_generators:
            tokens = " ".join(tokenized_text)
            extra_candidates = {
                key: generator.get_mentions_raw_text(tokens,
                                                     whitespace_tokenize=True)
                for key, generator in self.extra_candidate_generators.items()
            }
            fields['extra_candidates'] = MetadataField(extra_candidates)

        return Instance(
            fields, should_remap_span_indices=self.should_remap_span_indices)
Example #29
    def __init__(self,
                 array: np.ndarray,
                 dtype: np.dtype,
                 sequence_dim: int = 0,
                 padding_value: int = 0) -> None:
        # pylint: disable=super-init-not-called
        ArrayField.__init__(self, array, padding_value=padding_value)
        self._dtype = dtype
        self._sequence_dim = sequence_dim
Example #30
    def article_to_instance(self, paragraphs, relative_score, named_entities,
                            image, caption, image_path, web_url, pos,
                            face_embeds, obj_feats, image_id,
                            article_id) -> Instance:
        context = paragraphs

        # context_tokens = [self._tokenizer.tokenize(p["text"]) for p in paragraphs]
        # context_tokens = [self._tokenizer.tokenize(p["text"]) for p in paragraphs]
        context_tokens = [self._tokenizer.tokenize(c['text']) for c in context]
        caption_tokens = self._tokenizer.tokenize(caption)
        name_token_list = [self._tokenizer.tokenize(n) for n in named_entities]

        if name_token_list:
            name_field = [
                TextField(tokens, self._token_indexers)
                for tokens in name_token_list
            ]
        else:
            stub_field = ListTextField(
                [TextField(caption_tokens, self._token_indexers)])
            name_field = stub_field.empty_field()

        # print([TextField(p, self._token_indexers) for p in context_tokens])
        # print(ListTextField([TextField(p, self._token_indexers) for p in context_tokens]))

        fields = {
            # 'context': TextField(context_tokens, self._token_indexers),
            'context':
            ListTextField(
                [TextField(p, self._token_indexers) for p in context_tokens]),
            # 'context': ListTextField(context_tokens),
            'names':
            ListTextField(name_field),
            'image':
            ImageField(image, self.preprocess),
            'caption':
            TextField(caption_tokens, self._token_indexers),
            'face_embeds':
            ArrayField(face_embeds, padding_value=np.nan),
            'label':
            LabelField(int(relative_score), skip_indexing=True)
        }

        if obj_feats is not None:
            fields['obj_embeds'] = ArrayField(obj_feats, padding_value=np.nan)
        '''metadata = {'context': context,
                    'caption': caption,
                    'names': named_entities,
                    'web_url': web_url,
                    'image_path': image_path,
                    'image_pos': pos,
                    'image_id': image_id,
                    'article_id': article_id}'''
        metadata = {}
        fields['metadata'] = MetadataField(metadata)

        return Instance(fields)
Example #31
    def _read(self, file_path: str) -> Iterator[Instance]:
        """
        This function takes a filename, read the data and produces a stream of Instances
        :param str file_path: the path to the file with the data
        :return:
        """
        # Load the data
        if 'csv' in file_path:
            df = pd.read_csv(file_path)
        else:
            df = joblib.load(file_path)

        # if we run with CV we need the pair ids to use
        if self.pair_ids is not None:
            df = df.loc[df.pair_id.isin(self.pair_ids)]

        # get the reviews and label columns -> no metadata, and metadata columns
        metadata_columns = ['raisha', 'pair_id', 'sample_id']
        rounds = list(range(1, 11))  # rounds 1-10

        for i, row in tqdm.tqdm(df.iterrows()):
            raisha = row.raisha  # raisha is between 0 to 9 (the rounds in the raisha are rounds <= raisha)
            if raisha == 0:
                continue
            saifa_text_list, raisha_text_list = list(), list()
            for round_num in rounds:
                # use only available rounds
                if row[f'features_round_{round_num}'] is not None:
                    if round_num <= raisha:  # rounds in raisha
                        extra_columns = [-1] * (self.input_dim - len(
                            row[f'features_round_{round_num}']))
                        raisha_data = row[
                            f'features_round_{round_num}'] + extra_columns
                        raisha_text_list.append(
                            ArrayField(np.array(raisha_data),
                                       padding_value=-1))
                    else:  # rounds in saifa
                        if self.only_raisha and round_num == raisha + 1:
                            saifa_data = [
                                100
                            ] * self.input_dim  # special vector to indicate the start of the saifa
                        else:
                            extra_columns = [-1] * (self.input_dim - len(
                                row[f'features_round_{round_num}']))
                            saifa_data = row[
                                f'features_round_{round_num}'] + extra_columns
                        saifa_text_list.append(
                            ArrayField(np.array(saifa_data), padding_value=-1))
            labels = row[self._label_column]
            metadata_dict = {
                column: row[column]
                for column in metadata_columns
            }
            yield self.text_to_instance(saifa_text_list=saifa_text_list,
                                        raisha_text_list=raisha_text_list,
                                        labels=labels,
                                        metadata=metadata_dict)
Example #32
    def article_to_bm_instance(self, paragraph, paragraph_score,
                               named_entities, image, caption, image_path,
                               web_url, pos, face_embeds, obj_feats,
                               image_id) -> Instance:
        # context = ' BLABLA '.join([p["text"] for p in paragraphs]).strip()
        context = paragraph

        # context_tokens = [self._tokenizer.tokenize(p["text"]) for p in paragraphs]
        # context_tokens = [self._tokenizer.tokenize(p["text"]) for p in paragraphs]
        context_tokens = self._tokenizer.tokenize(context)
        caption_tokens = self._tokenizer.tokenize(caption)
        name_token_list = [
            self._tokenizer.tokenize(n["text"]) for n in named_entities
        ]

        if name_token_list:
            name_field = [
                TextField(tokens, self._token_indexers)
                for tokens in name_token_list
            ]
        else:
            stub_field = ListTextField(
                [TextField(caption_tokens, self._token_indexers)])
            name_field = stub_field.empty_field()

        context = TextField(context_tokens, self._token_indexers)
        context.index(self.model.vocab)
        context = context.as_tensor(context.get_padding_lengths())
        fields = {
            'context': context,
            # 'context': ListTextField([TextField(p, self._token_indexers) for p in context_tokens]),
            # 'context': ListTextField(context_tokens),
            'names': ListTextField(name_field),
            'image': ImageField(image, self.preprocess),
            'caption': TextField(caption_tokens, self._token_indexers),
            'face_embeds': ArrayField(face_embeds, padding_value=np.nan),
            'label': ArrayField(np.array([paragraph_score]))
            # 'labels': ArrayField(paragraphs_score)
        }

        if obj_feats is not None:
            fields['obj_embeds'] = ArrayField(obj_feats, padding_value=np.nan)

        metadata = {
            'context': context,
            'caption': caption,
            'names': named_entities,
            'web_url': web_url,
            'image_path': image_path,
            'image_pos': pos,
            'image_id': image_id,
            'label': paragraph_score
        }
        fields['metadata'] = MetadataField(metadata)

        return Instance(fields)
Example #33
    def test_padding_handles_list_fields_with_padding_values(self):
        array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1)
        array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1)
        empty_array = array1.empty_field()
        list_field = ListField([array1, array2, empty_array])

        returned_tensor = list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy()
        correct_tensor = numpy.array([[[1., 1., 1., -1., -1.],
                                       [1., 1., 1., -1., -1.]],
                                      [[1., 1., 1., 1., 1.],
                                       [-1., -1., -1., -1., -1.]],
                                      [[-1., -1., -1., -1., -1.],
                                       [-1., -1., -1., -1., -1.]]])
        numpy.testing.assert_array_equal(returned_tensor, correct_tensor)
Example #34
    def test_as_tensor_works_with_scalar(self):
        array = ArrayField(numpy.asarray(42))
        returned_tensor = array.as_tensor(array.get_padding_lengths())
        current_tensor = numpy.asarray(42)
        numpy.testing.assert_array_equal(returned_tensor, current_tensor)