def test_get_padding_lengths_correctly_returns_ordered_shape(self):
    shape = [3, 4, 5, 6]
    array = numpy.zeros(shape)
    array_field = ArrayField(array)
    lengths = array_field.get_padding_lengths()
    for i in range(len(lengths)):
        assert lengths["dimension_{}".format(i)] == shape[i]
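# A minimal companion sketch (assuming the allennlp-style ArrayField API that
# the test above exercises): padding lengths are keyed "dimension_{i}" per
# axis, and raising one of them before as_tensor() pads that axis out with
# the field's padding_value.
field = ArrayField(numpy.ones([2, 3]), padding_value=-1)
lengths = field.get_padding_lengths()  # {"dimension_0": 2, "dimension_1": 3}
lengths["dimension_1"] = 5             # request two extra columns
padded = field.as_tensor(lengths)      # shape (2, 5); the new cells are -1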
def text_to_instance(
        self,
        tokens: List[str],
        wl_feats: List[float],
        sl_feats: List[float],
        labels: List[int],
        id: str,
        oov: Dict[str, Dict[str, List[float]]] = None) -> Instance:
    fields = {}
    if oov is None:
        sentence_field = MemoryOptimizedTextField(
            self.token_extender(tokens), self.token_indexers)
    else:
        sentence_field = MemoryOptimizedTextField(
            self.token_extender(self.oov_token_swapper.swap(tokens, oov)),
            self.token_indexers)
    fields["tokens"] = sentence_field
    fields["word_level_features"] = ArrayField(array=np.array(wl_feats))
    fields["sentence_level_features"] = ArrayField(array=np.array(sl_feats))
    label_field = ArrayField(array=np.array(labels))
    fields["label"] = label_field
    fields["id"] = MetadataField(id)
    return Instance(fields)
def text_to_instance(self, example) -> Instance:  # type: ignore
    """
    Parameters
    ----------
    example : ``Dict``, required.
        The example to classify, with a "text" matrix and a list of
        "entities", each carrying its own "text" matrix.

    Returns
    -------
    An ``Instance`` containing the following fields:
        doc : ``ArrayField``
            The document text array.
        entities : ``ArrayField``
            The stacked entity arrays. ``None`` is returned instead of an
            ``Instance`` when the example has no entities.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    fields['doc'] = ArrayField(example["text"])
    if len(example["entities"]) == 0:
        return None
    stacked_entities = np.stack([
        np.asarray(entity["text"].sum(0)).squeeze(0)
        for entity in example["entities"]
    ])
    fields['entities'] = ArrayField(stacked_entities)
    return Instance(fields)
def test_alternative_dtypes(self):
    shape = [3, 4, 5, 6]
    array = numpy.zeros(shape)

    # Setting dtype to numpy.int64 should produce a torch.LongTensor when
    # the field is converted to a tensor.
    array_field1 = ArrayField(array, dtype=numpy.int64)
    returned_tensor1 = array_field1.as_tensor(array_field1.get_padding_lengths())
    assert returned_tensor1.dtype == torch.int64

    # Setting dtype to numpy.uint8 should produce a torch.ByteTensor when
    # the field is converted to a tensor.
    array_field2 = ArrayField(array, dtype=numpy.uint8)
    returned_tensor2 = array_field2.as_tensor(array_field2.get_padding_lengths())
    assert returned_tensor2.dtype == torch.uint8

    # Padding should not affect dtype.
    padding_lengths = {"dimension_" + str(i): 10 for i, _ in enumerate(shape)}
    padded_tensor = array_field2.as_tensor(padding_lengths)
    assert padded_tensor.dtype == torch.uint8

    # Empty fields should have the same dtype.
    empty_field = array_field2.empty_field()
    assert empty_field.dtype == array_field2.dtype
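# Hedged companion note to the dtype test above: in the allennlp versions
# these tests target, ArrayField defaults to dtype=numpy.float32, so even an
# integer input array comes back as a float tensor unless a dtype is passed
# explicitly; that is why the id-style snippets below pass one.
int_field = ArrayField(numpy.zeros([3], dtype=numpy.int64))
tensor = int_field.as_tensor(int_field.get_padding_lengths())
assert tensor.dtype == torch.float32  # the field's default dtype wins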
def get_token_mapping_field(
        token_to_document_indices: Dict[str, List[int]],
        summary: List[Token]) -> Tuple[ListField, ListField]:
    """
    Creates a ``ListField`` of ``ArrayField`` s that, for each token in the
    summary, contains the list of document indices at which that token
    appears, plus the corresponding mask.

    Parameters
    ----------
    token_to_document_indices:
        The mapping from each token to the list of indices in the document
        where it appears.
    summary:
        The summary tokens.

    Returns
    -------
    ``ListField``: (num_summary_tokens, max_num_matches)
        The mapping field.
    ``ListField``: (num_summary_tokens, max_num_matches)
        The corresponding mask.
    """
    summary_token_document_indices = []
    mask = []
    for token in summary:
        indices = token_to_document_indices[str(token)]
        summary_token_document_indices.append(ArrayField(np.array(indices)))
        mask.append(ArrayField(np.ones(len(indices))))

    # Convert these into fields
    summary_token_document_indices_field = ListField(summary_token_document_indices)
    mask_field = ListField(mask)
    return summary_token_document_indices_field, mask_field
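# A small usage sketch of the pair of fields returned above (the token index
# lists are hypothetical): ragged per-token index lists pad out together
# inside the ListField, and the mask records which entries are real.
indices = ListField([ArrayField(np.array([0, 4])),  # token matched twice
                     ArrayField(np.array([2]))])    # token matched once
mask = ListField([ArrayField(np.ones(2)), ArrayField(np.ones(1))])
indices.as_tensor(indices.get_padding_lengths())  # [[0., 4.], [2., 0.]]
mask.as_tensor(mask.get_padding_lengths())        # [[1., 1.], [1., 0.]]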
def text_to_instance(self, headline, body, headline_sentiment, body_sentiment,
                     tfidf, stance=None):
    headline_tokens = self._tokenizer.tokenize(headline)
    body_tokens = self._tokenizer.tokenize(body)
    headline_field = TextField(headline_tokens, self._token_indexers)
    body_field = TextField(body_tokens, self._token_indexers)
    headline_sentiment_field = ArrayField(headline_sentiment)
    body_sentiment_field = ArrayField(body_sentiment)
    tfidf_field = ArrayField(tfidf)
    fields = {
        'headline': headline_field,
        'body': body_field,
        'headline_sentiment': headline_sentiment_field,
        'body_sentiment': body_sentiment_field,
        'tfidf': tfidf_field
    }
    if stance is not None:
        fields['stance'] = LabelField(stance)
    metadata = {
        "headline_tokens": [x.text for x in headline_tokens],
        "body_tokens": [x.text for x in body_tokens]
    }
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def prepare_instance(self, article):
    sample = self.prepare_sample(article)
    context = '\n'.join(sample['paragraphs']).strip()
    context_tokens = self.tokenizer.tokenize(context)
    # proper_infos = self._get_context_names(context)

    fields = {
        # 'context': CopyTextField(context_tokens, self.token_indexers, proper_infos, proper_infos, 'context'),
        'context': TextField(context_tokens, self.token_indexers),
        'image': ImageField(sample['image'], self.preprocess),
        'face_embeds': ArrayField(sample['face_embeds'], padding_value=np.nan),
        'obj_embeds': ArrayField(sample['obj_embeds'], padding_value=np.nan),
    }

    metadata = {
        'title': sample['title'],
        'start': '\n'.join(sample['start']).strip(),
        'before': '\n'.join(sample['before']).strip(),
        'after': '\n'.join(sample['after']).strip(),
        'image': CenterCrop(224)(Resize(256)(sample['image']))
    }
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(
    self,  # type: ignore
    image: Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]],
    hypothesis: str,
    label: Optional[str] = None,
    *,
    use_cache: bool = True,
) -> Instance:
    tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)
    hypothesis_field = TextField(tokenized_hypothesis, None)
    fields: Dict[str, Field] = {"hypothesis": hypothesis_field}
    if image is not None:
        if isinstance(image, str):
            features, coords, _, _ = next(
                self._process_image_paths([image], use_cache=use_cache))
        else:
            features, coords, _, _ = image
        fields["box_features"] = ArrayField(features)
        fields["box_coordinates"] = ArrayField(coords)
        fields["box_mask"] = ArrayField(
            features.new_ones((features.shape[0],), dtype=torch.bool),
            padding_value=False,
            dtype=torch.bool,
        )
    if label:
        fields["labels"] = LabelField(label)
    return Instance(fields)
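# Hedged note on the box_mask construction above: the mask is created as a
# torch.bool tensor directly from the features (features.new_ones), so it
# lives on the same device and has one entry per box; padding_value=False
# then marks padded-in boxes as absent when instances are batched. A tiny
# standalone illustration (shapes hypothetical):
features = torch.zeros(2, 4)  # two boxes with 4-dim features
box_mask = features.new_ones((features.shape[0],), dtype=torch.bool)
# box_mask is tensor([True, True]); batch padding would extend it with False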
def text_to_instance(self,
                     game_id,
                     qas,
                     image_id,
                     target_object,
                     dialogue_features,
                     target_attributes=None) -> Instance:
    metadata = {
        "game_id": game_id,
        "image_id": image_id,
        "target_object": target_object,
        "qas": qas
    }
    instance = {
        "metadata": MetadataField(metadata),
        "dialogue_states": ArrayField(dialogue_features)
    }
    if target_attributes is not None:
        instance["target_attributes"] = (
            ArrayField(np.concatenate(target_attributes))
            if isinstance(target_attributes, tuple)
            else ArrayField(target_attributes))
    return Instance(instance)
def text_to_instance(self,
                     sentence: str,
                     target: str,
                     polarity_label: str = None) -> Instance:
    example: Example = text_to_example(sentence, target, polarity_label)
    tokens = [Token(token.text) for token in example.spacy_document]
    if len(tokens) == 0:
        print(sentence)
        raise RuntimeError("no sentence")
    text_field = TextField(tokens, self._token_indexers)
    adj_in_field = ArrayField(example.adj_in)
    adj_out_field = ArrayField(example.adj_out)
    transformer_indices = MetadataField(example.transformer_indices)
    span_indices = MetadataField(example.span_indices)
    fields = {
        "tokens": text_field,
        "adj_in": adj_in_field,
        "adj_out": adj_out_field,
        "transformer_indices": transformer_indices,
        "span_indices": span_indices
    }
    if example.polarity_label:
        label_field = LabelField(polarity_label, label_namespace="labels")
        fields["label"] = label_field
    return Instance(fields)
def text_to_instance(self,
                     text: str,
                     one_array: Tuple[np.array, np.array] = None,
                     one_position: Tuple[int, int] = None,
                     many_array: Tuple[np.array, np.array] = None) -> Instance:
    """Inputs used to train the model. For validation, a separate
    validation dataset reader should be written."""
    length = len(text)
    if self.pretrained_tokenizer is not None:
        tokens = get_word_from_pretrained(self.pretrained_tokenizer, text)
    else:
        tokens = [Token(w) for w in text]
    text_field = TextField(tokens, self._token_indexers)
    span = SpanField(one_position[0], one_position[1], text_field)
    dtype: np.dtype = np.dtype(np.float32)
    one_s = ArrayField(one_array[0], dtype=dtype)
    one_e = ArrayField(one_array[1], dtype=dtype)
    many_s = ArrayField(many_array[0], dtype=dtype)
    many_e = ArrayField(many_array[1], dtype=dtype)
    fields = {
        "tokens": text_field,
        "span": span,
        "one_s": one_s,
        "one_e": one_e,
        "many_s": many_s,
        "many_e": many_e,
        # Not needed during training; the validation set needs it, so this
        # slot is kept as a placeholder.
        "metadata": MetadataField(None)
    }
    return Instance(fields)
def line_to_instance(
        self, query: List[Token],
        *docs: Tuple[List[Token], float, int],
        relevant_ignored: int = 0,
        irrelevant_ignored: int = 0) -> Instance:
    query_field = TextField(query, self.q_token_indexers)
    doc_fields = [TextField(doc[0], self.d_token_indexers) for doc in docs]
    fields = {
        'query': query_field,
        'docs': ListField(doc_fields),
    }
    if self.scores:
        lex_fields = [ArrayField(np.array([doc[1]])) for doc in docs]
        fields['scores'] = ListField(lex_fields)
    label_fields = [ArrayField(np.array([doc[2]])) for doc in docs]
    fields['labels'] = ListField(label_fields)
    # used to compute full AQWV and MAP scores from partial data
    relevant_ignored_field = ArrayField(np.array([relevant_ignored]))
    fields['relevant_ignored'] = relevant_ignored_field
    irrelevant_ignored_field = ArrayField(np.array([irrelevant_ignored]))
    fields['irrelevant_ignored'] = irrelevant_ignored_field
    return Instance(fields)
def text_to_instance(
    self,
    prefix: str,
    suffix_a: str,
    suffix_b: str,
) -> Instance:
    # HuggingFace's tokenizers require leading whitespace.
    prefix_tokens = self._tokenizer.tokenize(' ' + prefix)
    suffix_a_tokens = self._tokenizer.tokenize(' ' + suffix_a)
    suffix_b_tokens = self._tokenizer.tokenize(' ' + suffix_b)
    tokens_a = prefix_tokens + suffix_a_tokens
    tokens_b = prefix_tokens + suffix_b_tokens
    eval_mask_a = np.array([0] * len(prefix_tokens) + [1] * len(suffix_a_tokens))
    eval_mask_b = np.array([0] * len(prefix_tokens) + [1] * len(suffix_b_tokens))
    metadata = {
        'prefix': [t.text for t in prefix_tokens],
        'suffix_a': [t.text for t in suffix_a_tokens],
        'suffix_b': [t.text for t in suffix_b_tokens],
    }
    fields = {
        'tokens_a': TextField(tokens_a, token_indexers=self._token_indexers),
        'tokens_b': TextField(tokens_b, token_indexers=self._token_indexers),
        'eval_mask_a': ArrayField(eval_mask_a, dtype=bool),
        'eval_mask_b': ArrayField(eval_mask_b, dtype=bool),
        'metadata': MetadataField(metadata),
    }
    return Instance(fields)
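# Worked example of the eval masks built above (token counts hypothetical):
# with a 3-token prefix and a 2-token suffix, only the suffix positions are
# flagged for evaluation.
prefix_len, suffix_len = 3, 2
eval_mask = np.array([0] * prefix_len + [1] * suffix_len)  # [0 0 0 1 1]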
def get_common_field(self,
                     context_flat: List[TokenAdd],
                     query: List[TokenAdd],
                     rewrite: Optional[List[TokenAdd]] = None):
    fields: Dict[str, Field] = {}
    # inspect the oov words in the context and query
    # and get the extend ids with oov words
    extend_context_ids, oovs = self.context2ids(context_words=context_flat)
    extend_query_ids, oovs = self.query2ids(query_words=query, oovs=oovs)
    oovs_len = LabelField(label=len(oovs),
                          label_namespace="len_tags",
                          skip_indexing=True)
    context_len_field = LabelField(label=len(context_flat),
                                   label_namespace="len_tags",
                                   skip_indexing=True)
    query_len_field = LabelField(label=len(query),
                                 label_namespace="len_tags",
                                 skip_indexing=True)
    fields['extend_context_ids'] = ArrayField(
        np.array(extend_context_ids, dtype=np.int32))
    fields['extend_query_ids'] = ArrayField(
        np.array(extend_query_ids, dtype=np.int32))
    # preserve the length info in order to get the mask
    fields['oovs_len'] = oovs_len
    fields['context_len'] = context_len_field
    fields['query_len'] = query_len_field
    # preserve the original text
    metadata = {
        "context_words": "".join(
            [token.text for token in context_flat[1:-1]]),  # str
        "query_words": [token.text for token in query][:-1],  # List[str]
        "oovs": oovs  # List[str]
    }
    if rewrite is not None:
        rewrite_input_tokens, rewrite_targ_tokens = self.get_dec_inp_targ_seqs(rewrite)
        # get the extend rewrite ids
        extend_rewrite_ids = self.rewrite2ids(
            rewrite_words=rewrite_targ_tokens, oovs=oovs)
        rewrite_len_field = LabelField(label=len(rewrite_input_tokens),
                                       label_namespace="len_tags",
                                       skip_indexing=True)
        rewrite_input_tokens_field = TextField(rewrite_input_tokens,
                                               self._token_indexers)
        rewrite_targ_tokens_field = TextField(rewrite_targ_tokens,
                                              self._token_indexers)
        fields['rewrite_input_ids'] = rewrite_input_tokens_field
        fields['rewrite_target_ids'] = rewrite_targ_tokens_field
        fields['extend_rewrite_ids'] = ArrayField(
            np.array(extend_rewrite_ids, dtype=np.int32))
        fields['rewrite_len'] = rewrite_len_field
        metadata["rewrite"] = [token.text for token in rewrite]  # List[str]
    fields['metadata'] = MetadataField(metadata)
    return fields
def text_to_instance(self,
                     tokens: List[str],
                     candidate_entities: List[List[str]],
                     candidate_spans: List[List[int]],
                     candidate_entity_prior: List[List[float]],
                     gold_entities: List[str] = None,
                     gold_data_ids: List[str] = None):
    # The prior needs to be 2D and full. It can look like [[0.2, 0.8], [1.0]]
    # if there is one candidate for the second candidate span and two
    # candidates for the first, so pad every row out to the longest one.
    max_cands = max(len(p) for p in candidate_entity_prior)
    for p in candidate_entity_prior:
        if len(p) < max_cands:
            p.extend([0.0] * (max_cands - len(p)))
    np_prior = np.array(candidate_entity_prior)

    fields = {
        "tokens": TextField([Token(t) for t in tokens],
                            token_indexers=self.token_indexers),
        # join by space, then retokenize in the "character indexer"
        "candidate_entities": TextField([
            Token(" ".join(candidate_list))
            for candidate_list in candidate_entities
        ], token_indexers=self.entity_indexer),
        "candidate_entity_prior": ArrayField(np_prior),
        # only one sentence
        "candidate_segment_ids": ArrayField(
            np.array([0] * len(candidate_entities)), dtype=np.int64)
    }

    if gold_entities is not None:
        fields["gold_entities"] = TextField(
            [Token(entity) for entity in gold_entities],
            token_indexers=self.entity_indexer)
    if gold_data_ids is not None:
        fields["gold_data_ids"] = MetadataField(gold_data_ids)

    span_fields = []
    for span in candidate_spans:
        span_fields.append(SpanField(span[0], span[1], fields['tokens']))
    fields['candidate_spans'] = ListField(span_fields)

    if self.extra_candidate_generators:
        tokens = " ".join(tokens)
        extra_candidates = {
            key: generator.get_mentions_raw_text(tokens, whitespace_tokenize=True)
            for key, generator in self.extra_candidate_generators.items()
        }
        fields['extra_candidates'] = MetadataField(extra_candidates)

    return Instance(fields,
                    should_remap_span_indices=self.should_remap_span_indices)
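# Worked example of the prior padding above: a ragged prior such as
# [[0.2, 0.8], [1.0]] is right-padded with zeros into a rectangular list
# before being wrapped in a single 2D ArrayField.
prior = [[0.2, 0.8], [1.0]]
max_cands = max(len(p) for p in prior)
for p in prior:
    p.extend([0.0] * (max_cands - len(p)))
# prior is now [[0.2, 0.8], [1.0, 0.0]]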
def text_to_instance(self,
                     label,
                     response=None,
                     original_post=None,
                     weakpoints=None,
                     op_features=None,
                     response_features=None,
                     op_doc_features=None,
                     response_doc_features=None,
                     goodpoints=None) -> Instance:
    fields: Dict[str, Field] = {}
    if original_post is not None:
        fields['original_post'] = ListField([
            TextField(self._tokenizer.tokenize(s)[:self.max_sentence_len],
                      self._token_indexers)
            for s in original_post[:self.max_post_len]
        ])
        if weakpoints is not None:
            fields['weakpoints'] = ListField(
                [IndexField(wp, fields['original_post']) for wp in weakpoints])
    if response is not None:
        fields['response'] = ListField([
            TextField(self._tokenizer.tokenize(s)[:self.max_sentence_len],
                      self._token_indexers)
            for s in response[:self.max_post_len]
        ])
        if goodpoints is not None:
            fields['goodpoints'] = ListField(
                [IndexField(gp, fields['response']) for gp in goodpoints])
    if op_features is not None:
        fields['op_features'] = ListField(
            [ArrayField(np.array(f)) for f in op_features[:self.max_post_len]])
    if response_features is not None:
        fields['response_features'] = ListField([
            ArrayField(np.array(f))
            for f in response_features[:self.max_post_len]
        ])
    if op_doc_features is not None:
        fields['op_doc_features'] = ArrayField(np.array(op_doc_features))
    if response_doc_features is not None:
        fields['response_doc_features'] = ArrayField(
            np.array(response_doc_features))
    fields['label'] = LabelField(label, skip_indexing=True)
    return Instance(fields)
def test_as_tensor_handles_larger_padding_dimensions(self):
    shape = [3, 4]
    array = numpy.ones(shape)
    array_field = ArrayField(array)

    padded_tensor = array_field.as_tensor({
        "dimension_0": 5,
        "dimension_1": 6
    }).detach().cpu().numpy()
    numpy.testing.assert_array_equal(padded_tensor[:3, :4], array)
    numpy.testing.assert_array_equal(padded_tensor[3:, 4:], 0.)
def samples_to_instance(self, sample: SampleT, label: int) -> Instance:
    head = ArrayField(np.array(sample[0], dtype=np.int64), dtype=np.int64)
    relation = ArrayField(np.array(sample[2], dtype=np.int64), dtype=np.int64)
    tail = ArrayField(np.array(sample[1], dtype=np.int64), dtype=np.int64)
    label_f = LabelField(label, skip_indexing=True)
    fields = {'h': head, 't': tail, 'r': relation, 'label': label_f}
    return Instance(fields)
def samples_to_instance(
        self, sample: Tuple[int, List[int], List[int]]) -> Instance:
    node = ArrayField(np.array(sample[0], dtype=np.int64), dtype=np.int64)
    parents = ArrayField(np.array(sample[1], dtype=np.int64), dtype=np.int64)
    children = ArrayField(np.array(sample[2], dtype=np.int64), dtype=np.int64)
    fields = {'node': node, 'gt_parent': parents, 'gt_child': children}
    return Instance(fields)
def test_eq(self):
    array1 = ArrayField(numpy.asarray([1, 1, 1]))
    array2 = ArrayField(numpy.asarray([[1, 1, 1], [1, 1, 1]]))
    array3 = ArrayField(numpy.asarray([1, 1, 2]))
    array4 = ArrayField(numpy.asarray([1, 1, 1]))
    assert array1 != array2
    assert array1 != array3
    assert array1 == array4
def sample_to_instance(self, sample: Tuple[int, int, int, int]) -> Instance:
    head = ArrayField(np.array(sample[0], dtype=np.int64), dtype=np.int64)
    tail = ArrayField(np.array(sample[1], dtype=np.int64), dtype=np.int64)
    relation = ArrayField(np.array(sample[2], dtype=np.int64), dtype=np.int64)
    label = ArrayField(np.array(sample[3], dtype=np.int64), dtype=np.int64)
    return Instance({'h': head, 't': tail, 'r': relation, 'label': label})
def __init__(self,
             array: np.ndarray,
             dtype: np.dtype,
             sequence_dim: int = 0,
             padding_value: int = 0) -> None:
    ArrayField.__init__(self, array=array, padding_value=padding_value)
    self._dtype = dtype
    self._sequence_dim = sequence_dim
def text_to_instance(self, graph) -> Instance:
    """
    Does the bulk of the work converting a graph to an Instance of Fields.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    max_tgt_length = None if self.eval else 60
    d = UDGraph(graph)
    list_data = d.get_list_data(bert_tokenizer=self._tokenizer)
    if list_data is None:
        return None

    # These four fields are used for the seq2seq model and target-side self copy.
    fields["source_tokens"] = TextField(
        tokens=[Token(x) for x in list_data["src_tokens"]],
        token_indexers=self._source_token_indexers)
    if list_data['src_token_ids'] is not None:
        fields['source_subtoken_ids'] = ArrayField(list_data['src_token_ids'])
        self._number_bert_ids += len(list_data['src_token_ids'])
        self._number_bert_oov_ids += len([
            bert_id for bert_id in list_data['src_token_ids'] if bert_id == 100
        ])
    if list_data['src_token_subword_index'] is not None:
        fields['source_token_recovery_matrix'] = ArrayField(
            list_data['src_token_subword_index'])
    fields["source_pos_tags"] = SequenceLabelField(
        labels=list_data["src_pos_tags"],
        sequence_field=fields["source_tokens"],
        label_namespace="pos_tags")
    fields["syn_edge_types"] = TextField(
        tokens=[Token(x) for x in list_data["syn_head_tags"]],
        token_indexers=self._syntax_edge_type_indexers)
    fields["syn_edge_heads"] = SequenceLabelField(
        labels=list_data["syn_head_indices"],
        sequence_field=fields["syn_edge_types"],
        label_namespace="syn_edge_heads")
    fields['syn_edge_head_mask'] = ArrayField(list_data['syn_edge_mask'])
    fields['syn_valid_node_mask'] = ArrayField(list_data['syn_node_mask'])
    fields["syn_node_name_list"] = MetadataField(list_data["syn_node_name_list"])

    # Metadata fields, good for debugging
    fields["src_tokens_str"] = MetadataField(list_data["src_tokens"])
    return Instance(fields)
def _read(self, file_path: str):
    # pylint: disable=logging-fstring-interpolation
    instances: List[Instance] = []
    with open(file_path) as dataset_file:
        dataset = json.load(dataset_file)
    logger.info(f"Reading the dataset from: {file_path}")
    for passage_id, passage_info in dataset.items():
        passage_text = passage_info[constants.tokenized_passage]
        passage_length = len(passage_text.split(' '))
        for question_answer in passage_info[constants.qa_pairs]:
            fields = {}
            answer_passage_spans = question_answer[constants.answer_passage_spans]
            if len(answer_passage_spans) == 0:
                print("NO PASSAGE SPAN AS ANS")
                continue

            # TODO(nitish): Only using first span as answer
            answer_span = answer_passage_spans[0]
            start_position = answer_span[0]
            end_position = answer_span[1]
            span_length = end_position - start_position + 1

            attention = [0.0 for _ in range(passage_length)]
            attention[start_position:end_position + 1] = [1.0] * span_length
            if self._withnoise:
                attention = [x + abs(random.gauss(0, 0.001)) for x in attention]
            if self._normalized:
                attention_sum = sum(attention)
                attention = [float(x) / attention_sum for x in attention]

            passage_span_fields = ArrayField(
                np.array([[start_position, end_position]]), padding_value=-1)

            fields["passage_attention"] = ArrayField(
                np.array(attention), padding_value=0.0)
            fields["passage_lengths"] = MetadataField(passage_length)
            fields["answer_as_passage_spans"] = passage_span_fields
            instances.append(Instance(fields))
    return instances
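# Hedged numeric sketch of the attention construction above: a gold span
# [2, 4] in a 6-token passage becomes a 0/1 indicator, which the optional
# normalization rescales to sum to one.
attention = [0.0] * 6
attention[2:5] = [1.0] * 3  # span [2, 4], inclusive
attention_sum = sum(attention)
attention = [float(x) / attention_sum for x in attention]
# attention is now [0, 0, 1/3, 1/3, 1/3, 0]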
def text_to_instance(self, sample: list) -> Instance:
    fields = {}
    text: str = sample[0].strip()
    words = self.tokenizer(text)
    if 'max_word_len' in self.configuration:
        words = words[:self.configuration['max_word_len']]
    sample.append(words)
    graph = self._build_graph(text)
    sample.append(graph)

    tokens = [Token(word) for word in words]
    sentence_field = TextField(tokens, self.token_indexers)
    fields['tokens'] = sentence_field

    position = [Token(str(i)) for i in range(len(tokens))]
    position_field = TextField(position, self.position_indexers)
    fields['position'] = position_field

    aspects = [Token(category) for category in self.categories]
    aspect_field = TextField(aspects, self.aspect_indexers)
    fields['aspects'] = aspect_field

    category_labels = [0] * len(self.categories)
    polarity_labels = [-100] * len(self.categories)
    total_labels = []
    if len(sample) > 1:
        labels: list = sample[1]
        for label in labels:
            category_labels[label[0]] = 1
            polarity_labels[label[0]] = label[1]
    for i in range(len(self.categories)):
        if polarity_labels[i] == -100:
            total_labels.append(0)
        else:
            total_labels.append(polarity_labels[i] + category_labels[i])
    label_field = ArrayField(
        np.array(category_labels + polarity_labels + total_labels))
    fields["label"] = label_field

    polarity_mask = [
        1 if polarity_labels[i] != -100 else 0
        for i in range(len(self.categories))
    ]
    polarity_mask_field = ArrayField(np.array(polarity_mask))
    fields['polarity_mask'] = polarity_mask_field

    # stop_word_labels = [1 if word in english_stop_words else 0 for word in words]
    # stop_word_num = sum(stop_word_labels)
    # stop_word_labels = [label / stop_word_num for label in stop_word_labels]
    # sample.append(stop_word_labels)

    sample_field = MetadataField(sample)
    fields["sample"] = sample_field
    return Instance(fields)
def text_to_instance(self,
                     tokenized_text: List[str],
                     candidate_entities: List[List[str]],
                     candidate_spans: List[List[int]],
                     candidate_entity_prior: List[List[float]],
                     gold_entities: List[str] = None,
                     doc_id: str = None):
    #assert doc_id is not None
    token_field = TextField([Token(x) for x in tokenized_text],
                            self.token_indexers)
    span_fields = ListField(
        [SpanField(*span, token_field) for span in candidate_spans])
    candidate_entities_field = TextField([
        Token(" ".join(candidate_list))
        for candidate_list in candidate_entities
    ], token_indexers=self.entity_indexer)

    max_cands = max(len(p) for p in candidate_entity_prior)
    for p in candidate_entity_prior:
        if len(p) < max_cands:
            p.extend([0.0] * (max_cands - len(p)))
    np_prior = np.array(candidate_entity_prior)
    prior_field = ArrayField(np_prior)

    # only one segment
    candidate_segment_ids = ArrayField(
        np.array([0] * len(candidate_entities)), dtype=np.int64)

    fields = {
        "tokens": token_field,
        "candidate_spans": span_fields,
        "candidate_entities": candidate_entities_field,
        "candidate_entity_prior": prior_field,
        "candidate_segment_ids": candidate_segment_ids
    }

    if gold_entities:
        labels = TextField([Token(entity) for entity in gold_entities],
                           token_indexers=self.entity_indexer)
        fields["gold_entities"] = labels

    #fields["doc_id"] = MetadataField(doc_id)

    if self.extra_candidate_generators:
        tokens = " ".join(tokenized_text)
        extra_candidates = {
            key: generator.get_mentions_raw_text(tokens, whitespace_tokenize=True)
            for key, generator in self.extra_candidate_generators.items()
        }
        fields['extra_candidates'] = MetadataField(extra_candidates)

    return Instance(fields,
                    should_remap_span_indices=self.should_remap_span_indices)
def __init__(self,
             array: np.ndarray,
             dtype: np.dtype,
             sequence_dim: int = 0,
             padding_value: int = 0) -> None:
    # pylint: disable=super-init-not-called
    ArrayField.__init__(self, array, padding_value=padding_value)
    self._dtype = dtype
    self._sequence_dim = sequence_dim
def article_to_instance(self, paragraphs, relative_score, named_entities,
                        image, caption, image_path, web_url, pos, face_embeds,
                        obj_feats, image_id, article_id) -> Instance:
    context = paragraphs
    # context_tokens = [self._tokenizer.tokenize(p["text"]) for p in paragraphs]
    context_tokens = [self._tokenizer.tokenize(c['text']) for c in context]
    caption_tokens = self._tokenizer.tokenize(caption)
    name_token_list = [self._tokenizer.tokenize(n) for n in named_entities]

    if name_token_list:
        name_field = [
            TextField(tokens, self._token_indexers)
            for tokens in name_token_list
        ]
    else:
        stub_field = ListTextField(
            [TextField(caption_tokens, self._token_indexers)])
        name_field = stub_field.empty_field()

    # print([TextField(p, self._token_indexers) for p in context_tokens])
    # print(ListTextField([TextField(p, self._token_indexers) for p in context_tokens]))
    fields = {
        # 'context': TextField(context_tokens, self._token_indexers),
        'context': ListTextField(
            [TextField(p, self._token_indexers) for p in context_tokens]),
        # 'context': ListTextField(context_tokens),
        'names': ListTextField(name_field),
        'image': ImageField(image, self.preprocess),
        'caption': TextField(caption_tokens, self._token_indexers),
        'face_embeds': ArrayField(face_embeds, padding_value=np.nan),
        'label': LabelField(int(relative_score), skip_indexing=True)
    }
    if obj_feats is not None:
        fields['obj_embeds'] = ArrayField(obj_feats, padding_value=np.nan)

    '''metadata = {'context': context,
                   'caption': caption,
                   'names': named_entities,
                   'web_url': web_url,
                   'image_path': image_path,
                   'image_pos': pos,
                   'image_id': image_id,
                   'article_id': article_id}'''
    metadata = {}
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def _read(self, file_path: str) -> Iterator[Instance]:
    """
    This function takes a filename, reads the data and produces a stream of
    Instances.

    :param str file_path: the path to the file with the data
    :return:
    """
    # Load the data
    if 'csv' in file_path:
        df = pd.read_csv(file_path)
    else:
        df = joblib.load(file_path)

    # If we run with CV we need the pair ids to use.
    if self.pair_ids is not None:
        df = df.loc[df.pair_id.isin(self.pair_ids)]

    # Get the reviews and label columns -> no metadata, and metadata columns.
    metadata_columns = ['raisha', 'pair_id', 'sample_id']
    rounds = list(range(1, 11))  # rounds 1-10

    for i, row in tqdm.tqdm(df.iterrows()):
        # raisha is between 0 and 9 (the rounds in the raisha are
        # rounds <= raisha)
        raisha = row.raisha
        if raisha == 0:
            continue
        saifa_text_list, raisha_text_list = list(), list()
        for round_num in rounds:
            # use only available rounds
            if row[f'features_round_{round_num}'] is not None:
                if round_num <= raisha:  # rounds in raisha
                    extra_columns = [-1] * (
                        self.input_dim - len(row[f'features_round_{round_num}']))
                    raisha_data = row[f'features_round_{round_num}'] + extra_columns
                    raisha_text_list.append(
                        ArrayField(np.array(raisha_data), padding_value=-1))
                else:  # rounds in saifa
                    if self.only_raisha and round_num == raisha + 1:
                        # special vector to indicate the start of the saifa
                        saifa_data = [100] * self.input_dim
                    else:
                        extra_columns = [-1] * (
                            self.input_dim - len(row[f'features_round_{round_num}']))
                        saifa_data = row[f'features_round_{round_num}'] + extra_columns
                    saifa_text_list.append(
                        ArrayField(np.array(saifa_data), padding_value=-1))
        labels = row[self._label_column]
        metadata_dict = {column: row[column] for column in metadata_columns}
        yield self.text_to_instance(saifa_text_list=saifa_text_list,
                                    raisha_text_list=raisha_text_list,
                                    labels=labels,
                                    metadata=metadata_dict)
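# Hedged sketch of the per-round feature padding above (feature values and
# input_dim are hypothetical): short feature vectors are right-padded with -1
# up to input_dim, and -1 is also the ArrayField padding_value, so both kinds
# of padding use the same sentinel.
input_dim = 5
round_features = [0.3, 0.7]
row = round_features + [-1] * (input_dim - len(round_features))
field = ArrayField(np.array(row), padding_value=-1)  # [0.3, 0.7, -1, -1, -1]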
def article_to_bm_instance(self, paragraph, paragraph_score, named_entities,
                           image, caption, image_path, web_url, pos,
                           face_embeds, obj_feats, image_id) -> Instance:
    # context = ' BLABLA '.join([p["text"] for p in paragraphs]).strip()
    context = paragraph
    # context_tokens = [self._tokenizer.tokenize(p["text"]) for p in paragraphs]
    context_tokens = self._tokenizer.tokenize(context)
    caption_tokens = self._tokenizer.tokenize(caption)
    name_token_list = [
        self._tokenizer.tokenize(n["text"]) for n in named_entities
    ]

    if name_token_list:
        name_field = [
            TextField(tokens, self._token_indexers)
            for tokens in name_token_list
        ]
    else:
        stub_field = ListTextField(
            [TextField(caption_tokens, self._token_indexers)])
        name_field = stub_field.empty_field()

    context = TextField(context_tokens, self._token_indexers)
    context.index(self.model.vocab)
    context = context.as_tensor(context.get_padding_lengths())

    fields = {
        'context': context,
        # 'context': ListTextField([TextField(p, self._token_indexers) for p in context_tokens]),
        # 'context': ListTextField(context_tokens),
        'names': ListTextField(name_field),
        'image': ImageField(image, self.preprocess),
        'caption': TextField(caption_tokens, self._token_indexers),
        'face_embeds': ArrayField(face_embeds, padding_value=np.nan),
        'label': ArrayField(np.array([paragraph_score]))
        # 'labels': ArrayField(paragraphs_score)
    }
    if obj_feats is not None:
        fields['obj_embeds'] = ArrayField(obj_feats, padding_value=np.nan)

    metadata = {
        'context': context,
        'caption': caption,
        'names': named_entities,
        'web_url': web_url,
        'image_path': image_path,
        'image_pos': pos,
        'image_id': image_id,
        'label': paragraph_score
    }
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def test_padding_handles_list_fields_with_padding_values(self):
    array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1)
    array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1)
    empty_array = array1.empty_field()
    list_field = ListField([array1, array2, empty_array])

    returned_tensor = list_field.as_tensor(
        list_field.get_padding_lengths()).detach().cpu().numpy()
    correct_tensor = numpy.array([[[1., 1., 1., -1., -1.],
                                   [1., 1., 1., -1., -1.]],
                                  [[1., 1., 1., 1., 1.],
                                   [-1., -1., -1., -1., -1.]],
                                  [[-1., -1., -1., -1., -1.],
                                   [-1., -1., -1., -1., -1.]]])
    numpy.testing.assert_array_equal(returned_tensor, correct_tensor)
def test_as_tensor_works_with_scalar(self):
    array = ArrayField(numpy.asarray(42))
    returned_tensor = array.as_tensor(array.get_padding_lengths())
    current_tensor = numpy.asarray(42)
    numpy.testing.assert_array_equal(returned_tensor, current_tensor)