def add_numeric_values_to_questions(interaction):
    """Normalizes every question and annotates it with numeric value spans.

    For each question in `interaction.questions`, `question.text` is
    overwritten with the match-normalized form of `question.original_text`,
    and `question.annotations` is overwritten (via `CopyFrom`) with a
    `NumericValueSpans` message built from the spans parsed out of that text.

    Args:
      interaction: Object exposing an iterable `questions` attribute. The
        questions are modified in place; nothing is returned.
    """
    for current in interaction.questions:
        normalized = text_utils.normalize_for_match(current.original_text)
        current.text = normalized
        # Parse the text as stored on the question, not the local copy.
        parsed_spans = number_utils.parse_text(current.text)
        numeric_spans = interaction_pb2.NumericValueSpans(spans=parsed_spans)
        current.annotations.CopyFrom(numeric_spans)
def _add_numeric_reference_from_cell(
    cell,
    references,
    row_index,
    column_index,
):
    """Adds number and date references for a cell that is fully numeric.

    The cell text is normalized and parsed into spans. Only spans as long as
    the whole normalized text are used; each of their values (except values
    that are numerically one) is converted to an identifier and registered
    through `_add_identifier` together with the cell coordinates.

    Args:
      cell: Object with a `text` attribute.
      references: Reference collection forwarded to `_add_identifier`.
      row_index: Row coordinate forwarded to `_add_identifier`.
      column_index: Column coordinate forwarded to `_add_identifier`.
    """
    normalized = text_utils.normalize_for_match(cell.text)
    cell_length = len(normalized)
    # Only keep spans that match the entire cell.
    whole_cell_spans = (
        span for span in number_utils.parse_text(normalized)
        if span.end_index - span.begin_index == cell_length
    )
    for span in whole_cell_spans:
        for value in span.values:
            # One is special because of singular/plural and the pronoun.
            if _is_numerically_one(value):
                continue
            identifier, reference_type = _to_identifier(normalized, value)
            # The reference keeps the raw cell text, not the normalized one.
            _add_identifier(
                identifier,
                reference_type,
                cell.text,
                references,
                row_index,
                column_index,
            )
def _get_column_values(table, col_index):
    """Parses one column and maps each row index to its numeric values.

    Args:
      table: Object with a `rows` sequence; each row has `cells` whose
        entries carry a `text` attribute.
      col_index: Index of the column to read in every row.

    Returns:
      A dict with one entry per row: the row index mapped to a list of
      whatever `_get_numeric_values` yields for the normalized cell text.
    """
    return {
        position: list(
            _get_numeric_values(
                text_utils.normalize_for_match(row.cells[col_index].text)))
        for position, row in enumerate(table.rows)
    }
def _add_text_fn(element):
    """Returns a cleaned copy of a keyed interaction.

    The interaction is copied into a fresh `Interaction` message, the copy's
    table is passed through `text_utils.filter_invalid_unicode_from_table`,
    and each question's `text` is set to the normalized `original_text`.

    Args:
      element: A `(key, interaction)` pair.

    Returns:
      A `(key, copied_interaction)` tuple; all changes are applied to the
      copy rather than to the interaction that was passed in.
    """
    key, source = element
    cleaned = interaction_pb2.Interaction()
    cleaned.CopyFrom(source)
    text_utils.filter_invalid_unicode_from_table(cleaned.table)
    for entry in cleaned.questions:
        entry.text = text_utils.normalize_for_match(entry.original_text)
    return (key, cleaned)
def _get_question_cost(
    tokenizer,
    question,
):
    """Computes length of the serialized question (w/ special token offset).

    Args:
      tokenizer: Object providing `tokenize` and `question_encoding_cost`.
      question: Object with an `original_text` attribute.

    Returns:
      Whatever `tokenizer.question_encoding_cost` reports for the tokens of
      the normalized question text.
    """
    normalized = text_utils.normalize_for_match(question.original_text)
    question_tokens = tokenizer.tokenize(normalized)
    return tokenizer.question_encoding_cost(question_tokens)
def get_interaction(interaction, table, statement, result, name):
    """Builds a single-question interaction for a verbalized statement.

    Args:
      interaction: Source interaction. Its `id` is reused when non-empty,
        otherwise `interaction.table.table_id` is used as the id; its table
        id is always copied to the new table.
      table: Table converted with `_to_table_proto` to fill the new table.
      statement: Object whose `verbalize()` supplies the question text.
      result: `EvaluationResult.TRUE` or `EvaluationResult.FALSE`.
      name: Suffix joined to the interaction id with '_' to form the
        question id.

    Returns:
      A new `interaction_pb2.Interaction` with exactly one question whose
      `answer.class_index` is 1 for TRUE and 0 for FALSE.

    Raises:
      ValueError: If `result` is neither TRUE nor FALSE.
    """
    new_interaction = interaction_pb2.Interaction()
    if interaction.id:
        new_interaction.id = interaction.id
    else:
        new_interaction.id = interaction.table.table_id

    new_interaction.table.CopyFrom(_to_table_proto(table))
    # The converted table keeps the id of the table it was derived from.
    new_interaction.table.table_id = interaction.table.table_id

    new_question = new_interaction.questions.add()
    new_question.id = new_interaction.id + '_' + name
    new_question.original_text = statement.verbalize()
    new_question.text = text_utils.normalize_for_match(
        new_question.original_text)

    if result == EvaluationResult.TRUE:
        new_question.answer.class_index = 1
    elif result == EvaluationResult.FALSE:
        new_question.answer.class_index = 0
    else:
        # f-string so the message reports the offending value; the original
        # plain literal printed the text "{result}" verbatim.
        raise ValueError(f'Unexpected: {result}')
    return new_interaction
def _get_question_references(question):
    """Converts numeric and entity annotations in question to references.

    Numeric spans are parsed from the normalized question text; every value
    that is not numerically one is registered via `_add_identifier`. Entity
    annotations are then read from the `annotated_question_ext` extension
    and registered with `ReferenceType.ENTITY`.

    Args:
      question: Question message with `original_text` and the
        `AnnotatedText.annotated_question_ext` extension.

    Returns:
      The `references` dict populated by `_add_identifier`.
    """
    references = {}
    original_text = question.original_text

    # NOTE(review): span offsets are computed on the normalized text but are
    # used to slice the original text; this presumably relies on
    # normalization preserving character offsets -- confirm.
    normalized = text_utils.normalize_for_match(original_text)
    for span in number_utils.parse_text(normalized):
        begin, end = span.begin_index, span.end_index
        for value in span.values:
            # One is special because of singular/plural and the pronoun.
            if _is_numerically_one(value):
                continue
            span_text = original_text[begin:end]
            identifier, reference_type = _to_identifier(span_text, value)
            _add_identifier(
                identifier,
                reference_type,
                span_text,
                references,
                begin,
                end,
            )

    entity_annotations = question.Extensions[
        annotated_text_pb2.AnnotatedText.annotated_question_ext].annotations
    for entity in entity_annotations:
        # NOTE(review): these are *byte* indices applied to a str slice;
        # presumably fine for ASCII-only text -- verify for other input.
        start = entity.begin_byte_index
        stop = entity.end_byte_index
        _add_identifier(
            entity.identifier,
            ReferenceType.ENTITY,
            original_text[start:stop],
            references,
            start,
            stop,
        )
    return references
def test_normalize_for_match_lowercases(self):
    """Checks that `normalize_for_match` lowercases an all-caps word."""
    normalized = text_utils.normalize_for_match("LOWERCASE")
    self.assertEqual("lowercase", normalized)