def synthesize_fn(
    key_interaction,
    config,
    add_opposite_table,
    use_fake_table,
):
  """Synthesizes up to 4 statements for one keyed interaction.

  Args:
    key_interaction: A (key, interaction) pair; the key seeds the RNG so
      synthesis is deterministic per input.
    config: Synthesis config forwarded to `synthesize_from_interaction`.
    add_opposite_table: Forwarded to `synthesize_from_interaction`.
    use_fake_table: If True, the table of each emitted interaction is cleared.

  Yields:
    (interaction id, interaction) pairs for each synthesized interaction.
  """
  seed_key, source_interaction = key_interaction
  # Deterministic per-key RNG so reruns produce identical output.
  rng = np.random.RandomState(beam_utils.to_numpy_seed(seed_key))
  synthesized = synthesize_entablement.synthesize_from_interaction(
      config,
      rng,
      source_interaction,
      FlumeCounter(),
      add_opposite_table,
  )
  for result in synthesized:
    if use_fake_table:
      _clear_table(result)
    yield result.id, result
def _to_retrieval_interaction_fn(interaction):
  """Converts a pretraining interaction to retrieval interactions.

  Splits every question's text into paragraphs and sentences, then emits one
  copy of the interaction per sentence whose word-token count is in [4, 32].
  Interactions without a title are dropped (counted, not emitted).

  Args:
    interaction: An `interaction_pb2.Interaction` proto.

  Yields:
    (interaction id, interaction) pairs, one per kept sentence; each emitted
    interaction carries exactly one question holding that sentence.
  """
  beam.metrics.Metrics.counter(_NS, "Interactions").inc()
  title = get_title(interaction)
  # A falsy check covers both None and the empty string.
  if not title:
    beam.metrics.Metrics.counter(_NS, "Interactions without title").inc()
    return
  interaction = beam_utils.rekey(interaction)
  interaction.table.document_title = title
  # Created once per call; reused for every sentence below.
  word_tok = nltk.tokenize.treebank.TreebankWordTokenizer()
  for question in interaction.questions:
    # The synthetic title question is metadata, not retrieval text.
    if question.id == _TITLE_QUESTION_ID:
      continue
    text = question.original_text
    for paragraph in text.split("\n"):
      for sentence in sentence_tokenizer.tokenize(paragraph):
        sentence = sentence.strip()
        if not sentence:
          continue
        beam.metrics.Metrics.counter(_NS, "Sentences").inc()
        # `tokens` is the token list; its length gates sentence size.
        tokens = word_tok.tokenize(sentence)
        if len(tokens) < 4:
          beam.metrics.Metrics.counter(_NS, "Sentence too short").inc()
          continue
        if len(tokens) > 32:
          beam.metrics.Metrics.counter(_NS, "Sentence too long").inc()
          continue
        new_interaction = interaction_pb2.Interaction()
        new_interaction.CopyFrom(interaction)
        del new_interaction.questions[:]
        new_question = new_interaction.questions.add()
        # Id is a deterministic hash of (source id, sentence) so the same
        # sentence from the same interaction always maps to the same id.
        new_question.id = hex(
            beam_utils.to_numpy_seed(obj=(interaction.id, sentence)))
        new_interaction.id = new_question.id
        new_question.original_text = sentence
        beam.metrics.Metrics.counter(_NS, "Examples").inc()
        yield new_interaction.id, new_interaction
def _to_id(obj):
  """Returns a stable hex-string id derived from `obj`'s numpy seed."""
  seed = beam_utils.to_numpy_seed(obj)
  return hex(seed)
def _to_contrastive_statements_fn( key_interaction, use_fake_table, drop_without_support_rate, ): """Converts pretraining interaction to contrastive interaction.""" # Make a copy since beam functions should not manipulate inputs. new_interaction = interaction_pb2.Interaction() new_interaction.CopyFrom(key_interaction[1]) interaction = new_interaction iid = interaction.table.table_id rng = random.Random(beam_utils.to_numpy_seed(iid)) generated_statements = set() for result in contrastive_statements.get_contrastive_statements( rng, interaction, count_fn=_count): has_support, statement, contrastive_statement = result beam.metrics.Metrics.counter(_NS, "Pairs").inc() if not has_support and rng.random() < drop_without_support_rate: beam.metrics.Metrics.counter( _NS, "Pairs: Down-sampled pairs without support").inc() continue if contrastive_statement in generated_statements: beam.metrics.Metrics.counter(_NS, "Pairs: Duplicates").inc() continue generated_statements.add(contrastive_statement) new_interaction = interaction_pb2.Interaction() new_interaction.CopyFrom(interaction) del new_interaction.questions[:] new_interaction.id = _to_id(( iid, (statement, contrastive_statement), )) if use_fake_table: _clear_table(new_interaction) new_interaction.table.table_id = new_interaction.id new_question = new_interaction.questions.add() new_question.id = _to_id((iid, statement)) new_question.original_text = statement new_question.answer.class_index = 1 new_question = new_interaction.questions.add() new_question.id = _to_id((iid, contrastive_statement)) new_question.original_text = contrastive_statement new_question.answer.class_index = 0 beam.metrics.Metrics.counter(_NS, "Pairs emitted").inc() yield new_interaction.id, new_interaction