def synthesize_fn(
    key_interaction,
    config,
    add_opposite_table,
    use_fake_table,
):
    """Synthesizes statements for a single keyed interaction.

    The original note on this function says it synthesizes up to 4
    statements; no such limit is enforced in this function itself.

    Args:
      key_interaction: A `(key, interaction)` pair. The key seeds the random
        number generator, so the same key gives the same seed.
      config: Forwarded unchanged to
        `synthesize_entablement.synthesize_from_interaction`.
      add_opposite_table: Forwarded unchanged to the synthesizer.
      use_fake_table: If true, `_clear_table` is applied to every synthesized
        interaction before it is emitted.

    Yields:
      `(interaction id, interaction)` pairs, one per synthesized interaction.
    """
    key, interaction = key_interaction
    seed = beam_utils.to_numpy_seed(key)
    rng = np.random.RandomState(seed)
    synthesized = synthesize_entablement.synthesize_from_interaction(
        config, rng, interaction, FlumeCounter(), add_opposite_table)
    for candidate in synthesized:
        if use_fake_table:
            _clear_table(candidate)
        yield candidate.id, candidate
def _to_retrieval_interaction_fn(interaction, min_tokens=4, max_tokens=32):
    """Converts pretraining interaction to retrieval interactions.

    Every question except the one whose id is `_TITLE_QUESTION_ID` has its
    text split into paragraphs (on newlines) and then into sentences. Each
    non-empty sentence whose word-token count lies in
    `[min_tokens, max_tokens]` is emitted as a new interaction that carries
    that sentence as its only question.

    Args:
      interaction: The pretraining interaction to convert.
      min_tokens: Sentences with fewer word tokens than this are dropped and
        counted as "Sentence too short".
      max_tokens: Sentences with more word tokens than this are dropped and
        counted as "Sentence too long".

    Yields:
      `(interaction id, interaction)` pairs. Nothing is yielded when the
      interaction has no title.
    """
    beam.metrics.Metrics.counter(_NS, "Interactions").inc()
    title = get_title(interaction)
    # `not title` already covers both None and the empty string.
    if not title:
        beam.metrics.Metrics.counter(_NS, "Interactions without title").inc()
        return

    interaction = beam_utils.rekey(interaction)
    interaction.table.document_title = title

    # Build the question-free template once, so that the full question list
    # is not copied (and immediately discarded) for every emitted sentence.
    template = interaction_pb2.Interaction()
    template.CopyFrom(interaction)
    del template.questions[:]

    word_tok = nltk.tokenize.treebank.TreebankWordTokenizer()

    for question in interaction.questions:
        if question.id == _TITLE_QUESTION_ID:
            continue

        text = question.original_text

        for paragraph in text.split("\n"):
            for sentence in sentence_tokenizer.tokenize(paragraph):
                sentence = sentence.strip()
                if not sentence:
                    continue

                beam.metrics.Metrics.counter(_NS, "Sentences").inc()
                num_tokens = len(word_tok.tokenize(sentence))
                if num_tokens < min_tokens:
                    beam.metrics.Metrics.counter(_NS,
                                                 "Sentence too short").inc()
                    continue
                if num_tokens > max_tokens:
                    beam.metrics.Metrics.counter(_NS,
                                                 "Sentence too long").inc()
                    continue

                new_interaction = interaction_pb2.Interaction()
                new_interaction.CopyFrom(template)
                new_question = new_interaction.questions.add()
                # The id is derived from the (rekeyed) interaction id and the
                # sentence text; it is shared by question and interaction.
                new_question.id = hex(
                    beam_utils.to_numpy_seed(obj=(interaction.id, sentence)))
                new_interaction.id = new_question.id
                new_question.original_text = sentence

                beam.metrics.Metrics.counter(_NS, "Examples").inc()
                yield new_interaction.id, new_interaction
def _to_id(obj):
    """Returns the hex string of the numpy seed derived from `obj`."""
    seed = beam_utils.to_numpy_seed(obj)
    return hex(seed)
def _to_contrastive_statements_fn(
    key_interaction,
    use_fake_table,
    drop_without_support_rate,
):
    """Converts pretraining interaction to contrastive interactions.

    Each emitted interaction holds exactly two questions: the statement
    (answer class index 1) followed by its contrastive statement (answer
    class index 0).

    Args:
      key_interaction: A keyed interaction; only the element at index 1 (the
        interaction) is read.
      use_fake_table: If true, `_clear_table` is applied to every emitted
        interaction.
      drop_without_support_rate: A pair without support is dropped whenever a
        draw from `rng.random()` falls below this value.

    Yields:
      `(interaction id, interaction)` pairs, one per kept pair.
    """

    def _bump(counter_name):
        beam.metrics.Metrics.counter(_NS, counter_name).inc()

    # Beam functions should not manipulate their inputs, so work on a copy.
    source = interaction_pb2.Interaction()
    source.CopyFrom(key_interaction[1])

    table_id = source.table.table_id
    # Seeded from the table id, so the same table id gives the same seed.
    rng = random.Random(beam_utils.to_numpy_seed(table_id))

    seen_contrastive = set()

    pairs = contrastive_statements.get_contrastive_statements(
        rng, source, count_fn=_count)
    for has_support, statement, contrastive_statement in pairs:
        _bump("Pairs")

        # The random draw only happens for pairs without support.
        if not has_support:
            if rng.random() < drop_without_support_rate:
                _bump("Pairs: Down-sampled pairs without support")
                continue

        if contrastive_statement in seen_contrastive:
            _bump("Pairs: Duplicates")
            continue
        seen_contrastive.add(contrastive_statement)

        pair_interaction = interaction_pb2.Interaction()
        pair_interaction.CopyFrom(source)
        del pair_interaction.questions[:]

        pair_interaction.id = _to_id((
            table_id,
            (statement, contrastive_statement),
        ))

        if use_fake_table:
            _clear_table(pair_interaction)

        pair_interaction.table.table_id = pair_interaction.id

        # Statement first (class 1), then its contrastive variant (class 0).
        for text, class_index in ((statement, 1), (contrastive_statement, 0)):
            question = pair_interaction.questions.add()
            question.id = _to_id((table_id, text))
            question.original_text = text
            question.answer.class_index = class_index

        _bump("Pairs emitted")
        yield pair_interaction.id, pair_interaction