Exemple #1
0
def _key_with_random_key(
    example,
    seed,
    num_random_table_bins,
    num_corpus_bins,
):
    """Re-keys an example with a split-aware pseudo-random bucket id.

    Args:
      example: A `(key, value)` pair; only the key feeds the random seed.
      seed: Base integer seed; the fingerprint of the key is added to it.
      num_random_table_bins: Upper bound handed to `randint`. That bound is
        inclusive, so bucket ids range over `0..num_random_table_bins`.
      num_corpus_bins: Forwarded to `pretrain_utils.partition_fn`.

    Returns:
      A `(new_key, example)` tuple where `new_key` has the form
      `'<partition>_<bucket>'` and `example` is passed through unchanged.
    """
    original_key, _ = example
    # A dedicated generator per example keeps the bucket deterministic for a
    # given base seed and key.
    generator = random.Random(seed + tf_example_utils.fingerprint(original_key))
    # Prefix with the train/test partition so random buckets don't cross over.
    split = pretrain_utils.partition_fn(example, 2, num_corpus_bins)
    bucket = generator.randint(0, num_random_table_bins)
    return f'{split}_{bucket}', example
    def process(self, element):
        """Converts one keyed (interaction, random_table) pair to an example.

        Args:
          element: A `(key, (interaction, random_table))` tuple.

        Yields:
          `(prepand_fingerprint(key), example)` when the converter returns a
          truthy example; nothing otherwise.
        """
        beam.metrics.Metrics.counter(_NS, "Interactions").inc()

        key, payload = element
        interaction, random_table = payload

        # The seed mixes the key with the configured random seed and sequence
        # length, so the generator is reproducible for a given key and config.
        config = self._config
        seed_text = "%s_%d_%d" % (
            key, config.random_seed, config.max_seq_length)
        generator = random.Random(tf_example_utils.fingerprint(seed_text))

        converted = self._converter.convert(
            generator, interaction, random_table)
        if not converted:
            return
        beam.metrics.Metrics.counter(_NS, "Examples").inc()
        yield prepand_fingerprint(key), converted
def fingerprint(key):
    """Returns the absolute fingerprint of `key` as upper-case hex text.

    The result is zero-padded to a minimum width of eight hex digits.
    """
    magnitude = abs(tf_example_utils.fingerprint(key))
    return "%08X" % magnitude
def to_numpy_seed(obj):
    """Derives a seed from `repr(obj)`: its fingerprint modulo `_MAX_INT`."""
    representation = repr(obj)
    return tf_example_utils.fingerprint(representation) % _MAX_INT
    def convert(
        self,
        interaction,
        index,
        negative_example,
    ):
        """Converts question at 'index' to example.

        The table is serialized first, with the document title tokens taking
        the place of the question, and columns/rows are dropped until it fits.
        Question features are then added and, when a negative example is
        given, the features of its table are joined in.

        Args:
          interaction: Object providing `table` and `questions`.
          index: Position of the question within `interaction.questions`.
          negative_example: Object whose `table` is serialized as the negative
            table, or `None` to emit no negative features.

        Returns:
          A `tf.train.Example` holding the assembled features.

        Raises:
          ValueError: If the table still does not fit once the number of rows
            or columns has been reduced to zero. Errors raised while
            serializing the negative table are not caught here.
        """
        table = interaction.table

        # Clamp the table dimensions to just below the configured maximum ids.
        num_rows = min(len(table.rows), self._max_row_id - 1)
        num_columns = min(len(table.columns), self._max_column_id - 1)

        title = table.document_title if self._use_document_title else ''
        title_tokens = self._tokenizer.tokenize(title)
        tokenized_table = self._tokenize_table(table)

        while True:
            try:
                _, features = self._to_trimmed_features(
                    question=None,
                    table=table,
                    question_tokens=title_tokens,
                    tokenized_table=tokenized_table,
                    num_columns=num_columns,
                    num_rows=num_rows)
                break
            except ValueError:
                # Does not fit yet; shrink the table below and retry.
                pass
            # Since this is retrieval we might get away with removing some cells of
            # the table.
            # TODO(thomasmueller) Consider taking the token length into account.
            if num_columns >= num_rows:
                num_columns -= 1
            else:
                num_rows -= 1
            if num_columns == 0 or num_rows == 0:
                raise ValueError('Cannot fit table into sequence.')

        question = interaction.questions[index]
        features['question_id'] = base.create_string_feature(
            [question.id.encode('utf8')])
        features['question_id_ints'] = base.create_int_feature(
            text_utils.str_to_ints(question.id,
                                   length=text_utils.DEFAULT_INTS_LENGTH))

        q_tokens = self._tokenizer.tokenize(question.text)
        q_tokens = self._serialize_text(q_tokens)[0]
        q_tokens.append(base.Token(_SEP, _SEP))
        q_input_ids = self._to_token_ids(q_tokens)
        self._pad_to_seq_length(q_input_ids)
        # The mask marks the real (unpadded) question tokens before padding.
        q_input_mask = [1] * len(q_tokens)
        self._pad_to_seq_length(q_input_mask)
        features['question_input_ids'] = base.create_int_feature(q_input_ids)
        features['question_input_mask'] = base.create_int_feature(q_input_mask)
        if question:
            features['question_hash'] = base.create_int_feature(
                [base.fingerprint(question.text) % _MAX_INT])

        if negative_example is not None:
            n_table = negative_example.table
            # Apply the same title policy as for the positive table; otherwise
            # only negatives would carry a title when titles are disabled.
            n_title = n_table.document_title if self._use_document_title else ''
            n_title_tokens = self._tokenizer.tokenize(n_title)
            n_tokenized_table = self._tokenize_table(n_table)
            n_num_rows = self._get_num_rows(n_table, drop_rows_to_fit=True)
            n_num_columns = self._get_num_columns(n_table)
            _, n_example_features = self._to_trimmed_features(
                question=None,
                table=n_table,
                question_tokens=n_title_tokens,
                tokenized_table=n_tokenized_table,
                num_columns=n_num_columns,
                num_rows=n_num_rows,
                drop_rows_to_fit=True)
            _join_features(features, n_example_features)
        return tf.train.Example(features=tf.train.Features(feature=features))