Example #1
# These examples assume TensorFlow 1.x plus project-local helpers
# (vocab, util, read_examples_from_file, parse_instance, EditNoiser, ...).
def input_fn(file_path,
             vocab_table,
             batch_size,
             num_epochs=None,
             num_examples=None,
             seed=0,
             noiser=None,
             use_free_set=False,
             shuffle_input=True):
    # Note: the vocab_table argument is replaced by the shared lookup table.
    vocab_table = vocab.get_vocab_lookup_tables()[vocab.STR_TO_INT]

    pad_token = tf.constant(bytes(PAD_TOKEN, encoding='utf8'), dtype=tf.string)
    pad_id = vocab_table.lookup(pad_token)

    base_dataset = read_examples_from_file(
        file_path, num_examples, seed, noiser,
        util.get_free_words_set() if use_free_set else None)

    dataset_splits = []
    for index in range(len(base_dataset[0])):
        split_dtype = infer_dtype(base_dataset[0][index])

        split = tf.data.Dataset.from_generator(
            generator=get_generator(base_dataset, index),
            output_types=split_dtype,
            output_shapes=(None,))

        # String columns are padded with the literal token, id columns with
        # the corresponding vocabulary id.
        if split_dtype == tf.string:
            pad = pad_token
        else:
            pad = pad_id

        split = split.padded_batch(batch_size,
                                   padded_shapes=[None],
                                   padding_values=pad)

        dataset_splits.append(split)

    dataset = tf.data.Dataset.zip(tuple(dataset_splits))
    if num_epochs and shuffle_input:
        dataset = dataset.apply(
            tf.contrib.data.shuffle_and_repeat(500, num_epochs))
    elif num_epochs:
        dataset = dataset.repeat(num_epochs)

    # tf.estimator expects (features, labels) pairs; attach a dummy label.
    fake_label = tf.data.Dataset.from_tensor_slices(tf.constant([0])).repeat()

    dataset = tf.data.Dataset.zip((dataset, fake_label)).prefetch(1)

    return dataset
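
A minimal consumption sketch for this pipeline, assuming TF 1.x graph mode; the file name and batch size are placeholders, not values from the project. Both the vocab lookup table and the iterator need explicit initialization:

# Hypothetical usage; 'train.tsv' and the sizes are placeholders.
dataset = input_fn('train.tsv', None, batch_size=32, num_epochs=1)
iterator = dataset.make_initializable_iterator()
features, fake_label = iterator.get_next()

with tf.Session() as sess:
    sess.run([tf.tables_initializer(), iterator.initializer])
    first_batch = sess.run(features)  # tuple of padded per-column tensors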
Example #2
def create_formulas(plans, config):
    noiser = EditNoiser.from_config(config)
    free_set = util.get_free_words_set() if config.get('editor.use_free_set', False) else None

    formulas = []
    formula2plan = []
    for i, (base, edits) in enumerate(plans):
        for j, edit_vector_pair in enumerate(edits):
            # The trailing comma is deliberate: it wraps the converted words
            # in a 1-tuple so they concatenate with edit_instance below.
            base_words = (convert_to_bytes(base.split(' ')),)
            edit_instance = parse_instance(edit_vector_pair, noiser, free_set)
            formula = base_words + edit_instance

            formulas.append(formula)
            formula2plan.append((i, j))

    return formulas, formula2plan
Example #3
def create_formulas(plans, config):
    noiser = EditNoiser.from_config(config)
    free_set = util.get_free_words_set() if config.get('editor.use_free_set',
                                                       False) else None

    formulas = []
    formula2plan = []
    for i, (base, edits) in enumerate(plans):
        for j, (src, tgt) in enumerate(edits):
            instance = {'base': base, 'src': src, 'tgt': tgt}
            formula = parse_instance(instance, noiser, free_set)

            formulas.append(formula)
            formula2plan.append((i, j))

    return formulas, formula2plan
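
In this variant each edit is an explicit (src, tgt) pair, so the expected shape of plans can be sketched directly; the sentences below are placeholders, not data from the project:

# Hypothetical plans structure for this variant.
plans = [
    ('base sentence one', [('src a', 'tgt a'), ('src b', 'tgt b')]),
    ('base sentence two', [('src c', 'tgt c')]),
]
# formula2plan maps each flat formula index back to (plan index, edit index);
# for the plans above it would be [(0, 0), (0, 1), (1, 0)].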
Example #4
def input_fn(file_path,
             vocab_table,
             config,
             batch_size,
             num_epochs=None,
             num_examples=None,
             seed=0,
             noiser=None,
             use_free_set=False,
             shuffle_input=True):
    gen = read_examples_from_file(
        file_path, config, num_examples, seed,
        noiser, util.get_free_words_set() if use_free_set else None, return_gen=True
    )

    probs = util.load_str_list(str(file_path) + '_probs')
    probs = [float(p) for p in probs]
    dataset_probs = tf.data.Dataset.from_tensor_slices(
        np.array(probs, dtype=np.float32).reshape((-1, 1)))
    dataset_probs = dataset_probs.batch(batch_size)

    vocab_table = vocab.get_vocab_lookup_tables()[vocab.STR_TO_INT]

    pad_id = tf.constant(vocab.SPECIAL_TOKENS.index(PAD_TOKEN), dtype=tf.int64)

    dataset = tf.data.Dataset.from_generator(
        generator=gen,
        output_types=(tf.string, tf.string, tf.string, tf.string, tf.string, tf.string),
        output_shapes=(tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([None]))
    )
    dataset = dataset.map(lambda *x: tuple([vocab_table.lookup(i) for i in x]))
    dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([None])),
        padding_values=tuple([pad_id] * 6)
    )

    dataset = tf.data.Dataset.zip((dataset, dataset_probs))

    if num_epochs and shuffle_input:
        dataset = dataset.apply(tf.contrib.data.shuffle_and_repeat(500, num_epochs))
    elif num_epochs:
        dataset = dataset.repeat(num_epochs)

    fake_label = tf.data.Dataset.from_tensor_slices(tf.constant([0])).repeat()

    dataset = dataset.zip((dataset, fake_label))
    dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    return dataset
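
This variant also zips in per-example probabilities read from a sidecar file named '<file_path>_probs', assumed here (based on the float parsing above) to hold one float per line:

# Hypothetical sidecar file; values and file name are placeholders.
example_probs = [0.5, 0.3, 0.2]
with open('train.tsv_probs', 'w') as f:
    f.write('\n'.join(str(p) for p in example_probs))
# input_fn then yields ((token_id_tensors, probs), fake_label) batches.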
Example #5
def create_formulas(plans, config):
    noiser = EditNoiser.from_config(config)
    free_set = util.get_free_words_set() if config.get('editor.use_free_set', False) else None

    process_instance_fn = get_process_example_fn(config)

    formulas = []
    formula2plan = []
    for i, (base, edits) in enumerate(plans):
        output = ''
        basic_formula = (base, output)
        for j, edit_vector_pair in enumerate(edits):
            instance = basic_formula + edit_vector_pair
            instance = parse_instance('\t'.join(instance), noiser, free_set)
            formula = process_instance_fn(instance)
            formulas.append(formula)
            formula2plan.append((i, j))

    return formulas, formula2plan
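
Here parse_instance receives one tab-separated record per edit; assuming each edit pair is a (src, tgt) tuple, the joined record looks like this:

# What '\t'.join(instance) produces, with placeholder strings.
base, output = 'base sentence', ''
record = '\t'.join((base, output) + ('edit source', 'edit target'))
# -> 'base sentence\t\tedit source\tedit target' (output slot left empty)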
Example #6
def input_fn(file_path,
             vocab_table,
             config,
             batch_size,
             num_epochs=None,
             num_examples=None,
             seed=0,
             noiser=None,
             use_free_set=False,
             shuffle_input=True):
    gen = read_examples_from_file(
        file_path,
        config,
        num_examples,
        seed,
        noiser,
        util.get_free_words_set() if use_free_set else None,
        return_gen=True)

    # The vocab_table argument is replaced by the shared lookup table.
    vocab_table = vocab.get_vocab_lookup_tables()[vocab.STR_TO_INT]

    pad_id = tf.constant(vocab.SPECIAL_TOKENS.index(PAD_TOKEN), dtype=tf.int64)

    dataset = tf.data.Dataset.from_generator(
        generator=gen,
        output_types=(tf.string, tf.string, tf.string, tf.string, tf.string,
                      tf.string),
        output_shapes=(tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([None])))
    dataset = dataset.map(lambda *x: tuple([vocab_table.lookup(i) for i in x]))
    dataset = dataset.padded_batch(batch_size,
                                   padded_shapes=(tf.TensorShape([None]),
                                                  tf.TensorShape([None]),
                                                  tf.TensorShape([None]),
                                                  tf.TensorShape([None]),
                                                  tf.TensorShape([None]),
                                                  tf.TensorShape([None])),
                                   padding_values=tuple([pad_id] * 6))

    if num_epochs and shuffle_input:
        dataset = dataset.apply(
            tf.contrib.data.shuffle_and_repeat(500, num_epochs))
    elif num_epochs:
        dataset = dataset.repeat(num_epochs)

    fake_label = tf.data.Dataset.from_tensor_slices(tf.constant([0])).repeat()

    dataset = dataset.zip((dataset, fake_label))
    dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    return dataset
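
Finally, a sketch of wiring this input_fn into a TF 1.x Estimator; model_fn, the file name, and the config object are assumptions, not part of the examples above:

# Hypothetical wiring; model_fn and config are assumed to exist.
estimator = tf.estimator.Estimator(model_fn=model_fn)
estimator.train(input_fn=lambda: input_fn(
    'train.tsv', None, config, batch_size=64, num_epochs=10))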