Example 1
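Both examples are excerpts of a main() from a larger preprocessing module: the flags they read (task_name, input_path, thief_dataset, vocab_mode, scheme, ed1_changes, output_path, and so on) are defined elsewhere in that module with absl flags. A preamble along the following lines is assumed; the exact module paths, and in particular the name used here for the pp_util helper module, are illustrative guesses rather than part of the original code.

import random

from absl import app
from absl import flags
from absl import logging
import tqdm
import tensorflow.compat.v1 as tf

# hypothetical import name; pp_util stands for the project's preprocessing helpers
import preprocess_util as pp_util

gfile = tf.gfile  # the snippets use the TF1-style gfile.Open API
FLAGS = flags.FLAGS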
def main(_):
    random.seed(FLAGS.random_seed)
    task_name = FLAGS.task_name.lower()

    with gfile.Open(FLAGS.input_path, "r") as f:
        sents_data = f.read().strip().split("\n")

    with gfile.Open(FLAGS.thief_dataset, "r") as f:
        thief_data = f.read().strip().split("\n")

    header = sents_data[0]
    sents_data = sents_data[1:]

    vocab, probs = pp_util.build_vocab(thief_data,
                                       task_name="list-sentences",
                                       vocab_mode=FLAGS.vocab_mode,
                                       vocab_path=FLAGS.vocab_path)
    vocab_dict = {x: i for i, x in enumerate(vocab)}
    output_data = []

    if FLAGS.dataset_size:
        points_remaining = FLAGS.dataset_size
        new_sents_data = []
        while points_remaining > len(sents_data):
            new_sents_data.extend(sents_data)
            points_remaining -= len(sents_data)
        new_sents_data.extend(sents_data[:points_remaining])
        sents_data = new_sents_data

    for _ in range(FLAGS.augmentations):
        for sent in tqdm.tqdm(sents_data):
            data_point_parts = sent.split("\t")

            if FLAGS.scheme.startswith("random_ed_k_"):
                premise_ind, hypo_ind = pp_util.task_input_indices[task_name]
                # sample random sentence from the thief dataset
                new_premise = pp_util.sample_thief_data(
                    thief_data,
                    sanitize=FLAGS.sanitize_samples,
                    vocab=vocab,
                    vocab_dict=vocab_dict).split()
                data_point_parts[premise_ind] = pp_util.detokenize(new_premise)
                # Starting from premise, make multiple ed1 changes to form hypothesis
                new_premise = pp_util.token_replace(
                    token_list=new_premise,
                    vocab=vocab,
                    probs=None,
                    num_changes=FLAGS.ed1_changes)

                data_point_parts[hypo_ind] = pp_util.detokenize(new_premise)

            elif FLAGS.scheme.startswith("random_"):
                # For every index having textual input, do a random replacement
                for index in pp_util.task_input_indices[task_name]:
                    # sample random sentence from the thief dataset
                    new_sent = pp_util.sample_thief_data(
                        thief_data,
                        sanitize=FLAGS.sanitize_samples,
                        vocab=vocab,
                        vocab_dict=vocab_dict).split()
                    data_point_parts[index] = pp_util.detokenize(new_sent)

            elif FLAGS.scheme.startswith("shuffle_"):
                # only a valid scheme for pairwise datasets
                premise_ind, hypo_ind = pp_util.task_input_indices[task_name]
                # sample random sentence from the thief dataset
                new_premise = pp_util.sample_thief_data(
                    thief_data,
                    sanitize=FLAGS.sanitize_samples,
                    vocab=vocab,
                    vocab_dict=vocab_dict).split()
                data_point_parts[premise_ind] = pp_util.detokenize(new_premise)
                # Shuffle words for hypothesis
                random.shuffle(new_premise)
                data_point_parts[hypo_ind] = pp_util.detokenize(new_premise)

            elif FLAGS.scheme.startswith("random_ed_all_"):
                premise_ind, hypo_ind = pp_util.task_input_indices[task_name]
                # sample random sentence from the thief dataset
                new_premise = pp_util.sample_thief_data(
                    thief_data,
                    sanitize=FLAGS.sanitize_samples,
                    vocab=vocab,
                    vocab_dict=vocab_dict).split()
                data_point_parts[premise_ind] = pp_util.detokenize(new_premise)
                # Starting from premise, make multiple ed1 changes to form hypothesis

                # First, randomly sample the type of change that needs to be made
                change_type = random.choice(
                    ["replace", "drop", "add", "random"])
                # Next, randomly sample the number of ed1 changes that need to be made
                # FLAGS.ed1_changes represents the upper-bound
                num_changes = random.randint(1, FLAGS.ed1_changes)

                if change_type == "drop" and num_changes >= len(new_premise):
                    change_type = random.choice(["replace", "add"])

                if change_type == "replace":
                    new_premise = pp_util.token_replace(
                        token_list=new_premise,
                        vocab=vocab,
                        probs=probs,
                        num_changes=num_changes)

                elif change_type == "drop":
                    new_premise = pp_util.token_drop(token_list=new_premise,
                                                     num_changes=num_changes)

                elif change_type == "add":
                    new_premise = pp_util.token_add(token_list=new_premise,
                                                    vocab=vocab,
                                                    probs=probs,
                                                    scheme=FLAGS.scheme,
                                                    num_changes=num_changes)

                elif change_type == "random":
                    # in the random mode, just sample another sentence from corpus
                    new_premise = pp_util.sample_thief_data(
                        thief_data,
                        sanitize=FLAGS.sanitize_samples,
                        vocab=vocab,
                        vocab_dict=vocab_dict).split()

                data_point_parts[hypo_ind] = pp_util.detokenize(new_premise)

            # Once all sentences have been replaced, add to corpus
            output_data.append("\t".join(data_point_parts))

    logging.info("Final dataset size = %d", len(output_data))

    output_data = [header] + output_data

    with gfile.Open(FLAGS.output_path, "w") as f:
        f.write("\n".join(output_data) + "\n")

    return
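Example 1 relies on three pp_util helpers to turn a sampled premise into a nearby hypothesis: token_replace, token_drop and token_add, each applying num_changes single-token (edit-distance-one) edits. Their actual implementations are not part of the snippet; the following is a minimal standard-library sketch of the behavior the calls above assume, with all names and details being assumptions rather than the real pp_util code.

import random

def token_replace(token_list, vocab, probs=None, num_changes=1):
    """Overwrite num_changes randomly chosen positions with words drawn from vocab."""
    tokens = list(token_list)
    for _ in range(min(num_changes, len(tokens))):
        pos = random.randrange(len(tokens))
        tokens[pos] = random.choices(vocab, weights=probs, k=1)[0]
    return tokens

def token_drop(token_list, num_changes=1):
    """Delete num_changes randomly chosen tokens, never emptying the sentence."""
    tokens = list(token_list)
    for _ in range(num_changes):
        if len(tokens) <= 1:
            break
        tokens.pop(random.randrange(len(tokens)))
    return tokens

def token_add(token_list, vocab, probs=None, scheme=None, num_changes=1):
    """Insert num_changes words drawn from vocab at random positions.

    scheme is accepted only to mirror the call in the example; this sketch ignores it.
    """
    tokens = list(token_list)
    for _ in range(num_changes):
        pos = random.randrange(len(tokens) + 1)
        tokens.insert(pos, random.choices(vocab, weights=probs, k=1)[0])
    return tokens

For instance, with vocab = ["a", "b", "c"], token_replace(["x", "y"], vocab, num_changes=1) returns a two-token list in which exactly one position has been resampled from vocab, which is the kind of ed1 perturbation the random_ed_k_ branch applies to derive a hypothesis from its premise.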
Example 2
def main(_):
    random.seed(FLAGS.random_seed)

    task_name = FLAGS.task_name.lower()

    with gfile.Open(FLAGS.input_path, "r") as f:
        sents_data = f.read().strip().split("\n")

    header = sents_data[0]
    sents_data = sents_data[1:]

    if FLAGS.thief_dataset:
        with gfile.Open(FLAGS.thief_dataset, "r") as f:
            thief_data = f.read().strip().split("\n")
        vocab, probs = pp_util.build_vocab(sents_data=thief_data,
                                           task_name="list-sentences",
                                           vocab_mode=FLAGS.vocab_mode,
                                           vocab_path=FLAGS.vocab_path)
        thief_lengths_pool = pp_util.get_lengths_pool(thief_data)
    else:
        vocab, probs = pp_util.build_vocab(sents_data=sents_data,
                                           task_name=task_name,
                                           vocab_mode=FLAGS.vocab_mode,
                                           vocab_path=FLAGS.vocab_path)
        thief_lengths_pool = None

    output_data = []

    if FLAGS.dataset_size:
        points_remaining = FLAGS.dataset_size
        new_sents_data = []
        while points_remaining > len(sents_data):
            new_sents_data.extend(sents_data)
            points_remaining -= len(sents_data)
        new_sents_data.extend(sents_data[:points_remaining])
        sents_data = new_sents_data

    for _ in range(FLAGS.augmentations):
        for sent in tqdm.tqdm(sents_data):
            data_point_parts = sent.split("\t")

            if FLAGS.scheme.startswith("random_ed_k_"):
                # only relevant for pairwise text classification tasks
                premise_ind, hypo_ind = pp_util.task_input_indices[task_name]
                # Tokenize the original premise (used only to pick a target length)
                original_premise = data_point_parts[premise_ind].split()

                new_len = pp_util.get_length(
                    original_sequence=original_premise,
                    thief_lengths_pool=thief_lengths_pool,
                    lengths_scheme=FLAGS.lengths_scheme)

                # randomly sample a word for every position in the premise
                new_premise = pp_util.sample_next_sequence(vocab=vocab,
                                                           probs=probs,
                                                           seq_length=new_len,
                                                           scheme=FLAGS.scheme)

                data_point_parts[premise_ind] = pp_util.detokenize(
                    new_premise, FLAGS.vocab_mode)
                # Starting from premise, make multiple ed1 changes to form hypothesis
                new_premise = pp_util.token_replace(
                    token_list=new_premise,
                    vocab=vocab,
                    probs=probs,
                    num_changes=FLAGS.ed1_changes)

                data_point_parts[hypo_ind] = pp_util.detokenize(
                    new_premise, FLAGS.vocab_mode)

            elif FLAGS.scheme.startswith("random_"):
                # For every index having textual input, do a random replacement
                for index in pp_util.task_input_indices[task_name]:
                    original_sent = data_point_parts[index].split()

                    new_len = pp_util.get_length(
                        original_sequence=original_sent,
                        thief_lengths_pool=thief_lengths_pool,
                        lengths_scheme=FLAGS.lengths_scheme)
                    # randomly sample a word for every position in the premise
                    new_sent = pp_util.sample_next_sequence(
                        vocab=vocab,
                        probs=probs,
                        seq_length=new_len,
                        scheme=FLAGS.scheme)

                    data_point_parts[index] = pp_util.detokenize(
                        new_sent, FLAGS.vocab_mode)

            elif FLAGS.scheme.startswith("shuffle_"):
                # only relevant for pairwise text classification tasks
                premise_ind, hypo_ind = pp_util.task_input_indices[task_name]
                # Tokenize the original premise (used only to pick a target length)
                original_premise = data_point_parts[premise_ind].split()

                # sample lengths according to a thief dataset or uniform random sampling
                new_len = pp_util.get_length(
                    original_sequence=original_premise,
                    thief_lengths_pool=thief_lengths_pool,
                    lengths_scheme=FLAGS.lengths_scheme)

                # randomly sample a word for every position in the premise
                new_premise = pp_util.sample_next_sequence(vocab=vocab,
                                                           probs=probs,
                                                           seq_length=new_len,
                                                           scheme=FLAGS.scheme)

                data_point_parts[premise_ind] = pp_util.detokenize(
                    new_premise, FLAGS.vocab_mode)
                # Shuffle words for hypothesis
                random.shuffle(new_premise)

                data_point_parts[hypo_ind] = pp_util.detokenize(
                    new_premise, FLAGS.vocab_mode)

            # Once all sentences have been replaced, add to corpus
            output_data.append("\t".join(data_point_parts))

    output_data = [header] + output_data

    with gfile.Open(FLAGS.output_path, "w") as f:
        f.write("\n".join(output_data) + "\n")

    return
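Example 2 differs from Example 1 in that it synthesizes queries token by token instead of copying sentences out of a thief corpus: pp_util.get_length picks a target length (drawn from the thief corpus length pool when one is available) and pp_util.sample_next_sequence draws that many tokens from the vocabulary, uniformly or weighted by the unigram probabilities returned by build_vocab. A minimal sketch of what those two calls are assumed to do follows; names and behavior are assumptions, not the actual pp_util implementation.

import random

def get_length(original_sequence, thief_lengths_pool=None, lengths_scheme="thief"):
    """Pick a target length: draw from the thief corpus length pool when the scheme
    asks for it and a pool exists, otherwise reuse the original sentence's length."""
    if lengths_scheme == "thief" and thief_lengths_pool:
        return random.choice(thief_lengths_pool)
    return len(original_sequence)

def sample_next_sequence(vocab, probs=None, seq_length=10, scheme=None):
    """Draw seq_length tokens i.i.d. from vocab, weighted by unigram probs when given.

    scheme is accepted only to mirror the call in the example; this sketch ignores it.
    """
    if probs is None:
        return [random.choice(vocab) for _ in range(seq_length)]
    return random.choices(vocab, weights=probs, k=seq_length)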