def read_naps_dataset_batched(batch_size=100, num_epochs=300, trainB_weight=10):
    trainA = Compose([
        RandomAccessFile(TRAIN_A_PATH),
        JsonLoader(),
        Cycle(shuffle=True, times=num_epochs),
        SelectPseudocode(text_key="text", texts_key="texts"),
        DropKeys(["texts", "is_training"])
    ])

    trainB = Compose([
        RandomAccessFile(TRAIN_B_PATH),
        JsonLoader(),
        Cycle(shuffle=True, times=num_epochs*trainB_weight)
    ])

    train = Compose([
        Merge(input_pipes=[trainA, trainB], mode='random'),
        Batch(batch_size=batch_size)
    ])

    test = Compose([
        RandomAccessFile(TEST_PATH),
        JsonLoader(),
        Batch(batch_size=batch_size)
    ])
    return train, test
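
The pipes returned here follow the context-manager/iterator protocol that Example #6 below uses explicitly, so a minimal driver loop could look like this sketch (`consume` is a hypothetical placeholder for an actual training step, not part of the source):

train, test = read_naps_dataset_batched(batch_size=32, num_epochs=1)
with train:
    for batch in train:   # each batch is a list of up to batch_size records
        consume(batch)    # hypothetical stand-in for the training step
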
Example #2
def read_naps_dataset():
    trainA = Compose([
        open(TRAIN_A_PATH),
        JsonLoader(),
        SelectPseudocode(text_key="text", texts_key="texts"),
        DropKeys(["texts", "is_training"])
    ])

    trainB = Compose([open(TRAIN_B_PATH), JsonLoader()])

    test = Compose([open(TEST_PATH), JsonLoader()])
    return trainA, trainB, test
Example #3
def read_conala(batch_size=100, num_epochs=300):
    train = Compose([
        OpenJsonFile(CONALA_TRAIN),
        ToCodeExample(),
        Cycle(shuffle=True, times=num_epochs),
        Batch(batch_size=batch_size),
    ])
    return train
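
As Example #6 below shows, the items produced by `ToCodeExample` expose `text` and `code_sequence` attributes; after `Batch`, the pipe yields lists of such items. A quick inspection sketch, assuming the same context-manager protocol as the other examples:

train = read_conala(batch_size=4, num_epochs=1)
with train:
    first_batch = next(iter(train))
    for example in first_batch:
        print(example.text, example.code_sequence)
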
Example #4
def read_naps_dataset_batched(batch_size=100):
    test = Compose([
        open(TEST_PATH),
        JsonLoader(),
        SkipPartial(is_partial_key="is_partial"),
        SplitTests(tests_key="tests",
                   input_tests_key="search_tests",
                   eval_tests_key="eval_tests"),
        Batch(batch_size=batch_size),
        SortBatchByLen(key="text")
    ])
    return test
Example #5
def read_naps_dataset_batched(batch_size=100,
                              trainB_weight=0.3,
                              max_num_steps=None,
                              shuffle_variables=False,
                              sort_batch=False):
    trainA = Compose([
        RandomAccessFile(TRAIN_A_PATH),
        JsonLoader(),
        EndlessShuffleCycle(),
        SelectPseudocode(text_key="text", texts_key="texts")
    ])

    trainB = Compose([
        RandomAccessFile(TRAIN_B_PATH),
        JsonLoader(),
        EndlessShuffleCycle(),
        SkipPartial(is_partial_key="is_partial")
    ])

    train = Compose([
        WeightedMerge(input_pipes=[trainA, trainB], p=[1.0, trainB_weight]),
        ShuffleVariables(code_tree_key="code_tree",
                         code_sequence_key="code_sequence",
                         text_key="text") if shuffle_variables else Identity(),
        KeepKeys(["text", "code_sequence"]),
        Batch(batch_size=batch_size),
        LimitOutput(
            max_output_num=max_num_steps) if max_num_steps else Identity(),
        SortBatchByLen(key="text") if sort_batch else Identity()
    ])

    test = Compose([
        RandomAccessFile(TEST_PATH),
        JsonLoader(),
        SkipPartial(is_partial_key="is_partial"),
        Batch(batch_size=batch_size),
        SortBatchByLen(key="text") if sort_batch else Identity()
    ])
    return train, test
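
Note that `EndlessShuffleCycle` never exhausts, so the train pipe in this example is infinite unless `max_num_steps` is given (which bounds it via `LimitOutput`); otherwise the caller must break out of the loop explicitly. A minimal sketch under that assumption (`train_step` is a hypothetical placeholder, not part of the source):

train, test = read_naps_dataset_batched(batch_size=100, max_num_steps=1000)
with train:
    for batch in train:    # stops after max_num_steps batches
        train_step(batch)  # hypothetical training callback
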
Example #6
def create_vocabs(text_vocab_filepath, code_vocab_filepath, min_occurrences=50):
    ds = Compose([
        OpenJsonFile(CONALA_TRAIN),
        ToCodeExample()
    ])
    words, codes = collections.Counter(), collections.Counter()
    with ds:
        for example in ds:
            for word in example.text:
                words[word] += 1
            for token in example.code_sequence:
                codes[token] += 1

    def keep_frequent(counter):
        return sorted(k for k, v in counter.items() if v >= min_occurrences)

    text_vocab = keep_frequent(words)
    code_vocab = keep_frequent(codes)

    def dump_to_file(filepath, vocab):
        with open(filepath, "w") as f:
            f.write("<S>\n</S>\n<UNK>\n|||\n")
            f.write("\n".join(vocab))
    dump_to_file(text_vocab_filepath, text_vocab)
    dump_to_file(code_vocab_filepath, code_vocab)
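
The files written above start with the four special lines `<S>`, `</S>`, `<UNK>`, and `|||`, followed by one vocabulary token per line. A minimal reader sketch under that format (the `load_vocab` helper is illustrative, not part of the source):

def load_vocab(filepath):
    # Token ids are line indices; the special tokens occupy ids 0-3.
    with open(filepath) as f:
        return {token: idx for idx, token in enumerate(f.read().splitlines())}
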
Example #7
            tests,
            # debug_info=True, cleanup=False
        )
    except (cpp_executor.ProgramCompilationError,
            cpp_executor.ProgramSourceGenerationError) as e:
        return program_idx, False, str(type(e))
    if test_compilation_errors:
        return program_idx, False, "test_compilation_errors"
    if test_runtime_errors:
        return program_idx, False, "test_runtime_errors"
    return program_idx, successful_tests == total_num_tests, "no exception"


if __name__ == "__main__":
    trainA, trainB, test = read_naps_dataset()
    trainB = Compose([trainB, FilterPartial(is_partial_key="is_partial")])
    pool = mp.Pool()
    map_fn = pool.imap_unordered
    # map_fn = map  # For debugging.
    # Compilation success rate 99%.
    failed = dict()
    failed_num = 0
    total_num = 0

    with trainA, trainB, test, tqdm.tqdm(smoothing=0.1) as pbar:
        for program_idx, is_success, e in map_fn(
                compile_program_worker,
                ((d['solution_id'], d['code_tree'], d['tests'])
                 for d in chain(trainA, trainB, test))):
            total_num += 1
            if not is_success: