def read_naps_dataset_batched(batch_size=100, num_epochs=300, trainB_weight=10):
    trainA = Compose([
        RandomAccessFile(TRAIN_A_PATH),
        JsonLoader(),
        Cycle(shuffle=True, times=num_epochs),
        SelectPseudocode(text_key="text", texts_key="texts"),
        DropKeys(["texts", "is_training"])
    ])
    trainB = Compose([
        RandomAccessFile(TRAIN_B_PATH),
        JsonLoader(),
        Cycle(shuffle=True, times=num_epochs * trainB_weight)
    ])
    train = Compose([
        Merge(input_pipes=[trainA, trainB], mode='random'),
        Batch(batch_size=batch_size)
    ])
    test = Compose([
        RandomAccessFile(TEST_PATH),
        JsonLoader(),
        Batch(batch_size=batch_size)
    ])
    return train, test
def read_naps_dataset():
    trainA = Compose([
        open(TRAIN_A_PATH),
        JsonLoader(),
        SelectPseudocode(text_key="text", texts_key="texts"),
        DropKeys(["texts", "is_training"])
    ])
    trainB = Compose([
        open(TRAIN_B_PATH),
        JsonLoader()
    ])
    test = Compose([
        open(TEST_PATH),
        JsonLoader()
    ])
    return trainA, trainB, test
def read_conala(batch_size=100, num_epochs=300):
    train = Compose([
        OpenJsonFile(CONALA_TRAIN),
        ToCodeExample(),
        Cycle(shuffle=True, times=num_epochs),
        Batch(batch_size=batch_size),
    ])
    return train
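# Hedged usage sketch for read_conala (the helper name below is hypothetical,
# not part of the original code). It assumes the composed pipeline is a context
# manager that yields batches of code examples when iterated, as the pipelines
# are used in create_vocabs and the __main__ block below.
def _peek_conala_batch():
    train = read_conala(batch_size=4, num_epochs=1)
    with train:
        for batch in train:
            # Each element should be a code example produced by ToCodeExample().
            return batch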
def read_naps_dataset_batched(batch_size=100):
    test = Compose([
        open(TEST_PATH),
        JsonLoader(),
        SkipPartial(is_partial_key="is_partial"),
        SplitTests(tests_key="tests", input_tests_key="search_tests",
                   eval_tests_key="eval_tests"),
        Batch(batch_size=batch_size),
        SortBatchByLen(key="text")
    ])
    return test
def read_naps_dataset_batched(batch_size=100, trainB_weight=0.3, max_num_steps=None,
                              shuffle_variables=False, sort_batch=False):
    trainA = Compose([
        RandomAccessFile(TRAIN_A_PATH),
        JsonLoader(),
        EndlessShuffleCycle(),
        SelectPseudocode(text_key="text", texts_key="texts")
    ])
    trainB = Compose([
        RandomAccessFile(TRAIN_B_PATH),
        JsonLoader(),
        EndlessShuffleCycle(),
        SkipPartial(is_partial_key="is_partial")
    ])
    train = Compose([
        WeightedMerge(input_pipes=[trainA, trainB], p=[1.0, trainB_weight]),
        ShuffleVariables(code_tree_key="code_tree", code_sequence_key="code_sequence",
                         text_key="text") if shuffle_variables else Identity(),
        KeepKeys(["text", "code_sequence"]),
        Batch(batch_size=batch_size),
        LimitOutput(max_output_num=max_num_steps) if max_num_steps else Identity(),
        SortBatchByLen(key="text") if sort_batch else Identity()
    ])
    test = Compose([
        RandomAccessFile(TEST_PATH),
        JsonLoader(),
        SkipPartial(is_partial_key="is_partial"),
        Batch(batch_size=batch_size),
        SortBatchByLen(key="text") if sort_batch else Identity()
    ])
    return train, test
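# Hedged usage sketch for the batched NAPS reader above (the helper name is
# hypothetical, not from the original code). It assumes the composed train/test
# pipelines are context managers and that each yielded batch is a list of dict
# examples, as in the __main__ block below. Since trainA/trainB use
# EndlessShuffleCycle, iteration is stopped explicitly after a few batches.
def _peek_naps_batches(num_batches=2):
    train, test = read_naps_dataset_batched(batch_size=4, sort_batch=True)
    with train, test:
        for batch_idx, batch in enumerate(train):
            if batch_idx >= num_batches:
                break
            # After KeepKeys, each example should only carry "text" and "code_sequence".
            print(batch_idx, len(batch), [len(d["text"]) for d in batch])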
def create_vocabs(text_vocab_filepath, code_vocab_filepath, min_occurencies=50):
    ds = Compose([
        OpenJsonFile(CONALA_TRAIN),
        ToCodeExample()])
    # Count token occurrences separately for the natural-language text and the code.
    words, codes = collections.Counter(), collections.Counter()
    with ds:
        for example in ds:
            for word in example.text:
                words[word] += 1
            for token in example.code_sequence:
                codes[token] += 1

    # Keep only tokens that occur at least min_occurencies times, sorted.
    def f(l):
        return sorted(k for k, v in l.items() if v >= min_occurencies)
    text_vocab = f(words)
    code_vocab = f(codes)

    # Special tokens and a "|||" separator line, followed by one token per line.
    def dump_to_file(filepath, vocab):
        with open(filepath, "w") as f:
            f.write("<S>\n</S>\n<UNK>\n|||\n")
            f.write("\n".join(vocab))
    dump_to_file(text_vocab_filepath, text_vocab)
    dump_to_file(code_vocab_filepath, code_vocab)
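# Hedged usage sketch for create_vocabs (the wrapper name and output file names
# are placeholders, not from the original code).
def _build_conala_vocabs():
    create_vocabs("conala_text.vocab", "conala_code.vocab", min_occurencies=50)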
            tests,
            # debug_info=True, cleanup=False
        )
    except (cpp_executor.ProgramCompilationError,
            cpp_executor.ProgramSourceGenerationError) as e:
        return program_idx, False, str(type(e))
    if test_compilation_errors:
        return program_idx, False, "test_compilation_errors"
    if test_runtime_errors:
        return program_idx, False, "test_runtime_errors"
    return program_idx, sucessful_tests == total_num_tests, "no exception"


if __name__ == "__main__":
    trainA, trainB, test = read_naps_dataset()
    trainB = Compose([trainB, FilterPartial(is_partial_key="is_partial")])
    pool = mp.Pool()
    map_fn = pool.imap_unordered
    # map_fn = map  # For debugging.

    # Compilation success rate 99%.
    failed = dict()
    failed_num = 0
    total_num = 0
    with trainA, trainB, test, tqdm.tqdm(smoothing=0.1) as pbar:
        for program_idx, is_success, e in map_fn(
                compile_program_worker,
                ((d['solution_id'], d['code_tree'], d['tests'])
                 for program_idx, d in enumerate(chain(trainA, trainB, test)))):
            total_num += 1
            if not is_success: