Example #1
def type_dumper(args):
    # Unpack the (target folder, file name) pair; a single tuple argument keeps
    # the function compatible with multiprocessing map-style APIs.
    tgt_folder, fname = args
    typelib = TypeLib()
    with open(fname, "r") as f:
        for line in f:
            e = Example.from_json(json.loads(line))
            # Record the type of every target variable in the example.
            for var in e.target.values():
                typelib.add(var.typ)
    typelib.sort()
    # Dump the per-file library under "types/" using the source file's basename.
    with open(os.path.join(tgt_folder, "types",
                           os.path.basename(fname)), "w") as type_lib_file:
        encoded = TypeLibCodec.encode(typelib)
        type_lib_file.write(encoded)
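
Reading the dump back is a one-liner; a minimal sketch, reusing TypeLibCodec.decode from Example #2 (the folder and file name here are illustrative, not from the source):

# Sketch: decode a per-file library written by type_dumper above.
import os

with open(os.path.join("out", "types", "binary0.jsonl")) as f:
    lib = TypeLibCodec.decode(f.read())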
Example #2
def __init__(self):
    # Load the type library produced during preprocessing; the path is derived
    # from the OUTPUT_DIR and PREFIX environment variables.
    self.type_lib_file_name = os.path.join(
        os.environ["OUTPUT_DIR"],
        "types",
        os.environ["PREFIX"] + ".json.gz",
    )
    try:
        with gzip.open(self.type_lib_file_name, "rt") as type_lib_file:
            self.type_lib = TypeLibCodec.decode(type_lib_file.read())
    except Exception as e:
        # Fall back to an empty library if the file is missing or unreadable.
        print(e)
        print("Could not find type library, creating a new one")
        self.type_lib = TypeLib()
    super().__init__()
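
The matching writer is not shown in the source; a minimal sketch of one, assuming the same class keeps self.type_lib and self.type_lib_file_name from __init__ and reusing TypeLibCodec.encode from Example #1:

def save_type_lib(self):
    # Assumed helper (not in the source): persist the in-memory library to the
    # gzipped path computed in __init__ so the next load's decode succeeds.
    with gzip.open(self.type_lib_file_name, "wt") as type_lib_file:
        type_lib_file.write(TypeLibCodec.encode(self.type_lib))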
Example #3
def main(args):
    # Fix both RNG seeds so the train/dev/test split is reproducible.
    np.random.seed(1234)
    random.seed(1992)

    tgt_folder = args["TARGET_FOLDER"]
    input_folder = args["INPUT_FOLDER"]
    input_fnames_file = args["INPUT_FNAMES"]
    input_fnames = []
    max_files = int(args["--max"])
    with open(input_fnames_file) as f:
        for s in f:
            s = s.strip()
            if s.endswith(".gz"):
                input_fnames.append(s)
            if len(input_fnames) >= max_files:
                break
    shard_size = int(args["--shard-size"])

    if os.path.exists(tgt_folder):
        op = input(f"{tgt_folder} exists. remove? (y/n) ")
        if op == "y":
            shutil.rmtree(tgt_folder)

    os.makedirs(tgt_folder, exist_ok=True)
    os.makedirs(os.path.join(tgt_folder, "files"), exist_ok=True)
    os.makedirs(os.path.join(tgt_folder, "types"), exist_ok=True)
    num_workers = 16

    valid_example_count = 0
    # file name -> {function name -> canonical code}; filled while dumping and
    # used later to build train_functions and the dev/test leakage flags.
    all_functions = dict()

    print("loading examples")
    with multiprocessing.Pool(num_workers) as pool:
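        # Stage 1: read and parse the raw JSON lines of each input file in
        # worker processes.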
        json_iter = pool.imap(
            json_line_reader,
            ((input_folder, fname) for fname in input_fnames),
            chunksize=64,
        )

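        # Stage 2: convert the parsed records into Example objects, still in
        # parallel.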
        example_iter = pool.imap(example_generator, json_iter, chunksize=64)

        for examples in tqdm(example_iter):
            if not examples:
                continue
            json_file_name = os.path.basename(
                examples[0].binary_file["file_name"])
            with open(os.path.join(tgt_folder, "files/", json_file_name),
                      "w") as f:
                for example in examples:
                    f.write(json.dumps(example.to_json()) + "\n")
                    all_functions.setdefault(
                        json_file_name,
                        dict())[example.name] = example.canonical_code

            valid_example_count += len(examples)

    print("valid examples: ", valid_example_count)

    cur_dir = os.getcwd()
    all_files = glob.glob(os.path.join(tgt_folder, "files/*.jsonl"))
    file_prefix = os.path.join(tgt_folder, "files/")
    all_files.sort()  # sort files by name so the split is deterministic
    file_num = len(all_files)
    print("Total valid binary file num: ", file_num)

    test_file = args["--test-file"]
    if test_file:
        print(f"using test file {test_file}")
        with tarfile.open(test_file, "r") as f:
            test_files = [
                os.path.join(file_prefix,
                             x.name.split("/")[-1]) for x in f.getmembers()
                if x.name.endswith(".jsonl")
            ]
        dev_file_num = 0
    else:
        print("randomly sampling test files")
        test_file_num = int(file_num * 0.1)
        dev_file_num = int(file_num * 0.1)
        test_files = list(
            np.random.choice(all_files, size=test_file_num, replace=False))

    test_files_set = set(test_files)
    train_files = [fname for fname in all_files if fname not in test_files_set]

    if dev_file_num == 0:
        dev_file_num = int(len(train_files) * 0.1)

    np.random.shuffle(train_files)
    dev_files = train_files[-dev_file_num:]
    train_files = train_files[:-dev_file_num]

    # Create types from filtered training set
    with multiprocessing.Pool(num_workers) as pool:
        pool.map(
            type_dumper,
            ((tgt_folder, fname) for fname in train_files),
            chunksize=64,
        )
    print("reading typelib")
    typelib = TypeLib()
    for fname in tqdm(train_files):
        # type_dumper stored each per-file library under "types/" with the
        # same basename; rebuild that name from the training file path.
        fname = os.path.basename(fname)
        fname = fname[:fname.index(".")] + ".jsonl"
        typelib.add_json_file(os.path.join(tgt_folder, "types", fname))
    typelib.prune(5)
    typelib.sort()

    print("dumping typelib")
    with open(os.path.join(tgt_folder, "typelib.json"), "w") as type_lib_file:
        encoded = TypeLibCodec.encode(typelib)
        type_lib_file.write(encoded)

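    # Map each training function name to the set of canonical bodies seen, so
    # dev/test examples can be flagged below when they overlap with training.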
    train_functions = dict()
    for train_file in train_files:
        file_name = os.path.basename(train_file)
        for func_name, func in all_functions[file_name].items():
            train_functions.setdefault(func_name, set()).add(func)

    print(
        f"number training: {len(train_files)}",
        f"number dev: {len(dev_files)}",
        f"number test: {len(test_files)}",
        sep=", ",
    )
    print("dump training files")
    shards = [
        train_files[i:i + shard_size]
        for i in range(0, len(train_files), shard_size)
    ]
    for shard_id, shard_files in enumerate(shards):
        print(f"Preparing shard {shard_id}, {len(shard_files)} files: ")
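        # file_list.txt is rewritten for each shard and consumed by tar's -T flag.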
        with open(os.path.join(tgt_folder, "file_list.txt"), "w") as f:
            for file_name in shard_files:
                f.write(file_name.split("/")[-1] + "\n")

        os.chdir(os.path.join(tgt_folder, "files"))
        print("creating tar file...")
        os.system(f"tar cf ../train-shard-{shard_id}.tar -T ../file_list.txt")
        os.chdir(cur_dir)

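    # Rewrites each dev/test file in place, tagging every function with whether
    # its name and canonical body also appear in the training set, then tars them.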
    def _dump_dev_file(tgt_file_name, file_names):
        with open(os.path.join(tgt_folder, "file_list.txt"), "w") as f:
            for file_name in file_names:
                last_file_name = os.path.basename(file_name)
                f.write(last_file_name + "\n")

                with open(file_name) as fr:
                    all_lines = fr.readlines()

                replace_lines = []
                for line in all_lines:
                    json_dict = json.loads(line.strip())
                    func_name = json_dict["name"]
                    canonical_code = all_functions[last_file_name][func_name]
                    func_name_in_train = False
                    func_body_in_train = False
                    if func_name in train_functions:
                        func_name_in_train = True
                        if canonical_code in train_functions[func_name]:
                            func_body_in_train = True

                    json_dict["test_meta"] = dict(
                        function_name_in_train=func_name_in_train,
                        function_body_in_train=func_body_in_train,
                    )
                    new_json_str = json.dumps(json_dict)
                    replace_lines.append(new_json_str.strip())

                with open(file_name, "w") as fw:
                    for line in replace_lines:
                        fw.write(line + "\n")

        os.chdir(os.path.join(tgt_folder, "files"))
        print("creating tar file...")
        os.system(f"tar cf ../{tgt_file_name} -T ../file_list.txt")
        os.chdir(cur_dir)

    print("dump dev files")
    _dump_dev_file("dev.tar", dev_files)
    print("dump test files")
    _dump_dev_file("test.tar", test_files)
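
The bracketed argument keys (TARGET_FOLDER, --max, --shard-size, --test-file) look like a docopt dictionary; a hypothetical entry point under that assumption, with an illustrative usage string and defaults that are not from the source:

"""
Usage:
    generate.py [options] TARGET_FOLDER INPUT_FOLDER INPUT_FNAMES

Options:
    --max=<n>         Maximum number of .gz input files to read [default: 100]
    --shard-size=<n>  Number of files per training tar shard [default: 100]
    --test-file=<f>   Optional tar of pre-selected test .jsonl files
"""
from docopt import docopt

if __name__ == "__main__":
    main(docopt(__doc__))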