import json
from collections import Counter, defaultdict
from typing import Any, DefaultDict, Dict

import _jsonnet
from torch.utils.data import DataLoader
from tqdm import tqdm

from dirty.utils.dataset import Dataset  # type: ignore


def compute(config) -> Dict[str, str]:
    """Map each decompiler (source) type to its most frequent ground-truth (target) type."""
    dataset = Dataset(config["data"]["train_file"], config["data"])
    dataloader: DataLoader = DataLoader(dataset, num_workers=8, batch_size=None)
    most_common_for_src: DefaultDict[str, Counter] = defaultdict(Counter)
    for example in tqdm(dataloader):
        for src_type, tgt_type in zip(
            example.src_var_types_str, example.tgt_var_types_str
        ):
            most_common_for_src[src_type][tgt_type] += 1
    # Collapse each counter to its single most frequent target type.
    for key in most_common_for_src:
        most_common_for_src[key] = most_common_for_src[key].most_common(1)[0][0]
    return most_common_for_src  # type: ignore
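# Hedged illustration (not part of the original script): the aggregation in
# `compute` is a defaultdict-of-Counters collapsed to the most frequent value
# per key. The (src, tgt) pairs below are invented for demonstration only.
def _demo_most_common_aggregation() -> Dict[str, str]:
    pairs = [("int", "int"), ("int", "size_t"), ("int", "int"), ("char *", "char *")]
    counts: DefaultDict[str, Counter] = defaultdict(Counter)
    for src, tgt in pairs:
        counts[src][tgt] += 1
    # Result: {"int": "int", "char *": "char *"}
    return {src: counter.most_common(1)[0][0] for src, counter in counts.items()}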
def main():
    # "Copy" baseline: predict the decompiler's own type for every variable.
    config = json.loads(_jsonnet.evaluate_file("retype.xfmr.jsonnet"))
    dataset = Dataset(config["data"]["test_file"], config["data"])
    dataloader: DataLoader = DataLoader(  # noqa: F841
        dataset, num_workers=8, batch_size=None
    )
    types_model = dataset.vocab.types  # noqa: F841
    results = {}
    for example in tqdm(dataset):
        for src_name, src_type in zip(
            example.src_var_names, example.src_var_types_str
        ):
            # src_name[2:-2] drops the two-character markers wrapping variable names.
            results.setdefault(example.binary, {}).setdefault(example.name, {})[
                src_name[2:-2]
            ] = (src_type, "")
    with open("copy_decompiler.json", "w") as f:
        json.dump(results, f)
def evaluate(config, most_common_for_src: Dict[str, str]):
    dataset = Dataset(config["data"]["test_file"], config["data"])
    dataloader: DataLoader = DataLoader(dataset, num_workers=8, batch_size=None)
    results: Dict[str, Dict[str, Any]] = {}
    for example in tqdm(dataloader):
        src_name: str
        src_type: str
        for src_name, src_type in zip(
            example.src_var_names, example.src_var_types_str
        ):
            # Predict the most common target type seen for this decompiler type
            # on the training set; fall back to the decompiler type itself.
            results.setdefault(example.binary, {}).setdefault(example.name, {})[
                src_name[2:-2]
            ] = (
                most_common_for_src.get(src_type, src_type),
                "",
            )
    with open("most_common_decompiler.json", "w") as f:
        json.dump(results, f)
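# Hedged driver sketch (not present in the original file): how `compute` and
# `evaluate` would plausibly be wired together. "retype.xfmr.jsonnet" is the
# config file the other scripts in this directory load.
if __name__ == "__main__":
    config = json.loads(_jsonnet.evaluate_file("retype.xfmr.jsonnet"))
    most_common_for_src = compute(config)
    evaluate(config, most_common_for_src)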
def load_data(config_file):
    # Load the test split described by a jsonnet config, with the per-function
    # variable cap effectively disabled.
    config = json.loads(_jsonnet.evaluate_file(config_file))["data"]
    config["max_num_var"] = 1 << 30
    dataset = Dataset(config["test_file"], config)
    return dataset
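# Hedged usage sketch (hypothetical helper, not in the original code): load the
# test split and peek at the first example's binary and function name, fields
# the other scripts above also read.
def _demo_load_data():
    test_set = load_data("retype.xfmr.jsonnet")
    first_example = next(iter(test_set))
    print(first_example.binary, first_example.name)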
import json
from typing import Any, Dict

import _jsonnet
from csvnpm.binary.dire_types import TypeLibCodec
from torch.utils.data import DataLoader
from tqdm import tqdm

from dirty.utils.dataset import Dataset  # type: ignore

if __name__ == "__main__":
    config = json.loads(_jsonnet.evaluate_file("retype.xfmr.jsonnet"))
    dataset = Dataset(config["data"]["test_file"], config["data"])
    dataloader: DataLoader = DataLoader(dataset, num_workers=8, batch_size=None)
    with open(config["data"]["typelib_file"]) as type_f:
        typelib = TypeLibCodec.decode(type_f.read())
    # The type library stores (frequency, type) pairs per size; take the first
    # entry for each size as its most common type.
    most_common_for_size = {}
    types_model = dataset.vocab.types
    for size in typelib:
        freq, tp = typelib[size][0]
        most_common_for_size[size] = str(tp)
    results: Dict[str, Dict[str, Any]] = {}
    for example in tqdm(dataloader):
        for src_name, src_type, tgt_var_mem in zip(
            example.src_var_names,
            example.src_var_types_str,
            example.tgt_var_src_mems,
        ):
            # (loop body truncated in this excerpt)
            ...
def train(args):
    config = json.loads(_jsonnet.evaluate_file(args["CONFIG_FILE"]))
    if args["--extra-config"]:
        extra_config = json.loads(args["--extra-config"])
        config = util.update(config, extra_config)

    # dataloaders
    batch_size = config["train"]["batch_size"]
    train_set = Dataset(
        config["data"]["train_file"],
        config["data"],
        percent=float(args["--percent"]),
    )
    dev_set = Dataset(config["data"]["dev_file"], config["data"])
    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        collate_fn=Dataset.collate_fn,
        num_workers=16,
        pin_memory=True,
    )
    val_loader = DataLoader(
        dev_set,
        batch_size=batch_size,
        collate_fn=Dataset.collate_fn,
        num_workers=8,
        pin_memory=True,
    )

    # model
    model = TypeReconstructionModel(config)

    wandb_logger = WandbLogger(name=args["--expname"], project="dire", log_model=True)
    wandb_logger.log_hyperparams(config)
    resume_from_checkpoint = (
        args["--eval-ckpt"] if args["--eval-ckpt"] else args["--resume"]
    )
    if resume_from_checkpoint == "":
        resume_from_checkpoint = None
    trainer = pl.Trainer(
        max_epochs=config["train"]["max_epoch"],
        logger=wandb_logger,
        gpus=1 if args["--cuda"] else None,
        auto_select_gpus=True,
        gradient_clip_val=1,
        callbacks=[
            EarlyStopping(
                monitor="val_retype_acc"
                if config["data"]["retype"]
                else "val_rename_acc",
                mode="max",
                patience=config["train"]["patience"],
            )
        ],
        check_val_every_n_epoch=config["train"]["check_val_every_n_epoch"],
        progress_bar_refresh_rate=10,
        accumulate_grad_batches=config["train"]["grad_accum_step"],
        resume_from_checkpoint=resume_from_checkpoint,
    )
    if args["--eval-ckpt"]:
        # HACK: necessary to make pl test work for IterableDataset
        Dataset.__len__ = lambda self: 1000000
        test_set = Dataset(config["data"]["test_file"], config["data"])
        test_loader = DataLoader(
            test_set,
            batch_size=config["test"]["batch_size"],
            collate_fn=Dataset.collate_fn,
            num_workers=8,
            pin_memory=True,
        )
        trainer.test(model, test_dataloaders=test_loader, ckpt_path=args["--eval-ckpt"])
    else:
        trainer.fit(model, train_loader, val_loader)
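# Hedged sketch (not from the original script) of the docopt-style argument
# dict that `train` expects. Every key below is read inside `train`; the
# concrete values are invented for illustration.
def _demo_train_args():
    args = {
        "CONFIG_FILE": "retype.xfmr.jsonnet",
        "--extra-config": "",        # optional JSON string merged into the config
        "--percent": "1.0",          # forwarded to Dataset(percent=...) for the train split
        "--expname": "retype-xfmr",  # wandb run name
        "--eval-ckpt": "",           # set to a checkpoint path to run trainer.test instead
        "--resume": "",              # checkpoint to resume from when --eval-ckpt is empty
        "--cuda": True,              # use a single GPU when truthy
    }
    train(args)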
# Tail of a classmethod loader (the enclosing `def` is not part of this excerpt):
# build a VocabEntry for each serialized section, then construct the class.
    for key, val in params.items():
        # if key in ('grammar', ):
        #     entry = Grammar.load(val)
        # else:
        entry = VocabEntry.load(params=val)
        entries[key] = entry
    return cls(**entries)


if __name__ == "__main__":
    args = docopt(__doc__)
    vocab_size = int(args["--size"])
    vocab_file = args["VOCAB_FILE"]
    type_file = args["TYPE_FILE"]
    train_set = Dataset(args["TRAIN_FILE"])
    with open(type_file) as type_f:
        typelib = TypeLibCodec.decode(type_f.read())
    type_counter = Counter()
    subtype_counter = Counter()
    for size in typelib:
        for freq, tp in typelib[size]:
            # Treat types as discrete tokens
            type_counter[str(tp)] += freq
            # Tokenize compositional types, mainly structs
            for subtype in tp.tokenize():
                subtype_counter[subtype] += freq
    print(f"{len(type_counter)} types in typelib")
    print(f"{len(subtype_counter)} subtypes in typelib")
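# Hedged invocation sketch (hypothetical; the docopt usage string itself is not
# part of this excerpt, but the names below are exactly the keys read above):
#
#   python <this_script>.py --size 10000 VOCAB_FILE TYPE_FILE TRAIN_FILE
#
# TYPE_FILE is the encoded type library decoded by TypeLibCodec, TRAIN_FILE is
# the training split handed to Dataset, and VOCAB_FILE and --size are read here
# but not used in the portion of the script shown above.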