Example #1
def compute(config) -> Dict[str, str]:
    dataset = Dataset(config["data"]["train_file"], config["data"])
    dataloader: DataLoader = DataLoader(dataset,
                                        num_workers=8,
                                        batch_size=None)
    # Count how often each decompiler (source) type co-occurs with each
    # ground-truth (target) type in the training set.
    most_common_for_src: DefaultDict[str, Counter] = defaultdict(Counter)
    for example in tqdm(dataloader):
        for src_type, tgt_type in zip(example.src_var_types_str,
                                      example.tgt_var_types_str):
            most_common_for_src[src_type][tgt_type] += 1
    # Collapse each counter to its single most frequent target type.
    return {
        src_type: counter.most_common(1)[0][0]
        for src_type, counter in most_common_for_src.items()
    }
Example #2
def main():
    config = json.loads(_jsonnet.evaluate_file("retype.xfmr.jsonnet"))
    dataset = Dataset(config["data"]["test_file"], config["data"])
    dataloader: DataLoader = DataLoader(  # noqa: F841
        dataset, num_workers=8, batch_size=None)
    types_model = dataset.vocab.types  # noqa: F841

    # "Copy the decompiler" baseline: record each decompiler-suggested type
    # as the prediction.
    results = {}
    for example in tqdm(dataset):
        for src_name, src_type in zip(example.src_var_names,
                                      example.src_var_types_str):
            # src_name[2:-2] strips the two-character markers wrapped around
            # variable names in the dataset.
            results.setdefault(example.binary, {}).setdefault(
                example.name, {})[src_name[2:-2]] = (src_type, "")

    with open("copy_decompiler.json", "w") as out_f:
        json.dump(results, out_f)
Example #3
def evaluate(config, most_common_for_src: Dict[str, str]):
    dataset = Dataset(config["data"]["test_file"], config["data"])
    dataloader: DataLoader = DataLoader(dataset,
                                        num_workers=8,
                                        batch_size=None)

    # Predict, for each variable, the target type most often paired with its
    # decompiler type in the training set; fall back to the decompiler type
    # itself when it was never seen during training.
    results: Dict[str, Dict[str, Any]] = {}
    for example in tqdm(dataloader):
        src_name: str
        src_type: str
        for src_name, src_type in zip(example.src_var_names,
                                      example.src_var_types_str):
            results.setdefault(example.binary, {}).setdefault(
                example.name, {})[src_name[2:-2]] = (
                    most_common_for_src.get(src_type, src_type),
                    "",
                )

    json.dump(results, open("most_common_decompiler.json", "w"))
Example #4
def load_data(config_file):
    config = json.loads(_jsonnet.evaluate_file(config_file))["data"]
    config["max_num_var"] = 1 << 30
    dataset = Dataset(config["test_file"], config)
    return dataset
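
A hedged usage sketch for load_data; the config file name is borrowed from the other examples and is an assumption here.

dataset = load_data("retype.xfmr.jsonnet")
first_example = next(iter(dataset))  # Dataset is iterable (see Example #2)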
Example #5
import json
from typing import Any, Dict

import _jsonnet
from csvnpm.binary.dire_types import TypeLibCodec
from torch.utils.data import DataLoader
from tqdm import tqdm

from dirty.utils.dataset import Dataset  # type: ignore

if __name__ == "__main__":
    config = json.loads(_jsonnet.evaluate_file("retype.xfmr.jsonnet"))
    dataset = Dataset(config["data"]["test_file"], config["data"])
    dataloader: DataLoader = DataLoader(dataset,
                                        num_workers=8,
                                        batch_size=None)
    with open(config["data"]["typelib_file"]) as type_f:
        typelib = TypeLibCodec.decode(type_f.read())
    most_common_for_size = {}
    types_model = dataset.vocab.types
    # Keep the first (most frequent) type recorded for each size.
    for size in typelib:
        freq, tp = typelib[size][0]
        most_common_for_size[size] = str(tp)

    results: Dict[str, Dict[str, Any]] = {}
    for example in tqdm(dataloader):
        for src_name, src_type, tgt_var_mem in zip(
                example.src_var_names,
                example.src_var_types_str,
                example.tgt_var_src_mems,
        ):
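            # The body of this loop is truncated in the listing. A plausible
            # completion, mirroring Examples #2 and #3, would look up the
            # most common type for the variable's size and fall back to the
            # decompiler type; exactly how the size is derived from
            # tgt_var_mem is an assumption and is left as a placeholder:
            #
            # size = ...  # derived from tgt_var_mem
            # results.setdefault(example.binary, {}).setdefault(
            #     example.name, {})[src_name[2:-2]] = (
            #         most_common_for_size.get(size, src_type), "")
            pass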
Example #6
def train(args):
    config = json.loads(_jsonnet.evaluate_file(args["CONFIG_FILE"]))

    if args["--extra-config"]:
        extra_config = args["--extra-config"]
        extra_config = json.loads(extra_config)
        config = util.update(config, extra_config)

    # dataloaders
    batch_size = config["train"]["batch_size"]
    train_set = Dataset(
        config["data"]["train_file"],
        config["data"],
        percent=float(args["--percent"]),
    )
    dev_set = Dataset(config["data"]["dev_file"], config["data"])
    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        collate_fn=Dataset.collate_fn,
        num_workers=16,
        pin_memory=True,
    )
    val_loader = DataLoader(
        dev_set,
        batch_size=batch_size,
        collate_fn=Dataset.collate_fn,
        num_workers=8,
        pin_memory=True,
    )

    # model
    model = TypeReconstructionModel(config)

    # logger
    wandb_logger = WandbLogger(name=args["--expname"],
                               project="dire",
                               log_model=True)
    wandb_logger.log_hyperparams(config)
    resume_from_checkpoint = (args["--eval-ckpt"]
                              if args["--eval-ckpt"] else args["--resume"])
    if resume_from_checkpoint == "":
        resume_from_checkpoint = None
    trainer = pl.Trainer(
        max_epochs=config["train"]["max_epoch"],
        logger=wandb_logger,
        gpus=1 if args["--cuda"] else None,
        auto_select_gpus=True,
        gradient_clip_val=1,
        callbacks=[
            EarlyStopping(
                monitor="val_retype_acc"
                if config["data"]["retype"] else "val_rename_acc",
                mode="max",
                patience=config["train"]["patience"],
            )
        ],
        check_val_every_n_epoch=config["train"]["check_val_every_n_epoch"],
        progress_bar_refresh_rate=10,
        accumulate_grad_batches=config["train"]["grad_accum_step"],
        resume_from_checkpoint=resume_from_checkpoint,
    )
    if args["--eval-ckpt"]:
        # HACK: necessary to make pl test work for IterableDataset
        Dataset.__len__ = lambda self: 1000000
        test_set = Dataset(config["data"]["test_file"], config["data"])
        test_loader = DataLoader(
            test_set,
            batch_size=config["test"]["batch_size"],
            collate_fn=Dataset.collate_fn,
            num_workers=8,
            pin_memory=True,
        )
        trainer.test(model,
                     test_dataloaders=test_loader,
                     ckpt_path=args["--eval-ckpt"])
    else:
        trainer.fit(model, train_loader, val_loader)
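
A minimal entry point for train(), assuming the script's docopt usage string defines the flags referenced above (CONFIG_FILE, --expname, --cuda, and so on), as in Example #7.

if __name__ == "__main__":
    args = docopt(__doc__)
    train(args)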
Example #7
        for key, val in params.items():
            # if key in ('grammar', ):
            #     entry = Grammar.load(val)
            # else:
            entry = VocabEntry.load(params=val)
            entries[key] = entry
        return cls(**entries)


if __name__ == "__main__":

    args = docopt(__doc__)
    vocab_size = int(args["--size"])
    vocab_file = args["VOCAB_FILE"]
    type_file = args["TYPE_FILE"]
    train_set = Dataset(args["TRAIN_FILE"])

    with open(type_file) as type_f:
        typelib = TypeLibCodec.decode(type_f.read())
        type_counter = Counter()
        subtype_counter = Counter()
        for size in typelib:
            for freq, tp in typelib[size]:
                # Treat types as discrete tokens
                type_counter[str(tp)] += freq
                # Tokenize compositional types, mainly structs
                for subtype in tp.tokenize():
                    subtype_counter[subtype] += freq
    print(f"{len(type_counter)} types in typelib")
    print(f"{len(subtype_counter)} subtypes in typelib")