def build_holdout_asts(data_path: str, holdout_name: str) -> str:
    print(f"build asts for {holdout_name} data...")
    projects = os.listdir(os.path.join(data_path, holdout_name))
    output_folder_path = os.path.join(data_path, f'{holdout_name}_asts')
    create_folder(output_folder_path)
    successful_builds = 0
    for project in tqdm(projects):
        print(f"working with {project} project")
        project_path = os.path.join(data_path, holdout_name, project)
        output_project_path = os.path.join(output_folder_path, project)
        create_folder(output_project_path)
        if build_project_asts(project_path, output_project_path):
            successful_builds += 1
            desc_path = os.path.join(output_project_path, 'java', 'description.csv')

            # remove asts with nan labels
            project_description = pd.read_csv(desc_path)
            bad_labels_mask = project_description['label'].isna()
            filenames = project_description[bad_labels_mask]['dot_file'].unique()
            source_files = project_description[bad_labels_mask]['source_file'].unique()
            print(f"remove functions from {source_files} for {project} project")
            for filename in filenames:
                filepath = os.path.join(output_project_path, 'java', 'asts', filename)
                os.remove(filepath)
            project_description.dropna(subset=['label'], inplace=True)
            project_description.to_csv(desc_path, index=False)
    print(f"create asts for {successful_builds}/{len(projects)} {holdout_name} projects")
    return output_folder_path
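Every example in this collection relies on a create_folder helper that is not shown here. A minimal sketch, assuming it simply (re)creates a directory and that the is_clean flag seen in later examples controls whether existing contents are wiped first:

import os
import shutil


def create_folder(path: str, is_clean: bool = True) -> None:
    # assumption: is_clean=True removes any previous contents before recreating
    if is_clean and os.path.exists(path):
        shutil.rmtree(path)
    # create the directory (and any missing parents) if it does not exist yet
    os.makedirs(path, exist_ok=True)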
    def run(self):
        self.log.info("downloading imagebuilder %s", self.path)
        if not self.created():
            create_folder(self.path)

            regular_tar_url = os.path.join(self.download_url(),
                                           self.tar_name())
            if get_statuscode(regular_tar_url) != 404:
                if not self.download(regular_tar_url):
                    return False
            else:
                self.log.info("did not find regular imagebuilder name")
                # this is only due to arm64 missing -generic in filename
                # this is very ugly, can this just be deleted?
                special_tar_url = os.path.join(self.download_url(),
                                               self.tar_name(True))
                if get_statuscode(special_tar_url) != 404:
                    self.log.debug("remove -generic from url")

                    if not self.download(special_tar_url):
                        return False
                else:
                    return False
            self.patch_makefile()
            self.add_custom_repositories()
            self.pkg_arch = self.parse_packages_arch()

        self.log.info("initialized imagebuilder %s", self.path)
        return True
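The run() method above probes the download URL through a get_statuscode helper. A minimal sketch, assuming it simply returns the HTTP status of a HEAD request (404 meaning the tarball does not exist):

import urllib.error
import urllib.request


def get_statuscode(url: str) -> int:
    # issue a HEAD request so only the status line is fetched, not the tarball
    request = urllib.request.Request(url, method="HEAD")
    try:
        with urllib.request.urlopen(request) as response:
            return response.status
    except urllib.error.HTTPError as error:
        # urlopen raises on 4xx/5xx; the caller only compares against 404
        return error.code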
Example #3
def build_dataset_asts(dataset_info: IDatasetInfo, dataset_path: str,
                       astminer_path: str) -> None:
    for holdout in dataset_info.holdout_folders:
        holdout_folder = os.path.join(dataset_path, holdout)
        output_folder = os.path.join(dataset_path, f'{holdout}_asts')
        create_folder(output_folder)
        build_projects_asts(holdout_folder, output_folder, astminer_path,
                            dataset_info.astminer_params)
def interactive(path_to_function: str, path_to_model: str):
    fix_seed()
    device = get_device()
    print(f"using {device} device")

    # convert function to dot format
    print(f"prepare ast...")
    create_folder(TMP_FOLDER)
    if not build_ast(path_to_function):
        return
    ast_folder = os.path.join(TMP_FOLDER, 'java', 'asts')
    ast = os.listdir(ast_folder)
    if len(ast) == 0:
        print("didn't find any functions in given file")
        return
    if len(ast) > 1:
        print(
            "too many functions in given file, for interactive prediction you need only one"
        )
        return
    dgl_ast = convert_dot_to_dgl(os.path.join(ast_folder, ast[0]))
    ast_desc = pd.read_csv(os.path.join(TMP_FOLDER, 'java', 'description.csv'))
    ast_desc['token'].fillna('NAN', inplace=True)
    with open(vocab_path, 'rb') as pkl_file:
        vocab = pkl_load(pkl_file)
        token_to_id, type_to_id = vocab['token_to_id'], vocab['type_to_id']
    ast_desc = transform_keys(ast_desc, token_to_id, type_to_id)
    batched_graph, labels, paths = prepare_batch(ast_desc, ['ast_0.dot'],
                                                 lambda: [dgl_ast])
    batched_graph = dgl.batch(
        list(
            map(lambda g: dgl.reverse(g, share_ndata=True),
                dgl.unbatch(batched_graph))))

    # load model
    print("loading model..")
    model, _ = load_model(path_to_model, device)
    criterion = nn.CrossEntropyLoss(
        ignore_index=model.decoder.pad_index).to(device)
    info = LearningInfo()

    print("forward pass...")
    batch_info, prediction = eval_on_batch(model, criterion, batched_graph,
                                           labels, device)

    info.accumulate_info(batch_info)
    id_to_sublabel = {v: k for k, v in model.decoder.label_to_id.items()}
    label = ''
    for cur_sublabel in prediction:
        if cur_sublabel.item() == model.decoder.label_to_id[EOS]:
            break
        label += '|' + id_to_sublabel[cur_sublabel.item()]
    label = label[1:]
    print(f"Predicted function name is\n{label}")
    print(
        f"Calculated metrics with respect to '{labels[0]}' name\n{info.get_state_dict()}"
    )
def convert_holdout(holdout_name: str, vocab: Vocabulary, config: PreprocessingConfig, n_jobs: int):
    holdout_data_path = path.join(DATA_FOLDER, config.dataset_name, f"{config.dataset_name}.{holdout_name}.c2s")
    holdout_output_folder = path.join(DATA_FOLDER, config.dataset_name, holdout_name)
    create_folder(holdout_output_folder)
    with open(path.join(holdout_output_folder, DESCRIPTION_FILE), "w") as desc_file:
        desc_file.write("id,filename,n_samples,n_paths\n")
    with Pool(n_jobs) as pool:
        results = pool.imap(
            _convert_raw_buffer,
            (
                (lines, config, vocab, holdout_output_folder, pos)
                for pos, lines in enumerate(_read_file_by_batch(holdout_data_path, config.buffer_size))
            ),
        )
        n_buffers = ceil(count_lines_in_file(holdout_data_path) / config.buffer_size)
        _ = [_ for _ in tqdm(results, total=n_buffers)]
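convert_holdout above feeds the worker pool through a _read_file_by_batch generator. A minimal sketch, assuming it simply yields the .c2s file in chunks of buffer_size lines:

from typing import Iterator, List


def _read_file_by_batch(file_path: str, buffer_size: int) -> Iterator[List[str]]:
    buffer: List[str] = []
    with open(file_path, "r") as input_file:
        for line in input_file:
            buffer.append(line)
            if len(buffer) == buffer_size:
                yield buffer
                buffer = []
    # flush the last, possibly smaller, chunk
    if buffer:
        yield buffer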
Example #6
def build_projects_asts(projects_folder: str, output_folder: str,
                        astminer_path: str, astminer_params: List[str]) -> int:
    print(f"build asts for projects in {projects_folder} folder")
    projects = os.listdir(projects_folder)
    successful_builds = 0
    for project in tqdm(projects):
        print(f"build asts for {project} project")
        project_path = os.path.join(projects_folder, project)
        output_path = os.path.join(output_folder, project)
        create_folder(output_path)
        if build_asts(project_path, output_path, astminer_path,
                      astminer_params):
            successful_builds += 1
    print(
        f"created asts for {successful_builds} out of {len(projects)} projects")
    return successful_builds
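build_projects_asts delegates the actual parsing to a build_asts call that wraps the astminer CLI. A loose sketch, assuming the jar is launched through java and that astminer_params already carries the tool-specific flags (the exact argument layout depends on the astminer version):

import subprocess
from typing import List


def build_asts(input_path: str, output_path: str, astminer_path: str,
               astminer_params: List[str]) -> bool:
    # hypothetical invocation: the input and output paths are appended after
    # the configured parameters; adjust to the flag names your astminer expects
    completed = subprocess.run(
        ["java", "-jar", astminer_path, *astminer_params, input_path, output_path],
        capture_output=True,
    )
    return completed.returncode == 0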
Example #7
def interactive(path_to_function: str, path_to_model: str):
    fix_seed()
    device = get_device()
    print(f"using {device} device")

    # load model
    print("loading model...")
    checkpoint = torch.load(path_to_model, map_location=device)

    model = Tree2Seq(**checkpoint['configuration']).to(device)
    model.load_state_dict(checkpoint['state_dict'])

    token_to_id = model.token_to_id
    type_to_id = model.type_to_id
    label_to_id = model.label_to_id
    id_to_label = {v: k for k, v in label_to_id.items()}

    # convert function to dgl format
    print("convert function to dgl format...")
    create_folder(TMP_FOLDER)
    build_asts(path_to_function, TMP_FOLDER, ASTMINER_PATH, *ASTMINER_PARAMS)
    project_folder = os.path.join(TMP_FOLDER, 'java')
    convert_project(project_folder, token_to_id, type_to_id, label_to_id, True,
                    True, 5, 6, False, True, '|')

    # load function
    graph, labels = load_graphs(os.path.join(project_folder, 'converted.dgl'))
    labels = labels['labels']
    assert len(labels) == 1, f"found {len(labels)} functions instead of 1"
    ast = graph[0].reverse(share_ndata=True)
    ast.ndata['token'] = ast.ndata['token'].to(device)
    ast.ndata['type'] = ast.ndata['type'].to(device)
    labels = labels.t().to(device)
    root_indexes = torch.tensor([0], dtype=torch.long)

    # forward pass
    model.eval()
    with torch.no_grad():
        logits = model(ast, root_indexes, labels, device)
    logits = logits[1:]
    prediction = model.predict(logits).reshape(-1)
    sublabels = [id_to_label[label_id.item()] for label_id in prediction]
    label = '|'.join(takewhile(lambda sl: sl != EOS, sublabels))
    print(f"the predicted label is:\n{label}")
Example #8
    def build(self):
        imagebuilder_path = os.path.abspath(
            os.path.join("imagebuilder", self.distro, self.target,
                         self.subtarget))
        self.imagebuilder = ImageBuilder(self.distro, self.release,
                                         self.target, self.subtarget)

        self.log.info("use imagebuilder %s", self.imagebuilder.path)

        with tempfile.TemporaryDirectory(
                dir=get_folder("tempdir")) as self.build_path:
            already_created = False

            # only add manifest hash if special packages
            extra_image_name_array = []
            if not self.vanilla:
                extra_image_name_array.append(self.request_hash)

            cmdline = ['make', 'image', "-j", str(os.cpu_count())]
            cmdline.append('PROFILE=%s' % self.profile)
            #            if self.network_profile:
            #                cmdline.append('FILES=%s' % self.network_profile_path)
            extra_image_name = "-".join(extra_image_name_array)
            self.log.debug("extra_image_name %s", extra_image_name)
            cmdline.append('EXTRA_IMAGE_NAME=%s' % extra_image_name)
            if not self.vanilla:
                self.diff_packages()
            cmdline.append('PACKAGES=%s' % ' '.join(self.packages))
            cmdline.append('BIN_DIR=%s' % self.build_path)

            self.log.info("start build: %s", " ".join(cmdline))

            env = os.environ.copy()

            build_start = datetime.now()
            proc = subprocess.Popen(cmdline,
                                    cwd=self.imagebuilder.path,
                                    stdout=subprocess.PIPE,
                                    shell=False,
                                    stderr=subprocess.STDOUT,
                                    env=env)

            output, errors = proc.communicate()
            build_end = datetime.now()
            self.build_seconds = int((build_end - build_start).total_seconds())
            self.build_log = output.decode("utf-8")
            returnCode = proc.returncode
            if returnCode == 0:
                self.log.info("build successfull")
                self.manifest_hash = hashlib.sha256(
                    open(
                        glob.glob(os.path.join(self.build_path,
                                               '*.manifest'))[0],
                        'rb').read()).hexdigest()[0:15]
                self.parse_manifest()
                self.image_hash = get_hash(" ".join(self.as_array_build()), 15)

                path_array = [
                    get_folder("downloaddir"), self.distro, self.release,
                    self.target, self.subtarget, self.profile
                ]
                if not self.vanilla:
                    path_array.append(self.manifest_hash)
                else:
                    path_array.append("vanilla")

                self.store_path = os.path.join(*path_array)
                create_folder(self.store_path)

                self.log.debug(os.listdir(self.build_path))
                for filename in os.listdir(self.build_path):
                    if filename == "sha256sums":
                        with open(os.path.join(self.build_path, filename),
                                  'r+') as sums:
                            content = sums.read()
                            sums.seek(0)
                            sums.write(self.filename_rename(content))
                            sums.truncate()
                    filename_output = os.path.join(
                        self.store_path, self.filename_rename(filename))

                    self.log.info("move file %s", filename_output)
                    shutil.move(os.path.join(self.build_path, filename),
                                filename_output)

                if sign_file(os.path.join(self.store_path, "sha256sums")):
                    self.log.info("signed sha256sums")

                if not already_created or entry_missing:
                    sysupgrade_files = [
                        "*-squashfs-sysupgrade.bin",
                        "*-squashfs-sysupgrade.tar", "*-squashfs.trx",
                        "*-squashfs.chk", "*-squashfs.bin",
                        "*-squashfs-sdcard.img.gz", "*-combined-squashfs*"
                    ]

                    sysupgrade = None

                    profile_in_sysupgrade = ""
                    if self.profile.lower() != "generic":
                        profile_in_sysupgrade = "*" + self.profile

                    for sysupgrade_file in sysupgrade_files:
                        if not sysupgrade:
                            sysupgrade = glob.glob(
                                os.path.join(
                                    self.store_path,
                                    profile_in_sysupgrade + sysupgrade_file))
                        else:
                            break

                    if not sysupgrade:
                        self.log.debug("sysupgrade not found")
                        if self.build_log.find("too big") != -1:
                            self.log.warning("created image was to big")
                            self.store_log(
                                os.path.join(
                                    get_folder("downloaddir"),
                                    "faillogs/request-{}".format(
                                        self.request_hash)))
                            self.database.set_image_requests_status(
                                self.request_hash, 'imagesize_fail')
                            return False
                        else:
                            self.profile_in_name = None
                            self.subtarget_in_name = None
                            self.sysupgrade_suffix = ""
                            self.build_status = "no_sysupgrade"
                    else:
                        self.path = sysupgrade[0]
                        sysupgrade_image = os.path.basename(self.path)

                        self.subtarget_in_name = self.subtarget in sysupgrade_image
                        self.profile_in_name = self.profile in sysupgrade_image

                        # ath25/generic/generic results in lede-17.01.4-ath25-generic-squashfs-sysupgrade...
                        if (self.profile == self.subtarget and "{}-{}".format(
                                self.subtarget, self.profile)
                                not in sysupgrade_image):
                            self.subtarget_in_name = False

                        name_array = [self.distro]

                        # snapshot build are no release
                        if self.release != "snapshot":
                            name_array.append(self.release)

                        if not self.vanilla:
                            name_array.append(self.manifest_hash)

                        name_array.append(self.target)

                        if self.subtarget_in_name:
                            name_array.append(self.subtarget)

                        if self.profile_in_name:
                            name_array.append(self.profile)

                        self.name = "-".join(name_array)

                        self.sysupgrade_suffix = sysupgrade_image.replace(
                            self.name + "-", "")
                        self.build_status = "created"

                    self.store_log(
                        os.path.join(self.store_path,
                                     "build-{}".format(self.image_hash)))

                    self.log.debug("add image: {} {} {} {} {}".format(
                        self.image_hash, self.as_array_build(),
                        self.sysupgrade_suffix, self.subtarget_in_name,
                        self.profile_in_name, self.vanilla,
                        self.build_seconds))
                    self.database.add_image(self.image_hash,
                                            self.as_array_build(),
                                            self.sysupgrade_suffix,
                                            self.subtarget_in_name,
                                            self.profile_in_name, self.vanilla,
                                            self.build_seconds)
                self.database.done_build_job(self.request_hash,
                                             self.image_hash,
                                             self.build_status)
                return True
            else:
                self.log.info("build failed")
                self.database.set_image_requests_status(
                    self.request_hash, 'build_fail')
                self.store_log(
                    os.path.join(
                        get_folder("downloaddir"),
                        "faillogs/request-{}".format(self.request_hash)))
                return False
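The build() method derives the image hash with a get_hash utility. Based on how the manifest hash is computed a few lines above (sha256, truncated hex digest), a plausible sketch is:

import hashlib


def get_hash(value: str, length: int) -> str:
    # hash the string and keep only the first `length` hex characters,
    # mirroring the manifest hash computation in build()
    return hashlib.sha256(value.encode("utf-8")).hexdigest()[:length]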
def main(args: Namespace) -> None:
    dataset_name = dataset_mapping[args.dataset]
    data_path = os.path.join(data_folder, dataset_name)
    create_folder(data_folder, is_clean=False)
    create_folder(data_path, is_clean=False)

    if args.download:
        print(f"download {dataset_name} dataset...")
        tar_file_path = download_dataset(dataset_name, data_folder)
        print(f"extract files from tar archive {tar_file_path}...")
        train_path, val_path, test_path = extract_dataset(tar_file_path, data_folder, dataset_name)
        print("remove tar file...")
        os.remove(tar_file_path)
    else:
        train_path, val_path, test_path = [os.path.join(data_path, folder) for folder in holdout_folders]

    if args.build_ast:
        if not all([os.path.exists(holdout_path) for holdout_path in [train_path, val_path, test_path]]):
            raise RuntimeError("download and extract data before processing it via --download arg")
        if not os.path.exists(astminer_cli_path):
            raise RuntimeError(f"can't find astminer-cli in this location {astminer_cli_path}")
        holdout_ast_paths = {}
        for holdout in holdout_folders:
            holdout_ast_paths[holdout] = build_holdout_asts(data_path, holdout)
    else:
        holdout_ast_paths = {
            holdout: os.path.join(data_path, f'{holdout}_asts') for holdout in holdout_folders
        }

    vocabulary_path = os.path.join(data_path, vocabulary_name)
    if args.collect_vocabulary:
        token_to_id, type_to_id, label_to_id = collect_vocabulary(os.path.join(data_path, f'{holdout_folders[0]}_asts'))
        with open(vocabulary_path, 'wb') as pkl_file:
            pkl_dump({'token_to_id': token_to_id, 'type_to_id': type_to_id, 'label_to_id': label_to_id}, pkl_file)

    if args.convert:
        if not all([os.path.exists(holdout_path) for holdout_path in holdout_ast_paths.values()]):
            raise RuntimeError("build ast before converting it via --build_ast arg")
        if not os.path.exists(vocabulary_path):
            raise RuntimeError("collect vocabulary before converting it via --build_ast arg")
        with open(vocabulary_path, 'rb') as pkl_file:
            pkl_data = pkl_load(pkl_file)
            token_to_id = pkl_data['token_to_id']
            type_to_id = pkl_data['type_to_id']

        holdout_preprocessed_paths = {}
        for holdout in holdout_folders:
            holdout_preprocessed_paths[holdout] = convert_holdout(
                data_path, holdout, token_to_id, type_to_id, args.n_jobs, args.batch_size, args.high_memory
            )
    else:
        holdout_preprocessed_paths = {
            holdout: os.path.join(data_path, f'{holdout}_preprocessed') for holdout in holdout_folders
        }

    if args.remove_outliers:
        if not all([os.path.exists(holdout_path) for holdout_path in holdout_preprocessed_paths.values()]):
            raise RuntimeError("convert ast before removing outliers via --convert arg")
        if args.min_outlier == -1 or args.max_outlier == -1:
            raise ValueError("specify a min and max border for removing outliers")
        removed = remove_outliers(holdout_preprocessed_paths[holdout_folders[0]], args.min_outlier, args.max_outlier)
        print(f"remove {removed} functions for training holdout")

    if args.upload:
        if not all([os.path.exists(holdout_path) for holdout_path in holdout_preprocessed_paths.values()]):
            raise RuntimeError("convert ast before uploading it via --convert arg")
        tar_file_name = f'{dataset_name}_{args.tar_suffix}.tar.gz'
        completed_process = subprocess_run(
            ['tar', '-czf', tar_file_name, vocabulary_name] +
            [f'{holdout}_preprocessed' for holdout in holdout_folders],
            cwd=data_path
        )
        if completed_process.returncode != 0:
            print(f"can't create tar for preprocessed data, failed with\n{completed_process.stdout}")
        else:
            upload_file(os.path.join(data_path, tar_file_name), s3_bucket_name, tar_file_name)

    if args.download_preprocessed:
        for holdout, path in holdout_preprocessed_paths.items():
            tar_file_name = f'{dataset_name}_{holdout}_preprocessed.tar.gz'
            tar_path = os.path.join(data_path, tar_file_name)
            download_file(tar_path, s3_bucket_name, tar_file_name)
            create_folder(path)
            extract_tar_gz(tar_path, path)
        vocabulary_path = os.path.join(data_path, vocabulary_name)
        download_file(vocabulary_path, s3_bucket_name, f'{dataset_name}_{vocabulary_name}')

    if all([os.path.exists(holdout_path) for _, holdout_path in holdout_preprocessed_paths.items()]):
        for holdout, path in holdout_preprocessed_paths.items():
            number_of_batches = len(os.listdir(path))
            print(f"There are {number_of_batches} batches in {holdout} data")
Example #10
    def __init__(self, checkpoints_folder: str):
        self.timestamp = datetime.now().strftime('%Y_%m_%d_%H:%M:%S')
        self.checkpoints_folder = join_path(checkpoints_folder, self.timestamp)
        create_folder(self.checkpoints_folder)
Example #11
    def __init__(self, log_dir: str, checkpoints_dir: str, config: Dict):
        self.timestamp = datetime.now().strftime('%Y_%m_%d_%H:%M:%S')
        self.log_file = join_path(log_dir, f'{self.timestamp}.log')
        self.checkpoints_dir = join_path(checkpoints_dir, self.timestamp)
        create_folder(self.checkpoints_dir)
        self.add_to_saving('config', config)
def main(args: Namespace) -> None:
    fix_seed()
    if args.dataset not in known_datasets:
        raise ValueError(f"Unknown dataset: {args.dataset}")
    dataset_info = known_datasets[args.dataset]()
    dataset_path = os.path.join(DATA_FOLDER, dataset_info.name)
    vocabulary_path = os.path.join(dataset_path, VOCABULARY_NAME)
    create_folder(dataset_path, is_clean=False)

    if args.download:
        download_dataset(dataset_info, dataset_path)

    if args.build_ast:
        if not all([
                os.path.exists(os.path.join(dataset_path, holdout))
                for holdout in dataset_info.holdout_folders
        ]):
            raise RuntimeError("download and extract data before building ast")
        if not os.path.exists(ASTMINER_PATH):
            raise RuntimeError(
                f"can't find astminer-cli in this location {ASTMINER_PATH}")
        build_dataset_asts(dataset_info, dataset_path, ASTMINER_PATH)

    if args.collect_vocabulary:
        train_asts = os.path.join(dataset_path,
                                  f'{dataset_info.holdout_folders[0]}_asts')
        if not os.path.exists(train_asts):
            raise RuntimeError(
                "build training asts before collecting vocabulary")
        collect_vocabulary(train_asts, vocabulary_path, args.n_tokens,
                           args.n_types, args.n_labels, args.split_vocabulary,
                           args.wrap_tokens, args.wrap_labels, '|')

    if args.convert:
        if not os.path.exists(vocabulary_path):
            raise RuntimeError(
                "collect vocabulary before converting data to DGL format")
        with open(vocabulary_path, 'rb') as pkl_file:
            vocab = pickle_load(pkl_file)
        token_to_id = vocab['token_to_id']
        type_to_id = vocab['type_to_id']
        label_to_id = vocab['label_to_id']
        for holdout in dataset_info.holdout_folders:
            ast_folder = os.path.join(dataset_path, f'{holdout}_asts')
            if not os.path.exists(ast_folder):
                raise RuntimeError(
                    f"build asts for {holdout} before converting it to DGL format"
                )
            output_folder = os.path.join(dataset_path,
                                         f'{holdout}_preprocessed')
            create_folder(output_folder)
            convert_holdout(ast_folder, output_folder, args.batch_size,
                            token_to_id, type_to_id, label_to_id,
                            args.tokens_to_leaves, args.split_vocabulary,
                            args.max_token_len, args.max_label_len,
                            args.wrap_tokens, args.wrap_labels, '|', True,
                            args.n_jobs)

    if args.upload:
        if not all([
                os.path.exists(
                    os.path.join(dataset_path, f'{holdout}_preprocessed'))
                for holdout in dataset_info.holdout_folders
        ]):
            raise RuntimeError(
                "preprocess data before uploading it to the cloud")
        upload_dataset(dataset_info, dataset_path, VOCABULARY_NAME, args.store,
                       args.tar_suffix)

    preprocessed_paths = [
        os.path.join(dataset_path, f'{holdout}_preprocessed')
        for holdout in dataset_info.holdout_folders
    ]
    if all([os.path.exists(path) for path in preprocessed_paths]):
        for holdout, path in zip(dataset_info.holdout_folders,
                                 preprocessed_paths):
            number_of_batches = len(os.listdir(path))
            print(f"There are {number_of_batches} batches in {holdout} data")
Example #13
from flask import Flask
from flask import render_template
import logging

from utils.config import Config
from utils.common import create_folder, get_folder, init_usign

app = Flask(__name__)

import server.views

config = Config()
create_folder("{}/{}".format(get_folder("downloaddir"), "faillogs"))
if config.get("sign_images"):
    print("sign images")
    init_usign()

if config.get("dev"):
    from worker.worker import Worker
    worker = Worker()
    worker.start()
    #app.debug = True
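The module above only wires up the Flask app. A hypothetical development entry point (in production the app object would instead be served by a WSGI server):

if __name__ == "__main__":
    # run the built-in development server; host/port values are illustrative
    app.run(host="0.0.0.0", port=5000)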