def build_holdout_asts(data_path: str, holdout_name: str) -> str:
    print(f"build asts for {holdout_name} data...")
    projects = os.listdir(os.path.join(data_path, holdout_name))
    output_folder_path = os.path.join(data_path, f'{holdout_name}_asts')
    create_folder(output_folder_path)
    successful_builds = 0
    for project in tqdm(projects):
        print(f"working with {project} project")
        project_path = os.path.join(data_path, holdout_name, project)
        output_project_path = os.path.join(output_folder_path, project)
        create_folder(output_project_path)

        if build_project_asts(project_path, output_project_path):
            successful_builds += 1
            desc_path = os.path.join(output_project_path, 'java', 'description.csv')

            # remove asts with nan labels
            project_description = pd.read_csv(desc_path)
            bad_labels_mask = project_description['label'].isna()
            filenames = project_description[bad_labels_mask]['dot_file'].unique()
            source_files = project_description[bad_labels_mask]['source_file'].unique()
            print(f"remove functions from {source_files} for {project} project")
            for filename in filenames:
                filepath = os.path.join(output_project_path, 'java', 'asts', filename)
                os.remove(filepath)
            project_description.dropna(subset=['label'], inplace=True)
            project_description.to_csv(desc_path, index=False)

    print(f"create asts for {successful_builds}/{len(projects)} {holdout_name} projects")
    return output_folder_path
def run(self):
    self.log.info("downloading imagebuilder %s", self.path)
    if not self.created():
        create_folder(self.path)

        regular_tar_url = os.path.join(self.download_url(), self.tar_name())
        if get_statuscode(regular_tar_url) != 404:
            if not self.download(regular_tar_url):
                return False
        else:
            self.log.info("did not find regular imagebuilder name")
            # this is only due to arm64 missing -generic in filename
            # this is very ugly, can this just be deleted?
            special_tar_url = os.path.join(self.download_url(), self.tar_name(True))
            if get_statuscode(special_tar_url) != 404:
                self.log.debug("remove -generic from url")
                if not self.download(special_tar_url):
                    return False
            else:
                return False

        self.patch_makefile()
        self.add_custom_repositories()
        self.pkg_arch = self.parse_packages_arch()

    self.log.info("initialized imagebuilder %s", self.path)
    return True
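# run() above relies on a get_statuscode() helper that is not shown in this section.
# Below is a minimal sketch of what such a helper could look like; the use of
# requests.head and the -1 fallback on failure are assumptions for illustration,
# not the project's actual implementation.
import requests


def get_statuscode_sketch(url: str) -> int:
    """Return the HTTP status code for url, or -1 if the request fails (illustrative only)."""
    try:
        return requests.head(url, allow_redirects=True, timeout=20).status_code
    except requests.RequestException:
        return -1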
def build_dataset_asts(dataset_info: IDatasetInfo, dataset_path: str, astminer_path: str) -> None:
    for holdout in dataset_info.holdout_folders:
        holdout_folder = os.path.join(dataset_path, holdout)
        output_folder = os.path.join(dataset_path, f'{holdout}_asts')
        create_folder(output_folder)
        build_projects_asts(holdout_folder, output_folder, astminer_path, dataset_info.astminer_params)
def interactive(path_to_function: str, path_to_model: str):
    fix_seed()
    device = get_device()
    print(f"using {device} device")

    # convert function to dot format
    print("prepare ast...")
    create_folder(TMP_FOLDER)
    if not build_ast(path_to_function):
        return
    ast_folder = os.path.join(TMP_FOLDER, 'java', 'asts')
    ast = os.listdir(ast_folder)
    if len(ast) == 0:
        print("didn't find any functions in given file")
        return
    if len(ast) > 1:
        print("too many functions in given file, for interactive prediction you need only one")
        return

    dgl_ast = convert_dot_to_dgl(os.path.join(ast_folder, ast[0]))
    ast_desc = pd.read_csv(os.path.join(TMP_FOLDER, 'java', 'description.csv'))
    ast_desc['token'].fillna('NAN', inplace=True)
    with open(vocab_path, 'rb') as pkl_file:
        vocab = pkl_load(pkl_file)
        token_to_id, type_to_id = vocab['token_to_id'], vocab['type_to_id']
    ast_desc = transform_keys(ast_desc, token_to_id, type_to_id)
    batched_graph, labels, paths = prepare_batch(ast_desc, ['ast_0.dot'], lambda: [dgl_ast])
    batched_graph = dgl.batch(
        list(map(lambda g: dgl.reverse(g, share_ndata=True), dgl.unbatch(batched_graph))))

    # load model
    print("loading model..")
    model, _ = load_model(path_to_model, device)
    criterion = nn.CrossEntropyLoss(ignore_index=model.decoder.pad_index).to(device)
    info = LearningInfo()

    print("forward pass...")
    batch_info, prediction = eval_on_batch(model, criterion, batched_graph, labels, device)

    info.accumulate_info(batch_info)
    id_to_sublabel = {v: k for k, v in model.decoder.label_to_id.items()}
    label = ''
    for cur_sublabel in prediction:
        if cur_sublabel.item() == model.decoder.label_to_id[EOS]:
            break
        label += '|' + id_to_sublabel[cur_sublabel.item()]
    label = label[1:]
    print(f"Predicted function name is\n{label}")
    print(f"Calculated metrics with respect to '{labels[0]}' name\n{info.get_state_dict()}")
def convert_holdout(holdout_name: str, vocab: Vocabulary, config: PreprocessingConfig, n_jobs: int):
    holdout_data_path = path.join(DATA_FOLDER, config.dataset_name, f"{config.dataset_name}.{holdout_name}.c2s")
    holdout_output_folder = path.join(DATA_FOLDER, config.dataset_name, holdout_name)
    create_folder(holdout_output_folder)
    with open(path.join(holdout_output_folder, DESCRIPTION_FILE), "w") as desc_file:
        desc_file.write("id,filename,n_samples,n_paths\n")
    with Pool(n_jobs) as pool:
        results = pool.imap(
            _convert_raw_buffer,
            (
                (lines, config, vocab, holdout_output_folder, pos)
                for pos, lines in enumerate(_read_file_by_batch(holdout_data_path, config.buffer_size))
            ),
        )
        n_buffers = ceil(count_lines_in_file(holdout_data_path) / config.buffer_size)
        _ = [_ for _ in tqdm(results, total=n_buffers)]
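# convert_holdout above streams the raw .c2s file in fixed-size buffers through
# _read_file_by_batch, which is not shown in this section. A minimal sketch of such a
# generator follows, assuming it simply yields successive lists of at most batch_size
# lines; the project's actual helper may differ.
from typing import Iterator, List


def _read_file_by_batch_sketch(file_path: str, batch_size: int) -> Iterator[List[str]]:
    """Yield consecutive chunks of at most batch_size lines from file_path (illustrative only)."""
    with open(file_path, "r") as f:
        buffer: List[str] = []
        for line in f:
            buffer.append(line)
            if len(buffer) == batch_size:
                yield buffer
                buffer = []
        if buffer:
            yield buffer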
def build_projects_asts(projects_folder: str, output_folder: str, astminer_path: str,
                        astminer_params: List[str]) -> int:
    print(f"build asts for projects in {projects_folder} folder")
    projects = os.listdir(projects_folder)
    successful_builds = 0
    for project in tqdm(projects):
        print(f"build asts for {project} project")
        project_path = os.path.join(projects_folder, project)
        output_path = os.path.join(output_folder, project)
        create_folder(output_path)
        if build_asts(project_path, output_path, astminer_path, astminer_params):
            successful_builds += 1
    print(f"create asts for {successful_builds} out of {len(projects)} projects")
    return successful_builds
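# build_projects_asts delegates to a build_asts helper that invokes the astminer CLI;
# its body is not part of this section. The sketch below shows one plausible shape,
# assuming astminer_path points to a runnable CLI jar and astminer_params carries extra
# CLI arguments; the exact command line and flag names are assumptions, not the
# project's actual code.
import subprocess
from typing import List


def build_asts_sketch(input_path: str, output_path: str, astminer_path: str,
                      astminer_params: List[str]) -> bool:
    """Run the astminer CLI on input_path, writing results to output_path (illustrative only)."""
    # "--project"/"--output" are hypothetical flag names used only for this sketch
    command = ["java", "-jar", astminer_path, *astminer_params,
               "--project", input_path, "--output", output_path]
    completed = subprocess.run(command, capture_output=True)
    if completed.returncode != 0:
        print(f"astminer failed on {input_path}:\n{completed.stderr.decode('utf-8', errors='replace')}")
        return False
    return True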
def interactive(path_to_function: str, path_to_model: str):
    fix_seed()
    device = get_device()
    print(f"using {device} device")

    # load model
    print("loading model...")
    checkpoint = torch.load(path_to_model, map_location=device)

    model = Tree2Seq(**checkpoint['configuration']).to(device)
    model.load_state_dict(checkpoint['state_dict'])

    token_to_id = model.token_to_id
    type_to_id = model.type_to_id
    label_to_id = model.label_to_id
    id_to_label = {v: k for k, v in label_to_id.items()}

    # convert function to dgl format
    print("convert function to dgl format...")
    create_folder(TMP_FOLDER)
    build_asts(path_to_function, TMP_FOLDER, ASTMINER_PATH, *ASTMINER_PARAMS)
    project_folder = os.path.join(TMP_FOLDER, 'java')
    convert_project(project_folder, token_to_id, type_to_id, label_to_id, True, True, 5, 6, False, True, '|')

    # load function
    graph, labels = load_graphs(os.path.join(project_folder, 'converted.dgl'))
    labels = labels['labels']
    assert len(labels) == 1, f"found {len(labels)} functions, instead of 1"
    ast = graph[0].reverse(share_ndata=True)
    ast.ndata['token'] = ast.ndata['token'].to(device)
    ast.ndata['type'] = ast.ndata['type'].to(device)
    labels = labels.t().to(device)
    root_indexes = torch.tensor([0], dtype=torch.long)

    # forward pass
    model.eval()
    with torch.no_grad():
        logits = model(ast, root_indexes, labels, device)
    logits = logits[1:]
    prediction = model.predict(logits).reshape(-1)
    sublabels = [id_to_label[label_id.item()] for label_id in prediction]
    label = '|'.join(takewhile(lambda sl: sl != EOS, sublabels))
    print(f"the predicted label is:\n{label}")
def build(self):
    imagebuilder_path = os.path.abspath(
        os.path.join("imagebuilder", self.distro, self.target, self.subtarget))
    self.imagebuilder = ImageBuilder(self.distro, self.release, self.target, self.subtarget)
    self.log.info("use imagebuilder %s", self.imagebuilder.path)

    with tempfile.TemporaryDirectory(dir=get_folder("tempdir")) as self.build_path:
        already_created = False
        # defined alongside already_created so the check further down does not raise NameError
        entry_missing = False

        # only add manifest hash if special packages
        extra_image_name_array = []
        if not self.vanilla:
            extra_image_name_array.append(self.request_hash)

        cmdline = ['make', 'image', "-j", str(os.cpu_count())]
        cmdline.append('PROFILE=%s' % self.profile)
        # if self.network_profile:
        #     cmdline.append('FILES=%s' % self.network_profile_path)

        extra_image_name = "-".join(extra_image_name_array)
        self.log.debug("extra_image_name %s", extra_image_name)
        cmdline.append('EXTRA_IMAGE_NAME=%s' % extra_image_name)
        if not self.vanilla:
            self.diff_packages()
        cmdline.append('PACKAGES=%s' % ' '.join(self.packages))
        cmdline.append('BIN_DIR=%s' % self.build_path)

        self.log.info("start build: %s", " ".join(cmdline))

        env = os.environ.copy()

        build_start = datetime.now()
        proc = subprocess.Popen(cmdline,
                                cwd=self.imagebuilder.path,
                                stdout=subprocess.PIPE,
                                shell=False,
                                stderr=subprocess.STDOUT,
                                env=env)

        output, errors = proc.communicate()
        build_end = datetime.now()
        self.build_seconds = int((build_end - build_start).total_seconds())
        self.build_log = output.decode("utf-8")
        returnCode = proc.returncode
        if returnCode == 0:
            self.log.info("build successful")
            self.manifest_hash = hashlib.sha256(
                open(glob.glob(os.path.join(self.build_path, '*.manifest'))[0],
                     'rb').read()).hexdigest()[0:15]
            self.parse_manifest()
            self.image_hash = get_hash(" ".join(self.as_array_build()), 15)

            path_array = [get_folder("downloaddir"), self.distro, self.release,
                          self.target, self.subtarget, self.profile]

            if not self.vanilla:
                path_array.append(self.manifest_hash)
            else:
                path_array.append("vanilla")

            self.store_path = os.path.join(*path_array)
            create_folder(self.store_path)

            self.log.debug(os.listdir(self.build_path))
            for filename in os.listdir(self.build_path):
                if filename == "sha256sums":
                    with open(os.path.join(self.build_path, filename), 'r+') as sums:
                        content = sums.read()
                        sums.seek(0)
                        sums.write(self.filename_rename(content))
                        sums.truncate()
                filename_output = os.path.join(self.store_path, self.filename_rename(filename))

                self.log.info("move file %s", filename_output)
                shutil.move(os.path.join(self.build_path, filename), filename_output)

            if sign_file(os.path.join(self.store_path, "sha256sums")):
                self.log.info("signed sha256sums")

            if not already_created or entry_missing:
                sysupgrade_files = ["*-squashfs-sysupgrade.bin",
                                    "*-squashfs-sysupgrade.tar", "*-squashfs.trx",
                                    "*-squashfs.chk", "*-squashfs.bin",
                                    "*-squashfs-sdcard.img.gz", "*-combined-squashfs*"]

                sysupgrade = None

                profile_in_sysupgrade = ""
                if self.profile.lower() != "generic":
                    profile_in_sysupgrade = "*" + self.profile

                for sysupgrade_file in sysupgrade_files:
                    if not sysupgrade:
                        sysupgrade = glob.glob(
                            os.path.join(self.store_path,
                                         profile_in_sysupgrade + sysupgrade_file))
                    else:
                        break

                if not sysupgrade:
                    self.log.debug("sysupgrade not found")
                    if self.build_log.find("too big") != -1:
                        self.log.warning("created image was too big")
                        self.store_log(
                            os.path.join(get_folder("downloaddir"),
                                         "faillogs/request-{}".format(self.request_hash)))
                        self.database.set_image_requests_status(self.request_hash, 'imagesize_fail')
                        return False
                    else:
                        self.profile_in_name = None
                        self.subtarget_in_name = None
                        self.sysupgrade_suffix = ""
                        self.build_status = "no_sysupgrade"
                else:
                    self.path = sysupgrade[0]

                    sysupgrade_image = os.path.basename(self.path)

                    self.subtarget_in_name = self.subtarget in sysupgrade_image
                    self.profile_in_name = self.profile in sysupgrade_image
                    # ath25/generic/generic results in lede-17.01.4-ath25-generic-squashfs-sysupgrade...
                    if (self.profile == self.subtarget and
                            "{}-{}".format(self.subtarget, self.profile) not in sysupgrade_image):
                        self.subtarget_in_name = False

                    name_array = [self.distro]

                    # snapshot builds are not a release
                    if self.release != "snapshot":
                        name_array.append(self.release)

                    if not self.vanilla:
                        name_array.append(self.manifest_hash)

                    name_array.append(self.target)

                    if self.subtarget_in_name:
                        name_array.append(self.subtarget)

                    if self.profile_in_name:
                        name_array.append(self.profile)

                    self.name = "-".join(name_array)

                    self.sysupgrade_suffix = sysupgrade_image.replace(self.name + "-", "")

                    self.build_status = "created"

                self.store_log(os.path.join(self.store_path, "build-{}".format(self.image_hash)))

                self.log.debug("add image: {} {} {} {} {} {} {}".format(
                    self.image_hash, self.as_array_build(), self.sysupgrade_suffix,
                    self.subtarget_in_name, self.profile_in_name, self.vanilla,
                    self.build_seconds))
                self.database.add_image(self.image_hash, self.as_array_build(),
                                        self.sysupgrade_suffix, self.subtarget_in_name,
                                        self.profile_in_name, self.vanilla,
                                        self.build_seconds)

            self.database.done_build_job(self.request_hash, self.image_hash, self.build_status)
            return True
        else:
            self.log.info("build failed")
            self.database.set_image_requests_status(self.request_hash, 'build_fail')
            self.store_log(
                os.path.join(get_folder("downloaddir"),
                             "faillogs/request-{}".format(self.request_hash)))
            return False
def main(args: Namespace) -> None:
    dataset_name = dataset_mapping[args.dataset]
    data_path = os.path.join(data_folder, dataset_name)
    create_folder(data_folder, is_clean=False)
    create_folder(data_path, is_clean=False)

    if args.download:
        print(f"download {dataset_name} dataset...")
        tar_file_path = download_dataset(dataset_name, data_folder)
        print(f"extract files from tar archive {tar_file_path}...")
        train_path, val_path, test_path = extract_dataset(tar_file_path, data_folder, dataset_name)
        print("remove tar file...")
        os.remove(tar_file_path)
    else:
        train_path, val_path, test_path = [os.path.join(data_path, folder) for folder in holdout_folders]

    if args.build_ast:
        if not all([os.path.exists(holdout_path) for holdout_path in [train_path, val_path, test_path]]):
            raise RuntimeError("download and extract data before processing it via --download arg")
        if not os.path.exists(astminer_cli_path):
            raise RuntimeError(f"can't find astminer-cli in this location {astminer_cli_path}")
        holdout_ast_paths = {}
        for holdout in holdout_folders:
            holdout_ast_paths[holdout] = build_holdout_asts(data_path, holdout)
    else:
        holdout_ast_paths = {
            holdout: os.path.join(data_path, f'{holdout}_asts') for holdout in holdout_folders
        }

    vocabulary_path = os.path.join(data_path, vocabulary_name)
    if args.collect_vocabulary:
        token_to_id, type_to_id, label_to_id = collect_vocabulary(
            os.path.join(data_path, f'{holdout_folders[0]}_asts'))
        with open(vocabulary_path, 'wb') as pkl_file:
            pkl_dump({'token_to_id': token_to_id, 'type_to_id': type_to_id, 'label_to_id': label_to_id}, pkl_file)

    if args.convert:
        if not all([os.path.exists(ast_path) for ast_path in holdout_ast_paths.values()]):
            raise RuntimeError("build ast before converting it via --build_ast arg")
        if not os.path.exists(vocabulary_path):
            raise RuntimeError("collect vocabulary before converting it via --collect_vocabulary arg")
        with open(vocabulary_path, 'rb') as pkl_file:
            pkl_data = pkl_load(pkl_file)
        token_to_id = pkl_data['token_to_id']
        type_to_id = pkl_data['type_to_id']
        holdout_preprocessed_paths = {}
        for holdout in holdout_folders:
            holdout_preprocessed_paths[holdout] = convert_holdout(
                data_path, holdout, token_to_id, type_to_id, args.n_jobs, args.batch_size, args.high_memory
            )
    else:
        holdout_preprocessed_paths = {
            holdout: os.path.join(data_path, f'{holdout}_preprocessed') for holdout in holdout_folders
        }

    if args.remove_outliers:
        if not all([os.path.exists(preprocessed_path) for preprocessed_path in holdout_preprocessed_paths.values()]):
            raise RuntimeError("convert ast before removing outliers via --convert arg")
        if args.min_outlier == -1 or args.max_outlier == -1:
            raise ValueError("specify a min and max border for removing outliers")
        removed = remove_outliers(holdout_preprocessed_paths[holdout_folders[0]], args.min_outlier, args.max_outlier)
        print(f"removed {removed} functions from training holdout")

    if args.upload:
        if not all([os.path.exists(preprocessed_path) for preprocessed_path in holdout_preprocessed_paths.values()]):
            raise RuntimeError("convert ast before uploading it via --convert arg")
        tar_file_name = f'{dataset_name}_{args.tar_suffix}.tar.gz'
        completed_process = subprocess_run(
            ['tar', '-czf', tar_file_name, vocabulary_name] +
            [f'{holdout}_preprocessed' for holdout in holdout_folders],
            cwd=data_path
        )
        if completed_process.returncode != 0:
            print(f"can't create tar for preprocessed data, failed with\n{completed_process.stdout}")
        else:
            upload_file(os.path.join(data_path, tar_file_name), s3_bucket_name, tar_file_name)

    if args.download_preprocessed:
        for holdout, path in holdout_preprocessed_paths.items():
            tar_file_name = f'{dataset_name}_{holdout}_preprocessed.tar.gz'
            tar_path = os.path.join(data_path, tar_file_name)
            download_file(tar_path, s3_bucket_name, tar_file_name)
            create_folder(path)
            extract_tar_gz(tar_path, path)
        vocabulary_path = os.path.join(data_path, vocabulary_name)
        download_file(vocabulary_path, s3_bucket_name, f'{dataset_name}_{vocabulary_name}')

    if all([os.path.exists(holdout_path) for holdout_path in holdout_preprocessed_paths.values()]):
        for holdout, path in holdout_preprocessed_paths.items():
            number_of_batches = len(os.listdir(path))
            print(f"There are {number_of_batches} batches in {holdout} data")
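# main() above is driven by an argparse Namespace; the parser itself is not part of this
# section. The sketch below only lists the flags that main() actually reads
# (args.dataset, args.download, args.build_ast, ...); defaults and help texts are
# assumptions for illustration, not the script's real parser.
from argparse import ArgumentParser


def build_arg_parser_sketch() -> ArgumentParser:
    parser = ArgumentParser(description="dataset preprocessing pipeline (illustrative parser)")
    parser.add_argument('dataset', type=str, help="key into dataset_mapping")
    parser.add_argument('--download', action='store_true')
    parser.add_argument('--build_ast', action='store_true')
    parser.add_argument('--collect_vocabulary', action='store_true')
    parser.add_argument('--convert', action='store_true')
    parser.add_argument('--remove_outliers', action='store_true')
    parser.add_argument('--min_outlier', type=int, default=-1)
    parser.add_argument('--max_outlier', type=int, default=-1)
    parser.add_argument('--upload', action='store_true')
    parser.add_argument('--tar_suffix', type=str, default='preprocessed')
    parser.add_argument('--download_preprocessed', action='store_true')
    parser.add_argument('--n_jobs', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--high_memory', action='store_true')
    return parser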
def __init__(self, checkpoints_folder: str):
    self.timestamp = datetime.now().strftime('%Y_%m_%d_%H:%M:%S')
    self.checkpoints_folder = join_path(checkpoints_folder, self.timestamp)
    create_folder(self.checkpoints_folder)
def __init__(self, log_dir: str, checkpoints_dir: str, config: Dict):
    self.timestamp = datetime.now().strftime('%Y_%m_%d_%H:%M:%S')
    self.log_file = join_path(log_dir, f'{self.timestamp}.log')
    self.checkpoints_dir = join_path(checkpoints_dir, self.timestamp)
    create_folder(self.checkpoints_dir)
    self.add_to_saving('config', config)
def main(args: Namespace) -> None:
    fix_seed()
    if args.dataset not in known_datasets:
        raise ValueError(f"Unknown dataset: {args.dataset}")
    dataset_info = known_datasets[args.dataset]()
    dataset_path = os.path.join(DATA_FOLDER, dataset_info.name)
    vocabulary_path = os.path.join(dataset_path, VOCABULARY_NAME)
    create_folder(dataset_path, is_clean=False)

    if args.download:
        download_dataset(dataset_info, dataset_path)

    if args.build_ast:
        if not all([
                os.path.exists(os.path.join(dataset_path, holdout))
                for holdout in dataset_info.holdout_folders
        ]):
            raise RuntimeError("download and extract data before building ast")
        if not os.path.exists(ASTMINER_PATH):
            raise RuntimeError(f"can't find astminer-cli in this location {ASTMINER_PATH}")
        build_dataset_asts(dataset_info, dataset_path, ASTMINER_PATH)

    if args.collect_vocabulary:
        train_asts = os.path.join(dataset_path, f'{dataset_info.holdout_folders[0]}_asts')
        if not os.path.exists(train_asts):
            raise RuntimeError("build training asts before collecting vocabulary")
        collect_vocabulary(train_asts, vocabulary_path, args.n_tokens, args.n_types, args.n_labels,
                           args.split_vocabulary, args.wrap_tokens, args.wrap_labels, '|')

    if args.convert:
        if not os.path.exists(vocabulary_path):
            raise RuntimeError("collect vocabulary before converting data to DGL format")
        with open(vocabulary_path, 'rb') as pkl_file:
            vocab = pickle_load(pkl_file)
        token_to_id, type_to_id, label_to_id = vocab['token_to_id'], vocab['type_to_id'], vocab['label_to_id']
        for holdout in dataset_info.holdout_folders:
            ast_folder = os.path.join(dataset_path, f'{holdout}_asts')
            if not os.path.exists(ast_folder):
                raise RuntimeError(f"build asts for {holdout} before converting it to DGL format")
            output_folder = os.path.join(dataset_path, f'{holdout}_preprocessed')
            create_folder(output_folder)
            convert_holdout(ast_folder, output_folder, args.batch_size, token_to_id, type_to_id,
                            label_to_id, args.tokens_to_leaves, args.split_vocabulary,
                            args.max_token_len, args.max_label_len, args.wrap_tokens,
                            args.wrap_labels, '|', True, args.n_jobs)

    if args.upload:
        if not all([
                os.path.exists(os.path.join(dataset_path, f'{holdout}_preprocessed'))
                for holdout in dataset_info.holdout_folders
        ]):
            raise RuntimeError("preprocess data before uploading it to the cloud")
        upload_dataset(dataset_info, dataset_path, VOCABULARY_NAME, args.store, args.tar_suffix)

    preprocessed_paths = [
        os.path.join(dataset_path, f'{holdout}_preprocessed')
        for holdout in dataset_info.holdout_folders
    ]
    if all([os.path.exists(path) for path in preprocessed_paths]):
        for holdout, path in zip(dataset_info.holdout_folders, preprocessed_paths):
            number_of_batches = len(os.listdir(path))
            print(f"There are {number_of_batches} batches in {holdout} data")
from flask import Flask
from flask import render_template
import logging

from utils.config import Config
from utils.common import create_folder, get_folder, init_usign

app = Flask(__name__)

import server.views

config = Config()

create_folder("{}/{}".format(get_folder("downloaddir"), "faillogs"))

if config.get("sign_images"):
    print("sign images")
    init_usign()

if config.get("dev"):
    from worker.worker import Worker
    worker = Worker()
    worker.start()

#app.debug = True