def save(self, filename: str):
    """Save system state to .scad file.

    Parameters
    ----------
    filename: str
        Name of file to save (with extension).
    """
    with open(filename, 'wb') as f:
        pkl_dump(self._state, f)
def remove_outliers(holdout_path: str, min_border: int, max_border: int) -> int:
    batches = os.listdir(holdout_path)
    removed = 0
    for batch_path in tqdm(batches):
        # load the pickled batch and unpack it into individual graphs
        with open(os.path.join(holdout_path, batch_path), 'rb') as pkl_file:
            batch = pkl_load(pkl_file)
        graphs = dgl.unbatch(batch['batched_graph'])
        labels = batch['labels']
        paths = batch['paths']
        orig_size = len(graphs)
        # keep only graphs whose node count lies within [min_border, max_border]
        graphs, labels, paths = zip(*filter(
            lambda cur: min_border <= cur[0].number_of_nodes() <= max_border,
            zip(graphs, labels, paths)
        ))
        # rebatch the surviving graphs and overwrite the batch file in place
        with open(os.path.join(holdout_path, batch_path), 'wb') as pkl_file:
            pkl_dump({'batched_graph': dgl.batch(graphs), 'labels': labels, 'paths': paths}, pkl_file)
        removed += orig_size - len(graphs)
    return removed
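# Hypothetical usage sketch (not from the original project): the holdout path and
# the node-count borders below are placeholder values.
n_removed = remove_outliers('data/dataset/training_preprocessed', min_border=3, max_border=500)
print(f'removed {n_removed} functions from the training holdout')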
def main(args: Namespace) -> None:
    dataset_name = dataset_mapping[args.dataset]
    data_path = os.path.join(data_folder, dataset_name)
    create_folder(data_folder, is_clean=False)
    create_folder(data_path, is_clean=False)

    if args.download:
        print(f"download {dataset_name} dataset...")
        tar_file_path = download_dataset(dataset_name, data_folder)
        print(f"extract files from tar archive {tar_file_path}...")
        train_path, val_path, test_path = extract_dataset(tar_file_path, data_folder, dataset_name)
        print("remove tar file...")
        os.remove(tar_file_path)
    else:
        train_path, val_path, test_path = [
            os.path.join(data_path, folder) for folder in holdout_folders
        ]

    if args.build_ast:
        if not all([os.path.exists(holdout_path) for holdout_path in [train_path, val_path, test_path]]):
            raise RuntimeError("download and extract data before processing it via --download arg")
        if not os.path.exists(astminer_cli_path):
            raise RuntimeError(f"can't find astminer-cli in this location {astminer_cli_path}")
        holdout_ast_paths = {}
        for holdout in holdout_folders:
            holdout_ast_paths[holdout] = build_holdout_asts(data_path, holdout)
    else:
        holdout_ast_paths = {
            holdout: os.path.join(data_path, f'{holdout}_asts') for holdout in holdout_folders
        }

    vocabulary_path = os.path.join(data_path, vocabulary_name)
    if args.collect_vocabulary:
        token_to_id, type_to_id, label_to_id = collect_vocabulary(
            os.path.join(data_path, f'{holdout_folders[0]}_asts')
        )
        with open(vocabulary_path, 'wb') as pkl_file:
            pkl_dump({'token_to_id': token_to_id, 'type_to_id': type_to_id, 'label_to_id': label_to_id}, pkl_file)

    if args.convert:
        if not all([os.path.exists(path) for _, path in holdout_ast_paths.items()]):
            raise RuntimeError("build ast before converting it via --build_ast arg")
        if not os.path.exists(vocabulary_path):
            raise RuntimeError("collect vocabulary before converting it via --collect_vocabulary arg")
        with open(vocabulary_path, 'rb') as pkl_file:
            pkl_data = pkl_load(pkl_file)
            token_to_id = pkl_data['token_to_id']
            type_to_id = pkl_data['type_to_id']
        holdout_preprocessed_paths = {}
        for holdout in holdout_folders:
            holdout_preprocessed_paths[holdout] = convert_holdout(
                data_path, holdout, token_to_id, type_to_id, args.n_jobs, args.batch_size, args.high_memory
            )
    else:
        holdout_preprocessed_paths = {
            holdout: os.path.join(data_path, f'{holdout}_preprocessed') for holdout in holdout_folders
        }

    if args.remove_outliers:
        if not all([os.path.exists(path) for _, path in holdout_preprocessed_paths.items()]):
            raise RuntimeError("convert ast before removing outliers via --convert arg")
        if args.min_outlier == -1 or args.max_outlier == -1:
            raise ValueError("specify a min and max border for removing outliers")
        removed = remove_outliers(holdout_preprocessed_paths[holdout_folders[0]], args.min_outlier, args.max_outlier)
        print(f"removed {removed} functions from the training holdout")

    if args.upload:
        if not all([os.path.exists(path) for _, path in holdout_preprocessed_paths.items()]):
            raise RuntimeError("convert ast before uploading it via --convert arg")
        tar_file_name = f'{dataset_name}_{args.tar_suffix}.tar.gz'
        completed_process = subprocess_run(
            ['tar', '-czf', tar_file_name, vocabulary_name] +
            [f'{holdout}_preprocessed' for holdout in holdout_folders],
            cwd=data_path
        )
        if completed_process.returncode != 0:
            print(f"can't create tar for preprocessed data, failed with\n{completed_process.stdout}")
        else:
            upload_file(os.path.join(data_path, tar_file_name), s3_bucket_name, tar_file_name)

    if args.download_preprocessed:
        for holdout, path in holdout_preprocessed_paths.items():
            tar_file_name = f'{dataset_name}_{holdout}_preprocessed.tar.gz'
            tar_path = os.path.join(data_path, tar_file_name)
            download_file(tar_path, s3_bucket_name, tar_file_name)
            create_folder(path)
            extract_tar_gz(tar_path, path)
        vocabulary_path = os.path.join(data_path, vocabulary_name)
        download_file(vocabulary_path, s3_bucket_name, f'{dataset_name}_{vocabulary_name}')

    if all([os.path.exists(holdout_path) for _, holdout_path in holdout_preprocessed_paths.items()]):
        for holdout, path in holdout_preprocessed_paths.items():
            number_of_batches = len(os.listdir(path))
            print(f"There are {number_of_batches} batches in {holdout} data")
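# A minimal sketch of the argument parser that main() appears to expect; the flag
# names are taken from the attribute accesses above, while defaults and the
# positional 'dataset' help text are assumptions.
from argparse import ArgumentParser

if __name__ == '__main__':
    arg_parser = ArgumentParser()
    arg_parser.add_argument('dataset', type=str, help='key from dataset_mapping')
    arg_parser.add_argument('--download', action='store_true')
    arg_parser.add_argument('--build_ast', action='store_true')
    arg_parser.add_argument('--collect_vocabulary', action='store_true')
    arg_parser.add_argument('--convert', action='store_true')
    arg_parser.add_argument('--remove_outliers', action='store_true')
    arg_parser.add_argument('--upload', action='store_true')
    arg_parser.add_argument('--download_preprocessed', action='store_true')
    arg_parser.add_argument('--min_outlier', type=int, default=-1)
    arg_parser.add_argument('--max_outlier', type=int, default=-1)
    arg_parser.add_argument('--n_jobs', type=int, default=-1)
    arg_parser.add_argument('--batch_size', type=int, default=128)
    arg_parser.add_argument('--high_memory', action='store_true')
    arg_parser.add_argument('--tar_suffix', type=str, default='preprocessed')
    main(arg_parser.parse_args())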
def save(self, arr, pth):
    with gzip.open(pth, 'wb+') as fh:
        pkl_dump(arr, fh)
        # flush OS buffers to disk before the file is closed
        # (sync is assumed to be an fsync-style helper)
        sync(fh)
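# A possible counterpart to save() above (not part of the original snippet): it
# reads back an object written with gzip + pickle.
import gzip
from pickle import load as pkl_load

def load(pth):
    with gzip.open(pth, 'rb') as fh:
        return pkl_load(fh)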
                      max_num_steps=MAX_NUM_STEPS)

experiment_filename = '{epoch_train_time}-a_{alpha}-g_{gamma}-e_{epsilon}-edecay_{epsilon_decay}-emin_{epsilon_min}' \
    .format(epoch_train_time=stats.epoch_train_time,
            alpha=ALPHA,
            gamma=GAMMA,
            epsilon=EPSILON_START,
            epsilon_min=EPSILON_MIN,
            epsilon_decay=EPSILON_DECAY)

print("\n\nScore: {}".format(acc_rewards))

if RESULTS_CONFIG.SAVE_MODEL:
    torch_save(agent.online_q_network.state_dict(), experiment_filename + '.pth')
if RESULTS_CONFIG.SAVE_REWARDS_DATA:
    with open('./results/' + experiment_filename + '.p', 'wb') as rewards_file:
        pkl_dump(acc_rewards, rewards_file)

if TEST_AGENT:
    model = torch.load('./models/checkpoint' + '.pth')
    # the loaded checkpoint is immediately replaced by the in-memory target network
    model = agent.target_q_network
    model.eval()
    test_epoch_rewards = []  # list containing scores from each episode_idx
    first_time_solved = False
    test_start_time = time()
    for episode_idx in range(NUM_TEST_EPISODES):
        episode_acc_reward = 0
        state = env.reset(train_mode=True)[brain_name].vector_observations[0]  # reset the environment
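# A minimal round-trip sketch, assuming checkpoints hold a state_dict (as saved
# above); the tiny network below is a placeholder, not the project's Q-network.
import torch
import torch.nn as nn

q_network = nn.Sequential(nn.Linear(37, 64), nn.ReLU(), nn.Linear(64, 4))  # placeholder dimensions
torch.save(q_network.state_dict(), 'checkpoint_example.pth')

restored = nn.Sequential(nn.Linear(37, 64), nn.ReLU(), nn.Linear(64, 4))
restored.load_state_dict(torch.load('checkpoint_example.pth', map_location='cpu'))
restored.eval()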
def search(self):
    drives = self.drive_finder()  # list of all drives
    copy_path = self.dataPath() + '7\\'  # address of folder 7
    dbfile_path = self.dataPath() + '11\\coldstore_file.json'  # address of coldstore json
    final_list = []
    if exists(dbfile_path):  # check whether the coldstore file exists
        open_db = open(dbfile_path, 'rb')  # open coldstore in read mode
        final_dict = pkl_load(open_db)  # load the pickled coldstore dictionary
        open_db.close()
    else:
        final_dict = {}  # set blank if the file was deleted or never created
    for drive in drives:  # loop through drives
        if GetDriveType(drive) == DRIVE_FIXED:  # check if the drive is permanent or removable
            for root, dirs, files in walk(drive, topdown=True):  # walk through all files and folders of the drive
                if self.dataPath() not in root or root.split("\\")[1] not in [
                        "Windows", "Program Files", "ProgramData", "Intel", "PrefLogs",
                        "MSOCache", "Boot", "Recovery", "Python27", "$Recycle.Bin"]:
                    for file in files:  # files holds all files in the directory; dirs holds all folders
                        target_path = copy_path + file  # location where the copy will be created
                        name, ext = splitext(file)  # split name and extension of the file
                        src_path = join(root, file)  # address of the current file
                        src_mtime = getmtime(src_path)  # last modification time of the file
                        if ext in self.extensions or ext in self.special_extensions and self.dataPath() in src_path:
                            if self.dataPath() in src_path and ext not in self.special_extensions:
                                continue
                            elif src_path not in final_dict or src_path in final_dict and src_mtime != final_dict[src_path]:
                                print(src_path)
                                mod_time = getmtime(src_path)
                                final_dict[src_path] = mod_time
                                if exists(target_path) and isfile(target_path) and \
                                        self.md5_checksum(src_path) == self.md5_checksum(target_path):
                                    # the file in the target location is identical to the source
                                    continue
                                elif exists(target_path) and isfile(target_path) and \
                                        self.md5_checksum(src_path) != self.md5_checksum(target_path):
                                    # same name but different content: prefix the first 8 hex digits of the md5 checksum
                                    new_name = self.md5_checksum(src_path)[:8] + '__' + file
                                    if not exists(copy_path + new_name):  # copy only if it does not exist already; check this part
                                        copy2(src_path, copy_path + new_name)  # import the file into folder 7
                                    else:
                                        continue
                                else:
                                    if not exists(target_path):
                                        copy2(src_path, target_path)
                                    else:
                                        continue
                        else:
                            continue
    print(final_dict)
    jsn = open(dbfile_path, 'wb')
    pkl_dump(final_dict, jsn)
    jsn.close()
    temp_list = listdir(copy_path)
    for file in temp_list:
        filepath = copy_path + file
        final_list.append(filepath)
    final_list.sort(key=getmtime, reverse=True)
    with open(self.dataPath() + '11\\' + 'coldstore_sorted.pkl', 'wb') as pkl:
        pkl_dump(final_list, pkl)
    self.status_updater(search_flag=1, date_of_completion=time.time())
    self.file_feeder()
def file_slicer(self, compressed_path, original_path):
    megabyte = int(pow(1024, 2))
    counter = 1
    file_name = basename(compressed_path)  # name of the compressed file
    file_size = getsize(compressed_path)  # size of the compressed file
    sliced_path = self.dataPath() + '8\\'  # address of the sliced files
    sliced_db = self.dataPath() + '11\\cold_sliced.json'  # json that stores information about the sliced files
    chunk_dict = {}
    chunk_list = []
    state_dict = {}
    if file_size > megabyte:  # if the file is larger than 1 MB
        total = ceil(file_size / megabyte)  # total number of slices to be formed
        with open(compressed_path, 'rb') as file:
            file_data = file.read()  # read the compressed file
        while counter <= total:
            chunk = file_data[:megabyte]  # take a 1 MB chunk
            file_data = file_data[megabyte:]  # drop the chunk that was just taken
            chunk_dir = sliced_path + file_name + '_' + str(total) + '\\'  # folder that holds the sliced files
            chunk_path = chunk_dir + file_name + '_' + str(counter) + '_' + str(total)
            chunk_list.append(chunk_path)
            if not exists(chunk_dir):
                makedirs(chunk_dir)
            current_modtime = getmtime(original_path)  # last modified time of the original file
            state_dict[original_path] = current_modtime  # store the modified time keyed by the original path
            state_path = chunk_dir + '\\' + basename(original_path) + '_state'  # path of the file that stores state information
            with open(chunk_path, 'wb') as chunk_file:
                chunk_file.write(chunk)
            with open(state_path, 'wb') as state:
                pkl_dump(state_dict, state)
            counter += 1
        if exists(sliced_db):
            with open(sliced_db, 'rb') as db:  # open the sliced-files database
                chunk_dict = pkl_load(db)
        chunk_dict[original_path] = chunk_list  # add info on the new file
        with open(sliced_db, 'wb') as sl:  # write the updated database
            pkl_dump(chunk_dict, sl)
        for chunk in chunk_list:  # send each chunk for encryption, then remove the plain slice
            self.encryption(chunk, original_path, True)
            remove(chunk)
    else:
        self.encryption(compressed_path, original_path, False)
        remove(compressed_path)
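# Hypothetical reassembly helper (not in the original class): it reverses the
# naming scheme used by file_slicer, where each chunk is stored as
# '<file_name>_<counter>_<total>' inside a '<file_name>_<total>' folder. It
# assumes the chunks have already been decrypted back into plain slices.
import os

def reassemble_chunks(chunk_dir, output_path):
    folder_name = os.path.basename(os.path.normpath(chunk_dir))  # '<file_name>_<total>'
    file_name, total = folder_name.rsplit('_', 1)
    with open(output_path, 'wb') as out_file:
        for counter in range(1, int(total) + 1):
            chunk_path = os.path.join(chunk_dir, f'{file_name}_{counter}_{total}')
            with open(chunk_path, 'rb') as chunk_file:
                out_file.write(chunk_file.read())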