Example #1
    def save(self, filename: str):
        """Save system state to .scad file.

        Parameters
        ----------
        filename: str
            Name of file to save (with extension).
        """
        with open(filename, 'wb') as f:
            pkl_dump(self._state, f)
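Only the write path is shown here; a minimal counterpart sketch for loading, assuming `pkl_load` is the `pickle.load` alias matching the `pkl_dump` used above (the method name is hypothetical):

    def load(self, filename: str):
        """Restore system state previously written by `save`.

        Parameters
        ----------
        filename: str
            Name of file to load (with extension).
        """
        with open(filename, 'rb') as f:
            self._state = pkl_load(f)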
def remove_outliers(holdout_path: str, min_border: int, max_border: int) -> int:
    """Filter each pickled batch in `holdout_path` in place, dropping graphs whose
    node count falls outside [min_border, max_border]; return how many were removed."""
    batches = os.listdir(holdout_path)
    removed = 0
    for batch_path in tqdm(batches):
        with open(os.path.join(holdout_path, batch_path), 'rb') as pkl_file:
            batch = pkl_load(pkl_file)
        graphs = dgl.unbatch(batch['batched_graph'])
        labels = batch['labels']
        paths = batch['paths']
        orig_size = len(graphs)
        graphs, labels, paths = zip(*filter(
            lambda cur: min_border <= cur[0].number_of_nodes() <= max_border,
            zip(graphs, labels, paths)
        ))
        with open(os.path.join(holdout_path, batch_path), 'wb') as pkl_file:
            pkl_dump({'batched_graph': dgl.batch(graphs), 'labels': labels, 'paths': paths}, pkl_file)
        removed += orig_size - len(graphs)
    return removed
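A hedged usage sketch for `remove_outliers`; the holdout path and node-count borders below are illustrative, not taken from the example:

# Illustrative only: the holdout path and borders are hypothetical.
train_preprocessed = os.path.join('data', 'java-small', 'train_preprocessed')
removed = remove_outliers(train_preprocessed, min_border=3, max_border=500)
print(f"removed {removed} functions with node counts outside [3, 500]")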
def main(args: Namespace) -> None:
    dataset_name = dataset_mapping[args.dataset]
    data_path = os.path.join(data_folder, dataset_name)
    create_folder(data_folder, is_clean=False)
    create_folder(data_path, is_clean=False)

    if args.download:
        print(f"downloading the {dataset_name} dataset...")
        tar_file_path = download_dataset(dataset_name, data_folder)
        print(f"extracting files from tar archive {tar_file_path}...")
        train_path, val_path, test_path = extract_dataset(tar_file_path, data_folder, dataset_name)
        print("removing tar file...")
        os.remove(tar_file_path)
    else:
        train_path, val_path, test_path = [os.path.join(data_path, folder) for folder in holdout_folders]

    if args.build_ast:
        if not all(os.path.exists(holdout_path) for holdout_path in [train_path, val_path, test_path]):
            raise RuntimeError("download and extract the data before processing it via the --download arg")
        if not os.path.exists(astminer_cli_path):
            raise RuntimeError(f"can't find astminer-cli at {astminer_cli_path}")
        holdout_ast_paths = {}
        for holdout in holdout_folders:
            holdout_ast_paths[holdout] = build_holdout_asts(data_path, holdout)
    else:
        holdout_ast_paths = {
            holdout: os.path.join(data_path, f'{holdout}_asts') for holdout in holdout_folders
        }

    vocabulary_path = os.path.join(data_path, vocabulary_name)
    if args.collect_vocabulary:
        token_to_id, type_to_id, label_to_id = collect_vocabulary(os.path.join(data_path, f'{holdout_folders[0]}_asts'))
        with open(vocabulary_path, 'wb') as pkl_file:
            pkl_dump({'token_to_id': token_to_id, 'type_to_id': type_to_id, 'label_to_id': label_to_id}, pkl_file)

    if args.convert:
        if not all(os.path.exists(path) for path in holdout_ast_paths.values()):
            raise RuntimeError("build ASTs before converting them via the --build_ast arg")
        if not os.path.exists(vocabulary_path):
            raise RuntimeError("collect the vocabulary before converting via the --collect_vocabulary arg")
        with open(vocabulary_path, 'rb') as pkl_file:
            pkl_data = pkl_load(pkl_file)
            token_to_id = pkl_data['token_to_id']
            type_to_id = pkl_data['type_to_id']

        holdout_preprocessed_paths = {}
        for holdout in holdout_folders:
            holdout_preprocessed_paths[holdout] = convert_holdout(
                data_path, holdout, token_to_id, type_to_id, args.n_jobs, args.batch_size, args.high_memory
            )
    else:
        holdout_preprocessed_paths = {
            holdout: os.path.join(data_path, f'{holdout}_preprocessed') for holdout in holdout_folders
        }

    if args.remove_outliers:
        if not all(os.path.exists(path) for path in holdout_preprocessed_paths.values()):
            raise RuntimeError("convert ASTs before removing outliers via the --convert arg")
        if args.min_outlier == -1 or args.max_outlier == -1:
            raise ValueError("specify min and max borders (--min_outlier, --max_outlier) for removing outliers")
        removed = remove_outliers(holdout_preprocessed_paths[holdout_folders[0]], args.min_outlier, args.max_outlier)
        print(f"removed {removed} functions from the training holdout")

    if args.upload:
        if not all(os.path.exists(path) for path in holdout_preprocessed_paths.values()):
            raise RuntimeError("convert ASTs before uploading them via the --convert arg")
        tar_file_name = f'{dataset_name}_{args.tar_suffix}.tar.gz'
        completed_process = subprocess_run(
            ['tar', '-czf', tar_file_name, vocabulary_name] +
            [f'{holdout}_preprocessed' for holdout in holdout_folders],
            cwd=data_path, capture_output=True
        )
        if completed_process.returncode != 0:
            print(f"can't create tar for preprocessed data, failed with\n{completed_process.stderr}")
        else:
            upload_file(os.path.join(data_path, tar_file_name), s3_bucket_name, tar_file_name)

    if args.download_preprocessed:
        for holdout, path in holdout_preprocessed_paths.items():
            tar_file_name = f'{dataset_name}_{holdout}_preprocessed.tar.gz'
            tar_path = os.path.join(data_path, tar_file_name)
            download_file(tar_path, s3_bucket_name, tar_file_name)
            create_folder(path)
            extract_tar_gz(tar_path, path)
        vocabulary_path = os.path.join(data_path, vocabulary_name)
        download_file(vocabulary_path, s3_bucket_name, f'{dataset_name}_{vocabulary_name}')

    if all(os.path.exists(holdout_path) for holdout_path in holdout_preprocessed_paths.values()):
        for holdout, path in holdout_preprocessed_paths.items():
            number_of_batches = len(os.listdir(path))
            print(f"There are {number_of_batches} batches in {holdout} data")
    def save(self, arr, pth):
        with gzip.open(pth, 'wb+') as fh:
            pkl_dump(arr, fh)
            sync(fh)
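The matching read path is not shown; a minimal sketch assuming the same gzip-compressed pickle layout (the method name is hypothetical):

    def load(self, pth):
        # Hypothetical counterpart to `save`: read back a gzip-compressed pickle.
        with gzip.open(pth, 'rb') as fh:
            return pkl_load(fh)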
                                     max_num_steps=MAX_NUM_STEPS)

    experiment_filename = '{epoch_train_time}-a_{alpha}-g_{gamma}-e_{epsilon}-edecay_{epsilon_decay}-emin_{epsilon_min}'\
        .format(epoch_train_time=stats.epoch_train_time,
                alpha=ALPHA,
                gamma=GAMMA,
                epsilon=EPSILON_START, epsilon_min=EPSILON_MIN, epsilon_decay=EPSILON_DECAY)

    print("\n\nScore: {}".format(acc_rewards))

    if RESULTS_CONFIG.SAVE_MODEL:
        torch_save(agent.online_q_network.state_dict(),
                   experiment_filename + '.pth')

    if RESULTS_CONFIG.SAVE_REWARDS_DATA:
        with open('./results/' + experiment_filename + ".p", 'wb') as rewards_file:
            pkl_dump(acc_rewards, rewards_file)

if TEST_AGENT:
    # Load the saved checkpoint weights into the target network for evaluation.
    agent.target_q_network.load_state_dict(torch.load('./models/checkpoint.pth'))
    model = agent.target_q_network
    model.eval()

    test_epoch_rewards = []  # list containing the score from each test episode
    first_time_solved = False

    test_start_time = time()

    for episode_idx in range(NUM_TEST_EPISODES):
        episode_acc_reward = 0
        state = env.reset(train_mode=True)[brain_name].vector_observations[0]  # reset the environment
    def search(self):
        drives = self.drive_finder()  # list of all drives
        copy_path = self.dataPath() + '7\\'  # address of folder 7
        dbfile_path = self.dataPath() + '11\\coldstore_file.json'  # address of the coldstore db
        final_list = []

        if exists(dbfile_path):  # check whether the coldstore db exists
            open_db = open(dbfile_path, 'rb')  # open the coldstore in read mode
            final_dict = pkl_load(open_db)  # unpickle the coldstore into final_dict (a dictionary)
            open_db.close()

        else:
            final_dict = {}  # start empty if the file was deleted or never created

        for drive in drives:  # loop through drives

            if GetDriveType(drive) == DRIVE_FIXED:  # check if the drive is permanent or removable

                for root, dirs, files in walk(drive, topdown=True):  # walk through all files and folders of the drive

                    if self.dataPath() not in root or root.split("\\")[1] not in [
                            "Windows", "Program Files", "ProgramData",
                            "Intel", "PrefLogs", "MSOCache", "Boot",
                            "Recovery", "Python27", "$Recycle.Bin"
                    ]:

                        for file in files:  # files lists all files in the directory; dirs holds the folders
                            target_path = copy_path + file  # location where the copy has to be created
                            name, ext = splitext(file)  # split name and extension of the file
                            src_path = join(root, file)  # address of the current file
                            src_mtime = getmtime(src_path)  # epoch time of the file's last modification

                            if ext in self.extensions or ext in self.special_extensions and self.dataPath() in src_path:

                                if self.dataPath() in src_path and ext not in self.special_extensions:
                                    continue

                                elif src_path not in final_dict or src_path in final_dict and src_mtime != final_dict[src_path]:
                                    print(src_path)
                                    mod_time = getmtime(src_path)
                                    final_dict[src_path] = mod_time

                                    if exists(target_path) and isfile(target_path) and self.md5_checksum(src_path) == self.md5_checksum(target_path):
                                        # the file at the target location is identical to the source
                                        continue

                                    elif exists(target_path) and isfile(target_path) and self.md5_checksum(src_path) != self.md5_checksum(target_path):
                                        # same name but different content
                                        new_name = self.md5_checksum(src_path)[:8] + '__' + file  # prefix the first 8 hex characters of the md5 checksum to the name

                                        if not exists(copy_path + new_name):  # only copy if the renamed file does not already exist
                                            copy2(src_path, copy_path + new_name)  # copy the file into folder 7

                                        else:
                                            continue

                                    else:

                                        if not exists(target_path):
                                            copy2(src_path, target_path)

                                        else:
                                            continue

                                else:
                                    continue

        print(final_dict)
        jsn = open(dbfile_path, 'wb')
        pkl_dump(final_dict, jsn)
        jsn.close()
        temp_list = listdir(copy_path)

        for file in temp_list:
            filepath = copy_path + file
            final_list.append(filepath)

        final_list.sort(key=getmtime, reverse=True)
        with open(self.dataPath() + '11\\' + 'coldstore_sorted.pkl',
                  'wb') as pkl:
            pkl_dump(final_list, pkl)
        self.status_updater(search_flag=1, date_of_completion=time.time())
        self.file_feeder()
    def file_slicer(self, compressed_path, original_path):
        megabyte = int(pow(1024, 2))
        counter = 1
        file_name = basename(compressed_path)  # name of the compressed file
        file_size = getsize(compressed_path)  # size of the compressed file
        sliced_path = self.dataPath() + '8\\'  # address of the sliced files
        sliced_db = self.dataPath() + '11\\cold_sliced.json'  # db storing information about the sliced files
        chunk_dict = {}
        chunk_list = []
        state_dict = {}
        if file_size > megabyte:  # if the file is larger than 1 MB
            total = ceil(file_size / megabyte)  # total number of slices to be formed

            with open(compressed_path, 'rb') as file:
                file_data = file.read()  # read the compressed file

            while counter <= total:
                chunk = file_data[:megabyte]  # take a 1 MB chunk
                file_data = file_data[megabyte:]  # drop that 1 MB chunk from the buffer
                chunk_dir = sliced_path + file_name + '_' + str(total) + '\\'  # folder holding the sliced files
                chunk_path = chunk_dir + file_name + '_' + str(counter) + '_' + str(total)
                chunk_list.append(chunk_path)

                if not exists(chunk_dir):
                    makedirs(chunk_dir)

                current_modtime = getmtime(original_path)  # last modified time of the original file
                state_dict[original_path] = current_modtime  # store the modified time keyed by the original path

                state_path = chunk_dir + '\\' + basename(original_path) + '_state'  # file that stores state information about the original file
                with open(chunk_path, 'wb') as chunk_file:
                    chunk_file.write(chunk)

                    with open(state_path, 'wb') as state:
                        pkl_dump(state_dict, state)

                counter += 1

            if exists(sliced_db):

                with open(sliced_db, 'rb') as db:  # open the sliced-files db
                    chunk_dict = pkl_load(db)

            chunk_dict[original_path] = chunk_list  # add info on the new file

            with open(sliced_db, 'wb') as sl:  # write the updated db
                pkl_dump(chunk_dict, sl)

            for chunk in chunk_list:  # send each chunk for encryption
                self.encryption(chunk, original_path, True)
                remove(chunk)

        else:
            self.encryption(compressed_path, original_path, False)

        remove(compressed_path)
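For context, a hedged sketch of the reverse operation, reassembling a file from its recorded slices; the function name is hypothetical, and it assumes the chunk files listed in the sliced-files db are available again in decrypted form:

def join_slices(chunk_list, output_path):
    """Hypothetical inverse of file_slicer: concatenate 1 MB chunks back into one file."""
    with open(output_path, 'wb') as out_file:
        for chunk_path in chunk_list:  # chunk_list preserves the original slice order
            with open(chunk_path, 'rb') as chunk_file:
                out_file.write(chunk_file.read())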