def __init__(self, x_size, h_size, type_relationship: Union[str, Dict]):
        """type relationship is a dict with information about children
        key: tuple with ids for src group, e.g. If and Switch statements
        value: list of lists, where each list corresponding to type ids of some group
        { ...
            (src_type_id_1, ..., src_type_id_n): [
                [dst_group_type_id_1, ..., dst_group_type_id_k],
                ...,
                [dst_group_type_id_1, ..., dst_group_type_id_m]
            ]
        ... }
        """
        super().__init__(x_size, h_size)
        if isinstance(type_relationship, str):
            with open(type_relationship, 'rb') as pkl_file:
                self.type_relationship = pkl_load(pkl_file)
        else:
            self.type_relationship = type_relationship
        count_diff_matrix = 1
        # dict of matrix ids; key: (src_type_id, dst_type_id), value: matrix_id
        self.edge_matrix_id = {}
        for type_ids, groups in self.type_relationship.items():
            for dst_group in groups:
                for child_id in dst_group:
                    for src_id in type_ids:
                        self.edge_matrix_id[(src_id, child_id)] = count_diff_matrix
                count_diff_matrix += 1

        self.U_f = nn.Parameter(torch.rand(count_diff_matrix, self.h_size, self.h_size), requires_grad=True)
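
# Illustrative sketch (not from the original module): how the loops above assign a
# shared U_f matrix id to every (src_type_id, dst_type_id) pair of one destination
# group. All type ids below are hypothetical placeholders.
_example_relationship = {
    (1, 2): [        # e.g. the type ids of If and Switch statements
        [10, 11],    # first destination group -> matrix 1
        [12],        # second destination group -> matrix 2
    ]
}
_edge_matrix_id = {}
_count_diff_matrix = 1   # matrix 0 stays unassigned, presumably the fallback for other edges
for _type_ids, _groups in _example_relationship.items():
    for _dst_group in _groups:
        for _child_id in _dst_group:
            for _src_id in _type_ids:
                _edge_matrix_id[(_src_id, _child_id)] = _count_diff_matrix
        _count_diff_matrix += 1
# _edge_matrix_id == {(1, 10): 1, (2, 10): 1, (1, 11): 1, (2, 11): 1, (1, 12): 2, (2, 12): 2}
# _count_diff_matrix == 3, so U_f would hold three matrices in total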
def interactive(path_to_function: str, path_to_model: str):
    fix_seed()
    device = get_device()
    print(f"using {device} device")

    # convert function to dot format
    print(f"prepare ast...")
    create_folder(TMP_FOLDER)
    if not build_ast(path_to_function):
        return
    ast_folder = os.path.join(TMP_FOLDER, 'java', 'asts')
    ast = os.listdir(ast_folder)
    if len(ast) == 0:
        print("didn't find any functions in given file")
        return
    if len(ast) > 1:
        print(
            "too many functions in given file, for interactive prediction you need only one"
        )
        return
    dgl_ast = convert_dot_to_dgl(os.path.join(ast_folder, ast[0]))
    ast_desc = pd.read_csv(os.path.join(TMP_FOLDER, 'java', 'description.csv'))
    ast_desc['token'].fillna('NAN', inplace=True)
    with open(vocab_path, 'rb') as pkl_file:
        vocab = pkl_load(pkl_file)
        token_to_id, type_to_id = vocab['token_to_id'], vocab['type_to_id']
    ast_desc = transform_keys(ast_desc, token_to_id, type_to_id)
    batched_graph, labels, paths = prepare_batch(ast_desc, ['ast_0.dot'],
                                                 lambda: [dgl_ast])
    batched_graph = dgl.batch([
        dgl.reverse(g, share_ndata=True) for g in dgl.unbatch(batched_graph)
    ])

    # load model
    print("loading model..")
    model, _ = load_model(path_to_model, device)
    criterion = nn.CrossEntropyLoss(
        ignore_index=model.decoder.pad_index).to(device)
    info = LearningInfo()

    print("forward pass...")
    batch_info, prediction = eval_on_batch(model, criterion, batched_graph,
                                           labels, device)

    info.accumulate_info(batch_info)
    id_to_sublabel = {v: k for k, v in model.decoder.label_to_id.items()}
    label = ''
    for cur_sublabel in prediction:
        if cur_sublabel.item() == model.decoder.label_to_id[EOS]:
            break
        label += '|' + id_to_sublabel[cur_sublabel.item()]
    label = label[1:]
    print(f"Predicted function name is\n{label}")
    print(
        f"Calculated metrics with respect to '{labels[0]}' name\n{info.get_state_dict()}"
    )
    def load(self, filename: str):
        """Load system state from .scad file.

        Parameters
        ----------
        filename: str
            Name of file to load.
        """
        with open(filename, 'rb') as f:
            state = pkl_load(f)

        if not isinstance(state, ProjectState):
            raise IncorrectTypeOfLoadedObject

        self._state = state
        self._history.clear()
        self._cancelled.clear()
        self._commit()
def remove_outliers(holdout_path: str, min_border: int, max_border: int) -> int:
    batches = os.listdir(holdout_path)
    removed = 0
    for batch_path in tqdm(batches):
        with open(os.path.join(holdout_path, batch_path), 'rb') as pkl_file:
            batch = pkl_load(pkl_file)
        graphs = dgl.unbatch(batch['batched_graph'])
        labels = batch['labels']
        paths = batch['paths']
        orig_size = len(graphs)
        graphs, labels, paths = zip(*filter(
            lambda cur: min_border <= cur[0].number_of_nodes() <= max_border,
            zip(graphs, labels, paths)
        ))
        with open(os.path.join(holdout_path, batch_path), 'wb') as pkl_file:
            pkl_dump({'batched_graph': dgl.batch(graphs), 'labels': labels, 'paths': paths}, pkl_file)
        removed += orig_size - len(graphs)
    return removed
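
# Minimal sketch (illustration only) of the zip(*filter(...)) idiom used in
# remove_outliers above: keep aligned triples whose first element lies inside the
# borders. Plain ints stand in for graph node counts; the helper name is hypothetical.
def _filter_by_size(sizes, labels, paths, min_border, max_border):
    kept = filter(
        lambda cur: min_border <= cur[0] <= max_border,
        zip(sizes, labels, paths)
    )
    # note: like the original, unpacking fails if every element is filtered out
    return tuple(zip(*kept))

# _filter_by_size([5, 500, 42], ['a', 'b', 'c'], ['p0', 'p1', 'p2'], 10, 100)
# == ((42,), ('c',), ('p2',))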
    def __getitem__(self, item) -> Tuple[BatchedDGLGraph, List[str]]:
        batch_basename, batch_slice = self.batch_desc[item]

        # read file only if previous wasn't the same
        if self.loaded_batch_basename != batch_basename:
            with open(path_join(self.batched_graphs_path, batch_basename),
                      'rb') as pkl_file:
                self.loaded_batched_graph = pkl_load(pkl_file)
            self.loaded_batch_basename = batch_basename

        graphs = unbatch(self.loaded_batched_graph['batched_graph'])

        graphs_for_batch = graphs[batch_slice]
        if self.invert_edges:
            graphs_for_batch = list(
                map(lambda g: reverse(g, share_ndata=True), graphs_for_batch))

        batched_graph = batch(graphs_for_batch)
        batched_labels = self.loaded_batched_graph['labels'][batch_slice]

        return batched_graph, batched_labels
    def __init__(self,
                 batched_graphs_path: str,
                 batch_size: int,
                 invert_edges: bool = False) -> None:
        self.batched_graphs_path = batched_graphs_path
        self.batch_size = batch_size
        self.invert_edges = invert_edges
        assert path_exists(self.batched_graphs_path)

        self.batched_graph_files = sorted(
            filter(lambda filename: filename.endswith('.pkl'),
                   listdir(self.batched_graphs_path)),
            key=lambda name: int(name[6:-4])
        )
        self.batch_desc = {}
        self.n_batches = 0

        self.loaded_batch_basename = None
        self.loaded_batched_graph = None

        # iterate over pkl files to aggregate information about batches
        print(f"prepare the {batched_graphs_path} dataset...")
        for batched_graph_file in tqdm(self.batched_graph_files):
            with open(path_join(self.batched_graphs_path, batched_graph_file),
                      'rb') as pkl_file:
                batched_graph = pkl_load(pkl_file)
            n_graphs = len(batched_graph['batched_graph'].batch_num_nodes)
            batches_per_file = n_graphs // self.batch_size + (
                1 if n_graphs % self.batch_size > 0 else 0)

            # collect information from the file
            for batch_id in range(batches_per_file):
                batch_slice = slice(
                    batch_id * self.batch_size,
                    min((batch_id + 1) * self.batch_size, n_graphs))
                self.batch_desc[self.n_batches + batch_id] = (batched_graph_file, batch_slice)

            self.n_batches += batches_per_file
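
# Hedged sketch (not part of the class): how the loop above maps a global batch id to a
# (file, slice) pair. File names and sizes below are hypothetical.
def _describe_batches(graphs_per_file, batch_size):
    batch_desc, n_batches = {}, 0
    for file_name, n_graphs in graphs_per_file:
        batches_per_file = n_graphs // batch_size + (1 if n_graphs % batch_size > 0 else 0)
        for batch_id in range(batches_per_file):
            batch_slice = slice(batch_id * batch_size, min((batch_id + 1) * batch_size, n_graphs))
            batch_desc[n_batches + batch_id] = (file_name, batch_slice)
        n_batches += batches_per_file
    return batch_desc

# _describe_batches([('batch_0.pkl', 10), ('batch_1.pkl', 5)], batch_size=4) ==
# {0: ('batch_0.pkl', slice(0, 4)), 1: ('batch_0.pkl', slice(4, 8)),
#  2: ('batch_0.pkl', slice(8, 10)), 3: ('batch_1.pkl', slice(0, 4)),
#  4: ('batch_1.pkl', slice(4, 5))}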
    def file_feeder(self, last_feeder_file=None):
        file_list_path = self.dataPath() + '11\\' + 'coldstore_sorted.pkl'

        if exists(file_list_path):

            with open(file_list_path, 'rb') as pkl:
                file_list = pkl_load(pkl)

                if last_feeder_file is not None:
                    try:
                        # resume after the last processed file
                        file_list = file_list[file_list.index(last_feeder_file) + 1:]
                    except ValueError:
                        pass

                for file in file_list:
                    self.compressor(file_path=file)
                    self.status_updater(last_feeder_file=file)

                # remove(file_list_path) #Removes json of file list

        else:
            pass
def train(params: Dict, logging: str) -> None:
    fix_seed()
    device = get_device()
    print(f"using {device} device")

    training_set = JavaDataset(params['paths']['train'], params['batch_size'],
                               True)
    validation_set = JavaDataset(params['paths']['validate'],
                                 params['batch_size'], True)

    with open(params['paths']['vocabulary'], 'rb') as pkl_file:
        vocabulary = pkl_load(pkl_file)
        token_to_id = vocabulary['token_to_id']
        type_to_id = vocabulary['type_to_id']
        label_to_id = vocabulary['label_to_id']

    print('model initializing...')
    is_resumed = 'resume' in params
    if is_resumed:
        # load model
        model, checkpoint = load_model(params['resume'], device)
        start_batch_id = checkpoint['batch_id'] + 1
        configuration = checkpoint['configuration']
    else:
        # create model
        model_factory = ModelFactory(params['embedding'], params['encoder'],
                                     params['decoder'],
                                     params['hidden_states'], token_to_id,
                                     type_to_id, label_to_id)
        model: Tree2Seq = model_factory.construct_model(device)
        configuration = model_factory.save_configuration()
        start_batch_id = 0

    # create optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=params['lr'],
                                 weight_decay=params['weight_decay'])
    # create scheduler
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=params['scheduler_step_size'],
        gamma=params['scheduler_gamma'])

    # define loss function
    criterion = nn.CrossEntropyLoss(
        ignore_index=model.decoder.pad_index).to(device)

    # init logging class
    logger = None
    if logging == TerminalLogger.name:
        logger = TerminalLogger(params['checkpoints_folder'])
    elif logging == FileLogger.name:
        logger = FileLogger(params, params['logging_folder'],
                            params['checkpoints_folder'])
    elif logging == WandBLogger.name:
        logger_args = ['treeLSTM', params, model, params['checkpoints_folder']]
        if 'resume_wandb_id' in params:
            logger_args.append(params['resume_wandb_id'])
        logger = WandBLogger(*logger_args)

    # train loop
    print("ok, let's train it")
    for epoch in range(params['n_epochs']):
        train_acc_info = LearningInfo()

        if epoch > 0:
            # specify start batch id only for first epoch
            start_batch_id = 0
        tqdm_batch_iterator = tqdm(range(start_batch_id, len(training_set)),
                                   total=len(training_set))
        tqdm_batch_iterator.update(start_batch_id)
        tqdm_batch_iterator.refresh()

        # iterate over training set
        for batch_id in tqdm_batch_iterator:
            graph, labels = training_set[batch_id]
            graph.ndata['token_id'] = graph.ndata['token_id'].to(device)
            graph.ndata['type_id'] = graph.ndata['type_id'].to(device)
            batch_info = train_on_batch(model, criterion, optimizer, scheduler,
                                        graph, labels, params, device)
            train_acc_info.accumulate_info(batch_info)
            # log current train process
            if is_current_step_match(batch_id, params['logging_step']):
                logger.log(train_acc_info.get_state_dict(), epoch, batch_id)
                train_acc_info = LearningInfo()
            # validate current model
            if is_current_step_match(
                    batch_id, params['evaluation_step']) and batch_id != 0:
                eval_epoch_info = evaluate_dataset(validation_set, model,
                                                   criterion, device)
                logger.log(eval_epoch_info.get_state_dict(), epoch, batch_id,
                           False)
            # save current model
            if is_current_step_match(
                    batch_id, params['checkpoint_step']) and batch_id != 0:
                logger.save_model(model,
                                  f'epoch_{epoch}_batch_{batch_id}.pt',
                                  configuration,
                                  batch_id=batch_id)

        logger.log(train_acc_info.get_state_dict(), epoch, len(training_set))
        eval_epoch_info = evaluate_dataset(validation_set, model, criterion,
                                           device)
        logger.log(eval_epoch_info.get_state_dict(), epoch, len(training_set),
                   False)

        logger.save_model(model, f'epoch_{epoch}.pt', configuration)
import flask

# sklearn.grid_search was removed in modern scikit-learn; GridSearchCV lives in sklearn.model_selection
from sklearn.model_selection import GridSearchCV
from preprocessor import transform, processes as lang_processes
from build_committee import param_sets, prefs as bc_prefs, analyzer, classes, int_to_class

application = flask.Flask(__name__)

committee = {}
for params in param_sets:
    committee[params] = {}
    votes = 1
    for pref in bc_prefs:
        committee[params][pref] = (None, 0)
        try:
            clf_name = params + '.' + pref
            # pickle.load expects a binary file object, not the file's contents
            with open('committee/' + clf_name + '.pkl', 'rb') as f:
                clf = pkl_load(f)
            clf.estimator.steps[0][1].analyzer = analyzer
            clf.best_estimator_.steps[0][1].analyzer = analyzer
            committee[params][pref] = (clf, votes)
            print('[' + __name__ + ']\t' + clf_name + ' loaded')
        except IOError:
            pass

params = {}
prefs = ['ie', 'ns', 'ft', 'jp']
clf_types = ['svc', 'nb', 'knn']
doc_types = ['text', 'tweet']
classifiers = {}
personalities = {}
personalities_description = {}
def main(args: Namespace) -> None:
    dataset_name = dataset_mapping[args.dataset]
    data_path = os.path.join(data_folder, dataset_name)
    create_folder(data_folder, is_clean=False)
    create_folder(data_path, is_clean=False)

    if args.download:
        print(f"download {dataset_name} dataset...")
        tar_file_path = download_dataset(dataset_name, data_folder)
        print(f"extract files from tar archive {tar_file_path}...")
        train_path, val_path, test_path = extract_dataset(tar_file_path, data_folder, dataset_name)
        print("remove tar file...")
        os.remove(tar_file_path)
    else:
        train_path, val_path, test_path = [os.path.join(data_path, folder) for folder in holdout_folders]

    if args.build_ast:
        if not all([os.path.exists(holdout_path) for holdout_path in [train_path, val_path, test_path]]):
            raise RuntimeError("download and extract data before processing it via --download arg")
        if not os.path.exists(astminer_cli_path):
            raise RuntimeError(f"can't find astminer-cli in this location {astminer_cli_path}")
        holdout_ast_paths = {}
        for holdout in holdout_folders:
            holdout_ast_paths[holdout] = build_holdout_asts(data_path, holdout)
    else:
        holdout_ast_paths = {
            holdout: os.path.join(data_path, f'{holdout}_asts') for holdout in holdout_folders
        }

    vocabulary_path = os.path.join(data_path, vocabulary_name)
    if args.collect_vocabulary:
        token_to_id, type_to_id, label_to_id = collect_vocabulary(os.path.join(data_path, f'{holdout_folders[0]}_asts'))
        with open(vocabulary_path, 'wb') as pkl_file:
            pkl_dump({'token_to_id': token_to_id, 'type_to_id': type_to_id, 'label_to_id': label_to_id}, pkl_file)

    if args.convert:
        if not all([os.path.exists(path) for path in holdout_ast_paths.values()]):
            raise RuntimeError("build ast before converting it via --build_ast arg")
        if not os.path.exists(vocabulary_path):
            raise RuntimeError("collect vocabulary before converting it via --build_ast arg")
        with open(vocabulary_path, 'rb') as pkl_file:
            pkl_data = pkl_load(pkl_file)
            token_to_id = pkl_data['token_to_id']
            type_to_id = pkl_data['type_to_id']

        holdout_preprocessed_paths = {}
        for holdout in holdout_folders:
            holdout_preprocessed_paths[holdout] = convert_holdout(
                data_path, holdout, token_to_id, type_to_id, args.n_jobs, args.batch_size, args.high_memory
            )
    else:
        holdout_preprocessed_paths = {
            holdout: os.path.join(data_path, f'{holdout}_preprocessed') for holdout in holdout_folders
        }

    if args.remove_outliers:
        if not all([os.path.exists(path) for path in holdout_preprocessed_paths.values()]):
            raise RuntimeError("convert ast before removing outliers via --convert arg")
        if args.min_outlier == -1 or args.max_outlier == -1:
            raise ValueError("specify a min and max border for removing outliers")
        removed = remove_outliers(holdout_preprocessed_paths[holdout_folders[0]], args.min_outlier, args.max_outlier)
        print(f"remove {removed} functions for training holdout")

    if args.upload:
        if not all([os.path.exists(path) for path in holdout_preprocessed_paths.values()]):
            raise RuntimeError("convert ast before uploading using it via --convert arg")
        tar_file_name = f'{dataset_name}_{args.tar_suffix}.tar.gz'
        completed_process = subprocess_run(
            ['tar', '-czf', tar_file_name, vocabulary_name] +
            [f'{holdout}_preprocessed' for holdout in holdout_folders],
            cwd=data_path
        )
        if completed_process.returncode != 0:
            print(f"can't create tar for preprocessed data, failed with\n{completed_process.stdout}")
        else:
            upload_file(os.path.join(data_path, tar_file_name), s3_bucket_name, tar_file_name)

    if args.download_preprocessed:
        for holdout, path in holdout_preprocessed_paths.items():
            tar_file_name = f'{dataset_name}_{holdout}_preprocessed.tar.gz'
            tar_path = os.path.join(data_path, tar_file_name)
            download_file(tar_path, s3_bucket_name, tar_file_name)
            create_folder(path)
            extract_tar_gz(tar_path, path)
        vocabulary_path = os.path.join(data_path, vocabulary_name)
        download_file(vocabulary_path, s3_bucket_name, f'{dataset_name}_{vocabulary_name}')

    if all([os.path.exists(holdout_path) for _, holdout_path in holdout_preprocessed_paths.items()]):
        for holdout, path in holdout_preprocessed_paths.items():
            number_of_batches = len(os.listdir(path))
            print(f"There are {number_of_batches} batches in {holdout} data")
    def load(self, pth):
        with gzip.open(pth, 'rb') as fh:
            return pkl_load(fh)
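
    # Hedged sketch of the matching save side (not shown in the original): gzip-compress
    # the pickled object so the `load` method above can read it back. Assumes pkl_dump is
    # pickle.dump, as in the other examples here.
    def save(self, obj, pth):
        with gzip.open(pth, 'wb') as fh:
            pkl_dump(obj, fh)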
            if done:
                break

        test_epoch_rewards.append(episode_acc_reward)  # save most recent score

        if PROGRESS_LOG_STEP_FREQUENCY and episode_idx % PROGRESS_LOG_STEP_FREQUENCY == 0:
            print('\rEpisode {}\tAcc. Reward: {:.2f}\tEps: {:.3f}'.format(
                episode_idx, episode_acc_reward, agent.epsilon))

    test_stats = DotMap(
        {'test_epoch_time': '{:.3f}'.format(time() - test_start_time)})

    results_savetofilename = 'results/test'
    plot_rewards(saveto_filename=results_savetofilename,
                 data=test_epoch_rewards,
                 ylim=(-5, 25),
                 dpi=320)

if RESULTS_CONFIG.SAVE_REWARDS_PLOT:
    results_savetofilename = 'acc_rewards_01'
    with open('./results/' + results_savetofilename + '.p', 'rb') as pkl_file:
        acc_rewards = pkl_load(pkl_file)

    plot_rewards(saveto_filename=results_savetofilename,
                 data=acc_rewards,
                 ylim=(-5, 25),
                 dpi=320)

env.close()
    def search(self):
        drives = self.drive_finder()  #list of all drives
        copy_path = self.dataPath() + '7\\'  #address of folder 7
        dbfile_path = self.dataPath() + '11\\coldstore_file.json'  #coldstore index (pickled despite the .json extension)
        final_list = []

        if exists(dbfile_path):  #check whether the coldstore index exists
            with open(dbfile_path, 'rb') as open_db:  #open the coldstore index in binary read mode
                final_dict = pkl_load(open_db)  #unpickle the coldstore index into a dictionary

        else:
            final_dict = {}  # start empty if the file was deleted or never created

        for drive in drives:  #loop through drives

            if GetDriveType(drive) == DRIVE_FIXED:  #only scan fixed (non-removable) drives

                #walk through every file and folder on the drive
                for root, dirs, files in walk(drive, topdown=True):

                    if self.dataPath() not in root or root.split("\\")[1] not in [
                            "Windows", "Program Files", "ProgramData",
                            "Intel", "PrefLogs", "MSOCache", "Boot",
                            "Recovery", "Python27", "$Recycle.Bin"
                    ]:

                        for file in files:  #`files` holds every file in the directory, `dirs` holds the sub-folders
                            target_path = copy_path + file  #location where the copy will be created
                            name, ext = splitext(file)  #split name and extension of the file
                            src_path = join(root, file)  #full path of the current source file
                            src_mtime = getmtime(src_path)  #last-modification time (epoch seconds) of the file

                            if ext in self.extensions or (ext in self.special_extensions
                                                          and self.dataPath() in src_path):

                                if self.dataPath() in src_path and ext not in self.special_extensions:
                                    continue

                                elif src_path not in final_dict or (src_path in final_dict
                                                                    and src_mtime != final_dict[src_path]):
                                    print(src_path)
                                    mod_time = getmtime(src_path)
                                    final_dict[src_path] = mod_time

                                    if exists(target_path) and isfile(target_path) and \
                                            self.md5_checksum(src_path) == self.md5_checksum(target_path):
                                        #source and existing target file have identical content
                                        continue

                                    elif exists(target_path) and isfile(target_path) and \
                                            self.md5_checksum(src_path) != self.md5_checksum(target_path):
                                        #same name but different content: prefix the first 8 characters
                                        #of the md5 checksum to make the copy's name unique
                                        new_name = self.md5_checksum(src_path)[:8] + '__' + file

                                        if not exists(copy_path + new_name):  #copy only if not already present
                                            copy2(src_path, copy_path + new_name)  #copy the file into folder 7
                                        else:
                                            continue

                                    else:
                                        if not exists(target_path):
                                            copy2(src_path, target_path)
                                        else:
                                            continue

        print(final_dict)
        with open(dbfile_path, 'wb') as jsn:  #persist the updated coldstore index
            pkl_dump(final_dict, jsn)
        temp_list = listdir(copy_path)

        for file in temp_list:
            filepath = copy_path + file
            final_list.append(filepath)

        final_list.sort(key=getmtime, reverse=True)
        with open(self.dataPath() + '11\\' + 'coldstore_sorted.pkl',
                  'wb') as pkl:
            pkl_dump(final_list, pkl)
        self.status_updater(search_flag=1, date_of_completion=time.time())
        self.file_feeder()
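
    # Hedged sketch of the md5_checksum helper referenced above (its real implementation
    # is not shown in this snippet): hash the file contents in fixed-size blocks.
    def md5_checksum(self, file_path):
        import hashlib  # local import, since the snippet's module-level imports are not shown
        md5 = hashlib.md5()
        with open(file_path, 'rb') as f:
            for block in iter(lambda: f.read(65536), b''):
                md5.update(block)
        return md5.hexdigest()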
    def file_slicer(self, compressed_path, original_path):
        megabyte = int(pow(1024, 2))
        counter = 1
        file_name = basename(compressed_path)  #name of the compressed file
        file_size = getsize(compressed_path)  #size of the compressed file in bytes
        sliced_path = self.dataPath() + '8\\'  #folder where the slices are stored
        sliced_db = self.dataPath() + '11\\cold_sliced.json'  #index of sliced files (pickled despite the .json extension)
        chunk_dict = {}
        chunk_list = []
        state_dict = {}
        if file_size > megabyte:  #slice only if the file is larger than 1 MB
            total = ceil(file_size / megabyte)  #total number of slices to be formed

            with open(compressed_path, 'rb') as file:
                file_data = file.read()  #read compressed file

            while counter <= total:
                chunk = file_data[:megabyte]  #take the next 1 MB chunk
                file_data = file_data[megabyte:]  #drop the consumed 1 MB from the buffer
                chunk_dir = sliced_path + file_name + '_' + str(total) + '\\'  #folder holding this file's slices
                chunk_path = chunk_dir + file_name + '_' + str(counter) + '_' + str(total)
                chunk_list.append(chunk_path)

                if not exists(chunk_dir):
                    makedirs(chunk_dir)

                current_modtime = getmtime(original_path)  #last-modification time of the original file
                state_dict[original_path] = current_modtime  #keyed by the original path

                state_path = chunk_dir + '\\' + basename(original_path) + '_state'  #pickled state file describing the original
                with open(chunk_path, 'wb') as chunk_file:
                    chunk_file.write(chunk)

                    with open(state_path, 'wb') as state:
                        pkl_dump(state_dict, state)

                counter += 1

            if exists(sliced_db):
                with open(sliced_db, 'rb') as db:  #load the existing slice index
                    chunk_dict = pkl_load(db)

            chunk_dict[original_path] = chunk_list  #record the slices of the new file

            with open(sliced_db, 'wb') as sl:  #writing the new file
                pkl_dump(chunk_dict, sl)

            for chunk in chunk_list:  #sending chunk for encryption
                self.encryption(chunk, original_path, True)
                remove(chunk)

        else:
            self.encryption(compressed_path, original_path, False)

        remove(compressed_path)