def apply_async(self, func, callback=None):
    # Build a unique name for this execution, then serialize the result of
    # calling func with cloudpickle, base64-encoded so it can travel as text.
    exec_func = 'func{}{}'.format(round(time.time()),
                                  random.randint(0, round(time.time())))
    pickle_func = codecs.encode(cloudpickle.dumps(func()), 'base64').decode()
    write_func = '{}/{}.csv'.format(get_file_dir(__file__), exec_func)
    if self.mode == 'local':
        return self._get_pool().apply_async(
            SafeFunction(self._local_execute_func),
            args=[
                exec_func,
                write_func,
                pickle_func,
                self._python_path,
            ],
            callback=callback)
    else:
        exec_dict = {exec_func: pickle_func}
        return self._get_pool().apply_async(
            SafeFunction(self._remote_execute_func),
            args=[
                exec_func,
                write_func,
                exec_dict,
                self._job_manager,
                self._python_path,
            ],
            callback=callback)
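# The worker-side _local_execute_func / _remote_execute_func are not shown in
# this snippet. A minimal, hypothetical sketch of the decode step they would
# need, assuming the payload is exactly the base64 text of a cloudpickle dump
# produced by apply_async above:
import codecs

import cloudpickle


def _decode_payload(pickle_func):
    # Reverse the codecs.encode(..., 'base64').decode() done by apply_async.
    return cloudpickle.loads(codecs.decode(pickle_func.encode(), 'base64'))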
def startOneJob(self, userFunc, jobInfo):
    """
    Start one job. Uses the MPI send call. MPI does a certain level of
    pickling, but we do our own here so that the function etc gets pickled.

    """
    jobInfo = jobInfo.prepareForPickling()

    allInputs = (userFunc, jobInfo)
    allInputsPickled = cloudpickle.dumps(allInputs)

    # send info off to sub process
    # we also send a flag telling the subprocess
    # not to exit and be ready for another message
    self.comm.send([True, allInputsPickled], dest=self.dest)

    # return the current one
    proc = self.dest

    # set self.dest back to zero if we have done them all
    self.dest += 1
    if self.dest >= (self.numSubJobs - 1):
        self.dest = 0

    return proc
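# The receiving side of this protocol is not shown here. A hypothetical sketch
# of the sub-process loop, assuming it receives the [keepGoing, payload] list
# sent above from rank 0, and that a False flag with an empty payload means
# exit; the calling convention and reply channel are illustrations only:
import pickle


def subprocess_loop(comm):
    keepGoing = True
    while keepGoing:
        keepGoing, allInputsPickled = comm.recv(source=0)
        if allInputsPickled is not None:
            # cloudpickle output loads fine with the standard pickle module.
            userFunc, jobInfo = pickle.loads(allInputsPickled)
            outputs = userFunc(jobInfo)   # assumed calling convention
            comm.send(outputs, dest=0)    # assumed reply channel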
def dumps_fn(fn: Callable) -> bytes:
    fn_hash = _get_fn_hash(fn)
    try:
        fn_bytes = _fn_dump_cache[fn_hash]
    except KeyError:
        fn_bytes = cloudpickle.dumps(fn)
        _fn_dump_cache[fn_hash] = fn_bytes
    return fn_bytes
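# Neither _get_fn_hash, _fn_dump_cache, nor the inverse operation is shown
# above. A minimal sketch under the assumption that the cache only needs to
# key functions by object identity, plus the obvious loads counterpart:
from typing import Callable

import cloudpickle

_fn_dump_cache: dict = {}


def _get_fn_hash(fn: Callable) -> int:
    # Identity-based key: avoids re-pickling the same function object twice.
    return id(fn)


def loads_fn(fn_bytes: bytes) -> Callable:
    return cloudpickle.loads(fn_bytes)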
def extract_feature_vecs(candset, ltable, rtable, feature_table,
                         show_progress=True, n_jobs=1):
    key = "_id"
    l_key = "id"
    r_key = "id"
    fk_rtable = "rtable_id"
    fk_ltable = "ltable_id"

    # Extract features

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)  # 1
    fk_rtable_idx = col_names.index(fk_rtable)  # 2

    n_procs = get_num_procs(n_jobs, len(candset))

    c_splits = pd.np.array_split(candset, n_procs)
    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = Parallel(n_jobs=n_procs)(
        delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_splits[i], show_progress and i == len(c_splits) - 1)
        for i in range(len(c_splits)))

    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    return feature_vectors
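# get_feature_vals_by_cand_split is not included in this snippet. Below is a
# hypothetical sketch of such a per-split worker, assuming the feature table
# carries 'feature_name' and 'function' columns; the real helper may differ.
# Unpickling inside the worker means the feature functions cross the process
# boundary only once per split.
import cloudpickle


def get_feature_vals_by_cand_split(pickled_obj, fk_ltable_idx, fk_rtable_idx,
                                   l_df, r_df, c_split, show_progress):
    # Recover the feature table inside the worker process.
    feature_table = cloudpickle.loads(pickled_obj)
    names = list(feature_table['feature_name'])
    funcs = list(feature_table['function'])
    feat_vals = []
    for row in c_split.itertuples(index=False):
        l_tuple = l_df.loc[row[fk_ltable_idx]]
        r_tuple = r_df.loc[row[fk_rtable_idx]]
        feat_vals.append({name: fn(l_tuple, r_tuple)
                          for name, fn in zip(names, funcs)})
    return feat_vals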
def startOneJob(self, userFunc, jobInfo):
    """
    Start one job. We execute the rios_subproc.py command, communicating
    via its stdin/stdout. We give it the pickled function and all input
    objects, and we get back a pickled outputs object.

    """
    jobInfo = jobInfo.prepareForPickling()

    allInputs = (userFunc, jobInfo)
    allInputsPickled = cloudpickle.dumps(allInputs)

    proc = subprocess.Popen(['rios_subproc.py'], stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)

    proc.stdin.write(allInputsPickled)

    return proc
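# The rios_subproc.py worker itself is not shown here. A hypothetical
# stand-in (not the real script) that mirrors the protocol implied above:
# pickled (userFunc, jobInfo) arrives on stdin, pickled outputs go to stdout.
# The calling convention userFunc(jobInfo) is an assumption for illustration.
import pickle
import sys


def subproc_main():
    userFunc, jobInfo = pickle.loads(sys.stdin.buffer.read())
    outputs = userFunc(jobInfo)
    sys.stdout.buffer.write(pickle.dumps(outputs))


if __name__ == '__main__':
    subproc_main()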
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True, n_jobs=1):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses the feature table, ltable and rtable
    (present in the `candset`'s metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the feature
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing to the
        same ltable and rtable as the input candset.

        Also, the output DataFrame will have three columns: key, foreign key
        ltable, foreign key rtable copied from input candset to the output
        DataFrame. These three columns precede the columns mentioned in
        `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that are not present
            in the input candset.
        AssertionError: If `attrs_after` has attributes that are not present
            in the input candset.
        AssertionError: If `feature_table` is set to None.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])

    """
    # Validate input parameters
    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If attrs_before is given, check that the attrs_before are present in
    # the input candset
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')

    # # If attrs_after is given, check that the attrs_after are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features

    # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in
    #            candset.iterrows()]
    # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values]

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    n_procs = get_num_procs(n_jobs, len(candset))

    c_splits = pd.np.array_split(candset, n_procs)
    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = Parallel(n_jobs=n_procs)(
        delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_splits[i], show_progress and i == len(c_splits) - 1)
        for i in range(len(c_splits)))

    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]
    ch.log_info(logger, 'Constructing output table', verbose)
    # print(feature_vectors)

    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Reset the index
    # feature_vectors.reset_index(inplace=True, drop=True)

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def run_experiment(
        self,
        method_call=None,
        batch_tasks=None,
        exp_prefix="experiment",
        exp_name=None,
        log_dir=None,
        script="garage.experiment.experiment_wrapper",
        python_command="python",
        dry=False,
        env=None,
        variant=None,
        force_cpu=False,
        pre_commands=None,
        **kwargs,
):
    """Serialize the method call and run the experiment using the
    specified mode.

    Args:
        method_call (callable): A method call.
        batch_tasks (list[dict]): A batch of method calls.
        exp_prefix (str): Name prefix for the experiment.
        exp_name (str): Name of the experiment.
        log_dir (str): Log directory for the experiment.
        script (str): The name of the entry-point python script.
        python_command (str): Python command to run the experiment.
        dry (bool): Whether to do a dry-run, which only prints the commands
            without executing them.
        env (dict): Extra environment variables.
        variant (dict): If provided, should be a dictionary of parameters.
        force_cpu (bool): Whether to set all GPU devices invisible to force
            use of the CPU.
        pre_commands (str): Pre commands to run the experiment.

    """
    if method_call is None and batch_tasks is None:
        raise Exception(
            "Must provide at least either method_call or batch_tasks")

    for task in batch_tasks or [method_call]:
        if not hasattr(task, "__call__"):
            raise ValueError("batch_tasks should be callable")

    # ensure variant exists
    if variant is None:
        variant = dict()

    if batch_tasks is None:
        batch_tasks = [
            dict(
                kwargs,
                pre_commands=pre_commands,
                method_call=method_call,
                exp_name=exp_name,
                log_dir=log_dir,
                env=env,
                variant=variant,
            )
        ]

    global exp_count

    if force_cpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    for task in batch_tasks:
        call = task.pop("method_call")
        data = base64.b64encode(cloudpickle.dumps(call)).decode("utf-8")
        task["args_data"] = data
        exp_count += 1

        if task.get("exp_name", None) is None:
            task["exp_name"] = f"{exp_prefix}_{time.time()}_{exp_count:04n}"

        if task.get("log_dir", None) is None:
            task["log_dir"] = (
                f"{Path.cwd() / 'data'}/local/{exp_prefix.replace('_', '-')}/"
                f"{task['exp_name']}")

        if task.get("variant", None) is not None:
            variant = task.pop("variant")
            if "exp_name" not in variant:
                variant["exp_name"] = task["exp_name"]
            task["variant_data"] = base64.b64encode(
                pickle.dumps(variant)).decode("utf-8")
        elif "variant" in task:
            del task["variant"]

        task["env"] = task.get("env", dict()) or dict()
        task["env"]["GARAGE_FORCE_CPU"] = str(force_cpu)

    for task in batch_tasks:
        env = task.pop("env", None)
        command = garage.to_local_command(task,
                                          python_command=python_command,
                                          script=script)
        print(command)
        if dry:
            return
        try:
            if env is None:
                env = dict()
            subprocess.run(command,
                           shell=True,
                           env=dict(os.environ, **env),
                           check=True)
        except Exception as e:
            print(e)
            raise
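# The wrapper script named in `script` is expected to undo this encoding. A
# hypothetical sketch of that decode step (not the actual
# garage.experiment.experiment_wrapper), assuming args_data and variant_data
# arrive as the base64 strings built above:
import base64
import pickle

import cloudpickle


def decode_task(args_data, variant_data=None):
    method_call = cloudpickle.loads(base64.b64decode(args_data))
    variant = {}
    if variant_data is not None:
        variant = pickle.loads(base64.b64decode(variant_data))
    return method_call, variant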
def dask_extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                              attrs_after=None, verbose=False,
                              show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses the feature table, ltable and rtable
    (present in the `candset`'s metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the feature
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set.
            If it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing to the
        same ltable and rtable as the input candset.

        Also, the output DataFrame will have three columns: key, foreign key
        ltable, foreign key rtable copied from input candset to the output
        DataFrame. These three columns precede the columns mentioned in
        `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that are not present
            in the input candset.
        AssertionError: If `attrs_after` has attributes that are not present
            in the input candset.
        AssertionError: If `feature_table` is set to None.
        AssertionError: If `n_chunks` is not of type int.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = dask_extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])

    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
    )

    # Validate input parameters
    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If attrs_before is given, check that the attrs_before are present in
    # the input candset
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')

    # # If attrs_after is given, check that the attrs_after are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
                'fk rtable, '
                'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features

    # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in
    #            candset.iterrows()]
    # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values]

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)
    n_chunks = get_num_partitions(n_chunks, len(candset))

    c_splits = np.array_split(candset, n_chunks)
    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = []

    for i in range(len(c_splits)):
        partial_result = delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_splits[i], False)
        feat_vals_by_splits.append(partial_result)

    feat_vals_by_splits = delayed(wrap)(feat_vals_by_splits)
    if show_progress:
        with ProgressBar():
            feat_vals_by_splits = feat_vals_by_splits.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        feat_vals_by_splits = feat_vals_by_splits.compute(
            scheduler="processes", num_workers=get_num_cores())

    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]
    ch.log_info(logger, 'Constructing output table', verbose)
    # print(feature_vectors)

    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Reset the index
    # feature_vectors.reset_index(inplace=True, drop=True)

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def serialize(data):
    code = cloudpickle.dumps(data)
    return base64.b64encode(code).decode("utf-8")
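# The inverse is not shown in the source; a minimal sketch, assuming the
# payload is exactly the base64 string produced by serialize above:
import base64

import cloudpickle


def deserialize(payload):
    return cloudpickle.loads(base64.b64decode(payload))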
def __getstate__(self):
    return cloudpickle.dumps(self._x)
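# A __getstate__ like this typically pairs with a __setstate__ that reverses
# it. The wrapper class below is a hypothetical, self-contained illustration
# (not part of the source) of why the pattern is useful: the wrapped object
# is pickled by value with cloudpickle, so even a lambda survives a plain
# pickle round-trip of the wrapper.
import pickle

import cloudpickle


class CloudpickleWrapper:
    def __init__(self, x):
        self._x = x

    def __getstate__(self):
        return cloudpickle.dumps(self._x)

    def __setstate__(self, state):
        self._x = cloudpickle.loads(state)


# Example: a bare lambda cannot be pickled, but the wrapper can.
wrapped = pickle.loads(pickle.dumps(CloudpickleWrapper(lambda v: v + 1)))
assert wrapped._x(1) == 2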
def startOneJob(self, userFunc, jobInfo):
    """
    Start one job. We create a shell script to submit to a SLURM batch queue.
    When executed, the job will execute the rios_subproc.py command, giving
    it the names of two pickle files. The first is the pickle of all inputs
    (including the function), and the second is where it will write the
    pickle of outputs.

    Uses $RIOS_SLURMJOBMGR_SBATCHOPTIONS to pick up any desired options to
    the sbatch command. This should be used to control such things as
    requested amount of memory or walltime for each job, which will
    otherwise be defaulted by SLURM.

    """
    jobInfo = jobInfo.prepareForPickling()
    allInputs = (userFunc, jobInfo)
    allInputsPickled = cloudpickle.dumps(allInputs)

    (fd, inputsfile) = tempfile.mkstemp(prefix='rios_slurmin_',
                                        dir=self.tempdir, suffix='.tmp')
    os.close(fd)

    outputsfile = inputsfile.replace('slurmin', 'slurmout')
    scriptfile = inputsfile.replace('slurmin', 'slurm').replace('.tmp', '.sl')
    logfile = outputsfile.replace('.tmp', '.log')

    sbatchOptions = os.getenv('RIOS_SLURMJOBMGR_SBATCHOPTIONS')

    scriptCmdList = [
        "#!/bin/bash",
        "#SBATCH -o %s" % logfile,
        "#SBATCH -e %s" % logfile
    ]
    if sbatchOptions is not None:
        scriptCmdList.append("#SBATCH %s" % sbatchOptions)

    slurmInitCmds = os.getenv('RIOS_SLURMJOBMGR_INITCMDS')
    if slurmInitCmds is not None:
        scriptCmdList.append(slurmInitCmds)

    scriptCmdList.append("rios_subproc.py %s %s" % (inputsfile, outputsfile))
    scriptStr = '\n'.join(scriptCmdList)

    open(scriptfile, 'w').write(scriptStr + '\n')
    open(inputsfile, 'wb').write(allInputsPickled)

    submitCmdWords = ["sbatch", scriptfile]
    proc = subprocess.Popen(submitCmdWords, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            universal_newlines=True)
    # The sbatch command exits almost immediately, printing the SLURM job id
    # to stdout. So, we just wait for the sbatch to finish, and grab the
    # jobID string.
    (stdout, stderr) = proc.communicate()
    slurmOutputList = stdout.strip().split()
    slurmJobID = None
    # slurm prints a sentence to the stdout:
    # 'Submitted batch job X'
    if len(slurmOutputList) >= 4:
        slurmJobID = slurmOutputList[3]

    # Remove the script file, assuming that sbatch took a copy of it.
    os.remove(scriptfile)

    # If there was something in stderr from the sbatch command, then probably
    # something bad happened, so we pass it on to the user in the form of
    # an exception.
    if slurmJobID is None or len(stderr) > 0:
        msg = "Error from sbatch. Message:\n" + stderr
        raise rioserrors.JobMgrError(msg)

    return (slurmJobID, outputsfile, logfile)
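# How the outputs pickle gets collected once SLURM runs the job is not part
# of this snippet. A hypothetical collection step (not the RIOS
# implementation), assuming the worker eventually writes its pickled outputs
# to the `outputsfile` path returned above:
import os
import pickle
import time


def waitForJobOutputs(outputsfile, pollInterval=5):
    # Poll until the pickled outputs appear, then load and return them.
    while not os.path.exists(outputsfile):
        time.sleep(pollInterval)
    with open(outputsfile, 'rb') as f:
        return pickle.load(f)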
def serialize(fn, args: Tuple[Any] = None, kwargs: Dict[Any, Any] = None):
    code = cloudpickle.dumps(dict(thunk=fn, args=args, kwargs=kwargs))
    return base64.b64encode(code).decode("utf-8")
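# An assumed inverse (not shown in the source): decode the payload built by
# serialize above and invoke the captured thunk with its arguments.
import base64

import cloudpickle


def deserialize_and_call(payload: str):
    data = cloudpickle.loads(base64.b64decode(payload))
    args = data["args"] or ()
    kwargs = data["kwargs"] or {}
    return data["thunk"](*args, **kwargs)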