def master_node(chunk_start: int, block: int, file_path: str, q: BaseProxy) -> None:
    """Process wrapper for an individual file split.

    Performs aggregation on the file split it operates upon.

    Parameters
    ----------
    chunk_start* : int
        Byte offset of the first line of the chunk within the file.
    block* : int
        Explicit chunk size (provided by the user or through the
        RunningConstants enum).
    file_path* : str
        Relative or absolute path to the input file.
    q* : BaseProxy
        Multiprocessing manager queue supporting FIFO handling of the
        processed chunks.

    (* - Required parameters)

    Returns
    -------
    None
    """
    with open(file_path, 'r') as f:
        f.seek(chunk_start)
        lines = f.read(block).splitlines()

    process_dict = dict()
    for line in lines:
        if not line:
            continue
        if 'Driver' in line:
            # This block may be redundant if there is no requirement to
            # validate Business_Rule-1 and Business_Rule-2 (see section
            # 'Possible Implementation' in Readme.md).
            if line[7:] not in process_dict:
                process_dict[line[7:]] = (0, 0)
        elif line[:4] == 'Trip':
            driver_name, distance, time_spent = slave_node(line[5:])
            if len(driver_name):
                # Business_Rule-1 and Business_Rule-2 (see section
                # 'Possible Implementation' in Readme.md).
                if driver_name not in process_dict:
                    process_dict[driver_name] = (distance, time_spent)
                else:
                    # Add miles and time_spent to the existing record
                    # in the dictionary.
                    current_distance, current_time = process_dict[driver_name]
                    process_dict[driver_name] = (current_distance + distance,
                                                 current_time + time_spent)
    q.put(process_dict)
    return
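
# A minimal sketch of how master_node could be driven from a parent process,
# assuming the standard multiprocessing module. The helpers chunk_boundaries()
# and run_all() are illustrative names, not part of the original module; the
# final result-merging step is left out.
import multiprocessing as mp
import os


def chunk_boundaries(file_path, chunk_size):
    """Yield (start, size) pairs roughly chunk_size bytes long, ending on newlines."""
    file_end = os.path.getsize(file_path)
    with open(file_path, 'rb') as f:
        start = 0
        while start < file_end:
            f.seek(min(start + chunk_size, file_end))
            f.readline()                       # advance to the next line break (or EOF)
            end = f.tell()
            yield start, end - start
            start = end


def run_all(file_path, chunk_size=1 << 20):
    """Fan master_node out over a process pool and collect the per-chunk dicts."""
    manager = mp.Manager()
    q = manager.Queue()                        # manager queues can be shared with Pool workers
    with mp.Pool() as pool:
        pool.starmap(master_node,
                     [(start, size, file_path, q)
                      for start, size in chunk_boundaries(file_path, chunk_size)])
    results = []
    while not q.empty():
        results.append(q.get())
    return results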
def prepare_jobs(batch_iterator, model_params, schema_params, num_features,
                 model_weights: dict, enable_local_indexing: bool,
                 job_queue: BaseProxy, has_intercept: bool):
    """
    Utility method to take batches of TF grouped data and convert them into one
    or more Job objects. Useful for running training and inference.

    :param batch_iterator: TF dataset feature, label batch iterator
    :param model_params: model parameters to aid in converting to Job objects
    :param schema_params: schema parameters to aid in converting to Job objects
    :param num_features: number of features in the global space
    :param model_weights: model coefficients, dict of {model_id: TrainingResult}
    :param enable_local_indexing: whether to index the features locally instead of using global indices
    :param job_queue: a managed queue containing the generated jobs
    :param has_intercept: whether to include an intercept in the model
    :return: a generator of entity_ids

    The feature_bag is represented in sparse tensor format. Take the per_member
    feature bag as an example. The following batch has three records, two
    belonging to member #0 and one belonging to member #1.

    member #0 has two records:
        per_member_indices = [[0, 7, 60, 80, 95], [34, 57]]
        per_member_values = [[1.0, 2.0, 3.0, 5.0, 6.6], [1.0, 2.0]]
    member #1 has one record:
        per_member_indices = [[10, 11]]
        per_member_values = [[-3.5, 2.3]]

    The batch combines both members' records:
        per_member_indices = [[[0, 7, 60, 80, 95], [34, 57]], [[10, 11]]]
        per_member_values = [[[1.0, 2.0, 3.0, 5.0, 6.6], [1.0, 2.0]], [[-3.5, 2.3]]]

    Tensorflow representation of the batch above:

        SparseTensorValue(indices=array(
            [[0, 0, 0],
             [0, 0, 1],
             [0, 0, 2],
             [0, 0, 3],
             [0, 0, 4],
             [0, 1, 0],
             [0, 1, 1],
             [1, 0, 0],
             [1, 0, 1]]),
            values=array([1., 2., 3., 5., 6.6, 1., 2., -3.5, 2.3], dtype=float32),
            dense_shape=array([2, 2, 5]))

    Note the first dimension is the batch dimension.
    """
    logger.info(f"Kicking off job producer with enable_local_indexing = {enable_local_indexing}.")
    for features_val, labels_val in dataset_reader(batch_iterator()):
        # Extract number of entities in the batch
        num_entities = features_val[model_params.partition_entity].shape[0]

        # Now, construct entity_id, X, y, offsets and weights
        X_index = 0
        y_index = 0
        for entity in range(num_entities):
            ids_indices = features_val[schema_params.uid_column_name].indices
            rows = ids_indices[np.where(ids_indices[:, 0] == entity)][:, 1]
            sample_count_from_ids = rows.size
            if model_params.feature_bag is None:
                # Intercept-only model
                assert num_features == 1
                sample_count = sample_count_from_ids
                values = np.zeros(sample_count)
                cols = np.zeros(sample_count, dtype=int)
            else:
                # Construct data matrix X. Slice the portion of the arrays from
                # X_index through the number of rows for the entity.
                features = features_val[model_params.feature_bag + INDICES_SUFFIX]
                indices = features.indices
                rows = indices[np.where(indices[:, 0] == entity)][:, 1]
                cols = features.values[X_index:X_index + len(rows)]
                values = features_val[model_params.feature_bag + VALUES_SUFFIX].values[X_index:X_index + len(rows)]

                # Get sample count
                sample_count = np.amax(rows) + 1

                # Sanity check
                assert sample_count == sample_count_from_ids

            # Construct entity ID
            raw_entity_id = features_val[model_params.partition_entity][entity]
            if isinstance(raw_entity_id, bytes):
                entity_id = raw_entity_id.decode('utf-8')
            else:
                entity_id = str(raw_entity_id)
            result = model_weights.get(entity_id, None)

            # Generate index map
            unique_global_indices, locally_indexed_cols = np.unique(cols, return_inverse=True)

            if enable_local_indexing:
                # Use local indices to represent the data matrix.
                X = coo_matrix((values, (rows, locally_indexed_cols)))
            else:
                # Use global indices to represent the data matrix.
                X = coo_matrix((values, (rows, cols)), shape=(sample_count, num_features))

            # Construct y, offsets, weights and ids. Slice the portion of the
            # arrays from y_index through sample_count.
            y = labels_val[schema_params.label_column_name].values[y_index:y_index + sample_count]
            offsets = features_val[model_params.offset_column_name].values[y_index:y_index + sample_count]
            weights = (features_val[schema_params.weight_column_name].values[y_index:y_index + sample_count]
                       if schema_params.weight_column_name in features_val
                       else np.ones(sample_count))
            ids = features_val[schema_params.uid_column_name].values[y_index:y_index + sample_count]

            # If a prior model exists, get the coefficients to warm start the training.
            # Note the prior model may have fewer or more features than the current dataset.
            theta = None
            if result:
                model_rows = []
                model_values = []
                coeffs_without_intercept = result.theta[1:] if has_intercept else result.theta
                prior_model = {u: v for u, v in zip(result.unique_global_indices,
                                                    coeffs_without_intercept)}
                idx_offset = 0
                if has_intercept:
                    # Account for the intercept term
                    model_rows.append(0)
                    model_values.append(result.theta[0])
                    idx_offset = 1
                for i, u in enumerate(unique_global_indices):
                    if u in prior_model:
                        r = i if enable_local_indexing else u
                        model_rows.append(idx_offset + r)  # +1 if bias is the first element.
                        model_values.append(prior_model[u])
                model_cols = [0] * len(model_rows)
                if enable_local_indexing:
                    # +1 if bias is used
                    coeffs_length = len(unique_global_indices) + 1 if has_intercept else len(unique_global_indices)
                else:
                    # +1 if bias is used
                    coeffs_length = num_features + 1 if has_intercept else num_features
                theta = csr_matrix((model_values, (model_rows, model_cols)),
                                   shape=(coeffs_length, 1))

            job = Job(entity_id, X, y, offsets, weights, ids, unique_global_indices, theta=theta)
            job_queue.put(job)

            # Use entity_id as a token; it may not be unique.
            yield entity_id

            # Update X_index and y_index
            y_index += sample_count
            X_index += len(rows)
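
# A standalone sketch of the indexing step above, using member #0 from the
# docstring example. It shows how np.unique(..., return_inverse=True) remaps
# global feature ids to a compact local column space before building the COO
# data matrix; num_features = 100 is an illustrative assumption.
import numpy as np
from scipy.sparse import coo_matrix

# member #0: two samples, flattened into (row, global_col, value) triples
rows = np.array([0, 0, 0, 0, 0, 1, 1])
cols = np.array([0, 7, 60, 80, 95, 34, 57])             # global feature indices
values = np.array([1.0, 2.0, 3.0, 5.0, 6.6, 1.0, 2.0])

unique_global_indices, locally_indexed_cols = np.unique(cols, return_inverse=True)
# unique_global_indices -> [0, 7, 34, 57, 60, 80, 95]
# locally_indexed_cols  -> [0, 1, 4, 5, 6, 2, 3]

# Local indexing: the matrix is only as wide as the features this entity uses.
X_local = coo_matrix((values, (rows, locally_indexed_cols)))            # shape (2, 7)

# Global indexing: the matrix spans the full feature space.
num_features = 100
X_global = coo_matrix((values, (rows, cols)), shape=(2, num_features))  # shape (2, 100)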
def _run_command(self, cmd: list[str], execution_time: list[float],
                 gpu_queue: BaseProxy = None) -> bool:
    """Run the encoding or decoding command.

    Depending on the ``debug`` flag, this either raises
    ``subprocess.CalledProcessError`` or silently increments the failure
    count. Error messages are logged to timestamped files in ``logs/``.

    Parameters
    ----------
    cmd : `list[str]`
        Command to execute, in the same format subprocess uses.
    execution_time : `list[float]`
        A list to store the execution time.
    gpu_queue : `BaseProxy`, optional
        A multiprocessing Manager.Queue() object. The queue stores the GPU
        device IDs obtained from GPUtil.getAvailable(). Must be assigned if
        running a PCC algorithm using GPUs. Defaults to None.

    Returns
    -------
    `bool`
        True if successfully executed, False otherwise.

    Raises
    ------
    `subprocess.CalledProcessError`
        If the command fails and ``debug`` is True.
    """
    if gpu_queue is not None:
        gpu_id = gpu_queue.get()
        # Inject the environment variable `CUDA_VISIBLE_DEVICES` into the
        # subprocess environment.
        env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(gpu_id))
    else:
        env = os.environ

    try:
        start_time = time.time()
        sp.run(cmd, cwd=self._algs_cfg['rootdir'], env=env,
               capture_output=True, check=True)
        end_time = time.time()
    except sp.CalledProcessError as e:
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H:%M:%S')
        log_file = (Path(__file__).parents[1]
                    .joinpath(f'logs/execute_cmd_{timestamp}.log'))
        with open(log_file, 'w') as f:
            lines = [
                "The stdout and stderr of the executed command: ",
                f"{' '.join(str(s) for s in cmd)}",
                "\n",
                "===== stdout =====",
                f"{e.stdout.decode('utf-8')}",
                "\n",
                "===== stderr =====",
                f"{e.stderr.decode('utf-8')}",
            ]
            f.writelines('\n'.join(lines))
        logger.error(
            f"Error occurred when executing command: "
            f"{' '.join(str(s) for s in cmd)}\n"
            f"Check {log_file} for more information.")
        if self._debug is True:
            raise e
        self._failure_cnt += 1
        return False
    else:
        execution_time[0] = end_time - start_time
        return True
    finally:
        if gpu_queue is not None:
            gpu_queue.put(gpu_id)
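
# A minimal sketch of how the GPU queue could be prepared for _run_command,
# assuming the GPUtil package is installed. make_gpu_queue() and `evaluator`
# (standing in for whatever object owns _run_command) are illustrative names,
# not part of the original module.
import multiprocessing as mp

import GPUtil


def make_gpu_queue(max_load=0.5, max_memory=0.5):
    """Fill a managed FIFO queue with the IDs of currently available GPUs."""
    manager = mp.Manager()
    gpu_queue = manager.Queue()
    for gpu_id in GPUtil.getAvailable(limit=8, maxLoad=max_load, maxMemory=max_memory):
        gpu_queue.put(gpu_id)
    return gpu_queue


# Illustrative usage (command and object names are assumptions):
# gpu_queue = make_gpu_queue()
# execution_time = [0.0]
# ok = evaluator._run_command(['./encoder', 'in.ply', 'out.bin'],
#                             execution_time, gpu_queue=gpu_queue)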