def report_load_from_pipeline_reason(
    source: DataSource,
    pipeline: SourceCreatingPipeline,
    reason: LoadFromPipelineReason,
):
    """Log why *source* will be produced by running *pipeline*.

    Emits a warning for the two "could not determine last modified" cases
    and an informational message when the pipeline is simply newer than the
    existing data source on disk.
    """
    if reason == LoadFromPipelineReason.NO_LAST_MODIFIED_IN_PIPELINE:
        message = (
            f"Was not able to determine last modified of pipeline "
            f"{pipeline.name}. Will always run pipeline due to this. "
            f"Consider manually setting last_modified when creating "
            f"the pipeline."
        )
        logger.warning(message)
    elif reason == LoadFromPipelineReason.NO_DATA_AT_LOCATION:
        message = (
            f"Was not able to determine last modified of source "
            f"{source.name}. Will run pipeline due to this. "
            f"This is due to no file currently existing for this source."
        )
        logger.warning(message)
    elif reason == LoadFromPipelineReason.PIPELINE_NEWER:
        newest_obj, newest_lm = source.pipeline_obj_last_modified
        try:
            newest_name = newest_obj.name
        except AttributeError:
            # Must be Operation, get name from pipeline instead
            newest_name = pipeline.name
        message = (
            f'{newest_name} was modified at {newest_lm}. '
            f'This data source {source.name} was modified at '
            f'{source.last_modified}. To get new changes, '
            f'will load this data source through pipeline '
            f'rather than from file.'
        )
        logger.info(message)
def run_pipeline_then_load(pipeline: SourceCreatingPipeline):
    """Execute *pipeline*, then load and return the data source it produced.

    NOTE(review): this body references ``self`` and ``loader``, neither of
    which is a parameter — presumably this is a closure nested inside a
    DataSource method where both are in scope. Verify at the definition
    site; as a free function this would raise NameError.
    """
    logger.info(f'Running pipeline then loading source {self.name}')
    pipeline.execute()  # outputs to file
    # Load from the pipeline's in-memory result; copy it unless the
    # pipeline explicitly allows its result object to be modified.
    result = loader.load_from_existing_source(
        pipeline.result,
        preserve_original=not pipeline.allow_modifying_result
    )
    return result
def summary(self, *summary_args, summary_method: str = None,
            summary_function: Callable = None, summary_attr: str = None,
            **summary_method_kwargs):
    """Log a one-line description of the transformation this pipeline runs.

    The summary_* parameters are accepted for interface compatibility with
    other pipeline summary() implementations but are not used here.
    """
    # TODO [#53]: better summary for DataTransformationPipeline
    message = (
        f'Calls transform {self.options.transform} on existing '
        f'data source {self.data_source}'
    )
    logger.info(message)
def summary(self, *summary_args, summary_method: str = None,
            summary_function: Callable = None, summary_attr: str = None,
            **summary_method_kwargs):
    """Log a one-line description of the generator function this pipeline runs.

    The summary_* parameters are accepted for interface compatibility with
    other pipeline summary() implementations but are not used here.
    """
    # TODO [#46]: better summary for DataGeneratorPipeline
    func = self.options.func
    message = (
        f'Calling function {func.__name__} with kwargs '
        f'{self.options.func_kwargs} to generate a DataSource'
    )
    logger.info(message)
def summary(self, *summary_args, summary_method: str = None,
            summary_function: Callable = None, summary_attr: str = None,
            **summary_method_kwargs):
    """Log a one-line description of the combination this pipeline performs.

    The summary_* parameters are accepted for interface compatibility with
    other pipeline summary() implementations but are not used here.
    """
    # TODO [#73]: better summary for DataCombinationPipeline
    first, second = self.data_sources[0], self.data_sources[1]
    logger.info(
        f'Combining {first} with {second} with '
        f'options {self.options}'
    )
def summary(self, *summary_args, summary_method: str = None,
            summary_function: Callable = None, summary_attr: str = None,
            **summary_method_kwargs):
    """Log what analysis function will run and a description of its input.

    The summary_* parameters are accepted for interface compatibility with
    other pipeline summary() implementations but are not used here.
    """
    # TODO [#45]: better summary for DataAnalysisPipeline
    opts = self.options
    logger.info(
        f'Runs func {opts.func.__name__} with kwargs {opts.func_kwargs} on {self.data_source}:'
    )
    logger.info(f'{self.data_source.describe()}')
def _do_operation(self):
    """Execute the next queued operation and advance the internal index.

    Raises LastOperationFinishedException once every operation has run.
    """
    idx = self._operation_index
    try:
        current_op = self.operations[idx]
    except IndexError:
        raise LastOperationFinishedException
    logger.info(
        f'Now running operation {idx + 1}: {current_op}'
    )
    current_op.execute()
    # Only a DataSource result should replace the working df; an analysis
    # result leaves the current df untouched.
    if isinstance(current_op.result, DataSource):
        self.df = current_op.result.df
    self._operation_index = idx + 1
def _execute(self):
    """Run the configured merge function on the two source dataframes and
    store the merged result, combining the sources' load variables and
    columns onto the result unless the caller supplied explicit columns.
    """
    logger.info(f'Running merge function {self.merge_str}')
    left_df, right_df = self._get_merge_dfs()
    # Delegate the actual merge to the configured callable (e.g. a
    # pandas-merge wrapper) keyed on the shared on_names columns.
    self.result.df = self.options.merge_function(
        left_df, right_df, self.options.on_names,
        **self.options.merge_function_kwargs)
    # Optional caller-supplied hook to adjust the merged frame.
    if self.options.post_merge_func is not None:
        self.result.df = self.options.post_merge_func(self.result.df)
    # TODO [#78]: merge source variable combine logic doesn't seem to be working completely correctly
    #
    # Had to put safe=False in merge pipeline output to make it happen
    # Only derive columns/load_variables when the caller did not pass
    # explicit columns for the result.
    if 'columns' not in self.result_kwargs:
        left_ds, right_ds = self.data_sources[0], self.data_sources[1]
        load_variables = []
        columns = []
        # Keep left-source variables, filtered by left_df_keep_cols when set
        # (None means keep everything).
        if left_ds.load_variables:
            for var in left_ds.load_variables:
                if self.options.left_df_keep_cols is None or var.name in self.options.left_df_keep_cols:
                    load_variables.append(var)
                    columns.append(left_ds.col_for(var))
        # Same for the right source, with the same None-means-all rule.
        if right_ds.load_variables:
            for var in right_ds.load_variables:
                if self.options.right_df_keep_cols is None or var.name in self.options.right_df_keep_cols:
                    # don't repeat variables and columns. Merge on variables will be repeated,
                    # perhaps even with different transformations, so explicitly skip them
                    if var not in load_variables and var.name not in self.options.on_names:
                        load_variables.append(var)
                        columns.append(right_ds.col_for(var))
        self.result.columns = columns
        self.result.load_variables = load_variables
    # Observation counts before/after the merge, for quick sanity checking.
    # NOTE(review): the collapsed source obscured the exact leading
    # whitespace inside this triple-quoted f-string — confirm against the
    # original file.
    logger.info(f"""
    {self.data_sources[0].name} obs: {len(left_df)}
    {self.data_sources[1].name} obs: {len(right_df)}
    Merged obs: {len(self.result.df)}
    """)