Beispiel #1
0
def report_load_from_pipeline_reason(
    source: DataSource,
    pipeline: SourceCreatingPipeline,
    reason: LoadFromPipelineReason
):
    """
    Log an explanation of why a data source is being loaded via its pipeline.

    :param source: the data source that is about to be loaded
    :param pipeline: the pipeline that creates the source
    :param reason: which condition triggered the pipeline-based load
    """
    if reason == LoadFromPipelineReason.NO_LAST_MODIFIED_IN_PIPELINE:
        message = (
            f"Was not able to determine last modified of pipeline "
            f"{pipeline.name}. Will always run pipeline due to this. "
            f"Consider manually setting last_modified when creating "
            f"the pipeline."
        )
        logger.warning(message)
    elif reason == LoadFromPipelineReason.NO_DATA_AT_LOCATION:
        message = (
            f"Was not able to determine last modified of source "
            f"{source.name}. Will run pipeline due to this. "
            f"This is due to no file currently existing for this source."
        )
        logger.warning(message)
    elif reason == LoadFromPipelineReason.PIPELINE_NEWER:
        recent_obj, obj_lm = source.pipeline_obj_last_modified
        # Operations expose no .name attribute; fall back to the
        # pipeline's name in that case.
        recent_obj_name = getattr(recent_obj, 'name', pipeline.name)
        message = (
            f'{recent_obj_name} was modified at {obj_lm}. '
            f'This data source {source.name} was modified at '
            f'{source.last_modified}. To get new changes, '
            f'will load this data source through pipeline '
            f'rather than from file.'
        )
        logger.info(message)
Beispiel #2
0
 def run_pipeline_then_load(pipeline: SourceCreatingPipeline):
     """
     Execute the pipeline (which writes its output to file), then load
     and return the resulting data source.

     NOTE(review): this references ``self`` and ``loader`` which are not
     parameters — presumably a closure capturing the enclosing method's
     scope; confirm against the surrounding code.
     """
     logger.info(f'Running pipeline then loading source {self.name}')
     pipeline.execute() # outputs to file
     # Only allow the loader to modify the result in place when the
     # pipeline explicitly permits it.
     preserve = not pipeline.allow_modifying_result
     return loader.load_from_existing_source(
         pipeline.result,
         preserve_original=preserve
     )
Beispiel #3
0
 def summary(self,
             *summary_args,
             summary_method: str = None,
             summary_function: Callable = None,
             summary_attr: str = None,
             **summary_method_kwargs):
     """Log a one-line summary of this transformation pipeline."""
     # TODO [#53]: better summary for DataTransformationPipeline
     message = (f'Calls transform {self.options.transform} on existing '
                f'data source {self.data_source}')
     logger.info(message)
Beispiel #4
0
 def summary(self,
             *summary_args,
             summary_method: str = None,
             summary_function: Callable = None,
             summary_attr: str = None,
             **summary_method_kwargs):
     """Log a one-line summary of this generator pipeline."""
     # TODO [#46]: better summary for DataGeneratorPipeline
     message = (
         f'Calling function {self.options.func.__name__} with kwargs {self.options.func_kwargs} to '
         f'generate a DataSource'
     )
     logger.info(message)
Beispiel #5
0
 def summary(self,
             *summary_args,
             summary_method: str = None,
             summary_function: Callable = None,
             summary_attr: str = None,
             **summary_method_kwargs):
     """Log a one-line summary of this combination pipeline."""
     # TODO [#73]: better summary for DataCombinationPipeline
     message = (
         f'Combining {self.data_sources[0]} with {self.data_sources[1]} with '
         f'options {self.options}'
     )
     logger.info(message)
Beispiel #6
0
 def summary(self,
             *summary_args,
             summary_method: str = None,
             summary_function: Callable = None,
             summary_attr: str = None,
             **summary_method_kwargs):
     """Log what this analysis pipeline runs, then a description of its source."""
     # TODO [#45]: better summary for DataAnalysisPipeline
     message = (
         f'Runs func {self.options.func.__name__} with kwargs {self.options.func_kwargs} on {self.data_source}:'
     )
     logger.info(message)
     logger.info(f'{self.data_source.describe()}')
Beispiel #7
0
    def _do_operation(self):
        """
        Execute the next queued operation and advance the operation index.

        Updates ``self.df`` with the operation's result when that result is a
        DataSource (analysis results leave the current df untouched).

        :raises LastOperationFinishedException: when no operations remain
        """
        # Keep the try body minimal: only the index lookup can raise
        # IndexError; logging belongs outside the try.
        try:
            operation = self.operations[self._operation_index]
        except IndexError:
            raise LastOperationFinishedException
        logger.info(
            f'Now running operation {self._operation_index + 1}: {operation}'
        )

        operation.execute()

        # Set current df to result of merge
        if isinstance(operation.result, DataSource):
            # Need to check as may be analysis result, in which case df should not be changed
            self.df = operation.result.df

        self._operation_index += 1
Beispiel #8
0
    def _execute(self):
        """
        Merge the two data sources' DataFrames into ``self.result.df``.

        Runs the configured merge function (plus an optional post-merge
        function), then — unless the caller supplied explicit ``columns`` in
        ``result_kwargs`` — derives the result's ``load_variables`` and
        ``columns`` from the left and right sources, filtered by the
        ``*_df_keep_cols`` options. Finally logs observation counts.
        """
        logger.info(f'Running merge function {self.merge_str}')
        left_df, right_df = self._get_merge_dfs()
        self.result.df = self.options.merge_function(
            left_df, right_df, self.options.on_names,
            **self.options.merge_function_kwargs)
        # Optional hook to transform the merged DataFrame before it is stored
        if self.options.post_merge_func is not None:
            self.result.df = self.options.post_merge_func(self.result.df)

        # TODO [#78]: merge source variable combine logic doesn't seem to be working completely correctly
        #
        # Had to put safe=False in merge pipeline output to make it happen
        if 'columns' not in self.result_kwargs:
            left_ds, right_ds = self.data_sources[0], self.data_sources[1]
            load_variables = []
            columns = []
            # Left source: keep every variable unless a keep-list was given,
            # in which case keep only the listed names
            if left_ds.load_variables:
                for var in left_ds.load_variables:
                    if self.options.left_df_keep_cols is None or var.name in self.options.left_df_keep_cols:
                        load_variables.append(var)
                        columns.append(left_ds.col_for(var))
            # Right source: same filtering, plus de-duplication (see below)
            if right_ds.load_variables:
                for var in right_ds.load_variables:
                    if self.options.right_df_keep_cols is None or var.name in self.options.right_df_keep_cols:
                        # don't repeat variables and columns. Merge on variables will be repeated,
                        # perhaps even with different transformations, so explicitly skip them
                        if var not in load_variables and var.name not in self.options.on_names:
                            load_variables.append(var)
                            columns.append(right_ds.col_for(var))
            self.result.columns = columns
            self.result.load_variables = load_variables

        # Summarize merge cardinality so unexpected row growth/loss is visible
        logger.info(f"""
        {self.data_sources[0].name} obs: {len(left_df)}
        {self.data_sources[1].name} obs: {len(right_df)}
        Merged obs: {len(self.result.df)}
        """)