Beispiel #1
0
    def __init__(self, **kwargs):
        """Initialize link instance.

        :param str name: name of link
        :param str read_key: key of input data to read from data store
        :param str results_path: output path of summary result files
        :param str column: column pick up from input data to use as boxplot input
        :param list cause_columns: list of columns (str) to group-by, and per unique value plot a boxplot
        :param dict var_labels: dict of column names with a label per column
        :param dict var_units: dict of column names with a unit per column
        :param list statistics: a list of strings of the statistics you want to generate for the boxplot
               the full list is taken from statistics.ArrayStats.get_latex_table
               defaults to: ['count', 'mean', 'min', 'max']
        :param str pages_key: data store key of existing report pages
        """
        # initialize Link
        Link.__init__(self, kwargs.pop('name', 'df_boxplot'))

        # process keyword arguments
        self._process_kwargs(kwargs,
                             read_key='',
                             results_path='',
                             column=None,
                             cause_columns=None,
                             var_labels={},
                             var_units={},
                             statistics=['count', 'mean', 'min', 'max'],
                             pages_key='')
        self.check_extra_kwargs(kwargs)

        # initialize attributes
        self.pages = []
    def __init__(self, **kwargs):
        """Set up the link configuration.

        The input dataframe is left untouched, unless kwargs explicitly
        instruct otherwise.

        :param str name: name of link
        :param str read_key: key of data to read from data store
        :param str store_key: key of data to store in data store. If not set read_key is overwritten.
        :param list query_set: list of strings, query expressions to evaluate in the same order,
            see pandas documentation
        :param list select_columns: column names to select after querying
        :param bool continue_if_failure: if True continues with next query after failure (optional)
        :param kwargs: all other key word arguments are passed on to the pandas queries.
        """
        Link.__init__(self, kwargs.pop('name', 'ApplySelectionToDf'))

        # register the recognized keyword arguments as attributes of the
        # link; each key is popped from kwargs and falls back to the
        # default below when absent.
        defaults = dict(read_key='',
                        store_key=None,
                        query_set=[],
                        select_columns=[],
                        continue_if_failure=False)
        self._process_kwargs(kwargs, **defaults)

        # whatever remains in kwargs is forwarded to the pandas queries
        self.kwargs = copy.deepcopy(kwargs)
    def __init__(self, **kwargs):
        """Set up the link.

        :param str name: name of link
        :param str data: key of input data to read from data store or workspace
        :param str function: key of rooabsreal function to read from data store or workspace
        :param str fit_result: key of roofitresult object to read from data store or workspace,
                               used as input for error propagation
        :param bool from_ws: if true, try to retrieve data, function, and fit_result
                             from workspace instead of datastore. Default is false.
        :param str function_error_name: column name assigned to propagated errors that are appended to data
        :param bool add_function_to_data: add column of the function values to the data. Default is true
        """
        # hand the (optional) name keyword over to the Link base class
        Link.__init__(self, kwargs.pop('name',
                                       'AddPropagatedErrorToRooDataSet'))

        # register recognized keywords as attributes of the link; each key
        # is popped from kwargs, using the default below when absent.
        defaults = {'from_ws': False,
                    'data': '',
                    'function': '',
                    'fit_result': '',
                    'function_error_name': '',
                    'add_function_to_data': True}
        self._process_kwargs(kwargs, **defaults)

        # any keyword argument left over at this point is an error
        self.check_extra_kwargs(kwargs)
Beispiel #4
0
    def __init__(self, **kwargs):
        """Set up the RecordVectorizer link.

        Stores the configuration attributes and does a basic sanity check.

        :param str read_key: key to read dataframe from the data store. Dataframe of records that is to be transformed.
        :param list columns: list of columns that are to be vectorized
        :param str store_key: store key of output dataFrame. Default is read_key + '_vectorized'. (optional)
        :param dict column_compare_with: dict of unique items per column with which column values are compared.
               If not given, this is derived automatically from the column. (optional)
        :param type astype: store answer of comparison of column with value as certain type. Default is bool. (optional)
        """
        Link.__init__(self, kwargs.pop('name', 'RecordVectorizer'))

        # register recognized keywords as attributes of the link; each key
        # is popped from kwargs, falling back to the default below.
        defaults = dict(read_key='',
                        store_key=None,
                        columns=[],
                        column_compare_with={},
                        astype=bool)
        self._process_kwargs(kwargs, **defaults)

        # reject any keyword argument that was not consumed above
        self.check_extra_kwargs(kwargs)
Beispiel #5
0
    def __init__(self, **kwargs):
        """Set up the DailySummary link.

        :param str name: name of link
        :param str read_key: key of input data to read from data store
        :param str store_key: key of output data to store in data store
        :param list/dict feature_cols: columns to take daily aggregates of. If list, all columns in the
            list are aggregated with the min, mean, max, stddev, count, and sum. If dict, the keys are
            column names to aggregate, and the values are lists of aggregation functions to apply. These
            must be built in spark aggregation functions.
        :param str new_date_col: name of the 'date' column which will be created (default 'date')
        :param str datetime_col: name of column with datetime information in the dataframe
        :param list partitionby_cols: identifying columns to partition by before aggregating
        """
        # forward the (optional) name keyword to the Link base class
        Link.__init__(self, kwargs.pop('name', 'DailySummary'))

        # register recognized keywords as attributes of the link; keys are
        # popped from kwargs, substituting the defaults below where absent.
        defaults = dict(read_key=None,
                        store_key=None,
                        feature_cols=[],
                        new_date_col='date',
                        datetime_col=None,
                        partitionby_cols=[])
        self._process_kwargs(kwargs, **defaults)

        # reject any leftover keyword arguments
        self.check_extra_kwargs(kwargs)
Beispiel #6
0
    def __init__(self, **kwargs):
        """Store the configuration of link RandomSampleSplitter.

        :param str name: name of link
        :param str readKey: key of data to read from data store
        :param list storeKey: keys of datasets to store in data store. Number of sub samples equals length of storeKey list.
        :param list fractions: list of fractions (0<fraction<1) of records assigned to the sub samples. Sum can be less than 1. Needs to be set.
        :param list nevents: list of number of random records assigned to the sub samples. (optional instead of 'fractions')
        """
        Link.__init__(self, kwargs.pop('name', 'RandomSampleSplitter'))

        # process and register all relevant kwargs. kwargs are added as attributes of the link.
        # second arg is default value for an attribute. key is popped from kwargs.
        self._process_kwargs(kwargs,
                             readKey=None,
                             storeKey=None,
                             fractions=None,
                             nevents=False)

        # check residual kwargs. exit if any present.
        self.check_extra_kwargs(kwargs)
Beispiel #7
0
    def __init__(self, **kwargs):
        """Set up the HistSummary link.

        Stores the configuration attributes and does a basic check.

        :param str name: name of link
        :param str read_key: key of input histograms dictionary to read from data store
        :param str results_path: output path of summary result files
        :param list hist_keys: histograms keys pick up from input histogram dict to make & plot summaries for
        :param dict var_labels: dict of column names with a label per column
        :param dict var_units: dict of column names with a unit per column
        :param dict var_bins: dict of column names with the number of bins per column. Default per column is 30.
        :param str hist_y_label: y-axis label to plot for all columns. Default is 'Bin Counts'.
        """
        # hand the (optional) name keyword over to the Link base class
        Link.__init__(self, kwargs.pop('name', 'HistSummary'))

        # register recognized keywords as link attributes with these defaults
        defaults = {'read_key': '',
                    'results_path': '',
                    'hist_keys': None,
                    'var_labels': {},
                    'var_units': {},
                    'var_bins': {},
                    'hist_y_label': 'Bin counts'}
        self._process_kwargs(kwargs, **defaults)
        self.check_extra_kwargs(kwargs)

        # report pages collected during execution
        self.pages = []
    def __init__(self, **kwargs):
        """Set up the link.

        :param str name: name of link
        :param str read_key: key of input data in data store
        :param schema: schema to create data frame if input data have a different format
        :param iterable write_methods: methods to apply sequentially on data-frame writer
        :param dict write_meth_args: positional arguments for write methods
        :param dict write_meth_kwargs: keyword arguments for write methods
        :param int num_files: requested number of output files
        :param bool fail_missing_data: fail execution if data are missing (default is "True")
        """
        # forward the (optional) name keyword to the Link base class
        Link.__init__(self, kwargs.pop('name', 'SparkDfWriter'))

        # register recognized keywords as link attributes with these defaults
        defaults = dict(read_key='',
                        schema=None,
                        write_methods=[],
                        write_meth_args={},
                        write_meth_kwargs={},
                        num_files=1,
                        fail_missing_data=True)
        self._process_kwargs(kwargs, **defaults)

        # reject any leftover keyword arguments
        self.check_extra_kwargs(kwargs)
Beispiel #9
0
    def __init__(self, **kwargs):
        """Initialize link instance.

        :param str name: name of link
        :param str read_key: key of data to read from data store
        :param str db: hive database name
        :param str table: hive table name
        :param dict schema_spec: if writing spark rdd, schema of hive types
        :param str prefix: prefix for hive column names
        :param list column_names_not_to_change: column names not to give the prefix
        :param list columns: columns to store in hive. If empty all columns will be stored
        :param list not_columns: columns to store not in hive
        :param list change_column_names: columns only to add prefix to
        """
        # initialize Link
        Link.__init__(self, kwargs.pop('name', 'HiveWriter'))

        # process keyword arguments
        self._process_kwargs(kwargs,
                             read_key='',
                             db='',
                             table='',
                             schema_spec=None,
                             prefix='',
                             column_names_not_to_change=[],
                             columns=[],
                             not_columns=[],
                             change_column_names=[])
        self.check_extra_kwargs(kwargs)
Beispiel #10
0
    def __init__(self, **kwargs):
        """Initialize an instance.

        NOTE(review): semantics of the layout/figure/control parameters below
        are inferred from their names — confirm against the link's run logic.

        :param str name: name of link
        :param str read_key: key of input data to read from data store
        :param str store_key: key of output data to store in data store
        :param dict layout_kwargs: keyword arguments for the dash layout (optional)
        :param list figure_strings: figure specifications for the dashboard (optional)
        :param list control_strings: control specifications for the dashboard (optional)
        :param list figure_titles: titles for the dashboard figures (optional)
        :param list filter_controls: filter controls for the dashboard (optional)
        :param str assets_folder: path to the dash assets folder (optional)
        :param str app_title: title of the dash app. Default is 'Dash template'.
        """
        # initialize Link, pass name from kwargs
        Link.__init__(self, kwargs.pop('name', 'dash_builder'))

        # Process and register keyword arguments. If the arguments are not given, all arguments are popped from
        # kwargs and added as attributes of the link. Otherwise, only the provided arguments are processed.
        self._process_kwargs(kwargs,
                             read_key=None,
                             store_key=None,
                             layout_kwargs=None,
                             figure_strings=[],
                             control_strings=[],
                             figure_titles=[],
                             filter_controls=[],
                             assets_folder=None,
                             app_title='Dash template')

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)
    def __init__(self, **kwargs):
        """Find the number of days until a particular event in an ordered dataframe.

        :param str name: name of link
        :param str read_key: key of input data to read from data store
        :param str store_key: key of output data to store in data store
        :param str datetime_col: column with datetime information
        :param str event_col: the column containing the events (0 for rows with
            no events, >0 otherwise)
        :param str countdown_col_name: column where the number of days until the
            next event will be stored
        :param list partitionby_cols: columns to partition the countdown by
        """

        # initialize Link, pass name from kwargs
        Link.__init__(self, kwargs.pop('name', 'FindDaysUntilEvent'))

        # Process and register keyword arguments. If the arguments are not given, all arguments are popped from
        # kwargs and added as attributes of the link. Otherwise, only the provided arguments are processed.
        self._process_kwargs(kwargs,
                             read_key=None,
                             store_key=None,
                             datetime_col=None,
                             event_col=None,
                             countdown_col_name='days_until_event',
                             partitionby_cols=None)

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)
Beispiel #12
0
    def __init__(self, **kwargs):
        """Set up the EventLooper link.

        :param str name: name of link
        :param str filename: file name where the strings are located (txt or similar). Default is None. (optional)
        :param str store_key: key to collect in datastore. If set lines are collected. (optional)
        :param list line_processor_set: list of functions to apply to input lines. (optional)
        :param bool sort: if true, sort lines before storage (optional)
        :param bool unique: if true, keep only unique lines before storage (optional)
        :param list skip_line_beginning_with: skip line if it starts with any of the list. input is list of strings.
            Default is ['#'] (optional)
        """
        # hand the (optional) name keyword over to the Link base class
        Link.__init__(self, kwargs.pop('name', 'EventLooper'))

        # register recognized keywords as attributes of the link; each key
        # is popped from kwargs, falling back to the default below.
        defaults = dict(filename=None,
                        store_key=None,
                        line_processor_set=[],
                        sort=False,
                        unique=False,
                        skip_line_beginning_with=['#'])
        self._process_kwargs(kwargs, **defaults)

        # reject any unrecognized keyword arguments
        self.check_extra_kwargs(kwargs)

        # input stream and possible input file; the line stream used to pick
        # up lines defaults to sys.stdin (set elsewhere)
        self._f = None
        self._linestream = None

        # flag: collect lines for storage
        self._collect = False
Beispiel #13
0
    def __init__(self, **kwargs):
        """Store the configuration of link DfMerger.

        :param str name: name of link
        :param str input_collection1: datastore key of the first pandas.DataFrame to merge
        :param str input_collection2: datastore key of the second pandas.DataFrame to merge
        :param str output_collection: datastore key of the merged output pandas.DataFrame
        :param str how: merge modus. See pandas documentation.
        :param list on: column names. See pandas documentation.
        :param list columns1: column names of the first pandas.DataFrame. Only these columns are included in the merge. If not set, use all columns.
        :param list columns2: column names of the second pandas.DataFrame. Only these columns are included in the merge. If not set, use all columns.
        :param bool remove_duplicate_cols2: if True duplicate columns will be taken out before the merge (default=True)
        :param kwargs: all other key word arguments are passed on to the pandas merge function.
        """

        # initialize Link, pass name from kwargs
        Link.__init__(self, kwargs.pop('name', 'DfMerger'))

        # process and register all relevant kwargs. kwargs are added as attributes of the link.
        # second arg is default value for an attribute. key is popped from kwargs.
        self._process_kwargs(kwargs,
                             input_collection1=None,
                             input_collection2=None,
                             output_collection='',
                             how='inner',
                             on=['record_id'],
                             columns1=[],
                             columns2=[],
                             remove_duplicate_cols2=True)

        # pass on remaining kwargs to pandas reader
        self.kwargs = copy.deepcopy(kwargs)
Beispiel #14
0
    def __init__(self, **kwargs):
        """Initialize an instance.

        NOTE(review): parameter semantics below are inferred from the names
        of the processed keyword arguments — confirm against the link's run
        logic. (The previous docstring documented a 'store_key' argument
        that is not actually accepted by this link.)

        :param str name: name of link
        :param str read_key: key of input data to read from data store
        :param str app_store_key: data store key for the dash app output
        :param str label_key: key of the label column/data (optional)
        :param str col_key: key of the column selection (optional)
        :param str hue_key: key of the hue column/data (optional)
        :param list hue_cols: columns to use for hue (optional)
        :param str stats_key: key of the statistics data (optional)
        :param str assets_path: path to the dash assets (optional)
        :param str plt_bgcolor: plot background color. Default is '#263740'.
        :param str plt_papercolor: plot paper color. Default is '#1d2930'.
        :param str text_color: text color. Default is 'white'.
        :param list ext_sheets: external style sheets for the dash app
        """
        # initialize Link, pass name from kwargs
        Link.__init__(self, kwargs.pop('name', 'df_summary_dash'))

        # Process and register keyword arguments. If the arguments are not given, all arguments are popped from
        # kwargs and added as attributes of the link. Otherwise, only the provided arguments are processed.
        self._process_kwargs(kwargs,
                             read_key=None,
                             app_store_key=None,
                             label_key=None,
                             col_key=None,
                             hue_key=None,
                             hue_cols=None,
                             stats_key=None,
                             assets_path=None,
                             plt_bgcolor='#263740',
                             plt_papercolor='#1d2930',
                             text_color='white',
                             ext_sheets=['https://codepen.io/crhiddyp/pen/bWLwgP.css'])

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)
Beispiel #15
0
    def __init__(self, **kwargs):
        """Set up the link.

        :param str name: name of link
        :param str read_key: key of the input data in the data store
        :param str store_key: key of the output data frame in the data store
        :param str schema_key: key to store the data-frame schema in the data store
        :param str output_format: data format to store: {"df" (default), "RDD", "list", "pd"}
        :param bool preserve_col_names: preserve column names for non-data-frame output formats (default is True)
        :param iterable process_methods: methods to apply sequentially on the produced data
        :param dict process_meth_args: positional arguments for process methods
        :param dict process_meth_kwargs: keyword arguments for process methods
        :param bool fail_missing_data: fail execution if the input data frame is missing (default is "True")
        """
        # forward the (optional) name keyword to the Link base class
        Link.__init__(self, kwargs.pop('name', 'SparkDfConverter'))

        # register recognized keywords as link attributes with these defaults
        defaults = {'read_key': '',
                    'store_key': None,
                    'schema_key': None,
                    'output_format': 'df',
                    'preserve_col_names': True,
                    'process_methods': [],
                    'process_meth_args': {},
                    'process_meth_kwargs': {},
                    'fail_missing_data': True}
        self._process_kwargs(kwargs, **defaults)

        # keep the remaining keyword arguments for later use
        self.kwargs = kwargs
Beispiel #16
0
    def __init__(self, **kwargs):
        """Store the configuration of link WriteFromDf.

        :param str name: Name given to the link
        :param str key: the DataStore key
        :param str path: path where to save the DataFrame
        :param writer: file extension that can be written by a pandas writer function from pd.DataFrame. For example: 'csv'
        :param dict dictionary: keys (as in the arg above) and paths (as in the arg above) it will write out all the keys
            to the associated paths.
        :param bool add_counter_to_name: if true, add an index to the output file name. Useful when running in loops. Default is false.
        :param kwargs: all other key word arguments are passed on to the pandas writers.
        """

        # initialize Link, pass name from kwargs
        Link.__init__(self, kwargs.pop('name', 'WriteFromDf'))

        # process and register all relevant kwargs. kwargs are added as attributes of the link.
        # second arg is default value for an attribute. key is popped from kwargs.
        self._process_kwargs(kwargs,
                             path='',
                             key='',
                             writer=None,
                             dictionary={},
                             add_counter_to_name=False)

        # pass on remaining kwargs to pandas writer
        self.kwargs = copy.deepcopy(kwargs)

        # execute counter
        self._counter = 0
Beispiel #17
0
    def __init__(self, **kwargs):
        """Set up the link.

        :param str name: name of link
        :param str read_key: key of the input data in the data store
        :param str store_key: key of the output data frame in the data store
        :param group_map: map function for group values
        :param input_map: map function for input rows; optional, e.g. to create group key-value pairs
        :param result_map: map function for output group values; optional, e.g. to flatten group key-value pairs
        :param bool flatten_output_groups: create a row for each item in the group output values (default is False)
        :param int num_group_partitions: number of partitions for group map (optional, no repartitioning by default)
        """
        # forward the (optional) name keyword to the Link base class
        Link.__init__(self, kwargs.pop('name', 'RddGroupMapper'))

        # register recognized keywords as link attributes with these defaults
        defaults = dict(read_key='',
                        store_key=None,
                        group_map=None,
                        input_map=None,
                        result_map=None,
                        flatten_output_groups=False,
                        num_group_partitions=None)
        self._process_kwargs(kwargs, **defaults)

        # keep the remaining keyword arguments for later use
        self.kwargs = kwargs
Beispiel #18
0
    def __init__(self, **kwargs):
        """Set up the correlation front-end link.

        :param str name: name of link
        :param str read_key: key of raw input data to read from data store
        :param str store_key: key of output data to store in data store
        :param str hypotest_chain: name of the chain containing the
            UncorrelationHypothesisTester link
        :param str hypotest_link: name of the UncorrelationHypothesisTester
            link (default "UncorrelationHypothesisTester")
        :param str residuals_key: key of residuals map (see
            UncorrelationHypothesisTester.sk_residuals_map)
        :param list[str] columns: only include these columns from the data
        """
        # forward the (optional) name keyword to the Link base class
        Link.__init__(self, kwargs.pop('name', 'correlation_frontend'))

        # register recognized keywords as attributes of the link; keys are
        # popped from kwargs, substituting the defaults below where absent.
        defaults = dict(read_key=None,
                        store_key=None,
                        hypotest_chain="hypotest",
                        hypotest_link="UncorrelationHypothesisTester",
                        residuals_key="residuals",
                        columns=[])
        self._process_kwargs(kwargs, **defaults)

        # reject leftover keyword arguments
        self.check_extra_kwargs(kwargs)
Beispiel #19
0
    def __init__(self, **kwargs):
        """Set up the df_summary link.

        :param str name: name of link
        :param str read_key: key of input dataframe (or histogram-dict) to read from data store
        :param str results_path: output path of summary result files
        :param list columns: columns (or histogram keys) pick up from input data to make & plot summaries for
        :param list hist_keys: alternative to columns (optional)
        :param dict var_labels: dict of column names with a label per column
        :param dict var_units: dict of column names with a unit per column
        :param dict var_bins: dict of column names with the number of bins per column. Default per column is 30.
        :param str hist_y_label: y-axis label to plot for all columns. Default is 'Bin Counts'.
        :param str pages_key: data store key of existing report pages
        """
        # forward the (optional) name keyword to the Link base class
        Link.__init__(self, kwargs.pop('name', 'df_summary'))

        # register recognized keywords as link attributes with these defaults
        defaults = dict(read_key='',
                        results_path='',
                        columns=[],
                        hist_keys=[],
                        var_labels={},
                        var_units={},
                        var_bins={},
                        hist_y_label='Bin counts',
                        pages_key='')
        self._process_kwargs(kwargs, **defaults)
        self.check_extra_kwargs(kwargs)

        # attributes filled during execution
        self.pages = []
        self.nan_counts = []
Beispiel #20
0
    def __init__(self, **kwargs):
        """Set up the link.

        :param str name: name of link
        :param str read_key: key of the input data in the data store
        :param str store_key: key of the output data frame in the data store
        :param schema: schema to create data frame if input data have a different format
        :param iterable process_methods: methods to apply sequentially on the produced data frame
        :param dict process_meth_args: positional arguments for process methods
        :param dict process_meth_kwargs: keyword arguments for process methods
        :param bool fail_missing_data: fail execution if data are missing (default is "True")
        """
        # forward the (optional) name keyword to the Link base class
        Link.__init__(self, kwargs.pop('name', 'SparkDfCreator'))

        # register recognized keywords as link attributes with these defaults
        defaults = {'read_key': '',
                    'store_key': None,
                    'schema': None,
                    'process_methods': [],
                    'process_meth_args': {},
                    'process_meth_kwargs': {},
                    'fail_missing_data': True}
        self._process_kwargs(kwargs, **defaults)

        # keep the remaining keyword arguments for later use
        self.kwargs = kwargs
Beispiel #21
0
    def __init__(self, **kwargs):
        """Set up the link.

        :param str name: name of link
        :param str read_key: key of data to read from datastore
        :param list store_key: keys of datasets to store in datastore.
            Number of sub samples equals length of store_key list (optional instead of 'column' and 'nclasses').
        :param str column: name of new column that specifies the randomly assigned class.
            Default is randomclass (optional instead of 'store_key').
        :param int nclasses: number of random classes. Needs to be set
            (optional instead of 'store_key').
        :param list fractions: list of fractions (0<fraction<1) of records assigned to the sub samples.
            Can be one less than n classes. Sum can be less than 1. Needs to be set.
        :param list nevents: list of number of random records assigned to the sub samples
            Can be one less than n classes (optional instead of 'fractions').
        """
        Link.__init__(self, kwargs.pop('name', 'RandomSampleSplitter'))

        # register recognized keywords as attributes of the link; each key
        # is popped from kwargs, falling back to the default below.
        defaults = dict(read_key=None,
                        store_key=None,
                        column='randomclass',
                        fractions=None,
                        nevents=False,
                        nclasses=None)
        self._process_kwargs(kwargs, **defaults)

        # reject leftover keyword arguments
        self.check_extra_kwargs(kwargs)
    def __init__(self, **kwargs):
        """Set up the link.

        Stores the configuration of link SparkToGeneralFuncProcessor.

        :param str name: name of link
        :param str read_key: key of data to read from data store. It should contain a spark dataframe or spark rdd.
        :param str store_key: key of data to store in data store
        :param list groupby: spark dataframe columns to group by
        :param list columns: The columns of the spark dataframe or RDD. Obligatory for RDD, not for spark dataframe.
        :param func generalfunc: The general function. Should be defined by the user. Arguments should be list of
            tuples (rows of RDD), column names and if necessary keyword arguments. Should return a list of native
            python types.
        :param dict function_args: Keyword arguments for the function
        :param int nb_partitions: The number of partitions for repartitioning after groupByKey
        :param func return_map: Function used by the map on the RDD after the generalfunc is applied. The default return
            a tuple of the groupby columns (row[0]) and the list returned by the generalfunc (row[1]).
        """
        # forward the (optional) name keyword to the Link base class
        Link.__init__(self, kwargs.pop('name', 'SparkToGeneralFuncProcessor'))

        # register recognized keywords as attributes of the link; keys are
        # popped from kwargs, substituting the defaults below where absent.
        defaults = dict(read_key='',
                        store_key='',
                        groupby=[],
                        columns=None,
                        generalfunc=None,
                        function_args={},
                        nb_partitions=1200,
                        return_map=lambda row: tuple(list(row[0]) + row[1]))
        self._process_kwargs(kwargs, **defaults)

        # reject leftover keyword arguments
        # (disable this call if they should be passed on instead)
        self.check_extra_kwargs(kwargs)
Beispiel #23
0
    def __init__(self, **kwargs):
        """Initialize link instance.

        Store the configuration of link AssignRandomClass.

        :param str name: name of link
        :param str readKey: datastore key of the input data
        :param str column: name of the new column holding the randomly assigned class;
            default is 'randomclass'
        :param int nclasses: number of random classes; needs to be set
        :param list fractions: fractions of random records assigned to the n classes; needs to be set;
            may hold one entry fewer than nclasses
        :param list nevents: numbers of random records assigned to the n classes;
            may hold one entry fewer than nclasses (optional alternative to 'fractions')
        """
        # initialize the underlying Link, taking the name from kwargs if given
        Link.__init__(self, kwargs.pop('name', 'AssignRandomClass'))

        # register the recognized keyword arguments (popped from kwargs)
        # as attributes of this link, using the defaults below
        settings = {'readKey': None,
                    'column': 'randomclass',
                    'fractions': None,
                    'nevents': False,
                    'nclasses': None}
        self._process_kwargs(kwargs, **settings)

        # leftover kwargs are unrecognized; exit if any are present
        self.check_extra_kwargs(kwargs)
Beispiel #24
0
    def __init__(self, **kwargs):
        """Initialize link instance.

        Link that stores one external object in the DataStore dict during run time.

        :param str name: name of link
        :param str store_key: datastore key under which the object is stored
        :param obj: the object to store
        :param bool force: overwrite an existing datastore entry; default is False (optional)
        :param bool at_initialize: store during initialize of the link; default is False
        :param bool at_execute: store during execute of the link; default is True
        :param bool copydict: if True and the object is a dict, copy all its key/value pairs
            into the datastore; default is False
        """
        Link.__init__(self, kwargs.pop('name', 'ToDsDict'))

        # register the recognized keyword arguments (with the defaults below)
        # as attributes of this link
        option_defaults = dict(store_key=None,
                               obj=None,
                               at_initialize=False,
                               at_execute=True,
                               force=False,
                               copydict=False)
        self._process_kwargs(kwargs, **option_defaults)
        # fail on any unrecognized leftover kwargs
        self.check_extra_kwargs(kwargs)
Beispiel #25
0
    def __init__(self, **kwargs):
        """Initialize link instance.

        Store the configuration of link ReadToDf.

        :param str name: name given to the link
        :param str path: path of the file to read into a pandas DataFrame
        :param str key: storage key for the DataStore
        :param reader: pandas reader, normally determined automatically;
            may be set by hand, e.g. csv, xlsx
        :param bool itr_over_files: iterate over the individual files; default is False.
            If False, all files are collected in one dataframe. NB chunksize takes priority!
        :param int chunksize: default is None; if a positive integer, always iterate.
            chunksize requires pd.read_csv or pd.read_table.
        :param kwargs: all other keyword arguments are passed on to the pandas reader
        """
        # initialize the underlying Link, taking the name from kwargs if given
        Link.__init__(self, kwargs.pop('name', 'ReadToDf'))

        # register the link-specific kwargs (popped from kwargs) as attributes,
        # with the defaults given below
        self._process_kwargs(kwargs,
                             path='',
                             key='',
                             reader=None,
                             itr_over_files=False,
                             chunksize=None)

        # everything left over in kwargs is forwarded to the pandas reader
        self.kwargs = copy.deepcopy(kwargs)

        # bookkeeping for (iterative) file reading; filled during execution
        self._paths = self._path_itr = self._current_path = None
        self._latest_data_length = self._sum_data_length = 0
        self._iterate = False
        self._reader = None
        self._usecols = self.kwargs.get('usecols', [])
    def __init__(self, **kwargs):
        """Initialize ConvertRootHist2RooDataHist instance.

        :param str name: name of link
        :param str read_key: histogram to pick up from datastore (or, if set, from histogram dict)
        :param str hist_dict_key: histograms dictionary from data store.
                                  If set, the histogram is read from this dict (optional)
        :param str store_key: key of roodatahist (optional)
        :param bool into_ws: if true, store in workspace, not datastore. Default is False.
        :param bool rm_original: if true, remove original histogram. Default is False.
        :param str create_hist_pdf: if set, create keys pdf from roodatahist with this name and add to ds or workspace
        """

        # initialize Link, pass name from kwargs
        Link.__init__(self, kwargs.pop('name', 'ConvertRootHist2RooDataHist'))

        # process and register all relevant kwargs. kwargs are added as attributes of the link.
        # second arg is default value for an attribute. key is popped from kwargs.
        # NOTE(review): the docstring previously claimed into_ws defaults to True,
        # contradicting the actual default below; the docstring now matches the code.
        self._process_kwargs(kwargs,
                             read_key='',
                             hist_dict_key='',
                             store_key='',
                             into_ws=False,
                             rm_original=False,
                             create_hist_pdf='')

        # check residual kwargs. exit if any present.
        self.check_extra_kwargs(kwargs)
Beispiel #27
0
    def __init__(self, **kwargs):
        """Initialize link instance.

        :param str read_key: data-store input key
        :param str store_key: data-store output key
        :param list apply_funcs: functions to apply (list of dicts)
          - 'func': function to apply
          - 'colout' (string): output column
          - 'colin' (string, optional): input column
          - 'entire' (boolean, optional): apply to the entire dataframe?
          - 'args' (tuple, optional): positional arguments for 'func'
          - 'kwargs' (dict, optional): keyword arguments for 'func'
          - 'groupby' (list, optional): column names to group by
          - 'groupbyColout' (string): output column after the split-apply-combine combination
        :param dict add_columns: columns to add to output (name, column)
        """
        Link.__init__(self, kwargs.pop('name', 'apply_func_to_dataframe'))

        # register the recognized keyword arguments (popped from kwargs)
        # as attributes of this link, with the defaults below
        recognized = dict(read_key='',
                          store_key='',
                          apply_funcs=[],
                          add_columns=None)
        self._process_kwargs(kwargs, **recognized)
        # fail on any unrecognized leftover kwargs
        self.check_extra_kwargs(kwargs)
Beispiel #28
0
    def __init__(self, **kwargs):
        """Initialize link instance.

        :param str databaseName: name of the hive database
        :param str tableName: name of the hive table
        :param str store_key: datastore key under which the query result is stored
        :param list columns: hive columns to read; if empty, all columns are queried
        :param str selection: where clause of the hive query
        :param str limit: limit clause of the hive query
        :param dict processFuncs: spark functions to run after the query
        :param str full_query: if not empty, execute only this query string
        :param str hive_sql_file: path to an hive.sql file; if not empty, the query in this file is executed
        """
        # initialize the underlying Link with the configured (or default) name
        Link.__init__(self, kwargs.pop('name', 'HiveReader'))

        # register the recognized keyword arguments (popped from kwargs)
        # as attributes of this link, with the defaults below
        query_defaults = {'databaseName': '',
                          'tableName': '',
                          'store_key': '',
                          'columns': [],
                          'selection': '',
                          'limit': '',
                          'processFuncs': {},
                          'full_query': '',
                          'hive_sql_file': ''}
        self._process_kwargs(kwargs, **query_defaults)
        # fail on any unrecognized leftover kwargs
        self.check_extra_kwargs(kwargs)
Beispiel #29
0
    def __init__(self, **kwargs):
        """Initialize link instance.

        :param str name: name of link
        :param str read_key: datastore key of the input data
        :param list columns: columns to pick up from the dataset; default is all columns (optional)
        :param list ignore_columns: columns to leave out of the dataset (optional)
        :param str store_key: datastore key of the output roodataset (optional)
        :param str store_key_vars: datastore key of the output rooargset of all observables (optional)
        :param str store_key_cats: datastore key of the output rooargset of category observables (optional)
        :param bool store_at_finalize: if true, store in workspace at finalize(), not at execute() (optional)
        :param bool into_ws: if true, store in workspace, not datastore; default is False (optional)
        :param bool rm_original: if true, remove original dataframe; default is False (optional)
        :param dict map_to_factorized: dictionary, or key to a dictionary, mapping columns to factorized ones;
                                       map_to_factorized is a dict of dicts, one dict per column (optional)
        :param str sk_map_to_original: store key of the dictionary mapping factorized columns back to the original;
                                       default is 'key' + '_' + store_key + '_to_original' (optional)
        :param dict var_number_of_bins: number of histogram bins per variable (optional)
        :param dict var_min_value: histogram minimum value per variable (optional)
        :param dict var_max_value: histogram maximum value per variable (optional)
        :param int n_max_total_bins: maximum number of bins in the roodatahist; default is 1e6 (optional)
        :param str create_hist_pdf: if filled, create a hist pdf from the rdh with this name and
                                    add it to the datastore or workspace (optional)
        :param bool create_new_rdh_in_loop: if true, create a new rdh when running in a loop (optional)
        """
        # initialize the underlying Link, taking the name from kwargs if given
        Link.__init__(self, kwargs.pop('name', 'RooDataHistFiller'))

        # register the recognized keyword arguments (popped from kwargs)
        # as attributes of this link, using the defaults below
        defaults = {'read_key': '',
                    'columns': [],
                    'ignore_columns': [],
                    'store_key': '',
                    'store_key_vars': '',
                    'store_key_cats': '',
                    'store_at_finalize': False,
                    'into_ws': False,
                    'rm_original': False,
                    'map_to_factorized': {},
                    'sk_map_to_original': '',
                    'var_number_of_bins': {},
                    'var_min_value': {},
                    'var_max_value': {},
                    'n_max_total_bins': 1e6,
                    'create_hist_pdf': '',
                    'create_new_rdh_in_loop': False}
        self._process_kwargs(kwargs, **defaults)

        # any remaining kwargs are unrecognized; exit if present
        self.check_extra_kwargs(kwargs)

        # internal roodatahist and its variable/category sets, filled during execution
        self._rdh = None
        self._varset = None
        self._catset = None
        # dict mapping category observables back to their original string values
        self._mto = {}
Beispiel #30
0
    def __init__(self, **kwargs):
        """Initialize HistogrammarFiller instance.

        Store and do basic check on the attributes of link
        HistogrammarFiller.

        :param str name: name of link
        :param str read_key: key of input data to read from data store
        :param str store_key: key of output data to store histograms in data store
        :param list columns: columns to pick up from input data
        :param dict bin_specs: dictionaries used for rebinning numeric or timestamp columns.

        Example bin_specs dictionary is:

        >>> bin_specs = {'x': {'bin_width': 1, 'bin_offset': 0},
                         'y': {'bin_edges': [0,2,3,4,5,7,8]}}

        :param dict datatype: dict of datatypes of the columns to study from dataframe.
                              If not provided, try to determine datatypes directly from dataframe.
        :param dict quantity: dictionary of lambda functions of how to parse certain columns.

        Example quantity dictionary is:

        >>> quantity = {'y': lambda x: x}

        :param dict drop_keys: dictionary used for dropping specific keys from bins dictionaries of histograms.
               Example drop_keys dictionary is:

        >>> drop_keys = {'x': [1,4,8,19],
                         'y': ['apple', 'pear', 'tomato'],
                         'x:y': [(1,'apple'),(19,'tomato')]}
        """

        # initialize Link, pass name from kwargs
        Link.__init__(self, kwargs.pop('name', 'HistogrammarFiller'))

        # process and register all relevant kwargs. kwargs are added as attributes of the link.
        # second arg is default value for an attribute. key is popped from kwargs.
        self._process_kwargs(kwargs,
                             read_key=None,
                             store_key=None,
                             columns=[],
                             bin_specs={},
                             datatype={},
                             quantity={},
                             drop_keys={})

        # check residual kwargs. exit if any present.
        self.check_extra_kwargs(kwargs)

        # fallback bin specification for numeric columns: unit-width bins starting at 0
        self._unit_bin_specs = {'bin_width': 1.0, 'bin_offset': 0.0}
        # fallback bin specification for timestamp columns: 30-day wide bins,
        # offset at 2010-01-04, both expressed as pandas nanosecond timestamp values
        self._unit_timestamp_specs = {
            'bin_width': pd.Timedelta(days=30).value,
            'bin_offset': pd.Timestamp('2010-01-04').value
        }
        # these get filled during execution
        self._hists = {}