Example #1
    def test_partition(self):

        distributor = MultiprocessingDistributor(n_workers=1)

        data = [1, 3, 10, -10, 343.0]
        distro = distributor.partition(data, 3)
        self.assertEqual(next(distro), [1, 3, 10])
        self.assertEqual(next(distro), [-10, 343.0])

        data = np.arange(10)
        distro = distributor.partition(data, 2)
        self.assertEqual(next(distro), [0, 1])
        self.assertEqual(next(distro), [2, 3])
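
The partition behavior exercised above can be sketched as a plain generator that yields successive fixed-size chunks. This is a minimal sketch of the tested semantics, not the tsfresh implementation; the test itself assumes numpy (as np) and MultiprocessingDistributor from tsfresh.utilities.distribution are imported:

from itertools import islice

def partition(data, chunk_size):
    # yield successive lists of at most chunk_size items,
    # matching the assertions in the test above
    iterator = iter(data)
    while True:
        chunk = list(islice(iterator, chunk_size))
        if not chunk:
            return
        yield chunk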
Example #2
    def test__calculate_best_chunk_size(self):

        distributor = MultiprocessingDistributor(n_workers=2)
        self.assertEqual(distributor.calculate_best_chunk_size(10), 1)
        self.assertEqual(distributor.calculate_best_chunk_size(11), 2)
        self.assertEqual(distributor.calculate_best_chunk_size(100), 10)
        self.assertEqual(distributor.calculate_best_chunk_size(101), 11)

        distributor = MultiprocessingDistributor(n_workers=3)
        self.assertEqual(distributor.calculate_best_chunk_size(10), 1)
        self.assertEqual(distributor.calculate_best_chunk_size(30), 2)
        self.assertEqual(distributor.calculate_best_chunk_size(31), 3)
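
The asserted values are consistent with a simple heuristic: one chunk per five tasks per worker, rounded up. The following sketch is inferred from the assertions above and is not necessarily the exact tsfresh code:

import math

def calculate_best_chunk_size(data_length, n_workers):
    # ceil(data_length / (n_workers * 5)) reproduces every assertion in the test above
    return math.ceil(data_length / (n_workers * 5))

assert calculate_best_chunk_size(11, 2) == 2
assert calculate_best_chunk_size(101, 2) == 11
assert calculate_best_chunk_size(31, 3) == 3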
Example #3
import pandas as pd
from tsfresh import extract_features
from tsfresh.utilities.distribution import MultiprocessingDistributor


def extractFeature(dataName, outName):

    distributor = MultiprocessingDistributor(
        n_workers=16,
        disable_progressbar=False,
        progressbar_title="Feature Extraction")

    # concatenate all sheets of the Excel file into one frame
    xlData = pd.ExcelFile(dataName)
    data = []
    for sheet in xlData.sheet_names:
        data.append(xlData.parse(sheet_name=sheet))
    data = pd.concat(data)

    dataX = data.drop(columns=["app"])
    extracted_features = extract_features(dataX,
                                          column_id="ID",
                                          column_sort="start",
                                          distributor=distributor)
    extracted_features.to_csv(outName, index=False)
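
A possible invocation (the file names are hypothetical; the workbook must contain the "ID" and "start" columns that the extract_features call above requires):

extractFeature("sensor_readings.xlsx", "extracted_features.csv")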
Example #4
def roll_time_series(df_or_dict,
                     column_id,
                     column_sort=None,
                     column_kind=None,
                     rolling_direction=1,
                     max_timeshift=None,
                     min_timeshift=0,
                     chunksize=defaults.CHUNKSIZE,
                     n_jobs=defaults.N_PROCESSES,
                     show_warnings=defaults.SHOW_WARNINGS,
                     disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                     distributor=None):
    """
    This method creates sub-windows of the time series. It rolls the (sorted) data frames for each kind and each id
    separately in the "time" domain (which is represented by the sort order of the sort column given by `column_sort`).

    For each rolling step, a new id is created by the scheme ({id}, {shift}), where id is the former id of
    the column and shift is the number of "time" shifts.
    You can think of it as a window of fixed length (the max_timeshift) moving one step at a time over
    your time series.
    Each cut-out seen by the window is a new time series with a new identifier.

    A few remarks:

     * This method will create new IDs!
     * The sign of rolling_direction defines the direction of time rolling; a positive value means the cut-out
       window is shifted forward in time. The name of each new sub time series is given by the last time point.
       This means the time series named `([id=]4,[timeshift=]5)` with a `max_timeshift` of 3 includes the data
       of the times 3, 4 and 5.
       A negative rolling direction means you traverse your data backwards in time.
       The time series named `([id=]4,[timeshift=]5)` with `max_timeshift` of 3 would then include the data
       of the times 5, 6 and 7.
     * It is possible to shift time series of different lengths, but:
     * We assume that the time series are uniformly sampled
     * For more information, please see :ref:`forecasting-label`.

    :param df_or_dict: a pandas DataFrame or a dictionary. The required shape/form of the object depends on the rest of
        the passed arguments.
    :type df_or_dict: pandas.DataFrame or dict

    :param column_id: it must be present in the pandas DataFrame or in all DataFrames in the dictionary.
        It is not allowed to have NaN values in this column.
    :type column_id: str

    :param column_sort: if not None, sort the rows by this column. It is not allowed to
        have NaN values in this column. If not given, the column will be filled with an increasing number,
        meaning that the order of the passed dataframes is used as "time" for the time series.
    :type column_sort: str or None

    :param column_kind: It can only be used when passing a pandas DataFrame (the dictionary is already assumed to be
        grouped by the kind). It must be present in the DataFrame and no NaN values are allowed.
        If the kind column is not passed, it is assumed that each column in the pandas DataFrame (except the id or
        sort column) is a possible kind.
    :type column_kind: str or None

    :param rolling_direction: The sign decides whether to shift our cut-out window backwards or forwards in "time".
    :type rolling_direction: int

    :param max_timeshift: If not None, the cut-out window is at most `max_timeshift` large. If None, it grows
         without bound.
    :type max_timeshift: int

    :param min_timeshift: Throw away all extracted forecast windows smaller than or equal to this. Must be greater
         than or equal to 0.
    :type min_timeshift: int

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
    :type n_jobs: int

    :param chunksize: How many shifts per job should be calculated.
    :type chunksize: None or int

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param distributor: Advanced parameter: set this to an instance of the distributor you want to use. See
             utilities/distribution.py for more information. Leave as None if you want tsfresh to choose the
             best distributor.
    :type distributor: DistributorBaseClass

    :return: The rolled data frame or dictionary of data frames
    :rtype: the same type as df_or_dict
    """

    if rolling_direction == 0:
        raise ValueError("Rolling direction of 0 is not possible")

    if max_timeshift is not None and max_timeshift <= 0:
        raise ValueError("max_timeshift needs to be positive!")

    if min_timeshift < 0:
        raise ValueError("min_timeshift needs to be positive or zero!")

    if isinstance(df_or_dict, dict):
        if column_kind is not None:
            raise ValueError(
                "You passed in a dictionary and gave a column name for the kind. Both are not possible."
            )

        return {
            key: roll_time_series(df_or_dict=df_or_dict[key],
                                  column_id=column_id,
                                  column_sort=column_sort,
                                  column_kind=column_kind,
                                  rolling_direction=rolling_direction,
                                  max_timeshift=max_timeshift,
                                  min_timeshift=min_timeshift,
                                  chunksize=chunksize,
                                  n_jobs=n_jobs,
                                  show_warnings=show_warnings,
                                  disable_progressbar=disable_progressbar,
                                  distributor=distributor)
            for key in df_or_dict
        }

    # Now we know that this is a pandas data frame
    df = df_or_dict

    if len(df) <= 1:
        raise ValueError(
            "Your time series container has zero or one rows!. Can not perform rolling."
        )

    if column_id is not None:
        if column_id not in df:
            raise AttributeError(
                "The given column for the id is not present in the data.")
    else:
        raise ValueError(
            "You have to set the column_id which contains the ids of the different time series"
        )

    if column_kind is not None:
        grouper = [column_kind, column_id]
    else:
        grouper = [
            column_id,
        ]

    if column_sort is not None:
        # Require no Nans in column
        if df[column_sort].isnull().any():
            raise ValueError("You have NaN values in your sort column.")

        df = df.sort_values(column_sort)

        if df[column_sort].dtype != object:
            # if rolling is enabled, the data should be uniformly sampled in this column
            # Build the differences between consecutive time sort values

            differences = df.groupby(grouper)[column_sort].apply(
                lambda x: x.values[:-1] - x.values[1:])
            # Write all of them into one big list
            differences = sum(map(list, differences), [])
            # Test if all differences are the same
            if differences and min(differences) != max(differences):
                warnings.warn(
                    "Your time stamps are not uniformly sampled, which makes rolling "
                    "nonsensical in some domains.")

    # Roll the data frames if requested
    rolling_direction = np.sign(rolling_direction)

    grouped_data = df.groupby(grouper)
    prediction_steps = grouped_data.count().max().max()

    max_timeshift = max_timeshift or prediction_steps

    # TODO: do not default column_sort to None
    if column_sort is None:
        df["sort"] = range(df.shape[0])

    range_of_shifts = range(1, prediction_steps + 1)

    if distributor is None:
        if n_jobs == 0:
            distributor = MapDistributor(
                disable_progressbar=disable_progressbar,
                progressbar_title="Rolling")
        else:
            distributor = MultiprocessingDistributor(
                n_workers=n_jobs,
                disable_progressbar=disable_progressbar,
                progressbar_title="Rolling",
                show_warnings=show_warnings)

    if not isinstance(distributor, DistributorBaseClass):
        raise ValueError(
            "the passed distributor is not an DistributorBaseClass object")

    kwargs = {
        "grouped_data": grouped_data,
        "rolling_direction": rolling_direction,
        "max_timeshift": max_timeshift,
        "min_timeshift": min_timeshift,
        "column_sort": column_sort,
        "column_id": column_id,
    }

    shifted_chunks = distributor.map_reduce(_roll_out_time_series,
                                            data=range_of_shifts,
                                            chunk_size=chunksize,
                                            function_kwargs=kwargs)

    distributor.close()

    df_shift = pd.concat(shifted_chunks, ignore_index=True)

    return df_shift.sort_values(by=["id", column_sort or "sort"])
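
To make the ID scheme from the docstring concrete, here is a small worked example. The exact ID format differs between tsfresh versions, so treat this as an illustration only:

import pandas as pd

# one time series (id=1) with four uniformly sampled time steps
df = pd.DataFrame({"id": [1, 1, 1, 1],
                   "t":  [1, 2, 3, 4],
                   "x":  [10, 20, 30, 40]})

rolled = roll_time_series(df, column_id="id", column_sort="t", max_timeshift=2)
# with rolling_direction=1 each sub-series is named by its last time point,
# so the window ending at t=3 holds the rows for t=2 and t=3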
Example #5
def _do_extraction(df, column_id, column_value, column_kind,
                   default_fc_parameters, kind_to_fc_parameters,
                   n_jobs, chunk_size, disable_progressbar, distributor):
    """
    Wrapper around the _do_extraction_on_chunk, which calls it on all chunks in the data frame.
    A chunk is a subset of the data, with a given kind and id - so a single time series.

    The data is separated out into those single time series and the _do_extraction_on_chunk is
    called on each of them. The results are then combined into a single pandas DataFrame.

    The call happens either in parallel or serially, and shows a progress bar or not, depending
    on the given flags.

    :param df: The dataframe in the normalized format which is used for extraction.
    :type df: pd.DataFrame

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See the class:`ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value), will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param chunk_size: The size of one chunk for the parallelization
    :type chunk_size: None or int

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
    :type n_jobs: int

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param distributor: Advanced parameter: see utilities/distribution.py for more information. Leave as None
                         if you want tsfresh to choose the best distributor.
    :type distributor: DistributorBaseClass

    :return: the extracted features
    :rtype: pd.DataFrame
    """

    data_in_chunks = generate_data_chunk_format(df, column_id, column_kind, column_value)

    if distributor is None:
        if n_jobs == 0:
            distributor = MapDistributor(disable_progressbar=disable_progressbar,
                                         progressbar_title="Feature Extraction")
        else:
            distributor = MultiprocessingDistributor(n_workers=n_jobs,
                                                     disable_progressbar=disable_progressbar,
                                                     progressbar_title="Feature Extraction")

    if not isinstance(distributor, DistributorBaseClass):
        raise ValueError("the passed distributor is not an DistributorBaseClass object")

    kwargs = dict(default_fc_parameters=default_fc_parameters,
                  kind_to_fc_parameters=kind_to_fc_parameters)

    result = distributor.map_reduce(_do_extraction_on_chunk, data=data_in_chunks,
                                    chunk_size=chunk_size,
                                    function_kwargs=kwargs)
    distributor.close()

    # Return a dataframe in the typical form (id as index and feature names as columns)
    result = pd.DataFrame(result)
    if "value" in result.columns:
        result["value"] = result["value"].astype(float)

    if len(result) != 0:
        result = result.pivot(index="id", columns="variable", values="value")
        result.index = result.index.astype(df[column_id].dtype)

    return result
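
The long-format rows returned by map_reduce and the pivot step above can be illustrated in isolation (the feature names are hypothetical; only the id/variable/value shape matters):

import pandas as pd

rows = [{"id": 1, "variable": "x__mean",    "value": 2.0},
        {"id": 1, "variable": "x__maximum", "value": 3.0},
        {"id": 2, "variable": "x__mean",    "value": 5.0},
        {"id": 2, "variable": "x__maximum", "value": 7.0}]

# id becomes the index and every feature name becomes a column
wide = pd.DataFrame(rows).pivot(index="id", columns="variable", values="value")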
Example #6
def calculateFeatures(path,
                      parameters,
                      reset_df,
                      raster_mask=None,
                      tiff_output=True,
                      workers=None):
    '''
    Calculates features, i.e. the statistical characteristics of time-series raster data.
    It can also save the features as a csv file (dataframe) and/or a tiff file.

    :param path: directory path to the raster files
    :param parameters: a dictionary of features to be extracted
    :param reset_df: boolean option for existing raster inputs as dataframe
    :param raster_mask: path to binary raster mask
    :param tiff_output: boolean option for exporting tiff file
    :param workers: number of worker processes to use; None falls back to the default distributor
    :return: extracted features as a dataframe and tiff file
    '''

    if not reset_df:
        # if reset_df is False, read in the csv file holding the saved version of my_df
        my_df = tr.read_my_df(path)

    else:
        # if reset_df is True, compute the time series and save it as csv
        my_df = image_to_series(path)
        print('df: ' + os.path.join(path, 'my_df.csv'))
        my_df.to_csv(os.path.join(path, 'my_df.csv'),
                     chunksize=10000,
                     index=False)

    # mask
    if raster_mask is not None:
        my_df = tr.mask_df(raster_mask=raster_mask, original_df=my_df)

    if workers is not None:
        distributor = MultiprocessingDistributor(
            n_workers=workers,
            disable_progressbar=False,
            progressbar_title="Feature Extraction")
        # distributor = LocalDaskDistributor(n_workers=workers)
    else:
        distributor = None

    extracted_features = extract_features(
        my_df,
        default_fc_parameters=parameters,
        column_sort="time",
        column_value="value",
        column_id="pixel_id",
        column_kind="kind",
        # chunksize=1000,
        distributor=distributor)

    # change index name to match pixel and time period
    extracted_features.index.rename('pixel_id', inplace=True)
    extracted_features.reset_index(inplace=True, level=['pixel_id'])

    extracted_features['time'] = str(my_df.time.min()) + "_" + str(
        my_df.time.max())
    extracted_features.set_index(['pixel_id', 'time'], inplace=True)

    # unmask extracted features
    extracted_features = tr.unmask_from_mask(mask_df_output=extracted_features,
                                             missing_value=-9999,
                                             raster_mask=raster_mask)

    # deal with output location
    out_path = Path(path).parent.joinpath(Path(path).stem + "_features")
    out_path.mkdir(parents=True, exist_ok=True)

    # write out features to csv file
    print("features:" + os.path.join(out_path, 'extracted_features.csv'))
    extracted_features.to_csv(os.path.join(out_path, 'extracted_features.csv'),
                              chunksize=10000)

    # write out feature names
    kr = pd.DataFrame(list(extracted_features.columns))
    kr.index += 1
    kr.index.names = ['band']
    kr.columns = ['feature_name']
    kr.to_csv(os.path.join(out_path, "features_names.csv"))

    # write out features to tiff file
    if not tiff_output:
        return extracted_features
    else:
        # get image dimension from raw data
        rows, cols, num = image_to_array(path).shape
        # get the total number of features extracted
        matrix_features = extracted_features.values
        num_of_layers = matrix_features.shape[1]

        # reshape the dimensions of the extracted features
        f2Array = matrix_features.reshape(rows, cols, num_of_layers)
        output_file = 'extracted_features.tiff'

        # get metadata from the raw data
        raw_data = read_images(path)
        GeoTransform = raw_data[0].GetGeoTransform()
        driver = gdal.GetDriverByName('GTiff')

        noData = -9999

        Projection = raw_data[0].GetProjectionRef()
        DataType = gdal.GDT_Float32

        # export tiff
        CreateTiff(output_file,
                   f2Array,
                   driver,
                   noData,
                   GeoTransform,
                   Projection,
                   DataType,
                   path=out_path)
        return extracted_features
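
A possible call, assuming a directory of rasters and tsfresh's built-in MinimalFCParameters; the path is hypothetical:

from tsfresh.feature_extraction import MinimalFCParameters

features = calculateFeatures(path="rasters/ndvi",
                             parameters=MinimalFCParameters(),
                             reset_df=True,
                             workers=4,
                             tiff_output=False)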
Example #7
def _do_extraction(df, column_id, column_value, column_kind, column_sort,
                   default_fc_parameters, kind_to_fc_parameters, n_jobs,
                   chunk_size, disable_progressbar, show_warnings, distributor,
                   pivot):
    """
    Wrapper around the _do_extraction_on_chunk, which calls it on all chunks in the data frame.
    A chunk is a subset of the data, with a given kind and id - so a single time series.

    The data is separated out into those single time series and the _do_extraction_on_chunk is
    called on each of them. The results are then combined into a single pandas DataFrame.

    The call happens either in parallel or serially, and shows a progress bar or not, depending
    on the given flags.

    :param df: The dataframe in the normalized format which is used for extraction.
    :type df: pd.DataFrame

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See the class:`ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value), will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param chunk_size: The size of one chunk for the parallelization
    :type chunk_size: None or int

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
    :type n_jobs: int

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param distributor: Advanced parameter: see utilities/distribution.py for more information. Leave as None
                         if you want tsfresh to choose the best distributor.
    :type distributor: DistributorBaseClass

    :return: the extracted features
    :rtype: pd.DataFrame
    """

    data = to_tsdata(df, column_id, column_kind, column_value, column_sort)

    if distributor is None:
        if isinstance(data, Iterable):
            if n_jobs == 0:
                distributor = MapDistributor(
                    disable_progressbar=disable_progressbar,
                    progressbar_title="Feature Extraction")
            else:
                distributor = MultiprocessingDistributor(
                    n_workers=n_jobs,
                    disable_progressbar=disable_progressbar,
                    progressbar_title="Feature Extraction",
                    show_warnings=show_warnings)
        else:
            distributor = ApplyDistributor(
                meta=[(data.column_id, 'int64'),
                      ('variable', 'object'),
                      ('value', 'float64')])

    if not isinstance(distributor, DistributorBaseClass):
        raise ValueError(
            "the passed distributor is not an DistributorBaseClass object")

    kwargs = dict(default_fc_parameters=default_fc_parameters,
                  kind_to_fc_parameters=kind_to_fc_parameters)

    result = distributor.map_reduce(_do_extraction_on_chunk,
                                    data=data,
                                    chunk_size=chunk_size,
                                    function_kwargs=kwargs)

    if not pivot:
        return result

    return data.pivot(result)
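
Because the isinstance check above only requires a DistributorBaseClass instance, callers can plug in their own distributor, for example the LocalDaskDistributor mentioned in the commented-out line of Example #6. A sketch, where df stands in for a normalized input frame:

from tsfresh import extract_features
from tsfresh.utilities.distribution import LocalDaskDistributor

distributor = LocalDaskDistributor(n_workers=4)
features = extract_features(df, column_id="id", column_sort="time",
                            distributor=distributor)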
Example #8
:param reset_df: boolean option for existing raster inputs as dataframe
:param tiff_output: boolean option for exporting tiff file
:return: extracted features as a dataframe and tiff file
'''
  
if not reset_df:
    # if reset_df is False, read in the csv file holding the saved version of my_df
    my_df = pd.read_csv(os.path.join(path, 'my_df.csv'))
else:
    # if reset_df is True, compute the time series and save it as csv
    my_df = image_to_series(path)
    print('df: ' + os.path.join(path, 'my_df.csv'))
    my_df.to_csv(os.path.join(path, 'my_df.csv'), chunksize=10000, index=False)

distributor = MultiprocessingDistributor(n_workers=6,
                                         disable_progressbar=False,
                                         progressbar_title="Feature Extraction")


# column_id must name a column, so materialize the index level "index" as a column first
my_df = my_df.reset_index()
extracted_features = extract_features(my_df,
                                      default_fc_parameters=parameters,
                                      column_sort="time",
                                      column_value="value",
                                      column_id="index",
                                      distributor=distributor)

# deal with output location 
out_path = Path(path).parent.joinpath(Path(path).stem+"_features")
out_path.mkdir(parents=True, exist_ok=True)