def linear_calculate_chunks(chunks, features, approximate, training_window,
                            profile, verbose, save_progress, entityset,
                            no_unapproximated_aggs, cutoff_df_time_var,
                            target_time, pass_columns):
    backend = PandasBackend(entityset, features)
    feature_matrix = []

    # if verbose, create progress bar
    if verbose:
        pbar_string = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                       "Progress: {l_bar}{bar}| "
                       "Calculated: {n}/{total} chunks")
        chunks = make_tqdm_iterator(iterable=chunks,
                                    total=len(chunks),
                                    bar_format=pbar_string)

    for chunk in chunks:
        _feature_matrix = calculate_chunk(chunk, features, approximate,
                                          training_window,
                                          profile, verbose,
                                          save_progress,
                                          no_unapproximated_aggs,
                                          cutoff_df_time_var,
                                          target_time, pass_columns,
                                          backend=backend)
        feature_matrix.append(_feature_matrix)
        # Do a manual garbage collection in case objects from calculate_chunk
        # weren't collected automatically
        gc.collect()
    if verbose:
        chunks.close()
    return feature_matrix
def linear_calculate_chunks(chunks, feature_set, approximate, training_window,
                            verbose, save_progress, entityset,
                            no_unapproximated_aggs, cutoff_df_time_var,
                            target_time, pass_columns):
    feature_matrix = []

    # if verbose, create progress bar
    if verbose:
        pbar_string = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                       "Progress: {l_bar}{bar}| "
                       "Calculated: {n}/{total} chunks")
        chunks = make_tqdm_iterator(iterable=chunks,
                                    total=len(chunks),
                                    bar_format=pbar_string)

    for chunk in chunks:
        _feature_matrix = calculate_chunk(chunk, feature_set, entityset,
                                          approximate, training_window,
                                          verbose, save_progress,
                                          no_unapproximated_aggs,
                                          cutoff_df_time_var, target_time,
                                          pass_columns)
        feature_matrix.append(_feature_matrix)
        # Do a manual garbage collection in case objects from calculate_chunk
        # weren't collected automatically
        gc.collect()
    if verbose:
        chunks.close()
    return feature_matrix
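# --- Hedged sketch (not part of featuretools): one plausible way to build the
# "chunks" consumed by linear_calculate_chunks above. The helper name and the
# chunking strategy are assumptions; the real library derives chunk sizes
# internally from the cutoff-time DataFrame.
import numpy as np
import pandas as pd


def split_cutoff_time_into_chunks(cutoff_time, n_chunks):
    # sort by cutoff time so each chunk covers a contiguous time range, then
    # split the rows into roughly equal pieces
    ordered = cutoff_time.sort_values("time")
    return [c for c in np.array_split(ordered, n_chunks) if not c.empty]


example_cutoff_time = pd.DataFrame({
    "instance_id": [0, 1, 2, 3, 4, 5],
    "time": pd.date_range("2011-04-09", periods=6, freq="D"),
})
example_chunks = split_cutoff_time_into_chunks(example_cutoff_time, n_chunks=3)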
def encode_features(feature_matrix, features, top_n=10, include_unknown=True,
                    to_encode=None, inplace=False, verbose=False):
    """Encode categorical features

        Args:
            feature_matrix (pd.DataFrame): Dataframe of features.
            features (list[:class:`.PrimitiveBase`]): Feature definitions in feature_matrix.
            top_n (int): Number of top values to include. Defaults to 10.
            include_unknown (bool): Add a feature encoding an unknown class.
                Defaults to True.
            to_encode (list[str]): List of feature names to encode.
                Features not in this list are unencoded in the output matrix.
                Defaults to encoding all necessary features.
            inplace (bool): Encode feature_matrix in place. Defaults to False.
            verbose (bool): Print progress info. Defaults to False.

        Returns:
            (pd.Dataframe, list) : encoded feature_matrix, encoded features

        Example:
            .. ipython:: python
                :suppress:

                from featuretools.tests.testing_utils import make_ecommerce_entityset
                from featuretools.primitives import Feature
                import featuretools as ft
                es = make_ecommerce_entityset()

            .. ipython:: python

                f1 = Feature(es["log"]["product_id"])
                f2 = Feature(es["log"]["purchased"])
                f3 = Feature(es["log"]["value"])

                features = [f1, f2, f3]
                ids = [0, 1, 2, 3, 4, 5]
                feature_matrix = ft.calculate_feature_matrix(features,
                                                             instance_ids=ids)

                fm_encoded, f_encoded = ft.encode_features(feature_matrix,
                                                           features)
                f_encoded

                fm_encoded, f_encoded = ft.encode_features(feature_matrix,
                                                           features, top_n=2)
                f_encoded

                fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                                           include_unknown=False)
                f_encoded

                fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                                           to_encode=['purchased'])
                f_encoded
    """
    if inplace:
        X = feature_matrix
    else:
        X = feature_matrix.copy()

    encoded = []

    if verbose:
        iterator = make_tqdm_iterator(iterable=features,
                                      total=len(features),
                                      desc="Encoding pass 1",
                                      unit="feature")
    else:
        iterator = features

    for f in iterator:
        if (f.expanding or (not issubclass(f.variable_type, Discrete))):
            encoded.append(f)
            continue

        if to_encode is not None and f.get_name() not in to_encode:
            encoded.append(f)
            continue

        unique = X[f.get_name()].value_counts().head(top_n).index.tolist()
        for label in unique:
            add = f == label
            encoded.append(add)
            X[add.get_name()] = (X[f.get_name()] == label).astype(int)

        if include_unknown:
            unknown = f.isin(unique).NOT().rename(f.get_name() + " = unknown")
            encoded.append(unknown)
            X[unknown.get_name()] = (~X[f.get_name()].isin(unique)).astype(int)

        X.drop(f.get_name(), axis=1, inplace=True)

    new_X = X[[e.get_name() for e in encoded]]
    iterator = new_X.columns
    if verbose:
        iterator = make_tqdm_iterator(iterable=new_X.columns,
                                      total=len(new_X.columns),
                                      desc="Encoding pass 2",
                                      unit="feature")

    for c in iterator:
        try:
            new_X[c] = pd.to_numeric(new_X[c], errors='raise')
        except (TypeError, ValueError):
            pass

    return new_X, encoded
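# --- Hedged usage sketch mirroring the docstring above. make_ecommerce_entityset
# is a featuretools test utility and may not ship with every release; depending
# on the version, Feature may live under featuretools.primitives instead of the
# top-level ft namespace.
import featuretools as ft
from featuretools.tests.testing_utils import make_ecommerce_entityset

es = make_ecommerce_entityset()
features = [ft.Feature(es["log"]["product_id"]),
            ft.Feature(es["log"]["purchased"]),
            ft.Feature(es["log"]["value"])]
feature_matrix = ft.calculate_feature_matrix(features, entityset=es,
                                             instance_ids=[0, 1, 2, 3, 4, 5])

# keep the two most frequent categories per feature and add an "unknown" column
fm_encoded, f_encoded = ft.encode_features(feature_matrix, features, top_n=2)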
def parallel_calculate_chunks(chunks,
                              features,
                              approximate,
                              training_window,
                              verbose,
                              save_progress,
                              entityset,
                              n_jobs,
                              no_unapproximated_aggs,
                              cutoff_df_time_var,
                              target_time,
                              pass_columns,
                              dask_kwargs=None):
    from distributed import Client, LocalCluster, as_completed
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        if 'cluster' in dask_kwargs:
            cluster = dask_kwargs['cluster']
        else:
            diagnostics_port = None
            if 'diagnostics_port' in dask_kwargs:
                diagnostics_port = dask_kwargs['diagnostics_port']
                del dask_kwargs['diagnostics_port']

            workers = n_jobs_to_workers(n_jobs)
            workers = min(workers, len(chunks))
            cluster = LocalCluster(n_workers=workers,
                                   threads_per_worker=1,
                                   diagnostics_port=diagnostics_port,
                                   **dask_kwargs)
            # if the cluster has a bokeh port, notify the user which port the dashboard started on
            if diagnostics_port is not None:
                if hasattr(cluster, 'scheduler') and cluster.scheduler:
                    info = cluster.scheduler.identity()
                    if 'bokeh' in info['services']:
                        msg = "Dashboard started on port {}"
                        print(msg.format(info['services']['bokeh']))

        client = Client(cluster)
        # scatter the entityset
        # denote future with leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            print("Using EntitySet persisted on the cluster as dataset %s" %
                  (es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        end = time.time()
        scatter_time = end - start
        scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
        print(scatter_string.format(scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    except Exception:
        raise
    finally:
        if 'cluster' not in dask_kwargs and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
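# --- Hedged sketch (illustrative values): the dask_kwargs keys this function
# inspects. Pass an existing Dask cluster under 'cluster' to reuse it, or set
# 'diagnostics_port' and let a LocalCluster be created, with any remaining
# keyword arguments forwarded to it.
dask_kwargs_with_dashboard = {"diagnostics_port": 8787}

# from distributed import LocalCluster
# dask_kwargs_reusing_cluster = {"cluster": LocalCluster(n_workers=2,
#                                                        threads_per_worker=1)}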
    def calculate_all_features(self,
                               instance_ids,
                               time_last,
                               training_window=None,
                               profile=False,
                               precalculated_features=None,
                               ignored=None,
                               verbose=False):
        """
        Given a list of instance ids and features with a shared time window,
        generate and return a mapping of instance -> feature values.

        Args:
            instance_ids (list): List of instance ids for which to build features.

            time_last (pd.Timestamp): Last allowed time. Data from exactly this
                time is not allowed.

            training_window (Timedelta, optional): Data older than
                time_last by more than this will be ignored.

            profile (bool): Enable profiler if True.

            verbose (bool): Print output progress if True.

        Returns:
            pd.DataFrame : Pandas DataFrame of calculated feature values.
                Indexed by instance_ids. Columns in same order as features
                passed in.

        """
        assert len(instance_ids) > 0, "0 instance ids provided"
        self.instance_ids = instance_ids

        self.time_last = time_last
        if self.time_last is None:
            self.time_last = datetime.now()

        # For debugging
        if profile:
            pr = cProfile.Profile()
            pr.enable()

        if precalculated_features is None:
            precalculated_features = {}
        # Access the index to get the filtered data we need
        target_entity = self.entityset[self.target_eid]
        if ignored:
            # TODO: Just want to remove entities if don't have any (sub)features defined
            # on them anymore, rather than recreating
            ordered_entities = FeatureTree(self.entityset,
                                           self.features,
                                           ignored=ignored).ordered_entities
        else:
            ordered_entities = self.feature_tree.ordered_entities

        necessary_columns = self.feature_tree.necessary_columns
        eframes_by_filter = \
            self.entityset.get_pandas_data_slice(filter_entity_ids=ordered_entities,
                                                 index_eid=self.target_eid,
                                                 instances=instance_ids,
                                                 entity_columns=necessary_columns,
                                                 time_last=time_last,
                                                 training_window=training_window,
                                                 verbose=verbose)
        large_eframes_by_filter = None
        if any([
                f.primitive.uses_full_entity
                for f in self.feature_tree.all_features
                if isinstance(f, TransformFeature)
        ]):
            large_necessary_columns = self.feature_tree.necessary_columns_for_all_values_features
            large_eframes_by_filter = \
                self.entityset.get_pandas_data_slice(filter_entity_ids=ordered_entities,
                                                     index_eid=self.target_eid,
                                                     instances=None,
                                                     entity_columns=large_necessary_columns,
                                                     time_last=time_last,
                                                     training_window=training_window,
                                                     verbose=verbose)

        # Handle an empty time slice by returning a dataframe with defaults
        if eframes_by_filter is None:
            return self.generate_default_df(instance_ids=instance_ids)

        finished_entity_ids = []
        # Populate entity_frames with precalculated features
        if len(precalculated_features) > 0:
            for entity_id, precalc_feature_values in precalculated_features.items(
            ):
                if entity_id in eframes_by_filter:
                    frame = eframes_by_filter[entity_id][entity_id]
                    eframes_by_filter[entity_id][entity_id] = pd.merge(
                        frame,
                        precalc_feature_values,
                        left_index=True,
                        right_index=True)
                else:
                    # Only features we're taking from this entity
                    # are precomputed
                    # Make sure the id variable is a column as well as an index
                    entity_id_var = self.entityset[entity_id].index
                    precalc_feature_values[
                        entity_id_var] = precalc_feature_values.index.values
                    eframes_by_filter[entity_id] = {
                        entity_id: precalc_feature_values
                    }
                    finished_entity_ids.append(entity_id)

        # Iterate over the top-level entities (filter entities) in sorted order
        # and calculate all relevant features under each one.
        if verbose:
            total_groups_to_compute = sum(
                len(group)
                for group in self.feature_tree.ordered_feature_groups.values())

            pbar = make_tqdm_iterator(total=total_groups_to_compute,
                                      desc="Computing features",
                                      unit="feature group")
            pbar.update(0)

        for filter_eid in ordered_entities:
            entity_frames = eframes_by_filter[filter_eid]
            large_entity_frames = None
            if large_eframes_by_filter is not None:
                large_entity_frames = large_eframes_by_filter[filter_eid]

            # update the current set of entity frames with the computed features
            # from previously finished entities
            for eid in finished_entity_ids:
                # only include this frame if it's not from a descendent entity:
                # descendent entity frames will have to be re-calculated.
                # TODO: this check might not be necessary, depending on our
                # constraints
                if not self.entityset.find_backward_path(
                        start_entity_id=filter_eid, goal_entity_id=eid):
                    entity_frames[eid] = eframes_by_filter[eid][eid]
                    # TODO: look this over again
                    # precalculated features will only be placed in entity_frames,
                    # and it's possible that they are the only features computed
                    # for an entity. In this case, the entity won't be present in
                    # large_eframes_by_filter. The relevant lines that this case passes
                    # through are 136-143
                    if (large_eframes_by_filter is not None
                            and eid in large_eframes_by_filter
                            and eid in large_eframes_by_filter[eid]):
                        large_entity_frames[eid] = large_eframes_by_filter[
                            eid][eid]

            if filter_eid in self.feature_tree.ordered_feature_groups:
                for group in self.feature_tree.ordered_feature_groups[
                        filter_eid]:
                    if verbose:
                        pbar.set_postfix({'running': 0})

                    test_feature = group[0]
                    entity_id = test_feature.entity.id

                    input_frames_type = self.feature_tree.input_frames_type(
                        test_feature)

                    input_frames = large_entity_frames
                    if input_frames_type == "subset_entity_frames":
                        input_frames = entity_frames

                    handler = self._feature_type_handler(test_feature)
                    result_frame = handler(group, input_frames)

                    output_frames_type = self.feature_tree.output_frames_type(
                        test_feature)
                    if output_frames_type in [
                            'full_and_subset_entity_frames',
                            'subset_entity_frames'
                    ]:
                        index = entity_frames[entity_id].index
                        # If result_frame came from a uses_full_entity feature,
                        # and the input was large_entity_frames,
                        # then it's possible it doesn't contain some of the features
                        # in the output entity_frames
                        # We thus need to concatenate the existing frame with the result frame,
                        # making sure not to duplicate any columns
                        _result_frame = result_frame.reindex(index)
                        cols_to_keep = [
                            c for c in _result_frame.columns
                            if c not in entity_frames[entity_id].columns
                        ]
                        entity_frames[entity_id] = pd.concat([
                            entity_frames[entity_id],
                            _result_frame[cols_to_keep]
                        ],
                                                             axis=1)

                    if output_frames_type in [
                            'full_and_subset_entity_frames',
                            'full_entity_frames'
                    ]:
                        index = large_entity_frames[entity_id].index
                        _result_frame = result_frame.reindex(index)
                        cols_to_keep = [
                            c for c in _result_frame.columns
                            if c not in large_entity_frames[entity_id].columns
                        ]
                        large_entity_frames[entity_id] = pd.concat([
                            large_entity_frames[entity_id],
                            _result_frame[cols_to_keep]
                        ],
                                                                   axis=1)

                    if verbose:
                        pbar.update(1)

            finished_entity_ids.append(filter_eid)

        if verbose:
            pbar.set_postfix({'running': 0})
            pbar.refresh()
            sys.stdout.flush()
            pbar.close()

        # debugging
        if profile:
            pr.disable()
            ROOT_DIR = os.path.expanduser("~")
            prof_folder_path = os.path.join(ROOT_DIR, 'prof')
            if not os.path.exists(prof_folder_path):
                os.mkdir(prof_folder_path)
            with open(
                    os.path.join(prof_folder_path,
                                 'inst-%s.log' % list(instance_ids)[0]),
                    'w') as f:
                pstats.Stats(pr, stream=f).strip_dirs().sort_stats(
                    "cumulative", "tottime").print_stats()

        df = eframes_by_filter[self.target_eid][self.target_eid]

        # fill in empty rows with default values
        missing_ids = [
            i for i in instance_ids if i not in df[target_entity.index]
        ]
        if missing_ids:
            default_df = self.generate_default_df(instance_ids=missing_ids,
                                                  extra_columns=df.columns)
            df = df.append(default_df, sort=True)

        df.index.name = self.entityset[self.target_eid].index
        column_list = []
        for feat in self.features:
            column_list.extend(feat.get_feature_names())
        return df[column_list]
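# --- Hedged sketch: in normal use this backend method is reached through the
# public ft.calculate_feature_matrix wrapper defined below, here with a single
# cutoff time applied to every instance id. The timestamp and instance ids are
# illustrative and assume the ecommerce test entityset.
import pandas as pd
import featuretools as ft
from featuretools.tests.testing_utils import make_ecommerce_entityset

es = make_ecommerce_entityset()
features = [ft.Feature(es["log"]["value"])]
fm = ft.calculate_feature_matrix(features, entityset=es,
                                 instance_ids=[0, 1, 2],
                                 cutoff_time=pd.Timestamp("2011-04-10"),
                                 verbose=True)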
def calculate_feature_matrix(features,
                             cutoff_time=None,
                             instance_ids=None,
                             entities=None,
                             relationships=None,
                             entityset=None,
                             cutoff_time_in_index=False,
                             training_window=None,
                             approximate=None,
                             save_progress=None,
                             verbose=False,
                             backend_verbose=False,
                             verbose_desc='calculate_feature_matrix',
                             profile=False):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[PrimitiveBase]): Feature definitions to be calculated.

        cutoff_time (pd.DataFrame or Datetime): Specifies at which time to calculate
            the features for each instance. Can either be a DataFrame with
            'instance_id' and 'time' columns, a DataFrame with the name of the
            index variable in the target entity and a time column, a list of
            values, or a single value to calculate for all instances. If the
            DataFrame has more than two columns, any additional columns will be
            added to the resulting feature matrix.

        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.

        entities (dict[str -> tuple(pd.DataFrame, str, str)]): dictionary of
            entities. Entries take the format
            {entity id: (dataframe, id column, (time_column))}.

        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).

        entityset (EntitySet): An already initialized entityset. Required if
            entities and relationships are not defined.

        cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
            where the second index is the cutoff time (first is instance id).
            DataFrame will be sorted by (time, instance_id).

        training_window (dict[str -> Timedelta] or Timedelta, optional):
            Window or windows defining how much older than the cutoff time data
            can be to be included when calculating the feature.  To specify
            which entities to apply windows to, use a dictionary mapping entity
            id -> Timedelta. If None, all older data is used.

        approximate (Timedelta or str): Frequency to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        verbose (bool, optional): Print progress info. The time granularity is
            per time group unless there is only a single cutoff time, in which
            case backend_verbose is turned on.

        backend_verbose (bool, optional): Print progress info for each feature
            calculation step per time group.

        profile (bool, optional): Enables profiling if True.

        save_progress (str, optional): path to save intermediate computational results.
    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, PrimitiveBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if entities is not None and relationships is not None:
            entityset = EntitySet("entityset", entities, relationships)

    if entityset is not None:
        for f in features:
            f.entityset = entityset

    entityset = features[0].entityset
    target_entity = features[0].entity
    pass_columns = []

    if not isinstance(cutoff_time, pd.DataFrame):
        if cutoff_time is None:
            cutoff_time = datetime.now()

        if instance_ids is None:
            index_var = target_entity.index
            instance_ids = target_entity.df[index_var].tolist()

        if not isinstance(cutoff_time, list):
            cutoff_time = [cutoff_time] * len(instance_ids)

        map_args = [(id, time) for id, time in zip(instance_ids, cutoff_time)]
        df_args = pd.DataFrame(map_args, columns=['instance_id', 'time'])
        to_calc = df_args.values
        cutoff_time = pd.DataFrame(to_calc, columns=['instance_id', 'time'])
    else:
        cutoff_time = cutoff_time.copy()

        # handle how columns are named in cutoff_time
        if "instance_id" not in cutoff_time.columns:
            if target_entity.index not in cutoff_time.columns:
                raise AttributeError(
                    'Name of the index variable in the target entity'
                    ' or "instance_id" must be present in cutoff_time')
            # rename to instance_id
            cutoff_time.rename(columns={target_entity.index: "instance_id"},
                               inplace=True)

        if "time" not in cutoff_time.columns:
            # take the first column that isn't instance_id and assume it is time
            not_instance_id = [
                c for c in cutoff_time.columns if c != "instance_id"
            ]
            cutoff_time.rename(columns={not_instance_id[0]: "time"},
                               inplace=True)
        pass_columns = [column_name for column_name in cutoff_time.columns[2:]]

    # Get dictionary of features to approximate
    if approximate is not None:
        to_approximate, all_approx_feature_set = gather_approximate_features(
            features)
    else:
        to_approximate = defaultdict(list)
        all_approx_feature_set = None

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationPrimitive):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        deps = feature.get_deep_dependencies(all_approx_feature_set)
        for dependency in deps:
            if (isinstance(dependency, AggregationPrimitive) and dependency
                    not in to_approximate[dependency.entity.id]):
                no_unapproximated_aggs = False
                break

    cutoff_df_time_var = 'time'
    target_time = '_original_time'

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time.copy(), approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var]

        grouped = binned_cutoff_time.groupby(cutoff_df_time_var, sort=True)

    else:
        grouped = cutoff_time.groupby(cutoff_df_time_var, sort=True)

    # if the backend is going to be verbose, don't make cutoff times verbose
    if verbose and not backend_verbose:
        iterator = make_tqdm_iterator(iterable=grouped,
                                      total=len(grouped),
                                      desc="Progress",
                                      unit="cutoff time")
    else:
        iterator = grouped

    feature_matrix = []
    backend = PandasBackend(entityset, features)
    for _, group in iterator:
        _feature_matrix = calculate_batch(
            features, group, approximate, entityset, backend_verbose,
            training_window, profile, verbose, save_progress, backend,
            no_unapproximated_aggs, cutoff_df_time_var, target_time,
            pass_columns)
        feature_matrix.append(_feature_matrix)
        # Do a manual garbage collection in case objects from calculate_batch
        # weren't collected automatically
        gc.collect()

    feature_matrix = pd.concat(feature_matrix)
    if not cutoff_time_in_index:
        feature_matrix.reset_index(level='time', drop=True, inplace=True)

    if save_progress and os.path.exists(os.path.join(save_progress, 'temp')):
        shutil.rmtree(os.path.join(save_progress, 'temp'))

    return feature_matrix
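# --- Hedged usage sketch of the cutoff_time DataFrame format handled above:
# an 'instance_id' column, a 'time' column, and any extra columns (here a
# hypothetical 'label') passed through to the resulting feature matrix.
import pandas as pd
import featuretools as ft
from featuretools.tests.testing_utils import make_ecommerce_entityset

es = make_ecommerce_entityset()
features = [ft.Feature(es["log"]["value"])]
cutoff_time = pd.DataFrame({
    "instance_id": [0, 1, 2],
    "time": pd.to_datetime(["2011-04-09", "2011-04-10", "2011-04-11"]),
    "label": [True, False, True],
})
fm = ft.calculate_feature_matrix(features, entityset=es,
                                 cutoff_time=cutoff_time,
                                 cutoff_time_in_index=True)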
def encode_features(feature_matrix,
                    features,
                    top_n=DEFAULT_TOP_N,
                    include_unknown=True,
                    to_encode=None,
                    inplace=False,
                    drop_first=False,
                    verbose=False):
    """Encode categorical features

        Args:
            feature_matrix (pd.DataFrame): Dataframe of features.
            features (list[PrimitiveBase]): Feature definitions in feature_matrix.
            top_n (int or dict[string -> int]): Number of top values to include.
                If dict[string -> int] is used, key is feature name and value is
                the number of top values to include for that feature.
                If a feature's name is not in dictionary, a default value of 10 is used.
            include_unknown (bool): Add a feature encoding an unknown class.
                Defaults to True.
            to_encode (list[str]): List of feature names to encode.
                Features not in this list are unencoded in the output matrix.
                Defaults to encoding all necessary features.
            inplace (bool): Encode feature_matrix in place. Defaults to False.
            drop_first (bool): Whether to get k-1 dummies out of k categorical
                levels by removing the first level. Defaults to False.
            verbose (bool): Print progress info. Defaults to False.

        Returns:
            (pd.Dataframe, list) : encoded feature_matrix, encoded features

        Example:
            .. ipython:: python
                :suppress:

                from featuretools.tests.testing_utils import make_ecommerce_entityset
                import featuretools as ft
                es = make_ecommerce_entityset()

            .. ipython:: python

                f1 = ft.Feature(es["log"]["product_id"])
                f2 = ft.Feature(es["log"]["purchased"])
                f3 = ft.Feature(es["log"]["value"])

                features = [f1, f2, f3]
                ids = [0, 1, 2, 3, 4, 5]
                feature_matrix = ft.calculate_feature_matrix(features, es,
                                                             instance_ids=ids)

                fm_encoded, f_encoded = ft.encode_features(feature_matrix,
                                                           features)
                f_encoded

                fm_encoded, f_encoded = ft.encode_features(feature_matrix,
                                                           features, top_n=2)
                f_encoded

                fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                                           include_unknown=False)
                f_encoded

                fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                                           to_encode=['purchased'])
                f_encoded

                fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                                           drop_first=True)
                f_encoded
    """
    if not isinstance(feature_matrix, pd.DataFrame):
        msg = "feature_matrix must be a Pandas DataFrame"
        raise TypeError(msg)

    if inplace:
        X = feature_matrix
    else:
        X = feature_matrix.copy()

    old_feature_names = set()
    for feature in features:
        for fname in feature.get_feature_names():
            assert fname in X.columns, (
                "Feature %s not found in feature matrix" % (fname))
            old_feature_names.add(fname)

    pass_through = [col for col in X.columns if col not in old_feature_names]

    if verbose:
        iterator = make_tqdm_iterator(iterable=features,
                                      total=len(features),
                                      desc="Encoding pass 1",
                                      unit="feature")
    else:
        iterator = features

    new_feature_list = []
    new_columns = []
    encoded_columns = set()

    for f in iterator:
        # TODO: features with multiple columns are not encoded by this method,
        # which can cause an "encoded" matrix with non-numeric values
        is_discrete = issubclass(f.variable_type, Discrete)
        if (f.number_output_features > 1 or not is_discrete):
            if f.number_output_features > 1:
                logger.warning("Feature %s has multiple columns and will not "
                               "be encoded.  This may result in a matrix with"
                               " non-numeric values." % (f))
            new_feature_list.append(f)
            new_columns.extend(f.get_feature_names())
            continue

        if to_encode is not None and f.get_name() not in to_encode:
            new_feature_list.append(f)
            new_columns.extend(f.get_feature_names())
            continue

        val_counts = X[f.get_name()].value_counts().to_frame()
        index_name = val_counts.index.name
        if index_name is None:
            if 'index' in val_counts.columns:
                index_name = 'level_0'
            else:
                index_name = 'index'
        val_counts.reset_index(inplace=True)
        val_counts = val_counts.sort_values([f.get_name(), index_name],
                                            ascending=False)
        val_counts.set_index(index_name, inplace=True)
        select_n = top_n
        if isinstance(top_n, dict):
            select_n = top_n.get(f.get_name(), DEFAULT_TOP_N)
        if drop_first:
            # use select_n (already resolved from an int or dict top_n) so a
            # dict top_n does not break the min() comparison
            select_n = min(len(val_counts), select_n)
            select_n = max(select_n - 1, 1)
        unique = val_counts.head(select_n).index.tolist()
        for label in unique:
            add = f == label
            add_name = add.get_name()
            new_feature_list.append(add)
            new_columns.append(add_name)
            encoded_columns.add(add_name)
            X[add_name] = (X[f.get_name()] == label)

        if include_unknown:
            unknown = f.isin(unique).NOT().rename(f.get_name() + " is unknown")
            unknown_name = unknown.get_name()
            new_feature_list.append(unknown)
            new_columns.append(unknown_name)
            encoded_columns.add(unknown_name)
            X[unknown_name] = (~X[f.get_name()].isin(unique))

        X.drop(f.get_name(), axis=1, inplace=True)

    new_columns.extend(pass_through)
    new_X = X[new_columns]
    iterator = new_X.columns
    if verbose:
        iterator = make_tqdm_iterator(iterable=new_X.columns,
                                      total=len(new_X.columns),
                                      desc="Encoding pass 2",
                                      unit="feature")
    for c in iterator:
        if c in encoded_columns:
            try:
                new_X[c] = pd.to_numeric(new_X[c], errors='raise')
            except (TypeError, ValueError):
                pass

    return new_X, new_feature_list
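# --- Hedged sketch of the per-feature top_n dictionary and drop_first options
# documented above; the entityset and feature names follow the docstring's
# ecommerce example.
import featuretools as ft
from featuretools.tests.testing_utils import make_ecommerce_entityset

es = make_ecommerce_entityset()
features = [ft.Feature(es["log"]["product_id"]),
            ft.Feature(es["log"]["purchased"]),
            ft.Feature(es["log"]["value"])]
feature_matrix = ft.calculate_feature_matrix(features, entityset=es,
                                             instance_ids=[0, 1, 2, 3, 4, 5])

# keep three product_id values, the default number elsewhere, and drop the
# first dummy of each encoded feature to avoid a redundant column
fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                           top_n={"product_id": 3},
                                           drop_first=True)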
    def calculate_all_features(self,
                               instance_ids,
                               time_last,
                               training_window=None,
                               profile=False,
                               precalculated_features=None,
                               ignored=None,
                               verbose=False):
        """
        Given a list of instance ids and features with a shared time window,
        generate and return a mapping of instance -> feature values.

        Args:
            instance_ids (list): List of instance ids for which to build features.

            time_last (pd.Timestamp): Last allowed time. Data from exactly this
                time is not allowed.

            training_window (Timedelta, optional): Data older than
                time_last by more than this will be ignored.

            profile (bool): Enable profiler if True.

            verbose (bool): Print output progress if True.

        Returns:
            pd.DataFrame : Pandas DataFrame of calculated feature values.
                Indexed by instance_ids. Columns in same order as features
                passed in.

        """
        assert len(instance_ids) > 0, "0 instance ids provided"
        self.instance_ids = instance_ids

        self.time_last = time_last
        if self.time_last is None:
            self.time_last = datetime.now()

        # For debugging
        if profile:
            pr = cProfile.Profile()
            pr.enable()

        if precalculated_features is None:
            precalculated_features = {}
        # Access the index to get the filtered data we need
        target_entity = self.entityset[self.target_eid]
        if ignored:
            # TODO: Just want to remove entities if don't have any (sub)features defined
            # on them anymore, rather than recreating
            ordered_entities = FeatureTree(self.entityset,
                                           self.features,
                                           ignored=ignored).ordered_entities
        else:
            ordered_entities = self.feature_tree.ordered_entities
        eframes_by_filter = \
            self.entityset.get_pandas_data_slice(filter_entity_ids=ordered_entities,
                                                 index_eid=self.target_eid,
                                                 instances=instance_ids,
                                                 time_last=time_last,
                                                 training_window=training_window,
                                                 verbose=verbose)

        # Handle an empty time slice by returning a dataframe with defaults
        if eframes_by_filter is None:
            return self.generate_default_df(instance_ids=instance_ids)

        finished_entity_ids = []
        # Populate entity_frames with precalculated features
        if len(precalculated_features) > 0:
            for entity_id, precalc_feature_values in precalculated_features.items(
            ):
                if entity_id in eframes_by_filter:
                    frame = eframes_by_filter[entity_id][entity_id]
                    eframes_by_filter[entity_id][entity_id] = pd.merge(
                        frame,
                        precalc_feature_values,
                        left_index=True,
                        right_index=True)
                else:
                    # Only features we're taking from this entity
                    # are precomputed
                    # Make sure the id variable is a column as well as an index
                    entity_id_var = self.entityset[entity_id].index
                    precalc_feature_values[
                        entity_id_var] = precalc_feature_values.index.values
                    eframes_by_filter[entity_id] = {
                        entity_id: precalc_feature_values
                    }
                    finished_entity_ids.append(entity_id)

        # Iterate over the top-level entities (filter entities) in sorted order
        # and calculate all relevant features under each one.

        if verbose:
            total_groups_to_compute = sum(
                len(group)
                for group in self.feature_tree.ordered_feature_groups.values())

            pbar = make_tqdm_iterator(total=total_groups_to_compute,
                                      desc="Computing features",
                                      unit="feature group")
            pbar.update(0)

        for filter_eid in ordered_entities:
            entity_frames = eframes_by_filter[filter_eid]

            # update the current set of entity frames with the computed features
            # from previously finished entities
            for eid in finished_entity_ids:
                # only include this frame if it's not from a descendent entity:
                # descendent entity frames will have to be re-calculated.
                # TODO: this check might not be necessary, depending on our
                # constraints
                if not self.entityset.find_backward_path(
                        start_entity_id=filter_eid, goal_entity_id=eid):
                    entity_frames[eid] = eframes_by_filter[eid][eid]

            if filter_eid in self.feature_tree.ordered_feature_groups:
                for group in self.feature_tree.ordered_feature_groups[
                        filter_eid]:
                    if verbose:
                        pbar.set_postfix({'running': 0})

                    handler = self._feature_type_handler(group[0])
                    handler(group, entity_frames)

                    if verbose:
                        pbar.update(1)

            finished_entity_ids.append(filter_eid)

        if verbose:
            pbar.set_postfix({'running': 0})
            pbar.refresh()
            sys.stdout.flush()
            pbar.close()

        # debugging
        if profile:
            pr.disable()
            s = cStringIO.StringIO()
            ps = pstats.Stats(pr, stream=s).sort_stats("cumulative", "tottime")
            ps.print_stats()
            prof_folder_path = os.path.join(ROOT_DIR, 'prof')
            if not os.path.exists(prof_folder_path):
                os.mkdir(prof_folder_path)
            with open(
                    os.path.join(prof_folder_path,
                                 'inst-%s.log' % list(instance_ids)[0]),
                    'w') as f:
                f.write(s.getvalue())

        df = eframes_by_filter[self.target_eid][self.target_eid]

        # fill in empty rows with default values
        missing_ids = [
            i for i in instance_ids if i not in df[target_entity.index]
        ]
        if missing_ids:
            df = df.append(
                self.generate_default_df(instance_ids=missing_ids,
                                         extra_columns=df.columns))
        return df[[feat.get_name() for feat in self.features]]
def parallel_calculate_chunks(chunks,
                              features,
                              approximate,
                              training_window,
                              verbose,
                              save_progress,
                              entityset,
                              n_jobs,
                              no_unapproximated_aggs,
                              cutoff_df_time_var,
                              target_time,
                              pass_columns,
                              dask_kwargs=None):
    from distributed import as_completed
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        client, cluster = create_client_and_cluster(n_jobs=n_jobs,
                                                    num_tasks=len(chunks),
                                                    dask_kwargs=dask_kwargs)
        # scatter the entityset
        # denote future with leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            print("Using EntitySet persisted on the cluster as dataset %s" %
                  (es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        end = time.time()
        scatter_time = end - start
        scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
        print(scatter_string.format(scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    except Exception:
        raise
    finally:
        if 'cluster' not in dask_kwargs and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
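# --- Hedged sketch of an n_jobs -> worker-count mapping in the style that
# n_jobs_to_workers (used by the earlier parallel_calculate_chunks version) is
# assumed to follow: positive values request that many workers, negative values
# count back from the number of available CPUs (sklearn-style, -1 meaning "all").
import multiprocessing


def n_jobs_to_workers_sketch(n_jobs):
    cpus = multiprocessing.cpu_count()
    if n_jobs < 0:
        # -1 means "all CPUs", -2 means "all but one", and so on
        return max(cpus + 1 + n_jobs, 1)
    return min(n_jobs, cpus)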
    def get_pandas_data_slice(self, filter_entity_ids, index_eid,
                              instances, time_last=None, training_window=None,
                              verbose=False):
        """
        Get the slice of data related to the supplied instances of the index
        entity.
        """
        eframes_by_filter = {}

        if verbose:
            iterator = make_tqdm_iterator(iterable=filter_entity_ids,
                                          desc="Gathering relevant data",
                                          unit="entity")
        else:
            iterator = filter_entity_ids
        # gather frames for each child, for each parent
        for filter_eid in iterator:
            # get the instances of the top-level entity linked by our instances
            toplevel_slice = self._related_instances(start_entity_id=index_eid,
                                                     final_entity_id=filter_eid,
                                                     instance_ids=instances,
                                                     time_last=time_last,
                                                     training_window=training_window)

            eframes = {filter_eid: toplevel_slice}

            # Do a breadth-first search of the relationship tree rooted at this
            # entity, filling out eframes for each entity we hit on the way.
            r_queue = self.get_backward_relationships(filter_eid)
            while r_queue:
                r = r_queue.pop(0)
                child_eid = r.child_variable.entity.id
                parent_eid = r.parent_variable.entity.id

                # If we've already seen this child, this is a diamond graph and
                # we don't know what to do
                if child_eid in eframes:
                    raise RuntimeError('Diamond graph detected!')

                # Add this child's children to the queue
                r_queue += self.get_backward_relationships(child_eid)

                # Query the child of the current backwards relationship for the
                # instances we want
                instance_vals = eframes[parent_eid][r.parent_variable.id]
                eframes[child_eid] =\
                    self.entity_stores[child_eid].query_by_values(
                        instance_vals, variable_id=r.child_variable.id,
                        time_last=time_last, training_window=training_window)

                # add link variables to this dataframe in order to link it to its
                # (grand)parents
                self._add_multigenerational_link_vars(frames=eframes,
                                                      start_entity_id=filter_eid,
                                                      end_entity_id=child_eid)

            eframes_by_filter[filter_eid] = eframes

        # If there are no instances of *this* entity in the index, return None
        if eframes_by_filter[index_eid][index_eid].shape[0] == 0:
            return None

        return eframes_by_filter
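# --- Hedged sketch of the breadth-first traversal described above, with a toy
# parent -> children mapping standing in for get_backward_relationships. The
# entity names are illustrative only.
toy_children = {"customers": ["sessions"], "sessions": ["log"], "log": []}


def bfs_entity_order(root):
    order, queue = [root], list(toy_children[root])
    while queue:
        eid = queue.pop(0)
        order.append(eid)
        # enqueue this child's own children, mirroring the r_queue loop above
        queue += toy_children[eid]
    return order


assert bfs_entity_order("customers") == ["customers", "sessions", "log"]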
    def calculate_all_features(self, instance_ids, time_last,
                               training_window=None, profile=False,
                               precalculated_features=None, ignored=None,
                               verbose=False):
        """
        Given a list of instance ids and features with a shared time window,
        generate and return a mapping of instance -> feature values.

        Args:
            instance_ids (list): List of instance ids for which to build features.

            time_last (pd.Timestamp): Last allowed time. Data from exactly this
                time is not allowed.

            training_window (Timedelta, optional): Data older than
                time_last by more than this will be ignored.

            profile (bool): Enable profiler if True.

            verbose (bool): Print output progress if True.

        Returns:
            pd.DataFrame : Pandas DataFrame of calculated feature values.
                Indexed by instance_ids. Columns in same order as features
                passed in.

        """
        assert len(instance_ids) > 0, "0 instance ids provided"
        self.instance_ids = instance_ids

        self.time_last = time_last
        if self.time_last is None:
            self.time_last = datetime.now()

        # For debugging
        if profile:
            pr = cProfile.Profile()
            pr.enable()

        if precalculated_features is None:
            precalculated_features = {}
        # Access the index to get the filtered data we need
        target_entity = self.entityset[self.target_eid]
        if ignored:
            # TODO: Just want to remove entities if don't have any (sub)features defined
            # on them anymore, rather than recreating
            ordered_entities = FeatureTree(self.entityset, self.features, ignored=ignored).ordered_entities
        else:
            ordered_entities = self.feature_tree.ordered_entities

        necessary_columns = self.feature_tree.necessary_columns
        eframes_by_filter = \
            self.entityset.get_pandas_data_slice(filter_entity_ids=ordered_entities,
                                                 index_eid=self.target_eid,
                                                 instances=instance_ids,
                                                 entity_columns=necessary_columns,
                                                 time_last=time_last,
                                                 training_window=training_window,
                                                 verbose=verbose)
        large_eframes_by_filter = None
        if any([f.uses_full_entity for f in self.feature_tree.all_features]):
            large_necessary_columns = self.feature_tree.necessary_columns_for_all_values_features
            large_eframes_by_filter = \
                self.entityset.get_pandas_data_slice(filter_entity_ids=ordered_entities,
                                                     index_eid=self.target_eid,
                                                     instances=None,
                                                     entity_columns=large_necessary_columns,
                                                     time_last=time_last,
                                                     training_window=training_window,
                                                     verbose=verbose)

        # Handle an empty time slice by returning a dataframe with defaults
        if eframes_by_filter is None:
            return self.generate_default_df(instance_ids=instance_ids)

        finished_entity_ids = []
        # Populate entity_frames with precalculated features
        if len(precalculated_features) > 0:
            for entity_id, precalc_feature_values in precalculated_features.items():
                if entity_id in eframes_by_filter:
                    frame = eframes_by_filter[entity_id][entity_id]
                    eframes_by_filter[entity_id][entity_id] = pd.merge(frame,
                                                                       precalc_feature_values,
                                                                       left_index=True,
                                                                       right_index=True)
                else:
                    # Only features we're taking from this entity
                    # are precomputed
                    # Make sure the id variable is a column as well as an index
                    entity_id_var = self.entityset[entity_id].index
                    precalc_feature_values[entity_id_var] = precalc_feature_values.index.values
                    eframes_by_filter[entity_id] = {entity_id: precalc_feature_values}
                    finished_entity_ids.append(entity_id)

        # Iterate over the top-level entities (filter entities) in sorted order
        # and calculate all relevant features under each one.
        if verbose:
            total_groups_to_compute = sum(len(group)
                                          for group in self.feature_tree.ordered_feature_groups.values())

            pbar = make_tqdm_iterator(total=total_groups_to_compute,
                                      desc="Computing features",
                                      unit="feature group")
            pbar.update(0)

        for filter_eid in ordered_entities:
            entity_frames = eframes_by_filter[filter_eid]
            large_entity_frames = None
            if large_eframes_by_filter is not None:
                large_entity_frames = large_eframes_by_filter[filter_eid]

            # update the current set of entity frames with the computed features
            # from previously finished entities
            for eid in finished_entity_ids:
                # only include this frame if it's not from a descendant entity:
                # descendant entity frames will have to be re-calculated.
                # TODO: this check might not be necessary, depending on our
                # constraints
                if not self.entityset.find_backward_path(start_entity_id=filter_eid,
                                                         goal_entity_id=eid):
                    entity_frames[eid] = eframes_by_filter[eid][eid]
                    # TODO: look this over again
                    # precalculated features will only be placed in entity_frames,
                    # and it's possible that they are the only features computed
                    # for an entity. In this case, the entity won't be present in
                    # large_eframes_by_filter. The relevant lines that this case passes
                    # through are 136-143
                    if (large_eframes_by_filter is not None and
                            eid in large_eframes_by_filter and eid in large_eframes_by_filter[eid]):
                        large_entity_frames[eid] = large_eframes_by_filter[eid][eid]

            if filter_eid in self.feature_tree.ordered_feature_groups:
                for group in self.feature_tree.ordered_feature_groups[filter_eid]:
                    if verbose:
                        pbar.set_postfix({'running': 0})

                    test_feature = group[0]
                    entity_id = test_feature.entity.id

                    input_frames_type = self.feature_tree.input_frames_type(test_feature)

                    input_frames = large_entity_frames
                    if input_frames_type == "subset_entity_frames":
                        input_frames = entity_frames

                    handler = self._feature_type_handler(test_feature)
                    result_frame = handler(group, input_frames)

                    output_frames_type = self.feature_tree.output_frames_type(test_feature)
                    if output_frames_type in ['full_and_subset_entity_frames', 'subset_entity_frames']:
                        index = entity_frames[entity_id].index
                        # If result_frame came from a uses_full_entity feature,
                        # and the input was large_entity_frames,
                        # then it's possible it doesn't contain some of the features
                        # in the output entity_frames
                        # We thus need to concatenate the existing frame with the result frame,
                        # making sure not to duplicate any columns
                        _result_frame = result_frame.reindex(index)
                        cols_to_keep = [c for c in _result_frame.columns
                                        if c not in entity_frames[entity_id].columns]
                        entity_frames[entity_id] = pd.concat([entity_frames[entity_id],
                                                              _result_frame[cols_to_keep]],
                                                             axis=1)

                    if output_frames_type in ['full_and_subset_entity_frames', 'full_entity_frames']:
                        index = large_entity_frames[entity_id].index
                        _result_frame = result_frame.reindex(index)
                        cols_to_keep = [c for c in _result_frame.columns
                                        if c not in large_entity_frames[entity_id].columns]
                        large_entity_frames[entity_id] = pd.concat([large_entity_frames[entity_id],
                                                                    _result_frame[cols_to_keep]],
                                                                   axis=1)

                    if verbose:
                        pbar.update(1)

            finished_entity_ids.append(filter_eid)

        if verbose:
            pbar.set_postfix({'running': 0})
            pbar.refresh()
            sys.stdout.flush()
            pbar.close()

        # debugging
        if profile:
            pr.disable()
            s = io.StringIO()
            ps = pstats.Stats(pr, stream=s).sort_stats("cumulative", "tottime")
            ps.print_stats()
            prof_folder_path = os.path.join(ROOT_DIR, 'prof')
            if not os.path.exists(prof_folder_path):
                os.mkdir(prof_folder_path)
            with open(os.path.join(prof_folder_path, 'inst-%s.log' %
                                   list(instance_ids)[0]), 'w') as f:
                f.write(s.getvalue())

        df = eframes_by_filter[self.target_eid][self.target_eid]

        # fill in empty rows with default values
        missing_ids = [i for i in instance_ids if i not in
                       df[target_entity.index].values]
        if missing_ids:
            df = df.append(self.generate_default_df(instance_ids=missing_ids,
                                                    extra_columns=df.columns))
        return df[[feat.get_name() for feat in self.features]]
    def build_features(self, variable_types=None, verbose=False):
        """Automatically builds feature definitions for target
            entity using Deep Feature Synthesis algorithm

        Args:
            variable_types (list[:class:`variable_types.Variable`] or str,
                optional): Types of variables to return. If None, default to
                Numeric, Categorical, Ordinal, and Boolean. If given as
                the string 'all', use all available variable types.

            verbose (bool, optional): If True, print progress.

        Returns:
            list[:class:`.primitives.BaseFeature`]: returns a list of
                features for target entity, sorted by feature depth
                (shallow first)
        """
        self.verbose = verbose
        if verbose:
            self.pbar = make_tqdm_iterator(desc="Building features")
        all_features = {}
        for e in self.es.entities:
            if e not in self.ignore_entities:
                all_features[e.id] = {}

        # add seed features, if any, for dfs to build on top of
        if self.seed_features is not None:
            for f in self.seed_features:
                self._handle_new_feature(all_features=all_features,
                                         new_feature=f)

        self.where_clauses = defaultdict(set)
        self._run_dfs(self.es[self.target_entity_id], [],
                      all_features,
                      max_depth=self.max_depth)

        new_features = list(all_features[self.target_entity_id].values())

        if variable_types is None:
            variable_types = [Numeric, Discrete, Boolean]
        elif variable_types == 'all':
            variable_types = None
        else:
            msg = "variable_types must be a list, or 'all'"
            assert isinstance(variable_types, list), msg

        if variable_types is not None:
            new_features = [
                f for f in new_features if any(
                    issubclass(f.variable_type, vt) for vt in variable_types)
            ]

        def check_secondary_index(f):
            secondary_time_index = self.es[
                self.target_entity_id].secondary_time_index
            for s_time_index, exclude in secondary_time_index.items():
                if isinstance(f, IdentityFeature) and f.variable.id in exclude:
                    return False
                elif isinstance(f, (BinaryFeature, Compare)):
                    if (not check_secondary_index(f.left)
                            or not check_secondary_index(f.right)):
                        return False
                if isinstance(f, TimeSince) and not check_secondary_index(
                        f.base_features[0]):
                    return False

            return True

        def filt(f):
            # remove identity features of the ID field of the target entity
            if (isinstance(f, IdentityFeature)
                    and f.entity.id == self.target_entity_id
                    and f.variable.id == self.es[self.target_entity_id].index):
                return False

            if (isinstance(
                    f, (IdentityFeature, BinaryFeature, Compare, TimeSince))
                    and not check_secondary_index(f)):

                return False

            return True

        new_features = list(filter(filt, new_features))

        # sanity check for duplicate features
        feature_hashes = [f.hash() for f in new_features]
        duplicated = set([h for h in feature_hashes if feature_hashes.count(h) > 1])
        assert len(duplicated) == 0, \
            'Multiple features with same name ' + str(duplicated)

        new_features.sort(key=lambda f: f.get_depth())

        new_features = self._filter_features(new_features)

        if self.max_features > 0:
            new_features = new_features[:self.max_features]

        if verbose:
            self.pbar.update(0)
            sys.stdout.flush()
            self.pbar.close()
            self.verbose = None
        return new_features
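
# --- Usage sketch (not part of the library source) ---------------------------
# A minimal, hedged example of driving build_features() above. It assumes the
# enclosing class is featuretools' DeepFeatureSynthesis and that the test
# helper make_ecommerce_entityset (used in docstrings later in this file) is
# importable; the import path and entity ids may differ between versions.
from featuretools.synthesis.deep_feature_synthesis import DeepFeatureSynthesis
from featuretools.tests.testing_utils import make_ecommerce_entityset

es = make_ecommerce_entityset()
dfs_builder = DeepFeatureSynthesis(target_entity_id="sessions",
                                   entityset=es,
                                   max_depth=2)
feature_defs = dfs_builder.build_features(verbose=True)
# feature_defs is sorted shallow-first, as promised by the docstring above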
def parallel_calculate_chunks(chunks, features, approximate, training_window,
                              verbose, save_progress, entityset, n_jobs,
                              no_unapproximated_aggs, cutoff_df_time_var,
                              target_time, pass_columns, dask_kwargs=None):
    from distributed import as_completed
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        client, cluster = create_client_and_cluster(n_jobs=n_jobs,
                                                    num_tasks=len(chunks),
                                                    dask_kwargs=dask_kwargs,
                                                    entityset_size=entityset.__sizeof__())
        # scatter the entityset
        # denote future with leading underscore
        if verbose:
            start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            if verbose:
                msg = "Using EntitySet persisted on the cluster as dataset {}"
                print(msg.format(es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        if verbose:
            end = time.time()
            scatter_time = end - start
            scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
            print(scatter_string.format(scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    except Exception:
        raise
    finally:
        if (not dask_kwargs or 'cluster' not in dask_kwargs) and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
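
# --- Usage sketch (not part of the library source) ---------------------------
# Hedged illustration of the parallel path above: calling
# calculate_feature_matrix() with n_jobs > 1 (or with a dask_kwargs dict)
# routes chunk computation through parallel_calculate_chunks(), which scatters
# the EntitySet and the pickled features to the workers once and then maps
# calculate_chunk over the chunks. The entityset and feature definitions
# mirror the docstring examples further down in this file.
import featuretools as ft
from featuretools.tests.testing_utils import make_ecommerce_entityset

es = make_ecommerce_entityset()
features = [ft.Feature(es["log"]["product_id"]),
            ft.Feature(es["log"]["value"])]

# Two local worker processes; a LocalCluster is created and closed for us.
fm = ft.calculate_feature_matrix(features,
                                 entityset=es,
                                 n_jobs=2,
                                 verbose=True)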
def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
                             entities=None, relationships=None, entityset=None,
                             cutoff_time_in_index=False,
                             training_window=None, approximate=None,
                             save_progress=None, verbose=False,
                             chunk_size=None,
                             profile=False):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[PrimitiveBase]): Feature definitions to be calculated.

        cutoff_time (pd.DataFrame or Datetime): Specifies at which time to calculate
            the features for each instance.  Can either be a DataFrame with
            'instance_id' and 'time' columns, DataFrame with the name of the
            index variable in the target entity and a time column, a list of values, or a single
            value to calculate for all instances. If the dataframe has more than two columns, any additional
            columns will be added to the resulting feature matrix.

        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.

        entities (dict[str -> tuple(pd.DataFrame, str, str)]): dictionary of
            entities. Entries take the format
            {entity id: (dataframe, id column, (time_column))}.

        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).

        entityset (EntitySet): An already initialized entityset. Required if
            entities and relationships are not defined.

        cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
            where the second index is the cutoff time (first is instance id).
            DataFrame will be sorted by (time, instance_id).

        training_window (dict[str -> Timedelta] or Timedelta, optional):
            Window or windows defining how much older than the cutoff time data
            can be to be included when calculating the feature.  To specify
            which entities to apply windows to, use a dictionary mapping entity
            id -> Timedelta. If None, all older data is used.

        approximate (Timedelta or str): Frequency to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        verbose (bool, optional): Print progress info. The time granularity is
            per chunk.

        profile (bool, optional): Enables profiling if True.

        chunk_size (int or float or None or "cutoff time"): Number of rows of
            output feature matrix to calculate at a time. If passed an integer
            greater than 0, will try to use that many rows per chunk. If passed
            a float value between 0 and 1, sets the chunk size to that
            percentage of all instances. If passed the string "cutoff time",
            rows are split per cutoff time.

        save_progress (str, optional): path to save intermediate computational results.
    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, PrimitiveBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if entities is not None and relationships is not None:
            entityset = EntitySet("entityset", entities, relationships)

    if entityset is not None:
        for f in features:
            f.entityset = entityset

    entityset = features[0].entityset
    target_entity = features[0].entity
    pass_columns = []

    if not isinstance(cutoff_time, pd.DataFrame):
        if cutoff_time is None:
            if entityset.time_type == NumericTimeIndex:
                cutoff_time = np.inf
            else:
                cutoff_time = datetime.now()

        if instance_ids is None:
            index_var = target_entity.index
            instance_ids = target_entity.df[index_var].tolist()

        if not isinstance(cutoff_time, list):
            cutoff_time = [cutoff_time] * len(instance_ids)

        map_args = [(id, time) for id, time in zip(instance_ids, cutoff_time)]
        df_args = pd.DataFrame(map_args, columns=['instance_id', 'time'])
        to_calc = df_args.values
        cutoff_time = pd.DataFrame(to_calc, columns=['instance_id', 'time'])
    else:
        cutoff_time = cutoff_time.copy()

        # handle how columns are named in cutoff_time
        if "instance_id" not in cutoff_time.columns:
            if target_entity.index not in cutoff_time.columns:
                raise AttributeError('Name of the index variable in the target entity'
                                     ' or "instance_id" must be present in cutoff_time')
            # rename to instance_id
            cutoff_time.rename(columns={target_entity.index: "instance_id"}, inplace=True)

        if "time" not in cutoff_time.columns:
            # take the first column that isn't instance_id and assume it is time
            not_instance_id = [c for c in cutoff_time.columns if c != "instance_id"]
            cutoff_time.rename(columns={not_instance_id[0]: "time"}, inplace=True)
        pass_columns = [column_name for column_name in cutoff_time.columns[2:]]

    if _check_time_type(cutoff_time['time'].iloc[0]) is None:
        raise ValueError("cutoff_time time values must be datetime or numeric")

    backend = PandasBackend(entityset, features)

    # Get dictionary of features to approximate
    if approximate is not None:
        to_approximate, all_approx_feature_set = gather_approximate_features(features, backend)
    else:
        to_approximate = defaultdict(list)
        all_approx_feature_set = None

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationPrimitive):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        deps = feature.get_deep_dependencies(all_approx_feature_set)
        for dependency in deps:
            if (isinstance(dependency, AggregationPrimitive) and
                    dependency not in to_approximate[dependency.entity.id]):
                no_unapproximated_aggs = False
                break

    cutoff_df_time_var = 'time'
    target_time = '_original_time'
    num_per_chunk = calc_num_per_chunk(chunk_size, cutoff_time.shape)

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time.copy(), approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var]

        cutoff_time_to_pass = binned_cutoff_time

    else:
        cutoff_time_to_pass = cutoff_time

    if num_per_chunk == "cutoff time":
        iterator = cutoff_time_to_pass.groupby(cutoff_df_time_var)
    else:
        iterator = get_next_chunk(cutoff_time=cutoff_time_to_pass,
                                  time_variable=cutoff_df_time_var,
                                  num_per_chunk=num_per_chunk)

    # if verbose, create progress bar
    if verbose:
        chunks = []
        if num_per_chunk == "cutoff time":
            for _, group in iterator:
                chunks.append(group)
        else:
            for chunk in iterator:
                chunks.append(chunk)

        pbar_string = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                       "Progress: {l_bar}{bar}| "
                       "Calculated: {n}/{total} chunks")
        iterator = make_tqdm_iterator(iterable=chunks,
                                      total=len(chunks),
                                      bar_format=pbar_string)
    feature_matrix = []

    for chunk in iterator:
        # if not using chunks, pull out the group dataframe
        if isinstance(chunk, tuple):
            chunk = chunk[1]
        _feature_matrix = calculate_chunk(features, chunk, approximate,
                                          entityset, training_window,
                                          profile, verbose,
                                          save_progress, backend,
                                          no_unapproximated_aggs,
                                          cutoff_df_time_var,
                                          target_time, pass_columns)
        feature_matrix.append(_feature_matrix)
        # Do a manual garbage collection in case objects from calculate_chunk
        # weren't collected automatically
        gc.collect()
    if verbose:
        iterator.close()
    feature_matrix = pd.concat(feature_matrix)
    feature_matrix.sort_index(level='time', kind='mergesort', inplace=True)
    if not cutoff_time_in_index:
        feature_matrix.reset_index(level='time', drop=True, inplace=True)

    if save_progress and os.path.exists(os.path.join(save_progress, 'temp')):
        shutil.rmtree(os.path.join(save_progress, 'temp'))

    return feature_matrix
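
# --- Usage sketch (not part of the library source) ---------------------------
# Hedged illustration of the cutoff_time and approximate parameters documented
# above: a DataFrame with 'instance_id' and 'time' columns requests one feature
# matrix row per (instance, time) pair, and approximate="1 day" bins nearby
# cutoff times so expensive aggregations are computed once per bin. The ids
# and timestamps are placeholders chosen to fall inside the mock data's range;
# adjust them for a real entityset.
import pandas as pd
import featuretools as ft
from featuretools.tests.testing_utils import make_ecommerce_entityset

es = make_ecommerce_entityset()
features = [ft.Feature(es["log"]["value"])]

cutoff_time = pd.DataFrame({
    "instance_id": [0, 1, 2],
    "time": pd.to_datetime(["2011-04-09 10:30:00",
                            "2011-04-09 10:31:00",
                            "2011-04-09 10:32:00"]),
})

fm = ft.calculate_feature_matrix(features,
                                 entityset=es,
                                 cutoff_time=cutoff_time,
                                 approximate="1 day")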
def calculate_feature_matrix(features,
                             entityset=None,
                             cutoff_time=None,
                             instance_ids=None,
                             entities=None,
                             relationships=None,
                             cutoff_time_in_index=False,
                             training_window=None,
                             approximate=None,
                             save_progress=None,
                             verbose=False,
                             chunk_size=None,
                             n_jobs=1,
                             dask_kwargs=None,
                             progress_callback=None,
                             include_cutoff_time=True):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[:class:`.FeatureBase`]): Feature definitions to be calculated.

        entityset (EntitySet): An already initialized entityset. Required if `entities` and `relationships`
            not provided

        cutoff_time (pd.DataFrame or Datetime): Specifies times at which to calculate
            the features for each instance. The resulting feature matrix will use data
            up to and including the cutoff_time. Can either be a DataFrame or a single
            value. If a DataFrame is passed the instance ids for which to calculate features
            must be in a column with the same name as the target entity index or a column
            named `instance_id`. The cutoff time values in the DataFrame must be in a column with
            the same name as the target entity time index or a column named `time`. If the
            DataFrame has more than two columns, any additional columns will be added to the
            resulting feature matrix. If a single value is passed, this value will be used for
            all instances.

        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.

        entities (dict[str -> tuple(pd.DataFrame, str, str, dict[str -> Variable])]): dictionary of
            entities. Entries take the format
            {entity id -> (dataframe, id column, (time_column), (variable_types))}.
            Note that time_column and variable_types are optional.

        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).

        cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
            where the second index is the cutoff time (first is instance id).
            DataFrame will be sorted by (time, instance_id).

        training_window (Timedelta or str, optional):
            Window defining how much time before the cutoff time data
            can be used when calculating features. If ``None``, all data before cutoff time is used.
            Defaults to ``None``.

        approximate (Timedelta or str): Frequency to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        verbose (bool, optional): Print progress info. The time granularity is
            per chunk.

        chunk_size (int or float or None): Maximum number of rows of
            output feature matrix to calculate at a time. If passed an integer
            greater than 0, will try to use that many rows per chunk. If passed
            a float value between 0 and 1, sets the chunk size to that
            percentage of all rows. If None and n_jobs > 1, it will be set to 1/n_jobs.

        n_jobs (int, optional): number of parallel processes to use when
            calculating feature matrix.

        dask_kwargs (dict, optional): Dictionary of keyword arguments to be
            passed when creating the dask client and scheduler. Even if n_jobs
            is not set, using `dask_kwargs` will enable multiprocessing.
            Main parameters:

            cluster (str or dask.distributed.LocalCluster):
                cluster or address of cluster to send tasks to. If unspecified,
                a cluster will be created.
            diagnostics_port (int):
                port number to use for web dashboard.  If left unspecified, web
                interface will not be enabled.

            Valid keyword arguments for LocalCluster will also be accepted.

        save_progress (str, optional): path to save intermediate computational results.

        progress_callback (callable): function to be called with incremental progress updates.
            Has the following parameters:

                update: percentage change (float between 0 and 100) in progress since last call
                progress_percent: percentage (float between 0 and 100) of total computation completed
                time_elapsed: total time in seconds that has elapsed since start of call

        include_cutoff_time (bool): Include data at cutoff times in feature calculations. Defaults to ``True``.

    Returns:
        pd.DataFrame: The feature matrix.
    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, FeatureBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if entities is not None and relationships is not None:
            entityset = EntitySet("entityset", entities, relationships)

    if any(isinstance(es.df, dd.DataFrame) for es in entityset.entities):
        if approximate:
            msg = "Using approximate is not supported with Dask Entities"
            raise ValueError(msg)
        if training_window:
            msg = "Using training_window is not supported with Dask Entities"
            raise ValueError(msg)

    target_entity = entityset[features[0].entity.id]

    cutoff_time = _validate_cutoff_time(cutoff_time, target_entity)

    if isinstance(cutoff_time, pd.DataFrame):
        if instance_ids:
            msg = "Passing 'instance_ids' is valid only if 'cutoff_time' is a single value or None - ignoring"
            warnings.warn(msg)
        pass_columns = [
            col for col in cutoff_time.columns
            if col not in ['instance_id', 'time']
        ]
        # make sure dtype of instance_id in cutoff time
        # is same as column it references
        target_entity = features[0].entity
        dtype = entityset[target_entity.id].df[target_entity.index].dtype
        cutoff_time["instance_id"] = cutoff_time["instance_id"].astype(dtype)
    else:
        pass_columns = []
        if cutoff_time is None:
            if entityset.time_type == NumericTimeIndex:
                cutoff_time = np.inf
            else:
                cutoff_time = datetime.now()

        if instance_ids is None:
            index_var = target_entity.index
            df = target_entity._handle_time(
                target_entity.df,
                time_last=cutoff_time,
                training_window=training_window,
                include_cutoff_time=include_cutoff_time)
            instance_ids = df[index_var]

        if isinstance(instance_ids, dd.Series):
            instance_ids = instance_ids.compute()
        elif is_instance(instance_ids, ks, 'Series'):
            instance_ids = instance_ids.to_pandas()

        # convert list or range object into series
        if not isinstance(instance_ids, pd.Series):
            instance_ids = pd.Series(instance_ids)

        cutoff_time = (cutoff_time, instance_ids)

    _check_cutoff_time_type(cutoff_time, entityset.time_type)

    # Approximate provides no benefit with a single cutoff time, so ignore it
    if isinstance(cutoff_time, tuple) and approximate is not None:
        msg = "Using approximate with a single cutoff_time value or no cutoff_time " \
            "provides no computational efficiency benefit"
        warnings.warn(msg)
        cutoff_time = pd.DataFrame({
            "instance_id":
            cutoff_time[1],
            "time": [cutoff_time[0]] * len(cutoff_time[1])
        })

    feature_set = FeatureSet(features)

    # Get features to approximate
    if approximate is not None:
        approximate_feature_trie = gather_approximate_features(feature_set)
        # Make a new FeatureSet that ignores approximated features
        feature_set = FeatureSet(
            features, approximate_feature_trie=approximate_feature_trie)

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationFeature):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        if approximate is not None:
            all_approx_features = {
                f
                for _, feats in feature_set.approximate_feature_trie
                for f in feats
            }
        else:
            all_approx_features = set()
        deps = feature.get_dependencies(deep=True, ignored=all_approx_features)
        for dependency in deps:
            if isinstance(dependency, AggregationFeature):
                no_unapproximated_aggs = False
                break

    cutoff_df_time_var = 'time'
    target_time = '_original_time'

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time, approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var]

        cutoff_time_to_pass = binned_cutoff_time

    else:
        cutoff_time_to_pass = cutoff_time

    if isinstance(cutoff_time, pd.DataFrame):
        cutoff_time_len = cutoff_time.shape[0]
    else:
        cutoff_time_len = len(cutoff_time[1])

    chunk_size = _handle_chunk_size(chunk_size, cutoff_time_len)
    tqdm_options = {
        'total': (cutoff_time_len / FEATURE_CALCULATION_PERCENTAGE),
        'bar_format': PBAR_FORMAT,
        'disable': True
    }

    if verbose:
        tqdm_options.update({'disable': False})
    elif progress_callback is not None:
        # allows us to utilize progress_bar updates without printing anywhere
        tqdm_options.update({'file': open(os.devnull, 'w'), 'disable': False})

    with make_tqdm_iterator(**tqdm_options) as progress_bar:
        if n_jobs != 1 or dask_kwargs is not None:
            feature_matrix = parallel_calculate_chunks(
                cutoff_time=cutoff_time_to_pass,
                chunk_size=chunk_size,
                feature_set=feature_set,
                approximate=approximate,
                training_window=training_window,
                save_progress=save_progress,
                entityset=entityset,
                n_jobs=n_jobs,
                no_unapproximated_aggs=no_unapproximated_aggs,
                cutoff_df_time_var=cutoff_df_time_var,
                target_time=target_time,
                pass_columns=pass_columns,
                progress_bar=progress_bar,
                dask_kwargs=dask_kwargs or {},
                progress_callback=progress_callback,
                include_cutoff_time=include_cutoff_time)
        else:
            feature_matrix = calculate_chunk(
                cutoff_time=cutoff_time_to_pass,
                chunk_size=chunk_size,
                feature_set=feature_set,
                approximate=approximate,
                training_window=training_window,
                save_progress=save_progress,
                entityset=entityset,
                no_unapproximated_aggs=no_unapproximated_aggs,
                cutoff_df_time_var=cutoff_df_time_var,
                target_time=target_time,
                pass_columns=pass_columns,
                progress_bar=progress_bar,
                progress_callback=progress_callback,
                include_cutoff_time=include_cutoff_time)

        # ensure rows are sorted by input order
        if isinstance(feature_matrix, pd.DataFrame):
            if isinstance(cutoff_time, pd.DataFrame):
                feature_matrix = feature_matrix.reindex(
                    pd.MultiIndex.from_frame(
                        cutoff_time[["instance_id", "time"]],
                        names=feature_matrix.index.names))
            else:
                # Maintain index dtype
                index_dtype = feature_matrix.index.get_level_values(0).dtype
                feature_matrix = feature_matrix.reindex(
                    cutoff_time[1].astype(index_dtype), level=0)
            if not cutoff_time_in_index:
                feature_matrix.reset_index(level='time',
                                           drop=True,
                                           inplace=True)

        if save_progress and os.path.exists(os.path.join(
                save_progress, 'temp')):
            shutil.rmtree(os.path.join(save_progress, 'temp'))

        # force to 100% since we saved last 5 percent
        previous_progress = progress_bar.n
        progress_bar.update(progress_bar.total - progress_bar.n)

        if progress_callback is not None:
            update, progress_percent, time_elapsed = update_progress_callback_parameters(
                progress_bar, previous_progress)
            progress_callback(update, progress_percent, time_elapsed)

        progress_bar.refresh()

    return feature_matrix
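
# --- Usage sketch (not part of the library source) ---------------------------
# Hedged example of the progress_callback hook documented above. The callable
# receives (update, progress_percent, time_elapsed) and can forward progress
# to any sink; here it simply prints. The commented call assumes `features`
# and `es` exist as in the docstring examples elsewhere in this file.
def log_progress(update, progress_percent, time_elapsed):
    # update: percentage gained since the previous call
    # progress_percent: total percentage completed so far
    # time_elapsed: seconds elapsed since calculate_feature_matrix started
    print("+{:.1f}% -> {:.1f}% after {:.1f}s".format(
        update, progress_percent, time_elapsed))

# fm = calculate_feature_matrix(features, entityset=es,
#                               progress_callback=log_progress,
#                               include_cutoff_time=True)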
def encode_features(feature_matrix, features, top_n=10, include_unknown=True,
                    to_encode=None, inplace=False, verbose=False):
    """Encode categorical features

        Args:
            feature_matrix (pd.DataFrame): Dataframe of features.
            features (list[PrimitiveBase]): Feature definitions in feature_matrix.
            top_n (int): Number of top values to include.
            include_unknown (bool): Add feature encoding an unknown class.
                Defaults to True.
            to_encode (list[str]): List of feature names to encode.
                Features not in this list are unencoded in the output matrix.
                Defaults to encoding all necessary features.
            inplace (bool): Encode feature_matrix in place. Defaults to False.
            verbose (bool): Print progress info.

        Returns:
            (pd.DataFrame, list): The encoded feature_matrix and the encoded features.

        Example:
            .. ipython:: python
                :suppress:

                from featuretools.tests.testing_utils import make_ecommerce_entityset
                import featuretools as ft
                es = make_ecommerce_entityset()

            .. ipython:: python

                f1 = ft.Feature(es["log"]["product_id"])
                f2 = ft.Feature(es["log"]["purchased"])
                f3 = ft.Feature(es["log"]["value"])

                features = [f1, f2, f3]
                ids = [0, 1, 2, 3, 4, 5]
                feature_matrix = ft.calculate_feature_matrix(features, es,
                                                             instance_ids=ids)

                fm_encoded, f_encoded = ft.encode_features(feature_matrix,
                                                           features)
                f_encoded

                fm_encoded, f_encoded = ft.encode_features(feature_matrix,
                                                           features, top_n=2)
                f_encoded

                fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                                           include_unknown=False)
                f_encoded

                fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                                           to_encode=['purchased'])
                f_encoded
    """
    if inplace:
        X = feature_matrix
    else:
        X = feature_matrix.copy()

    encoded = []
    feature_names = []
    for feature in features:
        for fname in feature.get_feature_names():
            assert fname in X.columns, (
                "Feature %s not found in feature matrix" % (fname)
            )
            feature_names.append(fname)

    extra_columns = [col for col in X.columns if col not in feature_names]

    if verbose:
        iterator = make_tqdm_iterator(iterable=features,
                                      total=len(features),
                                      desc="Encoding pass 1",
                                      unit="feature")
    else:
        iterator = features

    for f in iterator:
        # TODO: features with multiple columns are not encoded by this method,
        # which can cause an "encoded" matrix with non-numeric values
        is_discrete = issubclass(f.variable_type, Discrete)
        if (f.number_output_features > 1 or not is_discrete):
            if f.number_output_features > 1:
                logger.warning("Feature %s has multiple columns and will not "
                               "be encoded.  This may result in a matrix with"
                               " non-numeric values." % (f))
            encoded.append(f)
            continue

        if to_encode is not None and f.get_name() not in to_encode:
            encoded.append(f)
            continue

        val_counts = X[f.get_name()].value_counts().to_frame()
        index_name = val_counts.index.name
        if index_name is None:
            if 'index' in val_counts.columns:
                index_name = 'level_0'
            else:
                index_name = 'index'
        val_counts.reset_index(inplace=True)
        val_counts = val_counts.sort_values([f.get_name(), index_name],
                                            ascending=False)
        val_counts.set_index(index_name, inplace=True)
        unique = val_counts.head(top_n).index.tolist()
        for label in unique:
            add = f == label
            encoded.append(add)
            X[add.get_name()] = (X[f.get_name()] == label).astype(int)

        if include_unknown:
            unknown = f.isin(unique).NOT().rename(f.get_name() + " is unknown")
            encoded.append(unknown)
            X[unknown.get_name()] = (~X[f.get_name()].isin(unique)).astype(int)

        X.drop(f.get_name(), axis=1, inplace=True)

    new_columns = []
    for e in encoded:
        new_columns.extend(e.get_feature_names())

    new_columns.extend(extra_columns)
    new_X = X[new_columns]
    iterator = new_X.columns
    if verbose:
        iterator = make_tqdm_iterator(iterable=new_X.columns,
                                      total=len(new_X.columns),
                                      desc="Encoding pass 2",
                                      unit="feature")
    for c in iterator:
        if c in extra_columns:
            continue
        try:
            new_X[c] = pd.to_numeric(new_X[c], errors='raise')
        except (TypeError, ValueError):
            pass

    return new_X, encoded
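
# --- Behavior sketch (not part of the library source) ------------------------
# The encoding loop above, re-expressed for a single Discrete column with plain
# pandas so the top_n / include_unknown semantics are easy to verify: one 0/1
# column per top-n value, named "<feature> = <value>", plus one
# "<feature> is unknown" column covering everything else.
import pandas as pd

col = pd.Series(["a", "b", "a", "c", "b", "a"], name="product_id")
top_n = 2
top_values = col.value_counts().head(top_n).index.tolist()   # ["a", "b"]

encoded = pd.DataFrame({
    "product_id = {}".format(v): (col == v).astype(int) for v in top_values
})
encoded["product_id is unknown"] = (~col.isin(top_values)).astype(int)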
def encode_features(feature_matrix, features, top_n=10, include_unknown=True,
                    to_encode=None, inplace=False, verbose=False):
    """Encode categorical features

        Args:
            feature_matrix (pd.DataFrame): Dataframe of features.
            features (list[PrimitiveBase]): Feature definitions in feature_matrix.
            top_n (int): Number of top values to include.
            include_unknown (bool): Add feature encoding an unknown class.
                Defaults to True.
            to_encode (list[str]): List of feature names to encode.
                Features not in this list are unencoded in the output matrix.
                Defaults to encoding all necessary features.
            inplace (bool): Encode feature_matrix in place. Defaults to False.
            verbose (bool): Print progress info.

        Returns:
            (pd.DataFrame, list): The encoded feature_matrix and the encoded features.

        Example:
            .. ipython:: python
                :suppress:

                from featuretools.tests.testing_utils import make_ecommerce_entityset
                from featuretools.primitives import Feature
                import featuretools as ft
                es = make_ecommerce_entityset()

            .. ipython:: python

                f1 = Feature(es["log"]["product_id"])
                f2 = Feature(es["log"]["purchased"])
                f3 = Feature(es["log"]["value"])

                features = [f1, f2, f3]
                ids = [0, 1, 2, 3, 4, 5]
                feature_matrix = ft.calculate_feature_matrix(features, es,
                                                             instance_ids=ids)

                fm_encoded, f_encoded = ft.encode_features(feature_matrix,
                                                           features)
                f_encoded

                fm_encoded, f_encoded = ft.encode_features(feature_matrix,
                                                           features, top_n=2)
                f_encoded

                fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                                           include_unknown=False)
                f_encoded

                fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                                           to_encode=['purchased'])
                f_encoded
    """
    if inplace:
        X = feature_matrix
    else:
        X = feature_matrix.copy()

    encoded = []
    feature_names = []
    for feature in features:
        fname = feature.get_name()
        assert fname in X.columns, (
            "Feature %s not found in feature matrix" % (fname)
        )
        feature_names.append(fname)

    extra_columns = [col for col in X.columns if col not in feature_names]

    if verbose:
        iterator = make_tqdm_iterator(iterable=features,
                                      total=len(features),
                                      desc="Encoding pass 1",
                                      unit="feature")
    else:
        iterator = features

    for f in iterator:
        if (f.expanding or (not issubclass(f.variable_type, Discrete))):
            encoded.append(f)
            continue

        if to_encode is not None and f.get_name() not in to_encode:
            encoded.append(f)
            continue

        val_counts = X[f.get_name()].value_counts().to_frame()
        index_name = val_counts.index.name
        if index_name is None:
            if 'index' in val_counts.columns:
                index_name = 'level_0'
            else:
                index_name = 'index'
        val_counts.reset_index(inplace=True)
        val_counts = val_counts.sort_values([f.get_name(), index_name],
                                            ascending=False)
        val_counts.set_index(index_name, inplace=True)
        unique = val_counts.head(top_n).index.tolist()
        for label in unique:
            add = f == label
            encoded.append(add)
            X[add.get_name()] = (X[f.get_name()] == label).astype(int)

        if include_unknown:
            unknown = f.isin(unique).NOT().rename(f.get_name() + " = unknown")
            encoded.append(unknown)
            X[unknown.get_name()] = (~X[f.get_name()].isin(unique)).astype(int)

        X.drop(f.get_name(), axis=1, inplace=True)

    new_X = X[[e.get_name() for e in encoded] + extra_columns]
    iterator = new_X.columns
    if verbose:
        iterator = make_tqdm_iterator(iterable=new_X.columns,
                                      total=len(new_X.columns),
                                      desc="Encoding pass 2",
                                      unit="feature")
    for c in iterator:
        if c in extra_columns:
            continue
        try:
            new_X[c] = pd.to_numeric(new_X[c], errors='raise')
        except (TypeError, ValueError):
            pass

    return new_X, encoded
def encode_features(
    feature_matrix,
    features,
    top_n=DEFAULT_TOP_N,
    include_unknown=True,
    to_encode=None,
    inplace=False,
    drop_first=False,
    verbose=False,
):
    """Encode categorical features

    Args:
        feature_matrix (pd.DataFrame): Dataframe of features.
        features (list[PrimitiveBase]): Feature definitions in feature_matrix.
        top_n (int or dict[string -> int]): Number of top values to include.
            If dict[string -> int] is used, key is feature name and value is
            the number of top values to include for that feature.
            If a feature's name is not in dictionary, a default value of 10 is used.
        include_unknown (bool): Add feature encoding an unknown class.
            Defaults to True.
        to_encode (list[str]): List of feature names to encode.
            Features not in this list are unencoded in the output matrix.
            Defaults to encoding all necessary features.
        inplace (bool): Encode feature_matrix in place. Defaults to False.
        drop_first (bool): Whether to get k-1 dummies out of k categorical
            levels by removing the first level.
            Defaults to False.
        verbose (bool): Print progress info.

    Returns:
        (pd.DataFrame, list): The encoded feature_matrix and the encoded features.

    Example:
        .. ipython:: python
            :suppress:

            from featuretools.tests.testing_utils import make_ecommerce_entityset
            import featuretools as ft
            es = make_ecommerce_entityset()

        .. ipython:: python

            f1 = ft.Feature(es["log"].ww["product_id"])
            f2 = ft.Feature(es["log"].ww["purchased"])
            f3 = ft.Feature(es["log"].ww["value"])

            features = [f1, f2, f3]
            ids = [0, 1, 2, 3, 4, 5]
            feature_matrix = ft.calculate_feature_matrix(features, es,
                                                         instance_ids=ids)

            fm_encoded, f_encoded = ft.encode_features(feature_matrix,
                                                       features)
            f_encoded

            fm_encoded, f_encoded = ft.encode_features(feature_matrix,
                                                       features, top_n=2)
            f_encoded

            fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                                       include_unknown=False)
            f_encoded

            fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                                       to_encode=['purchased'])
            f_encoded

            fm_encoded, f_encoded = ft.encode_features(feature_matrix, features,
                                                       drop_first=True)
            f_encoded
    """
    if not isinstance(feature_matrix, pd.DataFrame):
        msg = "feature_matrix must be a Pandas DataFrame"
        raise TypeError(msg)

    if inplace:
        X = feature_matrix
    else:
        X = feature_matrix.copy()

    old_feature_names = set()
    for feature in features:
        for fname in feature.get_feature_names():
            assert fname in X.columns, "Feature %s not found in feature matrix" % (
                fname)
            old_feature_names.add(fname)

    pass_through = [col for col in X.columns if col not in old_feature_names]

    if verbose:
        iterator = make_tqdm_iterator(
            iterable=features,
            total=len(features),
            desc="Encoding pass 1",
            unit="feature",
        )
    else:
        iterator = features

    new_feature_list = []
    kept_columns = []
    encoded_columns = []
    columns_info = feature_matrix.ww.columns

    for f in iterator:
        # TODO: features with multiple columns are not encoded by this method,
        # which can cause an "encoded" matrix with non-numeric values
        is_discrete = {"category", "foreign_key"
                       }.intersection(f.column_schema.semantic_tags)
        if f.number_output_features > 1 or not is_discrete:
            if f.number_output_features > 1:
                logger.warning("Feature %s has multiple columns and will not "
                               "be encoded.  This may result in a matrix with"
                               " non-numeric values." % (f))
            new_feature_list.append(f)
            kept_columns.extend(f.get_feature_names())
            continue

        if to_encode is not None and f.get_name() not in to_encode:
            new_feature_list.append(f)
            kept_columns.extend(f.get_feature_names())
            continue

        val_counts = X[f.get_name()].value_counts()
        # Remove 0 count category values
        val_counts = val_counts[val_counts > 0].to_frame()
        index_name = val_counts.index.name
        if index_name is None:
            if "index" in val_counts.columns:
                index_name = "level_0"
            else:
                index_name = "index"
        val_counts.reset_index(inplace=True)
        val_counts = val_counts.sort_values([f.get_name(), index_name],
                                            ascending=False)
        val_counts.set_index(index_name, inplace=True)
        select_n = top_n
        if isinstance(top_n, dict):
            select_n = top_n.get(f.get_name(), DEFAULT_TOP_N)
        if drop_first:
            # use the resolved per-feature count so a dict-valued top_n still works
            select_n = min(len(val_counts), select_n)
            select_n = max(select_n - 1, 1)
        unique = val_counts.head(select_n).index.tolist()
        for label in unique:
            add = f == label
            add_name = add.get_name()
            new_feature_list.append(add)
            new_col = X[f.get_name()] == label
            new_col.rename(add_name, inplace=True)
            encoded_columns.append(new_col)

        if include_unknown:
            unknown = f.isin(unique).NOT().rename(f.get_name() + " is unknown")
            unknown_name = unknown.get_name()
            new_feature_list.append(unknown)
            new_col = ~X[f.get_name()].isin(unique)
            new_col.rename(unknown_name, inplace=True)
            encoded_columns.append(new_col)

        if inplace:
            X.drop(f.get_name(), axis=1, inplace=True)

    kept_columns.extend(pass_through)

    if inplace:
        for encoded_column in encoded_columns:
            X[encoded_column.name] = encoded_column
    else:
        X = pd.concat([X[kept_columns]] + encoded_columns, axis=1)

    entityset = new_feature_list[0].entityset
    ww_init_kwargs = get_ww_types_from_features(new_feature_list, entityset)

    # Grab ww metadata from feature matrix since it may be more exact
    for column in kept_columns:
        ww_init_kwargs["logical_types"][column] = columns_info[
            column].logical_type
        ww_init_kwargs["semantic_tags"][column] = columns_info[
            column].semantic_tags
        ww_init_kwargs["column_origins"][column] = columns_info[column].origin

    X.ww.init(**ww_init_kwargs)
    return X, new_feature_list
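
# --- Usage sketch (not part of the library source) ---------------------------
# Hedged example of the newer options in the Woodwork-era encode_features
# above: top_n given as a per-feature dict (features missing from the dict fall
# back to DEFAULT_TOP_N), or drop_first=True to keep k-1 indicator columns per
# categorical, similar to pd.get_dummies(drop_first=True). Mirrors the
# docstring example's entityset and features.
import featuretools as ft
from featuretools.tests.testing_utils import make_ecommerce_entityset

es = make_ecommerce_entityset()
features = [ft.Feature(es["log"].ww["product_id"]),
            ft.Feature(es["log"].ww["purchased"]),
            ft.Feature(es["log"].ww["value"])]
fm = ft.calculate_feature_matrix(features, es, instance_ids=[0, 1, 2, 3, 4, 5])

# Three dummy columns for product_id, the default number elsewhere
fm_encoded, f_encoded = ft.encode_features(fm, features,
                                           top_n={"product_id": 3})

# Alternatively, drop the first level of each encoded categorical
fm_encoded, f_encoded = ft.encode_features(fm, features, drop_first=True)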
def calculate_feature_matrix(features,
                             entityset=None,
                             cutoff_time=None,
                             instance_ids=None,
                             entities=None,
                             relationships=None,
                             cutoff_time_in_index=False,
                             training_window=None,
                             approximate=None,
                             save_progress=None,
                             verbose=False,
                             chunk_size=None,
                             n_jobs=1,
                             dask_kwargs=None,
                             progress_callback=None):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[:class:`.FeatureBase`]): Feature definitions to be calculated.

        entityset (EntitySet): An already initialized entityset. Required if `entities` and `relationships`
            not provided

        cutoff_time (pd.DataFrame or Datetime): Specifies at which time to calculate
            the features for each instance. The resulting feature matrix will use data
            up to and including the cutoff_time. Can either be a DataFrame with
            'instance_id' and 'time' columns, DataFrame with the name of the
            index variable in the target entity and a time column, or a single
            value to calculate for all instances. If the dataframe has more than two columns, any additional
            columns will be added to the resulting feature matrix.

        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.

        entities (dict[str -> tuple(pd.DataFrame, str, str)]): dictionary of
            entities. Entries take the format
            {entity id: (dataframe, id column, (time_column))}.

        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).

        cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
            where the second index is the cutoff time (first is instance id).
            DataFrame will be sorted by (time, instance_id).

        training_window (Timedelta or str, optional):
            Window defining how much time before the cutoff time data
            can be used when calculating features. If ``None``, all data before cutoff time is used.
            Defaults to ``None``.

        approximate (Timedelta or str): Frequency to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        verbose (bool, optional): Print progress info. The time granularity is
            per chunk.

        chunk_size (int or float or None): maximum number of rows of
            output feature matrix to calculate at a time. If passed an integer
            greater than 0, will try to use that many rows per chunk. If passed
            a float value between 0 and 1, sets the chunk size to that
            percentage of all rows. If None and n_jobs > 1, it will be set to 1/n_jobs.

        n_jobs (int, optional): number of parallel processes to use when
            calculating feature matrix.

        dask_kwargs (dict, optional): Dictionary of keyword arguments to be
            passed when creating the dask client and scheduler. Even if n_jobs
            is not set, using `dask_kwargs` will enable multiprocessing.
            Main parameters:

            cluster (str or dask.distributed.LocalCluster):
                cluster or address of cluster to send tasks to. If unspecified,
                a cluster will be created.
            diagnostics port (int):
                port number to use for web dashboard.  If left unspecified, web
                interface will not be enabled.

            Valid keyword arguments for LocalCluster will also be accepted.

        save_progress (str, optional): path to save intermediate computational results.

        progress_callback (callable): function to be called with incremental progress updates.
            Has the following parameters:

                update: percentage change (float between 0 and 100) in progress since last call
                progress_percent: percentage (float between 0 and 100) of total computation completed
                time_elapsed: total time in seconds that has elapsed since start of call

    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, FeatureBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if entities is not None and relationships is not None:
            entityset = EntitySet("entityset", entities, relationships)

    target_entity = entityset[features[0].entity.id]
    pass_columns = []

    if not isinstance(cutoff_time, pd.DataFrame):
        if isinstance(cutoff_time, list):
            raise TypeError("cutoff_time must be a single value or DataFrame")

        if cutoff_time is None:
            if entityset.time_type == NumericTimeIndex:
                cutoff_time = np.inf
            else:
                cutoff_time = datetime.now()

        if instance_ids is None:
            index_var = target_entity.index
            df = target_entity._handle_time(target_entity.df,
                                            time_last=cutoff_time,
                                            training_window=training_window)
            instance_ids = df[index_var].tolist()

        cutoff_time = [cutoff_time] * len(instance_ids)
        map_args = [(id, time) for id, time in zip(instance_ids, cutoff_time)]
        cutoff_time = pd.DataFrame(map_args, columns=['instance_id', 'time'])
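        # For example, a single cutoff of 2020-01-01 with instance_ids [1, 2]
        # produces:
        #    instance_id       time
        # 0            1 2020-01-01
        # 1            2 2020-01-01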

    cutoff_time = cutoff_time.reset_index(drop=True)
    # handle how columns are named in cutoff_time
    # maybe add _check_time_dtype helper function
    if "instance_id" not in cutoff_time.columns:
        if target_entity.index not in cutoff_time.columns:
            raise AttributeError(
                'Name of the index variable in the target entity'
                ' or "instance_id" must be present in cutoff_time')
        # rename to instance_id
        cutoff_time.rename(columns={target_entity.index: "instance_id"},
                           inplace=True)

    if "time" not in cutoff_time.columns:
        # take the first column that isn't instance_id and assume it is time
        not_instance_id = [
            c for c in cutoff_time.columns if c != "instance_id"
        ]
        cutoff_time.rename(columns={not_instance_id[0]: "time"}, inplace=True)

    # Check that cutoff_time time type matches entityset time type
    if entityset.time_type == NumericTimeIndex:
        if cutoff_time['time'].dtype.name not in PandasTypes._pandas_numerics:
            raise TypeError("cutoff_time times must be numeric: try casting "
                            "via pd.to_numeric(cutoff_time['time'])")
    elif entityset.time_type == DatetimeTimeIndex:
        if cutoff_time['time'].dtype.name not in PandasTypes._pandas_datetimes:
            raise TypeError(
                "cutoff_time times must be datetime type: try casting via pd.to_datetime(cutoff_time['time'])"
            )
    assert (cutoff_time[['instance_id', 'time']].duplicated().sum() == 0), \
        "Duplicated rows in cutoff time dataframe."
    pass_columns = [column_name for column_name in cutoff_time.columns[2:]]

    if _check_time_type(cutoff_time['time'].iloc[0]) is None:
        raise ValueError("cutoff_time time values must be datetime or numeric")

    # make sure dtype of instance_id in cutoff time
    # is same as column it references
    target_entity = features[0].entity
    dtype = entityset[target_entity.id].df[target_entity.index].dtype
    cutoff_time["instance_id"] = cutoff_time["instance_id"].astype(dtype)

    feature_set = FeatureSet(features)

    # Get features to approximate
    if approximate is not None:
        approximate_feature_trie = gather_approximate_features(feature_set)
        # Make a new FeatureSet that ignores approximated features
        feature_set = FeatureSet(
            features, approximate_feature_trie=approximate_feature_trie)

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationFeature):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        if approximate is not None:
            all_approx_features = {
                f
                for _, feats in feature_set.approximate_feature_trie
                for f in feats
            }
        else:
            all_approx_features = set()
        deps = feature.get_dependencies(deep=True, ignored=all_approx_features)
        for dependency in deps:
            if isinstance(dependency, AggregationFeature):
                no_unapproximated_aggs = False
                break

    cutoff_df_time_var = 'time'
    target_time = '_original_time'

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time.copy(), approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var]

        cutoff_time_to_pass = binned_cutoff_time

    else:
        cutoff_time_to_pass = cutoff_time

    chunk_size = _handle_chunk_size(chunk_size, cutoff_time.shape[0])
    tqdm_options = {
        'total': (cutoff_time.shape[0] / FEATURE_CALCULATION_PERCENTAGE),
        'bar_format': PBAR_FORMAT,
        'disable': True
    }
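    # The total is padded by 1 / FEATURE_CALCULATION_PERCENTAGE so the bar can
    # be pushed to exactly 100% once the held-back progress is flushed at the
    # end (see the final update below).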

    if verbose:
        tqdm_options.update({'disable': False})
    elif progress_callback is not None:
        # allows us to utilize progress_bar updates without printing to anywhere
        tqdm_options.update({'file': open(os.devnull, 'w'), 'disable': False})

    progress_bar = make_tqdm_iterator(**tqdm_options)

    if n_jobs != 1 or dask_kwargs is not None:
        feature_matrix = parallel_calculate_chunks(
            cutoff_time=cutoff_time_to_pass,
            chunk_size=chunk_size,
            feature_set=feature_set,
            approximate=approximate,
            training_window=training_window,
            save_progress=save_progress,
            entityset=entityset,
            n_jobs=n_jobs,
            no_unapproximated_aggs=no_unapproximated_aggs,
            cutoff_df_time_var=cutoff_df_time_var,
            target_time=target_time,
            pass_columns=pass_columns,
            progress_bar=progress_bar,
            dask_kwargs=dask_kwargs or {},
            progress_callback=progress_callback)
    else:
        feature_matrix = calculate_chunk(
            cutoff_time=cutoff_time_to_pass,
            chunk_size=chunk_size,
            feature_set=feature_set,
            approximate=approximate,
            training_window=training_window,
            save_progress=save_progress,
            entityset=entityset,
            no_unapproximated_aggs=no_unapproximated_aggs,
            cutoff_df_time_var=cutoff_df_time_var,
            target_time=target_time,
            pass_columns=pass_columns,
            progress_bar=progress_bar,
            progress_callback=progress_callback)

    # ensure rows are sorted by input order
    feature_matrix = feature_matrix.reindex(
        cutoff_time[["instance_id", "time"]])
    if not cutoff_time_in_index:
        feature_matrix.reset_index(level='time', drop=True, inplace=True)

    if save_progress and os.path.exists(os.path.join(save_progress, 'temp')):
        shutil.rmtree(os.path.join(save_progress, 'temp'))

    # force to 100% since we saved last 5 percent
    previous_progress = progress_bar.n
    progress_bar.update(progress_bar.total - progress_bar.n)

    if progress_callback is not None:
        update, progress_percent, time_elapsed = update_progress_callback_parameters(
            progress_bar, previous_progress)
        progress_callback(update, progress_percent, time_elapsed)

    progress_bar.refresh()
    progress_bar.close()

    return feature_matrix
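# Hedged usage sketch (illustration only): calculating features at explicit
# cutoff times with the API above. The entityset `es`, the feature list and
# the specific ids/timestamps are assumptions made for this example.
def _example_calculate_feature_matrix(es, features):
    import pandas as pd
    import featuretools as ft

    # One row per (instance, cutoff time); only data up to and including each
    # cutoff time is used for the corresponding row.
    cutoff_time = pd.DataFrame({
        "instance_id": [1, 2],
        "time": pd.to_datetime(["2014-01-01", "2014-01-02"]),
    })

    return ft.calculate_feature_matrix(features,
                                       entityset=es,
                                       cutoff_time=cutoff_time,
                                       cutoff_time_in_index=True,
                                       verbose=True)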
def calculate_feature_matrix(features,
                             entityset=None,
                             cutoff_time=None,
                             instance_ids=None,
                             entities=None,
                             relationships=None,
                             cutoff_time_in_index=False,
                             training_window=None,
                             approximate=None,
                             save_progress=None,
                             verbose=False,
                             chunk_size=None,
                             profile=False):
    """Calculates a matrix for a given set of instance ids and calculation times.

    Args:
        features (list[PrimitiveBase]): Feature definitions to be calculated.

        entityset (EntitySet): An already initialized entityset. Required if `entities` and `relationships`
            not provided

        cutoff_time (pd.DataFrame or Datetime): Specifies at which time to calculate
            the features for each instance.  Can either be a DataFrame with
            'instance_id' and 'time' columns, DataFrame with the name of the
            index variable in the target entity and a time column, or a single
            value to calculate for all instances. If the dataframe has more than two columns, any additional
            columns will be added to the resulting feature matrix.

        instance_ids (list): List of instances to calculate features on. Only
            used if cutoff_time is a single datetime.

        entities (dict[str -> tuple(pd.DataFrame, str, str)]): dictionary of
            entities. Entries take the format
            {entity id: (dataframe, id column, (time_column))}.

        relationships (list[(str, str, str, str)]): list of relationships
            between entities. List items are a tuple with the format
            (parent entity id, parent variable, child entity id, child variable).

        cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
            where the second index is the cutoff time (first is instance id).
            DataFrame will be sorted by (time, instance_id).

        training_window (dict[str -> Timedelta] or Timedelta, optional):
            Window or windows defining how much older than the cutoff time data
            can be and still be included when calculating the feature. To specify
            which entities to apply windows to, use a dictionary mapping entity
            id -> Timedelta. If None, all older data is used.

        approximate (Timedelta or str): Frequency to group instances with similar
            cutoff times by for features with costly calculations. For example,
            if bucket is 24 hours, all instances with cutoff times on the same
            day will use the same calculation for expensive features.

        verbose (bool, optional): Print progress info. The time granularity is
            per chunk.

        profile (bool, optional): Enables profiling if True.

        chunk_size (int or float or None or "cutoff time"): Number of rows of
            output feature matrix to calculate at a time. If passed an integer
            greater than 0, will try to use that many rows per chunk. If passed
            a float value between 0 and 1 sets the chunk size to that
            percentage of all instances. If passed the string "cutoff time",
            rows are split per cutoff time.

        save_progress (str, optional): path to save intermediate computational results.
    """
    assert (isinstance(features, list) and features != [] and
            all([isinstance(feature, PrimitiveBase) for feature in features])), \
        "features must be a non-empty list of features"

    # handle loading entityset
    from featuretools.entityset.entityset import EntitySet
    if not isinstance(entityset, EntitySet):
        if entities is not None and relationships is not None:
            entityset = EntitySet("entityset", entities, relationships)

    target_entity = entityset[features[0].entity.id]
    pass_columns = []

    if not isinstance(cutoff_time, pd.DataFrame):
        if isinstance(cutoff_time, list):
            raise TypeError("cutoff_time must be a single value or DataFrame")

        if cutoff_time is None:
            if entityset.time_type == NumericTimeIndex:
                cutoff_time = np.inf
            else:
                cutoff_time = datetime.now()

        if instance_ids is None:
            index_var = target_entity.index
            instance_ids = target_entity.df[index_var].tolist()

        cutoff_time = [cutoff_time] * len(instance_ids)
        map_args = [(id, time) for id, time in zip(instance_ids, cutoff_time)]
        cutoff_time = pd.DataFrame(map_args, columns=['instance_id', 'time'])
    else:
        cutoff_time = cutoff_time.copy()

        # handle how columns are named in cutoff_time
        if "instance_id" not in cutoff_time.columns:
            if target_entity.index not in cutoff_time.columns:
                raise AttributeError(
                    'Name of the index variable in the target entity'
                    ' or "instance_id" must be present in cutoff_time')
            # rename to instance_id
            cutoff_time.rename(columns={target_entity.index: "instance_id"},
                               inplace=True)

        if "time" not in cutoff_time.columns:
            # take the first column that isn't instance_id and assume it is time
            not_instance_id = [
                c for c in cutoff_time.columns if c != "instance_id"
            ]
            cutoff_time.rename(columns={not_instance_id[0]: "time"},
                               inplace=True)
        if cutoff_time['time'].dtype == object:
            if (entityset.time_type == NumericTimeIndex
                    and cutoff_time['time'].dtype.name.find('int') == -1
                    and cutoff_time['time'].dtype.name.find('float') == -1):
                raise TypeError(
                    "cutoff_time times must be numeric: try casting via pd.to_numeric(cutoff_time['time'])"
                )
            elif (entityset.time_type == DatetimeTimeIndex
                  and cutoff_time['time'].dtype.name.find('time') == -1):
                raise TypeError(
                    "cutoff_time times must be datetime type: try casting via pd.to_datetime(cutoff_time['time'])"
                )
        pass_columns = [column_name for column_name in cutoff_time.columns[2:]]

    if _check_time_type(cutoff_time['time'].iloc[0]) is None:
        raise ValueError("cutoff_time time values must be datetime or numeric")

    backend = PandasBackend(entityset, features)

    # Get dictionary of features to approximate
    if approximate is not None:
        to_approximate, all_approx_feature_set = gather_approximate_features(
            features, backend)
    else:
        to_approximate = defaultdict(list)
        all_approx_feature_set = None

    # Check if there are any non-approximated aggregation features
    no_unapproximated_aggs = True
    for feature in features:
        if isinstance(feature, AggregationPrimitive):
            # do not need to check if feature is in to_approximate since
            # only base features of direct features can be in to_approximate
            no_unapproximated_aggs = False
            break

        deps = feature.get_deep_dependencies(all_approx_feature_set)
        for dependency in deps:
            if (isinstance(dependency, AggregationPrimitive) and dependency
                    not in to_approximate[dependency.entity.id]):
                no_unapproximated_aggs = False
                break

    cutoff_df_time_var = 'time'
    target_time = '_original_time'
    num_per_chunk = calc_num_per_chunk(chunk_size, cutoff_time.shape)

    if approximate is not None:
        # If there are approximated aggs, bin times
        binned_cutoff_time = bin_cutoff_times(cutoff_time.copy(), approximate)

        # Think about collisions: what if original time is a feature
        binned_cutoff_time[target_time] = cutoff_time[cutoff_df_time_var]

        cutoff_time_to_pass = binned_cutoff_time

    else:
        cutoff_time_to_pass = cutoff_time

    if num_per_chunk == "cutoff time":
        iterator = cutoff_time_to_pass.groupby(cutoff_df_time_var)
    else:
        iterator = get_next_chunk(cutoff_time=cutoff_time_to_pass,
                                  time_variable=cutoff_df_time_var,
                                  num_per_chunk=num_per_chunk)
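    # The iterator yields (cutoff time, group) tuples in the groupby case and
    # plain DataFrame chunks in the get_next_chunk case; both are handled in
    # the loop below.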

    # if verbose, create progress bar
    if verbose:
        chunks = []
        if num_per_chunk == "cutoff time":
            for _, group in iterator:
                chunks.append(group)
        else:
            for chunk in iterator:
                chunks.append(chunk)

        pbar_string = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                       "Progress: {l_bar}{bar}| "
                       "Calculated: {n}/{total} chunks")
        iterator = make_tqdm_iterator(iterable=chunks,
                                      total=len(chunks),
                                      bar_format=pbar_string)
    feature_matrix = []
    backend = PandasBackend(entityset, features)

    for chunk in iterator:
        # if not using chunks, pull out the group dataframe
        if isinstance(chunk, tuple):
            chunk = chunk[1]
        _feature_matrix = calculate_chunk(features, chunk, approximate,
                                          entityset, training_window, profile,
                                          verbose, save_progress, backend,
                                          no_unapproximated_aggs,
                                          cutoff_df_time_var, target_time,
                                          pass_columns)
        feature_matrix.append(_feature_matrix)
        # Do a manual garbage collection in case objects from calculate_chunk
        # weren't collected automatically
        gc.collect()
    if verbose:
        iterator.close()
    feature_matrix = pd.concat(feature_matrix)

    feature_matrix.sort_index(level='time', kind='mergesort', inplace=True)
    if not cutoff_time_in_index:
        feature_matrix.reset_index(level='time', drop=True, inplace=True)

    if save_progress and os.path.exists(os.path.join(save_progress, 'temp')):
        shutil.rmtree(os.path.join(save_progress, 'temp'))

    return feature_matrix
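# Hedged sketch for the older API above (illustration only): splitting the
# computation so each distinct cutoff time is calculated as its own chunk, as
# described in the chunk_size docstring. `es`, `features` and `cutoff_time`
# are assumed to already exist.
def _example_chunk_per_cutoff_time(es, features, cutoff_time):
    import featuretools as ft

    return ft.calculate_feature_matrix(features,
                                       entityset=es,
                                       cutoff_time=cutoff_time,
                                       chunk_size="cutoff time")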
    def get_pandas_data_slice(self, filter_entity_ids, index_eid,
                              instances, entity_columns=None,
                              time_last=None, training_window=None,
                              verbose=False):
        """
        Get the slice of data related to the supplied instances of the index
        entity.
        """
        eframes_by_filter = {}

        if verbose:
            iterator = make_tqdm_iterator(iterable=filter_entity_ids,
                                          desc="Gathering relevant data",
                                          unit="entity")
        else:
            iterator = filter_entity_ids
        # gather frames for each child, for each parent
        for filter_eid in iterator:
            # get the instances of the top-level entity linked by our instances
            toplevel_slice = self._related_instances(start_entity_id=index_eid,
                                                     final_entity_id=filter_eid,
                                                     instance_ids=instances,
                                                     time_last=time_last,
                                                     training_window=training_window)

            eframes = {filter_eid: toplevel_slice}

            # Do a breadth-first search of the relationship tree rooted at this
            # entity, filling out eframes for each entity we hit on the way.
            r_queue = self.get_backward_relationships(filter_eid)
            while r_queue:
                r = r_queue.pop(0)
                child_eid = r.child_variable.entity.id
                child_columns = None
                if entity_columns is not None and child_eid not in entity_columns:
                    # entity_columns specifies which columns to extract
                    # if it skips a relationship (specifies child and grandparent columns)
                    # we need to at least add the ids of the intermediate entity
                    child_columns = [v.id for v in self[child_eid].variables
                                     if isinstance(v, (vtypes.Index, vtypes.Id,
                                                       vtypes.TimeIndex))]
                elif entity_columns is not None:
                    child_columns = entity_columns[child_eid]

                parent_eid = r.parent_variable.entity.id

                # If we've already seen this child, this is a diamond graph and
                # we don't know what to do
                if child_eid in eframes:
                    raise RuntimeError('Diamond graph detected!')

                # Add this child's children to the queue
                r_queue += self.get_backward_relationships(child_eid)

                # Query the child of the current backwards relationship for the
                # instances we want
                instance_vals = eframes[parent_eid][r.parent_variable.id]
                eframes[child_eid] =\
                    self.entity_stores[child_eid].query_by_values(
                        instance_vals,
                        variable_id=r.child_variable.id,
                        columns=child_columns,
                        time_last=time_last,
                        training_window=training_window)

                # add link variables to this dataframe in order to link it to its
                # (grand)parents
                self._add_multigenerational_link_vars(frames=eframes,
                                                      start_entity_id=filter_eid,
                                                      end_entity_id=child_eid)

            eframes_by_filter[filter_eid] = eframes

        # If there are no instances of *this* entity in the index, return None
        if eframes_by_filter[index_eid][index_eid].shape[0] == 0:
            return None

        return eframes_by_filter
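    # Hedged sketch (illustration only): get_pandas_data_slice is an internal
    # helper, so the call below only illustrates the shape of its inputs and
    # output. The "customers" entity id and the instance ids are assumptions.
    def _example_get_pandas_data_slice(self):
        frames = self.get_pandas_data_slice(filter_entity_ids=["customers"],
                                            index_eid="customers",
                                            instances=[1, 2],
                                            time_last=None)
        # frames maps each filter entity id to a dict of {entity id: dataframe},
        # or is None when no instances of the index entity remain.
        return frames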