Example #1
    def test_replace_input_formats_listlike(self):
        # both dicts
        to_rep = {'A': np.nan, 'B': 0, 'C': ''}
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
                        'C': ['', 'asdf', 'fd']})
        filled = df.replace(to_rep, values)
        expected = {k: v.replace(to_rep[k], values[k]) for k, v in df.items()}
        assert_frame_equal(filled, DataFrame(expected))

        result = df.replace([0, 2, 5], [5, 2, 0])
        expected = DataFrame({'A': [np.nan, 5, np.inf], 'B': [5, 2, 0],
                              'C': ['', 'asdf', 'fd']})
        assert_frame_equal(result, expected)

        # scalar to dict
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5],
                        'C': ['', 'asdf', 'fd']})
        filled = df.replace(np.nan, values)
        expected = {k: v.replace(np.nan, values[k]) for k, v in df.items()}
        assert_frame_equal(filled, DataFrame(expected))

        # list to list
        to_rep = [np.nan, 0, '']
        values = [-2, -1, 'missing']
        result = df.replace(to_rep, values)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], values[i], inplace=True)
        assert_frame_equal(result, expected)

        msg = r"Replacement lists must match in length\. Expecting 3 got 2"
        with pytest.raises(ValueError, match=msg):
            df.replace(to_rep, values[1:])
Example #2
def int_frame():
    """
    Fixture for DataFrame of ints with index of unique strings

    Columns are ['A', 'B', 'C', 'D']

                A  B  C  D
    vpBeWjM651  1  0  1  0
    5JyxmrP1En -1  0  0  0
    qEDaoD49U2 -1  1  0  0
    m66TkTfsFe  0  0  0  0
    EHPaNzEUFm -1  0 -1  0
    fpRJCevQhi  2  0  0  0
    OlQvnmfi3Q  0  0 -2  0
    ...        .. .. .. ..
    uB1FPlz4uP  0  0  0  1
    EcSe6yNzCU  0  0 -1  0
    L50VudaiI8 -1  1 -2  0
    y3bpw4nwIp  0 -1  0  0
    H0RdLLwrCT  1  1  0  0
    rY82K0vMwm  0  0  0  0
    1OPIUjnkjk  2  0  0  0

    [30 rows x 4 columns]
    """
    df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()})
    # force these all to int64 to avoid platform testing issues
    return DataFrame({c: s for c, s in df.items()}, dtype=np.int64)
Example #3
def _features_to_properties(features: pd.DataFrame) -> Dict[str, np.ndarray]:
    """Converts a features DataFrame to a deprecated properties dictionary.

    See Also
    --------
    :meth:`_FeatureTable.properties`
    """
    return {name: series.to_numpy() for name, series in features.items()}
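A quick illustration of the conversion, assuming the helper above and its imports are in scope (the column names and values here are made up):

import pandas as pd

features = pd.DataFrame({'area': [1.0, 2.0], 'label': [3, 4]})
props = _features_to_properties(features)
print(type(props['area']))  # <class 'numpy.ndarray'>
print(props['label'])       # [3 4]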
Example #4
 def test_items(self):
     # GH 17213, GH 13918
     cols = ["a", "b", "c"]
     df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)
     for c, (k, v) in zip(cols, df.items()):
         assert c == k
         assert isinstance(v, Series)
         assert (df[k] == v).all()
Example #5
 def test_items(self):
     # GH 17213, GH 13918
     cols = ['a', 'b', 'c']
     df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)
     for c, (k, v) in zip(cols, df.items()):
         assert c == k
         assert isinstance(v, Series)
         assert (df[k] == v).all()
Example #6
def str_to_cats(dataframe: pd.DataFrame):
    """
    Convert string column to categories
    """
    for col_name, col in dataframe.items():
        if is_string_dtype(col):
            dataframe[col_name] = col.astype('category').cat.as_ordered()
    return dataframe
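A minimal usage sketch for str_to_cats, assuming pandas and is_string_dtype are imported as the snippet requires; string columns become ordered categoricals while numeric columns are left alone:

import pandas as pd

df = pd.DataFrame({'grade': ['low', 'high', 'mid'], 'score': [1, 3, 2]})
df = str_to_cats(df)
print(df['grade'].dtype)                # category
print(df['grade'].cat.codes.tolist())   # [1, 0, 2] -- codes follow sorted categories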
Example #7
 def test_items(self):
     # issue #17213, #13918
     cols = ['a', 'b', 'c']
     df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)
     for c, (k, v) in zip(cols, df.items()):
         assert c == k
         assert isinstance(v, Series)
         assert (df[k] == v).all()
Example #8
def _test(model_split: dict, test_data: DataFrame, evaluation: bool,
          is_binary_classifier: bool) -> Tuple[DataFrame, float]:
    clusterer = model_split[ModelType.CLUSTERER.value]
    classifier = model_split[ModelType.CLASSIFIER.value]

    test_data = clusterer.cluster_data(test_data)

    results_df = DataFrame()
    auc = 0

    non_empty_clusters = clusterer.n_clusters

    for cluster in range(clusterer.n_clusters):
        cluster_test_df = test_data[cluster]
        if cluster_test_df.empty:
            non_empty_clusters -= 1
        else:
            cluster_targets_df = cluster_test_df['label']
            if evaluation:
                try:
                    if hasattr(classifier[cluster], 'decision_function'):
                        scores = classifier[cluster].decision_function(
                            cluster_test_df.drop(['label'], axis=1))
                    else:
                        scores = classifier[cluster].predict_proba(
                            cluster_test_df.drop(['label'], axis=1))
                        if np.size(scores, 1) >= 2:  # checks number of columns
                            scores = scores[:, 1]
                except (NotImplementedError, KeyError):
                    if hasattr(classifier[cluster], 'decision_function'):
                        scores = classifier[cluster].decision_function(
                            cluster_test_df.drop(['label'], axis=1).values)
                    else:
                        scores = classifier[cluster].predict_proba(
                            cluster_test_df.drop(['label'], axis=1).values)
                        try:
                            if np.size(scores, 1) >= 2:  # checks number of columns
                                scores = scores[:, 1]
                        except Exception:
                            pass
                auc += get_auc(cluster_targets_df, scores)
            try:
                cluster_test_df['predicted'] = classifier[cluster].predict(
                    cluster_test_df.drop(['label'], axis=1))
            except (NotImplementedError, KeyError):
                cluster_test_df['predicted'] = classifier[cluster].predict(
                    cluster_test_df.drop(['label'], axis=1).values)

            results_df = results_df.append(cluster_test_df)

    if is_binary_classifier or max(
        [len(set(t['label'])) for _, t in test_data.items()]) <= 2:
        auc = float(auc) / non_empty_clusters
    else:
        pass  # TODO: check if AUC is ok for multiclass, otherwise implement

    return results_df, auc
Example #9
def _sample_dfs(t_df: pyspark.sql.DataFrame, t_fracs: pd.DataFrame,
                c_can_df: pyspark.sql.DataFrame, c_fracs: pd.DataFrame,
                match_col: str) -> Tuple[DataFrame, DataFrame]:
    r"""given treatment and control pops and their stratified sample
    fracs, return balanced pops

    Parameters
    ----------
    t_df : pyspark.DataFrame
        treatment pop
    t_fracs: pd.DataFrame
        with columns `match_col` and 'treatment_scaled_sample_fraction'
    c_can_df : pyspark.DataFrame
        control can pop
    c_fracs : pd.DataFrame
        with columns `match_col` and control_scaled_sample_fraction

    Returns
    -------
    t_out : pyspark.sql.DataFrame
    c_out : pyspark.sql.DataFrame

    Raises
    ------
    UncaughtExceptions

    """
    _persist_if_unpersisted(t_df)
    _persist_if_unpersisted(c_can_df)

    t_fracs = t_fracs.set_index(
        match_col).treatment_scaled_sample_fraction.to_dict()
    t_dict = {}
    for key, value in t_fracs.items():
        t_dict[int(key)] = min(float(value), 1)
    t_out = t_df.sampleBy(col=match_col, fractions=t_dict, seed=42)

    c_fracs = c_fracs.set_index(
        match_col).control_scaled_sample_fraction.to_dict()
    c_dict = {}
    for key, value in c_fracs.items():
        c_dict[int(key)] = float(value)
    c_out = c_can_df.sampleBy(col=match_col, fractions=c_dict, seed=42)

    return t_out, c_out
Example #10
def make_mem_efficient_gene_arrays(calls: pd.DataFrame):

    names, values = zip(*calls.items())

    namesA, namesB = zip(*itertools.combinations(names, r=2))

    valuesA, valuesB = zip(*itertools.combinations(values, r=2))

    return namesA, namesB, valuesA, valuesB
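A small demo of the pairwise pairing with made-up gene calls; each column is paired with every other column exactly once:

import pandas as pd

calls = pd.DataFrame({'g1': [0, 1], 'g2': [1, 1], 'g3': [0, 0]})
namesA, namesB, valuesA, valuesB = make_mem_efficient_gene_arrays(calls)
print(namesA)  # ('g1', 'g1', 'g2')
print(namesB)  # ('g2', 'g3', 'g3')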
Example #11
def int_frame():
    """
    Fixture for DataFrame of ints with index of unique strings

    Columns are ['A', 'B', 'C', 'D']
    """
    df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()})
    # force these all to int64 to avoid platform testing issues
    return DataFrame({c: s for c, s in df.items()}, dtype=np.int64)
Example #12
def get_dtypes_and_schemas_of_dataframe(dataframe: pd.DataFrame):
    dtypes_by_column_name = {}
    schema_type_hints_by_column_name = {}

    for column_name, column_values in dataframe.items():
        dtypes_by_column_name[column_name], schema_type_hints_by_column_name[column_name] = \
            get_dtype_and_schema_of_array(column_values)

    return dtypes_by_column_name, schema_type_hints_by_column_name
Example #13
def plot_neff_vs_width(df: pd.DataFrame, **kwargs):
    width = df.width
    for mode_number, neff in df.items():
        if mode_number != "width":
            plt.plot(width, neff, ".-", label=str(mode_number))

    plt.legend(**kwargs)
    plt.xlabel("width (um)")
    plt.ylabel("neff")
Example #14
 def _unravel_gradient_covariance_matrix(cls, parameter_name,
                                         covariance_matrix: pd.DataFrame):
     return {
         cls._gradient_covariance_name.format(parameter_name=parameter_name,
                                              gate_name_1=gate_1,
                                              gate_name_2=gate_2): cov_entry
         for gate_1, cov_column in covariance_matrix.items()
         for gate_2, cov_entry in cov_column.items()
     }
Example #15
def perf(data: pd.DataFrame) -> pd.DataFrame:
    """
    Performance rebased to 100
    """
    return pd.DataFrame(
        pd.concat(
            [(srs.dropna().pct_change().fillna(0).add(1).cumprod().mul(100))
             for _, srs in data.items()],
            axis=1))
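A worked example of the rebasing, assuming the perf function above: a series starting at 10 and gaining 10% per step maps onto a curve starting at 100.

import pandas as pd

data = pd.DataFrame({'fund': [10.0, 11.0, 12.1]})
print(perf(data)['fund'].tolist())  # ~[100.0, 110.0, 121.0] (up to float rounding)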
Example #16
 def add_feature(src: pd.DataFrame,
                 target: dict,
                 prefix: str = None) -> None:
     if prefix is None:
         prefix = ""
     else:
         prefix += "_"
     for k, v in src.items():
         target[str(prefix) + str(k)] = v
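A tiny sketch of how add_feature flattens a frame into a feature dict, assuming the function above is callable as shown (column names are made up):

import pandas as pd

target = {}
add_feature(pd.DataFrame({'a': [1], 'b': [2]}), target, prefix='raw')
print(sorted(target))  # ['raw_a', 'raw_b'] -- each value is the column Series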
Example #17
    def test_replace_input_formats_listlike(self):
        # both dicts
        to_rep = {"A": np.nan, "B": 0, "C": ""}
        values = {"A": 0, "B": -1, "C": "missing"}
        df = DataFrame({
            "A": [np.nan, 0, np.inf],
            "B": [0, 2, 5],
            "C": ["", "asdf", "fd"]
        })
        filled = df.replace(to_rep, values)
        expected = {k: v.replace(to_rep[k], values[k]) for k, v in df.items()}
        tm.assert_frame_equal(filled, DataFrame(expected))

        result = df.replace([0, 2, 5], [5, 2, 0])
        expected = DataFrame({
            "A": [np.nan, 5, np.inf],
            "B": [5, 2, 0],
            "C": ["", "asdf", "fd"]
        })
        tm.assert_frame_equal(result, expected)

        # scalar to dict
        values = {"A": 0, "B": -1, "C": "missing"}
        df = DataFrame({
            "A": [np.nan, 0, np.nan],
            "B": [0, 2, 5],
            "C": ["", "asdf", "fd"]
        })
        filled = df.replace(np.nan, values)
        expected = {k: v.replace(np.nan, values[k]) for k, v in df.items()}
        tm.assert_frame_equal(filled, DataFrame(expected))

        # list to list
        to_rep = [np.nan, 0, ""]
        values = [-2, -1, "missing"]
        result = df.replace(to_rep, values)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], values[i], inplace=True)
        tm.assert_frame_equal(result, expected)

        msg = r"Replacement lists must match in length\. Expecting 3 got 2"
        with pytest.raises(ValueError, match=msg):
            df.replace(to_rep, values[1:])
Example #18
    def test_replace_input_formats_listlike(self):
        # both dicts
        to_rep = {'A': np.nan, 'B': 0, 'C': ''}
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({
            'A': [np.nan, 0, np.inf],
            'B': [0, 2, 5],
            'C': ['', 'asdf', 'fd']
        })
        filled = df.replace(to_rep, values)
        expected = {k: v.replace(to_rep[k], values[k]) for k, v in df.items()}
        assert_frame_equal(filled, DataFrame(expected))

        result = df.replace([0, 2, 5], [5, 2, 0])
        expected = DataFrame({
            'A': [np.nan, 5, np.inf],
            'B': [5, 2, 0],
            'C': ['', 'asdf', 'fd']
        })
        assert_frame_equal(result, expected)

        # scalar to dict
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({
            'A': [np.nan, 0, np.nan],
            'B': [0, 2, 5],
            'C': ['', 'asdf', 'fd']
        })
        filled = df.replace(np.nan, values)
        expected = {k: v.replace(np.nan, values[k]) for k, v in df.items()}
        assert_frame_equal(filled, DataFrame(expected))

        # list to list
        to_rep = [np.nan, 0, '']
        values = [-2, -1, 'missing']
        result = df.replace(to_rep, values)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], values[i], inplace=True)
        assert_frame_equal(result, expected)

        msg = r"Replacement lists must match in length\. Expecting 3 got 2"
        with pytest.raises(ValueError, match=msg):
            df.replace(to_rep, values[1:])
Example #19
def test_apply_differently_indexed():
    df = DataFrame(np.random.randn(20, 10))

    result = df.apply(Series.describe, axis=0)
    expected = DataFrame({i: v.describe() for i, v in df.items()}, columns=df.columns)
    tm.assert_frame_equal(result, expected)

    result = df.apply(Series.describe, axis=1)
    expected = DataFrame({i: v.describe() for i, v in df.T.items()}, columns=df.index).T
    tm.assert_frame_equal(result, expected)
Example #20
    def __createSeriesForAttributes(dataframe: pd.DataFrame, timeIndex: int, timeIndexType: Type) \
            -> Tuple[List[QtCharts.QLineSeries], float, float]:
        """ Creates a QLineSeries for every column in the dataframe. 'timeIndex' column is used for
        xAxis

        :return: tuple as (list of series, yMin, yMax)
        """
        timeIndexName: str = dataframe.columns[timeIndex]

        # Convert time values to their numerical equivalent
        if timeIndexType == Types.Datetime:
            # Time axis is Datetime, so convert every date into the number of ms from 01/01/1970
            # dataframe[timeIndexName]: pd.Series[pd.Timestamp]
            # This may not be super accurate
            dataframe.loc[:, timeIndexName] = pd.to_numeric(
                dataframe[timeIndexName], downcast='integer',
                errors='coerce').values / (10**6)
            # dataframe[timeIndexName] \
            #    .map(lambda timestamp: int(timestamp.to_pydatetime().timestamp() * 1000) if )
        else:
            # Types.Ordinal
            # dataframe[timeIndexName]: pd.Series[pd.Categorical]
            dataframe.loc[:, timeIndexName] = dataframe[
                timeIndexName].cat.codes.to_list()

        timeValues: pd.Series = dataframe[timeIndexName].astype(float)
        # Remove time column since we already used it to create the time points
        dataframe = dataframe.drop(timeIndexName, axis=1)

        # Create series for every column (excluding time)
        allSeries: List[QtCharts.QLineSeries] = list()
        # Also keep track of the range the y axis should have
        yMin: float = None
        yMax: float = None
        for colName, valueSeries in dataframe.items():
            valueSeries = pd.Series(valueSeries)
            if pd.api.types.is_categorical_dtype(valueSeries):
                # makes sure this is a series of floats
                valueSeries = valueSeries.cat.codes.astype(float)
            # Compute minimum and maximum of series and update global range
            smin = valueSeries.min()
            smax = valueSeries.max()
            yMin = smin if (yMin is None or yMin > smin) else yMin
            yMax = smax if (yMax is None or yMax < smax) else yMax
            # Create series
            qSeries = QtCharts.QLineSeries()
            points: List[QPointF] = list(
                map(lambda t: QPointF(*t), zip(timeValues, valueSeries)))
            qSeries.append(points)
            qSeries.setName(colName)
            qSeries.setUseOpenGL(True)
            qSeries.setPointsVisible(True)  # This is ignored with OpenGL enabled
            allSeries.append(qSeries)
        return allSeries, yMin, yMax
Example #21
    def validate_predictions(predictions: pd.DataFrame):
        names = predictions.columns.values
        assert len(names) >= 2, \
            "predictions frame should have 2 columns (regression) or more (classification)"
        assert names[-1] == "truth", \
            "last column of predictions frame must be named `truth`"
        assert names[-2] == "predictions", \
            "second-to-last column of predictions frame must be named `predictions`"
        if len(names) == 2:  # regression
            for name, col in predictions.items():
                pd.to_numeric(col)  # pandas will raise if we have non-numerical values
        else:  # classification
            predictors = names[:-2]
            probabilities = predictions.iloc[:, :-2]
            preds = predictions.iloc[:, -2]
            truth = predictions.iloc[:, -1]
            assert np.array_equal(
                predictors, np.sort(predictors)
            ), "Predictors columns are not sorted in lexicographic order."
            assert set(np.unique(predictors)) == set(
                predictors
            ), "Predictions contain multiple columns with the same label."
            for name, col in probabilities.items():
                pd.to_numeric(col)  # pandas will raise if we have non-numerical values

            if _encode_predictions_and_truth_:
                assert np.array_equal(truth, truth.astype(int)), \
                    "Values in truth column are not encoded."
                assert np.array_equal(preds, preds.astype(int)), \
                    "Values in predictions column are not encoded."
                predictors_set = set(range(len(predictors)))
                validate_row = lambda r: r[:-2].astype(float).values.argmax() == r[-2]
            else:
                predictors_set = set(predictors)
                validate_row = lambda r: r[:-2].astype(float).idxmax() == r[-2]

            truth_set = set(truth.unique())
            if predictors_set < truth_set:
                log.warning(
                    "Truth column contains values unseen during training: no matching probability column."
                )
            if predictors_set > truth_set:
                log.warning(
                    "Truth column doesn't contain all the possible target values: the test dataset may be too small."
                )
            predictions_set = set(preds.unique())
            assert predictions_set <= predictors_set, \
                "Predictions column contains unexpected values: {}.".format(
                    predictions_set - predictors_set)
            assert predictions.apply(validate_row, axis=1).all(), \
                "Predictions don't always match the predictor with the highest probability."
Example #22
def jCurrencyConversionChart(fromCurrency, toCurrencies, amount):
    fromCurrency = fromCurrency.upper()
    amount = float(amount)
    toCurrencies = [x.upper() for x in toCurrencies]

    result = {}
    for toCurrency in toCurrencies:
        result[toCurrency] = jConvertCurrency(fromCurrency, toCurrency, amount)
    result = DataFrame(result.items(), columns=['to_currency', 'amount'])
    result['from_currency'] = fromCurrency
    return result[['from_currency', 'to_currency', 'amount']]
Example #23
def compute_matrix_from_columns(
    input_df: DataFrame, callback: Callable, remove_na: bool = True
) -> DataFrame:
    output_data = OrderedDict.fromkeys(input_df.columns)

    for feature1_name, feature_1_series in input_df.items():
        output_data[feature1_name] = OrderedDict.fromkeys(input_df.columns)

        for feature2_name, feature_2_series in input_df.items():

            if remove_na:
                mask = ~feature_1_series.isna() & ~feature_2_series.isna()
            else:
                mask = [True] * len(feature_1_series)

            output_data[feature1_name][feature2_name] = callback(
                feature_1_series[mask], feature_2_series[mask]
            )

    return DataFrame(output_data, columns=input_df.columns, index=input_df.columns)
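A hypothetical call that turns the helper above into a pairwise correlation matrix; only compute_matrix_from_columns comes from the snippet, the data and callback are made up:

import numpy as np
from pandas import DataFrame

df = DataFrame({'x': [1.0, 2.0, 3.0, 4.0],
                'y': [2.0, 4.0, 6.0, 8.0],
                'z': [4.0, 3.0, 2.0, np.nan]})
corr = compute_matrix_from_columns(
    df, callback=lambda a, b: np.corrcoef(a, b)[0, 1])
print(corr)  # 1.0 on the diagonal; x/z is -1.0, computed only on the non-NaN rows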
Example #24
    def kstest_dataframe(ror: pd.DataFrame, distr: str = 'norm') -> pd.DataFrame:
        """
        Kolmogorov-Smirnov goodness-of-fit test for each time series in a Pandas DataFrame.

        Returns:
            DataFrame with the test statistic and the p-value for each column.
        """
        test_dict = {}
        for label, content in ror.items():
            test_values = Frame.kstest_series(content, distr=distr)
            test_dict.update({label: test_values})
        return pd.DataFrame.from_dict(test_dict, orient='columns')
Example #25
def save_pandas_to_lmdb(df: pd.DataFrame, lmdb_path: str, max_size: int = None):
    if max_size is None:
        max_size = df.memory_usage(deep=True).sum() * 1.5
        max_size = int(max_size)

    df = df.reset_index(drop=True)
    df = df.to_dict(orient='index')

    env = lmdb.open(lmdb_path, map_size=max_size)
    with env.begin(write=True) as txn:
        for k, v in tqdm(df.items()):
            txn.put(str(k).encode(), msgpack.packb(v))
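A hedged usage sketch: it needs the lmdb, msgpack and tqdm packages that the snippet relies on, and the path below is just an example.

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
save_pandas_to_lmdb(df, '/tmp/example_lmdb')  # one msgpack-packed record per row index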
Example #26
def transform_columns_to_categorical(
        df: pd.DataFrame, ordered: Dict = None) -> pd.DataFrame:
    df = df.copy()
    ordered = ordered or {}

    for n, c in df.items():
        if is_string_dtype(c):
            df[n] = c.astype('category').cat.as_ordered()

            if n in ordered:
                df[n] = df[n].cat.set_categories(ordered[n], ordered=True)

    return df
Example #27
def cats_to_codes(dataframe: pd.DataFrame, max_n_cats: int = None):
    """
    converst categories to ints
    :param dataframe:
    :param max_n_cats: converts categories to int only if there are less than max_n_cats different categories on the column
    :return:
    """
    for col_name, col in dataframe.items():
        if not is_numeric_dtype(col) and (
                max_n_cats is None or len(col.cat.categories) > max_n_cats):
            dataframe[col_name] = pd.Categorical(
                col).codes + 1  # nulls are -1 so with +1 they are 0
    return dataframe
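A minimal sketch of cats_to_codes on a categorical column (made-up data); note the +1 shift that moves nulls from -1 to 0:

import pandas as pd

df = pd.DataFrame({'size': pd.Categorical(['S', 'M', None, 'L'])})
cats_to_codes(df)
print(df['size'].tolist())  # [3, 2, 0, 1]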
Example #28
def to_dict(x: pd.DataFrame, drop_na: bool = True) -> Union[list, dict]:
    """ Convert a pandas DataFrame to a row-wise list of dictionaries """

    # dicts pass through unchanged; DataFrames become a list of row records
    if isinstance(x, pd.DataFrame):
        x = x.to_dict(orient='records')

    # Drop None values - these do not need to be sent each time
    if drop_na:
        x = [{k: v for k, v in row.items() if not pd.isna(v)} for row in x]

    return x
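A quick check of the record-wise conversion, assuming to_dict above; the NaN entry is dropped from the second row:

import pandas as pd

df = pd.DataFrame({'a': [1, None], 'b': ['x', 'y']})
print(to_dict(df))  # [{'a': 1.0, 'b': 'x'}, {'b': 'y'}]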
Example #29
    def test_from_records_dictlike(self):

        # test the dict methods
        df = DataFrame(
            {
                "A": np.array(np.random.randn(6), dtype=np.float64),
                "A1": np.array(np.random.randn(6), dtype=np.float64),
                "B": np.array(np.arange(6), dtype=np.int64),
                "C": ["foo"] * 6,
                "D": np.array([True, False] * 3, dtype=bool),
                "E": np.array(np.random.randn(6), dtype=np.float32),
                "E1": np.array(np.random.randn(6), dtype=np.float32),
                "F": np.array(np.arange(6), dtype=np.int32),
            }
        )

        # columns is in a different order here than the actual items iterated
        # from the dict
        blocks = df._to_dict_of_blocks()
        columns = []
        for b in blocks.values():
            columns.extend(b.columns)

        asdict = {x: y for x, y in df.items()}
        asdict2 = {x: y.values for x, y in df.items()}

        # dict of series & dict of ndarrays (have dtype info)
        results = []
        results.append(DataFrame.from_records(asdict).reindex(columns=df.columns))
        results.append(
            DataFrame.from_records(asdict, columns=columns).reindex(columns=df.columns)
        )
        results.append(
            DataFrame.from_records(asdict2, columns=columns).reindex(columns=df.columns)
        )

        for r in results:
            tm.assert_frame_equal(r, df)
Example #30
def apply_cats(dataframe: pd.DataFrame, cats_dataframe: pd.DataFrame):
    """
    apply the categories in cats_dataframe to dataframe
    :param dataframe:
    :param cats_dataframe:
    :return:
    """
    for col_name, col in dataframe.items():
        if (col_name in cats_dataframe.columns) and (
                cats_dataframe[col_name].dtype.name == 'category'):
            dataframe[col_name] = col.astype('category').cat.as_ordered()
            dataframe[col_name] = dataframe[col_name].cat.set_categories(
                cats_dataframe[col_name].cat.categories, ordered=True)
    return dataframe
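A minimal train/test sketch for apply_cats, assuming the function above; the test column picks up the category order learned on the training frame:

import pandas as pd

train = pd.DataFrame({'color': ['red', 'blue', 'green']})
train['color'] = train['color'].astype('category').cat.as_ordered()

test = pd.DataFrame({'color': ['green', 'red']})
apply_cats(test, train)
print(test['color'].cat.codes.tolist())  # [1, 2] -- codes match the training categories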
Example #31
    def test_apply_differently_indexed(self):
        df = DataFrame(np.random.randn(20, 10))

        result0 = df.apply(Series.describe, axis=0)
        expected0 = DataFrame({i: v.describe()
                               for i, v in df.items()},
                              columns=df.columns)
        assert_frame_equal(result0, expected0)

        result1 = df.apply(Series.describe, axis=1)
        expected1 = DataFrame({i: v.describe()
                               for i, v in df.T.items()},
                              columns=df.index).T
        assert_frame_equal(result1, expected1)
Example #32
def infer_schema_from_df(
    df: pd.DataFrame,
    features,
    entities,
    timestamp_key: str = None,
    entity_columns=None,
    options: InferOptions = InferOptions.Null,
):
    """infer feature set schema from dataframe"""
    timestamp_fields = []
    current_entities = list(entities.keys())
    entity_columns = entity_columns or []

    def upsert_entity(name, value_type):
        if name in current_entities:
            entities[name].value_type = value_type
        else:
            entities[name] = Entity(name=name, value_type=value_type)

    for column, series in df.items():
        value_type = _get_column_type(series)
        is_entity = column in entity_columns or column in current_entities
        if is_entity:
            upsert_entity(column, value_type)
        elif (
            InferOptions.get_common_options(options, InferOptions.Features)
            and column != timestamp_key
        ):
            if column in features.keys():
                features[column].value_type = value_type
            else:
                features[column] = Feature(name=column, value_type=value_type)
        if value_type == "datetime" and not is_entity:
            timestamp_fields.append(column)

    if InferOptions.get_common_options(options, InferOptions.Index):
        # infer types of index fields
        if df.index.name:
            value_type = _get_column_type(df.index)
            upsert_entity(df.index.name, value_type)
        elif df.index.nlevels > 1:
            for level, name in zip(df.index.levels, df.index.names):
                value_type = _get_column_type(level)
                upsert_entity(name, value_type)
                if value_type == "datetime":
                    timestamp_fields.append(name)

    if len(timestamp_fields) == 1 and not timestamp_key:
        return timestamp_fields[0]
    return timestamp_key
Example #34
 def get_common_processes_at_threshold(self,
                                       signif_gw_results: pd.DataFrame,
                                       affiliation_matrix: pd.DataFrame,
                                       threshold: float,
                                       go_type: str = "name"):
     x_common_processes = {
         k: pd.merge(df,
                     GenewalkObj.min_x_shared_processes(
                         affiliation_matrix, threshold),
                     left_on="go_name",
                     right_on=go_type)
         for k, df in signif_gw_results.items()
     }
     return x_common_processes
Example #35
def get_numeric_features(data: pd.DataFrame) -> List[str]:
    """
    A function to get all numeric features in a pandas dataframe
    # Parameters
    data: `pd.DataFrame`:
        A pandas Dataframe

    # Returns
    List[str]:
        All numeric columns in the input dataframe
    """
    return [
        feature for feature, values in data.items() if is_numeric_dtype(values)
    ]
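A quick check with made-up columns, assuming pandas and is_numeric_dtype as the snippet imports them:

import pandas as pd

data = pd.DataFrame({'age': [30, 40], 'name': ['a', 'b'], 'score': [1.5, 2.5]})
print(get_numeric_features(data))  # ['age', 'score']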
Example #36
    def test_replace_input_formats_scalar(self):
        df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
                        'C': ['', 'asdf', 'fd']})

        # dict to scalar
        to_rep = {'A': np.nan, 'B': 0, 'C': ''}
        filled = df.replace(to_rep, 0)
        expected = {k: v.replace(to_rep[k], 0) for k, v in df.items()}
        assert_frame_equal(filled, DataFrame(expected))

        msg = "value argument must be scalar, dict, or Series"
        with pytest.raises(TypeError, match=msg):
            df.replace(to_rep, [np.nan, 0, ''])

        # list to scalar
        to_rep = [np.nan, 0, '']
        result = df.replace(to_rep, -1)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], -1, inplace=True)
        assert_frame_equal(result, expected)