Example #1
class Player:
    def __init__(self, first_name, last_name, id):
        self.first_name = first_name
        self.last_name = last_name
        self.id = id
        self.hrs = [0, 0, 0, 0, 0, 0]  # one for each month of the game
        self.hr_total = 0
        self.hr_series = Series()
        self.hr_total_series = Series()

    def __str__(self):
        return '{0} : {1}'.format(self.id, self.last_name)

    def __repr__(self):
        return self.__str__()

    def add_hrs(self, count, date):
        self.hr_total += count
        self.hr_total_series[date] = self.hr_series.sum() + count
        if self.hr_series.last_valid_index() == date:
            self.hr_series[date] = count + self.hr_series[date]
        else:
            self.hr_series[date] = count


    def name(self):
        return self.first_name + " " + self.last_name

    def get_player_hr_dataframe(self):
        return self.hr_series.to_frame(self.name())

    def get_player_hr_total_dataframe(self):
        return self.hr_total_series.to_frame(self.name())
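
A minimal usage sketch for the Player class above (assuming the module imports Series from pandas, which the constructor needs; the names and dates are invented for illustration):

# Hypothetical usage; requires `from pandas import Series` where Player is defined.
player = Player("Babe", "Ruth", 3)
player.add_hrs(2, "2021-04-01")
player.add_hrs(1, "2021-04-01")   # same date, so the counts accumulate
player.add_hrs(3, "2021-04-15")

print(player)                                   # 3 : Ruth
print(player.get_player_hr_dataframe())         # per-date home runs, column named "Babe Ruth"
print(player.get_player_hr_total_dataframe())   # running totals, column named "Babe Ruth"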
Example #2
    def test_constructor_from_dense_series(self):
        # GH 19393
        # series with name
        x = Series(np.random.randn(10000), name='a')
        result = SparseDataFrame(x)
        expected = x.to_frame().to_sparse()
        tm.assert_sp_frame_equal(result, expected)

        # series with no name
        x = Series(np.random.randn(10000))
        result = SparseDataFrame(x)
        expected = x.to_frame().to_sparse()
        tm.assert_sp_frame_equal(result, expected)
Example #3
    def test_constructor_from_dense_series(self):
        # GH 19393
        # series with name
        x = Series(np.random.randn(10000), name='a')
        result = SparseDataFrame(x)
        expected = x.to_frame().to_sparse()
        tm.assert_sp_frame_equal(result, expected)

        # series with no name
        x = Series(np.random.randn(10000))
        result = SparseDataFrame(x)
        expected = x.to_frame().to_sparse()
        tm.assert_sp_frame_equal(result, expected)
Example #4
def save_counts(file_read, file_write):
    data = df.from_csv(file_read)
    answer_count = Series.to_frame(
        data.groupby(['language'])['answer_count'].sum())
    question_count = Series.to_frame(
        data.groupby(['language'])['post_type'].count())
    answer_count = answer_count.reset_index()
    question_count = question_count.reset_index()
    question_count.columns = ['language', 'question_count']
    merge = df.merge(question_count, answer_count, on=['language', 'language'])
    merge['total'] = merge.question_count + merge.answer_count
    merge['ratio'] = merge.answer_count / merge.question_count
    merge.to_csv(file_write)
Example #5
def test_equals_None_vs_float():
    # GH#44190
    left = Series([-np.inf, np.nan, -1.0, 0.0, 1.0, 10 / 3, np.inf],
                  dtype=object)
    right = Series([None] * len(left))

    # These series were found to be equal due to a bug; check that they are
    # correctly found to be not equal.
    assert not left.equals(right)
    assert not right.equals(left)
    assert not left.to_frame().equals(right.to_frame())
    assert not right.to_frame().equals(left.to_frame())
    assert not Index(left, dtype="object").equals(Index(right, dtype="object"))
    assert not Index(right, dtype="object").equals(Index(left, dtype="object"))
Example #6
def match_data(bench: pd.Series, *args):
    """
    将不同的pandas.Series按照时间index对齐
    Parameters
    ----------
    bench
        对齐所要参照的基准

    args
        需要对对齐的序列

    Returns
    -------
    pandas.DataFrame
        对齐后的DataFrame

    """

    assert isinstance(bench.index, pd.DatetimeIndex), "Index should be pandas.DatetimeIndex!"

    res = bench.to_frame()
    for i in range(len(args)):
        res[args[i].name] = args[i]

    return res
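
A short hedged illustration of calling match_data (the index, values, and series names below are invented for the example):

import pandas as pd

idx = pd.date_range("2021-01-01", periods=5, freq="D")
bench = pd.Series(range(5), index=idx, name="bench")
other = pd.Series([10, 11, 12], index=idx[:3], name="other")

aligned = match_data(bench, other)   # DataFrame indexed on bench's dates
print(aligned)                       # "other" is NaN on dates it does not cover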
Example #7
def simulate_new_housing(
    indiv_hh: DataFrame,
    archetype_new_build: pd.Series,
    percentage_demand_met: float,
    projected_la_housing_demand: DataFrame,
    year: int,
    random_state: int = 42,
) -> DataFrame:

    if percentage_demand_met == 0:
        new_housing = pd.DataFrame()
    else:
        annual_demand = projected_la_housing_demand.loc[year]
        new_housing_by_la: List[DataFrame] = []
        for la, demand in annual_demand.items():
            la_total_new_buildings = int(demand * percentage_demand_met)
            la_new_housing = (indiv_hh.query("local_authority == @la").sample(
                la_total_new_buildings,
                random_state=random_state).loc[:,
                                               ["local_authority", "EDNAME"]])
            new_housing_by_la.append(la_new_housing)

        new_housing_raw = pd.concat(new_housing_by_la).reset_index()
        archetype_properties = archetype_new_build.to_frame().T
        archetype_broadcast = pd.concat(
            [archetype_properties] * len(new_housing_raw)).reset_index(
            )  # broadcast archetype to the same length as new housing

        new_housing = pd.concat([new_housing_raw, archetype_broadcast],
                                axis=1).set_index("SMALL_AREA")

    return new_housing
Example #8
    def test_shift_dt64values_int_fill_deprecated(self):
        # GH#31971
        ser = Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")])
        df = ser.to_frame()

        with tm.assert_produces_warning(FutureWarning):
            result = df.shift(1, fill_value=0)

        expected = Series([pd.Timestamp(0), ser[0]]).to_frame()
        tm.assert_frame_equal(result, expected)

        # axis = 1
        df2 = DataFrame({"A": ser, "B": ser})
        df2._consolidate_inplace()

        with tm.assert_produces_warning(FutureWarning):
            result = df2.shift(1, axis=1, fill_value=0)

        expected = DataFrame({
            "A": [pd.Timestamp(0), pd.Timestamp(0)],
            "B": df2["A"]
        })
        tm.assert_frame_equal(result, expected)

        # same thing but not consolidated
        # This isn't great that we get different behavior, but
        #  that will go away when the deprecation is enforced
        df3 = DataFrame({"A": ser})
        df3["B"] = ser
        assert len(df3._mgr.arrays) == 2
        result = df3.shift(1, axis=1, fill_value=0)
        expected = DataFrame({"A": [0, 0], "B": df2["A"]})
        tm.assert_frame_equal(result, expected)
Example #9
    def to_csv(self, path: str, file_name: str, data_: pd.Series):
        data_path_ = os.path.join(path, file_name + '.csv')
        data_df = data_.to_frame().T

        header = not os.path.exists(data_path_)

        data_df.to_csv(data_path_, mode='a', header=header)
Example #10
    def test_imethods_with_dups(self):

        # GH6493
        # iat/iloc with dups

        s = Series(range(5), index=[1, 1, 2, 2, 3], dtype="int64")
        result = s.iloc[2]
        assert result == 2
        result = s.iat[2]
        assert result == 2

        msg = "index 10 is out of bounds for axis 0 with size 5"
        with pytest.raises(IndexError, match=msg):
            s.iat[10]
        msg = "index -10 is out of bounds for axis 0 with size 5"
        with pytest.raises(IndexError, match=msg):
            s.iat[-10]

        result = s.iloc[[2, 3]]
        expected = Series([2, 3], [2, 2], dtype="int64")
        tm.assert_series_equal(result, expected)

        df = s.to_frame()
        result = df.iloc[2]
        expected = Series(2, index=[0], name=2)
        tm.assert_series_equal(result, expected)

        result = df.iat[2, 0]
        assert result == 2
Example #11
def get_trend_variables(s: pd.Series,
                        field_name: str,
                        alpha: float = 0.25) \
        -> (pd.DataFrame, list):
    d = s.to_frame(name=field_name)

    ema_df = pd.DataFrame()
    new_name_trend_strength = field_name + '_trend_strength'
    new_name_trend_strength_weighted = field_name + '_trend_strength_weighted'

    range_len = 17
    index_range = range(1, range_len)
    for exp in index_range:
        h1 = alpha * (1 / (exp))
        h2 = alpha * (1 / (exp + 1))
        ema1 = d[field_name].ewm(alpha=h1).mean()
        ema2 = d[field_name].ewm(alpha=h2).mean()
        ema_df[str(exp)] = ema1 / ema2
        # ema_df[col_name] = (ema_df[col_name] / ema_df[col_name].shift(1)).fillna(1)
    # for exp in range(1, 17):
    #     ema_df[str(exp)] = d[field_name].rolling(exp*2).mean()
    #     ema_df[str(exp)] = (ema_df[str(exp)] / ema_df[str(exp)].shift(1)).fillna(1)

    # weights = [2**(v-1) for v in index_range]
    weights = [v for v in index_range]
    weights = pd.Series(list(map(lambda x: x / sum(weights), weights)),
                        index=ema_df.columns.values)
    d[new_name_trend_strength_weighted] = ema_df.subtract(-1).dot(weights)
    neg = np.sum((ema_df.values < 1), axis=1)
    pos = np.sum((ema_df.values > 1), axis=1)
    d[new_name_trend_strength] = ((pos - neg) + range_len) / (
        2 * range_len)  # creates an index, 0-1, of trend strength
    return d, [new_name_trend_strength, new_name_trend_strength_weighted]
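
A hedged usage sketch for get_trend_variables (the synthetic price series and the field name "close" are assumptions made for illustration):

import numpy as np
import pandas as pd

prices = pd.Series(np.cumsum(np.random.randn(250)) + 100,
                   index=pd.date_range("2020-01-01", periods=250))
trend_df, trend_cols = get_trend_variables(prices, field_name="close")
print(trend_df[trend_cols].tail())   # the two trend-strength columns derived from EMA ratios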
Example #12
def return_pv(pv: pd.Series, shr_mem: list, prog_mem: list, coords: tuple,
              i: int):
    """
    Does necessary stuff to pv to convert it back to xarray (adds lat, lon) and saves it to shr_mem
    also updates and draws progress bar

    Parameters
    ----------
    pv : Pandas series
        containing calculated pv values
    shr_mem : List
        shared memory where all the calculated pv time series are stored
    prog_mem : List
        list indicating the overall progress of the computation, first value ([0]) is the total number
    of coordinate tuples to compute.
    coords : Tuple
        coordinates of pv station (lat, lon)
    i : int
        index where in shr_mem to save pv, unique for every coordinate tuple
    """

    pv = pv.to_frame()
    pv.columns = ["pv"]
    pv = pv.reset_index()
    pv["lat"] = coords[0]
    pv["lon"] = coords[1]
    pv = pv.set_index(["lon", "lat", "time"])
    shr_mem[i] = pv.to_xarray()
    prog_mem.append(1)
    len_coord_list = prog_mem[0]
    progress_bar(len(prog_mem), len_coord_list)
Example #13
    def test_to_sql_series(self):
        s = Series(np.arange(5, dtype='int64'), name='series')
        sql.to_sql(s, "test_series", self.conn, flavor='sqlite', index=False)
        s2 = sql.read_sql_query("SELECT * FROM test_series",
                                self.conn,
                                flavor='sqlite')
        tm.assert_frame_equal(s.to_frame(), s2)
Example #14
    def calc_daily_qty(final_qty: float, trades: pd.Series,
                       start_date: datetime, end_date: datetime) -> pd.Series:
        """Calculates the daily position quantities based on the final quantity
        and the trades occurred during the period."""
        df = pd.concat([
            pd.DataFrame(data={'position': [np.nan, final_qty]},
                         index=[start_date, end_date]),
            trades.to_frame('trade_qty')
        ])  # type: pd.DataFrame
        df.sort_index(inplace=True)

        df = df.resample('1D').sum()

        df.index.name = 'dt'
        df.reset_index(inplace=True)

        # Global fillna won't work with pandas 0.18:
        # https://github.com/pandas-dev/pandas/issues/7630
        df['trade_qty'].fillna(0, inplace=True)
        df['position'].fillna(0, inplace=True)

        # FIXME: looping is not nice
        # https://stackoverflow.com/questions/34855859/
        #   is-there-a-way-in-pandas-to-use-previous-row-value-
        #   in-dataframe-apply-when-previ
        for i in reversed(range(len(df) - 1)):
            df.loc[i, 'position'] = \
                df.loc[i + 1, 'position'] - df.loc[i + 1, 'trade_qty']

        df.index = df['dt']
        df.index.name = None

        return df['position']
Example #15
def object2proto(obj: pd.Series) -> PandasSeries_PB:
    """Convert pd.Series to PandasDataFrame_PB with pyarrow.

    Args:
        obj: target Series

    Returns:
        Serialized version of Series, which will be used to reconstruction.

    """
    # https://arrow.apache.org/docs/python/pandas.html
    # series must either be converted to a dataframe or use pa.Array
    # however pa.Array mentions you must account for the null values yourself
    dataframe = obj.to_frame()
    schema = pa.Schema.from_pandas(dataframe)
    table = pa.Table.from_pandas(dataframe)
    sink = pa.BufferOutputStream()

    writer = pa.ipc.new_file(sink, schema)
    writer.write(table)
    writer.close()

    buf = sink.getvalue()

    siz = len(buf)
    df_bytes = pa.compress(buf, asbytes=True)

    return PandasSeries_PB(series=df_bytes, decompressed_size=siz)
Example #16
    def test_imethods_with_dups(self):

        # GH6493
        # iat/iloc with dups

        s = Series(range(5), index=[1, 1, 2, 2, 3], dtype='int64')
        result = s.iloc[2]
        assert result == 2
        result = s.iat[2]
        assert result == 2

        msg = "index 10 is out of bounds for axis 0 with size 5"
        with pytest.raises(IndexError, match=msg):
            s.iat[10]
        msg = "index -10 is out of bounds for axis 0 with size 5"
        with pytest.raises(IndexError, match=msg):
            s.iat[-10]

        result = s.iloc[[2, 3]]
        expected = Series([2, 3], [2, 2], dtype='int64')
        tm.assert_series_equal(result, expected)

        df = s.to_frame()
        result = df.iloc[2]
        expected = Series(2, index=[0], name=2)
        tm.assert_series_equal(result, expected)

        result = df.iat[2, 0]
        assert result == 2
Example #17
    def test_imethods_with_dups(self):

        # GH6493
        # iat/iloc with dups

        s = Series(range(5), index=[1, 1, 2, 2, 3], dtype='int64')
        result = s.iloc[2]
        assert result == 2
        result = s.iat[2]
        assert result == 2

        pytest.raises(IndexError, lambda: s.iat[10])
        pytest.raises(IndexError, lambda: s.iat[-10])

        result = s.iloc[[2, 3]]
        expected = Series([2, 3], [2, 2], dtype='int64')
        tm.assert_series_equal(result, expected)

        df = s.to_frame()
        result = df.iloc[2]
        expected = Series(2, index=[0], name=2)
        tm.assert_series_equal(result, expected)

        result = df.iat[2, 0]
        assert result == 2
Example #18
def s_plot(s: pd.Series) -> NotebookChart:
    "Generate a Chartify plot of this series, auto-detecting the plot type."
    x_axis_type = _detect_axis_type(s.index)
    ch = NotebookChart(x_axis_type=x_axis_type, blank_labels=True)

    df = s.to_frame().reset_index()
    df.columns = ['index', 'value']

    if x_axis_type == 'categorical':
        ch.plot.bar(
            df,
            categorical_columns='index',
            numeric_column='value',
        )
    else:
        ch.plot.line(
            df,
            x_column='index',
            y_column='value',
        )

    if s.name:
        ch.axes.set_yaxis_label(s.name)

    if s.index.name:
        ch.axes.set_xaxis_label(s.index.name)

    return ch
Example #19
    def generate_text_features(self, X: Series, feature: str) -> DataFrame:
        X: DataFrame = X.to_frame(name=feature)
        X[feature +
          '.char_count'] = [self.char_count(value) for value in X[feature]]
        X[feature +
          '.word_count'] = [self.word_count(value) for value in X[feature]]
        X[feature + '.capital_ratio'] = [
            self.capital_ratio(value) for value in X[feature]
        ]
        X[feature +
          '.lower_ratio'] = [self.lower_ratio(value) for value in X[feature]]
        X[feature +
          '.digit_ratio'] = [self.digit_ratio(value) for value in X[feature]]
        X[feature + '.special_ratio'] = [
            self.special_ratio(value) for value in X[feature]
        ]

        symbols = [
            '!', '?', '@', '%', '$', '*', '&', '#', '^', '.', ':', ' ', '/',
            ';', '-', '='
        ]
        for symbol in symbols:
            X[feature + '.symbol_count.' + symbol] = [
                self.symbol_in_string_count(value, symbol)
                for value in X[feature]
            ]
            X[feature + '.symbol_ratio.' +
              symbol] = X[feature + '.symbol_count.' +
                          symbol] / X[feature + '.char_count']
            X[feature + '.symbol_ratio.' + symbol].fillna(0, inplace=True)

        X = X.drop(feature, axis=1)

        return X
Example #20
def statistically_significant_symbols(p:pd.Series, not_significant="ns", return_pvalues=False):
    """
    # Lexicon
    # ns
    # P > 0.05
    # *
    # P ≤ 0.05
    # **
    # P ≤ 0.01
    # ***
    # P ≤ 0.001
    # ****
    #  P ≤ 0.0001 (For the last two choices only)

    Future:  Make this customizable
    """
    if not hasattr(p, "__iter__"):
        if p > 0.05:
            return not_significant
        symbol = ""
        if p <= 0.05:
            symbol += "*"
        if p <= 0.01:
            symbol += "*"
        if p <= 0.001:
            symbol += "*"
        if p <= 0.0001:
            symbol += "*"
        return symbol
    else:
        symbols = pd.Series(p).map(
            lambda x: statistically_significant_symbols(x, not_significant=not_significant))
        if return_pvalues:
            return pd.concat([symbols.to_frame("symbol"), p.to_frame("p_value")], axis=1)
        else:
            return symbols
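
A quick hedged demonstration of the function above (the p-values and index labels are arbitrary):

import pandas as pd

pvals = pd.Series([0.2, 0.04, 0.0005], index=["gene_a", "gene_b", "gene_c"])
print(statistically_significant_symbols(0.03))                        # "*"
print(statistically_significant_symbols(pvals))                       # ns, *, ***
print(statistically_significant_symbols(pvals, return_pvalues=True))  # symbols alongside the p-values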
Example #21
    def test_imethods_with_dups(self):

        # GH6493
        # iat/iloc with dups

        s = Series(range(5), index=[1, 1, 2, 2, 3], dtype='int64')
        result = s.iloc[2]
        assert result == 2
        result = s.iat[2]
        assert result == 2

        pytest.raises(IndexError, lambda: s.iat[10])
        pytest.raises(IndexError, lambda: s.iat[-10])

        result = s.iloc[[2, 3]]
        expected = Series([2, 3], [2, 2], dtype='int64')
        tm.assert_series_equal(result, expected)

        df = s.to_frame()
        result = df.iloc[2]
        expected = Series(2, index=[0], name=2)
        tm.assert_series_equal(result, expected)

        result = df.iat[2, 0]
        assert result == 2
Example #22
    def _is_series_of_strings(series_to_test: pd.Series) -> bool:
        if not isinstance(series_to_test, pd.Series):
            return False
        elif series_to_test.to_frame().applymap(
                lambda x: not isinstance(x, str)).squeeze().any():
            return False
        return True
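
The to_frame()/applymap() round trip above can be avoided; a hedged equivalent sketch (my own simplification, not taken from the original source):

import pandas as pd

def is_series_of_strings(series_to_test) -> bool:
    # Equivalent check: the input must be a Series and every element a str.
    return bool(isinstance(series_to_test, pd.Series)
                and series_to_test.map(lambda x: isinstance(x, str)).all())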
Example #23
    def test_where_datetimelike_noop(self, dtype):
        # GH#45135, analogue to GH#44181 for Period don't raise on no-op
        # For td64/dt64/dt64tz we already don't raise, but also are
        #  checking that we don't unnecessarily upcast to object.
        ser = Series(np.arange(3) * 10**9, dtype=np.int64).view(dtype)
        df = ser.to_frame()
        mask = np.array([False, False, False])

        res = ser.where(~mask, "foo")
        tm.assert_series_equal(res, ser)

        mask2 = mask.reshape(-1, 1)
        res2 = df.where(~mask2, "foo")
        tm.assert_frame_equal(res2, df)

        res3 = ser.mask(mask, "foo")
        tm.assert_series_equal(res3, ser)

        res4 = df.mask(mask2, "foo")
        tm.assert_frame_equal(res4, df)

        # opposite case where we are replacing *all* values -> we downcast
        #  from object dtype # GH#45768
        res5 = df.where(mask2, 4)
        expected = DataFrame(4, index=df.index, columns=df.columns)
        tm.assert_frame_equal(res5, expected)

        # unlike where, Block.putmask does not downcast
        df.mask(~mask2, 4, inplace=True)
        tm.assert_frame_equal(df, expected.astype(object))
Example #24
def mock_stock_data_provider_closing_prices(
        closing_prices: pd.Series) -> MagicMock:
    with patch('tm.StockDataProvider') as mock:
        instance = mock.return_value
        type(instance).history = PropertyMock(
            return_value=closing_prices.to_frame(name='Close'))
    return instance
Example #25
    def _to_label_features_with_min_cut(
            self,
            feature_col: pd.Series,
            feature_name: str,
            category_min_cnt: int = None,
            category_min_rate: float = None,
            aggregation_value='others') -> pd.DataFrame:
        feature_col: pd.Series = feature_col.astype('object')

        if category_min_cnt is not None or category_min_rate is not None:
            category_cnt_series: pd.Series = feature_col.value_counts()
            sum_cnt = len(feature_col.values)

            for category_val, cnt in zip(category_cnt_series.index,
                                         category_cnt_series.values):
                if (category_min_cnt is not None and cnt < category_min_cnt) \
                        or (category_min_rate is not None and float(cnt) / float(sum_cnt) < category_min_rate):
                    feature_col.replace(category_val,
                                        aggregation_value,
                                        inplace=True)

        feature_col: pd.Series = feature_col.astype('category')
        feature_df = feature_col.to_frame(name=feature_name)

        dummy_df: pd.DataFrame = pd.get_dummies(feature_df, drop_first=False)
        self._feature_columns = dummy_df.columns

        return dummy_df
Example #26
def event_plot(series: pd.Series) -> Axes:
    """Return matplotlib event plot"""
    plt.style.use("bmh")

    if series.empty:
        return empty_mpl_figure()

    df = series.to_frame(name="timestamp")
    df.loc[:, "event"] = 1 + (np.random.rand(df.size) - 0.5) / 5  # jitter
    ax = df.plot.scatter(
        x="timestamp",
        y="event",
        c="None",
        edgecolors="blue",
        alpha=1,
        s=80,
        figsize=(15, 5),
        legend=False,
        grid=True,
    )

    # set x axis
    ax.xaxis.set_major_formatter(DateFormatter("%b %d %H:%M"))
    buffer = ((series.max() - series.min()) / 30) + timedelta(seconds=1)
    ax.set_xlim(left=series.min() - buffer,
                right=pd.Timestamp.utcnow() + buffer)
    ax.set_xlabel("Timestamp (UTC)")

    # set y axis
    ax.set_ybound(0, 2)
    ax.axes.get_yaxis().set_visible(False)

    plt.tight_layout()
    return ax
Example #27
def get_predict_increased_data_frame2(sel_frame, target, predict_start_day,
                                      shift_days, params):
    # Build the ARIMA model
    test_frame = sel_frame.set_index('createtime')
    test_frame: DataFrame = test_frame.asfreq('1H', method='bfill')
    print("pre_count:" + str(test_frame.count()))

    print("dup_count:" + str(test_frame.count()))

    model: ARIMA = ARIMA(test_frame, (params[0], params[1], params[2])).fit()
    # Produce a model report
    # print("model summary:\n" + str(model.summary2()))

    # Copy the last observed value and add the ARIMA predictions on top of it
    en_day = datetime.datetime.strptime(
        predict_start_day, "%Y-%m-%d") + datetime.timedelta(days=shift_days)
    predict_dta: DataFrame = Series.to_frame(
        model.predict(start=predict_start_day, end=en_day))
    predict_dta.columns = ['predict_' + target]
    print("predict_dta:" + str(predict_dta.head(10)))
    predict_dta = predict_dta.cumsum()
    print("predict_dta_cumsum:" + str(predict_dta.head(10)))

    last_value = test_frame.tail(1)
    final_predict_dta = predict_dta + last_value.values[0][0]

    print("final_predict_dta:" + str(final_predict_dta.head(10)))

    return final_predict_dta
Example #28
def get_predict_increased_data_frame(sel_frame, target, predict_start_day,
                                     shift_days, params):
    # Build the ARIMA model
    test_frame = sel_frame.set_index('createtime')
    test_frame: DataFrame = test_frame.asfreq('1H', method='bfill')
    print("pre_count:" + str(test_frame.count()))

    # test_frame = duplicate_frame(test_frame, predict_start_day, shift_days)
    # print("dup_count:" + str(test_frame.count()))

    model: ARIMA = ARIMA(test_frame, (params[0], params[1], params[2])).fit()
    # Produce a model report
    # print("model summary:\n" + str(model.summary2()))

    en_day = datetime.datetime.strptime(
        predict_start_day, "%Y-%m-%d") + datetime.timedelta(
            days=shift_days) + datetime.timedelta(seconds=-1)
    times = pd.date_range(predict_start_day, en_day, freq='h')
    series = Series(model.forecast(shift_days * 24)[0], index=times)
    predict_dta: DataFrame = Series.to_frame(series)
    predict_dta.columns = ['predict_' + target]
    print("predict_dta:" + str(
        model.predict(start=predict_start_day,
                      end=en_day + datetime.timedelta(seconds=1)).head(10)))
    print("forecast_dta:" + str(model.forecast(100)))

    max_value = test_frame.max()[0]
    min_value = test_frame.min()[0]
    predict_dta = predict_dta['predict_' + target].map(
        lambda a: a - max_value + min_value if a >= max_value else a)

    return predict_dta
Example #29
    def _get_first_names_probs(self, first_names: pd.Series) -> pd.DataFrame:
        first_names_probs = first_names.to_frame().merge(
            p_first_name_given_race_df,
            left_on='first_name',
            right_index=True,
            how='left',
        )
        return first_names_probs
Example #30
    def test_flags_identity(self, frame_or_series):
        s = Series([1, 2])
        if frame_or_series is DataFrame:
            s = s.to_frame()

        assert s.flags is s.flags
        s2 = s.copy()
        assert s2.flags is not s.flags
Example #31
    def _get_last_name_probs(self, last_names: pd.Series) -> pd.DataFrame:
        last_names_probs = last_names.to_frame().merge(
            self._PROB_RACE_GIVEN_SURNAME,
            left_on='last_name',
            right_index=True,
            how='left',
        )
        return last_names_probs
Example #32
    def log_series(s: pd.Series, message: str = None):

        if not isinstance(s, pd.Series):
            raise TypeError("Argument must be a pandas.Series")

        logger = logging.getLogger()
        logger.debug("{}\n\n{}\n".format(message if message else "",
                                         s.to_frame().to_markdown()))
Example #33
    def test_flags_identity(self, as_frame):
        s = Series([1, 2])
        if as_frame:
            s = s.to_frame()

        assert s.flags is s.flags
        s2 = s.copy()
        assert s2.flags is not s.flags
Example #34
    def test_flags_identity(self, frame_or_series):
        obj = Series([1, 2])
        if frame_or_series is DataFrame:
            obj = obj.to_frame()

        assert obj.flags is obj.flags
        obj2 = obj.copy()
        assert obj2.flags is not obj.flags
Example #35
    def test_ops_datetimelike_align(self):
        # GH 7500
        # datetimelike ops need to align
        dt = Series(date_range('2012-1-1', periods=3, freq='D'))
        dt.iloc[2] = np.nan
        dt2 = dt[::-1]

        expected = Series([timedelta(0), timedelta(0), pd.NaT])
        # name is reset
        result = dt2 - dt
        assert_series_equal(result, expected)

        expected = Series(expected, name=0)
        result = (dt2.to_frame() - dt.to_frame())[0]
        assert_series_equal(result, expected)
Example #36
    def _z_test_word_list(word_count_series_one: pd.Series,
                          word_count_series_two: pd.Series) -> pd.Series:
        """Run z-test on all the words of two input word lists.

        :param word_count_series_one: a pandas series where:
            - the data is the word counts.
            - the index is the corresponding words.
            - the name depends on the what the input is. If a file is given,
              the name will be string "File" add the actual file name, or if a
              class is given, the name will be string "class" add the actual
              class name.
        :param word_count_series_two: a pandas series where:
            - the data is the word counts.
            - the index is the corresponding words.
            - the name depends on the what the input is. If a file is given,
              the name will be string "File" add the actual file name, or if a
              class is given, the name will be string "class" add the actual
              class name.
        :return: a panda series where:
            - the data is the z-scores.
            - the index is the corresponding words.
            - the name is a readable header for analysis result.
        """
        # Find the sample populations of the two input data sets.
        total_word_count_one = word_count_series_one.sum()
        total_word_count_two = word_count_series_two.sum()

        # Join the two input pandas series together to avoid assuming that
        # they are parallel arrays in future analysis.
        joined_data_frame = word_count_series_one.to_frame().join(
            word_count_series_two.to_frame())

        # Perform the z-test to detect word anomalies.
        # We are using dict instead of pandas series here, because this method
        # requires 'full_word_score_dict' to be sorted via the absolute value
        # of the z-scores (the 'value' of the dictionary).
        # For code clarity we use this as a temp solution, but in future we
        # can implement the 'sort_by' function for series in our general
        # functions if we need it for better performance.
        full_word_score_dict = \
            {word: TopwordModel._z_test(p1=count1 / total_word_count_one,
                                        p2=count2 / total_word_count_two,
                                        n1=total_word_count_one,
                                        n2=total_word_count_two)
             for word, [count1, count2] in joined_data_frame.iterrows()}

        # Filter out the insignificant result.
        sig_word_score_dict = \
            {word: z_score for word, z_score in full_word_score_dict.items()
             if abs(z_score) >= 1.96}

        # Sort 'sig_word_score_dict' by absolute value of z-scores in
        # descending order.
        sorted_dict = OrderedDict(sorted(sig_word_score_dict.items(),
                                         key=lambda item: abs(item[1]),
                                         reverse=True))

        # Convert the sorted result to a pandas series.
        result_series = pd.Series(sorted_dict)
        # Set the result series name.
        result_series.name = f"{word_count_series_one.name} compares to " \
                             f"{word_count_series_two.name}"

        return result_series
Example #37
    test_x_t = text.iloc[test_idx]
    train_x_v = df.iloc[train_idx, :]
    test_x_v = df.iloc[test_idx, :]

    tf = tf_vectorizer.fit_transform(train_x_t.tolist())
    train_x = hstack([csr_matrix(train_x_v), tf], format='csr')

    tf = tf_vectorizer.fit_transform(test_x_t.tolist())
    test_x = hstack([csr_matrix(test_x_v), tf], format='csr')

    test_x_list.append(test_x)
    test_y_all = test_y_all.append(test_y, ignore_index=True)

    _ = sgdr.partial_fit(train_x, train_y, classes=[0.0, 1.0])

    print(time() - t1)

test_x = vstack(test_x_list, format='csr')
test_y = test_y_all.to_frame('original')
test_y['predicted'] = sgdr.predict_proba(test_x)[:, 1]



mad = (test_y['original'] - test_y['predicted']).abs().median()
type_s = (test_y['original'].gt(0.0) == test_y['predicted'].gt(0.0)).mean()
fn = (test_y['original'].gt(0.0) & test_y['predicted'].le(0.0)).mean()
fp = (test_y['original'].le(0.0) & test_y['predicted'].gt(0.0)).mean()


import seaborn as sns
sns.distplot(test_y['original'])
Example #38
    def test_to_sql_series(self):
        s = Series(np.arange(5, dtype='int64'), name='series')
        sql.to_sql(s, "test_series", self.conn, flavor='sqlite', index=False)
        s2 = sql.read_sql("SELECT * FROM test_series", self.conn,
                          flavor='sqlite')
        tm.assert_frame_equal(s.to_frame(), s2)
Example #39
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_column.to_series(), alpha_diversity], axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_column.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
Example #40
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(
        drop_all_unique=True, drop_zero_variance=True, drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append(
                        ['%s:%s' % (column, names[i]),
                         '%s:%s' % (column, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
Example #41
def _25(series: pd.Series) -> TSVTaxonomyFormat:
    return _dataframe_to_tsv_taxonomy_format(series.to_frame())