def fit_ls_to_EIregret_sub(df, wid, subseries_fname, savefolder, force_run_all=False):
    """Fit one subject's length-scale and cache the result as a pickled Series.

    The output file is ``savefolder + subseries_fname + wid + '.pkl'`` (plain
    string concatenation, no path joining).  When that file already exists
    and ``force_run_all`` is false, the fit is skipped.

    Args:
        df: data handed unchanged to ``fit_sub``.
        wid: subject/worker id as a string; used in the file name, in the
            progress messages and stored under the ``'workerid'`` label.
        subseries_fname: file-name prefix placed directly before ``wid``.
        savefolder: prefix placed directly before ``subseries_fname``.
        force_run_all: refit even if the pickle file already exists.
    """
    tic = time()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('begin ' + wid)
    sys.stdout.flush()
    subfname = savefolder + subseries_fname + wid + '.pkl'
    if force_run_all or not isfile(subfname):
        print(str(wid) + ' fitting now...')
        # fit subject length-scale
        subfit = fit_sub(df, wid)
        # format into series
        print('converting to series...')
        s_subfit = Series(subfit)
        # make names more sensible
        s_subfit.rename({'fun': 'sse',  # FIXME: no longer sse; should be cumulative regret
                         'x': 'fit_ls'},
                        inplace=True)
        s_subfit['workerid'] = wid
        # pickle
        print('saving...')
        s_subfit.to_pickle(subfname)
        toc = time()
        print(wid + ' saved successfully in ' + str(toc - tic) + ' seconds')
    else:
        # The original message ran the id and the text together.
        print(wid + ' already run (pass force_run_all=True to force rerun)')
    sys.stdout.flush()
Example #2
0
 def test_rename_set_name_inplace(self):
     """In-place rename with a non-mapping sets ``name`` and keeps the index."""
     ser = Series(range(3), index=list('abc'))
     candidates = ['foo', 123, 123., datetime(2001, 11, 11), ('foo',)]
     for new_name in candidates:
         ser.rename(new_name, inplace=True)
         self.assertEqual(ser.name, new_name)
         # a name-only rename must leave the index labels alone
         expected_labels = np.array(['a', 'b', 'c'])
         self.assert_numpy_array_equal(ser.index.values, expected_labels)
Example #3
0
    def test_rename_set_name_inplace(self):
        s = Series(range(3), index=list('abc'))
        for name in ['foo', 123, 123., datetime(2001, 11, 11), ('foo',)]:
            s.rename(name, inplace=True)
            assert s.name == name

            exp = np.array(['a', 'b', 'c'], dtype=np.object_)
            tm.assert_numpy_array_equal(s.index.values, exp)
Example #4
0
def test_rename():
    """Passing ``str`` to ``Series.rename`` acts like any other callable mapper."""
    # GH 17407
    named_index = pd.Index(range(2, 7), name='IntIndex')
    ser = Series(range(1, 6), index=named_index)

    via_builtin = ser.rename(str)
    via_lambda = ser.rename(lambda label: str(label))

    assert_series_equal(via_builtin, via_lambda)
    assert via_builtin.name == via_lambda.name
Example #5
0
    def test_rename_set_name_inplace(self):
        """In-place rename with a non-mapping sets ``name`` and keeps the index."""
        ser = Series(range(3), index=list("abc"))
        candidates = ("foo", 123, 123.0, datetime(2001, 11, 11), ("foo",))
        for new_name in candidates:
            ser.rename(new_name, inplace=True)
            self.assertEqual(ser.name, new_name)

            # a name-only rename must leave the index labels alone
            expected_labels = np.array(["a", "b", "c"], dtype=np.object_)
            self.assert_numpy_array_equal(ser.index.values, expected_labels)
Example #6
0
 def test_rename_set_name(self):
     s = Series(range(4), index=list('abcd'))
     for name in ['foo', 123, 123., datetime(2001, 11, 11), ('foo',)]:
         result = s.rename(name)
         assert result.name == name
         tm.assert_numpy_array_equal(result.index.values, s.index.values)
         assert s.name is None
Example #7
0
 def test_rename_set_name(self):
     """Rename with a non-mapping returns a renamed copy; the source is untouched."""
     original = Series(range(4), index=list("abcd"))
     candidates = ("foo", 123, 123.0, datetime(2001, 11, 11), ("foo",))
     for new_name in candidates:
         renamed = original.rename(new_name)
         self.assertEqual(renamed.name, new_name)
         self.assert_numpy_array_equal(renamed.index.values,
                                       original.index.values)
         # the source Series must still have no name
         self.assertTrue(original.name is None)
Example #8
0
 def test_rename_set_name(self):
     """Rename with a str, list or tuple returns a renamed copy of the Series."""
     original = Series(range(4), index=list('abcd'))
     candidates = ['foo', ['foo'], ('foo',)]
     for new_name in candidates:
         renamed = original.rename(new_name)
         self.assertEqual(renamed.name, new_name)
         self.assert_numpy_array_equal(renamed.index.values,
                                       original.index.values)
         # the source Series must still have no name
         self.assertTrue(original.name is None)
Example #9
0
    def test_rename(self):
        """Index relabelling via callable, full dict, partial dict and empty dict."""
        def as_yyyymmdd(stamp):
            return stamp.strftime("%Y%m%d")

        ts = self.ts
        by_func = ts.rename(as_yyyymmdd)
        self.assertEqual(by_func.index[0], as_yyyymmdd(ts.index[0]))

        # dict: a complete old->new mapping must match the callable result
        mapping = dict(zip(ts.index, by_func.index))
        by_dict = ts.rename(mapping)
        assert_series_equal(by_func, by_dict)

        # partial dict: labels missing from the mapping are kept
        letters = Series(np.arange(4), index=["a", "b", "c", "d"], dtype="int64")
        partly_renamed = letters.rename({"b": "foo", "d": "bar"})
        self.assert_numpy_array_equal(partly_renamed.index, ["a", "foo", "c", "bar"])

        # index with name: an empty mapping must keep the index name
        named_index = Index(["a", "b", "c", "d"], name="name")
        named = Series(np.arange(4), index=named_index, dtype="int64")
        untouched = named.rename({})
        self.assertEqual(untouched.index.name, named.index.name)
Example #10
0
    def test_groupby_multi_categorical_as_index(self):
        """Check ``groupby(..., as_index=False).sum()`` with a categorical key.

        Exercises a categorical column combined with a second grouper that is
        a column name, a function, or an external Series, then checks a
        conflicting-name error and several index names on ``df``.
        """
        # GH13204
        df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]),
                        'A': [10, 11, 11],
                        'B': [101, 102, 103]})
        result = df.groupby(['cat', 'A'], as_index=False).sum()
        # expected frame has one row per (category, A) combination, with NaN
        # in 'B' for combinations that do not occur in df
        expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
                              'A': [10, 11, 10, 11, 10, 11],
                              'B': [101.0, nan, nan, 205.0, nan, nan]},
                             columns=['cat', 'A', 'B'])
        tm.assert_frame_equal(result, expected)

        # function grouper
        f = lambda r: df.loc[r, 'A']
        result = df.groupby(['cat', f], as_index=False).sum()
        # 'A' is no longer a grouping key here, so it is summed like 'B'
        expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
                              'A': [10.0, nan, nan, 22.0, nan, nan],
                              'B': [101.0, nan, nan, 205.0, nan, nan]},
                             columns=['cat', 'A', 'B'])
        tm.assert_frame_equal(result, expected)

        # another not in-axis grouper
        s = Series(['a', 'b', 'b'], name='cat2')
        result = df.groupby(['cat', s], as_index=False).sum()
        expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
                              'A': [10.0, nan, nan, 22.0, nan, nan],
                              'B': [101.0, nan, nan, 205.0, nan, nan]},
                             columns=['cat', 'A', 'B'])
        tm.assert_frame_equal(result, expected)

        # GH18872: conflicting names in desired index
        pytest.raises(ValueError, lambda: df.groupby(['cat',
                                                      s.rename('cat')]).sum())

        # is original index dropped?
        expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
                              'A': [10, 11, 10, 11, 10, 11],
                              'B': [101.0, nan, nan, 205.0, nan, nan]},
                             columns=['cat', 'A', 'B'])

        group_columns = ['cat', 'A']

        # df.index is replaced on every iteration; the same `expected` frame
        # is compared against the result for each index name
        for name in [None, 'X', 'B', 'cat']:
            df.index = Index(list("abc"), name=name)

            # an index name that is also a grouping column is expected to
            # trigger a FutureWarning
            if name in group_columns and name in df.index.names:
                with tm.assert_produces_warning(FutureWarning,
                                                check_stacklevel=False):
                    result = df.groupby(group_columns, as_index=False).sum()

            else:
                result = df.groupby(group_columns, as_index=False).sum()

            tm.assert_frame_equal(result, expected, check_index_type=True)
Example #11
0
    def test_rename(self):
        """Index relabelling via callable, full dict, partial dict and empty dict."""
        def as_yyyymmdd(stamp):
            return stamp.strftime('%Y%m%d')

        ts = self.ts
        by_func = ts.rename(as_yyyymmdd)
        self.assertEqual(by_func.index[0], as_yyyymmdd(ts.index[0]))

        # dict: a complete old->new mapping must match the callable result
        mapping = dict(zip(ts.index, by_func.index))
        by_dict = ts.rename(mapping)
        assert_series_equal(by_func, by_dict)

        # partial dict: labels missing from the mapping are kept
        letters = Series(np.arange(4), index=['a', 'b', 'c', 'd'], dtype='int64')
        partly_renamed = letters.rename({'b': 'foo', 'd': 'bar'})
        expected_index = Index(['a', 'foo', 'c', 'bar'])
        self.assert_index_equal(partly_renamed.index, expected_index)

        # index with name: an empty mapping must keep the index name
        named_index = Index(['a', 'b', 'c', 'd'], name='name')
        named = Series(np.arange(4), index=named_index, dtype='int64')
        untouched = named.rename({})
        self.assertEqual(untouched.index.name, named.index.name)
    def test_rename(self):
        """Index relabelling via callable, full dict, partial dict and empty dict."""
        def as_yyyymmdd(stamp):
            return stamp.strftime('%Y%m%d')

        ts = self.ts
        by_func = ts.rename(as_yyyymmdd)
        assert by_func.index[0] == as_yyyymmdd(ts.index[0])

        # dict: a complete old->new mapping must match the callable result
        mapping = dict(zip(ts.index, by_func.index))
        by_dict = ts.rename(mapping)
        assert_series_equal(by_func, by_dict)

        # partial dict: labels missing from the mapping are kept
        letters = Series(np.arange(4), index=['a', 'b', 'c', 'd'], dtype='int64')
        partly_renamed = letters.rename({'b': 'foo', 'd': 'bar'})
        expected_index = Index(['a', 'foo', 'c', 'bar'])
        tm.assert_index_equal(partly_renamed.index, expected_index)

        # index with name: an empty mapping must keep the index name
        named_index = Index(['a', 'b', 'c', 'd'], name='name')
        named = Series(np.arange(4), index=named_index, dtype='int64')
        untouched = named.rename({})
        assert untouched.index.name == named.index.name
Example #13
0
 def test_rename_axis_supported(self):
     """``Series.rename`` accepts ``axis`` 0/'index' and rejects an unknown axis."""
     # Supporting axis for compatibility, detailed in GH-18589
     ser = Series(range(5))
     for valid_axis in (0, 'index'):
         ser.rename({}, axis=valid_axis)
     with tm.assert_raises_regex(ValueError, 'No axis named 5'):
         ser.rename({}, axis=5)
Example #14
0
 def test_rename_axis_supported(self):
     """``Series.rename`` accepts ``axis`` 0/"index" and rejects an unknown axis."""
     # Supporting axis for compatibility, detailed in GH-18589
     ser = Series(range(5))
     for valid_axis in (0, "index"):
         ser.rename({}, axis=valid_axis)
     with pytest.raises(ValueError, match="No axis named 5"):
         ser.rename({}, axis=5)
Example #15
0
 def test_rename_axis_supported(self):
     """``Series.rename`` accepts ``axis`` 0/'index' and rejects an unknown axis."""
     # Supporting axis for compatibility, detailed in GH-18589
     ser = Series(range(5))
     for valid_axis in (0, 'index'):
         ser.rename({}, axis=valid_axis)
     with pytest.raises(ValueError, match='No axis named 5'):
         ser.rename({}, axis=5)
Example #16
0
def plot_quantile_accuracy_heatmap(actual: pd.Series,
                                   prediction: pd.Series,
                                   bins: int = 10,
                                   normalize: bool = True,
                                   color_map: str = 'Oranges',
                                   verbose: bool = True) -> None:
    """Plot a heatmap of predicted-quantile versus actual-quantile counts.

    Both series are cut into ``bins`` quantile bins (numbered from 1) and
    cross-tabulated: rows are prediction bins, columns are actual bins.  The
    table is a rough visual check of a regression model, to be read alongside
    purely numeric metrics such as RMSE, MAE or MAPE.

    Args:
      actual: Series with the actual values.
      prediction: Series with the predicted values.
      bins: Number of quantile bins to create.
      normalize: If True, each row of the table is normalized so that it sums
        to one; otherwise raw counts are shown.
      color_map: Matplotlib colormap name passed to the heatmap. Full list at
        https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html.
      verbose: If True, print the two values returned by
        ``_compute_quantile_accuracies`` for the table.
    """
    # Prediction bins are computed first, matching the row/column roles below.
    predicted_bins = pd.qcut(prediction.rename('prediction'), bins, labels=False) + 1
    actual_bins = pd.qcut(actual.rename('actual'), bins, labels=False) + 1
    normalization = 'index' if normalize else False

    confusion = pd.crosstab(index=predicted_bins,
                            columns=actual_bins,
                            normalize=normalization)
    confusion = confusion.fillna(0)

    sns.heatmap(confusion.round(2), annot=True, fmt='g', cmap=color_map)

    if not verbose:
        return

    first_accuracy, second_accuracy = _compute_quantile_accuracies(confusion)
    print(f'{first_accuracy*100:.2f}% accuracy within 1st quantile')
    print(f'{second_accuracy*100:.2f}% accuracy within 2nd quantile')
Example #17
0
    def test_rename(self, datetime_series):
        ts = datetime_series
        renamer = lambda x: x.strftime("%Y%m%d")
        renamed = ts.rename(renamer)
        assert renamed.index[0] == renamer(ts.index[0])

        # dict
        rename_dict = dict(zip(ts.index, renamed.index))
        renamed2 = ts.rename(rename_dict)
        tm.assert_series_equal(renamed, renamed2)

        # partial dict
        s = Series(np.arange(4), index=["a", "b", "c", "d"], dtype="int64")
        renamed = s.rename({"b": "foo", "d": "bar"})
        tm.assert_index_equal(renamed.index, Index(["a", "foo", "c", "bar"]))

        # index with name
        renamer = Series(
            np.arange(4), index=Index(["a", "b", "c", "d"], name="name"), dtype="int64"
        )
        renamed = renamer.rename({})
        assert renamed.index.name == renamer.index.name
Example #18
0
 def pipe_metrics(self, ds: pd.Series) -> pd.Series:
     """Relabel the dose counts and derive the vaccination totals.

     The first/second dose labels are renamed, ``total_vaccinations`` is
     added through ``enrich_data`` as first + second + single-dose, the
     single-dose count is then added to both people metrics, and the
     ``"DOSIS UNICA"`` entry is dropped.
     """
     label_map = {
         "1ra. DOSIS": "people_vaccinated",
         "2da DOSIS": "people_fully_vaccinated",
     }
     ds = ds.rename(label_map)

     # The total uses the counts *before* the single dose is folded in.
     total = (ds["people_vaccinated"] + ds["people_fully_vaccinated"]
              + ds["DOSIS UNICA"])
     ds = enrich_data(ds, "total_vaccinations", total)

     for metric in ("people_vaccinated", "people_fully_vaccinated"):
         ds[metric] = ds[metric] + ds["DOSIS UNICA"]
     return ds.drop("DOSIS UNICA")
def plot_segmentation(ser: pd.Series, std_dev: float):
    """Plot a series together with its adaptively sampled subset.

    Args:
        ser: the raw series; its ``name`` is used in the figure title.
        std_dev: forwarded to ``adaptative_sampling`` as ``std_dev``.

    Returns:
        The figure produced by ``plot_multilple_series`` with every trace
        switched to lines and the last trace to lines plus markers.
    """
    # `cond` is used to index `ser` (presumably a boolean mask — confirm).
    cond = adaptative_sampling(ser, std_dev=std_dev)
    # Share of points that are *not* kept by the sampling.
    compression_rate = 1 - (sum(cond) / len(cond))
    ser_segmented = ser[cond]

    fig = plot_multilple_series(
        ser.rename('raw'),
        ser_segmented.rename('segmented'),
        kind='scatter',
        title=f'{ser.name} - compression rate: {compression_rate*100:.1f}%')
    # Plain loop: the updates are side effects, no list needs to be built.
    for trace in fig.data:
        trace.update(mode='lines')
    fig.data[-1].update(mode='lines+markers')
    return fig
Example #20
0
    def _rename_folder(folder_current: pd.Series,
                       beautified_folder: pd.Series):
        """
        Rename an existing folder to its beautified name.

        Nothing happens when any entry of ``beautified_folder`` is None
        (e.g. missing tags or special cases such as mixed artists), and the
        folder is left alone, with a message printed, when the target
        directory already exists.
        """
        # Do not rename anything if there is any None value in the beautified list.
        if any(part is None for part in beautified_folder):
            return

        source_dir = folder_current[0]  # This is a pathlib object.
        # Only the last component of the path changes, so files in
        # subdirectories are not moved around.
        target_dir = source_dir.parent / beautified_folder[0]

        if not pathlib.Path(target_dir).is_dir():
            source_dir.rename(target_dir)
            return

        print(
            f"[Error] {target_dir} already exists. Folder was not renamed."
        )
Example #21
0
 def pipe_metrics(self, ds: pd.Series) -> pd.Series:
     """Clean the dose counts, relabel them and derive the vaccination totals.

     The three dose entries are passed through ``clean_count`` (written back
     into ``ds``), the first/second dose labels are renamed,
     ``total_vaccinations`` is added through ``enrich_data`` as
     first + second + single-dose, the single-dose count is then added to
     both people metrics, and the ``"UNIDOSIS"`` entry is dropped.
     """
     cols = ["PRIMERA DOSIS", "SEGUNDA DOSIS", "UNIDOSIS"]
     ds.loc[cols] = ds.loc[cols].apply(clean_count)

     label_map = {
         "PRIMERA DOSIS": "people_vaccinated",
         "SEGUNDA DOSIS": "people_fully_vaccinated",
     }
     ds = ds.rename(label_map)

     # The total uses the counts *before* the single dose is folded in.
     total = ds["people_vaccinated"] + ds["people_fully_vaccinated"] + ds["UNIDOSIS"]
     ds = enrich_data(ds, "total_vaccinations", total)

     for metric in ("people_vaccinated", "people_fully_vaccinated"):
         ds[metric] = ds[metric] + ds["UNIDOSIS"]
     return ds.drop("UNIDOSIS")
Example #22
0
    def test_rename(self):
        """Index relabelling via callable, full dict and partial dict."""
        def as_yyyymmdd(stamp):
            return stamp.strftime('%Y%m%d')

        ts = self.ts
        by_func = ts.rename(as_yyyymmdd)
        self.assertEqual(by_func.index[0], as_yyyymmdd(ts.index[0]))

        # dict: a complete old->new mapping must match the callable result
        mapping = dict(zip(ts.index, by_func.index))
        by_dict = ts.rename(mapping)
        assert_series_equal(by_func, by_dict)

        # partial dict: labels missing from the mapping are kept
        letters = Series(np.arange(4), index=['a', 'b', 'c', 'd'])
        partly_renamed = letters.rename({'b' : 'foo', 'd' : 'bar'})
        labels_match = np.array_equal(partly_renamed.index,
                                      ['a', 'foo', 'c', 'bar'])
        self.assert_(labels_match)
Example #23
0
def sum_deltas_by_truthy_data(data_frame: pd.DataFrame,
                              truthy_frames: pd.Series) -> float:
    """
    Sum the ``('game', 'delta')`` column over the frames flagged as true.

    :param data_frame: The game data frame; must contain a
        ``('game', 'delta')`` column.
    :param truthy_frames: Per-frame truth values, aligned to ``data_frame``
        by index.
    :return: The sum of ``delta`` over the rows whose flag equals True
        (a scalar; 0 when no row is flagged).
    """
    flags = truthy_frames.rename('truthy')
    deltas = data_frame['game', 'delta'].rename('delta')
    combined_data = pd.concat([flags, deltas], axis=1)
    # Compare with `== True` rather than using the column as a mask: rows
    # present in only one of the inputs get a NaN flag after the concat and
    # must not be selected.
    selected = combined_data.loc[combined_data['truthy'] == True, 'delta']  # noqa: E712
    # The previous trailing `.rename(columns=...)` was applied to a Series,
    # which does not accept `columns`; the value read was always 'delta'.
    return selected.sum()
Example #24
0
def RSI_14(days):
    """Compute a rolling RSI for ``Stock_Choice`` and plot it.

    ``RSI`` is called once per row from index ``days`` onward, walking the
    window from the newest rows back to the oldest.  The first ``days`` rows
    have no full window and are filled with 0.  The resulting table is
    printed and plotted with horizontal guide lines at 30 and 70.

    Args:
        days: window offset passed on to ``RSI`` as
            ``(len(prices), len(prices) - days)``, shifted back one row per
            call.

    Relies on the module-level ``main_df``, ``Stock_Choice``, ``RSI`` and
    ``plt``; ``main_df`` must have an index named ``'Date'``.
    """
    prices = main_df[Stock_Choice]
    newest_end = len(prices)
    newest_start = newest_end - days

    # One RSI value per row that has a full window, newest window first.
    n_windows = len(prices[days:])
    rsi_newest_first = [
        RSI(prices, newest_end - shift, newest_start - shift)
        for shift in range(n_windows)
    ]

    # Pad the rows without enough history, then restore chronological order
    # so position i lines up with row i of the price data.
    rsi_values = [0] * days + rsi_newest_first[::-1]

    # Use the chosen stock's column (previously hard-coded to 'AAPL') only to
    # carry the dates; reset_index() returns a new frame, so no in-place
    # change is made on a slice of main_df.
    dated = main_df[[Stock_Choice]].reset_index()
    rsi_series = Series(rsi_values, name='RSI')

    Final_Result = pd.concat([dated, rsi_series], axis=1)
    Final_Result = Final_Result.set_index('Date')
    Final_Result = Final_Result.drop(Stock_Choice, axis=1)

    print(Final_Result)
    ax3 = Final_Result[days:].plot(kind='line')
    # Positional access to the last row; an integer key on the date index
    # only worked through a deprecated positional fallback.
    latest_rsi = Final_Result['RSI'].iloc[-1]
    ax3.set_title('{} {} day RSI is {}'.format(
        Stock_Choice, days + 1, round(latest_rsi, 2)))
    plt.axhline(y=30, xmin=0, xmax=3, c="red", linewidth=1, zorder=1)
    plt.axhline(y=70, xmin=0, xmax=3, c="red", linewidth=1, zorder=1)
Example #25
0
    def test_to_frame_respects_name_none(self):
        """``to_frame(None)`` warns and, for now, still falls back to the old column label."""
        # GH#44212 if we explicitly pass name=None, then that should be respected,
        #  not changed to 0
        # GH-45448 this is first deprecated to only change in the future
        unnamed = Series(range(3))
        with tm.assert_produces_warning(FutureWarning):
            frame = unnamed.to_frame(None)

        # Future behaviour would be Index([None], dtype=object).
        expected_columns = Index([0])
        tm.assert_index_equal(frame.columns, expected_columns)

        named = unnamed.rename("foo")
        with tm.assert_produces_warning(FutureWarning):
            frame = named.to_frame(None)
        expected_columns = Index(["foo"], dtype=object)
        tm.assert_index_equal(frame.columns, expected_columns)
Example #26
0
    def save(cls, result: pd.Series) -> None:
        """Merge result into the database.

        The result's labels are renamed to the table's column names, the
        series is turned into a one-row frame restricted to the four stored
        columns, and that frame is merged and persisted through
        ``cls.table``.  Any exception is logged, not raised.
        """
        label_map = {"name": "operator", "alias": "operator_alias", "fscore": "confidence"}
        stored_columns = ["operator", "operator_alias", "confidence", "method"]

        # Series -> single-row DataFrame with only the stored columns.
        record = result.rename(label_map).to_frame().T[stored_columns]

        try:
            cls.table.merge_records(record)
            cls.table.persist()
            logger.debug(f"Updated {record.operator_alias} in database")
        except Exception as e:
            logger.error(f"Could not update database -- {e}")
Example #27
0
def QA_fetch_factor_start_date(factor: pd.Series) -> pd.DataFrame:
    """
    Attach the listing date of each stock in a factor series.

    Reads from the local store, so the stock basic information must have
    been saved before this is called.  Returns a frame with a ``factor``
    column and a ``start_date`` column, restricted to codes found locally.
    """
    formatted = QA_fmt_factor(factor.copy())
    merged_data = pd.DataFrame(formatted.rename("factor"))

    # Normalise the stock codes found in the "code" index level.
    unique_codes = formatted.index.get_level_values("code").drop_duplicates()
    stock_list = QA_fmt_code_list(unique_codes)

    # Look up listing dates for the codes that exist locally.
    basics = QA_fetch_stock_basic(status=None).set_index("code")
    known_codes = basics.index.intersection(stock_list)
    listing_dates = basics.loc[known_codes]["list_date"]

    # Keep only rows whose second index level is a known code, then attach
    # the listing date looked up by that second-level value.
    merged_data = merged_data.loc[(slice(None), list(listing_dates.index)), :]

    def listing_date_of(key):
        return listing_dates.loc[key[1]]

    merged_data["start_date"] = merged_data.index.map(listing_date_of).tolist()
    return merged_data
Example #28
0
def get_timeserie_properties(series: pd.Series,
                             submonths: list = None,
                             scale_trend_intercept=False,
                             auto_corr_at: list = None) -> pd.Series:
    """
    Extract summary statistics from a time series and return them as a Series.

    The series needs a datetime index but does not need to be contiguous.

    Args:
        series: the time series to summarise.
        submonths: optional list of month numbers; when given, only
            observations in those months are used.
        scale_trend_intercept: if True, the values are standardised with
            ``scale`` before the linear trend is fitted.
        auto_corr_at: lags in days at which the autocorrelation is computed;
            defaults to ``[1, 5]``.

    Returns:
        A Series with ``std``, ``mean``, ``length``, ``n_nan``, ``trend``
        (regression coefficient per year), ``intercept`` and one
        ``auto<lag>`` entry per requested lag.
    """
    if auto_corr_at is None:
        auto_corr_at = [1, 5]
    if submonths is not None:
        series = series.loc[series.index.month.isin(submonths)]
    n_nan = series.isna().sum()
    series = series.dropna()  # Remove nans

    # NOTE(review): std/mean/length were referenced but never defined in this
    # function; they are computed here on the month-filtered, NaN-free series.
    # Confirm that `length` should count valid observations only.
    std = series.std()
    mean = series.mean()
    length = len(series)

    # Linear trend of the values against the calendar year.
    years = series.index.year.values.reshape(-1, 1)
    target = scale(series) if scale_trend_intercept else series
    lm = LinearRegression()
    lm.fit(X=years, y=target)
    # coef_ holds one coefficient (single feature); index it explicitly
    # instead of converting the whole array to float.
    trend = float(lm.coef_[0])  # (standardized) coefficient / yr
    intercept = lm.intercept_

    results = pd.Series({
        'std': std,
        'mean': mean,
        'length': length,
        'n_nan': n_nan,
        'trend': trend,
        'intercept': intercept
    })
    # Smoothness: correlation of the series with itself shifted by `lag` days.
    for lag in auto_corr_at:
        lagged = pd.Series(series.values,
                           index=series.index - pd.Timedelta(f'{lag}D'),
                           name=f'{lag}D')
        # Both sides must be named: pd.merge treats an unnamed Series as
        # having no column to merge.
        paired = pd.merge(
            left=series.rename('unlagged'),
            right=lagged,
            left_index=True,
            right_index=True,
            how='inner'
        )
        results.loc[f'auto{lag}'] = paired.corr().iloc[0, -1]
    return results
def plot_linear_and_segmentation(ser: pd.Series, std_dev: float):
    """Plot every preprocessing stage of a series plus its sampled subset.

    Args:
        ser: the raw series; its ``name`` is used in the figure title.
        std_dev: forwarded to ``adaptative_sampling`` as ``std_dev``.

    Returns:
        The figure produced by ``plot_multilple_series`` with every trace
        switched to lines and the last trace to lines plus markers.
    """
    (ser_interp, ser_filt,
     ser_smooth) = signal_preprocess_linear(ser, output_type='all')

    # `cond` is used to index the smoothed series (presumably a boolean
    # mask — confirm).
    cond = adaptative_sampling(ser_smooth, std_dev=std_dev)
    # Share of points that are *not* kept by the sampling.
    compression_rate = 1 - (sum(cond) / len(cond))
    ser_segmented = ser_smooth[cond]

    fig = plot_multilple_series(
        ser.rename('raw'),
        signal_resample(ser, resample_method='').rename('resample'),
        ser_interp.rename('interpolate'),
        ser_filt.rename('filter'),
        ser_smooth.rename('smooth'),
        ser_segmented.rename('segmented'),
        kind='scatter',
        title=f'{ser.name} - compression rate: {compression_rate*100:.1f}%')
    # Plain loop: the updates are side effects, no list needs to be built.
    for trace in fig.data:
        trace.update(mode='lines')
    fig.data[-1].update(mode='lines+markers')
    return fig
Example #30
0
    def test_rename_series_with_multiindex(self):
        # issue #43659
        arrays = [
            ["bar", "baz", "baz", "foo", "qux"],
            ["one", "one", "two", "two", "one"],
        ]

        index = MultiIndex.from_arrays(arrays, names=["first", "second"])
        s = Series(np.ones(5), index=index)
        result = s.rename(index={"one": "yes"}, level="second", errors="raise")

        arrays_expected = [
            ["bar", "baz", "baz", "foo", "qux"],
            ["yes", "yes", "two", "two", "yes"],
        ]

        index_expected = MultiIndex.from_arrays(arrays_expected,
                                                names=["first", "second"])
        series_expected = Series(np.ones(5), index=index_expected)

        tm.assert_series_equal(result, series_expected)
Example #31
0
def time_aware_data_split(X: pd.DataFrame,
                          y: pd.Series,
                          start_series: pd.Series,
                          test_size: float = 0.2):
    """Split into train and test sets so the test set holds the newest rows.

    Rows are ordered by their start semester and split without shuffling,
    so the last ``test_size`` share of rows (the latest semesters) becomes
    the test set.  ``X`` is not modified.

    :param X: the values of the dataset
    :param y: the labels of the dataset; ``y.name`` is used as the label
        column name
    :param start_series: the start semester of each row, indexed like ``X``;
        only indices present in both are kept
    :param test_size: the share of rows placed in the test set (Default is
        0.2)
    :return: X_train, X_test, y_train, y_test"""
    label_col = y.name

    # Keep values and labels strictly together, working on a copy so the
    # caller's frame does not gain a label column as a side effect.
    data = X.copy()
    data[label_col] = y

    # `join='inner'` keeps only the indices present in both the data and the
    # semester series.
    data = pd.concat(
        [data, start_series.rename('Semester')], axis=1, join='inner')

    # Ordering by semester is what makes the unshuffled split time aware.
    data = data.sort_values(by=['Semester'])

    # Separate values and labels again, using the actual label name instead
    # of a hard-coded 'Label'.
    features = data.drop(columns=['Semester', label_col])
    labels = data[label_col]

    return train_test_split(features, labels, test_size=test_size,
                            shuffle=False)
Example #32
0
 def default_series_rename(series: pd.Series,
                           name: str = 'tmp') -> pd.Series:
     """Return ``series`` renamed to ``name`` (``'tmp'`` when not given)."""
     renamed = series.rename(name)
     return renamed
def translate_index(input: pd.Series) -> pd.Series:
    """Return a copy of ``input`` with the two dose labels renamed.

    ``first_vaccine_number`` becomes ``people_vaccinated`` and
    ``second_vaccine_number`` becomes ``people_fully_vaccinated``; any other
    index label is kept as is.
    """
    label_map = {
        'first_vaccine_number': 'people_vaccinated',
        'second_vaccine_number': 'people_fully_vaccinated',
    }
    return input.rename(index=label_map)
def make_pie_chart(index: int, values: pd.Series, title: str, axs):
    """
    Draw a pie chart at position [index] of a 2x3 subplot grid.

    Args:
        - index(int): 1-based position of the chart; 1-3 fill the first row
          and 4-6 the second row, left to right
        - values(pd.Series): the clade codes (index) and the value for each
          clade; the caller's Series is not modified
        - title(str): the title of the chart
        - axs: the 2-D array of matplotlib Axes to plot the pie chart on

    Returns:
        dict mapping each display label to a legend ``Patch`` carrying that
        label's colour (and hatch, for the ``+S477X`` labels).
    """

    def get_index(index: int) -> List[int]:
        # Convert the 1-based position into [row, column].
        index -= 1
        if index >= 3:
            index -= 3
            return [1, index]
        return [0, index]

    # Clade code -> label shown in the chart.
    labels = OrderedDict(
        [
            ("O", "O"),
            ("S", "S"),
            ("L", "L"),
            ("V", "V"),
            ("G", "G"),
            ("Gn", "G+S477X"),
            ("GH", "GH"),
            ("GHn", "GH+S477X"),
            ("GR", "GR"),
            ("GRn", "GR+S477X"),
            ("GV", "GV"),
            ("GVn", "GV+S477X"),
        ]
    )
    # Must correspond to order in data
    # NOTE(review): colours are handed to the pie positionally in this dict's
    # order, not looked up by label — confirm `values` arrives in this order.
    colors = {
        "G": "#ffcccc",
        "G+S477X": "#ffcccc",
        "GH": "#f4b183",
        "GH+S477X": "#f4b183",
        "GR": "#ff7c80",
        "GR+S477X": "#ff7c80",
        "GV": "#f08bb5",
        "GV+S477X": "#f08bb5",
        "L": "#d9d9d9",
        "O": "#808080",
        "S": "#70ad47",
        "V": "#ff99ff",
    }
    to_hatch = ["G+S477X", "GH+S477X", "GR+S477X", "GV+S477X"]

    row, col = get_index(index)
    ax = axs[row, col]

    # Relabel a copy; renaming in place would silently change the Series the
    # caller passed in.
    values = values.rename(index=labels)
    # An all-zero pie cannot be normalized, so only normalize a non-zero sum.
    normalize = bool(values.values.sum(axis=0) != 0)

    ax.set_title(label=title, fontdict={"fontsize": 10})
    pie = ax.pie(
        values.values,
        normalize=normalize,
        labeldistance=None,
        colors=list(colors.values()),
    )

    # Pair wedges with clades by rank: the largest wedge with the largest
    # positive value, and so on; zero-sized wedges fall off the end.
    plotted_val = values[values > 0]
    plotted_clades = sorted(
        plotted_val.index, key=lambda clade: plotted_val[clade], reverse=True
    )
    sized_wedges = sorted(
        ((wedge, wedge.theta2 - wedge.theta1) for wedge in pie[0]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for (wedge, _), clade in zip(sized_wedges, plotted_clades):
        if clade in to_hatch:
            wedge.set_hatch("+++")

    # One legend patch per known label, whether or not it was plotted.
    legend_patches = {}
    for key, value in colors.items():
        legend_patches[key] = mpatches.Patch(
            facecolor=value, hatch="+++" if key in to_hatch else "", label=key
        )
    return legend_patches
Example #35
0
 def test_rename_partial_dict(self):
     # partial dict
     ser = Series(np.arange(4), index=["a", "b", "c", "d"], dtype="int64")
     renamed = ser.rename({"b": "foo", "d": "bar"})
     tm.assert_index_equal(renamed.index, Index(["a", "foo", "c", "bar"]))
Example #36
0
def variants_vaccines(rate_age_pattern: pd.Series,
                      denom_age_pattern: pd.Series,
                      age_spec_population: pd.Series,
                      rate: pd.Series,
                      day_shift: int,
                      escape_variant_prevalence: pd.Series,
                      severity_variant_prevalence: pd.Series,
                      vaccine_coverage: pd.DataFrame,
                      population: pd.Series,
                      variant_risk_ratio: float,
                      verbose: bool = True,):
    """Recompute ``rate`` after splitting it by variant type and applying vaccine coverage.

    The two prevalence series and the vaccine coverage frame have their
    ``date`` values offset by ``day_shift`` days, are concatenated with
    ``rate`` to share its axis, and have missing values filled with 0.
    ``rate`` is then decomposed into an ancestral component and two variant
    components (escape and severity). Each component is passed to
    ``adjust_by_variant_classification`` and the returned numerators and
    denominators are summed and turned back into ratios.

    NOTE(review): ``lr`` / ``hr`` presumably stand for low-risk / high-risk
    groups -- confirm against ``adjust_by_variant_classification``.

    Args:
        rate_age_pattern: Passed through unchanged to
            ``adjust_by_variant_classification``.
        denom_age_pattern: Passed through unchanged to
            ``adjust_by_variant_classification``.
        age_spec_population: Passed through unchanged to
            ``adjust_by_variant_classification``.
        rate: Input rate. Its index is the axis every other input is aligned
            to (presumably ``location_id`` / ``date`` -- TODO confirm).
        day_shift: Number of days added to the ``date`` values of both
            prevalence series and of ``vaccine_coverage``.
        escape_variant_prevalence: Series that must yield ``location_id``,
            ``date`` and ``escape_variant_prevalence`` columns after
            ``reset_index()``.
        severity_variant_prevalence: Series that must yield ``location_id``,
            ``date`` and ``severity_variant_prevalence`` columns after
            ``reset_index()``.
        vaccine_coverage: Frame containing the eight
            ``cumulative_{lr,hr}_effective[_protected]_{wildtype,variant}``
            columns and yielding ``location_id`` / ``date`` columns after
            ``reset_index()``.
        population: Divides the constant numerator and is also passed through
            to ``adjust_by_variant_classification``.
        variant_risk_ratio: Multiplier applied to ``rate`` for both variant
            components.
        verbose: When true, log a progress message before each adjustment.

    Returns:
        A 5-tuple ``(rate, rate_lr, rate_hr, pct_inf_lr, pct_inf_hr)``: the
        combined ratio, the ratios built from the ``lr`` and ``hr`` terms
        alone, and the share of the combined denominator contributed by the
        ``lr`` and ``hr`` denominators respectively.
    """
    # Offset the escape-variant prevalence dates, then put it on a shared axis
    # with `rate`; rows without prevalence data become 0.
    escape_variant_prevalence = escape_variant_prevalence.reset_index()
    escape_variant_prevalence['date'] += pd.Timedelta(days=day_shift)
    escape_variant_prevalence = (escape_variant_prevalence
                                 .set_index(['location_id', 'date'])
                                 .loc[:, 'escape_variant_prevalence'])
    escape_variant_prevalence = pd.concat([rate, escape_variant_prevalence], axis=1)  # borrow axis
    escape_variant_prevalence = escape_variant_prevalence['escape_variant_prevalence'].fillna(0)
    
    # Same treatment for the severity-variant prevalence.
    severity_variant_prevalence = severity_variant_prevalence.reset_index()
    severity_variant_prevalence['date'] += pd.Timedelta(days=day_shift)
    severity_variant_prevalence = (severity_variant_prevalence
                                 .set_index(['location_id', 'date'])
                                 .loc[:, 'severity_variant_prevalence'])
    severity_variant_prevalence = pd.concat([rate, severity_variant_prevalence], axis=1)  # borrow axis
    severity_variant_prevalence = severity_variant_prevalence['severity_variant_prevalence'].fillna(0)

    # Names of the coverage columns that are kept: lr/hr x effective /
    # effective_protected x wildtype/variant.
    lr_e = [f'cumulative_lr_effective_{variant_suffix}' for variant_suffix in ['wildtype', 'variant']]
    lr_ep = [f'cumulative_lr_effective_protected_{variant_suffix}' for variant_suffix in ['wildtype', 'variant']]
    hr_e = [f'cumulative_hr_effective_{variant_suffix}' for variant_suffix in ['wildtype', 'variant']]
    hr_ep = [f'cumulative_hr_effective_protected_{variant_suffix}' for variant_suffix in ['wildtype', 'variant']]
    vaccine_coverage = (vaccine_coverage
                        .loc[:, lr_e + lr_ep + hr_e + hr_ep]
                        .reset_index())
    vaccine_coverage['date'] += pd.Timedelta(days=day_shift)
    vaccine_coverage = vaccine_coverage.set_index(['location_id', 'date'])
    # `rate` is concatenated only to share its axis; its column is dropped
    # again right away.
    vaccine_coverage = pd.concat([rate.rename('rate'), vaccine_coverage], axis=1)  # borrow axis
    del vaccine_coverage['rate']
    vaccine_coverage = vaccine_coverage.fillna(0)
    
    # not super necessary...
    # Constant numerator of 100 on rate's index, divided by population.
    numerator = pd.Series(100, index=rate.index)
    numerator /= population
    
    # Denominators implied by the constant numerator: the ancestral one uses
    # `rate` as-is, the variant ones use `rate * variant_risk_ratio`.
    denominator_a = (numerator / rate)
    denominator_ev = (numerator / (rate * variant_risk_ratio))
    denominator_sv = denominator_ev.copy()
    # Weight each denominator by its prevalence share; ancestral gets the
    # remainder after both variant prevalences. Indexing with the
    # denominator's index selects the prevalence rows matching it.
    denominator_a *= (1 - (escape_variant_prevalence + severity_variant_prevalence)[denominator_a.index])
    denominator_ev *= escape_variant_prevalence[denominator_ev.index]
    denominator_sv *= severity_variant_prevalence[denominator_sv.index]

    # Rebuild the per-component numerators from the weighted denominators.
    numerator_a = (rate * denominator_a)
    numerator_ev = (rate * variant_risk_ratio * denominator_ev)
    numerator_sv = (rate * variant_risk_ratio * denominator_sv)
    
    if verbose:
        logger.info('Adjusting ancestral...')
    numerator_lr_a, numerator_hr_a, denominator_lr_a, denominator_hr_a = adjust_by_variant_classification(
        numerator=numerator_a,
        denominator=denominator_a,
        variant_suffixes=['wildtype', 'variant',],
        rate_age_pattern=rate_age_pattern,
        denom_age_pattern=denom_age_pattern,
        age_spec_population=age_spec_population,
        vaccine_coverage=vaccine_coverage,
        population=population,
    )
    # "non-escape" here is the severity-variant component.
    if verbose:
        logger.info('Adjusting non-escape...')
    numerator_lr_sv, numerator_hr_sv, denominator_lr_sv, denominator_hr_sv = adjust_by_variant_classification(
        numerator=numerator_sv,
        denominator=denominator_sv,
        variant_suffixes=['wildtype', 'variant'],
        rate_age_pattern=rate_age_pattern,
        denom_age_pattern=denom_age_pattern,
        age_spec_population=age_spec_population,
        vaccine_coverage=vaccine_coverage,
        population=population,
    )
    # The escape component is the only call restricted to the 'variant'
    # suffix (no 'wildtype').
    if verbose:
        logger.info('Adjusting escape...')
    numerator_lr_ev, numerator_hr_ev, denominator_lr_ev, denominator_hr_ev = adjust_by_variant_classification(
        numerator=numerator_ev,
        denominator=denominator_ev,
        variant_suffixes=['variant',],
        rate_age_pattern=rate_age_pattern,
        denom_age_pattern=denom_age_pattern,
        age_spec_population=age_spec_population,
        vaccine_coverage=vaccine_coverage,
        population=population,
    )
    
    # Sum the three components within each of the lr / hr groups.
    numerator_lr = numerator_lr_a + numerator_lr_ev + numerator_lr_sv
    denominator_lr = denominator_lr_a + denominator_lr_ev + denominator_lr_sv
    numerator_hr = numerator_hr_a + numerator_hr_ev + numerator_hr_sv
    denominator_hr = denominator_hr_a + denominator_hr_ev + denominator_hr_sv
    
    # NOTE: `rate` is rebound here; from this point it is the adjusted ratio,
    # not the input argument.
    rate = (numerator_lr + numerator_hr) / (denominator_lr + denominator_hr)
    rate_lr = numerator_lr / denominator_lr
    rate_hr = numerator_hr / denominator_hr
    
    # Share of the combined denominator contributed by each group.
    pct_inf_lr = denominator_lr / (denominator_lr + denominator_hr)
    pct_inf_hr = denominator_hr / (denominator_lr + denominator_hr)
    
    return rate, rate_lr, rate_hr, pct_inf_lr, pct_inf_hr
plt.show()

# Fit a random forest at the selected depth and record its train/test MSE
name = f"RandomForest (depth={best})"
model = RandomForestRegressor(max_depth=best, random_state=0)
model.fit(X_train, Y_train)
test[name] = mean_squared_error(Y_test, model.predict(X_test))
train[name] = mean_squared_error(Y_train, model.predict(X_train))
final_models[name] = model
rmse_row = {
    'name': name,
    'train': np.sqrt(train[name]),
    'test': np.sqrt(test[name]),
}
DataFrame(rmse_row, index=['RMSE'])

# Table of the most important features, ranked from 1 (most important)
top_n = 20
imp = Series(model.feature_importances_, index=X.columns).sort_values()
top_features = {
    rank: {
        'importance': imp[series_id],
        'series_id': series_id,
        'description': alf.header(series_id),
    }
    for rank, series_id in enumerate(np.flip(imp.index[-top_n:]), start=1)
}
DataFrame.from_dict(top_features, orient='index')

# Horizontal bar chart comparing train and test RMSE of every recorded model
fig, ax = plt.subplots(num=1, clear=True, figsize=(10,6))
rmse = np.sqrt(train.rename('train').to_frame().join(test.rename('test')))
rmse.sort_values('test').plot.barh(ax=ax, width=0.85)
ax.yaxis.set_tick_params(labelsize=10)
ax.set_title('Regression RMSE')
ax.figure.subplots_adjust(left=0.35)
plt.savefig(os.path.join(imgdir, 'rmse.jpg'))
plt.show()
    
Example #38
0
 def test_rename_method_and_index(self):
     """Passing both a positional mapper and ``index=`` must raise TypeError."""
     # GH 40977
     msg = "Cannot specify both 'mapper' and 'index'"
     ser = Series([1, 2])
     with pytest.raises(TypeError, match=msg):
         ser.rename(str, index=str)
Example #39
0
def translate_index(ds: pd.Series) -> pd.Series:
    """Return ``ds`` with its known source-language index labels renamed.

    The two labels listed in the mapping are replaced by their English metric
    names; the result comes from ``Series.rename`` with that mapping.
    """
    translations = {
        "Общ брой лица със завършен ваксинационен цикъл":
        "people_fully_vaccinated",
        "Общо поставени дози": "total_vaccinations",
    }
    return ds.rename(translations)
Example #40
0
 def test_rename_none(self):
     # GH 40977
     ser = Series([1, 2], name="foo")
     result = ser.rename(None)
     expected = Series([1, 2])
     tm.assert_series_equal(result, expected)
Example #41
0
 def test_rename_mi(self):
     s = Series([11,21,31],
                index=MultiIndex.from_tuples([("A",x) for x in ["a","B","c"]]))
     result = s.rename(str.lower)
Example #42
0
 def test_rename_by_series(self):
     ser = Series(range(5), name="foo")
     renamer = Series({1: 10, 2: 20})
     result = ser.rename(renamer)
     expected = Series(range(5), index=[0, 10, 20, 3, 4], name="foo")
     tm.assert_series_equal(result, expected)
Example #43
0
def translate_index(ds: pd.Series) -> pd.Series:
    """Return ``ds`` with its known source-language index labels renamed.

    The two labels listed in the mapping are replaced by their English metric
    names; the result comes from ``Series.rename`` with that mapping.
    """
    label_map = {}
    label_map['Общо ваксинирани лицас втора доза'] = 'people_fully_vaccinated'
    label_map['Общо поставени дози'] = 'total_vaccinations'
    return ds.rename(label_map)
Example #44
0
 def test_rename_mi(self):
     s = Series(
         [11, 21, 31],
         index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]),
     )
     s.rename(str.lower)
Example #45
0
 def test_rename_by_series(self):
     s = Series(range(5), name='foo')
     renamer = Series({1: 10, 2: 20})
     result = s.rename(renamer)
     expected = Series(range(5), index=[0, 10, 20, 3, 4], name='foo')
     tm.assert_series_equal(result, expected)