Example no. 1
def process_recarray_pandas(data, endog_idx=0, exog_idx=None, dtype=None,
                            index_idx=None):

    data = DataFrame(data, dtype=dtype)
    names = data.columns

    if isinstance(endog_idx, (int, long)):
        endog_name = names[endog_idx]
        endog = data[endog_name]
        if exog_idx is None:
            exog = data.drop([endog_name], axis=1)
        else:
            exog = data.filter(names[exog_idx])
    else:
        endog = data.loc[:, endog_idx]
        endog_name = list(endog.columns)
        if exog_idx is None:
            exog = data.drop(endog_name, axis=1)
        elif isinstance(exog_idx, (int, long)):
            exog = data.filter([names[exog_idx]])
        else:
            exog = data.filter(names[exog_idx])

    if index_idx is not None:  # NOTE: will have to be improved for dates
        endog.index = Index(data.iloc[:, index_idx])
        exog.index = Index(data.iloc[:, index_idx])
        data = data.set_index(names[index_idx])

    exog_name = list(exog.columns)
    dataset = Dataset(data=data, names=list(names), endog=endog, exog=exog,
                      endog_name=endog_name, exog_name=exog_name)
    return dataset
Example no. 2
def make_portfolio_dataframe(df: pd.DataFrame, melt=False):
    assert df is not None
    # print(df)
    df["date"] = pd.to_datetime(df["date"])
    avg_profit_over_period = (df.filter(
        items=["stock", "stock_profit"]).groupby("stock").mean())
    avg_profit_over_period["contribution"] = [
        "positive" if profit >= 0.0 else "negative"
        for profit in avg_profit_over_period.stock_profit
    ]
    # don't want to override actual profit with average
    avg_profit_over_period = avg_profit_over_period.drop("stock_profit",
                                                         axis="columns")
    df = df.merge(avg_profit_over_period,
                  left_on="stock",
                  right_index=True,
                  how="inner")
    # print(df)
    if melt:
        df = df.filter(items=[
            "stock", "date", "stock_profit", "stock_worth", "contribution"
        ])
        melted_df = df.melt(id_vars=["date", "stock", "contribution"],
                            var_name="field")
        return melted_df
    return df
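For orientation, here is a minimal sketch (made-up data, hypothetical tickers) of what the melt branch above produces: the id_vars columns are repeated while stock_profit and stock_worth are stacked into a single field/value pair per row.

import pandas as pd

# Hypothetical miniature portfolio frame; column names mirror the ones used above.
df = pd.DataFrame({
    "date": pd.to_datetime(["2021-01-04", "2021-01-04"]),
    "stock": ["ABC", "XYZ"],
    "stock_profit": [12.5, -3.0],
    "stock_worth": [1012.5, 997.0],
    "contribution": ["positive", "negative"],
})

# melt() keeps the id_vars as-is and stacks the remaining columns into
# (variable, value) pairs -- one row per stock per field.
melted = df.melt(id_vars=["date", "stock", "contribution"], var_name="field")
print(melted)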
Example no. 3
 def pipe_age_create_groups(self, df: pd.DataFrame) -> pd.DataFrame:
     # Split data into dataframes with first, second and third doses
     df1 = df.filter(regex=r"Date|VaccinationsPercentagePopulationVaccinatedFirstDose.*")
     df2 = df.filter(regex=r"Date|VaccinationsPercentagePopulationVaccinatedSecondDose.*")
     df3 = df.filter(regex=r"Date|VaccinationsPercentagePopulationVaccinatedThirdDose.*")
     # Melt dataframes
     df1 = df1.melt(
         id_vars="Date",
         var_name="age_group",
         value_name="people_vaccinated_per_hundred",
     )
     df2 = df2.melt(
         id_vars="Date",
         var_name="age_group",
         value_name="people_fully_vaccinated_per_hundred",
     )
     df3 = df3.melt(
         id_vars="Date",
         var_name="age_group",
         value_name="people_with_booster_per_hundred",
     )
     # Process and merge dataframes
     df1 = df1.assign(age_group=df1.age_group.apply(self._extract_age_group))
     df2 = df2.assign(age_group=df2.age_group.apply(self._extract_age_group))
     df3 = df3.assign(age_group=df3.age_group.apply(self._extract_age_group))
     df = df1.merge(df2, on=["Date", "age_group"]).dropna(subset=["Date"])
     df = df.merge(df3, on=["Date", "age_group"]).dropna(subset=["Date"])
     return df
Example no. 4
def process_recarray_pandas(data, endog_idx=0, exog_idx=None, dtype=None):
    from pandas import DataFrame

    data = DataFrame(data, dtype=dtype)
    names = data.columns

    if isinstance(endog_idx, int):
        endog_name = names[endog_idx]
        endog = data[endog_name]
        if exog_idx is None:
            exog = data.drop([endog_name], axis=1)
        else:
            exog = data.filter(names[exog_idx])
    else:
        endog = data.ix[:, endog_idx]
        endog_name = list(endog.columns)
        if exog_idx is None:
            exog = data.drop(endog_name, axis=1)
        elif isinstance(exog_idx, int):
            exog = data.filter([names[exog_idx]])
        else:
            exog = data.filter(names[exog_idx])

    exog_name = list(exog.columns)
    dataset = Dataset(data=data, names=list(names), endog=endog, exog=exog,
                      endog_name=endog_name, exog_name=exog_name)
    return dataset
Example no. 5
def merge_columns(df: pd.DataFrame, main: str, auxiliary: str):
    """
        Merge two columns with the same prefix into one column without a suffix.
        For example: merge name_x and name_y into name.
    :param df:
    :param main: suffix of the main columns to be kept
    :param auxiliary: suffix of the auxiliary columns used to fill NA values of the corresponding main columns
    :return:
    """
    mains = set([name.split(main)[0] for name in list(df.filter(regex=main))])
    auxiliaries = set([
        name.split(auxiliary)[0] for name in list(df.filter(regex=auxiliary))
    ])
    shared = list(mains.intersection(
        auxiliaries))  # columns shared in main and auxiliary
    only_aux = list(auxiliaries.difference(mains))

    # Fill nan values of main columns with auxiliary columns
    main_columns = [name + main for name in shared]
    aux_columns = [name + auxiliary for name in shared]
    df = fillna(df, target=main_columns, source=aux_columns)

    # Re-suffix auxiliary columns having no duplicate in main columns
    # to keep exclusively auxiliary ones in final results
    df = df.rename(
        columns={name + auxiliary: name + main
                 for name in only_aux})

    # Drop auxiliary columns
    df = drop_columns(df=df, end_with=auxiliary)

    # Remove suffix from main columns
    df = df.rename(columns={col: col.split(main)[0] for col in df.columns})

    return df
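Note that fillna and drop_columns above are project helpers that are not shown here. As a rough, self-contained sketch of the same idea using only pandas, a main/auxiliary suffix pair can be merged with combine_first:

import pandas as pd

df = pd.DataFrame({
    "name_x": ["a", None, "c"],   # main column (suffix "_x")
    "name_y": ["A", "B", None],   # auxiliary column (suffix "_y")
})

# Fill gaps in the main column from the auxiliary one, then drop the suffixed pair.
df["name"] = df["name_x"].combine_first(df["name_y"])
df = df.drop(columns=["name_x", "name_y"])
print(df)  # name -> ['a', 'B', 'c']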
Example no. 6
def preprocess_pipeline(df: pd.DataFrame, config: Config):

    drop_columns(df, config)

    date_cols = list(df.filter(like='datetime_'))
    str_cols = list(df.filter(like='string_'))
    num_cols = list(df.filter(like='number_'))
    id_cols = list(df.filter(like='id_'))

    for c in id_cols + num_cols:
        if str(df[c].dtype) == 'object':
            log(f'column {c} is object (expected numerical type), casted as category'
                )
            df[c] = df[c].astype('category').cat.as_ordered().cat.codes

    df = add_is_na_cols(df, config)
    df = fillna(df, config)
    df = downcast(df, config)

    non_negative_target_detect(df, config)

    if len(date_cols) != 0:
        df = process_datetime(df, date_cols, config)

    if len(str_cols) != 0:
        df = process_strings(df, str_cols, config)
        df = mean_encode_kf(df, str_cols, 5, config)

    return df
Example no. 7
def _calculate_performance(weight_price_df: pd.DataFrame) -> pd.DataFrame:
    """(Used by 'index_calculation')
    Assumes weight_price_df is a DataFrame. Returns a new DataFrame with columns containing the performance."""
    tickers = [
        col_name.replace('_price', '') for col_name in list(weight_price_df)
        if '_price' in col_name
    ]
    # Get the price at each rebalance date and then roll the value
    for ticker in tickers:
        weight_price_df[ticker + '_price_last_rbd'] = weight_price_df[ticker + '_price'] * \
                                                      (weight_price_df['rebalance'] == 1)
        weight_price_df[ticker + '_price_last_rbd'].replace(0,
                                                            np.nan,
                                                            inplace=True)
        weight_price_df.fillna(method='ffill', inplace=True)  # forward fill

    # Calculate the performance
    performance_col_name = [ticker + '_performance' for ticker in tickers]
    weight_price_df[performance_col_name] = pd.DataFrame(
        data=weight_price_df.filter(regex='_price$').values /
        weight_price_df.filter(regex='_price_last_rbd$').shift(1).values,
        index=weight_price_df.index)

    # Calculate the weighted performance
    weighted_performance_col_names = [
        ticker + '_weighted_return' for ticker in tickers
    ]
    weight_price_df[weighted_performance_col_names] = pd.DataFrame(data=weight_price_df.filter(regex='_weight$').shift(1).values * \
                                                                        (weight_price_df.filter(regex='_performance$').values - 1.0),
                                                                   index=weight_price_df.index)
    return weight_price_df
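The multiply-by-the-rebalance-flag trick above pins the price observed at the last rebalance date and forward-fills it. A rough equivalent on made-up data, using where plus ffill:

import pandas as pd

prices = pd.DataFrame({
    "abc_price": [100.0, 102.0, 101.0, 105.0],
    "rebalance": [1, 0, 1, 0],
})

# Keep the price only on rebalance dates, forward-fill it in between,
# then compare each day's price with the previous period's rebalance price.
last_rbd = prices["abc_price"].where(prices["rebalance"] == 1).ffill()
performance = prices["abc_price"] / last_rbd.shift(1)
print(performance)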
Example no. 8
    def _exibe_iteracao(self, iteracao=None, candidato_selecionado=None):
        if EXIBE_ITERACAO:

            if iteracao != None:
                print(f'\nIteração {iteracao + 1}:')

            df_anuncio = DataFrame(self.matriz_anuncio,
                                   columns=['Tamanho', 'Frequencia', 'Ganho'])

            print('\nAnúncios disponíveis C:')
            print(df_anuncio.filter(self._lista_anuncio_disponivel, axis=0))

            if self._limite_inferior_atual != None:
                print(f'\nLimite inferior: {self._limite_inferior_atual}')

            if self._lista_indice_anuncio_candidato_atual != None:
                print(f'\nCandidatos RC:')
                print(
                    df_anuncio.filter(
                        self._lista_indice_anuncio_candidato_atual, axis=0))

            if candidato_selecionado != None:
                print('\nCandidato selecionado A_j:')
                print(df_anuncio.filter([candidato_selecionado], axis=0))

            print('\nSolução parcial S:')
            print(
                DataFrame(self.solucao.matriz_solucao,
                          columns=['Espaço ocupado', 'Anúncios inseridos']))

            print('\n==================================\n')
Example no. 9
def fe_likely_word(df: pd.DataFrame, target_avg_word: pd.DataFrame):
    for word in tqdm(
        target_avg_word.query("target_avg > 0.7 and target_count > 3")["word"]
    ):
        df["fe_likely_word_{}".format(word)] = (
            df["text_preprocessed"]
            .str.lower()
            .str.contains(word, regex=False)
            .astype(int)
        )

    for word in tqdm(
        target_avg_word.query("target_avg < 0.3 and target_count > 3")["word"]
    ):
        df["fe_unlikely_word_{}".format(word)] = (
            df["text_preprocessed"]
            .str.lower()
            .str.contains(word, regex=False)
            .astype(int)
        )
    df = pd.concat(
        [df.filter(like="fe_likely_word_"), df.filter(like="fe_unlikely_word_")],
        axis=1,
    )
    return df
Example no. 10
def subset_df_for_each_comparison(df: pd.DataFrame, base_df: pd.DataFrame,
                                  comparisons_dict: dict):

    values_prefix = rule_params["all"]["values_cols_prefix"]
    abundance_df = df.filter(regex=values_prefix)

    for comparison in comparisons_dict.keys():

        # get abundances values for reference
        reference_abundances_col = abundance_df.filter(
            regex=comparisons_dict[comparison]["reference"])

        # Add a descriptor to the column names; useful if several different controls are used,
        # more flexible than doing it in the config file
        reference_abundances_col = reference_abundances_col.add_suffix(
            '_reference')

        # get abundances values for the condition to compare with reference
        condition_abundances_col = df.filter(
            regex=comparisons_dict[comparison]["condition"])

        # create complete dataframe
        result_df = pd.concat(
            [base_df, reference_abundances_col, condition_abundances_col],
            axis=1)

        # export result
        output_result = os.path.join(os.path.dirname(args.output_file),
                                     '{}'.format(comparison))

        logging.debug('Path to file: {}'.format(output_result))
        h.export_result_to_csv(result_df, output_result)
        logger.info('Data for {} exported to csv'.format(comparison))

    return
Example no. 11
    def fit(self, df: pd.DataFrame):
        distances = df["distance"]
        voltages = df.filter(regex="^voltage_\\d+$")
        weights = df.filter(regex="^weight_\\d+$")
        forces = self.get_forces(weights)

        off_ground = distances > self.dist_cutoff

        self.f_0.fit(voltages[off_ground].values.reshape((-1, )),
                     forces[off_ground].values.reshape((-1, )))
        base_pred = self.f_0.predict(voltages.values.reshape(-1, )).reshape(
            (-1, self.n))
        ground_factor = (forces.values / base_pred).mean(
            axis=1)  # F = F0 * Fge -> Fge = F / F0
        valid = np.isfinite(
            ground_factor)  # Filter out invalid values (nan, inf, -inf)
        ground_factor = ground_factor[valid]
        dists = distances.values[valid]

        self.f_ge.fit(dists, ground_factor)
        ge = self.f_ge.predict(distances.values).reshape((-1, 1))

        unknown = forces.values - base_pred * ge  # F = F0 * Fge + Fu -> Fu = F - F0 * Fge

        self.f_u.fit(pd.concat((voltages, distances), axis=1).values, unknown)
Example no. 12
    def test_filter_bytestring(self, name):
        # GH13101
        df = DataFrame({b'a': [1, 2], b'b': [3, 4]})
        expected = DataFrame({b'a': [1, 2]})

        assert_frame_equal(df.filter(like=name), expected)
        assert_frame_equal(df.filter(regex=name), expected)
Example no. 13
def prepare_targets(
    df: pd.DataFrame,
    top_variance_filter: Optional[int],
    gene_filter: Optional[List[str]],
) -> pd.DataFrame:
    if gene_filter is not None:
        gene_symbol, _ = split_gene_label_series(df.columns)
        columns = pd.DataFrame({
            "column_name": df.columns,
            "gene_symbol": gene_symbol
        })
        filtered_columns = columns[columns["gene_symbol"].isin(
            gene_filter)]["column_name"]
        if len(filtered_columns) == 0:
            raise ValueError("No matching genes found")
        df = df.filter(items=filtered_columns, axis="columns")

    if top_variance_filter is not None:
        df = df.filter(
            items=df.var(axis=0).sort_values(
                ascending=False)[:top_variance_filter].index,
            axis="columns",
        )

    df.index.name = "Row.name"

    return df
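A minimal, self-contained sketch of the top-variance filter used above, on dummy data:

import pandas as pd

df = pd.DataFrame({
    "g1": [1.0, 2.0, 3.0],
    "g2": [1.0, 1.0, 1.1],
    "g3": [0.0, 5.0, 10.0],
})

top_k = 2
# Rank columns by variance and keep only the most variable ones.
keep = df.var(axis=0).sort_values(ascending=False)[:top_k].index
df_top = df.filter(items=keep, axis="columns")
print(df_top.columns.tolist())  # ['g3', 'g1']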
Example no. 14
    def test_filter_bytestring(self, name):
        # GH13101
        df = DataFrame({b'a': [1, 2], b'b': [3, 4]})
        expected = DataFrame({b'a': [1, 2]})

        assert_frame_equal(df.filter(like=name), expected)
        assert_frame_equal(df.filter(regex=name), expected)
Example no. 15
def generate_simple_costs(data: pandas.DataFrame, date_format: str,
                          base_file_name: str) -> None:

    data.reset_index(inplace=True)

    data['Date'] = data['Date'].dt.strftime(date_format)
    data.filter(items=['Date', 'Cost'])
    generate_data_files(data[['Date', 'Cost']], base_file_name)
Example no. 16
    def test_filter_corner(self):
        empty = DataFrame()

        result = empty.filter([])
        assert_frame_equal(result, empty)

        result = empty.filter(like='foo')
        assert_frame_equal(result, empty)
Example no. 17
    def test_filter_corner(self):
        empty = DataFrame()

        result = empty.filter([])
        assert_frame_equal(result, empty)

        result = empty.filter(like='foo')
        assert_frame_equal(result, empty)
Example no. 18
    def reverse_one_hot_vector_encoding(self,
                                        df_one_hot_vector: pd.DataFrame = None
                                        ):
        """reverses the application of the one hot vector operation on the dataframe df_one_hot_vector

        :param df_one_hot_vector: the dataframe on which to perform the reversal
        :type df_one_hot_vector: pd.DataFrame

        :return: a dataframe where the one_hot_vector columns are transformed into one categorical column
        :rtype: pd.DataFrame
        """

        assert (isinstance(df_one_hot_vector, pd.DataFrame))

        colnames = self.one_hot_vector_category_column_names

        df_categorical = None
        for colname in colnames:
            # create a dataframe that only contains the columns that were created by
            # the one_hot vector application
            df_only_one_hot_vector_columns = df_one_hot_vector.filter(
                regex=(f'^{colname}_')).copy()
            if df_only_one_hot_vector_columns.shape[1] == 0:
                # no column has been found
                break
            # rename the one hot vector columns such that they only contain the values of the original categorical
            # variable
            df_only_one_hot_vector_columns.rename(
                columns=lambda x: re.sub(f'^{colname}_', '', x), inplace=True)

            # create a series that contains the categorical variables
            series_with_categorical_variable = df_only_one_hot_vector_columns.idxmax(
                axis=1)
            # convert the series to a dataframe
            df_with_categorical_variable = series_with_categorical_variable.to_frame(
            )
            # rename the column name to the original column name
            df_with_categorical_variable.columns = [colname]

            # drop the one hot vector columns from the dataframe
            df_everything_except_one_hot_vector_columns = df_one_hot_vector.filter(
                regex=(f'^(?!^{colname}_)')).copy()

            # join the two to yield the subpart of the dataframe that is categorical for the colname
            df_categorical_subpart = pd.concat([
                df_with_categorical_variable,
                df_everything_except_one_hot_vector_columns
            ],
                                               axis=1)

            if df_categorical is None:
                df_categorical = df_categorical_subpart
            else:
                df_categorical = pd.concat(
                    [df_categorical, df_categorical_subpart], axis=1)

        return df_categorical
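The core of the reversal above is idxmax(axis=1) over the one-hot columns; a minimal sketch on dummy data:

import pandas as pd

one_hot = pd.DataFrame({
    "color_red":   [1, 0, 0],
    "color_blue":  [0, 1, 0],
    "color_green": [0, 0, 1],
})

# Strip the common prefix, then pick the column with the maximum value per row.
categories = (
    one_hot.rename(columns=lambda c: c.replace("color_", ""))
    .idxmax(axis=1)
    .rename("color")
)
print(categories.tolist())  # ['red', 'blue', 'green']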
Example no. 19
    def test_filter(self):
        # items
        filtered = self.frame.filter(['A', 'B', 'E'])
        self.assertEqual(len(filtered.columns), 2)
        self.assertNotIn('E', filtered)

        filtered = self.frame.filter(['A', 'B', 'E'], axis='columns')
        self.assertEqual(len(filtered.columns), 2)
        self.assertNotIn('E', filtered)

        # other axis
        idx = self.frame.index[0:4]
        filtered = self.frame.filter(idx, axis='index')
        expected = self.frame.reindex(index=idx)
        assert_frame_equal(filtered, expected)

        # like
        fcopy = self.frame.copy()
        fcopy['AA'] = 1

        filtered = fcopy.filter(like='A')
        self.assertEqual(len(filtered.columns), 2)
        self.assertIn('AA', filtered)

        # like with ints in column names
        df = DataFrame(0., index=[0, 1, 2], columns=[0, 1, '_A', '_B'])
        filtered = df.filter(like='_')
        self.assertEqual(len(filtered.columns), 2)

        # regex with ints in column names
        # from PR #10384
        df = DataFrame(0., index=[0, 1, 2], columns=['A1', 1, 'B', 2, 'C'])
        expected = DataFrame(0.,
                             index=[0, 1, 2],
                             columns=pd.Index([1, 2], dtype=object))
        filtered = df.filter(regex='^[0-9]+$')
        assert_frame_equal(filtered, expected)

        expected = DataFrame(0., index=[0, 1, 2], columns=[0, '0', 1, '1'])
        # shouldn't remove anything
        filtered = expected.filter(regex='^[0-9]+$')
        assert_frame_equal(filtered, expected)

        # pass in None
        with assertRaisesRegexp(TypeError, 'Must pass'):
            self.frame.filter(items=None)

        # objects
        filtered = self.mixed_frame.filter(like='foo')
        self.assertIn('foo', filtered)

        # unicode columns, won't ascii-encode
        df = self.frame.rename(columns={'B': u('\u2202')})
        filtered = df.filter(like='C')
        self.assertTrue('C' in filtered)
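The test above exercises the three mutually exclusive selectors of DataFrame.filter (items, like, regex) as well as the axis argument; a compact sketch of the same behaviour:

import pandas as pd

df = pd.DataFrame({"one": [1], "two": [2], "three": [3]}, index=["row_a"])

print(df.filter(items=["one", "three"]))         # exact column names
print(df.filter(like="t"))                       # substring match -> two, three
print(df.filter(regex="e$"))                     # regex match -> one, three
print(df.filter(items=["row_a"], axis="index"))  # select rows instead of columns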
Example no. 20
def plot_inventory(
    planning: pd.DataFrame,
    timeline: List[str],
    cust_orders,
) -> None:
    # Plot inventory
    df = (planning.filter(like="early prod",
                          axis=0).copy().rename(columns={
                              "Solution": "Qty"
                          }).reset_index())

    df[["Date", "Order"]] = df["index"].str.split(",", expand=True)
    df["Date"] = df["Date"].str.split("[").str[1]
    df["Order"] = df["Order"].str.split("]").str[0]
    df = df[["Date", "Qty", "Order"]]

    models_list = cust_orders[['Order', 'Product_Family']]
    df = pd.merge(df, models_list, on='Order', how='inner')
    df = df[["Date", "Qty", "Product_Family"]]

    bars = (alt.Chart(df).mark_bar().encode(
        y="Qty:Q",
        color="Product_Family:N",
        tooltip=["Product_Family", "Qty"],
    ).interactive().properties(width=550 / len(timeline) - 22, height=60))

    chart_inventory = (alt.layer(
        bars, data=df).facet(column="Date:N").properties(title="Inventory"))

    # Plot shortage
    df = (planning.filter(like="late prod",
                          axis=0).copy().rename(columns={
                              "Solution": "Qty"
                          }).reset_index())

    df[["Date", "Order"]] = df["index"].str.split(",", expand=True)
    df["Date"] = df["Date"].str.split("[").str[1]
    df["Order"] = df["Order"].str.split("]").str[0]
    df = df[["Date", "Qty", "Order"]]

    models_list = cust_orders[['Order', 'Product_Family']]
    df = pd.merge(df, models_list, on='Order', how='inner')
    df = df[["Date", "Qty", "Product_Family"]]

    bars = (alt.Chart(df).mark_bar().encode(
        y="Qty:Q",
        color="Product_Family:N",
        tooltip=["Product_Family", "Qty"],
    ).interactive().properties(width=550 / len(timeline) - 22, height=60))

    chart_shortage = (alt.layer(
        bars, data=df).facet(column="Date:N").properties(title="Shortage"))

    chart = alt.vconcat(chart_inventory, chart_shortage)
    chart.save("Inventory_Shortage.html")
Example no. 21
    def test_filter(self):
        # items
        filtered = self.frame.filter(['A', 'B', 'E'])
        self.assertEqual(len(filtered.columns), 2)
        self.assertNotIn('E', filtered)

        filtered = self.frame.filter(['A', 'B', 'E'], axis='columns')
        self.assertEqual(len(filtered.columns), 2)
        self.assertNotIn('E', filtered)

        # other axis
        idx = self.frame.index[0:4]
        filtered = self.frame.filter(idx, axis='index')
        expected = self.frame.reindex(index=idx)
        assert_frame_equal(filtered, expected)

        # like
        fcopy = self.frame.copy()
        fcopy['AA'] = 1

        filtered = fcopy.filter(like='A')
        self.assertEqual(len(filtered.columns), 2)
        self.assertIn('AA', filtered)

        # like with ints in column names
        df = DataFrame(0., index=[0, 1, 2], columns=[0, 1, '_A', '_B'])
        filtered = df.filter(like='_')
        self.assertEqual(len(filtered.columns), 2)

        # regex with ints in column names
        # from PR #10384
        df = DataFrame(0., index=[0, 1, 2], columns=['A1', 1, 'B', 2, 'C'])
        expected = DataFrame(
            0., index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object))
        filtered = df.filter(regex='^[0-9]+$')
        assert_frame_equal(filtered, expected)

        expected = DataFrame(0., index=[0, 1, 2], columns=[0, '0', 1, '1'])
        # shouldn't remove anything
        filtered = expected.filter(regex='^[0-9]+$')
        assert_frame_equal(filtered, expected)

        # pass in None
        with assertRaisesRegexp(TypeError, 'Must pass'):
            self.frame.filter(items=None)

        # objects
        filtered = self.mixed_frame.filter(like='foo')
        self.assertIn('foo', filtered)

        # unicode columns, won't ascii-encode
        df = self.frame.rename(columns={'B': u('\u2202')})
        filtered = df.filter(like='C')
        self.assertTrue('C' in filtered)
Example no. 22
    def run(self, annotations: pd.DataFrame):
        x = annotations.filter(items=Keypoints.get_point_attribute_name(
            coordinate='x'))
        y = annotations.filter(items=Keypoints.get_point_attribute_name(
            coordinate='y'))

        x = x.subtract(self._starting_point[0])
        y = y.subtract(self._starting_point[1])

        return annotations.combine(pd.concat([x, y]), lambda x, y: x
                                   if math.isnan(y) else y)
Example no. 23
    def test_filter(self):
        # items
        filtered = self.frame.filter(["A", "B", "E"])
        self.assertEqual(len(filtered.columns), 2)
        self.assertNotIn("E", filtered)

        filtered = self.frame.filter(["A", "B", "E"], axis="columns")
        self.assertEqual(len(filtered.columns), 2)
        self.assertNotIn("E", filtered)

        # other axis
        idx = self.frame.index[0:4]
        filtered = self.frame.filter(idx, axis="index")
        expected = self.frame.reindex(index=idx)
        assert_frame_equal(filtered, expected)

        # like
        fcopy = self.frame.copy()
        fcopy["AA"] = 1

        filtered = fcopy.filter(like="A")
        self.assertEqual(len(filtered.columns), 2)
        self.assertIn("AA", filtered)

        # like with ints in column names
        df = DataFrame(0.0, index=[0, 1, 2], columns=[0, 1, "_A", "_B"])
        filtered = df.filter(like="_")
        self.assertEqual(len(filtered.columns), 2)

        # regex with ints in column names
        # from PR #10384
        df = DataFrame(0.0, index=[0, 1, 2], columns=["A1", 1, "B", 2, "C"])
        expected = DataFrame(0.0, index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object))
        filtered = df.filter(regex="^[0-9]+$")
        assert_frame_equal(filtered, expected)

        expected = DataFrame(0.0, index=[0, 1, 2], columns=[0, "0", 1, "1"])
        # shouldn't remove anything
        filtered = expected.filter(regex="^[0-9]+$")
        assert_frame_equal(filtered, expected)

        # pass in None
        with assertRaisesRegexp(TypeError, "Must pass"):
            self.frame.filter(items=None)

        # objects
        filtered = self.mixed_frame.filter(like="foo")
        self.assertIn("foo", filtered)

        # unicode columns, won't ascii-encode
        df = self.frame.rename(columns={"B": u("\u2202")})
        filtered = df.filter(like="C")
        self.assertTrue("C" in filtered)
Example no. 24
    def fit(self, X: pd.DataFrame, y: pd.Series, /) -> 'Preprocessor':
        self.catCols = self.getCatColumns(X)
        self.catPreprocessor.fit(X.filter(self.catCols), y)

        self.catPCACols = self.getCatCorrelatedCols(X)
        self.catPCA.fit(X.filter(self.catPCACols), y)

        self.contCols = self.getContColumns(X)
        self.contPreprocessor.fit(X.filter(self.contCols))

        self.contPCACols = self.getContCorrelatedCols(X)
        self.contPCA.fit(X.filter(self.contPCACols))
        return self
Example no. 25
def get_historical_production(kornmo,
                              years: List[int] = None,
                              look_back_years: int = 4) -> DataFrame:
    """
    Creates a DataFrame with all farmers for each year in 'years' (default: 2017-2019),
    with each farmer's production numbers for previous years, looking back the number of years specified.
    :param kornmo: An instance of a KornmoDataset
    :param years: The dataframe will have one row for each of these years per farmer
    :param look_back_years: Each row will contain this many years of production numbers
    """

    import pandas as pd
    from functools import reduce

    if years is None:
        years = [2017, 2018, 2019]

    deliveries_by_year = kornmo.get_historical_deliveries_by_year()

    data = DataFrame()

    for year in years:
        dataframes = [
            df.copy() for y, df in deliveries_by_year.items()
            if year - look_back_years <= y < year
        ]

        for index, x in enumerate(dataframes):
            x.drop('year', axis=1, inplace=True)
            x.columns = [
                'orgnr', f'bygg_sum_{index}', f'hvete_sum_{index}',
                f'havre_sum_{index}', f'rug_og_rughvete_sum_{index}'
            ]

        history_data = reduce(
            lambda left, right: pd.merge(
                left, right, on=['orgnr'], how='outer'), dataframes)

        history_data.insert(0, 'year', year)

        data = data.append(history_data, ignore_index=True)

    data_index_cols = data.filter(items=['orgnr', 'year'], axis=1)
    data_cols = data.filter(
        items=[col for col in data.columns if col not in ['orgnr', 'year']])

    return data_index_cols.merge(normalize(data_cols.fillna(0), 0, 10000),
                                 left_index=True,
                                 right_index=True)
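The reduce-based merge above joins one frame per history year on orgnr; a stripped-down sketch of that pattern (note that DataFrame.append, used above, has since been removed from pandas in favour of pd.concat):

from functools import reduce
import pandas as pd

frames = [
    pd.DataFrame({"orgnr": [1, 2], "bygg_sum_0": [10, 20]}),
    pd.DataFrame({"orgnr": [1, 3], "bygg_sum_1": [11, 33]}),
]

# Outer-merge the whole list pairwise on the shared key.
history = reduce(
    lambda left, right: pd.merge(left, right, on="orgnr", how="outer"), frames)
print(history)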
Example no. 26
    def from_df(cls, clusters_df: pd.DataFrame,
                cluster_id: int) -> 'BlockCluster':
        col_stem = 'cluster' + str(cluster_id)

        xmin = clusters_df.filter(regex=col_stem + '_xmin').iloc[-1].values[0]
        xmax = clusters_df.filter(regex=col_stem + '_xmax').iloc[-1].values[0]
        ymin = clusters_df.filter(regex=col_stem + '_ymin').iloc[-1].values[0]
        ymax = clusters_df.filter(regex=col_stem + '_ymax').iloc[-1].values[0]

        avg_blocks = clusters_df.filter(regex='cum_avg_' + col_stem +
                                        '_block_count').iloc[-1].values[0]
        return BlockCluster(ll=Vector3D(xmin, ymin),
                            ur=Vector3D(xmax, ymax),
                            cluster_id=cluster_id,
                            avg_blocks=avg_blocks)
Example no. 27
def na_per_group(df: pd.DataFrame, list_group_prefix: list,
                 values_cols_prefix: str):
    """
    Input: df with abundance values, a list of strings used to select the value columns of each group (list_group_prefix),
    and the prefix used for the data columns (values_cols_prefix).
    The script creates one column per group containing the percentage of missing values for each protein.
    Returns: the resulting columns as a dataframe (stats_per_groups), and the input df augmented with the resulting columns.
    """
    stats_per_groups = pd.DataFrame()

    for group in list_group_prefix:
        data = df.filter(regex=group)

        # strip prefix shared with all other groups
        prefix_to_remove = values_cols_prefix + '_'
        group_name = re.sub(prefix_to_remove, "", group)

        column_to_add_name = "nan_percentage_{}".format(group_name)

        # Add percentage of NaN in the data
        column_to_add_values = data.isna().sum(axis=1) / len(
            data.columns.tolist()) * 100
        kwargs = {column_to_add_name: column_to_add_values}
        df = df.assign(**kwargs)  # keyword in assign can't be an expression

        # Save results aside
        stats_per_groups = pd.concat(
            [stats_per_groups, df[column_to_add_name]], axis=1)
    return df, stats_per_groups
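The statistic added per group above is just the row-wise share of missing values; on a small dummy group (hypothetical column names) it reduces to:

import numpy as np
import pandas as pd

group = pd.DataFrame({
    "VAL_condA_1": [1.0, np.nan, 2.0],
    "VAL_condA_2": [np.nan, np.nan, 4.0],
})

# Percentage of missing values per protein (per row) within the group.
nan_percentage = group.isna().sum(axis=1) / len(group.columns) * 100
print(nan_percentage.tolist())  # [50.0, 100.0, 0.0]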
Example no. 28
def select_all_that_apply_wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """
    Given a wide DataFrame *df*, collapses "Select All That Apply"-type
    categorical survey responses stored in a pseudo-one-hot-encoded format and
    converts the responses into human-readable lists of equivalent logical
    value.

    Throws a :class:`AssertionError` if the number of rows in the resulting
    DataFrame differs from the given *df*.

    Contains hard-coded column names that may need to be updated in the future.
    """

    stubnames = [
        "race", "feeding", "meds", "rn_sx", "sx_specific", "missed_impact",
        "household_sx", "travel_type",
    ]
    pid = ['study_id'] + ['week']

    stubbed_columns = list(df.filter(regex='|'.join(stubnames)))
    reshaped_data = df.drop(stubbed_columns, axis='columns')

    for stub in stubnames:
        true_values = collapse_wide_stubbed_columns(df, stub, pid)
        if true_values is None:
            continue

        reshaped_data = reshaped_data.merge(true_values, how='left', on=pid)

    assert len(df) == len(reshaped_data), f"You do not have a 1:1 merge on {pid}"

    return reshaped_data
Example no. 29
def rename_baseline_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Two columns in the longitudinal data are reported at baseline but do not
    follow the stubnaming-convention of the follow-up columns. Rename these
    columns in *df* to follow the weekly data naming conventions.

    Note that baseline columns are named inconsistently. Some end with '_bl'.
    Others contain '_bl_' in the middle of the column name. Handle both of these
    cases.

    Contains some hard-coded column names that may need to be updated in the
    future.
    """
    df['swab_date_0'] = df['enroll_date']

    rename_map = {
        'enroll_date': 'assess_date_0',
        'baseline_id': 'sample_id_0',
    }

    baseline_columns = list(df.filter(regex='_bl$|_bl_', axis='columns'))

    for col in baseline_columns:
        rename_map[col] = re.sub("_bl", "", col) + "_0"

    return df.rename(columns=rename_map)
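A quick sketch, with invented column names, of how the _bl rename above handles both the suffix and the infix variants:

import re
import pandas as pd

df = pd.DataFrame(columns=["symptoms_bl", "fever_bl_severity", "assess_date_1"])

baseline_columns = list(df.filter(regex="_bl$|_bl_", axis="columns"))
rename_map = {col: re.sub("_bl", "", col) + "_0" for col in baseline_columns}
print(rename_map)
# {'symptoms_bl': 'symptoms_0', 'fever_bl_severity': 'fever_severity_0'}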
Example no. 30
def calibration_plot(df_calibration, model_name):

    dtTrade = df_calibration['dtTrade'][0]
    title = '%s Model (%s)' % (model_name, dtTrade)

    df_calibration = DataFrame.filter(df_calibration,
                    items=['dtExpiry', 
                    'Strike', 'IVBid', 'IVAsk',
                    'TTM', model_name+'-IV'])

    # group by maturity
    grouped = df_calibration.groupby('dtExpiry')

    all_groups = [(dt, g) for dt, g in grouped]

    xy = [(0, 0), (0, 1), (1, 0), (1, 1)]

    for k in range(0, len(all_groups), 4):
        if (k + 4) >= len(all_groups):
            break
        fig, axs = plt.subplots(2, 2, sharex=True, sharey=True)
        axs[0, 0].set_title(title)

        for i in range(4):
            x, y = xy[i]
            calibration_subplot(axs[x, y], all_groups[i + k][1], i,
                                model_name)
        plt.show(block=False)
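Several examples on this page call DataFrame.filter as an unbound method; that is simply the explicit-self spelling of the usual instance call, as this small sketch shows:

import pandas as pd
from pandas import DataFrame

df = pd.DataFrame({"Strike": [100], "IVBid": [0.2], "Fwd": [101]})

# The two calls are equivalent: the frame is passed explicitly as `self`.
a = DataFrame.filter(df, items=["Strike", "IVBid"])
b = df.filter(items=["Strike", "IVBid"])
print(a.equals(b))  # True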
Example no. 31
def predict(test_set: DataFrame, model: LogisticRegression, reg, filename):
    test_df = test_set.filter(regex=reg)
    test_np = test_df.as_matrix()
    predictions = model.predict(test_np)
    result = DataFrame({'PassengerId': test_set['PassengerId'].as_matrix(),
                        'Survived': predictions.astype(np.int32)})
    result.to_csv(filename, index=False)
Example no. 32
def make_helpers(df_option):
    """ build array of helpers and rate curves
    """

    # extract rates and div yields from the data set
    df_tmp = DataFrame.filter(df_option, items=['dtExpiry', 'iRate', 'iDiv'])
    grouped = df_tmp.groupby('dtExpiry')

    def aggregate(serie):
        return serie[serie.index[0]]

    df_rates = grouped.agg(aggregate)

    # Get first index:
    first_index = 0

    dtTrade = df_option['dtTrade'][first_index]
    # back out the spot from any forward
    iRate = df_option['iRate'][first_index]
    iDiv = df_option['iDiv'][first_index]
    TTM = df_option['TTM'][first_index]
    Fwd = df_option['Fwd'][first_index]
    spot = SimpleQuote(Fwd * np.exp(-(iRate - iDiv) * TTM))
    print('Spot: %f risk-free rate: %f div. yield: %f' % (spot.value,
                                                          iRate, iDiv))

    # build array of option helpers
    hh = heston_helpers(spot, df_option, dtTrade, df_rates)

    risk_free_ts = dfToZeroCurve(df_rates['iRate'], dtTrade)
    dividend_ts = dfToZeroCurve(df_rates['iDiv'], dtTrade)

    return {'options': hh['options'], 'spot': spot,
            'risk_free_rate': risk_free_ts,
            'dividend_rate': dividend_ts}
Example no. 33
def drop_very_low_forces(data: pd.DataFrame,
                         threshold: int = 4) -> pd.DataFrame:
    to_drop = (data.filter(like="/") < threshold).any(axis=1)
    print(
        f"\tRemove {to_drop.sum()} rows that have less than {threshold} kg test:\n{data.loc[to_drop]}"
    )
    return data.loc[~to_drop]
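A tiny sketch of the threshold-based row drop above, assuming (as the like="/" filter suggests) that the force columns are the ones whose names contain a slash:

import pandas as pd

data = pd.DataFrame({
    "left/right": [10.0, 2.0, 8.0],
    "up/down": [6.0, 7.0, 1.0],
    "note": ["a", "b", "c"],
})

threshold = 4
# Rows where any force column falls below the threshold are dropped.
to_drop = (data.filter(like="/") < threshold).any(axis=1)
print(data.loc[~to_drop])  # only the first row survives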
Example no. 34
    def _find_submodules(df: pd.DataFrame, modules: List[str] = None) -> Set[str]:
        """
        Search for submodules at root or provided modules.

        To find the submodules, the method analyzes the names of the variables.
        If the kwarg 'modules' is not None, the submodule search is applied only to
        the variables that are part of these modules.

        :param df: the pandas dataframe containing the variables
        :param modules: the list of modules to which the variables belong
        :return: the set of submodules
        """
        var_names = df.filter(items=["Name"])

        if not modules:
            modules = []

        def get_next_module(path):
            submodules = path.split(":")
            if len(modules) >= len(submodules) or submodules[: len(modules)] != modules:
                return ""
            else:
                return submodules[len(modules)]

        submodules = var_names.applymap(get_next_module)
        submodules = submodules[submodules.Name != ""]

        return set(submodules["Name"].tolist())
Example no. 35
def get_states_w_vacations(date: pd.Timestamp, params: pd.DataFrame) -> dict:
    """Get states which currently have vacations for pupils.

    Returns:
        state_to_vacation_name (dict): keys are the states that have vacations
            on the current date. Values are the names of the vacation.

    """
    vacations = params.filter(like="ferien", axis=0).copy()
    if vacations.empty:
        raise ValueError(
            "'params' does not contain any information about vacations.")

    # Dates are stored as epochs so that value can be a numeric column.
    vacations["value"] = from_epochs_to_timestamps(vacations["value"])
    vacations = vacations.groupby(
        vacations.index.names)["value"].first().unstack()
    latest_vacation_date = vacations["end"].max()
    assert (date <= latest_vacation_date
            ), f"Vacations are only known until {latest_vacation_date}"

    has_vacations = (vacations["start"] <= date) & (date <= vacations["end"])
    state_to_vacation = {
        state: name
        for name, state in has_vacations[has_vacations].index
    }
    return state_to_vacation
Example no. 36
def calibration_plot(df_calibration, model_name):

    dtTrade = df_calibration['dtTrade'][0]
    title = '%s Model (%s)' % (model_name, dtTrade)

    df_calibration = DataFrame.filter(df_calibration,
                                      items=[
                                          'dtExpiry', 'Strike', 'IVBid',
                                          'IVAsk', 'TTM', model_name + '-IV'
                                      ])

    # group by maturity
    grouped = df_calibration.groupby('dtExpiry')

    all_groups = [(dt, g) for dt, g in grouped]

    xy = [(0, 0), (0, 1), (1, 0), (1, 1)]

    for k in range(0, len(all_groups), 4):
        if (k + 4) >= len(all_groups):
            break
        fig, axs = plt.subplots(2, 2, sharex=True, sharey=True)
        axs[0, 0].set_title(title)

        for i in range(4):
            x, y = xy[i]
            calibration_subplot(axs[x, y], all_groups[i + k][1], i, model_name)
        plt.show(block=False)
Example no. 37
def cv_per_group(df: pd.DataFrame, list_group_prefix: list,
                 values_cols_prefix: str):
    """
    Input: df with abundance values, a list of strings used to select the value columns of each group (list_group_prefix),
    and the prefix used for the data columns (values_cols_prefix).
    The script creates one column per group containing the CV for each protein.
    Returns: the resulting columns as a dataframe (stats_per_groups), and the input df augmented with the resulting columns.
    """
    stats_per_groups = pd.DataFrame()

    for group in list_group_prefix:
        data = df.filter(regex=group)

        # strip prefix shared with all other groups
        prefix_to_remove = values_cols_prefix + '_'
        group_name = re.sub(prefix_to_remove, "", group)

        column_to_add_name = "CV_{}".format(group_name)

        # Compute CV
        column_to_add_values = np.nanstd(data, axis=1) / np.nanmean(data,
                                                                    axis=1)

        kwargs = {column_to_add_name: column_to_add_values}
        df = df.assign(**kwargs)  # keyword in assign can't be an expression

        # Save results aside
        stats_per_groups = pd.concat(
            [stats_per_groups, df[column_to_add_name]], axis=1)
    return df, stats_per_groups
Example no. 38
def merge_df(df_option, options, model_name):
    df_output = DataFrame.filter(
        df_option,
        items=[
            "dtTrade",
            "dtExpiry",
            "Type",
            "Strike",
            "Mid",
            "QuickDelta",
            "IVBid",
            "IVAsk",
            "iRate",
            "iDiv",
            "ATMVol",
            "Fwd",
            "TTM",
        ],
    )

    model_value = np.zeros(len(df_option))
    model_iv = np.zeros(len(df_option))
    for i, j in zip(range(len(df_option)), range(0, len(options), 2)):
        model_value[i] = options[j].model_value()
        model_iv[i] = options[j].impliedVolatility(
            model_value[i], accuracy=1.0e-5, maxEvaluations=5000, minVol=0.01, maxVol=10.0
        )

    df_output[model_name + "-Value"] = model_value
    df_output[model_name + "-IV"] = model_iv

    return df_output
Example no. 39
def tree_modeling(train_set: DataFrame, reg):
    train_df = train_set.filter(regex=reg)
    train_np = train_df.as_matrix()
    x = train_np[:, 1:]
    y = train_np[:, 0]
    clf = DecisionTreeClassifier(criterion='gini')
    clf.fit(x, y)
    scores = np.array(cross_validation.cross_val_score(clf, x, y, cv=5))
    print('The accuracy on train set is', scores.mean())
    return clf, train_df
Example no. 40
def tree_modeling(train_set: DataFrame, reg):
    train_df = train_set.filter(regex=reg)
    train_np = train_df.as_matrix()
    x = train_np[:, 1:]
    y = train_np[:, 0]
    clf = LogisticRegression(penalty='l1')
    clf.fit(x, y)
    scores = np.array(cross_validation.cross_val_score(clf, x, y, cv=5))
    print('The accuracy on train set is', scores.mean())
    return clf, train_df
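The modeling helpers in the last few examples rely on APIs that have since changed: DataFrame.as_matrix() was removed from pandas, and sklearn's cross_validation module became model_selection. A hedged sketch of the data-preparation step with current APIs, assuming the same column layout (target first):

import pandas as pd

def prepare_xy(train_set: pd.DataFrame, reg: str):
    # Select the feature/target columns by regex, then convert to a NumPy array;
    # to_numpy() is the modern replacement for the removed as_matrix().
    train_np = train_set.filter(regex=reg).to_numpy()
    return train_np[:, 1:], train_np[:, 0]

# Usage sketch:
#   x, y = prepare_xy(train_set, reg)
#   clf = LogisticRegression(penalty='l1', solver='liblinear').fit(x, y)
#   scores = cross_val_score(clf, x, y, cv=5)   # from sklearn.model_selection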
Example no. 41
def heston_calibration(df_option, ival=None):
    """
    calibrate heston model
    """

    # extract rates and div yields from the data set
    df_tmp = DataFrame.filter(df_option, items=["dtExpiry", "iRate", "iDiv"])
    grouped = df_tmp.groupby("dtExpiry")
    df_rates = grouped.agg(lambda x: x[0])

    dtTrade = df_option["dtTrade"][0]
    # back out the spot from any forward
    iRate = df_option["iRate"][0]
    iDiv = df_option["iDiv"][0]
    TTM = df_option["TTM"][0]
    Fwd = df_option["Fwd"][0]
    spot = SimpleQuote(Fwd * np.exp(-(iRate - iDiv) * TTM))
    print("Spot: %f risk-free rate: %f div. yield: %f" % (spot.value, iRate, iDiv))

    # build array of option helpers
    hh = heston_helpers(spot, df_option, dtTrade, df_rates)
    options = hh["options"]
    spot = hh["spot"]

    risk_free_ts = dfToZeroCurve(df_rates["iRate"], dtTrade)
    dividend_ts = dfToZeroCurve(df_rates["iDiv"], dtTrade)

    # initial values for parameters
    if ival is None:
        ival = {"v0": 0.1, "kappa": 1.0, "theta": 0.1, "sigma": 0.5, "rho": -0.5}

    process = HestonProcess(
        risk_free_ts, dividend_ts, spot, ival["v0"], ival["kappa"], ival["theta"], ival["sigma"], ival["rho"]
    )

    model = HestonModel(process)
    engine = AnalyticHestonEngine(model, 64)

    for option in options:
        option.set_pricing_engine(engine)

    om = LevenbergMarquardt(1e-8, 1e-8, 1e-8)
    model.calibrate(options, om, EndCriteria(400, 40, 1.0e-8, 1.0e-8, 1.0e-8))

    print("model calibration results:")
    print("v0: %f kappa: %f theta: %f sigma: %f rho: %f" % (model.v0, model.kappa, model.theta, model.sigma, model.rho))

    calib_error = (1.0 / len(options)) * sum([pow(o.calibration_error() * 100.0, 2) for o in options])

    print("SSE: %f" % calib_error)

    # merge the fitted volatility and the input data set
    return merge_df(df_option, options, "Heston")
Example no. 42
def tree_modeling(train_set: DataFrame, reg):
    train_df = train_set.filter(regex=reg)
    train_np = train_df.as_matrix()
    x = train_np[:, 1:]
    y = train_np[:, 0]
    clf = RandomForestClassifier(n_estimators=270,
                                 max_depth=8,
                                 min_samples_leaf=3,
                                 random_state=50)
    clf.fit(x, y)
    scores = np.array(cross_validation.cross_val_score(clf, x, y, cv=5))
    print('The accuracy on train set is', scores.mean())
    return clf, train_df
Example no. 43
File: hjm.py Project: alpmdog/CQF
def calculate_pca(forwards, no_factors=3):
    fwddiff = forwards.diff()
    fwddiff = fwddiff.dropna()
    covmat = fwddiff.cov()
    covmat = covmat * 252 / 10000
    eigenvecs, eigenmat = jacobi(covmat.values)
    eigvecs = Series(eigenvecs, index=covmat.columns)
    sorted_eigvecs = eigvecs.order(ascending=False)
    top3 = sorted_eigvecs[:no_factors].index
    eigenmat_df = DataFrame(eigenmat, index=covmat.columns,
                            columns=covmat.columns)
    filtered_eigenmat = eigenmat_df.filter(top3)
    return sorted_eigvecs, filtered_eigenmat
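The jacobi eigensolver and Series.order above are project-specific / legacy pieces; a hedged sketch of the same factor extraction using NumPy's symmetric eigendecomposition (np.linalg.eigh returns eigenvalues in ascending order):

import numpy as np
import pandas as pd

def pca_factors(forwards: pd.DataFrame, no_factors: int = 3) -> pd.DataFrame:
    # Daily changes of the forward curve, annualised covariance (bp^2 -> decimal^2).
    diffs = forwards.diff().dropna()
    covmat = diffs.cov() * 252 / 10000

    eigvals, eigvecs = np.linalg.eigh(covmat.values)
    top = np.argsort(eigvals)[::-1][:no_factors]     # indices of the largest eigenvalues
    return pd.DataFrame(eigvecs[:, top], index=covmat.columns)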
Example no. 44
def boost_modeling(train_set: DataFrame, reg):
    train_df = train_set.filter(regex=reg)
    train_np = train_df.as_matrix()
    x = train_np[:, 1:]
    y = train_np[:, 0]
    train_x, train_y, valid_x, valid_y = leave_out(x, y)
    dtrain = xgb.DMatrix(data=train_x, label=train_y)
    dvalid = xgb.DMatrix(data=valid_x, label=valid_y)
    watchlist = [(dtrain, 'train')]
    param = {'max_depth': 6, 'eta': 0.05, 'silent': 1,
             'objective': 'binary:logistic', 'subsample': 0.9}
    bst = xgb.train(param, dtrain, num_boost_round=17, evals=watchlist)
    return bst
Example no. 45
    def test_filter_regex_search(self):
        fcopy = self.frame.copy()
        fcopy["AA"] = 1

        # regex
        filtered = fcopy.filter(regex="[A]+")
        self.assertEqual(len(filtered.columns), 2)
        self.assertIn("AA", filtered)

        # doesn't have to be at beginning
        df = DataFrame({"aBBa": [1, 2], "BBaBB": [1, 2], "aCCa": [1, 2], "aCCaBB": [1, 2]})

        result = df.filter(regex="BB")
        exp = df[[x for x in df.columns if "BB" in x]]
        assert_frame_equal(result, exp)
Example no. 46
    def test_filter_regex_search(self):
        fcopy = self.frame.copy()
        fcopy['AA'] = 1

        # regex
        filtered = fcopy.filter(regex='[A]+')
        self.assertEqual(len(filtered.columns), 2)
        self.assertIn('AA', filtered)

        # doesn't have to be at beginning
        df = DataFrame({'aBBa': [1, 2],
                        'BBaBB': [1, 2],
                        'aCCa': [1, 2],
                        'aCCaBB': [1, 2]})

        result = df.filter(regex='BB')
        exp = df[[x for x in df.columns if 'BB' in x]]
        assert_frame_equal(result, exp)
Example no. 47
def merge_df(df_option, options, model_name):
    df_output = DataFrame.filter(df_option,
                items=['dtTrade', 'dtExpiry',
                       'Type', 'K', 'Mid',
                       'QuickDelta', 'VB', 'VA',
                       'R', 'D', 'ATMVol', 'F', 'T'])

    model_value = np.zeros(len(df_option))
    model_iv = np.zeros(len(df_option))
    for i, j in zip(range(len(df_option)), range(0, len(options),2)):
        model_value[i] = options[j].model_value()
        model_iv[i] = options[j].impliedVolatility(model_value[i],
            accuracy=1.e-5, maxEvaluations=5000,
            minVol=.01, maxVol=10.0)

    df_output[model_name + '-Value'] = model_value
    df_output[model_name + '-IV'] = model_iv

    return df_output
Example no. 48
def calibration_plot(title, df_calibration, model_name):
    df_calibration = DataFrame.filter(
        df_calibration, items=["dtExpiry", "Strike", "IVBid", "IVAsk", "TTM", model_name + "-IV"]
    )

    # group by maturity
    grouped = df_calibration.groupby("dtExpiry")

    all_groups = [(dt, g) for dt, g in grouped]

    xy = [(0, 0), (0, 1), (1, 0), (1, 1)]

    for k in range(0, len(all_groups), 4):
        if (k + 4) >= len(all_groups):
            break
        plt.figure()
        fig, axs = plt.subplots(2, 2, sharex=True, sharey=True)

        for i in range(4):
            x, y = xy[i]
            calibration_subplot(axs[x, y], all_groups[i + k][1], i, model_name)
        plt.show()
Example no. 49
	def main(self):
		
		sampleMap   = self.maps
		snpsDict    = self.snps
		MainDF      = DataFrame()
		snpsList    = self.snpsPos 
		samplesAll  = self.samA
		samplesUsed = self.samM

		## code for multiprocessing
		
		pool  = mp.Pool(processes=4)
		for res in pool.imap_unordered(functools.partial(mp_coverage_process,sampleMap=sampleMap,snpsDict=snpsDict,sortedSnps=snpsList),samplesAll,chunksize=50):
			sampleName, (coverages, snpIndices) = res
			DF_temp = DataFrame({sampleName:coverages},index = snpIndices)
			MainDF  = pd.concat([MainDF,DF_temp],axis = 1)
			
		"""
		for sampleName in samplesUsed:
			
			print sampleName
			sys.stdout.flush()	
			covrFile = sampleMap[sampleName]
			coverages, snpIndices = self.coverage_process(covrFile,snpsDict)
			DF_temp = DataFrame({sampleName:coverages},index = snpIndices)
			MainDF  = pd.concat([MainDF,DF_temp],axis = 1)
		
		"""		
		
		subDF   = MainDF.filter(items=samplesUsed)
		medians = subDF.median(axis=1)
		medFile = os.path.join(self.odir,'median_coverages_for_chrm_'+self.chrm+'_part'+self.part+'.csv')
		medians.to_csv(medFile)

		outf = os.path.join(self.odir,'coverages_for_chrm_'+self.chrm+'_part'+self.part+'.csv')
		MainDF.to_csv(outf)

		return 'Done'
Example no. 50
def calibration_plot(title, df_calibration, model_name):
    df_calibration = DataFrame.filter(df_calibration,
                    items=['dtExpiry', 
                           'K', 'VB', 'VA',
                           'T', model_name+'-IV'])

    # group by maturity
    grouped = df_calibration.groupby('dtExpiry')

    all_groups = [(dt, g) for dt, g in grouped]
    
    xy = [(0,0), (0,1), (1,0), (1,1)]

    for k in range(0, len(all_groups),4):
        if (k+4) >= len(all_groups):
            break
        plt.figure()
        fig, axs = plt.subplots(2, 2, sharex=True, sharey=True)

        for i in range(4):
            x,y = xy[i]
            calibration_subplot(axs[x,y], all_groups[i+k][1],i, model_name)
        fig.suptitle(title, fontsize=12, fontweight='bold')
        fig.show()
Example no. 51
MainDF = DataFrame()
for sample in samplesAll:

    subDirectory = os.path.join(datad, sample)
    covrFileName = sample + "_" + chrm + "_part_" + str(part) + ".pickle"
    covrFilePath = os.path.join(subDirectory, covrFileName)

    try:
        with open(covrFilePath, "rb") as inp:
            data = cPickle.load(inp)
    except IOError, e:
        print e

    coverages = data["coverages"]
    varIndices = data["indices"]

    DF_temp = DataFrame({sample: coverages}, index=varIndices)
    MainDF = pd.concat([MainDF, DF_temp], axis=1)


subDF = MainDF.filter(items=samplesMed)
medians = subDF.median(axis=1)
medFile = os.path.join(outd, "Medians", "median_coverages_for_chrm_" + chrm + "_part" + part + ".csv")
medians.to_csv(medFile)

outf1 = os.path.join(outd, "Whites", "coverages_for_chrm_" + chrm + "_part" + part + "_whites_only.csv")
subDF.to_csv(outf1)
outf2 = os.path.join(outd, "All", "coverages_for_chrm_" + chrm + "_part" + part + "_all_samples.csv")
MainDF.to_csv(outf2)
Example no. 52
def heston_helpers(df_option, dtTrade=None, df_rates=None, ival=None):
    """
    Create array of heston options helpers
    """

    if dtTrade is None:
        dtTrade = df_option['dtTrade'][0]
    DtSettlement = datetoQLDate(dtTrade)
    
    settings = Settings()
    settings.evaluation_date = DtSettlement

    calendar = TARGET()

    if df_rates is None:
        df_tmp = DataFrame.filter(df_option, items=['dtExpiry', 'IR', 'IDIV'])
        grouped = df_tmp.groupby('dtExpiry')
        df_rates = grouped.agg(lambda x: x[0])

    # convert data frame (date/value) into zero curve
    # expect the index to be a date, and 1 column of values

    risk_free_ts = dfToZeroCurve(df_rates['R'], dtTrade)
    dividend_ts = dfToZeroCurve(df_rates['D'], dtTrade)

    # back out the spot from any forward
    iRate = df_option['R'][0]
    iDiv = df_option['D'][0]
    TTM = df_option['T'][0]
    Fwd = df_option['F'][0]
    spot = SimpleQuote(Fwd*np.exp(-(iRate-iDiv)*TTM))
    print('Spot: %f risk-free rate: %f div. yield: %f' % (spot.value, iRate, iDiv))

    # loop through rows in option data frame, construct
    # helpers for bid/ask

    oneDay = datetime.timedelta(days=1)
    dtExpiry = [dtTrade + int(t*365)*oneDay for t in df_option['T']]
    df_option['dtExpiry'] = dtExpiry

    options = []
    for index, row in df_option.T.iteritems():

        strike = row['K']
        if (strike/spot.value > 1.3) | (strike/spot.value < .7):
            continue

        days = int(365*row['T'])
        maturity = Period(days, Days)

        options.append(
                HestonModelHelper(
                    maturity, calendar, spot.value,
                    strike, SimpleQuote(row['VB']),
                    risk_free_ts, dividend_ts,
                    ImpliedVolError))
        
        options.append(
                HestonModelHelper(
                    maturity, calendar, spot.value,
                    strike, SimpleQuote(row['VA']),
                    risk_free_ts, dividend_ts,
                    ImpliedVolError))

    return {'options':options, 'spot': spot}
Example no. 53
def heston_calibration(df_option, ival=None):
    """
    calibrate heston model
    """

    # extract rates and div yields from the data set    
    df_tmp = DataFrame.filter(df_option, items=['dtExpiry', 'iRate', 'iDiv'])
    grouped = df_tmp.groupby('dtExpiry')

    def aggregate(serie):
        return serie[serie.index[0]]

    df_rates = grouped.agg(aggregate)

    # Get first index:
    first_index = 0

    dtTrade = df_option['dtTrade'][first_index]
    # back out the spot from any forward
    iRate = df_option['iRate'][first_index]
    iDiv = df_option['iDiv'][first_index]
    TTM = df_option['TTM'][first_index]
    Fwd = df_option['Fwd'][first_index]
    spot = SimpleQuote(Fwd*np.exp(-(iRate-iDiv)*TTM))
    print('Spot: %f risk-free rate: %f div. yield: %f' % (spot.value, iRate, iDiv))

    # build array of option helpers
    hh = heston_helpers(spot, df_option, dtTrade, df_rates)
    options = hh['options']
    spot = hh['spot']

    risk_free_ts = dfToZeroCurve(df_rates['iRate'], dtTrade)
    dividend_ts = dfToZeroCurve(df_rates['iDiv'], dtTrade)

    # initial values for parameters
    if ival is None:
        ival = {'v0': 0.1, 'kappa': 1.0, 'theta': 0.1,
        'sigma': 0.5, 'rho': -.5}

    process = HestonProcess(
        risk_free_ts, dividend_ts, spot, ival['v0'], ival['kappa'],
         ival['theta'], ival['sigma'], ival['rho'])

    model = HestonModel(process)
    engine = AnalyticHestonEngine(model, 64)

    for option in options:
        option.set_pricing_engine(engine)

    om = LevenbergMarquardt(1e-8, 1e-8, 1e-8)
    model.calibrate(
        options, om, EndCriteria(400, 40, 1.0e-8, 1.0e-8, 1.0e-8)
    )

    print('model calibration results:')
    print('v0: %f kappa: %f theta: %f sigma: %f rho: %f' %
          (model.v0, model.kappa, model.theta, model.sigma,
           model.rho))

    calib_error = (1.0/len(options)) * sum(
        [pow(o.calibration_error()*100.0,2) for o in options])

    print('SSE: %f' % calib_error)

    # merge the fitted volatility and the input data set
    return merge_df(df_option, options, 'Heston')
Example no. 54
    def test_filter(self):
        # Items
        filtered = self.frame.filter(['A', 'B', 'E'])
        assert len(filtered.columns) == 2
        assert 'E' not in filtered

        filtered = self.frame.filter(['A', 'B', 'E'], axis='columns')
        assert len(filtered.columns) == 2
        assert 'E' not in filtered

        # Other axis
        idx = self.frame.index[0:4]
        filtered = self.frame.filter(idx, axis='index')
        expected = self.frame.reindex(index=idx)
        tm.assert_frame_equal(filtered, expected)

        # like
        fcopy = self.frame.copy()
        fcopy['AA'] = 1

        filtered = fcopy.filter(like='A')
        assert len(filtered.columns) == 2
        assert 'AA' in filtered

        # like with ints in column names
        df = DataFrame(0., index=[0, 1, 2], columns=[0, 1, '_A', '_B'])
        filtered = df.filter(like='_')
        assert len(filtered.columns) == 2

        # regex with ints in column names
        # from PR #10384
        df = DataFrame(0., index=[0, 1, 2], columns=['A1', 1, 'B', 2, 'C'])
        expected = DataFrame(
            0., index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object))
        filtered = df.filter(regex='^[0-9]+$')
        tm.assert_frame_equal(filtered, expected)

        expected = DataFrame(0., index=[0, 1, 2], columns=[0, '0', 1, '1'])
        # shouldn't remove anything
        filtered = expected.filter(regex='^[0-9]+$')
        tm.assert_frame_equal(filtered, expected)

        # pass in None
        with pytest.raises(TypeError, match='Must pass'):
            self.frame.filter()
        with pytest.raises(TypeError, match='Must pass'):
            self.frame.filter(items=None)
        with pytest.raises(TypeError, match='Must pass'):
            self.frame.filter(axis=1)

        # test mutually exclusive arguments
        with pytest.raises(TypeError, match='mutually exclusive'):
            self.frame.filter(items=['one', 'three'], regex='e$', like='bbi')
        with pytest.raises(TypeError, match='mutually exclusive'):
            self.frame.filter(items=['one', 'three'], regex='e$', axis=1)
        with pytest.raises(TypeError, match='mutually exclusive'):
            self.frame.filter(items=['one', 'three'], regex='e$')
        with pytest.raises(TypeError, match='mutually exclusive'):
            self.frame.filter(items=['one', 'three'], like='bbi', axis=0)
        with pytest.raises(TypeError, match='mutually exclusive'):
            self.frame.filter(items=['one', 'three'], like='bbi')

        # objects
        filtered = self.mixed_frame.filter(like='foo')
        assert 'foo' in filtered

        # unicode columns, won't ascii-encode
        df = self.frame.rename(columns={'B': u('\u2202')})
        filtered = df.filter(like='C')
        assert 'C' in filtered
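The test above exercises the three selection modes of DataFrame.filter and checks that they are mutually exclusive. A minimal standalone sketch of the same behaviour, outside the test fixtures:

import pandas as pd

df = pd.DataFrame({'A': [1, 2], 'AA': [3, 4], 'B': [5, 6]})

print(df.filter(items=['A', 'B', 'E']))  # exact labels; missing ones are ignored
print(df.filter(like='A'))               # substring match -> columns A and AA
print(df.filter(regex='^A$'))            # regular expression -> column A only

# items, like and regex cannot be combined
try:
    df.filter(items=['A'], like='A')
except TypeError as exc:
    print(exc)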
Esempio n. 55
0
    # parametrization reconstructed so the snippet is self-contained (GH13101);
    # the original pandas test supplies these name/expected pairs via pytest
    @pytest.mark.parametrize('name,expected', [
        ('a', DataFrame({u'a': [1, 2]})),
        (u'あ', DataFrame({u'あ': [3, 4]})),
    ])
    def test_filter_unicode(self, name, expected):
        # GH13101
        df = DataFrame({u'a': [1, 2], u'あ': [3, 4]})

        assert_frame_equal(df.filter(like=name), expected)
        assert_frame_equal(df.filter(regex=name), expected)
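Outside the test harness, the same unicode filtering can be checked directly; a minimal sketch:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'あ': [3, 4]})

print(df.filter(like='あ'))    # substring match on the non-ASCII column name
print(df.filter(regex='あ'))   # the regex form selects the same single column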
Esempio n. 56
0
def ATM_Vol(premium, discountFactor, forward, strike):
    """
    Approximate the Black std dev (sigma*sqrt(T)) for calls close to the money
    """
    vol = (premium/discountFactor - 0.5*(forward-strike))*5.0/(forward+strike)

    return vol

# NOTE: the lines below are the surrounding calibration script, not part of
# ATM_Vol; helpers such as read_SPX_file and constants such as tMin, QDMin,
# QDMax and the *_file paths are assumed to be defined earlier in that script.

# get spot and option data frame
(spot, optionDataFrame) = read_SPX_file(option_data_file)

grouped = optionDataFrame.groupby('dtExpiry')

isFirst = True
for spec, group in grouped:
    print('processing group %s' % spec)

    # implied vol for this type/expiry group
    indx = group.index

    dtTrade = group['dtTrade'][indx[0]]
    dtExpiry = group['dtExpiry'][indx[0]]
    daysToExpiry = (dtExpiry - dtTrade).days
    timeToMaturity = daysToExpiry / 365.0

    # exclude groups with too few data points or too short maturity
    if timeToMaturity < tMin:
        continue

    # valid call and put quotes (take copies so the derived columns added
    # below do not trigger chained-assignment warnings)
    df_call = group[(group['Type'] == 'C') & (group['Bid'] > 0)
                    & (group['Ask'] > 0)].copy()
    df_put = group[(group['Type'] == 'P') & (group['Bid'] > 0)
                   & (group['Ask'] > 0)].copy()
    if (len(df_call) == 0) or (len(df_put) == 0):
        continue

    # calculate forward, implied interest rate and implied div. yield
    df_call['Mid'] = (df_call['Bid'] + df_call['Ask']) / 2
    df_put['Mid'] = (df_put['Bid'] + df_put['Ask']) / 2

    df_C = df_call.filter(items=['Strike', 'Mid'])
    df_C.columns = ['Strike', 'PremiumC']
    # put mid premiums indexed by strike, ready for the join below
    to_join = DataFrame({'PremiumP': df_put['Mid'].values},
                        index=df_put['Strike'].values)

    # use 'inner' join because some strikes are not quoted for C and P
    df_all = df_C.join(to_join, on='Strike', how='inner')

    df_all['C-P'] = df_all['PremiumC'] - df_all['PremiumP']

    # put-call parity regression C - P = b[0]*K + b[1]
    # (np.polyfit replaces the removed pandas ols/.ix used in the original;
    #  b[0] is the slope, the intercept is the last coefficient)
    b = np.polyfit(df_all['Strike'].values, df_all['C-P'].values, 1)

    iRate = -np.log(-b[0]) / timeToMaturity
    dRate = np.log(spot / b[1]) / timeToMaturity
    discountFactor = np.exp(-iRate * timeToMaturity)
    Fwd = spot * np.exp((iRate - dRate) * timeToMaturity)

    print('Fwd: %f int rate: %f div yield: %f' % (Fwd, iRate, dRate))

    # interpolate ATM premium and vol: used to compute Quick Delta
    f_call = interp1d(df_all['Strike'].values, df_all['PremiumC'].values)
    f_put = interp1d(df_all['Strike'].values, df_all['PremiumP'].values)

    atmPremium = (f_call(Fwd) + f_put(Fwd)) / 2
    atmVol = blackFormulaImpliedStdDev('C', strike=Fwd,
                                       forward=Fwd, blackPrice=atmPremium,
                                       discount=discountFactor,
                                       TTM=timeToMaturity) / np.sqrt(timeToMaturity)

    print('ATM vol: %f' % atmVol)

    # Quick Delta, computed with ATM vol
    rv = norm()
    df_call['QuickDelta'] = [rv.cdf(np.log(Fwd / k) / (atmVol * np.sqrt(timeToMaturity)))
                             for k in df_call['Strike']]
    df_put['QuickDelta'] = [rv.cdf(np.log(Fwd / k) / (atmVol * np.sqrt(timeToMaturity)))
                            for k in df_put['Strike']]

    # implied bid/ask vol for all options
    def impvol(strike, premium):
        try:
            vol = blackFormulaImpliedStdDev(cp, strike,
                forward=Fwd, blackPrice=premium, discount=discountFactor,
                TTM=timeToMaturity)
        except Exception:
            vol = np.nan
        # the return must sit outside the except block, otherwise the
        # successful branch returns None; convert std dev to annualized vol
        return vol / np.sqrt(timeToMaturity)

    cp = 'C'
    df_call['IVBid'] = [impvol(k, price)
                        for k, price in zip(df_call['Strike'], df_call['Bid'])]
    df_call['IVAsk'] = [impvol(k, price)
                        for k, price in zip(df_call['Strike'], df_call['Ask'])]

    cp = 'P'
    df_put['IVBid'] = [impvol(k, price)
                       for k, price in zip(df_put['Strike'], df_put['Bid'])]
    df_put['IVAsk'] = [impvol(k, price)
                       for k, price in zip(df_put['Strike'], df_put['Ask'])]

    # keep OTM data for options within the Quick Delta range
    df_call = df_call[(df_call['Strike'] >= Fwd) &
                      (df_call['QuickDelta'] >= QDMin) &
                      (df_call['QuickDelta'] <= QDMax)]

    df_put = df_put[(df_put['Strike'] < Fwd) &
                    (df_put['QuickDelta'] >= QDMin) &
                    (df_put['QuickDelta'] <= QDMax)]

    # final assembly...

    df_cp = df_call.append(df_put,  ignore_index=True)
    df_cp['R'] = iRate 
    df_cp['D'] = dRate 
    df_cp['ATMVol'] = atmVol 
    df_cp['F'] = Fwd
    df_cp['T'] = timeToMaturity
    df_cp = df_cp.rename(columns=
                         {'IVBid': 'VB',
                          'IVAsk': 'VA',
                          'Strike': 'K'})
    df_cp['CP'] = [1 if t == 'C' else -1 for t in df_cp['Type']]
                         
    if isFirst:
        df_final = df_cp
        isFirst = False 
    else:
        df_final = df_final.append(df_cp, ignore_index=True)
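The forward and rates in the loop above are backed out from put-call parity, C - P = exp(-r*T)*(F - K): a linear fit of C - P against the strike K has slope -exp(-r*T) and intercept exp(-r*T)*F = S*exp(-q*T). A self-contained sketch with synthetic numbers (all values illustrative):

import numpy as np

# synthetic example: spot 100, r = 2%, q = 1%, T = 0.5y (illustrative values)
spot, r, q, T = 100.0, 0.02, 0.01, 0.5
DF = np.exp(-r * T)
F = spot * np.exp((r - q) * T)

strikes = np.array([90.0, 95.0, 100.0, 105.0, 110.0])
c_minus_p = DF * (F - strikes)          # exact parity values

b = np.polyfit(strikes, c_minus_p, 1)   # b[0] = -DF, b[1] = DF * F
iRate = -np.log(-b[0]) / T              # recovers r
dRate = np.log(spot / b[1]) / T         # recovers q
Fwd = spot * np.exp((iRate - dRate) * T)
print(iRate, dRate, Fwd)                # ~0.02, ~0.01, ~100.5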
Esempio n. 57
0
    # final assembly inside the per-expiry loop, as in the previous example
    df_cp = df_call.append(df_put, ignore_index=True)
    df_cp['R'] = iRate 
    df_cp['D'] = dRate 
    df_cp['ATMVol'] = atmVol 
    df_cp['F'] = Fwd
    df_cp['T'] = timeToMaturity
    df_cp = df_cp.rename(columns=
                         {'IVBid': 'VB',
                          'IVAsk': 'VA',
                          'Strike': 'K'})
    df_cp['CP'] = [1 if t == 'C' else -1 for t in df_cp['Type']]
                         
    if isFirst:
        df_final = df_cp
        isFirst = False 
    else:
        df_final = df_final.append(df_cp, ignore_index=True)
        
df_final.to_csv(calibration_data_file, index=False)

# DataFrame.save was removed from pandas; to_pickle is the current equivalent
df_final.to_pickle('data/df_final.pkl')

# save term structure of dividends and rates: first item in each expiry group
df_tmp = df_final.filter(items=['dtExpiry', 'R', 'D'])
grouped = df_tmp.groupby('dtExpiry')
df_rates = grouped.agg(lambda x: x.iloc[0])

df_rates.to_csv(rate_div_file)
df_rates.to_pickle('data/df_rates.pkl')
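The last two examples still use the old DataFrame.append accumulation pattern (isFirst / append), which has since been removed from pandas. A hedged sketch of the same accumulation and persistence steps with current pandas, on toy stand-ins for the per-expiry frames:

import pandas as pd

# toy stand-ins for the df_cp frames assembled in the loop above
frames = [
    pd.DataFrame({'K': [1300.0, 1350.0], 'VB': [0.21, 0.20], 'VA': [0.22, 0.21]}),
    pd.DataFrame({'K': [1400.0, 1450.0], 'VB': [0.19, 0.18], 'VA': [0.20, 0.19]}),
]

# pd.concat replaces the isFirst / DataFrame.append accumulation pattern
df_final = pd.concat(frames, ignore_index=True)

# to_pickle / read_pickle replace the removed save / load methods
df_final.to_pickle('df_final.pkl')
df_final = pd.read_pickle('df_final.pkl')
print(df_final)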