Example No. 1
    def dfLastLacFrecuencia(self, df: pd.DataFrame):
        # Count how many times each LAST_LAC value appears, sorted by descending frequency.
        groupedDf = df.groupby('LAST_LAC').agg(FRECUENCIA=pd.NamedAgg(column='LAST_LAC', aggfunc='size'))
        groupedDf['LAST_LAC'] = groupedDf.index
        groupedDf = groupedDf[['LAST_LAC', 'FRECUENCIA']]
        return groupedDf.sort_values(by='FRECUENCIA', ascending=False)
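    # A toy illustration (hypothetical data, not from the source): calling this method on
    # pd.DataFrame({'LAST_LAC': [101, 101, 202]}) yields LAST_LAC 101 with FRECUENCIA 2
    # and LAST_LAC 202 with FRECUENCIA 1, sorted by FRECUENCIA in descending order.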
    def __init__(self, load_file_name: str, features_files_dict: dict, total_payoff_label: bool=True,
                 features_file_list: list=list(), use_all_history: bool = False, label: str='total_payoff',
                 use_all_history_text_average: bool = False,
                 use_all_history_text: bool=False, use_all_history_average: bool = False,
                 use_prefix_suffix_setting: bool=False, features_to_drop: list=None,
                 suffix_average_text: bool=False, no_suffix_text: bool=False,
                 non_nn_turn_model: bool=False, transformer_model: bool=False,
                 prefix_data_in_sequence: bool=False, data_type='train_data', no_decision_features: bool=False,
                 suffix_no_current_round_average_text: bool=False):
        """
        :param load_file_name: the raw data file name
        :param features_files_dict: dict of features files and types
        :param total_payoff_label: if the label is the total payoff of the expert or the next rounds normalized payoff
        :param label: the name of the label
        :param features_file_list: if using fix features- the name of the features file
        :param use_all_history: if to add some numeric features regarding the history decisions and lottery
        :param use_all_history_average: if to add some numeric features regarding the history decisions and lottery as
        average over the history
        :param use_all_history_text: if to use all the history text features
        :param use_all_history_text_average: if to use the history text as average over all the history
        :param use_prefix_suffix_setting: if we create data for crf model with fixed prefix
        :param features_to_drop: a list of features to drop
        :param suffix_average_text:  if we want to add the suffix average text features
        :param no_suffix_text: if we don't want to use the text of the suffix rounds
        :param non_nn_turn_model: non neural networks models that predict a label for each round
        :param transformer_model: create data for transformer model --> create features for prefix rounds too
        :param prefix_data_in_sequence: if the prefix data is not in the suffix features but in the seq
        :param no_decision_features: if we want to check models without decision features
        :param suffix_no_current_round_average_text: if we want the average of all suffix text
        """
        print(f'Start create and save data for file: {os.path.join(data_directory, f"{load_file_name}_{data_type}.csv")}')
        logging.info('Start create and save data for file: {}'.
                     format(os.path.join(data_directory, f'{load_file_name}_{data_type}.csv')))

        self.data = pd.read_csv(os.path.join(data_directory, f'{load_file_name}_{data_type}.csv'))  # , usecols=columns_to_use)
        print(f'Number of rows in data: {self.data.shape[0]}')
        self.data = self.data.loc[(self.data.status == 'play') & (self.data.player_id_in_group == 2)]
        print(f'Number of rows in data: {self.data.shape[0]} after keep only play and decision makers')
        self.data = self.data.drop_duplicates()
        print(f'Number of rows in data: {self.data.shape[0]} after drop duplicates')

        # get manual text features
        reviews_features_files_list = list()
        print(f'Load features from: {features_file_list}')
        for features_file in features_file_list:
            features_file_type = features_files_dict[features_file]
            if features_file_type == 'pkl':
                reviews_features_files_list.append(joblib.load(os.path.join(
                    data_directory, f'{features_file}_{data_type}.{features_file_type}')))
            elif features_file_type == 'xlsx':
                features = pd.read_excel(os.path.join(
                    data_directory, f'{features_file}_{data_type}.{features_file_type}'))
                if data_type == 'test_data':  # change order to be the same as in the train data
                    train_features = pd.read_excel(
                        os.path.join(data_directory, f'{features_file}_train_data.{features_file_type}'))
                    features = features[train_features.columns]
                reviews_features_files_list.append(features)

            else:
                print('Features file type has to be pkl or xlsx')
                return
        # get manual text features
        for index, reviews_features_file in enumerate(reviews_features_files_list):
            if 'review' in reviews_features_file:
                reviews_features_file = reviews_features_file.drop('review', axis=1)
            if 'score' in reviews_features_file:
                reviews_features_file = reviews_features_file.drop('score', axis=1)

            if reviews_features_file.shape[1] == 2:  # Bert features -> flat the vectors
                reviews = pd.DataFrame()
                for i in reviews_features_file.index:
                    temp = pd.concat([
                        pd.DataFrame(reviews_features_file.at[i, 'review_features']),
                        pd.DataFrame([reviews_features_file.at[i, 'review_id']], index=['review_id'])])
                    reviews = pd.concat([reviews, temp], axis=1, ignore_index=True)

                reviews_features_files_list[index] = reviews.T
            else:  # manual features
                if features_to_drop is not None:
                    reviews_features_files_list[index] = reviews_features_file.drop(features_to_drop, axis=1)

        if len(reviews_features_files_list) == 1:
            self.reviews_features = reviews_features_files_list[0]
        elif len(reviews_features_files_list) == 2:
            self.reviews_features = reviews_features_files_list[0].merge(reviews_features_files_list[1],
                                                                         on='review_id')
        else:
            print(f"Can't create reviews features with {len(reviews_features_files_list)} feature types")

        # calculate expert total payoff --> the label
        self.data['exp_payoff'] = self.data.group_receiver_choice.map({1: 0, 0: 1})
        total_exp_payoff = self.data.groupby(by='pair_id').agg(
            total_exp_payoff=pd.NamedAgg(column='exp_payoff', aggfunc=sum))
        self.data = self.data.merge(total_exp_payoff, how='left', right_index=True, left_on='pair_id')
        self.data['10_result'] = np.where(self.data.group_lottery_result == 10, 1, 0)
        self.data = self.data[['pair_id', 'total_exp_payoff', 'subsession_round_number', 'group_sender_answer_reviews',
                               'exp_payoff', 'group_lottery_result', 'review_id', 'previous_round_lottery_result',
                               'previous_round_decision', 'group_average_score',
                               'lottery_result_low', 'lottery_result_med1', 'previous_round_lottery_result_low',
                               'previous_round_lottery_result_high', 'previous_average_score_low',
                               'previous_average_score_high', 'previous_round_lottery_result_med1',
                               'group_sender_payoff', 'lottery_result_high',
                               'chose_lose', 'chose_earn', 'not_chose_lose', 'not_chose_earn',
                               'previous_score', 'group_sender_answer_scores', '10_result']]
        # 'time_spent_low', 'time_spent_high',
        self.final_data = pd.DataFrame()
        self.pairs = pd.Series(self.data.pair_id.unique())
        self.total_payoff_label = total_payoff_label
        self.label = label
        self.number_of_rounds = 10
        self.features_file_list = features_file_list
        self.use_all_history = use_all_history
        self.use_all_history_average = use_all_history_average
        self.use_all_history_text_average = use_all_history_text_average
        self.use_all_history_text = use_all_history_text
        self.suffix_average_text = suffix_average_text
        self.suffix_no_current_round_average_text = suffix_no_current_round_average_text
        self.no_suffix_text = no_suffix_text
        self.non_nn_turn_model = non_nn_turn_model
        self.transformer_model = transformer_model
        self.prefix_data_in_sequence = prefix_data_in_sequence
        self.decisions_payoffs_columns = ['exp_payoff', 'lottery_result_high', 'lottery_result_low',
                                          'lottery_result_med1', 'chose_lose', 'chose_earn', 'not_chose_lose',
                                          'not_chose_earn']
        if no_decision_features:
            self.decisions_payoffs_columns = list()
        print(f'Number of pairs in data: {self.pairs.shape}')

        self.history_columns = list()
        if self.use_all_history_average:
            self.set_all_history_average_measures()

        # create file names:
        file_name_component = [f'{self.label}_label_',
                               'prefix_suffix_' if use_prefix_suffix_setting else '',
                               'non_nn_turn_model_' if self.non_nn_turn_model else '',
                               'transformer_' if self.transformer_model else '',
                               f'all_history_features_' if self.use_all_history else '',
                               f'all_history_features_avg_with_global_alpha_{alpha_global}_'
                               if self.use_all_history_average else '',
                               f'all_history_text_avg_with_alpha_{alpha_text}_' if
                               self.use_all_history_text_average else '',
                               'prefix_in_seq_' if prefix_data_in_sequence else '',
                               f'no_suffix_text_' if self.no_suffix_text else '',
                               f'all_suffix_text_average_' if self.suffix_average_text else '',
                               f'all_history_text_' if self.use_all_history_text else '',
                               f'{self.features_file_list}_',
                               'no_decision_features_' if no_decision_features else 'use_decision_features_',
                               f'{condition}_{data_type}']
        self.base_file_name = ''.join(file_name_component)
        print(f'Create data for: {self.base_file_name}')
        return
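    # A hypothetical construction sketch (the enclosing class name and the file names are
    # assumptions for illustration, not taken from the source):
    # creator = DataCreator(load_file_name='results',
    #                       features_files_dict={'manual_features': 'xlsx'},
    #                       features_file_list=['manual_features'],
    #                       use_all_history_average=True,
    #                       data_type='train_data')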
Example No. 3
import pandas as pd

df_dummy = pd.DataFrame(
    dict(id=[1, 1, 2, 2, 3, 3, 3], values=[3, 5, 6, 7, 8, 9, 15]))

df_stats = (df_dummy.groupby(["id"]).agg(
    count=pd.NamedAgg(column="values", aggfunc="count"),
    sum=pd.NamedAgg(column="values", aggfunc="sum"),
    max=pd.NamedAgg(column="values", aggfunc="max"),
).reset_index().assign(
    pct_value=lambda df: round(100 * df["sum"] / sum(df["sum"]), 2)))

# To double check - you might sample a column

df_temp = df_dummy.loc[lambda x: x["id"] == 1][["values"]]
df_temp.sum().values
df_temp.max().values

# Or you would do this:

df_check = pd.DataFrame({
    "id": {
        0: 1,
        1: 2,
        2: 3
    },
    "count": {
        0: 2,
        1: 2,
        2: 3
    },
Example No. 4
df['EarlyDeliveryDate'] = pd.to_datetime(df['EarlyDeliveryDate'])
df['ReceivedDate'] = pd.to_datetime(df['ReceivedDate'])
#df.info()
    
df['DeliveryTime'] = [x/10 if x>0 else x for x in df.DeliveryTime]

df['NormDeliveryTime'] = 0-df.DeliveryTime.abs()
scale = MinMaxScaler()
df['NormDeliveryTime'] = scale.fit_transform(df[['NormDeliveryTime']])

df['PercentKept'] = 1-df.PercentOfQuantityReturned


df['ReceivedMonth'] = df['ReceivedDate'].dt.month
df['ReceivedYear'] = df['ReceivedDate'].dt.year
grouped_monthly = df.groupby(['ReceivedYear', 'ReceivedMonth', 'Vendor']).agg(
    MonthlyNormDeliveryTime=pd.NamedAgg(column='NormDeliveryTime', aggfunc='mean'),
    MonthlyPercentReceived=pd.NamedAgg(column='PercentOfQuantityReceived', aggfunc='mean'),
    MonthlyPercentKept=pd.NamedAgg(column='PercentKept', aggfunc='mean'),
    VendorId=pd.NamedAgg(column='Vendor', aggfunc='first'))



"""*Categorization* of orders into performing and non-performing along with the reason. (Promptness, Quantity, Quality)"""


cls = KMeans(n_clusters = 4)
cls_assignment = cls.fit_predict(grouped_monthly[['MonthlyPercentReceived','MonthlyNormDeliveryTime','MonthlyPercentKept']])
grouped_monthly['label'] = cls_assignment


grouped = grouped_monthly.groupby("VendorId")
vendor_output = pd.DataFrame(columns=['Vendor_ID', 'Performance', 'Performance_Percent','UnderPerformance (Quality)', 'UnderPerformance (Quantity)', 'UnderPerformance (Promptness)'])
for name, group in grouped:
    values = group['label'].value_counts()
Example No. 5
features = [feat for _, feat in feature_names_to_columns]

for category, entry_name in feature_names_to_columns:
    unique_count = corpus[entry_name].unique().shape[0]
    print(f"Number of unique {category} : {unique_count}")

# Remove all data with categories which are too rare (have less than a predefined frequency)
for feature in ["pos", "pattern"]:
    print(f"{feature} category frequency : ")
    freq = corpus.groupby(feature).count().morpheme / len(corpus)
    print((freq * 100).to_string())
    low_freq = freq[freq < 0.2].dropna().index
    corpus = corpus[~corpus[feature].isin(low_freq)]

num_roots_per_morpheme = corpus[["morpheme", "root"]].drop_duplicates().groupby("morpheme").agg(
    count=pd.NamedAgg(column="morpheme", aggfunc='count'))
for num_roots in num_roots_per_morpheme["count"].unique():
    unique_count = len(num_roots_per_morpheme[num_roots_per_morpheme["count"] == num_roots])
    percent = int(unique_count / len(corpus["morpheme"].unique()) * 100)
    print(f"Number of morphemes with {num_roots} distinct roots : {unique_count}  ({percent}%)")

special_radicals_map = {
    1: ("i", "w", "n"),
    2: ("i", "w", "h"),
    3: ("h", "i")
}
special_roots_df_lst = []
for rad_i, special_vals in special_radicals_map.items():
    for ch in special_vals:
        ri_special = corpus[corpus.root.str[rad_i - 1] == ch]
        special_roots_df_lst.append(ri_special)
Example No. 6
    def get_window_features(self, campdate_df):
        """
        Compute metadata for each ANM in each time window.

        Parameters
        ----------
        campdate_df : dataframe
            preprocessed dataframe

        """

        grouped = campdate_df.groupby(['sub_center_id', 'camp_id'])
        bp = grouped.agg(
            Tot_num_patients=pd.NamedAgg(column='ANC_Mother Id',
                                         aggfunc=lambda x: len(list(x))),
            Num_camps=pd.NamedAgg(column='cluster_date',
                                  aggfunc=lambda x: len(set(x))),
            dates=pd.NamedAgg(column='cluster_date',
                              aggfunc=lambda x: list(set(x)))).reset_index()

        anmgrouped = bp.groupby(['sub_center_id'])
        bpanm = anmgrouped.agg(
            Tot_num_patients=pd.NamedAgg(column='Tot_num_patients',
                                         aggfunc=sum),
            Num_camps=pd.NamedAgg(column='Num_camps', aggfunc=sum),
            Num_locations=pd.NamedAgg(column='camp_id',
                                      aggfunc=lambda x: len(list(x))),
            dates=pd.NamedAgg(column='dates',
                              aggfunc=lambda x: list(x))).reset_index()
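        # Average gap in days between consecutive camp dates for each ANM,
        # defaulting to 28 when an ANM has fewer than two camps.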

        dates_diff = np.array([])
        for i in range(len(bpanm)):
            num_camps = bpanm.loc[i, 'Num_camps']
            if num_camps < 2:
                dates_diff = np.append(dates_diff, 28)
            else:
                anm_dates = bpanm.loc[i, 'dates']
                flat_list = [item for sublist in anm_dates for item in sublist]
                flat_list.sort()
                anm_diff = np.array([])
                for j in range(1, len(flat_list)):
                    diff = (flat_list[j] - flat_list[j - 1]).days
                    anm_diff = np.append(anm_diff, diff)
                dates_diff = np.append(dates_diff, np.mean(anm_diff))

        dates_diff = pd.Series(dates_diff)
        bpanm = bpanm.assign(dates_diff=dates_diff.values)

        # merge with dummy dataframe
        df_merge = pd.merge(self.allANMdf,
                            bpanm,
                            on='sub_center_id',
                            how='left')

        t = df_merge['Tot_num_patients'].isna().sum()
        df_merge['Tot_num_patients'].fillna(0, inplace=True)
        df_merge['Num_camps'].fillna(0, inplace=True)
        df_merge['dates_diff'].fillna(0, inplace=True)
        df_merge['Num_locations'].fillna(0, inplace=True)

        window_features = np.asarray([
            np.asarray(df_merge['Tot_num_patients']),
            np.asarray(df_merge['Num_camps']),
            np.asarray(df_merge['dates_diff']),
            np.asarray(df_merge['Num_locations'])
        ])

        return np.transpose(window_features), t
Example No. 7
def sub_matrix(bcf_stats, header=_sub_header, report=None):
    """Create a report section with a base substitution matrix.

    :param bcf_stats: one or more outputs from `bcftools stats`.
    :param header: a markdown formatted header.
    :param report: an HTMLSection instance.

    :returns: an HTMLSection instance, if `report` was provided the given
        instance is modified and returned.
    """
    report = _maybe_new_report(report)
    report.markdown(header)

    sim_sub = {
        'G>A': 'C>T',
        'G>C': 'C>G',
        'G>T': 'C>A',
        'T>A': 'A>T',
        'T>C': 'A>G',
        'T>G': 'A>C'
    }

    def canon_sub(sub):
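        # e.g. canon_sub('G>A') -> ('C', 'T'), while canon_sub('A>G') -> ('A', 'G')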
        b1 = sub[0]
        if b1 not in {'A', 'C'}:
            return canon_sub(sim_sub[sub])
        else:
            return b1, sub[2]

    df = bcf_stats['ST']
    df['canon_sub'] = df['type'].apply(canon_sub)
    df['original'] = df['canon_sub'].apply(lambda x: x[0])
    df['substitution'] = df['canon_sub'].apply(lambda x: x[1])
    df['count'] = df['count'].astype(int)
    df = df[['original', 'substitution', 'count']] \
        .groupby(['original', 'substitution']) \
        .agg(count=pd.NamedAgg(column='count', aggfunc='sum')) \
        .reset_index()

    colors = Blues9[::-1]
    mapper = LinearColorMapper(palette=colors,
                               low=min(df['count']),
                               high=max(df['count']))
    p = figure(y_range=['C', 'A'],
               x_range=['A', 'C', 'G', 'T'],
               x_axis_location="above",
               x_axis_label='alternative base',
               y_axis_label='reference base',
               tools="save",
               toolbar_location='below',
               output_backend="webgl",
               height=225,
               width=300,
               tooltips=[('sub', '@original>@substitution'),
                         ('count', '@count')])
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.rect(source=df,
           y="original",
           x="substitution",
           width=1,
           height=1,
           fill_color={
               'field': 'count',
               'transform': mapper
           },
           line_color=None)
    report.plot(p)
    return report
def main_individualData(data_filename,
                        headers_filename,
                        output_filename,
                        ascending=True,
                        top_n=None):
    """Run main script
    
    Expected input headers are the following:
    indices: column with indices (discrete) values
    metrics: column with metric (continuous) values
    sort: columns to sort by
    scale: columns to project/transform to the range 0 to 1
    """

    # read in the headers
    data_used = pd.read_csv(headers_filename)
    index_columns = list(data_used.loc[:, "indices"].dropna())
    metric_columns = list(data_used.loc[:, "metrics"].dropna())
    sort_columns = list(data_used.loc[:, "sort"].dropna())
    scale_columns = list(data_used.loc[:, "scale"].dropna())

    headers = list(set(chain(index_columns, metric_columns, sort_columns)))

    # read in the data
    df = pd.read_csv(data_filename, usecols=headers)

    # make the empty data frame for the aggregate statistics
    metric_means = [item + "_mean" for item in metric_columns]
    metric_errors = [item + "_error" for item in metric_columns]
    metric_sort = [item + "_mean" for item in sort_columns]
    agg_stats = pd.DataFrame(columns=index_columns)

    # calculate the aggregated statistics (average and standard deviation)
    for metric in list(set(chain(metric_columns, sort_columns))):
        # scale the data
        if (metric in scale_columns):
            df[metric] = df[metric].transform(lambda x: (x - x.min()) /
                                              (x.max() - x.min()))

        # aggregate statistics
        df_tmp = df.groupby(index_columns).agg(
            metric_means=pd.NamedAgg(column=metric, aggfunc="mean"),
            metric_errors=pd.NamedAgg(column=metric, aggfunc="std"))
        df_tmp = df_tmp.rename(columns={
            "metric_means": metric + "_mean",
            "metric_errors": metric + "_error"
        })
        agg_stats = agg_stats.merge(df_tmp, on=index_columns, how="outer")

    # sort and filter
    agg_stats = agg_stats.sort_values(
        by=metric_sort, axis=0, ascending=ascending).reset_index(drop=True)
    if top_n:
        agg_stats = agg_stats.head(top_n)

    # make the plot
    fig = dicrete_plot(agg_stats,
                       index_columns,
                       metric_means,
                       metric_errors,
                       scale=1.5,
                       legend_offset=0.5,
                       capsize=5)

    # Export to svg file:
    fig.savefig(output_filename)
Example No. 9
df.head()

df.info()

df['PercentKept'] = 1-df.PercentOfQuantityReturned
df.head()

import datetime as dt

df['ReceivedMonth'] = df['ReceivedDate'].dt.month
df['ReceivedYear'] = df['ReceivedDate'].dt.year


grouped_monthly = df.groupby(['ReceivedYear', 
                              'ReceivedMonth', 
                              'Vendor']).agg(MonthlyNormDeliveryTime = pd.NamedAgg(column = 'NormDeliveryTime', 
                                                                                   aggfunc='mean'), 
                                             MonthlyPercentReceived = pd.NamedAgg(column = 'PercentOfQuantityReceived', 
                                                                                   aggfunc='mean'), 
                                             MonthlyPercentKept = pd.NamedAgg(column = 'PercentKept', 
                                                                                  aggfunc='mean'), 
                                             VendorId = pd.NamedAgg(column = 'Vendor', 
                                                                                  aggfunc='first'))
grouped_monthly.head(60)

import numpy as np
from numpy import linalg as LA

def Rater(Alternatives):
  length=Alternatives.shape[0]
  matrix=np.ones((length,length))
  for i in range(length):
    pass

df_summary = df[['Geography', 'Exited', 'Balance']].groupby('Geography').agg({
    'Exited': 'sum',
    'Balance': 'mean'
})
df_summary.rename(columns={
    'Exited': '# of churned customers',
    'Balance': 'Average Balance of Customers'
},
                  inplace=True)
df_summary

# Alternative

# The NamedAgg helper allows naming the columns produced by the aggregation. The syntax is as follows:

df_summary_1 = df[['Geography', 'Exited', 'Balance']].groupby('Geography').agg(
    Number_of_churned_customers=pd.NamedAgg('Exited', 'sum'),
    Average_balance_of_customers=pd.NamedAgg('Balance', 'mean'))

# 15. Reset the index

# The index of the dataframe that groupby returns consists of the group names. We can change it by resetting the index.

df_new = df[['Geography', 'Exited',
             'Balance']].groupby(['Geography', 'Exited']).mean().reset_index()
df_new

# 16. Reset the index with a drop

# In some cases, we need to reset the index and get rid of the original index at the same time.
# Consider a case where we draw a sample from a dataframe. The sample will keep the index of the
# original dataframe, so we want to reset it, as in the sketch below.
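
# A minimal sketch of the idea above (assuming the same customer dataframe `df`):

df_sample = df.sample(n=5)
df_sample = df_sample.reset_index(drop=True)
df_sample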
Example No. 11
def sanitize_blast_data(data: pd.DataFrame,
                        queries: pd.DataFrame,
                        targets: pd.DataFrame,
                        qmult=3,
                        tmult=1):

    if data[data.btop.isna()].shape[0] > 0:
        raise ValueError(
            f"BTOP not present in the tabular file: please rerun BLAST with the correct blast_keys: {blast_keys}"
        )

    data["qseqid"] = data["qseqid"].str.replace(id_pattern, "\\1")
    data["sseqid"] = data["sseqid"].str.replace(id_pattern, "\\1")

    assert "qid" in queries.columns or "qid" == queries.index.name, queries.head(
    )
    assert "sid" in targets.columns or "sid" == targets.index.name, targets.head(
    )
    data = data.join(queries, on=["qseqid"]).join(targets, on=["sseqid"]).join(
        data.groupby(["qseqid", "sseqid"]).agg(
            min_evalue=pd.NamedAgg("evalue", np.min),
            max_bitscore=pd.NamedAgg("bitscore",
                                     np.max))[["min_evalue", "max_bitscore"]],
        on=["qseqid", "sseqid"])

    # Check the joins were successful
    if any(data.qid.isna()):
        raise KeyError(
            "The tabular file passed in and the queries in the database differ. Please rerun BLAST and "
            "Mikado serialise with the correct query file.")

    if any(data.sid.isna()):
        print(targets.head())
        print()
        print(data["sseqid"].head())
        print()
        raise KeyError(
            "The tabular file passed in and the targets in the database differ. Please rerun BLAST and "
            "Mikado serialise with the correct target file.")

    for col in ["qstart", "qend", "sstart", "send", "qlength", "slength"]:
        if col != "slength":
            err_val = (col, data[data[col].isna()].shape[0], data.shape[0])
        else:
            err_val = (col, data[["sseqid"]].head())
        if data[col].isna().any():
            raise ValueError(
                "Column {col} contains {nnan} NaN values out of {tot}. Head: {err_val}\
Please make sure you have run BLAST asking for the following fields:\
{blast_keys}".format(col=col,
                     nnan=np.where(data[col].isna())[0].shape[0],
                     tot=data.shape[0],
                     err_val=err_val,
                     blast_keys=blast_keys))
        try:
            data[col] = data[col].astype(int).values
        except ValueError as exc:
            raise ValueError("{}: {}".format(exc, col))

    for key, multiplier, (start, end), length in [
        ("query_frame", qmult, ("qstart", "qend"), "qlength"),
        ("target_frame", tmult, ("sstart", "send"), "slength")
    ]:
        # Switch start and end when they are not in the correct order
        _ix = (data[start] > data[end])
        if multiplier > 1:
            data.loc[~_ix, key] = data[start] % multiplier
            data.loc[_ix, key] = -((data[length] - data[end] - 1) % multiplier)
            data.loc[(data[key] == 0) & ~_ix, key] = multiplier
            data.loc[(data[key] == 0) & _ix, key] = -multiplier
        else:
            data.loc[:, key] = 0
        data.loc[_ix, [start, end]] = data.loc[_ix, [end, start]].values
        data[start] -= 1
    # Get the minimum evalue for each group
    # data["aln_span"] = data.qend - data.qstart
    # Set the hsp_num
    data["sstart"] = data["sstart"].astype(int).values
    data["hsp_num"] = data.sort_values("bitscore", ascending=False).groupby(
        ["qseqid", "sseqid"]).cumcount() + 1
    temp = data[["qseqid", "sseqid", "max_bitscore"
                 ]].drop_duplicates().sort_values(["max_bitscore", "sseqid"],
                                                  ascending=[False, True])
    temp["hit_num"] = temp.groupby(["qseqid"]).cumcount() + 1
    temp.set_index(["qseqid", "sseqid"], inplace=True)
    data = data.join(temp["hit_num"], on=["qseqid", "sseqid"])
    data = data.sort_values(["qid", "sid"])
    data.set_index(["qid", "sid"], drop=False, inplace=True)
    return data
Example No. 12
def colorscheme_by_site(
    colorscheme_name,
    sites_df,
    color_by,
):
    """Add a color scheme to the color registry.

    The scheme can then be added to a `nglview.widget.NGLWidget` as described
    `here <https://github.com/dwhswenson/contact_map/pull/62>`_. For instance::

        view.add_cartoon(color=colorscheme_name)


    Parameters
    ----------
    colorscheme_name : str
        Name of the color scheme.
    sites_df : pandas.DataFrame or str
        Information on how to color sites. Can either be data frame or name
        of CSV file with data frame. Must have columns named 'pdb_chain'
        and 'pdb_site' as well as the column specified by `color_by`.
    color_by : str or 2-tuple
        How to color the sites. Can either specify as a str the name of
        a column in `sites_df` that has the name of a color for each site,
        or can be the 2-tuple `(val_col, color_map)`. In this case,
        `val_col` is name of column with numerical values, and `color_map`
        is a :class:`pdb_prot_align.colorschemes.ValueToColorMap` that maps
        the numbers in this column to colors. If colors are specified as
        str and are hex, then they need to be like this '#25828e'.

    """
    site_col = 'pdb_site'
    chain_col = 'pdb_chain'

    if isinstance(sites_df, str):
        sites_df = pd.read_csv(sites_df)
    elif not isinstance(sites_df, pd.DataFrame):
        raise ValueError('`sites_df` must be data frame or name of CSV file')

    if isinstance(color_by, str):
        color_col = color_by
    elif (len(color_by) == 2 and isinstance(color_by[0], str) and isinstance(
            color_by[1], pdb_prot_align.colorschemes.ValueToColorMap)):
        val_col = color_by[0]
        if val_col not in sites_df.columns:
            raise ValueError(f"`sites_df` lacks column {val_col}")
        color_map = color_by[1]
        color_col = 'color'
        if color_col in sites_df.columns:
            raise ValueError(f"`sites_df` can not have column {color_col} "
                             'if `color_by` is a 2-tuple')
        sites_df = (sites_df.assign(
            color=lambda x: x[val_col].map(color_map.val_to_color)))
    else:
        raise ValueError('`color_by` must be a str or a 2-tuple '
                         '(val_col, color_map)')

    cols = [site_col, chain_col, color_col]
    for col in cols:
        if col not in sites_df.columns:
            raise ValueError(f"`sites_df` lacks column {col}")

    # Drop duplicate and NaN rows, which could be case if data frame
    # is tidy and has amino acid identity, and ensure site is integer.
    sites_df = (sites_df[cols].drop_duplicates().dropna().assign(
        **{site_col: lambda x: x[site_col].astype('int')}))
    # make sure just one color per site / chain
    dups = (sites_df.groupby(
        [chain_col,
         site_col]).aggregate(nrows=pd.NamedAgg(color_col, 'count')).query(
             'nrows > 1').reset_index()[[chain_col, site_col]])
    if len(dups):
        raise ValueError('non-unique colors for some sites:\n' + str(dups))

    # Do the coloring; details on selection schemes:
    # https://github.com/arose/ngl/blob/master/doc/usage/selection-language.md
    colorscheme = []
    for tup in sites_df.itertuples():
        chain = getattr(tup, chain_col)
        resi = getattr(tup, site_col)
        if isinstance(resi, float):
            if resi != int(resi):
                raise ValueError(f"non-integer residue {resi}")
            resi = int(resi)
        sel_str = f":{chain} and {resi}"
        color = getattr(tup, color_col)
        colorscheme.append([color, sel_str])

    nglview.color.ColormakerRegistry.add_scheme(colorscheme_name, colorscheme)
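
# A minimal usage sketch (the data values and scheme name are hypothetical):
# sites_df = pd.DataFrame({'pdb_chain': ['A', 'A', 'B'],
#                          'pdb_site': [1, 2, 1],
#                          'color': ['#25828e', '#440154', '#fde725']})
# colorscheme_by_site('my_scheme', sites_df, color_by='color')
# view.add_cartoon(color='my_scheme')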
Example No. 13
#sampling the data to the console
print(data.head())

#***********************************************************************************************************************************************************
#********************************************* LOGIC EXTRACT A CSV HAVING STATE, COUNTY, YEAR, VOTES_BY_PARTY, PARTY, CANDIDATE & VOTES********************************************
#***********************************************************************************************************************************************************

print(
    '***************************** STATE, COUNTY, YEAR, VOTES_BY_PARTY, PARTY, CANDIDATE & VOTES *****************************'
)

#creating a dataframe with candidatevotes aggregated across counties in states
data_cnty_sum = pd.DataFrame(
    data.groupby(['state', 'county', 'year', 'party',
                  'candidate']).agg(votes_by_party_cnty=pd.NamedAgg(
                      column='candidatevotes', aggfunc=sum))).reset_index()

print(
    '***************************** GENERATING UNIQUE STRING FOR DATA_CNTY_SUM *****************************'
)
#creating a new column called unique string. This unique string will be used to join with another dataframe
data_cnty_sum['Unique_string'] = data_cnty_sum['year'].map(str).str.strip()+\
                                 data_cnty_sum['state'].map(str).str.strip()+\
                                 data_cnty_sum['county'].map(str).str.strip()+\
                                 data_cnty_sum['votes_by_party_cnty'].map(str).str.strip()

print(
    '***************************** GENERATING UNIQUE STRING FOR DATA_CNTY_SUM *****************************'
)
print(data_cnty_sum.head(1))
#creating a dataframe with candidatevotes aggregated across counties to find out who the winner was in a given year
def data_exploration():
    st.title('Data exploration')

    st.write('''
    On this page, I present the exploration I did before starting to implement a solution.

    I strongly believe that this exploration could go a lot further with more time allocated to it.

    In particular, I did not explore friend interactions.
    ''')

    ########################################################################
    notifications = read_csv(csv_path)
    write_df('Inital dataset', notifications)

    ########################################################################
    nb_notifications = len(notifications)
    nb_users = len(notifications['user_id'].unique())
    nb_friend = len(notifications['friend_id'].unique())
    nb_users_days = len(notifications[['user_id', 'day']].drop_duplicates())
    duration = (notifications['timestamp'].max() - notifications['timestamp'].min()).days

    st.write('\n'.join([
        f'### Key numbers\n'
        f'- Sent **{nb_notifications} notifications**',
        f'- to **{nb_users} users**',
        f'- from **{nb_friend} friends**',
        f'- over **{duration} days**',
        # f'- {nb_users_days} user-day pairs',
    ]))

    st.write('''
    ### Dataset splitting


    The first thing I did with this dataset was to split it into a training and a test dataset.
    The idea is to see whether the hypotheses I formed on one part of the data still held on the other.


    In our case, a random sampling of the notifications is not efficient since they can be very correlated.
    What I did is:
    - split the users (receivers) into two groups
    - temporally split the notifications in half


    This gave me 4 datasets:
    - a training dataset, only on data from August
    - a testing dataset only on data from August, in order to measure performance on a different subset of users.
    - a testing dataset only on data from September but with training users, in order to measure the difference between the month of August and the month of September.
    - a test dataset with test users and September data, in order to combine both benefits.


    To be honest, even though I split the dataset into those 4 parts, I did not have the time to really study the differences between them. Also, I realized that a better split was possible by distinguishing groups of friends. In my splitting, a friend could have sent a notification at the same time to two different users, one in the train set and one in the test set.

    ''')
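
    # A rough sketch of the splitting described above (column names follow the dataset,
    # the cut points are assumptions for illustration):
    # train_users = notifications['user_id'].drop_duplicates().sample(frac=0.5, random_state=0)
    # mid_time = notifications['timestamp'].quantile(0.5)
    # is_train_user = notifications['user_id'].isin(train_users)
    # is_first_half = notifications['timestamp'] < mid_time
    # train_set = notifications[is_train_user & is_first_half]
    # test_sets = {
    #     'same_period_other_users': notifications[~is_train_user & is_first_half],
    #     'later_period_same_users': notifications[is_train_user & ~is_first_half],
    #     'later_period_other_users': notifications[~is_train_user & ~is_first_half],
    # }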

    st.info(f'Data loaded from {csv_path}')

    ########################################################################

    st.write('### Histogram of average nb of notification per day per user')

    count_per_user_per_day, over_notified_users, fig = histograms(notifications)
    st.write(fig)
    st.write('''
    This figure shows the average number of notifications per day per user.

    What we can get out of it is that most users are below the threshold (4 notifications per day) in most cases.

    However, a non-negligible share of the users are regularly spammed.
    ''')

    write_df('Over notified users', over_notified_users)
    st.write(f'**Cumulative number of days with an over-notified user:** {len(over_notified_users)}')

    ########################################################################

    stats = compute_stats(notifications)

    st.write('### Repartition of users')

    st.write('''
    The figure below shows a breakdown of the users based on the notifications they receive. Each point represents a group of users. The larger the dot, the more users it represents.

    On the x-axis, the number of different days on which the user received a notification.

    On the y-axis, the maximum number of notifications the user received in a single day.

    In a way, all users below the threshold (red line) don't need any help from our bundler. A good predictor of whether the user belongs to this category or not would be a really good way to ensure a minimal delay for most of our users. This option is briefly discussed in the DelayPredictor section.
    ''')

    fig = px.scatter(
        stats,
        x='nb_days_with_notifications',
        y='max_notifications_in_a_day',
        size='log_nb_users',
        color='critical',
        hover_data=['nb_users'],
    )
    fig.add_shape(
        go.layout.Shape(
            type="line",
            x0=0, y0=4.5, x1=max(stats['nb_days_with_notifications']), y1=4.5,
            line={'color': 'red', 'width': 3}
        ))
    fig.update_layout(yaxis_type="log")
    st.write(fig)

    ########################################################################

    st.write('### Per-user exploration')

    st.write('''
    Often in data science projects, it is a good idea to look closely at the data, line by line.

    In the figure below, we can explore the notifications for each user, one by one. Each point is a notification. A notification is labelled as 'critical' when at least 4 notifications have been sent the same day for that user.

    On the x-axis, the timestamp of the notification in seconds within the day.

    On the y-axis, the day the notification was received.
    ''')

    user_ids = notifications['user_id'].unique().tolist()
    user_id = st.selectbox('User id', user_ids[:100])
    user_notifications = get_user_notifications(notifications, user_id)
    fig = px.scatter(
        user_notifications,
        x='second_in_day',
        y='day',
        color='critical',
        hover_data=['second_in_day'],
    )
    st.write(fig)

    ########################################################################

    st.write('## By-day repartition')

    st.write('''
    The last intuition I wanted to confirm was that the number of notifications per day varies a lot depending on the day.

    First, some days might be sunnier than others (so more people go for a tour). Second, some days of the week are more appropriate for a little hike.

    I also wanted to explore how early in the day we could predict that the day would be a good day. On the figures we can see that around 10-12am, we already have a pretty good idea of what the total number of notifications will be that day. However, I did not have the time to implement a proper algorithm to take advantage of it.
    ''')

    for groupby_key in ['day', 'day_of_week']:
        st.write(f'### Count notifications grouped by {groupby_key}')
        kwargs = {
            'count': pd.NamedAgg('timestamp', 'count'),
            # 'day_of_week': pd.NamedAgg('day_of_week', lambda x: x.iloc[0]),
        }
        for hour in HOURS:
            kwargs[f'count_before_{hour}'] = pd.NamedAgg(f'is_before_{hour}', 'sum')
        counts_per_day = notifications.groupby(groupby_key).agg(**kwargs).reset_index()

        # write_df('counts_per_day', counts_per_day)

        fig = go.Figure()
        for hour in HOURS:
            hour_key = f'count_before_{hour}'
            fig.add_trace(
                go.Scatter(
                    x=counts_per_day[groupby_key],
                    y=counts_per_day[hour_key],
                    mode='lines',
                    name=hour_key,
                )
            )
        st.write(fig)
Example No. 15
def main(start_date, end_date, tipo_calcolo, path_anagrafica_pdr, path_anagrafica_pdr2, path_anagrafica_osservatori, path_wkr, path_output):


    #BASE PATH ON S3
    path_to_data = 's3://zus-qa-s3/'
    
    #SPLIT THE BILLING PERIOD INTO SUB-PERIODS OF AT MOST ONE MONTH; START_COUNT AND END_COUNT ARE THE NAMES OF THE DATES THAT DELIMIT THE INTERVALS
    date_format='%Y%m%d'
    start_d=datetime.strptime(start_date,date_format)
    end_d=datetime.strptime(end_date,date_format)
    N_NUM=end_d.month + (12 - start_d.month + 1) + (end_d.year - start_d.year - 1) * 12
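    #N_NUM IS THE NUMBER OF CALENDAR MONTHS SPANNED BY [START_D, END_D] (E.G. FEB 2021 THROUGH MAY 2021 -> 4)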
    START_COUNT,END_COUNT=comp(start_d,end_d,N_NUM)
    year = start_date[:4]
    
    #READ PROCESSED PROFILES
    df_profili = read_profili(path_to_data +'preprocessato/sistema/coefficienti/external/' + year + '/profili_elaborati.csv', start_date, end_date)
    print('read from ' + path_to_data +'preprocessato/sistema/coefficienti/external/' + year + '/profili_elaborati.csv')

    #READ WKR
    df_wkr = read_wkr(start_date, end_date, tipo_calcolo, path_to_data + path_wkr)
    print('read from ' + path_to_data + path_wkr)

    #READ PDR REGISTRY 1
    df_pdr= read_pdr(path_to_data + path_anagrafica_pdr)
    print('read from ' + path_to_data + path_anagrafica_pdr)
    
    #READ PDR REGISTRY 2
    if path_anagrafica_pdr2 and (tipo_calcolo == 'cons'):
        df_pdr2 = read_pdr(path_to_data+path_anagrafica_pdr2)
        print('read from ' + path_to_data+path_anagrafica_pdr2)

    #READ OSSERVATORI REGISTRY
    df_anagrafica_osservatori = read_osservatori(path_to_data + path_anagrafica_osservatori)
    print('read from ' + path_to_data + path_anagrafica_osservatori)
    
    columns=['START_DATE', 'END_DATE', 'TIPO_CALCOLO', 'CONSUMO_ANNUO_ANOMALIE', 'PATH_ANAGRAFICA_PDR', 'PATH_ANAGRAFICA_OSSERVATORI', 'PATH_WKR']
    df_metadata = pd.DataFrame([[start_date, end_date, tipo_calcolo, 0.0, path_anagrafica_pdr, path_anagrafica_osservatori, path_wkr]], columns=columns)

    for i, j in zip(START_COUNT, END_COUNT):
        print(i, j)
        df_coef_month = df_profili.loc[(df_profili['DATE'] >= i) & (df_profili['DATE'] <= j)]
        anno_mese = df_coef_month['ANNO_MESE'].unique()[0].replace("-", "").replace("/","")
        print('coef filtered per month')
        df_pdr_month = df_pdr.loc[(df_pdr['DATA_FINE'] >= i) & (df_pdr['DATA_INIZIO'] <= j)]
        if df_pdr_month.empty:
            df_pdr_month = df_pdr2.loc[(df_pdr2['DATA_FINE'] >= i) & (df_pdr2['DATA_INIZIO'] <= j)]
            print('using anagrafica pdr2 for ', i, j)
        
        df_pdr_month_ee = df_pdr_month.loc[df_pdr_month['SOCIETA'] == 'edison_energia']
        df_pdr_month_sg = df_pdr_month.loc[df_pdr_month['SOCIETA'] == 'societa_gruppo']
        df_pdr_month_gr = df_pdr_month.loc[df_pdr_month['SOCIETA'] == 'grossisti']
        
        df_pp_pdr_aggr_month_ee, df_pp_pdr_aggr_station_tipo_tratt_month_ee, df_pp_pdr_aggr_station_societa_profilo_tratt_month_ee, df_pp_pdr_checks_ee = mergeDati(df_coef_month, df_pdr_month_ee, df_anagrafica_osservatori, df_wkr, anno_mese, 'ee', path_to_data + path_output)
        print('computed edison energia')
        df_pp_pdr_aggr_month_sg, df_pp_pdr_aggr_station_tipo_tratt_month_sg, df_pp_pdr_aggr_station_societa_profilo_tratt_month_sg, df_pp_pdr_checks_sg = mergeDati(df_coef_month, df_pdr_month_sg, df_anagrafica_osservatori, df_wkr, anno_mese, 'sg', path_to_data + path_output)
        print('computed societa gruppo')
        df_pp_pdr_aggr_month_gr, df_pp_pdr_aggr_station_tipo_tratt_month_gr, df_pp_pdr_aggr_station_societa_profilo_tratt_month_gr, df_pp_pdr_checks_gr = mergeDati(df_coef_month, df_pdr_month_gr, df_anagrafica_osservatori, df_wkr, anno_mese, 'gr', path_to_data + path_output)
        print('computed grossisti')
        df_pp_pdr_aggr = pd.concat([df_pp_pdr_aggr_month_ee, df_pp_pdr_aggr_month_sg, df_pp_pdr_aggr_month_gr])
        df_pp_pdr_aggr_station_tipo_tratt = pd.concat([df_pp_pdr_aggr_station_tipo_tratt_month_ee, df_pp_pdr_aggr_station_tipo_tratt_month_sg, df_pp_pdr_aggr_station_tipo_tratt_month_gr])
        df_pp_pdr_aggr_station_societa_profilo_tratt = pd.concat([df_pp_pdr_aggr_station_societa_profilo_tratt_month_ee, df_pp_pdr_aggr_station_societa_profilo_tratt_month_sg, df_pp_pdr_aggr_station_societa_profilo_tratt_month_gr])
        df_pp_pdr_checks = pd.concat([df_pp_pdr_checks_ee, df_pp_pdr_checks_sg, df_pp_pdr_checks_gr])
        print('computation ended')
        df_pp_pdr_aggr.to_csv(path_to_data + path_output + anno_mese + "/" + 'aggregato_societa_tipo_tratt.csv')
        print('aggregato_grafico written')
        df_pp_pdr_aggr_station_tipo_tratt.to_csv(path_to_data + path_output  + anno_mese + "/" +  'aggregato_station_tipo_tratt.csv')
        print('aggregato_station_tipo_tratt written')
        df_pp_pdr_aggr_station_societa_profilo_tratt.to_csv(path_to_data + path_output  + anno_mese + "/" +  'aggregato_station_societa_profilo_tratt.csv')
        print('aggregato_station_societa_profilo_tratt written')
        df_pp_pdr_checks.to_csv(path_to_data + path_output  + anno_mese + "/" +  'anomalie_dettaglio.csv')
        df_pp_pdr_checks['TOT_CONSUMO_ANNUO'] = df_pp_pdr_checks.groupby(['PDR']).agg(TOT_CONSUMO_ANNUO=pd.NamedAgg(column='CONSUMO_ANNUO', aggfunc='mean')).reset_index()['TOT_CONSUMO_ANNUO'].sum()
        df_pp_pdr_checks['TOT_PDR'] = len(df_pp_pdr_checks['PDR'].unique())
        #df_pp_pdr_kpi_checks = df_pp_pdr_checks[['TOT_PDR', 'TOT_CONSUMO_ANNUO']].drop_duplicates()
        df_metadata['CONSUMO_ANNUO_ANOMALIE'] = df_metadata['CONSUMO_ANNUO_ANOMALIE'] + df_pp_pdr_checks['TOT_CONSUMO_ANNUO'].sum()
        #df_pp_pdr_kpi_checks.to_csv(path_to_data + path_output  + anno_mese + "/" +  'anomalie_aggregato.csv')
        print('dettaglio anomalie written')
        
    df_metadata.to_csv(path_to_data + path_output  + anno_mese + "/" +  'metadati.csv')
    print('all months have been computed')
    return (path_to_data + path_output)
Example No. 16
    columns=columns,
)
df_energy_hh.reset_index(inplace=True)
df_energy_hh.rename(columns={'index': 'interval'}, inplace=True)
df_energy_hh['interval'] = (df_energy_hh['interval'] % 48) + 1
df_energy_hh.set_index('interval_date', inplace=True)

for nc in numeric_columns:
    df_energy_hh[nc] = pd.to_numeric(df_energy_hh[nc])

df_energy_hh['gross_usage_kwh'] = df_energy_hh['meter_consumption_kwh'] + \
    (df_energy_hh['solar_generation_kwh'] - df_energy_hh['meter_generation_kwh'] -
     df_energy_hh['charge_quantity_kwh']) + df_energy_hh['discharge_quantity_kwh']
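# gross_usage_kwh above presumably combines grid import, solar energy used on site
# (generation minus export minus battery charging) and battery discharge; the column
# semantics here are assumptions based on their names.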

df_energy_daily = df_energy_hh.groupby(['interval_date']).agg(
    meter_consumption_kwh=pd.NamedAgg(column='meter_consumption_kwh',
                                      aggfunc='mean'),
    meter_generation_kwh=pd.NamedAgg(column='meter_generation_kwh',
                                     aggfunc='mean'),
    solar_generation_kwh=pd.NamedAgg(column='solar_generation_kwh',
                                     aggfunc='mean'),
    solar_mean_powr_kw=pd.NamedAgg(column='solar_mean_powr_kw',
                                   aggfunc='mean'),
    solar_devices_reporting=pd.NamedAgg(column='solar_devices_reporting',
                                        aggfunc='median'),
    capacity_kw=pd.NamedAgg(column='capacity_kw', aggfunc='mean'),
    charge_quantity_kwh=pd.NamedAgg(column='charge_quantity_kwh',
                                    aggfunc='mean'),
    discharge_quantity_kwh=pd.NamedAgg(column='discharge_quantity_kwh',
                                       aggfunc='mean'),
    deterioration_state_pct=pd.NamedAgg(column='deterioration_state_pct',
                                        aggfunc='mean'),
Example No. 17
    else:
        return pd.Series([i for i in range(0, len(df[x]))])


def linear_regression(df: pd.DataFrame, x: str, y: str) -> None:
    fixed_x = transform_variable(df, x)
    model = sm.OLS(df[y], sm.add_constant(fixed_x)).fit()
    print(model.summary())

    coef = pd.read_html(model.summary().tables[1].as_html(),
                        header=0,
                        index_col=0)[0]['coef']
    df.plot(x=x, y=y, kind='scatter')
    plt.plot(df[x], [df[y].mean() for _ in fixed_x.items()],
             color='green')
    plt.plot(df[x],
             [coef.values[1] * x + coef.values[0] for _, x in fixed_x.items()],
             color='red')
    plt.xticks(rotation=90)
    plt.savefig(f'img/lr_{y}_{x}.png')
    plt.close()


df = pd.read_csv("csv/typed_uanl.csv")  # type: pd.DataFrame
#print_tabulate(df.head(50))
df_by_sal = df.groupby("Fecha")\
              .aggregate(sueldo_mensual=pd.NamedAgg(column="Sueldo Neto", aggfunc=pd.DataFrame.mean))
# df_by_sal["sueldo_mensual"] = df_by_sal["sueldo_mensual"]**10
print_tabulate(df_by_sal.head(5))
linear_regression(df_by_sal, "Fecha", "sueldo_mensual")
Example No. 18
months_2017 = list(range(201701, 201713))
months_2018 = list(range(201801, 201813))
months_2019 = list(range(201901, 201913))
work_months = months_2017 + months_2018 + months_2019
work_months = list(map(str, work_months))

# list of customer IDs
customers = df.customer_id.unique().tolist()

# create a DF of (customer_id, month) pairs
mc = list(itertools.product(customers, work_months))
mc_df = pd.DataFrame(mc, columns=['customer_id', 'date'])

# aggregate the transactions
transactions = df.groupby(by=['customer_id', 'date']).agg(
    nb_transaction=pd.NamedAgg(column="product_id",
                               aggfunc="count")).reset_index()

# full years data (months without any transaction are included)
full_transactions_df = transactions.merge(mc_df,
                                          on=['customer_id', 'date'],
                                          how='right').fillna(0)

# create the Data Set
base_date = parser.parse("2018-12-16T22:39:59.247Z")
data_set_list = []
for i in range(1, 13):

    # built sliding period
    cursor_date = base_date + relativedelta(months=i - 1)
    year_before = cursor_date + relativedelta(months=-11)
    tree_months_after = cursor_date + relativedelta(months=3)
Example No. 19
def modeling_n_prediction(df, device, position):
    # Filter the table to keywords from the given device with position less than or equal to the cutoff
    data = df[(df['device'] == device) & (df['position'] <= position)].copy()
    data['rank'] = data['position'].astype(int)

    # Order the tables
    data = data.sort_values(["keyword", "date", "rank"],
                            ascending=(True, True, True))

    # Group keywords, date and rank and calculate sum of clicks and impressions
    grouped_data = data.groupby(['keyword', 'date', 'rank']).agg(
        all_clicks=pd.NamedAgg(column='clicks', aggfunc=sum),
        all_impressions=pd.NamedAgg(column='impressions', aggfunc=sum))
    grouped_data = grouped_data.reset_index()
    grouped_data = grouped_data.sort_values(["keyword", "date", "rank"],
                                            ascending=(True, True, True))
    # Get the list of unique keywords in google search console data
    mobile_keywords = grouped_data['keyword'].unique()

    grouped_data['keyword'] = grouped_data['keyword'].astype(str)

    key_date_df_list = {}
    count = 1
    for key in list(keyword_master['keywords'].unique()):
        key_df_list = {}
        print(count)
        print('Processing for keyword: ', key)
        print()

        distance = [
            dis.get_jaro_distance(key, word) for word in mobile_keywords
        ]
        distance = np.array(distance)
        cluster = np.where(distance <= 0.3)
        total_count = len(mobile_keywords[cluster]) - 1

        words = '|'.join(mobile_keywords.tolist())
        key_df = pd.DataFrame(
            columns=['keyword', 'date', 'rank', 'clicks', 'impressions'])

        dt_list = list(grouped_data['date'].drop_duplicates().astype(str))
        dt_list.sort()

        apply1(dt_list, key_df_list, grouped_data, mobile_keywords, cluster,
               key)
        temp_df = pd.DataFrame(
            columns=['keyword', 'date', 'rank', 'clicks', 'impressions'])
        for k, val in key_df_list.items():
            temp_df = pd.concat([temp_df, val], ignore_index=True)
        key_date_df_list[key] = temp_df
        count = count + 1
    t_df = pd.DataFrame(
        columns=['keyword', 'date', 'rank', 'clicks', 'impressions'])
    for k, val in key_date_df_list.items():
        t_df = pd.concat([t_df, val], ignore_index=True)
    all_ranks_df = t_df

    if (device == 'MOBILE'):
        ctrs = ctr_df[['position', 'mobile_ctr']]
    else:
        ctrs = ctr_df[['position', 'web_ctr']]

    all_ranks_df['rank'] = all_ranks_df['rank'].astype(int)
    all_ranks_df['impressions'] = all_ranks_df['impressions'].astype(float)
    all_ranks_df = pd.merge(all_ranks_df,
                            ctrs,
                            left_on="rank",
                            right_on="position")

    # Calculate the max and avg impressions for the keyword for each date
    temp_all_ranks_df = all_ranks_df.groupby(['keyword', 'date']).agg(
        avg_impressions=pd.NamedAgg(column='impressions', aggfunc=round_mean),
        max_impressions=pd.NamedAgg(column='impressions', aggfunc=round_max))
    temp_all_ranks_df = temp_all_ranks_df.reset_index()
    all_ranks_df = pd.merge(all_ranks_df,
                            temp_all_ranks_df,
                            on=['keyword', 'date'])

    # Replace NA values with avg impressions
    all_ranks_df['impressions'] = all_ranks_df['impressions'].fillna(
        all_ranks_df['avg_impressions'])
    all_ranks_df = all_ranks_df.sort_values(["keyword", "date", "rank"],
                                            ascending=(True, True, True))
    #df['First Season'] = np.where(df['First Season'] > 1990, 1, df['First Season'])
    all_ranks_df['impressions'] = np.where(
        all_ranks_df['impressions'] <= all_ranks_df['avg_impressions'],
        all_ranks_df['max_impressions'], all_ranks_df['impressions'])

    if (device == 'MOBILE'):
        all_ranks_df['clicks'] = (all_ranks_df['mobile_ctr'] *
                                  all_ranks_df['impressions']) / 100
    else:
        all_ranks_df['clicks'] = (all_ranks_df['web_ctr'] *
                                  all_ranks_df['impressions']) / 100
    all_ranks_df.clicks = all_ranks_df.clicks.round()
    all_ranks_df['clicks'] = all_ranks_df['clicks'].astype(int)

    if (device == 'MOBILE'):
        all_ranks_df['mobile_ctr'] = None
    else:
        all_ranks_df['web_ctr'] = None

    all_ranks_df['avg_impressions'] = None
    all_ranks_df['max_impressions'] = None

    all_ranks_df['keyword'] = all_ranks_df['keyword'].astype(str)
    all_ranks_df['impressions'] = all_ranks_df['impressions'].astype(int)
    all_ranks_df['date'] = all_ranks_df['date'].astype(str)

    casted_df = all_ranks_df.pivot_table(index=['keyword', 'date'],
                                         columns='rank',
                                         values=['clicks', 'impressions'])
    casted_df.columns = [
        "{0}_{1}".format(l1, l2) for l1, l2 in casted_df.columns
    ]
    casted_df = casted_df.reset_index()
    casted_df['keyword'] = casted_df['keyword'].astype('category')

    key_pred_list = {}

    for key in list(keyword_master['keywords'].unique()):
        print('Forecasting for keyword - ', key)
        print()

        pred_pos_list = {}

        for position in range(1, 11):
            print('Position - ', position)
            print()

            key_sub = casted_df[casted_df['keyword'] == key]
            key_sub['date'] = pd.to_datetime(key_sub['date'])
            clicks_trend = key_sub[['clicks_' + str(position), 'date']]
            clicks_trend.columns = ["y", "ds"]

            prediction_days = 14
            pred_len = 0
            totalRow = len(clicks_trend)
            pred_range = [totalRow - pred_len + 1, totalRow]
            pre_views = clicks_trend.head(totalRow - pred_len)
            post_views = clicks_trend.tail(pred_len)

            m = fbprophet.Prophet()
            m.fit(pre_views)
            future = m.make_future_dataframe(periods=prediction_days)
            fcast = m.predict(future)

            pred_df = fcast[['ds', 'yhat', 'yhat_lower',
                             'yhat_upper']].tail(prediction_days).copy()
            pred_df['position'] = position
            pred_df['keyword'] = key
            pred_df.columns = [
                "date", "clicks", "clicks_lower", "clicks_upper", "position",
                "keyword"
            ]
            pred_df = pred_df[[
                "keyword", "date", "position", "clicks", "clicks_lower",
                "clicks_upper"
            ]]
            pred_df.clicks_upper = pred_df.clicks_upper.round()
            pred_df.clicks_lower = pred_df.clicks_lower.round()
            #fig1 = m.plot(fcast)

            pred_pos_list[position] = pred_df
        t1_df = pd.DataFrame(columns=[
            "keyword", "date", "position", "clicks", "clicks_lower",
            "clicks_upper"
        ])
        for k, val in pred_pos_list.items():
            t1_df = pd.concat([t1_df, val], ignore_index=True)
        key_pred_list[key] = t1_df

    print('\n')
    t2_df = pd.DataFrame(columns=[
        "keyword", "date", "position", "clicks", "clicks_lower", "clicks_upper"
    ])
    for k, val in key_pred_list.items():
        t2_df = pd.concat([t2_df, val], ignore_index=True)
    pred_key_df = t2_df
    casted_pred_df = pred_key_df.pivot_table(
        index=['keyword', 'date'],
        columns='position',
        values=['clicks', 'clicks_lower', 'clicks_upper'])
    casted_pred_df.columns = [
        "{0}_{1}".format(l1, l2) for l1, l2 in casted_pred_df.columns
    ]
    casted_pred_df = casted_pred_df.reset_index()

    casted_pred_df = pd.merge(keywords_df,
                              casted_pred_df,
                              left_on="keywords",
                              right_on="keyword")

    #print(casted_pred_df['date'])
    casted_pred_df['date'] = casted_pred_df['date'].astype(str)
    casted_pred_df = casted_pred_df.astype(int, errors='ignore')
    #casted_pred_df['date'] = casted_pred_df['date'].astype(str)
    # clip negative forecasts to zero without relying on the private
    # _get_numeric_data() API
    num_cols = casted_pred_df.select_dtypes(include='number').columns
    casted_pred_df[num_cols] = casted_pred_df[num_cols].clip(lower=0)
    #print(casted_pred_df['date'])
    casted_pred_df.to_json(r'FinalResults_UK_' + device + '.json',
                           orient='records')

    return list([casted_df, casted_pred_df])
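# A minimal, self-contained sketch (not part of the original script) of the
# NamedAgg pattern used above: per-keyword/date average and max impressions,
# merged back onto the row-level frame. Column names are illustrative only.
import pandas as pd

toy = pd.DataFrame({
    'keyword': ['shoes', 'shoes', 'bags'],
    'date': ['2021-01-01', '2021-01-01', '2021-01-01'],
    'impressions': [100.0, 300.0, 50.0],
})
stats = toy.groupby(['keyword', 'date']).agg(
    avg_impressions=pd.NamedAgg(column='impressions', aggfunc='mean'),
    max_impressions=pd.NamedAgg(column='impressions', aggfunc='max'),
).reset_index()
toy = toy.merge(stats, on=['keyword', 'date'])
print(toy)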
Ejemplo n.º 20
0
#%%
import pandas as pd

#%%
data = pd.read_csv("data/projeto4_telecom_treino.csv")

#%%
data.columns

#%%
feature = "account_length"

churn_by_feature = data.groupby(
    [feature, "churn"]).agg(count=pd.NamedAgg(column="churn", aggfunc="count"))

rows_present = [row[0] for row in churn_by_feature.iterrows()]

rows_absent = [(value, churn_) for value in data[feature].unique()
               for churn_ in ["no", "yes"]
               if (value, churn_) not in rows_present]

rows_absent = pd.DataFrame(
    rows_absent,
    columns=[feature, "churn"],
)

rows_absent["count"] = 0

churn_by_feature = churn_by_feature.reset_index()
churn_by_feature = pd.concat([churn_by_feature, rows_absent], ignore_index=True)
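# A hypothetical alternative to the manual zero-filling above: reindex the
# grouped counts on the full cartesian product of feature values and churn
# labels. Assumes the same `data`, `feature` and "no"/"yes" labels as the
# snippet; this is a sketch, not the original author's approach.
full_index = pd.MultiIndex.from_product(
    [data[feature].unique(), ["no", "yes"]], names=[feature, "churn"])
churn_by_feature_alt = (
    data.groupby([feature, "churn"])
        .agg(count=pd.NamedAgg(column="churn", aggfunc="count"))
        .reindex(full_index, fill_value=0)
        .reset_index()
)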
Ejemplo n.º 21
0
    'cum_confirmed_cases', 'cum_deaths', 'cum_recoveries'
]]

# for big cities, adjust adm level
mask = df['adm1_name'].isin(['Shanghai', 'Beijing', 'Tianjin', 'Chongqing'])
df.loc[mask, 'adm3_name'] = df.loc[mask, 'adm2_name'].tolist()
df.loc[mask, 'adm2_name'] = df.loc[mask, 'adm1_name'].tolist()

# drop cases unassigned to cities
df = df.loc[df['notes'] != 'prison', :]
df = df.loc[~df['adm2_name'].isin(
    ['International Imported Cases', 'Domestic Imported Cases', 'Unknown']), :]

# aggregate to city level
df = df.groupby(['adm1_name', 'adm2_name', 'date']).agg(
    cum_confirmed_cases=pd.NamedAgg(column='cum_confirmed_cases',
                                    aggfunc=np.nansum),
    cum_deaths=pd.NamedAgg(column='cum_deaths', aggfunc=np.nansum),
    cum_recoveries=pd.NamedAgg(column='cum_recoveries', aggfunc=np.nansum),
).reset_index()

# fill adm0_name variable
df.loc[:, 'adm0_name'] = 'CHN'

## Merge with pre 01/24 data, create balanced panel

# merge with pre 1/24 data
df = pd.concat([df, df_jan_merged], sort=False)

# create a balanced panel
adm = df.loc[:, ['adm0_name', 'adm1_name', 'adm2_name']].drop_duplicates()
days = pd.date_range(start='20200110', end=end_date)
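# Sketch (an assumption, not part of the original file) of how the balanced
# panel is typically completed from `adm` and `days`: a cartesian product of
# administrative units and dates, onto which the case counts can be merged.
panel = (
    adm.assign(key=1)
       .merge(pd.DataFrame({'date': days, 'key': 1}), on='key')
       .drop(columns='key')
)
# df = panel.merge(df, on=['adm0_name', 'adm1_name', 'adm2_name', 'date'], how='left')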
Ejemplo n.º 22
0
import dash
from dash import dcc, html
import pandas as pd
import plotly.express as px

app = dash.Dash(__name__)
server = app.server

df = pd.read_csv("netflix_titles.csv")
df.drop_duplicates(inplace=True)

pie_fig = px.pie(data_frame=df,
                 names='type',
                 hole=0.8,
                 title='TV Show vs. Movie')

bar_fig = px.bar(data_frame=df.groupby(
    ["type"],
    as_index=False).agg(count=pd.NamedAgg(column="type", aggfunc="count")),
                 x='type',
                 y='count',
                 color='type',
                 title='TV Show vs. Movie')

app.layout = html.Div(children=[
    html.H1(children='Visualizing Netflix Data With Python'),
    html.Div(children='''
        Using Pandas, Plotly Express, and Dash.
    '''),
    html.Div([
        dcc.Graph(id='graph1', figure=pie_fig),
    ]),
    html.Div([
        dcc.Graph(id='graph2', figure=bar_fig),
Ejemplo n.º 23
0
mtcarsDF.groupby('gear').size()
mtcarsDF.groupby(['gear','cyl']).size()
mtcarsDF.groupby(['gear','cyl']).count() #size better


mtcarsDF.groupby('gear').mpg.agg('mean')
mtcarsDF.groupby('gear')['mpg'].agg('mean')
mtcarsDF.groupby('gear')[['mpg','wt']].agg('mean')
mtcarsDF.groupby('gear')[['mpg','wt']].agg(['mean','max'])
mtcarsDF.groupby('gear').agg([np.mean, np.sum])  #applies to all numeric columns
mtcarsDF.groupby('gear')[['mpg','wt']].agg([np.mean, np.sum, 'count'])
mtcarsDF.groupby('gear')['mpg'].agg([np.mean, np.sum, 'count']).rename(columns={'mean': 'meanMPG'})



mtcarsDF.groupby('gear').agg(meanMPG = pd.NamedAgg(column='mpg', aggfunc='mean'))
mtcarsDF.groupby(['gear','am']).agg(maxMPG = pd.NamedAgg(column='mpg', aggfunc='max'))
mtcarsDF.groupby('gear').agg(meanMPG = pd.NamedAgg(column='mpg', aggfunc='mean'), maxWT = pd.NamedAgg(column='wt', aggfunc='max'))
mtcarsDF['gear'].count()
mtcarsDF['gear'].max()

mtcarsDF.groupby('gear').mean()
mtcarsDF.groupby('gear').mean().add_prefix('MEAN_')

gearGp = mtcarsDF.groupby('gear')
gearGp.mean()
gearGp.nth(1)
gearGp.nth([1,3])
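# mtcarsDF is not defined in this snippet; a minimal runnable stand-in to try
# the NamedAgg lines above might look like this (values are illustrative, not
# the real mtcars data).
import pandas as pd

mtcarsDF = pd.DataFrame({
    'gear': [3, 3, 4, 4, 5],
    'am':   [0, 0, 1, 1, 1],
    'cyl':  [8, 6, 4, 4, 6],
    'mpg':  [15.0, 18.1, 24.4, 30.4, 19.7],
    'wt':   [3.4, 3.2, 3.2, 1.5, 2.8],
})
print(mtcarsDF.groupby('gear').agg(
    meanMPG=pd.NamedAgg(column='mpg', aggfunc='mean'),
    maxWT=pd.NamedAgg(column='wt', aggfunc='max')))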


#crosstab
Ejemplo n.º 24
0
def orderABC(folder_path, file_list, save_path, period, rate=None):
    if rate is None:
        rate = [0.8, 0.95, 1]
    class_type = ['A', 'B', 'C']

    df_list = []
    for i in range(len(file_list)):
        t = pd.read_csv('{}/{}'.format(folder_path, file_list[i]),
                        encoding='gbk')
        df_list.append(t)

    df = pd.concat(df_list)
    df['CATE'] = 'FTW'
    df.loc[(df['CATEGORY'] != 'FTW'), ['CATE']] = 'APP'
    df['Period'] = period

    print('all data size: ', df.shape)

    sku = df.groupby('SKU').agg(line=pd.NamedAgg(column='DATEOUT',
                                                 aggfunc='count'),
                                qty=pd.NamedAgg(column='QTY',
                                                aggfunc='sum')).reset_index()
    sku_cate = df[['SKU', 'Period', 'CATE']].drop_duplicates()

    sku = pd.merge(sku_cate, sku, on='SKU', how='left')

    sku_FTW = sku.loc[sku['CATE'] == 'FTW'].copy()
    sku_APP = sku.loc[sku['CATE'] == 'APP'].copy()

    save_file_name = 'skuABC_{}.xlsx'.format(period)
    write = pd.ExcelWriter('{}/{}'.format(save_path, save_file_name))

    skuABC = class_ABC(write, sku, rate, class_type)
    skuABC_FTW = class_ABC(write, sku_FTW, rate, class_type)
    skuABC_APP = class_ABC(write, sku_APP, rate, class_type)

    # closing the writer also saves the workbook
    write.close()

    ## order-level ABC combination
    skuABC_cate = pd.concat([skuABC_FTW[['SKU', 'lineABC', 'qtyABC']],
                             skuABC_APP[['SKU', 'lineABC', 'qtyABC']]])

    df = pd.merge(df, skuABC_cate, on='SKU', how='left')
    df_temp = df[['Period', 'DATEOUT', 'DOCNO', 'DATA_TYPE',
                  'CHANNEL']].drop_duplicates()

    ## per order, count how many SKUs fall into each line-count ABC class
    df_order_lineABC = pd.pivot_table(df,
                                      index=['DOCNO'],
                                      columns='lineABC',
                                      values='SKU',
                                      aggfunc='count',
                                      fill_value=0).reset_index()
    print(df_order_lineABC.columns)
    cols = list(df_order_lineABC.columns[1:])
    print(cols)
    x = np.where(df_order_lineABC[cols], cols, '')
    df_order_lineABC['orderLineABC'] = pd.Series(''.join(i) for i in x)

    ## per order, count how many SKUs fall into each quantity ABC class
    df_order_qtyABC = pd.pivot_table(df,
                                     index=['DOCNO'],
                                     columns='qtyABC',
                                     values='SKU',
                                     aggfunc='count',
                                     fill_value=0).reset_index()
    print(df_order_qtyABC.columns)
    cols = list(df_order_qtyABC.columns[1:])
    print(cols)
    y = np.where(df_order_qtyABC[cols], cols, '')
    df_order_qtyABC['orderQtyABC'] = pd.Series(''.join(i) for i in y)

    # print(df_order_lineABC.head(20))
    # print(df_order_qtyABC.head(20))

    df_order = pd.merge(df_order_lineABC[['DOCNO', 'orderLineABC']],
                        df_order_qtyABC[['DOCNO', 'orderQtyABC']],
                        on='DOCNO',
                        how='left')

    df_result = pd.merge(df_temp, df_order, on='DOCNO', how='left')
    # print(df_result.head(20))

    save_file_name = 'orderABC_{}.csv'.format(period)
    df_result.to_csv('{}/{}'.format(save_path, save_file_name),
                     encoding='gbk',
                     index=False)
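# Hypothetical invocation of orderABC; the folder, file names and period label
# below are placeholders, and class_ABC must be defined elsewhere in the
# project, so this is left commented as a usage sketch.
# orderABC(folder_path='data/outbound',
#          file_list=['orders_2021_01.csv', 'orders_2021_02.csv'],
#          save_path='results',
#          period='2021H1')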
Ejemplo n.º 25
0
def inverse_simpson_index(
    barcodecounts,
    *,
    barcodecol="barcode",
    countcol="count",
    groupcols="library",
):
    """Inverse Simpson index (reciprocal probability two barcodes are same).

    Parameters
    ----------
    barcodecounts: pandas.DataFrame
        Data frame with barcode counts
    barcodecol : str
        Column in ``barcodecounts`` listing all unique barcodes.
    countcol : str
        Column in ``barcodecounts`` with counts of each barcode.
    groupcols : str, list, or None
        Columns in ``barcodecounts`` by which we group for calculations.

    Returns
    -------
    pandas.DataFrame

    Example
    -------
    >>> barcodecounts = pd.DataFrame.from_records(
    ...        [('lib1', 'AA', 10),
    ...         ('lib1', 'AT', 20),
    ...         ('lib1', 'AC', 30),
    ...         ('lib2', 'AA', 5)],
    ...        columns=['library', 'barcode', 'count'])
    >>> inverse_simpson_index(barcodecounts)
      library  inverse_simpson_index
    0    lib1               2.571429
    1    lib2               1.000000

    """
    # based on here: https://gist.github.com/martinjc/f227b447791df8c90568
    reserved_cols = ["dummy", "p2", "simpson_index", "inverse_simpson_index"]
    for col in reserved_cols:
        if col in barcodecounts.columns:
            raise ValueError(f"`barcodecounts` cannot have column {col}")

    if groupcols:
        if isinstance(groupcols, str):
            groupcols = [groupcols]
    else:
        groupcols = ["dummy"]
        barcodecounts["dummy"] = "dummy"
    req_cols = [barcodecol, countcol, *groupcols]
    if not set(barcodecounts.columns).issuperset(req_cols):
        raise ValueError(f"`barcodecounts` lacks columns {req_cols}")
    if len(barcodecounts) != len(barcodecounts.groupby(req_cols)):
        raise ValueError("`barcodecol` and `groupcols` not unique rows")

    df = (barcodecounts.assign(p2=lambda x: (x[countcol] / (x.groupby(
        groupcols)[countcol].transform("sum")))**2).groupby(
            groupcols, as_index=False).aggregate(
                simpson_index=pd.NamedAgg("p2", "sum")).assign(
                    inverse_simpson_index=lambda x: 1 / x["simpson_index"]))
    if groupcols == ["dummy"]:
        groupcols = []
    return df[[*groupcols, "inverse_simpson_index"]]
Ejemplo n.º 26
0
def order_by_date(folder_path, save_path, index=None):
    if index is None:
        index = ['DATEOUT', 'DOCNO']

    save_file_name = 'order_by_' + '_'.join(index) + '.csv'

    os.chdir(folder_path)
    file_list = os.listdir()  # collect all file names in this folder into a list

    for i in range(len(file_list)):
        df = pd.read_csv('{}/{}'.format(folder_path, file_list[i]),
                         encoding='gbk',
                         low_memory=False)
        df['CATE'] = 'FTW'
        df.loc[(df['CATEGORY'] != 'FTW'), ['CATE']] = 'APP'

        # print(file_list[i], 'unique order count: ', df['DOCNO'].nunique())

        # ### merge order DATA_TYPE, CHANNEL
        order_type = df[['DATEOUT', 'DOCNO', 'DATA_TYPE',
                         'CHANNEL']].drop_duplicates().reset_index()

        skuNum = df.groupby(index)['SKU'].nunique()
        qty = df.groupby(index).agg(
            Qty=pd.NamedAgg(column='QTY', aggfunc='sum'))

        re0 = pd.merge(order_type, skuNum, on=index, how='outer')
        re = pd.merge(re0, qty, on=index, how='outer')
        # print(re.columns)
        # print(re.head(5))

        re['order_structure'] = np.nan
        re.loc[(re['SKU'] == 1) & (re['Qty'] == 1),
               ['order_structure']] = '单品单件'  # single SKU, single unit
        re.loc[(re['SKU'] == 1) & (re['Qty'] > 1),
               ['order_structure']] = '单品多件'  # single SKU, multiple units
        re.loc[(re['SKU'] > 1) & (re['Qty'] > 1) & (re['SKU'] == re['Qty']),
               ['order_structure']] = '多品单件'  # multiple SKUs, one unit each
        re.loc[(re['SKU'] > 1) & (re['Qty'] > 1) & (re['SKU'] != re['Qty']),
               ['order_structure']] = '多品多件'  # multiple SKUs, multiple units

        re['order_structure2'] = '单件'  # single unit
        re.loc[(re['order_structure'] != '单品单件'),
               ['order_structure2']] = '多件'  # multiple units

        order_cate = pd.pivot_table(df,
                                    index=index,
                                    columns=['CATE'],
                                    values=['QTY'],
                                    aggfunc=sum,
                                    fill_value=0)

        # print(order_cate.columns)
        # print(order_cate.head(5))

        col = []
        for j in order_cate.columns:
            j = list(j)
            col.append('_'.join(j))

        order_cate.columns = col
        # print(order_cate.columns)
        order_cate['order_category'] = ''
        order_cate.loc[(order_cate['QTY_APP'] > 0) &
                       (order_cate['QTY_FTW'] > 0), ['order_category']] = 'A+F'
        order_cate.loc[(order_cate['QTY_APP'] > 0) &
                       (order_cate['QTY_FTW'] == 0), ['order_category']] = 'A'
        order_cate.loc[(order_cate['QTY_APP'] == 0) &
                       (order_cate['QTY_FTW'] > 0), ['order_category']] = 'F'

        result = pd.merge(re, order_cate, on=index, how='outer')

        print(i + 1, file_list[i], 'order count:', df['DOCNO'].nunique(),
              'row count:', result.shape[0], 'unit count:', df['QTY'].sum())
        if i > 0:
            result.to_csv('{}/{}'.format(save_path, save_file_name),
                          index=False,
                          encoding='gbk',
                          header=False,
                          mode='a+')
        else:
            result.to_csv('{}/{}'.format(save_path, save_file_name),
                          encoding='gbk',
                          index=False)
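# Hypothetical usage; the paths are placeholders. The function appends one
# result block per CSV found in folder_path into a single output file, so it
# is shown here only as a commented sketch.
# order_by_date(folder_path='data/outbound', save_path='results')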
Ejemplo n.º 27
0
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor


def main():
    # data = load_wine()
    # data = load_breast_cancer()
    data = load_boston()
    # data = load_diabetes()
    X = data.data
    y = data.target

    # Determine if response is continuous or boolean

    if y.dtype == object or y.dtype == bool:
        response_type = "boolean"
        print("---Response is boolean---")
    elif np.unique(y).size / y.size < 0.05:
        response_type = "boolean"
        print("---Response is boolean---")
    else:
        response_type = "continuous"
        print("---Response is continuous---")

    # Determine if the predictor continuous or boolean &
    # create plots for each variable type

    predictor_type = []
    for idx, column in enumerate(X.T):
        feature_name = data.feature_names[idx]

        predictor = statsmodels.api.add_constant(column)
        # Get the stats & plot
        if column.dtype == object or column.dtype == bool:
            v_type = "boolean"
            print(data.feature_names[idx], "is boolean")
            if response_type == "continuous":

                logistic_regression_model = statsmodels.api.GLM(y, predictor)
                logistic_regression_model_fitted = (
                    logistic_regression_model.fit())  # noqa
                print(f"Variable: {feature_name}")
                print(logistic_regression_model_fitted.summary())
                t_value = round(logistic_regression_model_fitted.tvalues[1], 6)
                p_value = "{:.6e}".format(
                    logistic_regression_model_fitted.pvalues[1])  # noqa

                # Categorical Predictor by Continuous Response

                fig = px.scatter(x=column, y=y)
                fig.update_layout(
                    title=
                    f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",  # noqa
                    xaxis_title=f"Variable: {feature_name}",
                    yaxis_title="y",
                )
                fig.show()

            else:
                print(data.feature_names[idx], "is boolean")

                logistic_regression_model = statsmodels.api.GLM(y, predictor)
                logistic_regression_model_fitted = (
                    logistic_regression_model.fit())  # noqa
                print(f"Variable: {feature_name}")
                print(logistic_regression_model_fitted.summary())
                t_value = round(logistic_regression_model_fitted.tvalues[1], 6)
                p_value = "{:.6e}".format(
                    logistic_regression_model_fitted.pvalues[1])  # noqa

                # Continuous Predictor by Continuous Response

                fig = px.scatter(x=column, y=y)
                fig.update_layout(
                    title=
                    f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",  # noqa
                    xaxis_title=f"Variable: {feature_name}",
                    yaxis_title="y",
                )
                fig.show()

        elif round((np.unique(X.T[idx]).size / X.T[idx].size), 2) <= 0.05:
            v_type = "boolean"
            print(data.feature_names[idx], "is boolean")
            if response_type == "continuous":

                logistic_regression_model = statsmodels.api.GLM(y, predictor)
                logistic_regression_model_fitted = (
                    logistic_regression_model.fit())  # noqa
                print(f"Variable: {feature_name}")
                print(logistic_regression_model_fitted.summary())
                t_value = round(logistic_regression_model_fitted.tvalues[1], 6)
                p_value = "{:.6e}".format(
                    logistic_regression_model_fitted.pvalues[1])  # noqa

                # Categorical Predictor by Continuous Response

                fig = px.histogram(x=column, y=y, histfunc="count")
                fig.update_layout(
                    title=
                    f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",  # noqa
                    xaxis_title=f"Variable: {feature_name}",
                    yaxis_title="Response",
                )
                fig.show()

            else:
                print(data.feature_names[idx], "is boolean")

                logistic_regression_model = statsmodels.api.GLM(y, predictor)
                logistic_regression_model_fitted = (
                    logistic_regression_model.fit())  # noqa
                print(f"Variable: {feature_name}")
                print(logistic_regression_model_fitted.summary())
                t_value = round(logistic_regression_model_fitted.tvalues[1], 6)
                p_value = "{:.6e}".format(
                    logistic_regression_model_fitted.pvalues[1])  # noqa

                # Categorical Predictor by Continuous Response

                fig = px.scatter(x=column, y=y)
                fig.update_layout(
                    title=
                    f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",  # noqa
                    xaxis_title=f"Variable: {feature_name}",
                    yaxis_title="y",
                )
                fig.show()
        else:
            print(data.feature_names[idx], "is continuous")
            v_type = "continuous"

            if response_type == "continuous":

                linear_regression_model = statsmodels.api.OLS(y, predictor)
                linear_regression_model_fitted = linear_regression_model.fit()
                print(f"Variable: {feature_name}")
                print(linear_regression_model_fitted.summary())
                t_value = round(linear_regression_model_fitted.tvalues[1], 6)
                p_value = "{:.6e}".format(
                    linear_regression_model_fitted.pvalues[1])  # noqa

                # Continuous Predictor by Continuous Response
                # Plot the figure
                fig = px.scatter(x=column, y=y)
                fig.update_layout(
                    title=
                    f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",  # noqa
                    xaxis_title=f"Variable: {feature_name}",
                    yaxis_title="y",
                )
                fig.show()
            else:
                linear_regression_model = statsmodels.api.OLS(y, predictor)
                linear_regression_model_fitted = linear_regression_model.fit()
                print(f"Variable: {feature_name}")
                print(linear_regression_model_fitted.summary())
                t_value = round(linear_regression_model_fitted.tvalues[1], 6)
                p_value = "{:.6e}".format(
                    linear_regression_model_fitted.pvalues[1])  # noqa
                # Continuous Predictor by Categorical Response
                # Plot the figure
                fig = px.histogram(x=column, y=y)
                fig.update_layout(
                    title=
                    f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",  # noqa
                    xaxis_title=f"Variable: {feature_name}",
                    yaxis_title="y",
                )
                fig.show()
        # create a list of each variable type
        predictor_type.append(v_type)

    print("***Difference with mean table***")

    # Create difference with mean table

    # create a temp table df_bin to store raw data
    n_of_bin = 10

    for idx, column in enumerate(X.T):
        feature_name = data.feature_names[idx]
        predictor = column
        target = data["target"]
        df = pd.DataFrame({feature_name: pd.Series(predictor)})
        df["target"] = target
        count_row = df.shape[0]
        p_min = df[feature_name].min()
        p_max = df[feature_name].max()
        p_range = p_max - p_min
        bin_width = p_range / n_of_bin
        # to include min number
        bin_list = [p_min - 1]
        s = p_min
        # +1 to include max number
        while s < p_max + 1:
            s += bin_width
            bin_list.append(round(s, 0))

        df_bin = df
        df_bin["LowerBin_UpperBin"] = pd.cut(
            x=df[feature_name],
            bins=bin_list,
            include_lowest=True,
            duplicates="drop",  # noqa
        )

        bincenter = []
        for bin_n in df_bin["LowerBin_UpperBin"]:
            bincenter.append(bin_n.mid)

        # assign once after the loop instead of rebuilding the columns on
        # every iteration
        df_bin["BinCenters"] = pd.DataFrame(
            {"BinCenters": pd.Series(bincenter)})  # noqa
        df_bin["response"] = df["target"]

        # Groupby df_bin table to create a Difference with mean table

        df_bin_groupby = df_bin.groupby(
            ("LowerBin_UpperBin"), as_index=False).agg(  # noqa
                bin_mean=pd.NamedAgg(column=feature_name, aggfunc="mean"),
                bin_count=pd.NamedAgg(column=feature_name, aggfunc="count"),
            )

        bin_center_list = []
        for bin_center in df_bin_groupby["LowerBin_UpperBin"]:
            bin_center_list.append(bin_center.mid)

        df_bin_groupby["BinCenter"] = pd.Series(bin_center_list)

        PopulationMean = (np.sum(column)) / (count_row)
        df_bin["PopulationMean"] = PopulationMean
        df_bin_groupby["PopulationMean"] = PopulationMean

        MeanSquaredDiff = (df_bin_groupby["bin_mean"] -
                           df_bin_groupby["PopulationMean"])**2
        df_bin_groupby["MeanSquaredDiff"] = MeanSquaredDiff

        # Square the difference, sum them up and divide by number of bins
        print(
            f"THE unWeighted NUMBER of {feature_name} IS : {df_bin_groupby['MeanSquaredDiff'].sum() / n_of_bin}"  # noqa
        )
        print(feature_name, df_bin_groupby)

        trace1 = go.Bar(
            x=df_bin_groupby["BinCenter"],
            y=df_bin_groupby["bin_count"],
            name="population",
        )
        layout = go.Layout(
            title_text="Binned Response Mean vs Population Mean")  # noqa

        trace2 = go.Scatter(
            x=df_bin_groupby["BinCenter"],
            y=df_bin_groupby["PopulationMean"],
            name="population mean",
        )
        trace3 = go.Scatter(
            x=df_bin_groupby["BinCenter"],
            y=df_bin_groupby["bin_mean"],
            name="Bin Mean",  # noqa
        )
        combined = [trace1, trace2, trace3]
        fig = go.Figure(data=combined, layout=layout)

        fig.show()

        # Difference with mean table (weighted)

        print("***Difference with mean table (weighted)***")

        df_bin_groupby_weighted = df_bin_groupby.copy()

        population_proportion = []
        for count in df_bin_groupby["bin_count"]:
            population_proportion.append(count / count_row)

        df_bin_groupby_weighted["PopulationProportion"] = pd.Series(
            population_proportion)
        df_bin_groupby_weighted["MeanSquaredDiffWeighted"] = (
            df_bin_groupby_weighted["MeanSquaredDiff"] *
            df_bin_groupby_weighted["PopulationProportion"])

        # Square the difference, sum them up and divide by number of bins
        print(
            f"THE Weighted NUMBER of {feature_name} IS : {df_bin_groupby_weighted['MeanSquaredDiffWeighted'].sum() / n_of_bin}"  # noqa
        )

        print(feature_name, df_bin_groupby_weighted)

    # Random Forest Variable importance ranking
    print("***Random Forest Variable importance ranking***")

    model = RandomForestRegressor()
    model.fit(X, y)

    # get importance
    importance = model.feature_importances_

    feature = []
    score = []
    for i, v in enumerate(importance):
        feature.append(data.feature_names[i])
        score.append(round(v, 5))

    df_ranking_vartype = pd.DataFrame({
        "Feature": pd.Series(feature),
        "Score": pd.Series(score)
    })
    df_ranking_vartype["Variable_type"] = predictor_type
    df_ranking_vartype_sort = df_ranking_vartype.sort_values(by=["Score"])

    # path that will save the ranking excel file
    path = "D:\PycharmProjects\BDA696-MuTing\Feature_Importance_and_type.xlsx"  # noqa
    df_ranking_vartype_sort.to_excel(path, index=False)
    print(df_ranking_vartype_sort)
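# A compact, self-contained sketch (toy data, not the Boston frame) of the
# "difference with mean" binning used in main(): cut a predictor into bins,
# aggregate with NamedAgg, and compare each bin mean to the population mean.
# All names here are illustrative.
import numpy as np
import pandas as pd

toy = pd.DataFrame({"x": np.linspace(0, 10, 50), "target": np.linspace(0, 5, 50)})
toy["bin"] = pd.cut(toy["x"], bins=5)
binned = toy.groupby("bin").agg(
    bin_mean=pd.NamedAgg(column="x", aggfunc="mean"),
    bin_count=pd.NamedAgg(column="x", aggfunc="count"),
)
binned["mean_squared_diff"] = (binned["bin_mean"] - toy["x"].mean()) ** 2
print(binned)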
Ejemplo n.º 28
0
def mergeDati(df_profili, df_pdr, df_anagrafica_osservatori, df_wkr, anno_mese, societa, path_output):
    print('computation for societa ' + societa + ' ' + anno_mese + ' started')
    df_pp_pdr = df_pdr.merge(df_anagrafica_osservatori,on='STATION',how='left')
    print('merge pdr zona climatica')
    df_pp_pdr = df_profili.merge(df_pp_pdr,on=['PROFILO'])
    print('merge pdr profili done')
    df_pp_pdr = df_pp_pdr.loc[(df_pp_pdr['DATE'] >= df_pp_pdr['DATA_INIZIO']) & (df_pp_pdr['DATE'] <= df_pp_pdr['DATA_FINE'])]
    print('filter pdr by date')
    df_pp_pdr = df_pp_pdr.merge(df_wkr,on=['DATE','ZONA_CLIMATICA'],how='left')
    print('merge pdr wkr')
    df_pp_pdr = df_pp_pdr.assign(K=df_pp_pdr['C_WKR']*df_pp_pdr['WKR']+df_pp_pdr['C_CONST'])
    df_pp_pdr = df_pp_pdr.assign(K_NO_WKR=df_pp_pdr['C_WKR']*1+df_pp_pdr['C_CONST'])
    df_pp_pdr = df_pp_pdr.assign(SMC=df_pp_pdr['K']*df_pp_pdr['CONSUMO_ANNUO']/100)
    df_pp_pdr = df_pp_pdr.assign(SMC_NO_WKR=df_pp_pdr['K_NO_WKR']*df_pp_pdr['CONSUMO_ANNUO']/100)
    print('compute k+smc')
    # all three aggregations sum the same columns, so build the NamedAgg
    # mapping once and reuse it
    agg_cols = {c: pd.NamedAgg(column=c, aggfunc='sum')
                for c in ['SMC', 'SMC_NO_WKR', 'K', 'K_NO_WKR',
                          'CONSUMO_ANNUO', 'C_CONST', 'C_WKR']}
    df_pp_pdr_aggr_societa_tipo_tratt = df_pp_pdr.groupby(
        ['SOCIETA', 'TRATTAMENTO_AGG', 'TIPOLOGIA', 'DATE', 'ANNO_MESE', 'WKR']).agg(**agg_cols).reset_index()
    print('computed aggregato grafico')
    df_pp_pdr_aggr_station_tipo_tratt = df_pp_pdr.groupby(
        ['TRATTAMENTO', 'TIPOLOGIA', 'STATION', 'DATE', 'ANNO_MESE', 'WKR']).agg(**agg_cols).reset_index()
    print('computed aggregato station tipologia trattamento')
    df_pp_pdr_aggr_station_societa_profilo_tratt = df_pp_pdr.groupby(
        ['TRATTAMENTO', 'PROFILO', 'SOCIETA', 'PIVA', 'STATION', 'DATE', 'ANNO_MESE', 'WKR']).agg(**agg_cols).reset_index()
    df_pp_pdr_aggr_non_progr = df_pp_pdr.groupby('PDR')
    print('computed aggregato station societa profilo trattamento')
    
    df_pp_pdr_dett = df_pp_pdr[['SOCIETA', 'PIVA', 'TRATTAMENTO', 'TIPOLOGIA', 'PROFILO', 'ZONA_CLIMATICA', 'STATION', 'PDR', 'DATE', 'WKR', 'SMC', 'CONSUMO_ANNUO']]
    print('extract subset of fields for dettaglio')
    df_pp_pdr_dett.loc[df_pp_pdr_dett['TRATTAMENTO'] == 'Y'].to_csv(path_output + anno_mese + '/' + 'dettaglio/dettaglio_' + societa + '_y.csv')
    print('dettaglio y written: ' + df_pp_pdr_dett.loc[df_pp_pdr_dett['TRATTAMENTO'] == 'Y']['PDR'].count().astype(str))
    df_pp_pdr_dett.loc[df_pp_pdr_dett['TRATTAMENTO'] != 'Y'].to_csv(path_output + anno_mese + '/' + 'dettaglio/dettaglio_' + societa + '_gm.csv')
    print('dettaglio gm written: ' + df_pp_pdr_dett.loc[(df_pp_pdr_dett['TRATTAMENTO'] != 'Y')]['PDR'].count().astype(str))
    df_pp_pdr_checks = df_pp_pdr_dett.loc[(df_pp_pdr_dett['SMC'].isnull()) | (df_pp_pdr_dett['WKR'].isnull())]
    print('checks computed: smc null ' +
          df_pp_pdr_dett.loc[df_pp_pdr_dett['SMC'].isnull()]['PDR'].count().astype(str) +
          ' wkr null ' +
          df_pp_pdr_dett.loc[df_pp_pdr_dett['WKR'].isnull()]['PDR'].count().astype(str))
    
    return df_pp_pdr_aggr_societa_tipo_tratt, df_pp_pdr_aggr_station_tipo_tratt, df_pp_pdr_aggr_station_societa_profilo_tratt, df_pp_pdr_checks
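# Hypothetical invocation of mergeDati; the input frames would normally be
# read from the profili / PDR / osservatori / WKR sources earlier in the
# pipeline, and path_output must already contain the <anno_mese>/dettaglio/
# folders, so the call is left as a commented sketch.
# aggr_soc, aggr_station, aggr_profilo, checks = mergeDati(
#     df_profili, df_pdr, df_anagrafica_osservatori, df_wkr,
#     anno_mese='202001', societa='ACME', path_output='output/')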
Ejemplo n.º 29
0
            (("y", "A"), np.min),
            (("y", "B"), "mean"),
            [1, 3],
            [0, 2],
            [5.5, 7.5],
        ),
        (
            (("y", "A"), lambda x: max(x)),
            (("y", "A"), lambda x: 1),
            (("y", "B"), "mean"),
            [1, 3],
            [1, 1],
            [5.5, 7.5],
        ),
        (
            pd.NamedAgg(("y", "A"), "max"),
            pd.NamedAgg(("y", "B"), np.mean),
            pd.NamedAgg(("y", "A"), lambda x: 1),
            [1, 3],
            [5.5, 7.5],
            [1, 1],
        ),
    ],
)
def test_agg_relabel_multiindex_column(agg_col1, agg_col2, agg_col3,
                                       agg_result1, agg_result2, agg_result3):
    # GH 29422, add tests for multiindex column cases
    df = DataFrame({
        "group": ["a", "a", "b", "b"],
        "A": [0, 1, 2, 3],
        "B": [5, 6, 7, 8]
Ejemplo n.º 30
0
 def getGroupedByIMSI(self, df: pd.DataFrame):
     # Returns a df with grouped and aggregated values
     def joinValues(series):
         return ','.join(map(str, series[series.notnull()].unique()))
     groupedDf = df.groupby('IMSI').agg(
         IMSI=pd.NamedAgg(column='IMSI', aggfunc=joinValues),
         RAT=pd.NamedAgg(column='RAT', aggfunc=joinValues),
         OPERATOR=pd.NamedAgg(column='OPERATOR', aggfunc=joinValues),
         CHANNEL=pd.NamedAgg(column='CHANNEL', aggfunc=joinValues),
         IMEI=pd.NamedAgg(column='IMEI', aggfunc=joinValues),
         TMSI=pd.NamedAgg(column='TMSI', aggfunc=joinValues),
         MS_POWER=pd.NamedAgg(column='MS_POWER', aggfunc=joinValues),
         TA=pd.NamedAgg(column='TA', aggfunc=joinValues),
         LAST_LAC=pd.NamedAgg(column='LAST_LAC', aggfunc=joinValues),
         HITS=pd.NamedAgg(column='HITS', aggfunc='size'),
         DATE_TIME=pd.NamedAgg(column='DATE_TIME', aggfunc=joinValues),
     )
     return groupedDf.reset_index(drop=True)
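# Minimal illustration (toy data, standalone rather than the class method
# above) of the collapse-per-IMSI behaviour: non-null unique values are joined
# with commas and HITS counts the rows per IMSI. Only a few of the columns are
# reproduced here.
import pandas as pd

toy = pd.DataFrame({
    'IMSI': ['262011234', '262011234', '262019999'],
    'RAT': ['GSM', 'LTE', None],
    'TMSI': ['0xA1', '0xA1', '0xB2'],
})

def joinValues(series):
    return ','.join(map(str, series[series.notnull()].unique()))

print(toy.groupby('IMSI').agg(
    IMSI=pd.NamedAgg(column='IMSI', aggfunc=joinValues),
    RAT=pd.NamedAgg(column='RAT', aggfunc=joinValues),
    TMSI=pd.NamedAgg(column='TMSI', aggfunc=joinValues),
    HITS=pd.NamedAgg(column='TMSI', aggfunc='size'),
).reset_index(drop=True))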