Example No. 1
def get_titanic(col_names=None,
                onehot=False,
                only_n_rows=None,
                seed=None,
                original=False):
    path = os.path.dirname(
        os.path.realpath(__file__)) + "/../../_data/titanic/train.csv"

    df = pd.read_csv(path)
    if original:
        return df
    df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], inplace=True)

    if col_names is not None:
        df = df[col_names]

    # Fill missing values (assignment instead of chained inplace fillna,
    # which newer pandas versions deprecate)
    df["Age"] = df["Age"].fillna(int(df["Age"].mean()))
    df["Embarked"] = df["Embarked"].fillna("S")

    if only_n_rows and only_n_rows < len(df):
        df = df.sample(only_n_rows, random_state=seed)
    if onehot:
        df['Survived'] = df['Survived'].astype(bool).astype(str)
        df = pd.get_dummies(df)

    return fn.transform_dataset(df)
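
A minimal usage sketch, assuming fn.transform_dataset returns the (df, value_dict, parametric_types) triple seen in Example No. 3:

df, value_dict, parametric_types = get_titanic(
    col_names=["Survived", "Sex", "Age", "Fare", "Pclass"],
    only_n_rows=200,
    seed=1)
print(df.head())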
Example No. 2
def get_play_store(one_hot=True):
    path = os.path.dirname(os.path.realpath(
        __file__)) + "/../../_data/play_store/googleplaystore.csv"
    df = pd.read_table(path, sep=',')
    # filter out the malformed row(s) where Category holds the rating value "1.9"
    df = df[~(df.Category == "1.9")]

    # feature engineering: strip the leading dollar sign and parse the price
    def _parse(x):
        try:
            f = float(x.split('$')[-1])
        except ValueError:
            f = np.nan
        return f

    df.Price = df.Price.apply(_parse)
    df.Price = pd.cut(df.Price, [-np.inf, 0, 1, 2, 3, 5, 10, 20, 50, np.inf],
                      labels=[
                          '0', '0-1', '1-2', '2-3', '3-5', '5-10', '10-20',
                          '20-50', '50+'
                      ]).astype(str)
    df = df[df.Genres.isin(df.Genres.value_counts()[:25].index)]
    df.Reviews = pd.cut(df['Reviews'].astype(float),
                        [-np.inf, 0, 10, 100, 1000, 1e4, 1e5, 1e6, 1e7, 1e8],
                        labels=[
                            '0', '1+', '10+', '100+', '1,000+', '10,000+',
                            '100,000+', '1,000,000+', '10,000,000+'
                        ],
                        retbins=False,
                        include_lowest=False).astype(str)
    df.Rating = pd.cut(df['Rating'].astype(float), [1, 2, 3, 4, 5],
                       labels=['1-2', '2-3', '3-4', '4-5'],
                       retbins=False,
                       include_lowest=False).astype(str)
    cols = [
        'Category', 'Price', 'Rating', 'Content Rating', 'Reviews', 'Installs'
    ]  # 'Genres' is largely identical to 'Category', so it is left out
    # value_dict for the original (tabular!) data
    value_dict = {
        i: ['discrete',
            dict(enumerate(df[c].value_counts().index))]
        for i, c in enumerate(cols)
    }
    parametric_types = [Categorical] * len(cols)
    df = df[cols].dropna(axis=1)

    # one-hot encoded data
    onehot = pd.get_dummies(df[cols], prefix=cols, prefix_sep=': ')
    parametric_types_onehot = get_feature_types_from_dataset(onehot)
    one_hot, value_dict_onehot, _ = fn.transform_dataset(
        onehot, ['discrete'] * len(onehot.columns))
    return one_hot, value_dict_onehot, parametric_types_onehot
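
The binning above leans entirely on pd.cut with explicit edges and labels; a small self-contained illustration of how prices land in those right-inclusive intervals:

import numpy as np
import pandas as pd

prices = pd.Series([0.0, 0.99, 4.99, 24.99, np.nan])
bins = [-np.inf, 0, 1, 2, 3, 5, 10, 20, 50, np.inf]
labels = ['0', '0-1', '1-2', '2-3', '3-5', '5-10', '10-20', '20-50', '50+']
# 0.0 -> '0', 0.99 -> '0-1', 4.99 -> '3-5', 24.99 -> '20-50';
# NaN stays NaN and becomes the string 'nan' after astype(str)
print(pd.cut(prices, bins, labels=labels).astype(str))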
Example No. 3
def test_value_dict():
    import os
    import pandas as pd
    from util import io
    path = os.path.dirname(
        os.path.realpath(__file__)) + "/../../_data/titanic/train.csv"
    df = pd.read_csv(path)
    df = df[["Survived", "Sex", "Age", "Fare", "Pclass"]]
    df, val_dict, param_types = fn.transform_dataset(df)

    io.print_pretty_table(df)
    print(val_dict)
    print(param_types)
Example No. 4
def get_rki_ed_2():
    df = get_rki_ed_data(column_names=["aufnahmezeitpunkt_datum", "leitsymptom", "leitsymptom_gruppe", "vitalwerte", "diagnosen"])#, file_names=["epias_of_rki.2018-11.300000.json"])
    df["aufnahmezeitpunkt_wochentag"] = df["aufnahmezeitpunkt_datum"].apply(lambda date_str: datetime.datetime.strptime(date_str, '%Y-%m-%d').weekday())
    df["aufnahmezeitpunkt_monat"] = df["aufnahmezeitpunkt_datum"].apply(lambda date_str: datetime.datetime.strptime(date_str, '%Y-%m-%d').month)
    
    
    data = []
    for val in df["aufnahmezeitpunkt_datum"].unique():
        day_df = df[df["aufnahmezeitpunkt_datum"] == val]
        if len(day_df) == 0:
            continue

        week_day = day_df["aufnahmezeitpunkt_wochentag"].iloc[0]
        month = day_df["aufnahmezeitpunkt_monat"].iloc[0]
        
        temps = []
        heart_rates = []
        breath_rates = []
        blood_pressures = []
        for vitals in day_df["vitalwerte"]:
            for vital in vitals:
                if 'blutdruck_systolisch' in vital:
                    blood_pressures.append(vital['blutdruck_systolisch'])
                if 'atemfrequenz' in vital:
                    breath_rates.append(vital['atemfrequenz'])
                if 'herzfrequenz' in vital:
                    heart_rates.append(vital['herzfrequenz'])
                if 'temperatur' in vital:
                    temps.append(vital['temperatur'])
            
        temps = np.array(temps, dtype=np.float64)
        heart_rates = np.array(heart_rates, dtype=np.float64)
        breath_rates = np.array(breath_rates, dtype=np.float64)
        blood_pressures = np.array(blood_pressures, dtype=np.float64)
        
        if (len(temps) == 0 or len(heart_rates) == 0
                or len(breath_rates) == 0 or len(blood_pressures) == 0):
            continue

        data.append([
            week_day, month,
            len(temps), len(heart_rates), len(breath_rates), len(blood_pressures),
            np.mean(temps), np.mean(heart_rates), np.mean(breath_rates), np.mean(blood_pressures),
        ])
    
    df = pd.DataFrame(data, columns=["wochentag",
                                "monat",
                                "count_temperatur",
                                "count_herzfrequenz",
                                "count_atemfrequenz",
                                "count_blutdruck",
                                "avg_temperatur", "avg_herzfrequenz", "avg_atemfrequenz", "avg_blutdruck",
                                ])
    
    return fn.transform_dataset(df, feature_types=["discrete", "discrete", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric"])
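
For reference, the loop above assumes each "vitalwerte" cell holds a list of measurement dicts with optional keys; a hypothetical record of that shape:

# Hypothetical example of one row's "vitalwerte" cell, as consumed by the loop:
vitals_example = [
    {"temperatur": 38.2, "herzfrequenz": 95},
    {"blutdruck_systolisch": 130, "atemfrequenz": 18},
]
temps = [v["temperatur"] for v in vitals_example if "temperatur" in v]  # [38.2]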
Example No. 5
def get_rki_ed_3():
    df = get_rki_ed_data(column_names=["aufnahmezeitpunkt_datum", "aufnahmezeitpunkt_stunde", "behandlung_fachabteilung", "geschlecht", "altersklasse", "zuweisungsart"], file_names=["epias_of_rki.2018-11.300000.json"])
    
    def weekday(date_str):
        # datetime.weekday(): Monday == 0 ... Sunday == 6
        labels = ["Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun"]
        return labels[datetime.datetime.strptime(date_str, '%Y-%m-%d').weekday()]
    
    df["aufnahmezeitpunkt_wochentag"] = df["aufnahmezeitpunkt_datum"].apply(weekday)
    df["aufnahmezeitpunkt_monat"] = df["aufnahmezeitpunkt_datum"].apply(lambda date_str: datetime.datetime.strptime(date_str, '%Y-%m-%d').month)
    df.drop(columns=["aufnahmezeitpunkt_datum"], inplace=True)
    return fn.transform_dataset(df)
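
A possible vectorized alternative to the per-row strptime calls in both RKI loaders, using the pandas datetime accessor (a sketch; assumes the '%Y-%m-%d' format holds throughout):

dates = pd.to_datetime(df["aufnahmezeitpunkt_datum"], format="%Y-%m-%d")
weekday_labels = ["Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun"]
df["aufnahmezeitpunkt_wochentag"] = dates.dt.weekday.map(dict(enumerate(weekday_labels)))
df["aufnahmezeitpunkt_monat"] = dates.dt.month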
Example No. 6
def mini_titanic():
    data = {
        'Survived': {
            356: True,
            255: True,
            380: True,
            859: False,
            886: False,
            248: True,
            598: False,
            372: False,
            574: False,
            820: True
        },
        'Embarked': {
            356: 'Southampton',
            255: 'Cherbourg',
            380: 'Cherbourg',
            859: 'Cherbourg',
            886: 'Southampton',
            248: 'Southampton',
            598: 'Cherbourg',
            372: 'Southampton',
            574: 'Southampton',
            820: 'Southampton'
        },
        'Sex': {
            356: 'female',
            255: 'female',
            380: 'female',
            859: 'male',
            886: 'male',
            248: 'male',
            598: 'male',
            372: 'male',
            574: 'male',
            820: 'female'
        }
    }
    columns = ['Survived', 'Embarked', 'Sex']
    small = pd.DataFrame(data, columns=columns)
    small = small.sort_values(['Survived', 'Sex', 'Embarked'])
    print(small.to_latex(index=False))
    return fn.transform_dataset(small)
Example No. 7
def get_adult_41_items(onehot=False, only_n_rows=None, seed=None):
    '''
    UCI Adult dataset, cleaned and in transactional form.
    Age was discretized; all other numeric columns were removed.
    The purpose of this test is to ensure that the algorithm can handle a
    small 2.2 MB (30k rows) data set reasonably efficiently.
    https://raw.githubusercontent.com/tommyod/Efficient-Apriori/master/efficient_apriori/tests/adult_data_cleaned.txt
    :return:
    '''
    path = os.path.dirname(os.path.realpath(
        __file__)) + "/../../_data/adult/adult_data_transactions.data"

    if onehot:
        # alternative transactional route via mlxtend, kept for reference:
        # transaction_encoder = TransactionEncoder()
        # data = []
        # with open(path) as f:
        #     for line in f.readlines():
        #         data.append(set(line.strip().replace(' ', '').split(',')))
        # fit = transaction_encoder.fit(data)
        # one_hot_df = pd.DataFrame(fit.transform(data), columns=fit.columns_)
        columns = [
            'education', 'marital-status', 'relationship', 'race', 'sex',
            'income', 'age'
        ]
        tabular = pd.read_table(path,
                                sep=',',
                                names=columns,
                                skipinitialspace=True)
        df = pd.get_dummies(tabular.astype(str),
                            prefix=None,
                            prefix_sep='_',
                            dtype=bool)  # np.bool was removed from NumPy
    else:
        columns = [
            'education', 'marital-status', 'relationship', 'race', 'sex',
            'income', 'age'
        ]
        df = pd.read_table(path, sep=',', names=columns, skipinitialspace=True)
    if only_n_rows and only_n_rows < len(df):
        df = df.sample(only_n_rows, random_state=seed)
    return fn.transform_dataset(df)
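
The commented-out block above sketches the transactional route; a minimal self-contained example of mlxtend's TransactionEncoder on toy transactions (the item names here are made up):

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

transactions = [{"income_<=50K", "sex_Male"}, {"income_>50K", "sex_Female"}]
te = TransactionEncoder()
one_hot_df = pd.DataFrame(te.fit(transactions).transform(transactions),
                          columns=te.columns_)
print(one_hot_df)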
Example No. 8
def get_titanic_bins(col_names=None,
                     onehot=False,
                     only_n_rows=None,
                     seed=None):
    path = os.path.dirname(
        os.path.realpath(__file__)) + "/../../_data/titanic/train.csv"
    df = pd.read_csv(path)
    # cast to object first so the '3+' string can be assigned into the column
    df['NumFamily'] = (df.SibSp + df.Parch).astype(int).astype(object)
    df.loc[df.NumFamily >= 3, 'NumFamily'] = '3+'
    df.NumFamily = df.NumFamily.astype(str)
    df.Embarked = df.Embarked.replace({
        'C': 'Cherbourg',
        'Q': 'Queenstown',
        'S': 'Southampton'
    })
    df["Embarked"] = df["Embarked"].fillna("Unknown")
    df.drop(columns=[
        "PassengerId", "Name", "Ticket", "Cabin", 'SibSp', 'Parch', 'Fare'
    ],
            inplace=True)
    if col_names is not None:
        df = df[col_names]
    df.Pclass = df.Pclass.astype(str)
    # Bin ages into labeled groups; anything unmatched (missing values, or
    # fractional ages falling between the integer bounds) becomes 'Unknown'
    df['Age_'] = np.nan
    df.loc[df['Age'] < 16, 'Age_'] = 'child'
    df.loc[df['Age'].between(16, 30), 'Age_'] = 'young-adult'
    df.loc[df['Age'].between(31, 50), 'Age_'] = 'middle-aged'
    df.loc[df['Age'] > 50, 'Age_'] = 'old'
    df['Age_'] = df['Age_'].fillna('Unknown')
    df = df.drop(columns=['Age']).rename(columns={'Age_': 'Age'})

    if only_n_rows and only_n_rows < len(df):
        df = df.sample(only_n_rows, random_state=seed)
    if onehot:
        df['Survived'] = df['Survived'].astype(bool).astype(str)
        df = pd.get_dummies(df)

    return fn.transform_dataset(df)
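
The manual between() chain could also be expressed with pd.cut; a sketch (not bit-identical at the interval edges, and missing ages map to 'Unknown'):

age_binned = pd.cut(df["Age"],
                    bins=[-np.inf, 16, 31, 51, np.inf],
                    labels=["child", "young-adult", "middle-aged", "old"],
                    right=False).astype(str).replace("nan", "Unknown")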
Example No. 9
def get_lending(
    only_n_rows=None,
    seed=None,
    onehot=True,
    original=False,
):
    '''
    https://www.kaggle.com/wendykan/lending-club-loan-data
    about 2,200,000 rows
    '''
    # discretizer = KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='quantile')

    with open('../../_data/lending/loan.csv', 'r', encoding='latin-1') as f:
        used_cols = [
            'loan_amnt', 'loan_status', 'term', 'purpose', 'int_rate', 'grade',
            'emp_length', 'home_ownership', 'annual_inc'
        ]
        df = pd.read_csv(f, usecols=used_cols)
        df = _shorten_df(df, only_n_rows, seed=seed)
    if original:
        return df
    df = df[~df.loan_status.isin([
        'Does not meet the credit policy. Status:Charged Off',
        'Does not meet the credit policy. Status:Fully Paid'
    ])]
    df.loan_status.replace(
        ['Late (31-120 days)', 'Late (16-30 days)', 'In Grace Period'],
        'Late',
        inplace=True)
    df.loan_status.replace('Default', 'Charged Off', inplace=True)
    df.emp_length.replace([str(i) + ' years' for i in range(2, 10)],
                          '1-10 years',
                          inplace=True)
    df.emp_length.replace(['< 1 year', '1 year'], '<= 1 year', inplace=True)
    df.grade.replace(
        {
            'A': 'good',
            'B': 'good',
            'C': 'medium',
            'D': 'medium',
            'E': 'bad',
            'F': 'bad',
            'G': 'bad'
        },
        inplace=True)

    # keep only the 10 most frequent loan purposes; the rest become NaN here
    # and are removed by the dropna below
    keep = df.purpose.value_counts().head(10).index.to_list()
    df.purpose = df.purpose[df.purpose.isin(keep)]

    df.dropna(inplace=True)
    numeric_cols = df.columns[~(df.dtypes == object)]  # np.object was removed from NumPy
    # df[numeric_cols] = discretizer.fit_transform(df[numeric_cols])

    for c in numeric_cols:
        quantiles = np.round(df[c].quantile([0.25, 0.5,
                                             0.75])).astype(int).tolist()
        q_labels = [
            x.format(low=quantiles[0],
                     mid=quantiles[1],
                     high=quantiles[2],
                     max=int(df[c].max())) for x in
            ['0 - {low}', '{low} - {mid}', '{mid} - {high}', '{high} - {max}']
        ]
        df[c] = pd.cut(
            df[c],
            bins=[-np.inf] + quantiles + [np.inf],
            labels=q_labels,
        ).astype(str)
    # remove rare items
    df = df[~df.home_ownership.isin(['OTHER', 'ANY'])]

    if onehot:
        df = pd.get_dummies(df)

    return fn.transform_dataset(df)
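
The quantile-binning loop condenses into a small reusable helper; a self-contained sketch (the helper name is hypothetical, and it assumes the rounded quartiles are distinct, as pd.cut requires):

import numpy as np
import pandas as pd

def quantile_bin(s):
    # bin a numeric series into four labeled quartile intervals
    q = np.round(s.quantile([0.25, 0.5, 0.75])).astype(int).tolist()
    labels = ['0 - {}'.format(q[0]), '{} - {}'.format(q[0], q[1]),
              '{} - {}'.format(q[1], q[2]), '{} - {}'.format(q[2], int(s.max()))]
    return pd.cut(s, bins=[-np.inf] + q + [np.inf], labels=labels).astype(str)

print(quantile_bin(pd.Series([1, 2, 3, 4, 5, 6, 7, 8])))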
Example No. 10

    # mini spn example
    x = np.random.choice([1, 2], int(1e4), replace=True, p=[0.3, 0.7])
    p = {1: [0.9, 0.1, 0.], 2: [0., 0.9, 0.1]}
    y = np.array([np.random.choice([1, 2, 3], p=p[v]) for v in x])
    z = np.random.choice([1, 2], int(1e4), replace=True, p=[0.4, 0.6])
    df = pd.DataFrame(dict(zip(['X', 'Y', 'Z'], [x, y, z]))).astype(str)
    df, vd, pars = fn.transform_dataset(df)
    spn = spn_handler.load_or_create_spn(df,
                                         vd,
                                         pars,
                                         'mini_example',
                                         0.4,
                                         0.5,
                                         nrows=None,
                                         seed=1,
                                         force_create=True,
                                         clustering='km_rule_clustering')
    spn = spn.children[1]
    manspn = ( 0.3 * (Categorical(p=[0.9, 0.1], scope=0) * Categorical(p=[0.55, 0.4, 0.05], scope=1))
               + 0.7 * (Categorical(p=[0., 1.], scope=0) * Categorical(p=[0.1, 0.2, 0.7], scope=1)) ) \
            * (Categorical(p=[0.4, 0.6], scope=2))
    # plot leaves from example
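
A sketch of evaluating the hand-built SPN with SPFlow's inference API (manually composed structures need ids assigned and scopes rebuilt before inference):

import numpy as np
from spn.structure.Base import assign_ids, rebuild_scopes_bottom_up
from spn.algorithms.Inference import log_likelihood

assign_ids(manspn)
rebuild_scopes_bottom_up(manspn)
# probability of the joint state (X=1, Y=2, Z=1), encoded as category indices
evidence = np.array([[0, 1, 0]], dtype=float)
print(np.exp(log_likelihood(manspn, evidence)))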
Example No. 11
                                              min_instances_slice)

    #Print some statistics
    fn.print_statistics(spn)

    #Example value dict generation

    path = os.path.dirname(
        os.path.realpath(__file__)) + "/../../_data/titanic/train.csv"
    df = pd.read_csv(path)

    #print data (top 5 rows)
    io.print_pretty_table(df.head(5))

    df = df[["Survived", "Sex", "Age", "Fare", "Pclass"]]
    df, val_dict, param_types = fn.transform_dataset(df)

    #print data after transformation (top 5 rows)
    io.print_pretty_table(df.head(5))
    '''
    SPN functions
    '''

    #Load synthetic example SPN (very simple SPN)
    from simple_spn.example import example_spns
    spn = example_spns.get_gender_spn()

    #plot spn
    fn.plot_spn(spn, "sample_spn.pdf", value_dict)

    #generate samples