Example #1
from src.util import data_util, constants, date_util  # hedged: date_util assumed to sit in src.util like the other helpers


def get_column_dates_stats(dataframe, min_percentage, max_percentage,
                           verbose=True):
    # The source snippet is truncated here; this head is a hedged
    # reconstruction inferred from the call at the bottom of the example:
    # for each column, count how many values parse as dates and keep the
    # columns whose date percentage falls inside
    # [min_percentage, max_percentage).
    columns = []
    for column in dataframe.columns:
        col = dataframe[column].dropna()
        col_len = len(col)
        dates_len = sum(1 for v in col if is_string_and_date(v) == 't')
        date_of_col_percentage = 100.0 * dates_len / col_len if col_len else 0.0
        if min_percentage <= date_of_col_percentage < max_percentage:
            if verbose:
                text = '{}/{} ({}%) - {}'.format(
                    dates_len, col_len, round(date_of_col_percentage, 2),
                    column)
                print(text)
            columns.append(column)
    return columns


def dates_to_bool(dataframe, columns):
    # Replace every value in the detected date columns with a 't'/'f' flag.
    for column in columns:
        print(column)
        dataframe[column] = dataframe[column].map(is_string_and_date)
    return dataframe


def is_string_and_date(v):
    if isinstance(v, str) and date_util.is_date(v):
        return 't'
    return 'f'


df = data_util.get_dataframe(constants.DATA + 'no_empty_h_150.csv')
date_columns = get_column_dates_stats(df, 0, 100, False)
dates_0_100_as_bool_150 = dates_to_bool(df, date_columns)
dates_0_100_as_bool_150.to_csv(constants.DATA + 'dates_0_100_as_bool_150.csv',
                               index=False,
                               encoding='UTF-8')
Example #2
from src.util import data_util, constants

def little_MCAR(df):
    # Stub: intended to run Little's MCAR (Missing Completely At Random)
    # test on the dataframe; not implemented in the source.
    pass

df = data_util.get_dataframe(constants.DATA + 'df_80_plus_no_cont_no_ctx_comp_dup.csv', max_rows=1000, max_columns=1000)
little_MCAR(df)
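
# Since little_MCAR is only a stub, the following is a minimal, hedged sketch
# of what it might compute (an assumption, not the project's implementation).
# Little's test compares per-missingness-pattern means of the observed
# variables against global estimates; the exact test uses EM estimates of the
# mean and covariance, while this sketch plugs in the complete-data moments,
# so it is only an approximation. The name little_mcar_sketch is hypothetical.
import numpy as np
from scipy import stats


def little_mcar_sketch(df):
    data = df.select_dtypes(include=[np.number])
    mu = data.mean()     # global means (the exact test uses EM estimates)
    sigma = data.cov()   # global covariance (likewise approximated)
    mask = data.isnull()
    d2 = 0.0
    dof = 0
    # Group the rows by their missingness pattern.
    for _, idx in mask.groupby(list(mask.columns)).groups.items():
        sub = data.loc[idx]
        observed = sub.columns[sub.notnull().all()]
        if len(observed) == 0:
            continue
        diff = (sub[observed].mean() - mu[observed]).values
        cov_oo = sigma.loc[observed, observed].values
        d2 += len(sub) * float(diff @ np.linalg.pinv(cov_oo) @ diff)
        dof += len(observed)
    dof -= data.shape[1]  # degrees of freedom per Little (1988)
    p_value = 1.0 - stats.chi2.cdf(d2, max(dof, 1))
    return d2, p_value  # large d2 / small p-value argue against MCAR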
Example #3
from src.util import data_util, constants
# Hedged assumption: k_modes_silhouette is taken to come from the project's
# silhouette module; the source snippet is truncated above its real import.
from src.validation.silhouette import k_modes_silhouette


def extract_df_with_high_silhouette_columns(initial_df, cols_amount,
                                            cluster_amount,
                                            min_silhouette_treshold, runs):
    # Head reconstructed (hedged) from the commented-out call further down:
    # repeatedly sample cols_amount columns, collect every column that shows
    # up in a sample whose k-modes silhouette falls at or below the
    # threshold, and drop those columns from the dataframe.
    columns = []
    for run in range(0, runs):
        sample_df = data_util.get_sampled(initial_df, cols_amount).iloc[:, 1:]
        ss = k_modes_silhouette(sample_df, cluster_amount)
        print(ss)
        if ss <= min_silhouette_treshold:
            print(sample_df.columns)
            for column in sample_df.columns:
                if column not in columns:
                    columns.append(column)
    idf = initial_df.drop(columns, axis=1)
    return idf


#
df = data_util.get_dataframe(constants.DATASETS + '4_10.csv',
                             max_rows=1000,
                             max_columns=1000,
                             with_namespaces=True)
df.fillna(value='missing', inplace=True)
df = df.astype(str)
df.to_csv(constants.DATA + '4_10_arm.csv', index=False, header=False)

# extracted_df = extract_df_with_high_silhouette_columns(df, 10, 5, 0.25, 100)
# k_modes_silhouette(extracted_df, 3)
# pass

# df = data_util.get_dataframe(constants.DATASETS + '6_10.csv', with_namespaces=True, max_rows=1000, max_columns=1000, header=None)
# df.fillna(value='missing', inplace=True)
# df.to_csv(constants.DATA + 'df_80_plus_no_cont_no_ctx_comp_dup_as_str_no_header.csv', index=False, header=False)
#
# amount_of_cols_list = [3, 5, 10, 20, 50, 90, 120]
# silhouette_means = []
Example #4
from src.util import data_util, constants
import pandas as pd

df = data_util.get_dataframe(constants.DATA + 'df_no_99.csv')
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

newdf = df.select_dtypes(include=numerics)


def get_continious_column_names(df):
    # A column whose unique-to-total ratio is high is more likely continuous
    # than categorical.
    likely_categories = {}
    continuous_columns = []
    for var in df.columns:
        likely_categories[var] = 1. * df[var].nunique() / df[var].count()
    for v, k in likely_categories.items():
        if k >= 0.6:
            continuous_columns.append(v)
    return continuous_columns


def get_uniform_categorical_intervals(df, columns):
    # Bin each given column into 5 uniform intervals; skip columns that
    # cannot be cut (TypeError) or are missing from the dataframe (KeyError).
    for column in columns:
        try:
            cat = pd.cut(df[column], 5)
        except TypeError:
            print(column)
            continue
        except KeyError:
            print(column)
            continue
        print('changed like: {}'.format(column))
        df[column] = cat
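

# Hedged usage sketch (not in the source): detect the likely continuous
# columns among the numeric ones and bin them into uniform intervals.
continuous_columns = get_continious_column_names(newdf)
get_uniform_categorical_intervals(newdf, continuous_columns)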
Example #5
from src.util import data_util, constants, decorators
from src.validation import simple_matching, silhouette
from src.data_stats.duplicates import get_cramers_by_partition
from src import k_modes, hdbscan_demo

df_150 = data_util.get_dataframe(constants.DATASETS + '2_123.csv').iloc[:, :].astype(str)
df_500 = data_util.get_dataframe(constants.GENERATED_DATASETS + '123_500.csv').iloc[:, :].astype(str)
df_1000 = data_util.get_dataframe(constants.GENERATED_DATASETS + '123_1000.csv').iloc[:, :].astype(str)
df_5000 = data_util.get_dataframe(constants.GENERATED_DATASETS + '123_5000.csv').iloc[:, :].astype(str)


@decorators.measure_time
def dissimilarity_matrix_benchmark(df):
    simple_matching.get_dissimilarity_matrix_opt(df)


@decorators.measure_time
def correlation_duplicates_benchmark(df):
    get_cramers_by_partition(df)


@decorators.measure_time
def k_modes_benchmark(df):
    k_modes.k_modes_dict(cluster_amount=3, dataframe=df)


@decorators.measure_time
def hdbscan_benchmark(df):
    hdbscan_demo.hdbscan_dict(df)
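

# The benchmarks above are only defined in this excerpt. A hedged usage
# sketch would time each one across the four dataframe sizes
# (decorators.measure_time presumably logs the elapsed time):
for frame in (df_150, df_500, df_1000, df_5000):
    dissimilarity_matrix_benchmark(frame)
    correlation_duplicates_benchmark(frame)
    k_modes_benchmark(frame)
    hdbscan_benchmark(frame)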

Example #6
from src.util import data_util, constants
from src.util import calc_util  # hedged: calc_util assumed to sit in src.util


def percentage_nan_column(dataframe, whole=150, min_percentage=0, max_percentage=100):
    # The source snippet is truncated here; the name percentage_nan_column
    # and this head are a hedged reconstruction mirroring percentage_nan_row
    # below: count entries whose NaN percentage lies in
    # (min_percentage, max_percentage].
    return ((min_percentage < calc_util.percentage(dataframe, whole)) &
            (max_percentage >= calc_util.percentage(dataframe, whole))).sum()


def percentage_nan_row(dataframe, whole=150, min_percentage=0, max_percentage=100):
    count = 0
    for row in dataframe:
        pct = calc_util.percentage(row, whole)
        if min_percentage < pct <= max_percentage:
            count += 1
    return count


def percentage_nan_equals(dataframe, max_rows=150, equal_percentage=100):
    return (equal_percentage == calc_util.percentage(dataframe, max_rows)).sum()
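

# Hedged sketch of the helper assumed above (not part of the source):
# calc_util.percentage is presumably the plain ratio-to-percent conversion,
#
#     def percentage(part, whole):
#         return 100.0 * part / whole
#
# so calc_util.percentage(nan_by_column, max_rows) yields each column's NaN
# percentage as a pandas Series.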


df = data_util.get_dataframe(constants.DATA + 'df_80_plus_filled.csv')
max_columns = df.shape[1]
max_rows = df.shape[0]
overall = max_columns * max_rows
nan_by_column = df.isnull().sum()

nan_overall = nan_by_column.values.sum()
print('overall nan amount: {}/{}'.format(nan_overall, overall))
print('min nan amount: {}'.format(nan_by_column.min()))
print('max nan amount: {}'.format(nan_by_column.max()))

with_zero_missing = percentage_nan_equals(nan_by_column, max_rows=max_rows, equal_percentage=0)
print('0%: {}/{}'.format(with_zero_missing, max_columns))
print('100%: {}/{}'.format(percentage_nan_equals(nan_by_column, max_rows=max_rows, equal_percentage=100), max_columns))

# df_no_empty = df.loc[:, df.isnull().sum() != 150]
Example #7
from matplotlib import pyplot as plt
import seaborn as sns
from src.util import constants, data_util
from src.validation import simple_matching
from src import hdbscan_impl
import pandas as pd


df = data_util.get_dataframe(constants.DATASETS + '2_344.csv', max_rows=1000, max_columns=300)
df.fillna(value='missing', inplace=True)
df = df.astype(str)

vals = []
cluster_dict = hdbscan_impl.hdbscan_dict(df, visualize_tree=False, log=True)
# cluster_dict = k_modes.k_modes_dict(cluster_amount=3, dataframe=df)

# Flatten the clusters, largest first, so the rows of the dissimilarity
# matrix are grouped by cluster.
keys_by_cluster_size = sorted(cluster_dict, key=lambda k: len(cluster_dict[k]), reverse=True)
for key in keys_by_cluster_size:
    for el in cluster_dict.get(key):
        vals.append(el)

dm = simple_matching.get_dissimilarity_matrix(pd.DataFrame(vals))
fig, ax = plt.subplots()
colormap = plt.cm.cubehelix_r
sns.heatmap(dm, cmap=colormap, ax=ax)
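# With rows ordered by cluster (largest first), cohesive clusters should show
# up as darker square blocks along the heatmap's diagonal.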
fig.savefig(constants.DATA + 'va/hdbscan/dataset_1g')

Example #8
import missingno as msno
from src.util import data_util, constants

df = data_util.get_dataframe(constants.DATA +
                             'original/original_150_allstring.csv').iloc[:, :]
dfc = df.copy()
df_no_ctx = data_util.get_dataframe(constants.DATASETS +
                                    '2_344.csv').iloc[:, :]
msno.matrix(df)
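
# msno.matrix draws with matplotlib; in a plain script (outside a notebook)
# an explicit show() is needed to actually display the nullity matrix.
import matplotlib.pyplot as plt

plt.show()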
Example #9
import numpy as np
from src.util import data_util, constants
from src.validation import simple_matching


def order(dm):
    # The source snippet is truncated here; this head is a hedged
    # reconstruction of a VAT-style seriation: starting from one index,
    # repeatedly move the remaining index closest to the already-ordered
    # set I from the candidate set J into the permutation P.
    n = dm.shape[0]
    start = int(np.argmax(dm) // n)  # begin at a row of the largest distance
    P = [start]
    I = [start]
    J = [j for j in range(n) if j != start]
    while J:
        min_val = float('inf')
        min_j = J[0]
        for i in I:
            for j in J:
                if dm[i, j] < min_val:
                    min_val = dm[i, j]
                    min_j = j
        J.remove(min_j)
        I.append(min_j)
        P.append(min_j)
    result = []
    for p in P:
        row = []
        for q in P:
            row.append(dm[p, q])
        result.append(row)
    return result
    # P.append(np.argmin(extracted_cols_rows))


df = data_util.get_dataframe(constants.DATASETS + '2_123.csv')
df.fillna(value='missing', inplace=True)
distance_matrix = simple_matching.get_dissimilarity_matrix(df.iloc[:, 1:])
# As in the source: the computed matrix is immediately overridden with a
# small random one, apparently for a quick smoke test of order().
distance_matrix = np.random.rand(10, 10)

b = np.random.randint(0, 76, size=(150, 150))  # random_integers is deprecated; randint's upper bound is exclusive
b_symm = (b + b.T) / 200
np.fill_diagonal(b_symm, 0)
print(np.matrix(b_symm))
# dm = [[0, 0.255, 0.505, 0.625, 0.87], [0.255, 0., 0.725, 0.715, 0.435], [0.505, 0.725, 0., 0.785, 0.64], [0.625, 0.715, 0.785, 0., 0.4],
#       [0.87, 0.435, 0.64, 0.4, 0.]]
print('================================================')
dm = order(distance_matrix)
#
print(np.matrix(dm))