def __init__(self):
        """Create the combined CRM/merchant reader and keep it on the instance.

        Loads ``crm.xlsx`` through ``CRMData.get_all_data`` and
        ``MerchantSumAmountPerDay.txt`` through
        ``MerchantData.all_processed_dataframe`` (both located under
        ``DATASET_DIR``), then stores ``CRMMerchant(crm_df, merchant_df)``
        as ``self.data_reader``.
        """
        crm_df = CRMData(os.path.join(DATASET_DIR, "crm.xlsx")).get_all_data()

        merchant_reader = MerchantData(
            os.path.join(DATASET_DIR, "MerchantSumAmountPerDay.txt"))
        merchant_df = merchant_reader.all_processed_dataframe()

        self.data_reader = CRMMerchant(crm_df, merchant_df)
def no_transaction_vs_harmonic():
    """Scatter-plot two columns of the cleaned merchant data.

    Reads ``MerchantSumAmountPerDay.txt`` from ``DATASET_DIR``, selects
    column 0 and column 1 of the cleaned data and passes column 1 followed
    by column 0 to ``plotlyvisualize.scatter``, writing to ``PLOT_OUT_DIR``.
    """
    reader = MerchantData(
        os.path.join(DATASET_DIR, "MerchantSumAmountPerDay.txt"))
    clean = reader.get_clean_data()

    # Column 0 is treated as the harmonic amount, column 1 as the number
    # of transactions.
    harmonic_col = clean[:, 0]
    transaction_col = clean[:, 1]

    # The label tuple lists "No Transaction" first, matching the order in
    # which the two series are passed.
    plotlyvisualize.scatter(
        transaction_col,
        harmonic_col,
        "Trans vs Harmonic",
        ("No Transaction", "Harmonic Sum"),
        out_path=PLOT_OUT_DIR,
    )
Beispiel #3
0
 def __init__(self, merchant_data_df):
     """Set up the merchant data, the CRM data and their combination.

     :param merchant_data_df: merchant data to use as-is. When ``None``,
         the data is loaded from ``Merchants-PerDay-GRTOneThousand.txt``
         under ``DATASET_DIR`` and indexed by ``merchant_number``.
     """
     if merchant_data_df is None:
         merchant_data_path = os.path.join(
             DATASET_DIR, "Merchants-PerDay-GRTOneThousand.txt")
         merchant_data_reader = MerchantData(merchant_data_path)
         loaded_df = merchant_data_reader.all_processed_dataframe()
         # Index the frame that was just loaded.  The argument is None on
         # this branch, so calling set_index on it raises AttributeError
         # and would throw the loaded data away.
         self._merchant_data_df = loaded_df.set_index("merchant_number")
         print(self._merchant_data_df)
     else:
         self._merchant_data_df = merchant_data_df

     crm_data_path = os.path.join(DATASET_DIR, "CRM-Senf-Merchant.xlsx")
     self.crm_data_reader = CRMData(crm_data_path)
     self._crm_df = self.crm_data_reader.get_all_data()
     print(self._crm_df)
     # Despite the ``_df`` suffix this attribute holds the CRMMerchant
     # object itself.
     self._crm_merchant_df = CRMMerchant(self._crm_df,
                                         self._merchant_data_df)
def no_transactions_vs_sum_amounts():
    """Scatter-plot log(column 1) against column 2 of the cleaned data.

    Reads ``MerchantSumAmountPerDay.txt`` from ``DATASET_DIR``, applies
    ``np.log`` to column 1 of the cleaned data and passes it, together
    with the untransformed column 2, to ``plotlyvisualize.scatter``,
    writing to ``PLOT_OUT_DIR``.
    """
    reader = MerchantData(
        os.path.join(DATASET_DIR, "MerchantSumAmountPerDay.txt"))
    clean = reader.get_clean_data()

    # Column 1 is treated as the number of transactions, column 2 as the
    # summed amounts.
    raw_counts = clean[:, 1]
    amounts = clean[:, 2]

    # Only the transaction counts are log-transformed; the amounts are
    # plotted as read.
    log_counts = np.log(raw_counts)

    plotlyvisualize.scatter(
        log_counts,
        amounts,
        "No Transactions vs Sum Amounts",
        ("No Transaction", "Sum Amounts"),
        out_path=PLOT_OUT_DIR,
    )
Beispiel #5
0
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn.neighbors import KernelDensity
import os
from readdata.merchantdata import MerchantData

# Relative directory names for the input data and for generated plots.
DATASET_DIR = "dataset"
PLOT_OUT_DIR = "plotsout"

# Load the cleaned merchant data.
merchant_data_path = os.path.join(DATASET_DIR, "MerchantSumAmountPerDay.txt")
data_reader = MerchantData(merchant_data_path)
X = data_reader.get_clean_data()

# Data Selection
no_transaction = X[:, 1]  # Frequency
sum_amounts = X[:, 2]  # Money

# Plot a 1D density example
# NOTE(review): ``N = 100`` is overwritten two lines below before it is read,
# and nothing in the visible lines draws random numbers after the seed is
# set; both look like leftovers from the commented-out synthetic-data
# variant. Confirm before removing.
N = 100
np.random.seed(1)
N = no_transaction.shape[0]
# Add a trailing axis to the selected column; from here on X is rebound from
# the full cleaned data to this reshaped column.
X = no_transaction[:,
                   np.newaxis]  #np.random.normal(0, 1, 0.3 * N)[:, np.newaxis]
# 1000 evenly spaced evaluation points between the smallest and largest
# value in X, also with a trailing axis.
X_plot = np.linspace(np.min(X), np.max(X), 1000)[:, np.newaxis]

fig, ax = plt.subplots()

# Fit one kernel density estimate per kernel name and score the evaluation
# points. ``log_dens`` is not used further within the visible lines.
for kernel in ['gaussian', 'tophat', 'epanechnikov']:
    kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X)
    log_dens = kde.score_samples(X_plot)
Beispiel #6
0
from sklearn import datasets
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import preprocessing
from readdata.crmmerchant import CRMMerchant
from readdata.merchantdata import MerchantData
from readdata.crmdata import CRMData
from visualization import plotlyvisualize
import os

# Relative directory names for the input data and for generated plots.
DATASET_DIR = "dataset"
PLOT_OUT_DIR = "plotsout"

# Data Read
# Merchant side: processed contents of MerchantSumAmountPerDay.txt.
merchant_data_path = os.path.join(DATASET_DIR, "MerchantSumAmountPerDay.txt")
merchant_data_reader = MerchantData(merchant_data_path)
merchant_df = merchant_data_reader.all_processed_dataframe()

# CRM side: everything CRMData reads from crm.xlsx.
crm_data_path = os.path.join(DATASET_DIR, "crm.xlsx")
crm_data_reader = CRMData(crm_data_path)
crm_df = crm_data_reader.get_all_data()

# Combine both sources. Despite the ``_df`` suffix this name is bound to the
# CRMMerchant object itself.
crm_merchant_df = CRMMerchant(crm_df, merchant_df)

# Disabled column extraction / histogram experiment, kept for reference.
# harmonic_amount = crm_merchant_df[["harmonic"]].as_matrix().astype(np.float)
# no_transaction = crm_merchant_df[["all_transactions"]].as_matrix().astype(np.float)
# sum_amounts = crm_merchant_df[["sum_amounts"]].as_matrix().astype(np.float)
# senf_code = crm_merchant_df["senf_code"].tolist()
# plotlyvisualize.histogram(senf_code)

# Guild series as provided by the CRM reader; not used further within the
# visible lines.
mm = crm_data_reader.get_guild_series()
def no_transaction_per_day():
    """Draw a histogram of the per-day transaction data.

    Reads ``MerchantSumAmountPerDay.txt`` from ``DATASET_DIR`` and passes
    the result of ``MerchantData.get_transaction_per_day`` to
    ``plotlyvisualize.histogram``.
    """
    reader = MerchantData(
        os.path.join(DATASET_DIR, "MerchantSumAmountPerDay.txt"))
    plotlyvisualize.histogram(reader.get_transaction_per_day())
 def __init__(self):
     """Create the merchant data reader and a ``MinMaxScaler``.

     The reader is a ``MerchantData`` for ``MerchantSumAmountPerDay.txt``
     under ``DATASET_DIR``.
     """
     self.data_reader = MerchantData(
         os.path.join(DATASET_DIR, "MerchantSumAmountPerDay.txt"))
     self.scaler = MinMaxScaler()
class BoxCoxTransformation(object):
    """
    Transform merchant data into another space using the Box-Cox power
    transformation (the plain logarithm is its special case lambda = 0).
    Because the data is assumed to follow an exponential-like distribution,
    a log-type transformation is a natural choice.
    References:
        https://en.wikipedia.org/wiki/Data_transformation_(statistics)
    """
    @timethis
    def __init__(self):
        """Create the merchant data reader and a ``MinMaxScaler``.

        The reader is a ``MerchantData`` for ``MerchantSumAmountPerDay.txt``
        under ``DATASET_DIR``.
        """
        self.data_reader = MerchantData(
            os.path.join(DATASET_DIR, "MerchantSumAmountPerDay.txt"))
        self.scaler = MinMaxScaler()

    @timethis
    def _merchant_df(self):
        """Return the result of ``self.data_reader.selected_dataframe()``."""
        selected = self.data_reader.selected_dataframe()
        return selected

    @timethis
    def get_boxcox_no_transactions(self):
        """
        Box-Cox transform the ``all_transactions`` column of the merchant data.

        :return: transformed values, indexed like the source column
        :rtype: pandas.core.series.Series
        """
        source = self._merchant_df()["all_transactions"]
        # Called without an explicit lambda, stats.boxcox returns the
        # transformed data together with the fitted lambda; only the
        # transformed data is kept.
        transformed, _fitted_lambda = stats.boxcox(source)
        return pd.Series(data=transformed, index=source.index)

    @timethis
    def get_boxcox_sum_amounts(self):
        """
        Box-Cox transform the ``sum_amounts`` column of the merchant data.

        :return: transformed values, indexed like the source column
        :rtype: pandas.core.series.Series
        """
        source = self._merchant_df()["sum_amounts"]
        # Called without an explicit lambda, stats.boxcox returns the
        # transformed data together with the fitted lambda; only the
        # transformed data is kept.
        transformed, _fitted_lambda = stats.boxcox(source)
        return pd.Series(data=transformed, index=source.index)

    @timethis
    def get_boxcox_harmonic(self):
        """
        Box-Cox transform the ``harmonic`` column of the merchant data.

        :return: transformed values, indexed like the source column
        :rtype: pandas.core.series.Series
        """
        source = self._merchant_df()["harmonic"]
        # Only the transformed data is kept; the fitted lambda is dropped.
        transformed, _fitted_lambda = stats.boxcox(source)
        return pd.Series(data=transformed, index=source.index)

    @timethis
    def kolmogrov_smirnov_on_no_transactions_test(self):
        """
        Run a normality test on the Box-Cox transformed transaction counts.

        Despite the method name, the test that is executed is
        ``stats.shapiro`` (Shapiro-Wilk), applied to the slice ``[0:4000]``
        of the transformed series.

        :return: whatever ``stats.shapiro`` returns for that slice
        """
        sample = self.get_boxcox_no_transactions()[0:4000]
        return stats.shapiro(sample)

    @timethis
    def kolmogrov_smirnov_on_sum_amounts_test(self):
        """
        Run a normality test on the Box-Cox transformed sum amounts.

        Despite the method name, the test that is executed is
        ``stats.shapiro`` (Shapiro-Wilk), applied to the slice ``[0:4000]``
        of the transformed series.

        :return: whatever ``stats.shapiro`` returns for that slice
        """
        sample = self.get_boxcox_sum_amounts()[0:4000]
        return stats.shapiro(sample)

    @timethis
    def get_no_transactions_vs_sum_amounts_df(self):
        """
        Combine the transaction-count and sum-amount series column-wise.

        :return: dataframe with the columns ``all_transactions`` and
            ``sum_amounts``
        :rtype: pandas.core.frame.DataFrame
        """
        reader = self.data_reader
        combined = pd.concat(
            [
                reader.get_sum_transactions_series(),
                reader.get_sum_amounts_series(),
            ],
            axis=1,
        )
        combined.columns = ["all_transactions", "sum_amounts"]
        return combined

    def get_all_merchants_df(self):
        """
        Combine the transaction-count, sum-amount and harmonic series
        column-wise.

        :return: dataframe with the columns ``all_transactions``,
            ``sum_amounts`` and ``harmonic``
        :rtype: pandas.core.frame.DataFrame
        """
        reader = self.data_reader
        combined = pd.concat(
            [
                reader.get_sum_transactions_series(),
                reader.get_sum_amounts_series(),
                reader.get_harmonic_series(),
            ],
            axis=1,
        )
        combined.columns = ["all_transactions", "sum_amounts", "harmonic"]
        return combined

    @timethis
    def get_boxcox_transactions_vs_sum_amount(self, visualize=False):
        """
        Return the Box-Cox transformed transaction counts and sum amounts,
        optionally scatter-plotting one against the other.

        :param visualize: to visualize results or not
        :type visualize: bool
        :return: the tuple (Box-Cox number of transactions, Box-Cox sum amounts)
        :rtype: tuple
        """
        transformed_pair = (
            self.get_boxcox_no_transactions(),
            self.get_boxcox_sum_amounts(),
        )

        if visualize:
            counts, amounts = transformed_pair
            plotlyvisualize.scatter(
                counts,
                amounts,
                title="Box Cox Transformation for No Transactions vs Sum Amounts",
                axis_labels=("BoxCox No Transactions", "BoxCox Sum Amounts"),
                out_path=PLOT_OUT_DIR)

        return transformed_pair

    @timethis
    def kmeans_no_transactions_sum_amounts(self,
                                           num_clusters=2,
                                           visualize_real_scale=False,
                                           visualize_boxcox_scale=False):
        """
        Cluster merchants with k-means on the Box-Cox transformed number of
        transactions and sum amounts.

        The clustering is fitted in the transformed space; the resulting
        labels are then attached to the untransformed data by index.

        :param num_clusters: number of clusters for kmeans
        :type num_clusters: int
        :param visualize_real_scale: to visualize in real scale space or not
        :type visualize_real_scale: bool
        :param visualize_boxcox_scale: to visualize in Box-Cox space or not
        :type visualize_boxcox_scale: bool
        :return: untransformed ``all_transactions`` / ``sum_amounts``
            dataframe with an added ``labels`` column holding the cluster
            of each row
        :rtype: pandas.core.frame.DataFrame
        """
        # Fixed seed so repeated runs give the same cluster assignment.
        random_state = 170

        boxcox_df = pd.concat(
            [self.get_boxcox_no_transactions(),
             self.get_boxcox_sum_amounts()],
            axis=1)
        boxcox_df.columns = ['all_transactions', 'sum_amounts']

        # DataFrame.as_matrix() and the np.float alias no longer exist in
        # current pandas / NumPy; ``.values`` and the builtin ``float`` are
        # their direct equivalents and also work on old versions.
        X = boxcox_df.values.astype(float)
        y_pred = KMeans(n_clusters=num_clusters,
                        random_state=random_state).fit_predict(X)
        labels_series = pd.Series(data=y_pred, index=boxcox_df.index)

        no_transactions_vs_sum_amounts_df = (
            self.get_no_transactions_vs_sum_amounts_df())
        no_transactions_vs_sum_amounts_df["labels"] = labels_series

        # The per-cluster slices are only needed for plotting, so they are
        # built only when the corresponding plot is requested.
        if visualize_boxcox_scale:
            boxcox_df["labels"] = labels_series
            boxcox_traces = [
                boxcox_df[boxcox_df["labels"] == cluster_num]
                for cluster_num in range(num_clusters)
            ]
            plotlyvisualize.scatter_by_cluster(
                boxcox_traces,
                columns=["all_transactions", "sum_amounts", "labels"],
                title="Kmeans for BoxCox Scale",
                axis_labels=["BoxCox No Transacions", "BoxCox Sum Amounts"],
                out_path=PLOT_OUT_DIR)

        if visualize_real_scale:
            traces = [
                no_transactions_vs_sum_amounts_df[
                    no_transactions_vs_sum_amounts_df["labels"] == cluster_num]
                for cluster_num in range(num_clusters)
            ]
            plotlyvisualize.scatter_by_cluster(
                traces,
                columns=["all_transactions", "sum_amounts", "labels"],
                title="Kmeans for Real Scale",
                axis_labels=["No Transacions", "Sum Amounts"],
                out_path=PLOT_OUT_DIR)

        return no_transactions_vs_sum_amounts_df

    @timethis
    def kmeans(self,
               num_clusters=2,
               visualize_real_scale=False,
               visualize_boxcox_scale=False):
        """
        Cluster merchants with k-means on three Box-Cox transformed
        features: number of transactions, sum amounts and harmonic.

        The clustering is fitted in the transformed space; the resulting
        labels are then attached to the untransformed data by index.

        :param num_clusters: number of clusters for kmeans
        :type num_clusters: int
        :param visualize_real_scale: to draw a 3D scatter of the
            untransformed data per cluster or not
        :type visualize_real_scale: bool
        :param visualize_boxcox_scale: to draw a 3D scatter of the Box-Cox
            transformed data per cluster or not
        :type visualize_boxcox_scale: bool
        :return: untransformed ``all_transactions`` / ``sum_amounts`` /
            ``harmonic`` dataframe with an added ``labels`` column holding
            the cluster of each row
        :rtype: pandas.core.frame.DataFrame
        """
        # Fixed seed so repeated runs give the same cluster assignment.
        random_state = 170

        boxcox_all_merchants_df = pd.concat(
            [
                self.get_boxcox_no_transactions(),
                self.get_boxcox_sum_amounts(),
                self.get_boxcox_harmonic(),
            ],
            axis=1)
        boxcox_all_merchants_df.columns = [
            "all_transactions", "sum_amounts", "harmonic"
        ]

        # DataFrame.as_matrix() and the np.float alias no longer exist in
        # current pandas / NumPy; ``.values`` and the builtin ``float`` are
        # their direct equivalents and also work on old versions.
        X = boxcox_all_merchants_df.values.astype(float)
        y_pred = KMeans(n_clusters=num_clusters,
                        random_state=random_state).fit_predict(X)
        labels_series = pd.Series(data=y_pred,
                                  index=boxcox_all_merchants_df.index)

        all_merchants_df = self.get_all_merchants_df()
        all_merchants_df["labels"] = labels_series

        # The per-cluster slices are only needed for plotting, so they are
        # built only when the corresponding plot is requested.
        if visualize_boxcox_scale:
            boxcox_all_merchants_df["labels"] = labels_series
            boxcox_traces = [
                boxcox_all_merchants_df[
                    boxcox_all_merchants_df["labels"] == cluster_num]
                for cluster_num in range(num_clusters)
            ]
            plotlyvisualize.scatter3d(boxcox_traces,
                                      columns=[
                                          "all_transactions", "sum_amounts",
                                          "harmonic", "labels"
                                      ],
                                      title="Kmeans for BoxCox Scale",
                                      out_path=PLOT_OUT_DIR)

        if visualize_real_scale:
            traces = [
                all_merchants_df[all_merchants_df["labels"] == cluster_num]
                for cluster_num in range(num_clusters)
            ]
            plotlyvisualize.scatter3d(traces,
                                      columns=[
                                          "all_transactions", "sum_amounts",
                                          "harmonic", "labels"
                                      ],
                                      title="Kmeans for Real Scale",
                                      out_path=PLOT_OUT_DIR)

        return all_merchants_df