Example No. 1
def kernel(table, gamma=1, dbtype='mixed'):
    if dbtype == 'mixed':
        dist = gower.gower_matrix(table)
        # kernel = np.power(table.shape[0], -gamma*dist)
        kernel = np.exp(-gamma * dist)
        # kernel = rbf_kernel(table, gamma)
        # kernel = np.power(dist, np.shape(table)[0])
    else:
        raise ValueError("only dbtype='mixed' is supported here")
    return kernel
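A minimal usage sketch for the function above (the DataFrame and its columns are purely illustrative; `gower` and `numpy` are assumed to be imported as in the snippet):

import numpy as np
import pandas as pd
import gower

# Hypothetical mixed-type table: two numeric columns and one categorical column
table = pd.DataFrame({
    'age': [25, 40, 31],
    'income': [30000.0, 52000.0, 41000.0],
    'gender': ['M', 'F', 'F'],
})

K = kernel(table, gamma=1, dbtype='mixed')
print(K.shape)  # (3, 3): RBF-style similarities built from Gower distances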
Example No. 2
 def _define_noise_examples(self):
     distances = gower.gower_matrix(self._attrs)
     min_distances = [
         self._cal_dNN(distances, indx)
         for indx in range(self._labels.shape[0])
     ]
     min_distances = pd.DataFrame(min_distances, columns=["distance"])
     min_distances.sort_values("distance", ascending=False, inplace=True)
     return list(min_distances[:self._num_noise].index)
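`self._cal_dNN` is not shown in this example; a plausible implementation, assuming it returns the Gower distance from observation `indx` to its nearest neighbour (a sketch only, with `numpy` imported as `np`):

 def _cal_dNN(self, distances, indx):
     # Distance from observation `indx` to its nearest neighbour,
     # ignoring the zero self-distance on the diagonal
     row = np.delete(distances[indx], indx)
     return row.min()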
Example No. 3
def kernel_function(proto_critic, input):
    gamma = 1
    kernel = np.array([])
    for idx in range(input.shape[0]):
        dist = gower.gower_matrix(data_x=proto_critic,
                                  data_y=np.reshape(input[idx, :],
                                                    (1, -1))).reshape((-1))
        if idx == 0:
            kernel = np.exp(-gamma * dist)
        else:
            kernel = np.vstack((kernel, np.exp(-gamma * dist)))
    return kernel
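A small usage sketch for the function above (the arrays are purely illustrative; `numpy` and `gower` are assumed to be imported as in the snippet):

import numpy as np
import gower

# Hypothetical numeric data: 3 prototypes/critics and 5 query points, 2 features each
proto_critic = np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]])
queries = np.array([[0.5, 1.5], [1.0, 2.0], [3.0, 3.0], [4.0, 4.5], [2.5, 0.5]])

K = kernel_function(proto_critic, queries)
print(K.shape)  # (5, 3): one row of RBF-style similarities per query point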
Example No. 4
def calc_gowers(df, continuous_columns):
    """Function to simplify calculating Gower's distance
    
    Args:
        df (pandas.DataFrame): dataframe of observations for which to calculate Gower's distance \n
        continuous_columns (list): list of integers identifying the indexes of columns that are continuous
        
    Returns:
        gow_dists (numpy.array): a numpy array of Gower's distances between observations
    """
    cat_list = make_categorical_list(continuous_columns, len(df.columns) - 5)
    data_np = df.iloc[:, 5:].to_numpy()
    gow_dists = gower.gower_matrix(data_np, cat_features=cat_list)
    return gow_dists
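`make_categorical_list` is not defined in this snippet; a plausible version, assuming it builds a boolean mask over the columns kept after the `iloc[:, 5:]` slice (True = categorical, False = continuous), might look like:

def make_categorical_list(continuous_columns, n_columns):
    # True marks a categorical column, False a continuous one
    return [i not in continuous_columns for i in range(n_columns)]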
Example No. 5
def calculate_distance(df):
    # list of non boolean columns that require preprocessing
    non_boolean_cols = [
        'idade_empresa_anos', 'idade_maxima_socios', 'idade_media_socios',
        'idade_minima_socios', 'qt_filiais', 'qt_socios',
        'qt_socios_st_regular'
    ]

    # normalizing the non boolean columns
    df = min_max_col(df, non_boolean_cols)

    # calculating the gower distance matrix
    dissimilarity_matrix = gower.gower_matrix(df)
    return dissimilarity_matrix
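`min_max_col` is not shown in this example; a plausible helper that scales the listed columns to the [0, 1] range (a sketch under that assumption) could be:

def min_max_col(df, cols):
    # Min-max scale each listed column to [0, 1]
    df = df.copy()
    for col in cols:
        col_min, col_max = df[col].min(), df[col].max()
        rng = col_max - col_min
        df[col] = (df[col] - col_min) / rng if rng != 0 else 0.0
    return df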
Example No. 6
def N1(X, y, cat_features=[]):
    """
    Calculate Fraction of Borderline Points (N1)
      - X: ndarray features
      - y: ndarray target
      - cat_features: a boolean array that specifies categorical features
    """

    if len(cat_features) == 0:
        cat_features = np.zeros(X.shape[-1], dtype=bool)

    # Calculate Gower distance matrix
    distance_matrix = gower.gower_matrix(X, cat_features=cat_features)

    # Generate a Minimum Spanning Tree
    tree = MST(distance_matrix)
    sub = tree[y[tree[:, 0]] != y[tree[:, 1]]]
    vertices = np.unique(sub.flatten())
    return len(vertices) / X.shape[0]
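`MST` is an assumed helper here that returns the edges of a minimum spanning tree as an (n_edges, 2) array of vertex indices. One possible sketch of such a helper on top of SciPy (not necessarily the author's implementation):

import numpy as np
from scipy.sparse.csgraph import minimum_spanning_tree

def MST(distance_matrix):
    # Minimum spanning tree over the complete graph defined by the distance matrix;
    # return its edges as an (n_edges, 2) array of vertex indices.
    # (Edges whose weight is exactly zero are dropped by the sparse representation.)
    tree = minimum_spanning_tree(distance_matrix)
    rows, cols = tree.nonzero()
    return np.column_stack((rows, cols))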
Example No. 7
def cluster(df, random_state=None):
    """Use Gower distances and K-Medoids to cluster the data into between 2 and 8 clusters.
    Uses silhouette analysis to determine the optimal number of clusters.

        Args:
            df (pandas.DataFrame): The data in a pandas dataframe
            random_state (int): Can be used to fix the random state - ideal for testing

        Returns:
            (array): an array of the cluster assignments
            (int): the number of clusters used
        """
    # Compute the Gower distance matrix
    # NOTE: large datasets are slow to process since the distance matrix is n x n
    matrix = gower.gower_matrix(df)

    # Use silhouette analysis to determine the optimal number of clusters
    # between 2 and 8 clusters
    res = []
    for k in range(2, 9):

        # must have enough samples, i.e. k < n - 1
        if k < len(matrix) - 1:
            k_medoids = KMedoids(n_clusters=k,
                                 metric='precomputed',
                                 random_state=random_state).fit(matrix)

            # Catch exceptions here and set the score to -1 (worst)
            try:
                silhouette_avg = silhouette_score(matrix,
                                                  k_medoids.labels_,
                                                  metric='precomputed')
                res.append([k, silhouette_avg])

            # A single resulting cluster raises an error, so give this k the worst score
            except ValueError:
                res.append([k, -1])

    # Best cluster has the value closest to 1 from the range -1 to 1
    best_cluster = max(res, key=lambda x: x[1])

    # Refit with the best number of clusters
    k_medoids = KMedoids(n_clusters=best_cluster[0],
                         metric='precomputed',
                         random_state=random_state).fit(matrix)

    return k_medoids.labels_, best_cluster[0]
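A brief usage sketch for the function above (the DataFrame is purely illustrative, and `gower`, `KMedoids` and `silhouette_score` are assumed to be imported by the surrounding module):

import pandas as pd

# Hypothetical mixed-type data: one categorical and one numeric column
toy = pd.DataFrame({
    'colour': ['red', 'blue', 'red', 'green', 'blue', 'red'],
    'weight': [1.2, 3.4, 1.1, 2.8, 3.5, 1.3],
})

labels, n_clusters = cluster(toy, random_state=0)
print(labels, n_clusters)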
Example No. 8
def test_answer():
    Xd = pd.DataFrame({
        'age': [21, 21, 19, 30, 21, 21, 19, 30, None],
        'gender': ['M', 'M', 'N', 'M', 'F', 'F', 'F', 'F', None],
        'civil_status': [
            'MARRIED', 'SINGLE', 'SINGLE', 'SINGLE', 'MARRIED', 'SINGLE',
            'WIDOW', 'DIVORCED', None
        ],
        'salary': [
            3000.0, 1200.0, 32000.0, 1800.0, 2900.0, 1100.0, 10000.0, 1500.0,
            None
        ],
        'has_children': [1, 0, 1, 1, 1, 0, 0, 1, None],
        'available_credit':
        [2200, 100, 22000, 1100, 2000, 100, 6000, 2200, None]
    })
    Yd = Xd.iloc[1:3, :]
    X = np.asarray(Xd)
    Y = np.asarray(Yd)
    aaa = gower.gower_matrix(X)
    assert aaa[0][1] == pytest.approx(0.3590238, 0.001)
Example No. 9
cat_features = []
for col in df.columns:
    if col not in num_features:
        cat_features.append(col)

# scale standardization of numerical values
df_num = pd.DataFrame(StandardScaler().fit_transform(df[num_features]),columns=num_features)
df_cat = df.drop(columns=num_features)
df_std = df_cat.merge(df_num,left_index=True,right_index=True,how='left')
df_w_name = df_std
df_w_name.head()
df_std = df_std.set_index('DrinkName')

# generate similarity matrix
distance_matrix = gower.gower_matrix(df_std)
#create complete linkage
Zd = linkage(distance_matrix,method='complete') 

# hierarchical clustering visualization
fig,axs = plt.subplots(1,1,figsize=(25,5))
dn = dendrogram(Zd, truncate_mode='level',p=6,show_leaf_counts=True,ax=axs)

# find optimal k clusters
results = {}
for k in range(2,12):
    cluster_array = fcluster(Zd,k,criterion='maxclust')
    score = silhouette_score(distance_matrix,cluster_array,metric='precomputed')
    results[k] = score

plt.plot([i for i in results.keys()],[j for j in results.values()],label='gower')
Example No. 10
from scipy.cluster.hierarchy import fcluster
link = shc.linkage(rez[['Category', 'Region', 'Company Age', 'Number of Employees',
                        '6 Month Growth', 'Number of Investors', 'Supported Languages',
                        'Price Availability', 'Price Range', 'Discount for Smallest Package',
                        'Discount for Biggest Package', 'Monthly Subscription',
                        'Yearly Subscription', 'Localization', 'Customization', 'Freemium',
                        'Free Trial', 'Number of Versions', 'Segmentation', 'Per Feature',
                        'Per User', 'One Time', 'Pay As You Go', 'Volume-based Price',
                        'Fixed Price']], method='ward')
rez['cluster'] = fcluster(link,10.2,criterion = 'distance')
rez.groupby('cluster').mean()


# In[590]:


import gower
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

dm = gower.gower_matrix(rez[['Price Availability', 'Price Range', 'Discount for Smallest Package',
                             'Discount for Biggest Package', 'Monthly Subscription',
                             'Yearly Subscription', 'Localization', 'Customization', 'Freemium',
                             'Free Trial', 'Number of Versions', 'Segmentation', 'Per Feature',
                             'Per User', 'One Time', 'Pay As You Go', 'Volume-based Price',
                             'Fixed Price']])
Zd = linkage(dm)


# In[591]:



plt.figure(figsize=(40, 50), dpi= 80)  
plt.title("Dendrogram", fontsize=30)
dendrogram(Zd)
plt.xticks(fontsize=10)
plt.show()


# In[483]:
    # combinedDataset_allUsers.csv is the file created for experiment one by 'output.py'
    df = pandas.read_csv(f"{current_directory}\\combinedDataset_allUsers.csv")

    df = df.drop(
        ["level", "temperature", "voltage", "status", "health", "output"],
        axis=1)  # I want Phone Usage
    df["Connectivity"] = df["Cellular"] | df["WiFi"]
    df = df.drop(["WiFi", "Cellular", "isInteractive"],
                 axis=1)  # Cause of correlation between columns

    # Uncomment for dataframe description #
    #print(df.describe())
    #plotCorr(df)

    # After the first run, you don't have to compute the distance matrix again. You can read it from the pickle file
    distance = gower.gower_matrix(df)
    with open("distance.pickle", "wb") as f:
        pickle.dump(distance, f)
    #distance = pickle.load( open( "distance.pickle", "rb" ) )
    print("Done with distance!")
    del df

    # Compute the condensed distance matrix for the linkage and cophenetic
    condensedDst = squareform(distance)
    #del distance
    ''' From the linkage function Definition
    Methods 'centroid', 'median' and 'ward' are correctly defined only if Euclidean pairwise metric is used. If `y` is passed as precomputed
    pairwise distances, then it is a user responsibility to assure that these distances are in fact Euclidean, otherwise the produced result
    will be incorrect.
    '''
Example No. 12
                le_dict[col] = deepcopy(le)

            nj, nj_bin, nj_ord, nj_categ = compute_nj(train, var_distrib)
            nb_cont = np.sum(var_distrib == 'continuous')

            # Feature category (cf)
            dtype = {
                train.columns[j]: dtypes_dict[var_distrib[j]]
                for j in range(p)
            }

            train = train.astype(dtype, copy=True)
            numobs = len(train)

            # Defining distances over the features
            dm = gower_matrix(train, cat_features=cat_features)

            #*****************************************************************
            # Sampling rules
            #*****************************************************************
            authorized_ranges = np.expand_dims(
                np.stack([[-np.inf, np.inf] for var in var_distrib]).T, 1)

            if sub_design == 'bivarié':
                # Sample only women older than 60
                authorized_ranges[:, 0, 0] = [60, 100]  # older than 60 years

                # Keep only women
                sex_idx = np.argmax(varnames == 'sex')
                women_idx = np.argmax(le_dict['sex'].classes_ == 'Female')
Example No. 13
 def categorical_gower():
     print('using categorical data - Gower', end=' ')
     categorical_df = pd.DataFrame(
         gower_matrix(videos_df.loc[:, category_columns],
                      cat_features=[True for v in category_columns])
     ).set_index(videos_df.index)
     print('> added {} columns'.format(len(categorical_df.columns)))
     return categorical_df
Example No. 14
# Feature category (cf)
cf_non_enc = np.logical_or(vd_categ_non_enc == 'categorical',
                           vd_categ_non_enc == 'bernoulli')

# Non encoded version of the dataset:
#y_nenc_typed = y_categ_non_enc.astype(np.object)
#y_np_nenc = y_nenc_typed.values


dtype = {y.columns[j]: np.float64 if (var_distrib[j] != 'bernoulli') and \
        (var_distrib[j] != 'categorical') else str for j in range(p_new)}

y = y.astype(dtype, copy=True)

# !!! Defining distances over the non encoded features
dm = gower_matrix(y, cat_features=cf_non_enc)

#===========================================#
# Running the algorithm
#===========================================#

nb_pobs = 100  # Target for pseudo observations
r = np.array([2, 1])
numobs = len(y)
k = [n_clusters]

seed = 1
init_seed = 2

eps = 1E-05
it = 50
Example No. 15
import numpy as np
import pandas as pd
import gower
from sklearn.neighbors import DistanceMetric
from scipy.sparse import issparse

df = pd.read_csv(
    "C:/Users/ahhua/Documents/Github/capstone/Data/processed_data.csv")
df = df.drop([df.columns[0]], axis=1)

df = df.drop(['Title', 'Size', 'Link', 'Filename', 'Src'], axis=1)
cate = [False]
cate.extend([True] * 43)
cate.extend([False, True, False, False, False, False])
weights = [2] + (43 * [1]) + (6 * [5])
print('q')
g = gower.gower_matrix(df, weight=np.array(weights), cat_features=cate)
print('r')
pred = gower.gower_topn(df.iloc[0:2, :],
                        df.iloc[:, ],
                        n=5,
                        weight=np.array(weights),
                        cat_features=cate)
print(pred)
'''
area = DistanceMetric.get_metric('manhattan').pairwise(df[['Area']])
area = area/max(np.ptp(df['Area']),1)

num_colors = DistanceMetric.get_metric('manhattan').pairwise(df[['num_colors']])
num_colors = num_colors/max(np.ptp(df['num_colors']),1)

complexity = DistanceMetric.get_metric('manhattan').pairwise(df[['complexity']])
Example No. 16
    nan_mask = y.isnull()
    cat_features = var_distrib == 'categorical'
    y, le_dict = data_processing(y, var_distrib)
    y = y.where(~nan_mask, np.nan)

    nj, nj_bin, nj_ord, nj_categ = compute_nj(full_contra, var_distrib)
    nb_cont = np.sum(var_distrib == 'continuous')

    # Feature category (cf)
    dtype = {y.columns[j]: dtypes_dict[var_distrib[j]] for j in range(p)}

    full_contra = full_contra.astype(dtype, copy=True)
    complete_y = y[~y.isna().any(axis=1)].astype(dtype, copy=True)

    # Defining distance matrix
    dm = gower_matrix(complete_y, cat_features=cat_features)

    #===========================================#
    # Hyperparameters
    #===========================================#

    n_clusters = 4
    nb_pobs = 100  # Target for pseudo observations
    r = np.array([2, 1])
    numobs = len(y)
    k = [n_clusters]

    seed = 1
    init_seed = 2

    eps = 1E-05
Example No. 17
def peptide_identification(args):
    print(datetime.now(), ': Peptide identification starts...')
    print('Settings: ')
    print(args)

    # PLATO setting
    subclusterCount = args.subclusterCount
    spy = args.spy
    spy_portion = args.spy_portion
    RN = args.RN
    rnd_all = args.rnd_all  # If random method, include all decoys
    rnd_portion = args.rnd_portion  # If random method, include rnd.portion of positive set, default 1: pos set = neg set
    replicates_cnt = args.replicates_cnt
    include_label = args.include_label
    AML_preprocess = args.AML_preprocess
    output_folder = args.output_folder

    # AutoML parameter setting
    autoML_best_model_selection = args.autoML_best_model_selection
    autoML_iterations = args.autoML_iterations

    metric = args.metric  # Other metrics: azureml.train.automl.utilities.get_primary_metrics('classification')
    cv_fold = args.cv_fold

    # Input, output
    file_name = args.sample_name
    input_path = args.input_folder
    output_path = output_folder + '/' + file_name
    log_file = output_path + '_autoML_errors_log.html'

    # Instantiate AutoML config and create an experiment in autoML workspace
    ws = Workspace.from_config()
    experiment_name = file_name
    experiment = Experiment(ws, experiment_name)
    print(datetime.now(),
          ': Assigned experiment ' + experiment_name + ' on Azure portal ')

    output = {}
    output['SDK version'] = azureml.core.VERSION
    output['Workspace Name'] = ws.name
    output['Resource Group'] = ws.resource_group
    output['Location'] = ws.location
    outputDf = pd.DataFrame(data=output, index=[''])
    print(outputDf)

    print(datetime.now(), ': Reading inputs')
    # Read POSITIVES and ALL inputs
    positives_path = glob.glob(input_path + file_name + '*POSITIVES*')
    raw_positives = pd.read_csv(positives_path[0], sep='\t')

    if AML_preprocess == True:
        all_path = glob.glob(input_path + file_name + '-ALL.txt')
        raw_all = pd.read_csv(all_path[0], sep='\t')
        # Extract new features
        # First and last three amino acids of the peptide sequence as features - if NA then category B
        raw_all['Peptide'] = raw_all.Peptide.str.replace(
            r'([\(\[]).*?([\)\]])', r'B', regex=True)
        raw_all['P1'] = raw_all['Peptide'].str[0]
        raw_all['P2'] = raw_all['Peptide'].str[2]
        raw_all['P3'] = raw_all['Peptide'].str[3]
        raw_all['P4'] = raw_all['Peptide'].str[-4]
        raw_all['P5'] = raw_all['Peptide'].str[-3]
        raw_all['P6'] = raw_all['Peptide'].str[-1]

    else:
        all_path = glob.glob(input_path + file_name +
                             '_percolator_feature.txt')
        raw_all = pd.read_csv(all_path[0], sep='\t')

    raw_all['Class'] = 0

    # Make positive and test set
    test_data = raw_all.drop(['ScanNr', 'Proteins'], axis=1)
    positive_set = pd.merge(left=pd.DataFrame(raw_positives['SpecId']),
                            right=pd.DataFrame(test_data),
                            how='left',
                            left_on='SpecId',
                            right_on='SpecId')
    positive_set['Class'] = 1

    # Remove decoys from the positive set, if there are any
    decoys_in_positive_idx = positive_set.index[positive_set['Label'] ==
                                                -1].tolist()
    positive_set = positive_set[positive_set['Label'] != -1]

    # Dataframe to store predictions
    all_predictions = pd.DataFrame({
        'SpecId': list(test_data['SpecId']),
        'Peptide': list(test_data['Peptide']),
        'Label': list(test_data['Label'])
    })
    prediction_summary = all_predictions

    # Prepare test set for modeling
    y_test = test_data['Class']
    if include_label == True:
        X_test = test_data.drop(['SpecId', 'Peptide', 'Class'], axis=1)
    else:
        X_test = test_data.drop(['SpecId', 'Peptide', 'Label', 'Class'],
                                axis=1)

    # Prepare positive set for modeling
    positive_set_idx = [
        test_data['SpecId'].tolist().index(x)
        for x in positive_set['SpecId'].tolist()
        if x in test_data['SpecId'].tolist()
    ]

    # Used to create the negative set
    decoys_idx = np.setdiff1d(
        test_data.index[test_data['Label'] == -1].tolist(),
        decoys_in_positive_idx).tolist()

    global gower_dist_avg
    if RN == True:
        if os.path.exists(input_path + file_name +
                          'gower_dist_avg.npy') == False:
            print(datetime.now(), ': Calculating Gower distance')
            gower_dist = gower.gower_matrix(test_data)
            selected_rows = gower_dist[positive_set_idx]
            gower_dist_avg = np.mean(selected_rows, axis=0)
            print(datetime.now(), ': Saving Gower distance matrix')
            np.save(input_path + '/' + file_name + 'gower_dist_avg.npy',
                    gower_dist_avg)  # save
        else:
            print(datetime.now(), ': Loading Gower distance matrix from ',
                  input_path + file_name + 'gower_dist_avg.npy')
            gower_dist_avg = np.load(input_path + file_name +
                                     'gower_dist_avg.npy')  # load

    if spy == True:
        all_spies = pd.DataFrame()
    '''
    Create the train set by concatenating the positive and negative sets, build model(s) using autoML
    and store predictions based on the best model
    '''
    for rep in range(0, replicates_cnt):
        print(datetime.now(), ': Replicate #', rep + 1)
        if spy == True:
            # Exclude spy_portion of training data to be the spies
            positive_set = positive_set.sample(n=len(positive_set),
                                               random_state=rep *
                                               100).reset_index(drop=True)
            spySet_size = round(len(positive_set) * spy_portion)
            spies_ID = positive_set.loc[1:spySet_size, ['SpecId']]
            positive_set_wSpy = positive_set.iloc[spySet_size +
                                                  1:len(positive_set)]

        if RN == False:
            if rnd_all == True:
                # Negative set includes all decoys
                negative_set_idx = decoys_idx
            else:
                # Negative set idx includes rnd_portion times |positive_set| indices
                random.seed(rep)
                random.shuffle(decoys_idx)
                negative_set_idx = decoys_idx[0:rnd_portion *
                                              len(positive_set)]
        else:
            print(datetime.now(), ': Starts estimating RNs')
            negative_set_idx = reliable_negative(test_data, positive_set,
                                                 subclusterCount, rep)
            print(datetime.now(), ': Ends estimating RNs')

        negative_set = test_data.iloc[negative_set_idx]

        if spy == True:
            train_data = pd.concat([positive_set_wSpy, negative_set], axis=0)
        else:
            train_data = pd.concat([positive_set, negative_set], axis=0)

        y_train = train_data['Class']
        if include_label == True:
            X_train = train_data.drop(['SpecId', 'Peptide', 'Class'], axis=1)
        else:
            X_train = train_data.drop(['SpecId', 'Peptide', 'Class', 'Label'],
                                      axis=1)

        print('Training set size:', len(y_train), '\nTest set size:',
              len(y_test))

        automl_config = AutoMLConfig(task='classification',
                                     debug_log=log_file,
                                     primary_metric=metric,
                                     iteration_timeout_minutes=200,
                                     iterations=autoML_iterations,
                                     verbosity=logging.INFO,
                                     preprocess=AML_preprocess,
                                     X=X_train,
                                     y=y_train,
                                     n_cross_validations=cv_fold,
                                     model_explainability=True)

        print(datetime.now(), ': modeling replicate #' + str(rep + 1) + '...')
        local_run = experiment.submit(automl_config, show_output=True)

        if autoML_best_model_selection == False:
            # Retrieve the Best Model based on bunch of metrics
            children = list(local_run.get_children())
            metricslist = {}
            for run in children:
                properties = run.get_properties()
                metrics = {
                    k: v
                    for k, v in run.get_metrics().items()
                    if isinstance(v, float)
                }
                metricslist[int(properties['iteration'])] = metrics

            rundata = pd.DataFrame(metricslist).sort_index(1)
            tmp = rundata.T.sort_values([
                'AUC_weighted', 'f1_score_weighted',
                'precision_score_weighted', 'recall_score_weighted',
                'weighted_accuracy'
            ],
                                        ascending=False)
            rundata = tmp.sort_values('log_loss', ascending=True).T
            best_run_iteration = rundata.columns.values[0]
            rundata.to_csv(output_path + '_metrics_list_' + str(rep) + '.txt')
            best_run, fitted_model = local_run.get_output(
                iteration=best_run_iteration)
        else:
            best_run, fitted_model = local_run.get_output()

        print('Best run: ', best_run)
        print(datetime.now(), ': Saving best model and predictions')
        # Save the best model, prediction value and probability
        modelname = output_path + '_model_' + str(rep) + '.sav'
        joblib.dump(fitted_model, modelname)
        y_pred_val = fitted_model.predict(X_test)
        y_pred_prob = fitted_model.predict_proba(X_test)

        # Add the results of the replicate to all predictions table
        all_predictions['pred_rep' + str(rep)] = list(y_pred_val)
        all_predictions['prob_rep' + str(rep)] = list(
            [item[1] for item in y_pred_prob])

        # Overwrite prediction values based on the spies cutoff
        if spy == True:
            threshold = min(
                pd.merge(spies_ID, all_predictions,
                         on='SpecId')['prob_rep' + str(rep)])
            all_predictions['pred_rep' + str(rep)] = np.where(
                all_predictions['prob_rep' + str(rep)] >= threshold, 1, 0)
            all_spies['SpecId' + str(rep)] = spies_ID['SpecId']
            all_spies['Prob_rep' + str(rep)] = list(
                pd.merge(spies_ID, all_predictions,
                         on=['SpecId'])['prob_rep' + str(rep)])

        print(datetime.now(), ': Replicate #' + str(rep + 1) + ' processed!')
        all_predictions.to_csv(output_path + '_all_predictions.csv',
                               index=False)

    if spy == True:
        all_spies.to_csv(output_path + '_all_spies.csv', index=False)

    print(datetime.now(), ': Generate prediction summary of all replicates')
    pred_col_indecies = [
        col for col in all_predictions.columns if 'pred' in col
    ]
    prob_col_indecies = [
        col for col in all_predictions.columns if 'prob' in col
    ]

    prediction_summary['Std'] = all_predictions[prob_col_indecies].std(
        skipna=True, axis=1)
    prediction_summary['Min'] = all_predictions[prob_col_indecies].min(
        skipna=True, axis=1)
    prediction_summary['Max'] = all_predictions[prob_col_indecies].max(
        skipna=True, axis=1)
    prediction_summary['Avg'] = all_predictions[prob_col_indecies].mean(
        skipna=True, axis=1)
    prediction_summary['Median'] = all_predictions[prob_col_indecies].median(
        skipna=True, axis=1)
    prediction_summary['Vote'] = all_predictions[pred_col_indecies].sum(
        skipna=True, axis=1)
    prediction_summary.to_csv(output_path + '_prediction_summary.txt',
                              sep='\t',
                              index=False)

    # Feature importance
    print(datetime.now(), ': Output feature importance of the best run')
    client = ExplanationClient.from_run(best_run)
    raw_explanations = client.download_model_explanation(
        top_k=len(X_test.columns))
    print('Raw feature importance')
    print(raw_explanations.get_feature_importance_dict())
    d = raw_explanations.get_feature_importance_dict()
    raw_feature_importance = pd.DataFrame(list(d.items()))
    raw_feature_importance.to_csv(output_path + '_raw_feature_importance.csv',
                                  index=False)
    # Engineered
    engineered_explanations = client.download_model_explanation(
        top_k=len(X_test.columns))
    print('Engineered feature importance')
    print(engineered_explanations.get_feature_importance_dict())
    d = engineered_explanations.get_feature_importance_dict()
    engineered_feature_importance = pd.DataFrame(list(d.items()))
    engineered_feature_importance.to_csv(output_path +
                                         '_engineered_feature_importance.csv',
                                         index=False)

    now = datetime.now()
    print(datetime.now(), ': Program end')
Example No. 18
 def get_gower_matrix(self):
     distances = gower.gower_matrix(self.data)
     distances = pd.DataFrame(distances, index=self.data.index)
     distances.columns = distances.index
     distances = distances.replace(0, 1000)
     return distances
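Replacing the zero self-distances with a large value (1000) ensures that a later nearest-neighbour lookup never returns the observation itself. A minimal sketch of such a lookup, assuming `obj` is an instance of the class above:

distances = obj.get_gower_matrix()            # obj: hypothetical instance of the class above
nearest_neighbour = distances.idxmin(axis=1)  # index label of the closest other observation per row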
Example No. 19
import plotly.graph_objects as go
import plotly.express as px

import pandas as pd
import numpy as np

import gower

import prince

from sklearn import manifold

from components import (ALL_STATS, BASIC_STATS, TYPES, GENS, DATA, BASE_COLS,
                        OTHER, SIZE_COLS, CATEGORICAL_COLS_GOWER,
                        NUMERICAL_COLS_GOWER)

GOWER_MATRIX = gower.gower_matrix(
    DATA.fillna('None')[CATEGORICAL_COLS_GOWER + NUMERICAL_COLS_GOWER])


def update_stats_scatter(stats, types, generations, yaxis):
    labels = BASE_COLS + stats + OTHER
    df = DATA[labels]
    df = df[((df['Type 1'].isin(types)) | (df['Type 2'].isin(types)))
            & (df['Generation'].isin(generations))]
    df_long = pd.melt(df,
                      id_vars=BASE_COLS + OTHER,
                      value_vars=stats,
                      var_name="Stat")

    if len(stats) == 1:
        fig = px.scatter(
            df,
Example No. 20
def DDGMM(y, n_clusters, r, k, init, var_distrib, nj, it = 50, \
          eps = 1E-05, maxstep = 100, seed = None, perform_selec = True):
    ''' Fit a Generalized Linear Mixture of Latent Variables Model (GLMLVM)
    
    y (numobs x p ndarray): The observations containing categorical variables
    n_clusters (int): The number of clusters to look for in the data
    r (list): The dimension of latent variables through the first 2 layers
    k (list): The number of components of the latent Gaussian mixture layers
    init (dict): The initialisation parameters for the algorithm
    var_distrib (p 1darray): An array containing the types of the variables in y 
    nj (p 1darray): For binary/count data: The maximum values that the variable can take. 
                    For ordinal data: the number of different existing categories for each variable
    it (int): The maximum number of MCEM iterations of the algorithm
    eps (float): If the likelihood increases by less than eps then the algorithm stops
    maxstep (int): The maximum number of optimisation step for each variable
    seed (int): The random state seed to set (Only for numpy generated data for the moment)
    perform_selec (Bool): Whether to perform architecture selection or not
    ------------------------------------------------------------------------------------------------
    returns (dict): The predicted classes, the likelihood through the EM steps
                    and a continuous representation of the data
    '''

    prev_lik = -1E16
    best_lik = -1E16
    tol = 0.01
    max_patience = 1
    patience = 0

    best_k = deepcopy(k)
    best_r = deepcopy(r)

    best_sil = -1
    new_sil = -1

    # Initialize the parameters
    eta = deepcopy(init['eta'])
    psi = deepcopy(init['psi'])
    lambda_bin = deepcopy(init['lambda_bin'])
    lambda_ord = deepcopy(init['lambda_ord'])
    lambda_categ = deepcopy(init['lambda_categ'])

    H = deepcopy(init['H'])
    w_s = deepcopy(
        init['w_s']
    )  # Probability of path s' through the network for all s' in Omega

    numobs = len(y)
    likelihood = []
    it_num = 0
    ratio = 1000
    np.random.seed(seed)

    # Dispatch variables between categories
    y_bin = y[:,
              np.logical_or(var_distrib == 'bernoulli', var_distrib ==
                            'binomial')]
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',
                              var_distrib == 'binomial')].astype(int)
    nb_bin = len(nj_bin)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical'].astype(int)
    nb_categ = len(nj_categ)

    y_ord = y[:, var_distrib == 'ordinal']
    nj_ord = nj[var_distrib == 'ordinal'].astype(int)
    nb_ord = len(nj_ord)

    L = len(k)
    k_aug = k + [1]
    S = np.array([np.prod(k_aug[l:]) for l in range(L + 1)])
    M = M_growth(1, r, numobs)

    assert nb_ord + nb_bin + nb_categ > 0

    # Compute the Gower matrix
    cat_features = np.logical_or(var_distrib == 'categorical',
                                 var_distrib == 'bernoulli')
    dm = gower_matrix(y, cat_features=cat_features)

    while (it_num < it) & ((ratio > eps) | (patience <= max_patience)):
        print(it_num)

        # The clustering layer is the one used to perform the clustering
        # i.e. the layer l such that k[l] == n_clusters
        clustering_layer = np.argmax(np.array(k) == n_clusters)

        #####################################################################################
        ################################# S step ############################################
        #####################################################################################

        #=====================================================================
        # Draw from f(z^{l} | s, Theta) for all s in Omega
        #=====================================================================

        mu_s, sigma_s = compute_path_params(eta, H, psi)
        sigma_s = ensure_psd(sigma_s)
        z_s, zc_s = draw_z_s(mu_s, sigma_s, eta, M)
        '''
        print('mu_s',  np.abs(mu_s[0]).mean())
        print('sigma_s',  np.abs(sigma_s[0]).mean())
        print('z_s0',  np.abs(z_s[0]).mean())
        print('z_s1',  np.abs(z_s[1]).mean(0)[:,0])
        '''

        #========================================================================
        # Draw from f(z^{l+1} | z^{l}, s, Theta) for l >= 1
        #========================================================================

        chsi = compute_chsi(H, psi, mu_s, sigma_s)
        chsi = ensure_psd(chsi)
        rho = compute_rho(eta, H, psi, mu_s, sigma_s, zc_s, chsi)

        # In the following z2 and z1 will denote z^{l+1} and z^{l} respectively
        z2_z1s = draw_z2_z1s(chsi, rho, M, r)

        #=======================================================================
        # Compute the p(y| z1) for all variable categories
        #=======================================================================

        py_zl1 = fy_zl1(lambda_bin, y_bin, nj_bin, lambda_ord, y_ord, nj_ord,
                        lambda_categ, y_categ, nj_categ, z_s[0])

        #========================================================================
        # Draw from p(z1 | y, s) proportional to p(y | z1) * p(z1 | s) for all s
        #========================================================================

        zl1_ys = draw_zl1_ys(z_s, py_zl1, M)

        #####################################################################################
        ################################# E step ############################################
        #####################################################################################

        #=====================================================================
        # Compute conditional probabilities used in the appendix of asta paper
        #=====================================================================

        pzl1_ys, ps_y, p_y = E_step_GLLVM(z_s[0], mu_s[0], sigma_s[0], w_s,
                                          py_zl1)
        #del(py_zl1)

        #=====================================================================
        # Compute p(z^{(l)}| s, y). Equation (5) of the paper
        #=====================================================================

        pz2_z1s = fz2_z1s(t(pzl1_ys, (1, 0, 2)), z2_z1s, chsi, rho, S)
        pz_ys = fz_ys(t(pzl1_ys, (1, 0, 2)), pz2_z1s)

        #=====================================================================
        # Compute MFA expectations
        #=====================================================================

        Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys = \
            E_step_DGMM(zl1_ys, H, z_s, zc_s, z2_z1s, pz_ys, pz2_z1s, S)

        ###########################################################################
        ############################ M step #######################################
        ###########################################################################

        #=======================================================
        # Compute MFA Parameters
        #=======================================================

        w_s = np.mean(ps_y, axis=0)
        eta, H, psi = M_step_DGMM(Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys, ps_y,
                                  H, k)

        #=======================================================
        # Identifiability conditions
        #=======================================================

        # Update eta, H and Psi values
        H = diagonal_cond(H, psi)
        Ez, AT = compute_z_moments(w_s, eta, H, psi)
        eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

        del (Ez)

        #=======================================================
        # Compute GLLVM Parameters
        #=======================================================

        # We optimize each column separately as it is faster than optimising all columns jointly
        # (and more relevant with the independence hypothesis)

        lambda_bin = bin_params_GLLVM(y_bin, nj_bin, lambda_bin, ps_y, pzl1_ys, z_s[0], AT[0],\
                     tol = tol, maxstep = maxstep)

        lambda_ord = ord_params_GLLVM(y_ord, nj_ord, lambda_ord, ps_y, pzl1_ys, z_s[0], AT[0],\
                     tol = tol, maxstep = maxstep)

        lambda_categ = categ_params_GLLVM(y_categ, nj_categ, lambda_categ, ps_y, pzl1_ys, z_s[0], AT[0],\
                     tol = tol, maxstep = maxstep)

        ###########################################################################
        ################## Clustering parameters updating #########################
        ###########################################################################

        new_lik = np.sum(np.log(p_y))
        likelihood.append(new_lik)
        ratio = (new_lik - prev_lik) / abs(prev_lik)
        print(likelihood)

        idx_to_sum = tuple(set(range(1, L + 1)) - set([clustering_layer + 1]))
        psl_y = ps_y.reshape(numobs, *k, order='C').sum(idx_to_sum)

        temp_class = np.argmax(psl_y, axis=1)
        try:
            new_sil = silhouette_score(dm, temp_class, metric='precomputed')
        except ValueError:
            new_sil = -1

        print('Silhouette score:', new_sil)
        if best_sil < new_sil:
            z = (ps_y[..., n_axis] * Ez_ys[clustering_layer]).sum(1)
            best_sil = deepcopy(new_sil)
            classes = deepcopy(temp_class)

            fig = plt.figure(figsize=(8, 8))
            plt.scatter(z[:, 0], z[:, 1])
            plt.show()

        # Refresh the classes only if they provide a better explanation of the data
        if best_lik < new_lik:
            best_lik = deepcopy(new_lik)

        if prev_lik < new_lik:
            patience = 0
            M = M_growth(it_num + 2, r, numobs)
        else:
            patience += 1

        ###########################################################################
        ######################## Parameter selection  #############################
        ###########################################################################

        is_not_min_specif = not (np.all(np.array(k) == n_clusters)
                                 & np.array_equal(r, [2, 1]))

        if look_for_simpler_network(
                it_num) & perform_selec & is_not_min_specif:
            r_to_keep = r_select(y_bin, y_ord, y_categ, zl1_ys, z2_z1s, w_s)

            # If r_l == 0, delete the last l + 1: layers
            new_L = np.sum([len(rl) != 0 for rl in r_to_keep]) - 1

            k_to_keep = k_select(w_s, k, new_L, clustering_layer)

            is_L_unchanged = L == new_L
            is_r_unchanged = np.all(
                [len(r_to_keep[l]) == r[l] for l in range(new_L + 1)])
            is_k_unchanged = np.all(
                [len(k_to_keep[l]) == k[l] for l in range(new_L)])

            is_selection = not (is_r_unchanged & is_k_unchanged
                                & is_L_unchanged)

            assert new_L > 0

            if is_selection:

                eta = [eta[l][k_to_keep[l]] for l in range(new_L)]
                eta = [eta[l][:, r_to_keep[l]] for l in range(new_L)]

                H = [H[l][k_to_keep[l]] for l in range(new_L)]
                H = [H[l][:, r_to_keep[l]] for l in range(new_L)]
                H = [H[l][:, :, r_to_keep[l + 1]] for l in range(new_L)]

                psi = [psi[l][k_to_keep[l]] for l in range(new_L)]
                psi = [psi[l][:, r_to_keep[l]] for l in range(new_L)]
                psi = [psi[l][:, :, r_to_keep[l]] for l in range(new_L)]

                if nb_bin > 0:
                    # Add the intercept:
                    bin_r_to_keep = np.concatenate([[0],
                                                    np.array(r_to_keep[0]) + 1
                                                    ])
                    lambda_bin = lambda_bin[:, bin_r_to_keep]

                if nb_ord > 0:
                    # Intercept coefficients handling is a little more complicated here
                    lambda_ord_intercept = [
                        lambda_ord_j[:-r[0]] for lambda_ord_j in lambda_ord
                    ]
                    Lambda_ord_var = np.stack(
                        [lambda_ord_j[-r[0]:] for lambda_ord_j in lambda_ord])
                    Lambda_ord_var = Lambda_ord_var[:, r_to_keep[0]]
                    lambda_ord = [np.concatenate([lambda_ord_intercept[j], Lambda_ord_var[j]])\
                                  for j in range(nb_ord)]

                if nb_categ > 0:
                    lambda_categ_intercept = [
                        lambda_categ[j][:, 0] for j in range(nb_categ)
                    ]
                    Lambda_categ_var = [
                        lambda_categ_j[:, -r[0]:]
                        for lambda_categ_j in lambda_categ
                    ]
                    Lambda_categ_var = [
                        lambda_categ_j[:, r_to_keep[0]]
                        for lambda_categ_j in lambda_categ
                    ]

                    lambda_categ = [np.hstack([lambda_categ_intercept[j][..., n_axis], Lambda_categ_var[j]])\
                                   for j in range(nb_categ)]

                w = w_s.reshape(*k, order='C')
                new_k_idx_grid = np.ix_(*k_to_keep[:new_L])

                # If layer deletion, sum the last components of the paths
                if L > new_L:
                    deleted_dims = tuple(range(L)[new_L:])
                    w_s = w[new_k_idx_grid].sum(deleted_dims).flatten(
                        order='C')
                else:
                    w_s = w[new_k_idx_grid].flatten(order='C')

                w_s /= w_s.sum()

                k = [len(k_to_keep[l]) for l in range(new_L)]
                r = [len(r_to_keep[l]) for l in range(new_L + 1)]

                k_aug = k + [1]
                S = np.array([np.prod(k_aug[l:]) for l in range(new_L + 1)])
                L = new_L

                patience = 0
                best_r = deepcopy(r)
                best_k = deepcopy(k)

                # Identifiability conditions
                H = diagonal_cond(H, psi)
                Ez, AT = compute_z_moments(w_s, eta, H, psi)
                eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

            print('New architecture:')
            print('k', k)
            print('r', r)
            print('L', L)
            print('S', S)
            print("w_s", len(w_s))

        prev_lik = deepcopy(new_lik)
        it_num = it_num + 1

    out = dict(likelihood = likelihood, classes = classes, z = z, \
               best_r = best_r, best_k = best_k)
    return (out)
Example No. 21
	def post(self,request):
		# request = self.request.data
		null=np.nan
		# stores = request['data']['teststores']

		# ########
		filename = request.FILES['store_mstr']
		if filename:
			filename_check = filename.name
			ext = [".xlsx", ".csv", ".xls"]
			if filename_check.endswith(tuple(ext)):
				extension = os.path.splitext(filename_check)[1]
				if (extension == '.csv'):
					teststores = pd.read_csv(filename)
				else:
					teststores = pd.read_excel(filename)
		
		
		store_features =request.POST['store_features']
		# stores  =request.POST['teststores']
		##############
		# stores = pd.DataFrame(eval(stores))
		# teststores = pd.DataFrame.from_dict(stores, orient='columns')
		# store_features =request['data']['store_features']
		
		if teststores is not None:

			mandatory_features = ['Banner', 'Outlet_surface', 'Shelf_meters_Choc','Shelf_meters_Dog', 'Shelf_meters_Cat', 
	                             'Influence_Overall','CSV_of_outlet']
			# store_features = mandatory_features + store_features

			# stores_master_df = pd.read_excel("datas/TL_StoreMstr.xlsx")
			# test_master_df = pd.read_excel("TL_TestMstr.xlsx")
			# teststore_map_df = pd.read_excel("TL_Teststore_map.xlsx")
			# controlstore_map_df = pd.read_excel("TL_Controlstore_Mstr.xlsx")
			
			Allstores = StoreMstr.objects.filter(is_active=True,is_deleted=False)
			Teststores  = TestStoreSerializer(Allstores,many=True)
			stores_master_df = pd.DataFrame(Teststores.data)

			Alltest = TestMstr.objects.filter(is_active=True,is_deleted=False)
			Testmst  = TestSerializer(Alltest,many=True)
			test_master_df = pd.DataFrame(Testmst.data)
			# test_master_df = pd.read_excel("datas/TL_TestMstr.xlsx")
			if(test_master_df.empty):
				columns = ['test_id','test_name','test_desc','testtype','target_var','territory_name','store_segment','category_name','confidence_lev','margin_oferror','std_deviation','pre_start','pre_end','test_window','testwin_start','testwin_end','stage_id','created_on','modified_on','is_active','deleted_at','is_deleted']
				test_master_df = pd.DataFrame(columns=columns)

			Alltestmap = TestStoreMap.objects.filter(is_active=True,is_deleted=False)
			Teststop  = TestStoreMapSerializer(Alltestmap,many=True)
			teststore_map_df = pd.DataFrame(Teststop.data)
			# teststore_map_df = pd.read_excel("datas/TL_Teststore_map.xlsx")

			controlstore = ControlStoreMstr.objects.filter(is_active=True,is_deleted=False)
			controlstores  = ControlstoreSerializer(controlstore,many=True)
			controlstore_map_df = pd.DataFrame(controlstores.data)


			# Eliminating the Invalid Stores From Population
			stores_master_df = check_if_store_valid(storesfile=stores_master_df)
			stores_master_df = stores_master_df[stores_master_df["Is Valid Store"]==1]

			# ELIMINATING ALL THE TEST AND CONTROL STORES WHICH ARE CURRENTLY ACTIVE
			# Get all the active tests(test ids)
			active_tests_df = test_master_df[test_master_df["is_active"]==True]

			

			if active_tests_df.shape[0] != 0:
				# THIS MEANS THERE ARE ACTIVE TESTS AND WE NEED TO ELIMINATE ACTIVE TEST AND CONTROL STORES

				#Get the active test stores using test ids
				active_test_stores = teststore_map_df[teststore_map_df["test_id"].isin(active_tests_df["test_id"])]

				#Get the active control stores using test ids
				active_control_stores = controlstore_map_df[controlstore_map_df["test_id"].isin(active_tests_df["test_id"])]

				#Remove active test and control stores from population
				stores_master_df = stores_master_df[~stores_master_df["store_id"].isin(active_test_stores["store_id"])]
				stores_master_df = stores_master_df[~stores_master_df["store_id"].isin(active_control_stores["store_id"])]

			else:
	            # THIS MEANS THERE ARE NO ACTIVE TESTS AND WE NEED NOT ELIMINATE ANY TEST AND CONTROL STORES
				pass

			# ELIMINATING THE TESTSTORES FROM POPULATION

			stores_master_df = stores_master_df[~(stores_master_df["Partner_ID"].isin(teststores["Partner_ID"]))]

			refA = teststores.copy(deep=True)
			refB = stores_master_df.copy(deep=True)

			useA = refA[store_features].copy(deep=True)
			useB = refB[store_features].copy(deep=True)

			scaler = StandardScaler()
			scale_cols = [item for item in store_features if item!="Banner"]
			useA[scale_cols] = scaler.fit_transform(useA[scale_cols])
			useB[scale_cols] = scaler.fit_transform(useB[scale_cols])

			gowermatrix = gower.gower_matrix(useA,useB)

			# Identifying similar stores
			df_list = list()
			for test_pid,row in zip(refA["Partner_ID"],gowermatrix):
				df = refB.copy(deep=True)
				df["Gower_Distance"] = list(row)
				df = df.sort_values(by="Gower_Distance",ascending=True)
				df["Test store Partner ID"] = test_pid
				df["Gower_Distance"] = df["Gower_Distance"].apply(lambda x:round(x,2))
				df["Similarity Measure"] = df["Gower_Distance"].apply(lambda x: 1-x)
				df_list.append(df.head(1))
			control_stores = pd.concat(df_list)
			control_stores["Checked_Flag"] = 0
			finalcontrol_stores = control_stores.reset_index().to_json(orient='records')			
			return json.Response(finalcontrol_stores,True)
		else:
			return json.Response("Please pass Test stores",False)
Example No. 22
cluster_telco_centroid = pd.Series(telco_centroid.labels_)
telco_data["cluster"] = cluster_telco_centroid

telco_data.iloc[:, 0:29].groupby(telco_data.cluster).mean()

import os

telco_data.to_csv("final_telco_data.csv", encoding="utf-8")

os.getcwd()

# using Gower clustering for mixed data
import gower
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

gowers_matrix = gower.gower_matrix(telco_data)
gowers_linkage = linkage(gowers_matrix)
gcluster = fcluster(gowers_linkage, 3, criterion='maxclust')
dendrogram(gowers_linkage)
telco_data["cluster"] = gcluster
telco_data.iloc[:, 0:29].groupby(telco_data.cluster).mean()
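# Note: scipy's linkage() expects either raw observations or a *condensed* distance
# vector; given the square Gower matrix above it treats each row as a feature vector.
# A condensed form can be passed instead (a sketch, assuming scipy.spatial.distance
# is available):
#
#     from scipy.spatial.distance import squareform
#     gowers_linkage = linkage(squareform(gowers_matrix, checks=False))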

import os

telco_data.to_csv("final2_telco_data.csv", encoding="utf-8")

os.getcwd()

#############################Problem 4####################################################

import pandas as pd
Example No. 23
def MDGMM(y, n_clusters, r, k, init, var_distrib, nj, it = 50, \
          eps = 1E-05, maxstep = 100, seed = None, perform_selec = True): 
    
    ''' Fit a Generalized Linear Mixture of Latent Variables Model (GLMLVM)
    
    y (numobs x p ndarray): The observations containing mixed variables
    n_clusters (int or str): The number of clusters to look for in the data or the use mode of the MDGMM
    r (dict): The dimension of latent variables through the first 2 layers
    k (dict): The number of components of the latent Gaussian mixture layers
    init (dict): The initialisation parameters for the algorithm
    var_distrib (p 1darray): An array containing the types of the variables in y 
    nj (p 1darray): For binary/count data: The maximum values that the variable can take. 
                    For ordinal data: the number of different existing categories for each variable
                    For categorical data: the number of different existing categories for each variable
    it (int): The maximum number of MCEM iterations of the algorithm
    eps (float): If the likelihood increases by less than eps then the algorithm stops
    maxstep (int): The maximum number of optimisation step for each variable
    seed (int): The random state seed to set (Only for numpy generated data for the moment)
    perform_selec (Bool): Whether to perform architecture selection or not
    ------------------------------------------------------------------------------------------------
    returns (dict): The predicted classes, the likelihood through the EM steps
                    and a continuous representation of the data
    '''
    
    # Break the reference link 
    k = deepcopy(k)
    r = deepcopy(r)
    
    best_k = deepcopy(k)
    best_r = deepcopy(r)

    # Add other checks for the other variables
    check_inputs(k, r)

    prev_lik = - 1E15
    best_lik = -1E15
    
    tol = 0.01
    max_patience = 1
    patience = 0
    
    #====================================================
    # Initialize the parameters
    #====================================================
        
    eta_c, eta_d, H_c, H_d, psi_c, psi_d = dispatch_dgmm_init(init)
    lambda_bin, lambda_ord, lambda_categ = dispatch_gllvm_init(init)
    w_s_c, w_s_d = dispatch_paths_init(init)
    
    numobs = len(y)
    likelihood = []
    it_num = 0
    ratio = 1000
    np.random.seed(seed)

    #====================================================        
    # Dispatch variables between categories
    #====================================================

    y_bin = y[:, np.logical_or(var_distrib == 'bernoulli',\
                               var_distrib == 'binomial')]
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',\
                              var_distrib == 'binomial')]
        
    nj_bin = nj_bin.astype(int)
    nb_bin = len(nj_bin)
        
    y_ord = y[:, var_distrib == 'ordinal']    
    nj_ord = nj[var_distrib == 'ordinal']
    nj_ord = nj_ord.astype(int)
    nb_ord = len(nj_ord)
    
    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical'].astype(int)
    nb_categ = len(nj_categ)    
    
    yc = y[:, var_distrib == 'continuous'] 
    
    ss = StandardScaler()
    yc = ss.fit_transform(yc)

    nb_cont = yc.shape[1]
    
    # *_1L stands for quantities going through the whole network (head + tail)
    k_1L, L_1L, L, bar_L, S_1L = nb_comps_and_layers(k)    
    r_1L = {'c': r['c'] + r['t'], 'd': r['d'] + r['t'], 't': r['t']}
    
    best_sil = [-1.1 for l in range(L['t'] - 1)] if n_clusters == 'multi' else -1.1 
    new_sil = [-1.1 for l in range(L['t'] - 1)] if n_clusters == 'multi' else -1.1 
    
    
    M = M_growth(1, r_1L, numobs) 

    if nb_bin + nb_ord + nb_categ == 0: # Create the InputError class and change this
        raise ValueError('Input does not contain discrete variables,\
                         consider using a regular DGMM')
    if nb_cont == 0: # Create the InputError class and change this
        raise ValueError('Input does not contain continuous values,\
                         consider using a DDGMM')
                         
                         
    # Compute the Gower matrix
    cat_features = np.logical_or(var_distrib == 'categorical', var_distrib == 'bernoulli')
    dm = gower_matrix(y, cat_features = cat_features)
                     
    while (it_num < it) & ((ratio > eps) | (patience <= max_patience)):
        print(it_num)

        # The clustering layer is the one used to perform the clustering 
        # i.e. the layer l such that k[l] == n_clusters
        if not(isnumeric(n_clusters)):
            if n_clusters == 'auto':
                clustering_layer = 0
            elif n_clusters == 'multi':
                clustering_layer = list(range(L['t'] - 1))
            else:
                raise ValueError('Please enter an int, auto or multi for n_clusters')
        else:
            assert (np.array(k['t']) == n_clusters).any()
            clustering_layer = np.argmax(np.array(k['t']) == n_clusters)

        #####################################################################################
        ################################# MC step ############################################
        #####################################################################################

        #=====================================================================
        # Draw from f(z^{l} | s, Theta) for both heads and tail
        #=====================================================================  
        
        mu_s_c, sigma_s_c = compute_path_params(eta_c, H_c, psi_c)
        sigma_s_c = ensure_psd(sigma_s_c)
        
        mu_s_d, sigma_s_d = compute_path_params(eta_d, H_d, psi_d)
        sigma_s_d = ensure_psd(sigma_s_d)
                        
        z_s_c, zc_s_c, z_s_d, zc_s_d = draw_z_s_all_network(mu_s_c, sigma_s_c,\
                            mu_s_d, sigma_s_d, yc, eta_c, eta_d, S_1L, L, M)
                    
        #========================================================================
        # Draw from f(z^{l+1} | z^{l}, s, Theta) for l >= 1
        #========================================================================
        
        # Create wrapper as before and after
        chsi_c = compute_chsi(H_c, psi_c, mu_s_c, sigma_s_c)
        chsi_c = ensure_psd(chsi_c)
        rho_c = compute_rho(eta_c, H_c, psi_c, mu_s_c, sigma_s_c, zc_s_c, chsi_c)
        
                
        chsi_d = compute_chsi(H_d, psi_d, mu_s_d, sigma_s_d)
        chsi_d = ensure_psd(chsi_d)
        rho_d = compute_rho(eta_d, H_d, psi_d, mu_s_d, sigma_s_d, zc_s_d, chsi_d)


        # In the following z2 and z1 will denote z^{l+1} and z^{l} respectively
        z2_z1s_c, z2_z1s_d = draw_z2_z1s_network(chsi_c, chsi_d, rho_c, \
                                                 rho_d, M, r_1L, L)
        
        #=======================================================================
        # Compute the p(y^D| z1) for all discrete variables
        #=======================================================================
        
        py_zl1_d = fy_zl1(lambda_bin, y_bin, nj_bin, lambda_ord, y_ord, nj_ord,\
                          lambda_categ, y_categ, nj_categ, z_s_d[0])
        
        #========================================================================
        # Draw from p(z1 | y, s) proportional to p(y | z1) * p(z1 | s) for all s
        #========================================================================
                
        zl1_ys_d = draw_zl1_ys(z_s_d, py_zl1_d, M['d'])
                
        #####################################################################################
        ################################# E step ############################################
        #####################################################################################
        
        #=====================================================================
        # Compute quantities necessary for E steps of both heads and tail
        #=====================================================================
        
        # Discrete head quantities
        pzl1_ys_d, ps_y_d, py_d = E_step_GLLVM(z_s_d[0], mu_s_d[0], sigma_s_d[0], w_s_d, py_zl1_d)        
        py_s_d = ps_y_d * py_d / w_s_d[n_axis]
        
        # Continuous head quantities
        ps_y_c, py_s_c, py_c = continuous_lik(yc, mu_s_c[0], sigma_s_c[0], w_s_c)
        
        pz_s_d = fz_s(z_s_d, mu_s_d, sigma_s_d) 
        pz_s_c = fz_s(z_s_c, mu_s_c, sigma_s_c) 
        
        #=====================================================================
        # Compute p(z^{(l)}| s, y). Equation (5) of the paper
        #=====================================================================
        
        # Compute pz2_z1s_d and pz2_z1s_c (the tail indices are computed as well, even though they are not needed here)
        
        pz2_z1s_d = fz2_z1s(t(pzl1_ys_d, (1, 0, 2)), z2_z1s_d, chsi_d, rho_d, S_1L['d'])
        pz_ys_d = fz_ys(t(pzl1_ys_d, (1, 0, 2)), pz2_z1s_d)
          
        pz2_z1s_c = fz2_z1s([], z2_z1s_c, chsi_c, rho_c, S_1L['c'])
        pz_ys_c = fz_ys([], pz2_z1s_c)
        
        pz2_z1s_t = fz2_z1s([], z2_z1s_c[bar_L['c']:], chsi_c[bar_L['c']:], \
                            rho_c[bar_L['c']:], S_1L['t'])

        # Junction layer computations
        # Compute p(zC |s)
        py_zs_d = fy_zs(pz_ys_d, py_s_d) 
        py_zs_c = fy_zs(pz_ys_c, py_s_c)
         
        # Compute p(zt | yC, yD, sC, sD)        
        pzt_yCyDs = fz_yCyDs(py_zs_c, pz_ys_d, py_s_c, M, S_1L, L)

        #=====================================================================
        # Compute MFA expectations
        #=====================================================================
        
        # Discrete head. 
        Ez_ys_d, E_z1z2T_ys_d, E_z2z2T_ys_d, EeeT_ys_d = \
            E_step_DGMM_d(zl1_ys_d, H_d, z_s_d, zc_s_d, z2_z1s_d, pz_ys_d,\
                        pz2_z1s_d, S_1L['d'], L['d'])
        
            
        # Continuous head
        Ez_ys_c, E_z1z2T_ys_c, E_z2z2T_ys_c, EeeT_ys_c = \
            E_step_DGMM_c(H_c, z_s_c, zc_s_c, z2_z1s_c, pz_ys_c,\
                          pz2_z1s_c, S_1L['c'], L['c'])


        # Junction layers
        Ez_ys_t, E_z1z2T_ys_t, E_z2z2T_ys_t, EeeT_ys_t = \
            E_step_DGMM_t(H_c[bar_L['c']:], \
            z_s_c[bar_L['c']:], zc_s_c[bar_L['c']:], z2_z1s_c[bar_L['c']:],\
                pzt_yCyDs, pz2_z1s_t, S_1L, L, k_1L)  
        
        # Error here for the first two terms: p(y^h | z^t, s^C) != p(y^h | z^t, s^{1C:L})
        pst_yCyD = fst_yCyD(py_zs_c, py_zs_d, pz_s_d, w_s_c, w_s_d, k_1L, L)   
                               
        ###########################################################################
        ############################ M step #######################################
        ###########################################################################

        #=======================================================
        # Compute DGMM Parameters 
        #=======================================================
            
        # Discrete head
        w_s_d = np.mean(ps_y_d, axis = 0)      
        eta_d_barL, H_d_barL, psi_d_barL = M_step_DGMM(Ez_ys_d, E_z1z2T_ys_d, E_z2z2T_ys_d, \
                                        EeeT_ys_d, ps_y_d, H_d, k_1L['d'][:-1],\
                                            L_1L['d'], r_1L['d'])
         
        # Add dispatching function here
        eta_d[:bar_L['d']] = eta_d_barL
        H_d[:bar_L['d']] = H_d_barL
        psi_d[:bar_L['d']] = psi_d_barL
                
        # Continuous head
        w_s_c = np.mean(ps_y_c, axis = 0)  
        eta_c_barL, H_c_barL, psi_c_barL = M_step_DGMM(Ez_ys_c, E_z1z2T_ys_c, E_z2z2T_ys_c, \
                                        EeeT_ys_c, ps_y_c, H_c, k_1L['c'][:-1],\
                                            L_1L['c'] + 1, r_1L['c'])
        
        eta_c[:bar_L['c']] = eta_c_barL
        H_c[:bar_L['c']] = H_c_barL
        psi_c[:bar_L['c']] = psi_c_barL
                    

        # Common tail
        eta_t, H_t, psi_t, Ezst_y = M_step_DGMM_t(Ez_ys_t, E_z1z2T_ys_t, E_z2z2T_ys_t, \
                                        EeeT_ys_t, ps_y_c, ps_y_d, pst_yCyD, \
                                            H_c[bar_L['c']:], S_1L, k_1L, \
                                            L_1L, L, r_1L['t'])  
            
        eta_d[bar_L['d']:] = eta_t
        H_d[bar_L['d']:] = H_t
        psi_d[bar_L['d']:] = psi_t            

        eta_c[bar_L['c']:] = eta_t
        H_c[bar_L['c']:] = H_t
        psi_c[bar_L['c']:] = psi_t  
                         
        #=======================================================
        # Identifiability conditions
        #=======================================================
        w_s_t = np.mean(pst_yCyD, axis = 0)  
        eta_d, H_d, psi_d, AT_d, eta_c, H_c, psi_c, AT_c = network_identifiability(eta_d, \
                                H_d, psi_d, eta_c, H_c, psi_c, w_s_c, w_s_d, w_s_t, bar_L)
                
        #=======================================================
        # Compute GLLVM Parameters
        #=======================================================
        
        # We optimize each column separately as it is faster than optimizing
        # all columns jointly (and more consistent with the independence hypothesis)
                
        lambda_bin = bin_params_GLLVM(y_bin, nj_bin, lambda_bin, ps_y_d, \
                    pzl1_ys_d, z_s_d[0], AT_d[0], tol = tol, maxstep = maxstep)
                 
        lambda_ord = ord_params_GLLVM(y_ord, nj_ord, lambda_ord, ps_y_d, \
                    pzl1_ys_d, z_s_d[0], AT_d[0], tol = tol, maxstep = maxstep)
            
        lambda_categ = categ_params_GLLVM(y_categ, nj_categ, lambda_categ, ps_y_d,\
                    pzl1_ys_d, z_s_d[0], AT_d[0], tol = tol, maxstep = maxstep)

        ###########################################################################
        ################## Clustering parameters updating #########################
        ###########################################################################
          
        new_lik = np.sum(np.log(py_d) + np.log(py_c))
        likelihood.append(new_lik)
        ratio = (new_lik - prev_lik)/abs(prev_lik)
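        # For instance (hypothetical values): prev_lik = -1000 and new_lik = -990
        # give ratio = 0.01, i.e. a 1% relative improvement of the log-likelihood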
        
        
        if n_clusters == 'multi':
            temp_classes = [] 
            z_tail = []
            classes = [[] for l in range(L['t'] - 1)]
            
            for l in clustering_layer:
                idx_to_sum = tuple(set(range(1, L['t'] + 1)) - set([l + 1]))
                psl_y = pst_yCyD.reshape(numobs, *k['t'],\
                                         order = 'C').sum(idx_to_sum)
                
                temp_class_l = np.argmax(psl_y, axis = 1)
                sil_l = silhouette_score(dm, temp_class_l, metric = 'precomputed')
                    
                temp_classes.append(temp_class_l)
                #z_tail.append(Ezst_y[l].sum(1))
                new_sil[l] = sil_l
            
            #z_tail = []
            for l in range(L['t'] - 1):
                zl = Ezst_y[l].sum(1)
                z_tail.append(zl)
                    
                if best_sil[l] < new_sil[l]:
                    # Update the quantity if the silhouette score is better 
                    best_sil[l] = deepcopy(new_sil[l])
                    classes[l] = deepcopy(temp_classes[l])
                    
                    if zl.shape[-1] == 3:
                        plot_3d(zl, classes[l])
                    elif zl.shape[-1] == 2:
                        plot_2d(zl, classes[l])
           
        else: 
            idx_to_sum = tuple(set(range(1, L['t'] + 1)) - set([clustering_layer + 1]))
            psl_y = pst_yCyD.reshape(numobs, *k['t'], order = 'C').sum(idx_to_sum) 
        
            temp_classes = np.argmax(psl_y, axis = 1) 
            try:
                new_sil = silhouette_score(dm, temp_classes, metric = 'precomputed') 
            except:
                new_sil = -1
            
            z_tail = [Ezst_y[l].sum(1) for l in range(L['t'] - 1)]
                             
            if best_sil < new_sil:
                # Update the quantity if the silhouette score is better 
                zl = z_tail[clustering_layer]
                best_sil = deepcopy(new_sil)
                classes = deepcopy(temp_classes)
                
                if zl.shape[-1] == 3:
                    plot_3d(zl, classes)
                elif zl.shape[-1] == 2:
                    plot_2d(zl, classes)
        
        # Keep track of the best likelihood reached so far
        if best_lik < new_lik:
            best_lik = deepcopy(new_lik)
      
        if prev_lik < new_lik:
            patience = 0
            M = M_growth(it_num + 1, r_1L, numobs)
        else:
            patience += 1
                       
        ###########################################################################
        ######################## Parameter selection  #############################
        ###########################################################################
                    
        min_nb_clusters = 2
        is_not_min_specif = not(is_min_architecture_reached(k, r, min_nb_clusters))
        
        if look_for_simpler_network(it_num) & perform_selec & is_not_min_specif:
            
            # To add: selection according to categ
            r_to_keep = r_select(y_bin, y_ord, y_categ, yc, zl1_ys_d,\
                                 z2_z1s_d[:bar_L['d']], w_s_d, z2_z1s_c[:bar_L['c']],
                                 z2_z1s_c[bar_L['c']:], n_clusters)
            
            # Check layer deletion
            is_c_layer_deletion = np.any([len(rl) == 0 for rl in r_to_keep['c']]) 
            is_d_layer_deletion = np.any([len(rl) == 0 for rl in r_to_keep['d']]) 
            is_head_layer_deletion = np.any([is_c_layer_deletion, is_d_layer_deletion])
            
            if is_head_layer_deletion:
                # Restart the algorithm
                if is_c_layer_deletion:
                    r['c'] = [len(rl) for rl in r_to_keep['c'][:-1]]
                    k['c'] = k['c'][:-1]
                if is_d_layer_deletion:
                    r['d'] = [len(rl) for rl in r_to_keep['d'][:-1]]
                    k['d'] = k['d'][:-1]   
                    
                init = dim_reduce_init(pd.DataFrame(y), n_clusters, k, r, nj, var_distrib,\
                                       seed = None)
                
                eta_c, eta_d, H_c, H_d, psi_c, psi_d = dispatch_dgmm_init(init)
                lambda_bin, lambda_ord, lambda_categ = dispatch_gllvm_init(init)
                w_s_c, w_s_d = dispatch_paths_init(init)
                  
                # *_1L stands for quantities going through the whole network (head + tail)
                k_1L, L_1L, L, bar_L, S_1L = nb_comps_and_layers(k)    
                r_1L = {'c': r['c'] + r['t'], 'd': r['d'] + r['t'], 't': r['t']}
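                # e.g. (hypothetical values) r['c'] = [3, 2] and r['t'] = [1] give
                # r_1L['c'] = [3, 2, 1]: head dimensions followed by the tail ones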
                        
                M = M_growth(it_num + 1, r_1L, numobs) 
                
                prev_lik = deepcopy(new_lik)
                it_num = it_num + 1
                print(likelihood)
                
                print('Restarting the algorithm')
                continue
            
            # If r_l == 0, delete the last l + 1 layers
            new_Lt = np.sum([len(rl) != 0 for rl in r_to_keep['t']]) #- 1
            
            #w_s_t = pst_yCyD.mean(0)
            k_to_keep = k_select(w_s_c, w_s_d, w_s_t, k, new_Lt, clustering_layer, n_clusters)
                        
            is_selection = check_if_selection(r_to_keep, r, k_to_keep, k, L, new_Lt)
            
            assert new_Lt > 0 # > 1 ?
            if n_clusters == 'multi':
                assert new_Lt == L['t']
            
            if is_selection:
                
                # Part to change when update also number of layers on each head 
                nb_deleted_layers_tail = L['t'] - new_Lt
                L['t'] = new_Lt
                L_1L = {keys: values - nb_deleted_layers_tail for keys, values in L_1L.items()}
                
                eta_c, eta_d, H_c, H_d, psi_c, psi_d = dgmm_coeff_selection(eta_c,\
                            H_c, psi_c, eta_d, H_d, psi_d, L, r_to_keep, k_to_keep)
                    
                lambda_bin, lambda_ord, lambda_categ = gllvm_coeff_selection(lambda_bin, lambda_ord,\
                                                               lambda_categ, r, r_to_keep)
                
                w_s_c, w_s_d = path_proba_selection(w_s_c, w_s_d, k, k_to_keep, new_Lt)
                
                k = {h: [len(k_to_keep[h][l]) for l in range(L[h])] for h in ['d', 't']}
                k['c'] = [len(k_to_keep['c'][l]) for l in range(L['c'] + 1)]
                
                r = {h: [len(r_to_keep[h][l]) for l in range(L[h])] for h in ['d', 't']}
                r['c'] = [len(r_to_keep['c'][l]) for l in range(L['c'] + 1)]
                
                k_1L, _, L, bar_L, S_1L = nb_comps_and_layers(k)    
                r_1L = {'c': r['c'] + r['t'], 'd': r['d'] + r['t'], 't': r['t']}
            
                patience = 0
                best_r = deepcopy(r)
                best_k = deepcopy(k)  
                
                #=======================================================
                # Identifiability conditions
                #======================================================= 
                eta_d, H_d, psi_d, AT_d, eta_c, H_c, psi_c, AT_c = network_identifiability(eta_d, \
                                H_d, psi_d, eta_c, H_c, psi_c, w_s_c, w_s_d, w_s_t, bar_L)
                    
            print('New architecture:')
            print('k', k)
            print('r', r)
            print('L', L)
            print('S_1L', S_1L)
            print("w_s_c", len(w_s_c))
            print("w_s_d", len(w_s_d))
        
        M = M_growth(it_num + 1, r_1L, numobs)
        
        prev_lik = deepcopy(new_lik)
        print(likelihood)
        print('Silhouette score:', new_sil)  
        
        it_num = it_num + 1

    out = dict(likelihood = likelihood, classes = classes, \
                   best_r = best_r, best_k = best_k)
    if n_clusters == 'multi':
        out['z'] = z_tail
    else:
        out['z'] = z_tail[clustering_layer]
    return(out)
Ejemplo n.º 24
0
def M1DGMM(y, n_clusters, r, k, init, var_distrib, nj, it = 50, \
          eps = 1E-05, maxstep = 100, seed = None, perform_selec = True,\
              dm = [], max_patience = 1, use_silhouette = True): # dm: precomputed distance matrix (small hack, to be removed)
    
    ''' Fit a Generalized Linear Mixture of Latent Variables Model (GLMLVM)
    
    y (numobs x p ndarray): The observations containing mixed variables
    n_clusters (int): The number of clusters to look for in the data
    r (list): The dimension of latent variables through the first 2 layers
    k (list): The number of components of the latent Gaussian mixture layers
    init (dict): The initialisation parameters for the algorithm
    var_distrib (p 1darray): An array containing the types of the variables in y 
    nj (p 1darray): For binary/count data: The maximum values that the variable can take. 
                    For ordinal data: the number of different existing categories for each variable
    it (int): The maximum number of MCEM iterations of the algorithm
    eps (float): If the likelihood increases by less than eps then the algorithm stops
    maxstep (int): The maximum number of optimisation step for each variable
    seed (int): The random state seed to set (Only for numpy generated data for the moment)
    perform_selec (Bool): Whether to perform architecture selection or not
    use_silhouette (Bool): If True use the silhouette as quality criterion (best for clustering) else use
                            the likelihood (best for data augmentation).
    ------------------------------------------------------------------------------------------------
    returns (dict): The predicted classes, the likelihood through the EM steps
                    and a continuous representation of the data
    '''

    prev_lik = - 1E16
    best_lik = -1E16
    
    best_sil = -1 
    new_sil = -1 
        
    tol = 0.01
    patience = 0
    is_looking_for_better_arch = False
    
    # Initialize the parameters
    eta = deepcopy(init['eta'])
    psi = deepcopy(init['psi'])
    lambda_bin = deepcopy(init['lambda_bin'])
    lambda_ord = deepcopy(init['lambda_ord'])
    lambda_cont = deepcopy(init['lambda_cont'])
    lambda_categ = deepcopy(init['lambda_categ'])

    H = deepcopy(init['H'])
    w_s = deepcopy(init['w_s']) # Probability of path s' through the network for all s' in Omega
   
    numobs = len(y)
    likelihood = []
    silhouette = []
    it_num = 0
    ratio = 1000
    np.random.seed(seed)
    out = {} # Store the full output
        
    # Dispatch variables between categories
    y_bin = y[:, np.logical_or(var_distrib == 'bernoulli',var_distrib == 'binomial')]
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',var_distrib == 'binomial')].astype(int)
    nb_bin = len(nj_bin)
        
    y_ord = y[:, var_distrib == 'ordinal']    
    nj_ord = nj[var_distrib == 'ordinal'].astype(int)
    nb_ord = len(nj_ord)
    
    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical'].astype(int)
    nb_categ = len(nj_categ)    
    
    y_cont = y[:, var_distrib == 'continuous'].astype(float)
    nb_cont = y_cont.shape[1]
    
    # Scale y_cont to unit standard deviation
    y_cont = y_cont / y_cont.std(axis = 0, keepdims = True)
    
    L = len(k)
    k_aug = k + [1]
    S = np.array([np.prod(k_aug[l:]) for l in range(L + 1)])    
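    # Illustrative example (hypothetical values): k = [4, 2] gives k_aug = [4, 2, 1]
    # and S = [8, 2, 1], the number of mixture paths from each layer downwards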
    M = M_growth(1, r, numobs)
   
    assert nb_bin + nb_ord + nb_cont + nb_categ > 0 
    if nb_bin + nb_ord + nb_cont + nb_categ != len(var_distrib):
        raise ValueError('Some variable types were not understood,\
                         existing types are: continuous, categorical,\
                         ordinal, binomial and bernoulli')

    # Compute the Gower matrix
    if len(dm) == 0:
        cat_features = np.logical_or(var_distrib == 'categorical', var_distrib == 'bernoulli')
        dm = gower_matrix(y, cat_features = cat_features)
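        # Only categorical and Bernoulli variables are flagged in cat_features:
        # they contribute 0/1 mismatches to the Gower distance, while the remaining
        # variables contribute range-normalised absolute differences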
    
               
    # Do not stop the iterations if there are some iterations left or if the likelihood is increasing
    # or if we have not reached the maximum patience and if a new architecture was looked for
    # in the previous iteration
    while ((it_num < it) & (ratio > eps) & (patience <= max_patience)) | is_looking_for_better_arch:
        print(it_num)

        # The clustering layer is the one used to perform the clustering 
        # i.e. the layer l such that k[l] == n_clusters
        
        if not(isnumeric(n_clusters)):
            if n_clusters == 'auto':
                clustering_layer = 0
            else:
                raise ValueError('Please enter an int or "auto" for n_clusters')
        else:
            assert (np.array(k) == n_clusters).any()
            clustering_layer = np.argmax(np.array(k) == n_clusters)

        #####################################################################################
        ################################# S step ############################################
        #####################################################################################

        #=====================================================================
        # Draw from f(z^{l} | s, Theta) for all s in Omega
        #=====================================================================  
        
        mu_s, sigma_s = compute_path_params(eta, H, psi)
        sigma_s = ensure_psd(sigma_s)
        z_s, zc_s = draw_z_s(mu_s, sigma_s, eta, M)
         
        #========================================================================
        # Draw from f(z^{l+1} | z^{l}, s, Theta) for l >= 1
        #========================================================================
        
        chsi = compute_chsi(H, psi, mu_s, sigma_s)
        chsi = ensure_psd(chsi)
        rho = compute_rho(eta, H, psi, mu_s, sigma_s, zc_s, chsi)

        # In the following z2 and z1 will denote z^{l+1} and z^{l} respectively
        z2_z1s = draw_z2_z1s(chsi, rho, M, r)
                   
        #=======================================================================
        # Compute the p(y| z1) for all variable categories
        #=======================================================================
        
        py_zl1 = fy_zl1(lambda_bin, y_bin, nj_bin, lambda_ord, y_ord, nj_ord, \
                        lambda_categ, y_categ, nj_categ, y_cont, lambda_cont, z_s[0])
        
        #========================================================================
        # Draw from p(z1 | y, s) proportional to p(y | z1) * p(z1 | s) for all s
        #========================================================================
                
        zl1_ys = draw_zl1_ys(z_s, py_zl1, M)
                
        #####################################################################################
        ################################# E step ############################################
        #####################################################################################
        
        #=====================================================================
        # Compute the conditional probabilities used in the appendix of the ASTA paper
        #=====================================================================
        
        pzl1_ys, ps_y, p_y = E_step_GLLVM(z_s[0], mu_s[0], sigma_s[0], w_s, py_zl1)

        #=====================================================================
        # Compute p(z^{(l)}| s, y). Equation (5) of the paper
        #=====================================================================
        
        pz2_z1s = fz2_z1s(t(pzl1_ys, (1, 0, 2)), z2_z1s, chsi, rho, S)
        pz_ys = fz_ys(t(pzl1_ys, (1, 0, 2)), pz2_z1s)
                
        
        #=====================================================================
        # Compute MFA expectations
        #=====================================================================
        
        Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys = \
            E_step_DGMM(zl1_ys, H, z_s, zc_s, z2_z1s, pz_ys, pz2_z1s, S)


        ###########################################################################
        ############################ M step #######################################
        ###########################################################################
             
        #=======================================================
        # Compute MFA Parameters 
        #=======================================================

        w_s = np.mean(ps_y, axis = 0)      
        eta, H, psi = M_step_DGMM(Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys, ps_y, H, k)

        #=======================================================
        # Identifiability conditions
        #======================================================= 

        # Update eta, H and Psi values
        H = diagonal_cond(H, psi)
        Ez, AT = compute_z_moments(w_s, eta, H, psi)
        eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)
        
        del(Ez)
        
        #=======================================================
        # Compute GLLVM Parameters
        #=======================================================
                        
        lambda_bin = bin_params_GLLVM(y_bin, nj_bin, lambda_bin, ps_y, pzl1_ys, z_s[0], AT[0],\
                     tol = tol, maxstep = maxstep)
                 
        lambda_ord = ord_params_GLLVM(y_ord, nj_ord, lambda_ord, ps_y, pzl1_ys, z_s[0], AT[0],\
                     tol = tol, maxstep = maxstep)
            
        lambda_categ = categ_params_GLLVM(y_categ, nj_categ, lambda_categ, ps_y, pzl1_ys, z_s[0], AT[0],\
                     tol = tol, maxstep = maxstep)

        lambda_cont = cont_params_GLLVM(y_cont, lambda_cont, ps_y, pzl1_ys, z_s[0], AT[0],\
                     tol = tol, maxstep = maxstep)

        ###########################################################################
        ################## Clustering parameters updating #########################
        ###########################################################################
          
        new_lik = np.sum(np.log(p_y))
        likelihood.append(new_lik)
        silhouette.append(new_sil)
        ratio = abs((new_lik - prev_lik)/prev_lik)
        
        idx_to_sum = tuple(set(range(1, L + 1)) - set([clustering_layer + 1]))
        psl_y = ps_y.reshape(numobs, *k, order = 'C').sum(idx_to_sum) 

        temp_class = np.argmax(psl_y, axis = 1)
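        # Hypothetical example: with a single layer k = [3], idx_to_sum is empty,
        # psl_y is just ps_y reshaped to (numobs, 3) and temp_class assigns each
        # observation to the component with the highest posterior probability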
        try:
            new_sil = silhouette_score(dm, temp_class, metric = 'precomputed')
        except ValueError:
            new_sil = -1
           
        # Store the params according to the silhouette or likelihood
        is_better = (best_sil < new_sil) if use_silhouette else (best_lik < new_lik)
            
        if is_better:
            z = (ps_y[..., n_axis] * Ez_ys[clustering_layer]).sum(1)
            best_sil = deepcopy(new_sil)
            classes = deepcopy(temp_class)
            '''
            plt.figure(figsize=(8,8))
            plt.scatter(z[:, 0], z[:, 1], c = classes)
            plt.show()
            '''
            
            # Store the output
            out['classes'] = deepcopy(classes)
            out['best_z'] = deepcopy(z_s[0])
            out['Ez.y'] = z
            out['best_k'] = deepcopy(k)
            out['best_r'] = deepcopy(r)
            
            out['best_w_s'] = deepcopy(w_s)
            out['lambda_bin'] = deepcopy(lambda_bin)
            out['lambda_ord'] = deepcopy(lambda_ord)
            out['lambda_categ'] = deepcopy(lambda_categ)
            out['lambda_cont'] = deepcopy(lambda_cont)

            out['eta'] = deepcopy(eta)            
            out['mu'] = deepcopy(mu_s)
            out['sigma'] = deepcopy(sigma_s)
            
            out['psl_y'] = deepcopy(psl_y)
            out['ps_y'] = deepcopy(ps_y)

            
        # Keep track of the best likelihood reached so far
        if best_lik < new_lik:
            best_lik = deepcopy(new_lik)
                               
        if prev_lik < new_lik:
            patience = 0
            M = M_growth(it_num + 2, r, numobs)
        else:
            patience += 1
                          
        ###########################################################################
        ######################## Parameter selection  #############################
        ###########################################################################
        min_nb_clusters = 2
       
        if isnumeric(n_clusters): # To change when the multi mode is added
            is_not_min_specif = not(np.all(np.array(k) == n_clusters) & np.array_equal(r, [2,1]))
        else:
            is_not_min_specif = not(np.all(np.array(k) == min_nb_clusters) & np.array_equal(r, [2,1]))
        
        is_looking_for_better_arch = look_for_simpler_network(it_num) & perform_selec & is_not_min_specif
        if is_looking_for_better_arch:
            r_to_keep = r_select(y_bin, y_ord, y_categ, y_cont, zl1_ys, z2_z1s, w_s)
            
            # If r_l == 0, delete the last l + 1 layers
            new_L = np.sum([len(rl) != 0 for rl in r_to_keep]) - 1 
            
            k_to_keep = k_select(w_s, k, new_L, clustering_layer, not(isnumeric(n_clusters)))
    
            is_L_unchanged = (L == new_L)
            is_r_unchanged = np.all([len(r_to_keep[l]) == r[l] for l in range(new_L + 1)])
            is_k_unchanged = np.all([len(k_to_keep[l]) == k[l] for l in range(new_L)])
              
            is_selection = not(is_r_unchanged & is_k_unchanged & is_L_unchanged)
            
            assert new_L > 0
            
            if is_selection:           
                
                eta = [eta[l][k_to_keep[l]] for l in range(new_L)]
                eta = [eta[l][:, r_to_keep[l]] for l in range(new_L)]
                
                H = [H[l][k_to_keep[l]] for l in range(new_L)]
                H = [H[l][:, r_to_keep[l]] for l in range(new_L)]
                H = [H[l][:, :, r_to_keep[l + 1]] for l in range(new_L)]
                
                psi = [psi[l][k_to_keep[l]] for l in range(new_L)]
                psi = [psi[l][:, r_to_keep[l]] for l in range(new_L)]
                psi = [psi[l][:, :, r_to_keep[l]] for l in range(new_L)]
                
                if nb_bin > 0:
                    # Add the intercept:
                    bin_r_to_keep = np.concatenate([[0], np.array(r_to_keep[0]) + 1]) 
                    lambda_bin = lambda_bin[:, bin_r_to_keep]
                 
                if nb_ord > 0:
                    # Intercept coefficients handling is a little more complicated here
                    lambda_ord_intercept = [lambda_ord_j[:-r[0]] for lambda_ord_j in lambda_ord]
                    Lambda_ord_var = np.stack([lambda_ord_j[-r[0]:] for lambda_ord_j in lambda_ord])
                    Lambda_ord_var = Lambda_ord_var[:, r_to_keep[0]]
                    lambda_ord = [np.concatenate([lambda_ord_intercept[j], Lambda_ord_var[j]])\
                                  for j in range(nb_ord)]
    
                # To recheck
                if nb_cont > 0:
                    # Add the intercept:
                    cont_r_to_keep = np.concatenate([[0], np.array(r_to_keep[0]) + 1]) 
                    lambda_cont = lambda_cont[:, cont_r_to_keep]  
                    
                if nb_categ > 0:
                    lambda_categ_intercept = [lambda_categ[j][:, 0]  for j in range(nb_categ)]
                    Lambda_categ_var = [lambda_categ_j[:, -r[0]:] for lambda_categ_j in lambda_categ]
                    Lambda_categ_var = [Lambda_categ_var_j[:, r_to_keep[0]] for Lambda_categ_var_j in Lambda_categ_var]

                    lambda_categ = [np.hstack([lambda_categ_intercept[j][..., n_axis], Lambda_categ_var[j]])\
                                   for j in range(nb_categ)]  

                w = w_s.reshape(*k, order = 'C')
                new_k_idx_grid = np.ix_(*k_to_keep[:new_L])
                
                # If layer deletion, sum the last components of the paths
                if L > new_L: 
                    deleted_dims = tuple(range(L)[new_L:])
                    w_s = w[new_k_idx_grid].sum(deleted_dims).flatten(order = 'C')
                else:
                    w_s = w[new_k_idx_grid].flatten(order = 'C')
    
                w_s /= w_s.sum()
                
                
                # Refresh the classes: TO RECHECK
                #idx_to_sum = tuple(set(range(1, L + 1)) - set([clustering_layer + 1]))
                #ps_y_tmp = ps_y.reshape(numobs, *k, order = 'C').sum(idx_to_sum)
                #np.argmax(ps_y_tmp[:, k_to_keep[0]], axis = 1)

    
                k = [len(k_to_keep[l]) for l in range(new_L)]
                r = [len(r_to_keep[l]) for l in range(new_L + 1)]
                
                k_aug = k + [1]
                S = np.array([np.prod(k_aug[l:]) for l in range(new_L + 1)])    
                L = new_L

                patience = 0
                
                # Identifiability conditions
                H = diagonal_cond(H, psi)
                Ez, AT = compute_z_moments(w_s, eta, H, psi)
                eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)
        
                del(Ez)
                                                
                         
            print('New architecture:')
            print('k', k)
            print('r', r)
            print('L', L)
            print('S',S)
            print("w_s", len(w_s))
            
        prev_lik = deepcopy(new_lik)
        it_num = it_num + 1
        print(likelihood)
        print(silhouette)
        

    out['likelihood'] = likelihood
    out['silhouette'] = silhouette
    
    return(out)
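
# A minimal usage sketch of M1DGMM (not part of the original snippet). The data `y`,
# the array `var_distrib` and the helpers compute_nj / dim_reduce_init are assumed to
# be available as in the surrounding examples; the values below are illustrative only.
#
# import numpy as np
# import pandas as pd
# from gower import gower_matrix
#
# n_clusters = 2
# r = [2, 1]                      # latent dimensions of the two layers
# k = [n_clusters]                # one mixture layer with n_clusters components
# nj, nj_bin, nj_ord, nj_categ = compute_nj(y, var_distrib)
# cat_features = np.logical_or(var_distrib == 'categorical', var_distrib == 'bernoulli')
# dm = gower_matrix(y, cat_features=cat_features)
# init = dim_reduce_init(pd.DataFrame(y), n_clusters, k, r, nj, var_distrib, seed=None)
# out = M1DGMM(y, n_clusters, r, k, init, var_distrib, nj, it=50,
#              eps=1E-05, maxstep=100, perform_selec=True, dm=dm)
# print(out['classes'][:10], out['likelihood'][-1])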
Ejemplo n.º 25
0
def dummies_gower(df):
    # Treat every column as categorical and index the result like the input frame
    return pd.DataFrame(gower_matrix(df.applymap(str),
                                     cat_features=[True for v in df.columns])).set_index(df.index)
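
# Usage sketch (hypothetical data, not in the original snippet): every column is
# flagged as categorical, so each entry of the returned frame is the fraction of
# columns on which the corresponding pair of rows disagrees.
#
# videos_df = pd.DataFrame({'channel': ['a', 'b', 'a'], 'lang': ['en', 'en', 'fr']})
# print(dummies_gower(videos_df))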
Ejemplo n.º 26
0
        metrics.adjusted_mutual_info_score,
    ]

    for artist_gt in range(ARTIST_GT):
        data = pd.read_csv(
            "../data preprocessing/final_df_with_encodings_with_price_binned.csv",
            header=0,
            usecols=[*features, *encodings])
        filter_artist_gt(artist_gt)
        n_rows = len(data)
        mapping = {"(0, 250]": 0, "(250, 1250]": 1, "(1250, 4200]": 2}
        labels_true = [mapping[x] for x in data['price_binned']]
        results = []
        for name, features_ in feature_groups.items():
            result = [name]
            features_used = gower.gower_matrix(data[features_])
            cluster = SpectralClustering(NUMBER_OF_PRICE_BINS,
                                         affinity='precomputed',
                                         n_init=200,
                                         n_jobs=-1).fit(features_used)
            labels_pred = cluster.labels_
            result += [m(labels_true, labels_pred) for m in clustering_metrics]
            results.append(result)
        print(
            pd.DataFrame(
                results, columns=[
                    "Feature", "V-measure", "Adj. Rand Index"
                ]).sort_values(by='Adj. Rand Index', ascending=False).to_latex(
                    f"../data preprocessing/artist_gt_{artist_gt}_ari.tex",
                    index=False,
                    caption=f"Number of rows {n_rows}"))
Ejemplo n.º 27
0
#y = y.where(~nan_mask, np.nan)

nj, nj_bin, nj_ord, nj_categ = compute_nj(full_contra, var_distrib)
nb_cont = np.sum(var_distrib == 'continuous')

p_new = full_contra.shape[1]

# Cast each variable to its declared dtype
dtype = {full_contra.columns[j]: dtypes_dict[var_distrib[j]] for j in range(p_new)}
full_contra = full_contra.astype(dtype, copy=True)

# Feature category (cf)
cat_features = var_distrib == 'categorical'

# Defining distance matrix
dm3 = gower_matrix(full_contra, cat_features = cat_features) 

#===========================================#
# Hyperparameters
#===========================================# 

n_clusters = 2
nb_pobs = 100 # Target for pseudo observations
r = np.array([2, 1])
numobs = len(full_contra)
k = [n_clusters]

seed = 1
init_seed = 2
    
eps = 1E-05
Ejemplo n.º 28
0
nj, nj_bin, nj_ord, nj_categ = compute_nj(y, var_distrib)
y_np = y.values
nb_cont = np.sum(var_distrib == 'continuous')

p_new = y.shape[1]

# Feature category (cf)
cf_non_enc = np.logical_or(vd_categ_non_enc == 'categorical',
                           vd_categ_non_enc == 'bernoulli')

# Non encoded version of the dataset:
y_nenc_typed = y_categ_non_enc.astype(object)
y_np_nenc = y_nenc_typed.values

# Defining distances over the non encoded features
dm = gower_matrix(y_nenc_typed, cat_features=cf_non_enc)

dtype = {y.columns[j]: np.float64 if (var_distrib[j] != 'bernoulli') and \
        (var_distrib[j] != 'categorical') else str for j in range(p_new)}

y = y.astype(dtype, copy=True)

#===========================================#
# Running the algorithm
#===========================================#

r = np.array([2, 1])
numobs = len(y)
k = [n_clusters]

seed = 1
Ejemplo n.º 29
0
s1 = '1'
s2 = '1000'
n1 = 1
n2 = 1000
types = {
    'numbers': [n1, n2, 2, 2000],
    'strings': [s1, s2, '2', '2000'],
    'mixed': [n1, n2, s1, s2]
}
orders = {
    'same': lambda a: [a, a],
    '25%': lambda a: [a, [a[0], a[1], a[2], a[2]]],
    '25% 2': lambda a: [a, [a[0], a[0], a[2], a[3]]],
    '50%': lambda a: [a, [a[0], a[1], a[0], a[1]]],
    '50% 2': lambda a: [a, [a[0], a[2], a[1], a[3]]],
    '50% 3': lambda a: [a, [a[3], a[1], a[2], a[0]]],
    '75%': lambda a: [a, [a[0], a[0], a[0], a[0]]],
    '75% 2': lambda a: [a, [a[1], a[0], a[3], a[3]]],
    '100%': lambda a: [a, [a[3], a[2], a[1], a[0]]]
}

for k, a in types.items():
    print(k)
    for o, func in orders.items():
        print(o)
        b = func(a)
        X = pd.DataFrame(b)
        X['c'] = 'c'
        print(X)
        print(gower.gower_matrix(X))
Ejemplo n.º 30
0
            #print(set(discrete_k))
            #test[col] = pd.cut(test[col], bins).map(lambda x: x.mid).astype(float)

            le.fit(test[col].append(train[col]))
            train[col] = le.transform(train[col])
            k_dict[col] = deepcopy(le)

        nj, nj_bin, nj_ord, nj_categ = compute_nj(train, var_distrib)
        nb_cont = np.sum(var_distrib == 'continuous')
        p_new = train.shape[1]
        train_np = train.values

        # Defining distances over the features
        cat_features = pd.Series(var_distrib).isin(
            ['categorical', 'bernoulli']).to_list()
        dm = gower_matrix(train.astype(object), cat_features=cat_features)

        dtype = {train.columns[j]: np.float64 if (var_distrib[j] != 'bernoulli') and \
                (var_distrib[j] != 'categorical') else str for j in range(p_new)}

        train = train.astype(dtype, copy=True)
        numobs = len(train)

        #*****************************************************************
        # Run MIAMI
        #*****************************************************************

        prince_init = dim_reduce_init(train, 2, k, r, nj, var_distrib, seed = None,\
                                      use_famd=True)
        out = MIAMI(train_np, 'auto', r, k, prince_init, var_distrib, nj, authorized_ranges, nb_pobs, it,\
                     eps, maxstep, seed, perform_selec = False, dm = dm, max_patience = 0)