def load_feature_data():
    dataPath = '../data/TrainFeature-normalized image.csv'
    data = pd.read_csv(dataPath)
    row, _ = data.shape
    originData = pd.DataFrame(data)
    # print(originData)
    gleason_data = originData[originData['ClassifyValue'] != 0]
    label = gleason_data['ClassifyValue'] - 1
    label = label.reset_index()
    # print(label['ClassifyValue'].value_counts())
    # print(label)
    selectedFeatutesList = []
    # colNames = gleason_data[gleason_data.columns[8:]].columns
    for colName in gleason_data.columns[8:]:
        # if 'T2' not in colName:
        selectedFeatutesList.append(colName)
    gleason_data = pd.DataFrame(gleason_data, columns=selectedFeatutesList)
    colNames = gleason_data.columns
    gleason_data = gleason_data.fillna(0)
    gleason_data = gleason_data.astype(np.float64)
    gleason_data = StandardScaler().fit_transform(gleason_data)
    gleason_data = pd.DataFrame(gleason_data)
    gleason_data.columns = colNames
    # smo = SMOTE(random_state=2)
    # x_smote, y_smote = smo.fit_sample(gleason_data, label['ClassifyValue'])
    # input_features = x_smote.shape[1]
    input_features = gleason_data.shape[1]
    # print(x_smote.shape[1])
    # print(x_smote)
    return gleason_data, label['ClassifyValue'], input_features
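# Usage sketch (added, not part of the original example): assumes the CSV
# referenced above is present and that scikit-learn is available as in the rest
# of this file. The RandomForest choice is an arbitrary stand-in, not the
# author's model.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

features, labels, n_inputs = load_feature_data()
rf = RandomForestClassifier(n_estimators=200, random_state=0)
print('5-fold CV accuracy:', cross_val_score(rf, features, labels, cv=5).mean())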
def feature_selection(df, target):
    convert_dct = {'integer': 'int64', 'string': 'object', 'float': 'float64', 'boolean': 'bool',
                   'date-iso-8601': 'datetime64[ns]', 'date-eu': 'datetime64[ns]',
                   'date-non-std-subtype': 'datetime64[ns]', 'date-non-std': 'datetime64[ns]', 'gender': 'category',
                   'all-identical': 'category'}
    ptype = Ptype()
    ptype.run_inference(df)
    predicted = ptype.predicted_types
    count_normal_vars = 0
    count_continuous_vars = 0
    features = []
    for key in predicted:
        # print(key, predicted[key])
        if predicted[key] == 'int' or predicted[key] == 'float':
            features.append(key)
    x = df.loc[:, features].values
    x = StandardScaler().fit_transform(x)
    x = pd.DataFrame(x)
    x.columns = features


    X = x.drop(target, axis=1)  # Feature Matrix
    y = x[target]  # Target Variable

    # no of features
    nof_list = np.arange(1, len(features))
    high_score = 0
    # Variable to store the optimum features
    nof = 0
    score_list = []
    for n in range(len(nof_list)):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=nof_list[n])
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if (score > high_score):
            high_score = score
            nof = nof_list[n]
    # print("Optimum number of features: %d" % nof)
    # print("Score with %d features: %f" % (nof, high_score))
    cols = list(X.columns)
    model = LinearRegression()
    # Initializing RFE model
    rfe = RFE(model, n_features_to_select=nof)
    # Transforming data using RFE
    X_rfe = rfe.fit_transform(X, y)
    # Fitting the data to model
    model.fit(X_rfe, y)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp == True].index

    quality_measure = nof/len(features)
    return quality_measure
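# Added illustration (not from the original): the RFE search above in isolation,
# on synthetic data -- scan candidate feature counts with a linear model and keep
# the count that scores best on a held-out split. Assumes the same sklearn/numpy
# imports used by the function above.
from sklearn.datasets import make_regression

Xd, yd = make_regression(n_samples=200, n_features=8, n_informative=3,
                         noise=5.0, random_state=0)
Xd_tr, Xd_te, yd_tr, yd_te = train_test_split(Xd, yd, test_size=0.3,
                                              random_state=0)
best_n, best_score = 0, -np.inf
for n in range(1, Xd.shape[1] + 1):
    sel = RFE(LinearRegression(), n_features_to_select=n).fit(Xd_tr, yd_tr)
    score = LinearRegression().fit(sel.transform(Xd_tr),
                                   yd_tr).score(sel.transform(Xd_te), yd_te)
    if score > best_score:
        best_n, best_score = n, score
print('optimum number of features:', best_n)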
 def TTest_Lasso(data):
     selectedFeatutesList = []
     label = data['包膜侵犯']  # '包膜侵犯' (capsular invasion) is the binary label column
     print(label[label[0:] == 0].values.size,
           label[label[0:] == 1].values.size)
     colNames = data[data.columns[1:]].columns
     data = data[data.columns[1:]].fillna(0)
     data = data.astype(np.float64)
     data = StandardScaler().fit_transform(data)
     data = pd.DataFrame(data)
     data.columns = colNames
     data['label'] = label
     # balanced Data
     smo = SMOTE(random_state=2)
     X_smote, y_smote = smo.fit_resample(data, data['label'])
     print(X_smote)
     for colName in X_smote.columns[0:-1]:
         if 'ADC' in colName:
             if levene(X_smote[X_smote['label'] == 0][colName], X_smote[X_smote['label'] == 1][colName])[1] > 0.05 \
                 and ttest_ind(X_smote[X_smote['label'] == 0][colName], X_smote[X_smote['label'] == 1][colName])[1] < 0.05:
                 selectedFeatutesList.append(colName)
             elif levene(X_smote[X_smote['label'] == 0][colName], X_smote[X_smote['label'] == 1][colName])[1] <= 0.05 and \
                     ttest_ind(X_smote[X_smote['label'] == 0][colName], X_smote[X_smote['label'] == 1][colName],
                               equal_var=False)[1] < 0.05:
                 selectedFeatutesList.append(colName)
     if 'label' not in selectedFeatutesList:
         selectedFeatutesList = ['label'] + selectedFeatutesList
     # print(index)
     data1 = X_smote[X_smote['label'] == 0][selectedFeatutesList]
     data2 = X_smote[X_smote['label'] == 1][selectedFeatutesList]
     trainData = pd.concat([data1, data2])
     # print(trainData)
     trainData = shuffle(trainData)
     trainData.index = range(len(trainData))
     X = trainData[trainData.columns[1:]]
     y = trainData['label']
     alphas = np.logspace(-3, 1, 50)
     model_lassoCV = LassoCV(alphas=alphas, cv=5, max_iter=3000).fit(X, y)
     print(model_lassoCV.alpha_)
     coef = pd.Series(model_lassoCV.coef_, index=X.columns)
     index = coef[coef != 0].index
     X = X[index]
     print(coef[coef != 0].sort_values(axis=0, ascending=False))
     # featureNum = np.arange(len(index))
     # featureCoef = coef[coef != 0]
     # plt.bar(featureNum, featureCoef,
     #         color='lightblue',
     #         edgecolor='black',
     #         alpha=0.8)
     # plt.xticks(featureNum, index,
     #            rotation='45',
     #            ha='right',
     #            va='top')
     print("Lasso picked " + str(sum(coef != 0)) +
           " variables and eliminated the other " + str(sum(coef == 0)) +
           " variables")
     return X, y
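# Added illustration (not from the original): the Levene-then-t-test gate used
# above, in isolation on two synthetic feature vectors. Levene's test decides
# whether the pooled-variance t-test or Welch's test (equal_var=False) is applied.
# Assumes numpy/scipy are imported as elsewhere in this file.
from scipy.stats import levene, ttest_ind

_rng = np.random.RandomState(0)
_g0, _g1 = _rng.normal(0, 1, 40), _rng.normal(0.8, 2.0, 40)
if levene(_g0, _g1)[1] > 0.05:
    _p = ttest_ind(_g0, _g1)[1]
else:
    _p = ttest_ind(_g0, _g1, equal_var=False)[1]
print('feature kept:', _p < 0.05)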
Example #4
def prepare_data(filename):
    data = pd.read_csv(filename)
    print(f'File {filename}\nNumber of exceptions:\n{data["class"].value_counts()}')
    data = StandardScaler().fit_transform(data)
    data = normalize(data)
    data = PCA(n_components=2).fit_transform(data)
    data = pd.DataFrame(data)
    data.columns = ['P1', 'P2']
    return data
Example #5
def scale_features(df):
    '''
    Scales the dataframe features with StandardScaler.
    '''
    df_clms = df.columns
    df_scaled = StandardScaler().fit_transform(df)
    df_scaled = pd.DataFrame(df_scaled)
    df_scaled.columns = df_clms

    return df_scaled
Example #6
 def select_clustering_columns(self, col, PCA=False, n_component=0.80):
     """Select the columns that will be use in the clustering"""
     if PCA:
         self.clustered_df = self.PCA(n_component, col)
     else:
         df = self.df[col]
         df = StandardScaler().fit_transform(
             df)  # Standardized the features
         df = pd.DataFrame(df)
         df.columns = col
         self.clustered_df = df
Example #7
def prepare_data(filename):
    data = pd.read_csv(filename)
    if filename == 'online_shoppers_intention.csv':
        data = pd.get_dummies(data, columns=['Month', 'VisitorType'])
        data.fillna(0, inplace=True)
    data = StandardScaler().fit_transform(data)
    data = normalize(data)
    data = PCA(n_components=2).fit_transform(data)
    data = pd.DataFrame(data)
    data.columns = ['P1', 'P2']
    return data[:5000]
Example #8
def prepare_input_data(input_data,
                       fill_blanks=True,
                       strategy="median",
                       standardize=True,
                       pred=False):
    """

	Args:
		input_data:
		fill_blanks:
		strategy:
		standardize:
		pred:

	Returns:

	"""
    data = get_data(input_data=input_data)
    # print(data)
    df = data[data.columns.drop(list(data.filter(regex='pred|flag')))]
    predicted_labels = data.filter(regex='pred|flag')
    cols = df.columns
    columns_full = list(cols)
    columns_without_ids = columns_full
    columns_without_ids.remove('ID')
    columns_without_ids.remove('True_Label')
    info = data[['ID', 'True_Label']]

    data_without_blanks = pd.DataFrame()
    if fill_blanks is True:
        data_without_blanks = filling_blank_cells(df.drop(['ID', 'True_Label'],
                                                          axis=1),
                                                  strategy=strategy)
    if fill_blanks is False:
        data_without_blanks = df.drop(['ID', 'True_Label'], axis=1).dropna()
    if standardize is True:
        data_without_blanks = StandardScaler().fit_transform(
            data_without_blanks)
    if standardize is False:
        pass
    # StandardScaler returns a numpy array, so rebuild the DataFrame before
    # assigning the column names
    data_without_blanks = pd.DataFrame(data_without_blanks)
    data_without_blanks.columns = columns_without_ids
    complete_data = merge_dataframe(info, data_without_blanks)
    complete_data = complete_data.join(predicted_labels, how='left')
    if pred is False:
        complete_data = complete_data.dropna()

    # print('\n')
    return complete_data
 def Pearson_RFE_selector(originData):
     label = originData['label']
     print(label[label[0:] == 0].values.size,
           label[label[0:] == 1].values.size)
     colNames = originData[originData.columns[2:-1]].columns
     data = originData[originData.columns[2:-1]].fillna(0)
     data = data.astype(np.float64)
     data = StandardScaler().fit_transform(data)
     data = pd.DataFrame(data)
     data.columns = colNames
     data['label'] = label
     # balanced Data
     smo = SMOTE(random_state=3)
     X_smote, y_smote = smo.fit_resample(data[data.columns[0:-1]],
                                         data['label'])
     # print(X_smote)
     # features_list = []
     # for colName in X_smote.columns:
     #     if 'DWI' in colName:
     #         features_list.append(colName)
     # X_smote = X_smote[features_list]
     # print(X_smote)
     # print(X_smote.corr(method='pearson').columns)
     # pearson_corr = X_smote.corr(method='pearson')
     # pearson_features_list = []
     # mean = pearson_corr['label'].mean()
     # for colName in pearson_corr[(abs(pearson_corr['label']) > mean) & (abs(pearson_corr['label']) < 1)].index:
     #     # if 'T2' in colName or 'ADC' in colName or 'DWI' in colName:
     #         pearson_features_list.append(colName)
     # print(X_smote[pearson_features_list])
     # X_pearson = X_smote[pearson_features_list]
     # y_pearson = X_smote['label']
     lr = LinearRegression()
     rfe = RFECV(lr, step=1, cv=5)
     rfe.fit(X_smote, y_smote)
     X_rfe = X_smote.loc[:, rfe.support_]
     # print(X_rfe)
     names = X_rfe.columns
     ranks = rfe.ranking_
     feature_indexes = []
     for i in range(len(ranks)):
         if ranks[i] == 1:
             feature_indexes += [i]
     print(len(feature_indexes))
     print(names)
     # print(sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names)))
     return X_rfe, y_smote
Example #10
def calculate_all_neighbours(morph_stats_cut, all_by_all_nucleus):
    """ Run all calculations for neighbours based on the provided morphology table, and the validation / xyz
    criteria"""

    normed_morph = StandardScaler().fit_transform(morph_stats_cut)
    normed_morph = pd.DataFrame(normed_morph)
    normed_morph.columns = morph_stats_cut.columns

    # all by all neighbours, rows == no of rows in normed_morph, one for each cell
    # columns give index of neighbours (1st column == 1st neighbour, 2nd column == 2nd neighbour and so on...)
    nn = neighbours(normed_morph)

    # result using nuclei xyz position criteria - list of len() equal to number of rows in morphology table,
    # each entry is the number of neighbours away the first cell to meet the xyz criteria occurs
    result_nuc_criteria = get_nearest_index(nn, all_by_all_nucleus)

    # Cells where no cells meet the xyz criteria have values of -1

    # check - each row of nn should contain all possible indices (0 - nrow of nn) apart from its own index
    unique_vals = np.array(range(0, nn.shape[0]))
    for i in range(0, nn.shape[0]):
        row = nn[i, :]
        test = np.sort(unique_vals[unique_vals != i])
        row_vals = np.sort(np.unique(row))
        if not (test == row_vals).all():
            print(i)
            print('row does not contain all values apart from its own index')

    # calculate randomised results
    nuc_results_random = []
    for i in range(0, 100):
        print(i)
        # shuffle each row & re-run
        nn_shuffle = nn.copy()
        for j in range(0, nn.shape[0]):
            np.random.shuffle(nn_shuffle[j, :])

        nuc_results_random.append(
            get_nearest_index(nn_shuffle, all_by_all_nucleus))

    # one row per iteration, number of cols == number of rows in morphology table. Each entry is the number of
    # neighbours away the first cell to meet the xyz criteria occurs
    nuc_results_random = pd.DataFrame.from_records(nuc_results_random)

    return result_nuc_criteria, nuc_results_random
Example #11
    def PCA(self, n_component, columns):
        """ n_component : number of component of the PCA.
        columns: columns used to execute the PCA
        Transform the data into principal components using PCA.
        Return a dataframe df containing the microtrips described by their new (n_component) component."""
        df = self.df[columns]  # select only columns required for PCA
        df = StandardScaler().fit_transform(df)  # Standardized the features
        pca = PCA(n_components=n_component)
        pca.fit(df)
        pca = pca.transform(df)

        columns = ['Component' + str(i) for i in range(pca.shape[1])]
        df = pd.DataFrame(pca)
        df.columns = columns
        return df
Example #12
def feature_scaling(df, type_scale):
    '''
    This function takes in either the azdias or the customers dataframe and applies the selected feature scaler
    Args: customer or azdias dataframe and a string representing the type of scaling intended
    returns: scaled dataframe
    '''

    features_list = df.columns

    if type_scale == 'StandardScaler':
        df_scaled = StandardScaler().fit_transform(df)

    if type_scale == 'MinMaxScaler':
        df_scaled = MinMaxScaler().fit_transform(df)

    df_scaled = pd.DataFrame(df_scaled)
    df_scaled.columns = features_list

    return df_scaled
Example #13
 def apply_pca(self, df, num_pcs, use_whiten=False):
     """
     Apply Principal Components Analysis to dataframe.
     
     Parameters:
         - df, dataframe to apply PCA to
         - num_pcs, int, the number of principal components to use
         - use_whiten, boolean, flag to indicate whether data should be 
             whitened
     
     Return:
         pandas dataframe containing the transformed data.
     """
     # ensure data is scaled appropriately
     subreddits = df.index
     df = StandardScaler().fit_transform(df)
     # create the PCA object with the desired number of principal components
     pca = PCA(n_components = num_pcs, whiten=use_whiten)
     # return the PCA'd data
     df = pd.DataFrame(pca.fit_transform(df))
      
     df.columns = ["PC_" + str(col) for col in df.columns.values]
     return df.set_index(subreddits)
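# Added illustration (not from the original): the same scale-then-PCA recipe as
# apply_pca above, run standalone on a small synthetic matrix; with whiten=True
# each retained component comes out with (approximately) unit variance. Uses the
# PCA/StandardScaler classes already imported for the method above.
from sklearn.datasets import make_blobs

_demo, _ = make_blobs(n_samples=60, n_features=8, random_state=0)
_demo_pcs = PCA(n_components=3, whiten=True).fit_transform(
    StandardScaler().fit_transform(_demo))
print(_demo_pcs.std(axis=0))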
data_train_ = data_train.copy()
dt1 = pd.to_datetime(data_train["register_time"])
data_train["register_time"] = dt1.dt.dayofyear
# data_train = data_train.drop("user_id",axis=1)
data_train[
    'cha'] = data_train['prediction_pay_price'] - data_train['pay_price']
data_train = data_train.drop(["lable", 'prediction_pay_price'], axis=1)

# move 'cha' to the last column so it can be split off as the target below
data_train['cha_2'] = data_train['cha']
del data_train['cha']
data_train.rename(columns={'cha_2': 'cha'}, inplace=True)
x_, y = data_train.iloc[:, 0:len(data_train.columns) -
                        1], data_train.iloc[:, len(data_train.columns) - 1:]
x = StandardScaler().fit_transform(x_)
x = pd.DataFrame(x)
x.columns = x_.columns

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=1,
                                                    test_size=0.5)

xgb = XGBRegressor(n_estimators=400,
                   learning_rate=0.01,
                   max_depth=4,
                   random_state=1)
xgb.fit(x_train, y_train)
y_train_pred = xgb.predict(x_train)
y_test_pred = xgb.predict(x_test)
y_pred = xgb.predict(x)
print('xgb train MSE:', mean_squared_error(y_train, y_train_pred))
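# Added follow-up (assumption): the original snippet computes but never scores
# the held-out predictions; the corresponding test-set error would be
print('xgb test MSE:', mean_squared_error(y_test, y_test_pred))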
 def TTest_mRMR_svmRFE_selector(originData):
     selectedFeatutesList = []
     label = originData['label']
     colNames = originData[originData.columns[2:8]].columns
     data = originData[originData.columns[2:8]].fillna(0)
     data = data.astype(np.float64)
     data = StandardScaler().fit_transform(data)
     # minmax_scale = preprocessing.MinMaxScaler().fit(data)
     # data = minmax_scale.transform(data)
     data = pd.DataFrame(data)
     data.columns = colNames
     data['label'] = label
     # balanced Data
     smo = SMOTE(random_state=3)
     X_smote, y_smote = smo.fit_resample(data, data['label'])
     for colName in X_smote.columns[0:-1]:
         # if 'DWI' in colName:
         if levene(X_smote[X_smote['label'] == 0][colName], X_smote[X_smote['label'] == 1][colName])[1] > 0.05 and \
                 ttest_ind(X_smote[X_smote['label'] == 0][colName], X_smote[X_smote['label'] == 1][colName])[
                     1] < 0.05:
             selectedFeatutesList.append(colName)
         elif levene(X_smote[X_smote['label'] == 0][colName], X_smote[X_smote['label'] == 1][colName])[1] <= 0.05 and \
                 ttest_ind(X_smote[X_smote['label'] == 0][colName], X_smote[X_smote['label'] == 1][colName],
                           equal_var=False)[1] < 0.05:
             selectedFeatutesList.append(colName)
     if 'label' not in selectedFeatutesList:
         selectedFeatutesList = ['label'] + selectedFeatutesList
     # print(index)
     data1 = X_smote[X_smote['label'] == 0][selectedFeatutesList]
     data2 = X_smote[X_smote['label'] == 1][selectedFeatutesList]
     trainData = pd.concat([data1, data2])
     # trainData = shuffle(trainData)
     # trainData.index = range(len(trainData))  # renumber after shuffling
     X = trainData[trainData.columns[1:]]
     y = trainData['label']
     # print(X_Smote)
     # mRMR_features = pymrmr.mRMR(X_smote, 'MIQ', 15)
     # define MI_FS feature selection method
     feat_selector = mifs.MutualInformationFeatureSelector(method='JMIM')
     feat_selector.fit(X, y)
     # feat_selector._support_mask
     # feat_selector.ranking_
     # call transform() on X to filter it down to selected features
     # X_filtered = feat_selector.transform(X_smote)
     # X_filtered = pd.DataFrame(X_filtered)
     # print(feat_selector.ranking_)
     # if 'label' not in mRMR_features: mRMR_features = ['label'] + mRMR_features
     X_mRMR = X.loc[:, feat_selector._support_mask]
     colNames = X_mRMR.columns
     clf = LinearSVC()
     # featureNums = len(selectedFeatutesList)
     # print(featureNums)
     model = RFE(clf, n_features_to_select=len(feat_selector.ranking_))
     # print(y)
     # print(X_mRMR)
     model.fit(X_mRMR, y)
     feats = list(np.array(colNames)[model.support_])
     for featureNames in feats:
         print(featureNames)
     print(len(feats))
     X_RFE = X_mRMR[feats]
     return X_RFE, y
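# Added illustration (not from the original): the final SVM-RFE step in
# isolation -- recursive feature elimination driven by a linear SVM on a
# synthetic classification problem. Assumes numpy is imported as above.
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC

_Xd, _yd = make_classification(n_samples=80, n_features=10, n_informative=4,
                               random_state=0)
_rfe = RFE(LinearSVC(max_iter=5000), n_features_to_select=4).fit(_Xd, _yd)
print('kept feature indices:', np.where(_rfe.support_)[0])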
Example #16
def mass_univariate(df: Optional[pd.DataFrame] = None,
                    cat_independentVar_cols: Optional[Union[List[str],
                                                      List[np.ndarray]]] = None,
                    cont_independentVar_cols: Optional[Union[List[str],
                                                             List[np.ndarray]]] = None,
                    dependentVar_cols: Optional[Union[List[str],
                                                      List[np.ndarray]]] = None,
                    scaling: bool=True,
                    col_to_drop : Optional[List[str]] = None,
                    additional_info=None) -> Union[sm.regression.linear_model.RegressionResultsWrapper, pd.DataFrame]:
    """[Returns the model and model summary
         Performs univariate test, implements statsmodel.api.OLS]

    Args:
        df (Optional[pd.DataFrame]): [pandas data frame, where each row is one observation]
        cat_independentVar_cols (Optional[Union[List[str], List[np.ndarray]]], optional): [list of categorical variables, such that they will be converted to dummies by pandas]. Defaults to None.
        cont_independentVar_cols (Optional[Union[List[str], List[np.ndarray]]], optional): [list of continuous variables, they will be appended to cat_independentVar_cols]. Defaults to None.
        dependentVar_cols (Optional[Union[List[str], List[np.ndarray]]], optional): [the dependent variable(s); a separate OLS model is fitted for each column]. Defaults to None.
        scaling (bool): Defaults to True. If False, StandardScaler is not applied.
        col_to_drop (Optional[List[str]]): columns to drop from the independent variables before fitting the model.
        additional_info ([type], optional): [additional columns to be appended]. Defaults to None.

    Returns:
        Union[sm.regression.linear_model.RegressionResultsWrapper, pd.DataFrame]: [the statsmodel model and dataframe of the model summary, where cols are independent variables and const]
    """
    model_summary = defaultdict(list)
    if type(df) == pd.DataFrame:
        new_df = df.copy()

    cat_independentVar = defaultdict(list)
    cont_independentVar = defaultdict(list)
    dependentVar = defaultdict(list)
    independentVar = dict()
    if type(cat_independentVar_cols) == list:
        for idx, cat_independentVar_col in enumerate(cat_independentVar_cols):
            if type(cat_independentVar_col) == str:
                cat_independentVar_temp = pd.get_dummies(new_df[cat_independentVar_col].values,
                                                         prefix=cat_independentVar_col,
                                                         drop_first=True).to_dict(orient='list')

                cat_independentVar.update(cat_independentVar_temp)
            if type(cat_independentVar_col) == np.ndarray:
                cat_independentVar_temp = pd.get_dummies(cat_independentVar_col,
                                                         prefix='Cat_' +
                                                         str(idx),
                                                         drop_first=True).to_dict(orient='list')

                cat_independentVar.update(cat_independentVar_temp)
    elif type(cat_independentVar_cols) == np.ndarray:
        if cat_independentVar_cols.ndim == 1:
            cat_independentVar_temp = pd.get_dummies(cat_independentVar_cols,
                                                     prefix='Cat_',
                                                     drop_first=True).to_dict(orient='list')
            cat_independentVar.update(cat_independentVar_temp)
        else:
            for col in range(cat_independentVar_cols.shape[1]):
                cat_independentVar_temp = pd.get_dummies(cat_independentVar_cols[:, col],
                                                         prefix='Cat_' + str(col) + '_',
                                                         drop_first=True).to_dict(orient='list')
                cat_independentVar.update(cat_independentVar_temp)

    if type(cont_independentVar_cols) == list:
        if not cont_independentVar_cols:
            pass
        elif type(cont_independentVar_cols[0]) == str:
            cont_independentVar_temp = np.asarray(
                new_df.loc[:, cont_independentVar_cols])
            if cont_independentVar_temp.ndim == 1:
                cont_independentVar_temp = cont_independentVar_temp.reshape(-1, 1)
            if scaling:
                cont_independentVar_temp = StandardScaler().fit_transform(cont_independentVar_temp)
            cont_independentVar_temp = pd.DataFrame(cont_independentVar_temp)
            cont_independentVar_temp.columns = cont_independentVar_cols
            cont_independentVar = cont_independentVar_temp.to_dict(
                orient='list')

        elif type(cont_independentVar_cols[0]) == np.ndarray:
            cont_independentVar_temp = np.hstack(cont_independentVar_cols)
            if cont_independentVar_temp.ndim == 1:
                cont_independentVar_temp = cont_independentVar_temp.reshape(-1, 1)
            if scaling:
                cont_independentVar_temp = StandardScaler().fit_transform(cont_independentVar_temp)
            cont_independentVar_temp = pd.DataFrame(cont_independentVar_temp)
            cont_independentVar_temp.columns = [
                'Cont_'+str(i) for i in range(cont_independentVar_temp.shape[1])]
            cont_independentVar = cont_independentVar_temp.to_dict(
                orient='list')

    elif type(cont_independentVar_cols) == np.ndarray:
        cont_independentVar_temp = cont_independentVar_cols
        if cont_independentVar_temp.ndim == 1:
            cont_independentVar_temp = cont_independentVar_temp.reshape(-1, 1)
        if scaling:
            cont_independentVar_temp = StandardScaler().fit_transform(cont_independentVar_temp)
        cont_independentVar_temp = pd.DataFrame(cont_independentVar_temp)
        cont_independentVar_temp.columns = [
            'Cont_'+str(i) for i in range(cont_independentVar_temp.shape[1])]
        cont_independentVar = cont_independentVar_temp.to_dict(orient='list')

    if type(dependentVar_cols) == list:

        if type(dependentVar_cols[0]) == str:
            dependentVar_temp = np.asarray(new_df.loc[:, dependentVar_cols])
            if dependentVar_temp.ndim == 1:
                dependentVar_temp = dependentVar_temp.reshape(-1, 1)
            if scaling:
                dependentVar_temp = StandardScaler().fit_transform(dependentVar_temp)
            dependentVar_temp = pd.DataFrame(dependentVar_temp)
            dependentVar_temp.columns = dependentVar_cols
            dependentVar = dependentVar_temp.to_dict(orient='list')
        elif type(dependentVar_cols[0]) == np.ndarray:
            dependentVar_temp = np.hstack(dependentVar_cols)
            if dependentVar_temp.ndim == 1:
                dependentVar_temp = dependentVar_temp.reshape(-1, 1)
            if scaling:
                dependentVar_temp = StandardScaler().fit_transform(dependentVar_temp)
            dependentVar_temp = pd.DataFrame(dependentVar_temp)
            dependentVar_temp.columns = [
                'Dependent_Var_'+str(i) for i in range(dependentVar_temp.shape[1])]
            dependentVar = dependentVar_temp.to_dict(orient='list')
    elif type(dependentVar_cols) == np.ndarray:
        dependentVar_temp = dependentVar_cols
        if dependentVar_temp.ndim == 1:
            dependentVar_temp = dependentVar_temp.reshape(-1, 1)
        if scaling:
            dependentVar_temp = StandardScaler().fit_transform(dependentVar_temp)
        dependentVar_temp = pd.DataFrame(dependentVar_temp)
        dependentVar_temp.columns = [
            'Dependent_Var_'+str(i) for i in range(dependentVar_temp.shape[1])]
        dependentVar = dependentVar_temp.to_dict(orient='list')

    independentVar.update(cont_independentVar)
    independentVar.update(cat_independentVar)
    if not independentVar:
        independentVar['const'] = [1. for i in range(
            len(list(dependentVar.values())[0]))]
        independentVar = pd.DataFrame(independentVar)
    else:
        independentVar = pd.DataFrame(independentVar)
        independentVar = sm.add_constant(independentVar)
    
    if col_to_drop:
        try:
            independentVar = independentVar.drop(col_to_drop,axis=1)
        except KeyError:
            print("Could not drop the requested column(s); note that categorical dummies are renamed inside this function (e.g. 'Gender_2.0'). Check the column names and try again.")
            raise KeyError()
    dependentVar = pd.DataFrame(dependentVar)

    for feature in dependentVar.columns:
        last_model = sm.OLS(dependentVar.loc[:, feature], independentVar).fit()
        result = [None] * (len(last_model.params) + len(last_model.pvalues))
        result[::2] = last_model.params
        result[1::2] = last_model.pvalues
        model_summary[feature].extend(result)

    model_summary = pd.DataFrame(model_summary).T
    list1 = independentVar.columns.to_list()
    list2 = ['_coef', '_pval']

    model_summary.columns = [i + n for i in list1 for n in list2]
    if additional_info:
        model_summary['Additional_info'] = additional_info
        model_summary.columns = [i + n for i in list1
                                 for n in list2] + ['Additional_info']

    return last_model, model_summary
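# Usage sketch (added, synthetic data): one continuous covariate, one
# categorical covariate and two dependent variables, fitted column by column
# with OLS as described in the docstring above. Assumes statsmodels (sm),
# pandas, numpy and StandardScaler are imported as in the function itself.
rng = np.random.RandomState(0)
demo_df = pd.DataFrame({
    'age': rng.normal(60, 8, 100),
    'sex': rng.choice(['F', 'M'], 100),
    'roi_1': rng.normal(0, 1, 100),
    'roi_2': rng.normal(0, 1, 100),
})
_, summary = mass_univariate(demo_df,
                             cat_independentVar_cols=['sex'],
                             cont_independentVar_cols=['age'],
                             dependentVar_cols=['roi_1', 'roi_2'])
print(summary)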
Example #17
def main():
    umap = np.loadtxt(snakemake.input.umap, delimiter='\t')

    chosen_k = int(snakemake.params.chosen_k)
    chosen_res = float(snakemake.params.chosen_res)

    if chosen_k == 0:
        raise ValueError('No chosen number of neighbours - k - for louvain')

    elif chosen_res == 0:
        raise ValueError('No chosen resolution for louvain')

    with open(snakemake.input.cluster_file, 'rb') as f:
        clustering = pickle.load(f)

    plot, no_clusters, no_singletons = plot_clusters_on_umap(umap,
                                                             clustering,
                                                             label=True)
    plot.set_title('k_%s_res_%s__no_clusters__%s__no_singletons__%s' %
                   (chosen_k, chosen_res, no_clusters, no_singletons),
                   fontsize=20)
    plt.savefig(snakemake.output.fig)

    table = pd.read_csv(snakemake.input.filtered, sep='\t')
    col_name = 'chosen_k_%s_res_%s' % (chosen_k, chosen_res)
    table[col_name] = clustering

    # just keep label id and clusters
    if 'label_id' in table.columns:
        table = table[['label_id', col_name]]
    elif 'unique_id' in table.columns:
        table = table[['unique_id', col_name]]
    table.to_csv(snakemake.output.merged_table, index=False, sep='\t')

    # make morph and viz tables
    viz_table = pd.read_csv(snakemake.input.viz_table, sep='\t')
    full_with_ids = pd.read_csv(snakemake.input.full_with_ids, sep='\t')
    merged_morph = pd.read_csv(snakemake.input.merged_morph, sep='\t')
    merged_morph = filter_texture_fails(merged_morph)

    if 'unique' not in snakemake.params.gene_assign:
        viz_table['clusters'] = clustering

        # make table with label id & clusters for all cells
        full_with_ids['clusters'] = clustering
        table = full_with_ids[['label_id', 'clusters']]

    else:
        # make table with label id & clusters for all cells
        cut = full_with_ids[['label_id', 'unique_id']]
        table = cut.join(table.set_index('unique_id'),
                         on='unique_id',
                         how='left')
        table = table[['label_id', col_name]]
        table.columns = ['label_id', 'clusters']

        viz_table = viz_table.join(table.set_index('label_id'),
                                   on='label_id',
                                   how='left')

    make_binary_columns(viz_table, 'clusters', snakemake.output.viz_table)
    morph_table = table.join(merged_morph.set_index('label_id'),
                             on='label_id',
                             how='inner')
    morph_table.to_csv(snakemake.output.morph_table, index=False, sep='\t')

    # and a normalised version
    just_morph = morph_table.drop(columns=['label_id', 'clusters'])
    col_names = just_morph.columns.tolist()
    just_morph = StandardScaler().fit_transform(just_morph)
    just_morph = pd.DataFrame(data=just_morph)
    just_morph.columns = col_names

    # can have issues with index matches producing nan values - reset indices here
    just_morph.reset_index(drop=True, inplace=True)
    morph_table.reset_index(drop=True, inplace=True)

    just_morph.insert(0, 'clusters', morph_table['clusters'])
    just_morph.insert(0, 'label_id', morph_table['label_id'])
    just_morph.to_csv(snakemake.output.morph_table_normalised,
                      index=False,
                      sep='\t')
Example #18

# In[10]:

##########################
### Data Preprocessing ###
##########################

# Normalizing data (just numeric columns)
train_std = StandardScaler().fit_transform(train[numeric_cols])
test_std = StandardScaler().fit_transform(test[numeric_cols[:-1]])

train_std = pandas.DataFrame(data=train_std)
test_std = pandas.DataFrame(data=test_std)

train_std.columns = numeric_cols
# Leave out label column for test data
test_std.columns = numeric_cols[:-1]


# In[ ]:

# SVD
u,s,v = np.linalg.svd(train_std.T)

print('SVD: ', u)


# In[ ]:

# Eigendecomposition
from sklearn.feature_selection import SelectFromModel
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
#bring in the categorical variables with their new numeric values and update the data
numData = pd.DataFrame(numD)
#Define a new dataframe so the original values stay in data and the transformed ones go into dataT
dataT = data.copy()
NCat = [
    'ID_USER', 'monto', 'hora', 'linea_tc', 'interes_tc', 'is_prime', 'dcto',
    'cashback'
]
dataNCat = data[NCat]
dataNCat = StandardScaler().fit_transform(dataNCat)
dataNCat = pd.DataFrame(dataNCat)
dataNCat.columns = NCat
for i in cat:
    dataT[i] = numData[i]
for i in NCat:
    dataT[i] = dataNCat[i]
#Now split the data
X = dataT.drop('fraude', axis=1)
y = dataT['fraude'].values

# Decision Tree model
tree_model = DecisionTreeClassifier(max_depth=2, criterion='entropy')

# 2. K-Nearest Neighbors

n = 5
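# (added) the original example is cut off here; a minimal sketch of the
# K-Nearest Neighbors model implied by the heading above, assuming scikit-learn
# (fit with knn_model.fit(X, y) once the features are fully numeric):
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=n)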
Example #20
    def loadRawData(datadir,
                    puckid,
                    num_nmf_factors=100,
                    prep_for_benchmarking=False):
        """
Load data for a particular puck, clean it up a bit and store it as AnnData. For later use, this also performs an NMF and stores the factors.
Borrows code from autoNMFreg_windows.py, provided with the Slide-Seq raw data.
        """
        from sklearn.preprocessing import StandardScaler

        puckdir = "{0}/Puck_{1}".format(datadir, puckid)
        beadmapdir = max(glob.glob("{0}/BeadMapping_*-*_????".format(puckdir)),
                         key=os.path.getctime)
        schema_debug("Flag 314.001 ", beadmapdir)

        # gene exp
        gexp_file = "{0}/MappedDGEForR.csv".format(beadmapdir)
        dge = fast_csv_read(gexp_file, header=0, index_col=0)
        #  for faster testing runs, use below, it has just the first 500 cols of the gexp_file
        ## dge = fast_csv_read("/tmp/a1_dge.csv", header = 0, index_col = 0)
        dge = dge.T
        dge = dge.reset_index()
        dge = dge.rename(columns={'index': 'barcode'})
        schema_debug("Flag 314.010 ", dge.shape, dge.columns)

        # spatial location
        beadloc_file = "{0}/BeadLocationsForR.csv".format(beadmapdir)
        coords = fast_csv_read(beadloc_file, header=0)
        coords = coords.rename(columns={'Barcodes': 'barcode'})
        coords = coords.rename(columns={'barcodes': 'barcode'})
        schema_debug("Flag 314.020 ", coords.shape, coords.columns)

        # Slide-Seq cluster assignments
        atlas_clusters_file = "{0}/AnalogizerClusterAssignments.csv".format(
            beadmapdir)
        clstrs = pd.read_csv(atlas_clusters_file, index_col=None)
        assert list(clstrs.columns) == ["Var1", "x"]
        clstrs.columns = ["barcode", "atlas_cluster"]
        clstrs = clstrs.set_index("barcode")
        schema_debug("Flag 314.030 ", clstrs.shape, clstrs.columns)

        df_merged = dge.merge(coords, right_on='barcode', left_on='barcode')
        df_merged = df_merged[df_merged.barcode.isin(clstrs.index)]
        schema_debug("Flag 314.040 ", df_merged.shape, df_merged.columns)

        # remove sparse gene exp
        counts = df_merged.drop(['xcoord', 'ycoord'], axis=1)
        counts2 = counts.copy(deep=True)
        counts2 = counts2.set_index('barcode')  #.drop('barcode',axis=1)
        counts2_okcols = counts2.sum(axis=0) > 0
        counts2 = counts2.loc[:, counts2_okcols]
        UMI_threshold = 5
        counts2_umis = counts2.sum(axis=1).values
        counts2 = counts2.loc[counts2_umis > UMI_threshold, :]
        schema_debug("Flag 314.0552 ", counts.shape, counts2.shape,
                     counts2_umis.shape, isinstance(counts2, pd.DataFrame))

        #slide-seq authors normalize to have sum=1 across each bead, rather than 1e6
        cval = counts2_umis[counts2_umis > UMI_threshold]
        if not prep_for_benchmarking:
            counts2 = counts2.divide(
                cval, axis=0)  #np.true_divide(counts2, counts2_umis[:,None])
            #counts2 = np.true_divide(counts2, counts2_umis[:,None])

            # this is also a little unusual, but I'm following their practice
            counts2.iloc[:, :] = StandardScaler(with_mean=False).fit_transform(
                counts2.values)
            schema_debug("Flag 314.0553 ", counts2.shape, counts2_umis.shape,
                         isinstance(counts2, pd.DataFrame))

        coords2 = df_merged.loc[df_merged.barcode.isin(counts2.index),
                                ["barcode", "xcoord", "ycoord"]].copy(
                                    deep=True)
        coords2 = coords2.set_index('barcode')  #.drop('barcode', axis=1)
        schema_debug("Flag 314.0555 ", coords2.shape,
                     isinstance(coords2, pd.DataFrame))

        ok_barcodes = set(coords2.index) & set(counts2.index) & set(
            clstrs.index)
        schema_debug("Flag 314.060 ", coords2.shape, counts2.shape,
                     clstrs.shape, len(ok_barcodes))

        if prep_for_benchmarking:
            return (counts2[counts2.index.isin(ok_barcodes)].sort_index(),
                    coords2[coords2.index.isin(ok_barcodes)].sort_index(),
                    clstrs[clstrs.index.isin(ok_barcodes)].sort_index())

        ## do NMF
        K1 = num_nmf_factors
        listK1 = ["P{}".format(i + 1) for i in range(K1)]
        random_state = 17  #for repeatability, a fixed value
        model1 = sklearn.decomposition.NMF(n_components=K1,
                                           init='random',
                                           random_state=random_state,
                                           alpha=0,
                                           l1_ratio=0)
        Ho = model1.fit_transform(
            counts2.values
        )  #yes, slideseq code had Ho and Wo mixed up. Just following their lead here.
        Wo = model1.components_

        schema_debug("Flag 314.070 ", Ho.shape, Wo.shape)

        Ho_norm = StandardScaler(with_mean=False).fit_transform(Ho)
        Ho_norm = pd.DataFrame(Ho_norm)
        Ho_norm.index = counts2.index
        Ho_norm.columns = listK1
        Wo = pd.DataFrame(Wo)
        Wo.index = listK1
        Wo.index.name = "Factor"
        Wo.columns = list(counts2.columns)

        Ho_norm = Ho_norm[Ho_norm.index.isin(ok_barcodes)]
        Ho_norm = Ho_norm / Ho_norm.std(axis=0)

        schema_debug("Flag 314.080 ", Ho_norm.shape, Wo.shape)

        genexp = counts2[counts2.index.isin(ok_barcodes)].sort_index()
        beadloc = coords2[coords2.index.isin(ok_barcodes)].sort_index()
        clstrs = clstrs[clstrs.index.isin(ok_barcodes)].sort_index()
        Ho_norm = Ho_norm.sort_index()

        schema_debug("Flag 314.090 ", genexp.shape, beadloc.shape,
                     clstrs.shape, Ho_norm.shape, genexp.index[:5],
                     beadloc.index[:5])

        beadloc["atlas_cluster"] = clstrs["atlas_cluster"]

        if "AnnData" not in dir():
            from anndata import AnnData

        adata = AnnData(X=genexp.values,
                        obs=beadloc,
                        uns={
                            "Ho": Ho_norm,
                            "Ho.index": list(Ho_norm.index),
                            "Ho.columns": list(Ho_norm.columns),
                            "Wo": Wo,
                            "Wo.index": list(Wo.index),
                            "Wo.columns": list(Wo.columns)
                        })
        return adata

"""---------------------------------------------------------------------------------------------------------------------------------------------------
Cluster deltaDF to identify trends and groups
"""
"""
Perform one hot encoding for categorical features
"""
EncodedDF = pd.get_dummies(deltaDF, columns=['Subject', 'Condition', 'Limb'])

#Normalize values in each column (except categorical features) to have mean of 0 and variance of 1-----------------------------------------------------
deltaDF_norm = StandardScaler().fit_transform(deltaDF.loc[:,:'Stride_Width_Mean'])
deltaDF_norm = pd.DataFrame(deltaDF_norm)

deltaDF_norm = pd.concat([deltaDF_norm, EncodedDF.loc[:, 'Subject_0':]], axis = 1)
deltaDF_norm.columns = EncodedDF.columns
#-------------------------------------------------------------------------------------------------------------------------------------------------------
"""
PCA
"""
pca = PCA(n_components=5)
pComponents = pca.fit_transform(Features)

#Plot explained variance
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_ratio_, color='black')
plt.xlabel('PCA features')
plt.ylabel('variance %')
plt.xticks(features)

PCA_components = pd.DataFrame(pComponents)
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering
from tabulate import tabulate

#bring in the categorical variables with their new numeric values and update the data
numData = pd.DataFrame(numD)
for i in cat:
    data[i] = numData[i]
#Compute the mean of each variable grouped by customer (ID_USER)
users = data.groupby(['ID_USER']).mean()
users = StandardScaler().fit_transform(users)
users = pd.DataFrame(users)
users.columns = [
    'genero', 'monto', 'fecha', 'hora', 'dispositivo', 'establecimiento',
    'ciudad', 'tipo_tc', 'linea_tc', 'interes_tc', 'status_txn', 'is_prime',
    'dcto', 'cashback', 'fraude'
]
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(users)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'])
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal component 1', fontsize=15)
ax.set_ylabel('Principal component 2', fontsize=15)
ax.set_title('Visualization of the users with a 2-component PCA',
             fontsize=20)
ax.scatter(principalDf['principal component 1'],
           principalDf['principal component 2'])
Example #23
#%%
#####################################################################################
# clustering
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
##############################
# Hierarchical clustering
# USArrests data
usarrest = pd.read_csv("./data/usarrest.csv")
Xname = ['Murder', 'Assault', 'UrbanPop', 'Rape']
Xdata = StandardScaler().fit_transform(usarrest[Xname])
Xdata = pd.DataFrame(Xdata)
Xdata.columns = Xname
Xdata.describe()
# Calculate the linkage: mergings
mergings = linkage(Xdata, method='average')
# Plot the dendrogram, using varieties as labels
plt.figure(figsize=(20, 10))
dendrogram(mergings,
           leaf_rotation=90,
           leaf_font_size=20,
           labels=usarrest['State'].values)
plt.show()
# Calculate means for each cluster
cluster = AgglomerativeClustering(n_clusters=5,
                                  affinity='euclidean',
                                  linkage='average')
group = cluster.fit_predict(Xdata)
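# (added) the per-cluster means promised by the comment above:
Xdata.groupby(group).mean()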
Example #24
t.check_question_two(solution_2_dict)

# `3.` Fit PCA to reduce the current dimensionality of the dataset to 3 dimensions.  You can use the helper functions, or perform the steps on your own.  If you fit on your own, be sure to standardize your data.  At the end of this process, you will want an X matrix with dimensionality reduced to only 3 features.  Additionally, you will want your **pca** object back that has been used to fit and transform your dataset.

# In[14]:

#Scale your data, fit, and transform using pca
df_ss = StandardScaler().fit_transform(df)

# In[15]:

#Create a dataframe
df_ss = pd.DataFrame(df_ss)
df_ss.columns = [
    'Sports', 'SUV', 'Wagon', 'Minivan', 'Pickup', 'AWD', 'RWD', 'Retail',
    'Dealer', 'Engine', 'Cylinders', 'Horsepower', 'CityMPG', 'HighwayMPG',
    'Weight', 'Wheelbase', 'Length', 'Width'
]

# In[16]:

#Check first few rows
df_ss.head()

# In[17]:

df_ss.describe()

# In[18]:

#Reduce features down to 3 principal components
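#(added sketch) the 3-component reduction described above, assuming sklearn's
#PCA is imported as in the earlier cells:
pca = PCA(n_components=3)
df_pca = pca.fit_transform(df_ss)
pca.explained_variance_ratio_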
Example #25
test = test.drop(cols, axis=1)

print("Train Rows: " + str(train.shape[0]) + ", Columns: " +
      str(train.shape[1]))
print("Test Rows: " + str(test.shape[0]) + ", Columns: " + str(test.shape[1]))

col_names = train.columns.values
# note: a separate scaler is fit on train and on test here; reusing the scaler
# fitted on train would keep both sets on the same scale
train_norm = StandardScaler().fit_transform(train.values)
test_norm = StandardScaler().fit_transform(test.values)

train_norm = pd.DataFrame(train_norm)
#train_norm = train_norm.add_prefix('feature_')
test_norm = pd.DataFrame(test_norm)
#test_norm = test_norm.add_prefix('feature_')

train_norm.columns = col_names
test_norm.columns = col_names
train_norm['target'] = target
train_norm['ID'] = train_id
test_norm['ID'] = test_id
train_norm.to_csv(path + 'removed_train.csv', index=False)
test_norm.to_csv(path + 'removed_test.csv', index=False)

ntrain = train.shape[0]
ntest = test.shape[0]

#print('final column list')
#print(list(train.columns.values))

print("Combine Train and Test")
df = pd.concat([train, test], axis=0)

# In[177]:


fraud_data.TransactionAmt.max()


# In[183]:


#Standardization / Normalization
from sklearn.preprocessing import StandardScaler
scaled_features = StandardScaler().fit_transform(X)
scaled_features = pd.DataFrame(data = scaled_features)
scaled_features.columns = X.columns
scaled_features.head()


# In[187]:


#Splitting the data 
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3, random_state = 42)

# X_train: independent feature data for training the model
# Y_train: dependent feature data for training the model
# X_test: independent feature data for testing the model; will be used to predict the target values
# Y_test: original target values of X_test; we will compare these values with our predicted values.
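# A minimal follow-up sketch (added, not the author's model): fit an arbitrary
# classifier on the split above and compare its predictions with Y_test, as the
# comments describe.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression(max_iter=1000).fit(X_train, Y_train)
print('accuracy on X_test:', accuracy_score(Y_test, clf.predict(X_test)))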