def load_feature_data():
    dataPath = '../data/TrainFeature-normalized image.csv'
    data = pd.read_csv(dataPath)
    originData = pd.DataFrame(data)

    # Keep only rows with a non-zero Gleason class and shift labels to start at 0.
    gleason_data = originData[originData['ClassifyValue'] != 0]
    label = gleason_data['ClassifyValue'] - 1
    label = label.reset_index()
    # print(label['ClassifyValue'].value_counts())

    # Use every feature column from the 9th column onward.
    selectedFeaturesList = []
    for colName in gleason_data.columns[8:]:
        # if 'T2' not in colName:
        selectedFeaturesList.append(colName)

    gleason_data = pd.DataFrame(gleason_data, columns=selectedFeaturesList)
    colNames = gleason_data.columns
    gleason_data = gleason_data.fillna(0)
    gleason_data = gleason_data.astype(np.float64)
    gleason_data = StandardScaler().fit_transform(gleason_data)
    gleason_data = pd.DataFrame(gleason_data)
    gleason_data.columns = colNames

    # Optional class balancing with SMOTE:
    # smo = SMOTE(random_state=2)
    # x_smote, y_smote = smo.fit_resample(gleason_data, label['ClassifyValue'])
    # input_features = x_smote.shape[1]
    input_features = gleason_data.shape[1]
    return gleason_data, label['ClassifyValue'], input_features
def feature_selection(df, target):
    # Mapping from ptype's inferred types to pandas dtypes (currently unused).
    convert_dct = {'integer': 'int64', 'string': 'object', 'float': 'float64',
                   'boolean': 'bool', 'date-iso-8601': 'datetime64[ns]',
                   'date-eu': 'datetime64[ns]', 'date-non-std-subtype': 'datetime64[ns]',
                   'date-non-std': 'datetime64[ns]', 'gender': 'category',
                   'all-identical': 'category'}

    # Infer column types with ptype and keep only the numeric columns.
    ptype = Ptype()
    ptype.run_inference(df)
    predicted = ptype.predicted_types

    features = []
    for key in predicted:
        if predicted[key] == 'int' or predicted[key] == 'float':
            features.append(key)

    x = df.loc[:, features].values
    x = StandardScaler().fit_transform(x)
    x = pd.DataFrame(x)
    x.columns = features

    X = x.drop(columns=[target])  # feature matrix
    y = x[target]                 # target variable

    # Search over the number of features kept by RFE and keep the count that
    # maximises the held-out R^2 of a linear model.
    nof_list = np.arange(1, len(features))
    high_score = 0   # best score seen so far
    nof = 0          # optimum number of features
    score_list = []
    for n in range(len(nof_list)):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=nof_list[n])
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if score > high_score:
            high_score = score
            nof = nof_list[n]
    # print("Optimum number of features: %d" % nof)
    # print("Score with %d features: %f" % (nof, high_score))

    # Refit RFE on the full data with the optimum number of features.
    cols = list(X.columns)
    model = LinearRegression()
    rfe = RFE(model, n_features_to_select=nof)
    X_rfe = rfe.fit_transform(X, y)
    model.fit(X_rfe, y)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp == True].index

    quality_measure = nof / len(features)
    return quality_measure
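# A minimal usage sketch for feature_selection (not from the original project):
# it assumes the ptype package is installed and that it labels the toy numeric
# columns below as 'int' or 'float'. The frame and column names are invented.
import numpy as np
import pandas as pd

_rng = np.random.RandomState(0)
_toy = pd.DataFrame({'x1': _rng.rand(60), 'x2': _rng.rand(60)})
_toy['y'] = 2.0 * _toy['x1'] - _toy['x2'] + 0.05 * _rng.rand(60)
# quality_measure is the fraction of numeric columns that RFE ended up keeping
print(feature_selection(_toy, 'y'))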
def TTest_Lasso(data):
    selectedFeaturesList = []
    # label column (original non-ASCII header)
    label = data['å čä¾µēÆ']
    print(label[label[0:] == 0].values.size, label[label[0:] == 1].values.size)

    colNames = data[data.columns[1:]].columns
    data = data[data.columns[1:]].fillna(0)
    data = data.astype(np.float64)
    data = StandardScaler().fit_transform(data)
    data = pd.DataFrame(data)
    data.columns = colNames
    data['label'] = label

    # Balance the classes with SMOTE.
    smo = SMOTE(random_state=2)
    X_smote, y_smote = smo.fit_resample(data, data['label'])
    print(X_smote)

    # Keep ADC features whose class means differ significantly: Student's t-test
    # when Levene's test accepts equal variances, Welch's t-test otherwise.
    for colName in X_smote.columns[0:-1]:
        if 'ADC' in colName:
            group0 = X_smote[X_smote['label'] == 0][colName]
            group1 = X_smote[X_smote['label'] == 1][colName]
            if levene(group0, group1)[1] > 0.05 and ttest_ind(group0, group1)[1] < 0.05:
                selectedFeaturesList.append(colName)
            elif levene(group0, group1)[1] <= 0.05 and \
                    ttest_ind(group0, group1, equal_var=False)[1] < 0.05:
                selectedFeaturesList.append(colName)

    if 'label' not in selectedFeaturesList:
        selectedFeaturesList = ['label'] + selectedFeaturesList

    data1 = X_smote[X_smote['label'] == 0][selectedFeaturesList]
    data2 = X_smote[X_smote['label'] == 1][selectedFeaturesList]
    trainData = pd.concat([data1, data2])
    trainData = shuffle(trainData)
    trainData.index = range(len(trainData))

    X = trainData[trainData.columns[1:]]
    y = trainData['label']

    # LASSO with cross-validated alpha; non-zero coefficients give the selected features.
    alphas = np.logspace(-3, 1, 50)
    model_lassoCV = LassoCV(alphas=alphas, cv=5, max_iter=3000).fit(X, y)
    print(model_lassoCV.alpha_)
    coef = pd.Series(model_lassoCV.coef_, index=X.columns)
    index = coef[coef != 0].index
    X = X[index]
    print(coef[coef != 0].sort_values(axis=0, ascending=False))

    # Optional bar plot of the selected coefficients:
    # featureNum = np.arange(len(index))
    # featureCoef = coef[coef != 0]
    # plt.bar(featureNum, featureCoef, color='lightblue', edgecolor='black', alpha=0.8)
    # plt.xticks(featureNum, index, rotation='45', ha='right', va='top')

    print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +
          str(sum(coef == 0)) + " variables")
    return X, y
def prepare_data(filename):
    data = pd.read_csv(filename)
    print(f'File {filename}\nNumber of exceptions:\n{data["class"].value_counts()}')
    # Standardize, normalize, and project onto the first two principal components.
    data = StandardScaler().fit_transform(data)
    data = normalize(data)
    data = PCA(n_components=2).fit_transform(data)
    data = pd.DataFrame(data)
    data.columns = ['P1', 'P2']
    return data
def scale_features(df):
    '''
    Scales the dataframe features with StandardScaler.
    '''
    df_clms = df.columns
    df_scaled = StandardScaler().fit_transform(df)
    df_scaled = pd.DataFrame(df_scaled)
    df_scaled.columns = df_clms
    return df_scaled
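# A small, self-contained sketch of calling scale_features; the toy column
# names below are illustrative assumptions, not from the source data.
import pandas as pd

_demo = pd.DataFrame({'height_cm': [160.0, 172.0, 185.0],
                      'weight_kg': [55.0, 70.0, 92.0]})
_demo_scaled = scale_features(_demo)
# columns keep their names; each now has mean ~0 and unit (population) variance
print(_demo_scaled.mean().round(3))
print(_demo_scaled.std(ddof=0).round(3))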
def select_clustering_columns(self, col, PCA=False, n_component=0.80):
    """Select the columns that will be used in the clustering."""
    if PCA:
        self.clustered_df = self.PCA(n_component, col)
    else:
        df = self.df[col]
        df = StandardScaler().fit_transform(df)  # standardize the features
        df = pd.DataFrame(df)
        df.columns = col
        self.clustered_df = df
def prepare_data(filename):
    data = pd.read_csv(filename)
    if filename == 'online_shoppers_intention.csv':
        data = pd.get_dummies(data, columns=['Month', 'VisitorType'])
    data.fillna(0, inplace=True)
    # Standardize, normalize, and project onto the first two principal components.
    data = StandardScaler().fit_transform(data)
    data = normalize(data)
    data = PCA(n_components=2).fit_transform(data)
    data = pd.DataFrame(data)
    data.columns = ['P1', 'P2']
    return data[:5000]
def prepare_input_data(input_data, fill_blanks=True, strategy="median",
                       standardize=True, pred=False):
    """
    Args:
        input_data: raw input passed through to get_data().
        fill_blanks: if True, impute missing cells; if False, drop incomplete rows.
        strategy: imputation strategy used when filling blanks (e.g. "median").
        standardize: if True, scale the features with StandardScaler.
        pred: if False, rows with missing values in the merged result are dropped.

    Returns:
        DataFrame with ID, True_Label, the (optionally imputed and scaled)
        features, and any predicted-label / flag columns.
    """
    data = get_data(input_data=input_data)

    df = data[data.columns.drop(list(data.filter(regex='pred|flag')))]
    predicted_labels = data.filter(regex='pred|flag')

    cols = df.columns
    columns_full = list(cols)
    columns_without_ids = list(columns_full)
    columns_without_ids.remove('ID')
    columns_without_ids.remove('True_Label')

    info = data[['ID', 'True_Label']]

    data_without_blanks = pd.DataFrame()
    if fill_blanks is True:
        data_without_blanks = filling_blank_cells(df.drop(['ID', 'True_Label'], axis=1),
                                                  strategy=strategy)
    if fill_blanks is False:
        data_without_blanks = df.drop(['ID', 'True_Label'], axis=1).dropna()

    if standardize is True:
        data_without_blanks = StandardScaler().fit_transform(data_without_blanks)

    data_without_blanks = pd.DataFrame(data_without_blanks)
    data_without_blanks.columns = columns_without_ids

    complete_data = merge_dataframe(info, data_without_blanks)
    complete_data = complete_data.join(predicted_labels, how='left')
    if pred is False:
        complete_data = complete_data.dropna()
    return complete_data
def Pearson_RFE_selector(originData):
    label = originData['label']
    print(label[label[0:] == 0].values.size, label[label[0:] == 1].values.size)

    colNames = originData[originData.columns[2:-1]].columns
    data = originData[originData.columns[2:-1]].fillna(0)
    data = data.astype(np.float64)
    data = StandardScaler().fit_transform(data)
    data = pd.DataFrame(data)
    data.columns = colNames
    data['label'] = label

    # Balance the classes with SMOTE.
    smo = SMOTE(random_state=3)
    X_smote, y_smote = smo.fit_resample(data[data.columns[0:-1]], data['label'])

    # Optional Pearson-correlation pre-filter (kept here for reference):
    # features_list = [colName for colName in X_smote.columns if 'DWI' in colName]
    # X_smote = X_smote[features_list]
    # pearson_corr = X_smote.corr(method='pearson')
    # mean = pearson_corr['label'].mean()
    # pearson_features_list = list(pearson_corr[(abs(pearson_corr['label']) > mean) &
    #                                           (abs(pearson_corr['label']) < 1)].index)
    # X_pearson = X_smote[pearson_features_list]
    # y_pearson = X_smote['label']

    # Recursive feature elimination with cross-validation around a linear model.
    lr = LinearRegression()
    rfe = RFECV(lr, step=1, cv=5)
    rfe.fit(X_smote, y_smote)
    X_rfe = X_smote.loc[:, rfe.support_]

    names = X_rfe.columns
    ranks = rfe.ranking_
    feature_indexes = []
    for i in range(len(ranks)):
        if ranks[i] == 1:
            feature_indexes += [i]
    print(len(feature_indexes))
    print(names)
    # print(sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names)))
    return X_rfe, y_smote
def calculate_all_neighbours(morph_stats_cut, all_by_all_nucleus):
    """Run all calculations for neighbours based on the provided morphology table,
    and the validation / xyz criteria."""
    normed_morph = StandardScaler().fit_transform(morph_stats_cut)
    normed_morph = pd.DataFrame(normed_morph)
    normed_morph.columns = morph_stats_cut.columns

    # All-by-all neighbours: one row per cell in normed_morph; columns give the
    # index of each neighbour (1st column == 1st neighbour, 2nd column == 2nd
    # neighbour, and so on...).
    nn = neighbours(normed_morph)

    # Result using the nuclei xyz position criteria - a list with one entry per row
    # of the morphology table; each entry is the number of neighbours away at which
    # the first cell meeting the xyz criteria occurs. Cells where no cells meet the
    # xyz criteria have values of -1.
    result_nuc_criteria = get_nearest_index(nn, all_by_all_nucleus)

    # Check - each row of nn should contain all possible indices (0 - nrow of nn)
    # apart from its own index.
    unique_vals = np.array(range(0, nn.shape[0]))
    for i in range(0, nn.shape[0]):
        row = nn[i, :]
        test = np.sort(unique_vals[unique_vals != i])
        row_vals = np.sort(np.unique(row))
        if not (test == row_vals).all():
            print(i)
            print('row does not contain all values apart from its own index')

    # Calculate randomised results: shuffle each row of the neighbour matrix and re-run.
    nuc_results_random = []
    for i in range(0, 100):
        print(i)
        nn_shuffle = nn.copy()
        for j in range(0, nn.shape[0]):
            np.random.shuffle(nn_shuffle[j, :])
        nuc_results_random.append(get_nearest_index(nn_shuffle, all_by_all_nucleus))

    # One row per iteration, number of columns == number of rows in the morphology
    # table. Each entry is the number of neighbours away at which the first cell
    # meeting the xyz criteria occurs.
    nuc_results_random = pd.DataFrame.from_records(nuc_results_random)

    return result_nuc_criteria, nuc_results_random
def PCA(self, n_component, columns):
    """
    n_component: number of components for the PCA.
    columns: columns used to run the PCA.

    Transforms the data into principal components and returns a dataframe
    describing the microtrips by their new n_component components.
    """
    df = self.df[columns]  # select only the columns required for PCA
    df = StandardScaler().fit_transform(df)  # standardize the features
    pca = PCA(n_components=n_component)
    components = pca.fit_transform(df)
    component_names = ['Component' + str(i) for i in range(components.shape[1])]
    df = pd.DataFrame(components)
    df.columns = component_names
    return df
def feature_scaling(df, type_scale):
    '''
    This function takes in either the azdias or the customers dataframe and
    applies the selected feature scaler.

    Args: customer or azdias dataframe and a string naming the type of scaling intended
    Returns: scaled dataframe
    '''
    features_list = df.columns
    if type_scale == 'StandardScaler':
        df_scaled = StandardScaler().fit_transform(df)
    if type_scale == 'MinMaxScaler':
        df_scaled = MinMaxScaler().fit_transform(df)
    df_scaled = pd.DataFrame(df_scaled)
    df_scaled.columns = features_list
    return df_scaled
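# Hedged usage sketch for feature_scaling; 'azdias_clean' is a hypothetical,
# already-cleaned all-numeric DataFrame standing in for the azdias/customers data.
import pandas as pd

azdias_clean = pd.DataFrame({'f1': [1.0, 2.0, 3.0], 'f2': [10.0, 20.0, 40.0]})
azdias_std = feature_scaling(azdias_clean, 'StandardScaler')   # zero mean, unit variance
azdias_minmax = feature_scaling(azdias_clean, 'MinMaxScaler')  # rescaled to [0, 1]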
def apply_pca(self, df, num_pcs, use_whiten=False):
    """
    Apply Principal Components Analysis to a dataframe.

    Parameters:
    - df, dataframe to apply PCA to
    - num_pcs, int, the number of principal components to use
    - use_whiten, boolean, flag to indicate whether the data should be whitened

    Return: pandas dataframe containing the transformed data.
    """
    # ensure data is scaled appropriately
    subreddits = df.index
    df = StandardScaler().fit_transform(df)
    # create the PCA object with the desired number of principal components
    pca = PCA(n_components=num_pcs, whiten=use_whiten)
    # return the PCA'd data
    df = pd.DataFrame(pca.fit_transform(df))
    df.columns = ["PC_" + str(col) for col in df.columns.values]
    return df.set_index(subreddits)
data_train_ = data_train.copy()

# Convert the registration timestamp to day-of-year.
dt1 = pd.to_datetime(data_train["register_time"])
data_train["register_time"] = dt1.dt.dayofyear
# data_train = data_train.drop("user_id", axis=1)

# Regression target: the gap between predicted and already-paid amounts,
# moved to the last column.
data_train['cha'] = data_train['prediction_pay_price'] - data_train['pay_price']
data_train = data_train.drop(["lable", 'prediction_pay_price'], axis=1)
data_train['cha_2'] = data_train['cha']
del data_train['cha']
data_train.rename(columns={'cha_2': 'cha'}, inplace=True)

x_, y = (data_train.iloc[:, 0:len(data_train.columns) - 1],
         data_train.iloc[:, len(data_train.columns) - 1:])
x = StandardScaler().fit_transform(x_)
x = pd.DataFrame(x)
x.columns = x_.columns

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.5)

xgb = XGBRegressor(n_estimators=400, learning_rate=0.01, max_depth=4, random_state=1)
xgb.fit(x_train, y_train)
y_train_pred = xgb.predict(x_train)
y_test_pred = xgb.predict(x_test)
y_pred = xgb.predict(x)
print('xgb:', mean_squared_error(y_train, y_train_pred))
def TTest_mRMR_svmRFE_selector(originData):
    selectedFeaturesList = []
    label = originData['label']

    colNames = originData[originData.columns[2:8]].columns
    data = originData[originData.columns[2:8]].fillna(0)
    data = data.astype(np.float64)
    data = StandardScaler().fit_transform(data)
    # minmax_scale = preprocessing.MinMaxScaler().fit(data)
    # data = minmax_scale.transform(data)
    data = pd.DataFrame(data)
    data.columns = colNames
    data['label'] = label

    # Balance the classes with SMOTE.
    smo = SMOTE(random_state=3)
    X_smote, y_smote = smo.fit_resample(data, data['label'])

    # T-test filter: Student's t-test when Levene's test accepts equal variances,
    # Welch's t-test otherwise.
    for colName in X_smote.columns[0:-1]:
        # if 'DWI' in colName:
        group0 = X_smote[X_smote['label'] == 0][colName]
        group1 = X_smote[X_smote['label'] == 1][colName]
        if levene(group0, group1)[1] > 0.05 and ttest_ind(group0, group1)[1] < 0.05:
            selectedFeaturesList.append(colName)
        elif levene(group0, group1)[1] <= 0.05 and \
                ttest_ind(group0, group1, equal_var=False)[1] < 0.05:
            selectedFeaturesList.append(colName)

    if 'label' not in selectedFeaturesList:
        selectedFeaturesList = ['label'] + selectedFeaturesList

    data1 = X_smote[X_smote['label'] == 0][selectedFeaturesList]
    data2 = X_smote[X_smote['label'] == 1][selectedFeaturesList]
    trainData = pd.concat([data1, data2])
    # trainData = shuffle(trainData)
    # trainData.index = range(len(trainData))  # re-number the rows after shuffling

    X = trainData[trainData.columns[1:]]
    y = trainData['label']

    # mRMR-style feature selection via joint mutual information maximisation (JMIM).
    # mRMR_features = pymrmr.mRMR(X_smote, 'MIQ', 15)
    feat_selector = mifs.MutualInformationFeatureSelector(method='JMIM')
    feat_selector.fit(X, y)
    # X_filtered = feat_selector.transform(X_smote)
    # if 'label' not in mRMR_features: mRMR_features = ['label'] + mRMR_features
    X_mRMR = X.loc[:, feat_selector._support_mask]
    colNames = X_mRMR.columns

    # SVM-RFE on the features kept by the mutual-information step.
    clf = LinearSVC()
    model = RFE(clf, n_features_to_select=len(feat_selector.ranking_))
    model.fit(X_mRMR, y)
    feats = list(np.array(colNames)[model.support_])
    for featureNames in feats:
        print(featureNames)
    print(len(feats))
    X_RFE = X_mRMR[feats]
    return X_RFE, y
def mass_univariate(df: Optional[pd.DataFrame] = None,
                    cat_independentVar_cols: Optional[Union[List[str], List[np.ndarray]]] = None,
                    cont_independentVar_cols: Optional[Union[List[str], List[np.ndarray]]] = None,
                    dependentVar_cols: Optional[Union[List[str], List[np.ndarray]]] = None,
                    scaling: bool = True,
                    col_to_drop: Optional[List[str]] = None,
                    additional_info=None) -> Union[sm.regression.linear_model.RegressionResultsWrapper, pd.DataFrame]:
    """Returns the model and model summary.

    Performs a mass-univariate test, implemented with statsmodels.api.OLS.

    Args:
        df (Optional[pd.DataFrame]): pandas data frame, where each row is one observation.
        cat_independentVar_cols: list of categorical variables; they will be converted to dummies by pandas. Defaults to None.
        cont_independentVar_cols: list of continuous variables; they will be appended to cat_independentVar_cols. Defaults to None.
        dependentVar_cols: the dependent variable(s). Defaults to None.
        scaling (bool): Default = True. If False, StandardScaler is not applied.
        col_to_drop (Optional[List[str]]): columns to remove from the independent variables before fitting the model.
        additional_info: additional columns to be appended. Defaults to None.

    Returns:
        Union[sm.regression.linear_model.RegressionResultsWrapper, pd.DataFrame]:
            the statsmodels model and a dataframe of the model summary, where the
            columns are the independent variables and the constant.
    """
    model_summary = defaultdict(list)
    if type(df) == pd.DataFrame:
        new_df = df.copy()

    cat_independentVar = defaultdict(list)
    cont_independentVar = defaultdict(list)
    dependentVar = defaultdict(list)
    independentVar = dict()

    # --- categorical independent variables -> dummy columns ---
    if type(cat_independentVar_cols) == list:
        for idx, cat_independentVar_col in enumerate(cat_independentVar_cols):
            if type(cat_independentVar_col) == str:
                cat_independentVar_temp = pd.get_dummies(new_df[cat_independentVar_col].values,
                                                         prefix=cat_independentVar_col,
                                                         drop_first=True).to_dict(orient='list')
                cat_independentVar.update(cat_independentVar_temp)
            if type(cat_independentVar_col) == np.ndarray:
                cat_independentVar_temp = pd.get_dummies(cat_independentVar_col,
                                                         prefix='Cat_' + str(idx),
                                                         drop_first=True).to_dict(orient='list')
                cat_independentVar.update(cat_independentVar_temp)
    elif type(cat_independentVar_cols) == np.ndarray:
        if cat_independentVar_cols.ndim == 1:
            cat_independentVar_temp = pd.get_dummies(cat_independentVar_cols,
                                                     prefix='Cat_',
                                                     drop_first=True).to_dict(orient='list')
            cat_independentVar.update(cat_independentVar_temp)
        else:
            for col in range(cat_independentVar_cols.shape[1]):
                cat_independentVar_temp = pd.get_dummies(cat_independentVar_cols[:, col],
                                                         prefix='Cat_' + str(col) + '_',
                                                         drop_first=True).to_dict(orient='list')
                cat_independentVar.update(cat_independentVar_temp)

    # --- continuous independent variables (optionally scaled) ---
    if type(cont_independentVar_cols) == list:
        if not cont_independentVar_cols:
            pass
        elif type(cont_independentVar_cols[0]) == str:
            cont_independentVar_temp = np.asarray(new_df.loc[:, cont_independentVar_cols])
            if cont_independentVar_temp.ndim == 1:
                cont_independentVar_temp = cont_independentVar_temp.reshape(-1, 1)
            if scaling:
                cont_independentVar_temp = StandardScaler().fit_transform(cont_independentVar_temp)
            cont_independentVar_temp = pd.DataFrame(cont_independentVar_temp)
            cont_independentVar_temp.columns = cont_independentVar_cols
            cont_independentVar = cont_independentVar_temp.to_dict(orient='list')
        elif type(cont_independentVar_cols[0]) == np.ndarray:
            cont_independentVar_temp = np.hstack(cont_independentVar_cols)
            if cont_independentVar_temp.ndim == 1:
                cont_independentVar_temp = cont_independentVar_temp.reshape(-1, 1)
            if scaling:
                cont_independentVar_temp = StandardScaler().fit_transform(cont_independentVar_temp)
            cont_independentVar_temp = pd.DataFrame(cont_independentVar_temp)
            cont_independentVar_temp.columns = [
                'Cont_' + str(i) for i in range(cont_independentVar_temp.shape[1])]
            cont_independentVar = cont_independentVar_temp.to_dict(orient='list')
    elif type(cont_independentVar_cols) == np.ndarray:
        cont_independentVar_temp = cont_independentVar_cols
        if cont_independentVar_temp.ndim == 1:
            cont_independentVar_temp = cont_independentVar_temp.reshape(-1, 1)
        if scaling:
            cont_independentVar_temp = StandardScaler().fit_transform(cont_independentVar_temp)
        cont_independentVar_temp = pd.DataFrame(cont_independentVar_temp)
        cont_independentVar_temp.columns = [
            'Cont_' + str(i) for i in range(cont_independentVar_temp.shape[1])]
        cont_independentVar = cont_independentVar_temp.to_dict(orient='list')

    # --- dependent variables (optionally scaled) ---
    if type(dependentVar_cols) == list:
        if type(dependentVar_cols[0]) == str:
            dependentVar_temp = np.asarray(new_df.loc[:, dependentVar_cols])
            if dependentVar_temp.ndim == 1:
                dependentVar_temp = dependentVar_temp.reshape(-1, 1)
            if scaling:
                dependentVar_temp = StandardScaler().fit_transform(dependentVar_temp)
            dependentVar_temp = pd.DataFrame(dependentVar_temp)
            dependentVar_temp.columns = dependentVar_cols
            dependentVar = dependentVar_temp.to_dict(orient='list')
        elif type(dependentVar_cols[0]) == np.ndarray:
            dependentVar_temp = np.hstack(dependentVar_cols)
            if dependentVar_temp.ndim == 1:
                dependentVar_temp = dependentVar_temp.reshape(-1, 1)
            if scaling:
                dependentVar_temp = StandardScaler().fit_transform(dependentVar_temp)
            dependentVar_temp = pd.DataFrame(dependentVar_temp)
            dependentVar_temp.columns = [
                'Dependent_Var_' + str(i) for i in range(dependentVar_temp.shape[1])]
            dependentVar = dependentVar_temp.to_dict(orient='list')
    elif type(dependentVar_cols) == np.ndarray:
        dependentVar_temp = dependentVar_cols
        if dependentVar_temp.ndim == 1:
            dependentVar_temp = dependentVar_temp.reshape(-1, 1)
        if scaling:
            dependentVar_temp = StandardScaler().fit_transform(dependentVar_temp)
        dependentVar_temp = pd.DataFrame(dependentVar_temp)
        dependentVar_temp.columns = [
            'Dependent_Var_' + str(i) for i in range(dependentVar_temp.shape[1])]
        dependentVar = dependentVar_temp.to_dict(orient='list')

    # --- assemble the design matrix ---
    independentVar.update(cont_independentVar)
    independentVar.update(cat_independentVar)
    if not independentVar:
        # intercept-only model
        independentVar['const'] = [1. for i in range(len(list(dependentVar.values())[0]))]
        independentVar = pd.DataFrame(independentVar)
    else:
        independentVar = pd.DataFrame(independentVar)
        independentVar = sm.add_constant(independentVar)

    if col_to_drop:
        try:
            independentVar = independentVar.drop(col_to_drop, axis=1)
        except KeyError:
            print('You might be trying to remove a categorical column, which in this '
                  'function is written as e.g. Gender_2.0. Try again.')
            raise KeyError()

    # --- fit one OLS model per dependent variable ---
    dependentVar = pd.DataFrame(dependentVar)
    for feature in dependentVar.columns:
        last_model = sm.OLS(dependentVar.loc[:, feature], independentVar).fit()
        # interleave coefficients and p-values: coef, pval, coef, pval, ...
        result = [None] * (len(last_model.params) + len(last_model.pvalues))
        result[::2] = last_model.params
        result[1::2] = last_model.pvalues
        model_summary[feature].extend(result)

    model_summary = pd.DataFrame(model_summary).T
    list1 = independentVar.columns.to_list()
    list2 = ['_coef', '_pval']
    model_summary.columns = [i + n for i in list1 for n in list2]
    if additional_info:
        model_summary['Additional_info'] = additional_info
        model_summary.columns = [i + n for i in list1 for n in list2] + ['Additional_info']

    return last_model, model_summary
def main():
    umap = np.loadtxt(snakemake.input.umap, delimiter='\t')
    chosen_k = int(snakemake.params.chosen_k)
    chosen_res = float(snakemake.params.chosen_res)

    if chosen_k == 0:
        raise ValueError('No chosen number of neighbours - k - for louvain')
    elif chosen_res == 0:
        raise ValueError('No chosen resolution for louvain')

    with open(snakemake.input.cluster_file, 'rb') as f:
        clustering = pickle.load(f)

    plot, no_clusters, no_singletons = plot_clusters_on_umap(umap, clustering, label=True)
    plot.set_title('k_%s_res_%s__no_clusters__%s__no_singletons__%s' %
                   (chosen_k, chosen_res, no_clusters, no_singletons), fontsize=20)
    plt.savefig(snakemake.output.fig)

    table = pd.read_csv(snakemake.input.filtered, sep='\t')
    col_name = 'chosen_k_%s_res_%s' % (chosen_k, chosen_res)
    table[col_name] = clustering

    # just keep label id and clusters
    if 'label_id' in table.columns:
        table = table[['label_id', col_name]]
    elif 'unique_id' in table.columns:
        table = table[['unique_id', col_name]]
    table.to_csv(snakemake.output.merged_table, index=False, sep='\t')

    # make morph and viz tables
    viz_table = pd.read_csv(snakemake.input.viz_table, sep='\t')
    full_with_ids = pd.read_csv(snakemake.input.full_with_ids, sep='\t')
    merged_morph = pd.read_csv(snakemake.input.merged_morph, sep='\t')
    merged_morph = filter_texture_fails(merged_morph)

    if 'unique' not in snakemake.params.gene_assign:
        viz_table['clusters'] = clustering
        # make table with label id & clusters for all cells
        full_with_ids['clusters'] = clustering
        table = full_with_ids[['label_id', 'clusters']]
    else:
        # make table with label id & clusters for all cells
        cut = full_with_ids[['label_id', 'unique_id']]
        table = cut.join(table.set_index('unique_id'), on='unique_id', how='left')
        table = table[['label_id', col_name]]
        table.columns = ['label_id', 'clusters']
        viz_table = viz_table.join(table.set_index('label_id'), on='label_id', how='left')

    make_binary_columns(viz_table, 'clusters', snakemake.output.viz_table)

    morph_table = table.join(merged_morph.set_index('label_id'), on='label_id', how='inner')
    morph_table.to_csv(snakemake.output.morph_table, index=False, sep='\t')

    # and a normalised version
    just_morph = morph_table.drop(columns=['label_id', 'clusters'])
    col_names = just_morph.columns.tolist()
    just_morph = StandardScaler().fit_transform(just_morph)
    just_morph = pd.DataFrame(data=just_morph)
    just_morph.columns = col_names

    # can have issues with index matches producing nan values - reset indices here
    just_morph.reset_index(drop=True, inplace=True)
    morph_table.reset_index(drop=True, inplace=True)
    just_morph.insert(0, 'clusters', morph_table['clusters'])
    just_morph.insert(0, 'label_id', morph_table['label_id'])
    just_morph.to_csv(snakemake.output.morph_table_normalised, index=False, sep='\t')
# In[10]:


##########################
### Data Preprocessing ###
##########################

# Normalizing data (just numeric columns)
train_std = StandardScaler().fit_transform(train[numeric_cols])
test_std = StandardScaler().fit_transform(test[numeric_cols[:-1]])
train_std = pandas.DataFrame(data=train_std[0:, 0:])
test_std = pandas.DataFrame(data=test_std[0:, 0:])
train_std.columns = numeric_cols
# Leave out the label column for the test data
test_std.columns = numeric_cols[:-1]


# In[ ]:


# SVD
u, s, v = np.linalg.svd(train_std.T)
print('SVD: ', u)


# In[ ]:


# Eigendecomposition
from sklearn.feature_selection import SelectFromModel
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Bring in the categorical variables with their new numeric values and update the data.
numData = pd.DataFrame(numD)

# Define a new dataframe so the original values stay in `data` and the
# transformed ones go into `dataT`.
dataT = data.copy()

# Non-categorical columns, standardized with StandardScaler.
NCat = [
    'ID_USER', 'monto', 'hora', 'linea_tc', 'interes_tc', 'is_prime', 'dcto',
    'cashback'
]
dataNCat = data[NCat]
dataNCat = StandardScaler().fit_transform(dataNCat)
dataNCat = pd.DataFrame(dataNCat)
dataNCat.columns = NCat

for i in cat:
    dataT[i] = numData[i]
for i in NCat:
    dataT[i] = dataNCat[i]

# Now split the data into features and target.
X = dataT.drop('fraude', axis=1)
y = dataT['fraude'].values

# Decision Tree model
tree_model = DecisionTreeClassifier(max_depth=2, criterion='entropy')

# 2. K-Nearest Neighbors
n = 5
def loadRawData(datadir, puckid, num_nmf_factors=100, prep_for_benchmarking=False):
    """
    Load data for a particular puck, clean it up a bit and store as AnnData.
    For later use, also performs a NMF and stores those.
    Borrows code from autoNMFreg_windows.py, provided with the Slide-Seq raw data.
    """
    from sklearn.preprocessing import StandardScaler

    puckdir = "{0}/Puck_{1}".format(datadir, puckid)
    beadmapdir = max(glob.glob("{0}/BeadMapping_*-*_????".format(puckdir)),
                     key=os.path.getctime)
    schema_debug("Flag 314.001 ", beadmapdir)

    # gene expression
    gexp_file = "{0}/MappedDGEForR.csv".format(beadmapdir)
    dge = fast_csv_read(gexp_file, header=0, index_col=0)
    # for faster testing runs, use the line below; it has just the first 500 cols of gexp_file
    ## dge = fast_csv_read("/tmp/a1_dge.csv", header=0, index_col=0)
    dge = dge.T
    dge = dge.reset_index()
    dge = dge.rename(columns={'index': 'barcode'})
    schema_debug("Flag 314.010 ", dge.shape, dge.columns)

    # spatial location
    beadloc_file = "{0}/BeadLocationsForR.csv".format(beadmapdir)
    coords = fast_csv_read(beadloc_file, header=0)
    coords = coords.rename(columns={'Barcodes': 'barcode'})
    coords = coords.rename(columns={'barcodes': 'barcode'})
    schema_debug("Flag 314.020 ", coords.shape, coords.columns)

    # Slide-Seq cluster assignments
    atlas_clusters_file = "{0}/AnalogizerClusterAssignments.csv".format(beadmapdir)
    clstrs = pd.read_csv(atlas_clusters_file, index_col=None)
    assert list(clstrs.columns) == ["Var1", "x"]
    clstrs.columns = ["barcode", "atlas_cluster"]
    clstrs = clstrs.set_index("barcode")
    schema_debug("Flag 314.030 ", clstrs.shape, clstrs.columns)

    df_merged = dge.merge(coords, right_on='barcode', left_on='barcode')
    df_merged = df_merged[df_merged.barcode.isin(clstrs.index)]
    schema_debug("Flag 314.040 ", df_merged.shape, df_merged.columns)

    # remove sparse gene expression counts
    counts = df_merged.drop(['xcoord', 'ycoord'], axis=1)
    counts2 = counts.copy(deep=True)
    counts2 = counts2.set_index('barcode')
    counts2_okcols = counts2.sum(axis=0) > 0
    counts2 = counts2.loc[:, counts2_okcols]

    UMI_threshold = 5
    counts2_umis = counts2.sum(axis=1).values
    counts2 = counts2.loc[counts2_umis > UMI_threshold, :]
    schema_debug("Flag 314.0552 ", counts.shape, counts2.shape, counts2_umis.shape,
                 isinstance(counts2, pd.DataFrame))

    # Slide-Seq authors normalize to have sum=1 across each bead, rather than 1e6.
    cval = counts2_umis[counts2_umis > UMI_threshold]
    if not prep_for_benchmarking:
        counts2 = counts2.divide(cval, axis=0)
        # this is also a little unusual, but I'm following their practice
        counts2.iloc[:, :] = StandardScaler(with_mean=False).fit_transform(counts2.values)
    schema_debug("Flag 314.0553 ", counts2.shape, counts2_umis.shape,
                 isinstance(counts2, pd.DataFrame))

    coords2 = df_merged.loc[df_merged.barcode.isin(counts2.index),
                            ["barcode", "xcoord", "ycoord"]].copy(deep=True)
    coords2 = coords2.set_index('barcode')
    schema_debug("Flag 314.0555 ", coords2.shape, isinstance(coords2, pd.DataFrame))

    ok_barcodes = set(coords2.index) & set(counts2.index) & set(clstrs.index)
    schema_debug("Flag 314.060 ", coords2.shape, counts2.shape, clstrs.shape, len(ok_barcodes))

    if prep_for_benchmarking:
        return (counts2[counts2.index.isin(ok_barcodes)].sort_index(),
                coords2[coords2.index.isin(ok_barcodes)].sort_index(),
                clstrs[clstrs.index.isin(ok_barcodes)].sort_index())

    # do NMF
    K1 = num_nmf_factors
    listK1 = ["P{}".format(i + 1) for i in range(K1)]
    random_state = 17  # fixed value for repeatability
    model1 = sklearn.decomposition.NMF(n_components=K1, init='random',
                                       random_state=random_state, alpha=0, l1_ratio=0)
    # yes, the Slide-Seq code had Ho and Wo mixed up; just following their lead here
    Ho = model1.fit_transform(counts2.values)
    Wo = model1.components_
    schema_debug("Flag 314.070 ", Ho.shape, Wo.shape)

    Ho_norm = StandardScaler(with_mean=False).fit_transform(Ho)
    Ho_norm = pd.DataFrame(Ho_norm)
    Ho_norm.index = counts2.index
    Ho_norm.columns = listK1
    Wo = pd.DataFrame(Wo)
    Wo.index = listK1
    Wo.index.name = "Factor"
    Wo.columns = list(counts2.columns)

    Ho_norm = Ho_norm[Ho_norm.index.isin(ok_barcodes)]
    Ho_norm = Ho_norm / Ho_norm.std(axis=0)
    schema_debug("Flag 314.080 ", Ho_norm.shape, Wo.shape)

    genexp = counts2[counts2.index.isin(ok_barcodes)].sort_index()
    beadloc = coords2[coords2.index.isin(ok_barcodes)].sort_index()
    clstrs = clstrs[clstrs.index.isin(ok_barcodes)].sort_index()
    Ho_norm = Ho_norm.sort_index()
    schema_debug("Flag 314.090 ", genexp.shape, beadloc.shape, clstrs.shape, Ho_norm.shape,
                 genexp.index[:5], beadloc.index[:5])

    beadloc["atlas_cluster"] = clstrs["atlas_cluster"]
    if "AnnData" not in dir():
        from anndata import AnnData
    adata = AnnData(X=genexp.values, obs=beadloc,
                    uns={"Ho": Ho_norm,
                         "Ho.index": list(Ho_norm.index),
                         "Ho.columns": list(Ho_norm.columns),
                         "Wo": Wo,
                         "Wo.index": list(Wo.index),
                         "Wo.columns": list(Wo.columns)})
    return adata
"""--------------------------------------------------------------------------------------------------------------------------------------------------- Cluster deltaDF to identify trends and groups """ """ Perform one hot encoding for categorical features """ EncodedDF = pd.get_dummies(deltaDF, columns=['Subject', 'Condition', 'Limb']) #Normalize values in each column (except categorical features) to have mean of 0 and variance of 1----------------------------------------------------- deltaDF_norm = StandardScaler().fit_transform(deltaDF.loc[:,:'Stride_Width_Mean']) deltaDF_norm = pd.DataFrame(deltaDF_norm) deltaDF_norm = pd.concat([deltaDF_norm, EncodedDF.loc[:, 'Subject_0':]], axis = 1) deltaDF_norm.columns = EncodedDF.columns #------------------------------------------------------------------------------------------------------------------------------------------------------- """ PCA """ pca = PCA(n_components=5) pComponents = pca.fit_transform(Features) #Plot explained variance features = range(pca.n_components_) plt.bar(features, pca.explained_variance_ratio_, color='black') plt.xlabel('PCA features') plt.ylabel('variance %') plt.xticks(features) PCA_components = pd.DataFrame(pComponents)
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering
from tabulate import tabulate

# Bring in the categorical variables with their new numeric values and update the data.
numData = pd.DataFrame(numD)
for i in cat:
    data[i] = numData[i]

# Compute the mean of each variable grouped by client, then standardize.
users = data.groupby(['ID_USER']).mean()
users = StandardScaler().fit_transform(users)
users = pd.DataFrame(users)
users.columns = [
    'genero', 'monto', 'fecha', 'hora', 'dispositivo', 'establecimiento',
    'ciudad', 'tipo_tc', 'linea_tc', 'interes_tc', 'status_txn', 'is_prime',
    'dcto', 'cashback', 'fraude'
]

# Project the users onto the first two principal components and visualise them.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(users)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['principal component 1', 'principal component 2'])

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal component 1', fontsize=15)
ax.set_ylabel('Principal component 2', fontsize=15)
ax.set_title('Visualisation of the users with 2-component PCA', fontsize=20)
ax.scatter(principalDf['principal component 1'], principalDf['principal component 2'])
#%%
#####################################################################################
# clustering
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

##############################
# Hierarchical clustering
# USArrests data
usarrest = pd.read_csv("./data/usarrest.csv")
Xname = ['Murder', 'Assault', 'UrbanPop', 'Rape']
Xdata = StandardScaler().fit_transform(usarrest[Xname])
Xdata = pd.DataFrame(Xdata)
Xdata.columns = Xname
Xdata.describe()

# Calculate the linkage: mergings
mergings = linkage(Xdata, method='average')

# Plot the dendrogram, using the state names as labels
plt.figure(figsize=(20, 10))
dendrogram(mergings, leaf_rotation=90, leaf_font_size=20, labels=usarrest['State'].values)
plt.show()

# Calculate means for each cluster
cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='average')
group = cluster.fit_predict(Xdata)
t.check_question_two(solution_2_dict)


# `3.` Fit PCA to reduce the current dimensionality of the dataset to 3 dimensions. You can use the helper functions, or perform the steps on your own. If you fit on your own, be sure to standardize your data. At the end of this process, you will want an X matrix with the reduced dimensionality to only 3 features. Additionally, you will want your **pca** object back that has been used to fit and transform your dataset.

# In[14]:


# Scale your data, fit, and transform using PCA
df_ss = StandardScaler().fit_transform(df)


# In[15]:


# Create a dataframe
df_ss = pd.DataFrame(df_ss)
df_ss.columns = [
    'Sports', 'SUV', 'Wagon', 'Minivan', 'Pickup', 'AWD', 'RWD', 'Retail',
    'Dealer', 'Engine', 'Cylinders', 'Horsepower', 'CityMPG', 'HighwayMPG',
    'Weight', 'Wheelbase', 'Length', 'Width'
]


# In[16]:


# Check the first few rows
df_ss.head()


# In[17]:


df_ss.describe()


# In[18]:


# Reduce features down to 3
test = test.drop(cols, axis=1)

print("Train Rows: " + str(train.shape[0]) + ", Columns: " + str(train.shape[1]))
print("Test Rows: " + str(test.shape[0]) + ", Columns: " + str(test.shape[1]))

# Standardize train and test, keeping the original column names.
col_names = train.columns.values
train_norm = StandardScaler().fit_transform(train.values)
test_norm = StandardScaler().fit_transform(test.values)
train_norm = pd.DataFrame(train_norm)
# train_norm = train_norm.add_prefix('feature_')
test_norm = pd.DataFrame(test_norm)
# test_norm = test_norm.add_prefix('feature_')
train_norm.columns = col_names
test_norm.columns = col_names

train_norm['target'] = target
train_norm['ID'] = train_id
test_norm['ID'] = test_id
train_norm.to_csv(path + 'removed_train.csv', index=False)
test_norm.to_csv(path + 'removed_test.csv', index=False)

ntrain = train.shape[0]
ntest = test.shape[0]
# print('final column list')
# print(list(train.columns.values))

print("Combine Train and Test")
df = pd.concat([train, test], axis=0)
# In[177]:


fraud_data.TransactionAmt.max()


# In[183]:


# Standardization / Normalization
from sklearn.preprocessing import StandardScaler

scaled_features = StandardScaler().fit_transform(X)
scaled_features = pd.DataFrame(data=scaled_features)
scaled_features.columns = X.columns
scaled_features.head()


# In[187]:


# Splitting the data
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
# X_train: independent feature data for training the model
# Y_train: dependent feature data for training the model
# X_test: independent feature data for testing the model; will be used to predict the target values
# Y_test: original target values of X_test; we will compare these values with our predicted values