def make_preprocessing_pandas(self, _df_csv_read_ori, _preprocessing_type, _label):
    """Preprocess the numeric feature columns of a DataFrame with scikit-learn.

    Label columns are never preprocessed. Missing values of a processed
    column are replaced by 0.0 first. The frame is modified in place and the
    same object is returned.

    Args:
        _df_csv_read_ori: pandas DataFrame to preprocess.
        _preprocessing_type: ``None`` or ``'null'`` to skip preprocessing,
            otherwise one name, or a collection of names, out of 'scale',
            'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale'.
        _label: label column name, or a collection of label column names.
            ``None`` means there is no label column.

    Returns:
        The preprocessed DataFrame (the object that was passed in).
    """
    if _preprocessing_type is None or _preprocessing_type == 'null':
        logging.info("No Preprocessing")
        return _df_csv_read_ori

    logging.info("Preprocessing type : {0}".format(_preprocessing_type))

    # A bare string is ONE name. Testing it with ``in`` would be a substring
    # match, so 'minmax_scale' would also trigger 'scale'.
    if isinstance(_preprocessing_type, str):
        requested = [_preprocessing_type]
    else:
        requested = list(_preprocessing_type)

    # Same reasoning for the label: 'a' must not be skipped because the
    # label happens to be called 'label'.
    if _label is None:
        label_columns = []
    elif isinstance(_label, str):
        label_columns = [_label]
    else:
        label_columns = list(_label)

    def _unit_norm(column):
        # normalize() only accepts 2-D input and works row-wise, so the
        # column is presented as a single row and unpacked again.
        return preprocessing.normalize(column.values.reshape(1, -1))[0]

    # Kept in the order the original code applied them.
    available = (
        ('scale', preprocessing.scale),
        ('minmax_scale', preprocessing.minmax_scale),
        ('robust_scale', preprocessing.robust_scale),
        ('normalize', _unit_norm),
        ('maxabs_scale', preprocessing.maxabs_scale),
    )
    active = [func for name, func in available if name in requested]

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for column, dtype in _df_csv_read_ori.dtypes.items():
        if dtype not in numerics or column in label_columns:
            continue
        for func in active:
            _df_csv_read_ori[column] = func(
                _df_csv_read_ori[column].fillna(0.0))

    return _df_csv_read_ori
def parameter_scan(img_id, kind, cols='x y'.split(), only_core=True,
                   do_scale=True, proba_cut=0.9, factor=0.1):
    """Plot clustering results for a grid of HDBSCAN parameters.

    Every combination of ``min_cluster_size`` and ``min_samples`` derived
    from ``factor * n_marked_classifications`` gets its own panel; the last
    two panels show the raw markings and the subframe. The figure is saved
    below ``plots/<kind>/``.

    Parameters
    ----------
    img_id : identifier handed to ``markings.TileID``
    kind : str
        Key into the plotting dispatch, 'blotch' or 'fan'.
    cols : list of str
        Data columns that are clustered on.
    only_core, proba_cut : forwarded to ``HDBScanner``
    do_scale : bool
        Apply ``robust_scale`` to the clustering input.
    factor : float
        Fraction of the marked classifications used as base sample size.
    """
    p4id = markings.TileID(img_id, scope='planet4')
    functions = dict(blotch=p4id.plot_blotches, fan=p4id.plot_fans)

    min_samples_base = round(factor * p4id.n_marked_classifications)
    min_cluster_size_vals = [min_samples_base, round(1.5 * min_samples_base)]
    min_samples_vals = [1, min_samples_base, round(1.5 * min_samples_base)]

    data = p4id.filter_data(kind)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that exists in every pandas version.
    X = data[cols].values
    if do_scale:
        X = robust_scale(X)

    fig, ax_grid = plt.subplots(nrows=len(min_cluster_size_vals),
                                ncols=len(min_samples_vals) + 1)
    axes = ax_grid.flatten()

    parameter_grid = product(min_cluster_size_vals, min_samples_vals)
    for panel, (mcs, ms) in zip(axes, parameter_grid):
        logger.debug("Running with %i and %i.", mcs, ms)
        if ms > mcs:
            # Combination is skipped; leave a labelled empty panel.
            panel.set_title('ms > mcs')
            panel.set_axis_off()
            continue

        clusterer = HDBScanner(X, mcs, ms, proba_cut=proba_cut,
                               only_core=only_core, metric='manhattan')
        reduced_data = post_processing(kind, data, clusterer)
        plot_results(clusterer, data, p4id, kind, reduced_data, ax=panel)
        panel.set_title('MCS: {}, MS: {}\nn_clusters: {}, averaged: {}'
                        .format(mcs, ms, clusterer.n_clusters,
                                len(reduced_data)),
                        fontsize=6)

        # Mark the points whose outlier score is above the 90 % quantile.
        scores = clusterer.hdbscan.outlier_scores_
        threshold = pd.Series(scores).quantile(0.9)
        outliers = np.where(scores > threshold, True, False)
        panel.scatter(data.loc[outliers, 'x'], data.loc[outliers, 'y'],
                      marker='x', s=15, linewidth=1, c='red', alpha=0.75)

    p4id.show_subframe(ax=axes[-1])
    functions[kind](ax=axes[-2], lw=0.25)
    fig.suptitle("n_class: {}, ncols: {}, factor: {}, scale: {}"
                 .format(p4id.n_marked_classifications, len(cols), factor,
                         do_scale))
    savepath = ("plots/{}/{}_lencols{}_factor{}_scale{}.png"
                .format(kind, img_id, len(cols), factor, do_scale))
    fig.savefig(savepath, dpi=200)
) if args.fit: # # fit the hybrid model # # prepare input song features and playlist targets at training X_fit, Y_fit = shape_data( playlists_idx, songs_idx, idx2song, features, mode='train', subset=fit_idx ) # preprocess input features if required if model.standardize: X_fit = prep.robust_scale(X_fit) if model.normalize: X_fit = prep.normalize(X_fit, norm=model.normalize) # fit the classifier fit( model=model, fit_input=X_fit.astype(theano.config.floatX), fit_target=Y_fit.astype(np.int8), out_dir=out_dir, random_state=rng ) if args.test: #
def _fill_result_table(self):
    """Copy ``self.result`` (converted to strings) into ``self.resultWidget``."""
    columns = list(map(str, self.result.keys()))
    self.result = self.result.astype(str)
    item_count = len(self.result[columns[0]])
    self.resultWidget.setRowCount(item_count)
    self.resultWidget.setColumnCount(len(columns))
    self.resultWidget.setHorizontalHeaderLabels(columns)
    for row_pos in range(item_count):
        for col_pos in range(len(columns)):
            # iat is purely positional; ``row[i]`` on a label-indexed
            # Series relies on a deprecated positional fallback.
            item = QTableWidgetItem(self.result.iat[row_pos, col_pos])
            item.setTextAlignment(Qt.AlignVCenter | Qt.AlignRight)
            self.resultWidget.setItem(row_pos, col_pos, item)


def show_result(self):
    """Predict per-branch probabilities for ``self.brand`` and display them.

    For a known brand code the binned input sheet and the saved model are
    loaded, the predicted probabilities (rounded to 3 decimals) are stored
    in ``self.result`` and shown in the table, and the brand's note file is
    shown in the text box. Depending on the two check boxes the table is
    sorted in descending order and/or exported to Excel. Finally a bar chart
    of the five highest entries replaces whatever ``self.resultLayout``
    contained, and both check boxes are cleared.

    An unknown brand code skips the prediction step and keeps whatever
    ``self.result`` already holds.
    """
    self.hideAll()
    self.verticalLayoutWidget.show()
    self.plainTextResult.show()
    self.resultWidget.show()
    header_style = "QHeaderView::section {background-color:#D9E5FF;color:#000000;}"
    self.resultWidget.horizontalHeader().setStyleSheet(header_style)
    self.resultWidget.horizontalHeader().setStretchLastSection(True)
    self.resultWidget.verticalHeader().setStyleSheet(header_style)

    # brand -> (probability column label, model file, note file,
    #           feature scaler or None, predict_proba column, max rows shown)
    brand_settings = {
        "E": ("폐점확률", "./model/emart.pk1", "./txt_file/emart.txt",
              robust_scale, 0, 138),
        "ET": ("전환확률", "./model/emart_tr.pk1", "./txt_file/emart_tr.txt",
               robust_scale, 1, 22),
        "H": ("폐점확률", "./model/homeplus.pk1", "./txt_file/homeplus.txt",
              scale, 0, 119),
        "HS": ("전환확률", "./model/special.pk1",
               "./txt_file/homeplus_special.txt", robust_scale, 1, 119),
        "L": ("폐점확률", "./model/lotte.pk1", "./txt_file/lotte.txt",
              None, 0, 109),
    }

    settings = brand_settings.get(self.brand)
    if settings is not None:
        (prob_label, model_path, notes_path,
         scaler, class_index, max_rows) = settings

        self.result = pd.DataFrame(columns=["지점명", prob_label])
        self.df = pd.read_excel(
            "./Data/preprocessing/{}/binning.xlsx".format(self.brand))
        LR = joblib.load(model_path)

        # Scaling statistics are computed on every row of the sheet, the
        # row limit is only applied to what is displayed.
        inputs = self.df.iloc[:, 2:-1]
        if scaler is not None:
            inputs = scaler(inputs)
        probabilities = np.round(LR.predict_proba(inputs), 3)

        shown = min(max_rows, len(probabilities))
        self.result = pd.DataFrame(
            {
                "지점명": self.df.iloc[:shown, 1].values,
                prob_label: probabilities[:shown, class_index],
            },
            columns=["지점명", prob_label],
        )

        self._fill_result_table()
        self.resultWidget.resizeRowsToContents()

        # ``with`` closes the file even when reading or the widget call fails.
        with open(notes_path, "r", encoding="UTF-8") as notes_file:
            self.plainTextResult.setPlainText(notes_file.read())

    if self.checkBox_2.isChecked():
        self.result.sort_values(by=self.result.keys()[-1], ascending=False,
                                inplace=True)
        self._fill_result_table()

    if self.checkBox.isChecked():
        self.result.to_excel(
            "./save_file/result_{}.xlsx".format(self.brand), index=False)

    # Detach the previous chart before adding the new one.
    for i in reversed(range(self.resultLayout.count())):
        self.resultLayout.itemAt(i).widget().setParent(None)

    fig = plt.Figure()
    ax = fig.add_subplot(111)
    a = self.result.sort_values(by=self.result.keys()[-1], ascending=False)
    # Top five, reversed so that the largest bar ends up on top.
    ax.barh(a.iloc[:5, 0][::-1], a.iloc[:5, 1][::-1].astype(float), 0.4)
    ax.set_xticks(np.arange(0, 1.1, step=0.2))
    ax.xaxis.set_tick_params(labelsize=7)
    ax.yaxis.set_tick_params(labelsize=7, rotation=60)
    ax.set(title="<{} 상위 5개 지점>".format(a.keys()[-1]))
    canvas = FigureCanvas(fig)
    canvas.draw()
    self.resultLayout.addWidget(canvas)
    canvas.show()

    self.checkBox.setChecked(False)
    self.checkBox_2.setChecked(False)
# RationalQuadratic = gp.kernels.RationalQuadratic()
regr1 = gp.GaussianProcessRegressor(alpha=1e-5, n_restarts_optimizer=5)
regr2 = gp.GaussianProcessRegressor(alpha=1e-5, n_restarts_optimizer=5)

# Read data: every array is flattened to (n_samples, last_dimension).
mat = scipy.io.loadmat("Data_and_training_sample.mat")
mask = np.array(mat["mask"], dtype=bool)
t1_test = np.array(mat["t1"], dtype=float)
t1_test = np.reshape(t1_test, (-1, t1_test.shape[-1]))
t2_test = np.array(mat["t2"], dtype=float)
t2_test = np.reshape(t2_test, (-1, t2_test.shape[-1]))
idx = np.where(mask)[0]

# Normalise data
# NOTE(review): t1 and t2 are not assigned anywhere in the visible part of
# this script (only t1_test / t2_test are) - confirm they are defined
# earlier, or whether t1_test / t2_test were meant here.
t1 = pre.robust_scale(t1)
t2 = pre.robust_scale(t2)
t1_tr = t1[idx.transpose()]
t2_tr = t2[idx.transpose()]
del mat

regr1.fit(t1_tr, t2_tr)

# Predict in fixed-size chunks. Stepping the range by the chunk size keeps
# the bounds integral (``1 + n / 31000`` is a float under Python 3 and makes
# range() raise) and never produces an empty trailing chunk.
chunk_size = 31000
tic = time.time()
predicted_chunks = [np.empty((0, t2.shape[1]))]
for start in range(0, t1.shape[0], chunk_size):
    predicted_chunks.append(regr1.predict(t1[start:start + chunk_size, :]))
# One concatenation instead of re-copying the result on every iteration.
t1_hat = np.concatenate(predicted_chunks, axis=0)
toc = time.time()
et = toc - tic
def _append_lags(df, columns, lags):
    """Add a ``<column>_lag_<k>`` shifted copy for every k in ``lags`` (in place)."""
    for lag in lags:
        for column in columns:
            df[str(column) + '_lag_' + str(lag)] = df[column].shift(lag)


def extreme_random_byhour(df, evaluate_var='DUMMY_30_DAY'):
    """Fit a price model on the rows flagged 0 and apply it to the rows flagged 1.

    Calendar dummies and lagged regressors are added, rows with missing
    values are dropped and the data is split on ``evaluate_var``. Both parts
    are robust-scaled independently. A gradient boosting regressor is
    trained on part of the ``evaluate_var == 0`` rows to predict
    ``PESPANIA`` and evaluated on the held-out rows and on the
    ``evaluate_var == 1`` rows. Metrics are printed, both prediction tables
    are written to ``prediction_normal_dia.csv`` and
    ``prediction_anormal_dia.csv``, and three figures are shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is copied, the caller's frame is left untouched.
    evaluate_var : str
        Which of the ``DUMMY*`` indicator columns splits the data. The other
        indicator columns are dropped.

    Raises
    ------
    ValueError
        If ``evaluate_var`` is not one of the known indicator columns.
    """
    # Work on a copy: the original deleted columns from the caller's frame,
    # which made a second call on the same frame fail.
    df = df.copy()

    del df['PPORTUGAL']
    df['FECHA'] = (df['ANIO'].map(str) + '-' + df['MES'].map(str) + '-'
                   + df['DIA'].map(str))
    df['FECHA'] = pd.to_datetime(df['FECHA'], format='%Y-%m-%d')
    df['WEEKDAY'] = df['FECHA'].dt.dayofweek
    df['ANIO'] = df['ANIO'].map(int)
    df['MES'] = df['MES'].map(int)
    df['DIA'] = df['DIA'].map(int)
    for column in ('HORA', 'FECHA', 'FECHA_HORA', 'DIA'):
        del df[column]

    # TARGET VARIABLE: keep only the indicator that is being evaluated.
    dummy_important = [
        'DUMMY', 'DUMMY_5_DAY', 'DUMMY_10_DAY', 'DUMMY_15_DAY',
        'DUMMY_20_DAY', 'DUMMY_30_DAY'
    ]
    dummy_important.remove(evaluate_var)
    for column in dummy_important:
        del df[column]

    # DUMMIES: one-hot encode the calendar columns.
    for column in ['ANIO', 'MES', 'WEEKDAY']:
        dummy = pd.get_dummies(df[column], prefix=str(column))
        df = pd.concat([df, dummy], axis=1)
        del df[column]

    # LAGS
    lag_AR = 28
    _append_lags(df, ['PESPANIA'], range(1, lag_AR + 1))

    lag_number = 24
    lag_variables = [
        'TOTAL_IMPORTACION_ES', 'TOTAL_PRODUCCION_ES', 'TOTAL_DEMANDA_NAC_ES',
        'TOTAL_EXPORTACIONES_ES', 'TOTAL_DDA_ES', 'TOTAL_POT_IND_ES',
        'HIDRAULICA_CONVENC', 'HIDRAULICA_BOMBEO', 'NUCLEAR',
        'CARBON NACIONAL', 'CARBON_IMPO', 'CICLO_COMBINADO', 'FUEL_SIN_PRIMA',
        'FUEL_PRIMA', 'REG_ESPECIAL', 'PRICE_OIL', 'PRICE_GAS', 'RISK_PREMIUM'
    ]
    # NOTE(review): this group stops at lag 23 while the two groups below go
    # up to lag 24 - kept as in the original, confirm whether intended.
    _append_lags(df, lag_variables, range(1, lag_number))

    climaticas = [
        'TME_MADRID', 'TMAX_MADRID', 'TMIN_MADRID', 'PP_MADRID', 'TME_BCN',
        'TMAX_BCN', 'TMIN_BCN', 'PP_BCN'
    ]
    _append_lags(df, climaticas, range(1, lag_number + 1))

    portugal = ['TOTAL_DEMANDA_POR', 'TOTAL_PRODUCCION_POR']
    _append_lags(df, portugal, range(1, lag_number + 1))

    df = df.dropna(how='any', axis=0)

    # drop() returns new frames, so no column is deleted from a slice of df.
    normal = df[df[evaluate_var] == 0].drop(evaluate_var, axis=1)
    anormal = df[df[evaluate_var] == 1].drop(evaluate_var, axis=1)

    # NORMALIZE
    # The column names are passed as a flat list; wrapping them in another
    # list (as before) creates a MultiIndex and tuple-valued feature names.
    # NOTE(review): each subset is scaled with its own statistics - kept.
    normal = pd.DataFrame(preprocessing.robust_scale(normal),
                          columns=normal.columns.values.tolist())
    anormal = pd.DataFrame(preprocessing.robust_scale(anormal),
                           columns=anormal.columns.values.tolist())

    total_values = len(df.index)
    print('total rows ', total_values)
    anormal_values = len(anormal.index)
    print('anormal rows ', anormal_values)
    proportion = anormal_values / total_values
    print('proportion of anormal ', proportion)

    normalY = normal[['PESPANIA']]
    normalX = normal.drop('PESPANIA', axis=1)
    anormalY = anormal[['PESPANIA']]
    anormalX = anormal.drop('PESPANIA', axis=1)

    fileNames = np.array(normalX.columns.values)

    # The test share of the "normal" rows equals the share of "anormal"
    # rows in the whole sample.
    X_train, X_test, y_train, y_test = train_test_split(normalX,
                                                        normalY,
                                                        test_size=proportion,
                                                        random_state=42)

    iTrees = 2000
    # At least one sample per leaf: small inputs would otherwise round to 0,
    # which the estimator rejects.
    min_samples_leaf = max(1, int(round(len(X_train.index) * 0.005)))
    print('min_samples_leaf ', min_samples_leaf)
    min_samples_split = min_samples_leaf * 10
    print('min_samples_split ', min_samples_split)
    print('iTrees ', iTrees)
    depth = 50
    maxFeat = (round((len(df.columns) / 3)))
    print('Feature Set ', maxFeat)

    # Only the gradient boosting model was ever fitted; the extra-trees and
    # random-forest instances that were built next to it were never used.
    fileModel = ensemble.GradientBoostingRegressor(
        learning_rate=0.01,
        n_estimators=500,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_depth=depth,
        verbose=1)

    # ravel(): the estimator expects a 1-D target.
    fileModel.fit(X_train.values, y_train.values.ravel())

    prediction_normal = fileModel.predict(X_test.values)
    print('MSE NORMAL ', mean_squared_error(y_test, prediction_normal))
    print('R2 NORMAL ', r2_score(y_test, prediction_normal))
    prediction_normal = pd.DataFrame(prediction_normal, index=y_test.index)
    prediction_normal = pd.concat([y_test, prediction_normal], axis=1)
    prediction_normal.columns = [
        'PESPANIA_REAL_NO_COLUSION', 'PESPANIA_PRED_NO_COLUSION'
    ]
    prediction_normal['DIF_PORC'] = (
        prediction_normal['PESPANIA_REAL_NO_COLUSION'] -
        prediction_normal['PESPANIA_PRED_NO_COLUSION']
    ) / prediction_normal['PESPANIA_PRED_NO_COLUSION']
    print('PRECIO PROMEDIO PREDICHO - NO COLUSION %.5f' %
          prediction_normal['PESPANIA_PRED_NO_COLUSION'].mean())
    print('PRECIO PROMEDIO REAL - NO COLUSION %.5f ' %
          prediction_normal['PESPANIA_REAL_NO_COLUSION'].mean())
    print('DIFERENCIA PROMEDIO PORCENTUAL (REAL/PRED -1)',
          prediction_normal['DIF_PORC'].mean() * 100, '%')
    prediction_normal.to_csv('prediction_normal_dia.csv', sep=';',
                             index=False)

    prediction_anormal = fileModel.predict(anormalX.values)
    print('MSE ANORMAL ', mean_squared_error(anormalY, prediction_anormal))
    print('R2 ANORMAL ', r2_score(anormalY, prediction_anormal))
    prediction_anormal = pd.DataFrame(prediction_anormal,
                                      index=anormalY.index)
    prediction_anormal = pd.concat([anormalY, prediction_anormal], axis=1)
    prediction_anormal.columns = [
        'PESPANIA_REAL_COLUSION', 'PESPANIA_PRED_COLUSION'
    ]
    prediction_anormal['DIF_PORC'] = (
        prediction_anormal['PESPANIA_REAL_COLUSION'] -
        prediction_anormal['PESPANIA_PRED_COLUSION']
    ) / prediction_anormal['PESPANIA_PRED_COLUSION']
    print('PRECIO PROMEDIO PREDICHO - COLUSION %.5f' %
          prediction_anormal['PESPANIA_PRED_COLUSION'].mean())
    print('PRECIO PROMEDIO REAL - COLUSION %.5f' %
          prediction_anormal['PESPANIA_REAL_COLUSION'].mean())
    print('DIFERENCIA PROMEDIO PORCENTUAL (REAL/PRED -1)',
          prediction_anormal['DIF_PORC'].mean() * 100, '%')
    prediction_anormal.to_csv('prediction_anormal_dia.csv', sep=';',
                              index=False)

    # Predicted against real price for both groups.
    fig, ax = plot.subplots()
    sns.regplot(y='PESPANIA_PRED_COLUSION',
                x='PESPANIA_REAL_COLUSION',
                data=prediction_anormal,
                ax=ax,
                label='COLUSION')
    sns.regplot(y='PESPANIA_PRED_NO_COLUSION',
                x='PESPANIA_REAL_NO_COLUSION',
                data=prediction_normal,
                ax=ax,
                label='NON-COLUSION')
    ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3",
            label='perfect prediction')
    plot.legend(loc='best')
    plot.title('Differences between Prices using ERF')
    plot.show()

    # Predicted and real price of the flagged rows against the row index.
    fig, ax = plot.subplots()
    prediction_anormal = prediction_anormal.reset_index()
    sns.regplot(y='PESPANIA_PRED_COLUSION',
                x='index',
                data=prediction_anormal,
                ax=ax,
                label='PREDICTED')
    sns.regplot(y='PESPANIA_REAL_COLUSION',
                x='index',
                data=prediction_anormal,
                ax=ax,
                label='REAL')
    plot.legend(loc='best')
    plot.title('Differences between Prices')
    plot.show()

    # Ten most important features, scaled relative to the largest one.
    featureImportance = fileModel.feature_importances_
    featureImportance = featureImportance / featureImportance.max()
    sorted_idx = np.argsort(featureImportance)
    fi = featureImportance[sorted_idx][-10:]
    barPos = (np.arange(sorted_idx.shape[0]) + 0.5)[-10:]
    plot.barh(barPos, fi, align='center')
    fileNames = fileNames[sorted_idx][-10:]
    plot.yticks(barPos, fileNames)
    plot.xlabel('Variable Importance')
    plot.show()
tempDF.index = pd.to_datetime(tempDF.index, yearfirst=True) tempDF = tempDF.resample('W').agg(np.nansum) tempDF.replace(0, np.nan, inplace=True) firstIndex = tempDF.first_valid_index() lastIndex = tempDF.last_valid_index() tempDFCut = tempDF.loc[firstIndex:lastIndex] tempDFCut = tempDFCut.ewm( span=np.max([1, len(tempDFCut.index) * ewmParameter])).mean() tempDF = tempDF.ewm( span=np.max([1, len(tempDFCut.index) * ewmParameter])).mean() trajectoriesSet[column] = [ list(robust_scale(tempDFCut[column].values)) ] trajectoriesSmoothOriginal[column] = list(tempDF[column].values) trajectoriesRaw[column] = list(data[column].fillna(0)) maxLength = max([len(value[0]) for _, value in trajectoriesSet.items()]) trajectoriesSetProcessed = {} for key, value in trajectoriesSet.items(): value = value[0] if len(value) == maxLength: trajectoriesSetProcessed[key] = np.array(value).reshape( 1, len(value)) continue
def investigate(FEATURES,
                PERFORMANCE,
                random_state=0,
                scale_features='maxabs',
                scale_performance=None,
                images_dir=cfg.finvestig_images_dir,
                data_dir=cfg.finvestig_data_dir,
                results_dir=cfg.finvestig_results_dir):
    '''Select the most important features with a random forest regressor.

    The forest assigns a score to every feature; all features whose score
    is above ``2 * mean`` of the scores are kept.

    Instance names are assumed to be unique within each input file; only
    instances present in both files are used.

    :param FEATURES: Features file in CSV format. Column 0 is the instance
        name, row 0 holds the feature names. The last column of every data
        row is ignored (it is expected to be the empty string left by
        feature selection).
    :param PERFORMANCE: Performance file in CSV format. Column 0 is the
        instance name, row 0 holds the solver names.
    :param random_state: Random seed (int) for the random forest. default=0
    :param scale_features: Column-wise scaling of the features (i.e. each
        feature individually). default='maxabs'. Any other value than the
        ones listed leaves the features unscaled.
        - maxabs = Scale to [-1,1]
        - scale = Zero mean and unit stdev
        - minmax = Translate and scale to [0,1]
        - normalize = ``preprocessing.normalize`` with its default settings
        - robust = Centre and scale using the interquartile range
    :param scale_performance: Row-wise scaling of the performance data
        (i.e. each instance individually). default=None.
        - maxabs = Scale to [-1,1]
        - scale = Zero mean and unit stdev
        - minmax = Translate and scale to [0,1]
        - normalize = Normalize each row to unit norm
        - default_scale = Add 1000 to each entry, and row-wise divide by
          the (shifted) first entry
    :param images_dir: Directory to dump images (not used by this function).
    :param data_dir: Directory the reduced feature CSV is written to.
    :param results_dir: Directory the list of top features is written to.
    :return: None. Two files are written, both carrying the input file
        names in their own name:
        - ``rfr_top_features_<stamp>.txt`` in results_dir: names of the
          selected features.
        - ``<stamp>_reduced-byRFR.csv`` in data_dir: the matched instances
          restricted to the selected features.
    :raises ValueError: if no instance appears in both input files.
    '''

    ###################################################################
    # Section 1A: Grabs Data
    ###################################################################
    stamp = '%s_%s' % (os.path.basename(FEATURES).split('.')[0],
                       os.path.basename(PERFORMANCE).split('.')[0])

    # newline='' is what the csv module asks for; blank lines are skipped
    # because they carry no instance name.
    with open("%s" % (FEATURES), newline='') as f:
        data_f = [row for row in csv.reader(f, delimiter=",") if row]
    feature_names = list(data_f[0])
    features = data_f[1:]

    with open("%s" % (PERFORMANCE), newline='') as f:
        data_p = [row for row in csv.reader(f, delimiter=",") if row]
    performances = data_p[1:]

    ###################################################################
    # Section 1B: Sync up data so that only instances with both
    # feature vectors and performance data is trained on
    ###################################################################
    perf_by_name = {}
    for row in performances:
        perf_by_name.setdefault(row[0], row)

    instances_matched = []
    features_matched = []
    performances_matched = []
    for row in features:
        instance_name = os.path.basename(row[0]).split('.')[0]
        if instance_name in perf_by_name:
            instances_matched.append(instance_name)
            features_matched.append(row)
            performances_matched.append(perf_by_name[instance_name])

    if not instances_matched:
        raise ValueError('no instance appears in both %s and %s' %
                         (FEATURES, PERFORMANCE))

    # The csv module yields strings; convert both tables explicitly.
    # The -1 removes the empty string coming from feature selection.
    features_tot = np.array([row[1:-1] for row in features_matched],
                            dtype=float)
    performances_tot = np.array(
        [[float(value) for value in row[1:]]
         for row in performances_matched])

    ###################################################################
    # Section 1C: Scale the feature/performance data
    ###################################################################
    feature_scalers = {
        'scale': preprocessing.scale,
        'maxabs': preprocessing.maxabs_scale,
        'minmax': preprocessing.minmax_scale,
        'normalize': preprocessing.normalize,
        'robust': preprocessing.robust_scale,
    }
    feature_scaler = feature_scalers.get(scale_features)
    if feature_scaler is not None:
        features_tot = feature_scaler(features_tot)

    performance_scalers = {
        'scale': preprocessing.scale,
        'maxabs': preprocessing.maxabs_scale,
        'minmax': preprocessing.minmax_scale,
        'normalize': preprocessing.normalize,
    }
    if scale_performance in performance_scalers:
        performances_tot = performance_scalers[scale_performance](
            performances_tot, axis=1)
    elif scale_performance == 'default_scale':
        # Shift by 1000 and divide each row by its (shifted) first entry.
        performances_tot = ((performances_tot + 1000) /
                            (performances_tot[:, [0]] + 1000))

    ###################################################################
    # Section 2: Find the top features and save reduced feature file
    ###################################################################
    rf_regress = RandomForestRegressor(max_features="sqrt",
                                       random_state=random_state,
                                       max_depth=None,
                                       n_estimators=250,
                                       verbose=0)
    rf_regress.fit(features_tot, performances_tot)

    selector = SelectFromModel(rf_regress, prefit=True, threshold='2*mean')
    Indices = selector.get_support(indices=True)
    # +1 because column 0 of the file is the instance name.
    selected_columns = [index + 1 for index in Indices]
    top_features = [feature_names[column] for column in selected_columns]
    np.savetxt('%s/rfr_top_features_%s.txt' % (results_dir, stamp),
               top_features,
               fmt='%s')

    # Values are taken from the MATCHED rows. Indexing the raw file by row
    # position pairs names with the wrong values as soon as one instance
    # has no performance data.
    DATA = [['name'] + top_features]
    for instance, feature_row in zip(instances_matched, features_matched):
        DATA.append([instance] +
                    [feature_row[column] for column in selected_columns])

    with open('%s/%s_reduced-byRFR.csv' % (data_dir, stamp), 'w',
              newline='') as f:
        writer = csv.writer(f)
        writer.writerows(DATA)
def fit(self):
    """Extract features from data.

    For every channel the signal is band-pass filtered, cut into 30-second
    epochs and described by time-domain, spectral (not for 'emg') and
    complexity features. Smoothed and robust-scaled copies of all features
    plus time and metadata columns are appended. The result is stored in
    ``self._features`` and its column names in ``self.feature_name_``.

    Returns
    -------
    self : returns an instance of self.
    """
    #######################################################################
    # MAIN PARAMETERS
    #######################################################################

    # Bandpass filter
    freq_broad = (0.4, 30)
    # FFT & bandpower parameters
    win_sec = 5  # = 2 / freq_broad[0]
    sf = self.sf
    win = int(win_sec * sf)
    kwargs_welch = dict(window='hamming', nperseg=win, average='median')
    bands = [(0.4, 1, 'sdelta'), (1, 4, 'fdelta'), (4, 8, 'theta'),
             (8, 12, 'alpha'), (12, 16, 'sigma'), (16, 30, 'beta')]

    #######################################################################
    # HELPER FUNCTIONS
    #######################################################################

    def nzc(x):
        """Calculate the number of zero-crossings along the last axis."""
        return ((x[..., :-1] * x[..., 1:]) < 0).sum(axis=1)

    def mobility(x):
        """Calculate Hjorth mobility on the last axis."""
        return np.sqrt(np.diff(x, axis=1).var(axis=1) / x.var(axis=1))

    def petrosian(x):
        """Calculate the Petrosian fractal dimension on the last axis."""
        n = x.shape[1]
        ln10 = np.log10(n)
        diff = np.diff(x, axis=1)
        return ln10 / (ln10 + np.log10(n / (n + 0.4 * nzc(diff))))

    #######################################################################
    # CALCULATE FEATURES
    #######################################################################

    features = []

    for i, c in enumerate(self.ch_types):
        # Preprocessing
        # - Filter the data
        dt_filt = filter_data(self.data[i, :], sf,
                              l_freq=freq_broad[0],
                              h_freq=freq_broad[1],
                              verbose=False)
        # - Extract epochs. Data is now of shape (n_epochs, n_samples).
        times, epochs = sliding_window(dt_filt, sf=sf, window=30)

        # Calculate standard descriptive statistics
        hmob = mobility(epochs)

        feat = {
            'std': np.std(epochs, ddof=1, axis=1),
            'iqr': sp_stats.iqr(epochs, rng=(25, 75), axis=1),
            'skew': sp_stats.skew(epochs, axis=1),
            'kurt': sp_stats.kurtosis(epochs, axis=1),
            'nzc': nzc(epochs),
            'hmob': hmob,
            'hcomp': mobility(np.diff(epochs, axis=1)) / hmob
        }

        # Calculate spectral power features (for EEG + EOG)
        freqs, psd = sp_sig.welch(epochs, sf, **kwargs_welch)
        if c != 'emg':
            bp = bandpower_from_psd_ndarray(psd, freqs, bands=bands)
            for j, (_, _, b) in enumerate(bands):
                feat[b] = bp[j]

        # Add power ratios for EEG
        if c == 'eeg':
            delta = feat['sdelta'] + feat['fdelta']
            feat['dt'] = delta / feat['theta']
            feat['ds'] = delta / feat['sigma']
            feat['db'] = delta / feat['beta']
            feat['at'] = feat['alpha'] / feat['theta']

        # Add total power
        idx_broad = np.logical_and(freqs >= freq_broad[0],
                                   freqs <= freq_broad[1])
        dx = freqs[1] - freqs[0]
        feat['abspow'] = np.trapz(psd[:, idx_broad], dx=dx)

        # Calculate entropy and fractal dimension features
        feat['perm'] = np.apply_along_axis(ent.perm_entropy, axis=1,
                                           arr=epochs, normalize=True)
        feat['higuchi'] = np.apply_along_axis(ent.higuchi_fd, axis=1,
                                              arr=epochs)
        feat['petrosian'] = petrosian(epochs)

        # Convert to dataframe; columns are prefixed with the channel type
        feat = pd.DataFrame(feat).add_prefix(c + '_')
        features.append(feat)

    #######################################################################
    # SMOOTHING & NORMALIZATION
    #######################################################################

    # Save features to dataframe
    features = pd.concat(features, axis=1)
    features.index.name = 'epoch'

    # Apply centered rolling average (11 epochs = 5 min 30)
    # Triang: [1/6, 2/6, 3/6, 4/6, 5/6, 6/6 (X), 5/6, 4/6, 3/6, 2/6, 1/6]
    rollc = features.rolling(window=11, center=True, min_periods=1,
                             win_type='triang').mean()
    rollc[rollc.columns] = robust_scale(rollc, quantile_range=(5, 95))
    rollc = rollc.add_suffix('_c5min_norm')

    # Now look at the past 5 minutes
    rollp = features.rolling(window=10, min_periods=1).mean()
    rollp[rollp.columns] = robust_scale(rollp, quantile_range=(5, 95))
    rollp = rollp.add_suffix('_p5min_norm')

    # Add to current set of features
    features = features.join(rollc).join(rollp)

    #######################################################################
    # TEMPORAL + METADATA FEATURES AND EXPORT
    #######################################################################

    # Add temporal features (``times`` is the epoch time vector returned for
    # the last channel processed above)
    features['time_hour'] = times / 3600
    features['time_norm'] = times / times[-1]

    # Add metadata if present
    if self.metadata is not None:
        for c in self.metadata.keys():
            features[c] = self.metadata[c]

    # Downcast float64 to float32 (to reduce size of training datasets)
    cols_float = features.select_dtypes(np.float64).columns.tolist()
    features[cols_float] = features[cols_float].astype(np.float32)
    # Make sure that age and sex are encoded as int
    if 'age' in features.columns:
        features['age'] = features['age'].astype(int)
    if 'male' in features.columns:
        features['male'] = features['male'].astype(int)

    # Sort the column names here (same behavior as lightGBM)
    features.sort_index(axis=1, inplace=True)

    # Add to self
    self._features = features
    self.feature_name_ = self._features.columns.tolist()
    # The documented contract is to return the instance; the method used to
    # fall off the end and return None.
    return self
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from numpy import linalg
import pdb

# Task 3: create new feature(s).
# Keep the dataset under ``my_dataset`` so it can be exported further down.
my_dataset = data_dict

# Extract the feature matrix for local testing and robust-scale it
# (per-column median / IQR) in one step.
data = np.array(
    robust_scale(featureFormat(my_dataset, features_list, sort_keys=True)))

# A min-max scaler is fitted on the already robust-scaled data; it is only
# fitted here, not applied.
scaler = MinMaxScaler().fit(data)
print("Data shape: {}".format(data.shape))

# Outlier scoring with KMeans: a single cluster is fitted and every sample is
# scored by its Euclidean distance to that cluster's centre.
outlier_remover = KMeans(n_clusters=1).fit(data)
cluster_center = np.array(outlier_remover.cluster_centers_[0])
squared_offsets = (data - cluster_center) ** 2
errors = np.sqrt(squared_offsets.sum(axis=1)).reshape(-1, 1)

# Outlier removal with linear regression follows.
# Every column except the two below is imputed with its mean;
# MonthlyIncome uses the median and NumberOfDependents the mode.
mean_list = list(set(columns) - {'MonthlyIncome', 'NumberOfDependents'})
process.fill_missing(df_train=X, df_test=df_test, mean=mean_list,
                     median='MonthlyIncome', mode=['NumberOfDependents'])

# If skew is above 2, add a log feature (applied to train and test alike).
for col in X.columns:
    process.transform(X, col)
    process.transform(df_test, col)

# Model
classifier.try_models(X, y, ['LR'])
# classifier.try_models(X,y, ['LR', 'RF', 'SGD', 'DT', 'GB', 'AB'])

# Scale data for the KNN model.
# robust_scale() returns a NEW array (copy=True by default) and leaves its
# argument untouched, so the result has to be assigned back; calling it for
# its side effect silently trained KNN on unscaled features.
X_scale = X.copy()
X_scale[X_scale.columns] = preprocessing.robust_scale(X_scale)
classifier.try_models(X_scale, y, ['KNN'])
# Separate the feature matrix and the regression target.
dfX = df[[
    "O3", "NO2", "SO2", "china", "china1", "china2", "Wind velocity(m/s)",
    "Wind direction_NW", "Wind direction_S", "Wind direction_SE"
]]
dfy = df["PM10"]

# Split into training and testing sets.
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(
    dfX, dfy, test_size=0.3, random_state=0)
dfX_train.shape, dfX_test.shape, dfy_train.shape, dfy_test.shape

# Scale.
# The centre/scale statistics are learned on the training split only and then
# re-used for the test split. Scaling each split with its own median/IQR puts
# train and test in different coordinate systems and leaks test statistics
# into the evaluation.
from sklearn.preprocessing import RobustScaler

x_scaler = RobustScaler().fit(dfX_train)
dfX_train = x_scaler.transform(dfX_train)
dfX_test = x_scaler.transform(dfX_test)

# The target is a single column: fit on a one-column frame, then flatten back
# to 1-D so that model.coef_ stays one-dimensional below.
y_scaler = RobustScaler().fit(dfy_train.to_frame())
dfy_train = y_scaler.transform(dfy_train.to_frame()).ravel()
dfy_test = y_scaler.transform(dfy_test.to_frame()).ravel()

# Print the fitted linear regression equation E, one line per feature.
model = linear_model.LinearRegression().fit(dfX_train, dfy_train)
print("\n<Linear regression equation E - Training Data Set>")
for feature_name, coefficient in zip(dfX.columns, model.coef_):
    print(feature_name, ": E = a(", "%0.5f" % float(coefficient), ")+",
          "%0.5f" % float(model.intercept_))

y_predict = model.predict(dfX_test)

# NOTE(review): the value printed under the "RSS" label is the mean squared
# error (in scaled target units), not a residual sum of squares.
print("\nRSS:", mean_squared_error(dfy_test, y_predict))
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels
from scipy.stats import mode
import warnings
from mpl_toolkits.mplot3d import Axes3D

# suppress warning
warnings.simplefilter("ignore")

# load and preprocess data: last column is the label, the rest are features
data = pd.read_csv("./data.csv", sep=';')
x = data.iloc[:, :-1]
y = data.iloc[:, -1].to_numpy()
x = robust_scale(x)

# Hold out one fifth of the rows for testing.
# replace=False is required: np.random.choice samples WITH replacement by
# default, which yields duplicated test rows and a test set smaller than the
# intended 20% (the duplicates collapse when removed from the training set).
data_num = len(y)
select = np.random.choice(data_num, data_num // 5, replace=False)
train_rows = np.delete(np.arange(data_num), select)
train_x = x[train_rows]
train_y = y[train_rows]
test_x = x[select]
test_y = y[select]

# use multiple methods
classifier1 = KNeighborsClassifier()
classifier2 = MLPClassifier()
classifier3 = SVC()
classifier4 = DecisionTreeClassifier()
classifier1.fit(train_x, train_y)
classifier2.fit(train_x, train_y)
classifier3.fit(train_x, train_y)
def robust_scale_data(data, target_cols):
    """Robust-scale the selected columns of *data* in place.

    Args:
        data: table supporting column selection and assignment by
            ``target_cols`` (e.g. a pandas DataFrame).
        target_cols: column label(s) to scale.

    Returns:
        The same ``data`` object, with ``target_cols`` overwritten by their
        ``robust_scale`` output.
    """
    scaled_block = robust_scale(data[target_cols])
    data[target_cols] = scaled_block
    return data
def mergeData(file_path_in, file_path_out, is_regression):
    """Merge sensor (ESDR) and smell-report data and write the datasets to CSV.

    Args:
        file_path_in: sequence of input paths; index 0 is passed to
            ``mergeEsdrData`` and index 1 to ``aggregateSmellData``.
        file_path_out: sequence of six output paths, written in this order:
            dataset, transformed dataset, sample weights, correlations,
            data-point frequencies, bag of words.
        is_regression: when true the smell reports are not binned into
            labels; otherwise they are binned with ``[10]``.

    Returns:
        None. Results are written to the paths in ``file_path_out``.
    """
    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print("Merge data...")

    # Check if directory exist
    for p in file_path_out:
        checkAndCreateDir(p)

    # Merge edsr data
    df_esdr = mergeEsdrData(file_path_in[0])

    # Aggregate ESDR data
    b_hr = 4  # how many hours to look back
    df_esdr = aggregateEsdrData(df_esdr, b_hr)
    idx = df_esdr["EpochTime"].values

    # Aggregate smell data
    b_hr = 4  # how many hours to look back
    f_hr = [-2, 2]  # how many hours to look further
    # bin smell reports into labels or not
    bin_smell = None if is_regression else [10]
    df_smell, df_smell_raw, bow_smell = aggregateSmellData(
        file_path_in[1], idx, b_hr, f_hr, bin_smell, 3, 5)
    df_bow_smell = pd.DataFrame.from_dict(
        bow_smell, orient="index").reset_index()
    df_bow_smell.columns = ["word", "count"]

    # Merge esdr, smell, and tracker data
    df = pd.merge_ordered(df_esdr, df_smell, on="EpochTime",
                          how="outer", fill_method=None)
    # NOTE(review): df_tracker is not assigned anywhere in this function; it
    # must exist at module level or this line raises NameError - confirm.
    df = pd.merge_ordered(df, df_tracker, on="EpochTime",
                          how="outer", fill_method=None)
    df = df.dropna().reset_index(drop=True)

    # Sort by epoch time
    df = df.sort_values("EpochTime")

    # Drop data points before Oct 6th 2016 (the app released date)
    df = df[df["EpochTime"] >= 1475726400].reset_index(drop=True)

    # Compute columns of days of the week and hours of the day
    df_datetime = pd.to_datetime(df["EpochTime"], unit="s")
    df_hd = df_datetime.dt.hour
    df_dw = df_datetime.dt.dayofweek

    # Compute sample weights
    df_w, df_freq = computeSampleWeights(df_smell_raw, df_hd, df_dw)

    # Drop the epochtime column
    df.drop("EpochTime", axis=1, inplace=True)

    # Prevent extreme small values (this also zeroes any negative entries)
    df[df < 1e-6] = 0
    df_w[df_w < 1e-6] = 0

    # Transformed data points; the report count is kept in its raw form
    df_tran = pd.DataFrame(preprocessing.robust_scale(df), columns=df.columns)
    df_tran = df_tran.round(6)
    df_tran["NumberOfSmellReports"] = df["NumberOfSmellReports"]

    # Add days of week and hours of day
    df["DayOfWeek"] = df_dw
    df["HourOfDay"] = df_hd
    df_tran["DayOfWeek"] = df_dw
    df_tran["HourOfDay"] = df_hd

    # Write dataframe into a csv file
    df.to_csv(file_path_out[0])
    df_tran.to_csv(file_path_out[1])
    df_w.to_csv(file_path_out[2])
    df.corr().to_csv(file_path_out[3])
    df_freq.to_csv(file_path_out[4])
    df_bow_smell.to_csv(file_path_out[5])
    print("Dataset created at " + file_path_out[0])
    print("Transformed dataset created at " + file_path_out[1])
    print("Sample weights created at " + file_path_out[2])
    print("Original correlations created at " + file_path_out[3])
    print("Frequency of data points created at " + file_path_out[4])
    print("Bag of words for smell description created at " + file_path_out[5])
import os
import sys
import json
import csv
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
# `cluster` and `pd` are used below but were never imported (NameError).
from sklearn import cluster
import numpy as np
import pandas as pd

# Usage: argument [path_to_csv]


def kmeans(X):
    """Fit a 15-cluster KMeans on *X* and print every tenth cluster label."""
    k_means = cluster.KMeans(15)
    k_means.fit(X)
    print(k_means.labels_[::10])


if __name__ == '__main__':
    # data = np.genfromtxt(sys.argv[1], delimiter=',')
    # Missing cells become the string '0' before the rows are vectorised.
    data = pd.read_csv(sys.argv[1]).fillna('0')
    data = data.to_dict(orient='records')
    vec = DictVectorizer()
    featureMatrix = np.array(vec.fit_transform(data).toarray())
    X_scaled = preprocessing.robust_scale(featureMatrix)
    # Single-argument print(...) produces the same text on Python 2 and 3
    # (the two spaces reproduce the separator of the old print statement).
    print("\nDimensions of feature matrix:  %s" % (X_scaled.shape,))
    for i in X_scaled:
        print(i)
    kmeans(X_scaled)
    # ocsvm(X_scaled)
def xform_data(self, df):
    """Turn the raw price table into scaled, "relative" state vectors.

    Some special handling of the price data. First, we don't want prices to
    be absolute, since we want the agent to learn actions _relative_ to
    states; that is, states need to be transformed into "relative" somehow.
    This is called "stationary time series"; they fluctuate around y=0, like
    visualizing audio rather than a line graph. Next, we don't want absolute
    price changes, since that's still not relative enough (prices change in
    larger amounts when the BTC price is already large - we want to learn the
    pattern, not the numbers). So the solution is percent-changes.

    Now - making everything a percent-change from its past makes it so you
    can track that field's history, but you lose how it relates to the other
    fields in its cross-section. So here's what we do. Anchor all the price
    fields to the target (close-price); so they're relative w/i the
    cross-section. Then set target to its percent-change over time. Leave the
    volume stuff alone, we _do_ want that absolute. Then scale everything.
    Crazy, I know; but IMO makes sense. Hit me if you have a better idea.

    Args:
        df: table holding one ``<table name>_<col>`` column per configured
            table column, including the ``data.target`` column.

    Returns:
        ``(states, prices)``: the robust-scaled (optionally auto-encoded)
        state matrix and the raw target prices, both with the indicator
        warm-up rows removed when indicators are enabled.
    """
    columns = []
    ind_ct = self.hypers.indicators_count
    tables_ = data.get_tables(self.hypers.arbitrage)
    for table in tables_:
        table_name = table["name"]
        for col in table['cols']:
            name_col = f'{table_name}_{col}'
            if name_col == data.target:
                columns.append(self.diff(df[name_col], True))
            elif col in table['price_cols']:
                columns.append(df[name_col] / df[data.target])
            else:
                columns.append(df[name_col])

        # Add extra indicator columns
        ohlcv = table.get('ohlcv', {})
        if ohlcv and ind_ct:
            ind = pd.DataFrame()
            # TA-Lib requires specifically-named columns (OHLCV).
            # The column prefix is this table's name; the previous code used
            # a bare `name` that is not defined in this method.
            for k, v in ohlcv.items():
                ind[k] = df[f"{table_name}_{v}"]

            # Sort these by effectiveness. I'm no expert, so if this seems off
            # please submit a PR! Later after you've optimized the other
            # hypers, come back here and create a hyper for every indicator
            # you want to try (zoom in on indicators)
            best_indicators = [
                tlib.MOM,
                tlib.SMA,
                # tlib.BBANDS,  # TODO signature different; special handling
                tlib.RSI,
                tlib.EMA,
                tlib.ATR
            ]
            for i in range(ind_ct):
                columns.append(best_indicators[i](
                    ind, timeperiod=self.hypers.indicators_window)
                    / df[data.target])

    states = np.column_stack(columns)
    prices = df[data.target].values

    # Remove padding at the start of all data. Indicators are aggregate fns,
    # so don't count until we have that much historical data
    if ind_ct:
        states = states[self.hypers.indicators_window:]
        prices = prices[self.hypers.indicators_window:]

    # Pre-scale all price actions up-front, since they don't change. We'll
    # scale changing values real-time elsewhere
    states = preprocessing.robust_scale(states, quantile_range=(1., 99.))

    # Reducing the dimensionality of our states (OHLCV + indicators +
    # arbitrage => 5 or 6 weights) because TensorForce's memory branch changed
    # Policy Gradient models' batching from timesteps to episodes. This takes
    # up way too much GPU RAM for us, so we had to cut back in quite a few
    # areas (num steps to train per episode, episode batch_size, and
    # especially states).
    if self.cli_args.autoencode:
        ae = AutoEncoder()
        states = ae.fit_transform_tied(states)

    return states, prices
def standardize(train_X, valid_X, test_X, scaling=1, robust=0):
    """Standardize the train/validation/test matrices.

    Standardization can be performed per column (input variables) or per row
    (input cases). Use per-column scaling when features live on different
    scales; use per-row scaling when features share a scale but individual
    cases carry extraneous variation (exposure, volume, dynamics, ...).
    See http://www.faqs.org/faqs/ai-faq/neural-nets/part2/section-16.html

    Args:
        train_X, valid_X, test_X: the three data matrices.
        scaling: ``0`` scales input variables (axis 0) with a scaler fitted
            on ``train_X`` and re-used for the other two sets; ``1`` scales
            input cases (axis 1), each set independently; any other value
            (e.g. ``-1``) returns the inputs untouched.
        robust: ``1`` selects the robust (median / quantile range) variants,
            anything else the mean / standard-deviation variants.

    Returns:
        Tuple ``(train_X, valid_X, test_X)``.
    """
    if scaling == 0:
        # Column-wise: statistics come from the training set only.
        if robust == 1:
            scaler_type = preprocessing.RobustScaler
        else:
            scaler_type = preprocessing.StandardScaler
        fitted = scaler_type().fit(train_X)
        train_X, valid_X, test_X = (
            fitted.transform(part) for part in (train_X, valid_X, test_X))
    elif scaling == 1:
        # Row-wise: every case is scaled by its own statistics, so the three
        # sets are processed independently.
        if robust == 1:
            scale_cases = preprocessing.robust_scale
        else:
            scale_cases = preprocessing.scale
        train_X, valid_X, test_X = (
            scale_cases(part, axis=1) for part in (train_X, valid_X, test_X))
    return train_X, valid_X, test_X
def normalize(self, X):
    """Return *X* robust-scaled with ``preprocessing.robust_scale`` defaults."""
    scaled = preprocessing.robust_scale(X)
    return scaled
def standardize(train_X, valid_X, test_X, scaling=1, robust=0):
    """Standardize the train/validation/test matrices.

    Standardization can be performed per column (input variables) or per row
    (input cases). Use per-column scaling when features live on different
    scales; use per-row scaling when features share a scale but individual
    cases carry extraneous variation (exposure, volume, dynamics, ...).
    See http://www.faqs.org/faqs/ai-faq/neural-nets/part2/section-16.html

    Args:
        train_X, valid_X, test_X: the three data matrices.
        scaling: ``0`` scales input variables (axis 0) with a scaler fitted
            on ``train_X`` and re-used for the other two sets; ``1`` scales
            input cases (axis 1), each set independently; any other value
            (e.g. ``-1``) returns the inputs untouched.
        robust: ``1`` selects the robust (median / quantile range) variants,
            anything else the mean / standard-deviation variants.

    Returns:
        Tuple ``(train_X, valid_X, test_X)``.
    """
    if scaling == 0:
        # Column-wise: statistics come from the training set only.
        if robust == 1:
            scaler_type = preprocessing.RobustScaler
        else:
            scaler_type = preprocessing.StandardScaler
        fitted = scaler_type().fit(train_X)
        train_X, valid_X, test_X = (
            fitted.transform(part) for part in (train_X, valid_X, test_X))
    elif scaling == 1:
        # Row-wise: every case is scaled by its own statistics, so the three
        # sets are processed independently.
        if robust == 1:
            scale_cases = preprocessing.robust_scale
        else:
            scale_cases = preprocessing.scale
        train_X, valid_X, test_X = (
            scale_cases(part, axis=1) for part in (train_X, valid_X, test_X))
    return train_X, valid_X, test_X
print(X.shape)
print(X2.shape)
raw_data, raw_target = X2, y
print(raw_data.shape)

# In[13]:

train, test, train_t, test_t = train_test_split(
    X2, y, test_size=0.3, random_state=random_state, stratify=y)

# Scale both splits with statistics learned on the training split only.
# Previously only `train` was scaled, so the model was evaluated on test
# features in a different scale from the one it was trained on; scaling
# `test` on its own would instead use test-set statistics.
robust_scaler = preprocessing.RobustScaler(
    with_centering=True, with_scaling=True,
    quantile_range=(25.0, 75.0), copy=True).fit(train)
train = robust_scaler.transform(train)
test = robust_scaler.transform(test)

# In[14]:

learning_rate = 0.1
#tun_range1 = {'n_estimators':range(20,81,10)}
plt.xlabel("Y Test") plt.ylabel("Y Predicted") plt.xlim(lim_lower, lim_upper) plt.ylim(lim_lower, lim_upper) plt.plot([lim_lower, lim_upper], [lim_lower, lim_upper], ls="--") plt.savefig("%s.svg" % (name), format='svg', dpi=300) plt.clf() # READ DATA SET df = pd.read_csv("drug_descriptors.txt") names = [i for i in df.columns[5:]] #print names df[names] = robust_scale(df[names]) #print df.head(5) X = df.loc[:, names] # ANALYZE DESCRIPTOR CORRELATIONS corr_X = np.corrcoef(X) of = open("corr.csv", "w") of.write(", ") for i in range(len(names)): of.write("%20s, " % (names[i])) of.write("\n") for i in range(len(names)): of.write("%20s, " % (names[i])) for j in range(len(names)): of.write("%8.5f, " % (corr_X[i][j])) of.write("\n")
# page 253: data scaling transform (minmax_scale)
# minmax_scale(): rescale the data using each column's minimum and maximum
ds_minmax_scale = pd.DataFrame(minmax_scale(fit_numeric),
                               columns=fit_numeric.columns)
ds_minmax_scale.head()

# summary statistics
ds_minmax_scale_describe = ds_minmax_scale.describe()
ds_minmax_scale_describe.round(3)

# In[107]:

# page 254
# robust_scale(): data transform function
ds_robust_scale = pd.DataFrame(robust_scale(fit_numeric),
                               columns=fit_numeric.columns)
ds_robust_scale.head()

# summary statistics
ds_robust_scale_describe = ds_robust_scale.describe()
ds_robust_scale_describe.round(3)

# In[108]:

# page 255
# Compare the Scale, Robust and MinMax transforms on the RSTPULSE column
ds_rstpulse = pd.DataFrame()
ds_rstpulse["Raw"] = ds_fitness["RSTPULSE"]
ds_rstpulse["Scale"] = ds_scale["RSTPULSE"]
ds_rstpulse["Robust"] = ds_robust_scale["RSTPULSE"]
def __init__(self, ep_len=5000, window=300, arbitrage=False, indicators=None):
    """Load the exchange CSV(s) and build the scaled episode table.

    Args:
        ep_len: episode length; also used as the episode stride (disjoint
            episodes).
        window: stored on the instance as ``self.window``.
        arbitrage: stored on the instance as ``self.arbitrage``.
        indicators: optional dict stored as ``self.indicators``. ``None``
            (the default) gives every instance its own empty dict; a literal
            ``{}`` default would be one dict shared by all instances.

    Side effects:
        Reads the CSV files under ``bitcoin-historical-data`` next to this
        module and stores the resulting table in ``self.df``.
    """
    self.ep_len = ep_len
    self.window = window
    self.arbitrage = arbitrage
    self.indicators = {} if indicators is None else indicators
    self.ep_stride = ep_len  # disjoint
    # self.ep_stride = 100  # overlap; shift each episode by x seconds.
    # TODO overlapping stride would cause test/train overlap. Tweak it so
    # train can overlap data, but test gets silo'd

    col_renames = {
        'Timestamp': 'timestamp',
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'close',
        'Volume_(BTC)': 'volume_btc',
        'Volume_(Currency)': 'volume',
        'Weighted_Price': 'vwap'
    }

    filenames = {
        # 'bitstamp': 'bitstampUSD_1-min_data_2012-01-01_to_2018-06-27.csv',
        'coinbase': 'coinbaseUSD_1-min_data_2014-12-01_to_2018-06-27.csv',
        # 'coincheck': 'coincheckJPY_1-min_data_2014-10-31_to_2018-06-27.csv'
    }
    primary_table = 'coinbase'
    self.target = f"{primary_table}_close"

    # Load each table, prefix its columns with the table name, index it by
    # timestamp and join everything into one frame.
    df = None
    for table, filename in filenames.items():
        df_ = pd.read_csv(
            path.join(path.dirname(__file__), 'bitcoin-historical-data',
                      filename))
        col_renames_ = {k: f"{table}_{v}" for k, v in col_renames.items()}
        df_ = df_.rename(columns=col_renames_)
        ts = f"{table}_timestamp"
        df_[ts] = pd.to_datetime(df_[ts], unit='s')
        df_ = df_.set_index(ts)
        df = df_ if df is None else df.join(df_)

    # too quiet before 2015, time waste. copy() to avoid pandas errors
    df = df.loc['2015':].copy()

    df['month'] = df.index.month
    df['day'] = df.index.day
    df['hour'] = df.index.hour

    # TODO drop null rows? (inner join?)
    # TODO arbitrage
    # TODO indicators

    # Replace prices/volumes by their percent change; infinities (division by
    # a zero previous value) become NaN and are forward-filled.
    diff_cols = [
        f"{table}_{k}"
        for k in 'open high low close volume_btc volume vwap'.split(' ')
        for table in filenames.keys()
    ]
    df[diff_cols] = df[diff_cols].pct_change()\
        .replace([np.inf, -np.inf], np.nan)\
        .ffill()  # .bfill()?
    # The first row has no previous value to diff against.
    df = df.iloc[1:]

    # don't scale price changes; we use that in raw form later
    target = df[self.target]
    df = pd.DataFrame(robust_scale(df.values, quantile_range=(.1, 100 - .1)),
                      columns=df.columns, index=df.index)
    df[self.target] = target

    df['cash'], df['value'] = 0., 0.
    self.df = df
def ert_by_hour_by_auction(df, evaluate_var):
    """Engineer features from the hourly price table and, per auction date,
    fit two gradient-boosting models and report/plot their predictions.

    The function builds a date column, one-hot encodes year/month/weekday,
    adds lagged copies of the price and of several explanatory variables,
    splits the rows by ``evaluate_var`` (0 = "normal", 1 = "anormal"),
    robust-scales both subsets, and then loops over the auction dates to fit
    ``GradientBoostingRegressor`` models, print MSE / R2 and show plots.

    Args:
        df: pandas DataFrame with the columns referenced below. Several
            columns are deleted from this very object before ``df`` is first
            rebound, so the caller's frame is modified.
        evaluate_var: name of the dummy column to keep; it must be one of
            the names in ``dummy_important`` (``list.remove`` raises
            ``ValueError`` otherwise).

    Returns:
        None (no return statement); output is printed and plotted.

    NOTE(review): the original indentation of this function was lost. The
    extent of the final ``for auction_date`` loop body is an assumption -
    confirm against the original file.
    """
    # Log of prices
    # df['PESPANIA'] = np.log(df['PESPANIA'])
    # df['PPORTUGAL'] = np.log(df['PPORTUGAL'])
    del df['PPORTUGAL']
    # Build a datetime column from the year / month / day columns.
    df['FECHA'] = df['ANIO'].map(str) + '-' + df['MES'].map(
        str) + '-' + df['DIA'].map(str)
    df['FECHA'] = pd.to_datetime(df['FECHA'], format='%Y-%m-%d')
    df['WEEKDAY'] = df['FECHA'].dt.dayofweek
    # df['DUMMY_2010_REGIMEN'] = pd.Series(0, index=df.index)
    # df.loc[df['FECHA'] >= '2010-01-01', 'DUMMY_2010_REGIMEN'] = 1
    # df = df[df['FECHA'] >= '2010-01-01']
    df['ANIO'] = df['ANIO'].map(int)
    df['MES'] = df['MES'].map(int)
    df['DIA'] = df['DIA'].map(int)
    # df = df.groupby(['FECHA']).mean().reset_index()
    del df['HORA']
    del df['FECHA_HORA']
    del df['DIA']
    # TARGET VARIABLE
    # Keep only the dummy named by evaluate_var; drop the other four.
    dummy_important = [
        'DUMMY_5_DAY', 'DUMMY_10_DAY', 'DUMMY_15_DAY', 'DUMMY_20_DAY',
        'DUMMY_30_DAY'
    ]
    dummy_important.remove(evaluate_var)
    for i in dummy_important:
        del df[i]
    # DIFFERENTIATE
    # Difference PESPANIA (disabled: the string below is never executed)
    ''' df['PESPANIA'] = df['PESPANIA'] - df['PESPANIA'].shift(1) df = df.dropna(axis=0) # DIFERENCIA RESTO need_differenciation = ['TOTAL_PRODUCCION_POR', 'TOTAL_DEMANDA_POR', 'CICLO_COMBINADO', 'FUEL_PRIMA', 'PRICE_OIL', 'PRICE_GAS', 'RISK_PREMIUM', 'TME_MADRID', 'TMAX_MADRID', 'TME_BCN', 'TMAX_BCN', 'TMIN_BCN', 'GDP'] for i in need_differenciation: name = 'D_' + str(i) df[name] = df[i] - df[i].shift(1) del df[i] df = df.dropna() '''
    # DUMMIES
    # One-hot encode year, month and weekday, then drop the source columns.
    dummy_var = ['ANIO', 'MES', 'WEEKDAY']
    for i in dummy_var:
        name = str(i)
        dummy = pd.get_dummies(df[i], prefix=name)
        df = pd.concat([df, dummy], axis=1)
        del dummy
        del df[i]
    # LAGS
    # Autoregressive lags 1..24 of the price itself.
    lag_AR = 24
    for i in range(1, lag_AR + 1, 1):
        name = 'PESPANIA_lag_' + str(i)
        df[name] = df['PESPANIA'].shift(i)
    lag_number = 24
    lag_variables = [
        'TOTAL_IMPORTACION_ES', 'TOTAL_PRODUCCION_ES', 'TOTAL_DEMANDA_NAC_ES',
        'TOTAL_EXPORTACIONES_ES', 'TOTAL_DDA_ES', 'TOTAL_POT_IND_ES',
        'HIDRAULICA_CONVENC', 'HIDRAULICA_BOMBEO', 'NUCLEAR',
        'CARBON NACIONAL', 'CARBON_IMPO', 'CICLO_COMBINADO', 'FUEL_SIN_PRIMA',
        'FUEL_PRIMA', 'REG_ESPECIAL', 'PRICE_OIL', 'PRICE_GAS', 'RISK_PREMIUM'
    ]
    # NOTE(review): this range stops at lag_number - 1 (lags 1..23) while the
    # other lag loops run to lag_number inclusive - confirm this is intended.
    for i in range(1, lag_number, 1):
        for j in lag_variables:
            name = str(j) + '_lag_' + str(i)
            df[name] = df[j].shift(i)
    lag_number = 24
    # Weather variables, lags 1..24.
    climaticas = [
        'TME_MADRID', 'TMAX_MADRID', 'TMIN_MADRID', 'PP_MADRID', 'TME_BCN',
        'TMAX_BCN', 'TMIN_BCN', 'PP_BCN'
    ]
    for i in range(1, lag_number + 1, 1):
        for j in climaticas:
            name = str(j) + '_lag_' + str(i)
            df[name] = df[j].shift(i)
    lag_number = 24
    # Portuguese demand / production, lags 1..24.
    portugal = ['TOTAL_DEMANDA_POR', 'TOTAL_PRODUCCION_POR']
    for i in range(1, lag_number + 1, 1):
        for j in portugal:
            name = str(j) + '_lag_' + str(i)
            df[name] = df[j].shift(i)
    # Shifting leaves NaN in the first rows; drop every incomplete row.
    df = df.dropna(how='any', axis=0)
    # Split rows by the evaluated dummy and drop it from both subsets.
    normal = df[df[evaluate_var] == 0]
    anormal = df[df[evaluate_var] == 1]
    del normal[evaluate_var]
    del anormal[evaluate_var]
    # NORMALIZE
    # Robust-scale every column except the date, then re-attach the date.
    # NOTE(review): the scaled frame gets a fresh 0..n-1 index while
    # normal_date keeps the original row labels, so the axis=1 concat aligns
    # on different indexes; and DataFrame(frame, columns=[...]) selects by
    # label rather than renaming - verify the resulting frames.
    column_names = normal.columns.values.tolist()
    normal_date = normal[['FECHA']]
    normal = preprocessing.robust_scale(normal.drop('FECHA', axis=1).values)
    normal = pd.DataFrame(normal)
    normal = pd.concat([normal, normal_date], axis=1)
    normal = pd.DataFrame(normal, columns=[column_names])
    column_names = anormal.columns.values.tolist()
    anormal_date = anormal[['FECHA']]
    anormal = preprocessing.robust_scale(anormal.drop('FECHA', axis=1).values)
    anormal = pd.DataFrame(anormal)
    anormal = pd.concat([anormal, anormal_date], axis=1)
    anormal = pd.DataFrame(anormal, columns=[column_names])
    total_values = len(df.index)
    print('total rows ', total_values)
    anormal_values = len(anormal.index)
    print('anormal rows ', anormal_values)
    proportion = anormal_values / total_values
    print('proportion of anormal ', proportion)
    # normalX / anormalX are the same objects as normal / anormal, so the
    # del statements remove PESPANIA from those frames too.
    normalY = normal[['PESPANIA']]
    normalX = normal
    del normalX['PESPANIA']
    anormalY = anormal[['PESPANIA']]
    anormalX = anormal
    del anormalX['PESPANIA']
    names = normalX.columns.values
    fileNames = np.array(names)
    # Take train/test only from the normal set, with a test share equal to
    # the proportion of abnormal rows.
    X_train, X_test, y_train, y_test = train_test_split(normalX, normalY, test_size=proportion, random_state=42)
    # NOTE(review): no 'DUMMY' column is created in this function; it has to
    # be present in the input frame - confirm.
    auction_date = df.FECHA[df['DUMMY'] == 1].tolist()
    del df['DUMMY']
    days_before = 30
    # The loop variable reuses the name of the list it iterates over.
    for auction_date in auction_date:
        # Actual prices in the days_before days up to the auction.
        anormalY = df[df['FECHA'] >= auction_date - datetime.timedelta(days=days_before)]
        anormalY = anormalY[anormalY['FECHA'] <= auction_date]
        anormalY = anormalY[['PESPANIA']]
        # Training data for the first model: everything before that window.
        df_before_auction = df[df['FECHA'] < auction_date - datetime.timedelta(days=days_before)]
        df_before_auction_Y = df_before_auction[['PESPANIA']]
        df_before_auction_X = df_before_auction.drop(['FECHA', 'PESPANIA'], axis=1)
        # Rows from the auction date up to days_before days after it; their
        # features (future_dates) are what both models predict on.
        normalY = df[df['FECHA'] >= auction_date]
        normalY = normalY[normalY['FECHA'] <= auction_date + datetime.timedelta(days=days_before)]
        future_dates = normalY.drop(['FECHA', 'PESPANIA'], axis=1)
        normalY = normalY[['PESPANIA']]
        # Training data for the second model: everything up to the auction.
        df_before_auction1 = df[df['FECHA'] <= auction_date]
        df_before_auction_Y_1 = df_before_auction1[['PESPANIA']]
        df_before_auction_X_1 = df_before_auction1.drop(['FECHA', 'PESPANIA'], axis=1)
        # Leaf / split sizes are derived from the first training set's size.
        min_samples_leaf = round(len(df_before_auction.index) * 0.005)
        print('min_samples_leaf ', min_samples_leaf)
        min_samples_split = min_samples_leaf * 10
        print('min_samples_split ', min_samples_split)
        iTrees = 100
        print('iTrees ', iTrees)
        depth = 50
        # maxFeat is only printed; it is not passed to the models below.
        maxFeat = (round((len(df.columns) / 3)))
        print('Feature Set ', maxFeat)
        fileModel = ensemble.GradientBoostingRegressor(
            learning_rate=0.01,
            n_estimators=iTrees,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_depth=depth,
            verbose=1)
        fileModel.fit(df_before_auction_X, df_before_auction_Y)
        fileModel2 = ensemble.GradientBoostingRegressor(
            learning_rate=0.01,
            n_estimators=iTrees,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_depth=depth,
            verbose=1)
        fileModel2.fit(df_before_auction_X_1, df_before_auction_Y_1)
        # Predict the following days for the abnormal case
        y_hat = fileModel.predict(future_dates)
        y_hat = pd.DataFrame(y_hat, columns=['yhat'])
        prediction_anormal = y_hat
        anormalY = anormalY.reset_index(drop=True)
        # Predict the following days for the normal case
        y_hat = fileModel2.predict(future_dates)
        y_hat = pd.DataFrame(y_hat, columns=['yhat'])
        prediction_normal = y_hat
        y_test = normalY.reset_index(drop=True)
        print('MSE ANORMAL ', mean_squared_error(anormalY, prediction_anormal))
        print('R2 ANORMAL ', r2_score(anormalY, prediction_anormal))
        # Compare with the true value
        print('MSE NORMAL ', mean_squared_error(y_test, prediction_normal))
        print('R2 NORMAL ', r2_score(y_test, prediction_normal))
        # Side-by-side table of real vs predicted prices (normal case) and
        # their relative difference.
        prediction_normal = pd.DataFrame(prediction_normal, index=y_test.index)
        prediction_normal = pd.concat([y_test, prediction_normal], axis=1)
        prediction_normal.columns = [
            'PESPANIA_REAL_NO_COLUSION', 'PESPANIA_PRED_NO_COLUSION'
        ]
        prediction_normal['DIF_PORC'] = (
            prediction_normal['PESPANIA_REAL_NO_COLUSION'] -
            prediction_normal['PESPANIA_PRED_NO_COLUSION']
        ) / prediction_normal['PESPANIA_PRED_NO_COLUSION']
        print('PRECIO PROMEDIO PREDICHO - NO COLUSION %.5f' % prediction_normal['PESPANIA_PRED_NO_COLUSION'].mean())
        print('PRECIO PROMEDIO REAL - NO COLUSION %.5f ' % prediction_normal['PESPANIA_REAL_NO_COLUSION'].mean())
        print('DIFERENCIA PROMEDIO PORCENTUAL (REAL/PRED -1)', prediction_normal['DIF_PORC'].mean() * 100, '%')
        # Same table for the abnormal case.
        prediction_anormal = pd.DataFrame(prediction_anormal, index=anormalY.index)
        prediction_anormal = pd.concat([anormalY, prediction_anormal], axis=1)
        prediction_anormal.columns = [
            'PESPANIA_REAL_COLUSION', 'PESPANIA_PRED_COLUSION'
        ]
        prediction_anormal['DIF_PORC'] = (
            prediction_anormal['PESPANIA_REAL_COLUSION'] -
            prediction_anormal['PESPANIA_PRED_COLUSION']
        ) / prediction_anormal['PESPANIA_PRED_COLUSION']
        print('PRECIO PROMEDIO PREDICHO - COLUSION %.5f' % prediction_anormal['PESPANIA_PRED_COLUSION'].mean())
        print('PRECIO PROMEDIO REAL - COLUSION %.5f' % prediction_anormal['PESPANIA_REAL_COLUSION'].mean())
        print('DIFERENCIA PROMEDIO PORCENTUAL (REAL/PRED -1)', prediction_anormal['DIF_PORC'].mean() * 100, '%')
        # prediction_anormal.to_csv('prediction_anormal_dia.csv', sep=';', index=False)
        # Plot predicted vs real prices against the row index (abnormal case).
        fig, ax = plot.subplots()
        prediction_anormal = prediction_anormal.reset_index()
        sns.regplot(y='PESPANIA_PRED_COLUSION', x='index', data=prediction_anormal, ax=ax, label='PREDICTED')
        sns.regplot(y='PESPANIA_REAL_COLUSION', x='index', data=prediction_anormal, ax=ax, label='REAL')
        # diag_line, = ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3", label='perfect prediction')
        plot.legend(loc='best')
        plot.title('Differences between Prices')
        plot.show()
        # Same plot for the normal case.
        fig, ax = plot.subplots()
        prediction_normal = prediction_normal.reset_index()
        sns.regplot(y='PESPANIA_PRED_NO_COLUSION', x='index', data=prediction_normal, ax=ax, label='PREDICTED')
        sns.regplot(y='PESPANIA_REAL_NO_COLUSION', x='index', data=prediction_normal, ax=ax, label='REAL')
        # diag_line, = ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3", label='perfect prediction')
        plot.legend(loc='best')
        plot.title('Differences between Prices')
        plot.show()
def svmModel(self, training_filename, test_filename):
    """Train an SVM on a CSV file, persist it, and report its self-accuracy.

    The first line of the file declares each column's kind (``num`` for a
    numeric feature, ``class`` for the label); every following line is one
    sample. The classifier is fitted, dumped to ``modelo.joblib``, reloaded
    and evaluated on the same samples it was trained on.

    Args:
        training_filename: currently unused.
            NOTE(review): the data is read from ``test_filename`` - confirm
            which file was meant to be the training source.
        test_filename: path of the comma-separated file to load.

    Returns:
        JsonResponse with the predicted labels, the original labels and the
        counts of correctly / incorrectly classified samples.
    """
    # Load the file; the context manager closes it even if reading fails.
    with open(test_filename) as data_file:
        lines = data_file.readlines()

    # Data containers: feature rows and their labels.
    matriz = []
    classify = []

    # Column kinds come from the comma-separated first row.
    column_kinds = lines[0].strip().split(',')

    # Every remaining line is one sample.
    for line in lines[1:]:
        vector = []
        for i, field in enumerate(line.split(',')):
            if column_kinds[i] == 'num':
                vector.append(float(field))
            elif column_kinds[i] == 'class':
                # Drop the last two characters of the field and store the
                # label as a list of its characters.
                classify.append(list(field[:-2]))
        matriz.append(vector)

    # Preprocessing.
    # NOTE(review): none of these three results is used - the classifier
    # below is fitted on the raw `matriz`. Confirm whether one of them was
    # meant to be the training input.
    Scaled = preprocessing.scale(matriz)
    Normalized = preprocessing.normalize(matriz, norm='l2')
    robustScaled = preprocessing.robust_scale(matriz,
                                              axis=0,
                                              with_centering=True,
                                              with_scaling=True,
                                              quantile_range=(25.0, 75.0),
                                              copy=True)
    # return JsonResponse({"Prueba": list(robustScaled)})

    # Build the model with the chosen classifier and data.
    clf = svm.SVC(gamma=0.5, C=100.)
    clf.fit(matriz, classify)

    # Persist the model, then reload it to classify the data.
    joblib.dump(clf, 'modelo.joblib')
    clf2 = joblib.load('modelo.joblib')

    # Classify the same samples.
    res = clf2.predict(matriz)

    # Convert each prediction to a list so it compares with `classify`.
    Solu = [list(label) for label in res]

    # Compare predictions with the labels used to build the model.
    numCorrect = 0.0
    numIncorrect = 0.0
    for predicted, expected in zip(Solu, classify):
        if predicted == expected:
            numCorrect += 1
        else:
            numIncorrect += 1

    # Return the results as JSON.
    return JsonResponse({
        "Clasificado": list(res),
        "Original": classify,
        "Correctamente clasificados": numCorrect,
        "Mal clasificados": numIncorrect
    })
path3 = os.path.join(current_path, 'dataset/prostate_all_samples_trees.csv')

# Read data sets
#d1 = filter_dataset(Dataset(path1, scale=False, normalize=False, sep=','), 0.25, fdr=True)
#d2 = filter_dataset(Dataset(path2, scale=False, normalize=False, sep=','), 0.10, fdr=false)
d3 = filter_dataset(Dataset(path3, scale=False, normalize=False, sep=','), 0.25, fdr=True)

# Find what is above Mean in each data set...
# m1 = d1.matrix
# m1 = robust_scale(m1)
# m1[m1 < -0.33] = np.nan
# m1[m1 > 0.33] = np.nan
# m1 = m1 + 10

# Robust-scale the matrix and blank out (NaN) every entry that lies strictly
# inside the (-0.50, 0.50) band, keeping only the values outside it.
m3 = d3.matrix
m3 = robust_scale(m3)
i1 = m3 > -0.50
i2 = m3 < 0.50
r = i1 & i2
m3[r] = np.nan

# Join the data set into one matrix
# df1 = DataFrame(m1, index=d1.samples, columns=d1.genes)
#df2 = DataFrame(m2, index=d2.samples, columns=d2.genes)
df3 = DataFrame(m3, index=d3.samples, columns=d3.genes)

# DataFrame.append was removed in pandas 2.0; concat stacks the frames the
# same way. Add df1 / df2 to the list to include them again.
from pandas import concat

result = concat([df3])
INPUT: Dataframe and column names in list OUTPUT: Dataframe with features (X) and target variable dataframe (y) ''' X = df.loc[:,cols] y = df.loc[:, 'current_market_value'] return X,y def scale(x,y): ''' INPUT: Dataframe with features (X) and target variable dataframe (y) OUTPUT: Scaled dataframes ''' X = preprocessing.robust_scale(x) y = preprocessing.robust_scale(y) return X,y def run_linear_models(x,y): ''' Get an overview of performances of different linear models. Linear models: Linear Regression, Ridge, 3x Lasso, ElasticNet INPUT: Dataframe with features (X) and target variable dataframe (y) OUTPUT: Scores and feature importances of each model ''' X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) lr = LinearRegression()
def scale(X):
    """Return the values of ``X`` after sklearn's ``robust_scale``.

    ``X`` must expose a ``.values`` attribute (for example a pandas object);
    those values are copied into a NumPy array before scaling, and the scaled
    array is returned.
    """
    return preprocessing.robust_scale(np.array(X.values))
def rf_reg_grid_search(df, features, label, param_grid, rand_state, scores,
                       name):
    """Run a random forest regression grid search and save its results.

    For every entry in ``scores`` a 5-fold ``GridSearchCV`` over
    ``param_grid`` is fitted on the training split, the grid results are
    printed and written to ``'RF_GS_<name>_<score>.hdf5'``, and the refitted
    estimator is evaluated on the held-out split.

    Parameters:
        df : pandas dataframe
            The dataframe containing the features and the label for the
            regression.

        features : list of strings
            List of features

        label : string
            The label for the regression

        param_grid : dictionary-like structure
            Parameter grid of input parameters for the grid search

        rand_state : integer
            Random state used for the train/test split and for the random
            forest, to ensure reproducibility

        scores : list of strings
            Scoring names by which the grid search is evaluated (one full
            grid search is run per entry)

        name : string
            Name fragment of the output file that stores the grid search
            results
    """
    X, y = sets.build_matrices(df, features, label)

    # Standardizing the data
    X = preprocessing.robust_scale(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=rand_state)

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    # The double space reproduces the output of the former
    # ``print "label: ", value`` statements.
    print("Training sample size:  %s" % (X_train.shape,))
    print("Evaluation sample size:  %s" % (X_test.shape,))

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        # print("") instead of print(): on Python 2 the latter prints "()".
        print("")

        reg = GridSearchCV(RandomForestRegressor(random_state=rand_state),
                           param_grid, scoring=score, cv=5, n_jobs=4)
        reg.fit(X_train, y_train)

        print("Best parameters set found on training set:")
        print("")
        print(reg.best_params_)
        print("")
        print("Grid scores on training set:")
        print("")
        means = reg.cv_results_['mean_test_score']
        stds = reg.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, reg.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print("")

        # Use a separate name so the ``df`` argument is not shadowed.
        cv_results = pd.DataFrame(reg.cv_results_)
        cv_results.to_hdf('RF_GS_' + name + '_' + score + '.hdf5', 'data')

        print("")
        print("The model is trained on the full development set (80%).")
        print("The scores are computed on the full evaluation set (20%).")
        print("")

        y_pred = reg.predict(X_test)

        ml_an.evaluate_regression(y_test, y_pred)
        pz_an.evaluate_photoz(y_test, y_pred)

        print("")
def RandomForest_Classifier():
    """Train and evaluate a random forest on the lung feature CSV.

    Loads '../Lung_Challenge_Features/Lung_Feature_Results_Final.csv', splits
    the first 500 rows (five consecutive 100-row slices) roughly 50/50 into
    train and test sets, fits a RandomForestClassifier, prints per-category
    error rates for training data, test data and a random baseline, and
    writes 'ResultsLungData.csv' and 'PredictionsLungData.csv'.

    Column 1 of the CSV is used as the label and columns 2 onward as
    features. Takes no arguments and returns nothing.

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source, so the nesting of a few statements (marked below) should be
    confirmed against the original file.
    """
    np.set_printoptions(precision=3, suppress=True)
    data_title = 'LungData'
    Data = np.loadtxt(
        '../Lung_Challenge_Features/Lung_Feature_Results_Final.csv',
        delimiter=",", skiprows=1, dtype="object")

    # Randomly divide the data according to a .5 proportion into train and test
    # In order to get the same random split, make sure that the np.random.seed
    # line is left uncommented.
    # np.random.seed(0)
    TrainData = np.zeros((0, Data.shape[1]))
    TestData = np.zeros((0, Data.shape[1]))
    # Split each 100-row slice separately (rows 0-99, 100-199, ..., 400-499).
    for endpoint in np.arange(100, 600, 100):
        TempData = Data[endpoint - 100:endpoint]
        random_mask = np.random.rand(len(TempData)) < .5
        TrainData = np.vstack((TrainData, TempData[random_mask]))
        TestData = np.vstack((TestData, TempData[~random_mask]))
    np.random.shuffle(TrainData)
    np.random.shuffle(TestData)
    dimsT = np.shape(TestData)
    dims = np.shape(TrainData)
    # Column 1 holds the label, columns 2+ the features.
    yT = TestData[:, 1].astype(float)
    XT = TestData[:, 2:].astype(float)
    y = TrainData[:, 1].astype(float)
    X = TrainData[:, 2:].astype(float)
    # Alternative ordering of the two preprocessing steps (disabled):
    # X = normalize(robust_scale(X, axis=1), axis=1)
    # XT = normalize(robust_scale(XT, axis=1), axis=1)
    # Both steps are applied with axis=1; train and test are processed
    # independently of each other.
    X = robust_scale(normalize(X, axis=1), axis=1)
    XT = robust_scale(normalize(XT, axis=1), axis=1)
    best = [np.inf, np.inf, np.inf]
    # Results: column 0 = category id, column 1 = score of the best run.
    Results = np.zeros((5, 2))
    # Predictions: column 0 = true test labels, column 1 = best predictions.
    Predictions = np.zeros((dimsT[0], 2))
    Predictions[:, 0] = yT
    Results[:, 0] = [1, 2, 3, 4, 5]
    Winner = []
    # Only referenced by the disabled parameter sweep below.
    ParameterTuner = 20
    # for i in range(1,ParameterTuner+1):
    for i in [15]:
        # Some different machine learning options to test..
        # Each entry is [result column index, estimator].
        for k in [[1, RandomForestClassifier(n_estimators=i)]]:
            # for k in [[1, RandomForestClassifier(n_estimators=(int(math.ceil(float(i)/10))), n_jobs=-1)]]:
            # for k in [[1, svm.LinearSVC(C=100,dual=True)]]:
            # for k in [[1, svm.SVC(C=10,kernel="rbf", degree=2)]]:
            # for k in [[1, BaggingRegressor(base_estimator=RandomForestRegressor(n_estimators=10), n_jobs=-1, n_estimators=20)]]:
            # for k in [[1, svm.NuSVC(nu=(.3), kernel="rbf", verbose=True, probability=True, tol=1e-6, decision_function_shape='ovr')]]:
            print 'Parameter Value: ' + str(i)
            clf = k[1].fit(X, y)
            XP = clf.predict(X)
            XTP = clf.predict(XT)
            # Random baseline: labels drawn (with replacement) from the test
            # labels; note its length is the *training* set size.
            RTP = np.random.choice(yT, dims[0])
            TempResults = np.zeros((5, 2))
            for category in range(1, 6):
                print 'Category: ' + str(category)
                # Assess the accuracy on training and testing data.
                # Also compare against a random model.
                # count = misclassified samples, total = samples in category,
                # indexed as [training, testing, random].
                count = [0, 0, 0]
                total = [0, 0, 0]
                actual = [y, yT, yT]
                predicted = [XP, XTP, RTP]
                prediction_labels = ['Training', 'Testing', 'Random']
                # NOTE(review): only the first min(n_train, n_test) samples of
                # each set are scored, so the larger set is evaluated
                # partially -- confirm this is intended.
                for j in range(0, min(dims[0], dimsT[0])):
                    for TestTrainRandom in xrange(3):
                        if actual[TestTrainRandom][j] == category:
                            if str(actual[TestTrainRandom][j]) != str(
                                    predicted[TestTrainRandom][j]):
                                count[TestTrainRandom] += 1
                            total[TestTrainRandom] += 1
                # Skip categories with no samples to avoid dividing by zero.
                if not 0 in total:
                    for ptype in xrange(3):
                        print prediction_labels[ptype] + ' Error Rate: ' + str(
                            float(count[ptype]) / total[ptype])
                    print ""
                    # NOTE(review): nesting reconstructed; if this assignment
                    # originally sat outside the guard it would raise
                    # ZeroDivisionError for an empty test category.
                    TempResults[category - 1, k[0]] = float(count[1]) / total[1]
            # Keep the run with the lowest mean test error rate.
            if np.mean(TempResults[:, k[0]]) < best[k[0]]:
                best[k[0]] = np.mean(TempResults[:, k[0]])
                # Results[k[0]+2] = np.std(TempResults[:,k[0]])
                Results[:, k[0]] = TempResults[:, k[0]]
                Predictions[:, k[0]] = XTP
                Winner += [i]
    # Convert the stored error rates into accuracies.
    Results[:, 1] = 1 - Results[:, 1]
    print 'Best Parameters: '
    print Winner
    print 'Accuracy per Category: '
    print Results
    np.savetxt('Results' + data_title + '.csv', Results, fmt="%s",
               delimiter=",")
    np.savetxt('Predictions' + data_title + '.csv', Predictions, fmt="%s",
               delimiter=",")
    print "All done!"
def rf_reg_example(df, features, label, params, rand_state, save=False,
                   save_filename=None):
    """Run one random forest regression for photometric redshift estimation.

    The data is robust-scaled, split 80/20 into training and test samples,
    and a ``RandomForestRegressor`` built from ``params`` is fitted. Feature
    importances are printed and the predictions are analyzed with the
    routines provided in ml_eval.py and photoz_analysis.py.

    Parameters:
        df : pandas dataframe
            The dataframe containing the features and the label for the
            regression.

        features : list of strings
            List of features

        label : string
            The label for the regression

        params : dictionary
            Keyword arguments passed to ``RandomForestRegressor``

        rand_state : integer
            Random state of the train/test split. It is not passed to the
            regressor; include ``random_state`` in ``params`` for that.

        save : bool
            If true, write the predicted and true test values to
            ``save_filename + '.csv'``

        save_filename : string or None
            Output file name without extension; required when ``save`` is
            true, otherwise an error message is printed and nothing is saved
    """
    # Building test and training sample
    X, y = sets.build_matrices(df, features, label)

    # Standardizing the data
    X = preprocessing.robust_scale(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=rand_state)

    # Random Forest Regression
    reg = RandomForestRegressor(**params)
    reg.fit(X_train, y_train)

    y_pred = reg.predict(X_test)
    feat_importances = reg.feature_importances_

    # Save predicted and test y values for later analysis
    if save:
        if save_filename:
            results = pd.DataFrame(data=np.array([y_pred, y_test]).T,
                                   columns=['y_pred', 'y_test'])
            results.to_csv(save_filename + '.csv', index=False)
        else:
            print("Error: No Filename supplied!")

    # Evaluate regression method.
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Feature Importances ")
    for feature, importance in zip(features, feat_importances):
        print(str(feature) + ": " + str(importance))
    print("\n")

    ml_an.evaluate_regression(y_test, y_pred)

    pz_an.plot_redshifts(y_test, y_pred)
    pz_an.plot_error_hist(y_test, y_pred)

    plt.show()
features_array_X[index, 5] = out_degree_dictionary[second_node] features_array_X[index, 6] = degree_centrality_dictionary[first_node] features_array_X[index, 7] = degree_centrality_dictionary[second_node] features_array_X[index, 8] = common_in_neighbors_dictionary[(first_node, second_node)] features_array_X[index, 9] = common_out_neighbors_dictionary[(first_node, second_node)] features_array_X[index, 10] = nodes_core[first_node] + nodes_core[second_node] features_array_X[index, 11] = nodes_pagerank[first_node] features_array_X[index, 12] = nodes_pagerank[second_node] index += 1 features_array_X = robust_scale(features_array_X, axis=0, with_centering=False, with_scaling=True, copy=True) # Normalize X array per feature #features_array_X = normalize_array(features_array_X,0) # Normalize X array per training instance features_array_X = normalize_array(features_array_X, 1) print("Starting training...") rand_forest = RandomForestClassifier(n_estimators=25, min_samples_leaf=50) rand_forest.fit(features_array_X, Y_train) print("It's prediction time...") predictions = rand_forest.predict_proba(features_array_X) #print("Starting training...")
def load_additional_positionwise_data(self, class_files, identifier,
                                      standardize=False):
    """ Add additional numerical features to the network (for each nucleotide in a sequence).

    For every position in an input sequence additional numerical data can be
    added to the network (e.g. ChIP-seq signal, conservation for every
    nucleotide). The data will be added to the input matrix. E.g.: Using
    sequences of length 200 over the alphabet "ACGT" results in input
    matrices of size 4x200. Additional position-wise data will be added to
    these matrices as a new row resulting in matrices of size 5x200.

    Input files are text files and must contain as many whitespace-separated
    values in each line as the sequences are long, e.g.:

        0.679 1.223 -0.296 ...
        0.961 0.532 0.112 ...
        0.065 -0.333 -0.256 ...
        ...

    The number of provided files must match the fasta files provided to the
    __init__ function (e.g. if you provided a list of 3 files to __init__
    you must provide a list of 3 files here as well) and the number of lines
    in each file must match the number of entries in the corresponding fasta
    file. If you want to add multiple features simply call this function
    multiple times.

    Input features should be standardized in some way prior to adding them
    to the network, as this tends to improve the predictive performance.

    In the same way network kernels are visualized as sequence motifs after
    the network training (based on the first 4 rows of the input matrices
    and using the visualize_kernel() Model function), the rows corresponding
    to additional features are summarized as line plots as well.

    Parameters
    ----------
    class_files: str or [str]
        A text file (multi-label) or a list of text files (single-label).

    identifier: str
        A short feature name (will be shown in kernel output plots).

    standardize: bool
        Scale each column with sklearn's ``robust_scale`` (interquartile
        range based). The unscaled values are then additionally kept in
        ``self.positionwise_unscaled``.

    Raises
    ------
    RuntimeError
        If ``identifier`` is already registered, if a line cannot be parsed
        into the expected number of floats, or if the total number of lines
        does not match the number of sequences.
    """
    if not hasattr(self, "positionwise"):
        self.positionwise = OrderedDict()
    if identifier in self.positionwise:
        raise RuntimeError(
            "Identifier '{}' already exists.".format(identifier))
    if not isinstance(class_files, list):
        class_files = [class_files]

    num_sequences = len(self.labels)
    len_sequence = self.data[0].shape[0]
    new_data = np.empty((num_sequences, len_sequence), dtype=np.float32)

    row = 0
    for file_name in class_files:
        handle = io.get_handle(file_name, 'rt')
        # try/finally so the handle is released even when a line is invalid.
        try:
            for i, line in enumerate(handle):
                # Surplus lines are only counted, not stored: writing them
                # would raise an IndexError instead of the descriptive
                # count-mismatch error raised after the loop.
                if row < num_sequences:
                    try:
                        new_data[row, :] = [float(x) for x in line.split()]
                    except ValueError as err:
                        raise RuntimeError(
                            "ValueError: {} (in line {} in {}).".format(
                                err, i + 1, file_name))
                row += 1
        finally:
            handle.close()

    if row != num_sequences:
        raise RuntimeError(
            "Amount of additional data ({}) doesn't match number of sequences ({})."
            .format(row, num_sequences))

    if standardize:
        from sklearn.preprocessing import robust_scale
        self.positionwise[identifier] = robust_scale(new_data, axis=0)
        if not hasattr(self, "positionwise_unscaled"):
            self.positionwise_unscaled = OrderedDict()
        self.positionwise_unscaled[identifier] = new_data
    else:
        self.positionwise[identifier] = new_data
def normalize(x, positions):
    """Robust-scale selected columns of ``x`` in place and return ``x``.

    Only column indices in ``range(x.shape[1])`` that are also contained in
    ``positions`` are rescaled; any other entry of ``positions`` is ignored.
    """
    selected = (col for col in range(x.shape[1]) if col in positions)
    for col in selected:
        # A one-column slice keeps the data two-dimensional for robust_scale.
        column = slice(col, col + 1)
        x[:, column] = np.copy(preprocessing.robust_scale(x[:, column]))
    return x
def _preprocess_minmax(cls, array: np.ndarray) -> np.ndarray:
    """Robust-scale, then min-max scale, each channel of a 3-D array.

    ``array`` is unpacked as (frames, channels, window_size). All values of
    one channel are flattened into a single row, scaled with ``robust_scale``
    and then ``minmax_scale`` along that row, and the result is returned in
    the original (frames, channels, window_size) layout.
    """
    n_frames, n_channels, n_window = array.shape

    # Channel-major layout: one row per channel holding all of its samples.
    per_channel = array.transpose((1, 0, 2)).reshape((n_channels, -1))
    per_channel = preprocessing.robust_scale(per_channel, axis=1)
    per_channel = preprocessing.minmax_scale(per_channel, axis=1)

    # Undo the flattening and restore the frame-major axis order.
    restored = per_channel.reshape(n_channels, n_frames, n_window)
    return restored.transpose(1, 0, 2)
### Extract features and labels from dataset for local testing
my_df = my_df[features_list]

# fill in NaN values strategy -- test all 3 to gauge impact on accuracy
#my_df = my_df.fillna(0)
#my_df = my_df.fillna( my_df.median() )
my_df = my_df.fillna(my_df.mean())

# Column 0 is used as the (integer) label, the remaining columns as features.
my_df_array = np.array(my_df)
old_features_array = my_df_array[:, 1:]
values_array = my_df_array[:, [0]].astype(int)
values_array = np.ravel(values_array)

# test impact of scaling, use robust_scale due to outlier values
from sklearn import preprocessing
old_features_array_scaled = preprocessing.robust_scale(old_features_array)

# select most important features
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFpr, chi2, f_classif
# NOTE: earlier experiments tested k from 2-14; the value currently in use
# is k=20.
selector = SelectKBest(f_classif, k=20)
features_array = selector.fit_transform(old_features_array_scaled,
                                        values_array)

# split scaled or unscaled data into train and test sets
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back only on releases that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    features_array, values_array,
    test_size=0.2, random_state=16)
# index = 16 * j + i # fig = figs[j] # ax = fig.add_subplot(4, 4, i + 1) # channels_data[start][index] = pearsonr(s[start], s[index])[0] # ax.plot(s[start], ys, 'r') # fig.tight_layout() i=1 x = np.arange(0,3383.766,.001) data = np.zeros(shape=(61,3383766)) for index, column in enumerate(data3): data[index] = np.concatenate((data1[index],data2[index],data3[index],data4[index],data5[index],data6[index])) data = preprocessing.robust_scale(data,True,True,True) #eeg = signal.detrend(np.array(data),type='constant') #data = signal.medfilt(eeg) #data = eeg-data #matrix=[] #for index, row in (enumerate(data)): # if(index!=63 and index!=62 and index!=61): # print index # matrix.append(row[3000000:-1]) #matrix = np.array(matrix) for index, column in enumerate(data): #band = band +