import pandas as pd
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler


def scale(df, scaling=None):
    """Scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to scale
    scaling : 'maxabs', 'minmax', 'std', or None, optional (default None)
        type of scaling to apply
    """
    if scaling is None or scaling.lower() == 'none':
        return df

    df = df.dropna(axis=1, how='any')

    # Scaling data
    if scaling == 'maxabs':
        # Normalizing -1 to 1
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        # Scaling to [0,1]
        scaler = MinMaxScaler()
    else:
        # Standard normalization
        scaler = StandardScaler()

    mat = df.values  # DataFrame.as_matrix() was removed in pandas 1.0
    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
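# A minimal usage sketch for scale() above, on a toy DataFrame (the column
# names and values here are illustrative, not from the original code):
toy = pd.DataFrame({'a': [1.0, -2.0, 4.0], 'b': [10.0, 0.0, 5.0]})
print(scale(toy, scaling='maxabs'))  # each column divided by its max |value|
print(scale(toy, scaling='minmax'))  # each column mapped onto [0, 1]
print(scale(toy))                    # scaling=None: returned unchanged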
def calculate_district_dis(dis_style="euclidean"):
    print(dis_style)
    cal_what_dis = calculate_function[dis_style]
    # get all the poi data in a dataframe
    poi_df = pd.read_csv(os.path.join(DATA_DIR, CONCRETE_DIR, POI_SHEET_DIR,
                                      "poi_data.csv"))
    districts_poi = poi_df.values[:, 1:]
    scaler = MaxAbsScaler()
    scaled_districts_poi = scaler.fit_transform(districts_poi)
    if dis_style == "canberra":
        # Canberra distance is computed on the raw (unscaled) POI counts
        scaled_districts_poi = districts_poi
    result = OrderedDict()
    for based_d in range(districts_poi.shape[0]):
        result[based_d + 1] = OrderedDict()
        based_district_poi = scaled_districts_poi[based_d]
        for c_d in range(districts_poi.shape[0]):
            compare_district_poi = scaled_districts_poi[c_d]
            result[based_d + 1][c_d + 1] = cal_what_dis(based_district_poi,
                                                        compare_district_poi)
        # sort each district's neighbours by ascending distance
        result[based_d + 1] = sorted(result[based_d + 1].items(),
                                     key=lambda d: d[1])
    return result
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler


def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')

    # sklearn.preprocessing.Imputer was removed; SimpleImputer is the current
    # equivalent of Imputer(strategy='mean', axis=0)
    imputer = SimpleImputer(strategy='mean')
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
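# A small usage sketch for impute_and_scale() (toy values, illustrative only):
# the NaN in column 'a' is replaced by the column mean (2.0) before scaling.
import numpy as np

toy = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [-2.0, 0.0, 4.0]})
print(impute_and_scale(toy, scaling='maxabs'))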
def load_data(shuffle=True, n_cols=None):
    train_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.train.csv')
    test_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.test.csv')

    usecols = list(range(n_cols)) if n_cols else None

    df_train = pd.read_csv(train_path, engine='c', usecols=usecols)
    df_test = pd.read_csv(test_path, engine='c', usecols=usecols)

    df_train = df_train.drop('case_id', axis=1).astype(np.float32)
    df_test = df_test.drop('case_id', axis=1).astype(np.float32)

    if shuffle:
        df_train = df_train.sample(frac=1, random_state=seed)
        df_test = df_test.sample(frac=1, random_state=seed)

    X_train = df_train.values  # DataFrame.as_matrix() was removed in pandas 1.0
    X_test = df_test.values

    # the scaler is fit on train and test rows jointly
    scaler = MaxAbsScaler()
    mat = np.concatenate((X_train, X_test), axis=0)
    mat = scaler.fit_transform(mat)

    X_train = mat[:X_train.shape[0], :]
    X_test = mat[X_train.shape[0]:, :]

    return X_train, X_test
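# Note that load_data() above fits the MaxAbsScaler on the train and test rows
# concatenated together, so test statistics influence the scaling. A
# leakage-free variant (a sketch, not the benchmark's own protocol) fits on
# the training rows only and reuses the fitted scaler:
from sklearn.preprocessing import MaxAbsScaler


def scale_train_test(X_train, X_test):
    scaler = MaxAbsScaler()
    X_train = scaler.fit_transform(X_train)  # statistics from train only
    X_test = scaler.transform(X_test)        # reuse the train statistics
    return X_train, X_test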
def games_price_feature(directory: str, app_index_dict: Dict[str, Any]) -> csr_matrix:
    """
    Get games price feature
    Params:
        directory: raw data path
        app_index_dict: key->appid, value->app's index
    return:
        price: matrix of price
    """
    games = []  # games list
    for key, value in app_index_dict.items():
        games.insert(value, key)

    games_price = {}  # all games and their prices
    infp = open(directory, 'r')  # game_price
    in_reader = csv.reader(infp)
    valid_item = 0
    price_array = []
    for line in in_reader:
        if line[1] == 'undefine':
            games_price[line[0]] = 'undefine'
        else:
            # keep only digits and the decimal point, e.g. '$4.99' -> 4.99
            price_filter = filter(lambda ch: ch in '0123456789.', line[1])
            price_str = ''.join(list(price_filter))
            games_price[line[0]] = float(price_str)

    for game in games:
        vector = []
        if game in games_price and games_price[game] != 'undefine':
            vector.append(games_price[game])
            valid_item += 1
            price_array.append(vector)

    price_array = np.array(price_array)
    X_scaled = scale(price_array)
    price_array = X_scaled.tolist()

    # if the game does not have a price, then add 0
    for i in range(len(games)):
        if games[i] not in games_price or games_price[games[i]] == 'undefine':
            price_array.insert(i, [0.])

    price_array = np.array(price_array)
    price_matrix = csr_matrix(price_array)
    logging.getLogger(__name__).debug('Item with price feature: ' + str(valid_item))
    logging.getLogger(__name__).debug('price_matrix.shape: ' + str(price_matrix.shape))
    infp.close()

    scaler = MaxAbsScaler()
    price_matrix = scaler.fit_transform(price_matrix)
    return price_matrix
def scaler_dummy(dataset, dataset_test):
    scaler_mm = MinMaxScaler()
    scaler_ma = MaxAbsScaler()
    scaler_sd = StandardScaler()
    scaler_rb = RobustScaler()

    numerical = list(dataset.columns)

    data_transform_mm = pd.DataFrame(data=dataset)
    data_transform_ma = pd.DataFrame(data=dataset)
    data_transform_sd = pd.DataFrame(data=dataset)
    data_transform_rb = pd.DataFrame(data=dataset)

    # fit each scaler on the training data and transform it in one step
    data_transform_mm[numerical] = scaler_mm.fit_transform(dataset[numerical])
    data_transform_ma[numerical] = scaler_ma.fit_transform(dataset[numerical])
    data_transform_sd[numerical] = scaler_sd.fit_transform(dataset[numerical])
    data_transform_rb[numerical] = scaler_rb.fit_transform(dataset[numerical])

    # get dummies
    features_final_mm = pd.get_dummies(data_transform_mm)
    features_final_ma = pd.get_dummies(data_transform_ma)
    features_final_sd = pd.get_dummies(data_transform_sd)
    features_final_rb = pd.get_dummies(data_transform_rb)

    # transform the test set with the scalers fitted on the training set
    numerical = list(dataset_test.columns)
    scaler_mm_fitted_test = scaler_mm.transform(dataset_test[numerical])
    scaler_ma_fitted_test = scaler_ma.transform(dataset_test[numerical])
    scaler_sd_fitted_test = scaler_sd.transform(dataset_test[numerical])
    scaler_rb_fitted_test = scaler_rb.transform(dataset_test[numerical])

    scaler_mm_fitted_test = pd.DataFrame(data=scaler_mm_fitted_test, columns=numerical)
    scaler_ma_fitted_test = pd.DataFrame(data=scaler_ma_fitted_test, columns=numerical)
    scaler_sd_fitted_test = pd.DataFrame(data=scaler_sd_fitted_test, columns=numerical)
    scaler_rb_fitted_test = pd.DataFrame(data=scaler_rb_fitted_test, columns=numerical)

    features_final_mmt = pd.get_dummies(scaler_mm_fitted_test)
    features_final_mat = pd.get_dummies(scaler_ma_fitted_test)
    features_final_sdt = pd.get_dummies(scaler_sd_fitted_test)
    features_final_rbt = pd.get_dummies(scaler_rb_fitted_test)

    return (features_final_mm, features_final_ma, features_final_sd, features_final_rb,
            features_final_mmt, features_final_mat, features_final_sdt, features_final_rbt)
def load_data(train_path, test_path, gParameters):

    print('Loading data...')
    df_train = (pd.read_csv(train_path, header=None).values).astype('float32')
    df_test = (pd.read_csv(test_path, header=None).values).astype('float32')
    print('done')

    print('df_train shape:', df_train.shape)
    print('df_test shape:', df_test.shape)

    seqlen = df_train.shape[1]

    df_y_train = df_train[:, 0].astype('int')
    df_y_test = df_test[:, 0].astype('int')

    # only training set has noise
    Y_train = np_utils.to_categorical(df_y_train, gParameters['classes'])
    Y_test = np_utils.to_categorical(df_y_test, gParameters['classes'])

    df_x_train = df_train[:, 1:seqlen].astype(np.float32)
    df_x_test = df_test[:, 1:seqlen].astype(np.float32)

    X_train = df_x_train
    X_test = df_x_test

    scaler = MaxAbsScaler()
    mat = np.concatenate((X_train, X_test), axis=0)
    mat = scaler.fit_transform(mat)

    X_train = mat[:X_train.shape[0], :]
    X_test = mat[X_train.shape[0]:, :]

    # TODO: Add better names for the noise boolean, make a feature for both
    # RNA-seq and label noise together
    # check if noise is on (this is for labels)
    if gParameters['add_noise']:
        # check if we want noise correlated with a feature
        if gParameters['noise_correlated']:
            Y_train, y_train_noise_gen = candle.label_flip_correlated(
                Y_train, gParameters['label_noise'], X_train,
                gParameters['feature_col'], gParameters['feature_threshold'])
        # else add uncorrelated noise
        else:
            Y_train, y_train_noise_gen = candle.label_flip(
                Y_train, gParameters['label_noise'])
    # check if noise is on for RNA-seq data
    elif gParameters['noise_gaussian']:
        X_train = candle.add_gaussian_noise(X_train, 0, gParameters['std_dev'])

    return X_train, Y_train, X_test, Y_test
def features(anime_db):
    # The features we will be using for the system are the genre, type and the ratings
    anime_features = pd.concat([
        anime_db.genre.str.get_dummies(sep=","),
        pd.get_dummies(anime_db['type']),
        anime_db.rating
    ], axis=1)

    # use MaxAbsScaler to scale the features to [-1, 1] while preserving sparsity
    from sklearn.preprocessing import MaxAbsScaler
    max_abs_scaler = MaxAbsScaler()
    anime_features = max_abs_scaler.fit_transform(anime_features)
    return anime_features
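# MaxAbsScaler only divides each column by its maximum absolute value, so zero
# entries stay zero and sparse inputs stay sparse -- which is why it suits the
# dummy-encoded features above. A quick check on toy data (illustrative only):
import scipy.sparse as sp
from sklearn.preprocessing import MaxAbsScaler

X = sp.csr_matrix([[0.0, 2.0], [1.0, 0.0], [0.0, 4.0]])
X_scaled = MaxAbsScaler().fit_transform(X)
print(type(X_scaled))  # still a scipy sparse matrix
print(X_scaled.nnz)    # 3 -- the number of stored nonzeros is unchanged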
def getviz_cosinus(X_train, y_train):
    preprocessing = MaxAbsScaler()
    X_train = preprocessing.fit_transform(X_train)
    reds = y_train == 0
    blues = y_train == 1

    plt.figure()
    kpca = KernelPCA(kernel='cosine', n_components=2, n_jobs=-1)
    X_kpca = kpca.fit_transform(X_train)
    plt.plot(X_kpca[reds, 0], X_kpca[reds, 1], "ro", label='csp-')
    plt.plot(X_kpca[blues, 0], X_kpca[blues, 1], "bo", label='csp+')
    plt.title("Projection by cosine kernel PCA")
    plt.xlabel("1st principal component")
    plt.ylabel("2nd principal component")
    plt.legend(loc="lower right", prop={'size': 6})
    plt.show()
def MaxAbsScaledData(df, colClass):
    # preparing for standardising
    colNames = df.columns.tolist()
    lstClass = df[colClass]

    # normalizing the data
    from sklearn.preprocessing import MaxAbsScaler
    scaler = MaxAbsScaler()

    # fit and transform
    ar = scaler.fit_transform(df)
    df = pd.DataFrame(data=ar)

    # restore column names and the untouched class column
    df.columns = colNames
    df[colClass] = lstClass
    return df
def preprocess_testing():
    df = corpus_test.copy()
    df['comments'] = df['comments'].map(lambda x: preprocess_text(x))

    global vectorizer
    x_train = vectorizer.transform(df["comments"])

    # drop the columns marked for deletion during training
    all_cols = np.arange(x_train.shape[1])
    global to_delete
    cols_to_keep = np.where(np.logical_not(np.in1d(all_cols, to_delete)))[0]
    x_train = x_train[:, cols_to_keep]

    # note: a fresh MaxAbsScaler is fit on the test data here rather than
    # reusing a scaler fitted on the training data
    scaler = MaxAbsScaler()
    x_train = scaler.fit_transform(x_train)
    return x_train
def preprocess(df, features):
    df['qt_coligados'].fillna(0, inplace=True)
    df['qt_socios'].fillna(1, inplace=True)
    df['qt_socios_pf'].fillna(1, inplace=True)
    df['qt_socios_pj'].fillna(0, inplace=True)
    df['qt_funcionarios'].fillna(0, inplace=True)
    df['tx_crescimento_12meses'].fillna(0, inplace=True)
    df['tx_crescimento_24meses'].fillna(0, inplace=True)
    df['fl_optante_simei'].fillna('False', inplace=True)
    df['fl_optante_simples'].fillna('False', inplace=True)
    df['nm_meso_regiao'].fillna('OUTROS', inplace=True)
    df['nu_meses_rescencia'].fillna(df['nu_meses_rescencia'].median(), inplace=True)
    df['vl_faturamento_estimado_aux'].fillna(
        df['vl_faturamento_estimado_aux'].median(), inplace=True)
    df['vl_faturamento_estimado_grupo_aux'].fillna(
        df['vl_faturamento_estimado_aux'].median(), inplace=True)
    df.loc[df['sg_uf_matriz'].isna(), 'sg_uf_matriz'] = df.loc[df['sg_uf_matriz'].isna(), 'sg_uf']
    df['de_nivel_atividade'].fillna('MUITO BAIXA', inplace=True)
    df['de_saude_tributaria'].fillna('VERMELHO', inplace=True)
    df['idade_media_socios'].fillna(df['idade_media_socios'].median(), inplace=True)
    df['empsetorcensitariofaixarendapopulacao'].fillna(
        df['empsetorcensitariofaixarendapopulacao'].median(), inplace=True)

    df['porc_st_regular'] = df['qt_socios_st_regular'] / df['qt_socios']
    df['socio_pep'] = 0
    df.loc[df['qt_socios_pep'] > 0, 'socio_pep'] = 1
    df['coligada_exterior'] = 0
    df.loc[df['qt_coligados_exterior'] > 0, 'coligada_exterior'] = 1
    df['porc_socios_pf'] = df['qt_socios_pf'] / df['qt_socios']
    df['porc_socios_pj'] = df['qt_socios_pj'] / df['qt_socios']

    df = df[df['idade_media_socios'] > 0]
    df = df[features]

    df.loc[df['fl_rm'] == 'NAO', 'fl_rm'] = 0
    df.loc[df['fl_rm'] == 'SIM', 'fl_rm'] = 1
    df.loc[:, 'fl_rm'] = pd.to_numeric(df['fl_rm'])

    col_bool = df.dtypes[df.dtypes == 'bool'].index
    for col in col_bool:
        df[col] = df[col].astype(int)

    cat_cols = df.select_dtypes('object').columns
    cat_cols = cat_cols[1:]
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

    scaler = MaxAbsScaler()
    df = pd.DataFrame(scaler.fit_transform(df.iloc[:, 1:]))
    df.fillna(0, inplace=True)
    return df
class SvrLrR(PredictModel):
    svr = None
    mas = None

    def create_predict_model(self):
        self.svr = SVR(kernel='linear')
        self.mas = MaxAbsScaler()

    def fit(self, X_train, X_valid, y_train, y_valid):
        self.create_predict_model()
        X_train = self.mas.fit_transform(X_train)
        self.svr.fit(X_train, y_train)

    def predict(self, X_test):
        X_test = self.mas.transform(X_test)
        return self.svr.predict(X_test)
class ScalerOperator(Operator):

    def __init__(self, params=0):
        '''
        :param params: 0 for StandardScaler, 1 for MinMaxScaler, 2 for MaxAbsScaler
        '''
        if params == 0:
            super().__init__(DATA_PERPROCESSING, 'dp_standardscaler', params)
            self.scaler = StandardScaler()
        elif params == 1:
            super().__init__(DATA_PERPROCESSING, 'dp_minmaxscaler', params)
            self.scaler = MinMaxScaler()
        elif params == 2:
            super().__init__(DATA_PERPROCESSING, 'dp_maxabsscaler', params)
            self.scaler = MaxAbsScaler()
        else:
            raise ValueError("Invalid params for ScalerOperator. Expected {0,1,2}")

    def operate(self, dm_list: typing.List, phase='train'):
        # The input of a ScalerOperator is a DataManager
        assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
        self.check_phase(phase)

        dm = dm_list[0]
        feature_types = dm.feature_types
        numerical_index = [
            i for i in range(len(feature_types))
            if feature_types[i] == "Float" or feature_types[i] == "Discrete"
        ]

        # Check if there are no numerical features in train_x
        if len(numerical_index) == 0:
            return dm

        if phase == 'train':
            x = dm.train_X
            x[:, numerical_index] = self.scaler.fit_transform(x[:, numerical_index])
            dm.train_X = x
        else:
            x = dm.test_X
            x[:, numerical_index] = self.scaler.transform(x[:, numerical_index])
            dm.test_X = x
        return dm
class SgdR(PredictModel):
    sgdr = None
    mas = None

    def create_predict_model(self):
        self.sgdr = SGDRegressor()
        self.mas = MaxAbsScaler()

    def fit(self, X_train, X_valid, y_train, y_valid):
        self.create_predict_model()
        X_train = self.mas.fit_transform(X_train)
        self.sgdr.fit(X_train, y_train)

    def predict(self, X_test):
        X_test = self.mas.transform(X_test)
        return self.sgdr.predict(X_test)
def scaleData(df):
    numericvars = ['AvgRating', 'TotalReviews', 'DegreeCentrality']
    mms = MinMaxScaler()
    dfnumss = pd.DataFrame(mms.fit_transform(df[numericvars]),
                           columns=['mms_' + x for x in numericvars],
                           index=df.index)
    dfnumss = pd.concat([df, dfnumss], axis=1)
    dfnumss = dfnumss.drop(numericvars, axis=1)

    numericabsvars = ['SalesRank']
    mas = MaxAbsScaler()
    dfnummas = pd.DataFrame(mas.fit_transform(dfnumss[numericabsvars]),
                            columns=['mas_' + x for x in numericabsvars],
                            index=df.index)
    dfnummas = pd.concat([dfnumss, dfnummas], axis=1)
    dfnummas = dfnummas.drop(numericabsvars, axis=1)
    return dfnummas
class LinearR(PredictModel):
    lr = None
    x_mas = None

    def create_predict_model(self):
        self.lr = LinearRegression()
        self.x_mas = MaxAbsScaler()

    def fit(self, X_train, X_valid, y_train, y_valid):
        self.create_predict_model()
        X_train = self.x_mas.fit_transform(X_train)
        self.lr.fit(X_train, y_train)

    def predict(self, X_test):
        X_test = self.x_mas.transform(X_test)
        return self.lr.predict(X_test)
def Dados_Balanceados_Separa_Teste_Onehot_Sem_Municipio_Orgao():
    feature_names = Load_Obj('feature_names_onehot_sem_municipio_orgao')
    X_data, y_data = load_svmlight_file(
        'desbalanceado_onehot_sem_municipio_orgao.svm',
        n_features=len(feature_names))  # pylint: disable=unbalanced-tuple-unpacking

    scaler = MaxAbsScaler()
    X_data_fit = scaler.fit_transform(X_data)
    Save_Obj(scaler, 'scaler_onehot_sem_municipio_orgao')

    X_train_cv, X_test, y_train_cv, y_test = train_test_split(
        X_data_fit, y_data, test_size=0.1, random_state=6439, stratify=y_data)

    dump_svmlight_file(X_train_cv, y_train_cv,
                       'treino_desbalanceado_onehot_sem_municipio_orgao.svm')
    dump_svmlight_file(X_test, y_test,
                       'test_desbalanceado_onehot_sem_municipio_orgao.svm')
def test_scaled_labeled_method_distances():
    initial_date = create_test_dates()[0]['initial_date']
    final_date = create_test_dates()[0]['final_date']
    method = create_test_methods()[0]
    admissible_test_date = initial_date + ' to ' + final_date

    expected = create_test_data()
    expected = expected[[admissible_test_date]]
    expected.rename(columns={admissible_test_date: method}, inplace=True)
    scaler = MaxAbsScaler()
    expected[method] = scaler.fit_transform(expected[[method]])

    print(distance_matrix.scaled_labeled_method_distances(
        create_test_data(), initial_date, final_date, method))
    print(expected)

    assert_frame_equal(
        distance_matrix.scaled_labeled_method_distances(
            create_test_data(), initial_date, final_date, method),
        expected)
def mainModel():
    # Optimize to get ideal parameters for the LGBM model
    variables.mlData['paramsOptimizedLGBM'], variables.mlData['apsOptimizedLGBM'] = optimization.optimizeLGBM(settings.LGBMSpace)

    # Make the main LGBM model from the optimized parameters and the settings TFid
    variables.mlData['modelLGBM'], variables.mlData['probLGBM'], variables.mlData['apsLGBM'], variables.mlData['roc_aucLGBM'] = models.lgbmWMetrics(
        variables.mlData['xTrain'], variables.mlData['yTrain'],
        variables.mlData['xTest'], variables.mlData['yTest'],
        2 ** variables.mlData['paramsOptimizedLGBM'][1],
        variables.mlData['paramsOptimizedLGBM'][0],
        variables.mlData['paramsOptimizedLGBM'][1],
        variables.mlData['paramsOptimizedLGBM'][2],
        variables.mlData['paramsOptimizedLGBM'][3],
        variables.mlData['paramsOptimizedLGBM'][4],
        variables.mlData['paramsOptimizedLGBM'][5])

    # Random Forest
    variables.mlData['modelRF'], variables.mlData['probRF'], variables.mlData['apsRF'], variables.mlData['roc_aucRF'] = models.randomForestWMetrics(
        variables.mlData['xTrain'], variables.mlData['yTrain'],
        variables.mlData['xTest'], variables.mlData['yTest'])

    # Scaling
    variables.mlData['scaledXTrain'] = csr_matrix(variables.mlData['xTrain'].copy())
    variables.mlData['scaledXTest'] = csr_matrix(variables.mlData['xTest'].copy())
    scaler = MaxAbsScaler()
    variables.mlData['scaledXTrain'] = scaler.fit_transform(variables.mlData['scaledXTrain'])
    variables.mlData['scaledXTest'] = scaler.transform(variables.mlData['scaledXTest'])

    # Logistic Regression
    variables.mlData['modelLR'], variables.mlData['probLR'], variables.mlData['apsLR'], variables.mlData['roc_aucLR'] = models.logisticRegressionWMetrics(
        variables.mlData['scaledXTrain'], variables.mlData['yTrain'],
        variables.mlData['scaledXTest'], variables.mlData['yTest'])

    # Testing the correlation between models
    pd.DataFrame({'RF': variables.mlData['probRF'],
                  'LGBM': variables.mlData['probLGBM'],
                  'LR': variables.mlData['probLR']}).corr()

    # Final step: ensembling everything
    p = (variables.mlData['probRF'] + variables.mlData['probLGBM'] + variables.mlData['probLR']) / 3

    # Metrics for testing the ensemble
    aps = average_precision_score(variables.mlData['yTest'], p)
    roc_auc = roc_auc_score(variables.mlData['yTest'], p)

    # Save the models to disk
    jb.dump(variables.mlData['modelRF'], settings.RandomForestPath)
    jb.dump(variables.mlData['modelLGBM'], settings.lightGBMPath)
    jb.dump(variables.mlData['modelLR'], settings.logisticRegressionPath)
    jb.dump(variables.mlData['tFidVec'], settings.VectorizerPath)
def get_gene_count_with_drugs(self, cut_off=None, normalized=True):
    columns = [col for col in self.country_drug_use_df if 'ldu' in col]
    # .ix was removed from pandas; .iloc/.loc are the current equivalents
    drugs_per_country = dict(
        (self.country_drug_use_df['country_2letter'].iloc[i],
         self.country_drug_use_df[columns].iloc[i]) for i in range(9))
    sample_country = dict(
        (self.metadata.iloc[i]['sample_code'], self.metadata.iloc[i]['country'])
        for i in range(self.metadata.shape[0]))

    X = np.array([
        drugs_per_country[sample_country[int(code)]].values
        for code in self.gene_counts_df.columns[1:]
    ])
    Y = self.gene_counts_df.loc[:, self.gene_counts_df.columns[1:]].T

    if normalized:
        meta = self.metadata.loc[:, ['sample_code', 'norm_Bacteria_pairs']]
        meta = meta.set_index('sample_code')
        # normalize each sample's counts by its norm_Bacteria_pairs value
        for code in Y.index:
            Y.loc[code] = Y.loc[code].apply(lambda x: np.divide(
                float(x), meta.loc[int(code), 'norm_Bacteria_pairs']))
        scaler = MaxAbsScaler()
        Y = scaler.fit_transform(Y)
    else:
        Y = Y.values

    normalizer = None

    if cut_off:
        indices = np.where(Y.sum(axis=0) < cut_off)[0]
        X = np.delete(X, indices, axis=0)
        Y = np.delete(Y, indices, axis=0)

    return X, Y, normalizer
def init():
    data = pd.read_csv("test_data.csv")
    data['subcategory'] = data["subcategory"].apply(lambda x: x.strip())
    data_features = pd.concat([
        pd.get_dummies(data[["subcategory"]]),
        pd.get_dummies(data[["skill_set"]]),
        pd.get_dummies(data[["liked"]])
    ], axis=1)

    from sklearn.preprocessing import MaxAbsScaler
    max_scaler = MaxAbsScaler()
    data_features = max_scaler.fit_transform(data_features)

    pickle.dump(data, open("list_data.sav", "wb"))
    return data_features
def data_Standardization(self, data, length):
    print('[data processing] start data standardization!')
    # data = np.array(data)
    if self.Standardization:
        # scaler = StandardScaler()
        # trans_data = scaler.fit_transform(data)
        scaler = MaxAbsScaler()
        data = scaler.fit_transform(data)
    else:
        pass

    # split the flat array back into per-sequence chunks
    _data = []
    start = 0
    for l in length:
        _data.append(data[start:(start + l[0])])
        start += l[0]

    print('[data processing] data standardization end!')
    return _data
def transform(self, X):
    """Scale the data.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_timestamps)
        Data to scale.

    Returns
    -------
    X_new : array-like, shape = (n_samples, n_timestamps)
        Scaled data.
    """
    X = check_array(X, dtype='float64')
    scaler = SklearnMaxAbsScaler()
    X_new = scaler.fit_transform(X.T).T
    return X_new
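# The double transpose in transform() above turns sklearn's per-feature scaler
# into a per-sample one: MaxAbsScaler normalizes columns, so scaling X.T and
# transposing back divides each time series by its own maximum absolute value.
# An equivalent plain-numpy sketch (toy data, illustrative only):
import numpy as np

X = np.array([[1.0, -4.0, 2.0],
              [0.5, 0.25, -1.0]])
X_new = X / np.abs(X).max(axis=1, keepdims=True)
# row 0 is divided by 4.0 and row 1 by 1.0, matching transform(X)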
def maxabs_scale(dm):
    feature_types = dm.feature_types
    numerical_index = [i for i in range(len(feature_types))
                       if feature_types[i] == "Float" or feature_types[i] == "Discrete"]
    (train_x, _), (valid_x, _), (test_x, _) = dm.get_train(), dm.get_val(), dm.get_test()

    scaler = MaxAbsScaler()
    train_x[:, numerical_index] = scaler.fit_transform(train_x[:, numerical_index])
    dm.train_X = train_x
    if valid_x is not None:
        valid_x[:, numerical_index] = scaler.transform(valid_x[:, numerical_index])
        dm.val_X = valid_x
    if test_x is not None:
        test_x[:, numerical_index] = scaler.transform(test_x[:, numerical_index])
        dm.test_X = test_x
    return dm
def f_features(dataframe):
    # Data used to get features
    # gyr = dataframe.iloc[:, 9:12].values.transpose()
    # lin = dataframe.iloc[:, 12:15].values.transpose()
    eul = dataframe.iloc[:, 15:18].values.transpose()

    # calculate features
    # xy = get_xy_canvas(eul)
    # eul = get_angles_offset(eul)
    # xy_euc = euc_dist(xy[0], xy[1])

    # Features
    # feat_1  : gyr_x
    # feat_2  : gyr_y
    # feat_3  : gyr_z
    # feat_4  : lin_x
    # feat_5  : lin_y
    # feat_6  : lin_z
    # feat_7  : eul_roll
    # feat_8  : eul_yaw
    # feat_9  : eul_pitch
    # feat_10 : x_pos
    # feat_11 : y_pos
    # feat_12 : euc_dist_xy

    # Add features to feature list
    # features = [gyr[0], gyr[1], gyr[2], lin[0], lin[1], lin[2], eul[0], eul[1],
    #             eul[2], xy[0], xy[1], xy_euc]
    # features = [lin[0], lin[1], lin[2]]
    features = [eul[0], eul[1], eul[2]]
    features = np.array(features).transpose()

    # Feature scaling
    # sc = StandardScaler()
    # features = sc.fit_transform(features)

    # Feature scaling by maximum absolute value
    ma = MaxAbsScaler()
    features = ma.fit_transform(features)

    # Resampling to 60 (median length in database)
    features = resample(features, RESAMPLE_VAL)

    return features
def pipeline(coin_complete, sequence_length, model):
    coin_complete = coin_complete.replace([np.inf, -np.inf], np.nan)
    coin_complete = coin_complete.fillna(0)

    # scale data
    max_abs_scaler = MaxAbsScaler()
    temp = max_abs_scaler.fit_transform(coin_complete)
    temp = pd.DataFrame(temp, columns=coin_complete.columns)
    coin_complete = temp.set_index(coin_complete.index)

    x = time_series_to_supervised(coin_complete, sequence_length)
    y_predict = model.predict(x)

    # invert the scaling: scale_[1] holds the max absolute value of column 1
    y_predict_inverse = y_predict * max_abs_scaler.scale_[1]
    return y_predict, y_predict_inverse
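# Why multiplying by max_abs_scaler.scale_[1] inverts the scaling in
# pipeline(): MaxAbsScaler stores each column's maximum absolute value in
# scale_, and fit_transform divides each column by it, so multiplying the
# predictions by scale_[1] restores the units of column 1 (assumed here to be
# the target column). A sanity check on toy data (illustrative only):
import numpy as np
from sklearn.preprocessing import MaxAbsScaler

data = np.array([[1.0, 200.0], [2.0, -400.0], [4.0, 100.0]])
scaler = MaxAbsScaler()
scaled = scaler.fit_transform(data)
print(scaler.scale_)                    # [  4. 400.]
print(scaled[:, 1] * scaler.scale_[1])  # [ 200. -400.  100.]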
def load_data(train_path, test_path, num_classes):

    df_train = (pd.read_csv(train_path, header=None).values).astype('float32')
    df_test = (pd.read_csv(test_path, header=None).values).astype('float32')

    print('df_train shape:', df_train.shape)
    print('df_test shape:', df_test.shape)

    df_y_train = df_train[:, 0].astype('int')
    df_y_test = df_test[:, 0].astype('int')

    Y_train = np_utils.to_categorical(df_y_train, num_classes)
    Y_test = np_utils.to_categorical(df_y_test, num_classes)

    df_x_train = df_train[:, 1:PL].astype(np.float32)
    df_x_test = df_test[:, 1:PL].astype(np.float32)

    X_train = df_x_train
    X_test = df_x_test

    scaler = MaxAbsScaler()
    mat = np.concatenate((X_train, X_test), axis=0)
    mat = scaler.fit_transform(mat)

    X_train = mat[:X_train.shape[0], :]
    X_test = mat[X_train.shape[0]:, :]

    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)

    x_train_len = X_train.shape[1]

    # this reshaping is critical for the Conv1D to work
    X_train = np.expand_dims(X_train, axis=2)
    X_test = np.expand_dims(X_test, axis=2)

    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    return X_train, Y_train, X_test, Y_test
def scaled_labeled_method_distances(distance_matrix_df, initial_date, final_date, method):
    """
    Retrieves and scales a column of the input dataframe

    :param distance_matrix_df: A pandas dataframe
    :param initial_date: 'YYYY-MM-DD'
    :param final_date: 'YYYY-MM-DD'
    :param method: method used to compute the distance, from the list
        ['cumulative' + GroItem, 'euclidean' + GroItem, 'dtw' + GroItem,
         'tsfresh' + GroItem]
    :return: A pandas dataframe
    """
    list_of_methods = ['euclidean', 'cumulative', 'dtw', 'ts-features']
    if method.split('_')[0] not in list_of_methods:
        raise ValueError('Method of calculation unavailable')

    column_name = initial_date + ' to ' + final_date
    ranked_periods_df = pd.DataFrame(distance_matrix_df[column_name])
    scaler = MaxAbsScaler()
    ranked_periods_df.loc[:, column_name] = scaler.fit_transform(ranked_periods_df[[column_name]])
    ranked_periods_df.rename(columns={column_name: method}, inplace=True)
    return ranked_periods_df
def f_features(dataframe):
    # Data used to get features
    eul = dataframe.iloc[:, 15:18].values.transpose()

    # calculate features
    xy = get_xy_canvas(eul)

    # Add features to feature list
    features = [xy[0], xy[1]]
    features = np.array(features).transpose()

    # Feature scaling by maximum absolute value
    ma = MaxAbsScaler()
    features = ma.fit_transform(features)

    # Resampling to 60 (median length in database)
    features = resample(features, RESAMPLE_VAL)

    return features
def test_maxabs_scaler():
    x = np.array([-2.65395789, -7.97116295, -4.76729177, -0.76885033, -6.45609635])
    y = np.array([-8.9480332, -4.81582449, -3.73537263, -3.46051912, 1.35137275])
    z = np.array([-0.47827432, -2.26208059, -3.75151683, -1.90862151, -1.87541903])
    w = np.zeros_like(x)

    ds = vaex.from_arrays(x=x, y=y, z=z, w=w)
    df = ds.to_pandas_df()
    features = ['x', 'y', 'w']

    scaler_skl = MaxAbsScaler()
    result_skl = scaler_skl.fit_transform(df[features])

    scaler_vaex = vaex.ml.MaxAbsScaler(features=features)
    result_vaex = scaler_vaex.fit_transform(ds)

    assert result_vaex.absmax_scaled_x.values.tolist() == result_skl[:, 0].tolist(), \
        "scikit-learn and vaex results do not match"
    assert result_vaex.absmax_scaled_y.values.tolist() == result_skl[:, 1].tolist(), \
        "scikit-learn and vaex results do not match"
    assert result_vaex.absmax_scaled_w.values.tolist() == result_skl[:, 2].tolist(), \
        "scikit-learn and vaex results do not match"
def main():
    datasets = gen_datasets()
    print("origin data:")
    print(datasets)

    # zero mean, unit variance
    standard_scaler = StandardScaler()
    scaler_datasets = standard_scaler.fit_transform(datasets)
    print(scaler_datasets)
    print("-" * 80)

    min_max_scaler = MinMaxScaler()
    scaler_datasets = min_max_scaler.fit_transform(datasets)
    print(scaler_datasets)
    print("-" * 80)

    max_abs_scaler = MaxAbsScaler()
    scaler_datasets = max_abs_scaler.fit_transform(datasets)
    print(scaler_datasets)
    print("-" * 80)

    normalize = Normalizer(norm="l1")
    normalize_datasets = normalize.fit_transform(datasets)
    print(normalize_datasets)
    print("-" * 80)

    binarizer = Binarizer(threshold=1.1)
    binarizer_datasets = binarizer.fit_transform(datasets)
    print(binarizer_datasets)
    print("-" * 80)

    one_hot_encoder = OneHotEncoder()
    one_hot_encoder_datasets = one_hot_encoder.fit_transform([[0, 1, 4], [1, 2, 0], [2, 3, 5]])
    print(one_hot_encoder_datasets.toarray())
    print("-" * 80)

    # Imputer was removed from sklearn; SimpleImputer is the current equivalent
    imputer = SimpleImputer(missing_values=0, strategy="median")
    imputer_datasets = imputer.fit_transform(datasets)
    print(imputer_datasets)
    print(imputer.statistics_)
def second_question(train_data, train_labels, test_data, test_labels):
    """
    Second question:
    :param train_data: the train data
    :param train_labels: the train labels
    :param test_data: the test data
    :param test_labels: the test labels
    :return: the accuracy and macro F1 score on the test set
    """
    # prevent wrong flag values
    if (b_2 and (c_2 or d_2)) or (not b_2 and c_2 and d_2):
        raise ValueError(
            'Question 2: you can\'t set more than one value as True for question 2 flags')

    # Load word2vec embeddings file
    words_embeddings = KeyedVectors.load_word2vec_format("wiki.en.100k.vec", binary=False)

    # Get train and test data features by word2vec
    X_full_data = get_features(train_data + test_data, train_labels + test_labels,
                               words_embeddings)

    # Normalize full data features
    scaler = MaxAbsScaler()
    X_full_data_maxabs = scaler.fit_transform(X_full_data)

    # Run logistic regression on normalized train data
    model = LogisticRegression()
    model.fit(X_full_data_maxabs[0:len(train_data)], train_labels)

    # Predict using logistic regression on normalized test data
    y_predict = model.predict(X_full_data_maxabs[len(train_data):len(X_full_data_maxabs)])

    # Flatten full_test_labels; these are the y (true) labels
    y = np.ravel(test_labels)

    # Get f1_score and accuracy
    f_score = f1_score(y, y_predict, average='macro')
    accuracy = accuracy_score(y, y_predict)

    return accuracy, f_score
class SkflowLrR(PredictModel):
    ss = None
    dnn = None
    feature_columns = None

    def input_fn(self, X_train, y_train):
        feature_cols = {
            k: tf.constant(X_train[k].values) for k in self.feature_columns
        }
        labels = tf.constant(y_train.values)
        return feature_cols, labels

    def create_predict_model(self):
        self.ss = MaxAbsScaler()

    def fit(self, X_train, X_valid, y_train, y_valid):
        self.create_predict_model()
        self.feature_columns = X_train.columns
        tf_feature_cols = [
            tf.contrib.layers.real_valued_column(k) for k in self.feature_columns
        ]
        ss_X_train = self.ss.fit_transform(X_train)
        ss_X_train = pd.DataFrame(ss_X_train, columns=self.feature_columns)
        self.dnn = LinearRegressor(feature_columns=tf_feature_cols)
        self.dnn.fit(input_fn=lambda: self.input_fn(ss_X_train, y_train), steps=1600)

    def predict(self, X_test):
        X_test = self.ss.transform(X_test)
        X_test_df = pd.DataFrame(X_test, columns=self.feature_columns)
        predict = self.dnn.predict(
            input_fn=lambda: self.input_fn(X_test_df, pd.DataFrame(np.zeros(len(X_test)))),
            as_iterable=False)
        return predict
def impute_and_scale(df, scaling=None):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default None)
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')

    # Imputer was removed from sklearn; SimpleImputer is the current equivalent
    imputer = SimpleImputer(strategy='mean')
    mat = imputer.fit_transform(df)

    if scaling is None:
        return pd.DataFrame(mat, columns=df.columns)

    # Scaling data
    if scaling == 'maxabs':
        # Normalizing -1 to 1
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        # Scaling to [0,1]
        scaler = MinMaxScaler()
    else:
        # Standard normalization
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
import time

import numpy as np
import pandas
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler

from estimators import LSHNearestNeighbors
from preprocessors import text_preprocess

if __name__ == "__main__":
    df = pandas.read_csv("/media/alexander/b32bf4b4-8724-4107-9d19-abf6615c2f60/alexander/HELP_FILE/query.yaHotelId.showInTop.sure.final.tsv", sep="\t")
    print("Initial data shape:", df.shape, ";",
          "Number of hotels:", len(df["yaHotelId"].unique()))

    sure_df = df[df["sure"]]
    print(sure_df.shape)

    # keep only hotels that occur at least 5 times
    filtered_values = [value[0] for value in sure_df["yaHotelId"].value_counts().items()
                       if value[1] >= 5]
    filtered_df = sure_df[sure_df["yaHotelId"].isin(filtered_values)]
    print("Resulting data shape:", filtered_df.shape, ";",
          "Number of hotels:", len(filtered_df["yaHotelId"].unique()))

    vectorizer = TfidfVectorizer(preprocessor=text_preprocess)
    y = np.array(filtered_df["yaHotelId"])
    X = vectorizer.fit_transform(filtered_df["query"])
    print("X shape:", X.shape)

    scaler = MaxAbsScaler()
    X = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

    clf = LSHNearestNeighbors(n_estimators=10, n_candidates=100, n_neighbors=9,
                              mode="parzen window")
    clf.fit(X_train, y_train)

    t1 = time.time()
    y_pred = clf.predict(X_test)
    t2 = time.time() - t1
    print("delta time:", t2)
    print("mean time for one query:", t2 / X_test.shape[0])
    print("accuracy:", accuracy_score(y_test, y_pred))
     1066, 1053, 1339, 1040, 497, 253, 1485, 337, 1347, 1343, 122, 980, 87,
     126, 528, 694, 1444, 655, 161, 626, 545, 906, 1235, 684, 263, 69, 882,
     1209, 180, 1386, 1074, 631, 908, 1176, 947, 401, 1085, 1029, 797, 1107,
     386, 559, 588, 522, 644, 614, 1440, 1140, 1267, 1475, 217, 1201, 456, 231,
     1079, 1224, 1036, 156, 852, 1384, 1288, 243, 760, 1071]

# 6. Convert to numpy arrays
train_index = np.asarray(A)
test_index = np.asarray(B)

# 7. Split the data
X_train, X_test = raw_X[train_index], raw_X[test_index]
y_train, y_test = raw_y[train_index], raw_y[test_index]

# 8. Normalization
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.transform(X_test)

# 9. Train the algorithm
clf = SVC(kernel='rbf', C=1, gamma=0.5, coef0=0.0)
clf.fit(X_train_norm, y_train)
pred = clf.predict(X_test_norm)
acc = accuracy_score(pred, y_test)

# 10. Result
print("Accuracy:", acc)

# 11. Save the model
with open("model.pickle", "wb") as f:
    pickle.dump((clf, normalizer), f, 2)
model.add(MaxoutDense(100, input_dim=42))
model.add(Activation('relu'))
model.add(GaussianNoise(0.00001))
model.add(Dropout(0.3))
model.add(MaxoutDense(1, input_dim=100))
model.add(Activation('sigmoid'))

# ada = Adagrad(lr=0.001)
ada = SGD(lr=0.0003, momentum=0.9, decay=0.0001, nesterov=True)
model.compile(optimizer=ada, loss='binary_crossentropy', metrics=['accuracy'])

scaler = MaxAbsScaler()
train_train_scaled = scaler.fit_transform(train_train[features])
train_test_scaled = scaler.transform(train_test[features])

model.fit(train_train_scaled, train_train.target.values, nb_epoch=150, batch_size=100)

train_train_pred = model.predict(train_train_scaled, batch_size=100)
train_test_pred = model.predict(train_test_scaled, batch_size=100)

train_score = log_loss(train_train.target.values, train_train_pred)
test_score = log_loss(train_test.target.values, train_test_pred)

# test_poly = poly.transform(test[features])
test_scaled = scaler.transform(test[features])
test_pred = model.predict(test_scaled, batch_size=100)

ensemble_train.loc[train_test.index, 'nn'] = train_test_pred
def main():
    X, y = get_data('../../data/train.csv')
    sclr = MaxAbsScaler()
    X = sclr.fit_transform(X)
    # pickle.dump(sclr, open('./dumps/scaler_pickle', 'wb+'))

    X_test, y_test = get_data('../../data/val.csv')
    X_test = sclr.transform(X_test)

    X_fin, y_fin = get_data('../../data/test.csv')
    X_fin = sclr.transform(X_fin)

    other, yo = get_data('../../data/other.csv')
    other = sclr.transform(other)

    lin = linear_model.LogisticRegression(C=10000)

    # selector = RFE(lin, 21, step=1)
    # selector.fit(X, y)
    # X = selector.transform(X)
    # X_test = selector.transform(X_test)
    # X_fin = selector.transform(X_fin)
    # for i in range(len(selector.support_)):
    #     print(i + 1, selector.support_[i])

    lin.fit(X, y)
    # pickle.dump(lin, open('./dumps/lin_reg_pickle', 'wb+'))
    x1 = lin.predict_proba(X)
    x1_test = lin.predict_proba(X_test)
    # x1_fin = lin.predict_proba(X_fin)
    # o1 = lin.predict_proba(other)

    print('lin')
    print(metrics.classification_report(y, lin.predict(X)))
    print(metrics.classification_report(y_test, lin.predict(X_test)))
    print(metrics.classification_report(y_fin, lin.predict(X_fin)))

    roc = lin.predict_proba(X_fin)
    # r = lin.predict(X_test)
    # l1 = []
    # l2 = []
    # for i in range(len(roc)):
    #     if max(roc[i]) > 0.5:
    #         l1.append(y_fin[i])
    #         l2.append(r[i])
    # print(metrics.classification_report(l1, l2))
    # return

    fpr_grd0, tpr_grd0, _ = metrics.roc_curve(y_fin, roc[:, 0], pos_label=0)
    fpr_grd1, tpr_grd1, _ = metrics.roc_curve(y_fin, roc[:, 1], pos_label=1)
    fpr_grd2, tpr_grd2, _ = metrics.roc_curve(y_fin, roc[:, 2], pos_label=2)
    plt.plot(fpr_grd0, tpr_grd0, label='NRP')
    plt.plot(fpr_grd1, tpr_grd1, label='RiPP')
    plt.plot(fpr_grd2, tpr_grd2, label='Polyketide')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

    # print(lin.coef_)
    # print(sum(lin.predict_proba(X_test)[0]))

    svm_model = SVC(
        C=5000,
        # kernel='linear',
        # degree=2,
        coef0=100,
        # shrinking=True,
        # class_weight='balanced',
        probability=True,
        # decision_function_shape='ovr'
    )
    svm_model.fit(X, y)
    x2 = svm_model.predict_proba(X)
    x2_test = svm_model.predict_proba(X_test)
    x2_fin = svm_model.predict_proba(X_fin)
    o2 = svm_model.predict_proba(other)

    print('svm')
    print(metrics.classification_report(y, svm_model.predict(X)))
    print(metrics.classification_report(y_test, svm_model.predict(X_test)))