def train_model(train_vector):
    mapper = sklearn_pandas.DataFrameMapper([
        ('op_type', preprocessing.LabelEncoder()),
        # Numeric behaviour counters are passed through unchanged (transformer None).
        (['is_vip', 'reg_life',
          'pre7day_uid_ipcount', 'pre7day_uid_jobcount', 'pre7day_uid_citycount', 'pre7day_uid_infocount',
          'pre7day_uid_sys_delete_count', 'pre7day_uid_man_delete_count',
          'pre7day_uid_sys_backmodify_count', 'pre7day_uid_man_backmodify_count',
          'license_enterpriseid_count',
          'pre7day_ip_uidcount', 'pre7day_ip_jobcount', 'pre7day_ip_citycount', 'pre7day_ip_infocount',
          'pre7day_ip_sys_delete_count', 'pre7day_ip_man_delete_count',
          'pre7day_ip_sys_backmodify_count', 'pre7day_ip_man_backmodify_count',
          'pre7day_userip_reguid_count', 'pre7day_userip_login_count',
          'pre7day_phone_uidcount', 'pre7day_phone_jobcount', 'pre7day_phone_citycount', 'pre7day_phone_infocount',
          'pre24hour_ip_uid_count', 'pre24hour_ip_job_count', 'pre24hour_ip_city_count', 'pre24hour_ip_info_count',
          'pre24hour_uid_ip_count', 'pre24hour_uid_job_count', 'pre24hour_uid_city_count', 'pre24hour_uid_info_count',
          'pre24hour_phone_uid_count', 'pre24hour_phone_job_count',
          'pre24hour_phone_city_count', 'pre24hour_phone_info_count',
          'pre24hour_ip_audit_pass_info_count', 'pre24hour_ip_audit_nopass_info_count',
          'pre24hour_ip_audit_shuazuan_info_count',
          'pre24hour_uid_audit_pass_info_count', 'pre24hour_uid_audit_nopass_info_count',
          'pre24hour_uid_audit_shuazuan_info_count',
          'pre24hour_phone_audit_pass_info_count', 'pre24hour_phone_audit_nopass_info_count',
          'pre24hour_phone_audit_shuazuan_info_count',
          'pre24hour_ip_sys_delete_count', 'pre24hour_ip_sys_backmodify_count',
          'pre24hour_uid_sys_delete_count', 'pre24hour_uid_sys_backmodify_count',
          'pre1hour_ip_uid_count', 'pre1hour_ip_job_count', 'pre1hour_ip_city_count', 'pre1hour_ip_info_count',
          'pre1hour_uid_ip_count', 'pre1hour_uid_job_count', 'pre1hour_uid_city_count', 'pre1hour_uid_info_count',
          'pre1hour_phone_uid_count', 'pre1hour_phone_job_count',
          'pre1hour_phone_city_count', 'pre1hour_phone_info_count',
          'pre1hour_title_uid_count', 'pre1hour_title_job_count', 'pre1hour_title_city_count',
          'pre1hour_title_info_count', 'pre1hour_title_ip_count',
          'pre5min_ip_uid_count', 'pre5min_ip_job_count', 'pre5min_ip_city_count', 'pre5min_ip_info_count',
          'pre5min_uid_ip_count', 'pre5min_uid_job_count', 'pre5min_uid_city_count', 'pre5min_uid_info_count',
          'pre5min_phone_uid_count', 'pre5min_phone_job_count', 'pre5min_phone_city_count', 'pre5min_phone_info_count',
          'pre1min_ip_uid_count', 'pre1min_ip_job_count', 'pre1min_ip_city_count', 'pre1min_ip_info_count',
          'pre1min_uid_ip_count', 'pre1min_uid_job_count', 'pre1min_uid_city_count', 'pre1min_uid_info_count',
          'pre1min_phone_uid_count', 'pre1min_phone_job_count',
          'pre1min_phone_city_count', 'pre1min_phone_info_count'], None),
        # Categorical columns: impute the most frequent value, then label-encode.
        (['license'], [preprocessing.Imputer(strategy='most_frequent'), preprocessing.LabelEncoder()]),
        (['xingzhi'], [preprocessing.Imputer(strategy='most_frequent'), preprocessing.LabelEncoder()]),
        (['xinzi'], [preprocessing.Imputer(strategy='most_frequent'), preprocessing.LabelEncoder()]),
        (['xueli'], [preprocessing.Imputer(strategy='most_frequent'), preprocessing.LabelEncoder()]),
        (['thirdcertificate'], [preprocessing.Imputer(strategy='most_frequent'), preprocessing.LabelEncoder()]),
        (['zhaopinrenshu'], preprocessing.Imputer(strategy='most_frequent')),
        (['gongzuonianxian'], [preprocessing.Imputer(strategy='most_frequent'), preprocessing.LabelEncoder()]),
        # Welfare flags and text statistics are also passed through unchanged.
        (['fulidaiyu_wuxian', 'fulidaiyu_canbu', 'fulidaiyu_huabu', 'fulidaiyu_fangbu',
          'fulidaiyu_jiaotongbu', 'fulidaiyu_zhoumoshuangxiu', 'fulidaiyu_jiabanbu',
          'fulidaiyu_baozhu', 'fulidaiyu_niandishuangxin', 'fulidaiyu_baochi',
          'fulidaiyu_oversum', 'user_define_fulidaiyu_oversum',
          'title_punctuationcount', 'title_rarecharcount', 'title_iscontainabnormalnumber',
          'content_punctuationcount', 'content_suspectmaxlength', 'content_conpuncmaxsize',
          'content_suspectzerocount', 'content_transitsum', 'content_transitrate',
          'content_punctuationrate', 'content_rarecharcount',
          'fuli_suspectmaxlength', 'fuli_suspectzerocount', 'fuli_rarecharcount',
          'fuli_iscontainkeyword', 'fuli_iscontainabnormalnumber',
          'enterprisereg_name_rarecharcount', 'enterprisereg_address_rarecharcount',
          'enterprisereg_address_iscontain_local'], None),
        # Keep the label as the last output column.
        ('target', None),
    ])
    train = mapper.fit_transform(train_vector)
    pipeline_estimator = pipeline.Pipeline([
        ('estimator', RandomForestClassifier(n_estimators=300, max_depth=10,
                                             min_samples_split=2, n_jobs=3,
                                             random_state=0)),
    ])
    # pipeline_params = dict(
    #     selector__k=[90, 100, 110],
    #     estimator__n_estimators=[200, 250, 300])
    # grid_search = model_selection.GridSearchCV(pipeline_estimator, param_grid=pipeline_params, n_jobs=3)
    # grid_search.fit(train[:, :-1], train[:, -1])
    # best_estimator = grid_search.best_estimator_
    pipeline_estimator.fit(train[:, :-1], train[:, -1])
    return pipeline_estimator, mapper
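# Usage sketch (hypothetical): train_vector is assumed to be a DataFrame holding
# every column named in the mapper above, including the 'target' label. Because
# the mapper also emits 'target' as its last output column, a frame passed to it
# at scoring time needs that column too; its values are sliced off before predict.
estimator, mapper = train_model(train_vector)
scoring = mapper.transform(new_records)           # new_records: assumed DataFrame
predictions = estimator.predict(scoring[:, :-1])  # drop the trailing target slot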
def load_testset(self, shuffle):
    print("<==== ====", inspect.stack()[0][3], "==== ====>")
    df = pd.read_csv("competitionset.csv")
    continuous_features = ["1", "2", "6", "8", "10"]
    categorical_features = [
        "3", "4", "5", "7", "9", "11", "12", "13",
        "14", "15", "16", "17", "18"
    ]
    # Move rowIndex to the front so it lines up with the mapper output below.
    col_list = list(df.columns)
    col_list.remove('rowIndex')
    col_list.insert(0, 'rowIndex')
    df = df[col_list]
    df = pd.get_dummies(
        df,
        columns=["feature" + n for n in categorical_features],
        dtype=np.int64)
    # Pass rowIndex through untouched; standardize every other column.
    transform_mapper = sklearn_pandas.DataFrameMapper(
        [
            ('rowIndex', None),
        ],
        default=sklearn.preprocessing.StandardScaler())
    standardized = transform_mapper.fit_transform(df.copy())
    df = pd.DataFrame(standardized, columns=df.columns)
    print("0. Prepare the Final Data Sets (Regression)")
    self.TESTSET_X = df.drop(['rowIndex'], axis=1)
def _gen_mapper(self):
    '''Create the list of (column, transformer) tuples for the
    DataFrameMapper covering the moving window.'''
    window_mappings = [(self.target, None)]  # keep the target untouched
    for k in self.mappings.keys():
        for i in reversed(range(self.obs_window)):
            name = "{}_{}".format(k, i)
            window_mappings.append(([name], self.mappings[k]))
    self._mapper = sklearn_pandas.DataFrameMapper(window_mappings)
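# Illustration (hypothetical values): with self.target = 'y', self.obs_window = 3
# and self.mappings = {'temp': StandardScaler()}, _gen_mapper builds one tuple per
# lagged column, newest lag last:
#
#   [('y', None),
#    (['temp_2'], StandardScaler()),
#    (['temp_1'], StandardScaler()),
#    (['temp_0'], StandardScaler())]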
def extract_personal_history(personal_history_frame):
    personal_history_mapper = sklearn_pandas.DataFrameMapper([
        (['N103_a_Smoking (Present or Past)', 'N107_Tobacco Chewing', 'N108_Alcohol Intake'],
         [sklearn.preprocessing.FunctionTransformer(impute2, validate=False),  # impute "no"
          sklearn.preprocessing.FunctionTransformer(np.negative),
          sklearn.preprocessing.Binarizer(threshold=-1.5)])])  # flip the coding so 0 is no, 1 is yes
    # TODO include type of smoking, start date, stop date, number per day
    x = personal_history_mapper.fit_transform(personal_history_frame.copy())
    feature_names = ['has_smoked', 'has_chewed_tobacco', 'has_drunk_alcohol']
    x = pd.DataFrame(data=x, index=personal_history_frame.index, columns=feature_names)
    return x
def extract_current_symptoms(symptom_frame):
    # TODO Extract information other than just presence or absence of symptoms
    # Each symptom column gets the same impute/negate/binarize treatment, so the
    # five identical mappers collapse into one loop.
    symptom_columns = {
        'N18_Breathlessness': 'has_breathlessness',
        'N30_Cough': 'has_cough',
        'N50_Chest Pain': 'has_chest_pain',
        'N55_Fever': 'has_fever',
        'N64_Nasal Symptoms': 'has_nasal_symptoms',
    }
    x = pd.DataFrame(index=symptom_frame.index)
    for column, feature_name in symptom_columns.items():
        mapper = sklearn_pandas.DataFrameMapper([
            ([column],
             [sklearn.preprocessing.FunctionTransformer(impute2, validate=False),  # impute "no"
              sklearn.preprocessing.FunctionTransformer(np.negative),
              sklearn.preprocessing.Binarizer(threshold=-1.5)])])  # flip the coding so 0 is no, 1 is yes
        x[feature_name] = mapper.fit_transform(symptom_frame.copy())
    return x
def extract_risk_factors(risk_factor_frame):
    risk_factor_mapper = sklearn_pandas.DataFrameMapper([
        (['N83_Family history of COPD',
          'N84_Family hostory of allergies',  # 'hostory' (sic): must match the column name in the data
          'N86_Personal History of allergies?',
          'N87_Indoor cooking using Biomass?'],
         [sklearn.preprocessing.FunctionTransformer(impute2, validate=False),  # impute "no"
          sklearn.preprocessing.FunctionTransformer(np.negative),
          sklearn.preprocessing.Binarizer(threshold=-1.5)])])  # flip the coding so 0 is no, 1 is yes
    # TODO include type of allergies, number of hours per day cooking, years cooking
    x_risk_factor = risk_factor_mapper.fit_transform(risk_factor_frame.copy())
    feature_names = ['has_copd_family_history', 'has_allergy_family_history',
                     'has_allergy_personal_history', 'has_biomass_cooking_history']
    x = pd.DataFrame(data=x_risk_factor, index=risk_factor_frame.index, columns=feature_names)
    return x
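# `impute2` is used by the three extractors above but is not defined in these
# snippets. A minimal sketch of what it plausibly does, assuming the source data
# codes 1 = yes / 2 = no (consistent with np.negative + Binarizer(threshold=-1.5)
# mapping 1 -> 1 and 2 -> 0): fill missing answers with the "no" code.
def impute2(X):
    # Hypothetical reconstruction: replace NaN with 2 ("no"), leave the rest as-is.
    return np.where(pd.isnull(X), 2, X)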
def main():
    data = MNIST('./data')
    col_names = ["x" + str(x) for x in range(784)]
    # Define a transform function that will be serialized with the model.
    mnist_mapper = sklearn_pandas.DataFrameMapper([(col_names, StandardScaler()),
                                                   ("digit", None)])
    # 60,000 train samples of a 28x28 grid, domain 0-255
    mnist_train_data, mnist_train_label = data.load_training()
    mnist_train_df = pandas.concat(
        (pandas.DataFrame(mnist_train_data, columns=col_names),
         pandas.DataFrame(list(mnist_train_label), columns=["digit"])),
        axis=1)
    mnist_train_df_norm = mnist_mapper.fit_transform(mnist_train_df)
    mlp_config = {
        'hidden_layer_sizes': (1000, ),
        'activation': 'tanh',
        'solver': 'adam',  # this parameter was named 'algorithm' in scikit-learn < 0.18
        'max_iter': 20,
        'early_stopping': True,
        'validation_fraction': 0.1,
        'verbose': True
    }
    mnist_classifier = nn.MLPClassifier(**mlp_config)
    mnist_classifier.fit(X=mnist_train_df_norm[:, 0:28 * 28],
                         y=mnist_train_df_norm[:, 28 * 28])
    # 10,000 test samples
    mnist_test_data, mnist_test_label = data.load_testing()
    mnist_test_df = pandas.concat(
        (pandas.DataFrame(mnist_test_data, columns=col_names),
         pandas.DataFrame(list(mnist_test_label), columns=["digit"])),
        axis=1)
    # Reuse the scaler fitted on the training set; refitting it on the test set
    # would leak test statistics into the normalization.
    mnist_test_df_norm = mnist_mapper.transform(mnist_test_df)
    prediction = mnist_classifier.predict_proba(mnist_test_df_norm[:, 0:28 * 28])
    truth_array = [
        prediction[idx].argmax() == mnist_test_label[idx]
        for idx in range(len(prediction))
    ]
    accuracy = float(sum(truth_array)) / float(len(truth_array))
    print("out of sample model accuracy [%s]" % accuracy)
    print("serializing to pmml")
    sklearn2pmml(mnist_classifier, mnist_mapper, "MLP_MNIST.pmml", with_repr=True)
def load_trainingset(self, shuffle):
    print("<==== ====", inspect.stack()[0][3], "==== ====>")
    df = pd.read_csv("trainingset.csv")
    continuous_features = ["1", "2", "6", "8", "10"]
    categorical_features = ["3", "4", "5", "7", "9", "11", "12", "13",
                            "14", "15", "16", "17", "18"]
    df['Claimed'] = np.where(df['ClaimAmount'] > 0, 1, 0)
    # Move the id and target columns to the front so they line up with the mapper output.
    col_list = list(df.columns)
    col_list.remove('rowIndex')
    col_list.remove('Claimed')
    col_list.remove('ClaimAmount')
    col_list.insert(0, 'ClaimAmount')
    col_list.insert(0, 'Claimed')
    col_list.insert(0, 'rowIndex')
    df = df[col_list]
    df = pd.get_dummies(
        df,
        columns=["feature" + n for n in categorical_features],
        dtype=np.int64
    )
    # Pass the id and targets through untouched; standardize every other column.
    transform_mapper = sklearn_pandas.DataFrameMapper([
        ('rowIndex', None),
        ('Claimed', None),
        ('ClaimAmount', None),
    ], default=sklearn.preprocessing.StandardScaler())
    standardized = transform_mapper.fit_transform(df.copy())
    df = pd.DataFrame(standardized, columns=df.columns)

    print("0. Prepare the Final Data Sets (Classification)")
    self.c_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount'], axis=1)
    self.c_Y = df.Claimed
    self.c_x_train, self.c_x_test, self.c_y_train, self.c_y_test = \
        sklearn.model_selection.train_test_split(
            self.c_X, self.c_Y, test_size=0.30, shuffle=shuffle)

    print("0. Prepare the Final Data Sets (Regression)")
    self.r_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount'], axis=1)
    self.r_Y = df.ClaimAmount
    self.r_x_train, self.r_x_test, self.r_y_train, self.r_y_test = \
        sklearn.model_selection.train_test_split(
            self.r_X, self.r_Y, test_size=0.30, shuffle=shuffle)
def load_trainingset(self, shuffle, PARAMETER):
    self.log(
        "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@:"
        + str(PARAMETER))
    print("<==== ====", inspect.stack()[0][3], "==== ====>")
    df = pd.read_csv("trainingset.csv")
    continuous_features = ["1", "2", "6", "8", "10"]
    categorical_features = [
        "3", "4", "5", "7", "9", "11", "12", "13",
        "14", "15", "16", "17", "18"
    ]
    df['Claimed'] = np.where(df['ClaimAmount'] > 0, 1, 0)
    df['Outlier'] = np.where(df['ClaimAmount'] > PARAMETER, 1, 0)
    col_list = list(df.columns)
    col_list.remove('rowIndex')
    col_list.remove('Claimed')
    col_list.remove('ClaimAmount')
    col_list.remove('Outlier')
    col_list.insert(0, 'Outlier')
    col_list.insert(0, 'ClaimAmount')
    col_list.insert(0, 'Claimed')
    col_list.insert(0, 'rowIndex')
    df = df[col_list]
    df = pd.get_dummies(
        df,
        columns=["feature" + n for n in categorical_features],
        dtype=np.int64)
    # Pass the id and target columns through untouched; standardize everything else.
    transform_mapper = sklearn_pandas.DataFrameMapper(
        [
            ('rowIndex', None),
            ('Claimed', None),
            ('ClaimAmount', None),
            ('Outlier', None),
        ],
        default=sklearn.preprocessing.StandardScaler())
    standardized = transform_mapper.fit_transform(df.copy())
    df = pd.DataFrame(standardized, columns=df.columns)

    print("0. Prepare the Final Data Sets (Classification)")
    self.c_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount', 'Outlier'], axis=1)
    self.c_Y = df.Claimed
    # <Polynomial Features>
    # poly = sklearn.preprocessing.PolynomialFeatures(2, include_bias=True)
    # self.c_X = poly.fit_transform(self.c_X)
    # <Power Transformer>
    # power = sklearn.preprocessing.PowerTransformer()
    # power.fit(self.c_X)
    # self.c_X = power.transform(self.c_X)
    # <Quantile Transform>
    # self.c_X = sklearn.preprocessing.quantile_transform(
    #     self.c_X, axis=0, n_quantiles=1000, output_distribution='normal',
    #     ignore_implicit_zeros=False, subsample=100000, random_state=None,
    #     copy=False)
    self.c_x_train, self.c_x_test, self.c_y_train, self.c_y_test = \
        sklearn.model_selection.train_test_split(
            self.c_X, self.c_Y, test_size=0.30, shuffle=shuffle)
    # self.c_x_train = self.c_x_train.values
    # self.c_x_test = self.c_x_test.values
    # self.c_y_train = self.c_y_train.values
    # self.c_y_test = self.c_y_test.values
    #
    # print("0. SMOTE")
    # self.c_x_train_SMOTE, self.c_y_train_SMOTE = SMOTE().fit_resample(
    #     self.c_x_train, self.c_y_train)
    #
    # print("0. ADASYN")
    # self.c_x_train_ADASYN, self.c_y_train_ADASYN = ADASYN().fit_resample(
    #     self.c_x_train, self.c_y_train)

    print("0. Prepare the Final Data Sets (Regression)")
    self.r_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount', 'Outlier'], axis=1)
    self.r_Y = df.ClaimAmount
    self.r_x_train, self.r_x_test, self.r_y_train, self.r_y_test = \
        sklearn.model_selection.train_test_split(
            self.r_X, self.r_Y, test_size=0.30, shuffle=shuffle)

    print("0. Prepare the Final Data Sets (Outlier)")
    self.o_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount', 'Outlier'], axis=1)
    self.o_Y = df.Outlier
    self.o_x_train, self.o_x_test, self.o_y_train, self.o_y_test = \
        sklearn.model_selection.train_test_split(
            self.o_X, self.o_Y, test_size=0.30, shuffle=shuffle)

    print("0. Aggressive Regression")
    df_aggressive_regression = df[:int(0.7 * df.shape[0])]
    df_aggressive_regression = df_aggressive_regression[
        df_aggressive_regression['ClaimAmount'] > 0]
    print(df_aggressive_regression.shape)
    OUTLIER_CUTOFF = 4647
    df_aggressive_regression = df_aggressive_regression[
        df_aggressive_regression['ClaimAmount'] < OUTLIER_CUTOFF]
    # 'Outlier' is derived from ClaimAmount, the regression target, so drop it
    # here as well to avoid leaking the target into the features.
    self.r_x_train_aggressive = df_aggressive_regression.drop(
        ['rowIndex', 'Claimed', 'ClaimAmount', 'Outlier'], axis=1)
    self.r_y_train_aggressive = df_aggressive_regression.ClaimAmount
data = []
targets = []
for doc in docs:
    doc_data = doc['fields']['analyzed_text'][0]
    if len(doc_data) > 0:
        text = ''
        for word in doc_data:
            text = text + ' ' + word
        data.append(text)
        if doc['fields']['label'][0] == 'negative':
            targets.append(0)
        else:
            targets.append(1)
    else:
        print('found empty doc')

print(len(data))

mapper4 = sklearn_pandas.DataFrameMapper([
    ('text', sklearn.feature_extraction.text.CountVectorizer()),
], sparse=True)
dataframe = pandas.DataFrame({'text': data})
matrix = mapper4.fit_transform(dataframe)

clf = sklearn.linear_model.LogisticRegression()
clf.fit(matrix, targets)

print(matrix[2:3])
print(clf.predict(matrix[2:3]))
print(targets[2])

# this does not work because of https://github.com/jpmml/jpmml-sklearn/issues/4
sklearn2pmml.sklearn2pmml(clf, mapper4, "naive_bayes.pmml", with_repr=True)
"c4": num, "s5": cat, "c5": num, "hand": cat } d_in = pd.read_csv("..\\data\\poker-hand-training-true.data", names=col_names, dtype=col_types) # features can't be parallelly processed by different transformer in a single DataFrameMapper pipeline object # pipeline1 also passes through 'y' (comes back as the 3rd column) engineered_feature_pipeline1 = skp.DataFrameMapper( [(['s1', 's2', 's3', 's4', 's5'], uf.Comparator(criteria=5), { 'alias': 'suit_match' }), (['c1', 'c2', 'c3', 'c4', 'c5'], uf.Comparator(criteria=2), { 'alias': 'no_pairs' })], input_df=True, df_out=True, default=None) #temp = d_in[d_in['hand']=='5'] #engineered_feature_pipeline1.fit_transform(temp).head() engineered_feature_pipeline2 = skp.DataFrameMapper( [(['c1', 'c2', 'c3', 'c4', 'c5'], uf.Comparator(criteria=3), { 'alias': 'has_triplet' })], input_df=True, df_out=True, default=False)
def make_mapper_from_transformations(transformations):
    return sklearn_pandas.DataFrameMapper(
        [t.as_input_transformer_tuple() for t in transformations],
        input_df=True)
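# `as_input_transformer_tuple` is not shown in these snippets. A minimal sketch
# of a transformation object that would satisfy make_mapper_from_transformations;
# the class name and its field are hypothetical:
import sklearn.preprocessing

class ScaleColumn:
    def __init__(self, column):
        self.column = column

    def as_input_transformer_tuple(self):
        # (input columns, transformer) in the shape DataFrameMapper expects
        return ([self.column], sklearn.preprocessing.StandardScaler())

# mapper = make_mapper_from_transformations([ScaleColumn('age'), ScaleColumn('income')])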
def load_trainingset(self, shuffle):
    print("<==== ====", inspect.stack()[0][3], "==== ====>")
    df = pd.read_csv("trainingset.csv")
    continuous_features = ["1", "2", "6", "8", "10"]
    categorical_features = [
        "3", "4", "5", "7", "9", "11", "12", "13",
        "14", "15", "16", "17", "18"
    ]
    df['Claimed'] = np.where(df['ClaimAmount'] > 0, 1, 0)
    col_list = list(df.columns)
    col_list.remove('rowIndex')
    col_list.remove('Claimed')
    col_list.remove('ClaimAmount')
    col_list.insert(0, 'ClaimAmount')
    col_list.insert(0, 'Claimed')
    col_list.insert(0, 'rowIndex')
    df = df[col_list]
    df = pd.get_dummies(
        df,
        columns=["feature" + n for n in categorical_features],
        dtype=np.int64)
    transform_mapper = sklearn_pandas.DataFrameMapper(
        [
            ('rowIndex', None),
            ('Claimed', None),
            ('ClaimAmount', None),
        ],
        default=sklearn.preprocessing.StandardScaler())
    standardized = transform_mapper.fit_transform(df.copy())
    df = pd.DataFrame(standardized, columns=df.columns)

    print("0. Prepare the Final Data Sets (Classification)")
    self.c_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount'], axis=1)
    self.c_Y = df.Claimed
    # <Polynomial Features>
    # poly = sklearn.preprocessing.PolynomialFeatures(2, include_bias=True)
    # self.c_X = poly.fit_transform(self.c_X)
    # <Power Transformer>
    # power = sklearn.preprocessing.PowerTransformer()
    # power.fit(self.c_X)
    # self.c_X = power.transform(self.c_X)
    # <Quantile Transform>
    # self.c_X = sklearn.preprocessing.quantile_transform(
    #     self.c_X, axis=0, n_quantiles=1000, output_distribution='uniform',
    #     ignore_implicit_zeros=False, subsample=100000, random_state=None,
    #     copy=False)
    # <PCA>
    # pca = sklearn.decomposition.PCA(n_components=9, copy=True, whiten=False,
    #                                 svd_solver='auto', tol=0.0,
    #                                 iterated_power='auto', random_state=None)
    # pca = sklearn.decomposition.TruncatedSVD(n_components=300,
    #                                          algorithm='randomized',
    #                                          n_iter=100, random_state=None,
    #                                          tol=0.0)
    # pca.fit(self.c_X)
    # self.c_X = pca.transform(self.c_X)
    self.c_x_train, self.c_x_test, self.c_y_train, self.c_y_test = \
        sklearn.model_selection.train_test_split(
            self.c_X, self.c_Y, test_size=0.30, shuffle=shuffle)

    print("0. Prepare the Final Data Sets (Regression)")
    self.r_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount'], axis=1)
    self.r_Y = df.ClaimAmount
    self.r_x_train, self.r_x_test, self.r_y_train, self.r_y_test = \
        sklearn.model_selection.train_test_split(
            self.r_X, self.r_Y, test_size=0.30, shuffle=shuffle)

    print("0. Aggressive Regression")
    # Train only on the first 70% of rows, keep positive claims, and drop
    # extreme claims above the cutoff.
    df_aggressive_regression = df[:int(0.7 * df.shape[0])]
    df_aggressive_regression = df_aggressive_regression[
        df_aggressive_regression['ClaimAmount'] > 0]
    print(df_aggressive_regression.shape)
    OUTLIER_CUTOFF = 4647
    df_aggressive_regression = df_aggressive_regression[
        df_aggressive_regression['ClaimAmount'] < OUTLIER_CUTOFF]
    self.r_x_train_aggressive = df_aggressive_regression.drop(
        ['rowIndex', 'Claimed', 'ClaimAmount'], axis=1)
    self.r_y_train_aggressive = df_aggressive_regression.ClaimAmount
    print(df_aggressive_regression.shape)
def step(self):
    import pandas as pd
    import sklearn.model_selection as model_selection
    import sklearn as skl
    import sklearn_pandas as skp
    import hpogrid
    import glob

    if self.run_mode == "grid":
        data_directory = hpogrid.get_datadir()
        print("inDS dir: {}".format(data_directory))
        data_files = glob.glob(data_directory + "*.csv")
        print("--> inDS file: {}".format(data_files))
        for f in data_files:
            all_data = pd.read_csv(f)
    elif self.run_mode == "local":
        all_data = pd.read_csv(
            '/afs/cern.ch/work/s/ssevova/public/dark-photon-atlas/plotting/'
            'trees/v08/tight-and-ph-skim/mc16d/dataLists/all_data')

    # Remove negative weights for training
    all_data = all_data[all_data['w'] > 0]

    # Load the data & split by train/test
    X = all_data
    y = all_data['event']
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.3, random_state=0)
    X_train = X_train[self.varw]
    X_test = X_test[self.varw]
    cols = X_train.columns
    itrain = X_train.index
    itest = X_test.index
    wtest_unscaled = X_test['w']

    # Scaling: fit the scaler on the training set only and reuse it for the
    # test set; refitting on the test set would leak its statistics. (The
    # spurious len(cols) second argument to fit_transform is dropped; that
    # slot is the target vector, not a column count.)
    mapper = skp.DataFrameMapper([(cols, skl.preprocessing.StandardScaler())])
    scaled_train = mapper.fit_transform(X_train.copy())
    scaled_test = mapper.transform(X_test.copy())
    X_scaled_train = pd.DataFrame(scaled_train, index=itrain, columns=cols)
    X_scaled_test = pd.DataFrame(scaled_test, index=itest, columns=cols)
    wtest = X_scaled_test['w']
    wtrain = X_scaled_train['w']

    # Deal with weights
    self.varw.remove("w")
    X_train = X_scaled_train[self.varw]
    X_test = X_scaled_test[self.varw]

    history = self.model.fit(X_train, y_train,
                             epochs=self.epochs,
                             batch_size=self.batchsize,
                             validation_data=(X_test, y_test))
    train_loss, train_acc = self.model.evaluate(X_train, y_train, verbose=2)
    test_loss, test_acc = self.model.evaluate(X_test, y_test, verbose=2)
    probs = self.model.predict(X_test)
    predictions = self.model.predict_classes(X_test)
    fpr, tpr, threshold = skl.metrics.roc_curve(
        y_test, probs, sample_weight=wtest_unscaled)
    auc = skl.metrics.auc(fpr, tpr)

    # It is important to return tf.Tensors as numpy objects.
    return {
        "epoch": self.iteration,
        "loss": train_loss,
        "accuracy": train_acc,
        "auc": auc,
        "test_loss": test_loss,
        "test_accuracy": test_acc
    }
# -*- coding: utf-8 -*-
import pandas as pd
import sklearn_pandas
import sklearn2pmml
from sklearn import preprocessing
from sklearn import svm
from sklearn import pipeline

xxx = []
data = pd.read_csv("./01_16_ip_1_feature.txt")
# Standardize every column individually.
mapper = sklearn_pandas.DataFrameMapper([([i], preprocessing.StandardScaler())
                                         for i in data.columns])
train = mapper.fit_transform(data)

pipeline_estimator = pipeline.Pipeline([('estimator',
                                         svm.OneClassSVM(nu=0.015,
                                                         kernel="rbf",
                                                         gamma=0.04))])
pipeline_estimator.fit(train)

data1 = pd.read_csv("./17_ip_1_feature.txt")
train1 = mapper.transform(data1)  # reuse the scaler fitted on the training file
pred_test = pipeline_estimator.predict(train1)
# Collect the indices of records flagged as outliers (prediction != 1).
for i in range(len(pred_test)):
    if pred_test[i] != 1:
        xxx.append(i)
# xxx = pred_test[pred_test == -1]
# sklearn2pmml.sklearn2pmml(pipeline_estimator, mapper, "./track_model.pmml",
#                           with_repr=True, debug=True)
    # (tail of a transformer's transform method; the enclosing definition is not shown)
    for i in range(X.shape[0]):
        x = url_regex.sub(r'\g<1>\g<3>', X[i])
        x = media_regex.sub(r'\g<1>\g<3>', x)
        result[i] = x
    return result


if __name__ == '__main__':
    df = pd.DataFrame({'bio': ["""
i am just %URL% copy pasting my answer to an already answered question
Shubham %URL% Bhardwaj's answer to What life lessons are counter-intuitive
or go against common sense or wisdom? %URL% %MEDIA%
ACTIONS %MEDIA% LIE LOUDER THAN WORDS:
I have grown up listening to “ACTIONS SPEAK LOUDER THAN WORDS” whole my life.
""", """
i am just copy pasting my answer to an already answered question
%MEDIA% %MEDIA%
""", ""]})

    extractor = UrlAndMediaTextExtractor()
    stripper = UrlAndMediaTextStripper()

    # Two entries for the same input column: the second output is renamed via 'alias'.
    mapper = sklearn_pandas.DataFrameMapper([
        ('bio', extractor),
        ('bio', stripper, {'alias': 'bio_stripped'})
    ])
    # fit_transform: the mapper must be fitted before it can transform.
    print(mapper.fit_transform(df))
    print(mapper.transformed_names_)
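# UrlAndMediaTextExtractor and UrlAndMediaTextStripper are not defined in these
# snippets. A minimal sketch of a stateless transformer with the same shape
# (the class below and its regex are hypothetical stand-ins): it removes the
# %URL% and %MEDIA% placeholders from each text.
import re
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class PlaceholderStripper(BaseEstimator, TransformerMixin):
    """Hypothetical stand-in: drop %URL% and %MEDIA% tokens from each text."""
    _token = re.compile(r'%(URL|MEDIA)%\s*')

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn

    def transform(self, X):
        return np.array([self._token.sub('', text) for text in X])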