def balance(self, dataDF, type):
    lengths = []
    dataDFMap = {}
    for l in self.allLabels:
        dataDFMap[l] = list(c for c in dataDF if c[1] == l)
    if (type == 1):
        maxCount = max(len(dataDFMap[l]) for l in self.allLabels)
        for l in self.allLabels:
            if (0 < len(dataDFMap[l]) < maxCount):
                dataDF = dataDF + resample(
                    dataDFMap[l],
                    replace=True,  # sample with replacement
                    n_samples=maxCount - len(dataDFMap[l]),  # to match majority class
                    random_state=1)  # reproducible results
    elif (type == 2):
        minCount = min(len(dataDFMap[l]) for l in self.allLabels)
        for l in self.allLabels:
            if (0 < len(dataDFMap[l]) > minCount):
                dataDF = dataDF + resample(
                    dataDFMap[l],
                    replace=True,  # sample with replacement
                    n_samples=len(dataDFMap[l]) - minCount,  # extra samples beyond the minority-class count
                    random_state=1)  # reproducible results
    elif (type == 3):
        X = np.asarray([list(t[0].values()) for t in dataDF], dtype=bool)
        Y = np.asarray([t[1] for t in dataDF])
        pipe = imbPipeline([('RandomOverSampler', RandomOverSampler(random_state=0))])
        X_resampled, y_resampled = pipe.fit_resample(X, Y)  # fit_sample in older imblearn releases
        dataDF_resampled = []
        feature_names = list(dataDF[0][0].keys())
        for i, x in enumerate(X_resampled):
            dataDF_resampled.append((dict(zip(feature_names, x)), y_resampled[i]))
        dataDF = dataDF_resampled
    else:
        X = np.asarray([list(t[0].values()) for t in dataDF], dtype=bool)
        Y = np.asarray([t[1] for t in dataDF])
        pipe = imbPipeline([('RandomOverSampler', RandomOverSampler(random_state=0)),
                            ('smote', SVMSMOTE())])  # SMOTE(kind='svm') in older imblearn releases
        X_resampled, y_resampled = pipe.fit_resample(X, Y)
        dataDF_resampled = []
        feature_names = list(dataDF[0][0].keys())
        for i, x in enumerate(X_resampled):
            dataDF_resampled.append((dict(zip(feature_names, x)), y_resampled[i]))
        dataDF = dataDF_resampled
    for l in self.allLabels:
        lengths.append(len(dataDFMap[l]))
    return dataDF
def get_pipeline(template):
    lookup_dict = {
        'transformer:standard_scaler': StandardScaler(),
        'transformer:power_transformer': PowerTransformer(),
        'sampler:ros': RandomOverSampler(random_state=313),
        'sampler:smote': SMOTE(random_state=313),
        'selector:remove_correlated': CorrelationThreshold(),
        'selector:remove_nonnormal': SelectNormal(),
        'selector:from_correlated2pca': SelectFromPCA(),
        'selector:sfm_lr': SelectKBestFromModel(LogisticRegression(solver='saga', random_state=313)),
        'selector:sfm_et': SelectKBestFromModel(ExtraTreesClassifier(random_state=313)),
        'selector:sfm_gb': SelectKBestFromModel(GradientBoostingClassifier(random_state=313)),
        'selector:sfm_xgb': SelectKBestFromModel(XGBClassifier(eval_metric='logloss',
                                                               use_label_encoder=False,
                                                               random_state=313)),
        'classifier:lr': LogisticRegression(solver='saga', random_state=313),
        'classifier:dt': DecisionTreeClassifier(random_state=313),
        'classifier:et': ExtraTreesClassifier(random_state=313),
        'classifier:gb': GradientBoostingClassifier(random_state=313),
        'classifier:xgb': XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=313),
    }
    steps = list()
    for step in template:
        steps.append((step, lookup_dict[step]))
    try:
        pipeline = Pipeline(steps=steps)
    except TypeError:
        # fall back to imblearn's Pipeline if sklearn's rejects a sampler step
        pipeline = imbPipeline(steps=steps)
    return pipeline
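# A minimal usage sketch for get_pipeline above. The template strings are assumed to be
# keys of lookup_dict; only get_pipeline itself is taken from this snippet. Including a
# 'sampler:*' step is what the fallback branch is intended to route to an imblearn pipeline.
template = ['transformer:standard_scaler', 'sampler:smote', 'classifier:lr']
pipe = get_pipeline(template)
# pipe.fit(X_train, y_train) would then apply SMOTE during fitting only, never at predict time.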
def rf_cv(X_train, y_train, cv=5, verbose=False):
    """
    Fits and trains a Random Forest model with GridSearchCV

    args:
        X_train (array): Train dataset with features
        y_train (array): Train target
        cv (object): Default 5-fold cross-validation
        verbose (bool): True to see verbose GridSearchCV output
    returns:
        model (estimator): best estimator by F2 score
    """
    rf = RandomForestClassifier()
    params = [{
        'rf__n_estimators': range(50, 450, 50),
        'rf__max_depth': [5, 10],
        'rf__min_samples_split': [2, 10, 20],
        'rf__max_features': ['sqrt', 8, 10],
        'rf__criterion': ['gini'],
    }]
    pipe = imbPipeline(steps=[('sample', SMOTE()), ('rf', rf)])
    model = GridSearchCV(pipe, params, cv=cv, n_jobs=-1, scoring=f2scorer, verbose=verbose)
    model.fit(X_train, y_train)
    return model
def logreg_cv(X_train, y_train, cv=5, verbose=False):
    """
    Fits and trains a Logistic Regression model with GridSearchCV

    args:
        X_train (array): Train dataset with features
        y_train (array): Train target
        cv (object): Default 5-fold cross-validation
        verbose (bool): True to see verbose GridSearchCV output
    returns:
        model (estimator): best estimator by F2 score
    """
    # saga is the only solver that supports every penalty in the grid below;
    # 'elasticnet' additionally requires l1_ratio to be set.
    logreg = LogisticRegression(solver='saga')
    # weights = np.linspace(0.05, 0.95, 10)
    params = [{
        'logreg__penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'logreg__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        # 'logreg__class_weight': [{0: x, 1: 1 - x} for x in weights]
    }]
    pipe = imbPipeline(steps=[('sample', SMOTE()), ('logreg', logreg)])
    model = GridSearchCV(pipe, params, cv=cv, n_jobs=-1, scoring=f2scorer, verbose=verbose)
    model.fit(X_train, y_train)
    return model
def knn_cv(X_train, y_train, cv=5, verbose=False):
    """
    Fits and trains the KNeighborsClassifier with a GridSearchCV

    args:
        X_train (array): Train dataset with features
        y_train (array): Train target
        cv (object): Default 5-fold cross-validation
        verbose (bool): True to see verbose GridSearchCV output
    returns:
        model (estimator): best estimator by F2 score
    """
    pipe = imbPipeline(
        steps=[('sample', SMOTE()),
               ('scaler', MinMaxScaler()),
               ('knn', KNeighborsClassifier())])
    params = [{'knn__n_neighbors': range(2, 25), 'knn__p': [1, 2]}]
    model = GridSearchCV(pipe, params, cv=cv, n_jobs=-1, scoring=f2scorer, verbose=verbose)
    model.fit(X_train, y_train)
    return model
def train_model(self):
    print("Training model....")
    df = self.load_dataset_samples()
    clf = imbPipeline([('vect', TfidfVectorizer(ngram_range=(1, 2))),
                       ('tfidf', TfidfTransformer(use_idf=False)),
                       ('over-sampling', RandomOverSampler()),
                       ('clf', LogisticRegression(multi_class='multinomial',
                                                  solver='newton-cg'))])
    clf.fit(df.normalized_tweet, df.sentiment)
    pickle.dump(clf, open(self.filename, 'wb'))
    return clf
def get_pipeline(self, classifier):
    vect = self.vectorizer.get_vectorizer(vectorizer_type=VECTORIZER_TYPE,
                                          tokenizer_type=TOKENIZER_TYPE)
    resampler = self.resample.get_resampler(
        resampler_type=RESAMPLER_TYPE,
        sampling_strategy=SAMPLING_STRATEGY,
        k_neighbors=K_NEIGHBORS,
        allow_minority=True)
    imbpipe = imbPipeline(steps=[('vect', vect),
                                 ('resample', resampler),
                                 ('clf', classifier)],
                          verbose=VERBOSE)
    pipeline = Pipeline(steps=[('vect', vect), ('clf', classifier)], verbose=VERBOSE)
    return imbpipe if (APPLY_RESAMPLE == True) else pipeline
def resampling():
    """
    Function to do oversampling and undersampling

    Returns:
        sampling_pipeline: a sampling pipeline with oversampling and undersampling steps
    """
    oversampling = over_sampling.SMOTE(random_state=42)
    undersampling = under_sampling.RandomUnderSampler(random_state=42)
    sampling_pipeline = imbPipeline([
        ('oversample', oversampling),
        ('undersample', undersampling)
    ])
    return sampling_pipeline
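# A small usage sketch for resampling() above, assuming X and y are a hypothetical
# imbalanced feature matrix and label vector. An imblearn pipeline that ends in a
# sampler exposes fit_resample, so SMOTE and RandomUnderSampler apply in order.
sampler = resampling()
X_balanced, y_balanced = sampler.fit_resample(X, y)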
def run_model():
    '''
    Trains the best model, and pickles the model at /models/final_model.pkl

    returns dict BayesSearchCV results
    '''
    logger = logging.getLogger(__name__)
    logger.info('Training model')

    project_dir = Path(__file__).resolve().parents[2]
    obs_features_path = os.path.join(project_dir, 'data', 'processed',
                                     'observations_features.csv')
    obs = pd.read_csv(obs_features_path)

    logger.info('Train / test split')
    # perform train test split
    X_train, X_test, y_train, y_test = create_Xy(obs)

    # create model
    pipe = imbPipeline([('ss', StandardScaler()),
                        ('smote', SMOTE(random_state=RANDOM_STATE, k_neighbors=6)),
                        ('lm', LogisticRegression(
                            random_state=RANDOM_STATE,
                            solver='saga',
                            C=1e5,
                            penalty='l2',
                        ))])

    logger.info('Fitting training data.')
    pipe.fit(X_train, y_train)

    y_test_probs = pipe.predict_proba(X_test)[:, 1]
    test_auc = metrics.roc_auc_score(y_test, y_test_probs)
    logger.info(f'Test AUC score: {test_auc:.3}')

    logger.info('Pickling model')
    model_path = os.path.join(project_dir, 'models', 'final_model.pkl')
    pickle.dump(pipe, open(model_path, 'wb'))
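# Companion sketch for run_model() above: reloading the pickled pipeline for scoring.
# The path mirrors the models/final_model.pkl location written by run_model(), and
# X_new is a hypothetical feature matrix shaped like the training data.
with open(os.path.join('models', 'final_model.pkl'), 'rb') as f:
    model = pickle.load(f)
new_probs = model.predict_proba(X_new)[:, 1]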
def _get_pipeline(self, pipeline_dic):
    pipeline_steps = []
    for step in pipeline_dic.keys():
        kwargs = self.pipeline_options.get(step, {})
        if not kwargs:
            warnings.warn(
                'Default parameters are loaded for {0} (see corresponding class for detailed kwargs)'
                .format(step))
        if self._string_processing(step) == 'dimensionalityreduction':
            if callable(pipeline_dic[step]):
                step_object = DimensionalityReduction(pipeline_dic[step](**kwargs))
            else:
                step_object = DimensionalityReduction(pipeline_dic[step], **kwargs)
        elif self._string_processing(step) == 'featureselection':
            step_object = FeatureSelection(pipeline_dic[step], **kwargs)
        else:
            step_object = pipeline_dic[step](**kwargs)
        pipeline_steps.append((step, step_object))
    if self.imblearn_pipeline:
        return imbPipeline(pipeline_steps)
    else:
        return skPipeline(pipeline_steps)
def post(self):
    # Get the hazard labels
    abs_filename = ett_h.generate_dynamic_path(
        [base_folder_location, LabelType.HAZARD.value, label_file_name])
    labels = (ett_h.load_data_common_separated(abs_filename, ','))
    # Get the label data from input_data
    raw_label = TrainHazardUpload.input_data[ColumnName.LABEL.value]
    data = ett_t.transform_data_to_dataframe_basic(TrainHazardUpload.input_data, colnames)
    # Get the OneHotEncoded labels
    label_df = ett_t.one_hot_encoding(raw_label)  # 17 labels dataframe
    # Rename the OneHotEncoded labels
    label_df.columns = labels
    # Get the number of labels
    num_of_labels = len(labels)

    # Data preprocessing
    nan_cleaned_data = ett_c.clean_dataframe_by_regex(
        data, RegexFilter.NON_ALPHA_NUMERIC.value)  # Remove all non-alphanumeric characters
    d_cleaned_data = ett_c.clean_dataframe_by_regex(
        nan_cleaned_data, RegexFilter.DIGITS_ONLY.value)  # Remove all digits
    l_cleaned_data = ett_c.remove_non_iso_words(
        d_cleaned_data, Language.ENGLISH.value)  # Remove non-English text
    rew_cleaned_data = ett_c.remove_language_stopwords(
        l_cleaned_data, Language.ENGLISH.name)  # Remove English stop words
    l_transformed_data = ett_t.lowercase(rew_cleaned_data)  # Transform text to lowercase
    le_transformed_data = ett_t.stemming_mp(
        l_transformed_data)  # Transform text to core words i.e. playing > play
    data = le_transformed_data  # Return the newly transformed data

    # Split the data into 0.8 training datasets and 0.2 testing datasets
    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        label_df,
                                                        test_size=0.2,
                                                        random_state=42)
    endpoint_output = {}
    # for i in range(num_of_labels):
    for i in range(2):
        model_id = str(i)
        single_label = y_train.iloc[:, i]
        label = labels[i]
        print("label", label)
        pipeline = imbPipeline([
            (ModelType.TFIDF.value, TfidfVectorizer()),            # Data vectorization
            (ModelType.OVERSAMPLE.value, SMOTE(random_state=42)),  # Data balancing
            (ModelType.SVD.value, TruncatedSVD()),                 # Feature selection
            (ModelType.NOR.value, preprocessing.MinMaxScaler()),   # Data normalization
            (ModelType.CLF.value, OneVsRestClassifier(SVC()))      # Classification
        ])
        # list_c = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]
        list_c = [1]
        # list_n = [100, 150, 200, 250, 300, 350, 400, 450, 500, 550]
        list_n = [100]  # Remember to add[2,\]2]
        best_score = 0
        epsilon = .05
        dictionary = {}
        for para_c in list_c:
            for para_n in list_n:
                parameters = {
                    ModelType.TFIDF.value: [
                        TfidfVectorizer(max_features=800,
                                        ngram_range=(1, 4),
                                        norm='l2',
                                        encoding='latin-1',
                                        stop_words='english',
                                        analyzer='word')
                    ],
                    ModelType.SVD.value: [
                        TruncatedSVD(n_components=para_n, n_iter=7, random_state=42)
                    ],
                    ModelType.CLF.value: [
                        OneVsRestClassifier(SVC(kernel='linear', probability=True, C=para_c))
                    ]
                }
                # gs_clf = GridSearchCV(pipeline, parameters, cv=5, error_score='raise', n_jobs=-1)
                # gs_clf = GridSearchCV(pipeline, parameters, cv=2, error_score='raise', scoring="f1")
                gs_clf = GridSearchCV(pipeline, parameters, cv=5,
                                      error_score='raise', scoring="f1")
                gs_clf = gs_clf.fit(X_train, single_label)
                current_score = gs_clf.best_score_
                dictionary[current_score] = parameters
        for current_score in dictionary.keys():
            if current_score - epsilon > best_score:
                best_score = current_score
        model_dict = dictionary[best_score]
        label_model_list = {}
        label_model_list['score'] = best_score
        folder_time = time.strftime("_%Y%m%d_%H%M")
        # Create Directory in the AWS S3 Bucket
        os.mkdir("/Users/yihanbao/Desktop/unisdr-training/hazard/" + label + "/" +
                 label + folder_time)
        # Navigate to AWS model saving folder
        model_folder = os.path.join(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(
                        os.path.dirname(os.path.realpath(__file__))))),
            ett_h.generate_dynamic_path(
                [LabelType.HAZARD.value, label, label + folder_time]))
        """
        # Connect to AWS
        conn = boto.s3.connect_to_region(" ",
                                         aws_access_key_id = 'AWS-Access-Key',
                                         aws_secret_access_key = 'AWS-Secrete-Key',
                                         calling_format = boto.s3.connection.OrdinaryCallingFormat())
        bucket = conn.get_bucket("oict-psdg-unisdr-train-models-v1")
        # AWS Key
        aws_path = ett_h.generate_dynamic_path([LabelType.HAZARD.value, label, timestamp+label])
        """
        # Here to fit the training datasets to the models with best score
        # Vectorization
        vector = model_dict[ModelType.TFIDF.value][0].fit(X_train, single_label)
        ett_h.save_model(
            vector,
            ett_h.generate_dynamic_path(
                [model_folder, label + folder_time + vector_model_name]))
        vectorized_df = vector.transform(X_train)
        label_model_list[URLName.VECURL.value] = ett_h.generate_dynamic_path(
            [model_folder, label + folder_time + vector_model_name])
        """
        key_name = timestamp+label+model_name
        full_key_name = os.path.join(path, key_name)
        pickle_byte_obj = pickle.dump(vector)
        s3_resource = resource('s3')
        s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
        """
        # Balancing
        sm = SMOTE(random_state=42)
        X_res, y_res = sm.fit_resample(vectorized_df, single_label)
        # Feature selection
        svd = model_dict[ModelType.SVD.value][0].fit(X_res, y_res)
        ett_h.save_model(
            svd,
            ett_h.generate_dynamic_path(
                [model_folder, label + folder_time + dim_reductor_model_name]))
        dim_reductor_df = svd.transform(X_res)
        label_model_list[URLName.DIMURL.value] = ett_h.generate_dynamic_path(
            [model_folder, label + folder_time + dim_reductor_model_name])
        """
        key_name = timestamp+label+dim_reductor_model_name
        full_key_name = os.path.join(path, key_name)
        pickle_byte_obj = pickle.dump(svd)
        s3_resource = resource('s3')
        s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
        """
        # Normalizing
        min_max_scaler = preprocessing.MinMaxScaler()
        nor_model = min_max_scaler.fit(dim_reductor_df, y_res)
        ett_h.save_model(
            nor_model,
            ett_h.generate_dynamic_path(
                [model_folder, label + folder_time + normalizar_model_name]))
        scaled_df = nor_model.transform(dim_reductor_df)
        label_model_list[URLName.NORURL.value] = ett_h.generate_dynamic_path(
            [model_folder, label + folder_time + normalizar_model_name])
        """
        key_name = timestamp+label+normalizar_model_name
        full_key_name = os.path.join(path, key_name)
        pickle_byte_obj = pickle.dump(nor_model)
        s3_resource = resource('s3')
        s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
        """
        # Classifier
        clf = model_dict[ModelType.CLF.value][0].fit(scaled_df, y_res)
        clf.fit(scaled_df, y_res)
        ett_h.save_model(
            clf,
            ett_h.generate_dynamic_path(
                [model_folder, label + folder_time + model_name]))
        label_model_list[URLName.MODURL.value] = ett_h.generate_dynamic_path(
            [model_folder, label + folder_time + model_name])
        """
        key_name = timestamp+label+model_name
        full_key_name = os.path.join(path, key_name)
        pickle_byte_obj = pickle.dump(scaled_df)
        s3_resource = resource('s3')
        s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
        """
        endpoint_output[model_id] = [label_model_list]
    output = json.dumps(endpoint_output)
    return output
# Sample
# SMOTE analysis
smt = SMOTE(random_state=123, sampling_strategy=1.0)  # `ratio=1.0` in older imblearn releases
x_train_sam, y_train_sam = smt.fit_resample(x_train, y_train)  # `fit_sample` in older releases
print(collections.Counter(y_train_sam))

# LR
sampled_logistic = LogisticRegression().fit(x_train_sam, y_train_sam)
sampled_pred = sampled_logistic.predict(x_test)
print(accuracy_score(y_test, sampled_pred))
print(confusion_matrix(y_test, sampled_pred))
recall_score(y_test, sampled_pred)

# GridSearch LR
pipe_lr = imbPipeline([('oversample', SMOTE(random_state=123, sampling_strategy=1.0)),
                       ('lr', LogisticRegression(solver='liblinear'))])  # liblinear supports both l1 and l2
skf = StratifiedKFold(n_splits=5)
param_grid_lr = [{'lr__C': [0.5, 1, 1.5, 2], 'lr__penalty': ['l1', 'l2']}]
grid_lr = GridSearchCV(pipe_lr, param_grid_lr, return_train_score=True,
                       n_jobs=-1, scoring='roc_auc', cv=skf)
grid_lr.fit(x_train, y_train)
print(grid_lr.best_score_)
y_pred = grid_lr.predict(x_test)
print(classification_report(y_test, y_pred))
optimised_lr = grid_lr.best_estimator_
                                MLPClassifier(random_state=1, solver='lbfgs',
                                              hidden_layer_sizes=100))]
estimators4 = [('scale', scale), ('clf', RandomForestClassifier(random_state=1))]
estimators5 = [('scale', scale),
               ('clf', LinearSVC(random_state=1, dual=False, max_iter=10000,
                                 fit_intercept=False, tol=10**-4, verbose=True))]

pipe1 = imbPipeline(estimators1)  # Use of imbPipeline makes possible the implementation of SMOTE techniques
pipe2 = imbPipeline(estimators2)
pipe3 = imbPipeline(estimators3)
pipe4 = imbPipeline(estimators4)
pipe5 = imbPipeline(estimators5)

param_grid1 = dict(clf__n_neighbors=Grid.KNN()['n_neighbors'],
                   clf__p=Grid.KNN()['p'],
                   clf__weights=Grid.KNN()['weights'])  # Parameters are gathered from a user-defined class.
param_grid2 = dict(clf__C=Grid.LR()['C'], clf__penalty=Grid.LR()['penalty'])
param_grid3 = dict(clf__alpha=Grid.NN()['alpha'],
                   clf__activation=Grid.NN()['activation'])
param_grid4 = dict(clf__n_estimators=Grid.RF()['n_estimators'],
def transform(X_train_df, y_train_df, X_test_df, y_test_df, clf,
              rebalancing=True, feature_selection=True):
    # impute missing data
    imputation_pipe = transformations.make_imputation_pipe()
    X_train = imputation_pipe.fit_transform(X_train_df)
    X_test = imputation_pipe.transform(X_test_df)

    # some feature engineering
    X_train['age_diff'] = abs(X_train['age'] - X_train['age_o'])
    X_test['age_diff'] = abs(X_test['age'] - X_test['age_o'])
    X_train_df['age_diff'] = X_train['age_diff']
    X_test_df['age_diff'] = X_test['age_diff']

    X_train['same_field'] = (X_train['field'] == X_train['field_o']).astype(int)
    X_test['same_field'] = (X_test['field'] == X_test['field_o']).astype(int)
    X_train.drop(columns=['field', 'field_o'], inplace=True)
    X_test.drop(columns=['field', 'field_o'], inplace=True)

    X_train['imp_race_diff'] = abs(X_train['imp_same_race'] - X_train['imp_same_race_o'])
    X_test['imp_race_diff'] = abs(X_test['imp_same_race'] - X_test['imp_same_race_o'])
    X_train['imp_rel_diff'] = abs(X_train['imp_same_rel'] - X_train['imp_same_rel_o'])
    X_test['imp_rel_diff'] = abs(X_test['imp_same_rel'] - X_test['imp_same_rel_o'])

    # encode data
    encoding_pipe = transformations.make_encoding_pipe(X_train)
    X_train = encoding_pipe.fit_transform(X_train)
    X_test = encoding_pipe.transform(X_test)

    y_train = y_train_df.values
    y_test = y_test_df.values

    # Rebalance data (if needed)
    if rebalancing:
        print('Rebalancing...')
        rebalance_pipe = imbPipeline([('rebalance', None), ('classify', clf)])
        grid = tuning.rebalance_grid(rebalance_pipe)
        grid.fit(X_train, y_train)
        sampler = grid.best_estimator_.named_steps['rebalance']
        if sampler:
            X_train, y_train = sampler.fit_resample(X_train, y_train)
        print('best params: {}'.format(grid.best_params_))

    # Perform feature selection (if needed)
    if feature_selection:
        print('Reducing dimensionality...')
        reduce_dim_pipe = Pipeline([('reduce_dim', None), ('classify', clf)])
        grid = tuning.reduce_dim_grid(reduce_dim_pipe)
        grid.fit(X_train, y_train)
        feature_selector = grid.best_estimator_.named_steps['reduce_dim']
        X_train = feature_selector.fit_transform(X_train, y_train)
        X_test = feature_selector.transform(X_test)
        print('best params: {}'.format(grid.best_params_))

    return X_train, y_train, X_test, y_test
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler  # missing in the original snippet; needed below
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from datetime import time, datetime
from imblearn.pipeline import Pipeline as imbPipeline

classifier = RandomForestClassifier()
undersample = RandomUnderSampler(sampling_strategy='majority')
oversample = RandomOverSampler(sampling_strategy='minority')

# Create pipeline using Bag of Words
pipe = imbPipeline([
    ("cleaner", predictors()),
    ('vectorizer', tfidf_vector),
    # ('sampler', oversample),
    ('classifier', classifier)
])

max_features = (50, 250, 500)
n_estimators = (50, 100, 300)
max_depth = (10, 30, 50)
min_samples_split = (2, 5, 10, 15, 100)
min_samples_leaf = (1, 2, 5, 10)

parameters = dict(vectorizer__max_features=max_features,
                  classifier__n_estimators=n_estimators,
                  classifier__max_depth=max_depth)

classifier.get_params().keys()
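# A hedged usage sketch for the pipeline and parameter grid above. It assumes the
# surrounding context supplies predictors() and tfidf_vector; train_X and train_y are
# hypothetical placeholders for the labelled text data this snippet works with.
grid = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=1)
grid.fit(train_X, train_y)
print(grid.best_params_)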