# Consolidated imports for the routines below. Assumptions: `em` is
# py_entitymatching and `xgb` is xgboost, as implied by the call sites.
# Project-level helpers that are referenced but defined elsewhere
# (`AbstractMachineLearningBase`, `timer`) are noted where they are used.
import json
import logging
import os
import pickle
import time
from pdb import set_trace

import numpy as np
import pandas as pd
import py_entitymatching as em
import scipy.sparse
import scipy.stats as stats
import xgboost as xgb
# imblearn's Pipeline is a drop-in replacement for sklearn's and is required
# for pipelines that contain a sampler such as SMOTE.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             make_scorer)
from sklearn.model_selection import (GridSearchCV, PredefinedSplit,
                                     RandomizedSearchCV, StratifiedKFold,
                                     train_test_split)
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

logger = logging.getLogger(__name__)


def run_magellan(train_set, valid_set, test_set, feature_combinations,
                 classifiers, experiment_name,
                 write_test_set_for_inspection=False):
    train_path = os.path.dirname(train_set)
    train_file = os.path.basename(train_set)
    test_path = os.path.dirname(test_set)
    test_file = os.path.basename(test_set)

    report_train_name = train_file.replace('.csv', '')
    report_test_name = test_file.replace('.csv', '')

    train_set_left = train_file.replace('pairs', 'left')
    train_set_right = train_file.replace('pairs', 'right')
    test_set_left = test_file.replace('pairs', 'left')
    test_set_right = test_file.replace('pairs', 'right')

    os.makedirs(os.path.dirname(
        '../../../reports/magellan/{}/'.format(experiment_name)),
        exist_ok=True)

    try:
        os.remove('../../../reports/magellan/{}/{}_{}.csv'.format(
            experiment_name, report_train_name, report_test_name))
    except OSError:
        pass

    with open(
            '../../../reports/magellan/{}/{}_{}.csv'.format(
                experiment_name, report_train_name, report_test_name),
            "w") as f:
        f.write(
            'feature#####model#####mean_train_score#####std_train_score#####mean_valid_score#####std_valid_score#####precision_test#####recall_test#####f1_test#####best_params#####train_time#####prediction_time#####feature_importance#####experiment_name#####train_set#####test_set\n'
        )

    for run in range(1, 4):
        for feature_combination in feature_combinations:
            A_t = em.read_csv_metadata(train_path + '/' + train_set_left,
                                       key='mag_id')
            B_t = em.read_csv_metadata(train_path + '/' + train_set_right,
                                       key='mag_id')
            # Load the pre-labeled training pairs
            S_t = em.read_csv_metadata(train_set,
                                       key='_id',
                                       ltable=A_t,
                                       rtable=B_t,
                                       fk_ltable='ltable_mag_id',
                                       fk_rtable='rtable_mag_id')

            A_gs = em.read_csv_metadata(test_path + '/' + test_set_left,
                                        key='mag_id')
            B_gs = em.read_csv_metadata(test_path + '/' + test_set_right,
                                        key='mag_id')
            # Load the pre-labeled gold-standard pairs
            S_gs = em.read_csv_metadata(test_set,
                                        key='_id',
                                        ltable=A_gs,
                                        rtable=B_gs,
                                        fk_ltable='ltable_mag_id',
                                        fk_rtable='rtable_mag_id')

            A_t.fillna('', inplace=True)
            A_gs.fillna('', inplace=True)
            B_t.fillna('', inplace=True)
            B_gs.fillna('', inplace=True)
            S_t.fillna('', inplace=True)
            S_gs.fillna('', inplace=True)

            ## DIRTY FIX, CLEAN UP!
            # Coerce empty-string prices to NaN so they can be cast to float
            if 'name' in A_t.columns:
                for frame, col in [(A_t, 'price'), (A_gs, 'price'),
                                   (B_t, 'price'), (B_gs, 'price'),
                                   (S_t, 'ltable_price'),
                                   (S_t, 'rtable_price'),
                                   (S_gs, 'ltable_price'),
                                   (S_gs, 'rtable_price')]:
                    frame[col] = frame[col].replace(r'^\s*$', np.nan,
                                                    regex=True)
                    frame[col] = frame[col].astype('float64')

            atypes1 = em.get_attr_types(A_t)
            atypes2 = em.get_attr_types(B_t)
            match_c = em.get_attr_corres(A_t, B_t)
            match_c['corres'] = []

            # Select the attributes to compare
            for feature in feature_combination:
                match_c['corres'].append((feature, feature))

            tok = em.get_tokenizers_for_matching()
            sim = em.get_sim_funs_for_matching()

            F_t = em.get_features(A_t, B_t, atypes1, atypes2, match_c, tok,
                                  sim)

            H_t = em.extract_feature_vecs(S_t,
                                          feature_table=F_t,
                                          attrs_after=['label', 'pair_id'],
                                          show_progress=False)
            H_gs = em.extract_feature_vecs(S_gs,
                                           feature_table=F_t,
                                           attrs_after='label',
                                           show_progress=False)

            H_t = H_t.fillna(-1)
            H_gs = H_gs.fillna(-1)

            validation_ids_df = pd.read_csv(valid_set)
            val_df = H_t[H_t['pair_id'].isin(
                validation_ids_df['pair_id'].values)]
            train_only_df = H_t[~H_t['pair_id'].isin(
                validation_ids_df['pair_id'].values)]

            train_only_df = train_only_df.drop(columns='pair_id')
            val_df = val_df.drop(columns='pair_id')

            train_only_df = train_only_df.sample(frac=1, random_state=42)

            # Ratio of negatives to positives, used for scale_pos_weight
            pos_neg = H_t['label'].value_counts()
            pos_neg = round(pos_neg[0] / pos_neg[1])

            # PredefinedSplit needs exactly one fold index per sample:
            # -1 keeps a row in every training split, 0 assigns it to the
            # single validation fold.
            train_ind = [-1] * len(train_only_df)
            val_ind = [0] * len(val_df)

            ps = PredefinedSplit(test_fold=np.concatenate((train_ind,
                                                           val_ind)))

            train_df = pd.concat([train_only_df, val_df])

            for k, v in classifiers.items():
                classifier = v['clf']
                if 'random_state' in classifier.get_params().keys():
                    classifier = classifier.set_params(
                        **{'random_state': run})

                # Add the pos/neg ratio to the XGBoost search space
                if k == 'XGBoost':
                    v['params']['scale_pos_weight'] = [1, pos_neg]

                model = RandomizedSearchCV(cv=ps,
                                           estimator=classifier,
                                           param_distributions=v['params'],
                                           random_state=42,
                                           n_jobs=4,
                                           scoring='f1',
                                           n_iter=500,
                                           pre_dispatch=8,
                                           return_train_score=True)

                feats_train = train_df.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'],
                    axis=1)
                labels_train = train_df['label']
                feats_gs = H_gs.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'],
                    axis=1)
                labels_gs = H_gs['label']

                try:
                    model.fit(feats_train, labels_train)
                except ValueError:
                    # Drop into the debugger if the search fails
                    set_trace()

                parameters = model.best_params_

                score_names = [
                    'mean_train_score', 'std_train_score',
                    'mean_test_score', 'std_test_score'
                ]
                scores = {}
                score_string = ''
                for name in score_names:
                    scores[name] = model.cv_results_[name][model.best_index_]
                    score_string = score_string + name + ': ' + str(
                        scores[name]) + ' '

                feature_names = list(feats_train.columns)

                if k == 'LogisticRegression' or k == 'LinearSVC':
                    most_important_features = model.best_estimator_.coef_
                    word_importance = zip(
                        feature_names,
                        most_important_features[0].tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                if k == 'RandomForest' or k == 'DecisionTree' or k == 'XGBoost':
                    most_important_features = \
                        model.best_estimator_.feature_importances_
                    word_importance = zip(
                        feature_names,
                        most_important_features.tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                if k == 'NaiveBayes':
                    word_importance = ''

                # Rebuild the learner with the best parameters found
                if k == 'LogisticRegression':
                    learner = LogisticRegression(random_state=run,
                                                 solver='liblinear',
                                                 **parameters)
                elif k == 'NaiveBayes':
                    learner = GaussianNB()
                elif k == 'DecisionTree':
                    learner = DecisionTreeClassifier(random_state=run,
                                                     **parameters)
                elif k == 'LinearSVC':
                    learner = LinearSVC(random_state=run, dual=False,
                                        **parameters)
                elif k == 'RandomForest':
                    learner = RandomForestClassifier(random_state=run,
                                                     n_jobs=4, **parameters)
                elif k == 'XGBoost':
                    learner = xgb.XGBClassifier(random_state=run, n_jobs=4,
                                                **parameters)
                else:
                    print('Learner is not a valid option')
                    break

                model = learner

                # Retrain on the training split only (without validation rows)
                feats_train = train_only_df.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'],
                    axis=1)
                labels_train = train_only_df['label']

                start = time.time()
                model.fit(feats_train, labels_train)
                end = time.time()
                train_time = end - start

                start = time.time()
                preds_gs = model.predict(feats_gs)
                end = time.time()
                pred_time = end - start

                gs_report = classification_report(labels_gs, preds_gs,
                                                  output_dict=True)

                feature_report = '+'.join(feature_combination)

                if write_test_set_for_inspection:
                    out_path = '../../../data/processed/wdc-lspc/inspection/{}/magellan/'.format(
                        experiment_name)
                    os.makedirs(os.path.dirname(out_path), exist_ok=True)

                    file_name = '_'.join([
                        os.path.basename(train_set),
                        os.path.basename(test_set), k, feature_report
                    ])
                    file_name = file_name.replace('.csv', '')
                    file_name += f'_{run}.pkl.gz'

                    test_inspection_df = S_gs.copy()
                    if k == 'LinearSVC':
                        proba_gs = model.decision_function(feats_gs).tolist()
                    else:
                        proba_gs = model.predict_proba(feats_gs).tolist()
                    test_inspection_df['pred'] = preds_gs
                    test_inspection_df['Class Prob'] = proba_gs
                    test_inspection_df.to_pickle(out_path + file_name,
                                                 compression='gzip')

                with open(
                        '../../../reports/magellan/{}/{}_{}.csv'.format(
                            experiment_name, report_train_name,
                            report_test_name), "a") as f:
                    f.write(feature_report + '#####' + k + '#####' +
                            str(scores['mean_train_score']) + '#####' +
                            str(scores['std_train_score']) + '#####' +
                            str(scores['mean_test_score']) + '#####' +
                            str(scores['std_test_score']) + '#####' +
                            str(gs_report['1']['precision']) + '#####' +
                            str(gs_report['1']['recall']) + '#####' +
                            str(gs_report['1']['f1-score']) + '#####' +
                            str(parameters) + '#####' + str(train_time) +
                            '#####' + str(pred_time) + '#####' +
                            str(word_importance[0:100]) + '#####' +
                            experiment_name + '#####' + report_train_name +
                            '#####' + report_test_name + '\n')
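
# Standalone illustration (not part of the original pipeline): how the
# PredefinedSplit used above pins RandomizedSearchCV to one fixed
# train/validation split instead of k-fold CV. Fold index -1 keeps a row in
# training for every candidate; index 0 puts it in the single validation
# fold. A minimal sketch on synthetic data:
def _predefined_split_demo():
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=100, random_state=0)
    # First 80 rows are train-only (-1), last 20 form the validation fold (0)
    fold = np.array([-1] * 80 + [0] * 20)
    search = RandomizedSearchCV(LogisticRegression(solver='liblinear'),
                                param_distributions={'C': [0.1, 1.0, 10.0]},
                                n_iter=3,
                                cv=PredefinedSplit(test_fold=fold),
                                scoring='f1',
                                random_state=0)
    search.fit(X, y)
    return search.best_params_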
class BaseClustering(AbstractMachineLearningBase):
    """Base class for classification.

    Subclasses are expected to set `pipeline_`, `param_search_` and
    `is_search` before calling `fit_`.

    Parameters
    ----------
    None

    Attributes
    ----------
    model_ : fitted model object, default None

    weights_ : ndarray of shape (n_class, n_features) if the model is a
        linear model, else shape (1, n_features), default None
        Feature weights of the fitted model.

    weights_norm_ : ndarray of shape (n_class, n_features) if the model is a
        linear model, else shape (1, n_features), default None
        Normalized feature weights, obtained by applying StandardScaler
        (z-score) to weights_.
    """

    def __init__(self,
                 search_strategy='grid',
                 k=2,
                 metric=accuracy_score,
                 n_iter_of_randomedsearch=10,
                 n_jobs=2,
                 location='cachedir',
                 verbose=False):
        self.search_strategy = search_strategy
        self.k = k
        self.metric = metric
        self.n_iter_of_randomedsearch = n_iter_of_randomedsearch
        self.n_jobs = n_jobs
        self.location = location
        self.verbose = verbose

        self.model_ = None
        self.weights_ = None
        self.weights_norm_ = None

    @timer  # project-level timing decorator, assumed defined elsewhere
    def fit_(self, x=None, y=None):
        """Fit the pipeline_."""
        # TODO: Extend to other cross-validation methods
        # TODO: When no parameter has more than one candidate value, skip
        # GridSearchCV/RandomizedSearchCV entirely to speed things up
        cv = StratifiedKFold(n_splits=self.k)  # Default is StratifiedKFold
        if self.is_search:
            if self.search_strategy == 'grid':
                self.model_ = GridSearchCV(
                    self.pipeline_,
                    n_jobs=self.n_jobs,
                    param_grid=self.param_search_,
                    cv=cv,
                    scoring=make_scorer(self.metric),
                    refit=True)
            elif self.search_strategy == 'random':
                self.model_ = RandomizedSearchCV(
                    self.pipeline_,
                    n_jobs=self.n_jobs,
                    param_distributions=self.param_search_,
                    cv=cv,
                    scoring=make_scorer(self.metric),
                    refit=True,
                    n_iter=self.n_iter_of_randomedsearch,
                )
            else:
                print("Please specify a search strategy!\n")
                return
        else:
            self.model_ = self.pipeline_

        self.model_.fit(x, y)

        # Delete the temporary cache before exiting
        # self.memory.clear(warn=False)
        return self

    def predict(self, x):
        y_hat = self.model_.predict(x)

        # Prefer a continuous score when the model provides one
        if hasattr(self.model_, 'decision_function'):
            y_prob = self.model_.decision_function(x)
        elif hasattr(self.model_, 'predict_proba'):
            y_prob = self.model_.predict_proba(x)[:, 1]
        else:
            y_prob = y_hat

        return y_hat, y_prob

    def get_weights_(self, x=None, y=None):
        """Get feature weights of the fitted model.

        If the model is a linear model, the weights are its coefficients.
        Otherwise, the weights are estimated with an occlusion test
        <Transfer learning improves resting-state functional connectivity
        pattern analysis using convolutional neural networks>.
        """
        if self.is_search:
            best_model = self.model_.best_estimator_
        else:
            best_model = self.model_

        feature_preprocessing = best_model['feature_preprocessing']
        dim_reduction = best_model.get_params().get('dim_reduction', None)
        feature_selection = best_model.get_params().get(
            'feature_selection', None)
        estimator = best_model['estimator']

        # Get weights according to model type: linear vs. nonlinear
        if hasattr(estimator, "coef_"):
            # Linear model: map coefficients back to the input space
            coef = estimator.coef_
            if feature_selection and (feature_selection != "passthrough"):
                self.weights_ = feature_selection.inverse_transform(coef)
            else:
                self.weights_ = coef

            if dim_reduction and (dim_reduction != "passthrough"):
                self.weights_ = dim_reduction.inverse_transform(
                    self.weights_)
        else:
            # Nonlinear model: occlusion test
            # TODO: This can be slow when there are many features
            x_reduced_selected = x.copy()
            if feature_preprocessing and (feature_preprocessing !=
                                          "passthrough"):
                x_reduced_selected = feature_preprocessing.fit_transform(
                    x_reduced_selected)
            if dim_reduction and (dim_reduction != "passthrough"):
                x_reduced_selected = dim_reduction.fit_transform(
                    x_reduced_selected)
            if feature_selection and (feature_selection != "passthrough"):
                x_reduced_selected = feature_selection.fit_transform(
                    x_reduced_selected, y)

            y_hat = self.model_.predict(x)
            score_true = self.metric(y, y_hat)
            len_feature = x_reduced_selected.shape[1]
            self.weights_ = np.zeros([1, len_feature])

            if len_feature > 1000:
                print(f"***There are {len_feature} features; computing the "
                      "weights may take a long time!***\n")
                print("***Consider reducing the dimensionality of the "
                      "features first.***\n")

            # Zero out one feature at a time and record the drop in score
            for ifeature in range(len_feature):
                print(f"Getting weight for feature {ifeature + 1}...\n")
                x_ = x_reduced_selected.copy()
                x_[:, ifeature] = 0
                y_hat = estimator.predict(x_)
                self.weights_[0, ifeature] = score_true - self.metric(
                    y, y_hat)

            # Map the weights back to the original feature space
            if feature_selection and (feature_selection != "passthrough"):
                self.weights_ = feature_selection.inverse_transform(
                    self.weights_)
            if dim_reduction and (dim_reduction != "passthrough"):
                self.weights_ = dim_reduction.inverse_transform(
                    self.weights_)

        # Normalize weights (z-score across features)
        self.weights_norm_ = StandardScaler().fit_transform(
            self.weights_.T).T
def run_wordcooc(train_set, valid_set, test_set, feature_combinations,
                 classifiers, experiment_name,
                 write_test_set_for_inspection=False):
    train_path = os.path.dirname(train_set)
    train_file = os.path.basename(train_set)
    test_path = os.path.dirname(test_set)
    test_file = os.path.basename(test_set)

    report_train_name = train_file.replace('.pkl.gz', '')
    report_test_name = test_file.replace('.pkl.gz', '')

    os.makedirs(os.path.dirname(
        '../../../reports/wordcooc/{}/'.format(experiment_name)),
        exist_ok=True)

    try:
        os.remove('../../../reports/wordcooc/{}/{}_{}.csv'.format(
            experiment_name, report_train_name, report_test_name))
    except OSError:
        pass

    with open(
            '../../../reports/wordcooc/{}/{}_{}.csv'.format(
                experiment_name, report_train_name, report_test_name),
            "w") as f:
        f.write(
            'feature#####model#####mean_train_score#####std_train_score#####mean_valid_score#####std_valid_score#####precision_test#####recall_test#####f1_test#####best_params#####train_time#####prediction_time#####feature_importance#####experiment_name#####train_set#####test_set\n'
        )

    for run in range(1, 4):
        for feature_combination in feature_combinations:
            train_original_df = pd.read_pickle(train_set,
                                               compression='gzip')
            gs_df = pd.read_pickle(test_set, compression='gzip')

            feature_file_name = train_file.replace('.pkl.gz', '_words.json')
            with open(train_path + '/feature-names/' +
                      feature_file_name) as json_data:
                words = json.load(json_data)

            validation_ids_df = pd.read_pickle(valid_set,
                                               compression='gzip')
            val_df = train_original_df[train_original_df['pair_id'].isin(
                validation_ids_df['pair_id'].values)]
            train_only_df = train_original_df[
                ~train_original_df['pair_id'].isin(
                    validation_ids_df['pair_id'].values)]

            train_only_df = train_only_df.sample(frac=1, random_state=42)

            # Ratio of negatives to positives, used for scale_pos_weight
            pos_neg = train_original_df['label'].value_counts()
            pos_neg = round(pos_neg[0] / pos_neg[1])

            # PredefinedSplit needs exactly one fold index per sample:
            # -1 keeps a row in training, 0 assigns it to the validation fold
            train_ind = [-1] * len(train_only_df)
            val_ind = [0] * len(val_df)

            ps = PredefinedSplit(test_fold=np.concatenate((train_ind,
                                                           val_ind)))

            train_df = pd.concat([train_only_df, val_df])

            for k, v in classifiers.items():
                classifier = v['clf']
                if 'random_state' in classifier.get_params().keys():
                    classifier = classifier.set_params(
                        **{'random_state': run})

                # Add the pos/neg ratio to the XGBoost search space
                if k == 'XGBoost':
                    v['params']['scale_pos_weight'] = [1, pos_neg]

                model = RandomizedSearchCV(cv=ps,
                                           estimator=classifier,
                                           param_distributions=v['params'],
                                           random_state=42,
                                           n_jobs=4,
                                           scoring='f1',
                                           n_iter=500,
                                           pre_dispatch=8,
                                           return_train_score=True)

                feats_train = scipy.sparse.vstack(
                    train_df[feature_combination + '_wordcooc'])
                labels_train = train_df['label']
                feats_gs = scipy.sparse.vstack(
                    gs_df[feature_combination + '_wordcooc'])
                labels_gs = gs_df['label']

                model.fit(feats_train, labels_train)

                parameters = model.best_params_

                score_names = [
                    'mean_train_score', 'std_train_score',
                    'mean_test_score', 'std_test_score'
                ]
                scores = {}
                score_string = ''
                for name in score_names:
                    scores[name] = model.cv_results_[name][model.best_index_]
                    score_string = score_string + name + ': ' + str(
                        scores[name]) + ' '

                if k == 'LogisticRegression' or k == 'LinearSVC':
                    most_important_features = model.best_estimator_.coef_
                    word_importance = zip(
                        words[feature_combination],
                        most_important_features[0].tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                if k == 'RandomForest' or k == 'DecisionTree' or k == 'XGBoost':
                    most_important_features = \
                        model.best_estimator_.feature_importances_
                    word_importance = zip(
                        words[feature_combination],
                        most_important_features.tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                if k == 'NaiveBayes':
                    word_importance = ''

                # Rebuild the learner with the best parameters found
                if k == 'LogisticRegression':
                    learner = LogisticRegression(random_state=run,
                                                 solver='liblinear',
                                                 **parameters)
                elif k == 'NaiveBayes':
                    learner = BernoulliNB()
                elif k == 'DecisionTree':
                    learner = DecisionTreeClassifier(random_state=run,
                                                     **parameters)
                elif k == 'LinearSVC':
                    learner = LinearSVC(random_state=run, dual=False,
                                        **parameters)
                elif k == 'RandomForest':
                    learner = RandomForestClassifier(random_state=run,
                                                     n_jobs=4, **parameters)
                elif k == 'XGBoost':
                    learner = xgb.XGBClassifier(random_state=run, n_jobs=4,
                                                **parameters)
                else:
                    print('Learner is not a valid option')
                    break

                model = learner

                # Retrain on the training split only (without validation rows)
                feats_train = scipy.sparse.vstack(
                    train_only_df[feature_combination + '_wordcooc'])
                labels_train = train_only_df['label']

                start = time.time()
                model.fit(feats_train, labels_train)
                end = time.time()
                train_time = end - start

                start = time.time()
                preds_gs = model.predict(feats_gs)
                end = time.time()
                pred_time = end - start

                gs_report = classification_report(labels_gs, preds_gs,
                                                  output_dict=True)

                if write_test_set_for_inspection:
                    out_path = '../../../data/processed/wdc-lspc/inspection/{}/wordcooc/'.format(
                        experiment_name)
                    os.makedirs(os.path.dirname(out_path), exist_ok=True)

                    file_name = '_'.join([
                        os.path.basename(train_set),
                        os.path.basename(test_set), k, feature_combination
                    ])
                    file_name = file_name.replace('.csv', '')
                    file_name += f'_{run}.pkl.gz'

                    test_inspection_df = gs_df.copy()
                    if k == 'LinearSVC':
                        proba_gs = model.decision_function(feats_gs).tolist()
                    else:
                        proba_gs = model.predict_proba(feats_gs).tolist()
                    test_inspection_df['pred'] = preds_gs
                    test_inspection_df['Class Prob'] = proba_gs
                    test_inspection_df.to_pickle(out_path + file_name,
                                                 compression='gzip')

                with open(
                        '../../../reports/wordcooc/{}/{}_{}.csv'.format(
                            experiment_name, report_train_name,
                            report_test_name), "a") as f:
                    f.write(feature_combination + '#####' + k + '#####' +
                            str(scores['mean_train_score']) + '#####' +
                            str(scores['std_train_score']) + '#####' +
                            str(scores['mean_test_score']) + '#####' +
                            str(scores['std_test_score']) + '#####' +
                            str(gs_report['1']['precision']) + '#####' +
                            str(gs_report['1']['recall']) + '#####' +
                            str(gs_report['1']['f1-score']) + '#####' +
                            str(parameters) + '#####' + str(train_time) +
                            '#####' + str(pred_time) + '#####' +
                            str(word_importance[0:100]) + '#####' +
                            experiment_name + '#####' + report_train_name +
                            '#####' + report_test_name + '\n')
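
# Usage sketch (hypothetical paths and search spaces): the `classifiers`
# argument mirrors how both runners above consume it, pairing an estimator
# instance under 'clf' with its RandomizedSearchCV space under 'params'.
def _run_wordcooc_demo():
    classifiers = {
        'RandomForest': {
            'clf': RandomForestClassifier(),
            'params': {
                'n_estimators': [32, 100, 320],
                'max_features': ['sqrt', 'log2', None],
            },
        },
        'XGBoost': {
            'clf': xgb.XGBClassifier(),
            'params': {
                'max_depth': [2, 4, 6],
                'learning_rate': [0.1, 0.3],
            },
        },
    }
    # Each feature_combination is used as a string prefix for the
    # '<feature>_wordcooc' columns, so pass a list of strings here.
    run_wordcooc('train_pairs.pkl.gz', 'valid_pairs.pkl.gz',
                 'test_pairs.pkl.gz',
                 feature_combinations=['title'],
                 classifiers=classifiers,
                 experiment_name='demo')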
def train(X, y, weight_classes=True, n_iter_search=500, score='roc_auc',
          random_state=123):
    '''Train a binary SGD classifier using a randomized grid search with a
    given scoring metric.

    Parameters:
        X (list-like): list of normalized attachment texts
        y (list-like): list of validated targets (0 = red, 1 = green)
        weight_classes (bool): whether or not to use the "balanced" mode to
            adjust class weights.
        n_iter_search (int): number of parameter settings that are sampled.
            Trades off runtime vs. quality of the solution.
        score (str): the scorer used to evaluate the predictions on the
            test set. `roc_auc` by default. Available options include:
            accuracy, roc_auc, precision, fbeta, recall. Note: for fbeta,
            beta is set to 1.5 to favor recall of the positive class.
        random_state (int): sets the random seed for reproducibility.

    Returns:
        results (dict): a dict of scoring metrics and their values
        best_score (float): mean cross-validated score of the best_estimator
        best_estimator (sklearn estimator): estimator chosen by the search
        best_params (dict): parameter setting that gave the best results on
            the hold-out data
    '''
    if weight_classes:
        clf = SGDClassifier(class_weight='balanced')
    else:
        clf = SGDClassifier()

    scoring = {
        'accuracy': metrics.make_scorer(metrics.accuracy_score),
        'roc_auc': metrics.make_scorer(metrics.roc_auc_score),
        # Note: the 'precision' option actually computes average precision
        'precision': metrics.make_scorer(metrics.average_precision_score),
        # beta=1.5 favors recall of the positive class, per the docstring
        'fbeta': metrics.make_scorer(metrics.fbeta_score, beta=1.5),
        'recall': metrics.make_scorer(metrics.recall_score)
    }

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=random_state)

    pipe = Pipeline([('vectorizer', TfidfVectorizer(stop_words='english')),
                     ('select', SelectKBest(chi2)),
                     ('clf', clf)])

    param_dist = get_param_distribution()

    random_search = RandomizedSearchCV(pipe,
                                       param_distributions=param_dist,
                                       scoring=scoring,
                                       refit=score,
                                       n_iter=n_iter_search,
                                       cv=5,
                                       n_jobs=-1,
                                       verbose=1,
                                       random_state=random_state)
    try:
        random_search.fit(X_train, y_train)
    except Exception as e:
        logger.error(f"Exception occurred training a new model: {e}",
                     exc_info=True)

    y_pred = random_search.predict(X_test)

    # Get the column index of the positive class (i.e. green)
    positive_class_col = list(random_search.classes_).index(1)
    try:
        y_score = random_search.predict_proba(X_test)[:,
                                                      positive_class_col]
    except AttributeError:
        y_score = random_search.decision_function(X_test)

    average_precision = metrics.average_precision_score(y_test, y_score)
    acc = metrics.accuracy_score(y_test, y_pred)
    try:
        roc_auc = metrics.roc_auc_score(y_test, y_pred)
    except ValueError:
        roc_auc = None
    precisions, recalls, _ = metrics.precision_recall_curve(y_test, y_score)
    try:
        auc = metrics.auc(recalls, precisions)
    except ValueError:
        auc = None
    fbeta = metrics.fbeta_score(y_test, y_pred, beta=1.5)
    recall = metrics.recall_score(y_test, y_pred)

    best_estimator = random_search.best_estimator_
    best_params = random_search.best_params_
    best_score = random_search.best_score_

    result_keys = [
        'y_pred', 'y_score', 'precisions', 'recall', 'average_precision',
        'acc', 'roc_auc', 'auc', 'fbeta', 'recalls', 'best_score',
        'best_estimator', 'y_test'
    ]
    result_values = [
        y_pred, y_score, precisions, recall, average_precision, acc,
        roc_auc, auc, fbeta, recalls, best_score, best_estimator, y_test
    ]
    results = {k: v for k, v in zip(result_keys, result_values)}

    return results, best_score, best_estimator, best_params
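
# Hypothetical stand-in for the project's `get_param_distribution` helper,
# which is called above but defined elsewhere in the project. Every key and
# range here is an assumption for illustration only; the keys follow the
# step names of the pipeline in train().
def get_param_distribution():
    return {
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__sublinear_tf': [True, False],
        # number of chi2-selected features kept (must not exceed the
        # vocabulary size)
        'select__k': stats.randint(100, 2000),
        'clf__alpha': stats.loguniform(1e-6, 1e-2),
        'clf__penalty': ['l2', 'l1', 'elasticnet'],
    }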
    def randomized_grid_search(
            self,
            train_df,
            clf=SGDClassifier(),
            n_iter_search=10,  # 10 for testing purposes
            pickle_best=True):
        """Given labeled training data (`train_df`) for a binary
        classification task, performs a randomized grid search
        `n_iter_search` times using `clf` as the classifier and
        `self.metric` as the scoring metric.

        Attributes:
            train_df (pandas DataFrame): the training data. Currently, the
                label and feature column names must be specified within the
                function.
            clf (instance of an sklearn classifier): SGDClassifier() by
                default.
            n_iter_search (int): number of parameter settings that are
                sampled. Trades off runtime vs. quality of the solution.
            pickle_best (bool): whether or not to pickle the best estimator
                returned by the grid search. Default is True.
        """
        score = self.metric
        scoring = {
            'accuracy': metrics.make_scorer(metrics.accuracy_score),
            'roc_auc': metrics.make_scorer(metrics.roc_auc_score),
            'avg_precision':
            metrics.make_scorer(metrics.average_precision_score),
            'fbeta': metrics.make_scorer(metrics.fbeta_score, beta=1.5),
            'recall': metrics.make_scorer(metrics.recall_score)
        }
        clf_name = clf.__class__.__name__

        X = train_df['Normalized Comments']
        y = train_df['Spam']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=123)

        # NOTE: a pipeline containing a sampler such as SMOTE must be an
        # imblearn.pipeline.Pipeline (sklearn's Pipeline rejects samplers);
        # the SMOTE `ratio`/`kind` arguments follow the older imblearn API.
        pipe = Pipeline([('vectorizer', TfidfVectorizer()),
                         ('upsample', SMOTE()),
                         ('select', SelectPercentile()),
                         ('clf', clf)])

        param_dist = {
            "vectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
            "vectorizer__min_df": stats.randint(1, 3),
            "vectorizer__max_df": stats.uniform(.7, .3),
            "vectorizer__sublinear_tf": [True, False],
            "upsample": [
                None,
                SMOTE(ratio='minority', kind='svm'),
                SMOTE(ratio='minority', kind='regular'),
                SMOTE(ratio='minority', kind='borderline1'),
                SMOTE(ratio='minority', kind='borderline2')
            ],
            "select": [
                None,
                SelectPercentile(percentile=10),
                SelectPercentile(percentile=20),
                SelectPercentile(percentile=50),
                SelectPercentile(percentile=75)
            ],
            "clf__alpha": log_uniform(-5, 2),
            "clf__penalty": ['l2', 'l1', 'elasticnet'],
            "clf__loss": [
                'hinge', 'log', 'modified_huber', 'squared_hinge',
                'perceptron'
            ],
        }

        random_search = RandomizedSearchCV(pipe,
                                           param_distributions=param_dist,
                                           scoring=scoring,
                                           refit=score,
                                           n_iter=n_iter_search,
                                           cv=5,
                                           n_jobs=-1,
                                           verbose=1)
        random_search.fit(X_train, y_train)
        y_pred = random_search.predict(X_test)

        # Get the column index of the positive class (i.e. spam)
        positive_class_col = list(random_search.classes_).index(1)
        try:
            y_score = random_search.predict_proba(
                X_test)[:, positive_class_col]
        except AttributeError:
            y_score = random_search.decision_function(X_test)

        average_precision = metrics.average_precision_score(y_test, y_score)
        acc = metrics.accuracy_score(y_test, y_pred)
        roc_auc = metrics.roc_auc_score(y_test, y_pred)
        precisions, recalls, _ = metrics.precision_recall_curve(
            y_test, y_score)
        auc = metrics.auc(recalls, precisions)
        fbeta = metrics.fbeta_score(y_test, y_pred, beta=1.5)
        recall = metrics.recall_score(y_test, y_pred)

        print("\tRecall on test data: {0:.2f}".format(recall))
        print("\tAccuracy on test data: {0:.2f}".format(acc))
        print("\tROC-AUC on test data: {0:.2f}".format(roc_auc))
        print("\tFbeta on test data: {0:.2f}".format(fbeta))
        print("\tAverage Precision on test data: {0:.2f}".format(
            average_precision))
        print("\tPrecision-Recall AUC on test data: {0:.2f}".format(auc))
        print("-" * 80)
        print("Classification Report:")
        class_names = ['ham', 'spam']
        print(
            metrics.classification_report(y_test, y_pred,
                                          target_names=class_names))

        best_estimator = random_search.best_estimator_
        best_score = random_search.best_score_

        result_keys = [
            'y_pred', 'y_score', 'precisions', 'recall',
            'average_precision', 'acc', 'roc_auc', 'auc', 'fbeta',
            'recalls', 'best_score', 'best_estimator', 'y_test'
        ]
        result_values = [
            y_pred, y_score, precisions, recall, average_precision, acc,
            roc_auc, auc, fbeta, recalls, best_score, best_estimator, y_test
        ]
        results = {k: v for k, v in zip(result_keys, result_values)}

        if pickle_best:
            pickle_dir = os.path.join(os.getcwd(), 'model',
                                      'best_estimators')
            if not os.path.exists(pickle_dir):
                os.makedirs(pickle_dir)
            pickle_path = os.path.join(pickle_dir, 'model_sw.pkl')
            with open(pickle_path, 'wb') as f:
                pickle.dump(random_search.best_estimator_, f)

        return results
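
# Hypothetical stand-in for the project's `log_uniform` helper used in the
# search space above but defined elsewhere. The assumed semantics are a
# log-uniform distribution over [10**lo, 10**hi], sketched here with
# scipy.stats.loguniform:
def log_uniform(lo, hi):
    """Log-uniform distribution over [10**lo, 10**hi] (assumed semantics)."""
    return stats.loguniform(10.0**lo, 10.0**hi)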
class RandomSearch(object):
    def __init__(self,
                 estimator,
                 param_distributions,
                 n_iter=10,
                 scoring=None,
                 n_jobs=None,
                 iid=False,
                 refit=True,
                 cv=None,
                 verbose=0,
                 pre_dispatch='2*n_jobs',
                 random_state=None,
                 error_score=np.nan,
                 return_train_score=False):
        """
        estimator : the classifier to use, constructed with all parameters
            except the ones being tuned. Every estimator needs a `scoring`
            parameter or a `score` method.
        param_distributions : the candidate values of the parameters to be
            optimized, given as a dict or a list of dicts, e.g.
            param_test1 = {"n_estimators": range(10, 71, 10)}. A scorer
            must be specified either on the estimator or via `scoring`.
        n_iter : int, 10 by default.
        scoring : None by default; str, list/tuple, or dict. If the
            estimator does not specify one, `scoring` must be given; when
            None, the estimator's default score function is used.
        n_jobs : None by default; int. 1 means single-threaded, -1 uses
            multiple threads.
        iid : bool, False by default. If True, samples of each test set are
            weighted. (Removed in recent scikit-learn versions.)
        refit : refit the estimator with the best found parameters; True by
            default.
        cv : None by default (the default 5-fold); an integer for a
            specific number of folds, or a CV splitter.
        verbose : controls printed output; 0 prints nothing, 1 shows a
            progress bar.
        pre_dispatch : number of jobs dispatched during parallel execution,
            "2*n_jobs" or an int.
        error_score : value used to fill in if an error occurs during
            fitting; usually np.nan.
        return_train_score : bool, False by default; training scores are
            not returned.

        # Typically only estimator, param_distributions, scoring, n_jobs,
        # cv and verbose are set.
        """
        self.randomsearch = RandomizedSearchCV(
            estimator=estimator,
            param_distributions=param_distributions,
            n_iter=n_iter,
            scoring=scoring,
            n_jobs=n_jobs,
            iid=iid,
            refit=refit,
            cv=cv,
            verbose=verbose,
            pre_dispatch=pre_dispatch,
            random_state=random_state,
            error_score=error_score,
            return_train_score=return_train_score)

    def fit(self, x, y=None):
        return self.randomsearch.fit(x, y)

    def transform(self, x):
        return self.randomsearch.transform(x)

    def predict(self, x):
        return self.randomsearch.predict(x)

    def predict_log_proba(self, x):
        return self.randomsearch.predict_log_proba(x)

    def predict_proba(self, x):
        return self.randomsearch.predict_proba(x)

    def inverse_transform(self, xt):
        return self.randomsearch.inverse_transform(xt)

    def decision_function(self, x):
        # decision_function is only available when refit=True
        return self.randomsearch.decision_function(x)

    def set_params(self, **params):
        self.randomsearch.set_params(**params)

    def get_params(self, deep=True):
        return self.randomsearch.get_params(deep=deep)

    def get_score(self, x, y=None):
        return self.randomsearch.score(x, y)

    def get_attribute(self, attribute_name):
        if attribute_name == "cv_result":
            return self.randomsearch.cv_results_
        elif attribute_name == "best_estimator":
            return self.randomsearch.best_estimator_
        elif attribute_name == "best_score":
            return self.randomsearch.best_score_
        elif attribute_name == "best_params":
            return self.randomsearch.best_params_
        elif attribute_name == "best_index":
            return self.randomsearch.best_index_
        elif attribute_name == "scorer":
            return self.randomsearch.scorer_
        elif attribute_name == "n_split":
            return self.randomsearch.n_splits_
        elif attribute_name == "refit-time":
            return self.randomsearch.refit_time_
        else:
            raise ValueError(
                "Invalid attribute name; please pass a correct one.")
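
# Usage sketch (illustrative, not part of the original class): tuning the
# number of trees of a random forest through the wrapper above. Note this
# assumes a scikit-learn version that still accepts the `iid` argument
# (it was removed in 0.24).
def _random_search_demo():
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, random_state=0)
    searcher = RandomSearch(RandomForestClassifier(random_state=0),
                            param_distributions={
                                'n_estimators': range(10, 71, 10)},
                            n_iter=5,
                            scoring='accuracy',
                            cv=3)
    searcher.fit(X, y)
    return (searcher.get_attribute('best_params'),
            searcher.get_attribute('best_score'))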