def predictData():
    """Fit a classifier on 'testSet.txt' and score it on 'dataSet.txt'.

    Returns
    -------
    tuple
        ``(count, rCount, prCount)``: the number of evaluation samples
        predicted as class 1, the number whose true label is 1, and the
        number for which both hold.
    """
    # NOTE(review): the file names look swapped (training on 'testSet.txt');
    # kept exactly as in the original — confirm against the data files.
    dataMatIn, classLabels = loadDataSet('testSet.txt')
    dataMatrix = mat(dataMatIn)
    labelMatrix = mat(classLabels).transpose()
    classifier = LogReg()  # use the class with all-default parameters
    classifier.fit(dataMatrix, labelMatrix)  # learns in place; no return value needed

    tdataMatIn, tclassLabels = loadDataSet('dataSet.txt')
    count = 0
    rCount = 0
    prCount = 0
    # Fix: predict on the evaluation samples. The original looped over the
    # training rows (dataMatIn) while comparing against the evaluation labels.
    for i, sample in enumerate(tdataMatIn):
        predicted_positive = bool(classifier.predict(sample) == 1)
        actual_positive = tclassLabels[i] == 1
        if predicted_positive:
            count += 1
        if actual_positive:
            rCount += 1
            if predicted_positive:
                prCount += 1
    return count, rCount, prCount
def train_LogReg(x_train, y_train):
    """Fit ``LogReg`` on the given training data with a fixed configuration.

    Parameters
    ----------
    x_train : training features, passed straight to ``fit``.
    y_train : training targets, passed straight to ``fit``.

    Returns
    -------
    Whatever ``LogReg(...).fit(x_train, y_train)`` returns.
    """
    settings = dict(
        penalty='l2',  # regularisation type
        # Dual or primal formulation. The dual form is only implemented for
        # the l2 penalty with the liblinear solver; prefer dual=False when
        # n_samples > n_features.
        dual=False,
        tol=1e-4,  # tolerance for the stopping criterion
        C=1,  # regularisation parameter
        fit_intercept=True,
        intercept_scaling=1,
        # Per-class weights as {class_label: weight}; None means every class
        # has weight one. "balanced" weights classes inversely to their
        # frequency: n_samples / (n_classes * np.bincount(y)). These weights
        # are multiplied with sample_weight when that is given to fit().
        class_weight=None,
        random_state=None,
        solver='liblinear',
        max_iter=100,
        multi_class='ovr',
        verbose=1,
        n_jobs=None,
    )
    classifier = LogReg(**settings)
    return classifier.fit(x_train, y_train)
def forewardStepWise(model = LogReg(random_state = 0, solver = 'lbfgs', max_iter = 1000)):
    """Forward stepwise feature selection scored by 10-fold CV accuracy.

    Reads the module-level ``X`` (feature DataFrame) and ``y`` (target).
    At each step every remaining column is tried on top of the columns
    already chosen, and the one giving the highest mean cross-validated
    accuracy is added.

    Parameters
    ----------
    model : estimator
        Estimator that is fitted and handed to ``cross_val_score``.

    Returns
    -------
    list
        Column names of the subset with the highest accuracy. On ties the
        largest subset wins, as in the original implementation.
    """
    pred = X.copy()
    usedVar = []
    nVarModel = {}
    optimalVar = []
    for step in range(1, len(X.columns) + 1):
        accuracySelector = []
        for candidate in pred.columns:
            # Build a fresh list per trial so nothing stored in nVarModel
            # is ever mutated.
            trial = usedVar + [candidate]
            xModel = model.fit(X[trial], y)
            CVaccuracy = cross_val_score(xModel, X[trial], y,
                                         scoring='accuracy', cv=10).mean()
            accuracySelector.append(CVaccuracy)
        nStep = max(accuracySelector)
        optimalVar.append(nStep)
        # Tie-breaking kept from the original: positions are visited in the
        # order -1, 0, 1, ..., len-2 and the last match wins.
        for n in range(len(accuracySelector)):
            if accuracySelector[n - 1] == nStep:
                locate = n - 1
        chosen = pred.columns[locate]
        usedVar = usedVar + [chosen]
        pred = pred.drop(columns=[chosen])
        # Fix: store a snapshot. The original stored the live list, which
        # the next step's trials appended to, and its `value[:-1]` workaround
        # removed one more feature from every older entry on each step.
        nVarModel[step] = list(usedVar)
    maxAccuracy = max(optimalVar)
    for count, accuracy in enumerate(optimalVar, start=1):
        if accuracy == maxAccuracy:
            goodModel = nVarModel[count]
    return goodModel
#%%
def predictData(lfilename, pfilename):
    """Fit ``LogReg`` on data from ``lfilename`` and score it on ``pfilename``.

    Returns
    -------
    tuple
        ``(count, rCount, prCount)``: samples predicted as 1, samples whose
        label is 1, and samples for which both hold.
    """
    train_rows, train_labels = Loaddata(lfilename)
    train_matrix = mat(train_rows)
    train_targets = mat(train_labels).transpose()
    classifier = LogReg()
    classifier.fit(train_matrix, train_targets)

    tdataMatIn, tclassLabels = Loaddata(pfilename)
    # Built as in the original although nothing below reads them.
    tdataMatrix = mat(tdataMatIn)
    tlabelMatrix = mat(tclassLabels).transpose()

    count = rCount = prCount = 0
    for idx in range(len(tdataMatIn)):
        predicted_one = bool(classifier.predict(tdataMatIn[idx]) == 1)
        labelled_one = bool(tclassLabels[idx] == 1)
        if predicted_one:
            count += 1
        if labelled_one:
            rCount += 1
        if predicted_one and labelled_one:
            prCount += 1
    return count, rCount, prCount
def _zip_prefix_columns(z, depth):
    """Return the feature-column index of each of the first ``depth`` prefixes of ``z``."""
    columns = []
    offset = 0
    for d in range(1, depth + 1):
        columns.append(int(z[:d]) + offset)
        # Prefixes of length d occupy 10**d consecutive columns.
        offset += 10**d
    return columns


def zip_estimate(zs, ys, C=1, classifier="logreg", penalty='l2'):
    """Regress a binary outcome against zip-code prefixes.

    The model has the form ``y ~ z1 + z2 + z3 + z4 + z5``, where ``zk`` is a
    one-hot indicator of the first ``k`` characters of the zip code.

    Parameters
    ----------
    zs : sequence of str
        Zip codes; entries rejected by ``is_valid_zip`` keep an all-zero row.
    ys : sequence
        Outcome for each zip code.
    C : number
        Passed as ``C`` to the logistic model, or as ``alpha = 1 / C`` to ridge.
    classifier : {"logreg", "ridge"}
        Which estimator to fit.
    penalty : str
        Penalty passed to the logistic model (unused for ridge).

    Returns
    -------
    collections.defaultdict
        Maps every 1- to 5-digit prefix string to its predicted value; any
        other key yields the mean of ``ys``.

    Raises
    ------
    ValueError
        If ``classifier`` is not recognised. Raised before any matrix is
        built (the original raised a plain ``Exception`` after building it).
    """
    if classifier not in ("logreg", "ridge"):
        raise ValueError(
            "didn't recognize classifier: {!r}".format(classifier))

    max_digits = 5
    # 10 + 100 + 1000 + 10000 + 100000 columns, one block per prefix length.
    n_columns = sum(10**d for d in range(1, max_digits + 1))
    background_rate = mean(ys)

    N = len(zs)
    A = lil_matrix((N, n_columns))
    valids = 0
    for i, z in tqdm(enumerate(zs), total=N):
        if not is_valid_zip(z):
            continue
        valids += 1
        for j in _zip_prefix_columns(z, max_digits):
            A[i, j] = 1
    print("valids:", valids)

    if classifier == "logreg":
        clf = LogReg(C=C, penalty=penalty)
    else:
        clf = Ridge(alpha=1 / C)
    print("fitting logreg")
    clf.fit(A, ys)

    # Every digit string of length 1..5, in the same order as the columns.
    all_zs = [
        "".join(x)
        for k in range(1, max_digits + 1)
        for x in product(*("0123456789" for _ in range(k)))
    ]
    all_A = lil_matrix((n_columns, n_columns))
    for i, z in tqdm(enumerate(all_zs), total=n_columns):
        for j in _zip_prefix_columns(z, len(z)):
            all_A[i, j] = 1

    if classifier == "logreg":
        all_yhats = clf.predict_proba(all_A)[:, 1]
    else:
        all_yhats = clf.predict(all_A)
    ml_dict = dict(zip(all_zs, all_yhats))
    return defaultdict(lambda: background_rate, ml_dict)
def __init__(self, input_dim, num_classes, metric):
    """Store the settings, build the ``LogReg`` model and print a summary."""
    super().__init__()
    self.input_dim = input_dim
    self.num_classes = num_classes
    self.metric = metric
    self.clf = LogReg(solver='lbfgs',
                      multi_class='multinomial',
                      warm_start=True)

    summary = (
        'Logistic regression:',
        '\t solver = L-BFGS',
        "\t classes = {}".format(self.num_classes),
        "\t metric = {}".format(self.metric),
    )
    for line in summary:
        print(line)
def __init__(
    self,
    clf=None,
    *,
    seed=None,
    # Hyper-parameters (used by .fit() function)
    cv_n_folds=5,
    converge_latent_estimates=False,
    pulearning=None,
    find_label_issues_kwargs=None,
    label_quality_scores_kwargs=None,
    verbose=False,
):
    """Validate the classifier and initialise all state to its defaults.

    Parameters
    ----------
    clf : object, optional
        Classifier exposing ``fit``, ``predict_proba`` and ``predict``.
        A ``LogReg(multi_class="auto", solver="lbfgs")`` is used when omitted.
    seed : int, optional
        If given, seeds NumPy's global random state.
    cv_n_folds, converge_latent_estimates, pulearning, verbose
        Stored unchanged on the instance.
    find_label_issues_kwargs, label_quality_scores_kwargs : dict, optional
        Stored on the instance; ``None`` means a fresh empty dict. The
        original used a literal ``{}`` default, which is created once and
        therefore shared by every instance built without these arguments.

    Raises
    ------
    ValueError
        If ``clf`` lacks one of the required methods.
    """
    if clf is None:
        # Use logistic regression if no classifier is provided.
        clf = LogReg(multi_class="auto", solver="lbfgs")

    # Make sure the given classifier has the appropriate methods defined.
    for method in ("fit", "predict_proba", "predict"):
        if not hasattr(clf, method):
            raise ValueError(
                "The classifier (clf) must define a .{}() method.".format(
                    method))

    if seed is not None:
        np.random.seed(seed=seed)

    if find_label_issues_kwargs is None:
        find_label_issues_kwargs = {}
    if label_quality_scores_kwargs is None:
        label_quality_scores_kwargs = {}

    self.clf = clf
    self.seed = seed
    self.cv_n_folds = cv_n_folds
    self.converge_latent_estimates = converge_latent_estimates
    self.pulearning = pulearning
    self.find_label_issues_kwargs = find_label_issues_kwargs
    self.label_quality_scores_kwargs = label_quality_scores_kwargs
    self.verbose = verbose

    # All remaining attributes start as None.
    self.label_issues_df = None
    self.label_issues_mask = None
    self.sample_weight = None
    self.confident_joint = None
    self.py = None
    self.ps = None
    self.num_classes = None
    self.noise_matrix = None
    self.inverse_noise_matrix = None
    self.clf_kwargs = None
    self.clf_final_kwargs = None
def __init__(
    self,
    clf=None,
    seed=None,
    # Hyper-parameters (used by .fit() function)
    cv_n_folds=5,
    prune_method='prune_by_noise_rate',
    converge_latent_estimates=False,
    pulearning=None,
    n_jobs=None,
):
    """Validate the classifier, resolve ``n_jobs`` and reset all state.

    ``clf`` falls back to ``LogReg(multi_class='auto', solver='lbfgs')`` and
    must expose ``fit``, ``predict_proba`` and ``predict``, otherwise a
    ``ValueError`` is raised. When ``seed`` is given, NumPy's global random
    state is seeded. ``n_jobs`` defaults to the machine's CPU count.
    """
    # Fall back to logistic regression when no classifier is supplied.
    clf = LogReg(multi_class='auto', solver='lbfgs') if clf is None else clf

    # The classifier has to provide each of these methods.
    for required in ("fit", "predict_proba", "predict"):
        if not hasattr(clf, required):
            raise ValueError(
                'The classifier (clf) must define a .' + required +
                '() method.')

    if seed is not None:
        np.random.seed(seed=seed)

    # Number of multiprocessing threads used by get_noise_indices().
    if n_jobs is not None:
        assert (n_jobs >= 1)
    else:
        n_jobs = multiprocessing.cpu_count()

    self.clf = clf
    self.seed = seed
    self.cv_n_folds = cv_n_folds
    self.prune_method = prune_method
    self.converge_latent_estimates = converge_latent_estimates
    self.pulearning = pulearning
    self.n_jobs = n_jobs

    # All remaining attributes start as None.
    self.noise_mask = None
    self.sample_weight = None
    self.confident_joint = None
    self.py = None
    self.ps = None
    self.K = None
    self.noise_matrix = None
    self.inverse_noise_matrix = None
def estimate_cv_predicted_probabilities(
    X,
    labels,  # class labels can be noisy (s) or not noisy (y).
    clf=LogReg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    seed=None,
):
    """Return out-of-sample predicted probabilities P(s=k|x) for each example.

    Thin wrapper that keeps only the last item returned by
    ``estimate_py_noise_matrices_and_cv_pred_proba``.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array.

    labels : np.array or list of ints from [0,1,..,K-1]
        Discrete class labels, which may or may not contain mislabeling.

    clf : sklearn.classifier or equivalent
        Defaults to logistic regression. Must define predict_proba() and
        fit().

    cv_n_folds : int
        Number of cross-validation folds used to compute the out-of-sample
        probabilities.

    seed : int (default = None)
        Seed for the random number generator that splits the folds. If
        None, np.random's current state is used.

    Returns
    -------
    psx : np.array (shape (N, K))
        For each of the N examples, the (noisy) probability of each of the
        K classes, i.e. P(s=k|x). psx should have been computed using
        3 (or higher) fold cross-validation.
    """
    estimates = estimate_py_noise_matrices_and_cv_pred_proba(
        X=X,
        s=labels,
        clf=clf,
        cv_n_folds=cv_n_folds,
        seed=seed,
    )
    psx = estimates[-1]
    return psx
def test(emb, label_mat, emb_IDmap, label_IDmap, n_splits, random_state, shuffle):
    """Evaluate an embedding by per-class node classification.

    For each column of ``label_mat`` an L2-regularised logistic regression
    is trained and scored with ``n_splits``-fold cross validation (splits
    produced by ``skf``).

    Returns
    -------
    (y_true_all, y_pred_all) : tuple of lists
        One array per class: the true labels and the decision-function
        values of the held-out samples, concatenated over the folds.
    """
    n_classes = label_mat.shape[1]
    # Embedding rows reordered to follow the iteration order of label_IDmap.
    emb_rows = [emb_IDmap[ID] for ID in list(label_IDmap)]
    x = emb[emb_rows]

    splitter = skf(n_splits=n_splits, random_state=random_state, shuffle=shuffle)
    mdl = LogReg(penalty='l2', solver='lbfgs', warm_start=False, max_iter=1000)

    y_true_all, y_pred_all = [], []
    for label in range(1, n_classes + 1):
        y = label_mat[:, label - 1]
        y_true = np.array([], dtype=bool)
        y_pred = np.array([])
        for fold, (train_idx, test_idx) in enumerate(splitter.split(y, y), start=1):
            # '\r' makes each progress message overwrite the previous one.
            progress = "Testing class #{:>4d},\tfold {:>2d} / {:<2d}".format(
                label, fold, n_splits)
            print(progress, flush=True, end='\r')
            mdl.fit(x[train_idx], y[train_idx])
            y_true = np.append(y_true, y[test_idx])
            y_pred = np.append(y_pred, mdl.decision_function(x[test_idx]))
        y_true_all.append(y_true)
        y_pred_all.append(y_pred)
    print('')
    return y_true_all, y_pred_all
def fit(self, rs: RecordSet) -> None:
    """
    Fit an L1-penalised logistic regression on a copy of the record set.

    All columns of ``entries`` except the last are used as inputs and the
    last column as the target.

    :param rs: The record set to fit with.
    :raises Exception: if ``self.alpha`` is 0 (``C = 1 / alpha`` would be
        undefined).
    """
    # Keep a private copy so the caller's record set is not shared.
    self.data = cp.deepcopy(rs)
    features = self.data.entries[:, :-1]
    target = self.data.entries[:, -1:]

    # Guard against the division by zero below.
    if self.alpha == 0:
        raise Exception(
            "Alpha Logistic too low to obtain reliable results")

    inverse_strength = 1 / self.alpha
    self.model = LogReg(C=inverse_strength, penalty="l1", solver="liblinear")
    self.model.fit(X=features, y=target.ravel())
def cross_val_c(window, seeds_arr, slots_arr, tourney_arr, column_names, predictor_dfs):
    """Score logistic regression for several values of C, year by year.

    For every test year from ``2003 + window`` up to 2015, the data is split
    with ``train_test_split``, columns from index 2 onward are standardised
    with a scaler fitted on the training part, and a model is fitted for
    each ``C = 10**c`` with ``c`` in -4..2.

    Returns
    -------
    pandas.DataFrame
        Indexed by test year, one column per C value (labelled by
        ``col_names``), holding ``model.score`` on the test part.
    """
    col_names = ['0.0001', '0.001', '0.01', '.1', '1', '10', '100']
    c_vals = range(-4, 3)
    test_yr_range = range(2003 + window, 2016)
    scores = pd.DataFrame(index=test_yr_range, columns=col_names)
    for yr in test_yr_range:
        x_train, y_train, x_test, y_test = train_test_split(
            window, yr, seeds_arr, slots_arr, tourney_arr, column_names,
            predictor_dfs)
        scaler = StandardScaler().fit(x_train.iloc[:, 2:])
        # The scaled matrices do not depend on C, so build them once per year.
        x_train_scaled = scaler.transform(x_train.iloc[:, 2:])
        x_test_scaled = scaler.transform(x_test.iloc[:, 2:])
        y_train_vec = y_train.values.T[0]
        y_test_vec = y_test.values.T[0]
        # Fix: pair each exponent with its own column label. The original
        # wrote to position `c + 2`, which is negative for c < -2 and wraps
        # round, so scores landed under the wrong C. `.ix` is also gone
        # from current pandas; `.loc` with labels replaces it.
        for col_name, c in zip(col_names, c_vals):
            model = LogReg(C=10**c)
            model.fit(x_train_scaled, y_train_vec)
            scores.loc[yr, col_name] = model.score(x_test_scaled, y_test_vec)
    return scores
# NOTE(review): the next two statements look like the tail of a loop over the
# training essays whose header lies outside this excerpt — confirm nesting.
sent = sentences(essay)
numOfSent_train.append(sent)

# Apply `sentences` to every validation essay and collect the results.
numOfSent_valid = []
for essay in valid_df['essay']:
    sent = sentences(essay)
    numOfSent_valid.append(sent)

train_df['sentence_length'] = numOfSent_train
valid_df['sentence_length'] = numOfSent_valid
regularization_data_sentence = create_regularization_sentence_length(train_df)
train_df = append_regularized_sentence_length(train_df)

# FITTING THE TRAINING SET USING L2 LOGISTIC
# The single feature is the 'sentence_length' column, one value per row.
logistic_l2 = LogReg(penalty='l2', solver='liblinear', n_jobs=4)
xs = [[x] for x in np.array(train_df['sentence_length'])]
logistic_l2.fit(xs, train_std_scores)

# DENORMALIZING FOR THE VALID SET
# Each value is mapped to int(value * entry[2] + entry[1]) using the entry of
# regularization_data_sentence for its essay set.
# NOTE(review): presumably entry[1] is a mean and entry[2] a spread — confirm
# against create_regularization_sentence_length.
# NOTE(review): values are appended grouped by essay set and then assigned
# positionally below, which assumes valid_df is already ordered by
# 'essay_set' — verify.
max_essay_set = max(train_df['essay_set'])
stand_pred_values_l2 = []
for i in range(max_essay_set):
    current_set = valid_df[valid_df['essay_set'] == i + 1]['sentence_length']
    for value in current_set:
        stand_pred_values_l2.append(int(float(value) * float(regularization_data_sentence[i][2]) + (regularization_data_sentence[i][1])))

# PREDICTING THE SCORE USING THE NEW SENTENCE LENGTH
valid_df['new_sentence_length_std'] = stand_pred_values_l2
valid_x = [[x] for x in np.array(valid_df['new_sentence_length_std'])]
valid_pred_std_scores_l2 = logistic_l2.predict(valid_x)
return pd.DataFrame(np.array(data).reshape(-1, 3), columns=['ID', 'rate', 'response']) def long_stats(dfk, columns): x = final_dataLONG(columns, dfk) est = smf.mixedlm('rate ~ response', x, groups=x['ID']) est2 = est.fit() print(est2.summary()) ##### ##IMPLEMENTATION ##### lr = LogReg(solver='newton-cg', multi_class='multinomial') df2 = df_1y.replace([np.nan], 'NA') df3 = df_Uy.replace([np.nan], 'NA') df1y = final_data(df2, list(df2)[1:]) dfUy = final_data(df3, list(df3)[1:]) ### #Q5 RESULTS ### #1y Students print('===1y Q5===') x = df1y[['Q5B', 'Q5C']] y = df1y['Q5A']
# Train a logistic-regression classifier on the vectorized BLESS 2011
# training file and predict labels for the vectorized test file.
from sklearn.linear_model import LogisticRegression as LogReg
# NOTE(review): `sklearn.cross_validation` no longer exists in current
# scikit-learn releases (`sklearn.model_selection` provides
# train_test_split) — confirm the targeted version.
from sklearn.cross_validation import train_test_split
import sklearn.metrics as metrics
import pandas as pd

# Open vectorized files (tab-separated, no header row)
train = pd.read_csv('../datasets/bless2011/data_lex_train_vectorized_diff.tsv', sep='\t', header=None)
test = pd.read_csv('../datasets/bless2011/data_lex_test_vectorized_diff.tsv', sep='\t', header=None)

### Training
# Remove NaN
train.dropna(axis=0, inplace=True)
# Every column except the last is a feature; the last column is the label.
X = train.iloc[:, :-1]
y = train.iloc[:, -1].astype(bool)
clf = LogReg()
clf.fit(X, y)

### Testing
orig_rows, orig_cols = test.shape
# Remove rows with NaN
test.dropna(axis=0, inplace=True)
# Count number of rows removed
diff = orig_rows - test.shape[0]
# X and y are rebound to the test split from here on.
X = test.iloc[:, :-1]
y = test.iloc[:, -1].astype(bool)
preds = clf.predict(X)
# NOTE(review): this if/else reads the index `i` and looks like the body of
# a loop over the test samples whose header lies outside this excerpt —
# confirm nesting.
if y_test[i] == 1:
    pos += 1
else:
    neg += 1

# model stats to be put into table
# NOTE(review): 31 and 0.1 are hard-coded under the 'Number of epochs' and
# 'Learning rate' headers — confirm they match the run that produced
# acc/fp/fn.
model_stats = [31, 0.1, acc, fp, fn]
#model_stats = np.zeros((2,5))
#model_stats[0,0] = 31
#model_stats[0,1] = 0.1
#model_stats[0,2] = acc
#model_stats[0,3] = fp
#model_stats[0,4] = fn

# making LaTex table for accuracy scores
df = pd.DataFrame.from_records([model_stats], columns=[
    'Number of epochs', 'Learning rate', 'Accuracy', 'False positives',
    'False negatives'
])
tab = df.to_latex(index=False, float_format="%.2f")
print(f"\n\n{tab}\n\n")

# comparison to scikit-learn using LogisticRegression class
logreg2 = LogReg()
logreg2.fit(X_train, y_train)
y_pred = logreg2.predict(X_test)
print("scikit-learn accuracy: {:.5f}".format(sklearn_accuracy(y_test, y_pred)))
# Derived columns: `end_of_half_det` applied row by row, and a 0/1 flag that
# is 1 when 'posteam_score' is greater than 'defteam_score'.
start['is_EOH'] = start.apply(lambda row: end_of_half_det(row), axis=1)
start['pos_leads'] = (start['posteam_score'] > start['defteam_score']).astype(int)

# Columns removed before modelling.
to_drop = [
    'Unnamed: 0', 'game_date', 'ends_TD', 'ends_FG', 'ends_punt', 'ends_other'
]
# Work on a copy so `start` keeps all of its columns.
sk = start.copy()
sk.drop(to_drop, axis=1, inplace=True)

# Split, then separate the 'target' column from the feature columns.
sk_train, sk_test = tt_split(sk)
y_train = sk_train.pop('target').values
X_train = sk_train.values
y_test = sk_test.pop('target').values
X_test = sk_test.values

# Multinomial logistic regression; accuracy on the test split rounded to 3
# decimals.
mod = LogReg(solver='saga', max_iter=5000, multi_class='multinomial', n_jobs=-1)
mod.fit(X_train, y_train)
mod_score = np.around(mod.score(X_test, y_test), 3)

# Hold-out data: keep rows with a 'yardline_100' value, then add the target
# and the same derived columns as above.
holdout = pd.read_csv('data/start_pos_holdout.csv')
m0 = holdout['yardline_100'].notna()
holdout = holdout[m0].copy()
holdout['target'] = holdout.apply(lambda row: make_target(row), axis=1)
holdout['is_EOH'] = holdout.apply(lambda row: end_of_half_det(row), axis=1)
holdout['pos_leads'] = (holdout['posteam_score'] > holdout['defteam_score']).astype(int)
# NOTE(review): 'game_id' is dropped from the hold-out frame only; if the
# training frame still contains it the feature columns will not line up —
# verify.
to_drop.append('game_id')
holdout.drop(to_drop, axis=1, inplace=True)
holdout_y = holdout.pop('target').values
holdout_X = holdout.values
def estimate_noise_matrices(
    X,
    s,
    clf=LogReg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    thresholds=None,
    converge_latent_estimates=True,
    seed=None,
):
    """Estimate the (K, K) noise matrix and its inverse.

    The noise_matrix holds, for every true class, the fraction of examples
    labeled as every other class; it is the conditional probability matrix
    P(s=k_s|y=k_y). Under certain conditions the estimates are exact, and
    in most conditions they are within one percent of the actual noise
    rates.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array.

    s : np.array
        Discrete vector of labels which may contain mislabeling. The noisy
        label is written "s" rather than y-tilde to stay ASCII.

    clf : sklearn.classifier or equivalent
        Defaults to logistic regression. Must define predict_proba() and
        fit().

    cv_n_folds : int
        Number of cross-validation folds used to compute out-of-sample
        probabilities for each example in X.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k). An example whose predicted probability is greater than
        this threshold is counted as having hidden label y = k. Used only
        to estimate the noise rates from confident counts, not for pruning.
        Values should lie between 0 and 1. Default is None.

    converge_latent_estimates : bool
        If true, iteratively forces the independently computed estimates to
        be numerically consistent with their closed-form relationships.

    seed : int (default = None)
        Seed for the random number generator that splits the folds. If
        None, np.random's current state is used.

    Returns
    -------
    A two-item tuple containing (noise_matrix, inv_noise_matrix).
    """
    estimates = estimate_py_noise_matrices_and_cv_pred_proba(
        X=X,
        s=s,
        clf=clf,
        cv_n_folds=cv_n_folds,
        thresholds=thresholds,
        converge_latent_estimates=converge_latent_estimates,
        seed=seed,
    )
    # Drop the first item and the last two, keeping the two noise matrices.
    noise_matrices = estimates[1:-2]
    return noise_matrices
# NOTE(review): `class_name`, `subX` and `clf` come from context outside this
# excerpt; this first part may sit inside a per-class loop — confirm nesting.
# Label vector: 'pristine' for every vanilla sample of this class, followed
# by 'tampered' for every augmented sample of this class.
suby = np.concatenate(
    (['pristine'] * len(y_vanilla[y_vanilla == class_name]),
     ['tampered'] * len(y_augmented[y_augmented == class_name])))
exec_time = time.time()
clf.train(subX, suby)
exec_time = time.time() - exec_time
print('Done. (Training took %.3f seconds)' % (exec_time))

# Stage 2: a pair of multiclass classifiers: one for pristine images, one for tampered images
# Fix: build two independent instances. `[LogReg(...)] * 2` repeated the
# SAME object, so training the second classifier overwrote the first.
clfs = {
    'LogReg': [
        LogReg(penalty='l2',
               max_iter=2000,
               n_jobs=4,
               multi_class='ovr',
               solver='newton-cg') for _ in range(2)
    ]
}
for n, [c1, c2] in clfs.items():
    print('CVing stage 2 classifiers %s...' % (n), end=' ')
    exec_time = time.time()
    c1.train(X_vanilla, y=y_vanilla)
    c2.train(X_augmented, y_augmented)
    exec_time = time.time() - exec_time
    print('Done. (Training took %.3f seconds)' % (exec_time))
def estimate_py_noise_matrices_and_cv_pred_proba(
    X,
    s,
    clf=LogReg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    thresholds=None,
    converge_latent_estimates=False,
    py_method='cnt',
    seed=None,
):
    """Estimate the latent prior, both noise matrices and CV probabilities.

    Computes the out-of-sample predicted probability P(s=k|x) for every
    example x in X using cross validation, while also computing the
    confident-counts noise rates within each cross-validated subset and
    returning the average noise rate across all examples.

    The noise_matrix has shape (K, K): the fraction of examples in every
    class labeled as every other class, i.e. the conditional probability
    matrix P(s=k_s|y=k_y). Under certain conditions the estimates are
    exact, and in most conditions they are within one percent of the
    actual noise rates.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array.

    s : np.array
        Discrete vector of labels which may contain mislabeling. The noisy
        label is written "s" rather than y-tilde to stay ASCII.

    clf : sklearn.classifier or equivalent
        Defaults to logistic regression. Must define predict_proba() and
        fit().

    cv_n_folds : int
        Number of cross-validation folds used to compute out-of-sample
        probabilities for each example in X.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k). An example whose predicted probability is greater than
        this threshold is counted as having hidden label y = k. Used only
        to estimate the noise rates from confident counts, not for pruning.
        Values should lie between 0 and 1. Default is None.

    converge_latent_estimates : bool
        If true, iteratively forces the independently computed estimates to
        be numerically consistent with their closed-form relationships.

    py_method : str (Options: ["cnt", "eqn", "marginal", "marginal_ps"])
        How to compute the latent prior p(y=k). Default is "cnt" as it
        often works well even when the noise matrices are estimated poorly,
        by using the matrix diagonals instead of all the probabilities.

    seed : int (default = None)
        Seed for the random number generator that splits the folds. If
        None, np.random's current state is used.

    Returns
    -------
    A tuple of five numpy array matrices in the form:
    (py, noise_matrix, inverse_noise_matrix,
    joint count matrix i.e. confident joint, predicted probability matrix)
    """
    # Step 1: confident joint and cross-validated predicted probabilities.
    joint_and_probs = estimate_confident_joint_and_cv_pred_proba(
        X=X,
        s=s,
        clf=clf,
        cv_n_folds=cv_n_folds,
        thresholds=thresholds,
        seed=seed,
    )
    confident_joint, psx = joint_and_probs

    # Step 2: latent prior and noise matrices derived from the joint.
    latent = estimate_latent(
        confident_joint=confident_joint,
        s=s,
        py_method=py_method,
        converge_latent_estimates=converge_latent_estimates,
    )
    py, noise_matrix, inv_noise_matrix = latent

    return py, noise_matrix, inv_noise_matrix, confident_joint, psx
# For every class in `clss`: try several vectorizer configurations, select
# features with a first logistic model, refit on the selected features and
# compute ROC points on the test data.
# NOTE(review): the nesting below is reconstructed from syntax; confirm it
# against the original file.
for cl in clss:
    #Constrain text to 50/50 true false split
    df = classify_data(cl)
    X_train = df['text']
    y_train = df['y']
    #Begin random search of vectorizer parameters for tfidf.
    for iteration in range(iterations):
        # NOTE(review): `vectorizer` is called again on every pass although
        # only element `iteration` of its result is used — check whether the
        # call can be hoisted out of the loop.
        vec_list = vectorizer(tokenizer=tokenizer,
                              iterations=iterations,
                              max_df_options=max_df,
                              min_df_options=min_df,
                              ngram_range_options=ngram_range)
        # X_train is rebound from the raw text column to the 'vec' entry.
        X_train = vec_list[iteration]['vec']
        print("Vecorization complete, now training the model")
        # Fit initial model
        logit = LogReg(random_state=42, solver='lbfgs', max_iter=1000).fit(X_train, y_train)
        #Select optimal features
        model = SelectFromModel(logit, prefit=True)
        X_new = model.transform(X_train)
        #Fit new model to selected features
        logit_new = LogReg(random_state=42, solver='lbfgs', max_iter=1000).fit(X_new, y_train)
        #Transform test text according to fitted tfidf from training set
        print("Vectorizing Test Data")
        test_vec = vec_list[iteration]['tfidf'].transform(X_test)
        test_vec_new = model.transform(test_vec)
        #Make predictions from the model with the test data
        print("Calculating predictions")
        prediction = logit_new.predict(test_vec_new)
        # NOTE(review): roc_curve receives the output of predict() (hard
        # labels) rather than scores/probabilities — confirm this is intended.
        fpr_rf, tpr_rf, thresh_rf = skm.roc_curve(list(adjust_test_format(cl)), list(prediction))
# Compare four classifiers on n-gram features, with and without
# lemmatization, for several `max_features` values.
# NOTE(review): the nesting below is reconstructed from syntax; confirm it
# against the original file.
all_max_features = [100, 500, 1000, 2000, 4000]
for max_features in all_max_features:
    print('Computing for {} max features'.format(max_features))
    # Features from the first text variant (X_train / X_test).
    vectorizers = get_ngram_vectorizers(X_train, max_features)
    X_train_ngrams = vectorize(vectorizers, X_train)
    X_test_ngrams = vectorize(vectorizers, X_test)
    # Features from the second text variant (X_train_l / X_test_l); the
    # prints below call it the lemmatized one.
    vectorizers = get_ngram_vectorizers(X_train_l, max_features)
    X_train_ngrams_l = vectorize(vectorizers, X_train_l)
    X_test_ngrams_l = vectorize(vectorizers, X_test_l)
    # NOTE(review): the fourth model is named 'Linear SVM' but is built as
    # SVC(random_state=46) — check what `SVC` is bound to in this file.
    models = [LogReg(random_state=43), RF(random_state=44), Perc(shuffle=True, random_state=45), SVC(random_state=46)]
    model_names = ['Logistic Regression', 'Random Forest', 'Perceptron', 'Linear SVM']
    for model, model_name in zip(models, model_names):
        print('Computing: {}'.format(model_name))
        # i indexes the three feature sets: 'uni', 'bi', 'unibi'.
        for i in range(3):
            print('no lemmatization')
            model.fit(X_train_ngrams[i], y_train)
            score = model.score(X_test_ngrams[i], y_test)
            results[model_name + '_nl_' + str(max_features) + ['uni', 'bi', 'unibi'][i]] = score
            print('Accuracy for test set: ', score)
            print('with lematization')
            # The same model object is refitted on the second feature set.
            model.fit(X_train_ngrams_l[i], y_train)
            score = model.score(X_test_ngrams_l[i], y_test)
def __init__(self, config=None, class_min=0):
    """Set up the logistic-regression model and its feature scaler.

    ``config`` and ``class_min`` are first passed through
    ``_resolve_config`` and ``_resolve_class_min``; the resolved config is
    then unpacked as keyword arguments for ``LogReg``.
    """
    resolved_config = self._resolve_config(config)
    self.config = resolved_config

    resolved_class_min = self._resolve_class_min(class_min)
    self.class_min = resolved_class_min

    self.model = LogReg(**self.config)
    self.scaler = StandardScaler()
import sklearn.preprocessing
from sklearn.linear_model import LogisticRegression as LogReg
#%% Logistic regression
# if ever need to change X and Y and then save to pickle
# NOTE(review): X and y must already exist when these two lines run; they
# are not defined in this excerpt.
X.to_pickle(os.path.join(savedDataFld, 'trainPredictorsDummies.pkl'))
y.to_pickle(os.path.join(savedDataFld, 'trainOutcome.pkl'))

# Reload the training predictors/outcome and the second labelled set.
X = pd.read_pickle(os.path.join(savedDataFld, 'trainPredictorsDummies.pkl'))
y = pd.read_pickle(os.path.join(savedDataFld, 'trainOutcome.pkl'))
train2_X = pd.read_pickle(
    os.path.join(savedDataFld, 'testPredictorsDummies.pkl'))
train2_y = pd.read_pickle(os.path.join(savedDataFld, 'testOutcome.pkl'))
# Predictors loaded from 'actualPredictorsDummies.pkl'; no outcome is loaded
# for them here.
X_test = pd.read_pickle(
    os.path.join(savedDataFld, 'actualPredictorsDummies.pkl'))
#%% fit the model on 70% and test on 30%
logRegM = LogReg(random_state=0, solver='lbfgs', max_iter=1000).fit(X, y)
train2_z = logRegM.predict(train2_X)
confusionMatrixInfo(train2_z, train2_y)
# The string below is a bare expression statement; it appears to record the
# output of an earlier run.
""" {'confusionMatrix': array([[135, 31], [ 28, 73]]), 'accuracy': 0.7790262172284644, 'no information rate': 0.6217228464419475, 'sensitivity': 0.8132530120481928, 'specificity': 0.7227722772277227} """
def test_age_identification():
    """Cross-validate four classifiers on the age-group label.

    Loads the PAN15 English training users, builds count-based features,
    min-max scales the five count features and prints the mean 10-fold
    cross-validation score of each classifier.
    """
    path = 'pan15-author-profiling-training-dataset-2015-03-02\\pan15-author-profiling-training-dataset-english-2015-03-02\\'
    users = preproc.load_users(path)
    users_dict = preproc.load_users_dict(path)
    truth = preproc.get_users_truth(path)

    # add features
    print('Creating features')
    df = preproc.create_users_dataframe(path)
    # truth[user_id] holds the gender at index 0 and the age group at index 1.
    df['label'] = df['user_id'].map(
        lambda user_id: 0 if truth[user_id][0] == 'M' else 1)
    df['age_label'] = df['user_id'].map(lambda user_id: truth[user_id][1])
    le = preprocessing.LabelEncoder()
    df['age_label'] = le.fit_transform(df['age_label'])
    df = features.add_self_references_count(df, users)
    df = features.add_positive_words(
        df, users_dict, preproc.load_words('resources\\positive-words.txt'))
    df = features.add_negative_words(
        df, users_dict, preproc.load_words('resources\\negative-words.txt'))
    df = features.add_articles(df, users_dict)
    df = features.add_url_count(df, users_dict)
    df = features.add_long_words(df, users)

    # normalize features
    print('Normalizing features')
    # Fix: assign the scaled values back. The original discarded the result
    # of fit_transform and relied on copy=False mutating each 1-D column in
    # place, which does nothing once the input has to be converted or
    # copied. MinMaxScaler scales each column independently, so one 2-D call
    # matches the intended per-column scaling.
    scaled_columns = [
        'pos_words', 'neg_words', 'self_ref_count', 'url_count', 'articles'
    ]
    scaler = MinMaxScaler()
    df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

    # Columns used as features alongside the count features; they are not
    # scaled, as in the original.
    long_words = [
        'username', 'people', 'nowplaying', 'really', 'should', 'others',
        'thanks', 'twitter', 'always', 'google', 'things', 'better', 'tumblr',
        'school', 'because', 'someone', 'facebook', 'frzhtmoge7', 'please',
        'something'
    ]
    feature_names = [
        'self_ref_count', 'articles', 'pos_words', 'neg_words', 'url_count'
    ]
    all_features = feature_names + long_words

    # initialize classifiers
    # NOTE(review): the entry labelled 'linear SVM' is built with svm.SVC()
    # and no explicit kernel — confirm the label matches the model.
    clfs = {
        'logistic regression': LogReg(),
        'linear SVM': svm.SVC(),
        'GaussianNB': GaussianNB(),
        'random forest': RandomForestClassifier()
    }
    for name, clf in clfs.items():
        scores = cross_validation.cross_val_score(clf,
                                                  df[all_features],
                                                  df['age_label'],
                                                  cv=10)
        # A single parenthesised argument prints the same on Python 2 and 3.
        print(name + ' : ' + str(scores.mean()))
train_scores = (classification_report( train_labels, train_prediction)).split( '\n') #precision, recall and F-score on train data train_score = ' '.join( train_scores[0].lstrip().split(' ')[:-1]) + '\n' + ' '.join( train_scores[-2].split(' ')[3:-1]) return 'test %.3f train %.3f' % ( test_accuracy, train_accuracy ) + '\n' + 'train: ' + train_score + '\n' + 'test: ' + test_score, k[:-1] if __name__ == "__main__": classifiers_dict = dict() search_parameters = dict() default_parameters = dict() classifiers_dict['LogReg'] = LogReg() classifiers_dict['LinearSVC'] = LinearSVC() search_parameters['LogReg'] = {'C': (3 * 10**-3, 3 * 10**-2, 3 * 10**-1)} search_parameters['LinearSVC'] = { 'C': (3 * 10**-3, 3 * 10**-2, 3 * 10**-1) } d0 = ['implementation', 'epoch'] columns = [ 'cbow', 'size', 'alpha', 'window', 'negative', 'sample', 'min_count' ] best_params = ['best_parametersLogReg', 'best_parametersLinearSVC'] classifiers = ['LogReg', 'LinearSVC'] diag_dir = sys.argv[4] epoch = int(sys.argv[5]) if (epoch == 0):
def _model_name_value(model_name, key):
    """Return the whitespace-separated token that follows *key* in *model_name*.

    The lookup is a plain substring search for the first occurrence of *key*.
    Returns None when *key* does not occur in the name.
    """
    pos = model_name.find(key)
    if pos == -1:
        return None
    return model_name[pos:].split()[1]


def _param_signature(model_name, key):
    """Return ``'<key> <value>'`` for *key* in *model_name*, or ``'<key> -1'`` if absent."""
    value = _model_name_value(model_name, key)
    return key + ' ' + ('-1' if value is None else value)


def main(space_dir, classifier, C=None):
    """Evaluate concatenated PV-DBOW + PV-DM vectors and write the results.

    Every ``.txt`` file in *space_dir* whose name contains ``'cbow 0'``
    (PV-DBOW) is paired with every ``.txt`` file whose name contains
    ``'cbow 1'`` (PV-DM) that has the same size/window/negative/min_count
    settings encoded in its file name.  For each compatible pair the train and
    test document vectors are concatenated feature-wise, passed to
    ``Classification``, and the outcome is written as a one-row CSV
    (``Res_concat_IMDB<classifier>.csv``) into a fresh
    ``./runs_IMDB/<name>-<time>/`` directory and printed.

    Side effects: updates the module-level ``default_parameters``,
    ``classifiers_dict`` and (when *C* is None) ``search_parameters`` dicts.

    Args:
        space_dir: Directory holding the saved model vector files.
        classifier: Classifier name; used as the result column, as the CSV
            file-name suffix, and forwarded to ``Classification``.
        C: Optional regularisation value.  When given, both classifiers are
            created with it; otherwise default classifiers are created and a
            parameter grid is registered in ``search_parameters``.

    Returns:
        None.
    """
    # Future DataFrame fields.
    d0 = ['implementation']
    parameters = ['size', 'window', 'negative', 'min_count']
    columns = [
        'size', 'alpha0', 'alpha1', 'window', 'negative', 'cbow0_sample',
        'cbow1_sample', 'min_count', 'iter0', 'iter1'
    ]
    best_params = ['best_parameters']
    classifiers = ['LogReg', 'LinearSVC']
    frame_columns = d0 + columns + classifiers + best_params

    # Default parameters from the article.
    default_parameters['size'] = 150
    default_parameters['alpha'] = 0.05
    default_parameters['window'] = 10
    default_parameters['negative'] = 25
    default_parameters['min_count'] = 1
    default_parameters['iter'] = '_wtf_'

    if C is not None:
        # C was given as an input value: initialize the classifiers with it.
        classifiers_dict['LogReg'] = LogReg(C=C)
        classifiers_dict['LinearSVC'] = LinearSVC(C=C)
    else:
        # Otherwise prepare for grid search.
        classifiers_dict['LogReg'] = LogReg()
        classifiers_dict['LinearSVC'] = LinearSVC()
        search_parameters['LogReg'] = {
            'C': (10**-5, 3 * 10**-5, 10**-4, 3 * 10**-4, 10**-3, 3 * 10**-3,
                  10**-2, 3 * 10**-2, 10**-1, 3 * 10**-1, 1),
            'max_iter': (200, 400, 1000, 2000)
        }
        search_parameters['LinearSVC'] = {
            'C': (10**-5, 3 * 10**-5, 10**-4, 3 * 10**-4, 10**-3, 3 * 10**-3,
                  10**-2, 3 * 10**-2, 10**-1, 3 * 10**-1, 1),
            'max_iter': (200, 400, 1000, 2000)
        }

    # List the directory once instead of once per outer iteration.
    model_files = [
        name for name in os.listdir(space_dir) if name.endswith('.txt')
    ]
    dbow_files = [name for name in model_files if 'cbow 0' in name]
    dm_files = [name for name in model_files if 'cbow 1' in name]

    index = 1  # only one row in each DataFrame

    for model in dbow_files:
        string = model.split(".txt")[0]  # name of the PV-DBOW model
        implementation = string.split()[0]  # doc2vec or word2vec
        # Parameters encoded in the PV-DBOW name; absent ones are marked -1.
        par_list = [_param_signature(string, column) for column in parameters]

        for other_file in dm_files:
            other_model = other_file.split(".txt")[0]  # name of the PV-DM model

            # The PV-DM model is compatible only if each of its parameters
            # (or its absence) also appears in the PV-DBOW model's list.
            if not all(
                    _param_signature(other_model, column) in par_list
                    for column in parameters):
                continue

            df = pd.DataFrame(columns=frame_columns)
            samples = []  # list for samples value

            # Put the parameters into the DataFrame.  ``.at`` is the
            # supported replacement for the removed ``DataFrame.set_value``.
            for column in parameters:
                value = _model_name_value(string, column)
                if value is None:
                    value = default_parameters[column]
                df.at[index, column] = value

            value = _model_name_value(string, 'sample')
            if value is None:
                value = '1e-2'
            df.at[index, 'cbow0_sample'] = value
            samples.append('sample ' + value)

            df.at[index, 'implementation'] = implementation

            # (column, model name to parse, key in the name, fallback value)
            for column, name, key, fallback in (
                    ('cbow1_sample', other_model, 'sample', '1e-4'),
                    ('alpha0', string, 'alpha', '0.05'),
                    ('alpha1', other_model, 'alpha', '0.05'),
                    ('iter0', string, 'iter', 'wtf'),
                    ('iter1', other_model, 'iter', 'wtf')):
                value = _model_name_value(name, key)
                df.at[index, column] = fallback if value is None else value

            # Load train and test vectors from the PV-DBOW model + labels.
            # os.path.join keeps working when space_dir has no trailing
            # separator; a failing model is reported and skipped.
            try:
                DocumentVectors0_0, DocumentVectors1_0 = DocumentVectors(
                    os.path.join(space_dir, model), implementation)
            except Exception:
                print(model)  # print which model causes the problem
                traceback.print_exc(file=sys.stdout)
                continue

            # Load train and test vectors from the PV-DM model + labels.
            try:
                DocumentVectors0_1, DocumentVectors1_1 = DocumentVectors(
                    os.path.join(space_dir, other_model + '.txt'),
                    implementation)
            except Exception:
                print(other_model)  # print which model causes the problem
                traceback.print_exc(file=sys.stdout)
                continue

            # Concatenate PV-DBOW and PV-DM train vectors feature-wise.
            DocumentVectors0 = np.concatenate(
                (DocumentVectors0_0, DocumentVectors0_1), axis=1)
            # Concatenate PV-DBOW and PV-DM test vectors feature-wise.
            DocumentVectors1 = np.concatenate(
                (DocumentVectors1_0, DocumentVectors1_1), axis=1)

            d = 50
            y_1 = [1] * 500
            y_0 = [0] * 500
            # NOTE(review): the slices leave 450 labels per class for
            # training; presumably this matches the vector layout returned
            # by DocumentVectors -- confirm.
            train_labels = y_1[:-d] + y_0[d:]
            test_labels = y_1 + y_0

            # Name the directory after the model parameters and start time.
            dir_name = (other_model + ''.join(samples)).replace(
                ' ', '_').replace('-', '')
            run_dir = './runs_IMDB/%s-%s/' % (dir_name, time_str())
            os.makedirs(run_dir, exist_ok=True)

            # Get the evaluation report and the best parameters.
            accuracy, best = Classification(
                classifier, C, DocumentVectors0, train_labels,
                DocumentVectors1, test_labels)

            # Write it all into the DataFrame ...
            df.at[index, classifier] = accuracy
            df.at[index, 'best_parameters'] = best
            df.to_csv(run_dir + "Res_concat_IMDB" + classifier + ".csv")

            # ... and to the output.
            print(other_model)
            print(model)
            print(accuracy)
# default inicial EMBARKED = 'Southampton' FARE = 33 AGE = 30 GENDER = 'Female' TITLE = 'Mrs.' CLASS = 'Second' CABIN = 'C' SIBSP = 0 PARCH = 0 # inicializando variables tasa_media = 0 # modelo logreg = LogReg() # flask app app = Flask(__name__) # antes del primer request... @app.before_first_request def startup(): global tasa_media, logreg data = genfromtxt(PATH + '/data/titanic.csv', delimiter=',') tasa_media = (np.mean([e[0] for e in data]) * 100) X_train, X_test, y_train, y_test = tts([e[1:] for e in data],
'intercept_scaling': (1, 2, 3), 'max_iter': (100, 200, 400, 800, 1000), 'multi_class': ('ovr', 'multinomial') } #search_parameters['SklearnMLP'] = {'solver' : ('lbfgs', 'sgd', 'adam')}#TODO search_parameters['SklearnLinearSVC'] = { 'loss': ('hinge', 'squared_hinge'), 'penalty': ('l1', 'l2'), 'dual': (False, True), 'fit_intercept': (True, False), 'intercept_scaling': (1, 2, 3), 'max_iter': (100, 200, 400, 800, 1000), 'multi_class': ('ovr', 'crammer_singer') } classifiers_dict['SklearnLogReg'] = LogReg() #classifiers_dict['SklearnMLP'] = MLPClassifier(hidden_layer_sizes = (50, 50), max_iter=1000) classifiers_dict['SklearnLinearSVC'] = LinearSVC() newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes')) newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes')) vectorizer = TfidfVectorizer(ngram_range=(1, 2)) DocumentVectors0 = vectorizer.fit_transform(newsgroups_train.data) DocumentVectors1 = vectorizer.transform(newsgroups_test.data)
def estimate_confident_joint_and_cv_pred_proba(
    X,
    s,
    clf=LogReg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    thresholds=None,
    seed=None,
    calibrate=True,
):
    """Estimate the confident joint P(s, y) and out-of-sample probabilities.

    Out-of-sample predicted probabilities (psx) are obtained for every example
    with stratified K-fold cross-validation, and the confident counts of the
    latent joint distribution of noisy labels s and true labels y are then
    computed from s and psx.  The joint is a numpy array of shape (K, K).

    Notes
    -----
    There are two ways to compute the confident joint:

    1. Compute a confident joint per holdout fold and sum them.  This computes
       thresholds correctly for each fold.
    2. Compute the predicted probabilities per fold, combine them, and compute
       one confident joint from all probabilities.  This is more accurate
       with little data: with 100 examples, 5 folds and uniform p(y), (1)
       would have only 20 examples per confident joint.

    This function implements (2).

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array.

    s : np.array
        A discrete vector of labels which may contain mislabeling.  "s"
        denotes the noisy label (y-tilde).

    clf : sklearn.classifier or equivalent
        Classifier providing fit() and predict_proba().  Defaults to logistic
        regression.  It is deep-copied for every fold, so the object passed
        in is never fitted.

    cv_n_folds : int
        Number of cross-validation folds used to compute the out-of-sample
        probabilities for each example in X.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k).  An example whose predicted probability exceeds this
        threshold is counted as having hidden label y = k.  Used only for
        estimating the noise rates with confident counts, not for pruning.
        Values should be between 0 and 1.  Default is None.

    seed : int (default = None)
        Random state used to split the cross-validated folds.  If None, the
        current np.random state is used.

    calibrate : bool (default: True)
        Calibrate the confident joint estimate P(s=i, y=j) such that
        np.sum(cj) == len(s) and np.sum(cj, axis = 1) == np.bincount(s).

    Returns
    -------
    tuple of two numpy arrays
        (joint counts matrix, predicted probability matrix)
    """

    assert_inputs_are_valid(X, s)

    # Work on an array of labels and count the distinct classes.
    s = np.asarray(s)
    n_classes = len(np.unique(s))

    # Stratified folds keep each class's share of noisy labels per fold.
    splitter = StratifiedKFold(
        n_splits=cv_n_folds, shuffle=True, random_state=seed)

    # One row of class probabilities per example, filled fold by fold.
    holdout_probs = np.zeros((len(s), n_classes))

    for train_idx, holdout_idx in splitter.split(X, s):
        # Train a fresh copy on the training part of the fold ...
        fold_clf = copy.deepcopy(clf)
        fold_clf.fit(X[train_idx], s[train_idx])
        # ... and store its P(s = k|x) for the held-out examples.
        holdout_probs[holdout_idx] = fold_clf.predict_proba(X[holdout_idx])

    # K x K matrix of confident counts for all pairs of labels.
    joint = compute_confident_joint(
        s=s,
        psx=holdout_probs,  # P(s = k|x)
        thresholds=thresholds,
        calibrate=calibrate,
    )

    return joint, holdout_probs