def main():
    # set features here
    features = ['pclass', 'sex', 'age', 'parch', 'sibsp']
    # Set True to remove entries with no age data from the training set,
    # False otherwise.
    remove_ageless = False
    h, train = load('train.csv')
    if remove_ageless:
        train = train[train[:, h.index('Age')] != '']
    raw_x, raw_t = split_targets(train, h.index('Survived'))
    h, test = load('test.csv')
    # get array of keys; won't include 'survived'
    keys = [key.lower() for key in h]
    X = preprocess(raw_x, keys, features, bin_age=True, bin_pclass=True)
    # np.float / np.int are deprecated; use the builtin types instead.
    T = raw_t.reshape(raw_t.shape[0], 1).astype(float)
    lr = logreg()
    lr.fit(X[:, 1:], T)
    print('Coef: ' + str(lr.coef_))
    result = lr.predict(
        preprocess(test, keys, features, bin_age=True, bin_pclass=True)[:, 1:])
    save_result(test[:, 0], result.astype(int), 'predict.csv')
def __init__(
    self,
    clf=None,
    seed=None,
    # Hyper-parameters (used by the .fit() function)
    cv_n_folds=5,
    prune_method='prune_by_noise_rate',
    converge_latent_estimates=False,
    pulearning=None,
):
    if clf is None:
        # Use logistic regression if no classifier is provided.
        clf = logreg(multi_class='auto', solver='lbfgs')
    # Make sure the passed-in classifier has the appropriate methods defined.
    if not hasattr(clf, "fit"):
        raise ValueError('The classifier (clf) must define a .fit() method.')
    if not hasattr(clf, "predict_proba"):
        raise ValueError('The classifier (clf) must define a .predict_proba() method.')
    if not hasattr(clf, "predict"):
        raise ValueError('The classifier (clf) must define a .predict() method.')
    if seed is not None:
        np.random.seed(seed=seed)
    self.clf = clf
    self.seed = seed
    self.cv_n_folds = cv_n_folds
    self.prune_method = prune_method
    self.converge_latent_estimates = converge_latent_estimates
    self.pulearning = pulearning
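# A minimal usage sketch (assumption, based on the imports later in this
# collection: this __init__ belongs to cleanlab's LearningWithNoisyLabels).
# Any estimator that defines .fit(), .predict(), and .predict_proba() can
# be passed as clf in place of the default logistic regression.
from sklearn.ensemble import RandomForestClassifier
from cleanlab.classification import LearningWithNoisyLabels

lnl = LearningWithNoisyLabels(
    clf=RandomForestClassifier(n_estimators=100),  # any predict_proba-capable model
    seed=0,
    cv_n_folds=5,
)
# lnl.fit(X_train, s)      # s: possibly-noisy training labels
# y_hat = lnl.predict(X_test)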
def estimate_noise_matrices(
    X,
    s,
    clf=logreg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    thresholds=None,
    converge_latent_estimates=True,
    seed=None,
):
    '''Estimates the noise_matrix of shape (K, K). This is the fraction of
    examples in every class, labeled as every other class. The noise_matrix
    is a conditional probability matrix for P(s=k_s|y=k_y).

    Under certain conditions, estimates are exact, and in most conditions,
    estimates are within one percent of the actual noise rates.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array

    s : np.array
        A discrete vector of labels, s, which may contain mislabeling

    clf : sklearn.classifier or equivalent
        Default classifier used is logistic regression. Assumes clf
        has predict_proba() and fit() defined.

    cv_n_folds : int
        The number of cross-validation folds used to compute
        out-of-sample probabilities for each example in X.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k). If an example has a predicted probability "greater" than
        this threshold, it is counted as having hidden label y = k. This is
        not used for pruning, only for estimating the noise rates using
        confident counts. This value should be between 0 and 1. Default is None.

    converge_latent_estimates : bool
        If true, forces numerical consistency of estimates. Each is estimated
        independently, but they are related mathematically with closed form
        equivalences. This will iteratively make them mathematically consistent.

    seed : int (default = None)
        Number to set the default state of the random number generator used to
        split the cross-validated folds. If None, uses the np.random current
        random state.

    Returns
    -------
    A two-item tuple containing (noise_matrix, inv_noise_matrix).'''

    return estimate_py_noise_matrices_and_cv_pred_proba(
        X=X,
        s=s,
        clf=clf,
        cv_n_folds=cv_n_folds,
        thresholds=thresholds,
        converge_latent_estimates=converge_latent_estimates,
        seed=seed,
    )[1:-2]
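# A hedged usage sketch for the function above, on small synthetic data
# (sklearn's make_classification; the noise-free labels stand in for a
# possibly-noisy s here purely for illustration).
import numpy as np
from sklearn.datasets import make_classification

X, s = make_classification(n_samples=300, n_features=4, random_state=0)
noise_matrix, inv_noise_matrix = estimate_noise_matrices(X, s, seed=0)
print(np.round(noise_matrix, 2))  # estimated P(s=k_s | y=k_y)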
def __init__(
    self,
    clf=None,
    e1=None,
):
    self.clf = logreg() if clf is None else clf
    self.e1 = e1
def __init__(self, frac_pos2neg, frac_neg2pos, clf=None):
    if frac_pos2neg is not None and frac_neg2pos is not None:
        # Verify that rh1 + rh0 < 1 and pi0 + pi1 < 1.
        if frac_pos2neg + frac_neg2pos >= 1:
            raise Exception(
                "frac_pos2neg + frac_neg2pos < 1 is a "
                "necessary condition for noisy PN (binary) classification."
            )
    self.rh1 = frac_pos2neg
    self.rh0 = frac_neg2pos
    self.clf = logreg() if clf is None else clf
def __init__(
    self,
    frac_pos2neg=None,
    frac_neg2pos=None,
    clf=None,
):
    if frac_pos2neg is not None and frac_neg2pos is not None:
        # Verify that rh1 + rh0 < 1 and pi0 + pi1 < 1.
        if frac_pos2neg + frac_neg2pos >= 1:
            raise Exception(
                "frac_pos2neg + frac_neg2pos < 1 is "
                "a necessary condition for Rank Pruning."
            )
    self.rh1 = frac_pos2neg
    self.rh0 = frac_neg2pos
    self.clf = logreg() if clf is None else clf
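# A brief illustration of the rh1 + rh0 < 1 requirement (the class name
# RankPruning is an assumption here, suggested by the error message above).
# If the two noise rates sum to 1 or more, flipped labels carry no usable
# signal, so the constructor rejects them.
rp = RankPruning(frac_pos2neg=0.2, frac_neg2pos=0.3)   # fine: 0.2 + 0.3 < 1
try:
    RankPruning(frac_pos2neg=0.6, frac_neg2pos=0.5)    # 0.6 + 0.5 >= 1
except Exception as e:
    print(e)  # "frac_pos2neg + frac_neg2pos < 1 is a necessary condition ..."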
def train_clf(self, trainfiles):
    # tokens: list of words, labels: list of corresponding labels
    # go document by document because of local context
    final_labels = []
    featmat = []
    for trainfile in trainfiles:
        for tokens, labels in yield_tokens_labels(trainfile):
            final_labels.extend(labels)
            featmat.append(self.make_featmat_rep(tokens))
    featmat = np.vstack(featmat)
    print("training classifier")
    clf = logreg(class_weight='balanced', random_state=1)
    clf.fit(featmat, final_labels)
    self.clf = clf
def estimate_cv_predicted_probabilities(
    X,
    labels,  # class labels can be noisy (s) or not noisy (y).
    clf=logreg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    seed=None,
):
    '''This function computes the out-of-sample predicted probability
    [P(s=k|x)] for every example in X using cross validation. Output is
    a np.array of shape (N, K) where N is the number of training examples
    and K is the number of classes.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array

    labels : np.array or list of ints from [0,1,..,K-1]
        A discrete vector of class labels which may or may not contain mislabeling

    clf : sklearn.classifier or equivalent
        Default classifier used is logistic regression. Assumes clf
        has predict_proba() and fit() defined.

    cv_n_folds : int
        The number of cross-validation folds used to compute
        out-of-sample probabilities for each example in X.

    seed : int (default = None)
        Number to set the default state of the random number generator used to
        split the cross-validated folds. If None, uses the np.random current
        random state.

    Returns
    -------
    psx : np.array (shape (N, K))
        P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
        examples x. This is the probability distribution over all K classes,
        for each example, regarding whether the example has label s==k
        P(s=k|x). psx should have been computed using 3 (or higher) fold
        cross-validation.'''

    return estimate_py_noise_matrices_and_cv_pred_proba(
        X=X,
        s=labels,
        clf=clf,
        cv_n_folds=cv_n_folds,
        seed=seed,
    )[-1]
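# A hedged usage sketch: obtaining out-of-sample predicted probabilities
# on small synthetic data (make_classification is only a placeholder for
# your own possibly-noisy dataset).
from sklearn.datasets import make_classification

X, labels = make_classification(n_samples=300, n_features=4, random_state=0)
psx = estimate_cv_predicted_probabilities(X, labels, cv_n_folds=5, seed=0)
print(psx.shape)        # (N, K)
print(psx.sum(axis=1))  # each row is a probability distribution over K classes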
def __init__(
    self,
    clf=None,
    seed=None,
    # Hyper-parameters (used by the .fit() function)
    cv_n_folds=5,
    prune_method='prune_by_noise_rate',
    converge_latent_estimates=False,
    pulearning=None,
    n_jobs=None,
):
    if clf is None:
        # Use logistic regression if no classifier is provided.
        clf = logreg(multi_class='auto', solver='lbfgs')
    # Make sure the passed-in classifier has the appropriate methods defined.
    if not hasattr(clf, "fit"):
        raise ValueError(
            'The classifier (clf) must define a .fit() method.')
    if not hasattr(clf, "predict_proba"):
        raise ValueError(
            'The classifier (clf) must define a .predict_proba() method.')
    if not hasattr(clf, "predict"):
        raise ValueError(
            'The classifier (clf) must define a .predict() method.')
    if seed is not None:
        np.random.seed(seed=seed)
    # Set the number of multiprocessing threads used by get_noise_indices().
    if n_jobs is None:
        if os.name == 'nt':  # Windows Python users
            n_jobs = 1  # Windows has multiprocessing issues, so we use 1 job.
        else:  # Mac and Linux Python users
            n_jobs = multiprocessing.cpu_count()
    else:
        assert n_jobs >= 1
    self.clf = clf
    self.seed = seed
    self.cv_n_folds = cv_n_folds
    self.prune_method = prune_method
    self.converge_latent_estimates = converge_latent_estimates
    self.pulearning = pulearning
    self.n_jobs = n_jobs
def compute_cv_predicted_probabilities(
    X,
    y,  # labels, can be noisy (s) or not noisy (y).
    clf=logreg(),
    cv_n_folds=3,
    verbose=False,
):
    '''This function computes the out-of-sample predicted probability
    [P(s=k|x)] for every example in X using cross validation. Output is
    a np.array of shape (N, K) where N is the number of training examples
    and K is the number of classes.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array

    y : np.array
        A binary vector of labels, y, which may or may not contain mislabeling

    clf : sklearn.classifier or equivalent
        Default classifier used is logistic regression. Assumes clf
        has predict_proba() and fit() defined.

    cv_n_folds : int
        The number of cross-validation folds used to compute
        out-of-sample probabilities for each example in X.

    verbose : bool
        Set to True if you wish to print additional information while running.
    '''

    return compute_noise_rates_and_cv_pred_proba(
        X=X,
        s=y,
        clf=clf,
        cv_n_folds=cv_n_folds,
        verbose=verbose,
    )[-1]
def train(X_train, y_train, random_state=0):
    regressor = logreg(random_state=random_state)
    regressor.fit(X_train, y_train)
    return regressor
    if wear:
        path += 'wear_'
    X = np.load(path + endpath)
    return X, Y


if __name__ == '__main__':
    X, Y = getdata(wear=True, base=True, cap=False)
    X[np.isinf(X)] = 0.0
    cls = CLASS_TO_BE_TESTED
    Y = (Y == cls) * 1.0
    iterations = ITERATIONS
    models = {
        LinearSVC(): 'Linear SVM',
        SVC(kernel='sigmoid'): 'Sigmoid SVM',
        RFC(): 'Random Forest',
        logreg(): 'Logistic Regression',
        dtree(): 'Decision Tree',
        newmodel(): 'True Random',
        newmodel(a=0): 'Always Zero',
        newmodel(a=1): 'Always One',
        KNN(n_neighbors=3): 'KNN',
    }
    best_model = None
    best_acc = 0
    for model in models:
        print('Testing Model - ', models[model])
        avg_acc = 0
        avg_fp = 0.0
        for it in range(0, iterations):
            # The commented code below was used to counter imbalanced data
            # classes; kept here for reference. You will probably need to
            # account for class imbalance when training the ensemble.
            # yidxf = np.arange(0, len(Y))[Y==0]
            # yidxt = np.arange(0, len(Y))[Y==1]
            #
def estimate_confident_joint_and_cv_pred_proba(
    X,
    s,
    clf=logreg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    thresholds=None,
    seed=None,
    calibrate=True,
):
    '''Estimates P(s,y), the confident counts of the latent joint
    distribution of true and noisy labels, using observed s and predicted
    probabilities psx. The output of this function is a numpy array of
    shape (K, K).

    Under certain conditions, estimates are exact, and in many conditions,
    estimates are within one percent of actual.

    Notes: There are two ways to compute the confident joint, each with
    pros and cons:
    1. For each holdout set, compute the confident joint, then sum them up.
    2. Gather all the pred_proba, combine them, and compute the confident
       joint on all of them.
    (1) is more accurate because it computes the appropriate thresholds for
    each fold. (2) is more accurate when you have only a little data because
    it computes the confident joint using all the probabilities. For example,
    if you had only 100 examples, with 5-fold cross validation and uniform
    p(y), you would only have 20 examples to compute each confident joint for
    (1). Such small amounts of data are bound to result in estimation errors.
    For this reason, we implement (2), but we include (1) as a commented-out
    function at the end of this file.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array

    s : np.array
        A discrete vector of labels, s, which may contain mislabeling. "s"
        denotes the noisy label instead of \tilde(y), for ASCII encoding reasons.

    clf : sklearn.classifier or equivalent
        Default classifier used is logistic regression. Assumes clf
        has predict_proba() and fit() defined.

    cv_n_folds : int
        The number of cross-validation folds used to compute
        out-of-sample probabilities for each example in X.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k). If an example has a predicted probability "greater" than
        this threshold, it is counted as having hidden label y = k. This is
        not used for pruning, only for estimating the noise rates using
        confident counts. This value should be between 0 and 1. Default is None.

    seed : int (default = None)
        Number to set the default state of the random number generator used to
        split the cross-validated folds. If None, uses the np.random current
        random state.

    calibrate : bool (default: True)
        Calibrates confident joint estimate P(s=i, y=j) such that
        np.sum(cj) == len(s) and np.sum(cj, axis = 1) == np.bincount(s).

    Returns
    -------
    A tuple of two numpy array matrices in the form:
    (joint counts matrix, predicted probability matrix)'''

    assert_inputs_are_valid(X, s)

    # Number of classes
    K = len(np.unique(s))
    # 'ps' is p(s=k)
    ps = value_counts(s) / float(len(s))
    # Ensure labels are of type np.array()
    s = np.asarray(s)

    # Create cross-validation object for out-of-sample predicted probabilities.
    # CV folds preserve the fraction of noisy positive and
    # noisy negative examples in each class.
    kf = StratifiedKFold(n_splits=cv_n_folds, shuffle=True, random_state=seed)

    # Initialize psx array
    psx = np.zeros((len(s), K))

    # Split X and s into "cv_n_folds" stratified folds.
    for k, (cv_train_idx, cv_holdout_idx) in enumerate(kf.split(X, s)):
        clf_copy = copy.deepcopy(clf)
        # Select the training and holdout cross-validated sets.
        X_train_cv, X_holdout_cv = X[cv_train_idx], X[cv_holdout_idx]
        s_train_cv, s_holdout_cv = s[cv_train_idx], s[cv_holdout_idx]
        # Fit the clf classifier to the training set,
        # predict on the holdout set, and update psx.
        clf_copy.fit(X_train_cv, s_train_cv)
        psx_cv = clf_copy.predict_proba(X_holdout_cv)  # P(s = k|x)
        psx[cv_holdout_idx] = psx_cv

    # Compute the confident counts of all pairwise label-flipping mislabeling rates.
    confident_joint = compute_confident_joint(
        s=s,
        psx=psx,  # P(s = k|x)
        thresholds=thresholds,
        calibrate=calibrate,
    )

    return confident_joint, psx
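# A hedged usage sketch for the function above, on synthetic placeholder
# data (make_classification stands in for a real noisy-label dataset).
from sklearn.datasets import make_classification

X, s = make_classification(n_samples=300, n_features=4, random_state=0)
confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(X, s, seed=0)
# With calibrate=True, the joint counts are consistent with the label counts:
# confident_joint.sum() == len(s) and its row sums match np.bincount(s).
print(confident_joint)
print(psx.shape)  # (N, K)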
def estimate_py_noise_matrices_and_cv_pred_proba(
    X,
    s,
    clf=logreg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    thresholds=None,
    converge_latent_estimates=False,
    py_method='cnt',
    seed=None,
):
    '''This function computes the out-of-sample predicted probability
    P(s=k|x) for every example x in X using cross validation, while also
    computing the confident counts noise rates within each cross-validated
    subset and returning the average noise rate across all examples.

    This function estimates the noise_matrix of shape (K, K). This is the
    fraction of examples in every class, labeled as every other class. The
    noise_matrix is a conditional probability matrix for P(s=k_s|y=k_y).

    Under certain conditions, estimates are exact, and in most conditions,
    estimates are within one percent of the actual noise rates.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array

    s : np.array
        A discrete vector of labels, s, which may contain mislabeling. "s"
        denotes the noisy label instead of \tilde(y), for ASCII encoding reasons.

    clf : sklearn.classifier or equivalent
        Default classifier used is logistic regression. Assumes clf
        has predict_proba() and fit() defined.

    cv_n_folds : int
        The number of cross-validation folds used to compute
        out-of-sample probabilities for each example in X.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k). If an example has a predicted probability "greater" than
        this threshold, it is counted as having hidden label y = k. This is
        not used for pruning, only for estimating the noise rates using
        confident counts. This value should be between 0 and 1. Default is None.

    converge_latent_estimates : bool
        If true, forces numerical consistency of estimates. Each is estimated
        independently, but they are related mathematically with closed form
        equivalences. This will iteratively make them mathematically consistent.

    py_method : str
        How to compute the latent prior p(y=k). Default is "cnt" as it tends
        to work best, but you may also set this hyperparameter to "eqn" or
        "marginal".

    seed : int (default = None)
        Number to set the default state of the random number generator used to
        split the cross-validated folds. If None, uses the np.random current
        random state.

    Returns
    -------
    A tuple of five numpy array matrices in the form:
    (py, noise_matrix, inverse_noise_matrix,
    joint count matrix, i.e. confident joint, predicted probability matrix)'''

    confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(
        X=X,
        s=s,
        clf=clf,
        cv_n_folds=cv_n_folds,
        thresholds=thresholds,
        seed=seed,
    )

    py, noise_matrix, inv_noise_matrix = estimate_latent(
        confident_joint=confident_joint,
        s=s,
        py_method=py_method,
        converge_latent_estimates=converge_latent_estimates,
    )

    return py, noise_matrix, inv_noise_matrix, confident_joint, psx
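# A hedged usage sketch unpacking the full five-item result of the function
# above (synthetic placeholder data again).
from sklearn.datasets import make_classification

X, s = make_classification(n_samples=300, n_features=4, random_state=0)
py, noise_matrix, inv_noise_matrix, confident_joint, psx = \
    estimate_py_noise_matrices_and_cv_pred_proba(X, s, cv_n_folds=5, seed=0)
print(py)            # estimated latent prior p(y=k)
print(noise_matrix)  # estimated P(s=k_s | y=k_y)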
''' Load dataset from csv '''
result_path = 'entitymodel/results/'
# pd.DataFrame.from_csv is deprecated/removed; pd.read_csv with
# index_col=0 reproduces its behavior.
df = pd.read_csv('%sdataframe.csv' % result_path, sep=',', index_col=0)
df = normalize_rating(df)
df = df.fillna(value=0)
print(df.head())

drop_columns = ['label', 'domain', 'url']
X = df.drop(drop_columns, axis=1)
y = df['label']

''' Fit model '''
model = logreg()
# model = SVC(probability=True)
# model = GaussianNB()
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
for train_index, test_index in sss.split(X, y):
    ''' calculate url features based on training data of positive class '''
    # df = calc_url_features(df, train_index)
    # X_url = df.drop(drop_columns, axis=1)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    print('shape')
                    facecolors='none', edgecolors='black', linewidth=2, alpha=0.5)
    _ = plt.title('Dataset after pruning detected label errors.', fontsize=30)
    plt.show()
except Exception:
    print("Plotting is only supported in an iPython interface.")

print('The actual, latent, underlying noise matrix.')
print_noise_matrix(noise_matrix)
print('Our estimate of the noise matrix.')
print_noise_matrix(est_noise_matrix)

print("Accuracy Comparison")
print("-------------------")
clf = logreg()
baseline_score = accuracy_score(y_test, clf.fit(X_train, s).predict(X_test))
print("Logistic regression:", baseline_score)
rp = LearningWithNoisyLabels(seed=seed)
rp_score = accuracy_score(y_test, rp.fit(X_train, s, psx=psx).predict(X_test))
print("Logistic regression (+rankpruning):", rp_score)
diff = rp_score - baseline_score
clf = logreg()
# If we fit on the pruned dataset without reweighting, performance is much worse.
print(
    'Fit on denoised data without re-weighting:',
    accuracy_score(
        y_test,
        clf.fit(X_train[~idx_errors], s[~idx_errors]).predict(X_test)))
try:
# We'll also create a standardised version of the input set
# for comparing performance.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)
# apply the transformation
X_std_train = sc.transform(X_train)
X_std_test = sc.transform(X_test)

#######################################################################
# logistic regression
from sklearn.linear_model import LogisticRegression as logreg

mylogreg = logreg(solver='lbfgs')  # all other params left at defaults
mylogreg.fit(X_train, y_train)
pred = mylogreg.predict(X_test)
# note: as with Perceptron, the predictions are 0 or 1
error = (y_test != pred)
print('Misclass: ', error.sum())
print('Misclass rate: ', format(error.sum() / error.shape[0] * 100, '4.2f'), '%')

# note that you can access the estimated probabilities
predprob = mylogreg.predict_proba(X_test)
print(predprob[1:10, :])

# The decision boundary w0 + w1*x1 + w2*x2 = 0, solved for x2 as
# x2 = intercept + slope * x1:
intercept = -1 * mylogreg.intercept_[0] / mylogreg.coef_[0][1]
slope = -1 * mylogreg.coef_[0][0] / mylogreg.coef_[0][1]
print("with log reg: x2 = ", intercept, " + ", slope, "x1")
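# A small follow-up sketch (assumptions: X_test is a numpy array with
# exactly two features, and matplotlib is available): plot the decision
# boundary line computed above over the test points.
import numpy as np
import matplotlib.pyplot as plt

x1 = np.linspace(X_test[:, 0].min(), X_test[:, 0].max(), 100)
x2 = intercept + slope * x1  # points where the model is exactly undecided
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
plt.plot(x1, x2, 'k--', label='decision boundary')
plt.legend()
plt.show()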
def C_score(C, X, y, X_test, y_test):
    m = logreg(C=C)
    m.fit(X, y)
    return score(y_test, m.decision_function(X_test))
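# A hedged usage sketch: sweeping the inverse-regularization strength C
# over a log grid (assumes `score` above is a metric like
# sklearn.metrics.roc_auc_score that accepts decision-function values,
# and that the train/test splits already exist).
import numpy as np

for C in 10.0 ** np.arange(-3, 3):
    print(C, C_score(C, X_train, y_train, X_test, y_test))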
def compute_conf_counts_noise_rates(
    X,
    s,
    clf=logreg(),
    cv_n_folds=3,
    positive_lb_threshold=None,
    negative_ub_threshold=None,
    verbose=False,
):
    '''Computes the rho hat (rh) confident counts estimate of the noise
    rates from X and s. This function estimates rh1 (the fraction of pos
    examples mislabeled as neg, frac_pos2neg) and rh0 (the fraction of neg
    examples mislabeled as pos, frac_neg2pos).

    The acronym 'rh' stands for rho hat, where rho is a greek symbol for
    noise rate and hat tells us that the value is estimated, not
    necessarily exact.

    Under certain conditions, estimates are exact, and in most conditions,
    estimates are within one percent of the actual noise rates.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array

    s : np.array
        A binary vector of labels, s, which may contain mislabeling

    clf : sklearn.classifier or equivalent
        Default classifier used is logistic regression. Assumes clf
        has predict_proba() and fit() defined.

    cv_n_folds : int
        The number of cross-validation folds used to compute
        out-of-sample probabilities for each example in X.

    positive_lb_threshold : float
        P(s^=1|s=1). If an example has a predicted probability "greater" than
        this threshold, it is counted as having hidden label y = 1. This is
        not used for pruning, only for estimating the noise rates using
        confident counts. This value should be between 0 and 1. Default is None.

    negative_ub_threshold : float
        P(s^=1|s=0). If an example has a predicted probability "lower" than
        this threshold, it is counted as having hidden label y = 0. This is
        not used for pruning, only for estimating the noise rates using
        confident counts. This value should be between 0 and 1. Default is None.

    verbose : bool
        Set to True if you wish to print additional information while running.
    '''

    return compute_noise_rates_and_cv_pred_proba(
        X=X,
        s=s,
        clf=clf,
        cv_n_folds=cv_n_folds,
        positive_lb_threshold=positive_lb_threshold,
        negative_ub_threshold=negative_ub_threshold,
        verbose=verbose,
    )[:-1]
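# A hedged usage sketch for the binary estimator above, on synthetic data:
# it returns (rh1, rh0), the fold-averaged noise-rate estimates.
from sklearn.datasets import make_classification

X, s = make_classification(n_samples=300, n_features=4, random_state=0)
rh1_est, rh0_est = compute_conf_counts_noise_rates(X, s, cv_n_folds=3)
print('est. frac_pos2neg:', rh1_est, ' est. frac_neg2pos:', rh0_est)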
X = digits.data
y = digits.target

# train_test_split splits arrays or matrices into random train and test
# subsets. Without a fixed random_state you will get a different split on
# every run; this is expected behavior.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

names = [
    "Nearest Neighbors", "LinearDiscriminant", "Linear SVM",
    "LogisticRegression"
]
classifiers = [KNN(), LDA(), SVC(), logreg()]
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print('The score of ' + name + ' classifier is ' + str(score))


#########---------------Q4---------------#########
def load_data(folder):
    """
    Load all images from subdirectories of 'folder'.
    The subdirectory name indicates the class.
    """
import numpy as np
import pylab as pl
from sklearn.datasets import load_digits

digits = load_digits()
data = digits['data']
target = digits['target']

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as logreg
from sklearn.multiclass import OneVsRestClassifier
import characters

# l1 penalty requires the liblinear (or saga) solver in recent sklearn.
logregl1 = logreg(penalty='l1', solver='liblinear')

# cross validation (sklearn.cross_validation is deprecated; recent
# versions provide cross_val_score in sklearn.model_selection)
# from sklearn.model_selection import cross_val_score
# score = cross_val_score(logregl1, data, target, cv=5)
# clfs = [OneVsRestClassifier(logreg(penalty='l1', C=alpha))
#         for alpha in np.logspace(-5, 1, 10)]
# scores = [cross_val_score(clf, data, target, cv=5) for clf in clfs]
# scores_mean = np.array(scores).mean(axis=1)  # mean score for each value of alpha
# end of cross validation
    # gradient
    gradient = 0
    for n in range(x.shape[0]):
        num = -np.exp(-y[n] * np.dot(w, x[n])) * y[n] * x[n]
        den = 1 + np.exp(-y[n] * np.dot(w, x[n]))
        gradient += num / den
    return gradient


w = np.random.rand(2)
# set step size to a small positive value
step = .0001
clf = logreg()
weight_history = []
acc_history = []
for _ in range(100):
    # Apply the gradient descent rule.
    w = w - step * gradient_log_loss(w, X, y)
    # Print the current state.
    # print("Iteration %d: w = %s (log-loss = %.2f)" %
    #       (iteration, str(w), log_loss(w, x, y)))
    # Compute the accuracy:
    y_prob = 1 / (1 + np.exp(-np.dot(X, w)))
    # Threshold at 0.5 (results are 0 and 1)
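# A separate sanity-check sketch (assumption: gradient_log_loss above is
# the gradient of L(w) = sum_n log(1 + exp(-y_n * w.x_n)) with labels in
# {-1, +1}): compare it against a central finite-difference approximation.
import numpy as np

def log_loss_fn(w, X, y):
    # Logistic loss for labels in {-1, +1}.
    return np.sum(np.log(1 + np.exp(-y * (X @ w))))

def numeric_grad(w, X, y, eps=1e-6):
    # Central finite differences, one coordinate at a time.
    g = np.zeros_like(w)
    for i in range(len(w)):
        e = np.zeros_like(w)
        e[i] = eps
        g[i] = (log_loss_fn(w + e, X, y) - log_loss_fn(w - e, X, y)) / (2 * eps)
    return g

# print(np.allclose(gradient_log_loss(w, X, y), numeric_grad(w, X, y), atol=1e-4))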
def __init__(self, clf=None):
    # Stores the classifier used.
    # Default classifier used is logistic regression.
    self.clf = logreg() if clf is None else clf
# results = clf.score(xtest, ytest)
print(dec)


# In[ ]:

# Logistic regression as an alternative to lasso? sklearn's feature
# selection modules say that lasso is for regression, while logistic
# regression and linear SVC are for classification.
from sklearn.linear_model import LogisticRegression as logreg
from sklearn.feature_selection import SelectFromModel as sfm


# In[ ]:

lr1 = logreg()
lr1.fit(xtrain, ytrain1)
print(lr1.fit(xtrain, ytrain1))
print(lr1.score(xtest, ytest1))
print(lr1.coef_)
model = sfm(lr1, prefit=True)
xnew = model.transform(xtrain)
print(xtrain.shape)
print(xnew.shape)
# lr2 = logreg()
# lr2.fit(xnew, ytrain1)
print(xtrain)
print(xnew)
def visualize_clf(textdict, doccats, create_html=True, visids=[], subdir_html='',
                  subdir_wc='', maskfiles={}, use_logreg=False):
    """
    visualize a text categorization dataset w.r.t. classification scores
    (create htmls with highlighted words and word clouds)

    Input:
        textdict: dict with {doc_id: text}
        doccats: dict with {doc_id: category}
        create_html: whether to create the html files with scores highlighted
            for individual documents (default: True)
        visids: a subset of docids for which the html visualization should be
            created (optional) (if create_html=True but visids=[], select up
            to 1000 random ids)
        subdir_html: subdirectory to save the created html files in (has to exist)
        subdir_wc: subdirectory to save the created word cloud images in (has to exist)
        maskfiles: dict with {category: path_to_maskfile} for creating the
            word clouds in a specific form
        use_logreg: default False; whether to use logistic regression instead
            of linear SVM
    Returns:
        relevant_words: dict with {category: {word: relevancy score}}
    """
    print("possibly selecting subset of 10000 examples")
    textdict, doccats, visids = select_subset(textdict, doccats, visids)
    # training examples are all but visids
    trainids = list(set(textdict.keys()).difference(set(visids)))
    # train a classifier and predict
    if use_logreg:
        renorm = 'max'
        clf = logreg(class_weight='balanced', random_state=1)
    else:
        renorm = 'length'
        clf = LinearSVC(C=10., class_weight='balanced', random_state=1)
    print("transforming text into features")
    # make features (we can use bigrams if we don't have to create htmls)
    ft = FeatureTransform(norm='max', weight=True, renorm=renorm,
                          identify_bigrams=not create_html, norm_num=False)
    docfeats = ft.texts2features(textdict, fit_ids=trainids)
    # convert training data to feature matrix
    featmat_train, featurenames = features2mat(docfeats, trainids)
    y_train = [doccats[tid] for tid in trainids]
    # fit classifier
    print("training classifier")
    clf.fit(featmat_train, y_train)
    del featmat_train
    # make test featmat and label vector
    print("making predictions")
    featmat_test, featurenames = features2mat(docfeats, visids, featurenames)
    # get actual classification results for all test samples
    predictions = clf.decision_function(featmat_test)
    predictions_labels = clf.predict(featmat_test)
    y_true, y_pred = [doccats[tid] for tid in visids], list(predictions_labels)
    # report classification accuracy
    if len(clf.classes_) > 2:
        f1_micro = skmet.f1_score(y_true, y_pred, average='micro')
        f1_macro = skmet.f1_score(y_true, y_pred, average='macro')
        print("F1 micro-avg: %.3f, F1 macro-avg: %.3f" % (f1_micro, f1_macro))
    print("Accuracy: %.3f" % skmet.accuracy_score(y_true, y_pred))
    # create the visualizations
    print("creating the visualization for %i test examples" % len(visids))
    # collect all the accumulated scores to later create a wordcloud
    scores_collected = np.zeros((len(featurenames), len(clf.classes_)))
    # run through all test documents
    for i, tid in enumerate(visids):
        if not i % 100:
            print("progress: at %i of %i test examples" % (i, len(visids)))
        # transform the feature vector into a diagonal matrix
        feat_vec = lil_matrix((len(featurenames), len(featurenames)), dtype=float)
        feat_vec.setdiag(featmat_test[i, :].toarray().flatten())
        feat_vec = csr_matrix(feat_vec)
        # get the scores (i.e. before summing up)
        scores = clf.decision_function(feat_vec)
        # adapt for the intercept
        scores -= (1. - 1. / len(featurenames)) * clf.intercept_
        # when creating the html visualization we want the words speaking for
        # the prediction, but when creating the word cloud, we want the words
        # speaking for the actual class
        metainf = tid + '\n'
        # binary or multi class?
        if len(scores.shape) == 1:
            if clf.classes_[0] == predictions_labels[i]:
                # we want the scores which speak for the class - for the
                # negative class, the sign needs to be reversed
                scores *= -1.
            scores_dict = dict(zip(featurenames, scores))
            metainf += 'True Class: %s\n' % doccats[tid]
            metainf += 'Predicted Class: %s (Score: %.4f)' % (predictions_labels[i], predictions[i])
            scores_collected[:, clf.classes_ == doccats[tid]] += np.array([scores]).T
        else:
            scores_dict = dict(zip(featurenames, scores[:, clf.classes_ == predictions_labels[i]][:, 0]))
            metainf += 'True Class: %s (Score: %.4f)\n' % (doccats[tid], predictions[i, clf.classes_ == doccats[tid]][0])
            metainf += 'Predicted Class: %s (Score: %.4f)' % (predictions_labels[i], predictions[i, clf.classes_ == predictions_labels[i]][0])
            scores_collected[:, clf.classes_ == doccats[tid]] += scores[:, clf.classes_ == doccats[tid]]
        # use the vector with scores together with the corresponding feature
        # names and the original text to create the pretty visualization
        if create_html:
            if y_true[i] == y_pred[i]:
                name = 'correct_'
            else:
                name = 'error_'
            name += tid + '_' + doccats[tid]
            scores2html(textdict[tid], scores_dict,
                        os.path.join(subdir_html, name.replace(' ', '_').replace('/', '_')),
                        metainf)
    print("creating word clouds")
    # normalize the scores for each class
    scores_collected /= np.max(np.abs(scores_collected), axis=0)
    # transform the collected scores into a dictionary and create word clouds
    scores_collected_dict = {cat: dict(zip(featurenames, scores_collected[:, clf.classes_ == cat][:, 0]))
                             for cat in clf.classes_}
    for cat in scores_collected_dict:
        create_wordcloud(scores_collected_dict[cat],
                         os.path.join(subdir_wc, "%s.png" % cat),
                         maskfiles[cat] if cat in maskfiles else None)
    return scores_collected_dict
searchcv_svc.fit(F, y)
print('the best parameters for SVC classifier using RandomizedSearchCV are '
      + str(searchcv_svc.best_params_))

# Apply grid search for the logreg classifier.
# Create hyperparameter options. (liblinear is the solver that supports
# both l1 and l2 penalties.)
hyperparams_grid = {
    "C": [1e-5, 1e-3, 1e-1, 1],
    "fit_intercept": [True, False],
    "penalty": ["l1", "l2"]
}
grid_logreg = GridSearchCV(logreg(solver='liblinear'), hyperparams_grid, cv=5)
grid_logreg.fit(F, y)
print('the best parameters for logreg classifier using GridSearchCV are '
      + str(grid_logreg.best_params_))

hyperparams_dist = {
    "C": stats.beta(1, 3),
    "fit_intercept": [True, False],
    "penalty": ["l1", "l2"]
}
searchcv_logreg = RandomizedSearchCV(logreg(solver='liblinear'), hyperparams_dist, n_iter=20,
# The fold predictions are separate arrays, so we concatenate them on
# axis 0 (the only axis).
predictions = np.concatenate(predictions, axis=0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
print(predictions)
print(sum(predictions == titanic["Survived"]))
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)  # = 0.783

# ------------------- Logistic Regression method ---------------------
# Initialize the algorithm
algo_logreg = logreg(random_state=1)
# Compute the accuracy score across all cross-validation folds:
# cross_val_score(algo, predictors, target, cv=number of folds).
# (Note: sklearn.cross_validation is deprecated; recent versions provide
# cross_val_score in sklearn.model_selection.)
scores = cross_validation.cross_val_score(algo_logreg, titanic[predictors],
                                          titanic["Survived"], cv=3)
# Mean of the scores across the 3 folds
print(scores.mean())

# ----------------------- Log. Reg. with test set -----------------------
titanic_test = pd.read_csv("test.csv")

# I) Clean data
def compute_noise_rates_and_cv_pred_proba(
    X,
    s,
    clf=logreg(),
    cv_n_folds=3,
    positive_lb_threshold=None,
    negative_ub_threshold=None,
    verbose=False,
):
    '''This function computes the out-of-sample predicted probability
    P(s=1|x) for every example x in X using cross validation, while also
    computing the confident counts noise rates within each cross-validated
    subset and returning the average noise rate across all examples.

    This function estimates rh1 (the fraction of pos examples mislabeled as
    neg, frac_pos2neg) and rh0 (the fraction of neg examples mislabeled as
    pos, frac_neg2pos).

    The acronym 'rh' stands for rho hat, where rho is a greek symbol for
    noise rate and hat tells us that the value is estimated, not
    necessarily exact.

    Under certain conditions, estimates are exact, and in most conditions,
    estimates are within one percent of the actual noise rates.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array

    s : np.array
        A binary vector of labels, s, which may contain mislabeling

    clf : sklearn.classifier or equivalent
        Default classifier used is logistic regression. Assumes clf
        has predict_proba() and fit() defined.

    cv_n_folds : int
        The number of cross-validation folds used to compute
        out-of-sample probabilities for each example in X.

    positive_lb_threshold : float
        P(s^=1|s=1). If an example has a predicted probability "greater" than
        this threshold, it is counted as having hidden label y = 1. This is
        not used for pruning, only for estimating the noise rates using
        confident counts. This value should be between 0 and 1. Default is None.

    negative_ub_threshold : float
        P(s^=1|s=0). If an example has a predicted probability "lower" than
        this threshold, it is counted as having hidden label y = 0. This is
        not used for pruning, only for estimating the noise rates using
        confident counts. This value should be between 0 and 1. Default is None.

    verbose : bool
        Set to True if you wish to print additional information while running.
    '''

    # Create cross-validation object for out-of-sample predicted probabilities.
    # CV folds preserve the fraction of noisy positive and
    # noisy negative examples in each class.
    kf = StratifiedKFold(n_splits=cv_n_folds, shuffle=True)

    # Initialize result storage and the final prob_s_eq_1 array.
    rh1_per_cv_fold = []
    rh0_per_cv_fold = []
    prob_s_eq_1 = np.zeros(np.shape(s))

    # Split X and s into "cv_n_folds" stratified folds.
    for k, (cv_train_idx, cv_holdout_idx) in enumerate(kf.split(X, s)):
        # Select the training and holdout cross-validated sets.
        X_train_cv, X_holdout_cv = X[cv_train_idx], X[cv_holdout_idx]
        s_train_cv, s_holdout_cv = s[cv_train_idx], s[cv_holdout_idx]

        # Fit the clf classifier to the training set,
        # predict on the holdout set, and update prob_s_eq_1.
        clf.fit(X_train_cv, s_train_cv)
        prob_s_eq_1_cv = clf.predict_proba(X_holdout_cv)[:, 1]  # P(s = 1|x)
        prob_s_eq_1[cv_holdout_idx] = prob_s_eq_1_cv

        # Compute and append the confident counts noise estimators
        # to estimate the positive and negative mislabeling rates.
        rh1_cv, rh0_cv = compute_conf_counts_noise_rates_from_probabilities(
            s=s_holdout_cv,
            prob_s_eq_1=prob_s_eq_1_cv,
            positive_lb_threshold=positive_lb_threshold,
            negative_ub_threshold=negative_ub_threshold,
            verbose=verbose,
        )
        rh1_per_cv_fold.append(rh1_cv)
        rh0_per_cv_fold.append(rh0_cv)

    # Return mean rh, omitting nan or inf values, and prob_s_eq_1.
    return (
        _mean_without_nan_inf(rh1_per_cv_fold),
        _mean_without_nan_inf(rh0_per_cv_fold),
        prob_s_eq_1,
    )
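# A hedged usage sketch for the function above, on synthetic binary
# placeholder data.
from sklearn.datasets import make_classification

X, s = make_classification(n_samples=300, n_features=4, random_state=0)
rh1_est, rh0_est, prob_s_eq_1 = compute_noise_rates_and_cv_pred_proba(X, s)
print(rh1_est, rh0_est, prob_s_eq_1.shape)  # two scalars and an (N,) array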
from cleanlab.classification import LearningWithNoisyLabels
from cleanlab.noise_generation import generate_noisy_labels
from cleanlab.util import value_counts
from cleanlab.latent_algebra import compute_inv_noise_matrix


# ## **rankpruning** is the first practical *(works for any classifier, runs fast, robust to poor probability estimation)* algorithm for multiclass learning with noisy labels. It is composed of components from the theory and algorithms of **confident learning**. It's a Python class that wraps around any classifier as long as .fit(X, y, sample_weight), .predict(X), and .predict_proba(X) are defined. Inspect the **cleanlab** package for documentation.
#
# ## Here we show the performance of multiclass rankpruning wrapped around a sklearn LogisticRegression classifier versus LogisticRegression without any help from confident learning on the Iris dataset.

# In[16]:

# Seed for reproducibility
seed = 2
rp = LearningWithNoisyLabels(clf=logreg(), seed=seed)
np.random.seed(seed=seed)

# Get the iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
    from matplotlib import pyplot as plt
    _ = plt.figure(figsize=(12, 8))
    color_list = plt.cm.tab10(np.linspace(0, 1, 6))
    _ = plt.scatter(X_train[:, 1], X_train[:, 3],
                    color=[color_list[z] for z in y_train], s=50)
        hist = np.histogram(lbp, bins=range(257))[0]
        F.append(hist)
    return np.array(F)


X, y = load_data("GTSRB_subset")
F = extract_lbp_features(X)
# F = Normalizer().fit(F)
F = scale(F)
X_train, X_test, y_train, y_test = train_test_split(F, y, test_size=0.2)

names = ["LogisticRegression", "SVC"]
# liblinear supports both l1 and l2 penalties for logistic regression.
# Note: SVC has no `penalty` parameter, so assigning it below has no
# effect on that model.
classifiers = [logreg(solver='liblinear'), SVC()]
C_range = 10.0 ** np.arange(-5, 0)
for name, clf in zip(names, classifiers):
    for C in C_range:
        for penalty in ["l1", "l2"]:
            clf.C = C
            clf.penalty = penalty
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            score = accuracy_score(y_test, y_pred)
            print('The score of ' + name + ' for C = %.2e and penalty = %s is %.3f'
                  % (C, penalty, score))
dot_data = tree.export_graphviz(dt, out_file=None)
graph = graphviz.Source(dot_data)
predictors = X_train.columns
# print(predictors)
dot_data = tree.export_graphviz(dt, out_file=None,
                                feature_names=predictors,
                                class_names=('Negative', 'Positive'),
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph

""" Model testing """
selected_attributes = [
    'kw_max_avg', 'data_channel_is_world',
    'data_channel_is_entertainment', 'LDA_03'
]
X = news_pop[selected_attributes]
# .ravel() gives the 1-d label vector sklearn expects (a (-1, 1) column
# vector would trigger a DataConversionWarning).
y = news_pop['shares_bin'].values.ravel()

mylr = logreg()
mylr.fit(X, y)
model_summary = func.ModelSummary(mylr, X, y)
model_summary.get_summary()
import prepod.lib.io as io
import prepod.lib.prep as prep
import prepod.lib.models as mdl

path_data = '/Users/jannes/Projects/delir/data/'
path_labels = path_data + 'info/sudocu_info/subject_data.csv'
path_out = '/Users/jannes/Projects/delir/results/test_{}.csv'
target = 'delir_60min'

data = io.parse_subj_info(path_labels, 'Sudocu')
data = prep.drop_non_feature_cols(data, target)
data = prep.drop_if_too_many_nans(data, .25)
features = list(data.drop(['subj_id', 'delir_60min', 'age'], axis=1))
data = prep.drop_na(data, features)
data = prep.to_fv(data, features, target)

clfs = [svm.SVC(gamma='scale'), logreg(solver='liblinear')]
for clf in clfs:
    res = mdl.backward_subset_selection(data['X'], data['y'], data['X_labels'],
                                        data['y_labels'], K=1, clf=clf)
    res = pd.DataFrame(res).sort_values(by='mean_acc', ascending=False)
    res = res[res['n_features'] < 10]
    res['target'] = target
    with open(path_out.format('bss'), 'a') as f:
        res.to_csv(f, index=False)
    res = mdl.forward_subset_selection(data, K=len(features), init_combos=2,
                                       clf=clf)
    res = pd.DataFrame(res).sort_values(by='mean_acc', ascending=False)
    res = res[res['n_features'] < 10]
    res['target'] = target
    with open(path_out.format('fss'), 'a') as f:
        res.to_csv(f, header=False, index=False)
print(y)


# In[12]:

from sklearn.model_selection import train_test_split as tts

x_train, x_test, y_train, y_test = tts(x, y, test_size=0.25, random_state=42)


# In[13]:

from sklearn.linear_model import LogisticRegression as logreg

model_logreg = logreg()
model_logreg.fit(x_train, y_train)

# ----> Logistic regression is a statistical model that in its basic form
# uses a logistic function to model a binary dependent variable, although
# many more complex extensions exist. In regression analysis, logistic
# regression (or logit regression) estimates the parameters of a logistic
# model (a form of binary regression).
#
# ----> I am using logistic regression because this is a classification problem.

# In[14]:

y_p = model_logreg.predict(x_test)


# In[15]:
import numpy as np
from sklearn.datasets import load_digits

digits = load_digits()
data = digits['data']
target = digits['target']

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as logreg
from sklearn.multiclass import OneVsRestClassifier

# l1 penalty requires the liblinear (or saga) solver in recent sklearn.
logregl1 = logreg(penalty='l1', solver='liblinear')
train_data = data[:1000]
train_target = target[:1000]
test_data = data[1000:]
test_target = target[1000:]
logregl1.fit(train_data, train_target)
# First prediction, used for display purposes (unrelated to the
# cross-validation below).
prediction = logregl1.predict(test_data)

# sklearn.cross_validation is deprecated; recent versions provide
# cross_val_score in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
score = cross_val_score(logregl1, data, target, cv=5)

clfs = [OneVsRestClassifier(logreg(penalty='l1', C=alpha, solver='liblinear'))
        for alpha in np.logspace(-5, 1, 10)]
scores = [cross_val_score(clf, data, target, cv=5) for clf in clfs]