Example #1
def main():
    # set features here
    features = ['pclass', 'sex', 'age', 'parch', 'sibsp']
    # Set True to remove entries with no age data from the training
    # set, False otherwise.
    remove_ageless = False
    
    h, train = load('train.csv')

    if remove_ageless:
        train = train[train[:,h.index('Age')]!='']
        
    raw_x, raw_t = split_targets(train, h.index('Survived'))
    
    h, test = load('test.csv')
    # get array of keys; won't include 'survived'
    keys = [key.lower() for key in h]

    X = preprocess(raw_x, keys, features, bin_age = True, bin_pclass = True)
    T = raw_t.reshape(raw_t.shape[0], 1).astype(float)  # np.float was removed from NumPy; use the builtin
    
    lr = logreg()
    lr.fit(X[:,1:],T)
    print('Coef: ' + str(lr.coef_))
    
    result = lr.predict(preprocess(test, keys, features, bin_age = True, bin_pclass = True)[:,1:])
    
    save_result(test[:,0], result.astype(int), 'predict.csv')
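This example leans on project-local helpers (load, split_targets, preprocess, save_result) that are not shown. A minimal sketch of what load and split_targets presumably look like, inferred from the call sites above (hypothetical, not the original project's code):

import csv
import numpy as np

def load(path):
    # Hypothetical helper: returns (header, data), where header is the list
    # of column names and data is a 2D object array of the remaining rows.
    with open(path, newline='') as f:
        rows = list(csv.reader(f))
    return rows[0], np.array(rows[1:], dtype=object)

def split_targets(data, target_col):
    # Hypothetical helper: separates the feature columns from the target column.
    x = np.delete(data, target_col, axis=1)
    t = data[:, target_col]
    return x, t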
Example #2
    def __init__(
        self,
        clf = None,
        seed = None,
        # Hyper-parameters (used by .fit() function)
        cv_n_folds = 5,
        prune_method = 'prune_by_noise_rate',
        converge_latent_estimates = False,
        pulearning = None,
    ):

        if clf is None:
            # Use logistic regression if no classifier is provided.
            clf = logreg(multi_class='auto', solver='lbfgs')

        # Make sure the passed in classifier has the appropriate methods defined.
        if not hasattr(clf, "fit"):
            raise ValueError('The classifier (clf) must define a .fit() method.')
        if not hasattr(clf, "predict_proba"):
            raise ValueError('The classifier (clf) must define a .predict_proba() method.')
        if not hasattr(clf, "predict"):
            raise ValueError('The classifier (clf) must define a .predict() method.')

        if seed is not None:
            np.random.seed(seed = seed)
        
        self.clf = clf
        self.seed = seed
        self.cv_n_folds = cv_n_folds
        self.prune_method = prune_method
        self.converge_latent_estimates = converge_latent_estimates
        self.pulearning = pulearning
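Because the constructor only duck-types clf, any scikit-learn-compatible estimator can be dropped in. A brief usage sketch (the class name LearningWithNoisyLabels is borrowed from Example #29; adjust if yours differs):

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# Any estimator with .fit(), .predict() and .predict_proba() is accepted.
lnl = LearningWithNoisyLabels(clf=RandomForestClassifier(), seed=0)

# An estimator without .predict_proba() is rejected up front.
try:
    LearningWithNoisyLabels(clf=LinearSVC())
except ValueError as err:
    print(err)  # The classifier (clf) must define a .predict_proba() method.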
Example #3
def estimate_noise_matrices(
    X,
    s,
    clf=logreg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    thresholds=None,
    converge_latent_estimates=True,
    seed=None,
):
    '''Estimates the noise_matrix of shape (K, K). This is the
    fraction of examples in every class, labeled as every other class. The
    noise_matrix is a conditional probability matrix for P(s=k_s|y=k_y).

    Under certain conditions, estimates are exact, and in most
    conditions, estimates are within one percent of the actual noise rates.

    Parameters
    ----------
    X : np.array
      Input feature matrix (N, D), 2D numpy array

    s : np.array
      A discrete vector of labels, s, which may contain mislabeling

    clf : sklearn.classifier or equivalent
      Default classifier used is logistic regression. Assumes clf
      has predict_proba() and fit() defined.

    cv_n_folds : int
      The number of cross-validation folds used to compute
      out-of-sample probabilities for each example in X.

    thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
      P(s^=k|s=k). If an example has a predicted probability "greater" than
      this threshold, it is counted as having hidden label y = k. This is
      not used for pruning, only for estimating the noise rates using
      confident counts. This value should be between 0 and 1. Default is None.

    converge_latent_estimates : bool
      If true, forces numerical consistency of estimates. Each is estimated
      independently, but they are related mathematically with closed form
      equivalences. This will iteratively make them mathematically consistent.

    seed : int (default = None)
        Seed for the random number generator used to split
        the cross-validated folds. If None, the current np.random state is used.

    Returns
    -------
        A two-item tuple containing (noise_matrix, inv_noise_matrix).'''

    return estimate_py_noise_matrices_and_cv_pred_proba(
        X=X,
        s=s,
        clf=clf,
        cv_n_folds=cv_n_folds,
        thresholds=thresholds,
        converge_latent_estimates=converge_latent_estimates,
        seed=seed,
    )[1:-2]
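A usage sketch for the function above on synthetic noisy labels (it assumes estimate_noise_matrices is importable from wherever this module lives, e.g. a cleanlab-style latent_estimation module):

import numpy as np
from sklearn.datasets import make_classification

# Toy data: 3 classes, then flip roughly 10% of the labels at random.
X, y = make_classification(n_samples=600, n_features=10, n_informative=4,
                           n_classes=3, random_state=0)
s = y.copy()
flip = np.random.RandomState(0).rand(len(s)) < 0.1
s[flip] = np.random.RandomState(1).randint(0, 3, flip.sum())

noise_matrix, inv_noise_matrix = estimate_noise_matrices(X, s, cv_n_folds=5)
print(noise_matrix)  # column j approximates P(s=i | y=j)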
Example #4
    def __init__(
        self,
        clf=None,
        e1=None,
    ):

        self.clf = logreg() if clf is None else clf
        self.e1 = e1
Example #5
    def __init__(self, frac_pos2neg, frac_neg2pos, clf=None):

        if frac_pos2neg is not None and frac_neg2pos is not None:
            # Verify that rh1 + rh0 < 1 and pi0 + pi1 < 1.
            if frac_pos2neg + frac_neg2pos >= 1:
                raise Exception(
                    "frac_pos2neg + frac_neg2pos < 1 is a necessary "
                    "condition for noisy PN (binary) classification."
                )

        self.rh1 = frac_pos2neg
        self.rh0 = frac_neg2pos
        self.clf = logreg() if clf is None else clf
Example #6
    def __init__(
        self,
        frac_pos2neg=None,
        frac_neg2pos=None,
        clf=None,
    ):
        if frac_pos2neg is not None and frac_neg2pos is not None:
            # Verify that rh1 + rh0 < 1 and pi0 + pi1 < 1.
            if frac_pos2neg + frac_neg2pos >= 1:
                raise Exception(
                    "frac_pos2neg + frac_neg2pos < 1 is "
                    "a necessary condition for Rank Pruning.")

        self.rh1 = frac_pos2neg
        self.rh0 = frac_neg2pos
        self.clf = logreg() if clf is None else clf
Example #7
    def train_clf(self, trainfiles):
        # tokens: list of words, labels: list of corresponding labels.
        # Go document by document because of local context.
        final_labels = []
        featmat = []
        for trainfile in trainfiles:
            for tokens, labels in yield_tokens_labels(trainfile):
                final_labels.extend(labels)
                featmat.append(self.make_featmat_rep(tokens))
        featmat = np.vstack(featmat)
        print("training classifier")
        clf = logreg(class_weight='balanced', random_state=1)
        clf.fit(featmat, final_labels)
        self.clf = clf
Example #8
def estimate_cv_predicted_probabilities(
    X,
    labels,  # class labels can be noisy (s) or not noisy (y).
    clf=logreg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    seed=None,
):
    '''This function computes the out-of-sample predicted
    probability [P(s=k|x)] for every example in X using cross
    validation. Output is a np.array of shape (N, K) where N is
    the number of training examples and K is the number of classes.

    Parameters
    ----------

    X : np.array
      Input feature matrix (N, D), 2D numpy array

    labels : np.array or list of ints from [0,1,..,K-1]
      A discrete vector of class labels which may or may not contain mislabeling

    clf : sklearn.classifier or equivalent
      Default classifier used is logistic regression. Assumes clf
      has predict_proba() and fit() defined.

    cv_n_folds : int
      The number of cross-validation folds used to compute
      out-of-sample probabilities for each example in X.

    seed : int (default = None)
        Seed for the random number generator used to split
        the cross-validated folds. If None, the current np.random state is used.

    Returns
    --------
    psx : np.array (shape (N, K))
        P(s=k|x) is a matrix with K (noisy) probabilities for each of the N examples x.
        This is the probability distribution over all K classes, for each
        example, regarding whether the example has label s==k P(s=k|x). psx should
        have been computed using 3 (or higher) fold cross-validation.'''

    return estimate_py_noise_matrices_and_cv_pred_proba(
        X=X,
        s=labels,
        clf=clf,
        cv_n_folds=cv_n_folds,
        seed=seed,
    )[-1]
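The returned matrix can be sanity-checked directly; a short sketch, with X and labels as described in the docstring:

import numpy as np

psx = estimate_cv_predicted_probabilities(X, labels, cv_n_folds=5, seed=0)
assert psx.shape == (len(labels), len(np.unique(labels)))
assert np.allclose(psx.sum(axis=1), 1)  # each row is a distribution over the K classes
print(psx[:3].round(3))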
Example #9
    def __init__(
        self,
        clf=None,
        seed=None,
        # Hyper-parameters (used by .fit() function)
        cv_n_folds=5,
        prune_method='prune_by_noise_rate',
        converge_latent_estimates=False,
        pulearning=None,
        n_jobs=None,
    ):

        if clf is None:
            # Use logistic regression if no classifier is provided.
            clf = logreg(multi_class='auto', solver='lbfgs')

        # Make sure the passed in classifier has the appropriate methods defined.
        if not hasattr(clf, "fit"):
            raise ValueError(
                'The classifier (clf) must define a .fit() method.')
        if not hasattr(clf, "predict_proba"):
            raise ValueError(
                'The classifier (clf) must define a .predict_proba() method.')
        if not hasattr(clf, "predict"):
            raise ValueError(
                'The classifier (clf) must define a .predict() method.')

        if seed is not None:
            np.random.seed(seed=seed)

        # Set up the number of multiprocessing threads used by get_noise_indices().
        if n_jobs is None:
            if os.name == 'nt':  # Windows Python users
                n_jobs = 1  # Windows has multiprocessing issues so we use 1 job.
            else:  # Mac and Linux Python users
                n_jobs = multiprocessing.cpu_count()
        else:
            assert (n_jobs >= 1)

        self.clf = clf
        self.seed = seed
        self.cv_n_folds = cv_n_folds
        self.prune_method = prune_method
        self.converge_latent_estimates = converge_latent_estimates
        self.pulearning = pulearning
        self.n_jobs = n_jobs
Example #10
def compute_cv_predicted_probabilities(
    X,
    y,  # labels, can be noisy (s) or not noisy (y).
    clf=logreg(),
    cv_n_folds=3,
    verbose=False,
):
    '''This function computes the out-of-sample predicted
    probability [P(s=k|x)] for every example in X using cross
    validation. Output is a np.array of shape (N, K) where N is
    the number of training examples and K is the number of classes.

    Parameters
    ----------
    X : np.array
      Input feature matrix (N, D), 2D numpy array

    y : np.array
      A binary vector of labels, y, which may or may not contain mislabeling

    clf : sklearn.classifier or equivalent
      Default classifier used is logistic regression. Assumes clf
      has predict_proba() and fit() defined.

    cv_n_folds : int
      The number of cross-validation folds used to compute
      out-of-sample probabilities for each example in X.

    verbose : bool
      Set to True if you wish to print additional information while running.
    '''

    return compute_noise_rates_and_cv_pred_proba(
        X=X,
        s=y,
        clf=clf,
        cv_n_folds=cv_n_folds,
        verbose=verbose,
    )[-1]
Example #11
def train(X_train, y_train, random_state=0):
    regressor = logreg(random_state=random_state)
    regressor.fit(X_train, y_train)
    return regressor
Example #12
    if wear:
        path += 'wear_'
    X = np.load(path + endpath)
    return X, Y
    

if __name__ == '__main__':  
    X, Y = getdata(wear=True, base=True, cap=False)
    X[np.isinf(X)] = 0.0
    cls = CLASS_TO_BE_TESTED
    
    Y = (Y == cls)*1.0
    
    iterations = ITERATIONS
    models = {LinearSVC(): 'Linear SVM', SVC(kernel='sigmoid'): 'Sigmoid SVM', RFC(): 'Random Forest', 
              logreg(): 'Logistic Regression', dtree(): 'Decision Tree', newmodel(): 'True Random',
              newmodel(a=0): 'Always Zero', newmodel(a=1): 'Always One', KNN(n_neighbors=3): 'KNN'}
    
    best_model = None
    best_acc = 0
    for model in models:
        print('Testing Model - ', models[model])
        avg_acc = 0
        avg_fp = 0.0
        for it in range(0, iterations):
            # The commented code below is what I used to counter imbalanced
            # data classes; keeping it here for reference. You will probably
            # need to account for class imbalance when training the ensemble.
#            yidxf = np.arange(0, len(Y))[Y==0]
#            yidxt = np.arange(0, len(Y))[Y==1]
#            
Example #13
def estimate_confident_joint_and_cv_pred_proba(
    X,
    s,
    clf=logreg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    thresholds=None,
    seed=None,
    calibrate=True,
):
    '''Estimates P(s,y), the confident counts of the latent
    joint distribution of true and noisy labels
    using observed s and predicted probabilities psx.

    The output of this function is a numpy array of shape (K, K).

    Under certain conditions, estimates are exact, and in many
    conditions, estimates are within one percent of actual.

    Notes: There are two ways to compute the confident joint, each with pros and cons.
    1. For each holdout set, compute the confident joint on that fold, then sum the folds.
    2. Pool all the out-of-sample pred_proba, then compute the confident joint once on everything.
    (1) computes the appropriate thresholds for each fold, which is more accurate in general.
    (2) is more accurate when you have only a little data, because it computes
    the confident joint using all the probabilities. For example, if you had only 100
    examples, with 5-fold cross validation and uniform p(y), you would only have 20
    examples to compute each confident joint for (1). Such small amounts of data
    are bound to result in estimation errors. For this reason, we implement (2),
    but we include (1) as a commented-out function at the end of this file.

    Parameters
    ----------
    X : np.array
      Input feature matrix (N, D), 2D numpy array

    s : np.array
      A discrete vector of labels, s, which may contain mislabeling. "s" denotes
      the noisy label instead of \tilde(y), for ASCII encoding reasons.

    clf : sklearn.classifier or equivalent
      Default classifier used is logistic regression. Assumes clf
      has predict_proba() and fit() defined.

    cv_n_folds : int
      The number of cross-validation folds used to compute
      out-of-sample probabilities for each example in X.

    thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
      P(s^=k|s=k). If an example has a predicted probability "greater" than
      this threshold, it is counted as having hidden label y = k. This is
      not used for pruning, only for estimating the noise rates using
      confident counts. This value should be between 0 and 1. Default is None.

    seed : int (default = None)
        Seed for the random number generator used to split
        the cross-validated folds. If None, the current np.random state is used.

    calibrate : bool (default = True)
        Calibrates the confident joint estimate P(s=i, y=j) such that
        np.sum(cj) == len(s) and np.sum(cj, axis=1) == np.bincount(s).

    Returns
    -------
      A tuple of two numpy arrays in the form:
      (joint counts matrix, predicted probability matrix)'''

    assert_inputs_are_valid(X, s)
    # Number of classes
    K = len(np.unique(s))
    # 'ps' is p(s=k)
    ps = value_counts(s) / float(len(s))

    # Ensure labels are of type np.array()
    s = np.asarray(s)

    # Create cross-validation object for out-of-sample predicted probabilities.
    # CV folds preserve the fraction of noisy positive and
    # noisy negative examples in each class.
    kf = StratifiedKFold(n_splits=cv_n_folds, shuffle=True, random_state=seed)

    # Initialize psx array
    psx = np.zeros((len(s), K))

    # Split X and s into "cv_n_folds" stratified folds.
    for k, (cv_train_idx, cv_holdout_idx) in enumerate(kf.split(X, s)):

        clf_copy = copy.deepcopy(clf)

        # Select the training and holdout cross-validated sets.
        X_train_cv, X_holdout_cv = X[cv_train_idx], X[cv_holdout_idx]
        s_train_cv, s_holdout_cv = s[cv_train_idx], s[cv_holdout_idx]

        # Fit the clf classifier to the training set and
        # predict on the holdout set and update psx.
        clf_copy.fit(X_train_cv, s_train_cv)
        psx_cv = clf_copy.predict_proba(X_holdout_cv)  # P(s = k|x) # [:,1]
        psx[cv_holdout_idx] = psx_cv

    # Compute the confident counts of all pairwise label-flipping mislabeling rates.
    confident_joint = compute_confident_joint(
        s=s,
        psx=psx,  # P(s = k|x)
        thresholds=thresholds,
        calibrate=calibrate,
    )

    return confident_joint, psx
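A sketch verifying the calibration property documented above (X and s as in the docstring):

import numpy as np

confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(
    X, s, cv_n_folds=5, seed=0, calibrate=True,
)
# With calibrate=True, the joint counts agree with the observed label counts.
assert np.isclose(confident_joint.sum(), len(s))
assert np.allclose(confident_joint.sum(axis=1), np.bincount(s))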
Example #14
def estimate_py_noise_matrices_and_cv_pred_proba(
    X,
    s,
    clf=logreg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    thresholds=None,
    converge_latent_estimates=False,
    py_method='cnt',
    seed=None,
):
    '''This function computes the out-of-sample predicted
    probability P(s=k|x) for every example x in X using cross
    validation while also computing the confident counts noise
    rates within each cross-validated subset and returning
    the average noise rate across all examples.

    This function estimates the noise_matrix of shape (K, K). This is the
    fraction of examples in every class, labeled as every other class. The
    noise_matrix is a conditional probability matrix for P(s=k_s|y=k_y).

    Under certain conditions, estimates are exact, and in most
    conditions, estimates are within one percent of the actual noise rates.

    Parameters
    ----------
    X : np.array
      Input feature matrix (N, D), 2D numpy array

    s : np.array
      A discrete vector of labels, s, which may contain mislabeling. "s" denotes
      the noisy label instead of \tilde(y), for ASCII encoding reasons.

    clf : sklearn.classifier or equivalent
      Default classifier used is logistic regression. Assumes clf
      has predict_proba() and fit() defined.

    cv_n_folds : int
      The number of cross-validation folds used to compute
      out-of-sample probabilities for each example in X.

    thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
      P(s^=k|s=k). If an example has a predicted probability "greater" than
      this threshold, it is counted as having hidden label y = k. This is
      not used for pruning, only for estimating the noise rates using
      confident counts. This value should be between 0 and 1. Default is None.

    converge_latent_estimates : bool
      If true, forces numerical consistency of estimates. Each is estimated
      independently, but they are related mathematically with closed form
      equivalences. This will iteratively make them mathematically consistent.

    py_method : str
        How to compute the latent prior p(y=k). Default is "cnt" as it tends to
        work best, but you may also set this hyperparameter to "eqn" or "marginal".

    seed : int (default = None)
        Seed for the random number generator used to split
        the cross-validated folds. If None, the current np.random state is used.

    Returns
    -------
      A tuple of five numpy arrays in the form:
      (py, noise_matrix, inverse_noise_matrix,
      joint count matrix i.e. confident joint, predicted probability matrix)'''

    confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(
        X=X,
        s=s,
        clf=clf,
        cv_n_folds=cv_n_folds,
        thresholds=thresholds,
        seed=seed,
    )

    py, noise_matrix, inv_noise_matrix = estimate_latent(
        confident_joint=confident_joint,
        s=s,
        py_method=py_method,
        converge_latent_estimates=converge_latent_estimates,
    )

    return py, noise_matrix, inv_noise_matrix, confident_joint, psx
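The five return values unpack directly; a brief sketch (X and s as in the docstring):

py, noise_matrix, inv_noise_matrix, confident_joint, psx = \
    estimate_py_noise_matrices_and_cv_pred_proba(X, s, seed=0)

# py[k] estimates the latent prior p(y=k); each column of noise_matrix
# estimates P(s=i | y=j), so the columns should each sum to roughly 1.
print(py)
print(noise_matrix.sum(axis=0))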
Example #15
'''
Load dataset from csv
'''
result_path = 'entitymodel/results/'
df = pd.read_csv('%sdataframe.csv' % result_path, sep=',', index_col=0)  # DataFrame.from_csv was removed from pandas
df = normalize_rating(df)
df = df.fillna(value=0)
print(df.head())
drop_columns = ['label', 'domain', 'url']
X = df.drop(drop_columns, axis=1)
y = df['label']

'''
Fit model
'''
model = logreg()
#model = SVC(probability=True)
#model = GaussianNB()
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
for train_index, test_index in sss.split(X, y):
    '''
    calculate url features based on training data of positive class
    '''
    #df = calc_url_features(df,train_index)

    #X_url = df.drop(drop_columns, axis=1)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    print('shape')
Example #16
# (snippet begins mid-way through what appears to be a plt.scatter call)
        facecolors='none',
        edgecolors='black',
        linewidth=2,
        alpha=0.5)
    _ = plt.title('Dataset after pruning detected label errors.', fontsize=30)
    plt.show()
except Exception:
    print("Plotting is only supported in an iPython interface.")

print('The actual, latent, underlying noise matrix.')
print_noise_matrix(noise_matrix)
print('Our estimate of the noise matrix.')
print_noise_matrix(est_noise_matrix)
print("Accuracy Comparison")
print("-------------------")
clf = logreg()
baseline_score = accuracy_score(y_test, clf.fit(X_train, s).predict(X_test))
print("Logistic regression:", baseline_score)
rp = LearningWithNoisyLabels(seed=seed)
rp_score = accuracy_score(y_test, rp.fit(X_train, s, psx=psx).predict(X_test))
print("Logistic regression (+rankpruning):", rp_score)
diff = rp_score - baseline_score
clf = logreg()
# If we fit on the pruned dataset without reweighting, performance is much worse.
print(
    'Fit on denoised data without re-weighting:',
    accuracy_score(
        y_test,
        clf.fit(X_train[~idx_errors], s[~idx_errors]).predict(X_test)))

try:
Example #17
# as well, we'll create a standardised version of the input set 
#    for comparison of performances
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)
# apply the transformation
X_std_train = sc.transform(X_train)
X_std_test = sc.transform(X_test)
      
#######################################################################         
# logistic regression
from sklearn.linear_model import LogisticRegression as logreg

mylogreg = logreg(solver='lbfgs') # all other params to default
mylogreg.fit(X_train, y_train)
pred = mylogreg.predict(X_test)
# note: as with Perceptron, the predictions are 0 or 1
error = (y_test != pred)
print('Misclass: ', error.sum())
print('Misclass rate: ', format(error.sum()/error.shape[0]*100, '4.2f'), '%')

# note that you can access the estimated probabilities
predprob = mylogreg.predict_proba(X_test)
print(predprob[1:10,:])   

intercept = -1*mylogreg.intercept_[0]/mylogreg.coef_[0][1]
slope = -1*mylogreg.coef_[0][0]/mylogreg.coef_[0][1]
print("with log reg: x2 = ",intercept," + ", slope,"x1")
Example #18
def C_score(C, X, y, X_test, y_test):
    
    m = logreg(C=C)
    m.fit(X, y)
    
    return score(y_test, m.decision_function(X_test))
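C_score pairs naturally with a sweep over the regularization strength. Note that score is an external scorer not shown in this example (presumably something like sklearn.metrics.roc_auc_score applied to decision-function values); a sketch, with X_train/y_train/X_test/y_test assumed defined:

import numpy as np

# Evaluate a log-spaced grid of C values and keep the best one.
C_grid = np.logspace(-3, 3, 7)
results = {C: C_score(C, X_train, y_train, X_test, y_test) for C in C_grid}
best_C = max(results, key=results.get)
print('best C: %g (score %.3f)' % (best_C, results[best_C]))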
Example #19
def compute_conf_counts_noise_rates(
    X,
    s,
    clf=logreg(),
    cv_n_folds=3,
    positive_lb_threshold=None,
    negative_ub_threshold=None,
    verbose=False,
):
    '''Computes the rho hat (rh) confident counts estimate of the
  noise rates from X and s.

  This function estimates rh1 (the fraction of pos examples mislabeled
  as neg, frac_pos2neg) and rh0 (the fraction of neg examples
  mislabeled as pos, frac_neg2pos).

  The acronym 'rh' stands for rho hat, where rho is the Greek letter for
  a noise rate and the hat tells us the value is estimated, not necessarily
  exact. Under certain conditions, estimates are exact, and in most
  conditions, estimates are within one percent of the actual noise rates.

  Parameters
  ----------
    X : np.array
      Input feature matrix (N, D), 2D numpy array

    s : np.array
      A binary vector of labels, s, which may contain mislabeling

    clf : sklearn.classifier or equivalent
      Default classifier used is logistic regression. Assumes clf
      has predict_proba() and fit() defined.

    cv_n_folds : int
      The number of cross-validation folds used to compute
      out-of-sample probabilities for each example in X.
      
    positive_lb_threshold : float 
      P(s^=1|s=1). If an example has a predicted probability "greater" than 
      this threshold, it is counted as having hidden label y = 1. This is 
      not used for pruning, only for estimating the noise rates using 
      confident counts. This value should be between 0 and 1. Default is None.
      
    negative_ub_threshold : float 
      P(s^=1|s=0). If an example has a predicted probability "lower" than
      this threshold, it is counted as having hidden label y = 0. This is
      not used for pruning, only for estimating the noise rates using
      confident counts. This value should be between 0 and 1. Default is None.

    verbose : bool
      Set to true if you wish to print additional information while running.
  '''

    return compute_noise_rates_and_cv_pred_proba(
        X=X,
        s=s,
        clf=clf,
        cv_n_folds=cv_n_folds,
        positive_lb_threshold=positive_lb_threshold,
        negative_ub_threshold=negative_ub_threshold,
        verbose=verbose,
    )[:-1]
Example #20
X = digits.data
y = digits.target

# train_test_split splits arrays or matrices into random train and test subsets.
# That means every time you run it without specifying random_state you will get
# a different result; this is expected behavior.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

names = [
    "Nearest Neighbors", "LinearDiscriminant", "Linear SVM",
    "LogisticRegression"
]

classifiers = [KNN(), LDA(), SVC(), logreg()]

for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print('The score of ' + name + ' classifier is ' + str(score))

#########---------------Q4---------------#########


def load_data(folder):
    """ 
    Load all images from subdirectories of
    'folder'. The subdirectory name indicates
    the class.
    """
Example #21
import numpy as np
import pylab as pl

from sklearn.datasets import load_digits
digits=load_digits()
data=digits['data']
target=digits['target']

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as logreg
from sklearn.multiclass import OneVsRestClassifier

import characters


logregl1 = logreg(penalty='l1', solver='liblinear')  # the l1 penalty needs a liblinear/saga solver in modern sklearn


# cross validation
#from sklearn.cross_validation import cross_val_score

#score = cross_val_score(logregl1, data, target, cv=5)

#clfs = [OneVsRestClassifier(logreg(penalty='l1', C=alpha)) for alpha in np.logspace(-5, 1, 10)]

#scores = [cross_val_score(clf, data, target, cv=5) for clf in clfs]

#scores_mean = np.array(scores).mean(axis=1) # calculation of the mean score for each value of alpha
# end of cross validation

Example #22
def gradient_log_loss(w, x, y):
    # Gradient of the log-loss with respect to w, summed over all examples.
    # (The def line was missing from the snippet; the name and signature
    # are taken from the call site below.)
    gradient = 0

    for n in range(x.shape[0]):
        num = -np.exp(-y[n] * np.dot(w, x[n])) * y[n] * x[n]
        den = 1 + np.exp(-y[n] * np.dot(w, x[n]))
        gradient += num / den
    return gradient


w = np.random.rand(2)

# set step size to a small positive value
step = .0001

clf = logreg()

weight_history = []
acc_history = []

for _ in range(100):
    # Apply the gradient descent rule.
    w = w - step * gradient_log_loss(w, X, y)

    # Print the current state.
    #print("Iteration %d: w = %s (log-loss = %.2f)" %
    #      (iteration, str(w), log_loss(w, x, y)))

    # Compute the accuracy:
    y_prob = 1 / (1 + np.exp(-np.dot(X, w)))
    # Threshold at 0.5 (results are 0 and 1)
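    # (Reconstruction: the original snippet cuts off here. The loop
    # presumably finished by thresholding y_prob at 0.5 and logging
    # the running accuracy and weights.)
    y_pred = (y_prob > 0.5).astype(int)
    acc_history.append(np.mean(y_pred == y))
    weight_history.append(w.copy())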
Example #23
    def __init__(self, clf=None):

        # Stores the classifier used.
        # Default classifier used is logistic regression
        self.clf = logreg() if clf is None else clf
Example #24
# results = clf.score(xtest, ytest)
print(dec)


# In[ ]:

# Logistic regression as an alternative to lasso? sklearn's feature selection
# modules say that lasso is for regression, while logistic regression and
# linear SVC are for classification.
from sklearn.linear_model import LogisticRegression as logreg
from sklearn.feature_selection import SelectFromModel as sfm


# In[ ]:


lr1 = logreg()
lr1.fit(xtrain,ytrain1)

print(lr1.fit(xtrain, ytrain1))
print(lr1.score(xtest, ytest1))
print(lr1.coef_)
model = sfm(lr1, prefit=True)
xnew = model.transform(xtrain)
print(xtrain.shape)
print(xnew.shape)
# lr2 = logreg()
# lr2.fit(xnew, ytrain1)
print(xtrain)
print(xnew)

Example #25
def visualize_clf(textdict, doccats, create_html=True, visids=[], subdir_html='', subdir_wc='', maskfiles={}, use_logreg=False):
    """
    visualize a text categorization dataset w.r.t. classification scores (create htmls with highlighted words and word clouds)

    Input:
        textdict: dict with {doc_id: text}
        doccats: dict with {doc_id: category}
        create_html: whether to create the html files with scores highlighted for individual documents (default: True)
        visids: a subset of docids for which the html visualization should be created (optional)
                (if create_html=True but visids=[], select up to 1000 random ids)
        subdir_html: subdirectory to save the created html files in (has to exist)
        subdir_wc: subdirectory to save the created word cloud images in (has to exist)
        maskfiles: dict with {category: path_to_maskfile} for creating the word clouds in a specific form
        use_logreg: default False; whether to use logistic regression instead of linear SVM
    Returns:
        relevant_words: dict with {category: {word: relevancy score}}
    """
    print("possibly selecting subset of 10000 examples")
    textdict, doccats, visids = select_subset(textdict, doccats, visids)
    # training examples are all but visids
    trainids = list(set(textdict.keys()).difference(set(visids)))
    # train a classifier and predict
    if use_logreg:
        renorm = 'max'
        clf = logreg(class_weight='balanced', random_state=1)
    else:
        renorm = 'length'
        clf = LinearSVC(C=10., class_weight='balanced', random_state=1)
    print("transforming text into features")
    # make features (we can use bigrams if we don't have to create htmls)
    ft = FeatureTransform(norm='max', weight=True, renorm=renorm, identify_bigrams=not create_html, norm_num=False)
    docfeats = ft.texts2features(textdict, fit_ids=trainids)
    # convert training data to feature matrix
    featmat_train, featurenames = features2mat(docfeats, trainids)
    y_train = [doccats[tid] for tid in trainids]
    # fit classifier
    print("training classifier")
    clf.fit(featmat_train, y_train)
    del featmat_train
    # make test featmat and label vector
    print("making predictions")
    featmat_test, featurenames = features2mat(docfeats, visids, featurenames)
    # get actual classification results for all test samples
    predictions = clf.decision_function(featmat_test)
    predictions_labels = clf.predict(featmat_test)
    y_true, y_pred = [doccats[tid] for tid in visids], list(predictions_labels)
    # report classification accuracy
    if len(clf.classes_) > 2:
        f1_micro, f1_macro = skmet.f1_score(y_true, y_pred, average='micro'), skmet.f1_score(y_true, y_pred, average='macro')
        print("F1 micro-avg: %.3f, F1 macro-avg: %.3f" % (f1_micro, f1_macro))
    print("Accuracy: %.3f" % skmet.accuracy_score(y_true, y_pred))
    # create the visualizations
    print("creating the visualization for %i test examples" % len(visids))
    # collect all the accumulated scores to later create a wordcloud
    scores_collected = np.zeros((len(featurenames), len(clf.classes_)))
    # run through all test documents
    for i, tid in enumerate(visids):
        if not i % 100:
            print("progress: at %i of %i test examples" % (i, len(visids)))
        # transform the feature vector into a diagonal matrix
        feat_vec = lil_matrix((len(featurenames), len(featurenames)), dtype=float)
        feat_vec.setdiag(featmat_test[i, :].toarray().flatten())
        feat_vec = csr_matrix(feat_vec)
        # get the scores (i.e. before summing up)
        scores = clf.decision_function(feat_vec)
        # adapt for the intercept
        scores -= (1. - 1./len(featurenames)) * clf.intercept_
        # when creating the html visualization we want the words speaking for the prediction
        # but when creating the word cloud, we want the words speaking for the actual class
        metainf = tid + '\n'
        # binary or multi class?
        if len(scores.shape) == 1:
            if clf.classes_[0] == predictions_labels[i]:
                # we want the scores which speak for the class - for the negative class,
                # the sign needs to be reversed
                scores *= -1.
            scores_dict = dict(zip(featurenames, scores))
            metainf += 'True Class: %s\n' % doccats[tid]
            metainf += 'Predicted Class: %s  (Score: %.4f)' % (predictions_labels[i], predictions[i])
            scores_collected[:, clf.classes_ == doccats[tid]] += np.array([scores]).T
        else:
            scores_dict = dict(zip(featurenames, scores[:, clf.classes_ == predictions_labels[i]][:, 0]))
            metainf += 'True Class: %s  (Score: %.4f)\n' % (doccats[tid], predictions[i, clf.classes_ == doccats[tid]][0])
            metainf += 'Predicted Class: %s  (Score: %.4f)' % (predictions_labels[i], predictions[i, clf.classes_ == predictions_labels[i]][0])
            scores_collected[:, clf.classes_ == doccats[tid]] += scores[:, clf.classes_ == doccats[tid]]
        # use the vector with scores together with the corresponding feature names and the original text
        # to create the pretty visualization
        if create_html:
            if y_true[i] == y_pred[i]:
                name = 'correct_'
            else:
                name = 'error_'
            name += tid + '_' + doccats[tid]
            scores2html(textdict[tid], scores_dict, os.path.join(subdir_html, name.replace(' ', '_').replace('/', '_')), metainf)
    print("creating word clouds")
    # normalize the scores for each class
    scores_collected /= np.max(np.abs(scores_collected), axis=0)
    # transform the collected scores into a dictionary and create word clouds
    scores_collected_dict = {cat: dict(zip(featurenames, scores_collected[:, clf.classes_ == cat][:, 0])) for cat in clf.classes_}
    for cat in scores_collected_dict:
        create_wordcloud(scores_collected_dict[cat], os.path.join(subdir_wc, "%s.png" % cat), maskfiles[cat] if cat in maskfiles else None)
    return scores_collected_dict
Example #26
searchcv_svc.fit(F, y)

print('the best parameters for SVC classifier using RandomizedSearchCV are ' +
      str(searchcv_svc.best_params_))

# Apply grid and search for logreg classifier

# Create hyperparameter options
hyperparams_grid = {
    "C": [1e-5, 1e-3, 1e-1, 1],
    "fit_intercept": [True, False],
    "penalty": ["l1", "l2"]
}

grid_logreg = GridSearchCV(logreg(solver='liblinear'), hyperparams_grid, cv=5)  # liblinear supports both l1 and l2

grid_logreg.fit(F, y)

print('the best parameters for logreg classifier using GridSearchCV are ' +
      str(grid_logreg.best_params_))

hyperparams_dist = {
    "C": stats.beta(1, 3),
    "fit_intercept": [True, False],
    "penalty": ["l1", "l2"]
}

searchcv_logreg = RandomizedSearchCV(logreg(solver='liblinear'),
                                     hyperparams_dist,
                                     n_iter=20,
Example #27
# So we concatenate the arrays on axis 0 (bc only 1 axis)
predictions = np.concatenate(predictions, axis=0)

predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0

print(predictions)

print(sum(predictions == titanic["Survived"]))
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)  # = 0.783

#------------------- Logistic Regression method ---------------------

# Initialize the algo
algo_logreg = logreg(random_state=1)

# Compute the accuracy score across all cross-validation folds;
# cross_val_score(algo, predictors, target, cv=n_folds).
# Note: cross_val_score now lives in sklearn.model_selection.
scores = cross_val_score(algo_logreg,
                         titanic[predictors],
                         titanic["Survived"],
                         cv=3)
# Mean of the scores for each folds (3 folds)
print(scores.mean())

#----------------------------------- Log Reg. with test set ---------------------

titanic_test = pd.read_csv("test.csv")

# I) Clean data
Example #28
def compute_noise_rates_and_cv_pred_proba(
    X,
    s,
    clf=logreg(),
    cv_n_folds=3,
    positive_lb_threshold=None,
    negative_ub_threshold=None,
    verbose=False,
):
    '''This function computes the out-of-sample predicted
  probability P(s=1|x) for every example x in X using cross
  validation while also computing the confident counts noise
  rates within each cross-validated subset and returning
  the average noise rate across all examples.

  This function estimates rh1 (the fraction of pos examples mislabeled
  as neg, frac_pos2neg) and rh0 (the fraction of neg examples
  mislabeled as pos, frac_neg2pos).

  The acronym 'rh' stands for rho hat, where rho is the Greek letter for
  a noise rate and the hat tells us the value is estimated, not necessarily
  exact. Under certain conditions, estimates are exact, and in most
  conditions, estimates are within one percent of the actual noise rates.

  Parameters
  ----------
    X : np.array
      Input feature matrix (N, D), 2D numpy array

    s : np.array
      A binary vector of labels, s, which may contain mislabeling

    clf : sklearn.classifier or equivalent
      Default classifier used is logistic regression. Assumes clf
      has predict_proba() and fit() defined.

    cv_n_folds : int
      The number of cross-validation folds used to compute
      out-of-sample probabilities for each example in X.
      
    positive_lb_threshold : float 
      P(s^=1|s=1). If an example has a predicted probability "greater" than 
      this threshold, it is counted as having hidden label y = 1. This is 
      not used for pruning, only for estimating the noise rates using 
      confident counts. This value should be between 0 and 1. Default is None.
      
    negative_ub_threshold : float 
      P(s^=1|s=0). If an example has a predicted probability "lower" than
      this threshold, it is counted as having hidden label y = 0. This is
      not used for pruning, only for estimating the noise rates using
      confident counts. This value should be between 0 and 1. Default is None.

    verbose : bool
      Set to true if you wish to print additional information while running.
  '''

    # Create cross-validation object for out-of-sample predicted probabilities.
    # CV folds preserve the fraction of noisy positive and
    # noisy negative examples in each class.
    kf = StratifiedKFold(n_splits=cv_n_folds, shuffle=True)

    # Initialize result storage and final prob_s_eq_1 array
    rh1_per_cv_fold = []
    rh0_per_cv_fold = []
    prob_s_eq_1 = np.zeros(np.shape(s))

    # Split X and s into "cv_n_folds" stratified folds.
    for k, (cv_train_idx, cv_holdout_idx) in enumerate(kf.split(X, s)):

        # Select the training and holdout cross-validated sets.
        X_train_cv, X_holdout_cv = X[cv_train_idx], X[cv_holdout_idx]
        s_train_cv, s_holdout_cv = s[cv_train_idx], s[cv_holdout_idx]

        # Fit the clf classifier to the training set and
        # predict on the holdout set and update prob_s_eq_1.
        clf.fit(X_train_cv, s_train_cv)
        prob_s_eq_1_cv = clf.predict_proba(X_holdout_cv)[:, 1]  # P(s = 1|x)
        prob_s_eq_1[cv_holdout_idx] = prob_s_eq_1_cv

        # Compute and append the confident counts noise estimators
        # to estimate the positive and negative mislabeling rates.
        rh1_cv, rh0_cv = compute_conf_counts_noise_rates_from_probabilities(
            s=s_holdout_cv,
            prob_s_eq_1=prob_s_eq_1_cv,
            positive_lb_threshold=positive_lb_threshold,
            negative_ub_threshold=negative_ub_threshold,
            verbose=verbose,
        )
        rh1_per_cv_fold.append(rh1_cv)
        rh0_per_cv_fold.append(rh0_cv)

    # Return mean rh, omitting nan or inf values, and prob_s_eq_1
    return (
        _mean_without_nan_inf(rh1_per_cv_fold),
        _mean_without_nan_inf(rh0_per_cv_fold),
        prob_s_eq_1,
    )
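A sketch of consuming the three return values in the binary setting (X and s as in the docstring):

rh1_est, rh0_est, prob_s_eq_1 = compute_noise_rates_and_cv_pred_proba(
    X, s, cv_n_folds=3,
)
print('Estimated P(s=0|y=1), positives mislabeled as negative:', rh1_est)
print('Estimated P(s=1|y=0), negatives mislabeled as positive:', rh0_est)
# prob_s_eq_1 holds the out-of-sample P(s=1|x) for every example in X.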
Example #29
from cleanlab.classification import LearningWithNoisyLabels
from cleanlab.noise_generation import generate_noisy_labels
from cleanlab.util import value_counts
from cleanlab.latent_algebra import compute_inv_noise_matrix


# ## **rankpruning** is the first practical *(works for any classifier, runs fast, robust to poor probability estimation)* algorithm for multiclass learning with noisy labels. It is composed of components from the theory and algorithms of **confident learning**. It's a Python class that wraps around any classifier as long as .fit(X, y, sample_weight), .predict(X), and .predict_proba(X) are defined. Inspect the **cleanlab** package for documentation.
# 
# ## Here we show the performance of multiclass rankpruning wrapped around a sklearn LogisticRegression classifier versus LogisticRegression without any help from confident learning, on the Iris dataset.

# In[16]:


# Seed for reproducibility
seed = 2
rp = LearningWithNoisyLabels(clf = logreg(), seed = seed)
np.random.seed(seed = seed)

# Get iris dataset
iris = datasets.load_iris()
X = iris.data  # use all four iris features
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


try:
    get_ipython().run_line_magic('matplotlib', 'inline')
    from matplotlib import pyplot as plt
    _ = plt.figure(figsize=(12,8))
    color_list = plt.cm.tab10(np.linspace(0, 1, 6))
    _ = plt.scatter(X_train[:,1], X_train[:,3], color = [color_list[z] for z in y_train], s = 50)  
Example #30
        hist = np.histogram(lbp, bins=range(257))[0]
        F.append(hist)

    return np.array(F)


X, y = load_data("GTSRB_subset")
F = extract_lbp_features(X)
#F= Normalizer().fit(F)
F = scale(F)

X_train, X_test, y_train, y_test = train_test_split(F, y, test_size=0.2)

names = ["LogisticRegression", "SVC"]

classifiers = [logreg(solver='liblinear'), SVC()]  # liblinear supports the l1 penalty set below

C_range = 10.0**np.arange(-5, 0)

for name, clf in zip(names, classifiers):
    for C in C_range:
        for penalty in ["l1", "l2"]:
            clf.C = C
            clf.penalty = penalty
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            score = accuracy_score(y_test, y_pred)
            print('The score of ' + name +
                  ' for C = %.2e and penalty = %s is %.3f' %
                  (C, penalty, score))
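Mutating attributes on shared estimator instances is fragile: SVC silently ignores an attached penalty attribute, and l1-penalized logistic regression requires a compatible solver. A more defensive version of the same sweep, sketched with sklearn.base.clone so each fit starts from a fresh, validated copy:

from sklearn.base import clone

for name, base_clf in zip(names, classifiers):
    for C in C_range:
        # clone() returns an unfitted copy with identical settings;
        # set_params() validates parameter names instead of silently
        # attaching unused attributes.
        clf = clone(base_clf).set_params(C=C)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print('The score of %s for C = %.2e is %.3f'
              % (name, C, accuracy_score(y_test, y_pred)))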
Example #31
dot_data = tree.export_graphviz(dt, out_file=None)
graph = graphviz.Source(dot_data)

predictors = X_train.columns
#print(predictors)
dot_data = tree.export_graphviz(dt,
                                out_file=None,
                                feature_names=predictors,
                                class_names=('Negative', 'Positive'),
                                filled=True,
                                rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph
"""
Model testing
"""
selected_attributes = [
    'kw_max_avg', 'data_channel_is_world', 'data_channel_is_entertainment',
    'LDA_03'
]
X = news_pop[selected_attributes]
y = news_pop['shares_bin'].values  # keep y one-dimensional; a column vector triggers a DataConversionWarning in sklearn

mylr = logreg()
mylr.fit(X, y)

model_summary = func.ModelSummary(mylr, X, y)
model_summary.get_summary()
Example #32
import prepod.lib.io as io
import prepod.lib.prep as prep
import prepod.lib.models as mdl

path_data = '/Users/jannes/Projects/delir/data/'
path_labels = path_data + 'info/sudocu_info/subject_data.csv'
path_out = '/Users/jannes/Projects/delir/results/test_{}.csv'
target = 'delir_60min'
data = io.parse_subj_info(path_labels, 'Sudocu')
data = prep.drop_non_feature_cols(data, target)
data = prep.drop_if_too_many_nans(data, .25)
features = list(data.drop(['subj_id', 'delir_60min', 'age'], axis=1))
data = prep.drop_na(data, features)
data = prep.to_fv(data, features, target)

clfs = [svm.SVC(gamma='scale'), logreg(solver='liblinear')]
for clf in clfs:
    res = mdl.backward_subset_selection(data['X'], data['y'], data['X_labels'], data['y_labels'], K=1, clf=clf)
    res = pd.DataFrame(res).sort_values(by='mean_acc', ascending=False)
    res = res[res['n_features'] < 10]
    res['target'] = target
    with open(path_out.format('bss'), 'a') as f:
        res.to_csv(f, index=False)
    res = mdl.forward_subset_selection(data, K=len(features), init_combos=2, clf=clf)
    res = pd.DataFrame(res).sort_values(by='mean_acc', ascending=False)
    res = res[res['n_features'] < 10]
    res['target'] = target
    with open(path_out.format('fss'), 'a') as f:
        res.to_csv(f, header=False, index=False)

Example #34
print(y)


# In[12]:


from sklearn.model_selection import train_test_split as tts
x_train,x_test,y_train,y_test=tts(x,y,test_size=0.25,random_state=42)


# In[13]:


from sklearn.linear_model import LogisticRegression as logreg
model_logreg = logreg() 
model_logreg.fit(x_train,y_train)


# ----> Logistic regression is a statistical model that in its basic form uses a logistic function to model a binary dependent variable, although many more complex extensions exist. In regression analysis, logistic regression (or logit regression) estimates the parameters of a logistic model (a form of binary regression).
# 
# ----> I am using logistic regression because this is a classification problem.

# In[14]:


y_p=model_logreg.predict(x_test)


# In[15]:
Example #35
import numpy as np

from sklearn.datasets import load_digits
digits=load_digits()
data=digits['data']
target=digits['target']

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as logreg
from sklearn.multiclass import OneVsRestClassifier

logregl1 = logreg(penalty='l1', solver='liblinear')  # the l1 penalty needs a liblinear/saga solver in modern sklearn

train_data = data[:1000]
train_target= target[:1000]

test_data = data[1000:]
test_target= target[1000:]

logregl1.fit(train_data, train_target)

prediction = logregl1.predict(test_data)  # first prediction, usable for a quick display (unrelated to the cross-validation below)

from sklearn.model_selection import cross_val_score  # formerly sklearn.cross_validation, which was removed

score = cross_val_score(logregl1, data, target, cv=5)

clfs = [OneVsRestClassifier(logreg(penalty='l1', C=alpha)) for alpha in np.logspace(-5, 1, 10)]

scores = [cross_val_score(clf, data, target, cv=5) for clf in clfs]