コード例 #1
0
def predictData():
    """Train on 'testSet.txt' and tally predictions on 'dataSet.txt'.

    The classifier is fitted with default parameters on the samples loaded
    from 'testSet.txt' and then asked to predict every sample loaded from
    'dataSet.txt'.

    Returns
    -------
    tuple
        ``(count, rCount, prCount)`` where
        ``count``   is the number of evaluation samples predicted as 1,
        ``rCount``  is the number of evaluation samples whose label is 1,
        ``prCount`` is the number predicted as 1 whose label is also 1.
    """
    dataMatIn, classLabels = loadDataSet('testSet.txt')
    dataMatrix = mat(dataMatIn)
    labelMatrix = mat(classLabels).transpose()
    classifier = LogReg()  # use the class with all-default parameters
    classifier.fit(dataMatrix, labelMatrix)  # learns in place; return value not needed

    tdataMatIn, tclassLabels = loadDataSet('dataSet.txt')

    count = 0
    rCount = 0
    prCount = 0
    # Predictions must be made on the evaluation samples so that they line up
    # with the evaluation labels they are compared against.
    # NOTE(review): assumes loadDataSet returns parallel sequences of equal
    # length -- confirm.
    for sample, label in zip(tdataMatIn, tclassLabels):
        res = classifier.predict(sample)
        if res == 1:
            count += 1
            if label == 1:
                prCount += 1
                rCount += 1
        elif label == 1:
            rCount += 1
    return count, rCount, prCount
コード例 #2
0
def train_LogReg(x_train,y_train):
    """Fit ``LogReg`` with an explicitly spelled-out hyper-parameter set.

    Parameters
    ----------
    x_train : training features, passed straight to ``fit``.
    y_train : training targets, passed straight to ``fit``.

    Returns
    -------
    Whatever ``LogReg(...).fit(x_train, y_train)`` returns.
    """
    estimator = LogReg(
        penalty='l2',          # regularisation type
        # Dual or primal formulation. The dual formulation is only
        # implemented for the l2 penalty with the liblinear solver;
        # prefer dual=False when n_samples > n_features.
        dual=False,
        tol=1e-4,              # tolerance for the stopping criteria
        C=1,                   # regularisation parameter
        fit_intercept=True,
        intercept_scaling=1,
        # Weights associated with classes in the form {class_label: weight}.
        # If not given, all classes are supposed to have weight one. The
        # "balanced" mode uses the values of y to adjust weights inversely
        # proportional to class frequencies, i.e.
        # n_samples / (n_classes * np.bincount(y)). These weights are
        # multiplied with sample_weight (passed through fit) if specified.
        class_weight=None,
        random_state=None,
        solver='liblinear',
        max_iter=100,
        multi_class='ovr',
        verbose=1,
        n_jobs=None,
    )
    return estimator.fit(x_train, y_train)
コード例 #3
0
def forewardStepWise(model = LogReg(random_state = 0, solver = 'lbfgs', max_iter = 1000)):
    """Greedy forward step-wise feature selection scored by 10-fold CV accuracy.

    Works on the module-level ``X`` (column-indexable feature table) and
    ``y`` (target). At every step each not-yet-selected column is tried in
    turn on top of the columns selected so far; the column giving the best
    mean cross-validated accuracy is added permanently. When several
    candidates tie, the first one in column order is chosen.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``. The default instance is
        created once, when the function is defined.

    Returns
    -------
    list
        The selected column names of the step with the highest mean accuracy.
        If several steps share that accuracy, the last (largest) one wins.

    Raises
    ------
    ValueError
        If ``X`` has no columns.
    """
    remaining = list(X.columns)
    if not remaining:
        raise ValueError("X has no columns to select from")

    usedVar = []        # columns selected so far, in selection order
    nVarModel = {}      # step number -> snapshot of the selected columns
    optimalVar = []     # best mean CV accuracy reached at each step
    for step in range(1, len(remaining) + 1):
        # cross_val_score fits clones of the estimator itself, so no
        # separate model.fit() call is needed before scoring.
        accuracySelector = [
            cross_val_score(model, X[usedVar + [candidate]], y,
                            scoring = 'accuracy', cv=10).mean()
            for candidate in remaining
        ]
        nStep = max(accuracySelector)
        optimalVar.append(nStep)
        locate = accuracySelector.index(nStep)
        usedVar.append(remaining.pop(locate))
        # Store a copy: keeping a reference to usedVar would let later steps
        # silently extend the lists recorded for earlier steps.
        nVarModel[step] = list(usedVar)

    maxAccuracy = max(optimalVar)
    goodModel = None
    for count, accuracy in enumerate(optimalVar, start=1):
        if accuracy == maxAccuracy:
            goodModel = nVarModel[count]
    return goodModel
#%%
コード例 #4
0
ファイル: horse.py プロジェクト: Pallas1992/python-1
def predictData(lfilename, pfilename):
    """Fit a classifier on one data file and tally its predictions on another.

    Parameters
    ----------
    lfilename : argument given to ``Loaddata`` to obtain the training samples.
    pfilename : argument given to ``Loaddata`` to obtain the samples to predict.

    Returns
    -------
    tuple
        ``(count, rCount, prCount)`` where
        ``count``   is the number of samples predicted as 1,
        ``rCount``  is the number of samples whose label is 1,
        ``prCount`` is the number predicted as 1 whose label is also 1.
    """
    train_rows, train_labels = Loaddata(lfilename)
    train_matrix = mat(train_rows)
    train_targets = mat(train_labels).transpose()

    classifier = LogReg()
    classifier.fit(train_matrix, train_targets)

    tdataMatIn, tclassLabels = Loaddata(pfilename)
    # Matrix forms of the evaluation data are built but not consulted below.
    tdataMatrix = mat(tdataMatIn)
    tlabelMatrix = mat(tclassLabels).transpose()

    count = rCount = prCount = 0
    for idx, sample in enumerate(tdataMatIn):
        predicted_positive = classifier.predict(sample) == 1
        if predicted_positive:
            count += 1
        if tclassLabels[idx] == 1:
            rCount += 1
            if predicted_positive:
                prCount += 1
    return count, rCount, prCount
コード例 #5
0
def zip_estimate(zs, ys, C=1, classifier="logreg", penalty='l2'):
    """Regress a binary outcome against hierarchical zip-code prefixes.

    The regression has the form ``y ~ z1 + z2 + z3 + z4 + z5`` where ``zk``
    is a one-hot indicator of the first ``k`` digits of the zip code. All
    indicators live in one sparse design matrix with 111110 columns
    (10 + 100 + 1000 + 10000 + 100000).

    Parameters
    ----------
    zs : sized iterable of zip codes; entries rejected by ``is_valid_zip``
        keep an all-zero feature row but still take part in the fit.
    ys : outcomes, one per entry of ``zs``.
    C : regularisation constant; used as ``C`` for logistic regression and
        as ``alpha = 1 / C`` for ridge regression.
    classifier : ``"logreg"`` or ``"ridge"``.
    penalty : penalty passed to the logistic regression.

    Returns
    -------
    collections.defaultdict
        Maps every digit string of length 1 to 5 to its fitted value
        (class-1 probability for ``"logreg"``, raw prediction for
        ``"ridge"``); any other key yields ``mean(ys)``.

    Raises
    ------
    ValueError
        If ``classifier`` is not recognised. Raised before any matrix is
        built.
    """
    n_cols = 111110

    # Validate the requested model first so a typo fails immediately instead
    # of after the design matrices have been assembled.
    if classifier == "logreg":
        clf = LogReg(C=C, penalty=penalty)
    elif classifier == "ridge":
        clf = Ridge(alpha=1 / C)
    else:
        raise ValueError("didn't recognize classifier:", classifier)

    background_rate = mean(ys)

    def prefix_columns(z, depth):
        """Yield the column index of each prefix z[:1] .. z[:depth]."""
        offset = 0
        for d in range(1, depth + 1):
            yield int(z[:d]) + offset
            # Prefixes of length d occupy a band of 10**d columns.
            offset += 10**d

    N = len(zs)
    A = lil_matrix((N, n_cols))
    valids = 0
    for i, z in tqdm(enumerate(zs), total=N):
        if not is_valid_zip(z):
            continue
        valids += 1
        for j in prefix_columns(z, 5):
            A[i, j] = 1
    print("valids:", valids)

    print("fitting logreg")
    clf.fit(A, ys)

    # Every digit string of length 1..5, shortest first.
    all_zs = [
        "".join(x) for k in range(1, 5 + 1)
        for x in product(*("0123456789" for _ in range(k)))
    ]
    all_A = lil_matrix((n_cols, n_cols))
    for i, z in tqdm(enumerate(all_zs), total=n_cols):
        for j in prefix_columns(z, len(z)):
            all_A[i, j] = 1

    if classifier == "logreg":
        all_yhats = clf.predict_proba(all_A)[:, 1]
    else:
        all_yhats = clf.predict(all_A)
    ml_dict = dict(zip(all_zs, all_yhats))
    return defaultdict(lambda: background_rate, ml_dict)
コード例 #6
0
ファイル: linear.py プロジェクト: walleDCR/ssl-transfer
    def __init__(self, input_dim, num_classes, metric):
        """Record the configuration, build the classifier and print a summary.

        Parameters
        ----------
        input_dim : stored on ``self.input_dim``.
        num_classes : stored on ``self.num_classes`` and echoed in the summary.
        metric : stored on ``self.metric`` and echoed in the summary.
        """
        super().__init__()
        self.input_dim, self.num_classes, self.metric = (
            input_dim, num_classes, metric)
        # Multinomial L-BFGS logistic regression that reuses its previous
        # solution on repeated fits (warm_start=True).
        self.clf = LogReg(
            solver='lbfgs',
            multi_class='multinomial',
            warm_start=True,
        )

        print('Logistic regression:')
        print('\t solver = L-BFGS')
        print("\t classes = {}".format(self.num_classes))
        print("\t metric = {}".format(self.metric))
コード例 #7
0
    def __init__(
        self,
        clf=None,
        *,
        seed=None,
        # Hyper-parameters (used by .fit() function)
        cv_n_folds=5,
        converge_latent_estimates=False,
        pulearning=None,
        find_label_issues_kwargs={},
        label_quality_scores_kwargs={},
        verbose=False,
    ):
        """Validate the wrapped classifier and initialise all instance state.

        Parameters
        ----------
        clf : object providing ``fit``, ``predict_proba`` and ``predict``.
            When ``None`` a logistic regression is created.
        seed : when not ``None``, passed to ``np.random.seed``.
        cv_n_folds, converge_latent_estimates, pulearning,
        find_label_issues_kwargs, label_quality_scores_kwargs, verbose :
            stored unchanged on the instance.

        Raises
        ------
        ValueError
            If ``clf`` lacks one of the three required methods.
        """
        # NOTE: the ``{}`` defaults above are single shared objects; they are
        # stored as-is, so callers must not mutate them in place.

        # Fall back to logistic regression when no classifier is supplied.
        clf = LogReg(multi_class="auto", solver="lbfgs") if clf is None else clf

        # The classifier must expose the methods used later on.
        for required in ("fit", "predict_proba", "predict"):
            if not hasattr(clf, required):
                raise ValueError(
                    "The classifier (clf) must define a .%s() method." % required)

        if seed is not None:
            np.random.seed(seed=seed)

        # Constructor arguments, stored verbatim.
        self.clf = clf
        self.seed = seed
        self.cv_n_folds = cv_n_folds
        self.converge_latent_estimates = converge_latent_estimates
        self.pulearning = pulearning
        self.find_label_issues_kwargs = find_label_issues_kwargs
        self.label_quality_scores_kwargs = label_quality_scores_kwargs
        self.verbose = verbose

        # Remaining attributes start out unset.
        self.label_issues_df = None
        self.label_issues_mask = None
        self.sample_weight = None
        self.confident_joint = None
        self.py = None
        self.ps = None
        self.num_classes = None
        self.noise_matrix = None
        self.inverse_noise_matrix = None
        self.clf_kwargs = None
        self.clf_final_kwargs = None
コード例 #8
0
    def __init__(
        self,
        clf=None,
        seed=None,
        # Hyper-parameters (used by .fit() function)
        cv_n_folds=5,
        prune_method='prune_by_noise_rate',
        converge_latent_estimates=False,
        pulearning=None,
        n_jobs=None,
    ):
        """Validate the wrapped classifier and initialise all instance state.

        Parameters
        ----------
        clf : object providing ``fit``, ``predict_proba`` and ``predict``.
            When ``None`` a logistic regression is created.
        seed : when not ``None``, passed to ``np.random.seed``.
        cv_n_folds, prune_method, converge_latent_estimates, pulearning :
            stored unchanged on the instance.
        n_jobs : worker count; ``None`` means ``multiprocessing.cpu_count()``,
            any other value is asserted to be at least 1.

        Raises
        ------
        ValueError
            If ``clf`` lacks one of the three required methods.
        """
        if clf is None:
            # Default to logistic regression.
            clf = LogReg(multi_class='auto', solver='lbfgs')

        # Report the first required method the classifier does not provide.
        missing = next(
            (name for name in ("fit", "predict_proba", "predict")
             if not hasattr(clf, name)),
            None,
        )
        if missing is not None:
            raise ValueError(
                'The classifier (clf) must define a .{}() method.'.format(missing))

        if seed is not None:
            np.random.seed(seed=seed)

        # Number of multiprocessing workers (per the original comment, used
        # by get_noise_indices()).
        if n_jobs is not None:
            assert (n_jobs >= 1)
        else:
            n_jobs = multiprocessing.cpu_count()

        # Constructor arguments.
        self.clf = clf
        self.seed = seed
        self.cv_n_folds = cv_n_folds
        self.prune_method = prune_method
        self.converge_latent_estimates = converge_latent_estimates
        self.pulearning = pulearning
        self.n_jobs = n_jobs

        # Remaining attributes start out unset.
        self.noise_mask = None
        self.sample_weight = None
        self.confident_joint = None
        self.py = None
        self.ps = None
        self.K = None
        self.noise_matrix = None
        self.inverse_noise_matrix = None
コード例 #9
0
def estimate_cv_predicted_probabilities(
    X,
    labels,  # class labels can be noisy (s) or not noisy (y).
    clf=LogReg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    seed=None,
):
    """Return cross-validated, out-of-sample predicted probabilities.

    Thin wrapper around ``estimate_py_noise_matrices_and_cv_pred_proba``
    that keeps only the last element of its result.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array.

    labels : np.array or list of ints from [0,1,..,K-1]
        Discrete class labels, which may or may not contain mislabeling.

    clf : sklearn.classifier or equivalent
        Classifier to use; logistic regression by default. Assumed to define
        ``predict_proba()`` and ``fit()``. The default instance is created
        once, when this function is defined.

    cv_n_folds : int
        Number of cross-validation folds used to compute the out-of-sample
        probabilities.

    seed : int (default = None)
        Seed for the random number generator that splits the folds. If None,
        the current np.random state is used.

    Returns
    -------
    psx : np.array (shape (N, K))
        P(label=k|x): for each of the N examples, the (noisy) probability of
        each of the K classes."""

    estimates = estimate_py_noise_matrices_and_cv_pred_proba(
        X=X,
        s=labels,
        clf=clf,
        cv_n_folds=cv_n_folds,
        seed=seed,
    )
    # The predicted-probability matrix is the final element of the result.
    psx = estimates[-1]
    return psx
コード例 #10
0
def test(emb, label_mat, emb_IDmap, label_IDmap, n_splits, random_state,
         shuffle):
    """Evaluate an embedding by per-class node classification.

    For every column of ``label_mat`` an L2-regularised logistic regression
    is trained and scored with ``n_splits``-fold cross validation; progress
    is printed on a single, continuously overwritten line.

    Parameters
    ----------
    emb : embedding matrix, indexed by row.
    label_mat : label matrix; one column per class.
    emb_IDmap : maps an ID to its row in ``emb``.
    label_IDmap : iterable of the IDs to evaluate, in order.
    n_splits, random_state, shuffle : forwarded to the fold splitter.

    Returns
    -------
    (y_true_all, y_pred_all)
        Two lists with one array per class: the held-out labels and the
        matching ``decision_function`` scores, concatenated over the folds.
    """
    n_classes = label_mat.shape[1]
    # Rows of the embedding in the order of the labelled IDs.
    x = emb[[emb_IDmap[ID] for ID in label_IDmap]]

    splitter = skf(n_splits=n_splits,
                   random_state=random_state,
                   shuffle=shuffle)
    mdl = LogReg(penalty='l2', solver='lbfgs', warm_start=False, max_iter=1000)

    y_true_all, y_pred_all = [], []

    for class_idx in range(n_classes):
        y = label_mat[:, class_idx]

        held_out_labels = np.array([], dtype=bool)
        held_out_scores = np.array([])

        folds = enumerate(splitter.split(y, y), start=1)
        for fold_no, (train_idx, test_idx) in folds:
            print("Testing class #{:>4d},\tfold {:>2d} / {:<2d}".format(
                class_idx + 1, fold_no, n_splits),
                  flush=True,
                  end='\r')
            mdl.fit(x[train_idx], y[train_idx])

            held_out_labels = np.append(held_out_labels, y[test_idx])
            held_out_scores = np.append(
                held_out_scores, mdl.decision_function(x[test_idx]))

        y_true_all.append(held_out_labels)
        y_pred_all.append(held_out_scores)

    # Move past the progress line.
    print('')

    return y_true_all, y_pred_all
コード例 #11
0
    def fit(self, rs: RecordSet) -> None:
        """
        Fit an L1-penalised logistic regression on a record set.

        A deep copy of ``rs`` is kept on ``self.data``. Every column of its
        ``entries`` except the last is used as input, the last column as
        target. The fitted estimator is stored on ``self.model``.

        :param rs: The record set to fit with.
        :raises Exception: if ``self.alpha`` equals 0, since the
            regularisation constant is computed as ``1 / self.alpha``.
        """
        self.data = cp.deepcopy(rs)
        features = self.data.entries[:, :-1]
        target = self.data.entries[:, -1:]

        # Guard against dividing by zero below.
        if self.alpha == 0:
            raise Exception(
                "Alpha Logistic too low to obtain reliable results")

        classifier = LogReg(C=1 / self.alpha, penalty="l1", solver="liblinear")
        self.model = classifier
        # The target slice is 2-D (n, 1); flatten it for fitting.
        self.model.fit(X=features, y=target.ravel())
コード例 #12
0
def cross_val_c(window, seeds_arr, slots_arr, tourney_arr, column_names,
                predictor_dfs):
    """Score logistic regression over a grid of C values, one test year at a time.

    For every test year from ``2003 + window`` up to and including 2015 the
    data are split with the project's ``train_test_split``, the predictor
    columns (everything from the third column on) are standardised with a
    scaler fitted on the training part, and one model per value of C in
    {1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100} is fitted and scored.

    Parameters
    ----------
    window, seeds_arr, slots_arr, tourney_arr, column_names, predictor_dfs :
        forwarded unchanged to ``train_test_split``.

    Returns
    -------
    pandas.DataFrame
        Indexed by test year, one column per C value (labelled '0.0001',
        '0.001', '0.01', '.1', '1', '10', '100'), holding ``model.score``
        on the test part.
    """
    col_names = ['0.0001', '0.001', '0.01', '.1', '1', '10', '100']
    c_vals = range(-4, 3)  # exponents; C = 10**c, aligned with col_names
    test_yr_range = range(2003 + window, 2016)
    scores = pd.DataFrame(index=test_yr_range, columns=col_names)

    for yr in test_yr_range:
        x_train, y_train, x_test, y_test = train_test_split(
            window, yr, seeds_arr, slots_arr, tourney_arr, column_names,
            predictor_dfs)
        scaler = StandardScaler().fit(x_train.iloc[:, 2:])
        # The scaled inputs and flattened targets do not depend on C, so
        # compute them once per year.
        x_train_scaled = scaler.transform(x_train.iloc[:, 2:])
        x_test_scaled = scaler.transform(x_test.iloc[:, 2:])
        y_train_vec = y_train.values.T[0]
        y_test_vec = y_test.values.T[0]
        for col_name, c in zip(col_names, c_vals):
            model = LogReg(C=10**c)
            model.fit(x_train_scaled, y_train_vec)
            # Write by label so each score lands in the column named after
            # the C value that produced it.
            scores.loc[yr, col_name] = model.score(x_test_scaled, y_test_vec)
    return scores
コード例 #13
0
    sent = sentences(essay)
    numOfSent_train.append(sent)

# Compute the sentence feature of every validation essay via sentences().
numOfSent_valid = []
for essay in valid_df['essay']:
    sent = sentences(essay)
    numOfSent_valid.append(sent)

# Attach the per-essay values as a new column on both frames.
train_df['sentence_length'] = numOfSent_train
valid_df['sentence_length'] = numOfSent_valid

# Derive per-essay-set regularisation data from the training frame, then
# replace the training frame with its regularised version.
regularization_data_sentence = create_regularization_sentence_length(train_df)
train_df = append_regularized_sentence_length(train_df)

# Fit an L2-penalised logistic regression on the training set.
logistic_l2 = LogReg(penalty='l2', solver='liblinear', n_jobs=4)
# One single-feature row per essay.
xs = [[x] for x in np.array(train_df['sentence_length'])]
logistic_l2.fit(xs, train_std_scores)

# Rescale the validation values, essay set by essay set.
max_essay_set = max(train_df['essay_set'])
stand_pred_values_l2 = []
for i in range(max_essay_set):
    # Essay sets are numbered from 1, the regularisation data from 0.
    current_set = valid_df[valid_df['essay_set'] == i + 1]['sentence_length']
    for value in current_set:
        # value * entry[2] + entry[1], truncated to int.
        # NOTE(review): presumably entry[2] is a scale and entry[1] an
        # offset for set i -- confirm against
        # create_regularization_sentence_length.
        stand_pred_values_l2.append(int(float(value) * float(regularization_data_sentence[i][2]) + (regularization_data_sentence[i][1])))

# Predict the score from the rescaled sentence length.
# NOTE(review): the list was built grouped by essay set and is assigned
# positionally, so it lines up with valid_df's rows only if valid_df is
# already ordered by essay_set -- confirm.
valid_df['new_sentence_length_std'] = stand_pred_values_l2
valid_x = [[x] for x in np.array(valid_df['new_sentence_length_std'])]
valid_pred_std_scores_l2 = logistic_l2.predict(valid_x)
コード例 #14
0
    return pd.DataFrame(np.array(data).reshape(-1, 3),
                        columns=['ID', 'rate', 'response'])


def long_stats(dfk, columns):
    """Fit a mixed linear model ``rate ~ response`` and print its summary.

    The data come from ``final_dataLONG(columns, dfk)`` and the observations
    are grouped by that table's ``ID`` column. Nothing is returned.
    """
    long_table = final_dataLONG(columns, dfk)
    fitted = smf.mixedlm(
        'rate ~ response', long_table, groups=long_table['ID']).fit()
    print(fitted.summary())


#####
##IMPLEMENTATION
#####
# Multinomial logistic regression using the newton-cg solver.
lr = LogReg(solver='newton-cg', multi_class='multinomial')

# Replace missing values with the literal string 'NA' in both frames.
df2 = df_1y.replace([np.nan], 'NA')
df3 = df_Uy.replace([np.nan], 'NA')

# Run final_data on every column except the first.
df1y = final_data(df2, list(df2)[1:])
dfUy = final_data(df3, list(df3)[1:])

###
#Q5 RESULTS
###

#1y Students
print('===1y Q5===')
# Predictors: Q5B and Q5C; target: Q5A.
x = df1y[['Q5B', 'Q5C']]
y = df1y['Q5A']
コード例 #15
0
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.cross_validation import train_test_split
import sklearn.metrics as metrics
import pandas as pd

# Load the vectorized train and test files (tab-separated, no header row).
train = pd.read_csv('../datasets/bless2011/data_lex_train_vectorized_diff.tsv', sep='\t', header=None)
test = pd.read_csv('../datasets/bless2011/data_lex_test_vectorized_diff.tsv', sep='\t', header=None)

### Training
# Drop every row that contains a NaN.
train.dropna(axis=0, inplace=True)
# The last column is the label (cast to bool); all other columns are features.
X = train.iloc[:, :-1]
y = train.iloc[:, -1].astype(bool)

# Logistic regression with default parameters.
clf = LogReg()
clf.fit(X, y)

### Testing
# Remember the original size so the number of dropped rows can be computed.
orig_rows, orig_cols = test.shape

# Drop every row that contains a NaN.
test.dropna(axis=0, inplace=True)

# Number of test rows removed by the NaN filter.
diff = orig_rows - test.shape[0]

# X and y are rebound here: from now on they hold the test split.
X = test.iloc[:, :-1]
y = test.iloc[:, -1].astype(bool)

preds = clf.predict(X)
コード例 #16
0
    if y_test[i] == 1:
        pos += 1
    else:
        neg += 1

# Model statistics for the results table, in column order: number of epochs
# (31), learning rate (0.1), accuracy, false positives, false negatives.
model_stats = [31, 0.1, acc, fp, fn]
#model_stats = np.zeros((2,5))
#model_stats[0,0] = 31
#model_stats[0,1] = 0.1
#model_stats[0,2] = acc
#model_stats[0,3] = fp
#model_stats[0,4] = fn

# Build a one-row DataFrame and render it as a LaTeX table.
df = pd.DataFrame.from_records([model_stats],
                               columns=[
                                   'Number of epochs', 'Learning rate',
                                   'Accuracy', 'False positives',
                                   'False negatives'
                               ])

# Floats are printed with two decimals; the index column is omitted.
tab = df.to_latex(index=False, float_format="%.2f")
print(f"\n\n{tab}\n\n")

# Reference result: the same split classified with LogReg (default
# parameters), for comparison.
logreg2 = LogReg()
logreg2.fit(X_train, y_train)
y_pred = logreg2.predict(X_test)
print("scikit-learn accuracy: {:.5f}".format(sklearn_accuracy(y_test, y_pred)))
コード例 #17
0
ファイル: start_pos_LR.py プロジェクト: sizej/NFL_RT_Prob
    start['is_EOH'] = start.apply(lambda row: end_of_half_det(row), axis=1)
    start['pos_leads'] = (start['posteam_score'] >
                          start['defteam_score']).astype(int)
    to_drop = [
        'Unnamed: 0', 'game_date', 'ends_TD', 'ends_FG', 'ends_punt',
        'ends_other'
    ]
    sk = start.copy()
    sk.drop(to_drop, axis=1, inplace=True)
    sk_train, sk_test = tt_split(sk)
    y_train = sk_train.pop('target').values
    X_train = sk_train.values
    y_test = sk_test.pop('target').values
    X_test = sk_test.values
    mod = LogReg(solver='saga',
                 max_iter=5000,
                 multi_class='multinomial',
                 n_jobs=-1)
    mod.fit(X_train, y_train)
    mod_score = np.around(mod.score(X_test, y_test), 3)

    holdout = pd.read_csv('data/start_pos_holdout.csv')
    m0 = holdout['yardline_100'].notna()
    holdout = holdout[m0].copy()
    holdout['target'] = holdout.apply(lambda row: make_target(row), axis=1)
    holdout['is_EOH'] = holdout.apply(lambda row: end_of_half_det(row), axis=1)
    holdout['pos_leads'] = (holdout['posteam_score'] >
                            holdout['defteam_score']).astype(int)
    to_drop.append('game_id')
    holdout.drop(to_drop, axis=1, inplace=True)
    holdout_y = holdout.pop('target').values
    holdout_X = holdout.values
コード例 #18
0
def estimate_noise_matrices(
    X,
    s,
    clf=LogReg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    thresholds=None,
    converge_latent_estimates=True,
    seed=None,
):
    """Estimate the (K, K) noise matrix and its inverse.

    The noise_matrix holds, for every class, the fraction of its examples
    labeled as every other class; it is a conditional probability matrix
    for P(s=k_s|y=k_y). Under certain conditions the estimates are exact,
    and in most conditions they are within one percent of the actual noise
    rates.

    Thin wrapper around ``estimate_py_noise_matrices_and_cv_pred_proba``
    that keeps the slice ``[1:-2]`` of its result.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array.

    s : np.array
        Discrete vector of labels which may contain mislabeling. "s" denotes
        the noisy label (written instead of y-tilde for ASCII reasons).

    clf : sklearn.classifier or equivalent
        Classifier to use; logistic regression by default. Assumed to define
        ``predict_proba()`` and ``fit()``. The default instance is created
        once, when this function is defined.

    cv_n_folds : int
        Number of cross-validation folds used to compute out-of-sample
        probabilities for each example in X.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k). An example whose predicted probability is greater than
        this threshold is counted as having hidden label y = k. Not used for
        pruning, only for estimating the noise rates using confident counts.
        Values should be between 0 and 1. Default is None.

    converge_latent_estimates : bool
        If true, forces numerical consistency of the estimates. Each is
        estimated independently, but they are related mathematically with
        closed form equivalences; this iteratively makes them consistent.

    seed : int (default = None)
        Seed for the random number generator that splits the folds. If None,
        the current np.random state is used.

    Returns
    -------
        A two-item tuple containing (noise_matrix, inv_noise_matrix)."""

    estimates = estimate_py_noise_matrices_and_cv_pred_proba(
        X=X,
        s=s,
        clf=clf,
        cv_n_folds=cv_n_folds,
        thresholds=thresholds,
        converge_latent_estimates=converge_latent_estimates,
        seed=seed,
    )
    # Drop the first element and the last two, keeping the two matrices.
    return estimates[1:-2]
コード例 #19
0
    suby = np.concatenate(
        (['pristine'] * len(y_vanilla[y_vanilla == class_name]),
         ['tampered'] * len(y_augmented[y_augmented == class_name])))

    exec_time = time.time()
    clf.train(subX, suby)
    exec_time = time.time() - exec_time

    print('Done. (Training took %.3f seconds)' % (exec_time))

# Stage 2: a pair of multiclass classifiers: one for pristine images, one for tampered images
# Each entry must hold two distinct instances. Repeating a one-element list
# (``[clf] * 2``) would reference the same object twice, so training the
# second classifier would overwrite the first.
clfs = {
    'LogReg': [
        LogReg(penalty='l2',
               max_iter=2000,
               n_jobs=4,
               multi_class='ovr',
               solver='newton-cg')
        for _ in range(2)
    ]
}

for n, [c1, c2] in clfs.items():
    print('CVing stage 2 classifiers %s...' % (n), end=' ')

    # Time the training of both classifiers together.
    exec_time = time.time()
    c1.train(X_vanilla, y=y_vanilla)
    c2.train(X_augmented, y_augmented)
    exec_time = time.time() - exec_time

    print('Done. (Training took %.3f seconds)' % (exec_time))
コード例 #20
0
def estimate_py_noise_matrices_and_cv_pred_proba(
    X,
    s,
    clf=LogReg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    thresholds=None,
    converge_latent_estimates=False,
    py_method='cnt',
    seed=None,
):
    """Estimate the latent prior, both noise matrices, the confident joint
    and the cross-validated predicted probabilities in one call.

    The out-of-sample predicted probability P(s=k|x) is computed for every
    example x in X using cross validation, together with the confident
    counts within each cross-validated subset. From these the noise_matrix
    of shape (K, K) is estimated: the fraction of examples in every class
    labeled as every other class, i.e. a conditional probability matrix for
    P(s=k_s|y=k_y). Under certain conditions the estimates are exact, and
    in most conditions they are within one percent of the actual noise
    rates.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array.

    s : np.array
        Discrete vector of labels which may contain mislabeling. "s" denotes
        the noisy label (written instead of y-tilde for ASCII reasons).

    clf : sklearn.classifier or equivalent
        Classifier to use; logistic regression by default. Assumed to define
        ``predict_proba()`` and ``fit()``. The default instance is created
        once, when this function is defined.

    cv_n_folds : int
        Number of cross-validation folds used to compute out-of-sample
        probabilities for each example in X.

    thresholds : iterable (list or np.array) of shape (K, 1) or (K,)
        P(s^=k|s=k). An example whose predicted probability is greater than
        this threshold is counted as having hidden label y = k. Not used for
        pruning, only for estimating the noise rates using confident counts.
        Values should be between 0 and 1. Default is None.

    converge_latent_estimates : bool
        If true, forces numerical consistency of the estimates. Each is
        estimated independently, but they are related mathematically with
        closed form equivalences; this iteratively makes them consistent.

    py_method : str (Options: ["cnt", "eqn", "marginal", "marginal_ps"])
        How to compute the latent prior p(y=k). Default is "cnt" as it often
        works well even when the noise matrices are estimated poorly by
        using the matrix diagonals instead of all the probabilities.

    seed : int (default = None)
        Seed for the random number generator that splits the folds. If None,
        the current np.random state is used.

    Returns
    -------
        A tuple of five numpy array matrices in the form:
        (py, noise_matrix, inverse_noise_matrix,
        joint count matrix i.e. confident joint, predicted probability matrix)"""

    # Step 1: confident joint and out-of-sample predicted probabilities.
    cv_arguments = dict(
        X=X,
        s=s,
        clf=clf,
        cv_n_folds=cv_n_folds,
        thresholds=thresholds,
        seed=seed,
    )
    confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(
        **cv_arguments)

    # Step 2: latent prior and noise matrices from the confident joint.
    latent_estimates = estimate_latent(
        confident_joint=confident_joint,
        s=s,
        py_method=py_method,
        converge_latent_estimates=converge_latent_estimates,
    )
    py, noise_matrix, inv_noise_matrix = latent_estimates

    return (py, noise_matrix, inv_noise_matrix, confident_joint, psx)
コード例 #21
0
# For every class: random search over tf-idf vectorizer settings, with a
# feature-selected logistic regression evaluated on the test text.
for cl in clss:
    # Constrain text to a 50/50 true/false split.
    df = classify_data(cl)
    X_train = df['text']
    y_train = df['y']
    # Begin random search of vectorizer parameters for tf-idf.
    for iteration in range(iterations):
        # NOTE(review): vectorizer() is called again on every pass and only
        # element [iteration] of its result is used -- confirm whether the
        # call could be made once per class instead.
        vec_list = vectorizer(tokenizer=tokenizer,
                              iterations=iterations,
                              max_df_options=max_df,
                              min_df_options=min_df,
                              ngram_range_options=ngram_range)
        # X_train is rebound from the raw text column to the vectorized form.
        X_train = vec_list[iteration]['vec']
        print("Vecorization complete, now training the model")
        # Fit the initial model on all features.
        logit = LogReg(random_state=42, solver='lbfgs',
                       max_iter=1000).fit(X_train, y_train)
        # Select features using the already-fitted model (prefit=True).
        model = SelectFromModel(logit, prefit=True)
        X_new = model.transform(X_train)
        # Fit a new model on the selected features only.
        logit_new = LogReg(random_state=42, solver='lbfgs',
                           max_iter=1000).fit(X_new, y_train)
        # Transform the test text with the tf-idf fitted on the training set,
        # then apply the same feature selection.
        print("Vectorizing Test Data")
        test_vec = vec_list[iteration]['tfidf'].transform(X_test)
        test_vec_new = model.transform(test_vec)
        # Make predictions from the model with the test data.
        print("Calculating predictions")
        prediction = logit_new.predict(test_vec_new)
        # NOTE(review): roc_curve receives the output of predict() rather
        # than predict_proba()/decision_function() scores -- confirm this is
        # intended.
        fpr_rf, tpr_rf, thresh_rf = skm.roc_curve(list(adjust_test_format(cl)),
                                                  list(prediction))
コード例 #22
0
ファイル: main.py プロジェクト: julekb/LP2
    all_max_features = [100, 500, 1000, 2000, 4000]

    for max_features in all_max_features:

        print('Computing for {} max features'.format(max_features))

        vectorizers = get_ngram_vectorizers(X_train, max_features)
        X_train_ngrams = vectorize(vectorizers, X_train)
        X_test_ngrams = vectorize(vectorizers, X_test)

        vectorizers = get_ngram_vectorizers(X_train_l, max_features)
        X_train_ngrams_l = vectorize(vectorizers, X_train_l)
        X_test_ngrams_l = vectorize(vectorizers, X_test_l)


        models = [LogReg(random_state=43), RF(random_state=44),
            Perc(shuffle=True, random_state=45), SVC(random_state=46)]
        model_names = ['Logistic Regression', 'Random Forest', 'Perceptron', 'Linear SVM']

        for model, model_name in zip(models, model_names):
            print('Computing: {}'.format(model_name))
            for i in range(3):
                print('no lemmatization')
                model.fit(X_train_ngrams[i], y_train)
                score = model.score(X_test_ngrams[i], y_test)
                results[model_name + '_nl_' + str(max_features) + ['uni', 'bi', 'unibi'][i]] = score
                print('Accuracy for test set: ', score)

                print('with lematization')
                model.fit(X_train_ngrams_l[i], y_train)
                score = model.score(X_test_ngrams_l[i], y_test)
コード例 #23
0
 def __init__(self, config=None, class_min=0):
     """Set up a logistic regression classifier together with its scaler.

     ``config`` and ``class_min`` are first passed through
     ``_resolve_config`` and ``_resolve_class_min``; the resolved config is
     then expanded into the keyword arguments of the model.
     """
     resolved_config = self._resolve_config(config)
     self.config = resolved_config
     resolved_class_min = self._resolve_class_min(class_min)
     self.class_min = resolved_class_min
     self.model = LogReg(**self.config)
     self.scaler = StandardScaler()
コード例 #24
0
import sklearn.preprocessing
from sklearn.linear_model import LogisticRegression as LogReg

#%% Logistic regression
# Run the two to_pickle lines only when X and y have been changed and need
# to be saved again; they require X and y to exist already.

X.to_pickle(os.path.join(savedDataFld, 'trainPredictorsDummies.pkl'))
y.to_pickle(os.path.join(savedDataFld, 'trainOutcome.pkl'))

# Reload the saved predictors/outcome, the second (evaluation) split and the
# predictors stored as 'actualPredictorsDummies.pkl'.
X = pd.read_pickle(os.path.join(savedDataFld, 'trainPredictorsDummies.pkl'))
y = pd.read_pickle(os.path.join(savedDataFld, 'trainOutcome.pkl'))
train2_X = pd.read_pickle(
    os.path.join(savedDataFld, 'testPredictorsDummies.pkl'))
train2_y = pd.read_pickle(os.path.join(savedDataFld, 'testOutcome.pkl'))
X_test = pd.read_pickle(
    os.path.join(savedDataFld, 'actualPredictorsDummies.pkl'))

#%% fit the model on 70% and test on 30%
logRegM = LogReg(random_state=0, solver='lbfgs', max_iter=1000).fit(X, y)
train2_z = logRegM.predict(train2_X)

# Compare the predictions with the true outcomes; the bare string below
# records the output obtained from a previous run.
confusionMatrixInfo(train2_z, train2_y)
"""
{'confusionMatrix': array([[135,  31],
        [ 28,  73]]),
 'accuracy': 0.7790262172284644,
 'no information rate': 0.6217228464419475,
 'sensitivity': 0.8132530120481928,
 'specificity': 0.7227722772277227}
"""
コード例 #25
0
def test_age_identification():
    """Cross-validate several classifiers on the age-identification task.

    Loads the users found under the hard-coded PAN15 English training-data
    path, adds the feature columns produced by the ``features`` helpers,
    min-max scales the five columns listed in ``feature_names`` and prints
    the mean 10-fold cross-validation score of every classifier when
    predicting the label-encoded age group.

    The function returns nothing; results are only printed.
    """
    path = 'pan15-author-profiling-training-dataset-2015-03-02\\pan15-author-profiling-training-dataset-english-2015-03-02\\'

    users = preproc.load_users(path)
    users_dict = preproc.load_users_dict(path)
    truth = preproc.get_users_truth(path)

    # add features
    print('Creating features')
    df = preproc.create_users_dataframe(path)
    # truth[user_id] is indexed as (gender, age): 'M' maps to 0, anything
    # else to 1.
    df['label'] = df['user_id'].map(
        lambda user_id: 0 if truth[user_id][0] == 'M' else 1)
    df['age_label'] = df['user_id'].map(lambda user_id: truth[user_id][1])
    le = preprocessing.LabelEncoder()
    df['age_label'] = le.fit_transform(df['age_label'])
    df = features.add_self_references_count(df, users)
    df = features.add_positive_words(
        df, users_dict, preproc.load_words('resources\\positive-words.txt'))
    df = features.add_negative_words(
        df, users_dict, preproc.load_words('resources\\negative-words.txt'))
    df = features.add_articles(df, users_dict)
    df = features.add_url_count(df, users_dict)
    df = features.add_long_words(df, users)

    feature_names = [
        'self_ref_count', 'articles', 'pos_words', 'neg_words', 'url_count'
    ]

    # normalize features
    print('Normalizing features')
    # MinMaxScaler rescales every column independently, so one 2-D call is
    # equivalent to scaling the columns one by one. The result is assigned
    # back explicitly: relying on copy=False to mutate the frame in place
    # silently does nothing whenever the column has to be cast to float.
    df[feature_names] = MinMaxScaler().fit_transform(df[feature_names])

    long_words = [
        'username', 'people', 'nowplaying', 'really', 'should', 'others',
        'thanks', 'twitter', 'always', 'google', 'things', 'better', 'tumblr',
        'school', 'because', 'someone', 'facebook', 'frzhtmoge7', 'please',
        'something'
    ]
    # The long-word columns are used as features but are not rescaled.
    all_features = feature_names + long_words

    # initialize classifiers
    # NOTE(review): the 'linear SVM' entry uses svm.SVC() with its default
    # kernel; confirm whether kernel='linear' (or another label) was intended.
    clfs = {
        'logistic regression': LogReg(),
        'linear SVM': svm.SVC(),
        'GaussianNB': GaussianNB(),
        'random forest': RandomForestClassifier()
    }
    for name, clf in clfs.items():
        scores = cross_validation.cross_val_score(clf,
                                                  df[all_features],
                                                  df['age_label'],
                                                  cv=10)
        print(name + ' : ' + str(scores.mean()))
コード例 #26
0
ファイル: IMDB_small_dataframe.py プロジェクト: Leggera/svn
    train_scores = (classification_report(
        train_labels, train_prediction)).split(
            '\n')  #precision, recall and F-score on train data
    train_score = ' '.join(
        train_scores[0].lstrip().split(' ')[:-1]) + '\n' + ' '.join(
            train_scores[-2].split(' ')[3:-1])
    return 'test %.3f train %.3f' % (
        test_accuracy, train_accuracy
    ) + '\n' + 'train: ' + train_score + '\n' + 'test: ' + test_score, k[:-1]


if __name__ == "__main__":
    classifiers_dict = dict()
    search_parameters = dict()
    default_parameters = dict()
    classifiers_dict['LogReg'] = LogReg()
    classifiers_dict['LinearSVC'] = LinearSVC()
    search_parameters['LogReg'] = {'C': (3 * 10**-3, 3 * 10**-2, 3 * 10**-1)}
    search_parameters['LinearSVC'] = {
        'C': (3 * 10**-3, 3 * 10**-2, 3 * 10**-1)
    }

    d0 = ['implementation', 'epoch']
    columns = [
        'cbow', 'size', 'alpha', 'window', 'negative', 'sample', 'min_count'
    ]
    best_params = ['best_parametersLogReg', 'best_parametersLinearSVC']
    classifiers = ['LogReg', 'LinearSVC']
    diag_dir = sys.argv[4]
    epoch = int(sys.argv[5])
    if (epoch == 0):
コード例 #27
0
def _model_param(model_name, key, default=None):
    """Return the whitespace-separated token that follows *key* in *model_name*.

    The lookup starts at the first occurrence of *key*; *default* is returned
    when *key* does not occur in the name at all.
    """
    start = model_name.find(key)
    if start == -1:
        return default
    return model_name[start:].split()[1]


def main(space_dir, classifier, C=None):
    """Evaluate concatenated PV-DBOW + PV-DM document vectors.

    Every saved PV-DBOW model in *space_dir* (a ``.txt`` file whose name
    contains ``'cbow 0'``) is paired with each PV-DM model (``'cbow 1'``)
    whose ``size``/``window``/``negative``/``min_count`` settings, as parsed
    from the file names, match. For each pair the train and test vectors
    are concatenated feature-wise and passed to ``Classification``. The
    outcome is written as a one-row ``Res_concat_IMDB<classifier>.csv``
    inside a fresh directory under ``./runs_IMDB/`` and echoed to stdout.

    Side effects: updates the module-level ``default_parameters``,
    ``classifiers_dict`` and (when *C* is None) ``search_parameters`` dicts.

    Args:
        space_dir: Directory with the saved models. It is joined to file
            names by plain string concatenation, so it has to end with a
            path separator.
        classifier: Name forwarded to ``Classification``; also used as the
            result column and as part of the CSV file name.
        C: Optional regularisation value. When given, both classifiers are
            created with it; otherwise parameter grids are prepared for a
            grid search.
    """
    # future DataFrame fields
    d0 = ['implementation']
    parameters = ['size', 'window', 'negative', 'min_count']
    columns = [
        'size', 'alpha0', 'alpha1', 'window', 'negative', 'cbow0_sample',
        'cbow1_sample', 'min_count', 'iter0', 'iter1'
    ]
    best_params = ['best_parameters']
    classifiers = ['LogReg', 'LinearSVC']
    frame_columns = d0 + columns + classifiers + best_params

    # default parameters from the article
    default_parameters['size'] = 150
    default_parameters['alpha'] = 0.05
    default_parameters['window'] = 10
    default_parameters['negative'] = 25
    default_parameters['min_count'] = 1
    default_parameters['iter'] = '_wtf_'

    if C is not None:
        # C was given as an input value: initialize the classifiers with it
        classifiers_dict['LogReg'] = LogReg(C=C)
        classifiers_dict['LinearSVC'] = LinearSVC(C=C)
    else:
        # otherwise prepare for the grid search
        c_grid = (10**-5, 3 * 10**-5, 10**-4, 3 * 10**-4, 10**-3, 3 * 10**-3,
                  10**-2, 3 * 10**-2, 10**-1, 3 * 10**-1, 1)
        iter_grid = (200, 400, 1000, 2000)
        classifiers_dict['LogReg'] = LogReg()
        classifiers_dict['LinearSVC'] = LinearSVC()
        search_parameters['LogReg'] = {'C': c_grid, 'max_iter': iter_grid}
        search_parameters['LinearSVC'] = {'C': c_grid, 'max_iter': iter_grid}

    for model in os.listdir(space_dir):
        # only saved PV-DBOW models start a pair
        if not model.endswith('.txt') or 'cbow 0' not in model:
            continue

        string = model.split(".txt")[0]  # name of the PV-DBOW model
        implementation = string.split()[0]  # doc2vec or word2vec

        # One "<parameter> <value>" entry per compared parameter; "-1" marks
        # a parameter that is absent from the name (i.e. left at default).
        par_list = [
            column + ' ' + _model_param(string, column, '-1')
            for column in parameters
        ]

        for other_model in os.listdir(space_dir):
            # only saved PV-DM models can complete the pair
            if (not other_model.endswith('.txt')
                    or 'cbow 1' not in other_model):
                continue
            other_model = other_model.split(".txt")[0]  # PV-DM model name

            # The PV-DM model is compatible only if every compared parameter
            # is spelled out identically (or is absent) in both names.
            compatible = all(
                (column + ' ' + _model_param(other_model, column, '-1'))
                in par_list for column in parameters)
            if not compatible:
                continue

            index = 1  # only one row in the DataFrame
            df = pd.DataFrame(columns=frame_columns)

            # putting parameters into the DataFrame
            for column in parameters:
                df.at[index, column] = _model_param(
                    string, column, default_parameters[column])

            cbow0_sample = _model_param(string, 'sample', '1e-2')
            df.at[index, 'cbow0_sample'] = cbow0_sample
            df.at[index, 'implementation'] = implementation
            df.at[index, 'cbow1_sample'] = _model_param(
                other_model, 'sample', '1e-4')
            df.at[index, 'alpha0'] = _model_param(string, 'alpha', '0.05')
            df.at[index, 'alpha1'] = _model_param(other_model, 'alpha',
                                                  '0.05')
            df.at[index, 'iter0'] = _model_param(string, 'iter', 'wtf')
            df.at[index, 'iter1'] = _model_param(other_model, 'iter', 'wtf')

            # load train and test vectors from the PV-DBOW model
            try:
                train_dbow, test_dbow = DocumentVectors(
                    space_dir + model, implementation)
            except Exception:
                # best effort: report the offending model and move on
                print(model)
                traceback.print_exc(file=sys.stdout)
                continue

            # load train and test vectors from the PV-DM model
            try:
                train_dm, test_dm = DocumentVectors(
                    space_dir + other_model + '.txt', implementation)
            except Exception:
                print(other_model)
                traceback.print_exc(file=sys.stdout)
                continue

            # concatenate the PV-DBOW and PV-DM vectors feature-wise
            DocumentVectors0 = np.concatenate((train_dbow, train_dm), axis=1)
            DocumentVectors1 = np.concatenate((test_dbow, test_dm), axis=1)

            d = 50
            y_1 = [1] * 500
            y_0 = [0] * 500
            train_labels = y_1[:-d] + y_0[d:]
            test_labels = y_1 + y_0

            # name the directory after the model parameters and start time
            dir_name = (other_model + 'sample ' + cbow0_sample).replace(
                ' ', '_').replace('-', '')
            run_dir = './runs_IMDB/%s-%s/' % (dir_name, time_str())
            os.makedirs(run_dir, exist_ok=True)

            # accuracy report and best parameters (if C was given it is
            # reported as the best parameter)
            accuracy, best = Classification(classifier, C, DocumentVectors0,
                                            train_labels, DocumentVectors1,
                                            test_labels)

            # write it all into the DataFrame ...
            df.at[index, classifier] = accuracy
            df.at[index, 'best_parameters'] = best
            df.to_csv(run_dir + "Res_concat_IMDB" + classifier + ".csv")

            # ... and to the output
            print(other_model)
            print(model)
            print(accuracy)
コード例 #28
0
ファイル: main.py プロジェクト: LRAV/clases_datamex0120
# initial defaults
EMBARKED = 'Southampton'
FARE = 33
AGE = 30
GENDER = 'Female'
TITLE = 'Mrs.'
CLASS = 'Second'
CABIN = 'C'
SIBSP = 0
PARCH = 0

# initialize variables
# tasa_media ("mean rate") starts at 0 and is reassigned in startup().
tasa_media = 0

# model
# Created unfitted at import time; startup() declares it global as well.
logreg = LogReg()

# flask app
app = Flask(__name__)


# antes del primer request...
@app.before_first_request
def startup():
    global tasa_media, logreg

    data = genfromtxt(PATH + '/data/titanic.csv', delimiter=',')

    tasa_media = (np.mean([e[0] for e in data]) * 100)

    X_train, X_test, y_train, y_test = tts([e[1:] for e in data],
コード例 #29
0
        'intercept_scaling': (1, 2, 3),
        'max_iter': (100, 200, 400, 800, 1000),
        'multi_class': ('ovr', 'multinomial')
    }
    #search_parameters['SklearnMLP'] = {'solver' : ('lbfgs', 'sgd', 'adam')}#TODO
    search_parameters['SklearnLinearSVC'] = {
        'loss': ('hinge', 'squared_hinge'),
        'penalty': ('l1', 'l2'),
        'dual': (False, True),
        'fit_intercept': (True, False),
        'intercept_scaling': (1, 2, 3),
        'max_iter': (100, 200, 400, 800, 1000),
        'multi_class': ('ovr', 'crammer_singer')
    }

    classifiers_dict['SklearnLogReg'] = LogReg()
    #classifiers_dict['SklearnMLP'] = MLPClassifier(hidden_layer_sizes = (50, 50), max_iter=1000)
    classifiers_dict['SklearnLinearSVC'] = LinearSVC()

    newsgroups_train = fetch_20newsgroups(subset='train',
                                          remove=('headers', 'footers',
                                                  'quotes'))
    newsgroups_test = fetch_20newsgroups(subset='test',
                                         remove=('headers', 'footers',
                                                 'quotes'))

    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    DocumentVectors0 = vectorizer.fit_transform(newsgroups_train.data)

    DocumentVectors1 = vectorizer.transform(newsgroups_test.data)
コード例 #30
0
def estimate_confident_joint_and_cv_pred_proba(
    X,
    s,
    clf=LogReg(multi_class='auto', solver='lbfgs'),
    cv_n_folds=5,
    thresholds=None,
    seed=None,
    calibrate=True,
):
    r"""Estimates P(s,y), the confident counts of the latent
    joint distribution of true and noisy labels
    using observed s and predicted probabilities psx.

    The output of this function is a numpy array of shape (K, K).

    Under certain conditions, estimates are exact, and in many
    conditions, estimates are within one percent of actual.

    Notes: There are two ways to compute the confident joint with pros/cons.
    1. For each holdout set, we compute the confident joint, then sum them up.
    2. Compute pred_proba for each fold, combine, compute the confident joint.
    (1) is more accurate because it correctly computes thresholds for each fold
    (2) is more accurate when you have only a little data because it computes
    the confident joint using all the probabilities. For example if you had 100
    examples, with 5-fold cross validation + uniform p(y) you would only have 20
    examples to compute each confident joint for (1). Such small amounts of data
    is bound to result in estimation errors. For this reason, we implement (2),
    but we implement (1) as a commented out function at the end of this file.

    Parameters
    ----------
    X : np.array
      Input feature matrix (N, D), 2D numpy array

    s : np.array
        A discrete vector of labels, s, which may contain mislabeling. "s"
        denotes the noisy label instead of \tilde(y), for ASCII reasons.

    clf : sklearn.classifier or equivalent
      Default classifier used is logistic regression. Assumes clf
      has predict_proba() and fit() defined. The object passed in is never
      fitted itself; a deep copy is fitted for every fold.

    cv_n_folds : int
      The number of cross-validation folds used to compute
      out-of-sample probabilities for each example in X.

    thresholds : iterable (list or np.array) of shape (K, 1)  or (K,)
      P(s^=k|s=k). If an example has a predicted probability "greater" than
      this threshold, it is counted as having hidden label y = k. This is
      not used for pruning, only for estimating the noise rates using
      confident counts. This value should be between 0 and 1. Default is None.

    seed : int (default = None)
        Set the default state of the random number generator used to split
        the cross-validated folds. If None, uses np.random current random state.

    calibrate : bool (default: True)
        Calibrates confident joint estimate P(s=i, y=j) such that
        np.sum(cj) == len(s) and np.sum(cj, axis = 1) == np.bincount(s).

    Returns
    -------
      Returns a tuple of two numpy array matrices in the form:
      (joint counts matrix, predicted probability matrix)"""

    assert_inputs_are_valid(X, s)
    # Number of classes
    K = len(np.unique(s))

    # Ensure labels are of type np.array()
    s = np.asarray(s)

    # Create cross-validation object for out-of-sample predicted probabilities.
    # CV folds preserve the fraction of noisy positive and
    # noisy negative examples in each class.
    kf = StratifiedKFold(n_splits=cv_n_folds, shuffle=True, random_state=seed)

    # Initialize psx array: one row per example, one column per class.
    psx = np.zeros((len(s), K))

    # Split X and s into "cv_n_folds" stratified folds.
    for cv_train_idx, cv_holdout_idx in kf.split(X, s):
        # Work on a copy so the caller's (or the shared default) classifier
        # is left untouched and every fold starts from the same state.
        clf_copy = copy.deepcopy(clf)

        # Select the training and holdout cross-validated sets.
        X_train_cv, X_holdout_cv = X[cv_train_idx], X[cv_holdout_idx]
        s_train_cv = s[cv_train_idx]

        # Fit the clf classifier to the training set and
        # predict on the holdout set and update psx.
        clf_copy.fit(X_train_cv, s_train_cv)
        psx[cv_holdout_idx] = clf_copy.predict_proba(X_holdout_cv)  # P(s = k|x)

    # Compute the confident counts, a K x K matrix for all pairs of labels.
    confident_joint = compute_confident_joint(
        s=s,
        psx=psx,  # P(s = k|x)
        thresholds=thresholds,
        calibrate=calibrate,
    )

    return confident_joint, psx