    plt.bar(xaxis, barplotF1)
    plt.show()
    print("---------------------------------------------------------------------------------------------------------------------", end = '\n')
    print("---------------------------------------------------------------------------------------------------------------------", end = '\n\n')

# get_precision_recall_fscore_support
def get_PRFS(method, test_labels, pred, label_names):
    none = precision_recall_fscore_support(test_labels, pred, average=None)
    micro = precision_recall_fscore_support(test_labels, pred, average='micro')
    macro = precision_recall_fscore_support(test_labels, pred, average='macro')
    weighted = precision_recall_fscore_support(test_labels, pred, average='weighted')
    return (none, micro, macro, weighted)
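
# A minimal usage sketch for get_PRFS (assumes test_labels, pred and
# label_names exist as elsewhere in this script; note that `method` and
# `label_names` are currently unused inside the helper):
none, micro, macro, weighted = get_PRFS('LinearSVC', test_labels, pred, label_names)
per_class_p, per_class_r, per_class_f1, support = none
for name, p, r, f, s in zip(label_names, per_class_p, per_class_r, per_class_f1, support):
    print(f"{name}: precision={p:.2f} recall={r:.2f} f1={f:.2f} support={s}")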

"""### Dummy Classifier"""

dc_uniform = DummyClassifier(strategy="uniform")
dc_constant_1 = DummyClassifier(strategy="constant", constant=1)
dc_constant_2 = DummyClassifier(strategy="constant", constant=2)
dc_most_frequent = DummyClassifier(strategy="most_frequent")
dc_stratified = DummyClassifier(strategy="stratified")

# the fit method "trains" the classifier on the training set (the features and their labels)
from sklearn.metrics import accuracy_score

# the predict method produces predictions for the test data (the features only as input)
predictions = {}
lsvt_accuracy = {}

model = dc_uniform.fit(train, train_labels)
preds = dc_uniform.predict(test)
predictions['dc_uniform'] = preds
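
# The remaining baselines follow the same fit/predict pattern; a short sketch
# that also fills the (so far unused) lsvt_accuracy dict. Assumes test_labels
# holds the ground-truth test labels and that labels 1 and 2 occur in
# train_labels (required by the constant baselines):
for name, clf in [('dc_uniform', dc_uniform), ('dc_constant_1', dc_constant_1),
                  ('dc_constant_2', dc_constant_2),
                  ('dc_most_frequent', dc_most_frequent),
                  ('dc_stratified', dc_stratified)]:
    clf.fit(train, train_labels)
    predictions[name] = clf.predict(test)
    lsvt_accuracy[name] = accuracy_score(test_labels, predictions[name])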
Example #2
x = dados_combinados.select_dtypes('float64')
y = dados_combinados['ativo_moa']
x_treino, x_teste, y_treino, y_teste = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=376,
                                                        stratify=y)
modelo_r_logistica = LogisticRegression(max_iter=1000)
modelo_r_logistica.fit(x_treino, y_treino)
modelo_r_logistica.score(x_teste, y_teste)

from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

modelo_dummy = DummyClassifier(strategy='most_frequent')
modelo_dummy.fit(x_treino, y_treino)
previsao_dummy = modelo_dummy.predict(x_teste)
accuracy_score(y_teste, previsao_dummy)

from sklearn.tree import DecisionTreeClassifier

x = dados_combinados.select_dtypes('float64')
y = dados_combinados['ativo_moa']
x_treino, x_teste, y_treino, y_teste = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=376,
                                                        stratify=y)
modelo_arvore = DecisionTreeClassifier(max_depth=3)
modelo_arvore.fit(x_treino, y_treino)
Example #3
print('New binary labels:\t', y_binary_imbalanced[1:30])
np.bincount(y_binary_imbalanced)    # Negative class (0) is the most frequent class
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)
# Accuracy of Support Vector Machine classifier
from sklearn.svm import SVC
svm = SVC(kernel='rbf', C=1).fit(X_train, y_train)
svm.score(X_test, y_test)
Dummy Classifiers
DummyClassifier is a classifier that makes predictions using simple rules, which can be useful as a baseline for comparison against actual classifiers, especially with imbalanced classes.
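
For reference, a quick sketch of the main built-in strategies on toy data ('stratified' and 'uniform' draw random predictions):

from sklearn.dummy import DummyClassifier

X_toy = [[0]] * 6
y_toy = [0, 0, 0, 0, 1, 1]
for strategy in ['most_frequent', 'prior', 'stratified', 'uniform']:
    baseline = DummyClassifier(strategy=strategy).fit(X_toy, y_toy)
    print(strategy, baseline.predict(X_toy))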

from sklearn.dummy import DummyClassifier
# Negative class (0) is most frequent
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
# Therefore the dummy 'most_frequent' classifier always predicts class 0
y_dummy_predictions = dummy_majority.predict(X_test)
y_dummy_predictions
dummy_majority.score(X_test, y_test)
svm = SVC(kernel='linear', C=1).fit(X_train, y_train)
svm.score(X_test, y_test)
Confusion matrices
Binary (two-class) confusion matrix
from sklearn.metrics import confusion_matrix
# Negative class (0) is most frequent
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
y_majority_predicted = dummy_majority.predict(X_test)
confusion = confusion_matrix(y_test, y_majority_predicted)
Example #4
Output: 479 lines of dialogue. That's a bit better. Let's take Randy instead of Kenny.
'''
characters = f[f['Character'].isin(['Cartman', 'Kyle', 'Stan', 'Randy'])]
characters = characters.dropna()
characters = characters.reset_index(drop=True)
'''Split the sample into a training and a test set: '''
X_train, X_test, y_train, y_test = train_test_split(characters['Line'],
                                                    characters['Character'],
                                                    test_size=0.2)

countvec = CountVectorizer(tokenizer=word_tokenize, stop_words="english")
bowed_train = countvec.fit_transform(X_train)
bowed_test = countvec.transform(X_test)
'''As a baseline classifier we use DummyClassifier:  '''
dummy = DummyClassifier()
dummy.fit(bowed_train, y_train)
pred = dummy.predict(bowed_test)
'''
#print(classification_report(y_test, pred)) 

             precision    recall  f1-score   support

    Cartman       0.34      0.34      0.34       322
       Kyle       0.24      0.25      0.24       232
      Randy       0.05      0.04      0.04       118
       Stan       0.29      0.29      0.29       269

avg / total       0.26      0.26      0.26       941
The results are quite poor. Let's try other classifiers.
Naive Bayes:
'''
Example #5
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, classification_report, precision_recall_curve, \
    average_precision_score, roc_curve, roc_auc_score
import numpy as np
import matplotlib.pyplot as plt

digits = load_digits()
y = digits.target == 9
X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                    y,
                                                    random_state=0)
# imbalanced data set
Dummy_majority = DummyClassifier(strategy='most_frequent').fit(
    X_train, y_train)
print('Score, most frequent: {}'.format(Dummy_majority.score(X_test, y_test)))
tree = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
print('score, decision tree: {}'.format(tree.score(X_test, y_test)))
Dummy = DummyClassifier().fit(X_train, y_train)
print('Score, dummy: {}'.format(Dummy.score(X_test, y_test)))
lr = LogisticRegression(C=0.1).fit(X_train, y_train)
print('Score, logistic regression: {}'.format(lr.score(X_test, y_test)))
# confusion matrix
pred_lr = lr.predict(X_test)
confusion = confusion_matrix(y_test, pred_lr)
print('Confusion Matrix for LogisticRegression:\n{}'.format(confusion))
print('F1-score for LogisticRegression: {}'.format(f1_score(y_test, pred_lr)))
# for multiclass f1-score can be used with average='micro','macro' or 'weighted'
print('Report:\n{}'.format(classification_report(y_test, pred_lr)))
# higher threshold higher precision, lower recall
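# To make the threshold comment above concrete, a minimal sketch (not part of
# the original) using the precision_recall_curve import with the fitted lr:
precision, recall, thresholds = precision_recall_curve(
    y_test, lr.decision_function(X_test))
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()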
Example #6
    X = X / 255

    ## Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 60000
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    return X_train, X_test, y_train, y_test


ESTIMATORS = {
    "dummy": DummyClassifier(),
    'CART': DecisionTreeClassifier(),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=100),
    'RandomForest': RandomForestClassifier(n_estimators=100),
    'Nystroem-SVM': make_pipeline(Nystroem(gamma=0.015, n_components=1000),
                                  LinearSVC(C=100)),
    'SampledRBF-SVM': make_pipeline(RBFSampler(gamma=0.015, n_components=1000),
                                    LinearSVC(C=100)),
    'LogisticRegression-SAG': LogisticRegression(solver='sag', tol=1e-1, C=1e4),
    # The original entry was truncated after hidden_layer_sizes; closed minimally:
    'MultilayerPerceptron': MLPClassifier(hidden_layer_sizes=(100, 100)),
}
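
# With the dict closed, such benchmark scripts typically run a plain
# fit/score loop (sketch; assumes X_train/X_test/y_train/y_test come from the
# loader above):
for name, estimator in ESTIMATORS.items():
    estimator.fit(X_train, y_train)
    print(name, estimator.score(X_test, y_test))
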
Example #7
def run_learning_algorithm():
    print('Assembling input file paths...')
    processed_data_path = os.path.join(os.path.curdir, 'data', 'processed')
    train_file_path = os.path.join(processed_data_path, 'train.csv')
    test_file_path = os.path.join(processed_data_path, 'test.csv')
    
    print('Reading in from input csv...')
    train_df = pd.read_csv(train_file_path, index_col='PassengerId')
    test_df = pd.read_csv(test_file_path, index_col='PassengerId')
    
    print('Processed data read complete: ')
    print(train_df.info() , '\n')
    print(test_df.info() , '\n')
    
    print('Converting training set into matrices and extracting preliminary test set...')
    X = train_df.loc[:,'Age':].values.astype(float)
    y = train_df['Survived'].ravel()
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=0)
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    
    print('\n\nCreating baseline model...')
    model_dummy = DummyClassifier(strategy='most_frequent', random_state=0)
    model_dummy.fit(X_train, y_train)
    print_scores('Baseline', model_dummy, X_test, y_test)
    
    print('Writing baseline results...')
    test_X_mat = test_df.values.astype('float')
    test_X_mat = np.delete(test_X_mat, 0, 1)
    base_predictions = model_dummy.predict(test_X_mat)
    write_kaggle_submission('dummy', test_df.index, base_predictions)
    
    print('\n\nCreating logistic regression model 1...')
    model_lr1 = LogisticRegression(random_state=0)
    model_lr1.fit(X_train, y_train)
    print_scores('Logistic Regression', model_lr1, X_test, y_test)
    
    print('Writing regression results..')
    lr1_predictions = model_lr1.predict(test_X_mat)
    write_kaggle_submission('regression1', test_df.index, lr1_predictions)
    
    print('\n\nCreating logistic regression model 2...')
    model_lr2_init = LogisticRegression(solver='liblinear', random_state=0)  # liblinear supports both l1 and l2
    lr2_param = {'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']}
    model_lr2 = GridSearchCV(model_lr2_init, param_grid = lr2_param, cv=3)
    model_lr2.fit(X_train, y_train)
    print_scores('Logistic Regression 2', model_lr2, X_test, y_test)
    
    print('Writing regression 2 results..')
    lr2_predictions = model_lr2.predict(test_X_mat)
    write_kaggle_submission('regression2', test_df.index, lr2_predictions)
    
    print('\n\nCreating scaled regression model...')
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model_lr_scaled_init = LogisticRegression(solver='liblinear', random_state=0)
    lr_scale_param = {'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']}
    model_lr_scaled = GridSearchCV(model_lr_scaled_init, param_grid=lr_scale_param, cv=3)
    model_lr_scaled.fit(X_train_scaled, y_train)
    print_scores('Scaled Logistic Regression', model_lr_scaled, X_test_scaled, y_test)
    
    print('Writing scaled regression results..')
    test_X_scaled_mat = scaler.transform(test_X_mat)
    lr_scaled_predictions = model_lr_scaled.predict(test_X_scaled_mat)
    write_kaggle_submission('regression_scaled_minmax', test_df.index, lr_scaled_predictions)
Example #8
        svc_pipeline.fit(x_train, np.array(y_train).ravel())
        fpr, tpr, _ = roc_curve(y_test, svc_pipeline.decision_function(x_test))
        roc_auc = auc(fpr, tpr)
        pyplot.plot(fpr, tpr, color="green", label='SVC AUC = %0.8f' % roc_auc)

        logistic_pipeline.fit(x_train, np.array(y_train).ravel())
        fpr, tpr, _ = roc_curve(y_test,
                                logistic_pipeline.decision_function(x_test))
        roc_auc = auc(fpr, tpr)
        pyplot.plot(fpr,
                    tpr,
                    color="orange",
                    label='Logistic Regression AUC = %0.8f' % roc_auc)

        baseline_pipeline = make_pipeline(
            StandardScaler(), DummyClassifier(strategy="most_frequent"))
        baseline_pipeline.fit(x_train, np.array(y_train).ravel())
        fpr, tpr, _ = roc_curve(y_test,
                                baseline_pipeline.predict_proba(x_test)[:, 1])
        roc_auc = auc(fpr, tpr)
        pyplot.plot(fpr,
                    tpr,
                    color="red",
                    label='Baseline AUC = %0.8f' % roc_auc)

        pyplot.legend(loc='lower right')
        pyplot.show()

        best_pipeline = logistic_pipeline  # make confusion matrix for logistic regression model
        plot_confusion_matrix(best_pipeline, test_x_input_features,
                              test_y_output_data)
Example #9
                 'direction': 'Coefficient direction',
                 'variable': ''
             })
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()
# -

# ## Explore evaluation metrics

# ### Compare to always classifying as the dominant class

# The data is not so imbalanced since I removed the duplicates
# +
cancer_df['Class'].value_counts()

dummy_majority_classifier = DummyClassifier(strategy='most_frequent')
dummy_majority_fit = dummy_majority_classifier.fit(X_train, y_train)
print(
    'Just classifying everything as malignant gets you {:.3f} accuracy'.format(
        dummy_majority_fit.score(X_test, y_test)))
# -

# ### Confusion matrix on best model

# +
confusion_matrix(y_test, best_classifier.predict(X_test))
# -

# - Precision = proportion of positive calls that are correct; good for optimizing on a low FP rate
# - Recall = proportion of all ground-truth positives that are called correctly; good for optimizing on a low FN rate
# - F-score = harmonic mean of precision and recall (F1-score); see the toy example below
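# A toy worked example of the three definitions above (hypothetical counts,
# not from this dataset):

# +
TN, FP, FN, TP = 85, 5, 10, 20  # confusion matrix laid out as [[TN, FP], [FN, TP]]
precision = TP / (TP + FP)                          # 20/25 = 0.80
recall = TP / (TP + FN)                             # 20/30 ~= 0.67
f1 = 2 * precision * recall / (precision + recall)  # ~0.73
print(precision, recall, f1)
# -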
Example #10
def run(input_sen, output, testloaders, logit_df, seed=42):
    ## META VARIABLES
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    epochs = 20
    batch_size = 32

    # Train-Val-Test Split
    X_train, X_test, y_train, y_test, _, test_idx = train_test_split(
        input_sen, output, np.arange(len(input_sen)), test_size=0.2)
    y_majority = max(Counter(y_test).values()) / len(y_test)
    assert ((input_sen[test_idx] == X_test).all())
    assert ((output[test_idx] == y_test).all())
    X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                      y_train,
                                                      test_size=0.125)
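    # test_size=0.125 of the remaining 80% gives a 70/10/20 train/val/test split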

    ros = RandomOverSampler()
    X_train, y_train = ros.fit_resample(X_train.reshape(-1, 1), y_train)

    # Tokenization & Dataloading
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

    #Train
    encoding = tokenizer(list(map(lambda x: x[0], X_train)),
                         return_tensors='pt',
                         padding=True,
                         truncation=True,
                         max_length=128).to(device)
    input_ids = encoding['input_ids']
    attention_masks = encoding['attention_mask']
    labels = torch.tensor(y_train).unsqueeze(1).to(device)
    train_dataset = TensorDataset(input_ids, attention_masks, labels)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=batch_size)

    #Val
    encoding = tokenizer(list(X_val),
                         return_tensors='pt',
                         padding=True,
                         truncation=True,
                         max_length=128).to(device)
    input_ids = encoding['input_ids']
    attention_masks = encoding['attention_mask']
    labels = torch.tensor(y_val).unsqueeze(1).to(device)
    val_dataset = TensorDataset(input_ids, attention_masks, labels)
    validation_dataloader = DataLoader(val_dataset,
                                       sampler=RandomSampler(val_dataset),
                                       batch_size=batch_size)

    #Test
    encoding = tokenizer(list(X_test),
                         return_tensors='pt',
                         padding=True,
                         truncation=True,
                         max_length=128).to(device)
    input_ids = encoding['input_ids']
    attention_masks = encoding['attention_mask']
    labels = torch.tensor(y_test).unsqueeze(1).to(device)
    test_dataset = TensorDataset(input_ids, attention_masks, labels)
    prediction_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=prediction_sampler,
                                 batch_size=batch_size)

    model = AlbertForSequenceClassification.from_pretrained(
        'albert-base-v2', num_labels=3).to(device)
    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()),
                      lr=2e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_dataloader) * epochs)

    total_t0 = time.time()
    best_state_dict = None
    best_val = 0
    for epoch_i in range(epochs):

        # ========================================
        #               Training
        # ========================================

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()
        total_train_loss = 0

        model.train()

        predictions_train = np.array([])
        true_label_train = np.array([])
        for step, batch in enumerate(train_dataloader):

            # Progress update every 50 batches.
            if step % 50 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()
            loss, logits = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 labels=b_labels)

            total_train_loss += loss.item() * b_labels.shape[0]
            loss.backward()
            optimizer.step()
            scheduler.step()

            gc.collect()

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            predictions_train = np.append(predictions_train,
                                          np.argmax(logits, axis=1).flatten())
            true_label_train = np.append(true_label_train, label_ids)

        # Compute accuracy and F1 over all of the training batches.
        accuracy_train = np.sum(
            predictions_train == true_label_train) / true_label_train.shape[0]
        f1_macro_train = f1_score(true_label_train,
                                  predictions_train,
                                  average='macro')
        f1_micro_train = f1_score(true_label_train,
                                  predictions_train,
                                  average='micro')
        print("\n  Training Accuracy: {0:.2f}".format(accuracy_train))
        print("  Training F1-MACRO: {0:.2f}".format(f1_macro_train))
        print("  Training F1-MICRO: {0:.2f}".format(f1_micro_train))

        avg_train_loss = total_train_loss / true_label_train.shape[0]
        training_time = format_time(time.time() - t0)
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================

        print("\nRunning Validation...")

        t0 = time.time()
        model.eval()

        total_val_loss = 0
        predictions_val = np.array([])
        true_label_val = np.array([])
        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():
                (loss, logits) = model(b_input_ids,
                                       token_type_ids=None,
                                       attention_mask=b_input_mask,
                                       labels=b_labels)

            total_val_loss += loss.item() * b_labels.shape[0]

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            predictions_val = np.append(predictions_val,
                                        np.argmax(logits, axis=1).flatten())
            true_label_val = np.append(true_label_val, label_ids)

        accuracy_val = np.sum(
            predictions_val == true_label_val) / true_label_val.shape[0]
        f1_macro_val = f1_score(true_label_val,
                                predictions_val,
                                average='macro')
        f1_micro_val = f1_score(true_label_val,
                                predictions_val,
                                average='micro')
        print("  Accuracy: {0:.2f}".format(accuracy_val))
        print("  F1-MACRO: {0:.2f}".format(f1_macro_val))
        print("  F1-MICRO: {0:.2f}".format(f1_micro_val))

        performance_metric = f1_macro_val

        if performance_metric > best_val:
            print("Best Model Updated.")
            best_val = performance_metric
            best_state_dict = model.state_dict()

        avg_val_loss = total_val_loss / true_label_val.shape[0]
        validation_time = format_time(time.time() - t0)
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

    print("\nTraining complete!")
    print("Total training took {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)))

    # ========================================
    #               Test
    # ========================================

    model.load_state_dict(best_state_dict)
    model.eval()

    # testloaders['indomain'] = (test_dataloader, None, None)
    test_measures = defaultdict(dict)

    for name, content in testloaders.items():
        testloader, sentences, labels = content
        predictions_test = np.array([])
        true_label_test = np.array([])
        logit_stack = np.empty((0, 3))

        for batch in testloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            with torch.no_grad():
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)

            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            predictions_test = np.append(predictions_test,
                                         np.argmax(logits, axis=1).flatten())
            true_label_test = np.append(true_label_test, label_ids)

            if name == 'indomain':
                logit_stack = np.vstack((logit_stack, logits))

        if name == 'indomain':
            for cnt, idx in enumerate(test_idx):
                logit_df.iloc[idx, :] += logit_stack[cnt, :]
            easy_examples = test_idx[predictions_test == true_label_test]
            hard_examples = test_idx[predictions_test != true_label_test]

        best_accr = np.sum(
            predictions_test == true_label_test) / true_label_test.shape[0]
        best_macro_f1 = f1_score(true_label_test,
                                 predictions_test,
                                 average='macro')
        print(name + ":  Test Accuracy: {0:.2f}".format(best_accr))
        print(name + ":  Test F1-MACRO: {0:.2f}".format(best_macro_f1))

        test_measures[name]['accr'] = best_accr
        test_measures[name]['f1'] = best_macro_f1

        # ========================================
        #              Dummy Test
        # ========================================
        if name == 'indomain':
            dummy_clf = DummyClassifier(strategy="uniform")
            dummy_clf.fit(X_train, y_train)
            predictions_dummy = dummy_clf.predict(X_test)

            dummy_accr = np.sum(predictions_dummy == y_test) / y_test.shape[0]
            dummy_macro_f1 = f1_score(y_test,
                                      predictions_dummy,
                                      average='macro')
            print("  Dummy Accuracy: {0:.2f}".format(dummy_accr))
            print("  Dummy F1-MACRO: {0:.2f}".format(dummy_macro_f1))
        else:
            dummy_clf = DummyClassifier(strategy="uniform")
            dummy_clf.fit(sentences, labels)
            predictions_dummy = dummy_clf.predict(sentences)

            dummy_accr = np.sum(predictions_dummy == labels) / labels.shape[0]
            dummy_macro_f1 = f1_score(labels,
                                      predictions_dummy,
                                      average='macro')
            print("  Dummy Accuracy: {0:.2f}".format(dummy_accr))
            print("  Dummy F1-MACRO: {0:.2f}".format(dummy_macro_f1))
        test_measures[name]['dummy_accr'] = dummy_accr
        test_measures[name]['dummy_f1'] = dummy_macro_f1

    return {
        'seed': seed,
        # 'best_accr': test_measures['indomain']['accr'],
        # 'best_f1': test_measures['indomain']['f1'],
        # 'dummy_accr': test_measures['indomain']['dummy_accr'],
        # 'dummy_f1': test_measures['indomain']['dummy_f1'],
        'best_accr_sw': test_measures['ood_sw']['accr'],
        'best_f1_sw': test_measures['ood_sw']['f1'],
        'dummy_accr_sw': test_measures['ood_sw']['dummy_accr'],
        'dummy_f1_sw': test_measures['ood_sw']['dummy_f1'],
        'best_accr_bs': test_measures['ood_bs']['accr'],
        'best_f1_bs': test_measures['ood_bs']['f1'],
        'dummy_accr_bs': test_measures['ood_bs']['dummy_accr'],
        'dummy_f1_bs': test_measures['ood_bs']['dummy_f1'],
        # 'majority': y_majority
    }
Example #11
reducer = KernelPCA(n_components=150, kernel="cosine", random_state=seed)
corpus_train_tfidf_kpca = reducer.fit_transform(train_tfidf)
corpus_test_tfidf_kpca = reducer.transform(test_tfidf)

print('Finished dimensionality reduction')

#%%
from sklearn.dummy import DummyClassifier

X_train = corpus_train_tfidf_kpca
X_test = corpus_test_tfidf_kpca
y_train = train_category
y_test = test_category

dummy_clf = DummyClassifier(strategy='stratified', random_state=seed)
dummy_clf.fit(X_train, y_train)
accuracy_dummy = dummy_clf.score(X_test, y_test)

print('Dummy Classifier Test Performance:', accuracy_dummy)

#%%

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
Example #12
    df = df.drop('day', axis=1)
    df['is_Holiday'] = df['month'].apply(
        lambda x: 1 if x in ['Apr', 'May', 'Jun', 'Nov'] else 0)
    df = df.drop('month', axis=1)
    df = df.drop(['title', 'cast'], axis=1)
    #df = pd.get_dummies(df, prefix='is')
    df['vote_average'] = df['vote_average'].fillna(df['vote_average'].mean())
    df = df.drop('crew', axis=1)
    return df


cls = classification_engineering(cls)
X, Y = cls.drop('return', axis=1), cls['return']
train_X, test_X, train_Y, test_Y = train_test_split(X,
                                                    Y,
                                                    train_size=0.75,
                                                    test_size=0.25,
                                                    stratify=Y)
clf = GradientBoostingClassifier(
)  #Gradient Tree Boosting or Gradient Boosted Regression Trees (GBRT)
clf.fit(train_X, train_Y)
print('Classification Score: ', clf.score(test_X, test_Y))
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(train_X, train_Y)
print('Dummy Score: ', dummy.score(test_X, test_Y))
plt.figure(figsize=(12, 14))
sns.barplot(x=clf.feature_importances_, y=X.columns)
plt.savefig('classification.png')

#Most relevant Features for Revenue prediction: vote_count and Budget
#Most relevant Features for success binary classification: vote_count, Budget, year and belongs_to_collection
Example #13
def __init__(self):
    self.name = "Baseline"
    self.model = DummyClassifier()
Example #14
def test_set_params():
    clf = DummyClassifier(strategy="most_frequent")
    transformer = EstimatorTransformer(clf)

    transformer.set_params(estimator__strategy="stratified")
    assert clf.strategy == "stratified"
Example #15
clf1 = SVC(kernel='linear', C=20).fit(X_train, Y_train)
scores = cross_val_score(clf1, X_train, Y_train, cv=5)
strat_scores = cross_val_score(clf1, X_train, Y_train, cv=StratifiedKFold(5, random_state=10, shuffle=True))
#Loo = LeaveOneOut()
#Loo_scores = cross_val_score(clf1,X_train,Y_train,cv=Loo)
print("The Cross Validation Score :"+str(scores))
print("The Average Cross Validation Score :"+str(scores.mean()))
print("The Stratified Cross Validation Score :"+str(strat_scores))
print("The Average Stratified Cross Validation Score :"+str(strat_scores.mean()))
#print("The LeaveOneOut Cross Validation Score :"+str(Loo_scores))
#print("The Average LeaveOneOut Cross Validation Score :"+str(Loo_scores.mean()))

from sklearn.dummy import DummyClassifier

for strat in ['stratified', 'most_frequent', 'prior', 'uniform']:
    dummy_maj = DummyClassifier(strategy=strat).fit(X_train,Y_train)
    print("Train Stratergy :{} \n Score :{:.2f}".format(strat,dummy_maj.score(X_train,Y_train)))
    print("Test Stratergy :{} \n Score :{:.2f}".format(strat,dummy_maj.score(X_test,Y_test)))
    
X = np.array(df.iloc[:,[0,13]])
y = np.array(df['price_range'])
print("Shape of X:"+str(X.shape))
print("Shape of y:"+str(y.shape))
X = scaler.fit_transform(X)

cm_dark = ListedColormap(['#ff6060', '#8282ff','#ffaa00','#fff244','#4df9b9','#76e8fc','#3ad628'])
cm_bright = ListedColormap(['#ffafaf', '#c6c6ff','#ffaa00','#ffe2a8','#bfffe7','#c9f7ff','#9eff93'])

plt.scatter(X[:,0],X[:,1],c=y,cmap=cm_dark,s=10,label=y)
plt.show()
Example #16
        random_state=_SEED,
    ), {}),
    "LR_L1": (LogisticRegression(solver="liblinear",
                                 multi_class="ovr",
                                 penalty="l1",
                                 C=0.1,
                                 class_weight="balanced",
                                 random_state=_SEED), {}),
    "LR_L2": (LogisticRegression(solver="liblinear",
                                 multi_class="ovr",
                                 penalty="l2",
                                 C=0.1,
                                 class_weight="balanced",
                                 random_state=_SEED), {}),
    "Baseline": (
        DummyClassifier(strategy="most_frequent"),
        {},
    ),
    "Random": (
        DummyClassifier(strategy="uniform"),
        {},
    ),
}

SELECTED_FEATURES = {
    "heart-statlog": [0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13],
    "cervical-cancer": [0, 1, 2, 3, 4, 5, 6, 22, 23, 24, 25, 26]
}


def run_grid_search(model_name, X_train, y_train):
Example #17
data = []
data_labels = []

print "Reading in the training corpus:"
for i in tqdm(root_data_file.getchildren()):
    data.append(' '.join(e for e in i.itertext()))

print "Reading in the training label file:"
for row in tqdm(root_data_label_file.getchildren()):
    data_labels.append(row.attrib['hyperpartisan'])

dummy_accuracies = []
nn_accuracies = []

#Classifier Object
dummy_clf = DummyClassifier(strategy='most_frequent', random_state=0)

# prepare cross validation
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
fold_number = 1

for train, test in kfold.split(data):
    print "........... Fold %d ..........." % fold_number
    fold_number = fold_number + 1

    train_corpus = build_corpus(train)
    train_labels = build_labels(train)

    test_corpus = build_corpus(test)
    test_labels = build_labels(test)
Example #18
#%% Benchmark: Linear Regression, Random Forest, KNN, SVM, Most frequent class
glostrup_baseline = pd.read_csv(WDIR + 'glostrup_targets.csv',
                                sep=',').dropna()

X_clf = glostrup_baseline[[
    "bmi", "qrs", "qt", "pr", "p_peak_amp_v5", "q_peak_amp_v5",
    "r_peak_amp_v5", "s_peak_amp_v5", "t_peak_amp_v5"
]]
y_clf = glostrup_baseline.sexnumeric

models_clf = [
    LogisticRegression(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    SVC(),
    DummyClassifier(strategy="most_frequent")
]

names_clf = ["LR", "RF", "KNN", "SVM", "Most Frequent"]

scoring_clf = {
    'acc': 'accuracy',
    'f1_micro': 'f1_micro',
    'precision_micro': 'precision_micro',
    'recall_micro': 'recall_micro',
    'roc_auc': 'roc_auc'
}
results_clf = []

for model, name in zip(models_clf, names_clf):
    scores_clf = cross_validate(model,
Example #19
def supervised_classification(Fold,
                              representation_id='zT',
                              n_classes_list=np.arange(5, 61, 5),
                              verbose=False):
    """Perform supervised classification with QDA

    Args:
        Fold (Dict): Fold summary, with training, validation, and test splits.
        representation_id (str) = 'zT' or 'zE'

    Returns:
        results_df: dataframe with results
    """

    n_min_samples = 6  #per-class number of samples used by QDA to fit.
    path = get_paths()
    O = load_dataset()

    results = {}
    results['n_components'] = []
    results['n_htree_classes'] = []
    results['acc_train'] = []
    results['acc_val'] = []
    results['acc_test'] = []
    results['acc_most_freq'] = []
    results['acc_prior'] = []

    dummy_most_freq = DummyClassifier(strategy="most_frequent")
    dummy_prior = DummyClassifier(strategy="stratified")  # random draws from the class distribution

    for n_classes in n_classes_list:

        merged_labels, _ = get_merged_ordered_classes(
            data_labels=deepcopy(O['cluster']),
            htree_file=path['htree'],
            n_required_classes=n_classes,
            verbose=False)

        X_train = deepcopy(Fold[representation_id][Fold['train_ind']])
        y_train = deepcopy(merged_labels[Fold['train_ind']])
        ind_train = np.arange(0, np.shape(X_train)[0])

        X_val = deepcopy(Fold[representation_id][Fold['val_ind']])
        y_val = deepcopy(merged_labels[Fold['val_ind']])
        ind_val = np.arange(0, np.shape(X_val)[0])

        X_test = deepcopy(Fold[representation_id][Fold['test_ind']])
        y_test = deepcopy(merged_labels[Fold['test_ind']])
        ind_test = np.arange(0, np.shape(X_test)[0])

        #Remove types with low sample counts in training set
        df = pd.DataFrame({'ind': ind_train, 'lbl': y_train})
        df_train = df[df.groupby('lbl')['lbl'].transform('count').ge(
            n_min_samples)]
        keep_ind = df_train['ind'].values
        X_train = X_train[keep_ind, :]
        y_train = y_train[keep_ind]

        #Print types that were ignored
        if verbose:
            df_train_del = df[df.groupby('lbl')['lbl'].transform('count').lt(
                n_min_samples)]
            print(df_train_del['lbl'].value_counts())

        #Remove types from validation set that are not represented in the training set
        df = pd.DataFrame({'ind': ind_val, 'lbl': y_val})
        df_val = df[df['lbl'].isin(y_train)]
        keep_ind = df_val['ind'].values
        X_val = X_val[keep_ind, :]
        y_val = y_val[keep_ind]

        #Remove types from test set that are not represented in the training set
        df = pd.DataFrame({'ind': ind_test, 'lbl': y_test})
        df_test = df[df['lbl'].isin(y_train)]
        keep_ind = df_test['ind'].values
        X_test = X_test[keep_ind, :]
        y_test = y_test[keep_ind]

        #QDA related metrics
        qda = QDA(reg_param=1e-2, store_covariance=True)
        qda.fit(X_train, y_train)
        y_train_pred = qda.predict(X_train)
        y_val_pred = qda.predict(X_val)
        y_test_pred = qda.predict(X_test)

        results['n_htree_classes'].append(n_classes)
        results['n_components'].append(np.unique(qda.classes_).size)
        results['acc_train'].append(accuracy_score(y_train, y_train_pred))
        results['acc_val'].append(accuracy_score(y_val, y_val_pred))
        results['acc_test'].append(accuracy_score(y_test, y_test_pred))

        #For dummy classifiers
        dummy_most_freq.fit(Fold[representation_id], merged_labels)
        most_freq_pred = dummy_most_freq.predict(Fold[representation_id])

        dummy_prior.fit(Fold[representation_id], merged_labels)
        prior_pred = dummy_prior.predict(Fold[representation_id])

        results['acc_most_freq'].append(
            accuracy_score(merged_labels, most_freq_pred))
        results['acc_prior'].append(accuracy_score(merged_labels, prior_pred))

    results_df = pd.DataFrame(results)
    return results_df
Example #20
                    y='Accuracy',
                    data=pd.DataFrame(d),
                    hue='type')
plot.get_figure().savefig('log_description_features2acc_8_2000_80.png')

# btrain_X, train_y, btest_X, test_y = generate_description_features(
#     1, 1000, 'tfidf')
train_X, train_y, test_X, test_y = generate_shelves_features(2000, 'tfidf')
# exit()
# train_X, train_y, test_X, test_y = generate_description_features(
#     1, 4000, 'count')
# pickle.dump((train_X, train_y, test_X, test_y), open('features.pickle', 'wb'))
# clf = AdaBoostClassifier(RandomForestClassifier(), n_estimators=10)
# clf.fit(train_X, train_y)
# print('Ada + RF', clf.score(test_X, test_y))
clf = DummyClassifier()
clf.fit(train_X, train_y)
print('Dummy', clf.score(test_X, test_y))
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', n_jobs=3)
clf.fit(train_X, train_y)
print('Log', clf.score(test_X, test_y))
clf = MultinomialNB()
clf.fit(train_X, train_y)
print('NB', clf.score(test_X, test_y))
clf = RandomForestClassifier()
clf.fit(train_X, train_y)
print('RF', clf.score(test_X, test_y))
clf = DecisionTreeClassifier()
clf.fit(train_X, train_y)
print('DT', clf.score(test_X, test_y))
clf = VotingClassifier(estimators=[
Example #21
# prepare the classification pipeline
from sklearn.pipeline import Pipeline
from nilearn.connectome import ConnectivityMeasure
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV

kinds = ['correlation', 'partial correlation', 'tangent']

pipe = Pipeline([('connectivity', ConnectivityMeasure(vectorize=True)),
                 ('classifier',
                  GridSearchCV(LinearSVC(), {'C': [.1, 1., 10.]}, cv=5))])

param_grid = [{
    'classifier': [DummyClassifier(strategy='most_frequent')]
}, {
    'connectivity__kind': kinds
}]

######################################################################
# We use random splits of the subjects into training/testing sets.
# StratifiedShuffleSplit allows preserving the proportion of children in the
# test set.
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder

groups = [pheno['Child_Adult'] for pheno in development_dataset.phenotypic]
classes = LabelEncoder().fit_transform(groups)

cv = StratifiedShuffleSplit(n_splits=30, random_state=0, test_size=10)
Example #22
                                                        y,
                                                        test_size=0.25,
                                                        stratify=y)
print("Treinaremos com %d elementos e testaremos com %d elementos" %
      (len(treino_x), len(teste_x)))

modelo = LinearSVC()
modelo.fit(treino_x, treino_y)
previsoes = modelo.predict(teste_x)

acuracia = accuracy_score(teste_y, previsoes) * 100
print("A acurácia foi %.2f%%" % acuracia)

from sklearn.dummy import DummyClassifier

dummy_stratified = DummyClassifier(strategy='stratified')
dummy_stratified.fit(treino_x, treino_y)
acuracia = dummy_stratified.score(teste_x, teste_y) * 100

print("A acurácia do dummy stratified foi %.2f%%" % acuracia)

from sklearn.dummy import DummyClassifier

dummy_mostfrequent = DummyClassifier(strategy='most_frequent')
dummy_mostfrequent.fit(treino_x, treino_y)
acuracia = dummy_mostfrequent.score(teste_x, teste_y) * 100

print("A acurácia do dummy mostfrequent foi %.2f%%" % acuracia)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
Example #23
def fit_dummy(X, y, n_iter):
    """Fit a dummy estimator"""
    model = DummyClassifier(strategy="prior")
    model.fit(X, y)
    return model
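# Quick check of what strategy="prior" gives back, on toy data (a sketch, not
# part of the original; note that n_iter is unused by the helper):
import numpy as np
X_toy = np.zeros((6, 1))
y_toy = [0, 0, 0, 0, 1, 1]
model = fit_dummy(X_toy, y_toy, n_iter=0)
print(model.predict(X_toy[:1]))        # -> [0], the majority class
print(model.predict_proba(X_toy[:1]))  # -> [[0.667, 0.333]], the class priors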
Example #24
def test_string_labels():
    X = [[0]] * 5
    y = ["paris", "paris", "tokyo", "amsterdam", "berlin"]
    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), ["paris"] * 5)
Example #25
def main(logger=None):
    ''' Main routine to call the entire process flow ''';

    # Load_Dataset --- Process starts

    logger.info(f'');
    logger.info(f'{"-"*20} Load dataset starts here {"-"*20}');
    logger.info(f'');

    # TODO: DONE; Load Cancer dataset;

    cancer_data_dict = datasets.load_breast_cancer();
    cancer_data_pd = convert2pandas_df(x_array=cancer_data_dict['data'],
                      y=[ cancer_data_dict['target_names'][i] for i in cancer_data_dict['target'] ],
                      # feature_names=iris_dict['feature_names'],
                      feature_names=list(cancer_data_dict['feature_names']),
                      target_name='Target');

    # logger.info(f'{cancer_data_pd.head()}');

    sns.lmplot( x="area error", y="compactness error", data=cancer_data_pd, fit_reg=False, hue='Target', legend=False,
               palette=dict(malignant="#BF0C2B", benign="#02173E")); # , versicolor="#F5900E"));
    plt.legend(loc='lower right');
    chart_save_image(plt=plt, f_size=(8, 8), left=0.125, right=0.9, bottom=0.125, top=0.9, wspace=0.0, hspace=0.0, fileName='./Cancer_Data_Plot.png');

    selected_columns = ['Target', 'mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity',
                        'mean concave points', 'mean symmetry'];

    g = sns.pairplot(cancer_data_pd[selected_columns], hue="Target", diag_kind="kde",  palette=dict(malignant="#BF0C2B", benign="#02173E"), diag_kws=dict(shade=True));
    for i, j in zip(*np.triu_indices_from(g.axes, 1)):
        g.axes[i, j].set_visible(False);
    chart_save_image(plt=plt, f_size=(16, 16), left=0.05, right=0.97, bottom=0.05, top=0.97, wspace=0.02, hspace=0.02, fileName='./Cancer_Data_PairPlot.png');

    logger.info(f'');
    logger.info(f'{"-"*20}  Load dataset ends here {"-"*20}');
    logger.info(f'');

    # Load_Dataset --- Process ends

    # __Placeholder__ --- Process Starts

    # TODO: DONE; 001; Train test split; stratified;
    X_train, X_test, y_train, y_test = train_test_split(cancer_data_pd[cancer_data_dict.feature_names],
                                                        # cancer_data_pd['Target'],
                                                        cancer_data_dict['target'], # Has to be binary for scorer F1 and Percision;
                                                        test_size=0.20,
                                                        # stratify=cancer_data_pd['Target'],
                                                        stratify=cancer_data_dict['target'],
                                                        random_state=111,
                                                        shuffle=True);

    logger.info(f'X_train.shape : {X_train.shape}');
    logger.info(f'X_test.shape  : {X_test.shape}');
    logger.info(f'Y_train.shape : {y_train.shape}');
    logger.info(f'Y_test.shape  : {y_test.shape}');

    # TODO: DONE; 002; Dummy Classifier ;

    # dummy_classifier = DummyClassifier(strategy="stratified");
    dummy_classifier = DummyClassifier(strategy="most_frequent");

    # TODO: DONE; 003; Cross_over_score and predict and Metrics (make_scorer)

    accuracy_scorer = make_scorer(cost_accuracy, greater_is_better=True);
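
    # NOTE: cost_accuracy is defined elsewhere in the original script; a
    # plausible module-level stand-in (an assumption, not the author's
    # definition) is plain accuracy:
    #     def cost_accuracy(y_true, y_pred):
    #         return metrics.accuracy_score(y_true, y_pred);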

    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=111);
    # results = model_selection.cross_val_score(dummy_classifier, X_train, y_train, cv=kfold, scoring='accuracy');
    # logger.info(f'{results} {np.mean(results)} {np.var(results)} {np.std(results)}');

    results = model_selection.cross_val_score(dummy_classifier, X_train, y_train, cv=kfold, scoring=accuracy_scorer);
    logger.info(f'{results} {np.mean(results)} {np.var(results)} {np.std(results)}');

    DummyClassifier_mean = np.mean(results);

    # TODO: DONE; 004; Standardization ;

    # std_scaler = preprocessing.StandardScaler();  # Contains the negative values
    std_scaler = preprocessing.MinMaxScaler(); # Range between 0 to 1; No negative terms;
    std_scaler = std_scaler.fit(X_train);
    scaled_X_train = pd.DataFrame(std_scaler.transform(X_train), columns=X_train.columns);

    logger.info(f'{X_train["mean radius"].describe()}');
    logger.info(f'{scaled_X_train["mean radius"].describe()}');

    # TODO: DONE; 005; SelectKBest; Feature selection ;

    # selectKbest_est = SelectKBest(chi2, k=4); f_classif
    selectKbest_est = SelectKBest(f_classif, k=8);
    selectKbest_X_train = selectKbest_est.fit_transform(X_train, y_train);

    logger.info(f'{selectKbest_est.get_params(deep=True)}');
    logger.info(f'{selectKbest_est.get_support(indices=False)}');
    logger.info(f'{selectKbest_est.get_support(indices=True)}');
    logger.info(f'{X_train.columns[selectKbest_est.get_support(indices=True)]}');

    # TODO: DONE; 006; Polynomial Features ;

    poly = preprocessing.PolynomialFeatures(degree=2, include_bias=False, interaction_only=False);
    X_train_poly = poly.fit_transform(X_train);
    X_train_p2 = pd.DataFrame(X_train_poly, columns=poly.get_feature_names(X_train.columns));

    lr = linear_model.LogisticRegression(fit_intercept=False, random_state=111);
    results = model_selection.cross_val_score(lr, X_train_p2, y_train, cv=kfold, scoring=accuracy_scorer); # , verbose=True);

    imp_percentage = round((np.mean(results) - DummyClassifier_mean) / DummyClassifier_mean, 4);

    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}');
    logger.info(f'LogisticRegression accuracy : {np.mean(results)}');

    logger.info(f'The improvement over the DummyClassifier is : {imp_percentage}');

    # TODO: DONE; 007; Kernel PCA ;

    # kernel_param = ('rbf', 0.25);
    kernel_param = ('rbf', 1);

    kpca = KernelPCA(n_components=4, kernel=kernel_param[0], gamma=kernel_param[1], fit_inverse_transform=True, random_state=111) # n_jobs=-1,
    kpca.fit(scaled_X_train);   # The data has to be scaled;
    kpca_X_train = kpca.transform(scaled_X_train);

    lr = linear_model.LogisticRegression(fit_intercept=False, random_state=111);
    results = model_selection.cross_val_score(lr, kpca_X_train, y_train, cv=kfold, scoring=accuracy_scorer); # , verbose=True);

    imp_percentage = round((np.mean(results) - DummyClassifier_mean) / DummyClassifier_mean, 4);

    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}');
    logger.info(f'LogisticRegression accuracy : {np.mean(results)}');

    logger.info(f'The improvement over the DummyClassifier is : {imp_percentage}');

    # TODO: DONE; 008; Grid-Search ;

    # tuned_parameters = [{
    #                      'n_estimators' : [1, 10, 100, 500, 1000, 2000],
    #                      'max_depth' : [10, 20],
    #                      'max_features' : [0.80, 0.40],
    #                      'random_state' : [111]
    #                      }];

    tuned_parameters = [{
                         'n_estimators' : [1, 10],
                         'max_depth' : [10, 20],
                         'max_features' : [0.80, 0.40],
                         'random_state' : [111]
                         }];

    clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, scoring=accuracy_scorer);
    clf.fit(X_train, y_train);

    logger.info(f'Best parameters set found on development set: {clf.best_score_} {clf.best_params_}');
    logger.info('');
    logger.info('Grid scores on development set:');
    logger.info('');
    means = clf.cv_results_['mean_test_score'];
    stds = clf.cv_results_['std_test_score'];
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        logger.info(f'{round(mean,3)} (+/-{round(std*2,2)}) for {params}');
    logger.info('');

    logger.info('Detailed classification report:');
    logger.info('');
    logger.info('The model is trained on the full development set.');
    logger.info('The scores are computed on the full evaluation set.');
    logger.info('');
    y_true, y_pred = y_test, clf.predict(X_test);
    logger.info(f'{metrics.classification_report(y_true, y_pred)}');
    logger.info('');

    imp_percentage = round((clf.best_score_ - DummyClassifier_mean) / DummyClassifier_mean, 4);
    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}');
    logger.info(f'GridSearchCV RandomForestClassifier accuracy : {clf.best_score_}');
    logger.info(f'The improvement over the DummyClassifier is : {imp_percentage}');

    # logger.info(f'{clf.best_estimator_}');

    # TODO: DONE; 009; Customer Transformer for the pipeline ;
    # reference : https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/
    # http://philipmgoddard.com/modeling/sklearn_pipelines

    ctf = ColumnTypeFilter(np.number);
    ctf.fit_transform(X_train).head();

    # TODO: YTS; 010; Pipeline ;

    custom_pipeline = make_pipeline(
            FeatureUnion(transformer_list=[
                ('StdScl', make_pipeline(
                    ColumnTypeFilter(np.number),
                    preprocessing.StandardScaler()
                )),
                ('MMScl', make_pipeline(
                    ColumnTypeFilter(np.number),
                    preprocessing.MinMaxScaler()
                ))
            ])
    );

    custom_pipeline.fit(X_train);
    X_test_transformed = custom_pipeline.transform(X_test);

    logger.info(f'{X_test.shape} {type(X_test_transformed)} {X_test_transformed.shape}');

    # TODO: DONE; 011; Ensemble (VotingClassifier) and BaseClone;

    ensemble_clf = VotingClassifier(estimators=[
                            ('dummy', dummy_classifier),
                            ('logistic', lr),
                            # ('supportvector', SVC(probability=True)),
                            ('randomforest', RandomForestClassifier())],
                            voting='soft');

    ensemble_clf.fit(X_train, y_train);
    ensemble_clf_accuracy_ = cost_accuracy(y_test, ensemble_clf.predict(X_test));

    imp_percentage = round((ensemble_clf_accuracy_ - DummyClassifier_mean) / DummyClassifier_mean, 4);
    logger.info(f'DummyClassifier accuracy : {DummyClassifier_mean}');
    logger.info(f'GridSearchCV RandomForestClassifier accuracy : {ensemble_clf_accuracy_}');
    logger.info(f'The improvement over the DummyClassifier is : {imp_percentage}');

    # TODO: DONE; 012; One-hot encoder; Label Encoder; Binary Encoder;

    baby_names = ['Ava', 'Lily', 'Noah', 'Jacob', 'Mia', 'Sophia'];
    X_train_list = [ np.random.choice(baby_names) for i in range(40) ];
    X_test_list = [ np.random.choice(baby_names) for i in range(6) ];

    bb_labelencoder = preprocessing.LabelEncoder();
    bb_labelencoder.fit(X_train_list);
    bb_encoded = bb_labelencoder.transform(X_test_list);

    bb_onehotencoder = preprocessing.OneHotEncoder(sparse=False);
    bb_encoded = bb_encoded.reshape(len(bb_encoded), 1);
    bb_onehot = bb_onehotencoder.fit_transform(bb_encoded);

    for i, v in enumerate(X_test_list):
        logger.info(f'Actual : {v} \t | LabelEncoded : {bb_encoded[i][0]} \t | OneHot : {bb_onehot[i]}');

    # TODO: DONE; 013; Feature Extraction from image and text;

    corpus = [  'This is the first document.',
                'This document is the second document.',
                'And this is the third one.',
                'Is this the first document?', ]

    vectorizer = CountVectorizer();
    X = vectorizer.fit_transform(corpus);

    cntvector_out = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names());

    for i, v in enumerate(corpus):
        logger.info(f'Input text : {v}');
        logger.info(f'Output count vector :');
        logger.info(f'{cntvector_out.iloc[i]}');
Example #26
def test_classifier_exceptions():
    clf = DummyClassifier(strategy="unknown")
    assert_raises(ValueError, clf.fit, [], [])

    assert_raises(ValueError, clf.predict, [])
    assert_raises(ValueError, clf.predict_proba, [])
Example #27
# %%
from sklearn.model_selection import train_test_split

data_numeric_train, data_numeric_test, target_train, target_test = \
    train_test_split(data_numeric, target, random_state=0)

# %% [markdown]
# We will first create a dummy classifier which will always predict the
# high revenue class, i.e. `" >50K"`, and check the statistical
# performance.

# %%
from sklearn.dummy import DummyClassifier

class_to_predict = " >50K"
high_revenue_clf = DummyClassifier(strategy="constant",
                                   constant=class_to_predict)
high_revenue_clf.fit(data_numeric_train, target_train)
score = high_revenue_clf.score(data_numeric_test, target_test)
print(f"Accuracy of a model predicting only high revenue: {score:.3f}")

# %% [markdown]
# We clearly see that the score is below 0.5, which might be surprising at
# first: the accuracy of a constant classifier is simply the frequency of the
# predicted class, and `" >50K"` is the minority class. We will now check the
# statistical performance of a model which always predicts the low revenue
# class, i.e. `" <=50K"`.

# %%
class_to_predict = " <=50K"
low_revenue_clf = DummyClassifier(strategy="constant",
                                  constant=class_to_predict)
low_revenue_clf.fit(data_numeric_train, target_train)
score = low_revenue_clf.score(data_numeric_test, target_test)
Example #28
# Completely custom scorer objects
######

import numpy as np
def my_custom_loss_func(y_true, y_pred):
    diff = np.abs(y_true - y_pred).max()
    return np.log1p(diff)  # log(1 + x), i.e. log shifted left by 1

# score will negate the return value of my_custom_loss_func,
# which will be np.log(2), 0.693, given the values for X
# and y defined below.
score = make_scorer(my_custom_loss_func, greater_is_better=False)
X = [[1], [1]]
y = [0, 1]
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy='most_frequent', random_state=0) # always predicts the most frequent label in the training set.
clf = clf.fit(X, y)
print(my_custom_loss_func(y, clf.predict(X)))

print(score(clf, X, y)) # Here I use the scorer directly, not in CV
# N.B. score automatically negated, as it is a loss

from sklearn.metrics import accuracy_score
print(accuracy_score(clf.predict(X), y)) # Just to show a comparison

#####
# Using multiple metric evaluation
#####

# as an iterable of string metrics
scoring = ['accuracy', 'precision']
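
# A sketch (not in the original) of how such a scoring list is typically
# consumed, via cross_validate on fresh toy data; the dummy baseline will
# likely score 0 precision here (with a warning), since it never predicts
# the positive class:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
X_mm, y_mm = make_classification(n_samples=100, random_state=0)
cv_results = cross_validate(clf, X_mm, y_mm, scoring=scoring, cv=5)
print(cv_results['test_accuracy'], cv_results['test_precision'])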
Example #29
	def __init__(self):
		self._clf = DummyClassifier(strategy='most_frequent')
Example #30
def create_model(self):
    self.model_dummy = DummyClassifier(strategy='most_frequent', random_state=0)
    self.model_dummy.fit(self.X_train, self.y_train)
    print("score for baseline model: {0:.2f}".format(self.model_dummy.score(self.X_test, self.y_test)))