Example #1
def main():
    # Get label encoder
    lb = LabelBinarizer()
    lbenc = lb.fit(utils.get_classes())

    # Get train data
    X_train, y_train, train_filenames = utils.get_train(
        '../input/train', list(lbenc.classes_), img_width, img_height)

    # Create and train model
    model = train(X_train, y_train, epochs=100, batch_size=32)

    print("+++++++++++++++++++++++++++++++++++++++++++")

    # Load model ...
    #model = load_model('../models/'+ 'model2_f0.86/'+ 'model2-64-0.341.h5')

    # Get test data
    X_test, X_test_id = utils.get_test('../input/test', img_width, img_height)
    # Predict on test data
    preds = model.predict(X_test, verbose=1)

    # Create submission
    utils.create_submission(lbenc.inverse_transform(preds),
                            X_test_id,
                            output_path="../submissions/",
                            filename=modelname,
                            isSubmission=True)
    utils.to_csv_ens(lbenc.inverse_transform(preds),
                     preds,
                     X_test_id,
                     utils.get_classes(),
                     output_path="../submissions/",
                     filename=modelname)
    print('Finished.')
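
For context, utils.get_train is only called above, not defined. A minimal sketch of a compatible loader, assuming a directory layout with one subfolder per class, .jpg images, and OpenCV available (the function name, the glob pattern, and the layout are assumptions, not taken from the original project):

import glob
import os

import cv2
import numpy as np


def get_train_sketch(train_dir, classes, img_width, img_height):
    """Hypothetical loader matching the (X_train, y_train, filenames) contract used above."""
    X, y, filenames = [], [], []
    for cls in classes:
        # assumes one subfolder per class containing .jpg images
        for path in glob.glob(os.path.join(train_dir, cls, '*.jpg')):
            img = cv2.imread(path)
            X.append(cv2.resize(img, (img_width, img_height)))
            y.append(cls)
            filenames.append(os.path.basename(path))
    return np.array(X), np.array(y), filenames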
Example #2
def main():
    parser = build_parser()
    options = parser.parse_args()
    batch_size = options.batch_size
    #train_names=utils.train_names

    # train_set = utils.get_train()
    # val_set = utils.val_n

    model = Resnet()
    model.cuda()
    model = torch.nn.DataParallel(model)

    ###########################
    #train_set,val_set = train_test_split(train_names, test_size=0.2, random_state=2050)
    train_set, val_set = utils.get_split()
    train_set = utils.get_train(train_set)

    train_dataset = ProteinDataset(dirpath=utils.TRAIN, fnames=train_set)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

    val_dataset = ProteinDataset(dirpath=utils.TRAIN,fnames=val_set)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    train(model, options.epochs, train_loader, val_loader, 'sgdr_rgb.pkl')
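
ProteinDataset is imported from elsewhere in the project. Purely as an illustration of the interface it is used with here (a directory path plus a list of file names), a hypothetical stand-in might look like the following; the .png suffix, the RGB conversion, and the omission of target labels are all assumptions:

import os

from PIL import Image
from torch.utils.data import Dataset
import torchvision.transforms as T


class ProteinDatasetSketch(Dataset):
    """Hypothetical stand-in for ProteinDataset(dirpath=..., fnames=...).

    Assumes one RGB image per id at <dirpath>/<fname>.png; the real class
    presumably also looks up and returns the target labels for each fname.
    """
    def __init__(self, dirpath, fnames, transform=None):
        self.dirpath = dirpath
        self.fnames = fnames
        self.transform = transform or T.ToTensor()

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, idx):
        name = self.fnames[idx]
        img = Image.open(os.path.join(self.dirpath, name + '.png')).convert('RGB')
        return self.transform(img), name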
Example #3
def pca(data):
    """
    Reduces the feature space with PCA, keeping enough components to explain 95% of the variance

    :param data: a dataframe where each row is an hour
    :return: a dataframe of principal components with the "cnt" column re-attached
    """
    print("\tPerforming PCA dimensionality reduction...")

    train = get_train(data).drop("cnt", axis=1)
    test = get_holdout(data).drop("cnt", axis=1)

    pca = daskpca(n_components=0.95, svd_solver="full").fit(train)

    pca_train = dd.DataFrame(data=pca.transform(train))
    pca_test = dd.DataFrame(data=pca.transform(test))

    new_df = pca_train.append(pca_test)
    new_df["cnt"] = data["cnt"]

    return new_df
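
The n_components=0.95, svd_solver="full" combination keeps the smallest number of components whose cumulative explained variance reaches 95%. A self-contained scikit-learn illustration of that behaviour (a toy example, not the project's dask pipeline):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 20))

pca = PCA(n_components=0.95, svd_solver="full").fit(X)
print(pca.n_components_)                    # number of components kept
print(pca.explained_variance_ratio_.sum())  # >= 0.95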
Example #4
def weather_cluster(data):
    """
    Creates a column that gives a cluster id based on KMeans clustering of only weather-related features

    :param data: a pandas dataframe where each row is an hour
    :return: a pandas dataframe containing the new column
    """
    print("\tAdding clustering variable based on weather-related features...")
    df = data.copy()[["weathersit", "temp", "atemp", "hum", "windspeed"]]
    to_cluster = dd.get_dummies(df)
    train = get_train(to_cluster)
    holdout = get_holdout(to_cluster)

    kmeans = KMeans(n_clusters=5,
                    random_state=SEED).fit(train)  # magic numbers, blech

    data["weather_cluster"] = da.append(kmeans.labels_,
                                        kmeans.predict(holdout))

    data["weather_cluster"] = data["weather_cluster"].astype("category")

    return data
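
The clustering is fit on the training rows only and then applied to the holdout rows, so the holdout never influences the centroids. The same pattern in plain scikit-learn on toy data (an illustration, not the project's dask code):

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
train, holdout = rng.normal(size=(80, 5)), rng.normal(size=(20, 5))

kmeans = KMeans(n_clusters=5, random_state=42, n_init=10).fit(train)
cluster_ids = np.append(kmeans.labels_, kmeans.predict(holdout))  # train rows first, then holdout
print(cluster_ids.shape)  # (100,)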
Example #5
def cluster_variable(data):
    """
    Creates a column that gives a cluster id based on KMeans clustering of all features

    :param data: a pandas dataframe where each row is an hour
    :return: a pandas dataframe containing the new column
    """
    print("\tAdding cluster variable...")
    data = data.copy()
    to_cluster = dd.get_dummies(data)
    train = get_train(to_cluster)
    holdout = get_holdout(to_cluster)

    kmeans = KMeans(n_clusters=5, random_state=SEED).fit(
        train.drop("cnt", axis=1))  # magic numbers, blech

    data["cluster"] = da.append(kmeans.labels_,
                                kmeans.predict(holdout.drop("cnt", axis=1)))

    data["cluster"] = data["cluster"].astype("category")

    return data
Example #6
def subcount_forecast(data, feature):
    """
    Creates a new column that is the predicted value of the input feature

    Essentially an abstraction for 'prediction_forecasts'

    :param data: a pandas dataframe where each row is an hour
    :param feature: a String containing the feature that should be forecasted (one of: casual, registered)
    :return: a pandas dataframe containing the new column
    """
    var_name = feature + "_forecast"
    print("\tAdding {} variable...".format(var_name))
    df = dd.get_dummies(data.copy().drop("cnt", axis=1))
    to_predict = dd.read_csv(PATH)[feature]
    df[feature] = to_predict

    train = get_train(df)

    model = RandomForestRegressor(random_state=SEED)
    model_params = {"n_estimators": list(range(10, 110, 10))}
    #tscv = TimeSeriesSplit(n_splits=5)

    grid_search = GridSearchCV(estimator=model,
                               param_grid=model_params,
                               scoring="r2",
                               cv=None,
                               refit=True)
    grid_search.fit(train.drop(feature, axis=1), train[feature])
    print("\t\tPredictions for GridSearchCV on {}: {:.5f} +/- {:.5f}".format(
        feature, grid_search.best_score_,
        grid_search.cv_results_["std_test_score"][da.argmax(
            grid_search.cv_results_["mean_test_score"])]))

    data[var_name] = grid_search.best_estimator_.predict(
        dd.get_dummies(data.drop("cnt", axis=1)))

    return data
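
The commented-out TimeSeriesSplit hints at a time-aware alternative to the default 5-fold cross-validation (cv=None). A minimal sketch of how it would plug into the same search, shown only as an illustration of the option the comment suggests:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=0),
                           param_grid={"n_estimators": list(range(10, 110, 10))},
                           scoring="r2",
                           cv=tscv,  # folds respect temporal order instead of random splits
                           refit=True)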
Example #7
def summary(model,
            sampling_method,
            k_folds,
            use_international,
            cat_code,
            data_dir,
            results_dir,
            verbose=True):
    print("model: {} - sampling method: {}".format(model, sampling_method))
    aggregate = {}
    for k in range(k_folds):
        result = pickle.load(
            open(
                os.path.join(
                    results_dir,
                    "{}_{}_fold_{}.p".format(model, sampling_method, k + 1)),
                "rb"))
        print("Fold {}".format(k + 1))
        for key, val in result.items():
            print(key)
            print(val)
            if key not in aggregate:
                if type(val) is np.float32 or type(val) is np.float64 or type(
                        val) is float:
                    aggregate[key] = val
            else:
                if type(val) is np.float32 or type(val) is np.float64 or type(
                        val) is float:
                    aggregate[key] += val
    aggregate = {key: val / k_folds for key, val in aggregate.items()}
    print("Aggregate")
    print(aggregate)
    if model == 'logistic':
        print("feature importance not implemented for logistic regression")
        return
    features, labels, feature_labels = get_train(
        data_dir, one_hot=not cat_code, use_international=use_international)
    country_names = get_country_names(data_dir)
    if use_international:
        country_names = country_names[:2].tolist() + ['international']
    for k in range(k_folds):
        print("Fold {}".format(k + 1))
        correct_examples = pickle.load(
            open(
                os.path.join(
                    results_dir, "{}_{}_fold_{}_correct_examples.p".format(
                        model, sampling_method, k + 1)), "rb"))
        incorrect_examples = pickle.load(
            open(
                os.path.join(
                    results_dir, "{}_{}_fold_{}_incorrect_examples.p".format(
                        model, sampling_method, k + 1)), "rb"))
        feature_imp = pickle.load(
            open(
                os.path.join(
                    results_dir, "{}_{}_feature_imp_fold_{}.p".format(
                        model, sampling_method, k + 1)), "rb"))
        top_20 = [
            (label, feature_imp[label])
            for label in sorted(feature_imp, key=feature_imp.get, reverse=True)
        ][:20]
        print(top_20)
        print("correct examples\n")
        for example in correct_examples:
            print("{} features\n".format(country_names[example['label']]))
            feature_dict = {
                label: feature
                for label, feature in zip(feature_labels, example['features'])
            }
            for label, weight in top_20:
                print("{},{}".format(label, feature_dict[label]))
            print("")
        print("\nincorrect examples\n")
        for example in incorrect_examples:
            print("{} features".format(country_names[example['label']]))
            print("prediction was {}\n".format(
                country_names[example['prediction']]))
            feature_dict = {
                label: feature
                for label, feature in zip(feature_labels, example['features'])
            }
            for label, weight in top_20:
                print("{},{}".format(label, feature_dict[label]))
            print("")
Example #8
def train(sampling_method, k_folds, data_dir, results_dir, device='cpu', use_international=False, verbose=True):
    model = 'lgbm'
    start_time = time.time()
    if verbose:
        print("Using device: {}".format(device))
        print("Reading train data in...")
        if use_international:
            print("Using international class.")
    X_train, Y_train, feature_labels = get_train(data_dir, one_hot=False, use_international=use_international)
    categorical_feature = ['age_bucket', 'gender', 'signup_method', 'signup_flow', 'language', 
                           'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked',
                           'signup_app', 'first_device_type', 'first_browser']
    if verbose:
        print("Successfully loaded data")

    print("Starting Cross-Validation with {} folds...".format(k_folds))
    kf = KFold(n_splits=k_folds)
    kf.get_n_splits(X_train)
    params = {
        'task': 'train',
        'objective': 'multiclass',
        'num_class': 12,
        'num_leaves': 31,
        'lambda_l2': 0.1,
        'learning_rate': 0.3,
        'feature_fraction': 0.9,
        'min_child_weight': 1.0,
        'device': device,
        'gpu_device_id': 0,
        'gpu_platform_id': 0,
        'max_bin': 63,
        'verbose': 0
    }
    if use_international:
        params['objective'] = 'binary'
        del params["num_class"]
     
    for k, (train_index, test_index) in enumerate(kf.split(X_train)):
        print("Processing Fold {} out of {}".format(k+1, k_folds))

        X_trainCV, X_testCV = X_train[train_index], X_train[test_index]
        Y_trainCV, Y_testCV = Y_train[train_index], Y_train[test_index]

        if verbose:
            print("{} sampling process started...".format(sampling_method))
        curr_time = time.time()

        if sampling_method == "adasyn":
            X_train_resampled, Y_train_resampled = ADASYN().fit_sample(X_trainCV, Y_trainCV)
        elif sampling_method == "smote":
            X_train_resampled, Y_train_resampled = SMOTE().fit_sample(X_trainCV, Y_trainCV)
        elif sampling_method == "random":
            X_train_resampled, Y_train_resampled = RandomOverSampler().fit_sample(X_trainCV, Y_trainCV)
        elif sampling_method == "smoteenn":
            X_train_resampled, Y_train_resampled = SMOTEENN().fit_sample(X_trainCV, Y_trainCV)
        else:
            X_train_resampled, Y_train_resampled = X_trainCV, Y_trainCV

        if verbose:
            print("Oversampling completed")
            print("Time Taken: {:.2f}".format(time.time()-curr_time))
            print("Size of Oversampled data: {}".format(X_train_resampled.shape))
            print("{} model(s) selected for classification".format(model))

        curr_time = time.time()
        lgb_train = lgb.Dataset(data=X_train_resampled, label=Y_train_resampled, 
                                feature_name=feature_labels, categorical_feature=categorical_feature)
        clf = lgb.train(params, lgb_train, num_boost_round=30) 
        print("Time taken: {:.2f}".format(time.time()-curr_time))
        Y_probs = clf.predict(X_testCV) 
        result = evaluate(Y_testCV, Y_probs)
        print(result)
        feature_imp = clf.feature_importance(importance_type='gain') 
        feature_imp = {label: imp for label, imp in zip(feature_labels, feature_imp)}
        pickle.dump(feature_imp, open(os.path.join(results_dir, "{}_{}_feature_imp_fold_{}.p".format(model, sampling_method, k+1)), "wb" ))
        pickle.dump(result, open(os.path.join(results_dir, "{}_{}_fold_{}.p".format(model, sampling_method, k+1)), "wb" )) 
        save_examples(X_testCV, Y_testCV, Y_probs, model, sampling_method, k+1, save_dir=results_dir)

    print("Training took {:.2f}s.".format(time.time()-start_time))
    print("Finished.")
Example #9
def submission(model,
               sampling_method,
               data_dir,
               results_dir,
               device='cpu',
               verbose=True):
    if verbose:
        print("Using device: {}".format(device))
        print("Reading train data in...")
    if model == 'lgbm':
        X_train, Y_train, feature_labels = get_train(data_dir, one_hot=False)
    else:
        X_train, Y_train, feature_labels = get_train(data_dir)

    X_test = get_test(data_dir)
    train_ids, test_ids = get_ids(data_dir)
    country_names = get_country_names(data_dir)

    if verbose:
        print("Successfully loaded data")

    lgbm_params = {
        'task': 'train',
        'objective': 'multiclass',
        'num_class': 12,
        'num_leaves': 31,
        'learning_rate': 0.3,
        'lambda_l2': 1.0,
        'feature_fraction': 0.9,
        'min_child_weight': 1.0,
        'device': device,
        'gpu_device_id': 0,
        'gpu_platform_id': 0,
        'max_bin': 63,
        'verbose': 0
    }

    if device == 'cpu':
        xgb_params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "tree_method": "hist",
            "colsample_bytree": 0.9,
            "n_jobs": 2,
            "silent": 1
        }
    else:
        xgb_params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "tree_method": "gpu_hist",
            "colsample_bytree": 0.9,
            "gpu_id": 0,
            "max_bin": 16,
            "silent": 1
        }
    if verbose:
        print("{} sampling process started...".format(sampling_method))
    curr_time = time.time()

    if sampling_method == "adasyn":
        X_train_resampled, Y_train_resampled = ADASYN().fit_sample(
            X_train, Y_train)
    elif sampling_method == "smote":
        X_train_resampled, Y_train_resampled = SMOTE().fit_sample(
            X_train, Y_train)
    elif sampling_method == "random":
        X_train_resampled, Y_train_resampled = RandomOverSampler().fit_sample(
            X_train, Y_train)
    elif sampling_method == "smoteenn":
        X_train_resampled, Y_train_resampled = SMOTEENN().fit_sample(
            X_train, Y_train)
    else:
        X_train_resampled, Y_train_resampled = X_train, Y_train

    if verbose:
        print("Oversampling completed")
        print("Time Taken: {:.2f}".format(time.time() - curr_time))
        print("Size of Oversampled data: {}".format(X_train_resampled.shape))
        print("{} selected for classification".format(model))

    curr_time = time.time()
    if model == 'lgbm':
        categorical_feature = [
            'age_bucket', 'gender', 'signup_method', 'signup_flow', 'language',
            'affiliate_channel', 'affiliate_provider',
            'first_affiliate_tracked', 'signup_app', 'first_device_type',
            'first_browser'
        ]
        lgb_train = lgb.Dataset(data=X_train_resampled,
                                label=Y_train_resampled,
                                feature_name=feature_labels,
                                categorical_feature=categorical_feature)
        clf = lgb.train(lgbm_params, lgb_train, num_boost_round=30)
        print("Time taken: {:.2f}".format(time.time() - curr_time))
        Y_probs = clf.predict(X_test)
        order = np.argsort(-Y_probs, axis=1)[:, :5]  # top-5 classes per row, ranked over all classes
    else:
        X_train_xgb = xgb.DMatrix(X_train_resampled,
                                  Y_train_resampled,
                                  feature_names=feature_labels)
        X_test_xgb = xgb.DMatrix(X_test, feature_names=feature_labels)
        clf = xgb.train(xgb_params, X_train_xgb, 30)
        print("Time taken: {:.2f}".format(time.time() - curr_time))
        Y_probs = clf.predict(X_test_xgb)
        order = np.argsort(-Y_probs, axis=1)[:, :5]  # top-5 classes per row, ranked over all classes

    print("Generating submission csv...")
    with open(os.path.join(results_dir, 'submission_{}.csv'.format(model)),
              'w') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['id', 'country'])
        for i in range(len(test_ids)):
            for k in range(5):
                writer.writerow([test_ids[i], country_names[order[i, k]]])
    print("Finished.")
Example #10
"""
This file loads and preprocesses the training data
"""

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from utils import load_glove, clean_str, get_train
import pandas as pd
import numpy as np
import re

# get input
train_df = get_train()

# clean the text, then extract the cleaned columns so the tokenizer sees the cleaned text
train_df['text'] = train_df['text'].apply(clean_str)
texts = train_df['text'].to_list()
tags = train_df['tag'].to_list()

# text2sequence
emb_size = 300
max_features = 6000
maxlen = 50

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)
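
The snippet stops after tokenization. Given the imports above, the likely next steps would pad the sequences and split off a validation set; the following continuation is a sketch of that, not the file's original code:

X = pad_sequences(sequences, maxlen=maxlen)
y = pd.get_dummies(tags).values  # one-hot targets; to_categorical would need integer-coded tags
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)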
Example #11
def train(model,
          sampling_method,
          k_folds,
          data_dir,
          results_dir,
          device='cpu',
          use_international=False,
          verbose=True):
    start_time = time.time()
    if verbose:
        print("Using device: {}".format(device))
        print("Reading train data in...")
        if use_international:
            print("Using international class.")

    X_train, Y_train, feature_labels = get_train(
        data_dir, use_international=use_international)

    if verbose:
        print("Successfully loaded data")

    print("Starting Cross-Validation with {} folds...".format(k_folds))
    kf = KFold(n_splits=k_folds)
    kf.get_n_splits(X_train)
    lgbm_params = {
        'task': 'train',
        'objective': 'multiclass',
        'num_class': 12,
        'num_leaves': 31,
        'learning_rate': 0.1,
        'lambda_l2': 1.0,
        'feature_fraction': 0.9,
        'min_child_weight': 1.0,
        'device': device,
        'gpu_device_id': 0,
        'gpu_platform_id': 0,
        'max_bin': 63,
        'verbose': 0
    }
    if use_international:
        lgbm_params['objective'] = 'binary'
        del lgbm_params["num_class"]
    if device == 'cpu':
        xgb_params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "tree_method": "hist",
            "colsample_bytree": 0.9,
            "n_jobs": 2,
            "silent": 1
        }
    else:
        xgb_params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "tree_method": "gpu_hist",
            "colsample_bytree": 0.9,
            "gpu_id": 0,
            "max_bin": 16,
            "silent": 1
        }
    if use_international:
        xgb_params["objective"] = "binary:logistic"
        del xgb_params["num_class"]

    for k, (train_index, test_index) in enumerate(kf.split(X_train)):
        print("Processing Fold {} out of {}".format(k + 1, k_folds))

        X_trainCV, X_testCV = X_train[train_index], X_train[test_index]
        Y_trainCV, Y_testCV = Y_train[train_index], Y_train[test_index]

        if verbose:
            print("{} sampling process started...".format(sampling_method))
        curr_time = time.time()

        if sampling_method == "adasyn":
            X_train_resampled, Y_train_resampled = ADASYN().fit_sample(
                X_trainCV, Y_trainCV)
        elif sampling_method == "smote":
            X_train_resampled, Y_train_resampled = SMOTE().fit_sample(
                X_trainCV, Y_trainCV)
        elif sampling_method == "random":
            X_train_resampled, Y_train_resampled = RandomOverSampler(
            ).fit_sample(X_trainCV, Y_trainCV)
        elif sampling_method == "smoteenn":
            X_train_resampled, Y_train_resampled = SMOTEENN().fit_sample(
                X_trainCV, Y_trainCV)
        else:
            X_train_resampled, Y_train_resampled = X_trainCV, Y_trainCV

        if verbose:
            print("Oversampling completed")
            print("Time Taken: {:.2f}".format(time.time() - curr_time))
            print("Size of Oversampled data: {}".format(
                X_train_resampled.shape))
            print("{} model(s) selected for classification".format(model))

        curr_time = time.time()
        if model == "tree":
            clf = DecisionTreeClassifier().fit(X_train_resampled,
                                               Y_train_resampled)
            print("Time taken: {:.2f}".format(time.time() - curr_time))
            Y_probs = clf.predict_proba(X_testCV)
            result = evaluate(Y_testCV, Y_probs)
            print(result)
            feature_imp = clf.feature_importances_
            feature_imp = {
                label: imp
                for label, imp in zip(feature_labels, feature_imp)
            }
            pickle.dump(
                feature_imp,
                open(
                    os.path.join(
                        results_dir, "{}_{}_feature_imp_fold_{}.p".format(
                            model, sampling_method, k + 1)), "wb"))
            pickle.dump(
                result,
                open(
                    os.path.join(
                        results_dir,
                        "{}_{}_fold_{}.p".format(model, sampling_method,
                                                 k + 1)), "wb"))
            save_examples(X_testCV,
                          Y_testCV,
                          Y_probs,
                          model,
                          sampling_method,
                          k + 1,
                          save_dir=results_dir)
        elif model == "logistic":
            clf = LogisticRegression(penalty="l2",
                                     C=1).fit(X_train_resampled,
                                              Y_train_resampled)
            print("Time taken: {:.2f}".format(time.time() - curr_time))
            Y_probs = clf.predict_proba(X_testCV)
            assert (np.all(
                np.argmax(Y_probs, axis=1) == clf.predict(X_testCV)))
            result = evaluate(Y_testCV, Y_probs)
            print(result)
            pickle.dump(
                result,
                open(
                    os.path.join(
                        results_dir,
                        "{}_{}_fold_{}.p".format(model, sampling_method,
                                                 k + 1)), "wb"))
            save_examples(X_testCV,
                          Y_testCV,
                          Y_probs,
                          model,
                          sampling_method,
                          k + 1,
                          save_dir=results_dir)
        elif model == "xgb":
            X_train_xgb = xgb.DMatrix(X_train_resampled,
                                      Y_train_resampled,
                                      feature_names=feature_labels)
            X_test_xgb = xgb.DMatrix(X_testCV, feature_names=feature_labels)
            clf = xgb.train(xgb_params, X_train_xgb, 30)
            print("Time taken: {:.2f}".format(time.time() - curr_time))
            Y_probs = clf.predict(X_test_xgb)
            result = evaluate(Y_testCV, Y_probs)
            print(result)
            feature_imp = clf.get_score(importance_type='gain')
            pickle.dump(
                feature_imp,
                open(
                    os.path.join(
                        results_dir, "{}_{}_feature_imp_fold_{}.p".format(
                            model, sampling_method, k + 1)), "wb"))
            pickle.dump(
                result,
                open(
                    os.path.join(
                        results_dir,
                        "{}_{}_fold_{}.p".format(model, sampling_method,
                                                 k + 1)), "wb"))
            save_examples(X_testCV,
                          Y_testCV,
                          Y_probs,
                          model,
                          sampling_method,
                          k + 1,
                          save_dir=results_dir)
        elif model == "lgbm":
            lgb_train = lgb.Dataset(data=X_train_resampled,
                                    label=Y_train_resampled,
                                    feature_name=feature_labels)
            clf = lgb.train(lgbm_params, lgb_train, num_boost_round=30)
            print("Time taken: {:.2f}".format(time.time() - curr_time))
            Y_probs = clf.predict(X_testCV)
            result = evaluate(Y_testCV, Y_probs)
            print(result)
            feature_imp = clf.feature_importance(importance_type='gain')
            feature_imp = {
                label: imp
                for label, imp in zip(feature_labels, feature_imp)
            }
            pickle.dump(
                feature_imp,
                open(
                    os.path.join(
                        results_dir, "{}_{}_feature_imp_fold_{}.p".format(
                            model, sampling_method, k + 1)), "wb"))
            pickle.dump(
                result,
                open(
                    os.path.join(
                        results_dir,
                        "{}_{}_fold_{}.p".format(model, sampling_method,
                                                 k + 1)), "wb"))
            save_examples(X_testCV,
                          Y_testCV,
                          Y_probs,
                          model,
                          sampling_method,
                          k + 1,
                          save_dir=results_dir)
        elif model == "ada":
            clf = AdaBoostClassifier(n_estimators=30).fit(
                X_train_resampled, Y_train_resampled)
            print("Time taken for {}: {:.2f}".format(model,
                                                     time.time() - curr_time))
            Y_probs = clf.predict_proba(X_testCV)
            result = evaluate(Y_testCV, Y_probs)
            print(result)
            feature_imp = clf.feature_importances_
            feature_imp = {
                label: imp
                for label, imp in zip(feature_labels, feature_imp)
            }
            pickle.dump(
                feature_imp,
                open(
                    os.path.join(
                        results_dir, "{}_{}_feature_imp_fold_{}.p".format(
                            model, sampling_method, k + 1)), "wb"))
            pickle.dump(
                result,
                open(
                    os.path.join(
                        results_dir,
                        "{}_{}_fold_{}.p".format(model, sampling_method,
                                                 k + 1)), "wb"))
            save_examples(X_testCV,
                          Y_testCV,
                          Y_probs,
                          model,
                          sampling_method,
                          k + 1,
                          save_dir=results_dir)
        elif model == "forest":
            clf = RandomForestClassifier(n_estimators=30,
                                         n_jobs=2).fit(X_train_resampled,
                                                       Y_train_resampled)
            print("Time taken: {:.2f}".format(time.time() - curr_time))
            Y_probs = clf.predict_proba(X_testCV)
            result = evaluate(Y_testCV, Y_probs)
            print(result)
            feature_imp = clf.feature_importances_
            feature_imp = {
                label: imp
                for label, imp in zip(feature_labels, feature_imp)
            }
            pickle.dump(
                feature_imp,
                open(
                    os.path.join(
                        results_dir, "{}_{}_feature_imp_fold_{}.p".format(
                            model, sampling_method, k + 1)), "wb"))
            pickle.dump(
                result,
                open(
                    os.path.join(
                        results_dir,
                        "{}_{}_fold_{}.p".format(model, sampling_method,
                                                 k + 1)), "wb"))
            save_examples(X_testCV,
                          Y_testCV,
                          Y_probs,
                          model,
                          sampling_method,
                          k + 1,
                          save_dir=results_dir)
        else:
            models = [
                "lgbm", "xgb", "ada", "forest", "tree", "logistic"
            ]  # for category codes instead of one hot, use lgbm_train.py
            for m in models:
                print("Training {}...".format(m))
                curr_time = time.time()
                if m == "xgb":
                    X_train_xgb = xgb.DMatrix(X_train_resampled,
                                              Y_train_resampled,
                                              feature_names=feature_labels)
                    X_test_xgb = xgb.DMatrix(X_testCV,
                                             feature_names=feature_labels)
                    clf = xgb.train(xgb_params, X_train_xgb, 30)
                    print("Time taken for {}: {:.2f}".format(
                        m,
                        time.time() - curr_time))
                    Y_probs = clf.predict(X_test_xgb)
                    result = evaluate(Y_testCV, Y_probs)
                    print(result)
                    feature_imp = clf.get_score(importance_type='gain')
                    pickle.dump(
                        feature_imp,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_feature_imp_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    pickle.dump(
                        result,
                        open(
                            os.path.join(
                                results_dir, "{}_{}_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    save_examples(X_testCV,
                                  Y_testCV,
                                  Y_probs,
                                  m,
                                  sampling_method,
                                  k + 1,
                                  save_dir=results_dir)
                elif m == "lgbm":
                    lgb_train = lgb.Dataset(data=X_train_resampled,
                                            label=Y_train_resampled,
                                            feature_name=feature_labels)
                    clf = lgb.train(lgbm_params, lgb_train, num_boost_round=30)
                    print("Time taken for {}: {:.2f}".format(
                        m,
                        time.time() - curr_time))
                    Y_probs = clf.predict(X_testCV)
                    result = evaluate(Y_testCV, Y_probs)
                    print(result)
                    feature_imp = clf.feature_importance(
                        importance_type='gain')
                    feature_imp = {
                        label: imp
                        for label, imp in zip(feature_labels, feature_imp)
                    }
                    pickle.dump(
                        feature_imp,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_feature_imp_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    pickle.dump(
                        result,
                        open(
                            os.path.join(
                                results_dir, "{}_{}_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    save_examples(X_testCV,
                                  Y_testCV,
                                  Y_probs,
                                  m,
                                  sampling_method,
                                  k + 1,
                                  save_dir=results_dir)
                elif m == "ada":
                    clf = AdaBoostClassifier(n_estimators=30).fit(
                        X_train_resampled, Y_train_resampled)
                    print("Time taken for {}: {:.2f}".format(
                        m,
                        time.time() - curr_time))
                    Y_probs = clf.predict_proba(X_testCV)
                    result = evaluate(Y_testCV, Y_probs)
                    print(result)
                    feature_imp = clf.feature_importances_
                    feature_imp = {
                        label: imp
                        for label, imp in zip(feature_labels, feature_imp)
                    }
                    pickle.dump(
                        feature_imp,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_feature_imp_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    pickle.dump(
                        result,
                        open(
                            os.path.join(
                                results_dir, "{}_{}_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    save_examples(X_testCV,
                                  Y_testCV,
                                  Y_probs,
                                  m,
                                  sampling_method,
                                  k + 1,
                                  save_dir=results_dir)
                elif m == "forest":
                    clf = RandomForestClassifier(n_estimators=30,
                                                 n_jobs=2).fit(
                                                     X_train_resampled,
                                                     Y_train_resampled)
                    print("Time taken for {}: {:.2f}".format(
                        m,
                        time.time() - curr_time))
                    Y_probs = clf.predict_proba(X_testCV)
                    result = evaluate(Y_testCV, Y_probs)
                    print(result)
                    feature_imp = clf.feature_importances_
                    feature_imp = {
                        label: imp
                        for label, imp in zip(feature_labels, feature_imp)
                    }
                    pickle.dump(
                        feature_imp,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_feature_imp_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    pickle.dump(
                        result,
                        open(
                            os.path.join(
                                results_dir, "{}_{}_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    save_examples(X_testCV,
                                  Y_testCV,
                                  Y_probs,
                                  m,
                                  sampling_method,
                                  k + 1,
                                  save_dir=results_dir)
                elif m == "tree":
                    clf = DecisionTreeClassifier(min_samples_split=2,
                                                 min_samples_leaf=5).fit(
                                                     X_train_resampled,
                                                     Y_train_resampled)
                    print("Time taken for {}: {:.2f}".format(
                        m,
                        time.time() - curr_time))
                    Y_probs = clf.predict_proba(X_testCV)
                    result = evaluate(Y_testCV, Y_probs)
                    print(result)
                    feature_imp = clf.feature_importances_
                    feature_imp = {
                        label: imp
                        for label, imp in zip(feature_labels, feature_imp)
                    }
                    pickle.dump(
                        feature_imp,
                        open(
                            os.path.join(
                                results_dir,
                                "{}_{}_feature_imp_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    pickle.dump(
                        result,
                        open(
                            os.path.join(
                                results_dir, "{}_{}_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    save_examples(X_testCV,
                                  Y_testCV,
                                  Y_probs,
                                  m,
                                  sampling_method,
                                  k + 1,
                                  save_dir=results_dir)
                else:
                    clf = LogisticRegression(penalty="l2").fit(
                        X_train_resampled, Y_train_resampled)
                    print("Time taken for {}: {:.2f}".format(
                        m,
                        time.time() - curr_time))
                    Y_probs = clf.predict_proba(X_testCV)
                    result = evaluate(Y_testCV, Y_probs)
                    print(result)
                    pickle.dump(
                        result,
                        open(
                            os.path.join(
                                results_dir, "{}_{}_fold_{}.p".format(
                                    m, sampling_method, k + 1)), "wb"))
                    save_examples(X_testCV,
                                  Y_testCV,
                                  Y_probs,
                                  m,
                                  sampling_method,
                                  k + 1,
                                  save_dir=results_dir)

    print("Training took {:.2f}s.".format(time.time() - start_time))
    print("Finished.")
Example #12
device = torch.device('cuda')
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"

# hyperparameters
FILE_PATH = "training file.txt"
MAX_SEQUENCE_LENGTH = 75
TRAIN_SIZE = 6500
SEED = 666
EPOCHS = 5
LR = 2e-5
BATCH_SIZE = 32
ACCUMULATION_STEPS = 2  # number of batches to accumulate gradients over before an optimizer step
OUTPUT_FILE_NAME = "bert_pytorch.bin"

# convert the original data into a formatted pandas dataframe
train_df = get_train(FILE_PATH)
train_df['text'] = train_df['text'].apply(clean_str)

# convert tags to integer ids; there may be a more elegant way to do this
tags = train_df['tag'].to_list()
tokenizer_tag = Tokenizer()
tokenizer_tag.fit_on_texts(tags)
tags = tokenizer_tag.texts_to_sequences(tags)
tags = np.array(list((map(lambda x: x[0], tags))))

# convert text to bert format sequence
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sequences = convert_lines(train_df["text"].fillna("DUMMY_VALUE"),
                          MAX_SEQUENCE_LENGTH, tokenizer)

# shuffle the data
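
The file ends at the shuffle step. One plausible continuation, using the SEED and TRAIN_SIZE constants defined above and assuming convert_lines returns a NumPy array (a sketch, not the author's original code):

rng = np.random.RandomState(SEED)
perm = rng.permutation(len(sequences))
sequences, tags = sequences[perm], tags[perm]

train_X, train_y = sequences[:TRAIN_SIZE], tags[:TRAIN_SIZE]
val_X, val_y = sequences[TRAIN_SIZE:], tags[TRAIN_SIZE:]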
Example #13
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 14 16:59:19 2021

@author: clara
"""

import utils


label = 0

# parameters of the classifier
SVMparams = [0.006, 0.005]
methods = ['KS_7']


Xtr, ytr = utils.get_train(label)
utils.grid_search(label, Xtr, ytr,
                  SVMparams, methods,
                  train_size=0.75, graph=False)
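
utils.grid_search is project-specific and not shown; its arguments only suggest that SVM regularization values are tried against a 75% train split (the meaning of 'KS_7' is not visible here). Purely as a generic illustration of that kind of search, under those assumptions:

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC


def grid_search_sketch(X, y, c_values, train_size=0.75):
    """Hypothetical sketch: picks the best C on a train split and scores it on the rest."""
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, train_size=train_size, random_state=0)
    search = GridSearchCV(SVC(), param_grid={"C": c_values}, cv=3).fit(X_tr, y_tr)
    return search.best_params_, search.score(X_val, y_val)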