    def __init__(self):
        """
        Constructor
        """
        # preprocessor instance
        self.__pre_process = PreProcessor()
        self.__train, self.__y_train = self.__pre_process.get_train_data()

        # Tuning Parameters
        self.__n_folds = 3  # Cross-validation with k-folds

        # Models
        self.__lasso = make_pipeline(
            RobustScaler(), Lasso(alpha=0.0005, random_state=1))
        self.__ENet = make_pipeline(RobustScaler(), ElasticNet(
            alpha=0.0005, l1_ratio=.9, random_state=3))
        self.__KRR = KernelRidge(
            alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
        self.__GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                                  max_depth=4, max_features='sqrt',
                                                  min_samples_leaf=15, min_samples_split=10,
                                                  loss='huber', random_state=5)
        self.__model_xgb = xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0,
                                            learning_rate=0.05, max_depth=6,
                                            min_child_weight=1.5, n_estimators=7200,
                                            reg_alpha=0.9, reg_lambda=0.6,
                                            subsample=0.2, seed=42, silent=1,
                                            random_state=7)
Example #2
def test_bptt():
    file_path = "\\data\\reddit.csv"
    preprocessor = PreProcessor()
    preprocessor.process_word_index(file_path)
    X_train = preprocessor.X_train[10]
    Y_train = preprocessor.Y_train[10]
    vocabulary_size = BaseConfig.vocabulary_size
    np.random.seed(10)
    rnn = SimpleRNN(vocabulary_size)
    print(rnn.bptt(X_train, Y_train))
def worker(X_train, y_train, X_test, y_test, pm, rs):
    p = PreProcessor(stop_words=pm[0], tf=pm[1], idf=pm[2], scale=pm[3])
    print('Test', p.get_name())

    m = SGDClassifier()
    m.fit(p.fit_training(X_train), y_train)
    y_pred = m.predict(p.fit_test(X_test))

    f1 = metrics.f1_score(y_test, y_pred)
    rs[float(f1)] = p.get_name()
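
# Sketch (not in the original): drive worker() over every on/off combination of
# (stop_words, tf, idf, scale) and keep the best F1 score. Assumes the train/test
# split from above is already in scope.
from itertools import product

def search_preprocessor_params(X_train, y_train, X_test, y_test):
    results = {}  # shared dict passed to worker() as `rs`
    for pm in product([True, False], repeat=4):
        worker(X_train, y_train, X_test, y_test, pm, results)
    best_f1 = max(results)
    print('best F1 %.3f with %s' % (best_f1, results[best_f1]))
    return results
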
    def __init__(self):
        """
        Constructor
        """
        # preprocessor instance
        self.__pre_process = PreProcessor()
        self.__train_data, self.__train_targets = self.__pre_process.get_train_data()
        # print(self.__train_data)

        # Tuning Parameters
        self.__n_folds = 5  # Cross-validation with k-folds
        self.__num_epochs = 400
Example #5
    def __init__(self):
        """
        constructor
        """
        self.__pre_processor = PreProcessor()
        self.__trainer = Trainer()
        self.__predictor = Predictor()
Example #6
class TestPreprocess(unittest.TestCase):
    def setUp(self):
        self.pp = PreProcessor()
        self.pp.parse_file("../data/samples.txt")

    def test_file_parser(self):
        """ Test routine for file parser """
        count = self.pp.trans_count
        self.assertEqual(count, 10, "Sample file size must be 10")
        print("PreProcessor::file_parser")

    def test_unique_counts(self):
        """ Test unique field counters """
        uq = self.pp.unique
        # self.assertEqual(uq['RACE_IS_HISP_RC'], 1, "Must be equal")
        # self.assertEqual(uq['RACE_IS_BLACK'], 1, "Must be equal")
        self.assertEqual(uq['RACE_IS_WHITE'], 8, "Must be equal")
        self.assertEqual(uq['SEX_IS_FEMALE'], 3, "Must be equal")
        self.assertEqual(uq['SEX_IS_MALE'], 7, "Must be equal")
        print("PreProcessor.unique_counts")

    def test_mapping(self):
        """ Test discretize/binarize here """
        import collections
        self.pp._print_transactions()
        trans = self.pp.get_transactions()
        t1 = trans[0]
        is_others = self.pp.mapper.race['OTHERS'] is not None
        self.assertEqual(t1['ID'], 1, "Must be first transaction")
        if is_others:
            self.assertEqual(
                t1['ITEMS'],
                collections.OrderedDict([('RACE_IS_OTHERS', True),
                                         ('SCORE_IS_[44-57]', True),
                                         ('SEX_IS_MALE', True)]),
                "Must be first transaction")
        else:
            self.assertEqual(
                t1['ITEMS'],
                collections.OrderedDict([('RACE_IS_HISP_RC', True),
                                         ('SCORE_IS_[44-57]', True),
                                         ('SEX_IS_MALE', True)]),
                "Must be first transaction")

        print("PreProcessor::mappers")
def main():
    pre_process = PreProcessor()
    X, y = pre_process.get_train_data()

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        shuffle=True,
                                                        random_state=42,
                                                        stratify=y)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
    }

    # validation sets to monitor during training
    evals = [(dtrain, 'train'), (dtest, 'eval')]
    # dictionary that records the training history
    evals_result = {}
    bst = xgb.train(xgb_params,
                    dtrain,
                    num_boost_round=1000,  # use a generous number of rounds
                    evals=evals,
                    evals_result=evals_result,
                    )

    y_pred_proba = bst.predict(dtest)
    y_pred = np.where(y_pred_proba > 0.5, 1, 0)
    acc = accuracy_score(y_test, y_pred)
    print('Accuracy:', acc)

    # plot the training history as line charts
    train_metric = evals_result['train']['logloss']
    plt.plot(train_metric, label='train logloss')
    eval_metric = evals_result['eval']['logloss']
    plt.plot(eval_metric, label='eval logloss')
    plt.grid()
    plt.legend()
    plt.xlabel('rounds')
    plt.ylabel('logloss')
    plt.show()
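
# Sketch (not in the original): the same training call with early stopping, so
# xgb.train() halts once the 'eval' logloss has not improved for 50 rounds.
# Assumes the xgb_params and DMatrix objects built in main() above.
def train_with_early_stopping(xgb_params, dtrain, dtest):
    evals_result = {}
    bst = xgb.train(xgb_params,
                    dtrain,
                    num_boost_round=1000,
                    evals=[(dtrain, 'train'), (dtest, 'eval')],
                    evals_result=evals_result,
                    early_stopping_rounds=50,
                    verbose_eval=False)
    print('best iteration:', bst.best_iteration)
    return bst, evals_result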
Example #8
    def setUp(self):
        pp = PreProcessor()
        pp.parse_file("../data/samples.txt")
        transactions = pp.get_transactions()
        uniques = pp.get_uniques()
        sup = 2.0
        conf = 0.374
        self.apriori = Apriori(transactions, uniques, sup, conf)
class Predictor(object):
    """
    Runs the prediction step
    """
    def __init__(self):
        """
        constructor
        """
        # preprocessor instance
        self.__pre_process = PreProcessor()

    def predict(self, model):
        """
        Predict and write data for submit
        :param model:
        :return:
        """
        test = self.__pre_process.get_test_data()

        predict_data = model.predict(test)

        return predict_data

    @staticmethod
    def write_file_submit(predict_data):
        """
        Write predict data to submit file
        :param predict_data:
        :return:
        """
        submit = pd.read_csv("./Data/sample_submit.csv", header=None)
        submit[1] = predict_data
        print(submit[1])
        now = datetime.datetime.now()
        now_str = '{}_{}_{}_{}_{}'.format(now.year, now.month, now.day,
                                          now.hour, now.minute)
        submit_file = './Data/submit/submit_{}.csv'.format(now_str)
        submit.to_csv(submit_file, header=None, index=None)

    @staticmethod
    def ensemble_results(file1, file2):
        """
        Average the predictions from two result files
        :param file1:
        :param file2:
        :return:
        """
        data1 = pd.read_csv(file1, sep="\t", header=None)
        data2 = pd.read_csv(file2, sep="\t", header=None)
        predict_data = (np.array(data1[1]) + np.array(data2[1])) / 2

        return predict_data
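
# Sketch (not in the original): the end-to-end flow implied by these snippets,
# using a Trainer class like the ones shown later on this page together with Predictor.
def run_pipeline():
    trainer = Trainer()
    model = trainer.fit_model()               # fit on the preprocessed training data
    predictor = Predictor()
    predictions = predictor.predict(model)    # predict on the preprocessed test data
    Predictor.write_file_submit(predictions)  # write ./Data/submit/submit_<timestamp>.csv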
from timeit import default_timer as timer
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from ModelLoader import ModelLoader
from PostProcessor import PostProcessor
from preprocess import PreProcessor
import json

model_loader = ModelLoader()
img = Image.open('/home/bilal/Downloads/foto_van_yosra1.jpg')
# print("---------------------------------")
# print("0 for tinyYolo")
# val = input("give your neural network architecture type: ")

preprocessor = PreProcessor(0, img)
img_data = preprocessor.preprocess()

# print("---------------------------------------------")
# load a simple model
session = model_loader.load_session(1)
begin = timer()

# see the input name and shape
input_name = session.get_inputs()[0].name
"""# print("input name = ", input_name)

input_shape = session.get_inputs()[0].shape
# print("input shape =",input_shape)

input_type = session.get_inputs()[0].type
Example #11
    def setUp(self):
        self.pp = PreProcessor()
        self.pp.parse_file("../data/samples.txt")
Example #12
from preprocess import PreProcessor
import os
import facenet
import numpy as np

raw_img_folder = './data/raw_img'
train_img_folder = './data/train_img'
pp = PreProcessor()

# process all sub folders(people) under raw image folder


def processAllFolder():
    dataset = facenet.get_dataset(raw_img_folder)

    # looping the subfolders for all people
    for subfolder in dataset:
        output_class_dir = os.path.join(train_img_folder, subfolder.name)
        # create training image folder if not exists
        if not os.path.exists(output_class_dir):
            os.makedirs(output_class_dir)

        # align all images for each person and save in train_img folder
        num_image = pp.align_dir(subfolder, output_class_dir)
        print("Aligned %s images from %s folder" % (num_image, subfolder.name))
    print("Image Preprocess All Done!")


# make sure folder is placed under raw_image_folder
# E.g processOneFolder('Jason Jia')
def processOneFolder(folder_name):
        tf = bool(row[3])
        idf = bool(row[4])
        scale = bool(row[5])
        params[category] = [stop_words, tf, idf, scale]


with open(RESULT, 'w') as out:
    rows = csv.writer(out)
    for category in categories:
        source = os.path.join(TRAINING_CATEGORIES, category)
        if os.path.exists(source):

            X_train, y_train = load(source, category)
            pms = params[category]
            p = PreProcessor(stop_words=pms[0],
                             tf=pms[1],
                             idf=pms[2],
                             scale=pms[3])

            X_train = p.fit_training(X_train)

            print('.fit()')
            m = SGDClassifier()
            m.fit(X_train, y_train)

            print('.predict()')
            for test_id, text in test_entries():
                text = p.fit_test(np.array([text]))
                result = [category, test_id, m.predict(text)[0]]
                rows.writerow(result)
Example #14
from preprocess import PreProcessor
import numpy as np

pre_processor = PreProcessor()


def load_data(n=0):
    tranning_sets, validation_sets, test_sets = pre_processor.read_dataset()

    train_set = [i for i in tranning_sets[n]]
    validation_set = [i for i in validation_sets[n]]
    test_set = [i for i in test_sets]

    train_data = [i[0] for i in train_set]
    train_label = [i[1] for i in train_set]
    validation_data = [i[0] for i in validation_set]
    validation_label = [i[1] for i in validation_set]
    test_data = [i[0] for i in test_set]
    test_label = [i[1] for i in test_set]

    # transform to np array
    train_data = np.array(train_data)
    train_label = np.array(train_label)
    validation_data = np.array(validation_data)
    validation_label = np.array(validation_label)
    test_data = np.array(test_data)
    test_label = np.array(test_label)

    return ((train_data, train_label), (validation_data, validation_label),
            (test_data, test_label))
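
# Sketch (not in the original): unpack the three splits returned by load_data()
# for the first fold and check their shapes.
(train_d, train_l), (val_d, val_l), (test_d, test_l) = load_data(n=0)
print(train_d.shape, val_d.shape, test_d.shape)
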
    def __init__(self):
        """
        constructor
        """
        # preprocessor instance
        self.__pre_process = PreProcessor()
Example #16
    parser = argparse.ArgumentParser()

    parser.add_argument('-data',
                        help='location of dataset',
                        default='data/out_split.pk')
    parser.add_argument('-We',
                        help='location of word embeddings',
                        default='data/glove.6B.300d.txt')
    parser.add_argument('-model',
                        help='model to run: nbow or dan',
                        default='nbow')
    parser.add_argument('-wd', help='use word dropout or not', default='y')

    args = vars(parser.parse_args())

    pp = PreProcessor(args['data'], args['We'])
    pp.tokenize()
    data, labels, data_val, labels_val = pp.make_data()

    embedding_matrix = pp.get_word_embedding_matrix(embedding_dim)

    model = Sequential()

    if args['We'] == "rand":
        model.add(
            Embedding(len(pp.word_index) + 1,
                      embedding_dim,
                      input_length=pp.MAX_SEQUENCE_LENGTH,
                      trainable=False))
    else:
        # (completed from context, keyword choices are an assumption) initialise
        # the layer with the pretrained GloVe weights built above
        model.add(
            Embedding(len(pp.word_index) + 1,
                      embedding_dim,
                      weights=[embedding_matrix],
                      input_length=pp.MAX_SEQUENCE_LENGTH,
                      trainable=False))
class Trainer(object):
    """
    Train Class
    """
    def __init__(self):
        """
        Constructor
        """
        # preprocessor instance
        self.__pre_process = PreProcessor()
        self.__train_data, self.__train_targets = self.__pre_process.get_train_data()
        # print(self.__train_data)

        # Tuning Parameters
        self.__n_folds = 5  # Cross-validation with k-folds
        self.__num_epochs = 400

    def build_model(self):
        """
        Build the NN model
        :return:
        """
        # NN model
        model = models.Sequential()
        model.add(
            layers.Dense(256,
                         activation='relu',
                         kernel_initializer='normal',
                         input_shape=(self.__train_data.shape[1], )))
        model.add(
            layers.Dense(256, activation='relu', kernel_initializer='normal'))
        model.add(
            layers.Dense(256, activation='relu', kernel_initializer='normal'))
        model.add(
            layers.Dense(1, kernel_initializer='normal', activation='linear'))
        model.compile(optimizer='adam', loss="mse", metrics=['mape'])
        model.summary()
        return model

    def fit_model(self):
        """
        Fit the model
        :return:
        """
        # build the (compiled) Keras model
        model = self.build_model()

        # fit the model in silent mode (verbose=0)
        model.fit(self.__train_data,
                  self.__train_targets,
                  epochs=self.__num_epochs,
                  batch_size=16,
                  verbose=0)

        return model

    def evaluate_cross(self):
        """
        Cross-validation evaluation
        :return:
        """
        all_scores = []
        num_val_samples = int(len(self.__train_data) / self.__n_folds)

        for i in range(self.__n_folds):
            print('processing fold #  {}'.format(i))

            # prepare the validation data
            val_data = self.__train_data[i * num_val_samples:(i + 1) *
                                         num_val_samples]
            val_targets = self.__train_targets[i * num_val_samples:(i + 1) *
                                               num_val_samples]

            # prepare the training data
            partial_train_data = np.concatenate([
                self.__train_data[:i * num_val_samples],
                self.__train_data[(i + 1) * num_val_samples:]
            ],
                                                axis=0)
            partial_targets_data = np.concatenate([
                self.__train_targets[:i * num_val_samples],
                self.__train_targets[(i + 1) * num_val_samples:]
            ],
                                                  axis=0)

            # build the (compiled) Keras model
            model = self.build_model()

            # fit the model in silent mode (verbose=0)
            model.fit(partial_train_data,
                      partial_targets_data,
                      epochs=self.__num_epochs,
                      batch_size=16,
                      verbose=0)

            # evaluate the model on the validation data
            val_mse, val_mape = model.evaluate(val_data,
                                               val_targets,
                                               verbose=0)
            all_scores.append(val_mape)

        print(all_scores)

        return np.mean(all_scores)

    def visualize_k_folds(self):
        """
        Visualize the k-fold cross-validation runs
        :return:
        """
        all_mape_histories = []
        num_val_samples = int(len(self.__train_data) / self.__n_folds)

        for i in range(self.__n_folds):
            print('processing fold #  {}'.format(i))

            # prepare the validation data
            val_data = self.__train_data[i * num_val_samples:(i + 1) *
                                         num_val_samples]
            val_targets = self.__train_targets[i * num_val_samples:(i + 1) *
                                               num_val_samples]

            # prepare the training data
            partial_train_data = np.concatenate([
                self.__train_data[:i * num_val_samples],
                self.__train_data[(i + 1) * num_val_samples:]
            ],
                                                axis=0)
            partial_targets_data = np.concatenate([
                self.__train_targets[:i * num_val_samples],
                self.__train_targets[(i + 1) * num_val_samples:]
            ],
                                                  axis=0)

            # build the (compiled) Keras model
            model = self.build_model()

            # fit the model in silent mode (verbose=0)
            history = model.fit(partial_train_data,
                                partial_targets_data,
                                validation_data=(val_data, val_targets),
                                epochs=self.__num_epochs,
                                batch_size=16,
                                verbose=0)

            # validation MAPE history recorded for this fold
            mape_history = history.history[
                'val_mean_absolute_percentage_error']
            all_mape_histories.append(mape_history)

        print(all_mape_histories)

        average_mape_history = [
            np.mean([x[i] for x in all_mape_histories])
            for i in range(self.__num_epochs)
        ]

        plt.plot(range(1, len(average_mape_history) + 1), average_mape_history)
        plt.xlabel('Epochs')
        plt.ylabel('Validation MAPE')
        plt.show()
class Trainer(object):
    """
    Train Class
    """

    def __init__(self):
        """
        Constructor
        """
        # preprocessor instance
        self.__pre_process = PreProcessor()
        self.__train, self.__y_train = self.__pre_process.get_train_data()

        # Tuning Parameters
        self.__n_folds = 3  # Cross-validation with k-folds

        # Models
        self.__lasso = make_pipeline(
            RobustScaler(), Lasso(alpha=0.0005, random_state=1))
        self.__ENet = make_pipeline(RobustScaler(), ElasticNet(
            alpha=0.0005, l1_ratio=.9, random_state=3))
        self.__KRR = KernelRidge(
            alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
        self.__GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                                  max_depth=4, max_features='sqrt',
                                                  min_samples_leaf=15, min_samples_split=10,
                                                  loss='huber', random_state=5)
        self.__model_xgb = xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0,
                                            learning_rate=0.05, max_depth=6,
                                            min_child_weight=1.5, n_estimators=7200,
                                            reg_alpha=0.9, reg_lambda=0.6,
                                            subsample=0.2, seed=42, silent=1,
                                            random_state=7)
        # self.__model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=5,
        #                                      learning_rate=0.05, n_estimators=720,
        #                                      max_bin=55, bagging_fraction=0.8,
        #                                      bagging_freq=5, feature_fraction=0.2319,
        #                                      feature_fraction_seed=9, bagging_seed=9,
        #                                      min_data_in_leaf=6, min_sum_hessian_in_leaf=11)

    def get_scores(self):
        """
        Compute cross-validation scores for each base model
        :return:
        """
        score = self.rmsle_cv(self.__lasso)
        print("\nLasso score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        score = self.rmsle_cv(self.__ENet)
        print("ElasticNet score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        score = self.rmsle_cv(self.__KRR)
        print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        score = self.rmsle_cv(self.__GBoost)
        print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        score = self.rmsle_cv(self.__model_xgb)
        print("Xgboost score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()))
        # score = self.rmsle_cv(self.__model_lgb)
        # print("LGBM score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

    def mean_absolute_percentage_error(self, y_true, y_pred):
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    def adaboost(self):
        """
        Fit an AdaBoost baseline and print its training R^2 score
        :return:
        """
        regr = AdaBoostRegressor(random_state=0, n_estimators=100)
        regr.fit(self.__train, self.__y_train)
        score = regr.score(self.__train, self.__y_train)
        print(score)
        return regr

    def fit_model(self):
        """
        Fit the model
        :return:
        """
        # model = self.train_model(self.__train, self.__y_train)
        test_size = 1/self.__n_folds
        # Split the training data into an extra set of test
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(self.__train, 
                                                                                    self.__y_train,
                                                                                    test_size = test_size,
                                                                                    random_state=0)
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
                                0.3, 0.6, 1],
                        max_iter=50000, cv=10)
        # lasso = RidgeCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
        #                         0.3, 0.6, 1], cv=10)

        # lasso = ElasticNetCV(cv=10, random_state=0)

        # lasso.fit(x_train_split, y_train_split)
        # y_predicted = lasso.predict(X=x_test_split)
        # mape = self.mean_absolute_percentage_error(y_test_split, y_predicted)
        # print(mape)

        # create the xgboost model
        reg = xgb.XGBRegressor()

        # hyperparameter search
        reg_cv = GridSearchCV(reg, {'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200]}, verbose=1)
        reg_cv.fit(x_train_split, y_train_split)
        print(reg_cv.best_params_, reg_cv.best_score_)
        # retrain with the best parameters
        reg = xgb.XGBRegressor(**reg_cv.best_params_)
        reg.fit(x_train_split, y_train_split)


        # save / load the trained model
        # import pickle
        # pickle.dump(reg, open("model.pkl", "wb"))
        # reg = pickle.load(open("model.pkl", "rb"))

        # evaluate the trained model
        pred_train = reg.predict(x_train_split)
        pred_test = reg.predict(x_test_split)
        # print(self.mean_absolute_percentage_error(y_train_split, pred_train))
        print(self.mean_absolute_percentage_error(y_test_split, pred_test))

        # import pandas as pd
        # import matplotlib.pyplot as plt
        # importances = pd.Series(reg.feature_importances_, index = boston.feature_names)
        # importances = importances.sort_values()
        # importances.plot(kind = "barh")
        # plt.title("importance in the xgboost Model")
        # plt.show()
        return reg

    def train_model(self, X, y):
        """ Performs grid search over the 'max_depth' parameter for a
            decision tree regressor trained on the input data [X, y]. """

        # Create cross-validation sets from the training data
        cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)

        # Create a decision tree regressor object
        regressor = DecisionTreeRegressor()

        # Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
        params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

        # Transform 'performance_metric' into a scoring function using 'make_scorer'
        scoring_fnc = make_scorer(self.r2_score)

        # Create the grid search cv object --> GridSearchCV()
        grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)

        # Fit the grid search object to the data to compute the optimal model
        grid = grid.fit(X, y)

        # Return the optimal model after fitting the data
        return grid.best_estimator_

    def rmsle_cv(self, model):
        """
        Calculate cross-validated RMSE
        :return:
        """
        # pass the shuffled KFold splitter itself; calling get_n_splits() here
        # would hand cross_val_score a plain int and drop shuffle/random_state
        kf = KFold(self.__n_folds, shuffle=True, random_state=42)
        rmse = np.sqrt(-cross_val_score(model, self.__train.values, self.__y_train,
                                        scoring="neg_mean_squared_error", cv=kf))
        return rmse

    @staticmethod
    def r2_score(y_true, y_predict):
        """ Calculates and returns the performance score between
                true (y_true) and predicted (y_predict) values based on the metric chosen. """

        score = r2_score(y_true, y_predict)

        # Return the score
        return score
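
# Sketch (the original entry point is not shown): score the base models with
# cross-validated RMSE, then fit the grid-searched xgboost model.
if __name__ == '__main__':
    trainer = Trainer()
    trainer.get_scores()              # prints mean/std RMSE for each base model
    best_model = trainer.fit_model()  # grid-search xgboost on a hold-out split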