Example #1
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


def get_model(input_dim):
    logger.debug(f'The input size for DNN is: {input_dim}')
    lr = 0.0001
    model = Sequential()
    # One hidden layer at 1.5x the input width; the commented-out variant
    # below added a second block with LeakyReLU/BatchNormalization/Dropout.
    model.add(Dense(int(input_dim * 1.5), input_shape=(input_dim,)))
    # model.add(Dropout(0.5))
    # model.add(Dense(100))
    # model.add(LeakyReLU(alpha=0.01))
    # model.add(BatchNormalization())
    # model.add(Dropout(0.5))
    model.add(Dense(1, kernel_initializer='normal'))

    # model.compile(optimizer="sgd", loss="mse")
    adam = Adam(lr=lr)  # old-style Keras argument name (learning_rate in tf.keras 2.x)
    model.compile(loss='mean_squared_error', optimizer=adam)
    model.summary()
    return model
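A minimal smoke test for get_model on synthetic data, assuming the old-style Keras API used throughout these examples and a configured module-level logger:

import logging
import numpy as np

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

X = np.random.rand(32, 10).astype('float32')   # 32 samples, 10 features
y = np.random.rand(32).astype('float32')

model = get_model(input_dim=10)                # builds a 15 -> 1 regressor
model.fit(X, y, epochs=1, batch_size=8, verbose=0)
print(model.predict(X[:2]).ravel())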
Example #2
def get_feature_label_dnn(version, ensemble):
    from code_felix.tiny.util import get_stable_feature
    feature_label = get_stable_feature(version)

    if ensemble:
        file_list = [
            #0.2
            ('xgb_age', './sub/baseline_1.999298_3194_xgb_age_.h5'),
            ('xgb', './sub/baseline_2.606958_2666_xgb_1632_.h5'),
            ('lgb', './sub/baseline_2.61447_294_lgb_.h5'),

            # Previously-used inputs, kept for reference:
            # ('sex_xgb', './output/best/baseline_0.653098_2794_xgb_sex_0.95.h5'),
            # ('lgb', './output/best/baseline_2.62099_287_lgb_min_data_in_leaf1472.h5'),
            # ('dnn', './output/best/baseline_2.613028_2631_xgb_1615_svd_cmp0.h5'),
        ]
        feature_label = ensemble_feature_other_model(feature_label, file_list)

    feature_label['sex'] = feature_label['sex'].astype('category')
    feature_label['age'] = feature_label['age'].astype('category')
    feature_label['sex_age'] = feature_label['sex_age'].astype('category')

    logger.debug(f"type of sex is {feature_label['sex'].dtype}")
    return feature_label
Example #3
def get_feature(hours=4):
    raw = get_raw_input()
    if 'time' in raw:
        del raw['time']

    logger.debug(raw.shape)
    report = get_report()
    logger.debug(f"The shape of the report is {report.shape}")

    from code_felix.feature.config import time_interval
    # Number of raw rows that make up an `hours`-long window.
    gap = 3600 * hours // time_interval

    columns = raw.columns
    final_columns = []
    for i in range(0, gap):
        final_columns.extend([f'{item}#{i}' for item in columns])

    feature = np.zeros((len(report), len(final_columns)))

    for i, end in enumerate(report.index):
        # Flatten the `gap` rows ending at `end` (inclusive) into one vector.
        begin = end - gap + 1
        feature[i] = np.round(raw.loc[begin:end, :].values.flatten(), 6)


    return pd.DataFrame(feature, columns=final_columns,index=report.index)
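The loop above flattens a rolling window of gap consecutive raw rows into one feature vector per report row. A self-contained sketch of the same windowing on toy data (all names here are illustrative):

import numpy as np
import pandas as pd

raw = pd.DataFrame(np.arange(12).reshape(6, 2), columns=['a', 'b'])
gap = 3                        # window length in rows
report_index = [2, 3, 4, 5]    # window end positions (label-based, inclusive)

cols = [f'{c}#{i}' for i in range(gap) for c in raw.columns]
feat = np.zeros((len(report_index), len(cols)))
for i, end in enumerate(report_index):
    feat[i] = raw.loc[end - gap + 1:end, :].values.flatten()

print(pd.DataFrame(feat, columns=cols, index=report_index))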
Example #4
    def predict(self, X_test):
        classifier = models.load_model(self.best_model)
        y_pred = classifier.predict(X_test)
        logger.debug(f"y_pred:{y_pred.shape}")
        # Predictions come back as (n, 1); return a flat vector.
        return y_pred[:, 0]
Example #5
    def transform(self, X, y=None):
        logger.debug("Try to fill %s with value %s" % (X.name, self.fill))
        # Treat +/-inf as missing before filling.
        X = X.replace([numpy.inf, -numpy.inf], numpy.nan)
        return X.fillna(self.fill)
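A behavior sketch for transform: infinities are mapped to NaN first, then every NaN is filled with the value learned in fit(). SeriesImputer is assumed to be the class these methods belong to, and fill is forced by hand here instead of calling fit:

import numpy
import pandas as pd

s = pd.Series([1.0, numpy.inf, numpy.nan, 4.0], name='gap_avg')

imp = SeriesImputer()
imp.fill = 0                       # normally set by fit(); forced for the demo
print(imp.transform(s).tolist())   # [1.0, 0.0, 0.0, 4.0]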
Example #6
def _reduce_mem_usage(df, verbose=True):
    if isinstance(df, pd.Series):
        return df
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    mem = df.memory_usage()
    mem = mem if isinstance(mem, (int, float)) else mem.sum()
    start_mem = mem / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # float16 keeps only ~3 significant digits, so this is lossy.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    mem = df.memory_usage()
    mem = mem if isinstance(mem, (int, float)) else mem.sum()
    end_mem = mem / 1024**2
    if verbose:
        logger.debug(
            'Mem. usage decreased from {:7.2f} to {:7.2f} MB ({:.1f}% reduction)'
            .format(start_mem, end_mem,
                    100 * (start_mem - end_mem) / start_mem))
    return df
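A quick way to see the downcasting at work, on synthetic columns; note again that the float16 conversion trades precision for memory:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'small_int': np.random.randint(0, 100, 10_000),   # fits in int8
    'big_int': np.random.randint(0, 10**7, 10_000),   # needs int32
    'ratio': np.random.rand(10_000),                  # fits in float16
})
df = _reduce_mem_usage(df)
print(df.dtypes)   # expected: int8, int32, float16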
Example #7
def get_raw_input():
    # Folder names are Chinese for "production parameter record table
    # (fixed time interval)", April/May 2018.
    rootdir = ['./input/fix_interval/生产参数记录表(固定时间间隔)-2018年4月',
               './input/fix_interval/生产参数记录表(固定时间间隔)-2018年5月',
               ]

    df_month_list = []
    drop_time = False
    for cur_dir, month in zip(rootdir, [4, 5]):
        files = [f'{cur_dir}/{item}' for item in os.listdir(cur_dir)
                 if 'csv' in item]
        df_list = []
        for file in files:
            file_name = os.path.basename(file)
            logger.debug(f'Get {file_name} based on {file}')
            file_sn_dict = get_file_order()
            file_sn = file_sn_dict[file_name]
            df = pd.read_csv(file, header=None)
            df.columns = ['time_sn', 'time', f'val#{str(file_sn).rjust(2, "0")}']
            if month == 5:
                # Shift May's sequence numbers past April's
                # (518400 = 30 days of samples, assuming a 5-second
                # time_interval).
                df.time_sn = df.time_sn + 518400
            df.set_index('time_sn', inplace=True)

            # Keep the human-readable time column only once.
            if drop_time:
                df.drop(['time'], axis=1, inplace=True)
            drop_time = True

            df_list.append(df)
        one_month = pd.concat(df_list, axis=1)
        df_month_list.append(one_month)

    all_df = pd.concat(df_month_list)
    all_df.sort_index(axis=1, inplace=True)
    return all_df
Example #8
    def fit(self, X_train, y_train, X_valid, y_valid):
        check_best = ModelCheckpoint(filepath=self.best_model,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min')

        early_stop = EarlyStopping(
            monitor='val_loss',
            verbose=1,
            patience=20,
        )

        reduce = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.2,
                                   patience=10,
                                   verbose=1,
                                   mode='min')

        model = self.get_dnn_model(self.model_type, self.input_dim,
                                   self.dropout)
        history = model.fit(
            X_train,
            y_train,
            validation_data=(X_valid, y_valid),
            callbacks=[check_best, early_stop, reduce],
            batch_size=128,
            #steps_per_epoch= len(X_test)//128,
            epochs=100,
            verbose=1,
        )

        best_epoch = np.array(history.history['val_loss']).argmin() + 1
        best_score = np.array(history.history['val_loss']).min()
        logger.debug(
            f'Best model saved to:{self.best_model}, input:{X_train.shape}, best_epoch:{best_epoch}, best_score:{best_score}'
        )
Example #9
    def fit(self, X, y=None):
        # logger.debug(X.name, X.dtype)
        # logger.debug("SeriesImputer:%s for %s:%s" % (id(self), X.name, X.dtype) )
        if X.name == 'renewed_yorn':
            self.fill = 'null'
        elif 'datetime' in X.dtype.name:
            logger.warning(
                "%s is a datetime column; its missing values are left as NaN" % X.name)
            self.fill = numpy.nan
        elif X.dtype.name in ['object', 'category']:
            if len(X.value_counts()) == 0:
                self.fill = numpy.nan
            else:
                self.fill = X.value_counts().index[0]
            logger.debug("Fill %s with value:%s, type:%s, count:%d" %
                         (X.name, self.fill, X.dtype, len(X.unique())))
        else:
            mean = X.mean()
            # Guard against an all-NaN column, whose mean is NaN.
            self.fill = 0 if numpy.isnan(mean) else mean
            logger.debug("Fill %s with value:%s, type:%s" %
                         (X.name, self.fill, X.dtype))

        return self
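An end-to-end sketch of the imputer pair from Examples #5 and #9: fit() learns a per-column fill value (the mode for object/category columns, the mean for numeric ones), and transform() applies it:

import numpy
import pandas as pd

cat = pd.Series(['a', 'a', 'b', numpy.nan], name='color')
num = pd.Series([1.0, numpy.nan, 3.0], name='price')

imp = SeriesImputer()
print(imp.fit(cat).transform(cat).tolist())   # ['a', 'a', 'b', 'a']
print(imp.fit(num).transform(num).tolist())   # [1.0, 2.0, 3.0]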
Example #10
def convert_missing(sample):
    imputer = defaultdict(SeriesImputer)
    train_temp = sample[sample.label_del == 'train']

    logger.debug("Begin to fill the missing data based on %s/%s rows" %
                 (len(train_temp), len(sample)))

    logger.debug("Try to get the fill value based on %d training rows" %
                 len(train_temp))
    # Note: the `reduce` argument was removed from DataFrame.apply in later
    # pandas releases; use `result_type` instead on newer versions.
    train_temp = train_temp.apply(lambda x: imputer[x.name].fit(x),
                                  reduce=False)

    del train_temp

    temp_list = sorted(imputer.items(),
                       key=lambda item: sample[item[0]].dtype.name)

    sample = sample.apply(lambda x: imputer[x.name].transform(x), reduce=False)
    logger.debug("Filled the missing values for %d columns" %
                 len(temp_list))

    logger.debug("Finished cleaning the data: " + str(sample.shape))
    return sample
Example #11
def learning(model, Xtrain, y, Xtest, label_name, number_of_folds=5, seed=777):
    train_index = Xtrain.index
    test_index = Xtest.index

    Xtrain = Xtrain.reset_index(drop=True)
    Xtest = Xtest.reset_index(drop=True)
    y = y.reset_index(drop=True)
    y_train = y

    logger.debug(f'train:{Xtrain.shape}, label:{y.shape}, test:{Xtest.shape}')
    print('Model: %s' % model)

    """ Each model iteration """
    train_predict_y = np.zeros((len(y)))
    test_predict_y = np.zeros((Xtest.shape[0]))
    learn_loss = 0.
    """ Important to set seed """

    tmp_model = './output/model/checkpoint/dnn_best_tmp.hdf5'
    check_best = ModelCheckpoint(filepath=tmp_model,
                                 monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min')

    early_stop = EarlyStopping(monitor='val_loss', verbose=1,
                               patience=100,
                               )
    reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                               patience=30, verbose=1, mode='min')

    logger.debug(f'y_train.shape:{y_train.shape}')



    from sklearn.model_selection import KFold
    skf = KFold(n_splits = number_of_folds ,shuffle=True, random_state=seed)
    """ Each fold cross validation """

    for i, (train_idx, val_idx) in enumerate(skf.split(Xtrain)):
        logger.debug(f'Fold#{i + 1}')

        history = model.fit(Xtrain.values[train_idx], y[train_idx],
                            validation_data=(Xtrain.values[val_idx], y[val_idx]),
                            #callbacks=[check_best, early_stop, reduce],
                            batch_size=8,
                            epochs=2,
                            verbose=1,
                            )

        best_epoch = np.array(history.history['val_loss']).argmin() + 1
        best_score = np.array(history.history['val_loss']).min()
        logger.debug(f"Fold#{i + 1} reached {best_score} at epoch {best_epoch}")

        # Keras predict returns shape (n, 1); flatten before assigning into
        # the 1-D out-of-fold array.
        scoring = model.predict(Xtrain.values[val_idx]).ravel()
        # Out-of-fold prediction
        train_predict_y[val_idx] = scoring
        l_score = mean_squared_error(y[val_idx], scoring)
        learn_loss += l_score
        logger.debug('Fold %d score: %f' % (i + 1, l_score))

        test_predict_y = test_predict_y + model.predict(Xtest.values).ravel()

    test_predict_y = test_predict_y / number_of_folds

    avg_loss = round(learn_loss / number_of_folds, 6)
    print('average val mse: %f' % avg_loss)
    # Note: there is no refit on the full training set here; the test
    # prediction is the average of the per-fold models.

    # np.save('./output/xgb_train.np', train_predict_y)
    # np.save('./output/xgb_test.np', test_predict_y)


    ###Save result for ensemble
    train_bk = pd.DataFrame(train_predict_y,
                            index=train_index,
                            columns=[label_name]
                            )

    test_bk = pd.DataFrame(test_predict_y,
                           index=test_index,
                           columns=[label_name]
                           )

    # Key the dict by label_name directly; the original {'label': y} together
    # with columns=[label_name] would yield an all-NaN column.
    label_bk = pd.DataFrame({label_name: y.values},
                            index=train_index,
                            )
    model_name = type(model).__name__
    save_result_for_ensemble(f'kfold_{label_name}_{model_name}_{avg_loss}',
                             label_name=label_name,
                             train=train_bk,
                             test=test_bk,
                             label=label_bk,
                             )
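A hypothetical driver for learning(), pairing it with the get_model builder from Example #1 on synthetic data. It assumes sklearn's mean_squared_error and the project's save_result_for_ensemble are importable, since the function body calls both:

import numpy as np
import pandas as pd

X = pd.DataFrame(np.random.rand(100, 10))
y = pd.Series(np.random.rand(100))
X_unseen = pd.DataFrame(np.random.rand(20, 10))

model = get_model(input_dim=10)   # from Example #1
learning(model, X, y, X_unseen, label_name='age', number_of_folds=3)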
Example #12
def get_all_file(path):
    logger.debug(f'Try to read files from: {path}')
    file_list = [file for file in os.listdir(path) if file.endswith('.h5')]
    return file_list
Example #13
def get_train(hours_gap):
    # Keep only the first 144 rows of the feature matrix as the training set.
    train = get_feature(hours_gap)[:144]
    logger.debug(f'The size of train is:{train.shape}')
    return train
Example #14
def get_file_order():
    from code_felix.feature.config import file_order
    # Map each file name to its position in the configured order.
    file_sn = dict(zip(file_order, range(len(file_order))))
    file_sn_list = sorted(file_sn.items(), key=lambda val: val[1])
    logger.debug(file_sn_list)
    return file_sn
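An illustration of the filename-to-serial mapping, with a stand-in for the project's file_order config list:

file_order = ['temp.csv', 'pressure.csv', 'flow.csv']   # stand-in config

file_sn = dict(zip(file_order, range(len(file_order))))
print(file_sn)   # {'temp.csv': 0, 'pressure.csv': 1, 'flow.csv': 2}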
Example #15
def train_dnn(dropout, lr, ensemble):
    #dropout = 0.7
    version = '1011'

    args = locals()
    logger.debug(f'Run train dnn:{args}')

    feature_label = get_feature_label_dnn(version, ensemble)

    test = feature_label[feature_label['sex'].isnull()]
    train = feature_label[feature_label['sex'].notnull()]

    logger.debug(f"type of sex is {feature_label['sex'].dtype}")
    X_train, X_test, y_train, y_test = split_train(train)

    input_dim = X_train.shape[1]

    logger.debug(
        f'X_train:{X_train.shape}, y_train:{y_train.shape}, scoring set:{test.shape}, input_dim:{input_dim}'
    )

    model = Sequential()
    model.add(Dense(1200, input_shape=(input_dim,)))
    #model.add(Activation('sigmoid'))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(dropout))

    model.add(Dense(100))
    model.add(LeakyReLU(alpha=0.01))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))

    model.add(Dense(15))
    model.add(LeakyReLU(alpha=0.01))

    # 22 output classes: 2 sexes x 11 age buckets (see the column list below).
    model.add(Dense(22))
    model.add(Activation('softmax'))

    # model.compile(optimizer="sgd", loss="mse")
    adam = Adam(lr=lr)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=adam,
    )
    model.summary()

    tmp_model = './model/checkpoint/dnn_best_tmp.hdf5'
    check_best = ModelCheckpoint(filepath=tmp_model,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='min')

    early_stop = EarlyStopping(
        monitor='val_loss',
        verbose=1,
        patience=100,
    )
    reduce = ReduceLROnPlateau(monitor='val_loss',
                               factor=0.2,
                               patience=30,
                               verbose=1,
                               mode='min')

    from keras.utils import np_utils
    logger.debug(f'y_train.shape:{y_train.shape}')
    history = model.fit(
        X_train,
        np_utils.to_categorical(y_train),
        validation_data=(X_test, np_utils.to_categorical(y_test)),
        callbacks=[check_best, early_stop, reduce],
        batch_size=128,
        #steps_per_epoch= len(X_test)//128,
        epochs=50000,
        verbose=1,
    )

    best_epoch = np.array(history.history['val_loss']).argmin() + 1
    best_score = np.array(history.history['val_loss']).min()

    classifier = models.load_model(tmp_model)

    pre_x = test.drop(['sex', 'age', 'sex_age', 'device'], axis=1)
    logger.debug(f'Test:{test.shape}, pre_x:{pre_x.shape}')

    logger.debug(f'pre_x.values:{pre_x.values.shape}')
    sub = pd.DataFrame(classifier.predict_proba(pre_x.values))

    sub.columns = train.sex_age.cat.categories
    sub['DeviceID'] = test['device'].values
    sub = sub[[
        'DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7',
        '1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', '2-5', '2-6',
        '2-7', '2-8', '2-9', '2-10'
    ]]

    logger.debug(f'best_score(his):{best_score} @ epoch:{best_epoch}')

    model_file = f'./model/checkpoint/dnn_best_{best_score}_{args}_epoch_{best_epoch}.hdf5'
    model.save(model_file, overwrite=True)

    print(
        f'=============Final train feature({len(feature_label.columns)}):\n{list(feature_label.columns)}'
    )

    # file = f'./sub/baseline_dnn_{best_score}_{args}_epoch_{best_epoch}.csv'
    # file = replace_invalid_filename_char(file)
    # logger.info(f'sub file save to {file}')
    # sub = round(sub, 10)
    # sub.to_csv(file, index=False)

    ###Save result for ensemble
    train_bk = pd.DataFrame(classifier.predict_proba(
        train.drop(['sex', 'age', 'sex_age', 'device'], axis=1)),
                            index=train.device,
                            columns=train.sex_age.cat.categories)

    test_bk = pd.DataFrame(classifier.predict_proba(pre_x.values),
                           index=test.device,
                           columns=test.sex_age.cat.categories)
    label_bk = pd.DataFrame(
        {'label': train.sex_age.cat.codes},
        index=train.device,
    )

    from code_felix.tiny.util import save_result_for_ensemble
    save_result_for_ensemble(
        f'{round(best_score,5)}_{best_epoch}_v_{version}_dnn_{args}',
        train=train_bk,
        test=test_bk,
        label=label_bk,
    )
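For reference, a typical invocation of train_dnn; the hyper-parameter values below are illustrative, and the ./model/checkpoint and ./sub directories are assumed to exist beforehand:

# Illustrative values, not tuned settings.
train_dnn(dropout=0.6, lr=0.0005, ensemble=True)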