def test_input_load(feat_path, label_path):
    inputs, targets = read_feature(feat_path, label_path)

    if args.Backbone_model == 'baseLSTM' or args.Backbone_model == 'CLDNN':
        train_DB = read_DB_structure(os.path.join(c.MFB_DIR + '_' + str(args.padding_time), 'train_folder'), 'train')
        MS_path = os.path.join(c.MFB_DIR + '_' + str(args.padding_time), 'Train_Mean_Var')

    elif args.Backbone_model == '2DCRNN':
        train_DB = read_DB_structure(os.path.join(c.STFT_DIR + '_1.0', 'train_folder'), 'train')
        MS_path = os.path.join(c.STFT_DIR + '_1.0', 'Train_Mean_Var')

    if c.USE_GLOBAL_NORM:
        mean_path = os.path.join(MS_path, 'train_mean.txt')
        std_path = os.path.join(MS_path, 'train_std.txt')
        train_mean, train_std = calc_global_mean_std(mean_path, std_path, train_DB)
        inputs = global_feature_normalize(inputs, train_mean, train_std)

    TI = LSTMInputTest()
    TT = ToTensorInput()

    inputs, targets = TI(inputs, targets)
    inputs, targets = TT(inputs, targets)

    with torch.no_grad():
        inputs = Variable(inputs)
        targets = Variable(targets)
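        # Variable is deprecated in modern PyTorch (>= 0.4) and effectively returns the tensor unchanged.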

    return inputs, targets
Example #2
def Aurora_EPD(DB, feature_directory, label_directory):
    num_of_DB = len(DB)
    for i in range(num_of_DB):
        feat_path = DB['filename'][i]
        output_file = feat_path.split('/')[-1]
        output_file_path = os.path.join(feature_directory, output_file)

        label_path = DB['label_path'][i]
        output_label = label_path.split('/')[-1]
        output_label = output_label.split('.')[0] + '.pkl'
        output_label_path = os.path.join(label_directory, output_label)

        feature, label = read_feature(feat_path, label_path)

        # Get the start and end points of speech in the utterance
        start_point = np.where(label == 1)[0][0]
        end_point = np.where(label == 1)[0][-1]

        epd_feature = feature[start_point:end_point]
        epd_label = label[start_point:end_point]

        if os.path.isfile(output_file_path):
            print('\'' + output_file + '\' feature already extracted!')
        else:
            with open(output_file_path, 'wb') as fp:
                pickle.dump(epd_feature, fp)
                print('[EPD] Feature : %s is done!' % output_file)

        if os.path.isfile(output_label_path):
            print('\'' + output_label + '\' label already extracted!')
        else:
            with open(output_label_path, 'wb') as fp:
                pickle.dump(epd_label, fp)
                print('[EPD] Label : %s is done!' % output_label)
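A toy run makes the endpoint slice above explicit (and shows that the upper bound end_point is exclusive); this is an illustration only, not part of the project:

import numpy as np

label = np.array([0, 0, 1, 1, 0, 1, 1, 0, 0])    # 1 = speech frame, 0 = silence
speech_frames = np.where(label == 1)[0]           # -> array([2, 3, 5, 6])
start_point, end_point = speech_frames[0], speech_frames[-1]
print(start_point, end_point)                     # 2 6
print(label[start_point:end_point])               # [1 1 0 1] (frame 6 itself is excluded)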
Example #3
def get_onsets(track_id):
    """Read onset times and intervals from file onsets_dir + track_id + '.csv'.
    File should contain a time column followed by one column of
        inter-onset intervals.
    """
    onsets_file = os.path.join(onsets_dir, str(track_id) + '.csv')
    t, ioi = utils.read_feature(onsets_file, time=True)
    return t, ioi
Example #4
def get_beats(track_id):
    """Read beat times and intervals from file beats_dir + track_id + '.csv'.
    File should contain a time column followed by one column of
        beat intervals.
    """
    beats_file = os.path.join(beats_dir, str(track_id) + '.csv')
    t, beat_intervals = utils.read_feature(beats_file, time=True)
    return t, beat_intervals
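The docstrings above describe the expected layout: a time column followed by one value column. A minimal, hypothetical reader for that layout (an illustration only, not the project's utils.read_feature) could look like this:

import io
import numpy as np

def read_time_feature_sketch(path_or_buffer):
    """Hypothetical reader for a CSV laid out as (time, value, ...)."""
    data = np.loadtxt(path_or_buffer, delimiter=',', ndmin=2)
    return data[:, 0], data[:, 1:]    # time stamps, remaining feature column(s)

t, ioi = read_time_feature_sketch(io.StringIO("0.50,0.25\n0.75,0.25\n1.00,0.30\n"))
print(t.shape, ioi.shape)             # (3,) (3, 1)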
Example #7
def get_chroma(track_id):
    """Read chroma data from file chroma_dir + track_id + '.csv'.
    File should contain a time column followed by one column per chroma
        dimension.
    """
    chroma_file = os.path.join(chroma_dir, track_id + '.csv')
    t, chroma = utils.read_feature(chroma_file, time=True)
    return t, chroma
Example #9
def get_melody(track_id):
    """Read melody data from file melody_dir + track_id + '.csv'.
    File should contain melody data in two columns: (time, melody)
        with melody in midi note number (float or int).
        Frames in which no pitch is present can be set to 0, None or np.nan.
    """
    melody_file = os.path.join(melody_dir, track_id + '.csv')
    t, melody = utils.read_feature(melody_file, time=True)
    return t, melody.flatten()
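As the docstring notes, unvoiced frames may be stored as 0, None, or np.nan; a toy snippet (assumed two-column layout, not project code) showing how the NaN case can be filtered after reading:

import io
import numpy as np

data = np.loadtxt(io.StringIO("0.00,69.0\n0.01,69.5\n0.02,nan\n0.03,71.0\n"), delimiter=',')
t, melody = data[:, 0], data[:, 1:].flatten()    # flatten() mirrors get_melody's return value
print(melody[~np.isnan(melody)])                 # [69.  69.5 71. ] -- voiced frames only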
Example #11
def compute(segment_dict, features):
    """
    Args:
        segment_dict (dict): dictionary of song segments, containing a list of
            segment ids (values) for a set of unique song identifiers (keys).
        features (list): feature descriptor strings, each parsed by
            parse_feature() into a feature name and first- and second-order
            aggregates.
    """

    data_dict = {}

    # compute features
    for feature in features:
        print('computing ' + feature + '...')
        feature_name, first_order_aggregates, second_order_aggregates = parse_feature(
            feature)

        corpus_features = []
        for song_id in segment_dict.keys():
            song_features = []
            for segment in segment_dict[song_id]:
                raw_features = utils.read_feature(
                    [data_dir, feature_name, segment], skip_cols='auto')
                segment_features = first_order(raw_features,
                                               first_order_aggregates,
                                               verbose=False)
                song_features.append(segment_features)
            if 'song' in second_order_aggregates:
                song_features = second_order(song_features,
                                             second_order_aggregates,
                                             verbose=False)
            corpus_features.extend(song_features)
        if 'corpus' in second_order_aggregates:
            # print('        in: len(corpus_features) = {}, corpus_features[0] = {}'.format(len(corpus_features), corpus_features[0]))
            corpus_features = second_order(corpus_features,
                                           second_order_aggregates,
                                           verbose=False)
        # print('        out: len(corpus_features) = {}, corpus_features[0] = {}'.format(len(corpus_features), corpus_features[0]))
        data_dict[feature] = np.squeeze(corpus_features)

    # add segment ids
    song_ids = []
    segments = []
    for song_id in segment_dict.keys():
        for segment in segment_dict[song_id]:
            song_ids.append(song_id)
            segments.append(segment)
    data_dict['song.id'] = np.array(song_ids)
    data_dict['segment.id'] = np.array(segments)

    # convert to dataframe
    return pd.DataFrame(data_dict)
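The docstring only spells out segment_dict, so here is a hypothetical call; the segment ids, the "feature.aggregate" spec strings, and the on-disk layout under data_dir are assumptions, not taken from the project:

# Hypothetical inputs for compute(); nothing below is read from disk.
segment_dict = {
    'song_001': ['song_001_seg_00', 'song_001_seg_01'],
    'song_002': ['song_002_seg_00'],
}
features = ['chroma.mean', 'ioi.std']     # assumed "feature.aggregate" spec strings
# df = compute(segment_dict, features)    # -> DataFrame with one row per segment plus song.id / segment.id columns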
Example #12
def load_graph(dataset, labels_is_onehot=True):
    features = read_feature("./data/" + dataset + ".feature",
                            is_normalize=False)

    if os.path.exists("./data/" + dataset + ".label"):
        labels = read_label("./data/" + dataset + ".label",
                            is_onehot=labels_is_onehot)
    else:
        labels = None

    G = read_graph("./data/" + dataset + '.edgelist')

    graph = Graph(features, G, labels)

    return graph
Example #13
def main():
    logging.info('reading data')
    train_mat = read_rating('data/ml-1m/normalTrain.csv')
    test_mat = read_rating('data/ml-1m/test.csv')
    item_mat = read_feature('data/ml-1m/itemFeat.csv')
    num_item_feat = item_mat.shape[1]

    model = CollaborativeDeepLearning(item_mat, [num_item_feat, 16, 8])
    model.pretrain(lamda_w=0.001, encoder_noise=0.3, epochs=10)
    model_history = model.fineture(train_mat,
                                   test_mat,
                                   lamda_u=0.01,
                                   lamda_v=0.1,
                                   lamda_n=0.1,
                                   lr=0.01,
                                   epochs=3)
    testing_rmse = model.getRMSE(test_mat)
    print('Testing RMSE = {}'.format(testing_rmse))
Example #14
def padding(DB, feature_directory, label_directory, padding_time):
    num_of_DB = len(DB)
    for i in range(num_of_DB):
        feat_path = DB['filename'][i]
        output_file = feat_path.split('/')[-1]
        output_file_path = os.path.join(feature_directory, output_file)

        label_path = DB['label_path'][i]
        output_label = label_path.split('/')[-1]
        output_label = output_label.split('.')[0] + '.pkl'
        output_label_path = os.path.join(label_directory, output_label)

        feature, label = read_feature(feat_path, label_path)

        start_seg_feat, end_seg_feat = feature[:100], feature[-100:]
        start_seg_label, end_seg_label = label[:100], label[-100:]

        # The data have already been padded with 1 second of silence at both ends of the speech.
        if padding_time == 0.0:
            final_feature = feature[100:-100]
            final_label = label[100:-100]
        elif padding_time == 2.0:
            final_feature = np.concatenate((start_seg_feat, feature, end_seg_feat))
            final_label = np.concatenate((start_seg_label, label, end_seg_label))
        elif padding_time == 3.0:
            final_feature = np.concatenate((start_seg_feat, start_seg_feat, feature, end_seg_feat, end_seg_feat))
            final_label = np.concatenate((start_seg_label, start_seg_label, label, end_seg_label, end_seg_label))
        else:
            raise ValueError('padding_time must be 0.0, 2.0, or 3.0')

        if os.path.isfile(output_file_path):
            print('\'' + output_file + '\' feature already extracted!')
        else:
            with open(output_file_path, 'wb') as fp:
                pickle.dump(final_feature, fp)
                print('[Padding] Feature : %s is done!' % output_file)

        if os.path.isfile(output_label_path):
            print('\'' + output_label + '\' label already extracted!')
        else:
            with open(output_label_path, 'wb') as fp:
                pickle.dump(final_label, fp)
                print('[Padding] Label : %s is done!' % output_label)
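Since the source data already carries 1 second of silence on each side (100 frames, which assumes a 10 ms frame shift), the three branches above trim or duplicate those 100-frame edge segments; a quick shape check on a dummy array:

import numpy as np

feature = np.zeros((700, 40))                    # dummy utterance: 5 s of speech + 1 s padding per side
start_seg, end_seg = feature[:100], feature[-100:]

print(feature[100:-100].shape[0])                                      # 500  -> padding_time = 0.0
print(np.concatenate((start_seg, feature, end_seg)).shape[0])          # 900  -> padding_time = 2.0
print(np.concatenate((start_seg, start_seg, feature,
                      end_seg, end_seg)).shape[0])                     # 1100 -> padding_time = 3.0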
Example #15
    dic = f.pickle_op(file_name='saved_pickle/{}.p'.format(args.input),
                      mode='r')
    y = dic['y']  # 26 x 27
    x = dic['x']  # 26 x 27 x 30 x 6080 or 26 x 513 x 30 x 419
    x = f.sub_baseline(x)
    '''
    ff.wavelet_power(x[0][0][0])
    f, t, Zxx = signal.stft(x[0][0][0],200)
    print(f.shape)
    print(t.shape)
    '''
    # ------------------------------- read feature -------------------------------
    # ------------------------- uncomment the feature you want -------------------------

    FT_psd, FT_psd_log = f.read_feature('{}_feature_pickle/{}/FT_psd.p'.format(
        args.input_type, args.dir))
    T_mean_power, T_mean_power_log = f.read_feature(
        '{}_feature_pickle/{}/T_mean_power.p'.format(args.input_type,
                                                     args.dir))
    T_mean, T_mean_log = f.read_feature('{}_feature_pickle/{}/T_mean.p'.format(
        args.input_type, args.dir))
    T_std, T_std_log = f.read_feature('{}_feature_pickle/{}/T_std.p'.format(
        args.input_type, args.dir))
    T_first_diff, T_first_diff_log = f.read_feature(
        '{}_feature_pickle/{}/T_first_diff.p'.format(args.input_type,
                                                     args.dir))
    T_second_diff, T_second_diff_log = f.read_feature(
        '{}_feature_pickle/{}/T_second_diff.p'.format(args.input_type,
                                                      args.dir))
    STFT_power, STFT_power_log = f.read_feature(
        '{}_feature_pickle/{}/STFT_power.p'.format(args.input_type, args.dir))
Example #16
def preprocessing( args ):
    gc.enable()
    time_bar = tqdm(total = 90, desc = "preprocessing")

    # Target variable
    target_name = 'TARGET'
    one_hot_encode = args.onehot_encode

    #===========================
    # Source data
    #===========================
    # application_{train|test}
    if( args.feature_format ):
        df_application_train = read_feature( os.path.join(args.dataset_dir, "application_train.feature") )
        df_application_test = read_feature( os.path.join(args.dataset_dir, "application_test.feature" ) )
    else:
        df_application_train = pd.read_csv( os.path.join(args.dataset_dir, "application_train.csv" ) )
        df_application_test = pd.read_csv( os.path.join(args.dataset_dir, "application_test.csv" ) )

    #df_application_train.drop(['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'], axis=1, inplace=True)
    #df_application_test.drop(['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'], axis=1, inplace=True)
    if( args.onehot_encode ):
        for col in df_application_train.columns:
            if( df_application_train[col].dtypes == "object" ):
                df_application_train[col] = df_application_train[col].fillna('NA')
                df_application_test[col] = df_application_test[col].fillna('NA')

        df_application_train = pd.get_dummies( df_application_train )
        df_application_test = pd.get_dummies( df_application_test )

        # One-hot encoding creates columns that exist in the training data but not in the test data, so align the two.
        train_labels = df_application_train['TARGET']

        # Align the training and testing data, keep only columns present in both dataframes
        df_application_train, df_application_test = df_application_train.align(df_application_test, join = 'inner', axis = 1)
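        # e.g. a category seen only in the training split yields a dummy column only
        # there; join='inner' keeps just the columns common to both dataframes.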

        # Add the target back in
        df_application_train['TARGET'] = train_labels

    # Base dataframes
    df_train = df_application_train
    df_test = df_application_test

    #----------------------------
    # Features based on domain knowledge
    #----------------------------
    if( args.domain_features ):
        # CREDIT_INCOME_PERCENT: ratio of the credit amount to the client's income.
        df_train['CREDIT_INCOME_PERCENT'] = df_train['AMT_CREDIT'] / df_train['AMT_INCOME_TOTAL']
        df_test['CREDIT_INCOME_PERCENT'] = df_test['AMT_CREDIT'] / df_test['AMT_INCOME_TOTAL']

        # ANNUITY_INCOME_PERCENT: ratio of the loan annuity to the client's income.
        df_train['ANNUITY_INCOME_PERCENT'] = df_train['AMT_ANNUITY'] / df_train['AMT_INCOME_TOTAL']
        df_test['ANNUITY_INCOME_PERCENT'] = df_test['AMT_ANNUITY'] / df_test['AMT_INCOME_TOTAL']

        # CREDIT_TERM: the payment period in months.
        df_train['CREDIT_TERM'] = df_train['AMT_ANNUITY'] / df_train['AMT_CREDIT']
        df_test['CREDIT_TERM'] = df_test['AMT_ANNUITY'] / df_test['AMT_CREDIT']

        # DAYS_EMPLOYED_PERCENT: ratio of days employed to the client's age (in days).
        df_train['DAYS_EMPLOYED_PERCENT'] = df_train['DAYS_EMPLOYED'] / df_train['DAYS_BIRTH']
        df_test['DAYS_EMPLOYED_PERCENT'] = df_test['DAYS_EMPLOYED'] / df_test['DAYS_BIRTH']

    time_bar.update(10)

    #===========================
    # Merge the sub-tables
    #===========================
    #---------------------------
    # bureau
    #---------------------------
    if( args.feature_format ):
        df_bureau = read_feature( os.path.join(args.dataset_dir, "bureau.feature") )
    else:
        df_bureau = pd.read_csv( os.path.join(args.dataset_dir, "bureau.csv" ) )

    df_bureau_agg_numric = agg_dataframe_numric( df_bureau, agg_column = 'SK_ID_CURR', base_column_name = "bureau" )
    df_bureau_agg_categorical = agg_dataframe_categorical( df_bureau, agg_column = 'SK_ID_CURR', base_column_name = "bureau", one_hot_encode = one_hot_encode )

    # Merge into the base data
    df_train = pd.merge(df_train, df_bureau_agg_numric, on='SK_ID_CURR', how='left' )
    df_train = pd.merge(df_train, df_bureau_agg_categorical, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_bureau_agg_numric, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_bureau_agg_categorical, on='SK_ID_CURR', how='left' )

    # Free memory that is no longer needed
    del df_bureau_agg_numric, df_bureau_agg_categorical
    gc.collect()
    time_bar.update(10)

    #---------------------------
    # bureau_balance
    #---------------------------
    if( args.feature_format ):
        df_bureau_balance = read_feature( os.path.join(args.dataset_dir, "bureau_balance.feature") )
    else:
        df_bureau_balance = pd.read_csv( os.path.join(args.dataset_dir, "bureau_balance.csv" ) )

    # Aggregate rows sharing the same SK_ID_BUREAU
    df_bureau_balance_agg_numric = agg_dataframe_numric( df_bureau_balance, agg_column = 'SK_ID_BUREAU', base_column_name = "bureau_balance" )
    df_bureau_balance_agg_categorical = agg_dataframe_categorical( df_bureau_balance, agg_column = 'SK_ID_BUREAU', base_column_name = "bureau_balance", one_hot_encode = one_hot_encode )

    # Attach the corresponding 'SK_ID_BUREAU' to 'SK_ID_CURR' in the parent data (df_bureau)
    df_bureau_balance_agg_numric = df_bureau[['SK_ID_BUREAU', 'SK_ID_CURR']].merge(df_bureau_balance_agg_numric, on = 'SK_ID_BUREAU', how = 'left')
    df_bureau_balance_agg_categorical = df_bureau[['SK_ID_BUREAU', 'SK_ID_CURR']].merge(df_bureau_balance_agg_categorical, on = 'SK_ID_BUREAU', how = 'left')

    # Multiple `SK_ID_BUREAU` rows now exist per `SK_ID_CURR`, so aggregate by `SK_ID_CURR` (see the toy sketch after this function)
    df_bureau_balance_agg_numric = agg_dataframe_numric( df_bureau_balance_agg_numric.drop(columns = ['SK_ID_BUREAU']), agg_column = 'SK_ID_CURR', base_column_name = "bureau_balance" )
    df_bureau_balance_agg_categorical = agg_dataframe_numric( df_bureau_balance_agg_categorical.drop(columns = ['SK_ID_BUREAU']), agg_column = 'SK_ID_CURR', base_column_name = "bureau_balance" )

    # Merge into the base data
    df_train = pd.merge(df_train, df_bureau_balance_agg_numric, on='SK_ID_CURR', how='left' )
    df_train = pd.merge(df_train, df_bureau_balance_agg_categorical, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_bureau_balance_agg_numric, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_bureau_balance_agg_categorical, on='SK_ID_CURR', how='left' )

    # Free memory that is no longer needed
    del df_bureau, df_bureau_balance, df_bureau_balance_agg_numric, df_bureau_balance_agg_categorical
    gc.collect()
    time_bar.update(10)

    #---------------------------
    # previous_application
    #--------------------------
    if( args.feature_format ):
        df_previous_application = read_feature( os.path.join(args.dataset_dir, "previous_application.feature") )
    else:
        df_previous_application = pd.read_csv( os.path.join(args.dataset_dir, "previous_application.csv" ) )

    df_previous_application_agg_numric = agg_dataframe_numric( df_previous_application, agg_column = 'SK_ID_CURR', base_column_name = "previous_application" )
    df_previous_application_agg_categorical = agg_dataframe_categorical( df_previous_application, agg_column = 'SK_ID_CURR', base_column_name = "previous_application", one_hot_encode = one_hot_encode )

    # Merge into the base data
    df_train = pd.merge(df_train, df_previous_application_agg_numric, on='SK_ID_CURR', how='left' )
    df_train = pd.merge(df_train, df_previous_application_agg_categorical, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_previous_application_agg_numric, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_previous_application_agg_categorical, on='SK_ID_CURR', how='left' )

    # Free memory that is no longer needed
    del df_previous_application_agg_numric, df_previous_application_agg_categorical
    gc.collect()
    time_bar.update(10)

    #---------------------------
    # pos_cash_balance
    #---------------------------
    if( args.feature_format ):
        df_pos_cash_balance = read_feature( os.path.join(args.dataset_dir, "POS_CASH_balance.feature") )
    else:
        df_pos_cash_balance = pd.read_csv( os.path.join(args.dataset_dir, "POS_CASH_balance.csv" ) )

    # Aggregate rows sharing the same SK_ID_PREV
    df_pos_cash_balance_agg_numric = agg_dataframe_numric( df_pos_cash_balance, agg_column = 'SK_ID_PREV', base_column_name = "pos_cash_balance" )
    df_pos_cash_balance_agg_categorical = agg_dataframe_categorical( df_pos_cash_balance, agg_column = 'SK_ID_PREV', base_column_name = "pos_cash_balance", one_hot_encode = one_hot_encode )

    # Attach the corresponding 'SK_ID_PREV' to 'SK_ID_CURR' in the parent data
    df_pos_cash_balance_agg_numric = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_pos_cash_balance_agg_numric, on = 'SK_ID_PREV', how = 'left')
    df_pos_cash_balance_agg_categorical = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_pos_cash_balance_agg_categorical, on = 'SK_ID_PREV', how = 'left')

    # Multiple `SK_ID_PREV` rows now exist per `SK_ID_CURR`, so aggregate by `SK_ID_CURR`
    df_pos_cash_balance_agg_numric = agg_dataframe_numric( df_pos_cash_balance_agg_numric.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "pos_cash_balance" )
    df_pos_cash_balance_agg_categorical = agg_dataframe_numric( df_pos_cash_balance_agg_categorical.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "pos_cash_balance" )

    # Merge into the base data
    df_train = pd.merge(df_train, df_pos_cash_balance_agg_numric, on='SK_ID_CURR', how='left' )
    df_train = pd.merge(df_train, df_pos_cash_balance_agg_categorical, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_pos_cash_balance_agg_numric, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_pos_cash_balance_agg_categorical, on='SK_ID_CURR', how='left' )

    # Free memory that is no longer needed
    del df_pos_cash_balance, df_pos_cash_balance_agg_numric, df_pos_cash_balance_agg_categorical
    gc.collect()
    time_bar.update(10)

    #---------------------------
    # installments_payments
    #---------------------------
    if( args.feature_format ):
        df_installments_payments = read_feature( os.path.join(args.dataset_dir, "installments_payments.feature") )
    else:
        df_installments_payments = pd.read_csv( os.path.join(args.dataset_dir, "installments_payments.csv" ) )

    # This table has no categorical columns
    # Aggregate rows sharing the same SK_ID_PREV
    df_installments_payments_agg_numric = agg_dataframe_numric( df_installments_payments, agg_column = 'SK_ID_PREV', base_column_name = "installments_payments" )

    # Attach the corresponding 'SK_ID_PREV' to 'SK_ID_CURR' in the parent data
    df_installments_payments_agg_numric = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_installments_payments_agg_numric, on = 'SK_ID_PREV', how = 'left')

    # Multiple `SK_ID_PREV` rows now exist per `SK_ID_CURR`, so aggregate by `SK_ID_CURR`
    df_installments_payments_agg_numric = agg_dataframe_numric( df_installments_payments_agg_numric.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "installments_payments" )

    # Merge into the base data
    df_train = pd.merge(df_train, df_installments_payments_agg_numric, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_installments_payments_agg_numric, on='SK_ID_CURR', how='left' )

    # Free memory that is no longer needed
    del df_installments_payments, df_installments_payments_agg_numric
    gc.collect()
    time_bar.update(10)

    #---------------------------
    # credit_card_balance
    #---------------------------
    if( args.feature_format ):
        df_credit_card_balance = read_feature( os.path.join(args.dataset_dir, "credit_card_balance.feature") )
    else:
        df_credit_card_balance = pd.read_csv( os.path.join(args.dataset_dir, "credit_card_balance.csv" ) )

    # Aggregate rows sharing the same SK_ID_PREV
    df_credit_card_balance_agg_numric = agg_dataframe_numric( df_credit_card_balance, agg_column = 'SK_ID_PREV', base_column_name = "credit_card_balance" )
    df_credit_card_balance_agg_categorical = agg_dataframe_categorical( df_credit_card_balance, agg_column = 'SK_ID_PREV', base_column_name = "credit_card_balance", one_hot_encode = one_hot_encode )

    # Attach the corresponding 'SK_ID_PREV' to 'SK_ID_CURR' in the parent data
    df_credit_card_balance_agg_numric = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_credit_card_balance_agg_numric, on = 'SK_ID_PREV', how = 'left')
    df_credit_card_balance_agg_categorical = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_credit_card_balance_agg_categorical, on = 'SK_ID_PREV', how = 'left')

    # Multiple `SK_ID_PREV` rows now exist per `SK_ID_CURR`, so aggregate by `SK_ID_CURR`
    df_credit_card_balance_agg_numric = agg_dataframe_numric( df_credit_card_balance_agg_numric.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "credit_card_balance" )
    df_credit_card_balance_agg_categorical = agg_dataframe_numric( df_credit_card_balance_agg_categorical.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "credit_card_balance" )

    # Merge into the base data
    df_train = pd.merge(df_train, df_credit_card_balance_agg_numric, on='SK_ID_CURR', how='left' )
    df_train = pd.merge(df_train, df_credit_card_balance_agg_categorical, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_credit_card_balance_agg_numric, on='SK_ID_CURR', how='left' )
    df_test = pd.merge(df_test, df_credit_card_balance_agg_categorical, on='SK_ID_CURR', how='left' )

    # Free memory that is no longer needed
    del df_credit_card_balance, df_credit_card_balance_agg_numric, df_credit_card_balance_agg_categorical
    gc.collect()
    time_bar.update(10)

    #===========================
    # Additional features (after merging)
    #===========================
    # Features containing anomalous values
    if( args.invalid_features ):
        df_train['DAYS_EMPLOYED_ANOM'] = df_train["DAYS_EMPLOYED"] == 365243    # flag for the anomalous value
        df_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
        df_test['DAYS_EMPLOYED_ANOM'] = df_test["DAYS_EMPLOYED"] == 365243      # flag for the anomalous value
        df_test['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)

    # Time-related features
    if( args.time_features ):
        df_train['DAYS_BIRTH'] = -1 * df_train['DAYS_BIRTH']
        df_test['DAYS_BIRTH'] = -1 * df_test['DAYS_BIRTH']
        df_train['YEARS_BIRTH'] = df_train['DAYS_BIRTH'] / 365
        df_test['YEARS_BIRTH'] = df_test['DAYS_BIRTH'] / 365
        #df_train['YEARS_BINNED'] = pd.cut(df_train['YEARS_BIRTH'], bins = np.linspace(20, 70, num = 11))
        #df_test['YEARS_BINNED'] = pd.cut(df_test['YEARS_BIRTH'], bins = np.linspace(20, 70, num = 11))

    #----------------------------
    # Polynomial features (PolynomialFeatures) built from the features most strongly correlated with the target
    #----------------------------
    if( args.polynomial_features ):
        df_train_poly_features = df_train[ ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'] ]
        df_train_poly_features_target = df_train[ ["TARGET"] ]
        df_test_poly_features = df_test[ ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'] ]

        # Need to impute missing values
        imputer = SimpleImputer(strategy = 'median')
        df_train_poly_features = imputer.fit_transform(df_train_poly_features)
        df_test_poly_features = imputer.transform(df_test_poly_features)

        # Train the polynomial features and Transform the features
        poly_transformer = PolynomialFeatures(degree = 3)
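        # degree=3 on 4 inputs expands to 35 columns (all monomials up to degree 3, bias term included)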
        poly_transformer.fit(df_train_poly_features)
        df_train_poly_features = poly_transformer.transform(df_train_poly_features)
        df_test_poly_features = poly_transformer.transform(df_test_poly_features)

        # Create a dataframe of the features 
        df_train_poly_features = pd.DataFrame(
            df_train_poly_features, 
            columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])
        )
        df_train_poly_features[target_name] = df_train_poly_features_target

        # Put test features into dataframe
        df_test_poly_features = pd.DataFrame(
            df_test_poly_features, 
            columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])
        )

        # Merge polynomial features into training dataframe
        df_train_poly_features['SK_ID_CURR'] = df_train['SK_ID_CURR']
        df_train = pd.merge( df_train, df_train_poly_features, on = 'SK_ID_CURR', how = 'left')

        # Merge polynomial features into testing dataframe
        df_test_poly_features['SK_ID_CURR'] = df_test['SK_ID_CURR']
        df_test = pd.merge( df_test, df_test_poly_features, on = 'SK_ID_CURR', how = 'left')

        # Align the dataframes
        df_train.drop(['TARGET_y'], axis=1, inplace=True)
        df_train = df_train.rename( columns={'TARGET_x': 'TARGET'} )
        #df_train, df_test = df_train.align(df_test, join = 'inner', axis = 1)

    time_bar.update(10)
    
    #===========================
    # Drop columns that are no longer needed (after merging)
    #===========================
    if 'SK_ID_CURR' in df_train.columns:
        df_train.drop(['SK_ID_CURR'], axis=1, inplace=True)
        df_test.drop(['SK_ID_CURR'], axis=1, inplace=True)
    if 'SK_ID_BUREAU' in df_train.columns:
        df_train.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
        df_test.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
    if 'SK_ID_PREV' in df_train.columns:
        df_train.drop(['SK_ID_PREV'], axis=1, inplace=True)
        df_test.drop(['SK_ID_PREV'], axis=1, inplace=True)
    
    #===========================
    # Process all features in one pass
    #===========================
    # Full dataset (train + test)
    df_data = pd.concat([df_train, df_test], sort=False)

    for col in df_train.columns:
        # Skip the target variable
        if( col in [target_name] ):
            continue

        #-----------------------------
        # Fill missing values
        #-----------------------------
        # Fill NaN values with the mean
        if( col in ["OWN_CAR_AGE"] ):
            # Use the mean over the full dataset df_data
            df_data[col].fillna(np.mean(df_data[col]), inplace=True)
            df_train[col].fillna(np.mean(df_data[col]), inplace=True)
            df_test[col].fillna(np.mean(df_data[col]), inplace=True)
        # Fill NaN values with zero / int columns
        elif( df_train[col].dtypes in ["int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"] ):
            df_data[col].fillna(0, inplace=True)
            df_train[col].fillna(0, inplace=True)
            df_test[col].fillna(0, inplace=True)
        # Fill NaN values with zero / float columns
        elif( df_train[col].dtypes in ["float16", "float32", "float64", "float128"] ):
            df_data[col].fillna(0.0, inplace=True)
            df_train[col].fillna(0.0, inplace=True)
            df_test[col].fillna(0.0, inplace=True)
        # Fill NaN values with 'NA' / object columns
        else:
            df_data[col] = df_data[col].fillna('NA')
            df_train[col] = df_train[col].fillna('NA')
            df_test[col] = df_test[col].fillna('NA')

        #-----------------------------
        # Encode label (categorical) information
        #-----------------------------
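        # Fit the encoder on df_data (train + test combined) so both splits share the
        # same category-to-integer mapping.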
        if( df_train[col].dtypes == "object" ):
            label_encoder = LabelEncoder()
            label_encoder.fit(list(df_data[col]))
            df_train[col] = label_encoder.transform(list(df_train[col]))

            label_encoder = LabelEncoder()
            label_encoder.fit(list(df_data[col]))
            df_test[col] = label_encoder.transform(list(df_test[col]))

        #-----------------------------
        # Normalization
        #-----------------------------
        """
        if( df_train[col].dtypes in ["float16", "float32", "float64", "float128"] ):
            scaler = StandardScaler()
            scaler.fit( df_train[col].values.reshape(-1,1) )
            df_train[col] = scaler.transform( df_train[col].values.reshape(-1,1) )
            df_test[col] = scaler.transform( df_test[col].values.reshape(-1,1) )
        """

        #-----------------------------
        # Remove features with only a single unique value
        #-----------------------------
        """
        if( df_train[col].nunique() == 1 ):
            print( "remove {} : {}".format(col,df_train[col].nunique()) )
            df_train.drop([col], axis=1, inplace=True)
            df_test.drop([col], axis=1, inplace=True)
        """

    time_bar.update(10)
    gc.disable()
    return df_train, df_test
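A schematic toy example of the two-level aggregation pattern used above for bureau_balance, POS_CASH_balance, and credit_card_balance (aggregate the child table on its own key, attach the parent key, then aggregate again per SK_ID_CURR); the tables and column values below are made up for the sketch:

import pandas as pd

child = pd.DataFrame({'SK_ID_BUREAU': [1, 1, 2, 3],
                      'MONTHS_BALANCE': [-1, -2, -1, -4]})
parent = pd.DataFrame({'SK_ID_BUREAU': [1, 2, 3],
                       'SK_ID_CURR': [100, 100, 200]})

# 1) aggregate the child table per SK_ID_BUREAU
child_agg = child.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].agg(['mean', 'min']).reset_index()
child_agg.columns = ['SK_ID_BUREAU', 'MB_mean', 'MB_min']

# 2) attach the parent key SK_ID_CURR
child_agg = parent[['SK_ID_BUREAU', 'SK_ID_CURR']].merge(child_agg, on='SK_ID_BUREAU', how='left')

# 3) aggregate once more per SK_ID_CURR (several SK_ID_BUREAU rows per client)
per_client = child_agg.drop(columns=['SK_ID_BUREAU']).groupby('SK_ID_CURR', as_index=False).mean()
print(per_client)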