# Flags selecting which previously generated submissions to load below.
# NOTE(review): importMySub1 / importMySub2 are read further down but only
# importMySub3 / importMySub4 are assigned here — presumably the missing
# flags are defined earlier in the full script; confirm before running.
importMySub3 = True
importMySub4 = True

# Flags choosing the ensembling strategy used later in the script.
combineSubs = False
combineBruteForce = False
combineGradDescAll = True
combineGradDescMeter = False

# Load the leaked ground-truth meter readings and clean them up.
leak = pd.read_feather('./data/kaggle_leak.feather')
leak.fillna(0, inplace=True)
# Keep only rows whose timestamp falls in 2017 or 2018.
leak = leak[(leak['timestamp'].dt.year > 2016)
            & (leak['timestamp'].dt.year < 2019)]
leak.loc[leak['meter_reading'] < 0,
         'meter_reading'] = 0  # remove large negative values
# Building 245 is excluded — assumed to be a known bad building; verify.
leak = leak[leak['building_id'] != 245]
leak = myutils.reduce_mem_usage(leak)

# Load test data (only the columns needed to join predictions to row_ids).
test = pd.read_feather('./data/test_clean.feather')
test = test[['building_id', 'meter', 'timestamp', 'row_id']]

# Load predictions data
if importMySub1:
    mysub_1 = pd.read_feather(
        './submissions/submission_30_noleak.feather')  # 3-fold CV
if importMySub2:
    mysub_2 = pd.read_feather('./submissions/submission_31_noleak.feather'
                              )  # 5-fold building_id grouping CV
if importMySub3:
    mysub_3 = pd.read_feather('./submissions/submission_32_noleak.feather'
                              )  # 3-fold meter type grouping CV
# ---- Beispiel #2 (vote count: 0) — scraped-example boundary marker; the
# following lines come from a different script and are not executable here ----
# Optionally shift the weather timestamps using the '14h_calc' strategy of
# myutils.align_timestamp.
if alignTimestamps:
    print('Aligning weather timestamps..')
    weather = myutils.align_timestamp(weather, offset, strategy='14h_calc')
    print('Done!')

# Persist the engineered weather features for the current split.
print('Saving data to feather...')
weather_out_paths = {
    'train': './data/weather_train_feats.feather',
    'test': './data/weather_test_feats.feather',
}
if dataset in weather_out_paths:
    weather.to_feather(weather_out_paths[dataset])
print('Done!')

# Left-join the weather features onto the meter readings, then free the
# source frames. The first reduce_mem_usage call is kept before the merge,
# exactly as in the original ordering.
print('Merging weather and meter dataframes...')
all_data = myutils.reduce_mem_usage(weather)
all_data = meter_data.merge(weather, on=['site_id', 'timestamp'], how='left')
all_data = myutils.reduce_mem_usage(all_data)
del weather, meter_data
gc.collect()
print('Done!')

# Derive calendar features from the timestamp column; the date_feat codes are
# consumed by myutils.preprocess_datetime (assumed h/d/w/m/wk = hour, day,
# weekday, month, week — confirm in myutils).
print('Adding new date time features...')
all_data = myutils.preprocess_datetime(all_data,
                                       date_feat=['h', 'd', 'w', 'm', 'wk'])
if encodeCyclic:
    # Cyclic (sin/cos) encoding of each periodic feature with its period.
    for cyc_feat, cyc_period in (('weekday', 7), ('hour', 24),
                                 ('day', 31), ('month', 12)):
        all_data = myutils.encode_cyclic_feature(all_data, cyc_feat, cyc_period)
all_data = myutils.reduce_mem_usage(all_data)

if splitOOF:
    # Score every fold model on the held-out OOF slice; targets are compared
    # in log1p space (assumed to match training — confirm against the
    # training section).
    total_RMSE = list()
    for model in models_dict['models']:
        rmse = myutils.rmse(model.predict(oof_1[feat_cols]),
                            np.log1p(oof_1['meter_reading']))
        total_RMSE.append(rmse)
        print('Model RMSE on OOF data: {:}'.format(rmse))
    print('Ensemble RMSE: {:.3f}'.format(np.asarray(total_RMSE).mean()))

# Persist the trained models. Fix: use a context manager so the file handle
# is closed deterministically — the original passed a bare open() to
# pickle.dump and relied on refcounting to close it.
with open('./models/models_{:d}.bin'.format(subN), 'wb') as model_file:
    pickle.dump(models_dict, model_file)
del train, X, y
gc.collect()

# doing the predictions
if doPred:
    # Reload the cleaned test set and pre-allocate the output columns.
    test = pd.read_feather('./data/test_clean.feather')
    test = myutils.reduce_mem_usage(test)
    test['meter_reading'] = 0
    test['row_id_'] = 0
    feat_cols = models_dict['features']
    batch_size = 500000  # rows per prediction batch, to bound peak memory
    if groupMeter:
        # One group of fold models per meter type: select that meter's test
        # rows (features and row_ids separately) and batch them.
        for meter in np.unique(models_dict['meter']):
            print('Models for meter {:d}'.format(meter))
            # positions of the models trained for this meter type
            models_idx = np.where(np.asarray(models_dict['meter']) == meter)[0]
            N_models = len(models_idx)
            test_meter = test.loc[test['meter']==meter, feat_cols]
            row_id_meter = test.loc[test['meter']==meter, 'row_id']
            test_meter.reset_index(drop=True, inplace=True)
            row_id_meter.reset_index(drop=True, inplace=True)
            N_BATCHES = int(len(test_meter)/batch_size) + 1
            
            # NOTE(review): the plotting calls below reference time_plt,
            # col_plt, time_interp, col_interp and col, none of which are
            # defined in this chunk — this looks like a fragment spliced in
            # from an interpolation-debugging script; confirm against the
            # original file before running.
            plt.plot(time_plt, col_plt, 'o-')
            plt.plot(time_interp, col_interp, '.-')
            plt.title('{:} interpolation'.format(col))
    # NOTE(review): weather_monthly is also undefined in this chunk.
    del weather_monthly

# Attach the building metadata to every meter reading, then drop the lookup
# table to save memory.
meter_data = meter_data.merge(building, on='building_id', how='left')
del building

# Compute a per-site time offset from the weather data (see
# myutils.calculate_time_offset), left-join the weather onto the readings,
# and realign the timestamps with that offset.
offset = myutils.calculate_time_offset(weather_train)
meter_data = meter_data.merge(
    weather_train, on=['site_id', 'timestamp'], how='left')
meter_data = myutils.align_timestamp(meter_data, offset)
meter_data = myutils.reduce_mem_usage(meter_data)
del weather_train
gc.collect()

if plotWeatherHist:
    # One distribution plot per raw weather column.
    plot_cols = ['air_temperature', 'cloud_coverage', 'dew_temperature',
                 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
                 'wind_speed']
    for col in plot_cols:
        myutils.plot_dist_col(col, meter_data)

# daily weather data
#train_daily = train.loc[train['building_id'].between(0, 100), ['timestamp', 'building_id', 'meter', 'meter_reading']]
# ---- Beispiel #5 (vote count: 0) — scraped-example boundary marker; the
# following lines are the tail of a function from yet another script ----
        # Extract the second element of the stored size tuple into its own
        # column (assumed to be (width, height), so x[1] is the image
        # height — TODO confirm upstream where 'dim_size' is built).
        features[prefix +
                 'height'] = features[prefix +
                                      'dim_size'].apply(lambda x: x[1])
        #features.drop(prefix+'dominant_color', axis=1, inplace=True)
        # Drop the composite columns once their parts have been extracted.
        features.drop(prefix + 'average_color', axis=1, inplace=True)
        features.drop(prefix + 'dim_size', axis=1, inplace=True)

    # Free transient objects created while building the feature columns.
    gc.collect()
    return features


if __name__ == '__main__':
    # Build a one-column frame holding the path of every image under
    # targetdir, in os.listdir order.
    targetdir = '../input/ants/'
    features = pd.DataFrame()
    features['imagepath'] = [targetdir + str(name)
                             for name in os.listdir(targetdir)]

    # Fan the feature extraction out over all available cores.
    numcpu = multiprocessing.cpu_count()
    print(f'use {numcpu} cpus')
    get_imagefeatures_multi(features,
                            'imagepath',
                            prefix='debug',
                            n_workers=numcpu)

    #features.drop('imagepath', axis=1, inplace=True)
    features = reduce_mem_usage(features)
    #features.to_feather('./sample.feather')

    print(features.head(30))