import os
import pickle
from datetime import datetime

import numpy as np

# Project-local modules (exact import paths are an assumption).
import data_utils
import feature_extraction

# Module-level names the functions below rely on:
#   dt0: reference datetime; all window boundaries are converted to
#        whole-hour offsets from it via duration_hours.
#   concat3: helper (defined elsewhere) that merges the per-window feature
#            matrices in gen_uci_feats.
# duration_hours is rebound inside each function with this same lambda.
duration_hours = lambda x, y: int((x - y).total_seconds() / 3600)


def gen_ic_feats(path='../data_', part=1):
    print 'Extracting user/cat-independent feats...'

    dates = os.listdir(path + str(part) + '/date')
    dates.sort()  # per-date event files (not used further in this function)

    _, P_item_id, _, P_item_cat = data_utils.load_P_item(path + str(part) + '/itemdict')
    P_item_id_unique = np.unique(P_item_id).tolist()
    P_item_id_unique = dict((el, 0) for el in P_item_id_unique)
    P_item_cat_unique = np.unique(P_item_cat).tolist()
    P_item_cat_unique = dict((el, 0) for el in P_item_cat_unique)
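    # Membership-lookup dicts over the candidate item ids and categories
    # (the zero values are placeholders); presumably used by get_data_dict_4
    # and feature_extraction.wrapper to restrict processing to candidate items.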

    global duration_hours
    global dt0

    # Train
    print 'Extracting train feats...'
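    # Four look-back windows, all ending on train_time_end (2014-12-15):
    # 28 days, 1 day, 3 days and 7 days of history respectively.
    # train_time_thresh (2014-12-16 00:00) is the exclusive upper bound of
    # each window.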
    train_time_start_1 = '2014-11-18'
    train_time_start_2 = '2014-12-15'
    train_time_start_3 = '2014-12-13'
    train_time_start_4 = '2014-12-09'

    train_time_end = '2014-12-15'
    train_time_thresh = '2014-12-16'

    dt_train_start_1 = datetime.strptime(train_time_start_1 + ' 00', '%Y-%m-%d %H')
    dt_train_start_2 = datetime.strptime(train_time_start_2 + ' 00', '%Y-%m-%d %H')
    dt_train_start_3 = datetime.strptime(train_time_start_3 + ' 00', '%Y-%m-%d %H')
    dt_train_start_4 = datetime.strptime(train_time_start_4 + ' 00', '%Y-%m-%d %H')
    dt_train_thresh = datetime.strptime(train_time_thresh + ' 00', '%Y-%m-%d %H')
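    # Each window is loaded via get_data_dict_4 ('ci' mode, no labels) and fed
    # to feature_extraction.wrapper together with its start and threshold as
    # hour offsets from dt0; the wrapper returns item-level (fi_*) and
    # category-level (fc_*) features, presumably keyed by item and category id.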

    U_train_item_1 = data_utils.get_data_dict_4(P_item_id_unique, P_item_cat_unique, train_time_start_1, train_time_end, False, 'ci', path + str(part) + '/date/')
    fi_train_1, fc_train_1 = feature_extraction.wrapper(U_train_item_1, P_item_id_unique, duration_hours(dt_train_start_1, dt0), duration_hours(dt_train_thresh, dt0))

    U_train_item_2 = data_utils.get_data_dict_4(P_item_id_unique, P_item_cat_unique, train_time_start_2, train_time_end, False, 'ci', path + str(part) + '/date/')
    fi_train_2, fc_train_2 = feature_extraction.wrapper(U_train_item_2, P_item_id_unique, duration_hours(dt_train_start_2, dt0), duration_hours(dt_train_thresh, dt0))

    U_train_item_3 = data_utils.get_data_dict_4(P_item_id_unique, P_item_cat_unique, train_time_start_3, train_time_end, False, 'ci', path + str(part) + '/date/')
    fi_train_3, fc_train_3 = feature_extraction.wrapper(U_train_item_3, P_item_id_unique, duration_hours(dt_train_start_3, dt0), duration_hours(dt_train_thresh, dt0))

    U_train_item_4 = data_utils.get_data_dict_4(P_item_id_unique, P_item_cat_unique, train_time_start_4, train_time_end, False, 'ci', path + str(part) + '/date/')
    fi_train_4, fc_train_4 = feature_extraction.wrapper(U_train_item_4, P_item_id_unique, duration_hours(dt_train_start_4, dt0), duration_hours(dt_train_thresh, dt0))

    train_ic_list = [fi_train_1, fc_train_1, fi_train_2, fc_train_2, fi_train_3, fc_train_3, fi_train_4, fc_train_4]

    print 'Raw train ic feats list: ' + path + str(part) + '/train_ic_list'
    with open(path + str(part) + '/train_ic_list', 'wb') as fp:
        pickle.dump(train_ic_list, fp, protocol=2)
    
    # Val
    print 'Extracting val feats...'
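    # Same four windows shifted forward by one day: they end on 2014-12-16
    # with 2014-12-17 00:00 as the threshold.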
    val_time_start_1 = '2014-11-19'
    val_time_start_2 = '2014-12-16'
    val_time_start_3 = '2014-12-14'
    val_time_start_4 = '2014-12-10'

    val_time_end = '2014-12-16'
    val_time_thresh = '2014-12-17'

    dt_val_start_1 = datetime.strptime(val_time_start_1 + ' 00', '%Y-%m-%d %H')
    dt_val_start_2 = datetime.strptime(val_time_start_2 + ' 00', '%Y-%m-%d %H')
    dt_val_start_3 = datetime.strptime(val_time_start_3 + ' 00', '%Y-%m-%d %H')
    dt_val_start_4 = datetime.strptime(val_time_start_4 + ' 00', '%Y-%m-%d %H')
    dt_val_thresh = datetime.strptime(val_time_thresh + ' 00', '%Y-%m-%d %H')

    U_val_item_1 = data_utils.get_data_dict_4(P_item_id_unique, P_item_cat_unique, val_time_start_1, val_time_end, False, 'ci', path + str(part) + '/date/')
    fi_val_1, fc_val_1 = feature_extraction.wrapper(U_val_item_1, P_item_id_unique, duration_hours(dt_val_start_1, dt0), duration_hours(dt_val_thresh, dt0))

    U_val_item_2 = data_utils.get_data_dict_4(P_item_id_unique, P_item_cat_unique, val_time_start_2, val_time_end, False, 'ci', path + str(part) + '/date/')
    fi_val_2, fc_val_2 = feature_extraction.wrapper(U_val_item_2, P_item_id_unique, duration_hours(dt_val_start_2, dt0), duration_hours(dt_val_thresh, dt0))

    U_val_item_3 = data_utils.get_data_dict_4(P_item_id_unique, P_item_cat_unique, val_time_start_3, val_time_end, False, 'ci', path + str(part) + '/date/')
    fi_val_3, fc_val_3 = feature_extraction.wrapper(U_val_item_3, P_item_id_unique, duration_hours(dt_val_start_3, dt0), duration_hours(dt_val_thresh, dt0))

    U_val_item_4 = data_utils.get_data_dict_4(P_item_id_unique, P_item_cat_unique, val_time_start_4, val_time_end, False, 'ci', path + str(part) + '/date/')
    fi_val_4, fc_val_4 = feature_extraction.wrapper(U_val_item_4, P_item_id_unique, duration_hours(dt_val_start_4, dt0), duration_hours(dt_val_thresh, dt0))

    val_ic_list = [fi_val_1, fc_val_1, fi_val_2, fc_val_2, fi_val_3, fc_val_3, fi_val_4, fc_val_4]

    print 'Raw val ic feats list: ' + path + str(part) + '/val_ic_list'
    with open(path + str(part) + '/val_ic_list', 'wb') as fp:
        pickle.dump(val_ic_list, fp, protocol=2)


    # Test
    print 'Extracting test feats...'
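    # Test windows end on 2014-12-18 with 2014-12-19 00:00 as the threshold,
    # presumably the day predictions are made for; each window is two days
    # later than its validation counterpart.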
    test_time_start_1 = '2014-11-21'
    test_time_start_2 = '2014-12-18'
    test_time_start_3 = '2014-12-16'
    test_time_start_4 = '2014-12-12'

    test_time_end = '2014-12-18'
    test_time_thresh = '2014-12-19'

    duration_hours = lambda x, y: int((x - y).total_seconds() / 3600)  # rebinds the module-level helper

    dt_test_start_1 = datetime.strptime(test_time_start_1 + ' 00', '%Y-%m-%d %H')
    dt_test_start_2 = datetime.strptime(test_time_start_2 + ' 00', '%Y-%m-%d %H')
    dt_test_start_3 = datetime.strptime(test_time_start_3 + ' 00', '%Y-%m-%d %H')
    dt_test_start_4 = datetime.strptime(test_time_start_4 + ' 00', '%Y-%m-%d %H')
    dt_test_thresh = datetime.strptime(test_time_thresh + ' 00', '%Y-%m-%d %H')


    U_test_item_1 = data_utils.get_data_dict_4(P_item_id_unique, P_item_cat_unique, test_time_start_1, test_time_end, False, 'ci', path + str(part) + '/date/')
    fi_test_1, fc_test_1 = feature_extraction.wrapper(U_test_item_1, P_item_id_unique, duration_hours(dt_test_start_1, dt0), duration_hours(dt_test_thresh, dt0))

    U_test_item_2 = data_utils.get_data_dict_4(P_item_id_unique, P_item_cat_unique, test_time_start_2, test_time_end, False, 'ci', path + str(part) + '/date/')
    fi_test_2, fc_test_2 = feature_extraction.wrapper(U_test_item_2, P_item_id_unique, duration_hours(dt_test_start_2, dt0), duration_hours(dt_test_thresh, dt0))

    U_test_item_3 = data_utils.get_data_dict_4(P_item_id_unique, P_item_cat_unique, test_time_start_3, test_time_end, False, 'ci', path + str(part) + '/date/')
    fi_test_3, fc_test_3 = feature_extraction.wrapper(U_test_item_3, P_item_id_unique, duration_hours(dt_test_start_3, dt0), duration_hours(dt_test_thresh, dt0))

    U_test_item_4 = data_utils.get_data_dict_4(P_item_id_unique, P_item_cat_unique, test_time_start_4, test_time_end, False, 'ci', path + str(part) + '/date/')
    fi_test_4, fc_test_4 = feature_extraction.wrapper(U_test_item_4, P_item_id_unique, duration_hours(dt_test_start_4, dt0), duration_hours(dt_test_thresh, dt0))

    test_ic_list = [fi_test_1, fc_test_1, fi_test_2, fc_test_2, fi_test_3, fc_test_3, fi_test_4, fc_test_4]

    print 'Raw test ic feats list: ' + path + str(part) + '/test_ic_list'
    with open(path + str(part) + '/test_ic_list', 'wb') as fp:
        pickle.dump(test_ic_list, fp, protocol=2)

    print 'Completed!'


def gen_uci_feats(path='../data_', part=1):
    dates = os.listdir(path + str(part) + '/date')
    dates.sort()  # per-date event files (not used further in this function)

    global dt0
    global duration_hours

    fe = feature_extraction.Feature(path + str(part) + '/itemdict')  # feature extractor initialised from the item dictionary file

    _, P_item_id, _, _ = data_utils.load_P_item(path + str(part) + '/itemdict')
    P_item_id_unique = np.unique(P_item_id).tolist()
    P_item_id_unique = dict((el, 0) for el in P_item_id_unique)

    # Train
    print 'Extracting train feats...'
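    # Same four train windows as in gen_ic_feats, but get_data_dict_4 runs in
    # 'uci' mode with labels enabled, so it also returns per-window labels and
    # the ground-truth result set for the threshold day; each window is then
    # turned into a feature matrix X, label vector y and row keys ks by
    # fe.extract_features_item_2 (ks presumably identifies the user/item rows).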

    train_time_start_1 = '2014-11-18'
    train_time_start_2 = '2014-12-15'
    train_time_start_3 = '2014-12-13'
    train_time_start_4 = '2014-12-09'

    train_time_end = '2014-12-15'
    train_time_thresh = '2014-12-16'

    dt_train_start_1 = datetime.strptime(train_time_start_1 + ' 00', '%Y-%m-%d %H')
    dt_train_start_2 = datetime.strptime(train_time_start_2 + ' 00', '%Y-%m-%d %H')
    dt_train_start_3 = datetime.strptime(train_time_start_3 + ' 00', '%Y-%m-%d %H')
    dt_train_start_4 = datetime.strptime(train_time_start_4 + ' 00', '%Y-%m-%d %H')
    dt_train_thresh = datetime.strptime(train_time_thresh + ' 00', '%Y-%m-%d %H')

    U_train_item_1, label_train_1, result_train_truth_1 = data_utils.get_data_dict_4(P_item_id_unique, None, train_time_start_1, train_time_end, True, 'uci', path + str(part) + '/date/')
    X_train_1, y_train_1, ks_train_1 = fe.extract_features_item_2(U_train_item_1, label_train_1, duration_hours(dt_train_thresh, dt0), duration_hours(dt_train_start_1, dt0))

    U_train_item_2, label_train_2, result_train_truth_2 = data_utils.get_data_dict_4(P_item_id_unique, None, train_time_start_2, train_time_end, True, 'uci', path + str(part) + '/date/')
    X_train_2, y_train_2, ks_train_2 = fe.extract_features_item_2(U_train_item_2, label_train_2, duration_hours(dt_train_thresh, dt0), duration_hours(dt_train_start_2, dt0))

    U_train_item_3, label_train_3, result_train_truth_3 = data_utils.get_data_dict_4(P_item_id_unique, None, train_time_start_3, train_time_end, True, 'uci', path + str(part) + '/date/')
    X_train_3, y_train_3, ks_train_3 = fe.extract_features_item_2(U_train_item_3, label_train_3, duration_hours(dt_train_thresh, dt0), duration_hours(dt_train_start_3, dt0))

    U_train_item_4, label_train_4, result_train_truth_4 = data_utils.get_data_dict_4(P_item_id_unique, None, train_time_start_4, train_time_end, True, 'uci', path + str(part) + '/date/')
    X_train_4, y_train_4, ks_train_4 = fe.extract_features_item_2(U_train_item_4, label_train_4, duration_hours(dt_train_thresh, dt0), duration_hours(dt_train_start_4, dt0))

    train_list = [X_train_1, y_train_1, ks_train_1, result_train_truth_1, X_train_2, y_train_2, ks_train_2, result_train_truth_2, X_train_3, y_train_3, ks_train_3, result_train_truth_3, X_train_4, y_train_4, ks_train_4, result_train_truth_4]

    print 'Raw train feats list: ' + path + str(part) + '/train_list'
    with open(path + str(part) + '/train_list', 'wb') as fp:
        pickle.dump(train_list, fp, protocol=2)
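
    # concat3 (defined elsewhere) merges the four per-window feature matrices,
    # presumably aligning rows on the key lists passed with them; the labels
    # and keys of window 1 (the 28-day window) are kept for the merged matrix.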

    X_train = concat3([X_train_2, X_train_3, X_train_4, X_train_1], [ks_train_2, ks_train_3, ks_train_4, ks_train_1])
    y_train = y_train_1
    ks_train = ks_train_1

    train_concat = [X_train, y_train, ks_train]

    print 'Concatenated train feats: ' + path + str(part) + '/train_concat'
    with open(path + str(part) + '/train_concat', 'wb') as fp:
        pickle.dump(train_concat, fp, protocol=2)

    # Val
    print 'Extracting val feats...'
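    # Validation windows: the same scheme shifted one day forward
    # (end 2014-12-16, threshold 2014-12-17).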

    val_time_start_1 = '2014-11-19'
    val_time_start_2 = '2014-12-16'
    val_time_start_3 = '2014-12-14'
    val_time_start_4 = '2014-12-10'

    val_time_end = '2014-12-16'
    val_time_thresh = '2014-12-17'

    dt_val_start_1 = datetime.strptime(val_time_start_1 + ' 00', '%Y-%m-%d %H')
    dt_val_start_2 = datetime.strptime(val_time_start_2 + ' 00', '%Y-%m-%d %H')
    dt_val_start_3 = datetime.strptime(val_time_start_3 + ' 00', '%Y-%m-%d %H')
    dt_val_start_4 = datetime.strptime(val_time_start_4 + ' 00', '%Y-%m-%d %H')
    dt_val_thresh = datetime.strptime(val_time_thresh + ' 00', '%Y-%m-%d %H')

    U_val_item_1, label_val_1, result_val_truth_1 = data_utils.get_data_dict_4(P_item_id_unique, None, val_time_start_1, val_time_end, True, 'uci', path + str(part) + '/date/')
    X_val_1, y_val_1, ks_val_1 = fe.extract_features_item_2(U_val_item_1, label_val_1, duration_hours(dt_val_thresh, dt0), duration_hours(dt_val_start_1, dt0))

    U_val_item_2, label_val_2, result_val_truth_2 = data_utils.get_data_dict_4(P_item_id_unique, None, val_time_start_2, val_time_end, True, 'uci', path + str(part) + '/date/')
    X_val_2, y_val_2, ks_val_2 = fe.extract_features_item_2(U_val_item_2, label_val_2, duration_hours(dt_val_thresh, dt0), duration_hours(dt_val_start_2, dt0))

    U_val_item_3, label_val_3, result_val_truth_3 = data_utils.get_data_dict_4(P_item_id_unique, None, val_time_start_3, val_time_end, True, 'uci', path + str(part) + '/date/')
    X_val_3, y_val_3, ks_val_3 = fe.extract_features_item_2(U_val_item_3, label_val_3, duration_hours(dt_val_thresh, dt0), duration_hours(dt_val_start_3, dt0))

    U_val_item_4, label_val_4, result_val_truth_4 = data_utils.get_data_dict_4(P_item_id_unique, None, val_time_start_4, val_time_end, True, 'uci', path + str(part) + '/date/')
    X_val_4, y_val_4, ks_val_4 = fe.extract_features_item_2(U_val_item_4, label_val_4, duration_hours(dt_val_thresh, dt0), duration_hours(dt_val_start_4, dt0))

    val_list = [X_val_1, y_val_1, ks_val_1, result_val_truth_1, X_val_2, y_val_2, ks_val_2, result_val_truth_2, X_val_3, y_val_3, ks_val_3, result_val_truth_3, X_val_4, y_val_4, ks_val_4, result_val_truth_4]

    print 'Raw val feats list: ' + path + str(part) + '/val_list'
    with open(path + str(part) + '/val_list', 'wb') as fp:
        pickle.dump(val_list, fp, protocol=2)

    X_val = concat3([X_val_2, X_val_3, X_val_4, X_val_1], [ks_val_2, ks_val_3, ks_val_4, ks_val_1])
    y_val = y_val_1
    ks_val = ks_val_1
    result_val_truth = result_val_truth_1

    val_concat = [X_val, y_val, ks_val, result_val_truth]

    print 'Concatenated val feats: ' + path + str(part) + '/val_concat'
    with open(path + str(part) + '/val_concat', 'wb') as fp:
        pickle.dump(val_concat, fp, protocol=2)

    # Test
    print 'Extracting test feats...'
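    # Test windows (end 2014-12-18, threshold 2014-12-19) have no labels, so
    # get_data_dict_4 is called with False and returns a single value, and the
    # label slot from extract_features_item_2 is discarded.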

    test_time_start_1 = '2014-11-21'
    test_time_start_2 = '2014-12-18'
    test_time_start_3 = '2014-12-16'
    test_time_start_4 = '2014-12-12'

    test_time_end = '2014-12-18'
    test_time_thresh = '2014-12-19'

    duration_hours = lambda x, y: int((x - y).total_seconds() / 3600)  # rebinds the module-level helper

    dt_test_start_1 = datetime.strptime(test_time_start_1 + ' 00', '%Y-%m-%d %H')
    dt_test_start_2 = datetime.strptime(test_time_start_2 + ' 00', '%Y-%m-%d %H')
    dt_test_start_3 = datetime.strptime(test_time_start_3 + ' 00', '%Y-%m-%d %H')
    dt_test_start_4 = datetime.strptime(test_time_start_4 + ' 00', '%Y-%m-%d %H')
    dt_test_thresh = datetime.strptime(test_time_thresh + ' 00', '%Y-%m-%d %H')

    U_test_item_1 = data_utils.get_data_dict_4(P_item_id_unique, None, test_time_start_1, test_time_end, False, 'uci', path + str(part) + '/date/')
    X_test_1, _, ks_test_1 = fe.extract_features_item_2(U_test_item_1, None, duration_hours(dt_test_thresh, dt0), duration_hours(dt_test_start_1, dt0))

    U_test_item_2 = data_utils.get_data_dict_4(P_item_id_unique, None, test_time_start_2, test_time_end, False, 'uci', path + str(part) + '/date/')
    X_test_2, _, ks_test_2 = fe.extract_features_item_2(U_test_item_2, None, duration_hours(dt_test_thresh, dt0), duration_hours(dt_test_start_2, dt0))

    U_test_item_3 = data_utils.get_data_dict_4(P_item_id_unique, None, test_time_start_3, test_time_end, False, 'uci', path + str(part) + '/date/')
    X_test_3, _, ks_test_3 = fe.extract_features_item_2(U_test_item_3, None, duration_hours(dt_test_thresh, dt0), duration_hours(dt_test_start_3, dt0))

    U_test_item_4 = data_utils.get_data_dict_4(P_item_id_unique, None, test_time_start_4, test_time_end, False, 'uci', path + str(part) + '/date/')
    X_test_4, _, ks_test_4 = fe.extract_features_item_2(U_test_item_4, None, duration_hours(dt_test_thresh, dt0), duration_hours(dt_test_start_4, dt0))

    test_list = [X_test_1, ks_test_1, X_test_2, ks_test_2, X_test_3, ks_test_3, X_test_4, ks_test_4]

    print 'Raw test feats list: ' + path + str(part) + '/test_list'
    with open(path + str(part) + '/test_list', 'wb') as fp:
        pickle.dump(test_list, fp, protocol=2)

    X_test = concat3([X_test_2, X_test_3, X_test_4, X_test_1], [ks_test_2, ks_test_3, ks_test_4, ks_test_1])
    ks_test = ks_test_1

    test_concat = [X_test, ks_test]

    print 'Concatenated test feats: ' + path + str(part) + '/test_concat'
    with open(path + str(part) + '/test_concat', 'wb') as fp:
        pickle.dump(test_concat, fp, protocol=2)
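

# Minimal usage sketch (an assumption, not part of the original module): run
# both extractors for one data part. It presumes the ../data_<part>/ layout
# (date/, itemdict) used above and that dt0 and concat3 are provided at
# module level as noted at the top of the file.
if __name__ == '__main__':
    part = 1
    gen_ic_feats(path='../data_', part=part)   # item/category level features
    gen_uci_feats(path='../data_', part=part)  # joint features plus labels
    # The pickled outputs (protocol 2) can be reloaded later, for example:
    # with open('../data_' + str(part) + '/train_concat', 'rb') as fp:
    #     X_train, y_train, ks_train = pickle.load(fp)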