# create_normalized_features.py

import sys
import math
# Module-wide logger named 'lambda'. basicConfig sends records of level DEBUG
# and above to the file 'lambda.log'.
import logging
logger = logging.getLogger('lambda')
logging.basicConfig(filename='lambda.log', level=logging.DEBUG)

# coloredlogs is an optional dependency: when it can be imported it is
# installed at DEBUG level, otherwise the ImportError is deliberately ignored.
try:
    import coloredlogs
    coloredlogs.install(level='DEBUG')
except ImportError:
    pass

# To use a data analysis framework
import numpy as np
from scipy.stats import itemfreq

# To manage the item entry
from item import ItemHelper

class GlobalData(object):
    """Per-attribute store of NumPy arrays persisted as ``.npy`` files.

    Each attribute is kept in its own file, ``data_numpy/<name>_<attribute>.npy``
    (see ``get_path``). Arrays are loaded on demand with ``load`` and can be
    dropped again with ``discard`` to keep the memory usage low.
    """

    _name = None
    # Placeholder only: every instance gets its own dict in __init__.
    _internal = {}

    # Strings that mark a missing value in the raw data handled by convert().
    _MISSING_MARKERS = ("NULL", "None")

    _COMPETITOR_ATTRIBUTES = (
        "comp1_rate", "comp1_inv", "comp1_rate_percent_diff",
        "comp2_rate", "comp2_inv", "comp2_rate_percent_diff",
        "comp3_rate", "comp3_inv", "comp3_rate_percent_diff",
        "comp4_rate", "comp4_inv", "comp4_rate_percent_diff",
        "comp5_rate", "comp5_inv", "comp5_rate_percent_diff",
        "comp6_rate", "comp6_inv", "comp6_rate_percent_diff",
        "comp7_rate", "comp7_inv", "comp7_rate_percent_diff",
        "comp8_rate", "comp8_inv", "comp8_rate_percent_diff",
    )

    _INTEGER_ATTRIBUTES = (
        "srch_id",
        "site_id",
        "visitor_location_country_id",
        "prop_country_id",
        "prop_id",
        "prop_starrating",
        "prop_brand_bool",
        "position",
        "promotion_flag",
        "srch_destination_id",
        "srch_length_of_stay",
        "srch_booking_window",
        "srch_adults_count",
        "srch_children_count",
        "srch_room_count",
        "srch_saturday_night_bool",
        "random_bool",
        "click_bool",
        "booking_bool",
    )

    _TEXT_ATTRIBUTES = ("date_time",)

    _FLOAT_ATTRIBUTES = (
        "visitor_hist_starrating",
        "visitor_hist_adr_usd",
        "prop_review_score",
        "prop_location_score1",
        "prop_location_score2",
        "prop_log_historical_price",
        "price_usd",
        "srch_query_affinity_score",
        "orig_destination_distance",
        "gross_bookings_usd",
        # Newly created attributes
        "new_hist_price_booking",
        "new_hist_price_click",
        "new_hist_starring_booking",
        "new_hist_starring_click",
    ) + _COMPETITOR_ATTRIBUTES

    # Worst-case substitutes written by convert() in place of missing values:
    #   competitor columns                         -> 0
    #   prop_review_score / prop_location_score2   -> 0.0 (their minimum)
    #   srch_query_affinity_score                  -> -326.5675 (its minimum)
    #   orig_destination_distance                  -> 11692.98 (its maximum)
    _MISSING_VALUE_FILL = dict.fromkeys(_COMPETITOR_ATTRIBUTES, 0)
    _MISSING_VALUE_FILL.update({
        "prop_review_score": 0.0,
        "prop_location_score2": 0.0,
        "srch_query_affinity_score": -326.5675,
        "orig_destination_distance": 11692.98,
    })

    def __init__(self, name):
        """Create an empty store whose files are prefixed with *name*."""
        self._name = name
        # The builtin ``float`` replaces ``np.float``: the alias was deprecated
        # in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        schema = {}
        for element_type, attributes in (
                (np.integer, self._INTEGER_ATTRIBUTES),
                (np.chararray, self._TEXT_ATTRIBUTES),
                (float, self._FLOAT_ATTRIBUTES)):
            for attribute in attributes:
                schema[attribute] = {"type": element_type, "data": None}
        self._internal = schema

    def load(self, attribute):
        """Read the ``.npy`` file of *attribute* into memory."""
        np_array = np.load(self.get_path(attribute))
        self._internal[attribute]["data"] = np_array
        logger.info("Attribute ({0}) {1} items are loaded.".format(attribute, np_array.size))

    # To keep the memory usage low
    def discard(self, attribute):
        """Drop the in-memory array of *attribute* (the file is untouched)."""
        self._internal[attribute]["data"] = None
        logger.info("Attribute ({0}) every item is discarded.".format(attribute))

    def get(self, attribute):
        """Return the loaded array of *attribute*, or None when not loaded."""
        return self._internal[attribute]["data"]

    def convert(self, attribute, raw_array, auto_save):
        """Replace missing-value markers in *raw_array* and save the result.

        Elements equal to "NULL" or "None" are replaced with the fill value
        registered for *attribute* in ``_MISSING_VALUE_FILL``; attributes
        without an entry are saved unchanged.

        The output always goes to the "valid_train" file of *attribute*,
        independent of this instance's own name. *auto_save* is accepted for
        interface compatibility but is not used.
        """
        logger.info("Converting the attribute ({0})...".format(attribute))
        valid_data = GlobalData("valid_train")

        if attribute in self._MISSING_VALUE_FILL:
            fill_value = self._MISSING_VALUE_FILL[attribute]
            markers = self._MISSING_MARKERS
            # NOTE(review): np.vectorize without ``otypes`` takes its output
            # dtype from the result for the first element, so the dtype of the
            # saved array depends on whether the first value is missing.
            substitute = np.vectorize(lambda x: fill_value if x in markers else x)
            raw_array = substitute(raw_array)

        np_array = np.array(raw_array)
        np.save(valid_data.get_path(attribute), np_array)
        logger.info("Attribute ({0}) {1} items are converted.".format(attribute, np_array.size))

    def save(self, attribute):
        """Write the loaded array of *attribute* back to its ``.npy`` file."""
        np_array = self._internal[attribute]["data"]
        np.save(self.get_path(attribute), np_array)
        logger.info("Attribute ({0}) {1} items are saved.".format(attribute, np_array.size))

    def export(self, attribute, np_array):
        """Write *np_array* to the file of *attribute* without storing it."""
        np.save(self.get_path(attribute), np_array)
        logger.info("Attribute ({0}) {1} items are exported.".format(attribute, np_array.size))

    def get_path(self, attribute):
        """Return the ``.npy`` path used for *attribute* of this data set."""
        return "data_numpy/{0}_{1}.npy".format(self._name, attribute)

    def get_data_outline_path(self, prefix, attribute):
        """Return the text-report path for *attribute*, tagged with *prefix*."""
        return "data_outline/{2}_{0}_{1}.txt".format(self._name, attribute, prefix)

def load_benchmark(path):
    """Read a comma-separated benchmark file into a dict of lists.

    The first line is treated as a header and skipped. For every other line
    the first field is the key and the second field is appended to that key's
    list, in file order. Blank lines are skipped.

    Returns:
        dict mapping the first field to the list of second-field strings.
    """
    ret = {}
    with open(path) as fp:
        fp.readline()  # to ignore the header

        # The rows must be read while the file is still open.
        for line in fp:
            stripped = line.strip()
            if not stripped:
                continue
            fields = stripped.split(",")
            ret.setdefault(fields[0], []).append(fields[1])

    logger.info("Number of benchmark items: {0}".format(len(ret)))
    return ret

# (exclusive upper bound of the log-price gap, score), in ascending order.
_PRICE_GAP_WEIGHTS = (
    (0.1, 1.353),
    (0.2, 1.066),
    (0.3, 1.013),
    (0.4, 0.623),
    (0.5, 0),
    (0.6, -0.298),
    (0.7, -0.55),
    (0.8, -0.976),
    (0.9, -1.152),
)
# Score for gaps of 0.9 and above (and for NaN gaps, which match no bucket).
_PRICE_GAP_FALLBACK = -2.199


def _price_gap_weight(log_gap):
    """Return the score of the first bucket whose upper bound exceeds *log_gap*."""
    for upper_bound, weight in _PRICE_GAP_WEIGHTS:
        if log_gap < upper_bound:
            return weight
    return _PRICE_GAP_FALLBACK


def combine_something(self):
    """Create the "new_hist_price_click" feature for the "train" data set.

    For every row the absolute difference between log(visitor_hist_adr_usd)
    and log(price_usd) is mapped to a score through ``_PRICE_GAP_WEIGHTS``.
    Rows whose history value equals the string "nan", or where either value
    is not strictly positive, get 0.

    The result is cast to the type registered for the feature in
    ``self._internal``, stored there, and exported through a "train"
    GlobalData.

    NOTE(review): this is a module-level function; *self* only needs to expose
    a GlobalData-style ``_internal`` mapping.
    """
    # this part can be used to create new feature npy file
    train_data = GlobalData("train")
    train_data.load("visitor_hist_adr_usd")
    train_data.load("price_usd")
    hist_prices = train_data.get("visitor_hist_adr_usd")
    prices = train_data.get("price_usd")

    new = []
    for i in range(0, hist_prices.size):
        score = 0
        if hist_prices[i] != "nan":
            hist_price = float(hist_prices[i])
            if hist_price > 0:
                price = float(prices[i])
                if price > 0:
                    score = _price_gap_weight(
                        abs(math.log(hist_price) - math.log(price)))
        new.append(score)

    raw_array = np.array(new)
    np_array = raw_array.astype(self._internal["new_hist_price_click"]["type"])
    self._internal["new_hist_price_click"]["data"] = np_array
    train_data.export("new_hist_price_click", np_array)
# (exclusive upper bound of the star-rating gap, score), in ascending order.
_STAR_GAP_WEIGHTS = (
    (1, 1.672),
    (2, 0.053),
    (3, -0.856),
    (4, -1.017),
    (5, -0.876),
)


def _star_gap_weight(star_gap):
    """Return the score for *star_gap*; gaps of 5 and above (or NaN) give 0."""
    for upper_bound, weight in _STAR_GAP_WEIGHTS:
        if star_gap < upper_bound:
            return weight
    return 0


def combine_something_2(self):
    """Create the "new_hist_starring_booking" feature for the "train" data set.

    For every row the absolute difference between visitor_hist_starrating and
    prop_starrating is mapped to a score through ``_STAR_GAP_WEIGHTS``. Rows
    whose history value equals the string "nan" get 0.

    The result is cast to the type registered for the feature in
    ``self._internal``, stored there, and exported through a "train"
    GlobalData.

    NOTE(review): this is a module-level function; *self* only needs to expose
    a GlobalData-style ``_internal`` mapping.
    """
    # this part can be used to create new feature npy file
    train_data = GlobalData("train")
    train_data.load("visitor_hist_starrating")
    train_data.load("prop_starrating")
    hist_stars = train_data.get("visitor_hist_starrating")
    prop_stars = train_data.get("prop_starrating")

    new = []
    for i in range(0, hist_stars.size):
        # The per-row debug prints were removed: they wrote two lines per row.
        if hist_stars[i] != "nan":
            diff = abs(float(hist_stars[i]) - int(prop_stars[i]))
            new.append(_star_gap_weight(diff))
        else:
            new.append(0)

    raw_array = np.array(new)
    np_array = raw_array.astype(self._internal["new_hist_starring_booking"]["type"])
    self._internal["new_hist_starring_booking"]["data"] = np_array
    train_data.export("new_hist_starring_booking", np_array)

def convert_data_to_numpy(path, train_data):
    """Convert selected columns of the CSV file at *path* to ``.npy`` files.

    The header line only determines the expected number of fields. Every
    following line is split on ","; lines with a different field count are
    logged as warnings and skipped. The selected columns of the remaining
    rows are gathered into one array and each column is handed to
    ``train_data.convert``.

    Args:
        path: path of the CSV file to read.
        train_data: object whose ``convert(attribute, array, auto_save)``
            method stores one converted column.
    """
    line_number = 0
    with open(path) as fp:
        temp = fp.readline().strip().split(",")  # to ignore the header but to count the number of fields
        num_of_fields = len(temp)
        logger.info("Number of fields: {0}".format(num_of_fields))

        # DO NOT TRY TO UPDATE CONVERT EVERY ATTRIBUTE (IT WILL CONSUME HUGE MEMORY SPACE)
        # Currently disabled (enable selectively): srch_id, date_time,
        # visitor_hist_starrating, prop_starrating, prop_review_score,
        # prop_location_score2, srch_query_affinity_score,
        # orig_destination_distance, comp1..comp8 (rate, inv,
        # rate_percent_diff) and gross_bookings_usd.
        need_to_be_convert = [
            "site_id",
            "visitor_location_country_id",
            "visitor_hist_adr_usd",
            "prop_country_id",
            "prop_id",
            "prop_brand_bool",
            "prop_location_score1",
            "prop_log_historical_price",
            "position",
            "price_usd",
            "promotion_flag",
            "srch_destination_id",
            "srch_length_of_stay",
            "srch_booking_window",
            "srch_adults_count",
            "srch_children_count",
            "srch_room_count",
            "srch_saturday_night_bool",
            "random_bool",
            "click_bool",
            "booking_bool",
        ]

        item_helper = ItemHelper()
        entire_data = []
        # A real list: it is reused for every row, which an iterator-returning
        # map() (Python 3) would not survive.
        mask_entire_data = [item_helper.get_column_index_of(name)
                            for name in need_to_be_convert]

        for linebuf in fp:  # full conversion
            line_number = line_number + 1
            fields = linebuf.strip().split(",")

            if line_number % 1000 == 0:
                # Progress indicator rewritten in place ("\r", no newline).
                sys.stdout.write("Reading the line : {0}\r".format(line_number))

            if len(fields) == num_of_fields:
                entire_data.append([fields[i] for i in mask_entire_data])
            else:
                logger.warning("Mismatching fields: {0}".format(fields))

        sys.stdout.write("\n")

        np_array_entire_data = np.array(entire_data)
        for idx, val in enumerate(need_to_be_convert):
            train_data.convert(val, np_array_entire_data[:, idx], True)

        logger.info("Completed: {0}".format(line_number))

def print_possible_values(attribute, create_file=False):
    """Write every distinct value of a "train" attribute with its frequency.

    One "<value> <count>" line is written per distinct value. For arrays of
    dtype char "d" (float64) NaN values are left out. Counts are written as
    integers.

    The distinct values come from ``np.unique(..., return_counts=True)``
    instead of ``scipy.stats.itemfreq``, which was removed in SciPy 1.3.

    Args:
        attribute: name of the attribute to summarise.
        create_file: when true, write to the "count" outline file of the
            attribute; otherwise write to standard output.
    """
    train_data = GlobalData("train")
    train_data.load(attribute)
    np_array = train_data.get(attribute)

    outline_path = train_data.get_data_outline_path("count", attribute)
    if create_file:
        f = open(outline_path, "w")
    else:
        f = sys.stdout

    try:
        skip_nan = np_array.dtype.char == "d"
        values, counts = np.unique(np_array, return_counts=True)
        for value, count in zip(values, counts):
            if skip_nan and np.isnan(value):
                continue
            f.write("%s %s\n" % (value, count))
    finally:
        # Release the array and the file even when writing fails.
        train_data.discard(attribute)
        if create_file:
            f.close()

def print_summary_statistics(attribute, create_file=False):
    """Write min, max and selected percentiles of a "train" attribute.

    Nothing is written for arrays of dtype char "O". All statistics ignore
    NaN values (``np.nanmin`` / ``np.nanmax`` / ``np.nanpercentile``).

    Args:
        attribute: name of the attribute to summarise.
        create_file: when true, write to the "summary" outline file of the
            attribute; otherwise write to standard output.
    """
    train_data = GlobalData("train")
    train_data.load(attribute)
    np_array = train_data.get(attribute)

    outline_path = train_data.get_data_outline_path("summary", attribute)
    if create_file:
        f = open(outline_path, "w")
    else:
        f = sys.stdout

    # (label, percentile) pairs; the labels are written verbatim.
    percentile_rows = (
        ("percentile .1: ", 0.1),
        ("percentile 1: ", 1),
        ("percentile 10: ", 10),
        ("percentile 50: ", 50),
        ("percentile 90: ", 90),
        ("percentile 99: ", 99),
        ("percentile 99.9: ", 99.9),
    )

    try:
        if np_array.dtype.char != "O":
            f.write("=== statistics (nan ignored) ===\n")
            f.write("%s %s\n" % ("min: ", np.nanmin(np_array)))
            f.write("%s %s\n" % ("max: ", np.nanmax(np_array)))
            # One call computes every requested percentile.
            results = np.nanpercentile(np_array, [q for _, q in percentile_rows])
            for (label, _), value in zip(percentile_rows, results):
                f.write("%s %s\n" % (label, value))
    finally:
        # Release the array and the file even when a statistic fails.
        train_data.discard(attribute)
        if create_file:
            f.close()

def get_relative_portion_of_missing_values():
    """Count the NaN entries of every float64 attribute of the "train" data.

    Returns:
        list of ``(attribute, nan_count, total_count)`` tuples, one per
        attribute whose array has dtype char "d"; other attributes are
        skipped.
    """
    train_data = GlobalData("train")
    item_helper = ItemHelper()
    result = []
    for key in item_helper.get_all_column_names():
        train_data.load(key)
        np_array = train_data.get(key)
        if np_array.dtype.char == "d":
            result.append((key, np.count_nonzero(np.isnan(np_array)), np_array.size))
        # Always release the array; skipped attributes must not stay loaded.
        train_data.discard(key)
    return result

def get_rid_outlier (np_array, lower_percentile, upper_percentile):
    """Clamp *np_array* in place to its NaN-ignoring percentile bounds.

    Values below the *lower_percentile* bound are set to that bound and
    values above the *upper_percentile* bound are set to that one. The same
    array object is modified and returned.
    """
    floor = np.nanpercentile(np_array, lower_percentile)
    ceiling = np.nanpercentile(np_array, upper_percentile)
    # Lower side first, then upper side, each through a boolean mask.
    for is_beyond, limit in ((np.less, floor), (np.greater, ceiling)):
        np_array[is_beyond(np_array, limit)] = limit
    return np_array

def normalize_linear(np_array, lower_percentile, upper_percentile):
    """Rescale *np_array* linearly between two NaN-ignoring percentiles.

    When both percentile bounds are equal the input is returned untouched.
    Otherwise the input is first clamped in place to the bounds, and a new
    array ``(clamped - lower) / (upper - lower)`` is returned.
    """
    low, high = [np.nanpercentile(np_array, q)
                 for q in (lower_percentile, upper_percentile)]

    if low == high:
        return np_array

    # The clamping writes into the caller's array.
    below = np_array < low
    np_array[below] = low
    above = np_array > high
    np_array[above] = high

    shifted = np_array - low
    return shifted / (high - low)

def sampling_data(data_name, sampling_rate, method_type, total_rows=9917530):
    """Apply one row sample to every attribute file of a data set.

    The same index array is used for all attributes so that the sampled
    columns stay aligned. Results are saved under the "sampled_" + data_name
    prefix.

    Args:
        data_name: name of the source data set.
        sampling_rate: fraction of *total_rows* to keep.
        method_type: 1 draws indices with ``np.random.choice``; 2 takes the
            leading rows.
        total_rows: number of rows in each attribute array.

    Raises:
        ValueError: if *method_type* is neither 1 nor 2.

    NOTE(review): ``np.random.choice`` samples with replacement by default,
    so method 1 can select the same row more than once.
    """
    sample_size = int(sampling_rate * total_rows)
    if method_type == 1:
        sampled = np.random.choice(total_rows, sample_size)
    elif method_type == 2:
        sampled = np.arange(sample_size)
    else:
        raise ValueError("Unknown method_type: {0!r} (expected 1 or 2)".format(method_type))

    data = GlobalData(data_name)  # normalized_train_attri
    sampled_data = GlobalData("sampled_" + data_name)
    item_helper = ItemHelper()
    for key in item_helper.get_all_column_names_new():
        np_array = np.load(data.get_path(key))
        logger.info("Sampling on the attribute ({0}) with {1} in total.".format(key, np_array.size))
        np.save(sampled_data.get_path(key), np_array[sampled])


def combine_npys(data_name):
    """Stack the per-attribute files of a data set into one combined file.

    The arrays are stacked row-wise in the order given by
    ``ItemHelper.get_all_column_names_new()`` and saved as
    ``data_numpy/combined_<data_name>.npy``. With a single attribute the
    array is saved as loaded, without stacking.

    Args:
        data_name: e.g. sampled_normalized_valid_train or ..._test.
    """
    data = GlobalData(data_name)
    item_helper = ItemHelper()
    keys = item_helper.get_all_column_names_new()

    arrays = [np.load(data.get_path(keys[0]))]
    for i in range(1, len(keys)):
        np_array = np.load(data.get_path(keys[i]))
        logger.info("Combining {0} with {1} in total \n.".format(keys[i], np_array.size))
        arrays.append(np_array)

    # One vstack at the end avoids recopying the growing result per attribute.
    result = arrays[0] if len(arrays) == 1 else np.vstack(arrays)
    np.save("data_numpy/combined_"+data_name+".npy", result)


def create_normalized_attribute(data_name, attribute):
    """Save a normalized copy of one attribute under "normalized_" + data_name.

    "srch_id" and "prop_id" are copied unchanged and "date_time" is saved
    without any processing. Every other attribute is optionally clamped to
    its 0.1 / 99.9 percentiles and log10-transformed (depending on the lists
    below) and then rescaled with ``normalize_linear``.
    """
    logger.info("Normalizing on the attribute ({0}).".format(attribute))
    source = GlobalData(data_name)
    target = GlobalData("normalized_" + data_name)
    values = np.load(source.get_path(attribute))

    # Identifier columns are passed through as they are.
    if attribute in ("srch_id", "prop_id"):
        np.save(target.get_path(attribute), values)
        return

    untouched = ["date_time"]
    clamp_first = ["price_usd", "visitor_hist_adr_usd_booking", "visitor_hist_adr_usd_click"]
    clamp_first.extend("comp" + str(i) + "_rate_percent_diff" for i in range(1, 9))
    log_scaled = ["price_usd", "visitor_hist_adr_usd", "gross_bookings_usd"]

    if attribute not in untouched:
        if attribute in clamp_first:
            values = get_rid_outlier(values, 0.1, 99.9)
        if attribute in log_scaled:
            values = np.log10(values)
        values = normalize_linear(values, 0.1, 99.9)

    np.save(target.get_path(attribute), values)

def validate_normalized_attribute(data_name, attribute):
    """Print the NaN-ignoring min and max of a normalized attribute.

    The array is read from the "normalized_" + data_name data set. Nothing is
    printed for "date_time".
    """
    logger.info("Validating on the normalized attribute ({0}).".format(attribute))
    normalized_data = GlobalData("normalized_" + data_name)

    ignore_attributes = ["date_time"]

    np_array = np.load(normalized_data.get_path(attribute))
    if attribute not in ignore_attributes:
        # Single-argument print() gives the same output on Python 2 and 3.
        print("%s %s" % (np.nanmin(np_array), np.nanmax(np_array)))

def average_over(data_name, categorical_key, attribute):
    """Normalize one attribute separately within each category value.

    The attribute is optionally clamped to its 0.1 / 99.9 percentiles and
    log10-transformed (depending on the lists below). The rows are then
    grouped by the value of *categorical_key* and each group is rescaled on
    its own with ``normalize_linear``. The result is saved under the
    "averaged_<categorical_key>_<data_name>" data set.
    """
    ignore_attributes = ["date_time"]
    need_to_remove_outliers = ["price_usd", "visitor_hist_adr_usd_booking", "visitor_hist_adr_usd_click"]
    for i in range(1, 9):
        need_to_remove_outliers.append("comp"+str(i)+"_rate_percent_diff")
    need_to_apply_log = ["price_usd", "visitor_hist_adr_usd", "gross_bookings_usd"]

    data = GlobalData(data_name)
    np_array = np.load(data.get_path(attribute))

    logger.info("Removing the outliers ({0}) over ({1}).".format(attribute,categorical_key))
    if attribute not in ignore_attributes:
        if attribute in need_to_remove_outliers:
            np_array = get_rid_outlier(np_array, 0.1, 99.9)
        if attribute in need_to_apply_log:
            np_array = np.log10(np_array)

    # The normalized values are fractions. Writing them back into a boolean
    # or integer array would silently truncate them, so widen to float first.
    if np_array.dtype.kind in "biu":
        np_array = np_array.astype(float)

    logger.info("Categorizing ({0}) over ({1}).".format(attribute,categorical_key))
    np_category = np.load(data.get_path(categorical_key))
    category_index = {}
    for idx, category_value in enumerate(np_category):
        category_index.setdefault(category_value, []).append(idx)

    logger.info("Normalizing the attribute ({0}) over ({1}).".format(attribute,categorical_key))
    for category_value in category_index:
        target_index = category_index[category_value]
        np_array[target_index] = normalize_linear(np_array[target_index], 0.1, 99.9)

    averaged_data = GlobalData("averaged_{0}_{1}".format(categorical_key,data_name))
    np.save(averaged_data.get_path(attribute), np_array)

def create_summary_of_prop_id(data_name, categorical_key, attribute):
    """Save mean, std and median of an attribute for every category value.

    The attribute is optionally clamped to its 0.1 / 99.9 percentiles and
    log10-transformed (depending on the lists below). Rows are grouped by
    the value of *categorical_key* and one
    ``[category, mean, std, median]`` row per group is saved to
    ``data_prop_id/<data_name>_<attribute>.npy``.

    NOTE(review): the output path does not contain *categorical_key*, so
    runs with different keys write to the same file.
    """
    skipped = ["date_time"]
    clamp_first = ["price_usd", "visitor_hist_adr_usd_booking", "visitor_hist_adr_usd_click"]
    clamp_first += ["comp" + str(i) + "_rate_percent_diff" for i in range(1, 9)]
    log_scaled = ["price_usd", "visitor_hist_adr_usd", "gross_bookings_usd"]

    source = GlobalData(data_name)
    values = np.load(source.get_path(attribute))

    logger.info("Removing the outliers ({0}) over ({1}).".format(attribute,categorical_key))
    if attribute not in skipped:
        if attribute in clamp_first:
            values = get_rid_outlier(values, 0.1, 99.9)
        if attribute in log_scaled:
            values = np.log10(values)

    logger.info("Categorizing ({0}) over ({1}).".format(attribute,categorical_key))
    categories = np.load(source.get_path(categorical_key))
    rows_by_category = {}
    for row, category in enumerate(categories):
        rows_by_category.setdefault(category, []).append(row)

    summary = []
    for category in rows_by_category:
        # Select the rows of the group once and reuse them for all statistics.
        group = values[rows_by_category[category]]
        summary.append([category, np.mean(group), np.std(group), np.median(group)])

    np.save("data_prop_id/{0}_{1}.npy".format(data_name, attribute), np.array(summary))

def main_old():
    """Create per-category summaries for one categorical feature.

    The feature is selected by the integer index given as the first
    command-line argument. Summaries are created for the "valid_train" and
    "valid_test" data sets.
    """
    import sys
    feature_index = int(sys.argv[1])
    categorical_features = ["srch_id",
                      "site_id",
                      "visitor_location_country_id",
                      "prop_country_id",
                      "prop_id",
                      "srch_destination_id"]
    categorical_key = categorical_features[feature_index]
    # Single-argument print() gives the same output on Python 2 and 3.
    print("Feature index: {0} - {1}".format(feature_index, categorical_key))

    for key in ["price_usd", "srch_length_of_stay", "srch_adults_count", "srch_children_count", "srch_room_count", "srch_saturday_night_bool"]:
        create_summary_of_prop_id("valid_train", categorical_key, key)
        create_summary_of_prop_id("valid_test", categorical_key, key)

def main():
    """Expand the per-prop_id summaries into row-aligned feature files.

    For the "train" and "test" data sets and each listed attribute, the
    saved ``[prop_id, mean, std, median]`` table is looked up for every row
    of the prop_id column. The three resulting arrays are saved as
    ``data_numpy/valid_<data_name>_new_prop_id_<attribute>_{mean,std,median}.npy``.
    """
    for data_name in ["train", "test"]:
        print(data_name)
        prop_id = np.load("data_numpy/valid_{0}_prop_id.npy".format(data_name))
        for key in ["price_usd", "srch_length_of_stay", "srch_adults_count", "srch_children_count", "srch_room_count", "srch_saturday_night_bool"]:
            print(key)
            feature_mean = np.zeros(prop_id.shape)
            feature_std = np.zeros(prop_id.shape)
            feature_median = np.zeros(prop_id.shape)
            prop_id_info = np.load("data_prop_id/valid_{0}_{1}.npy".format(data_name, key))

            # Lookup table: category value -> [mean, std, median]
            temp_prop_id_info = {}
            for i, mean, std, median in prop_id_info:
                temp_prop_id_info[i] = [mean, std, median]

            for idx, i in enumerate(prop_id):
                ttt = temp_prop_id_info[i]
                feature_mean[idx] = ttt[0]
                feature_std[idx] = ttt[1]
                feature_median[idx] = ttt[2]
                if idx % 1000 == 0:
                    # Progress indicator rewritten in place ("\r", no newline).
                    sys.stdout.write("{0} \r".format(idx))

            np.save("data_numpy/valid_{0}_new_prop_id_{1}_mean.npy".format(data_name, key), feature_mean)
            np.save("data_numpy/valid_{0}_new_prop_id_{1}_std.npy".format(data_name, key), feature_std)
            np.save("data_numpy/valid_{0}_new_prop_id_{1}_median.npy".format(data_name, key), feature_median)

# Script entry point: main() runs only when this file is executed directly.
if __name__ == "__main__":
    main()