def show_histogram(target_field_name=None, user_data_pkl_name='sampled_user_data.pkl', divider_output_filename='bucket_dividers.pkl', num_buckets=10):
    """Build histograms for one relevant field, or for all of them.

    target_field_name: name of the field to histogram, or 'all' (any case)
        to histogram every field listed in the relevant_features file.
        Defaults to None, in which case nothing is computed.
    num_buckets: number of histogram buckets to use.
    Returns a dict mapping field name -> histogram result.
    """
    import re
    # Read the list of relevant field names, one per line, with all
    # whitespace stripped from each entry.
    field_names = []
    # with-statement fix: the original never closed this file handle.
    with open(csrec_paths.get_features_dir() + 'relevant_features', 'rb') as f:
        for line in f:
            line = re.sub(r'\s', '', line)
            # NOTE(review): len(line) > 1 silently drops one-character field
            # names — the sibling init_field_names keeps any non-empty
            # name; confirm which filter is intended.
            if len(line) > 1:
                field_names.append(line)
    ensure_user_data_loaded()
    histograms = {}
    # Bug fix: target_field_name defaults to None, and None.lower() raised
    # AttributeError in the original code.
    if target_field_name is None:
        return histograms
    if target_field_name.lower() == 'all':
        for field_name, possible_values in ALL_VALUES.iteritems():
            if field_name in field_names:
                # NOTE(review): 1346062 looks like a known-present sample
                # user id, used only to read each field's field_type —
                # confirm it exists in every data file this runs against.
                histograms[field_name] = get_histograms_from_values(
                    USER_DATA[1346062][field_name]['field_type'],
                    field_name, possible_values, num_buckets)
    else:
        possible_values = ALL_VALUES[target_field_name]
        # Bug fix: the original discarded this return value.
        histograms[target_field_name] = get_histograms_from_values(
            USER_DATA[1346062][target_field_name]['field_type'],
            target_field_name, possible_values, num_buckets)
    return histograms
def init_field_names(self):
    """Read the relevant_features file and cache the field names.

    Sets self.field_names to the whitespace-stripped, non-empty lines of
    the file, and self.num_fields to their count.
    """
    self.field_names = []
    # with-statement fix: the original left the file handle open. The
    # original's `if f:` guard was dead code — an open file object is
    # always truthy — so it is dropped with no behavior change.
    with open(csrec_paths.get_features_dir() + 'relevant_features', 'rb') as f:
        for line in f:
            name = re.sub(r'\s', '', line)
            if name:
                self.field_names.append(name)
    self.num_fields = len(self.field_names)
def ensure_user_data_loaded(): user_data_pkl_name='sampled_user_data.pkl' global USER_DATA, ALL_VALUES if not USER_DATA: print 'loading user data...' USER_DATA = cPickle.load(open(csrec_paths.get_features_dir()+user_data_pkl_name, 'rb')) #pprint.pprint(USER_DATA[1346062]['languages']) #raise print 'data for %s users loaded' % (len(USER_DATA)) ALL_VALUES = find_all_values_of_cols(USER_DATA)
def __init__(self, testing=False):
    # Set up the feature processor: load the user-features pickle, the
    # relevant field names, and the output dimensions.
    #
    # NOTE(review): DATA_FILE is only bound when testing=True, so calling
    # with testing=False (the default) raises NameError on the next
    # assignment — confirm whether a production data-file name was meant
    # to be assigned in an else branch here.
    if testing:
        DATA_FILE = 'sampled_user_data.pkl'
    self.user_pklfile = csrec_paths.get_features_dir() +DATA_FILE
    self.load_user_features_pkl()
    self.init_field_names()
    self.init_dimensions()
    # Counters accumulated while processing users.
    self.num_users_not_found = 0
    self.num_users_total = 0
    self.total_num_fields = 0
    self.testing = testing
    # NOTE(review): 'OuterProducGetter' looks like a typo for
    # 'OuterProductGetter' — confirm against the class definition before
    # renaming anywhere.
    self.outer_product_getter = OuterProducGetter(self.dimension)
def generate_bucket_dividers(user_data_pkl_name='sampled_user_data.pkl', divider_output_filename='bucket_dividers.pkl'):
    """Compute bucket dividers for every field and pickle them to disk.

    divider_output_filename: output pickle name (under the features dir).
    Also updates the module-level DIVIDERS cache in place.

    NOTE(review): user_data_pkl_name is accepted but ignored —
    ensure_user_data_loaded() hard-codes its own filename; confirm whether
    it should be threaded through.
    """
    global DIVIDERS
    ensure_user_data_loaded()
    bucket_dividers = {}
    for field_name, possible_values in ALL_VALUES.iteritems():
        # Per-field divider count, falling back to the module default.
        num_buckets = NUM_DIVIDERS.get(field_name, DEFAULT_NUM_DIVIDERS)
        bucket_dividers[field_name] = get_dividers_from_values(possible_values, num_buckets)
    # with-statement fix: the original never closed the output file,
    # risking a partially flushed pickle.
    with open(csrec_paths.get_features_dir() + divider_output_filename, 'wb') as out:
        cPickle.dump(bucket_dividers, out)
    DIVIDERS = bucket_dividers
# Python 2 module: bucketing of user feature values for the recommender.
import cPickle
import numpy as np
import math
import csrec_paths
import pprint
import optparse
import feature_processor
import re
from collections import Counter

# Per-field override for the number of bucket dividers; all other fields
# fall back to DEFAULT_NUM_DIVIDERS.
NUM_DIVIDERS = {'age': 5}
DEFAULT_NUM_DIVIDERS = 10
# NOTE(review): loaded eagerly at import time — the file handle is never
# closed, and a missing bucket_dividers.pkl makes this module
# unimportable. Confirm the file is guaranteed to exist before import.
DIVIDERS = cPickle.load(open(csrec_paths.get_features_dir()+'bucket_dividers.pkl', 'rb'))
# Lazily populated caches; filled in by ensure_user_data_loaded().
USER_DATA = None
ALL_VALUES = None

def ensure_user_data_loaded():
    # Load the sampled user data pickle into the module-level caches once;
    # no-op on subsequent calls while USER_DATA is truthy.
    user_data_pkl_name='sampled_user_data.pkl'
    global USER_DATA, ALL_VALUES
    if not USER_DATA:
        print 'loading user data...'
        USER_DATA = cPickle.load(open(csrec_paths.get_features_dir()+user_data_pkl_name, 'rb'))
        #pprint.pprint(USER_DATA[1346062]['languages'])
        #raise
        print 'data for %s users loaded' % (len(USER_DATA))
        ALL_VALUES = find_all_values_of_cols(USER_DATA)

def generate_bucket_dividers(user_data_pkl_name='sampled_user_data.pkl', divider_output_filename='bucket_dividers.pkl'):
    # NOTE(review): this definition appears truncated in this chunk — a
    # complete implementation of the same function exists elsewhere in
    # the file; confirm which copy is authoritative.
    ensure_user_data_loaded()
import csrec_paths
import cPickle

# Path to the pickled, pre-merged user-interest profiles.
pkl_path = csrec_paths.get_features_dir()+'interests/interest_extraction/merged_interest_dct.pkl'
# Loaded once at import time as a module-level cache.
# with-statement fix: the original never closed the file handle.
with open(pkl_path, 'rb') as _pkl_file:
    cached_profiles_dct = cPickle.load(_pkl_file)