Esempio n. 1
0
def show_histogram(target_field_name = None,
                   user_data_pkl_name='sampled_user_data.pkl',
                   divider_output_filename='bucket_dividers.pkl',
                   num_buckets=10):
    """Build histograms for one field, or for every relevant field.

    Args:
        target_field_name: Name of the field to process, or 'all'
            (compared case-insensitively) to process every field of
            ALL_VALUES that is also listed in the 'relevant_features' file.
            Required, despite the None default.
        user_data_pkl_name: Not used by this function; kept so existing
            callers keep working.
        divider_output_filename: Not used by this function; kept so existing
            callers keep working.
        num_buckets: Forwarded to get_histograms_from_values.

    Raises:
        ValueError: If target_field_name is None.
    """
    import re

    # Fail fast with a clear message instead of an AttributeError on
    # None.lower() after the (expensive) user data load.
    if target_field_name is None:
        raise ValueError("target_field_name is required: pass a field name or 'all'")

    # A set gives O(1) membership tests in the loop below.
    field_names = set()
    # The context manager closes the file even if reading fails.
    with open(csrec_paths.get_features_dir()+'relevant_features', 'rb') as f:
        for line in f:
            line = re.sub(r'\s', '', line)
            if len(line)>1:
                field_names.add(line)

    ensure_user_data_loaded()
    # NOTE(review): user 1346062 is hard-coded as the record from which each
    # field's 'field_type' is read -- presumably a known sample user; confirm.
    # NOTE(review): the results are collected but never returned, as in the
    # original; presumably get_histograms_from_values displays them -- confirm.
    histograms = {}
    if target_field_name.lower() == 'all':
        for field_name, possible_values in ALL_VALUES.iteritems():
            if field_name in field_names:
                histograms[field_name] = get_histograms_from_values(
                    USER_DATA[1346062][field_name]['field_type'],
                    field_name, possible_values, num_buckets)
    else:
        possible_values = ALL_VALUES[target_field_name]
        get_histograms_from_values(
            USER_DATA[1346062][target_field_name]['field_type'],
            target_field_name, possible_values, num_buckets)
Esempio n. 2
0
 def init_field_names(self):
     """Load the relevant feature names into self.field_names.

     Reads the 'relevant_features' file from the features directory, strips
     all whitespace from every line and keeps the non-empty results. Also
     sets self.num_fields to the number of names kept.
     """
     path = csrec_paths.get_features_dir()+'relevant_features'
     # The context manager closes the file even if reading fails; the
     # original left the handle open.
     with open(path, 'rb') as f:
         stripped_lines = (re.sub(r'\s', '', line) for line in f)
         self.field_names = [name for name in stripped_lines if name]
     self.num_fields = len(self.field_names)
Esempio n. 3
0
def ensure_user_data_loaded():
    """Load the sampled user data into the module globals on first use.

    When USER_DATA is falsy, unpickles 'sampled_user_data.pkl' from the
    features directory into USER_DATA and recomputes ALL_VALUES with
    find_all_values_of_cols. Does nothing when USER_DATA is already populated.
    """
    global USER_DATA, ALL_VALUES
    if USER_DATA:
        return

    user_data_pkl_name = 'sampled_user_data.pkl'
    print('loading user data...')
    # Close the pickle file deterministically instead of leaking the handle.
    with open(csrec_paths.get_features_dir()+user_data_pkl_name, 'rb') as pkl_file:
        USER_DATA = cPickle.load(pkl_file)

    print('data for %s users loaded' % (len(USER_DATA)))
    ALL_VALUES = find_all_values_of_cols(USER_DATA)
Esempio n. 4
0
 def __init__(self, testing=False):
     """Set up field names, dimensions and counters.

     When `testing` is true, the sampled user pickle path is stored in
     self.user_pklfile and loaded through load_user_features_pkl() before
     anything else is initialised.
     """
     if testing:
         self.user_pklfile = csrec_paths.get_features_dir() + 'sampled_user_data.pkl'
         self.load_user_features_pkl()

     self.init_field_names()
     self.init_dimensions()

     # All running counters start at zero.
     self.num_users_not_found = self.num_users_total = self.total_num_fields = 0
     self.testing = testing
     self.outer_product_getter = OuterProducGetter(self.dimension)
Esempio n. 5
0
def generate_bucket_dividers(user_data_pkl_name='sampled_user_data.pkl',
                             divider_output_filename='bucket_dividers.pkl'):
    """Compute bucket dividers for every field and persist them.

    For each field in ALL_VALUES, dividers are computed by
    get_dividers_from_values using the field's entry in NUM_DIVIDERS, or
    DEFAULT_NUM_DIVIDERS when the field has none. The resulting dict is
    pickled into the features directory and also stored in the module-level
    DIVIDERS.

    Args:
        user_data_pkl_name: Not used by this function
            (ensure_user_data_loaded takes no arguments); kept so existing
            callers keep working.
        divider_output_filename: File name, inside the features directory,
            that the dividers are pickled to.
    """
    global DIVIDERS
    ensure_user_data_loaded()

    bucket_dividers = {}
    for field_name, possible_values in ALL_VALUES.iteritems():
        num_buckets = NUM_DIVIDERS.get(field_name, DEFAULT_NUM_DIVIDERS)
        bucket_dividers[field_name] = get_dividers_from_values(possible_values, num_buckets)

    # Close the output file explicitly so the pickle is flushed to disk
    # before returning, rather than whenever the handle is collected.
    output_path = csrec_paths.get_features_dir()+divider_output_filename
    with open(output_path, 'wb') as out_file:
        cPickle.dump(bucket_dividers, out_file)

    DIVIDERS = bucket_dividers
Esempio n. 6
0
import cPickle
import numpy as np
import math
import csrec_paths
import pprint
import optparse
import feature_processor
import re
from collections import Counter

# Per-field bucket counts; DEFAULT_NUM_DIVIDERS is the fallback value.
NUM_DIVIDERS = {'age': 5}
DEFAULT_NUM_DIVIDERS = 10
# Loaded eagerly at import time. The context manager closes the pickle file
# instead of leaking the handle.
with open(csrec_paths.get_features_dir()+'bucket_dividers.pkl', 'rb') as _dividers_file:
    DIVIDERS = cPickle.load(_dividers_file)
# Both are filled in lazily by ensure_user_data_loaded().
USER_DATA = None
ALL_VALUES = None

def ensure_user_data_loaded():
    """Load the sampled user data into the module globals on first use.

    When USER_DATA is falsy, unpickles 'sampled_user_data.pkl' from the
    features directory into USER_DATA and recomputes ALL_VALUES with
    find_all_values_of_cols. Does nothing when USER_DATA is already populated.
    """
    global USER_DATA, ALL_VALUES
    if USER_DATA:
        return

    user_data_pkl_name = 'sampled_user_data.pkl'
    print('loading user data...')
    # Close the pickle file deterministically instead of leaking the handle.
    with open(csrec_paths.get_features_dir()+user_data_pkl_name, 'rb') as pkl_file:
        USER_DATA = cPickle.load(pkl_file)

    print('data for %s users loaded' % (len(USER_DATA)))
    ALL_VALUES = find_all_values_of_cols(USER_DATA)

def generate_bucket_dividers(user_data_pkl_name='sampled_user_data.pkl',
                             divider_output_filename='bucket_dividers.pkl'):
    """Ensure the user data is loaded before bucket dividers are generated.

    NOTE(review): this excerpt appears to be truncated. Only the call to
    ensure_user_data_loaded() is visible here, and neither parameter is used
    in the visible body -- confirm against the full definition.
    """
    ensure_user_data_loaded()
Esempio n. 7
0
import csrec_paths
import cPickle

# Load the merged interest dictionary once, at import time.
pkl_path = csrec_paths.get_features_dir()+'interests/interest_extraction/merged_interest_dct.pkl'
# The context manager closes the pickle file instead of leaking the handle.
with open(pkl_path, 'rb') as _pkl_file:
    cached_profiles_dct = cPickle.load(_pkl_file)