def calc_probs(self):
    """Fill ``self.probs`` with name-given-age-and-gender probabilities.

    ``self.probs`` has shape ``[101, 2, 2]``:

    * axis 0 -- age bin (filled from ``ans.distribute_probs``),
    * axis 1 -- gender: 0 uses the ``'boys'`` table, 1 the ``'girls'`` table,
    * axis 2 -- 1 is the probability for the person's name, 0 its complement.

    Reads ``self.answer`` (the name given, or ``None``) and two pickle files
    under ``config.pathToData``.
    """
    # Allocate the result up front with its final shape: age, gender(M,F),
    # for and not for the person's name.
    self.probs = np.zeros([101, 2, 2])

    # NOTE(review): pickle executes arbitrary code on load; these data files
    # must only ever come from a trusted location.
    with open(config.pathToData + "names_us.p", "rb") as names_file:
        nameps = pickle.load(names_file)
    years = nameps['years']
    ages = [2016 - y for y in years]  # todo use current year

    if self.answer is None:
        # this won't be found and the default prior will be used instead
        ans_given = 'None'
    else:
        ans_given = self.answer

    with open(config.pathToData + "contractions.p", "rb") as contractions_file:
        contractions = pickle.load(contractions_file)

    # Contractions are keyed by the upper-cased name.
    if ans_given.upper() in contractions:
        possible_name_list = contractions[ans_given.upper()]
        print(ans_given)
    else:
        possible_name_list = [ans_given]

    # in future could search/integrate over.
    # TODO: make names start with initial caps
    nameused = possible_name_list[0]
    print("(using name %s)" % nameused)

    # Names missing from a table get a tiny constant prior for every year.
    # todo: what if their name isn't in the list?
    if nameused in nameps['boys']:
        p_male = nameps['boys'][nameused]
    else:
        p_male = np.ones(len(years)) * 0.00000001
    if nameused in nameps['girls']:
        p_female = nameps['girls'][nameused]
    else:
        p_female = np.ones(len(years)) * 0.00000001

    # Reverse the per-year series (dropping the entry at index 0) so that
    # they run in the opposite order to ``years``.
    p_male = p_male[-1:0:-1]
    p_female = p_female[-1:0:-1]
    ages = ages[-1:0:-1]

    # Repeat the final probability and close the age axis with a far boundary.
    p_male = np.hstack([p_male, p_male[-1]])
    p_female = np.hstack([p_female, p_female[-1]])
    ages.append(1001)  # add last boundary

    p_male = ans.distribute_probs(p_male, ages)
    p_female = ans.distribute_probs(p_female, ages)

    self.probs[:, 0, 1] = p_male
    self.probs[:, 0, 0] = 1 - p_male
    self.probs[:, 1, 1] = p_female
    self.probs[:, 1, 0] = 1 - p_female

    logging.info('***************************************')
    logging.info(self.probs)
    logging.info('***************************************')
def calc_probs(self):
    """Fill ``self.probs`` with p(seen | age, gender) for ``self.movie``.

    ``self.probs`` has shape ``[101, 2, 2]``: age, gender (0 = "M",
    1 = "F"), and seen (index 1) or not seen (index 0).

    Counts come from the ``users`` and ``ratings`` tables reached through
    ``MovieAnswer._movielens``; ``ans.distribute_probs`` maps the per-age
    values onto the 101 age bins.
    """
    from datetime import datetime

    self.probs = np.zeros([101, 2, 2])  # age, gender, seen or not seen

    # Map each distinct age in the dataset to a column index.
    # Maybe could do all this with some outer joins, but couldn't get them
    # working.
    c_ages = MovieAnswer._movielens.execute("SELECT DISTINCT(age) FROM users;")
    ages = {}
    ages_list = []
    for i, r in enumerate(c_ages):
        ages[r[0]] = i
        ages_list.append(r[0])

    # the movielens timestamps are between 26 Apr 2000 and 28 Feb 2003
    # (average: 27 Sep 2001), so the people are now older by this difference.
    # The shift is computed once, before the gender loop, so that both
    # genders use the same (singly shifted) ages.
    currentYear = datetime.now().year
    ageDiff = currentYear - 2001
    shifted_ages = np.array(ages_list) + ageDiff

    for genderi, gender in enumerate(["M", "F"]):
        nSeen = np.zeros(len(ages))
        nTotal = np.zeros(len(ages))

        # find p(seen,age,gender)
        c_movie = MovieAnswer._movielens.execute(
            "SELECT users.age,count(*) FROM users JOIN ratings ON users.user=ratings.user WHERE ratings.movie=? AND users.gender=? GROUP BY users.age ORDER BY users.age;",
            (self.movie, gender),
        )
        for r in c_movie:
            nSeen[ages[r[0]]] = r[1]

        # find p(age,gender); parameters must be a sequence, hence the
        # one-element tuple.
        c_all = MovieAnswer._movielens.execute(
            "SELECT users.age,count(*) FROM users WHERE users.gender=? GROUP BY users.age ORDER BY users.age;",
            (gender,),
        )
        for r in c_all:
            nTotal[ages[r[0]]] = r[1]

        # NOTE(review): an age with no users of this gender leaves nTotal at
        # zero and yields nan below -- confirm whether that can occur.
        # p(s|age,gender) = p(s,age,gender)/p(age,gender)
        pSeen = 1.0 * nSeen / nTotal
        # p(not s|age,gender) = [p(age,gender)-p(s,age,gender)]/p(age,gender)
        pNotSeen = 1.0 * (nTotal - nSeen) / nTotal

        dSeen = ans.distribute_probs(pSeen, shifted_ages[1:])
        dNotSeen = ans.distribute_probs(pNotSeen, shifted_ages[1:])
        self.probs[:, genderi, 0] = dNotSeen
        self.probs[:, genderi, 1] = dSeen
def calc_probs(self):
    """Compute p(seen | age, gender) for ``self.movie`` into ``self.probs``.

    The result array has shape ``[101, 2, 2]``, indexed by age, gender
    (0 = 'M', 1 = 'F') and outcome (0 = not seen, 1 = seen).

    User and rating counts are read through ``MovieAnswer._movielens`` and
    the per-age ratios are handed to ``ans.distribute_probs`` to produce the
    101 age bins.
    """
    from datetime import datetime

    self.probs = np.zeros([101, 2, 2])  #age, gender, seen or not seen

    #Maybe could do all this with some outer joins, but couldn't get them working.
    age_cursor = MovieAnswer._movielens.execute(
        "SELECT DISTINCT(age) FROM users;")
    ages = {}  #age value -> column index
    ages_list = []
    for i, r in enumerate(age_cursor):
        ages[r[0]] = i
        ages_list.append(r[0])

    #the movielens timestamps are between 26 Apr 2000 and 28 Feb 2003
    #(average: 27 Sep 2001); the people are now older by this age difference.
    #Shift the ages exactly once, outside the gender loop, so the offset is
    #not re-applied for the second gender.
    ageDiff = datetime.now().year - 2001
    current_ages = np.array(ages_list) + ageDiff

    for genderi, gender in enumerate(['M', 'F']):
        nSeen = np.zeros(len(ages))
        nTotal = np.zeros(len(ages))

        #find p(seen,age,gender)
        c_movie = MovieAnswer._movielens.execute(
            "SELECT users.age,count(*) FROM users JOIN ratings ON users.user=ratings.user WHERE ratings.movie=? AND users.gender=? GROUP BY users.age ORDER BY users.age;",
            (self.movie, gender))
        for r in c_movie:
            nSeen[ages[r[0]]] = r[1]

        #find p(age,gender); pass the parameter as a real one-element tuple
        c_all = MovieAnswer._movielens.execute(
            "SELECT users.age,count(*) FROM users WHERE users.gender=? GROUP BY users.age ORDER BY users.age;",
            (gender,))
        for r in c_all:
            nTotal[ages[r[0]]] = r[1]

        #NOTE(review): nTotal stays zero for an age with no users of this
        #gender, which makes the ratios below nan -- confirm this cannot occur.
        #p(s|age,gender) = p(s,age,gender)/p(age,gender)
        pSeen = 1. * nSeen / nTotal
        #p(not s|age,gender) = [p(age,gender)-p(s,age,gender)]/p(age,gender)
        pNotSeen = 1. * (nTotal - nSeen) / nTotal

        self.probs[:, genderi, 0] = ans.distribute_probs(
            pNotSeen, current_ages[1:])
        self.probs[:, genderi, 1] = ans.distribute_probs(
            pSeen, current_ages[1:])
def calc_probs_age(self, facts):
    """Compute p(block group | age) for each candidate block group.

    Sets three attributes:

    * ``self.localAgeDists`` -- one age distribution per block group
      returned by ``self.get_list_of_bgs(facts)``,
    * ``self.nationalAgeDist`` -- the distribution fetched for the
      all-``None`` block group (the whole of the US),
    * ``self.age_probs`` -- array of shape ``[101, n_block_groups, 2]``
      whose last axis holds ``1 - p`` (index 0) and ``p`` (index 1).

    We want p(postcode|age), which we assume is equal to p(output area|age).
    With
        n  = number of people in output area
        N  = number of people
        na = number of people of age a in output area
        Na = number of people of age a
    Bayes' rule gives
        p(output area|age) = p(age|output area) x p(output area) / p(age)
    and the three terms on the right are
        p(age|output area) = na/n
        p(output area)     = n/N
        p(age)             = Na/N
    so substituting in: na/n x n/N / (Na/N) = (na/N) / (Na/N) = na/Na,
    i.e. localAgeDist / nationalAgeDist.
    """
    logging.info("Starting to calculate age probs")
    bgs = self.get_list_of_bgs(facts)
    bgs.append([None, None, None, None])  #Last BG is whole of US

    # Fetch every distribution in its own thread. Each worker is handed a
    # one-element list; its first element is read back after the join.
    threadData = []
    threads = []
    for bg in bgs:
        data = [0]
        threadData.append(data)
        t = Thread(target=USCensusAnswer.getAgeDist, args=(bg, data))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()

    # The final slot belongs to the national entry appended above.
    localAgeDists = np.array([td[0] for td in threadData[:-1]])
    nationalAgeDist = np.array(threadData[-1][0])
    self.localAgeDists = localAgeDists
    self.nationalAgeDist = nationalAgeDist
    logging.info("Calculated.")

    self.age_probs = np.zeros([101, len(localAgeDists), 2])
    for i, dist in enumerate(localAgeDists):
        # The small constant keeps the local/national ratio strictly positive.
        p = (0.0001 + dist) / nationalAgeDist
        p = np.sum(p[0], 0)
        #spread over our standard age distribution
        p = ans.distribute_probs(p, USCensusAnswer._age_range)
        self.age_probs[:, i, 0] = 1 - p
        self.age_probs[:, i, 1] = p
def calc_probs_age(self,facts):
    """Compute p(block group | age) for every candidate block group.

    Stores the per-block-group age distributions in ``self.localAgeDists``,
    the distribution for the all-``None`` block group (the whole of the US)
    in ``self.nationalAgeDist``, and the resulting probabilities in
    ``self.age_probs`` (shape ``[101, n_block_groups, 2]``; last axis is
    ``1 - p`` at index 0 and ``p`` at index 1).

    We want p(postcode|age), which we assume is equal to p(output area|age).
    If
        n  = number of people in output area
        N  = number of people
        na = number of people of age a in output area
        Na = number of people of age a
    then
        p(output area|age) = p(age|output area) x p(output area) / p(age)
    where
        p(age|output area) = na/n
        p(output area)     = n/N
        p(age)             = Na/N
    Substituting in: na/n x n/N / (Na/N) = (na/N) / (Na/N) = na/Na,
    so the answer is localAgeDist/nationalAgeDist.
    """
    bgs = self.get_list_of_bgs(facts)
    bgs.append([None,None,None,None]) #Last BG is whole of US

    # One worker thread per block group; every worker receives its own
    # one-element list, whose first element is collected once it has joined.
    threadData = [[0] for _ in bgs]
    threads = []
    for bg, data in zip(bgs, threadData):
        t = Thread(target=USCensusAnswer.getAgeDist,args=(bg,data))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()

    # The national entry was appended last, so it occupies the final slot.
    localAgeDists = np.array([td[0] for td in threadData[:-1]])
    nationalAgeDist = np.array(threadData[-1][0])
    self.localAgeDists = localAgeDists
    self.nationalAgeDist = nationalAgeDist

    self.age_probs = np.zeros([101,len(localAgeDists),2])
    for i,dist in enumerate(localAgeDists):
        # The small constant keeps the local/national ratio strictly positive.
        p = (0.0001+dist)/nationalAgeDist
        p = np.sum(p[0],0)
        p = ans.distribute_probs(p,USCensusAnswer._age_range) #spread over our standard age distribution
        self.age_probs[:,i,0] = 1-p
        self.age_probs[:,i,1] = p
def insights(self, inference_result, facts):
    """Build human-readable insights about the person's US census area.

    Args:
        inference_result: not used by this method.
        facts: mapping of known facts; the optional ``'age'`` entry is used
            to phrase the population-age statement relative to the person.

    Returns:
        A dict of insight name -> value. It is empty when
        ``self.prob_in_us(facts)`` is below 0.01. Otherwise it may contain
        ``uscensus_genderratio``, ``uscensus_popage``,
        ``uscensus_languages``, ``uscensus_language_list``,
        ``uscensus_debug_languages``, ``uscensus_birthplace``,
        ``uscensus_birthplace_list`` and ``uscensus_households_list``.

    Reads ``self.localAgeDists``. Block groups returned by
    ``self.get_list_of_bgs`` are modified in place when their resolution is
    reduced.
    """
    if self.prob_in_us(facts) < 0.01:
        return {}  #we're not in the us
    insights = {}

    # Weighted sum of the local age distributions; going by the messages
    # below, row 0 holds the counts for men and row 1 those for women.
    ages = np.zeros([2, 23])
    area_ratios = self.get_list_of_bg_probs(facts)
    for i, ratio in enumerate(area_ratios):
        ages = ages + self.localAgeDists[i, 0, :] * ratio
    ages_combined = np.sum(ages, 0)
    logging.info(ages)

    #originally divided by the sum, but important to consider absolute
    #values as the relative values can show spurious results with small
    #numbers of people.
    gender_bias = 1.0 * (ages[0, :] - ages[1, :])
    logging.info('gender bias')
    logging.info(gender_bias)

    if np.min(gender_bias) < -50:
        idx = np.argmin(gender_bias)
        # Skip the ratio when there are no men in the band: dividing by
        # zero gives inf, which cannot be formatted with %d.
        if ages[0, idx] > 0:
            prop = 1.0 * (ages[1, idx] / ages[0, idx])
            if prop > 1.1:  #otherwise it's unremarkable
                odd_age = USCensusAnswer._age_range[idx]
                insights['uscensus_genderratio'] = (
                    'There are %d%% more women than men aged %d to %d living in your area.'
                    % (round((prop - 1) * 100), odd_age, odd_age + 5))

    if np.max(gender_bias) > 50:
        idx = np.argmax(gender_bias)
        # Same guard for an age band with no women.
        if ages[1, idx] > 0:
            prop = 1.0 * (ages[0, idx] / ages[1, idx])
            if prop > 1.1:  #otherwise it's unremarkable
                odd_age = USCensusAnswer._age_range[idx]  #fix odd_age + 5...
                insights['uscensus_genderratio'] = (
                    'There are %d%% more men than women aged %d to %d living in your area.'
                    % (round((prop - 1) * 100), odd_age, odd_age + 5))

    d = ans.distribute_probs(ages_combined, USCensusAnswer._age_range,
                             spread=True)

    #TODO This code below is duplicated from uk_census, need to move it to a fn.
    if 'age' in facts:
        #if we know the person's age we'll give the stat in proportion to them...
        age = facts['age']
        prop_younger = 1.0 * np.sum(d[0:age]) / np.sum(d)
        if prop_younger > 0.5:
            popage = ("%d%% of people in your area are younger than you."
                      % round(prop_younger * 100))
        else:
            popage = ("%d%% of people in your area are older than you."
                      % round((1 - prop_younger) * 100))
    else:
        #otherwise we'll give it wrt 'half'
        halfway = np.sum(np.cumsum(d) <= np.sum(d) / 2)
        if halfway < 40:
            popage = 'Half the people in your neighbourhood are younger than %d years old.' % halfway
        else:
            popage = 'Half the people in your neighbourhood are older than %d years old.' % halfway
    insights['uscensus_popage'] = popage

    #Get all languages
    bgs = self.get_list_of_bgs(facts)
    logging.info('GET ALL LANGUAGES')
    #TODO!!! THIS MAKES NO SENSE! WE'RE ONLY GETTING A RESULT FOR THE LAST BG!
    # Every iteration writes the same insight keys, so the values that
    # survive are those of the last block group in the list.
    for bg in bgs:
        logging.info(str(bg))
        if bg[3] is not None:
            #we need to reduce the resolution and recompute
            bg[2] = [bg[2]]
            bg[3] = None

        results, geolocs = USCensusAnswer.USCensusApiQuery(
            bg, USCensusAnswer.language_codes)
        lang_counts = results[0]
        active_languages = [
            USCensusAnswer.languages_text[i]
            for i in np.nonzero(np.array(lang_counts))[0]
        ]
        # "a, b and c" for several languages; a lone language is used as is
        # (previously a single language produced an empty list in the text).
        if len(active_languages) > 1:
            langaugestring = (', '.join(active_languages[0:-1]) + ' and ' +
                              active_languages[-1])
        else:
            langaugestring = ''.join(active_languages)
        insights['uscensus_languages'] = (
            "Languages spoken in your area include " + langaugestring)
        insights['uscensus_language_list'] = lang_counts
        insights['uscensus_debug_languages'] = json.dumps(results)

        #Birthplaces....
        results, geolocs = USCensusAnswer.USCensusApiQuery(
            bg, USCensusAnswer.birthplace_codes)
        birthplace_counts = results[0]
        birthplace_percs = np.round(100.0 * np.array(birthplace_counts) /
                                    np.sum(birthplace_counts))
        insights['uscensus_birthplace'] = (
            "%d%% of people in your neighbourhood were born in your state."
            % birthplace_percs[0])
        insights['uscensus_birthplace_list'] = birthplace_counts

        #households....
        results, geolocs = USCensusAnswer.USCensusApiQuery(
            bg, USCensusAnswer.households_codes)
        households_counts = results[0]
        insights['uscensus_households_list'] = households_counts

    return insights
def insights(self,inference_result,facts):
    """Produce human-readable insights about the person's US census area.

    Args:
        inference_result: not used by this method.
        facts: mapping of known facts; if it has an ``'age'`` entry the
            population-age statement is phrased relative to that age.

    Returns:
        A dict mapping insight names to values; empty when
        ``self.prob_in_us(facts)`` is below 0.01. Possible keys are
        ``uscensus_genderratio``, ``uscensus_popage``,
        ``uscensus_languages``, ``uscensus_language_list``,
        ``uscensus_debug_languages``, ``uscensus_birthplace``,
        ``uscensus_birthplace_list`` and ``uscensus_households_list``.

    Reads ``self.localAgeDists``; block groups obtained from
    ``self.get_list_of_bgs`` are altered in place when their resolution is
    reduced.
    """
    if self.prob_in_us(facts)<0.01:
        return {} #we're not in the us
    found = {}

    # Weighted sum of the local age distributions. Judging by the messages
    # below, row 0 is the count of men and row 1 the count of women.
    ages = np.zeros([2,23])
    for i,ratio in enumerate(self.get_list_of_bg_probs(facts)):
        ages = ages + self.localAgeDists[i,0,:] * ratio
    ages_combined = np.sum(ages,0)
    logging.info(ages)

    #originally divided by the sum, but important to consider absolute values
    #as the relative values can show spurious results with small numbers of people.
    gender_bias = 1.0*(ages[0,:]-ages[1,:])
    logging.info('gender bias')
    logging.info(gender_bias)

    if np.min(gender_bias)<-50:
        idx = np.argmin(gender_bias)
        men, women = ages[0,idx], ages[1,idx]
        # No men in the band would make the ratio inf, which %d cannot
        # format, so the statement is skipped in that case.
        if men>0:
            prop = 1.0*(women/men)
            if prop>1.1: #otherwise it's unremarkable
                odd_age = USCensusAnswer._age_range[idx]
                found['uscensus_genderratio'] = 'There are %d%% more women than men aged %d to %d living in your area.' % (round((prop-1)*100), odd_age, odd_age+5)

    if np.max(gender_bias)>50:
        idx = np.argmax(gender_bias)
        men, women = ages[0,idx], ages[1,idx]
        # Same guard for a band with no women.
        if women>0:
            prop = 1.0*(men/women)
            if prop>1.1: #otherwise it's unremarkable
                odd_age = USCensusAnswer._age_range[idx] #fix odd_age + 5...
                found['uscensus_genderratio'] = 'There are %d%% more men than women aged %d to %d living in your area.' % (round((prop-1)*100), odd_age, odd_age+5)

    d = ans.distribute_probs(ages_combined,USCensusAnswer._age_range,spread=True)

    #TODO This code below is duplicated from uk_census, need to move it to a fn.
    if 'age' in facts:
        #if we know the person's age we'll give the stat in proportion to them...
        age = facts['age']
        prop_younger = 1.0*np.sum(d[0:age])/np.sum(d)
        if prop_younger>0.5:
            popage = "%d%% of people in your area are younger than you." % round(prop_younger*100)
        else:
            popage = "%d%% of people in your area are older than you." % round((1-prop_younger)*100)
    else:
        #otherwise we'll give it wrt 'half'
        halfway = np.sum(np.cumsum(d)<=np.sum(d)/2)
        if halfway<40:
            popage = 'Half the people in your neighbourhood are younger than %d years old.' % halfway
        else:
            popage = 'Half the people in your neighbourhood are older than %d years old.' % halfway
    found['uscensus_popage'] = popage

    #Get all languages
    bgs = self.get_list_of_bgs(facts)
    logging.info('GET ALL LANGUAGES')
    #TODO!!! THIS MAKES NO SENSE! WE'RE ONLY GETTING A RESULT FOR THE LAST BG!
    # Each pass overwrites the same keys, so only the values computed for
    # the final block group are left in the returned dict.
    for bg in bgs:
        logging.info(str(bg))
        if bg[3] is not None:
            #we need to reduce the resolution and recompute
            bg[2] = [bg[2]]
            bg[3] = None

        results, geolocs = USCensusAnswer.USCensusApiQuery(bg,USCensusAnswer.language_codes)
        lang_counts = results[0]
        active_languages = [USCensusAnswer.languages_text[i] for i in np.nonzero(np.array(lang_counts))[0]]
        # Several languages read "a, b and c"; exactly one is used verbatim
        # (it used to be dropped, leaving the sentence without any language).
        if len(active_languages)>1:
            langaugestring = ', '.join(active_languages[0:-1]) + ' and ' + active_languages[-1]
        else:
            langaugestring = ''.join(active_languages)
        found['uscensus_languages'] = "Languages spoken in your area include " + langaugestring
        found['uscensus_language_list'] = lang_counts
        found['uscensus_debug_languages'] = json.dumps(results)

        #Birthplaces....
        results, geolocs = USCensusAnswer.USCensusApiQuery(bg,USCensusAnswer.birthplace_codes)
        birthplace_counts = results[0]
        birthplace_percs = np.round(100.0 * np.array(birthplace_counts) / np.sum(birthplace_counts))
        found['uscensus_birthplace'] = "%d%% of people in your neighbourhood were born in your state." % birthplace_percs[0]
        found['uscensus_birthplace_list'] = birthplace_counts

        #households....
        results, geolocs = USCensusAnswer.USCensusApiQuery(bg,USCensusAnswer.households_codes)
        households_counts = results[0]
        found['uscensus_households_list'] = households_counts

    return found