Beispiel #1
0
    def calc_probs(self):
        """Compute the name-given-age-and-gender table and store it in ``self.probs``.

        Loads the US name statistics (``names_us.p``) and the contraction
        table (``contractions.p``) from ``config.pathToData``.  If
        ``self.answer`` is a known contraction it is expanded and the first
        expansion is used.  The per-year series for that name (boys and
        girls separately) is handed to ``ans.distribute_probs`` together with
        the matching age boundaries.

        Side effects:
            Sets ``self.probs`` to a ``[101, 2, 2]`` array indexed as
            ``[age, gender (0 = male, 1 = female), 0 = not this name /
            1 = this name]``, prints the name being used and logs the table.
        """
        # NOTE(review): pickle can execute arbitrary code while loading; these
        # files must only ever come from a trusted data directory.
        with open(config.pathToData + "names_us.p", "rb") as names_file:
            nameps = pickle.load(names_file)
        with open(config.pathToData + "contractions.p", "rb") as contractions_file:
            contractions = pickle.load(contractions_file)

        years = nameps['years']
        ages = [2016 - y for y in years]  # TODO: use the current year instead of 2016

        # A missing answer becomes the string 'None'; it is not expected to be
        # found in the tables, so the tiny default prior below is used instead.
        ans_given = 'None' if self.answer is None else self.answer

        if ans_given.upper() in contractions:
            possible_name_list = contractions[ans_given.upper()]
            print(ans_given)
        else:
            possible_name_list = [ans_given]

        # In future this could search/integrate over all candidates.
        # TODO: make names start with initial caps.
        nameused = possible_name_list[0]
        print("(using name %s)" % nameused)

        # Names missing from a table get a very small, flat probability.
        # TODO: decide what should really happen for unlisted names.
        if nameused in nameps['boys']:
            p_male = nameps['boys'][nameused]
        else:
            p_male = np.ones(len(years)) * 0.00000001
        if nameused in nameps['girls']:
            p_female = nameps['girls'][nameused]
        else:
            p_female = np.ones(len(years)) * 0.00000001

        # Reverse each series; the slice stops before index 0, so the entry
        # for the first listed year is dropped from all three sequences.
        p_male = p_male[-1:0:-1]
        p_female = p_female[-1:0:-1]
        ages = ages[-1:0:-1]
        # Repeat the final probability and close the age list with 1001 so
        # that the last bin has an upper boundary.
        p_male = np.hstack([p_male, p_male[-1]])
        p_female = np.hstack([p_female, p_female[-1]])
        ages.append(1001)  # add last boundary

        p_male = ans.distribute_probs(p_male, ages)
        p_female = ans.distribute_probs(p_female, ages)

        # age, gender (M, F), not-this-name / this-name
        self.probs = np.zeros([101, 2, 2])
        self.probs[:, 0, 1] = p_male
        self.probs[:, 0, 0] = 1 - p_male
        self.probs[:, 1, 1] = p_female
        self.probs[:, 1, 0] = 1 - p_female

        logging.info('***************************************')
        logging.info(self.probs)
        logging.info('***************************************')
Beispiel #2
0
    def calc_probs(self):
        """Fill ``self.probs`` with p(seen / not seen | age, gender) for ``self.movie``.

        Counts, per age value and gender in the MovieLens database, how many
        users rated ``self.movie`` and how many users exist in total, turns
        the counts into conditional probabilities and passes them to
        ``ans.distribute_probs`` together with the age values shifted to the
        present day.

        Side effects:
            Sets ``self.probs`` to a ``[101, 2, 2]`` array indexed as
            ``[age, gender (0 = "M", 1 = "F"), 0 = not seen / 1 = seen]``.
        """
        from datetime import datetime

        self.probs = np.zeros([101, 2, 2])  # age, gender, seen or not seen

        # Map every distinct age value to a column index.  ORDER BY makes the
        # order explicit: the values are later used as ordered boundaries and
        # SQL guarantees no row order without it.
        # (Maybe all this could be done with outer joins, but they could not
        # be made to work.)
        ages = {}
        ages_list = []
        c_ages = MovieAnswer._movielens.execute(
            "SELECT DISTINCT(age) FROM users ORDER BY age;"
        )
        for i, r in enumerate(c_ages):
            ages[r[0]] = i
            ages_list.append(r[0])

        # The movielens timestamps are between 26 Apr 2000 and 28 Feb 2003
        # (average: 27 Sep 2001), so the people are now older by this
        # difference.  The shift is applied exactly once, before the gender
        # loop, so both genders use the same boundaries.
        ageDiff = datetime.now().year - 2001
        shifted_ages = np.array(ages_list) + ageDiff
        boundaries = shifted_ages[1:]

        for genderi, gender in enumerate(["M", "F"]):
            nSeen = np.zeros(len(ages))
            nTotal = np.zeros(len(ages))

            c_movie = MovieAnswer._movielens.execute(
                "SELECT users.age,count(*) FROM users JOIN ratings ON users.user=ratings.user WHERE ratings.movie=? AND users.gender=? GROUP BY users.age ORDER BY users.age;",
                (self.movie, gender),
            )
            for r in c_movie:
                nSeen[ages[r[0]]] = r[1]  # find p(seen,age,gender)

            c_all = MovieAnswer._movielens.execute(
                "SELECT users.age,count(*) FROM users WHERE users.gender=? GROUP BY users.age ORDER BY users.age;",
                (gender,),  # must be a 1-tuple, not a bare string
            )
            for r in c_all:
                nTotal[ages[r[0]]] = r[1]  # find p(age,gender)

            # p(s|age,gender) = p(s,age,gender)/p(age,gender)
            # NOTE(review): an age with no users of this gender leaves a zero
            # in nTotal and produces nan here.
            pSeen = nSeen / nTotal
            # p(not s|age,gender) = [p(age,gender)-p(s,age,gender)]/p(age,gender)
            pNotSeen = (nTotal - nSeen) / nTotal

            self.probs[:, genderi, 0] = ans.distribute_probs(pNotSeen, boundaries)
            self.probs[:, genderi, 1] = ans.distribute_probs(pSeen, boundaries)
Beispiel #3
0
    def calc_probs(self):
        """Compute p(seen / not seen | age, gender) for ``self.movie``.

        For each gender the MovieLens database is queried for the number of
        users per age value who rated ``self.movie`` and for the total number
        of users per age value.  The resulting conditional probabilities are
        passed to ``ans.distribute_probs`` along with the age values moved
        forward to the current year.

        Side effects:
            Sets ``self.probs`` to a ``[101, 2, 2]`` array indexed as
            ``[age, gender (0 = 'M', 1 = 'F'), 0 = not seen / 1 = seen]``.
        """
        from datetime import datetime

        self.probs = np.zeros([101, 2, 2])  #age, gender, seen or not seen

        #Column index for every distinct age value. The ORDER BY pins the row
        #order, which SQL leaves unspecified otherwise; the values are used
        #as ordered boundaries further down.
        #(Maybe could do all this with some outer joins, but couldn't get
        #them working.)
        ages = {}
        ages_list = []
        c_ages = MovieAnswer._movielens.execute(
            "SELECT DISTINCT(age) FROM users ORDER BY age;")
        for i, r in enumerate(c_ages):
            ages[r[0]] = i
            ages_list.append(r[0])

        #the movielens timestamps are between 26 Apr 2000 and 28 Feb 2003,
        #average: 27 Sep 2001, so the people are now older by this age
        #difference. Shift once here: doing it inside the gender loop would
        #shift the second gender twice.
        currentYear = datetime.now().year
        ageDiff = currentYear - 2001
        current_ages = np.array(ages_list) + ageDiff
        age_bounds = current_ages[1:]

        for genderi, gender in enumerate(['M', 'F']):
            nSeen = np.zeros(len(ages))
            nTotal = np.zeros(len(ages))

            c_movie = MovieAnswer._movielens.execute(
                "SELECT users.age,count(*) FROM users JOIN ratings ON users.user=ratings.user WHERE ratings.movie=? AND users.gender=? GROUP BY users.age ORDER BY users.age;",
                (self.movie, gender))
            for r in c_movie:
                nSeen[ages[r[0]]] = r[1]  #find p(seen,age,gender)

            #the parameter must be a 1-tuple; a bare string only happens to
            #bind because it is one character long
            c_all = MovieAnswer._movielens.execute(
                "SELECT users.age,count(*) FROM users WHERE users.gender=? GROUP BY users.age ORDER BY users.age;",
                (gender, ))
            for r in c_all:
                nTotal[ages[r[0]]] = r[1]  #find p(age,gender)

            #p(s|age,gender) = p(s,age,gender)/p(age,gender)
            #NOTE(review): a zero left in nTotal yields nan here.
            pSeen = nSeen / nTotal
            #p(not s|age,gender) = [p(age,gender)-p(s,age,gender)]/p(age,gender)
            pNotSeen = (nTotal - nSeen) / nTotal

            dSeen = ans.distribute_probs(pSeen, age_bounds)
            dNotSeen = ans.distribute_probs(pNotSeen, age_bounds)
            self.probs[:, genderi, 0] = dNotSeen
            self.probs[:, genderi, 1] = dSeen
Beispiel #4
0
    def calc_probs_age(self, facts):
        """Compute per-block-group age likelihoods and store them on ``self``.

        Fetches one age distribution per block group returned by
        ``self.get_list_of_bgs(facts)``, plus one for the whole of the US,
        each in its own thread via ``USCensusAnswer.getAgeDist``.

        Args:
            facts: passed through unchanged to ``self.get_list_of_bgs``.

        Side effects:
            Sets ``self.localAgeDists``, ``self.nationalAgeDist`` and
            ``self.age_probs`` (shape ``[101, number of block groups, 2]``,
            last axis ``0 = 1 - p`` / ``1 = p``).
        """
        logging.info("Starting to calculate age probs")
        # Work on a copy so the list handed back by get_list_of_bgs is not
        # altered (it may be shared with other callers - not visible here).
        bgs = list(self.get_list_of_bgs(facts))
        bgs.append([None, None, None, None])  #Last BG is whole of US

        threadData = []
        threads = []
        for bg in bgs:
            data = [0]  #one-slot holder handed to getAgeDist; read back after join
            threadData.append(data)
            t = Thread(target=USCensusAnswer.getAgeDist, args=(bg, data))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        #every entry but the last is local; the last one is the national one
        self.localAgeDists = np.array([td[0] for td in threadData[:-1]])
        self.nationalAgeDist = np.array(threadData[-1][0])
        logging.info("Calculated.")

        #we want p(postcode|age), which we assume is equal to p(output area|age)
        #if n = number of people in output area
        #   N = number of people
        #   na = number of people of age a in output area
        #   Na = number of people of age a
        #
        #p(output area|age) = p(age|output area) x p(output area) / p(age)
        #
        #we can write the three terms on the right as:
        #
        #p(age|output area) = na/n
        #p(output area) = n/N
        #p(age) = Na/N
        #
        #substituting in... na/n x n/N / (Na/N) = (na/N) / (Na/N) = na/Na
        #so localAgeDist/nationalAgeDist

        self.age_probs = np.zeros([101, len(self.localAgeDists), 2])

        for i, dist in enumerate(self.localAgeDists):
            #the small constant keeps the ratio non-zero where the local count is 0
            p = (0.0001 + dist) / self.nationalAgeDist
            p = np.sum(p[0], 0)  #presumably collapses the gender axis - TODO confirm
            p = ans.distribute_probs(
                p, USCensusAnswer._age_range
            )  #spread over our standard age distribution
            self.age_probs[:, i, 0] = 1 - p
            self.age_probs[:, i, 1] = p
Beispiel #5
0
    def calc_probs_age(self,facts):
        """Compute per-block-group age likelihoods and store them on ``self``.

        One age distribution is fetched per block group returned by
        ``self.get_list_of_bgs(facts)`` and one more for the whole of the US;
        each fetch runs ``USCensusAnswer.getAgeDist`` in its own thread.

        Args:
            facts: handed unchanged to ``self.get_list_of_bgs``.

        Side effects:
            Sets ``self.localAgeDists``, ``self.nationalAgeDist`` and
            ``self.age_probs`` (shape ``[101, number of block groups, 2]``,
            last axis ``0 = 1 - p`` / ``1 = p``).
        """
        #Copy first: the list returned by get_list_of_bgs is left untouched
        #(it may be shared with other callers - not visible from here).
        bgs = list(self.get_list_of_bgs(facts))
        bgs.append([None,None,None,None]) #Last BG is whole of US

        threadData = []
        threads = []
        for bg in bgs:
            data = [0] #one-slot holder given to getAgeDist; read back after join
            threadData.append(data)
            t = Thread(target=USCensusAnswer.getAgeDist,args=(bg,data))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        #all holders except the last are local ones; the last is the national one
        localAgeDists = np.array([td[0] for td in threadData[:-1]])
        nationalAgeDist = np.array(threadData[-1][0])
        self.localAgeDists = localAgeDists
        self.nationalAgeDist = nationalAgeDist

        #we want p(postcode|age), which we assume is equal to p(output area|age)
        #if n = number of people in output area
        #   N = number of people
        #   na = number of people of age a in output area
        #   Na = number of people of age a
        #
        #p(output area|age) = p(age|output area) x p(output area) / p(age)
        #
        #we can write the three terms on the right as:
        #
        #p(age|output area) = na/n
        #p(output area) = n/N
        #p(age) = Na/N
        #
        #substituting in... na/n x n/N / (Na/N) = (na/N) / (Na/N) = na/Na
        #so localAgeDist/nationalAgeDist

        self.age_probs = np.zeros([101,len(localAgeDists),2])

        for i,dist in enumerate(localAgeDists):
            #the small constant keeps the ratio non-zero where the local count is 0
            p = (0.0001+dist)/nationalAgeDist
            p = np.sum(p[0],0) #presumably collapses the gender axis - TODO confirm
            p = ans.distribute_probs(p,USCensusAnswer._age_range) #spread over our standard age distribution
            self.age_probs[:,i,0] = 1-p
            self.age_probs[:,i,1] = p
Beispiel #6
0
    def insights(self, inference_result, facts):
        """Build a dict of US-census insights about the person's area.

        Args:
            inference_result: not used by this method.
            facts: passed to ``prob_in_us``, ``get_list_of_bg_probs`` and
                ``get_list_of_bgs``; if it contains ``'age'`` that value is
                used as the person's age (as a slice index).

        Returns:
            ``{}`` when ``self.prob_in_us(facts)`` is below 0.01.  Otherwise
            a dict that may contain ``'uscensus_genderratio'``,
            ``'uscensus_popage'``, ``'uscensus_languages'``,
            ``'uscensus_language_list'``, ``'uscensus_debug_languages'``,
            ``'uscensus_birthplace'``, ``'uscensus_birthplace_list'`` and
            ``'uscensus_households_list'``.  If no block groups are available
            only the entries computed before the census queries are returned.
        """
        if self.prob_in_us(facts) < 0.01:
            return {}  #we're not in the us

        insights = {}

        #Weighted sum of the local age distributions; row 0 is men, row 1 is women.
        ages = np.zeros([2, 23])
        area_ratios = self.get_list_of_bg_probs(facts)
        for i, ratio in enumerate(area_ratios):
            ages = ages + self.localAgeDists[i, 0, :] * ratio
        ages_combined = np.sum(ages, 0)
        logging.info(ages)

        #originally divided by the sum, but important to consider absolute
        #values as the relative values can show spurious results with small
        #numbers of people.
        gender_bias = 1.0 * (ages[0, :] - ages[1, :])
        logging.info('gender bias')
        logging.info(gender_bias)
        #NOTE(review): the ratios below divide by a count that may be zero.
        if np.min(gender_bias) < -50:
            idx = np.argmin(gender_bias)
            prop = 1.0 * (ages[1, idx] / ages[0, idx])
            if prop > 1.1:  #otherwise it's unremarkable
                odd_age = USCensusAnswer._age_range[idx]
                insights['uscensus_genderratio'] = (
                    'There are %d%% more women than men aged %d to %d living in your area.'
                    % (round((prop - 1) * 100), odd_age, odd_age + 5))
        if np.max(gender_bias) > 50:
            idx = np.argmax(gender_bias)
            prop = 1.0 * (ages[0, idx] / ages[1, idx])
            if prop > 1.1:  #otherwise it's unremarkable
                odd_age = USCensusAnswer._age_range[idx]  #TODO: fix odd_age + 5...
                insights['uscensus_genderratio'] = (
                    'There are %d%% more men than women aged %d to %d living in your area.'
                    % (round((prop - 1) * 100), odd_age, odd_age + 5))

        d = ans.distribute_probs(ages_combined,
                                 USCensusAnswer._age_range,
                                 spread=True)
        #TODO This code below is duplicated from uk_census, need to move it to a fn.
        if 'age' in facts:
            #if we know the person's age we'll give the stat in proportion to them...
            age = facts['age']
            prop_younger = 1.0 * np.sum(d[0:age]) / np.sum(d)
            if prop_younger > 0.5:
                popage = "%d%% of people in your area are younger than you." % round(
                    prop_younger * 100)
            else:
                popage = "%d%% of people in your area are older than you." % round(
                    (1 - prop_younger) * 100)
        else:
            #otherwise we'll give it wrt 'half'
            halfway = np.sum(np.cumsum(d) <= np.sum(d) / 2)
            if halfway < 40:
                popage = 'Half the people in your neighbourhood are younger than %d years old.' % halfway
            else:
                popage = 'Half the people in your neighbourhood are older than %d years old.' % halfway
        insights['uscensus_popage'] = popage

        #Get all languages
        bgs = self.get_list_of_bgs(facts)
        logging.info('GET ALL LANGUAGES')
        if len(bgs) == 0:
            #Nothing to query: the census sections below need a block group.
            return insights
        #TODO: this only keeps the result for the LAST block group; the
        #birthplace and household queries below reuse that same last bg.
        for bg in bgs:
            logging.info(str(bg))
            if bg[3] is not None:
                #we need to reduce the resolution and recompute
                bg[2] = [bg[2]]
                bg[3] = None
            results, _ = USCensusAnswer.USCensusApiQuery(
                bg, USCensusAnswer.language_codes)
        lang_counts = results[0]
        active_languages = [
            USCensusAnswer.languages_text[i]
            for i in np.nonzero(np.array(lang_counts))[0]
        ]
        if len(active_languages) > 1:
            langaugestring = (', '.join(active_languages[:-1]) + ' and ' +
                              active_languages[-1])
        else:
            #a single language is named as-is; with none the list stays empty
            langaugestring = ''.join(active_languages)
        insights['uscensus_languages'] = (
            "Languages spoken in your area include " + langaugestring)
        insights['uscensus_language_list'] = lang_counts
        insights['uscensus_debug_languages'] = json.dumps(results)

        #Birthplaces....
        results, _ = USCensusAnswer.USCensusApiQuery(
            bg, USCensusAnswer.birthplace_codes)
        birthplace_counts = results[0]
        birthplace_percs = np.round(100.0 * np.array(birthplace_counts) /
                                    np.sum(birthplace_counts))
        insights['uscensus_birthplace'] = (
            "%d%% of people in your neighbourhood were born in your state."
            % birthplace_percs[0])
        insights['uscensus_birthplace_list'] = birthplace_counts

        #households....
        results, _ = USCensusAnswer.USCensusApiQuery(
            bg, USCensusAnswer.households_codes)
        insights['uscensus_households_list'] = results[0]

        return insights
Beispiel #7
0
    def insights(self,inference_result,facts):
        """Build a dict of US-census insights about the person's area.

        Args:
            inference_result: not used by this method.
            facts: passed to ``prob_in_us``, ``get_list_of_bg_probs`` and
                ``get_list_of_bgs``; if it contains ``'age'`` that value is
                used as the person's age (as a slice index).

        Returns:
            ``{}`` when ``self.prob_in_us(facts)`` is below 0.01.  Otherwise
            a dict that may contain ``'uscensus_genderratio'``,
            ``'uscensus_popage'``, ``'uscensus_languages'``,
            ``'uscensus_language_list'``, ``'uscensus_debug_languages'``,
            ``'uscensus_birthplace'``, ``'uscensus_birthplace_list'`` and
            ``'uscensus_households_list'``.  If no block groups are available
            only the entries computed before the census queries are returned.
        """
        if self.prob_in_us(facts)<0.01:
            return {} #we're not in the us

        insights = {}

        #Weighted sum of the local age distributions; row 0 is men, row 1 is women.
        ages = np.zeros([2,23])
        area_ratios = self.get_list_of_bg_probs(facts)
        for i,ratio in enumerate(area_ratios):
            ages = ages + self.localAgeDists[i,0,:] * ratio
        ages_combined = np.sum(ages,0)
        logging.info(ages)

        #originally divided by the sum, but important to consider absolute
        #values as the relative values can show spurious results with small
        #numbers of people.
        gender_bias = 1.0*(ages[0,:]-ages[1,:])
        logging.info('gender bias')
        logging.info(gender_bias)
        #NOTE(review): the ratios below divide by a count that may be zero.
        if np.min(gender_bias)<-50:
            idx = np.argmin(gender_bias)
            prop = 1.0*(ages[1,idx]/ages[0,idx])
            if prop>1.1: #otherwise it's unremarkable
                odd_age = USCensusAnswer._age_range[idx]
                insights['uscensus_genderratio'] = 'There are %d%% more women than men aged %d to %d living in your area.' % (round((prop-1)*100), odd_age, odd_age+5)
        if np.max(gender_bias)>50:
            idx = np.argmax(gender_bias)
            prop = 1.0*(ages[0,idx]/ages[1,idx])
            if prop>1.1: #otherwise it's unremarkable
                odd_age = USCensusAnswer._age_range[idx] #TODO: fix odd_age + 5...
                insights['uscensus_genderratio'] = 'There are %d%% more men than women aged %d to %d living in your area.' % (round((prop-1)*100), odd_age, odd_age+5)

        d = ans.distribute_probs(ages_combined,USCensusAnswer._age_range,spread=True)
        #TODO This code below is duplicated from uk_census, need to move it to a fn.
        if 'age' in facts:
            #if we know the person's age we'll give the stat in proportion to them...
            age = facts['age']
            prop_younger = 1.0*np.sum(d[0:age])/np.sum(d)
            if prop_younger>0.5:
                popage = "%d%% of people in your area are younger than you." % round(prop_younger*100)
            else:
                popage = "%d%% of people in your area are older than you." % round((1-prop_younger)*100)
        else:
            #otherwise we'll give it wrt 'half'
            halfway = np.sum(np.cumsum(d)<=np.sum(d)/2)
            if halfway<40:
                popage = 'Half the people in your neighbourhood are younger than %d years old.' % halfway
            else:
                popage = 'Half the people in your neighbourhood are older than %d years old.' % halfway
        insights['uscensus_popage'] = popage

        #Get all languages
        bgs = self.get_list_of_bgs(facts)
        logging.info('GET ALL LANGUAGES')
        if len(bgs)==0:
            #Nothing to query: the census sections below need a block group.
            return insights
        #TODO: this only keeps the result for the LAST block group; the
        #birthplace and household queries below reuse that same last bg.
        for bg in bgs:
            logging.info(str(bg))
            if bg[3] is not None: #we need to reduce the resolution and recompute
                bg[2] = [bg[2]]
                bg[3] = None
            results, _ = USCensusAnswer.USCensusApiQuery(bg,USCensusAnswer.language_codes)
        lang_counts = results[0]
        active_languages = [USCensusAnswer.languages_text[i] for i in np.nonzero(np.array(lang_counts))[0]]
        if len(active_languages)>1:
            langaugestring = ', '.join(active_languages[:-1]) + ' and ' + active_languages[-1]
        else:
            #a single language is named as-is; with none the list stays empty
            langaugestring = ''.join(active_languages)
        insights['uscensus_languages'] = "Languages spoken in your area include " + langaugestring
        insights['uscensus_language_list'] = lang_counts
        insights['uscensus_debug_languages'] = json.dumps(results)

        #Birthplaces....
        results, _ = USCensusAnswer.USCensusApiQuery(bg,USCensusAnswer.birthplace_codes)
        birthplace_counts = results[0]
        birthplace_percs = np.round(100.0 * np.array(birthplace_counts) / np.sum(birthplace_counts))
        insights['uscensus_birthplace'] = "%d%% of people in your neighbourhood were born in your state." % birthplace_percs[0]
        insights['uscensus_birthplace_list'] = birthplace_counts

        #households....
        results, _ = USCensusAnswer.USCensusApiQuery(bg,USCensusAnswer.households_codes)
        insights['uscensus_households_list'] = results[0]

        return insights