def probYearGivenWeightedNames(wnames, r):
    '''
    Compute p(year|names) when every name carries its own weight.

    wnames is a sequence of (name, weight) pairs:
        [(name, weight), (name, weight), ...]
    r is a two-element range (first year, last year), inclusive on both
    ends; it is passed on to probYearGivenName for every name.

    The weights are first divided by their total. Each name's distribution
    is then scaled by its weight share, the scaled distributions are added
    together, and the result is scaled by the inverse of its own total
    before being returned.
    '''
    assert len(r) == 2
    assert r[0] <= r[1]

    names, raw_weights = zip(*wnames)

    # Turn the raw weights into shares of the total weight.
    weight_total = float(sum(raw_weights))
    shares = [w / weight_total for w in raw_weights]

    # Accumulate the weighted per-name distributions.
    accumulated = None
    for name, share in zip(names, shares):
        scaled = vector_scalar_product(
            probYearGivenName(name, r[0], r[1]), share)
        if accumulated is None:
            accumulated = scaled
        else:
            accumulated = vector_sum(accumulated, scaled)

    # Scale by the inverse of the total so the entries sum to one.
    return vector_scalar_product(accumulated, 1 / float(sum(accumulated)))
def probYearGivenNames(names, r):
    '''
    Given a set of names (first names), return p(year|names), i.e. the
    probability that a random user from the set was born in each year.

    names is an iterable of first names. r is a two-element range
    (first year, last year), inclusive on both ends. The returned list has
    one entry per year from r[0] through r[1].

    The per-name distributions from probYearGivenName are added together
    and the total is scaled so that its entries sum to one. If names is
    empty, or every distribution is all zeros, an all-zero list is returned
    because there is nothing to normalize.
    '''
    assert len(r) == 2  # r must be a tuple or a list of length 2
    assert r[0] <= r[1]

    prob_year_sum = None
    for name in names:
        prob_year = probYearGivenName(name, r[0], r[1])
        if prob_year_sum is None:
            prob_year_sum = prob_year
        else:
            prob_year_sum = vector_sum(prob_year_sum, prob_year)

    if prob_year_sum is None:
        # No names at all: one zero per year in the inclusive range.
        return [0.0] * (r[1] - r[0] + 1)

    total = float(sum(prob_year_sum))
    if total == 0:
        # Nothing to normalize; return a copy so the caller never shares a
        # list with whatever probYearGivenName handed back.
        return list(prob_year_sum)

    # Normalize the accumulated array so that its entries sum to one.
    return vector_scalar_product(prob_year_sum, 1 / total)
def run_allName():
    '''
    Add up the birth-year distributions of the 1000 most frequent names.

    Queries the babyname table for the 1000 names with the largest total
    count, looks up babyname.probYearGivenName for the first word of each
    name over 1950-2000 (inclusive), adds the distributions element-wise
    into one bin per year, and prints the resulting list.

    Side effect: creates (or truncates) debug.txt in the current directory.
    Nothing is written to it.

    Raises RuntimeError if a returned distribution does not have one entry
    per year of the range; the two lengths are printed first.
    '''
    first_year, last_year = 1950, 2000  # inclusive on both ends

    # NOTE(review): the database credentials are hard-coded in the source;
    # they should come from configuration instead.
    con = mdb.connect('localhost', 'yongjoo', 'Fgla4Zp0', 'yongjoo')
    with con:
        cur = con.cursor()
        sql = '''SELECT name, SUM(num) AS total
                 FROM babyname
                 GROUP BY name
                 ORDER BY total DESC
                 LIMIT 1000
              '''
        cur.execute(sql)

        bins = [0] * (last_year - first_year + 1)

        # The file is only created/truncated, never written to; the with
        # block guarantees it is closed even when an error is raised below.
        with open('debug.txt', 'w'):
            for row in cur.fetchall():
                firstname = row[0].split(' ')[0]
                prob_year = babyname.probYearGivenName(
                    firstname, first_year, last_year)

                # Explicit check rather than assert, so the validation is
                # still performed when Python runs with -O.
                if len(bins) != len(prob_year):
                    print("%d %d" % (len(bins), len(prob_year)))
                    raise RuntimeError(
                        "distribution length %d does not match %d bins"
                        % (len(prob_year), len(bins)))

                if sum(prob_year) != 0:
                    bins = [b + p for b, p in zip(bins, prob_year)]

        print(bins)
def probYearGivenDomainInYear(start_year, end_year):
    '''
    Compute y(year) = avg_i p(year|name_i), where name_i ranges over the
    names recorded between start_year (inclusive) and end_year (exclusive)
    that also appear in the popularname table.

    The average is weighted: each name's weight is the sum of its counts
    over that period divided by the sum over all selected names. The
    per-name distributions come from probYearGivenName over the fixed
    inclusive range 2012-50 through 2012-11.

    Returns the weighted sum of the distributions, scaled so that its
    entries sum to one.

    Raises AssertionError if start_year >= end_year or if the query returns
    no names, and ZeroDivisionError if the total count or the total of the
    combined distribution is zero.
    '''
    assert start_year < end_year
    return_range = (2012 - 50, 2012 - 11)  # this is inclusive

    # The year bounds are passed as query parameters instead of being
    # formatted into the SQL text. SQL BETWEEN is inclusive, hence the -1.
    q = '''SELECT b.name AS name, sum(b.num) AS count
           FROM babyname b INNER JOIN popularname p ON b.name = p.name
           WHERE year BETWEEN %s AND %s
           GROUP BY name
           ORDER BY count
        '''

    name_counts = {}  # key: name, value: total count as a float

    # NOTE(review): the database credentials are hard-coded in the source;
    # they should come from configuration instead.
    con = mdb.connect('localhost', 'yongjoo', 'Fgla4Zp0', 'yongjoo')
    with con:
        cur = con.cursor()
        cur.execute(q, (start_year, end_year - 1))
        for row in cur.fetchall():
            # float() keeps the weight division below a true division even
            # if the driver returns integer counts (Python 2 would floor
            # them to 0).
            name_counts[row[0]] = float(row[1])

    assert len(name_counts) > 0  # sanity check

    # The weights are the counts divided by their total, so they sum to one.
    total_count = sum(name_counts.values())

    # Weighted sum of the per-name distributions.
    sum_prob_year = None
    for name, count in name_counts.items():
        prob_year = probYearGivenName(name, return_range[0], return_range[1])
        if sum_prob_year is None:
            # Size the accumulator from the first distribution returned.
            sum_prob_year = [0] * len(prob_year)
        weighted = vector_scalar_product(prob_year, count / total_count)
        sum_prob_year = vector_sum(sum_prob_year, weighted)

    # No averaging is needed because the weights sum to one. The array is
    # still scaled so that its entries sum to one.
    factor = 1 / float(sum(sum_prob_year))
    sum_prob_year = vector_scalar_product(sum_prob_year, factor)
    return sum_prob_year
prob_year_sum[age] = [] skip_count[age] = 0 # Start to collect prob for every age. for line in datain: tweet = json.loads(line) name = tweet['user']['name'].encode('ascii', 'ignore') firstname = name.split(' ')[0] age = int(tweet['user']['age']) if not age in given_ages: continue; start_year = 2012 - 50 end_year = 2012 - 11 prob_year = probYearGivenName(firstname, start_year, end_year) if sum(prob_year) == 0: skip_count[age] += 1 continue prob_year_sum[age].append(prob_year) for age in sorted(prob_year_sum.keys()): print "For %d, collected %d number of users. Skipped %d" % (age, len(prob_year_sum[age]), skip_count[age]) # Average the prob array def elementWiseSum(x, y): assert(len(x) == len(y))