Ejemplo n.º 1
0
def probYearGivenNames(names, r=(2012 - 50, 2012 - 11)):
    """
    Return p(year|names) for an unweighted collection of first names.

    The per-name distributions returned by probYearGivenName are summed
    element-wise, then the sum is rescaled so that its entries add up to 1.
    Every name therefore contributes equally to the result.

    Args:
        names: iterable of first names. It may be empty.
        r: two-element sequence (first_year, last_year). Both ends are
            inclusive.

    Returns:
        A vector with one entry per year from r[0] through r[1]. When
        'names' is empty a list of zeros of that length is returned. When
        the summed probabilities total 0 the vector is returned as is,
        since it cannot be normalized.
    """
    assert len(r) == 2  # r must be a tuple or a list of length 2
    assert r[0] <= r[1]

    prob_year_sum = None

    for name in names:
        prob_year = probYearGivenName(name, r[0], r[1])
        if prob_year_sum is None:
            prob_year_sum = prob_year
        else:
            prob_year_sum = vector_sum(prob_year_sum, prob_year)

    if prob_year_sum is None:
        # No name was seen, so there is nothing to sum. Return an all-zero
        # vector covering the inclusive range instead of failing on
        # sum(None) below.
        return [0.0] * (r[1] - r[0] + 1)

    # Finally, we need to normalize the prob array to ensure that the sum is 1.
    sum_prob = sum(prob_year_sum)
    if sum_prob != 0:
        factor = 1 / float(sum_prob)
        prob_year_sum = vector_scalar_product(prob_year_sum, factor)

    return prob_year_sum
Ejemplo n.º 2
0
def probYearGivenWeightedNames(wnames, r):
    """
    Return p(year|names) for a collection of names with associated weights.

    Each name's distribution (from probYearGivenName) is scaled by the
    name's share of the total weight, the scaled distributions are summed
    element-wise, and the result is rescaled so that it adds up to 1.

    Args:
        wnames: iterable of (name, weight) pairs, i.e.
            [(name, weight), (name, weight), ...]. It may be empty.
        r: two-element sequence (first_year, last_year). Both ends are
            inclusive.

    Returns:
        A vector with one entry per year from r[0] through r[1]. When
        'wnames' is empty or the weights add up to 0, a list of zeros of
        that length is returned. When the summed probabilities total 0 the
        vector is returned as is, since it cannot be normalized.
    """
    assert len(r) == 2
    assert r[0] <= r[1]

    # Materialize once so that one-shot iterables can be traversed twice.
    pairs = list(wnames)

    total_weight = float(sum(weight for _, weight in pairs))
    if total_weight == 0:
        # Either there are no names or none of them carries any weight.
        # The weights cannot be normalized, so return an all-zero vector
        # covering the inclusive range.
        return [0.0] * (r[1] - r[0] + 1)

    # calculation part
    sum_prob_year = None

    for name, weight in pairs:
        prob_year = probYearGivenName(name, r[0], r[1])
        # Scale by the normalized weight of this name.
        prob_year = vector_scalar_product(prob_year, weight / total_weight)
        if sum_prob_year is None:
            sum_prob_year = prob_year
        else:
            sum_prob_year = vector_sum(sum_prob_year, prob_year)

    # Finally, we need to normalize the prob array to ensure that the sum is 1.
    sum_prob = sum(sum_prob_year)
    if sum_prob != 0:
        factor = 1 / float(sum_prob)
        sum_prob_year = vector_scalar_product(sum_prob_year, factor)

    return sum_prob_year
Ejemplo n.º 3
0
def probYearGivenDomainInYear(start_year, end_year):
    """
    Return the weighted average of p(year|name_i) over popular names.

    name_i ranges over the names returned by the query below: names present
    in both the babyname and popularname tables with records between
    start_year (inclusive) and end_year (exclusive). The weight of a name is
    its summed count over that period divided by the summed count of all
    selected names, so this is a weighted rather than a simple average.

    Args:
        start_year: first year of the period, inclusive.
        end_year: end of the period, exclusive. Must be greater than
            start_year.

    Returns:
        A vector with one entry per year of the fixed inclusive range
        (2012 - 50, 2012 - 11), rescaled so that it adds up to 1. When the
        weighted sum totals 0 it is returned as is, since it cannot be
        normalized.

    Raises:
        AssertionError: if start_year >= end_year, or if the query selects
            no names.
    """
    assert start_year < end_year

    return_range = (2012 - 50, 2012 - 11)  # this is inclusive

    q = """SELECT b.name AS name, sum(b.num) AS count
           FROM babyname b
           INNER JOIN popularname p
           ON b.name = p.name
           WHERE year BETWEEN %s AND %s
           GROUP BY name
           ORDER BY count
           """ % (
        start_year,
        end_year - 1,
    )  # sql between is inclusive

    name_weights = {}  # key: name, value: weight

    con = db.con()
    with con:
        cur = con.cursor()
        cur.execute(q)
        # fetchall() does not depend on the driver reporting a row count.
        for row in cur.fetchall():
            # Convert to float up front: the driver may hand back an int or
            # a Decimal for SUM(), and the normalization below must be a
            # true division producing plain floats.
            name_weights[row[0]] = float(row[1])

    # Normalize the weights for the sake of calculation safety.
    sum_name_weights = sum(name_weights.values())
    for n in list(name_weights):
        name_weights[n] /= sum_name_weights

    # Get prob distribution for every name
    name_prob_year = {}
    for n in name_weights:
        name_prob_year[n] = probYearGivenName(
            n, return_range[0], return_range[1]
        )

    # Get weighted average of them

    assert len(name_prob_year) > 0  # sanity check

    # Any entry gives the vector length; take the first one rather than a
    # random one.
    vector_length = len(next(iter(name_prob_year.values())))
    sum_prob_year = [0] * vector_length
    for n, prob_year in name_prob_year.items():
        weighted = vector_scalar_product(prob_year, name_weights[n])
        sum_prob_year = vector_sum(sum_prob_year, weighted)

    # We don't need to average since the summation of the weights is one.

    # However, we need to normalize the prob array to ensure that the sum is 1.
    sum_prob = sum(sum_prob_year)
    if sum_prob != 0:
        factor = 1 / float(sum_prob)
        sum_prob_year = vector_scalar_product(sum_prob_year, factor)

    return sum_prob_year