Exemple #1
0
def seeAnnotated():
    """Print how many valid annotated users fall into each age group.

    Reads ``../data/sampled2.8-annotated-engfiltered.csv``.  For every row the
    age at index 4 is mapped to a group with ``friends.getGroup``.  Every
    group encountered gets an entry in the tally (starting at 0); the entry
    is incremented only when the field at index 5 equals ``'1'``.  The final
    dict ``{age_group: count}`` is printed.

    Rows whose age makes ``friends.getGroup`` raise ``RuntimeError`` are
    reported by printing the offending age and are then skipped.
    """
    datadir = '../data'
    csvfile = 'sampled2.8-annotated-engfiltered.csv'

    age_group_count = {}

    # The with-block guarantees the file is closed even if a row fails to
    # parse.  'rb' is kept from the original (Python 2 csv convention).
    with open(os.path.join(datadir, csvfile), 'rb') as handle:
        for row in csv.reader(handle):
            age = int(row[4])
            isvalid = row[5]

            try:
                age_group = friends.getGroup(age)
            except RuntimeError:
                # Skip the row: falling through here would tally it under
                # the previous row's group (or hit an unbound name on the
                # very first row).
                print(age)
                continue

            age_group_count.setdefault(age_group, 0)

            if isvalid == '1':
                age_group_count[age_group] += 1

    print(age_group_count)
        def _fromCSV():
            """Collect ``(user_id, age, meta)`` tuples from the annotated CSV.

            Reads ``csvfile`` (a name taken from the enclosing scope).  A row
            is kept when all of the following hold:

            * ``friends.getGroup`` accepts the age at index 4 (rows for which
              it raises ``RuntimeError`` are skipped),
            * the field at index 5 equals ``'1'``,
            * ``loadText`` reports a non-zero length for the user, and
            * ``count_following`` returns at least 10.

            Returns:
                A list of ``(user_id, age, meta)`` where ``user_id`` is the
                raw string from index 0, ``age`` is an int and ``meta`` is
                ``{'statuses_count': length}``.
            """
            user_ids_age = []

            # read in the data file; the with-block closes it when done
            with open(csvfile, 'rb') as handle:
                for row in csv.reader(handle):
                    user_id = row[0]
                    age = int(row[4])

                    try:
                        # Called only for its RuntimeError; the group itself
                        # is not needed here.
                        friends.getGroup(age)
                    except RuntimeError:
                        continue

                    is_valid = row[5]
                    # NOTE(review): one connector is reached through ``bs.``
                    # and the other as a bare name -- confirm both resolve
                    # to the same class.
                    c_following = bs.DBConnector().count_following(user_id)
                    _text, length = DBConnector().loadText(user_id)

                    if length != 0 and is_valid == '1' and c_following >= 10:
                        # this means public user (per the original author)
                        meta = {'statuses_count': length}

                        user_ids_age.append((user_id, age, meta))

            return user_ids_age
        def _fromJson():
            """Collect ``(user_id, age_group)`` pairs from the tweets JSON file.

            Reads ``filename`` (a name taken from the enclosing scope), which
            must hold a JSON array of tweet objects, each carrying
            ``tweet['user']['id']`` and ``tweet['user']['age']``.  A user is
            kept when ``DBConnector().loadText(user_id)`` is not equal to the
            empty string.

            Returns:
                A list of ``(user_id, age_group)`` where ``age_group`` is the
                result of ``friends.getGroup`` for the user's age.
            """
            user_ids_group = []

            # Close the file as soon as it has been parsed.
            with open(filename) as handle:
                tweets = json.loads(handle.read())

            for tweet in tweets:
                user = tweet['user']
                user_id = user['id']
                age_group = friends.getGroup(int(user['age']))
                # NOTE(review): elsewhere loadText is unpacked as
                # ``(text, length)``; if it returns a tuple this comparison
                # is always true -- confirm the return type.
                text = DBConnector().loadText(user_id)

                if not text == "":
                    user_ids_group.append((user_id, age_group))

            return user_ids_group
Exemple #4
0
def probYearForAllGroups():
    """
    We define four age groups for each of which we obtain the probability
    distribution of years given the weighted names born in that duration.

    Returns a list of ``(group_id, prob_year)`` tuples, one per entry of
    ``friends.getDefinedGroups()``.  ``group_id`` is what
    ``friends.getGroup`` returns for an age drawn from the entry's
    ``[entry[0], entry[1]]`` range, and ``prob_year`` is
    ``probYearGivenAgeRange(entry)``.
    """
    groups = friends.getDefinedGroups()

    # Look up each group's id by sampling an age inside its range
    # (presumably any age in the range maps to the same group -- confirm).
    # The original called its lambda with an undefined name ``g`` instead of
    # the loop variable, which raised NameError.
    ids = [friends.getGroup(random.randint(g[0], g[1])) for g in groups]

    return [
        (group_id, probYearGivenAgeRange(group))
        for group_id, group in zip(ids, groups)
    ]
Exemple #5
0
def seeStatusCounts():
    """Print the average statuses count of valid users per age group.

    Builds a ``user_id -> statuses_count`` lookup from
    ``../data/ageEmbededTweets-Jun20-sampled2.5.json``, then walks
    ``../data/sampled2.5-annotated.csv``.  For every row whose field at
    index 5 equals ``'1'``, the user's statuses count is filed under the
    age group ``friends.getGroup`` returns for the age at index 4.  Each
    group's counts are then averaged with ``np.average`` and the resulting
    dict is printed.

    Raises:
        KeyError: if a valid annotated user id is absent from the JSON file.
    """
    datadir = '../data'
    filename = 'ageEmbededTweets-Jun20-sampled2.5.json'
    csvfile = 'sampled2.5-annotated.csv'

    # Close the JSON file as soon as it has been parsed.
    with open(os.path.join(datadir, filename)) as handle:
        tweets = json.loads(handle.read())

    # create a status count dic; key: user_id, value: count
    statuses_count = {}
    for tweet in tweets:
        user = tweet['user']
        statuses_count[user['id']] = user['statuses_count']

    # in the annotated file: gather the counts of valid users per age group
    statuses_length = {}
    with open(os.path.join(datadir, csvfile), 'rb') as handle:
        for row in csv.reader(handle):
            user_id = int(row[0])
            age = int(row[4])
            isvalid = row[5]
            age_group = friends.getGroup(age)

            if isvalid == '1':
                statuses_length.setdefault(age_group, []).append(
                    statuses_count[user_id])

    # average the length; only existing keys are reassigned, so the dict
    # does not change size while it is being iterated
    for label, count_arr in statuses_length.items():
        statuses_length[label] = np.average(np.array(count_arr))

    print(statuses_length)
 def convertSingle(t):
     ''' Map t = (age, prediction, meta) to (true group, prediction, meta). '''
     # Only the age is translated (via friends.getGroup); the other two
     # fields are passed through untouched.
     years, predicted, extra = t
     return (friends.getGroup(years), predicted, extra)