def seeAnnotated():
    """Print how many valid rows each age group has in the annotated CSV.

    Reads ``../data/sampled2.8-annotated-engfiltered.csv``. For every row,
    column 4 is parsed as the age and column 5 is the validity flag; a row
    counts as valid when that flag is the string ``'1'``.

    Every age group that appears in the file gets an entry in the printed
    dict, even when none of its rows are valid (the count is then 0).

    Rows whose age is rejected by ``friends.getGroup`` (it raises
    ``RuntimeError``) are reported by printing the age and then skipped.
    """
    datadir = '../data'
    csvfile = 'sampled2.8-annotated-engfiltered.csv'

    age_group_count = {}
    # 'rb' matches the Python 2 csv module's expectation of a binary file.
    with open(os.path.join(datadir, csvfile), 'rb') as handle:
        for row in csv.reader(handle):
            age = int(row[4])
            isvalid = row[5]
            try:
                age_group = friends.getGroup(age)
            except RuntimeError:
                # Without skipping here, age_group would be unbound (first
                # row) or left over from the previous row, so the row would
                # either crash the loop or be counted under the wrong group.
                print(age)
                continue

            # Register the group even if this row turns out to be invalid.
            age_group_count.setdefault(age_group, 0)
            if isvalid == '1':
                age_group_count[age_group] += 1

    print(age_group_count)
def _fromCSV():
    """Collect ``(user_id, age, meta)`` tuples for usable annotated users.

    Reads the CSV named by ``csvfile``, which is not defined in this
    function and must be supplied by the surrounding scope. Column 0 is the
    user id (kept as a string), column 4 the age and column 5 the validity
    flag.

    A row is kept when all of the following hold:
      * ``friends.getGroup(age)`` does not raise ``RuntimeError``;
      * the length reported by ``DBConnector().loadText`` is non-zero;
      * the validity flag is ``'1'``;
      * ``bs.DBConnector().count_following`` reports at least 10.

    Returns:
        list of ``(user_id, age, {'statuses_count': length})`` tuples.
    """
    user_ids_age = []

    # 'rb' matches the Python 2 csv module's expectation of a binary file.
    with open(csvfile, 'rb') as handle:
        for row in csv.reader(handle):
            user_id = row[0]
            age = int(row[4])
            try:
                # Called only as a validity check: ages that cannot be
                # mapped to a group are skipped; the group is not needed.
                friends.getGroup(age)
            except RuntimeError:
                continue

            is_valid = row[5]
            c_following = bs.DBConnector().count_following(user_id)
            # loadText is unpacked as (text, length); only length is used.
            _text, length = DBConnector().loadText(user_id)

            # The original comment states this combination means the user
            # is public.
            if length != 0 and is_valid == '1' and c_following >= 10:
                meta = {'statuses_count': length}
                user_ids_age.append((user_id, age, meta))

    return user_ids_age
def _fromJson():
    """Collect ``(user_id, age_group)`` pairs from a JSON tweet dump.

    Reads the JSON file named by ``filename``, which is not defined in this
    function and must be supplied by the surrounding scope. Each decoded
    entry is indexed as ``tweet['user']['id']`` and ``tweet['user']['age']``.

    A pair is appended when the value returned by ``DBConnector().loadText``
    for that user is not equal to the empty string.

    Returns:
        list of ``(user_id, age_group)`` tuples, where ``age_group`` is the
        result of ``friends.getGroup(age)``.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(filename) as handle:
        tweets = json.load(handle)

    user_ids_group = []
    for tweet in tweets:
        user = tweet['user']
        user_id = user['id']
        age_group = friends.getGroup(int(user['age']))

        text = DBConnector().loadText(user_id)
        # NOTE(review): _fromCSV unpacks loadText() as (text, length). If it
        # returns a tuple here as well, this comparison is always true and
        # every user is kept -- confirm the intended check.
        if not text == "":
            user_ids_group.append((user_id, age_group))

    return user_ids_group
def probYearForAllGroups():
    """Compute the year distribution for every defined age group.

    For each group returned by ``friends.getDefinedGroups()`` the
    probability distribution of years is obtained from
    ``probYearGivenAgeRange(group)``, given the weighted names born in that
    duration.

    Returns:
        list of ``(group_id, prob_year)`` tuples, one per defined group, in
        the order the groups are returned by ``friends.getDefinedGroups()``.
    """
    results = []
    for group in friends.getDefinedGroups():
        # The group id is looked up by asking friends.getGroup for the group
        # of an age drawn between group[0] and group[1] (inclusive bounds of
        # random.randint). The original referenced an undefined name `g`
        # here instead of the loop variable.
        group_id = friends.getGroup(random.randint(group[0], group[1]))
        results.append((group_id, probYearGivenAgeRange(group)))
    return results
def seeStatusCounts():
    """Print the average ``statuses_count`` of valid users per age group.

    Two files under ``../data`` are combined:
      * ``ageEmbededTweets-Jun20-sampled2.5.json`` supplies
        ``tweet['user']['statuses_count']`` keyed by ``tweet['user']['id']``;
      * ``sampled2.5-annotated.csv`` supplies the user id (column 0), age
        (column 4) and validity flag (column 5).

    Only rows whose validity flag is ``'1'`` contribute. A valid user id
    that is absent from the JSON file raises ``KeyError``.
    """
    datadir = '../data'
    filename = 'ageEmbededTweets-Jun20-sampled2.5.json'
    csvfile = 'sampled2.5-annotated.csv'

    with open(os.path.join(datadir, filename)) as handle:
        tweets = json.load(handle)

    # Status-count lookup; key: user_id, value: count. A later tweet from
    # the same user overwrites the earlier value.
    statuses_count = {}
    for tweet in tweets:
        user = tweet['user']
        statuses_count[user['id']] = user['statuses_count']

    # Gather the counts of valid annotated users, grouped by age group.
    statuses_length = {}
    # 'rb' matches the Python 2 csv module's expectation of a binary file.
    with open(os.path.join(datadir, csvfile), 'rb') as handle:
        for row in csv.reader(handle):
            user_id = int(row[0])
            age = int(row[4])
            age_group = friends.getGroup(age)
            if row[5] == '1':
                statuses_length.setdefault(age_group, []).append(
                    statuses_count[user_id])

    # Replace each list of counts with its average, in place. Snapshot the
    # items first so the loop does not depend on a live dict view.
    for label, count_arr in list(statuses_length.items()):
        statuses_length[label] = np.average(np.array(count_arr))

    print(statuses_length)
def convertSingle(t):
    """Replace the age in a result triple with its age group.

    ``t`` is unpacked as ``(age, prediction, meta)``. The returned tuple is
    ``(true, prediction, meta)``, where ``true`` is ``friends.getGroup(age)``
    and the other two elements are passed through untouched.
    """
    (years, predicted, extra) = t
    return (friends.getGroup(years), predicted, extra)