def ds_all_tweets(mode):
    """Export one CSV row per Twitter user covering all of that user's tweets.

    For every distinct ``user.id_str`` in the ``maif_db.tweets`` collection,
    looks up the user's age label and writes a single row (built by
    ``get_csv_row`` over the user's full tweet list) to the mode-specific
    output CSV.  Also records a per-user tweet-count distribution to a
    companion text file.

    Args:
        mode: One of TOKENS_MODE, EXT_FEATURES_MODE, TWEETS_MODE; selects
            both the row format (passed through to ``get_csv_row``) and the
            output path.

    Raises:
        ValueError: If ``mode`` is not one of the three known modes.
    """
    tweets_dist = {}
    count = 0

    mongo_client = helper.get_mongo_client()

    maif_db = mongo_client['maif_db']
    tweets_col = maif_db['tweets']
    age_labels_col = maif_db['age_labels']

    twitter_ids = tweets_col.distinct('user.id_str')

    # Map the requested mode to its output path.  Fail fast on an unknown
    # mode: the original code left all_tweets_path unbound here and crashed
    # later with an opaque UnboundLocalError at open().
    if mode == TOKENS_MODE:
        all_tweets_path = ALL_TWEETS_TOKENS_PATH
    elif mode == EXT_FEATURES_MODE:
        all_tweets_path = ALL_TWEETS_EXT_FEATURES_PATH
    elif mode == TWEETS_MODE:
        all_tweets_path = ALL_TWEETS_COMBINED_PATH
    else:
        raise ValueError('unknown mode: {}'.format(mode))

    with open(all_tweets_path, 'w', newline='', encoding='utf-8') as wf_csv:
        writer = csv.writer(wf_csv, delimiter=',')

        for twitter_id in twitter_ids:

            # NOTE(review): assumes every tweeting user has an age label;
            # a missing label raises TypeError here — confirm upstream.
            age = age_labels_col.find_one({'id_str': twitter_id})['age']

            tweet_obj_list = list(tweets_col.find({'user.id_str': twitter_id}))

            tweets_dist[twitter_id] = len(tweet_obj_list)

            if tweet_obj_list:
                writer.writerow(
                    get_csv_row(twitter_id, tweet_obj_list, 0,
                                len(tweet_obj_list) - 1, age, mode))
                count += 1

    print('count: {}'.format(count))

    # Companion distribution file; the separator-less concatenation
    # ('<csv path>dist.txt') matches the original naming scheme.
    with open(all_tweets_path + 'dist.txt', 'w') as wf:
        for k, v in tweets_dist.items():
            wf.write('{} {}\n'.format(k, v))
def insert_ages_in_db():
    """Bulk-load age labels from the Zhang ICWSM 2016 label file into Mongo.

    Each line of the label file is ``<id_str> <age> ...`` (space-separated);
    one ``{'id_str': ..., 'age': ...}`` document per line is inserted into
    the ``maif_db.age_labels`` collection.
    """
    mongo_client = helper.get_mongo_client()

    maif_db = mongo_client['maif_db']
    tweets_col = maif_db['tweets']
    age_labels_col = maif_db['age_labels']

    labels_path = (r'D:\Data\Linkage\Other Datasets\Age\Zhang_ICWSM_2016'
                   '\\ageLabels.txt')

    # Parse every line into a label document: first token is the Twitter
    # id, second is the age; any trailing tokens are ignored.
    with open(labels_path) as rf:
        age_label_objs = [
            {'id_str': fields[0], 'age': fields[1]}
            for fields in (line.strip().split(' ') for line in rf)
        ]

    age_labels_col.insert_many(age_label_objs)
Beispiel #3
0
def gen_ds_x_tweets():
    """Write a CSV of 50-tweet windows of original tweets per matched voter.

    For each (twitter_id, voter_serial) ground-truth pair, scans the user's
    non-retweet tweets with a sliding window: whenever a window of exactly
    50 tweets all posted within 365 days of each other is found, one CSV row
    (built by ``get_csv_row``) is emitted and the window restarts at the
    current tweet.
    """
    mongo_client = helper.get_mongo_client()

    twitter_db = mongo_client['twitter']
    tweets_col = twitter_db['tweets']
    voters_col = twitter_db['voters']
    ground_truths_col = twitter_db['ground_truths']

    # All hand-verified Twitter-account <-> voter-record matches.
    tuples = [(x['twitter_id'], x['voter_serial'])
              for x in ground_truths_col.find({})]

    with open(X_TWEETS_PATH, 'w', newline='', encoding='utf-8') as wf_csv:
        writer = csv.writer(wf_csv, delimiter=',')
        writer.writerow(CSV_HEADER)

        for twitter_id, voter_serial in tuples:

            voter = voters_col.find_one({'serial': voter_serial})
            voter['twitter_id'] = twitter_id

            # Original tweets only — retweets are excluded.
            tweet_obj_list = list(tweets_col.find(
                {'user.id_str': twitter_id,
                 'retweeted_status': {'$exists': False}}))

            begin_index = None
            begin_datetime = None

            for curr_index in range(len(tweet_obj_list)):

                # Fix: compare to the None singleton with `is`, not `==`.
                if begin_index is None:
                    begin_index = curr_index
                    begin_datetime = get_datetime(
                        tweet_obj_list[begin_index]['created_at'])
                else:
                    curr_datetime = get_datetime(
                        tweet_obj_list[curr_index]['created_at'])
                    # Advance the window start until it spans <= 365 days.
                    while date_difference_days(curr_datetime,
                                               begin_datetime) > 365:
                        begin_index += 1
                        begin_datetime = get_datetime(
                            tweet_obj_list[begin_index]['created_at'])
                    # 49 == window of exactly 50 tweets (inclusive indices).
                    if curr_index - begin_index == 49:
                        writer.writerow(
                            get_csv_row(voter, tweet_obj_list,
                                        begin_index, curr_index))
                        begin_index = curr_index
Beispiel #4
0
def gen_ds_yearly_tweets():
    """Write a CSV of year-sized batches of original tweets per matched voter.

    For each (twitter_id, voter_serial) ground-truth pair, walks the user's
    non-retweet tweets (sorted by tweet id ascending) BACKWARD from the
    newest tweet, partitioning them into consecutive groups that each span
    at most 365 days; one CSV row (built by ``get_csv_row``) is emitted per
    group.
    """
    mongo_client = helper.get_mongo_client()

    twitter_db = mongo_client['twitter']
    tweets_col = twitter_db['tweets']
    voters_col = twitter_db['voters']
    ground_truths_col = twitter_db['ground_truths']

    # All hand-verified Twitter-account <-> voter-record matches.
    tuples = [(x['twitter_id'], x['voter_serial']) for x in ground_truths_col.find({})]

    with open(YEARLY_TWEETS_PATH, 'w', newline='', encoding='utf-8') as wf_csv:
        writer = csv.writer(wf_csv, delimiter=',')
        writer.writerow(CSV_HEADER)

        for twitter_id, voter_serial in tuples:

            voter = voters_col.find_one({'serial': voter_serial})
            voter['twitter_id'] = twitter_id

            # Original tweets only, ordered oldest-to-newest by tweet id.
            tweet_objs = tweets_col.find({'user.id_str': twitter_id, 'retweeted_status': {'$exists': False}}).sort(
                [('id', 1)])
            tweet_obj_list = list(tweet_objs)

            # end_index marks the newest tweet of the current group;
            # curr_index scans backward looking for the group's oldest tweet.
            end_index = len(tweet_obj_list) - 1
            curr_index = end_index

            while curr_index >= 0:

                end_datetime = get_datetime(tweet_obj_list[end_index]['created_at'])
                curr_datetime = get_datetime(tweet_obj_list[curr_index]['created_at'])

                if date_difference_days(curr_datetime, end_datetime) <= 365:
                    # Still within a year of the group's newest tweet; if we
                    # hit the oldest tweet, flush the final group.
                    if curr_index == 0:
                        writer.writerow(get_csv_row(voter, tweet_obj_list, curr_index, end_index))
                    curr_index -= 1
                else:
                    # curr_index fell outside the year window: emit the group
                    # [curr_index + 1, end_index] and start a new one ending
                    # at curr_index.
                    writer.writerow(get_csv_row(voter, tweet_obj_list, curr_index + 1, end_index))
                    end_index = curr_index
Beispiel #5
0
def gen_ds_all_tweets_chunked():
    """Write one chunked CSV row per matched voter covering all their tweets.

    For each (twitter_id, voter_serial) ground-truth pair, collects the
    user's non-retweet tweets and, if any exist, emits a single row built by
    ``get_csv_row_chunked`` over the full list.

    NOTE(review): the source text appears truncated/corrupted immediately
    after this function in the original paste; verify against the upstream
    file.
    """
    mongo_client = helper.get_mongo_client()

    twitter_db = mongo_client['twitter']
    tweets_col = twitter_db['tweets']
    voters_col = twitter_db['voters']
    ground_truths_col = twitter_db['ground_truths']

    # All hand-verified Twitter-account <-> voter-record matches.
    tuples = [(x['twitter_id'], x['voter_serial']) for x in ground_truths_col.find({})]

    with open(ALL_TWEETS_CHUNKED_PATH, 'w', newline='', encoding='utf-8') as wf_csv:
        writer = csv.writer(wf_csv, delimiter=',')
        writer.writerow(CSV_HEADER_CHUNKED)

        for twitter_id, voter_serial in tuples:

            voter = voters_col.find_one({'serial': voter_serial})
            voter['twitter_id'] = twitter_id

            # Original tweets only — retweets are excluded.
            tweet_objs = tweets_col.find({'user.id_str': twitter_id, 'retweeted_status': {'$exists': False}})
            tweet_obj_list = list(tweet_objs)

            if tweet_obj_list:
                writer.writerow(get_csv_row_chunked(voter, tweet_obj_list, 0, len(tweet_obj_list) - 1))
                                                      include_rts=include_rts)

            error = False

        except tweepy.TweepError as te:
            print('api error: {}'.format(te.reason))
            if 'Not authorized' in te.reason or 'page does not exist' in te.reason:
                error = False

    return tweet_objs


if __name__ == '__main__':

    # Script entry point: set up Twitter API handles and Mongo collections,
    # then resume tweet collection from a persisted position index.
    # NOTE(review): this fragment is truncated in the paste; the code that
    # consumes `position` is not visible here.
    apis = helper.get_twitter_app_apis()
    mongo_client = helper.get_mongo_client()
    twitter_db = mongo_client['twitter']
    tweets_col = twitter_db['new_tweets']
    gt_col = twitter_db['ground_truths']
    index_col = twitter_db['new_tweets_index']

    user_ids = []

    # Collect every matched Twitter id from the ground-truth collection.
    gts = gt_col.find({})
    for gt in gts:
        user_ids.append(gt['twitter_id'])

    # Sort so the persisted position is stable across runs.
    user_ids.sort()

    # Resume point: where the previous run left off.
    index = get_index(index_col)
    position = index['position']
Beispiel #7
0
    voter_objs = []

    for i in range(2000):
        voter_file_path = os.path.join(FL_REC_SPLITS_DIR,
                                       'rec_{}.txt'.format(i))

        with open(voter_file_path, 'r') as rf:
            for line in rf:
                tokens = list(map(lambda x: x.strip(), line.split('\t')))
                voter_obj = {
                    'serial': tokens[0],
                    'fname': tokens[1],
                    'mname': tokens[2],
                    'lname': tokens[3],
                    'sex': tokens[4],
                    'dob': tokens[5],
                    'race_code': tokens[6],
                    'add1': tokens[7],
                    'add2': tokens[8],
                    'city': tokens[9],
                    'zip_code': tokens[10],
                    'county_code': tokens[11],
                    'party': tokens[12],
                    'phone': tokens[13],
                    'email': tokens[14]
                }
                voter_objs.append(voter_obj)

    store_voters(helper.get_mongo_client(), voter_objs)
                    raise Exception(
                        'Already visited index {} at file {}'.format(
                            vt_index, i))

                vt_indexes_visited[vt_index] = 1

                twitter_info = line[line.index(':\t') + 2:]

                attributes = twitter_info.split('\t')

                if len(attributes) > 3:
                    twitter_name = attributes[0]
                    twitter_id = attributes[2]

                    count += 1

                    print('{} : {}  -----  {}'.format(count, twitter_name,
                                                      flnames[index]))

                    ground_truth_obj = {
                        'twitter_id': twitter_id,
                        'voter_serial': serials[vt_index]
                    }

                    ground_truth_objs.append(ground_truth_obj)

    #                    if twitter_name == '':
    #                        print(master_ground_truths_file, index)

    store_ground_truths(helper.get_mongo_client(), ground_truth_objs)