Beispiel #1
0
def make(item_map, axis, top=None):
    """Attach a creation time and an axis position to every item occurrence.

    Args:
        item_map: Maps an item (url or hashtag) to a list of
            ``(user_id, tweet_id)`` pairs.
        axis: ``(min_dt, max_dt)`` pair that is forwarded to ``map_axis``.
        top: When given, only the ``top`` items with the most occurrences
            are processed; ``None`` processes every item.

    Returns:
        ``(item_time_map, center_date_map)``. ``item_time_map`` maps each
        selected item to a list of
        ``(user_id, tweet_id, created_at, axis place)`` tuples sorted by
        ``created_at``; occurrences whose ``created_at`` is ``None`` are
        dropped. ``center_date_map`` is the second value returned by
        ``map_axis``.

    Raises:
        KeyError: If an occurrence has no row in ``tweets`` or its
            ``(year, quarter)`` is missing from the axis map.
    """
    min_dt, max_dt = axis
    axis_map, center_date_map = map_axis(min_dt, max_dt)
    print('Axis Map Done')

    # Load every tweet timestamp once; close the connection as soon as the
    # lookup table is built so it is not leaked for the rest of the run.
    conn = db.mk_connection()
    try:
        time_map = {
            (user_id, tweet_id): dt
            for user_id, tweet_id, dt in db.execute(
                conn, 'select user_id, tweet_id, created_at from tweets')
        }
    finally:
        conn.close()
    print('Time Map Done')

    if top is None:
        selected = item_map.keys()
    else:
        # Most frequent items first, truncated to the requested count.
        selected = sorted(
            item_map, key=lambda k: len(item_map[k]), reverse=True)[:top]

    item_time_map = {}
    for item in selected:  # keys can be url/hashtag
        dated = []
        for user_id, tweet_id in item_map[item]:
            dt = time_map[(user_id, tweet_id)]
            if dt is not None:
                dated.append((user_id, tweet_id, dt))
        dated.sort(key=lambda entry: entry[2])
        item_time_map[item] = [
            (user_id, tweet_id, dt, axis_map[(dt.year, quarter(dt))])
            for user_id, tweet_id, dt in dated
        ]
        print('Done Item', item)
    return item_time_map, center_date_map
Beispiel #2
0
 def runner(user, sos=False):
     """Backfill ``tweets.created_at`` for every tweet of ``user``.

     For each tweet id in ``user_tweets[user]`` the stored tweet JSON is read
     from ``connection_tweets``, its ``created_at`` field is parsed with the
     format ``'%a %b %d %H:%M:%S +0000 %Y'`` and written back to the
     ``tweets`` table as ``'%Y-%m-%d %H:%M:%S'``.

     Args:
         user: Key into ``user_tweets`` and the ``user_id`` used in queries.
         sos: When true, errors are re-raised after being reported instead
             of being swallowed.
     """
     try:
         tweets = user_tweets[user]
         conn = db.mk_connection()
         try:
             for ts in tweets:
                 # Reset per tweet so the error report below never hits an
                 # unbound name or shows the previous tweet's timestamp.
                 t = None
                 try:
                     # NOTE(review): this select is built with string
                     # formatting while the update below is parameterized;
                     # confirm the ids are always trusted values.
                     tweet = db.execute(
                         conn,
                         'select tweet from connection_tweets where user_id = %s and tweet_id = %s'
                         % (user, ts))[0][0]
                     obj = json.loads(tweet)
                     t = obj['created_at']
                     t = datetime.datetime.strptime(
                         t, '%a %b %d %H:%M:%S +0000 %Y').strftime(
                             '%Y-%m-%d %H:%M:%S')
                     db.execute(
                         conn,
                         'update tweets set created_at = %s where user_id = %s and tweet_id = %s',
                         has_res=False,
                         args=[t, user, ts])
                 except Exception:
                     print('Error', user, ts, t)
                     if sos:
                         raise
             conn.commit()
         finally:
             # Always release the connection, including on the sos re-raise.
             conn.close()
         print('Done', user)
     except Exception:
         if sos:
             raise
         print('Error', user)
    def runner(user, sos=False):
        """Extract the expanded URLs of a user's tweets into ``tweets_url``.

        Works in three phases, each reported separately on failure:
        fetch the stored tweet JSON from ``connection_tweets``, collect
        ``entities.urls[*].expanded_url`` from it, then insert one
        ``(user, tweet, url)`` row per URL (URL reduced to ASCII) and commit.

        A failed phase does not stop the following ones unless ``sos`` is
        true, so rows gathered before a failure may still be inserted.

        Args:
            user: Key into ``user_tweets`` and the ``user_id`` used in queries.
            sos: When true, errors are re-raised after being reported.
        """
        conn = db.mk_connection()
        try:
            tweets = user_tweets[user]

            metadata = {}
            try:
                for tweet in tweets:
                    metadata[tweet] = db.execute(conn, 'select tweet from connection_tweets where user_id = %s and tweet_id = %s' % (user, tweet))[0][0]
            except Exception:
                print(user, 'Error', '(fetch metadata)')
                if sos:
                    raise

            urls = []
            try:
                for tweet in tweets:
                    for url in json.loads(metadata[tweet])['entities']['urls']:
                        urls.append((tweet, url['expanded_url']))
            except Exception:
                print(user, 'Error', '(parse metadata)')
                if sos:
                    raise

            try:
                for tweet, expanded_url in urls:
                    # Drop non-ASCII characters before storing the URL.
                    ascii_url = expanded_url.encode('ascii', 'ignore').decode('ascii')
                    db.execute(conn, 'insert into tweets_url values (%s, %s, %s)', has_res=False, args=[user, tweet, ascii_url])
                conn.commit()
            except Exception:
                print(user, 'Error', '(insert url)')
                if sos:
                    raise
        finally:
            # Release the connection on every path, not only after a
            # successful commit.
            conn.close()
        print('Done', user)
Beispiel #4
0
 def runner(args):
     """Resolve one shortened URL and record the result in ``tweets_url``.

     ``args`` is a ``(user_id, tweet_id, url)`` triple. The URL is requested
     with ``REQUEST_TIMEOUT``; on success the final URL is stored as
     ``expanded_url``. A timeout is only reported. Any other request failure
     is reported and the literal ``'Request Error'`` is stored instead.
     Database failures are reported and otherwise ignored.
     """
     user_id, tweet_id, url = args

     def store(value):
         # One short-lived connection per update.
         conn = db.mk_connection()
         db.execute(
             conn,
             'update tweets_url set expanded_url = %s where user_id = %s and tweet_id = %s',
             has_res=False,
             args=[value, user_id, tweet_id])
         db.close_connection(conn)

     try:
         response = requests.get(url, timeout=REQUEST_TIMEOUT)
         try:
             store(response.url)
         except:
             print('Insert Error', user_id, tweet_id, response.url)
     except requests.exceptions.Timeout:
         print('Request Timeout', user_id, tweet_id, url)
     except:
         print('Request Error', user_id, tweet_id, url)
         try:
             store('Request Error')
         except:
             print('Request Error [Insert Error]', user_id, tweet_id)
Beispiel #5
0
 def runner(user):
     """Store an ASCII-only copy of the text of every tweet of ``user``.

     For each tweet id in ``tweets[user]`` the text is read with
     ``db.get_tweet_text``, stripped of non-ASCII characters and written with
     ``db.insert_tweet_text``. All writes are committed together at the end.
     Any failure is reported for the user and swallowed.
     """
     try:
         conn = db.mk_connection()
         try:
             for tweet in tweets[user]:
                 raw_text = db.get_tweet_text(conn, user, tweet)
                 text = raw_text.encode('ascii', 'ignore').decode('ascii')
                 db.insert_tweet_text(conn, user, tweet, text)
             conn.commit()
         finally:
             # Close on failure too, so a bad tweet does not leak the
             # connection.
             conn.close()
         print('Done', user)
     except Exception:
         print('Exception on', user)
def map():
    """Build a dict from normalized URL to its ``(user_id, tweet_id)`` uses.

    Every row of ``tweets_url`` is keyed by its normalized ``expanded_url``;
    rows whose ``expanded_url`` is ``None`` or ``'Request Error'`` fall back
    to the normalized ``url`` column.

    The number of occurrences of a URL is ``len(url_map[url])``, so URLs can
    be ranked by sorting the keys on that length.
    """
    rows = db.execute(db.mk_connection(), 'select user_id, tweet_id, url, expanded_url from tweets_url')
    print(len(rows), 'Table Entries')

    url_map = {}
    for user_id, tweet_id, short_url, expanded_url in rows:
        unresolved = expanded_url is None or expanded_url == 'Request Error'
        key = normalize(short_url if unresolved else expanded_url)
        url_map.setdefault(key, []).append((user_id, tweet_id))
    return url_map
Beispiel #7
0
def map():
    """Build a dict from normalized hashtag to its ``(user_id, tweet_id)`` uses.

    The number of occurrences of a hashtag is ``len(hashtag_map[hashtag])``,
    so hashtags can be ranked by sorting the keys on that length.
    """
    rows = db.execute(
        db.mk_connection(),
        'select user_id, tweet_id, hashtag from tweets_hashtags')
    print(len(rows), 'Table Entries')

    hashtag_map = {}
    for user_id, tweet_id, raw_tag in rows:
        hashtag_map.setdefault(normalize(raw_tag), []).append(
            (user_id, tweet_id))
    return hashtag_map
Beispiel #8
0
                db.execute(conn, 'update tweets_url set expanded_url = %s where user_id = %s and tweet_id = %s', has_res = False, args = [r.url, user_id, tweet_id])
                db.close_connection(conn)
            except:
                print('Insert Error', user_id, tweet_id, r.url)
        except requests.exceptions.Timeout:
            print('Request Timeout', user_id, tweet_id, url)
        except:
            print('Request Error', user_id, tweet_id, url)
            try:
                conn = db.mk_connection()
                db.execute(conn, 'update tweets_url set expanded_url = %s where user_id = %s and tweet_id = %s', has_res = False, args = ['Request Error', user_id, tweet_id])
                db.close_connection(conn)
            except:
                print('Request Error [Insert Error]', user_id, tweet_id)

    # Single-entry mode: two command-line arguments are read as a user id and
    # a tweet id and handed to runner directly.
    # NOTE(review): only a 2-tuple is passed here, while the batch mode below
    # feeds runner (user_id, tweet_id, url) triples — confirm runner accepts
    # a tuple without the url.
    if len(sys.argv) == 3:
        user_id = int(sys.argv[1])
        tweet_id = int(sys.argv[2])
        runner((user_id, tweet_id))

    # Batch mode (no command-line arguments): process every row of tweets_url
    # whose expanded_url is still NULL.
    if len(sys.argv) <= 1:
        table = db.execute(db.mk_connection(), 'select user_id, tweet_id, url, expanded_url from tweets_url')
        print(len(table), 'Table Entries')
        # Keep only the rows that have not been unshortened yet.
        table = [(user_id, tweet_id, url) for user_id, tweet_id, url, expanded_url in table if expanded_url is None]
        print(len(table), 'Not Unshortened Entries')

        # Randomize the processing order in place before fanning out.
        random.shuffle(table)
        with multiprocessing.Pool(processes = PROCESSES) as pool:
            # The comprehension only drains the lazy iterator so every task
            # runs; the results themselves are discarded.
            [0 for _ in pool.imap_unordered(runner, table)]

Beispiel #9
0
if __name__ == '__main__':
    import db
    import requests
    from termcolor import colored as cr

    # Load every stored URL up front, then release the connection before the
    # (slow) network loop starts.
    connection = db.mk_connection()
    rows = db.execute(connection, 'select url from tweets_url')
    urls = [row[0] for row in rows]
    print(len(urls), 'URLs')
    connection.close()

    # Request each URL and report the ones that end up somewhere else,
    # together with a running "redirected / seen" tally.
    redirect_counter = 0
    for seen_counter, url in enumerate(urls, start=1):
        try:
            final_url = requests.get(url, timeout=5).url
            if final_url == url:
                continue
            redirect_counter += 1
            redirected = cr(redirect_counter, 'red', attrs=['bold'])
            seen = cr(seen_counter, 'green', attrs=['bold'])
            print(redirected, '/', seen, url, '->', final_url, flush=True)
        except requests.exceptions.Timeout:
            print('Timeout', url)
        except:
            print('Fail', url)
Beispiel #10
0
    return \
            'const data = {\n' +\
            '  datasets: [\n' +\
            ',\n'.join([make_data(i) for i in range(limit)]) +\
            '\n  ]\n};'

if __name__ == '__main__':
    import sys
    import db

    if len(sys.argv) >= 2:
        if sys.argv[1] == 'hashtag':
            import hashtag_frequency
            import timeline

            axis = db.execute(db.mk_connection(), 'select min(created_at), max(created_at) from tweets')[0]
            limit = int(sys.argv[2]) if len(sys.argv) >= 3 else 15
            m, center_date_map = timeline.make(hashtag_frequency.map(), axis, top = limit)
            def get_x_date(axis_t):
                d = center_date_map[axis_t]
                return '"%s"' % d.strftime('%d/%m/%Y')
            def get_y_reverse(item_i):
                return 10 + (limit - item_i) * 16

            s = make(m, limit, xfn = get_x_date, yfn = get_y_reverse)
            fname = sys.argv[3] if len(sys.argv) >= 4 else 'data.js'
            with open(fname, 'w') as f:
                f.write(s)

        if sys.argv[1] == 'url':
            import url_frequency