def query_top_spreaders():
    """Handle API request '/top-user'.

    API Request Parameters
    ----------------------
    upper_day : string formatted datetime
    most_recent : bool

    API Response Keys
    -----------------
    status : string
    num_of_entries : int
    spreaders : dict
        bot_score : float
        number_of_tweets : int
        site_type : {'claim', 'fact_checking'}
        spreading_type : {'active', 'influential'}
        upper_day : string formatted datetime
        user_id : int
        user_raw_id : string
        user_screen_name : string
    """
    lucene.getVMEnv().attachCurrentThread()
    yesterday = datetime.utcnow().date() - timedelta(days=1)
    yesterday = yesterday.strftime('%Y-%m-%d')
    q_top_spreaders_schema = Schema({
        Optional('upper_day', default=yesterday): And(
            Regex(r'^\d{4}-\d{2}-\d{2}$'),
            Use(dateutil.parser.parse),
            error='Invalid date, should be yyyy-mm-dd format'),
        Optional('most_recent', default=True): And(
            unicode,
            Use(lambda s: s.lower()),
            lambda s: s in ('true', 'false'),
            Use(lambda s: s == 'true')),
    })
    q_kwargs = copy_req_args(request.args)
    try:
        q_kwargs = q_top_spreaders_schema.validate(q_kwargs)
        df = db_query_top_spreaders(engine, **q_kwargs)
        if len(df) == 0:
            raise APINoResultError('No top spreader found!')
        response = dict(
            status='OK',
            num_of_entries=len(df),
            spreaders=flask.json.loads(df.to_json(**TO_JSON_KWARGS)))
    except SchemaError as e:
        response = dict(status='ERROR', error=str(e))
    except APINoResultError as e:
        response = dict(status='No result error', error=str(e))
    except Exception as e:
        logger.exception(e)
        response = dict(status='ERROR', error='Server error, query failed')
    return flask.jsonify(response)
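# Client-side sketch for the '/top-user' endpoint documented above. The host,
# port, and the helper name `fetch_top_spreaders` are illustrative placeholders;
# only the query parameters and response keys come from the docstring of
# query_top_spreaders().
def fetch_top_spreaders(upper_day, most_recent=True,
                        base_url='http://localhost:5000'):
    """Call /top-user and return the 'spreaders' payload, or raise on error."""
    import requests

    params = {'upper_day': upper_day,  # 'yyyy-mm-dd' string
              'most_recent': 'true' if most_recent else 'false'}
    data = requests.get(base_url + '/top-user', params=params).json()
    if data['status'] == 'OK':
        return data['spreaders']
    raise RuntimeError(data.get('error', 'query failed'))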
def look_up_top_spreaders(cls, session, upper_day, most_recent):
    """Look up top spreaders.

    Parameters
    ----------
    session : object
        An instance of SQLAlchemy Session.
    upper_day : date
        The right edge of the 30-day window.
    most_recent : bool
        When no result exists in the 30-day window ending at `upper_day`,
        whether to return the most recently available data instead.
    """
    df = db_query_top_spreaders(session, upper_day, most_recent)
    if len(df) == 0:
        logger.warning('No result found!')
    else:
        logger.info('Top spreaders for day %s', upper_day)
    cls.print_psql(df)
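# Usage sketch for the look-up helper above. The session factory name
# (`Session`) and the command class name (`ReportCmd`) are illustrative
# placeholders, not names taken from this code; only the method signature
# comes from look_up_top_spreaders().
def example_look_up_yesterday():
    """Look up yesterday's top spreaders, falling back to the most recent data."""
    from datetime import date, timedelta

    upper_day = date.today() - timedelta(days=1)
    session = Session()  # hypothetical SQLAlchemy session factory
    try:
        ReportCmd.look_up_top_spreaders(session, upper_day, most_recent=True)
    finally:
        session.close()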
def generate_top_spreaders(cls, session, upper_day):
    """Generate top spreaders.

    Parameters
    ----------
    session : object
        An instance of SQLAlchemy Session.
    upper_day : date
        The right edge of the 30-day window.
    """
    df = db_query_top_spreaders(session, upper_day)
    if len(df) > 0:
        logger.warning('Top spreaders for upper_day %s already exist!',
                       upper_day)
        cls.print_psql(df)
        return
    month_delta = relativedelta(days=30)
    lower_day = upper_day - month_delta
    # top 20 most active spreaders for 'fact_checking'
    q1 = """
    INSERT INTO top20_spreader_monthly (upper_day, user_id, user_raw_id,
        user_screen_name, site_type, spreading_type, number_of_tweets)
    SELECT DISTINCT ON (tw.user_id)
        :upper_day AS upper_day,
        tw.user_id,
        tu.raw_id AS user_raw_id,
        tw.json_data#>>'{user, screen_name}' AS user_screen_name,
        'fact_checking' AS site_type,
        'active' AS spreading_type,
        t.number_of_tweets
    FROM (
        SELECT tw1.user_id, COUNT(tw1.id) AS number_of_tweets
        FROM tweet AS tw1
        JOIN ass_tweet_url AS atu ON atu.tweet_id=tw1.id
        JOIN url AS u ON u.id=atu.url_id
        JOIN site AS s ON s.id=u.site_id
        WHERE tw1.created_at BETWEEN :lower_day AND :upper_day
            AND s.site_type LIKE 'fact_checking'
        GROUP BY tw1.user_id
        ORDER BY number_of_tweets DESC
        LIMIT 20
    ) AS t
    JOIN tweet AS tw ON t.user_id=tw.user_id
    JOIN twitter_user AS tu ON tu.id=tw.user_id
    ORDER BY tw.user_id, tw.created_at DESC, t.number_of_tweets DESC
    """
    # top 20 most active spreaders for 'claim'
    q2 = """
    INSERT INTO top20_spreader_monthly (upper_day, user_id, user_raw_id,
        user_screen_name, site_type, spreading_type, number_of_tweets)
    SELECT DISTINCT ON (tw.user_id)
        :upper_day AS upper_day,
        tw.user_id,
        tu.raw_id AS user_raw_id,
        tw.json_data#>>'{user, screen_name}' AS user_screen_name,
        'claim' AS site_type,
        'active' AS spreading_type,
        t.number_of_tweets
    FROM (
        SELECT tw1.user_id, COUNT(tw1.id) AS number_of_tweets
        FROM tweet AS tw1
        JOIN ass_tweet_url AS atu ON atu.tweet_id=tw1.id
        JOIN url AS u ON u.id=atu.url_id
        JOIN site AS s ON s.id=u.site_id
        WHERE tw1.created_at BETWEEN :lower_day AND :upper_day
            AND s.site_type LIKE 'claim'
        GROUP BY tw1.user_id
        ORDER BY number_of_tweets DESC
        LIMIT 20
    ) AS t
    JOIN tweet AS tw ON t.user_id=tw.user_id
    JOIN twitter_user AS tu ON tu.id=tw.user_id
    ORDER BY tw.user_id, t.number_of_tweets DESC, tw.created_at DESC
    """
    # top 20 most influential spreaders for 'fact_checking'
    q3 = """
    INSERT INTO top20_spreader_monthly (upper_day, user_id, user_raw_id,
        user_screen_name, site_type, spreading_type, number_of_tweets)
    SELECT DISTINCT ON (tw.user_id)
        :upper_day AS upper_day,
        tw.user_id,
        tu.raw_id AS user_raw_id,
        tw.json_data#>>'{user, screen_name}' AS user_screen_name,
        'fact_checking' AS site_type,
        'influential' AS spreading_type,
        t.number_of_retweets
    FROM (
        SELECT COUNT(DISTINCT atw.id) AS number_of_retweets, tu.id AS user_id
        FROM ass_tweet AS atw
        JOIN tweet AS tw ON tw.raw_id=atw.retweeted_status_id
        JOIN twitter_user AS tu ON tu.id=tw.user_id
        JOIN ass_tweet_url AS atu ON atu.tweet_id=tw.id
        JOIN url AS u ON u.id=atu.url_id
        JOIN site AS s ON s.id=u.site_id
        WHERE tw.created_at BETWEEN :lower_day AND :upper_day
            AND s.site_type LIKE 'fact_checking'
        GROUP BY tu.id
        ORDER BY number_of_retweets DESC
        LIMIT 20
    ) AS t
    JOIN tweet AS tw ON t.user_id=tw.user_id
    JOIN twitter_user AS tu ON tu.id=tw.user_id
    ORDER BY tw.user_id, tw.created_at DESC, t.number_of_retweets DESC
    """
    # top 20 most influential spreaders for 'claim'
    q4 = """
    INSERT INTO top20_spreader_monthly (upper_day, user_id, user_raw_id,
        user_screen_name, site_type, spreading_type, number_of_tweets)
    SELECT DISTINCT ON (tw.user_id)
        :upper_day AS upper_day,
        tw.user_id,
        tu.raw_id AS user_raw_id,
        tw.json_data#>>'{user, screen_name}' AS user_screen_name,
        'claim' AS site_type,
        'influential' AS spreading_type,
        t.number_of_retweets
    FROM (
        SELECT COUNT(DISTINCT atw.id) AS number_of_retweets, tu.id AS user_id
        FROM ass_tweet AS atw
        JOIN tweet AS tw ON tw.raw_id=atw.retweeted_status_id
        JOIN twitter_user AS tu ON tu.id=tw.user_id
        JOIN ass_tweet_url AS atu ON atu.tweet_id=tw.id
        JOIN url AS u ON u.id=atu.url_id
        JOIN site AS s ON s.id=u.site_id
        WHERE tw.created_at BETWEEN :lower_day AND :upper_day
            AND s.site_type LIKE 'claim'
        GROUP BY tu.id
        ORDER BY number_of_retweets DESC
        LIMIT 20
    ) AS t
    JOIN tweet AS tw ON t.user_id=tw.user_id
    JOIN twitter_user AS tu ON tu.id=tw.user_id
    ORDER BY tw.user_id, tw.created_at DESC, t.number_of_retweets DESC
    """
    session.execute(
        text(q1).bindparams(lower_day=lower_day, upper_day=upper_day))
    session.execute(
        text(q2).bindparams(lower_day=lower_day, upper_day=upper_day))
    session.execute(
        text(q3).bindparams(lower_day=lower_day, upper_day=upper_day))
    session.execute(
        text(q4).bindparams(lower_day=lower_day, upper_day=upper_day))
    session.commit()
    mspreaders = session.query(Top20SpreaderMonthly).filter_by(
        upper_day=upper_day).all()
    if cls.conf['botometer']['enabled'] is True:
        cls.set_bot_or_not(mspreaders)
    try:
        session.commit()
        df = db_query_top_spreaders(session, upper_day)
        if len(df) == 0:
            logger.warning('No new top spreaders found!')
        else:
            logger.info('Inserted new top spreaders for day %s', upper_day)
        cls.print_psql(df)
    except SQLAlchemyError as e:
        logger.error(e)
        session.rollback()
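# Minimal sketch of the pattern shared by the four INSERT statements above:
# a parametrized raw query bound through SQLAlchemy's text().bindparams(), with
# PostgreSQL's DISTINCT ON keeping one row per user (the most recent tweet).
# Table and column names mirror the queries above; the helper name and the
# `engine` argument (assumed to be a SQLAlchemy engine bound to the same
# database) are illustrative.
def example_distinct_on_query(engine, upper_day):
    """Return one (user_id, screen_name) row per user active in the window."""
    from dateutil.relativedelta import relativedelta
    from sqlalchemy import text

    lower_day = upper_day - relativedelta(days=30)
    q = """
    SELECT DISTINCT ON (tw.user_id)
        tw.user_id,
        tw.json_data#>>'{user, screen_name}' AS user_screen_name
    FROM tweet AS tw
    WHERE tw.created_at BETWEEN :lower_day AND :upper_day
    ORDER BY tw.user_id, tw.created_at DESC
    """
    with engine.connect() as conn:
        return conn.execute(
            text(q).bindparams(lower_day=lower_day,
                               upper_day=upper_day)).fetchall()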