# Example 1
def query_top_spreaders():
    """Handle API request '/top-user'.

    API Request Parameters
    ----------------------
        upper_day : string formatted datetime
        most_recent : bool

    API Response Keys
    -----------------
        status : string
        num_of_entries : int
        spreaders : dict
            bot_score : float
            number_of_tweets : int
            site_type : {'claim', 'fact_checking'}
            spreading_type : {'active', 'influential'}
            upper_day : string formatted datetime
            user_id : int
            user_raw_id : string
            user_screen_name : string

    """
    # PyLucene: every thread that touches the JVM must attach itself first.
    lucene.getVMEnv().attachCurrentThread()
    # Default window edge is yesterday (UTC), formatted yyyy-mm-dd.
    yesterday = datetime.utcnow().date() - timedelta(days=1)
    yesterday = yesterday.strftime('%Y-%m-%d')

    # Validate/coerce the query-string parameters.
    q_top_spreaders_schema = Schema({
        Optional('upper_day', default=yesterday):
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        And(Regex(r'^\d{4}-\d{2}-\d{2}$'),
            Use(dateutil.parser.parse),
            error='Invalid date, should be yyyy-mm-dd format'),
        # NOTE: `unicode` makes this Python-2-only; switch to `str` on py3.
        Optional('most_recent', default=True):
        And(unicode,
            Use(lambda s: s.lower()), lambda s: s in ('true', 'false'),
            Use(lambda s: s == 'true')),
    })
    q_kwargs = copy_req_args(request.args)
    try:
        q_kwargs = q_top_spreaders_schema.validate(q_kwargs)
        df = db_query_top_spreaders(engine, **q_kwargs)
        if len(df) == 0:
            raise APINoResultError('No top spreader found!')
        response = dict(
            status='OK',
            num_of_entries=len(df),
            spreaders=flask.json.loads(df.to_json(**TO_JSON_KWARGS)))
    except SchemaError as e:
        # Client sent invalid query parameters.
        response = dict(status='ERROR', error=str(e))
    except APINoResultError as e:
        response = dict(status='No result error', error=str(e))
    except Exception as e:
        # Unexpected failure: log the full traceback, return a generic error.
        logger.exception(e)
        response = dict(status='ERROR', error='Server error, query failed')
    return flask.jsonify(response)
# Example 2
    def look_up_top_spreaders(cls, session, upper_day, most_recent):
        """Look up and print the top spreaders for a 30-day window.

        Parameters
        ----------
        session : object
            An instance of SQLAlchemy Session.
        upper_day : date
            The right edge of 30 days window.
        most_recent : bool
            If no result in 30 days windows with `upper_day`, whether return
            most recently available data.
        """
        result = db_query_top_spreaders(session, upper_day, most_recent)
        # Log before printing, mirroring the generator's reporting style.
        if len(result) == 0:
            logger.warning('No result found!')
        else:
            logger.info('Top spreaders for day %s', upper_day)
        cls.print_psql(result)
# Example 3
    def generate_top_spreaders(cls, session, upper_day):
        """Generate and store top spreaders for a 30-day window.

        Computes the top-20 spreaders for the window ending at `upper_day`
        -- most active (tweet count) and most influential (retweet count),
        for both 'claim' and 'fact_checking' site types -- inserts them into
        table top20_spreader_monthly, optionally annotates them with
        Botometer bot scores, then prints the stored rows.  A no-op if rows
        for `upper_day` already exist.

        Parameters
        ----------
        session : object
            An instance of SQLAlchemy Session.
        upper_day : Date
            The right edge of 30 days window.
        """
        # Idempotence guard: skip generation if this window already exists.
        df = db_query_top_spreaders(session, upper_day)
        if len(df) > 0:
            logger.warning('Top spreaders for upper_day %s, already exist!',
                           upper_day)
            cls.print_psql(df)
            return
        # The "month" here is a fixed 30-day window, not a calendar month.
        month_delta = relativedelta(days=30)
        lower_day = upper_day - month_delta
        # top 20 most active spreaders for 'fact_checking'
        # DISTINCT ON(tw.user_id) keeps one row per user; the trailing
        # ORDER BY decides which row wins -- here the most recent tweet,
        # used to pick the user's latest screen_name from json_data.
        q1 = """
        INSERT INTO top20_spreader_monthly (upper_day, user_id, user_raw_id,
            user_screen_name, site_type, spreading_type, number_of_tweets)
        SELECT DISTINCT ON(tw.user_id)
            :upper_day AS upper_day,
            tw.user_id,
            tu.raw_id AS user_raw_id,
            tw.json_data#>>'{user, screen_name}' AS user_screen_name,
            'fact_checking' AS site_type,
            'active' AS spreading_type,
            t.number_of_tweets
        FROM
            (
                SELECT tw1.user_id, COUNT(tw1.id) AS number_of_tweets
                FROM tweet AS tw1
                    JOIN ass_tweet_url AS atu ON atu.tweet_id=tw1.id
                    JOIN url AS u ON u.id=atu.url_id
                    JOIN site AS s ON s.id=u.site_id
                WHERE tw1.created_at BETWEEN :lower_day AND :upper_day
                    AND site_type LIKE 'fact_checking'
                GROUP BY tw1.user_id
                ORDER BY number_of_tweets DESC LIMIT 20
            ) AS t
            JOIN tweet AS tw ON t.user_id=tw.user_id
            JOIN twitter_user AS tu ON tu.id=tw.user_id
        ORDER BY tw.user_id, tw.created_at DESC, t.number_of_tweets DESC
        """
        # top 20 most active spreaders for 'claim'
        # NOTE(review): q2's final ORDER BY ranks number_of_tweets before
        # created_at, unlike q1/q3/q4 -- confirm which DISTINCT ON tie-break
        # is intended.  Also q1/q2 use an unqualified `site_type` in WHERE
        # while q3/q4 qualify it as s.site_type; presumably the same column
        # via the join on site -- verify against the schema.
        q2 = """
        INSERT INTO top20_spreader_monthly (upper_day, user_id, user_raw_id,
            user_screen_name, site_type, spreading_type, number_of_tweets)
        SELECT DISTINCT ON(tw.user_id)
            :upper_day AS upper_day,
            tw.user_id,
            tu.raw_id AS user_raw_id,
            tw.json_data#>>'{user, screen_name}' AS user_screen_name,
            'claim' AS site_type,
            'active' AS spreading_type,
            t.number_of_tweets
        FROM
            (
                SELECT tw1.user_id, COUNT(tw1.id) AS number_of_tweets
                FROM tweet AS tw1
                    JOIN ass_tweet_url AS atu ON atu.tweet_id=tw1.id
                    JOIN url AS u ON u.id=atu.url_id
                    JOIN site AS s ON s.id=u.site_id
                WHERE tw1.created_at BETWEEN :lower_day AND :upper_day
                    AND site_type LIKE 'claim'
                GROUP BY tw1.user_id
                ORDER BY number_of_tweets DESC LIMIT 20
            ) AS t
            JOIN tweet AS tw ON t.user_id=tw.user_id
            JOIN twitter_user AS tu ON tu.id=tw.user_id
        ORDER BY tw.user_id, t.number_of_tweets DESC, tw.created_at DESC
        """
        # top 20 most influential spreaders for 'fact_checking'
        # "Influential" = ranked by distinct retweets of the user's tweets;
        # note the count is inserted into the number_of_tweets column.
        q3 = """
        INSERT INTO top20_spreader_monthly (upper_day, user_id, user_raw_id,
            user_screen_name, site_type, spreading_type, number_of_tweets)
        SELECT DISTINCT ON(tw.user_id)
            :upper_day AS upper_day,
            tw.user_id,
            tu.raw_id AS user_raw_id,
            tw.json_data#>>'{user, screen_name}' AS user_screen_name,
            'fact_checking' AS site_type,
            'influential' AS spreading_type,
            t.number_of_retweets
        FROM (
            SELECT COUNT(DISTINCT atw.id) AS number_of_retweets,
                    tu.id AS user_id
            FROM ass_tweet AS atw
                JOIN tweet AS tw ON tw.raw_id=atw.retweeted_status_id
                JOIN twitter_user AS tu ON tu.id=tw.user_id
                JOIN ass_tweet_url AS atu ON atu.tweet_id=tw.id
                JOIN url AS u ON u.id=atu.url_id
                JOIN site AS s ON s.id=u.site_id
            WHERE tw.created_at BETWEEN :lower_day AND :upper_day
                AND s.site_type LIKE 'fact_checking'
            GROUP BY tu.id
            ORDER BY number_of_retweets DESC LIMIT 20
            ) AS t
            JOIN tweet AS tw ON t.user_id=tw.user_id
            JOIN twitter_user AS tu ON tu.id=tw.user_id
        ORDER BY tw.user_id, tw.created_at DESC, t.number_of_retweets DESC
        """
        # top 20 most influential for 'claim'
        q4 = """
        INSERT INTO top20_spreader_monthly (upper_day, user_id, user_raw_id,
            user_screen_name, site_type, spreading_type, number_of_tweets)
        SELECT DISTINCT ON(tw.user_id)
            :upper_day AS upper_day,
            tw.user_id,
            tu.raw_id AS user_raw_id,
            tw.json_data#>>'{user, screen_name}' AS user_screen_name,
            'claim' AS site_type,
            'influential' AS spreading_type,
            t.number_of_retweets
        FROM (
            SELECT COUNT(DISTINCT atw.id) AS number_of_retweets,
                    tu.id AS user_id
            FROM ass_tweet AS atw
                JOIN tweet AS tw ON tw.raw_id=atw.retweeted_status_id
                JOIN twitter_user AS tu ON tu.id=tw.user_id
                JOIN ass_tweet_url AS atu ON atu.tweet_id=tw.id
                JOIN url AS u ON u.id=atu.url_id
                JOIN site AS s ON s.id=u.site_id
            WHERE tw.created_at BETWEEN :lower_day AND :upper_day
                AND s.site_type LIKE 'claim'
            GROUP BY tu.id
            ORDER BY number_of_retweets DESC LIMIT 20
            ) AS t
            JOIN tweet AS tw ON t.user_id=tw.user_id
            JOIN twitter_user AS tu ON tu.id=tw.user_id
        ORDER BY tw.user_id, tw.created_at DESC, t.number_of_retweets DESC
"""
        # Run all four INSERT ... SELECT statements, then commit them
        # together as one transaction.
        session.execute(
            text(q1).bindparams(lower_day=lower_day, upper_day=upper_day))
        session.execute(
            text(q2).bindparams(lower_day=lower_day, upper_day=upper_day))
        session.execute(
            text(q3).bindparams(lower_day=lower_day, upper_day=upper_day))
        session.execute(
            text(q4).bindparams(lower_day=lower_day, upper_day=upper_day))
        session.commit()
        # Reload the freshly inserted rows as ORM objects so they can be
        # mutated with bot scores.
        # NOTE(review): `mspeaders` looks like a typo for `mspreaders`.
        mspeaders = session.query(Top20SpreaderMonthly).filter_by(
            upper_day=upper_day).all()
        # Optionally annotate each spreader with a Botometer bot score.
        if cls.conf['botometer']['enabled'] is True:
            cls.set_bot_or_not(mspeaders)
        try:
            # Second commit persists the bot scores set above.
            session.commit()
            df = db_query_top_spreaders(session, upper_day)
            if len(df) == 0:
                logger.warning('No new top spreaders found!')
                cls.print_psql(df)
            else:
                logger.info('Insert new top spreaders for day %s', upper_day)
                cls.print_psql(df)
        except SQLAlchemyError as e:
            logger.error(e)
            session.rollback()