Example #1
 def run(cls, args):
     try:
         # print(args)
         args = cls.args_schema.validate(args)
     except SchemaError as e:
         sys.exit(e)
     session = Session()
     # make sure lucene is initialized
     lucene.initVM()
     lucene.getVMEnv().attachCurrentThread()
     if args['--index'] is True:
         configure_logging(
             'lucene.index', console_level=args['--console-log-level'])
         mgid = get_or_create_m(
             session,
             MetaInfo,
             data=dict(
                 name='article_group_id_lucene_index',
                 value='0',
                 value_type='int',
                 description='article.group_id used for lucene index'),
             fb_uk='name')
         if args['--mode'] == 'create':
             mgid.set_value(0)
             session.commit()
         logger.debug('Indexing started.. Getting articles..')
         q = """
         SELECT DISTINCT ON (a.group_id) a.id, a.group_id,
             a.canonical_url,
             a.title, a.meta, a.content,
             coalesce(a.date_published, a.date_captured) AS pd,
             s.domain, s.site_type
         FROM article AS a
             JOIN site AS s ON s.id=a.site_id
         WHERE a.site_id IS NOT NULL AND s.is_enabled IS TRUE
             AND a.group_id>:gid
         ORDER BY group_id, pd ASC
         """
         articles_iter = session.execute(
             sqlalchemy.text(q).bindparams(gid=mgid.get_value()))
         cls.index(session, args['--mode'], articles_iter, mgid)
     elif args['--search'] is True:
         configure_logging(
             'lucene.search', console_level=args['--console-log-level'])
         cls.search(args['--query'], args['--top'])
     else:
         print("Unrecognized command!")
         sys.exit(2)
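
The args dictionary in these run entry points is docopt-style: option flags map to booleans and option values to strings. A minimal sketch of driving the indexing branch above, assuming a hypothetical LuceneCmd class that owns this run method and an already-configured database:

# Hypothetical invocation; the keys mirror the docopt options used in run().
args = {
    '--index': True,
    '--search': False,
    '--mode': 'append',                  # 'create' resets the group-id marker to 0
    '--console-log-level': 'INFO',
}
LuceneCmd.run(args)                      # validated by args_schema, then dispatched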
Example #2
 def run(cls, args):
     """Overriding method as the entry point of this command."""
     session = Session()
     cls.init(session, args['--force-drop'],
              ignore_inactive=args['--ignore-inactive'],
              force_inactive=args['--force-inactive'],
              ignore_redirected=args['--ignore-redirected'])
Example #3
 def run(cls, args):
     """Overriding method as the entry point of this command."""
     session = Session(expire_on_commit=False)
     if args['--twitter-streaming'] is True:
         configure_logging('twitter.streaming',
                           console_level=args['--console-log-level'])
         cls.twitter_stream(session, args)
Example #4
def db_query_top_spreaders(engine, upper_day, most_recent=False):
    """Query top 20 spreaders in the 30 days window.

    Parameters
    ----------
    engine : object
        A SQLAlchemy connection, e.g., engine or session.
    upper_day : datetime
        The right edge of the 30 days window.
    most_recent : bool
        When there are no results for `upper_day`, whether to return the
        most recent available results.

    Returns
    -------
    pandas.DataFrame
        Columns of the dataframe are ['upper_day', 'user_id',
        'user_raw_id', 'user_screen_name', 'site_type',
        'spreading_type', 'number_of_tweets', 'bot_score']
    """
    q0 = """
    SELECT upper_day, user_id, user_raw_id, user_screen_name, site_type,
    spreading_type, number_of_tweets, bot_or_not
    FROM top20_spreader_monthly WHERE upper_day=:upper_day
    ORDER BY site_type, spreading_type, number_of_tweets DESC
    """
    q = text(q0).bindparams(upper_day=upper_day)
    rp = engine.execute(q)
    df = pd.DataFrame(iter(rp), columns=rp.keys())
    if len(df) == 0 and most_recent is True:
        session = Session()
        upper_day = get_max(session, Top20SpreaderMonthly.upper_day)
        if upper_day is None:
            raise APINoResultError
        else:
            q = text(q0).bindparams(upper_day=upper_day)
            rp = engine.execute(q)
            df = pd.DataFrame(iter(rp), columns=rp.keys())
    df['user_raw_id'] = df.user_raw_id.astype(str)

    def get_bot_score(bon):
        if bon is None:
            return None
        elif 'score' in bon:
            return bon['score']
        elif 'scores' in bon:
            return bon['scores'].get('universal')
        else:
            return None

    df['bot_score'] = df.bot_or_not.apply(get_bot_score)
    df = df.drop('bot_or_not', axis=1)
    return df
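
A brief usage sketch for this helper, assuming an existing SQLAlchemy engine pointing at the hoaxy database and that top20_spreader_monthly is populated; the connection URL and date are illustrative only:

from datetime import datetime
from sqlalchemy import create_engine

engine = create_engine('postgresql://localhost/hoaxy')     # illustrative URL
upper_day = datetime(2017, 6, 30)                          # right edge of the 30-day window
df = db_query_top_spreaders(engine, upper_day, most_recent=True)
# bot_or_not has already been flattened into the numeric bot_score column
print(df[['user_screen_name', 'number_of_tweets', 'bot_score']].head())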
Example #5
def saver_queue(q2, number_of_workers):
    engine = create_engine(URL(**CONF['database']['connect_args']),
                           pool_size=1,
                           pool_recycle=CONF['database']['pool_recycle'],
                           client_encoding='utf8')
    Session = scoped_session(sessionmaker(bind=engine))
    session = Session()
    workers_status = [1 for i in range(number_of_workers)]
    while True:
        pid, status, uusers = q2.get()
        if status == 'STOP':
            logger.info(
                'Saver process: STOP sign of worker process %s received from q2',
                pid)
            workers_status[pid] = 0
            if sum(workers_status) == 0:
                logger.warning('All STOP signs received from q2.')
                logger.warning('Data saving task done!')
                break
        else:
            logger.info('Saver process: size of uusers is %s', len(uusers))
            stmt_do_nothing = insert(TwitterUserUnion).values(
                uusers).on_conflict_do_nothing(index_elements=['raw_id'])
            session.execute(stmt_do_nothing)
            session.commit()
Example #6
 def run(cls, args):
     """Overriding method as the entry point of this command."""
     try:
         args = cls.args_schema.validate(args)
     except SchemaError as e:
         raise SystemExit('\n' + str(e) + '\n')
     session = Session(expire_on_commit=False)
     if args['--twitter-streaming'] is True:
         configure_logging('twitter.streaming')
         cls.twitter_stream(session, args)
     elif args['--load-tweets'] is True:
         configure_logging('twitter.load-tweets')
         cls.load_tweets(session, args)
     elif args['--reparse-db-tweets'] is True:
         configure_logging('twitter.reparse-db', file_level='WARNING')
         cls._test_table_names(session, args)
         cls.reparse_db(session, args)
Example #7
def db_query_top_articles(engine, upper_day, most_recent=False):
    """Query top 20 articles in the 30 days window.

    Parameters
    ----------
    engine : object
        A SQLAlchemy connection, e.g., engine or session.
    upper_day : datetime
        The right edge of the 30 days window.
    most_recent : bool
        When there are no results for `upper_day`, whether to return the
        most recent available results.

    Returns
    -------
    pandas.DataFrame
        Columns of the dataframe are ['upper_day', 'date_captured',
        'title', 'canonical_url', 'site_type', 'number_of_tweets'].
    """
    q0 = """
    SELECT upper_day, date_captured, title, canonical_url, site_type,
    number_of_tweets
    FROM top20_article_monthly WHERE upper_day=:upper_day
    ORDER BY site_type, number_of_tweets DESC
    """
    q = text(q0).bindparams(upper_day=upper_day)
    rp = engine.execute(q)
    df = pd.DataFrame(iter(rp), columns=rp.keys())
    if len(df) == 0 and most_recent is True:
        session = Session()
        upper_day = get_max(session, Top20ArticleMonthly.upper_day)
        if upper_day is None:
            raise APINoResultError
        else:
            q = text(q0).bindparams(upper_day=upper_day)
            rp = engine.execute(q)
            df = pd.DataFrame(iter(rp), columns=rp.keys())
    return df
Example #8
    def _monitor(self):
        """Monitor the queue for tweet, and use function parse to parse it.

        This method runs on a separate, internal thread.
        The thread will terminate if it sees a sentinel object in the queue.
        """
        # scoped_session
        # Session itself is not thread safe, so use scoped_session:
        # each thread uses only one scoped_session object.
        # We never delete anything from the database, and we rely heavily
        # on the `id` of existing objects to build relationships,
        # so set expire_on_commit=False
        # to avoid re-fetching these existing objects.
        session = Session(expire_on_commit=False)
        parser = Parser(session, self.platform_id, **self.p_kwargs)
        q = self.queue
        has_task_done = hasattr(q, 'task_done')
        while not self._stop.isSet():
            # Server down, hold on
            if self._hold_on is True:
                logger.info('qsize is %s', q.qsize())
                time.sleep(self._hold_on_unit)
                self._hold_on_counter += self._hold_on_unit
                if self._hold_on_counter >= self._hold_on_max:
                    return
                logger.info('Hold on, keep trying to connect to the SQL server...')
                logger.info('Elapsed %s seconds, since recent server down',
                            self._hold_on_counter)
                if self._test_connection(session):
                    self._hold_on = False
                    self._hold_on_counter = 0
                continue
            try:
                jd = self.dequeue(True)
                if jd is self._sentinel:
                    break
                self._counter += 1
                if self._counter % self._window_size == 0:
                    logger.info('qsize is %s', q.qsize())
                parser.parse(jd)
                if has_task_done:
                    q.task_done()
            except Queue.Empty:
                break
            except Exception as e:
                logger.error('Exception %s when parsing %s', e, jd)
                if isinstance(e, SQLAlchemyError):
                    session.rollback()
                    if isinstance(e, OperationalError):
                        # if 'could not connect to server' in str(e):
                        logger.error('Hold on until SQL service back! %s', e)
                        self._hold_on = True
        # There might still be records in the queue.
        while True:
            try:
                jd = self.dequeue(False)
                if jd is self._sentinel:
                    break
                parser.parse(jd)
                if has_task_done:
                    q.task_done()
            except Queue.Empty:
                break
            except Exception as e:
                logger.error('Exception %s when parsing %s', e, jd)
                if isinstance(e, SQLAlchemyError):
                    session.rollback()
                    if isinstance(e, OperationalError):
                        return
Example #9
 def run(cls, args):
     """Overriding method as the entry point of this command."""
     session = Session()
     if args['--volume'] is True:
         configure_logging('report.volume',
                           console_level=args['--console-log-level'],
                           file_level='WARNING')
         table_names = ['tweet', 'url', 'article']
         table = args['--table']
         if table not in table_names:
             logger.critical('Available tables are: %s', table_names)
             sys.exit(2)
         interval_names = [
             'minute', 'hour', 'day', 'week', 'month', 'quarter', 'year'
         ]
         interval = args['--interval']
         if interval not in interval_names:
             logger.critical('Available intervals are: %s', interval_names)
             sys.exit(2)
         limit = args['--limit']
         if int(limit) <= 0:
             logger.critical('%r should be larger than 0', limit)
             sys.exit(2)
         sql = """
         SELECT count(id) as agg_num,
             date_trunc(:interval, created_at) as interval
         FROM %s
         GROUP BY interval
         ORDER BY interval DESC
         LIMIT :limit""" % table
         stmt = text(sql).bindparams(interval=interval, limit=limit)
         strf = '%Y-%m-%d %H:%M:%S'
         with ENGINE.connect() as conn:
             result = conn.execute(stmt).fetchall()
             print(('-' * 35))
             print(('{0:^20s} | {1:12s}'.format('Timeline (%s)' % interval,
                                                'Aggregation')))
             print(('-' * 35))
             for v, t in result:
                 print(('{0:^20s} | {1:8d}'.format(t.strftime(strf), v)))
             print(('-' * 35))
     elif args['--status']:
         configure_logging('report.streaming-status',
                           console_level=args['--console-log-level'])
         table_name = None
         if args['--status'] == 'twitter':
             table_name = 'tweet'
         if table_name is None:
             logger.critical('SNS %r has not been implemented!',
                             args['--status'])
             sys.exit(2)
         sql = 'SELECT created_at FROM {} ORDER BY id DESC LIMIT 1'.format(
             'tweet')
         with ENGINE.connect() as conn:
             most_recent, = conn.execute(text(sql)).fetchone()
             delta_minutes = 30
             delta = timedelta(minutes=delta_minutes)
             current_utc = datetime.utcnow()
             if current_utc - most_recent > delta:
                 logger.critical(
                     'No %s streaming update in the past %s minutes!',
                     args['--status'], delta_minutes)
             else:
                 logger.info('Most recent %s streaming update is %s',
                             args['--status'],
                             str(most_recent) + ' (UTC)')
     elif args['--top-spreader'] is True:
         configure_logging('report.top-spreaders',
                           console_level=args['--console-log-level'],
                           file_level='WARNING')
         # try to create table
         if (Top20SpreaderMonthly.__table__.exists(bind=ENGINE)) is False:
             Top20SpreaderMonthly.__table__.create(bind=ENGINE)
         if args['--force-today'] is True:
             upper_day = datetime.utcnow().date()
         elif args['--upper-day'] is None:
             upper_day = datetime.utcnow().date() - timedelta(days=1)
         else:
             try:
                 upper_day = parse(args['--upper-day']).date()
             except Exception:
                 raise ValueError('Invalid date: %s' % args['--upper-day'])
         if args['--generate'] is True:
             logger.warning(
                 'Generating top spreaders for upper_day=%r ...',
                 upper_day)
             cls.generate_top_spreaders(session, upper_day)
         elif args['--look-up'] is True:
             cls.look_up_top_spreaders(session, upper_day,
                                       args['--most-recent'])
     elif args['--top-article'] is True:
         configure_logging('report.top-article',
                           console_level=args['--console-log-level'],
                           file_level='WARNING')
         # try to create table
         if (Top20ArticleMonthly.__table__.exists(bind=ENGINE)) is False:
             Top20ArticleMonthly.__table__.create(bind=ENGINE)
         if args['--force-today'] is True:
             upper_day = datetime.utcnow().date()
         elif args['--upper-day'] is None:
             upper_day = datetime.utcnow().date() - timedelta(days=1)
         else:
             try:
                 upper_day = parse(args['--upper-day']).date()
             except Exception:
                 raise ValueError('Invalid date: %s' % args['--upper-day'])
         if args['--generate'] is True:
             logger.warning('Generating top articles for upper_day=%r ...',
                            upper_day)
             cls.generate_top_articles(session, upper_day)
         elif args['--look-up'] is True:
             cls.look_up_top_articles(session, upper_day,
                                      args['--most-recent'])
     session.close()
Example #10
    def run(cls, args):
        """Overriding method as the entry point of this command."""
        try:
            args = cls.args_schema.validate(args)
        except SchemaError as e:
            raise SystemExit(e)

        session = Session(expire_on_commit=False)
        # session = Session()
        where_expr = args['--where-expr']
        ob_expr = args.get('--order-by', 'asc')
        limit = args['--limit']
        # --fetch-url
        if args['--fetch-url'] is True:
            configure_logging('crawl.fetch-url',
                              console_level='DEBUG',
                              file_level='WARNING')
            purpose = 'update' if args['--update'] is True else 'archive'
            if where_expr is None:
                where_expr = [text(DEFAULT_WHERE_EXPR_FETCH_URL)]
            else:
                where_expr = [text(where_expr)]
            ob_expr = Site.id.asc() if ob_expr == 'asc' else Site.id.desc()
            msites = get_msites(session,
                                f_expr=where_expr,
                                ob_expr=ob_expr,
                                limit=limit)
            if len(msites) == 0:
                logger.warning("None sites you queried found in DB!")
                raise SystemExit(2)
            platform_id = get_platform_id(session, name=N_PLATFORM_WEB)
            # detach msites and mplatform from session,
            # since they definitely would not be modified in session
            for ms in msites:
                session.expunge(ms)
            logger.warning('Starting crawling process to fetch URL update ...')
            cls.fetch_url(session, msites, platform_id, purpose)
        elif args['--fetch-html'] is True:
            configure_logging('crawl.fetch-html',
                              console_level='DEBUG',
                              file_level='WARNING')
            if not session.query(Site.id).count() > 0:
                raise SystemExit('Your site table is empty!')
            q = session.query(Url.id, Url.raw)
            if where_expr is None:
                where_expr = [text(DEFAULT_WHERE_EXPR_FETCH_HTML)]
            else:
                where_expr = [text(where_expr)]
            ob_expr = Url.id.asc() if ob_expr == 'asc' else Url.id.desc()
            q = q.filter(*where_expr).order_by(ob_expr)
            if limit is not None:
                q = q.limit(limit)
            logger.info(
                q.statement.compile(compile_kwargs={"literal_binds": True}))
            url_tuples = q.all()
            if not url_tuples:
                logger.warning('No such URLs in DB!')
                raise SystemExit(2)
            logger.warning('Starting crawling process to fetch HTML ...')
            cls.fetch_html(session, url_tuples)
        # --parse-article
        elif args['--parse-article'] is True:
            configure_logging('crawl.parse-article',
                              console_level='DEBUG',
                              file_level='WARNING')
            q = session.query(Url.id, Url.created_at, Url.date_published,
                              Url.canonical, Url.site_id)
            if where_expr is None:
                where_expr = [text(DEFAULT_WHERE_EXPR_PARSE_ARTICLE)]
            else:
                where_expr = [text(where_expr)]
            ob_expr = Url.id.asc() if ob_expr == 'asc' else Url.id.desc()
            q = q.filter(*where_expr).order_by(ob_expr)
            if limit is not None:
                q = q.limit(limit)
            logger.info(
                q.statement.compile(compile_kwargs={"literal_binds": True}))
            url_tuples = q.all()
            if not url_tuples:
                logger.warning('No URLs found from DB!')
                raise SystemExit(2)
            logger.warning('Starting crawling process to parse article ...')
            cls.parse_article(session, url_tuples)
        session.close()
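
Note that --where-expr is wrapped in sqlalchemy.text() and applied verbatim as a filter, so it accepts raw SQL against the queried table. An illustrative docopt-style args dict for the --fetch-html branch (the class name and option values are assumptions):

# Hypothetical invocation of the crawl command's --fetch-html branch.
args = {
    '--fetch-url': False,
    '--fetch-html': True,
    '--parse-article': False,
    '--where-expr': "url.created_at > '2018-01-01'",   # raw SQL, wrapped in text()
    '--order-by': 'asc',
    '--limit': '1000',
    '--update': False,
}
CrawlCmd.run(args)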
Example #11
            break
        # q = """
        #     SELECT tw.id, tw.json_data, u.id, u.raw
        #     FROM tweet AS tw
        #     LEFT JOIN ass_tweet_url AS atu ON atu.tweet_id=tw.id
        #     LEFT JOIN url AS u ON u.id=atu.url_id
        #     WHERE tw.raw_id in (894686360900177920)
        #     """
        for tw_id, jd, url_id, url in engine.execute(
                text(q).bindparams(l=w_open_left, r=w_close_right)):
                # text(q)):
            jds[tw_id] = jd
            if url_id is not None:
                g_urls_map[url] = url_id
        w_open_left = w_close_right
        w_close_right += window_size
        # import pdb; pdb.set_trace()
        parser.bulk_parse_and_save(session=session, jds=jds,
                                   existed_tweets=True,
                                   g_urls_map=g_urls_map)
        # break

if __name__ == '__main__':
    # setting sqlalchemy logging
    # logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger = logging.getLogger()
    logging.basicConfig(level='INFO')
    session = Session()
    main_test(ENGINE, session, min_id=0, window_size=10000, drop_first=True)
    # main_test(ENGINE, session, window_size=1000, drop_first=False)
Example #12
    def run(cls, args):
        """Overriding method as the entry point of this command."""
        session = Session(expire_on_commit=False)
        # session = Session()
        # expand user home for the file
        if args['<file>'] is not None:
            args['<file>'] = os.path.expanduser(args['<file>'])
        # --load-domains commands
        if args['--load-domains'] is True:
            configure_logging(
                'site.load-domains',
                console_level=args['--console-log-level'],
                file_level='WARNING')
            fn = args.get('<file>', join(HOAXY_HOME, 'domains.txt'))
            logger.info('Loading data from file %r', fn)
            cls.load_domains(
                session,
                fn,
                site_type=args['--site-type'],
                ignore_inactive=args['--ignore-inactive'],
                force_inactive=args['--force-inactive'],
                ignore_redirected=args['--ignore-redirected'])
        # --load-sites commands
        elif args['--load-sites'] is True:
            configure_logging(
                'site.load-sites',
                console_level=args['--console-log-level'],
                file_level='WARNING')
            fn = args.get('<file>', join(HOAXY_HOME, 'sites.yaml'))
            logger.info('Loading data from file %r', fn)
            cls.load_sites(
                session,
                fn,
                ignore_inactive=args['--ignore-inactive'],
                force_inactive=args['--force-inactive'],
                ignore_redirected=args['--ignore-redirected'])
        # --add commands
        elif args['--add'] is True:
            configure_logging(
                'site.add',
                console_level=args['--console-log-level'],
                file_level='WARNING')
            msite = qquery_msite(session, domain=args['--domain'])
            if msite is not None:
                logger.warning('Site %s already exists!', args['--domain'])
            else:
                cls.add_site(
                    session,
                    domain=args['--domain'],
                    site_type=args['--site-type'],
                    name=args['--name'],
                    tag_source=args['--tag-source'],
                    site_tags=args['--site-tag'],
                    alternate_domains=args['--alternate-domain'],
                    ignore_inactive=args['--ignore-inactive'],
                    force_inactive=args['--force-inactive'],
                    ignore_redirected=args['--ignore-redirected'])
        # --add-site-tags
        elif args['--add-site-tags'] is True:
            configure_logging(
                'site.add-site-tags',
                console_level=args['--console-log-level'],
                file_level='WARNING')
            if args['--name'] is not None:
                site_identity = args['--name']
            else:
                site_identity = args['--domain']
            msite = qquery_msite(
                session, name=args['--name'], domain=args['--domain'])
            if msite is None:
                logger.warning('Site %s does not exist!', site_identity)
            else:
                cls.add_site_tags(session, msite, args['--tag-source'],
                                  args['--site-tag'])
        # --replace-site-tags
        elif args['--replace-site-tags'] is True:
            configure_logging(
                'site.replace-site-tags',
                console_level=args['--console-log-level'],
                file_level='WARNING')
            if args['--name'] is not None:
                site_identity = args['--name']
            else:
                site_identity = args['--domain']
            msite = qquery_msite(
                session, name=args['--name'], domain=args['--domain'])
            if msite is None:
                logger.warning('Site %s does not exist!', site_identity)
            else:
                cls.replace_site_tags(session, msite, args['--tag-source'],
                                      args['--site-tag'])
        # --add-alternate-domains
        elif args['--add-alternate-domains'] is True:
            configure_logging(
                'site.add-alternate-domains',
                console_level=args['--console-log-level'],
                file_level='WARNING')
            if args['--name'] is not None:
                site_identity = args['--name']
            else:
                site_identity = args['--domain']
            msite = qquery_msite(
                session, name=args['--name'], domain=args['--domain'])
            if msite is None:
                logger.warning('Site %s does not exist!', site_identity)
            else:
                cls.add_alternate_domains(session, msite,
                                          args['--alternate-domain'])
        # --replace-alternate-domains
        elif args['--replace-alternate-domains'] is True:
            configure_logging(
                'site.replace-alternate-domains',
                console_level=args['--console-log-level'],
                file_level='WARNING')
            if args['--name'] is not None:
                site_identity = args['--name']
            else:
                site_identity = args['--domain']
            msite = qquery_msite(
                session, name=args['--name'], domain=args['--domain'])
            if msite is None:
                logger.warning('Site %s does not exist!', site_identity)
            else:
                cls.replace_alternate_domains(session, msite,
                                              args['--alternate-domain'])
        elif args['--disable'] is True:
            configure_logging(
                'site.disable',
                console_level=args['--console-log-level'],
                file_level='WARNING')
            if args['--name'] is not None:
                site_identity = args['--name']
            else:
                site_identity = args['--domain']
            msite = qquery_msite(
                session, name=args['--name'], domain=args['--domain'])
            if msite is None:
                logger.warning('Site %s does not exist!', site_identity)
            else:
                cls.disable_site(session, msite)
        elif args['--enable'] is True:
            configure_logging(
                'site.enable',
                console_level=args['--console-log-level'],
                file_level='WARNING')
            if args['--name'] is not None:
                site_identity = args['--name']
            else:
                site_identity = args['--domain']
            msite = qquery_msite(
                session, name=args['--name'], domain=args['--domain'])
            if msite is None:
                logger.warning('Site %s does not exist!', site_identity)
            else:
                cls.enable_site(session, msite)
        # --status
        elif args['--status'] is True:
            configure_logging(
                'site.status',
                console_level=args['--console-log-level'],
                file_level='WARNING')
            if args['--include-disabled'] is True:
                cls.site_status(session, True)
            else:
                cls.site_status(session, False)
        # --dump
        elif args['--dump'] is True:
            configure_logging(
                'site.status',
                console_level=args['--console-log-level'],
                file_level='INFO')
            cls.dump(session, args['<file>'])

        session.close()
Example #13
def workers_queue(pid, q1, q2):
    """Receiving parameters from q1, then computing and finally put results
       into q2
    """
    engine = create_engine(URL(**CONF['database']['connect_args']),
                           pool_size=1,
                           pool_recycle=CONF['database']['pool_recycle'],
                           client_encoding='utf8')
    Session = scoped_session(sessionmaker(bind=engine))
    session = Session()
    parser = BulkParser(platform_id=1, save_none_url_tweet=True)

    while True:
        try:
            data = q1.get(timeout=1)
        except Empty:
            logger.info('Worker process %s: queue has been empty for 1 second', pid)
            q2.put((pid, 'STOP', None))
            break
        if data == 'STOP':
            logger.info('Worker process %s: STOP sign received from q1!', pid)
            q1.put('STOP')
            q2.put((pid, 'STOP', None))
            break
        else:
            logger.info('Worker process %s: data=%s received', pid, data)
        w_open_left, w_close_right = data
        jds = dict()
        g_urls_map = dict()
        query = """
            SELECT tw.id, tw.json_data, u.id, u.raw
            FROM tweet AS tw
            LEFT JOIN ass_tweet_url AS atu ON atu.tweet_id=tw.id
            LEFT JOIN url AS u ON u.id=atu.url_id
            WHERE tw.id>:l AND tw.id<=:r
            """
        for tw_id, jd, url_id, url in engine.execute(
                text(query).bindparams(l=w_open_left, r=w_close_right)):
            jds[tw_id] = jd
            if url_id is not None:
                g_urls_map[url] = url_id
        g_uusers_set = set()
        g_edges_set = set()
        for tw_id, jd in jds.items():
            parser.parse_existed_one(tw_id,
                                     jd,
                                     session,
                                     g_urls_map=g_urls_map,
                                     g_uusers_set=g_uusers_set,
                                     g_edges_set=g_edges_set)
        edges = [
            dict(tweet_raw_id=t0,
                 from_raw_id=t1,
                 to_raw_id=t2,
                 url_id=t3,
                 is_quoted_url=t4,
                 is_mention=t5,
                 tweet_type=t6) for t0, t1, t2, t3, t4, t5, t6 in g_edges_set
            if t3 != -1
        ]
        uusers = [dict(raw_id=t1, screen_name=t2) for t1, t2 in g_uusers_set]
        # session.bulk_insert_mappings(TwitterNetworkEdge, edges)
        stmt_do_nothing = insert(TwitterNetworkEdge).values(
            edges).on_conflict_do_nothing(index_elements=[
                'tweet_raw_id', 'from_raw_id', 'to_raw_id', 'url_id',
                'is_quoted_url', 'is_mention', 'tweet_type'
            ])
        session.execute(stmt_do_nothing)
        session.commit()
        q2.put((pid, 'RUN', uusers))
        logger.info('Worker process %s: tweets from %s to %s done', pid,
                    w_open_left + 1, w_close_right)
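
workers_queue pairs with the saver_queue function from Example #5: each worker parses one window of tweet ids and pushes the unique users it found onto q2, where a single saver process upserts them. A rough wiring sketch, assuming multiprocessing queues and an illustrative id range and window size:

from multiprocessing import Process, Queue

number_of_workers = 4
q1, q2 = Queue(), Queue()

# Illustrative windows; real code would derive the maximum id from the tweet table.
for left in range(0, 1000000, 10000):
    q1.put((left, left + 10000))
q1.put('STOP')

workers = [Process(target=workers_queue, args=(pid, q1, q2))
           for pid in range(number_of_workers)]
saver = Process(target=saver_queue, args=(q2, number_of_workers))
for p in workers + [saver]:
    p.start()
for p in workers + [saver]:
    p.join()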
Example #14
    def _monitor(self):
        """Monitor the queue for tweet incoming and then parse and save it into
        the database.

        This method runs on a separate, internal thread.
        The thread will terminate if it sees a sentinel object in the queue.
        """
        # scoped_session
        # Session itself is not thread safe, so we use scoped_session.
        # Each thread uses only one scoped_session object.
        # We never delete anything from the database in this function.
        # Set expire_on_commit=False to avoid re-fetching these existing objects.
        session = Session(expire_on_commit=False)
        parser = Parser(**self.parser_kwargs)
        platform_id = get_platform_id(session, name=N_PLATFORM_TWITTER)
        has_task_done = hasattr(self.queue, 'task_done')
        while not self._stop.isSet():
            if self.is_connection_failed is True:
                self.on_db_server_down(session)
                continue
            # normal bulk insert process
            try:
                # fill the bucket
                for i in range(self.bucket_size):
                    # dequeue with block=True
                    jd = self.queue.get(True)
                    if has_task_done is True:
                        self.queue.task_done()
                    if jd is not self._sentinel:
                        self.global_counter += 1
                        self.bucket.append(jd)
                    else:
                        break
                # consume this bucket
                self.consume_this_bucket(parser, session, platform_id)
                self.bucket = []
            # database is shutdown unexpectedly
            except OperationalError as err:
                session.rollback()
                if 'server closed the connection unexpectedly' in repr(
                        err) or 'could not connect to server' in repr(err):
                    logger.critical('Caution: database server is down!')
                    self.is_connection_failed = True
                else:
                    logger.error(err)
                    self.on_db_bulk_save_error()
            except SQLAlchemyError as err:
                session.rollback()
                logger.exception(err)
                self.on_db_bulk_save_error()
            except BaseException as err:
                # unexpected exception, logging (will exit)
                logger.exception(err)
                raise
        # There might still be records in the queue.
        while True:
            try:
                jd = self.queue.get(False)
                if has_task_done:
                    self.queue.task_done()
                if jd is self._sentinel:
                    break
                self.bucket.append(jd)
            except queue.Empty:
                break
        if self.bucket:
            try:
                self.consume_this_bucket(parser, session, platform_id)
                self.bucket = []
            except SQLAlchemyError as err:
                session.rollback()
                logger.exception('Consumer thread: %s', err)
                self.on_db_bulk_save_error()
        if self._fp_db_down is not None:
            self._fp_db_down.close()
        if self._fp_db_bulk_save is not None:
            self._fp_db_bulk_save.close()
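
Both _monitor variants (Examples #8 and #14) exit their main loop when they dequeue the sentinel object and then drain whatever is left in the queue. A minimal producer-side sketch of that handshake, assuming a hypothetical consumer instance that exposes the queue, _sentinel, and _monitor attributes used above, and an iterable of tweet JSON dicts:

import threading

# Hypothetical wiring; everything except `queue`, `_sentinel`, and `_monitor`
# is an assumption made for illustration.
monitor_thread = threading.Thread(target=consumer._monitor, daemon=True)
monitor_thread.start()
for jd in incoming_tweets:                   # producer side
    consumer.queue.put(jd)
consumer.queue.put(consumer._sentinel)       # signals _monitor to exit its main loop
monitor_thread.join()                        # _monitor drains leftovers before returning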