Example #1
0
    def run(cls, args):
        """Overriding method as the entry point of this command.

        Validates ``args`` against ``cls.args_schema`` and then runs the
        first selected action among ``--fetch-url``, ``--fetch-html`` and
        ``--parse-article``. The rows to work on are selected with the
        optional ``--where-expr`` (a raw SQL condition, falling back to a
        per-action default), ``--order-by`` and ``--limit`` arguments.

        Parameters
        ----------
        args : dict
            The parsed command-line arguments.

        Raises
        ------
        SystemExit
            If the arguments fail schema validation, if the site table is
            empty (``--fetch-html``), or, with exit code 2, if the query
            selects no rows.
        """
        try:
            args = cls.args_schema.validate(args)
        except SchemaError as e:
            raise SystemExit(e)

        session = Session(expire_on_commit=False)
        # Every early `raise SystemExit` below (and any error raised by the
        # worker methods) must still release the session, hence the
        # try/finally instead of a trailing `session.close()`.
        try:
            where_expr = args['--where-expr']
            ob_expr = args.get('--order-by', 'asc')
            limit = args['--limit']
            # --fetch-url
            if args['--fetch-url'] is True:
                configure_logging('crawl.fetch-url',
                                  console_level='DEBUG',
                                  file_level='WARNING')
                purpose = 'update' if args['--update'] is True else 'archive'
                # NOTE: --where-expr is passed to the database as raw SQL.
                if where_expr is None:
                    where_expr = [text(DEFAULT_WHERE_EXPR_FETCH_URL)]
                else:
                    where_expr = [text(where_expr)]
                if ob_expr == 'asc':
                    ob_expr = Site.id.asc()
                else:
                    ob_expr = Site.id.desc()
                msites = get_msites(session,
                                    f_expr=where_expr,
                                    ob_expr=ob_expr,
                                    limit=limit)
                if len(msites) == 0:
                    logger.warning("None sites you queried found in DB!")
                    raise SystemExit(2)
                platform_id = get_platform_id(session, name=N_PLATFORM_WEB)
                # detach msites from session,
                # since they definitely would not be modified in session
                for ms in msites:
                    session.expunge(ms)
                logger.warning(
                    'Starting crawling process to fetch URL update ...')
                cls.fetch_url(session, msites, platform_id, purpose)
            # --fetch-html
            elif args['--fetch-html'] is True:
                configure_logging('crawl.fetch-html',
                                  console_level='DEBUG',
                                  file_level='WARNING')
                if not session.query(Site.id).count() > 0:
                    raise SystemExit('Your site table is empty!')
                q = session.query(Url.id, Url.raw)
                if where_expr is None:
                    where_expr = [text(DEFAULT_WHERE_EXPR_FETCH_HTML)]
                else:
                    where_expr = [text(where_expr)]
                if ob_expr == 'asc':
                    ob_expr = Url.id.asc()
                else:
                    ob_expr = Url.id.desc()
                q = q.filter(*where_expr).order_by(ob_expr)
                if limit is not None:
                    q = q.limit(limit)
                # Log the fully rendered SQL so the selection can be checked.
                logger.info(
                    q.statement.compile(
                        compile_kwargs={"literal_binds": True}))
                url_tuples = q.all()
                if not url_tuples:
                    logger.warning('No such URLs in DB!')
                    raise SystemExit(2)
                logger.warning('Staring crawling process to fetch HTML ...')
                cls.fetch_html(session, url_tuples)
            # --parse-article
            elif args['--parse-article'] is True:
                configure_logging('crawl.parse-article',
                                  console_level='DEBUG',
                                  file_level='WARNING')
                q = session.query(Url.id, Url.created_at, Url.date_published,
                                  Url.canonical, Url.site_id)
                if where_expr is None:
                    where_expr = [text(DEFAULT_WHERE_EXPR_PARSE_ARTICLE)]
                else:
                    where_expr = [text(where_expr)]
                if ob_expr == 'asc':
                    ob_expr = Url.id.asc()
                else:
                    ob_expr = Url.id.desc()
                q = q.filter(*where_expr).order_by(ob_expr)
                if limit is not None:
                    q = q.limit(limit)
                # Log the fully rendered SQL so the selection can be checked.
                logger.info(
                    q.statement.compile(
                        compile_kwargs={"literal_binds": True}))
                url_tuples = q.all()
                if not url_tuples:
                    logger.warning('No URLs found from DB!')
                    raise SystemExit(2)
                logger.warning(
                    'Starting crawling process to parse article ...')
                cls.parse_article(session, url_tuples)
        finally:
            session.close()
Example #2
0
 def run(cls, args):
     """Overriding method as the entry point of this command.

     Runs the first selected report among ``--volume``, ``--status``,
     ``--top-spreader`` and ``--top-article``.

     Parameters
     ----------
     args : dict
         The parsed command-line arguments.

     Raises
     ------
     SystemExit
         With exit code 2 when ``--table``, ``--interval``, ``--limit`` or
         ``--status`` holds an unsupported value.
     ValueError
         When ``--upper-day`` cannot be parsed as a date.
     """
     session = Session()

     def resolve_upper_day():
         """Return the upper day from --force-today / --upper-day."""
         if args['--force-today'] is True:
             return datetime.utcnow().date()
         if args['--upper-day'] is None:
             # Default to yesterday (UTC).
             return datetime.utcnow().date() - timedelta(days=1)
         try:
             return parse(args['--upper-day']).date()
         except Exception:
             # Format the message here: passing the value as a second
             # argument to ValueError leaves the '%s' uninterpolated.
             raise ValueError('Invalid date: %s' % args['--upper-day'])

     # The `sys.exit` calls below raise SystemExit; the try/finally makes
     # sure the session is released on those paths too.
     try:
         if args['--volume'] is True:
             configure_logging('report.volume',
                               console_level=args['--console-log-level'],
                               file_level='WARNING')
             table_names = ['tweet', 'url', 'article']
             table = args['--table']
             if table not in table_names:
                 logger.critical('Available tables are: %s', table_names)
                 sys.exit(2)
             interval_names = [
                 'minute', 'hour', 'day', 'week', 'month', 'quarter', 'year'
             ]
             interval = args['--interval']
             if interval not in interval_names:
                 logger.critical('Available intervals are: %s',
                                 interval_names)
                 sys.exit(2)
             limit = args['--limit']
             if int(limit) <= 0:
                 logger.critical('%r should larger than 0', limit)
                 sys.exit(2)
             # The table name cannot be a bound parameter; interpolating it
             # is safe only because it was checked against `table_names`.
             sql = """
             SELECT count(id) as agg_num,
                 date_trunc(:interval, created_at) as interval
             FROM %s
             GROUP BY interval
             ORDER BY interval DESC
             LIMIT :limit""" % table
             stmt = text(sql).bindparams(interval=interval, limit=limit)
             strf = '%Y-%m-%d %H:%M:%S'
             with ENGINE.connect() as conn:
                 result = conn.execute(stmt).fetchall()
                 rule = '-' * 35
                 print(rule)
                 print('{0:^20s} | {1:12s}'.format(
                     'Timeline (%s)' % interval, 'Aggregation'))
                 print(rule)
                 for agg_num, bucket in result:
                     print('{0:^20s} | {1:8d}'.format(
                         bucket.strftime(strf), agg_num))
                 print(rule)
         elif args['--status']:
             configure_logging('report.streaming-status',
                               console_level=args['--console-log-level'])
             table_name = None
             if args['--status'] == 'twitter':
                 table_name = 'tweet'
             if table_name is None:
                 logger.critical('SNS %r has not been implemented!',
                                 args['--status'])
                 sys.exit(2)
             # Use the resolved table instead of a hard-coded 'tweet'.
             sql = 'SELECT created_at FROM {} ORDER BY id DESC LIMIT 1'.format(
                 table_name)
             with ENGINE.connect() as conn:
                 row = conn.execute(text(sql)).fetchone()
                 # An empty table yields no row; treat it as "no update"
                 # instead of failing while unpacking None.
                 most_recent = row[0] if row is not None else None
                 delta_minutes = 30
                 delta = timedelta(minutes=delta_minutes)
                 # NOTE(review): compares against a naive UTC "now", so
                 # `created_at` is presumably stored as naive UTC - confirm.
                 current_utc = datetime.utcnow()
                 if most_recent is None or current_utc - most_recent > delta:
                     logger.critical(
                         'No %s streaming update in the past %s minutes!',
                         args['--status'], delta_minutes)
                 else:
                     logger.info('Most recent %s streaming update is %s',
                                 args['--status'],
                                 str(most_recent) + ' (UTC)')
         elif args['--top-spreader'] is True:
             configure_logging('report.top-spreaders',
                               console_level=args['--console-log-level'],
                               file_level='WARNING')
             # try to create table
             if (Top20SpreaderMonthly.__table__.exists(bind=ENGINE)) is False:
                 Top20SpreaderMonthly.__table__.create(bind=ENGINE)
             upper_day = resolve_upper_day()
             if args['--generate'] is True:
                 logger.warning(
                     'Generating top spreaders for uppder_day=%r ...',
                     upper_day)
                 cls.generate_top_spreaders(session, upper_day)
             elif args['--look-up'] is True:
                 cls.look_up_top_spreaders(session, upper_day,
                                           args['--most-recent'])
         elif args['--top-article'] is True:
             configure_logging('report.top-article',
                               console_level=args['--console-log-level'],
                               file_level='WARNING')
             # try to create table
             if (Top20ArticleMonthly.__table__.exists(bind=ENGINE)) is False:
                 Top20ArticleMonthly.__table__.create(bind=ENGINE)
             upper_day = resolve_upper_day()
             if args['--generate'] is True:
                 logger.warning(
                     'Generating top articles for uppder_day=%r ...',
                     upper_day)
                 cls.generate_top_articles(session, upper_day)
             elif args['--look-up'] is True:
                 cls.look_up_top_articles(session, upper_day,
                                          args['--most-recent'])
     finally:
         session.close()
Example #3
0
    def run(cls, args):
        """Overriding method as the entry point of this command.

        Runs the first selected site-management action: ``--load-domains``,
        ``--load-sites``, ``--add``, ``--add-site-tags``,
        ``--replace-site-tags``, ``--add-alternate-domains``,
        ``--replace-alternate-domains``, ``--disable``, ``--enable``,
        ``--status`` or ``--dump``.

        Parameters
        ----------
        args : dict
            The parsed command-line arguments. ``args['<file>']`` is
            rewritten in place with the user home directory expanded.
        """
        session = Session(expire_on_commit=False)

        def find_site():
            """Return the site matching --name/--domain, warning if absent."""
            msite = qquery_msite(
                session, name=args['--name'], domain=args['--domain'])
            if msite is None:
                if args['--name'] is not None:
                    site_identity = args['--name']
                else:
                    site_identity = args['--domain']
                logger.warning('Site %s does not exist!', site_identity)
            return msite

        def input_file(default_name):
            """Return the <file> argument, or the default under HOAXY_HOME."""
            # The '<file>' key is always present (it is indexed directly
            # below), so `args.get('<file>', default)` could never fall back
            # to the default; test for None explicitly instead.
            if args['<file>'] is not None:
                return args['<file>']
            return join(HOAXY_HOME, default_name)

        # Release the session even when one of the actions raises.
        try:
            # expand user home for the file
            if args['<file>'] is not None:
                args['<file>'] = os.path.expanduser(args['<file>'])
            # --load-domains commands
            if args['--load-domains'] is True:
                configure_logging(
                    'site.load-domains',
                    console_level=args['--console-log-level'],
                    file_level='WARNING')
                fn = input_file('domains.txt')
                logger.info('Loading data from file %r', fn)
                cls.load_domains(
                    session,
                    fn,
                    site_type=args['--site-type'],
                    ignore_inactive=args['--ignore-inactive'],
                    force_inactive=args['--force-inactive'],
                    ignore_redirected=args['--ignore-redirected'])
            # --load-sites commands
            elif args['--load-sites'] is True:
                configure_logging(
                    'site.load-sites',
                    console_level=args['--console-log-level'],
                    file_level='WARNING')
                fn = input_file('sites.yaml')
                logger.info('Loading data from file %r', fn)
                cls.load_sites(
                    session,
                    fn,
                    ignore_inactive=args['--ignore-inactive'],
                    force_inactive=args['--force-inactive'],
                    ignore_redirected=args['--ignore-redirected'])
            # --add commands
            elif args['--add'] is True:
                configure_logging(
                    'site.add',
                    console_level=args['--console-log-level'],
                    file_level='WARNING')
                # Unlike the other actions, --add looks up by domain only.
                msite = qquery_msite(session, domain=args['--domain'])
                if msite is not None:
                    logger.warning('Site %s already exists!',
                                   args['--domain'])
                else:
                    cls.add_site(
                        session,
                        domain=args['--domain'],
                        site_type=args['--site-type'],
                        name=args['--name'],
                        tag_source=args['--tag-source'],
                        site_tags=args['--site-tag'],
                        alternate_domains=args['--alternate-domain'],
                        ignore_inactive=args['--ignore-inactive'],
                        force_inactive=args['--force-inactive'],
                        ignore_redirected=args['--ignore-redirected'])
            # --add-site-tags
            elif args['--add-site-tags'] is True:
                configure_logging(
                    'site.add-site-tags',
                    console_level=args['--console-log-level'],
                    file_level='WARNING')
                msite = find_site()
                if msite is not None:
                    cls.add_site_tags(session, msite, args['--tag-source'],
                                      args['--site-tag'])
            # --replace-site-tags
            elif args['--replace-site-tags'] is True:
                # NOTE(review): 'repalce' looks like a typo of 'replace';
                # kept as is because the name is a runtime logging
                # identifier - confirm before renaming.
                configure_logging(
                    'site.repalce-site-tags',
                    console_level=args['--console-log-level'],
                    file_level='WARNING')
                msite = find_site()
                if msite is not None:
                    cls.replace_site_tags(session, msite,
                                          args['--tag-source'],
                                          args['--site-tag'])
            # --add-alternate-domains
            elif args['--add-alternate-domains'] is True:
                configure_logging(
                    'site.add-alternate-domains',
                    console_level=args['--console-log-level'],
                    file_level='WARNING')
                msite = find_site()
                if msite is not None:
                    cls.add_alternate_domains(session, msite,
                                              args['--alternate-domain'])
            # --replace-alternate-domains
            elif args['--replace-alternate-domains'] is True:
                configure_logging(
                    'site.replace-alternate-domains',
                    console_level=args['--console-log-level'],
                    file_level='WARNING')
                msite = find_site()
                if msite is not None:
                    cls.replace_alternate_domains(session, msite,
                                                  args['--alternate-domain'])
            # --disable
            elif args['--disable'] is True:
                configure_logging(
                    'site.disable',
                    console_level=args['--console-log-level'],
                    file_level='WARNING')
                msite = find_site()
                if msite is not None:
                    cls.disable_site(session, msite)
            # --enable
            elif args['--enable'] is True:
                configure_logging(
                    'site.enable',
                    console_level=args['--console-log-level'],
                    file_level='WARNING')
                msite = find_site()
                if msite is not None:
                    cls.enable_site(session, msite)
            # --status
            elif args['--status'] is True:
                configure_logging(
                    'site.status',
                    console_level=args['--console-log-level'],
                    file_level='WARNING')
                cls.site_status(session, args['--include-disabled'] is True)
            # --dump
            elif args['--dump'] is True:
                # NOTE(review): reuses the 'site.status' logging name,
                # possibly a copy-paste leftover - confirm.
                configure_logging(
                    'site.status',
                    console_level=args['--console-log-level'],
                    file_level='INFO')
                cls.dump(session, args['<file>'])
        finally:
            session.close()