Exemple #1
0
    def __init__(self, session, url_tuples, *args, **kwargs):
        """Initialize an ArticleParserSpider.

        Parameters
        ----------
        session : obj
            A SQLAlchemy session instance.
        url_tuples : list
            Tuples of (id, created_at, date_published, canonical_url,
            site_id), a URL collection fetched from the database.
        node_path : string
            Keyword argument: path of the node executable.
        mercury_parser_path : string
            Keyword argument: location of
            <hoaxy-backened>/hoaxy/node_scripts/parse_with_mercury.js.
        """
        # Consume our custom keyword arguments before delegating the
        # remainder to the parent spider constructor.
        node_path = kwargs.pop('node_path')
        mercury_path = kwargs.pop('mercury_parser_path')
        configure_logging(
            'crawl.parse-article',
            console_level='CRITICAL',
            file_level='WARNING')
        self.session = session
        self.url_tuples = url_tuples
        self.node_path = node_path
        self.mercury_parser_installation_path = mercury_path
        super(ArticleParserSpider, self).__init__(*args, **kwargs)
Exemple #2
0
 def run(cls, args):
     """Entry point of this command (overriding method)."""
     session = Session(expire_on_commit=False)
     # Guard clause: nothing to do unless streaming was requested.
     if args['--twitter-streaming'] is not True:
         return
     configure_logging('twitter.streaming',
                       console_level=args['--console-log-level'])
     cls.twitter_stream(session, args)
Exemple #3
0
    def init(cls, session, force_drop,
             ignore_inactive=False,
             force_inactive=False,
             ignore_redirected=False):
        """Create database tables and load the initial platform/site data.

        Parameters
        ----------
        session : obj
            A SQLAlchemy session instance.
        force_drop : bool
            When True, drop all existing tables before recreating them.
        ignore_inactive : bool
            Forwarded to the SiteCmd loading helpers.
        force_inactive : bool
            Forwarded to the SiteCmd loading helpers.
        ignore_redirected : bool
            Forwarded to the SiteCmd loading helpers.
        """
        configure_logging('init',
                          console_level='INFO',
                          file_level='WARNING')
        # Timestamp taken before any writes so we can list exactly the
        # Site rows this run created or updated.
        dt_before = datetime.utcnow()
        # Consistency fix: use the module-level `logger` throughout instead
        # of mixing in the root logger via `logging.*`, so every message
        # goes through the handlers configure_logging() just installed.
        logger.info('Creating database tables:')
        if force_drop is True:
            logger.warning('Existed tables would be dropped and recreated!')
            Base.metadata.drop_all(ENGINE)
        else:
            logger.warning('Ignore existed tables')
        Base.metadata.create_all(ENGINE)
        logger.info('Inserting platforms if not exist')
        get_or_create_m(session, Platform, TWITTER_PLATFORM_DICT, fb_uk='name')
        get_or_create_m(session, Platform, WEB_PLATFORM_DICT, fb_uk='name')
        logger.info('Trying to load site data:')
        dc_file = join(HOAXY_HOME, 'domains_claim.txt')
        df_file = join(HOAXY_HOME, 'domains_factchecking.txt')
        site_file = join(HOAXY_HOME, 'sites.yaml')
        if isfile(dc_file) is True:
            logger.info('Claim domains %s found', dc_file)
            SiteCmd.load_domains(session, dc_file, site_type='claim',
                                 ignore_inactive=ignore_inactive,
                                 force_inactive=force_inactive,
                                 ignore_redirected=ignore_redirected)
        else:
            logger.info('Claim domains %s not found', dc_file)
        if isfile(df_file) is True:
            logger.info('Fact checking domains %s found', df_file)
            SiteCmd.load_domains(session, df_file, site_type='fact_checking',
                                 ignore_inactive=ignore_inactive,
                                 force_inactive=force_inactive,
                                 ignore_redirected=ignore_redirected)
        else:
            logger.info('Fact checking domains %s not found', df_file)

        if isfile(site_file) is True:
            logger.info('Site file %s found', site_file)
            SiteCmd.load_sites(session, site_file,
                               ignore_inactive=ignore_inactive,
                               force_inactive=force_inactive,
                               ignore_redirected=ignore_redirected)
        else:
            logger.info('Site file %s not found', site_file)
        # Report every site added or touched since `dt_before`.
        sites = session.query(Site.domain, Site.site_type, Site.base_url
                              ).filter(or_(
                                  Site.created_at > dt_before,
                                  Site.updated_at > dt_before
                              )).order_by(Site.id).all()
        logger.info("Added or updated sites are:\n %s", pprint.pformat(sites))
        logger.info("Done.")
Exemple #4
0
 def run(cls, args):
     """Entry point of this command: build or query the Lucene index."""
     try:
         # print(args)
         args = cls.args_schema.validate(args)
     except SchemaError as err:
         sys.exit(err)
     session = Session()
     # The Lucene VM must be initialized and attached to this thread
     # before any indexing or searching can happen.
     lucene.initVM()
     lucene.getVMEnv().attachCurrentThread()
     if args['--index'] is True:
         configure_logging(
             'lucene.index', console_level=args['--console-log-level'])
         # Track the last indexed article.group_id in the meta_info table
         # so incremental runs can resume where they stopped.
         meta_gid = get_or_create_m(
             session,
             MetaInfo,
             data={
                 'name': 'article_group_id_lucene_index',
                 'value': '0',
                 'value_type': 'int',
                 'description': 'article.group_id used for lucene index',
             },
             fb_uk='name')
         if args['--mode'] == 'create':
             # A freshly created index starts again from group_id 0.
             meta_gid.set_value(0)
             session.commit()
         logger.debug('Indexing started.. Getting articles..')
         sql = """
         SELECT DISTINCT ON (a.group_id) a.id, a.group_id,
             a.canonical_url,
             a.title, a.meta, a.content,
             coalesce(a.date_published, a.date_captured) AS pd,
             s.domain, s.site_type
         FROM article AS a
             JOIN site AS s ON s.id=a.site_id
         WHERE a.site_id IS NOT NULL AND s.is_enabled IS TRUE
             AND a.group_id>:gid
         ORDER BY group_id, pd ASC
         """
         article_rows = session.execute(
             sqlalchemy.text(sql).bindparams(gid=meta_gid.get_value()))
         cls.index(session, args['--mode'], article_rows, meta_gid)
     elif args['--search'] is True:
         configure_logging(
             'lucene.search', console_level=args['--console-log-level'])
         cls.search(args['--query'], args['--top'])
     else:
         print("Unrecognized command!")
         sys.exit(2)
Exemple #5
0
 def run(cls, args):
     """Overriding method as the entry point of this command.

     Validates the docopt arguments and dispatches to the selected
     twitter sub-command.
     """
     try:
         args = cls.args_schema.validate(args)
     except SchemaError as e:
         # BUG FIX: `e` is an exception instance, and `'\n' + e` raised
         # TypeError, masking the real validation error. Convert to str
         # before concatenating.
         raise SystemExit('\n' + str(e) + '\n')
     session = Session(expire_on_commit=False)
     if args['--twitter-streaming'] is True:
         configure_logging('twitter.streaming')
         cls.twitter_stream(session, args)
     elif args['--load-tweets'] is True:
         configure_logging('twitter.load-tweets')
         cls.load_tweets(session, args)
     elif args['--reparse-db-tweets'] is True:
         configure_logging('twitter.reparse-db', file_level='WARNING')
         cls._test_table_names(session, args)
         cls.reparse_db(session, args)
Exemple #6
0
 def run(cls, args):
     """Overriding method as the entry point of this command.

     Dispatches to one of the report sub-commands: --volume, --status,
     --top-spreader or --top-article.
     """

     def _resolve_upper_day():
         # Shared by --top-spreader and --top-article: choose the last
         # day of the reporting window.
         if args['--force-today'] is True:
             return datetime.utcnow().date()
         if args['--upper-day'] is None:
             return datetime.utcnow().date() - timedelta(days=1)
         try:
             return parse(args['--upper-day']).date()
         except Exception:
             # BUG FIX: ValueError('msg', arg) stored the argument as a
             # second tuple element instead of interpolating it into the
             # message; use % formatting explicitly.
             raise ValueError('Invalid date: %s' % args['--upper-day'])

     session = Session()
     if args['--volume'] is True:
         configure_logging('report.volume',
                           console_level=args['--console-log-level'],
                           file_level='WARNING')
         table_names = ['tweet', 'url', 'article']
         table = args['--table']
         if table not in table_names:
             logger.critical('Available tables are: %s', table_names)
             sys.exit(2)
         interval_names = [
             'minute', 'hour', 'day', 'week', 'month', 'quarter', 'year'
         ]
         interval = args['--interval']
         if interval not in interval_names:
             logger.critical('Available intervals are: %s', interval_names)
             sys.exit(2)
         limit = args['--limit']
         if int(limit) <= 0:
             logger.critical('%r should larger than 0', limit)
             sys.exit(2)
         # `table` was validated against the whitelist above, so the %s
         # interpolation cannot inject arbitrary SQL.
         sql = """
         SELECT count(id) as agg_num,
             date_trunc(:interval, created_at) as interval
         FROM %s
         GROUP BY interval
         ORDER BY interval DESC
         LIMIT :limit""" % table
         stmt = text(sql).bindparams(interval=interval, limit=limit)
         strf = '%Y-%m-%d %H:%M:%S'
         with ENGINE.connect() as conn:
             result = conn.execute(stmt).fetchall()
             print(('-' * 35))
             print(('{0:^20s} | {1:12s}'.format('Timeline (%s)' % interval,
                                                'Aggregation')))
             print(('-' * 35))
             for v, t in result:
                 print(('{0:^20s} | {1:8d}'.format(t.strftime(strf), v)))
             print(('-' * 35))
     elif args['--status']:
         configure_logging('report.streaming-status',
                           console_level=args['--console-log-level'])
         table_name = None
         if args['--status'] == 'twitter':
             table_name = 'tweet'
         if table_name is None:
             logger.critical('SNS %r has not been implemented!',
                             args['--status'])
             sys.exit(2)
         # BUG FIX: query the resolved table_name, not the hard-coded
         # 'tweet' literal, so future SNS tables are handled correctly.
         sql = 'SELECT created_at FROM {} ORDER BY id DESC LIMIT 1'.format(
             table_name)
         with ENGINE.connect() as conn:
             most_recent, = conn.execute(text(sql)).fetchone()
             delta_minutes = 30
             delta = timedelta(minutes=delta_minutes)
             current_utc = datetime.utcnow()
             if current_utc - most_recent > delta:
                 logger.critical(
                     'No %s streaming update in the past %s minutes!',
                     args['--status'], delta_minutes)
             else:
                 logger.info('Most recent %s streaming update is %s',
                             args['--status'],
                             str(most_recent) + ' (UTC)')
     elif args['--top-spreader'] is True:
         configure_logging('report.top-spreaders',
                           console_level=args['--console-log-level'],
                           file_level='WARNING')
         # Create the result table lazily on first use.
         if (Top20SpreaderMonthly.__table__.exists(bind=ENGINE)) is False:
             Top20SpreaderMonthly.__table__.create(bind=ENGINE)
         upper_day = _resolve_upper_day()
         if args['--generate'] is True:
             # Typo fix: message used to read 'uppder_day'.
             logger.warning(
                 'Generating top spreaders for upper_day=%r ...',
                 upper_day)
             cls.generate_top_spreaders(session, upper_day)
         elif args['--look-up'] is True:
             cls.look_up_top_spreaders(session, upper_day,
                                       args['--most-recent'])
     elif args['--top-article'] is True:
         configure_logging('report.top-article',
                           console_level=args['--console-log-level'],
                           file_level='WARNING')
         # Create the result table lazily on first use.
         if (Top20ArticleMonthly.__table__.exists(bind=ENGINE)) is False:
             Top20ArticleMonthly.__table__.create(bind=ENGINE)
         upper_day = _resolve_upper_day()
         if args['--generate'] is True:
             logger.warning('Generating top articles for upper_day=%r ...',
                            upper_day)
             cls.generate_top_articles(session, upper_day)
         elif args['--look-up'] is True:
             cls.look_up_top_articles(session, upper_day,
                                      args['--most-recent'])
     session.close()
Exemple #7
0
    def run(cls, args):
        """Overriding method as the entry point of this command.

        Dispatches to --fetch-url, --fetch-html or --parse-article.
        """
        try:
            args = cls.args_schema.validate(args)
        except SchemaError as e:
            raise SystemExit(e)

        session = Session(expire_on_commit=False)
        # session = Session()
        where_expr = args['--where-expr']
        ob_expr = args.get('--order-by', 'asc')
        limit = args['--limit']

        def _url_tuples(q, default_where, empty_msg):
            # Shared by --fetch-html and --parse-article: apply the WHERE
            # clause (user-supplied or sub-command default), ordering and
            # limit, log the final SQL, and return the matching rows.
            # Exits with status 2 when nothing matches.
            filters = [text(where_expr if where_expr is not None
                            else default_where)]
            order = Url.id.asc() if ob_expr == 'asc' else Url.id.desc()
            q = q.filter(*filters).order_by(order)
            if limit is not None:
                q = q.limit(limit)
            logger.info(
                q.statement.compile(compile_kwargs={"literal_binds": True}))
            rows = q.all()
            if not rows:
                logger.warning(empty_msg)
                raise SystemExit(2)
            return rows

        # --fetch-url
        if args['--fetch-url'] is True:
            configure_logging('crawl.fetch-url',
                              console_level='DEBUG',
                              file_level='WARNING')
            purpose = 'update' if args['--update'] is True else 'archive'
            if where_expr is None:
                f_expr = [text(DEFAULT_WHERE_EXPR_FETCH_URL)]
            else:
                f_expr = [text(where_expr)]
            site_order = Site.id.asc() if ob_expr == 'asc' else Site.id.desc()
            msites = get_msites(session,
                                f_expr=f_expr,
                                ob_expr=site_order,
                                limit=limit)
            if len(msites) == 0:
                logger.warning("None sites you queried found in DB!")
                raise SystemExit(2)
            platform_id = get_platform_id(session, name=N_PLATFORM_WEB)
            # detach msites and mplatform from session,
            # since they definitely would not be modified in session
            for ms in msites:
                session.expunge(ms)
            logger.warning('Starting crawling process to fetch URL update ...')
            cls.fetch_url(session, msites, platform_id, purpose)
        elif args['--fetch-html'] is True:
            configure_logging('crawl.fetch-html',
                              console_level='DEBUG',
                              file_level='WARNING')
            if not session.query(Site.id).count() > 0:
                raise SystemExit('Your site table is empty!')
            url_tuples = _url_tuples(
                session.query(Url.id, Url.raw),
                DEFAULT_WHERE_EXPR_FETCH_HTML,
                'No such URLs in DB!')
            # Typo fix: this message used to read 'Staring crawling ...'.
            logger.warning('Starting crawling process to fetch HTML ...')
            cls.fetch_html(session, url_tuples)
        # --parse-article
        elif args['--parse-article'] is True:
            configure_logging('crawl.parse-article',
                              console_level='DEBUG',
                              file_level='WARNING')
            url_tuples = _url_tuples(
                session.query(Url.id, Url.created_at, Url.date_published,
                              Url.canonical, Url.site_id),
                DEFAULT_WHERE_EXPR_PARSE_ARTICLE,
                'No URLs found from DB!')
            logger.warning('Starting crawling process to parse article ...')
            cls.parse_article(session, url_tuples)
        session.close()
Exemple #8
0
def setup_logging():
    """Configure the API logger; runs once before the first request."""
    configure_logging('api', file_level='WARNING')
Exemple #9
0
    def run(cls, args):
        """Overriding method as the entry point of this command.

        Dispatches to the site management sub-commands: --load-domains,
        --load-sites, --add, tag/alternate-domain editing, --disable,
        --enable, --status and --dump.
        """
        session = Session(expire_on_commit=False)
        # session = Session()
        # expand user home for the file
        if args['<file>'] is not None:
            args['<file>'] = os.path.expanduser(args['<file>'])

        def _configure(channel, file_level='WARNING'):
            # Every sub-command configures logging the same way; only the
            # channel name (and, for --dump, the file level) differs.
            configure_logging(
                channel,
                console_level=args['--console-log-level'],
                file_level=file_level)

        def _find_site():
            # Resolve the target site by --name (preferred) or --domain.
            # Logs a warning and returns None when it does not exist.
            if args['--name'] is not None:
                site_identity = args['--name']
            else:
                site_identity = args['--domain']
            msite = qquery_msite(
                session, name=args['--name'], domain=args['--domain'])
            if msite is None:
                logger.warning('Site %s does not exist!', site_identity)
            return msite

        # --load-domains commands
        if args['--load-domains'] is True:
            _configure('site.load-domains')
            # BUG FIX: docopt dicts always contain the '<file>' key (value
            # None when the argument is absent), so dict.get() never fell
            # back to the default path; `or` makes the default effective.
            fn = args['<file>'] or join(HOAXY_HOME, 'domains.txt')
            logger.info('Loading data from file %r', fn)
            cls.load_domains(
                session,
                fn,
                site_type=args['--site-type'],
                ignore_inactive=args['--ignore-inactive'],
                force_inactive=args['--force-inactive'],
                ignore_redirected=args['--ignore-redirected'])
        # --load-sites commands
        elif args['--load-sites'] is True:
            _configure('site.load-sites')
            # Same default-path fix as --load-domains above.
            fn = args['<file>'] or join(HOAXY_HOME, 'sites.yaml')
            logger.info('Loading data from file %r', fn)
            cls.load_sites(
                session,
                fn,
                ignore_inactive=args['--ignore-inactive'],
                force_inactive=args['--force-inactive'],
                ignore_redirected=args['--ignore-redirected'])
        # --add commands
        elif args['--add'] is True:
            _configure('site.add')
            # --add looks the site up by domain only (it must not exist yet).
            msite = qquery_msite(session, domain=args['--domain'])
            if msite is not None:
                logger.warning('Site %s already exists!', args['--domain'])
            else:
                cls.add_site(
                    session,
                    domain=args['--domain'],
                    site_type=args['--site-type'],
                    name=args['--name'],
                    tag_source=args['--tag-source'],
                    site_tags=args['--site-tag'],
                    alternate_domains=args['--alternate-domain'],
                    ignore_inactive=args['--ignore-inactive'],
                    force_inactive=args['--force-inactive'],
                    ignore_redirected=args['--ignore-redirected'])
        # --add-site-tags
        elif args['--add-site-tags'] is True:
            _configure('site.add-site-tags')
            msite = _find_site()
            if msite is not None:
                cls.add_site_tags(session, msite, args['--tag-source'],
                                  args['--site-tag'])
        # --replace-site-tags
        elif args['--replace-site-tags'] is True:
            # Typo fix: channel name used to be 'site.repalce-site-tags'.
            _configure('site.replace-site-tags')
            msite = _find_site()
            if msite is not None:
                cls.replace_site_tags(session, msite, args['--tag-source'],
                                      args['--site-tag'])
        # --add-alternate-domains
        elif args['--add-alternate-domains'] is True:
            _configure('site.add-alternate-domains')
            msite = _find_site()
            if msite is not None:
                cls.add_alternate_domains(session, msite,
                                          args['--alternate-domain'])
        # --replace-alternate-domains
        elif args['--replace-alternate-domains'] is True:
            _configure('site.replace-alternate-domains')
            msite = _find_site()
            if msite is not None:
                cls.replace_alternate_domains(session, msite,
                                              args['--alternate-domain'])
        elif args['--disable'] is True:
            _configure('site.disable')
            msite = _find_site()
            if msite is not None:
                cls.disable_site(session, msite)
        elif args['--enable'] is True:
            _configure('site.enable')
            msite = _find_site()
            if msite is not None:
                cls.enable_site(session, msite)
        # --status
        elif args['--status'] is True:
            _configure('site.status')
            cls.site_status(session, args['--include-disabled'] is True)
        # --dump
        elif args['--dump'] is True:
            # NOTE(review): the original reuses the 'site.status' channel
            # here, which looks like a copy-paste slip; kept as-is to avoid
            # changing the log destination — confirm and rename if safe.
            _configure('site.status', file_level='INFO')
            cls.dump(session, args['<file>'])

        session.close()