Example #1
def setup_logging():
    # set up default logging to files under var/
    default_logging(grab_log='var/grab.log',
                    level=LOG_LEVEL,
                    mode='a',
                    propagate_network_logger=False,
                    network_log='var/grab.network.log')
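For readers without weblib installed, here is a rough stdlib-only sketch of what a call like this appears to set up, judging purely from the parameter names (an assumption, not the actual weblib.logs implementation):

import logging

LOG_LEVEL = logging.DEBUG  # assumption: defined at module level

def setup_logging_stdlib():
    # Root logger at the requested level.
    logging.basicConfig(level=LOG_LEVEL)

    # Main grab log, appended to (mode='a') rather than truncated;
    # assumes the var/ directory already exists.
    logging.getLogger('grab').addHandler(
        logging.FileHandler('var/grab.log', mode='a'))

    # Separate file for network traffic; propagate=False keeps network
    # records from also reaching the root logger's handlers.
    network_logger = logging.getLogger('grab.network')
    network_logger.propagate = False
    network_logger.addHandler(
        logging.FileHandler('var/grab.network.log', mode='a'))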
Example #2
File: crawl.py Project: abael/grab
def main(spider_name, thread_number=None, slave=False,
         settings_module='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False,
         disable_report=False,
         disable_default_logs=False,
         *args, **kwargs):
    if disable_default_logs:
        default_logging(propagate_network_logger=network_logs,
                        grab_log=None, network_log=None)
    else:
        default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
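        # parse_known_args() ignores unrecognized options instead of
        # erroring, so spider-specific flags can coexist with global ones.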
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
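        # Ctrl-C stops the crawl early; the stats and report code below
        # still runs.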
        pass

    stats = bot.render_stats(timing=spider_config.get('display_timing'))
    stats_with_time = bot.render_stats(timing=True)

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key in bot.items:
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats_with_time))

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
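A minimal sketch of invoking this entry point directly; the spider name 'example_spider' is hypothetical and must be registered in the project referenced by settings_module:

if __name__ == '__main__':
    result = main('example_spider', thread_number=2)
    print(result['spider_stats'])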
Example #3
def hideme_parse():
    default_logging(grab_log=grab_settings.GRAB_LOG, network_log=grab_settings.NETWORK_LOG)
    bot = HidemeProxies(thread_number=1)
    bot.run()
Example #4
def foxtools_parse():
    default_logging(grab_log=grab_settings.GRAB_LOG, network_log=grab_settings.NETWORK_LOG)
    bot = FoxToolsProxy(thread_number=1)
    bot.run()
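The two entry points above differ only in the spider class, so they could share a runner; a sketch under the assumption that both classes accept only thread_number:

def run_proxy_spider(spider_cls):
    # Shared runner for the single-threaded proxy-list spiders above.
    default_logging(grab_log=grab_settings.GRAB_LOG,
                    network_log=grab_settings.NETWORK_LOG)
    bot = spider_cls(thread_number=1)
    bot.run()
    return bot

run_proxy_spider(HidemeProxies)
run_proxy_spider(FoxToolsProxy)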
Example #5
    task_help = "task to run (active marked with '*'):\n{}".format(
        ", ".join(tasks_colored))

    parser.add_argument('-T', '--task', type=str, help=task_help)
    parser.add_argument('-c', '--celery',
                        action="store_true", default=False,
                        help='run as celery task')

    args = parser.parse_args()
    return args


if __name__ == '__main__':  # noqa
    colorama_init()
    default_logging(grab_log='var/grab.log', level=LOG_LEVEL, mode='w',
                    propagate_network_logger=False,
                    network_log='var/grab.network.log')

    parser = argparse.ArgumentParser(description='command line interface')
    args = command_line_interface(parser)

    if args.task:
        if args.task == 'all':
            for task in ACTIVE_TASKS:
                scraper = ScrapersRunInterface.crawl(task)
                logger.info(scraper.render_stats())
        elif args.task not in TASKS:
            logger.critical(
                u"Can't find crawler in list: {}".format(
                    ', '.join(TASKS)))
        if args.celery:
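The excerpt cuts off inside the args.celery branch and never shows TASKS or ACTIVE_TASKS; a plausible shape for those registries, purely as an assumption:

# Hypothetical registries implied by the excerpt: task name -> spider class.
TASKS = {
    'hideme': HidemeProxies,
    'foxtools': FoxToolsProxy,
}
ACTIVE_TASKS = ('hideme',)  # tasks marked with '*' in the --task help text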
Example #6
# -*- coding: utf-8 -*-

from __future__ import unicode_literals # py2

from grab import Grab
from spiders import BaseSpider
from grab.spider import Task

import logging

logging.basicConfig(level=logging.INFO)
logging.getLogger().addHandler(logging.FileHandler('/tmp/parse.log'))

from weblib.logs import default_logging
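# Called with no arguments, default_logging() presumably falls back to the
# library's built-in defaults, layered on the root handlers configured above.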
default_logging()


class InitialSpider(BaseSpider):
    """Spider for initial data grabbing"""

    BASE_STREAM_URL = BaseSpider.BASE_URL + '/ru/post/stream.json'

    initial_urls = (BASE_STREAM_URL,)

    def task_initial(self, grab, task):
        sorted_posts = sorted(grab.response.json['posts'],
                              key=lambda x: int(x['id']))
        past_jump = max_offset = int(sorted_posts[0]['id'])

        g = self.create_grab_instance()