Example #1
def main():
    default_logging()
    for x in xrange(500):
        url = 'http://load.local/grab.html'
        g = Grab()
        g.go(url)
        assert 'grab' in g.response.body
Example #2
def main():
    default_logging()
    bot = CEURSpider()
    print '\n######## This is a program used to extract data from CEUR Workshop Proceedings. #######\n'
    print '\nYou can input the workshop number to get the transformed rdf data written to the rdfdb.ttl file in the current directory.\n'
    print '\nFor Example: \n' \
          '\n\t 1). 1513 \t \t \t \t- you will get the transformed rdf data from http://ceur-ws.org/Vol-1513/\n' \
          '\n\t 2). 1513-1550 \t \t \t \t- you will get the transformed rdf data between Vol-1513 and Vol-1550\n' \
          '\n\t 3). 1513 1540 1560   \t \t \t- you will get the transformed rdf data from Vol-1513, Vol-1540 ' \
          'and Vol-1560\n'

    vol_numbers = raw_input("Please enter volumes you want to transfer: ")
    input_urls = []
    if re.match(r'^\d+$', vol_numbers):
        input_urls.append("http://ceur-ws.org/Vol-" + str(vol_numbers) + "/")
    elif re.match(r'(\d+)-(\d+)$', vol_numbers):
        vols = vol_numbers.split('-')
        input_urls = ["http://ceur-ws.org/Vol-" + str(i) + "/" for i in range(int(vols[0]), int(vols[1])+1)]
    elif re.match(r'^(\d+\s)+\d(\s)?', vol_numbers):
        numbers = vol_numbers.split()
        input_urls = ["http://ceur-ws.org/Vol-" + str(i) + "/" for i in numbers]
    else:
        raise ValueError('Your input is not valid.')

    bot.initial_urls = input_urls
    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    print(bot.render_stats())
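
The three input formats accepted above (a single volume, a dash-separated range, and a space-separated list) can be factored into a standalone helper. Below is a minimal sketch under the same CEUR URL scheme; the function name parse_volume_input is hypothetical and not part of the original spider.

import re

def parse_volume_input(vol_numbers):
    # Hypothetical helper: turn user input into a list of CEUR volume URLs.
    base = "http://ceur-ws.org/Vol-%s/"
    if re.match(r'^\d+$', vol_numbers):
        # single volume, e.g. "1513"
        return [base % vol_numbers]
    if re.match(r'^(\d+)-(\d+)$', vol_numbers):
        # inclusive range, e.g. "1513-1550"
        start, end = (int(v) for v in vol_numbers.split('-'))
        return [base % i for i in range(start, end + 1)]
    if re.match(r'^\d+(\s+\d+)+\s*$', vol_numbers):
        # space-separated list, e.g. "1513 1540 1560"
        return [base % v for v in vol_numbers.split()]
    raise ValueError('Your input is not valid.')

assert parse_volume_input("1513") == ["http://ceur-ws.org/Vol-1513/"]
assert len(parse_volume_input("1513-1515")) == 3
assert len(parse_volume_input("1513 1540 1560")) == 3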
Example #3
def main():
    default_logging()
    for x in xrange(500):
        url = 'http://load.local/grab.html'
        g = Grab()
        g.go(url)
        assert 'grab' in g.response.body
Example #4
def main():
    default_logging()
    bot = CEURSpider()
    bot.initial_urls = config.input_urls
    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    print(bot.render_stats())
Example #5
def main(spider_name, thread_number=None, slave=False, force_url=None,
         settings='settings', *args, **kwargs):
    default_logging(propagate_network_logger=kwargs['propagate_network_logger'])

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))

    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)

    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added', 'var/task-add-error-%d.txt' % pid)

    if config.get('GRAB_SAVE_FINAL_STATS'):
        open('var/stats-%d.txt' % pid, 'wb').write(stats)
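
The assert_lock(lock_path) call above keeps two copies of the same spider from running at once. As an illustration only (this is not Grab's actual implementation), such a guard can be sketched with a non-blocking flock on the lock file:

import fcntl

def assert_lock(lock_path):
    # Illustrative sketch, Unix-only: take an exclusive, non-blocking lock
    # and keep the handle open for the lifetime of the process.
    handle = open(lock_path, 'w')
    try:
        fcntl.flock(handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        raise Exception('Lock file is already acquired: %s' % lock_path)
    return handle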
Example #6
def start_parsing():
    default_logging(grab_log=config.GRAB_LOG, network_log=config.NETWORK_LOG)
    bot = GlobalsourcesCrawler(thread_number=config.THREAD_NUMBER)
    bot.setup_cache('mysql', database=config.MYSQL_DATABASE, use_compression=True, user=config.MYSQL_USER, passwd=config.MYSQL_PASSWORD)
    bot.load_proxylist(config.PROXY_LIST, 'text_file', proxy_type='http')
    try:
        bot.run()
    except KeyboardInterrupt:
        pass
    if config.DEBUG:
        bot.save_list('fatal', config.FATAL_ERROR_DUMP)
    comp_db.session.commit()
    print bot.render_stats()
    sys.exit()
Example #7
def start_parsing():
    default_logging(grab_log=config.GRAB_LOG, network_log=config.NETWORK_LOG)
    bot = GlobalsourcesCrawler(thread_number=config.THREAD_NUMBER)
    bot.setup_cache('mysql',
                    database=config.MYSQL_DATABASE,
                    use_compression=True,
                    user=config.MYSQL_USER,
                    passwd=config.MYSQL_PASSWORD)
    bot.load_proxylist(config.PROXY_LIST, 'text_file', proxy_type='http')
    try:
        bot.run()
    except KeyboardInterrupt:
        pass
    if config.DEBUG:
        bot.save_list('fatal', config.FATAL_ERROR_DUMP)
    comp_db.session.commit()
    print bot.render_stats()
    sys.exit()
Example #8
def main():
    default_logging(grab_log="log.txt")

    fl = open("out.txt", "w")
    flval = open("outval.txt", "w")

    bot = CEURSpider()
    bot.initial_urls = config.input_urls
    bot.out = fl
    bot.validate = flval
    try:
        bot.run()
    except KeyboardInterrupt:
        pass
    fl.close()
    flval.close()

    bot.print_stats()
    print(bot.render_stats())
Example #9
def start_parsing():
    default_logging(grab_log=config.GRAB_LOG, network_log=config.NETWORK_LOG)
    bot = RosfirmCrawler(thread_number=config.THREAD_NUMBER)
    #    bot = ProffNoCrawler(thread_number=config.THREAD_NUMBER)
    #bot.setup_queue('mysql', database='proff_no', use_compression=True, user='******', passwd='proff_no_u7Hy4')
    bot.setup_cache('mysql',
                    database=config.MYSQL_DB,
                    use_compression=True,
                    user=config.MYSQL_USER,
                    passwd=config.MYSQL_PASS)
    if config.DEBUG:
        bot.setup_grab(log_dir=config.LOG_DIR)
    bot.load_proxylist(config.PROXY_LIST, 'text_file', proxy_type='http')
    try:
        bot.run()
        print bot.render_stats() ## print statistics when render_stats_on = 1
        bot.save_list('fatal', config.FATAL_ERROR_DUMP)
    except KeyboardInterrupt:
        if config.DEBUG:
            bot.save_list('fatal', config.FATAL_ERROR_DUMP)
        print bot.render_stats()
    sys.exit()
Example #10
# -*- coding: utf-8 -*-

from grab.spider import Spider, Task
from grab import Grab

import logging
from grab.tools.logs import default_logging
default_logging(level=logging.ERROR)

THREADS = 1
URLS_FILE = 'urls.txt'
FOUND_FILE = 'found.txt'
NOT_FOUND_FILE = 'not_found.txt'

class CookieSpider(Spider):

    errors = []

    def task_generator(self):
        self.errors = prepare_errors()

        with open(URLS_FILE) as f:
            for url in f:
                if url.strip():
                    grab = Grab()
                    grab.setup(url=url)
                    print "Start checking the - ", url
                    yield Task('initial', url=url, grab=grab)


    def task_initial(self, grab, task):
Example #11
def main():
    default_logging()
    bot = SpeedSpider(thread_number=30)
    bot.setup_cache(database='speed_spider', use_compression=True)
    bot.run()
    print(bot.render_stats())
Example #12
            name=name,
            description=info,
            url=lookbook_url,
            num_followers=textutils.first_int_word(fans),
            blog_url=blog_url if len(blog_url) > 0 else None,
            site_url=website_url if len(website_url) > 0 else None)
        if created:
            print "Created a new blog_obj"
        else:
            print "Object already existed"


if __name__ == '__main__':
    # change the current dir
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    # set up logging for the scraper (empty error.log and empty network.log - is a good sign)
    # set level=10 to log all events
    default_logging(grab_log='/tmp/errors.log',
                    level=20,
                    mode='w',
                    propagate_network_logger=False,
                    network_log='/tmp/network.log')
    # prepare for the battle
    bot = LookbookScraper(thread_number=THREAD_NUMBER, network_try_limit=3)
    try:
        # good luck and have fun!
        bot.run()
    finally:
        # show stats
        print bot.render_stats()
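
The numeric levels passed to default_logging above are the standard library's logging levels: 10 is logging.DEBUG and 20 is logging.INFO, so the comment "set level=10 to log all events" simply means dropping down to DEBUG verbosity. For reference:

import logging

assert logging.DEBUG == 10    # level=10: log everything, including debug events
assert logging.INFO == 20     # level=20: informational messages and above
assert logging.WARNING == 30  # level=30: warnings and errors only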
Example #13
def main(spider_name, thread_number=None, slave=False,
         settings='settings', network_logs=False,
         *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if hasattr(spider_class, 'setup_extra_args'):
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, trash = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))

    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)

    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added', 'var/task-add-error-%d.txt' % pid)

    if config.get('GRAB_SAVE_FINAL_STATS'):
        open('var/stats-%d.txt' % pid, 'wb').write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
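
The setup_extra_args hook used above lets a spider class declare its own command-line options; the parsed values are stored under spider_config['extra_args']. A minimal sketch of the hook side (the ExampleSpider class and its --category option are hypothetical; a real spider would subclass grab.spider.Spider):

from argparse import ArgumentParser

class ExampleSpider(object):
    # Hypothetical spider showing only the argument hook.
    @classmethod
    def setup_extra_args(cls, parser):
        parser.add_argument('--category', default='all',
                            help='limit the crawl to one category')

# Roughly what the runner above does with the hook:
parser = ArgumentParser()
ExampleSpider.setup_extra_args(parser)
extra_args, trash = parser.parse_known_args([])
print(vars(extra_args))  # {'category': 'all'}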
Example #14
# -*- coding: utf-8 -*-
"""
Pravda news articles spy
"""

from grab.tools.logs import default_logging

from spiders.pravda_archive import PravdaArchiveSpider
from config import default_spider_params


if __name__ == '__main__':
    default_logging()

    print "Scape python projects"
    bot = PravdaArchiveSpider(**default_spider_params())

    bot.setup_grab(timeout=4096, connect_timeout=10)
    bot.run()
    print bot.render_stats()
Example #15
# -*- coding: utf-8 -*-
"""
Github projects spy
"""
from optparse import OptionParser

from grab import Grab
from grab.spider import Spider, Task
from grab.tools.logs import default_logging

from spiders.explore import ExploreSpider
from spiders.lang_python import LangPythonSpider
from config import default_spider_params, Session

if __name__ == '__main__':
    default_logging()
    parser = OptionParser()

    # command line options
    parser.add_option("-p", "--python", action="store_true",
                      dest="parse_python", default=False)

    options, args = parser.parse_args()
    
    if options.parse_python:
        print "Scape python projects"
        bot = LangPythonSpider(**default_spider_params())
    else:
        print "Scrape trandings"
        bot = ExploreSpider(**default_spider_params())
Example #16
def __init__(self):
    default_logging()
    logDir = '/tmp/fanduel'
    if not os.path.exists(logDir):
        os.makedirs(logDir)
    self.grab = Grab(log_dir=logDir, debug_post=True)
Example #17
from random import choice
from datetime import datetime
from dateutil.parser import parse as parse_iso_date
from urlparse import urlparse
from grab.spider import Spider, Task
from grab import Grab
from grab.tools.logs import default_logging

from fetcherbase import Fetcher
from debra import models
from platformdatafetcher.activity_levels import recalculate_activity_level

default_logging(grab_log='/tmp/grab.log',
                level=10,
                mode='w',
                propagate_network_logger=False,
                network_log='/tmp/grab_network.log')


class BlogspotFetcher(Spider, Fetcher):

    name = 'Blogspot'

    # The names of months (for parsing of the date)
    months_names = {
        'JANUARY': 1,
        'JAN': 1,
        'FEBRUARY': 2,
        'FEB': 2,
        'MARCH': 3,
Example #18
def main():
    default_logging()
    bot = SpeedSpider(thread_number=30)
    bot.setup_cache(database='speed_spider', use_compression=True)
    bot.run()
    print(bot.render_stats())
Example #19
import csv

import logging
import os

from grab.spider import Spider, Task
from grab.tools import html

from grab.tools.logs import default_logging
from hashlib import sha1

from grab import Grab

g = Grab()

default_logging(level=logging.DEBUG)

path = os.path.dirname(os.path.abspath(__file__))

MAIN_LINK = 'http://www.immobilienscout24.de/Suche/S-T/P-{}/Wohnung-Miete/Berlin/Berlin'

THREADS = 2


class Immospider(Spider):
    def __init__(self):
        super(Immospider, self).__init__(thread_number=THREADS, network_try_limit=20)
        self.result_file = csv.writer(open('result.csv', 'w'))
        self.result_file.writerow(['Title', 'Address', 'Wohnungstyp', 'Etage', 'Wohnflaeche', 'Bezugsfrei_ab',
                                  'Zimmer', 'Haustiere', 'Kaltmiete', 'Nebenkosten', 'Heizkosten', 'Gesamtmiete',
                                  'Kaution_o_genossenschaftsanteile', 'URL'])
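
Note that the constructor above wraps csv.writer around an anonymous open() call, so the underlying file object is never stored and can only be closed when the interpreter exits. A hedged variant that keeps the handle around (the result_fh attribute and the shortened header row are assumptions for illustration):

import csv

class ImmospiderSketch(object):
    # Sketch only: keep a reference to the file object so it can be
    # flushed and closed explicitly.
    def __init__(self):
        self.result_fh = open('result.csv', 'w')
        self.result_file = csv.writer(self.result_fh)
        self.result_file.writerow(['Title', 'Address', 'URL'])

    def close(self):
        self.result_fh.close()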
Example #20
def main(spider_name, thread_number=None, slave=False,
         settings='settings', network_logs=False,
         disable_proxy=False, 
         *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if hasattr(spider_class, 'setup_extra_args'):
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, trash = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))

    if spider_config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if config.get('GRAB_SAVE_REPORT'):
        for subdir in (str(pid), 'last'):
            dir_ = 'var/%s' % subdir
            if not os.path.exists(dir_):
                os.mkdir(dir_)
            else:
                clear_directory(dir_)
            bot.save_list('fatal', '%s/fatal.txt' % dir_)
            bot.save_list('task-count-rejected', '%s/task_count_rejected.txt' % dir_)
            bot.save_list('network-count-rejected', '%s/network_count_rejected.txt' % dir_)
            bot.save_list('task-with-invalid-url', '%s/task_with_invalid_url.txt' % dir_)
            with open('%s/report.txt' % dir_, 'wb') as out:
                out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
Example #21
File: crawl.py Project: bodja/grab
def main(spider_name, thread_number=None, slave=False,
         settings='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False, 
         disable_report=False,
         *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    if thread_number is None:
        thread_number = \
            int(spider_config.get('thread_number',
                                  deprecated_key='GRAB_THREAD_NUMBER'))

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(spider_config.get(
            'network_try_limit', deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(spider_config.get(
            'task_try_limit', deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get(
        'proxy_list', deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get(
        'command_interfaces', deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(
        timing=spider_config.get('display_timing',
                                 deprecated_key='GRAB_DISPLAY_TIMING'))

    if spider_config.get('display_stats', deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report', deprecated_key='GRAB_SAVE_REPORT'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.iteritems():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
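
The spider_config.get(key, deprecated_key=...) calls above read the new lowercase setting names while still honouring the old GRAB_* names. As an assumption about that behaviour (not grab's actual code), the fallback amounts to:

def get_with_fallback(config, key, deprecated_key=None, default=None):
    # Illustrative only: prefer the new key, fall back to the deprecated one.
    if key in config:
        return config[key]
    if deprecated_key is not None and deprecated_key in config:
        return config[deprecated_key]
    return default

settings = {'GRAB_THREAD_NUMBER': 10}
print(get_with_fallback(settings, 'thread_number',
                        deprecated_key='GRAB_THREAD_NUMBER'))  # 10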
Example #22
from models import Data, session, table_name

import json
import logging
import os
import feedparser
from datetime import datetime

from grab.spider import Spider, Task
from grab.tools import html

from grab.tools.logs import default_logging
from hashlib import sha1

default_logging(level=logging.DEBUG)

path = os.path.dirname(os.path.abspath(__file__))
URLS_FILE = os.path.join(path, 'urls.txt')


RSS_LINK = 'http://pathmark.inserts2online.com/rss.jsp?drpStoreID={0}'

IMAGE_DIR = os.path.join(path, 'images/')

THREADS = 2


class RSSspider(Spider):
    def __init__(self):
        super(RSSspider, self).__init__(thread_number=THREADS, network_try_limit=20)
Example #23
			row = {}
			for childelems in elem.iterchildren():
				if 'b-serp-item__price' == childelems.attrib['class']:
					row['price'] = find_node_number(childelems, ignore_spaces=True)
				if 'b-serp-item__header' == childelems.attrib['class']:
					row['header'] = get_node_text(childelems)
					ahref = childelems.iterchildren()
					row['link'] = list(ahref)[0].get('href')
				if 'b-serp-item__about' == childelems.attrib['class']:
					row['about'] = get_node_text(childelems)
				if 'b-serp-item__address' == childelems.attrib['class']:
					adresselems = childelems.iterchildren()
					adress_and_subway = list(adresselems)[1]
					adress = adress_and_subway.text
					adress_and_subway_iter = adress_and_subway.iterchildren()
					subway = list(adress_and_subway_iter)[0].text
					row['adress'] = adress
					row['subway'] = subway
				if 'b-serp-item__owner' == childelems.attrib['class']:
					row['owner'] = get_node_text(childelems)
			row['time'] = int(time.time())
		self.csvfilesaver.save(listrow(row))
		grab.url.split('=page')

if __name__ == '__main__':
	default_logging(grab_log='/tmp/grab.log', level=logging.DEBUG, mode='a', propagate_network_logger=False, network_log='/tmp/grab.network.log')
	#TODO Put initial URL and filename in to constructor.
	#TODO Add SQL saver.
	bot = YarSpider()
	bot.run()
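
The csvfilesaver and listrow helpers referenced near the top of this example are not included in the excerpt. As a guess at their role only, a minimal saver compatible with a save(values) call might look like this (the class name and layout are hypothetical; listrow() is assumed to flatten the row dict into a list of values):

import csv

class CsvFileSaver(object):
    # Hypothetical helper matching the csvfilesaver.save(...) usage above.
    def __init__(self, path):
        self.fh = open(path, 'w')
        self.writer = csv.writer(self.fh)

    def save(self, values):
        self.writer.writerow(values)

    def close(self):
        self.fh.close()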
Example #24
def main(spider_name,
         thread_number=None,
         slave=False,
         settings='settings',
         network_logs=False,
         disable_proxy=False,
         ignore_lock=False,
         disable_report=False,
         *args,
         **kwargs):
    default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    if thread_number is None:
        thread_number = \
            int(spider_config.get('thread_number',
                                  deprecated_key='GRAB_THREAD_NUMBER'))

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(
            spider_config.get('network_try_limit',
                              deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(
            spider_config.get('task_try_limit',
                              deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list',
                                       deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces',
                                   deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get(
        'display_timing', deprecated_key='GRAB_DISPLAY_TIMING'))

    if spider_config.get('display_stats', deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report', deprecated_key='GRAB_SAVE_REPORT'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.iteritems():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
Example #25
def __init__(self):
    default_logging()
    logDir = '/tmp/fanduel'
    if not os.path.exists(logDir):
        os.makedirs(logDir)
    self.grab = Grab(log_dir=logDir, debug_post=True)