def __init__(self, settings=None):
     SpiderLoader.__init__(self, settings)
     self.spidercls = self.load("scrapy_spider")
     self.craw_runner = my_CrawlerRunner(settings)
     #my_crawler = craw_runner._create_crawler(spidercls)
     #spidercls.set_crawler(my_crawler)
     print("wwj debug in spidermanager", self.spidercls)
Example #2
    def get_crawler_class(self, crawler):
        """
        Search the modules in self.__crawer_module for a crawler with the
        given name.

        :param str crawler: Name of the crawler to load
        :rtype: crawler-class
        """
        settings = Settings()
        settings.set('SPIDER_MODULES', [self.__crawer_module])
        spider_loader = SpiderLoader(settings)
        return spider_loader.load(crawler)
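For reference, the same lookup works without a wrapper class. A minimal sketch, assuming a hypothetical myproject.spiders package that contains a spider named "example":

from scrapy.settings import Settings
from scrapy.spiderloader import SpiderLoader

settings = Settings()
settings.set('SPIDER_MODULES', ['myproject.spiders'])  # hypothetical module path
spider_loader = SpiderLoader.from_settings(settings)
spider_cls = spider_loader.load('example')  # raises KeyError if no spider has that name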
Example #3
 def setUp(self):
     orig_spiders_dir = os.path.join(module_dir, 'test_spiders')
     self.tmpdir = tempfile.mkdtemp()
     self.spiders_dir = os.path.join(self.tmpdir, 'test_spiders_xxx')
     shutil.copytree(orig_spiders_dir, self.spiders_dir)
     sys.path.append(self.tmpdir)
     settings = Settings({'SPIDER_MODULES': ['test_spiders_xxx']})
     self.spider_loader = SpiderLoader.from_settings(settings)
Example #4
 def setUp(self):
     orig_spiders_dir = os.path.join(module_dir, "test_spiders")
     self.tmpdir = self.mktemp()
     os.mkdir(self.tmpdir)
     self.spiders_dir = os.path.join(self.tmpdir, "test_spiders_xxx")
     shutil.copytree(orig_spiders_dir, self.spiders_dir)
     sys.path.append(self.tmpdir)
     settings = Settings({"SPIDER_MODULES": ["test_spiders_xxx"]})
     self.spider_loader = SpiderLoader.from_settings(settings)
Example #5
    def test_bad_spider_modules_warning(self):

        with warnings.catch_warnings(record=True) as w:
            module = 'tests.test_spiderloader.test_spiders.doesnotexist'
            settings = Settings({'SPIDER_MODULES': [module]})
            spider_loader = SpiderLoader.from_settings(settings)
            self.assertIn("Could not load spiders from module", str(w[0].message))

            spiders = spider_loader.list()
            self.assertEqual(spiders, [])
Example #6
def crawl(settings={}, spider_name="", key="", spider_kwargs={}):
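    # Load the named spider, point the feed export at S3 (when running on AWS
    # Lambda) or a local file, and run it to completion in a blocking CrawlerProcess.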
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)
    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "csv"
    spider_key = ""
    try:
        if spider_kwargs.get("start_urls"):
            spider_key = urlparse(spider_kwargs["start_urls"][0]).hostname
        else:
            spider_key = urlparse(spider_cls.start_urls[0]).hostname
    except Exception as e:
        logging.exception("Spider or kwargs need start_urls.")
        logging.exception(e)

    if is_in_aws():
        # Lambda can only write to the /tmp folder.
        settings['HTTPCACHE_DIR'] = "/tmp"
        feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/{spider_name}_{key}.csv"
    else:
        feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format(
            os.path.join(os.getcwd(), "feed"),
            spider_key,
        )

    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})

    process.crawl(spider_cls, **spider_kwargs)
    process.start()

    if is_in_aws() and has_task_token():
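        # Running as part of an AWS Step Functions workflow: report the feed
        # location back through the task token so the state machine can continue.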
        import boto3
        import json
        client = boto3.client('stepfunctions')
        client.send_task_success(
            taskToken=os.getenv('TASK_TOKEN_ENV_VARIABLE'),
            output=json.dumps({"feed_uri": feed_uri})
        )
Example #7
 def run_spider():
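     # Load the first spider reported by SpiderLoader, start it on a
     # CrawlerRunner, and return the resulting Deferred to the caller.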
     s = Settings()
     s.setmodule(ulsan_settings)
     #process = CrawlerProcess(get_project_settings())
     sl = SpiderLoader(settings=s)
     print('#### spider list=', sl.list())
     spider = sl.load(sl.list()[0])
     #process = CrawlerProcess(settings=s)
     #d = process.crawl(spider)
     #process.crawl(UillOrKr)
     #process.start(stop_after_crawl=False)
     #process.start()
     #configure_logging({'LOG_FORMAT': '## %(levelname)s: %(message)s'})
     #configure_logging({'LOG_LEVEL': 'DEBUG'})
     runner = CrawlerRunner(settings=s)
     print(f'#### settings.LOG_ENABLED = {s["LOG_ENABLED"]}')
     d = runner.crawl(spider)
     #d.addBoth(lambda _: reactor.stop())
     #reactor.run()
     #return d
     return d
Example #8
def crawl(settings={}, spider_name="header_spider", spider_kwargs={}):
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)

    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "json"
    spider_key = ""

    try:
        if spider_kwargs.get("start_urls"):
            spider_key = urlparse(spider_kwargs["start_urls"][0]).hostname
        else:
            spider_key = urlparse(spider_cls.start_urls[0]).hostname
    except Exception:
        logging.exception("Spider or kwargs need start_urls.")

    if is_in_aws():
        # Lambda can only write to the /tmp folder.
        settings['HTTPCACHE_DIR'] = "/tmp"
        feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/%(name)s-{spider_key}.json"
    else:
        feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format(
            os.path.join(os.getcwd(), "feed"),
            spider_key,
        )
    if (is_in_aws() and os.getenv("USE_S3_CACHE") != "0") or os.getenv("USE_S3_CACHE"):
        settings["HTTPCACHE_STORAGE"] = "my_sls_scraper.extensions.s3cache.S3CacheStorage"
        settings["S3CACHE_URI"] = f"s3://{os.environ['HTTP_CACHE_BUCKET_NAME']}/cache"

    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})

    process.crawl(spider_cls, **spider_kwargs)
    process.start()
Example #9
    def test_bad_spider_modules_warning(self):

        with warnings.catch_warnings(record=True) as w:
            module = 'tests.test_spiderloader.test_spiders.doesnotexist'
            settings = Settings({
                'SPIDER_MODULES': [module],
                'SPIDER_LOADER_WARN_ONLY': True
            })
            spider_loader = SpiderLoader.from_settings(settings)
            self.assertIn("Could not load spiders from module",
                          str(w[0].message))

            spiders = spider_loader.list()
            self.assertEqual(spiders, [])
Example #10
    def test_dupename_warning(self):
        # copy 1 spider module so as to have duplicate spider name
        shutil.copyfile(os.path.join(self.tmpdir, 'test_spiders_xxx/spider3.py'),
                        os.path.join(self.tmpdir, 'test_spiders_xxx/spider3dupe.py'))

        with warnings.catch_warnings(record=True) as w:
            spider_loader = SpiderLoader.from_settings(self.settings)

            self.assertEqual(len(w), 1)
            msg = str(w[0].message)
            self.assertIn("several spiders with the same name", msg)
            self.assertIn("'spider3'", msg)

            spiders = set(spider_loader.list())
            self.assertEqual(spiders, set(['spider1', 'spider2', 'spider3', 'spider4']))
Example #11
class Updater:
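    # Runs each configured spider in a CrawlerProcess and records the update's
    # start, success, or failure in MongoDB through MongoUpdatesRegister.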

    REQUIRED_PARAMETERS = ['MONGO_HOST', 'MONGO_PORT', 'MONGO_DB', 'SPIDERS']

    def __init__(self, settings):
        self.__validate_settings(settings)
        self.settings = settings
        self.spiders = settings.get('SPIDERS')
        self.register = MongoUpdatesRegister(settings)
        self.register.open_db()
        self.spider_loader = SpiderLoader(settings)

    def __validate_settings(self, settings):
        for parameter in Updater.REQUIRED_PARAMETERS:
            if parameter not in settings:
                raise MissingSetting(parameter)

    def run(self):
        process = CrawlerProcess(self.settings)
        for spider in self.spiders:
            kwargs = self._spider_args(spider)
            process.crawl(spider, **kwargs)
        update_id = self.register.start(self.spiders)
        process.start()
        if self._failed(process):
            self.register.fail(update_id)
        else:
            self.register.succeed(update_id)

    def _spider_args(self, spider):
        spider_cls = self.spider_loader.load(spider)
        kwargs = {}
        if self._accepts_last(spider_cls):
            last = self.register.last(spider)
            if last is not None:
                kwargs['last'] = last.start
        return kwargs

    def _accepts_last(self, cls):
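        # Return True if the spider's __init__ declares a 'last' parameter,
        # checked via its signature.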
        spider_parameters = signature(cls.__init__).parameters
        return 'last' in spider_parameters

    def _failed(self, process):
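        # Return True if any crawler finished for a reason other than the
        # normal 'finished' status.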
        finish_reasons = [crawler.stats.get_value('finish_reason') for crawler in process.crawlers]
        return any(reason != 'finished' for reason in finish_reasons)
Example #12
    def test_bad_spider_modules_warning(self):

        with warnings.catch_warnings(record=True) as w:
            module = 'tests.test_spiderloader.test_spiders.doesnotexist'
            settings = Settings({
                'SPIDER_MODULES': [module],
                'SPIDER_LOADER_WARN_ONLY': True
            })
            spider_loader = SpiderLoader.from_settings(settings)
            if str(w[0].message).startswith("_SixMetaPathImporter"):
                # needed on 3.10 because of https://github.com/benjaminp/six/issues/349,
                # at least until all six versions we can import (including botocore.vendored.six)
                # are updated to 1.16.0+
                w.pop(0)
            self.assertIn("Could not load spiders from module",
                          str(w[0].message))

            spiders = spider_loader.list()
            self.assertEqual(spiders, [])
Example #13
def cmdline_crawl(args):
    from scrapy.utils.project import get_project_settings
    from scrapy.spiderloader import SpiderLoader
    settings = get_project_settings()
    spiders = SpiderLoader.from_settings(settings)
    if not args:
        spiderlist = spiders.list()
        if spiderlist:
            print('spiders list {}'.format(spiderlist))
        sys.exit()
    spidername = args.spider
    filepath = inspect.getabsfile(spiders.load(spidername))
    os.environ.pop('SCRAPY_SETTINGS_MODULE')
    settings, _conf = _get_settings_and_conf(args)
    server = connection.get_redis(**settings['REDIS_PARAMS'])
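    # Read the spider's source and hand it, together with the Redis connection,
    # to the helper that dispatches the remote crawl job.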
    with open(filepath, encoding='utf-8') as f:
        script = f.read()
    jsondata = _send_script_start_work(spidername, script, server)
    jsondata.pop('script')
    print('send task:')
    print(json.dumps(jsondata, indent=4))
Example #14
    def _prepare_domains_items_refs(self):
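        # Map each spider's allowed domains to its items_refs (tagging every ref
        # with the spider name) and collect all allowed domains, longest first.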
        spider_loader = SpiderLoader.from_settings(self.settings)

        if hasattr(self, 'spiders'):
            spider_names = getattr(self, 'spiders').split(',')
        else:
            spider_names = [spider_name for spider_name in spider_loader.list()
                            if spider_name not in self.spiders_ignored]

        for spider_name in spider_names:
            Spider = spider_loader.load(spider_name)

            for domain in Spider.allowed_domains:
                for i, item_ref in enumerate(Spider.items_refs):
                    item_ref['spider_name'] = spider_name
                    Spider.items_refs[i] = item_ref

                self.domains_items_refs[domain] = Spider.items_refs

            self.allowed_domains += Spider.allowed_domains

        self.allowed_domains.sort(key=len, reverse=True)
Example #15
 def test_load_base_spider(self):
     module = 'tests.test_spiderloader.test_spiders.spider0'
     settings = Settings({'SPIDER_MODULES': [module]})
     self.spider_loader = SpiderLoader.from_settings(settings)
     assert len(self.spider_loader._spiders) == 0
Example #16
 def test_load_spider_module_multiple(self):
     prefix = 'tests.test_spiderloader.test_spiders.'
     module = ','.join(prefix + s for s in ('spider1', 'spider2'))
     settings = Settings({'SPIDER_MODULES': module})
     self.spider_loader = SpiderLoader.from_settings(settings)
     assert len(self.spider_loader._spiders) == 2
Example #17
def main():
    logger.warning("Loading project")

    s3_bucket = os.environ.get('S3_BUCKET')
    assert s3_bucket, "Please specify an S3_BUCKET environment variable"

    utcnow = datetime.datetime.utcnow()
    tstamp = utcnow.strftime('%F-%H-%M-%S')
    pool_size = 12

    settings = get_project_settings()
    spider_loader = SpiderLoader.from_settings(settings)
    spider_names = spider_loader.list()

    pool = multiprocessing.Pool(pool_size, maxtasksperchild=1)
    logger.info("Starting to crawl %s spiders on %s processes",
                len(spider_names), pool_size)
    results = pool.imap_unordered(run_one_spider, spider_names)
    pool.close()
    pool.join()
    logger.info("Done crawling")

    # The imap_unordered call returns an iterator, so throw it in a list
    results = list(results)

    client = boto3.client('s3')
    s3_key_prefix = "runs/{}".format(tstamp)

    # Concatenate and gzip the output geojsons
    _, output_gz_filename = tempfile.mkstemp('.geojson.gz')
    with gzip.open(output_gz_filename, 'wb') as f_out:
        for r in results:
            with open(r.pop('output_filename'), 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)

    s3_output_size = os.path.getsize(output_gz_filename)
    s3_output_size_mb = s3_output_size / 1024 / 1024

    # Post it to S3
    s3_output_key = '{}/output.geojson.gz'.format(s3_key_prefix)
    client.upload_file(
        output_gz_filename,
        s3_bucket,
        s3_output_key,
        ExtraArgs={
            'ACL': 'public-read',
            'ContentType': 'application/json',
            'ContentDisposition': 'attachment; filename="output-{}.geojson.gz"'.format(tstamp),
        })

    logger.warning("Saved output to https://s3.amazonaws.com/%s/%s", s3_bucket,
                s3_output_key)

    # Concatenate and gzip the log files
    _, log_gz_filename = tempfile.mkstemp('.log.gz')
    with gzip.open(log_gz_filename, 'wb') as f_out:
        for r in results:
            with open(r.pop('log_filename'), 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)

    # Post it to S3
    s3_log_key = '{}/all_logs.txt.gz'.format(s3_key_prefix)
    client.upload_file(log_gz_filename,
                       s3_bucket,
                       s3_log_key,
                       ExtraArgs={
                           'ACL': 'public-read',
                           'ContentType': 'text/plain; charset=utf-8',
                           'ContentEncoding': 'gzip',
                       })

    logger.warning("Saved logfile to https://s3.amazonaws.com/%s/%s", s3_bucket,
                s3_log_key)

    metadata = {
        'spiders': results,
        'links': {
            'download_url':
            "https://s3.amazonaws.com/{}/{}".format(s3_bucket, s3_output_key),
            'log_url':
            "https://s3.amazonaws.com/{}/{}".format(s3_bucket, s3_log_key),
        }
    }

    with open('metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2, default=json_serial)

    s3_key = '{}/metadata.json'.format(s3_key_prefix)
    client.upload_file('metadata.json',
                       s3_bucket,
                       s3_key,
                       ExtraArgs={
                           'ACL': 'public-read',
                           'ContentType': 'application/json; charset=utf-8',
                       })
    logger.warning("Saved metadata to https://s3.amazonaws.com/%s/%s", s3_bucket,
                s3_key)

    s3_key = 'runs/latest/metadata.json'
    client.upload_file('metadata.json',
                       s3_bucket,
                       s3_key,
                       ExtraArgs={
                           'ACL': 'public-read',
                           'ContentType': 'application/json; charset=utf-8',
                       })
    logger.warning("Saved metadata to https://s3.amazonaws.com/%s/%s", s3_bucket,
                s3_key)

    total_count = sum(filter(None, (s['item_scraped_count'] for s in results)))
    template_content = {
        'download_url':
        'https://s3.amazonaws.com/{}/{}'.format(s3_bucket, s3_output_key),
        'download_size':
        round(s3_output_size_mb, 1),
        'row_count':
        total_count,
        'spider_count':
        len(results),
        'updated_datetime':
        utcnow.replace(microsecond=0).isoformat(),
    }
    with open('info_embed.html', 'w') as f:
        f.write("<html><body>"
                "<a href=\"{download_url}\">Download</a> "
                "({download_size} MB)<br/><small>{row_count:,} rows from "
                "{spider_count} spiders, updated {updated_datetime}Z</small>"
                "</body></html>\n".format(**template_content))
    s3_key = 'runs/latest/info_embed.html'
    client.upload_file('info_embed.html',
                       s3_bucket,
                       s3_key,
                       ExtraArgs={
                           'ACL': 'public-read',
                           'ContentType': 'text/html; charset=utf-8',
                       })
    logger.warning("Saved embed to https://s3.amazonaws.com/%s/%s", s3_bucket,
                s3_key)
Example #18
from scrapy.extensions.httpcache import FilesystemCacheStorage
from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

SETTINGS = get_project_settings()
STORAGE = FilesystemCacheStorage(SETTINGS)
SPIDER_LOADER = SpiderLoader.from_settings(SETTINGS)
SPIDER_CLASS = SPIDER_LOADER.load('ksdata')
SPIDER = SPIDER_CLASS(file='../sample.csv')


def get_busted_caches():
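    # Yield the on-disk cache directory of each start request whose cached
    # response status is listed in HTTPCACHE_IGNORE_HTTP_CODES.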
    for request in SPIDER.start_requests():
        meta = STORAGE._read_meta(SPIDER, request)
        if meta['status'] in SETTINGS['HTTPCACHE_IGNORE_HTTP_CODES']:
            yield STORAGE._get_request_path(SPIDER, request)


if __name__ == "__main__":
    import shutil

    for path in get_busted_caches():
        shutil.rmtree(path)
Example #19
 def test_load_base_spider(self):
     module = 'tests.test_spiderloader.test_spiders.spider0'
     settings = Settings({'SPIDER_MODULES': [module]})
     self.spider_loader = SpiderLoader.from_settings(settings)
     assert len(self.spider_loader._spiders) == 0
Example #20
 def test_load_spider_module(self):
     prefix = 'tests.test_spiderloader.test_spiders.'
     module = ','.join(prefix + s for s in ('spider1', 'spider2'))
     settings = Settings({'SPIDER_MODULES': module})
     self.spider_loader = SpiderLoader.from_settings(settings)
     assert len(self.spider_loader._spiders) == 2
Example #21
#!/usr/bin/env python
from scrapy.crawler import CrawlerProcess
from threading import Thread
from scrapy.settings import Settings
from scrapy.spiderloader import SpiderLoader
import time

settings = Settings()
settings_module_path = "settings"
if settings_module_path:
    settings.setmodule(settings_module_path, priority='project')
process = CrawlerProcess(settings)

def _start_crawler_thread():
    t = Thread(target=process.start, kwargs={'stop_after_crawl': False})
    t.daemon = True
    t.start()


loader = SpiderLoader(settings)
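# SpiderLoader._spiders maps spider names to classes; iterating the mapping
# yields the names, which process.crawl() also accepts.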

for spider_cls in loader._spiders:

    process.crawl(spider_cls)

_start_crawler_thread()

while 1:
    time.sleep(2)

Example #22
 def __init__(self):
     settings = get_scraper_settings()
     loader = SpiderLoader.from_settings(settings)
     self.spider_names = loader.list()
     self.server = redis.StrictRedis(host=settings.get('REDIS_HOST'), port=settings.get('REDIS_PORT'))
Example #23
from scrapy.spiderloader import SpiderLoader
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
process = CrawlerProcess(settings)
spider_loader = SpiderLoader.from_settings(settings)

ranking_spider = spider_loader.load("d1rankings")

print("----------------Ranking spider started----------------")
process.crawl(ranking_spider)
process.start()
print("----------------Ranking spider finished----------------\n")
Example #24
 def test_load_spider_module(self):
     prefix = "tests.test_spiderloader.test_spiders."
     module = ",".join(prefix + s for s in ("spider1", "spider2"))
     settings = Settings({"SPIDER_MODULES": module})
     self.spider_loader = SpiderLoader.from_settings(settings)
     assert len(self.spider_loader._spiders) == 2
Example #25
 def test_load_spider_module(self):
     module = "tests.test_spiderloader.test_spiders.spider1"
     settings = Settings({"SPIDER_MODULES": [module]})
     self.spider_loader = SpiderLoader.from_settings(settings)
     assert len(self.spider_loader._spiders) == 1
Example #26
     # "lianjia-cj-gz",
     # "lianjia-cj-hz",
     # "lianjia-cj-nj",
     # "lianjia-cj-cs",
     # "lianjia-cj-wh",
     # "lianjia-cj-tj",
     # "lianjia-cj-zz",
     #"lianjia-cj-xa",
     #"lianjia-cj-cd",
     #"lianjia-cj-su",
     #  "lianjia-cj-cq",
     # "lianjia-cj-xm",
     # "lianjia-cj-hf",
 ])
 process = CrawlerProcess(get_project_settings())
 sloader = SpiderLoader(get_project_settings())
 scheduler = TwistedScheduler()
 hour = 3
 for spidername in sloader.list():
     # scheduler.add_job(task, 'cron', minute="*/20")
     if spidername in allow2:
         #https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html
         # scheduler.add_job(process.crawl, 'cron', args=[spidername], hour="*/" + str(hour))
         # scheduler.add_job(func=aps_test, args=('定时任务',), trigger='cron', second='*/5')
         # scheduler.add_job(func=aps_test, args=('一次性任务',),
         #                   next_run_time=datetime.datetime.now() + datetime.timedelta(seconds=12))
         # scheduler.add_job(func=aps_test, args=('循环任务',), trigger='interval', seconds=3)
         print(spidername)
         scheduler.add_job(process.crawl,
                           'cron',
                           args=[spidername],
Example #27
from scrapy.spiderloader import SpiderLoader
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    spider_loader = SpiderLoader(get_project_settings())
    spiders = spider_loader.list()

    process = CrawlerProcess(get_project_settings())
    for spider in spiders:
        process.crawl(spider_loader.load(spider))
    process.start()
Example #28
 def list_spiders(self):
     loader = SpiderLoader.from_settings(self.settings)
     return loader.list()