def crawl(settings=None, spider_name='all'):
    # Avoid a mutable default argument; callers may still pass their own overrides.
    settings = settings or {}

    project_settings = project.get_project_settings()
    spider_loader = SpiderLoader.from_settings(project_settings)

    feed_uri = ""
    feed_format = "json"

    spider_classes = []

    if spider_name.lower() != 'all':
        spider_classes.append(spider_loader.load(spider_name))
    else:
        for name in spider_loader.list():
            spider_classes.append(spider_loader.load(name))

    if is_in_aws():
        # AWS Lambda only allows writes under /tmp; feeds are shipped to S3.
        settings['HTTPCACHE_DIR'] = '/tmp'
        bucket = os.getenv('FEED_BUCKET_NAME')
        feed_uri = "s3://{}/%(name)s-%(time)s.{}".format(bucket, feed_format)
    else:
        output = os.path.join(os.getcwd(), "output")
        feed_uri = "file://{}/%(name)s-%(time)s.{}".format(output, feed_format)

    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    configure_logging()
    for spider in spider_classes:
        run_spider({**project_settings, **settings}, spider)
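crawl() relies on two project-local helpers that are not part of this listing: is_in_aws() and run_spider(). A minimal sketch of what they might look like, assuming the AWS check is a Lambda environment-variable probe and the runner wraps CrawlerProcess (the real implementations may differ):

# Hypothetical sketches of the helpers used by crawl() above.
import os

from scrapy.crawler import CrawlerProcess


def is_in_aws():
    # AWS Lambda sets this variable in its execution environment.
    return 'AWS_LAMBDA_FUNCTION_NAME' in os.environ


def run_spider(settings, spider_class):
    # Run one spider to completion. Twisted's reactor cannot be restarted,
    # so when crawl() loops over several spiders this would need a different
    # strategy (one CrawlerProcess for all spiders, or one subprocess each).
    process = CrawlerProcess(settings)
    process.crawl(spider_class)
    process.start()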
Example #2
    def test_multiple_dupename_warning(self):
        # copy 2 spider modules so as to have duplicate spider names
        # This should issue a single warning that lists both duplicated names
        shutil.copyfile(
            os.path.join(self.tmpdir, 'test_spiders_xxx/spider1.py'),
            os.path.join(self.tmpdir, 'test_spiders_xxx/spider1dupe.py'))
        shutil.copyfile(
            os.path.join(self.tmpdir, 'test_spiders_xxx/spider2.py'),
            os.path.join(self.tmpdir, 'test_spiders_xxx/spider2dupe.py'))

        with warnings.catch_warnings(record=True) as w:
            spider_loader = SpiderLoader.from_settings(self.settings)

            self.assertEqual(len(w), 1)
            msg = str(w[0].message)
            self.assertIn("several spiders with the same name", msg)
            self.assertIn("'spider1'", msg)
            self.assertTrue(msg.count("'spider1'") == 2)

            self.assertIn("'spider2'", msg)
            self.assertTrue(msg.count("'spider2'") == 2)

            self.assertNotIn("'spider3'", msg)
            self.assertNotIn("'spider4'", msg)

            spiders = set(spider_loader.list())
            self.assertEqual(spiders,
                             {'spider1', 'spider2', 'spider3', 'spider4'})
Example #3
 def setUp(self):
     orig_spiders_dir = os.path.join(module_dir, 'test_spiders')
     self.tmpdir = tempfile.mkdtemp()
     self.spiders_dir = os.path.join(self.tmpdir, 'test_spiders_xxx')
     shutil.copytree(orig_spiders_dir, self.spiders_dir)
     sys.path.append(self.tmpdir)
     settings = Settings({'SPIDER_MODULES': ['test_spiders_xxx']})
     self.spider_loader = SpiderLoader.from_settings(settings)
Example #5
 def setUp(self):
     orig_spiders_dir = os.path.join(module_dir, "test_spiders")
     self.tmpdir = self.mktemp()
     os.mkdir(self.tmpdir)
     self.spiders_dir = os.path.join(self.tmpdir, "test_spiders_xxx")
     shutil.copytree(orig_spiders_dir, self.spiders_dir)
     sys.path.append(self.tmpdir)
     settings = Settings({"SPIDER_MODULES": ["test_spiders_xxx"]})
     self.spider_loader = SpiderLoader.from_settings(settings)
Example #6
    def test_bad_spider_modules_warning(self):

        with warnings.catch_warnings(record=True) as w:
            module = 'tests.test_spiderloader.test_spiders.doesnotexist'
            # Warn-only mode keeps the broken module from raising on current Scrapy.
            settings = Settings({'SPIDER_MODULES': [module],
                                 'SPIDER_LOADER_WARN_ONLY': True})
            spider_loader = SpiderLoader.from_settings(settings)
            self.assertIn("Could not load spiders from module", str(w[0].message))

            spiders = spider_loader.list()
            self.assertEqual(spiders, [])
    def test_dupename_warning(self):
        # copy 1 spider module so as to have duplicate spider name
        shutil.copyfile(os.path.join(self.tmpdir, 'test_spiders_xxx/spider3.py'),
                        os.path.join(self.tmpdir, 'test_spiders_xxx/spider3dupe.py'))

        with warnings.catch_warnings(record=True) as w:
            spider_loader = SpiderLoader.from_settings(self.settings)

            self.assertEqual(len(w), 1)
            msg = str(w[0].message)
            self.assertIn("several spiders with the same name", msg)
            self.assertIn("'spider3'", msg)

            spiders = set(spider_loader.list())
            self.assertEqual(spiders, set(['spider1', 'spider2', 'spider3', 'spider4']))
Example #8
    def test_bad_spider_modules_warning(self):

        with warnings.catch_warnings(record=True) as w:
            module = 'tests.test_spiderloader.test_spiders.doesnotexist'
            settings = Settings({
                'SPIDER_MODULES': [module],
                'SPIDER_LOADER_WARN_ONLY': True
            })
            spider_loader = SpiderLoader.from_settings(settings)
            if str(w[0].message).startswith("_SixMetaPathImporter"):
                # needed on 3.10 because of https://github.com/benjaminp/six/issues/349,
                # at least until all six versions we can import (including botocore.vendored.six)
                # are updated to 1.16.0+
                w.pop(0)
            self.assertIn("Could not load spiders from module",
                          str(w[0].message))

            spiders = spider_loader.list()
            self.assertEqual(spiders, [])
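The SPIDER_LOADER_WARN_ONLY setting used in these tests makes SpiderLoader emit a warning instead of raising when a module listed in SPIDER_MODULES cannot be imported. Outside of a test the same behaviour can be switched on from the project's settings.py; a minimal sketch (the module path is hypothetical):

# settings.py (sketch): tolerate broken spider modules with a warning
# instead of an ImportError when the SpiderLoader starts up.
SPIDER_MODULES = ['myproject.spiders']   # hypothetical module path
SPIDER_LOADER_WARN_ONLY = True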
Example #9
def cmdline_crawl(args):
    import inspect
    import json
    import os
    import sys

    from scrapy.utils.project import get_project_settings
    from scrapy.spiderloader import SpiderLoader

    settings = get_project_settings()
    spiders = SpiderLoader.from_settings(settings)
    if not args:
        spiderlist = spiders.list()
        if spiderlist:
            print('spiders list {}'.format(spiderlist))
        sys.exit()
    spidername = args.spider
    filepath = inspect.getabsfile(spiders.load(spidername))
    os.environ.pop('SCRAPY_SETTINGS_MODULE', None)
    settings, _conf = _get_settings_and_conf(args)  # project-local helper
    server = connection.get_redis(**settings['REDIS_PARAMS'])
    with open(filepath, encoding='utf-8') as f:
        script = f.read()
    # _send_script_start_work() is a project-local helper that hands the
    # spider source to remote workers through Redis.
    jsondata = _send_script_start_work(spidername, script, server)
    jsondata.pop('script')
    print('send task:')
    print(json.dumps(jsondata, indent=4))
Example #10
    def _prepare_domains_items_refs(self):
        spider_loader = SpiderLoader.from_settings(self.settings)

        if hasattr(self, 'spiders'):
            spider_names = self.spiders.split(',')
        else:
            spider_names = [spider_name for spider_name in spider_loader.list()
                            if spider_name not in self.spiders_ignored]

        for spider_name in spider_names:
            Spider = spider_loader.load(spider_name)

            # Tag each item reference with its spider and index it by domain.
            for domain in Spider.allowed_domains:
                for item_ref in Spider.items_refs:
                    item_ref['spider_name'] = spider_name

                self.domains_items_refs[domain] = Spider.items_refs

            self.allowed_domains += Spider.allowed_domains

        self.allowed_domains.sort(key=len, reverse=True)
Example #11
from scrapy.extensions.httpcache import FilesystemCacheStorage
from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

SETTINGS = get_project_settings()
STORAGE = FilesystemCacheStorage(SETTINGS)
SPIDER_LOADER = SpiderLoader.from_settings(SETTINGS)
SPIDER_CLASS = SPIDER_LOADER.load('ksdata')
SPIDER = SPIDER_CLASS(file='../sample.csv')


def get_busted_caches():
    for request in SPIDER.start_requests():
        meta = STORAGE._read_meta(SPIDER, request)
        # _read_meta() returns None for requests that were never cached.
        if meta and meta['status'] in SETTINGS['HTTPCACHE_IGNORE_HTTP_CODES']:
            yield STORAGE._get_request_path(SPIDER, request)


if __name__ == "__main__":
    import shutil

    for path in get_busted_caches():
        shutil.rmtree(path)
Example #12
 def test_load_base_spider(self):
     module = 'tests.test_spiderloader.test_spiders.spider0'
     settings = Settings({'SPIDER_MODULES': [module]})
     self.spider_loader = SpiderLoader.from_settings(settings)
     assert len(self.spider_loader._spiders) == 0
Example #13
 def test_load_spider_module(self):
     prefix = 'tests.test_spiderloader.test_spiders.'
     module = ','.join(prefix + s for s in ('spider1', 'spider2'))
     settings = Settings({'SPIDER_MODULES': module})
     self.spider_loader = SpiderLoader.from_settings(settings)
     assert len(self.spider_loader._spiders) == 2
Example #14
from scrapy.spiderloader import SpiderLoader
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
process = CrawlerProcess(settings)
spider_loader = SpiderLoader.from_settings(settings)

ranking_spider = spider_loader.load("d1rankings")

print("----------------Ranking spider started----------------")
process.crawl(ranking_spider)
process.start()
print("----------------Ranking spider finished----------------\n")
Example #15
 def test_load_spider_module(self):
     prefix = "tests.test_spiderloader.test_spiders."
     module = ",".join(prefix + s for s in ("spider1", "spider2"))
     settings = Settings({"SPIDER_MODULES": module})
     self.spider_loader = SpiderLoader.from_settings(settings)
     assert len(self.spider_loader._spiders) == 2
Example #16
def main():
    logger.warn("Loading project")

    s3_bucket = os.environ.get('S3_BUCKET')
    assert s3_bucket, "Please specify an S3_BUCKET environment variable"

    utcnow = datetime.datetime.utcnow()
    tstamp = utcnow.strftime('%F-%H-%M-%S')
    pool_size = 12

    settings = get_project_settings()
    spider_loader = SpiderLoader.from_settings(settings)
    spider_names = spider_loader.list()

    pool = multiprocessing.Pool(pool_size, maxtasksperchild=1)
    logger.info("Starting to crawl %s spiders on %s processes",
                len(spider_names), pool_size)
    results = pool.imap_unordered(run_one_spider, spider_names)
    pool.close()
    pool.join()
    logger.info("Done crawling")

    # The imap_unordered call returns an iterator, so throw it in a list
    results = list(results)

    client = boto3.client('s3')
    s3_key_prefix = "runs/{}".format(tstamp)

    # Concatenate and gzip the output geojsons
    _, output_gz_filename = tempfile.mkstemp('.geojson.gz')
    with gzip.open(output_gz_filename, 'wb') as f_out:
        for r in results:
            with open(r.pop('output_filename'), 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)

    s3_output_size = os.path.getsize(output_gz_filename)
    s3_output_size_mb = s3_output_size / 1024 / 1024

    # Post it to S3
    s3_output_key = '{}/output.geojson.gz'.format(s3_key_prefix)
    client.upload_file(
        output_gz_filename,
        s3_bucket,
        s3_output_key,
        ExtraArgs={
            'ACL': 'public-read',
            'ContentType': 'application/json',
            'ContentDisposition':
                'attachment; filename="output-{}.geojson.gz"'.format(tstamp),
        })

    logger.warn("Saved output to https://s3.amazonaws.com/%s/%s", s3_bucket,
                s3_output_key)

    # Concatenate and gzip the log files
    _, log_gz_filename = tempfile.mkstemp('.log.gz')
    with gzip.open(log_gz_filename, 'wb') as f_out:
        for r in results:
            with open(r.pop('log_filename'), 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)

    # Post it to S3
    s3_log_key = '{}/all_logs.txt.gz'.format(s3_key_prefix)
    client.upload_file(log_gz_filename,
                       s3_bucket,
                       s3_log_key,
                       ExtraArgs={
                           'ACL': 'public-read',
                           'ContentType': 'text/plain; charset=utf-8',
                           'ContentEncoding': 'gzip',
                       })

    logger.warn("Saved logfile to https://s3.amazonaws.com/%s/%s", s3_bucket,
                s3_log_key)

    metadata = {
        'spiders': results,
        'links': {
            'download_url': "https://s3.amazonaws.com/{}/{}".format(
                s3_bucket, s3_output_key),
            'log_url': "https://s3.amazonaws.com/{}/{}".format(
                s3_bucket, s3_log_key),
        },
    }

    with open('metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2, default=json_serial)

    s3_key = '{}/metadata.json'.format(s3_key_prefix)
    client.upload_file('metadata.json',
                       s3_bucket,
                       s3_key,
                       ExtraArgs={
                           'ACL': 'public-read',
                           'ContentType': 'application/json; charset=utf-8',
                       })
    logger.warn("Saved metadata to https://s3.amazonaws.com/%s/%s", s3_bucket,
                s3_key)

    s3_key = 'runs/latest/metadata.json'
    client.upload_file('metadata.json',
                       s3_bucket,
                       s3_key,
                       ExtraArgs={
                           'ACL': 'public-read',
                           'ContentType': 'application/json; charset=utf-8',
                       })
    logger.warn("Saved metadata to https://s3.amazonaws.com/%s/%s", s3_bucket,
                s3_key)

    total_count = sum(filter(None, (s['item_scraped_count'] for s in results)))
    template_content = {
        'download_url': 'https://s3.amazonaws.com/{}/{}'.format(
            s3_bucket, s3_output_key),
        'download_size': round(s3_output_size_mb, 1),
        'row_count': total_count,
        'spider_count': len(results),
        'updated_datetime': utcnow.replace(microsecond=0).isoformat(),
    }
    with open('info_embed.html', 'w') as f:
        f.write("<html><body>"
                "<a href=\"{download_url}\">Download</a> "
                "({download_size} MB)<br/><small>{row_count:,} rows from "
                "{spider_count} spiders, updated {updated_datetime}Z</small>"
                "</body></html>\n".format(**template_content))
    s3_key = 'runs/latest/info_embed.html'
    client.upload_file('info_embed.html',
                       s3_bucket,
                       s3_key,
                       ExtraArgs={
                           'ACL': 'public-read',
                           'ContentType': 'text/html; charset=utf-8',
                       })
    logger.warn("Saved embed to https://s3.amazonaws.com/%s/%s", s3_bucket,
                s3_key)
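run_one_spider(), the worker handed to the multiprocessing pool, is not shown in the listing; from the way main() consumes its results it has to return a dict with at least output_filename, log_filename and item_scraped_count keys. A hypothetical sketch under those assumptions (the feed format and the real implementation are project specific and may differ):

# Hypothetical pool worker for main() above. Each worker process runs a
# single crawl (maxtasksperchild=1), so a CrawlerProcess per call is safe.
import tempfile

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def run_one_spider(spider_name):
    _, output_filename = tempfile.mkstemp('.geojson')
    _, log_filename = tempfile.mkstemp('.log')

    settings = get_project_settings()
    settings.set('FEED_URI', 'file://' + output_filename)
    settings.set('FEED_FORMAT', 'geojson')  # assumes a project feed exporter
    settings.set('LOG_FILE', log_filename)

    process = CrawlerProcess(settings)
    crawler = process.create_crawler(spider_name)
    process.crawl(crawler)
    process.start()

    return {
        'spider_name': spider_name,
        'output_filename': output_filename,
        'log_filename': log_filename,
        'item_scraped_count': crawler.stats.get_value('item_scraped_count'),
    }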
Example #17
 def test_load_spider_module_multiple(self):
     prefix = 'tests.test_spiderloader.test_spiders.'
     module = ','.join(prefix + s for s in ('spider1', 'spider2'))
     settings = Settings({'SPIDER_MODULES': module})
     self.spider_loader = SpiderLoader.from_settings(settings)
     assert len(self.spider_loader._spiders) == 2
Example #18
 def __init__(self):
     settings = get_scraper_settings()
     loader = SpiderLoader.from_settings(settings)
     self.spider_names = loader.list()
     self.server = redis.StrictRedis(host=settings.get('REDIS_HOST'), port=settings.get('REDIS_PORT'))
Example #19
 def list_spiders(self):
     loader = SpiderLoader.from_settings(self.settings)
     return loader.list()
Example #21
 def test_load_spider_module(self):
     module = "tests.test_spiderloader.test_spiders.spider1"
     settings = Settings({"SPIDER_MODULES": [module]})
     self.spider_loader = SpiderLoader.from_settings(settings)
     assert len(self.spider_loader._spiders) == 1