Example #1
0
def construct_compose_dict(scraper_package, scraper_config=None):
    """Build a docker-compose configuration dict with one service per scraper.

    :argument scraper_package: name of the scraper package to run
    :type scraper_package: str
    :argument scraper_config: optional scraper specific config module/path
    :returns dict -- a docker-compose v3 structure with all scraper services
    """
    # Apply scraper specific config before services are created, because
    # `Config.SCRAPERS_COUNT` (read below) may be set by it.
    apply_scraper_config(scraper_package, scraper_config)

    docker_compose = {"version": "3", "services": {}}

    # Scraper IDs are 1-based; iterate directly over the final IDs instead
    # of a 0-based loop with a manual increment (also fixes the `sraper_id`
    # typo in the original loop variable).
    for scraper_id in range(1, Config.SCRAPERS_COUNT + 1):
        docker_compose["services"].update(
            create_scraper_service(scraper_id, scraper_package, scraper_config)
        )

    return docker_compose
def construct_compose_file(path, config=None):
    """Render a docker-compose file for the scraper package at ``path``.

    :argument path: filesystem path to the scraper package directory
    :type path: str
    :argument config: optional scraper specific config
    :returns str -- the rendered docker-compose YAML (stripped)
    """
    # Apply scraper specific config (but first ensure it's accessible):
    # the package's parent directory must be importable.
    sys.path.insert(0, os.path.dirname(path))
    package = path.split(os.sep)[-1]
    apply_scraper_config(package, config)

    # One service definition per scraper; IDs are 1-based.
    # Loop variable renamed from `id` to avoid shadowing the builtin.
    services = [
        create_scraper_service(scraper_id, package, path, config)
        for scraper_id in range(1, Config.SCRAPERS_COUNT + 1)
    ]

    # A dynamic way to get the path of `scrapemeagain.dockerized` package.
    # Required because this file must be runnable both during development and
    # after installing the package.
    # Credits: https://stackoverflow.com/q/2419416/4183498.
    dockerized_utils_module_path = inspect.getsourcefile(apply_scraper_config)
    template_dir = os.path.dirname(dockerized_utils_module_path)
    with open(os.path.join(template_dir, "docker-compose.yml")) as f:
        template = Template(f.read())

    return template.render(services=services).strip()
from scrapemeagain.config import Config
from scrapemeagain.dockerized.utils import (
    apply_scraper_config,
    get_class_from_path,
)
from scrapemeagain.pipeline import DockerizedPipeline
from scrapemeagain.utils.logger import setup_logging
from scrapemeagain.utils.useragents import get_user_agents

from examplescraper.databaser import DockerizedDatabaser
from examplescraper.scraper import DockerizedExampleScraper

# Update config, setup logging and useragents.
# NOTE(review): this module runs at import time as a script entry point —
# the side effects below are deliberate. `apply_scraper_config()` must run
# before any `Config.*` attribute is read.
apply_scraper_config()
setup_logging(logger_name="examplescraper")
Config.USER_AGENTS = get_user_agents(__file__)

# Configure DockerizedTorIpChanger.
# The concrete class is resolved dynamically from its dotted path held in
# `Config.TORIPCHANGER_CLASS`, then instantiated with the Tor/proxy settings.
toripchanger_class = get_class_from_path(Config.TORIPCHANGER_CLASS)
tor_ip_changer = toripchanger_class(
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Prepare the scraping pipeline.
# The databaser is bound to the scraper's DB file (presumably where scraped
# items are persisted — verify against DockerizedDatabaser); the pipeline
# wires scraper, databaser and Tor IP changer together.
scraper = DockerizedExampleScraper()
databaser = DockerizedDatabaser(scraper.db_file)
pipeline = DockerizedPipeline(scraper, databaser, tor_ip_changer)
pipeline.prepare_pipeline()