from flask import Flask

from scrapemeagain.dockerized.utils import apply_scraper_config
from scrapemeagain.utils.logger import setup_logging


def app_factory(name):
    """
    Create a Flask app using scraper specific config.

    Set up logging and add a basic `health` endpoint.

    To be able to use scraper specific config in a module, this function
    has to be called first, e.g.

        app = app_factory(__name__)
        urlbroker_class = get_class_from_path(Config.URLBROKER_CLASS)

    :argument name: app name
    :type name: str

    :returns: Flask app
    """
    apply_scraper_config()

    app = Flask(name)

    logger_name = app.root_path.rsplit("/", 1)[-1]
    setup_logging(logger_name)

    @app.route("/health/")
    def healthcheck():
        return ""

    return app
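# A minimal usage sketch (module layout is illustrative, not from the
# original): call app_factory() first so scraper specific config is
# applied before any Config-dependent lookups run at import time.
from scrapemeagain.config import Config
from scrapemeagain.dockerized.utils import get_class_from_path

app = app_factory(__name__)

urlbroker_class = get_class_from_path(Config.URLBROKER_CLASS)
URLBROKER = urlbroker_class()

# The factory already registered the health endpoint:
assert app.test_client().get("/health/").status_code == 200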
from toripchanger import TorIpChanger

from scrapemeagain.config import Config
from scrapemeagain.utils.logger import setup_logging
from scrapemeagain.utils.useragents import get_user_agents

# NOTE: import paths for the project-local names below are assumed by
# analogy with the examplescraper module; adjust to the actual layout.
from examplescraper2.databaser import Databaser
from examplescraper2.pipeline import ExhaustApiLimitPipeLine
from examplescraper2.scraper import ExampleScraper2
from examplescraper2 import services

# Configure TorIpChanger.
tor_ip_changer = TorIpChanger(
    reuse_threshold=0,  # We need to remember all exhausted IPs.
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Configure useragents.
Config.USER_AGENTS = get_user_agents()

# Configure logging.
setup_logging(logger_name="example-scraper2")

# Prepare the scraping pipeline.
scraper = ExampleScraper2()
databaser = Databaser(scraper.db_file, scraper.db_table)
pipeline = ExhaustApiLimitPipeLine(scraper, databaser, tor_ip_changer)
pipeline.prepare_multiprocessing()

try:
    services.start_backbone_services()

    # Change IP before starting.
    pipeline.tor_ip_changer.get_new_ip()

    # Collect item properties.
    # NOTE: the original snippet is truncated here; the call and the
    # cleanup below are assumed completions.
    pipeline.get_item_properties()
finally:
    services.stop_backbone_services()
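# Sketch of what reuse_threshold=0 buys us: every IP the changer has
# seen stays remembered, so it is never reported as safe again. These
# are the same private helpers the IP-store service later in this
# document relies on; their exact semantics are inferred, not verified.
from toripchanger import TorIpChanger

store = TorIpChanger(reuse_threshold=0)
store._manage_used_ips("203.0.113.7")
assert not store._ip_is_safe("203.0.113.7")  # exhausted, never reused

assert store._ip_is_safe("198.51.100.1")  # a fresh IP is still fine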
from toripchanger import TorIpChanger

from scrapemeagain.config import Config
from scrapemeagain.utils.logger import setup_logging
from scrapemeagain.utils.useragents import get_user_agents

# NOTE: import paths for the project-local names below are assumed;
# compare the dockerized variant below for the actual layout.
from scrapemeagain.databaser import Databaser
from scrapemeagain.pipeline import Pipeline
from examplescraper.scraper import ExampleScraper
from examplescraper import services

# Configure TorIpChanger.
tor_ip_changer = TorIpChanger(
    reuse_threshold=Config.REUSE_THRESHOLD,
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Configure useragents.
Config.USER_AGENTS = get_user_agents()

# Configure logging.
setup_logging(logger_name="example-scraper")

# Prepare the scraping pipeline.
scraper = ExampleScraper()
databaser = Databaser(scraper.db_file, scraper.db_table)
pipeline = Pipeline(scraper, databaser, tor_ip_changer)
pipeline.prepare_multiprocessing()

try:
    services.start_backbone_services()

    # Change IP before starting.
    pipeline.tor_ip_changer.get_new_ip()

    # Collect item URLs.
    # NOTE: the original snippet is truncated here; the calls and the
    # cleanup below are assumed completions.
    pipeline.get_item_urls()

    # Collect item properties.
    pipeline.get_item_properties()
finally:
    services.stop_backbone_services()
from scrapemeagain.config import Config
from scrapemeagain.dockerized.utils import (
    apply_scraper_config,
    get_class_from_path,
)
from scrapemeagain.pipeline import DockerizedPipeline
from scrapemeagain.utils.logger import setup_logging
from scrapemeagain.utils.useragents import get_user_agents

from examplescraper.databaser import DockerizedDatabaser
from examplescraper.scraper import DockerizedExampleScraper

# Update config, set up logging and useragents.
apply_scraper_config()
setup_logging(logger_name="examplescraper")
Config.USER_AGENTS = get_user_agents(__file__)

# Configure DockerizedTorIpChanger.
toripchanger_class = get_class_from_path(Config.TORIPCHANGER_CLASS)
tor_ip_changer = toripchanger_class(
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Prepare the scraping pipeline.
scraper = DockerizedExampleScraper()
databaser = DockerizedDatabaser(scraper.db_file)
pipeline = DockerizedPipeline(scraper, databaser, tor_ip_changer)
pipeline.prepare_pipeline()
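# get_class_from_path() resolves a dotted path from config into a class
# object, which is what lets the pipeline pieces be swapped via Config.
# Its actual implementation is not shown here; the standard
# importlib-based approach looks roughly like this (a sketch, not
# scrapemeagain's code):
import importlib


def get_class_from_path_sketch(dotted_path):
    """Resolve e.g. "examplescraper.scraper.DockerizedExampleScraper"."""
    module_path, class_name = dotted_path.rsplit(".", 1)
    return getattr(importlib.import_module(module_path), class_name)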
import flask
from toripchanger import TorIpChanger

from scrapemeagain.config import Config
from scrapemeagain.dockerized.utils import (
    apply_scraper_config,
    get_class_from_path,
)
from scrapemeagain.utils.logger import setup_logging

setup_logging(__name__)
apply_scraper_config()

datastore_class = get_class_from_path(Config.DATASTORE_DATABASER_CLASS)
DATASTORE = datastore_class()

urlbroker_class = get_class_from_path(Config.URLBROKER_CLASS)
URLBROKER = urlbroker_class()

IPSTORE = TorIpChanger(reuse_threshold=Config.IPSTORE_REUSE_THRESHOLD)

app = flask.Flask(__name__)


@app.route("/health/")
def healthcheck():
    return ""


@app.route("/ip-is-safe/<ip>/")
def ip_is_safe(ip):
    safe = IPSTORE._ip_is_safe(ip)
    if safe:
        # The original snippet is truncated here; a safe IP is presumably
        # recorded as used so it is not handed out again, and the result
        # returned as JSON (assumed completion).
        IPSTORE._manage_used_ips(ip)

    return flask.jsonify({"safe": safe})
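# Exercising the endpoint without running a server, via Flask's test
# client. The JSON shape matches the (assumed) flask.jsonify payload in
# the completed handler above.
client = app.test_client()

print(client.get("/ip-is-safe/203.0.113.7/").get_json())
# -> {'safe': True} the first time this IP is seen

print(client.get("/ip-is-safe/203.0.113.7/").get_json())
# -> {'safe': False} once the IP store has recorded it as used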