Example #1
0
def app_factory(name):
    """
    Create a Flask app using scraper specific config. Set up logging and add
    a basic `health` endpoint.

    To be able to use scraper specific config in a module this function has to
    be called first, e.g.

        app = app_factory(__name__)
        urlbroker_class = get_class_from_path(Config.URLBROKER_CLASS)

    :argument name: app name
    :type name: str

    :returns: Flask app
    """
    # Function-scope import so this block does not rely on a module-level
    # `import os` being present.
    import os

    # Apply the scraper specific config before anything else is set up (see
    # the note in the docstring about calling this function first).
    apply_scraper_config()

    app = Flask(name)

    # The logger is named after the last component of the app's root path.
    # `os.path.basename` is used instead of splitting on "/" so the result is
    # also correct where the path separator is not "/" (e.g. Windows); on
    # POSIX both forms return the same value.
    logger_name = os.path.basename(app.root_path)
    setup_logging(logger_name)

    @app.route("/health/")
    def healthcheck():
        # An empty body is enough for a health probe.
        return ""

    return app
Example #2
0

# Configure TorIpChanger with the proxy, password, port and retry settings
# taken from Config.
tor_ip_changer = TorIpChanger(
    reuse_threshold=0,  # We need to remember all exhausted IPs.
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Configure useragents: store the result of get_user_agents() on Config.
Config.USER_AGENTS = get_user_agents()

# Configure logging under the "example-scraper2" logger name.
setup_logging(logger_name="example-scraper2")


# Prepare the scraping pipeline: the databaser is built from the scraper's
# `db_file` and `db_table`, and the pipeline receives the scraper, the
# databaser and the IP changer configured above.
scraper = ExampleScraper2()
databaser = Databaser(scraper.db_file, scraper.db_table)
pipeline = ExhaustApiLimitPipeLine(scraper, databaser, tor_ip_changer)
# NOTE(review): presumably creates the worker processes/queues the pipeline
# uses later -- confirm against the pipeline class.
pipeline.prepare_multiprocessing()

try:
    services.start_backbone_services()

    # Change IP before starting.
    pipeline.tor_ip_changer.get_new_ip()

    # Collect item properties.
Example #3
0

# Configure TorIpChanger; every setting, including the IP reuse threshold,
# is taken from Config.
tor_ip_changer = TorIpChanger(
    reuse_threshold=Config.REUSE_THRESHOLD,
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS
)

# Configure useragents: store the result of get_user_agents() on Config.
Config.USER_AGENTS = get_user_agents()

# Configure logging under the "example-scraper" logger name.
setup_logging(logger_name='example-scraper')


# Prepare the scraping pipeline: the databaser is built from the scraper's
# `db_file` and `db_table`, and the pipeline receives the scraper, the
# databaser and the IP changer configured above.
scraper = ExampleScraper()
databaser = Databaser(scraper.db_file, scraper.db_table)
pipeline = Pipeline(scraper, databaser, tor_ip_changer)
# NOTE(review): presumably creates the worker processes/queues the pipeline
# uses later -- confirm against the pipeline class.
pipeline.prepare_multiprocessing()

try:
    services.start_backbone_services()

    # Change IP before starting.
    pipeline.tor_ip_changer.get_new_ip()

    # Collect item URLs.
from scrapemeagain.config import Config
from scrapemeagain.dockerized.utils import (
    apply_scraper_config,
    get_class_from_path,
)
from scrapemeagain.pipeline import DockerizedPipeline
from scrapemeagain.utils.logger import setup_logging
from scrapemeagain.utils.useragents import get_user_agents

from examplescraper.databaser import DockerizedDatabaser
from examplescraper.scraper import DockerizedExampleScraper

# Update config, set up logging and useragents. The scraper specific config
# is applied first, before any other setup runs.
apply_scraper_config()
setup_logging(logger_name="examplescraper")
# NOTE(review): `__file__` is passed to get_user_agents here; presumably it
# is used to locate a scraper specific useragents file -- confirm.
Config.USER_AGENTS = get_user_agents(__file__)

# Configure DockerizedTorIpChanger. The concrete class is resolved at runtime
# from `Config.TORIPCHANGER_CLASS`; no `reuse_threshold` is passed here.
toripchanger_class = get_class_from_path(Config.TORIPCHANGER_CLASS)
tor_ip_changer = toripchanger_class(
    local_http_proxy=Config.LOCAL_HTTP_PROXY,
    tor_password=Config.TOR_PASSWORD,
    tor_port=Config.TOR_PORT,
    new_ip_max_attempts=Config.NEW_IP_MAX_ATTEMPTS,
)

# Prepare the scraping pipeline: the databaser is built from the scraper's
# `db_file`, and the pipeline receives the scraper, the databaser and the IP
# changer configured above.
scraper = DockerizedExampleScraper()
databaser = DockerizedDatabaser(scraper.db_file)
pipeline = DockerizedPipeline(scraper, databaser, tor_ip_changer)
pipeline.prepare_pipeline()
Example #5
0
import flask
from toripchanger import TorIpChanger

from scrapemeagain.config import Config
from scrapemeagain.dockerized.utils import (
    apply_scraper_config,
    get_class_from_path,
)
from scrapemeagain.utils.logger import setup_logging

# Set up logging and apply the scraper specific config.
# NOTE(review): logging is configured before the scraper config is applied
# here; if setup_logging reads values that apply_scraper_config overrides,
# the two calls should be swapped -- confirm.
setup_logging(__name__)
apply_scraper_config()

# Module-level singletons used by the Flask views below. The datastore and
# URL broker classes are resolved at runtime from the paths held in Config.
datastore_class = get_class_from_path(Config.DATASTORE_DATABASER_CLASS)
DATASTORE = datastore_class()
urlbroker_class = get_class_from_path(Config.URLBROKER_CLASS)
URLBROKER = urlbroker_class()
# Only `reuse_threshold` is passed; all other TorIpChanger settings keep
# their defaults.
IPSTORE = TorIpChanger(reuse_threshold=Config.IPSTORE_REUSE_THRESHOLD)

app = flask.Flask(__name__)


@app.route("/health/")
def healthcheck():
    """Answer a request to `/health/` with an empty response body."""
    empty_body = ""
    return empty_body


@app.route("/ip-is-safe/<ip>/")
def ip_is_safe(ip):
    safe = IPSTORE._ip_is_safe(ip)
    if safe: