def main(argv=None):
    """ Command-line entry point, following Guido van Rossum's
        pattern for a Python main function.

        Recognized options:
            -h / --help     print the module docstring and exit
            -i / --init     initialize the scraper database roots
            -r / --reparse  reparse previously scraped articles
            -l / --limit N  maximum number of articles to parse
            -u / --urls F   text file containing a list of URLs to scrape

        Returns 0 on success, 2 on a usage or configuration error.
    """
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(
                argv[1:], "hirl:u:",
                ["help", "init", "reparse", "limit=", "urls="])
        except getopt.error as msg:
            raise Usage(msg)
        # Default settings
        init = False
        # !!! DEBUG default limit on number of articles to parse,
        # unless otherwise specified on the command line
        limit = 10
        reparse = False
        urls = None
        # Process options
        for o, a in opts:
            if o in ("-h", "--help"):
                print(__doc__)
                sys.exit(0)
            elif o in ("-i", "--init"):
                init = True
            elif o in ("-r", "--reparse"):
                reparse = True
            elif o in ("-l", "--limit"):
                # Maximum number of articles to parse
                try:
                    limit = int(a)
                except ValueError:
                    # Report an unusable limit instead of silently
                    # ignoring it; keep the default and continue
                    print("Invalid --limit value '{0}'; using default {1}"
                        .format(a, limit), file=sys.stderr)
            elif o in ("-u", "--urls"):
                urls = a  # Text file with list of URLs
        # Positional arguments (args) are currently not used
        if init:
            # Initialize the scraper database
            init_roots()
        else:
            # Read the configuration settings file
            try:
                Settings.read("config/Reynir.conf")
            except ConfigError as e:
                print("Configuration error: {0}".format(e), file=sys.stderr)
                return 2
            # Run the scraper
            scrape_articles(reparse=reparse, limit=limit, urls=urls)
    except Usage as err:
        print(err.msg, file=sys.stderr)
        print("For help use --help", file=sys.stderr)
        return 2
    finally:
        # Always release database sessions and article resources,
        # whether or not an error occurred
        SessionContext.cleanup()
        Article.cleanup()
    # Completed with no error
    return 0
def init_roots(wait=False):
    """ Create the scraper database tables and initialize the scraping
        roots, if not already present.

        If wait is True, repeated attempts (up to ~3 minutes) are made
        to connect to the database before returning an error code. This
        is useful for instance in a Docker environment where the
        container may need to wait for a linked database container to
        start serving.

        Returns 0 on success, 2 when the retries are exhausted.
        Re-raises the underlying database exception if wait is False.
    """
    ROOTS = [
        # Root URL, top-level domain, description, authority,
        # scraper module, scraper class, scrape flag
        ("http://kjarninn.is", "kjarninn.is", "Kjarninn", 1.0,
            "scrapers.default", "KjarninnScraper", True),
        ("http://www.ruv.is", "ruv.is", "RÚV", 1.0,
            "scrapers.default", "RuvScraper", True),
        ("http://www.visir.is", "visir.is", "Vísir", 0.8,
            "scrapers.default", "VisirScraper", True),
        ("http://www.mbl.is/frettir/", "mbl.is", "Morgunblaðið", 0.6,
            "scrapers.default", "MblScraper", True),
        ("http://eyjan.pressan.is", "eyjan.pressan.is", "Eyjan", 0.4,
            "scrapers.default", "EyjanScraper", True),
        ("http://kvennabladid.is", "kvennabladid.is", "Kvennablaðið", 0.4,
            "scrapers.default", "KvennabladidScraper", True),
        ("http://stjornlagarad.is", "stjornlagarad.is", "Stjórnlagaráð", 1.0,
            "scrapers.default", "StjornlagaradScraper", True),
        ("https://www.forsaetisraduneyti.is", "forsaetisraduneyti.is",
            "Forsætisráðuneyti", 1.0,
            "scrapers.default", "StjornarradScraper", True),
        ("https://www.innanrikisraduneyti.is", "innanrikisraduneyti.is",
            "Innanríkisráðuneyti", 1.0,
            "scrapers.default", "StjornarradScraper", True),
        ("https://www.fjarmalaraduneyti.is", "fjarmalaraduneyti.is",
            "Fjármálaráðuneyti", 1.0,
            "scrapers.default", "StjornarradScraper", True),
        ("http://reykjanes.local", "reykjanes.local", "Reykjanesbær", 1.0,
            "scrapers.reykjanes", "ReykjanesScraper", False),
        ("http://althingi.is", "althingi.is", "Alþingi", 1.0,
            "scrapers.default", "AlthingiScraper", False),
    ]
    # Do no more than 36 retries (~3 minutes, at 5 seconds per retry)
    # before giving up and returning an error code
    retries = 36
    while True:
        try:
            db = SessionContext.db
            db.create_tables()
            with SessionContext() as session:
                for url, domain, description, authority, \
                        scr_module, scr_class, scrape in ROOTS:
                    root = Root(
                        url=url, domain=domain, description=description,
                        authority=authority, scr_module=scr_module,
                        scr_class=scr_class, scrape=scrape,
                        # ".local" domains are scraped (if enabled) but
                        # not shown as visible roots
                        visible=scrape and not domain.endswith(".local"))
                    session.add(root)
                    try:
                        # Commit the insert
                        session.commit()
                    except IntegrityError:
                        # The root already exists: roll back and continue
                        session.rollback()
                rlist = session.query(Root).all()
                print("Roots initialized as follows:")
                for r in rlist:
                    print("{0:24} {1:36} {2:24}".format(
                        r.domain, r.url, r.scr_class))
            # Done without error: break out of the enclosing
            # while True loop
            break
        except Exception as e:
            print("Exception in scraperinit.init_roots(): {0}".format(e),
                file=sys.stderr)
            sys.stderr.flush()
            if not wait:
                # Not waiting for the database: re-raise the exception
                raise
            if not retries:
                # No more retries: return an error code
                return 2
            # Wait until the database responds: sleep and loop
            print("Retrying connection in 5 seconds ({0} retries left)...".
                format(retries), file=sys.stderr)
            sys.stderr.flush()
            sleep(5)
            retries -= 1
            SessionContext.cleanup()
            # Loop to retry
    # Finished without error
    return 0