コード例 #1
0
def main(argv=None):
    """ Guido van Rossum's pattern for a Python main function.

        Parses the command line, then either initializes the scraper
        database (-i/--init) or reads the configuration file and runs
        the scraper.

        argv: full argument list with the program name at index 0;
            sys.argv is used when this is None.

        Options:
            -h, --help       print the module docstring and exit(0)
            -i, --init       initialize the scraper database and stop
            -r, --reparse    passed on to scrape_articles()
            -l, --limit=N    maximum number of articles to parse
            -u, --urls=FILE  text file with a list of URLs

        Returns 0 on completion, or 2 on a usage or configuration error.
        SessionContext.cleanup() and Article.cleanup() are called on
        every exit path, including --help and errors. """

    if argv is None:
        argv = sys.argv
    try:
        try:
            # Positional arguments are accepted by getopt but not used
            opts, _ = getopt.getopt(
                argv[1:], "hirl:u:",
                ["help", "init", "reparse", "limit=", "urls="])
        except getopt.error as msg:
            raise Usage(msg)
        init = False
        limit = 10  # !!! DEBUG default limit on number of articles to parse, unless otherwise specified
        reparse = False
        urls = None

        # Process options
        for o, a in opts:
            if o in ("-h", "--help"):
                print(__doc__)
                sys.exit(0)
            elif o in ("-i", "--init"):
                init = True
            elif o in ("-r", "--reparse"):
                reparse = True
            elif o in ("-l", "--limit"):
                # Maximum number of articles to parse
                try:
                    limit = int(a)
                except ValueError:
                    # Keep the current limit, but tell the user instead of
                    # silently ignoring what they typed
                    print(
                        "Ignoring invalid limit '{0}'; using {1}".format(a, limit),
                        file=sys.stderr)
            elif o in ("-u", "--urls"):
                urls = a  # Text file with list of URLs

        if init:

            # Initialize the scraper database
            init_roots()

        else:

            # Read the configuration settings file
            try:
                Settings.read("config/Reynir.conf")
            except ConfigError as e:
                print("Configuration error: {0}".format(e), file=sys.stderr)
                return 2

            # Run the scraper
            scrape_articles(reparse=reparse, limit=limit, urls=urls)

    except Usage as err:
        print(err.msg, file=sys.stderr)
        print("For help use --help", file=sys.stderr)
        return 2

    finally:
        # Release session and article resources no matter how we leave
        SessionContext.cleanup()
        Article.cleanup()

    # Completed with no error
    return 0
コード例 #2
0
ファイル: scraperinit.py プロジェクト: busla/Reynir
def init_roots(wait=False):
    """ Create the database tables and make sure the scraping roots exist.

        Each root is inserted and committed on its own; an IntegrityError
        on commit is taken to mean the root is already there and is
        rolled back. The roots found in the database afterwards are
        printed to stdout.

        With wait=False, any other failure is reported on stderr and
        re-raised. With wait=True, a failure is followed by a 5 second
        sleep and a new attempt, for up to 36 retries; when those are
        used up the function returns 2. This is handy e.g. under Docker,
        where a linked database container may not be serving yet.

        Returns 0 once the roots have been initialized and listed. """

    root_specs = [
        # Root URL, domain, description, authority,
        # scraper module, scraper class, scrape flag
        ("http://kjarninn.is", "kjarninn.is", "Kjarninn", 1.0,
         "scrapers.default", "KjarninnScraper", True),
        ("http://www.ruv.is", "ruv.is", "RÚV", 1.0, "scrapers.default",
         "RuvScraper", True),
        ("http://www.visir.is", "visir.is", "Vísir", 0.8, "scrapers.default",
         "VisirScraper", True),
        ("http://www.mbl.is/frettir/", "mbl.is", "Morgunblaðið", 0.6,
         "scrapers.default", "MblScraper", True),
        ("http://eyjan.pressan.is", "eyjan.pressan.is", "Eyjan", 0.4,
         "scrapers.default", "EyjanScraper", True),
        ("http://kvennabladid.is", "kvennabladid.is", "Kvennablaðið", 0.4,
         "scrapers.default", "KvennabladidScraper", True),
        ("http://stjornlagarad.is", "stjornlagarad.is", "Stjórnlagaráð", 1.0,
         "scrapers.default", "StjornlagaradScraper", True),
        ("https://www.forsaetisraduneyti.is", "forsaetisraduneyti.is",
         "Forsætisráðuneyti", 1.0, "scrapers.default", "StjornarradScraper",
         True),
        ("https://www.innanrikisraduneyti.is", "innanrikisraduneyti.is",
         "Innanríkisráðuneyti", 1.0, "scrapers.default", "StjornarradScraper",
         True),
        ("https://www.fjarmalaraduneyti.is", "fjarmalaraduneyti.is",
         "Fjármálaráðuneyti", 1.0, "scrapers.default", "StjornarradScraper",
         True),
        ("http://reykjanes.local", "reykjanes.local", "Reykjanesbær", 1.0,
         "scrapers.reykjanes", "ReykjanesScraper", False),
        ("http://althingi.is", "althingi.is", "Alþingi", 1.0,
         "scrapers.default", "AlthingiScraper", False)
    ]

    # Retry budget when wait=True: 36 retries x 5 seconds is about 3 minutes
    retries_left = 36

    while True:

        try:

            SessionContext.db.create_tables()

            with SessionContext() as session:

                for (root_url, root_domain, label, weight,
                     module_name, class_name, do_scrape) in root_specs:
                    # A root is visible only if it is scraped and is not
                    # on a .local domain
                    shown = do_scrape and not root_domain.endswith(".local")
                    session.add(
                        Root(url=root_url,
                             domain=root_domain,
                             description=label,
                             authority=weight,
                             scr_module=module_name,
                             scr_class=class_name,
                             scrape=do_scrape,
                             visible=shown))
                    try:
                        # Commit each insert separately so that one
                        # duplicate does not discard the others
                        session.commit()
                    except IntegrityError:
                        # The root already exists: roll back and move on
                        session.rollback()

                stored_roots = session.query(Root).all()
                print("Roots initialized as follows:")
                for root in stored_roots:
                    line = "{0:24} {1:36} {2:24}".format(
                        root.domain, root.url, root.scr_class)
                    print(line)

        except Exception as exc:
            print("Exception in scraperinit.init_roots(): {0}".format(exc),
                  file=sys.stderr)
            sys.stderr.flush()

            if not wait:
                # The caller did not ask us to wait: propagate the error
                raise

            if retries_left == 0:
                # Retry budget exhausted: report failure by error code
                return 2

            print("Retrying connection in 5 seconds ({0} retries left)...".
                  format(retries_left),
                  file=sys.stderr)
            sys.stderr.flush()
            sleep(5)
            retries_left -= 1
            # Drop the session state before the next attempt
            SessionContext.cleanup()

        else:
            # Finished without error
            return 0