def parse_and_scrape_site(self, mod, full_crawl):
    """Run a full backscrape for the scraper module *mod*.

    Derives a short court identifier from the module's dotted name,
    then parses and scrapes every Site yielded for the module's
    back_scrape_iterable.

    NOTE(review): the full_crawl argument is ignored here; scrape_court
    is always called with full_crawl=True — presumably deliberate for
    backscrapes, but confirm.
    """
    module_name = mod.__name__
    # Last dotted component, text before the first underscore,
    # e.g. "...federal_appellate.ca1_u" -> "ca1" — TODO confirm shape.
    court_str = module_name.split(".")[-1].split("_")[0]
    logger.info('Using court_str: "%s"' % court_str)

    back_sites = site_yielder(mod.Site().back_scrape_iterable, mod)
    for back_site in back_sites:
        back_site.parse()
        self.scrape_court(back_site, full_crawl=True)
def parse_and_scrape_site(self, mod, full_crawl):
    """Parse and scrape every back-scrape site for a scraper module.

    The court identifier logged below is the final component of the
    module's dotted name, truncated at the first underscore.

    NOTE(review): full_crawl is unused — True is always forwarded to
    scrape_court; confirm this is intentional for backscrapes.
    """
    leaf = mod.__name__.split(".")[-1]
    court_str = leaf.partition("_")[0]
    logger.info('Using court_str: "%s"' % court_str)

    iterable = mod.Site().back_scrape_iterable
    for site in site_yielder(iterable, mod):
        site.parse()
        self.scrape_court(site, full_crawl=True)
def main():
    """Parse the command line, then scrape each requested court.

    Scrapes every court module named by -c/--courts, optionally looping
    forever (-d/--daemon), downloading binary documents
    (-b/--download_binaries), backscraping the historical corpus
    (--backscrape), and writing an HTML summary (-r/--report).

    Exits 0 on normal completion, 1 if stopped by SIGINT/SIGTERM.
    """
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die safely
    signal.signal(signal.SIGTERM, signal_handler)

    usage = (
        "usage: %prog -c COURTID [-d|--daemon] [-b|--binaries] [-r|--report]\n\n"
        "To test ca1, downloading binaries, use: \n"
        " python %prog -c opinions.united_states.federal_appellate.ca1 -b\n\n"
        "To test all federal courts, omitting binaries, use: \n"
        " python %prog -c opinions.united_states.federal_appellate\n\n"
        # BUG FIX: the separator above was missing, so the usage text
        # previously rendered "...federal_appellatePassing the --report...".
        "Passing the --report option will generate an HTML report in "
        "the root directory after scrapers have run"
    )
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        "--courts",
        dest="court_id",
        metavar="COURTID",
        help=(
            "The court(s) to scrape and extract. This should be in "
            "the form of a python module or package import "
            "from the Juriscraper library, e.g. "
            '"juriscraper.opinions.united_states.federal.ca1" or '
            'simply "opinions" to do all opinions. If desired, '
            # BUG FIX: trailing space added; this previously rendered
            # "...to separatethe import path."
            "you can use slashes instead of dots to separate "
            "the import path."
        ),
    )
    parser.add_option(
        "-d",
        "--daemon",
        action="store_true",
        dest="daemonmode",
        default=False,
        help=(
            "Use this flag to turn on daemon "
            "mode, in which all courts requested "
            "will be scraped in turn, non-stop."
        ),
    )
    parser.add_option(
        "-b",
        "--download_binaries",
        action="store_true",
        dest="binaries",
        default=False,
        help=(
            "Use this flag if you wish to download the pdf, "
            "wpd, and doc files."
        ),
    )
    parser.add_option(
        "-v",
        "--verbosity",
        action="count",
        default=1,
        help="Increase output verbosity (e.g., -vv is more than -v).",
    )
    parser.add_option(
        "--backscrape",
        dest="backscrape",
        action="store_true",
        default=False,
        help="Download the historical corpus using the _download_backwards method.",
    )
    parser.add_option(
        "-r",
        "--report",
        action="store_true",
        default=False,
        help="Generate a report.html with the outcome of running the scrapers",
    )
    (options, args) = parser.parse_args()

    daemon_mode = options.daemonmode
    binaries = options.binaries
    court_id = options.court_id
    backscrape = options.backscrape
    generate_report = options.report

    # Set up the print function
    print("Verbosity is set to: %s" % options.verbosity)

    def _v_print(*verb_args):
        # Emit verb_args[1] only when its level verb_args[0] clears the
        # threshold implied by the -v count (more -v => lower threshold).
        if verb_args[0] > (3 - options.verbosity):
            print(verb_args[1])

    global v_print
    v_print = _v_print

    # Per-court outcomes, keyed by module string; fed to the HTML report.
    results = {}

    if not court_id:
        parser.error("You must specify a court as a package or module.")
    else:
        court_id = court_id.replace("/", ".")
        if court_id.endswith(".py"):
            court_id = court_id[:-3]

        module_strings = build_module_list(court_id)
        if len(module_strings) == 0:
            parser.error("Unable to import module or package. Aborting.")

        v_print(3, "Starting up the scraper.")
        num_courts = len(module_strings)
        i = 0
        while i < num_courts:
            current_court = module_strings[i]
            results[current_court] = {"global_failure": False}
            # this catches SIGINT, so the code can be killed safely.
            if die_now:
                v_print(3, "The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit(".", 1)
            v_print(3, "Current court: %s.%s" % (package, module))

            mod = __import__(
                "%s.%s" % (package, module), globals(), locals(), [module]
            )
            try:
                if backscrape:
                    for site in site_yielder(
                        mod.Site().back_scrape_iterable, mod
                    ):
                        site.parse()
                        scrape_court(site, binaries)
                else:
                    site = mod.Site()
                    v_print(
                        3, "Sent %s request to: %s" % (site.method, site.url)
                    )
                    if site.uses_selenium:
                        v_print(3, "Selenium will be used.")
                    site.parse()
                    results[current_court]["scrape"] = scrape_court(
                        site, binaries
                    )
            except Exception:
                # Record the failure for the report, then move on to the
                # next court rather than killing the whole run.
                results[current_court][
                    "global_failure"
                ] = traceback.format_exc()
                results[current_court]["scrape"] = {}
                v_print(3, "*************!! CRAWLER DOWN !!****************")
                v_print(
                    3,
                    "*****scrape_court method failed on mod: %s*****"
                    % module_strings[i],
                )
                v_print(3, "*************!! ACTION NEEDED !!***************")
                v_print(3, traceback.format_exc())
                i += 1
                continue

            last_court_in_list = i == (num_courts - 1)
            # In daemon mode, wrap around to the first court forever.
            if last_court_in_list and daemon_mode:
                i = 0
            else:
                i += 1

    v_print(3, "The scraper has stopped.")

    if generate_report:
        report_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../report.html")
        )
        v_print(3, "Generating HTML report at %s" % report_path)
        generate_scraper_report(report_path, results)

    sys.exit(0)
def main():
    """Parse the command line, then scrape each requested court.

    Scrapes every court module named by -c/--courts, optionally looping
    forever (-d/--daemon), downloading binary documents
    (-b/--download_binaries), and backscraping the historical corpus
    (--backscrape).

    Exits 0 on normal completion, 1 if stopped by SIGINT/SIGTERM.
    """
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die safely
    signal.signal(signal.SIGTERM, signal_handler)

    usage = ('usage: %prog -c COURTID [-d|--daemon] [-b|--binaries]\n\n'
             'To test ca1, downloading binaries, use: \n'
             ' python %prog -c opinions.united_states.federal_appellate.ca1 -b\n\n'
             'To test all federal courts, omitting binaries, use: \n'
             ' python %prog -c opinions.united_states.federal_appellate')
    parser = OptionParser(usage)
    parser.add_option(
        '-c', '--courts', dest='court_id', metavar="COURTID",
        help=('The court(s) to scrape and extract. This should be in '
              'the form of a python module or package import '
              'from the Juriscraper library, e.g. '
              '"juriscraper.opinions.united_states.federal.ca1" or '
              'simply "opinions" to do all opinions. If desired, '
              # BUG FIX: trailing space added; this previously rendered
              # "...to separatethe import path."
              'you can use slashes instead of dots to separate '
              'the import path.'))
    parser.add_option(
        '-d', '--daemon', action="store_true", dest='daemonmode',
        default=False,
        help=('Use this flag to turn on daemon '
              'mode, in which all courts requested '
              'will be scraped in turn, non-stop.'))
    parser.add_option(
        '-b', '--download_binaries', action='store_true', dest='binaries',
        default=False,
        help=('Use this flag if you wish to download the pdf, '
              'wpd, and doc files.'))
    parser.add_option(
        '-v', '--verbosity', action='count', default=1,
        help='Increase output verbosity (e.g., -vv is more than -v).')
    parser.add_option(
        '--backscrape', dest='backscrape', action='store_true',
        default=False,
        help='Download the historical corpus using the _download_backwards method.')
    (options, args) = parser.parse_args()

    daemon_mode = options.daemonmode
    binaries = options.binaries
    court_id = options.court_id
    backscrape = options.backscrape

    # Set up the print function
    print("Verbosity is set to: %s" % options.verbosity)

    def _v_print(*verb_args):
        # Emit verb_args[1] only when its level verb_args[0] clears the
        # threshold implied by the -v count (more -v => lower threshold).
        if verb_args[0] > (3 - options.verbosity):
            print(verb_args[1])

    global v_print
    v_print = _v_print

    if not court_id:
        parser.error('You must specify a court as a package or module.')
    else:
        court_id = court_id.replace('/', '.')
        if court_id.endswith('.py'):
            court_id = court_id[:-3]

        module_strings = build_module_list(court_id)
        if len(module_strings) == 0:
            parser.error('Unable to import module or package. Aborting.')

        v_print(3, 'Starting up the scraper.')
        num_courts = len(module_strings)
        i = 0
        while i < num_courts:
            # this catches SIGINT, so the code can be killed safely.
            if die_now:
                v_print(3, 'The scraper has stopped.')
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)
            v_print(3, "Current court: %s.%s" % (package, module))

            mod = __import__("%s.%s" % (package, module),
                             globals(), locals(), [module])
            try:
                if backscrape:
                    for site in site_yielder(
                            mod.Site().back_scrape_iterable, mod):
                        site.parse()
                        scrape_court(site, binaries)
                else:
                    site = mod.Site()
                    v_print(3, 'Sent %s request to: %s'
                            % (site.method, site.url))
                    if site.uses_selenium:
                        v_print(3, "Selenium will be used.")
                    site.parse()
                    scrape_court(site, binaries)
            except Exception:
                # Report the failure loudly, then move on to the next
                # court rather than killing the whole run.
                v_print(3, '*************!! CRAWLER DOWN !!****************')
                v_print(3, '*****scrape_court method failed on mod: %s*****'
                        % module_strings[i])
                v_print(3, '*************!! ACTION NEEDED !!***************')
                v_print(3, traceback.format_exc())
                i += 1
                continue

            last_court_in_list = (i == (num_courts - 1))
            # In daemon mode, wrap around to the first court forever.
            if last_court_in_list and daemon_mode:
                i = 0
            else:
                i += 1

    v_print(3, 'The scraper has stopped.')
    sys.exit(0)
def main():
    """Parse the command line, then scrape each requested court.

    Scrapes every court module named by -c/--courts, optionally looping
    forever (-d/--daemon), downloading binary documents
    (-b/--download_binaries), and backscraping the historical corpus
    (--backscrape).

    Exits 0 on normal completion, 1 if stopped by SIGINT/SIGTERM.
    """
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die safely
    signal.signal(signal.SIGTERM, signal_handler)

    usage = ('usage: %prog -c COURTID [-d|--daemon] [-b|--binaries]\n\n'
             'To test ca1, downloading binaries, use: \n'
             ' python %prog -c opinions.united_states.federal_appellate.ca1 -b\n\n'
             'To test all federal courts, omitting binaries, use: \n'
             ' python %prog -c opinions.united_states.federal_appellate')
    parser = OptionParser(usage)
    parser.add_option(
        '-c', '--courts', dest='court_id', metavar="COURTID",
        help=('The court(s) to scrape and extract. This should be in '
              'the form of a python module or package import '
              'from the Juriscraper library, e.g. '
              '"juriscraper.opinions.united_states.federal.ca1" or '
              'simply "opinions" to do all opinions. If desired, '
              # BUG FIX: trailing space added; this previously rendered
              # "...to separatethe import path."
              'you can use slashes instead of dots to separate '
              'the import path.'))
    parser.add_option(
        '-d', '--daemon', action="store_true", dest='daemonmode',
        default=False,
        help=('Use this flag to turn on daemon '
              'mode, in which all courts requested '
              'will be scraped in turn, non-stop.'))
    parser.add_option(
        '-b', '--download_binaries', action='store_true', dest='binaries',
        default=False,
        help=('Use this flag if you wish to download the pdf, '
              'wpd, and doc files.'))
    parser.add_option(
        '-v', '--verbosity', action='count', default=1,
        help='Increase output verbosity (e.g., -vv is more than -v).')
    parser.add_option(
        '--backscrape', dest='backscrape', action='store_true',
        default=False,
        help='Download the historical corpus using the _download_backwards method.')
    (options, args) = parser.parse_args()

    daemon_mode = options.daemonmode
    binaries = options.binaries
    court_id = options.court_id
    backscrape = options.backscrape

    # Set up the print function.
    # CONSISTENCY FIX: this version used Python 2 `print` statements while
    # the rest of the file uses print(); the single-argument call form
    # below behaves identically on Python 2 and is valid Python 3.
    print("Verbosity is set to: %s" % options.verbosity)

    def _v_print(*verb_args):
        # Emit verb_args[1] only when its level verb_args[0] clears the
        # threshold implied by the -v count (more -v => lower threshold).
        if verb_args[0] > (3 - options.verbosity):
            print(verb_args[1])

    global v_print
    v_print = _v_print

    if not court_id:
        parser.error('You must specify a court as a package or module.')
    else:
        court_id = court_id.replace('/', '.')
        if court_id.endswith('.py'):
            court_id = court_id[:-3]

        module_strings = build_module_list(court_id)
        if len(module_strings) == 0:
            parser.error('Unable to import module or package. Aborting.')

        v_print(3, 'Starting up the scraper.')
        num_courts = len(module_strings)
        i = 0
        while i < num_courts:
            # this catches SIGINT, so the code can be killed safely.
            if die_now:
                v_print(3, 'The scraper has stopped.')
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)
            v_print(3, "Current court: %s.%s" % (package, module))

            mod = __import__("%s.%s" % (package, module),
                             globals(), locals(), [module])
            try:
                if backscrape:
                    for site in site_yielder(
                            mod.Site().back_scrape_iterable, mod):
                        site.parse()
                        scrape_court(site, binaries)
                else:
                    site = mod.Site()
                    v_print(3, 'Sent %s request to: %s'
                            % (site.method, site.url))
                    if site.uses_selenium:
                        v_print(3, "Selenium will be used.")
                    site.parse()
                    scrape_court(site, binaries)
            except Exception:
                # Report the failure loudly, then move on to the next
                # court rather than killing the whole run.
                v_print(3, '*************!! CRAWLER DOWN !!****************')
                v_print(3, '*****scrape_court method failed on mod: %s*****'
                        % module_strings[i])
                v_print(3, '*************!! ACTION NEEDED !!***************')
                v_print(3, traceback.format_exc())
                i += 1
                continue

            last_court_in_list = (i == (num_courts - 1))
            # In daemon mode, wrap around to the first court forever.
            if last_court_in_list and daemon_mode:
                i = 0
            else:
                i += 1

    v_print(3, 'The scraper has stopped.')
    sys.exit(0)