Python DupChecker.abort_by_url_hashの例

プログラミング言語: Python

名前空間/パッケージ名: alert.scrapers.DupChecker

クラス/型: DupChecker

メソッド/関数: abort_by_url_hash

hotexamples.comのコード掲載数: 4

Python DupChecker.abort_by_url_hash - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのalert.scrapers.DupChecker.DupChecker.abort_by_url_hashの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示非表示

DupChecker(4)

abort_by_url_hash(2)

reset(2)

should_we_continue_break_or_carry_on(2)

update_site_hash(2)

abort_by_hash(1)

コード例 #1

ファイルを表示

ファイル: cl_scrape_oral_arguments.py プロジェクト: ellliottt/courtlistener

    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(
                    site.download_urls[i],
                    site.cookies,
                    method=site.method
                )
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING',
                             court=court,
                             message=msg).save()
                    continue

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                sha1_hash = hashlib.sha1(r.content).hexdigest()
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Audio,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

                if onwards == 'CONTINUE':
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == 'BREAK':
                    # It's a duplicate, and we hit a date or dup_count threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == 'CARRY_ON':
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' % site.download_urls[i])
                    dup_checker.reset()

                    docket, audio_file = self.associate_meta_data_to_objects(
                        site, i, court, sha1_hash)

                    audio_file.docket = docket

                    # Make and associate the file object
                    try:
                        cf = ContentFile(r.content)
                        extension = get_extension(r.content)
                        if extension not in ['.mp3', '.wma']:
                            extension = '.' + site.download_urls[i].rsplit('.', 1)[1]
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(), 75) + extension
                        audio_file.local_path_original_file.save(file_name, cf, save=False)
                    except:
                        msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                              (site.case_names[i], traceback.format_exc())
                        logger.critical(msg)
                        ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
                        download_error = True
                        continue

                    self.save_everything(docket, audio_file)
                    random_delay = random.randint(0, 3600)
                    process_audio_file.apply_async(
                        (audio_file.pk,),
                        countdown=random_delay
                    )

                    logger.info("Successfully added audio file %s: %s" % (audio_file.pk, site.case_names[i]))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled oral arguments." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)

コード例 #2

ファイルを表示

ファイル: cl_scrape_opinions.py プロジェクト: ishammi/courtlistener

    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split(".")[-1].split("_")[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(
                    site.download_urls[i], site.cookies, site._get_adapter_instance(), method=site.method
                )
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level="WARNING", court=court, message=msg).save()
                    continue
                content = site.cleanup_content(r.content)

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                # Make a hash of the data
                if isinstance(content, unicode):
                    sha1_hash = hashlib.sha1(content.encode("utf-8")).hexdigest()
                else:
                    sha1_hash = hashlib.sha1(content).hexdigest()
                if court_str == "nev" and site.precedential_statuses[i] == "Unpublished":
                    # Nevada's non-precedential cases have different SHA1
                    # sums every time.
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document, current_date, next_date, lookup_value=site.download_urls[i], lookup_by="download_url"
                    )
                else:
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document, current_date, next_date, lookup_value=sha1_hash, lookup_by="sha1"
                    )

                if onwards == "CONTINUE":
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == "BREAK":
                    # It's a duplicate, and we hit a date or dup_count
                    # threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == "CARRY_ON":
                    # Not a duplicate, carry on
                    logger.info("Adding new document found at: %s" % site.download_urls[i].encode("utf-8"))
                    dup_checker.reset()

                    cite, docket, doc = self.associate_meta_data_to_objects(site, i, court, sha1_hash)

                    # Make and associate the file object
                    try:
                        cf = ContentFile(content)
                        extension = get_extension(content)
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(), 75) + extension
                        doc.local_path.save(file_name, cf, save=False)
                    except:
                        msg = "Unable to save binary to disk. Deleted " "document: % s.\n % s" % (
                            site.case_names[i],
                            traceback.format_exc(),
                        )
                        logger.critical(msg.encode("utf-8"))
                        ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
                        download_error = True
                        continue

                    # Save everything, but don't update Solr index yet
                    self.save_everything(cite, docket, doc, index=False)
                    random_delay = random.randint(0, 3600)
                    extract_doc_content.delay(doc.pk, callback=subtask(extract_by_ocr), citation_countdown=random_delay)

                    logger.info(
                        "Successfully added doc {pk}: {name}".format(pk=doc.pk, name=site.case_names[i].encode("utf-8"))
                    )

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)

コード例 #3

ファイルを表示

ファイル: cl_scrape_opinions.py プロジェクト: aktary/courtlistener

    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(site.download_urls[i],
                                            site.cookies,
                                            method=site.method)
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                # Make a hash of the data
                sha1_hash = hashlib.sha1(r.content).hexdigest()
                if court_str == 'nev' and site.precedential_statuses[
                        i] == 'Unpublished':
                    # Nevada's non-precedential cases have different SHA1 sums every time.
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document,
                        current_date,
                        next_date,
                        lookup_value=site.download_urls[i],
                        lookup_by='download_url')
                else:
                    onwards = dup_checker.should_we_continue_break_or_carry_on(
                        Document,
                        current_date,
                        next_date,
                        lookup_value=sha1_hash,
                        lookup_by='sha1')

                if onwards == 'CONTINUE':
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == 'BREAK':
                    # It's a duplicate, and we hit a date or dup_count threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == 'CARRY_ON':
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                site.download_urls[i])
                    dup_checker.reset()

                    cite, docket, doc = self.associate_meta_data_to_objects(
                        site, i, court, sha1_hash)

                    # Make and associate the file object
                    try:
                        cf = ContentFile(r.content)
                        extension = get_extension(r.content)
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(),
                                          75) + extension
                        doc.local_path.save(file_name, cf, save=False)
                    except:
                        msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                              (site.case_names[i], traceback.format_exc())
                        logger.critical(msg)
                        ErrorLog(log_level='CRITICAL',
                                 court=court,
                                 message=msg).save()
                        download_error = True
                        continue

                    # Save everything, but don't update Solr index yet
                    self.save_everything(cite, docket, doc, index=False)
                    random_delay = random.randint(0, 3600)
                    extract_doc_content.delay(doc.pk,
                                              callback=subtask(extract_by_ocr),
                                              citation_countdown=random_delay)

                    logger.info("Successfully added doc {pk}: {name}".format(
                        pk=doc.pk, name=site.case_names[i]))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled opinions." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)

コード例 #4

ファイルを表示

    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(site.download_urls[i],
                                            site.cookies,
                                            site._get_adapter_instance(),
                                            method=site.method)
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue
                content = site.cleanup_content(r.content)

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                sha1_hash = hashlib.sha1(content).hexdigest()
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Audio,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1')

                if onwards == 'CONTINUE':
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == 'BREAK':
                    # It's a duplicate, and we hit a date or dup_count threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == 'CARRY_ON':
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                site.download_urls[i])
                    dup_checker.reset()

                    docket, audio_file = self.associate_meta_data_to_objects(
                        site, i, court, sha1_hash)

                    # Make and associate the file object
                    try:
                        cf = ContentFile(content)
                        extension = get_extension(content)
                        if extension not in ['.mp3', '.wma']:
                            extension = '.' + site.download_urls[i].rsplit(
                                '.', 1)[1]
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(),
                                          75) + extension
                        audio_file.local_path_original_file.save(file_name,
                                                                 cf,
                                                                 save=False)
                    except:
                        msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                              (site.case_names[i], traceback.format_exc())
                        logger.critical(msg)
                        ErrorLog(log_level='CRITICAL',
                                 court=court,
                                 message=msg).save()
                        download_error = True
                        continue

                    self.save_everything(docket, audio_file)
                    random_delay = random.randint(0, 3600)
                    process_audio_file.apply_async((audio_file.pk, ),
                                                   countdown=random_delay)

                    logger.info("Successfully added audio file %s: %s" %
                                (audio_file.pk, site.case_names[i]))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled oral arguments." %
                        site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)