Python get_binary_contentの例

プログラミング言語: Python

名前空間/パッケージ名: alert.scrapers.management.commands.cl_scrape_opinions

メソッド/関数: get_binary_content

hotexamples.comのコード掲載数: 4

Python get_binary_content - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのalert.scrapers.management.commands.cl_scrape_opinions.get_binary_contentの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

ファイル: cl_scrape_oral_arguments.py プロジェクト: ellliottt/courtlistener

    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(
                    site.download_urls[i],
                    site.cookies,
                    method=site.method
                )
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING',
                             court=court,
                             message=msg).save()
                    continue

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                sha1_hash = hashlib.sha1(r.content).hexdigest()
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Audio,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

                if onwards == 'CONTINUE':
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == 'BREAK':
                    # It's a duplicate, and we hit a date or dup_count threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == 'CARRY_ON':
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' % site.download_urls[i])
                    dup_checker.reset()

                    docket, audio_file = self.associate_meta_data_to_objects(
                        site, i, court, sha1_hash)

                    audio_file.docket = docket

                    # Make and associate the file object
                    try:
                        cf = ContentFile(r.content)
                        extension = get_extension(r.content)
                        if extension not in ['.mp3', '.wma']:
                            extension = '.' + site.download_urls[i].rsplit('.', 1)[1]
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(), 75) + extension
                        audio_file.local_path_original_file.save(file_name, cf, save=False)
                    except:
                        msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                              (site.case_names[i], traceback.format_exc())
                        logger.critical(msg)
                        ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
                        download_error = True
                        continue

                    self.save_everything(docket, audio_file)
                    random_delay = random.randint(0, 3600)
                    process_audio_file.apply_async(
                        (audio_file.pk,),
                        countdown=random_delay
                    )

                    logger.info("Successfully added audio file %s: %s" % (audio_file.pk, site.case_names[i]))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled oral arguments." % site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)

コード例 #2

ファイルを表示

    def scrape_court(self, site, full_crawl=False):
        download_error = False
        # Get the court object early for logging
        # opinions.united_states.federal.ca9_u --> ca9
        court_str = site.court_id.split('.')[-1].split('_')[0]
        court = Court.objects.get(pk=court_str)

        dup_checker = DupChecker(court, full_crawl=full_crawl)
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if not abort:
            if site.cookies:
                logger.info("Using cookies: %s" % site.cookies)
            for i in range(0, len(site.case_names)):
                msg, r = get_binary_content(site.download_urls[i],
                                            site.cookies,
                                            site._get_adapter_instance(),
                                            method=site.method)
                if msg:
                    logger.warn(msg)
                    ErrorLog(log_level='WARNING', court=court,
                             message=msg).save()
                    continue
                content = site.cleanup_content(r.content)

                current_date = site.case_dates[i]
                try:
                    next_date = site.case_dates[i + 1]
                except IndexError:
                    next_date = None

                sha1_hash = hashlib.sha1(content).hexdigest()
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Audio,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1')

                if onwards == 'CONTINUE':
                    # It's a duplicate, but we haven't hit any thresholds yet.
                    continue
                elif onwards == 'BREAK':
                    # It's a duplicate, and we hit a date or dup_count threshold.
                    dup_checker.update_site_hash(sha1_hash)
                    break
                elif onwards == 'CARRY_ON':
                    # Not a duplicate, carry on
                    logger.info('Adding new document found at: %s' %
                                site.download_urls[i])
                    dup_checker.reset()

                    docket, audio_file = self.associate_meta_data_to_objects(
                        site, i, court, sha1_hash)

                    # Make and associate the file object
                    try:
                        cf = ContentFile(content)
                        extension = get_extension(content)
                        if extension not in ['.mp3', '.wma']:
                            extension = '.' + site.download_urls[i].rsplit(
                                '.', 1)[1]
                        # See bitbucket issue #215 for why this must be
                        # lower-cased.
                        file_name = trunc(site.case_names[i].lower(),
                                          75) + extension
                        audio_file.local_path_original_file.save(file_name,
                                                                 cf,
                                                                 save=False)
                    except:
                        msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                              (site.case_names[i], traceback.format_exc())
                        logger.critical(msg)
                        ErrorLog(log_level='CRITICAL',
                                 court=court,
                                 message=msg).save()
                        download_error = True
                        continue

                    self.save_everything(docket, audio_file)
                    random_delay = random.randint(0, 3600)
                    process_audio_file.apply_async((audio_file.pk, ),
                                                   countdown=random_delay)

                    logger.info("Successfully added audio file %s: %s" %
                                (audio_file.pk, site.case_names[i]))

            # Update the hash if everything finishes properly.
            logger.info("%s: Successfully crawled oral arguments." %
                        site.court_id)
            if not download_error and not full_crawl:
                # Only update the hash if no errors occurred.
                dup_checker.update_site_hash(site.hash)

コード例 #3

ファイルを表示

ファイル: import_oa_from_csv.py プロジェクト: aktary/courtlistener

def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("Attempting to add item at: %s" % item['url'])
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except:
            logger.info("Unable to get item at: %s" % item['url'])
            queue.task_done()

        if msg:
            logger.warn(msg)
            queue.task_done()

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("Item already exists, moving to next item.")
            queue.task_done()
        else:
            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name,
                                                         cf,
                                                         save=False)
            except:
                msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async((audio_file.pk, ),
                                           countdown=random_delay)

            logger.info("Successfully added audio file %s: %s" %
                        (audio_file.pk, audio_file.case_name))

コード例 #4

ファイルを表示

ファイル: import_oa_from_csv.py プロジェクト: aktary/courtlistener

def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("Attempting to add item at: %s" % item['url'])
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except:
            logger.info("Unable to get item at: %s" % item['url'])
            queue.task_done()

        if msg:
            logger.warn(msg)
            queue.task_done()

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("Item already exists, moving to next item.")
            queue.task_done()
        else:
            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name, cf,
                                                         save=False)
            except:
                msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async(
                (audio_file.pk,),
                countdown=random_delay
            )

            logger.info("Successfully added audio file %s: %s" % (
                audio_file.pk, audio_file.case_name))