def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.
        """
        blocked = item['blocked_statuses']
        if blocked is not None:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            federal_cite_one=item.get('west_citations', ''),
            state_cite_one=item.get('west_state_citations', ''),
            neutral_cite=item.get('neutral_citations', ''),
        )
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except:
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, error
Example #2
0
    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.
        """
        blocked = item['blocked_statuses']
        if blocked is not None:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            federal_cite_one=item.get('west_citations', ''),
            state_cite_one=item.get('west_state_citations', ''),
            neutral_cite=item.get('neutral_citations', ''),
        )
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except:
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, error
    def make_objects(self, item, court, sha1_hash, content):
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))

        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            date_argued=item['case_dates'],
            source=Docket.SCRAPER,
        )

        audio_file = Audio(
            judges=item.get('judges', ''),
            source='C',
            case_name=item['case_names'],
            case_name_short=case_name_short,
            sha1=sha1_hash,
            download_url=item['download_urls'],
            blocked=blocked,
            date_blocked=date_blocked,
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            if extension not in ['.mp3', '.wma']:
                extension = '.' + item['download_urls'].lower().rsplit('.', 1)[1]
            # See bitbucket issue #215 for why this must be
            # lower-cased.
            file_name = trunc(item['case_names'].lower(), 75) + extension
            audio_file.file_with_date = docket.date_argued
            audio_file.local_path_original_file.save(file_name, cf, save=False)
        except:
            msg = 'Unable to save binary to disk. Deleted audio file: %s.\n ' \
                  '%s' % (item['case_names'], traceback.format_exc())
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, audio_file, error
    def make_objects(self, item, court, sha1_hash, content):
        blocked = item['blocked_statuses']
        if blocked is not None:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))

        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            date_argued=item['case_dates'],
            source=Docket.SCRAPER,
        )

        audio_file = Audio(
            judges=item.get('judges', ''),
            source='C',
            case_name=item['case_names'],
            case_name_short=case_name_short,
            sha1=sha1_hash,
            download_url=item['download_urls'],
            blocked=blocked,
            date_blocked=date_blocked,
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            if extension not in ['.mp3', '.wma']:
                extension = '.' + item['download_urls'].rsplit('.', 1)[1]
            # See bitbucket issue #215 for why this must be
            # lower-cased.
            file_name = trunc(item['case_names'].lower(), 75) + extension
            audio_file.file_with_date = docket.date_argued
            audio_file.local_path_original_file.save(file_name, cf, save=False)
        except:
            msg = 'Unable to save binary to disk. Deleted audio file: %s.\n ' \
                  '%s' % (item['case_names'], traceback.format_exc())
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, audio_file, error
Example #5
0
 def test_content_extraction(self):
     """Do all of the supported mimetypes get extracted to text
     successfully, including OCR?"""
     test_strings = [
         'supreme',
         'intelligence',
         'indiana',
         'reagan',
         'indiana',
         'fidelity'
     ]
     opinions = Opinion.objects.all()
     for op, test_string in zip(opinions, test_strings):
         ext = get_extension(op.local_path.file.read())
         op = extract_doc_content(op.pk, callback=subtask(extract_by_ocr))
         if ext in ['.html', '.wpd']:
             self.assertIn(test_string, op.html.lower())
         else:
             self.assertIn(test_string, op.plain_text.lower())
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("%s: Attempting to add item at: %s" %
                    (threading.current_thread().name, item['url']))
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except:
            logger.info("%s: Unable to get item at: %s" %
                        (threading.current_thread().name, item['url']))
            queue.task_done()

        if msg:
            logger.warn(msg)
            queue.task_done()
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("%s: Item already exists, moving to next item." %
                        threading.current_thread().name)
            queue.task_done()
            continue
        else:
            # New item, onwards!
            logger.info('%s: Adding new document found at: %s' %
                        (threading.current_thread().name, item['url']))
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
                date_argued=item['date_argued'],
            )
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name, cf,
                                                         save=False)
            except:
                msg = 'Unable to save binary. Deleted document: %s.\n%s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async(
                (audio_file.pk,),
                countdown=random_delay
            )

            logger.info("%s: Successfully added audio file %s: %s" %
                        (threading.current_thread().name,
                         audio_file.pk,
                         audio_file.case_name))
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("%s: Attempting to add item at: %s" %
                    (threading.current_thread().name, item['url']))
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except:
            logger.info("%s: Unable to get item at: %s" %
                        (threading.current_thread().name, item['url']))
            queue.task_done()

        if msg:
            logger.warn(msg)
            queue.task_done()
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("%s: Item already exists, moving to next item." %
                        threading.current_thread().name)
            queue.task_done()
            continue
        else:
            # New item, onwards!
            logger.info('%s: Adding new document found at: %s' %
                        (threading.current_thread().name, item['url']))
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
                date_argued=item['date_argued'],
            )
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name,
                                                         cf,
                                                         save=False)
            except:
                msg = 'Unable to save binary. Deleted document: %s.\n%s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async((audio_file.pk, ),
                                           countdown=random_delay)

            logger.info("%s: Successfully added audio file %s: %s" %
                        (threading.current_thread().name, audio_file.pk,
                         audio_file.case_name))