Beispiel #1
0
    def make_objects(self, item, court, sha1_hash, content):
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))

        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            date_argued=item['case_dates'],
            source=Docket.SCRAPER,
        )

        audio_file = Audio(
            judges=item.get('judges', ''),
            source='C',
            case_name=item['case_names'],
            case_name_short=case_name_short,
            sha1=sha1_hash,
            download_url=item['download_urls'],
            blocked=blocked,
            date_blocked=date_blocked,
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            if extension not in ['.mp3', '.wma']:
                extension = '.' + item['download_urls'].lower().rsplit('.',
                                                                       1)[1]
            # See bitbucket issue #215 for why this must be
            # lower-cased.
            file_name = trunc(item['case_names'].lower(), 75) + extension
            audio_file.file_with_date = docket.date_argued
            audio_file.local_path_original_file.save(file_name, cf, save=False)
        except:
            msg = 'Unable to save binary to disk. Deleted audio file: %s.\n ' \
                  '%s' % (item['case_names'], traceback.format_exc())
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, audio_file, error
Beispiel #2
0
def make_objects(item, court, sha1_hash, content):
    blocked = item["blocked_statuses"]
    if blocked:
        date_blocked = date.today()
    else:
        date_blocked = None

    case_name_short = item.get("case_name_shorts") or cnt.make_case_name_short(
        item["case_names"])

    docket = Docket(
        docket_number=item.get("docket_numbers", ""),
        case_name=item["case_names"],
        case_name_short=case_name_short,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        date_argued=item["case_dates"],
        source=item.get("source") or Docket.SCRAPER,
    )

    audio_file = Audio(
        judges=item.get("judges", ""),
        source=item.get("cluster_source") or "C",
        case_name=item["case_names"],
        case_name_short=case_name_short,
        sha1=sha1_hash,
        download_url=item["download_urls"],
        blocked=blocked,
        date_blocked=date_blocked,
    )

    error = False
    try:
        cf = ContentFile(content)
        extension = get_extension(content)
        if extension not in [".mp3", ".wma"]:
            extension = "." + item["download_urls"].lower().rsplit(".", 1)[1]
        # See bitbucket issue #215 for why this must be
        # lower-cased.
        file_name = trunc(item["case_names"].lower(), 75) + extension
        audio_file.file_with_date = docket.date_argued
        audio_file.local_path_original_file.save(file_name, cf, save=False)
    except:
        msg = ("Unable to save binary to disk. Deleted audio file: %s.\n "
               "%s" % (item["case_names"], traceback.format_exc()))
        logger.critical(msg.encode())
        ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
        error = True

    return docket, audio_file, error
    def make_objects(self, item, court, sha1_hash, content):
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))

        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            date_argued=item['case_dates'],
            source=Docket.SCRAPER,
        )

        audio_file = Audio(
            judges=item.get('judges', ''),
            source='C',
            case_name=item['case_names'],
            case_name_short=case_name_short,
            sha1=sha1_hash,
            download_url=item['download_urls'],
            blocked=blocked,
            date_blocked=date_blocked,
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            if extension not in ['.mp3', '.wma']:
                extension = '.' + item['download_urls'].lower().rsplit('.', 1)[1]
            # See bitbucket issue #215 for why this must be
            # lower-cased.
            file_name = trunc(item['case_names'].lower(), 75) + extension
            audio_file.file_with_date = docket.date_argued
            audio_file.local_path_original_file.save(file_name, cf, save=False)
        except:
            msg = 'Unable to save binary to disk. Deleted audio file: %s.\n ' \
                  '%s' % (item['case_names'], traceback.format_exc())
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, audio_file, error
    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.
        """
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        west_cite_str = item.get('west_citations', '')
        state_cite_str = item.get('west_state_citations', '')
        neutral_cite_str = item.get('neutral_citations', '')
        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            date_filed_is_approximate=item['date_filed_is_approximate'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            # These three fields are replaced below.
            federal_cite_one=west_cite_str,
            state_cite_one=state_cite_str,
            neutral_cite=neutral_cite_str,
            syllabus=item.get('summaries', ''),
        )
        citations = []
        cite_types = [
            (west_cite_str, Citation.WEST),
            (state_cite_str, Citation.STATE),
            (neutral_cite_str, Citation.NEUTRAL),
        ]
        for cite_str, cite_type in cite_types:
            if cite_str:
                citations.append(make_citation(cite_str, cluster, cite_type))
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except:
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, citations, error
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__("%s.%s" % (package, module), globals(), locals(),
                             [module])
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception as e:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split(
                        '_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                except Exception as e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                time.sleep(wait)
                last_court_in_list = (i == (num_courts - 1))
                if last_court_in_list and options['daemon']:
                    # Start over...
                    logger.info(
                        "All jurisdictions done. Looping back to "
                        "the beginning because daemon mode is enabled.")
                    i = 0
                else:
                    i += 1

        logger.info("The scraper has stopped.")
        sys.exit(0)
def make_objects(item, court, sha1_hash, content):
    """Takes the meta data from the scraper and associates it with objects.

    Returns the created objects.
    """
    blocked = item["blocked_statuses"]
    if blocked:
        date_blocked = date.today()
    else:
        date_blocked = None

    case_name_short = item.get("case_name_shorts") or cnt.make_case_name_short(
        item["case_names"])

    docket = Docket(
        docket_number=item.get("docket_numbers", ""),
        case_name=item["case_names"],
        case_name_short=case_name_short,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        source=item.get("source") or Docket.SCRAPER,
    )

    west_cite_str = item.get("west_citations", "")
    state_cite_str = item.get("west_state_citations", "")
    neutral_cite_str = item.get("neutral_citations", "")
    cluster = OpinionCluster(
        judges=item.get("judges", ""),
        date_filed=item["case_dates"],
        date_filed_is_approximate=item["date_filed_is_approximate"],
        case_name=item["case_names"],
        case_name_short=case_name_short,
        source=item.get("cluster_source") or "C",
        precedential_status=item["precedential_statuses"],
        nature_of_suit=item.get("nature_of_suit", ""),
        blocked=blocked,
        date_blocked=date_blocked,
        syllabus=item.get("summaries", ""),
    )
    citations = []
    cite_types = [
        (west_cite_str, Citation.WEST),
        (state_cite_str, Citation.STATE),
        (neutral_cite_str, Citation.NEUTRAL),
    ]
    for cite_str, cite_type in cite_types:
        if cite_str:
            citations.append(make_citation(cite_str, cluster, cite_type))
    opinion = Opinion(
        type=Opinion.COMBINED,
        sha1=sha1_hash,
        download_url=item["download_urls"],
    )

    error = False
    try:
        cf = ContentFile(content)
        extension = get_extension(content)
        file_name = trunc(item["case_names"].lower(), 75) + extension
        opinion.file_with_date = cluster.date_filed
        opinion.local_path.save(file_name, cf, save=False)
    except:
        msg = "Unable to save binary to disk. Deleted " "item: %s.\n %s" % (
            item["case_names"],
            traceback.format_exc(),
        )
        logger.critical(msg.encode("utf-8"))
        ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
        error = True

    return docket, opinion, cluster, citations, error
    def make_objects(self, item, court, sha1_hash, content):
        """Takes the meta data from the scraper and associates it with objects.

        Returns the created objects.
        """
        blocked = item['blocked_statuses']
        if blocked:
            date_blocked = date.today()
        else:
            date_blocked = None

        case_name_short = (item.get('case_name_shorts') or
                           self.cnt.make_case_name_short(item['case_names']))
        docket = Docket(
            docket_number=item.get('docket_numbers', ''),
            case_name=item['case_names'],
            case_name_short=case_name_short,
            court=court,
            blocked=blocked,
            date_blocked=date_blocked,
            source=Docket.SCRAPER,
        )

        west_cite_str = item.get('west_citations', '')
        state_cite_str = item.get('west_state_citations', '')
        neutral_cite_str = item.get('neutral_citations', '')
        cluster = OpinionCluster(
            judges=item.get('judges', ''),
            date_filed=item['case_dates'],
            date_filed_is_approximate=item['date_filed_is_approximate'],
            case_name=item['case_names'],
            case_name_short=case_name_short,
            source='C',
            precedential_status=item['precedential_statuses'],
            nature_of_suit=item.get('nature_of_suit', ''),
            blocked=blocked,
            date_blocked=date_blocked,
            # These three fields are replaced below.
            federal_cite_one=west_cite_str,
            state_cite_one=state_cite_str,
            neutral_cite=neutral_cite_str,
            syllabus=item.get('summaries', ''),
        )
        citations = []
        cite_types = [
            (west_cite_str, Citation.WEST),
            (state_cite_str, Citation.STATE),
            (neutral_cite_str, Citation.NEUTRAL),
        ]
        for cite_str, cite_type in cite_types:
            if cite_str:
                citations.append(make_citation(cite_str, cluster, cite_type))
        opinion = Opinion(
            type='010combined',
            sha1=sha1_hash,
            download_url=item['download_urls'],
        )

        error = False
        try:
            cf = ContentFile(content)
            extension = get_extension(content)
            file_name = trunc(item['case_names'].lower(), 75) + extension
            opinion.file_with_date = cluster.date_filed
            opinion.local_path.save(file_name, cf, save=False)
        except:
            msg = ('Unable to save binary to disk. Deleted '
                   'item: %s.\n %s' %
                   (item['case_names'], traceback.format_exc()))
            logger.critical(msg.encode('utf-8'))
            ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
            error = True

        return docket, opinion, cluster, citations, error
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__(
                "%s.%s" % (package, module),
                globals(),
                locals(),
                [module]
            )
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception as e:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split('_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(
                        log_level='CRITICAL',
                        court=court,
                        message=msg
                    ).save()
                except Exception as e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
                time.sleep(wait)
                last_court_in_list = (i == (num_courts - 1))
                if last_court_in_list and options['daemon']:
                    # Start over...
                    logger.info("All jurisdictions done. Looping back to "
                                "the beginning because daemon mode is enabled.")
                    i = 0
                else:
                    i += 1

        logger.info("The scraper has stopped.")
        sys.exit(0)