Example #1
 def get_cell_labels(self, cells, table_headers):
     """We shouldn't have to do this, but the SCOTUS site has a
     persistent problem whereby malformed tables are not uncommon.
     Sporadically a table with have the header and some rows in the
     COLUMN_LABELS_LEGACY format, and other rows in the newer
     COLUMN_LABELS format. Ive contacted the court about this many
     times, and often times they clear their CDN cache the resolve
     the problem, but its become such a reoccuring annoyance that
     I felt the need to implement this workaround. The court hasn't
     provided any indication in our communication that they have any
     understanding of the problem or why it is happening.
     """
     count_cell = len(cells)
     count_headers = len(table_headers)
     acceptable_headers_counts = [len(f) for f in self.TABLE_HEADER_FORMATS]
     if count_cell != count_headers:
         cell_data = ', '.join([c.text_content() for c in cells])
         Logger.warning(
             'Cell/Header mismatch, found %d headers but %d cells: %s'
             % (count_headers, count_cell, cell_data)
         )
     if count_cell not in acceptable_headers_counts:
         raise RuntimeError(
             'Row has an unfamiliar number of cells: %d' % count_cell
         )
     for header_format in self.TABLE_HEADER_FORMATS:
         # First header element must match first element from
         # expected table_headers, since there are multiple
         # formats with the same length
         if len(header_format) == count_cell and header_format[0] == table_headers[0]:
             return header_format
     raise RuntimeError('Should never reach this error')
Example #2
 def get_cell_labels(self, cells, table_headers):
     """We shouldn't have to do this, but the SCOTUS site has a
     persistent problem whereby malformed tables are not uncommon.
     Sporadically a table with have the header and some rows in the
     COLUMN_LABELS_LEGACY format, and other rows in the newer
     COLUMN_LABELS format. Ive contacted the court about this many
     times, and often times they clear their CDN cache the resolve
     the problem, but its become such a reoccuring annoyance that
     I felt the need to implement this workaround. The court hasn't
     provided any indication in our communication that they have any
     understanding of the problem or why it is happening.
     """
     count_cell = len(cells)
     count_headers = len(table_headers)
     acceptable_headers_counts = [len(f) for f in self.TABLE_HEADER_FORMATS]
     if count_cell != count_headers:
         cell_data = ', '.join([c.text_content() for c in cells])
         Logger.warning(
             'Cell/Header mismatch, found %d headers but %d cells: %s' %
             (count_headers, count_cell, cell_data))
     if count_cell not in acceptable_headers_counts:
         raise RuntimeError('Row has an unfamiliar number of cells: %d' %
                            count_cell)
     for header_format in self.TABLE_HEADER_FORMATS:
         # First header element must match first element from
         # expected table_headers, since there are multiple
         # formats with the same length
         if (len(header_format) == count_cell
                 and header_format[0] == table_headers[0]):
             return header_format
     raise RuntimeError('Should never reach this error')
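The selection rule above matches a format by cell count first, then uses the first header label to tell apart formats of equal length. A minimal standalone sketch of that rule, with hypothetical label tuples (the real TABLE_HEADER_FORMATS values live on the scraper class and aren't shown here):

    # Hypothetical label tuples; both have the same length, so the
    # first-label comparison is what disambiguates them.
    COLUMN_LABELS_LEGACY = ('Date', 'Docket', 'Name', 'J.', 'Pt.')
    COLUMN_LABELS = ('R-', 'Date', 'Docket', 'Name', 'J.')
    TABLE_HEADER_FORMATS = [COLUMN_LABELS_LEGACY, COLUMN_LABELS]

    def pick_format(cells, table_headers):
        # Same rule as get_cell_labels: length match, then first label.
        for header_format in TABLE_HEADER_FORMATS:
            if len(header_format) == len(cells) and header_format[0] == table_headers[0]:
                return header_format
        raise RuntimeError('Row has an unfamiliar number of cells: %d' % len(cells))

    # A row with five cells under the newer header resolves to COLUMN_LABELS.
    print(pick_format(['a', 'b', 'c', 'd', 'e'], COLUMN_LABELS))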
Example #3
 def run(self):
     Logger.info('INITIATING DISCOVERY: %s' % timezone.now())
     Logger.info('[**%s**]' % self.OPINIONS_MAIN_PAGE)
     self.fetch_opinion_category_urls()
     self.get_opinions_from_categories()
     Logger.info('INITIATING OPINION INGEST')
     self.ingest_new_opinions()
     Logger.info('INITIATING CITATION SCRAPING AND INGEST')
     self.ingest_new_citations()
     Logger.info('DISCOVERY COMPLETE')
Example #4
 def run(self):
     Logger.info('INITIATING DISCOVERY: %s' % timezone.now())
     Logger.info('[**%s**]' % self.OPINIONS_MAIN_PAGE)
     self.fetch_opinion_category_urls()
     self.get_opinions_from_categories()
     Logger.info('INITIATING OPINION INGEST')
     self.ingest_new_opinions()
     Logger.info('INITIATING CITATION SCRAPING AND INGEST')
     self.ingest_new_citations()
     Logger.info('DISCOVERY COMPLETE')
Example #5
 def handle(self, *args, **options):
     try:
         print('\nRunning discovery. Logging to logs/%s.log\n' %
               time.strftime('%Y%m%d'))
         job = Discovery()
         job.run()
         job.send_email_report()
     except Exception as e:
         Logger.error(traceback.format_exc())
         error = 'FAILED DISCOVERY ERROR: %s' % e
         self.send_error_email(error)
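For context, handle is the entry point of a Django management command; a minimal sketch of the wrapper it presumably sits in (the Discovery/Logger import path is a guess):

    import time
    import traceback

    from django.core.management.base import BaseCommand


    class Command(BaseCommand):
        help = 'Run SCOTUS discovery and email a report'

        def handle(self, *args, **options):
            ...  # body as in Example #5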
Example #6
    def ingest_new_opinions(self):
        # Sort opinions by publication date, oldest to newest
        self.discovered_opinions.sort(key=lambda o: o.published)

        for opinion in self.discovered_opinions:
            if opinion.already_exists():
                Logger.info('Skipping: %s' % opinion.name)
                continue

            Logger.info('Ingesting: %s  %s' % (opinion.name, opinion.pdf_url))

            opinion.save()
            self.new_opinions.append(opinion)
Example #7
    def ingest_new_opinions(self):
        # Sort opinions by publication date, oldest to newest
        self.discovered_opinions.sort(key=lambda o: o.published)

        for opinion in self.discovered_opinions:
            if opinion.already_exists():
                Logger.info('Skipping: %s' % opinion.name)
                continue

            Logger.info('Ingesting: %s  %s' % (opinion.name, opinion.pdf_url))

            opinion.save()
            self.new_opinions.append(opinion)
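The already_exists check is what makes this loop idempotent across runs. A sketch of one plausible implementation, assuming Opinion is a Django model and that docket plus name identify an opinion (the real lookup may use different fields):

    def already_exists(self):
        # Hypothetical duplicate check: an opinion counts as ingested if
        # a record with the same docket and name was saved previously.
        return Opinion.objects.filter(docket=self.docket, name=self.name).exists()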
Example #8
 def _send_email(self, subscriber, citations):
     if settings.EMAIL_HOST_USER != 'YOUR_GMAIL_ADDRESS':
         Logger.info('+sending subscriber notification to %s' % subscriber.email)
         html = get_template('newly_verified_citations_email.html')
         template_parameters = {
             'subscriber': subscriber,
             'domain': settings.ALLOWED_HOSTS[0] if settings.ALLOWED_HOSTS else False,
             'citations': citations,
             'contact_email': settings.EMAIL_HOST_USER if settings.EMAIL_HOST_USER else False
         }
         body = html.render(template_parameters)
         subject = '[scotuswebcites] New citations discovered and verified'
         sender = settings.EMAIL_HOST_USER
         recipient = subscriber.email
         msg = EmailMultiAlternatives(subject, body, sender, [recipient])
         msg.attach_alternative(body, "text/html")
         msg.send()
Example #9
 def send_email_report(self):
     if settings.EMAIL_HOST_USER != 'YOUR_GMAIL_ADDRESS':
         if self.new_opinions or self.new_justices or self.ingested_citations_count or self.failed_scrapes:
             subject = '[scotuswebcites] New Data Discovered'
             recipient = settings.CONTACT_EMAIL
             sender = settings.EMAIL_HOST_USER
             template_parameters = {
                 'new_opinions_count': str(len(self.new_opinions)),
                 'ingested_citations_count': str(self.ingested_citations_count),
                 'new_justices': self.new_justices,
                 'failed_scrapes': self.failed_scrapes,
                 'domain': settings.ALLOWED_HOSTS[0] if settings.ALLOWED_HOSTS else False,
             }
             body = get_template('discovery_report_email.html').render(template_parameters)
             Logger.info('+sending discovery report email from %s to %s' % (sender, recipient))
             msg = EmailMultiAlternatives(subject, body, sender, [recipient])
             msg.attach_alternative(body, "text/html")
             msg.send()
Example #10
 def _send_email(self, subscriber, citations):
     if settings.EMAIL_HOST_USER != 'YOUR_GMAIL_ADDRESS':
         Logger.info('+sending subscriber notification to %s' %
                     subscriber.email)
         template = get_template('newly_verified_citations_email.html')
         template_parameters = {
             'subscriber': subscriber,
             'domain': settings.ALLOWED_HOSTS[0] if settings.ALLOWED_HOSTS else False,
             'citations': citations,
             'contact_email': settings.SENDER_EMAIL if settings.SENDER_EMAIL else False,
         }
         body = template.render(template_parameters)
         subject = '[scotuswebcites] New citations discovered and verified'
         send_email(subject, body, subscriber.email)
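This newer _send_email delegates to a send_email helper instead of building the message inline. A plausible helper body, modeled on the EmailMultiAlternatives usage in Example #8 (the real implementation isn't shown):

    from django.conf import settings
    from django.core.mail import EmailMultiAlternatives


    def send_email(subject, body, recipient):
        # Assumed: send the rendered template as an HTML email, mirroring
        # the inline message construction from the older _send_email.
        msg = EmailMultiAlternatives(subject, body, settings.SENDER_EMAIL, [recipient])
        msg.attach_alternative(body, 'text/html')
        msg.send()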
Example #11
    def get(cls, url=False, err=True):
        if url:
            # Wait 2 seconds between requests
            sleep(cls.SLEEP)
            check = urlparse(url)

            if not check.scheme:
                url = 'http://' + url

            try:
                return requests.get(url, headers=cls.HEADERS, timeout=cls.TIMEOUT)
            except Exception:
                pass

        if err:
            Logger.error('Fetching failed for: %s' % url)

        return False
Example #12
    def ingest_citations(self):
        self.ingested_citation_count = 0

        for url in self.pdf.urls:
            if url in self.previous_publication_citations:
                Logger.info(
                    '--Skipping previously discovered citation for %s: %s' %
                    (self.name, url))
                continue

            Logger.info('++Ingesting citation: %s' % url)

            new_citation = Citation(
                opinion=Opinion(self.id),
                scraped=url,
            )

            new_citation.yyyymmdd = self.published.strftime("%Y%m%d")
            new_citation.get_statuses()
            new_citation.save()
            self.ingested_citation_count += 1
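get_statuses is where each scraped URL presumably gets checked before the citation is saved. A sketch under the assumption that it fetches the URL through the Url.get wrapper from Examples #11/#14 and records the HTTP status (the status field name is a guess):

    def get_statuses(self):
        # Hypothetical: record the live HTTP status of the scraped URL,
        # or None when the fetch fails (Url.get returns False on error).
        response = Url.get(self.scraped)
        self.status = response.status_code if response else None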
Example #13
 def send_email_report(self):
     if settings.EMAIL_HOST_USER != 'YOUR_GMAIL_ADDRESS':
         if self.new_opinions or self.new_justices or self.ingested_citations_count or self.failed_scrapes:
             subject = '[scotuswebcites] New Data Discovered'
             template_parameters = {
                 'new_opinions_count': str(len(self.new_opinions)),
                 'ingested_citations_count': str(self.ingested_citations_count),
                 'new_justices': self.new_justices,
                 'failed_scrapes': self.failed_scrapes,
                 'domain': settings.ALLOWED_HOSTS[0] if settings.ALLOWED_HOSTS else False,
             }
             body = get_template('discovery_report_email.html').render(
                 template_parameters)
             Logger.info('+sending discovery report email from %s to %s' %
                         (settings.SENDER_EMAIL, settings.CONTACT_EMAIL))
             send_email(subject, body, settings.CONTACT_EMAIL)
Example #14
    def get(cls, url=False, err=True):
        if url:
            # Wait 2 seconds between requests
            sleep(cls.SLEEP)
            check = urlparse(url)

            if not check.scheme:
                url = 'http://' + url

            try:
                return requests.get(
                    url,
                    headers=cls.HEADERS,
                    timeout=cls.TIMEOUT,
                )
            except Exception:
                pass

        if err:
            Logger.error('Fetching failed for: %s' % url)

        return False
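get is a classmethod on a small HTTP wrapper, so a class skeleton like the following is implied; SLEEP matches the 'Wait 2 seconds' comment, while TIMEOUT and HEADERS are illustrative values, not the project's:

    from time import sleep
    from urllib.parse import urlparse

    import requests


    class Url:
        SLEEP = 2  # seconds between requests, per the comment above
        TIMEOUT = 30  # illustrative; the real value isn't shown
        HEADERS = {'User-Agent': 'scotuswebcites'}

        @classmethod
        def get(cls, url=False, err=True):
            ...  # body as in Example #14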
Example #15
    def ingest_new_citations(self):
        for opinion in self.new_opinions:
            Logger.info('Downloading: %s  %s' % (opinion.name, opinion.pdf_url))
            opinion.download()
            Logger.info('Scraping: %s  %s' % (opinion.name, opinion.local_pdf))

            try:
                opinion.scrape()
            except Exception:
                Logger.error(traceback.format_exc())
                self.failed_scrapes.append(opinion.name)

            if opinion.pdf.urls:
                Logger.info('Ingesting citations from %s' % opinion.name)
                opinion.ingest_citations()
                self.ingested_citations_count += opinion.ingested_citation_count
Example #16
    def ingest_new_citations(self):
        for opinion in self.new_opinions:
            Logger.info('Downloading: %s  %s' %
                        (opinion.name, opinion.pdf_url))
            opinion.download()
            Logger.info('Scraping: %s  %s' % (opinion.name, opinion.local_pdf))

            try:
                opinion.scrape()
            except Exception:
                Logger.error(traceback.format_exc())
                self.failed_scrapes.append(opinion.name)

            if opinion.pdf.urls:
                Logger.info('Ingesting citations from %s' % opinion.name)
                opinion.ingest_citations()
                self.ingested_citations_count += opinion.ingested_citation_count
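opinion.scrape() fills opinion.pdf.urls before ingest_citations runs. A rough standalone sketch of how URLs might be pulled from already-extracted PDF text (the regex and the de-duplication are assumptions, not the project's actual scraper):

    import re

    URL_PATTERN = re.compile(r'https?://[^\s)>\]]+')

    def scrape_urls(pdf_text):
        # Strip common trailing punctuation, then de-duplicate while
        # preserving first-seen order.
        urls = [u.rstrip('.,;') for u in URL_PATTERN.findall(pdf_text)]
        return list(dict.fromkeys(urls))

    print(scrape_urls('See http://example.com/a and https://example.org/b.'))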
Example #17
    def handle(self, *args, **options):
        Logger.info("RUNNING NOTIFY SUBSCRIBERS FUNCTION: %s" % timezone.now())
        newly_verified_citations = Citation.objects.filter(
            validated__isnull=False,
            notified_subscribers__isnull=True,
        )
        if newly_verified_citations:
            Logger.info('Found %d newly verified citations' % len(newly_verified_citations))

            # Send email notifications to subscribers
            subscribers = Subscriber.objects.filter(subscribed=True)
            if subscribers:
                Logger.info('Emailing newly verified citations to %d subscribers' % len(subscribers))
                for subscriber in subscribers:
                    self._send_email(subscriber, newly_verified_citations)

            # Update citations records to indicate that notifications sent
            for citation in newly_verified_citations:
                citation.notified_subscribers = timezone.now()
                citation.save()
Example #18
    def handle(self, *args, **options):
        Logger.info("RUNNING NOTIFY SUBSCRIBERS FUNCTION: %s" % timezone.now())
        newly_verified_citations = Citation.objects.filter(
            validated__isnull=False,
            notified_subscribers__isnull=True,
        )
        if newly_verified_citations:
            Logger.info('Found %d newly verified citations' %
                        len(newly_verified_citations))

            # Send email notifications to subscribers
            subscribers = Subscriber.objects.filter(subscribed=True)
            if subscribers:
                Logger.info(
                    'Emailing newly verified citations to %d subscribers' %
                    len(subscribers))
                for subscriber in subscribers:
                    self._send_email(subscriber, newly_verified_citations)

            # Update citations records to indicate that notifications sent
            for citation in newly_verified_citations:
                citation.notified_subscribers = timezone.now()
                citation.save()
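The filter above implies two nullable timestamp fields on Citation: one set when a citation is verified, one set once subscribers have been notified. A sketch of that slice of the model, with field types inferred rather than confirmed:

    from django.db import models


    class Citation(models.Model):
        # Partial sketch; the real model also carries the opinion FK,
        # scraped URL, and other fields seen in Example #12.
        validated = models.DateTimeField(null=True, blank=True)
        notified_subscribers = models.DateTimeField(null=True, blank=True)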
Example #19
    def get_opinions_from_categories(self):
        for category_url in self.category_urls:
            category = category_url.split('/')[-2]
            request = Url.get(category_url)

            if request and request.status_code == 200:
                Logger.info('EXTRACTING OPINIONS FROM %s' % category_url)
                html = lxml.html.fromstring(request.text)

                if not html.xpath(self.PATH_TABLES):
                    # It's a new term with an empty page, no table yet
                    Logger.info('SKIPPING BLANK PAGE: %s' % category_url)
                    continue

                table_headers = self.get_table_headers(html)

                for row in html.xpath(self.PATH_TABLE_ROWS):
                    cells = row.xpath('td')
                    if not cells:
                        continue

                    revisions = []
                    row_data = {}
                    cell_count = 0
                    cell_labels = self.get_cell_labels(cells, table_headers)

                    # Parse data from rows in table
                    for cell in cells:
                        cell_label = cell_labels[cell_count]
                        text = cell.text_content().strip()

                        # Skip rows with an empty first cell; these
                        # can appear at the start of a new cycle,
                        # when SCOTUS adds new date pages that do
                        # not yet have records
                        if cell_count == 0 and not text:
                            break

                        if cell_label == 'Revised':
                            # Revised cells can have multiple links,
                            # so they need special handling
                            for anchor in cell.xpath('a'):
                                revisions.append({
                                    'href': anchor.xpath('@href')[0],
                                    'date_string': anchor.text_content(),
                                })
                        else:
                            row_data[cell_label] = text if text else None
                            if cell.xpath('a'):
                                href = cell.xpath('a/@href')
                                row_data[cell_label + '_Url'] = href[0] if href else None

                        cell_count += 1

                    if row_data:
                        Logger.info('Discovered: %s' % row_data['Name'])

                        # Validate the justice, or add a new record for him/her
                        if not Justice.objects.filter(id=row_data['J.']):
                            self.new_justices.append(row_data['J.'])
                            justice = Justice(id=row_data['J.'], name=row_data['J.'])
                            justice.save()

                        # Convert all scraped data to uniform unicode strings
                        for label, data in row_data.items():
                            if data:
                                row_data[label] = str(data)

                        # Create new opinion record from row data
                        self.discovered_opinions.append(Opinion(
                            category=category,
                            reporter=row_data['R-'] if 'R-' in row_data else None,
                            published=self.convert_date_string(row_data['Date']),
                            docket=row_data['Docket'],
                            name=row_data['Name'],
                            pdf_url=self.BASE + row_data['Name_Url'],
                            justice=Justice(row_data['J.']),
                            part=row_data['Pt.'],
                            discovered=timezone.now(),
                        ))

                        # Create opinions for revisions, if any exist
                        for revision in revisions:
                            date_string = revision['date_string']
                            href = revision['href']
                            Logger.info('Discovered REVISION: %s' % row_data['Name'])
                            self.discovered_opinions.append(Opinion(
                                category=category,
                                reporter=row_data['R-'] if 'R-' in row_data else None,
                                published=self.convert_date_string(date_string),
                                docket=row_data['Docket'],
                                name='%s [REVISION]' % row_data['Name'],
                                pdf_url=self.BASE + href,
                                justice=Justice(row_data['J.']),
                                part=row_data['Pt.'],
                                discovered=timezone.now(),
                            ))
Example #20
    def get_opinions_from_categories(self):
        for category_url in self.category_urls:
            category = category_url.split('/')[-2]
            request = Url.get(category_url)

            if request and request.status_code == 200:
                Logger.info('EXTRACTING OPINIONS FROM %s' % category_url)
                html = lxml.html.fromstring(request.text)

                if not html.xpath(self.PATH_TABLES):
                    # It's a new term with an empty page, no table yet
                    Logger.info('SKIPPING BLANK PAGE: %s' % category_url)
                    continue

                table_headers = self.get_table_headers(html)

                for row in html.xpath(self.PATH_TABLE_ROWS):
                    cells = row.xpath('td')
                    if not cells:
                        continue

                    revisions = []
                    row_data = {}
                    cell_count = 0
                    cell_labels = self.get_cell_labels(cells, table_headers)

                    # Parse data from rows in table
                    for cell in cells:
                        cell_label = cell_labels[cell_count]
                        text = cell.text_content().strip()

                        # Skip rows with an empty first cell; these
                        # can appear at the start of a new cycle,
                        # when SCOTUS adds new date pages that do
                        # not yet have records
                        if cell_count == 0 and not text:
                            break

                        if cell_label == 'Revised':
                            # Revised cells can have multiple links,
                            # so they need special handling
                            for anchor in cell.xpath('a'):
                                revisions.append({
                                    'href': anchor.xpath('@href')[0],
                                    'date_string': anchor.text_content(),
                                })
                        else:
                            row_data[cell_label] = text if text else None
                            if cell.xpath('a'):
                                href = cell.xpath('a/@href')
                                row_data[cell_label + '_Url'] = href[0] if href else None

                        cell_count += 1

                    if row_data:
                        Logger.info('Discovered: %s' % row_data['Name'])

                        # Validate the justice, or add a new record for him/her
                        if not Justice.objects.filter(id=row_data['J.']):
                            self.new_justices.append(row_data['J.'])
                            justice = Justice(id=row_data['J.'],
                                              name=row_data['J.'])
                            justice.save()

                        # Convert all scraped data to uniform unicode strings
                        for label, data in row_data.items():
                            if data:
                                row_data[label] = str(data)

                        # Create new opinion record from row data
                        self.discovered_opinions.append(Opinion(
                            category=category,
                            reporter=row_data['R-'] if 'R-' in row_data else None,
                            published=self.convert_date_string(row_data['Date']),
                            docket=row_data['Docket'],
                            name=row_data['Name'],
                            pdf_url=self.BASE + row_data['Name_Url'],
                            justice=Justice(row_data['J.']),
                            part=row_data['Pt.'],
                            discovered=timezone.now(),
                        ))

                        # Create opinions for revisions, if any exist
                        for revision in revisions:
                            date_string = revision['date_string']
                            href = revision['href']
                            Logger.info('Discovered REVISION: %s' %
                                        row_data['Name'])
                            self.discovered_opinions.append(Opinion(
                                category=category,
                                reporter=row_data['R-'] if 'R-' in row_data else None,
                                published=self.convert_date_string(date_string),
                                docket=row_data['Docket'],
                                name='%s [REVISION]' % row_data['Name'],
                                pdf_url=self.BASE + href,
                                justice=Justice(row_data['J.']),
                                part=row_data['Pt.'],
                                discovered=timezone.now(),
                            ))
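convert_date_string turns the date text scraped from a cell into the value stored on published. A minimal sketch assuming the m/d/yy style used on the opinion tables (the real method may accept several formats):

    from datetime import datetime

    def convert_date_string(date_string):
        # Assumed format: dates like '6/27/24'; switch to '%m/%d/%Y'
        # if the site ships four-digit years.
        return datetime.strptime(date_string.strip(), '%m/%d/%y').date()

    print(convert_date_string('6/27/24'))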