def get_cell_labels(self, cells, table_headers):
    """Return the header format (list of column labels) matching this row.

    We shouldn't have to do this, but the SCOTUS site has a persistent
    problem whereby malformed tables are not uncommon. Sporadically a
    table will have the header and some rows in the COLUMN_LABELS_LEGACY
    format, and other rows in the newer COLUMN_LABELS format. I've
    contacted the court about this many times, and oftentimes they clear
    their CDN cache to resolve the problem, but it's become such a
    recurring annoyance that I felt the need to implement this
    workaround. The court hasn't provided any indication in our
    communication that they have any understanding of the problem or why
    it is happening.

    :param cells: list of lxml table-cell elements for one row
    :param table_headers: labels scraped from the table's header row
    :returns: the entry from ``self.TABLE_HEADER_FORMATS`` whose length
        matches the row's cell count and whose first label matches the
        first scraped header
    :raises RuntimeError: if the cell count matches no known format, or
        no format agrees on the first header label
    """
    count_cell = len(cells)
    count_headers = len(table_headers)
    acceptable_headers_counts = [len(f) for f in self.TABLE_HEADER_FORMATS]
    if count_cell != count_headers:
        # Malformed row: log the raw cell text so the mismatch can be
        # diagnosed later, then fall through and try to match anyway.
        cell_data = ', '.join(c.text_content() for c in cells)
        Logger.warning(
            'Cell/Header mismatch, found %d headers but %d cells: %s'
            % (count_headers, count_cell, cell_data)
        )
    if count_cell not in acceptable_headers_counts:
        raise RuntimeError(
            'Row has an unfamiliar number of cells: %d' % count_cell
        )
    for header_format in self.TABLE_HEADER_FORMATS:
        # First header element must match first element from expected
        # table_headers, since there are multiple formats with the
        # same length
        if (len(header_format) == count_cell
                and header_format[0] == table_headers[0]):
            return header_format
    raise RuntimeError('Should never reach this error')
def get_cell_labels(self, cells, table_headers):
    """Pick the header format that applies to this row.

    Workaround for the SCOTUS site's sporadically malformed tables: a
    single table can mix rows in the legacy column-label format with
    rows in the newer format, so every row has to be matched against
    the known header formats individually. Raises RuntimeError when the
    row matches no known format.
    """
    n_cells = len(cells)
    n_headers = len(table_headers)
    known_lengths = [len(fmt) for fmt in self.TABLE_HEADER_FORMATS]
    if n_cells != n_headers:
        joined = ', '.join([cell.text_content() for cell in cells])
        Logger.warning(
            'Cell/Header mismatch, found %d headers but %d cells: %s'
            % (n_headers, n_cells, joined))
    if n_cells not in known_lengths:
        raise RuntimeError('Row has an unfamiliar number of cells: %d' % n_cells)
    # Several formats can share a length, so the candidate's first label
    # must also agree with the first expected table header.
    first_expected = table_headers[0]
    match = next(
        (fmt for fmt in self.TABLE_HEADER_FORMATS
         if len(fmt) == n_cells and fmt[0] == first_expected),
        None,
    )
    if match is not None:
        return match
    raise RuntimeError('Should never reach this error')
def run(self):
    """Execute one full discovery pass.

    Scrapes the opinion category pages, ingests any new opinions, then
    scrapes and ingests citations from their PDFs. Progress is reported
    through the module Logger.
    """
    Logger.info('INITIATING DISCOVERY: %s' % timezone.now())
    Logger.info('[**%s**]' % self.OPINIONS_MAIN_PAGE)
    self.fetch_opinion_category_urls()
    self.get_opinions_from_categories()
    Logger.info('INITIATING OPINION INGEST')
    self.ingest_new_opinions()
    # Typo fix: this log line previously read 'INITIATION ...'
    Logger.info('INITIATING CITATION SCRAPING AND INGEST')
    self.ingest_new_citations()
    Logger.info('DISCOVERY COMPLETE')
def handle(self, *args, **options):
    """Management-command entry point: run discovery and email a report.

    Any exception is caught at this top-level boundary, logged with a
    full traceback, and reported by email instead of crashing the job.
    """
    try:
        print('\nRunning discovery. Logging to logs/%s.log\n'
              % time.strftime('%Y%m%d'))
        job = Discovery()
        job.run()
        job.send_email_report()
    except Exception as e:
        Logger.error(traceback.format_exc())
        # Typo fix: previously read 'FAILED DISCOVER ERROR'
        error = 'FAILED DISCOVERY ERROR: %s' % e
        self.send_error_email(error)
def ingest_new_opinions(self):
    """Persist discovered opinions that are not already in the database.

    Newly saved opinions are collected in ``self.new_opinions`` for the
    later citation-ingest step.
    """
    # Process oldest publications first
    self.discovered_opinions.sort(key=lambda op: op.published)
    for candidate in self.discovered_opinions:
        if candidate.already_exists():
            Logger.info('Skipping: %s' % candidate.name)
        else:
            Logger.info('Ingesting: %s %s' % (candidate.name, candidate.pdf_url))
            candidate.save()
            self.new_opinions.append(candidate)
def _send_email(self, subscriber, citations):
    """Render and send the new-citations notification to one subscriber."""
    # Skip entirely when email has never been configured
    if settings.EMAIL_HOST_USER == 'YOUR_GMAIL_ADDRESS':
        return
    Logger.info('+sending subscriber notification to %s' % subscriber.email)
    context = {
        'subscriber': subscriber,
        'domain': settings.ALLOWED_HOSTS[0] if settings.ALLOWED_HOSTS else False,
        'citations': citations,
        'contact_email': settings.EMAIL_HOST_USER if settings.EMAIL_HOST_USER else False
    }
    body = get_template('newly_verified_citations_email.html').render(context)
    message = EmailMultiAlternatives(
        '[scotuswebcites] New citations discovered and verified',
        body,
        settings.EMAIL_HOST_USER,
        [subscriber.email],
    )
    message.attach_alternative(body, "text/html")
    message.send()
def send_email_report(self):
    """Email a discovery summary to the configured contact address.

    Sent only when email is configured and this run actually produced
    something new (opinions, justices, citations, or failed scrapes).
    """
    if settings.EMAIL_HOST_USER == 'YOUR_GMAIL_ADDRESS':
        return
    anything_new = (self.new_opinions or self.new_justices
                    or self.ingested_citations_count or self.failed_scrapes)
    if not anything_new:
        return
    sender = settings.EMAIL_HOST_USER
    recipient = settings.CONTACT_EMAIL
    context = {
        'new_opinions_count': str(len(self.new_opinions)),
        'ingested_citations_count': str(self.ingested_citations_count),
        'new_justices': self.new_justices,
        'failed_scrapes': self.failed_scrapes,
        'domain': settings.ALLOWED_HOSTS[0] if settings.ALLOWED_HOSTS else False,
    }
    body = get_template('discovery_report_email.html').render(context)
    Logger.info('+sending discovery report email from %s to %s' % (sender, recipient))
    message = EmailMultiAlternatives(
        '[scotuswebcites] New Data Discovered', body, sender, [recipient])
    message.attach_alternative(body, "text/html")
    message.send()
def _send_email(self, subscriber, citations):
    """Notify one subscriber of newly verified citations via email."""
    if settings.EMAIL_HOST_USER == 'YOUR_GMAIL_ADDRESS':
        return  # email never configured; nothing to do
    Logger.info('+sending subscriber notification to %s' % subscriber.email)
    context = {
        'subscriber': subscriber,
        'domain': settings.ALLOWED_HOSTS[0] if settings.ALLOWED_HOSTS else False,
        'citations': citations,
        'contact_email': settings.SENDER_EMAIL if settings.SENDER_EMAIL else False
    }
    rendered = get_template('newly_verified_citations_email.html').render(context)
    send_email(
        '[scotuswebcites] New citations discovered and verified',
        rendered,
        subscriber.email,
    )
def get(cls, url=False, err=True):
    """Fetch *url* with the class's headers and timeout.

    Returns the requests response on success, or False when *url* is
    falsy or the request raises. Failures are logged unless err=False.
    """
    if url:
        # Throttle: pause between successive requests
        sleep(cls.SLEEP)
        if not urlparse(url).scheme:
            url = 'http://' + url  # default to plain http
        try:
            return requests.get(url, headers=cls.HEADERS, timeout=cls.TIMEOUT)
        except Exception:
            pass  # fall through to the failure path below
    if err:
        Logger.error('Fetching failed for: %s' % url)
    return False
def ingest_citations(self):
    """Create Citation records for URLs scraped from this opinion's PDF.

    URLs already seen in a previous publication of this opinion are
    skipped; the number actually ingested is tracked in
    ``self.ingested_citation_count``.
    """
    self.ingested_citation_count = 0
    for scraped_url in self.pdf.urls:
        if scraped_url in self.previous_publication_citations:
            Logger.info(
                '--Skipping previously discovered citation for %s: %s'
                % (self.name, scraped_url))
            continue
        Logger.info('++Ingesting citation: %s' % scraped_url)
        citation = Citation(
            opinion=Opinion(self.id),
            scraped=scraped_url,
        )
        citation.yyyymmdd = self.published.strftime("%Y%m%d")
        citation.get_statuses()
        citation.save()
        self.ingested_citation_count += 1
def send_email_report(self):
    """Send the discovery summary email when this run found new data."""
    if settings.EMAIL_HOST_USER == 'YOUR_GMAIL_ADDRESS':
        return
    # Only report when something actually changed
    if not (self.new_opinions or self.new_justices
            or self.ingested_citations_count or self.failed_scrapes):
        return
    context = {
        'new_opinions_count': str(len(self.new_opinions)),
        'ingested_citations_count': str(self.ingested_citations_count),
        'new_justices': self.new_justices,
        'failed_scrapes': self.failed_scrapes,
        'domain': settings.ALLOWED_HOSTS[0] if settings.ALLOWED_HOSTS else False,
    }
    rendered = get_template('discovery_report_email.html').render(context)
    Logger.info('+sending discovery report email from %s to %s'
                % (settings.SENDER_EMAIL, settings.CONTACT_EMAIL))
    send_email('[scotuswebcites] New Data Discovered', rendered,
               settings.CONTACT_EMAIL)
def get(cls, url=False, err=True):
    """Politely fetch *url*; return the response, or False on failure.

    A scheme-less URL is prefixed with 'http://'. Every request is
    preceded by a sleep so the remote site is not hammered.
    """
    if url:
        sleep(cls.SLEEP)  # rate-limit between requests
        parsed = urlparse(url)
        if not parsed.scheme:
            url = 'http://' + url
        try:
            return requests.get(
                url,
                headers=cls.HEADERS,
                timeout=cls.TIMEOUT,
            )
        except Exception:
            # Swallow the error; failure is reported uniformly below
            pass
    if err:
        Logger.error('Fetching failed for: %s' % url)
    return False
def ingest_new_citations(self):
    """Download, scrape, and ingest citations for each newly saved opinion.

    A failed scrape is logged with its traceback, recorded in
    ``self.failed_scrapes``, and that opinion is skipped entirely.
    """
    for opinion in self.new_opinions:
        Logger.info('Downloading: %s %s' % (opinion.name, opinion.pdf_url))
        opinion.download()
        Logger.info('Scraping: %s %s' % (opinion.name, opinion.local_pdf))
        try:
            opinion.scrape()
        except Exception:
            # Was a bare 'except:', which also traps SystemExit and
            # KeyboardInterrupt. Also skip the failed opinion so we don't
            # read a possibly half-populated opinion.pdf below.
            Logger.error(traceback.format_exc())
            self.failed_scrapes.append(opinion.name)
            continue
        if opinion.pdf.urls:
            Logger.info('Ingesting citations from %s' % opinion.name)
            opinion.ingest_citations()
            self.ingested_citations_count += opinion.ingested_citation_count
def handle(self, *args, **options):
    """Notify subscribers of newly verified citations, then mark them sent."""
    Logger.info("RUNNING NOTIFY SUBSCRIBERS FUNCTION: %s" % timezone.now())
    newly_verified_citations = Citation.objects.filter(
        validated__isnull=False,
        notified_subscribers__isnull=True,
    )
    if not newly_verified_citations:
        return
    Logger.info('Found %d newly verified citations'
                % len(newly_verified_citations))
    # Fan the batch out to every active subscriber
    subscribers = Subscriber.objects.filter(subscribed=True)
    if subscribers:
        Logger.info('Emailing newly verified citations to %d subscribers'
                    % len(subscribers))
        for subscriber in subscribers:
            self._send_email(subscriber, newly_verified_citations)
    # Stamp each citation so subscribers are only notified once
    for citation in newly_verified_citations:
        citation.notified_subscribers = timezone.now()
        citation.save()
def handle(self, *args, **options):
    """Email subscribers about citations verified since the last run."""
    Logger.info("RUNNING NOTIFY SUBSCRIBERS FUNCTION: %s" % timezone.now())
    # Citations that have been validated but never announced
    pending = Citation.objects.filter(
        validated__isnull=False,
        notified_subscribers__isnull=True,
    )
    if pending:
        Logger.info('Found %d newly verified citations' % len(pending))
        recipients = Subscriber.objects.filter(subscribed=True)
        if recipients:
            Logger.info(
                'Emailing newly verified citations to %d subscribers'
                % len(recipients))
            for recipient in recipients:
                self._send_email(recipient, pending)
        # Record the notification time so these are never re-sent
        for citation in pending:
            citation.notified_subscribers = timezone.now()
            citation.save()
def get_opinions_from_categories(self):
    """Scrape each category page and build Opinion objects from its table.

    Populates ``self.discovered_opinions`` (and ``self.new_justices``
    for any justice id not already in the database). A row's 'Revised'
    cell may carry several links; each produces an extra Opinion record
    marked '[REVISION]'.
    """
    for category_url in self.category_urls:
        # Category slug is the second-to-last path component of the URL
        category = category_url.split('/')[-2]
        request = Url.get(category_url)
        if request and request.status_code == 200:
            Logger.info('EXTRACTING OPINIONS FROM %s' % category_url)
            html = lxml.html.fromstring(request.text)
            if not html.xpath(self.PATH_TABLES):
                # It's a new term with an empty page, no table yet
                Logger.info('SKIPPING BLANK PAGE: %s' % category_url)
                continue
            table_headers = self.get_table_headers(html)
            for row in html.xpath(self.PATH_TABLE_ROWS):
                cells = row.xpath('td')
                if not cells:
                    continue
                revisions = []
                row_data = {}
                cell_count = 0
                # Row may be in either header format; resolve per row
                cell_labels = self.get_cell_labels(cells, table_headers)
                # Parse data from rows in table
                for cell in cells:
                    cell_label = cell_labels[cell_count]
                    text = cell.text_content().strip()
                    # Skip rows with empty first cell, these
                    # can appear at the start of a new cycle
                    # when scotus adds new date pages that
                    # do not yet have records
                    if cell_count == 0 and not text:
                        break
                    if cell_label == 'Revised':
                        # Revised cells can have multiple links
                        # so we must have special handling for it
                        for anchor in cell.xpath('a'):
                            revisions.append({
                                'href': anchor.xpath('@href')[0],
                                'date_string': anchor.text_content(),
                            })
                    else:
                        row_data[cell_label] = text if text else None
                        if cell.xpath('a'):
                            # Keep the cell's link under a parallel
                            # '<label>_Url' key
                            href = cell.xpath('a/@href')
                            row_data[cell_label + '_Url'] = href[0] if href else None
                    cell_count += 1
                if row_data:
                    Logger.info('Discovered: %s' % row_data['Name'])
                    # Validate the justice, or add new record for him/her
                    if not Justice.objects.filter(id=row_data['J.']):
                        self.new_justices.append(row_data['J.'])
                        justice = Justice(id=row_data['J.'], name=row_data['J.'])
                        justice.save()
                    # Convert all scraped data to uniform unicode string
                    for label, data in row_data.items():
                        if data:
                            row_data[label] = str(data)
                    # Create new opinion record from row data
                    self.discovered_opinions.append(Opinion(
                        category=category,
                        reporter=row_data['R-'] if 'R-' in row_data else None,
                        published=self.convert_date_string(row_data['Date']),
                        docket=row_data['Docket'],
                        name=row_data['Name'],
                        pdf_url=self.BASE + row_data['Name_Url'],
                        justice=Justice(row_data['J.']),
                        part=row_data['Pt.'],
                        discovered=timezone.now(),
                    ))
                    # Create opinions for revision, if it exists
                    for revision in revisions:
                        date_string = revision['date_string']
                        href = revision['href']
                        Logger.info('Discovered REVISION: %s' % row_data['Name'])
                        self.discovered_opinions.append(Opinion(
                            category=category,
                            reporter=row_data['R-'] if 'R-' in row_data else None,
                            published=self.convert_date_string(date_string),
                            docket=row_data['Docket'],
                            name='%s [REVISION]' % row_data['Name'],
                            pdf_url=self.BASE + href,
                            justice=Justice(row_data['J.']),
                            part=row_data['Pt.'],
                            discovered=timezone.now(),
                        ))
def get_opinions_from_categories(self):
    """Extract Opinion records from every discovered category page.

    Walks each category's HTML table row by row, accumulating results in
    ``self.discovered_opinions``. Unknown justice ids are inserted and
    tracked in ``self.new_justices``; each link in a 'Revised' cell
    yields an additional '[REVISION]' Opinion.
    """
    for category_url in self.category_urls:
        # Second-to-last URL path segment names the category
        category = category_url.split('/')[-2]
        request = Url.get(category_url)
        if request and request.status_code == 200:
            Logger.info('EXTRACTING OPINIONS FROM %s' % category_url)
            html = lxml.html.fromstring(request.text)
            if not html.xpath(self.PATH_TABLES):
                # It's a new term with an empty page, no table yet
                Logger.info('SKIPPING BLANK PAGE: %s' % category_url)
                continue
            table_headers = self.get_table_headers(html)
            for row in html.xpath(self.PATH_TABLE_ROWS):
                cells = row.xpath('td')
                if not cells:
                    continue
                revisions = []
                row_data = {}
                cell_count = 0
                # The site mixes header formats within one table, so the
                # applicable labels are resolved per row
                cell_labels = self.get_cell_labels(cells, table_headers)
                # Parse data from rows in table
                for cell in cells:
                    cell_label = cell_labels[cell_count]
                    text = cell.text_content().strip()
                    # Skip rows with empty first cell, these
                    # can appear at the start of a new cycle
                    # when scotus adds new date pages that
                    # do not yet have records
                    if cell_count == 0 and not text:
                        break
                    if cell_label == 'Revised':
                        # Revised cells can have multiple links
                        # so we must have special handling for it
                        for anchor in cell.xpath('a'):
                            revisions.append({
                                'href': anchor.xpath('@href')[0],
                                'date_string': anchor.text_content(),
                            })
                    else:
                        row_data[cell_label] = text if text else None
                        if cell.xpath('a'):
                            # Any link in the cell is stored under a
                            # parallel '<label>_Url' key
                            href = cell.xpath('a/@href')
                            row_data[cell_label + '_Url'] = href[0] if href else None
                    cell_count += 1
                if row_data:
                    Logger.info('Discovered: %s' % row_data['Name'])
                    # Validate the justice, or add new record for him/her
                    if not Justice.objects.filter(id=row_data['J.']):
                        self.new_justices.append(row_data['J.'])
                        justice = Justice(id=row_data['J.'], name=row_data['J.'])
                        justice.save()
                    # Convert all scraped data to uniform unicode string
                    for label, data in row_data.items():
                        if data:
                            row_data[label] = str(data)
                    # Create new opinion record from row data
                    self.discovered_opinions.append(
                        Opinion(
                            category=category,
                            reporter=row_data['R-'] if 'R-' in row_data else None,
                            published=self.convert_date_string(
                                row_data['Date']),
                            docket=row_data['Docket'],
                            name=row_data['Name'],
                            pdf_url=self.BASE + row_data['Name_Url'],
                            justice=Justice(row_data['J.']),
                            part=row_data['Pt.'],
                            discovered=timezone.now(),
                        ))
                    # Create opinions for revision, if it exists
                    for revision in revisions:
                        date_string = revision['date_string']
                        href = revision['href']
                        Logger.info('Discovered REVISION: %s' % row_data['Name'])
                        self.discovered_opinions.append(
                            Opinion(
                                category=category,
                                reporter=row_data['R-'] if 'R-' in row_data else None,
                                published=self.convert_date_string(
                                    date_string),
                                docket=row_data['Docket'],
                                name='%s [REVISION]' % row_data['Name'],
                                pdf_url=self.BASE + href,
                                justice=Justice(row_data['J.']),
                                part=row_data['Pt.'],
                                discovered=timezone.now(),
                            ))