Example #1
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_opinion_scraper.Site().parse()
        cite = Citation(
            docket_number=site.docket_numbers[0],
            neutral_cite=site.neutral_citations[0],
            federal_cite_one=site.west_citations[0]
        )
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        self.doc = Document(
            date_filed=site.case_dates[0],
            citation=cite,
            docket=docket,
            precedential_status=site.precedential_statuses[0],
        )
        self.doc.save(index=False)
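Every save() in this fixture passes index=False, which (per the pattern throughout these snippets) persists the row to the database while skipping Solr indexing, so the test index stays empty until a test populates it deliberately. The matching tearDown is not shown; a minimal sketch, assuming no cascading deletes between these models, might be:

    def tearDown(self):
        # Hypothetical cleanup mirroring setUp: remove the test objects so
        # later tests start from an empty database. The deletion order
        # follows the foreign keys (Document -> Docket / Citation).
        self.doc.delete()
        self.doc.docket.delete()
        self.doc.citation.delete()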
Example #2
    def test_solr_ingestion_and_deletion(self):
        """Do items get added to the Solr index when they are ingested?"""
        site = test_opinion_scraper.Site().parse()
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[0])  # a simple PDF
        with open(path) as f:
            content = f.read()
            cf = ContentFile(content)
            extension = get_extension(content)
        cite = Citation()
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        doc = Document(
            date_filed=site.case_dates[0],
            docket=docket,
            citation=cite,
        )
        file_name = trunc(site.case_names[0].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        response = self.si.raw_query(**{
            'q': 'supreme',
            'caller': 'scraper_test',
        }).execute()
        count = response.result.numFound
        self.assertEqual(
            count, 1,
            "There were %s items found when there should have been 1" % count)
Example #3
    def test_abort_on_changed_court_website(self):
        """Similar to the above, but we create a url2Hash with a different hash before checking if it exists."""
        site = test_opinion_scraper.Site()
        site.hash = 'this is a dummy hash code string'
        for dup_checker in self.dup_checkers:
            urlToHash(pk=site.url, SHA1=site.hash).save()
            abort = dup_checker.abort_by_url_hash(
                site.url, "this is a *different* hash!")
            if dup_checker.full_crawl:
                self.assertFalse(
                    abort, "DupChecker says to abort during a full crawl.")
            else:
                self.assertFalse(
                    abort,
                    "DupChecker says to abort on a court where the hash has changed."
                )

            dup_checker.url2Hash.delete()
Example #4
    def test_abort_on_unchanged_court_website(self):
        """Similar to the above, but we create a url2hash object before
        checking if it exists."""
        site = test_opinion_scraper.Site()
        site.hash = 'this is a dummy hash code string'
        for dup_checker in self.dup_checkers:
            urlToHash(pk=site.url, SHA1=site.hash).save()
            abort = dup_checker.abort_by_url_hash(site.url, site.hash)
            if dup_checker.full_crawl:
                self.assertFalse(
                    abort, "DupChecker says to abort during a full crawl.")
            else:
                self.assertTrue(
                    abort,
                    "DupChecker says not to abort on a court that's been "
                    "crawled before with the same hash")

            dup_checker.url2Hash.delete()
Example #5
    def test_abort_when_new_court_website(self):
        """Tests what happens when a new website is discovered."""
        site = test_opinion_scraper.Site()
        site.hash = 'this is a dummy hash code string'

        for dup_checker in self.dup_checkers:
            abort = dup_checker.abort_by_url_hash(site.url, site.hash)
            if dup_checker.full_crawl:
                self.assertFalse(
                    abort, "DupChecker says to abort during a full crawl.")
            else:
                self.assertFalse(
                    abort,
                    "DupChecker says to abort on a court that's never been "
                    "crawled before.")

            # The checking function creates urlToHash objects, which we must
            # delete as part of cleanup.
            dup_checker.url2Hash.delete()
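Examples #3 through #5 iterate over self.dup_checkers without showing how that list is built. A minimal sketch of the missing fixture, assuming a DupChecker(court, full_crawl=...) constructor that matches the full_crawl attribute the tests read, would be:

    def setUp(self):
        # Hypothetical fixture: one checker per crawl mode, so each test
        # exercises both the full-crawl and the incremental branch. The
        # DupChecker constructor signature here is an assumption.
        self.court = Court.objects.get(pk='test')
        self.dup_checkers = [
            DupChecker(self.court, full_crawl=True),
            DupChecker(self.court, full_crawl=False),
        ]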
Example #6
    def test_content_extraction(self):
        """Do all of the supported mimetypes get extracted to text
        successfully, including OCR?"""
        site = test_opinion_scraper.Site().parse()

        test_strings = [
            'supreme', 'intelligence', 'indiana', 'reagan', 'indiana',
            'fidelity'
        ]
        for i in range(0, len(site.case_names)):
            path = os.path.join(settings.INSTALL_ROOT, 'alert',
                                site.download_urls[i])
            with open(path) as f:
                content = f.read()
                cf = ContentFile(content)
                extension = get_extension(content)
            cite = Citation()
            cite.save(index=False)
            docket = Docket(
                case_name=site.case_names[i],
                court=self.court,
            )
            docket.save()
            doc = Document(
                date_filed=site.case_dates[i],
                citation=cite,
                docket=docket,
            )
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            doc.local_path.save(file_name, cf, save=False)
            doc.save(index=False)
            doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
            if extension in ['.html', '.wpd']:
                self.assertIn(test_strings[i], doc.html.lower())
            else:
                self.assertIn(test_strings[i], doc.plain_text.lower())

            doc.delete()
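The assertions above document where extraction puts its output: markup formats (.html, .wpd) land in doc.html, while everything else, including OCR text recovered from image-based PDFs, lands in doc.plain_text. A hypothetical helper (not part of the test suite) makes that mapping explicit:

def extracted_text(doc, extension):
    # Markup formats keep their markup in doc.html; all other types,
    # including OCR output, are stored in doc.plain_text.
    if extension in ['.html', '.wpd']:
        return doc.html.lower()
    return doc.plain_text.lower()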
Example #7
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')

        # Set up testing cores in Solr and swap them in
        self.core_name_opinion = '%s.opinion-test-%s' % (
            self.__module__, time.time())
        self.core_name_audio = '%s.audio-test-%s' % (
            self.__module__, time.time())
        create_solr_core(self.core_name_opinion)
        create_solr_core(
            self.core_name_audio,
            schema=os.path.join(settings.INSTALL_ROOT, 'Solr', 'conf',
                                'audio_schema.xml'),
            instance_dir='/usr/local/solr/example/solr/audio',
        )
        swap_solr_core('collection1', self.core_name_opinion)
        swap_solr_core('audio', self.core_name_audio)
        self.si_opinion = sunburnt.SolrInterface(settings.SOLR_OPINION_URL,
                                                 mode='rw')
        self.si_audio = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL,
                                               mode='rw')

        # Add three documents and two audio files to the index, but don't
        # extract their contents
        self.site_opinion = test_opinion_scraper.Site().parse()
        self.site_audio = test_oral_arg_scraper.Site().parse()
        cite_counts = (4, 6, 8)
        self.docs = {}
        for i in range(0, 3):
            cite = Citation(
                case_name=self.site_opinion.case_names[i],
                docket_number=self.site_opinion.docket_numbers[i],
                neutral_cite=self.site_opinion.neutral_citations[i],
                federal_cite_one=self.site_opinion.west_citations[i],
            )
            cite.save(index=False)
            docket = Docket(
                case_name=self.site_opinion.case_names[i],
                court=self.court,
            )
            docket.save()
            self.docs[i] = Document(
                date_filed=self.site_opinion.case_dates[i],
                citation=cite,
                docket=docket,
                precedential_status=self.site_opinion.precedential_statuses[i],
                citation_count=cite_counts[i],
                nature_of_suit=self.site_opinion.nature_of_suit[i],
                judges=self.site_opinion.judges[i],
            )
            self.docs[i].save()

        # Create citations between the documents
        # 0 ---cites--> 1, 2
        # 1 ---cites--> 2
        # 2 ---cites--> 0
        self.docs[0].cases_cited.add(self.docs[1].citation)
        self.docs[0].cases_cited.add(self.docs[2].citation)
        self.docs[1].cases_cited.add(self.docs[2].citation)
        self.docs[2].cases_cited.add(self.docs[0].citation)

        for doc in self.docs.itervalues():
            doc.save()

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        Command().scrape_court(site, full_crawl=True)

        self.expected_num_results_opinion = 3
        self.expected_num_results_audio = 2
        self.si_opinion.commit()
        self.si_audio.commit()
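Because this setUp swaps timestamped test cores in over the live 'collection1' and 'audio' cores, a matching tearDown has to swap them back and discard the test cores. A sketch, assuming a delete_solr_core counterpart to create_solr_core exists:

    def tearDown(self):
        # Hypothetical cleanup: restore the original cores, then drop the
        # per-test cores created in setUp. delete_solr_core is an assumed
        # counterpart to create_solr_core.
        swap_solr_core(self.core_name_opinion, 'collection1')
        swap_solr_core(self.core_name_audio, 'audio')
        delete_solr_core(self.core_name_opinion)
        delete_solr_core(self.core_name_audio)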
Example #8
    def test_parsing_xml_opinion_site_to_site_object(self):
        """Does a basic parse of a site reveal the right number of items?"""
        site = test_opinion_scraper.Site().parse()
        self.assertEqual(len(site.case_names), 6)
Example #9
    def test_ingest_opinions(self):
        """Can we successfully ingest opinions at a high level?"""
        site = test_opinion_scraper.Site()
        site.method = 'LOCAL'
        parsed_site = site.parse()
        OpinionCommand().scrape_court(parsed_site, full_crawl=True)
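The snippet ends without asserting anything. Since Example #8 establishes that the test site parses into six cases, a hedged follow-up check could be:

        # Hypothetical assertion, not in the original snippet: all six
        # parsed cases should now be in the database as Documents.
        self.assertEqual(Document.objects.all().count(), 6)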