Example #1
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_opinion_scraper.Site().parse()
        cite = Citation(
            neutral_cite=site.neutral_citations[0],
            federal_cite_one=site.west_citations[0]
        )
        cite.save(index=False)
        docket = Docket(
            docket_number=site.docket_numbers[0],
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        self.doc = Document(
            date_filed=site.case_dates[0],
            citation=cite,
            docket=docket,
            precedential_status=site.precedential_statuses[0],
        )
        self.doc.save(index=False)
Example #2
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_opinion_scraper.Site().parse()
        cite = Citation(
            docket_number=site.docket_numbers[0],
            neutral_cite=site.neutral_citations[0],
            federal_cite_one=site.west_citations[0]
        )
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        self.doc = Document(
            date_filed=site.case_dates[0],
            citation=cite,
            docket=docket,
            precedential_status=site.precedential_statuses[0],
        )
        self.doc.save(index=False)
Example #3
 def test_solr_ingestion_and_deletion(self):
     """Do items get added to the Solr index when they are ingested?"""
     site = test_opinion_scraper.Site().parse()
     path = os.path.join(settings.INSTALL_ROOT, 'alert', site.download_urls[0])  # a simple PDF
     with open(path) as f:
         content = f.read()
         cf = ContentFile(content)
         extension = get_extension(content)
     cite = Citation()
     cite.save(index=False)
     docket = Docket(
         court=self.court,
         case_name=site.case_names[0],
     )
     docket.save()
     doc = Document(
         date_filed=site.case_dates[0],
         docket=docket,
         citation=cite,
     )
     file_name = trunc(site.case_names[0].lower(), 75) + extension
     doc.local_path.save(file_name, cf, save=False)
     doc.save(index=False)
     extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
     response = self.si.raw_query(**{'q': 'supreme', 'caller': 'scraper_test',}).execute()
     count = response.result.numFound
     self.assertEqual(count, 1, "There were %s items found when there should have been 1" % count)
Example #4
 def test_solr_ingestion_and_deletion(self):
     """Do items get added to the Solr index when they are ingested?"""
     site = test_opinion_scraper.Site().parse()
     path = os.path.join(settings.INSTALL_ROOT, 'alert',
                         site.download_urls[0])  # a simple PDF
     with open(path) as f:
         content = f.read()
         cf = ContentFile(content)
         extension = get_extension(content)
     cite = Citation()
     cite.save(index=False)
     docket = Docket(
         court=self.court,
         case_name=site.case_names[0],
     )
     docket.save()
     doc = Document(
         date_filed=site.case_dates[0],
         docket=docket,
         citation=cite,
     )
     file_name = trunc(site.case_names[0].lower(), 75) + extension
     doc.local_path.save(file_name, cf, save=False)
     doc.save(index=False)
     extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
     response = self.si.raw_query(**{
         'q': 'supreme',
         'caller': 'scraper_test',
     }).execute()
     count = response.result.numFound
     self.assertEqual(
         count, 1,
         "There were %s items found when there should have been 1" % count)
Example #5
 def test_should_we_continue_break_or_carry_on_with_dup_found_and_older_date(
         self):
     content = "this is dummy content that we hash"
     content_hash = hashlib.sha1(content).hexdigest()
     for dup_checker in self.dup_checkers:
         docket = Docket(court=self.court)
         docket.save()
         doc = Document(sha1=content_hash, docket=docket)
         doc.save(index=False)
         # Note that the next case occurs prior to the current one
         onwards = dup_checker.should_we_continue_break_or_carry_on(
             Document,
             now(),
             now() - timedelta(days=1),
             lookup_value=content_hash,
             lookup_by='sha1')
         if dup_checker.full_crawl:
             self.assertEqual(
                 onwards, 'CONTINUE',
                 'DupChecker says to %s during a full crawl.' % onwards)
         else:
             self.assertEqual(
                 onwards, 'BREAK',
                 "DupChecker says to %s but there should be a duplicate in "
                 "the database. dup_count is %s, and dup_threshold is %s" %
                 (onwards, dup_checker.dup_count,
                  dup_checker.dup_threshold))
         doc.delete()
Example #6
    def test_save_old_opinion(self):
        """Can we save opinions older than 1900?"""
        court = Court.objects.get(pk='test')

        cite = Citation(case_name=u"Blah")
        cite.save(index=True)
        docket = Docket(
            case_name=u"Blah",
            court=court,
        )
        docket.save()
        d = Document(
            citation=cite,
            docket=docket,
            date_filed=datetime.date(1899, 1, 1),
        )

        try:
            cf = ContentFile(StringIO.StringIO('blah').read())
            d.local_path.save('file_name.pdf', cf, save=False)
            d.save(index=True)
        except ValueError:
            raise ValueError("Unable to save a case older than 1900. Did you "
                             "try to use `strftime`...again?")
Example #7
    def test_should_we_continue_break_or_carry_on_with_a_dup_found(self):
        # Set the dup_threshold to zero for this test
        self.dup_checkers = [
            DupChecker(self.court, full_crawl=True, dup_threshold=0),
            DupChecker(self.court, full_crawl=False, dup_threshold=0)
        ]
        content = "this is dummy content that we hash"
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            # Create a document, then use the dup_checker to see if it exists.
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document,
                now(),
                now(),
                lookup_value=content_hash,
                lookup_by='sha1')
            if dup_checker.full_crawl:
                self.assertEqual(
                    onwards, 'CONTINUE',
                    'DupChecker says to %s during a full crawl.' % onwards)

            else:
                self.assertEqual(
                    onwards, 'BREAK',
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s" %
                    (onwards, dup_checker.dup_count,
                     dup_checker.dup_threshold))

            doc.delete()
Example #8
    def test_content_extraction(self):
        """Do all of the supported mimetypes get extracted to text
        successfully, including OCR?"""
        site = test_opinion_scraper.Site().parse()

        test_strings = ["supreme", "intelligence", "indiana", "reagan", "indiana", "fidelity"]
        for i in range(0, len(site.case_names)):
            path = os.path.join(settings.INSTALL_ROOT, "alert", site.download_urls[i])
            with open(path) as f:
                content = f.read()
                cf = ContentFile(content)
                extension = get_extension(content)
            cite = Citation()
            cite.save(index=False)
            docket = Docket(case_name=site.case_names[i], court=self.court)
            docket.save()
            doc = Document(date_filed=site.case_dates[i], citation=cite, docket=docket)
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            doc.local_path.save(file_name, cf, save=False)
            doc.save(index=False)
            doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
            if extension in [".html", ".wpd"]:
                self.assertIn(test_strings[i], doc.html.lower())
            else:
                self.assertIn(test_strings[i], doc.plain_text.lower())

            doc.delete()
Example #9
    def test_should_we_continue_break_or_carry_on_with_a_dup_found(self):
        # Set the dup_threshold to zero for this test
        self.dup_checkers = [
            DupChecker(self.court, full_crawl=True, dup_threshold=0),
            DupChecker(self.court, full_crawl=False, dup_threshold=0),
        ]
        content = "this is dummy content that we hash"
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            # Create a document, then use the dup_checker to see if it exists.
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document, now(), now(), lookup_value=content_hash, lookup_by="sha1"
            )
            if dup_checker.full_crawl:
                self.assertEqual(onwards, "CONTINUE", "DupChecker says to %s during a full crawl." % onwards)

            else:
                self.assertEqual(
                    onwards,
                    "BREAK",
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s"
                    % (onwards, dup_checker.dup_count, dup_checker.dup_threshold),
                )

            doc.delete()
Example #10
 def test_should_we_continue_break_or_carry_on_with_dup_found_and_older_date(self):
     content = "this is dummy content that we hash"
     content_hash = hashlib.sha1(content).hexdigest()
     for dup_checker in self.dup_checkers:
         docket = Docket(court=self.court)
         docket.save()
         doc = Document(sha1=content_hash, docket=docket)
         doc.save(index=False)
         # Note that the next case occurs prior to the current one
         onwards = dup_checker.should_we_continue_break_or_carry_on(
             Document,
             now(),
             now() - timedelta(days=1),
             lookup_value=content_hash,
             lookup_by='sha1'
         )
         if dup_checker.full_crawl:
             self.assertEqual(
                 onwards,
                 'CONTINUE',
                 'DupChecker says to %s during a full crawl.' % onwards)
         else:
             self.assertEqual(
                 onwards,
                 'BREAK',
                 "DupChecker says to %s but there should be a duplicate in "
                 "the database. dup_count is %s, and dup_threshold is %s" %
                 (onwards, dup_checker.dup_count, dup_checker.dup_threshold)
             )
         doc.delete()
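
The DupChecker variants above only exercise the full-crawl and duplicate-found branches. The method name implies a third outcome for fresh content; a hedged sketch of that branch, written as if inside the loop above and assuming the return value is the string 'CARRY_ON' (a guess from the method name, not shown in these examples):

      fresh_hash = hashlib.sha1("content that has never been seen").hexdigest()
      onwards = dup_checker.should_we_continue_break_or_carry_on(
          Document,
          now(),
          now(),
          lookup_value=fresh_hash,
          lookup_by='sha1')
      # 'CARRY_ON' is an assumption based on the method name.
      self.assertEqual(
          onwards, 'CARRY_ON',
          'DupChecker says to %s for novel content.' % onwards)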
Example #11
    def test_citation_matching(self):
        """Creates a few documents that contain specific citations, then
        attempts to find and match those citations.

        This becomes a bit of an integration test, which is likely fine.
        """
        # Set up a document
        c1 = models.Citation(federal_cite_one=u'1 Yeates 1 (test 1795)')
        c1.save(index=False)
        docket1 = Docket(
            case_name=u"Lissner v. Saad",
            court=self.court,
        )
        docket1.save()
        d1 = models.Document(
            date_filed=date(1795, 6, 9),
            citation=c1,
            docket=docket1,
            precedential_status='Published',
        )
        d1.save(index=True)
        # Reference d1 from the text of another document
        c2 = models.Citation()
        c2.save(index=False)
        docket2 = Docket(
            case_name=u"Reference to Lissner v. Saad",
            court=self.court,
        )
        docket2.save()
        d2 = models.Document(
            date_filed=date(1982, 6, 9),
            docket=docket2,
            citation=c2,
            plain_text=u"1 Yeates 1"
        )
        d2.save(index=True)

        # Do a commit, or else citations can't be found in the index.
        self.si_opinion.commit()
        update_document(d2)  # Updates d1's citation count in a Celery task
        d1 = models.Document.objects.get(pk=1)  # cache-bust d1

        self.assertEqual(
            d1.citation_count,
            1,
            msg=u"d1 was not updated by a citation found in d2. Count was: %s"
                % d1.citation_count
        )
        d1.delete()
        d2.delete()
Example #12
    def setUp(self):
        c1 = Citation(case_name=u"foo")
        c1.save(index=False)
        docket = Docket(
            case_name=u'foo',
            court=Court.objects.get(pk='test'),
        )
        docket.save()
        # Must be more than a year old for all tests to be runnable.
        last_month = now().date() - timedelta(days=400)
        self.doc = Document(citation=c1, docket=docket, date_filed=last_month)
        self.doc.save(index=False)

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        OralArgumentCommand().scrape_court(site, full_crawl=True)
Example #13
    def test_citation_matching(self):
        """Creates a few documents that contain specific citations, then
        attempts to find and match those citations.

        This becomes a bit of an integration test, which is likely fine.
        """
        # Set up a document
        c1 = models.Citation(federal_cite_one=u'1 Yeates 1 (test 1795)')
        c1.save(index=False)
        docket1 = Docket(
            case_name=u"Lissner v. Saad",
            court=self.court,
        )
        docket1.save()
        d1 = models.Document(
            date_filed=date(1795, 6, 9),
            citation=c1,
            docket=docket1,
            precedential_status='Published',
        )
        d1.save(index=True)
        # Reference d1 from the text of another document
        c2 = models.Citation()
        c2.save(index=False)
        docket2 = Docket(
            case_name=u"Reference to Lissner v. Saad",
            court=self.court,
        )
        docket2.save()
        d2 = models.Document(date_filed=date(1982, 6, 9),
                             docket=docket2,
                             citation=c2,
                             plain_text=u"1 Yeates 1")
        d2.save(index=True)

        # Do a commit, or else citations can't be found in the index.
        self.si_opinion.commit()
        update_document(d2)  # Updates d1's citation count in a Celery task
        d1 = models.Document.objects.get(pk=1)  # cache-bust d1

        self.assertEqual(
            d1.citation_count,
            1,
            msg=u"d1 was not updated by a citation found in d2. Count was: %s"
            % d1.citation_count)
        d1.delete()
        d2.delete()
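
If update_document also records the citing relation on d2 (other examples in this list use a cases_cited relation alongside citation_count), one further hedged assertion could be added before the deletes; whether update_document fills cases_cited is an assumption here:

        # Hypothetical extra check: d2 should now list d1's citation among
        # the cases it cites (assumes update_document fills cases_cited).
        self.assertIn(c1, d2.cases_cited.all())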
Example #14
    def setUp(self):
        c1 = Citation(case_name=u"foo")
        c1.save(index=False)
        docket = Docket(
            case_name=u'foo',
            court=Court.objects.get(pk='test'),
        )
        docket.save()
        # Must be more than a year old for all tests to be runnable.
        last_month = now().date() - timedelta(days=400)
        self.doc = Document(
            citation=c1,
            docket=docket,
            date_filed=last_month
        )
        self.doc.save(index=False)

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        OralArgumentCommand().scrape_court(site, full_crawl=True)
Example #15
    def test_updating_the_docket_when_the_citation_case_name_changes(self):
        """Makes sure that the docket changes when the citation does."""
        court = Court.objects.get(pk='test')

        original_case_name = u'original case name'
        new_case_name = u'new case name'
        cite = Citation(case_name=original_case_name)
        cite.save(index=False)
        docket = Docket(
            case_name=original_case_name,
            court=court,
        )
        docket.save()
        Document(
            citation=cite,
            docket=docket,
        ).save(index=False)
        cite.case_name = new_case_name
        cite.save(index=False)
        changed_docket = Docket.objects.get(pk=docket.pk)
        self.assertEqual(changed_docket.case_name, new_case_name)
Example #16
    def test_content_extraction(self):
        """Do all of the supported mimetypes get extracted to text
        successfully, including OCR?"""
        site = test_opinion_scraper.Site().parse()

        test_strings = [
            'supreme', 'intelligence', 'indiana', 'reagan', 'indiana',
            'fidelity'
        ]
        for i in range(0, len(site.case_names)):
            path = os.path.join(settings.INSTALL_ROOT, 'alert',
                                site.download_urls[i])
            with open(path) as f:
                content = f.read()
                cf = ContentFile(content)
                extension = get_extension(content)
            cite = Citation()
            cite.save(index=False)
            docket = Docket(
                case_name=site.case_names[i],
                court=self.court,
            )
            docket.save()
            doc = Document(
                date_filed=site.case_dates[i],
                citation=cite,
                docket=docket,
            )
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            doc.local_path.save(file_name, cf, save=False)
            doc.save(index=False)
            doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
            if extension in ['.html', '.wpd']:
                self.assertIn(test_strings[i], doc.html.lower())
            else:
                self.assertIn(test_strings[i], doc.plain_text.lower())

            doc.delete()
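
get_extension() evidently sniffs the file type from the raw bytes rather than from the URL. The project's implementation isn't shown in these examples; a minimal sketch of the idea using the python-magic package (an assumption, not necessarily what the project does):

import magic      # python-magic, assumed available
import mimetypes

def get_extension(content):
    # Sniff the MIME type from the raw bytes, then map it to an extension.
    mime = magic.from_buffer(content, mime=True)
    return mimetypes.guess_extension(mime)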
Example #17
    def setUp(self):
        self.court = Court.objects.get(pk='test')

        # create 3 documents with their citations and dockets
        c1, c2, c3 = Citation(case_name=u"c1"), Citation(
            case_name=u"c2"), Citation(case_name=u"c3")
        c1.save(index=False)
        c2.save(index=False)
        c3.save(index=False)
        docket1 = Docket(
            case_name=u"c1",
            court=self.court,
        )
        docket2 = Docket(
            case_name=u"c2",
            court=self.court,
        )
        docket3 = Docket(
            case_name=u"c3",
            court=self.court,
        )
        docket1.save()
        docket2.save()
        docket3.save()
        d1, d2, d3 = Document(date_filed=date.today()), Document(
            date_filed=date.today()), Document(date_filed=date.today())
        d1.citation, d2.citation, d3.citation = c1, c2, c3
        d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
        doc_list = [d1, d2, d3]
        for d in doc_list:
            d.citation.save(index=False)
            d.save(index=False)

        # create simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1;
        d1.cases_cited.add(d2.citation)
        d2.citation_count += 1
        d2.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d3.cases_cited.add(d1.citation)
        d1.citation_count += 1
        d1.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d1.save(index=False)
        d2.save(index=False)
        d3.save(index=False)
Example #18
    def setUp(self):
        self.court = Court.objects.get(pk='test')

        # create 3 documents with their citations and dockets
        c1, c2, c3 = Citation(case_name=u"c1"), Citation(
            case_name=u"c2"), Citation(case_name=u"c3")
        c1.save(index=False)
        c2.save(index=False)
        c3.save(index=False)
        docket1 = Docket(
            case_name=u"c1",
            court=self.court,
        )
        docket2 = Docket(
            case_name=u"c2",
            court=self.court,
        )
        docket3 = Docket(
            case_name=u"c3",
            court=self.court,
        )
        docket1.save()
        docket2.save()
        docket3.save()
        d1, d2, d3 = Document(date_filed=date.today()), Document(
            date_filed=date.today()), Document(date_filed=date.today())
        d1.citation, d2.citation, d3.citation = c1, c2, c3
        d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
        doc_list = [d1, d2, d3]
        for d in doc_list:
            d.citation.save(index=False)
            d.save(index=False)

        # create simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1;
        d1.cases_cited.add(d2.citation)
        d2.citation_count += 1
        d2.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d3.cases_cited.add(d1.citation)
        d1.citation_count += 1
        d1.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d1.save(index=False)
        d2.save(index=False)
        d3.save(index=False)
Example #19
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')

        # Set up testing cores in Solr and swap them in
        self.core_name_opinion = '%s.opinion-test-%s' % \
                                 (self.__module__, time.time())
        self.core_name_audio = '%s.audio-test-%s' % \
                               (self.__module__, time.time())
        create_solr_core(self.core_name_opinion)
        create_solr_core(
            self.core_name_audio,
            schema=os.path.join(settings.INSTALL_ROOT, 'Solr', 'conf',
                                'audio_schema.xml'),
            instance_dir='/usr/local/solr/example/solr/audio',
        )
        swap_solr_core('collection1', self.core_name_opinion)
        swap_solr_core('audio', self.core_name_audio)
        self.si_opinion = sunburnt.SolrInterface(
            settings.SOLR_OPINION_URL, mode='rw')
        self.si_audio = sunburnt.SolrInterface(
            settings.SOLR_AUDIO_URL, mode='rw')

        # Add three documents and three audio files to the index, but don't
        # extract their contents
        self.site_opinion = test_opinion_scraper.Site().parse()
        self.site_audio = test_oral_arg_scraper.Site().parse()
        cite_counts = (4, 6, 8)
        self.docs = {}
        for i in range(0, 3):
            cite = Citation(
                case_name=self.site_opinion.case_names[i],
                docket_number=self.site_opinion.docket_numbers[i],
                neutral_cite=self.site_opinion.neutral_citations[i],
                federal_cite_one=self.site_opinion.west_citations[i],
            )
            cite.save(index=False)
            docket = Docket(
                case_name=self.site_opinion.case_names[i],
                court=self.court,
            )
            docket.save()
            self.docs[i] = Document(
                date_filed=self.site_opinion.case_dates[i],
                citation=cite,
                docket=docket,
                precedential_status=self.site_opinion.precedential_statuses[i],
                citation_count=cite_counts[i],
                nature_of_suit=self.site_opinion.nature_of_suit[i],
                judges=self.site_opinion.judges[i],
            )
            self.docs[i].save()

        # Create citations between the documents
        # 0 ---cites--> 1, 2
        # 1 ---cites--> 2
        # 2 ---cites--> 0
        self.docs[0].cases_cited.add(self.docs[1].citation)
        self.docs[0].cases_cited.add(self.docs[2].citation)
        self.docs[1].cases_cited.add(self.docs[2].citation)
        self.docs[2].cases_cited.add(self.docs[0].citation)

        for doc in self.docs.itervalues():
            doc.save()

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        Command().scrape_court(site, full_crawl=True)

        self.expected_num_results_opinion = 3
        self.expected_num_results_audio = 2
        self.si_opinion.commit()
        self.si_audio.commit()
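
Since this setUp swaps temporary test cores into place, the matching tearDown presumably has to swap the live cores back. A minimal sketch, reusing only swap_solr_core from above and assuming its arguments are (old core, new core); cleanup of the test cores themselves is omitted because no API for it appears in these examples:

    def tearDown(self):
        # Swap the production cores back in. Deleting the temporary test
        # cores is left out; the call for that isn't shown above.
        swap_solr_core(self.core_name_opinion, 'collection1')
        swap_solr_core(self.core_name_audio, 'audio')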
Example #20
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(
        vol_tree, case_location_relative)
    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
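
Per the docstring, case_location can be a local filesystem path or any URL the requests library can fetch, so both of these hypothetical calls should work (the paths are invented for illustration):

# Hypothetical usage; both locations are illustrative only.
doc = import_resource_org_item('/var/data/resource_org/F2/001/case.html')
doc = import_resource_org_item('https://example.com/F2/001/case.html')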
Example #21
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')

        # Set up testing cores in Solr and swap them in
        self.core_name_opinion = '%s.opinion-test-%s' % \
                                 (self.__module__, time.time())
        self.core_name_audio = '%s.audio-test-%s' % \
                               (self.__module__, time.time())
        create_solr_core(self.core_name_opinion)
        create_solr_core(
            self.core_name_audio,
            schema=os.path.join(settings.INSTALL_ROOT, 'Solr', 'conf',
                                'audio_schema.xml'),
            instance_dir='/usr/local/solr/example/solr/audio',
        )
        swap_solr_core('collection1', self.core_name_opinion)
        swap_solr_core('audio', self.core_name_audio)
        self.si_opinion = sunburnt.SolrInterface(settings.SOLR_OPINION_URL,
                                                 mode='rw')
        self.si_audio = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL,
                                               mode='rw')

        # Add three documents and three audio files to the index, but don't
        # extract their contents
        self.site_opinion = test_opinion_scraper.Site().parse()
        self.site_audio = test_oral_arg_scraper.Site().parse()
        cite_counts = (4, 6, 8)
        self.docs = {}
        for i in range(0, 3):
            cite = Citation(
                case_name=self.site_opinion.case_names[i],
                docket_number=self.site_opinion.docket_numbers[i],
                neutral_cite=self.site_opinion.neutral_citations[i],
                federal_cite_one=self.site_opinion.west_citations[i],
            )
            cite.save(index=False)
            docket = Docket(
                case_name=self.site_opinion.case_names[i],
                court=self.court,
            )
            docket.save()
            self.docs[i] = Document(
                date_filed=self.site_opinion.case_dates[i],
                citation=cite,
                docket=docket,
                precedential_status=self.site_opinion.precedential_statuses[i],
                citation_count=cite_counts[i],
                nature_of_suit=self.site_opinion.nature_of_suit[i],
                judges=self.site_opinion.judges[i],
            )
            self.docs[i].save()

        # Create citations between the documents
        # 0 ---cites--> 1, 2
        # 1 ---cites--> 2
        # 2 ---cites--> 0
        self.docs[0].cases_cited.add(self.docs[1].citation)
        self.docs[0].cases_cited.add(self.docs[2].citation)
        self.docs[1].cases_cited.add(self.docs[2].citation)
        self.docs[2].cases_cited.add(self.docs[0].citation)

        for doc in self.docs.itervalues():
            doc.save()

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        Command().scrape_court(site, full_crawl=True)

        self.expected_num_results_opinion = 3
        self.expected_num_results_audio = 2
        self.si_opinion.commit()
        self.si_audio.commit()
Example #22
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("Attempting to add item at: %s" % item['url'])
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except:
            logger.info("Unable to get item at: %s" % item['url'])
            queue.task_done()
            continue  # msg and r are unbound after a failure; skip this item

        if msg:
            logger.warn(msg)
            queue.task_done()
            continue  # don't fall through with a failed download

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("Item already exists, moving to next item.")
            queue.task_done()
        else:
            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name,
                                                         cf,
                                                         save=False)
            except:
                msg = 'Unable to save binary to disk. Deleted document: %s.\n%s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()
                continue  # without the file there is nothing to save below

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async((audio_file.pk, ),
                                           countdown=random_delay)

            logger.info("Successfully added audio file %s: %s" %
                        (audio_file.pk, audio_file.case_name))
Example #23
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("Attempting to add item at: %s" % item['url'])
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except:
            logger.info("Unable to get item at: %s" % item['url'])
            queue.task_done()
            continue  # msg and r are unbound after a failure; skip this item

        if msg:
            logger.warn(msg)
            queue.task_done()
            continue  # don't fall through with a failed download

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("Item already exists, moving to next item.")
            queue.task_done()
        else:
            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name, cf,
                                                         save=False)
            except:
                msg = 'Unable to save binary to disk. Deleted document: %s.\n%s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()
                continue  # without the file there is nothing to save below

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async(
                (audio_file.pk,),
                countdown=random_delay
            )

            logger.info("Successfully added audio file %s: %s" % (
                audio_file.pk, audio_file.case_name))
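
Per its docstring, download_and_save() is meant to run in several worker threads that drain the shared queue. A minimal sketch of a driver, assuming a module-level Queue named queue has already been filled (the thread count of 4 is illustrative):

import threading

# Hypothetical driver: start daemon workers, then block until every
# queued item has been marked done via queue.task_done().
for _ in range(4):
    t = threading.Thread(target=download_and_save)
    t.daemon = True  # don't block interpreter exit on the endless loop
    t.start()
queue.join()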
Example #24
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(vol_tree,
                                                 case_location_relative)
    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc