Example #1
    def test_save_old_opinion(self):
        """Can we save opinions older than 1900?"""
        court = Court.objects.get(pk='test')

        cite = Citation(case_name=u"Blah")
        cite.save(index=True)
        docket = Docket(
            case_name=u"Blah",
            court=court,
        )
        docket.save()
        d = Document(
            citation=cite,
            docket=docket,
            date_filed=datetime.date(1899, 1, 1),
        )

        try:
            cf = ContentFile(StringIO.StringIO('blah').read())
            d.local_path.save('file_name.pdf', cf, save=False)
            d.save(index=True)
        except ValueError:
            raise ValueError("Unable to save a case older than 1900. Did you "
                             "try to use `strftime`...again?")
Example #2
    def test_solr_ingestion_and_deletion(self):
        """Do items get added to the Solr index when they are ingested?"""
        site = test_opinion_scraper.Site().parse()
        path = os.path.join(settings.INSTALL_ROOT, 'alert', site.download_urls[0])  # a simple PDF
        with open(path) as f:
            content = f.read()
            cf = ContentFile(content)
            extension = get_extension(content)
        cite = Citation()
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        doc = Document(
            date_filed=site.case_dates[0],
            docket=docket,
            citation=cite,
        )
        file_name = trunc(site.case_names[0].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        response = self.si.raw_query(**{'q': 'supreme', 'caller': 'scraper_test'}).execute()
        count = response.result.numFound
        self.assertEqual(count, 1, "There were %s items found when there should have been 1" % count)
Example #3
class BulkDataTest(TestCase):
    fixtures = ['test_court.json']
    tmp_data_dir = '/tmp/bulk-dir/'

    def setUp(self):
        c1 = Citation(case_name=u"foo")
        c1.save(index=False)
        docket = Docket(
            case_name=u'foo',
            court=Court.objects.get(pk='test'),
        )
        docket.save()
        # Must be more than a year old for all tests to be runnable.
        last_month = now().date() - timedelta(days=400)
        self.doc = Document(citation=c1, docket=docket, date_filed=last_month)
        self.doc.save(index=False)

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        OralArgumentCommand().scrape_court(site, full_crawl=True)

    def tearDown(self):
        Document.objects.all().delete()
        Docket.objects.all().delete()
        Citation.objects.all().delete()
        shutil.rmtree(self.tmp_data_dir)

    @override_settings(BULK_DATA_DIR=tmp_data_dir)
    def test_make_all_bulk_files(self):
        """Can we successfully generate all bulk files?"""
        Command().execute()
Example #4
    def test_should_we_continue_break_or_carry_on_with_dup_found_and_older_date(self):
        content = "this is dummy content that we hash"
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            # Note that the next case occurs prior to the current one
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document,
                now(),
                now() - timedelta(days=1),
                lookup_value=content_hash,
                lookup_by='sha1'
            )
            if dup_checker.full_crawl:
                self.assertEqual(
                    onwards,
                    'CONTINUE',
                    'DupChecker says to %s during a full crawl.' % onwards)
            else:
                self.assertEqual(
                    onwards,
                    'BREAK',
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s" %
                    (onwards, dup_checker.dup_count, dup_checker.dup_threshold)
                )
            doc.delete()
Example #5
    def test_should_we_continue_break_or_carry_on_with_a_dup_found(self):
        # Set the dup_threshold to zero for this test
        self.dup_checkers = [
            DupChecker(self.court, full_crawl=True, dup_threshold=0),
            DupChecker(self.court, full_crawl=False, dup_threshold=0),
        ]
        content = "this is dummy content that we hash"
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            # Create a document, then use the dup_checker to see if it exists.
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document, now(), now(), lookup_value=content_hash, lookup_by="sha1"
            )
            if dup_checker.full_crawl:
                self.assertEqual(onwards, "CONTINUE", "DupChecker says to %s during a full crawl." % onwards)

            else:
                self.assertEqual(
                    onwards,
                    "BREAK",
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s"
                    % (onwards, dup_checker.dup_count, dup_checker.dup_threshold),
                )

            doc.delete()
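Taken together, the DupChecker tests above pin down the contract of should_we_continue_break_or_carry_on. The following is a hypothetical sketch of the decision logic the tests imply, not the project's actual implementation (which tracks more state than shown):

def should_we_continue_break_or_carry_on(self, model, current_date, next_date,
                                         lookup_value, lookup_by='sha1'):
    # Sketch only: reconstructs what the assertions above require.
    is_dup = model.objects.filter(**{lookup_by: lookup_value}).exists()
    if not is_dup:
        return 'CARRY_ON'  # New item; process it.
    if self.full_crawl:
        return 'CONTINUE'  # Full crawls expect duplicates; skip and move on.
    self.dup_count += 1
    if (self.dup_count > self.dup_threshold or
            (next_date is not None and next_date < current_date)):
        return 'BREAK'  # Past the duplicate threshold, or moving back in time.
    return 'CONTINUE'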
Example #6
    def test_content_extraction(self):
        """Do all of the supported mimetypes get extracted to text successfully, including OCR?"""
        site = test_scraper.Site().parse()

        test_strings = ['supreme',
                        'intelligence',
                        'indiana',
                        'reagan',
                        'indiana',
                        'fidelity']
        for i in range(0, len(site.case_names)):
            path = os.path.join(settings.INSTALL_ROOT, 'alert', site.download_urls[i])
            with open(path) as f:
                content = f.read()
                cf = ContentFile(content)
                extension = get_extension(content)
            cite = Citation(case_name=site.case_names[i])
            cite.save(index=False)
            doc = Document(date_filed=site.case_dates[i],
                           court=self.court,
                           citation=cite)
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            doc.local_path.save(file_name, cf, save=False)
            doc.save(index=False)
            doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
            if extension in ['.html', '.wpd']:
                self.assertIn(test_strings[i], doc.html.lower())
            else:
                self.assertIn(test_strings[i], doc.plain_text.lower())

            doc.delete()
Example #7

    def associate_meta_data_to_objects(self, site, i, court, sha1_hash):
        """Takes the metadata from the scraper and associates it with objects. Returns the created objects.
        """
        cite = Citation(case_name=site.case_names[i])
        if site.docket_numbers:
            cite.docket_number = site.docket_numbers[i]
        if site.neutral_citations:
            cite.neutral_cite = site.neutral_citations[i]
        if site.west_citations:
            cite.federal_cite_one = site.west_citations[i]
        if site.west_state_citations:
            cite.west_state_cite = site.west_state_citations[i]

        docket = Docket(
            case_name=site.case_names[i],
            court=court,
        )

        doc = Document(source='C',
                       sha1=sha1_hash,
                       date_filed=site.case_dates[i],
                       download_url=site.download_urls[i],
                       precedential_status=site.precedential_statuses[i])
        if site.judges:
            doc.judges = site.judges[i]
        if site.nature_of_suit:
            doc.nature_of_suit = site.nature_of_suit[i]

        return cite, docket, doc
Example #8

    def associate_meta_data_to_objects(self, site, i, court, sha1_hash):
        """Takes the metadata from the scraper and associates it with objects.

        Returns the created objects.
        """
        cite = Citation(case_name=site.case_names[i])
        if site.docket_numbers:
            cite.docket_number = site.docket_numbers[i]
        if site.neutral_citations:
            cite.neutral_cite = site.neutral_citations[i]
        if site.west_citations:
            cite.federal_cite_one = site.west_citations[i]
        if site.west_state_citations:
            cite.west_state_cite = site.west_state_citations[i]

        docket = Docket(case_name=site.case_names[i], court=court)

        doc = Document(
            source="C",
            sha1=sha1_hash,
            date_filed=site.case_dates[i],
            download_url=site.download_urls[i],
            precedential_status=site.precedential_statuses[i],
        )
        if site.judges:
            doc.judges = site.judges[i]
        if site.nature_of_suit:
            doc.nature_of_suit = site.nature_of_suit[i]

        return cite, docket, doc
Example #9
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_opinion_scraper.Site().parse()
        cite = Citation(
            docket_number=site.docket_numbers[0],
            neutral_cite=site.neutral_citations[0],
            federal_cite_one=site.west_citations[0]
        )
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        self.doc = Document(
            date_filed=site.case_dates[0],
            citation=cite,
            docket=docket,
            precedential_status=site.precedential_statuses[0],
        )
        self.doc.save(index=False)
Example #10
class BulkDataTest(TestCase):
    fixtures = ['test_court.json']
    tmp_data_dir = '/tmp/bulk-dir/'

    def setUp(self):
        c1 = Citation(case_name=u"foo")
        c1.save(index=False)
        docket = Docket(
            case_name=u'foo',
            court=Court.objects.get(pk='test'),
        )
        docket.save()
        # Must be more than a year old for all tests to be runnable.
        last_month = now().date() - timedelta(days=400)
        self.doc = Document(
            citation=c1,
            docket=docket,
            date_filed=last_month
        )
        self.doc.save(index=False)

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        OralArgumentCommand().scrape_court(site, full_crawl=True)

    def tearDown(self):
        Document.objects.all().delete()
        Docket.objects.all().delete()
        Citation.objects.all().delete()
        shutil.rmtree(self.tmp_data_dir)

    @override_settings(BULK_DATA_DIR=tmp_data_dir)
    def test_make_all_bulk_files(self):
        """Can we successfully generate all bulk files?"""
        Command().execute()
Example #11
    def test_solr_ingestion_and_deletion(self):
        """Do items get added to the Solr index when they are ingested?"""
        site = test_opinion_scraper.Site().parse()
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[0])  # a simple PDF
        with open(path) as f:
            content = f.read()
            cf = ContentFile(content)
            extension = get_extension(content)
        cite = Citation()
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        doc = Document(
            date_filed=site.case_dates[0],
            docket=docket,
            citation=cite,
        )
        file_name = trunc(site.case_names[0].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        response = self.si.raw_query(**{
            'q': 'supreme',
            'caller': 'scraper_test',
        }).execute()
        count = response.result.numFound
        self.assertEqual(
            count, 1,
            "There were %s items found when there should have been 1" % count)
Example #12
class ViewDocumentTest(TestCase):
    fixtures = ['test_court.json']

    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_scraper.Site().parse()
        cite = Citation(case_name=site.case_names[0],
                        docket_number=site.docket_numbers[0],
                        neutral_cite=site.neutral_citations[0],
                        federal_cite_one=site.west_citations[0])
        cite.save(index=False)
        self.doc = Document(date_filed=site.case_dates[0],
                            court=self.court,
                            citation=cite,
                            precedential_status=site.precedential_statuses[0])
        self.doc.save(index=False)

    def tearDown(self):
        self.doc.delete()

    def test_simple_url_check_for_document(self):
        """Does the page load properly?"""
        response = self.client.get('/test/2/asdf/')
        self.assertEqual(response.status_code, 200)
        self.assertIn('Tarrant', response.content)
Example #13
    def test_pagerank_calculation(self):
        """Create a few Documents and fake citation relation among them, then run the pagerank
        algorithm. Check whether this simple case can get the correct result.
        """
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')

        # Create 3 documents with their citations
        c1, c2, c3 = (Citation(case_name=u"c1"), Citation(case_name=u"c2"),
                      Citation(case_name=u"c3"))
        c1.save(index=False)
        c2.save(index=False)
        c3.save(index=False)
        d1, d2, d3 = (Document(date_filed=date.today()),
                      Document(date_filed=date.today()),
                      Document(date_filed=date.today()))
        d1.citation, d2.citation, d3.citation = c1, c2, c3
        doc_list = [d1, d2, d3]
        for d in doc_list:
            d.court = self.court
            d.citation.save(index=False)
            d.save(index=False)

        # Create a simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1.
        d1.cases_cited.add(d2.citation)
        d2.citation_count += 1
        d2.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d3.cases_cited.add(d1.citation)
        d1.citation_count += 1
        d1.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d1.save(index=False)
        d2.save(index=False)
        d3.save(index=False)

        # Calculate the PageRank of these 3 documents
        comm = Command()
        self.verbosity = 1
        comm.do_pagerank(chown=False)

        # read in the pagerank file, converting to a dict
        pr_values_from_file = {}
        with open(get_data_dir_location() + "external_pagerank") as f:
            for line in f:
                pk, value = line.split('=')
                pr_values_from_file[pk] = float(value.strip())

        # Verify whether the answers are correct, based on calculations in Gephi
        answers = {
            '1': 0.387790,
            '2': 0.214811,
            '3': 0.397400,
        }
        for key, value in answers.iteritems():
            self.assertTrue(
                abs(pr_values_from_file[key] - value) < 0.0001,
                msg="The answer for item %s was %s when it should have been %s"
                    % (key, pr_values_from_file[key], answers[key])
            )
Example #14

def import_law_box_case(case_path):
    """Open the file, get its contents, convert to XML and extract the meta data.

    Return a document object for saving in the database
    """
    raw_text = open(case_path).read()
    clean_html_tree, complete_html_tree, clean_html_str, body_text = get_html_from_raw_text(
        raw_text)

    sha1 = hashlib.sha1(clean_html_str).hexdigest()
    citations = get_citations_from_tree(complete_html_tree, case_path)
    judges = get_judge(clean_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path, judges)

    doc = Document(
        source='L',
        sha1=sha1,
        html=clean_html_str,  # we clear this field later, putting the value into html_lawbox.
        date_filed=get_date_filed(clean_html_tree,
                                  citations=citations,
                                  case_path=case_path,
                                  court=court),
        precedential_status=get_precedential_status(),
        judges=judges,
        download_url=case_path,
    )

    cite = Citation(docket_number=get_docket_number(
        clean_html_tree, case_path=case_path, court=court))

    docket = Docket(
        case_name=get_case_name(complete_html_tree, case_path),
        court=court,
    )

    # Necessary for dup_finder.
    path = '//p/text()'
    doc.body_text = ' '.join(clean_html_tree.xpath(path))

    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.iteritems():
        setattr(cite, k, v)

    doc.citation = cite
    doc.docket = docket

    return doc
Example #15
def import_mayer(case_path):
    """Open the file, get its contents, convert to XML and extract the meta data.

    Return a document object for saving in the database
    """
    raw_text = open(case_path).read()
    clean_html_tree, complete_html_tree, clean_html_str, body_text = \
        get_html_from_raw_text(raw_text)

    sha1 = hashlib.sha1(clean_html_str).hexdigest()
    citations = get_citations_from_tree(complete_html_tree, case_path)
    judges = get_judge(clean_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path, judges)

    doc = Document(
        source='L',
        sha1=sha1,
        html=clean_html_str,  # we clear this field later, putting the value into html_lawbox.
        date_filed=get_date_filed(clean_html_tree, citations=citations, case_path=case_path, court=court),
        precedential_status=get_precedential_status(),
        judges=judges,
        download_url=case_path,
    )

    cite = Citation()

    docket = Docket(
        docket_number=get_docket_number(
            clean_html_tree,
            case_path=case_path,
            court=court
        ),
        case_name=get_case_name(complete_html_tree, case_path),
        court=court,
    )

    # Necessary for dup_finder.
    path = '//p/text()'
    doc.body_text = ' '.join(clean_html_tree.xpath(path))

    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.iteritems():
        setattr(cite, k, v)

    doc.citation = cite
    doc.docket = docket

    return doc
Example #16
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_opinion_scraper.Site().parse()
        cite = Citation(
            neutral_cite=site.neutral_citations[0],
            federal_cite_one=site.west_citations[0]
        )
        cite.save(index=False)
        docket = Docket(
            docket_number=site.docket_numbers[0],
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        self.doc = Document(
            date_filed=site.case_dates[0],
            citation=cite,
            docket=docket,
            precedential_status=site.precedential_statuses[0],
        )
        self.doc.save(index=False)
Example #17
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Set up a testing core in Solr and swap it in
        self.core_name = '%s.test-%s' % (self.__module__, time.time())
        create_solr_core(self.core_name)
        swap_solr_core('collection1', self.core_name)
        self.si = sunburnt.SolrInterface(settings.SOLR_URL, mode='rw')

        # Add two documents to the index, but don't extract their contents
        self.site = test_scraper.Site().parse()
        cite_counts = (4, 6)
        for i in range(0, 2):
            cite = Citation(case_name=self.site.case_names[i],
                            docket_number=self.site.docket_numbers[i],
                            neutral_cite=self.site.neutral_citations[i],
                            federal_cite_one=self.site.west_citations[i])
            cite.save(index=False)
            self.doc = Document(date_filed=self.site.case_dates[i],
                                court=self.court,
                                citation=cite,
                                precedential_status=self.site.precedential_statuses[i],
                                citation_count=cite_counts[i],
                                nature_of_suit=self.site.nature_of_suit[i],
                                judges=self.site.judges[i])
            self.doc.save()

        self.expected_num_results = 2
Example #18
    def setUp(self):
        c1 = Citation(case_name=u"foo")
        c1.save(index=False)
        docket = Docket(
            case_name=u'foo',
            court=Court.objects.get(pk='test'),
        )
        docket.save()
        # Must be more than a year old for all tests to be runnable.
        last_month = now().date() - timedelta(days=400)
        self.doc = Document(citation=c1, docket=docket, date_filed=last_month)
        self.doc.save(index=False)

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        OralArgumentCommand().scrape_court(site, full_crawl=True)
Example #19
def create_stub(citations):
    """Creates a stub document with the bare minimum of meta data."""
    cite = Citation()
    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.iteritems():
        setattr(cite, k, v)
    # TODO: We can use the court information in the citation here. Failure to do so will mean that our URLs will later
    #       change -- something we wish to avoid.
    stub_doc = Document(
        is_stub_document=True,
        sha1='!',
        court=None,
        citation=cite,
    )
    stub_doc.save(index=False)
    return stub_doc
Example #20
    def setUp(self):
        self.court = Court.objects.get(pk='test')

        # create 3 documents with their citations and dockets
        c1, c2, c3 = Citation(case_name=u"c1"), Citation(
            case_name=u"c2"), Citation(case_name=u"c3")
        c1.save(index=False)
        c2.save(index=False)
        c3.save(index=False)
        docket1 = Docket(
            case_name=u"c1",
            court=self.court,
        )
        docket2 = Docket(
            case_name=u"c2",
            court=self.court,
        )
        docket3 = Docket(
            case_name=u"c3",
            court=self.court,
        )
        docket1.save()
        docket2.save()
        docket3.save()
        d1, d2, d3 = Document(date_filed=date.today()), Document(
            date_filed=date.today()), Document(date_filed=date.today())
        d1.citation, d2.citation, d3.citation = c1, c2, c3
        d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
        doc_list = [d1, d2, d3]
        for d in doc_list:
            d.citation.save(index=False)
            d.save(index=False)

        # create simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1;
        d1.cases_cited.add(d2.citation)
        d2.citation_count += 1
        d2.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d3.cases_cited.add(d1.citation)
        d1.citation_count += 1
        d1.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d1.save(index=False)
        d2.save(index=False)
        d3.save(index=False)
Example #21
    def test_should_we_continue_break_or_carry_on_with_a_dup_found(self):
        # Set the dup_threshold to zero for this test
        self.dup_checkers = [
            DupChecker(self.court, full_crawl=True, dup_threshold=0),
            DupChecker(self.court, full_crawl=False, dup_threshold=0)
        ]
        content = "this is dummy content that we hash"
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            # Create a document, then use the dup_checker to see if it exists.
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document,
                now(),
                now(),
                lookup_value=content_hash,
                lookup_by='sha1')
            if dup_checker.full_crawl:
                self.assertEqual(
                    onwards, 'CONTINUE',
                    'DupChecker says to %s during a full crawl.' % onwards)

            else:
                self.assertEqual(
                    onwards, 'BREAK',
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s" %
                    (onwards, dup_checker.dup_count,
                     dup_checker.dup_threshold))

            doc.delete()
Example #22
    def test_should_we_continue_break_or_carry_on_with_dup_found_and_older_date(
            self):
        content = "this is dummy content that we hash"
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            # Note that the next case occurs prior to the current one
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document,
                now(),
                now() - timedelta(days=1),
                lookup_value=content_hash,
                lookup_by='sha1')
            if dup_checker.full_crawl:
                self.assertEqual(
                    onwards, 'CONTINUE',
                    'DupChecker says to %s during a full crawl.' % onwards)
            else:
                self.assertEqual(
                    onwards, 'BREAK',
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s" %
                    (onwards, dup_checker.dup_count,
                     dup_checker.dup_threshold))
            doc.delete()
Example #23
class ViewDocumentTest(TestCase):
    fixtures = ['test_court.json']

    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_opinion_scraper.Site().parse()
        cite = Citation(
            docket_number=site.docket_numbers[0],
            neutral_cite=site.neutral_citations[0],
            federal_cite_one=site.west_citations[0]
        )
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        self.doc = Document(
            date_filed=site.case_dates[0],
            citation=cite,
            docket=docket,
            precedential_status=site.precedential_statuses[0],
        )
        self.doc.save(index=False)

    def tearDown(self):
        self.doc.delete()

    def test_simple_url_check_for_document(self):
        """Does the page load properly?"""
        response = self.client.get('/opinion/1/asdf/')
        self.assertEqual(response.status_code, 200)
        self.assertIn('Tarrant', response.content)
Example #24
    def setUp(self):
        self.court = Court.objects.get(pk='test')

        # create 3 documents with their citations and dockets
        c1, c2, c3 = Citation(case_name=u"c1"), Citation(
            case_name=u"c2"), Citation(case_name=u"c3")
        c1.save(index=False)
        c2.save(index=False)
        c3.save(index=False)
        docket1 = Docket(
            case_name=u"c1",
            court=self.court,
        )
        docket2 = Docket(
            case_name=u"c2",
            court=self.court,
        )
        docket3 = Docket(
            case_name=u"c3",
            court=self.court,
        )
        docket1.save()
        docket2.save()
        docket3.save()
        d1, d2, d3 = Document(date_filed=date.today()), Document(
            date_filed=date.today()), Document(date_filed=date.today())
        d1.citation, d2.citation, d3.citation = c1, c2, c3
        d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
        doc_list = [d1, d2, d3]
        for d in doc_list:
            d.citation.save(index=False)
            d.save(index=False)

        # create simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1;
        d1.cases_cited.add(d2.citation)
        d2.citation_count += 1
        d2.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d3.cases_cited.add(d1.citation)
        d1.citation_count += 1
        d1.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d1.save(index=False)
        d2.save(index=False)
        d3.save(index=False)
Example #25
    def setUp(self):
        c1 = Citation(case_name=u"foo")
        c1.save(index=False)
        docket = Docket(
            case_name=u'foo',
            court=Court.objects.get(pk='test'),
        )
        docket.save()
        # Must be more than a year old for all tests to be runnable.
        last_month = now().date() - timedelta(days=400)
        self.doc = Document(
            citation=c1,
            docket=docket,
            date_filed=last_month
        )
        self.doc.save(index=False)

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        OralArgumentCommand().scrape_court(site, full_crawl=True)
Example #26
    def test_updating_the_docket_when_the_citation_case_name_changes(self):
        """Makes sure that the docket changes when the citation does."""
        court = Court.objects.get(pk='test')

        original_case_name = u'original case name'
        new_case_name = u'new case name'
        cite = Citation(case_name=original_case_name)
        cite.save(index=False)
        docket = Docket(
            case_name=original_case_name,
            court=court,
        )
        docket.save()
        Document(
            citation=cite,
            docket=docket,
        ).save(index=False)
        cite.case_name = new_case_name
        cite.save(index=False)
        changed_docket = Docket.objects.get(pk=docket.pk)
        self.assertEqual(changed_docket.case_name, new_case_name)
Example #27
    def test_content_extraction(self):
        """Do all of the supported mimetypes get extracted to text
        successfully, including OCR?"""
        site = test_opinion_scraper.Site().parse()

        test_strings = [
            'supreme', 'intelligence', 'indiana', 'reagan', 'indiana',
            'fidelity'
        ]
        for i in range(0, len(site.case_names)):
            path = os.path.join(settings.INSTALL_ROOT, 'alert',
                                site.download_urls[i])
            with open(path) as f:
                content = f.read()
                cf = ContentFile(content)
                extension = get_extension(content)
            cite = Citation()
            cite.save(index=False)
            docket = Docket(
                case_name=site.case_names[i],
                court=self.court,
            )
            docket.save()
            doc = Document(
                date_filed=site.case_dates[i],
                citation=cite,
                docket=docket,
            )
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            doc.local_path.save(file_name, cf, save=False)
            doc.save(index=False)
            doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
            if extension in ['.html', '.wpd']:
                self.assertIn(test_strings[i], doc.html.lower())
            else:
                self.assertIn(test_strings[i], doc.plain_text.lower())

            doc.delete()
Example #28
    def test_pagerank_calculation(self):
        """Create a few Documents and fake citation relation among them, then
        run the pagerank algorithm. Check whether this simple case can get the
        correct result.
        """
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')

        # create 3 documents with their citations and dockets
        c1, c2, c3 = Citation(case_name=u"c1"), Citation(
            case_name=u"c2"), Citation(case_name=u"c3")
        c1.save(index=False)
        c2.save(index=False)
        c3.save(index=False)
        docket1 = Docket(
            case_name=u"c1",
            court=self.court,
        )
        docket2 = Docket(
            case_name=u"c2",
            court=self.court,
        )
        docket3 = Docket(
            case_name=u"c3",
            court=self.court,
        )
        # Save the dockets so they have primary keys before documents
        # reference them.
        docket1.save()
        docket2.save()
        docket3.save()
        d1, d2, d3 = Document(date_filed=date.today()), Document(
            date_filed=date.today()), Document(date_filed=date.today())
        d1.citation, d2.citation, d3.citation = c1, c2, c3
        d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
        doc_list = [d1, d2, d3]
        for d in doc_list:
            d.citation.save(index=False)
            d.save(index=False)

        # Create a simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1.
        d1.cases_cited.add(d2.citation)
        d2.citation_count += 1
        d2.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d3.cases_cited.add(d1.citation)
        d1.citation_count += 1
        d1.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d1.save(index=False)
        d2.save(index=False)
        d3.save(index=False)

        # Calculate the PageRank of these 3 documents
        comm = Command()
        self.verbosity = 1
        comm.do_pagerank(chown=False)

        # read in the pagerank file, converting to a dict
        pr_values_from_file = {}
        with open(get_data_dir_location() + "external_pagerank") as f:
            for line in f:
                pk, value = line.split('=')
                pr_values_from_file[pk] = float(value.strip())

        # Verify whether the answers are correct, based on calculations in
        # Gephi
        answers = {
            '1': 0.387790,
            '2': 0.214811,
            '3': 0.397400,
        }
        for key, value in answers.iteritems():
            self.assertTrue(
                abs(pr_values_from_file[key] - value) < 0.0001,
                msg="The answer for item %s was %s when it should have been "
                "%s" % (
                    key,
                    pr_values_from_file[key],
                    answers[key],
                ))
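As a cross-check on the hard-coded answers (which the comment above attributes to calculations in Gephi), the same four-edge citation graph can be fed to networkx, whose pagerank() defaults to the same 0.85 damping factor. This is a verification sketch; networkx is not a dependency of the code above:

import networkx as nx

# The citing relation from the test: 1 cites 2 and 3; 2 cites 3; 3 cites 1.
g = nx.DiGraph()
g.add_edges_from([('1', '2'), ('1', '3'), ('2', '3'), ('3', '1')])

pr = nx.pagerank(g, alpha=0.85)
print pr  # roughly {'1': 0.38779, '2': 0.21481, '3': 0.39740}, matching answers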
Example #29

def scrape_court(site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(site.court_id, full_crawl=full_crawl)
    abort = dup_checker.abort_by_hash(site.hash)
    if not abort:
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(site.download_urls[i],
                                        site._get_cookies())
            if msg:
                # Download failed; log it and skip before touching r.content.
                logger.warn(msg)
                ErrorLog(log_level='WARNING',
                         court=court,
                         message=msg).save()
                continue
            clean_content = site._cleanup_content(r.content)

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                next_date = None

            # Make a hash of the data. Need to convert unicode to binary before hashing.
            if type(clean_content) == unicode:
                hash_content = clean_content.encode('utf-8')
            else:
                hash_content = clean_content
            sha1_hash = hashlib.sha1(hash_content).hexdigest()
            if court_str == 'nev' and site.precedential_statuses[i] == 'Unpublished':
                # Nevada's non-precedential cases have different SHA1 sums every time.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by='download_url'
                )
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' % site.download_urls[i])
                dup_checker.reset()

                # Make a citation
                cite = Citation(case_name=site.case_names[i])
                if site.docket_numbers:
                    cite.docket_number = site.docket_numbers[i]
                if site.neutral_citations:
                    cite.neutral_cite = site.neutral_citations[i]
                if site.west_citations:
                    cite.federal_cite_one = site.west_citations[i]
                if site.west_state_citations:
                    cite.west_state_cite = site.west_state_citations[i]

                # Make the document object
                doc = Document(source='C',
                               sha1=sha1_hash,
                               date_filed=site.case_dates[i],
                               court=court,
                               download_url=site.download_urls[i],
                               precedential_status=site.precedential_statuses[i])

                # Make and associate the file object
                try:
                    cf = ContentFile(clean_content)
                    extension = get_extension(r.content)
                    # See issue #215 for why this must be lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + extension
                    doc.local_path.save(file_name, cf, save=False)
                except:
                    msg = ('Unable to save binary to disk. Deleted document: '
                           '%s.\n%s' % (cite.case_name, traceback.format_exc()))
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
                    download_error = True
                    continue

                if site.judges:
                    doc.judges = site.judges[i]
                if site.nature_of_suit:
                    doc.nature_of_suit = site.nature_of_suit[i]

                # Save everything, but don't update Solr index yet
                cite.save(index=False)
                doc.citation = cite
                doc.save(index=False)

                # Extract the contents asynchronously.
                extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))

                logger.info("Successfully added doc %s: %s" % (doc.pk, site.case_names[i]))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
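The unicode-to-binary conversion before hashing above matters on Python 2: hashlib implicitly ASCII-encodes unicode input, which fails on any non-ASCII character. A minimal standalone illustration:

import hashlib

content = u'Pe\xf1a v. State'  # unicode containing a non-ASCII character
try:
    hashlib.sha1(content).hexdigest()  # implicit ASCII encoding fails here
except UnicodeEncodeError as e:
    print e
print hashlib.sha1(content.encode('utf-8')).hexdigest()  # encode first: works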
Example #30
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(vol_tree,
                                                 case_location_relative)
    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
Example #31
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(
        vol_tree, case_location_relative)
    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
Example #32
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')

        # Set up testing cores in Solr and swap them in
        self.core_name_opinion = '%s.opinion-test-%s' % \
                                 (self.__module__, time.time())
        self.core_name_audio = '%s.audio-test-%s' % \
                               (self.__module__, time.time())
        create_solr_core(self.core_name_opinion)
        create_solr_core(
            self.core_name_audio,
            schema=os.path.join(settings.INSTALL_ROOT, 'Solr', 'conf',
                                'audio_schema.xml'),
            instance_dir='/usr/local/solr/example/solr/audio',
        )
        swap_solr_core('collection1', self.core_name_opinion)
        swap_solr_core('audio', self.core_name_audio)
        self.si_opinion = sunburnt.SolrInterface(settings.SOLR_OPINION_URL,
                                                 mode='rw')
        self.si_audio = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL,
                                               mode='rw')

        # Add three documents and three audio files to the index, but don't
        # extract their contents
        self.site_opinion = test_opinion_scraper.Site().parse()
        self.site_audio = test_oral_arg_scraper.Site().parse()
        cite_counts = (4, 6, 8)
        self.docs = {}
        for i in range(0, 3):
            cite = Citation(
                case_name=self.site_opinion.case_names[i],
                docket_number=self.site_opinion.docket_numbers[i],
                neutral_cite=self.site_opinion.neutral_citations[i],
                federal_cite_one=self.site_opinion.west_citations[i],
            )
            cite.save(index=False)
            docket = Docket(
                case_name=self.site_opinion.case_names[i],
                court=self.court,
            )
            docket.save()
            self.docs[i] = Document(
                date_filed=self.site_opinion.case_dates[i],
                citation=cite,
                docket=docket,
                precedential_status=self.site_opinion.precedential_statuses[i],
                citation_count=cite_counts[i],
                nature_of_suit=self.site_opinion.nature_of_suit[i],
                judges=self.site_opinion.judges[i],
            )
            self.docs[i].save()

        # Create citations between the documents
        # 0 ---cites--> 1, 2
        # 1 ---cites--> 2
        # 2 ---cites--> 0
        self.docs[0].cases_cited.add(self.docs[1].citation)
        self.docs[0].cases_cited.add(self.docs[2].citation)
        self.docs[1].cases_cited.add(self.docs[2].citation)
        self.docs[2].cases_cited.add(self.docs[0].citation)

        for doc in self.docs.itervalues():
            doc.save()

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        Command().scrape_court(site, full_crawl=True)

        self.expected_num_results_opinion = 3
        self.expected_num_results_audio = 2
        self.si_opinion.commit()
        self.si_audio.commit()