Example #1
    def test_should_we_continue_break_or_carry_on_with_dup_found_and_older_date(self):
        content = "this is dummy content that we hash"
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            # Note that the next case occurs prior to the current one
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document,
                now(),
                now() - timedelta(days=1),
                lookup_value=content_hash,
                lookup_by='sha1'
            )
            if dup_checker.full_crawl:
                self.assertEqual(
                    onwards,
                    'CONTINUE',
                    'DupChecker says to %s during a full crawl.' % onwards)
            else:
                self.assertEqual(
                    onwards,
                    'BREAK',
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s" %
                    (onwards, dup_checker.dup_count, dup_checker.dup_threshold)
                )
            doc.delete()
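The examples above hash the raw document content with hashlib.sha1 before storing it on Document.sha1, which works because they run under Python 2, where str is already a byte string. A minimal sketch of the same hash under Python 3, where the text must be encoded first (the sample string is made up):

import hashlib

content = "this is dummy content that we hash"
# Python 3: sha1() requires bytes, so encode the text before hashing.
content_hash = hashlib.sha1(content.encode('utf-8')).hexdigest()
print(content_hash)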
Example #2
    def test_save_old_opinion(self):
        """Can we save opinions older than 1900?"""
        court = Court.objects.get(pk='test')

        cite = Citation(case_name=u"Blah")
        cite.save(index=True)
        docket = Docket(
            case_name=u"Blah",
            court=court,
        )
        docket.save()
        d = Document(
            citation=cite,
            docket=docket,
            date_filed=datetime.date(1899, 1, 1),
        )

        try:
            cf = ContentFile(StringIO.StringIO('blah').read())
            d.local_path.save('file_name.pdf', cf, save=False)
            d.save(index=True)
        except ValueError:
            raise ValueError("Unable to save a case older than 1900. Did you "
                             "try to use `strftime`...again?")
Example #3
    def test_should_we_continue_break_or_carry_on_with_a_dup_found(self):
        # Set the dup_threshold to zero for this test
        self.dup_checkers = [
            DupChecker(self.court, full_crawl=True, dup_threshold=0),
            DupChecker(self.court, full_crawl=False, dup_threshold=0)
        ]
        content = "this is dummy content that we hash"
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            # Create a document, then use the dup_checker to see if it exists.
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document,
                now(),
                now(),
                lookup_value=content_hash,
                lookup_by='sha1')
            if dup_checker.full_crawl:
                self.assertEqual(
                    onwards, 'CONTINUE',
                    'DupChecker says to %s during a full crawl.' % onwards)

            else:
                self.assertEqual(
                    onwards, 'BREAK',
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s" %
                    (onwards, dup_checker.dup_count,
                     dup_checker.dup_threshold))

            doc.delete()
Example #4
    def test_should_we_continue_break_or_carry_on_with_dup_found_and_older_date(
            self):
        content = "this is dummy content that we hash"
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            # Note that the next case occurs prior to the current one
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document,
                now(),
                now() - timedelta(days=1),
                lookup_value=content_hash,
                lookup_by='sha1')
            if dup_checker.full_crawl:
                self.assertEqual(
                    onwards, 'CONTINUE',
                    'DupChecker says to %s during a full crawl.' % onwards)
            else:
                self.assertEqual(
                    onwards, 'BREAK',
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s" %
                    (onwards, dup_checker.dup_count,
                     dup_checker.dup_threshold))
            doc.delete()
Example #5
    def test_should_we_continue_break_or_carry_on_with_a_dup_found(self):
        # Set the dup_threshold to zero for this test
        self.dup_checkers = [
            DupChecker(self.court, full_crawl=True, dup_threshold=0),
            DupChecker(self.court, full_crawl=False, dup_threshold=0),
        ]
        content = "this is dummy content that we hash"
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            # Create a document, then use the dup_checker to see if it exists.
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document, now(), now(), lookup_value=content_hash, lookup_by="sha1"
            )
            if dup_checker.full_crawl:
                self.assertEqual(onwards, "CONTINUE", "DupChecker says to %s during a full crawl." % onwards)

            else:
                self.assertEqual(
                    onwards,
                    "BREAK",
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s"
                    % (onwards, dup_checker.dup_count, dup_checker.dup_threshold),
                )

            doc.delete()
Example #6
    def test_content_extraction(self):
        """Do all of the supported mimetypes get extracted to text
        successfully, including OCR?"""
        site = test_opinion_scraper.Site().parse()

        test_strings = ["supreme", "intelligence", "indiana", "reagan", "indiana", "fidelity"]
        for i in range(0, len(site.case_names)):
            path = os.path.join(settings.INSTALL_ROOT, "alert", site.download_urls[i])
            with open(path) as f:
                content = f.read()
                cf = ContentFile(content)
                extension = get_extension(content)
            cite = Citation()
            cite.save(index=False)
            docket = Docket(case_name=site.case_names[i], court=self.court)
            docket.save()
            doc = Document(date_filed=site.case_dates[i], citation=cite, docket=docket)
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            doc.local_path.save(file_name, cf, save=False)
            doc.save(index=False)
            doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
            if extension in [".html", ".wpd"]:
                self.assertIn(test_strings[i], doc.html.lower())
            else:
                self.assertIn(test_strings[i], doc.plain_text.lower())

            doc.delete()
Example #7
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_opinion_scraper.Site().parse()
        cite = Citation(
            neutral_cite=site.neutral_citations[0],
            federal_cite_one=site.west_citations[0]
        )
        cite.save(index=False)
        docket = Docket(
            docket_number=site.docket_numbers[0],
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        self.doc = Document(
            date_filed=site.case_dates[0],
            citation=cite,
            docket=docket,
            precedential_status=site.precedential_statuses[0],
        )
        self.doc.save(index=False)
Example #8
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_opinion_scraper.Site().parse()
        cite = Citation(
            docket_number=site.docket_numbers[0],
            neutral_cite=site.neutral_citations[0],
            federal_cite_one=site.west_citations[0]
        )
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        self.doc = Document(
            date_filed=site.case_dates[0],
            citation=cite,
            docket=docket,
            precedential_status=site.precedential_statuses[0],
        )
        self.doc.save(index=False)
Example #9
    def test_solr_ingestion_and_deletion(self):
        """Do items get added to the Solr index when they are ingested?"""
        site = test_opinion_scraper.Site().parse()
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[0])  # a simple PDF
        with open(path) as f:
            content = f.read()
            cf = ContentFile(content)
            extension = get_extension(content)
        cite = Citation()
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        doc = Document(
            date_filed=site.case_dates[0],
            docket=docket,
            citation=cite,
        )
        file_name = trunc(site.case_names[0].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        response = self.si.raw_query(**{
            'q': 'supreme',
            'caller': 'scraper_test',
        }).execute()
        count = response.result.numFound
        self.assertEqual(
            count, 1,
            "There were %s items found when there should have been 1" % count)
    def associate_meta_data_to_objects(self, site, i, court, sha1_hash):
        """Takes the meta data from the scraper and assocites it with objects.

        Returns the created objects.
        """
        cite = Citation(case_name=site.case_names[i])
        if site.neutral_citations:
            cite.neutral_cite = site.neutral_citations[i]
        if site.west_citations:
            cite.federal_cite_one = site.west_citations[i]
        if site.west_state_citations:
            cite.west_state_cite = site.west_state_citations[i]

        docket = Docket()
        if site.docket_numbers:
            docket.docket_number = site.docket_numbers[i]
        docket.case_name = site.case_names[i]
        docket.court = court

        doc = Document(
            source='C',
            sha1=sha1_hash,
            date_filed=site.case_dates[i],
            download_url=site.download_urls[i],
            precedential_status=site.precedential_statuses[i]
        )
        if site.judges:
            doc.judges = site.judges[i]
        if site.nature_of_suit:
            doc.nature_of_suit = site.nature_of_suit[i]

        return cite, docket, doc
Example #11
    def test_solr_ingestion_and_deletion(self):
        """Do items get added to the Solr index when they are ingested?"""
        site = test_opinion_scraper.Site().parse()
        path = os.path.join(settings.INSTALL_ROOT, 'alert', site.download_urls[0])  # a simple PDF
        with open(path) as f:
            content = f.read()
            cf = ContentFile(content)
            extension = get_extension(content)
        cite = Citation()
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        doc = Document(
            date_filed=site.case_dates[0],
            docket=docket,
            citation=cite,
        )
        file_name = trunc(site.case_names[0].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        response = self.si.raw_query(**{'q': 'supreme', 'caller': 'scraper_test'}).execute()
        count = response.result.numFound
        self.assertEqual(count, 1, "There were %s items found when there should have been 1" % count)
Example #12
    def setUp(self):
        c1 = Citation(case_name=u"foo")
        c1.save(index=False)
        docket = Docket(
            case_name=u'foo',
            court=Court.objects.get(pk='test'),
        )
        docket.save()
        # Must be more than a year old for all tests to be runnable.
        last_month = now().date() - timedelta(days=400)
        self.doc = Document(citation=c1, docket=docket, date_filed=last_month)
        self.doc.save(index=False)

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        OralArgumentCommand().scrape_court(site, full_crawl=True)
Example #13
    def associate_meta_data_to_objects(self, site, i, court, sha1_hash):
        """Takes the meta data from the scraper and assocites it with objects. Returns the created objects.
        """
        cite = Citation(case_name=site.case_names[i])
        if site.docket_numbers:
            cite.docket_number = site.docket_numbers[i]
        if site.neutral_citations:
            cite.neutral_cite = site.neutral_citations[i]
        if site.west_citations:
            cite.federal_cite_one = site.west_citations[i]
        if site.west_state_citations:
            cite.west_state_cite = site.west_state_citations[i]

        docket = Docket(
            case_name=site.case_names[i],
            court=court,
        )

        doc = Document(source='C',
                       sha1=sha1_hash,
                       date_filed=site.case_dates[i],
                       download_url=site.download_urls[i],
                       precedential_status=site.precedential_statuses[i])
        if site.judges:
            doc.judges = site.judges[i]
        if site.nature_of_suit:
            doc.nature_of_suit = site.nature_of_suit[i]

        return cite, docket, doc
Example #14
    def setUp(self):
        c1 = Citation(case_name=u"foo")
        c1.save(index=False)
        docket = Docket(
            case_name=u'foo',
            court=Court.objects.get(pk='test'),
        )
        docket.save()
        # Must be more than a year old for all tests to be runnable.
        last_month = now().date() - timedelta(days=400)
        self.doc = Document(
            citation=c1,
            docket=docket,
            date_filed=last_month
        )
        self.doc.save(index=False)

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        OralArgumentCommand().scrape_court(site, full_crawl=True)
Example #15
    def test_updating_the_docket_when_the_citation_case_name_changes(self):
        """Makes sure that the docket changes when the citation does."""
        court = Court.objects.get(pk='test')

        original_case_name = u'original case name'
        new_case_name = u'new case name'
        cite = Citation(case_name=original_case_name)
        cite.save(index=False)
        docket = Docket(
            case_name=original_case_name,
            court=court,
        )
        docket.save()
        Document(
            citation=cite,
            docket=docket,
        ).save(index=False)
        cite.case_name = new_case_name
        cite.save(index=False)
        changed_docket = Docket.objects.get(pk=docket.pk)
        self.assertEqual(changed_docket.case_name, new_case_name)
Example #16
    def setUp(self):
        self.court = Court.objects.get(pk='test')

        # create 3 documents with their citations and dockets
        c1, c2, c3 = Citation(case_name=u"c1"), Citation(
            case_name=u"c2"), Citation(case_name=u"c3")
        c1.save(index=False)
        c2.save(index=False)
        c3.save(index=False)
        docket1 = Docket(
            case_name=u"c1",
            court=self.court,
        )
        docket2 = Docket(
            case_name=u"c2",
            court=self.court,
        )
        docket3 = Docket(
            case_name=u"c3",
            court=self.court,
        )
        docket1.save()
        docket2.save()
        docket3.save()
        d1, d2, d3 = Document(date_filed=date.today()), Document(
            date_filed=date.today()), Document(date_filed=date.today())
        d1.citation, d2.citation, d3.citation = c1, c2, c3
        d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
        doc_list = [d1, d2, d3]
        for d in doc_list:
            d.citation.save(index=False)
            d.save(index=False)

        # create simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1;
        d1.cases_cited.add(d2.citation)
        d2.citation_count += 1
        d2.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d3.cases_cited.add(d1.citation)
        d1.citation_count += 1
        d1.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d1.save(index=False)
        d2.save(index=False)
        d3.save(index=False)
Example #17
    def test_content_extraction(self):
        """Do all of the supported mimetypes get extracted to text
        successfully, including OCR?"""
        site = test_opinion_scraper.Site().parse()

        test_strings = [
            'supreme', 'intelligence', 'indiana', 'reagan', 'indiana',
            'fidelity'
        ]
        for i in range(0, len(site.case_names)):
            path = os.path.join(settings.INSTALL_ROOT, 'alert',
                                site.download_urls[i])
            with open(path) as f:
                content = f.read()
                cf = ContentFile(content)
                extension = get_extension(content)
            cite = Citation()
            cite.save(index=False)
            docket = Docket(
                case_name=site.case_names[i],
                court=self.court,
            )
            docket.save()
            doc = Document(
                date_filed=site.case_dates[i],
                citation=cite,
                docket=docket,
            )
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            doc.local_path.save(file_name, cf, save=False)
            doc.save(index=False)
            doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
            if extension in ['.html', '.wpd']:
                self.assertIn(test_strings[i], doc.html.lower())
            else:
                self.assertIn(test_strings[i], doc.plain_text.lower())

            doc.delete()
Example #18
    def test_citation_matching(self):
        """Creates a few documents that contain specific citations, then
        attempts to find and match those citations.

        This becomes a bit of an integration test, which is likely fine.
        """
        # Set up a document
        c1 = models.Citation(federal_cite_one=u'1 Yeates 1 (test 1795)')
        c1.save(index=False)
        docket1 = Docket(
            case_name=u"Lissner v. Saad",
            court=self.court,
        )
        docket1.save()
        d1 = models.Document(
            date_filed=date(1795, 6, 9),
            citation=c1,
            docket=docket1,
            precedential_status='Published',
        )
        d1.save(index=True)
        # Reference d1 from the text of another document
        c2 = models.Citation()
        c2.save(index=False)
        docket2 = Docket(
            case_name=u"Reference to Lissner v. Saad",
            court=self.court,
        )
        docket2.save()
        d2 = models.Document(date_filed=date(1982, 6, 9),
                             docket=docket2,
                             citation=c2,
                             plain_text=u"1 Yeates 1")
        d2.save(index=True)

        # Do a commit, or else citations can't be found in the index.
        self.si_opinion.commit()
        update_document(d2)  # Updates d1's citation count in a Celery task
        d1 = models.Document.objects.get(pk=1)  # cache-bust d1

        self.assertEqual(
            d1.citation_count,
            1,
            msg=u"d1 was not updated by a citation found in d2. Count was: %s"
            % d1.citation_count)
        d1.delete()
        d2.delete()
Example #19
def import_law_box_case(case_path):
    """Open the file, get its contents, convert to XML and extract the meta data.

    Return a document object for saving in the database
    """
    raw_text = open(case_path).read()
    clean_html_tree, complete_html_tree, clean_html_str, body_text = get_html_from_raw_text(
        raw_text)

    sha1 = hashlib.sha1(clean_html_str).hexdigest()
    citations = get_citations_from_tree(complete_html_tree, case_path)
    judges = get_judge(clean_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path, judges)

    doc = Document(
        source='L',
        sha1=sha1,
        html=clean_html_str,  # we clear this field later, putting the value into html_lawbox.
        date_filed=get_date_filed(clean_html_tree,
                                  citations=citations,
                                  case_path=case_path,
                                  court=court),
        precedential_status=get_precedential_status(),
        judges=judges,
        download_url=case_path,
    )

    cite = Citation(docket_number=get_docket_number(
        clean_html_tree, case_path=case_path, court=court))

    docket = Docket(
        case_name=get_case_name(complete_html_tree, case_path),
        court=court,
    )

    # Necessary for dup_finder.
    path = '//p/text()'
    doc.body_text = ' '.join(clean_html_tree.xpath(path))

    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.iteritems():
        setattr(cite, k, v)

    doc.citation = cite
    doc.docket = docket

    return doc
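The body_text step above flattens the opinion into plain text by joining the text of every <p> node, which dup_finder needs. A small standalone illustration of that lxml pattern (the sample HTML here is invented):

from lxml import html

raw = "<html><body><p>First paragraph.</p><p>Second paragraph.</p></body></html>"
tree = html.fromstring(raw)
# Same XPath as import_law_box_case: grab the text of every <p> element.
body_text = ' '.join(tree.xpath('//p/text()'))
print(body_text)  # "First paragraph. Second paragraph."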
Example #20
    def test_citation_matching(self):
        """Creates a few documents that contain specific citations, then
        attempts to find and match those citations.

        This becomes a bit of an integration test, which is likely fine.
        """
        # Set up a document
        c1 = models.Citation(federal_cite_one=u'1 Yeates 1 (test 1795)')
        c1.save(index=False)
        docket1 = Docket(
            case_name=u"Lissner v. Saad",
            court=self.court,
        )
        docket1.save()
        d1 = models.Document(
            date_filed=date(1795, 6, 9),
            citation=c1,
            docket=docket1,
            precedential_status='Published',
        )
        d1.save(index=True)
        # Reference d1 from the text of another document
        c2 = models.Citation()
        c2.save(index=False)
        docket2 = Docket(
            case_name=u"Reference to Lissner v. Saad",
            court=self.court,
        )
        docket2.save()
        d2 = models.Document(
            date_filed=date(1982, 6, 9),
            docket=docket2,
            citation=c2,
            plain_text=u"1 Yeates 1"
        )
        d2.save(index=True)

        # Do a commit, or else citations can't be found in the index.
        self.si_opinion.commit()
        update_document(d2)  # Updates d1's citation count in a Celery task
        d1 = models.Document.objects.get(pk=1)  # cache-bust d1

        self.assertEqual(
            d1.citation_count,
            1,
            msg=u"d1 was not updated by a citation found in d2. Count was: %s"
                % d1.citation_count
        )
        d1.delete()
        d2.delete()
Example #21
    def associate_meta_data_to_objects(site, i, court, sha1_hash):
        audio_file = Audio(
            source='C',
            sha1=sha1_hash,
            case_name=site.case_names[i],
            date_argued=site.case_dates[i],
            download_url=site.download_urls[i],
            processing_complete=False,
        )
        if site.judges:
            audio_file.judges = site.judges[i]
        if site.docket_numbers:
            audio_file.docket_number = site.docket_numbers[i]

        docket = Docket(
            case_name=site.case_names[i],
            court=court,
        )

        return docket, audio_file
Example #22
    def test_pagerank_calculation(self):
        """Create a few Documents and fake citation relation among them, then
        run the pagerank algorithm. Check whether this simple case can get the
        correct result.
        """
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')

        # create 3 documents with their citations and dockets
        c1, c2, c3 = Citation(case_name=u"c1"), Citation(
            case_name=u"c2"), Citation(case_name=u"c3")
        c1.save(index=False)
        c2.save(index=False)
        c3.save(index=False)
        docket1 = Docket(
            case_name=u"c1",
            court=self.court,
        )
        docket2 = Docket(
            case_name=u"c2",
            court=self.court,
        )
        docket3 = Docket(
            case_name=u"c3",
            court=self.court,
        )
        d1, d2, d3 = Document(date_filed=date.today()), Document(
            date_filed=date.today()), Document(date_filed=date.today())
        d1.citation, d2.citation, d3.citation = c1, c2, c3
        d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
        doc_list = [d1, d2, d3]
        for d in doc_list:
            d.citation.save(index=False)
            d.save(index=False)

        # create simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1;
        d1.cases_cited.add(d2.citation)
        d2.citation_count += 1
        d2.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d3.cases_cited.add(d1.citation)
        d1.citation_count += 1
        d1.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d1.save(index=False)
        d2.save(index=False)
        d3.save(index=False)

        # calculate the pagerank of these 3 documents
        comm = Command()
        self.verbosity = 1
        comm.do_pagerank(chown=False)

        # read in the pagerank file, converting to a dict
        pr_values_from_file = {}
        with open(get_data_dir_location() + "external_pagerank") as f:
            for line in f:
                pk, value = line.split('=')
                pr_values_from_file[pk] = float(value.strip())

        # Verify whether the answers are correct, based on calculations in
        # Gephi
        answers = {
            '1': 0.387790,
            '2': 0.214811,
            '3': 0.397400,
        }
        for key, value in answers.iteritems():
            self.assertTrue(
                abs(pr_values_from_file[key] - value) < 0.0001,
                msg="The answer for item %s was %s when it should have been "
                "%s" % (
                    key,
                    pr_values_from_file[key],
                    answers[key],
                ))
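The expected values in the answers dict come from running PageRank over the citation graph built earlier in the test (1 cites 2 and 3, 2 cites 3, 3 cites 1). As a cross-check, a standard PageRank implementation such as networkx reproduces them; this sketch is not part of the test suite and assumes networkx is installed:

import networkx as nx

# Citing document -> cited document, mirroring the test's cases_cited calls.
graph = nx.DiGraph()
graph.add_edges_from([(1, 2), (1, 3), (2, 3), (3, 1)])

# The default damping factor of 0.85 matches the documented answers.
ranks = nx.pagerank(graph, alpha=0.85)
print(ranks)  # approximately {1: 0.38779, 2: 0.21481, 3: 0.39740}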
Example #23
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("Attempting to add item at: %s" % item['url'])
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except:
            logger.info("Unable to get item at: %s" % item['url'])
            queue.task_done()
            continue  # Without this, msg and r below would be undefined.

        if msg:
            logger.warn(msg)
            queue.task_done()
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("Item already exists, moving to next item.")
            queue.task_done()
        else:
            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name,
                                                         cf,
                                                         save=False)
            except:
                msg = 'Unable to save binary to disk. Deleted document: %s.\n%s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()
                continue

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async((audio_file.pk, ),
                                           countdown=random_delay)

            logger.info("Successfully added audio file %s: %s" %
                        (audio_file.pk, audio_file.case_name))
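download_and_save is written as a queue worker: its docstring notes that concurrency comes from how many threads run it. A minimal sketch of how such workers might be launched (the thread count and the helper name are assumptions, and the module-level queue object and download_and_save above are assumed to be in scope):

import threading

NUM_THREADS = 4  # assumed value; the original code does not specify it

def start_workers():
    for _ in range(NUM_THREADS):
        t = threading.Thread(target=download_and_save)
        t.daemon = True  # let the process exit once the queue is drained
        t.start()
    # Block until every queued item has been marked with task_done().
    queue.join()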
Example #24
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')

        # Set up testing cores in Solr and swap them in
        self.core_name_opinion = '%s.opinion-test-%s' % \
                                 (self.__module__, time.time())
        self.core_name_audio = '%s.audio-test-%s' % \
                               (self.__module__, time.time())
        create_solr_core(self.core_name_opinion)
        create_solr_core(
            self.core_name_audio,
            schema=os.path.join(settings.INSTALL_ROOT, 'Solr', 'conf',
                                'audio_schema.xml'),
            instance_dir='/usr/local/solr/example/solr/audio',
        )
        swap_solr_core('collection1', self.core_name_opinion)
        swap_solr_core('audio', self.core_name_audio)
        self.si_opinion = sunburnt.SolrInterface(
            settings.SOLR_OPINION_URL, mode='rw')
        self.si_audio = sunburnt.SolrInterface(
            settings.SOLR_AUDIO_URL, mode='rw')

        # Add three documents and three audio files to the index, but don't
        # extract their contents
        self.site_opinion = test_opinion_scraper.Site().parse()
        self.site_audio = test_oral_arg_scraper.Site().parse()
        cite_counts = (4, 6, 8)
        self.docs = {}
        for i in range(0, 3):
            cite = Citation(
                case_name=self.site_opinion.case_names[i],
                docket_number=self.site_opinion.docket_numbers[i],
                neutral_cite=self.site_opinion.neutral_citations[i],
                federal_cite_one=self.site_opinion.west_citations[i],
            )
            cite.save(index=False)
            docket = Docket(
                case_name=self.site_opinion.case_names[i],
                court=self.court,
            )
            docket.save()
            self.docs[i] = Document(
                date_filed=self.site_opinion.case_dates[i],
                citation=cite,
                docket=docket,
                precedential_status=self.site_opinion.precedential_statuses[i],
                citation_count=cite_counts[i],
                nature_of_suit=self.site_opinion.nature_of_suit[i],
                judges=self.site_opinion.judges[i],
            )
            self.docs[i].save()

        # Create citations between the documents
        # 0 ---cites--> 1, 2
        # 1 ---cites--> 2
        # 2 ---cites--> 0
        self.docs[0].cases_cited.add(self.docs[1].citation)
        self.docs[0].cases_cited.add(self.docs[2].citation)
        self.docs[1].cases_cited.add(self.docs[2].citation)
        self.docs[2].cases_cited.add(self.docs[0].citation)

        for doc in self.docs.itervalues():
            doc.save()

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        Command().scrape_court(site, full_crawl=True)

        self.expected_num_results_opinion = 3
        self.expected_num_results_audio = 2
        self.si_opinion.commit()
        self.si_audio.commit()
Example #25
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(
        vol_tree, case_location_relative)
    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
Example #26
    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')

        # Set up testing cores in Solr and swap them in
        self.core_name_opinion = '%s.opinion-test-%s' % \
                                 (self.__module__, time.time())
        self.core_name_audio = '%s.audio-test-%s' % \
                               (self.__module__, time.time())
        create_solr_core(self.core_name_opinion)
        create_solr_core(
            self.core_name_audio,
            schema=os.path.join(settings.INSTALL_ROOT, 'Solr', 'conf',
                                'audio_schema.xml'),
            instance_dir='/usr/local/solr/example/solr/audio',
        )
        swap_solr_core('collection1', self.core_name_opinion)
        swap_solr_core('audio', self.core_name_audio)
        self.si_opinion = sunburnt.SolrInterface(settings.SOLR_OPINION_URL,
                                                 mode='rw')
        self.si_audio = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL,
                                               mode='rw')

        # Add three documents and three audio files to the index, but don't
        # extract their contents
        self.site_opinion = test_opinion_scraper.Site().parse()
        self.site_audio = test_oral_arg_scraper.Site().parse()
        cite_counts = (4, 6, 8)
        self.docs = {}
        for i in range(0, 3):
            cite = Citation(
                case_name=self.site_opinion.case_names[i],
                docket_number=self.site_opinion.docket_numbers[i],
                neutral_cite=self.site_opinion.neutral_citations[i],
                federal_cite_one=self.site_opinion.west_citations[i],
            )
            cite.save(index=False)
            docket = Docket(
                case_name=self.site_opinion.case_names[i],
                court=self.court,
            )
            docket.save()
            self.docs[i] = Document(
                date_filed=self.site_opinion.case_dates[i],
                citation=cite,
                docket=docket,
                precedential_status=self.site_opinion.precedential_statuses[i],
                citation_count=cite_counts[i],
                nature_of_suit=self.site_opinion.nature_of_suit[i],
                judges=self.site_opinion.judges[i],
            )
            self.docs[i].save()

        # Create citations between the documents
        # 0 ---cites--> 1, 2
        # 1 ---cites--> 2
        # 2 ---cites--> 0
        self.docs[0].cases_cited.add(self.docs[1].citation)
        self.docs[0].cases_cited.add(self.docs[2].citation)
        self.docs[1].cases_cited.add(self.docs[2].citation)
        self.docs[2].cases_cited.add(self.docs[0].citation)

        for doc in self.docs.itervalues():
            doc.save()

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        Command().scrape_court(site, full_crawl=True)

        self.expected_num_results_opinion = 3
        self.expected_num_results_audio = 2
        self.si_opinion.commit()
        self.si_audio.commit()
Example #27
    def setUp(self):
        self.court = Court.objects.get(pk='test')

        # create 3 documents with their citations and dockets
        c1, c2, c3 = Citation(case_name=u"c1"), Citation(
            case_name=u"c2"), Citation(case_name=u"c3")
        c1.save(index=False)
        c2.save(index=False)
        c3.save(index=False)
        docket1 = Docket(
            case_name=u"c1",
            court=self.court,
        )
        docket2 = Docket(
            case_name=u"c2",
            court=self.court,
        )
        docket3 = Docket(
            case_name=u"c3",
            court=self.court,
        )
        docket1.save()
        docket2.save()
        docket3.save()
        d1, d2, d3 = Document(date_filed=date.today()), Document(
            date_filed=date.today()), Document(date_filed=date.today())
        d1.citation, d2.citation, d3.citation = c1, c2, c3
        d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
        doc_list = [d1, d2, d3]
        for d in doc_list:
            d.citation.save(index=False)
            d.save(index=False)

        # create simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1;
        d1.cases_cited.add(d2.citation)
        d2.citation_count += 1
        d2.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d3.cases_cited.add(d1.citation)
        d1.citation_count += 1
        d1.cases_cited.add(d3.citation)
        d3.citation_count += 1
        d1.save(index=False)
        d2.save(index=False)
        d3.save(index=False)
Example #28
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(vol_tree,
                                                 case_location_relative)
    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
Example #29
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found, it's
    downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("Attempting to add item at: %s" % item['url'])
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except:
            logger.info("Unable to get item at: %s" % item['url'])
            queue.task_done()
            continue  # Without this, msg and r below would be undefined.

        if msg:
            logger.warn(msg)
            queue.task_done()
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("Item already exists, moving to next item.")
            queue.task_done()
        else:
            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])

            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )
            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name, cf,
                                                         save=False)
            except:
                msg = 'Unable to save binary to disk. Deleted document: %s.\n%s' % \
                      (item['case_name'], traceback.format_exc())
                logger.critical(msg)
                queue.task_done()
                continue

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)

            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async(
                (audio_file.pk,),
                countdown=random_delay
            )

            logger.info("Successfully added audio file %s: %s" % (
                audio_file.pk, audio_file.case_name))