def test_pagerank_calculation(self):
    """Create a few Documents and fake citation relation among them, then
    run the pagerank algorithm. Check whether this simple case can get the
    correct result.
    """
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')

    # Create 3 documents with their citations.
    c1, c2, c3 = Citation(case_name=u"c1"), Citation(case_name=u"c2"), \
        Citation(case_name=u"c3")
    c1.save(index=False)
    c2.save(index=False)
    c3.save(index=False)
    d1, d2, d3 = Document(date_filed=date.today()), \
        Document(date_filed=date.today()), Document(date_filed=date.today())
    d1.citation, d2.citation, d3.citation = c1, c2, c3
    doc_list = [d1, d2, d3]
    for d in doc_list:
        d.court = self.court
        d.citation.save(index=False)
        d.save(index=False)

    # Create simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1.
    d1.cases_cited.add(d2.citation)
    d2.citation_count += 1
    d2.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d3.cases_cited.add(d1.citation)
    d1.citation_count += 1
    d1.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d1.save(index=False)
    d2.save(index=False)
    d3.save(index=False)

    # Calculate the pagerank of these 3 documents.
    comm = Command()
    # NOTE(review): this sets verbosity on the TestCase, not on `comm` —
    # presumably it was meant to configure the command; confirm.
    self.verbosity = 1
    comm.do_pagerank(chown=False)

    # Read in the pagerank file, converting to a dict.
    pr_values_from_file = {}
    with open(get_data_dir_location() + "external_pagerank") as f:
        for line in f:
            pk, value = line.split('=')
            pr_values_from_file[pk] = float(value.strip())

    # Verify that the answers are correct, based on calculations in Gephi.
    answers = {
        '1': 0.387790,
        '2': 0.214811,
        '3': 0.397400,
    }
    for key, value in answers.iteritems():
        # Bug fix: take abs() of the *difference* (the old code applied abs()
        # to the file value alone, so any value far below the expected answer
        # passed), and report the actual/expected values for the item under
        # test instead of hard-coded item '1'.
        self.assertTrue(
            abs(pr_values_from_file[key] - value) < 0.0001,
            msg="The answer for item %s was %s when it should have been %s"
                % (key, pr_values_from_file[key], answers[key])
        )
def import_law_box_case(case_path):
    """Open the file, get its contents, convert to XML and extract the meta
    data.

    Return a document object for saving in the database.
    """
    raw = open(case_path).read()
    (clean_tree, complete_tree,
     clean_str, _body) = get_html_from_raw_text(raw)

    checksum = hashlib.sha1(clean_str).hexdigest()
    cites = get_citations_from_tree(complete_tree, case_path)
    judge_str = get_judge(clean_tree, case_path)
    court = get_court_object(clean_tree, cites, case_path, judge_str)

    doc = Document(
        source='L',
        sha1=checksum,
        # We clear this field later, putting the value into html_lawbox.
        html=clean_str,
        date_filed=get_date_filed(clean_tree, citations=cites,
                                  case_path=case_path, court=court),
        precedential_status=get_precedential_status(),
        judges=judge_str,
        download_url=case_path,
    )
    cite = Citation(docket_number=get_docket_number(
        clean_tree, case_path=case_path, court=court))
    docket = Docket(
        case_name=get_case_name(complete_tree, case_path),
        court=court,
    )

    # The flattened paragraph text is necessary for dup_finder.
    doc.body_text = ' '.join(clean_tree.xpath('//p/text()'))

    # Copy each mapped citation field onto the Citation instance.
    for field, value in map_citations_to_models(cites).iteritems():
        setattr(cite, field, value)

    doc.citation = cite
    doc.docket = docket
    return doc
def import_mayer(case_path):
    """Open the file, get its contents, convert to XML and extract the meta
    data.

    Return a document object for saving in the database.
    """
    # Bug fix: the original body had the lines computing clean_html_tree,
    # complete_html_tree and clean_html_str commented out (replaced with an
    # unused `tree = html.parse(case_path)`), so every reference below raised
    # NameError. Restore the same pipeline used by import_law_box_case.
    raw_text = open(case_path).read()
    clean_html_tree, complete_html_tree, clean_html_str, body_text = \
        get_html_from_raw_text(raw_text)

    sha1 = hashlib.sha1(clean_html_str).hexdigest()
    citations = get_citations_from_tree(complete_html_tree, case_path)
    judges = get_judge(clean_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path, judges)

    doc = Document(
        # NOTE(review): 'L' matches the lawbox importer — confirm mayer
        # shouldn't use its own source code.
        source='L',
        sha1=sha1,
        # We clear this field later, putting the value into html_lawbox.
        html=clean_html_str,
        date_filed=get_date_filed(clean_html_tree, citations=citations,
                                  case_path=case_path, court=court),
        precedential_status=get_precedential_status(),
        judges=judges,
        download_url=case_path,
    )
    cite = Citation()
    docket = Docket(
        docket_number=get_docket_number(
            clean_html_tree,
            case_path=case_path,
            court=court
        ),
        case_name=get_case_name(complete_html_tree, case_path),
        court=court,
    )

    # Necessary for dup_finder.
    path = '//p/text()'
    doc.body_text = ' '.join(clean_html_tree.xpath(path))

    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.iteritems():
        setattr(cite, k, v)

    doc.citation = cite
    doc.docket = docket
    return doc
def setUp(self):
    """Build 3 documents (with citations and dockets) plus a small fake
    citation graph: 1 cites 2 and 3; 2 cites 3; 3 cites 1.
    """
    self.court = Court.objects.get(pk='test')
    case_names = (u"c1", u"c2", u"c3")

    # One Citation per case name, saved without touching the Solr index.
    cites = []
    for name in case_names:
        cite = Citation(case_name=name)
        cite.save(index=False)
        cites.append(cite)

    # One Docket per case name, all on the test court.
    dockets = []
    for name in case_names:
        docket = Docket(case_name=name, court=self.court)
        docket.save()
        dockets.append(docket)

    # One Document per (citation, docket) pair.
    docs = []
    for cite, docket in zip(cites, dockets):
        doc = Document(date_filed=date.today())
        doc.citation = cite
        doc.docket = docket
        doc.citation.save(index=False)
        doc.save(index=False)
        docs.append(doc)
    d1, d2, d3 = docs

    # Simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1.
    d1.cases_cited.add(d2.citation)
    d2.citation_count += 1
    d2.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d3.cases_cited.add(d1.citation)
    d1.citation_count += 1
    d1.cases_cited.add(d3.citation)
    d3.citation_count += 1
    for doc in docs:
        doc.save(index=False)
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        # Local absolute paths get wrapped in a faux response object so the
        # local and remote cases share a single code path below.
        if location.startswith('/'):
            r = requests.Session()
            with open(location) as f:
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # The opinion page and the volume index page that links to it each carry
    # useful metadata.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    relative_path = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(vol_tree, relative_path)
    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, relative_path),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, relative_path),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph.
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)
    return doc
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        """Return (parsed tree, cleaned body text) for a local or remote file."""
        if not location.startswith('/'):
            r = requests.get(location)
        else:
            # Fake a response object for local files so both branches return
            # something with a .content attribute.
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page that
    # links to it. Each has useful data.
    opinion_tree, opinion_text = get_file(case_location)
    vol_tree, vol_text = get_file(
        case_location.rsplit('/', 1)[-2] + '/index.html')

    anon_html, blocked = anonymize(get_case_body(opinion_tree))

    tail = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(vol_tree, tail)

    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, tail),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(opinion_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, tail),
        source='R',
        sha1=hashlib.sha1(opinion_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=anon_html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)
    return doc
def scrape_court(site, full_crawl=False):
    """Walk a juriscraper ``site`` object and persist each new opinion.

    For every case on the site: download the binary, hash it, and ask the
    DupChecker whether to skip it (CONTINUE), stop entirely (BREAK) or save
    it (CARRY_ON). New documents get a Citation, a Document, the downloaded
    file, and an async content-extraction task. The site-level hash is only
    updated at the end if no download errors occurred and this was not a
    full crawl.
    """
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(site.court_id, full_crawl=full_crawl)
    # If the whole site's hash is unchanged, nothing new is here; skip it all.
    abort = dup_checker.abort_by_hash(site.hash)
    if not abort:
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(site.download_urls[i],
                                        site._get_cookies())
            clean_content = site._cleanup_content(r.content)
            if msg:
                # Download failed; log it and move on to the next case.
                logger.warn(msg)
                ErrorLog(log_level='WARNING', court=court,
                         message=msg).save()
                continue

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                # Last item on the site: there is no next date.
                next_date = None

            # Make a hash of the data. Need to convert unicode to binary
            # before hashing.
            if type(clean_content) == unicode:
                hash_content = clean_content.encode('utf-8')
            else:
                hash_content = clean_content
            sha1_hash = hashlib.sha1(hash_content).hexdigest()
            if (court_str == 'nev' and
                    site.precedential_statuses[i] == 'Unpublished'):
                # Nevada's non-precedential cases have different SHA1 sums
                # every time, so dedupe them by URL instead of by content.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by='download_url'
                )
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()

                # Make a citation. Optional fields are only set when the
                # scraper provides the corresponding list.
                cite = Citation(case_name=site.case_names[i])
                if site.docket_numbers:
                    cite.docket_number = site.docket_numbers[i]
                if site.neutral_citations:
                    cite.neutral_cite = site.neutral_citations[i]
                if site.west_citations:
                    cite.federal_cite_one = site.west_citations[i]
                if site.west_state_citations:
                    cite.west_state_cite = site.west_state_citations[i]

                # Make the document object
                doc = Document(source='C',
                               sha1=sha1_hash,
                               date_filed=site.case_dates[i],
                               court=court,
                               download_url=site.download_urls[i],
                               precedential_status=site.precedential_statuses[i])

                # Make and associate the file object
                try:
                    cf = ContentFile(clean_content)
                    extension = get_extension(r.content)
                    # See issue #215 for why this must be lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + extension
                    doc.local_path.save(file_name, cf, save=False)
                except:
                    # NOTE(review): bare except is deliberate best-effort
                    # here — any disk failure skips this document.
                    msg = 'Unable to save binary to disk. Deleted document: % s.\n % s' % \
                          (cite.case_name, traceback.format_exc())
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                    download_error = True
                    continue

                if site.judges:
                    doc.judges = site.judges[i]
                if site.nature_of_suit:
                    doc.nature_of_suit = site.nature_of_suit[i]

                # Save everything, but don't update Solr index yet
                cite.save(index=False)
                doc.citation = cite
                doc.save(index=False)

                # Extract the contents asynchronously.
                extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))

                logger.info("Successfully added doc %s: %s" %
                            (doc.pk, site.case_names[i]))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
def test_pagerank_calculation(self):
    """Create a few Documents and fake citation relation among them, then
    run the pagerank algorithm. Check whether this simple case can get the
    correct result.
    """
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')

    # Create 3 documents with their citations and dockets.
    c1, c2, c3 = Citation(case_name=u"c1"), Citation(
        case_name=u"c2"), Citation(case_name=u"c3")
    c1.save(index=False)
    c2.save(index=False)
    c3.save(index=False)
    docket1 = Docket(
        case_name=u"c1",
        court=self.court,
    )
    docket2 = Docket(
        case_name=u"c2",
        court=self.court,
    )
    docket3 = Docket(
        case_name=u"c3",
        court=self.court,
    )
    # Bug fix: the dockets were never saved before being attached to the
    # documents (the sibling setUp does save them) — save them so the FK
    # targets exist when the documents are saved.
    docket1.save()
    docket2.save()
    docket3.save()
    d1, d2, d3 = Document(date_filed=date.today()), Document(
        date_filed=date.today()), Document(date_filed=date.today())
    d1.citation, d2.citation, d3.citation = c1, c2, c3
    d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
    doc_list = [d1, d2, d3]
    for d in doc_list:
        d.citation.save(index=False)
        d.save(index=False)

    # Create simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1.
    d1.cases_cited.add(d2.citation)
    d2.citation_count += 1
    d2.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d3.cases_cited.add(d1.citation)
    d1.citation_count += 1
    d1.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d1.save(index=False)
    d2.save(index=False)
    d3.save(index=False)

    # Calculate the pagerank of these 3 documents.
    comm = Command()
    # NOTE(review): this sets verbosity on the TestCase, not on `comm` —
    # presumably it was meant to configure the command; confirm.
    self.verbosity = 1
    comm.do_pagerank(chown=False)

    # Read in the pagerank file, converting to a dict.
    pr_values_from_file = {}
    with open(get_data_dir_location() + "external_pagerank") as f:
        for line in f:
            pk, value = line.split('=')
            pr_values_from_file[pk] = float(value.strip())

    # Verify that the answers are correct, based on calculations in Gephi.
    answers = {
        '1': 0.387790,
        '2': 0.214811,
        '3': 0.397400,
    }
    for key, value in answers.iteritems():
        # Bug fix: take abs() of the *difference*. The old code applied
        # abs() to the file value alone, so any value far below the expected
        # answer still satisfied the comparison.
        self.assertTrue(
            abs(pr_values_from_file[key] - value) < 0.0001,
            msg="The answer for item %s was %s when it should have been "
                "%s" % (
                    key,
                    pr_values_from_file[key],
                    answers[key],
                ))