def get_human_review(self, clusters, d):
    """Ask a human which candidate cluster matches an SCDB record.

    Logs each candidate cluster (with its case-name similarity to the
    SCDB row) followed by the SCDB record itself, then prompts for an
    index.

    :param clusters: Sequence of candidate cluster objects.
    :param d: Dict for one SCDB row (reads keys: caseName, docket,
        dateDecision).
    :return: The chosen cluster; the first cluster when
        ``self.skip_human_review`` is set; or ``None`` when the reply is
        not a usable index.
    """
    for i, cluster in enumerate(clusters):
        logger.info("%s: Cluster %s (%0.3f sim):" % (
            i,
            cluster.pk,
            gen_diff_ratio(cluster.case_name.lower(),
                           d["caseName"].lower()),
        ))
        logger.info("https://www.courtlistener.com%s" %
                    cluster.get_absolute_url())
        # Log the str directly: .encode() would make Python 3 log a
        # bytes repr like b'...'.
        logger.info("%s" % cluster.case_name)
        if cluster.docket.docket_number:
            logger.info(cluster.docket.docket_number)
        logger.info(cluster.date_filed)
    logger.info("SCDB info:")
    logger.info(d["caseName"])
    if d["docket"]:
        logger.info(d["docket"])
    logger.info(d["dateDecision"])

    if self.skip_human_review:
        logger.info(
            "Skipping human review and just returning the first item.")
        self.skipped_count += 1
        return clusters[0]

    choice = input("Which item should we update? [0-%s] " %
                   (len(clusters) - 1))
    try:
        # Catch IndexError too: the user may type a number outside the
        # valid range, which previously crashed with an unhandled error.
        cluster = clusters[int(choice)]
    except (ValueError, IndexError):
        cluster = None
    return cluster
def get_human_review(self, clusters, d):
    """Show every candidate cluster next to the SCDB record, then let a
    human pick which cluster (if any) should be updated.
    """
    for position, candidate in enumerate(clusters):
        similarity = gen_diff_ratio(candidate.case_name.lower(),
                                    d['caseName'].lower())
        logger.info('%s: Cluster %s (%0.3f sim):' % (
            position, candidate.pk, similarity,
        ))
        url = 'https://www.courtlistener.com%s' % candidate.get_absolute_url()
        logger.info(url)
        logger.info(' %s' % candidate.case_name.encode('utf-8'))
        docket_number = candidate.docket.docket_number
        if docket_number:
            logger.info(docket_number.encode('utf-8'))
        logger.info(candidate.date_filed)

    logger.info('SCDB info:')
    logger.info(d['caseName'])
    if d['docket']:
        logger.info(d['docket'])
    logger.info(d['dateDecision'])

    # Non-interactive mode: take the first candidate without asking.
    if self.skip_human_review:
        logger.info('Skipping human review and just returning the first '
                    'item.')
        self.skipped_count += 1
        return clusters[0]

    prompt = ' Which item should we update? [0-%s] ' % (len(clusters) - 1)
    answer = raw_input(prompt)
    try:
        return clusters[int(answer)]
    except ValueError:
        return None
def get_human_review(self, clusters, d): for i, cluster in enumerate(clusters): print ' %s: Cluster %s (%0.3f sim):' % ( i, cluster.pk, gen_diff_ratio( cluster.case_name.lower(), d['caseName'].lower() ), ) print ' https://www.courtlistener.com%s' % cluster.get_absolute_url() print ' %s' % cluster.case_name.encode('utf-8') if cluster.docket.docket_number: print ' %s' % cluster.docket.docket_number.encode('utf-8') print ' %s' % cluster.date_filed print ' SCDB info:' print ' %s' % d['caseName'] if d['docket']: print ' %s' % d['docket'] print ' %s' % d['dateDecision'] if self.skip_human_review: print(' Skipping human review and just returning the first item.') self.skipped_count += 1 return clusters[0] else: choice = raw_input(' Which item should we update? [0-%s] ' % (len(clusters) - 1)) try: choice = int(choice) cluster = clusters[choice] except ValueError: cluster = None return cluster
def get_dup_stats(doc): """The heart of the duplicate algorithm. Returns stats about the case as compared to other cases already in the system. Other methods can call this one, and can make decisions based on the stats generated here. If no likely duplicates are encountered, stats are returned as zeroes. Process: 1. Refine the possible result set down to just a few candidates. 2. Determine their likelihood of being duplicates according to a number of measures: - Similarity of case name - Similarity of docket number - Comparison of content length """ conn = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='r') DEBUG = True ########################################## # 1: Refine by date, court and case name # ########################################## main_params = make_case_name_solr_query( doc.case_name, doc.docket.court_id, doc.date_filed, DEBUG=DEBUG, ) main_params['caller'] = 'corpus_importer' if DEBUG: print " - main_params are: %s" % main_params candidates = conn.raw_query(**main_params).execute() if not len(candidates) and doc.docket.docket_number is not None: # Try by docket number rather than case name clean_docket_number_words = [] for word in doc.docket.docket_number.split(): if not re.search('\d', word): # Must have numbers. continue word = word.strip(string.punctuation) regex = re.compile('[%s]' % re.escape(string.punctuation)) if regex.search(re.sub('-', '', word)): # Can only have hyphens after stripping continue clean_docket_number_words.append(word) docket_q = ' OR '.join(clean_docket_number_words) if docket_q: main_params = { 'fq': [ 'court_exact:%s' % doc.docket.court_id, 'dateFiled:%s' % build_date_range(doc.date_filed, range=15), 'docketNumber:(%s)' % docket_q ], 'rows': 100, 'caller': 'corpus_importer', } if DEBUG: print " - main_params are: %s" % main_params candidates = conn.raw_query(**main_params).execute() if not len(candidates) and doc.docket.court_id == 'scotus': if doc.federal_cite_one: # Scotus case, try by citation. 
main_params = { 'fq': [ 'court_exact:%s' % doc.docket.court_id, 'dateFiled:%s' % build_date_range(doc.date_filed, range=90), # Creates ~6 month span. 'citation:(%s)' % ' '.join([re.sub(r"\D", '', w) for w in doc.federal_cite_one.split()]) ], 'rows': 100, 'caller': 'corpus_importer', } if DEBUG: print " - main_params are: %s" % main_params candidates = conn.raw_query(**main_params).execute() stats = {'candidate_count': len(candidates)} if not len(candidates): return stats, candidates ######################################### # 2: Attempt filtering by docket number # ######################################### # Two-step process. First we see if we have any exact hits. # Second, if there were exact hits, we forward those onwards. If not, we # forward everything. remaining_candidates = [] if doc.docket.docket_number: new_docket_number = re.sub("(\D|0)", "", doc.docket.docket_number) for candidate in candidates: if candidate.get('docketNumber'): # Get rid of anything in the docket numbers that's not a digit result_docket_number = re.sub("(\D|0)", "", candidate['docketNumber']) # Get rid of zeroes too. if new_docket_number == result_docket_number: remaining_candidates.append(candidate) if len(remaining_candidates) > 0: # We had one or more exact hits! Use those. candidates = remaining_candidates else: # We just let candidates from step one get passed through by doing nothing. 
pass stats = {'candidate_count': len(candidates)} ############################## # 3: Find the best case name # ############################## confidences = find_confidences(candidates, doc.case_name) stats['case_name_similarities'] = confidences ##################################################################### # 4: Check content length, gestalt difference and cosine similarity # ##################################################################### percent_diffs, gestalt_diffs, cos_sims = [], [], [] new_stripped_content = re.sub('\W', '', doc.body_text).lower() for candidate in candidates: candidate_stripped_content = re.sub('\W', '', candidate['text']).lower() # Calculate the difference in text length and their gestalt difference try: length_diff = abs(len(candidate_stripped_content) - len(new_stripped_content)) except ZeroDivisionError: length_diff = 0 try: percent_diff = float(length_diff) / len(new_stripped_content) except ZeroDivisionError: percent_diff = 0 cos_sim = get_cosine_similarity(doc.body_text, candidate['text']) percent_diffs.append(percent_diff) gestalt_diffs.append(gen_diff_ratio(candidate_stripped_content, new_stripped_content)) cos_sims.append(cos_sim) stats['length_diffs'] = percent_diffs stats['gestalt_diffs'] = gestalt_diffs stats['cos_sims'] = cos_sims return stats, candidates