def process_citations(data, debug):
    """Add each citation described by ``data``, then refresh Solr.

    :param data: Object whose ``iterrows()`` yields ``(label, row)`` pairs.
        Each row is indexed by ``'citing'`` and ``'cited'`` to get the IDs of
        the citing and cited opinions.
    :param debug: When truthy, new citations are built but not saved, and the
        Solr update is skipped. Progress output is printed either way.
    :return: None
    """
    updated_ids = set()
    for _, row in data.iterrows():
        citing_id, cited_id = row['citing'], row['cited']
        print("\nAdding citation from %s to %s" % (citing_id, cited_id))

        try:
            cite = OpinionsCited.objects.get(
                citing_opinion_id=citing_id,
                cited_opinion_id=cited_id,
            )
        except OpinionsCited.DoesNotExist:
            # Not in the database yet: build it and, unless debugging, save it
            # and remember the citing opinion for the Solr update below.
            cite = OpinionsCited(
                citing_opinion_id=citing_id,
                cited_opinion_id=cited_id,
            )
            msg = "Created new citation:\n"
            if not debug:
                cite.save()
                updated_ids.add(cite.citing_opinion.pk)
        else:
            msg = "Citation already exists. Doing nothing:\n"

        # Building the report dereferences both related opinions, which is
        # where a missing Opinion surfaces.
        try:
            report = (
                " %s"
                " %s: %s\n"
                " From: %s\n"
                " To: %s\n"
                % (msg, cite.pk, cite, cite.citing_opinion, cite.cited_opinion)
            )
            print(report)
        except Opinion.DoesNotExist:
            print(" Unable to create citation. Underlying Opinion doesn't "
                  "exist.")

    print("\nUpdating Solr...")
    if not debug:
        add_or_update_opinions(updated_ids)
    print("Done.")
def find_citations_for_opinion_by_pks(self, opinion_pks, index=True):
    """Find citations for search.Opinion objects.

    For every opinion selected by ``opinion_pks`` this extracts its
    citations, matches them to other opinions, bumps the citation count of
    each newly cited opinion's cluster, and (when any citations were found)
    regenerates ``html_with_citations`` and replaces the opinion's
    ``OpinionsCited`` rows. The opinion itself is always saved.

    :param self: Object providing ``retry(exc=..., countdown=...)``; its
        result is raised when the matcher hits ``ResponseNotReady``.
    :param opinion_pks: An iterable of search.Opinion PKs
    :param index: Whether to add the item to Solr; forwarded to the
        ``save(index=...)`` calls on the clusters and on the opinion.
    :return: None
    """
    opinions = Opinion.objects.filter(pk__in=opinion_pks)
    for opinion in opinions:
        # Returns a list of Citation objects, i.e., something like
        # [FullCitation, FullCitation, ShortformCitation, FullCitation,
        #   SupraCitation, SupraCitation, ShortformCitation, FullCitation]
        citations = get_document_citations(opinion)

        # Match all those different Citation objects to Opinion objects, using
        # a variety of heuristics.
        try:
            citation_matches = match_citations.get_citation_matches(
                opinion, citations
            )
        except ResponseNotReady as e:
            # Threading problem in httplib, which is used in the Solr query.
            raise self.retry(exc=e, countdown=2)

        # Consolidate duplicate matches, keeping a counter of how often each
        # match appears (so we know how many times an opinion cites another).
        # keys = cited opinion
        # values = number of times that opinion is cited
        grouped_matches = Counter(citation_matches)

        # Fetch the PKs of the opinions this one already cites a single time,
        # instead of re-running the query for every matched opinion. Nothing
        # in the loop below changes that relation.
        already_cited_pks = set(
            opinion.opinions_cited.all().values_list("pk", flat=True)
        )
        for matched_opinion in grouped_matches:
            # Increase citation count for matched cluster if it hasn't
            # already been cited by this opinion.
            if matched_opinion.pk in already_cited_pks:
                continue
            cluster = matched_opinion.cluster
            cluster.citation_count += 1
            cluster.save(index=index)

        # Only update things if we found citations
        if citations:
            opinion.html_with_citations = create_cited_html(opinion, citations)

            # Nuke existing citations
            OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()

            # Create the new ones; depth records how many times the opinion
            # cites each matched opinion.
            OpinionsCited.objects.bulk_create(
                [
                    OpinionsCited(
                        citing_opinion_id=opinion.pk,
                        cited_opinion_id=matched_opinion.pk,
                        depth=depth,
                    )
                    for matched_opinion, depth in grouped_matches.items()
                ]
            )

        # Update Solr if requested. In some cases we do it at the end for
        # performance reasons.
        opinion.save(index=index)
def update_document(opinion, index=True):
    """Get the citations for an item and save it and add it to the index if
    requested.

    Each citation that resolves to exactly one match is linked to the
    matching Opinion. The matched opinion's cluster gets its
    ``citation_count`` incremented at most once per call, and only if this
    opinion was not already citing it. When any citations were found, the
    opinion's ``html_with_citations`` is regenerated and its ``OpinionsCited``
    rows are replaced. The opinion is always saved.

    :param opinion: The citing opinion to process.
    :param index: Forwarded to the ``save(index=...)`` calls on matched
        clusters and on the opinion itself.
    :return: None
    """
    citations = get_document_citations(opinion)

    # PKs this opinion already cited before this run. Fetched once, because
    # nothing in the loop below changes the relation.
    previously_cited = set(
        opinion.opinions_cited.all().values_list("pk", flat=True)
    )

    # Set used so we can do one simple update to the citing opinion.
    opinions_cited = set()
    for citation in citations:
        matches = match_citations.match_citation(citation, citing_doc=opinion)

        # TODO: Figure out what to do if there's more than one
        if len(matches) != 1:
            # No (unique) match found for citation; nothing to record.
            continue

        match_id = matches[0]['id']
        try:
            matched_opinion = Opinion.objects.get(pk=match_id)

            # Increase citation count for matched cluster if it hasn't
            # already been cited by this opinion. Checking opinions_cited as
            # well stops a repeated citation within this document from
            # bumping the count more than once.
            matched_pk = matched_opinion.pk
            if (matched_pk not in previously_cited
                    and matched_pk not in opinions_cited):
                cluster = matched_opinion.cluster
                cluster.citation_count += 1
                cluster.save(index=index)

            # Add citation match to the citing opinion's list of cases it
            # cites. opinions_cited is a set so duplicates aren't an issue
            opinions_cited.add(matched_pk)

            # URL field will be used for generating inline citation html
            citation.match_url = matched_opinion.cluster.get_absolute_url()
            citation.match_id = matched_pk
        except (Opinion.DoesNotExist, Opinion.MultipleObjectsReturned):
            # No Opinion, or several Opinions, returned. Press on.
            continue

    # Only update things if we found citations
    if citations:
        opinion.html_with_citations = create_cited_html(opinion, citations)

        # Nuke existing citations
        OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()

        # Create the new ones.
        OpinionsCited.objects.bulk_create([
            OpinionsCited(citing_opinion_id=opinion.pk, cited_opinion_id=pk)
            for pk in opinions_cited
        ])

    # Update Solr if requested. In some cases we do it at the end for
    # performance reasons.
    opinion.save(index=index)
def process_citations(data, debug):
    """Walk through the citations and add them one at a time.

    :param data: Object whose ``iterrows()`` yields ``(label, row)`` pairs.
        Each row is indexed by ``"citing"`` and ``"cited"`` to get the IDs of
        the citing and cited opinions.
    :param debug: When truthy, new citations are built but not saved, and the
        Solr update is skipped. Progress is logged either way.
    :return: None
    """
    updated_ids = set()
    for _, item in data.iterrows():
        citing_id, cited_id = item["citing"], item["cited"]
        # Pass values as logger arguments so interpolation is left to the
        # logging framework rather than done eagerly here.
        logger.info("\nAdding citation from %s to %s", citing_id, cited_id)
        try:
            cite = OpinionsCited.objects.get(
                citing_opinion_id=citing_id,
                cited_opinion_id=cited_id,
            )
            msg = "Citation already exists. Doing nothing:\n"
        except OpinionsCited.DoesNotExist:
            cite = OpinionsCited(
                citing_opinion_id=citing_id,
                cited_opinion_id=cited_id,
            )
            msg = "Created new citation:\n"
            if not debug:
                cite.save()
                updated_ids.add(cite.citing_opinion.pk)

        try:
            # The related opinions are dereferenced while the argument list
            # is built, so a missing Opinion still raises inside this try.
            logger.info(
                " %s"
                " %s: %s\n"
                " From: %s\n"
                " To: %s\n",
                msg,
                cite.pk,
                cite,
                cite.citing_opinion,
                cite.cited_opinion,
            )
        except Opinion.DoesNotExist:
            logger.warning(
                " Unable to create citation. Underlying Opinion doesn't "
                "exist.")

    logger.info("\nUpdating Solr...")
    if not debug:
        add_items_to_solr(updated_ids, "search.Opinion")
    logger.info("Done.")
def process_citations(data, debug):
    """Walk the rows of ``data`` and create whichever citations are missing.

    :param data: Object whose ``iterrows()`` yields ``(label, row)`` pairs.
        ``row['citing']`` and ``row['cited']`` give the opinion IDs.
    :param debug: When truthy, nothing is saved and Solr is not updated;
        messages are still printed.
    :return: None
    """
    updated_ids = set()

    for _, entry in data.iterrows():
        lookup = {
            'citing_opinion_id': entry['citing'],
            'cited_opinion_id': entry['cited'],
        }
        print("\nAdding citation from %s to %s"
              % (lookup['citing_opinion_id'], lookup['cited_opinion_id']))

        try:
            cite = OpinionsCited.objects.get(**lookup)
        except OpinionsCited.DoesNotExist:
            # Unknown citation: instantiate it, and save it unless debugging.
            cite = OpinionsCited(**lookup)
            msg = "Created new citation:\n"
            if not debug:
                cite.save()
                updated_ids.add(cite.citing_opinion.pk)
        else:
            msg = "Citation already exists. Doing nothing:\n"

        # Formatting touches both related opinions; a missing one raises
        # Opinion.DoesNotExist, which is reported instead of propagated.
        try:
            summary = (
                " %s"
                " %s: %s\n"
                " From: %s\n"
                " To: %s\n"
            ) % (msg, cite.pk, cite, cite.citing_opinion, cite.cited_opinion)
            print(summary)
        except Opinion.DoesNotExist:
            print(" Unable to create citation. Underlying Opinion doesn't "
                  "exist.")

    print("\nUpdating Solr...")
    if not debug:
        add_or_update_opinions(updated_ids)
    print("Done.")
def find_citations_for_opinion_by_pks(self, opinion_pks, index=True):
    """Find citations for search.Opinion objects.

    Opinions with no citations are skipped entirely. For the others, the
    citation counts of newly cited clusters are incremented in one UPDATE,
    the citing opinion's HTML is regenerated, its ``OpinionsCited`` rows are
    replaced, and the opinion is saved.

    :param self: Object providing ``retry(exc=..., countdown=...)``; its
        result is raised when the matcher hits ``ResponseNotReady``.
    :param opinion_pks: An iterable of search.Opinion PKs
    :param index: Whether to add the item to Solr
    :return: None
    """
    for opinion in Opinion.objects.filter(pk__in=opinion_pks):
        # A list of Citation objects of mixed kinds (full, short form,
        # supra, ...).
        citations = get_document_citations(opinion)
        if not citations:
            # Nothing cited, nothing to do for this opinion.
            continue

        # Resolve the Citation objects to Opinion objects.
        try:
            citation_matches = match_citations.get_citation_matches(
                opinion, citations)
        except ResponseNotReady as e:
            # Threading problem in httplib, which is used in the Solr query.
            raise self.retry(exc=e, countdown=2)

        # cited opinion -> number of times this opinion cites it
        grouped_matches = Counter(citation_matches)

        # Work out which matched opinions are newly cited; only their
        # clusters get a citation_count bump.
        previously_cited_pks = opinion.opinions_cited.all().values_list(
            "pk", flat=True)
        opinion_ids_to_update = {
            matched.pk
            for matched in grouped_matches
            if matched.pk not in previously_cited_pks
        }

        # Increment every affected cluster at once, and queue a single Solr
        # update for them if requested.
        clusters = OpinionCluster.objects.filter(
            sub_opinions__pk__in=opinion_ids_to_update)
        clusters.update(citation_count=F("citation_count") + 1)
        if index:
            add_items_to_solr.delay(
                clusters.values_list("pk", flat=True),
                "search.OpinionCluster",
            )

        # New HTML for the citing opinion, with inline citation links.
        opinion.html_with_citations = create_cited_html(opinion, citations)

        # Replace the stored citations: drop the old rows, then insert one
        # row per matched opinion with its citation depth.
        OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()
        replacement_rows = [
            OpinionsCited(
                citing_opinion_id=opinion.pk,
                cited_opinion_id=matched.pk,
                depth=times_cited,
            )
            for matched, times_cited in grouped_matches.items()
        ]
        OpinionsCited.objects.bulk_create(replacement_rows)

        # Persist the changes to the citing opinion.
        opinion.save()

    # One Solr update at the end covering all the passed opinion PKs.
    if index:
        add_items_to_solr.delay(opinion_pks, "search.Opinion")
def find_citations_for_opinion_by_pks(
    self,
    opinion_pks: List[int],
    index: bool = True,
) -> None:
    """Find citations for search.Opinion objects.

    Opinions whose text yields no citations are skipped. For the others, the
    citing opinion's HTML is regenerated, and then, in one transaction, the
    citation counts of newly cited clusters are incremented, the opinion's
    ``OpinionsCited`` rows are replaced, and the opinion is saved.

    :param self: Object providing ``retry(exc=..., countdown=...)``; its
        result is raised when citation resolution hits ``ResponseNotReady``.
    :param opinion_pks: An iterable of search.Opinion PKs
    :param index: Whether to add the item to Solr
    :return: None
    """
    for opinion in Opinion.objects.filter(pk__in=opinion_pks):
        # Memoize parsed versions of the opinion's text
        get_and_clean_opinion_text(opinion)

        # Extract the citations from the opinion's text
        citations: List[CitationBase] = get_citations(opinion.cleaned_text)

        # If no citations are found, continue
        if not citations:
            continue

        # Resolve all those different citation objects to Opinion objects,
        # using a variety of heuristics.
        try:
            citation_resolutions: Dict[
                MatchedResourceType, List[SupportedCitationType]
            ] = do_resolve_citations(citations, opinion)
        except ResponseNotReady as e:
            # Threading problem in httplib, which is used in the Solr query.
            raise self.retry(exc=e, countdown=2)

        # Generate the citing opinion's new HTML with inline citation links
        opinion.html_with_citations = create_cited_html(
            opinion, citation_resolutions)

        # Delete the unmatched citations
        citation_resolutions.pop(NO_MATCH_RESOURCE, None)

        # Increase the citation count for the cluster of each matched opinion
        # if that cluster has not already been cited by this opinion. First,
        # calculate the IDs of every opinion whose cluster will need updating.
        all_cited_opinions = opinion.opinions_cited.all().values_list(
            "pk", flat=True)
        opinion_ids_to_update = {
            _opinion.pk
            for _opinion in citation_resolutions
            if _opinion.pk not in all_cited_opinions
        }

        # Finally, commit these changes to the database in a single
        # transaction block.
        with transaction.atomic():
            opinion_clusters_to_update = OpinionCluster.objects.filter(
                sub_opinions__pk__in=opinion_ids_to_update)
            opinion_clusters_to_update.update(
                citation_count=F("citation_count") + 1)

            # Capture the affected cluster PKs as a plain list now, so the
            # Solr task can be queued once this block has finished.
            cluster_pks = (
                list(opinion_clusters_to_update.values_list("pk", flat=True))
                if index
                else []
            )

            # Nuke existing citations
            OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()

            # Create the new ones.
            OpinionsCited.objects.bulk_create([
                OpinionsCited(
                    citing_opinion_id=opinion.pk,
                    cited_opinion_id=_opinion.pk,
                    depth=len(_citations),
                )
                for _opinion, _citations in citation_resolutions.items()
            ])

            # Save all the changes to the citing opinion (send to solr later)
            opinion.save(index=False)

        # Trigger the cluster Solr update only after the atomic block has
        # exited, so no task is queued if the block raised and rolled back.
        # NOTE(review): if a caller wraps this in an outer transaction the
        # writes are still uncommitted here — confirm whether
        # transaction.on_commit is wanted instead.
        if index:
            add_items_to_solr.delay(cluster_pks, "search.OpinionCluster")

    # If a Solr update was requested, do a single one at the end with all the
    # pks of the passed opinions
    if index:
        add_items_to_solr.delay(opinion_pks, "search.Opinion")
def find_citations_for_opinion_by_pks(self, opinion_pks, index=True):
    """Find citations for search.Opinion objects.

    For each opinion, every citation that resolves to exactly one match is
    linked to the matching Opinion. The matched opinion's cluster gets its
    ``citation_count`` incremented at most once per citing opinion, and only
    if the citing opinion was not already citing it. When any citations were
    found, the opinion's ``html_with_citations`` is regenerated and its
    ``OpinionsCited`` rows are replaced. Each opinion is always saved.

    :param self: Object providing ``retry(exc=..., countdown=...)``; its
        result is raised when the matcher hits ``ResponseNotReady``.
    :param opinion_pks: An iterable of search.Opinion PKs
    :param index: Whether to add the item to Solr
    :return: None
    """
    opinions = Opinion.objects.filter(pk__in=opinion_pks)
    for opinion in opinions:
        citations = get_document_citations(opinion)

        # PKs this opinion already cited before this run. Fetched once per
        # opinion, because nothing in the loop below changes the relation.
        previously_cited = set(
            opinion.opinions_cited.all().values_list("pk", flat=True)
        )

        # Set used so we can do one simple update to the citing opinion.
        opinions_cited = set()
        for citation in citations:
            try:
                matches = match_citations.match_citation(
                    citation, citing_doc=opinion)
            except ResponseNotReady as e:
                # Threading problem in httplib, which is used in the Solr
                # query.
                raise self.retry(exc=e, countdown=2)

            # TODO: Figure out what to do if there's more than one
            if len(matches) != 1:
                # No (unique) match found for citation; nothing to record.
                continue

            match_id = matches[0]['id']
            try:
                matched_opinion = Opinion.objects.get(pk=match_id)

                # Increase citation count for matched cluster if it hasn't
                # already been cited by this opinion. Checking opinions_cited
                # as well stops a repeated citation within this document from
                # bumping the count more than once.
                matched_pk = matched_opinion.pk
                if (matched_pk not in previously_cited
                        and matched_pk not in opinions_cited):
                    cluster = matched_opinion.cluster
                    cluster.citation_count += 1
                    cluster.save(index=index)

                # Add citation match to the citing opinion's list of cases
                # it cites. opinions_cited is a set so duplicates aren't an
                # issue
                opinions_cited.add(matched_pk)

                # URL field will be used for generating inline citation
                # html
                citation.match_url = matched_opinion.cluster.get_absolute_url()
                citation.match_id = matched_pk
            except (Opinion.DoesNotExist, Opinion.MultipleObjectsReturned):
                # No Opinion, or several Opinions, returned. Press on.
                continue

        # Only update things if we found citations
        if citations:
            opinion.html_with_citations = create_cited_html(opinion, citations)

            # Nuke existing citations
            OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()

            # Create the new ones.
            OpinionsCited.objects.bulk_create([
                OpinionsCited(citing_opinion_id=opinion.pk,
                              cited_opinion_id=pk)
                for pk in opinions_cited
            ])

        # Update Solr if requested. In some cases we do it at the end for
        # performance reasons.
        opinion.save(index=index)