def process_citations(data, debug):
    """Add the citations listed in ``data``, one row at a time.

    Each row's (citing, cited) pair is looked up in OpinionsCited. When no
    such row exists a new OpinionsCited is built, and it is saved only when
    ``debug`` is false. A summary of every citation is printed. Once all rows
    are handled, the citing opinions of the newly saved citations are passed
    to ``add_or_update_opinions`` (skipped in debug mode).

    :param data: Object whose ``iterrows()`` yields ``(index, row)`` pairs in
        which each row can be indexed by 'citing' and 'cited'. Presumably a
        pandas DataFrame of opinion IDs -- confirm against the caller.
    :param debug: When true, nothing is saved and Solr is not updated.
    :return: None
    """
    touched_pks = set()
    for _, row in data.iterrows():
        citing_id, cited_id = row['citing'], row['cited']
        print("\nAdding citation from %s to %s" % (citing_id, cited_id))

        try:
            cite = OpinionsCited.objects.get(
                citing_opinion_id=citing_id,
                cited_opinion_id=cited_id,
            )
        except OpinionsCited.DoesNotExist:
            cite = OpinionsCited(citing_opinion_id=citing_id,
                                 cited_opinion_id=cited_id)
            msg = "Created new citation:\n"
            if not debug:
                cite.save()
                touched_pks.add(cite.citing_opinion.pk)
        else:
            msg = "Citation already exists. Doing nothing:\n"

        # Building the summary touches both related opinions, which is where
        # a missing Opinion surfaces.
        try:
            summary = (
                "  %s"
                "    %s: %s\n"
                "    From: %s\n"
                "    To:   %s\n" %
                (msg, cite.pk, cite, cite.citing_opinion, cite.cited_opinion))
        except Opinion.DoesNotExist:
            print("  Unable to create citation. Underlying Opinion doesn't "
                  "exist.")
        else:
            print(summary)

    print("\nUpdating Solr...")
    if not debug:
        add_or_update_opinions(touched_pks)
    print("Done.")
Esempio n. 2
0
def find_citations_for_opinion_by_pks(self, opinion_pks, index=True):
    """Find citations for search.Opinion objects.

    For each opinion: extract its citations, match them to Opinion objects,
    bump ``citation_count`` on the cluster of every matched opinion that this
    opinion did not already cite, and (when citations were found) regenerate
    ``html_with_citations`` and replace the opinion's OpinionsCited rows.

    :param self: The bound task; only ``self.retry`` is used.
    :param opinion_pks: An iterable of search.Opinion PKs
    :param index: Whether to add the item to Solr
    :return: None
    """
    opinions = Opinion.objects.filter(pk__in=opinion_pks)
    for opinion in opinions:
        # Returns a list of Citation objects, i.e., something like
        # [FullCitation, FullCitation, ShortformCitation, FullCitation,
        #   SupraCitation, SupraCitation, ShortformCitation, FullCitation]
        citations = get_document_citations(opinion)

        # Match all those different Citation objects to Opinion objects, using
        # a variety of heuristics.
        try:
            citation_matches = match_citations.get_citation_matches(
                opinion, citations
            )
        except ResponseNotReady as e:
            # Threading problem in httplib, which is used in the Solr query.
            raise self.retry(exc=e, countdown=2)

        # Consolidate duplicate matches, keeping a counter of how often each
        # match appears (so we know how many times an opinion cites another).
        # keys = cited opinion
        # values = number of times that opinion is cited
        grouped_matches = Counter(citation_matches)

        if grouped_matches:
            # Load the PKs this opinion already cites once, instead of
            # re-running and scanning the whole queryset per matched opinion.
            # The relation is not modified until the "nuke" step below, so
            # the snapshot stays valid for the whole loop.
            already_cited_pks = set(
                opinion.opinions_cited.all().values_list("pk", flat=True)
            )
            for matched_opinion in grouped_matches:
                # Increase citation count for matched cluster if it hasn't
                # already been cited by this opinion.
                if matched_opinion.pk not in already_cited_pks:
                    matched_opinion.cluster.citation_count += 1
                    matched_opinion.cluster.save(index=index)

        # Only update things if we found citations
        if citations:
            opinion.html_with_citations = create_cited_html(opinion, citations)

            # Nuke existing citations
            OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()

            # Create the new ones; depth is how many times the citing opinion
            # cites the matched one.
            OpinionsCited.objects.bulk_create(
                [
                    OpinionsCited(
                        citing_opinion_id=opinion.pk,
                        cited_opinion_id=matched_opinion.pk,
                        depth=depth,
                    )
                    for matched_opinion, depth in grouped_matches.items()
                ]
            )

        # Update Solr if requested. In some cases we do it at the end for
        # performance reasons.
        opinion.save(index=index)
Esempio n. 3
0
def update_document(opinion, index=True):
    """Get the citations for an item and save it and add it to the index if
    requested.

    Every citation with exactly one match is resolved to an Opinion. The
    matched cluster's ``citation_count`` is increased at most once per cited
    opinion, and only if this opinion did not cite it before. When citations
    were found, ``html_with_citations`` is regenerated and the opinion's
    OpinionsCited rows are replaced.

    :param opinion: The citing Opinion.
    :param index: Passed through to the ``save`` calls as ``index``.
    :return: None
    """
    citations = get_document_citations(opinion)

    # Set of cited PKs, used so we can do one simple update to the citing
    # opinion at the end.
    opinions_cited = set()
    # PKs this opinion cited before this run. Loaded lazily on the first match
    # so nothing is queried when no citation resolves.
    previously_cited = None
    for citation in citations:
        matches = match_citations.match_citation(citation, citing_doc=opinion)

        # TODO: Figure out what to do if there's more than one
        if len(matches) != 1:
            # No (unambiguous) match found for citation
            continue

        match_id = matches[0]['id']
        try:
            matched_opinion = Opinion.objects.get(pk=match_id)

            if previously_cited is None:
                previously_cited = set(
                    opinion.opinions_cited.all().values_list("pk", flat=True)
                )

            # Increase citation count for matched cluster if it hasn't
            # already been cited by this opinion. The stored relation is only
            # rewritten after the loop, so the local set must be consulted
            # too; otherwise a repeated citation of the same opinion within
            # this document would bump the count once per occurrence.
            first_time_cited = (
                matched_opinion.pk not in previously_cited
                and matched_opinion.pk not in opinions_cited
            )
            if first_time_cited:
                matched_opinion.cluster.citation_count += 1
                matched_opinion.cluster.save(index=index)

            # Add citation match to the citing opinion's list of cases it
            # cites. opinions_cited is a set so duplicates aren't an issue
            opinions_cited.add(matched_opinion.pk)

            # URL field will be used for generating inline citation html
            citation.match_url = matched_opinion.cluster.get_absolute_url()
            citation.match_id = matched_opinion.pk
        except (Opinion.DoesNotExist, Opinion.MultipleObjectsReturned):
            # Zero or several Opinions returned. Press on.
            continue

    # Only update things if we found citations
    if citations:
        opinion.html_with_citations = create_cited_html(opinion, citations)

        # Nuke existing citations
        OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()

        # Create the new ones.
        OpinionsCited.objects.bulk_create([
            OpinionsCited(citing_opinion_id=opinion.pk, cited_opinion_id=pk)
            for pk in opinions_cited
        ])

    # Update Solr if requested. In some cases we do it at the end for
    # performance reasons.
    opinion.save(index=index)
def process_citations(data, debug):
    """Add the citations listed in ``data``, one row at a time.

    Each row's (citing, cited) pair is looked up in OpinionsCited. When no
    such row exists a new OpinionsCited is built, and it is saved only when
    ``debug`` is false. Progress is reported through ``logger``. Once all rows
    are handled, the citing opinions of the newly saved citations are sent to
    ``add_items_to_solr`` (skipped in debug mode).

    :param data: Object whose ``iterrows()`` yields ``(index, row)`` pairs in
        which each row can be indexed by "citing" and "cited". Presumably a
        pandas DataFrame of opinion IDs -- confirm against the caller.
    :param debug: When true, nothing is saved and Solr is not updated.
    :return: None
    """
    updated_ids = set()
    for _, item in data.iterrows():
        logger.info(
            f"\nAdding citation from {item['citing']} to {item['cited']}")

        lookup = {
            "citing_opinion_id": item["citing"],
            "cited_opinion_id": item["cited"],
        }
        try:
            cite = OpinionsCited.objects.get(**lookup)
        except OpinionsCited.DoesNotExist:
            cite = OpinionsCited(**lookup)
            msg = "Created new citation:\n"
            if not debug:
                cite.save()
                updated_ids.add(cite.citing_opinion.pk)
        else:
            msg = "Citation already exists. Doing nothing:\n"

        # Building the summary touches both related opinions, which is where
        # a missing Opinion surfaces.
        try:
            details = (
                "  %s"
                "    %s: %s\n"
                "    From: %s\n"
                "    To:   %s\n" %
                (msg, cite.pk, cite, cite.citing_opinion, cite.cited_opinion))
        except Opinion.DoesNotExist:
            logger.warning(
                "  Unable to create citation. Underlying Opinion doesn't "
                "exist.")
        else:
            logger.info(details)

    logger.info("\nUpdating Solr...")
    if not debug:
        add_items_to_solr(updated_ids, "search.Opinion")
    logger.info("Done.")
def process_citations(data, debug):
    """Walk the rows of ``data`` and add each citation in turn.

    A row names a citing and a cited opinion. An existing OpinionsCited for
    that pair is left alone; otherwise one is built and, unless ``debug`` is
    set, saved. A description of each citation is printed, and after the loop
    the citing opinions of the saved citations are handed to
    ``add_or_update_opinions`` (not in debug mode).

    :param data: Object whose ``iterrows()`` yields ``(index, row)`` pairs in
        which each row can be indexed by 'citing' and 'cited'. Presumably a
        pandas DataFrame -- confirm against the caller.
    :param debug: When true, nothing is saved and Solr is not updated.
    :return: None
    """
    ids_to_reindex = set()
    for _, item in data.iterrows():
        print("\nAdding citation from %s to %s" % (item['citing'],
                                                   item['cited']))

        pair = {
            'citing_opinion_id': item['citing'],
            'cited_opinion_id': item['cited'],
        }
        try:
            cite = OpinionsCited.objects.get(**pair)
        except OpinionsCited.DoesNotExist:
            cite = OpinionsCited(**pair)
            msg = "Created new citation:\n"
            if not debug:
                cite.save()
                ids_to_reindex.add(cite.citing_opinion.pk)
        else:
            msg = "Citation already exists. Doing nothing:\n"

        # Formatting dereferences both related opinions; a missing one raises
        # Opinion.DoesNotExist here.
        try:
            description = (
                "  %s"
                "    %s: %s\n"
                "    From: %s\n"
                "    To:   %s\n" % (msg, cite.pk, cite, cite.citing_opinion,
                                    cite.cited_opinion)
            )
        except Opinion.DoesNotExist:
            print("  Unable to create citation. Underlying Opinion doesn't "
                  "exist.")
        else:
            print(description)

    print("\nUpdating Solr...")
    if not debug:
        add_or_update_opinions(ids_to_reindex)
    print("Done.")
Esempio n. 6
0
def find_citations_for_opinion_by_pks(self, opinion_pks, index=True):
    """Find citations for search.Opinion objects.

    For each opinion that has citations: match them to Opinion objects,
    increment ``citation_count`` on the clusters of newly cited opinions with
    one bulk update, regenerate ``html_with_citations``, and replace the
    opinion's OpinionsCited rows. Solr updates are queued through
    ``add_items_to_solr.delay`` only when ``index`` is true.

    :param self: The bound task; only ``self.retry`` is used.
    :param opinion_pks: An iterable of search.Opinion PKs
    :param index: Whether to add the item to Solr
    :return: None
    """
    opinions = Opinion.objects.filter(pk__in=opinion_pks)
    for opinion in opinions:
        # Returns a list of Citation objects, i.e., something like
        # [FullCitation, FullCitation, ShortformCitation, FullCitation,
        #   SupraCitation, SupraCitation, ShortformCitation, FullCitation]
        citations = get_document_citations(opinion)

        # If no citations are found, continue
        if not citations:
            continue

        # Match all those different Citation objects to Opinion objects, using
        # a variety of heuristics.
        try:
            citation_matches = match_citations.get_citation_matches(
                opinion, citations)
        except ResponseNotReady as e:
            # Threading problem in httplib, which is used in the Solr query.
            raise self.retry(exc=e, countdown=2)

        # Consolidate duplicate matches, keeping a counter of how often each
        # match appears (so we know how many times an opinion cites another).
        # keys = cited opinion
        # values = number of times that opinion is cited
        grouped_matches = Counter(citation_matches)

        # Increase the citation count for the cluster of each matched opinion
        # if that cluster has not already been cited by this opinion. First,
        # calculate the IDs of every opinion whose cluster will need updating.
        # The already-cited PKs are pulled into a set for cheap membership
        # tests.
        already_cited_pks = set(
            opinion.opinions_cited.all().values_list("pk", flat=True))
        opinion_ids_to_update = {
            matched_opinion.pk
            for matched_opinion in grouped_matches
            if matched_opinion.pk not in already_cited_pks
        }

        # Then, increment the citation_count fields for those matched clusters
        # all at once. Trigger a single Solr update as well, if required.
        opinion_clusters_to_update = OpinionCluster.objects.filter(
            sub_opinions__pk__in=opinion_ids_to_update)
        opinion_clusters_to_update.update(
            citation_count=F("citation_count") + 1)
        if index:
            add_items_to_solr.delay(
                opinion_clusters_to_update.values_list("pk", flat=True),
                "search.OpinionCluster",
            )

        # Generate the citing opinion's new HTML (with inline citation links)
        opinion.html_with_citations = create_cited_html(opinion, citations)

        # Nuke existing citations
        OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()

        # Create the new ones; depth is how many times the citing opinion
        # cites the matched one.
        OpinionsCited.objects.bulk_create([
            OpinionsCited(
                citing_opinion_id=opinion.pk,
                cited_opinion_id=matched_opinion.pk,
                depth=depth,
            ) for matched_opinion, depth in grouped_matches.items()
        ])

        # Save all the changes to the citing opinion. index=False keeps this
        # save from triggering its own Solr update: indexing is done once at
        # the end, and only when the caller asked for it.
        # NOTE(review): assumes save()'s default is to index -- confirm.
        opinion.save(index=False)

    # If a Solr update was requested, do a single one at the end with all the
    # pks of the passed opinions
    if index:
        add_items_to_solr.delay(opinion_pks, "search.Opinion")
Esempio n. 7
0
def find_citations_for_opinion_by_pks(
    self,
    opinion_pks: List[int],
    index: bool = True,
) -> None:
    """Find citations for search.Opinion objects.

    For each opinion that has citations: resolve them to Opinion objects,
    regenerate ``html_with_citations``, increment ``citation_count`` on the
    clusters of newly cited opinions, and replace the opinion's OpinionsCited
    rows. The database writes for one opinion run in a single transaction;
    Solr updates are queued only after that transaction block has exited.

    :param self: The bound task; only ``self.retry`` is used.
    :param opinion_pks: An iterable of search.Opinion PKs
    :param index: Whether to add the item to Solr
    :return: None
    """
    opinions: List[Opinion] = Opinion.objects.filter(pk__in=opinion_pks)
    for opinion in opinions:
        # Memoize parsed versions of the opinion's text
        get_and_clean_opinion_text(opinion)

        # Extract the citations from the opinion's text
        citations: List[CitationBase] = get_citations(opinion.cleaned_text)

        # If no citations are found, continue
        if not citations:
            continue

        # Resolve all those different citation objects to Opinion objects,
        # using a variety of heuristics.
        try:
            citation_resolutions: Dict[
                MatchedResourceType,
                List[SupportedCitationType]] = do_resolve_citations(
                    citations, opinion)
        except ResponseNotReady as e:
            # Threading problem in httplib, which is used in the Solr query.
            raise self.retry(exc=e, countdown=2)

        # Generate the citing opinion's new HTML with inline citation links
        opinion.html_with_citations = create_cited_html(
            opinion, citation_resolutions)

        # Delete the unmatched citations
        citation_resolutions.pop(NO_MATCH_RESOURCE, None)

        # Increase the citation count for the cluster of each matched opinion
        # if that cluster has not already been cited by this opinion. First,
        # calculate the IDs of every opinion whose cluster will need updating.
        # The already-cited PKs are pulled into a set for cheap membership
        # tests.
        already_cited_pks = set(
            opinion.opinions_cited.all().values_list("pk", flat=True))
        opinion_ids_to_update = {
            _opinion.pk
            for _opinion in citation_resolutions
            if _opinion.pk not in already_cited_pks
        }

        # Finally, commit these changes to the database in a single
        # transaction block.
        with transaction.atomic():
            opinion_clusters_to_update = OpinionCluster.objects.filter(
                sub_opinions__pk__in=opinion_ids_to_update)
            opinion_clusters_to_update.update(
                citation_count=F("citation_count") + 1)
            # Materialize the PKs now so the Solr task below receives a plain
            # list rather than a lazy queryset.
            cluster_pks = list(
                opinion_clusters_to_update.values_list("pk", flat=True))

            # Nuke existing citations
            OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()

            # Create the new ones; depth is the number of citations resolved
            # to the cited opinion.
            OpinionsCited.objects.bulk_create([
                OpinionsCited(
                    citing_opinion_id=opinion.pk,
                    cited_opinion_id=_opinion.pk,
                    depth=len(_citations),
                ) for _opinion, _citations in citation_resolutions.items()
            ])

            # Save all the changes to the citing opinion (send to solr later)
            opinion.save(index=False)

        # Trigger a single Solr update for the clusters, if required. This is
        # done after the atomic block so the task is not queued before the
        # new counts are committed, and is not queued at all if the block
        # raised and rolled back.
        if index:
            add_items_to_solr.delay(cluster_pks, "search.OpinionCluster")

    # If a Solr update was requested, do a single one at the end with all the
    # pks of the passed opinions
    if index:
        add_items_to_solr.delay(opinion_pks, "search.Opinion")
Esempio n. 8
0
def find_citations_for_opinion_by_pks(self, opinion_pks, index=True):
    """Find citations for search.Opinion objects.

    For each opinion, every citation with exactly one match is resolved to an
    Opinion. The matched cluster's ``citation_count`` is increased at most
    once per cited opinion, and only if the citing opinion did not cite it
    before. When citations were found, ``html_with_citations`` is regenerated
    and the opinion's OpinionsCited rows are replaced.

    :param self: The bound task; only ``self.retry`` is used.
    :param opinion_pks: An iterable of search.Opinion PKs
    :param index: Whether to add the item to Solr
    :return: None
    """
    opinions = Opinion.objects.filter(pk__in=opinion_pks)
    for opinion in opinions:
        citations = get_document_citations(opinion)

        # Set of cited PKs, used so we can do one simple update to the citing
        # opinion at the end.
        opinions_cited = set()
        # PKs this opinion cited before this run. Loaded lazily on the first
        # match so nothing is queried when no citation resolves.
        previously_cited = None
        for citation in citations:
            try:
                matches = match_citations.match_citation(
                    citation, citing_doc=opinion)
            except ResponseNotReady as e:
                # Threading problem in httplib, which is used in the Solr query.
                raise self.retry(exc=e, countdown=2)

            # TODO: Figure out what to do if there's more than one
            if len(matches) != 1:
                # No (unambiguous) match found for citation
                continue

            match_id = matches[0]['id']
            try:
                matched_opinion = Opinion.objects.get(pk=match_id)

                if previously_cited is None:
                    previously_cited = set(
                        opinion.opinions_cited.all().values_list(
                            "pk", flat=True)
                    )

                # Increase citation count for matched cluster if it hasn't
                # already been cited by this opinion. The stored relation is
                # only rewritten after the loop, so the local set must be
                # consulted too; otherwise a repeated citation of the same
                # opinion within this document would bump the count once per
                # occurrence.
                first_time_cited = (
                    matched_opinion.pk not in previously_cited
                    and matched_opinion.pk not in opinions_cited
                )
                if first_time_cited:
                    matched_opinion.cluster.citation_count += 1
                    matched_opinion.cluster.save(index=index)

                # Add citation match to the citing opinion's list of cases
                # it cites. opinions_cited is a set so duplicates aren't an
                # issue
                opinions_cited.add(matched_opinion.pk)

                # URL field will be used for generating inline citation
                # html
                citation.match_url = matched_opinion.cluster.get_absolute_url()
                citation.match_id = matched_opinion.pk
            except (Opinion.DoesNotExist, Opinion.MultipleObjectsReturned):
                # Zero or several Opinions returned. Press on.
                continue

        # Only update things if we found citations
        if citations:
            opinion.html_with_citations = create_cited_html(opinion, citations)

            # Nuke existing citations
            OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()

            # Create the new ones.
            OpinionsCited.objects.bulk_create([
                OpinionsCited(citing_opinion_id=opinion.pk,
                              cited_opinion_id=pk)
                for pk in opinions_cited
            ])

        # Update Solr if requested. In some cases we do it at the end for
        # performance reasons.
        opinion.save(index=index)