def handle(self, *args, **options):
        """Identify parallel citations and save them as requested.

        This process proceeds in two phases. The first phase is to work through
        the entire corpus, identifying citations that occur very near to each
        other. These are considered parallel citations, and they are built into
        a graph data structure where citations are nodes and each parallel
        citation is an edge. The weight of each edge is determined by the
        number of times a parallel citation has been identified between two
        citations. This should solve problems like typos or other issues with
        our heuristic approach.

        The second phase of this process is to update the database with the
        high quality citations. This can only be done by matching the citations
        with actual items in the database and then updating them with parallel
        citations that are sufficiently likely to be good.

        Options read directly by this method:
            doc_id: when truthy, only Opinions whose primary key is in this
                collection are processed.
            all: when truthy (and doc_id is not), every Opinion is processed.
            update_database: when falsy, a notice is logged that no database
                changes will be made.
        The complete ``options`` dict is also forwarded to
        ``handle_subgraph`` and ``do_solr``.

        Raises:
            CommandError: if neither ``doc_id`` nor ``all`` was supplied.
        """
        super(Command, self).handle(*args, **options)
        if not (options.get('doc_id') or options.get('all')):
            raise CommandError("Please specify if you want all items or a "
                               "specific item.")
        if not options['update_database']:
            logger.info(
                "--update_database is not set. No changes will be made to the "
                "database."
            )

        # Update Citation object to consider similar objects equal.
        self.monkey_patch_citation()

        def run_batch(batch):
            """Run the queued chains as one group and merge every result."""
            # join() waits for the whole group and hands back one result per
            # chain; each result is fed into the network.
            for citation_groups in group(batch).apply_async().join():
                self.add_groups_to_network(citation_groups)

        logger.info("## Entering phase one: Building a network object of "
                    "all citations.\n")
        q = Opinion.objects.all()
        if options.get('doc_id'):
            q = q.filter(pk__in=options['doc_id'])
        # Snapshot of the row count; used for progress output and to detect
        # the final item. It comes from a separate query, so it can disagree
        # with the number of rows actually iterated below.
        count = q.count()
        opinions = queryset_generator(q, chunksize=10000)

        node_count = edge_count = completed = 0
        subtasks = []
        for o in opinions:
            subtasks.append(
                # This will call the second function with the results from the
                # first.
                get_document_citations.s(o) | identify_parallel_citations.s()
            )
            last_item = (count == completed + 1)
            # The modulo check runs before the counter is incremented, so the
            # very first opinion is dispatched on its own and every later
            # batch holds 50 chains.
            if (completed % 50 == 0) or last_item:
                run_batch(subtasks)
                subtasks = []

            completed += 1
            if completed % 250 == 0 or last_item:
                # Only do this once in a while.
                node_count = len(self.g.nodes())
                edge_count = len(self.g.edges())
            sys.stdout.write("\r  Completed %s of %s. (%s nodes, %s edges)" % (
                completed,
                count,
                node_count,
                edge_count,
            ))
            sys.stdout.flush()

        if subtasks:
            # If the iterated rows did not match the count snapshot, the
            # last_item check never fired on the true final row. Dispatch the
            # leftovers so their citations are not silently dropped.
            run_batch(subtasks)

        logger.info("\n\n## Entering phase two: Saving the best edges to "
                    "the database.\n\n")
        # NOTE(review): connected_component_subgraphs was removed in
        # networkx 2.4 -- confirm the pinned networkx version before upgrading.
        for sub_graph in nx.connected_component_subgraphs(self.g):
            self.handle_subgraph(sub_graph, options)

        logger.info("\n\n## Done. Added %s new citations." % self.update_count)

        self.do_solr(options)
Esempio n. 2
0
    def handle(self, *args, **options):
        """Identify parallel citations and save them as requested.

        This process proceeds in two phases. The first phase is to work through
        the entire corpus, identifying citations that occur very near to each
        other. These are considered parallel citations, and they are built into
        a graph data structure where citations are nodes and each parallel
        citation is an edge. The weight of each edge is determined by the
        number of times a parallel citation has been identified between two
        citations. This should solve problems like typos or other issues with
        our heuristic approach.

        The second phase of this process is to update the database with the
        high quality citations. This can only be done by matching the citations
        with actual items in the database and then updating them with parallel
        citations that are sufficiently likely to be good.

        Options read directly by this method:
            doc_id: when truthy, only Opinions whose primary key is in this
                collection are processed.
            all: when truthy (and doc_id is not), every Opinion is processed.
            update_database: when falsy, a notice is logged that no database
                changes will be made.
        The complete ``options`` dict is also forwarded to
        ``handle_subgraph`` and ``do_solr``.

        Raises:
            CommandError: if neither ``doc_id`` nor ``all`` was supplied.
        """
        super(Command, self).handle(*args, **options)
        if not (options.get("doc_id") or options.get("all")):
            raise CommandError(
                "Please specify if you want all items or a specific item.")
        if not options["update_database"]:
            logger.info(
                "--update_database is not set. No changes will be made to the "
                "database.")

        def dispatch(pending):
            """Execute the pending chains as a group and merge the results."""
            # A plain loop replaces the original side-effect-only list
            # comprehension; join() returns one result per chain.
            results = group(pending).apply_async().join()
            for citation_groups in results:
                self.add_groups_to_network(citation_groups)

        logger.info("## Entering phase one: Building a network object of "
                    "all citations.\n")
        q = Opinion.objects.all()
        if options.get("doc_id"):
            q = q.filter(pk__in=options["doc_id"])
        # The count is taken by its own query before iteration starts, so it
        # is only a snapshot and may differ from the rows yielded below.
        count = q.count()
        opinions = queryset_generator(q, chunksize=10000)

        node_count = edge_count = completed = 0
        subtasks = []
        for o in opinions:
            subtasks.append(
                # This will call the second function with the results from the
                # first.
                get_document_citations.s(o)
                | identify_parallel_citations.s())
            last_item = count == completed + 1
            # Checked before the increment: the first opinion goes out alone,
            # after that a batch is sent every 50 opinions.
            if (completed % 50 == 0) or last_item:
                dispatch(subtasks)
                subtasks = []

            completed += 1
            if completed % 250 == 0 or last_item:
                # Only do this once in a while.
                node_count = len(self.g.nodes())
                edge_count = len(self.g.edges())
            sys.stdout.write("\r  Completed %s of %s. (%s nodes, %s edges)" %
                             (completed, count, node_count, edge_count))
            sys.stdout.flush()

        if subtasks:
            # The last_item test relies on the count snapshot; when the
            # iterated rows disagree with it, a partial batch is still queued
            # here. Send it so those opinions are not skipped.
            dispatch(subtasks)

        logger.info("\n\n## Entering phase two: Saving the best edges to "
                    "the database.\n\n")
        # NOTE(review): connected_component_subgraphs was removed in
        # networkx 2.4 -- confirm the pinned networkx version before upgrading.
        for sub_graph in nx.connected_component_subgraphs(self.g):
            self.handle_subgraph(sub_graph, options)

        logger.info("\n\n## Done. Added %s new citations." % self.update_count)

        self.do_solr(options)