Code example #1
    def handle(self, *args, **options):
        both_list_and_endpoints = (options.get('doc_id') is not None and
                                   (options.get('start_id') is not None or
                                    options.get('end_id') is not None or
                                    options.get('filed_after') is not None))
        no_option = (not any([options.get('doc_id') is None,
                              options.get('start_id') is None,
                              options.get('end_id') is None,
                              options.get('filed_after') is None,
                              options.get('all') is False]))
        if both_list_and_endpoints or no_option:
            raise CommandError('Please specify either a list of documents, a range of ids, a range of dates, or '
                               'everything.')

        if options.get('filed_after'):
            start_date = make_aware(datetime.strptime(options['filed_after'], '%Y-%m-%d'), utc)

        index = options['index'].lower()

        # Use query chaining to build the query
        query = Document.objects.all()
        if options.get('doc_id'):
            query = query.filter(pk=options.get('doc_id'))
        if options.get('end_id'):
            query = query.filter(pk__lte=options.get('end_id'))
        if options.get('start_id'):
            query = query.filter(pk__gte=options.get('start_id'))
        if options.get('filed_after'):
            query = query.filter(date_filed__gte=start_date)
        if options.get('all'):
            query = Document.objects.all()
        count = query.count()
        docs = queryset_generator(query, chunksize=10000)
        self.update_documents(docs, count, index)
Code example #2
def cleaner(simulate=False, verbose=False):
    """Re-run the anonymize function across the whole corpus.

    The anonymize function was previously missing any documents that contained
    punctuation before or after an ID. This script re-runs the function, fixing
    the error.
    """
    docs = queryset_generator(Document.objects.all())
    for doc in docs:
        text = doc.plain_text
        clean_lines = []
        any_mods = []
        for line in text.split('\n'):
            clean_line, modified = anonymize(line)
            if modified:
                print "Fixing text in document: %s" % doc.pk
                print "Line reads: %s" % line
                fix = raw_input("Fix the line? [Y/n]: ") or 'y'
                if fix.lower() == 'y':
                    clean_lines.append(clean_line)
                    any_mods.append(modified)
                else:
                    clean_lines.append(line)
            else:
                clean_lines.append(line)

        if not simulate and any(any_mods):
            doc.plain_text = '\n'.join(clean_lines)
            doc.blocked = True
            doc.date_blocked = now()
            doc.save()
Code example #3
def cleaner(simulate=False, verbose=False):
    """Re-run the anonymize function across the whole corpus.

    The anonymize function was previously missing any documents that contained
    punctuation before or after an ID. This script re-runs the function, fixing
    the error.
    """
    docs = queryset_generator(Document.objects.all())
    for doc in docs:
        text = doc.plain_text
        clean_lines = []
        any_mods = []
        for line in text.split('\n'):
            clean_line, modified = anonymize(line)
            if modified:
                print "Fixing text in document: %s" % doc.pk
                print "Line reads: %s" % line
                fix = raw_input("Fix the line? [Y/n]: ") or 'y'
                if fix.lower() == 'y':
                    clean_lines.append(clean_line)
                    any_mods.append(modified)
                else:
                    clean_lines.append(line)
            else:
                clean_lines.append(line)

        if not simulate and any(any_mods):
            doc.plain_text = '\n'.join(clean_lines)
            doc.blocked = True
            doc.date_blocked = now()
            doc.save()
Code example #4
    def add_or_update_by_datetime(self, dt):
        """
        Given a datetime, adds or updates all documents newer than that time.
        """
        self.stdout.write("Adding or updating document(s) newer than %s\n" % dt)
        qs = Document.objects.filter(time_retrieved__gt=dt)
        docs = queryset_generator(qs)
        count = qs.count()
        self._chunk_queryset_into_tasks(docs, count)
Code example #5
File: tasks.py Project: aktary/courtlistener
def write_json_to_disk(obj_type_str, obj_type, court_attr,
                       api_resource_obj, courts):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified in the last 32 days because
    it's assumed that the bulk files are generated once per month.
    """
    # Are there already bulk files?
    incremental = test_if_old_bulk_files_exist(obj_type_str)

    # Create a directory for every jurisdiction, if they don't already
    # exist. This does not clobber.
    for court in courts:
        mkdir_p(os.path.join(
            settings.BULK_DATA_DIR,
            'tmp',
            obj_type_str,
            court.pk,
        ))

    if incremental:
        # Make the archives using updated data from the last 32 days.
        print "   - Incremental data! We assume it's good, and use it..."
        thirty_two_days_ago = now() - datetime.timedelta(days=32)
        qs = obj_type.objects.filter(date_modified__gt=thirty_two_days_ago)
    else:
        print "   - Incremental data not found. Working from scratch..."
        qs = obj_type.objects.all()
    item_resource = api_resource_obj()
    if type(qs[0].pk) == int:
        item_list = queryset_generator(qs)
    else:
        # Necessary for jurisdictions, which don't have ints for ids.
        item_list = qs
    i = 0
    for item in item_list:
        json_str = item_resource.serialize(
            None,
            item_resource.full_dehydrate(
                item_resource.build_bundle(obj=item)),
            'application/json',
        ).encode('utf-8')

        with open(os.path.join(
                settings.BULK_DATA_DIR,
                'tmp',
                obj_type_str,
                deepgetattr(item, court_attr),
                '%s.json' % item.pk), 'wb') as f:
            f.write(json_str)
        i += 1

    print '   - all %s %s json files created.' % (i, obj_type_str)
Code example #6
    def add_or_update_all(self):
        """
        Iterates over the entire corpus, adding it to the index. Can be run on
        an empty index or an existing one. If run on an existing index,
        existing documents will be updated.
        """
        self.stdout.write("Adding or updating all documents...\n")
        docs = queryset_generator(Document.objects.all(), chunksize=5000)
        count = Document.objects.all().count()
        self._chunk_queryset_into_tasks(docs, count)
Code example #7
    def add_or_update_by_datetime(self, dt):
        """
        Given a datetime, adds or updates all items newer than that time.
        """
        self.stdout.write(
            "Adding or updating item(s) newer than %s\n" % dt)
        qs = self.type.objects.filter(time_retrieved__gt=dt)
        items = queryset_generator(qs)
        count = qs.count()
        self._chunk_queryset_into_tasks(items, count)
Code example #8
File: tasks.py Project: wmbutler/courtlistener
def write_json_to_disk(obj_type_str, obj_type, court_attr, api_resource_obj,
                       courts):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified in the last 32 days because
    it's assumed that the bulk files are generated once per month.
    """
    # Are there already bulk files?
    incremental = test_if_old_bulk_files_exist(obj_type_str)

    # Create a directory for every jurisdiction, if they don't already
    # exist. This does not clobber.
    for court in courts:
        mkdir_p(
            os.path.join(
                settings.BULK_DATA_DIR,
                'tmp',
                obj_type_str,
                court.pk,
            ))

    if incremental:
        # Make the archives using updated data from the last 32 days.
        print "   - Incremental data! We assume it's good, and use it..."
        thirty_two_days_ago = now() - datetime.timedelta(days=32)
        qs = obj_type.objects.filter(date_modified__gt=thirty_two_days_ago)
    else:
        print "   - Incremental data not found. Working from scratch..."
        qs = obj_type.objects.all()
    item_resource = api_resource_obj()
    if type(qs[0].pk) == int:
        item_list = queryset_generator(qs)
    else:
        # Necessary for jurisdictions, which don't have ints for ids.
        item_list = qs
    i = 0
    for item in item_list:
        json_str = item_resource.serialize(
            None,
            item_resource.full_dehydrate(item_resource.build_bundle(obj=item)),
            'application/json',
        ).encode('utf-8')

        with open(
                os.path.join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                             deepgetattr(item, court_attr),
                             '%s.json' % item.pk), 'wb') as f:
            f.write(json_str)
        i += 1

    print '   - all %s %s json files created.' % (i, obj_type_str)
Code example #9
def cleaner(simulate=False, verbose=False):
    docs = queryset_generator(Document.objects.filter(source="R", time_retrieved__gt="2011-06-01"))
    for doc in docs:
        original_link = doc.download_url
        fixed = link_fixer(original_link)
        doc.download_url = fixed
        if verbose:
            print "Changing: " + original_link
            print "      to: " + fixed
        if not simulate:
            doc.save()
Code example #10
    def add_or_update_all(self):
        """
        Iterates over the entire corpus, adding it to the index. Can be run on
        an empty index or an existing one.

        If run on an existing index, existing items will be updated.
        """
        self.stdout.write("Adding or updating all items...\n")
        q = self.type.objects.all()
        items = queryset_generator(q, chunksize=5000)
        count = q.count()
        self._chunk_queryset_into_tasks(items, count)
Code example #11
def cleaner(simulate=False, verbose=False):
    docs = queryset_generator(
        Document.objects.filter(source='R', time_retrieved__gt='2011-06-01'))
    for doc in docs:
        original_link = doc.download_url
        fixed = link_fixer(original_link)
        doc.download_url = fixed
        if verbose:
            print "Changing: " + original_link
            print "      to: " + fixed
        if not simulate:
            doc.save()
Code example #12
    def delete_by_datetime(self, dt):
        """
        Given a datetime, deletes all documents in the index newer than that time.
        """
        qs = Document.objects.filter(time_retrieved__gt=dt)
        count = qs.count()
        if self._proceed_with_deletion(count):
            self.stdout.write("Deleting all document(s) newer than %s\n" % dt)
            docs = queryset_generator(qs)
            for doc in docs:
                self.si.delete(doc)
            self.si.commit()
Code example #13
    def delete_by_datetime(self, dt):
        """
        Given a datetime, deletes all items in the index newer than that time.

        Relies on the items still being in the database.
        """
        qs = self.type.objects.filter(time_retrieved__gt=dt)
        count = qs.count()
        if proceed_with_deletion(self.stdout, count):
            self.stdout.write("Deleting all item(s) newer than %s\n" % dt)
            items = queryset_generator(qs)
            for item in items:
                self.si.delete(item)
            self.si.commit()
Code example #14
File: delete_tool.py Project: enyst/courtlistener
def delete_data_by_time_and_court(courtID, SIMULATE, delTime=None, VERBOSITY=0):
    """
    Deletes data for a court. If a time is given, uses that time as a constraint.
    """
    if delTime is not None:
        if VERBOSITY >= 1:
            print "Deleting data newer than %s for court %s" % (delTime, courtID)
        count = Document.objects.filter(time_retrieved__gt=delTime, court=courtID).count()
        if count != 0:
            docs = queryset_generator(Document.objects.filter(time_retrieved__gt=delTime, court=courtID))

    else:
        if VERBOSITY >= 1:
            print "Deleting all data for court %s" % courtID
        count = Document.objects.filter(court=courtID).count()
        if count != 0:
            docs = queryset_generator(Document.objects.filter(court=courtID))

    if VERBOSITY >= 1:
        print "Deleting %s documents from the database." % count
    if (not SIMULATE) and (count != 0):
        for doc in docs:
            doc.delete()
Code example #15
def cleaner(simulate=False, verbose=False):
    docs = queryset_generator(Document.objects.filter(source = 'R'))
    for doc in docs:
        caseNameShortOrig = doc.citation.caseNameShort
        caseNameFullOrig = doc.citation.caseNameFull
        caseNameShort = titlecase(harmonize(clean_string(caseNameShortOrig)))
        caseNameFull  = titlecase(harmonize(clean_string(caseNameFullOrig)))
        doc.citation.caseNameShort = caseNameShort
        doc.citation.caseNameFull = caseNameFull
        if verbose:
            if (caseNameShortOrig != caseNameShort) or (caseNameFullOrig != caseNameFull):
                print "Document: %s" % doc.pk
            if caseNameShortOrig != caseNameShort:
                print "Short name, replacing: '%s'" % caseNameShortOrig
                print "                 with: '%s'" % caseNameShort
            if caseNameFullOrig != caseNameFull:
                print " Full name, replacing: '%s'" % caseNameFullOrig
                print "                 with: '%s'\n" % caseNameFull
        if not simulate:
            doc.citation.save()
Code example #16
def cleaner(simulate=False, verbose=False):
    docs = queryset_generator(Document.objects.filter(source='R'))
    for doc in docs:
        caseNameShortOrig = doc.citation.caseNameShort
        caseNameFullOrig = doc.citation.caseNameFull
        caseNameShort = titlecase(harmonize(clean_string(caseNameShortOrig)))
        caseNameFull = titlecase(harmonize(clean_string(caseNameFullOrig)))
        doc.citation.caseNameShort = caseNameShort
        doc.citation.caseNameFull = caseNameFull
        if verbose:
            if (caseNameShortOrig != caseNameShort) or (caseNameFullOrig !=
                                                        caseNameFull):
                print "Document: %s" % doc.pk
            if caseNameShortOrig != caseNameShort:
                print "Short name, replacing: '%s'" % caseNameShortOrig
                print "                 with: '%s'" % caseNameShort
            if caseNameFullOrig != caseNameFull:
                print " Full name, replacing: '%s'" % caseNameFullOrig
                print "                 with: '%s'\n" % caseNameFull
        if not simulate:
            doc.citation.save()
Code example #17
    def handle(self, *args, **options):
        both_list_and_endpoints = (options.get('doc_id') is not None and
                                   (options.get('start_id') is not None
                                    or options.get('end_id') is not None
                                    or options.get('filed_after') is not None))
        no_option = (not any([
            options.get('doc_id') is None,
            options.get('start_id') is None,
            options.get('end_id') is None,
            options.get('filed_after') is None,
            options.get('all') is False
        ]))
        if both_list_and_endpoints or no_option:
            raise CommandError(
                'Please specify either a list of documents, a range of ids, a range of dates, or '
                'everything.')

        if options.get('filed_after'):
            start_date = make_aware(
                datetime.strptime(options['filed_after'], '%Y-%m-%d'), utc)

        index = options['index'].lower()

        # Use query chaining to build the query
        query = Document.objects.all()
        if options.get('doc_id'):
            query = query.filter(pk=options.get('doc_id'))
        if options.get('end_id'):
            query = query.filter(pk__lte=options.get('end_id'))
        if options.get('start_id'):
            query = query.filter(pk__gte=options.get('start_id'))
        if options.get('filed_after'):
            query = query.filter(date_filed__gte=start_date)
        if options.get('all'):
            query = Document.objects.all()
        count = query.count()
        docs = queryset_generator(query, chunksize=10000)
        self.update_documents(docs, count, index)
Code example #18
def fixer(simulate=False, verbose=False):
    """Fix a few issues discovered."""

    # docs = queryset_generator(Document.objects.filter(source='C', plain_text=''))
    # docs = Document.objects.raw('''select "pk"  from "Document" where "source" = 'C' and "plain_text" ~ '^[[:space:]]*$' ''')
    # docs = Document.objects.raw('''select "pk" from "Document" where "source" = 'C' and "plain_text" = 'Unable to extract document content.' ''')

    def fix_plaintiffs(docs, left, simulate, verbose):
        for doc in docs:
            if verbose:
                print "Fixing document number %s: %s" % (doc.pk, doc)
                old_case_name = doc.case_name
                if left:
                    new_case_name = old_case_name.replace("P. v.", "People v.")
                else:
                    new_case_name = old_case_name.replace("v. P.", "v. People")
                print "    Replacing %s" % old_case_name
                print "         with %s" % new_case_name

            if not simulate:
                if left:
                    doc.case_name = doc.case_name.replace("P. v.", "People v.")
                else:
                    doc.case_name = doc.case_name.replace("v. P.", "v. People")
                doc.citation.save()

    def fix_michigan(docs, left, simulate, verbose):
        for doc in docs:
            if verbose:
                print "Fixing document number %s: %s" % (doc.pk, doc)
                old_case_name = doc.case_name
                if left:
                    new_case_name = old_case_name.replace(
                        "People of Mi", "People of Michigan")
                print "    Replacing %s" % old_case_name
                print "         with %s" % new_case_name

            if not simulate:
                if left:
                    doc.case_name = doc.case_name.replace(
                        "People of Mi", "People of Michigan")
                doc.citation.save()

    def fix_wva(docs, simulate, verbose):
        for doc in docs:
            if verbose:
                print "Fixing document number %s: %s" % (doc.pk, doc)
            if not simulate:
                doc.precedential_status = "Published"
                doc.save()

    # Round one! Fix plaintiffs.
    print "!!! ROUND ONE !!!"
    court = Court.objects.get(pk="cal")
    docs = queryset_generator(
        Document.objects.filter(source="C",
                                court=court,
                                citation__case_name__contains="P. v."))
    fix_plaintiffs(docs, True, simulate, verbose)

    # Round three! Fix the Mi cases.
    print "!!! ROUND THREE !!!"
    court = Court.objects.get(pk="mich")
    docs = queryset_generator(
        Document.objects.filter(
            source="C",
            court=court,
            citation__case_name__startswith="People of Mi ",
        ))
    fix_michigan(docs, True, simulate, verbose)

    # Round four! Fix the statuses.
    print "!!! ROUND FOUR !!!"
    court = Court.objects.get(pk="wva")
    docs = queryset_generator(
        Document.objects.filter(
            precedential_status__in=[
                "Memorandum Decision",
                "Per Curiam Opinion",
                "Signed Opinion",
            ],
            court=court,
        ))
    fix_wva(docs, simulate, verbose)
Code example #19
def fixer(simulate=False, verbose=False):
    """Fix a few issues discovered."""
    #docs = queryset_generator(Document.objects.filter(source='C', plain_text=''))
    #docs = Document.objects.raw('''select "pk"  from "Document" where "source" = 'C' and "plain_text" ~ '^[[:space:]]*$' ''')
    #docs = Document.objects.raw('''select "pk" from "Document" where "source" = 'C' and "plain_text" = 'Unable to extract document content.' ''')

    def fix_plaintiffs(docs, left, simulate, verbose):
        for doc in docs:
            if verbose:
                print "Fixing document number %s: %s" % (doc.pk, doc)
                old_case_name = doc.citation.case_name
                if left:
                    new_case_name = old_case_name.replace('P. v.', 'People v.')
                else:
                    new_case_name = old_case_name.replace('v. P.', 'v. People')
                print "    Replacing %s" % old_case_name
                print "         with %s" % new_case_name

            if not simulate:
                if left:
                    doc.citation.case_name = doc.citation.case_name.replace('P. v.', 'People v.')
                else:
                    doc.citation.case_name = doc.citation.case_name.replace('v. P.', 'v. People')
                doc.citation.save()

    def fix_michigan(docs, left, simulate, verbose):
        for doc in docs:
            if verbose:
                print "Fixing document number %s: %s" % (doc.pk, doc)
                old_case_name = doc.citation.case_name
                if left:
                    new_case_name = old_case_name.replace('People of Mi', 'People of Michigan')
                print "    Replacing %s" % old_case_name
                print "         with %s" % new_case_name

            if not simulate:
                if left:
                    doc.citation.case_name = doc.citation.case_name.replace('People of Mi', 'People of Michigan')
                doc.citation.save()

    def fix_wva(docs, simulate, verbose):
        for doc in docs:
            if verbose:
                print "Fixing document number %s: %s" % (doc.pk, doc)
            if not simulate:
                doc.precedential_status = "Published"
                doc.save()


    # Round one! Fix plaintiffs.
    print "!!! ROUND ONE !!!"
    court = Court.objects.get(pk='cal')
    docs = queryset_generator(Document.objects.filter(source="C", court=court, citation__case_name__contains='P. v.'))
    fix_plaintiffs(docs, True, simulate, verbose)

    # Round three! Fix the Mi cases.
    print "!!! ROUND THREE !!!"
    court = Court.objects.get(pk='mich')
    docs = queryset_generator(Document.objects.filter(source="C", court=court, citation__case_name__startswith='People of Mi '))
    fix_michigan(docs, True, simulate, verbose)

    # Round four! Fix the statuses.
    print "!!! ROUND FOUR !!!"
    court = Court.objects.get(pk='wva')
    docs = queryset_generator(Document.objects.filter(precedential_status__in=['Memorandum Decision', 'Per Curiam Opinion', 'Signed Opinion'],
                                                      court=court))
    fix_wva(docs, simulate, verbose)
Code example #20
    def make_archive(self, obj_type_str, obj_type, court_attr,
                     api_resource_obj):
        """Generate compressed archives containing the contents of an object
        database.

        There are a few tricks to this, but the main one is that each item in
        the database goes into two files, all.tar.gz and {court}.tar.gz. This
        means that if we want to avoid iterating the database once per file,
        we need to generate all 350+ jurisdiction files simultaneously.

        We do this by making a dict of open file handles and adding each item
        to the correct two files: The all.tar.gz file and the {court}.tar.gz
        file.

        This function takes longer to run than almost any in the codebase and
        has been the subject of some profiling. The top results are as follows:

           ncalls  tottime  percall  cumtime  percall filename:lineno(function)
           138072    5.007    0.000    6.138    0.000 {method 'sub' of '_sre.SRE_Pattern' objects}
             6001    4.452    0.001    4.608    0.001 {method 'execute' of 'psycopg2._psycopg.cursor' objects}
            24900    3.623    0.000    3.623    0.000 {built-in method compress}
        2807031/69163    2.923    0.000    8.216    0.000 copy.py:145(deepcopy)
          2427852    0.952    0.000    1.130    0.000 encoder.py:37(replace)

        Conclusions:
         1. sub is from string_utils.py, where we nuke bad chars. Could remove
            this code by sanitizing all future input to system and fixing any
            current issues. Other than that, it's already optimized.
         1. Next up is DB waiting. Queries could be optimized to make this
            better.
         1. Next is compression, which we've turned down as much as possible
            already (compresslevel=1 for most bulk files =3 for all.tar.gz).
         1. Encoding and copying bring up the rear. Not much to do there, and
            gains are limited. Could install a faster json decoder, but Python
            2.7's json implementation is already written in C. Not sure how to
            remove the gazillion copy's that are happening.
        """
        courts = Court.objects.all()
        self.stdout.write(' - Creating %s bulk %s files '
                          'simultaneously...\n' % (len(courts), obj_type_str))

        mkdir_p('/tmp/bulk/%s' % obj_type_str)

        # Open a gzip'ed tar file for every court
        tar_files = {}
        for court in courts:
            tar_files[court.pk] = tarfile.open(
                '/tmp/bulk/%s/%s.tar.gz' % (obj_type_str, court.pk),
                mode='w:gz',
                compresslevel=1,
            )
        tar_files['all'] = tarfile.open(
            '/tmp/bulk/%s/all.tar.gz' % obj_type_str,
            mode='w:gz',
            compresslevel=3,
        )

        # Make the archives
        qs = obj_type.objects.all()
        item_resource = api_resource_obj()
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            item_list = qs
        for item in item_list:
            json_str = item_resource.serialize(
                None,
                item_resource.full_dehydrate(
                    item_resource.build_bundle(obj=item)),
                'application/json',
            ).encode('utf-8')

            # Add the json str to the two tarballs
            tarinfo = tarfile.TarInfo("%s.json" % item.pk)
            tarinfo.size = len(json_str)
            tarinfo.mtime = time.mktime(item.date_modified.timetuple())
            tarinfo.type = tarfile.REGTYPE

            tar_files[deepgetattr(item, court_attr)].addfile(
                tarinfo, StringIO.StringIO(json_str))
            tar_files['all'].addfile(tarinfo, StringIO.StringIO(json_str))

        # Close off all the gzip'ed tar files
        for court in courts:
            tar_files[court.pk].close()
        tar_files['all'].close()

        self.stdout.write('   - all %s bulk files created.\n' % obj_type_str)
Code example #21
    def do_pagerank(self, verbosity=1, chown=True):
        #####################
        #      Stage I      #
        # Import Data to NX #
        #####################
        sys.stdout.write('Initializing...\n')
        graph_size = Document.objects.all().count()
        citing_graph = nx.DiGraph()
        qs = Document.objects.only(
            'pk',
            'cases_cited',
        )
        case_list = queryset_generator(qs, chunksize=10000)
        case_count = 0
        timings = []
        average_per_s = 0

        # Build up the network graph and a list of all valid ids
        id_list = []
        for source_case in case_list:
            case_count += 1
            if case_count % 100 == 1:
                t1 = time.time()
            if case_count % 100 == 0:
                t2 = time.time()
                timings.append(t2 - t1)
                average_per_s = 100 / (sum(timings) / float(len(timings)))
            sys.stdout.write(
                "\rGenerating networkx graph...{:.0%} ({}/{}, {:.1f}/s)".
                format(
                    case_count * 1.0 / graph_size,
                    case_count,
                    graph_size,
                    average_per_s,
                ))
            sys.stdout.flush()
            for target_case in source_case.cases_cited.values_list(
                    'parent_documents__id'):
                citing_graph.add_edge(str(source_case.pk), str(target_case[0]))

            # Save all the keys since they get dropped by networkx in Stage II
            id_list.append(str(source_case.pk))

        ######################
        #      Stage II      #
        # Calculate Pagerank #
        ######################
        if verbosity >= 1:
            sys.stdout.write('\n')
            sys.stdout.write('NetworkX PageRank calculating...')
            sys.stdout.flush()
        pr_result = nx.pagerank(citing_graph)
        if verbosity >= 1:
            sys.stdout.write('Complete!\n')

        ###################
        #    Stage III    #
        # Update Pagerank #
        ###################
        progress = 0
        min_value = min(pr_result.values())
        for id in id_list:
            progress += 1
            try:
                new_pr = pr_result[id]
            except KeyError:
                # NetworkX removes the isolated nodes from the network, but they still need to go into the PR file.
                new_pr = min_value
            self.result_file.write('{}={}\n'.format(id, new_pr))

            if verbosity >= 1:
                sys.stdout.write(
                    '\rUpdating Pagerank in external file...{:.0%}'.format(
                        progress * 1.0 / graph_size))
                sys.stdout.flush()

        self.result_file.close()

        if verbosity >= 1:
            sys.stdout.write('\nPageRank calculation finished!')
            sys.stdout.write('See the django log for more details.\n')

        ########################
        #       Stage IV       #
        # Maintenance Routines #
        ########################
        if verbosity >= 1:
            sys.stdout.write(
                'Sorting the temp pagerank file for improved Solr performance...\n'
            )

        # Sort the temp file, creating a new file without the TEMP_EXTENSION value, then delete the temp file.
        os.system('sort -n %s%s > %s' %
                  (self.RESULT_FILE_PATH, self.TEMP_EXTENSION,
                   self.RESULT_FILE_PATH))
        os.remove(self.RESULT_FILE_PATH + self.TEMP_EXTENSION)

        if verbosity >= 1:
            sys.stdout.write('Reloading the external file cache in Solr...\n')
        reload_pagerank_external_file_cache()

        if verbosity >= 1:
            sys.stdout.write(
                'Copying pagerank file to %s, for bulk downloading...\n' %
                settings.BULK_DATA_DIR)
        shutil.copy(self.RESULT_FILE_PATH, settings.BULK_DATA_DIR)
        if chown:
            user_info = pwd.getpwnam('www-data')
            os.chown(settings.BULK_DATA_DIR + 'external_pagerank',
                     user_info.pw_uid, user_info.pw_gid)
Code example #22
    def do_pagerank(self, verbosity=1, chown=True):
        #####################
        #      Stage I      #
        # Import Data to NX #
        #####################
        sys.stdout.write('Initializing...\n')
        graph_size = Document.objects.all().count()
        citing_graph = nx.DiGraph()
        qs = Document.objects.only(
            'pk',
            'cases_cited',
        )
        case_list = queryset_generator(qs, chunksize=10000)
        case_count = 0
        timings = []
        average_per_s = 0

        # Build up the network graph and a list of all valid ids
        id_list = []
        for source_case in case_list:
            case_count += 1
            if case_count % 100 == 1:
                t1 = time.time()
            if case_count % 100 == 0:
                t2 = time.time()
                timings.append(t2 - t1)
                average_per_s = 100 / (sum(timings) / float(len(timings)))
            sys.stdout.write("\rGenerating networkx graph...{:.0%} ({}/{}, {:.1f}/s)".format(
                case_count * 1.0 / graph_size,
                case_count,
                graph_size,
                average_per_s,
            ))
            sys.stdout.flush()
            for target_case in source_case.cases_cited.values_list('parent_documents__id'):
                citing_graph.add_edge(str(source_case.pk), str(target_case[0]))

            # Save all the keys since they get dropped by networkx in Stage II
            id_list.append(str(source_case.pk))

        ######################
        #      Stage II      #
        # Calculate Pagerank #
        ######################
        if verbosity >= 1:
            sys.stdout.write('\n')
            sys.stdout.write('NetworkX PageRank calculating...')
            sys.stdout.flush()
        pr_result = nx.pagerank(citing_graph)
        if verbosity >= 1:
            sys.stdout.write('Complete!\n')

        ###################
        #    Stage III    #
        # Update Pagerank #
        ###################
        progress = 0
        min_value = min(pr_result.values())
        for id in id_list:
            progress += 1
            try:
                new_pr = pr_result[id]
            except KeyError:
                # NetworkX removes the isolated nodes from the network, but they still need to go into the PR file.
                new_pr = min_value
            self.result_file.write('{}={}\n'.format(id, new_pr))

            if verbosity >= 1:
                sys.stdout.write('\rUpdating Pagerank in external file...{:.0%}'.format(
                    progress * 1.0 / graph_size
                ))
                sys.stdout.flush()

        self.result_file.close()

        if verbosity >= 1:
            sys.stdout.write('\nPageRank calculation finished!')
            sys.stdout.write('See the django log for more details.\n')

        ########################
        #       Stage IV       #
        # Maintenance Routines #
        ########################
        if verbosity >= 1:
            sys.stdout.write('Sorting the temp pagerank file for improved Solr performance...\n')

        # Sort the temp file, creating a new file without the TEMP_EXTENSION value, then delete the temp file.
        os.system('sort -n %s%s > %s' % (self.RESULT_FILE_PATH, self.TEMP_EXTENSION, self.RESULT_FILE_PATH))
        os.remove(self.RESULT_FILE_PATH + self.TEMP_EXTENSION)

        if verbosity >= 1:
            sys.stdout.write('Reloading the external file cache in Solr...\n')
        reload_pagerank_external_file_cache()

        if verbosity >= 1:
            sys.stdout.write('Copying pagerank file to %s, for bulk downloading...\n' % settings.BULK_DATA_DIR)
        shutil.copy(self.RESULT_FILE_PATH, settings.BULK_DATA_DIR)
        if chown:
            user_info = pwd.getpwnam('www-data')
            os.chown(settings.BULK_DATA_DIR + 'external_pagerank', user_info.pw_uid, user_info.pw_gid)
Code example #23
    def make_archive(self, obj_type_str, obj_type, court_attr, api_resource_obj):
        """Generate compressed archives containing the contents of an object
        database.

        There are a few tricks to this, but the main one is that each item in
        the database goes into two files, all.tar.gz and {court}.tar.gz. This
        means that if we want to avoid iterating the database once per file,
        we need to generate all 350+ jurisdiction files simultaneously.

        We do this by making a dict of open file handles and adding each item
        to the correct two files: The all.tar.gz file and the {court}.tar.gz
        file.

        This function takes longer to run than almost any in the codebase and
        has been the subject of some profiling. The top results are as follows:

           ncalls  tottime  percall  cumtime  percall filename:lineno(function)
           138072    5.007    0.000    6.138    0.000 {method 'sub' of '_sre.SRE_Pattern' objects}
             6001    4.452    0.001    4.608    0.001 {method 'execute' of 'psycopg2._psycopg.cursor' objects}
            24900    3.623    0.000    3.623    0.000 {built-in method compress}
        2807031/69163    2.923    0.000    8.216    0.000 copy.py:145(deepcopy)
          2427852    0.952    0.000    1.130    0.000 encoder.py:37(replace)

        Conclusions:
         1. sub is from string_utils.py, where we nuke bad chars. Could remove
            this code by sanitizing all future input to system and fixing any
            current issues. Other than that, it's already optimized.
         1. Next up is DB waiting. Queries could be optimized to make this
            better.
         1. Next is compression, which we've turned down as much as possible
            already (compresslevel=1 for most bulk files =3 for all.tar.gz).
         1. Encoding and copying bring up the rear. Not much to do there, and
            gains are limited. Could install a faster json decoder, but Python
            2.7's json implementation is already written in C. Not sure how to
            remove the gazillion copy's that are happening.
        """
        courts = Court.objects.all()
        self.stdout.write(' - Creating %s bulk %s files '
                          'simultaneously...\n' % (len(courts), obj_type_str))

        mkdir_p('/tmp/bulk/%s' % obj_type_str)

        # Open a gzip'ed tar file for every court
        tar_files = {}
        for court in courts:
            tar_files[court.pk] = tarfile.open(
                '/tmp/bulk/%s/%s.tar.gz' % (obj_type_str, court.pk),
                mode='w:gz',
                compresslevel=1,
            )
        tar_files['all'] = tarfile.open(
            '/tmp/bulk/%s/all.tar.gz' % obj_type_str,
            mode='w:gz',
            compresslevel=3,
        )

        # Make the archives
        qs = obj_type.objects.all()
        item_resource = api_resource_obj()
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            item_list = qs
        for item in item_list:
            json_str = item_resource.serialize(
                None,
                item_resource.full_dehydrate(
                    item_resource.build_bundle(obj=item)),
                'application/json',
            ).encode('utf-8')

            # Add the json str to the two tarballs
            tarinfo = tarfile.TarInfo("%s.json" % item.pk)
            tarinfo.size = len(json_str)
            tarinfo.mtime = time.mktime(item.date_modified.timetuple())
            tarinfo.type = tarfile.REGTYPE

            tar_files[deepgetattr(item, court_attr)].addfile(
                tarinfo, StringIO.StringIO(json_str))
            tar_files['all'].addfile(
                tarinfo, StringIO.StringIO(json_str))

        # Close off all the gzip'ed tar files
        for court in courts:
            tar_files[court.pk].close()
        tar_files['all'].close()

        self.stdout.write('   - all %s bulk files created.\n' % obj_type_str)
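
Every example in this listing passes a Django queryset to queryset_generator, but the helper itself never appears here. Based only on how it is called above (a queryset plus an optional chunksize, iterated lazily), a minimal sketch of such a helper might look like the following; the actual implementation in the courtlistener project may differ.

def queryset_generator(queryset, chunksize=1000):
    """Yield items from a large queryset in primary-key-ordered chunks.

    This is only a sketch inferred from the call sites above: it keeps at
    most `chunksize` rows in memory at a time instead of loading the whole
    table at once.
    """
    last_pk = None
    queryset = queryset.order_by('pk')
    while True:
        chunk = queryset
        if last_pk is not None:
            # Keyset pagination: resume after the last pk we have seen.
            chunk = chunk.filter(pk__gt=last_pk)
        chunk = list(chunk[:chunksize])
        if not chunk:
            break
        for item in chunk:
            yield item
        last_pk = chunk[-1].pk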