Example #1
def upload_docket(docket, nonce):
    """Case should be locked prior to this method"""
    ia_docket, message = _get_docket_from_IA(docket)
    if ia_docket:
        docket.merge_docket(ia_docket)

    # Don't upload if nothing has changed
    if docket == ia_docket:
        return True, 'Unmodified'

    docket.nonce = nonce

    # TK: Check that it's okay to always request that a new bucket be made
    request = IACommon.make_docketxml_request(docket.to_xml(),
                                              docket.get_court(),
                                              docket.get_casenum(),
                                              docket.casemeta,
                                              makenew=True)

    success, msg = _post_request(request)

    if not success:
        logger.error('XML Docket upload for %s.%s failed: %s', docket.get_court(),
                                                                docket.get_casenum(),
                                                                msg)
        return False, msg

    logger.info('XML Docket upload for %s.%s succeeded', docket.get_court(),
                                                          docket.get_casenum())

    # TK: Maybe handle this in a separate function that can deal with HTML?
    # Assuming this is successful, also upload an update to the HTML page
    request = IACommon.make_dockethtml_request(docket.to_html(),
                                               docket.get_court(),
                                               docket.get_casenum(),
                                               docket.casemeta)

    success, msg = _post_request(request)
    if not success:
        logger.error('HTML Docket upload for %s.%s failed: %s', docket.get_court(),
                                                                 docket.get_casenum(),
                                                                 msg)
        return False, msg

    logger.info('HTML Docket upload for %s.%s succeeded', docket.get_court(),
                                                          docket.get_casenum())
    return success, msg
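A note on _post_request, which these examples call but never define: judging from Example #2, the Internet Archive can signal success with HTTP 200/201, which urllib2 surfaces as an HTTPError. A minimal sketch under that assumption (hypothetical; the project's actual helper may differ):

def _post_request(request):
    """Hypothetical: dispatch an IA request, return (success, message)."""
    try:
        response = urllib2.urlopen(request)
    except urllib2.HTTPError, e:
        # Per Example #2, 200 OK and 201 Created both mean success.
        if e.code in (200, 201):
            return True, 'Success'
        return False, 'HTTP error %d' % e.code
    except urllib2.URLError, e:
        return False, 'URL error: %s' % e.reason
    # urlopen returned normally, so the request was accepted.
    return True, 'Success'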
Example #2
def delete_documents_from_docket(court, casenum, documents):
    # Step 1: Get docket and convert into DocketXML
    docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if not docketstring:
        print "Could not find docket on IA, exiting...."
        exit()

    ia_docket, message = DocketXML.parse_xml_string(docketstring)

    if not ia_docket:
        print "Docket parsing error: %s.%s, exiting...." % (court, casenum)
        exit()

    # Step 2: Remove documents from DocketXML object

    for document in documents:
        ia_docket.remove_document(document.docnum, document.subdocnum)

    # Step 3: upload modified xml
    docketbits = ia_docket.to_xml()

    request = IACommon.make_docketxml_request(docketbits, court, casenum,
                                              ia_docket.casemeta)

    success_status = False
    try:
        response = urllib2.urlopen(request)
    except urllib2.HTTPError, e:
        if e.code == 201 or e.code == 200:  # 201 Created: Success!
            print "Updated %s %s docket.xml" % (court, casenum)
            success_status = True
    return success_status
Example #3
def delete_document_from_IA(document):
    request = IACommon.make_pdf_delete_request(document.court, document.casenum,
                                               document.docnum,
                                               document.subdocnum)
    try:
        response = urllib2.urlopen(request)
    except urllib2.HTTPError, e:
        if e.code != 204:
            print "   the response to the delete request was %s. This may not be an error" % e.code
Example #4
def delete_docket_xml_from_IA(court, casenum):
    request = IACommon.make_docketxml_delete_request(court, casenum)
    try:
        response = urllib2.urlopen(request)
    except urllib2.HTTPError, e:
        if e.code != 204:
            print "   the response to the delete request was %s. This may not be an error" % e.code
def add_document_to_blacklist(document):
    BLACKLIST_PATH = "../blacklist"

    f = open(BLACKLIST_PATH, "a")
    f.write(IACommon.get_pdfname(document.court, document.casenum, document.docnum, document.subdocnum) + "\n")
    f.close()
    print "  added document to %s, you may want to add a comment in that file" % BLACKLIST_PATH
Example #7
def _cron_process_docketXML(docket, ppentry):
    ''' Required to have the lock. '''

    court = docket.casemeta["court"]
    casenum = docket.casemeta["pacer_case_num"]

    # Force '0' in the XML on docs that failed to upload.
    _update_docs_availability(docket)

    # The docket filename
    docketname = IACommon.get_docketxml_name(court, casenum)

    # Step 1: Try to fetch the existing docket from IA
    docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if docketstring:
        # Got the existing docket-- put merged docket file.
        ia_docket, parse_msg = DocketXML.parse_xml_string(docketstring)

        if ia_docket:
            put_result, put_msg = _cron_me_up(ia_docket, docket, ppentry)

            print "  %s %s" % (docketname, put_msg)
        else:
            print "  %s docket parsing error: %s" % (docketname, parse_msg)

    elif fetcherror is IADirect.FETCH_NO_FILE:
        # Bucket exists but no docket-- put a new docket file.
        put_result, put_msg = put_docket(docket, court, casenum, ppentry)

        print "  %s put into existing bucket: %s" % (docketname, put_msg)

    elif fetcherror is IADirect.FETCH_NO_BUCKET:
        # Bucket doesn't exist-- make the bucket and put a new docket file.
        put_result, put_msg = put_docket(docket, court, casenum, ppentry,
                                         newbucket=1)

        print "  %s put into new bucket: %s" % (docketname, put_msg)

    elif fetcherror is IADirect.FETCH_URLERROR:
        # Couldn't get the IA docket

        # Unset the processing flag for later
#        ppentry.processing = 0
#        ppentry.save()
        # Leave the pickle file for later
        # Drop Lock Here?

        print "  %s timed out.  wait for next cron." % (docketname)

    else:
        # Unknown fetch error.

        # Unset the processing flag for later
#        ppentry.processing = 0
#        ppentry.save()
        # Drop Lock Here?

        # Leave the pickle file for later
        print "  %s unknown fetch error.  wait for next cron." % (docketname)
Example #8
def archive_docket_xml_locally(court, casenum, directory="archived_dockets"):
    docket_url = IACommon.get_docketxml_url(court, casenum)

    if os.system("wget --quiet --directory-prefix=%s %s" % (directory, docket_url)) != 0:
        print "Could not archive this docket, exiting without trying to delete..."
        exit()

    print " saved docket %s.%s for analysis in %s directory" % (court, casenum, directory)
Example #10
def archive_document_locally(document, directory="blacklisted_documents"):
    doc_url = IACommon.get_pdf_url(document.court, document.casenum,
                                   document.docnum, document.subdocnum)

    if os.system("wget --quiet --directory-prefix=%s %s" % (directory, doc_url)) != 0:
        print "There was an error archiving document (%s.%s.%s.%s), it has been marked as unavailble, but has not been deleted from the Internet Archive" % (document.court, document.casenum, document.docnum, document.subdocnum)
        exit()

    print "    saved document %s.%s for analysis in %s directory" % (document.docnum, document.subdocnum, directory)
Example #11
def add_document_to_blacklist(document):
    BLACKLIST_PATH = "../blacklist"

    f = open(BLACKLIST_PATH, "a")
    f.write(
        IACommon.get_pdfname(document.court, document.casenum, document.docnum,
                             document.subdocnum) + "\n")
    f.close()
    print "  added document to %s, you may want to add a comment in that file" % BLACKLIST_PATH
def check_bucket_ready(court, casenum):
    bucketurl = IACommon.get_bucketcheck_url(court, casenum)

    request = urllib2.Request(bucketurl)

    try:
        response = opener.open(request)
    except urllib2.HTTPError, e: # HTTP Error
        # No bucket exists, probably a 404 code.
        return False, int(e.code)
Example #13
def check_bucket_ready(court, casenum):
    bucketurl = IACommon.get_bucketcheck_url(court, casenum)

    request = urllib2.Request(bucketurl)

    try:
        response = opener.open(request)
    except urllib2.HTTPError, e:  # HTTP Error
        # No bucket exists, probably a 404 code.
        return False, int(e.code)
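Both check_bucket_ready excerpts end inside the except clause, so on a clean response they fall off the end and implicitly return None. Presumably the full function also returns a (success, code) pair on the happy path, mirroring the error branch; a sketch of that assumed remainder:

def check_bucket_ready(court, casenum):
    bucketurl = IACommon.get_bucketcheck_url(court, casenum)
    request = urllib2.Request(bucketurl)
    try:
        response = opener.open(request)  # 'opener' as in the excerpts above
    except urllib2.HTTPError, e:
        # No bucket exists, probably a 404 code.
        return False, int(e.code)
    # Assumed: the bucket responded, so report success with the HTTP status.
    return True, response.getcode()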
Example #14
def _cron_process_PDF(obj, ppentry):
    filename = ppentry.filename
    meta = IACommon.get_meta_from_filename(filename)
    court = meta["court"]
    casenum = meta["casenum"]
    docnum = meta["docnum"]
    subdocnum = meta["subdocnum"]

    invalid_PDF = _is_invalid_pdf(obj, filename)

    # We only want to check for SSNs on valid PDFs;
    # PyPdf doesn't deal well with bad input.
    if not invalid_PDF:
        # SSN privacy check
        has_ssn = _has_ssn(obj, filename)
    else:
        has_ssn = False

    # Blacklist file check
    in_blacklist = _in_blacklist(filename)

    if invalid_PDF or has_ssn or in_blacklist:
        docket = DocketXML.make_docket_for_pdf("",
                                               court,
                                               casenum,
                                               docnum,
                                               subdocnum,
                                               available=0)
        UploadHandler.do_me_up(docket)

        # Delete the entry from the DB
        ppentry.delete()
        # Quarantine the pickle file for analysis
        _quarantine_pickle(filename,
                           ssn=has_ssn,
                           blacklist_file=in_blacklist,
                           invalid_PDF=invalid_PDF)

        return

    put_result, put_msg = _dispatch_put(obj, ppentry)

    if put_result:
        # Put success-- mark this document as available in the DB
        DocumentManager.mark_as_available(filename)

        docket = DocketXML.make_docket_for_pdf("",
                                               court,
                                               casenum,
                                               docnum,
                                               subdocnum,
                                               available=1)
        UploadHandler.do_me_up(docket)

    print "  %s %s" % (filename, put_msg)
Example #15
def upload_document(pdfbits, court, casenum, docnum, subdocnum):
    logger.info('   Uploading document %s.%s.%s.%s' % (court, casenum, docnum, subdocnum))
    request = IACommon.make_pdf_request(pdfbits, court, casenum,
                                        docnum, subdocnum, metadict = {},
                                        makenew=True)
    success, msg = _post_request(request)
    if not success:
        logger.error('   Failed to upload document %s.%s.%s.%s' % (court, casenum, docnum, subdocnum))
        return False, msg
    logger.info('  Uploaded document %s.%s.%s.%s' % (court, casenum, docnum, subdocnum))
    return success, msg
Example #16
def archive_document_locally(document, directory="blacklisted_documents"):
    doc_url = IACommon.get_pdf_url(document.court, document.casenum,
                                   document.docnum, document.subdocnum)

    if os.system("wget --quiet --directory-prefix=%s %s" % (
            directory, doc_url)) != 0:
        print "There was an error archiving document (%s.%s.%s.%s), it has been marked as unavailble, but has not been deleted from the Internet Archive" % (
            document.court, document.casenum, document.docnum, document.subdocnum)
        exit()

    print "    saved document %s.%s for analysis in %s directory" % (
        document.docnum, document.subdocnum, directory)
Example #17
def put_file(filebits, court, casenum, docnum, subdocnum, metadict={}):
    """ PUT the file into a new Internet Archive bucket. """

    request = IACommon.make_pdf_request(filebits, court, casenum,
                                        docnum, subdocnum, metadict)

    # If this file is already scheduled, drop this one. # TK: is this what we want?
    filename = IACommon.get_pdfname(court, casenum, docnum, subdocnum)

    query = PickledPut.objects.filter(filename=filename)
    if query:
        logging.info("put_file: same file already pickled. %s" % filename)
        return "IA PUT failed: the same file is already in the pickle bucket."

    # Add a PickledPut DB entry to schedule the PUT, not yet ready
    ppentry = PickledPut(filename=filename)

    # Fix a race case?
    try:
        ppentry.save()
    except IntegrityError:
        logging.info("put_file: same file already pickled. %s" % filename)
        return "IA PUT failed: the same file is already in the pickle bucket."

    # Pickle the request object into the jar
    pickle_success, message = pickle_object(request, filename)

    if pickle_success:
        # PickledPut now ready for processing.
        ppentry.ready = 1
        ppentry.save()
        logging.info("put_file: ready. %s" % filename)
    else:
        # Could not pickle object, so remove from DB
        logging.warning("put_file: could not pickle PDF. %s" % filename)
        ppentry.delete()

    return message
Example #18
def put_file(filebits, court, casenum, docnum, subdocnum, metadict={}):
    """ PUT the file into a new Internet Archive bucket. """

    request = IACommon.make_pdf_request(filebits, court, casenum, docnum,
                                        subdocnum, metadict)

    # If this file is already scheduled, drop this one. # TK: is this what we want?
    filename = IACommon.get_pdfname(court, casenum, docnum, subdocnum)

    query = PickledPut.objects.filter(filename=filename)
    if query:
        logging.info("put_file: same file already pickled. %s" % filename)
        return "IA PUT failed: the same file is already in the pickle bucket."

    # Add a PickledPut DB entry to schedule the PUT, not yet ready
    ppentry = PickledPut(filename=filename)

    # Fix a race case?
    try:
        ppentry.save()
    except IntegrityError:
        logging.info("put_file: same file already pickled. %s" % filename)
        return "IA PUT failed: the same file is already in the pickle bucket."

    # Pickle the request object into the jar
    pickle_success, message = pickle_object(request, filename)

    if pickle_success:
        # PickledPut now ready for processing.
        ppentry.ready = 1
        ppentry.save()
        logging.info("put_file: ready. %s" % filename)
    else:
        # Could not pickle object, so remove from DB
        logging.warning("put_file: could not pickle PDF. %s" % filename)
        ppentry.delete()

    return message
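A note on the race handling in the two put_file variants above: the initial filter() lookup only narrows the window, since two workers can both pass it before either saves. The save() wrapped in try/except IntegrityError (presumably backed by a unique constraint on filename) is what actually serializes the writers; the loser backs off with the same "already pickled" message.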
Example #19
def delete_document_from_IA(document):
    request = IACommon.make_pdf_delete_request(document.court, document.casenum, document.docnum, document.subdocnum)
    print request
    print request.get_full_url()
    print request.get_method()
    print dir(request)
    try:
        response = urllib2.urlopen(request)
        print response
    except urllib2.HTTPError, e:
        if e.code != 204:
            print "   the response to the delete request was %s. This may not be an error" % e.code
        print "  response: %s" % e.code
def put_docket(docket, court, casenum, casemeta_diff=1):
    docketbits = docket.to_xml()

    request = IACommon.make_docketxml_request(docketbits, court, casenum,
                                              docket.casemeta)

    put_result, put_msg = _dispatch_direct_put(request)

    if put_result:
        cleanup_docket_put(court, casenum, docket, metadiff=casemeta_diff)

    return put_result, put_msg
Example #22
def _cron_process_PDF(obj, ppentry):
    filename = ppentry.filename
    meta = IACommon.get_meta_from_filename(filename)
    court = meta["court"]
    casenum = meta["casenum"]
    docnum = meta["docnum"]
    subdocnum = meta["subdocnum"]

    invalid_PDF = _is_invalid_pdf(obj, filename)

    # We only want to check for SSNs on valid PDFs;
    # PyPdf doesn't deal well with bad input.
    if not invalid_PDF:
        # SSN privacy check
        has_ssn = _has_ssn(obj, filename)
    else:
        has_ssn = False

    # Blacklist file check
    in_blacklist = _in_blacklist(filename)

    if invalid_PDF or has_ssn or in_blacklist:
        docket = DocketXML.make_docket_for_pdf("", court, casenum, docnum,
                                               subdocnum, available=0)
        UploadHandler.do_me_up(docket)

        # Delete the entry from the DB
        ppentry.delete()
        # Quarantine the pickle file for analysis
        _quarantine_pickle(filename, ssn=has_ssn, blacklist_file=in_blacklist,
                           invalid_PDF=invalid_PDF)

        return

    put_result, put_msg = _dispatch_put(obj, ppentry)

    if put_result:
        # Put success-- mark this document as available in the DB
        DocumentManager.mark_as_available(filename)

        docket = DocketXML.make_docket_for_pdf("", court, casenum, docnum,
                                               subdocnum, available=1)
        UploadHandler.do_me_up(docket)

    print "  %s %s" % (filename, put_msg)
Example #23
def delete_document_from_IA(document):
    request = IACommon.make_pdf_delete_request(document.court,
                                               document.casenum,
                                               document.docnum,
                                               document.subdocnum)
    print request
    print request.get_full_url()
    print request.get_method()
    print dir(request)
    try:
        response = urllib2.urlopen(request)
        print response
    except urllib2.HTTPError, e:
        if e.code != 204:
            print "   the response to the delete request was %s. This may not be an error" % e.code
        print "  response: %s" % e.code
Example #24
def mark_as_available(filename):
    docmeta = IACommon.get_meta_from_filename(filename)

    docquery = Document.objects.filter(
        court=docmeta["court"], casenum=docmeta["casenum"], docnum=docmeta["docnum"], subdocnum=docmeta["subdocnum"]
    )

    try:
        docentry = docquery[0]
    except IndexError:
        # Unexpected case.  No Document entry
        logging.error("mark_as_available: no entry for %s." % (filename))
    else:
        docentry.available = 1
        try:
            docentry.save()
        except IntegrityError:
            logging.error("mark_as_available: could not save %s." % (filename))
Example #25
def mark_as_available(filename):
    docmeta = IACommon.get_meta_from_filename(filename)

    docquery = Document.objects.filter(court=docmeta["court"],
                                       casenum=docmeta["casenum"],
                                       docnum=docmeta["docnum"],
                                       subdocnum=docmeta["subdocnum"])

    try:
        docentry = docquery[0]
    except IndexError:
        # Unexpected case.  No Document entry
        logging.error("mark_as_available: no entry for %s." % (filename))
    else:
        docentry.available = 1
        try:
            docentry.save()
        except IntegrityError:
            logging.error("mark_as_available: could not save %s." % (filename))
def get_docket_string(court, casenum):
    docketurl = IACommon.get_docketxml_url(court, casenum)

    request = urllib2.Request(docketurl)

    try:
        response = opener.open(request)
    except urllib2.HTTPError, e: # HTTP Error
        if e.code == 404:
            bits = e.read()
            # IA returns different 404 pages depending on whether the bucket exists.
            # This may be a brittle way to check the difference, but there doesn't seem to be a better way.
            if bits.find(NO_BUCKET_HTML_MESSAGE) > 0:
                return None, FETCH_NO_BUCKET
            # Otherwise, assume the bucket exists
            return None, FETCH_NO_FILE
        else:
            logging.info("get_docket_string: unknown fetch code %d" % e.code)
            return None, FETCH_UNKNOWN
Example #27
def get_docket_string(court, casenum):
    docketurl = IACommon.get_docketxml_url(court, casenum)

    request = urllib2.Request(docketurl)

    try:
        response = opener.open(request)
    except urllib2.HTTPError, e:  # HTTP Error
        if e.code == 404:
            bits = e.read()
            # IA returns different 404 pages depending on whether the bucket exists.
            # This may be a brittle way to check the difference, but there doesn't seem to be a better way.
            if bits.find(NO_BUCKET_HTML_MESSAGE) > 0:
                return None, FETCH_NO_BUCKET
            # Otherwise, assume the bucket exists
            return None, FETCH_NO_FILE
        else:
            logging.info("get_docket_string: unknown fetch code %d" % e.code)
            return None, FETCH_UNKNOWN
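Both get_docket_string excerpts stop inside the HTTPError handling, yet Example #7 also branches on IADirect.FETCH_URLERROR and expects the docket XML string itself on success. A sketch of the presumably elided remainder (hypothetical reconstruction; the success-path error value of None is a guess):

def get_docket_string(court, casenum):
    docketurl = IACommon.get_docketxml_url(court, casenum)
    request = urllib2.Request(docketurl)
    try:
        response = opener.open(request)
    except urllib2.HTTPError, e:
        if e.code == 404:
            bits = e.read()
            if bits.find(NO_BUCKET_HTML_MESSAGE) > 0:
                return None, FETCH_NO_BUCKET
            return None, FETCH_NO_FILE
        logging.info("get_docket_string: unknown fetch code %d" % e.code)
        return None, FETCH_UNKNOWN
    except urllib2.URLError:
        # Timeout or connection failure; Example #7 waits for the next cron.
        return None, FETCH_URLERROR
    # Success: hand back the raw docket XML.
    return response.read(), None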
Example #28
def put_docket(docket, court, casenum, ppentry, newbucket=0, casemeta_diff=1):
    # Put the docket to IA
    docketbits = docket.to_xml()

    request = IACommon.make_docketxml_request(docketbits, court, casenum,
                                              docket.casemeta, newbucket)

    put_result, put_msg = _dispatch_put(request, ppentry)

    if put_result:
        html_put_msg = IADirect.cleanup_docket_put(court,
                                                   casenum,
                                                   docket,
                                                   metadiff=casemeta_diff)
        print "  gov.uscourts.%s.%s.docket.html upload: %s" % (
            court, unicode(casenum), html_put_msg)
        DocumentManager.update_local_db(docket)

    return put_result, put_msg
Example #29
def put_docket(docket, court, casenum, ppentry, newbucket=0, casemeta_diff=1):
    # Put the docket to IA
    docketbits = docket.to_xml()

    request = IACommon.make_docketxml_request(docketbits, court, casenum,
                                              docket.casemeta, newbucket)

    put_result, put_msg = _dispatch_put(request, ppentry)

    if put_result:
        html_put_msg = IADirect.cleanup_docket_put(court, casenum, docket,
                                                   metadiff=casemeta_diff)
        print "  gov.uscourts.%s.%s.docket.html upload: %s" % (court,
                                                               unicode(
                                                                   casenum),
                                                               html_put_msg)
        DocumentManager.update_local_db(docket)

    return put_result, put_msg
Example #30
def _get_documents_dict(court, casenum):
    """ Create a dict containing the info for the docs specified """
    documents = {}

    query = Document.objects.filter(court=court, casenum=casenum)
    if query:
        for document in query:
            if document.docid:
                docmeta = {"casenum": document.casenum,
                           "docnum": document.docnum,
                           "subdocnum": document.subdocnum}

                if document.available:
                    docmeta.update({"filename": IACommon.get_pdf_url(document.court,
                                                 document.casenum,
                                                 document.docnum,
                                                 document.subdocnum),
                                    "timestamp": document.lastdate.strftime("%m/%d/%y")})
                documents[document.docid] = docmeta
    return documents
Example #31
def _get_documents_dict(court, casenum):
    """ Create a dict containing the info for the docs specified """
    documents = {}

    query = Document.objects.filter(court=court, casenum=casenum)
    if query:
        for document in query:
            if document.docid:
                docmeta = {"casenum": document.casenum,
                           "docnum": document.docnum,
                           "subdocnum": document.subdocnum}

                if document.available:
                    docmeta.update(
                        {"filename": IACommon.get_pdf_url(document.court,
                                                          document.casenum,
                                                          document.docnum,
                                                          document.subdocnum),
                         "timestamp": document.lastdate.strftime("%m/%d/%y")})
                documents[document.docid] = docmeta
    return documents
Example #32
def put_pdf(filebits, court, casenum, docnum, subdocnum, metadict={}):
    """ PUT the file into a new Internet Archive bucket. """
    request = IACommon.make_pdf_request(filebits, court, casenum,
                                        docnum, subdocnum, metadict)

    return _dispatch_direct_put(request)
Example #33
    yesterday = datetime.datetime.now() - datetime.timedelta(1)

    old_or_avail_query = doc_query.filter(available=1) | \
                         doc_query.filter(modified__lte=yesterday)
    query = None
    try:
        query = old_or_avail_query[0]
    except IndexError:
        try:
            query = doc_query[0]
        except IndexError:
            query = None
        else:
            ppquery = PickledPut.objects.filter(
                filename=IACommon.get_docketxml_name(court, casenum))
            if len(ppquery) > 0:
                query = None

    if query:
        try:
            # we only have a last date for documents that have been uploaded
            date = query.lastdate.strftime("%m/%d/%y")
        except AttributeError:
            try:
                date = query.modified.strftime("%m/%d/%y")
            except AttributeError:
                date = "Unknown"

        response = {
            "docket_url": IACommon.get_dockethtml_url(court, casenum),
def put_dockethtml(court, casenum, docket):
    dockethtml = docket.to_html()
    request = IACommon.make_dockethtml_request(dockethtml, court, casenum,
                                               docket.casemeta)
    return _dispatch_direct_put(request)
Example #35
def query(request):
    """  Query the database to check which PDF documents we have.

         The json input is {"court": <court>,
                            "urls": <list of PACER doc1 urls>}

         The json output is a set of mappings:
                           {<pacer url>: { "filename": <public url>,
                                           "timestamp": <last time seen> },
                            <pacer url>: ... }
    """
    response = {}

    if request.method != "POST":
        message = "query: Not a POST request."
        logging.error(message)
        return HttpResponse(message)

    try:
        jsonin = simplejson.loads(request.POST["json"])
    except KeyError:
        message = "query: no 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except ValueError:
        message = "query: malformed 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except IOError:
        # Not something we can fix I don't think.  Client fails to send data.
        message = "query: Client read error (Timeout?)"
        logging.warning(message)
        return HttpResponse(message)

    try:
        court = jsonin["court"].strip()
    except KeyError:
        message = "query: missing json 'court' argument."
        logging.warning(message)
        return HttpResponse(message)

    try:
        urls = jsonin["urls"]
    except KeyError:
        message = "query: missing json 'urls' argument."
        logging.warning(message)
        return HttpResponse(message)

    for url in urls:
        # detect show_doc style document links
        sdre = re.search("show_doc\.pl\?(.*)", url)

        if sdre:
            argsstring = sdre.group(1)
            args = argsstring.split("&")
            argsdict = {}

            for arg in args:
                (key, val) = arg.split("=")
                argsdict[key] = val

            # maybe need to add some checks for whether
            # these vars exist in argsdict
            query = Document.objects.filter(court=court) \
                .filter(docnum=argsdict["doc_num"]) \
                .filter(casenum=argsdict["caseid"]) \
                .filter(dm_id=int(argsdict["dm_id"])) \
                .filter(available=1)

        else:
            # otherwise, assume it's a normal doc1 style url
            docid = UploadHandler.docid_from_url_name(url)
            query = Document.objects.filter(docid=docid) \
                .filter(available=1)

        if query:
            query = query[0]
            real_casenum = query.casenum

            response[url] = {
                "filename":
                IACommon.get_pdf_url(court, real_casenum, query.docnum,
                                     query.subdocnum),
                "timestamp":
                query.lastdate.strftime("%m/%d/%y")
            }

            if query.subdocnum == 0:
                subquery = Document.objects.filter(
                    court=court,
                    casenum=query.casenum,
                    docnum=query.docnum,
                    available=1).exclude(subdocnum=0)

                if len(subquery) > 0:
                    response[url]["subDocuments"] = {}

                    for subDoc in subquery:
                        real_sub_casenum = subDoc.casenum
                        response[url]["subDocuments"][subDoc.subdocnum] = {
                            "filename":
                            IACommon.get_pdf_url(court, real_sub_casenum,
                                                 subDoc.docnum,
                                                 subDoc.subdocnum),
                            "timestamp":
                            subDoc.lastdate.strftime("%m/%d/%y")
                        }

    jsonout = simplejson.dumps(response)

    return HttpResponse(jsonout, mimetype="application/json")
Example #36
def query(request):
    """  Query the database to check which PDF documents we have.

         The json input is {"court": <court>,
                            "urls": <list of PACER doc1 urls>}

         The json output is a set of mappings:
                           {<pacer url>: { "filename": <public url>,
                                           "timestamp": <last time seen> },
                            <pacer url>: ... }
    """

    response = {}

    if request.method != "POST":
        message = "query: Not a POST request."
        logging.error(message)
        return HttpResponse(message)

    try:
        jsonin = simplejson.loads(request.POST["json"])
    except KeyError:
        message = "query: no 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except ValueError:
        message = "query: malformed 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except IOError:
        # Not something we can fix I don't think.  Client fails to send data.
        message = "query: Client read error (Timeout?)"
        logging.warning(message)
        return HttpResponse(message)

    try:
        court = jsonin["court"].strip()
    except KeyError:
        message = "query: missing json 'court' argument."
        logging.warning(message)
        return HttpResponse(message)

    try:
        urls = jsonin["urls"]
    except KeyError:
        message = "query: missing json 'urls' argument."
        logging.warning(message)
        return HttpResponse(message)

    for url in urls:

        # detect show_doc style document links
        sdre = re.search("show_doc\.pl\?(.*)",url)

        if sdre:
            argsstring = sdre.group(1)
            args = argsstring.split("&")
            argsdict = {}

            for arg in args:
                (key, val) = arg.split("=")
                argsdict[key] = val

            # maybe need to add some checks for whether
            # these vars exist in argsdict

            query = Document.objects.filter(court=court) \
                .filter(docnum=argsdict["doc_num"]) \
                .filter(casenum=argsdict["caseid"]) \
                .filter(dm_id=int(argsdict["dm_id"])) \
                .filter(available=1)

        else:
            # otherwise, assume it's a normal doc1 style url
            docid = UploadHandler.docid_from_url_name(url)
            query = Document.objects.filter(docid=docid) \
                .filter(available=1)


        if query:
            query = query[0]
            real_casenum = query.casenum

            response[url] = {
                "filename": IACommon.get_pdf_url(court,
                                                 real_casenum,
                                                 query.docnum,
                                                 query.subdocnum),
                "timestamp": query.lastdate.strftime("%m/%d/%y")}


            if query.subdocnum == 0:

                subquery = Document.objects.filter(court=court,
                                                   casenum=query.casenum,
                                                   docnum=query.docnum,
                                                   available=1).exclude(
                                                   subdocnum=0)

                if len(subquery) > 0:
                    response[url]["subDocuments"] = {}

                    for subDoc in subquery:
                        real_sub_casenum = subDoc.casenum
                        response[url]["subDocuments"][subDoc.subdocnum] = {
                                     "filename" : IACommon.get_pdf_url(court,
                                                              real_sub_casenum,
                                                              subDoc.docnum,
                                                              subDoc.subdocnum),
                                     "timestamp": subDoc.lastdate.strftime("%m/%d/%y")}


    jsonout = simplejson.dumps(response)

    return HttpResponse(jsonout, mimetype="application/json")
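A sketch of a client round-trip against this endpoint, built from the shapes in the docstring (hypothetical client code; the endpoint URL and court id are assumptions):

import urllib, urllib2
import simplejson

payload = simplejson.dumps({
    "court": "nysd",  # assumed court id
    "urls": ["show_doc.pl?caseid=12345&doc_num=1&dm_id=678"],
})
postdata = urllib.urlencode({"json": payload})
reply = urllib2.urlopen("http://localhost/recap/query/", postdata)  # assumed URL
mapping = simplejson.loads(reply.read())
# mapping: {<pacer url>: {"filename": <public url>,
#                         "timestamp": <last time seen>,
#                         "subDocuments": {...}}, ...}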
Example #37
def do_me_up(docket):
    ''' Download, merge and update the docket with IA. '''
    # Pickle this object for do_me_up by the cron process.

    court = docket.get_court()
    casenum = docket.get_casenum()

    docketname = IACommon.get_docketxml_name(court, casenum)

    # Check if this docket is already scheduled to be processed.
    query = PickledPut.objects.filter(filename=docketname)

    try:
        ppentry = query[0]
    except IndexError:
        # Not already scheduled, so schedule it now.
        ppentry = PickledPut(filename=docketname, docket=1)

        try:
            ppentry.save()
        except IntegrityError:
            # Try again.
            do_me_up(docket)
        else:
            # Pickle this object.
            pickle_success, msg = IA.pickle_object(docket, docketname)

            if pickle_success:
                # Ready for processing.
                ppentry.ready = 1
                ppentry.save()

                logging.info("do_me_up: ready. %s" % (docketname))
            else:
                # Pickle failed, remove from DB.
                ppentry.delete()
                logging.error("do_me_up: %s %s" % (msg, docketname))

    else:
        # Already scheduled.
        # If there is a lock for this case, it's being uploaded. Don't merge now
        locked = BucketLockManager.lock_exists(court, casenum)
        if ppentry.ready and not locked:
            # Docket is waiting to be processed by cron job.

            # Revert state back to 'not ready' so we can do local merge.
            ppentry.ready = 0
            ppentry.save()

            # Fetch and unpickle the waiting docket.
            prev_docket, unpickle_msg = IA.unpickle_object(docketname)

            if prev_docket:

                # Do the local merge.
                prev_docket.merge_docket(docket)

                # Pickle it back
                pickle_success, pickle_msg = \
                    IA.pickle_object(prev_docket, docketname)

                if pickle_success:
                    # Merged and ready.
                    ppentry.ready = 1
                    ppentry.save()
                    logging.info("do_me_up: merged and ready. %s" %(docketname))
                else:
                    # Re-pickle failed, delete.
                    ppentry.delete()
                    logging.error("do_me_up: re-%s %s" % (pickle_msg,
                                                          docketname))

            else:
                # Unpickle failed
                ppentry.delete()
                IA.delete_pickle(docketname)
                logging.error("do_me_up: %s %s" % (unpickle_msg, docketname))


        # Ignore if in any of the other three possible states...
        #   because another cron job is already doing work on this entity
        # Don't delete DB entry or pickle file.
        elif ppentry.ready and locked:
            pass
            #logging.debug("do_me_up: %s discarded, processing conflict." %
            #              (docketname))
        elif not ppentry.ready and not locked:
            pass
            #logging.debug("do_me_up: %s discarded, preparation conflict." %
            #              (docketname))
        else:
            logging.error("do_me_up: %s discarded, inconsistent state." %
                          (docketname))
Example #38
def make_new_bucket(court, casenum):
    request = IACommon.make_bucket_request(court, casenum, makenew=1)
    return _dispatch_direct_put(request)
Example #40
def _cron_put_pickles():
    # Get uploader credentials.
    uploader_query = Uploader.objects.filter(key=AUTH_KEY)
    try:
        RECAP_UPLOADER_ID = uploader_query[0].id
    except IndexError:
        print "  could not find uploader with key=%s" % AUTH_KEY
        return

    # Get all ready pickles
    query = PickledPut.objects.filter(ready=1, processing=0) \
        .order_by('-filename')

    # Set all ready pickles to the processing state
    # for ppentry in query:
    #    ppentry.processing = 1
    #    ppentry.save()

    # Keep track of court, casenum.  Only lock and unlock once for each case.
    curr_court = None
    curr_casenum = None
    lock_nonce = None

    # Process pickles one at a time.
    for ppentry in query:

        filename = ppentry.filename

        ppmeta = IACommon.get_meta_from_filename(filename)

        court = ppmeta["court"]
        casenum = ppmeta["casenum"]

        # Make sure we have the lock for this case.

        if curr_court == court and curr_casenum == casenum:
            # Same case as the previous ppentry.

            if not lock_nonce:
                # Skip if we don't have the lock already.
                #               ppentry.processing = 0
                #               ppentry.save()
                continue

            # Otherwise, we already have the lock, so continue.

        else:
            # Switching to a new case.

            # Drop the current lock (from previous case), if necessary.
            if curr_court and curr_casenum:
                dropped, errmsg = BucketLockManager.drop_lock(
                    curr_court, curr_casenum, RECAP_UPLOADER_ID, nolocaldb=1)
                if not dropped:
                    print "  %s.%s someone stole my lock?" % \
                          (curr_court, unicode(curr_casenum))

            # Grab new lock
            curr_court = court
            curr_casenum = casenum

            lock_nonce, errmsg = BucketLockManager.get_lock(court,
                                                            casenum,
                                                            RECAP_UPLOADER_ID,
                                                            one_per_uploader=1)
            if not lock_nonce:
                print "  Passing on %s.%s: %s" % (court, casenum, errmsg)

            if not lock_nonce or lock_nonce == 'bigdoc':
                # We don't have a lock, so don't drop the lock in the next loop
                curr_court = None
                curr_casenum = None
                continue

        # We'll always have the lock here.

        # Unpickle the object
        obj, unpickle_msg = unpickle_object(filename)

        # Two cases for the unpickled object: Request or DocketXML
        if obj and ppentry.docket:
            print "Processing docket: %s" % filename
            _cron_process_docketXML(obj, ppentry)

        elif obj:
            # Dispatch the PUT request

            _cron_process_PDF(obj, ppentry)

        else:
            # Unpickling failed
            # If unpickling fails, it could mean that another cron job
            # has already finished this PP - not sure how to distinguish this
            print "  %s %s (Another cron job completed?)" % (filename,
                                                             unpickle_msg)

            # Delete the entry from the DB
            ppentry.delete()
            # Delete the pickle file
            delete_pickle(filename)

    # Drop last lock
    if curr_court and curr_casenum:
        dropped, errmsg = BucketLockManager.drop_lock(curr_court,
                                                      curr_casenum,
                                                      RECAP_UPLOADER_ID,
                                                      nolocaldb=1)
        if not dropped:
            print "  %s.%s someone stole my lock??" % (court, unicode(casenum))
def put_casemeta(court, casenum, metadict={}):
    request = IACommon.make_casemeta_request(court, casenum, metadict)
    return _dispatch_direct_put(request)
Example #43
def do_me_up(docket):
    """ Download, merge and update the docket with IA. """
    # Pickle this object for do_me_up by the cron process.

    court = docket.get_court()
    casenum = docket.get_casenum()

    docketname = IACommon.get_docketxml_name(court, casenum)

    # Check if this docket is already scheduled to be processed.
    query = PickledPut.objects.filter(filename=docketname)

    try:
        ppentry = query[0]
    except IndexError:
        # Not already scheduled, so schedule it now.
        ppentry = PickledPut(filename=docketname, docket=1)

        try:
            ppentry.save()
        except IntegrityError:
            # Try again.
            do_me_up(docket)
        else:
            # Pickle this object.
            pickle_success, msg = IA.pickle_object(docket, docketname)

            if pickle_success:
                # Ready for processing.
                ppentry.ready = 1
                ppentry.save()

                logging.info("do_me_up: ready. %s" % (docketname))
            else:
                # Pickle failed, remove from DB.
                ppentry.delete()
                logging.error("do_me_up: %s %s" % (msg, docketname))

    else:
        # Already scheduled.
        # If there is a lock for this case, it's being uploaded. Don't merge now
        locked = BucketLockManager.lock_exists(court, casenum)
        if ppentry.ready and not locked:
            # Docket is waiting to be processed by cron job.

            # Revert state back to 'not ready' so we can do local merge.
            ppentry.ready = 0
            ppentry.save()

            # Fetch and unpickle the waiting docket.
            prev_docket, unpickle_msg = IA.unpickle_object(docketname)

            if prev_docket:

                # Do the local merge.
                prev_docket.merge_docket(docket)

                # Pickle it back
                pickle_success, pickle_msg = \
                    IA.pickle_object(prev_docket, docketname)

                if pickle_success:
                    # Merged and ready.
                    ppentry.ready = 1
                    ppentry.save()
                    logging.info(
                        "do_me_up: merged and ready. %s" % (docketname))
                else:
                    # Re-pickle failed, delete.
                    ppentry.delete()
                    logging.error("do_me_up: re-%s %s" % (pickle_msg,
                                                          docketname))

            else:
                # Unpickle failed
                ppentry.delete()
                IA.delete_pickle(docketname)
                logging.error("do_me_up: %s %s" % (unpickle_msg, docketname))


        # Ignore if in any of the other three possible states...
        # because another cron job is already doing work on this entity
        # Don't delete DB entry or pickle file.
        elif ppentry.ready and locked:
            pass
            #logging.debug("do_me_up: %s discarded, processing conflict." %
            #              (docketname))
        elif not ppentry.ready and not locked:
            pass
            #logging.debug("do_me_up: %s discarded, preparation conflict." %
            #              (docketname))
        else:
            logging.error("do_me_up: %s discarded, inconsistent state." %
                          (docketname))
Example #44
                  .order_by('-lastdate', '-modified')

    yesterday = datetime.datetime.now() - datetime.timedelta(1)

    old_or_avail_query = doc_query.filter(available=1) \
                         | doc_query.filter(modified__lte=yesterday)
    query = None
    try:
        query = old_or_avail_query[0]
    except IndexError:
        try:
            query = doc_query[0]
        except IndexError:
            query = None
        else:
            ppquery = PickledPut.objects.filter(filename=IACommon.get_docketxml_name(court, casenum))
            if len(ppquery) > 0:
                query = None

    if query:
        try:
            # we only have a last date for documents that have been uploaded
            date = query.lastdate.strftime("%m/%d/%y")
        except AttributeError:
            try:
                date = query.modified.strftime("%m/%d/%y")
            except AttributeError:
                date = "Unknown"
Example #45
def _cron_process_docketXML(docket, ppentry):
    """ Required to have the lock. """

    court = docket.casemeta["court"]
    casenum = docket.casemeta["pacer_case_num"]

    # Force '0' in the XML on docs that failed to upload.
    _update_docs_availability(docket)

    # The docket filename
    docketname = IACommon.get_docketxml_name(court, casenum)

    # Step 1: Try to fetch the existing docket from IA
    docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if docketstring:
        # Got the existing docket-- put merged docket file.
        ia_docket, parse_msg = DocketXML.parse_xml_string(docketstring)

        if ia_docket:
            put_result, put_msg = _cron_me_up(ia_docket, docket, ppentry)

            print "  %s %s" % (docketname, put_msg)
        else:
            print "  %s docket parsing error: %s" % (docketname, parse_msg)

    elif fetcherror is IADirect.FETCH_NO_FILE:
        # Bucket exists but no docket-- put a new docket file.
        put_result, put_msg = put_docket(docket, court, casenum, ppentry)

        print "  %s put into existing bucket: %s" % (docketname, put_msg)

    elif fetcherror is IADirect.FETCH_NO_BUCKET:
        # Bucket doesn't exist-- make the bucket and put a new docket file.
        put_result, put_msg = put_docket(docket,
                                         court,
                                         casenum,
                                         ppentry,
                                         newbucket=1)

        print "  %s put into new bucket: %s" % (docketname, put_msg)

    elif fetcherror is IADirect.FETCH_URLERROR:
        # Couldn't get the IA docket

        # Unset the processing flag for later
        # ppentry.processing = 0
        # ppentry.save()
        # Leave the pickle file for later
        # Drop Lock Here?

        print "  %s timed out.  wait for next cron." % (docketname)

    else:
        # Unknown fetch error.

        # Unset the processing flag for later
        # ppentry.processing = 0
        # ppentry.save()
        # Drop Lock Here?

        # Leave the pickle file for later
        print "  %s unknown fetch error.  wait for next cron." % (docketname)
Example #46
def _cron_put_pickles():
    # Get uploader credentials.
    uploader_query = Uploader.objects.filter(key=AUTH_KEY)
    try:
        RECAP_UPLOADER_ID = uploader_query[0].id
    except IndexError:
        print "  could not find uploader with key=%s" % AUTH_KEY
        return

    # Get all ready pickles
    query = PickledPut.objects.filter(ready=1, processing=0) \
                              .order_by('-filename')

    # Set all ready pickles to the processing state
    #for ppentry in query:
    #    ppentry.processing = 1
    #    ppentry.save()

    # Keep track of court, casenum.  Only lock and unlock once for each case.
    curr_court = None
    curr_casenum = None
    lock_nonce = None

    # Process pickles one at a time.
    for ppentry in query:

        filename = ppentry.filename

        ppmeta = IACommon.get_meta_from_filename(filename)

        court = ppmeta["court"]
        casenum = ppmeta["casenum"]

        # Make sure we have the lock for this case.

        if curr_court == court and curr_casenum == casenum:
            # Same case as the previous ppentry.

            if not lock_nonce:
                # Skip if we don't have the lock already.
#               ppentry.processing = 0
#               ppentry.save()
                continue

            # Otherwise, we already have the lock, so continue.

        else:
            # Switching to a new case.

            # Drop the current lock (from previous case), if necessary.
            if curr_court and curr_casenum:
                dropped, errmsg = BucketLockManager.drop_lock(curr_court,
                                                              curr_casenum,
                                                              RECAP_UPLOADER_ID,
                                                              nolocaldb=1)
                if not dropped:
                    print "  %s.%s someone stole my lock?" % \
                        (curr_court, unicode(curr_casenum))

            # Grab new lock
            curr_court = court
            curr_casenum = casenum

            lock_nonce, errmsg = BucketLockManager.get_lock(court, casenum,
                                                            RECAP_UPLOADER_ID,
                                                            one_per_uploader=1)

            if not lock_nonce:
                print "  Passing on %s.%s: %s" % (court, casenum, errmsg)

                # We don't have a lock, so don't drop the lock in the next loop
                curr_court = None
                curr_casenum = None
                continue

        # We'll always have the lock here.

        # Unpickle the object
        obj, unpickle_msg = unpickle_object(filename)

        # Two cases for the unpickled object: Request or DocketXML
        if obj and ppentry.docket:
            _cron_process_docketXML(obj, ppentry)

        elif obj:
            # Dispatch the PUT request

            _cron_process_PDF(obj, ppentry)

        else:
            # Unpickling failed
            # If unpickling fails, it could mean that another cron job
            # has already finished this PP - not sure how to distinguish this
            print "  %s %s (Another cron job completed?)" % (filename,
                                                             unpickle_msg)

            # Delete the entry from the DB
            ppentry.delete()
            # Delete the pickle file
            delete_pickle(filename)

    # Drop last lock
    if curr_court and curr_casenum:
        dropped, errmsg = BucketLockManager.drop_lock(curr_court, curr_casenum,
                                                      RECAP_UPLOADER_ID,
                                                      nolocaldb=1)
        if not dropped:
            print "  %s.%s someone stole my lock??" % (court, unicode(casenum))
Example #48
    yesterday = datetime.datetime.now() - datetime.timedelta(1)

    old_or_avail_query = doc_query.filter(available=1) | \
                         doc_query.filter(modified__lte=yesterday)
    query = None
    try:
        query = old_or_avail_query[0]
    except IndexError:
        try:
            query = doc_query[0]
        except IndexError:
            query = None
        else:
            ppquery = PickledPut.objects.filter(
                filename=IACommon.get_docketxml_name(court, casenum))
            if len(ppquery) > 0:
                query = None

    if query:
        try:
            # we only have a last date for documents that have been uploaded
            date = query.lastdate.strftime("%m/%d/%y")
        except AttributeError:
            try:
                date = query.modified.strftime("%m/%d/%y")
            except AttributeError:
                date = "Unknown"

        response = {
            "docket_url": IACommon.get_dockethtml_url(court,
Example #49
def put_pdf(filebits, court, casenum, docnum, subdocnum, metadict={}):
    """ PUT the file into a new Internet Archive bucket. """
    request = IACommon.make_pdf_request(filebits, court, casenum, docnum,
                                        subdocnum, metadict)

    return _dispatch_direct_put(request)