def create_docket_pickles(path, court): #check if docket pickle directory exists and pickle generation has completed opinion_reports= glob.glob(os.path.join(path, "*.opinions.*")) try: os.mkdir(os.path.join(path, 'docket_pickles')) except OSError: #delete docket_pickles pass for report in opinion_reports: filebits = open(report).read() dockets = PP.parse_opinions(filebits, court) if dockets: print "Found %s dockets in %s " % (len(dockets), report) for docket in dockets: if len(docket.documents) != 1: raise "This docket has more than one document! docket text: " % docket doc = docket.documents.values()[0] filename = _get_docket_pickle_filename(court, doc['casenum'], doc['doc_num'], doc['attachment_num']) success, msg = IA.pickle_object(docket, filename, os.path.join(path, "docket_pickles")) if not success: print "Error pickling file %s: %s " % (filename, msg)
# Script entry point: upload the entire "coverartarchive" collection
# to the Internet Archive via the InternetArchive wrapper class.
from InternetArchive import *

collection_name = "coverartarchive"
ia = InternetArchive(collection_name)
ia.upload_collection()
def handle_pdf(filebits, court, url):
    """ Write PDF file metadata into the database. """
    # The docid is the only metadata we can recover from the upload URL.
    try:
        docid = docid_from_url_name(url)
    except ValueError:
        logging.warning("handle_pdf: no url available to get docid")
        return "upload: pdf failed. no url supplied."

    # A Document row exists only if we've previously parsed this case's
    # docket; look it up by docid.
    matches = Document.objects.filter(docid=docid)
    try:
        doc = matches[0]
    except IndexError:
        logging.info("handle_pdf: haven't yet seen docket %s" % (docid))
        return "upload: pdf ignored."

    # Sanity check: the stored court must match the submitted one.
    if doc.court != court:
        logging.error("handle_pdf: court mismatch (%s, %s) %s" %
                      (court, doc.court, url))
        return "upload: pdf metadata mismatch."

    casenum = doc.casenum
    docnum = doc.docnum
    subdocnum = doc.subdocnum
    stored_sha1 = doc.sha1

    # Build a docket fragment carrying the new sha1 / availability /
    # upload date, and merge it into the local DB.
    docket = DocketXML.make_docket_for_pdf(filebits, court, casenum,
                                           docnum, subdocnum, available=0)
    DocumentManager.update_local_db(docket)

    if docket.get_document_sha1(docnum, subdocnum) != stored_sha1:
        # File is new to IA or its content changed: upload it, attaching
        # every extra piece of metadata we have on hand.
        doc_meta = docket.get_document_metadict(docnum, subdocnum)
        for meta_key, meta_val in (("pacer_doc_id", doc.docid),
                                   ("pacer_de_seq_num", doc.de_seq_num),
                                   ("pacer_dm_id", doc.dm_id)):
            if meta_val:
                doc_meta[meta_key] = meta_val
        IA.put_file(filebits, court, casenum, docnum, subdocnum, doc_meta)

    # Whether or not the PDF itself was uploaded, push the docket update
    # to IA.
    do_me_up(docket)
    logging.info("handle_pdf: uploaded %s.%s.%s.%s.pdf" %
                 (court, casenum, docnum, subdocnum))

    response = {}
    response["message"] = "pdf uploaded."
    return simplejson.dumps(response)
def do_me_up(docket): ''' Download, merge and update the docket with IA. ''' # Pickle this object for do_me_up by the cron process. court = docket.get_court() casenum = docket.get_casenum() docketname = IACommon.get_docketxml_name(court, casenum) # Check if this docket is already scheduled to be processed. query = PickledPut.objects.filter(filename=docketname) try: ppentry = query[0] except IndexError: # Not already scheduled, so schedule it now. ppentry = PickledPut(filename=docketname, docket=1) try: ppentry.save() except IntegrityError: # Try again. do_me_up(docket) else: # Pickle this object. pickle_success, msg = IA.pickle_object(docket, docketname) if pickle_success: # Ready for processing. ppentry.ready = 1 ppentry.save() logging.info("do_me_up: ready. %s" % (docketname)) else: # Pickle failed, remove from DB. ppentry.delete() logging.error("do_me_up: %s %s" % (msg, docketname)) else: # Already scheduled. # If there is a lock for this case, it's being uploaded. Don't merge now locked = BucketLockManager.lock_exists(court, casenum) if ppentry.ready and not locked: # Docket is waiting to be processed by cron job. # Revert state back to 'not ready' so we can do local merge. ppentry.ready = 0 ppentry.save() # Fetch and unpickle the waiting docket. prev_docket, unpickle_msg = IA.unpickle_object(docketname) if prev_docket: # Do the local merge. prev_docket.merge_docket(docket) # Pickle it back pickle_success, pickle_msg = \ IA.pickle_object(prev_docket, docketname) if pickle_success: # Merged and ready. ppentry.ready = 1 ppentry.save() logging.info("do_me_up: merged and ready. %s" %(docketname)) else: # Re-pickle failed, delete. ppentry.delete() logging.error("do_me_up: re-%s %s" % (pickle_msg, docketname)) else: # Unpickle failed ppentry.delete() IA.delete_pickle(docketname) logging.error("do_me_up: %s %s" % (unpickle_msg, docketname)) # Ignore if in any of the other three possible state... 
# because another cron job is already doing work on this entity # Don't delete DB entry or pickle file. elif ppentry.ready and locked: pass #logging.debug("do_me_up: %s discarded, processing conflict." % # (docketname)) elif not ppentry.ready and not locked: pass #logging.debug("do_me_up: %s discarded, preparation conflict." % # (docketname)) else: logging.error("do_me_up: %s discarded, inconsistent state." % (docketname))
def handle_pdf(filebits, court, url, team_name):
    """ Write PDF file metadata into the database. """
    # The upload URL is our only source for the docid; bail if absent.
    try:
        docid = docid_from_url_name(url)
    except ValueError:
        logging.warning("handle_pdf: no url available to get docid")
        return "upload: pdf failed. no url supplied."

    # We only know this document if its case docket was parsed earlier.
    candidates = Document.objects.filter(docid=docid)
    try:
        doc = candidates[0]
    except IndexError:
        logging.info("handle_pdf: haven't yet seen docket %s" % docid)
        return "upload: pdf ignored because we don't have docket %s" % docid

    # Sanity check against metadata tampering / mixed-up uploads.
    if doc.court != court:
        logging.error("handle_pdf: court mismatch (%s, %s) %s" %
                      (court, doc.court, url))
        return "upload: pdf metadata mismatch."

    casenum = doc.casenum
    docnum = doc.docnum
    subdocnum = doc.subdocnum
    stored_sha1 = doc.sha1

    # Docket fragment with the updated sha1, availability and upload
    # date; merge it into the local DB, crediting team_name.
    docket = DocketXML.make_docket_for_pdf(filebits, court, casenum,
                                           docnum, subdocnum, available=0)
    DocumentManager.update_local_db(docket, team_name=team_name)

    if docket.get_document_sha1(docnum, subdocnum) != stored_sha1:
        # Either IA doesn't have the file yet or its content changed:
        # push it up, with every bit of metadata we can attach.
        doc_meta = docket.get_document_metadict(docnum, subdocnum)
        for meta_key, meta_val in (("pacer_doc_id", doc.docid),
                                   ("pacer_de_seq_num", doc.de_seq_num),
                                   ("pacer_dm_id", doc.dm_id)):
            if meta_val:
                doc_meta[meta_key] = meta_val
        IA.put_file(filebits, court, casenum, docnum, subdocnum, doc_meta)

    # Push the docket update to IA regardless of whether the PDF itself
    # was uploaded.
    do_me_up(docket)
    logging.info("handle_pdf: uploaded %s.%s.%s.%s.pdf" %
                 (court, casenum, docnum, subdocnum))

    return simplejson.dumps({"message": "pdf uploaded."})
def do_me_up(docket): """ Download, merge and update the docket with IA. """ # Pickle this object for do_me_up by the cron process. court = docket.get_court() casenum = docket.get_casenum() docketname = IACommon.get_docketxml_name(court, casenum) # Check if this docket is already scheduled to be processed. query = PickledPut.objects.filter(filename=docketname) try: ppentry = query[0] except IndexError: # Not already scheduled, so schedule it now. ppentry = PickledPut(filename=docketname, docket=1) try: ppentry.save() except IntegrityError: # Try again. do_me_up(docket) else: # Pickle this object. pickle_success, msg = IA.pickle_object(docket, docketname) if pickle_success: # Ready for processing. ppentry.ready = 1 ppentry.save() logging.info("do_me_up: ready. %s" % (docketname)) else: # Pickle failed, remove from DB. ppentry.delete() logging.error("do_me_up: %s %s" % (msg, docketname)) else: # Already scheduled. # If there is a lock for this case, it's being uploaded. Don't merge now locked = BucketLockManager.lock_exists(court, casenum) if ppentry.ready and not locked: # Docket is waiting to be processed by cron job. # Revert state back to 'not ready' so we can do local merge. ppentry.ready = 0 ppentry.save() # Fetch and unpickle the waiting docket. prev_docket, unpickle_msg = IA.unpickle_object(docketname) if prev_docket: # Do the local merge. prev_docket.merge_docket(docket) # Pickle it back pickle_success, pickle_msg = \ IA.pickle_object(prev_docket, docketname) if pickle_success: # Merged and ready. ppentry.ready = 1 ppentry.save() logging.info( "do_me_up: merged and ready. %s" % (docketname)) else: # Re-pickle failed, delete. ppentry.delete() logging.error("do_me_up: re-%s %s" % (pickle_msg, docketname)) else: # Unpickle failed ppentry.delete() IA.delete_pickle(docketname) logging.error("do_me_up: %s %s" % (unpickle_msg, docketname)) # Ignore if in any of the other three possible state... 
# because another cron job is already doing work on this entity # Don't delete DB entry or pickle file. elif ppentry.ready and locked: pass #logging.debug("do_me_up: %s discarded, processing conflict." % # (docketname)) elif not ppentry.ready and not locked: pass #logging.debug("do_me_up: %s discarded, preparation conflict." % # (docketname)) else: logging.error("do_me_up: %s discarded, inconsistent state." % (docketname))
def _upload_document(path, court, document): filename = _get_docket_pickle_filename(court, document['casenum'], document['doc_num'], document['attachment_num']) docket, msg = IA.unpickle_object(filename, os.path.join(path, 'docket_pickles')) if not docket: return False, 'Could not unpickle: %s' % msg casenum = docket.get_casenum() got_lock, nonce_or_message = UM.lock(court, casenum) # We need to: grab a lock if got_lock: print "got the lock: %s" % (nonce_or_message) nonce = nonce_or_message else: return False, "could not get lock: %s" % (nonce_or_message) # Get the existing ia docket, if it exists ia_docket = None ia_docket_orig_string = "" ia_casemeta_orig_hash = "" ia_docketstring, fetcherror = IADirect.get_docket_string(court, casenum) if ia_docketstring: # Got the existing docket-- parse it. ia_docket, parseerror = DocketXML.parse_xml_string(ia_docketstring) if not ia_docket: reason = "could not parse IA docket: %s" % (parseerror) UM.print_unlock_message(UM.unlock(court, casenum, False)) return False, "***Skipping %s.%s: %s... " % (court, casenum, reason), else: # Save the original docket hashes ia_docket_orig_string = ia_docketstring ia_casemeta_orig_hash = hash(pickle.dumps(ia_docket.casemeta)) print "There is a docket for %s, %s! " % (court, casenum) elif fetcherror is IADirect.FETCH_NO_FILE: # Bucket exists but no docket-- ok. pass elif fetcherror is IADirect.FETCH_NO_BUCKET: # Bucket doesn't exist-- either make_bucket failed or not yet ready. # That's okay, we'll make the bucket with the first upload #if casenum not in bucket_made: # If make_bucket failed, try make_bucket again. # print " make bucket...", # make_bucket(casenum) elif fetcherror is IADirect.FETCH_TIMEOUT: # Couldn't contact IA, skip. UM.print_unlock_message(UM.unlock(court, casenum, False)) #TK: Handle retry logic here? return False, "***Skipping %s.%s: IA is down... " % (court, casenum), elif not ia_docketstring: # Unknown fetch error, skip. 
UM.print_unlock_message(UM.unlock(court, casenum, False)) return False, "***Skipping %s.%s: unknown docket fetch error: %s..." % \ (court, casenum, fetcherror), # Step 1b: If necessary, merge the two dockets. if ia_docket: ia_docket.merge_docket(docket) else: ia_docket = docket # Upload the pdf #TK: add some better status updates here, maybe uploading doc 123 of 1234 print " uploading document %s.%s.%s..." % (court, casenum, document['doc_num']), try: doc_filename = os.path.join(path, document['docid'], ".pdf" pdfbits = open(doc_filename)).read() except IOError: UM.print_unlock_message(UM.unlock(court, casenum, False)) return False, " ***Could not open file %s " % doc_filename #TK: probably need to make the bucket before doing this doc_docket = DocketXML.make_docket_for_pdf(pdfbits, court, casenum, docnum, subdocnum) doc_meta = doc_docket.get_document_metadict(docnum, subdocnum) # Only upload the PDF if the hash doesn't match the one in IA. ia_pdfhash = ia_docket.get_document_sha1(docnum, subdocnum) pdfhash = doc_docket.get_document_sha1(docnum, subdocnum) if ia_pdfhash != pdfhash: pdfstatus, pdferror = \ IADirect.put_pdf(pdfbits, court, casenum, docnum, subdocnum, doc_meta) if not pdfstatus: # PUT failed, mark document as unavailable doc_docket.set_document_available(docnum, subdocnum, "0") # TK: handle retry here UM.print_unlock_message(UM.unlock(court, casenum, False)) return False, " fail: %s" % pdferror else: print "done." # Add this document's metadata into the ia_docket ia_docket.merge_docket(doc_docket) # Step 5: Push the docket to IA, if things have changed. 
print " docket upload...", docket_modified = 0 ignore_nonce = 0 ia_docket_merged_string = ia_docket.to_xml() if ia_docket_orig_string != ia_docket_merged_string: # Assign the docket the new nonce from the lock ia_docket.nonce = nonce ia_casemeta_merged_hash = hash(pickle.dumps(ia_docket.casemeta)) casemeta_diff = ia_casemeta_orig_hash != ia_casemeta_merged_hash putstatus, puterror = \ IADirect.put_docket(ia_docket, court, casenum, casemeta_diff=casemeta_diff) UM.print_unlock_message(UM.unlock(court, casenum, modified = False)) return True, "Document uploaded"