Example #1
0
def _cron_fetch_update(lock):
    court = unicode(lock.court)
    casenum = unicode(lock.casenum)
    nonce = unicode(lock.nonce)

    docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if not docketstring:
        # Couldn't get the docket.  Try again later.

        if nonce:
            BucketLockManager.try_lock_later(lock)
        else:
            lock.delete()
        print "  %s.%s couldn't fetch the docket: %d" % (court, casenum,
                                                         fetcherror)
        return

    ia_docket, message = DocketXML.parse_xml_string(docketstring)

    if not ia_docket:
        # Docket parsing error.

        if nonce:
            BucketLockManager.try_lock_later(lock)
        else:
            lock.delete()
        print "  %s.%s docket parsing error: %s" % (court, casenum, message)
        return
    elif ia_docket.nonce == nonce or not nonce:
        # Got the docket and it is either:
        # 1. up-to-date (nonce match), or
        #  2. expired (ignore nonce)
        # In both scenarios, update the local DB.
        DocumentManager.update_local_db(ia_docket, ignore_available=0)

        print "  %s.%s fetched and DB updated." % (court, casenum)

        ia_docket_orig_hash = hash(pickle.dumps(ia_docket))

        local_docket = DocumentManager.create_docket_from_local_documents(
            court, casenum)

        if local_docket:
            ia_docket.merge_docket(local_docket)

        ia_docket_after_local_merge_hash = hash(pickle.dumps(ia_docket))

        if ia_docket_orig_hash != ia_docket_after_local_merge_hash:
            print " After fetch, some locally stored information was " \
                  "missing from %s.%s. Local info addition scheduled." % (
                      court, casenum)
            UploadHandler.do_me_up(ia_docket)

        # Remove the lock.
        lock.delete()
    else:
        # Got the docket but it is not update to date.  Try again later.
        BucketLockManager.try_lock_later(lock)
        print "  %s.%s fetched, wait more." % (court, casenum)
Example #2
0
def _cron_fetch_update(lock):
    """Fetch the IA docket for `lock`'s case and sync the local DB.

    Cron worker for a single bucket lock.  On fetch or parse failure
    the lock is retried later (or deleted when no nonce is set).  On
    success the local DB is updated, any locally stored info missing
    from the IA docket is scheduled for upload, and the lock is
    released.  A stale docket (nonce mismatch) is retried later.
    """
    court = unicode(lock.court)
    casenum = unicode(lock.casenum)
    nonce = unicode(lock.nonce)

    docketstring, fetcherror = IADirect.get_docket_string(court, casenum)

    if not docketstring:
        # Couldn't get the docket.  Try again later.
        # A nonce means an update is still expected, so retry;
        # otherwise abandon the lock.
        if nonce:
            BucketLockManager.try_lock_later(lock)
        else:
            lock.delete()
        print "  %s.%s couldn't fetch the docket: %d" % (court, casenum,
                                                         fetcherror)
        return

    ia_docket, message = DocketXML.parse_xml_string(docketstring)

    if not ia_docket:
        # Docket parsing error.  Same retry-or-abandon policy as above.
        if nonce:
            BucketLockManager.try_lock_later(lock)
        else:
            lock.delete()
        print "  %s.%s docket parsing error: %s" % (court, casenum,
                                                    message)
        return
    elif ia_docket.nonce == nonce or not nonce:
        # Got the docket and it is either:
        #  1. up-to-date (nonce match), or
        #  2. expired (ignore nonce)
        # In both scenarios, update the local DB.
        DocumentManager.update_local_db(ia_docket, ignore_available=0)

        print "  %s.%s fetched and DB updated." % (court, casenum)

        # Hash the pickled docket before and after merging local info to
        # detect whether the merge added anything IA doesn't have yet.
        ia_docket_orig_hash = hash(pickle.dumps(ia_docket))

        local_docket = DocumentManager.create_docket_from_local_documents(court, casenum)

        if local_docket:
            ia_docket.merge_docket(local_docket)

        ia_docket_after_local_merge_hash = hash(pickle.dumps(ia_docket))

        if ia_docket_orig_hash != ia_docket_after_local_merge_hash:
            print " After fetch, some locally stored information was missing from %s.%s. Local info addition scheduled."  % (court, casenum)
            UploadHandler.do_me_up(ia_docket)

        # Remove the lock.
        lock.delete()
    else:
        # Got the docket but it is not up to date yet.  Try again later.
        BucketLockManager.try_lock_later(lock)
        print "  %s.%s fetched, wait more." % (court, casenum)
Example #3
0
def _cron_process_PDF(obj, ppentry):
    filename = ppentry.filename
    meta = IACommon.get_meta_from_filename(filename)
    court = meta["court"]
    casenum = meta["casenum"]
    docnum = meta["docnum"]
    subdocnum = meta["subdocnum"]

    invalid_PDF = _is_invalid_pdf(obj, filename)

    # We only want to check for ssns on valid PDFs
    # PyPdf doesn't deal well with bad input
    if not invalid_PDF:
        # SSN privacy check
        has_ssn = _has_ssn(obj, filename)
    else:
        has_ssn = False

    # Blacklist file check
    in_blacklist = _in_blacklist(filename)

    if invalid_PDF or has_ssn or in_blacklist:
        docket = DocketXML.make_docket_for_pdf("",
                                               court,
                                               casenum,
                                               docnum,
                                               subdocnum,
                                               available=0)
        UploadHandler.do_me_up(docket)

        # Delete the entry from the DB
        ppentry.delete()
        # Quarantine the pickle file for analysis
        _quarantine_pickle(filename,
                           ssn=has_ssn,
                           blacklist_file=in_blacklist,
                           invalid_PDF=invalid_PDF)

        return

    put_result, put_msg = _dispatch_put(obj, ppentry)

    if put_result:
        # Put success-- mark this document as available in the DB
        DocumentManager.mark_as_available(filename)

        docket = DocketXML.make_docket_for_pdf("",
                                               court,
                                               casenum,
                                               docnum,
                                               subdocnum,
                                               available=1)
        UploadHandler.do_me_up(docket)

    print "  %s %s" % (filename, put_msg)
def mark_document_as_unavailable(document):
    """Flag `document` as unavailable locally and on its IA docket."""
    document.available = 0
    # Bumping lastdate ensures archive.recapthelaw picks up the update.
    document.lastdate = datetime.datetime.now()
    document.save()

    docket = DocketXML.make_docket_for_pdf(
        "", document.court, document.casenum, document.docnum,
        document.subdocnum, available=0)
    UploadHandler.do_me_up(docket)
Example #5
0
def mark_document_as_unavailable(document):
    """Flag `document` as unavailable locally and on its IA docket.

    Clears the availability flag, bumps lastdate so downstream mirrors
    see the change, saves the row, then pushes an available=0 docket
    entry to IA via UploadHandler.
    """
    # if not document.available:
    #    print "Exiting: This document isn't currently available on IA"
    #    print usage()
    #    exit()

    document.available = 0
    document.lastdate = datetime.datetime.now()  # this ensures that the archive.recapthelaw will get the update
    document.save()

    docket = DocketXML.make_docket_for_pdf("", document.court,
                                           document.casenum, document.docnum,
                                           document.subdocnum, available=0)
    UploadHandler.do_me_up(docket)
Example #6
0
def _cron_process_PDF(obj, ppentry):

    filename = ppentry.filename
    meta = IACommon.get_meta_from_filename(filename)
    court = meta["court"]
    casenum = meta["casenum"]
    docnum = meta["docnum"]
    subdocnum = meta["subdocnum"]

    invalid_PDF = _is_invalid_pdf(obj, filename)

    # We only want to check for ssns on valid PDFs
    # PyPdf doesn't deal well with bad input
    if not invalid_PDF:
       # SSN privacy check
       has_ssn = _has_ssn(obj, filename)
    else:
       has_ssn = False

    # Blacklist file check
    in_blacklist = _in_blacklist(filename)

    if invalid_PDF or has_ssn or in_blacklist:

        docket = DocketXML.make_docket_for_pdf("", court, casenum, docnum,
                                               subdocnum, available=0)
        UploadHandler.do_me_up(docket)

        # Delete the entry from the DB
        ppentry.delete()
        # Quarantine the pickle file for analysis
        _quarantine_pickle(filename, ssn=has_ssn, blacklist_file= in_blacklist, invalid_PDF= invalid_PDF)

        return


    put_result, put_msg = _dispatch_put(obj, ppentry)

    if put_result:
        # Put success-- mark this document as available in the DB
        DocumentManager.mark_as_available(filename)

        docket = DocketXML.make_docket_for_pdf("", court, casenum, docnum,
                                               subdocnum, available=1)
        UploadHandler.do_me_up(docket)


    print "  %s %s" % (filename, put_msg)
Example #7
0
def query(request):
    """  Query the database to check which PDF documents we have.

         The json input is {"court": <court>,
                            "urls": <list of PACER doc1 urls>}

         The json output is a set of mappings:
                           {<pacer url>: { "filename": <public url>,
                                           "timestamp": <last time seen> },
                            <pacer url>: ... }

         Top-level documents (subdocnum == 0) additionally carry a
         "subDocuments" mapping of their available attachments.
    """
    response = {}

    if request.method != "POST":
        message = "query: Not a POST request."
        logging.error(message)
        return HttpResponse(message)

    try:
        jsonin = simplejson.loads(request.POST["json"])
    except KeyError:
        message = "query: no 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except ValueError:
        message = "query: malformed 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except IOError:
        # Not something we can fix I don't think.  Client fails to send data.
        message = "query: Client read error (Timeout?)"
        logging.warning(message)
        return HttpResponse(message)

    try:
        court = jsonin["court"].strip()
    except KeyError:
        message = "query: missing json 'court' argument."
        logging.warning(message)
        return HttpResponse(message)

    try:
        urls = jsonin["urls"]
    except KeyError:
        message = "query: missing json 'urls' argument."
        logging.warning(message)
        return HttpResponse(message)

    for url in urls:
        # detect show_doc style document links (raw string: the pattern
        # contains regex escapes)
        sdre = re.search(r"show_doc\.pl\?(.*)", url)

        if sdre:
            # Parse the query-string arguments into a dict.
            argsdict = {}
            for arg in sdre.group(1).split("&"):
                (key, val) = arg.split("=")
                argsdict[key] = val

            # maybe need to add some checks for whether
            # these vars exist in argsdict
            matches = Document.objects.filter(court=court) \
                .filter(docnum=argsdict["doc_num"]) \
                .filter(casenum=argsdict["caseid"]) \
                .filter(dm_id=int(argsdict["dm_id"])) \
                .filter(available=1)

        else:
            # otherwise, assume it's a normal doc1 style url
            docid = UploadHandler.docid_from_url_name(url)
            matches = Document.objects.filter(docid=docid) \
                .filter(available=1)

        if matches:
            doc = matches[0]

            response[url] = {
                "filename": IACommon.get_pdf_url(court,
                                                 doc.casenum,
                                                 doc.docnum,
                                                 doc.subdocnum),
                "timestamp": doc.lastdate.strftime("%m/%d/%y")}

            if doc.subdocnum == 0:
                # Attach any available sub-documents of this document.
                subquery = Document.objects.filter(court=court,
                                                   casenum=doc.casenum,
                                                   docnum=doc.docnum,
                                                   available=1).exclude(
                                                   subdocnum=0)

                if len(subquery) > 0:
                    response[url]["subDocuments"] = {}

                    for subDoc in subquery:
                        response[url]["subDocuments"][subDoc.subdocnum] = {
                            "filename": IACommon.get_pdf_url(
                                court,
                                subDoc.casenum,
                                subDoc.docnum,
                                subDoc.subdocnum),
                            "timestamp": subDoc.lastdate.strftime("%m/%d/%y")}

    jsonout = simplejson.dumps(response)

    return HttpResponse(jsonout, mimetype="application/json")
Example #8
0
def upload(request):
    """ Public upload view for all incoming data.

    Expects a POST with a 'data' file plus 'court', optional 'casenum',
    'mimetype', and optional 'url' parameters; hands everything to
    UploadHandler.handle_upload and returns its message.
    """
    if request.method != "POST":
        message = "upload: Not a POST request."
        logging.error(message)
        return HttpResponse(message)

    try:
        if not request.FILES:
            message = "upload: No request.FILES attribute."
            logging.error(message)
            return HttpResponse(message)
    except IOError:
        # Client-side read failure while sending data; nothing we can fix.
        message = "Client read error (Timeout?)"
        logging.warning("upload: %s" % message)
        return HttpResponse(message)
    except SystemError:
        message = "Could not parse POST arguments."
        logging.warning("uploads: %s" % message)
        return HttpResponse(message)

    if "data" in request.FILES:
        data = request.FILES["data"]
    elif "data_file" in request.FILES:
        # TK: 'data_file' is only used in testing - get rid of me
        data = request.FILES["data_file"]
    else:
        message = "upload: No FILES 'data' attribute."
        logging.error(message)
        return HttpResponse(message)

    if "court" not in request.POST:
        message = "upload: No POST 'court' attribute."
        logging.error(message)
        return HttpResponse(message)
    court = request.POST["court"].strip()

    casenum = None
    if request.POST.get("casenum"):
        casenum = request.POST["casenum"].strip()
        # Case numbers look like '123' or '123-456'.
        if not re.match(r'\d+(-\d+)?', casenum):
            message = "upload: 'casenum' invalid: %s" % \
                request.POST["casenum"]
            logging.error(message)
            return HttpResponse(message)

    mimetype = request.POST.get("mimetype")
    if mimetype is None:
        message = "upload: No POST 'mimetype' attribute."
        logging.error(message)
        return HttpResponse(message)
    mimetype = mimetype.strip()

    url = request.POST.get("url")
    if url is not None:
        url = url.strip()

    message = UploadHandler.handle_upload(data, court, casenum,
                                          mimetype, url)

    return HttpResponse(message)
Example #9
0
        logging.error(message)
        return HttpResponse(message)

    # Necessary to preserve backwards compatibility with 0.6
    #  This param prevents tons of garbage from being printed to
    #  the error console after an Adddocmeta request
    try:
        add_case_info = request.POST["add_case_info"]
    except KeyError:
        add_case_info = None


    DocumentManager.handle_adddocmeta(docid, court, casenum, de_seq_num,
                                      dm_id, docnum, subdocnum)
    if add_case_info:
        response = {"documents": UploadHandler._get_documents_dict(court, casenum),
                  "message": "adddocmeta: DB updated for docid=%s" % (docid) }
        message = simplejson.dumps(response)
    else:
        message = "adddocmeta: DB updated for docid=%s" % (docid)

    return HttpResponse(message)

def lock(request):

    try:
        key = request.GET["key"].strip()
        court = request.GET["court"].strip()
        casenum = request.GET["casenum"].strip()
        one_per_uploader = 1 if request.GET.get('one_per_uploader') else 0
    except KeyError:
            "      The contents of filename should have a single case per line, each identified by 'court casenum'\n "
        )
        sys.exit(1)

    cases_to_repair = read_in_cases_to_repair(sys.argv[1])

    for case in cases_to_repair:
        court = case[0]
        casenum = case[1]

        print "Repairing case %s.%s...." % (court, casenum)

        docket = DocumentManager.create_docket_from_local_documents(
            court, casenum)

        if docket:
            # this will merge our docket with existing one on IA
            UploadHandler.do_me_up(docket)
        else:
            print " Could not create docket from local documents for %s %s" % (
                court, casenum)

#  for each case, create docket fromlocal

#  call do_me_up(docket)
#  download ia_docket
#
#  merge ia docket and local docket
#
#  if there is a difference, schedule the docket for upload (created a pickled put?)
Example #11
0
def upload(request):
    """ Public upload view for all incoming data.

    Expects a POST with a 'data' file plus 'court', optional 'casenum',
    'mimetype', optional 'url', and optional 'team_name' parameters;
    hands everything to UploadHandler.handle_upload and returns its
    message.
    """

    if request.method != "POST":
        message = "upload: Not a POST request."
        logging.error(message)
        return HttpResponse(message)

    try:
        if not request.FILES:
            message = "upload: No request.FILES attribute."
            logging.error(message)
            return HttpResponse(message)
    except IOError:
        # Not something we can fix I don't think.  Client fails to send data.
        message = "Client read error (Timeout?)"
        logging.warning("upload: %s" % message)
        return HttpResponse(message)
    except SystemError:
        message = "Could not parse POST arguments."
        logging.warning("uploads: %s" % message)
        return HttpResponse(message)

    try:
        data = request.FILES["data"]
    except KeyError:
        try:
            # TK: Only used in testing - get rid of me
            data = request.FILES["data_file"]
        except KeyError:
            message = "upload: No FILES 'data' attribute."
            logging.error(message)
            return HttpResponse(message)

    try:
        court = request.POST["court"]
    except KeyError:
        message = "upload: No POST 'court' attribute."
        logging.error(message)
        return HttpResponse(message)
    else:
        court = court.strip()

    if request.POST.get("casenum"):
        casenum = request.POST["casenum"].strip()
        # Case numbers look like '123' or '123-456'; colons are rejected
        # explicitly below.
        casenum_re = re.compile(r'\d+(-\d+)?')
        if not casenum_re.match(casenum) or ":" in casenum:
            message = "upload: 'casenum' invalid: %s" % request.POST["casenum"]
            logging.error(message)
            return HttpResponse(message)
    else:
        casenum = None

    try:
        mimetype = request.POST["mimetype"].strip()
    except KeyError:
        message = "upload: No POST 'mimetype' attribute."
        logging.error(message)
        return HttpResponse(message)

    try:
        url = request.POST["url"].strip()
    except KeyError:
        url = None

    try:
        team_name = request.POST['team_name'].strip()
    except KeyError:
        team_name = None

    message = UploadHandler.handle_upload(data, court, casenum, mimetype, url,
                                          team_name)

    return HttpResponse(message)
Example #12
0
def query(request):
    """  Query the database to check which PDF documents we have.

         The json input is {"court": <court>,
                            "urls": <list of PACER doc1 urls>}

         The json output is a set of mappings:
                           {<pacer url>: { "filename": <public url>,
                                           "timestamp": <last time seen> },
                            <pacer url>: ... }

         Top-level documents (subdocnum == 0) additionally carry a
         "subDocuments" mapping of their available attachments.
    """
    response = {}

    if request.method != "POST":
        message = "query: Not a POST request."
        logging.error(message)
        return HttpResponse(message)

    try:
        jsonin = simplejson.loads(request.POST["json"])
    except KeyError:
        message = "query: no 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except ValueError:
        message = "query: malformed 'json' POST argument"
        logging.warning(message)
        return HttpResponse(message)
    except IOError:
        # Not something we can fix I don't think.  Client fails to send data.
        message = "query: Client read error (Timeout?)"
        logging.warning(message)
        return HttpResponse(message)

    try:
        court = jsonin["court"].strip()
    except KeyError:
        message = "query: missing json 'court' argument."
        logging.warning(message)
        return HttpResponse(message)

    try:
        urls = jsonin["urls"]
    except KeyError:
        message = "query: missing json 'urls' argument."
        logging.warning(message)
        return HttpResponse(message)

    for url in urls:
        # detect show_doc style document links
        # NOTE(review): pattern should be a raw string (r"...") to avoid
        # the '\.'/'\?' escape ambiguity -- confirm and fix separately.
        sdre = re.search("show_doc\.pl\?(.*)", url)

        if sdre:
            # Parse the query-string arguments into a dict.
            argsstring = sdre.group(1)
            args = argsstring.split("&")
            argsdict = {}

            for arg in args:
                (key, val) = arg.split("=")
                argsdict[key] = val

            # maybe need to add some checks for whether
            # these vars exist in argsdict
            query = Document.objects.filter(court=court) \
                .filter(docnum=argsdict["doc_num"]) \
                .filter(casenum=argsdict["caseid"]) \
                .filter(dm_id=int(argsdict["dm_id"])) \
                .filter(available=1)

        else:
            # otherwise, assume it's a normal doc1 style url
            docid = UploadHandler.docid_from_url_name(url)
            query = Document.objects.filter(docid=docid) \
                .filter(available=1)

        if query:
            # Rebind `query` to the first matching Document row.
            query = query[0]
            real_casenum = query.casenum

            response[url] = {
                "filename":
                IACommon.get_pdf_url(court, real_casenum, query.docnum,
                                     query.subdocnum),
                "timestamp":
                query.lastdate.strftime("%m/%d/%y")
            }

            if query.subdocnum == 0:
                # Attach any available sub-documents of this document.
                subquery = Document.objects.filter(
                    court=court,
                    casenum=query.casenum,
                    docnum=query.docnum,
                    available=1).exclude(subdocnum=0)

                if len(subquery) > 0:
                    response[url]["subDocuments"] = {}

                    for subDoc in subquery:
                        real_sub_casenum = subDoc.casenum
                        response[url]["subDocuments"][subDoc.subdocnum] = {
                            "filename":
                            IACommon.get_pdf_url(court, real_sub_casenum,
                                                 subDoc.docnum,
                                                 subDoc.subdocnum),
                            "timestamp":
                            subDoc.lastdate.strftime("%m/%d/%y")
                        }

    jsonout = simplejson.dumps(response)

    return HttpResponse(jsonout, mimetype="application/json")
Example #13
0
        logging.error(message)
        return HttpResponse(message)

    # Necessary to preserve backwards compatibility with 0.6
    #  This param prevents tons of garbage from being printed to
    #  the error console after an Adddocmeta request
    try:
        add_case_info = request.POST["add_case_info"]
    except KeyError:
        add_case_info = None

    DocumentManager.handle_adddocmeta(docid, court, casenum, de_seq_num, dm_id,
                                      docnum, subdocnum)
    if add_case_info:
        response = {
            "documents": UploadHandler._get_documents_dict(court, casenum),
            "message": "adddocmeta: DB updated for docid=%s" % (docid)
        }
        message = simplejson.dumps(response)
    else:
        message = "adddocmeta: DB updated for docid=%s" % (docid)

    return HttpResponse(message)


def lock(request):
    try:
        key = request.GET["key"].strip()
        court = request.GET["court"].strip()
        casenum = request.GET["casenum"].strip()
        one_per_uploader = 1 if request.GET.get('one_per_uploader') else 0
Example #14
0
    sys.stderr.write("Usage: %s <filename_containing_cases_to_repair>\n " % sys.argv[0])
    sys.stderr.write("      The contents of filename should have a single case per line, each identified by 'court casenum'\n " )
    sys.exit(1)

  cases_to_repair = read_in_cases_to_repair(sys.argv[1])

  for case in cases_to_repair:
    court = case[0]
    casenum = case[1]

    print "Repairing case %s.%s...." % (court, casenum)

    docket = DocumentManager.create_docket_from_local_documents(court, casenum)

    if docket:
        # this will merge our docket with existing one on IA
        UploadHandler.do_me_up(docket)
    else:
        print " Could not create docket from local documents for %s %s" % (court, casenum)



#  for each case, create docket fromlocal

#  call do_me_up(docket)
#  download ia_docket
#
#  merge ia docket and local docket
#
#  if there is a difference, schedule the docket for upload (created a pickled put?)