def generateFileEntries(root, item):
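    # Builds a dict keyed by the sha256 of "accession + canonical filepath";
    # each value records the file's size, mime type, sha256 hash, and whether
    # it looks preservation-stable (a ".presform" derivative).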
    from os.path import join
    from hashlib import sha256

    from uchicagoldr.batch import Batch

    fileInfoDict = {}
    b = Batch(root, item)
    totalDigitalSize = 0
    for item in b.find_items(from_directory=True):
        itemDict = {}
        item.set_accession(item.find_file_accession())
        uid = sha256(
            join(item.get_accession(),
                 item.find_canonical_filepath()).encode('utf-8')
        ).hexdigest()
        itemDict['fileSize'] = item.find_file_size()
        itemDict['fileMime'] = item.find_file_mime_type()
        itemDict['fileHash'] = item.find_sha256_hash()
        totalDigitalSize += itemDict['fileSize']

        if ".presform" in item.find_file_name():
            presStable = "True"
        else:
            presStable = "False"

        itemDict['fileStable'] = presStable

        fileInfoDict[uid] = itemDict
    return fileInfoDict
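
# Note: zipConverter below depends on names defined elsewhere in this module
# (exists, BashCommand, Batch, and the globals root, itemStack, and logger set
# up in main()) and on a 7z binary being available on the PATH.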
def zipConverter(item):
    if not exists(item.get_file_path() + '.presform.extracted'):
        unzipCommandArgs = ['7z', 'x',
                            '-o' + item.get_file_path() + '.presform.extracted',
                            item.get_file_path()]
        unzipCommand = BashCommand(unzipCommandArgs)
        unzipCommand.run_command()
        print(unzipCommand.read_data())
        b = Batch(root, item.get_file_path() + '.presform.extracted')
        for extracted_item in b.find_items(from_directory=True):
            itemStack.append(extracted_item)
        return unzipCommand.get_data()
    else:
        logger.info("Already extracted.")
    def createMetaAccession(self):
        metaAccessionDict = {}
        for accession in self.accessions:
            for item in accession.get_items():
                # Clobber entries from previous accessions, keep the newest
                metaAccessionDict[item.get_canonical_filepath()] = \
                    item.get_root_path()
        metaAccessionList = []
        for canonicalFilepath in metaAccessionDict:
            metaAccessionList.append(
                metaAccessionDict[canonicalFilepath] + "/" + canonicalFilepath)
        metaAccession = Batch()
        for reconstructedFilePath in metaAccessionList:
            metaAccession.add_item(Item(reconstructedFilePath))

        return metaAccession
def main():
    # start of parser boilerplate
    parser = ArgumentParser(description="Produce TFIDF numbers for terms in the text preservation formats in a batch",
                            epilog="Copyright University of Chicago; " + \
                            "written by "+__author__ + \
                            " "+__email__)

    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument( \
                         '-b','-verbose',help="set verbose logging",
                         action='store_const',dest='log_level',
                         const=INFO,default='INFO' \
    )
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument( \
                         '-d','--debugging',help="set debugging logging",
                         action='store_const',dest='log_level',
                         const=DEBUG,default='INFO' \
    )
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument( \
                         '-l','--log_loc',help="save logging to a file",
                         dest="log_loc",
                         \
    )
    parser.add_argument("item", help="Enter a noid for an accession or a " + \
                        "directory path that you need to validate against" + \
                        " a type of controlled collection"
    )
    parser.add_argument("root",help="Enter the root of the directory path",
                        action="store"
    )
    args = parser.parse_args()
    log_format = Formatter( \
                            "[%(levelname)s] %(asctime)s  " + \
                            "= %(message)s",
                            datefmt="%Y-%m-%dT%H:%M:%S" \
    )
    global logger
    logger = getLogger( \
                        "lib.uchicago.repository.logger" \
    )
    ch = StreamHandler()
    ch.setFormatter(log_format)
    logger.setLevel(args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)
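    # Collect every ".presform.txt" item into a TextBatch, build per-document
    # term indices, compute IDFs and TF-IDFs, and print the top-weighted terms
    # for each document.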
    try:
        b = Batch(args.root, args.item)
        textDocs=TextBatch(args.item,args.root)
        for item in b.find_items(from_directory=True):
            if ".presform.txt" in item.find_file_name():
                textDoc=TextItem(item.get_file_path(),item.get_root_path())
                textDocs.add_item(textDoc)
        if textDocs.validate_items():
            logger.info("Getting document term indices")
            term_map={}
            for item in textDocs.get_items():
                item.set_raw_string(item.find_raw_string())
                indexOut=item.find_index(purge_raw=True,scrub_text=True,term_map=term_map)
                item.set_index(indexOut[0])
                term_map.update(indexOut[1])
            textDocs.set_term_map(term_map)
            logger.info("Getting IDFs")
            textDocs.set_doc_counts(textDocs.find_doc_counts())
            textDocs.set_idfs(textDocs.find_idfs())
            logger.info("Computing TFIDFs")
            textDocs.set_tf_idfs(textDocs.find_tf_idfs())
            textDocs.rev_term_map()

            for key in textDocs.get_tf_idfs():
                print(key)
                tfidfs=[]
                for entry in textDocs.get_tf_idfs()[key]:
                    tfidfs.append((entry,textDocs.get_tf_idfs()[key][entry]))
                tfidfs=sorted(tfidfs,key=lambda x: x[1],reverse=True)
                printFirstX=9
                firstX=tfidfs[0:printFirstX]
                justTerms=[]
                for entry in firstX:
                    justTerms.append(textDocs.get_term_map()[entry[0]])
                print(",".join(justTerms)+"\n")
            
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    # Master log instantiation begins #
    global masterLog
    masterLog = MasterLogger()
    # Master log instantiation ends #

    # Application specific log instantation begins #
    global logger
    logger = masterLog.getChild(__name__)
    f = UserAndIPFilter()
    termHandler = DefaultTermHandler()
    logger.addHandler(termHandler)
    logger.addFilter(f)
    logger.info("BEGINS")
    # Application specific log instantation ends #

    # Parser instantiation begins #
    parser = ArgumentParser(description="[A brief description of the utility]",
                            epilog="Copyright University of Chicago; " +
                            "written by "+__author__ +
                            " "+__email__)

    parser.add_argument(
                        "-v",
                        help="See the version of this program",
                        action="version",
                        version=__version__
    )
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument(
                        '-b', '--verbosity',
                        help="set logging verbosity " +
                        "(DEBUG,INFO,WARN,ERROR,CRITICAL)",
                        nargs='?',
                        const='INFO'
    )
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument(
                        '-d', '--debugging',
                        help="set debugging logging",
                        action='store_true'
    )
    # optionally save the log to a file.
    # Set a location or use the default constant
    parser.add_argument(
                        '-l', '--log_loc',
                        help="save logging to a file",
                        dest="log_loc",

    )
    parser.add_argument(
                        "item",
                        help="Enter a noid for an accession or a " +
                        "directory path that you need to validate against" +
                        " a type of controlled collection"
    )
    parser.add_argument(
                        "root",
                        help="Enter the root of the directory path",
                        action="store"
    )
    try:
        args = parser.parse_args()
    except SystemExit:
        logger.critical("ENDS: Command line argument parsing failed.")
        exit(1)

    # Begin argument post processing, if required #
    if args.verbosity and args.verbosity not in ['DEBUG', 'INFO',
                                                 'WARN', 'ERROR', 'CRITICAL']:
        logger.critical("You did not pass a valid argument to the verbosity \
                        flag! Valid arguments include: \
                        'DEBUG','INFO','WARN','ERROR', and 'CRITICAL'")
        return(1)
    if args.log_loc:
        if not exists(split(args.log_loc)[0]):
            logger.critical("The specified log location does not exist!")
            return 1
    # End argument post processing #

    # Begin user specified log instantiation, if required #
    if args.log_loc:
        fileHandler = DefaultFileHandler(args.log_loc)
        logger.addHandler(fileHandler)

    if args.verbosity:
        logger.removeHandler(termHandler)
        termHandler = DefaultTermHandlerAtLevel(args.verbosity)
        logger.addHandler(termHandler)
        if args.log_loc:
            logger.removeHandler(fileHandler)
            fileHandler = DefaultFileHandlerAtLevel(args.log_loc,
                                                    args.verbosity)
            logger.addHandler(fileHandler)

    if args.debugging:
        logger.removeHandler(termHandler)
        termHandler = DebugTermHandler()
        logger.addHandler(termHandler)
        if args.log_loc:
            logger.removeHandler(fileHandler)
            fileHandler = DebugFileHandler(args.log_loc)
            logger.addHandler(fileHandler)
    # End user specified log instantiation #
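    # Module code proper: walk the batch and print each item's file path.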
    try:
        # Begin module code #
        b = Batch(args.root, args.item)
        for item in b.find_items(from_directory=True):
            print(item.get_file_path())

        # End module code #
        logger.info("ENDS: COMPLETE")
        return 0
    except KeyboardInterrupt:
        logger.error("ENDS: Program aborted manually")
        return 131
    except Exception as e:
        logger.critical("ENDS: Exception ("+str(e)+")")
        return 1
def main():
    # start of parser boilerplate
    parser = ArgumentParser(description="Produce TFIDF numbers for terms in the text preservation formats in a batch",
                            epilog="Copyright University of Chicago; " + \
                            "written by "+__author__ + \
                            " "+__email__)

    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument( \
                         '-b','-verbose',help="set verbose logging",
                         action='store_const',dest='log_level',
                         const=INFO,default='INFO' \
    )
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument( \
                         '-d','--debugging',help="set debugging logging",
                         action='store_const',dest='log_level',
                         const=DEBUG,default='INFO' \
    )
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument( \
                         '-l','--log_loc',help="save logging to a file",
                         dest="log_loc",
                         \
    )
    parser.add_argument("restritem", help="Enter a noid for an accession or a " + \
                        "directory path that you need to validate against" + \
                        " a type of controlled collection"
    )
    parser.add_argument("restrroot",help="Enter the root of the directory path",
                        action="store"
    )
    parser.add_argument("item", help="Enter a noid for an accession or a " + \
                        "directory path that you need to validate against" + \
                        " a type of controlled collection"
    )
    parser.add_argument("root",help="Enter the root of the directory path",
                        action="store"
    )
    args = parser.parse_args()
    log_format = Formatter( \
                            "[%(levelname)s] %(asctime)s  " + \
                            "= %(message)s",
                            datefmt="%Y-%m-%dT%H:%M:%S" \
    )
    global logger
    logger = getLogger( \
                        "lib.uchicago.repository.logger" \
    )
    ch = StreamHandler()
    ch.setFormatter(log_format)
    logger.setLevel(args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)
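    # First build a language model (term index, inverse IDFs, vector space
    # model) from the "restr" document set, then index the main batch with the
    # same term map, compute per-document TF-IDF vector space models, and
    # print each document's similarity to the reference model in sorted order.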
    try:
        args.restritem=abspath(args.restritem)
        args.restrroot=abspath(args.restrroot)
        args.item=abspath(args.item)
        args.root=abspath(args.root)

        b = Batch(args.restrroot, args.restritem)
        restrDocs=TextBatch(args.restritem,args.restrroot)
        for item in b.find_items(from_directory=True):
            if ".presform.txt" in item.find_file_name():
                textDoc=TextItem(item.get_file_path(),item.get_root_path())
                restrDocs.add_item(textDoc)
        if restrDocs.validate_items():
            logger.info("Generating language model from provided document set.")
            logger.info("Getting document term indices")
            term_map={}
            for item in restrDocs.get_items():
                item.set_raw_string(item.find_raw_string())
                indexOut=item.find_index(purge_raw=True,scrub_text=False,stem=False,term_map=term_map)
                item.set_index(indexOut[0])
                term_map.update(indexOut[1])
            restrDocs.set_term_map(term_map)
            logger.info("Generating corpus term index")
            restrDocs.set_term_index(restrDocs.find_term_index())
            logger.info("Getting iIDFs")
            restrDocs.set_doc_counts(restrDocs.find_doc_counts())
            restrDocs.set_iIdfs(restrDocs.find_iIdfs())
            logger.info("Computing Language Model")
            restrDocs.set_language_model(restrDocs.find_language_model())
            logger.info("Computing LM VSM")
            restrDocs.set_vector_space_model(restrDocs.find_vector_space_model())

        c=Batch(args.root,args.item)
        Docs = TextBatch(args.item, args.root)
        for item in c.find_items(from_directory=True):
            if ".presform.txt" in item.find_file_name():
                textDoc=TextItem(item.get_file_path(),item.get_root_path())
                Docs.add_item(textDoc)
        if Docs.validate_items():
            logger.info("Generating TFIDF models for each document in the batch.")
            logger.info("Getting document term indices")
            tote=len(Docs.get_items())
            i=0
            for item in Docs.get_items():
                i+=1
                print("\r"+str(i)+"/"+str(tote)+" - "+item.get_file_path(),end="")
                item.set_raw_string(item.find_raw_string())
                indexOut=item.find_index(purge_raw=True,scrub_text=False,stem=False,term_map=term_map,only_mapped=True)
                item.set_index(indexOut[0])
            print()
            logger.info("Getting IDFs")
            Docs.set_doc_counts(Docs.find_doc_counts())
            Docs.set_idfs(Docs.find_idfs())
            logger.info("Computing TFIDFs")
            Docs.set_tf_idfs(Docs.find_tf_idfs())
            logger.info("Generating document vector space models.")
            Docs.set_document_vector_space_models(Docs.find_document_vector_space_models())
            
            logger.info("Computing similarity metrics.")
            
            rels=[]
            for document in Docs.get_document_vector_space_models():
                rels.append((document,restrDocs.find_similarity(Docs.get_document_vector_space_models()[document])))
            logger.info("Sorting similarity metrics for output")
            rels=sorted(rels,key=itemgetter(1))
            for entry in rels:
                print(entry[0]+": "+str(entry[1]))

            
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    # start of parser boilerplate
    parser = ArgumentParser(description="[A brief description of the utility]",
                            epilog="Copyright University of Chicago; " + \
                            "written by "+__author__ + \
                            " "+__email__)

    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument( \
                         '-b','-verbose',help="set verbose logging",
                         action='store_const',dest='log_level',
                         const=INFO,default='INFO' \
    )
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument( \
                         '-d','--debugging',help="set debugging logging",
                         action='store_const',dest='log_level',
                         const=DEBUG,default='INFO' \
    )
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument( \
                         '-l','--log_loc',help="save logging to a file",
                         dest="log_loc",
                         \
    )
    parser.add_argument("item", help="Enter a noid for an accession or a " + \
                        "directory path that you need to validate against" + \
                        " a type of controlled collection"
    )
    parser.add_argument("root",help="Enter the root of the directory path",
                        action="store"
    )
    args = parser.parse_args()
    log_format = Formatter( \
                            "[%(levelname)s] %(asctime)s  " + \
                            "= %(message)s",
                            datefmt="%Y-%m-%dT%H:%M:%S" \
    )
    global logger
    logger = getLogger( \
                        "lib.uchicago.repository.logger" \
    )
    ch = StreamHandler()
    ch.setFormatter(log_format)
    logger.setLevel(args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)
    #BEGIN MAIN HERE - EXAMPLE BELOW
    try:
        b = Batch(args.root, args.item)
        for item in b.find_items(from_directory=True):
            print(item.filepath)
            
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    # start of parser boilerplate
    parser = ArgumentParser(description="This module is meant to take a batch of files (probably an accession in place) and generate the technical metadata for it.",
                            epilog="Copyright University of Chicago; " + \
                            "written by "+__author__ + \
                            " "+__email__)

    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument( \
                         '-b','-verbose',help="set verbosity for logging to stdout",
                         action='store_const',dest='log_level',
                         const=INFO,default='INFO' \
    )
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument( \
                         '-d','--debugging',help="set debugging logging",
                         action='store_const',dest='log_level',
                         const=DEBUG,default='INFO' \
    )
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument( \
                         '-l','--log_loc',help="save logging to a file",
                         dest="log_loc",
                         \
    )
    parser.add_argument( \
                         '-t','--timeout',help="set a timeout in seconds for any single bash command",
                         dest='timeout',default=3600,type=int \
    )
    parser.add_argument("item", help="Enter a noid for an accession or a " + \
                        "directory path that you need to validate against" + \
                        " a type of controlled collection"
    )
    parser.add_argument("root",help="Enter the root of the directory path",
                        action="store"
    )
    args = parser.parse_args()
    log_format = Formatter( \
                            "[%(levelname)s] %(asctime)s  " + \
                            "= %(message)s",
                            datefmt="%Y-%m-%dT%H:%M:%S" \
    )
    global logger
    logger = getLogger( \
                        "lib.uchicago.repository.logger" \
    )
    logger.setLevel(DEBUG)
    ch = StreamHandler()
    ch.setFormatter(log_format)
    ch.setLevel(args.log_level)
    logger.addHandler(ch)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
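    # For each item that lacks technical metadata, try FITS first; if FITS
    # times out, fall back to a ".stif.txt" record built from stat, file -i,
    # and file output plus md5 and sha256 hashes.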
    try:
        fitscommand="fits"
        md5command="md5"
        shacommand="sha256"

        b = Batch(abspath(args.root), abspath(args.item))
        for item in b.find_items(from_directory=True):
            if ".fits.xml" in item.find_file_name() or ".stif.txt" in item.find_file_name():
                continue
            item.find_technical_metadata()
            if item.has_technical_md:
                logger.info(item.get_file_path()+" already has technical metadata. Continuing.")
                continue
            else:
                logger.info("Attempting technical metadata generation for: "+item.get_file_path())
                fitsArgs=[fitscommand,'-i',item.get_file_path(),'-o',item.get_file_path()+'.fits.xml']
                fitsCommand=BashCommand(fitsArgs)
                fitsCommand.set_timeout(args.timeout)
                try:
                    logger.info("Attempting FITS generation for: "+item.get_file_path())
                    result=fitsCommand.run_command()
                    if isinstance(result[1],Exception):
                        raise result[1]
                    assert(exists(item.get_file_path()+'.fits.xml'))
                    logger.info("FITS generated for: "+item.get_file_path()) 
                except TimeoutExpired:
                    logger.warn("FITS generation timed out")
                    logger.info("Attempting STIF generation")
                    statArgs=['stat',item.get_file_path()]
                    statCommand=BashCommand(statArgs)
                    statCommand.set_timeout(args.timeout)

                    mimeArgs=['file','-i',item.get_file_path()]
                    mimeCommand=BashCommand(mimeArgs)
                    mimeCommand.set_timeout(args.timeout)

                    fileArgs=['file',item.get_file_path()]
                    fileCommand=BashCommand(fileArgs)
                    fileCommand.set_timeout(args.timeout)
                    
                    assert(statCommand.run_command()[0])
                    assert(mimeCommand.run_command()[0])
                    assert(fileCommand.run_command()[0])

                    md5hash = item.find_md5_hash()
                    shahash = item.find_sha256_hash()

                    with open(item.get_file_path()+'.stif.txt','w') as f:
                        f.write(statCommand.get_data()[1].stdout.decode(encoding='UTF-8')+ \
                                mimeCommand.get_data()[1].stdout.decode(encoding='UTF-8')+ \
                                fileCommand.get_data()[1].stdout.decode(encoding='UTF-8')+ \
                                "md5: " + md5hash + '\n'+ \
                                "sha256: " + shahash \
                               )
                    assert(exists(item.get_file_path()+'.stif.txt'))
                    logger.info("STIF generated for: "+item.get_file_path())
                item.find_technical_metadata()
                assert(item.has_technical_md)
                logger.info("Technical metadata generation complete for: "+item.get_file_path())
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    # start of parser boilerplate
    parser = ArgumentParser(description="Program for the conversion of uncontrolled accessions into preservation stable file formats for ingest into the University of Chicago Library Digital Repository",
                            epilog="Copyright University of Chicago; " + \
                            "written by Brian Balsamo" + \
                            "<*****@*****.**>")

    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument( \
                         '-b','-verbose',help="set verbose logging",
                         action='store_const',dest='log_level',
                         const=INFO,default='INFO' \
    )
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument( \
                         '-d','--debugging',help="set debugging logging",
                         action='store_const',dest='log_level',
                         const=DEBUG \
    )
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument( \
                         '-l','--log_loc',help="save logging to a file",
                         action="store_const",dest="log_loc",
                         const='./current.log' \
    )
    parser.add_argument("item", help="Enter a noid for an accession or a " + \
                        "directory path that you need to validate against" + \
                        " a type of controlled collection")
    parser.add_argument("root",help="Enter the root of the directory path",
                        action="store")
    args = parser.parse_args()

    log_format = Formatter( \
                            "[%(levelname)s] %(asctime)s  " + \
                            "= %(message)s",
                            datefmt="%Y-%m-%dT%H:%M:%S" \
    )
    global logger
    logger = getLogger( \
                        "lib.uchicago.repository.logger" \
    )
    ch = StreamHandler()
    ch.setFormatter(log_format)
    logger.setLevel(args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    logger.info("Beginning")
    global itemStack
    itemStack=[]
    global root
    root=args.root
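    # Build a stack of items from either a directory (via Batch) or a single
    # file, then hand each one to parse(); converters such as zipConverter
    # append any newly extracted items back onto the global itemStack.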
    try:
        if isdir(args.item):
            b = Batch(root, args.item)
            for item in b.find_items(from_directory=True):
                itemStack.append(item)
        if isfile(args.item):
            itemStack.append(Item(args.item,root))

        for item in itemStack:
            logger.info("Parsing "+item.get_file_path())
            parse(item)
            logger.info("Parsing complete on "+item.get_file_path())

        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    # start of parser boilerplate
    parser = ArgumentParser(
        description="A command line utility for staging physical media",
        epilog="Copyright University of Chicago; " + "written by " + __author__ + " " + __email__,
    )

    parser.add_argument("-v", help="See the version of this program", action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument(
        "-b", "-verbose", help="set verbose logging", action="store_const", dest="log_level", const=INFO, default="INFO"
    )
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument(
        "-d",
        "--debugging",
        help="set debugging logging",
        action="store_const",
        dest="log_level",
        const=DEBUG,
        default="DEBUG",
    )
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument("-l", "--log_loc", help="save logging to a file", dest="log_loc")
    parser.add_argument(
        "--log_verb",
        help="Set a separate verbosity for the log written to disk, if desired",
        dest="log_verb",
        default=None,
    )
    parser.add_argument(
        "item",
        help="Enter a noid for an accession or a "
        + "directory path that you need to validate against"
        + " a type of controlled collection",
    )
    parser.add_argument("root", help="Enter the root of the directory path", action="store")
    parser.add_argument("dest_root", help="Enter the destination root path", action="store")
    parser.add_argument(
        "containing_folder", help="The name of the containing folder on disk (prefix+number)", action="store"
    )
    parser.add_argument(
        "--rehash",
        help="Disregard any existing previously generated hashes, recreate them on this run",
        action="store_true",
    )
    args = parser.parse_args()
    log_format = Formatter("[%(levelname)s] %(asctime)s  " + "= %(message)s", datefmt="%Y-%m-%dT%H:%M:%S")
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    ch.setLevel(args.log_level)
    logger.setLevel("DEBUG")
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)
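    # Sanity-check the staging layout (one EAD directory containing one
    # accession directory), then hash every readable original file and append
    # new entries to the admin folder's fixityFromOrigin.txt, skipping files
    # already recorded there unless --rehash is given.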
    try:
        if args.item[-1] == "/" and args.item != args.root:
            logger.warn("It looks like you may have set the root incorrectly.")
            wrongRootGoAnyways = input("Are you sure you want to continue? (y/n)\n")
            if wrongRootGoAnyways is not "y":
                exit(1)
        shouldBeEAD = getImmediateSubDirs(args.dest_root)
        assert len(shouldBeEAD) == 1
        shouldBeAccNo = getImmediateSubDirs(join(args.dest_root, shouldBeEAD[0]))
        assert len(shouldBeAccNo) == 1
        stageRoot = join(join(args.dest_root, shouldBeEAD[0]), shouldBeAccNo[0])
        destinationAdminRoot = join(stageRoot, "admin/")
        destinationDataRoot = join(stageRoot, "data/")
        containing_folder = args.containing_folder
        destinationAdminFolder = join(destinationAdminRoot, containing_folder)
        destinationDataFolder = join(destinationDataRoot, containing_folder)

        stagingDebugLog = FileHandler(join(destinationAdminFolder, "log.txt"))
        stagingDebugLog.setFormatter(log_format)
        stagingDebugLog.setLevel("DEBUG")
        logger.addHandler(stagingDebugLog)

        logger.debug("Creating batch from original files.")
        originalFiles = Batch(args.root, directory=args.item)

        existingOriginalFileHashes = {}
        originalFileHashes = {}
        logger.info("Hashing original files")
        if exists(join(destinationAdminFolder, "fixityFromOrigin.txt")):
            with open(join(destinationAdminFolder, "fixityFromOrigin.txt"), "r") as f:
                for line in f.readlines():
                    if not args.rehash:
                        splitLine = line.split("\t")
                        if splitLine[1] != "ERROR":
                            existingOriginalFileHashes[splitLine[0]] = [splitLine[1], splitLine[2].rstrip("\n")]
        with open(join(destinationAdminFolder, "fixityFromOrigin.txt"), "a") as f:
            for item in originalFiles.find_items(from_directory=True):
                if item.test_readability():
                    item.set_root_path(args.root)
                    if relpath(item.get_file_path(), start=item.get_root_path()) not in existingOriginalFileHashes:
                        item.set_sha256(item.find_sha256_hash())
                        item.set_md5(item.find_md5_hash())
                        originalFileHashes[relpath(item.get_file_path(), start=item.get_root_path())] = [
                            item.get_sha256(),
                            item.get_md5(),
                        ]
                else:
                    logger.warn("COULD NOT READ FILE: " + item.get_file_path())
                    originalFileHashes[relpath(item.get_file_path(), start=args.root)] = ["ERROR", "ERROR"]
            for entry in originalFileHashes:
                f.write(entry + "\t" + originalFileHashes[entry][0] + "\t" + originalFileHashes[entry][1] + "\n")
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    parser = ArgumentParser(description="{description}". \
                            format(description = __description__),
                            epilog="{copyright}; ". \
                            format(copyright = __copyright__) + \
                            "written by {author} <{email}>.". \
                            format(author = __author__,
                                   email = __email__)) 
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    parser.add_argument( \
                         '-b','-verbose',help="set verbose logging",
                         action='store_const',dest='log_level',
                         const=INFO \
    )
    parser.add_argument( \
                         '-d','--debugging',help="set debugging logging",
                         action='store_const',dest='log_level',
                         const=DEBUG \
    ) 
    parser.add_argument( \
                         '-l','--log_loc',help="save logging to a file",
                         action="store_const",dest="log_loc",
                         const='./{progname}.log'. \
                         format(progname=__file__) \
    )
    parser.add_argument( \
                         '--db_url',help="Enter a db url",action='store' \
    )
    parser.add_argument( \
                         '--root',help="Enter the root of the repository",
                         action='store')
    parser.add_argument( \
                         '--object_pattern',help="Enter the regex pattern " + \
                         "to match an object",
                         action='store')
    parser.add_argument( \
                         '--page_pattern',help="Enter the regex pattern " + \
                         "to match a page",
                         action='store')
    parser.add_argument( \
                         'accessions',nargs="*",action='store',
                         help="Enter 1 or more accession " + \
                         "identifiers to process" \
    )
    args = parser.parse_args()
    log_format = Formatter( \
                            "[%(levelname)s] %(asctime)s  " + \
                            "= %(message)s",
                            datefmt="%Y-%m-%dT%H:%M:%S" \
    )
    global logger
    logger = getLogger( \
                        "lib.uchicago.repository.logger" \
    )
    ch = StreamHandler()
    ch.setFormatter(log_format)
    try:
        logger.setLevel(args.log_level)
    except TypeError:
        logger.setLevel(INFO)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)
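    # Reflect the 'record' and 'file' tables from the given --db_url, pull the
    # files for the requested accessions into a Batch, and register a
    # DigitalObject for each new identifier that matches the mvol pattern.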
    db = Database(args.db_url, ['record','file'])
    
    class Record(db.base):
        __table__ = Table('record', db.metadata, autoload=True)

    class File(db.base):
        __table__ = Table('file', db.metadata, autoload=True)

    query = db.session.query(File).filter(File.accession.in_(args.accessions))
    if args.root:
        batch = Batch(args.root, query = query)
        items  = batch.find_items(from_db = True)
        batch.set_items(items)
    else:
        raise ValueError("need to include a root")
    try:
        all_objects = []
        for item in batch.get_items():
            accession = item.find_file_accession()
            item.set_accession(accession)            
            canon = item.find_canonical_filepath()            
            item.set_canonical_filepath(canon)

            search_pattern = item.find_matching_object_pattern( \
                    re_compile(r"(mvol)/(\w{4})/(\w{4})/(\w{4})/" +
                               r"(mvol)-(\w{4})-(\w{4})-(\w{4})"))

            if search_pattern.status:
                potential_identifier = '-'.join(search_pattern.data.groups())
                is_an_object_already_present = [x for x in all_objects \
                                                if x.identifier == \
                                                potential_identifier]
                if is_an_object_already_present:
                    logger.debug("found this id already")
                else:
                    logger.debug("this id is new!")
                    new_object = DigitalObject(potential_identifier)
                    all_objects.append(new_object)
                logger.debug(potential_identifier)

        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    parser = ArgumentParser(description="{description}". \
                            format(description = __description__),
                            epilog="{copyright}; ". \
                            format(copyright=__copyright__) + \
                            "written by {name} ".format(name=__author__) + \
                            " <{email}> ".format(email=__email__) + \
                            "University of Chicago")
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    parser.add_argument( \
                         '-b','-verbose',help="set verbose logging",
                         action='store_const',dest='log_level',
                         const=INFO \
    )
    parser.add_argument( \
                         '-d','--debugging',help="set debugging logging",
                         action='store_const',dest='log_level',
                         const=DEBUG \
    )
    parser.add_argument( \
                         '-l','--log_loc',help="save logging to a file",
                         action="store_const",dest="log_loc",
                         const='./{progname}.log'. \
                         format(progname=argv[0]) \
    )
    parser.add_argument("location_root",help="Enter the root " + \
                        "of the directory path",
                        action="store")
    parser.add_argument("directory_path", 
                           help="Enter a directory that you need to work on ",
                           action='store')
    args = parser.parse_args()
    log_format = Formatter( \
                            "[%(levelname)s] %(asctime)s  " + \
                            "= %(message)s",
                            datefmt="%Y-%m-%dT%H:%M:%S" \
    )
    global logger
    logger = getLogger( \
                        "lib.uchicago.repository.logger" \
    )
    ch = StreamHandler()
    ch.setFormatter(log_format)
    try:
        logger.setLevel(args.log_level)
    except TypeError:
        logger.setLevel(INFO)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)
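    # Walk the directory as a Batch and write a SQL transaction to stdout:
    # one "insert into file (...)" statement per readable file, carrying its
    # path, accession, mime type, size, and checksum.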
    try:
        b = Batch(args.location_root, directory = args.directory_path)
        generator_object = b.find_items(from_directory=True)
        logger.debug(generator_object)
        b.set_items(generator_object)
        stdout.write("begin transaction;\n")
        for a_file in b.get_items():
            if a_file.test_readability():
                
                file_hash = a_file.find_hash_of_file(sha256)
                mime = a_file.find_file_mime_type()
                size = a_file.find_file_size()
                accession = a_file.find_file_accession()
                a_file.set_file_mime_type(mime)
                a_file.set_file_size(size)
                a_file.set_hash(file_hash)
                a_file.set_accession(accession)
                out_string = "insert into file (filepath,accession," + \
                             "mimetype,size,checksum) values (" + \
                             "\"{path}\",\"{accession}\",\"{mimetype}\"". \
                             format(path = a_file.filepath,
                                    accession = a_file.get_accession(),
                                    mimetype = a_file.get_file_mime_type()) + \
                                    ",{filesize},\"{filehash}\");\n". \
                                    format(filesize = a_file.get_file_size(),
                                           filehash = a_file.get_hash())
                stdout.write(out_string)
            else:
                logger.error("{path} could not be read". \
                             format(path=a_file.filepath))
        stdout.write("commit;\n")
        return 0 
    except KeyboardInterrupt:
        logger.warn("Program aborted manually")
        return 131
def main():
    parser = ArgumentParser(description="{description}". \
                            format(description=__description__),
                            epilog="Copyright University of Chicago; " + \
                            "written by {author} ". \
                            format(author = __author__) + \
                            " <{email}> University of Chicago". \
                            format(email = __email__))
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    parser.add_argument( \
                         '-b','-verbose',help="set verbose logging",
                         action='store_const',dest='log_level',
                         const=INFO \
    )
    parser.add_argument( \
                         '-d','--debugging',help="set debugging logging",
                         action='store_const',dest='log_level',
                         const=DEBUG \
    )
    parser.add_argument( \
                         '-l','--log_loc',help="save logging to a file",
                         action="store_const",dest="log_loc",
                         const='./{progname}.log'. \
                         format(progname=argv[0]) \
    )
    parser.add_argument("-o","--object_level",
                        help="Enter the level at which object starts",
                        type=int,
                        action='store')
    parser.add_argument("-r", "--root",
                       help="Enter the root of the directory path",
                        action="store")
    parser.add_argument("directory_path", 
                        help="Enter a directory that you need to work on ",
                        action='store')

    parser.add_argument('pattern', help="Enter a pattern to filter files with",
                        action="store")
    global args
    args = parser.parse_args()
    log_format = Formatter( \
                            "[%(levelname)s] %(asctime)s  " + \
                            "= %(message)s",
                            datefmt="%Y-%m-%dT%H:%M:%S" \
    )
    global logger
    logger = getLogger( \
                        "lib.uchicago.repository.logger" \
    )
    ch = StreamHandler()
    ch.setFormatter(log_format)
    try:
        logger.setLevel(args.log_level)
    except TypeError:
        logger.setLevel(INFO)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)
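    # Look up the accession's createdate, group the batch's files into objects
    # by identifier, report missing pages or missing file types, and emit
    # EDM/ORE/PREMIS statements for each object and page to stdout.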
    current_date = datetime.now()
    isof_current_date = current_date.strftime("%Y-%m-%dT%H:%M:%S")
    sixty_days_ago_date = current_date - timedelta(days=60)
    isof_sixty_days_ago_date = sixty_days_ago_date.strftime( \
                            "%Y-%m-%dT%H:%M:%S")
    db = Database("sqlite:////media/repo/repository/databases/" +  
                  "official/repositoryAccessions.db.new",tables_to_bind= \
                  ['record'])


    class Record(db.base):
        __table__ = Table('record', db.metadata, autoload=True)
        
    b = Batch(args.root, directory = args.directory_path)
    difference_in_path = relpath(args.directory_path, args.root)

    query = db.session.query(Record.createdate).filter(Record.receipt == \
                                                       difference_in_path) 
    createdate = query.first()[0]
    items = b.find_items(from_directory = True, 
                         filterable = re_compile(args.pattern))

    b.set_items(items)
    try:
        generated_data = evaluate_items(b,createdate)
        count = 0
        objects = {}
        descriptive_metadata = '.dc.xml$'
        representation_file = '.pdf$'
        mets_file = '.mets.xml$'
        file_definers = ['dc.xml','ALTO','TIFF','JPEG','pdf','mets.xml',
                         r'\d{4}.txt']
        file_definer_sequences = ['ALTO','TIFF','JPEG']
        page_number_pattern = r'_(\w{4})'
        for n in generated_data:
            id_parts = args.pattern.split('/')
            id_parts_enumerated = [x for x in range(args.object_level)]
            id_part_values = [n.canonical_filepath.split('/')[x] \
                              for x in id_parts_enumerated]
        
            identifier = "-".join(id_part_values)
            to_add = None
            for p in file_definers:
                if p in n.canonical_filepath:
                    to_add = n
                    break
            if to_add:
                if objects.get(identifier):
                    objects.get(identifier).append(n)
                else:
                    objects[identifier] = [n]
            else:
                logger.error("{fpath} in {id} could not be matched". \
                             format(fpath = n.canonical_filepath,
                                    id = identifier))
        for k, v in objects.items():
            logger.error(k)
            k_identifier = k
            k_id_part_values = k.split('-')
            logger.info(k_id_part_values)
            k_id_directory = '/'.join(k_id_part_values)
            for p in file_definer_sequences:
                sequence = sorted([(int(re_compile(page_number_pattern). \
                            search(x.canonical_filepath).group(1).lstrip('0')),
                             x.canonical_filepath) \
                            for x in v if p in x.canonical_filepath])
                known_complete_page_range = [x for x in \
                                             range(sequence[-1][0])][1:]
                what_is_actually_present  = [x[0] for x in sequence]
                if set(known_complete_page_range) - \
                   set(what_is_actually_present):
                    difference = list(set(known_complete_page_range) - \
                                      set(what_is_actually_present))
                    l = [str(x) for x in list(difference)]
                    logger.error("The sequence part {part} ". \
                                 format(part = p) + 
                                 "is missing pages {pages}". \
                                 format(pages = ','.join(l)))
            for p in file_definers:
                seek = [x for x in v if p in x.canonical_filepath]
                if len(seek) == 0:
                    logger.error("{identifier}". \
                                 format(identifier = k_identifier) + \
                                " missing part {part}".format(part = p))
            ldrurl = LDRURL(join(k_id_directory, k_identifier))
            piurl = PIURL("dig/campub", join(k_id_directory, k_identifier))
            rightsurl = RightsURL()
            repurl = URL("http://repository.lib.uchicago.edu/")
            collectionurl = URL("ead/ICU.SPCL.CAMPUB")
            dcfile = [x for x in v if '.dc.xml' in x.canonical_filepath][0]
            proxy = Proxy(join(dcfile.accession, dcfile.dirhead, 
                               dcfile.canonical_filepath))
            pdffile = [x for x in v if '.pdf' in x.canonical_filepath][0]
            jpegfile = [x for x in v if 'JPEG' in x.canonical_filepath \
                        and '_0001' in x.canonical_filepath][0]
            pdfresource = WebResource(join(pdffile.accession, pdffile.dirhead, 
                                           pdffile.canonical_filepath))

            metsfile = [x for x in v 
                        if '.mets.xml' in x.canonical_filepath][0]
            metsresource = RDFSResource(join(metsfile.accession, 
                                            metsfile.dirhead,
                                            metsfile.canonical_filepath))
            pages = set([int(re_compile(r'_(\w{4}).*'). \
                             search(basename(x.canonical_filepath)). \
                             group(1).lstrip('0'))
                         for x in v if re_compile(r'_\w{4}.*'). \
                         search(basename(x.canonical_filepath))])
            numpages = max(pages)
            providedcho = ProvidedCHO(k_id_directory)
            aggregation = Aggregation(k_id_directory)
            rem = ResourceMap(k_id_directory)
            proxy.add_statement("dc:format", TextValue(dcfile.mimetype))
            proxy.add_statement("ore:proxyFor", URL(providedcho.subject))
            proxy.add_statement("ore:proxyIn", URL(aggregation.subject))
            stdout.write(str(proxy))
            providedcho.add_statement("dc:coverage",TextValue("Chicago"))
            providedcho.add_statement("dc:date", DateValue(dcfile.date))
            providedcho.add_statement("edm:year", 
                                      DateValue(dcfile.date.split('-')[0]))
            providedcho.add_statement("dc:description", 
                                      TextValue(dcfile.description))
            providedcho.add_statement("dc:identifier", 
                                      TextValue(dcfile.identifier))
            providedcho.add_statement("dc:language", TextValue("en"))
            providedcho.add_statement("dc:rights", rightsurl)
            providedcho.add_statement("dc:title", TextValue(dcfile.title))
            providedcho.add_statement("dc:type", TextValue("text"))
            providedcho.add_statement("edm:type", TextValue("TEXT"))
            providedcho.add_statement("dc:description", 
                                      URL(aggregation.subject))
            providedcho.add_statement("dcterms:isPartOf", collectionurl)

            rem.add_statement("dcterms:created", DateValue(createdate))
            rem.add_statement("dcterms:creator", repurl)
            rem.add_statement("ore:describes", URL(aggregation.subject))
            stdout.write(str(rem))
            aggregation.add_statement("edm:aggregatedCHO", 
                                      URL(providedcho.subject))
            aggregation.add_statement("edm:dataProvider", 
                                    TextValue("University of Chicago Library"))
            aggregation.add_statement("edm:isShownAt", piurl)
            aggregation.add_statement("edm:isShownBy", 
                                      URL(join(pdffile.accession,
                                               pdffile.dirhead,
                                               pdffile.canonical_filepath)))
            aggregation.add_statement("edm:object", 
                                      URL(join(jpegfile.accession, 
                                               jpegfile.dirhead,
                                               jpegfile.canonical_filepath)))
            aggregation.add_statement("edm:provider", 
                                    TextValue("University of Chicago Library"))
            aggregation.add_statement("dc:rights", rightsurl)
            aggregation.add_statement("ore:isDescribedBy", URL(rem.subject))
            stdout.write(str(aggregation))
            metsresource.add_statement("dc:format", 
                                       TextValue(metsfile.mimetype))
            stdout.write(str(metsresource))
            pdfresource.add_statement("dcterms:isFormatOf", ldrurl.subject)
            pdfresource.add_statement("premis:objectIdentifierType", 
                                      TextValue("ARK"))
            pdfresource.add_statement("premis:objectIdentifierValue", 
                                      URL(pdfresource.subject))
            pdfresource.add_statement("dc:format", TextValue(pdffile.mimetype))
            pdfresource.add_statement("premis:objectCategory", 
                                      TextValue("file"))
            pdfresource.add_statement("premis:compositionLevel", IntegerValue(0))
            pdfresource.add_statement("premis:messageDigestAlgorithm", 
                                      TextValue("SHA-256")) 
            pdfresource.add_statement("premis:messageDigest", 
                                      TextValue(pdffile.checksum))
            pdfresource.add_statement("premis:messageDigestOriginator", 
                                      TextValue("/sbin/sha256"))
            pdfresource.add_statement("premis:size", IntegerValue(pdffile.file_size))
            pdfresource.add_statement("premis:formatName", 
                                      TextValue(pdffile.mimetype))
            pdfresource.add_statement("premis:originalName", 
                                      TextValue(pdffile.canonical_filepath))
            pdfresource.add_statement("premis:eventIdentifierType", 
                                      TextValue("ARK"))
            pdfresource.add_statement("premis:eventIdentifierValue", 
                                      TextValue(pdffile.accession))
            pdfresource.add_statement("premis:eventType", 
                                      TextValue("creation"))
            pdfresource.add_statement("premis:eventDateTime", 
                                      DateValue(createdate))
            stdout.write(str(pdfresource))
            all_pages = range(1, numpages + 1)
            for n in all_pages:
                if n != all_pages[-1]:
                    next_page = n + 1
                    canonical_next_page = '0' * (4 - len(str(next_page))) + \
                                          str(next_page)
                    canonical_next_page_name = join(k_id_directory,k_identifier  + \
                                               '_' + canonical_next_page)
                else:
                    next_page = None
                canonical_page = ('0' * (4 - len(str(n)))) + str(n)
                canonical_page_file_name = k_identifier + '_' + canonical_page
                page_name = join(k_id_directory,
                                 canonical_page_file_name)
                logger.info(page_name)
                providedcho.add_statement("dcterms:hasPart", "<{url}>". \
                            format(url = page_name))
                tiffile = [x for x in v if 'TIFF' in x.canonical_filepath
                           and ('_' + canonical_page) in x.canonical_filepath][0]
                ocrfile = [x for x in v if 'ALTO' in x.canonical_filepath
                           and ('_' + canonical_page) in x.canonical_filepath][0]
                jpegfile = [x for x in v if 'JPEG' in x.canonical_filepath
                            and ('_' + canonical_page) in x.canonical_filepath][0]
                page_providedcho = ProvidedCHO(page_name)
                page_aggregation = Aggregation(page_name)
                page_rem = ResourceMap(page_name)
                page_webresource = WebResource(join(tiffile.accession, 
                                                    tiffile.dirhead, 
                                                    tiffile.canonical_filepath))

                page_jpeg = RDFSResource(join(jpegfile.accession, 
                                              jpegfile.dirhead,
                                              jpegfile.canonical_filepath))

                page_ocr = RDFSResource(join(ocrfile.accession, ocrfile.dirhead,
                                            ocrfile.canonical_filepath))

                page_providedcho.add_statement("dc:description", "<{url}>". \
                                format(url = join(ocrfile.accession,
                                                  ocrfile.dirhead,
                                                  ocrfile.canonical_filepath)))

                page_providedcho.add_statement("dc:language", TextValue("en"))
                page_providedcho.add_statement("dc:rights", 
                                               rightsurl)
                page_providedcho.add_statement("dc:type", TextValue("Text"))
                page_providedcho.add_statement("edm:type", TextValue("TEXT"))
                page_providedcho.add_statement("dc:title", 
                                               TextValue("Page {number}". \
                                                    format(number = str(n))))
                page_providedcho.add_statement("dcterms:isPartOf",
                                               URL(providedcho.subject))
                if next_page:
                    page_providedcho.add_statement("edm:isNextInSequence",
                                    URL(join("/",canonical_next_page_name)))
                stdout.write(str(page_providedcho))
                page_aggregation.add_statement("edm:aggregatedCHO", 
                                               URL(page_providedcho.subject))
                page_aggregation.add_statement("edm:dataProvider",
                                    TextValue("University of Chicago Library"))
                page_aggregation.add_statement("edm:isShownBy", 
                                               URL(page_webresource.subject))
                page_aggregation.add_statement("edm:object", 
                                               URL(page_jpeg.subject))
                page_aggregation.add_statement("edm:provider", 
                                TextValue("University of Chicago Library"))
                page_aggregation.add_statement("edm:rights", 
                                               URL(rightsurl.subject))
                page_aggregation.add_statement("ore:isDescribedBy", 
                                               URL(page_rem.subject))
                stdout.write(str(page_aggregation))
                page_rem.add_statement("dc:created", 
                                       DateValue(createdate))
                page_rem.add_statement("dcterms:creator", 
                                       URL(repurl.subject))
                stdout.write(str(page_rem))
                page_webresource.add_statement("mix:fileSize", 
                                               IntegerValue(tiffile.file_size))
                page_webresource.add_statement("mix:formatName", 
                                               TextValue(tiffile.mimetype))
                if getattr(tiffile,'mixchecksum',None):
                    page_webresource.add_statement("mix:messageDigestAlgorithm", 
                                                   TextValue("MD5"))
                    page_webresource.add_statement("mix:messageDigest", 
                                                   TextValue(tiffile.mixchecksum))
                if getattr(tiffile,'imageheight',None):
                    page_webresource.add_statement("mix:imageHeight",
                                                   IntegerValue(int(tiffile.imageheight)))
                if getattr(tiffile,'imagewidth',None):
                    page_webresource.add_statement("mix:imageWidth", 
                                                   IntegerValue(int(tiffile.imagewidth)))
                if getattr(tiffile,'bitspersample',None):
                    page_webresource.add_statement("mix:bitsPerSample",
                                                   TextValue(tiffile.bitspersample))
                stdout.write(str(page_webresource))
                page_jpegresource = RDFSResource(join(jpegfile.accession,
                                                      jpegfile.dirhead,
                                                      jpegfile.canonical_filepath))
                page_ocrresource = RDFSResource(join(ocrfile.accession,
                                                     ocrfile.dirhead,
                                                     ocrfile.canonical_filepath))
                
                page_ocrresource.add_statement("dc:format",
                                               TextValue(ocrfile.mimetype))
                stdout.write(str(page_ocrresource))
                page_jpegresource.add_statement("dc:format", 
                                        TextValue(jpegfile.mimetype))
                stdout.write(str(page_jpegresource))
            stdout.write(str(providedcho))

        return 0
    except KeyboardInterrupt:
         logger.error("Program aborted manually")
         return 131
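# A minimal, standalone sketch of the page-naming convention used in the loop
# above: page numbers are zero-padded to four digits, appended to the object
# identifier, and every page except the last points at its successor via
# edm:isNextInSequence. The directory and identifier values below are
# hypothetical placeholders, not values taken from the repository.
def sketch_page_names(k_id_directory, k_identifier, numpages):
    from os.path import join
    pages = []
    for n in range(1, numpages + 1):
        canonical_page = str(n).zfill(4)              # e.g. 1 -> "0001"
        page_name = join(k_id_directory, k_identifier + '_' + canonical_page)
        next_page_name = None
        if n != numpages:                             # the last page has no successor
            next_page_name = join(k_id_directory,
                                  k_identifier + '_' + str(n + 1).zfill(4))
        pages.append((page_name, next_page_name))
    return pages

# sketch_page_names('obj-0001', 'obj-0001', 2) ->
# [('obj-0001/obj-0001_0001', 'obj-0001/obj-0001_0002'),
#  ('obj-0001/obj-0001_0002', None)]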
def main():
    parser = ArgumentParser(description="{description}". \
                            format(description=__description__),
                            epilog="Copyright University of Chicago; " + \
                            "written by {author} ". \
                            format(author = __author__) + \
                            " <{email}> University of Chicago". \
                            format(email = __email__))
    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    parser.add_argument( \
                         '-b','-verbose',help="set verbose logging",
                         action='store_const',dest='log_level',
                         const=INFO \
    )
    parser.add_argument( \
                         '-d','--debugging',help="set debugging logging",
                         action='store_const',dest='log_level',
                         const=DEBUG \
    )
    parser.add_argument( \
                         '-l','--log_loc',help="save logging to a file",
                         action="store_const",dest="log_loc",
                         const='./{progname}.log'. \
                         format(progname=argv[0]) \
    )
    parser.add_argument("-o","--object_level",
                        help="Enter the level at which object starts",
                        type=int,
                        action='store')
    parser.add_argument("-r", "--root",
                       help="Enter the root of the directory path",
                        action="store")
    parser.add_argument("directory_path", 
                        help="Enter a directory that you need to work on ",
                        action='store')

    parser.add_argument('pattern', help="Enter a pattern to filter files with",
                        action="store")
    global args
    args = parser.parse_args()
    log_format = Formatter( \
                            "[%(levelname)s] %(asctime)s  " + \
                            "= %(message)s",
                            datefmt="%Y-%m-%dT%H:%M:%S" \
    )
    global logger
    logger = getLogger( \
                        "lib.uchicago.repository.logger" \
    )
    ch = StreamHandler()
    ch.setFormatter(log_format)
    try:
        logger.setLevel(args.log_level)
    except TypeError:
        logger.setLevel(INFO)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)
    current_date = datetime.now()
    isof_current_date = current_date.strftime("%Y-%m-%dT%H:%M:%S")
    sixty_days_ago_date = current_date - timedelta(days=60)
    isof_sixty_days_ago_date = sixty_days_ago_date.strftime( \
                            "%Y-%m-%dT%H:%M:%S")
    db = Database("sqlite:////media/repo/repository/databases/" +  
                  "official/repositoryAccessions.db.new",tables_to_bind= \
                  ['record'])
                  

    class Record(db.base):
        __table__ = Table('record', db.metadata, autoload=True)
        
    b = Batch(args.root, directory = args.directory_path)
    difference_in_path = relpath(args.directory_path, args.root)

    query = db.session.query(Record.createdate).filter(Record.receipt == \
                                                       difference_in_path) 
    createdate = query.first()[0]
    items = b.find_items(from_directory = True, 
                         filterable = re_compile(args.pattern))
    b.set_items(items)
    try:
        generated_data = evaluate_items(b,createdate)
        count = 0
        objects = {}
        descriptive_metadata = '.dc.xml$'
        representation_file = '.pdf$'
        # substrings used to recognize the files that make up a single object
        file_definers = ['dc.xml','ALTO','TIFF','JPEG','pdf','mets.xml',
                         r'\d{4}.txt']
        file_definer_sequences = ['ALTO','TIFF','JPEG']
        page_number_pattern = r'_(\w{4})'
        for n in generated_data:
            id_parts = args.pattern.split('/')
            id_parts_enumerated = [x for x in range(args.object_level)]
            id_part_values = [n.canonical_filepath.split('/')[x] \
                              for x in id_parts_enumerated]
            identifier = "-".join(id_part_values)
            to_add = None
            for p in file_definers:
                if p in n.canonical_filepath:
                    to_add = n
                    break
            if to_add:
                if objects.get(identifier):
                    objects.get(identifier).append(n)
                else:
                    objects[identifier] = [n]
            else:
                logger.error("{fpath} in {id} could not be matched". \
                             format(fpath = n.canonical_filepath,
                                    id = identifier))
        for k, v in objects.items():
            for p in file_definer_sequences:
                sequence = sorted([(int(re_compile(page_number_pattern). \
                            search(x.canonical_filepath).group(1).lstrip('0')),
                             x.canonical_filepath) \
                            for x in v if p in x.canonical_filepath])
                if not sequence:
                    # no files of this sequence type at all; the completeness
                    # check below would raise an IndexError, and the missing
                    # part is reported by the file_definers loop further down
                    continue
                known_complete_page_range = list(range(1, sequence[-1][0]))
                what_is_actually_present = [x[0] for x in sequence]
                if set(known_complete_page_range) - \
                   set(what_is_actually_present):
                    difference = list(set(known_complete_page_range) - \
                                      set(what_is_actually_present))
                    l = [str(x) for x in list(difference)]
                    logger.error("The sequence part {part} ". \
                                 format(part = p) + 
                                 "is missing pages {pages}". \
                                 format(pages = ','.join(l)))
            for p in file_definers:
                seek = [x for x in v if p in x.canonical_filepath]
                if len(seek) == 0:
                    # report against this object's own key, not the stale
                    # identifier left over from the previous loop
                    logger.error("{identifier}". \
                                 format(identifier = k) + \
                                " missing part {part}".format(part = p))
            i = make_identifier(id_part_values, v)
            metadata = [x for x in v if re_compile(descriptive_metadata). \
                        search(x.canonical_filepath)][0]
            representation = [x for x in v if re_compile(representation_file). \
                        search(x.canonical_filepath)][0]
            providedcho = make_providedcho(i, metadata)
            print(providedcho)
            aggregation = make_aggregation(i, representation)
            print(aggregation)
            logger.info(i)

        return 0
    except KeyboardInterrupt:
         logger.error("Program aborted manually")
         return 131
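# A minimal sketch of the sequence-completeness check performed in the
# function above, assuming the page numbers have already been parsed out of
# the file names: every page from 1 up to the highest page observed should be
# present, and any gap is a missing page. The sample input is hypothetical.
def find_missing_pages(page_numbers_present):
    # the highest page observed bounds the expected range; pages 1..highest-1
    # are checked, since the highest page itself was just observed
    highest = max(page_numbers_present)
    expected = set(range(1, highest))
    return sorted(expected - set(page_numbers_present))

# find_missing_pages([1, 2, 5, 6]) -> [3, 4]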
def main():
    # start of parser boilerplate
    parser = ArgumentParser(description="A command line utility for staging physical media",
                            epilog="Copyright University of Chicago; " + \
                            "written by "+__author__ + \
                            " "+__email__)

    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged
    parser.add_argument( \
                         '-b','-verbose',help="set verbose logging",
                         action='store_const',dest='log_level',
                         const=INFO,default='INFO' \
    )
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument( \
                         '-d','--debugging',help="set debugging logging",
                         action='store_const',dest='log_level',
                         const=DEBUG,default='INFO' \
    )
    # optionally save the log to a file. set a location or use the default constant
    parser.add_argument( \
                         '-l','--log_loc',help="save logging to a file",
                         dest="log_loc" \
    )
    parser.add_argument( \
                         '--log_verb',help="Set a separate verbosity for the log written to disk, if desired",
                         dest="log_verb",default=None \
    )
    parser.add_argument("dest_root",help="Enter the destination root path",
                        action='store'
    )
    parser.add_argument("containing_folder",help="The name of the containing folder on disk (prefix+number)",
                        action='store'
    )
    parser.add_argument("--rehash",help="Disregard any existing previously generated hashes, recreate them on this run",
                        action="store_true"
    )
    args = parser.parse_args()
    log_format = Formatter( \
                            "[%(levelname)s] %(asctime)s  " + \
                            "= %(message)s",
                            datefmt="%Y-%m-%dT%H:%M:%S" \
    )
    global logger
    logger = getLogger( \
                        "lib.uchicago.repository.logger" \
    )
    ch = StreamHandler()
    ch.setFormatter(log_format)
    ch.setLevel(args.log_level)
    logger.setLevel('DEBUG')
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)
    try:
        shouldBeEAD=getImmediateSubDirs(args.dest_root)
        assert(len(shouldBeEAD)==1)
        shouldBeAccNo=getImmediateSubDirs(join(args.dest_root,shouldBeEAD[0]))
        assert(len(shouldBeAccNo)==1)
        stageRoot=join(join(args.dest_root,shouldBeEAD[0]),shouldBeAccNo[0])
        destinationAdminRoot=join(stageRoot,'admin/')
        destinationDataRoot=join(stageRoot,'data/')
        containing_folder=args.containing_folder
        destinationAdminFolder=join(destinationAdminRoot,containing_folder)
        destinationDataFolder=join(destinationDataRoot,containing_folder)

        stagingDebugLog = FileHandler(join(destinationAdminFolder,'log.txt'))
        stagingDebugLog.setFormatter(log_format)
        stagingDebugLog.setLevel('DEBUG')
        logger.addHandler(stagingDebugLog)

        logger.debug("Creating batch from moved files.")
        movedFiles=Batch(args.dest_root,directory=destinationDataFolder)
        
        existingMovedFileHashes={}
        movedFileHashes={}
        logger.info("Hashing copied files.")
        if exists(join(destinationAdminFolder,'fixityInStaging.txt')):
            with open(join(destinationAdminFolder,'fixityInStaging.txt'),'r') as f:
                if not args.rehash:
                    for line in f.readlines():
                        splitLine=line.split('\t')
                        if splitLine[1] != "ERROR":
                            existingMovedFileHashes[splitLine[0]]=[splitLine[1],splitLine[2].rstrip('\n')]
        with open(join(destinationAdminFolder,'fixityInStaging.txt'),'a') as f:
            for item in movedFiles.find_items(from_directory=True):
                if item.test_readability():
                    item.set_root_path(destinationDataFolder)
                    if relpath(item.get_file_path(),start=item.get_root_path()) not in existingMovedFileHashes:
                        item.set_sha256(item.find_sha256_hash())
                        item.set_md5(item.find_md5_hash())
                        movedFileHashes[relpath(item.get_file_path(),start=destinationDataFolder)]=[item.get_sha256(),item.get_md5()]
                else:
                    logger.warn("COULD NOT READ FILE: "+item.get_file_path())
                    movedFileHashes[relpath(item.get_file_path(),start=destinationDataFolder)]=["ERROR","ERROR"]
            for entry in movedFileHashes:
                f.write(entry+"\t"+movedFileHashes[entry][0]+'\t'+movedFileHashes[entry][1]+'\n')
        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
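# A minimal sketch of the fixityInStaging.txt bookkeeping used above: one line
# per file, tab-separated as "<relative path>\t<sha256>\t<md5>", with hashes
# recorded on a previous run reused unless a rehash is requested. The file
# layout comes from the code above; the helper functions themselves are
# illustrative only.
def read_existing_fixity(fixity_path, rehash=False):
    existing = {}
    if rehash:
        # disregard any previously generated hashes
        return existing
    try:
        with open(fixity_path, 'r') as f:
            for line in f:
                parts = line.rstrip('\n').split('\t')
                if len(parts) == 3 and parts[1] != "ERROR":
                    existing[parts[0]] = [parts[1], parts[2]]
    except FileNotFoundError:
        pass
    return existing

def append_fixity(fixity_path, new_hashes):
    # new_hashes maps a relative file path to a [sha256, md5] pair
    with open(fixity_path, 'a') as f:
        for relative_path, (sha256_hash, md5_hash) in new_hashes.items():
            f.write(relative_path + '\t' + sha256_hash + '\t' + md5_hash + '\n')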