def main():
    """Print the highest-scoring TF-IDF terms for each text document in a batch.

    Parses the command line, configures the module-level ``logger``, gathers
    every batch item whose file name contains ``.presform.txt`` into a
    ``TextBatch``, computes TF-IDF scores and then prints, for each key of the
    TF-IDF table, the key followed by a comma-separated line of its
    top-scoring terms.

    Returns:
        0 on completion, 131 if the run is interrupted with Ctrl-C.

    NOTE(review): another ``main`` is defined later in this file and will
    replace this one when the module is loaded -- confirm which is intended.
    """
    # start of parser boilerplate
    parser = ArgumentParser(
        description="Produce TFIDF numbers for terms in the text preservation formats in a batch",
        epilog="Copyright University of Chicago; " +
               "written by " + __author__ +
               " " + __email__)

    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged. '-verbose' (single dash) is kept for backward
    # compatibility; '--verbose' matches the style of the other long options.
    parser.add_argument(
        '-b', '-verbose', '--verbose', help="set verbose logging",
        action='store_const', dest='log_level',
        const=INFO, default='INFO'
    )
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument(
        '-d', '--debugging', help="set debugging logging",
        action='store_const', dest='log_level',
        const=DEBUG, default='INFO'
    )
    # optionally save the log to a file
    parser.add_argument(
        '-l', '--log_loc', help="save logging to a file",
        dest="log_loc",
    )
    parser.add_argument(
        "item",
        help="Enter a noid for an accession or a "
             "directory path that you need to validate against"
             " a type of controlled collection"
    )
    parser.add_argument("root", help="Enter the root of the directory path",
                        action="store")
    args = parser.parse_args()

    log_format = Formatter(
        "[%(levelname)s] %(asctime)s  "
        "= %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%S"
    )
    # The logger is published as a module global so code outside main can use it.
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    logger.setLevel(args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    try:
        b = Batch(args.root, args.item)
        textDocs = TextBatch(args.item, args.root)
        # Only the text preservation-format files take part in the analysis.
        for item in b.find_items(from_directory=True):
            if ".presform.txt" in item.find_file_name():
                textDocs.add_item(
                    TextItem(item.get_file_path(), item.get_root_path()))

        if textDocs.validate_items():
            logger.info("Getting document term indices")
            # term_map is shared across documents and grows as each one is
            # indexed.
            term_map = {}
            for item in textDocs.get_items():
                item.set_raw_string(item.find_raw_string())
                indexOut = item.find_index(purge_raw=True, scrub_text=True,
                                           term_map=term_map)
                item.set_index(indexOut[0])
                term_map.update(indexOut[1])
            textDocs.set_term_map(term_map)
            logger.info("Getting IDFs")
            textDocs.set_doc_counts(textDocs.find_doc_counts())
            textDocs.set_idfs(textDocs.find_idfs())
            logger.info("Computing TFIDFs")
            textDocs.set_tf_idfs(textDocs.find_tf_idfs())
            textDocs.rev_term_map()

            # Fetch the tables once instead of on every loop iteration.
            tf_idfs = textDocs.get_tf_idfs()
            terms_by_id = textDocs.get_term_map()
            top_n = 9  # number of terms printed per document
            for key in tf_idfs:
                print(key)
                scores = tf_idfs[key]
                ranked = sorted(
                    [(term_id, scores[term_id]) for term_id in scores],
                    key=lambda pair: pair[1],
                    reverse=True,
                )
                top_terms = [terms_by_id[term_id]
                             for term_id, _ in ranked[:top_n]]
                print(",".join(top_terms) + "\n")

        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131
def main():
    """Rank the documents of a batch by similarity to a reference language model.

    Parses the command line, configures the module-level ``logger``, builds a
    language model and vector space model from the ``.presform.txt`` files of
    the restriction batch (``restritem``/``restrroot``), then builds TF-IDF
    document vectors for the ``.presform.txt`` files of the target batch
    (``item``/``root``) and prints ``<document>: <similarity>`` lines sorted
    by ascending similarity value.

    Returns:
        0 on completion (including when the target batch fails validation),
        1 if the restriction batch fails validation (no model can be built),
        131 if the run is interrupted with Ctrl-C.
    """
    # start of parser boilerplate
    parser = ArgumentParser(
        description="Produce TFIDF numbers for terms in the text preservation formats in a batch",
        epilog="Copyright University of Chicago; " +
               "written by " + __author__ +
               " " + __email__)

    parser.add_argument("-v", help="See the version of this program",
                        action="version", version=__version__)
    # let the user decide the verbosity level of logging statements
    # -b sets it to INFO so warnings, errors and generic informative statements
    # will be logged. '-verbose' (single dash) is kept for backward
    # compatibility; '--verbose' matches the style of the other long options.
    parser.add_argument(
        '-b', '-verbose', '--verbose', help="set verbose logging",
        action='store_const', dest='log_level',
        const=INFO, default='INFO'
    )
    # -d is debugging so anything you want to use a debugger gets logged if you
    # use this level
    parser.add_argument(
        '-d', '--debugging', help="set debugging logging",
        action='store_const', dest='log_level',
        const=DEBUG, default='INFO'
    )
    # optionally save the log to a file
    parser.add_argument(
        '-l', '--log_loc', help="save logging to a file",
        dest="log_loc",
    )
    parser.add_argument(
        "restritem",
        help="Enter a noid for an accession or a "
             "directory path that you need to validate against"
             " a type of controlled collection"
    )
    parser.add_argument("restrroot",
                        help="Enter the root of the directory path",
                        action="store")
    parser.add_argument(
        "item",
        help="Enter a noid for an accession or a "
             "directory path that you need to validate against"
             " a type of controlled collection"
    )
    parser.add_argument("root", help="Enter the root of the directory path",
                        action="store")
    args = parser.parse_args()

    log_format = Formatter(
        "[%(levelname)s] %(asctime)s  "
        "= %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%S"
    )
    # The logger is published as a module global so code outside main can use it.
    global logger
    logger = getLogger("lib.uchicago.repository.logger")
    ch = StreamHandler()
    ch.setFormatter(log_format)
    logger.setLevel(args.log_level)
    if args.log_loc:
        fh = FileHandler(args.log_loc)
        fh.setFormatter(log_format)
        logger.addHandler(fh)
    logger.addHandler(ch)

    try:
        args.restritem = abspath(args.restritem)
        args.restrroot = abspath(args.restrroot)
        args.item = abspath(args.item)
        args.root = abspath(args.root)

        # --- Stage 1: language model from the restriction document set ---
        b = Batch(args.restrroot, args.restritem)
        restrDocs = TextBatch(args.restritem, args.restrroot)
        for item in b.find_items(from_directory=True):
            if ".presform.txt" in item.find_file_name():
                restrDocs.add_item(
                    TextItem(item.get_file_path(), item.get_root_path()))

        # Everything below needs term_map and the restriction model; without
        # them the later steps would fail with a NameError on term_map, so
        # stop here with an explicit error instead.
        if not restrDocs.validate_items():
            logger.error("Restriction document set failed validation; "
                         "cannot build a language model.")
            return 1

        logger.info("Generating language model from provided document set.")
        logger.info("Getting document term indices")
        # term_map is shared across documents and grows as each one is indexed.
        term_map = {}
        for item in restrDocs.get_items():
            item.set_raw_string(item.find_raw_string())
            indexOut = item.find_index(purge_raw=True, scrub_text=False,
                                       stem=False, term_map=term_map)
            item.set_index(indexOut[0])
            term_map.update(indexOut[1])
        restrDocs.set_term_map(term_map)
        logger.info("Generating corpus term index")
        restrDocs.set_term_index(restrDocs.find_term_index())
        logger.info("Getting iIDFs")
        restrDocs.set_doc_counts(restrDocs.find_doc_counts())
        restrDocs.set_iIdfs(restrDocs.find_iIdfs())
        logger.info("Computing Language Model")
        restrDocs.set_language_model(restrDocs.find_language_model())
        logger.info("Computing LM VSM")
        restrDocs.set_vector_space_model(restrDocs.find_vector_space_model())

        # --- Stage 2: TF-IDF vectors for the target batch ---
        c = Batch(args.root, args.item)
        # NOTE(review): arguments here are (root, item) while the TextBatch
        # call for the restriction set passes (item, root) -- confirm the
        # intended order against the TextBatch signature.
        Docs = TextBatch(args.root, args.item)
        for item in c.find_items(from_directory=True):
            if ".presform.txt" in item.find_file_name():
                Docs.add_item(
                    TextItem(item.get_file_path(), item.get_root_path()))

        if Docs.validate_items():
            logger.info("Generating TFIDF models for each document in the batch.")
            logger.info("Getting document term indices")
            doc_items = Docs.get_items()
            tote = len(doc_items)
            for i, item in enumerate(doc_items, 1):
                # "\r" with end="" rewrites a single progress line in place.
                print("\r"+str(i)+"/"+str(tote)+" - "+item.get_file_path(), end="")
                item.set_raw_string(item.find_raw_string())
                # The restriction term_map is reused and not updated here.
                indexOut = item.find_index(purge_raw=True, scrub_text=False,
                                           stem=False, term_map=term_map,
                                           only_mapped=True)
                item.set_index(indexOut[0])
            print()
            logger.info("Getting IDFs")
            Docs.set_doc_counts(Docs.find_doc_counts())
            Docs.set_idfs(Docs.find_idfs())
            logger.info("Computing TFIDFs")
            Docs.set_tf_idfs(Docs.find_tf_idfs())
            logger.info("Generating document vector space models.")
            Docs.set_document_vector_space_models(
                Docs.find_document_vector_space_models())

            # --- Stage 3: compare each document with the restriction model ---
            logger.info("Computing similarity metrics.")
            doc_models = Docs.get_document_vector_space_models()
            rels = [
                (document, restrDocs.find_similarity(doc_models[document]))
                for document in doc_models
            ]
            logger.info("Sorting similarity metrics for output")
            rels.sort(key=itemgetter(1))
            for entry in rels:
                print(entry[0]+": "+str(entry[1]))

        return 0
    except KeyboardInterrupt:
        logger.error("Program aborted manually")
        return 131