# Score every volume in the collection against the lexicon and tally the
# distribution of low-frequency match percentages.
# (Reconstructed from a whitespace-mangled chunk: original line breaks were
# lost; statement order below follows the visible source exactly.)
collection = args.COLLECTION
csvfile = args.OUTFILE
idfile = args.IDFILE
ptp = PairTreePathFinder(collection)
lexicon = Dictionary.BuildLexicon()
if not idfile:
    # default identifier file lives inside the collection directory
    idfile = os.path.join(collection, 'id')

scores = {}          # htid -> rounded low-frequency match percentage
counter = Counter()  # distribution of the rounded percentages

with open(csvfile, 'w', encoding='utf-8') as csvf:
    csvwriter = csv.writer(csvf)
    for i, htid in enumerate(file_id_iter(idfile, 'r')):
        try:
            path, post = ptp.get_path_to_htid(htid)
        except ValueError as ve:
            # Unresolvable id: report and move on, as the original did.
            print(ve)
            continue
        path = os.path.join(path, post + ".txt")
        try:
            with open(path, encoding='utf-8') as f:
                text = f.readlines()
        except OSError as err:
            # NOTE(review): the source chunk was truncated before its
            # `except` clause; this handler mirrors the ValueError handling
            # above -- confirm against the original script.
            print(err)
            continue
        _, _, _, lowcount, lowmatch, _ = AccEval.GetScore(text, lexicon)
        if not lowcount:
            # Fixed: previously `lowmatch / lowcount` raised
            # ZeroDivisionError on texts with no countable tokens.
            continue
        pct = round(lowmatch / lowcount * 100, 1)
        scores[htid] = pct
        counter[pct] += 1
# Map publication years or subjects over the MARC database and write the
# results out as CSV (optionally JSON as well).
# NOTE(review): this chunk began mid-call; the header of the JSON-output
# option is reconstructed from the later `args.JSON_OUT` usage -- confirm
# the flag spelling against the original script.
parser.add_argument("--json-out", "-j", metavar="JSON_OUT", dest="JSON_OUT",
                    help="Output a JSON result file in addition to the default csv file.")
parser.add_argument("--id-file", "-i", metavar="ID_FILE", dest="ID_FILE",
                    help="Analyze the ids contained in ID_FILE rather than the entire database.")
args = parser.parse_args()

if not os.path.exists(args.DATABASE):
    # Fixed: Python 2 `print "..."` statement (a SyntaxError under Python 3,
    # which the rest of this file targets) -> print() function.
    print("database {} does not exist".format(args.DATABASE))
    # Fixed: bare sys.exit() reported success (0) on this error path.
    sys.exit(1)

with MarcSQLite(args.DATABASE) as db:
    # Restrict to an explicit id list when one was supplied.
    ids = file_id_iter(args.ID_FILE) if args.ID_FILE else None
    if args.MAPPING == 'years':
        mapper = map_publication_years
    elif args.MAPPING == 'subjects':
        mapper = map_subjects
    else:
        # Fixed: `mapper` was left unbound here, producing an opaque
        # NameError below. Presumably argparse `choices` prevents this
        # in practice -- TODO confirm.
        raise ValueError("unknown mapping: {}".format(args.MAPPING))
    map_onto_records(mapper, db, args.CSV_OUT,
                     json_fname=args.JSON_OUT, ids=ids)
# Collate every id in the id file into the collection and print the result.
# NOTE(review): this chunk began mid-call; the flag name
# '--rewrite-existing' is inferred from the later `args.rewrite_existing`
# access -- confirm against the original script.
parser.add_argument('--rewrite-existing', action='store_true',
                    help='Overwrite existing collated documents.')
parser.add_argument('--no-divs', action='store_true',
                    help='If specified, do not write page or header divisions to the collation.')
parser.add_argument('--skip', type=int, default=0,
                    help='Number of lines in the id file to skip; eg after an interrupted collate.')
args = parser.parse_args()

collection = args.COLLECTION
rewrite_existing = args.rewrite_existing
include_divs = not args.no_divs
id_file = args.ID_FILE
if not id_file:
    # default identifier file lives inside the collection directory
    id_file = os.path.join(collection, 'id')

ids = file_id_iter(id_file)
print(bigcollate(ids, collection,
                 rewrite_existing=rewrite_existing,
                 include_divs=include_divs,
                 skip=args.skip))
# Removed a dangling `os.path.join(os.path.expanduser("~"), ".collate_resume")`
# expression whose result was discarded -- it had no effect (apparently a
# leftover from an interrupted-collate resume feature).