# NOTE(review): the statements down to the "tokens processed" report appear to
# be the tail of create_dictionary(), whose header lies above this excerpt.
# `tagmap`, `filelist`, `delimeter`, `output`, `c`, `counts`, `process_file`
# and `merge_counts` are all defined outside this view -- re-indent these
# statements to match that enclosing scope.

# Only build a TagMap when a tag-map argument was supplied; otherwise the
# workers are handed None.
tm = None
if tagmap:
    tm = TagMap(tagmap)

# Submit one asynchronous task per input file to a pool created with
# cpu_count() workers. Each task's result is passed to merge_counts through
# the callback.
pool = Pool(cpu_count())
for path in filelist:
    pool.apply_async(process_file, args=[path, tm, delimeter], callback=merge_counts)
    # Serial equivalent, kept for debugging:
    #   result = pool.apply(process_file, args=[path, tm, delimeter])
    #   merge_counts(result)

# Stop accepting new tasks, then wait for the submitted ones
# (assuming multiprocessing.Pool semantics -- confirm against the import).
pool.close()
pool.join()

# Now, dump the pickled POSEvalDict.
# flush=True so the progress message is visible before the dump starts,
# since the line is not newline-terminated.
print("Writing out dictionary...", end=' ', flush=True)
# Use a context manager so the output file is closed (and therefore fully
# written) even if pickling raises.
with open(output, 'wb') as dict_file:
    pickle.dump(c, dict_file)
print("Done.")
print("{} tokens processed, {} sentences.".format(counts['tokens'], counts['lines']))


if __name__ == '__main__':
    # Command-line entry point: one or more input files, a required output
    # path for the pickled dictionary, and an optional tag map.
    parser = argparse.ArgumentParser()
    parser.add_argument('FILE', nargs='+', help='Slashtag files for input', type=globfiles)
    parser.add_argument('-o', dest='output', help='Destination for pickled POS dict', required=True)
    parser.add_argument('-t', '--tagmap', help='Tag Map for tags', type=existsfile)
    args = parser.parse_args()

    # Presumably globfiles returns a list per FILE argument, which is why the
    # parsed value is flattened before use -- confirm against globfiles.
    create_dictionary(flatten_list(args.FILE), args.output, args.tagmap)
# Decide on action based on subcommand and args. ------------------------------- # =============================================================================== # Set verbosity level # =============================================================================== logging.getLogger().setLevel(logging.WARNING - 10 * (min(args.verbose, 2))) # ENRICH if args.subcommand == CMD_ENRICH: enrich(**vars(args)) # STATS elif args.subcommand == CMD_STATS: igt_stats(flatten_list(args.FILE), type='xigt', show_filename=True) # SPLIT elif args.subcommand == CMD_SPLIT: split_corpus(flatten_list(args.FILE), args.train, args.dev, args.test, prefix=args.prefix, overwrite=args.overwrite, nfold=args.nfold) # FILTER elif args.subcommand == CMD_FILTER: filter_corpus(flatten_list(getattr(args, ARG_INFILE)), getattr(args, ARG_OUTFILE), **vars(args)) # EXTRACT elif args.subcommand == CMD_EXTRACT: extract_from_xigt(input_filelist=flatten_list(args.FILE), **vars(args)) # EVAL