Ejemplo n.º 1
0
    tm = None
    if tagmap:
        tm = TagMap(tagmap)

    # Initialize the worker pool (sized to the CPU count) and queue one task per file...
    p = Pool(cpu_count())
    for path in filelist:
        p.apply_async(process_file, args=[path, tm, delimeter], callback=merge_counts)
        # result = p.apply(process_file, args=[path, tm, delimeter])
        # merge_counts(result)

    p.close()
    p.join()


    # Now, dump the pickled POSEvalDict.
    print("Writing out dictionary...", end=' ')
    pickle.dump(c, open(output, 'wb'))
    print("Done.")
    print("{} tokens processed, {} sentences.".format(counts['tokens'], counts['lines']))

if __name__ == '__main__':
    # Command-line interface: one or more input files, a required output
    # path, and an optional tag map.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'FILE',
        type=globfiles,
        nargs='+',
        help='Slashtag files for input',
    )
    parser.add_argument(
        '-o',
        dest='output',
        required=True,
        help='Destination for pickled POS dict',
    )
    parser.add_argument(
        '-t', '--tagmap',
        type=existsfile,
        help='Tag Map for tags',
    )

    args = parser.parse_args()

    # Every FILE argument is converted by `globfiles` (presumably into a list
    # of matching paths -- confirm), so the collected values are flattened
    # into a single list before the dictionary is built.
    input_paths = flatten_list(args.FILE)
    create_dictionary(input_paths, args.output, args.tagmap)
Ejemplo n.º 2
0
# Decide on action based on subcommand and args. -------------------------------
# Script-level dispatch: configure logging verbosity, then run the handler
# that matches `args.subcommand`.

# ===============================================================================
# Set verbosity level
# ===============================================================================

# Each step of `args.verbose` lowers the root logger threshold by 10, capped
# at two steps: 0 -> WARNING, 1 -> INFO, 2 or more -> DEBUG.
# NOTE(review): assumes `args.verbose` is an int (e.g. an argparse `count`
# option with a default of 0); `min(None, 2)` would raise TypeError -- confirm.
logging.getLogger().setLevel(logging.WARNING - 10 * (min(args.verbose, 2)))

# ENRICH
# Every parsed argument is forwarded to `enrich` as a keyword argument.
if args.subcommand == CMD_ENRICH:
    enrich(**vars(args))

# STATS
# Runs `igt_stats` over the flattened FILE list, with type='xigt' and
# filenames shown.
elif args.subcommand == CMD_STATS:
    igt_stats(flatten_list(args.FILE), type='xigt', show_filename=True)

# SPLIT
# Passes the flattened FILE list plus the train/dev/test values and the
# prefix, overwrite and nfold options to `split_corpus`.
elif args.subcommand == CMD_SPLIT:
    split_corpus(flatten_list(args.FILE), args.train, args.dev, args.test, prefix=args.prefix, overwrite=args.overwrite,
                 nfold=args.nfold)

# FILTER
# The input and output values are looked up by the attribute names stored in
# ARG_INFILE / ARG_OUTFILE; all parsed arguments are also forwarded as keywords.
elif args.subcommand == CMD_FILTER:
    filter_corpus(flatten_list(getattr(args, ARG_INFILE)), getattr(args, ARG_OUTFILE), **vars(args))

# EXTRACT
# NOTE(review): `input_filelist` is passed explicitly alongside `**vars(args)`;
# this raises TypeError if the namespace ever gains an attribute of the same
# name -- confirm against the argument parser.
elif args.subcommand == CMD_EXTRACT:
    extract_from_xigt(input_filelist=flatten_list(args.FILE), **vars(args))

# EVAL