    help='Full module path to a corpus reader class, such as %(default)s')
corpus_group.add_argument('--word-tokenizer', default='',
    help='Word Tokenizer class path')
corpus_group.add_argument('--sent-tokenizer', default='',
    help='Sent Tokenizer data.pickle path')
corpus_group.add_argument('--para-block-reader', default='',
    help='Block reader function path')

args = parser.parse_args()

###################
## corpus reader ##
###################

reader_args = []
reader_kwargs = {}

# Optional reader customizations, each loaded from a dotted module path
if args.word_tokenizer:
    reader_kwargs['word_tokenizer'] = import_attr(args.word_tokenizer)()

if args.sent_tokenizer:
    reader_kwargs['sent_tokenizer'] = nltk.data.LazyLoader(args.sent_tokenizer)

if args.para_block_reader:
    reader_kwargs['para_block_reader'] = import_attr(args.para_block_reader)

if args.trace:
    print('loading %s' % args.source_corpus)

input_corpus = load_corpus_reader(args.source_corpus, args.reader,
    *reader_args, **reader_kwargs)

#################
## translation ##
#################
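# A minimal sketch of the dotted-path import helper used above (import_attr);
# this is an assumption about its behavior, not nltk-trainer's actual
# implementation, and the class path in the usage note is illustrative only.
import importlib

def import_attr(path):
    # 'nltk.tokenize.TreebankWordTokenizer' -> module 'nltk.tokenize',
    # attribute 'TreebankWordTokenizer'
    module_path, _, attr_name = path.rpartition('.')
    return getattr(importlib.import_module(module_path), attr_name)

# Usage: import_attr('nltk.tokenize.TreebankWordTokenizer')() builds a word
# tokenizer instance from the path given via --word-tokenizer.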
if args.cat_file:
    reader_kwargs['cat_file'] = args.cat_file

    if args.delimiter and args.delimiter != ' ':
        reader_kwargs['delimiter'] = args.delimiter

    # cat_file readers take the fileid pattern as a positional argument
    if args.cat_pattern:
        reader_args.append(args.cat_pattern)
    else:
        reader_args.append('.+/.+')
elif args.cat_pattern:
    reader_args.append(args.cat_pattern)
    reader_kwargs['cat_pattern'] = re.compile(args.cat_pattern)

if args.word_tokenizer:
    reader_kwargs['word_tokenizer'] = import_attr(args.word_tokenizer)()

if args.sent_tokenizer:
    reader_kwargs['sent_tokenizer'] = nltk.data.LazyLoader(args.sent_tokenizer)

if args.para_block_reader:
    reader_kwargs['para_block_reader'] = import_attr(args.para_block_reader)

if args.trace:
    print('loading %s' % args.corpus)

categorized_corpus = load_corpus_reader(args.corpus, args.reader,
    *reader_args, **reader_kwargs)

if not hasattr(categorized_corpus, 'categories'):
    raise ValueError('%s does not have categories for classification' % args.corpus)
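# Hedged illustration of what a cat_pattern does, using NLTK's
# CategorizedPlaintextCorpusReader directly: the regex's first capture group,
# matched against each fileid, becomes that file's category. The corpus root
# below is hypothetical.
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

demo_reader = CategorizedPlaintextCorpusReader('/path/to/corpus',
    r'.+\.txt', cat_pattern=r'(\w+)/.+')

# Given files like pos/review1.txt and neg/review2.txt:
# demo_reader.categories() -> ['neg', 'pos']
# demo_reader.fileids(categories=['pos']) -> fileids under pos/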