	help='''The fraction of the corpus to use for training a binary or
multi-class classifier; the rest will be used for evaluation.
The default is to use the entire corpus, and to test the classifier
against the same training data. Any number < 1 will test against
the remaining fraction.''')

args = parser.parse_args()

###################
## corpus reader ##
###################

if args.trace:
	print('loading corpus %s' % args.corpus)

corpus = load_corpus_reader(args.corpus)

methods = {
	'sents': nltk_trainer.classification.corpus.category_sent_strings,
	'paras': nltk_trainer.classification.corpus.category_para_strings,
	'files': nltk_trainer.classification.corpus.category_file_strings,
}

cat_instances = methods[args.instances](corpus)

################
## CSV output ##
################

filename = args.filename
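# A minimal sketch of the CSV output step that plausibly follows, assuming
# cat_instances maps each category label to a list of instance strings
# (this writer loop is illustrative, not the script's own code):
import csv

with open(filename, 'w', newline='') as f:
	writer = csv.writer(f)
	for label, instances in cat_instances.items():
		for text in instances:
			writer.writerow([label, text])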
	reader_args.append(args.cat_pattern)
	reader_kwargs['cat_pattern'] = re.compile(args.cat_pattern)

if args.word_tokenizer:
	reader_kwargs['word_tokenizer'] = import_attr(args.word_tokenizer)()

if args.sent_tokenizer:
	reader_kwargs['sent_tokenizer'] = nltk.data.LazyLoader(args.sent_tokenizer)

if args.para_block_reader:
	reader_kwargs['para_block_reader'] = import_attr(args.para_block_reader)

if args.trace:
	print('loading %s' % args.corpus)

categorized_corpus = load_corpus_reader(args.corpus, args.reader,
	*reader_args, **reader_kwargs)

if not hasattr(categorized_corpus, 'categories'):
	raise ValueError('%s does not have categories for classification' % args.corpus)

if len(args.labels) > 0:
	labels = args.labels.split(',')
else:
	labels = categorized_corpus.categories()

nlabels = len(labels)

if args.trace:
	print('%d labels: %s' % (nlabels, labels))

if not nlabels:
	raise ValueError('corpus does not have any categories')
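# For illustration: load_corpus_reader with a cat_pattern ultimately builds
# an NLTK categorized reader. A hand-rolled equivalent for a plaintext
# corpus organized into category directories (paths here are hypothetical):
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

demo_reader = CategorizedPlaintextCorpusReader('corpora/movie_reviews',
	r'(?:pos|neg)/.*\.txt', cat_pattern=r'(pos|neg)/.*')
print(demo_reader.categories())  # ['neg', 'pos']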
corpus_group.add_argument('--tagset', default=None,
	help='Map tags to a given tagset, such as "universal"')

sort_group = parser.add_argument_group('Tag Count Sorting Options')
sort_group.add_argument('--sort', default='tag', choices=['tag', 'count'],
	help='Sort key, defaults to %(default)s')
sort_group.add_argument('--reverse', action='store_true', default=False,
	help='Sort in reverse order')

args = parser.parse_args()

###################
## corpus reader ##
###################

if args.trace:
	print('loading %s' % args.corpus)

tagged_corpus = load_corpus_reader(args.corpus, reader=args.reader, fileids=args.fileids)

if not tagged_corpus:
	raise ValueError('%s is an unknown corpus' % args.corpus)

##############
## counting ##
##############

wc = 0
tag_counts = collections.defaultdict(int)
taglen = 7
word_set = set()
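# A sketch of the counting loop these variables presumably feed, assuming
# the reader exposes the standard tagged_words() method (tracking the
# widest tag in taglen for column formatting is an assumption):
for word, tag in tagged_corpus.tagged_words():
	wc += 1
	word_set.add(word)
	tag_counts[tag] += 1
	taglen = max(taglen, len(tag) + 2)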
reader_args = []
reader_kwargs = {}

if args.word_tokenizer:
	reader_kwargs['word_tokenizer'] = import_attr(args.word_tokenizer)()

if args.sent_tokenizer:
	reader_kwargs['sent_tokenizer'] = nltk.data.LazyLoader(args.sent_tokenizer)

if args.para_block_reader:
	reader_kwargs['para_block_reader'] = import_attr(args.para_block_reader)

if args.trace:
	print('loading %s' % args.source_corpus)

input_corpus = load_corpus_reader(args.source_corpus, args.reader,
	*reader_args, **reader_kwargs)

#################
## translation ##
#################

for fileid in input_corpus.fileids():
	# TODO: use ~/nltk_data/corpora as dir prefix?
	path = os.path.join(args.target_corpus, fileid)
	dirname = os.path.dirname(path)

	if not os.path.exists(dirname):
		if args.trace:
			print('making directory %s' % dirname)

		os.makedirs(dirname)
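	# What plausibly follows inside this loop: translate the file's text and
	# write it to the mirrored path. A sketch assuming a translate(text,
	# source, target) helper, which is hypothetical here; the original
	# script relied on the long-defunct babelfish service.
	text = input_corpus.raw(fileid)
	translated = translate(text, args.source, args.target)  # hypothetical helper

	with open(path, 'w', encoding='utf-8') as f:
		f.write(translated)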
eval_group = parser.add_argument_group('Tagger Evaluation',
	'Evaluation metrics for part-of-speech taggers')
eval_group.add_argument('--no-eval', action='store_true', default=False,
	help="don't do any evaluation")
# TODO: word coverage of test words, how many get a tag != '-NONE-'

args = parser.parse_args()

###################
## corpus reader ##
###################

if args.trace:
	print('loading %s' % args.corpus)

tagged_corpus = load_corpus_reader(args.corpus, reader=args.reader, fileids=args.fileids)
fileids = args.fileids
kwargs = {}

# all other corpora are assumed to support the simplify_tags kwarg
if simplify_wsj_tag and args.simplify_tags and args.corpus not in ['conll2000', 'switchboard', 'pl196x']:
	kwargs['simplify_tags'] = True
# these corpora do not support simplify_tags, and have no known workaround
elif simplify_wsj_tag and args.simplify_tags and args.corpus in ['pl196x']:
	raise ValueError('%s does not support simplify_tags' % args.corpus)
elif not simplify_wsj_tag and args.tagset:
	kwargs['tagset'] = args.tagset

	if args.trace:
		print('using %s tagset' % args.tagset)
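# A sketch of how evaluation plausibly proceeds from here: pull tagged
# sentences using the kwargs assembled above and score a tagger against
# them. TaggerI.evaluate() is standard NLTK (accuracy() in newer releases);
# the tagger variable is assumed, not defined in this fragment.
if not args.no_eval:
	tagged_sents = tagged_corpus.tagged_sents(**kwargs)
	print('accuracy: %f' % tagger.evaluate(tagged_sents))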
	help='Full module path to a corpus reader class, defaults to %(default)s.')
corpus_group.add_argument('--fileids', default=None,
	help='Specify fileids to load from corpus')
corpus_group.add_argument('--sent-tokenizer', default='tokenizers/punkt/english.pickle',
	help='Path to pickled sentence tokenizer')
corpus_group.add_argument('--word-tokenizer', default='nltk.tokenize.WordPunctTokenizer',
	help='Full module path to a tokenizer class, defaults to %(default)s.')

args = parser.parse_args()

###################
## corpus reader ##
###################

source_corpus = load_corpus_reader(args.source_corpus, reader=args.reader,
	fileids=args.fileids, encoding='utf-8', sent_tokenizer=args.sent_tokenizer,
	word_tokenizer=args.word_tokenizer)

if not source_corpus:
	raise ValueError('%s is an unknown corpus' % args.source_corpus)

if args.trace:
	print('loaded %s' % args.source_corpus)

############
## tagger ##
############

# TODO: from analyze_tagger_coverage.py
if args.trace:
	print('loading tagger %s' % args.tagger)
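# The load itself most likely goes through nltk.data.load, which resolves a
# path such as 'taggers/mytagger.pickle' relative to nltk_data and unpickles
# it (a sketch; the exact path handling may differ):
tagger = nltk.data.load(args.tagger)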
trans_group = parser.add_argument_group('Language Translation')
trans_group.add_argument('--source', default='english', choices=langs,
	help='source language')
trans_group.add_argument('--target', default=None, choices=langs,
	help='target language')
trans_group.add_argument('--retries', default=3, type=int,
	help='Number of babelfish retries before quitting')
trans_group.add_argument('--sleep', default=3, type=int,
	help='Sleep time between retries')

args = parser.parse_args()

###################
## corpus reader ##
###################

source_corpus = load_corpus_reader(args.source_corpus, args.reader)

if not source_corpus:
	raise ValueError('%s is an unknown corpus' % args.source_corpus)

if args.trace:
	print('loaded %s' % args.source_corpus)

########################
## text normalization ##
########################

# TODO: copied from analyze_classifier_coverage, so abstract
if args.filter_stopwords == 'no':
	stopset = set()
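# The else branch presumably builds the stop set from NLTK's stopwords
# corpus, keyed by language name; a sketch using the standard API:
else:
	stopset = set(nltk.corpus.stopwords.words(args.filter_stopwords))

# An illustrative normalization pass over a token list using that stop set:
def norm_words(words):
	return [w.lower() for w in words if w.lower() not in stopset]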