Ejemplo n.º 1
0
    help='''The fraction of the corpus to use for training a binary or
	multi-class classifier, the rest will be used for evaulation.
	The default is to use the entire corpus, and to test the classifier
	against the same training data. Any number < 1 will test against
	the remaining fraction.''')

# Parse the command line; every step below is driven by ``args``.
args = parser.parse_args()

###################
## corpus reader ##
###################

if args.trace:
    print('loading corpus %s' % args.corpus)

corpus = load_corpus_reader(args.corpus)

# Dispatch table keyed by ``args.instances``; the selected nltk_trainer
# function is called with the loaded corpus.
methods = dict(
    sents=nltk_trainer.classification.corpus.category_sent_strings,
    paras=nltk_trainer.classification.corpus.category_para_strings,
    files=nltk_trainer.classification.corpus.category_file_strings,
)

cat_instances = methods[args.instances](corpus)

################
## CSV output ##
################

# Output file name taken from the parsed arguments.
filename = args.filename
Ejemplo n.º 2
0
    reader_args.append(args.cat_pattern)
    reader_kwargs['cat_pattern'] = re.compile(args.cat_pattern)

# Optional reader components: each one is forwarded to the corpus reader
# only when the corresponding argument is set.
if args.word_tokenizer:
    # NOTE(review): import_attr presumably resolves a dotted path to a class,
    # which is instantiated here -- confirm against its definition.
    reader_kwargs['word_tokenizer'] = import_attr(args.word_tokenizer)()

if args.sent_tokenizer:
    reader_kwargs['sent_tokenizer'] = nltk.data.LazyLoader(args.sent_tokenizer)

if args.para_block_reader:
    # Stored as returned by import_attr (it is not called here).
    reader_kwargs['para_block_reader'] = import_attr(args.para_block_reader)

if args.trace:
    print('loading %s' % args.corpus)

categorized_corpus = load_corpus_reader(args.corpus, args.reader, *reader_args,
                                        **reader_kwargs)

if not hasattr(categorized_corpus, 'categories'):
    # Interpolate the corpus name so the error says which corpus failed.
    raise ValueError('%s does not have categories for classification'
                     % args.corpus)

# Explicitly requested labels take precedence over the corpus categories.
# The truthiness test also covers an unset (None) value, which len() would
# reject with a TypeError.
if args.labels:
    labels = args.labels.split(",")
else:
    labels = categorized_corpus.categories()
nlabels = len(labels)

if args.trace:
    print('%d labels: %s' % (nlabels, labels))

if not nlabels:
    raise ValueError('corpus does not have any categories')
Ejemplo n.º 3
0
	corpus_group.add_argument('--tagset', default=None,
		help='Map tags to a given tagset, such as "universal"')

# Options controlling how the per-tag counts are ordered.
sort_group = parser.add_argument_group('Tag Count Sorting Options')
sort_group.add_argument('--sort', default='tag', choices=['tag', 'count'],
	help='Sort key, defaults to %(default)s')
sort_group.add_argument('--reverse', action='store_true', default=False,
	help='Sort in reverse order')

args = parser.parse_args()

###################
## corpus reader ##
###################

tagged_corpus = load_corpus_reader(args.corpus, reader=args.reader, fileids=args.fileids)

if not tagged_corpus:
	# Interpolate the corpus name so the error says which corpus is unknown.
	raise ValueError('%s is an unknown corpus' % args.corpus)

if args.trace:
	print('loading %s' % args.corpus)

##############
## counting ##
##############

# Accumulators for the counting pass that follows.
wc = 0
tag_counts = collections.defaultdict(int)
# NOTE(review): presumably the initial column width for printed tags -- confirm.
taglen = 7
word_set = set()
# Positional and keyword arguments forwarded to the corpus reader.
reader_args = []
reader_kwargs = {}

if args.word_tokenizer:
	# NOTE(review): import_attr presumably resolves a dotted path to a class,
	# which is instantiated here -- confirm against its definition.
	reader_kwargs['word_tokenizer'] = import_attr(args.word_tokenizer)()

if args.sent_tokenizer:
	reader_kwargs['sent_tokenizer'] = nltk.data.LazyLoader(args.sent_tokenizer)

if args.para_block_reader:
	# Stored as returned by import_attr (it is not called here).
	reader_kwargs['para_block_reader'] = import_attr(args.para_block_reader)

if args.trace:
	# print() with a single argument behaves the same on Python 2 and 3.
	print('loading %s' % args.source_corpus)

input_corpus = load_corpus_reader(args.source_corpus, args.reader,
	*reader_args, **reader_kwargs)

#################
## translation ##
#################

for fileid in input_corpus.fileids():
	# TODO: use ~/nltk_data/corpora as dir prefix?
	path = os.path.join(args.target_corpus, fileid)
	dirname = os.path.dirname(path)
	
	# An empty dirname means the path has no directory part; os.makedirs('')
	# would raise, so there is nothing to create in that case.
	if dirname and not os.path.exists(dirname):
		if args.trace:
			print('making directory %s' % dirname)
		
		os.makedirs(dirname)
Ejemplo n.º 5
0
# Command-line switches for the evaluation step.
eval_group = parser.add_argument_group(
	'Tagger Evaluation',
	'Evaluation metrics for part-of-speech taggers',
)
eval_group.add_argument(
	'--no-eval',
	action='store_true',
	default=False,
	help="don't do any evaluation",
)
# TODO: word coverage of test words, how many get a tag != '-NONE-'

args = parser.parse_args()

###################
## corpus reader ##
###################

if args.trace:
	print('loading %s' % args.corpus)

tagged_corpus = load_corpus_reader(
	args.corpus,
	reader=args.reader,
	fileids=args.fileids,
)
kwargs = {}
fileids = args.fileids

# Fill ``kwargs`` from the tag-simplification / tagset arguments.
# all other corpora are assumed to support simplify_tags kwarg
if simplify_wsj_tag and args.simplify_tags and args.corpus not in ('conll2000', 'switchboard', 'pl196x'):
	kwargs['simplify_tags'] = True
# these corpora do not support simplify_tags, and have no known workaround
elif simplify_wsj_tag and args.simplify_tags and args.corpus == 'pl196x':
	raise ValueError('%s does not support simplify_tags' % args.corpus)
# a tagset is only applied when simplify_wsj_tag is falsy
elif not simplify_wsj_tag and args.tagset:
	kwargs['tagset'] = args.tagset
	
	if args.trace:
		print('using %s tagset' % args.tagset)
Ejemplo n.º 6
0
                          default='tokenizers/punkt/english.pickle',
                          help='Path to pickled sentence tokenizer')
corpus_group.add_argument(
    '--word-tokenizer',
    default='nltk.tokenize.WordPunctTokenizer',
    help='Full module path to a tokenizer class, defaults to %(default)s.')

args = parser.parse_args()

###################
## corpus reader ##
###################

# The tokenizer arguments are handed to load_corpus_reader exactly as parsed.
source_corpus = load_corpus_reader(args.source_corpus,
                                   reader=args.reader,
                                   fileids=args.fileids,
                                   encoding='utf-8',
                                   sent_tokenizer=args.sent_tokenizer,
                                   word_tokenizer=args.word_tokenizer)

if not source_corpus:
    # Interpolate the corpus name so the error says which corpus is unknown.
    raise ValueError('%s is an unknown corpus' % args.source_corpus)

if args.trace:
    # print() with a single argument behaves the same on Python 2 and 3.
    print('loaded %s' % args.source_corpus)

############
## tagger ##
############

# TODO: from analyze_tagger_coverage.py
if args.trace:
Ejemplo n.º 7
0
	reader_args.append(args.cat_pattern)
	reader_kwargs['cat_pattern'] = re.compile(args.cat_pattern)

# Optional reader components: each one is forwarded to the corpus reader
# only when the corresponding argument is set.
if args.word_tokenizer:
	# NOTE(review): import_attr presumably resolves a dotted path to a class,
	# which is instantiated here -- confirm against its definition.
	reader_kwargs['word_tokenizer'] = import_attr(args.word_tokenizer)()

if args.sent_tokenizer:
	reader_kwargs['sent_tokenizer'] = nltk.data.LazyLoader(args.sent_tokenizer)

if args.para_block_reader:
	# Stored as returned by import_attr (it is not called here).
	reader_kwargs['para_block_reader'] = import_attr(args.para_block_reader)

if args.trace:
	# print() with a single argument behaves the same on Python 2 and 3.
	print('loading %s' % args.corpus)

categorized_corpus = load_corpus_reader(args.corpus, args.reader,
	*reader_args, **reader_kwargs)

if not hasattr(categorized_corpus, 'categories'):
	# Interpolate the corpus name so the error says which corpus failed.
	raise ValueError('%s does not have categories for classification'
		% args.corpus)

# Explicitly requested labels take precedence over the corpus categories.
# The truthiness test also covers an unset (None) value, which len() would
# reject with a TypeError.
if args.labels:
	labels = args.labels.split(",")
else:
	labels = categorized_corpus.categories()
nlabels = len(labels)

if args.trace:
	print('%d labels: %s' % (nlabels, labels))

if not nlabels:
	raise ValueError('corpus does not have any categories')
Ejemplo n.º 8
0
	help='Full module path to a corpus reader class, defaults to %(default)s.')
corpus_group.add_argument('--fileids', default=None,
	help='Specify fileids to load from corpus')
corpus_group.add_argument('--sent-tokenizer', default='tokenizers/punkt/english.pickle',
	help='Path to pickled sentence tokenizer')
corpus_group.add_argument('--word-tokenizer', default='nltk.tokenize.WordPunctTokenizer',
	help='Full module path to a tokenizer class, defaults to %(default)s.')

args = parser.parse_args()

###################
## corpus reader ##
###################

# The tokenizer arguments are handed to load_corpus_reader exactly as parsed.
source_corpus = load_corpus_reader(args.source_corpus, reader=args.reader,
	fileids=args.fileids, encoding='utf-8', sent_tokenizer=args.sent_tokenizer,
	word_tokenizer=args.word_tokenizer)

if not source_corpus:
	# Interpolate the corpus name so the error says which corpus is unknown.
	raise ValueError('%s is an unknown corpus' % args.source_corpus)

if args.trace:
	# print() with a single argument behaves the same on Python 2 and 3.
	print('loaded %s' % args.source_corpus)

############
## tagger ##
############

# TODO: from analyze_tagger_coverage.py
if args.trace:
	print('loading tagger %s' % args.tagger)
	help='''The fraction of the corpus to use for training a binary or
	multi-class classifier, the rest will be used for evaulation.
	The default is to use the entire corpus, and to test the classifier
	against the same training data. Any number < 1 will test against
	the remaining fraction.''')

# Parse the command line; every step below is driven by ``args``.
args = parser.parse_args()

###################
## corpus reader ##
###################

if args.trace:
	# print() with a single argument behaves the same on Python 2 and 3.
	print('loading corpus %s' % args.corpus)

corpus = load_corpus_reader(args.corpus)

# Dispatch table keyed by ``args.instances``; the selected nltk_trainer
# function is called with the loaded corpus.
methods = {
	'sents': nltk_trainer.classification.corpus.category_sent_strings,
	'paras': nltk_trainer.classification.corpus.category_para_strings,
	'files': nltk_trainer.classification.corpus.category_file_strings
}

cat_instances = methods[args.instances](corpus)

################
## CSV output ##
################

# Output file name taken from the parsed arguments.
filename = args.filename
Ejemplo n.º 10
0
# Options for the translation step; source/target are restricted to ``langs``.
trans_group = parser.add_argument_group('Language Translation')
trans_group.add_argument('--source', default='english', choices=langs, help='source language')
trans_group.add_argument('--target', default=None, choices=langs, help='target language')
trans_group.add_argument('--retries', default=3, type=int,
	help='Number of babelfish retries before quitting')
trans_group.add_argument('--sleep', default=3, type=int,
	help='Sleep time between retries')

args = parser.parse_args()

###################
## corpus reader ##
###################

source_corpus = load_corpus_reader(args.source_corpus, args.reader)

if not source_corpus:
	# Interpolate the corpus name so the error says which corpus is unknown.
	raise ValueError('%s is an unknown corpus' % args.source_corpus)

if args.trace:
	# print() with a single argument behaves the same on Python 2 and 3.
	print('loaded %s' % args.source_corpus)

########################
## text normalization ##
########################

# TODO: copied from analyze_classifier_coverage, so abstract

# With stopword filtering disabled the stop set is empty.
if args.filter_stopwords == 'no':
	stopset = set()
Ejemplo n.º 11
0
trans_group.add_argument('--retries',
                         default=3,
                         type=int,
                         help='Number of babelfish retries before quitting')
trans_group.add_argument('--sleep',
                         default=3,
                         type=int,
                         help='Sleep time between retries')

args = parser.parse_args()

###################
## corpus reader ##
###################

source_corpus = load_corpus_reader(args.source_corpus, args.reader)

if not source_corpus:
    # Interpolate the corpus name so the error says which corpus is unknown.
    raise ValueError('%s is an unknown corpus' % args.source_corpus)

if args.trace:
    # print() with a single argument behaves the same on Python 2 and 3.
    print('loaded %s' % args.source_corpus)

########################
## text normalization ##
########################

# TODO: copied from analyze_classifier_coverage, so abstract

# With stopword filtering disabled the stop set is empty.
if args.filter_stopwords == 'no':
    stopset = set()
Ejemplo n.º 12
0
# Positional and keyword arguments forwarded to the corpus reader.
reader_args = []
reader_kwargs = {}

if args.word_tokenizer:
	# NOTE(review): import_attr presumably resolves a dotted path to a class,
	# which is instantiated here -- confirm against its definition.
	reader_kwargs['word_tokenizer'] = import_attr(args.word_tokenizer)()

if args.sent_tokenizer:
	reader_kwargs['sent_tokenizer'] = nltk.data.LazyLoader(args.sent_tokenizer)

if args.para_block_reader:
	# Stored as returned by import_attr (it is not called here).
	reader_kwargs['para_block_reader'] = import_attr(args.para_block_reader)

if args.trace:
	# print() with a single argument behaves the same on Python 2 and 3.
	print('loading %s' % args.source_corpus)

input_corpus = load_corpus_reader(args.source_corpus, args.reader,
	*reader_args, **reader_kwargs)

#################
## translation ##
#################

for fileid in input_corpus.fileids():
	# TODO: use ~/nltk_data/corpora as dir prefix?
	path = os.path.join(args.target_corpus, fileid)
	dirname = os.path.dirname(path)
	
	# An empty dirname means the path has no directory part; os.makedirs('')
	# would raise, so there is nothing to create in that case.
	if dirname and not os.path.exists(dirname):
		if args.trace:
			print('making directory %s' % dirname)
		
		os.makedirs(dirname)