ws = scoring.sorted_word_scores(
        scoring.sum_category_word_scores(cat_words, score_fn))

    if args.min_score:
        ws = [(w, s) for (w, s) in ws if s >= args.min_score]

    if args.max_feats:
        ws = ws[:args.max_feats]

    bestwords = set([w for (w, s) in ws])

    if args.value_type == 'bool':
        if args.trace:
            print('using bag of words from known set feature extraction')

        featx = lambda words: bag_of_words_in_set(words, bestwords)
    else:
        if args.trace:
            print('using word counts from known set feature extraction')

        featx = lambda words: word_counts_in_set(words, bestwords)

    if args.trace:
        print('%d words meet min_score and/or max_feats' % len(bestwords))
elif args.value_type == 'bool':
    if args.trace:
        print('using bag of words feature extraction')

    featx = bag_of_words
else:
    if args.trace:
# Example no. 2
# 0
# Choose the feature extractor `featx`.
#
# When a minimum score and/or a maximum feature count is requested, words are
# scored per category and only the selected words are used as features (via
# bag_of_words_in_set). Otherwise every word is used (bag_of_words).
#
# print(...) is written with a single parenthesized argument so the output is
# the same whether the script runs under Python 2 or Python 3.
if args.min_score or args.max_feats:
	if args.trace:
		print('calculating word scores')

	# Normalize each category's words before they are scored.
	cat_words = [(cat, norm_words(words)) for cat, words in corpus.category_words(categorized_corpus)]
	ws = scoring.sorted_word_scores(scoring.sum_category_word_scores(cat_words, score_fn))

	if args.min_score:
		# Drop every word whose score is below the requested minimum.
		ws = [(w, s) for (w, s) in ws if s >= args.min_score]

	if args.max_feats:
		# Keep only the leading max_feats entries.
		# NOTE(review): assumes sorted_word_scores orders best-first - confirm.
		ws = ws[:args.max_feats]

	# Only the words themselves are needed for membership tests.
	bestwords = set(w for (w, _score) in ws)
	featx = lambda words: bag_of_words_in_set(words, bestwords)

	if args.trace:
		print('%d words meet min_score and/or max_feats' % len(bestwords))
else:
	featx = bag_of_words

#####################
## text extraction ##
#####################

if args.multi and args.binary:
	label_instance_function = {
		'sents': corpus.multi_category_sent_words,
		'paras': corpus.multi_category_para_words,
		'files': corpus.multi_category_file_words