Example #1
	lif = label_instance_function[args.instances]
	feats = []
	test_feats = []
	
	for label in labels:
		texts = lif(categorized_corpus, label)
		
		if args.instances == 'files':
			# don't call list(texts) here, since there may be a huge number of files
			stop = int(len(categorized_corpus.fileids())*args.fraction)
		else:
			texts = list(texts)
			stop = int(len(texts)*args.fraction)
		
		for t in itertools.islice(texts, stop):
			feat = bag_of_words(norm_words(t))
			feats.append(feat)
			test_feats.append((feat, label))
	
	print('accuracy:', accuracy(classifier, test_feats))
	refsets, testsets = scoring.ref_test_sets(classifier, test_feats)
	
	for label in labels:
		ref = refsets[label]
		test = testsets[label]
		print('%s precision: %f' % (label, precision(ref, test) or 0))
		print('%s recall: %f' % (label, recall(ref, test) or 0))
		print('%s f-measure: %f' % (label, f_measure(ref, test) or 0))
else:
	if args.instances == 'sents':
		texts = categorized_corpus.sents()
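
Example #1 depends on a scoring.ref_test_sets helper that is not shown in any snippet. Below is a minimal sketch of what such a helper typically looks like, following the standard nltk.metrics evaluation pattern; the defaultdict-of-sets layout and index-based membership are assumptions about nltk-trainer's scoring module, not its verified source:

import collections

def ref_test_sets(classifier, test_feats):
	# refsets: for each gold label, the indices of the instances that carry it;
	# testsets: for each predicted label, the indices the classifier assigned to it.
	refsets = collections.defaultdict(set)
	testsets = collections.defaultdict(set)

	for i, (feat, label) in enumerate(test_feats):
		refsets[label].add(i)
		observed = classifier.classify(feat)
		testsets[observed].add(i)

	return refsets, testsets

These per-label sets are exactly the inputs that nltk.metrics precision, recall, and f_measure expect, as used in the loop above.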
Example #2
	label_instance_function = {
		'sents': corpus.category_sent_words,
		'paras': corpus.category_para_words,
		'files': corpus.category_file_words
	}
	
	lif = label_instance_function[args.instances]
	feats = []
	test_feats = []
	
	for label in labels:
		texts = list(lif(categorized_corpus, label))
		stop = int(len(texts)*args.fraction)
		
		for t in texts[:stop]:
			feat = bag_of_words(norm_words(t))
			feats.append(feat)
			test_feats.append((feat, label))
	
	print('accuracy:', accuracy(classifier, test_feats))
	refsets, testsets = scoring.ref_test_sets(classifier, test_feats)
	
	for label in labels:
		ref = refsets[label]
		test = testsets[label]
		print('%s precision: %f' % (label, precision(ref, test) or 0))
		print('%s recall: %f' % (label, recall(ref, test) or 0))
		print('%s f-measure: %f' % (label, f_measure(ref, test) or 0))
else:
	instance_function = {
		'sents': categorized_corpus.sents,
Example #3
def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)
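
Example #3 is the only fully self-contained function in this set. Here is a quick usage sketch; the tiny bag_of_words stand-in and the sample token list are illustrative assumptions (in the original scripts, bag_of_words is imported from the same featx module):

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def bag_of_words(words):
    # stand-in for the helper these snippets assume: mark every token as present
    return dict((word, True) for word in words)

words = ['the', 'quick', 'brown', 'fox', 'and', 'the', 'lazy', 'dog']
feats = bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=10)
# feats maps single tokens and the top-scoring bigram tuples to True,
# e.g. ('lazy', 'dog') appears as a key alongside 'lazy' and 'dog'.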
Example #4
	
	if args.trace:
		print('filename for category %s: %s' % (label, path))
	
	return path

labels = classifier.labels()
label_files = dict([(l, open(label_filename(l), 'a')) for l in labels])

# TODO: create a nltk.corpus.writer framework with some initial CorpusWriter classes

if args.target:
	if args.trace:
		print('translating all text from %s to %s' % (args.source, args.target))
	
	featx = lambda words: bag_of_words(norm_words(wordpunct_tokenize(translate(join_words(words),
		args.source, args.target, trace=args.trace, sleep=args.sleep, retries=args.retries))))
else:
	featx = lambda words: bag_of_words(norm_words(words))

def classify_write(words):
	feats = featx(words)
	probs = classifier.prob_classify(feats)
	label = probs.max()
	
	if probs.prob(label) >= args.threshold:
		label_files[label].write(join_words(words) + u'\n\n')

if args.trace:
	print('classifying %s' % args.instances)

if args.instances == 'paras':
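
Example #4 breaks off at the dispatch on args.instances. The end of Example #5 below shows how the paragraph branch continues; here is a plausible completion, with the sents branch added by analogy as an assumption:

import itertools

if args.instances == 'paras':
	for para in source_corpus.paras():
		# a paragraph is a list of sentences; flatten it into one word list
		classify_write(list(itertools.chain(*para)))
elif args.instances == 'sents':
	for sent in source_corpus.sents():
		classify_write(sent)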
Example #5
	path = os.path.join(args.target_corpus, '%s.txt' % label)
	
	if not os.path.exists(args.target_corpus):
		os.makedirs(args.target_corpus)
	
	if args.trace:
		print('filename for category %s: %s' % (label, path))
	
	return path

labels = classifier.labels()
label_files = dict([(l, open(label_filename(l), 'a')) for l in labels])

# TODO: create a nltk.corpus.writer framework with some initial CorpusWriter classes

featx = lambda words: bag_of_words(norm_words(words))

def classify_write(words):
	feats = featx(words)
	probs = classifier.prob_classify(feats)
	label = probs.max()
	
	if probs.prob(label) >= args.threshold:
		label_files[label].write(join_words(words) + u'\n\n')

if args.trace:
	print('classifying %s' % args.instances)

if args.instances == 'paras':
	for para in source_corpus.paras():
		classify_write(list(itertools.chain(*para)))
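
One caveat if you adapt these snippets: the per-label files opened with open(label_filename(l), 'a') are never closed. A small hardening sketch using contextlib.ExitStack; this is a suggested pattern, not part of the original scripts:

import contextlib
import itertools

with contextlib.ExitStack() as stack:
	label_files = {
		l: stack.enter_context(open(label_filename(l), 'a'))
		for l in labels
	}

	for para in source_corpus.paras():
		classify_write(list(itertools.chain(*para)))
# every label file is flushed and closed here, even if classification fails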