for fileid in input_corpus.fileids():
	# TODO: use ~/nltk_data/corpora as dir prefix?
	path = os.path.join(args.target_corpus, fileid)
	dirname = os.path.dirname(path)
	
	if not os.path.exists(dirname):
		if args.trace:
			print 'making directory %s' % dirname
		
		os.makedirs(dirname)
	
	with open(path, 'w') as outf:
		if args.trace:
			print 'translating file %s to %s' % (fileid, path)
		
		for para in input_corpus.paras(fileids=[fileid]):
			for sent in para:
				# TODO: use intelligent joining (with punctuation)
				text = join_words(sent)
				if not text: continue
				trans = translate(text, args.source, args.corpus, trace=args.trace,
					sleep=args.sleep, retries=args.retries)
				if not trans: continue
				
				if args.trace > 1:
					print text, '-->>', trans
				
				outf.write(trans + ' ')
			
			outf.write('\n\n')
Exemple #2
0
labels = classifier.labels()
label_files = dict([(l, open(label_filename(l), 'a')) for l in labels])

# TODO: create a nltk.corpus.writer framework with some initial CorpusWriter classes

if args.target:
    if args.trace:
        print 'translating all text from %s to %s' % (args.source, args.target)

    featx = lambda words: bag_of_words(
        norm_words(
            wordpunct_tokenize(
                translate(join_words(words),
                          args.source,
                          args.target,
                          trace=args.trace,
                          sleep=args.sleep,
                          retries=args.retries))))
else:
    featx = lambda words: bag_of_words(norm_words(words))


def classify_write(words):
    """Append ``words`` to the file of their most probable label.

    Features are produced by ``featx`` and scored with
    ``classifier.prob_classify``. The text is written (followed by a blank
    line) only when the probability of the top label is at least
    ``args.threshold``; otherwise nothing is written.
    """
    dist = classifier.prob_classify(featx(words))
    best = dist.max()
    confident = dist.prob(best) >= args.threshold

    if confident:
        out = label_files[best]
        out.write(join_words(words) + u'\n\n')

Exemple #3
0
	
	if args.trace:
		print 'filename for category %s: %s' % (label, path)
	
	return path

labels = classifier.labels()
label_files = dict([(l, open(label_filename(l), 'a')) for l in labels])

# TODO: create a nltk.corpus.writer framework with some initial CorpusWriter classes

if args.target:
	if args.trace:
		print 'translating all text from %s to %s' % (args.source, args.target)
	
	featx = lambda words: bag_of_words(norm_words(wordpunct_tokenize(translate(join_words(words),
		args.source, args.target, trace=args.trace, sleep=args.sleep, retries=args.retries))))
else:
	featx = lambda words: bag_of_words(norm_words(words))

def classify_write(words):
	"""Append ``words`` to the file of their most probable label.

	Features are produced by ``featx`` and scored with
	``classifier.prob_classify``. The text is written (followed by a blank
	line) only when the probability of the top label is at least
	``args.threshold``; otherwise nothing is written.
	"""
	dist = classifier.prob_classify(featx(words))
	best = dist.max()
	confident = dist.prob(best) >= args.threshold
	
	if confident:
		out = label_files[best]
		out.write(join_words(words) + u'\n\n')

# When tracing is enabled, report the value of args.instances that is about
# to be classified.
if args.trace:
	print 'classifying %s' % args.instances

if args.instances == 'paras':
Exemple #4
0
for fileid in input_corpus.fileids():
	# TODO: use ~/nltk_data/corpora as dir prefix?
	path = os.path.join(args.target_corpus, fileid)
	dirname = os.path.dirname(path)
	
	if not os.path.exists(dirname):
		if args.trace:
			print 'making directory %s' % dirname
		
		os.makedirs(dirname)
	
	with open(path, 'w') as outf:
		if args.trace:
			print 'translating file %s to %s' % (fileid, path)
		
		for para in input_corpus.paras(fileids=[fileid]):
			for sent in para:
				# TODO: use intelligent joining (with punctuation)
				text = join_words(sent)
				if not text: continue
				trans = translate(text, args.source, args.corpus, trace=args.trace,
					sleep=args.sleep, retries=args.retries)
				if not trans: continue
				
				if args.trace > 1:
					print text, '-->>', trans
				
				outf.write(trans + ' ')
			
			outf.write('\n\n')