# Translate every file of the input corpus, sentence by sentence, and write the
# translated text to a file with the same fileid under args.target_corpus.
# Sentences are separated by a single space; paragraphs by a blank line.
for fileid in input_corpus.fileids():
    # TODO: use ~/nltk_data/corpora as dir prefix?
    path = os.path.join(args.target_corpus, fileid)
    dirname = os.path.dirname(path)

    # dirname is '' when path has no directory component; os.makedirs('')
    # raises, so only try to create a directory when there is one to create.
    if dirname and not os.path.exists(dirname):
        if args.trace:
            print('making directory %s' % dirname)

        try:
            os.makedirs(dirname)
        except OSError:
            # The directory may have been created by someone else between the
            # existence check and the call; any other failure is a real error.
            if not os.path.isdir(dirname):
                raise

    # NOTE(review): the file is opened in plain text mode; if translate()
    # returns non-ASCII unicode under Python 2, write() may fail to encode
    # it -- confirm what translate() returns.
    with open(path, 'w') as outf:
        if args.trace:
            print('translating file %s to %s' % (fileid, path))

        for para in input_corpus.paras(fileids=[fileid]):
            for sent in para:
                # TODO: use intelligent joining (with punctuation)
                text = join_words(sent)

                # Nothing to translate for an empty sentence.
                if not text:
                    continue

                # NOTE(review): the third positional argument is args.corpus,
                # while the output location uses args.target_corpus -- confirm
                # args.corpus really holds the value translate() expects here.
                trans = translate(text, args.source, args.corpus,
                                  trace=args.trace, sleep=args.sleep,
                                  retries=args.retries)

                # Skip sentences for which no translation came back.
                if not trans:
                    continue

                if args.trace > 1:
                    # Formatted into one string so the call is valid both as a
                    # print statement and as a print function.
                    print('%s -->> %s' % (text, trans))

                outf.write(trans + ' ')

            # Blank line between paragraphs.
            outf.write('\n\n')
# One append-mode output file per classifier label, keyed by that label.
labels = classifier.labels()
label_files = dict(
    (label, open(label_filename(label), 'a')) for label in labels
)

# TODO: create a nltk.corpus.writer framework with some initial CorpusWriter classes

# Choose the feature extractor: when a target language is given, the words are
# joined, translated and re-tokenized before features are built; otherwise
# features are built from the words directly.
if args.target:
    if args.trace:
        print('translating all text from %s to %s' % (args.source, args.target))

    def featx(words):
        """Return bag-of-words features for the translation of ``words``."""
        translated = translate(join_words(words), args.source, args.target,
                               trace=args.trace, sleep=args.sleep,
                               retries=args.retries)
        return bag_of_words(norm_words(wordpunct_tokenize(translated)))
else:
    def featx(words):
        """Return bag-of-words features for the normalized ``words``."""
        return bag_of_words(norm_words(words))


def classify_write(words):
    """Classify ``words`` and append them to the file of the winning label.

    The text (the joined words followed by a blank line) is written only when
    the probability of the most likely label is at least ``args.threshold``;
    otherwise nothing is written.
    """
    distribution = classifier.prob_classify(featx(words))
    best_label = distribution.max()

    if distribution.prob(best_label) >= args.threshold:
        outfile = label_files[best_label]
        outfile.write(join_words(words) + u'\n\n')
if args.trace: print 'filename for category %s: %s' % (label, path) return path labels = classifier.labels() label_files = dict([(l, open(label_filename(l), 'a')) for l in labels]) # TODO: create a nltk.corpus.writer framework with some initial CorpusWriter classes if args.target: if args.trace: print 'translating all text from %s to %s' % (args.source, args.target) featx = lambda words: bag_of_words(norm_words(wordpunct_tokenize(translate(join_words(words), args.source, args.target, trace=args.trace, sleep=args.sleep, retries=args.retries)))) else: featx = lambda words: bag_of_words(norm_words(words)) def classify_write(words): feats = featx(words) probs = classifier.prob_classify(feats) label = probs.max() if probs.prob(label) >= args.threshold: label_files[label].write(join_words(words) + u'\n\n') if args.trace: print 'classifying %s' % args.instances if args.instances == 'paras':