def dump_data(path):
    """Dump every problem in ``task.problems`` to text files under ``path``.

    ``path`` is emptied first: if it already exists as a directory it is
    removed, and it is then created afresh.  For each problem the following
    files are written, all named ``<path>/<problem.id>.<ext>``:

    * ``.sent``      -- one cleaned sentence per line
    * ``.doc``       -- ``doc.id`` of the document each ``.sent`` line came from
    * ``.par``       -- ``int(sent.paragraph_starter)`` for each ``.sent`` line
    * ``.query``     -- ``problem.title`` then ``problem.narr``
    * ``.gold_sent`` -- every sentence found in ``problem.training``
    * ``.gold_doc``  -- the ``problem.training`` key for each gold sentence

    Sentences whose first two characters are lower-case (per
    ``str.islower``) are reported as bad parses and skipped, so the three
    sentence-level files always stay line-aligned.

    Raises OSError if the directory cannot be removed or created.
    """
    # NOTE(review): dump_data is defined twice in this file; the later
    # definition is the one that wins at import time.
    import shutil
    import sentence_cleaner

    ## get an empty directory
    # Done without a shell so that spaces or shell metacharacters in `path`
    # cannot change what gets deleted.
    if os.path.isdir(path):
        if os.path.islink(path):
            # `rm -rf` on a symlink removes only the link; rmtree would refuse.
            os.unlink(path)
        else:
            shutil.rmtree(path)
    os.mkdir(path)

    for problem in task.problems:
        base = '%s/%s' % (path, problem.id)

        # `with` guarantees the handles are closed even if a cleaner raises.
        with open(base + '.sent', 'w') as sent_fh, \
                open(base + '.doc', 'w') as doc_fh, \
                open(base + '.par', 'w') as par_fh:
            for doc in problem.new_docs:
                for sent in doc.sentences:
                    ## cleaning
                    if sent.original[0:2].islower():
                        # Single-argument form prints identically on
                        # Python 2 and Python 3.
                        print('bad parse: %s' % sent.original)
                        continue
                    # Only the first sentence of a document gets the
                    # aggressive cleaner.
                    if sent.order == 0:
                        cleaned = sentence_cleaner.clean_aggressive(sent.original)
                    else:
                        cleaned = sentence_cleaner.clean(sent.original)
                    sent_fh.write('%s\n' % cleaned)
                    doc_fh.write('%s\n' % (doc.id))
                    par_fh.write('%d\n' % int(sent.paragraph_starter))

        with open(base + '.query', 'w') as query_fh:
            query_fh.write('%s\n' % problem.title)
            query_fh.write('%s\n' % problem.narr)

        with open(base + '.gold_sent', 'w') as gold_fh, \
                open(base + '.gold_doc', 'w') as gold_doc_fh:
            for ann, sents in problem.training.items():
                for sent in sents:
                    gold_fh.write('%s\n' % sent)
                    gold_doc_fh.write('%s\n' % ann)
def dump_data(path):
    """Write the contents of ``task.problems`` as flat text files in ``path``.

    Any existing directory at ``path`` is deleted and an empty one is
    created in its place.  Each problem then produces six files named
    ``<path>/<problem.id>`` plus one of these extensions:

    ``.sent``, ``.doc``, ``.par``
        Three line-aligned files: the cleaned sentence, the ``doc.id`` it
        belongs to, and ``int(sent.paragraph_starter)``.
    ``.query``
        ``problem.title`` on the first line, ``problem.narr`` on the second.
    ``.gold_sent``, ``.gold_doc``
        Two line-aligned files built from ``problem.training.items()``:
        each sentence, and the key it was stored under.

    A sentence is skipped (and reported on stdout) when
    ``sent.original[0:2].islower()`` is true.

    Raises OSError if ``path`` cannot be cleared or created.
    """
    # NOTE(review): dump_data is defined twice in this file; the later
    # definition is the one that wins at import time.
    import shutil
    import sentence_cleaner

    ## get an empty directory
    if os.path.isdir(path):
        # Avoid `os.system('rm -rf ...')`: interpolating `path` into a shell
        # command breaks on spaces and is unsafe with metacharacters.
        if os.path.islink(path):
            os.unlink(path)  # mirror `rm -rf`, which removes just the link
        else:
            shutil.rmtree(path)
    os.mkdir(path)

    for problem in task.problems:
        sent_file = '%s/%s.sent' % (path, problem.id)
        doc_file = '%s/%s.doc' % (path, problem.id)
        par_file = '%s/%s.par' % (path, problem.id)

        # Context managers close all three files even when a write fails.
        with open(sent_file, 'w') as sent_fh, \
                open(doc_file, 'w') as doc_fh, \
                open(par_file, 'w') as par_fh:
            for doc in problem.new_docs:
                for sent in doc.sentences:
                    text = sent.original
                    ## cleaning
                    if text[0:2].islower():
                        # Parenthesised single argument: same output under
                        # Python 2's print statement and Python 3's function.
                        print('bad parse: %s' % text)
                        continue
                    # The document's first sentence (order 0) is cleaned
                    # aggressively; all others use the regular cleaner.
                    if sent.order == 0:
                        clean_fn = sentence_cleaner.clean_aggressive
                    else:
                        clean_fn = sentence_cleaner.clean
                    sent_fh.write('%s\n' % clean_fn(text))
                    doc_fh.write('%s\n' % (doc.id))
                    par_fh.write('%d\n' % int(sent.paragraph_starter))

        query_file = '%s/%s.query' % (path, problem.id)
        with open(query_file, 'w') as query_fh:
            query_fh.write('%s\n' % problem.title)
            query_fh.write('%s\n' % problem.narr)

        gold_file = '%s/%s.gold_sent' % (path, problem.id)
        gold_doc_file = '%s/%s.gold_doc' % (path, problem.id)
        with open(gold_file, 'w') as gold_fh, \
                open(gold_doc_file, 'w') as gold_doc_fh:
            for ann, sents in problem.training.items():
                for gold_sent in sents:
                    gold_fh.write('%s\n' % gold_sent)
                    gold_doc_fh.write('%s\n' % ann)
def test():
    """Print a before/after comparison of sentence cleaning over all problems.

    Walks every sentence of every document in ``task.problems``.  The first
    sentence of a document (``sent.order == 0``) is cleaned with
    ``sentence_cleaner.clean_aggressive``; every other sentence with
    ``sentence_cleaner.clean``.  Each sentence that the cleaner changed is
    printed next to its cleaned form, and sentences starting with a
    lower-case character are flagged with ``**``.

    Three whitespace-token counts are accumulated and printed at the end:
    ``total`` (original text), ``reg`` (regular cleaner everywhere) and
    ``agg`` (aggressive cleaner on first sentences, regular elsewhere).
    """
    # NOTE(review): test is defined twice in this file; the later definition
    # is the one that wins at import time.
    import sentence_cleaner

    total, reg, agg = 0, 0, 0
    for problem in task.problems:
        for doc in problem.new_docs:
            for sent in doc.sentences:
                # Slice instead of index so an empty sentence does not raise
                # IndexError (''.islower() is simply False).
                if sent.original[:1].islower():
                    print('** %s' % sent.original)

                if sent.order == 0:
                    cleaned = sentence_cleaner.clean_aggressive(sent.original)
                    agg += len(cleaned.split())
                    reg += len(sentence_cleaner.clean(sent.original).split())
                else:
                    cleaned = sentence_cleaner.clean(sent.original)
                    n_words = len(cleaned.split())
                    agg += n_words
                    reg += n_words
                total += len(sent.original.split())

                # Only show sentences the cleaner actually modified.
                if sent.original == cleaned:
                    continue
                print(sent.original)
                print(cleaned)
                print('----------')
                # NOTE(review): the trailing debug prints (`print sent` for
                # first sentences and a '+++' marker) appear to be commented
                # out in the original and are left out here -- confirm.

    print('total [%d] reg [%d] agg [%d]' % (total, reg, agg))
def test():
    """Report how much the sentence cleaners shorten the task's sentences.

    For each sentence in ``task.problems[*].new_docs[*].sentences`` the
    function picks a cleaner by position -- ``clean_aggressive`` when
    ``sent.order == 0``, ``clean`` otherwise -- and prints the original and
    cleaned text whenever they differ.  Sentences whose first character is
    lower-case are additionally printed with a ``**`` prefix.

    A final summary line prints three word counts (split on whitespace):
    the original text (``total``), the output of ``clean`` for every
    sentence (``reg``), and the output of the position-dependent cleaner
    (``agg``).
    """
    # NOTE(review): test is defined twice in this file; the later definition
    # is the one that wins at import time.
    import sentence_cleaner

    total, reg, agg = 0, 0, 0
    for problem in task.problems:
        for doc in problem.new_docs:
            for sent in doc.sentences:
                original = sent.original

                # `original[0]` would raise IndexError on an empty sentence;
                # a one-character slice is safe and otherwise equivalent.
                if original[:1].islower():
                    print('** %s' % original)

                if sent.order == 0:
                    cleaned = sentence_cleaner.clean_aggressive(original)
                    regular_len = len(sentence_cleaner.clean(original).split())
                else:
                    cleaned = sentence_cleaner.clean(original)
                    regular_len = len(cleaned.split())
                agg += len(cleaned.split())
                reg += regular_len
                total += len(original.split())

                # Unchanged sentences are counted but not displayed.
                if original != cleaned:
                    print(original)
                    print(cleaned)
                    print('----------')
                    # NOTE(review): the original ends this loop body with what
                    # look like commented-out debug prints (`print sent`,
                    # `print '+++'`); they are not reproduced -- confirm.

    print('total [%d] reg [%d] agg [%d]' % (total, reg, agg))