コード例 #1
0
def dump_data(path):
    """Dump every problem of the global ``task`` as flat text files in *path*.

    *path* is emptied first: if it is an existing directory it is removed,
    then a fresh directory is created.  For each problem in
    ``task.problems`` the following files are written, each named
    ``<path>/<problem.id>.<ext>``:

    * ``.sent``      -- one cleaned sentence per line
    * ``.doc``       -- ``doc.id`` of the sentence on the same line of ``.sent``
    * ``.par``       -- ``int(sent.paragraph_starter)`` for that sentence
    * ``.query``     -- ``problem.title`` followed by ``problem.narr``
    * ``.gold_sent`` -- every item of every value in ``problem.training``
    * ``.gold_doc``  -- the ``problem.training`` key for the same line of
      ``.gold_sent``

    Sentences with ``order == 0`` go through
    ``sentence_cleaner.clean_aggressive``; all others go through
    ``sentence_cleaner.clean``.  A sentence whose first two characters test
    as lowercase is reported as a bad parse and left out of all three
    per-sentence files.

    Raises whatever ``shutil.rmtree``, ``os.mkdir`` or ``open`` raise when
    the directory cannot be reset or a file cannot be written.
    """
    import shutil

    ## get an empty directory
    # Done without a shell so that paths containing spaces or shell
    # metacharacters are handled literally.
    if os.path.isdir(path):
        if os.path.islink(path):
            # rmtree refuses symlinks; drop only the link, like `rm -rf` did.
            os.unlink(path)
        else:
            shutil.rmtree(path)
    os.mkdir(path)

    import sentence_cleaner

    for problem in task.problems:
        prefix = '%s/%s' % (path, problem.id)

        # The three files are line-aligned, so they are written in lockstep.
        # `with` guarantees they are closed even if cleaning or writing fails.
        with open(prefix + '.sent', 'w') as sent_fh, \
                open(prefix + '.doc', 'w') as doc_fh, \
                open(prefix + '.par', 'w') as par_fh:
            for doc in problem.new_docs:
                for sent in doc.sentences:

                    ## cleaning
                    if sent.original[0:2].islower():
                        # Single-argument form prints the same text under
                        # the Python 2 statement and the Python 3 function.
                        print('bad parse: %s' % (sent.original,))
                        continue
                    if sent.order == 0:
                        cleaned = sentence_cleaner.clean_aggressive(sent.original)
                    else:
                        cleaned = sentence_cleaner.clean(sent.original)

                    sent_fh.write('%s\n' % cleaned)
                    doc_fh.write('%s\n' % (doc.id))
                    par_fh.write('%d\n' % int(sent.paragraph_starter))

        with open(prefix + '.query', 'w') as query_fh:
            query_fh.write('%s\n' % problem.title)
            query_fh.write('%s\n' % problem.narr)

        with open(prefix + '.gold_sent', 'w') as gold_fh, \
                open(prefix + '.gold_doc', 'w') as gold_doc_fh:
            for ann, sents in problem.training.items():
                for sent in sents:
                    gold_fh.write('%s\n' % sent)
                    gold_doc_fh.write('%s\n' % ann)
コード例 #2
0
ファイル: main.py プロジェクト: DrDub/icsisumm
def dump_data(path):
    """Write the global ``task``'s problems to a freshly emptied *path*.

    An existing directory at *path* is removed and a new one created.
    Then, for each problem in ``task.problems``, six files named
    ``<path>/<problem.id>.<ext>`` are produced:

    * ``.sent`` / ``.doc`` / ``.par`` -- line-aligned: the cleaned sentence,
      the ``doc.id`` it came from, and ``int(sent.paragraph_starter)``
    * ``.query`` -- ``problem.title`` and ``problem.narr``, one per line
    * ``.gold_sent`` / ``.gold_doc`` -- line-aligned: each item of each
      value in ``problem.training`` and the key it was stored under

    A sentence is cleaned with ``sentence_cleaner.clean_aggressive`` when
    its ``order`` is 0 and with ``sentence_cleaner.clean`` otherwise.
    Sentences whose first two characters test as lowercase are printed as
    a bad parse and skipped.

    Errors from ``shutil.rmtree``, ``os.mkdir`` and ``open`` propagate to
    the caller.
    """
    import shutil

    ## get an empty directory
    # No shell is involved, so unusual characters in `path` are safe.
    if os.path.isdir(path):
        if os.path.islink(path):
            # A symlink to a directory: remove the link only (rmtree would
            # raise on it), matching what `rm -rf` did.
            os.unlink(path)
        else:
            shutil.rmtree(path)
    os.mkdir(path)

    import sentence_cleaner

    for problem in task.problems:
        base = '%s/%s' % (path, problem.id)

        # Context managers close every handle even when a write or a
        # cleaner call raises part-way through.
        with open(base + '.sent', 'w') as sent_fh, \
                open(base + '.doc', 'w') as doc_fh, \
                open(base + '.par', 'w') as par_fh:
            for doc in problem.new_docs:
                for sent in doc.sentences:
                    text = sent.original

                    ## cleaning
                    if text[0:2].islower():
                        # Formatted into one string so the output is the
                        # same under Python 2 and Python 3.
                        print('bad parse: %s' % (text,))
                        continue
                    if sent.order == 0:
                        cleaned = sentence_cleaner.clean_aggressive(text)
                    else:
                        cleaned = sentence_cleaner.clean(text)

                    sent_fh.write('%s\n' % cleaned)
                    doc_fh.write('%s\n' % (doc.id))
                    par_fh.write('%d\n' % int(sent.paragraph_starter))

        with open(base + '.query', 'w') as query_fh:
            query_fh.write('%s\n' % problem.title)
            query_fh.write('%s\n' % problem.narr)

        with open(base + '.gold_sent', 'w') as gold_fh, \
                open(base + '.gold_doc', 'w') as gold_doc_fh:
            for ann, sents in problem.training.items():
                for gold_sent in sents:
                    gold_fh.write('%s\n' % gold_sent)
                    gold_doc_fh.write('%s\n' % ann)
コード例 #3
0
def test():
    """Print a before/after report of sentence cleaning over the global ``task``.

    Every sentence of every document in ``task.problems`` is cleaned:
    sentences with ``order == 0`` use ``sentence_cleaner.clean_aggressive``,
    all others use ``sentence_cleaner.clean``.  Sentences starting with a
    lowercase character are flagged with ``**``.  When cleaning changed the
    text, the original and cleaned versions are printed followed by a
    ``----------`` separator; ``+++`` is printed after each document.

    The final line reports whitespace-separated word counts:

    * ``total`` -- words in the original sentences
    * ``reg``   -- words after ``clean`` on every sentence
    * ``agg``   -- words after the cleaner actually applied to each sentence
    """
    import sentence_cleaner
    total, reg, agg = 0, 0, 0
    for problem in task.problems:
        for doc in problem.new_docs:
            for sent in doc.sentences:
                original = sent.original

                # Slice instead of indexing so an empty sentence is simply
                # "not lowercase" rather than an IndexError.
                if original[:1].islower():
                    print('** %s' % (original,))

                if sent.order == 0:
                    cleaned = sentence_cleaner.clean_aggressive(original)
                    agg_words = len(cleaned.split())
                    reg_words = len(sentence_cleaner.clean(original).split())
                else:
                    cleaned = sentence_cleaner.clean(original)
                    # Same cleaner for both tallies here; count once.
                    agg_words = reg_words = len(cleaned.split())

                agg += agg_words
                reg += reg_words
                total += len(original.split())

                if original == cleaned:
                    continue
                # Single-argument prints behave the same on Python 2 and 3.
                print(original)
                print(cleaned)
                print('----------')
            print('+++')
    print('total [%d] reg [%d] agg [%d]' % (total, reg, agg))
コード例 #4
0
ファイル: main.py プロジェクト: DrDub/icsisumm
def test():
    """Report what the sentence cleaners change across the global ``task``.

    Walks ``task.problems`` -> ``new_docs`` -> ``sentences``.  A sentence
    with ``order == 0`` is cleaned by ``sentence_cleaner.clean_aggressive``
    and any other sentence by ``sentence_cleaner.clean``.  Output:

    * ``** <sentence>`` for a sentence whose first character is lowercase
    * the original text, the cleaned text and ``----------`` whenever the
      two differ
    * ``+++`` after each document
    * a closing ``total [..] reg [..] agg [..]`` line of word counts
      (split on whitespace): the originals, the result of ``clean`` on
      every sentence, and the result of the cleaner actually selected
    """
    import sentence_cleaner
    total, reg, agg = 0, 0, 0
    for problem in task.problems:
        for doc in problem.new_docs:
            for sent in doc.sentences:
                raw = sent.original

                # `raw[:1]` is '' for an empty sentence, and ''.islower()
                # is False, so empty input no longer raises IndexError.
                if raw[:1].islower():
                    print('** %s' % (raw,))

                if sent.order == 0:
                    cleaned = sentence_cleaner.clean_aggressive(raw)
                    n_agg = len(cleaned.split())
                    n_reg = len(sentence_cleaner.clean(raw).split())
                else:
                    cleaned = sentence_cleaner.clean(raw)
                    # Regular cleaning is the only cleaning applied, so
                    # both tallies share a single count.
                    n_agg = n_reg = len(cleaned.split())

                agg += n_agg
                reg += n_reg
                total += len(raw.split())

                if raw != cleaned:
                    # One value per print call keeps the output identical
                    # under the Python 2 statement and Python 3 function.
                    print(raw)
                    print(cleaned)
                    print('----------')
            print('+++')
    print('total [%d] reg [%d] agg [%d]' % (total, reg, agg))