Example #1
def getRouge_Tac(refs, model):
    # Return the ROUGE scores given the reference summaries and the model summary.
    
    #write the files
    fio.SaveList(model, tmpdir+'model.txt', '\n')
    
    for i, ref in enumerate(refs):
        fio.SaveList(ref, tmpdir+'ref%d.txt'%(i+1), '\n')
    
    retcode = subprocess.call('./get_rouge_tac', shell=True)
    if retcode != 0:
        print("Failed!")
        exit(-1)
    else:
        print "Passed!"
    
    row = []
    for scorename in RougeNames:
        filename = tmpdir + "OUT_"+scorename+".csv"
        lines = fio.ReadFile(filename)
        try:
            scorevalues = lines[1].split(',')
            # Columns 1-3 hold recall, precision, and F-measure.
            for value in scorevalues[1:4]:
                row.append(value.strip())
        except Exception:
            print(filename, scorename, lines)
            
    return row
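A minimal usage sketch (the inputs are made up; it assumes tmpdir, RougeNames, the fio helper module, and the get_rouge_tac script exist as in the surrounding module):

refs = [["the cat sat on the mat"], ["a cat was sitting on the mat"]]
model = ["the cat is on the mat"]
row = getRouge_Tac(refs, model)
print(row)  # one recall/precision/F triple per entry in RougeNames, as strings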
Example #2
def extractPhrase(excelfile, folder, sennadatadir, method):
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:
            #for type in ['POI', 'MP']:
            print(excelfile, sheet, type)
            student_summaryList = CourseMirror_Survey.getStudentResponseList(
                excelfile, course, week, type, withSource=False)
            if len(student_summaryList) == 0: continue

            path = folder + str(week) + '/'
            fio.NewPath(path)
            filename = path + type + '.' + method + '.key'

            sennafile = sennadatadir + "senna." + str(
                week) + "." + type + '.output'
            if not fio.IsExist(sennafile): continue

            phrases = getKeyPhrases(student_summaryList,
                                    sennafile,
                                    method=method,
                                    MalformedFlilter=True)

            fio.SaveList(phrases, filename)
Example #3
def extractPhraseFromCRFWithColor(phrasedir, systemdir):
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()

    lectures = annotation.Lectures
    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            extracted_phrases = []
            extracted_colors = []

            crf_file = os.path.join(systemdir, 'extraction', 'all_output',
                                    'test_%i_%s.out' % (i, prompt))
            for tokens, tags, color0, color1 in crf_reader.read_file_generator_index(
                    crf_file, [0, -1, -4, -3]):
                phrases, phrase_colors = aligner.get_phrase_with_colors(
                    tokens, tags, [color0, color1])

                for phrase, phrase_color in zip(phrases, phrase_colors):

                    extracted_phrases.append(phrase.lower())
                    extracted_colors.append(phrase_color)

            fio.SaveList(extracted_phrases, filename)

            filename = path + prompt + '.' + method + '.key.color'
            fio.SaveDict2Json(extracted_colors, filename)
Example #4
def get_phrase_reference_summary_phrase(outputs=None):
    # Guard against the default: iterating None would raise a TypeError.
    if outputs is None:
        return

    for output in outputs:
        fio.NewPath(output)
        
        counts = []
        for doc, lec, annotator in annotation.generate_all_files(annotation.datadir + 'json/', '.json', anotators = annotation.anotators, lectures=annotation.Lectures):
            print(doc)
            
            task = annotation.Task()
            task.loadjson(doc)
            
            sub_tasks = task.get_tasks()
            
            for sub_task in sub_tasks:
                if sub_task["task_name"] == "Phrase":
                    if sub_task['prompt'] == 0: #POI
                        type = 'q1'
                    else: 
                        type = 'q2'
                    
                    summary_filename = os.path.join(output, str(lec), type+'.ref.' + str(annotation.anotator_dict[annotator])) 
                    #summary_filename = os.path.join(output, str(lec), type+'.ref.summary') 
                    
                    print(summary_filename)
                    
                    summaries = [row[1] for row in sub_task["summary"][1:]]
                    colors = [row[0].strip()[1] for row in sub_task["summary"][1:]]
                    student_numbers = [row[2].strip() for row in sub_task["summary"][1:]]
                    
                    count = 0
                    for summary in summaries:
                        count += len(NLTKWrapper.wordtokenizer(summary))
                    
                    counts.append(count)
                    fio.SaveList(summaries, summary_filename)
                    
                    color_filename = os.path.join(output, str(lec), '%s.ref.%s.color'%(type, str(annotation.anotator_dict[annotator])))
                    fio.SaveList(colors, color_filename)
                    
                    no_filename = os.path.join(output, str(lec), '%s.ref.%s.no'%(type, str(annotation.anotator_dict[annotator])))
                    fio.SaveList(student_numbers, no_filename)
        
        print(counts)
        print(numpy.mean(counts))
        print(numpy.median(counts))
Example #5
def getRougeTmp(ref, model):
    # Return the ROUGE scores given the reference summary and the model summary.
    #create a temp file
    temp_path = mkdtemp()
    print(temp_path)
    
    #write the files
    fio.SaveList(ref, os.path.join(temp_path, 'ref.txt'), '\n')
    fio.SaveList(model, os.path.join(temp_path, 'model.txt'), '\n')
    
    retcode = subprocess.call('./get_rouge_tmp %s' % temp_path, shell=True)
    if retcode != 0:
        print("Failed!")
        exit(-1)
    else:
        print "Passed!"
    
    row = []
    for scorename in RougeNames:
        filename = os.path.join(temp_path, "OUT_"+scorename+".csv")
        
        if not fio.IsExist(filename):
            print(filename, "does not exist")
            row = row + [0, 0, 0]
            
            continue
        
        lines = fio.ReadFile(filename)
        try:
            scorevalues = lines[1].split(',')
            # Columns 1-3 hold recall, precision, and F-measure.
            for value in scorevalues[1:4]:
                row.append(value.strip())
        except Exception:
            print(filename, scorename, lines)

    # Delete the temp folder only after every score file has been read;
    # deleting it inside the loop removed files that were still to be parsed.
    fio.DeleteFolder(temp_path)
    return row
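The parsing above assumes each OUT_<scorename>.csv file has a header row followed by one data row whose columns 1-3 hold recall, precision, and F-measure; that layout is inferred from the slicing, not documented. A small stand-alone reader under the same assumption:

import csv

def read_rouge_csv(filename):
    # Return [recall, precision, f_measure] from the second row of the file.
    with open(filename) as f:
        rows = list(csv.reader(f))
    return [value.strip() for value in rows[1][1:4]]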
Example #6
def getStudentResponses4Senna(excelfile, cid, maxWeek, datadir):
    sheets = range(1, maxWeek + 1)

    for sheet in sheets:
        week = sheet

        for type in ['q1', 'q2', 'q3', 'q4']:
            student_summaryList = getStudentResponseList(
                excelfile, cid, week, type)
            if len(student_summaryList) == 0: continue

            filename = datadir + "senna." + str(week) + "." + type + ".input"

            fio.SaveList(student_summaryList, filename)
Example #7
def getRouge(ref, model):
    # Return the ROUGE scores given the reference summary and the model summary.
    
    #write the files
    fio.SaveList(ref, tmpdir+'ref.txt', '\n')
    fio.SaveList(model, tmpdir+'model.txt', '\n')
    
    retcode = subprocess.call('./get_rouge', shell=True)
    if retcode != 0:
        print("Failed!")
        exit(-1)
    else:
        print "Passed!"
    
    row = []
    for scorename in RougeNames:
        filename = tmpdir + "OUT_"+scorename+".csv"
        
        if not fio.IsExist(filename):
            print(filename, "does not exist")
            row = row + [0, 0, 0]
            
            continue
        
        lines = fio.ReadFile(filename)
        try:
            scorevalues = lines[1].split(',')
            # Columns 1-3 hold recall, precision, and F-measure.
            for value in scorevalues[1:4]:
                row.append(value.strip())
        except Exception:
            print(filename, scorename, lines)
            
    return row
Example #8
def PrepareIE256():
    cid = "IE256"
    maxWeek = 25

    excelfile = "../data/CourseMirror/Reflection.json"
    sennadir = "../../AbstractPhraseSummarization/data/IE256/senna/"

    #fio.NewPath(sennadir)
    #getStudentResponses4Senna(excelfile, cid, maxWeek, sennadir)

    outdirs = [  #'../../AbstractPhraseSummarization/data/IE256/ILP_Baseline_Sentence/',
        #'../../AbstractPhraseSummarization/data/IE256/MC/',
        #'../../AbstractPhraseSummarization/data/IE256/ILP_Sentence_MC/',
        '../../AbstractPhraseSummarization/data/IE256/ILP_Sentence_Supervised_FeatureWeightingAveragePerceptron/',
    ]

    sheets = range(1, maxWeek + 1)

    for outdir in outdirs:
        for sheet in sheets:
            week = sheet

            for type in ['q1', 'q2', 'q3', 'q4']:
                student_summaryList = getStudentResponseList(
                    excelfile, cid, week, type, True)
                if len(student_summaryList) == 0: continue

                path = os.path.join(outdir, str(week))
                fio.NewPath(path)

                source = {}
                responses = []
                count = defaultdict(int)
                for response, student in student_summaryList:
                    responses.append(response)
                    count[response] += 1

                    if response not in source:
                        source[response] = []
                    source[response].append(student)

                output = os.path.join(path, type + ".sentence.key")
                fio.SaveList(set(responses), output)

                output = os.path.join(path, type + '.sentence.keys.source')
                fio.SaveDict2Json(source, output)

                output = os.path.join(path, type + '.sentence.dict')
                fio.SaveDict(count, output)
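A minimal sketch of the response/source bookkeeping above, on made-up (response, student) pairs:

from collections import defaultdict

student_summaryList = [('graphs', 's1'), ('graphs', 's2'), ('trees', 's3')]
source = defaultdict(list)
count = defaultdict(int)
for response, student in student_summaryList:
    count[response] += 1
    source[response].append(student)
# source == {'graphs': ['s1', 's2'], 'trees': ['s3']}
# count  == {'graphs': 2, 'trees': 1}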
Example #9
def extractPhraseFromCRF(phrasedir, systemdir):
    crf_reader = CRF()
    aligner = AlignPhraseAnnotation()

    lectures = annotation.Lectures
    for i, lec in enumerate(lectures):
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            phrases = []

            crf_file = os.path.join(systemdir, 'extraction', 'all_output',
                                    'test_%i_%s.out' % (i, prompt))
            for tokens, tags in crf_reader.read_file_generator(crf_file):
                for phrase in aligner.get_phrase(tokens, tags):
                    phrases.append(phrase.lower())

            fio.SaveList(phrases, filename)
Example #10
def extractPhraseFromAnnotationIntersect(phrasedir, annotators):
    for docs in annotation.generate_all_files_by_annotators(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotators,
            lectures=annotation.Lectures):

        doc0, lec0, annotator0 = docs[0]
        doc1, lec1, annotator1 = docs[1]

        assert (lec0 == lec1)
        lec = lec0

        #load tasks
        task0 = annotation.Task()
        task0.loadjson(doc0)

        task1 = annotation.Task()
        task1.loadjson(doc1)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            print(filename)

            aligner = AlignPhraseAnnotation(task0, task1, prompt)
            aligner.align()
            extracted_phrases = aligner.get_intersect()

            fio.SaveList(extracted_phrases, filename)
Example #11
def getOracleRouge(oracledir, np, L, metric, outputdir):
    #sheets = range(0,1)
    sheets = range(0,12)
    
    body = []
    
    for i, sheet in enumerate(sheets):
        week = i + 1
            
        #Add a cache to make it faster
        Cache = {}
        cachefile = oracledir + str(week) + '/' + 'cache.json'
        print(cachefile)
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)
        
        for type in ['POI', 'MP', 'LP']:
            row = []
            row.append(week)
        
            # read TA's summary
            reffile = oracledir + str(week) + '/' + type + '.ref.summary'
            lines = fio.ReadFile(reffile)
            ref = [line.strip() for line in lines]
            
            Round = 1
            while True:
                sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) +'.summary'
                if not fio.IsExist(sumfile): break
                Round = Round + 1
            
            Round = Round - 1
            sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) +'.summary'
            
            if not fio.IsExist(sumfile):
                lines = []
            else:
                lines = fio.ReadFile(sumfile)
            TmpSum = [line.strip() for line in lines]
            
            newsumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) +'.summary'
            fio.SaveList(TmpSum, newsumfile)
            
            cacheKey = getKey(ref, TmpSum)
            if cacheKey in Cache:
                scores = Cache[cacheKey]
                print "Hit"
            else:
                print "Miss", cacheKey
                print sumfile
                scores = getRouge(ref, TmpSum)
                Cache[cacheKey] = scores
                #exit()
            
            row = row + scores
            
            body.append(row)
            
    header = ['week'] + RougeHeader    
    row = []
    row.append("average")
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)
    
    fio.WriteMatrix(outputdir + "rouge." + str(np) + '.L' + str(L) + "." + str(metric) + ".txt", body, header)
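A quick sketch of the final column-averaging step on a toy body matrix (values are illustrative):

import numpy

body = [[1, '0.40', '0.50'], [2, '0.60', '0.70']]
header = ['week', 'R1-R', 'R1-P']
avg = ['average']
for i in range(1, len(header)):
    avg.append(numpy.mean([float(row[i]) for row in body]))
print(avg)  # ['average', 0.5, 0.6]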
Example #12
def Greedy(oracledir, np, L, metric='R1-F'):
    #sheets = range(0,1)
    sheets = range(0,12)
    RIndex = RougeHeader.index(metric)
    assert(RIndex != -1)
    
    for i, sheet in enumerate(sheets):
        week = i + 1
        
        #Add a cache to make it faster
        Cache = {}
        cachefile = oracledir + str(week) + '/' + 'cache.json'
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)
        
        #for type in ['POI']:
        for type in ['POI', 'MP', 'LP']:
            # read TA's summary
            reffile = oracledir + str(week) + '/' + type + '.ref.summary'
            lines = fio.ReadFile(reffile)
            ref = [line.strip() for line in lines]
            
            #read Phrases
            phrasefile = oracledir + str(week) + '/' + type + '.' + str(np) + '.key'
            lines = fio.ReadFile(phrasefile)
            candidates = [line.strip() for line in lines]
            
            summary = []
            Length = 0
            
            maxSum = []
            maxScore = 0
            Round = 1
            
            Changed = True
            while Changed:
                Changed = False
                for phrase in candidates:
                    WC = len(phrase.split())
                    if Length + WC > L: continue
                    
                    TmpSum = copy.deepcopy(summary)
                    TmpSum.append(phrase)
                    
                    #get Rouge Score
                    cacheKey = getKey(ref, TmpSum)
                    if cacheKey in Cache:
                        scores = Cache[cacheKey]
                        print "Hit"
                    else:
                        scores = getRouge(ref, TmpSum)
                        Cache[cacheKey] = scores
                    
                    s = float(scores[RIndex])
                    if s > maxScore:
                        maxSum = TmpSum
                        maxScore = s  # keep the numeric score, not the whole score row
                        Changed = True
                
                if Changed:
                    #write the results
                    sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) +'.summary'
                    fio.SaveList(maxSum, sumfile, '\r\n')
                    
                    summary = maxSum
                    Length = 0
                    for s in maxSum:
                        Length = Length + len(s.split())
                    
                    Round = Round + 1
                    
                    newCandidates = []
                    #remove the candidate from the existing summary
                    for phrase in candidates:
                        if phrase not in maxSum:
                            newCandidates.append(phrase)
                    
                    candidates = newCandidates

        with open(cachefile, 'w') as outfile:
            json.dump(Cache, outfile, indent=2)
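A self-contained sketch of the same greedy selection loop with a stand-in scorer in place of ROUGE (all names and data here are illustrative, not part of the original code):

def greedy_select(candidates, score_fn, L):
    # Each round, try every remaining phrase and keep the single extension
    # that most improves the score, subject to a budget of L words total.
    summary, best = [], 0.0
    changed = True
    while changed:
        changed = False
        length = sum(len(p.split()) for p in summary)
        best_trial = None
        for phrase in candidates:
            if length + len(phrase.split()) > L:
                continue
            s = score_fn(summary + [phrase])
            if s > best:
                best, best_trial, changed = s, summary + [phrase], True
        if changed:
            summary = best_trial
            candidates = [p for p in candidates if p not in summary]
    return summary

# Toy scorer: count reference words covered by the summary.
ref_words = set("binary search trees".split())
score = lambda summ: len(ref_words & set(" ".join(summ).split()))
print(greedy_select(["binary search", "hash maps", "trees"], score, L=4))
# -> ['binary search', 'trees']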
Example #13
def extractPhraseFromAnnotation(phrasedir, annotator, summarydir=None):
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotator,
            lectures=annotation.Lectures):
        print(doc)

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        #Add a cache to make it faster
        Cache = {}
        cachefile = phrasedir + str(lec) + '/' + 'cache.json'
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            cluster_output = path + prompt + '.cluster.kmedoids.sqrt.oracle.%s' % method

            if summarydir:
                fio.NewPath(os.path.join(summarydir, str(lec)))
                summary_file = os.path.join(summarydir, str(lec),
                                            '%s.summary' % prompt)

            body = []

            if summarydir:
                summaries = []

            phrase_summary_dict = task.get_phrase_summary_textdict(prompt)
            extracted_phrases = []
            phrase_annotation = task.get_phrase_annotation(prompt)
            for rank in sorted(phrase_annotation):
                rank_phrases = []
                phrases = phrase_annotation[rank]
                for phrasedict in phrases:
                    phrase = phrasedict['phrase'].lower()
                    extracted_phrases.append(phrase)
                    rank_phrases.append(phrase)
                    row = [phrase, rank]
                    body.append(row)

                if summarydir:
                    rank_summary = phrase_summary_dict[rank]
                    max_summary = get_max_phrase_by_ROUGE(
                        rank_summary, rank_phrases, Cache)
                    print(max_summary)

                    summaries.append(max_summary)

            fio.SaveList(extracted_phrases, filename)

            fio.WriteMatrix(cluster_output, body, header=None)

            if summarydir:
                fio.SaveList(summaries, summary_file)

            with open(cachefile, 'w') as outfile:
                json.dump(Cache, outfile, indent=2)
Example #14
def getShallowSummary(excelfile, folder, clusterdir, K=30, method=None, similarity=None, ratio=None, lex='lexrank'):
    # K caps the number of phrases kept in the summary (a word-budget variant is commented out below)
    sheets = range(0,maxWeek)
    
    for i, sheet in enumerate(sheets):
        week = i + 1
        
        for type in ['q1', 'q2', 'q3', 'q4']:
            
            path = folder + str(week)+ '/'
            fio.NewPath(path)
            filename = path + type + '.%d.summary'%ratio
            
            #produce the cluster file on the fly
            phrasefile = os.path.join(clusterdir, str(week), type + '.' + method + '.key')
            if not fio.IsExist(phrasefile): continue
            
            print(excelfile, sheet, type)
            
            cluster_output = clusterdir + str(week) +'/' + type + ".cluster.kmedoids." + str(ratio) + "." +similarity + '.' + method
            print(cluster_output)
            
            weightfile = clusterdir + str(week)+ '/' + type + '.' + method + '.' + similarity
            print(weightfile)
            
            if not fio.IsExist(cluster_output):
            #if True:
                print "clustering"
                phraseClusteringKmedoid.getPhraseClusterPhrase(phrasefile, weightfile, cluster_output, ratio, method=method)
            if not fio.IsExist(cluster_output): continue
            body = fio.ReadMatrix(cluster_output, False)
            
            NPCandidates = fio.ReadFile(phrasefile)
            
            lexfile = clusterdir + str(week)+ '/' + str(type) + "." + method + "."+lex+".dict"
            lexdict = fio.LoadDict(lexfile, 'float')
            
            NPs = [row[0] for row in body]
            clusterids = [row[1] for row in body]
            
            #assert(NPCandidates == NPs)
            if NPCandidates != NPs:
                print(NPCandidates)
                print(NPs)
            
            cluster = {}
            for row in body:
                cluster[row[0]] = int(row[1])
            
            Summary = []
            
            #sort the clusters according to the number of response
            keys = postProcess.RankClusterNoSource(NPs, lexdict, clusterids)
            
            total_word = 0
            word_count = 0
            for key in keys:
                #phrase = NPs[key]
                phrase = postProcess.getTopRankPhraseNoSource(NPs, clusterids, int(key), lexdict)
                if phrase in Summary: continue
                
                word_count = len(phrase.split())
                total_word = total_word + word_count
                #if total_word <= K:
                if len(Summary) + 1 <= K:
                    Summary.append(phrase)
                    
            fio.SaveList(Summary, filename)
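A small illustration of the cluster-then-pick-a-representative idea above, with plain dicts standing in for the kmedoids output and the lexrank weight dictionary (all data is made up):

# phrase -> cluster id, and phrase -> importance weight
cluster = {'binary trees': 0, 'trees': 0, 'hash tables': 1}
weight = {'binary trees': 0.9, 'trees': 0.4, 'hash tables': 0.7}

summary = []
for cid in sorted(set(cluster.values())):
    members = [p for p in cluster if cluster[p] == cid]
    # Keep the highest-weighted phrase of each cluster.
    summary.append(max(members, key=weight.get))
print(summary)  # ['binary trees', 'hash tables']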