Beispiel #1
0
def PrintClusterRankSummary(datadir):
    """Collect each week's question summaries under *datadir* into summary.txt.

    One row per week: week number, lecture date, then one quoted cell per
    question type that has a '.summary' file, where every summary line is
    numbered and annotated with its source count.
    """
    lectures = fio.LoadDictJson('../data/CourseMIRROR/lectures.json')

    head = ['week', 'data', 'Point of Interest', "Muddiest Point"]
    body = []

    for week_index in range(maxWeek):
        week = week_index + 1
        row = [week, getDate(lectures, course, week)]

        week_dir = datadir + str(week) + '/'
        for qid in ['q1', 'q2', 'q3', 'q4']:
            summaryfile = week_dir + qid + '.summary'
            if not fio.IsExist(summaryfile): continue

            summaries = [ln.strip() for ln in fio.ReadFile(summaryfile)]

            sourcefile = week_dir + qid + '.summary.source'
            sources = [ln.split(',') for ln in fio.ReadFile(sourcefile)]

            # number each summary and append its source count: '1) text [3]'
            numbered = []
            for k, (summary, source) in enumerate(zip(summaries, sources)):
                summary = summary.replace('"', '\'')
                numbered.append(str(k + 1) + ") " + summary + " [" + str(len(source)) + "]")

            # quote the whole cell; chr(10) is '\n' between lines inside it
            row.append('"' + chr(10).join(numbered) + '"')

        body.append(row)
    fio.WriteMatrix(datadir + "summary.txt", body, head)
Beispiel #2
0
    def load(self, filename):
        """Read *filename*, remember its path and raw lines, then post-process."""
        self.filename = filename
        self.lines = fio.ReadFile(self.filename)
        # pull out the task annotations, then merge the collected info
        self.extract_task_anntation()  # (sic: method name as declared elsewhere)
        self.combine_info()
Beispiel #3
0
def getRouge_Tac(refs, model):
    #return the Rouge scores given the reference summary and the models
    
    #write the files
    fio.SaveList(model, tmpdir+'model.txt', '\n')
    
    for i, ref in enumerate(refs):
        fio.SaveList(ref, tmpdir+'ref%d.txt'%(i+1), '\n')
    
    retcode = subprocess.call(['./get_rouge_tac'], shell=True)
    if retcode != 0:
        print("Failed!")
        exit(-1)
    else:
        print "Passed!"
    
    row = []
    for scorename in RougeNames:
        filename = tmpdir + "OUT_"+scorename+".csv"
        lines = fio.ReadFile(filename)
        try:
            scorevalues = lines[1].split(',')
            score = scorevalues[1].strip()
            row.append(score)
            score = scorevalues[2].strip()
            row.append(score)
            score = scorevalues[3].strip()
            row.append(score)
        except Exception:
            print filename, scorename, lines
            
    return row
Beispiel #4
0
def SennaParseWithCountDict(filename):
    """
	@function: Parse the file and return a list of sentence with index.
	@param filename: string, the filename of the sennafile, the sennafile is an output file given by SENNA
	@return: <list, dict>, the dict stores for the start line for each sentence
	"""
    lines = fio.ReadFile(filename)
    print "nLine=", len(lines)
    sys.stdout.flush()

    CountDict = {}

    nCount = 0
    nLast = -1
    for i in range(len(lines)):
        line = lines[i]
        row = []
        line = line.strip()
        if len(line) == 0:  #the last sentence is finished
            CountDict[nCount] = nLast + 1
            nLast = i
            nCount = nCount + 1
    print "nCount=", nCount
    sys.stdout.flush()

    #for s in sentences:
    #	print s
    return lines, CountDict
Beispiel #5
0
def getPhraseClusterPhrase(phrasefile,
                           weightfile,
                           output,
                           ratio=None,
                           method=None):
    """Cluster the candidate phrases with K-medoids and write <phrase, id> rows.

    phrasefile: one candidate phrase per line.
    weightfile: pairwise similarity matrix over phrases, with a header row.
    output: destination file for the <phrase, cluster id> matrix.
    ratio: controls the cluster count K: "sqrt" -> sqrt(V); a value >= 1 is
        used as K directly; otherwise it is treated as a fraction of V.
        NOTE(review): the default None would crash at float(ratio) -- callers
        apparently always pass a value; confirm before relying on the default.
    method: only used in the diagnostic print below.
    """
    NPCandidates = fio.ReadFile(phrasefile)
    if len(NPCandidates) == 0: return

    NPs, matrix = fio.ReadMatrix(weightfile, hasHead=True)

    #change the similarity to distance
    matrix = Similarity2Distance(matrix)

    # map each phrase to its row/column index in the distance matrix
    index = {}
    for i, NP in enumerate(NPs):
        index[NP] = i

    newMatrix = []

    # restrict the distance matrix to the candidate phrases only
    for NP1 in NPCandidates:
        if NP1 not in index: continue

        i = index[NP1]

        row = []
        for NP2 in NPCandidates:
            if NP2 not in index:
                # NOTE(review): skipping here produces rows of unequal length
                # when a candidate is missing from the weight matrix -- confirm
                # the clusterer tolerates that
                print NP2, weightfile, method
                continue

            j = index[NP2]
            row.append(matrix[i][j])

        newMatrix.append(row)

    # derive the number of clusters K from ratio (see docstring)
    V = len(NPCandidates)
    if ratio == "sqrt":
        K = int(math.sqrt(V))
    elif float(ratio) >= 1:
        K = int(ratio)
    else:
        K = int(ratio * V)

    if K < 1: K = 1  # at least one cluster

    K = min(K, V)  # never more clusters than phrases

    clusterid = ClusterWrapper.KMedoidCluster(newMatrix, K)

    # pair each candidate with its assigned cluster id and persist
    body = []
    for NP, id in zip(NPCandidates, clusterid):
        row = []
        row.append(NP)
        row.append(id)
        body.append(row)

    fio.WriteMatrix(output, body, header=None)
Beispiel #6
0
    def readgraph_partitions(self, input):
        """Read community partitions: one community per non-comment line,
        each line holding whitespace-separated integer node ids."""
        communites = []
        for raw in fio.ReadFile(input):
            if raw.startswith('#'): continue  # skip comment lines
            communites.append([int(tok) for tok in raw.strip().split()])
        return communites
Beispiel #7
0
def getRougeTmp(ref, model):
    #return the Rouge scores given the reference summary and the models
    #create a temp file
    temp_path = mkdtemp()
    print(temp_path)
    
    #write the files
    fio.SaveList(ref, os.path.join(temp_path, 'ref.txt'), '\n')
    fio.SaveList(model, os.path.join(temp_path, 'model.txt'), '\n')
    
    retcode = subprocess.call(['./get_rouge_tmp %s'%temp_path], shell=True)
    if retcode != 0:
        print("Failed!")
        exit(-1)
    else:
        print "Passed!"
    
    row = []
    for scorename in RougeNames:
        filename = os.path.join(temp_path, "OUT_"+scorename+".csv")
        
        if not fio.IsExist(filename): 
            print filename, " not exist"
            row = row + [0, 0, 0]
            
            continue
        
        lines = fio.ReadFile(filename)
        try:
            scorevalues = lines[1].split(',')
            score = scorevalues[1].strip()
            row.append(score)
            score = scorevalues[2].strip()
            row.append(score)
            score = scorevalues[3].strip()
            row.append(score)
            fio.DeleteFolder(temp_path)
        except Exception:
            print filename, scorename, lines
            
    return row
Beispiel #8
0
def SennaParse(filename):
    """
	@function: Parse the file and return a list of sentence. Each sentence is a SennaSentence
	@param filename: string, the filename of the sennafile, the sennafile is an output file given by SENNA
	@return: list, Each item is a SennaSentence
	"""
    lines = fio.ReadFile(filename)
    sys.stdout.flush()

    # first pass: count sentences (each blank line terminates one sentence)
    nCount = sum(1 for line in lines if len(line.strip()) == 0)

    sentences = [None] * nCount
    nCount = 0

    tm = []  # token rows of the sentence currently being assembled
    for line in lines:
        line = line.strip()
        if len(line) == 0:  #the current sentence is finished
            sentences[nCount] = SennaSentence(tm)
            nCount = nCount + 1
            tm = []
            continue

        # each non-blank line is one token: tab-separated SENNA columns
        tm.append([num.strip() for num in line.split("\t")])

    return sentences
Beispiel #9
0
def getRouge(ref, model):
    #return the Rouge scores given the reference summary and the models
    
    #write the files
    fio.SaveList(ref, tmpdir+'ref.txt', '\n')
    fio.SaveList(model, tmpdir+'model.txt', '\n')
    
    retcode = subprocess.call(['./get_rouge'], shell=True)
    if retcode != 0:
        print("Failed!")
        exit(-1)
    else:
        print "Passed!"
    
    row = []
    for scorename in RougeNames:
        filename = tmpdir + "OUT_"+scorename+".csv"
        
        if not fio.IsExist(filename): 
            print filename, " not exist"
            row = row + [0, 0, 0]
            
            continue
        
        lines = fio.ReadFile(filename)
        try:
            scorevalues = lines[1].split(',')
            score = scorevalues[1].strip()
            row.append(score)
            score = scorevalues[2].strip()
            row.append(score)
            score = scorevalues[3].strip()
            row.append(score)
        except Exception:
            print filename, scorename, lines
            
    return row
Beispiel #10
0
def getOracleRougeSplit(oracledir, np, L, metric, outputdir):
    """Collect Rouge scores of the final-round greedy summaries per week.

    For each week (1..12) and each type, find the highest-numbered
    '.R<round>.summary' file written by the greedy search, copy it to a
    round-independent '<type>.<np>.L<L>.summary', score it against the TA
    reference (memoized through the week's cache.json), and write the score
    matrix with a trailing column-average row into outputdir.
    NOTE(review): newly computed cache entries are never written back to
    cachefile here -- confirm whether that is intentional.
    """
    #sheets = range(0,1)
    sheets = range(0,12)
    
    body = []
    
    for i, sheet in enumerate(sheets):
        week = i + 1
            
        #Add a cache to make it faster
        Cache = {}
        cachefile = oracledir + str(week) + '/' + 'cache.json'
        print cachefile
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)
        
        row = []
        for type in ['POI', 'MP', 'LP']:
            row.append(week)
        
            #read TA's summmary
            reffile = oracledir + str(week) + '/' + type + '.ref.summary'
            lines = fio.ReadFile(reffile)
            ref = [line.strip() for line in lines]
            
            # find the last greedy round: rounds are numbered from 1 and
            # stop at the first missing file
            Round = 1
            while True:
                sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) +'.summary'
                if not fio.IsExist(sumfile): break
                Round = Round + 1
            
            Round = Round - 1
            sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) +'.summary'
            
            if fio.IsExist(sumfile):
                import os
                # keep a copy of the final summary under a round-independent name
                ssfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + ".summary"
                cmd = 'cp ' + sumfile + ' ' + ssfile
                print cmd
                
                os.system(cmd)
                lines = fio.ReadFile(sumfile)
                TmpSum = [line.strip() for line in lines]
                
                # look the (ref, summary) pair up in the cache before scoring
                cacheKey = getKey(ref, TmpSum)
                if cacheKey in Cache:
                    scores = Cache[cacheKey]
                    print "Hit"
                else:
                    print "Miss", cacheKey
                    print sumfile
                    scores = getRouge(ref, TmpSum)
                    Cache[cacheKey] = scores
                    #exit()
                
                row = row + scores
            else:
                # no summary for this type: pad with zeros to keep columns aligned
                row = row + [0]*len(RougeHeader)
            
        body.append(row)
    
    print body
    print "RougeHeader", len(RougeHeader)
    header = ['week'] + RougeHeader*3
    # trailing row holding the column-wise averages
    row = []
    row.append("average")
    print len(header)
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)
    
    fio.WriteMatrix(outputdir + "rouge." + str(np) + '.L' + str(L) + "." + str(metric) + ".txt", body, header)
Beispiel #11
0
def Greedy(oracledir, np, L, metric='R1-F'):
    #sheets = range(0,1)
    sheets = range(0,12)
    RIndex = RougeHeader.index(metric)
    assert(RIndex != -1)
    
    for i, sheet in enumerate(sheets):
        week = i + 1
        
        #Add a cache to make it faster
        Cache = {}
        cachefile = oracledir + str(week) + '/' + 'cache.json'
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)
        
        #for type in ['POI']:
        for type in ['POI', 'MP', 'LP']:
            #read TA's summmary
            reffile = oracledir + str(week) + '/' + type + '.ref.summary'
            lines = fio.ReadFile(reffile)
            ref = [line.strip() for line in lines]
            
            #read Phrases
            phrasefile = oracledir + str(week) + '/' + type + '.' + str(np) + '.key'
            lines = fio.ReadFile(phrasefile)
            candidates = [line.strip() for line in lines]
            
            summary = []
            Length = 0
            
            maxSum = []
            maxScore = 0
            Round = 1
            
            Changed = True
            while Changed:
                Changed = False
                for phrase in candidates:
                    WC = len(phrase.split())
                    if Length + WC > L: continue
                    
                    TmpSum = copy.deepcopy(summary)
                    TmpSum.append(phrase)
                    
                    #get Rouge Score
                    cacheKey = getKey(ref, TmpSum)
                    if cacheKey in Cache:
                        scores = Cache[cacheKey]
                        print "Hit"
                    else:
                        scores = getRouge(ref, TmpSum)
                        Cache[cacheKey] = scores
                    
                    s = float(scores[RIndex])
                    #s = scores[RIndex]
                    if s > maxScore:
                        maxSum = TmpSum
                        maxScore = scores
                        Changed = True
                
                if Changed:
                    #write the results
                    sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) +'.summary'
                    fio.SaveList(maxSum, sumfile, '\r\n')
                    
                    summary = maxSum
                    Length = 0
                    for s in maxSum:
                        Length = Length + len(s.split())
                    
                    Round = Round + 1
                    
                    newCandidates = []
                    #remove the candidate from the existing summary
                    for phrase in candidates:
                        if phrase not in maxSum:
                            newCandidates.append(phrase)
                    
                    candidates = newCandidates

        with open(cachefile, 'w') as outfile:
            json.dump(Cache, outfile, indent=2)
Beispiel #12
0
def getShallowSummary(excelfile, folder, clusterdir, K=30, method=None, similarity=None, ratio=None, lex='lexrank'):
    """Build a summary of up to K phrases per week/question from clustered phrases.

    For every week and question type: cluster the candidate phrases with
    K-medoids (running the clustering on the fly when the cluster file is
    missing), rank the clusters, take the top-ranked phrase of each cluster,
    and save up to K distinct phrases to '<type>.<ratio>.summary'.
    NOTE(review): despite the comment below, the selection condition caps the
    number of phrases, not the number of words -- confirm which is intended.
    """
    #K is the number of words per points
    sheets = range(0,maxWeek)
    
    for i, sheet in enumerate(sheets):
        week = i + 1
        
        for type in ['q1', 'q2', 'q3', 'q4']:
            
            path = folder + str(week)+ '/'
            fio.NewPath(path)
            filename = path + type + '.%d.summary'%ratio
            
            #produce the cluster file on the fly
            phrasefile = os.path.join(clusterdir, str(week), type + '.' + method + '.key')
            if not fio.IsExist(phrasefile): continue
            
            print excelfile, sheet, type
            
            cluster_output = clusterdir + str(week) +'/' + type + ".cluster.kmedoids." + str(ratio) + "." +similarity + '.' + method
            print cluster_output
            
            weightfile = clusterdir + str(week)+ '/' + type + '.' + method + '.' + similarity
            print weightfile
            
            if not fio.IsExist(cluster_output):
            #if True:
                print "clustering"
                phraseClusteringKmedoid.getPhraseClusterPhrase(phrasefile, weightfile, cluster_output, ratio, method=method)
            if not fio.IsExist(cluster_output): continue
            body = fio.ReadMatrix(cluster_output, False)
            
            NPCandidates = fio.ReadFile(phrasefile)
            
            # per-phrase scores from the lex ranking (e.g. lexrank)
            lexfile = clusterdir + str(week)+ '/' + str(type) + "." + method + "."+lex+".dict"
            lexdict = fio.LoadDict(lexfile, 'float')
            
            NPs = [row[0] for row in body]
            clusterids = [row[1] for row in body]
            
            # diagnostic only: the cluster file should list the same phrases
            #assert(NPCandidates == NPs)
            if NPCandidates != NPs: 
                print NPCandidates
                print NPs
            
            cluster = {}
            for row in body:
                cluster[row[0]] = int(row[1])
            
            Summary = []
            
            #sort the clusters according to the number of response
            keys = postProcess.RankClusterNoSource(NPs, lexdict, clusterids)
            
            total_word = 0
            word_count = 0
            for key in keys:
                #phrase = NPs[key]
                # best-ranked phrase of this cluster
                phrase = postProcess.getTopRankPhraseNoSource(NPs, clusterids, int(key), lexdict)
                if phrase in Summary: continue
                
                word_count = len(phrase.split())
                total_word = total_word + word_count
                #if total_word <= K:
                if len(Summary) + 1 <= K:
                    Summary.append(phrase)
                    
            fio.SaveList(Summary, filename)
Beispiel #13
0
import sys
import re
import fio
import xml.etree.ElementTree as ET
from collections import defaultdict

import postProcess
import random
import CourseMirror_Survey
import phraseClusteringKmedoid
import os

# domain stopword list, lower-cased
stopwords = [line.lower().strip() for line in fio.ReadFile("../data/smart_common_words.txt")]

# keep these words: they are meaningful student answers, not noise
noremove = ['nothing', 'none']
for w in noremove:
    if w in stopwords:
        stopwords.remove(w)

# punctuation tokens count as stopwords too
stopwords = stopwords + ['.', '?', '-', ',', '[', ']', '-', ';', '\'', '"', '+', '&', '!', '/', '>', '<', ')', '(', '#', '=']

def getTopRankPhrase(NPs, clusterids, cid, lexdict, sources):
    #get cluster NP, and scores
    dict = {}
    
    s = []
    
    for NP, id, source in zip(NPs, clusterids, sources):
        if int(id) == cid:
Beispiel #14
0
]  #'none', "no", "nothing"

import datetime

# maps the textual rating words onto a 1-5 numeric scale
RatingKey = {
    "slightly": 1,
    "somewhat": 2,
    "moderately": 3,
    "mostly": 4,
    "completely": 5
}

# separator between a response and its appended rating text
RateSplitTag = "||Rating: "

# SMART stopword list, lower-cased
stopwordfilename = "../data/smart_common_words.txt"
stopwords = [line.lower().strip() for line in fio.ReadFile(stopwordfilename)]
punctuations = [
    '.', '?', '-', ',', '[', ']', '-', ';', '\'', '"', '+', '&', '!', '/', '>',
    '<', ')', '(', '#', '='
]

# punctuation tokens count as stopwords too
stopwords = stopwords + punctuations


def getRatingkey(rate):
    """Return the numeric key for a textual rating, or -1 if unrecognized."""
    return RatingKey.get(rate.strip().lower(), -1)

Beispiel #15
0
def getRouge(datadir, maxWeek, output):
    print datadir

    sheets = range(0, maxWeek)

    body = []

    for sheet in sheets:
        week = sheet + 1
        dir = datadir + str(week) + '/'

        for type in ['q1', 'q2']:
            summary_file = dir + type + "." + 'summary'
            print summary_file

            if not fio.IsExist(summary_file):
                print summary_file
                continue

            Cache = {}
            cachefile = os.path.join(datadir, str(week), 'cache.json')
            print cachefile
            if fio.IsExist(cachefile):
                with open(cachefile, 'r') as fin:
                    Cache = json.load(fin)

            #read TA's summmary
            refs = []
            for i in range(2):
                reffile = os.path.join(datadir, str(week),
                                       type + '.ref.%d' % i)
                if not fio.IsExist(reffile):
                    print reffile
                    continue

                lines = fio.ReadFile(reffile)
                ref = [line.strip() for line in lines]
                refs.append(ref)

            if len(refs) == 0: continue

            lstref = refs[0] + refs[1]

            lines = fio.ReadFile(summary_file)
            TmpSum = [line.strip() for line in lines]

            cacheKey = OracleExperiment.getKey(lstref, TmpSum)
            if cacheKey in Cache:
                scores = Cache[cacheKey]
                print "Hit"
            else:
                print "Miss"
                print summary_file
                scores = OracleExperiment.getRouge_IE256(refs, TmpSum)
                Cache[cacheKey] = scores

            row = [week]
            row = row + scores

            body.append(row)

            try:
                fio.SaveDict2Json(Cache, cachefile)
            except Exception as e:
                #fio.SaveDict(Cache, cachefile + '.dict')
                print e

    header = ['id'] + RougeHeader
    row = ['ave']
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)

    fio.WriteMatrix(output, body, header)
Beispiel #16
0
def getRouge(datadir, maxWeek, output):
    sheets = range(0, maxWeek)

    body = []
    allbody = []

    #Krange = range(1, 25)
    #Krange = range(1, 25)
    Krange = [gK]

    for sheet in sheets:
        week = sheet + 1
        dir = datadir + str(week) + '/'

        for type in ['q1', 'q2']:

            maxS = 0
            maxK = -1
            maxScore = []

            Cache = {}
            cachefile = os.path.join(datadir, str(week), 'cache.json')
            print cachefile
            if fio.IsExist(cachefile):
                with open(cachefile, 'r') as fin:
                    Cache = json.load(fin)

            allrow = [week]

            #Krange = [np.random.randint(1, 25)]

            for K in Krange:

                summary_file = dir + type + '.%d.summary' % K

                print summary_file

                if not fio.IsExist(summary_file):
                    print summary_file
                    continue

                #read TA's summmary
                refs = []
                for i in range(2):
                    reffile = os.path.join(datadir, str(week),
                                           type + '.ref.%d' % i)
                    if not fio.IsExist(reffile):
                        print reffile
                        continue

                    lines = fio.ReadFile(reffile)
                    ref = [line.strip() for line in lines]
                    refs.append(ref)

                if len(refs) == 0: continue

                lstref = refs[0] + refs[1]

                lines = fio.ReadFile(summary_file)
                TmpSum = [line.strip() for line in lines]

                cacheKey = OracleExperiment.getKey(lstref, TmpSum)
                if cacheKey in Cache:
                    scores = Cache[cacheKey]
                    print "Hit"
                else:
                    print "Miss"
                    print summary_file
                    scores = OracleExperiment.getRouge_IE256(refs, TmpSum)
                    Cache[cacheKey] = scores

                s = float(scores[RIndex])

                allrow.append(s)

                if s >= maxS:
                    maxS = s
                    maxScore = scores
                    maxK = K

            if maxK == -1: continue

            row = [week]
            row = row + maxScore + [maxK]

            body.append(row)

            allrow.append(maxK)

            allbody.append(allrow)

            try:
                fio.SaveDict2Json(Cache, cachefile)
            except:
                #fio.SaveDict(Cache, cachefile + '.dict')
                pass

    header = ['id'] + RougeHeader
    row = ['ave']
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)

    fio.WriteMatrix(output, body, header)

    fio.WriteMatrix(output + '.all', allbody, ['week'] + Krange)