Example #1
def train_IE256_svm(traincourse, model_dir, name='simlearn_cv'):
    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    features = allfeatures

    name = '_'.join(features)

    lectures = annotation.Lectures

    dict = defaultdict(int)

    if traincourse == 'IE256':
        train = [x for x in range(14, 26) if x != 22]
    else:
        train = [x for x in range(3, 27)]

    model_file = os.path.join(model_dir, '%s_%s.model' % (traincourse, name))

    if fio.IsExist(model_file):
        with open(model_file, 'rb') as handle:
            clf = pickle.load(handle)
    else:
        train_X, train_Y = combine_files_course(traincourse, train, features)
        clf = svm.SVC()
        clf.fit(train_X, train_Y)

        with open(model_file, 'wb') as handle:
            pickle.dump(clf, handle)
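
The load-or-train pattern above (unpickle a cached model if present, otherwise fit and cache it) can be exercised on its own. A minimal sketch with scikit-learn and synthetic data; the cache path is hypothetical:

import os
import pickle

from sklearn import svm

def load_or_train(model_file, X, Y):
    #reuse the cached classifier when the pickle exists
    if os.path.exists(model_file):
        with open(model_file, 'rb') as handle:
            return pickle.load(handle)
    #otherwise fit a fresh SVC and cache it for the next run
    clf = svm.SVC()
    clf.fit(X, Y)
    with open(model_file, 'wb') as handle:
        pickle.dump(clf, handle)
    return clf

clf = load_or_train('/tmp/demo_svc.model', [[0, 0], [1, 1]], [0, 1])
print(clf.predict([[0.9, 0.9]]))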
Example #2
def extractPhrase(excelfile, folder, sennadatadir, method):
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:
            #for type in ['POI', 'MP']:
            print excelfile, sheet, type
            student_summaryList = CourseMirror_Survey.getStudentResponseList(
                excelfile, course, week, type, withSource=False)
            if len(student_summaryList) == 0: continue

            path = folder + str(week) + '/'
            fio.NewPath(path)
            filename = path + type + '.' + method + '.key'

            sennafile = sennadatadir + "senna." + str(
                week) + "." + type + '.output'
            if not fio.IsExist(sennafile): continue

            phrases = getKeyPhrases(student_summaryList,
                                    sennafile,
                                    method=method,
                                    MalformedFlilter=True)

            fio.SaveList(phrases, filename)
Example #3
def PrintClusterRankSummary(datadir):
    sheets = range(0,maxWeek)
    
    lectures = fio.LoadDictJson('../data/CourseMIRROR/lectures.json')
    
    head = ['week', 'date', 'Point of Interest', "Muddiest Point"]
    body = []
    
    for i, sheet in enumerate(sheets):        
        row = []
        week = i + 1
        
        row.append(week)
        row.append(getDate(lectures, course, week))
        
        for type in ['q1', 'q2', 'q3', 'q4']:
            path = datadir + str(i+1)+ '/'
            summaryfile = path + type + '.summary'
            if not fio.IsExist(summaryfile): continue
            
            summaries = [line.strip() for line in fio.ReadFile(summaryfile)]
            
            sourcefile = path + type + '.summary.source'
            sources = [line.split(',') for line in fio.ReadFile(sourcefile)]
            
            combinedSummary = []
            for j, (summary, source) in enumerate(zip(summaries, sources)):
                summary = summary.replace('"', '\'')
                combinedSummary.append(str(j+1) + ") " + summary + " [" + str(len(source)) + "]")
            
            row.append('"' + chr(10).join(combinedSummary)+ '"') 
        
        body.append(row)
    fio.WriteMatrix(datadir + "summary.txt", body, head)
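
The quote swap and the newline join above pack a multi-line summary into a single quoted spreadsheet cell. That formatting step in isolation, with made-up summaries:

summaries = ['point "A" was unclear', 'point B was clear']
sources = [['s1', 's2'], ['s3']]

combinedSummary = []
for j, (summary, source) in enumerate(zip(summaries, sources)):
    #double quotes inside the cell would end the field early, so swap them
    summary = summary.replace('"', '\'')
    combinedSummary.append(str(j + 1) + ") " + summary + " [" + str(len(source)) + "]")

#the outer quotes keep the joined lines inside one field
print('"' + '\n'.join(combinedSummary) + '"')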
Example #4
def train_leave_one_lecture_out(model_dir, name='simlearn_cv'):
    #     model_dir = '../data/IE256/%s/model/%s/'%(system, name)
    #     fio.NewPath(model_dir)
    #
    #     outputdir = '../data/IE256/%s/extraction/%s_output/'%(system, name)
    #     fio.NewPath(outputdir)

    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    if True:
        k = len(allfeatures)
        #for k in range(len(allfeatures)+1):
        #features = allfeatures#['WordEmbedding']

        if k == len(allfeatures):  #use all features
            features = allfeatures
        else:
            features = [allfeatures[k]]

        name = '_'.join(features)

        lectures = annotation.Lectures

        dict = defaultdict(int)

        MSE = []
        for i, lec in enumerate(lectures):
            train = [x for x in lectures if x != lec]
            test = [lec]

            print train
            print test

            model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))

            if fio.IsExist(model_file):
                with open(model_file, 'rb') as handle:
                    clf = pickle.load(handle)
            else:
                train_X, train_Y = combine_files(train, features)
                clf = svm.SVR()
                clf.fit(train_X, train_Y)

                with open(model_file, 'wb') as handle:
                    pickle.dump(clf, handle)

            for q in ['q1', 'q2']:
                test_X, test_Y = combine_files(test, features, prompts=[q])
                predict_Y = clf.predict(test_X)

                mse = mean_squared_error(test_Y, predict_Y)

                MSE.append([lec, q, mse])

        output = '../data/%s/simlearning.cv.%s.txt' % (course, name)

        fio.WriteMatrix(output, MSE, header=['lec', 'prompt', 'MSE'])
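
The train/evaluate core of the leave-one-lecture-out loop, reduced to a self-contained toy regression (synthetic features stand in for the combine_files output):

from sklearn import svm
from sklearn.metrics import mean_squared_error

X = [[0.0], [0.5], [1.0], [1.5]]
Y = [0.0, 0.5, 1.0, 1.5]

clf = svm.SVR()
clf.fit(X[:-1], Y[:-1])            #train on every point but the held-out one
predict_Y = clf.predict(X[-1:])    #predict the held-out point
print(mean_squared_error(Y[-1:], predict_Y))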
Example #5
def generate_all_files(datadir,
                       extension,
                       anotators=anotators,
                       lectures=AllLectures):
    for annotator in anotators:
        for lec in lectures:
            filename = datadir + annotator + doc_prefix + str(
                lec) + '_Completed' + extension

            assert (fio.IsExist(filename))

            yield filename, lec, annotator
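
Since generate_all_files is a generator, callers iterate it directly (Example #20 does exactly this); the datadir and extension here are hypothetical:

#yields one (filename, lecture, annotator) triple per annotation file
for filename, lec, annotator in generate_all_files('../data/json/', '.json'):
    print(filename, lec, annotator)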
Example #6
def WriteDocsent(excelfile, folder, phrasedir, np=None):
    sheets = range(0, maxWeek)

    for i, sheet in enumerate(sheets):
        week = i + 1

        for type in ['q1', 'q2', 'q3', 'q4']:

            phrasefile = os.path.join(phrasedir, str(week),
                                      type + '.' + method + '.key')
            if not fio.IsExist(phrasefile): continue

            print phrasefile

            DID = str(week) + '_' + type

            path = folder + str(week) + '/'
            fio.NewPath(path)
            path = path + type + '/'
            fio.NewPath(path)
            path = path + 'docsent/'
            fio.NewPath(path)
            filename = path + DID + '.docsent'

            #create an XML file
            root = ET.Element(tag='DOCSENT',
                              attrib={
                                  'DID': DID,
                                  'LANG': "ENG"
                              })
            root.tail = '\n'
            tree = ET.ElementTree(root)

            phrases = fio.ReadFileUTF8(phrasefile)

            sno_id = 1
            for par, phrase in enumerate(phrases):
                phrase = phrase.rstrip()
                s = [phrase]

                for RSNT, value in enumerate(s):
                    node = ET.Element(tag='S',
                                      attrib={
                                          'PAR': str(par + 1),
                                          'RSNT': str(RSNT + 1),
                                          'SNO': str(sno_id)
                                      })
                    node.text = value
                    node.tail = '\n'
                    root.append(node)
                    sno_id = sno_id + 1

            tree.write(filename)
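
A condensed, standalone version of the same ElementTree pattern, showing the shape of the .docsent XML the function emits (the IDs are made up):

import xml.etree.ElementTree as ET

root = ET.Element('DOCSENT', attrib={'DID': '1_q1', 'LANG': 'ENG'})
root.tail = '\n'

node = ET.Element('S', attrib={'PAR': '1', 'RSNT': '1', 'SNO': '1'})
node.text = 'an example phrase'
node.tail = '\n'
root.append(node)

#one <S> element per phrase, wrapped in a <DOCSENT> root
print(ET.tostring(root))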
Example #7
def gather_rouge(output):
    datadir = '../data/%s/' % course

    #output = '../data/IE256/result.rouge.txt'

    models = [
        'QPS_NP',
        #'QPS_A1_N', 'QPS_A2_N', 'QPS_union', 'QPS_intersect',
        'QPS_combine'
    ]
    methods = [
        'rouge_crf_optimumComparerLSATasa',
        'rouge_crf_ct.svm.default',
        #'rouge_crf_svm',
        #'rouge_crf_svr',
        'rouge_crf_ct.svm.default',
        #'rouge_crf_ct.svr.default',
    ]

    Header = [
        'method',
        'model',
        'R1-R',
        'R1-P',
        'R1-F',
        'R2-R',
        'R2-P',
        'R2-F',
        'RSU4-R',
        'RSU4-P',
        'RSU4-F',
    ]

    xbody = []
    for method in methods:
        for model in models:

            filename = os.path.join(datadir, model, "%s.txt" % method)

            if not fio.IsExist(filename): continue

            head, body = fio.ReadMatrix(filename, hasHead=True)

            row = [method, model]
            row += body[-1][1:]

            xbody.append(row)

    fio.WriteMatrix(output, xbody, Header)
Example #8
def test_cross_course(train, name='all'):
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt' % name
    model_dir = '../data/%s/%s/model/%s/' % (course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/' % (course, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/' % (course, system, name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    lectures = annotation.Lectures

    dict = defaultdict(int)

    for i, lec in enumerate(lectures):
        test = [lec]
        model_file = os.path.join(model_dir, '%s.model' % train)

        print model_file

        crf = CRF(wapiti_home)
        if not fio.IsExist(model_file):
            print "Model is not available"

        for q in ['q1', 'q2']:

            test_filename = os.path.join(feature_cv_dir,
                                         'test_%d_%s.feature.crf' % (i, q))
            output_file = os.path.join(outputdir, 'test_%d_%s.out' % (i, q))

            dict['test_%d_%s' % (i, q)] = 1

            if method == 'combine':
                test_filename_old = test_filename.replace('_combine', '_A1')
                cmd = 'cp %s %s' % (test_filename_old, test_filename)
                os.system(cmd)
            else:
                combine_files(feature_dir, test, test_filename, prompts=[q])

            crf.predict(test_filename, model_file, output_file)

        if debug: break

    file_util.save_dict2json(dict, class_index_dict_file)
Example #9
def getRougeTmp(ref, model):
    #return the Rouge scores given the reference summary and the model summary
    #create a temp file
    temp_path = mkdtemp()
    print(temp_path)
    
    #write the files
    fio.SaveList(ref, os.path.join(temp_path, 'ref.txt'), '\n')
    fio.SaveList(model, os.path.join(temp_path, 'model.txt'), '\n')
    
    retcode = subprocess.call(['./get_rouge_tmp %s'%temp_path], shell=True)
    if retcode != 0:
        print("Failed!")
        exit(-1)
    else:
        print "Passed!"
    
    row = []
    for scorename in RougeNames:
        filename = os.path.join(temp_path, "OUT_"+scorename+".csv")
        
        if not fio.IsExist(filename): 
            print filename, " not exist"
            row = row + [0, 0, 0]
            continue
        
        lines = fio.ReadFile(filename)
        try:
            #recall, precision, and F-measure sit in columns 1-3 of the CSV
            scorevalues = lines[1].split(',')
            row.append(scorevalues[1].strip())
            row.append(scorevalues[2].strip())
            row.append(scorevalues[3].strip())
        except Exception:
            print filename, scorename, lines
    
    #delete the temp folder only after every score file has been read
    fio.DeleteFolder(temp_path)
    return row
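
A sketch of the same temp-directory lifecycle with a finally block, so cleanup happens even when the scorer fails; 'echo' stands in for the real get_rouge_tmp script, and nothing is assumed about fio:

import subprocess
from shutil import rmtree
from tempfile import mkdtemp

temp_path = mkdtemp()
try:
    #the real code writes ref.txt/model.txt here and runs ./get_rouge_tmp
    retcode = subprocess.call(['echo %s' % temp_path], shell=True)
    if retcode != 0:
        print("Failed!")
finally:
    rmtree(temp_path)   #runs whether the scorer succeeded or not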
Example #10
    def __init__(self, prefix=""):
        self.features = {
            'optimumComparerLSATasa': self.LSA,
            'LexicalOverlap': self.LexicalOverlap,
            'optimumComparerWNLin': self.LIN,
            'BLEU': self.BLEU,
            'ROUGE': self.ROUGE,
            'Cosine': self.Cosine,
            'WordEmbedding': self.WordEmbedding,
            #'WMD': self.WMD,
        }

        self.prefix = prefix

        self.Cache = {}
        self.cachefile = prefix + 'cache.json'
        print self.cachefile
        if fio.IsExist(self.cachefile):
            with open(self.cachefile, 'r') as fin:
                self.Cache = json.load(fin)

        if self.prefix != '':
            self.matrixdict = {}
            for sim in [
                    'optimumComparerLSATasa', 'LexicalOverlap',
                    'optimumComparerWNLin', 'BLEU'
            ]:
                self.matrixdict[sim] = {}

                filename = self.prefix + sim

                phrases, matrix = fio.ReadMatrix(filename, hasHead=True)

                index = {}
                for i, p in enumerate(phrases):
                    index[p] = i

                self.matrixdict[sim]['index'] = index
                self.matrixdict[sim]['matrix'] = matrix

        self.word2vec = fio.LoadDictJson(global_params.word2vec_model)
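
Examples #1, #4, and #19 consume this constructor only through sorted(sim_extractor.features.keys()). The dict maps feature names to bound methods, so every similarity measure can be looked up and called by name; a toy stand-in (the two measures here are simplified set-overlap versions, not the repo's):

class ToyExtractor(object):
    def __init__(self):
        #name -> bound method, a dispatch table for similarity features
        self.features = {
            'Cosine': self.Cosine,
            'LexicalOverlap': self.LexicalOverlap,
        }

    def Cosine(self, a, b):
        #binary cosine over token sets
        sa, sb = set(a), set(b)
        return len(sa & sb) / float(len(sa) * len(sb)) ** 0.5

    def LexicalOverlap(self, a, b):
        #Jaccard overlap over token sets
        sa, sb = set(a), set(b)
        return len(sa & sb) / float(len(sa | sb))

ex = ToyExtractor()
for name in sorted(ex.features.keys()):
    print(name, ex.features[name]('a b c'.split(), 'b c d'.split()))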
Example #11
def readgraph_leave_one_lecture_out(phrasedir, modelname='svr'):
    lectures = annotation.Lectures

    oslom = OSLOM()

    if modelname == 'svr':
        weighted = True
        undirect = True
    else:
        weighted = False
        undirect = True

    for i, lec in enumerate(lectures):
        path = os.path.join(phrasedir, str(lec))

        for q in ['q1', 'q2']:
            #write the output
            phrasefile = os.path.join(path, "%s.%s.key" % (q, method))
            phrases = fio.LoadList(phrasefile)

            netgraphfile = os.path.join(
                path,
                "%s.%s.%s%s_oslo_files" % (q, method, modelname, net_exe),
                'tp')

            if not fio.IsExist(netgraphfile):  #no communities
                print netgraphfile
                communites = [[x] for x in range(len(phrases))]
            else:
                communites = oslom.readgraph_partitions(netgraphfile)

                #if len(communites) == 1:#break it
                #    communites = [[x] for x in range(len(phrases))]

            name = 'ct.%s.%s' % (modelname, 'default')
            output = os.path.join(
                path, "%s.cluster.kmedoids.sqrt.%s.%s" % (q, name, method))
            write_communite_to_clusters(communites, phrases, output)

            print "%d\t%s\t%d" % (lec, q, len(communites))
Example #12
def getRouge(ref, model):
    #return the Rouge scores given the reference summary and the model summary
    
    #write the files
    fio.SaveList(ref, tmpdir+'ref.txt', '\n')
    fio.SaveList(model, tmpdir+'model.txt', '\n')
    
    retcode = subprocess.call(['./get_rouge'], shell=True)
    if retcode != 0:
        print("Failed!")
        exit(-1)
    else:
        print "Passed!"
    
    row = []
    for scorename in RougeNames:
        filename = tmpdir + "OUT_"+scorename+".csv"
        
        if not fio.IsExist(filename): 
            print filename, " not exist"
            row = row + [0, 0, 0]
            
            continue
        
        lines = fio.ReadFile(filename)
        try:
            #recall, precision, and F-measure sit in columns 1-3 of the CSV
            scorevalues = lines[1].split(',')
            row.append(scorevalues[1].strip())
            row.append(scorevalues[2].strip())
            row.append(scorevalues[3].strip())
        except Exception:
            print filename, scorename, lines
            
    return row
Example #13
def train_on_course(traincourse, name='all'):
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt' % name
    model_dir = '../data/%s/%s/model/%s/' % (course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/' % (traincourse, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/' % (traincourse, system,
                                                       name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    if traincourse == 'IE256':
        lectures = [x for x in range(14, 26) if x != 22]
    else:
        lectures = [x for x in range(3, 27)]

    dict = defaultdict(int)

    train = [x for x in lectures]

    train_filename = os.path.join(feature_cv_dir, 'train.feature.crf')

    model_file = os.path.join(model_dir, '%s.model' % traincourse)

    print train_filename
    print model_file

    crf = CRF(wapiti_home)
    if not fio.IsExist(model_file):
        #if True:
        combine_files(feature_dir, train, train_filename)
        crf.train(train_filename, pattern_file, model_file)
Example #14
def getOracleRougeSplit(oracledir, np, L, metric, outputdir):
    #sheets = range(0,1)
    sheets = range(0,12)
    
    body = []
    
    for i, sheet in enumerate(sheets):
        week = i + 1
            
        #Add a cache to make it faster
        Cache = {}
        cachefile = oracledir + str(week) + '/' + 'cache.json'
        print cachefile
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)
        
        #append the week once so rows line up with the ['week'] + RougeHeader*3 header
        row = [week]
        for type in ['POI', 'MP', 'LP']:
            #read the TA's summary
            reffile = oracledir + str(week) + '/' + type + '.ref.summary'
            lines = fio.ReadFile(reffile)
            ref = [line.strip() for line in lines]
            
            Round = 1
            while True:
                sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) +'.summary'
                if not fio.IsExist(sumfile): break
                Round = Round + 1
            
            Round = Round - 1
            sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) +'.summary'
            
            if fio.IsExist(sumfile):
                import os
                ssfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + ".summary"
                cmd = 'cp ' + sumfile + ' ' + ssfile
                print cmd
                
                os.system(cmd)
                lines = fio.ReadFile(sumfile)
                TmpSum = [line.strip() for line in lines]
                
                cacheKey = getKey(ref, TmpSum)
                if cacheKey in Cache:
                    scores = Cache[cacheKey]
                    print "Hit"
                else:
                    print "Miss", cacheKey
                    print sumfile
                    scores = getRouge(ref, TmpSum)
                    Cache[cacheKey] = scores
                    #exit()
                
                row = row + scores
            else:
                row = row + [0]*len(RougeHeader)
            
        body.append(row)
    
    print body
    print "RougeHeader", len(RougeHeader)
    header = ['week'] + RougeHeader*3
    row = []
    row.append("average")
    print len(header)
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)
    
    fio.WriteMatrix(outputdir + "rouge." + str(np) + '.L' + str(L) + "." + str(metric) + ".txt", body, header)
Example #15
def gather_rouge():

    Allbody = []
    for cid in [
            'IE256',
            'IE256_2016',
            'CS0445',
    ]:

        ilpdir = "../data/%s/" % cid
        baseline_rougefile = os.path.join(ilpdir, 'rouge_np.txt')
        if not fio.IsExist(baseline_rougefile): continue

        basehead, basebody = fio.ReadMatrix(baseline_rougefile, hasHead=True)
        row = [cid, '', 'PhrasSum'
               ] + ['%.3f' % float(x) for x in basebody[-1][1:-3]]
        Allbody.append(row)

        for A in [
                '1',
                '2',
        ]:

            for model in [
                    'optimumComparerLSATasa', 'oracle', 'oracle_selection'
            ]:

                modeldir = os.path.join(ilpdir, 'oracle_annotator_%s' % A)
                model_rouge_file = os.path.join(
                    modeldir, 'rouge_annotator%s_%s.txt' % (A, model))
                head, body = fio.ReadMatrix(model_rouge_file, hasHead=True)

                if model == 'optimumComparerLSATasa':
                    basehead1, basebody1 = fio.ReadMatrix(model_rouge_file,
                                                          hasHead=True)
                elif model == 'oracle':
                    basehead2, basebody2 = fio.ReadMatrix(model_rouge_file,
                                                          hasHead=True)

                row = [cid, 'A%s' % A, model
                       ] + ['%.3f' % float(x) for x in body[-1][1:-3]]

                print cid, model
                print model_rouge_file
                print baseline_rougefile
                #get p values
                from stats_util import get_ttest_pvalues
                pvalues = get_ttest_pvalues(basebody[1:-1], body[1:-1],
                                            range(1,
                                                  len(head) - 3))

                if model == 'optimumComparerLSATasa':
                    k = 3
                    for p in pvalues:
                        if p < 0.05:
                            row[k] = row[k] + '$^*$'
                        k += 1
                elif model == 'oracle':
                    pvalues1 = get_ttest_pvalues(basebody1[1:-1], body[1:-1],
                                                 range(1,
                                                       len(head) - 3))

                    k = 3
                    for p1, p2 in zip(pvalues, pvalues1):
                        if p1 < 0.05 and p2 < 0.05:
                            row[k] = row[k] + '$^{*\dag}$'
                        elif p1 < 0.05:
                            row[k] = row[k] + '$^*$'
                        elif p2 < 0.05:
                            row[k] = row[k] + '$^\dag$'
                        k += 1

                elif model == 'oracle_selection':
                    pvalues1 = get_ttest_pvalues(basebody1[1:-1], body[1:-1],
                                                 range(1,
                                                       len(head) - 3))
                    pvalues2 = get_ttest_pvalues(basebody2[1:-1], body[1:-1],
                                                 range(1,
                                                       len(head) - 3))

                    k = 3
                    for p1, p2, p3 in zip(pvalues, pvalues1, pvalues2):
                        if p1 >= 0.05 and p2 >= 0.05 and p3 >= 0.05:
                            k += 1
                            continue

                        row[k] = row[k] + '$^{'

                        if p1 < 0.05:
                            row[k] = row[k] + '*'
                        if p2 < 0.05:
                            row[k] = row[k] + '\dag'

                        if p3 < 0.05:
                            row[k] = row[k] + '\circ'

                        row[k] = row[k] + '}$'
                        k += 1

                Allbody.append(row)

    output = '../data/rouge_oracle_all_gather.txt'
    fio.Write2Latex(output, Allbody, [''] + head)
Example #16
def Greedy(oracledir, np, L, metric='R1-F'):
    #sheets = range(0,1)
    sheets = range(0,12)
    RIndex = RougeHeader.index(metric)
    assert(RIndex != -1)
    
    for i, sheet in enumerate(sheets):
        week = i + 1
        
        #Add a cache to make it faster
        Cache = {}
        cachefile = oracledir + str(week) + '/' + 'cache.json'
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)
        
        #for type in ['POI']:
        for type in ['POI', 'MP', 'LP']:
            #read the TA's summary
            reffile = oracledir + str(week) + '/' + type + '.ref.summary'
            lines = fio.ReadFile(reffile)
            ref = [line.strip() for line in lines]
            
            #read Phrases
            phrasefile = oracledir + str(week) + '/' + type + '.' + str(np) + '.key'
            lines = fio.ReadFile(phrasefile)
            candidates = [line.strip() for line in lines]
            
            summary = []
            Length = 0
            
            maxSum = []
            maxScore = 0
            Round = 1
            
            Changed = True
            while Changed:
                Changed = False
                for phrase in candidates:
                    WC = len(phrase.split())
                    if Length + WC > L: continue
                    
                    TmpSum = copy.deepcopy(summary)
                    TmpSum.append(phrase)
                    
                    #get Rouge Score
                    cacheKey = getKey(ref, TmpSum)
                    if cacheKey in Cache:
                        scores = Cache[cacheKey]
                        print "Hit"
                    else:
                        scores = getRouge(ref, TmpSum)
                        Cache[cacheKey] = scores
                    
                    s = float(scores[RIndex])
                    #s = scores[RIndex]
                    if s > maxScore:
                        maxSum = TmpSum
                        maxScore = s   #track the best score as a float, not the score list
                        Changed = True
                
                if Changed:
                    #write the results
                    sumfile = oracledir + str(week) + '/' + type + '.' + str(np) + '.L' + str(L) + "." + str(metric) + '.R' + str(Round) +'.summary'
                    fio.SaveList(maxSum, sumfile, '\r\n')
                    
                    summary = maxSum
                    Length = 0
                    for s in maxSum:
                        Length = Length + len(s.split())
                    
                    Round = Round + 1
                    
                    newCandidates = []
                    #remove the candidate from the existing summary
                    for phrase in candidates:
                        if phrase not in maxSum:
                            newCandidates.append(phrase)
                    
                    candidates = newCandidates

        with open(cachefile, 'w') as outfile:
            json.dump(Cache, outfile, indent=2)
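
The while/for structure above is a standard greedy oracle: each round, commit the single candidate phrase whose addition most improves the ROUGE score, subject to the word budget L, and stop once no addition helps. The same skeleton with a toy scoring function (reference word overlap instead of ROUGE):

def greedy_select(candidates, score, L):
    #each round, commit the one candidate whose addition scores highest
    summary, best = [], 0.0
    changed = True
    while changed:
        changed = False
        used = sum(len(p.split()) for p in summary)
        best_sum = None
        for phrase in candidates:
            if phrase in summary:
                continue
            if used + len(phrase.split()) > L:   #respect the word budget
                continue
            s = score(summary + [phrase])
            if s > best:
                best, best_sum, changed = s, summary + [phrase], True
        if changed:
            summary = best_sum
    return summary

ref = set('students were confused about recursion'.split())
score = lambda summ: len(ref & set(' '.join(summ).split()))
print(greedy_select(['confused about recursion', 'the lecture hall'], score, 5))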
Example #17
def GetLexRankScore(datadir, np, outputdir):
    sheets = range(0, maxWeek)
    
    for type in ['q1', 'q2', 'q3', 'q4']:
        for sheet in sheets:
            week = sheet + 1
            
            DID = str(week) + '_' + type
            
            phrases = []
            scores = []
    
            #read Docsent
            path = datadir + str(week)+ '/'
            path = path + type + '/'
            path = path + 'docsent/'
            filename = path + DID + '.docsent'
            #print filename
            if not fio.IsExist(filename): continue
            
            tree = ET.parse(filename)
            root = tree.getroot()
            
            for child in root:
                phrases.append(child.text)
            
            #read feature
            path = datadir + str(week)+ '/'
            path = path + type + '/'
            path = path + 'feature/'
            filename = path + type + '.LexRank.sentfeature'
            
            if fio.IsExist(filename):
                tree = ET.parse(filename)
                root = tree.getroot()
                
                for child in root:
                    feature = child[0]
                    #print feature.tag, feature.attrib, feature.attrib['V']
                    #print child.tag, child.attrib
                    scores.append(feature.attrib['V'])
            else:
                for phrase in phrases:
                    scores.append("0")
                
            #write
            assert(len(phrases) == len(scores))
            
            dict = {}
            for phrase, score in zip(phrases, scores):
                dict[phrase.lower()] = score
            
            output = outputdir + str(week)+ '/' + str(type) + "." + np + ".lexrank.dict"
            fio.NewPath(outputdir + str(week)+ '/')
            fio.SaveDict(dict, output, SortbyValueflag=True)
            
            dict = {}
            for phrase, score in zip(phrases, scores):
                if phrase.lower() in dict:
                    dict[phrase.lower()] = max(score, dict[phrase.lower()])
                else:
                    dict[phrase.lower()] = score
            
            output = outputdir + str(week)+ '/' + str(type) + "." + np + ".lexrankmax.dict"
            fio.SaveDict(dict, output, SortbyValueflag=True)
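
The second dictionary pass keeps the maximum LexRank score per lowercased phrase. One caveat: the scores read from the XML are strings, so the max() above compares lexicographically; a numeric sketch of the intended aggregation:

pairs = [('Phrase A', '0.5'), ('phrase a', '0.7'), ('phrase b', '0.2')]

best = {}
for phrase, score in pairs:
    key = phrase.lower()
    #convert to float so max() is numeric rather than a string comparison
    best[key] = max(float(score), best.get(key, float(score)))

print(best)   #{'phrase a': 0.7, 'phrase b': 0.2}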
Example #18
def getRouge(datadir, maxWeek, output):
    print datadir

    sheets = range(0, maxWeek)

    body = []

    for sheet in sheets:
        week = sheet + 1
        dir = datadir + str(week) + '/'

        for type in ['q1', 'q2']:
            summary_file = dir + type + "." + 'summary'
            print summary_file

            if not fio.IsExist(summary_file):
                print summary_file
                continue

            Cache = {}
            cachefile = os.path.join(datadir, str(week), 'cache.json')
            print cachefile
            if fio.IsExist(cachefile):
                with open(cachefile, 'r') as fin:
                    Cache = json.load(fin)

            #read the TA's summaries
            refs = []
            for i in range(2):
                reffile = os.path.join(datadir, str(week),
                                       type + '.ref.%d' % i)
                if not fio.IsExist(reffile):
                    print reffile
                    continue

                lines = fio.ReadFile(reffile)
                ref = [line.strip() for line in lines]
                refs.append(ref)

            if len(refs) == 0: continue

            lstref = [line for ref in refs for line in ref]   #a week may have only one reference

            lines = fio.ReadFile(summary_file)
            TmpSum = [line.strip() for line in lines]

            cacheKey = OracleExperiment.getKey(lstref, TmpSum)
            if cacheKey in Cache:
                scores = Cache[cacheKey]
                print "Hit"
            else:
                print "Miss"
                print summary_file
                scores = OracleExperiment.getRouge_IE256(refs, TmpSum)
                Cache[cacheKey] = scores

            row = [week]
            row = row + scores

            body.append(row)

            try:
                fio.SaveDict2Json(Cache, cachefile)
            except Exception as e:
                #fio.SaveDict(Cache, cachefile + '.dict')
                print e

    header = ['id'] + RougeHeader
    row = ['ave']
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)

    fio.WriteMatrix(output, body, header)
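
OracleExperiment.getKey is not shown in this listing; a plausible content-hash sketch of the caching idea (a hypothetical implementation, not the repo's):

import hashlib
import json

def get_key(ref, summary):
    #identical (reference, summary) pairs always serialize to the same
    #JSON string, hence hash to the same cache key
    payload = json.dumps([ref, summary], sort_keys=True)
    return hashlib.md5(payload.encode('utf-8')).hexdigest()

Cache = {}
key = get_key(['ref line'], ['summary line'])
if key in Cache:
    scores = Cache[key]            #"Hit"
else:
    scores = [0.5, 0.4, 0.44]      #stand-in for a real ROUGE call; "Miss"
    Cache[key] = scores
print(scores)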
Example #19
def train_leave_one_lecture_out_svm(model_dir, name='simlearn_cv'):
    #     model_dir = '../data/IE256/%s/model/%s/'%(system, name)
    #     fio.NewPath(model_dir)
    #
    #     outputdir = '../data/IE256/%s/extraction/%s_output/'%(system, name)
    #     fio.NewPath(outputdir)

    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    #for k in range(len(allfeatures)+1):
    k = len(allfeatures)
    if True:

        #for k in range(len(allfeatures)):
        #if allfeatures[k] != 'optimumComparerLSATasa': continue

        if k == len(allfeatures):  #use all features
            features = allfeatures
        else:
            features = [allfeatures[k]]
            #features = allfeatures[0:k] + allfeatures[k+1:]

        name = '_'.join(features)

        lectures = annotation.Lectures

        dict = defaultdict(int)

        MSE = []
        for i, lec in enumerate(lectures):
            train = [x for x in lectures if x != lec]
            test = [lec]

            print train
            print test

            model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))

            if fio.IsExist(model_file):
                with open(model_file, 'rb') as handle:
                    clf = pickle.load(handle)
            else:
                train_X, train_Y = combine_files(train, features)
                clf = svm.SVC()
                clf.fit(train_X, train_Y)

                with open(model_file, 'wb') as handle:
                    pickle.dump(clf, handle)

            for q in ['q1', 'q2']:
                test_X, test_Y = combine_files(test, features, prompts=[q])
                predict_Y = clf.predict(test_X)

                prf = precision_recall_fscore_support(test_Y,
                                                      predict_Y,
                                                      average='weighted')

                accuracy = accuracy_score(test_Y, predict_Y)

                MSE.append([lec, q, accuracy] + [prf[0], prf[1], prf[2]])

        output = '../data/%s/simlearning.cv.svm.%s.txt' % (course, name)

        fio.WriteMatrix(output,
                        MSE,
                        header=[
                            'lec', 'prompt', 'accuracy', 'precision', 'recall',
                            'f-score'
                        ])
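
The evaluation step leans on scikit-learn's weighted averaging; a self-contained check of the same metric calls on toy labels:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

test_Y = [1, 0, 1, 1]
predict_Y = [1, 0, 0, 1]

#weighted averaging accounts for the imbalance between the label classes
prf = precision_recall_fscore_support(test_Y, predict_Y, average='weighted')
print(accuracy_score(test_Y, predict_Y), prf[0], prf[1], prf[2])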
Example #20
def extractPhraseFromAnnotation(phrasedir, annotator, summarydir=None):
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/',
            '.json',
            anotators=annotator,
            lectures=annotation.Lectures):
        print doc

        #load task
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        #Add a cache to make it faster
        Cache = {}
        cachefile = phrasedir + str(lec) + '/' + 'cache.json'
        if fio.IsExist(cachefile):
            with open(cachefile, 'r') as fin:
                Cache = json.load(fin)

        for prompt in ['q1', 'q2']:
            filename = path + prompt + '.' + method + '.key'
            cluster_output = path + prompt + '.cluster.kmedoids.sqrt.oracle.%s' % method

            if summarydir:
                fio.NewPath(os.path.join(summarydir, str(lec)))
                summary_file = os.path.join(summarydir, str(lec),
                                            '%s.summary' % prompt)

            body = []

            if summarydir:
                summaries = []

            phrase_summary_dict = task.get_phrase_summary_textdict(prompt)
            extracted_phrases = []
            phrase_annotation = task.get_phrase_annotation(prompt)
            for rank in sorted(phrase_annotation):
                rank_phrases = []
                phrases = phrase_annotation[rank]
                for phrasedict in phrases:
                    phrase = phrasedict['phrase'].lower()
                    extracted_phrases.append(phrase)
                    rank_phrases.append(phrase)
                    row = [phrase, rank]
                    body.append(row)

                if summarydir:
                    rank_summary = phrase_summary_dict[rank]
                    max_summary = get_max_phrase_by_ROUGE(
                        rank_summary, rank_phrases, Cache)
                    print max_summary

                    summaries.append(max_summary)

            fio.SaveList(extracted_phrases, filename)

            fio.WriteMatrix(cluster_output, body, header=None)

            if summarydir:
                fio.SaveList(summaries, summary_file)

            with open(cachefile, 'w') as outfile:
                json.dump(Cache, outfile, indent=2)
Example #21
def gather_rouge(output):

    courses = ['IE256', 'IE256_2016', 'CS0445']

    rouges = [
        ('LexRank', 'QPS_NP', 'rouge_LexRank'),
        ('PhraseSum', 'QPS_NP', 'rouge_crf_optimumComparerLSATasa'),
        ('SequenceSum', 'QPS_combine_coling',
         'rouge_crf_optimumComparerLSATasa'),
        ('SimSum', 'QPS_combine_coling', 'rouge_crf_svm'),
        ('CDSum', 'QPS_combine_coling', 'rouge_crf_ct.svm.default'),
    ]

    baseline1 = ('PhraseSum', 'QPS_NP', 'rouge_crf_optimumComparerLSATasa')
    baseline2 = ('SequenceSum', 'QPS_combine_coling',
                 'rouge_crf_optimumComparerLSATasa')

    Header = [
        'course',
        'name',
        'R1-R',
        'R1-P',
        'R1-F',
        'R2-R',
        'R2-P',
        'R2-F',
        'RSU4-R',
        'RSU4-P',
        'RSU4-F',
    ]

    ROUGE_Head = [
        'id', 'R1-R', 'R1-P', 'R1-F', 'R2-R', 'R2-P', 'R2-F', 'RSU4-R',
        'RSU4-P', 'RSU4-F'
    ]

    ROUGE_index = [
        ROUGE_Head.index(name) for name in ROUGE_Head if name != 'id'
    ]

    xbody = []
    for course in courses:
        for name, model, method in rouges:
            datadir = '../data/%s/' % course

            filename = os.path.join(datadir, model, "%s.txt" % method)
            if not fio.IsExist(filename): continue

            baseline1_name = os.path.join(datadir, baseline1[1],
                                          "%s.txt" % baseline1[2])
            baseline2_name = os.path.join(datadir, baseline2[1],
                                          "%s.txt" % baseline2[2])

            if name in ['LexRank', 'SequenceSum', 'SimSum', 'CDSum']:
                pvalues1 = get_pvalues(filename, baseline1_name, ROUGE_index)
            else:
                pvalues1 = [1] * len(ROUGE_index)

            if name in ['SimSum', 'CDSum']:
                pvalues2 = get_pvalues(filename, baseline2_name, ROUGE_index)
            else:
                pvalues2 = [1] * len(ROUGE_index)

            head, body = fio.ReadMatrix(filename, hasHead=True)

            row = [course, name]
            row += [
                '%.3f%s%s' % (float(x), '*' if pvalues1[i] < 0.05 else '',
                              '+' if pvalues2[i] < 0.05 else '')
                for i, x in enumerate(body[-1][1:])
            ]

            xbody.append(row)

    fio.WriteMatrix(output, xbody, Header)
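
The row formatting tags each score with '*' when it beats baseline1 (PhraseSum) and '+' when it beats baseline2 (SequenceSum) at p < 0.05. That formatting in isolation, with made-up numbers:

scores = [0.412, 0.388]
pvalues1 = [0.03, 0.20]   #vs. baseline1
pvalues2 = [0.01, 0.04]   #vs. baseline2

row = ['%.3f%s%s' % (float(x), '*' if pvalues1[i] < 0.05 else '',
                     '+' if pvalues2[i] < 0.05 else '')
       for i, x in enumerate(scores)]
print(row)   #['0.412*+', '0.388+']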
Example #22
def getRouge(datadir, maxWeek, output):
    sheets = range(0, maxWeek)

    body = []
    allbody = []

    #Krange = range(1, 25)
    Krange = [gK]

    for sheet in sheets:
        week = sheet + 1
        dir = datadir + str(week) + '/'

        for type in ['q1', 'q2']:

            maxS = 0
            maxK = -1
            maxScore = []

            Cache = {}
            cachefile = os.path.join(datadir, str(week), 'cache.json')
            print cachefile
            if fio.IsExist(cachefile):
                with open(cachefile, 'r') as fin:
                    Cache = json.load(fin)

            allrow = [week]

            #Krange = [np.random.randint(1, 25)]

            for K in Krange:

                summary_file = dir + type + '.%d.summary' % K

                print summary_file

                if not fio.IsExist(summary_file):
                    print summary_file
                    continue

                #read the TA's summaries
                refs = []
                for i in range(2):
                    reffile = os.path.join(datadir, str(week),
                                           type + '.ref.%d' % i)
                    if not fio.IsExist(reffile):
                        print reffile
                        continue

                    lines = fio.ReadFile(reffile)
                    ref = [line.strip() for line in lines]
                    refs.append(ref)

                if len(refs) == 0: continue

                lstref = [line for ref in refs for line in ref]   #a week may have only one reference

                lines = fio.ReadFile(summary_file)
                TmpSum = [line.strip() for line in lines]

                cacheKey = OracleExperiment.getKey(lstref, TmpSum)
                if cacheKey in Cache:
                    scores = Cache[cacheKey]
                    print "Hit"
                else:
                    print "Miss"
                    print summary_file
                    scores = OracleExperiment.getRouge_IE256(refs, TmpSum)
                    Cache[cacheKey] = scores

                s = float(scores[RIndex])

                allrow.append(s)

                if s >= maxS:
                    maxS = s
                    maxScore = scores
                    maxK = K

            if maxK == -1: continue

            row = [week]
            row = row + maxScore + [maxK]

            body.append(row)

            allrow.append(maxK)

            allbody.append(allrow)

            try:
                fio.SaveDict2Json(Cache, cachefile)
            except Exception:
                #fio.SaveDict(Cache, cachefile + '.dict')
                pass

    header = ['id'] + RougeHeader + ['K']   #each row ends with the best K
    row = ['ave']
    for i in range(1, len(header)):
        scores = [float(xx[i]) for xx in body]
        row.append(numpy.mean(scores))
    body.append(row)

    fio.WriteMatrix(output, body, header)

    fio.WriteMatrix(output + '.all', allbody, ['week'] + Krange + ['maxK'])
Example #23
def train_leave_one_lecture_out(name='cv'):
    wapiti_home = global_params.wapiti_dir

    pattern_file = '../data/%s.pattern.txt' % name
    model_dir = '../data/%s/%s/model/%s/' % (course, system, name)
    fio.NewPath(model_dir)

    feature_dir = '../data/%s/%s/extraction/' % (course, system)
    feature_cv_dir = '../data/%s/%s/extraction/%s/' % (course, system, name)
    fio.NewPath(feature_cv_dir)

    outputdir = '../data/%s/%s/extraction/%s_output/' % (course, system, name)
    fio.NewPath(outputdir)

    lectures = annotation.Lectures

    dict = defaultdict(int)

    for i, lec in enumerate(lectures):
        train = [x for x in lectures if x != lec]
        test = [lec]

        train_filename = os.path.join(feature_cv_dir,
                                      'train_%d.feature.crf' % i)

        model_file = os.path.join(model_dir, '%d.model' % i)

        print train_filename
        print model_file

        crf = CRF(wapiti_home)
        if not fio.IsExist(model_file):
            #if True:
            combine_files(feature_dir, train, train_filename)
            crf.train(train_filename, pattern_file, model_file)

        for q in ['q1', 'q2']:

            test_filename = os.path.join(feature_cv_dir,
                                         'test_%d_%s.feature.crf' % (i, q))
            output_file = os.path.join(outputdir, 'test_%d_%s.out' % (i, q))

            dict['test_%d_%s' % (i, q)] = 1

            if empty == 'Y':
                test_filename_old = test_filename.replace('_Y', '_N')
                cmd = 'cp %s %s' % (test_filename_old, test_filename)
                os.system(cmd)
            else:

                if method == 'combine':
                    test_filename_old = test_filename.replace(
                        '_combine', '_A1')
                    cmd = 'cp %s %s' % (test_filename_old, test_filename)
                    os.system(cmd)
                else:
                    combine_files(feature_dir,
                                  test,
                                  test_filename,
                                  prompts=[q])

            crf.predict(test_filename, model_file, output_file)

        if debug: break

    file_util.save_dict2json(dict, class_index_dict_file)
Example #24
def getShallowSummary(excelfile, folder, clusterdir, K=30, method=None, similarity=None, ratio=None, lex='lexrank'):
    #K caps the number of phrases per summary (the word-count cutoff below is commented out)
    sheets = range(0,maxWeek)
    
    for i, sheet in enumerate(sheets):
        week = i + 1
        
        for type in ['q1', 'q2', 'q3', 'q4']:
            
            path = folder + str(week)+ '/'
            fio.NewPath(path)
            filename = path + type + '.%d.summary'%ratio
            
            #produce the cluster file on the fly
            phrasefile = os.path.join(clusterdir, str(week), type + '.' + method + '.key')
            if not fio.IsExist(phrasefile): continue
            
            print excelfile, sheet, type
            
            cluster_output = clusterdir + str(week) +'/' + type + ".cluster.kmedoids." + str(ratio) + "." +similarity + '.' + method
            print cluster_output
            
            weightfile = clusterdir + str(week)+ '/' + type + '.' + method + '.' + similarity
            print weightfile
            
            if not fio.IsExist(cluster_output):
            #if True:
                print "clustering"
                phraseClusteringKmedoid.getPhraseClusterPhrase(phrasefile, weightfile, cluster_output, ratio, method=method)
            if not fio.IsExist(cluster_output): continue
            body = fio.ReadMatrix(cluster_output, False)
            
            NPCandidates = fio.ReadFile(phrasefile)
            
            lexfile = clusterdir + str(week)+ '/' + str(type) + "." + method + "."+lex+".dict"
            lexdict = fio.LoadDict(lexfile, 'float')
            
            NPs = [row[0] for row in body]
            clusterids = [row[1] for row in body]
            
            #assert(NPCandidates == NPs)
            if NPCandidates != NPs: 
                print NPCandidates
                print NPs
            
            cluster = {}
            for row in body:
                cluster[row[0]] = int(row[1])
            
            Summary = []
            
            #sort the clusters according to the number of response
            keys = postProcess.RankClusterNoSource(NPs, lexdict, clusterids)
            
            total_word = 0
            word_count = 0
            for key in keys:
                #phrase = NPs[key]
                phrase = postProcess.getTopRankPhraseNoSource(NPs, clusterids, int(key), lexdict)
                if phrase in Summary: continue
                
                word_count = len(phrase.split())
                total_word = total_word + word_count
                #if total_word <= K:
                if len(Summary) + 1 <= K:
                    Summary.append(phrase)
                    
            fio.SaveList(Summary, filename)
Example #25
def plot_rouge_by_time():

    for metric in [
            'R1-R', 'R1-P', 'R1-F', 'R2-R', 'R2-P', 'R2-F', 'RSU4-R', 'RSU4-P',
            'RSU4-F'
    ]:
        for prompt in ['q1', 'q2']:
            courses = ['IE256', 'IE256_2016', 'CS0445']

            rouges = [  #('LexRank', 'QPS_NP', 'rouge_LexRank'),
                ('PhraseSum', 'QPS_NP', 'rouge_crf_optimumComparerLSATasa'),
                ('SequenceSum', 'QPS_combine_coling',
                 'rouge_crf_optimumComparerLSATasa'),
                #('SimSum', 'QPS_combine_coling', 'rouge_crf_svm'),
                #('CDSum', 'QPS_combine_coling', 'rouge_crf_ct.svm.default'),
            ]

            baseline1 = ('PhraseSum', 'QPS_NP',
                         'rouge_crf_optimumComparerLSATasa')
            baseline2 = ('SequenceSum', 'QPS_combine_coling',
                         'rouge_crf_optimumComparerLSATasa')

            Header = [
                'course',
                'name',
                'R1-R',
                'R1-P',
                'R1-F',
                'R2-R',
                'R2-P',
                'R2-F',
                'RSU4-R',
                'RSU4-P',
                'RSU4-F',
            ]

            ROUGE_Head = [
                'id', 'R1-R', 'R1-P', 'R1-F', 'R2-R', 'R2-P', 'R2-F', 'RSU4-R',
                'RSU4-P', 'RSU4-F'
            ]

            ROUGE_index = [
                ROUGE_Head.index(name) for name in ROUGE_Head if name != 'id'
            ]

            metric_index = ROUGE_Head.index(metric)

            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax.set_xlabel('week', fontsize=12)
            ax.set_ylabel('Rouge', fontsize=12)

            plt.title('%s %s' % (metric, annotation.prompt_name[prompt]))
            plt.grid(True)

            colors = ['#d8b365', "#f5f5f5", "#5ab4ac"]

            for cid, course in enumerate(courses):
                #c = colors[cid]
                for name, model, method in rouges:
                    datadir = '../data/%s/' % course

                    filename = os.path.join(datadir, model,
                                            "%s_%s.txt" % (method, prompt))
                    if not fio.IsExist(filename): continue

                    X, Y = get_X_Y(filename, metric_index)

                    #plt.plot(X, Y, label=metric, marker='D', color="b", alpha=0.6, )
                    plt.plot(X,
                             Y,
                             label='%s_%s' % (course, name),
                             alpha=0.6,
                             linewidth=2)

            #legend = plt.legend(loc='right center', shadow=True, fontsize='x-large')
            legend = plt.legend(loc='upper right',
                                shadow=True,
                                fontsize='small')

            pp = PdfPages('../data/%s_%s.pdf' % (metric, prompt))
            plt.savefig(pp, format='pdf')
            pp.close()
            plt.close(fig)   #release the figure; one is created per metric/prompt pair
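
This function assumes matplotlib imports the snippet does not show (plt and PdfPages); a minimal runnable version of the same save-to-PDF flow:

import matplotlib
matplotlib.use('Agg')   #headless backend, so this also runs on a server
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

fig = plt.figure()
plt.plot([1, 2, 3], [0.30, 0.32, 0.31], label='demo', alpha=0.6, linewidth=2)
plt.legend(loc='upper right', shadow=True, fontsize='small')

pp = PdfPages('/tmp/demo.pdf')
plt.savefig(pp, format='pdf')
pp.close()
plt.close(fig)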