Example #1
def gather_performance(output):
    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    allbody = []
    for k in range(len(allfeatures) + 1):
        #features = allfeatures#['WordEmbedding']

        if k == len(allfeatures):  #use all features
            features = allfeatures
        else:
            features = [allfeatures[k]]
            #features = allfeatures[0:k] + allfeatures[k+1:]

        name = '_'.join(features)

        resultfile = '../data/%s/simlearning.cv.svm.%s.txt' % (course, name)

        head, body = fio.ReadMatrix(resultfile, hasHead=True)

        #get the average
        allhead = ['name'] + head[2:]
        average = [name]
        for i in range(2, len(head)):  #start from the third one
            values = [float(row[i]) for row in body]
            average.append(np.mean(values))

        allbody.append(average)

    fio.WriteMatrix(output, allbody, allhead)
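A minimal usage sketch (hypothetical output path; `course`, `fio`, `np`, and `Similarity` are module-level names the snippet assumes):

# Hypothetical invocation: average the per-feature cross-validation
# results into one summary matrix under ../data/<course>/.
gather_performance('../data/%s/simlearning.cv.svm.average.txt' % course)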
Example #2
def get_X_Y(input, index):
    head, body = fio.ReadMatrix(input, hasHead=True)

    X = [int(row[0]) for row in body[:-1]]  #week
    Y = [float(row[index]) for row in body[:-1]]  #rouge

    return X, Y
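A short usage sketch, assuming a hypothetical result file whose column 3 holds a ROUGE score and whose last row is a precomputed average (which get_X_Y skips via body[:-1]):

import matplotlib.pyplot as plt

# Hypothetical file and column index.
X, Y = get_X_Y('../data/IE256/rouge.sentence.txt', 3)
plt.plot(X, Y, marker='o')
plt.xlabel('week')
plt.ylabel('ROUGE')
plt.show()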
Example #3
def getPhraseClusterAll(sennafile,
                        weightfile,
                        output,
                        ratio=None,
                        MalformedFlilter=False,
                        source=None,
                        np=None):
    NPCandidates, sources = getNPs(sennafile,
                                   MalformedFlilter,
                                   source=source,
                                   np=np)

    if len(NPCandidates) == 0: return

    NPs, matrix = fio.ReadMatrix(weightfile, hasHead=True)

    #change the similarity to distance
    matrix = Similarity2Distance(matrix)

    index = {}
    for i, NP in enumerate(NPs):
        index[NP] = i

    newMatrix = []

    for NP1 in NPCandidates:
        assert (NP1 in index)
        i = index[NP1]

        row = []
        for NP2 in NPCandidates:
            if NP2 not in index:
                print(NP2, weightfile, np)  #debug: the lookup below raises KeyError for a missing NP
            j = index[NP2]
            row.append(matrix[i][j])

        newMatrix.append(row)

    V = len(NPCandidates)
    if ratio == "sqrt":
        K = int(math.sqrt(V))
    elif float(ratio) > 1:
        K = int(ratio)
    else:
        K = int(ratio * V)

    if K < 1: K = 1

    clusterid = ClusterWrapper.KMedoidCluster(newMatrix, K)

    body = []
    for NP, id in zip(NPCandidates, clusterid):
        row = []
        row.append(NP)
        row.append(id)
        body.append(row)

    fio.WriteMatrix(output, body, header=None)
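Examples #3 to #5 convert the similarity matrix to a distance matrix before k-medoids clustering, but `Similarity2Distance` itself is not shown. A minimal sketch consistent with the inline conversion in Example #5 (distance = 1 - similarity, with "NaN" entries mapped to 0):

def Similarity2Distance(matrix):
    # Entries are strings read by fio.ReadMatrix; "NaN" means the
    # similarity is undefined and is treated as distance 0 here,
    # mirroring the inline loop in Example #5.
    return [[1 - float(col) if col != "NaN" else 0 for col in row]
            for row in matrix]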
Example #4
def getPhraseClusterPhrase(phrasefile,
                           weightfile,
                           output,
                           ratio=None,
                           method=None):
    NPCandidates = fio.ReadFile(phrasefile)
    if len(NPCandidates) == 0: return

    NPs, matrix = fio.ReadMatrix(weightfile, hasHead=True)

    #change the similarity to distance
    matrix = Similarity2Distance(matrix)

    index = {}
    for i, NP in enumerate(NPs):
        index[NP] = i

    newMatrix = []

    for NP1 in NPCandidates:
        if NP1 not in index: continue

        i = index[NP1]

        row = []
        for NP2 in NPCandidates:
            if NP2 not in index:
                print(NP2, weightfile, method)
                continue

            j = index[NP2]
            row.append(matrix[i][j])

        newMatrix.append(row)

    V = len(NPCandidates)
    if ratio == "sqrt":
        K = int(math.sqrt(V))
    elif float(ratio) >= 1:
        K = int(ratio)
    else:
        K = int(ratio * V)

    if K < 1: K = 1

    K = min(K, V)

    clusterid = ClusterWrapper.KMedoidCluster(newMatrix, K)

    body = []
    for NP, id in zip(NPCandidates, clusterid):
        row = []
        row.append(NP)
        row.append(id)
        body.append(row)

    fio.WriteMatrix(output, body, header=None)
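The ratio-to-K mapping is duplicated across Examples #3 and #4; a small helper (hypothetical name `choose_K`) could centralize it:

import math

def choose_K(ratio, V):
    # None or "sqrt" -> sqrt(V) clusters; a value >= 1 -> an absolute
    # count; a fraction below 1 -> that fraction of V. Clamp to [1, V].
    if ratio is None or ratio == "sqrt":
        K = int(math.sqrt(V))
    elif float(ratio) >= 1:
        K = int(float(ratio))
    else:
        K = int(float(ratio) * V)
    return max(1, min(K, V))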
Example #5
def getPhraseCluster(phrasedir, method='lexicalOverlapComparer', ratio=None):
    sheets = range(0, 12)

    for sheet in sheets:
        week = sheet + 1
        for type in ['POI', 'MP', 'LP']:
            weightfilename = phrasedir + str(week) + '/' + type + '.' + method
            print(weightfilename)

            NPs, matrix = fio.ReadMatrix(weightfilename, hasHead=True)

            #change the similarity to distance
            for i, row in enumerate(matrix):
                for j, col in enumerate(row):
                    matrix[i][j] = 1 - float(col) if col != "NaN" else 0

            V = len(NPs)
            if ratio is None:
                K = int(math.sqrt(V))
            else:
                K = int(ratio * V)

            K = 10  #hard-coded cluster count; overrides the ratio-based K above
            clusterid = ClusterWrapper.KMedoidCluster(matrix, K)

            #             sorted_lists = sorted(zip(NPs, clusterid), key=lambda x: x[1])
            #             NPs, clusterid = [[x[i] for x in sorted_lists] for i in range(2)]

            counts = defaultdict(int)
            for id in clusterid:
                counts[id] += 1

            body = []
            for NP, id in zip(NPs, clusterid):
                row = []
                row.append(NP)
                row.append(id)
                #row.append(counts[id])

                body.append(row)

            if ratio is None:
                file = phrasedir + '/' + str(week) + '/' + type + ".cluster.kmedoids.sqrt." + method
            else:
                file = phrasedir + '/' + str(week) + '/' + type + ".cluster.kmedoids." + str(ratio) + "." + method
            fio.WriteMatrix(file, body, header=None)
Example #6
def writegraph_leave_one_lecture_out_lsa(model_dir,
                                         phrasedir,
                                         modelname='lsa'):
    lectures = annotation.Lectures

    for lec in lectures:
        test = [lec]

        path = os.path.join(phrasedir, str(lec))

        for q in ['q1', 'q2']:
            #write the output
            phrasefile = os.path.join(path, "%s.%s.key" % (q, method))
            phrases = fio.LoadList(phrasefile)

            if modelname == 'lsa':
                similarities_results = os.path.join(
                    path, "%s.%s.optimumComparerLSATasa" % (q, method))
            elif modelname == 'svm':
                similarities_results = os.path.join(path,
                                                    "%s.%s.svm" % (q, method))

            simhead, simbody = fio.ReadMatrix(similarities_results,
                                              hasHead=True)

            assert (len(simhead) == len(phrases))

            body = []
            for i, p1 in enumerate(phrases):
                for j, p2 in enumerate(phrases):
                    if j <= i:
                        continue  #undirected graph: keep each pair once

                    score = simbody[i][j]

                    score = float(score) if score != 'NaN' else 0.0

                    #if score == 0.0: score = 0.000001
                    #if score < 0.5: continue
                    if score == 0.0: continue

                    #row = [i, j, '%f'%score]
                    row = [i, j]

                    body.append(row)

            output = os.path.join(
                path, "%s.%s.%s%s" % (q, method, modelname, net_exe))
            fio.WriteMatrix(output, body)
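The rows written here form an unweighted edge list over phrase indices (undirected, since only j > i pairs are kept). A sketch of loading one back for inspection with networkx, assuming fio.WriteMatrix wrote one whitespace-separated pair per line:

import networkx as nx

def load_phrase_graph(path):
    # Read "i j" index pairs and rebuild the undirected phrase graph.
    G = nx.Graph()
    with open(path) as fin:
        for line in fin:
            parts = line.split()
            if len(parts) >= 2:
                G.add_edge(int(parts[0]), int(parts[1]))
    return G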
Example #7
def gather_rouge(output):
    datadir = '../data/%s/' % course

    #output = '../data/IE256/result.rouge.txt'

    models = [
        'QPS_NP',
        #'QPS_A1_N', 'QPS_A2_N', 'QPS_union', 'QPS_intersect',
        'QPS_combine'
    ]
    methods = [
        'rouge_crf_optimumComparerLSATasa',
        'rouge_crf_ct.svm.default',
        #'rouge_crf_svm',
        #'rouge_crf_svr',
        #'rouge_crf_ct.svr.default',
    ]

    Header = [
        'method',
        'model',
        'R1-R',
        'R1-P',
        'R1-F',
        'R2-R',
        'R2-P',
        'R2-F',
        'RSU4-R',
        'RSU4-P',
        'RSU4-F',
    ]

    xbody = []
    for method in methods:
        for model in models:

            filename = os.path.join(datadir, model, "%s.txt" % method)

            if not fio.IsExist(filename): continue

            head, body = fio.ReadMatrix(filename, hasHead=True)

            row = [method, model]
            row += body[-1][1:]

            xbody.append(row)

    fio.WriteMatrix(output, xbody, Header)
Example #8
def split_rouge(filename, prefix, N=2):
    head, body = fio.ReadMatrix(filename, hasHead=True)

    newbodies = [[] for i in range(N)]

    for i, row in enumerate(body[:-1]):
        newbodies[i % N].append(row)

    #compute the new average
    for k in range(len(newbodies)):
        row = ['ave']
        for i in range(1, len(head)):
            scores = [float(xx[i]) for xx in newbodies[k]]
            row.append(numpy.mean(scores))
        newbodies[k].append(row)

    for i, newbody in enumerate(newbodies):
        fio.WriteMatrix('%s_q%d.txt' % (prefix, i + 1), newbody, head)
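A usage sketch: with N=2 the data rows alternate between two question types, so split_rouge deals them into per-question files and recomputes each file's 'ave' row (hypothetical file names):

# Hypothetical input: rouge_all.txt ends with an average row, which
# split_rouge drops before splitting and then recomputes per file.
split_rouge('rouge_all.txt', 'rouge', N=2)
# -> writes rouge_q1.txt and rouge_q2.txt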
Example #9
    def __init__(self, prefix=""):
        self.features = {
            'optimumComparerLSATasa': self.LSA,
            'LexicalOverlap': self.LexicalOverlap,
            'optimumComparerWNLin': self.LIN,
            'BLEU': self.BLEU,
            'ROUGE': self.ROUGE,
            'Cosine': self.Cosine,
            'WordEmbedding': self.WordEmbedding,
            #'WMD': self.WMD,
        }

        self.prefix = prefix

        self.Cache = {}
        self.cachefile = prefix + 'cache.json'  #prefix is a filename prefix, not a directory
        print(self.cachefile)
        if fio.IsExist(self.cachefile):
            with open(self.cachefile, 'r') as fin:
                self.Cache = json.load(fin)

        if self.prefix != '':
            self.matrixdict = {}
            for sim in [
                    'optimumComparerLSATasa', 'LexicalOverlap',
                    'optimumComparerWNLin', 'BLEU'
            ]:
                self.matrixdict[sim] = {}

                filename = self.prefix + sim

                phrases, matrix = fio.ReadMatrix(filename, hasHead=True)

                index = {}
                for i, p in enumerate(phrases):
                    index[p] = i

                self.matrixdict[sim]['index'] = index
                self.matrixdict[sim]['matrix'] = matrix

        self.word2vec = fio.LoadDictJson(global_params.word2vec_model)
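__init__ loads the JSON cache but this snippet never saves it; a plausible counterpart (hypothetical method name save_cache, assuming self.Cache stays JSON-serializable):

    def save_cache(self):
        # Persist the similarity cache to the same file __init__ reads.
        with open(self.cachefile, 'w') as fout:
            json.dump(self.Cache, fout)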
Example #10
def getShallowSummary(excelfile,
                      folder,
                      clusterdir,
                      K=30,
                      method=None,
                      similarity=None,
                      ratio=None,
                      lex='lexrank'):
    #K is the summary size limit (the check below caps the number of phrases)
    sheets = range(0, maxWeek)
    
    for i, sheet in enumerate(sheets):
        week = i + 1
        
        for type in ['q1', 'q2', 'q3', 'q4']:
            
            path = folder + str(week) + '/'
            fio.NewPath(path)
            filename = path + type + '.%d.summary' % ratio
            
            #produce the cluster file on the fly
            phrasefile = os.path.join(clusterdir, str(week), type + '.' + method + '.key')
            if not fio.IsExist(phrasefile): continue
            
            print(excelfile, sheet, type)
            
            cluster_output = clusterdir + str(week) + '/' + type + ".cluster.kmedoids." + str(ratio) + "." + similarity + '.' + method
            print(cluster_output)

            weightfile = clusterdir + str(week) + '/' + type + '.' + method + '.' + similarity
            print(weightfile)
            
            if not fio.IsExist(cluster_output):
            #if True:
                print "clustering"
                phraseClusteringKmedoid.getPhraseClusterPhrase(phrasefile, weightfile, cluster_output, ratio, method=method)
            if not fio.IsExist(cluster_output): continue
            body = fio.ReadMatrix(cluster_output, False)
            
            NPCandidates = fio.ReadFile(phrasefile)
            
            lexfile = clusterdir + str(week) + '/' + str(type) + "." + method + "." + lex + ".dict"
            lexdict = fio.LoadDict(lexfile, 'float')
            
            NPs = [row[0] for row in body]
            clusterids = [row[1] for row in body]
            
            #assert(NPCandidates == NPs)
            if NPCandidates != NPs:
                print(NPCandidates)
                print(NPs)
            
            cluster = {}
            for row in body:
                cluster[row[0]] = int(row[1])
            
            Summary = []
            
            #sort the clusters according to the number of response
            keys = postProcess.RankClusterNoSource(NPs, lexdict, clusterids)
            
            total_word = 0
            word_count = 0
            for key in keys:
                #phrase = NPs[key]
                phrase = postProcess.getTopRankPhraseNoSource(NPs, clusterids, int(key), lexdict)
                if phrase in Summary: continue
                
                word_count = len(phrase.split())
                total_word = total_word + word_count
                #if total_word <= K:
                if len(Summary) + 1 <= K:
                    Summary.append(phrase)
                    
            fio.SaveList(Summary, filename)
Example #11
def gather_rouge():

    Allbody = []
    for cid in [
            'IE256',
            'IE256_2016',
            'CS0445',
    ]:

        ilpdir = "../data/%s/" % cid
        baseline_rougefile = os.path.join(ilpdir, 'rouge_np.txt')
        if not fio.IsExist(baseline_rougefile): continue

        basehead, basebody = fio.ReadMatrix(baseline_rougefile, hasHead=True)
        row = [cid, '', 'PhraseSum'] + ['%.3f' % float(x) for x in basebody[-1][1:-3]]
        Allbody.append(row)

        for A in [
                '1',
                '2',
        ]:

            for model in [
                    'optimumComparerLSATasa', 'oracle', 'oracle_selection'
            ]:

                modeldir = os.path.join(ilpdir, 'oracle_annotator_%s' % A)
                model_rouge_file = os.path.join(
                    modeldir, 'rouge_annotator%s_%s.txt' % (A, model))
                head, body = fio.ReadMatrix(model_rouge_file, hasHead=True)

                if model == 'optimumComparerLSATasa':
                    basehead1, basebody1 = fio.ReadMatrix(model_rouge_file,
                                                          hasHead=True)
                elif model == 'oracle':
                    basehead2, basebody2 = fio.ReadMatrix(model_rouge_file,
                                                          hasHead=True)

                row = [cid, 'A%s' % A, model] + ['%.3f' % float(x) for x in body[-1][1:-3]]

                print(cid, model)
                print(model_rouge_file)
                print(baseline_rougefile)
                #get p values
                from stats_util import get_ttest_pvalues
                pvalues = get_ttest_pvalues(basebody[1:-1], body[1:-1],
                                            range(1, len(head) - 3))

                if model == 'optimumComparerLSATasa':
                    k = 3
                    for p in pvalues:
                        if p < 0.05:
                            row[k] = row[k] + '$^*$'
                        k += 1
                elif model == 'oracle':
                    pvalues1 = get_ttest_pvalues(basebody1[1:-1], body[1:-1],
                                                 range(1, len(head) - 3))

                    k = 3
                    for p1, p2 in zip(pvalues, pvalues1):
                        if p1 < 0.05 and p2 < 0.05:
                            row[k] = row[k] + r'$^{*\dag}$'
                        elif p1 < 0.05:
                            row[k] = row[k] + '$^*$'
                        elif p2 < 0.05:
                            row[k] = row[k] + r'$^\dag$'
                        k += 1

                elif model == 'oracle_selection':
                    pvalues1 = get_ttest_pvalues(basebody1[1:-1], body[1:-1],
                                                 range(1, len(head) - 3))
                    pvalues2 = get_ttest_pvalues(basebody2[1:-1], body[1:-1],
                                                 range(1, len(head) - 3))

                    k = 3
                    for p1, p2, p3 in zip(pvalues, pvalues1, pvalues2):
                        if p1 >= 0.05 and p2 >= 0.05 and p3 >= 0.05:
                            k += 1
                            continue

                        row[k] = row[k] + '$^{'

                        if p1 < 0.05:
                            row[k] = row[k] + '*'
                        if p2 < 0.05:
                            row[k] = row[k] + r'\dag'

                        if p3 < 0.05:
                            row[k] = row[k] + r'\circ'

                        row[k] = row[k] + '}$'
                        k += 1

                Allbody.append(row)

    output = '../data/rouge_oracle_all_gather.txt'
    fio.Write2Latex(output, Allbody, [''] + head)
Example #12
def gather_rouge(output):

    courses = ['IE256', 'IE256_2016', 'CS0445']

    rouges = [
        ('LexRank', 'QPS_NP', 'rouge_LexRank'),
        ('PhraseSum', 'QPS_NP', 'rouge_crf_optimumComparerLSATasa'),
        ('SequenceSum', 'QPS_combine_coling',
         'rouge_crf_optimumComparerLSATasa'),
        ('SimSum', 'QPS_combine_coling', 'rouge_crf_svm'),
        ('CDSum', 'QPS_combine_coling', 'rouge_crf_ct.svm.default'),
    ]

    baseline1 = ('PhraseSum', 'QPS_NP', 'rouge_crf_optimumComparerLSATasa')
    baseline2 = ('SequenceSum', 'QPS_combine_coling',
                 'rouge_crf_optimumComparerLSATasa')

    Header = [
        'course',
        'name',
        'R1-R',
        'R1-P',
        'R1-F',
        'R2-R',
        'R2-P',
        'R2-F',
        'RSU4-R',
        'RSU4-P',
        'RSU4-F',
    ]

    ROUGE_Head = [
        'id', 'R1-R', 'R1-P', 'R1-F', 'R2-R', 'R2-P', 'R2-F', 'RSU4-R',
        'RSU4-P', 'RSU4-F'
    ]

    ROUGE_index = [
        ROUGE_Head.index(name) for name in ROUGE_Head if name != 'id'
    ]

    xbody = []
    for course in courses:
        for name, model, method in rouges:
            datadir = '../data/%s/' % course

            filename = os.path.join(datadir, model, "%s.txt" % method)
            if not fio.IsExist(filename): continue

            baseline1_name = os.path.join(datadir, baseline1[1],
                                          "%s.txt" % baseline1[2])
            baseline2_name = os.path.join(datadir, baseline2[1],
                                          "%s.txt" % baseline2[2])

            if name in ['LexRank', 'SequenceSum', 'SimSum', 'CDSum']:
                pvalues1 = get_pvalues(filename, baseline1_name, ROUGE_index)
            else:
                pvalues1 = [1] * len(ROUGE_index)

            if name in ['SimSum', 'CDSum']:
                pvalues2 = get_pvalues(filename, baseline2_name, ROUGE_index)
            else:
                pvalues2 = [1] * len(ROUGE_index)

            head, body = fio.ReadMatrix(filename, hasHead=True)

            row = [course, name]
            row += [
                '%.3f%s%s' % (float(x), '*' if pvalues1[i] < 0.05 else '',
                              '+' if pvalues2[i] < 0.05 else '')
                for i, x in enumerate(body[-1][1:])
            ]

            xbody.append(row)

    fio.WriteMatrix(output, xbody, Header)
Example #13
def get_pvalues(input1, input2, index):
    head, body1 = fio.ReadMatrix(input1, hasHead=True)
    head, body2 = fio.ReadMatrix(input2, hasHead=True)

    p_values = get_ttest_pvalues(body1[:-1], body2[:-1], index)
    return p_values
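`get_ttest_pvalues` (imported from stats_util in Example #11) is not shown in these snippets. A minimal sketch of the assumed behavior, one paired t-test per requested column via scipy; the pairing assumption is mine, based on the rows in both matrices covering the same weeks:

from scipy import stats

def get_ttest_pvalues(body1, body2, indexes):
    # body1/body2 are matrices of strings as read by fio.ReadMatrix;
    # compute one p-value per column index in `indexes`.
    pvalues = []
    for i in indexes:
        x = [float(row[i]) for row in body1]
        y = [float(row[i]) for row in body2]
        pvalues.append(stats.ttest_rel(x, y).pvalue)
    return pvalues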