Example #1
0
 def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
     """Set up clustering state and the similarity helpers.

     queryFile is accepted for interface compatibility but is not read
     here; query text is collected later while scanning candidate files.
     """
     self.query = {}                 # qid -> query content
     self.candidate = candidatePath  # directory holding candidate files
     self.tweet = {}                 # wid -> tweet content
     self.mu = mu                    # smoothing parameter forwarded to Distance
     self.sigma = sigma              # similarity threshold
     self.lamda = lamda              # cluster threshold
     self.jaccInstance = Jaccard()
     self.klInstance = Distance(mu, corpusFile)  # reads the corpus file
     # Parenthesized call prints identically on Python 2 and is also valid
     # Python 3; the original `print "..."` statement is a Py3 SyntaxError.
     print("corpus read done!")
Example #2
0
 def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
     """Set up clustering state and the similarity helpers.

     queryFile is accepted for interface compatibility but is not read
     here; query text is collected later while scanning candidate files.
     """
     self.query = {}                 # qid -> query content
     self.candidate = candidatePath  # directory holding candidate files
     self.tweet = {}                 # wid -> tweet content
     self.mu = mu                    # smoothing parameter forwarded to Distance
     self.sigma = sigma              # similarity threshold
     self.lamda = lamda              # cluster threshold
     self.jaccInstance = Jaccard()
     self.klInstance = Distance(mu, corpusFile)  # reads the corpus file
     # Parenthesized call prints identically on Python 2 and is also valid
     # Python 3; the original `print "..."` statement is a Py3 SyntaxError.
     print("corpus read done!")
Example #3
0
    def __init__(self, train_path, test_path):
        """Record the data locations and build empty frames plus the models."""
        self.train_path = train_path
        self.test_path = test_path

        # Empty frames; load() fills them from disk later.
        cols, gs_cols = Classifier._COLS, Classifier._GS_COLS
        self.trn = pd.DataFrame(columns=cols)        # train sentence pairs
        self.tst = pd.DataFrame(columns=cols)        # test sentence pairs
        self.trn_gs = pd.DataFrame(columns=gs_cols)  # train gold labels
        self.tst_gs = pd.DataFrame(columns=gs_cols)  # test gold labels
        self.tok_trn = []  # tokenized train data (populated in classify)
        self.tok_tst = []  # tokenized test data (populated in classify)

        # Pipeline components and the two regressors.
        self.preprocessor = Preprocessor()
        self.feature_extractor = FeatureExtractor()
        self.jaccard = Jaccard()
        self.rfr = RFR()
        self.nn = MLPRegressor(**{
            'hidden_layer_sizes': (100, 30, 30),
            'validation_fraction': 0.3,
            'alpha': 0.3,
            'warm_start': False,
            'max_iter': 1000,
            'activation': 'logistic',
        })
Example #4
0
class Classifier:
    """Semantic-textual-similarity pipeline: loads sentence-pair data,
    extracts features, trains RFR and NN regressors, and reports results."""

    _GS_COLS = ['labels']
    _COLS = ['sentence0', 'sentence1']

    def __init__(self, train_path, test_path):
        """Store data paths, build empty frames and the model objects."""
        self.train_path = train_path
        self.test_path = test_path
        self.preprocessor = Preprocessor()
        self.trn = pd.DataFrame(columns=Classifier._COLS)  # train pairs
        self.tst = pd.DataFrame(columns=Classifier._COLS)  # test pairs
        self.trn_gs = pd.DataFrame(columns=Classifier._GS_COLS)  # train gold
        self.tst_gs = pd.DataFrame(columns=Classifier._GS_COLS)  # test gold
        self.tok_trn = []  # tokenized train data, set in classify()
        self.tok_tst = []  # tokenized test data, set in classify()

        self.feature_extractor = FeatureExtractor()
        self.jaccard = Jaccard()
        self.rfr = RFR()
        self.nn = MLPRegressor(hidden_layer_sizes=(100, 30, 30),
                               validation_fraction=0.3,
                               alpha=0.3,
                               warm_start=False,
                               max_iter=1000,
                               activation='logistic')

    # -------------------------------------------------- CLASSIFY ▼ ----------------------------------------------------

    def classify(self):
        """Run the full pipeline: preprocess, build feature/BOG vectors,
        train the RFR and NN models, predict, and display the results."""
        print(self.trn.head())
        print('Preprocessing...')
        self.tok_trn = self.preprocessor.run(self.trn)
        self.tok_tst = self.preprocessor.run(self.tst)
        print(self.tok_trn.head())

        # Debug spot-check: one preprocessed sentence against its raw form.
        print(self.tok_trn['sentence0'].values[483])
        print(self.trn['sentence0'].values[483])

        # Features are loaded from a previous run's pickles; uncomment the
        # extractor lines to regenerate (and re-dump) them.
        fea_trn = pd.read_pickle('./dump/fea_trn4.dump')
        fea_tst = pd.read_pickle('./dump/fea_tst4.dump')
        #fea_trn = self.feature_extractor.extract(tok_trn)
        #fea_tst = self.feature_extractor.extract(tok_tst)
        #fea_trn.to_pickle('./dump/fea_trn4.dump')
        #fea_tst.to_pickle('./dump/fea_tst4.dump')

        print('Creating BOG...')
        bog = BOG()
        bog.train_dictionary(self.tok_trn)
        # Unscaled vectors feed the RFR; scaled vectors feed the NN.
        bog_extended_trn = bog.get_bog_extended(self.tok_trn, fea_trn)
        bog_extended_tst = bog.get_bog_extended(self.tok_tst, fea_tst)
        bog_extended_trn_scaled = bog.get_bog_extended(self.tok_trn,
                                                       fea_trn,
                                                       scale=True)
        bog_extended_tst_scaled = bog.get_bog_extended(self.tok_tst,
                                                       fea_tst,
                                                       scale=True)

        print('Training RFR...')
        self.rfr.fit(bog_extended_trn, self.trn_gs['labels'].values)
        self.rfr.print_feature_importance(bog_extended_trn)

        print('Training NN...')
        self.nn.fit(bog_extended_trn_scaled, self.trn_gs['labels'].values)

        print('Testing...')
        predict_nn_trn = self.nn.predict(bog_extended_trn_scaled)
        predict_nn_tst = self.nn.predict(bog_extended_tst_scaled)
        predict_rfr_trn = self.rfr.predict(bog_extended_trn)
        predict_rfr_tst = self.rfr.predict(bog_extended_tst)
        predict_jac_trn = self.jaccard.predict(self.tok_trn)
        predict_jac_tst = self.jaccard.predict(self.tok_tst)
        predict_vot_trn = self.average(predict_rfr_trn, predict_nn_trn)
        predict_vot_tst = self.average(predict_rfr_tst, predict_nn_tst)

        self.show_results(predict_rfr_trn, predict_rfr_tst, predict_jac_trn,
                          predict_jac_tst, predict_nn_trn, predict_nn_tst,
                          predict_vot_trn, predict_vot_tst)

    def average(self, predict_rfr, predict_nn):
        """Return the element-wise mean of the two prediction sequences."""
        return [0.5 * rfr + 0.5 * nn
                for rfr, nn in zip(predict_rfr, predict_nn)]

    # ---------------------------------------------------- SHOW ▼ -----------------------------------------------------

    def __add_table(self, table, name, trn, tst):
        """Append one model column: mean/std on train and test, plus the
        Pearson correlation against the gold labels for each split."""
        table.append_column(name, [
            '{:.2f} std: {:.1f}'.format(np.mean(trn), np.std(trn)),
            '{:.2f} std: {:.1f}'.format(np.mean(tst), np.std(tst)),
            '{:.4f}'.format(pearsonr(trn, self.trn_gs['labels'])[0]),
            '{:.4f}'.format(pearsonr(tst, self.tst_gs['labels'])[0])
        ])

    def __scatter(self, predictions, labels, color, xlabel):
        """Show one predicted-vs-real scatter plot (blocks until closed)."""
        plt.scatter(predictions, labels, c=color)
        plt.xlabel(xlabel)
        plt.ylabel('Real label')
        plt.show()

    def show_results(self, rfr_trn, rfr_tst, jac_trn, jac_tst, nn_trn, nn_tst,
                     vot_trn, vot_tst):
        """Tabulate per-model statistics, plot predictions against the gold
        labels, and print the best/worst averaged test predictions."""
        table = BeautifulTable()
        table.append_column('', ['Trn', 'Tst', 'Trn Pearson', 'Tst Pearson'])

        self.__add_table(table, 'Real', self.trn_gs['labels'],
                         self.tst_gs['labels'])
        self.__add_table(table, 'RFR', rfr_trn, rfr_tst)
        self.__add_table(table, 'Jaccard', jac_trn, jac_tst)
        self.__add_table(table, 'NN', nn_trn, nn_tst)
        self.__add_table(table, 'Voting', vot_trn, vot_tst)

        # Same display order as always: NN, Voting, Jaccard, RFR — all train
        # plots first, then all test plots.
        for preds, color, xlabel in ((nn_trn, 'Cyan', 'NN label'),
                                     (vot_trn, 'Blue', 'Averaging label'),
                                     (jac_trn, 'Green', 'Jaccard label'),
                                     (rfr_trn, 'Red', 'RFR label')):
            self.__scatter(preds, self.trn_gs['labels'], color, xlabel)
        for preds, color, xlabel in ((nn_tst, 'Cyan', 'NN label'),
                                     (vot_tst, 'Blue', 'Averaging label'),
                                     (jac_tst, 'Green', 'Jaccard label'),
                                     (rfr_tst, 'Red', 'RFR label')):
            self.__scatter(preds, self.tst_gs['labels'], color, xlabel)

        print(table)
        self.show_worst_test(vot_tst, rfr_tst, nn_tst, jac_tst)
        print()
        self.show_best_test(vot_tst, rfr_tst, nn_tst, jac_tst)
        print()

    def __print_ranked(self, idx, errors, predicted, predicted_rfr,
                       predicted_nn, predicted_jac):
        """Print the selected test rows (largest error first) with every
        model's prediction, the target, the error and both raw sentences."""
        # Keyed by error value so output can be sorted by it.  NOTE: rows
        # with exactly equal errors collapse to one entry (pre-existing
        # behavior of the original dict comprehension).
        by_error = {errors[i]: i for i in idx}
        for error in sorted(by_error, reverse=True):
            i = by_error[error]
            s0 = str(self.tst['sentence0'].values[i]).replace('\n',
                                                              '').replace(
                                                                  '\r', '')
            s1 = str(self.tst['sentence1'].values[i]).replace('\n',
                                                              '').replace(
                                                                  '\r', '')
            print(
                '\33[100m{:d} Predicted [Averaging: {:.2f} RFR: {:.2f} NN: {:.2f} Jaccard: {:.2f}] Target: {:.2f} Err: {:.2f}\033[0m\n{:s}\n{:s}'
                .format(i, predicted[i], predicted_rfr[i], predicted_nn[i],
                        predicted_jac[i], self.tst_gs['labels'].values[i],
                        error, s0, s1))

    def show_best_test(self,
                       predicted,
                       predicted_rfr,
                       predicted_nn,
                       predicted_jac,
                       k=15):
        """Show the k test rows where the averaged prediction is closest
        to the gold label."""
        print('Best results in averaging:')
        err = np.abs(predicted - self.tst_gs['labels'].values)
        idx = np.argpartition(err, k)[:k]  # indices of the k smallest errors
        self.__print_ranked(idx, err, predicted, predicted_rfr, predicted_nn,
                            predicted_jac)

    def show_worst_test(self,
                        predicted,
                        predicted_rfr,
                        predicted_nn,
                        predicted_jac,
                        k=15):
        """Show the k test rows where the averaged prediction is furthest
        from the gold label."""
        print('Worst results in averaging:')
        err = np.abs(predicted - self.tst_gs['labels'].values)
        idx = np.argpartition(err, -k)[-k:]  # indices of the k largest errors
        self.__print_ranked(idx, err, predicted, predicted_rfr, predicted_nn,
                            predicted_jac)

    # --------------------------------------------------- LOADING ▼ ---------------------------------------------------

    def load(self, use_dump=True):
        """Load the train and test splits from disk.

        use_dump is kept for interface compatibility; it is currently unused.
        """
        self.trn, self.trn_gs = self.__load_all(self.train_path)
        self.tst, self.tst_gs = self.__load_all(self.test_path)

        print('Train: {0} Test: {1}'.format(self.trn.shape, self.tst.shape))

    def __load_all(self, dir_path):
        """Read every STS.input file in dir_path and its gs (gold standard)
        twin; return (inputs, labels) with NaNs blanked and a fresh index."""
        print(dir_path)
        inputs = pd.DataFrame(columns=['sentence0', 'sentence1'])
        labels = pd.DataFrame(columns=['labels'])
        for file_name in listdir(dir_path):
            path = pth.join(dir_path, file_name)
            path_gs = path.replace('input', 'gs')  # gold file sits alongside
            if 'STS.input' in path:  # Only read input files
                input_df = pd.read_csv(path,
                                       sep='\t',
                                       lineterminator='\n',
                                       names=Classifier._COLS,
                                       header=None,
                                       quoting=csv.QUOTE_NONE)
                label_df = pd.read_csv(path_gs,
                                       sep='\t',
                                       lineterminator='\n',
                                       names=Classifier._GS_COLS,
                                       header=None,
                                       quoting=csv.QUOTE_NONE)
                # DataFrame.append was removed in pandas 2.0; concat is the
                # supported equivalent and preserves the old row-stacking.
                inputs = pd.concat([inputs, input_df])
                labels = pd.concat([labels, label_df])

        return (inputs.fillna('').reset_index(drop=True),
                labels.fillna('').reset_index(drop=True))
Example #5
0
class Cluster:
    """Cluster candidate tweets per query by KL-divergence similarity and
    pick one representative tweet from each cluster (Python 2 code)."""

    def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
        # NOTE(review): queryFile is accepted but never read in this class;
        # query text is harvested from the candidate files instead -- confirm.
        self.query = {}  # qid -> query content
        self.candidate = candidatePath  # directory containing candidate files
        self.tweet = {}  # wid -> tweet content
        self.mu = mu  # smoothing parameter forwarded to Distance
        self.sigma = sigma      #similarity threshold
        self.lamda = lamda      #cluster threshold
        self.jaccInstance = Jaccard()
        self.klInstance = Distance(mu, corpusFile)  # loads corpus statistics
        print "corpus read done!"

    def write(self, writePath, alpha, yibuson):
        """Cluster tweets for candidate files 1..55 and write results and
        diagnostic logs under writePath.

        alpha is forwarded to Rank.combinedCov; yibuson only appears in the
        output file names (the jaccard filter using it is commented out).
        """
        # File names encode every threshold so parameter sweeps don't
        # overwrite each other.  NOTE(review): these handles are never
        # closed explicitly; output relies on interpreter exit.
        writeFile = writePath + "res.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson)
        result = open(writeFile, "w+")
        log = open(writePath + "log.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log.write("Qid\tclusterCount\ttweetCount\n")
        log1 = open(writePath + "qidWidKL.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log2 = open(writePath + "widWidKL.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log3 = open(writePath + "jaccScore.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")

        # Candidate files are named "<n>.res.content.all" for n in 1..55.
        num = 1
        files = []
        while(num <= 55):
            files.append(str(num) + ".res.content.all")
            num += 1
        for file in files:
            #remember to make them initial on each query (per-file state)
            self.curQid = -1
            self.cluster = []      # list of clusters, each a list of wids
            self.qidWidKL = {}     # "qid-wid" -> query/tweet KL score
            self.qidWidMax = 0
            self.qidWidMin = 999
            self.widWidKL = {}     # wid -> [{other wid: KL score}, ...]
            self.widWidMax = 0
            self.widWidMin = 999 
            self.widScore = {}     # wid -> combined coverage score
            self.jacc = {}         # "qid-wid" -> jaccard score
            self.jaccMax = 0
            self.jaccMin = 1
            self.resultList = []   # one selected wid per cluster
            readPath = self.candidate + file  
            
            with open(readPath, "r") as fin:
                for i, line in enumerate(fin):
                    # Tab-separated: qid, Qid, wid, rank, score, run name,
                    # tweet content, query content.
                    qid, Qid, wid, rank, score, runName, wcontent, qcontent = line.strip().split("\t")
                    self.query[qid] = qcontent
                    
                    #first time selection: the file is ranked by score, so
                    #once it drops below 4.59 no later line can qualify.
                    if(float(score) < 4.59):
                        if not self.cluster:
                            print "break out of 4.59, ", file, " , empty cluster!"
                            exit()
                        break
                        
                    self.tweet[wid] = wcontent
                    self.curQid = qid
                    
                    #calculate qidWidKL (lower KL means more similar)
                    similarity = self.klInstance.kl(self.query[qid], wcontent)
                    
                    #calculate jaccard score
                    jaccScore = self.jaccInstance.jaccardScore(qcontent, wcontent)
                    
                    #if similarity <= self.sigma and jaccScore >= yibuson:
                    if similarity <= self.sigma:    
                        #set self.qidWidKL and track its min/max for logging
                        self.qidWidKL[qid+"-"+wid] = similarity
                        if (similarity > self.qidWidMax):
                            self.qidWidMax = similarity

                        if (similarity < self.qidWidMin):
                            self.qidWidMin = similarity
                        
                        #set self.jacc and track its min/max for logging
                        self.jacc[qid+"-"+wid] = jaccScore
                        if self.jaccMax < jaccScore:
                            self.jaccMax = jaccScore
                        if self.jaccMin > jaccScore:
                            self.jaccMin = jaccScore    
                        
                        #calculate widWidKL: first tweet seeds the first
                        #cluster, later tweets are assigned by __clustering
                        if not self.cluster:
                            self.cluster.append([wid])
                            self.widWidKL[wid] = []
                        else:
                            self.__clustering(wid)

                    # Progress indicator every 100 lines.
                    if (i % 100) == 0:
                        print file, " => ", i
                
            # Dump diagnostics: query/tweet KL, tweet/tweet KL, jaccard.
            log1.write(str(self.qidWidMin) + "\t" + str(self.qidWidMax) + "\n")
            for key in self.qidWidKL:
                log1.write(key + "\t" + str(self.qidWidKL[key]) + "\n")
            log2.write(str(self.widWidMin) + "\t" + str(self.widWidMax) + "\n")
            for key in self.widWidKL:
                for i in range(len(self.widWidKL[key])):
                    for widKey in self.widWidKL[key][i]:
                        log2.write(key + "-" + widKey + "\t" + str(self.widWidKL[key][i][widKey]) + "\n")
            log3.write(str(self.jaccMin) + "\t" + str(self.jaccMax) + "\n")
            for key in self.jacc:
                log3.write(key + "\t" + str(self.jacc[key]) + "\n")
                        
            # Score every tweet by combined coverage, then keep the
            # highest-scoring tweet of each cluster.
            rankInstance = Rank(self.widWidKL, self.widWidMin, self.widWidMax)
            #self.widScore = rankInstance.textRank()
            self.widScore = rankInstance.combinedCov(alpha, self.cluster, self.tweet)
            
            #log info
            clusterCount = len(self.cluster)
            tweetCount = 0
            
            #select one wid from each cluster 
            for i in range(len(self.cluster)):
                maxScore = 0
                bestWid = -1
                tweetCount += len(self.cluster[i])
                for wid in self.cluster[i]:
                    if self.widScore[wid] > maxScore:
                    #select min query-tweet kl score
                    #key = str(self.curQid) + "-" + wid
                    #if self.qidWidKL[key] > maxScore:
                        #maxScore = self.qidWidKL[key]
                        maxScore = self.widScore[wid]
                        bestWid = wid
                self.resultList.append(bestWid)
            
            #write log info
            log.write("MB" + self.curQid + "\t" + str(clusterCount) + "\t" + str(tweetCount) + "\n")
            
                       
            #write result (TREC format: topic, Q0, docid, rank, score, tag)
            for wid in self.resultList:
                result.write("MB" + self.curQid + "\t" + "Q0\t" + wid + "\t1\t1\tYAO\n")
            
        
    def __clustering(self, wid):
        """Assign wid to the most similar existing cluster, or start a new
        cluster when even the best KL score exceeds self.lamda."""
        minScore = 999
        index = -1
        wcontent = self.tweet[wid]  # NOTE(review): unused; kl() reads the dict
        for i in range(len(self.cluster)):          
            for cwid in self.cluster[i]:
                ccontent = self.tweet[cwid]  # NOTE(review): also unused
                score = self.klInstance.kl(self.tweet[wid], self.tweet[cwid])
                #print i
                # Record the pairwise score symmetrically for later ranking.
                if wid in self.widWidKL:
                    self.widWidKL[wid].append({cwid: score})
                else:
                    self.widWidKL[wid] = [{cwid: score}]
                
                if cwid in self.widWidKL:
                    self.widWidKL[cwid].append({wid: score})
                else:
                    self.widWidKL[cwid] = [{wid: score}]
                    
                    
                #select miniScore, that is the most similar value
                #(lower KL = more similar)
                if score < minScore:
                    minScore = score
                    index = i
                #record self.widWidMax & self.widWidMin
                if score < self.widWidMin:
                    self.widWidMin = score

                if score > self.widWidMax:
                    self.widWidMax = score
                    
        #put wid into the cluster 
        #a new cluster
        if minScore > self.lamda:
            self.cluster.append([wid])
            #print self.cluster
        #add to a highest similarity cluster
        else:
            self.cluster[index].append(wid)
Example #6
0
class Cluster:
    """Cluster candidate tweets per query by KL-divergence similarity and
    pick one representative tweet from each cluster (Python 2 code)."""

    def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
        # NOTE(review): queryFile is accepted but never read in this class;
        # query text is harvested from the candidate files instead -- confirm.
        self.query = {}  # qid -> query content
        self.candidate = candidatePath  # directory containing candidate files
        self.tweet = {}  # wid -> tweet content
        self.mu = mu  # smoothing parameter forwarded to Distance
        self.sigma = sigma  #similarity threshold
        self.lamda = lamda  #cluster threshold
        self.jaccInstance = Jaccard()
        self.klInstance = Distance(mu, corpusFile)  # loads corpus statistics
        print "corpus read done!"

    def write(self, writePath, alpha, yibuson):
        """Cluster tweets for candidate files 1..55 and write results and
        diagnostic logs under writePath.

        alpha is forwarded to Rank.combinedCov; yibuson only appears in the
        output file names (the jaccard filter using it is commented out).
        """
        # File names encode every threshold so parameter sweeps don't
        # overwrite each other.  NOTE(review): these handles are never
        # closed explicitly; output relies on interpreter exit.
        writeFile = writePath + "res.rmSW4.59S" + str(self.sigma) + ".L" + str(
            self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson)
        result = open(writeFile, "w+")
        log = open(
            writePath + "log.rmSW4.59S" + str(self.sigma) + ".L" +
            str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log.write("Qid\tclusterCount\ttweetCount\n")
        log1 = open(
            writePath + "qidWidKL.rmSW4.59S" + str(self.sigma) + ".L" +
            str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log2 = open(
            writePath + "widWidKL.rmSW4.59S" + str(self.sigma) + ".L" +
            str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log3 = open(
            writePath + "jaccScore.rmSW4.59S" + str(self.sigma) + ".L" +
            str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")

        # Candidate files are named "<n>.res.content.all" for n in 1..55.
        num = 1
        files = []
        while (num <= 55):
            files.append(str(num) + ".res.content.all")
            num += 1
        for file in files:
            #remember to make them initial on each query (per-file state)
            self.curQid = -1
            self.cluster = []      # list of clusters, each a list of wids
            self.qidWidKL = {}     # "qid-wid" -> query/tweet KL score
            self.qidWidMax = 0
            self.qidWidMin = 999
            self.widWidKL = {}     # wid -> [{other wid: KL score}, ...]
            self.widWidMax = 0
            self.widWidMin = 999
            self.widScore = {}     # wid -> combined coverage score
            self.jacc = {}         # "qid-wid" -> jaccard score
            self.jaccMax = 0
            self.jaccMin = 1
            self.resultList = []   # one selected wid per cluster
            readPath = self.candidate + file

            with open(readPath, "r") as fin:
                for i, line in enumerate(fin):
                    # Tab-separated: qid, Qid, wid, rank, score, run name,
                    # tweet content, query content.
                    qid, Qid, wid, rank, score, runName, wcontent, qcontent = line.strip(
                    ).split("\t")
                    self.query[qid] = qcontent

                    #first time selection: the file is ranked by score, so
                    #once it drops below 4.59 no later line can qualify.
                    if (float(score) < 4.59):
                        if not self.cluster:
                            print "break out of 4.59, ", file, " , empty cluster!"
                            exit()
                        break

                    self.tweet[wid] = wcontent
                    self.curQid = qid

                    #calculate qidWidKL (lower KL means more similar)
                    similarity = self.klInstance.kl(self.query[qid], wcontent)

                    #calculate jaccard score
                    jaccScore = self.jaccInstance.jaccardScore(
                        qcontent, wcontent)

                    #if similarity <= self.sigma and jaccScore >= yibuson:
                    if similarity <= self.sigma:
                        #set self.qidWidKL and track its min/max for logging
                        self.qidWidKL[qid + "-" + wid] = similarity
                        if (similarity > self.qidWidMax):
                            self.qidWidMax = similarity

                        if (similarity < self.qidWidMin):
                            self.qidWidMin = similarity

                        #set self.jacc and track its min/max for logging
                        self.jacc[qid + "-" + wid] = jaccScore
                        if self.jaccMax < jaccScore:
                            self.jaccMax = jaccScore
                        if self.jaccMin > jaccScore:
                            self.jaccMin = jaccScore

                        #calculate widWidKL: first tweet seeds the first
                        #cluster, later tweets are assigned by __clustering
                        if not self.cluster:
                            self.cluster.append([wid])
                            self.widWidKL[wid] = []
                        else:
                            self.__clustering(wid)

                    # Progress indicator every 100 lines.
                    if (i % 100) == 0:
                        print file, " => ", i

            # Dump diagnostics: query/tweet KL, tweet/tweet KL, jaccard.
            log1.write(str(self.qidWidMin) + "\t" + str(self.qidWidMax) + "\n")
            for key in self.qidWidKL:
                log1.write(key + "\t" + str(self.qidWidKL[key]) + "\n")
            log2.write(str(self.widWidMin) + "\t" + str(self.widWidMax) + "\n")
            for key in self.widWidKL:
                for i in range(len(self.widWidKL[key])):
                    for widKey in self.widWidKL[key][i]:
                        log2.write(key + "-" + widKey + "\t" +
                                   str(self.widWidKL[key][i][widKey]) + "\n")
            log3.write(str(self.jaccMin) + "\t" + str(self.jaccMax) + "\n")
            for key in self.jacc:
                log3.write(key + "\t" + str(self.jacc[key]) + "\n")

            # Score every tweet by combined coverage, then keep the
            # highest-scoring tweet of each cluster.
            rankInstance = Rank(self.widWidKL, self.widWidMin, self.widWidMax)
            #self.widScore = rankInstance.textRank()
            self.widScore = rankInstance.combinedCov(alpha, self.cluster,
                                                     self.tweet)

            #log info
            clusterCount = len(self.cluster)
            tweetCount = 0

            #select one wid from each cluster
            for i in range(len(self.cluster)):
                maxScore = 0
                bestWid = -1
                tweetCount += len(self.cluster[i])
                for wid in self.cluster[i]:
                    if self.widScore[wid] > maxScore:
                        #select min query-tweet kl score
                        #key = str(self.curQid) + "-" + wid
                        #if self.qidWidKL[key] > maxScore:
                        #maxScore = self.qidWidKL[key]
                        maxScore = self.widScore[wid]
                        bestWid = wid
                self.resultList.append(bestWid)

            #write log info
            log.write("MB" + self.curQid + "\t" + str(clusterCount) + "\t" +
                      str(tweetCount) + "\n")

            #write result (TREC format: topic, Q0, docid, rank, score, tag)
            for wid in self.resultList:
                result.write("MB" + self.curQid + "\t" + "Q0\t" + wid +
                             "\t1\t1\tYAO\n")

    def __clustering(self, wid):
        """Assign wid to the most similar existing cluster, or start a new
        cluster when even the best KL score exceeds self.lamda."""
        minScore = 999
        index = -1
        wcontent = self.tweet[wid]  # NOTE(review): unused; kl() reads the dict
        for i in range(len(self.cluster)):
            for cwid in self.cluster[i]:
                ccontent = self.tweet[cwid]  # NOTE(review): also unused
                score = self.klInstance.kl(self.tweet[wid], self.tweet[cwid])
                #print i
                # Record the pairwise score symmetrically for later ranking.
                if wid in self.widWidKL:
                    self.widWidKL[wid].append({cwid: score})
                else:
                    self.widWidKL[wid] = [{cwid: score}]

                if cwid in self.widWidKL:
                    self.widWidKL[cwid].append({wid: score})
                else:
                    self.widWidKL[cwid] = [{wid: score}]

                #select miniScore, that is the most similar value
                #(lower KL = more similar)
                if score < minScore:
                    minScore = score
                    index = i
                #record self.widWidMax & self.widWidMin
                if score < self.widWidMin:
                    self.widWidMin = score

                if score > self.widWidMax:
                    self.widWidMax = score

        #put wid into the cluster
        #a new cluster
        if minScore > self.lamda:
            self.cluster.append([wid])
            #print self.cluster
        #add to a highest similarity cluster
        else:
            self.cluster[index].append(wid)
Example #7
0
def get_answer(question):
    """Answer *question* by retrieving ranked passages from the configured
    index and returning the one most Jaccard-similar to the question."""
    searcher = Pyserini(app.config.get('index'))
    passages = searcher.ranked_passages(query_string=question, num_hits=30, k=20)
    return Jaccard().most_similar_passage(question, passages)