Example #1
0
 def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
     """Set up clustering state and the similarity helpers.

     queryFile is accepted for interface compatibility but is not read
     here; query text is collected later while scanning candidate files.
     """
     self.query = {}                 # qid -> query content
     self.candidate = candidatePath  # directory holding candidate files
     self.tweet = {}                 # wid -> tweet content
     self.mu = mu                    # smoothing parameter forwarded to Distance
     self.sigma = sigma              # similarity threshold
     self.lamda = lamda              # cluster threshold
     self.jaccInstance = Jaccard()
     self.klInstance = Distance(mu, corpusFile)  # reads the corpus file
     # Parenthesized call prints identically on Python 2 and is also valid
     # Python 3; the original `print "..."` statement is a Py3 SyntaxError.
     print("corpus read done!")
Example #2
0
 def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
     """Set up clustering state and the similarity helpers.

     queryFile is accepted for interface compatibility but is not read
     here; query text is collected later while scanning candidate files.
     """
     self.query = {}                 # qid -> query content
     self.candidate = candidatePath  # directory holding candidate files
     self.tweet = {}                 # wid -> tweet content
     self.mu = mu                    # smoothing parameter forwarded to Distance
     self.sigma = sigma              # similarity threshold
     self.lamda = lamda              # cluster threshold
     self.jaccInstance = Jaccard()
     self.klInstance = Distance(mu, corpusFile)  # reads the corpus file
     # Parenthesized call prints identically on Python 2 and is also valid
     # Python 3; the original `print "..."` statement is a Py3 SyntaxError.
     print("corpus read done!")
Example #3
0
    def __init__(self, train_path, test_path):
        """Record the data locations and build empty frames plus the models."""
        self.train_path = train_path
        self.test_path = test_path

        # Empty frames; load() fills them from disk later.
        cols, gs_cols = Classifier._COLS, Classifier._GS_COLS
        self.trn = pd.DataFrame(columns=cols)        # train sentence pairs
        self.tst = pd.DataFrame(columns=cols)        # test sentence pairs
        self.trn_gs = pd.DataFrame(columns=gs_cols)  # train gold labels
        self.tst_gs = pd.DataFrame(columns=gs_cols)  # test gold labels
        self.tok_trn = []  # tokenized train data (populated in classify)
        self.tok_tst = []  # tokenized test data (populated in classify)

        # Pipeline components and the two regressors.
        self.preprocessor = Preprocessor()
        self.feature_extractor = FeatureExtractor()
        self.jaccard = Jaccard()
        self.rfr = RFR()
        self.nn = MLPRegressor(**{
            'hidden_layer_sizes': (100, 30, 30),
            'validation_fraction': 0.3,
            'alpha': 0.3,
            'warm_start': False,
            'max_iter': 1000,
            'activation': 'logistic',
        })
Example #4
0
class Classifier:
    """Semantic-textual-similarity pipeline: loads sentence-pair data,
    extracts features, trains RFR and NN regressors, and reports results."""

    _GS_COLS = ['labels']
    _COLS = ['sentence0', 'sentence1']

    def __init__(self, train_path, test_path):
        """Store data paths, build empty frames and the model objects."""
        self.train_path = train_path
        self.test_path = test_path
        self.preprocessor = Preprocessor()
        self.trn = pd.DataFrame(columns=Classifier._COLS)  # train pairs
        self.tst = pd.DataFrame(columns=Classifier._COLS)  # test pairs
        self.trn_gs = pd.DataFrame(columns=Classifier._GS_COLS)  # train gold
        self.tst_gs = pd.DataFrame(columns=Classifier._GS_COLS)  # test gold
        self.tok_trn = []  # tokenized train data, set in classify()
        self.tok_tst = []  # tokenized test data, set in classify()

        self.feature_extractor = FeatureExtractor()
        self.jaccard = Jaccard()
        self.rfr = RFR()
        self.nn = MLPRegressor(hidden_layer_sizes=(100, 30, 30),
                               validation_fraction=0.3,
                               alpha=0.3,
                               warm_start=False,
                               max_iter=1000,
                               activation='logistic')

    # -------------------------------------------------- CLASSIFY ▼ ----------------------------------------------------

    def classify(self):
        """Run the full pipeline: preprocess, build feature/BOG vectors,
        train the RFR and NN models, predict, and display the results."""
        print(self.trn.head())
        print('Preprocessing...')
        self.tok_trn = self.preprocessor.run(self.trn)
        self.tok_tst = self.preprocessor.run(self.tst)
        print(self.tok_trn.head())

        # Debug spot-check: one preprocessed sentence against its raw form.
        print(self.tok_trn['sentence0'].values[483])
        print(self.trn['sentence0'].values[483])

        # Features are loaded from a previous run's pickles; uncomment the
        # extractor lines to regenerate (and re-dump) them.
        fea_trn = pd.read_pickle('./dump/fea_trn4.dump')
        fea_tst = pd.read_pickle('./dump/fea_tst4.dump')
        #fea_trn = self.feature_extractor.extract(tok_trn)
        #fea_tst = self.feature_extractor.extract(tok_tst)
        #fea_trn.to_pickle('./dump/fea_trn4.dump')
        #fea_tst.to_pickle('./dump/fea_tst4.dump')

        print('Creating BOG...')
        bog = BOG()
        bog.train_dictionary(self.tok_trn)
        # Unscaled vectors feed the RFR; scaled vectors feed the NN.
        bog_extended_trn = bog.get_bog_extended(self.tok_trn, fea_trn)
        bog_extended_tst = bog.get_bog_extended(self.tok_tst, fea_tst)
        bog_extended_trn_scaled = bog.get_bog_extended(self.tok_trn,
                                                       fea_trn,
                                                       scale=True)
        bog_extended_tst_scaled = bog.get_bog_extended(self.tok_tst,
                                                       fea_tst,
                                                       scale=True)

        print('Training RFR...')
        self.rfr.fit(bog_extended_trn, self.trn_gs['labels'].values)
        self.rfr.print_feature_importance(bog_extended_trn)

        print('Training NN...')
        self.nn.fit(bog_extended_trn_scaled, self.trn_gs['labels'].values)

        print('Testing...')
        predict_nn_trn = self.nn.predict(bog_extended_trn_scaled)
        predict_nn_tst = self.nn.predict(bog_extended_tst_scaled)
        predict_rfr_trn = self.rfr.predict(bog_extended_trn)
        predict_rfr_tst = self.rfr.predict(bog_extended_tst)
        predict_jac_trn = self.jaccard.predict(self.tok_trn)
        predict_jac_tst = self.jaccard.predict(self.tok_tst)
        predict_vot_trn = self.average(predict_rfr_trn, predict_nn_trn)
        predict_vot_tst = self.average(predict_rfr_tst, predict_nn_tst)

        self.show_results(predict_rfr_trn, predict_rfr_tst, predict_jac_trn,
                          predict_jac_tst, predict_nn_trn, predict_nn_tst,
                          predict_vot_trn, predict_vot_tst)

    def average(self, predict_rfr, predict_nn):
        """Return the element-wise mean of the two prediction sequences."""
        return [0.5 * rfr + 0.5 * nn
                for rfr, nn in zip(predict_rfr, predict_nn)]

    # ---------------------------------------------------- SHOW ▼ -----------------------------------------------------

    def __add_table(self, table, name, trn, tst):
        """Append one model column: mean/std on train and test, plus the
        Pearson correlation against the gold labels for each split."""
        table.append_column(name, [
            '{:.2f} std: {:.1f}'.format(np.mean(trn), np.std(trn)),
            '{:.2f} std: {:.1f}'.format(np.mean(tst), np.std(tst)),
            '{:.4f}'.format(pearsonr(trn, self.trn_gs['labels'])[0]),
            '{:.4f}'.format(pearsonr(tst, self.tst_gs['labels'])[0])
        ])

    def __scatter(self, predictions, labels, color, xlabel):
        """Show one predicted-vs-real scatter plot (blocks until closed)."""
        plt.scatter(predictions, labels, c=color)
        plt.xlabel(xlabel)
        plt.ylabel('Real label')
        plt.show()

    def show_results(self, rfr_trn, rfr_tst, jac_trn, jac_tst, nn_trn, nn_tst,
                     vot_trn, vot_tst):
        """Tabulate per-model statistics, plot predictions against the gold
        labels, and print the best/worst averaged test predictions."""
        table = BeautifulTable()
        table.append_column('', ['Trn', 'Tst', 'Trn Pearson', 'Tst Pearson'])

        self.__add_table(table, 'Real', self.trn_gs['labels'],
                         self.tst_gs['labels'])
        self.__add_table(table, 'RFR', rfr_trn, rfr_tst)
        self.__add_table(table, 'Jaccard', jac_trn, jac_tst)
        self.__add_table(table, 'NN', nn_trn, nn_tst)
        self.__add_table(table, 'Voting', vot_trn, vot_tst)

        # Same display order as always: NN, Voting, Jaccard, RFR — all train
        # plots first, then all test plots.
        for preds, color, xlabel in ((nn_trn, 'Cyan', 'NN label'),
                                     (vot_trn, 'Blue', 'Averaging label'),
                                     (jac_trn, 'Green', 'Jaccard label'),
                                     (rfr_trn, 'Red', 'RFR label')):
            self.__scatter(preds, self.trn_gs['labels'], color, xlabel)
        for preds, color, xlabel in ((nn_tst, 'Cyan', 'NN label'),
                                     (vot_tst, 'Blue', 'Averaging label'),
                                     (jac_tst, 'Green', 'Jaccard label'),
                                     (rfr_tst, 'Red', 'RFR label')):
            self.__scatter(preds, self.tst_gs['labels'], color, xlabel)

        print(table)
        self.show_worst_test(vot_tst, rfr_tst, nn_tst, jac_tst)
        print()
        self.show_best_test(vot_tst, rfr_tst, nn_tst, jac_tst)
        print()

    def __print_ranked(self, idx, errors, predicted, predicted_rfr,
                       predicted_nn, predicted_jac):
        """Print the selected test rows (largest error first) with every
        model's prediction, the target, the error and both raw sentences."""
        # Keyed by error value so output can be sorted by it.  NOTE: rows
        # with exactly equal errors collapse to one entry (pre-existing
        # behavior of the original dict comprehension).
        by_error = {errors[i]: i for i in idx}
        for error in sorted(by_error, reverse=True):
            i = by_error[error]
            s0 = str(self.tst['sentence0'].values[i]).replace('\n',
                                                              '').replace(
                                                                  '\r', '')
            s1 = str(self.tst['sentence1'].values[i]).replace('\n',
                                                              '').replace(
                                                                  '\r', '')
            print(
                '\33[100m{:d} Predicted [Averaging: {:.2f} RFR: {:.2f} NN: {:.2f} Jaccard: {:.2f}] Target: {:.2f} Err: {:.2f}\033[0m\n{:s}\n{:s}'
                .format(i, predicted[i], predicted_rfr[i], predicted_nn[i],
                        predicted_jac[i], self.tst_gs['labels'].values[i],
                        error, s0, s1))

    def show_best_test(self,
                       predicted,
                       predicted_rfr,
                       predicted_nn,
                       predicted_jac,
                       k=15):
        """Show the k test rows where the averaged prediction is closest
        to the gold label."""
        print('Best results in averaging:')
        err = np.abs(predicted - self.tst_gs['labels'].values)
        idx = np.argpartition(err, k)[:k]  # indices of the k smallest errors
        self.__print_ranked(idx, err, predicted, predicted_rfr, predicted_nn,
                            predicted_jac)

    def show_worst_test(self,
                        predicted,
                        predicted_rfr,
                        predicted_nn,
                        predicted_jac,
                        k=15):
        """Show the k test rows where the averaged prediction is furthest
        from the gold label."""
        print('Worst results in averaging:')
        err = np.abs(predicted - self.tst_gs['labels'].values)
        idx = np.argpartition(err, -k)[-k:]  # indices of the k largest errors
        self.__print_ranked(idx, err, predicted, predicted_rfr, predicted_nn,
                            predicted_jac)

    # --------------------------------------------------- LOADING ▼ ---------------------------------------------------

    def load(self, use_dump=True):
        """Load the train and test splits from disk.

        use_dump is kept for interface compatibility; it is currently unused.
        """
        self.trn, self.trn_gs = self.__load_all(self.train_path)
        self.tst, self.tst_gs = self.__load_all(self.test_path)

        print('Train: {0} Test: {1}'.format(self.trn.shape, self.tst.shape))

    def __load_all(self, dir_path):
        """Read every STS.input file in dir_path and its gs (gold standard)
        twin; return (inputs, labels) with NaNs blanked and a fresh index."""
        print(dir_path)
        inputs = pd.DataFrame(columns=['sentence0', 'sentence1'])
        labels = pd.DataFrame(columns=['labels'])
        for file_name in listdir(dir_path):
            path = pth.join(dir_path, file_name)
            path_gs = path.replace('input', 'gs')  # gold file sits alongside
            if 'STS.input' in path:  # Only read input files
                input_df = pd.read_csv(path,
                                       sep='\t',
                                       lineterminator='\n',
                                       names=Classifier._COLS,
                                       header=None,
                                       quoting=csv.QUOTE_NONE)
                label_df = pd.read_csv(path_gs,
                                       sep='\t',
                                       lineterminator='\n',
                                       names=Classifier._GS_COLS,
                                       header=None,
                                       quoting=csv.QUOTE_NONE)
                # DataFrame.append was removed in pandas 2.0; concat is the
                # supported equivalent and preserves the old row-stacking.
                inputs = pd.concat([inputs, input_df])
                labels = pd.concat([labels, label_df])

        return (inputs.fillna('').reset_index(drop=True),
                labels.fillna('').reset_index(drop=True))
Example #5
0
class Cluster:
    """Cluster candidate tweets per query by KL-divergence similarity and
    pick one representative tweet from each cluster (Python 2 code)."""

    def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
        # NOTE(review): queryFile is accepted but never read in this class;
        # query text is harvested from the candidate files instead -- confirm.
        self.query = {}  # qid -> query content
        self.candidate = candidatePath  # directory containing candidate files
        self.tweet = {}  # wid -> tweet content
        self.mu = mu  # smoothing parameter forwarded to Distance
        self.sigma = sigma      #similarity threshold
        self.lamda = lamda      #cluster threshold
        self.jaccInstance = Jaccard()
        self.klInstance = Distance(mu, corpusFile)  # loads corpus statistics
        print "corpus read done!"

    def write(self, writePath, alpha, yibuson):
        """Cluster tweets for candidate files 1..55 and write results and
        diagnostic logs under writePath.

        alpha is forwarded to Rank.combinedCov; yibuson only appears in the
        output file names (the jaccard filter using it is commented out).
        """
        # File names encode every threshold so parameter sweeps don't
        # overwrite each other.  NOTE(review): these handles are never
        # closed explicitly; output relies on interpreter exit.
        writeFile = writePath + "res.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson)
        result = open(writeFile, "w+")
        log = open(writePath + "log.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log.write("Qid\tclusterCount\ttweetCount\n")
        log1 = open(writePath + "qidWidKL.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log2 = open(writePath + "widWidKL.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log3 = open(writePath + "jaccScore.rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")

        # Candidate files are named "<n>.res.content.all" for n in 1..55.
        num = 1
        files = []
        while(num <= 55):
            files.append(str(num) + ".res.content.all")
            num += 1
        for file in files:
            #remember to make them initial on each query (per-file state)
            self.curQid = -1
            self.cluster = []      # list of clusters, each a list of wids
            self.qidWidKL = {}     # "qid-wid" -> query/tweet KL score
            self.qidWidMax = 0
            self.qidWidMin = 999
            self.widWidKL = {}     # wid -> [{other wid: KL score}, ...]
            self.widWidMax = 0
            self.widWidMin = 999 
            self.widScore = {}     # wid -> combined coverage score
            self.jacc = {}         # "qid-wid" -> jaccard score
            self.jaccMax = 0
            self.jaccMin = 1
            self.resultList = []   # one selected wid per cluster
            readPath = self.candidate + file  
            
            with open(readPath, "r") as fin:
                for i, line in enumerate(fin):
                    # Tab-separated: qid, Qid, wid, rank, score, run name,
                    # tweet content, query content.
                    qid, Qid, wid, rank, score, runName, wcontent, qcontent = line.strip().split("\t")
                    self.query[qid] = qcontent
                    
                    #first time selection: the file is ranked by score, so
                    #once it drops below 4.59 no later line can qualify.
                    if(float(score) < 4.59):
                        if not self.cluster:
                            print "break out of 4.59, ", file, " , empty cluster!"
                            exit()
                        break
                        
                    self.tweet[wid] = wcontent
                    self.curQid = qid
                    
                    #calculate qidWidKL (lower KL means more similar)
                    similarity = self.klInstance.kl(self.query[qid], wcontent)
                    
                    #calculate jaccard score
                    jaccScore = self.jaccInstance.jaccardScore(qcontent, wcontent)
                    
                    #if similarity <= self.sigma and jaccScore >= yibuson:
                    if similarity <= self.sigma:    
                        #set self.qidWidKL and track its min/max for logging
                        self.qidWidKL[qid+"-"+wid] = similarity
                        if (similarity > self.qidWidMax):
                            self.qidWidMax = similarity

                        if (similarity < self.qidWidMin):
                            self.qidWidMin = similarity
                        
                        #set self.jacc and track its min/max for logging
                        self.jacc[qid+"-"+wid] = jaccScore
                        if self.jaccMax < jaccScore:
                            self.jaccMax = jaccScore
                        if self.jaccMin > jaccScore:
                            self.jaccMin = jaccScore    
                        
                        #calculate widWidKL: first tweet seeds the first
                        #cluster, later tweets are assigned by __clustering
                        if not self.cluster:
                            self.cluster.append([wid])
                            self.widWidKL[wid] = []
                        else:
                            self.__clustering(wid)

                    # Progress indicator every 100 lines.
                    if (i % 100) == 0:
                        print file, " => ", i
                
            # Dump diagnostics: query/tweet KL, tweet/tweet KL, jaccard.
            log1.write(str(self.qidWidMin) + "\t" + str(self.qidWidMax) + "\n")
            for key in self.qidWidKL:
                log1.write(key + "\t" + str(self.qidWidKL[key]) + "\n")
            log2.write(str(self.widWidMin) + "\t" + str(self.widWidMax) + "\n")
            for key in self.widWidKL:
                for i in range(len(self.widWidKL[key])):
                    for widKey in self.widWidKL[key][i]:
                        log2.write(key + "-" + widKey + "\t" + str(self.widWidKL[key][i][widKey]) + "\n")
            log3.write(str(self.jaccMin) + "\t" + str(self.jaccMax) + "\n")
            for key in self.jacc:
                log3.write(key + "\t" + str(self.jacc[key]) + "\n")
                        
            # Score every tweet by combined coverage, then keep the
            # highest-scoring tweet of each cluster.
            rankInstance = Rank(self.widWidKL, self.widWidMin, self.widWidMax)
            #self.widScore = rankInstance.textRank()
            self.widScore = rankInstance.combinedCov(alpha, self.cluster, self.tweet)
            
            #log info
            clusterCount = len(self.cluster)
            tweetCount = 0
            
            #select one wid from each cluster 
            for i in range(len(self.cluster)):
                maxScore = 0
                bestWid = -1
                tweetCount += len(self.cluster[i])
                for wid in self.cluster[i]:
                    if self.widScore[wid] > maxScore:
                    #select min query-tweet kl score
                    #key = str(self.curQid) + "-" + wid
                    #if self.qidWidKL[key] > maxScore:
                        #maxScore = self.qidWidKL[key]
                        maxScore = self.widScore[wid]
                        bestWid = wid
                self.resultList.append(bestWid)
            
            #write log info
            log.write("MB" + self.curQid + "\t" + str(clusterCount) + "\t" + str(tweetCount) + "\n")
            
                       
            #write result (TREC format: topic, Q0, docid, rank, score, tag)
            for wid in self.resultList:
                result.write("MB" + self.curQid + "\t" + "Q0\t" + wid + "\t1\t1\tYAO\n")
            
        
    def __clustering(self, wid):
        """Assign wid to the most similar existing cluster, or start a new
        cluster when even the best KL score exceeds self.lamda."""
        minScore = 999
        index = -1
        wcontent = self.tweet[wid]  # NOTE(review): unused; kl() reads the dict
        for i in range(len(self.cluster)):          
            for cwid in self.cluster[i]:
                ccontent = self.tweet[cwid]  # NOTE(review): also unused
                score = self.klInstance.kl(self.tweet[wid], self.tweet[cwid])
                #print i
                # Record the pairwise score symmetrically for later ranking.
                if wid in self.widWidKL:
                    self.widWidKL[wid].append({cwid: score})
                else:
                    self.widWidKL[wid] = [{cwid: score}]
                
                if cwid in self.widWidKL:
                    self.widWidKL[cwid].append({wid: score})
                else:
                    self.widWidKL[cwid] = [{wid: score}]
                    
                    
                #select miniScore, that is the most similar value
                #(lower KL = more similar)
                if score < minScore:
                    minScore = score
                    index = i
                #record self.widWidMax & self.widWidMin
                if score < self.widWidMin:
                    self.widWidMin = score

                if score > self.widWidMax:
                    self.widWidMax = score
                    
        #put wid into the cluster 
        #a new cluster
        if minScore > self.lamda:
            self.cluster.append([wid])
            #print self.cluster
        #add to a highest similarity cluster
        else:
            self.cluster[index].append(wid)
Example #6
0
class Cluster:
    """Cluster candidate tweets per query by KL-divergence similarity and
    pick one representative tweet from each cluster (Python 2 code)."""

    def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
        # NOTE(review): queryFile is accepted but never read in this class;
        # query text is harvested from the candidate files instead -- confirm.
        self.query = {}  # qid -> query content
        self.candidate = candidatePath  # directory containing candidate files
        self.tweet = {}  # wid -> tweet content
        self.mu = mu  # smoothing parameter forwarded to Distance
        self.sigma = sigma  #similarity threshold
        self.lamda = lamda  #cluster threshold
        self.jaccInstance = Jaccard()
        self.klInstance = Distance(mu, corpusFile)  # loads corpus statistics
        print "corpus read done!"

    def write(self, writePath, alpha, yibuson):
        """Cluster tweets for candidate files 1..55 and write results and
        diagnostic logs under writePath.

        alpha is forwarded to Rank.combinedCov; yibuson only appears in the
        output file names (the jaccard filter using it is commented out).
        """
        # File names encode every threshold so parameter sweeps don't
        # overwrite each other.  NOTE(review): these handles are never
        # closed explicitly; output relies on interpreter exit.
        writeFile = writePath + "res.rmSW4.59S" + str(self.sigma) + ".L" + str(
            self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson)
        result = open(writeFile, "w+")
        log = open(
            writePath + "log.rmSW4.59S" + str(self.sigma) + ".L" +
            str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log.write("Qid\tclusterCount\ttweetCount\n")
        log1 = open(
            writePath + "qidWidKL.rmSW4.59S" + str(self.sigma) + ".L" +
            str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log2 = open(
            writePath + "widWidKL.rmSW4.59S" + str(self.sigma) + ".L" +
            str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")
        log3 = open(
            writePath + "jaccScore.rmSW4.59S" + str(self.sigma) + ".L" +
            str(self.lamda) + ".A" + str(alpha) + ".NJ" + str(yibuson), "w+")

        # Candidate files are named "<n>.res.content.all" for n in 1..55.
        num = 1
        files = []
        while (num <= 55):
            files.append(str(num) + ".res.content.all")
            num += 1
        for file in files:
            #remember to make them initial on each query (per-file state)
            self.curQid = -1
            self.cluster = []      # list of clusters, each a list of wids
            self.qidWidKL = {}     # "qid-wid" -> query/tweet KL score
            self.qidWidMax = 0
            self.qidWidMin = 999
            self.widWidKL = {}     # wid -> [{other wid: KL score}, ...]
            self.widWidMax = 0
            self.widWidMin = 999
            self.widScore = {}     # wid -> combined coverage score
            self.jacc = {}         # "qid-wid" -> jaccard score
            self.jaccMax = 0
            self.jaccMin = 1
            self.resultList = []   # one selected wid per cluster
            readPath = self.candidate + file

            with open(readPath, "r") as fin:
                for i, line in enumerate(fin):
                    # Tab-separated: qid, Qid, wid, rank, score, run name,
                    # tweet content, query content.
                    qid, Qid, wid, rank, score, runName, wcontent, qcontent = line.strip(
                    ).split("\t")
                    self.query[qid] = qcontent

                    #first time selection: the file is ranked by score, so
                    #once it drops below 4.59 no later line can qualify.
                    if (float(score) < 4.59):
                        if not self.cluster:
                            print "break out of 4.59, ", file, " , empty cluster!"
                            exit()
                        break

                    self.tweet[wid] = wcontent
                    self.curQid = qid

                    #calculate qidWidKL (lower KL means more similar)
                    similarity = self.klInstance.kl(self.query[qid], wcontent)

                    #calculate jaccard score
                    jaccScore = self.jaccInstance.jaccardScore(
                        qcontent, wcontent)

                    #if similarity <= self.sigma and jaccScore >= yibuson:
                    if similarity <= self.sigma:
                        #set self.qidWidKL and track its min/max for logging
                        self.qidWidKL[qid + "-" + wid] = similarity
                        if (similarity > self.qidWidMax):
                            self.qidWidMax = similarity

                        if (similarity < self.qidWidMin):
                            self.qidWidMin = similarity

                        #set self.jacc and track its min/max for logging
                        self.jacc[qid + "-" + wid] = jaccScore
                        if self.jaccMax < jaccScore:
                            self.jaccMax = jaccScore
                        if self.jaccMin > jaccScore:
                            self.jaccMin = jaccScore

                        #calculate widWidKL: first tweet seeds the first
                        #cluster, later tweets are assigned by __clustering
                        if not self.cluster:
                            self.cluster.append([wid])
                            self.widWidKL[wid] = []
                        else:
                            self.__clustering(wid)

                    # Progress indicator every 100 lines.
                    if (i % 100) == 0:
                        print file, " => ", i

            # Dump diagnostics: query/tweet KL, tweet/tweet KL, jaccard.
            log1.write(str(self.qidWidMin) + "\t" + str(self.qidWidMax) + "\n")
            for key in self.qidWidKL:
                log1.write(key + "\t" + str(self.qidWidKL[key]) + "\n")
            log2.write(str(self.widWidMin) + "\t" + str(self.widWidMax) + "\n")
            for key in self.widWidKL:
                for i in range(len(self.widWidKL[key])):
                    for widKey in self.widWidKL[key][i]:
                        log2.write(key + "-" + widKey + "\t" +
                                   str(self.widWidKL[key][i][widKey]) + "\n")
            log3.write(str(self.jaccMin) + "\t" + str(self.jaccMax) + "\n")
            for key in self.jacc:
                log3.write(key + "\t" + str(self.jacc[key]) + "\n")

            # Score every tweet by combined coverage, then keep the
            # highest-scoring tweet of each cluster.
            rankInstance = Rank(self.widWidKL, self.widWidMin, self.widWidMax)
            #self.widScore = rankInstance.textRank()
            self.widScore = rankInstance.combinedCov(alpha, self.cluster,
                                                     self.tweet)

            #log info
            clusterCount = len(self.cluster)
            tweetCount = 0

            #select one wid from each cluster
            for i in range(len(self.cluster)):
                maxScore = 0
                bestWid = -1
                tweetCount += len(self.cluster[i])
                for wid in self.cluster[i]:
                    if self.widScore[wid] > maxScore:
                        #select min query-tweet kl score
                        #key = str(self.curQid) + "-" + wid
                        #if self.qidWidKL[key] > maxScore:
                        #maxScore = self.qidWidKL[key]
                        maxScore = self.widScore[wid]
                        bestWid = wid
                self.resultList.append(bestWid)

            #write log info
            log.write("MB" + self.curQid + "\t" + str(clusterCount) + "\t" +
                      str(tweetCount) + "\n")

            #write result (TREC format: topic, Q0, docid, rank, score, tag)
            for wid in self.resultList:
                result.write("MB" + self.curQid + "\t" + "Q0\t" + wid +
                             "\t1\t1\tYAO\n")

    def __clustering(self, wid):
        """Assign wid to the most similar existing cluster, or start a new
        cluster when even the best KL score exceeds self.lamda."""
        minScore = 999
        index = -1
        wcontent = self.tweet[wid]  # NOTE(review): unused; kl() reads the dict
        for i in range(len(self.cluster)):
            for cwid in self.cluster[i]:
                ccontent = self.tweet[cwid]  # NOTE(review): also unused
                score = self.klInstance.kl(self.tweet[wid], self.tweet[cwid])
                #print i
                # Record the pairwise score symmetrically for later ranking.
                if wid in self.widWidKL:
                    self.widWidKL[wid].append({cwid: score})
                else:
                    self.widWidKL[wid] = [{cwid: score}]

                if cwid in self.widWidKL:
                    self.widWidKL[cwid].append({wid: score})
                else:
                    self.widWidKL[cwid] = [{wid: score}]

                #select miniScore, that is the most similar value
                #(lower KL = more similar)
                if score < minScore:
                    minScore = score
                    index = i
                #record self.widWidMax & self.widWidMin
                if score < self.widWidMin:
                    self.widWidMin = score

                if score > self.widWidMax:
                    self.widWidMax = score

        #put wid into the cluster
        #a new cluster
        if minScore > self.lamda:
            self.cluster.append([wid])
            #print self.cluster
        #add to a highest similarity cluster
        else:
            self.cluster[index].append(wid)
Example #7
0
def get_answer(question):
    """Answer *question* by retrieving ranked passages from the configured
    index and returning the one most Jaccard-similar to the question."""
    searcher = Pyserini(app.config.get('index'))
    passages = searcher.ranked_passages(query_string=question, num_hits=30, k=20)
    return Jaccard().most_similar_passage(question, passages)