Esempio n. 1
0
def IRTest(texts, ir, num_cluster, percentage):
    """Evaluate retrieval quality using cluster summaries as queries.

    The rows of ``ir.corpus.TFIDF`` are clustered with k-means. For each
    cluster, the member documents are summarized with TextRank, the summary
    is submitted to ``ir.query`` and the retrieved documents are scored
    against the cluster membership.

    Args:
        texts: Raw document texts; position ``i`` must correspond to row
            ``i`` of ``ir.corpus.TFIDF`` (both are indexed with the same id).
        ir: Retrieval object exposing ``corpus.TFIDF`` and
            ``query(text, count)``. NOTE(review): each query result is
            assumed to carry the document index as its first element --
            confirm against the ``ir`` implementation.
        num_cluster: Number of k-means clusters to build.
        percentage: Forwarded unchanged to ``TextRank.summarize``.

    Returns:
        A list with one ``[precision, recall, fmeasure]`` entry per cluster,
        in cluster order. A metric whose denominator is zero is reported as
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    kmeans = KMeans(n_clusters=num_cluster).fit(ir.corpus.TFIDF)
    # Pair every document index with the cluster label it was assigned.
    assignments = list(zip(kmeans.labels_, range(len(texts))))
    test = []
    for cluster in range(num_cluster):
        # Gather the documents of this cluster and prepare them for
        # summarization.
        member_ids = [doc_id for label, doc_id in assignments
                      if label == cluster]
        member_texts = [texts[doc_id] for doc_id in member_ids]
        member_tfidf = np.array(
            [ir.corpus.TFIDF[doc_id] for doc_id in member_ids])
        summa = Ranking.TextRank(member_tfidf).summarize(
            member_texts, percentage)

        # Query with the summary, asking for as many documents as the
        # cluster contains.
        retrieved = [doc[0] for doc in ir.query(summa, len(member_ids))]

        # Confusion-matrix counts. Membership is tested on document ids on
        # both sides; comparing the (label, id) pairs against retrieved ids
        # would never match and would count every cluster document as a
        # false negative.
        relevant = set(member_ids)
        found = set(retrieved)
        TP = sum(1 for doc_id in retrieved if doc_id in relevant)
        FP = len(retrieved) - TP
        FN = sum(1 for doc_id in member_ids if doc_id not in found)

        # Precision, recall and F-measure, with zero denominators mapped
        # to 0.0 (e.g. nothing retrieved, or no true positives at all).
        precision = float(TP) / (TP + FP) if TP + FP else 0.0
        recall = float(TP) / (TP + FN) if TP + FN else 0.0
        if precision + recall:
            fmeasure = 2 * (precision * recall / (precision + recall))
        else:
            fmeasure = 0.0
        test.append([precision, recall, fmeasure])
    return test
Esempio n. 2
0
def IRTest(texts, ir, num_cluster, percentage):
    """Evaluate retrieval quality using cluster summaries as queries.

    The rows of ``ir.corpus.TFIDF`` are clustered with k-means. For each
    cluster, the member documents are summarized with TextRank, the summary
    is submitted to ``ir.query`` (the elapsed query time is printed) and the
    retrieved documents are scored against the cluster membership.

    Args:
        texts: Raw document texts; position ``i`` must correspond to row
            ``i`` of ``ir.corpus.TFIDF`` (both are indexed with the same id).
        ir: Retrieval object exposing ``corpus.TFIDF`` and
            ``query(text, count)``. NOTE(review): each query result is
            assumed to carry the document index as its first element --
            confirm against the ``ir`` implementation.
        num_cluster: Number of k-means clusters to build.
        percentage: Forwarded unchanged to ``TextRank.summarize``.

    Returns:
        A list with one ``[precision, recall, fmeasure]`` entry per cluster,
        in cluster order. A metric whose denominator is zero is reported as
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    # Build the clusters with the k-means algorithm.
    kmeans = KMeans(n_clusters=num_cluster).fit(ir.corpus.TFIDF)

    # Pair every document index with the cluster label it was assigned.
    assignments = list(zip(kmeans.labels_, range(len(texts))))

    # Collects the [precision, recall, fmeasure] triple of every cluster.
    test = []

    for cluster in range(num_cluster):
        # Ids of the documents that belong to this cluster.
        member_ids = [doc_id for label, doc_id in assignments
                      if label == cluster]

        # Raw text of the documents in the cluster.
        member_texts = [texts[doc_id] for doc_id in member_ids]

        # TF-IDF rows of the documents in the cluster.
        member_tfidf = np.array(
            [ir.corpus.TFIDF[doc_id] for doc_id in member_ids])

        # Summarize the cluster with the TextRank algorithm.
        summa = Ranking.TextRank(member_tfidf).summarize(
            member_texts, percentage)

        # Query with the summary, asking for as many documents as the
        # cluster contains, and report how long the query took.
        start = time.time()
        retrieved = [doc[0] for doc in ir.query(summa, len(member_ids))]
        print("time=%f" % (time.time() - start))

        # Confusion-matrix counts. Membership is tested on document ids on
        # both sides; comparing the (label, id) pairs against retrieved ids
        # would never match and would count every cluster document as a
        # false negative.
        relevant = set(member_ids)
        found = set(retrieved)
        TP = sum(1 for doc_id in retrieved if doc_id in relevant)
        FP = len(retrieved) - TP
        FN = sum(1 for doc_id in member_ids if doc_id not in found)

        # Precision, recall and F-measure, with zero denominators mapped
        # to 0.0 (e.g. nothing retrieved, or no true positives at all).
        precision = float(TP) / (TP + FP) if TP + FP else 0.0
        recall = float(TP) / (TP + FN) if TP + FN else 0.0
        if precision + recall:
            fmeasure = 2 * (precision * recall / (precision + recall))
        else:
            fmeasure = 0.0
        test.append([precision, recall, fmeasure])
    return test