def IRTest(texts, ir, num_cluster, percentage):
    """Evaluate summary-as-query retrieval quality per k-means cluster.

    Clusters the corpus TF-IDF matrix with k-means, summarizes each
    cluster's documents with TextRank, issues the summary as a query to
    the IR system, and scores the retrieved documents against the true
    cluster membership.

    Args:
        texts: list of raw document texts, index-aligned with ir.corpus.TFIDF.
        ir: retrieval object exposing ``corpus.TFIDF`` and ``query(q, k)``;
            ``query`` is assumed to yield (doc_id, score)-like pairs — TODO confirm.
        num_cluster: number of k-means clusters to form.
        percentage: fraction passed to the TextRank summarizer.

    Returns:
        list of ``[precision, recall, fmeasure]`` rows, one per cluster.
    """
    kmeans = KMeans(n_clusters=num_cluster).fit(ir.corpus.TFIDF)
    # Pair each cluster label with its document index.
    label = list(zip(kmeans.labels_, range(len(texts))))
    test = []
    for cluster in range(num_cluster):
        # Documents assigned to this cluster (prepare them for summarization).
        label_doc = [pair for pair in label if pair[0] == cluster]
        label_text = [texts[idx] for _, idx in label_doc]
        label_tfidf = np.array([ir.corpus.TFIDF[idx] for _, idx in label_doc])
        # Summarize the cluster with TextRank, then use the summary as a query.
        summa = Ranking.TextRank(label_tfidf).summarize(label_text, percentage)
        retrieved = [doc[0] for doc in ir.query(summa, len(label_doc))]
        # Confusion-matrix counts: relevant set = this cluster's doc ids.
        relevant = {idx for _, idx in label_doc}
        retrieved_set = set(retrieved)
        TP = sum(1 for page in retrieved if page in relevant)
        FP = len(retrieved) - TP
        # BUG FIX: the original tested the whole (label, idx) tuple against a
        # dict keyed by doc ids, so FN was always len(label_doc); compare ids.
        FN = sum(1 for _, idx in label_doc if idx not in retrieved_set)
        # Guard zero denominators (empty cluster / no hits) instead of raising.
        precision = TP / (TP + FP) if (TP + FP) else 0.0
        recall = TP / (TP + FN) if (TP + FN) else 0.0
        fmeasure = (2 * (precision * recall / (precision + recall))
                    if (precision + recall) else 0.0)
        test.append([precision, recall, fmeasure])
    return test
def IRTest(texts, ir, num_cluster, percentage):
    """Evaluate summary-as-query retrieval quality per k-means cluster.

    Same pipeline as the untimed variant, but also prints the wall-clock
    time spent in ``ir.query`` for each cluster.

    Args:
        texts: list of raw document texts, index-aligned with ir.corpus.TFIDF.
        ir: retrieval object exposing ``corpus.TFIDF`` and ``query(q, k)``;
            ``query`` is assumed to yield (doc_id, score)-like pairs — TODO confirm.
        num_cluster: number of k-means clusters to form.
        percentage: fraction passed to the TextRank summarizer.

    Returns:
        list of ``[precision, recall, fmeasure]`` rows, one per cluster.
    """
    # Build clusters with the k-means algorithm.
    kmeans = KMeans(n_clusters=num_cluster).fit(ir.corpus.TFIDF)
    # Pair each cluster label with its document index.
    label = list(zip(kmeans.labels_, range(len(texts))))
    # Accumulates [precision, recall, fmeasure] per cluster.
    test = []
    for cluster in range(num_cluster):
        # Documents assigned to this cluster.
        label_doc = [pair for pair in label if pair[0] == cluster]
        # Raw text of the documents in the cluster.
        label_text = [texts[idx] for _, idx in label_doc]
        # TF-IDF rows of the documents in the cluster.
        label_tfidf = np.array([ir.corpus.TFIDF[idx] for _, idx in label_doc])
        # Summarize with the TextRank algorithm; the summary becomes the query.
        summa = Ranking.TextRank(label_tfidf).summarize(label_text, percentage)
        # Retrieve documents, timing only the query call.
        start = time.time()
        retrieved = [doc[0] for doc in ir.query(summa, len(label_doc))]
        print("time=%f"%(time.time() - start))
        # Confusion-matrix counts: relevant set = this cluster's doc ids.
        relevant = {idx for _, idx in label_doc}
        retrieved_set = set(retrieved)
        TP = sum(1 for page in retrieved if page in relevant)
        FP = len(retrieved) - TP
        # BUG FIX: the original tested the whole (label, idx) tuple against a
        # dict keyed by doc ids, so FN was always len(label_doc); compare ids.
        FN = sum(1 for _, idx in label_doc if idx not in retrieved_set)
        # Guard zero denominators (empty cluster / no hits) instead of raising.
        precision = TP / (TP + FP) if (TP + FP) else 0.0
        recall = TP / (TP + FN) if (TP + FN) else 0.0
        fmeasure = (2 * (precision * recall / (precision + recall))
                    if (precision + recall) else 0.0)
        test.append([precision, recall, fmeasure])
    return test