def find_clust_similar_sent(self):
    """Find, for each cluster, the sentence most similar to that cluster.

    Reads self.clust_weight and self.sent_weight, compares them with
    helper.similarity, and stores the deduplicated list of winning
    sentence indices in self.clust_sentences.
    """
    # One row per cluster; each row maps sentence index -> similarity score
    # (the original iterated rows with dict.iteritems(), so rows are dicts).
    clust_sent_sim = helper.similarity(self.clust_weight, self.sent_weight)
    clust_sent = []
    for sim in clust_sent_sim:
        # Sentence with the highest similarity to this cluster.
        # max(sim, key=sim.get) replaces the Python-2-only
        # max(sim.iteritems(), key=operator.itemgetter(1))[0]:
        # same result and tie-breaking (dict iteration order), but it
        # also runs on Python 3 and needs no `operator` import.
        max_index = max(sim, key=sim.get)
        # Keep each sentence at most once, preserving first-seen order.
        if max_index not in clust_sent:
            clust_sent.append(max_index)
    self.clust_sentences = clust_sent
def __init__(self,docs,num_clu): self.no_clusters = num_clu #self.sentences = preprocessing.load_sentences(docs) self.sentences = preprocessing.load_duc_xml(docs) self.sent_no_swords = preprocessing.remove_stopwords(self.sentences) #self.full_doc = helper.fulldoc(self.sentences) #self.sent_no_swords.append(self.full_doc) self.unique_terms = helper.uniqueterms(self.sent_no_swords) self.sent_weight = helper.tfisf(self.sent_no_swords,self.unique_terms) #self.sent_weight = helper.word_vector(self.sent_no_swords,self.unique_terms) self.sent_similarity = helper.similarity(self.sent_weight,self.sent_weight) self.clusters = cluster.kmedoid(self.sent_similarity,self.no_clusters)
def find_clust_similar_sent(self):
    """Find, for each cluster, the sentence most similar to that cluster.

    Reads self.clust_weight and self.sent_weight, compares them with
    helper.similarity, and stores the deduplicated list of winning
    sentence indices in self.clust_sentences.
    """
    # One row per cluster; each row maps sentence index -> similarity score
    # (the original iterated rows with dict.iteritems(), so rows are dicts).
    clust_sent_sim = helper.similarity(self.clust_weight, self.sent_weight)
    clust_sent = []
    for sim in clust_sent_sim:
        # Sentence with the highest similarity to this cluster.
        # max(sim, key=sim.get) replaces the Python-2-only
        # max(sim.iteritems(), key=operator.itemgetter(1))[0]:
        # same result and tie-breaking (dict iteration order), but it
        # also runs on Python 3 and needs no `operator` import.
        max_index = max(sim, key=sim.get)
        # Keep each sentence at most once, preserving first-seen order.
        if max_index not in clust_sent:
            clust_sent.append(max_index)
    self.clust_sentences = clust_sent
def get_most_similar(fv1, fv2):
    """Return (i, j, s): the index pair from fv1 x fv2 with the highest
    helper.similarity score s, scanning every pair exhaustively.

    If no pair scores above 0, returns (None, None, 0).
    """
    best_score = 0
    best_i, best_j = None, None
    for i, left in enumerate(fv1):
        for j, right in enumerate(fv2):
            score = helper.similarity(left, right)
            if score > best_score:
                best_i, best_j, best_score = i, j, score
    return best_i, best_j, best_score
def __init__(self,docs,num_clu):
    """Build the sentence-clustering pipeline, reporting progress on stdout.

    docs -- document source handed to preprocessing.load_sentences;
            presumably a path or file list -- TODO confirm against caller.
    num_clu -- number of clusters for k-medoid clustering.
    """
    self.no_clusters = num_clu
    print "Loading Sentences..."
    self.sentences = preprocessing.load_sentences(docs)
    print "Preprocessing..."
    self.sent_no_swords = preprocessing.remove_stopwords(self.sentences)
    self.unique_terms = helper.uniqueterms(self.sent_no_swords)
    # Per-sentence term weights (TF-ISF variant rather than raw word vectors).
    self.sent_weight = helper.tfisf(self.sent_no_swords,self.unique_terms)
    #self.sent_weight = helper.word_vector(self.sent_no_swords,self.unique_terms)
    print "Finding Similarity Graph..."
    # Pairwise sentence-to-sentence similarity used as the clustering input.
    self.sent_similarity = helper.similarity(self.sent_weight,self.sent_weight)
    print "Clustering..."
    self.clusters = cluster.kmedoid(self.sent_similarity,self.no_clusters)
# NOTE(review): the ''' below opens an unassigned triple-quoted string that
# comments out the duplicate __init__ that follows -- presumably intentional
# block-commenting; confirm before removing.
'''
def __init__(self, docs, num_clu): self.no_clusters = num_clu print "Loading Sentences..." self.sentences = preprocessing.load_sentences(docs) print "Preprocessing..." self.sent_no_swords = preprocessing.remove_stopwords(self.sentences) self.unique_terms = helper.uniqueterms(self.sent_no_swords) self.sent_weight = helper.tfisf(self.sent_no_swords, self.unique_terms) #self.sent_weight = helper.word_vector(self.sent_no_swords,self.unique_terms) print "Finding Similarity Graph..." self.sent_similarity = helper.similarity(self.sent_weight, self.sent_weight) print "Clustering..." self.clusters = cluster.kmedoid(self.sent_similarity, self.no_clusters) '''
def match_level(pv1, pv2, fv1, fv2):
    """Greedy one-to-one matching score between two sets of items.

    pv1, pv2 -- per-item position arrays (numpy) used to gate candidate
                pairs against the const.BG threshold.
    fv1, fv2 -- per-item feature vectors scored with helper.similarity.

    Returns the mean matrix score over the greedily matched pairs, or 0.0
    when no candidate pair exists (the original raised ZeroDivisionError
    in that case because `count` stayed 0).
    """
    ml = np.zeros((len(pv1), len(pv2)))
    for i in range(len(pv1)):
        for j in range(len(pv2)):
            # Gate: skip pairs whose positions differ by more than const.BG
            # in every component.
            if np.all(np.abs(pv1[i] - pv2[j]) > const.BG):
                continue
            # Map similarity into [0.5, 1] so gated-in pairs are nonzero.
            ml[i, j] = 0.5 + (0.5 * helper.similarity(fv1[i], fv2[j]))
    total = 0.0  # renamed from `sum`, which shadowed the builtin
    count = 0
    # Greedy matching: repeatedly take the global maximum entry, then zero
    # out its row and column so each item is matched at most once.
    while ml.any():  # equivalent to the original `ml.any() != 0`
        ind = np.argmax(ml)
        x, y = ind // len(pv2), ind % len(pv2)
        # Exactly 0.5 means the pair passed the gate but similarity was 0;
        # it is counted but contributes nothing (mirrors original behavior).
        if ml[x, y] != 0.5:
            total += ml[x, y]
        ml[x] = 0
        ml[:, y] = 0
        count += 1
    if count == 0:
        # Bug fix: no candidate pairs at all -> define the level as 0.0
        # instead of dividing by zero.
        return 0.0
    return total / count
def nivel_similitud(pv1, pv2, fv1, fv2):
    """Greedy one-to-one similarity level between two sets of items.

    pv1, pv2 -- per-item position arrays (numpy) used to gate candidate
                pairs against the const.BG threshold.
    fv1, fv2 -- per-item feature vectors scored with helper.similarity.

    Returns the mean matrix score over the greedily matched pairs, or 0.0
    when no candidate pair exists (the original raised ZeroDivisionError
    in that case because `count` stayed 0).
    """
    ml = np.zeros((len(pv1), len(pv2)))
    for i in range(len(pv1)):
        for j in range(len(pv2)):
            # Gate: skip pairs whose positions differ by more than const.BG
            # in every component.
            if np.all(np.abs(pv1[i] - pv2[j]) > const.BG):
                continue
            # Map similarity into [0.5, 1] so gated-in pairs are nonzero.
            ml[i, j] = 0.5 + (0.5 * helper.similarity(fv1[i], fv2[j]))
    total = 0.0  # renamed from `sum`, which shadowed the builtin
    count = 0
    # Greedy matching: repeatedly take the global maximum entry, then zero
    # out its row and column so each item is matched at most once.
    while ml.any():  # equivalent to the original `ml.any() != 0`
        ind = np.argmax(ml)
        x, y = ind // len(pv2), ind % len(pv2)
        # Exactly 0.5 means the pair passed the gate but similarity was 0;
        # it is counted but contributes nothing (mirrors original behavior).
        if ml[x, y] != 0.5:
            total += ml[x, y]
        ml[x] = 0
        ml[:, y] = 0
        count += 1
    if count == 0:
        # Bug fix: no candidate pairs at all -> define the level as 0.0
        # instead of dividing by zero.
        return 0.0
    return total / count