class EntityRelationRank(tk.Frame):
    """Tk front end that ranks the relations stored for a pair of entities.

    The user types two entity strings (Arg1 / Arg2) and presses Return.
    Matching documents are scanned from Elasticsearch, accumulated into a
    ``RelationSet`` and written to the text widget either as a flat ranking
    ("vanilla") or grouped by a clustering of the relation vectors taken
    from ``model`` ("DBSCAN" / "Kmeans").

    Parameters
    ----------
    parent : Tk root or toplevel that hosts the frame; ``<Return>`` is bound
        on it to :meth:`rank`.
    patternset, relationlist : passed through unchanged to ``RelationSet``.
    model : mapping from ``joinRelation(relation)`` to a feature vector, or
        ``''`` (the default) for "no model".  Without a model no hits are
        filtered and clustering is unavailable.
    """

    # Text tags cycled through so that consecutive clusters alternate colour.
    CLUSTER_TAGS = ('black', 'blue')
    # Initial value of the K-means cluster-count field (arbitrary start value).
    DEFAULT_NUM_CLUSTERS = 5

    def __init__(self, parent, patternset, relationlist, model=''):
        tk.Frame.__init__(self, parent)
        parent.bind("<Return>", self.rank)
        self.parent = parent
        self.patternset = patternset
        self.relationlist = relationlist
        # Placeholder until the first rank() call replaces it with a RelationSet.
        self.RelInstance = []
        self.norm = 1
        self.parent.title("EntityRelationRank")
        self.pack(fill="both", expand=True)

        frame = tk.Frame(self)
        frame.pack(fill="both", expand=True)

        self.state = tk.StringVar(frame)
        self.state.set("vanilla")
        self.rank_method = tk.StringVar(frame)
        self.rank_method.set("old")
        # rank() reads this for K-means; it was previously never created.
        self.num_cluster = tk.IntVar(frame)
        self.num_cluster.set(self.DEFAULT_NUM_CLUSTERS)
        self.customFont = tkFont.Font(family="Helvetica", size=20)

        tk.OptionMenu(frame, self.state, "vanilla", "MeanShift", "Kmeans",
                      "DBSCAN").place(x=970, y=10, width=100, height=30)
        tk.OptionMenu(frame, self.rank_method, "old", "new").place(
            x=970, y=50, width=100, height=30)

        self.label1 = tk.Label(frame, text="Arg1", font=self.customFont)
        self.label2 = tk.Label(frame, text="Arg2", font=self.customFont)
        self.label3 = tk.Label(frame, text="Factor", font=self.customFont)
        tk.Label(frame, text="eps").place(x=945, y=90)
        tk.Label(frame, text="min").place(x=945, y=130)
        tk.Label(frame, text="k").place(x=945, y=170)

        self.xscrollbar = tk.Scrollbar(frame, orient=tk.HORIZONTAL)
        self.xscrollbar.pack(side="bottom", fill="x")
        self.yscrollbar = tk.Scrollbar(frame)
        self.yscrollbar.pack(side="right", fill="y")

        self.entry1 = tk.Entry(frame, font=self.customFont)
        self.entry2 = tk.Entry(frame, font=self.customFont)
        self.entry3 = tk.Entry(frame, font=self.customFont)
        self.entry4 = tk.Entry(frame, font=self.customFont)
        self.entry5 = tk.Entry(frame, font=self.customFont)
        self.eps = tk.Entry(frame)
        self.eps.place(x=970, y=90, width=100, height=30)
        self.min_samples = tk.Entry(frame)
        self.min_samples.place(x=970, y=130, width=100, height=30)
        # Placed right of x=990 so it does not overlap the output widget.
        self.num_cluster_entry = tk.Entry(frame, textvariable=self.num_cluster)
        self.num_cluster_entry.place(x=1000, y=170, width=70, height=30)

        self.button = tk.Button(frame, text="OK", command=self.zoom_in)
        self.output = tk.Text(frame, width=200, font=self.customFont,
                              xscrollcommand=self.xscrollbar.set,
                              yscrollcommand=self.yscrollbar.set,
                              highlightbackground="black")
        self.output.tag_config("blue", foreground="cornflower blue", underline=0)
        self.output.tag_config("red", foreground="salmon", underline=0)
        self.output.tag_config("black", foreground="dim gray", underline=0)
        self.output.tag_config("green", foreground="sea green", underline=0)
        self.output.tag_config("brown", foreground="brown", underline=0)

        self.label1.place(x=10, y=10, height=50)
        self.entry1.place(x=10, y=70, width=300, height=50)
        self.label3.place(x=320, y=10, height=50)
        self.entry3.place(x=320, y=70, width=300, height=50)
        self.label2.place(x=630, y=10, height=50)
        self.entry2.place(x=630, y=70, width=300, height=50)
        self.entry4.place(x=10, y=130, width=300, height=50)
        self.entry5.place(x=320, y=130, width=300, height=50)
        self.button.place(x=630, y=130)
        self.output.place(x=10, y=190, width=980, height=550)
        self.xscrollbar.config(command=self.output.xview)
        self.yscrollbar.config(command=self.output.yview)

        self.model = model
        # NOTE(review): connection URL with credentials is hard-coded here;
        # consider moving it to configuration.
        self.es = Elasticsearch("http://*****:*****@192.17.58.145:9200")
        # NOTE(review): pickle executes arbitrary code on load; these files
        # must come from a trusted source.
        self.entity_type = self._load_pickle('entity_type.p')
        self.prior = self._load_pickle('prior_new.p')
        self.info = self._load_pickle('info.p')
        print("Complete Setup")

    @staticmethod
    def _load_pickle(path):
        """Unpickle *path*, closing the file handle afterwards."""
        with open(path, 'rb') as handle:
            return cPickle.load(handle)

    @staticmethod
    def _format_modifier(top):
        """Render a ``Counter.most_common(1)`` result as ``'(name, count)'``.

        The key's ``'@'``-separated parts are reversed and joined by spaces.
        An empty *top* renders as ``'(, )'``.
        """
        if top:
            key, count = top[0][0], top[0][1]
        else:
            key, count = '', ''
        name = ' '.join(key.split('@')[::-1])
        return '(' + name + ', ' + str(count) + ')'

    @classmethod
    def _cluster_tag(cls, index):
        """Return the text tag for the *index*-th displayed cluster."""
        return cls.CLUSTER_TAGS[index % len(cls.CLUSTER_TAGS)]

    def _show_message(self, text):
        """Print *text* and also append it to the output widget."""
        print(text)
        self.output.insert("end", text + '\n', "red")

    def _insert_relation_details(self, record):
        """Append the top modifiers (red) and top-10 entities (green) of *record*."""
        first = self._format_modifier(record.modifier1.most_common(1))
        second = self._format_modifier(record.modifier2.most_common(1))
        self.output.insert("end", first + ' ' + second + '\n', "red")
        for entity_count in record.ct_entity_cnt.most_common(10):
            self.output.insert("end", str(entity_count) + ' ', "green")

    def zoom_in(self):
        """Show the stored entries for the relation/entity typed in the
        second row of entry fields (callback of the OK button)."""
        relation = self.entry4.get()
        entity = self.entry5.get()
        self.output.delete("1.0", "end")

        # Before the first rank() call RelInstance is still the placeholder list.
        storage = getattr(self.RelInstance, 'storage', None)
        if storage is None:
            self._show_message("no ranking available yet")
            return
        try:
            record = storage[relation]
            elements = record.ct_entity[entity]
            sentences = record.ct_entity_st[entity]
        except KeyError:
            self._show_message("unknown relation or entity")
            return

        for ele in elements:
            self.output.insert("end", ele + '\n', "black")
        self.output.insert("end", '\n\n\n', "black")
        for st in sentences:
            self.output.insert("end", st + '\n\n', "blue")

    def rank(self, event=None):
        """Query Elasticsearch for Arg1/Arg2 and display the ranked relations.

        Bound to ``<Return>``; *event* is the Tk event and is not used.
        The display mode is taken from the first option menu and the scoring
        method from the second one.
        """
        print("start rank")
        self.output.delete("1.0", "end")
        arg1 = self.entry1.get()
        arg2 = self.entry2.get()

        # An empty Factor field keeps the previous normalisation value
        # instead of aborting the whole callback with a ValueError.
        factor_text = self.entry3.get().strip()
        if factor_text:
            try:
                self.norm = float(factor_text)
            except ValueError:
                self._show_message("Factor must be a number")
                return

        self.RelInstance = RelationSet(patternset=self.patternset,
                                       relationlist=self.relationlist)
        query = (Q("match", pentity1={"query": arg1, "operator": "and"})
                 & Q("match", pentity2={"query": arg2, "operator": "and"}))
        search = Search(using=self.es).query(query)

        use_new_score = self.rank_method.get() == 'new'
        prior = None
        if use_new_score:
            try:
                type1 = self.entity_type[arg1]
                type2 = self.entity_type[arg2]
                prior = self.prior[type1 + '-' + type2]
            except KeyError:
                print("cannot find type")
                return

        cnt = self._collect_hits(search, arg1, arg2)

        if use_new_score:
            print("update score")
            self.RelInstance.new_score(prior, self.norm, self.info)

        mode = self.state.get()
        if mode == 'vanilla':
            self._show_vanilla(cnt)
        elif mode in ('DBSCAN', 'Kmeans'):
            self._show_clusters(mode, cnt)
        else:
            # e.g. "MeanShift": offered in the menu but has no implementation.
            self._show_message("%s is not implemented" % mode)

    def _collect_hits(self, search, arg1, arg2):
        """Feed every scanned hit into ``self.RelInstance``; return the hit count.

        When a model is set, hits whose joined relation is not in the model
        are skipped and not counted.
        """
        restrict_to_model = self.model != ''
        relations = self.RelInstance
        cnt = 0
        for hit in search.scan():
            if restrict_to_model and joinRelation(hit.relation) not in self.model:
                continue
            if cnt % 500 == 0:
                print(cnt)  # progress indicator
            cnt += 1
            relations.update_relation(hit.relation, hit.confidence)
            relations.update_context_constt(hit.relation, hit.sentence,
                                            hit.phrase, hit.offset)
            relations.update_modifier(arg1, arg2, hit.relation, hit.head,
                                      hit.role, hit.offset, hit.postag,
                                      hit.sentence)
        return cnt

    def _show_vanilla(self, cnt):
        """Write the relations in ``sort_score`` order, one entry per relation."""
        print("vanilla")
        print("there are %d instances" % cnt)
        print("There are %d types of relations" % len(self.RelInstance.storage))
        for relation in self.RelInstance.sort_score(order=True):
            record = self.RelInstance.storage[relation]
            self.output.insert("end", relation + ' ' + str(record.score) + ' ',
                               "black")
            self._insert_relation_details(record)
            self.output.insert("end", '\n\n\n', "black")

    def _show_clusters(self, mode, cnt):
        """Cluster the scored relations with DBSCAN or K-means and display them."""
        print("there are %d instances" % cnt)
        if self.model == '':
            self._show_message("clustering requires a model")
            return

        items = self.RelInstance.get_scoreitems()
        if len(items) == 0:
            self._show_message("no relations to cluster")
            return
        weights = np.asarray(self.RelInstance.get_scores())
        # One row per scored relation, built in a single stack.
        features = np.vstack([self.model[joinRelation(item[0])]
                              for item in items])

        if mode == 'DBSCAN':
            # eps / min are only meaningful (and only required) for DBSCAN.
            try:
                input_eps = float(self.eps.get())
                input_min_samples = int(self.min_samples.get())
            except ValueError:
                self._show_message("eps and min must be numbers")
                return
            handle = DBSCAN(eps=input_eps, min_samples=input_min_samples)
            handle.fit(features, sample_weight=weights)
            # Label -1 marks noise and is not a cluster.
            n_clusters = len(set(handle.labels_) - {-1})
            print("there are %d clusters" % n_clusters)
        else:
            try:
                requested = int(self.num_cluster.get())
            except (ValueError, tk.TclError):
                self._show_message("k must be an integer")
                return
            # K-means needs 1 <= n_clusters <= number of samples.
            n_clusters = max(1, min(requested, len(items)))
            # Only arguments that are not plain defaults are kept, so the call
            # does not depend on parameters removed in newer scikit-learn.
            handle = KMeans(n_clusters=n_clusters, init='k-means++',
                            n_init=10, max_iter=300, tol=0.0001)
            handle.fit(features)
        labels = handle.labels_

        # Total weight per cluster decides the display order; noise is set aside.
        cluster_weight = [0] * n_clusters
        noise = []
        for item, label, weight in zip(items, labels, weights):
            if label == -1:
                noise.append(item)
            else:
                cluster_weight[label] += weight
        ranked_clusters = np.argsort(cluster_weight)[::-1]

        if mode == 'DBSCAN':
            core_sample = [[] for _ in range(n_clusters)]
            for i in handle.core_sample_indices_:
                core_sample[labels[i]].append(items[i])
            self._write_dbscan(ranked_clusters, core_sample, noise)
        else:
            cluster_content = [Counter() for _ in range(n_clusters)]
            for item, label in zip(items, labels):
                cluster_content[label][item[0]] += item[1]
            self._write_kmeans(ranked_clusters, cluster_content)

    def _write_dbscan(self, ranked_clusters, core_sample, noise):
        """Write each cluster's core samples (best score first), then the noise."""
        for position, cluster_index in enumerate(ranked_clusters):
            tag = self._cluster_tag(position)
            members = sorted(core_sample[cluster_index], key=itemgetter(1),
                             reverse=True)
            for ele in members:
                self.output.insert("end", ele[0] + ' ' + str(ele[1]) + ' ', tag)
                self._insert_relation_details(self.RelInstance.storage[ele[0]])
                self.output.insert("end", '\n', "black")
        for noise_ele in noise:
            self.output.insert(
                "end", noise_ele[0] + ' ' + str(noise_ele[1]) + ' NOISE\n',
                "black")

    def _write_kmeans(self, ranked_clusters, cluster_content):
        """Write the ten highest-scored relations of every cluster."""
        for position, cluster_index in enumerate(ranked_clusters):
            tag = self._cluster_tag(position)
            for ele in cluster_content[cluster_index].most_common(10):
                self.output.insert("end", ele[0] + ' ' + str(ele[1]) + '\n', tag)