def constructOrderedHBF(ahbf, dataset, forest_name='cora'):
    """Build an HBForest whose layers and nodes are ordered by size.

    Args:
        ahbf: nested dict of the form {attribute: {value: {synonym: [record ids]}}}.
        dataset: iterable of records exposing an ``id`` attribute.
        forest_name: name passed to the HBForest constructor (default keeps
            the original hard-coded ``'cora'`` behavior).

    Returns:
        The populated, ordered HBForest.
    """
    record_ids = [item.id for item in dataset]
    attributes = ahbf.keys()  # attribute names
    print(attributes)
    orderhbf = HBForest(forest_name)
    orderhbf.set_dataset_records_ids(record_ids)
    layer_order_dict = {}
    for attra in attributes:
        print(attra)
        values = ahbf[attra].keys()  # values for attribute attra
        attra_nodesize_dict = {}
        recordids_has_attra = []
        for value in values:
            print(" ", value)
            syns = ahbf[attra][value].keys()
            for syn in syns:
                print(" ", syn, ":", ahbf[attra][value][syn])
                orderhbf.add_hbflayer(attra, value, syn, ahbf[attra][value][syn])
                recordids_has_attra.extend(ahbf[attra][value][syn])
            # Node size is queried once per value, after all its synonyms
            # have been added.
            attra_nodesize_dict[value] = orderhbf.get_single_layer(
                attra).get_single_child(value).get_node_size()
        # Rank this attribute's value nodes by descending node size.
        list_sort_value_desc = precluster.sort_dict(attra_nodesize_dict)
        order = 1
        ordered_nodes_dict = {}
        for item in list_sort_value_desc:
            orderhbf.get_single_layer(attra).get_single_child(
                item[0]).set_order(order)
            ordered_nodes_dict[order] = item[0]
            order = order + 1
        # A trailing 'NULL' node collects every record that has no value
        # for this attribute.
        ordered_nodes_dict[order] = 'NULL'
        orderhbf.add_hbflayer(
            attra, 'NULL', 'NULL',
            list(set(record_ids).difference(set(recordids_has_attra))))
        orderhbf.get_single_layer(attra).set_ordered_nodes_dict(
            ordered_nodes_dict)
        # Layer weight = size of the attribute's largest value node.
        # Guard against attributes with no values at all (the original
        # raised IndexError here).
        layer_order_dict[attra] = (
            list_sort_value_desc[0][1] if list_sort_value_desc else 0)
    # Assign layer levels by descending layer weight.
    layers = precluster.sort_dict(layer_order_dict)
    level = 1
    order_layers = {}
    for item in layers:
        orderhbf.get_single_layer(item[0]).set_level(level)
        order_layers[level] = item[0]
        level = level + 1
    orderhbf.set_orderlayers_dict(order_layers)
    return orderhbf
def searchmh(kw,seedid,num):
    """Return up to ``num`` Cora records most similar to the seed record.

    Similarity is MinHash Jaccard (``precluster.mhJC``) between the seed's
    text and every record's text, computed over the whole ``text`` string
    (not split into tokens — unlike the later redefinition).

    NOTE(review): this definition is shadowed by a second ``searchmh``
    defined later in this file, so at import time this version is dead
    code — confirm which one is intended to win.
    """
    print(kw)
    # ``kw`` is expected to be a comma-separated keyword string, but the
    # keyword filter below is commented out, so all records are scanned.
    kws= kw.split(',')
    print('searchmh---------------------------------------')
    print(kws)
    # if len(kws) == 3:
    #     print("kws:",len(kws))
    #     coras = Cora.objects.filter(Q(text__contains=kws[0])|Q(text__contains=kws[1])|Q(text__contains=kws[2]))
    # else:
    #     coras = Cora.objects.all()
    coras = Cora.objects.all()
    dic = {}
    ss = []
    print("seed text")
    print(Cora.objects.get(id=seedid).text)
    # MinHash signature of the seed record's text.
    mhfocused = precluster.mh(Cora.objects.get(id=seedid).text)
    for ca in coras:
        mhca = precluster.mh(ca.text)
        # record id -> Jaccard similarity with the seed
        dic[ca.id] = precluster.mhJC(mhfocused, mhca)
        # print(dic[ca.id])
    list_sort_value_desc = precluster.sort_dict(dic)
    # Cap the result list at ``num`` entries.
    if len(list_sort_value_desc) <= num:
        al = list_sort_value_desc
    else:
        al = list_sort_value_desc[:num]
    for d in al:
        print(d)
        print(d[0])
        a = Cora.objects.get(id=d[0])
        print(a)
        ss += [a]
    return ss
def searchmh(kw, seedid, num):
    """Return up to ``num`` Cora records most similar to the seed record,
    pre-filtered by comma-separated keywords.

    Similarity is MinHash Jaccard (``precluster.mhJC``) over the
    space-tokenized record text.

    NOTE(review): this redefinition shadows the earlier ``searchmh`` in
    this file.

    Args:
        kw: comma-separated keyword string; records matching any keyword
            (case-insensitive substring) are candidates.
        seedid: id of the seed ``Cora`` record.
        num: maximum number of results.
    """
    # Build one OR-filter over however many keywords were supplied.  The
    # original indexed kws[0..2] directly and raised IndexError when fewer
    # than three keywords were given.
    kws = [k for k in kw.split(',') if k]
    if kws:
        keyword_q = Q(text__icontains=kws[0])
        for k in kws[1:]:
            keyword_q |= Q(text__icontains=k)
        coras = Cora.objects.filter(keyword_q)
    else:
        # No keywords: an empty icontains matched everything in the
        # original, so scan all records.
        coras = Cora.objects.all()
    dic = {}
    ss = []
    mhfocused = precluster.mh(Cora.objects.get(id=seedid).text.split(' '))
    for ca in coras:
        mhca = precluster.mh(ca.text.split(' '))
        dic[ca.id] = precluster.mhJC(mhfocused, mhca)
    list_sort_value_desc = precluster.sort_dict(dic)
    # Skip the top-ranked entry — presumably the seed record itself, which
    # scores highest against itself.  TODO confirm this is intentional.
    for d in list_sort_value_desc[1:1 + num]:
        a = Cora.objects.get(id=d[0])
        ss += [a]
    return ss
def fulfillRecordBufferPool(clustdict, BP_size):
    """Fill the record buffer pool with one representative record from each
    of the ``BP_size`` largest clusters.

    Args:
        clustdict: mapping of cluster id -> list of records.
        BP_size: maximum number of representatives to return.

    Returns:
        List of the first record of each selected cluster, largest
        clusters first.
    """
    # Map cluster id -> cluster size.  Empty clusters are skipped, which
    # matches the original (its inner loop body never ran for them) and
    # avoids an IndexError on clustdict[k][0] below.  The original also
    # re-assigned len(v) once per member — an accidental O(n*m) loop.
    cluster_stats = {k: len(v) for k, v in clustdict.items() if v}
    list_sort_value_desc = precluster.sort_dict(cluster_stats)
    BP = []
    for k, v in list_sort_value_desc[0:BP_size]:
        print(k, v)
        # First record of the cluster acts as its representative.
        BP.append(clustdict[k][0])
    return BP
def IG(data, clusterids):
    """Rank vocabulary terms by mutual information with cluster labels.

    Args:
        data: iterable of raw text documents.
        clusterids: cluster label per document.

    Returns:
        List of (term, mutual_information) pairs as ordered by
        ``precluster.sort_dict`` (descending score).
    """
    cv = CountVectorizer(max_df=0.95, min_df=2,
                         max_features=10000, stop_words='english')
    X_vec = cv.fit_transform(data)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when available so the code works on both old and new
    # versions.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()
    res = dict(
        zip(feature_names,
            mutual_info_classif(X_vec, clusterids, discrete_features=True)))
    print(res)
    list_sort_value_desc = precluster.sort_dict(res)
    print(list_sort_value_desc[0:20])
    # for k,v in list_sort_value_desc:
    #     print(k,v)
    #     models.sigirAttrExploration.objects.create(substring=k,orderscore=v)
    return list_sort_value_desc
def findNearestClusters(focusedentityid, num):
    """Find the ``num`` cluster canonical representations closest to the
    focused entity.

    Closeness is MinHash Jaccard (``precluster.mhJC``) between the focused
    Cora record's tokenized text and each stored canonical representation's
    tokenized text.

    Returns:
        List of dicts ``{"clusterid": ..., "can_rep": ...}`` ordered by
        ``precluster.sort_dict``.
    """
    focus_sig = precluster.mh(
        Cora.objects.get(id=focusedentityid).text.split(' '))
    # Similarity of every stored canonical representation to the focus.
    # ``canonrep`` is a stringified dict; ast.literal_eval restores it.
    similarities = {
        rep.id: precluster.mhJC(
            focus_sig,
            precluster.mh(ast.literal_eval(rep.canonrep)['text'].split(' ')))
        for rep in models.clusterCanonicalRepresentation.objects.all()
    }
    ranked = precluster.sort_dict(similarities)
    results = []
    for rep_id, _score in ranked[:num]:
        rep = models.clusterCanonicalRepresentation.objects.get(id=rep_id)
        canon = ast.literal_eval(rep.canonrep)
        results.append({"clusterid": rep.clusterid, "can_rep": canon['text']})
    return results
def findSimilarEntityes(focusedentityid, num):
    """Return the ``num`` Cora records most similar to the focused entity.

    If the entity is already mapped to an attribute entity
    (``CoraToAttrEntity`` row exists) no search is performed and an empty
    list is returned.  Otherwise similarity is MinHash Jaccard
    (``precluster.mhJC``) over space-tokenized record text.

    NOTE(review): name keeps the original spelling ("Entityes") — callers
    depend on it.
    """
    already_mapped = models.CoraToAttrEntity.objects.filter(
        cora_id=focusedentityid)
    if already_mapped:
        # Entity already handled: keep the original (empty) debug print
        # and return no results.
        print('')
        return []
    focus_sig = precluster.mh(
        Cora.objects.get(id=focusedentityid).text.split(' '))
    scores = {}
    for record in Cora.objects.all():
        scores[record.id] = precluster.mhJC(
            focus_sig, precluster.mh(record.text.split(' ')))
        print(scores[record.id])
    ranked = precluster.sort_dict(scores)
    print(ranked[15:15 + num])
    results = []
    for entry in ranked[0:0 + num]:
        print(entry)
        print(entry[0])
        record = Cora.objects.get(id=entry[0])
        print(record)
        results.append(record)
    return results