Esempio n. 1
0
def constructOrderedHBF(ahbf, dataset):
    """Build an ``HBForest`` named 'cora' with ranked nodes and ranked layers.

    Args:
        ahbf: Nested mapping ``attribute -> value -> synonym -> record ids``.
            The innermost object is handed to ``add_hbflayer`` and to
            ``list.extend``, so it must be iterable.
        dataset: Iterable of objects exposing an ``id`` attribute.

    Returns:
        The populated ``HBForest``. For every attribute, each value node gets
        an order (1-based position in ``precluster.sort_dict`` of the node
        sizes) and an extra 'NULL' node collects the record ids that appear
        under no value of that attribute. Layers get a 1-based level from
        ``precluster.sort_dict`` of the per-attribute top score.

    NOTE(review): ``precluster.sort_dict`` presumably returns ``(key, value)``
    pairs sorted by value in descending order -- confirm. An attribute with no
    values would make ``ranked_values[0]`` fail.
    """
    record_ids = [item.id for item in dataset]

    attributes = ahbf.keys()  # attribute names
    print(attributes)

    forest = HBForest('cora')
    forest.set_dataset_records_ids(record_ids)
    top_score_by_attribute = {}

    for attra in attributes:
        print(attra)
        value_map = ahbf[attra]
        node_sizes = {}
        covered_ids = []

        # Register every (value, synonym) pair, then read back the node size.
        for value in value_map.keys():
            print(" ", value)
            synonym_map = value_map[value]
            for syn in synonym_map.keys():
                members = synonym_map[syn]
                print("     ", syn, ":", members)
                forest.add_hbflayer(attra, value, syn, members)
                covered_ids.extend(members)
            node = forest.get_single_layer(attra).get_single_child(value)
            node_sizes[value] = node.get_node_size()

        # Rank the value nodes of this attribute, starting at 1.
        ranked_values = precluster.sort_dict(node_sizes)
        ordered_nodes = {}
        for rank, entry in enumerate(ranked_values, start=1):
            child = forest.get_single_layer(attra).get_single_child(entry[0])
            child.set_order(rank)
            ordered_nodes[rank] = entry[0]
        # The 'NULL' node always takes the position after the last ranked one.
        ordered_nodes[len(ordered_nodes) + 1] = 'NULL'

        # Records that carry no value for this attribute go under 'NULL'.
        uncovered_ids = list(set(record_ids) - set(covered_ids))
        forest.add_hbflayer(attra, 'NULL', 'NULL', uncovered_ids)
        forest.get_single_layer(attra).set_ordered_nodes_dict(ordered_nodes)

        # Second element of the first sorted entry is the layer's sort key.
        top_score_by_attribute[attra] = ranked_values[0][1]

    ordered_layers = {}
    ranked_layers = precluster.sort_dict(top_score_by_attribute)
    for level, entry in enumerate(ranked_layers, start=1):
        forest.get_single_layer(entry[0]).set_level(level)
        ordered_layers[level] = entry[0]
    forest.set_orderlayers_dict(ordered_layers)
    return forest
Esempio n. 2
0
def searchmh(kw, seedid, num):
    """Return up to ``num`` Cora records ranked by similarity to a seed record.

    Every Cora row is scored against the seed with ``precluster.mh`` and
    ``precluster.mhJC`` (presumably a MinHash signature and a Jaccard
    estimate -- confirm), and the scores are ordered with
    ``precluster.sort_dict``.

    Args:
        kw: Comma-separated keyword string. It is only split and printed;
            the candidate set is always all Cora rows.
        seedid: ``id`` of the Cora record to compare against.
        num: Maximum number of records to return.

    Returns:
        List of Cora instances taken from the start of the sorted scores.
        The seed is not excluded from the candidates.
    """
    print(kw)
    kws = kw.split(',')
    print('searchmh---------------------------------------')
    print(kws)

    print("seed text")
    # Fetch the seed once and reuse it for both the log line and the
    # signature instead of querying the same row twice.
    seed_text = Cora.objects.get(id=seedid).text
    print(seed_text)
    mhfocused = precluster.mh(seed_text)

    dic = {}
    for ca in Cora.objects.all():
        dic[ca.id] = precluster.mhJC(mhfocused, precluster.mh(ca.text))
    list_sort_value_desc = precluster.sort_dict(dic)

    ss = []
    # Slicing already copes with lists shorter than ``num``.
    for d in list_sort_value_desc[:num]:
        print(d)
        print(d[0])
        a = Cora.objects.get(id=d[0])
        print(a)
        ss.append(a)
    return ss
Esempio n. 3
0
def searchmh(kw, seedid, num):
    """Return Cora records matching the keywords, ranked against a seed.

    Candidates are the Cora rows whose ``text`` contains (case-insensitively)
    at least one of the first three comma-separated keywords. Each candidate
    is scored against the seed with ``precluster.mh`` / ``precluster.mhJC``
    on space-split tokens, and the scores are ordered with
    ``precluster.sort_dict``.

    Args:
        kw: Comma-separated keywords. One, two or three keywords are all
            accepted; keywords after the third are ignored, as before.
        seedid: ``id`` of the Cora record to compare against.
        num: Maximum number of records to return.

    Returns:
        List of Cora instances for sorted positions ``1 .. num``.
    """
    # Build the OR-filter from however many keywords were supplied (capped at
    # three) rather than indexing kws[0..2], which raised IndexError for
    # shorter inputs. ``str.split`` always yields at least one element.
    keywords = kw.split(',')[:3]
    keyword_filter = Q(text__icontains=keywords[0])
    for keyword in keywords[1:]:
        keyword_filter |= Q(text__icontains=keyword)
    coras = Cora.objects.filter(keyword_filter)

    dic = {}
    ss = []
    mhfocused = precluster.mh(Cora.objects.get(id=seedid).text.split(' '))
    for ca in coras:
        mhca = precluster.mh(ca.text.split(' '))
        dic[ca.id] = precluster.mhJC(mhfocused, mhca)
        print(dic[ca.id])
    list_sort_value_desc = precluster.sort_dict(dic)
    # NOTE(review): debug output of an unrelated window (15 .. 15 + num);
    # kept so the printed output is unchanged.
    print(list_sort_value_desc[15:15 + num])

    # The first sorted entry is skipped -- presumably it is the seed itself.
    # NOTE(review): that only holds when the seed matches the keyword filter.
    for d in list_sort_value_desc[1:1 + num]:
        print(d)
        print(d[0])
        a = Cora.objects.get(id=d[0])
        print(a)
        ss.append(a)
    return ss
Esempio n. 4
0
def fulfillRecordBufferPool(clustdict, BP_size):
    """Pick one representative record from each of the top-ranked clusters.

    Args:
        clustdict: Mapping ``cluster key -> sequence of records``.
        BP_size: Number of clusters to take from the start of the ranking.

    Returns:
        List holding the first record of each selected cluster, in the order
        produced by ``precluster.sort_dict`` on the cluster sizes (presumably
        largest first -- confirm against the helper).
    """
    # One pass is enough: the size does not change per member. Empty clusters
    # stay out of the ranking, as before, since they have no first record.
    cluster_stats = {
        key: len(members)
        for key, members in clustdict.items()
        if len(members) > 0
    }

    list_sort_value_desc = precluster.sort_dict(cluster_stats)
    BP = []
    for k, v in list_sort_value_desc[0:BP_size]:
        print(k, v)
        BP.append(clustdict[k][0])
    return BP
Esempio n. 5
0
def IG(data, clusterids):
    """Score vocabulary terms by mutual information with the cluster labels.

    The documents are turned into term counts with ``CountVectorizer``
    (English stop words removed, terms in fewer than 2 documents or in more
    than 95% of documents dropped, at most 10000 features), and each term is
    scored with ``mutual_info_classif`` treating the counts as discrete.

    Args:
        data: Iterable of raw text documents.
        clusterids: Cluster label for each document, aligned with ``data``.

    Returns:
        The result of ``precluster.sort_dict`` applied to the
        ``term -> score`` mapping (presumably ``(term, score)`` pairs sorted
        by score in descending order -- confirm against the helper).
    """
    cv = CountVectorizer(max_df=0.95,
                         min_df=2,
                         max_features=10000,
                         stop_words='english')
    X_vec = cv.fit_transform(data)

    # scikit-learn 1.0 deprecated get_feature_names() and 1.2 removed it in
    # favour of get_feature_names_out(); support both so the function works
    # on old and new installations.
    if hasattr(cv, 'get_feature_names_out'):
        terms = cv.get_feature_names_out()
    else:
        terms = cv.get_feature_names()

    scores = mutual_info_classif(X_vec, clusterids, discrete_features=True)
    res = dict(zip(terms, scores))
    print(res)
    list_sort_value_desc = precluster.sort_dict(res)
    print(list_sort_value_desc[0:20])
    return list_sort_value_desc
Esempio n. 6
0
def findNearestClusters(focusedentityid, num):
    """Return up to ``num`` cluster representations ranked against an entity.

    The text of the focused Cora record is compared with the ``'text'`` entry
    of every cluster's canonical representation (stored as a Python-literal
    string and parsed with ``ast.literal_eval``). Both sides are split on
    spaces and scored with ``precluster.mh`` / ``precluster.mhJC``.

    Args:
        focusedentityid: ``id`` of the Cora record to compare against.
        num: Maximum number of clusters to return.

    Returns:
        List of ``{"clusterid": ..., "can_rep": ...}`` dicts taken from the
        start of ``precluster.sort_dict`` applied to the scores.
    """
    representations = models.clusterCanonicalRepresentation.objects.all()
    focused_tokens = Cora.objects.get(id=focusedentityid).text.split(' ')
    focused_signature = precluster.mh(focused_tokens)

    score_by_id = {}
    for rep in representations:
        parsed = ast.literal_eval(rep.canonrep)
        rep_signature = precluster.mh(parsed['text'].split(' '))
        score_by_id[rep.id] = precluster.mhJC(focused_signature, rep_signature)
    ranked = precluster.sort_dict(score_by_id)

    nearest = []
    for entry in ranked[:num]:
        match = models.clusterCanonicalRepresentation.objects.get(id=entry[0])
        parsed = ast.literal_eval(match.canonrep)
        nearest.append({"clusterid": match.clusterid,
                        "can_rep": parsed['text']})
    return nearest
Esempio n. 7
0
def findSimilarEntityes(focusedentityid, num):
    """Return up to ``num`` Cora records ranked against the focused record.

    When a ``CoraToAttrEntity`` row already exists for the focused record,
    nothing is computed: an empty line is printed and an empty list is
    returned. Otherwise every Cora row is scored against the focused record
    with ``precluster.mh`` / ``precluster.mhJC`` on space-split tokens.

    Args:
        focusedentityid: ``id`` of the Cora record to compare against.
        num: Maximum number of records to return.

    Returns:
        List of Cora instances taken from the start of
        ``precluster.sort_dict`` applied to the scores. The focused record
        itself is not excluded.
    """
    existing_links = models.CoraToAttrEntity.objects.filter(
        cora_id=focusedentityid)
    if existing_links:
        print('')
        return []

    focused_tokens = Cora.objects.get(id=focusedentityid).text.split(' ')
    focused_signature = precluster.mh(focused_tokens)

    score_by_id = {}
    for record in Cora.objects.all():
        record_signature = precluster.mh(record.text.split(' '))
        score = precluster.mhJC(focused_signature, record_signature)
        score_by_id[record.id] = score
        print(score)
    ranked = precluster.sort_dict(score_by_id)
    print(ranked[15:15 + num])

    similar = []
    for entry in ranked[:num]:
        print(entry)
        print(entry[0])
        match = Cora.objects.get(id=entry[0])
        print(match)
        similar.append(match)
    return similar