Exemple #1
0
def getId2PoteryName():
    """Write a JSON mapping from each potery's id to its first sentence text.

    Iterates ``poteryManager.poteries``, takes ``sentences[0].content`` of
    every entry and hands the resulting dict to ``writeJson`` under the
    name ``'id2potery_name'``.
    """
    id2name = {}
    for item in poteryManager.poteries:
        id2name[item.id] = item.sentences[0].content
    writeJson('id2potery_name', id2name)


# getId2PoteryName()
def getSimCat(index=4, chunk_size=180000):
    """Assign every id in one chunk of the corpus to its most similar important id.

    The full id list from ``getAllIds()`` is cut into consecutive chunks of
    ``chunk_size`` ids; only chunk number ``index`` is processed. The
    resulting ``{id: important_id}`` mapping is written with ``writeJson``
    to ``'<index>.json'``.

    Args:
        index: Zero-based number of the chunk to process (default 4, the
            value that used to be hard-coded).
        chunk_size: Number of ids per chunk (default 180000, the value that
            used to be hard-coded).

    Raises:
        IndexError: If ``getImpIds`` returns an empty list.

    Rules, identical to the previous implementation:
      * an id that is itself important maps to itself;
      * an id without a vector in ``docvecs`` maps to the first important id;
      * otherwise the important id with the highest ``docvecs.similarity``
        wins, the earliest one on ties; important ids without a vector
        score -999 and therefore never win.
    """
    imp_ids = getImpIds(imp_ids_num)
    fallback_id = imp_ids[0]
    imp_ids_set = set(imp_ids)

    # Loop-invariant work: whether an important id has a vector does not
    # depend on the id being classified, so test it once instead of once
    # per (id, important id) pair.
    fallback_has_vec = fallback_id in docvecs
    comparable_imp_ids = [imp_id for imp_id in imp_ids if imp_id in docvecs]

    start = index * chunk_size
    ids = getAllIds()[start:start + chunk_size]

    ids2sim_imp = {}
    count = 0
    print('开始循环', len(ids), len(imp_ids))
    for _id in ids:
        if _id in imp_ids_set:
            ids2sim_imp[_id] = _id
            continue

        best_id = fallback_id
        if _id in docvecs:
            # -999 is the sentinel score for "no vector available".
            if fallback_has_vec:
                best_sim = docvecs.similarity(_id, fallback_id)
            else:
                best_sim = -999
            for imp_id in comparable_imp_ids:
                sim = docvecs.similarity(_id, imp_id)
                # Strict comparison keeps the earliest id on ties.
                if sim > best_sim:
                    best_sim = sim
                    best_id = imp_id
        ids2sim_imp[_id] = best_id

        # Progress report; only ids that needed a similarity search count.
        count += 1
        if count % 1000 == 0:
            print(count, len(ids), index)
    writeJson(str(index) + '.json', ids2sim_imp)
Exemple #3
0
# Script: shift every potery's stored 3-d vector by the 3-d position of the
# tree node belonging to its category, then save the result to 'vec.json'.
potery2vec_path = './data/poteryCat10000/potery2vec3d.json'
potery2vec = loadJson(potery2vec_path)
# NOTE: a snippet that used to be commented out here generated the file
# above by running tsne3d.fit_transform over the 128-d vectors of each
# category in cat2poteries and storing vec.tolist() per potery.

for index, imp_id in enumerate(imp_ids):
    poteries = cat2poteries[imp_id]
    # The node is looked up by the category's position in imp_ids, not by
    # imp_id itself.
    node = nodeCompany.get(index)
    for p_id in poteries:
        vec = np.array(potery2vec[p_id])
        # Known issue: some nodes have no vec3d (None), e.g. observed with
        # p_id=88592, imp_id=53171, index=831. Such poteries keep their
        # unshifted vector and the offending values are printed.
        try:
            vec += node.vec3d
        except (TypeError, ValueError):
            # TypeError: vec3d is None or has an incompatible dtype;
            # ValueError: shape mismatch. Anything else should propagate.
            print(p_id, vec, node.vec3d, imp_id, index)

        potery2vec[p_id] = vec.tolist()
writeJson('vec.json', potery2vec)
Exemple #4
0
def getVecs():
    """Compute a 3-d position for every potery from the category tree.

    Steps:
      1. Load the tree stored at ``cat_tree_path`` and compute node levels.
      2. Give every node a 128-d vector: the mean of the 128-d vectors of
         all poteries below it.
      3. For every node, embed its children's 128-d vectors with t-SNE into
         3-d offsets centred on the node and scaled by the cube root of the
         number of poteries below the node (plus one).
      4. Turn the per-parent offsets into absolute positions by walking the
         tree from the root downwards.
      5. For every leaf, embed its poteries with t-SNE around the leaf
         position and write ``{potery_id: [x, y, z]}`` to ``'vec.json'``.
    """

    class NodeCompany():
        """Registry that creates tree nodes on demand, keyed by string id."""

        def __init__(self):
            self.id2node = {}
            self.root_node = None

        def getAllNodes(self):
            """Return all known nodes in registration order."""
            return list(self.id2node.values())

        def get(self, _id):
            """Return the node for ``_id``, creating it if needed.

            ``None`` is passed through so that a missing parent stays None.
            """
            if _id is None:
                return None
            _id = str(_id)
            id2node = self.id2node
            if _id not in id2node:
                id2node[_id] = TreeNode(_id)
            return id2node[_id]

        def load(self):
            """Populate the registry from the JSON tree at ``cat_tree_path``."""
            tree_json = loadJson(cat_tree_path)
            for _id, node_data in tree_json.items():
                node = self.get(_id)
                node.parent = self.get(node_data['parent'])
                node.childs = [self.get(n_id) for n_id in node_data['childs']]
                node.potery_ids = node_data['potery_ids']
                node.is_leave = len(node.potery_ids) > 0
                # The node without a parent is the root (the last such node
                # wins if the file contains several).
                if node.parent is None:
                    self.root_node = node

        def getNodeAtLevel(self, level):
            """Return the nodes ``level`` steps below the root."""
            nodes = [self.root_node]
            for _ in range(level):
                children = []
                for node in nodes:
                    children += node.childs
                nodes = children
            return nodes

    class TreeNode:
        """One node of the category tree."""

        def __init__(self, _id):
            self.id = _id
            self.parent = None
            self.childs = []
            self.potery_ids = []
            self.is_leave = False  # True when the node carries poteries
            self.vec3d = None      # offset to parent, later absolute position
            self.vec128d = None    # mean 128-d vector of poteries below
            self.level = 1         # 1 for the root, +1 per ancestor

        def getAllChilds(self):
            """Return every descendant (children, grandchildren, ...)."""
            all_childs = list(self.childs)
            for child in self.childs:
                all_childs += child.getAllChilds()
            return all_childs

        def getLeaves(self):
            """Return the descendants that have no children."""
            return [node for node in self.getAllChilds()
                    if len(node.childs) == 0]

        def getParentId(self):
            """Return the parent's id, or None for a root."""
            if self.parent is not None:
                return self.parent.id
            return None

        def getAllPoteries(self):
            """Return the potery ids of this node or of all leaves below it."""
            if self.is_leave:
                return self.potery_ids
            poteries = []
            for leaf in self.getLeaves():
                poteries += leaf.potery_ids
            return poteries

    nodeManager = NodeCompany()
    nodeManager.load()
    root_node = nodeManager.root_node

    def getCenterVec(vecs):
        """Return the component-wise mean of a sequence of vectors."""
        return np.array(vecs).mean(0)

    print(root_node, root_node.id, len(root_node.getAllPoteries()),
          len(root_node.getAllChilds()))
    print('第一步')
    all_nodes = nodeManager.getAllNodes()
    # Step 1: level = 1 + number of ancestors.
    for node in all_nodes:
        point = node
        while point.parent is not None:
            node.level += 1
            point = point.parent

    print('第二步')
    # Step 2: 128-d centre of all poteries below each node.
    for node in all_nodes:
        poteries = node.getAllPoteries()
        vecs = [getPoteryVec128d(p_id) for p_id in poteries]
        node.vec128d = getCenterVec(vecs)

    tsne3d = TSNE(n_components=3, n_iter=300, learning_rate=250, perplexity=10)
    root_node.vec128d = np.zeros(128)
    root_node.vec3d = np.zeros(3)
    print('第三步')
    # Step 3: 3-d offsets of the children relative to their parent.
    for node in all_nodes:
        childs = node.childs
        if len(childs) == 0:
            continue
        if len(childs) == 1:
            # A single child sits exactly on its parent.
            childs[0].vec3d = np.zeros(3)
            continue
        vecs = [child.vec128d for child in childs]
        vecs = tsne3d.fit_transform(vecs)
        center_vec = getCenterVec(vecs)

        # Spread grows with the cube root of the subtree's potery count.
        size = pow(len(node.getAllPoteries()) + 1, 1 / 3)
        for index, child in enumerate(childs):
            child.vec3d = (vecs[index] - center_vec) * size

    print('第四步')
    # Step 4: accumulate offsets into absolute positions, level by level
    # from the root. Adding every ancestor to every descendant in arbitrary
    # node order (as done previously) counts an ancestor twice whenever it
    # is processed before one of the nodes between it and a descendant.
    frontier = [root_node]
    while frontier:
        next_frontier = []
        for node in frontier:
            for child in node.childs:
                # node.vec3d is already absolute here.
                child.vec3d = child.vec3d + node.vec3d
                next_frontier.append(child)
        frontier = next_frontier

    pid2vec = {}
    for leaf in root_node.getLeaves():
        potery_ids = leaf.potery_ids
        if len(potery_ids) == 0:
            # Nothing to place; fit_transform would fail on empty input.
            continue
        if len(potery_ids) == 1:
            # A single potery sits exactly on its leaf (same formula as
            # below with a zero offset), not at the global origin.
            pid2vec[potery_ids[0]] = np.array(leaf.vec3d)
            continue

        vecs = [getPoteryVec128d(p_id) for p_id in potery_ids]
        vecs = tsne3d.fit_transform(vecs)
        center_vec = getCenterVec(vecs)
        for index, p_id in enumerate(potery_ids):
            pid2vec[p_id] = vecs[index] - center_vec + leaf.vec3d

    print('第五步')
    # Step 5: numpy arrays are not JSON-serialisable; convert to lists.
    for p_id in pid2vec:
        pid2vec[p_id] = pid2vec[p_id].tolist()

    writeJson('vec.json', pid2vec)
Exemple #5
0
def getCatTree():
    """Build a category tree by recursive KMeans and save it as JSON.

    Starting from all ids returned by ``getAllIds()``, every group holding
    at least ``max_levae_num`` (60) ids is split with KMeans into up to
    ``max_levae_num - 2`` clusters, each of which becomes a child node and
    is split again in the same way. Smaller groups become leaves that keep
    their ids. The tree is written to ``cat_tree_path`` as
    ``{node_id: {'childs': [...], 'parent': id or None, 'potery_ids': [...]}}``.

    Node ids are taken from the module-level counter ``count``, which must
    exist before this function is called and is advanced by one per node.
    """
    id2node = {}
    max_levae_num = 60

    class TreeNode(object):
        """A node that recursively splits its ids on construction."""

        def __init__(self, potery_ids, parent):
            global count
            # The id is taken before the children are built, so a parent's
            # id is smaller than the ids of its descendants.
            self.id = str(count)
            count += 1
            self.parent = parent
            self.childs = []
            self.potery_ids = potery_ids
            self.loadChilds()

            id2node[self.id] = self
            # Only nodes without children keep their ids.
            if self.childs:
                self.potery_ids = []

        def getParentId(self):
            """Return the parent's id, or None for the root."""
            if self.parent is not None:
                return self.parent.id
            return None

        def isLeave(self):
            """Return True when the node holds fewer ids than the split threshold."""
            return len(self.potery_ids) < max_levae_num

        def loadChilds(self):
            """Cluster this node's ids and create one child per cluster."""
            potery_ids = self.potery_ids
            if self.isLeave():
                return

            vecs = [getPoteryVec128d(_id) for _id in potery_ids]
            # NOTE(review): newer scikit-learn releases no longer accept
            # n_jobs for KMeans — confirm against the pinned version.
            labels = KMeans(n_clusters=max_levae_num - 2,
                            max_iter=2000,
                            n_jobs=-1).fit_predict(vecs)

            # Group ids by cluster label, keeping first-seen label order.
            label2id = {}
            for p_id, label in zip(potery_ids, labels):
                label2id.setdefault(str(label), []).append(p_id)

            # If everything landed in a single cluster the child would get
            # exactly the same ids and recurse forever; keep this node as
            # an (oversized) leaf instead.
            if len(label2id) < 2:
                return

            self.childs = [TreeNode(group, self)
                           for group in label2id.values()]

    ids = getAllIds()
    TreeNode(ids, None)

    id2dict = {}
    for _id, node in id2node.items():
        id2dict[_id] = {
            'childs': [child.id for child in node.childs],
            'parent': node.getParentId(),
            # Already emptied for every node that has children.
            'potery_ids': node.potery_ids,
        }
    writeJson(cat_tree_path, id2dict)
Exemple #6
0
def getId2simpPotery():
    """Write a JSON mapping from each potery's id to its ``getSimpJson()`` value.

    Iterates ``poteryManager.poteries`` and hands the resulting dict to
    ``writeJson`` under the name ``'simp_potery'``.
    """
    id2simp = {}
    for item in poteryManager.poteries:
        id2simp[item.id] = item.getSimpJson()
    writeJson('simp_potery', id2simp)
def getImpCat():
    """Cluster the important ids hierarchically and save the merge tree.

    Looks up the ``docvecs`` vector of every id returned by
    ``getImpIds(imp_ids_num)``, runs Ward linkage on their pairwise
    Euclidean distances and writes the resulting binary tree to
    ``tree_path`` as
    ``{node_id: {'child_num': int, 'parent': id or None, 'childs': [id, id]}}``.

    Node ids follow the linkage convention: ``0 .. n-1`` are the positions
    of the input ids in the list returned by ``getImpIds`` and ``n + i`` is
    the cluster created by the i-th merge.
    """
    ids = getImpIds(imp_ids_num)
    vecs = [docvecs[_id] for _id in ids]

    print(len(vecs), len(vecs[0]))
    disMat = distance.pdist(vecs, 'euclidean')

    # Ward clustering on the pre-computed condensed distance matrix.
    print('开始计算')
    linkage_matrix = linkage(disMat, method='ward', optimal_ordering=True)

    def getTree(linkage_matrix):
        """Convert a linkage matrix into a dict of linked TreeNode objects."""

        class TreeNode(object):
            def __init__(self, _id):
                self.id = _id
                self.parent = None
                # A list (not a set) so the serialised child order is the
                # merge order and therefore reproducible between runs.
                self.childs = []
                self.child_num = 0  # observations below this node

        class NodeCompany():
            """Registry that creates nodes on first access."""

            def __init__(self):
                self.id2node = {}

            def get(self, _id):
                id2node = self.id2node
                if _id not in id2node:
                    id2node[_id] = TreeNode(_id)
                return id2node[_id]

        nodeCompany = NodeCompany()
        rows = linkage_matrix.tolist()

        # A linkage matrix over n observations has n - 1 rows. Deriving n
        # from the matrix keeps merged-cluster ids correct even if
        # getImpIds returned a different number of ids than imp_ids_num.
        leaf_count = len(rows) + 1
        for step, row in enumerate(rows):
            # Columns: merged cluster a, merged cluster b, distance, size.
            node1 = nodeCompany.get(int(row[0]))
            node2 = nodeCompany.get(int(row[1]))
            parent_node = nodeCompany.get(leaf_count + step)
            node1.parent = parent_node
            node2.parent = parent_node
            parent_node.childs.append(node1)
            parent_node.childs.append(node2)
            parent_node.child_num = int(row[3])
        return nodeCompany.id2node

    tree = getTree(linkage_matrix)
    result = {}
    for _id, item in tree.items():
        result[_id] = {
            'child_num': item.child_num,
            'parent': None if item.parent is None else item.parent.id,
            'childs': [child.id for child in item.childs],
        }
    writeJson(tree_path, result)