Example #1
0
def tagged_paths_to_tree(path_list, base_path=DEFAULT_BASE_PATH, taggers=None):
    """
    Build a tree of filesystem paths whose node labels carry tags.

    Same as ``paths_to_tree``, but every newly created node has the tags
    produced by ``taggers`` appended to its label as `` (tag1, tag2)``.

    :param path_list: iterable of paths; each one is joined onto
        ``base_path`` and resolved before being inserted.
    :param base_path: directory that relative entries are resolved against.
    :param taggers: optional iterable of callables in the format

        def tagger(path: str) -> List[str]:
            ...

        Each tagger receives the node identifier built by ``joinpath`` and
        returns the tags for that path. ``None`` (the default) means no
        taggers.
    :return: the populated ``Tree``; its root is the filesystem root.
    """
    # A literal ``[]`` default would be one list shared by every call.
    taggers = () if taggers is None else taggers

    def tag_path(path):
        """Return the label suffix for ``path`` ("" when it has no tags)."""
        tags = []
        for func in taggers:
            tags.extend(func(path))
        if not tags:
            return ""
        return " ({tags})".format(tags=", ".join(tags))

    tree = Tree()
    root = Path().resolve().root
    tree.create_node(root, root)

    base_path = Path(base_path).resolve()
    for path in path_list:
        path = base_path.joinpath(path).resolve()
        # A path with fewer than two parts has nothing below the root to add.
        if len(path.parts) < 2:
            continue
        parent = path.parts[0]
        for part in path.parts[1:]:
            current_path = joinpath(parent, part)
            if not tree.contains(current_path):
                tree.create_node(part + tag_path(current_path),
                                 current_path,
                                 parent=parent)
            # Descend one level whether or not the node already existed.
            parent = current_path
    return tree
Example #2
0
class TreePipeline(object):
    """Item pipeline that collects breadcrumb paths into a single tree.

    Each item contributes one path; node identifiers are the breadcrumb
    segments joined with ``|`` so that equal names under different parents
    stay distinct.
    """

    def open_spider(self, spider):
        """Create an empty tree containing only the ``root`` node."""
        self.tree = Tree()
        self.tree.create_node("root", "root")

    def process_item(self, item, spider):
        """Insert the breadcrumb path of ``item`` into the tree.

        ``item['text']`` is read as the list of breadcrumb segments. Every
        ``'...'`` is removed from the segments and from ``item['pagetitle']``
        (which is updated in place), and the last segment is replaced by the
        cleaned page title.

        :return: the same ``item``, so later pipelines can process it.
        """
        lst = [segment.replace('...', '').strip() for segment in item['text']]
        item['pagetitle'] = item['pagetitle'].replace('...', '')
        lst[-1] = item['pagetitle']
        for idx in range(len(lst)):
            # The parent id is the joined prefix; the first segment hangs
            # directly under the root.
            previous = "root" if idx == 0 else "|".join(lst[:idx])
            elem = "|".join(lst[:idx + 1])
            # NOTE(review): this round-trip looks like a Python 2 str/unicode
            # normalisation; on Python 3 it leaves the text unchanged.
            elem = elem.encode('utf-8').decode('utf-8')
            if not self.tree.contains(elem):
                # Single-argument call form is valid on Python 2 and 3; the
                # old print statement was a syntax error on Python 3.
                print("Adding node %s" % elem)
                self.tree.create_node(elem, elem, parent=previous)
        return item

    def close_spider(self, spider):
        """Show the tree and save it as JSON and as a text rendering."""
        self.tree.show()
        with open(makepath('data/cats/tree.json'), 'w') as outfile:
            outfile.write(self.tree.to_json())
        self.tree.save2file(makepath('data/cats/tree.tree'))
Example #3
0
def create_ding_tree():
    """Build the DingTalk department tree from ``dingding_department_list``.

    All departments are first attached to a virtual root (id ``'0'``) and
    then moved under their real parent when that parent exists in the tree.

    :return: the subtree rooted at department ``'1'`` (the real DingTalk
        root).
    """
    # Connect to the database.
    # NOTE(review): credentials are hard-coded here; consider loading them
    # from configuration instead.
    db, cursor = connect_db('localhost', 'root', 'yoyoball', 'dingtalk')
    try:
        ding_tree = Tree()

        sql = "SELECT `id`, `name`, `parentid` FROM dingding_department_list"
        cursor.execute(sql)
        dept_result = cursor.fetchall()
        if dept_result:
            # Create the virtual root first.
            ding_tree.create_node('##ding_root##', '0')
            # Attach every department directly under the virtual root.
            for dept_id, dept_name, _parent_id in dept_result:
                ding_tree.create_node(dept_name, dept_id, '0')
            # Fix up the parent relationships.
            for dept_id, _dept_name, parent_id in dept_result:
                # Everything except the real root is re-parented. In DingTalk
                # the real root has id '1' and no parent; the table stores
                # '0' as the parent of department '1'. Departments whose
                # parent is not in the tree stay under the virtual root.
                if dept_id != '1' and ding_tree.contains(parent_id):
                    ding_tree.move_node(dept_id, parent_id)
    finally:
        # Disconnect from the database even when the query or the tree
        # construction raises.
        close_db(db)
    return ding_tree.subtree('1')
Example #4
0
def create_oa_tree():
    """Build the OA organisation tree from the ``groupinfo`` table.

    All organisations are first attached to a virtual root (id ``'0000'``)
    and then moved under their real parent when that parent exists in the
    tree.

    :return: the whole tree, including the virtual root. The tree is empty
        when the query returns no rows.
    """
    # Connect to the database.
    # NOTE(review): credentials are hard-coded here; consider loading them
    # from configuration instead.
    db, cursor = connect_db('localhost', 'root', 'yoyoball', 'test')
    try:
        oa_tree = Tree()

        sql = "SELECT `orgid`, `shortname`, `parentorgid` FROM groupinfo"
        cursor.execute(sql)
        dept_result = cursor.fetchall()
        if dept_result:
            # Create the virtual root first.
            oa_tree.create_node('##oa_root##', '0000')
            # Attach every organisation directly under the virtual root.
            for org_id, short_name, _parent_id in dept_result:
                oa_tree.create_node(short_name, org_id, '0000')
            # Fix up the parent relationships.
            for org_id, _short_name, parent_id in dept_result:
                # Organisations such as '001000' already have '0000' as their
                # parent in the OA database (it stores the virtual root), so
                # they need no move. Organisations whose parent is not in the
                # tree are left where they are.
                if parent_id != '0000' and oa_tree.contains(parent_id):
                    oa_tree.move_node(org_id, parent_id)
    finally:
        # Disconnect from the database even when the query or the tree
        # construction raises.
        close_db(db)
    return oa_tree
def update(utree, freq):
    """Return a pruned deep copy of ``utree``.

    Walks ``utree`` from its ``'root'`` node and removes from the copy every
    non-root node whose ``data`` is lower than ``freq``. Nodes that are no
    longer present in the copy are skipped. ``utree`` itself is not modified.
    """
    pruned = Tree(tree=utree, deep=True)
    for nid in utree.expand_tree('root'):
        if nid == 'root':
            continue
        if not pruned.contains(nid):
            continue
        if utree[nid].data < freq:
            pruned.remove_node(nid)
    return pruned
Example #6
0
def create_tree(sitemap_dict):
    """
    Build a tree of all links found in a sitemap dict.

    The first key of the dict becomes the root. Every other key is attached
    to the root unless it is already in the tree, and every value listed
    under a key is attached to that key unless it is already in the tree.

    :param sitemap_dict: Sitemap generated during run
    :type sitemap_dict: dict
    :return: the populated tree
    """
    tree = Tree()
    root = list(sitemap_dict)[0]
    tree.create_node(root, root)
    for page, links in sitemap_dict.items():
        page_missing = not tree.contains(page)
        if page_missing:
            logger.debug(f'Creating key: {page}')
            tree.create_node(page, page, parent=root)
        for link in links:
            if tree.contains(link):
                continue
            logger.debug(f'Creating node {link}, parent: {page}')
            tree.create_node(link, link, parent=page)
    return tree
Example #7
0
 def _create_categories_tree(self) -> Tree:
     """Build a category tree from ``self.category_paths_split``.

     Each entry is walked from its first element to its last; an element
     not yet in the tree is added under the element that preceded it (or
     under ``'root'`` for the first one).
     """
     tree = Tree()
     tree.create_node('root', 'root')
     for category_path in self.category_paths_split:
         parent_id = 'root'
         for category in category_path:
             already_present = tree.contains(category)
             if not already_present:
                 tree.create_node(category, category, parent=parent_id)
             # Descend whether or not the node was just created.
             parent_id = category
     return tree
Example #8
0
def visualise_tree_from_slice(data):
    """Build a tree from the rows of ``data``, print it and return it.

    Each row supplies ``id``, ``parent_id`` and ``answer``. Rows whose id is
    already in the tree are skipped. A row whose ``parent_id`` is 1 or 2 is
    created without a parent; every other row is attached to its
    ``parent_id``.
    """
    tree = Tree()
    for _, row in data.iterrows():
        if tree.contains(row['id']):
            continue
        if row['parent_id'] == 1 or row['parent_id'] == 2:
            tree.create_node(row['answer'], row['id'])
            continue
        tree.create_node(row['answer'], row['id'], parent=row['parent_id'])
    tree.show()
    return tree
Example #9
0
    def create_tree(files):
        """Build a directory-style tree from the ``url`` of every entry.

        The first path segment of ``files[0].url`` becomes the root. Node
        identifiers are the ``/``-joined path prefixes. The final segment of
        an entry flagged ``is_file`` is labelled with its line count; every
        other segment is labelled ``[name]``. The ``<root url>/tree`` node
        is removed if it was created.
        """
        tree = Tree()
        root = files[0]
        root_id = root.url.split("/")[0]
        tree.create_node(f"[{root_id}]", root_id)
        for item in files:
            if tree.contains(item.url):
                continue
            pieces = item.url.split("/")
            depth = len(pieces)
            for index, name in enumerate(pieces, 1):
                node_id = "/".join(pieces[:index])
                if tree.contains(node_id):
                    continue
                if depth == index and item.is_file:
                    label = f"{name} ({item.qty_lines} linhas)"
                else:
                    label = f"[{name}]"
                parent_id = "/".join(pieces[:index - 1])
                tree.create_node(label, node_id, parent=parent_id)

        listing_node = f'{root.url}/tree'
        if tree.contains(listing_node):
            tree.remove_node(listing_node)

        return tree
def union_tree(c_tree1, c_tree2):
    """Return the union of two trees as a new tree.

    The result starts as a deep copy of ``c_tree1``. For every non-root node
    of ``c_tree2``: if the node already exists in the result its ``data`` is
    increased by the ``c_tree2`` value, otherwise the node is created under
    the same parent it has in ``c_tree2``.
    """
    merged = Tree(tree=c_tree1, deep=True)
    for nid in c_tree2.expand_tree('root'):
        if nid == "root":
            continue
        if not merged.contains(nid):
            parent_id = c_tree2.parent(nid).identifier
            merged.create_node(nid, nid, data=c_tree2[nid].data, parent=parent_id)
            continue
        merged[nid].data += c_tree2[nid].data
    return merged
Example #11
0
  def __call__(self, inputs,words,dep,is_train=True):
      """
      Build a tree from the arcs in ``dep`` and evaluate every input on it.

      :param inputs: sequence of inputs; element ``idx`` is evaluated
          against the tree node with identifier ``idx + 1``
      :param words: sequence of node labels, indexed by ``node index - 1``
      :param dep: string that evaluates to a sequence of arcs; for each
          arc, ``arc[1]`` is the parent index and ``arc[2]`` the node index.
          The first arc defines the root.
      :param is_train: forwarded unchanged to ``self.expr_for_tree``
      :return: ``(H, root_index)`` where ``H`` holds one
          ``self.expr_for_tree`` result per input and ``root_index`` is the
          root's node index minus one
      """
      # NOTE(security): eval() executes arbitrary code. If ``dep`` can come
      # from an untrusted source, switch to ast.literal_eval or a parser.
      arcs = eval(dep)
      tree = Tree()
      root_index = 0

      # Arcs may list a child before its parent, so keep sweeping until a
      # full pass adds nothing. The previous loop never ended when some
      # arc's parent was missing from the input.
      while True:
          added = False
          for i, arc in enumerate(arcs):
              parent_idx = arc[1]
              node_idx = arc[2]
              if tree.contains(nid=node_idx):
                  continue
              if i == 0:
                  tree.create_node(words[node_idx - 1], identifier=node_idx)
                  root_index = node_idx - 1
                  added = True
              elif tree.contains(nid=parent_idx):
                  tree.create_node(words[node_idx - 1], identifier=node_idx,
                                   parent=parent_idx)
                  added = True
          if not added:
              break

      H = []
      for idx in range(len(inputs)):
          h = self.expr_for_tree(xt=inputs[idx], tree=tree,
                                 node=tree.get_node(idx + 1),
                                 is_train=is_train)
          H.append(h)

      return H, root_index
def create_tree(arr, depth='4'):
    """Build a prerequisite tree for the courses in ``arr``.

    Courses whose ``comparator`` result starts with ``str(depth)`` are added
    under the ``"papa"`` root together with up to three levels of
    prerequisites. Remaining courses are then added with one level of
    prerequisites. Node identifiers are the concatenated names along the
    path, so the same prerequisite can appear under several courses.

    There is a one-second pause after every ``get_prerequisites`` call.

    :return: ``(tree, arr)``
    """
    tree = Tree()
    print("creating your tree ..")
    tree.create_node("compsci", "papa")
    wanted_level = str(depth)
    for course in arr:
        if comparator(course)[0] != wanted_level:
            continue
        tree.create_node(course, course, parent="papa")
        first_level = get_prerequisites(course)
        time.sleep(1)
        for pre in first_level:
            pre_id = course + pre
            if tree.get_node(pre_id) is not None:
                continue
            tree.create_node(pre, pre_id, parent=course)
            second_level = get_prerequisites(pre)
            time.sleep(1)
            for pr in second_level:
                pr_id = pre_id + pr
                if tree.get_node(pr_id) is not None:
                    continue
                tree.create_node(pr, pr_id, parent=pre_id)
                third_level = get_prerequisites(pr)
                time.sleep(1)
                for tst in third_level:
                    tst_id = pr_id + tst
                    if tree.get_node(tst_id) is None:
                        tree.create_node(tst, tst_id, parent=pr_id)
    print("Still loading...")
    for course in arr:
        if tree.contains(course):
            continue
        tree.create_node(course, course, parent="papa")
        first_level = get_prerequisites(course)
        time.sleep(1)
        for pre in first_level:
            pre_id = course + pre
            if tree.get_node(pre_id) is None:
                tree.create_node(pre, pre_id, parent=course)
                # The result is not used, but the lookup and the pause are
                # kept so the sequence of calls stays the same.
                get_prerequisites(pre)
                time.sleep(1)

    return tree, arr
Example #13
0
class DockerIf(object):
    """Wrapper around a Docker client that arranges images into a tree."""

    def __init__(self, base_url='unix://var/run/docker.sock'):
        """Create the underlying client for ``base_url``."""
        self.client = Client(base_url=base_url)

    def remove_image(self, *args, **kwargs):
        """Forward all arguments to the client's ``remove_image``."""
        self.client.remove_image(*args, **kwargs)

    def add_image_node(self, image, image_id, parent=''):
        """Add ``image`` to the tree, then any children waiting for it.

        An empty ``parent`` attaches the node to the tree root ``'/'``.
        Images queued in ``self.pending`` under this image's id are added
        recursively as its children.
        """
        payload = {u'image': image, u'Dangling': image_id in self.dangling}
        label = "%s (%s)" % (image[u'RepoTags'], image[u'Id'])
        target_parent = '/' if parent == '' else parent
        self.image_tree.create_node(label, image_id, parent=target_parent,
                                    data=payload)

        for child in self.pending.get(image[u'Id'], ()):
            self.add_image_node(child[0], child[1], parent=image[u'Id'])

    def prepare_image_tree(self):
        """Rebuild ``self.image_tree`` from the images the client reports.

        Images with no parent id go under the root. Images whose parent is
        already in the tree are attached to it. Images whose parent has not
        been seen yet are queued in ``self.pending`` and attached when the
        parent is added.
        """
        self.image_tree = Tree()
        self.image_tree.create_node('', '/')
        self.pending = dict()
        dangling_images = self.client.images(filters={u'dangling': True})
        self.dangling = [image[u'Id'] for image in dangling_images]
        for image in self.client.images(all=True):
            has_parent = u'ParentId' in image and image[u'ParentId'] != ''
            if not has_parent:
                self.add_image_node(image, image[u'Id'])
                continue
            parent_id = image[u'ParentId']
            if self.image_tree.contains(parent_id):
                self.add_image_node(image, image[u'Id'], parent=parent_id)
            else:
                self.pending.setdefault(parent_id, []).append(
                    (image, image[u'Id']))

    def get_image_tree(self):
        """Rebuild the image tree and return it."""
        self.prepare_image_tree()
        return self.image_tree
Example #14
0
def paths_to_tree(path_list, base_path=DEFAULT_BASE_PATH):
    """Build a tree of filesystem paths rooted at the filesystem root.

    Every entry of ``path_list`` is joined onto ``base_path`` and resolved;
    each component below the first is then added as a node whose identifier
    is produced by ``joinpath(parent_id, component)``. Components already in
    the tree are reused.
    """
    tree = Tree()
    root = Path().resolve().root
    tree.create_node(root, root)

    resolved_base = Path(base_path).resolve()
    for entry in path_list:
        parts = resolved_base.joinpath(entry).resolve().parts
        if len(parts) >= 2:
            node_id = parts[0]
            for name in parts[1:]:
                child_id = joinpath(node_id, name)
                if not tree.contains(child_id):
                    tree.create_node(name, child_id, parent=node_id)
                node_id = child_id
    return tree
Example #15
0
    def get_invoke_tree(self, method: MethodId, search_depth=3):
        """Build a tree of the methods that lead to ``method``.

        Starting from ``method`` as the root, the current leaves are expanded
        ``search_depth`` times with the results of
        ``self.apkinfo.find_upper_methods``. Each node's ``data`` is the list
        of bytecodes looked up via ``find_bytecode_by_addr`` for it; a method
        reached again only gets the new bytecode appended.
        """
        tree = Tree(deep=search_depth, identifier=method.address)

        # Root: the queried method, starting with no invoke bytecodes.
        tree.create_node(identifier=method, data=[])

        for _ in range(search_depth):
            for leaf in tree.leaves():
                callers = self.apkinfo.find_upper_methods(leaf.identifier)
                for offset, upper in callers:
                    bytecode = self.apkinfo.find_bytecode_by_addr(
                        upper.dexindex, offset)
                    if tree.contains(upper):
                        tree.get_node(upper).data.append(bytecode)
                        continue
                    tree.create_node(identifier=upper,
                                     data=[bytecode],
                                     parent=leaf)

        return tree
Example #16
0
def FpGrowth(fName):
    """Read transactions from ``fName``, build a prefix tree and print it.

    Each transaction in ``globOriginalList`` is reordered by the priority
    looked up in ``priorityDic(getSizeOneItemSet(...))`` and inserted into
    the tree one item at a time, starting at the ``"root"`` node.

    Insertion rule for an item under the current node:
    * the item id is not yet in the tree: create it there and descend;
    * it is already a child of the current node: descend into it;
    * it exists elsewhere: create a duplicate node whose id is a running
      integer counter and descend into that.
    """
    readFile(fName)
    Cone = getSizeOneItemSet(globOriginalList)
    priorityDict = priorityDic(Cone)
    tree = Tree()
    tree.create_node("{}", "root")
    # Reorder every transaction by priority before inserting it.
    counter = 0
    for transaction in globOriginalList:
        priorities = {}
        for element in transaction:
            priorities[element] = priorityDict.get(element)
        # Sort once per transaction. An empty transaction now yields an
        # empty list instead of reusing the previous transaction's order
        # (or hitting an unbound name on the very first one).
        # Sort-then-reverse is kept (rather than reverse=True) so that items
        # with equal priority keep the order they had before.
        ordered = sorted(priorities.items(), key=operator.itemgetter(1))
        ordered.reverse()

        parent = "root"
        for item, _priority in ordered:
            if not tree.contains(item):
                tree.create_node(item, item, parent, 0)
                parent = item
            elif item in tree.is_branch(parent):
                # Already a child of the current node: just go down.
                parent = item
            else:
                # Present elsewhere in the tree: add a duplicate node.
                tree.create_node(item, counter, parent, 0)
                parent = counter
                counter += 1
    tree.show()
Example #17
0
class Mwrp:
    """Search state for a multi-agent search over a grid world.

    Keeps an open list ordered by ascending ``f`` and a tree of generated
    states whose identifiers are the string form of the state position.
    """

    def __init__(self, world, number_of_agent, start_node):
        """Seed the open list and the tree with ``start_node``."""
        self.number_of_agent = number_of_agent
        self.open_list = np.array([start_node])
        self.tree = Tree()
        root_id = str(start_node.pos)
        self.tree.create_node(root_id, root_id, data=start_node)
        self.world = world
        self.node_expend_index = 1
        # Number of cells equal to 0 in the world's grid map.
        self.need_to_see = np.sum(self.world.grid_map == 0)

    def insert_to_open_list(self, new_node):
        """Insert ``new_node`` before the first entry with a larger ``f``."""
        for index, data in enumerate(self.open_list):
            if data.f > new_node.f:
                self.open_list = np.insert(self.open_list, index, new_node)
                return
        self.open_list = np.append(self.open_list, new_node)

    def pop_open_list(self):
        """Return the position of the head of the open list.

        The entry is not removed; see ``move_from_open_to_close``.
        """
        return self.open_list[0].pos

    def move_from_open_to_close(self, index=0):
        """Remove the entry at ``index`` from the open list."""
        self.open_list = np.delete(self.open_list, index)

    def heuristic(self, state):
        """Return the heuristic value for ``state`` (currently constant 1)."""
        return 1

    def get_all_seen(self, state):
        """Return the unique rows seen by ``state`` and all its ancestors.

        ``state`` is a tree node; the walk follows predecessors up to the
        root and stacks the result of ``world.get_seen`` for each of them.
        """
        tmp_node = self.tree.get_node(state.identifier)
        tmp_seen = self.world.get_seen(tmp_node)

        while not self.tree.get_node(tmp_node.identifier).is_root():
            tmp_node = self.tree.get_node(
                tmp_node.predecessor(self.tree.identifier))
            tmp_seen = np.vstack((tmp_seen, self.world.get_seen(tmp_node)))

        return np.unique(tmp_seen, axis=0)

    def expend(self, state):
        """Generate every neighbour of ``state`` and update tree/open list.

        All ``LOS ** number_of_agent`` move combinations are enumerated by
        decoding the combination number in base ``LOS``, one digit per
        agent. New neighbours are added to the tree and the open list; an
        already known neighbour is re-parented through ``fix_g`` when the
        new path is cheaper. Finally the head of the open list is removed.
        """
        move_index = np.zeros(self.number_of_agent).astype(int)
        # Uses this instance's agent count and tree; the previous code read
        # module-level ``number_of_agent`` and ``mwrp`` globals.
        for code in range(LOS ** self.number_of_agent):
            remaining = code
            for agent in range(self.number_of_agent):
                remaining, index = divmod(remaining, LOS)
                move_index[agent] = index
            neighbor = self.world.get_one_neighbor(state, move_index)

            if not (self.world.in_bund(neighbor)
                    and self.world.is_obstical(neighbor)):
                continue

            state_id = str(state)
            neighbor_id = str(neighbor)
            new_g = (self.tree.get_node(state_id).data.g
                     + Utils.n_dim_distance(state, neighbor))
            if not self.tree.contains(neighbor_id):
                # NOTE(review): the heuristic is taken for ``state`` rather
                # than ``neighbor``; harmless while it is constant, confirm
                # before making it state-dependent.
                h = self.heuristic(state)
                new_node = Node(neighbor, new_g, new_g + h)
                self.tree.create_node(neighbor_id,
                                      neighbor_id,
                                      parent=state_id,
                                      data=new_node)
                self.insert_to_open_list(new_node)
            elif new_g < self.tree.get_node(neighbor_id).data.g:
                self.fix_g(neighbor, state)

        self.move_from_open_to_close()

    def fix_g(self, old_state, new_parent):
        """Re-parent ``old_state`` under ``new_parent`` if nothing is lost.

        The move only happens when ``seen_comparison`` reports that the new
        parent's path has seen everything the old parent's path has. The
        node's ``g`` and ``f`` are then recomputed and its open-list entry,
        if any, is re-inserted at the right position.
        """
        state = self.tree.get_node(str(old_state))
        old_parent = self.tree.get_node(
            state.predecessor(self.tree.identifier))
        new_parent = self.tree.get_node(str(new_parent))

        if not self.seen_comparison(new_parent, old_parent):
            return

        self.tree.move_node(state.identifier, new_parent.identifier)

        node_data = self.tree.get_node(state.identifier).data
        new_g = new_parent.data.g + Utils.n_dim_distance(
            node_data.pos, new_parent.data.pos)
        # Keep the heuristic part of f and swap the old g for the new one.
        node_data.f = node_data.f - node_data.g + new_g
        node_data.g = new_g

        for index, data in enumerate(self.open_list):
            if np.all(data.pos == state.data.pos):
                self.open_list = np.delete(self.open_list, index)
                self.insert_to_open_list(node_data)
                break

    def goal_test(self, state):
        """Return True when ``state`` sees as many rows as ``need_to_see``."""
        seen_number = self.world.get_seen(state).shape[0]
        return bool(seen_number == self.need_to_see)

    def seen_comparison(self, state_new, state_old):
        """Return True when ``state_new`` has seen all ``state_old`` has.

        Both arguments are tree nodes; the comparison is over the rows
        returned by ``get_all_seen``.
        """
        seen_new = self.get_all_seen(state_new).tolist()
        seen_old = self.get_all_seen(state_old).tolist()

        # Both are plain lists after tolist(), so compare with len(); the
        # previous ``.shape[0]`` raised AttributeError on every call.
        if len(seen_new) < len(seen_old):
            return False
        for one_seen in seen_old:
            if one_seen not in seen_new:
                return False
        return True
# In[8]:

# Define the nodes: node = {clifi: pluno/10**(7-d), freq: n}, where d is the
# depth of the node.
# Builds one tree per key of customer_pur_recoder and stores it in
# ftctree_dict under that key.
ftctree_dict = {}
# i = 0
for index in customer_pur_recoder:
    #     if i > 0:
    #         break
    #     i+=1
    c_tree = Tree()
    # The root is labelled with the key and always has the id 'root'.
    c_tree.create_node(index, 'root')
    for category in customer_pur_recoder[index]['category']:
        # Each category is walked as a path starting at the root.
        parent = 'root'
        for item in category:
            if c_tree.contains(item):
                # Already present: count one more occurrence.
                c_tree[item].data += 1
            else:
                c_tree.create_node(item, item, data=1, parent=parent)
            parent = item
    ftctree_dict[index] = c_tree
# c_tree.show()
# ftctree_dict
# print(c_tree.to_json(with_data=True))

# ## Compute the union tree

# In[46]:

# 781924   13325038116
# c_tree1 = ftctree_dict[781924]
Example #19
0
class GitTool(object):
    """Run a shell command in every Git repository below a directory.

    Directories are walked recursively; a directory that contains a ``.git``
    directory is treated as a repository and the command is run for it,
    otherwise the walk descends into it. Optionally a tree of the visited
    directories is built and written out.
    """

    def __init__(self, parent_path, shells, build_tree=False, log=None):
        """Store the working directory and the command to run.

            :parameter parent_path: directory to work on (a string, or a
                tuple/list of directories)
            :parameter shells: shell command template; it is formatted with
                the repository path via ``%``
            :parameter build_tree: whether to build the directory tree
            :parameter log: log file path; when empty, output is printed
        """

        self._directory = parent_path
        self._unix_shell = shells
        self._log_file = log
        self._tree = None
        self._build_tree = build_tree

    def get_build_tree(self):
        """Return whether the directory tree is built."""
        return self._build_tree

    def set_build_tree(self, value):
        """Enable or disable building the directory tree."""
        self._build_tree = value

    build_tree = property(get_build_tree, set_build_tree)

    def _print(self, info=''):
        """Append ``info`` to the log file, or print it without one."""
        if self._log_file:
            # Write directly instead of shelling out to ``echo``: the text
            # may contain quotes, ``;``, ``$`` or ``>`` that the shell would
            # interpret rather than log.
            with open(self._log_file, 'a') as log_file:
                log_file.write('%s\n' % info)
        else:
            print(info)

    def run_work(self):
        """Run the configured command for the configured directory/ies.
        """

        # Create the log file (and its directory) if it does not exist yet.
        if self._log_file:
            dir_name = os.path.dirname(self._log_file)
            # dirname is '' for a bare file name; makedirs('') would raise.
            if dir_name and not os.path.exists(dir_name):
                os.makedirs(dir_name)
            if not os.path.exists(self._log_file):
                # Portable replacement for os.mknod, which is unavailable
                # on some platforms.
                open(self._log_file, 'a').close()

        def build_tree(target_path):
            """Create the tree and its root node.

                :param target_path: directory whose base name is the root
            """

            if not self._build_tree:
                return

            self._tree = Tree()
            parent_name = os.path.basename(target_path)
            self._tree.create_node(parent_name, parent_name)

        def exist_node(sub_name):
            """Return a node id for ``sub_name`` that is not in the tree yet.

                :param sub_name: wanted node name.

            A numeric suffix is appended for as long as the name is taken;
            suffixes accumulate (``name_0``, ``name_0_1``, ...).
            """

            if not self._build_tree:
                return sub_name

            nid = 0
            while self._tree.contains(sub_name):
                sub_name = '_'.join((sub_name, str(nid)))
                nid += 1

            return sub_name

        def report_tree(target_path, out_file=True):
            """Output the directory tree.

                :param target_path: directory the tree was built for.
                :param out_file: write to ``<base name>.txt`` when true,
                    otherwise show the tree on standard output.
            """

            if not self._build_tree:
                return

            if out_file:
                report_file = os.path.basename(target_path.strip(os.path.sep))
                self._tree.save2file('%s.txt' % report_file)
            else:
                self._tree.show()

        def process_target_path(target_path, target_tag=None):
            """Run the command for every repository below ``target_path``.

                :param target_path: directory to scan
                :param target_tag: tree node id of ``target_path``; defaults
                    to its base name
            """

            # Make sure the path exists.
            if not os.path.exists(target_path):
                self._print("Directory does not exist!")
                return

            parent_name = os.path.basename(
                target_path) if not target_tag else target_tag

            # Walk the Git repositories below the directory.
            for i in os.listdir(target_path):
                sub_path = os.path.join(target_path, i)
                sub_name = os.path.basename(sub_path)

                # A directory that contains a ``.git`` directory is treated
                # as a Git repository.
                git_path = os.path.join(sub_path, ".git")
                if os.path.isdir(sub_path):
                    sub_name = exist_node(sub_name)
                    if self._build_tree:
                        self._tree.create_node(sub_name,
                                               sub_name,
                                               parent=parent_name)

                    if os.path.exists(git_path) and os.path.isdir(git_path):
                        start_info = "Starting: %(sub_dir)s %(ph)s" % {
                            'sub_dir': i,
                            'ph': "." * (80 - len(i) - 1)
                        }
                        self._print(start_info)
                        os.system(self._unix_shell % sub_path)
                        self._print()
                    else:
                        process_target_path(sub_path, sub_name)

        if isinstance(self._directory, six.string_types):
            build_tree(self._directory)
            process_target_path(self._directory)
            report_tree(self._directory)
        elif isinstance(self._directory, (tuple, list)):
            for path in self._directory:
                build_tree(path)
                process_target_path(path)
                report_tree(path)

        self._print("Ok,All work is done!\r")

    def __call__(self):
        """Write a timestamp banner when logging to a file, then run."""
        if self._log_file:
            now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self._print("%s %s %s" % ("=" * 35, now_time, "=" * 35))

        self.run_work()
Example #20
0
def tree_build_from_list(containers):
    """
    Build a tree based on a unsorted list.

    Build a tree of containers based on an unsorted list of containers.
    Containers whose parent never shows up in the list, entries without a
    name and duplicate names are left out of the tree.

    Example:
    --------
        >>> containers = [
            {
                "childContainerKey": null,
                "configlets": [],
                "devices": [],
                "imageBundle": "",
                "key": "root",
                "name": "Tenant",
                "parentName": null
            },
            {
                "childContainerKey": null,
                "configlets": [
                    "veos3-basic-configuration"
                ],
                "devices": [
                    "veos-1"
                ],
                "imageBundle": "",
                "key": "container_43_840035860469981",
                "name": "staging",
                "parentName": "Tenant"
            }]
        >>> print(tree_build_from_list(containers=containers))
            {"Tenant": {"children": ["staging"]}}
    Parameters
    ----------
    containers : list
        Container entries; each is a dict with at least ``name`` and
        ``parentName``.

    Returns
    -------
    json
        tree topology
    """
    # Create tree object
    tree = Tree()  # Create the base node
    # Create root node to mimic CVP behavior
    tree.create_node("Tenant", "Tenant")
    # Iterate for first level of containers directly attached under root.
    for cvp_container in containers:
        if cvp_container['parentName'] is None:
            continue
        elif cvp_container['parentName'] in ['Tenant']:
            tree.create_node(cvp_container['name'],
                             cvp_container['name'],
                             parent=cvp_container['parentName'])
    # The list is unsorted, so a child can come before its parent: keep
    # sweeping until every container is placed.
    while len(tree.all_nodes()) < len(containers):
        progressed = False
        for cvp_container in containers:
            if not tree.contains(cvp_container['parentName']):
                continue
            name = cvp_container.get('name')
            # Explicit skip instead of a bare ``except``: already placed
            # containers, duplicates and nameless entries are ignored.
            if name is None or tree.contains(name):
                continue
            tree.create_node(name,
                             name,
                             parent=cvp_container['parentName'])
            progressed = True
        # A full pass that places nothing means the rest can never be
        # attached (missing parent); stop instead of looping forever.
        if not progressed:
            break
    return tree.to_json()
Example #21
0
def tree_build_from_dict(containers=None):
    """
    Build a tree of containers from an unsorted dict of containers.

    Every container is attached below the container named in its
    ``parent_container`` entry; the implicit root is always ``Tenant``.
    Containers whose parent can never be reached from ``Tenant`` are left
    out of the result instead of blocking the function forever.

    Example:
    --------
        >>> containers = {'Fabric': {'parent_container': 'Tenant'},
            'Leaves': {'configlets': ['container_configlet'],
                        'devices': ['veos01'],
                        'images': ['4.22.0F'],
                        'parent_container': 'Fabric'},
            'MLAG01': {'configlets': ['container_configlet'],
                        'devices': ['veos01'],
                        'images': ['4.22.0F'],
                        'parent_container': 'Leaves'},
            'MLAG02': {'configlets': ['container_configlet'],
                        'devices': ['veos01'],
                        'images': ['4.22.0F'],
                        'parent_container': 'Leaves'},
            'Spines': {'configlets': ['container_configlet'],
                        'devices': ['veos01'],
                        'images': ['4.22.0F'],
                        'parent_container': 'Fabric'}}
        >>> print(tree_build_from_dict(containers=containers))
            {"Tenant": {"children": [{"Fabric": {"children": [{"Leaves": {"children": ["MLAG01", "MLAG02"]}}, "Spines"]}}]}}
    Parameters
    ----------
    containers : dict, optional
        Container topology to create on CVP, by default None.
        ``None`` (or an empty dict) yields a tree holding only the root.

    Returns
    -------
    json
        tree topology
    """
    root_name = "Tenant"
    # Create tree object with the root node to mimic CVP behavior
    tree = Tree()
    tree.create_node(root_name, root_name)
    if not containers:
        return tree.to_json()

    # First pass: containers directly attached under the root.
    for container_name, container_info in containers.items():
        if (container_info['parent_container'] == root_name
                and not tree.contains(container_name)):
            tree.create_node(container_name,
                             container_name,
                             parent=root_name)

    # The dict is unsorted, so a child may be listed before its parent.
    # Keep sweeping until a full pass attaches nothing new: at that point
    # every reachable container is placed and only orphans remain.
    node_added = True
    while node_added:
        node_added = False
        for container_name, container_info in containers.items():
            parent_name = container_info['parent_container']
            if (parent_name != root_name
                    and tree.contains(parent_name)
                    and not tree.contains(container_name)):
                tree.create_node(container_name,
                                 container_name,
                                 parent=parent_name)
                node_added = True
    return tree.to_json()
Example #22
0
def construct_celltree(nucleus_file, config):
    '''
    Construct the cell lineage tree, one node per named cell.

    :param nucleus_file: CSV file read with pandas; its ``cell`` and ``time``
        columns are used to populate the tree
    :param config: mapping; ``config.get('max_time', 100)`` gives the last
        time point to be considered
    :return: tuple ``(cell_tree, max_time)`` where ``cell_tree`` holds one node
        per cell name and ``max_time`` is the cut-off that was applied
    '''

    # Founder cells do not follow the "parent name + one letter" convention
    # used below, so they are inserted explicitly (child, parent) in order.
    cell_tree = Tree()
    cell_tree.create_node('P0', 'P0')
    founder_links = (
        ('AB', 'P0'), ('P1', 'P0'),
        ('EMS', 'P1'), ('P2', 'P1'),
        ('P3', 'P2'), ('C', 'P2'),
        ('P4', 'P3'), ('D', 'P3'),
        ('Z2', 'P4'), ('Z3', 'P4'),
        # EMS
        ('E', 'EMS'), ('MS', 'EMS'),
    )
    for child_name, parent_name in founder_links:
        cell_tree.create_node(child_name, child_name, parent=parent_name)

    # Read the nucleus table used to complete the tree
    df_time = pd.read_csv(nucleus_file)

    # Load the cached name -> number mapping; rebuild it from every AceTree
    # CSV when the cache cannot be read.
    try:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only safe while this cache is written by this function alone.
        with open('./ShapeUtil/number_dictionary.txt', 'rb') as f:
            number_dictionary = pickle.load(f)
    except Exception:
        known_names = set(cell_tree.expand_tree())
        for ace_file in glob.glob('./ShapeUtil/AceForLabel/*.csv'):
            known_names.update(pd.read_csv(ace_file).cell.unique())
        cell_list = sorted(known_names)
        # Numbers start at 1 and follow the sorted name order.
        number_dictionary = {
            name: number for number, name in enumerate(cell_list, start=1)
        }
        with open('./ShapeUtil/number_dictionary.txt', 'wb') as f:
            pickle.dump(number_dictionary, f)
        with open('./ShapeUtil/name_dictionary.txt', 'wb') as f:
            pickle.dump(dict(enumerate(cell_list, start=1)), f)

    max_time = config.get('max_time', 100)
    df_time = df_time[df_time.time <= max_time]
    for cell_name in list(df_time.cell.unique()):
        if cell_name not in number_dictionary:
            continue
        times = list(df_time.time[df_time.cell == cell_name])
        cell_info = cell_node()
        cell_info.set_number(number_dictionary[cell_name])
        cell_info.set_time(times)
        if cell_tree.contains(cell_name):
            cell_tree.update_node(cell_name, data=cell_info)
        elif "Nuc" not in cell_name:
            # A daughter's name is its parent's name plus one extra character.
            cell_tree.create_node(cell_name,
                                  cell_name,
                                  parent=cell_name[:-1],
                                  data=cell_info)

    return cell_tree, max_time
Example #23
0
# Open a client session and wrap it in the API helper.
# cvpIP, switchuser and switchpass are defined outside this fragment.
clnt = CvpClient()
clnt.connect([cvpIP], switchuser, switchpass)
clntapi = CvpApi(clnt)
app_name = ""

# Container records; each one is read below through its "name",
# "parentName", "parentId" and "key" entries.
getContainers = clntapi.get_containers()["data"]

tree = Tree()
tree.create_node("Tenant", "Tenant")  # root

# Mirror the container hierarchy into the local tree.
for container in getContainers:
    containername = container["name"]
    parentName = container["parentName"]
    parentId = container["parentId"]
    if containername != "Tenant":
        if tree.contains(parentName):
            # Parent already placed: attach the container unless it was
            # already created while handling an earlier record.
            if tree.contains(containername) is False:
                tree.create_node(containername,
                                 containername,
                                 parent=parentName)
        else:
            # Parent not in the tree yet: look it up by id to learn the
            # grandparent, insert the parent first, then the container.
            # NOTE(review): assumes the grandparent is already in the tree
            # and that containername is not - confirm against real data.
            getcontainerbyid = clntapi.get_container_by_id(parentId)
            parent_parentname = getcontainerbyid["parentName"]
            tree.create_node(parentName, parentName, parent=parent_parentname)
            tree.create_node(containername, containername, parent=parentName)
    # Remember the key of the target container (targetcontainer is defined
    # outside this fragment).
    if containername == targetcontainer:
        targetcontainerkey = container["key"]

# Extract and display the subtree rooted at the container to be moved, then
# collect every root-to-leaf path inside it.
sub_t = tree.subtree(containertobemoved)
sub_t.show()
paths_to_leaves = sub_t.paths_to_leaves()
class FacultyPagesFilteredSpider(scrapy.Spider):
    """Crawl university sites and store the text of pages classified as bios.

    Every parsed response is run through ``BioIdentifier``; pages it accepts
    are tokenised and written to
    ``Crawled_Data/<Domain>_University_Files/<md5 of url>.txt``.  Discovered
    links are also recorded in ``self.tree`` (keyed by URL) and followed.
    """

    name = 'faculty_pages_filtered'
    # Each domain must be its own list entry: adjacent string literals are
    # concatenated by Python, which would silently merge two domains.
    allowed_domains = [
        'cmu.edu', 'cornell.edu', 'washington.edu', 'gatech.edu',
        'princeton.edu', 'utexas.edu', 'illinois.edu', 'berkeley.edu',
        'mit.edu', 'stanford.edu'
    ]
    count = 0
    # Per-domain counter of parsed responses.  Declared on the class, so it
    # is shared by every instance of this spider.
    record = {}
    start_urls = [
        'https://www.cmu.edu/', 'https://www.cornell.edu/',
        'https://www.washington.edu/', 'https://www.gatech.edu/',
        'https://www.princeton.edu/', 'https://www.utexas.edu/',
        'https://illinois.edu/', 'https://www.berkeley.edu/',
        'https://www.mit.edu/', 'https://www.stanford.edu/'
    ]

    # A link is skipped when its URL contains any of these substrings.
    exclude_words = [
        'news', 'events', 'publications', 'pub', 'gallery', 'category',
        'courses', 'students', 'references', 'reference', 'software',
        'softwares', 'tags', 'tutorials', 'workshop', 'festival', 'admissions',
        'exhibitions', 'alumni', 'lectures', 'undergraduate', 'about',
        'history', 'awards', 'ranking', 'enrollment', 'graduate', 'archive',
        'stories', 'post', 'pages', 'magazine', 'curriculum', '404', 'faqs',
        'engage', 'campaign', 'career', 'resources', 'services', 'network',
        'security', 'donate', 'giving', 'finance', 'forms', 'policies',
        'policy', 'alphabetical', 'summer', 'winter', 'spring', 'autumn',
        'fall', 'health', 'facilities', 'facility', 'wp', 'information',
        'general', 'catalog', 'guides', 'library', 'publish', 'blog',
        'collection', 'share', 'search', 'periodicals', 'bookstore', 'store',
        'product', 'organisation', 'webstore', 'funding', 'pdf'
    ]

    # NOTE(review): `rules` is normally consumed by CrawlSpider; confirm
    # whether it has any effect on a plain scrapy.Spider subclass.
    rules = [Rule(LinkExtractor(unique=True), callback='parse', follow=True)]

    #count_limits = {"page_count": 200, "item_count": 200}

    def __init__(self, *args, **kwargs):
        """Set up the link tree, the bio classifier and the output folders.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that spider arguments keep working.
        """
        super().__init__(*args, **kwargs)

        # Link-discovery tree: "root" holds links found without a Referer,
        # "unknown" holds links whose Referer was never recorded.
        self.tree = Tree()
        self.tree.create_node("root", "root")
        self.tree.create_node("unknown", "unknown", parent="root")

        self.bio_identifier = BioIdentifier(model="bio-model")

        for dom in self.allowed_domains:
            domain = dom.split('.')[0]
            folder_name = 'Crawled_Data/' + domain.capitalize(
            ) + '_University_Files'
            self.record[domain] = 0
            # Also creates the parent 'Crawled_Data' directory when missing.
            os.makedirs(folder_name, exist_ok=True)

    def parse(self, response):
        """Store the page if it is a bio, then follow its in-domain links.

        Yields one ``scrapy.Request`` per newly discovered link whose URL
        contains none of ``exclude_words``.
        """
        matched_domain = [x for x in self.allowed_domains if x in response.url]
        if not matched_domain:
            return

        domain = matched_domain[0].split('.')[0]
        folder_name = 'Crawled_Data/' + domain.capitalize(
        ) + '_University_Files'

        # Counts every parsed response of the domain, not only bio pages.
        self.record[domain] = self.record.get(domain, 0) + 1

        if self.record[domain] % 50 == 0:
            print('\n Crawled {} Bio-pages of {} University ...'.format(
                self.record[domain], domain.capitalize()))
            self.tree.save2file(folder_name + "/00__" +
                                str(self.record[domain]) + "_tree.txt")

        # Serialise the document once; it feeds both the classifier and the
        # text extraction.
        html = response.xpath('//*').get()

        if self.bio_identifier.is_bio_html_content(html):
            text = BeautifulSoup(html, features="html.parser").get_text()
            tokens = nltk.word_tokenize(text)
            normalized_text = ' '.join(
                [word for word in tokens if word.isalnum()])
            normalized_text += '\n' + response.url

            # File name is the md5 hex digest of the page URL.
            file_name = hashlib.md5(response.url.encode()).hexdigest()

            with open(folder_name + "/" + file_name + ".txt",
                      "w",
                      encoding="utf-8") as file:
                file.write(normalized_text)

        all_links = LinkExtractor(allow_domains=domain + '.edu',
                                  unique=True).extract_links(response)

        for link in all_links:
            if any(word in link.url for word in self.exclude_words):
                continue
            # A URL already present in the tree has been scheduled before.
            if self.tree.get_node(link.url) is not None:
                continue

            referer = response.request.headers.get('Referer', None)
            if referer is None:
                parent = 'root'
            else:
                referer = referer.decode("utf-8")
                parent = referer if self.tree.contains(referer) else 'unknown'
            self.tree.create_node(link.url, link.url, parent=parent)

            yield scrapy.Request(url=link.url, callback=self.parse)
Example #25
0
    for z in d:
        path = walkTree(tree, z, path + z)

    return path


# Read the stripped input lines, closing the file once it has been consumed.
# NOTE: `input` shadows the builtin; the name is kept because it is a
# module-level name that other code may rely on.
with open("test_input.txt") as input_file:
    input = [line.strip() for line in input_file]
tree = Tree()
tree.create_node("root", "root")

# Each line names two steps at fixed character positions 5 and 36
# (presumably "Step X must be finished before step Y can begin." - confirm).
# The second step is placed underneath the first one in the tree.
for lines in input:
    (l1, l2) = (lines[5], lines[36])
    print(lines)
    if tree.contains(l1) and tree.contains(l2):
        # Both known: re-parent l2 below l1.
        tree.move_node(l2, l1)
    elif tree.contains(l1) and not tree.contains(l2):
        tree.create_node(l2, l2, parent=l1)
    elif not tree.contains(l1) and tree.contains(l2):
        # Insert l1 where l2 currently hangs (below l2's parent),
        # then move l2 under l1.
        tree.create_node(l1, l1, parent=tree.parent(l2))
        tree.move_node(l2, l1)

    else:
        # Neither known: start a new chain directly below the root.
        tree.create_node(l1, l1, parent="root")
        tree.create_node(l2, l2, parent=l1)

tree.show()
print(walkTree(tree, 'root', ''))
Example #26
0
    def create(self, words_list, postags_list, arcs_list):
        """Build a dependency tree from parallel word / POS / arc lists.

        ``arcs_list`` entries have the form ``"<head>:<relation>"`` where
        ``head`` is the 1-based index of the governing word and ``0`` marks
        the sentence head.  The head word becomes the root node ``'HED'``;
        every other word reachable from it within four levels becomes node
        ``'node<i>'`` with tag ``"<i> <word>"`` and data
        ``"<postag> <relation>"``.  Words deeper than four levels below the
        head are not added.
        """
        tree = Tree()

        # Head index of every word, parsed once.
        heads = []
        for arc in arcs_list:
            heads.append(int(arc.split(':')[0]))

        # Locate the sentence head; the last word with head 0 wins.
        for position, head in enumerate(heads):
            if head == 0:
                root_index = position

        # Walk down level by level.  `frontier` maps the word indices placed
        # on the previous level to their tree identifiers; a word belongs to
        # the current level when its head (converted to 0-based) is in it.
        frontier = {int(root_index): 'HED'}
        placements = []  # (word index, parent identifier), in creation order
        for _ in range(4):
            reached = {}
            for position, head in enumerate(heads):
                parent_id = frontier.get(head - 1)
                if parent_id is not None:
                    placements.append((position, parent_id))
                    reached[position] = 'node' + str(position)
            frontier = reached

        # All levels are resolved; now materialise the tree, root first.
        if not tree.contains('HED'):
            root_pos = int(root_index)
            tree.create_node(
                str(root_index) + ' ' + words_list[root_pos],
                'HED',
                data=postags_list[root_pos] + ' ' + arcs_list[root_pos].split(':')[1])

        # Then every placed word, level by level and in sentence order.
        for position, parent_id in placements:
            tree.create_node(
                str(position) + ' ' + words_list[position],
                'node' + str(position),
                parent=parent_id,
                data=postags_list[position] + ' ' + arcs_list[position].split(':')[1])

        return tree
class WarcFileSystem( LoggingMixIn, Operations ):
	"""Filesystem built on a WARC's URI paths.

	The archive is indexed once at construction time into a tree whose node
	identifiers are slash-joined paths of the form
	"/" + "/" + <record type> + "/" + <url segments...>; the extra leading
	slash comes from joining onto the root identifier "/".  Every mutating
	operation raises EPERM.
	"""
	def __init__( self, warc ):
		"""Open the archive at path `warc` and index its records."""
		self.warc = warc
		logger.debug( "Mounting %s" % self.warc )
		self.fh = WarcRecord.open_archive( warc, gzip="auto", mode="rb" )
		self.tree = Tree()
		self._get_records()

	def _get_records( self ):
		"""Parses a WARC, building a hierarchical tree."""
		statinfo = os.stat( self.warc )
		# Owner of the archive file is reported as owner of every entry.
		self.gid = statinfo.st_gid
		self.uid = statinfo.st_uid
		self.tree.create_node( self.warc, "/" )
		self.records = {}
		bar = progressbar.ProgressBar( maxval=statinfo.st_size, widgets=[ progressbar.Bar( "=", "[", "]"), " ", progressbar.Percentage() ] )
		bar.start()
		for( offset, record, errors ) in self.fh.read_records( limit=None ):
			if record is not None and record.type != WarcRecord.WARCINFO:
				parent = "/"
				# Path is the record type followed by the URL split on slashes.
				segments = [ record.type ] + re.split( "/+", record.url )
				for e in segments:
					identifier = "/".join( [ parent, e ] )
					# Intermediate nodes are created on first sight and keep
					# the record/offset of the record that introduced them.
					if not self.tree.contains( identifier ):
						node = WarcRecordNode( record, offset, tag=e, identifier=identifier )
						self.tree.add_node( node, parent=parent )
					parent = identifier
				self.records[ record.url ] = ( offset, record )
				bar.update( offset )
		bar.finish()
		# NOTE(review): depending on the treelib version, show() may print the
		# tree itself and return None - confirm what ends up in the log.
		logger.debug( self.tree.show() )

#	def access( self, path, amode ):
#		logger.debug( path )
#		raise FuseOSError( EPERM )

	def chmod( self, path, mode ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def chown( self, path, uid, gid ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def create( self, path, mode ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def destroy( self, path ):
		"""Closes the underlying archive handle."""
		self.fh.close()

#	def flush( self, path, fh ):
#		raise FuseOSError( EPERM )

	def fsync( self, path, datasync, fh ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def fsyncdir( self, path, datasync, fh ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def getattr( self, path, fh=None ):
		"""Returns stat info for a path in the tree."""
		logger.debug( path )
		if path == "/":
			# The mount root mirrors the archive file's own stat, presented
			# as a read-only directory.
			stat = os.stat( self.warc )
			return dict( [
				( "st_mode", ( S_IFDIR | 0o444 ) ),
				( "st_ino", stat.st_ino ),
				( "st_dev", stat.st_dev ),
				( "st_nlink", stat.st_nlink ),
				( "st_uid", stat.st_uid ),
				( "st_gid", stat.st_gid ),
				( "st_size", stat.st_size ),
				( "st_ctime", stat.st_ctime ),
				( "st_mtime", stat.st_mtime ),
				( "st_atime", stat.st_atime )
			] )
		else:
			# Prefix with "/" to obtain the tree identifier for this path.
			return self.name_to_attrs( "/%s" % path )

	def getxattr( self, path, name, position=0 ):
		"""Returns the value for an extended attribute.

		Raises ENOENT for an unknown path and ENODATA for an unknown name.
		"""
		if path != "/":
			path = "/%s" % path

		node = self.tree.get_node( path )
		if node is None:
			raise FuseOSError( ENOENT )

		try:
			return node.xattrs[ name ]
		except KeyError:
			raise FuseOSError( ENODATA )

	def init( self, path ):
		"""No-op."""
		pass

	def link( self, target, source ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def listxattr( self, path ):
		"""Returns a list of extended attribute names.

		Raises ENOENT for an unknown path.
		"""
		if path != "/":
			path = "/%s" % path

		node = self.tree.get_node( path )
		if node is None:
			raise FuseOSError( ENOENT )
		return node.xattrs.keys()

	def mkdir( self, path, mode ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def mknod( self, path, mode, dev ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def open( self, path, flags ):
		"""Should return numeric filehandle; returns file offset for convenience."""
		if path != "/":
			path = "/%s" % path

		node = self.tree.get_node( path )
		if node is None:
			raise FuseOSError( ENOENT )

		return node.offset

#	def opendir( self, path ):
#		raise FuseOSError( EPERM )

	def read( self, path, size, offset, fh ):
		"""Reads 'size' data from 'path', starting at 'offset'.

		Raises ENOENT for an unknown path.
		"""
		logger.debug( "read %s from %s at %s " % ( size, path, offset ) )

		if path != "/":
			path = "/%s" % path

		node = self.tree.get_node( path )
		if node is None:
			raise FuseOSError( ENOENT )

		# Shift the requested offset by the node's payload offset before
		# slicing the record content.
		offset += node.payload_offset
		mime, data = node.record.content
		end = offset + size
		return data[ offset:end ]

	def name_to_attrs( self, name ):
		"""Retrieves attrs for a path name.

		`name` is a tree identifier.  Leaf nodes are reported as read-only
		regular files sized by the record's content length and dated by the
		record date; all other nodes as directories stamped with the current
		time.  Raises ENOENT when the identifier is unknown.
		"""
		logger.debug( name )
		node = self.tree.get_node( name )
		if node is None:
			raise FuseOSError( ENOENT )

		if node.is_leaf():
			st_mode = ( S_IFREG | 0o444 )
			size = node.record.content_length
			try:
				timestamp = time.mktime( parse( node.record.date ).timetuple() )
			except ValueError as v:
				# Unparseable record date: fall back to the epoch.
				logger.warning( "Error parsing time: %s [%s]" % ( node.record.date, str( v ) ) )
				timestamp = time.mktime( datetime.fromtimestamp( 0 ).timetuple() )
		else:
			st_mode = ( S_IFDIR | 0o555 )
			size = 0
			timestamp = time.time()
		return dict( [
			( "st_mode", st_mode ),
			( "st_ino", 0 ),
			( "st_dev", 0 ),
			( "st_nlink", 0 ),
			( "st_uid", self.uid ),
			( "st_gid", self.gid ),
			( "st_size", size ),
			( "st_ctime", timestamp ),
			( "st_mtime", timestamp ),
			( "st_atime", timestamp )
		] )

	def readdir( self, path, fh ):
		"""Returns (name, attrs, 0) entries for all children of path.

		Raises ENOENT for an unknown path.
		"""
		logger.debug( path )
		if path != "/":
			path = "/%s" % path
		if self.tree.contains( path ):
			names = []

			# fpointer holds the identifiers of the node's children.
			for c in self.tree.get_node( path ).fpointer:
				child = self.tree.get_node( c )
				names.append( ( child.tag, self.name_to_attrs( child.identifier ), 0  ) )
			return names
		else:
			raise FuseOSError( ENOENT )

	def readlink( self, path ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

#	def release( self, path, fh ):
#		raise FuseOSError( EPERM )

#	def releasedir( self, path, fh ):
#		raise FuseOSError( EPERM )

	def removexattr( self, path, name ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def rename( self, old, new ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def rmdir( self, path ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def setxattr( self, path, name, value, options, position=0 ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def statfs( self, path ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def symlink( self, target, source ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def truncate( self, path, length, fh=None ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def unlink( self, path ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def utimens( self, path, times=None ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )

	def write( self, path, data, offset, fh ):
		"""Not permitted; always raises EPERM."""
		raise FuseOSError( EPERM )
Example #28
0
class LegalDocMLconverter(PDFConverter):

    CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                 imagewriter=None, stripcontrol=False):
        """Initialise the converter, write the XML prologue and load the model.

        ``rsrcmgr``, ``outfp``, ``codec``, ``pageno`` and ``laparams`` are
        forwarded to ``PDFConverter``.  ``imagewriter`` is stored as given.
        ``stripcontrol`` makes ``write_text`` drop control characters.

        The classifier is read from ``data/model.json`` and
        ``data/model.h5`` and the tokenizer from ``data/tokenizer.pickle``,
        all relative to the current working directory.
        """
        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
                              laparams=laparams)
        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.textboxes = []
        self.page_width = []
        self.page_height = []
        # Classified boxes: all of them, plus one list per structural tag.
        self.classified = []
        self.classified_header = []
        self.classified_paragraph = []
        self.classified_section = []
        self.classified_subsection = []
        self.tree = Tree()
        self.tree.create_node("Documents", 'documents')
        self.num_tabs = 0
        # Emits the XML declaration and opening tag straight away.
        self.write_header()

        self.headerExist = False
        self.in_li = False

        # Context manager guarantees the file is closed even if read() fails.
        with open('data/model.json', 'r') as json_file:
            loaded_model_json = json_file.read()
        self.model = model_from_json(loaded_model_json)
        self.model.load_weights("data/model.h5")

        # NOTE(review): pickle.load can execute arbitrary code; only load a
        # tokenizer file that comes from a trusted source.
        with open('data/tokenizer.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)

    def decode_tags(self, pred) :
        """Return the tag name whose index is the arg-max of ``pred``.

        ``pred`` is anything ``np.argmax`` accepts; the index of its largest
        element (taken over the flattened input) selects the tag.  Raises
        ``KeyError`` when that index is not one of the 14 known tags.
        """
        # Position in this tuple is the class index of the tag.
        tag_names = (
            'header', 'document', 'paragraph', 'topic', 'section',
            'subsection', 'li', 'footer', 'page_number', 'figure',
            'table', 'table_li', 'commentary', '?',
        )
        index_to_tag = dict(enumerate(tag_names))
        best_index = np.argmax(pred)
        return index_to_tag[best_index]

    def write(self, text):
        if self.codec:
            text = text.encode(self.codec)
        self.outfp.write(text)
        return

    def write_header(self):
        if self.codec:
            self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write('<documents>\n')
        self.num_tabs = 1
        return

    def write_footer(self):
        self.write('</documents>\n')
        self.num_tabs = 0
        return

    def write_text(self, text):
        """Write ``text`` after passing it through ``enc``.

        When ``stripcontrol`` is set, characters matched by ``CONTROL`` are
        removed first.
        """
        cleaned = self.CONTROL.sub('', text) if self.stripcontrol else text
        self.write(enc(cleaned))

    def write_tab(self):
        for i in range(self.num_tabs):
            self.write("\t")

    def receive_layout(self, ltpage):
        self.items = []

        def extract_text(item):
            if isinstance(item, LTPage):
                #print(bbox2str(item.bbox))
                self.page_width = item.x1
                self.page_height = item.y1
                for child in item:
                    extract_text(child)
            elif isinstance(item, LTFigure):
                for child in item:
                    extract_text(child)
            elif isinstance(item, LTTextBox):
                self.items.append(item)
            elif isinstance(item, LTChar):
                self.items.append(item)

        extract_text(ltpage)

        def get_y0(item):
            return item.y0

        def get_id(item):
            return item.index

        def get_size(item):
            if isinstance(item, LTChar):
                return item.size
            elif isinstance(item, LTAnno):
                return 0
            else:
                for child in item:
                    return get_size(child)

        self.items.sort(key=get_y0, reverse=True)

        def group_textboxes(items):
            new_items = []
            prev = items[0]
            for item in items[1:]:
                if isinstance(prev, LTChar):
                    box = LTTextBox()
                    box.add(prev)
                    box.set_bbox((prev.x0, prev.y0, prev.x1, prev.y1))
                    prev = box
                y_diff = (prev.y0 - item.y1)
                x_diff = (item.x0 - prev.x1)
                if y_diff < get_size(prev)/2 and x_diff < get_size(prev) and x_diff >= -get_size(prev)/2:
                    xs = [item.x0, item.x1, prev.x0, prev.x1]
                    ys = [item.y0, item.y1, prev.y0, prev.y1]
                    prev.add(item)
                    prev.set_bbox((min(xs), min(ys), max(xs), max(ys)))
                elif y_diff < get_size(prev)/2 and (item.x0 - prev.x0) < get_size(prev)/2 and (item.x1 - prev.x1) > -get_size(prev)/2:
                    vert = LTTextBoxVertical()
                    xs = [item.x0, item.x1, prev.x0, prev.x1]
                    ys = [item.y0, item.y1, prev.y0, prev.y1]
                    for child in prev:
                        vert.add(child)
                    vert.add(item)
                    vert.set_bbox((min(xs), min(ys), max(xs), max(ys)))
                    prev = vert
                else:
                    new_items.append(prev)
                    prev = item
                #new_items.append(prev)
                #prev = item
            new_items.append(prev)
            return new_items

        def classify(item):
            if isinstance(item, LTTextBox):
                wmode = ''
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'

                box = NLPTextBox(item)
                
                s = ('%s %d %d %d ' % (bbox2str(box.bbox), box.b, box.i, box.size) + item.get_text().replace('\n', ' '))
                
                s_list = []
                s_list.append(s)
                X = self.tokenizer.texts_to_sequences(s_list)
                maxlen = 100
                X = pad_sequences(X, padding='post', maxlen=maxlen)
                preds = self.model.predict(X)
                tag = self.decode_tags(preds)

                box.set_tag(tag)

                if (tag == "header"):
                    self.classified_header.append(box)
                elif (tag == "paragraph"):
                    self.classified_paragraph.append(box)
                elif (tag == "section"):
                    self.classified_section.append(box)
                elif (tag == "subsection"):
                    self.classified_subsection.append(box)

                self.classified.append(box)
                    
            else:
                assert False, str(('Unhandled', item))

        def into_tree():
            _header = self.classified_header[0]
            self.tree.create_node(_header.get_text(), _header.key, parent="documents", data=_header)

            for _section in self.classified_section:
                self.tree.create_node(_section.get_text(), _section.key, parent=_header.key, data=_section)

            for _sebsection in self.classified_subsection:
                keys = _sebsection.key.split('.')
                keys.pop()
                _key = ''.join([i + "." for i in keys])
                _key = _key[:-1] + ".0"
                if (not self.tree.contains(_key)):
                    data = NLPSimpleBox("section", _key)
                    self.tree.create_node(_key, _key, parent=_header.key, data=data)
                    self.classified.append(data)
                self.tree.create_node(_sebsection.get_text(), _sebsection.key, parent=_key, data=_sebsection)

            for _paragraph in self.classified_paragraph:
                keys = _paragraph.key.split('.')
                keys.pop()
                _key = ''.join([i + "." for i in keys])
                _key = _key[:-1]
                if (not self.tree.contains(_key)):
                    section_keys = _key.split('.')
                    section_keys.pop()
                    section_key = ''.join([i + "." for i in section_keys])
                    section_key = section_key[:-1] + ".0"

                    if (not self.tree.contains(section_key)):
                        data = NLPSimpleBox("section", _key)
                        self.tree.create_node(section_key, section_key, parent=_header.key, data=data)
                        self.classified.append(data)

                    data = NLPSimpleBox("subsection", _key)
                    self.tree.create_node(_key, _key, parent=section_key, data=data)
                    self.classified.append(data)
                try:
                    self.tree.create_node(_paragraph.get_text(), _paragraph.key, parent=_key, data=_paragraph)
                except:
                    self.tree.create_node(_paragraph.get_text(), _paragraph.key + ".0", parent=_key, data=_paragraph)

            new_classified = []

            prev_box = self.classified[0]
            for _boxes in self.classified:
                if _boxes.tag == "commentary":
                    if prev_box.tag == "commentary":
                        prev_box.text += _boxes.text
                        prev_box.set_tag("commentary")
                    else:
                        prev_box = _boxes
                else:
                    if prev_box.tag == "commentary":
                        new_classified.append(prev_box)
                    prev_box = _boxes
                    new_classified.append(_boxes)

            self.classified = new_classified

            for _boxes in self.classified:
                
                if (_boxes.tag == "footer"):
                    None
                elif (_boxes.tag == "page_number"):
                    None
                elif (_boxes.tag == "?"):
                    None
                elif (_boxes.tag == "topic"):
                    _prev_subsection = find_prev_with_tag(_boxes, "subsection")
                    try:
                        self.tree.create_node(_boxes.get_text(), _boxes.key, parent=_prev_subsection.key, data=_boxes)
                    except:
                        self.tree.create_node(_boxes.get_text(), _boxes.key  + '1', parent=_prev_subsection.key, data=_boxes)
        
                elif _boxes.tag != "header" and _boxes.tag != "paragraph" and _boxes.tag != "section" and _boxes.tag != "subsection":
                    _prev_paragraph = find_prev_with_tag(_boxes, "paragraph")
                    try:
                        self.tree.create_node(_boxes.get_text(), _prev_paragraph.key + "." + _boxes.key, parent=_prev_paragraph.key, data=_boxes)
                    except: 
                        self.tree.create_node(_boxes.get_text(), _prev_paragraph.key + "." + _boxes.key + '1', parent=_prev_paragraph.key, data=_boxes)
                    
        def find_prev_with_tag(item, tag):
            """Find the box tagged ``tag`` that is closest to ``item``.

            Walks ``self.classified`` in order. The last box carrying ``tag``
            seen up to and including ``item`` is returned; when no such box
            has been seen by the time ``item`` is reached, the first tagged
            box after ``item`` is returned instead. Returns ``''`` when no box
            with ``tag`` qualifies.
            """
            match = ''
            passed_item = False
            for box in self.classified:
                if box.tag == tag:
                    match = box
                    # We are already past `item`: this is the first tagged
                    # box following it, so the search is over.
                    if passed_item:
                        return match
                if box == item:
                    # A tagged box at or before `item` wins immediately.
                    if not (match == ''):
                        return match
                    passed_item = True
            return match

        def get_node_id(node):
            """Return ``node.identifier``; used as the sort key for child nodes in ``render``."""
            return node.identifier

        def render(node):
            """Recursively write ``node`` and its subtree as XML via ``self.write``.

            The element emitted depends on ``node.data.tag`` (only when the
            data is an ``LTTextBox``): an opening part is written first, then
            every child is rendered, then the matching closing part is written.
            Indentation depth is tracked in ``self.num_tabs`` and an open
            ``<ol>`` list is tracked in ``self.in_li``.
            """

            tag = ''
            item = node.data

            if isinstance(item, LTTextBox):
                # NOTE(review): wmode is computed but never used below.
                wmode = ''
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'

                tag = item.tag

                if (tag == "header"):
                    # Only the first header opens the <document> element.
                    if (not self.headerExist):
                        self.write_tab()
                        self.write('<document title="%s">\n' % item.get_text())
                        self.num_tabs = self.num_tabs + 1
                        self.headerExist = True

                elif (tag == "paragraph"):
                    self.write_tab()
                    self.write('<paragraph key="%s">\n' % item.get_key())
                    self.num_tabs = self.num_tabs + 1
                    self.write_tab()
                    # Flatten the paragraph text onto a single line.
                    self.write("<p>" + item.get_text().replace('\n', ' ').lstrip().rstrip() + "</p>\n")

                elif (tag == "commentary"):
                    self.write_tab()
                    self.write('<commentary title="COMMENT:">')
                    # The "COMMENT:" marker moves into the title attribute.
                    self.write(item.get_text().replace('COMMENT:', '').lstrip())
                    self.write('</commentary>\n')

                elif (tag == "topic"):
                    self.write_tab()
                    self.write('<topic>')
                    self.write(item.get_text())
                    self.write('</topic>\n')

                elif (tag == "section"):
                    self.write_tab()
                    self.write('<section key="%s" title="%s">\n' % (item.get_key(), item.get_text()))
                    self.num_tabs = self.num_tabs + 1

                elif (tag == "subsection"):
                    self.write_tab()
                    self.write('<subsection key="%s" title="%s">\n' % (item.get_key(), item.get_text()))
                    self.num_tabs = self.num_tabs + 1

                elif (tag == "li"):
                    # The first list item of a run opens the enclosing <ol>.
                    if (not self.in_li):
                        self.write_tab()
                        self.write('<ol>\n')
                        self.num_tabs = self.num_tabs + 1
                        self.in_li = True
                    self.write_tab()
                    if (item.list_tag):
                        self.write('<li key="%s">' % node.identifier)
                        self.write(item.get_text())
                        self.write('</li>\n')
                    else:
                        self.write('<li>')
                        self.write(item.get_text())
                        self.write('</li>\n')

                # The remaining tags produce no output of their own.
                elif (tag == "footer"):
                    None
                elif (tag == "page_number"):
                    None
                elif (tag == "?"):
                    None
                else:
                    None


            # Resolve the child identifiers of this node into node objects.
            branches = self.tree.is_branch(node.identifier)
            _branches = []
            for child in branches:
                _branches.append(self.tree.get_node(child))

            # Children of sections/subsections are emitted in identifier order.
            if (tag == "section" or tag == "subsection"):
                _branches.sort(key=get_node_id, reverse=False)

            for _child in _branches:
                render(_child)

            # An open <ol> is closed by the first node that is neither a list
            # item nor one of the silent tags (footer, page_number, "?").
            if (tag != "li" and tag != "footer" and tag != "page_number" and tag != "?" and self.in_li):
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</ol>\n')
                self.in_li = False

            # Close the container element opened for this node, if any.
            if (tag == "paragraph"):
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</paragraph>\n')
            elif (tag == "header"):
                # NOTE(review): this runs for every header node, including ones
                # whose opening <document> was skipped above because
                # self.headerExist was already set - confirm that is intended.
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</document>\n')
            elif (tag == "section"):
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</section>\n')
            elif (tag == "subsection"):
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</subsection>\n')

        def highlights(item):
            """Return the text of ``item`` with bold/italic runs wrapped in tags.

            Iterates the children of ``item``. Consecutive ``LTChar`` children
            whose ``fontname`` contains ``'Bold'`` are wrapped in ``<b>...</b>``
            and those whose ``fontname`` contains ``'Italic'`` in
            ``<i>...</i>``; a font name containing both is treated as bold
            because bold is tested first. Nested text lines/boxes are processed
            recursively, each starting with no run open. Any other child
            contributes its ``get_text()`` and, when that text is a newline,
            ends the current run.

            Unlike the previous version, a run that is still open when the
            children are exhausted is closed, so the returned markup is always
            balanced.
            """
            pieces = []
            prev_bold = False
            prev_italic = False
            for child in item:
                if isinstance(child, LTChar):
                    if 'Bold' in child.fontname:
                        if prev_italic:
                            pieces.append('</i>')
                        if not prev_bold:
                            pieces.append('<b>')
                        prev_bold = True
                        prev_italic = False
                    elif 'Italic' in child.fontname:
                        if prev_bold:
                            pieces.append('</b>')
                        if not prev_italic:
                            pieces.append('<i>')
                        prev_italic = True
                        prev_bold = False
                    else:
                        # Plain character: terminate whichever run was open.
                        if prev_bold:
                            pieces.append('</b>')
                        elif prev_italic:
                            pieces.append('</i>')
                        prev_bold = False
                        prev_italic = False

                    pieces.append(child.get_text())

                elif isinstance(child, (LTTextLine, LTTextBox, NLPTextBox)):
                    # Containers are rendered on their own and spliced in.
                    pieces.append(highlights(child))
                else:
                    text = child.get_text()
                    if text == '\n':
                        # Runs never span a line break.
                        if prev_bold:
                            pieces.append('</b>')
                        elif prev_italic:
                            pieces.append('</i>')
                        prev_bold = False
                        prev_italic = False
                    pieces.append(text)

            # Close a run left open by input that does not end with a newline.
            if prev_bold:
                pieces.append('</b>')
            elif prev_italic:
                pieces.append('</i>')
            return ''.join(pieces)

        self.textboxes = group_textboxes(self.items)
        
        self.textboxes.sort(key=get_id, reverse=False)
        for item in self.textboxes:
            classify(item)
        
        into_tree()

        self.tree.show()

        render(self.tree.get_node("documents"))

        return

    def draw_layout(self, input_path, output_path):
        """Show the first page of a PDF next to a copy with text boxes outlined.

        The pages of ``input_path`` are rasterised with ``convert_from_path``
        (second argument 500 - presumably the DPI, confirm against the
        library), the first page is saved to ``output_path`` as JPEG and read
        back with OpenCV. A red rectangle is drawn for every entry of
        ``self.textboxes``; both the untouched and the annotated image are
        downsampled three times and displayed until ESC is pressed.
        """
        rendered_pages = convert_from_path(input_path, 500)
        rendered_pages[0].save(output_path, 'JPEG')
        page1 = cv2.imread(output_path)

        # Preview of the untouched page, taken before anything is drawn on it.
        page1_disp = page1
        for _ in range(3):
            page1_disp = cv2.pyrDown(page1_disp)

        height, _width, _channels = page1.shape
        # Ratio between image pixels and layout units.
        scale = height / int(self.page_height)

        box_color = (0, 0, 255)
        box_thickness = 5
        for item in self.textboxes:
            if isinstance(item, (LTTextBox, LTChar)):
                # Layout y grows upwards while image rows grow downwards,
                # hence the flip against the image height.
                corner_a = (int(item.x0 * scale), (height - int(item.y0 * scale)))
                corner_b = (int(item.x1 * scale), (height - int(item.y1 * scale)))
                page1 = cv2.rectangle(page1, corner_a, corner_b, box_color, box_thickness)
            else:
                assert False, str(('Unhandled', item))

        # Small fixed marker near the top-left corner.
        page1 = cv2.rectangle(page1, (40,40), (50,50), (0,0,255), 2)

        boxed_disp = page1
        for _ in range(3):
            boxed_disp = cv2.pyrDown(boxed_disp)

        escape_key = 27
        while True:
            cv2.imshow('page', page1_disp)
            cv2.imshow('boxed', boxed_disp)

            # Poll the keyboard every 30 ms and leave on ESC.
            pressed = cv2.waitKey(30) & 0xFF
            if pressed == escape_key:
                break

        cv2.destroyAllWindows()

    def close(self):
        """Finish the output by delegating to ``self.write_footer()``."""
        self.write_footer()
Example #29
0
class Route(object):
    """Tree of systems reachable from a starting system.

    ``universe`` must provide ``findSystem(name)`` and
    ``distances.findOptimalSystems(name)``. The route is stored in a treelib
    ``Tree`` whose node identifiers are system names.
    """

    def __init__(self, universe):
        self.route = Tree()
        self.universe = universe
        # Upper bound for the tree depth and for the fan-out per system.
        self.max_hops = 4

    def show(self):
        """Print the route tree."""
        self.route.show()

    def asString(self):
        """Return the node tags in depth-first order, joined by commas."""
        return ','.join(
            self.route[node].tag
            for node in self.route.expand_tree(mode=Tree.DEPTH)
        )

    def getRoute(self):
        """Return the underlying tree."""
        return self.route

    def byScore_key(self, s):
        """Sort key: the ``score`` attribute of a system."""
        return s.score

    def findRoute(self, start):
        """Build the route tree rooted at the system named ``start``.

        Returns the populated tree.
        """
        parent = self.universe.findSystem(start)
        self.route.create_node(start, start, data=parent)
        systems = self.findNextSystems(start, start)
        self.buildRoute(systems, start)

        return self.route

    def buildRoute(self, systems, parent):
        """Attach ``systems`` below ``parent`` and recurse into new ones.

        A system that is not yet in the tree is added under its own name and
        expanded further while its depth is below ``self.max_hops``. A system
        that is already present is added as a leaf named
        ``"<parent> --> <name>"`` and is not expanded again.
        """
        for s in systems:
            n = s.name

            if not self.route.contains(n):
                self.route.create_node(n, n, parent=parent, data=s)

                if self.route.depth(n) < self.max_hops:
                    sub_systems = self.findNextSystems(parent, n)
                    self.buildRoute(sub_systems, n)
            else:
                # getSystemId keeps the identifier unique, so a repeated
                # back-reference cannot raise a duplicate-id error.
                alias = self.getSystemId(parent + ' --> ' + n)
                self.route.create_node(alias, alias, parent=parent, data=s)

    def getSystemId(self, name, i=0):
        """Return an identifier based on ``name`` that is unused in the tree.

        ``name`` itself is returned when it is free. Otherwise the first free
        ``"name(k)"`` with ``k > i`` is returned. The previous implementation
        recursed without forwarding ``i`` and therefore produced
        ``"name(1)(1)..."`` instead of counting up.
        """
        if not self.route.contains(name):
            return name

        while True:
            i += 1
            candidate = name + '(' + str(i) + ')'
            if not self.route.contains(candidate):
                return candidate

    def findNextSystems(self, parent, start):
        """Return the best-scored candidate systems to visit after ``start``.

        Candidates come from ``universe.distances.findOptimalSystems(start)``
        with duplicates removed; ``parent`` is excluded, as are systems whose
        ``permit`` is not equal to ``False``. The result is sorted by ascending
        score and truncated to ``self.max_hops`` entries.
        """
        systems = []
        optimal = self.universe.distances.findOptimalSystems(start)

        for s in sorted(set(optimal)):
            if s != parent:
                i = self.universe.findSystem(s)
                # Kept as an equality test on purpose: a permit of None must
                # still be filtered out exactly as before.
                if i.permit == False:
                    systems.append(i)

        ranked = sorted(systems, key=self.byScore_key)

        return ranked[:self.max_hops]

# http://xiaming.me/treelib/examples.html
#
# class SystemTree(object):
#     def __init__(self):
#         self.tree = Tree()
#         
#     def addNode(self, id, o):
#         self.tree.create_node(o, id)
#         
#     def addChildNode(self, p, id, o):
#         self.tree.create_node(o, id, parent=p)
#     
#     def getNode(self, id):
#         return self.tree.subtree(id)
#         
#     def __repr__(self):
#         return self.tree.to_json(with_data=True)
# 
# 
# t = SystemTree()
# t.addNode('Aerial', 'Aerial')
# t.addChildNode('Aerial', 'Jotun', 'Jotun')
# t.addChildNode('Jotun', 'Rusani', 'Rusani')
# n = t.getNode('Jotun')
# print(n)
# n = t.tree.contains('Invalid')
# print(n)
# t.tree.show()
Example #30
0
import os

path = ''  # path to list of domains and sub-domains

# Read the whole list up front; the context manager closes the file even if
# reading fails.
with open(path) as file:
    lines = file.read().splitlines()

tree = Tree()

tree.create_node("name of root node", "ID of root node")  # root node
for url in lines:
    # Blank lines carry no domain and would only create an empty-id node.
    if not url.strip():
        continue

    # Parse each URL once and reuse both parts.
    extracted = tldextract.extract(url)
    domain = extracted.domain
    subdomain = extracted.subdomain

    if not tree.contains(domain):
        tree.create_node(domain, domain, parent="ID of root node")  # Add domains to root node
    if subdomain:
        # The identifier combines both parts so equal sub-domain labels under
        # different domains stay distinct. A URL that appears twice must not
        # be inserted twice, because identifiers have to be unique in the tree.
        subdomain_id = subdomain + domain
        if not tree.contains(subdomain_id):
            tree.create_node(subdomain, subdomain_id, parent=domain)  # Add sub-domains to domain node

tree.show(line_type="ascii-emv")  # show data as stdout

tree.to_graphviz(filename="tree_graphviz")  # dump tree as graphviz
# dot  xxx -Tps -o test.ps -Grankdir=LR #left to right
subprocess.call(["dot", "tree_graphviz", "-Tps", "-o" ,"output.ps" ,"-Grankdir=LR"])  # Grankdir=LR option to build tree from left to right
#convert -flatten -density 150 -geometry 100% test.ps test.png
subprocess.call(["convert" ,"-flatten" ,"-density" ,"150" ,"-geometry" ,"100%" ,"output.ps" ,
Example #31
0
class OntoTypes:
    """Collect the subclass hierarchy of an ontology below a top class.

    ``tcu`` is the URI of the top class; it defaults to ``owl:Thing``. The
    hierarchy built by ``get_subclassesof_tree`` is stored in ``self.tree``.
    """

    def __init__(self, tcu="http://www.w3.org/2002/07/owl#Thing"):
        self.top_class_uri = tcu
        self.tree = Tree()

    # TODO
    def get_dbo_type_levels_dict(self, path, top_class_uri):
        """Return the set of ``top_class_uri`` and all its transitive subclasses.

        ``path`` is parsed into an rdflib graph and ``rdfs:subClassOf`` edges
        are followed downwards from ``top_class_uri``. Every class is expanded
        only once, so hierarchies with shared subclasses or cycles terminate.
        """
        logger.info("Starting get_dbo_types_dict from %s, topclass=%s", path, top_class_uri)
        # http://rdflib.readthedocs.io/en/stable/intro_to_sparql.html
        g = rdflib.Graph()
        g.parse(path)

        orgClass = rdflib.term.URIRef(top_class_uri)

        class_set = {orgClass}
        stack = [orgClass]
        while stack:
            currentClass = stack.pop()
            logger.debug("Current class %s", currentClass)

            # Ask the graph for the matching subjects instead of scanning
            # every triple on each iteration.
            for subclass in g.subjects(rdflib.RDFS.subClassOf, currentClass):
                if subclass not in class_set:
                    class_set.add(subclass)
                    stack.append(subclass)

        return class_set

    # get all parent types from a given one
    def get_subclassesof_tree(self, ifile):
        """Populate ``self.tree`` with the subclass hierarchy found in ``ifile``.

        The tree is rooted at ``self.top_class_uri``. A class reachable through
        more than one parent is stored once under its own URI; each additional
        parent receives a leaf whose identifier is the URI followed by
        ``"_bis"`` (made unique with a counter if needed). Such a class is not
        expanded a second time, which also keeps cyclic hierarchies from
        failing on a repeated identifier.
        """
        # http://rdflib.readthedocs.io/en/stable/intro_to_sparql.html
        g = rdflib.Graph()
        g.parse(ifile)

        orgClass = rdflib.term.URIRef(self.top_class_uri)
        stack = [orgClass]

        self.tree.create_node(orgClass, orgClass)
        while stack:
            currentClass = stack.pop()
            logger.debug("Current class %s", currentClass)
            subclasses = list(g.subjects(rdflib.RDFS.subClassOf, currentClass))
            logger.debug("Subclasses: %s", subclasses)

            for sc in subclasses:
                if not self.tree.contains(sc):
                    self.tree.create_node(sc, sc, parent=currentClass)
                    # Only classes seen for the first time are expanded.
                    stack.append(sc)
                else:
                    sc_bis = str(sc) + "_bis"
                    suffix = 1
                    # Identifiers must be unique within the tree.
                    while self.tree.contains(sc_bis):
                        suffix += 1
                        sc_bis = "%s_bis%d" % (sc, suffix)
                    self.tree.create_node(sc_bis, sc_bis, parent=currentClass)
Example #32
0
df = df['data']

# A node's parent has to exist before the node is created, so categories are
# processed shortest path first. sorted() is stable, which keeps entries of
# equal path length in their original order, and it does not depend on a
# hard-coded number of categories.
orderedList = sorted(df, key=lambda category: len(category['path']))

# Longest path length encountered (0 when there are no categories).
pathLength = len(orderedList[-1]['path']) if orderedList else 0

for c in orderedList:
    # Category names double as node identifiers; skip repeated names.
    if fbTree.contains(c['name']):
        continue
    if len(c['path'])==1:
        # Top-level category: hangs directly below the root.
        fbTree.create_node(c['name'], c['name'], "root")
    else:
        # The second-to-last path element names the direct parent.
        fbTree.create_node(c['name'],c['name'], c['path'][-2])

fbTree.show()




'''
create buttons for root's children
<collapse rootChildren>