def tagged_paths_to_tree(path_list, base_path=DEFAULT_BASE_PATH, taggers=None):
    """Same as paths_to_tree but append tags to each node label.

    :param path_list: iterable of paths (relative to *base_path* or absolute)
    :param base_path: base directory used to resolve relative paths
    :param taggers: optional list of callables in the form
        ``def tagger(path: str) -> List[str]``; their combined results become
        a ``" (tag1, tag2)"`` suffix on the node label
    :return: treelib Tree rooted at the filesystem root
    """
    # FIX: the original used a mutable default (``taggers=[]``), which is a
    # single list shared across all calls; use the None sentinel instead.
    if taggers is None:
        taggers = []

    def tag_path(path):
        # Collect tags from every tagger; no tags -> no suffix.
        tags = []
        for func in taggers:
            tags += func(path)
        if not tags:
            return ""
        return " ({tags})".format(tags=", ".join(tags))

    tree = Tree()
    root = Path().resolve().root
    tree.create_node(root, root)
    base_path = Path(base_path).resolve()
    for path in path_list:
        path = base_path.joinpath(path).resolve()
        if len(path.parts) < 2:
            # Nothing below the filesystem root to add.
            continue
        parent = path.parts[0]
        parts = path.parts[1:]
        for part in parts:
            current_path = joinpath(parent, part)
            if not tree.contains(current_path):
                tree.create_node(part + tag_path(current_path),
                                 current_path, parent=parent)
            parent = current_path
    return tree
class TreePipeline(object):
    """Scrapy pipeline that accumulates breadcrumb lists into a treelib Tree
    and dumps it as JSON and plain text when the spider closes."""

    def open_spider(self, spider):
        # Fresh tree per crawl, rooted at a synthetic "root" node.
        self.tree = Tree()
        self.tree.create_node("root", "root")

    def process_item(self, item, spider):
        lst = item['text']
        # Strip ellipsis markers and surrounding whitespace from every crumb.
        lst = [x.strip() for x in [y.replace('...', '') for y in lst]]
        item['pagetitle'] = item['pagetitle'].replace('...', '')
        # The last breadcrumb is replaced by the (cleaned) page title.
        lst[-1] = item['pagetitle']
        for idx, elem in enumerate(lst):
            # Node ids are the "|"-joined path prefixes, so identical titles
            # under different parents stay distinct.
            if idx == 0:
                previous = "root"
            else:
                previous = "|".join(lst[:idx])
            elem = "|".join(lst[:idx + 1])
            # utf-8 round-trip kept from the original (validates encodability;
            # a no-op for well-formed text).
            elem = elem.encode('utf-8').decode('utf-8')
            if not self.tree.contains(elem):
                # FIX: was a Python 2 ``print`` statement; the single-argument
                # call form below behaves identically on Python 2 and 3.
                print("Adding node %s" % elem)
                self.tree.create_node(elem, elem, parent=previous)
        return item

    def close_spider(self, spider):
        self.tree.show()
        with open(makepath('data/cats/tree.json'), 'w') as outfile:
            outfile.write(self.tree.to_json())
        self.tree.save2file(makepath('data/cats/tree.tree'))
def create_ding_tree():
    """Build the DingTalk department tree from the local MySQL mirror.

    :return: the subtree rooted at department id '1' (DingTalk's real root)
    """
    # Connect to the database.
    db, cursor = connect_db('localhost', 'root', 'yoyoball', 'dingtalk')
    ding_tree = Tree()
    sql = "SELECT `id`, `name`, `parentid` FROM dingding_department_list"
    cursor.execute(sql)
    dept_result = cursor.fetchall()
    # FIX: idiomatic identity check (was ``dept_result != None``).
    if dept_result is not None and len(dept_result) > 0:
        # Create a virtual root first, then attach every department to it.
        ding_tree.create_node('##ding_root##', '0')
        for i in range(len(dept_result)):
            ding_tree.create_node(dept_result[i][1], dept_result[i][0], '0')
        for i in range(len(dept_result)):
            # Re-parent every department except the real root: in DingTalk the
            # real root has id '1' and no parent; the mirror table stores its
            # parent as '0'.
            if dept_result[i][0] != '1':
                if ding_tree.contains(dept_result[i][2]):  # parent exists?
                    ding_tree.move_node(dept_result[i][0], dept_result[i][2])
                else:
                    # Unknown parent: leave the node under the virtual root.
                    continue
    # Close the database connection.
    close_db(db)
    return ding_tree.subtree('1')
def create_oa_tree():
    """Build the OA organisation tree from the local MySQL mirror.

    :return: treelib Tree rooted at the virtual node '0000'
    """
    # Connect to the database.
    db, cursor = connect_db('localhost', 'root', 'yoyoball', 'test')
    oa_tree = Tree()
    sql = "SELECT `orgid`, `shortname`, `parentorgid` FROM groupinfo"
    cursor.execute(sql)
    dept_result = cursor.fetchall()
    # FIX: idiomatic identity check (was ``dept_result != None``).
    if dept_result is not None and len(dept_result) > 0:
        # Create a virtual root first, then attach every organisation to it.
        oa_tree.create_node('##oa_root##', '0000')
        for i in range(len(dept_result)):
            oa_tree.create_node(dept_result[i][1], dept_result[i][0], '0000')
        for i in range(len(dept_result)):
            # The OA database already stores the virtual root '0000' as the
            # parent of top-level organisations, so only nodes with another
            # parent need re-parenting.
            if dept_result[i][2] != '0000':
                if oa_tree.contains(dept_result[i][2]):  # parent exists?
                    oa_tree.move_node(dept_result[i][0], dept_result[i][2])
                else:
                    # Unknown parent: leave the node under the virtual root.
                    continue
    # Close the database connection.
    close_db(db)
    return oa_tree
def update(utree, freq):
    """Return a deep copy of *utree* with every non-root node whose ``data``
    count is below *freq* removed (descendants go with their ancestor)."""
    pruned = Tree(tree=utree, deep=True)
    for nid in utree.expand_tree('root'):
        if nid == 'root':
            continue
        # The node may already be gone if a low-frequency ancestor was pruned.
        if utree[nid].data < freq and pruned.contains(nid):
            pruned.remove_node(nid)
    return pruned
def create_tree(sitemap_dict):
    """
    Reads sitemap dict and generate a tree of all links

    :param sitemap_dict: Sitemap generated during run
    :type sitemap_dict: dict
    """
    tree = Tree()
    # The first key of the sitemap acts as the tree root.
    root = next(iter(sitemap_dict.keys()))
    tree.create_node(root, root)
    for page, links in sitemap_dict.items():
        if not tree.contains(page):
            logger.debug(f'Creating key: {page}')
            tree.create_node(page, page, parent=root)
        for link in links:
            if tree.contains(link):
                continue
            logger.debug(f'Creating node {link}, parent: {page}')
            tree.create_node(link, link, parent=page)
    return tree
def _create_categories_tree(self) -> Tree:
    """Assemble the category hierarchy as a treelib Tree rooted at 'root'.

    Each entry of ``self.category_paths_split`` is a sequence of category
    names walked root-to-leaf; missing nodes are created along the way.
    """
    categories = Tree()
    categories.create_node('root', 'root')
    for path in self.category_paths_split:
        parent = 'root'
        for name in path:
            if not categories.contains(name):
                categories.create_node(name, name, parent=parent)
            parent = name
    return categories
def visualise_tree_from_slice(data):
    """Build and display a tree from a DataFrame slice.

    :param data: rows with ``id``, ``answer`` and ``parent_id`` columns;
        rows whose ``parent_id`` is 1 or 2 are treated as root answers
    :return: the assembled treelib Tree
    """
    # FIX: removed an unreachable ``pass`` that followed the return statement.
    tree = Tree()
    for index, item in data.iterrows():
        if tree.contains(item['id']):
            continue
        if item['parent_id'] == 1 or item['parent_id'] == 2:
            # NOTE(review): treelib allows a single root; if a slice contains
            # more than one row with parent_id in (1, 2) this raises
            # MultipleRootError — confirm slices are single-rooted.
            tree.create_node(item['answer'], item['id'])
        else:
            tree.create_node(item['answer'], item['id'],
                             parent=item['parent_id'])
    tree.show()
    return tree
def create_tree(files):
    """Build a treelib Tree mirroring the directory layout of *files*.

    Node ids are the "/"-joined URL prefixes. File leaves are labelled with
    their line count; directories get bracketed names. A trailing
    ``<root>/tree`` node, if present, is dropped before returning.
    """
    tree = Tree()
    root = files[0]
    root_name = root.url.split("/")[0]
    tree.create_node(f"[{root_name}]", root_name)
    for item in files:
        if tree.contains(item.url):
            continue
        pieces = item.url.split("/")
        for index, segment in enumerate(pieces, 1):
            node_id = "/".join(pieces[:index])
            if tree.contains(node_id):
                continue
            if len(pieces) == index and item.is_file:
                label = f"{segment} ({item.qty_lines} linhas)"
            else:
                label = f"[{segment}]"
            tree.create_node(label, node_id,
                             parent="/".join(pieces[:index - 1]))
    if tree.contains(f'{root.url}/tree'):
        tree.remove_node(f'{root.url}/tree')
    return tree
def union_tree(c_tree1, c_tree2):
    """Merge two frequency trees: counts of nodes present in both trees are
    summed; nodes only in *c_tree2* are copied over with their counts."""
    merged = Tree(tree=c_tree1, deep=True)
    for nid in c_tree2.expand_tree('root'):
        if nid == "root":
            continue
        incoming = c_tree2[nid].data
        if merged.contains(nid):
            merged[nid].data += incoming
        else:
            # expand_tree visits parents before children, so the parent is
            # guaranteed to already exist in the merged tree.
            parent_id = c_tree2.parent(nid).identifier
            merged.create_node(nid, nid, data=incoming, parent=parent_id)
    return merged
def __call__(self, inputs, words, dep, is_train=True):
    """
    Build the dependency tree described by *dep* and run the recursive tree
    encoder over every input position.

    :param inputs: a list of ngrams (or words if win is set to 1)
    :param words: surface tokens, indexed by (node id - 1)
    :param dep: string form of a list of arcs; arc[1] is the parent node id,
        arc[2] the node id (1-based)
    :return: (list of embeddings, root index)
    """
    # SECURITY: eval() executes arbitrary code — *dep* must come from a
    # trusted source. If it is always a literal, ast.literal_eval is safer.
    arcs = eval(dep)  # renamed: the original shadowed the builtin ``list``
    tree = Tree()
    root_index = 0
    # Sweep the arc list repeatedly: a node can only be attached once its
    # parent exists, so each pass adds at least the newly reachable nodes.
    # (Equivalent to the original ``while 1`` + finish-flag loop.)
    while len(tree.all_nodes()) != len(arcs):
        for i, arc in enumerate(arcs):
            parent_idx = arc[1]
            node_idx = arc[2]
            if tree.contains(nid=node_idx):
                continue
            if i == 0:
                # The first arc carries the root of the dependency tree.
                tree.create_node(words[node_idx - 1], identifier=node_idx)
                root_index = node_idx - 1
            elif tree.contains(nid=parent_idx):
                tree.create_node(words[node_idx - 1], identifier=node_idx,
                                 parent=parent_idx)
    H = []
    for idx in range(len(inputs)):
        h = self.expr_for_tree(xt=inputs[idx], tree=tree,
                               node=tree.get_node(idx + 1),
                               is_train=is_train)
        H.append(h)
    return H, root_index
def create_tree(arr, depth='4'):
    """Build a prerequisite tree for the courses in *arr*.

    Courses whose comparator() level matches *depth* are attached under the
    root and expanded three prerequisite levels deep; remaining courses are
    then added with a single prerequisite level.

    :param arr: list of course identifiers
    :param depth: course level (string digit) selecting first-class courses
    :return: (tree, arr) — the assembled treelib Tree and the input list
    """
    tree = Tree()
    print("creating your tree ..")
    # print(tree.get_node("compsci"))
    tree.create_node("compsci", "papa")  # root node
    for course in arr:
        num = comparator(course)
        if (num[0] == str(depth)):
            # print(course)
            tree.create_node(course, course, parent="papa")
            pre_reqs = get_prerequisites(course)
            time.sleep(1)  # throttle requests between prerequisite lookups
            for pre in pre_reqs:
                # Node ids concatenate the whole course chain so the same
                # course can appear under several parents without colliding.
                if (tree.get_node(course + "" + pre) == None):
                    tree.create_node(pre, course + "" + pre, parent=course)
                    pre2 = get_prerequisites(pre)
                    time.sleep(1)
                    for pr in pre2:
                        if (tree.get_node(course + "" + pre + "" + pr) == None):
                            tree.create_node(pr, course + "" + pre + "" + pr, parent=course + "" + pre)
                            tests = get_prerequisites(pr)
                            time.sleep(1)
                            for tst in tests:
                                if (tree.get_node(course + "" + pre + "" + pr + "" + tst) == None):
                                    tree.create_node(
                                        tst, course + "" + pre + "" + pr + "" + tst,
                                        parent=course + "" + pre + "" + pr)
    print("Still loading...")
    for course in arr:
        # Courses not matching *depth* get one prerequisite level only.
        if (not tree.contains(course)):
            tree.create_node(course, course, parent="papa")
            pre_reqs = get_prerequisites(course)
            time.sleep(1)
            for pre in pre_reqs:
                if (tree.get_node(course + "" + pre) == None):
                    tree.create_node(pre, course + "" + pre, parent=course)
                    pre2 = get_prerequisites(pre)
                    time.sleep(1)
    return tree, arr
class DockerIf(object):
    """Thin wrapper over docker-py that renders local images as a tree
    (parent image -> child images), flagging dangling images."""

    def __init__(self, base_url='unix://var/run/docker.sock'):
        self.client = Client(base_url=base_url)

    def remove_image(self, *args, **kwargs):
        # Straight pass-through to docker-py.
        self.client.remove_image(*args, **kwargs)

    def add_image_node(self, image, image_id, parent=''):
        """Insert *image* into the tree, then attach any children that were
        parked in ``self.pending`` waiting for this image to appear."""
        is_dangling = image_id in self.dangling
        node = "%s (%s)" % (image[u'RepoTags'], image[u'Id'])
        if parent == '':
            # No known parent: hang the image off the synthetic '/' root.
            self.image_tree.create_node(node, image_id, parent='/', data={u'image': image, u'Dangling': is_dangling})
        else:
            self.image_tree.create_node(node, image_id, parent=parent, data={u'image': image, u'Dangling': is_dangling})
        if image[u'Id'] in self.pending:
            # Recursively flush children that arrived before their parent.
            for node in self.pending[image[u'Id']]:
                self.add_image_node(node[0], node[1], parent=image[u'Id'])

    def prepare_image_tree(self):
        """Build ``self.image_tree`` from all local images (incl. intermediates).

        Images whose parent is not in the tree yet are buffered in
        ``self.pending`` keyed by parent id and attached when it shows up.
        """
        self.image_tree = Tree()
        self.image_tree.create_node('', '/')  # synthetic root
        self.pending = dict()
        dangling_images = self.client.images(filters={u'dangling': True})
        self.dangling = []
        for image in dangling_images:
            self.dangling.append(image[u'Id'])
        for image in self.client.images(all=True):
            if u'ParentId' in image and image[u'ParentId'] != '':
                if self.image_tree.contains(image[u'ParentId']) or image[u'ParentId'] == '':
                    self.add_image_node(image, image[u'Id'], parent=image[u'ParentId'])
                else:
                    # Parent not seen yet: park this image until it appears.
                    if image[u'ParentId'] not in self.pending:
                        self.pending[image[u'ParentId']] = []
                    self.pending[image[u'ParentId']].append((image, image[u'Id']))
            else:
                # Base image with no parent: goes under the root.
                self.add_image_node(image, image[u'Id'])

    def get_image_tree(self):
        self.prepare_image_tree()
        return self.image_tree
def paths_to_tree(path_list, base_path=DEFAULT_BASE_PATH):
    """Build a treelib Tree of filesystem paths rooted at the FS root.

    Every entry of *path_list* is resolved against *base_path*; each path
    component becomes a node keyed by the accumulated path prefix.
    """
    tree = Tree()
    root = Path().resolve().root
    tree.create_node(root, root)
    resolved_base = Path(base_path).resolve()
    for raw_path in path_list:
        parts = resolved_base.joinpath(raw_path).resolve().parts
        if len(parts) < 2:
            continue
        parent = parts[0]
        for part in parts[1:]:
            node_id = joinpath(parent, part)
            if not tree.contains(node_id):
                tree.create_node(part, node_id, parent=parent)
            parent = node_id
    return tree
def get_invoke_tree(self, method: MethodId, search_depth=3): tree = Tree(deep=search_depth, identifier=method.address) # Parent method with invoke address list tree.create_node(identifier=method, data=[]) for _ in range(search_depth): for leaf in tree.leaves(): uppers = self.apkinfo.find_upper_methods(leaf.identifier) for offset, upper in uppers: bytecode = self.apkinfo.find_bytecode_by_addr( upper.dexindex, offset) if not tree.contains(upper): tree.create_node(identifier=upper, data=[bytecode], parent=leaf) else: tree.get_node(upper).data.append(bytecode) return tree
def FpGrowth(fName):
    """Build and display an FP-tree for the transaction database in *fName*.

    Items within each transaction are reordered by descending global
    frequency (priority) before insertion, as FP-Growth requires.
    """
    readFile(fName)
    Cone = getSizeOneItemSet(globOriginalList)
    priorityDict = priorityDic(Cone)
    # print(priorityDict)
    tree = Tree()
    tree.create_node("{}", "root")
    # Reconstruct the whole transaction database based on the priority.
    counter = 0  # fresh ids for duplicate item nodes on other branches
    for transaction in globOriginalList:  # FIX: 'set' shadowed the builtin
        priorities = {element: priorityDict.get(element)
                      for element in transaction}
        sorted_items = sorted(priorities.items(), key=operator.itemgetter(1))
        # NOTE: reverse() after a stable sort also flips the relative order
        # of equal-priority items (unlike reverse=True); kept as-is to
        # preserve the original tree layout.
        sorted_items.reverse()
        # Construct / extend the FP-tree from the root along this transaction.
        branch = "root"
        for pair in sorted_items:  # FIX: 'tuple' shadowed the builtin
            item = pair[0]
            if not tree.contains(item):
                # First occurrence anywhere: node id is the item name itself.
                tree.create_node(item, item, branch, 0)
                branch = item
            elif item in tree.is_branch(branch):
                # Already a child of the current branch: just descend.
                branch = item
            else:
                # Item exists elsewhere in the tree: add a duplicate node
                # with a unique integer id under the current branch.
                tree.create_node(item, counter, branch, 0)
                branch = counter
                counter += 1
    tree.show()
class Mwrp:
    """Multi-agent watchman-route search: best-first expansion over joint
    agent positions, tracked in a treelib search tree plus an open list
    kept sorted by f."""

    def __init__(self, world, number_of_agent, start_node):
        self.number_of_agent = number_of_agent
        self.open_list = np.array([start_node])
        self.tree = Tree()
        self.tree.create_node(start_node.pos.__str__(),
                              start_node.pos.__str__(),
                              data=start_node)
        self.world = world
        self.node_expend_index = 1
        # Number of free cells (grid value 0) that must eventually be seen.
        self.need_to_see = np.sum(self.world.grid_map == 0)

    def insert_to_open_list(self, new_node):
        """Insert *new_node* keeping the open list sorted by ascending f."""
        for index, data in enumerate(self.open_list):
            if data.f > new_node.f:
                self.open_list = np.insert(self.open_list, index, new_node)
                return
        self.open_list = np.append(self.open_list, new_node)

    def pop_open_list(self):
        # Best (lowest-f) node is always at the front.
        return self.open_list[0].pos

    def move_from_open_to_close(self, index=0):
        self.open_list = np.delete(self.open_list, index)

    def heuristic(self, state):
        # Placeholder heuristic (uniform).
        return 1

    def get_all_seen(self, state):
        """Union of cells seen along the path from *state* back to the root."""
        tmp_node = self.tree.get_node(state.identifier)
        tmp_seen = self.world.get_seen(tmp_node)
        while not self.tree.get_node(tmp_node.identifier).is_root():
            tmp_node = self.tree.get_node(
                tmp_node.predecessor(self.tree.identifier))
            tmp_seen = np.vstack((tmp_seen, self.world.get_seen(tmp_node)))
        all_seen = np.unique(tmp_seen, axis=0)
        return all_seen

    def expend(self, state):
        """Expand *state*: enumerate every joint move (LOS options per agent),
        adding new tree/open-list entries or re-parenting on a cheaper g."""
        move_index = np.zeros(self.number_of_agent).astype(int)
        # FIX: the original read the module-level globals ``number_of_agent``
        # and the ``mwrp`` instance; use self throughout. LOS stays a
        # module-level constant of the search setup.
        for i in range(LOS ** self.number_of_agent):
            # Decode combination index i into one move per agent (base LOS).
            for j in range(self.number_of_agent):
                i, index = divmod(i, LOS)
                move_index[j] = index
            neighbor = self.world.get_one_neighbor(state, move_index)
            if self.world.in_bund(neighbor) and self.world.is_obstical(
                    neighbor):
                new_g = self.tree.get_node(
                    state.__str__()).data.g + Utils.n_dim_distance(
                        state, neighbor)
                if not self.tree.contains(neighbor.__str__()):
                    h = self.heuristic(state)
                    new_node = Node(neighbor, new_g, new_g + h)
                    self.tree.create_node(neighbor.__str__(),
                                          neighbor.__str__(),
                                          parent=(state.__str__()),
                                          data=new_node)
                    self.insert_to_open_list(new_node)
                elif new_g < self.tree.get_node(neighbor.__str__()).data.g:
                    self.fix_g(neighbor, state)
        self.move_from_open_to_close()

    def fix_g(self, old_state, new_parent):
        """Re-parent *old_state* under *new_parent* when that path's seen set
        dominates, updating g/f and re-sorting it within the open list."""
        state = self.tree.get_node(old_state.__str__())
        old_parent = self.tree.get_node(state.predecessor(
            self.tree.identifier))
        new_parent = self.tree.get_node(new_parent.__str__())
        if self.seen_comparison(new_parent, old_parent):
            self.tree.move_node(state.identifier, new_parent.identifier)
            new_g = new_parent.data.g + \
                Utils.n_dim_distance(self.tree.get_node(state.identifier).data.pos,
                                     new_parent.data.pos)
            self.tree.get_node(state.identifier).data.f = \
                self.tree.get_node(state.identifier).data.f - \
                self.tree.get_node(state.identifier).data.g + new_g
            self.tree.get_node(state.identifier).data.g = new_g
            # Re-insert the updated node at its new position in the open list.
            for index, data in enumerate(self.open_list):
                if np.all(data.pos == state.data.pos):
                    self.open_list = np.delete(self.open_list, index)
                    self.insert_to_open_list(
                        self.tree.get_node(state.identifier).data)
                    break

    def goal_test(self, state):
        """True when the cells seen from *state* cover every free cell."""
        seen_number = self.world.get_seen(state).shape[0]
        if (seen_number == self.need_to_see):
            return True
        return False

    def seen_comparison(self, state_new, state_old):
        """True iff *state_new*'s seen set is a superset of *state_old*'s."""
        # FIX: the original called .tolist() first and then read .shape on
        # the resulting Python lists, raising AttributeError; compare array
        # shapes before converting for the membership checks.
        seen_new = self.get_all_seen(state_new)
        seen_old = self.get_all_seen(state_old)
        if seen_new.shape[0] < seen_old.shape[0]:
            return False
        seen_new_list = seen_new.tolist()
        for one_seen in seen_old.tolist():
            if one_seen not in seen_new_list:
                return False
        return True
# In[8]:

# Node definition: node = {clifi: pluno/10**(7-d), freq: n}, where d is the
# node's depth. Build one frequency tree per customer: every category path is
# walked root-to-leaf, counting how often each prefix occurs.
ftctree_dict = {}
for customer_id in customer_pur_recoder:
    freq_tree = Tree()
    freq_tree.create_node(customer_id, 'root')
    for category_path in customer_pur_recoder[customer_id]['category']:
        parent = 'root'
        for node_id in category_path:
            if freq_tree.contains(node_id):
                freq_tree[node_id].data += 1
            else:
                freq_tree.create_node(node_id, node_id, data=1, parent=parent)
            parent = node_id
    ftctree_dict[customer_id] = freq_tree
# freq_tree.show()
# ftctree_dict
# print(freq_tree.to_json(with_data=True))

# ## Build the union tree

# In[46]:

# 781924 13325038116
# c_tree1 = ftctree_dict[781924]
class GitTool(object):
    """Walk one or more directories of Git repositories, run a shell command
    in each repository, and optionally render the layout as a tree."""

    def __init__(self, parent_path, shells, build_tree=False, log=None):
        """Initialise the working directory and the command to run.

        :parameter parent_path: directory (or list/tuple of directories) to scan
        :parameter shells: shell command template executed per repository
        :parameter build_tree: whether to build a tree-style navigation view
        :parameter log: log file path (stdout when None)
        """
        self._directory = parent_path
        self._unix_shell = shells
        self._log_file = log
        self._tree = None
        self._build_tree = build_tree

    def get_build_tree(self):
        return self._build_tree

    def set_build_tree(self, value):
        self._build_tree = value

    build_tree = property(get_build_tree, set_build_tree)

    def _print(self, info=''):
        # Append to the log file when configured, otherwise print to stdout.
        if self._log_file:
            os.system("echo %s >> %s" % (info, self._log_file))
        else:
            print(info)

    def run_work(self):
        """Run the configured command over every Git repository found under
        the configured directory (or directories)."""
        # Create the log file (and its directory) if it does not exist yet.
        if self._log_file:
            dir_name = os.path.dirname(self._log_file)
            if not os.path.exists(dir_name):
                os.makedirs(dir_name)
            if not os.path.exists(self._log_file):
                os.mknod(self._log_file)

        def build_tree(target_path):
            """Create the tree root node for *target_path*.

            :param target_path: directory being scanned
            """
            if not self._build_tree:
                return
            self._tree = Tree()
            parent_name = os.path.basename(target_path)
            self._tree.create_node(parent_name, parent_name)

        def exist_node(sub_name):
            """Return a node id that is not yet present in the tree.

            :param sub_name: candidate node name.
            """
            if not self._build_tree:
                return sub_name
            nid = 0
            # Suffix _0, _1, ... until the name is unique in the tree.
            while self._tree.contains(sub_name):
                sub_name = '_'.join((sub_name, str(nid)))
                nid += 1
            return sub_name

        def report_tree(target_path, out_file=True):
            """Output the file tree.

            :param target_path: directory that was scanned.
            :param out_file: write to '<basename>.txt' instead of stdout.
            """
            if not self._build_tree:
                return
            if out_file:
                report_file = os.path.basename(target_path.strip(os.path.sep))
                self._tree.save2file('%s.txt' % report_file)
            else:
                self._tree.show()

        def process_target_path(target_path, target_tag=None):
            """Run the command in every Git repository under *target_path*.

            :param target_path: directory to scan
            :param target_tag: tree-node name to use for this directory
            """
            # Make sure the path exists.
            if not os.path.exists(target_path):
                self._print("Directory does not exist!")
                return
            parent_name = os.path.basename(
                target_path) if not target_tag else target_tag
            # Iterate over the Git repositories below this directory.
            for i in os.listdir(target_path):
                sub_path = os.path.join(target_path, i)
                sub_name = os.path.basename(sub_path)
                # sub_path is treated as a Git repository when it is a
                # directory containing a .git directory.
                git_path = os.path.join(sub_path, ".git")
                if os.path.isdir(sub_path):
                    sub_name = exist_node(sub_name)
                    if self._build_tree:
                        self._tree.create_node(sub_name, sub_name,
                                               parent=parent_name)
                    if os.path.exists(git_path) and os.path.isdir(git_path):
                        start_info = "Starting: %(sub_dir)s %(ph)s" % {
                            'sub_dir': i,
                            'ph': "." * (80 - len(i) - 1)
                        }
                        self._print(start_info)
                        os.system(self._unix_shell % sub_path)
                        self._print()
                    else:
                        # Not a repository: recurse into the subdirectory.
                        process_target_path(sub_path, sub_name)

        if isinstance(self._directory, six.string_types):
            build_tree(self._directory)
            process_target_path(self._directory)
            report_tree(self._directory)
        elif isinstance(self._directory, (tuple, list)):
            for path in self._directory:
                build_tree(path)
                process_target_path(path)
                report_tree(path)
        else:
            pass
        self._print("Ok,All work is done!\r")

    def __call__(self):
        if self._log_file:
            now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self._print("%s %s %s" % ("=" * 35, now_time, "=" * 35))
        self.run_work()
def tree_build_from_list(containers): """ Build a tree based on a unsorted list. Build a tree of containers based on an unsorted list of containers. Example: -------- >>> containers = [ { "childContainerKey": null, "configlets": [], "devices": [], "imageBundle": "", "key": "root", "name": "Tenant", "parentName": null }, { "childContainerKey": null, "configlets": [ "veos3-basic-configuration" ], "devices": [ "veos-1" ], "imageBundle": "", "key": "container_43_840035860469981", "name": "staging", "parentName": "Tenant" }] >>> print(tree_build_from_list(containers=containers)) {"Tenant": {"children": [{"Fabric": {"children": [{"Leaves": {"children": ["MLAG01", "MLAG02"]}}, "Spines"]}}]}} Parameters ---------- containers : dict, optional Container topology to create on CVP, by default None Returns ------- json tree topology """ # Create tree object tree = Tree() # Create the base node previously_created = list() # Create root node to mimic CVP behavior tree.create_node("Tenant", "Tenant") # Iterate for first level of containers directly attached under root. for cvp_container in containers: if cvp_container['parentName'] is None: continue elif cvp_container['parentName'] in ['Tenant']: previously_created.append(cvp_container['name']) tree.create_node(cvp_container['name'], cvp_container['name'], parent=cvp_container['parentName']) # Loop since expected tree is not equal to number of entries in container topology while len(tree.all_nodes()) < len(containers): for cvp_container in containers: if tree.contains( cvp_container['parentName'] ): # and cvp_container['parentName'] not in ['Tenant'] try: tree.create_node(cvp_container['name'], cvp_container['name'], parent=cvp_container['parentName']) except: # noqa E722 continue return tree.to_json()
def tree_build_from_dict(containers=None): """ Build a tree based on a unsorted dictConfig(config). Build a tree of containers based on an unsorted dict of containers. Example: -------- >>> containers = {'Fabric': {'parent_container': 'Tenant'}, 'Leaves': {'configlets': ['container_configlet'], 'devices': ['veos01'], 'images': ['4.22.0F'], 'parent_container': 'Fabric'}, 'MLAG01': {'configlets': ['container_configlet'], 'devices': ['veos01'], 'images': ['4.22.0F'], 'parent_container': 'Leaves'}, 'MLAG02': {'configlets': ['container_configlet'], 'devices': ['veos01'], 'images': ['4.22.0F'], 'parent_container': 'Leaves'}, 'Spines': {'configlets': ['container_configlet'], 'devices': ['veos01'], 'images': ['4.22.0F'], 'parent_container': 'Fabric'}} >>> print(tree_build_from_dict(containers=containers)) {"Tenant": {"children": [{"Fabric": {"children": [{"Leaves": {"children": ["MLAG01", "MLAG02"]}}, "Spines"]}}]}} Parameters ---------- containers : dict, optional Container topology to create on CVP, by default None Returns ------- json tree topology """ # Create tree object tree = Tree() # Create the base node previously_created = list() # Create root node to mimic CVP behavior tree.create_node("Tenant", "Tenant") # Iterate for first level of containers directly attached under root. for container_name, container_info in containers.items(): if container_info['parent_container'] in ['Tenant']: previously_created.append(container_name) tree.create_node(container_name, container_name, parent=container_info['parent_container']) # Loop since expected tree is not equal to number of entries in container topology while len(tree.all_nodes()) < len(containers) + 1: for container_name, container_info in containers.items(): if tree.contains( container_info['parent_container'] ) and container_info['parent_container'] not in ['Tenant']: try: tree.create_node(container_name, container_name, parent=container_info['parent_container']) except: # noqa E722 continue return tree.to_json()
def construct_celltree(nucleus_file, config):
    '''
    Construct cell tree structure with cell names

    :param nucleus_file: the name list file for the tree initialization
    :param config: dict; ``config['max_time']`` caps the time points (default 100)
    :return cell_tree: cell tree structure where each time corresponds to one cell (with specific name)
    :return max_time: the maximum time point actually considered
    '''
    ##  Construct cell
    #  Add unregularized naming: the fixed early C. elegans lineage.
    cell_tree = Tree()
    cell_tree.create_node('P0', 'P0')
    cell_tree.create_node('AB', 'AB', parent='P0')
    cell_tree.create_node('P1', 'P1', parent='P0')
    cell_tree.create_node('EMS', 'EMS', parent='P1')
    cell_tree.create_node('P2', 'P2', parent='P1')
    cell_tree.create_node('P3', 'P3', parent='P2')
    cell_tree.create_node('C', 'C', parent='P2')
    cell_tree.create_node('P4', 'P4', parent='P3')
    cell_tree.create_node('D', 'D', parent='P3')
    cell_tree.create_node('Z2', 'Z2', parent='P4')
    cell_tree.create_node('Z3', 'Z3', parent='P4')

    # EMS
    cell_tree.create_node('E', 'E', parent='EMS')
    cell_tree.create_node('MS', 'MS', parent='EMS')

    # Read the name excel and construct the tree with complete SegCell
    df_time = pd.read_csv(nucleus_file)

    #  read and combine all names from different acetrees
    ##  Get cell number
    # NOTE(review): paths below are hard-coded relative to the CWD; the bare
    # except treats any failure (missing file, bad pickle) as a cache miss.
    try:
        with open('./ShapeUtil/number_dictionary.txt', 'rb') as f:
            number_dictionary = pickle.load(f)
    except:
        # Cache miss: rebuild the name<->number mapping from the AceTree CSVs
        # plus the lineage names above, then persist both directions.
        ace_files = glob.glob('./ShapeUtil/AceForLabel/*.csv')
        cell_list = [x for x in cell_tree.expand_tree()]
        for ace_file in ace_files:
            ace_pd = pd.read_csv(os.path.join(ace_file))
            cell_list = list(ace_pd.cell.unique()) + cell_list
        cell_list = list(set(cell_list))
        cell_list.sort()
        number_dictionary = dict(zip(cell_list, range(1, len(cell_list) + 1)))
        with open('./ShapeUtil/number_dictionary.txt', 'wb') as f:
            pickle.dump(number_dictionary, f)
        with open('./ShapeUtil/name_dictionary.txt', 'wb') as f:
            pickle.dump(dict(zip(range(1, len(cell_list) + 1), cell_list)), f)

    max_time = config.get('max_time', 100)
    df_time = df_time[df_time.time <= max_time]
    all_cell_names = list(df_time.cell.unique())
    for cell_name in list(all_cell_names):
        if cell_name not in number_dictionary:
            continue
        times = list(df_time.time[df_time.cell == cell_name])
        cell_info = cell_node()
        cell_info.set_number(number_dictionary[cell_name])
        cell_info.set_time(times)
        if not cell_tree.contains(cell_name):
            # By naming convention the parent is the name minus its last
            # character; "Nuc"-style placeholder names are skipped.
            if "Nuc" not in cell_name:
                parent_name = cell_name[:-1]
                cell_tree.create_node(cell_name, cell_name, parent=parent_name, data=cell_info)
        else:
            cell_tree.update_node(cell_name, data=cell_info)

    return cell_tree, max_time
# Connect to CVP and mirror its container hierarchy into a treelib Tree,
# then extract the subtree of the container to be moved.
clnt = CvpClient()
clnt.connect([cvpIP], switchuser, switchpass)
clntapi = CvpApi(clnt)
app_name = ""
getContainers = clntapi.get_containers()["data"]
tree = Tree()
tree.create_node("Tenant", "Tenant")  # root
for container in getContainers:
    containername = container["name"]
    parentName = container["parentName"]
    parentId = container["parentId"]
    if containername != "Tenant":
        if tree.contains(parentName):
            # FIX: idiomatic negation (was ``tree.contains(...) is False``).
            if not tree.contains(containername):
                tree.create_node(containername, containername,
                                 parent=parentName)
        else:
            # Parent not seen yet: look up its own parent via the API and
            # create the missing parent first, then the container.
            getcontainerbyid = clntapi.get_container_by_id(parentId)
            parent_parentname = getcontainerbyid["parentName"]
            tree.create_node(parentName, parentName, parent=parent_parentname)
            tree.create_node(containername, containername, parent=parentName)
    if containername == targetcontainer:
        targetcontainerkey = container["key"]
sub_t = tree.subtree(containertobemoved)
sub_t.show()
paths_to_leaves = sub_t.paths_to_leaves()
class FacultyPagesFilteredSpider(scrapy.Spider):
    """Crawl university sites, save pages classified as faculty bios, and
    record the crawl frontier as a tree (one node per URL, parent = referer)."""

    name = 'faculty_pages_filtered'
    # FIX: a comma was missing between 'berkeley.edu' and 'mit.edu'; implicit
    # string concatenation produced the bogus domain 'berkeley.edumit.edu'
    # and silently dropped both real domains.
    allowed_domains = [
        'cmu.edu', 'cornell.edu', 'washington.edu', 'gatech.edu',
        'princeton.edu', 'utexas.edu', 'illinois.edu', 'berkeley.edu',
        'mit.edu', 'stanford.edu'
    ]
    count = 0
    record = {}
    start_urls = [
        'https://www.cmu.edu/', 'https://www.cornell.edu/',
        'https://www.washington.edu/', 'https://www.gatech.edu/',
        'https://www.princeton.edu/', 'https://www.utexas.edu/',
        'https://illinois.edu/', 'https://www.berkeley.edu/',
        'https://www.mit.edu/', 'https://www.stanford.edu/'
    ]
    # URL path words that indicate non-bio pages; links containing any of
    # these are skipped.
    exclude_words = [
        'news', 'events', 'publications', 'pub', 'gallery', 'category',
        'courses', 'students', 'references', 'reference', 'software',
        'softwares', 'tags', 'tutorials', 'workshop', 'festival',
        'admissions', 'exhibitions', 'alumni', 'lectures', 'undergraduate',
        'about', 'history', 'awards', 'ranking', 'enrollment', 'graduate',
        'archive', 'stories', 'post', 'pages', 'magazine', 'curriculum',
        '404', 'faqs', 'engage', 'campaign', 'career', 'resources',
        'services', 'network', 'security', 'donate', 'giving', 'finance',
        'forms', 'policies', 'policy', 'alphabetical', 'summer', 'winter',
        'spring', 'autumn', 'fall', 'health', 'facilities', 'facility', 'wp',
        'information', 'general', 'catalog', 'guides', 'library', 'publish',
        'blog', 'collection', 'share', 'search', 'periodicals', 'bookstore',
        'store', 'product', 'organisation', 'webstore', 'funding', 'pdf'
    ]
    rules = [Rule(LinkExtractor(unique=True), callback='parse', follow=True)]
    # count_limits = {"page_count": 200, "item_count": 200}

    def __init__(self):
        # Crawl-frontier tree: links whose referer is unknown go under the
        # synthetic "unknown" node.
        self.tree = Tree()
        self.tree.create_node("root", "root")
        self.tree.create_node("unknown", "unknown", parent="root")
        self.bio_identifier = BioIdentifier(model="bio-model")
        for dom in self.allowed_domains:
            domain = dom.split('.')[0]
            if not os.path.exists('Crawled_Data'):
                os.makedirs('Crawled_Data')
            folder_name = 'Crawled_Data/' + domain.capitalize(
            ) + '_University_Files'
            self.record[domain] = 0
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)

    def parse(self, response):
        matched_domain = [x for x in self.allowed_domains if x in response.url]
        if len(matched_domain) > 0:
            domain = matched_domain[0].split('.')[0]
            folder_name = 'Crawled_Data/' + domain.capitalize(
            ) + '_University_Files'
            self.record[domain] = self.record.get(domain, 0) + 1
            # Periodically snapshot progress and the frontier tree.
            if self.record[domain] % 50 == 0:
                print('\n Crawled {} Bio-pages of {} University ...'.format(
                    self.record[domain], domain.capitalize()))
                self.tree.save2file(folder_name + "/00__" +
                                    str(self.record[domain]) + "_tree.txt")
            isBio = self.bio_identifier.is_bio_html_content(
                response.xpath('//*').get())
            if isBio:
                # Save the normalized visible text of the bio page under an
                # md5-of-URL filename.
                text = BeautifulSoup(response.xpath('//*').get(),
                                     features="html.parser").get_text()
                tokens = nltk.word_tokenize(text)
                normalized_text = ' '.join(
                    [word for word in tokens if word.isalnum()])
                normalized_text += '\n' + response.url
                hash_text = hashlib.md5(response.url.encode())
                file_name = hash_text.hexdigest()
                with open(folder_name + "/" + file_name + ".txt",
                          "w",
                          encoding="utf-8") as file:
                    file.write(normalized_text)
            AllLinks = LinkExtractor(allow_domains=domain + '.edu',
                                     unique=True).extract_links(response)
            for n, link in enumerate(AllLinks):
                if not any([x in link.url for x in self.exclude_words]):
                    if self.tree.get_node(link.url) == None:
                        # Attach the link under its referer when known, else
                        # under "root"/"unknown".
                        referer = response.request.headers.get('Referer', None)
                        if referer == None:
                            self.tree.create_node(link.url, link.url,
                                                  parent='root')
                        else:
                            referer = referer.decode("utf-8")
                            if self.tree.contains(referer):
                                self.tree.create_node(link.url, link.url,
                                                      parent=referer)
                            else:
                                self.tree.create_node(link.url, link.url,
                                                      parent='unknown')
                        yield scrapy.Request(url=link.url, callback=self.parse)
for z in d: path = walkTree(tree, z, path + z) return path input = list(map(lambda x: x.strip(), open("test_input.txt").readlines())) tree = Tree() tree.create_node("root", "root") # first figure out how many steps there are and then sort them # by their name for lines in input: (l1, l2) = (lines[5], lines[36]) print(lines) if tree.contains(l1) and tree.contains(l2): tree.move_node(l2, l1) elif tree.contains(l1) and not tree.contains(l2): tree.create_node(l2, l2, parent=l1) elif not tree.contains(l1) and tree.contains(l2): # get the root for l2 and make that the root for l1 # then move l2 under l1 tree.create_node(l1, l1, parent=tree.parent(l2)) tree.move_node(l2, l1) else: tree.create_node(l1, l1, parent="root") tree.create_node(l2, l2, parent=l1) tree.show() print(walkTree(tree, 'root', ''))
def create(self, words_list, postags_list, arcs_list):
    """Build a treelib.Tree from a dependency parse of one sentence.

    Parameters
    ----------
    words_list : list[str]
        Tokens of the sentence, in surface order.
    postags_list : list[str]
        Part-of-speech tag per token (parallel to words_list).
    arcs_list : list[str]
        Dependency arcs as "head:relation" strings, parallel to the other
        lists.  head is 1-based; head == 0 marks the sentence root (HED).

    Returns
    -------
    treelib.Tree
        Root node id is 'HED'; every other node id is 'node<i>'.
        Node tag is "<i> <word>", node data is "<postag> <relation>".

    The original implementation unrolled exactly four layers below the
    root (eight near-identical loops) and silently dropped any token
    deeper than that.  This version walks the parse breadth-first to any
    depth; for parses of depth <= 4 it produces the identical tree (same
    ids, tags, data and per-parent sibling order).
    """
    tree = Tree()

    # 0-based head index per token (-1 == sentence root) and the relation label.
    heads = [int(arc.split(':')[0]) - 1 for arc in arcs_list]
    relations = [arc.split(':')[1] for arc in arcs_list]
    hed_id = heads.index(-1)  # the token whose head is 0 is the root (HED)

    if not tree.contains('HED'):
        tree.create_node(str(hed_id) + ' ' + words_list[hed_id], 'HED',
                         data=postags_list[hed_id] + ' ' + relations[hed_id])

    # Breadth-first: attach every token under its head, layer by layer.
    # Each token has exactly one head, so it enters the frontier at most
    # once and the loop terminates; tokens on a cycle unreachable from the
    # root are dropped (as the original also did).
    frontier = [(hed_id, 'HED')]
    while frontier:
        next_frontier = []
        for parent_idx, parent_id in frontier:
            # Scan tokens in ascending index so siblings keep surface order.
            for i, head in enumerate(heads):
                if head == parent_idx and i != hed_id:
                    node_id = 'node' + str(i)
                    tree.create_node(str(i) + ' ' + words_list[i], node_id,
                                     parent=parent_id,
                                     data=postags_list[i] + ' ' + relations[i])
                    next_frontier.append((i, node_id))
        frontier = next_frontier
    return tree
class WarcFileSystem( LoggingMixIn, Operations ):
    """Filesystem built on a WARC's URI paths.

    Read-only FUSE filesystem: each WARC record is exposed as a file whose
    path is "/<record-type>/<url segments...>".  All mutating operations
    raise EPERM.  (Python 2 code: note the octal literals 0444/0555.)
    """
    def __init__( self, warc ):
        # Path to the WARC file; kept for stat() calls later.
        self.warc = warc
        logger.debug( "Mounting %s" % self.warc )
        # Open the archive read-only; gzip detection is automatic.
        self.fh = WarcRecord.open_archive( warc, gzip="auto", mode="rb" )
        self.tree = Tree()
        self._get_records()

    def _get_records( self ):
        """Parses a WARC, building a hierarchical tree."""
        statinfo = os.stat( self.warc )
        # Owner/group of the WARC file are reported for every node.
        self.gid = statinfo.st_gid
        self.uid = statinfo.st_uid
        # Tree root "/" represents the WARC itself.
        self.tree.create_node( self.warc, "/" )
        # Maps record URL -> (byte offset in archive, record object).
        self.records = {}
        bar = progressbar.ProgressBar( maxval=statinfo.st_size, widgets=[ progressbar.Bar( "=", "[", "]"), " ", progressbar.Percentage() ] )
        bar.start()
        for( offset, record, errors ) in self.fh.read_records( limit=None ):
            # Skip unreadable records and the warcinfo header record.
            if record is not None and record.type != WarcRecord.WARCINFO:
                parent = "/"
                # Path is the record type followed by the URL split on '/'.
                segments = [ record.type ] + re.split( "/+", record.url )
                for e in segments:
                    identifier = "/".join( [ parent, e ] )
                    # Intermediate segments are created once; the record is
                    # attached to every new node on its path.
                    if not self.tree.contains( identifier ):
                        node = WarcRecordNode( record, offset, tag=e, identifier=identifier )
                        self.tree.add_node( node, parent=parent )
                    parent = identifier
                self.records[ record.url ] = ( offset, record )
                bar.update( offset )
        bar.finish()
        # NOTE(review): treelib's show() prints and returns None, so this
        # logs "None" — confirm intent.
        logger.debug( self.tree.show() )

#	def access( self, path, amode ):
#		logger.debug( path )
#		raise FuseOSError( EPERM )

    def chmod( self, path, mode ):
        # Read-only filesystem: all mutation is forbidden.
        raise FuseOSError( EPERM )

    def chown( self, path, uid, gid ):
        raise FuseOSError( EPERM )

    def create( self, path, mode ):
        raise FuseOSError( EPERM )

    def destroy( self, path ):
        # Called on unmount: close the underlying archive handle.
        self.fh.close()

#	def flush( self, path, fh ):
#		raise FuseOSError( EPERM )

    def fsync( self, path, datasync, fh ):
        raise FuseOSError( EPERM )

    def fsyncdir( self, path, datasync, fh ):
        raise FuseOSError( EPERM )

    def getattr( self, path, fh=None ):
        """Returns stat info for a path in the tree."""
        logger.debug( path )
        if path == "/":
            # Root mirrors the WARC file itself, but forced read-only dir mode.
            stat = os.stat( self.warc )
            return dict( [ ( "st_mode", ( S_IFDIR | 0444 ) ), ( "st_ino", stat.st_ino ), ( "st_dev", stat.st_dev ), ( "st_nlink", stat.st_nlink ), ( "st_uid", stat.st_uid ), ( "st_gid", stat.st_gid ), ( "st_size", stat.st_size ), ( "st_ctime", stat.st_ctime ), ( "st_mtime", stat.st_mtime ), ( "st_atime", stat.st_atime ) ] )
        else:
            # FUSE passes paths without the leading slash used as tree ids.
            return self.name_to_attrs( "/%s" % path )

    def getxattr( self, path, name, position=0 ):
        """Returns the value for an extended attribute."""
        if path != "/":
            path = "/%s" % path
        node = self.tree.get_node( path )
        if node is None:
            raise FuseOSError( ENOENT )
        try:
            return node.xattrs[ name ]
        except KeyError:
            raise FuseOSError( ENODATA )

    def init( self, path ):
        pass

    def link( self, target, source ):
        raise FuseOSError( EPERM )

    def listxattr( self, path ):
        """Returns a list of extended attribute names."""
        if path != "/":
            path = "/%s" % path
        node = self.tree.get_node( path )
        if node is None:
            raise FuseOSError( ENOENT )
        return node.xattrs.keys()

    def mkdir( self, path, mode ):
        raise FuseOSError( EPERM )

    def mknod( self, path, mode, dev ):
        raise FuseOSError( EPERM )

    def open( self, path, flags ):
        """Should return numeric filehandle; returns file offset for convenience."""
        if path != "/":
            path = "/%s" % path
        node = self.tree.get_node( path )
        if node is None:
            raise FuseOSError( ENOENT )
        return node.offset

#	def opendir( self, path ):
#		raise FuseOSError( EPERM )

    def read( self, path, size, offset, fh ):
        """Reads 'size' data from 'path', starting at 'offset'."""
        logger.debug( "read %s from %s at %s " % ( size, path, offset ) )
        if path != "/":
            path = "/%s" % path
        node = self.tree.get_node( path )
        if node is None:
            raise FuseOSError( ENOENT )
        # Skip the record headers: payload_offset points at the body.
        offset += node.payload_offset
        mime, data = node.record.content
        end = offset + size
        return data[ offset:end ]

    def name_to_attrs( self, name ):
        """Retrieves attrs for a path name."""
        logger.debug( name )
        node = self.tree.get_node( name )
        if node is None:
            raise FuseOSError( ENOENT )
        if node.is_leaf():
            # Leaves are records: read-only regular files sized by content.
            st_mode = ( S_IFREG | 0444 )
            size = node.record.content_length
            try:
                # Use the record's own date as the file timestamp.
                timestamp = time.mktime( parse( node.record.date ).timetuple() )
            except ValueError as v:
                logger.warning( "Error parsing time: %s [%s]" % ( node.record.date, str( v ) ) )
                # Fall back to the epoch when the record date is unparseable.
                timestamp = time.mktime( datetime.fromtimestamp( 0 ).timetuple() )
        else:
            # Interior path segments are read-only directories, stamped "now".
            st_mode = ( S_IFDIR | 0555 )
            size = 0
            timestamp = time.time()
        return dict( [ ( "st_mode", st_mode ), ( "st_ino", 0 ), ( "st_dev", 0 ), ( "st_nlink", 0 ), ( "st_uid", self.uid ), ( "st_gid", self.gid ), ( "st_size", size ), ( "st_ctime", timestamp ), ( "st_mtime", timestamp ), ( "st_atime", timestamp ) ] )

    def readdir( self, path, fh ):
        """Returns a tuple of all files in path.

        NOTE(review): actually returns a list of
        (name, attrs, offset) triples, per the fusepy readdir convention.
        """
        logger.debug( path )
        if path != "/":
            path = "/%s" % path
        if self.tree.contains( path ):
            names = []
            # fpointer holds the identifiers of this node's children.
            for c in self.tree.get_node( path ).fpointer:
                child = self.tree.get_node( c )
                names.append( ( child.tag, self.name_to_attrs( child.identifier ), 0 ) )
            return names
        else:
            raise FuseOSError( ENOENT )

    def readlink( self, path ):
        raise FuseOSError( EPERM )

#	def release( self, path, fh ):
#		raise FuseOSError( EPERM )

#	def releasedir( self, path, fh ):
#		raise FuseOSError( EPERM )

    def removexattr( self, path, name ):
        raise FuseOSError( EPERM )

    def rename( self, old, new ):
        raise FuseOSError( EPERM )

    def rmdir( self, path ):
        raise FuseOSError( EPERM )

    def setxattr( self, path, name, value, options, position=0 ):
        raise FuseOSError( EPERM )

    def statfs( self, path ):
        raise FuseOSError( EPERM )

    def symlink( self, target, source ):
        raise FuseOSError( EPERM )

    def truncate( self, path, length, fh=None ):
        raise FuseOSError( EPERM )

    def unlink( self, path ):
        raise FuseOSError( EPERM )

    def utimens( self, path, times=None ):
        raise FuseOSError( EPERM )

    def write( self, path, data, offset, fh ):
        raise FuseOSError( EPERM )
class LegalDocMLconverter(PDFConverter):
    """pdfminer converter that emits LegalDocML-style XML.

    Pipeline per page: collect text boxes, merge fragments, classify each
    box with a Keras model loaded from data/model.json + data/model.h5,
    arrange the boxes into a treelib.Tree keyed by section numbering, then
    render the tree as nested XML to outfp.
    """
    # Control characters stripped from output when stripcontrol is set.
    CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, imagewriter=None, stripcontrol=False):
        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.textboxes = []           # merged layout boxes for the page
        self.page_width = []          # set from LTPage in receive_layout
        self.page_height = []
        self.classified = []          # all classified boxes, in order
        self.classified_header = []   # boxes per predicted tag
        self.classified_paragraph = []
        self.classified_section = []
        self.classified_subsection = []
        self.tree = Tree()
        self.tree.create_node("Documents", 'documents')  # XML root
        self.num_tabs = 0             # current indentation depth of output
        self.write_header()
        self.headerExist = False      # only one <document> element is opened
        self.in_li = False            # inside an <ol> list while rendering
        # Load the classifier model and its tokenizer from fixed paths.
        json_file = open('data/model.json', 'r')
        loaded_model_json = json_file.read()
        json_file.close()
        self.model = model_from_json(loaded_model_json)
        self.model.load_weights("data/model.h5")
        self.tokenizer = []
        with open('data/tokenizer.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        return

    def decode_tags(self, pred) :
        """Map the model's softmax output to its tag name."""
        tags = {
            'header':0,
            'document':1,
            'paragraph':2,
            'topic':3,
            'section':4,
            'subsection':5,
            'li':6,
            'footer':7,
            'page_number':8,
            'figure':9,
            'table':10,
            'table_li':11,
            'commentary':12,
            '?':13,
        }
        decode = {v: k for k, v in tags.items()}
        num_tags = max(tags.values()) + 1
        return decode[np.argmax(pred)]

    def write(self, text):
        # Encode on the way out when a codec was configured.
        if self.codec:
            text = text.encode(self.codec)
        self.outfp.write(text)
        return

    def write_header(self):
        """Emit the XML prolog and open the <documents> root element."""
        if self.codec:
            self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write('<documents>\n')
        self.num_tabs = 1
        return

    def write_footer(self):
        """Close the <documents> root element."""
        self.write('</documents>\n')
        self.num_tabs = 0
        return

    def write_text(self, text):
        if self.stripcontrol:
            text = self.CONTROL.sub('', text)
        self.write(enc(text))
        return

    def write_tab(self):
        # Indent the next output line to the current nesting depth.
        for i in range(self.num_tabs):
            self.write("\t")

    def receive_layout(self, ltpage):
        self.items = []

        def extract_text(item):
            # Flatten the layout: collect LTTextBox/LTChar leaves, recurse
            # through pages and figures, and record the page dimensions.
            if isinstance(item, LTPage):
                #print(bbox2str(item.bbox))
                self.page_width = item.x1
                self.page_height = item.y1
                for child in item:
                    extract_text(child)
            elif isinstance(item, LTFigure):
                for child in item:
                    extract_text(child)
            elif isinstance(item, LTTextBox):
                self.items.append(item)
            elif isinstance(item, LTChar):
                self.items.append(item)
        extract_text(ltpage)

        def get_y0(item):
            return item.y0

        def get_id(item):
            return item.index

        def get_size(item):
            # Font size of an item; for containers, the size of the first child.
            if isinstance(item, LTChar):
                return item.size
            elif isinstance(item, LTAnno):
                return 0
            else:
                for child in item:
                    return get_size(child)

        # Top of page first (PDF y grows upward).
        self.items.sort(key=get_y0, reverse=True)

        def group_textboxes(items):
            """Merge adjacent fragments into single boxes.

            Horizontally-adjacent boxes on the same line are merged; stacked
            boxes with matching x-extents are merged into an
            LTTextBoxVertical.  Assumes items is non-empty and sorted.
            """
            new_items = []
            prev = items[0]
            for item in items[1:]:
                # Promote a lone character to a box so it can absorb others.
                if isinstance(prev, LTChar):
                    box = LTTextBox()
                    box.add(prev)
                    box.set_bbox((prev.x0, prev.y0, prev.x1, prev.y1))
                    prev = box
                y_diff = (prev.y0 - item.y1)
                x_diff = (item.x0 - prev.x1)
                if y_diff < get_size(prev)/2 and x_diff < get_size(prev) and x_diff >= -get_size(prev)/2:
                    # Same line, small horizontal gap: extend prev.
                    xs = [item.x0, item.x1, prev.x0, prev.x1]
                    ys = [item.y0, item.y1, prev.y0, prev.y1]
                    prev.add(item)
                    prev.set_bbox((min(xs), min(ys), max(xs), max(ys)))
                elif y_diff < get_size(prev)/2 and (item.x0 - prev.x0) < get_size(prev)/2 and (item.x1 - prev.x1) > -get_size(prev)/2:
                    # Vertically stacked with aligned x-extents: stack into a
                    # vertical box.
                    vert = LTTextBoxVertical()
                    xs = [item.x0, item.x1, prev.x0, prev.x1]
                    ys = [item.y0, item.y1, prev.y0, prev.y1]
                    for child in prev:
                        vert.add(child)
                    vert.add(item)
                    vert.set_bbox((min(xs), min(ys), max(xs), max(ys)))
                    prev = vert
                else:
                    new_items.append(prev)
                    prev = item
                    #new_items.append(prev)
                    #prev = item
            new_items.append(prev)
            return new_items

        def classify(item):
            # Run the model on "bbox bold italic size text" and bucket the
            # box by its predicted tag.
            if isinstance(item, LTTextBox):
                wmode = ''
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                box = NLPTextBox(item)
                s = ('%s %d %d %d ' % (bbox2str(box.bbox), box.b, box.i, box.size) + item.get_text().replace('\n', ' '))
                s_list = []
                s_list.append(s)
                X = self.tokenizer.texts_to_sequences(s_list)
                maxlen = 100
                X = pad_sequences(X, padding='post', maxlen=maxlen)
                preds = self.model.predict(X)
                tag = self.decode_tags(preds)
                box.set_tag(tag)
                if (tag == "header"):
                    self.classified_header.append(box)
                elif (tag == "paragraph"):
                    self.classified_paragraph.append(box)
                elif (tag == "section"):
                    self.classified_section.append(box)
                elif (tag == "subsection"):
                    self.classified_subsection.append(box)
                self.classified.append(box)
            else:
                assert False, str(('Unhandled', item))

        def into_tree():
            """Arrange classified boxes into self.tree by their dotted keys.

            Missing intermediate sections/subsections are synthesized as
            NLPSimpleBox placeholders so every node has a parent.
            """
            _header = self.classified_header[0]
            self.tree.create_node(_header.get_text(), _header.key, parent="documents", data=_header)
            for _section in self.classified_section:
                self.tree.create_node(_section.get_text(), _section.key, parent=_header.key, data=_section)
            for _sebsection in self.classified_subsection:
                # Parent key: drop the last dotted component, append ".0".
                keys = _sebsection.key.split('.')
                keys.pop()
                _key = ''.join([i + "." for i in keys])
                _key = _key[:-1] + ".0"
                if (not self.tree.contains(_key)):
                    data = NLPSimpleBox("section", _key)
                    self.tree.create_node(_key, _key, parent=_header.key, data=data)
                    self.classified.append(data)
                self.tree.create_node(_sebsection.get_text(), _sebsection.key, parent=_key, data=_sebsection)
            for _paragraph in self.classified_paragraph:
                # Parent key: drop the last dotted component.
                keys = _paragraph.key.split('.')
                keys.pop()
                _key = ''.join([i + "." for i in keys])
                _key = _key[:-1]
                if (not self.tree.contains(_key)):
                    # Synthesize the missing subsection (and its section).
                    section_keys = _key.split('.')
                    section_keys.pop()
                    section_key = ''.join([i + "." for i in section_keys])
                    section_key = section_key[:-1] + ".0"
                    if (not self.tree.contains(section_key)):
                        data = NLPSimpleBox("section", _key)
                        self.tree.create_node(section_key, section_key, parent=_header.key, data=data)
                        self.classified.append(data)
                    data = NLPSimpleBox("subsection", _key)
                    self.tree.create_node(_key, _key, parent=section_key, data=data)
                    self.classified.append(data)
                try:
                    self.tree.create_node(_paragraph.get_text(), _paragraph.key, parent=_key, data=_paragraph)
                except:
                    # Duplicate key: disambiguate with a ".0" suffix.
                    self.tree.create_node(_paragraph.get_text(), _paragraph.key + ".0", parent=_key, data=_paragraph)
            # Merge consecutive commentary boxes into a single box.
            new_classified = []
            prev_box = self.classified[0]
            for _boxes in self.classified:
                if _boxes.tag == "commentary":
                    if prev_box.tag == "commentary":
                        prev_box.text += _boxes.text
                        prev_box.set_tag("commentary")
                    else:
                        prev_box = _boxes
                else:
                    if prev_box.tag == "commentary":
                        new_classified.append(prev_box)
                    prev_box = _boxes
                    new_classified.append(_boxes)
            self.classified = new_classified
            # Attach the remaining tags (topic, li, figure, table, ...)
            # under their nearest preceding subsection/paragraph.
            for _boxes in self.classified:
                if (_boxes.tag == "footer"):
                    None
                elif (_boxes.tag == "page_number"):
                    None
                elif (_boxes.tag == "?"):
                    None
                elif (_boxes.tag == "topic"):
                    _prev_subsection = find_prev_with_tag(_boxes, "subsection")
                    try:
                        self.tree.create_node(_boxes.get_text(), _boxes.key, parent=_prev_subsection.key, data=_boxes)
                    except:
                        self.tree.create_node(_boxes.get_text(), _boxes.key + '1', parent=_prev_subsection.key, data=_boxes)
                elif _boxes.tag != "header" and _boxes.tag != "paragraph" and _boxes.tag != "section" and _boxes.tag != "subsection":
                    _prev_paragraph = find_prev_with_tag(_boxes, "paragraph")
                    try:
                        self.tree.create_node(_boxes.get_text(), _prev_paragraph.key + "." + _boxes.key, parent=_prev_paragraph.key, data=_boxes)
                    except:
                        self.tree.create_node(_boxes.get_text(), _prev_paragraph.key + "." + _boxes.key + '1', parent=_prev_paragraph.key, data=_boxes)

        def find_prev_with_tag(item, tag):
            # Nearest box tagged `tag` before `item` in self.classified;
            # if none precedes it, the first one after it; '' if none at all.
            _prev = ''
            _next = False
            for _boxes in self.classified:
                if (_boxes.tag == tag):
                    _prev = _boxes
                    if (_next):
                        break
                if (_boxes == item):
                    if (_prev == ''):
                        _next = True
                    else:
                        break
            return _prev

        def get_node_id(node):
            return node.identifier

        def render(node):
            """Recursively emit XML for a tree node and its children."""
            tag = ''
            item = node.data
            # Root and placeholder nodes may carry no LTTextBox data; they
            # fall through with tag == '' and only recurse.
            if isinstance(item, LTTextBox):
                wmode = ''
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                tag = item.tag
                if (tag == "header"):
                    if (not self.headerExist):
                        self.write_tab()
                        self.write('<document title="%s">\n' % item.get_text())
                        self.num_tabs = self.num_tabs + 1
                        self.headerExist = True
                elif (tag == "paragraph"):
                    self.write_tab()
                    self.write('<paragraph key="%s">\n' % item.get_key())
                    self.num_tabs = self.num_tabs + 1
                    self.write_tab()
                    self.write("<p>" + item.get_text().replace('\n', ' ').lstrip().rstrip() + "</p>\n")
                elif (tag == "commentary"):
                    self.write_tab()
                    self.write('<commentary title="COMMENT:">')
                    self.write(item.get_text().replace('COMMENT:', '').lstrip())
                    self.write('</commentary>\n')
                elif (tag == "topic"):
                    self.write_tab()
                    self.write('<topic>')
                    self.write(item.get_text())
                    self.write('</topic>\n')
                elif (tag == "section"):
                    self.write_tab()
                    self.write('<section key="%s" title="%s">\n' % (item.get_key(), item.get_text()))
                    self.num_tabs = self.num_tabs + 1
                elif (tag == "subsection"):
                    self.write_tab()
                    self.write('<subsection key="%s" title="%s">\n' % (item.get_key(), item.get_text()))
                    self.num_tabs = self.num_tabs + 1
                elif (tag == "li"):
                    # Open an <ol> lazily for the first list item in a run.
                    if (not self.in_li):
                        self.write_tab()
                        self.write('<ol>\n')
                        self.num_tabs = self.num_tabs + 1
                        self.in_li = True
                    self.write_tab()
                    if (item.list_tag):
                        self.write('<li key="%s">' % node.identifier)
                        self.write(item.get_text())
                        self.write('</li>\n')
                    else:
                        self.write('<li>')
                        self.write(item.get_text())
                        self.write('</li>\n')
                elif (tag == "footer"):
                    None
                elif (tag == "page_number"):
                    None
                elif (tag == "?"):
                    None
                else:
                    None
            # Recurse into children; sections/subsections sort children by id.
            branches = self.tree.is_branch(node.identifier)
            _branches = []
            for child in branches:
                _branches.append(self.tree.get_node(child))
            if (tag == "section" or tag == "subsection"):
                _branches.sort(key=get_node_id, reverse=False)
            for _child in _branches:
                render(_child)
            # Close a pending <ol> once we leave the run of list items.
            if (tag != "li" and tag != "footer" and tag != "page_number" and tag != "?" and self.in_li):
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</ol>\n')
                self.in_li = False
            # Close the element opened above, after the children.
            if (tag == "paragraph"):
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</paragraph>\n')
            elif (tag == "header"):
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</document>\n')
            elif (tag == "section"):
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</section>\n')
            elif (tag == "subsection"):
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</subsection>\n')

        def highlights(item):
            # Rebuild the text of `item` with <b>/<i> markup inferred from
            # each character's font name; tags close on style changes and
            # at newlines.  (Note: appears unused in this chunk.)
            s = ''
            prev_bold = False
            prev_italic = False
            for child in item:
                if isinstance(child, LTChar):
                    if 'Bold' in child.fontname:
                        if prev_italic:
                            s += '</i>'
                        if not prev_bold:
                            s += '<b>'
                        prev_bold = True
                        prev_italic = False
                    elif 'Italic' in child.fontname:
                        if prev_bold:
                            s += '</b>'
                        if not prev_italic:
                            s += '<i>'
                        prev_italic = True
                        prev_bold = False
                    else:
                        if prev_bold:
                            s += '</b>'
                        elif prev_italic:
                            s += '</i>'
                        prev_bold = False
                        prev_italic = False
                    s += child.get_text()
                elif isinstance(child, LTTextLine):
                    s += highlights(child)
                elif isinstance(child, LTTextBox):
                    s += highlights(child)
                elif isinstance(child, NLPTextBox):
                    s += highlights(child)
                else:
                    if child.get_text() == '\n':
                        if prev_bold:
                            s += '</b>'
                        elif prev_italic:
                            s += '</i>'
                        prev_bold = False
                        prev_italic = False
                    s += child.get_text()
            return s

        # Main page flow: merge, order, classify, build the tree, render it.
        self.textboxes = group_textboxes(self.items)
        self.textboxes.sort(key=get_id, reverse=False)
        for item in self.textboxes:
            classify(item)
        into_tree()
        self.tree.show()
        render(self.tree.get_node("documents"))
        return

    def draw_layout(self, input_path, output_path):
        """Debug view: rasterize page 1 and draw each text box with OpenCV."""
        #init cv2
        pages = convert_from_path(input_path, 500)
        pages[0].save(output_path, 'JPEG')
        page1 = cv2.imread(output_path)
        page1_disp = page1
        # Downscale 8x for the on-screen copy.
        for i in range(3):
            page1_disp = cv2.pyrDown(page1_disp)
        height, width, channels = page1.shape
        #print(width, height)
        #print(height)
        # PDF points -> image pixels; flip y (image origin is top-left).
        scale = height/int(self.page_height)
        for item in self.textboxes:
            if isinstance(item, LTTextBox) or isinstance(item, LTChar):
                #render cv2
                start = (int(item.x0 * scale), (height - int(item.y0 * scale)))
                end = (int(item.x1 * scale), (height - int(item.y1 * scale)))
                #print(start , end)
                color = (0, 0, 255)
                thickness = 5
                page1 = cv2.rectangle(page1, start, end, color, thickness)
            else:
                assert False, str(('Unhandled', item))
        page1 = cv2.rectangle(page1, (40,40), (50,50), (0,0,255), 2)
        boxed_disp = page1
        for i in range(3):
            boxed_disp = cv2.pyrDown(boxed_disp)
        while True:
            cv2.imshow('page', page1_disp)
            cv2.imshow('boxed', boxed_disp)
            #exit on ESC
            k = cv2.waitKey(30) & 0xFF
            if k == 27:
                break
        cv2.destroyAllWindows()

    def close(self):
        # Finish the XML document when pdfminer closes the device.
        self.write_footer()
        return
class Route(object):
    """Builds a route tree through `universe`, at most max_hops deep.

    Nodes are treelib entries keyed by system name; node data is the
    system object returned by universe.findSystem().
    """
    def __init__(self, universe):
        self.route = Tree()
        self.universe = universe
        # Maximum depth of the route tree (and branching cap per node).
        self.max_hops = 4

    def show(self):
        self.route.show()

    def asString(self):
        # Depth-first, comma-separated list of node labels.
        return (','.join([self.route[node].tag for node in self.route.expand_tree(mode=Tree.DEPTH)]))

    def getRoute(self):
        return self.route

    def byScore_key(self, s):
        # Sort key for candidate systems.
        return s.score

    def findRoute(self, start):
        """Build and return the route tree rooted at system `start`."""
        parent = self.universe.findSystem(start)
        self.route.create_node(start, start, data=parent)
        systems = self.findNextSystems(start, start)
        self.buildRoute(systems, start)
        return self.route

    def buildRoute(self, systems, parent):
        """Recursively attach candidate systems under `parent`."""
        for s in systems:
            n = s.name
            h = 0
            if (self.route.contains(n) == False):
                self.route.create_node(n, n, parent=parent, data=s)
                hop = h + self.route.depth(n)
                if (hop < self.max_hops):
                    sub_systems = self.findNextSystems(parent, n)
                    self.buildRoute(sub_systems, n)
            else:
                # Name already used somewhere in the tree: store this visit
                # under a "parent --> name" alias so the id stays unique.
                n = parent + ' --> ' + n
                self.route.create_node(n, n, parent=parent, data=s)

    def getSystemId(self, name, i=0):
        """Return `name`, suffixed with (i) until it is unused in the tree.

        NOTE(review): the recursive call drops `i`, so the counter restarts
        at 1 every level and collisions chain as name(1)(1)... — probably
        intended to be name(2); confirm before relying on the numbering.
        """
        if (self.route.contains(name) == False):
            return name
        else:
            i += 1
            n = name + '(' + str(i) + ')'
            return self.getSystemId(n)

    def findNextSystems(self, parent, start):
        """Best (up to max_hops) reachable systems from `start`, excluding
        `parent` and permit-locked systems, ordered by score."""
        systems = []
        optimal = self.universe.distances.findOptimalSystems(start)
        for s in sorted(set(optimal)):
            if (s != parent):
                i = self.universe.findSystem(s)
                if (i.permit == False):
                    systems.append(i)
        s = sorted(systems, key = self.byScore_key)
        return s[:self.max_hops]


# http://xiaming.me/treelib/examples.html
#
# class SystemTree(object):
#     def __init__(self):
#         self.tree = Tree()
#
#     def addNode(self, id, o):
#         self.tree.create_node(o, id)
#
#     def addChildNode(self, p, id, o):
#         self.tree.create_node(o, id, parent=p)
#
#     def getNode(self, id):
#         return self.tree.subtree(id)
#
#     def __repr__(self):
#         return self.tree.to_json(with_data=True)
#
#
# t = SystemTree()
# t.addNode('Aerial', 'Aerial')
# t.addChildNode('Aerial', 'Jotun', 'Jotun')
# t.addChildNode('Jotun', 'Rusani', 'Rusani')
# n = t.getNode('Jotun')
# print(n)
# n = t.tree.contains('Invalid')
# print(n)
# t.tree.show()
# Build a domain / sub-domain tree from a list of URLs, then render it
# with Graphviz (dot) and ImageMagick (convert).
#
# Fixes over the original:
#   * the final subprocess.call was left unterminated (syntax error);
#     completed per the adjacent "convert ... test.png" comment;
#   * the input file is opened with `with` (no leak, no shadowing of the
#     `file` builtin);
#   * duplicate URLs no longer crash create_node with a duplicated-id
#     error (guarded with tree.contains).
import os
import subprocess

path = ''  # path to list of domains and sub-domians

tree = Tree()
tree.create_node("name of root node", "ID of root node")  # root node

with open(path) as url_file:
    lines = url_file.read().splitlines()

for url in lines:
    extracted = tldextract.extract(url)
    domain = extracted.domain
    subdomain = extracted.subdomain
    if not tree.contains(domain):
        tree.create_node(domain, domain, parent="ID of root node")  # Add domains to root node
    if subdomain and not tree.contains(subdomain + domain):
        tree.create_node(subdomain, subdomain + domain, parent=domain)  # Add sub-domains to domain node

tree.show(line_type="ascii-emv")  # show data as stdout
tree.to_graphviz(filename="tree_graphviz")  # dump tree as graphviz

# dot xxx -Tps -o test.ps -Grankdir=LR    (left to right)
subprocess.call(["dot", "tree_graphviz", "-Tps", "-o", "output.ps", "-Grankdir=LR"])  # Grankdir=LR builds the tree from left to right
# convert -flatten -density 150 -geometry 100% test.ps test.png
subprocess.call(["convert", "-flatten", "-density", "150", "-geometry", "100%", "output.ps", "output.png"])
class OntoTypes:
    """Loads an ontology with rdflib and exposes its subclass hierarchy."""

    def __init__(self, tcu="http://www.w3.org/2002/07/owl#Thing"):
        # URI of the class the traversal starts from (owl:Thing by default).
        self.top_class_uri = tcu
        self.tree = Tree()

    # TODO
    def get_dbo_type_levels_dict(self, path, top_class_uri):
        """Return the set of all classes reachable from top_class_uri via
        rdfs:subClassOf (inclusive), from the ontology file at `path`."""
        logger.info("Starting get_dbo_types_dict from %s, topclass=%s" % (path, top_class_uri))
        # http://rdflib.readthedocs.io/en/stable/intro_to_sparql.html
        g = rdflib.Graph()
        # ... add some triples to g somehow ...
        g.parse(path)
        orgClass = rdflib.term.URIRef(top_class_uri)
        class_list = [orgClass]
        stack = [orgClass]
        # Depth-first traversal of the subclass relation.
        while stack:
            currentClass = stack.pop()
            logger.debug("Current class %s" % currentClass)
            subclasses = [s for s, p, o in g if p == rdflib.RDFS.subClassOf and o == currentClass]
            if subclasses:
                stack = stack + subclasses
                class_list = class_list + subclasses
                # print(subclasses)
        # De-duplicate: a class can be reached through several superclasses.
        class_list = set(class_list)
        return class_list

    # get all parent types from a given one
    def get_subclassesof_tree(self, ifile):
        """Populate self.tree with the subclass hierarchy found in `ifile`,
        rooted at self.top_class_uri."""
        # http://rdflib.readthedocs.io/en/stable/intro_to_sparql.html
        g = rdflib.Graph()
        g.parse(ifile)
        orgClass = rdflib.term.URIRef(self.top_class_uri)
        stack = [orgClass]
        self.tree.create_node(orgClass,orgClass)
        while stack:
            currentClass = stack.pop()
            logger.debug("Current class %s" % currentClass)
            subclasses = [s for s, p, o in g if p == rdflib.RDFS.subClassOf and o == currentClass]
            logger.debug("Subclasses: %s" % subclasses)
            if subclasses:
                stack = stack + subclasses
                for sc in subclasses:
                    if not self.tree.contains(sc):
                        self.tree.create_node(sc, sc, parent=currentClass)
                    else:
                        # Class already placed elsewhere (multiple inheritance):
                        # insert a "_bis" alias under this parent instead.
                        # NOTE(review): only one collision per class survives —
                        # a third parent would hit a duplicated "_bis" id.
                        sc_bis = str(sc) + "_bis"
                        self.tree.create_node(sc_bis, sc_bis, parent=currentClass)
df = df['data'] pathLength = 0 orderedList = list() #list is ordered by length of the path while len(orderedList)!=329: #329 is the total number of categories (use CTRL + F and count the number of "id" instances) pathLength+=1 for i in df: if len(i['path'])==pathLength: orderedList.append(i) for c in orderedList: if fbTree.contains(c['name']): continue if len(c['path'])==1: fbTree.create_node(c['name'], c['name'], "root") else: fbTree.create_node(c['name'],c['name'], c['path'][-2]) fbTree.show() ''' create buttons for root's children <collapse rootChildren>