def build_tree1(tup3):
    """Build a treelib ``Tree`` from a root identifier and a list of paths.

    Args:
        tup3: ``(root_id, path_list)``. ``root_id`` becomes the identifier of
            the root node (tagged ``'Root'``). ``path_list`` is a list of
            paths, each a list of segment tags ordered from the root outwards.

    Returns:
        The populated ``Tree``. The segment at position ``j`` of path ``i``
        is stored under the identifier ``<tag>_<i><j>`` and attached to the
        node created for the previous segment of the same path (or to the
        root for ``j == 0``).
    """
    root_id = tup3[0]
    path_list = tup3[1]

    tree = Tree()
    tree.create_node('Root', root_id)

    for i, path in enumerate(path_list):
        for j, segment in enumerate(path):
            if segment == []:
                # Empty placeholder segment: nothing to add.
                continue

            if j == 0:
                parent_id = root_id
            else:
                # The parent is the node created for the previous segment of
                # this same path, so it carries that segment's suffix.
                parent_id = path[j - 1] + "_" + str(i) + str(j - 1)

            # NOTE(review): is_branch() returns child identifiers, which carry
            # the "_<i><j>" suffix, so this only skips a segment whose tag
            # already equals an existing child identifier.
            if segment not in tree.is_branch(parent_id):
                # Previously this used an undefined ``count`` for the id and
                # the bare parent tag (not its identifier) as the parent.
                node_id = segment + "_" + str(i) + str(j)
                tree.create_node(segment, node_id, parent=parent_id)
    return tree
def nexus_from_dictionary(output_fname, md, show_tree=False):
    """Write a dictionary of NeXus paths to a NeXus/HDF5 file.

    Args:
        output_fname: name of the output file; an existing file is
            overwritten. ``'.nx'`` is appended unless the name already ends
            in ``'.h5'``, ``'hdf'`` or ``'.nx'``.
        md: dictionary holding the data; its keys are '/'-separated paths
            as defined by the NeXus file format.
        show_tree: when true, print the hierarchy tree before writing.

    Returns:
        None
    """
    # Nothing to write for an empty dictionary.
    if not md:
        return

    nx_tree = Tree()
    nx_tree.create_node('root', '/')

    for path in md:
        # Drop the empty string that precedes the leading '/'.
        parts = path.split('/')[1:]
        for depth, name in enumerate(parts):
            node_id = '/' + '/'.join(parts[:depth + 1])
            parent_id = '/' + '/'.join(parts[:depth])
            # Create the branch only if it is not there yet.
            if node_id in nx_tree.is_branch(parent_id):
                continue
            nx_tree.create_node(name, node_id, parent=parent_id)

    if show_tree:
        nx_tree.show()

    if not output_fname.endswith(('.h5', 'hdf', '.nx')):
        output_fname += '.nx'

    with h5py.File(output_fname, 'w') as f:
        create_nx_worker(f, nx_tree, '/', md)
def FpGrowth(fName):
    """Read the transactions in ``fName``, build an FP-tree and print it.

    Every transaction in ``globOriginalList`` is reordered by descending
    item priority (as given by ``priorityDic``) and inserted into the tree
    starting from the root node.
    """
    readFile(fName)
    size_one_sets = getSizeOneItemSet(globOriginalList)
    priorityDict = priorityDic(size_one_sets)

    tree = Tree()
    tree.create_node("{}", "root")

    # Identifier source for nodes whose item already exists elsewhere in the
    # tree and therefore cannot reuse the item itself as identifier.
    next_duplicate_id = 0

    for transaction in globOriginalList:
        priorities = {item: priorityDict.get(item) for item in transaction}
        # Ascending sort followed by a reversal (not ``reverse=True``) so the
        # relative order of equal priorities stays as it was.
        ordered = sorted(priorities.items(), key=operator.itemgetter(1))[::-1]

        # Insert the reordered transaction, walking down from the root.
        current = "root"
        for entry in ordered:
            item = entry[0]
            if not tree.contains(item):
                # First occurrence anywhere: the item is its own identifier.
                tree.create_node(item, item, current, 0)
                current = item
            elif item in tree.is_branch(current):
                # Already a child of the current node: just descend.
                current = item
            else:
                # Exists, but not under this branch: add a duplicate node.
                tree.create_node(item, next_duplicate_id, current, 0)
                current = next_duplicate_id
                next_duplicate_id += 1

    tree.show()
# Demo fragment: relies on `tree`, `sep` and `Tree` being defined earlier.

# Print the tag of every node, traversing the tree depth-first.
print(sep + "All family members in DEPTH mode:")
for node in tree.expand_tree(mode=Tree.DEPTH):
    print(tree[node].tag)

# Show the tree with identifiers visible, filtering out the node "diane".
print(sep + "All family members without Diane sub-family:")
tree.show(idhidden=False, filter=lambda x: x.identifier != "diane")
# for node in tree.expand_tree(filter=lambda x: x.identifier != 'diane', mode=Tree.DEPTH):
#     print tree[node].tag

# Extract the subtree rooted at "diane" and show it on its own.
print(sep + "Let me introduce Diane family only:")
sub_t = tree.subtree("diane")
sub_t.show()

# is_branch() yields child identifiers; look each one up to print its tag.
print(sep + "Children of Diane")
for child in tree.is_branch("diane"):
    print(tree[child].tag)

# Build a small three-node tree and paste it under the node "jill".
print(sep + "OOhh~ new members join Jill's family:")
new_tree = Tree()
new_tree.create_node("n1", 1)  # root node
new_tree.create_node("n2", 2, parent=1)
new_tree.create_node("n3", 3, parent=1)
tree.paste("jill", new_tree)
tree.show()

# Remove node 1, i.e. the root of the tree that was just pasted.
print(sep + "They leave after a while:")
tree.remove_node(1)
tree.show()

print(sep + "Now Jill moves to live with Grand-x-father Harry:")
# Demo fragment: relies on `tree` and `Tree` being defined earlier.
# All output goes through the print() function so the fragment parses on
# Python 3 as well as Python 2 (single-argument calls behave the same).

# Print the tag of every node, traversing the tree depth-first.
for node in tree.expand_tree(mode=Tree.DEPTH):
    print(tree[node].tag)
print('\n')

print("#"*4 + "All family members without Diane sub-family")
# The filter is applied to Node objects, so the identifier has to be
# compared; comparing the node itself to a string never excluded anything.
for node in tree.expand_tree(filter=lambda x: x.identifier != 'diane', mode=Tree.DEPTH):
    print(tree[node].tag)
print('\n')

# Extract the subtree rooted at 'diane' and show it on its own.
print("#"*4 + "Let me introduce Diane family only")
sub_t = tree.subtree('diane')
sub_t.show()
print('\n')

# is_branch() returns the identifiers of the direct children.
print("#"*4 + "Children of Diane")
print(tree.is_branch('diane'))
print('\n')

# Build a small three-node tree and paste it under the node 'jill'.
print("#"*4 + "OOhh~ new members enter Jill's family")
new_tree = Tree()
new_tree.create_node("n1", 1)  # root node
new_tree.create_node("n2", 2, parent=1)
new_tree.create_node("n3", 3, parent=1)
tree.paste('jill', new_tree)
tree.show()
print('\n')

# Remove node 1, i.e. the root of the tree that was just pasted.
print("#"*4 + "We are sorry they are gone accidently :(")
tree.remove_node(1)
tree.show()
print('\n')
class StepParse:
    """Extract the product/assembly hierarchy from a STEP file.

    ``load_step`` collects the relevant entity lines and resolves their
    ``#`` references, ``create_tree`` turns them into a treelib ``Tree`` and
    ``get_levels`` / ``create_lattice`` derive level counts and a networkx
    ``DiGraph`` (with plot positions) from that tree.
    """

    def __init__(self):
        pass

    def _store_held_line(self, line_type, line_hold):
        """Append a completed entity line to the list matching its type."""
        if line_type == 'nauo':
            self.nauo_lines.append(line_hold)
        elif line_type == 'prod_def':
            self.prod_def_lines.append(line_hold)
        elif line_type == 'prod_def_form':
            self.prod_def_form_lines.append(line_hold)
        elif line_type == 'prod':
            self.prod_lines.append(line_hold)

    def load_step(self, step_filename):
        """Read ``step_filename`` and collect all assembly references.

        Populates the ``*_lines`` and ``*_refs`` lists, the parent/child
        reference lists, the part/assembly/root reference sets and
        ``part_dict`` (PRODUCT_DEFINITION reference -> product name).
        """
        self.nauo_lines = []
        self.prod_def_lines = []
        self.prod_def_form_lines = []
        self.prod_lines = []
        self.filename = os.path.splitext(step_filename)[0]

        line_hold = ''
        line_type = ''

        # Find all search lines
        with open(step_filename) as f:
            for line in f:
                # TH: an entity may be wrapped over several physical lines, so
                # a line is held until the next indexed line shows it is
                # complete; unindexed lines are appended to the held one.
                index = re.search("#(.*)=", line)
                if index:
                    # Start of a new entity: the held one is complete.
                    if line_hold:
                        self._store_held_line(line_type, line_hold)
                        line_hold = ''
                        line_type = ''

                    if 'NEXT_ASSEMBLY_USAGE_OCCURRENCE' in line:
                        line_hold = line.rstrip()
                        line_type = 'nauo'
                    elif ('PRODUCT_DEFINITION ' in line
                          or 'PRODUCT_DEFINITION(' in line):
                        line_hold = line.rstrip()
                        line_type = 'prod_def'
                    elif 'PRODUCT_DEFINITION_FORMATION' in line:
                        line_hold = line.rstrip()
                        line_type = 'prod_def_form'
                    elif ('PRODUCT ' in line or 'PRODUCT(' in line):
                        line_hold = line.rstrip()
                        line_type = 'prod'
                elif 'ENDSEC;' in line:
                    # TH: end of section, so store whatever is still held
                    if line_hold:
                        self._store_held_line(line_type, line_hold)
                        line_hold = ''
                        line_type = ''
                else:
                    # TH: continuation of the held (wrapped) line
                    line_hold = line_hold + line.rstrip()

        # A file that ends without a closing 'ENDSEC;' would otherwise lose
        # its last held entity.
        if line_hold:
            self._store_held_line(line_type, line_hold)

        self.nauo_refs = []
        self.prod_def_refs = []
        self.prod_def_form_refs = []
        self.prod_refs = []

        # TH: ',' and '=' are replaced with spaces to make the split easier
        # when there are no spaces in between the words.
        # Find all (# hashed) line references and product names
        # TH: it might be worth finding a different way of extracting the data
        # we do want rather than fixes to get rid of the data we don't
        for el_ in self.nauo_lines:
            self.nauo_refs.append([
                el.rstrip(',')
                for el in el_.replace(",", " ").replace("=", " ").split()
                if el.startswith('#')
            ])

        for el_ in self.prod_def_lines:
            self.prod_def_refs.append([
                el.rstrip(',')
                for el in el_.replace(",", " ").replace("=", " ").split()
                if el.startswith('#')
            ])

        for el_ in self.prod_def_form_lines:
            self.prod_def_form_refs.append([
                el.rstrip(',')
                for el in el_.replace(",", " ").replace("=", " ").split()
                if el.startswith('#')
            ])

        for j, el_ in enumerate(self.prod_lines):
            self.prod_refs.append([
                el.strip(',')
                for el in el_.replace(",", " ").replace(
                    "(", " ").replace("=", " ").split()
                if el.startswith('#')
            ])
            # The product name is the first single-quoted string on the line.
            self.prod_refs[j].append(el_.split("'")[1])

        # Get first two items in each sublist (as third is shape ref)
        #
        # First item is 'PRODUCT_DEFINITION' ref
        # Second item is 'PRODUCT_DEFINITION_FORMATION <etc>' ref
        self.prod_all_refs = [el[:2] for el in self.prod_def_refs]

        # Match up all references down to level of product name
        for el_ in self.prod_all_refs:

            # Add 'PRODUCT' ref found via the 'PRODUCT_DEFINITION_FORMATION'
            for el in self.prod_def_form_refs:
                if el[0] == el_[1]:
                    el_.append(el[1])
                    break

            # Add name from the matching 'PRODUCT' line
            for el in self.prod_refs:
                if el[0] == el_[2]:
                    el_.append(el[2])
                    break

        # Find all parent and child relationships
        # (2nd and 3rd item in each NAUO sublist)
        self.parent_refs = [el[1] for el in self.nauo_refs]
        self.child_refs = [el[2] for el in self.nauo_refs]

        # Find distinct parts and assemblies via set operations
        self.all_type_refs = set(self.child_refs) | set(self.parent_refs)
        self.ass_type_refs = set(self.parent_refs)
        self.part_type_refs = set(self.child_refs) - set(self.parent_refs)
        # TH: find root node
        self.root_type_refs = set(self.parent_refs) - set(self.child_refs)

        # Create simple parts dictionary (ref + label)
        self.part_dict = {el[0]: el[3] for el in self.prod_all_refs}

    def show_values(self):
        """Print the raw entity lines and the references extracted from them."""
        # TH: basic testing, if needed these could be split up
        print(self.nauo_lines)
        print(self.prod_def_lines)
        print(self.prod_def_form_lines)
        print(self.prod_lines)
        print(self.nauo_refs)
        print(self.prod_def_refs)
        print(self.prod_def_form_refs)
        print(self.prod_refs)

    # HR: the former "create_dict" method was replaced by the ``part_dict``
    # comprehension at the end of ``load_step``.

    def create_tree(self):
        """Build ``self.tree`` from the loaded references, then the levels.

        Node identifiers are consecutive integers starting at 0 for the
        root; ``self.tree_dict`` maps each identifier to its part reference.
        """
        self.tree = Tree()

        # TH: check if there are any parts to make a tree from
        if not self.part_dict:
            return

        # TH: find root node
        root_node_ref = list(self.root_type_refs)[0]

        # HR added part reference as data for later use
        self.tree.create_node(self.part_dict[root_node_ref],
                              0,
                              data={'ref': root_node_ref})

        # TH: each node needs a unique name, so number them while descending.
        # A one-element list lets the nested function update the counter.
        i = [0]
        self.tree_dict = {}
        self.tree_dict[i[0]] = root_node_ref

        def tree_next_layer(parent):
            # Reference of the node whose children are being added.
            root_node = self.tree_dict[parent]
            for line in self.nauo_refs:
                if line[1] == root_node:
                    i[0] += 1
                    self.tree_dict[i[0]] = str(line[2])
                    # HR added part reference as data for later use
                    self.tree.create_node(self.part_dict[line[2]],
                                          i[0],
                                          parent=parent,
                                          data={'ref': str(line[2])})
                    tree_next_layer(i[0])

        tree_next_layer(0)

        self.appended = False

        self.get_levels()

    def get_levels(self):
        """Compute part/assembly counts per node and build the lattice.

        ``self.levels[id]`` holds ``n_p`` (number of parts below the node)
        and ``n_a`` (parts plus assemblies, counting the node itself).
        """
        # Initialise dict and get first level (leaves)
        self.levels = {}
        self.levels_set_p = set()
        self.levels_set_a = set()
        self.leaf_ids = [el.identifier for el in self.tree.leaves()]
        self.all_ids = list(self.tree.nodes)
        self.non_leaf_ids = set(self.all_ids) - set(self.leaf_ids)

        self.part_level = 1

        def do_level(tree_level):
            # Get all nodes within this level
            node_ids = [
                el for el in self.tree.nodes
                if self.tree.level(el) == tree_level
            ]
            for el in node_ids:
                # If leaf, then n_p = 1 and n_a = 1
                if el in self.leaf_ids:
                    self.levels[el] = {}
                    self.levels[el]['n_p'] = self.part_level
                    self.levels[el]['n_a'] = self.part_level
                # If assembly, then sum parts + assemblies of all children
                else:
                    child_ids = self.tree.is_branch(el)
                    child_sum_p = 0
                    child_sum_a = 0
                    for el_ in child_ids:
                        child_sum_p += self.levels[el_]['n_p']
                        child_sum_a += self.levels[el_]['n_a']
                    self.levels[el] = {}
                    self.levels[el]['n_p'] = child_sum_p
                    self.levels[el]['n_a'] = child_sum_a + 1
                    self.levels_set_p.add(child_sum_p)
                    self.levels_set_a.add(child_sum_a + 1)

        # Go up through tree levels (deepest first, so children are always
        # done before their parent) and populate lattice level dict
        for i in range(self.tree.depth(), -1, -1):
            do_level(i)

        self.create_lattice()

        self.levels_p_sorted = sorted(self.levels_set_p)
        self.levels_a_sorted = sorted(self.levels_set_a)

        # Function to return dictionary of item IDs for each lattice level
        def get_levels_inv(list_in, key):
            # Initialise
            levels_inv = {}
            levels_inv[self.part_level] = []
            for el in list_in:
                levels_inv[el] = []
            for k, v in self.levels.items():
                levels_inv[v[key]].append(k)
            return levels_inv

        self.levels_p_inv = get_levels_inv(self.levels_p_sorted, 'n_p')
        self.levels_a_inv = get_levels_inv(self.levels_a_sorted, 'n_a')

    def get_all_children(self, id_):
        """Return the identifiers of all descendants of ``id_``.

        The result is in breadth-first order: direct children first, then
        their children, and so on.
        """
        descendants = [el.identifier for el in self.tree.children(id_)]
        # ``descendants`` doubles as the work queue: each visited entry
        # appends its own children to the end of the list.
        position = 0
        while position < len(descendants):
            children = self.tree.children(descendants[position])
            descendants.extend(el.identifier for el in children)
            position += 1
        return descendants

    def create_lattice(self):
        """Build ``self.g`` (a ``DiGraph``) from the tree and set positions.

        Every node gets ``parent``, ``label`` and ``colour`` attributes and
        a ``pos`` tuple; edges point from child to parent.
        """
        # Create lattice
        self.g = nx.DiGraph()
        self.default_colour = 'r'

        # Get root node and set parent to -1 to maintain data type of "parent"
        node_id = self.tree.root
        label_text = self.tree.get_node(node_id).tag
        self.g.add_node(node_id,
                        parent=-1,
                        label=label_text,
                        colour=self.default_colour)

        # Do nodes from treelib "nodes" dictionary
        for key in self.tree.nodes:
            # Exclude root
            if key != self.tree.root:
                parent_id = self.tree.parent(key).identifier
                label_text = self.tree.get_node(key).tag
                # Node IDs same as for tree
                self.g.add_node(key,
                                parent=parent_id,
                                label=label_text,
                                colour=self.default_colour)

        # Do edges from nodes
        for key in self.tree.nodes:
            # Exclude root
            if key != self.tree.root:
                parent_id = self.tree.parent(key).identifier
                self.g.add_edge(key, parent_id)

        # Escape if only one node
        # HR 6/3/20 QUICK BUG FIX: SINGLE-NODE TREE DOES NOT PLOT
        # IMPROVE LATER; SHOULD BE PART OF A GENERAL METHOD
        if self.tree.size() == 1:
            id_ = [el.identifier for el in self.tree.leaves()]
            self.g.nodes[id_[-1]]['pos'] = (0, 0)
            return

        # Get set of parents of leaf nodes
        leaf_parents = {self.tree.parent(el).identifier
                        for el in self.leaf_ids}

        # For each leaf_parent, set position of leaf nodes sequentially
        i = 0
        no_leaves = len(self.tree.leaves())
        current_leaf_ids = {el.identifier for el in self.tree.leaves()}
        for el in leaf_parents:
            for el_ in self.tree.is_branch(el):
                if el_ in current_leaf_ids:
                    self.g.nodes[el_]['pos'] = ((i / (no_leaves)), 1)
                    i += 1

        # To set plot positions of nodes from lattice levels
        # ---
        # Traverse upwards from leaves
        for el in sorted(self.levels_set_a):
            # Get all nodes at that level
            node_ids = [k for k, v in self.levels.items() if v['n_a'] == el]
            # Get all positions of children of that node
            # and set position as mean value of them
            for el_ in node_ids:
                child_ids = self.tree.is_branch(el_)
                pos_sum = 0
                for el__ in child_ids:
                    pos_ = self.g.nodes[el__]['pos'][0]
                    pos_sum += pos_
                pos_sum = pos_sum / len(child_ids)
                self.g.nodes[el_]['pos'] = (pos_sum, el)

    def print_tree(self):
        """Show the tree, building it first if it does not exist yet."""
        try:
            self.tree.show()
        except AttributeError:
            # ``self.tree`` has not been created yet.
            self.create_tree()
            self.tree.show()

    def tree_to_json(self, save_to_file=False, filename='file', path=''):
        """Return the tree as a JSON string, optionally saving it to a file.

        Args:
            save_to_file: when true, also write ``<path>/<filename>.json``.
            filename: file name without the ``.json`` extension.
            path: target directory; empty means the current directory.

        Returns:
            The JSON string, or None (after printing a notice) when the
            tree is empty.
        """
        # TH: return json format tree, can also save to file
        if self.tree.size() == 0:
            print("no tree to print")
            return

        data = self.tree.to_json()
        if save_to_file:
            j = json.loads(data)
            file_path = os.path.join(path, filename) if path else filename
            with open(file_path + '.json', 'w') as outfile:
                json.dump(j, outfile)
        return data
# Demo fragment: relies on `tree`, `sep` and `Tree` being defined earlier.

# Print the tag of every node.
# NOTE(review): the banner says DEPTH but the traversal below uses
# Tree.ZIGZAG — confirm which of the two is intended.
print(sep + "All family members in DEPTH mode:")
for node in tree.expand_tree(mode=Tree.ZIGZAG):
    print(tree[node].tag)

# Show the tree with identifiers visible, filtering out the node 'diane'.
print(sep + "All family members without Diane sub-family:")
tree.show(idhidden=False, filter=lambda x: x.identifier != 'diane')
# for node in tree.expand_tree(filter=lambda x: x.identifier != 'diane', mode=Tree.DEPTH):
#     print tree[node].tag

# Extract the subtree rooted at 'diane' and show it on its own.
print(sep + "Let me introduce Diane family only:")
sub_t = tree.subtree('diane')
sub_t.show()

# is_branch() yields child identifiers; look each one up to print its tag.
print(sep + "Children of Diane")
for child in tree.is_branch('diane'):
    print(tree[child].tag)

# Build a small three-node tree and paste it under the node 'jill'.
print(sep + "OOhh~ new members join Jill's family:")
new_tree = Tree()
new_tree.create_node("n1", 1)  # root node
new_tree.create_node("n2", 2, parent=1)
new_tree.create_node("n3", 3, parent=1)
tree.paste('jill', new_tree)
tree.show()

# Remove node 1, i.e. the root of the tree that was just pasted.
print(sep + "They leave after a while:")
tree.remove_node(1)
tree.show()

print(sep + "Now Jill moves to live with Grand-x-father Harry:")
if __name__ == '__main__':
    # Earlier manual experiments, kept for reference:
    # str1 = "www.sasac.gov.cn/n2588025/n2588119/index.html?t=1573435313677"
    # pa = resolution_line(str1)
    # print(pa)
    #####
    # tup1 = (u'tybb.mof.gov.cn',[[u'printasyncwork'], [u'dnaserver'],[],[u'dnaserver'],[u'printasyncwork'],[u'printasyncwork']])
    # tup2 = (u'www.nkj.moa.gov.cn', [[u'ggzt', u''], [u'dwhz', u''], [u'tongji']])
    # list_path = duplicate_removal(tup1)
    # print(list_path)
    # print(count_gov_url(tup2))
    ###
    # tree = build_tree()
    # tree.show()
    #
    # print(tree.contains("child4"))
    # print(tree.is_branch("child1"))
    # list = ["aa","bb","c",0]
    # print(list.__contains__("aa"))
    #########################################
    # Sample (root, paths) input; only used by the commented-out call below.
    tup3 = ("gov", [["1", "2"], ["2", "4"]])
    # tree1 = build_tree1(tup3)
    # tree1.show()

    # Build a three-level chain 0 -> 1 -> 2 by hand.
    tree = Tree()
    tree.create_node("gov", 0)
    tree.create_node("122", 1, parent=0)
    tree.create_node("2222", 2, parent=1)
    # Print the identifiers of the root's direct children.
    print(tree.is_branch(0))
    # print(tree.)
    tree.show()
# Demo fragment: relies on `tree` and `Tree` being defined earlier.
# All output goes through the print() function so the fragment parses on
# Python 3 as well as Python 2 (single-argument calls behave the same).

# Show the tree with identifiers visible, filtering out the node 'diane'.
print("#"*4 + "All family members without Diane sub-family")
tree.show(idhidden=False, filter=lambda x: x.identifier != 'diane')
# for node in tree.expand_tree(filter=lambda x: x.identifier != 'diane', mode=Tree.DEPTH):
#     print(tree[node].tag)
print('\n')

# Extract the subtree rooted at 'diane' and show it on its own.
print("#"*4 + "Let me introduce Diane family only")
sub_t = tree.subtree('diane')
sub_t.show()
print('\n')

# is_branch() returns the identifiers of the direct children.
print("#"*4 + "Children of Diane")
print(tree.is_branch('diane'))
print('\n')

# Build a small three-node tree and paste it under the node 'jill'.
print("#"*4 + "OOhh~ new members enter Jill's family")
new_tree = Tree()
new_tree.create_node("n1", 1)  # root node
new_tree.create_node("n2", 2, parent=1)
new_tree.create_node("n3", 3, parent=1)
tree.paste('jill', new_tree)
tree.show()
print('\n')

# Remove node 1, i.e. the root of the tree that was just pasted.
print("#"*4 + "We are sorry they are gone accidently :(")
tree.remove_node(1)
# Demo fragment: relies on `tree`, `sep` and `Tree` being defined earlier.

# Print the tag of every node, traversing the tree depth-first.
print(sep + "All family members in DEPTH mode:")
for node in tree.expand_tree(mode=Tree.DEPTH):
    print(tree[node].tag)

# Show the tree with identifiers visible, filtering out the node 'diane'.
print(sep + "All family members without Diane sub-family:")
tree.show(idhidden=False, filter=lambda x: x.identifier != 'diane')
# for node in tree.expand_tree(filter=lambda x: x.identifier != 'diane', mode=Tree.DEPTH):
#     print tree[node].tag

# Extract the subtree rooted at 'diane' and show it on its own.
print(sep + "Let me introduce Diane family only:")
sub_t = tree.subtree('diane')
sub_t.show()

# is_branch() yields child identifiers; look each one up to print its tag.
print(sep + "Children of Diane")
for child in tree.is_branch('diane'):
    print(tree[child].tag)

# Build a small three-node tree and paste it under the node 'jill'.
print(sep + "OOhh~ new members join Jill's family:")
new_tree = Tree()
new_tree.create_node("n1", 1)  # root node
new_tree.create_node("n2", 2, parent=1)
new_tree.create_node("n3", 3, parent=1)
tree.paste('jill', new_tree)
tree.show()

# Remove node 1, i.e. the root of the tree that was just pasted.
print(sep + "They leave after a while:")
tree.remove_node(1)
tree.show()

print(sep + "Now Jill moves to live with Grand-x-father Harry:")
class LegalDocMLconverter(PDFConverter):
    """PDF converter that classifies text boxes and writes a document XML.

    Each page's text boxes are grouped, tagged by a Keras model (header,
    section, paragraph, ...), arranged in a treelib ``Tree`` keyed by the
    boxes' keys and finally rendered as nested XML to ``outfp``.
    """

    CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                 imagewriter=None, stripcontrol=False):
        """Set up the converter state and load model and tokenizer.

        Reads ``data/model.json``, ``data/model.h5`` and
        ``data/tokenizer.pickle`` relative to the working directory and
        writes the XML header to ``outfp``.
        """
        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
                              laparams=laparams)
        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.textboxes = []
        self.page_width = []
        self.page_height = []
        self.classified = []
        self.classified_header = []
        self.classified_paragraph = []
        self.classified_section = []
        self.classified_subsection = []
        self.tree = Tree()
        self.tree.create_node("Documents", 'documents')
        self.num_tabs = 0
        self.write_header()
        self.headerExist = False
        self.in_li = False

        # Context manager so the handle is closed even if reading fails.
        with open('data/model.json', 'r') as json_file:
            loaded_model_json = json_file.read()
        self.model = model_from_json(loaded_model_json)
        self.model.load_weights("data/model.h5")

        self.tokenizer = []
        # NOTE(review): pickle.load can execute arbitrary code; this file
        # must only ever come from a trusted source.
        with open('data/tokenizer.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        return

    def decode_tags(self, pred):
        """Return the tag name whose index is the arg-max of ``pred``."""
        tags = {
            'header': 0,
            'document': 1,
            'paragraph': 2,
            'topic': 3,
            'section': 4,
            'subsection': 5,
            'li': 6,
            'footer': 7,
            'page_number': 8,
            'figure': 9,
            'table': 10,
            'table_li': 11,
            'commentary': 12,
            '?': 13,
        }
        decode = {v: k for k, v in tags.items()}
        return decode[np.argmax(pred)]

    def write(self, text):
        """Write ``text`` to the output, encoded with the codec if set."""
        if self.codec:
            text = text.encode(self.codec)
        self.outfp.write(text)
        return

    def write_header(self):
        """Write the XML declaration and open the ``<documents>`` element."""
        if self.codec:
            self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write('<documents>\n')
        self.num_tabs = 1
        return

    def write_footer(self):
        """Close the ``<documents>`` element."""
        self.write('</documents>\n')
        self.num_tabs = 0
        return

    def write_text(self, text):
        """Write ``text`` through ``enc``, optionally stripping control chars."""
        if self.stripcontrol:
            text = self.CONTROL.sub('', text)
        self.write(enc(text))
        return

    def write_tab(self):
        """Write one tab character per current indentation level."""
        for i in range(self.num_tabs):
            self.write("\t")

    def receive_layout(self, ltpage):
        """Process one laid-out page: group, classify, build tree, render."""
        self.items = []

        def extract_text(item):
            # Collect text boxes and characters, descending into pages and
            # figures; the page size is remembered for draw_layout().
            if isinstance(item, LTPage):
                self.page_width = item.x1
                self.page_height = item.y1
                for child in item:
                    extract_text(child)
            elif isinstance(item, LTFigure):
                for child in item:
                    extract_text(child)
            elif isinstance(item, LTTextBox):
                self.items.append(item)
            elif isinstance(item, LTChar):
                self.items.append(item)

        extract_text(ltpage)

        def get_y0(item):
            return item.y0

        def get_id(item):
            return item.index

        def get_size(item):
            # Size of the first character found in the item
            # (0 for an annotation, None for an empty container).
            if isinstance(item, LTChar):
                return item.size
            elif isinstance(item, LTAnno):
                return 0
            else:
                for child in item:
                    return get_size(child)

        # Top of the page first.
        self.items.sort(key=get_y0, reverse=True)

        def group_textboxes(items):
            # Merge neighbouring items (relative to the size of ``prev``)
            # into one box; everything else starts a new box.
            new_items = []
            prev = items[0]
            for item in items[1:]:
                if isinstance(prev, LTChar):
                    # Wrap a bare character so that it can take children.
                    box = LTTextBox()
                    box.add(prev)
                    box.set_bbox((prev.x0, prev.y0, prev.x1, prev.y1))
                    prev = box
                y_diff = (prev.y0 - item.y1)
                x_diff = (item.x0 - prev.x1)
                if y_diff < get_size(prev)/2 and x_diff < get_size(prev) and x_diff >= -get_size(prev)/2:
                    xs = [item.x0, item.x1, prev.x0, prev.x1]
                    ys = [item.y0, item.y1, prev.y0, prev.y1]
                    prev.add(item)
                    prev.set_bbox((min(xs), min(ys), max(xs), max(ys)))
                elif y_diff < get_size(prev)/2 and (item.x0 - prev.x0) < get_size(prev)/2 and (item.x1 - prev.x1) > -get_size(prev)/2:
                    vert = LTTextBoxVertical()
                    xs = [item.x0, item.x1, prev.x0, prev.x1]
                    ys = [item.y0, item.y1, prev.y0, prev.y1]
                    for child in prev:
                        vert.add(child)
                    vert.add(item)
                    vert.set_bbox((min(xs), min(ys), max(xs), max(ys)))
                    prev = vert
                else:
                    new_items.append(prev)
                    prev = item
            new_items.append(prev)
            return new_items

        def classify(item):
            # Tag one text box with the model and file it under its tag.
            if isinstance(item, LTTextBox):
                box = NLPTextBox(item)
                s = ('%s %d %d %d ' % (bbox2str(box.bbox), box.b, box.i, box.size)
                     + item.get_text().replace('\n', ' '))
                X = self.tokenizer.texts_to_sequences([s])
                maxlen = 100
                X = pad_sequences(X, padding='post', maxlen=maxlen)
                preds = self.model.predict(X)
                tag = self.decode_tags(preds)
                box.set_tag(tag)
                if tag == "header":
                    self.classified_header.append(box)
                elif tag == "paragraph":
                    self.classified_paragraph.append(box)
                elif tag == "section":
                    self.classified_section.append(box)
                elif tag == "subsection":
                    self.classified_subsection.append(box)
                self.classified.append(box)
            else:
                assert False, str(('Unhandled', item))

        def into_tree():
            # Insert the classified boxes into self.tree, creating
            # placeholder sections/subsections where a parent is missing.
            _header = self.classified_header[0]
            self.tree.create_node(_header.get_text(), _header.key,
                                  parent="documents", data=_header)

            for _section in self.classified_section:
                self.tree.create_node(_section.get_text(), _section.key,
                                      parent=_header.key, data=_section)

            for _sebsection in self.classified_subsection:
                # Parent section key: drop the last component, append ".0".
                keys = _sebsection.key.split('.')
                keys.pop()
                _key = '.'.join(keys) + ".0"
                if not self.tree.contains(_key):
                    data = NLPSimpleBox("section", _key)
                    self.tree.create_node(_key, _key, parent=_header.key,
                                          data=data)
                    self.classified.append(data)
                self.tree.create_node(_sebsection.get_text(), _sebsection.key,
                                      parent=_key, data=_sebsection)

            for _paragraph in self.classified_paragraph:
                # Parent subsection key: drop the last component.
                keys = _paragraph.key.split('.')
                keys.pop()
                _key = '.'.join(keys)
                if not self.tree.contains(_key):
                    section_keys = _key.split('.')
                    section_keys.pop()
                    section_key = '.'.join(section_keys) + ".0"
                    if not self.tree.contains(section_key):
                        # NOTE(review): the placeholder box is keyed with
                        # _key, not section_key — confirm this is intended.
                        data = NLPSimpleBox("section", _key)
                        self.tree.create_node(section_key, section_key,
                                              parent=_header.key, data=data)
                        self.classified.append(data)
                    data = NLPSimpleBox("subsection", _key)
                    self.tree.create_node(_key, _key, parent=section_key,
                                          data=data)
                    self.classified.append(data)
                try:
                    self.tree.create_node(_paragraph.get_text(), _paragraph.key,
                                          parent=_key, data=_paragraph)
                except Exception:
                    # Retry under a modified identifier.
                    self.tree.create_node(_paragraph.get_text(),
                                          _paragraph.key + ".0",
                                          parent=_key, data=_paragraph)

            # Merge runs of consecutive commentary boxes into a single box.
            # NOTE(review): a commentary box in first position is merged
            # with itself and a trailing commentary run is never appended —
            # confirm whether that is intended.
            new_classified = []
            prev_box = self.classified[0]
            for _boxes in self.classified:
                if _boxes.tag == "commentary":
                    if prev_box.tag == "commentary":
                        prev_box.text += _boxes.text
                        prev_box.set_tag("commentary")
                    else:
                        prev_box = _boxes
                else:
                    if prev_box.tag == "commentary":
                        new_classified.append(prev_box)
                    prev_box = _boxes
                    new_classified.append(_boxes)
            self.classified = new_classified

            ignored_tags = ("footer", "page_number", "?")
            structural_tags = ("header", "paragraph", "section", "subsection")
            for _boxes in self.classified:
                if _boxes.tag in ignored_tags:
                    continue
                if _boxes.tag == "topic":
                    _prev_subsection = find_prev_with_tag(_boxes, "subsection")
                    try:
                        self.tree.create_node(_boxes.get_text(), _boxes.key,
                                              parent=_prev_subsection.key,
                                              data=_boxes)
                    except Exception:
                        self.tree.create_node(_boxes.get_text(),
                                              _boxes.key + '1',
                                              parent=_prev_subsection.key,
                                              data=_boxes)
                elif _boxes.tag not in structural_tags:
                    # Everything else hangs off the nearest paragraph.
                    _prev_paragraph = find_prev_with_tag(_boxes, "paragraph")
                    try:
                        self.tree.create_node(_boxes.get_text(),
                                              _prev_paragraph.key + "." + _boxes.key,
                                              parent=_prev_paragraph.key,
                                              data=_boxes)
                    except Exception:
                        self.tree.create_node(_boxes.get_text(),
                                              _prev_paragraph.key + "." + _boxes.key + '1',
                                              parent=_prev_paragraph.key,
                                              data=_boxes)

        def find_prev_with_tag(item, tag):
            # Last box tagged ``tag`` at or before ``item``; if there is
            # none, the first such box after it; '' when nothing matches.
            _prev = ''
            _next = False
            for _boxes in self.classified:
                if _boxes.tag == tag:
                    _prev = _boxes
                    if _next:
                        break
                if _boxes == item:
                    if _prev == '':
                        _next = True
                    else:
                        break
            return _prev

        def get_node_id(node):
            return node.identifier

        def render(node):
            # Write the opening markup for the node, recurse into its
            # children, then write the closing markup.
            tag = ''
            item = node.data
            if isinstance(item, LTTextBox):
                tag = item.tag
                if tag == "header":
                    if not self.headerExist:
                        self.write_tab()
                        self.write('<document title="%s">\n' % item.get_text())
                        self.num_tabs = self.num_tabs + 1
                        self.headerExist = True
                elif tag == "paragraph":
                    self.write_tab()
                    self.write('<paragraph key="%s">\n' % item.get_key())
                    self.num_tabs = self.num_tabs + 1
                    self.write_tab()
                    self.write("<p>" + item.get_text().replace('\n', ' ').strip() + "</p>\n")
                elif tag == "commentary":
                    self.write_tab()
                    self.write('<commentary title="COMMENT:">')
                    self.write(item.get_text().replace('COMMENT:', '').lstrip())
                    self.write('</commentary>\n')
                elif tag == "topic":
                    self.write_tab()
                    self.write('<topic>')
                    self.write(item.get_text())
                    self.write('</topic>\n')
                elif tag == "section":
                    self.write_tab()
                    self.write('<section key="%s" title="%s">\n' % (item.get_key(), item.get_text()))
                    self.num_tabs = self.num_tabs + 1
                elif tag == "subsection":
                    self.write_tab()
                    self.write('<subsection key="%s" title="%s">\n' % (item.get_key(), item.get_text()))
                    self.num_tabs = self.num_tabs + 1
                elif tag == "li":
                    if not self.in_li:
                        # First item of a list: open the enclosing <ol>.
                        self.write_tab()
                        self.write('<ol>\n')
                        self.num_tabs = self.num_tabs + 1
                        self.in_li = True
                    self.write_tab()
                    if item.list_tag:
                        self.write('<li key="%s">' % node.identifier)
                        self.write(item.get_text())
                        self.write('</li>\n')
                    else:
                        self.write('<li>')
                        self.write(item.get_text())
                        self.write('</li>\n')
                # footer, page_number, '?' and unknown tags write nothing

            _branches = [self.tree.get_node(child)
                         for child in self.tree.is_branch(node.identifier)]
            if tag == "section" or tag == "subsection":
                _branches.sort(key=get_node_id, reverse=False)
            for _child in _branches:
                render(_child)

            # A non-list element ends a list that is still open.
            if (tag != "li" and tag != "footer" and tag != "page_number"
                    and tag != "?" and self.in_li):
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</ol>\n')
                self.in_li = False

            if tag == "paragraph":
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</paragraph>\n')
            elif tag == "header":
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</document>\n')
            elif tag == "section":
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</section>\n')
            elif tag == "subsection":
                self.num_tabs = self.num_tabs - 1
                self.write_tab()
                self.write('</subsection>\n')

        def highlights(item):
            # Text of ``item`` with <b>/<i> markup derived from font names.
            # (Not called anywhere in this method.)
            s = ''
            prev_bold = False
            prev_italic = False
            for child in item:
                if isinstance(child, LTChar):
                    if 'Bold' in child.fontname:
                        if prev_italic:
                            s += '</i>'
                        if not prev_bold:
                            s += '<b>'
                        prev_bold = True
                        prev_italic = False
                    elif 'Italic' in child.fontname:
                        if prev_bold:
                            s += '</b>'
                        if not prev_italic:
                            s += '<i>'
                        prev_italic = True
                        prev_bold = False
                    else:
                        if prev_bold:
                            s += '</b>'
                        elif prev_italic:
                            s += '</i>'
                        prev_bold = False
                        prev_italic = False
                    s += child.get_text()
                elif isinstance(child, LTTextLine):
                    s += highlights(child)
                elif isinstance(child, LTTextBox):
                    s += highlights(child)
                elif isinstance(child, NLPTextBox):
                    s += highlights(child)
                else:
                    if child.get_text() == '\n':
                        # Close any open markup at the end of a line.
                        if prev_bold:
                            s += '</b>'
                        elif prev_italic:
                            s += '</i>'
                        prev_bold = False
                        prev_italic = False
                    s += child.get_text()
            return s

        self.textboxes = group_textboxes(self.items)
        self.textboxes.sort(key=get_id, reverse=False)
        for item in self.textboxes:
            classify(item)
        into_tree()
        self.tree.show()
        render(self.tree.get_node("documents"))
        return

    def draw_layout(self, input_path, output_path):
        """Show the first page with the detected text boxes outlined.

        Renders page one of ``input_path`` to ``output_path`` as JPEG,
        draws a rectangle per text box and displays the plain and the
        boxed image until ESC is pressed.
        """
        # init cv2
        pages = convert_from_path(input_path, 500)
        pages[0].save(output_path, 'JPEG')
        page1 = cv2.imread(output_path)
        page1_disp = page1
        for i in range(3):
            page1_disp = cv2.pyrDown(page1_disp)
        height, _width, _channels = page1.shape

        # Factor from page coordinates to image pixels.
        scale = height/int(self.page_height)
        for item in self.textboxes:
            if isinstance(item, (LTTextBox, LTChar)):
                # Image rows grow downwards, hence ``height - y``.
                start = (int(item.x0 * scale), (height - int(item.y0 * scale)))
                end = (int(item.x1 * scale), (height - int(item.y1 * scale)))
                color = (0, 0, 255)
                thickness = 5
                page1 = cv2.rectangle(page1, start, end, color, thickness)
            else:
                assert False, str(('Unhandled', item))
        page1 = cv2.rectangle(page1, (40,40), (50,50), (0,0,255), 2)
        boxed_disp = page1
        for i in range(3):
            boxed_disp = cv2.pyrDown(boxed_disp)

        while True:
            cv2.imshow('page', page1_disp)
            cv2.imshow('boxed', boxed_disp)
            # exit on ESC
            k = cv2.waitKey(30) & 0xFF
            if k == 27:
                break
        cv2.destroyAllWindows()

    def close(self):
        """Finish the output by writing the XML footer."""
        self.write_footer()
        return
class DependencyReader:
    """Build a tree of Maven dependencies and feed it to ``dosocs2``.

    Attributes:
        tempDirectoryPath: scratch directory created under the current
            working directory; the pom is copied there by ``getPom``.
        tree: ``Tree`` whose node identifiers are the ids printed by the
            Maven dependency plugin and whose ``data`` are ``DependencyNode``
            objects.
        dependencies: maps a Maven TGF node id to its ``DependencyNode``.
        graphRelationships: ``(parent_id, child_id)`` edges that still have
            to be attached to ``tree``.
    """

    def __init__(self):
        self.tempDirectoryPath = mkdtemp(dir=".")
        self.tree = Tree()
        self.dependencies = {}
        self.graphRelationships = []

    def getPom(self, pomPath):
        """Copy the pom at ``pomPath`` into the scratch directory and chdir there.

        Note that this changes the working directory of the whole process.
        """
        shutil.copy(pomPath, self.tempDirectoryPath)
        os.chdir(self.tempDirectoryPath)

    def getDependencies(self):
        """Run ``mvn dependency:tree`` (TGF output) and record nodes and edges.

        Node lines populate ``self.dependencies`` (the first one also becomes
        the root of ``self.tree``); edge lines are appended to
        ``self.graphRelationships`` for ``relateDependencies`` to resolve.
        """
        # The command is a constant, so running it through the shell carries
        # no injection risk. universal_newlines=True yields text lines on both
        # Python 2 and Python 3, which the str regexes below require.
        mavenTreeOutput = subprocess.Popen(
            'mvn org.apache.maven.plugins:maven-dependency-plugin:RELEASE:tree -DoutputType=tgf',
            stdout=subprocess.PIPE, shell=True, universal_newlines=True)
        try:
            # Iterate until real EOF: readline() returns '' only at end of
            # stream, so a blank line in Maven's output no longer ends parsing.
            for rawLine in iter(mavenTreeOutput.stdout.readline, ''):
                line = rawLine.rstrip()
                if "BUILD SUCCESS" in line:
                    break

                # Node line: "[INFO] <id> <group>:<artifact>:<type>:<version>"
                match = re.match(r"\[INFO\]\s(\d*)\s*(.*):(.*):(\w+):([0-9\.]*)", line)
                if match:
                    nodeId = match.group(1)
                    if nodeId not in self.dependencies:
                        self.dependencies[nodeId] = DependencyNode(
                            match.group(2), match.group(3), match.group(5), nodeId)
                        # The first dependency seen becomes the tree root.
                        if not self.tree.leaves():
                            self.tree.create_node(nodeId, nodeId,
                                                  data=self.dependencies[nodeId])
                        self.dependencies[nodeId].get('jar', self.tempDirectoryPath)

                # Edge line: "[INFO] <parent id> <child id> ..."
                match = re.match(r"\[INFO\]\s(\d*)\s(\d*)", line)
                if match and match.group(2):
                    self.graphRelationships.append((match.group(1), match.group(2)))
        finally:
            # Drain whatever is left and reap the child process.
            mavenTreeOutput.communicate()

    def relateDependencies(self):
        """Attach every pending relationship whose parent is already in the tree.

        Relationships are resolved in passes so that a child listed before its
        parent is still attached once the parent appears.  Relationships whose
        parent never shows up are left in ``self.graphRelationships`` instead
        of spinning forever.
        """
        pending = self.graphRelationships
        while pending:
            unresolved = []
            for parentId, childId in pending:
                if self.tree.get_node(parentId) is None:
                    unresolved.append((parentId, childId))
                    continue
                parent = self.dependencies[parentId]
                child = self.dependencies[childId]
                self.tree.create_node(child.referenceId, child.referenceId,
                                      parent=parent.referenceId, data=child)
            if len(unresolved) == len(pending):
                # A full pass made no progress: the rest can never be attached.
                break
            pending = unresolved
        self.graphRelationships[:] = pending

    def scanDependencies(self):
        """Run ``dosocs2 oneshot`` on the jar of every node, depth first."""
        # Need to run on each package with oneshot to get identifiers,
        # unless dosocs2 is updated to create identifiers on scan.
        for node in self.tree.expand_tree(mode=Tree.DEPTH):
            treeNode = self.tree.get_node(node)
            # Argument list instead of a shell string: jar names are data.
            subprocess.call(['dosocs2', 'oneshot', treeNode.data.jarName])

    def createRelationships(self):
        """Register every parent/child pair of the tree with dosocs2."""
        # Pass packages as relationships to the dosocs2 packagerelate command.
        self.recursiveRelationship(self.tree.root)

    def recursiveRelationship(self, parent):
        """Run ``dosocs2 packagerelate`` for ``parent`` and each descendant pair.

        Args:
            parent: identifier of the tree node whose subtree is walked.
        """
        parentNode = self.tree.get_node(parent)
        for node in self.tree.is_branch(parent):
            childNode = self.tree.get_node(node)
            subprocess.call(['dosocs2', 'packagerelate',
                             parentNode.data.jarName, childNode.data.jarName])
            self.recursiveRelationship(node)

    def retrieve_dependencies(self, jarName):
        """Print the dependency tree that dosocs2 reports for a package.

        Args:
            jarName: package to query; when ``None`` the jar of the root of
                ``self.tree`` is used.

        Returns:
            None.  The tree is printed with ``Tree.show()``; when dosocs2
            reports no relationships a message is printed instead.
        """
        if jarName is None:
            root = self.tree.get_node(self.tree.root).data.jarName
        else:
            root = jarName

        tgfOutput = subprocess.Popen(['dosocs2', 'dependencies', root],
                                     stdout=subprocess.PIPE,
                                     universal_newlines=True)
        tree = Tree()
        dependencies = {}   # int(node id) -> (name, node id as printed)
        relationships = []  # (parent id, child id) as printed
        rootSeen = False
        for line in iter(tgfOutput.stdout.readline, ''):
            match = re.match(r"(\d+) - (.*)", line)
            if match:
                if not rootSeen:
                    # The first node listed is the root of the result tree.
                    rootSeen = True
                    tree.create_node(match.group(2), match.group(1))
                else:
                    # Keep the first entry if an id is listed twice.
                    dependencies.setdefault(int(match.group(1)),
                                            (match.group(2), match.group(1)))
            match = re.match(r"(\d+) (\d+)", line)
            if match:
                relationships.append((match.group(1), match.group(2)))
        tgfOutput.stdout.close()
        tgfOutput.wait()

        if not relationships:
            # Use the resolved name: jarName may be None here.
            print("No child relationships for " + root)
            return None

        # Attach children in passes; stop as soon as a pass attaches nothing
        # so malformed output cannot loop forever.
        while relationships:
            unresolved = []
            progressed = False
            for parentId, childId in relationships:
                if tree.get_node(parentId) is None:
                    unresolved.append((parentId, childId))
                    continue
                dep = dependencies.pop(int(childId), None)
                if dep is None:
                    # Unknown or already-placed child: nothing to attach.
                    continue
                tree.create_node(dep[0], dep[1], parent=parentId)
                progressed = True
            if not progressed:
                break
            relationships = unresolved

        tree.show()
        if jarName is None:
            os.chdir(os.pardir)
# Show the family while filtering out the node identified as "diane".
print("#" * 4 + "All family members without Diane sub-family")
tree.show(idhidden=False, filter=lambda x: x.identifier != "diane")
# The same filter can be used when walking the tree, e.g.:
# for node in tree.expand_tree(filter=lambda x: x.identifier != 'diane', mode=Tree.DEPTH):
#     print(tree[node].tag)
print("\n")

# Extract the subtree rooted at "diane" and show it on its own.
print("#" * 4 + "Let me introduce Diane family only")
sub_t = tree.subtree("diane")
sub_t.show()
print("\n")

# List the identifiers of the direct children of "diane".
# Call-style print keeps this line valid on both Python 2 and Python 3,
# like every other print in this script.
print("#" * 4 + "Children of Diane")
print(tree.is_branch("diane"))
print("\n")

# Build a small three-node tree and paste it under "jill".
print("#" * 4 + "OOhh~ new members enter Jill's family")
new_tree = Tree()
new_tree.create_node("n1", 1)  # root node
new_tree.create_node("n2", 2, parent=1)
new_tree.create_node("n3", 3, parent=1)
tree.paste("jill", new_tree)
tree.show()
print("\n")

# Remove the node with identifier 1, i.e. the root of the pasted tree.
print("#" * 4 + "We are sorry they are gone accidently :(")
tree.remove_node(1)
class RST_DT:
    """Discourse tree loaded from an indented text file, convertible to a dependency tree.

    After ``load``:
        tree:    ``Tree`` whose node identifiers are the 0-based line numbers
                 of the file and whose tags are the parsed line contents.
        treeNS:  ``Tree`` with the same shape whose tags are "Root" for the
                 first line and otherwise the "N"/"S" marker that the parent's
                 label assigns to that child position.
        id_EDUs: line numbers of the leaf lines (``_!...!_`` text spans).
        EDU:     maps a leaf line number to the list of texts found between
                 ``_!`` and ``!_`` on that line.
    """

    # Group 1: label of an internal node, i.e. "(" followed by word
    # characters, "-", "[" or "]".  Group 2: a leaf text span "_!...!_".
    _CONTENT_RE = re.compile(r"\(([\w\-\[\]]+)|(_!.+!_)")
    # Text between the leaf delimiters.
    _EDU_TEXT_RE = re.compile(r"_!(.*)!_")
    # The "[N]" / "[S]" markers inside an internal-node label.
    _NUCLEARITY_RE = re.compile(r"\[(N|S)\]")

    def load(self, path2file):
        """Parse ``path2file`` and (re)build ``tree``, ``treeNS``, ``id_EDUs`` and ``EDU``.

        The nesting level of a line is its number of leading spaces divided
        by two; its parent is the most recent line one level above.

        Args:
            path2file: path of the text file to read.

        Raises:
            IndexError: if a line contains neither a "(label" nor a
                "_!text!_" span.
            ValueError: if a line other than the first sits at nesting
                level 0 (the tree can only have one root).
        """
        self.id_EDUs = []
        self.EDU = {}
        self.treeNS = Tree()
        self.tree = Tree()

        with open(path2file, "r") as f:
            lines = f.readlines()

        # Leading-space count of each line.
        indents = [len(line) - len(line.lstrip(" ")) for line in lines]

        # One slot per nesting level; "+ 1" keeps slot 0 available even when
        # no line is indented at all.
        levels = max(indents) // 2 + 1 if indents else 0
        id_parents = [0] * levels  # last line number seen at each level
        NS_parents = [0] * levels  # N/S markers of the last internal node per level

        for i, line in enumerate(lines):
            # Floor division: a list index must stay an int on Python 3 too.
            space = indents[i] // 2
            id_parents[space] = i

            # Extract the line content.
            match = self._CONTENT_RE.findall(line)[0]
            if match[0] == "":
                # Leaf (EDU).
                content = match[1]
                self.id_EDUs.append(i)
                self.EDU[i] = self._EDU_TEXT_RE.findall(content)
            else:
                content = match[0]
                # Nuclearity markers of this node's children, e.g. ['N', 'S'].
                NS_parents[space] = self._NUCLEARITY_RE.findall(content)

            # Create the node.
            if i == 0:
                self.tree.create_node(content, 0)
                self.treeNS.create_node("Root", 0)
                continue
            if space == 0:
                raise ValueError(
                    "line %d of %s is at root level but is not the first line"
                    % (i, path2file))

            parent = id_parents[space - 1]
            # Number of children already attached: 0 or 1 for a binary tree,
            # which selects the marker for this child.
            id_NS = len(self.tree.is_branch(parent))
            self.tree.create_node(content, i, parent=parent)
            self.treeNS.create_node(NS_parents[space - 1][id_NS], i, parent=parent)

    def toDEP(self):
        """Convert the loaded constituency tree into a dependency tree over EDUs.

        Step 1 computes, bottom-up, the head EDU of every internal node: the
        head of its left child if that child is tagged "N", otherwise the head
        of its right child.  Step 2 starts with every EDU attached to the head
        of the root and then moves each EDU under the head selected for it.

        The tree is assumed to be binary, with node identifiers ``0..size-1``
        and root ``0``, as produced by ``load``.

        Returns:
            A new ``Tree`` whose nodes are the EDUs (same tags and identifiers
            as in ``self.tree``).  An empty tree is returned when nothing has
            been loaded.
        """
        from collections import deque

        size = self.tree.size()
        if size == 0:
            return Tree()

        ###############################
        # Step 1: build head_tree.
        # One breadth-first walk records the depth of every internal node;
        # leaves (EDUs) keep -1.
        nodes_depth = [-1] * size
        depth = [0] * size
        queue = deque([0])
        while queue:
            node = self.tree.get_node(queue.popleft())
            if node.bpointer is not None:
                depth[node.identifier] = depth[node.bpointer] + 1
            if node.fpointer:
                nodes_depth[node.identifier] = depth[node.identifier]
                queue.append(node.fpointer[0])
                queue.append(node.fpointer[1])

        # Internal node ids grouped by depth (ascending ids within a depth).
        id_nodes_depth = [[] for _ in range(self.tree.depth())]
        for id_node, d in enumerate(nodes_depth):
            if d != -1:
                id_nodes_depth[d].append(id_node)

        # head_tree[id] is the head EDU of internal node id, -1 for leaves.
        head_tree = [-1] * self.treeNS.size()
        # Visit internal nodes from the deepest level upwards so that the
        # children's heads are known before their parent's.
        for d in range(len(id_nodes_depth) - 1, -1, -1):
            for id_node in id_nodes_depth[d]:
                node = self.treeNS.get_node(id_node)
                node_left = self.treeNS.get_node(node.fpointer[0])
                node_right = self.treeNS.get_node(node.fpointer[1])
                chosen = node_left if node_left.tag == "N" else node_right
                if head_tree[chosen.identifier] == -1:
                    # The chosen child is itself an EDU.
                    head_tree[id_node] = chosen.identifier
                else:
                    head_tree[id_node] = head_tree[chosen.identifier]

        ###############################
        # Step 2: build the dependency tree.
        # Its root is the head of the constituency root; when the root is
        # itself a leaf (single-EDU tree) the root EDU is used directly.
        dep_tree = Tree()
        id_root = head_tree[0]
        if id_root == -1:
            id_root = 0
        root = self.tree.get_node(id_root)
        dep_tree.create_node(root.tag, root.identifier)

        id_EDUs = [id_EDU for id_EDU in range(len(head_tree))
                   if head_tree[id_EDU] == -1 and id_EDU != id_root]

        # Initially every other EDU hangs directly under the root.
        for id_EDU in id_EDUs:
            node = self.tree.get_node(id_EDU)
            dep_tree.create_node(node.tag, node.identifier, parent=id_root)

        # Then move each EDU under its head.
        for id_EDU in id_EDUs:
            EDU_NS = self.treeNS.get_node(id_EDU)
            if EDU_NS.tag == "N":
                # Breadth-first search (parent first, then children) until a
                # node tagged "S" is reached.
                id_nodes = deque([EDU_NS.identifier])
                visited = [False] * self.treeNS.size()
                while id_nodes:
                    EDU = self.tree.get_node(id_nodes.popleft())
                    visited[EDU.identifier] = True
                    # Stop condition.  The original also computed whether the
                    # node is an EDU but then forced that flag to False, so
                    # only the tag decides.
                    if self.treeNS.get_node(EDU.identifier).tag == "S":
                        break
                    # NOTE(review): this truthiness test also skips a parent
                    # whose identifier is 0 (the root) -- confirm that the
                    # root is meant to be excluded from the search.
                    if EDU.bpointer:
                        if not visited[EDU.bpointer]:
                            id_nodes.append(EDU.bpointer)
                    if EDU.fpointer:  # guard: leaves have no children
                        if not visited[EDU.fpointer[0]]:
                            id_nodes.append(EDU.fpointer[0])
                        if not visited[EDU.fpointer[1]]:
                            id_nodes.append(EDU.fpointer[1])
                # Attach under the head of the parent of the node where the
                # search stopped.
                id_head = head_tree[EDU.bpointer]
            else:
                # Satellite: attach under the head of its direct parent.
                parent = self.treeNS.get_node(EDU_NS.bpointer)
                id_head = head_tree[parent.identifier]

            if id_EDU != id_head:
                dep_tree.move_node(id_EDU, id_head)

        return dep_tree

    def toString(self):
        """Display the tree in the same form as HILDA's output (via ``showDepth``)."""
        showDepth(self.tree, 0)