def do_remove_stuff(soup, remove_selectors, remove):
    """ Removes from `soup` every element matching any given CSS selector.

        :param soup: BeautifulSoup tree, modified in place.
        :param remove_selectors: list of CSS selectors to remove (may be None/empty).
        :param remove: a single additional selector (may be None or '').
    """
    all_selectors = []
    if remove is not None and remove != '':
        all_selectors.append(remove)
    if remove_selectors:
        all_selectors.extend(remove_selectors)
    logger.debug('all_selectors: %s' % all_selectors)
    # Transcript of everything removed; currently only accumulated, never
    # returned — kept for parity with the inline copy in manual_join().
    all_removed = ''
    for selector in all_selectors:
        nremoved = 0
        # FIX: these two messages previously interpolated `remove` (the single
        # extra selector) instead of the selector actually being processed.
        logger.debug('Removing selector %r' % selector)
        toremove = list(soup.select(selector))
        logger.debug('Removing %d objects' % len(toremove))
        for x in toremove:
            nremoved += 1
            nd = len(list(x.descendants))
            logger.debug('removing %s with %s descendants' % (x.name, nd))
            if nd > 1000:
                s = str(x)[:300]
                logger.debug(' it is %s' % s)
            x.extract()
            all_removed += '\n\n' + '-' * 50 + ' chunk %d removed\n' % nremoved
            all_removed += str(x)
            all_removed += '\n\n' + '-' * 100 + '\n\n'
        logger.info('Removed %d elements of selector %r' % (nremoved, selector))
def fix_duplicated_ids(basename2soup):
    ''' fragments is a list of soups that might have duplicated ids. '''
    # First pass: remember in which fragment each id was first seen;
    # on a cross-fragment collision, rename the later occurrence.
    first_seen_in = {}
    renames = []  # (fragment basename, old id, new id)
    for basename, fragment in basename2soup.items():
        for element in fragment.find_all(id=True):
            id_ = element.attrs['id']
            # MathJax generates its own ids; leave those alone.
            if 'MathJax' in id_:  # or id_.startswith('MJ'):
                continue
            if id_ not in first_seen_in:
                first_seen_in[id_] = basename
            elif first_seen_in[id_] == basename:
                # duplicated within the same fragment: just log it
                logger.debug('duplicated id %r inside frag %s' % (id_, basename))
            else:
                # collision with another fragment: rename here and
                # schedule the href fix-ups for this fragment
                new_id = id_ + '-' + basename
                element['id'] = new_id
                renames.append((basename, id_, new_id))
    # Second pass: retarget links (within the renamed fragment) to the new ids.
    for frag_name, old_id, new_id in renames:
        fragment = basename2soup[frag_name]
        for a in fragment.find_all(href="#" + old_id):
            a.attrs['href'] = '#' + new_id
def check_translation_diskrep_to_gitrep(disk_rep0, disk_events, disk_rep1, out):  # @UnusedVariable
    # Round-trip check: materialize disk_rep0 as a git repo, verify it reads
    # back identically, then replay each disk event onto the working tree,
    # committing after every event. Returns {'repo': repo}.
    # NOTE(review): disk_rep1 and out are unused here (marked @UnusedVariable).
    if not disk_events:
        raise ValueError('no disk events')
    repo = gitrep_from_diskrep(disk_rep0)
    wd = repo.working_tree_dir
    readback = diskrep_from_gitrep(repo)
    # sanity check: writing then reading back must be lossless
    assert_diskreps_same(disk_rep0, readback, 'original', 'written back')
    logger.debug(wd)
    logger.debug('\n' + indent(readback.tree(), 'read back |'))
    logger.debug('\n' + indent(yaml_dump(disk_events), 'disk_events|'))
    commits = []
    for disk_event in disk_events:
        logger.debug(indent(yaml_dump(disk_event), 'disk_event | '))
        apply_disk_event_to_filesystem(wd, disk_event, repo=repo)
        if repo.untracked_files:
            logger.debug('adding untracked file %r' % repo.untracked_files)
            repo.index.add(repo.untracked_files)
        # the YAML dump of the event becomes the commit message
        message = yaml_dump(disk_event)
        who = disk_event['who']
        logger.info('who: %s' % who)
        actor = who['actor']
        instance = who.get('instance', None)
        host = who.get('host', None)
        # NOTE(review): GitPython's Actor is (name, email); here `instance`
        # is passed as the author email and `host` as the committer email —
        # looks intentional but worth confirming.
        author = Actor(actor, instance)
        committer = Actor(instance, host)
        commit = repo.index.commit(message, author=author, committer=committer)
        commits.append(commit)
    res = {}
    res['repo'] = repo
    return res
def get_image(self, name, data_format):
    """ Searches self.paths for an image called `name` with extension
        `data_format` (case-insensitive on the basename) and returns its
        contents; raises NoImageFound if no readable match exists.
    """
    extension = data_format
    for p in self.paths:
        for fn in _list_files_with_extension(p, extension):
            bn = os.path.basename(fn)
            x = os.path.splitext(bn)[0]
            if x.lower() == name.lower():
                if x != name:
                    msg = 'Using file "%s" for image "%s", even though case does not match.' % (
                        bn, name)
                    _warn_once(msg)
                if os.path.exists(fn):
                    # FIX: was open(fn).read(), which leaked the file handle.
                    with open(fn) as f:
                        return f.read()
                else:
                    # warn broken link
                    msg = 'Filename does not exist (broken link?): %s' % fn
                    logger.debug(msg)
    msg = 'Could not find %s.%s in %d paths.' % (name, data_format, len(self.paths))
    for p in self.paths:
        msg += '\n path: %s' % p
    raise NoImageFound(msg)
def get_empty_links_to_fragment(soup):
    """ Find all empty links that have a reference to a fragment.

        yield LinkElement
    """
    # NOTE: another function with this same name (which also parses '?'
    # queries) exists elsewhere in this file.
    logger.debug('building index')
    # index every element that declares an id
    id2element = {}
    for node in list(soup.descendants):
        if isinstance(node, Tag) and 'id' in node.attrs:
            id2element[node.attrs['id']] = node
    logger.debug('building index done')
    for link in get_empty_links(soup):
        href = link.attrs.get('href')
        # only consider links of the form href="#fragment"
        if href is None or not href.startswith('#'):
            continue
        eid = href[1:]
        # this variant never splits out a query; it is always None
        yield LinkElement(linker=link, eid=eid,
                          linked=id2element.get(eid, None), query=None)
def get_link(specname, libname, thingname):
    # find library. Returns a string or raises error
    # NOTE(review): `e` is a free variable (presumably an environment object
    # from the enclosing scope) — confirm where it is bound.
    try:
        rname, sname = e.session.get_repo_shelf_for_libname(libname)
    except NoSuchLibrary:
        msg = 'No such library %r' % libname
        logger.debug(msg)
        raise  # return None
    # navigate repo -> shelf -> library -> things of the requested spec
    things = e.db_view.repos[rname].shelves[sname].libraries[
        libname].things.child(specname)
    if thingname in things:  # check if the thing exists
        res = get_link_library(
            libname) + '%s/%s/views/syntax/' % (specname, thingname)
        # logger.debug(' link for %s = %s' % (thingname, res))
        return res
    else:
        msg = 'No such thing %r' % thingname
        logger.debug(msg)
        raise NoSuchLibrary(msg)
def savefile(filename_hint, data):
    """ must return the url (might be equal to filename) """
    # NOTE: `assets_dir` and `out` are free variables from the enclosing scope.
    target = os.path.join(assets_dir, filename_hint)
    logger.debug('writing to %s' % target)
    with open(target, 'wb') as fh:
        fh.write(data)
    # return the path relative to the output document's directory
    return os.path.relpath(target, os.path.dirname(out))
def replay_events(view_manager, db0, events):
    """ Replays `events` on a deep copy of `db0` and returns the result;
        the caller's db0 is not modified. """
    db0 = deepcopy(db0)
    for ev in events:
        event_intepret(view_manager, db0, ev)
        # log the event and the resulting DB state after each step
        trace = '\nAfter playing event:\n'
        trace += indent(yaml_dump(ev), ' event: ')
        trace += '\nthe DB is:\n'
        trace += indent(yaml_dump(db0), ' db: ')
        logger.debug(trace)
    return db0
def displayfile1():
    # Test: display_files() must substitute exactly one <display-file> element.
    defaults = {'org': 'AndreaCensi', 'repo': 'mcdp', 'branch': 'duckuments'}
    s = """
<display-file src="github:path=context_eval_as_constant.py,from_text=get_connections_for,to_text=return"></a>
"""
    soup = bs(s)
    n = display_files(soup, defaults, raise_errors=True)
    assert n == 1
    s2 = str(soup)
    logger.debug('\n' + indent(s2, ' '))
def __call__(self, event):
    # Delegate the event to the wrapped handler, then push that handler's
    # repository to its 'origin' remote.
    self.other(event)
    repo = self.other.repo
    logger.debug('pushing')
    repo.remotes.origin.push()

# hi_config = Schema()
#
# hi_config.string('root')  # where to put temporary files
# hi_config.string('instance')  # instance name
# hi_config.hash('repo_local', SchemaString())  # dirname for local repo
# hi_config.hash('repo_git', SchemaString())  # git url for local repo
#
def generate_toc(soup, max_depth=None):
    """ Builds the table of contents for `soup` and returns it as an HTML
        string; headers deeper than `max_depth` (if given) are skipped. """
    # a synthetic root so every real item has a parent on the stack
    stack = [Item(None, 0, 'root', 'root', [])]
    for header, depth, using in list(get_things_to_index(soup)):
        if max_depth is not None and depth > max_depth:
            continue
        entry = Item(header, depth, using, header['id'], [])
        # pop back to the nearest shallower ancestor
        while stack[-1].depth >= depth:
            stack.pop()
        stack[-1].items.append(entry)
        stack.append(entry)
    root = stack[0]
    logger.debug('numbering items')
    number_items2(root)
    if False:
        logger.debug(toc_summary(root))
    # fine-grained entries excluded from the rendered TOC
    exclude = [
        'subsub', 'fig', 'code', 'tab', 'par', 'subfig', 'appsubsub',
        'def', 'eq', 'rem', 'lem', 'prob', 'prop', 'exa', 'thm'
    ]
    trimmed = root.copy_excluding_levels(exclude)
    return trimmed.to_html(root=True, max_levels=13)
def sub1():
    # Test: a github: link with only a path becomes a resource-link <code> tag.
    defaults = {'org': 'AndreaCensi', 'repo': 'mcdp', 'branch': 'duckuments'}
    s = """
<a href="github:path=context_eval_as_constant.py"></a>
"""
    soup = bs(s)
    n = substitute_github_refs(soup, defaults)
    assert n == 1
    s2 = str(soup)
    logger.debug(indent(s2, ' '))
    expect = '<code class="github-resource-link">context_eval_as_constant.py</code>'
    if not expect in s2:
        raise Exception(s2)
def sub2():
    # Test: a github: link with from_text/to_text resolves to a line range.
    defaults = {'org': 'AndreaCensi', 'repo': 'mcdp', 'branch': 'duckuments'}
    s = """
<a href="github:path=context_eval_as_constant.py,from_text=get_connections_for,to_text=return"></a>
"""
    soup = bs(s)
    n = substitute_github_refs(soup, defaults)
    assert n == 1
    s2 = str(soup)
    logger.debug('\n' + indent(s2, ' '))
    expect = 'context_eval_as_constant.py#L7-L12'
    if not expect in s2:
        raise Exception('No %s in %s' % (expect, s2))
def go():
    """ Command-line entry point: <prog> input_html output_html.
        Creates the assets and output directories, then delegates to go_(). """
    if len(sys.argv) != 3:
        prog = os.path.basename(sys.argv[0])
        print('Syntax:\n\n %s input_html output_html' % prog)
        print('\n\nError: I need exactly 2 arguments.')
        sys.exit(1)
    fn, out = sys.argv[1], sys.argv[2]
    assets_dir = out + '.assets'
    if not os.path.exists(assets_dir):
        os.makedirs(assets_dir)
    logger.debug('Using assets dir %s' % assets_dir)
    outd = os.path.dirname(out)
    if not os.path.exists(outd):
        os.makedirs(outd)
    return go_(fn, out, assets_dir)
def read_as_user_db(dirname):
    """ Reads the directory `dirname` as a user DB and returns a rooted
        view instance over the validated data. """
    dm = DB.dm
    hierarchy = ProxyDirectory.from_disk(dirname)
    logger.info('These are the files found:\n%s' % indent(hierarchy.tree(), ' '))
    schema = DB.user_db
    data = dm.interpret_hierarchy_(schema, hierarchy)
    logger.debug('user_db schema: \n' + str(schema))
    logger.debug('user_db:\n' + indent(yaml_dump(data), ' > '))
    # validate before wrapping in a view
    DB.user_db.validate(data)
    view = DB.view_manager.create_view_instance(schema, data)
    view.set_root()
    return view
def get_empty_links_to_fragment(soup):
    """ Find all links that have a reference to a fragment.

        yield LinkElement
    """
    # NOTE: this file contains another definition with the same name
    # (one that never parses a '?' query).
    logger.debug('building index')
    # index every element that declares an id
    id2element = {}
    for node in soup.descendants:
        if isinstance(node, Tag) and 'id' in node.attrs:
            id2element[node.attrs['id']] = node
    logger.debug('building index done')
    for a in soup.find_all('a'):
        # only empty anchors (no descendants at all)
        if list(a.descendants):
            continue
        href = a.attrs.get('href')
        if href is None or not href.startswith('#'):
            continue
        fragment = href[1:]
        # split "id?query" at the first '?', if present
        eid, sep, query = fragment.partition('?')
        if not sep:
            query = None
        yield LinkElement(linker=a, eid=eid,
                          linked=id2element.get(eid, None), query=query)
def make_sections2(elements, is_marker, copy=True, element_name='div',
                   attrs=None, add_debug_comments=False):
    """ Partitions `elements` into (header, section) pairs, starting a new
        section at every element for which is_marker(x) is true.

        :param copy: if True, elements are copied into sections; otherwise
                     they are extracted from their original tree.
        :param attrs: extra attributes to set on each new section tag.
        :param add_debug_comments: accepted for API compatibility; unused here.
        :return: list of (header-or-None, section Tag) pairs.
    """
    # FIX: `attrs` previously defaulted to a shared mutable dict ({}).
    if attrs is None:
        attrs = {}
    sections = []

    def make_new():
        # a fresh container tag carrying the requested attributes
        x = Tag(name=element_name)
        for k, v in attrs.items():
            x.attrs[k] = v
        return x

    current_header = None
    current_section = make_new()
    current_section['class'] = 'without-header-inside'
    for x in elements:
        if is_marker(x):
            # flush the section accumulated so far (if non-empty)
            if contains_something_else_than_space(current_section):
                sections.append((current_header, current_section))
                current_section = make_new()
            logger.debug('marker %s' % x.attrs.get('id', 'unnamed'))
            current_header = x.__copy__()
            current_section['class'] = 'with-header-inside'
        else:
            x2 = x.__copy__() if copy else x.extract()
            current_section.append(x2)
    # flush the trailing section
    if current_header or contains_something_else_than_space(current_section):
        sections.append((current_header, current_section))
    logger.info('make_sections: %s found using marker %s' %
                (len(sections), is_marker.__name__))
    return sections
def check_translation_gitrep_to_diskrep(repo, branch_name, out):
    # Walks the history of `branch_name` oldest-to-newest, derives disk events
    # from each commit pair, replays them on an in-memory disk representation,
    # and asserts it matches the actual checked-out working tree at each step.
    # Debug artifacts are written under `out`. WARNING: moves repo.head.
    wd = repo.working_tree_dir
    commits = list(reversed(list(repo.iter_commits(branch_name))))
    # make sure that commits[0] is the first
    for i in range(1, len(commits)):
        assert commits[i].parents[0] == commits[i - 1]
    # check out the first commit
    repo.head.reference = commits[0]
    repo.head.reset(index=True, working_tree=True)
    disk_rep0 = ProxyDirectory.from_disk(wd)
    disk_rep = deepcopy(disk_rep0)
    if os.path.exists(out):
        shutil.rmtree(out)
    if not os.path.exists(out):
        os.makedirs(out)

    def write_file_(name, what):
        # write a debug artifact under `out`
        name = os.path.join(out, name)
        with open(name, 'w') as f:
            f.write(what)
        logger.info('wrote on %s' % name)

    def write_file(i, n, what):
        # artifact named by step index and stage label
        name = '%d-%s.txt' % (i, n)
        write_file_(name, what)

    logger.debug('Initial files: %s' %
                 list(_.path for _ in commits[1].tree.traverse()))
    # dump the tree at every commit for reference
    msg = ""
    for i, commit in enumerate(commits):
        d = disk_rep_from_git_tree(commit.tree)
        msg += '\n\n' + indent(d.tree(), ' tree at commit #%d | ' % i)
    write_file_('00-commits.txt', msg)
    all_disk_events = []
    for i in range(1, len(commits)):
        write_file(i, 'a-disk_rep', disk_rep.tree())
        msg = ""
        for d in commits[i - 1].diff(commits[i]):
            msg += '\n' + str(d)
        write_file(i, 'c-diffs', msg)
        # translate the commit diff into disk events and replay them
        events = diskevents_from_diff(commits[i - 1], commits[i])
        write_file(i, 'd-diskevents_from_diff', yaml_dump(events))
        for disk_event in events:
            disk_event_interpret(disk_rep, disk_event)
        all_disk_events.extend(events)
        write_file(i, 'e-disk_rep-after-diskevents', disk_rep.tree())
        # check out commit i and compare against the replayed representation
        repo.head.reference = commits[i]
        repo.head.reset(index=True, working_tree=True)
        supposedly = ProxyDirectory.from_disk(wd)
        write_file(i, 'f-supposedly', supposedly.tree())
        assert_equal_disk_rep(disk_rep, supposedly)
    logger.debug('wd: %s' % wd)
    return dict(disk_rep0=disk_rep0, disk_events=all_disk_events,
                disk_rep=disk_rep)
def diskevents_from_diff(commit_a, commit_b):
    # Translates a git diff (commit_a -> commit_b) into a list of disk events:
    # dir/file delete, dir create, file create/modify, and renames. Directories
    # whose entire contents were deleted are collapsed into a single
    # dir_delete event instead of per-file deletes.
    diff = commit_a.diff(commit_b)

    def dirname_name_from_path(path):
        # split a repo-relative path into (tuple-of-dir-components, basename)
        # NOTE(review): .encode('utf8') suggests Python-2 bytes handling.
        path = path.encode('utf8')
        dirname = os.path.dirname(path)
        if dirname == '':
            dirname = ()
        else:
            dirname = tuple(dirname.split('/'))
        basename = os.path.basename(path)
        return dirname, basename

    _id = 'ID'
    who = who_from_commit(commit_b)
    events = []
    existing = set([_.path.encode('utf8') for _ in commit_a.tree.traverse()])
    # create hash directory -> everything contained
    dir2contents = {}
    for tree in commit_a.tree.traverse():
        if isinstance(tree, Tree):
            dir2contents[tree.path.encode('utf8')] = set()
    for blob in commit_a.tree.traverse():
        if isinstance(blob, Blob):
            path = blob.path
            # NOTE(review): startswith() is a prefix test, so 'foo2/x' would
            # also match directory 'foo' — confirm paths can't collide here.
            for d in dir2contents:
                if path.startswith(d):
                    dir2contents[d].add(path)
    removed_files = set()
    for d in diff.iter_change_type('D'):
        removed_files.add(d.b_path)
    deleted_completely = set()
    deleted_by_deleting_dir = set()
    # a directory is "deleted completely" if all its files were removed
    for di, di_contents in dir2contents.items():
        if all(x in removed_files for x in di_contents):
            # NOTE(review): stray print — elsewhere this file uses logger
            print('detected that %s was removed completely' % di)
            deleted_completely.add(di)
    for di in deleted_completely:
        # do not do this if the parent was already deleted
        if os.path.dirname(di) in deleted_completely:
            continue
        else:
            dirname, name = dirname_name_from_path(di)
            print('%s -> %s, %s' % (di, dirname, name))
            deleted_by_deleting_dir.update(dir2contents[di])
            e = disk_event_dir_delete(_id, who, dirname=dirname, name=name)
            events.append(e)
    # individual file deletions not already covered by a dir_delete
    for d in diff.iter_change_type('D'):
        if d.b_path in deleted_by_deleting_dir:
            continue
        dirname, name = dirname_name_from_path(d.b_path)
        e = disk_event_file_delete(_id, who, dirname=dirname, name=name)
        events.append(e)
    logger.debug('trees: %s' % list(commit_a.tree.traverse()))
    logger.debug('existing: %s' % "\n- ".join(existing))
    # additions: create any missing parent directories first
    for d in diff.iter_change_type('A'):
        dirname, name = dirname_name_from_path(d.b_path)
        # create all partial directories
        for i in range(1, len(dirname) + 1):
            partial = dirname[:i]
            partial_path = "/".join(partial)
            if not partial_path in existing:
                logger.debug('I need to create directory %r' % partial_path)
                d2 = partial[:-1]
                n2 = partial[-1]
                e = disk_event_dir_create(_id, who, dirname=d2, name=n2)
                events.append(e)
                existing.add("/".join(partial))
        contents = d.b_blob.data_stream.read()
        e = disk_event_file_create(_id, who, dirname=dirname, name=name,
                                   contents=contents)
        events.append(e)
    # modifications carry the full new contents
    for d in diff.iter_change_type('M'):
        dirname, name = dirname_name_from_path(d.b_path)
        contents = d.b_blob.data_stream.read()
        e = disk_event_file_modify(_id, who, dirname=dirname, name=name,
                                   contents=contents)
        events.append(e)
    dir_renames = set()
    for d in diff.iter_change_type('R'):  # rename
        a_dirname, a_name = dirname_name_from_path(d.a_path)
        b_dirname, b_name = dirname_name_from_path(d.b_path)
        if a_dirname != b_dirname:
            # the directory itself moved: record a single dir rename
            dirname, name1, name2 = get_first_diff(d.a_path, d.b_path)
            dir_renames.add((tuple(dirname), name1, name2))
        else:
            e = disk_event_file_rename(_id, who, dirname=a_dirname,
                                       name=a_name, name2=b_name)
            events.append(e)
    for dirname, name1, name2 in dir_renames:
        e = disk_event_dir_rename(_id, who, dirname=dirname, name=name1,
                                  name2=name2)
        events.append(e)
    return events
def substituting_empty_links(soup, raise_errors=False):
    '''


        default style is [](#sec:systems)  "Chapter 10"

        the name is [](#sec:systems?only_name) "My title"

        the number is [](#sec:systems?only_number) "10"

        and full is [](#sec:systems?toc_link) "Chapter 10 - My title"


        You can also use "class":

            <a href='#sec:name' class='only_number'></a>

            or

            <a href='#sec:name?only_number'></a>

    '''
    CLASS_ONLY_NUMBER = MCDPManualConstants.CLASS_ONLY_NUMBER
    CLASS_NUMBER_NAME = MCDPManualConstants.CLASS_NUMBER_NAME
    CLASS_ONLY_NAME = MCDPManualConstants.CLASS_ONLY_NAME
    logger.debug('substituting_empty_links')
    n = 0
    nerrors = 0
    for le in get_empty_links_to_fragment(soup):
        a = le.linker
        element_id = le.eid
        element = le.linked
        n += 1
        # FIX: was `if not element:` — a bs4 Tag with no children is falsy,
        # so an existing-but-empty target was misreported as missing.
        if element is None:
            msg = ('Cannot find %s' % element_id)
            note_error_msg(a, msg)
            nerrors += 1
            if raise_errors:
                raise ValueError(msg)
            continue
        # if there is a query, remove it
        if le.query is not None:
            new_href = '#' + le.eid
            a.attrs['href'] = new_href
            logger.info('setting new href= %s' % (new_href))
        if (not LABEL_WHAT_NUMBER in element.attrs) or \
                (not LABEL_NAME in element.attrs):
            msg = (
                'substituting_empty_links: Could not find attributes %s or %s in %s' %
                (LABEL_NAME, LABEL_WHAT_NUMBER, element))
            if True:
                logger.warning(msg)
            else:
                note_error_msg(a, msg)
                nerrors += 1
                if raise_errors:
                    raise ValueError(msg)
            continue
        label_what_number = element.attrs[LABEL_WHAT_NUMBER]
        label_number = element.attrs[LABEL_NUMBER]
        label_what = element.attrs[LABEL_WHAT]
        label_name = element.attrs[LABEL_NAME]
        classes = list(a.attrs.get('class', []))  # bug: I was modifying
        if le.query is not None:
            classes.append(le.query)
        if 'toc_link' in classes:
            # render "What Number - Name" as separate styled spans
            s = Tag(name='span')
            s.string = label_what
            add_class(s, 'toc_what')
            a.append(s)
            a.append(' ')
            s = Tag(name='span')
            s.string = label_number
            add_class(s, 'toc_number')
            a.append(s)
            s = Tag(name='span')
            s.string = ' - '
            add_class(s, 'toc_sep')
            a.append(s)
            if label_name is not None and '<' in label_name:
                contents = bs(label_name)
                # sanitize the label name
                for br in contents.findAll('br'):
                    br.replaceWith(NavigableString(' '))
                for _ in contents.findAll('a'):
                    _.extract()
                a.append(contents)
                # logger.debug('From label_name = %r to a = %r' % (label_name, a))
            else:
                s = Tag(name='span')
                if label_name is None:
                    s.string = '(unnamed)'  # XXX
                else:
                    s.string = label_name
                add_class(s, 'toc_name')
                a.append(s)
        else:
            # choose the label text based on the requested class
            if CLASS_ONLY_NUMBER in classes:
                label = label_number
            elif CLASS_NUMBER_NAME in classes:
                if label_name is None:
                    label = label_what_number + \
                        ' - ' + '(unnamed)'  # warning
                else:
                    label = label_what_number + ' - ' + label_name
            elif CLASS_ONLY_NAME in classes:
                if label_name is None:
                    label = '(unnamed)'  # warning
                else:
                    label = label_name
            else:
                label = label_what_number
            span1 = Tag(name='span')
            add_class(span1, 'reflabel')
            span1.string = label
            a.append(span1)
    logger.debug('substituting_empty_links: %d total, %d errors' %
                 (n, nerrors))
def do_bib(soup, bibhere):
    """ find used bibliography entries put them there """
    # Collect the ids of all cited entries (links of the form href="#bib:...")
    used = []
    unused = set()
    for a in soup.find_all('a'):
        href = a.attrs.get('href', '')
        if href.startswith('#bib:'):
            used.append(href[1:])  # no "#"
    logger.debug('I found %d references, to these: %s' % (len(used), used))
    # collect all the <cite>
    id2cite = {}
    for c in soup.find_all('cite'):
        ID = c.attrs.get('id', None)
        id2cite[ID] = c
        if ID in used:
            add_class(c, 'used')
        else:
            unused.add(ID)
            add_class(c, 'unused')
    # divide in found and not found
    found = []
    notfound = []
    for ID in used:
        if not ID in id2cite:
            if not ID in notfound:
                notfound.append(ID)
        else:
            found.append(ID)
    # now create additional <cite> for the ones that are not found
    for ID in notfound:
        cite = Tag(name='cite')
        s = 'Reference %s not found.' % ID
        cite.append(NavigableString(s))
        cite.attrs['class'] = ['errored', 'error']  # XXX
        soup.append(cite)
        id2cite[ID] = cite
    # now number the cites (in order of first use)
    n = 1
    id2number = {}
    for ID in used:
        if not ID in id2number:
            id2number[ID] = n
            n += 1
    # now add the attributes for cross-referencing
    for ID in used:
        number = id2number[ID]
        cite = id2cite[ID]
        cite.attrs[LABEL_NAME] = '[%s]' % number
        cite.attrs[LABEL_SELF] = '[%s]' % number
        cite.attrs[LABEL_NUMBER] = number
        cite.attrs[LABEL_WHAT] = 'Reference'
        cite.attrs[LABEL_WHAT_NUMBER_NAME] = '[%s]' % number
        cite.attrs[LABEL_WHAT_NUMBER] = '[%s]' % number
    # now put the cites at the end of the document
    for ID in used:
        c = id2cite[ID]
        # remove it from parent
        c.extract()
        # logger.debug('Extracting cite for %r: %s' % (ID, c))
        # add to bibliography
        bibhere.append(c)
    s = ("Bib cites: %d\nBib used: %s\nfound: %s\nnot found: %s\nunused: %d" %
         (len(id2cite), len(used), len(found), len(notfound), len(unused)))
    logger.info(s)
def check_if_any_href_is_invalid(soup):
    ''' Checks if references are invalid and tries to correct them.

        if it is of the form "#frag?query" then query is stripped out
    '''
    # NOTE(review): another definition of check_if_any_href_is_invalid (using
    # note_error2/note_warning2) appears later in this file — confirm which
    # one is actually imported/used.
    errors = []
    math_errors = []
    # let's first find all the IDs
    id2element, duplicates = get_id2element(soup, 'id')
    _name2element, _duplicates = get_id2element(soup, 'name')
    # id2element.update(name2element)
    # for a in soup.select('a[href^="#"]'):
    for a in soup.select('[href^="#"]'):
        href = a['href']
        # MathJax-generated references cannot be resolved here
        if a.has_attr('class') and "mjx-svg-href" in a['class']:
            msg = 'Invalid math reference (sorry, no details): href = %s .' % href
            logger.error(msg)
            a.insert_before(Comment('Error: %s' % msg))
            math_errors.append(msg)
            continue
        assert href.startswith('#')
        ID = href[1:]
        # remove query if it exists
        if '?' in ID:
            ID = ID[:ID.index('?')]
        # not_found = []
        if not ID in id2element:
            # try to fix it
            #
            # # it there is named element
            # if ID in name2element:
            #     real_id = name2element[ID].attrs
            # if there is already a prefix, remove it
            if ':' in href:
                i = href.index(':')
                core = href[i+1:]
            else:
                core = ID
            # heuristic: try every known prefix in front of the bare id
            possible = ['sec', 'sub', 'subsub', 'fig', 'tab', 'code', 'app',
                        'appsub', 'appsubsub', 'def', 'eq', 'rem', 'lem',
                        'prob', 'prop', 'exa', 'thm'
                        ]
            matches = []
            others = []
            for possible_prefix in possible:
                why_not = possible_prefix + ':' + core
                others.append(why_not)
                if why_not in id2element:
                    matches.append(why_not)
            if len(matches) > 1:
                # ambiguous: report, do not rewrite
                msg = '%s not found, and multiple matches for heuristics (%s)' % (href, matches)
                logger.error(msg)
                add_class(a, 'errored')
                w = Tag(name='span', attrs={'class':'href-invalid href-invalid-missing'})
                w.string = msg
                a.insert_after(w)
            elif len(matches) == 1:
                # unique match: rewrite the href and leave a visible note
                msg = '%s not found, but corrected in %s' % (href, matches[0])
                logger.debug(msg)
                add_class(a, 'warning')
                w = Tag(name='span', attrs={'class':'href-replaced'})
                w.string = msg
                a['href'] = '#' + matches[0]
                a.insert_after(w)
            else:
                # msg = 'Not found %r (also tried %s)' % (href, ", ".join(others))
                # not_found.append(ID)
                # logger.error(msg)
                errors.append('Not found %r' % (href))
                if not 'errored' in a.attrs.get('class', ''):
                    add_class(a, 'errored')
                    w = Tag(name='span', attrs={'class':'href-invalid href-invalid-missing'})
                    w.string = 'Not found %r' % (href)
                    a.insert_after(w)
        if ID in duplicates:
            msg = 'More than one element matching %r.' % href
            logger.error(msg)
            if not 'errored' in a.attrs.get('class', ''):
                add_class(a, 'errored')
                w = Tag(name='span', attrs={'class':'href-invalid href-invalid-multiple'})
                w.string = msg
                a.insert_after(w)
            errors.append(msg)
    return errors, math_errors
def debug(s):
    """ Debug hook, currently disabled; flip `enabled` to route `s`
        to logger.debug again. """
    enabled = False
    if enabled:
        logger.debug(s)
def manual_join(template, files_contents, bibfile, stylesheet, remove=None,
                extra_css=None, remove_selectors=None, hook_before_toc=None):
    """
        Joins the document fragments in `files_contents` into the HTML
        `template`, processes the bibliography, removes selected elements,
        generates the TOC, and returns the final document as UTF-8 bytes.

        extra_css: if not None, a string of more CSS to be added
        Remove_selectors: list of selectors to remove (e.g. ".draft").

        hook_before_toc if not None is called with hook_before_toc(soup=soup)
        just before generating the toc
    """
    logger.debug('remove_selectors: %s' % remove_selectors)
    logger.debug('remove: %s' % remove)
    from mcdp_utils_xml import bs
    template = replace_macros(template)
    # cannot use bs because entire document
    template_soup = BeautifulSoup(template, 'lxml', from_encoding='utf-8')
    d = template_soup
    assert d.html is not None
    assert '<html' in str(d)
    head = d.find('head')
    assert head is not None
    for x in get_manual_css_frag().contents:
        head.append(x.__copy__())
    if stylesheet is not None:
        # link the compiled stylesheet in <head>
        link = Tag(name='link')
        link['rel'] = 'stylesheet'
        link['type'] = 'text/css'
        from mcdp_report.html import get_css_filename
        link['href'] = get_css_filename('compiled/%s' % stylesheet)
        head.append(link)
    # parse each fragment, keyed by docname (order preserved)
    basename2soup = OrderedDict()
    for (_libname, docname), data in files_contents:
        frag = bs(data)
        basename2soup[docname] = frag
    fix_duplicated_ids(basename2soup)
    body = d.find('body')
    add_comments = False
    for docname, content in basename2soup.items():
        # NOTE(review): `data` here is stale — it is the last value from the
        # parsing loop above, not this docname's data. The size log and the
        # assert_not_inside check therefore inspect the wrong document.
        logger.debug('docname %r -> %s KB' % (docname, len(data) / 1024))
        from mcdp_docs.latex.latex_preprocess import assert_not_inside
        assert_not_inside(data, 'DOCTYPE')
        if add_comments:
            body.append(NavigableString('\n\n'))
            body.append(Comment('Beginning of document dump of %r' % docname))
            body.append(NavigableString('\n\n'))
        for x in content:
            x2 = x.__copy__()  # not clone, not extract
            body.append(x2)
        if add_comments:
            body.append(NavigableString('\n\n'))
            body.append(Comment('End of document dump of %r' % docname))
            body.append(NavigableString('\n\n'))
    extract_bibtex_blocks(d)
    logger.info('external bib')
    if bibfile is not None:
        if not os.path.exists(bibfile):
            logger.error('Cannot find bib file %s' % bibfile)
        else:
            bibliography_entries = get_bibliography(bibfile)
            bibliography_entries['id'] = 'bibliography_entries'
            body.append(bibliography_entries)
    bibhere = d.find('div', id='put-bibliography-here')
    if bibhere is None:
        # NOTE(review): missing space between the two concatenated literals.
        logger.warning('Could not find #put-bibliography-here in document.'
                       'Adding one at end of document')
        bibhere = Tag(name='div')
        bibhere.attrs['id'] = 'put-bibliography-here'
        d.find('body').append(bibhere)
    do_bib(d, bibhere)
    if True:
        logger.info('reorganizing contents in <sections>')
        body2 = reorganize_contents(d.find('body'))
        body.replace_with(body2)
    else:
        warnings.warn('fix')
        body2 = body
    # Removing elements matching `remove` and `remove_selectors`
    # (this mirrors the standalone do_remove_stuff() in this file)
    all_selectors = []
    if remove is not None and remove != '':
        all_selectors.append(remove)
    if remove_selectors:
        all_selectors.extend(remove_selectors)
    logger.debug('all_selectors: %s' % all_selectors)
    all_removed = ''
    for selector in all_selectors:
        nremoved = 0
        # NOTE(review): logs `remove` instead of `selector` (same issue in
        # do_remove_stuff)
        logger.debug('Removing selector %r' % remove)
        toremove = list(body2.select(selector))
        logger.debug('Removing %d objects' % len(toremove))
        for x in toremove:
            nremoved += 1
            nd = len(list(x.descendants))
            logger.debug('removing %s with %s descendants' % (x.name, nd))
            if nd > 1000:
                s = str(x)[:300]
                logger.debug(' it is %s' % s)
            x.extract()
            all_removed += '\n\n' + '-' * 50 + ' chunk %d removed\n' % nremoved
            all_removed += str(x)
            all_removed += '\n\n' + '-' * 100 + '\n\n'
        logger.info('Removed %d elements of selector %r' % (nremoved, remove))
    # if False:
    # dump the removed elements for inspection
    with open('all_removed.html', 'w') as f:
        f.write(all_removed)
    if hook_before_toc is not None:
        hook_before_toc(soup=d)
    ###
    logger.info('adding toc')
    toc = generate_toc(body2)
    logger.info('TOC:\n' + str(toc))
    toc_ul = bs(toc).ul
    toc_ul.extract()
    assert toc_ul.name == 'ul'
    toc_ul['class'] = 'toc'
    toc_ul['id'] = 'main_toc'
    toc_selector = 'div#toc'
    tocs = list(d.select(toc_selector))
    if not tocs:
        msg = 'Cannot find any element of type %r to put TOC inside.' % toc_selector
        logger.warning(msg)
    else:
        toc_place = tocs[0]
        toc_place.replaceWith(toc_ul)
    logger.info('checking errors')
    check_various_errors(d)
    from mcdp_docs.check_missing_links import check_if_any_href_is_invalid
    logger.info('checking hrefs')
    check_if_any_href_is_invalid(d)
    # Note that this should be done *after* check_if_any_href_is_invalid()
    # because that one might fix some references
    logger.info('substituting empty links')
    substituting_empty_links(d)
    warn_for_duplicated_ids(d)
    if extra_css is not None:
        logger.info('adding extra CSS')
        add_extra_css(d, extra_css)
    add_footnote_polyfill(d)
    logger.info('converting to string')
    # do not use to_html_stripping_fragment - this is a complete doc
    # NOTE(review): unicode() is Python-2 only.
    res = unicode(d)
    res = res.encode('utf8')
    logger.info('done - %d bytes' % len(res))
    return res
def notify_callback(event):
    # Log the event as YAML and record it.
    # NOTE: `events` is a free variable from the enclosing scope.
    logger.debug('\n' + yaml_dump(event))
    events.append(event)
def check_if_any_href_is_invalid(soup):
    ''' Checks if references are invalid and tries to correct them.

        if it is of the form "#frag?query" then query is stripped out
    '''
    # NOTE(review): a second, older-looking definition of this function (using
    # add_class/insert_after directly) appears earlier in this file.
    logger.debug('check_if_any_href_is_invalid')
    errors = []
    math_errors = []
    # let's first find all the IDs
    id2element, duplicates = get_id2element(soup, 'id')
    _name2element, _duplicates = get_id2element(soup, 'name')
    for a in soup.select('[href^="#"]'):
        href = a['href']
        # MathJax-generated references cannot be resolved here
        if a.has_attr('class') and "mjx-svg-href" in a['class']:
            msg = 'Invalid math reference (sorry, no details): href = %s .' % href
            logger.warning(msg)
            a.insert_before(Comment('Error: %s' % msg))
            math_errors.append(msg)
            continue
        assert href.startswith('#')
        ID = href[1:]
        # remove query if it exists
        if '?' in ID:
            ID = ID[:ID.index('?')]
        if not ID in id2element:
            # try to fix it
            # if there is already a prefix, remove it
            if ':' in href:
                i = href.index(':')
                core = href[i + 1:]
            else:
                core = ID
            # logger.debug('check_if_any_href_is_invalid: not found %r, core %r' % (ID, core))
            # heuristic: try every known prefix in front of the bare id
            possible = [
                'part', 'sec', 'sub', 'subsub', 'fig', 'tab', 'code', 'app',
                'appsub', 'appsubsub', 'def', 'eq', 'rem', 'lem', 'prob',
                'prop', 'exa', 'thm',
                # 'bib'
            ]
            matches = []
            others = []
            for possible_prefix in possible:
                why_not = possible_prefix + ':' + core
                others.append(why_not)
                if why_not in id2element:
                    matches.append(why_not)
            # logger.debug('others = %r, matches = %r' % (others, matches))
            if len(matches) > 1:
                # ambiguous: report, do not rewrite
                short = 'Ref. error'
                msg = '%s not found, and multiple matches for heuristics (%s)' % (
                    href, matches)
                note_error2(a, short, msg, ['href-invalid', 'href-invalid-missing'])
            elif len(matches) == 1:
                # unique match: rewrite the href
                a['href'] = '#' + matches[0]
                if show_debug_message_for_corrected_links:
                    short = 'Ref replaced'
                    msg = '%s not found, but corrected in %s' % (href, matches[0])
                    note_warning2(a, short, msg, ['href-replaced'])
            else:
                if has_class(a, MCDPConstants.CLASS_IGNORE_IF_NOT_EXISTENT):
                    pass
                else:
                    short = 'Ref. error'
                    # msg = 'Not found %r (also tried %s)' % (href, ", ".join(others))
                    msg = 'I do not know the link that is indicated by the link %r.' % href
                    note_error2(a, short, msg, ['href-invalid', 'href-invalid-missing'])
                    errors.append(msg)
        if ID in duplicates:
            msg = 'More than one element matching %r.' % href
            short = 'Ref. error'
            note_error2(a, short, msg, ['href-invalid', 'href-invalid-multiple'])
            errors.append(msg)
    return errors, math_errors
def make_sections(body, is_marker, preserve=lambda _: False,
                  element_name='section', copy=True, attrs=None):
    """ Rewraps the children of `body` into container tags (default <section>),
        starting a new container at each element matching is_marker(x);
        elements matching preserve(x) are kept as-is between containers.

        :param copy: if True, children are copied; otherwise extracted.
        :param attrs: extra attributes to set on each new container.
        :return: a new Tag with the same name as `body`.
    """
    # FIX: `attrs` previously defaulted to a shared mutable dict ({}).
    if attrs is None:
        attrs = {}
    sections = []

    def make_new():
        # a fresh container tag carrying the requested attributes
        x = Tag(name=element_name)
        for k, v in attrs.items():
            x.attrs[k] = v
        return x

    current_section = make_new()
    current_section['id'] = 'before-any-match-of-%s' % is_marker.__name__
    current_section['class'] = 'without-header-inside'
    for x in body.contents:
        if is_marker(x):
            # flush the current section if it has real content
            if contains_something_else_than_space(current_section):
                sections.append(current_section)
            current_section = make_new()
            current_section['id'] = x.attrs.get(
                'id', 'unnamed-h1') + ':' + element_name
            logger.debug('marker %s' % current_section['id'])
            current_section['class'] = x.attrs.get('class', '')
            current_section.append(x.__copy__())
            current_section['class'] = 'with-header-inside'
        elif preserve(x):
            # emit the preserved element verbatim between sections
            if contains_something_else_than_space(current_section):
                sections.append(current_section)
            sections.append(x.__copy__())
            current_section = make_new()
            current_section.attrs['comment'] = "Triggered by %r" % x
        else:
            x2 = x.__copy__() if copy else x.extract()
            current_section.append(x2)
    if contains_something_else_than_space(current_section):
        sections.append(current_section)  # XXX
    new_body = Tag(name=body.name)
    logger.info('make_sections: %s found using marker %s' %
                (len(sections), is_marker.__name__))
    for i, s in enumerate(sections):
        # NOTE: add_debug_comments is a module-level flag, not a parameter
        if add_debug_comments:
            new_body.append('\n')
            new_body.append(
                Comment('Start of %s section %d/%d' %
                        (is_marker.__name__, i, len(sections))))
            new_body.append('\n')
        new_body.append(s)
        new_body.append('\n')
        if add_debug_comments:
            new_body.append(
                Comment('End of %s section %d/%d' %
                        (is_marker.__name__, i, len(sections))))
            new_body.append('\n')
    return new_body