def manual_join(template, files_contents, stylesheet, remove=None,
                extra_css=None, remove_selectors=None,
                hook_before_toc=None, references=None,
                resolve_references=True, hook_before_final_pass=None,
                require_toc_placeholder=False, permalink_prefix=None,
                crossrefs_aug=None, aug0=None):
    """
        files_contents: a list of tuples that can be cast to DocToJoin;
            the docname string must be unique, and it is used for job naming.

        extra_css: if not None, a string of additional CSS to be added.

        remove_selectors: list of CSS selectors to remove (e.g. ".draft").

        hook_before_toc: if not None, it is called as hook_before_toc(soup=soup)
            just before generating the TOC.
    """
    result = AugmentedResult()

    if references is None:
        references = {}
    check_isinstance(files_contents, list)

    if crossrefs_aug is None:
        crossrefs = Tag(name='no-cross-refs')
    else:
        crossrefs = bs(crossrefs_aug.get_result())
        result.merge(crossrefs_aug)

    if aug0 is not None:
        result.merge(aug0)

    @contextmanager
    def timeit(_):
        yield

    with timeit('manual_join'):

        files_contents = [DocToJoin(*_) for _ in files_contents]

        # cannot use bs() because this is an entire document
        with timeit('parsing template'):
            template0 = template
            template = replace_macros(template)
            template_soup = BeautifulSoup(template, 'lxml', from_encoding='utf-8')
            d = template_soup
            if d.html is None:
                s = "Invalid template"
                raise_desc(ValueError, s, template0=template0)

        with timeit('adding head'):
            assert d.html is not None
            assert '<html' in str(d)
            head = d.find('head')
            if head is None:
                msg = 'Could not find <head> in template:'
                logger.error(msg)
                logger.error(str(d))
                raise Exception(msg)
            assert head is not None
            for x in get_manual_css_frag().contents:
                head.append(x.__copy__())

        with timeit('adding stylesheet'):
            if stylesheet is not None:
                link = Tag(name='link')
                link['rel'] = 'stylesheet'
                link['type'] = 'text/css'
                from mcdp_report.html import get_css_filename
                link['href'] = get_css_filename('compiled/%s' % stylesheet)
                head.append(link)

        with timeit('making basename2soup'):
            basename2soup = OrderedDict()
            for doc_to_join in files_contents:
                if doc_to_join.docname in basename2soup:
                    msg = 'Repeated docname %r' % doc_to_join.docname
                    raise ValueError(msg)
                from .latex.latex_preprocess import assert_not_inside
                if isinstance(doc_to_join.contents, AugmentedResult):
                    result.merge(doc_to_join.contents)
                    contents = doc_to_join.contents.get_result()
                else:
                    contents = doc_to_join.contents
                assert_not_inside(contents, '<fragment')
                assert_not_inside(contents, 'DOCTYPE')
                frag = bs(contents)
                basename2soup[doc_to_join.docname] = frag

        # with timeit('fix_duplicate_ids'):  # XXX
        #     fix_duplicated_ids(basename2soup)

        with timeit('copy contents'):
            body = d.find('body')
            add_comments = False

            for docname, content in basename2soup.items():
                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(Comment('Beginning of document dump of %r' % docname))
                    body.append(NavigableString('\n\n'))

                try_faster = True
                if try_faster:
                    for e in list(content.children):
                        body.append(e.extract())
                else:
                    copy_contents_into(content, body)

                if add_comments:
                    body.append(NavigableString('\n\n'))
                    body.append(Comment('End of document dump of %r' % docname))
                    body.append(NavigableString('\n\n'))

        with timeit('extract_bibtex_blocks'):
            extract_bibtex_blocks(d)

        with timeit('ID_PUT_BIB_HERE'):
            ID_PUT_BIB_HERE = MCDPManualConstants.ID_PUT_BIB_HERE

            bibhere = d.find('div', id=ID_PUT_BIB_HERE)
            if bibhere is None:
                msg = ('Could not find #%s in document. '
                       'Adding one at end of document.') % ID_PUT_BIB_HERE
                result.note_warning(msg)
                bibhere = Tag(name='div')
                bibhere.attrs['id'] = ID_PUT_BIB_HERE
                d.find('body').append(bibhere)

            do_bib(d, bibhere)

        with timeit('hook_before_final_pass'):
            if hook_before_final_pass is not None:
                hook_before_final_pass(soup=d)

        with timeit('document_final_pass_before_toc'):
            location = LocationUnknown()
            document_final_pass_before_toc(d, remove, remove_selectors, result, location)

        with timeit('hook_before_toc'):
            if hook_before_toc is not None:
                hook_before_toc(soup=d)

        with timeit('generate_and_add_toc'):
            try:
                generate_and_add_toc(d, raise_error=True, res=result)
            except NoTocPlaceholder as e:
                if require_toc_placeholder:
                    msg = 'Could not find toc placeholder: %s' % e
                    # logger.error(msg)
                    if aug0 is not None:
                        result.note_error(msg)
                    else:
                        raise Exception(msg)

        with timeit('document_final_pass_after_toc'):
            document_final_pass_after_toc(
                soup=d, crossrefs=crossrefs,
                resolve_references=resolve_references, res=result)

        if extra_css is not None:
            logger.info('adding extra CSS')
            add_extra_css(d, extra_css)

        with timeit('document_only_once'):
            document_only_once(d)

        location = LocationUnknown()
        substitute_github_refs(d, defaults={}, res=result, location=location)

        with timeit('another A pass'):
            for a in d.select('a[href]'):
                href = a.attrs['href']
                if href in references:
                    r = references[href]
                    a.attrs['href'] = r.url
                    if not a.contents:  # empty anchor: fill in the reference title
                        a.append(r.title)

        # do not use to_html_stripping_fragment - this is a complete doc
        # mark_in_html(result, soup=d)

        add_github_links_if_edit_url(soup=d, permalink_prefix=permalink_prefix)

        with timeit('converting to string'):
            res = unicode(d)

        with timeit('encoding'):
            res = res.encode('utf8')

        logger.info('done - %.1f MB' % (len(res) / (1024 * 1024.0)))

        result.set_result(res)
        return result
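

# Illustrative sketch only (not part of the original module): a hypothetical
# call to manual_join(). The template string, docnames, and fragments below are
# made-up placeholders; the tuple layout is assumed to be (docname, contents),
# matching the docstring, and the module-level helpers that manual_join() relies
# on (logger, CSS fragments, constants) are assumed to be importable as usual.
def _example_manual_join():  # pragma: no cover
    template = '<html><head></head><body></body></html>'
    files_contents = [
        # (docname, contents) pairs; docnames must be unique
        ('chap1', '<h1 id="sec:one">One</h1><p>First chapter.</p>'),
        ('chap2', '<h1 id="sec:two">Two</h1><p>Second chapter.</p>'),
    ]
    aug = manual_join(template=template, files_contents=files_contents,
                      stylesheet=None)
    return aug.get_result()  # the joined document, as UTF-8-encoded bytes
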
def get_cross_refs(src_dirs, permalink_prefix, extra_crossrefs, ignore=[]):
    res = AugmentedResult()
    files = look_for_files(src_dirs, "crossref.html")

    id2file = {}
    soup = Tag(name='div')

    def add_from_soup(s, f, ignore_alread_present, ignore_if_conflict):
        for img in list(s.find_all('img')):
            img.extract()

        for e in s.select('[base_url]'):
            e['external_crossref_file'] = f

        # Remove the ones with the same base_url
        for e in list(s.select('[base_url]')):
            if e.attrs['base_url'] == permalink_prefix:
                e.extract()

        for e in s.select('[id]'):
            id_ = e.attrs['id']
            if id_ == 'container':
                continue  # XXX
            if id_ in id2file:
                if not ignore_alread_present:
                    msg = 'Found two elements with same ID "%s":' % id_
                    msg += '\n %s' % id2file[id_]
                    msg += '\n %s' % f
                    res.note_error(msg)
            else:
                id2file[id_] = f
                e2 = e.__copy__()
                if ignore_if_conflict:
                    e2.attrs['ignore_if_conflict'] = '1'
                soup.append(e2)
                soup.append('\n')

    ignore = [os.path.realpath(_) for _ in ignore]

    for _f in files:
        if os.path.realpath(_f) in ignore:
            msg = 'Ignoring file %r' % _f
            logger.info(msg)
            continue
        logger.info('cross ref file %s' % _f)
        data = open(_f).read()
        if permalink_prefix in data:
            msg = 'skipping own file'
            logger.debug(msg)
            continue
        s = bs(data)
        add_from_soup(s, _f, ignore_alread_present=False, ignore_if_conflict=False)

    if extra_crossrefs is not None:
        logger.info('Reading external refs\n%s' % extra_crossrefs)
        try:
            r = requests.get(extra_crossrefs)
        except Exception as ex:
            msg = 'Could not read external cross reference links'
            msg += '\n %s' % extra_crossrefs
            msg += '\n\n' + indent(str(ex), ' > ')
            res.note_error(msg)
        else:
            logger.debug('%s %s' % (r.status_code, extra_crossrefs))
            if r.status_code == 404:
                msg = 'Could not read external cross refs: %s' % r.status_code
                msg += '\n url: ' + extra_crossrefs
                msg += '\n This is normal if you have not pushed this branch yet.'
                res.note_warning(msg)
                # logger.error(msg)
            s = bs(r.text)
            add_from_soup(s, extra_crossrefs, ignore_alread_present=True, ignore_if_conflict=True)

    # print soup
    res.set_result(str(soup))
    return res
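

# Illustrative sketch only (not part of the original module): a hypothetical
# call to get_cross_refs(). The directory name and URL below are made-up
# placeholders; the function itself expects directories containing
# crossref.html files produced by earlier builds.
def _example_get_cross_refs():  # pragma: no cover
    src_dirs = ['out/dist']                         # searched for crossref.html files
    permalink_prefix = 'http://example.org/book/'   # entries with this base_url are dropped
    extra_crossrefs = None                          # optionally, the URL of a remote crossref page
    aug = get_cross_refs(src_dirs, permalink_prefix, extra_crossrefs)
    return aug.get_result()  # HTML string with one element per collected ID
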