def insertEbookISBN(report_dict, doc_root, copyrightsection_stylename, copyrightstyles, isbn, isbnstyle):
    """Insert an ebook-ISBN paragraph after the last paragraph of the copyright block.

    The scan starts at the first paragraph returned by
    ``lxml_utils.findParasWithStyle`` for ``copyrightsection_stylename`` and
    moves forward for as long as the following paragraph's style is in
    ``copyrightstyles``.  ``lxml_utils.insertPara`` is then called with
    ``copyrightstyles[0]``, the paragraph reached, ``doc_root``, ``isbn`` and
    ``"after"``.  The element following that paragraph is treated as the
    newly inserted one: an ``rPr`` element containing an ``rStyle`` whose
    ``val`` attribute is ``isbnstyle`` is placed immediately before its first
    ``w:t`` descendant.

    Returns the ``report_dict`` object that was passed in (it is also handed
    to ``lxml_utils.logForReport_old``).

    Raises IndexError if no paragraph has ``copyrightsection_stylename``.
    """
    logger.info("* * * commencing insertEbookISBN function...")

    # anchor on the first paragraph that carries the copyright-section style
    anchor_para = lxml_utils.findParasWithStyle(copyrightsection_stylename, doc_root)[0]
    neighbors = lxml_utils.getNeighborParas(anchor_para)

    # move the anchor down through every contiguous copyright-styled paragraph
    while neighbors["nextstyle"] in copyrightstyles:
        anchor_para = neighbors["next"]
        neighbors = lxml_utils.getNeighborParas(anchor_para)

    # insert the isbn paragraph after the anchor
    lxml_utils.insertPara(copyrightstyles[0], anchor_para, doc_root, isbn, "after")
    isbn_para = anchor_para.getnext()
    isbn_text = isbn_para.find(".//*w:t", wordnamespaces)

    # build <w:rPr><w:rStyle w:val=isbnstyle/></w:rPr> and put it ahead of the text
    run_props = etree.Element("{%s}rPr" % wnamespace)
    run_style = etree.Element("{%s}rStyle" % wnamespace)
    run_style.set("{%s}val" % wnamespace, isbnstyle)
    run_props.append(run_style)
    isbn_text.addprevious(run_props)

    # log for report
    lxml_utils.logForReport_old(report_dict, doc_root, isbn_para, "added_ebook_isbn", "added '%s'" % isbn)
    return report_dict
def concatTitleParas(titlestyle, report_dict, doc_root):
    """Merge a run of consecutive ``titlestyle`` paragraphs into the first one.

    The first paragraph is located via the first ``w:pStyle`` element whose
    ``w:val`` equals ``titlestyle`` (its grandparent is taken as the
    paragraph).  ``replaceSoftBreak`` is applied to it and to every
    immediately following paragraph that has the same style; the text of each
    following paragraph is appended to the title text, separated by a single
    space, and that paragraph is removed from its parent.  If the resulting
    text differs from the first paragraph's text, ``lxml_utils.addRunToPara``
    is called with the combined text and the change is logged via
    ``lxml_utils.logForReport_old``.

    Returns ``report_dict`` as returned by the last ``replaceSoftBreak`` call.
    """
    logger.info("* * * commencing concatTitleParas function...")

    # locate the first title paragraph (pStyle -> pPr -> p)
    style_xpath = ".//*w:pStyle[@w:val='%s']" % titlestyle
    title_para = doc_root.find(style_xpath, wordnamespaces).getparent().getparent()

    # replace softbreaks in the first title para before reading its text
    report_dict = replaceSoftBreak(title_para, report_dict)
    original_title = lxml_utils.getParaTxt(title_para)
    combined_title = original_title

    neighbors = lxml_utils.getNeighborParas(title_para)
    while neighbors['nextstyle'] == titlestyle:
        extra_para = neighbors['next']
        # replace softbreaks in this title para too, then append its text
        report_dict = replaceSoftBreak(extra_para, report_dict)
        combined_title = "%s %s" % (combined_title, lxml_utils.getParaTxt(extra_para))
        # look up the neighbors first, then delete the para we just absorbed
        neighbors = lxml_utils.getNeighborParas(extra_para)
        extra_para.getparent().remove(extra_para)

    # only rewrite the paragraph (and log) when something was appended
    if combined_title != original_title:
        lxml_utils.addRunToPara(combined_title, title_para, True)
        # log for report (optional)
        lxml_utils.logForReport_old(
            report_dict, doc_root, neighbors['next'],
            "concatenated_extra_titlepara_and_removed", combined_title)
    return report_dict
def evalPosition(sectionname, section_start_rules, cbstring, sectionbegin_para, sectiontypes):
    """Evaluate the 'position' rule for ``sectionname`` at ``sectionbegin_para``.

    The nearest preceding and following paragraph styles that appear in
    ``sectiontypes["all"]`` are looked up by scanning away from
    ``sectionbegin_para`` in each direction.  When the scan ends without
    finding one, ``sectiontypes["frontmatter"][0]`` is used for the preceding
    style and ``sectiontypes["backmatter"][0]`` for the following style.

    The required position is ``section_start_rules[sectionname]["position"]``:
      * "frontmatter": the preceding style must be in ``sectiontypes["frontmatter"]``
      * "main": either style must be in ``sectiontypes["main"]``
      * "backmatter": the following style must be in ``sectiontypes["backmatter"]``

    ``cbstring`` is accepted but not used here.

    Returns True when the criteria for the required position match, else False.
    """
    logger.debug("evaluate 'position' rule...")
    known_sectionstarts = sectiontypes["all"]

    def nearest_sectionstart(stylekey, parakey):
        # step away from sectionbegin_para until the neighbor style is empty
        # or is one of the known section-start styles
        neighbors = lxml_utils.getNeighborParas(sectionbegin_para)
        while neighbors[stylekey] and neighbors[stylekey] not in known_sectionstarts:
            neighbors = lxml_utils.getNeighborParas(neighbors[parakey])
        return neighbors[stylekey]

    # scan upwards; fall back to the first frontmatter style if nothing was found
    last_sectionstart = nearest_sectionstart('prevstyle', 'prev')
    if last_sectionstart not in known_sectionstarts:
        last_sectionstart = sectiontypes["frontmatter"][0]

    # scan downwards; fall back to the first backmatter style if nothing was found
    next_sectionstart = nearest_sectionstart('nextstyle', 'next')
    if next_sectionstart not in known_sectionstarts:
        next_sectionstart = sectiontypes["backmatter"][0]

    # compare the desired position with the surrounding section starts
    position = section_start_rules[sectionname]["position"]

    if position == "frontmatter" and last_sectionstart in sectiontypes["frontmatter"]:
        logger.debug("'frontmatter' criteria matched- prev_sectionstart: '%s'" % last_sectionstart)
        return True

    if position == "main" and (last_sectionstart in sectiontypes["main"]
                               or next_sectionstart in sectiontypes["main"]):
        logger.debug("'main' criteria matched- betweem '%s' and '%s'" % (last_sectionstart, next_sectionstart))
        return True

    if position == "backmatter" and next_sectionstart in sectiontypes["backmatter"]:
        logger.debug("'backmatter' criteria matched- next_sectionstart: '%s'" % next_sectionstart)
        return True

    logger.debug("'%s' criteria not matched- betweem '%s' and '%s'" % (position, last_sectionstart, next_sectionstart))
    return False
def evalSectionRequired(sectionname, section_start_rules, doc_root, titlestyle):
    """Find the paragraph before which a required section start should go.

    Nothing is searched for unless ``checkForParaStyle`` compares equal to
    False for the transformed ``sectionname``.  Otherwise the document is
    scanned downwards from the first paragraph carrying the transformed
    ``titlestyle`` until a following paragraph's style is one of the
    transformed ``section_required.insert_before`` styles for this section.

    Returns that following paragraph, or None when the check above does not
    compare equal to False, when no title paragraph exists, or when no
    insert_before style is found below the title paragraph.
    """
    logger.debug("evaluate section-required rule...")

    # kept as an equality test so the helper's return value is treated exactly as before
    style_present = checkForParaStyle(lxml_utils.transformStylename(sectionname), doc_root)
    if not (style_present == False):
        return None

    insertstyles = [
        lxml_utils.transformStylename(stylename)
        for stylename in section_start_rules[sectionname]["section_required"]["insert_before"]
    ]

    # Scanning down from the title page is used here rather than locating the first
    # occurrence of every insert style and comparing paragraph indexes.
    title_xpath = ".//*w:pStyle[@w:val='%s']" % lxml_utils.transformStylename(titlestyle)
    title_pstyle = doc_root.find(title_xpath, wordnamespaces)
    if title_pstyle is None:
        logger.debug("no titlepageTitle para, cannot process sectionrequired")
        return None

    # pStyle -> pPr -> p
    titlepara = title_pstyle.getparent().getparent()

    # walk down until the style runs out or an insert_before style is next
    neighbors = lxml_utils.getNeighborParas(titlepara)
    while neighbors['nextstyle'] and neighbors['nextstyle'] not in insertstyles:
        neighbors = lxml_utils.getNeighborParas(neighbors['next'])

    next_sectionstart = neighbors['nextstyle']
    if next_sectionstart not in insertstyles:
        logger.debug("no 'insert_before' styles found, cannot insert sectionstart")
        return None

    logger.debug("section_required criteria met; 1st insertbefore_style: '%s'" % next_sectionstart)
    return neighbors['next']
def deletePrecedingPageBreak(para, report_dict):
    """Remove a page break sitting in the paragraph directly before ``para``.

    Looks at every ``w:br`` with ``w:type='page'`` inside the preceding
    paragraph:
      * no page breaks: nothing is removed
      * page break(s) plus non-whitespace text: nothing is removed
      * exactly one page break and no text: the whole preceding paragraph is removed
      * several page breaks and no text: only the last page break element is removed

    Nothing is done when ``len()`` of the preceding-paragraph entry is 0.

    ``report_dict`` is not modified here; it is returned as received.
    """
    logger.debug("checking for page break in preceding para...")
    neighbors = lxml_utils.getNeighborParas(para)
    prev_para = neighbors['prev']

    if len(prev_para) == 0:
        return report_dict

    # find all pagebreaks in the preceding paragraph
    breaks = prev_para.findall(".//*w:br[@w:type='page']", wordnamespaces)

    if len(breaks) == 0:
        logger.info("preceding para is not a pagebreak, skipping delete")
    elif neighbors['prevtext'].strip():
        # strip() is needed: a pagebreak apparently carries some whitespace value
        logger.info("preceding pagebreak has text contents, not deleting")
    elif len(breaks) == 1:
        logger.info("empty preceding pb para, deleting it")
        # remove pagebreak para
        prev_para.getparent().remove(prev_para)
    else:
        logger.info("multiple pagebreak chars in preceding para: removing the last one")
        # remove last pagebreak char from the preceding paragraph
        last_break = breaks[-1]
        last_break.getparent().remove(last_break)

    return report_dict
def findSectionBegin(sectionname, section_start_rules, doc_root, versatileblockparas, para, cbstring):
    """Scan upwards from ``para`` to find where its contiguous block begins.

    Header styles are the transformed
    ``section_start_rules[sectionname][cbstring]["styles"]``; when the rule
    has an ``"optional_heading_styles"`` entry those (transformed) are
    accepted as headers as well.  The scan continues upwards while the
    previous paragraph's style is a header style, an optional header style or
    one of ``versatileblockparas``.

    Returns a tuple ``(sectionbegin_para, firstStyleOfBlock)``:
      * ``sectionbegin_para`` is the highest paragraph reached whose style is
        a header or optional header style, or ``para`` if none was reached
      * ``firstStyleOfBlock`` is False if any paragraph reached during the
        scan has a style from the rule's ``"styles"`` list, otherwise True

    ``doc_root`` is accepted but not used here.
    """
    rule = section_start_rules[sectionname][cbstring]

    # required header styles, plus optional heading styles when the rule defines them
    headers = [lxml_utils.transformStylename(stylename) for stylename in rule["styles"]]
    allheaders = list(headers)
    if "optional_heading_styles" in rule:
        allheaders += [
            lxml_utils.transformStylename(stylename)
            for stylename in rule["optional_heading_styles"]
        ]
    scannable_styles = allheaders + versatileblockparas

    block_start = para
    first_of_block = True
    neighbors = lxml_utils.getNeighborParas(para)

    # climb through optional headers, versatile block paras and listed styles
    while neighbors['prevstyle'] in scannable_styles:
        logger.debug("found leading header/versatile styled para:'%s'" % neighbors['prevstyle'])
        candidate = neighbors['prev']
        neighbors = lxml_utils.getNeighborParas(candidate)
        candidate_style = lxml_utils.getParaStyle(candidate)

        if candidate_style in headers:
            # a listed style sits above us in this block, so para is not the first of it
            block_start = candidate
            first_of_block = False
        elif candidate_style in allheaders:
            # optional header: the block starts here, versatile paras alone do not move it
            block_start = candidate

    return block_start, first_of_block
def evalPrevUntil(sectionname, section_start_rules, cbstring, sectionbegin_para):
    """Evaluate the 'previous_until' rule for ``sectionname`` at ``sectionbegin_para``.

    Scans upwards from ``sectionbegin_para`` until the previous paragraph's
    style is empty or is one of the transformed
    ``previous_sibling.required_styles`` / ``previous_until`` styles found
    under ``section_start_rules[sectionname][cbstring]``.

    Returns:
        True if a previous_until style is reached first.
        False if a required style is reached first (a style present in both
        lists counts as required), or if the scan runs out of styled
        paragraphs before reaching either.
    """
    logger.debug("evaluating previous until rule...")
    rule = section_start_rules[sectionname][cbstring]
    requiredstyles = [
        lxml_utils.transformStylename(style)
        for style in rule["previous_sibling"]["required_styles"]
    ]
    prevuntil_styles = [
        lxml_utils.transformStylename(style)
        for style in rule["previous_until"]
    ]
    stop_styles = requiredstyles + prevuntil_styles

    # scan upwards until we hit a stop style or run out of styled paragraphs
    pneighbors = lxml_utils.getNeighborParas(sectionbegin_para)
    while pneighbors['prevstyle'] and pneighbors['prevstyle'] not in stop_styles:
        pneighbors = lxml_utils.getNeighborParas(pneighbors['prev'])
    prevstyle = pneighbors['prevstyle']

    # figure out whether we matched a required style or a prevuntil style
    if prevstyle in requiredstyles:
        logger.debug(
            "false: found required-style before prev_until-style:'%s'" % prevstyle)
        return False
    if prevstyle in prevuntil_styles:
        # message corrected: this branch means the prev_until style came first
        logger.debug(
            "true: found prev_until-style before required-style:'%s'" % prevstyle)
        return True

    # the loop only ends on a stop style or an empty style, so the style is empty here
    logger.debug(
        "false: reached the beginning of the document, which indicates erroneous styling")
    return False
def precedingStyleCheck(sectionname, section_start_rules, cbstring, sectionbegin_para, sectiontypes):
    """Report whether the paragraph before ``sectionbegin_para`` already has an acceptable style.

    Acceptable styles are the transformed
    ``section_start_rules[sectionname][cbstring]["previous_sibling"]["required_styles"]``
    followed by everything in ``sectiontypes["all"]``.

    Returns True if the previous paragraph's style is in that list, else False.
    """
    logger.debug("checking prev-sibling for existing acceptable style...")

    # acceptable previous-sibling styles: the rule's required styles plus all section styles
    sibling_rule = section_start_rules[sectionname][cbstring]["previous_sibling"]
    acceptable_styles = [
        lxml_utils.transformStylename(stylename)
        for stylename in sibling_rule["required_styles"]
    ] + sectiontypes["all"]

    prevstyle = lxml_utils.getNeighborParas(sectionbegin_para)["prevstyle"]
    if prevstyle not in acceptable_styles:
        return False

    logger.debug("previous style already has section start style: '%s'" % prevstyle)
    return True