def no_fonts(pq):
    # yuk - lxml etree and PyQuery objs get confused - nested ones aren't
    # removed, this goes only 2 levels
    raise Exception("yuk - it's a mess, use tidy!")
    pq = PyQuery(pq)
    for font in pq('font'):
        font = PyQuery(font)
        # inner = innerhtml(font)  # .text() / .replace(':', '').strip()
        font.replaceWith(font.html())
        # font.getparent().replace(font, PyQuery(inner))
        print 'font replaced:', font[:60]
    # second pass for fonts nested one level deeper
    for font in pq('font'):
        font = PyQuery(font)
        font.replaceWith(font.html())
        print 'font 2 replaced:', font[:60]
    return pq
def ReadURL(url):
    trytime = 0
    pq = None
    while trytime < 3:
        try:
            pq = PyQuery(url=url)
            break
        except Exception as e:
            print 'Exception!', url
            trytime += 1
            if trytime >= 3:  # give up after three attempts
                raise e
            time.sleep(SLEEP_BETWEEN_REQUEST)
    if pq is None or pq.html() is None:
        return ''
    return pq.html()
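# A minimal sketch of ReadURL in use; SLEEP_BETWEEN_REQUEST is assumed to be a
# module-level delay (in seconds) defined alongside the function above.
import time
from pyquery import PyQuery

SLEEP_BETWEEN_REQUEST = 2  # hypothetical value for this sketch

markup = ReadURL('https://example.com/')
if markup:
    print(markup[:60])  # first characters of the page's inner HTML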
def _split(inputfile, outputdir):
    source = open(inputfile, 'r')
    html = source.read()
    source.close()
    if not os.path.isdir(outputdir):
        os.mkdir(outputdir)
    idx_slide = 0
    idx_section = 0
    parsed = PyQuery(html)
    for section in parsed('section'):
        slide = PyQuery(section)
        if slide.has_class('stack'):
            idx_section += 1
            stack_path = os.path.join(outputdir, '%02d' % idx_section)
            os.mkdir(stack_path)
            for sub_slide in PyQuery(slide.html())('section'):
                idx_slide += 1
                _dump_slide(sub_slide, idx_slide, stack_path)
        else:
            if not slide.parent().has_class('stack'):
                idx_slide += 1
                _dump_slide(slide, idx_slide, outputdir)
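# A short sketch of driving _split above: it expects a slide deck where nested
# <section class="stack"> elements hold sub-slides, and hands each slide to
# _dump_slide (defined elsewhere). The file and folder names are hypothetical.
_split('presentation.html', 'slides_out')
# -> slides_out/01/, slides_out/02/, ... one numbered folder per stack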
def _enhance_text(self):
    """
    Transforms a simplified text into a valid mail.template text.
    :return: mail.template text
    """
    self.ensure_one()
    # Parse and set back the keywords into raw template code
    html_text = PyQuery(self.simplified_text.replace('\n', ''))

    def sort_keywords(kw):
        # Replace first if/for-clauses, then var, then code
        index = kw.position
        if kw.type == 'if' or 'for' in kw.type:
            index += 2 * len(self.body_html) * kw.nested_position
            # Take if and for in the appearing order in the text
            index -= kw.position
        elif kw.type == 'var':
            index += len(self.body_html)
        return index

    keywords = self.keyword_ids.sorted(sort_keywords, reverse=True)
    # Replace automatically generated keywords
    for keyword in keywords:
        keyword_text = html_text('#' + keyword.html_id)
        keyword_text.replace_with(keyword.final_text)
    # Replace user-added keywords
    template_text = html_text.html()
    for keyword in keywords.filtered(lambda k: k.type == 'code'):
        to_replace = u"[{}]".format(keyword.short_code)
        template_text = template_text.replace(to_replace, keyword.raw_code)
    final_text = PyQuery(BeautifulSoup(template_text).prettify())
    return final_text('body').html()
def test_mount_tag():
    root = PyQuery('<root></root>')
    tag = {'name': 'custom', 'html': '<custom><text>{opts.txt}</text></custom>'}
    dom = vdom.mount_tag(root, tag, {'txt': 'hello world'})
    assert dom and dom.uuid        # dom created
    assert vdom.get_dom(dom.uuid)  # dom cached
    assert root.html()             # mounted something
def scrape(slug, url, name, title=None):
    f = urlopen(url)
    doc = f.read()
    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            # 'indent': 1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # raise Exception(errs)
        print errs
    # encoding='utf-8' didn't work, but the three lines above did:
    doc = html5lib.parse(doc, treebuilder="lxml")
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])
    td = jQuery("td#content")
    assert len(td) == 1
    for img in td("img"):
        img = PyQuery(img)
        src = img.attr("src")
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt
    # td = no_fonts(td)
    # need to fix links here
    content = PyQuery(td[0])
    content = no_namespaces(content.html())
    print slug, content[:60]
    if dbteeth:
        # q, created = QuickPage.objects.get_or_create(
        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                # defaults=dict(sortorder=sortorder),
            ),
        )
def fixLinks(text, parser):
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('a, link'):
        e = PyQuery(element)
        href = e.attr('href')
        if href is None:
            continue
        # Drop the query string in included src
        print 'from: ', href
        result = urlparse(href)
        if result.scheme == 'https':
            href = href
        elif result.scheme == '':
            href = result.path + (('#' + result.fragment) if result.fragment != '' else '')
        print 'to: ', href
        new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
        if not abs_url_regex.search(href):
            new_href = re.sub(r'/index\.html$', '/', new_href)
        if href != new_href:
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def sanitize_description(value):
    cleaned = PyQuery(value)
    cleaned = cleaned.remove('span.playMetaText')
    cleaned.remove('span.playMetaText')
    cleaned.remove('time')
    cleaned.remove('strong')
    return cleaned.html().split('<span>')[-1:][0].replace('</span>', '')
def sanitize_html2(value):
    soup = PyQuery(value)
    soup = soup.remove("span.playMetaText")
    soup.remove("span.playMetaText")
    soup.remove("time")
    soup.remove("strong")
    return soup.html().split("<span>")[-1:]
def get_pastes(self):
    Logger().log('Getting pastes', True)
    try:
        page = PyQuery(url=self.PASTES_URL)
    except KeyboardInterrupt:
        raise
    except:
        return self.CONNECTION_FAIL, None
    """
    There are a set of encoding issues which, coupled with some bugs in
    etree (such as in the Raspbian packages), can trigger encoding
    exceptions here. As a workaround, we try every possible encoding first,
    and even if that fails, we resort to a very hacky workaround whereby we
    manually get the page and attempt to encode it as utf-8. It's ugly,
    but it works for now.
    """
    try:
        page_html = page.html()
    except KeyboardInterrupt:
        raise
    except:
        worked = False
        for enc in all_python_encodings():
            try:
                page_html = page.html(encoding=enc)
                worked = True
                break
            except KeyboardInterrupt:
                raise
            except:
                pass
        if not worked:
            # One last try...
            try:
                f = urllib.request.urlopen(Crawler.PASTES_URL)
                page_html = PyQuery(str(f.read()).encode('utf8')).html()
                f.close()
            except KeyboardInterrupt:
                raise
            except:
                return self.OTHER_ERROR, None
    if re.match(r'Pastebin\.com - Access Denied Warning', page_html,
                re.IGNORECASE) or 'blocked your IP' in page_html:
        return self.ACCESS_DENIED, None
    else:
        return self.OK, page('.maintable img').next('a')
def clean_body(body):
    site = Site.objects.get_current()
    html = PyQuery('<body>' + body + '</body>')
    for p in html('p'):
        p = PyQuery(p)
        p.replaceWith('\n\n%s\n\n' % p.html())
    html('.alignright').addClass('pull-right').removeClass('alignright')
    html('.alignleft').addClass('pull-left').removeClass('alignleft')
    html('[style="float: left;"]').removeAttr('style').addClass('alignleft')
    html('[style="float: right;"]').removeAttr('style').addClass('alignright')
    while '\n\n\n' in body:
        body = body.replace('\n\n\n', '\n\n')
    while '\r\r\r' in body:
        body = body.replace('\r\r\r', '\r\r')
    body = html.html()
    body = body.replace('<br />', ' \n')
    body = body.replace('<br/>', ' \n')
    body = body.replace('<br>', ' \n')
    body = body.replace('\r\n', '\n')
    body = body.replace('\n\r', '\n')
    while body.find('\n\n\n') > -1:
        body = body.replace('\n\n\n', '\n\n')
    while body.startswith('\n'):
        body = body[1:]
    while body.endswith('\n'):
        body = body[:-1]
    while body.startswith('\r'):
        body = body[1:]
    while body.endswith('\r'):
        body = body[:-1]
    while body.startswith('\t'):
        body = body[1:]
    return body
def extract(self):
    self.html = re.sub('<!--.*?-->', '', self.html)
    doc = PyQuery(self.html)
    content_node = doc('div#contentText')
    content_node.remove('script')
    content_node.remove('style')
    content_node.remove('.line')
    content_node.remove('#shareIn')
    content_node.remove('.tagHotg')
    content_node.remove('.blank8')
    content_node.remove('div[class="editShare clear"]')
    content_node.remove('select')
    # content_node.remove('table[width = "100%"]')('td[align = "center"]')
    content_node.remove('div[class = "jingbian_travel01_04"]')
    content_node.remove('div[class = "txt2"]')
    content_node.remove('iframe')
    content_node.remove('embed')
    content_node.remove('td[style = "font-size: 14px; font-weight: bold;"]')
    content_node.remove('table[style = "margin-right: 20px;"]')
    content_node.remove('digi_perpage_bottom')
    content_node.remove('div[class = "extract clear"]')
    content_node.remove('table[bgcolor = "#eeeeee"]')
    content_node.remove('img[alt = "搜狐教育频道"]')
    content_node.remove('table[bgcolor = "#e2e2e2"]')
    content_node.remove('table[bgcolor = "#66ccff"]')
    content_node.remove('div[class = "digi_digest"]')
    item = ContentItem()
    imgs = content_node('img')
    img_all = []
    for img in imgs:
        if ".gif" in img.get('src'):
            continue
        else:
            imgs.eq(imgs.index(img)).append('<br>')
            imgs.eq(imgs.index(img)).before('<br>')
            img_all.append(self.getRealURI(img.get('src')))
    item['image_urls'] = img_all
    item['title'] = self.title = doc('h1').text()
    item['content'] = self.content = content_node.__unicode__()
    t = re.compile(u'var club_artinputdate = "(.*?)";')
    release_time = t.search(doc.html())
    if release_time:
        item['release_time'] = self.release_time = release_time.group(1)
        # item['release_switch_time'] = time.mktime(time.strptime(
        #     t.search(doc.html()).group(1), '%Y-%m-%d %H:%M:%S'))
    item['source'] = u'搜狐'
    author = doc('div[class = "function clear"]')
    self.author = author('div.l')('a').text()
    item['author'] = self.author
    item['pic_url'] = ''
    return item
def sanitize_description(value):
    cleaned = PyQuery(value)
    cleaned = cleaned.remove('span.playMetaText')
    cleaned.remove('span.playMetaText')
    cleaned.remove('span.playCount')
    cleaned.remove('time')
    cleaned.remove('strong')
    desc = cleaned.html()
    if desc is None:
        return ""
    return desc.split('<span>')[-1:][0].replace('</span>', '').strip()
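# A quick sketch of sanitize_description on made-up markup in the shape its
# selectors expect: the trailing <span> holds the description proper.
value = ('<p><span class="playMetaText">Season 1</span>'
         '<time>12:00</time><span>A show about nothing.</span></p>')
print(sanitize_description(value))  # -> 'A show about nothing.'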
def fixLinks(text, parser):
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('a'):
        e = PyQuery(element)
        href = e.attr('href')
        if href is None:
            continue
        if not abs_url_regex.search(href):
            new_href = re.sub(r'/index\.html$', '/', href)
            new_href = re.sub(r'index\.html', '/', new_href)
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def render_md5(self, post_content):
    config = Config()
    self.body = markdown2.markdown(
        post_content,
        extras=config.mdextras,
    )
    # Rewrite relative img srcs to full paths.
    d = PyQuery(self.body)
    for img in d.find('img'):
        if '/' not in img.attrib['src']:
            img.attrib['src'] = '{}{}/{}'.format(
                config.blogurl, self.outputpath, img.attrib['src'])
    self.body = d.html()
def plainify(html):
    doc = PyQuery('<body>%s</body>' % html)
    doc('img, audio, video, iframe, embed, object, script').remove()
    for a in doc('a, i, b, strong, em'):
        PyQuery(a).replaceWith(PyQuery(a).html())
    for b in doc('blockquote'):
        PyQuery(b).replaceWith(PyQuery(b).html())
    for a in doc('h1, h2, h3, h4, h5, h6'):
        PyQuery(a).replaceWith('<p>%s:</p>' % PyQuery(a).text())
    for p in doc('p'):
        t = (PyQuery(p).text() or '').strip()
        if not t:
            PyQuery(p).remove()
            continue
        if not t[-1] in string.punctuation:
            t += '. '
        if t.startswith('http:') or t.startswith('https:'):
            PyQuery(p).remove()
        if t.startswith('[') and t.endswith(']'):
            PyQuery(p).remove()
        PyQuery(p).html(t)
    for li in doc('li'):
        t = (PyQuery(li).text() or '').strip()
        if not t:
            PyQuery(li).remove()
            continue
        if not t[-1] in string.punctuation:
            t += '.'
        PyQuery(li).html(t)
    return html2text(doc.html())
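# A minimal sketch of plainify, assuming html2text here is the html2text
# package's top-level html2text() function.
sample = '<h2>Intro</h2><p>Hello world</p><script>track()</script>'
print(plainify(sample))  # roughly: 'Intro:\n\nHello world.'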
def fix_share_links(text, parser):
    td_regex = re.compile(target_domain + '|')
    assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for share_class in ['.icon-twitter', '.icon-facebook', '.icon-google-plus']:
        for element in d(share_class):
            e = PyQuery(element)
            href = e.attr('href')
            new_href = re.sub(domain, target_domain, href)
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def laundry_document(self, html, response):
    html = self.clean_html_document(html)
    # Replace <a> tags with text and strip class/id attributes.
    cleaner = Cleaner(style=False, links=False, add_nofollow=False,
                      page_structure=True, safe_attrs_only=True)
    html = cleaner.clean_html(html)
    dom = PyQuery(html)
    dom = self.convert_imgs(dom, response)
    dom = self.remove_links(dom)
    html = dom.html()
    # TODO: need to remove empty tags
    return html
def prepare_html(fileobj):
    """ prepares the html for wordpress pages """
    pq = PyQuery("".join(strip_if_not_pre(fileobj)))
    out = PyQuery(pq("div.content").outerHtml())
    # TODO: do we want to extract the title? Do we want title at all?
    if out("div.section"):
        out("div.section")[0].set("itemscope", "true")
        out("div.section")[0].set("itemtype", "http://schema.org/WebPage")
    if out("div.section > p > em"):
        out("div.section > p > em")[0].set("itemprop", "author")
    if out("div.section p"):
        if out("div.section > p > em"):
            out("div.section p")[1].set("itemprop", "description")
        else:
            out("div.section p")[0].set("itemprop", "description")
    if pq("div.section h1"):
        title = pq("div.section h1")[0].text
        out("div.section h1").css("display", "none")
        # set schema.org microdata for sharing
        out("div.section h1")[0].set("itemprop", "name")
    else:
        title = ""
    # TODO: insert toc (??) after h1 on 4th line
    # lines = out.split('\n')
    # out = '\n'.join(lines[:4] + ['[toc]'] + lines[4:])
    # now various regexes
    out = out.html()
    # replace .html with / and index.html with simple ./
    pattern = '(internal" href=".[^"]*)index\.html"'
    out = re.sub(pattern, '\\1"', out)
    pattern = 'internal" href="index\.html"'
    out = re.sub(pattern, 'href="./"', out)
    pattern = '(internal" href="[^"]*).html"'
    out = re.sub(pattern, '\\1/"', out)
    pattern = '(internal" href="[^"]*).html#([^"]*)"'
    out = re.sub(pattern, '\\1/#\\2"', out)
    return (out, title)
def fixLinks(text, parser):
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('a, link'):
        e = PyQuery(element)
        href = e.attr('href')
        if href is None:
            continue
        new_href = re.sub(r'(rss/index\.html)|((?<!\.)rss/?)$',
                          'rss/index.rss', href)
        if not abs_url_regex.search(href):
            new_href = re.sub(r'/index\.html$', '/', new_href)
        if href != new_href:
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def get_toc(self):
    def get_last_child(e, level):
        if level == 0:
            return e
        if level > 0:
            ee = e.children('ul:last-child')
            if not ee:
                e.append('<ul></ul>')
                ee = e.children('ul:last-child')
            return get_last_child(ee, level - 1)

    out = PyQuery('<ul></ul>')
    for tag in self.q('h1, h2, h3').items():
        level = int(tag[0].tag[1]) - 1
        assert 0 <= level
        aname = tag.attr('id')
        pp = get_last_child(out, level)
        pp.append(f'<li><a href="#{aname}">{tag.text()}</a></li>\n')
    return out.html() or ""
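# A minimal sketch of get_toc, assuming self.q is a PyQuery over a parsed page
# whose headings carry id attributes (the Page class below is illustrative).
from pyquery import PyQuery

class Page:
    def __init__(self, html):
        self.q = PyQuery(html)
    get_toc = get_toc  # reuse the function above as a method

page = Page('<h1 id="intro">Intro</h1><h2 id="setup">Setup</h2>')
print(page.get_toc())  # nested <ul> with links to #intro and #setup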
def to_xml(self):
    (_tag, contents) = list(self.iteritems())[0]
    pqi = PyQuery('<wrap />')

    def _append_contents(struct, par):
        tag = struct['tag']
        _node = PyQuery('<%s />' % tag)
        if 'attributes' in struct:
            for key in struct['attributes'].keys():
                _node.attr(key, struct['attributes'][key])
        if 'text' in struct:
            _node.text(struct['text'])
        elif 'children' in struct:
            for (ugh, child) in struct['children'].iteritems():
                _append_contents(child, _node)
        par.append(_node)

    _append_contents(contents, pqi)
    _xio = StringIO(pqi.html())
    _parsed = etree.parse(_xio)
    return etree.tostring(_parsed, pretty_print=True)
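# A sketch of the nested-dict shape to_xml above walks (Python 2, given the
# iteritems calls); the key names are read off the code, the values made up.
struct = {
    'root': {                 # the outer tag name ("_tag") is discarded
        'tag': 'book',
        'attributes': {'id': '42'},
        'children': {
            'first': {'tag': 'title', 'text': 'Dune'},
        },
    },
}
# An object whose iteritems() yields this pair would serialize to roughly:
# <book id="42"><title>Dune</title></book>  (pretty-printed by lxml)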
def fixLinks(text, parser):
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('a'):
        e = PyQuery(element)
        href = e.attr('href')
        print href
        if href is None:
            continue
        new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
        if not abs_url_regex.search(href):
            new_href = re.sub(r'/index\.html$', '/', new_href)
        if href != new_href:
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    # remove ?v=XXXXXXXXX in css
    for element in d('link'):
        e = PyQuery(element)
        href = e.attr('href')
        if href is None:
            continue
        if re.match(r'http://fonts', href) is not None:
            continue
        new_href = re.sub(r'\?.*', '', href)
        if href != new_href:
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    # remove ?v=XXXXXXXXX in js
    for element in d('script'):
        e = PyQuery(element)
        src = e.attr('src')
        if src is None:
            continue
        new_src = re.sub(r'\?.*', '', src)
        if src != new_src:
            e.attr('src', new_src)
            print "\t", src, "=>", new_src
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def refresh_text(self):
    """
    Save the current text in the if clause and refresh the selected clauses.
    """
    text = PyQuery(self.simplified_text)
    for key in self.keyword_ids.filtered(
            lambda k: k.type in ('if', 'for', 'for_ul')):
        text_selector = text('#' + key.html_id)
        current_text = text_selector.html()
        # Save the current text in the correct if clause
        if key.edit_changed % 2 == 0:
            edit_value = key.edit_value
        else:
            edit_value = not key.edit_value
        if current_text is not None:
            key.set_text(current_text, edit_value)
        if key.edit_changed and not self._context.get('save_mode'):
            # Now we fetch the current clause text
            text_selector.html(key.get_text())
            key.write({'edit_changed': 0})
    self.with_context(no_update=True).simplified_text = text.html()
    return True
def fixLinks(text, parser):
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('a, link'):
        e = PyQuery(element)
        href = e.attr('href')
        if href is None:
            continue
        if (not abs_url_regex.search(href)) or ('/rss/' in href):
            new_href = re.sub(r'rss/$', 'feed.rss', href)
            new_href = re.sub(r'index\.html$', '', new_href)
            new_href = re.sub(r'index\.html\#$', '', new_href)
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return ("<!DOCTYPE html>\n<html>"
                + d.html(method='html').encode('utf8') + "</html>")
    elif parser == 'xml':
        return ('<?xml version="1.0" encoding="UTF-8"?>'
                + d.__unicode__().encode('utf8'))
    return ("<!DOCTYPE html>\n<html>"
            + d.__unicode__().encode('utf8') + "</html>")
def GetBrands():
    mysql = pymysql.connect("localhost", "root", "root", "test", charset="utf8")
    url = ("https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx"
           "?typeId=1%20&brandId=0%20&fctId=0%20&seriesId=0")
    r = GetHtml(url)
    doc = PyQuery(r.text)
    cartree = doc('.cartree')
    cursor = mysql.cursor()
    count = 0
    for pp in doc("ul"):
        pp1 = PyQuery(pp)
        for zipp in pp1('li'):
            a = PyQuery(zipp).find("a")
            title = a.html()
            title = re.findall("/>(.*)<em>", title)
            number = PyQuery(zipp).find("a").find("em").html()
            number = re.findall("[(](.*)[)]", number)
            a = 'https://car.autohome.com.cn' + a.attr("href")
            sql = "insert into brands (brand,count,url) values('%s','%s','%s')" % (
                title[0], number[0], a)
            try:
                # execute the SQL statement
                cursor.execute(sql)
                mysql.commit()
                count = count + 1
            except:
                # roll back on error
                mysql.rollback()
    # close the database connection
    mysql.close()
    return count
def build_dict_from_sane_json(elem: PyQuery, already_wrapped=False) -> dict:
    # Find out whether the element has children
    elem = PyQuery(elem)
    children = list(elem.contents())
    has_children = len(elem.children()) > 0
    contents = []
    if has_children:
        # Fix unwrapped children
        if not already_wrapped:
            children = fix_unwrapped_text(elem).contents()
        for child in children:
            child_dict = build_dict_from_sane_json(child, already_wrapped=True)
            if child_dict:
                contents.append(child_dict)
    else:
        contents = elem.html()
    extra = {}
    # Only tables need the HTML (to use later for extraction of relevant data)
    if elem.is_("table"):
        extra = {'original_html': str(elem)}
    if 'src' in elem[0].attrib:
        extra['src'] = elem.attr('src')
    if 'href' in elem[0].attrib:
        extra['href'] = elem.attr('href')
    return {
        'type': list(elem)[0].tag,
        'attrs': [],
        'layout': {},
        'contents': contents,
        'extra': extra,
    }
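# A quick sketch of the dict the builder above returns for a childless
# element; leaves keep their inner HTML in 'contents'.
from pyquery import PyQuery

leaf = PyQuery('<p>hello</p>')
# build_dict_from_sane_json(leaf) yields roughly:
# {'type': 'p', 'attrs': [], 'layout': {}, 'contents': 'hello', 'extra': {}}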
def prepare_html(fileobj):
    """ prepares the html for wordpress pages """
    pq = PyQuery("".join(strip_if_not_pre(fileobj)))
    pq("a.headerlink").remove()
    # Do we want title at all?
    if pq("div.section h1"):
        title = pq("div.section h1")[0].text
        pq("div.section h1:first").remove()
    else:
        title = ""
    # TODO: insert toc (??)
    out = PyQuery(pq("div.content").outerHtml())
    # insert after h1 on 4th line
    # lines = out.split('\n')
    # out = '\n'.join(lines[:4] + ['[toc]'] + lines[4:])
    # now various regexes
    out = out.html()
    print out
    # replace .html with / and index.html with simple ./
    pattern = '(internal" href=".[^"]*)index\.html"'
    out = re.sub(pattern, '\\1"', out)
    pattern = 'internal" href="index\.html"'
    out = re.sub(pattern, 'href="./"', out)
    pattern = '(internal" href="[^"]*).html"'
    out = re.sub(pattern, '\\1/"', out)
    pattern = '(internal" href="[^"]*).html#([^"]*)"'
    out = re.sub(pattern, '\\1/#\\2"', out)
    pattern = '(internal" href="[^"]*/)index/#([^"]*)"'
    out = re.sub(pattern, '\\1/#\\2"', out)
    return (out, title)
def get_readers_from_html_content(self, fname, html, **kwargs):
    try:
        from pyquery import PyQuery
    except:
        print >>sys.stderr, "could not import pyquery"
        return []
    parsers = []
    pq = PyQuery(html)
    tables = self.find_ideal_tables(pq('table'))
    for table_el in tables:
        try:
            table = PyQuery(table_el)
            p = HTMLTableParser(StringIO(table.html()), fname, **kwargs)
            i = p.get_data_iter()
            consistent, ncols = html_rows_consistent(i())
            if consistent and ncols > 1:
                parsers.append(i)
        except KeyboardInterrupt:
            pass
        except Exception as e:
            _log.info(traceback.format_exc())
    return parsers
def fix_meta_url_links(text, parser):
    filetext = text.decode('utf8')
    td_regex = re.compile(target_domain + '|')
    assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
    d = PyQuery(bytes(bytearray(filetext, encoding='utf-8')), parser=parser)
    for share_class in ['meta[property="og:url"], meta[name="twitter:url"]']:
        print "share_class : ", share_class
        for element in d(share_class):
            e = PyQuery(element)
            print "element : ", e
            href = e.attr('content')
            print "href : ", href
            print "domain : ", domain
            print "target_domain : ", target_domain
            new_href = re.sub(domain, target_domain, href)
            e.attr('content', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
exit(1)
# csvwriter.writerow(csv_header)

# Query the first page with location option
url = "https://www.vrbo.com/search/keywords:chapel-hill-nc-usa/@35.874919139908165,-79.113394930114,35.95736065574712,-79.01846618865892,13z?petIncluded=false&ssr=true"
# url = 'https://www.homeaway.com/results/keywords:Chapel%20Hill%2C+NC%2C+USA%29/Page:'
results = [['' for i in range(5)] for j in range(200)]
result_count = 0
page = 1
while page < 2:
    x = PyQuery(url + str(page))
    start_location = end_location = 0
    while True:
        property_data = {}
        start_location = x.html().find('"bathrooms":', end_location) + 12
        end_location = x.html().find('}', start_location) + 1
        try:
            property_data = json.loads(x.html()[start_location:end_location])
            results[result_count][0] = property_data['full'] + property_data['half'] * 0.5
        except:
            break
        start_location = x.html().find('"bedrooms":', end_location) + 11
        end_location = x.html().find(',', start_location)
        results[result_count][1] = x.html()[start_location:end_location]
        start_location = x.html().find('"propertyType":', end_location) + 15
        end_location = x.html().find(',', start_location)
        results[result_count][2] = re.sub(
            r'"', '',
class SummaryPublisherEngine(object):
    def __init__(self, pro, doc, wc, group, organization=None):
        self.project = pro
        self.document = doc
        self.word_count = wc
        self.groups = [group.key, Group.get_worldshare().key]
        self.organization = organization
        self.user = User()
        self.user.groups = self.groups
        self.walker = ConceptPublishWalker(pro)
        if organization:
            self.user.organization = organization.key
        self.html = ''
        self.body = Pq('<span></span>')
        self.con_count = 0
        self.paragraph = None

    def _get_next_concept(self):
        for level in self.walker:
            for concept in level:
                yield concept

    def render(self):
        cur_wc = 0
        concept_count = 0
        processed_concepts = {}
        for concept in self._get_next_concept():
            if concept:
                if not concept.has_permission_read(self.user):
                    continue
                render = True
                if not concept.is_summary_crawlable(document=self.document,
                                                    project=self.project):
                    render = False
                attr = concept.get_attr_by_doc(self.document)
                if attr and attr.is_header():
                    render = False
                if attr and attr.is_image():
                    render = False
                if render:
                    phrase = concept.get_phrasing(doc=self.document,
                                                  return_text=False)
                    wc = phrase.get_word_count()
                    if wc + cur_wc > self.word_count:
                        break
                    concept_count += 1
                    cur_wc += wc
                    parent = concept.get_parent()
                    if not processed_concepts.get(parent.id):
                        processed_concepts[parent.id] = []
                    processed_concepts[parent.id].append(concept)
        paragraph_divider = 300
        paragraph_count = cur_wc / paragraph_divider
        if cur_wc % paragraph_divider > 0:
            paragraph_count += 1
        con_pre_par = (concept_count / paragraph_count) + 1
        self.paragraph = Pq('<p></p>')
        self.body.append(self.paragraph)
        self.con_count = 0
        self._render(self.project, con_pre_par, processed_concepts)
        self.html = self.body.html(method='html')

    def _render(self, parent, con_pre_par, processed_concepts):
        if not processed_concepts.get(parent.id):
            return
        for concept in processed_concepts.get(parent.id):
            render = True
            if not concept.is_summary_crawlable(document=self.document,
                                                project=self.project):
                render = False
            attr = concept.get_attr_by_doc(self.document)
            if attr and attr.is_header():
                render = False
            if attr and attr.is_image():
                render = False
            if render:
                if self.con_count == con_pre_par:
                    self.con_count = 0
                    self.paragraph = Pq('<p></p>')
                    self.body.append(self.paragraph)
                phrase = concept.get_summary_phrasing(document=self.document)
                span = Pq('<span></span>')
                span.append(phrase.text + ' ')
                # span.css('background-color', ChannelToken.generate_color())
                self.paragraph.append(span)
                self.con_count += 1
            self._render(concept, con_pre_par, processed_concepts)
def save_cache(self, content: pq):
    with open(self.get_cache_filename(), 'w+', encoding='utf-8') as f:
        print(content.html(), file=f)
def _wrap(elem):
    """ Wrap an element with a span element """
    span = PyQuery('<span></span>')
    span.html(elem)
    return span
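# A minimal sketch of _wrap; PyQuery's .html() setter accepts a markup string,
# so the fragment becomes the span's inner content.
from pyquery import PyQuery

print(_wrap('<b>bold</b>'))  # -> <span><b>bold</b></span>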
    }))
    for i, data in enumerate(db.data)
]
state['all'] = len(db.data)
# threadLoadCont(data, i)
mReq = threadpool.makeRequests(threadLoadCont, argList)
[pool.putRequest(req) for req in mReq]
pool.wait()
# load the main page
loadPage = LoadPage(hostURL, 'gbk')
# anchor tags
aList = PQ(loadPage.data)('dl.chapterlist a')
# extract every anchor and store it in the database
for i in aList:
    aElem = PQ(i)
    title = aElem.html()
    url = hostURL + aElem.attr('href')
    if not (url in db.data):
        db.setData(url, {'title': title, 'url': url, 'isLoad': False})
# save
db.save()
loadCont()
print('\n\n抓取:【%s】\n总章节:%s\n成功:%s\n失败:%s\n缓存读取:%s\n网络抓取:%s\n错误地址:'
      % (hostURL, state['all'], state['success'], state['error'],
         state['forCache'], state['forNet']), state['errList'])
def _generate_translation(self):
    """ Generate child description. """
    desc = PyQuery(HTML_TEMPLATE)

    # 1. Program type only if Home Based + Birthday estimate
    ########################################################
    child = self.child_id
    if child.cdsp_type == "Home Based":
        desc(".program_type").html(
            self.home_based_lang[self.env.lang][child.gender].format(
                preferred_name=child.preferred_name))
    else:
        desc("#program_type").remove()
    if child.estimated_birthdate:
        desc(".birthday_estimate").html(
            _("* The birthday is an estimation."))
    else:
        desc("#birthday_estimate").remove()

    # 2. Household
    ##############
    household = child.household_id.with_context(active_gender=child.gender)
    live_with = self._live_with()
    desc("#live_with").html(live_with)
    if not household.father_living_with_child:
        f_alive = desc(".father").children(".is_alive")
        f_alive[0].text = _("Father alive")
        f_alive[1].text = household.translate("father_alive")
    else:
        desc(".father").remove()
    self._job(desc(".father_job"), "father")
    if not household.mother_living_with_child:
        m_alive = desc(".mother").children(".is_alive")
        m_alive[0].text = _("Mother alive")
        m_alive[1].text = household.translate("mother_alive")
    else:
        desc(".mother").remove()
    self._job(desc(".mother_job"), "mother")
    if household.nb_brothers:
        desc(".brothers")[0].text = _("Number of brothers")
        desc(".brothers")[1].text = str(household.nb_brothers)
    else:
        desc(".brothers").remove()
    if household.nb_sisters:
        desc(".sisters")[0].text = _("Number of sisters")
        desc(".sisters")[1].text = str(household.nb_sisters)
    else:
        desc(".sisters").remove()

    # 3. Schooling
    ##############
    if child.us_grade_level and child.us_grade_level != "Not Enrolled":
        # Make sure the education level is set
        child.convert_us_grade_to_education_level()
        desc("#school_attending").remove()
        desc(".school_level")[0].text = _("School level")
        desc(".school_level")[1].text = child.translate("education_level")
        if child.major_course_study:
            desc(".school_subject")[0].text = _("Best school subject")
            desc(".school_subject")[1].text = child.translate(
                "major_course_study")
        else:
            desc("#school_subject").remove()
        if child.vocational_training_type and \
                child.vocational_training_type.lower() not in (
                    "not enrolled", "other"):
            desc(".vocational_training")[0].text = _("Vocational training")
            desc(".vocational_training")[1].text = child.translate(
                "vocational_training_type")
        else:
            desc("#vocational_training").remove()
    else:
        desc(".school_attending_title").html(
            self.school_no_lang[self.env.lang][child.gender].format(
                preferred_name=child.preferred_name))
        desc(".school").remove()

    # 4. House duties
    #################
    if child.duty_ids:
        desc("#house_duties_intro").html(
            self.duties_intro_lang[self.env.lang][child.gender])
        desc("#house_duties_list").html("".join([
            "<li>" + duty.value + "</li>" for duty in child.duty_ids[:3]]))
    else:
        desc(".house_duties").remove()

    # 5. Church activities
    ######################
    if child.christian_activity_ids:
        desc("#church_activities_intro").html(
            self.church_intro_lang[self.env.lang][child.gender])
        desc("#church_activities_list").html("".join([
            "<li>" + activity.value + "</li>"
            for activity in child.christian_activity_ids[:3]]))
    else:
        desc(".church_activities").remove()

    # 6. Hobbies
    ############
    if child.hobby_ids:
        desc("#hobbies_intro").html(
            self.hobbies_intro_lang[self.env.lang][child.gender].format(
                preferred_name=child.preferred_name))
        desc("#hobbies_list").html("".join([
            "<li>" + hobby.value + "</li>" for hobby in child.hobby_ids[:3]]))
    else:
        desc(".hobbies").remove()

    # 7. Health
    ###########
    if child.physical_disability_ids or child.chronic_illness_ids:
        desc("#handicap_intro").html(
            self.handicap_intro_lang[self.env.lang][child.gender].format(
                preferred_name=child.preferred_name))
        handicap_list = []
        if child.physical_disability_ids:
            handicap_list.extend([
                "<li>" + handicap.value + "</li>"
                for handicap in child.physical_disability_ids])
        if child.chronic_illness_ids:
            handicap_list.extend([
                "<li>" + illness.value + "</li>"
                for illness in child.chronic_illness_ids])
        desc("#handicap_list").html("".join(handicap_list))
    else:
        desc(".handicap").remove()
    return desc.html()
def main():
    arguments = docopt(__doc__, version='0.1.3')
    if arguments['--dir'] is not None:
        static_path = arguments['--dir']
    else:
        static_path = os.path.join(os.getcwd(), 'static')
    if arguments['--web-url'] is not None:
        web_url = "{}".format(arguments['--web-url'])
    else:
        web_url = None
    domain = arguments['--domain']

    if arguments['generate']:
        command = (
            "wget "
            "--level=0 "                  # set level to infinite
            "--recursive "                # follow links to download entire site
            "--convert-links "            # make links relative
            "--page-requisites "          # grab everything: css/in-lined images
            "--no-parent "                # don't go to parent level
            "--directory-prefix {1} "     # download content to static/ folder
            "--no-host-directories "      # don't create domain named folder
            "--restrict-file-name=unix "  # don't escape query string
            "{0}").format(domain, static_path)
        os.system(command)
        command = (
            "wget "
            "--level=0 "                  # set level to infinite
            "--recursive "                # follow links to download entire site
            "--convert-links "            # make links relative
            "--page-requisites "          # grab everything: css/in-lined images
            "--no-parent "                # don't go to parent level
            "--directory-prefix {1} "     # download content to static/ folder
            "--no-host-directories "      # don't create domain named folder
            "--restrict-file-name=unix "  # don't escape query string
            "{0}/about/").format(domain, static_path)
        os.system(command)

        # rather do this with sitemap-generator
        """
        # copy sitemap files since Ghost 0.5.7
        base_command = "wget --convert-links --page-requisites --no-parent " \
                       "--directory-prefix {1} --no-host-directories " \
                       "--restrict-file-name=unix {0}/{2}"
        command = base_command.format(domain, static_path, "sitemap.xsl")
        os.system(command)
        command = base_command.format(domain, static_path, "sitemap.xml")
        os.system(command)
        command = base_command.format(domain, static_path, "sitemap-pages.xml")
        os.system(command)
        command = base_command.format(domain, static_path, "sitemap-posts.xml")
        os.system(command)
        command = base_command.format(domain, static_path, "sitemap-authors.xml")
        os.system(command)
        command = base_command.format(domain, static_path, "sitemap-tags.xml")
        os.system(command)
        """

        def pullRss(path):
            if path is None:
                baserssdir = os.path.join(static_path, "rss")
                mkdir_p(baserssdir)
                wget_command = ("wget --output-document=" + baserssdir +
                                "/feed.rss {0}/rss/").format(domain)
                os.system(wget_command)
            else:
                for feed in os.listdir(os.path.join(static_path, path)):
                    rsspath = os.path.join(path, feed, "rss")
                    rssdir = os.path.join(static_path, 'rss', rsspath)
                    mkdir_p(rssdir)
                    wget_command = ("wget --output-document=" + rssdir +
                                    "/index.html {0}/" + rsspath).format(domain)
                    os.system(wget_command)

        # pullRss("tag")
        # pullRss("author")

        # create 404.html file
        path_404 = os.path.join(static_path, "404.html")
        shutil.copyfile(os.path.join(static_path, "index.html"), path_404)
        with open(path_404) as f:
            file_text = f.read()
        d = PyQuery(bytes(bytearray(file_text, encoding='utf-8')), parser='html')
        e = d('main')
        e.replaceWith(
            """<main id="content"> <h2>404: Page not found</h2></main>""")
        text = d.html(method='html')
        text = text.replace('assets/styles/crisp.css',
                            'https://rdrn.me/assets/styles/crisp.css')
        new_text = "<!DOCTYPE html>\n<html>" + text + "</html>"
        with open(path_404, 'w') as f:
            try:
                f.write(new_text)
            except UnicodeEncodeError:
                f.write(new_text.encode('utf-8'))

        # remove query string since Ghost 0.4
        file_regex = re.compile(r'.*?(\?.*)')
        bad_file_regex = re.compile(r'.+\.[0-9]{1,2}$')
        static_page_regex = re.compile(r"^([\w-]+)$")
        for root, dirs, filenames in os.walk(static_path):
            for filename in filenames:
                if file_regex.match(filename):
                    newname = re.sub(r'\?.*', '', filename)
                    print("Rename", filename, "=>", newname)
                    os.rename(os.path.join(root, filename),
                              os.path.join(root, newname))
                if bad_file_regex.match(filename):
                    os.remove(os.path.join(root, filename))
                # if we're inside static_path or static_path/tag, rename
                # extension-less files to filename.html
                if (root == static_path or root == os.path.join(static_path, 'tag'))\
                        and static_page_regex.match(filename)\
                        and filename != 'CNAME' and filename != 'LICENSE':
                    newname = filename + ".html"
                    newpath = os.path.join(root, newname)
                    try:
                        os.remove(newpath)
                    except OSError:
                        pass
                    shutil.move(os.path.join(root, filename), newpath)

        # remove superfluous "index.html" from relative hyperlinks found in text
        abs_url_regex = re.compile(r'^(?:[a-z]+:)?//', flags=re.IGNORECASE)
        bad_url_regex = bad_file_regex

        def fixLinks(text, parser):
            if text == '':
                return ''
            try:
                d = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                            parser=parser)
            except UnicodeDecodeError:
                d = PyQuery(bytes(bytearray(text)), parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')
                if href is None:
                    continue
                if (not abs_url_regex.search(href)) or ('/rss/' in href):
                    new_href = re.sub(r"index.html", r"", href)
                    new_href = re.sub(r"^([\w-]+)$", r"\1.html", new_href)
                    if href != new_href:
                        e.attr('href', new_href)
                        print("\t", href, "=>", new_href)
                if (not abs_url_regex.search(href)) or ('/rss/' in href):
                    new_href = re.sub(r"/([\w-]+)$", r"/\1.html", href)
                    new_href = re.sub(r"^([\w-]+)$", r"\1.html", new_href)
                    if href != new_href:
                        e.attr('href', new_href)
                        print("\t", href, "=>", new_href)
                href = e.attr('href')
                if bad_url_regex.search(href):
                    new_href = re.sub(r'(.+)\.[0-9]{1,2}$', r'\1', href)
                    e.attr('href', new_href)
                    print("\t FIX! ", href, "=>", new_href)
            return "<!DOCTYPE html>\n<html>" + d.html(method='html') + "</html>"

        # fix links in all html files
        for root, dirs, filenames in os.walk(static_path):
            for filename in fnmatch.filter(filenames, "*.html"):
                filepath = os.path.join(root, filename)
                parser = 'html'
                if root.endswith("/rss"):
                    # rename rss index.html to index.rss
                    parser = 'xml'
                    newfilepath = os.path.join(
                        root, os.path.splitext(filename)[0] + ".rss")
                    os.rename(filepath, newfilepath)
                    filepath = newfilepath
                with open(filepath) as f:
                    filetext = f.read()
                print("fixing links in ", filepath)
                newtext = filetext
                if parser == 'html':
                    newtext = fixLinks(filetext, parser)
                with open(filepath, 'w') as f:
                    try:
                        f.write(newtext)
                    except UnicodeEncodeError:
                        f.write(newtext.encode('utf-8'))

        def trans_local_domain(text):
            modified_text = text.replace('http://localhost:2368', web_url)
            modified_text = modified_text.replace('http://', 'https://')
            modified_text = modified_text.replace('https://rdrn.me/', '/')
            modified_text = re.sub(r'(rss\/)[a-z]+(.html)', r'\1index.rss',
                                   modified_text)
            return modified_text

        def remove_v_tag_in_css_and_html(text):
            modified_text = re.sub(r"%3Fv=[\d|\w]+\.css", "", text)
            modified_text = re.sub(r".js%3Fv=[\d|\w]+", ".js", modified_text)
            modified_text = re.sub(r".woff%3[\d|\w]+", ".woff", modified_text)
            modified_text = re.sub(r".ttf%3[\d|\w]+", ".ttf", modified_text)
            modified_text = re.sub(r"css\.html", "css", modified_text)
            modified_text = re.sub(r"png\.html", "png", modified_text)
            modified_text = re.sub(r"jpg\.html", "jpg", modified_text)
            return modified_text

        for root, dirs, filenames in os.walk(static_path):
            for filename in filenames:
                if filename.endswith(('.html', '.css', '.xsl', '.rss')):  # removed xml
                    filepath = os.path.join(root, filename)
                    with open(filepath) as f:
                        filetext = f.read()
                    print("fixing local domain in ", filepath)
                    newtext = trans_local_domain(filetext)
                    newtext = remove_v_tag_in_css_and_html(newtext)
                    with open(filepath, 'w') as f:
                        f.write(newtext)

    elif arguments['preview']:
        os.chdir(static_path)
        Handler = http.server.SimpleHTTPRequestHandler
        httpd = socketserver.TCPServer(("", 9001), Handler)
        print("Serving at port 9001")
        # gracefully handle interrupt here
        httpd.serve_forever()

    elif arguments['setup']:
        if arguments['--gh-repo']:
            repo_url = arguments['--gh-repo']
        else:
            repo_url = input("Enter the Github repository URL:\n").strip()
        # Create a fresh new static files directory
        if os.path.isdir(static_path):
            confirm = input(
                "This will destroy everything inside static."
                " Are you sure you want to continue? (y/N)").strip()
            if confirm != 'y' and confirm != 'Y':
                sys.exit(0)
            shutil.rmtree(static_path)
        # User/Organization page -> master branch
        # Project page -> gh-pages branch
        branch = 'gh-pages'
        regex = re.compile(r".*[\w-]+\.github\.(?:io|com).*")
        if regex.match(repo_url):
            branch = 'master'
        # Prepare git repository
        repo = Repo.init(static_path)
        git = repo.git
        if branch == 'gh-pages':
            git.checkout(b='gh-pages')
        repo.create_remote('origin', repo_url)
        # Add README
        file_path = os.path.join(static_path, 'README.md')
        with open(file_path, 'w') as f:
            f.write(
                '# Blog\nPowered by [Ghost](http://ghost.org)'
                ' and [Buster](https://github.com/manthansharma/buster/).\n')
        print("All set! You can generate and deploy now.")

    elif arguments['deploy']:
        repo = Repo(static_path)
        repo.git.add('.')
        current_time = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        repo.index.commit('Blog update at {}'.format(current_time))
        origin = repo.remotes.origin
        repo.git.execute(
            ['git', 'push', '-u', origin.name, repo.active_branch.name])
        print("Good job! Deployed to Github Pages.")

    elif arguments['add-domain']:
        repo = Repo(static_path)
        custom_domain = arguments['<domain-name>']
        file_path = os.path.join(static_path, 'CNAME')
        with open(file_path, 'w') as f:
            f.write(custom_domain + '\n')
        print("Added CNAME file to repo. Use `deploy` to deploy")

    else:
        print(__doc__)
def __get_data(self):
    resp = self.session.get(reportURL)
    doc = PyQuery(resp.text)
    html = doc.html()
    tiwen = 36.5 + random.uniform(0, 0.3)  # randomized body temperature
    tiwen = round(tiwen, 1)
    zxMatch = re.findall(r'f8_state={.*?"SelectedValue":"(.+?)"', html)[0]
    gnMatch = re.findall(r'f14_state={.*?"SelectedValue":"(.+?)"', html)[0]
    shengMatch = re.findall(r'f16_state={.+?"SelectedValueArray":\["(.+?)"]', html)[0]
    shiMatch = re.findall(r'f17_state={.*?"F_Items":(.+?),"SelectedValueArray":\["(.+?)"]', html)[0]
    xianMatch = re.findall(r'f18_state={.*?"F_Items":(.+?),"SelectedValueArray":\["(.+?)"]', html)[0]
    xxMatch = re.findall(r'f20_state={.*?"Text":"(.+?)"', html)[0]
    F_State = template % (
        self.date, zxMatch, gnMatch, shengMatch, shiMatch[0], shiMatch[1],
        xianMatch[0], xianMatch[1], xxMatch, "否")
    return {
        'F_State': base64.b64encode(F_State.encode()),
        '__VIEWSTATE': doc.find('#__VIEWSTATE').attr('value'),
        '__EVENTTARGET': 'p1$ctl00$btnSubmit',
        '__EVENTARGUMENT': '',
        '__VIEWSTATEGENERATOR': doc.find('#__VIEWSTATEGENERATOR').attr('value'),
        'p1$ChengNuo': 'p1_ChengNuo',
        'p1$BaoSRQ': self.date,
        'p1$DangQSTZK': '良好',
        'p1$TiWen': str(tiwen),
        'F_TARGET': 'p1_ctl00_btnSubmit',
        'p1_Collapsed': 'false',
        'p1$CengFWH_RiQi': '',
        'p1$CengFWH_BeiZhu': '',
        'p1$JieChu_RiQi': '',
        'p1$JieChu_BeiZhu': '',
        'p1$TuJWH_RiQi': '',
        'p1$TuJWH_BeiZhu': '',
        'p1$JiaRen_BeiZhu': '',
        'p1$ZaiXiao': zxMatch,
        'p1$MingTDX': '不到校',
        'p1$MingTJC': '否',
        'p1$BanChe_1$Value': '0',
        'p1$BanChe_1': '不需要乘班车',
        'p1$BanChe_2$Value': '0',
        'p1$BanChe_2': '不需要乘班车',
        'p1$GuoNei': '国内',
        'p1$ddlGuoJia$Value': '-1',
        'p1$ddlGuoJia': '选择国家',
        'p1$ddlSheng$Value': shengMatch,
        'p1$ddlSheng': shengMatch,
        'p1$ddlShi$Value': shiMatch[1],
        'p1$ddlShi': shiMatch[1],
        'p1$ddlXian$Value': xianMatch[1],
        'p1$ddlXian': xianMatch[1],
        'p1$XiangXDZ': xxMatch,
        'p1$FanXRQ': '',
        'p1$WeiFHYY': '',
        'p1$ShangHJZD': '',
        'p1$QueZHZJC$Value': '否',
        'p1$QueZHZJC': '否',
        'p1$DangRGL': '否',  # whether quarantined today
        'p1$DaoXQLYGJ': '',  # countries visited
        'p1$DaoXQLYCS': '',  # cities visited
        'p1$Address2': '中国',
        'p1$SuiSM': '绿色',  # Suishen health-code color
        'p1$LvMa14Days': '是',  # health code green for 14 consecutive days as of today
        'p1$GeLDZ': '',
        'p1_SuiSMSM_Collapsed': 'false',
        'p1_GeLSM_Collapsed': 'false',
    }
def _generate_translation(self):
    """ Generate project description. """
    desc = PyQuery(HTML_TEMPLATE)

    # 1. Basic Information
    ######################
    project = self.project_id
    desc('.project_name')[0].text = _("Project name")
    desc('.project_name')[1].text = project.name
    desc('.project_closest_city')[0].text = _("Closest city")
    self._show_field(
        desc('.project_closest_city')[1],
        desc('#project_closest_city'),
        project.closest_city)
    desc('.project_cdsp_number')[0].text = _("Number of children")
    self._show_field(
        desc('.project_cdsp_number')[1],
        desc('#project_cdsp_number'),
        project.nb_cdsp_kids)
    if project.electrical_power == 'Not Available':
        desc('.project_electricity').html(
            _("The project has no electricity."))
    else:
        desc('#project_electricity').remove()

    # 2. Community
    ##############
    desc('#community_label').html(_("Local community"))
    desc('.community_population')[0].text = _("Population")
    self._show_field(
        desc('.community_population')[1],
        desc('#community_population'),
        '{:,}'.format(project.community_population).replace(',', "'"))
    desc('.community_language')[0].text = _("Language")
    self._show_field(
        desc('.community_language')[1],
        desc('#community_language'),
        project.primary_language_id.name)
    if project.primary_adults_occupation_ids:
        desc('.community_job')[0].text = _("Typical job")
        self._show_field(
            desc('.community_job')[1],
            desc('#community_job'),
            project.primary_adults_occupation_ids[0].value)
    else:
        desc('#community_job').remove()
    if project.chf_income and 10 < project.chf_income < 500:
        desc('.community_income')[0].text = _("Family monthly income")
        desc('.community_income')[1].text = 'CHF {:10.0f}.-'.format(
            project.chf_income)
    else:
        desc('#community_income').remove()
    desc('.community_food')[0].text = _("Typical food")
    if project.primary_diet_ids:
        desc('.community_food')[1].text = project.primary_diet_ids[0].value
    else:
        desc('#community_food').remove()
    desc('.community_school_begins')[0].text = _("School begins in")
    self._show_field(
        desc('.community_school_begins')[1],
        desc('#community_school_begins'),
        project.translate('school_year_begins'))

    # 3. Activities
    ###############
    spiritual = project.get_activities('spiritual_activity', 3)
    physical = project.get_activities('physical_activity', 3)
    cognitive = project.get_activities('cognitive_activity', 3)
    socio = project.get_activities('socio_activity', 3)
    if spiritual or physical or cognitive or socio:
        desc('#activities_label').html(
            _("Project activities for children"))
    else:
        desc('#activities').remove()
    if spiritual:
        desc('.spiritual_activities').html(_("Spiritual activities"))
        desc('#spiritual_activities_list').html(''.join(
            ['<li>' + activity + '</li>' for activity in spiritual]))
    else:
        desc('#spiritual_activities').remove()
    if physical:
        desc('.physical_activities').html(_("Physical activities"))
        desc('#physical_activities_list').html(''.join(
            ['<li>' + activity + '</li>' for activity in physical]))
    else:
        desc('#physical_activities').remove()
    if cognitive:
        desc('.cognitive_activities').html(_("Cognitive activities"))
        desc('#cognitive_activities_list').html(''.join(
            ['<li>' + activity + '</li>' for activity in cognitive]))
    else:
        desc('#cognitive_activities').remove()
    if socio:
        desc('.socio_activities').html(_("Socio-emotional activities"))
        desc('#socio_activities_list').html(''.join(
            ['<li>' + activity + '</li>' for activity in socio]))
    else:
        desc('#socio_activities').remove()
    if project.activities_for_parents:
        desc('.parent_activities').html(
            _("In addition, the project offers special activities for the "
              "parents such as education courses."))
    else:
        desc('#parent_activities').remove()
    return desc.html()
def parse_detail(url, filename):
    if 'genshuixue' not in url:
        # url = 'http://jingyan.baidu.com' + url
        pass
    # html = urllib2.urlopen(url).read()
    content = get_content(url, filename)
    if not content:
        return
    jq = PyQuery(content)
    res_json = {
        'bread': jq('.bread-wrap').text().replace('>', '').split()[-2:],
        'title': jq('h1').text().replace('听语音', ''),
        'date': jq('time').text()[:10],
        'source': 'baidu',
        'url': url.replace('\n', ''),
        'class': 36,
        'subject': u'经验',
        'data_weight': 0,
    }
    methods = []
    content = [each for each in jq('.exp-content-block')]
    print len(content)
    if not content:
        return None
    elif len(content) == 1:
        _list = []
        for steps in PyQuery(content[0])('ol li'):
            step = PyQuery(steps)
            step_title = step.text()
            image = step.html()
            img = image.split('data-src="')[-1].split('"')[0] if image and '<img' in image else ''
            _list.append({
                'img': img,
                'title': step_title,
                'substeps': [],
            })
        methods.append(_list)
        abstract = {}
    else:
        try:
            question_desc_img = PyQuery(content[0])('.content-listblock-image').html().split('data-src="')[-1].split('"')[0]
        except:
            question_desc_img = ''
        abstract = {
            'title': '',
            'steps': [PyQuery(content[0])('p').text()],
            'img': question_desc_img,
        }
        for each in content[1:]:
            method = PyQuery(each)
            title = method('h2').text()
            _list = []
            steps_list = [step for step in method('ol li')]
            if not steps_list:
                steps_list = [step for step in method('ul li')]
            for steps in steps_list:
                step = PyQuery(steps)
                step_title = step.text()
                image = step('.content-list-image a').html()
                img = image.split('data-src="')[-1].split('"')[0] if image else ''
                _list.append({
                    'img': img,
                    'title': step_title,
                    'substeps': [],
                })
            if not _list:
                _list.append((method('.content-listblock-text').text()))
            methods.append({'title': title, 'steps': _list})
    # Tools/materials section
    if methods[0]['title'] == u'工具/原料':
        prepare = {'title': methods[0]['title'],
                   'steps': [v['title'] for v in methods[0]['steps']]}
        methods = methods[1:]
        res_json['prepare'] = prepare
    # Precautions section
    if len(methods) == 0:
        return None
    if methods[-1]['title'] == u'注意事项':
        summary = {'title': methods[-1]['title'],
                   'steps': [v['title'] for v in methods[-1]['steps']]}
        methods = methods[:-1]
        res_json['summary'] = summary
    res_json['methods'] = methods
    res_json['abstract'] = abstract
    return json.dumps(res_json)
def get_users():
    global month
    logging.info('Retrieving members')
    if config.debug:
        progress = progressbar.NoProgressBar()
    else:
        progress = progressbar.ProgressBar(
            widgets=[progressbar.SimpleProgress('/'), ' ',
                     progressbar.Bar("#", "[", "]"),
                     progressbar.Percentage()],
            maxval=save.nbusers)
    progress.start()
    n = len(save.users)
    progress.update(n)
    ids = [i["id"] for i in save.users]
    d = PyQuery(url=config.rooturl + '/admin/index.forum?part=users_groups&sub=users&extended_admin=1&' + tid,
                opener=fa_opener)
    if "notgetmember_pic.forum?u=" in d.html():
        raise RuntimeError('Forum user page in "import protected" mode - cannot process users...')
    result = re.search(
        'function do_pagination_start\(\)[^\}]*start = \(start > \d+\) \? (\d+) : start;[^\}]*start = \(start - 1\) \* (\d+);[^\}]*\}',
        d.text())
    try:
        pages = int(result.group(1))
        usersperpages = int(result.group(2))
    except:
        pages = 1
        usersperpages = 0
    currentpage = int(n / usersperpages)
    memberslastpage = save.nbusers % usersperpages
    logging.debug('Users: %d pages of %d members - starting from page %d',
                  pages, usersperpages, currentpage)
    for page in range(currentpage, pages):
        pageNumber = page * usersperpages
        if page == pages - 1:
            usersperpages = memberslastpage  # number of members on the last page
        if page >= 1:
            time.sleep(61)
        d = PyQuery(url=config.rooturl + '/admin/index.forum?part=users_groups&sub=users&extended_admin=1&start=' + str(pageNumber) + '&' + tid,
                    opener=fa_opener)
        logging.debug('Retrieving members via url: %s',
                      config.rooturl + '/admin/index.forum?part=users_groups&sub=users&extended_admin=1&start=' + str(pageNumber) + '&' + tid)
        if ("notgetmember_pic.forum?u=" in d.html()
                or "Liste des Utilisateurs" not in d.text()):
            raise RuntimeError('Forum user page in "import protected" mode - cannot process users...')
        alluserinthepage = 0
        for i in d('tbody tr'):
            if alluserinthepage == usersperpages:
                break
            e = PyQuery(i)
            addr = e("td a").eq(0).attr("href")
            if addr != "None":
                alluserinthepage += 1
                id = int(re.search("&u=(\d+)&", e("td a").eq(0).attr("href")).group(1))
                logging.debug('Retrieving: member %d', id)
                date = e("td").eq(3).text().split(" ")
                date = time.mktime(time.struct_time(
                    (int(date[2]), month[date[1]], int(date[0]), 0, 0, 0, 0, 0, 0)))
                lastvisit = e("td").eq(4).text()
                if lastvisit != "":
                    lastvisit = lastvisit.split(" ")
                    lastvisit = time.mktime(time.struct_time(
                        (int(lastvisit[2]), month[lastvisit[1]], int(lastvisit[0]), 0, 0, 0, 0, 0, 0)))
                else:
                    lastvisit = 0
                if id not in ids:
                    name = e("td a").eq(0).text()
                    save.users.append({'id': id, 'newid': n,
                                       'name': e("td a").eq(0).text(),
                                       'mail': e("td a").eq(1).text(),
                                       'posts': int(e("td").eq(2).text()),
                                       'date': int(date),
                                       'lastvisit': int(lastvisit)})
                    n += 1
                    progress.update(n)
                else:
                    logging.warning('User %d has already been retrieved.', id)
    progress.end()
def content(self):
    d = Pq(self.dom('.article-content').html())
    d('.main-tg-area').remove()
    d('.articleRecommend').remove()
    return self.clearInput(d.html())
from pyquery import PyQuery

assert len(sys.argv) == 2, "Second argument is the notebook name!"
NOTEBOOK = sys.argv[1]
parts = NOTEBOOK.split('.')
parts[-1] = "html"
HTML_FILE = ".".join(parts)

# Gather the information from the first cell.
with open(NOTEBOOK) as f:
    res = json.load(f)
blocks = json.loads("".join(res['cells'][0]['source']))

# Convert the notebook.
call(['ipython', 'nbconvert', NOTEBOOK, '--to', 'html', '--template', 'basic'])

# Remove input cells.
with open(HTML_FILE) as f:
    doc = PyQuery(f.read(), parser='html')
doc.remove('.input')
blocks['body'] = doc.html()

# Insert into simple template.
BASE_DIR = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(BASE_DIR, 'my_template.html')) as f:
    tmpl = f.read()
template = Template(tmpl)
with open(HTML_FILE, 'w') as f:
    f.write(template.render(**blocks))
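# A sketch of what the script above assumes about the notebook: the first
# cell's source is a JSON object of template variables (the key names here
# are hypothetical), and the converted HTML body is merged in before rendering.
first_cell = '{"title": "My post", "author": "Jane"}'
blocks = json.loads(first_cell)
blocks['body'] = '<p>converted notebook HTML</p>'
# Template(tmpl).render(**blocks) then fills my_template.html with these keys.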
def _generate_translation(self):
    """ Generate project description. """
    desc = PyQuery(HTML_TEMPLATE)

    # 1. Basic Information
    ######################
    project = self.project_id
    # Put country if not the same as Field Office
    if (project.country_id
            and project.country_id != project.field_office_id.country_id):
        desc(".project_country")[0].text = _(
            "The project is located in %s, close to the border."
        ) % project.country_id.name
    else:
        desc("#project_country").remove()
    desc(".project_name")[0].text = _("Project name")
    desc(".project_name")[1].text = project.name
    desc(".project_closest_city")[0].text = _("Closest city")
    self._show_field(
        desc(".project_closest_city")[1],
        desc("#project_closest_city"),
        project.closest_city,
    )
    desc(".project_cdsp_number")[0].text = _("Number of children")
    self._show_field(
        desc(".project_cdsp_number")[1],
        desc("#project_cdsp_number"),
        project.nb_cdsp_kids,
    )
    if project.electrical_power == "Not Available":
        desc(".project_electricity").html(
            _("The project has no electricity."))
    else:
        desc("#project_electricity").remove()

    # 2. Community
    ##############
    desc("#community_label").html(_("Local community"))
    desc(".community_population")[0].text = _("Population")
    self._show_field(
        desc(".community_population")[1],
        desc("#community_population"),
        "{:,}".format(project.community_population).replace(",", "'"),
    )
    desc(".community_language")[0].text = _("Language")
    self._show_field(
        desc(".community_language")[1],
        desc("#community_language"),
        project.primary_language_id.name,
    )
    if project.primary_adults_occupation_ids:
        desc(".community_job")[0].text = _("Typical job")
        self._show_field(
            desc(".community_job")[1],
            desc("#community_job"),
            project.primary_adults_occupation_ids[0].value,
        )
    else:
        desc("#community_job").remove()
    desc(".community_food")[0].text = _("Typical food")
    if project.primary_diet_ids:
        desc(".community_food")[1].text = project.primary_diet_ids[0].value
    else:
        desc("#community_food").remove()
    desc(".community_school_begins")[0].text = _("School begins in")
    self._show_field(
        desc(".community_school_begins")[1],
        desc("#community_school_begins"),
        project.translate("school_year_begins"),
    )

    # 3. Activities
    ###############
    spiritual = project.get_activities("spiritual_activity", 3)
    physical = project.get_activities("physical_activity", 3)
    cognitive = project.get_activities("cognitive_activity", 3)
    socio = project.get_activities("socio_activity", 3)
    if spiritual or physical or cognitive or socio:
        desc("#activities_label").html(
            _("Project activities for children"))
    else:
        desc("#activities").remove()
    if spiritual:
        desc(".spiritual_activities").html(_("Spiritual activities"))
        desc("#spiritual_activities_list").html("".join(
            ["<li>" + activity + "</li>" for activity in spiritual]))
    else:
        desc("#spiritual_activities").remove()
    if physical:
        desc(".physical_activities").html(_("Physical activities"))
        desc("#physical_activities_list").html("".join(
            ["<li>" + activity + "</li>" for activity in physical]))
    else:
        desc("#physical_activities").remove()
    if cognitive:
        desc(".cognitive_activities").html(_("Cognitive activities"))
        desc("#cognitive_activities_list").html("".join(
            ["<li>" + activity + "</li>" for activity in cognitive]))
    else:
        desc("#cognitive_activities").remove()
    if socio:
        desc(".socio_activities").html(_("Socio-emotional activities"))
        desc("#socio_activities_list").html("".join(
            ["<li>" + activity + "</li>" for activity in socio]))
    else:
        desc("#socio_activities").remove()
    if project.activities_for_parents:
        desc(".parent_activities").html(
            _("In addition, the project offers special activities for the "
              "parents such as education courses."))
    else:
        desc("#parent_activities").remove()
    return desc.html()
def scrape_product(url, category_slug):
    f = urlopen(url)
    doc = html5lib.parse(f, treebuilder='lxml', namespaceHTMLElements=False)
    # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])
    #content = jQuery ('td#content table').eq(0)
    content = jQuery('td#content')
    content('form').remove()
    # used to do this, but some models (eg blades) don't have tables:
    #content = jQuery ('td#content table td').eq (0)
    #if content.is_('table'):
    #    content = content ('table td').eq (0)
    # nope, this was too simplistic - let's take apart the tables - see below in final save
    # nope, this doesn't work either.  I give up.
    skus = find_sku.findall(url)
    sku = skus[0]
    slug = slugify(sku)
    print sku
    '''
    if sku in ['ESERVE', 'NAS6X', 'NAS16X', 'PREMIUM', 'TWINSERVE', 'PREMIUM2', 'SANDYCORE', 'i7CORE', 'i7SHORT',]:
        print 'Skipping..'
        return
    #elif testing and sku != 'NAS12':
    #    print 'Skipping due to testing..'
    #    return
    '''
    # pyquery injects `this` into filter callbacks, jQuery-style
    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "Per single unit, this configuration's price")).remove()
    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "The base price with this configuration is")).remove()
    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "All eRacks systems come with a Standard")).remove()
    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "The price differences between the default")).remove()
    content('.small').filter(lambda notused: PyQuery(this).text().startswith(
        "Contact eRacks to inquire about leasing")).remove()
    content('form').remove()
    content('#pricetext').remove()
    content('#warrantynote').remove()
    content('#closenote').remove()

    xbig = content('.xbig')
    if xbig:
        xbig('a').remove()
        inner = xbig.html().replace(':', '').strip()
        xbig.replaceWith('<h5 class=xbig>%s</h5>' % inner)
        print 'xbig replaced:', inner

    font = content('font[size=4], font[size=5]')
    if font:
        font('a').remove()
        inner = font.text().replace(':', '').strip()
        font.replaceWith('<h5 class="product">%s</h5>' % inner)
        print 'font replaced:', inner

    if testing:
        print
        print sku, 'content:'
        print content.html()

    links = content('a')
    images = content('img')

    for link in links:
        a = PyQuery(link)
        href = a.attr('href')
        if href:
            if '?' in href:
                href = href.split('?')[0]  # doesn't this get rid of all get parms?
                a.attr('href', href)
            linkskus = find_sku.findall(href)  # That this is looking for?!!
        else:
            print "Empty Link:", a.html()
            linkskus = []
            print content.html()
        if linkskus:
            linksku = linkskus[0]
            a.attr('href', '/products/%s/%s/' % (category_slug, linksku))
            print 'New link:', a.attr('href')
        # guard: href may be None for anchors without an href attribute
        elif href and href.startswith('/Legacy'):
            linksku = slugify(href.split('/')[-1])
            a.attr('href', '/products/%s/%s/' % (category_slug, linksku))
            print 'New link:', a.attr('href')
        elif 'ore photos' in a.text():
            print 'Scraping:', href
            scrape_photos(url, href, slug)
            #print 'Removing link (scraped):', href
            #a.remove()
            print 'Updating "more photos" link:', href
            a.attr('href', '#photos')
            a.attr('onclick', '$("#photos-tab").click();')
        elif href and href.endswith('_photos'):
            print 'Scraping:', href
            scrape_photos(url, href, slug)
            print 'Updating "<prod>_photos" link:', href
            a.attr('href', '#photos')
            a.attr('onclick', '$("#photos-tab").click();')

    for image in images:
        img = PyQuery(image)
        src = img.attr('src')
        newsrc = getimage(src, 'products/' + slug)
        img.attr('src', newsrc)
        print 'image:', newsrc

    if dbteeth:
        #prod, created = Product.objects.get_or_create (sku=sku)  # prods are already in the db, silly!
        prod = Product.objects.get(sku=sku)
        prod.comments = prod.comments + '\n\nScraped from Zope as of ' + str(datetime.date.today())
        #prod.description = content.text() + '<br>'.join ([PyQuery(c).html() for c in content ('td')])  # content.html()
        prod.description = content.html()
        # save image(s):
        # prod.image =
        # prod.images.add (name, title, src, etc)
        prod.save()
        print '..saved.'
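# Hypothetical driver for scrape_product(): walks a category listing page and
# scrapes each product page it links to. The listing URL, the SKU regex (a
# stand-in for the module's own find_sku), and the selector are placeholders
# for illustration, not values from the original project.
import re
from pyquery import PyQuery

find_sku_demo = re.compile(r'/([A-Z0-9]+)\.html$')  # assumed SKU pattern

def scrape_category(category_url, category_slug):
    listing = PyQuery(url=category_url)
    for link in listing('td#content a'):
        href = PyQuery(link).attr('href') or ''
        if find_sku_demo.findall(href):
            scrape_product(href, category_slug)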
def test_pop_html():
    node = PyQuery('<test><h1></h1></test>')
    assert vdom.pop_html(node) == '<h1/>'
    assert not node.html()
def handle(self, *args, **options):
    xml = ElementTree.parse(open(args[0], 'r'))
    channel = xml.find('channel')

    def node_text(node, namespace=None, parent=None):
        if namespace:
            item = (parent or channel).find(ns(namespace, node))
        else:
            item = (parent or channel).find(node)
        if item is not None:
            return item.text
        return None

    def ns(n, o):
        return '{%s}%s' % (XML_NS[n], o)

    if channel is None:
        raise CommandError('Cannot find <channel> tag')

    title = node_text('title')
    if title:
        print(u'Blog title: %s' % title)
    link = node_text('link')
    if link:
        print(u'Blog URL: %s' % link)
    description = node_text('description')
    if description:
        print(u'Blog description: %s' % description)

    mappings = {'users': {}, 'posts': {}, 'categories': {}, 'comments': {}}
    content_type = ContentType.objects.get_for_model(Post)
    site = Site.objects.get_current()
    postmeta = {}
    print

    with transaction.commit_manually():
        try:
            for author in channel.findall(ns('wp', 'wp_author')):
                username = node_text('author_login', 'wp', author)
                email = node_text('author_email', 'wp', author)
                display_name = node_text('author_display_name', 'wp', author)
                user = None
                if not username:
                    continue
                if display_name:
                    display_name = '%s (%s)' % (username, display_name)
                else:
                    display_name = username
                try:
                    user = User.objects.get(username__iexact=username)
                except User.DoesNotExist:
                    if email:
                        try:
                            user = User.objects.get(email__iexact=email)
                        except:
                            pass
                if not user:
                    new_username = raw_input('Map old user %s to a user in your database: ' % display_name)
                    if not new_username:
                        continue
                    while True:
                        try:
                            user = User.objects.get(username__iexact=new_username)
                            break
                        except User.DoesNotExist:
                            new_username = raw_input('User not found. Please try again, or press Enter to ignore: ')
                            if not new_username:
                                print 'Ignoring user %s' % username
                                break
                if user:
                    mappings['users'][username] = user
                    print 'Mapping user %s to %s' % (username, user.get_full_name() or user.username)

            for item in channel.findall('item'):
                id = node_text('post_id', 'wp', item)
                title = node_text('title', parent=item)
                url = node_text('link', parent=item)
                kind = node_text('post_type', 'wp', item)
                parent = node_text('post_parent', 'wp', item)
                published = node_text('status', 'wp', item) == 'publish'
                author = node_text('creator', 'dc', item)
                date = node_text('post_date_gmt', 'wp', item)
                body = node_text('encoded', 'content', item) or u''
                try:
                    id = int(id)
                except ValueError:
                    continue
                if not date:
                    continue
                try:
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S').replace(
                        tzinfo=get_current_timezone())
                except:
                    continue
                try:
                    parent = int(parent)
                except ValueError:
                    continue
                if parent:
                    continue
                if not author:
                    continue
                if not mappings['users'].has_key(author):
                    continue
                author = mappings['users'][author]
                if not kind in ('post', 'page'):
                    continue
                if kind == 'post':
                    try:
                        post = Post.objects.get(title=title, date=date)
                        print 'Updating %s "%s"' % (kind, title)
                    except Post.DoesNotExist:
                        post = Post(
                            title=title,
                            slug=title and slugify(title) or None,
                            date=date,
                            published=published,
                            broadcast=True,
                            author=author
                        )
                        print 'Creating %s "%s"' % (kind, title)
                else:
                    continue
                post.body = body
                post.save()
                mappings['posts'][id] = post

                for category in item.findall('category'):
                    domain = category.get('domain')
                    slug = category.get('nicename')
                    if not category.text:
                        continue
                    if domain == 'category':
                        if not mappings['categories'].has_key(slug):
                            mappings['categories'][slug], created = Category.objects.get_or_create(
                                name=category.text,
                                slug=slugify(category.text)
                            )
                            if created:
                                print '- Created category "%s"' % category.text
                        post.categories.add(mappings['categories'][slug])
                    elif domain == 'post_tag':
                        if category.text.startswith('"') and category.text.endswith('"'):
                            post.tags.add(category.text[1:-1])
                        else:
                            post.tags.add(category.text)

                for comment in item.findall(ns('wp', 'comment')):
                    comment_id = node_text('comment_id', 'wp', comment)
                    comment_name = node_text('comment_author', 'wp', comment)
                    comment_email = node_text('comment_author_email', 'wp', comment)
                    comment_url = node_text('comment_author_url', 'wp', comment)
                    comment_date = node_text('comment_date_gmt', 'wp', comment)
                    comment_type = node_text('comment_type', 'wp', comment)
                    comment_body = node_text('comment_content', 'wp', comment)
                    comment_parent = node_text('comment_parent', 'wp', comment)
                    comment_approved = node_text('comment_approved', 'wp', comment) == '1'
                    try:
                        comment_id = int(comment_id)
                    except ValueError:
                        continue
                    try:
                        comment_parent = int(comment_parent)
                    except ValueError:
                        comment_parent = 0
                    try:
                        comment_date = datetime.strptime(comment_date, '%Y-%m-%d %H:%M:%S').replace(
                            tzinfo=get_current_timezone())
                    except:
                        continue
                    if not comment_name:
                        continue
                    if not comment_type or comment_type == 'comment':
                        try:
                            comment = post.comments.get(name=comment_name, sent=comment_date)
                        except Comment.DoesNotExist:
                            comment = Comment(
                                name=comment_name,
                                website=comment_url,
                                email=comment_email or '',
                                sent=comment_date,
                                approved=comment_approved,
                                body=comment_body,
                                content_type=content_type,
                                object_id=post.pk
                            )
                            print '- Comment by %s' % comment_name
                            comment.save(notify=False)
                        mappings['comments'][comment_id] = comment

                postmeta[id] = {}
                for meta in item.findall(ns('wp', 'postmeta')):
                    meta_key = node_text('meta_key', 'wp', meta)
                    meta_value = node_text('meta_value', 'wp', meta)
                    postmeta[id][meta_key] = meta_value

                ai = 1
                for subitem in channel.findall('item'):
                    subid = node_text('post_id', 'wp', subitem)
                    subparent_id = node_text('post_parent', 'wp', subitem)
                    subtitle = node_text('title', parent=subitem)
                    suburl = node_text('link', parent=subitem)
                    subkind = node_text('post_type', 'wp', subitem)
                    suburl = node_text('attachment_url', 'wp', subitem)
                    try:
                        subparent_id = int(subparent_id)
                    except ValueError:
                        continue
                    if not suburl:
                        continue
                    if subkind != 'attachment' or subparent_id != id:
                        continue
                    s, d, p, a, q, f = urlparse(suburl)
                    d, s, filename = p.rpartition('/')
                    try:
                        attachment = post.attachments.get(title=subtitle or filename)
                    except Attachment.DoesNotExist:
                        print '- Downloading %s' % filename
                        response = requests.get(suburl)
                        handle, tmp = mkstemp(path.splitext(filename)[-1])
                        write(handle, response.content)
                        close(handle)
                        attachment = Attachment(
                            title=subtitle or filename,
                            file=File(open(tmp, 'r'), name=filename),
                            content_type=content_type,
                            object_id=post.pk
                        )
                        if '_thumbnail_id' in postmeta[id]:
                            if unicode(postmeta[id]['_thumbnail_id']) == unicode(subid):
                                attachment.featured = True
                        attachment.save()
                        remove(tmp)
                    if post.body:
                        html = PyQuery('<body>' + post.body + '</body>')
                        for a in html('a[href="%(url)s"], [src="%(url)s"]' % {'url': suburl}):
                            a = PyQuery(a)
                            a.replaceWith('\n\n[attachment %d]\n\n' % ai)
                        post.body = html.html()
                    ai += 1

                if post.body:
                    html = PyQuery('<body>' + post.body + '</body>')
                    for a in html('a[href]'):
                        href = a.get('href')
                        if href.startswith(link):
                            href = href.replace(link, 'http://%s' % site.domain)
                        a = PyQuery(a)
                    for p in html('p'):
                        p = PyQuery(p)
                        p.replaceWith('\n\n%s\n\n' % p.html())
                    html('.alignright').addClass('pull-right').removeClass('alignright')
                    html('.alignleft').addClass('pull-left').removeClass('alignleft')
                    while '\n\n\n' in post.body:
                        post.body = post.body.replace('\n\n\n', '\n\n')
                    while '\r\r\r' in post.body:
                        post.body = post.body.replace('\r\r\r', '\r\r')
                    post.body = html.html()
                    post.body = post.body.replace('<br />', ' \n')
                    post.body = post.body.replace('<br/>', ' \n')
                    post.body = post.body.replace('<br>', ' \n')
                    while post.body.startswith('\n'):
                        post.body = post.body[1:]
                    while post.body.endswith('\n'):
                        post.body = post.body[:-1]
                    while post.body.startswith('\r'):
                        post.body = post.body[1:]
                    while post.body.endswith('\r'):
                        post.body = post.body[:-1]
                    while post.body.startswith('\t'):
                        post.body = post.body[1:]
                    post.body = post.body.strip()
                    post.save()

            transaction.commit()
        except:
            transaction.rollback()
            raise
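# Self-contained illustration of the ns()/node_text() lookup pattern the
# importer relies on: ElementTree addresses namespaced tags in Clark
# notation, '{namespace-uri}tag'. The XML_NS mapping here is a minimal
# stand-in for the command module's own table.
from xml.etree import ElementTree

XML_NS_DEMO = {'wp': 'http://wordpress.org/export/1.2/'}

def ns_demo(n, o):
    return '{%s}%s' % (XML_NS_DEMO[n], o)

doc = ElementTree.fromstring(
    '<channel xmlns:wp="http://wordpress.org/export/1.2/">'
    '<wp:post_id>42</wp:post_id></channel>')
assert doc.find(ns_demo('wp', 'post_id')).text == '42'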
def get_one_page_audio(account_id, page_count):
    # http://www.ximalaya.com/1014267/index_tracks?page=2
    audit_pagination_url = "http://www.ximalaya.com/%s/index_tracks" % account_id
    query_data = {"page": page_count}
    audit_pagination_response = net.http_request(audit_pagination_url, method="GET",
                                                 fields=query_data, json_decode=True)
    result = {
        "audio_info_list": [],  # audio entries parsed from the page
        "is_over": False,  # whether this is the last page
    }
    if audit_pagination_response.status == 404:
        raise crawler.CrawlerException("account does not exist")
    elif audit_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(audit_pagination_response.status))
    if not crawler.check_sub_key(("res", "html"), audit_pagination_response.json_data):
        raise crawler.CrawlerException("response is missing the 'res' or 'html' field\n%s"
                                       % audit_pagination_response.json_data)
    if audit_pagination_response.json_data["res"] is not True:
        raise crawler.CrawlerException("unexpected value for the 'res' field\n%s"
                                       % audit_pagination_response.json_data)
    # parse the audio entries
    audio_list_selector = PQ(audit_pagination_response.json_data["html"]).find("ul.body_list li.item")
    for audio_index in range(0, audio_list_selector.size()):
        audio_info = {
            "audio_id": None,  # audio id parsed from the page
            "audio_title": "",  # audio title parsed from the page
        }
        audio_selector = audio_list_selector.eq(audio_index)
        # audio id
        audio_id = audio_selector.find(".content_wrap").attr("sound_id")
        if not crawler.is_integer(audio_id):
            raise crawler.CrawlerException("failed to parse the audio id\n%s"
                                           % audio_list_selector.html().encode("UTF-8"))
        audio_info["audio_id"] = str(audio_id)
        # audio title
        audio_title = audio_selector.find(".sound_title").attr("title")
        if not audio_title:
            raise crawler.CrawlerException("failed to parse the audio title\n%s"
                                           % audio_list_selector.html().encode("UTF-8"))
        audio_info["audio_title"] = str(audio_title.encode("UTF-8").strip())
        result["audio_info_list"].append(audio_info)
    # determine whether this is the last page
    max_page_count = 1
    pagination_list_selector = PQ(audit_pagination_response.json_data["html"]).find(
        ".pagingBar_wrapper a.pagingBar_page")
    for pagination_index in range(0, pagination_list_selector.size()):
        pagination_selector = pagination_list_selector.eq(pagination_index)
        data_page = pagination_selector.attr("data-page")
        if data_page is None:
            continue
        if not crawler.is_integer(data_page):
            raise crawler.CrawlerException("failed to parse the pagination info\n%s"
                                           % audio_list_selector.html().encode("UTF-8"))
        max_page_count = max(max_page_count, int(data_page))
    result["is_over"] = page_count >= max_page_count
    return result
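# A sketch of how get_one_page_audio() would typically be driven: fetch
# page after page until is_over flips. It assumes the same net/crawler
# helper modules the function above uses; the wrapper name is illustrative.
def get_all_audio(account_id):
    audio_list = []
    page_count = 1
    while True:
        page = get_one_page_audio(account_id, page_count)
        audio_list.extend(page["audio_info_list"])
        if page["is_over"]:
            return audio_list
        page_count += 1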
def search(self, word):
    response = requests.get(self.URL.format(word=word), headers=headers)
    text = response.text
    # the HTML occasionally contains "𥝱", which makes parsing blow up
    text = text.replace('𥝱', '')
    doc = PyQuery(text)
    results = []
    normal_dict = doc("div.NetDicHead")
    if normal_dict:
        for head in normal_dict:
            result = {'word': word, 'type': 'normal'}
            head = PyQuery(head)
            # if there are 【】 brackets, they hold either kanji or a loanword
            match_kakko = re.compile(r"【(.*)】").search(head.text())
            if match_kakko:
                kakko = match_kakko.group(1)
                match_gairaigo = re.compile(r"[a-zA-Z]").search(kakko)
                if match_gairaigo:
                    result['gogen'] = kakko
                    result['kana'] = word
                else:
                    result['kanji'] = kakko
                    result['kana'] = head('b').text().replace(' ', '').replace('・', '')
            for accent in head('span'):
                accent = PyQuery(accent)
                # the brackets must be escaped so the accent number, e.g. "[1]",
                # is captured as group 1
                match_accent = re.compile(r"\[([0-9]*)\]").search(accent.text())
                if match_accent:
                    result['accent'] = result.get('accent', '') + match_accent.group(1) + ','
            if 'accent' in result:
                result['accent'] = result['accent'][:-1]
            body = head.next()
            for a in body('a'):
                a = PyQuery(a)
                a.replaceWith(a.html())
            result['meaning'] = body.html()
            # the word itself may be written in kana only
            if 'kana' not in result:
                result['kana'] = word
            results.append(result)
    Jitsu_dict = doc("div.Jtnhj")
    if Jitsu_dict:
        result = {'word': word, 'type': 'Jitsu'}
        match = re.compile(
            r"読み方:<!--\/AVOID_CROSSLINK-->(.*)<br/?><!--AVOID_CROSSLINK-->別表記"
        ).search(Jitsu_dict.html())
        if match:
            result['kana'] = match.group(1)
            if result['kana'].find('<a') != -1:
                result['kana'] = PyQuery(result['kana']).text()
        else:
            match = re.compile(r"読み方:<!--\/AVOID_CROSSLINK-->(.*)<br/?>").search(Jitsu_dict.html())
            if match:
                result['kana'] = match.group(1)
                if result['kana'].find('<a') != -1:
                    result['kana'] = PyQuery(result['kana']).text()
        if Jitsu_dict('.AM'):
            meaning = PyQuery('<div>')
            meaning.html(Jitsu_dict('.AM').nextAll())
        else:
            meaning = Jitsu_dict
        for a in meaning('a'):
            a = PyQuery(a)
            a.replaceWith(a.html())
        result['meaning'] = meaning.text()
        results.append(result)
    IT_dict = doc('div.Binit')
    if IT_dict:
        result = {'word': word, 'type': 'IT'}
        a = IT_dict('a').eq(0)
        if a.text().find('読み方') != -1:
            kana_tag = a.next('a').eq(0)
            result['kana'] = kana_tag.text().replace(' ', '')
        else:
            result['kana'] = word
        if IT_dict.text().find('【') != -1:
            result['gogen'] = a.eq(0).text()
        for p in IT_dict('p'):
            p = PyQuery(p)
            for a in p('a'):
                a = PyQuery(a)
                a.replaceWith(a.html())
            if not p.html():
                continue
            result['meaning'] = result.get('meaning', '') + "<p>" + p.html() + "</p>"
        result['kanji'] = IT_dict.prev("h2.midashigo").text()
        results.append(result)
    WIKI = doc('div.Wkpja')
    if WIKI:
        result = {'word': word, 'type': 'WIKI'}
        p = WIKI('p').not_(".WkpjaTs")
        for a in p('a'):
            a = PyQuery(a)
            a.replaceWith(a.html())
        result['meaning'] = p.html()
        result['kanji'] = WIKI.prev("h2.midashigo").text()
        results.append(result)
    if results:
        return {"status": 'success', "results": results}
    else:
        return {"status": 'error', "error_detail": "Nothing found."}
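# Quick standalone check of the accent-bracket pattern fixed above: the
# original r"[([0-9]*)]" was a character class with no capture group, so
# .group(1) raised. With the brackets escaped, the digits are group 1.
# The sample text is illustrative.
import re

match = re.search(r"\[([0-9]*)\]", "アクセント [0]")
assert match and match.group(1) == "0"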
class Browser(object):
    def __init__(self, debug=False, opener_handlers=None):
        self.debug = debug
        self.tree = None
        self.current_response = None
        self.current_html = None
        self._pyquery = None
        self.form_manager = FormManager(self)
        self.cookie_jar = cookielib.CookieJar()
        # avoid a mutable default argument
        self.opener_handlers = opener_handlers or []
        if self.debug:
            LOG.setLevel(logging.DEBUG)
        else:
            LOG.setLevel(logging.WARNING)

    def _set_response(self, response):
        self.current_response = response
        self.current_html = response.read()
        self.tree = html.fromstring(self.current_html)
        self._pyquery = PyQuery(self.tree)

    def _get_opener(self):
        cookie_processor = urllib2.HTTPCookieProcessor(self.cookie_jar)
        handlers = [cookie_processor] + self.opener_handlers
        return urllib2.build_opener(*handlers)

    def _open(self, request):
        url = self.get_absolute_url(request.get_full_url())
        abs_request = urllib2.Request(url, request.data, request.headers,
                                      request.origin_req_host, request.unverifiable)
        opener = self._get_opener()
        self._maybe_log_request(request)
        response = opener.open(abs_request)
        self._set_response(response)

    def _maybe_log_request(self, request):
        if not LOG.isEnabledFor(logging.DEBUG):
            return
        message = "HTTP request: (%s) %s" % (request.get_method(), request.get_full_url())
        if request.get_method() == "POST":
            message += "\n POSTDATA: %s" % request.get_data()
        LOG.debug(message)

    def visit(self, url):
        self._open(urllib2.Request(url))

    def fill(self, selector_value_dict):
        return self.form_manager.fill(selector_value_dict)

    def submit(self, form_selector=None):
        request = self.form_manager.get_submit_request(form_selector)
        self._open(request)

    def query(self, selector):
        return self._pyquery(selector)

    def html(self, selector=None):
        if selector is None:
            return self._pyquery.html()
        # with a selector, return the html of the matched fragment
        return self.query(selector).html()

    def get_absolute_url(self, relative_url):
        if self.url is None:
            # Browser not used yet
            return relative_url
        return urlparse.urljoin(self.url, relative_url)

    @property
    def url(self):
        if not self.current_response:
            return None
        return self.current_response.url
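# A short usage sketch for the Browser class above (Python 2, like the
# class itself). The URL and form selectors are placeholders; form filling
# goes through the FormManager wired up in __init__.
browser = Browser(debug=True)
browser.visit('http://example.com/login')
browser.fill({'input[name=user]': 'alice', 'input[name=password]': 's3cret'})
browser.submit('form#login')
print browser.query('h1').text()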
    if response.url != SERVER_URL:
        # update host for redirects, e.g. goo.gl links
        logger.debug('Updating server host to match what was retrieved')
        SERVER_HOST = urlparse(response.url).netloc
    raw_html = response.text
    jQuery = PyQuery(raw_html)
    job_list = None
    for job_css in JOB_CSS_SEARCH_LIST:
        job_list = jQuery(job_css).text()
        if job_list:
            break
    if not job_list:
        logger.info('Running job checker against: ' + SERVER_HOST + ' and could not parse page')
        email_message['Subject'] = "Job Checker could not parse page at: " + SERVER_HOST
        body = "Raw HTML FROM: " + SERVER_HOST + ":\n" + jQuery.html(method='html')
        email_message.attach(MIMEText(body, 'plain'))
        send_email(email, email_message)
    else:
        match = JOB_REGEX.search(job_list)
        email_message['Subject'] = "Jobs Available at: " + SERVER_HOST
        body = "Content Retrieved from " + SERVER_HOST + ":\n" + job_list
        email_message.attach(MIMEText(body, 'plain'))
        logger.debug('Found job_list: ' + job_list)
        if not match:
            logger.info('Jobs have appeared so sending email.')
            send_email(email, email_message)
        else:
            logger.info('Found no jobs so not sending email')
except Exception as error:
    logger.error('Found error in email process:', exc_info=True)
"name": text, "type": "Function", "path": href.replace(url, ""), "href": href }) jQuery(item).find("a").attr("href", href.replace(url, "")) # Step 1: create the docset folder docsetPath = os.path.join(currentPath, output, "Contents", "Resources", "Documents") if not os.path.exists(docsetPath): os.makedirs(docsetPath) # Step 2: Copy the HTML Documentation fin = codecs.open(os.path.join(docsetPath, "index.html"), "w", "utf-8") newContent = jQuery.html() fin.write(newContent) fin.close() # Step 2.1 创建每一个函数的值页面 for result in results: dest = os.path.join(docsetPath, result["href"].replace(url, "")) if not os.path.exists(dest): os.makedirs(dest) fin = open(os.path.join(dest, "index.html"), "w") fin.write(urllib2.urlopen(result["href"]).read()) fin.close() # Step 2.2 下载CSS和JS links = [ "http://www.css88.com/jqapi-1.9/cssstyle/main.min.css",
def index_page(self, response):
    title = response.save['river_name']
    url = response.url
    spider_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # crawl timestamp
    type_id = 0
    source = '百度百科'  # Baidu Baike
    if url.find('https://baike.baidu.com/error.html') == 0:
        result = None
    else:
        context = self.filter_page(
            response,
            'body > div.body-wrapper > div.content-wrapper > div > div.main-content')
        for i in ['em', 'span', 'a']:
            context = re.sub(r'<{}[^<>]*>'.format(i), r'', context)  # strip opening tags
            context = re.sub(r'</{}>'.format(i), r'', context)  # strip closing tags
        for i in ['em', 'div', 'span']:  # rewrite to <p> tags
            context = re.sub(r'<([/]*){}([/]*)>'.format(i), r'<\1p\2>', context)
        context = re.sub(r'<p></p>', r'', context)  # drop empty tags
        context = re.sub(r'<p/>', r'', context)  # drop empty tags
        for i in range(10):  # peel off layers of redundant nesting
            context = re.sub(r'<p>(<p>(?!<p>|</p>)[^<>]+<p/>)<p/>', r'\1', context)
        context = re.sub(r'<[/]*(?=span|em)[^ \"</>]+[/]*>', r'', context)  # drop span and em
        for count in range(10):
            for i in re.findall(r'<([^ \"<>]+)></\1>', context):  # drop empty tags
                context = context.replace('<{}></{}>'.format(i, i), '')
                context = context.replace('<{}/>'.format(i), '')
        context = re.sub(r'<a[^<>]*>', r'', context)  # drop links
        context = re.sub(r'</a>', r'', context)  # drop links
        context = re.sub(r'<div>', r'<p>', context)
        context = re.sub(r'</div>', r'</p>', context)
        context = re.sub(r'<p>([^<>]*)<img src=\"([^ \"<>]+)\"/></p>',
                         r'<p>\1</p><img src="\2" alt="" class="entry__img"/>', context)
        py = PyQuery(context)
        context = py.html()
        context = re.sub(r'(<img[^<>]+/>)<p>([^<>]+)</p>',
                         r'\1<figcaption>\2</figcaption>', context)
        context = re.sub(r'<p></p>', r'', context)
        py = PyQuery(context)
        context = py.html()
        context = re.sub(r'<p></p>', r'', context)
        context = re.sub(r'<p/>', r'', context)
        context = re.sub(r'(<p>[^<>]*)<img', r'\1</p><img', context)
        context = re.sub(r'(</p>(?!</p>).*)</p>', r'\1', context)
        context = re.sub(r'(<p>(?!</p>).*)<p>', r'\1', context)
        context = re.sub(r'<([/]*)h[1-9]>', r'<\1h3>', context)
        context = re.sub(r'<h3>', r'<h3 class="text-center">', context)
        context = re.sub('^<h3', '<div class=\"entry__article\"><h3', context)  # document starts with a heading
        context = re.sub('</h3>$', '</div></h3>', context)  # document ends with a heading
        context = re.sub('^<img', '<div class=\"entry__img-holder text-center\"><img', context)  # document starts with an image
        context = re.sub('\">$', '\"></div>', context)  # trailing image with no caption
        context = re.sub('</figcaption>$', '</figcaption></div>', context)  # trailing image with a caption
        context = re.sub('^<p>', '<div class=\"entry__article\"><p>', context)  # document starts with body text
        context = re.sub('</p>$', '</p></div>', context)  # document ends with body text
        context = re.sub('</p><img',
                         '</p></div><div class=\"entry__img-holder text-center\"><img',
                         context)  # image after body text
        context = re.sub('</figcaption><p>',
                         '</figcaption></div><div class=\"entry__article\"><p>',
                         context)  # captioned image before body text
        context = re.sub('\"><p>', '\"></div><div class=\"entry__article\"><p>',
                         context)  # uncaptioned image before body text
        context = re.sub('</h3><img',
                         '</h3></div><div class=\"entry__img-holder text-center\"><img',
                         context)  # image after a heading
        context = re.sub('</figcaption><h3',
                         '</figcaption></div><div class=\"entry__article\"><h3',
                         context)  # captioned image before a heading
        ##############
        context = self.context_css + self.context_html_tmp.format(context)
        context = re.sub(
            r'(<link rel=\"stylesheet\" type=\"text/css\" href=\"css/style.css\">'
            r'<link rel=\"stylesheet\" type=\"text/css\" href=\"css/bootstrap.css\">'
            r'<link rel=\"stylesheet\" type=\"text/css\" href=\"css/font-awesome.min.css\">'
            r'<article class=\"entry\"><div class=\"entry__article\"></div>'
            r'<div class=\"entry__article\">).*<p>[\d]+</p>',
            r'\1', context)
        # with open('/home/mininet/test.txt', 'w+') as f:
        #     f.write(context)
        # print context
        context = context.replace('data-src', 'src')
        result = [title, url, '', context, '', '', type_id, spider_time, source]
    return result
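# Tiny illustration of the caption-pairing substitution used above: a <p>
# immediately following an <img> becomes its <figcaption>. The sample HTML
# is made up for the demo.
import re

sample = '<img src="a.jpg" alt="" class="entry__img"/><p>Fig. 1</p>'
sample = re.sub(r'(<img[^<>]+/>)<p>([^<>]+)</p>', r'\1<figcaption>\2</figcaption>', sample)
assert sample == '<img src="a.jpg" alt="" class="entry__img"/><figcaption>Fig. 1</figcaption>'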
def _generate_translation(self):
    """ Generate child description. """
    desc = PyQuery(HTML_TEMPLATE)

    # 1. Program type only if Home Based + Birthday estimate
    ########################################################
    child = self.child_id
    if child.cdsp_type == 'Home Based':
        desc('.program_type').html(
            self.home_based_lang[self.env.lang][child.gender].format(
                preferred_name=child.preferred_name))
    else:
        desc('#program_type').remove()
    if child.estimated_birthdate:
        desc('.birthday_estimate').html(_("* The birthday is an estimation."))
    else:
        desc('#birthday_estimate').remove()

    # 2. Household
    ##############
    household = child.household_id.with_context(active_gender=child.gender)
    live_with = self._live_with()
    desc('#live_with').html(live_with)
    if not household.father_living_with_child:
        f_alive = desc('.father').children('.is_alive')
        f_alive[0].text = _('Father alive')
        f_alive[1].text = household.translate('father_alive')
    else:
        desc('.father').remove()
    self._job(desc('.father_job'), 'father')
    if not household.mother_living_with_child:
        m_alive = desc('.mother').children('.is_alive')
        m_alive[0].text = _('Mother alive')
        m_alive[1].text = household.translate('mother_alive')
    else:
        desc('.mother').remove()
    self._job(desc('.mother_job'), 'mother')
    if household.nb_brothers:
        desc('.brothers')[0].text = _("Number of brothers")
        desc('.brothers')[1].text = str(household.nb_brothers)
    else:
        desc('.brothers').remove()
    if household.nb_sisters:
        desc('.sisters')[0].text = _("Number of sisters")
        desc('.sisters')[1].text = str(household.nb_sisters)
    else:
        desc('.sisters').remove()

    # 3. Schooling
    ##############
    if child.us_grade_level and child.us_grade_level != 'Not Enrolled':
        # Make sure the education level is set
        child.convert_us_grade_to_education_level()
        desc('#school_attending').html(
            self.school_yes_lang[self.env.lang][child.gender].format(
                preferred_name=child.preferred_name,
                level=child.translate('education_level')))
        if child.academic_performance:
            desc('.school_performance')[0].text = _('School performance')
            desc('.school_performance')[1].text = child.translate('academic_performance')
        else:
            desc('#school_performance').remove()
        if child.major_course_study:
            desc('.school_subject')[0].text = _('Best school subject')
            desc('.school_subject')[1].text = child.translate('major_course_study')
        else:
            desc('#school_subject').remove()
        if child.vocational_training_type and \
                child.vocational_training_type.lower() not in ('not enrolled', 'other'):
            desc('.vocational_training')[0].text = _('Vocational training')
            desc('.vocational_training')[1].text = child.translate('vocational_training_type')
        else:
            desc('#vocational_training').remove()
    else:
        desc('#school_attending').html(
            self.school_no_lang[self.env.lang][child.gender].format(
                preferred_name=child.preferred_name))
        desc('.school').remove()

    # 4. House duties
    #################
    if child.duty_ids:
        desc('#house_duties_intro').html(self.duties_intro_lang[self.env.lang][child.gender])
        desc('#house_duties_list').html(''.join(
            ['<li>' + duty.value + '</li>' for duty in child.duty_ids[:3]]))
    else:
        desc('.house_duties').remove()

    # 5. Church activities
    ######################
    if child.christian_activity_ids:
        desc('#church_activities_intro').html(self.church_intro_lang[self.env.lang][child.gender])
        desc('#church_activities_list').html(''.join(
            ['<li>' + activity.value + '</li>'
             for activity in child.christian_activity_ids[:3]]))
    else:
        desc('.church_activities').remove()

    # 6. Hobbies
    ############
    if child.hobby_ids:
        desc('#hobbies_intro').html(
            self.hobbies_intro_lang[self.env.lang][child.gender].format(
                preferred_name=child.preferred_name))
        desc('#hobbies_list').html(''.join(
            ['<li>' + hobby.value + '</li>' for hobby in child.hobby_ids[:3]]))
    else:
        desc('.hobbies').remove()

    # 7. Health
    ###########
    if child.physical_disability_ids or child.chronic_illness_ids:
        desc('#handicap_intro').html(
            self.handicap_intro_lang[self.env.lang][child.gender].format(
                preferred_name=child.preferred_name))
        handicap_list = []
        if child.physical_disability_ids:
            handicap_list.extend(['<li>' + handicap.value + '</li>'
                                  for handicap in child.physical_disability_ids])
        if child.chronic_illness_ids:
            handicap_list.extend(['<li>' + illness.value + '</li>'
                                  for illness in child.chronic_illness_ids])
        desc('#handicap_list').html(''.join(handicap_list))
    else:
        desc('.handicap').remove()

    return desc.html()
def _render_span(self, p: Paragraph, pq: PyQuery, bold=False, italic=False,
                 strike=False, underline=False, font_size=None, sub=False, sup=False):
    """
    Convert a span.
    Change 19.5.3: if formula conversion fails, fall back to the image.
    :param pq:
    :return:
    """
    try:
        if pq.attr('data-latex'):
            # formula
            omml_str = converter.to_omml(self.mini_trim(pq.attr('data-latex')))
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"')
            pq(p._element).append(omml_str)
            return
        if pq.has_class("math-tex"):
            # formula
            if pq.attr('data-latex'):
                omml_str = pq.attr('data-latex')
            else:
                omml_str = html.unescape(pq.html()) if pq.html() is not None else ''
                omml_str = omml_str.replace(r'\(', '').replace(r'\)', '')
            omml_str = converter.to_omml(self.mini_trim(omml_str))
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"')
            pq(p._element).append(omml_str)
            return
        # Afanti formula
        if pq.has_class('afanti-latex'):
            metadata = AftQuestion(pq).parse_element()
            if metadata.startswith('^') or metadata.startswith('_'):
                last_ele = pq(p._element).children()[-1]
                metadata = last_ele.text[-1] + metadata
                last_ele.text = last_ele.text[:-1]
            omml_str = converter.to_omml(self.mini_trim(metadata))
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"')
            pq(p._element).append(omml_str)
            return
    except EquationConvertError:
        img = PyQuery('img', pq)
        self._render_img(p, img)
        return

    bold = any([bold,
                self._get_pq_style(pq, 'font-weight') == 'bold',
                self._get_pq_style(pq, 'font-weight') == 'bolder'])
    italic = any([italic, self._get_pq_style(pq, 'font-style') == 'italic'])
    strike = any([strike,
                  self._get_pq_style(pq, 'text-decoration') == 'line-through',
                  self._get_pq_style(pq, 'text-decoration-line') == 'line-through'])
    underline = any([underline,
                     self._get_pq_style(pq, 'text-decoration') == 'underline',
                     self._get_pq_style(pq, 'text-decoration-line') == 'underline'])
    if self._get_pq_style(pq, 'font-size'):
        size = self._get_pq_style(pq, 'font-size')
        if size.endswith('px'):
            size = size[:-2]
            size = int(float(size))
            font_size = self.get_pt(size)
        elif size.endswith('pt'):
            size = size[:-2]
            size = float(size)
            font_size = Pt(size)
    # self.__render_inline_element(p, pq, bold=bold, italic=italic, underline=underline,
    #                              font_size=font_size, strike=strike)
    contents = pq.contents()
    for item in contents:
        if isinstance(item, (HtmlElement, _Element)):
            self._render_element(p, item, is_root=True, bold=bold, italic=italic,
                                 strike=strike, underline=underline, font_size=font_size)
            continue
        run = p.add_run(self._clear_text(item))
        self.__force_simsun(run)
        if self._get_pq_style(pq, 'font-name'):
            run.font.name = self._get_pq_style(pq, 'font-name')
        if font_size:
            run.font.size = font_size
        run.underline = underline
        run.bold = bold
        run.italic = italic
        run.font.strike = strike
        run.font.superscript = sup
        run.font.subscript = sub
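# A small, standalone check of the math-tex preprocessing above: HTML
# entities are unescaped and the MathJax \( \) inline delimiters stripped
# before the LaTeX is handed to the OMML converter. The sample input is
# made up.
import html

raw = r'\(x &lt; y\)'
latex = html.unescape(raw).replace(r'\(', '').replace(r'\)', '')
assert latex == 'x < y'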
    else:
        newfile = special_chapters[i]  # TODO: handle appendices and index
    link_replacements[file.replace('trinkethtml/', '')] = newfile

for i, file in enumerate(files[1:]):  # skip book index
    print("Processing: ", file)
    selector = 'div.columns > ul > li:nth-child(' + str(i + 1) + ')'
    list_items = d(selector)
    list_items('li').eq(0).addClass('has-dropdown')
    list_items('ul').addClass('dropdown')
    toc = PyQuery('<div><ul class="right"></ul></div>')
    toc('ul').html(list_items)
    thisfile = file.replace('trinkethtml/', '')
    newfile = link_replacements[thisfile]
    toc_text = re.sub(thisfile, web_dir + newfile, toc.html(method='html'))
    #print(toc_text)

    # Extract chapter text
    with open(file) as f:
        chapter_raw = f.read()
    chapter_query = PyQuery(chapter_raw)
    chapter_text = chapter_query(".bookchapter").html(method='html')

    # Replace old links
    for old, new in link_replacements.items():
        chapter_text = re.sub(old, web_dir + new, chapter_text)

    # placeholder for tabs and newlines since re.sub will clobber them otherwise
    # (raw string, so the \g<1> backreference survives unmangled)
    # print(re.findall(r'^.*?\\[tn].*?$', chapter_text, flags=re.M))
    chapter_text = re.sub(r'\\([tn])', r'shouldbe\g<1>',
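# Standalone sketch of the placeholder trick above: literal \t / \n escape
# sequences in the text are parked as 'shouldbet' / 'shouldben' so that
# later re.sub passes cannot mangle them, then restored at the end. The
# sample text and restoring pattern are illustrative.
import re

text = r'print("a\tb\n")'
parked = re.sub(r'\\([tn])', r'shouldbe\g<1>', text)
# ... other substitutions run safely here ...
restored = re.sub(r'shouldbe([tn])', r'\\\g<1>', parked)
assert restored == text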
class DocumentPublisherEngine(object):
    def __init__(self, pro, doc, group, organization=None):
        self.project = pro
        self.document = doc
        self.groups = [group.key, Group.get_worldshare().key]
        self.organization = organization
        self.user = User()
        self.user.groups = self.groups
        if organization:
            self.user.organization = organization.key
        self.html = ''
        self.body = Pq('<span></span>')

    def render(self):
        self._process_root()
        self._process_parent(self.project)
        self.html = self.body.html(method='html')

    def _process_root(self):
        attr = self.project.get_attr_by_doc(self.document)
        self.project.render_object = RenderObject()
        self.project.render_object.span = Pq('<span></span>')
        self.project.render_object.span.add_class('project_span')
        self.project.render_object.children_span = Pq('<span></span>')
        self.project.render_object.children_span.add_class('children_span')
        self.project.render_object.parent_children_span = Pq('<span></span>')
        self.project.render_object.parent_children_span.add_class('parent_children_span')
        if attr and attr.is_unordered_list():
            self.project.render_object.ul = Pq('<ul></ul>')
            self.project.render_object.ul.append(self.project.render_object.children_span)
            self.project.render_object.ul.append(self.project.render_object.parent_children_span)
            self.project.render_object.span.append(self.project.render_object.ul)
            self.project.render_object.cur_attr = attributes.UNORDERED_LIST
        elif attr and attr.is_ordered_list():
            self.project.render_object.ul = Pq('<ol></ol>')
            self.project.render_object.ul.append(self.project.render_object.children_span)
            self.project.render_object.ul.append(self.project.render_object.parent_children_span)
            self.project.render_object.span.append(self.project.render_object.ul)
            self.project.render_object.cur_attr = attributes.ORDERED_LIST
        else:
            self.project.render_object.span.append(self.project.render_object.children_span)
            self.project.render_object.span.append(self.project.render_object.parent_children_span)
            self.project.render_object.cur_attr = attributes.NONE
        self.body.append(self.project.render_object.span)

    def _process_parent(self, parent):
        children = ndb.get_multi(parent.children)
        parent_span = False
        for child in children:
            if not child or not child.has_permission_read(self.user):
                continue
            child.parent_obj = parent
            self._render(child, parent)
            if not child.is_parent() and not parent_span:
                parent.render_object.children_span.append(child.render_object.span.remove())
            else:
                parent_span = True
                parent.render_object.parent_children_span.append(child.render_object.span.remove())
            self._process_parent(child)

    def _render(self, concept, parent):
        concept.render_object = RenderObject()
        attr = concept.get_attr_by_doc(self.document)
        ordered_list = False
        unordered_list = False
        concept.render_object.span = Pq('<span></span>')
        concept.render_object.span.attr('id', concept.id)
        if attr:
            concept.render_object.span.attr('data-attr', ' '.join(attr.attributes))
        concept.render_object.span.add_class('concept')
        if not attr or (attr and not attr.is_no_list()):
            if not attr or (attr and not attr.is_unordered_list()):
                auto_list = AutoAttributeEngine.is_ordered_list(parent, None, self.document,
                                                                user=self.user)
                if attr and attr.is_ordered_list():
                    ordered_list = True
                elif concept.is_parent() and auto_list:
                    ordered_list = True
            if not attr or (attr and not attr.is_ordered_list()):
                auto_list = AutoAttributeEngine.is_unordered_list(parent, None, self.document,
                                                                  user=self.user)
                if attr and attr.is_unordered_list():
                    unordered_list = True
                elif concept.is_parent() and auto_list:
                    unordered_list = True
        concept.render_object.render_as_ordered_list = ordered_list
        concept.render_object.render_as_unordered_list = unordered_list
        attr_str = AutoAttributeEngine.get_attr(concept, attr, doc=self.document, user=self.user)
        concept.render_object.span.attr('data-ordered-list', str(ordered_list))
        concept.render_object.span.attr('data-unordered-list', str(unordered_list))
        concept.render_object.phr_span = Pq('<span></span>')
        concept.render_object.phr_span.attr('id', '%s-%s' % (concept.id, 'phr_span'))
        concept.render_object.phr_span.add_class('phr_span')
        concept.render_object.span.append(concept.render_object.phr_span)
        if concept.is_parent():
            concept.render_object.more_icon = Pq('<i></i>')
            concept.render_object.more_icon.attr('id', '%s-%s' % (concept.id, 'more_icon'))
            concept.render_object.more_icon.add_class(
                'fa fa-angle-double-right expand_child_inc move-icon')
            if not concept.depth >= 0:
                concept.render_object.more_icon.add_class('hidden')
            concept.render_object.span.append(concept.render_object.more_icon)
        concept.render_object.children_span = Pq('<span></span>')
        concept.render_object.children_span.attr('id', '%s-%s' % (concept.id, 'children_span'))
        concept.render_object.children_span.add_class('children_span')
        concept.render_object.span.append(concept.render_object.children_span)
        if concept.depth >= 0:
            concept.render_object.children_span.add_class('hidden')
            concept.render_object.children_span.attr('data-collapsed', 'true')
        concept.render_object.parent_children_span = Pq('<span></span>')
        concept.render_object.parent_children_span.attr(
            'id', '%s-%s' % (concept.id, 'parent_children_span'))
        concept.render_object.parent_children_span.add_class('parent_children_span')
        concept.render_object.span.append(concept.render_object.parent_children_span)
        if concept.depth >= 0:
            concept.render_object.parent_children_span.add_class('hidden')
            concept.render_object.parent_children_span.attr('data-collapsed', 'true')
        self._render_text(concept)
        if attr_str == attributes.HEADER:
            self._render_header(concept)
        elif attr_str == attributes.PARAGRAPH:
            self._render_paragraph(concept)
        elif attr_str == attributes.IMAGE:
            self._render_image(concept)
        elif attr_str == attributes.NONE:
            self._render_none(concept)
        if ordered_list:
            self._render_ordered_list(concept)
        elif unordered_list:
            self._render_unordered_list(concept)
        if AutoAttributeEngine.is_list_item(concept, self.document, user=self.user):
            self._render_list_item(concept)
        concept.render_object.cur_attr = attr_str

    def _render_none(self, concept):
        pass

    def _render_image(self, concept):
        concept.render_object.img_figure = Pq('<figure></figure>')
        concept.render_object.img_figure.attr('id', '%s-%s' % (concept.id, 'img-figure'))
        concept.render_object.img_figure.add_class('img-figure')
        concept.render_object.phr_span.append(concept.render_object.img_figure)
        concept.render_object.img = Pq('<img>')
        concept.render_object.img.attr('id', '%s-%s' % (concept.id, 'concept-img'))
        concept.render_object.img.attr('alt', concept.get_phrasing(doc=self.document))
        concept.render_object.img.attr('src', '/media/download/%s' % concept.id)
        concept.render_object.img.add_class('concept-img img-full')
        concept.render_object.img_figure.append(concept.render_object.img)
        concept.render_object.img_caption = Pq('<figcaption></figcaption>')
        concept.render_object.img_caption.attr('id', '%s-%s' % (concept.id, 'caption'))
        concept.render_object.img_caption.append(concept.render_object.phr_text_span.remove())
        concept.render_object.img_caption.add_class('caption')
        concept.render_object.img_figure.append(concept.render_object.img_caption)
        concept.render_object.phr_text_span.remove_class('phr_text_span')
        concept.render_object.phr_text_span.add_class('phr_text_span_img')

    def _render_unordered_list(self, concept):
        concept.render_object.ul = Pq('<ul></ul>')
        concept.render_object.ul.attr('id', '%s-%s' % (concept.id, 'ul'))
        concept.render_object.ul.append(concept.render_object.children_span.remove())
        concept.render_object.ul.append(concept.render_object.parent_children_span.remove())
        concept.render_object.span.append(concept.render_object.ul)

    def _render_ordered_list(self, concept):
        concept.render_object.ol = Pq('<ol></ol>')
        concept.render_object.ol.attr('id', '%s-%s' % (concept.id, 'ol'))
        concept.render_object.ol.append(concept.render_object.children_span.remove())
        concept.render_object.ol.append(concept.render_object.parent_children_span.remove())
        concept.render_object.span.append(concept.render_object.ol)

    def _render_list_item(self, concept):
        concept.render_object.li = Pq('<li></li>')
        concept.render_object.li.attr('id', '%s-%s' % (concept.id, 'li'))
        concept.render_object.li.append(concept.render_object.phr_span.children().remove())
        concept.render_object.phr_span.append(concept.render_object.li)
        if AutoAttributeEngine.is_ordered_list(concept.get_parent(), None, self.document):
            concept.render_object.render_as_ordered_list = True
        elif AutoAttributeEngine.is_unordered_list(concept.get_parent(), None, self.document):
            concept.render_object.render_as_unordered_list = True

    def _render_paragraph(self, concept):
        concept.render_object.p = Pq('<p></p>')
        concept.render_object.p.attr('id', '%s-%s' % (concept.id, 'p'))
        concept.render_object.p.append(concept.render_object.span.children().remove())
        concept.render_object.span.append(concept.render_object.p)
        concept.render_object.span.append(concept.render_object.parent_children_span.remove())

    def _render_header(self, concept):
        hl = concept.depth + 1
        if hl > 6:
            hl = 6
        concept.render_object.header = Pq('<h%s></h%s>' % (hl, hl))
        concept.render_object.header.attr('id', '%s-%s' % (concept.id, 'header'))
        concept.render_object.header.append(concept.render_object.phr_text_span.remove())
        concept.render_object.phr_span.append(concept.render_object.header)
        if concept.render_object.more_icon:
            concept.render_object.header.append(concept.render_object.more_icon)

    def _render_text(self, concept):
        phrasing_text = concept.get_phrasing(doc=self.document)
        if not phrasing_text:
            # fall back to the default document phrasing
            phrasing_text = concept.get_phrasing()
        concept.render_object.phr_text_span = Pq('<span></span>')
        concept.render_object.phr_text_span.attr('id', '%s-%s' % (concept.id, 'phr_text_span'))
        concept.render_object.phr_text_span.add_class('phr_text_span')
        concept.render_object.phr_text_span.append(phrasing_text + ' ')
        concept.render_object.phr_span.append(concept.render_object.phr_text_span)
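# Sketch of how the engine above is driven; project, document and group
# come from the surrounding datastore models, so the names here are
# illustrative placeholders rather than real lookups.
engine = DocumentPublisherEngine(project, document, group)
engine.render()
published_html = engine.html  # the assembled <span> tree, ready to serve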