def _rewrite_links(data):
    import lxml.html
    html = lxml.html.parse(data)
    html.getroot().rewrite_links(_rewrite_url, resolve_base_href=True,
                                 base_href=data.geturl())
    return lxml.html.tostring(html)
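rewrite_links() hands every link in the document to the callback and substitutes whatever it returns. A minimal, self-contained sketch of the same pattern; the proxy prefix and the sample markup are assumptions, not part of the original helper:

import lxml.html

def _rewrite_url(link):
    # Hypothetical callback: route every absolute link through a proxy.
    return 'https://proxy.example.com/?u=' + link

doc = lxml.html.fromstring('<a href="/about">about</a>',
                           base_url='https://example.com/')
doc.rewrite_links(_rewrite_url, resolve_base_href=True,
                  base_href='https://example.com/')
print(lxml.html.tostring(doc))
# -> b'<a href="https://proxy.example.com/?u=https://example.com/about">about</a>'

With base_href given, each link is joined against that URL before it reaches the callback, which is why the relative "/about" arrives absolute.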
def main():
    print 'start at %s' % time.asctime()
    users = db.select("users")
    print 'current users count %s' % len(users)
    for user in users:
        access_token = OAuthToken(user.token, user.secret)
        if not user.trunk_key:
            continue
        t = Trunkly(user.trunk_key)
        sinat = Sinat(sinaConsumer, access_token=access_token)
        statuses = sinat.statuses__user_timeline('GET')
        for status in statuses:
            # Reconstruct the full text for retweets.
            weibo = status['text']
            if status.has_key('retweeted_status'):
                weibo = '%s //@%s: %s' % (
                    weibo,
                    status['retweeted_status']['user']['name'],
                    status['retweeted_status']['text'])
            urls = p.findall(weibo)
            for url in urls:
                print 'url is %s' % url
                title = None
                trunk = None
                try:
                    html = lxml.html.parse(url)
                    title = html.find(".//title").text
                    url = html.getroot().base_url
                    print 'title is %s' % title
                    print 'base url is %s' % url
                    try:
                        try:
                            trunk = t.get_link(parameters={'url': url})
                            print 'url already exists'
                            continue
                        except:
                            # get_link raises when the URL is not on Trunk.ly yet.
                            print 'error'
                        if title and not trunk:
                            print 'post url to trunk.ly'
                            t.post_link(parameters={'url': url, 'title': title,
                                                    'tags': '', 'note': weibo,
                                                    'text': weibo})
                    except:
                        print 'post to trunk error. url %s title %s' % (url, title)
                except:
                    print 'url %s fetch error' % url
    print '---------------- end ---------------------'
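The function assumes a module-level compiled pattern p for pulling URLs out of the status text. A plausible stand-in (my assumption, not the original definition):

import re

# Hypothetical URL pattern; the original `p` is defined elsewhere.
p = re.compile(r'https?://\S+')
print(p.findall('check http://t.cn/abc123 and see https://example.com/x'))
# -> ['http://t.cn/abc123', 'https://example.com/x']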
def get_html_from_file(filename):
    """Gets the HTML contents from a file."""
    import lxml.html
    f = open(filename)
    html = lxml.html.parse(f)
    return html.getroot()
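Usage sketch, assuming a throwaway page.html written on the spot:

with open('page.html', 'w') as fh:                     # hypothetical input
    fh.write('<html><body><h1>Hi</h1></body></html>')

root = get_html_from_file('page.html')                 # helper defined above
print(root.find('.//h1').text)                         # -> Hi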
def get_notices(self, codes=None, page=1):
    '''
    Notices ("information mines") for an individual stock.

    Parameters
    --------
    code: stock code
    date: announcement date

    Return
    --------
    DataFrame with columns:
        title: notice title
        type: notice type
        date: announcement date
        url: URL of the notice body
    '''
    if codes is None:
        return None
    if codes.decode().isdigit():
        # Codes starting with '0' or '3' get the 'sz' prefix,
        # '6' gets 'sh'; anything else is invalid.
        if codes.startswith('0'):
            self._code = 'sz' + codes
        elif codes.startswith('6'):
            self._code = 'sh' + codes
        elif codes.startswith('3'):
            self._code = 'sz' + codes
    url = ("http://vip.stock.finance.sina.com.cn/corp/view/"
           "vCB_BulletinGather.php?stock_str=%s&page=%d" % (self._code, page))
    html = lxml.html.parse(url)
    if not html:
        print "html is not found"
        return
    res = html.getroot().xpath('//table[@class="body_table"]/tbody/tr')
    data = []
    for td in res:
        title = td.xpath('th/a/text()')
        if len(title) > 0:
            title = title[0]
        else:
            continue
        ctype = td.xpath('td[1]/text()')
        if len(ctype) > 0:
            ctype = ctype[0]
        else:
            continue
        date = td.xpath('td[2]/text()')
        if len(date) > 0:
            date = date[0]
        else:
            continue
        url = '%s%s%s' % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                          td.xpath('th/a/@href')[0])
        data.append([title, ctype, date, url])
    df = pd.DataFrame(data, columns=nv.NOTICE_INFO_CLS)
    return df
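Each column above repeats the same "take the first xpath hit or skip the row" dance; it can be factored into a helper. A sketch (first_or_none is my name, not part of the original):

def first_or_none(node, expr):
    """Return the first xpath match, or None when the cell is missing."""
    hits = node.xpath(expr)
    return hits[0] if hits else None

With that, the row loop collapses to three calls plus a single guard such as `if None in (title, ctype, date): continue`.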
def main_html(text):
    data = {}
    html = lxml.html.parse(StringIO(text))
    root = html.getroot()
    hrefs_info, href_sets = grouped_hrefs_from_page(root)
    data['urls_grouped'] = hrefs_info
    data['urls_next_somehow'] = looking_for_next_page(root, href_sets)
    pprint(data)
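lxml.html.parse() expects a file or file-like object, which is why the text is wrapped in StringIO; fromstring() takes the string directly. A quick equivalence check with made-up markup:

from io import StringIO
import lxml.html

text = '<html><body><a href="/next">next</a></body></html>'
via_parse = lxml.html.parse(StringIO(text)).getroot()
via_fromstring = lxml.html.fromstring(text)
print(via_parse.xpath('//a/@href') == via_fromstring.xpath('//a/@href'))  # True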
def main_html(text):
    html = lxml.html.parse(StringIO(text))
    root = html.getroot()
    ess = traversal(root, 1, min_elements=4, mintreeheight=3,
                    maxtreeheight=4, maxmismatch=0.28)
    print_ess(ess, root=root)
    print "ITEM INFO"
    for es in ess:
        item_info_from_es(es)
def convert(htmlfile):
    textfile = os.path.splitext(htmlfile)[0] + '.txt'
    print "Convert html %s to plain text %s" % (htmlfile, textfile)
    html = lxml.html.parse(htmlfile)
    text = html.getroot().text_content()
    # clean text
    text = removeComments(text)
    text = removeLonglyWhiteSpace(text)
    text = condenseEmptyLines(text)
    f = open(textfile, 'w')
    f.write(text.encode('utf-8'))
    f.close()
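text_content() concatenates all descendant text and drops the markup, which is what makes the one-line HTML-to-text step work. An illustration with a made-up fragment:

import lxml.html

frag = lxml.html.fromstring('<div>Hello <b>world</b>!</div>')
print(frag.text_content())   # -> Hello world!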
def update_home_index(feed_path, home_path):
    '''Update the HTML index with the feed entry content.'''
    # Get HTML from the index
    if os.path.isfile(home_path):
        html = lxml.html.parse(home_path)
        home = html.getroot()
    else:
        logging.error("WRONG PATH: %s" % (home_path))
    # Get an entry dictionary from the feed
    entries = last_posts(feed_path)
    # Generate string with markup
    home_index = "<ul id='blog_index'>" + last_posts_html(entries) + "</ul>"
    lis = lxml.html.fragment_fromstring(home_index)
    # Replace the content of the home index
    blog_ul = home.get_element_by_id("blog_index")
    blog_ul.getparent().replace(blog_ul, lis)
    return lxml.html.tostring(html, encoding='utf-8')
def update_home_index(feed_path, home_path, id_name):
    """Update the HTML index with the feed entry content."""
    # Get HTML from the index
    if os.path.isfile(home_path):
        html = lxml.html.parse(home_path)
        home = html.getroot()
    else:
        logging.error("WRONG PATH: %s" % (home_path))
    # Get an entry dictionary from the feed
    entries = last_posts(feed_path)
    # Generate string with markup
    home_template = """<ul id="{id}">
{posts_list}
</ul>
"""
    posts_list = last_posts_html(entries)
    home_index = home_template.format(id=id_name, posts_list=posts_list)
    lis = lxml.html.fragment_fromstring(home_index.decode('utf-8'))
    # Replace the content of the home index
    blog_ul = home.get_element_by_id(id_name)
    blog_ul.getparent().replace(blog_ul, lis)
    return lxml.html.tostring(html, encoding='utf-8')
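The replace step in isolation, with made-up ids and markup, in case the parent/child dance is unclear:

import lxml.html

page = lxml.html.fromstring(
    '<html><body><ul id="blog_index"><li>old</li></ul></body></html>')
new_ul = lxml.html.fragment_fromstring('<ul id="blog_index"><li>new</li></ul>')
old_ul = page.get_element_by_id('blog_index')
# Elements are replaced through their parent, hence getparent().
old_ul.getparent().replace(old_ul, new_ul)
print(lxml.html.tostring(page))
# -> b'<html><body><ul id="blog_index"><li>new</li></ul></body></html>'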
# -*- coding: utf-8 -*-
import os
from lxml import html

song_set = set()
html = html.parse(r"test\myshazam-history.html")
root = html.getroot()
for n in range(len(root.xpath('//tr/td[2]/text()'))):
    # "artist name - track title"; the set drops duplicates
    # TODO: strip the hyphen between track title and artist
    song_set.add(root.xpath('//tr/td[2]/text()')[n] + ' - ' +
                 root.xpath('//tr/td[1]/a/text()')[n])
print(len(song_set))
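Running the two xpath queries once and zipping the results avoids re-evaluating them on every iteration; a sketch of the same set build, reusing root from above:

artists = root.xpath('//tr/td[2]/text()')
titles = root.xpath('//tr/td[1]/a/text()')
song_set = {artist + ' - ' + title for artist, title in zip(artists, titles)}
print(len(song_set))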
def get_html(url):
    path, _ = download(url, to_dir=html_dir, keep_params=True)
    html = lxml.html.parse(path, parser=html_parser, base_url=base_url)
    return html.getroot()
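Passing base_url matters when the document is parsed from a local cache file: it is what later lets relative links resolve against the original site. A minimal illustration with made-up values:

import lxml.html

root = lxml.html.fromstring('<a href="p.html">p</a>',
                            base_url='https://example.com/dir/')
root.make_links_absolute()            # uses the stored base_url
print(root.get('href'))               # -> https://example.com/dir/p.html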
# individual motif plots
plt_tbl = []
for i, url in enumerate(motif_plot_urls[:30]):
    if i % 3 == 0:
        plt_tbl.append([])
    plt_tbl[-1].append(ReStImage(url))
doc.add(ReStSimpleTable(('**Peak strength vs refined motif strength**',
                         '(based on top 2000 peak sequences by pvalue)',
                         ''),
                        plt_tbl))
#end

if len(motif_results_fns) > 0:
    doc.add(ReStSection('MEME-ChIP results', level=3))
    meme_index_path = os.path.join(meme_path, 'index.html')
    print meme_index_path
    if os.path.exists(meme_index_path):
        html = lxml.html.parse(meme_index_path)
        page = html.getroot()
        script_list = page.findall("head/script")
        last_script = script_list[-1]
        # Patch MEME-ChIP's data object so its output paths resolve
        # relative to the report page.
        add_link_script = """
var progs = data["programs"];
for (var i = 0; i < progs.length; i++) {
    var prog = progs[i];
    if (prog["outputs"].length > 0) {
        var outputs = prog["outputs"];
        for (var j = 0; j < outputs.length; j++) {
            outputs[j]["file"] = "%s\/" + outputs[j]["file"]
        }
    }
}
""" % os.path.relpath(meme_path, os.path.dirname(reSt_html_path))
        script_el = lxml.html.builder.SCRIPT(add_link_script)
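The snippet stops after building script_el; presumably it is then spliced in next to the existing scripts. In lxml that is one call (an assumption about the original's next step, which is not shown):

# Insert the patch script right after the last <script> in <head>,
# so it runs after MEME-ChIP's own `data` object is defined.
last_script.addnext(script_el)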