Example #1
def _rewrite_links(data):
    import lxml.html
    # `data` is a urllib response object; parse it and rewrite every link
    # in the document through _rewrite_url, resolving relative links
    # against the response's final URL.
    html = lxml.html.parse(data)
    html.getroot().rewrite_links(_rewrite_url,
                                 resolve_base_href=True,
                                 base_href=data.geturl())
    return lxml.html.tostring(html)
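A minimal usage sketch: `_rewrite_links` expects an object with a `.geturl()` method (such as a urllib response), and `_rewrite_url` is assumed to be a link-mapping callable defined elsewhere in the module; the mapper below is a hypothetical stand-in.

from urllib.request import urlopen

def _rewrite_url(url):
    # hypothetical mapper for illustration only
    return url.replace('http://', 'https://')

page = _rewrite_links(urlopen('http://example.com/'))
print(page[:80])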
Example #2
def main():
    print('start at %s' % time.asctime())
    users = db.select("users")
    print('current users count %s' % len(users))
    for user in users:
        access_token = OAuthToken(user.token, user.secret)

        if not user.trunk_key:
            continue

        t = Trunkly(user.trunk_key)

        sinat = Sinat(sinaConsumer, access_token=access_token)
        statuses = sinat.statuses__user_timeline('GET')
        for status in statuses:
            weibo = status['text']
            # Prepend the retweeted author and text, mirroring Weibo's
            # "//@author: text" convention.
            if 'retweeted_status' in status:
                weibo = '%s //@%s: %s' % (weibo,
                                          status['retweeted_status']['user']['name'],
                                          status['retweeted_status']['text'])

            urls = p.findall(weibo)
            for url in urls:
                print('url is %s' % url)
                title = None
                trunk = None

                try:
                    html = lxml.html.parse(url)
                    title = html.find(".//title").text
                    url = html.getroot().base_url
                    print('title is %s' % title)
                    print('base url is %s' % url)

                    try:
                        try:
                            trunk = t.get_link(parameters={'url': url})
                            print('url already exists!')
                            continue
                        except Exception:
                            # get_link raises when the URL has not been
                            # posted yet; fall through and post it.
                            pass

                        if title and not trunk:
                            print('post url to trunk.ly')
                            t.post_link(parameters={'url': url,
                                                    'title': title,
                                                    'tags': '',
                                                    'note': weibo,
                                                    'text': weibo})
                    except Exception:
                        print('post to trunk error. url %s title %s' % (url, title))
                except Exception:
                    print('url %s fetch error' % url)

    print('---------------- end ---------------------')
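This script depends on names defined elsewhere in the project (`db`, `OAuthToken`, `Sinat`, `Trunkly`, `sinaConsumer`, and the regex `p`). A plausible module-level setup for the imports and the URL pattern is sketched below; the regex is a guess, not the project's actual pattern.

import re
import time
import lxml.html

# assumed URL pattern; the original module's regex is not shown
p = re.compile(r'https?://[^\s]+')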
Example #3
def get_html_from_file(filename):
    """
    Gets the HTML contents from a file.
    """
    import lxml.html
    # use a context manager so the file handle is closed after parsing
    with open(filename) as f:
        html = lxml.html.parse(f)
    return html.getroot()
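Usage sketch, assuming an HTML file on disk ('page.html' is a placeholder path):

root = get_html_from_file('page.html')
print(root.findtext('.//title'))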
Example #4
    def get_notices(self, codes=None, page=1):
        '''
        Announcements ("information landmines") for a single stock.

        Parameters
        --------
            codes: stock code
            page: result page number

        Return
        --------
            DataFrame with columns:
            title: announcement title
            type: announcement type
            date: announcement date
            url: URL of the announcement body
        '''
        if codes is None:
            return None
        if codes.isdigit():
            # Codes starting with 0 or 3 get the 'sz' prefix, codes starting
            # with 6 get 'sh'; anything else is left unchanged.
            if codes.startswith('0'):
                self._code = 'sz' + codes
            elif codes.startswith('6'):
                self._code = 'sh' + codes
            elif codes.startswith('3'):
                self._code = 'sz' + codes
        url = "http://vip.stock.finance.sina.com.cn/corp/view/vCB_BulletinGather.php?stock_str=%s&page=%d" % (
            self._code, page)
        html = lxml.html.parse(url)
        if not html:
            print("html is not found")
            return
        res = html.getroot().xpath('//table[@class="body_table"]/tbody/tr')
        data = []
        for td in res:
            title = td.xpath('th/a/text()')
            if len(title) > 0:
                title = title[0]
            else:
                continue
            ctype = td.xpath('td[1]/text()')
            if len(ctype) > 0:
                ctype = ctype[0]
            else:
                continue
            date = td.xpath('td[2]/text()')
            if len(date) > 0:
                date = date[0]
            else:
                continue
            url = '%s%s%s' % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                              td.xpath('th/a/@href')[0])
            data.append([title, ctype, date, url])
        df = pd.DataFrame(data, columns=nv.NOTICE_INFO_CLS)

        return df
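The method relies on its enclosing class (`self._code`) and on package-level constants (`ct`, `nv`) plus module-level `lxml.html` and `pandas as pd` imports. A hedged usage sketch, with `client` standing in for an instance of that class:

df = client.get_notices('600000', page=1)  # '6...' codes resolve to 'sh600000'
if df is not None:
    print(df.head())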
Example #5
def main_html(text):
    data = {}
    html = lxml.html.parse(StringIO(text))
    root = html.getroot()
    hrefs_info, href_sets = grouped_hrefs_from_page(root)
    data['urls_grouped'] = hrefs_info
    data['urls_next_somehow'] = looking_for_next_page(root, href_sets)
    pprint(data)
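A usage sketch, assuming `StringIO` comes from the `io` module and that `grouped_hrefs_from_page` and `looking_for_next_page` are defined alongside `main_html`:

from io import StringIO  # module-level import assumed by main_html

with open('page.html', encoding='utf-8') as f:
    main_html(f.read())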
Example #7
def main_html(text):
    html = lxml.html.parse(StringIO(text))
    root = html.getroot()

    # traversal() and the helpers below are defined elsewhere in the project
    ess = traversal(root, 1, min_elements=4,
                    mintreeheight=3, maxtreeheight=4, maxmismatch=0.28)
    print_ess(ess, root=root)

    print("ITEM INFO")
    for es in ess:
        item_info_from_es(es)
Example #8
def convert(htmlfile):
    textfile = os.path.splitext(htmlfile)[0] + '.txt'
    print("Convert html %s to plain text %s" % (htmlfile, textfile))

    html = lxml.html.parse(htmlfile)
    text = html.getroot().text_content()

    # clean text
    text = removeComments(text)
    text = removeLonglyWhiteSpace(text)
    text = condenseEmptyLines(text)

    with open(textfile, 'w', encoding='utf-8') as f:
        f.write(text)
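The three cleanup helpers are not shown; the stand-ins below are guesses inferred from their names, not the project's actual implementations:

import re

def removeComments(text):
    # assumed: strip leftover <!-- ... --> comment fragments
    return re.sub(r'<!--.*?-->', '', text, flags=re.S)

def removeLonglyWhiteSpace(text):
    # assumed: collapse runs of spaces and tabs into a single space
    return re.sub(r'[ \t]{2,}', ' ', text)

def condenseEmptyLines(text):
    # assumed: squeeze three or more newlines down to a blank line
    return re.sub(r'\n{3,}', '\n\n', text)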
Example #10
def update_home_index(feed_path, home_path):
    '''Update the HTML index with the feed entry content.'''
    # Get HTML from the index
    if os.path.isfile(home_path):
        html = lxml.html.parse(home_path)
        home = html.getroot()
    else:
        logging.error("WRONG PATH: %s" % (home_path))
        return None
    # Get an entry dictionary from the feed
    entries = last_posts(feed_path)
    # Generate a string with the markup
    home_index = "<ul id='blog_index'>" + last_posts_html(entries) + "</ul>"
    lis = lxml.html.fragment_fromstring(home_index)
    # Replace the content of the home index
    blog_ul = home.get_element_by_id("blog_index")
    blog_ul.getparent().replace(blog_ul, lis)
    return lxml.html.tostring(html, encoding='utf-8')
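Usage sketch: `tostring(..., encoding='utf-8')` returns bytes, so write the result back in binary mode. 'feed.atom' and 'index.html' are placeholder paths:

new_html = update_home_index('feed.atom', 'index.html')
if new_html:
    with open('index.html', 'wb') as f:
        f.write(new_html)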
Example #11
File: ymir.py Project: karlcow/ymir
def update_home_index(feed_path, home_path, id_name):
    """Update the HTML index with the feed entry content."""
    # Get HTML from the index
    if os.path.isfile(home_path):
        html = lxml.html.parse(home_path)
        home = html.getroot()
    else:
        logging.error("WRONG PATH: %s" % (home_path))
        return None
    # Get an entry dictionary from the feed
    entries = last_posts(feed_path)
    # Generate a string with the markup
    home_template = """<ul id="{id}">
    {posts_list}
    </ul>
    """
    posts_list = last_posts_html(entries)
    home_index = home_template.format(
        id=id_name,
        posts_list=posts_list)
    lis = lxml.html.fragment_fromstring(home_index)
    # Replace the content of the home index
    blog_ul = home.get_element_by_id(id_name)
    blog_ul.getparent().replace(blog_ul, lis)
    return lxml.html.tostring(html, encoding='utf-8')
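With the list id parameterized, a call such as update_home_index('feed.atom', 'index.html', 'blog_index') reproduces the behavior of Example #10.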
Example #12
# -*- coding: utf-8 -*-
import lxml.html

song_set = set()
# import as lxml.html so the parsed tree does not shadow the html module
doc = lxml.html.parse(r"test\myshazam-history.html")
root = doc.getroot()
artists = root.xpath('//tr/td[2]/text()')
titles = root.xpath('//tr/td[1]/a/text()')
for artist, title in zip(artists, titles):
    # "artist name - track title"; the set removes duplicates
    # TODO: drop the hyphen between the song title and the artist
    song_set.add(artist + ' - ' + title)
print(len(song_set))
Example #14
def get_html(url):
    # download(), html_dir, html_parser and base_url are names defined
    # elsewhere in the project (a downloader helper and module-level config).
    path, _ = download(url, to_dir=html_dir, keep_params=True)
    html = lxml.html.parse(path, parser=html_parser, base_url=base_url)
    return html.getroot()
# Excerpt from a larger report-generation routine in the same project
# (the enclosing function and earlier context are not shown).

                # individual motif plots, arranged three to a row
                plt_tbl = []
                for i, url in enumerate(motif_plot_urls[:30]):
                    if i % 3 == 0:
                        plt_tbl.append([])
                    plt_tbl[-1].append(ReStImage(url))

                doc.add(ReStSimpleTable(('**Peak strength vs refined motif strength**',
                                         '(based on top 2000 peak sequences by pvalue)',
                                         ''),
                                        plt_tbl))

            doc.add(ReStSection('MEME-ChIP results', level=3))
            meme_index_path = os.path.join(meme_path, 'index.html')
            print(meme_index_path)
            if os.path.exists(meme_index_path):
                html = lxml.html.parse(meme_index_path)
                page = html.getroot()
                script_list = page.findall("head/script")
                last_script = script_list[-1]
                # JavaScript injected into the MEME-ChIP index page so its
                # output links resolve relative to the report location.
                add_link_script = """
  var progs = data["programs"];
  for (var i = 0; i < progs.length; i++) {
    var prog = progs[i];
    if (prog["outputs"].length > 0) {
      var outputs = prog["outputs"];
      for (var j = 0; j < outputs.length; j++) {
        outputs[j]["file"] = "%s/" + outputs[j]["file"]
      }
    }
  }
""" % (os.path.relpath(meme_path, os.path.dirname(reSt_html_path)))
                script_el = lxml.html.builder.SCRIPT(add_link_script)