def get_entry(href):
    """Fetch one playno1.com article and return it as a feed-entry dict.

    *href* is a site-relative path; the absolute URL is built from it and
    returned as the entry's ``link``.  The entry dict carries ``link``,
    ``title``, ``author``, ``content`` and ``published``.
    """
    link = "http://www.playno1.com/" + href
    doc = pq(link, redirect=False)
    # The paragraph following the <h1> holds "<date> | 作者:<author>".
    meta = doc("h1:first ~ p").text()
    published, author = meta.split(u" | \u4f5c\u8005:")
    return {
        "link": link,
        "title": doc("h1:first").text(),
        "author": author,
        "content": doc("#article_content").html(),
        "published": published,
    }
def get_content(url):
    """Fetch an article page and return its cleaned-up HTML body.

    The returned string is the article HTML concatenated with the (pruned)
    comment-list HTML, with ads, scripts, navigation chrome and runs of
    ``<br/>`` stripped out.  Embedded ``pic.twitter.com`` links are replaced
    inline with the tweet's images.
    """
    q = pq(url)
    # Inline Twitter pictures: for each link to pic.twitter.com, fetch the
    # tweet page and insert its non-avatar images before the link.  The
    # nested .map() calls are used purely for their side effects on `q`.
    q('a:contains("pic.twitter.com")').map(
        lambda i, x: pq(x.attrib['href'])(
            '.permalink-tweet-container img:not(.avatar)'
        ).map(lambda j, y: q(x).before(q('<img>').attr('src', y.attrib['src'])))
    )
    # Drop HTML comments / processing instructions: lxml gives those nodes a
    # callable `.tag`.  (Was a side-effect list comprehension; an explicit
    # loop states the intent and avoids building a throwaway list.)
    for node in q.root.iter():
        if callable(node.tag):
            q(node).remove()
    # Strip scripts, ads, social chrome and other non-article furniture.
    q('script, .footer_social_ad, .button_top, .article_footer').remove()
    q('.ad_amazon, .jin-ads, #other_news_website').remove()
    q('#popular_articles_comment, #hot_tweet, #category-link').remove()
    q('.related-articles, #ad2, .ent_ad_md, #ad_rs, #tags').remove()
    q('.tooltip, .comment_form, .article_header').remove()
    # The last two tables of the body are boilerplate; keep at most the
    # first 30 comments.
    q(q('.article_bodymore > table')[-2:]).remove()
    q(q('#comment_list li')[30:]).remove()
    # Flatten each comment's <dl> down to the text of its second child.
    q('#comment_list li dl').replace_with(lambda i, x: x[1].text)
    q('#comment_list ul')[0].tag = 'ol'
    q('img').wrap('<div style="float: left !important">')
    # Apply the module's per-node cleanup to every remaining element
    # (side-effect loop; `strip` is defined elsewhere in this module).
    for node in q.root.iter():
        strip(node)
    content = q('.article').html() + q('#comment').html()
    # Collapse consecutive <br/> tags into one.
    content = re.sub(r'(<br/>)+', '<br/>', content)
    return content
def get_entry(url, prefix='', title=''):
    """Fetch one PTT article and return it as a feed-entry dict.

    *prefix* (e.g. a push-count marker) is prepended to the page's own
    title.  *title* is a fallback used when the page carries no title of
    its own (e.g. a deleted or malformed post).
    """
    q = pq(url, cookies=COOKIES)
    author = q('.article-meta-value:eq(0)').text()
    # BUG FIX: the original `prefix + q(...).text() or title` parsed as
    # `(prefix + text) or title`, so any non-empty prefix made the
    # expression truthy and the fallback *title* could never apply.
    page_title = q('.article-meta-value:eq(2)').text()
    title = prefix + page_title if page_title else title
    published = post_time(q('.article-meta-value:eq(3)').text())
    # An article with no pushes has no push timestamps; fall back to the
    # publication time.
    updated = push_time(q('.push-ipdatetime:last').text()) or published
    content = get_content(q('#main-content').html())
    return {
        'link': url,
        'title': title,
        'author': author,
        'content': content,
        'published': published,
        'updated': updated,
    }
def get_page(url):
    """Scrape one PTT board index page.

    Returns a dict with ``next_url`` (absolute URL of the previous-page
    link) and ``posts`` — a list of ``{'link', 'prefix', 'title'}`` dicts
    for every qualifying row on the page.
    """
    q = pq(url, cookies=COOKIES)
    # Drop pinned announcements below the list separator.
    q('.r-list-sep ~ .r-ent').remove()
    posts = []
    for row in q('.r-ent'):
        # Skip rows without a recognised push-count cell.
        if not q('.f0,.f1,.f3', row):
            continue
        if q('.f3', row).text().startswith('1'):  # take out push < 20
            continue
        # Deleted posts have no link.
        if not q('a', row):
            continue
        posts.append({
            'link': BASE_URL + q('a', row).attr('href'),
            'prefix': q('.nrec', row).text() + ' ',
            'title': q('.title', row).text(),
        })
    return {
        'next_url': BASE_URL + q('a.wide:eq(1)').attr('href'),
        'posts': posts,
    }
def get_entries(url):
    """Yield a full entry dict for each article linked from *url*.

    Sleeps 3 seconds between article fetches to rate-limit requests.
    """
    doc = pq(url, redirect=False)
    for item in doc(".fire_float"):
        yield get_entry(doc("a", item).attr("href"))
        time.sleep(3)