def crawl_blog_post(blog_id, log_no, tags, written_time=None, verbose=True):
    """Crawl a single blog post and return it as a structured dict.

    :param blog_id: blog identifier used to build the mobile post URL
    :param log_no: post number within the blog
    :param tags: mapping from (blog_id, log_no) to the post's tag list
    :param written_time: optional written-time value stored on the post
    :param verbose: when False, strip 'directorySeq' and 'sympathyCount'
                    from the returned post dict
    :return: post dict built by btc.make_structure, or None when the
             page could not be fetched/parsed
    """
    def get_title(root):
        # Title markup differs between the old layout ('tit_h3') and the
        # newer Smart Editor layout ('se_textarea'); try each selector in
        # order and return the first non-empty hit, else ''.
        for xpath in ('//h3[@class="tit_h3"]/text()',
                      '//h3[@class="se_textarea"]/text()'):
            nodes = root.xpath(xpath)
            if nodes and nodes[0].strip():
                return nodes[0].strip()
        return ''

    def get_page_html(url):
        # Fetch the mobile page and cut out only the post-body element.
        try:
            page = requests.get(url, headers=headers)
            root = html.fromstring(page.content)
            elem = root.xpath('//div[@class="_postView"]')[0]
            html_ = etree.tostring(elem)
            return (BeautifulSoup(html_, 'lxml'), get_title(root))
        except IOError:
            # Report the failing URL instead of silently printing a blank
            # line; stay best-effort and let the caller see (None, None).
            print('Failed to fetch %s' % url)
            return (None, None)

    url = mobileurl % (blog_id, log_no)
    (doc, title) = get_page_html(url)
    if not doc:
        # BUGFIX: previously printed the unformatted 'posturl' template;
        # print the URL that actually failed.
        print('No doc in %s' % url)
        return None

    crawled_time = utils.get_today_str()
    crawler_version = utils.get_version()
    post_tags = tags[(blog_id, log_no)]
    directory_seq = None  # NOTE: No directory sequence given for query crawler
    post = btc.make_structure(blog_id, log_no, None, doc, crawled_time,
                              crawler_version, title, written_time, url,
                              post_tags, directory_seq)
    if not verbose:
        del post['directorySeq']
        del post['sympathyCount']
    return post
def crawl_blog_post(blog_id, log_no, tags, written_time=None, verbose=True):
    """Crawl a single blog post and return it as a structured dict.

    :param blog_id: blog identifier, or a full URL (detected via the
                    'http' prefix) to fetch directly
    :param log_no: post number within the blog
    :param tags: mapping from (blog_id, log_no) to the post's tag list
    :param written_time: optional written-time value stored on the post
    :param verbose: when False, strip 'directorySeq' and 'sympathyCount'
                    from the returned post dict
    :return: post dict built by btc.make_structure, or None when the
             page could not be fetched/parsed
    """
    def get_title(root):
        # BUGFIX: guard the [0] index so a post with no 'tit_h3' element
        # yields an empty title instead of an IndexError that escapes the
        # 'except IOError' handler below and aborts the crawl.
        nodes = root.xpath('//h3[@class="tit_h3"]/text()')
        return nodes[0].strip() if nodes else ''

    def get_page_html(url):
        # Parse the page with lxml and cut out only the post-body element.
        try:
            root = html.parse(url)
            elem = root.xpath('//div[@class="_postView"]')[0]
            html_ = etree.tostring(elem)
            return (BeautifulSoup(html_), get_title(root))
        except IOError:
            # Report the failing URL instead of silently printing a blank
            # line; stay best-effort and let the caller see (None, None).
            print('Failed to fetch %s' % url)
            return (None, None)

    if blog_id.startswith('http'):
        url = blog_id
    else:
        url = mobileurl % (blog_id, log_no)
    (doc, title) = get_page_html(url)
    if not doc:
        # BUGFIX: previously printed the unformatted 'posturl' template;
        # print the URL that actually failed.
        print('No doc in %s' % url)
        return None

    crawled_time = utils.get_today_str()
    crawler_version = utils.get_version()
    # Store the canonical (non-mobile) post URL on the structure.
    url = posturl % (blog_id, log_no)
    post_tags = tags[(blog_id, log_no)]
    directory_seq = None  # NOTE: No directory sequence given for query crawler
    post = btc.make_structure(blog_id, log_no, None, doc, crawled_time,
                              crawler_version, title, written_time, url,
                              post_tags, directory_seq)
    if not verbose:
        del post['directorySeq']
        del post['sympathyCount']
    return post