# -*- coding: utf-8 -*-
import json
import urllib

from bs4 import BeautifulSoup
from flask import jsonify, session
from flask_socketio import emit


def scrape_baidu(keywords, page=1, type='news', translated=False, translate_input=False):
    """Scrape one page of Baidu search results and stream them to the
    client over Socket.IO.  `type` (kept as-is for compatibility, though
    it shadows the builtin) selects the news, blog, or forum vertical."""
    pn = str((page - 1) * 10)  # Baidu paginates in steps of 10
    ts = session['job_timestamp']  # snapshot: lets a newer job cancel this one
    if translate_input:
        keywords = translate(keywords, l_from="en", l_to="zh").encode('utf-8')
    keywords = urllib.quote_plus(keywords)  # percent-encode before building the URL
    if type == 'blog':
        url = "http://www.baidu.com/s?tn=baidurt&rtt=1&wd=%s&pbl=1&pbs=0&bsst=1&pn=%s&ie=utf-8" % (keywords, pn)
    elif type == 'forum':
        url = "http://www.baidu.com/s?tn=baidurt&rtt=1&wd=%s&pbs=1&bsst=1&pn=%s&ie=utf-8" % (keywords, pn)
    else:  # default: news
        url = "http://www.baidu.com/s?tn=baidurt&rtt=1&wd=%s&pnw=1&pbl=0&pbs=0&bsst=1&ie=utf-8&pn=%s" % (keywords, pn)
    f = urllib.urlopen(url)
    soup = BeautifulSoup(f.read().decode('utf-8', 'ignore'), 'html.parser')
    news = soup.select('td.f')  # each result row sits in a td.f cell
    # "下一页" is Baidu's "next page" link; toggle the client's button accordingly.
    next_page_link = soup.find('a', text="下一页>")
    if next_page_link:
        emit('show next page', {})
    else:
        emit('hide next page', {})
    for record in news:
        if ts != session['job_timestamp']:
            return  # a newer search started; abandon this job
        a_tag = record.find('a')
        title = get_soup_text(a_tag)
        link = a_tag['href']
        content = get_content_soup_text(record.find('h3').next_sibling)
        if translated:
            emit('result', {'title': translate(title), 'link': link,
                            'content': translate(content)})
        else:
            emit('result', {'title': title, 'link': link, 'content': content})
    emit('end loading', {})
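
# scrape_baidu calls emit() and reads session, so it is written to run
# inside a Flask-SocketIO event handler.  A minimal sketch of that wiring
# follows; the event name, payload keys, and the `socketio` object
# (presumably created elsewhere via SocketIO(app)) are assumptions for
# illustration, not part of the original:
@socketio.on('baidu search')
def start_baidu_search(data):
    # Stamp the session so any in-flight scrape for this client stops.
    session['job_timestamp'] = data.get('timestamp')
    scrape_baidu(data['keywords'].encode('utf-8'),
                 page=int(data.get('page', 1)),
                 type=data.get('type', 'news'),
                 translated=data.get('translated', False))
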
def scrape_weibo(keywords, page=1, translated=False, translate_input=False):
    """Scrape one page of Baidu's Weibo vertical and stream the posts to
    the client over Socket.IO."""
    pn = str((page - 1) * 10)  # Baidu paginates in steps of 10
    ts = job_timestamp  # snapshot of the module-level timestamp
    if translate_input:
        keywords = translate(keywords, l_from="en", l_to="zh").encode('utf-8')
    url = "http://www.baidu.com/s?tn=baiduwb&rtt=2&cl=2&ie=utf-8&wd=%s&pn=%s" % (
        urllib.quote_plus(keywords), pn)
    f = urllib.urlopen(url)
    soup = BeautifulSoup(f.read().decode('utf-8', 'ignore'), 'html.parser')
    news = soup.select('#weibo li')  # one li per Weibo post
    # "下一页" is Baidu's "next page" link; toggle the client's button accordingly.
    next_page_link = soup.find('a', text="下一页>")
    if next_page_link:
        emit('show next page', {})
    else:
        emit('hide next page', {})
    for record in news:
        if ts != job_timestamp:
            return  # a newer search started; abandon this job
        a_tag = record.select('a.weibo_all')
        if not a_tag:
            continue  # skip entries without a permalink
        link = a_tag[0]['href']
        content = get_soup_text(record.find('p'))
        if translated:
            emit('result', {'link': link, 'content': translate(content)})
        else:
            emit('result', {'link': link, 'content': content})
    emit('end loading', {})
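
# Both scrapers use the same cancellation idiom: snapshot a timestamp on
# entry, then bail out mid-loop once a newer job has overwritten it.
# scrape_weibo compares against a module-level job_timestamp (scrape_baidu
# uses the session-scoped copy).  A hypothetical writer side of that
# protocol, assuming a Socket.IO handler stamps each new request -- the
# event name and payload shape are illustrative only:
import time

job_timestamp = 0  # overwritten by each new search request


@socketio.on('weibo search')
def start_weibo_search(data):
    global job_timestamp
    job_timestamp = time.time()  # invalidates any in-flight scrape
    scrape_weibo(data['keywords'].encode('utf-8'),
                 page=int(data.get('page', 1)))
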
def hot_search_terms():
    """Fetch Baidu's trending search terms, translate each title, and
    return them as a JSON response (Flask view helper)."""
    url = "http://news.baidu.com/n?m=rddata&v=hot_word"
    jsons = urllib.urlopen(url).read().decode('utf-8', 'ignore')
    terms = json.loads(jsons)['data']  # parse the JSON payload
    for term in terms:
        term['title'] = translate(term['title'])
    return jsonify(result=terms)
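
# translate(), get_soup_text(), and get_content_soup_text() are called
# throughout this section but defined elsewhere in the project.  From the
# call sites their shapes must be roughly the stubs below -- hypothetical
# stand-ins so this section runs in isolation (the translate stub just
# echoes its input; defaults of zh -> en are inferred from how results
# are translated for display), not the real implementations:
def translate(text, l_from="zh", l_to="en"):
    # Real version presumably calls a translation service; echo for now.
    return text


def get_soup_text(tag):
    # Flatten a BeautifulSoup tag to its visible text.
    return tag.get_text(strip=True) if tag else ''


def get_content_soup_text(node):
    # The node after an <h3> may be a Tag or a bare NavigableString.
    return node.get_text(strip=True) if hasattr(node, 'get_text') else unicode(node).strip()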