Example 1
# Assumed context: a Flask + Flask-SocketIO app. `translate`,
# `get_soup_text`, and `get_content_soup_text` are project helpers
# not shown here. Python 2 code (urllib.urlopen, str.encode).
import urllib

from bs4 import BeautifulSoup
from flask import session
from flask_socketio import emit

def scrape_baidu(keywords, page=1, type='news', translated=False, translate_input=False):
    pn = str((page - 1) * 10)  # Baidu pages are 10 results each
    ts = session['job_timestamp']  # keep a snapshot of the timestamp
    if translate_input:
        keywords = translate(keywords, l_from="en", l_to="zh").encode('utf-8')
    if type == 'blog':
        url = "http://www.baidu.com/s?tn=baidurt&rtt=1&wd=%s&pbl=1&pbs=0&bsst=1&pn=%s&ie=utf-8" % (keywords, pn)
    elif type == 'forum':
        url = "http://www.baidu.com/s?tn=baidurt&rtt=1&wd=%s&pbs=1&bsst=1&pn=%s&ie=utf-8" % (keywords, pn)
    else:  # default: news search
        url = "http://www.baidu.com/s?tn=baidurt&rtt=1&wd=%s&pnw=1&pbl=0&pbs=0&bsst=1&ie=utf-8&pn=%s" % (keywords, pn)
    f = urllib.urlopen(url)
    soup = BeautifulSoup(f.read().decode('utf-8', 'ignore'))
    news = soup.select('td.f')
    # "下一页>" is Baidu's "next page >" link; its presence tells the
    # client whether to keep offering pagination.
    next_page_link = soup.find('a', text="下一页>")
    if next_page_link:
        emit('show next page', {})
    else:
        emit('hide next page', {})
    for record in news:
        print "-------ts--------", ts, session['job_timestamp']
        if ts != session['job_timestamp']:
            return
        a_tag = record.find('a')
        title = get_soup_text(a_tag)
        link = a_tag['href']
        content = get_content_soup_text(record.find('h3').next_sibling)  # snippet follows the <h3> title
        if translated:
            emit('result', {'title': translate(title), 'link': link, 'content': translate(content)})
        else:
            emit('result', {'title': title, 'link': link, 'content': content})
    emit('end loading', {})
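
The `emit` and `session` calls suggest this runs inside a Flask-SocketIO event handler. Below is a minimal sketch of how `scrape_baidu` might be wired up; the app setup, event name, and payload shape are assumptions, not the original project's code:

import time

from flask import Flask, session
from flask_socketio import SocketIO

app = Flask(__name__)
app.secret_key = 'dev'             # sessions require a secret key
socketio = SocketIO(app)

@socketio.on('search')             # hypothetical event name
def handle_search(data):
    # A fresh timestamp invalidates any scrape loop still running
    # from this client's previous search.
    session['job_timestamp'] = time.time()
    scrape_baidu(data['keywords'],
                 page=data.get('page', 1),
                 type=data.get('type', 'news'))

if __name__ == '__main__':
    socketio.run(app)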
Example 2
# Same assumed Flask-SocketIO context as Example 1, but the cancellation
# token here is a module-level `job_timestamp` global rather than the session.
import urllib

from bs4 import BeautifulSoup
from flask_socketio import emit

def scrape_weibo(keywords, page=1, translated=False, translate_input=False):
    pn = str((page-1)*10)
    ts = job_timestamp  # snapshot of the module-level job timestamp
    if translate_input:
        keywords = translate(keywords, l_from="en", l_to="zh").encode('utf-8')
    url = "http://www.baidu.com/s?tn=baiduwb&rtt=2&cl=2&ie=utf-8&wd=%s&pn=%s" % (keywords, pn)
    f = urllib.urlopen(url)
    soup = BeautifulSoup(f.read().decode('utf-8', 'ignore'))
    news = soup.select('#weibo li')
    next_page_link = soup.find('a', text="下一页>")  # "next page >" link
    if next_page_link:
        emit('show next page', {})
    else:
        emit('hide next page', {})
    for record in news:
        if ts != job_timestamp:  # a newer search has started; abort
            return
        a_tag = record.select('a.weibo_all')  # select() returns a list
        link = a_tag[0]['href']
        content = get_soup_text(record.find('p'))
        if translated:
            emit('result', {'link': link, 'content': translate(content)})
        else:
            emit('result', {'link': link, 'content': content})
    emit('end loading', {})
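
A sketch of the cancellation pattern this version relies on, under the assumption that `job_timestamp` is a module-level global; `new_search` is a hypothetical caller:

import time

job_timestamp = 0  # module-level token shared with scrape_weibo

def new_search(keywords):
    # Bumping the global invalidates any loop still iterating with an
    # older snapshot: its `ts != job_timestamp` check trips and it
    # returns before emitting stale results.
    global job_timestamp
    job_timestamp = time.time()
    scrape_weibo(keywords)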
Example 4
# Assumed context: a Flask view. `translate` and `jsonp` are project
# helpers (jsonp presumably decodes Baidu's JSONP-wrapped payload).
import urllib

from flask import jsonify

def hot_search_terms():
    url = "http://news.baidu.com/n?m=rddata&v=hot_word"
    jsons = urllib.urlopen(url).read().decode('utf-8', 'ignore')
    terms = jsonp.loads(jsons)['data']
    for term in terms:
        term['title'] = translate(term['title'])  # default direction, presumably zh -> en
    return jsonify(result=terms)
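
The `jsonify` return value suggests this is served as a Flask view. A minimal sketch of registering it as a route; the app object and URL path are assumptions:

from flask import Flask

app = Flask(__name__)

# The endpoint name defaults to the function name ('hot_search_terms').
app.add_url_rule('/hot_search_terms', view_func=hot_search_terms)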