Example #1
def store_by_wiki_url(url, comment_count, answered, keyword_id):
    """Fetch a wenwen.soso.com question page and create or update its WikiPost row."""
    sql_post = session.query(WikiPost).filter(WikiPost.url==url).first()
    if not sql_post:
        # create a new row if this URL has not been stored before
        sql_post = WikiPost()

    sql_post.url = url

    sql_post.keyword_id = keyword_id
    sql_post.info_source_id = SOSO_WENWEN_INFO_SOURCE_ID
    sql_post.comment_count = comment_count
    sql_post.answered = answered

    headers = {
        'Host': 'wenwen.soso.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
    }
    
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    content = response.read()
    
    soup = BeautifulSoup(content)
    
    # fall back to u'匿名' (anonymous) when the page shows no author link
    wiki_user_screen_name = soup.find('a', attrs={'class': "user_name"})
    if wiki_user_screen_name is None:
        wiki_user_screen_name = u'匿名'
    else:
        wiki_user_screen_name = wiki_user_screen_name.text
    date_str = soup.find('span', attrs={'class':"question_time"}).text
    created_at = baidu_date_str_to_datetime(date_str)
    title = soup.find('h3', attrs={'id':"questionTitle"}).text
    content_div = soup.find('div', attrs={'class':"question_con"})
    if content_div is None:
        content = ""
    else:
        content = content_div.text

    sql_post.read_count = 0
    sql_post.wiki_user_screen_name = wiki_user_screen_name
    sql_post.title = title
    sql_post.content = content
    sql_post.created_at = created_at

    session.merge(sql_post)  # inserts a new row or updates the existing one on flush

    session.flush()
    session.commit()

    # re-read the row so the generated id can be passed to store_category
    sql_post = session.query(WikiPost).filter(WikiPost.url==url).first()
    if sql_post:
        #print "stored"
        store_category('wiki', str(sql_post.id))

    time.sleep(5)
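
Each of these examples repeats the same urllib2 request-and-parse boilerplate. As a minimal sketch, not part of the original code, it could be factored into one helper; the name fetch_soup and the 30-second timeout are assumptions here.

def fetch_soup(url, host):
    # hypothetical helper (not in the original code): shared fetch boilerplate
    headers = {
        'Host': host,
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) '
                      'AppleWebKit/536.26.17 (KHTML, like Gecko) '
                      'Version/6.0.2 Safari/536.26.17',
    }
    req = urllib2.Request(url, headers=headers)
    # a timeout keeps one stalled request from hanging the whole job (assumed value)
    response = urllib2.urlopen(req, timeout=30)
    return BeautifulSoup(response.read())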
Example #2
def store_by_bbs_url(url, keyword_id):
    """Fetch a bbs.tianya.cn thread page and store it via store_bbs_post."""
    headers = {
            'Host': 'bbs.tianya.cn',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
    }
    
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    content = response.read()

    soup = BeautifulSoup(content)

    title = soup.find('span', attrs={'class': "s_title"})
    title = title.span.text

    info_soup = soup.find('div', attrs={'class': "atl-info"})
    infos = info_soup.findAll('span')
    
    bbs_user_screen_name = infos[0].a.text
    # when the second span contains a link, the time/read/reply spans sit one
    # position later; the [3:] slices drop the 3-character field labels
    if infos[1].a is None:
        created_at = baidu_date_str_to_datetime(infos[1].text[3:])
        read_count = int(infos[2].text[3:])
        comment_count = int(infos[3].text[3:])
    else:
        created_at = baidu_date_str_to_datetime(infos[2].text[3:])
        read_count = int(infos[3].text[3:])
        comment_count = int(infos[4].text[3:])

    content_div = soup.find('div', attrs={'class': "bbs-content clearfix"})
    content = content_div.text

    
    store_bbs_post(url, bbs_user_screen_name, title, content,
                   TIANYA_INFO_SOURCE_ID, keyword_id, created_at, read_count, comment_count)

    time.sleep(10)
Example #3
def search_for_baidu_news_posts():
    """Search Baidu News for every keyword and store articles newer than the previous run."""
    last_time = session.query(Job).filter(
        Job.info_source_id == BAIDU_NEWS_INFO_SOURCE_ID).order_by(
            Job.id.desc()).first().previous_executed

    previous_real_count = session.query(News).count()
    count = 0
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = BAIDU_NEWS_INFO_SOURCE_ID

    for keyword in KEYWORDS:
        page = 0  # 'pn' is a result offset; the loop advances it by 'rn' (20) per page
        finished = False
        while (not finished):
            data = {
                'word': keyword.str.encode('gb2312'),
                'tn': 'news',
                'ie': 'gb2312',
                'sr': 0,
                'cl': 2,
                'rn': 20,
                'ct': 0,
                'clk': 'sortbytime',
                'pn': page
            }

            url = "http://news.baidu.com/ns?" + urllib.urlencode(data)

            headers = {
                'Host':
                'news.baidu.com',
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17'
            }

            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()

            soup = BeautifulSoup(content, fromEncoding="gbk")

            news_tables = soup.findAll('table',
                                       attrs={
                                           'cellspacing': '0',
                                           'cellpadding': '2'
                                       })
            count = count + len(news_tables)

            if len(news_tables) == 0:
                break

            for news_table in news_tables:
                url = news_table.tr.td.a['href']
                title = news_table.tr.td.a.text
                source_and_date = news_table.find('font',
                                                  attrs={
                                                      'color': '#666666'
                                                  }).text.split()
                content = news_table.find('font', attrs={'size': '-1'}).text

                source_name = source_and_date[0]
                if len(source_and_date) == 3:
                    date = source_and_date[1] + ' ' + source_and_date[2]
                else:
                    continue

                created_at = baidu_date_str_to_datetime(date)

                # stop once results are older than the previous job run
                if created_at < last_time:
                    finished = True
                    break

                add_news_to_session(url, source_name, title, content,
                                    BAIDU_NEWS_INFO_SOURCE_ID, created_at,
                                    keyword.id)

            time.sleep(5)
            page = page + 20

    current_real_count = session.query(News).count()
    sql_job.fetched_info_count = count
    sql_job.real_fetched_info_count = current_real_count - previous_real_count

    session.add(sql_job)
    session.flush()
    session.commit()
Example #4
def wiki_date_str_to_datetime(date_str):
    # u'今天' means "today"; expand it to a full date before delegating
    if date_str[0:2] == u'今天':
        date_str = datetime.today().strftime('%Y-%m-%d') + date_str[2:]

    return baidu_date_str_to_datetime(date_str)
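
For illustration only: assuming baidu_date_str_to_datetime (defined elsewhere in the project, not shown in this listing) parses plain '%Y-%m-%d %H:%M' strings, the helper above turns a relative u'今天 14:30' ("today 14:30") into an absolute datetime.

from datetime import datetime

# hypothetical stand-in for the project's real parser, shown only to make the
# example runnable; the actual implementation is not part of this listing
def baidu_date_str_to_datetime(date_str):
    return datetime.strptime(date_str.strip(), '%Y-%m-%d %H:%M')

print wiki_date_str_to_datetime(u'今天 14:30')        # today's date at 14:30
print wiki_date_str_to_datetime(u'2013-01-05 09:12')  # passed through unchanged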
Example #5
def inner_search_for_baidu_news_posts(inner_url, count, last_time, keyword,
                                      info_source_id):
    """Walk Baidu News result pages from inner_url; return the updated result count."""
    finished = False
    next_url = inner_url
    while (not finished):

        #print next_url

        headers = {
            'Host':
            'news.baidu.com',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17'
        }

        req = urllib2.Request(next_url, headers=headers)
        response = urllib2.urlopen(req)
        content = response.read()

        soup = BeautifulSoup(content, fromEncoding="gbk")

        news_tables = soup.findAll('table',
                                   attrs={
                                       'cellspacing': '0',
                                       'cellpadding': '2'
                                   })
        count = count + len(news_tables)

        if len(news_tables) == 0:
            break

        for news_table in news_tables:
            url = news_table.tr.td.a['href']
            title = news_table.tr.td.a.text
            source_and_date = news_table.find('font',
                                              attrs={
                                                  'color': '#666666'
                                              }).text.split()
            content = news_table.find('font', attrs={'size': '-1'}).text

            source_name = source_and_date[0]

            if len(source_and_date) == 3:
                date = source_and_date[1] + ' ' + source_and_date[2]
            else:
                continue

            created_at = baidu_date_str_to_datetime(date)

            if info_source_id == BAIDU_NEWS_INFO_SOURCE_ID:
                add_news_to_session(url, source_name, title, content,
                                    info_source_id, created_at, keyword)
            else:
                add_opponent_news_to_session(url, source_name, title, content,
                                             info_source_id, created_at,
                                             keyword)

        time.sleep(5)
        page_nav = soup.find('div', attrs={'class': 'page-nav'})

        # no next page: stop
        if page_nav is None:
            #print 'no page nav'
            finished = True
            break
        next_url_a = page_nav.find('a', attrs={'class': 'next'})
        if next_url_a is None:
            #print page_nav.prettify()
            finished = True
            break
        else:
            next_url = 'http://news.baidu.com' + next_url_a['href']

    return count
Example #6
def search_for_sina_blog_posts():
    """Search Sina blog search for every keyword and store posts newer than the previous run."""
    last_time = session.query(Job).filter(
        Job.info_source_id == SINA_BLOG_INFO_SOURCE_ID).order_by(
            Job.id.desc()).first().previous_executed

    previous_real_count = session.query(BlogPost).filter(
        BlogPost.info_source_id == SINA_BLOG_INFO_SOURCE_ID).count()
    count = 0
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = SINA_BLOG_INFO_SOURCE_ID

    for keyword in KEYWORDS:
        finished = False
        page = 1

        while (not finished):
            data = {
                'q': keyword.str.encode('gbk'),
                'c': 'blog',
                'range': 'article',
                'by': 'title',
                'sort': 'time',
                'page': page
            }

            url = "http://search.sina.com.cn/?" + urllib.urlencode(data)

            headers = {
                'Host':
                'search.sina.com.cn',
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17'
            }

            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()

            soup = BeautifulSoup(content.decode('gbk', 'ignore'))

            posts = soup.findAll('div', attrs={'class': 'r-info r-info2'})
            count = count + len(posts)

            if len(posts) == 0:
                break

            for post in posts:
                url = post.a['href']
                title = post.a.text
                blog_user_screen_name = post.find('a',
                                                  attrs={
                                                      'class': 'fblue'
                                                  }).text
                created_at = baidu_date_str_to_datetime(
                    post.find('span', attrs={
                        'class': 'fgreen time'
                    }).text)
                content = post.p.text

                counts = get_count_from_url(url)
                read_count = counts['read_count']
                comment_count = counts['comment_count']

                if created_at < last_time:
                    finished = True
                    break

                store_blog_post(url, blog_user_screen_name, title, content,
                                SINA_BLOG_INFO_SOURCE_ID, keyword.id,
                                created_at, read_count, comment_count)

                time.sleep(2)

            time.sleep(10)
            page = page + 1

    current_real_count = session.query(BlogPost).filter(
        BlogPost.info_source_id == SINA_BLOG_INFO_SOURCE_ID).count()

    sql_job.fetched_info_count = count
    sql_job.real_fetched_info_count = current_real_count - previous_real_count

    session.add(sql_job)
    session.flush()
    session.commit()
Example #7
def search_for_baidu_tieba_posts():
    """Search Baidu Tieba for every keyword and store posts newer than the previous run."""
    last_time = session.query(Job).filter(
        Job.info_source_id == BAIDU_TIEBA_INFO_SOURCE_ID).order_by(
            Job.id.desc()).first().previous_executed

    previous_real_count = session.query(BBSPost).filter(BBSPost.info_source_id==BAIDU_TIEBA_INFO_SOURCE_ID).count()
    count = 0
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = BAIDU_TIEBA_INFO_SOURCE_ID

    for keyword in KEYWORDS:
        finished = False
        page = 1

        while (not finished):
            data = {'qw': keyword.str.encode('gbk'),
                    'isnew': 1,
                    'rn': 20,
                    'sm': 1,
                    'pn': page
                   }
            
            url = "http://tieba.baidu.com/f/search/res?" + urllib.urlencode(data)
            #print url,keyword.str
            headers = {
                'Host': 'tieba.baidu.com',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17'
            }
    
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()

            soup = BeautifulSoup(content.decode('gbk', 'ignore'))
            
            posts = soup.findAll('div', attrs={'class': 's_post'})
            count = count + len(posts)
            # print count

            if len(posts) == 0:
                break

            for post in posts:
                temp_url = post.a['href']
                url = 'http://tieba.baidu.com' + temp_url
                title = post.a.text
                #print title
                content = post.find('div', attrs={'class': 'p_content'}).text
                bbs_a_tags = post.findAll('a')
                bbs_user_screen_name = bbs_a_tags[-1].text
                created_at_str = post.findAll('font', attrs={'class': 'p_green'})[-1].text
                created_at = baidu_date_str_to_datetime(created_at_str)


                if created_at < last_time:
                    finished = True
                    break

                if title[0:3] == u'回复:':  # skip reply posts ("回复" = "reply")
                    continue

                comment_count = get_comment_count(url)
                read_count = 0
                #print url,title,bbs_user_screen_name,created_at,comment_count
                store_bbs_post(url, bbs_user_screen_name, title, content,
                               BAIDU_TIEBA_INFO_SOURCE_ID, keyword.id, created_at, read_count, comment_count)

                time.sleep(5)


            time.sleep(10)
            page = page + 1


    current_real_count = session.query(BBSPost).filter(BBSPost.info_source_id==BAIDU_TIEBA_INFO_SOURCE_ID).count()

    sql_job.fetched_info_count = count
    sql_job.real_fetched_info_count = current_real_count - previous_real_count

    session.add(sql_job)
    session.flush()
    session.commit()
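
Examples #3, #6 and #7 end with the same job bookkeeping: count the rows before the crawl, record how many raw result blocks were seen, then diff the table count afterwards to get the number of rows actually added. A minimal sketch of just that pattern follows; record_job and the crawl_keywords callable are hypothetical names, and Examples #6 and #7 additionally filter their counts by info_source_id.

def record_job(info_source_id, model, crawl_keywords):
    # session, Job and the model classes come from the surrounding project;
    # crawl_keywords is a hypothetical callable that runs the per-source crawl
    # and returns the raw number of result blocks seen
    previous_real_count = session.query(model).count()
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = info_source_id
    sql_job.fetched_info_count = crawl_keywords()
    sql_job.real_fetched_info_count = (
        session.query(model).count() - previous_real_count)
    session.add(sql_job)
    session.flush()
    session.commit()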