# Assumed module-level imports (Python 2 / BeautifulSoup 3). `session`, the
# SQLAlchemy models (WikiPost, BBSPost, BlogPost, News, Job), KEYWORDS, the
# *_INFO_SOURCE_ID constants and helpers such as baidu_date_str_to_datetime
# are defined elsewhere in this project.
import time
import urllib
import urllib2
from datetime import datetime

from BeautifulSoup import BeautifulSoup


USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) '
              'AppleWebKit/536.26.17 (KHTML, like Gecko) '
              'Version/6.0.2 Safari/536.26.17')


def store_by_wiki_url(url, comment_count, answered, keyword_id):
    """Fetch a Soso Wenwen question page and upsert it as a WikiPost."""
    sql_post = session.query(WikiPost).filter(WikiPost.url == url).first()
    if not sql_post:
        sql_post = WikiPost()
        sql_post.url = url
        sql_post.keyword_id = keyword_id
        sql_post.info_source_id = SOSO_WENWEN_INFO_SOURCE_ID
    sql_post.comment_count = comment_count
    sql_post.answered = answered
    headers = {
        'Host': 'wenwen.soso.com',
        'User-Agent': USER_AGENT,
    }
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    content = response.read()
    soup = BeautifulSoup(content)
    # Fall back to u'匿名' ("anonymous") when the asker has no user link.
    wiki_user_screen_name = soup.find('a', attrs={'class': 'user_name'})
    if wiki_user_screen_name is None:
        wiki_user_screen_name = u'匿名'
    else:
        wiki_user_screen_name = wiki_user_screen_name.text
    date_str = soup.find('span', attrs={'class': 'question_time'}).text
    created_at = baidu_date_str_to_datetime(date_str)
    title = soup.find('h3', attrs={'id': 'questionTitle'}).text
    content_div = soup.find('div', attrs={'class': 'question_con'})
    if content_div is None:
        content = ''
    else:
        content = content_div.text
    sql_post.read_count = 0
    sql_post.wiki_user_screen_name = wiki_user_screen_name
    sql_post.title = title
    sql_post.content = content
    sql_post.created_at = created_at
    session.merge(sql_post)  # upsert: insert new rows, update existing ones
    session.flush()
    session.commit()
    # Re-query to obtain the row id, which is only assigned at flush/commit.
    sql_post = session.query(WikiPost).filter(WikiPost.url == url).first()
    if sql_post:
        store_category('wiki', str(sql_post.id))
    time.sleep(5)  # throttle requests to avoid being blocked
def store_by_bbs_url(url, keyword_id):
    """Fetch a Tianya forum thread and store it as a BBS post."""
    headers = {
        'Host': 'bbs.tianya.cn',
        'User-Agent': USER_AGENT,
    }
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    content = response.read()
    soup = BeautifulSoup(content)
    title = soup.find('span', attrs={'class': 's_title'}).span.text
    info_soup = soup.find('div', attrs={'class': 'atl-info'})
    infos = info_soup.findAll('span')
    bbs_user_screen_name = infos[0].a.text
    # When the author span is followed by an extra link, the time / read /
    # comment spans shift by one position. The [3:] slice strips the
    # three-character Chinese field label (e.g. u'时间:') from each value.
    if infos[1].a is None:
        created_at = baidu_date_str_to_datetime(infos[1].text[3:])
        read_count = int(infos[2].text[3:])
        comment_count = int(infos[3].text[3:])
    else:
        created_at = baidu_date_str_to_datetime(infos[2].text[3:])
        read_count = int(infos[3].text[3:])
        comment_count = int(infos[4].text[3:])
    content_div = soup.find('div', attrs={'class': 'bbs-content clearfix'})
    content = content_div.text
    store_bbs_post(url, bbs_user_screen_name, title, content,
                   TIANYA_INFO_SOURCE_ID, keyword_id, created_at,
                   read_count, comment_count)
    time.sleep(10)  # throttle requests
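# The fetch-and-parse pattern above repeats throughout this module with no
# error handling, although urllib2.urlopen raises URLError/HTTPError on
# transient failures. A minimal sketch of a retrying fetch helper — a
# hypothetical addition, not used by the original code; the name fetch_soup
# and its parameters are ours:
def fetch_soup(url, host, retries=3, delay=5):
    """Fetch url with a browser-like User-Agent, retrying on failure."""
    headers = {
        'Host': host,
        'User-Agent': USER_AGENT,
    }
    for attempt in range(retries):
        try:
            req = urllib2.Request(url, headers=headers)
            return BeautifulSoup(urllib2.urlopen(req).read())
        except urllib2.URLError:
            if attempt == retries - 1:
                raise  # give up after the last attempt
            time.sleep(delay)  # back off before retrying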
def search_for_baidu_news_posts():
    """Crawl Baidu News search results for every keyword, newest first,
    stopping per keyword once results older than the last run appear."""
    last_time = session.query(Job).filter(
        Job.info_source_id == BAIDU_NEWS_INFO_SOURCE_ID).order_by(
        Job.id.desc()).first().previous_executed
    previous_real_count = session.query(News).count()
    count = 0
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = BAIDU_NEWS_INFO_SOURCE_ID
    for keyword in KEYWORDS:
        page = 0
        finished = False
        while not finished:
            data = {
                'word': keyword.str.encode('gb2312'),
                'tn': 'news',
                'ie': 'gb2312',
                'sr': 0,
                'cl': 2,
                'rn': 20,  # results per page
                'ct': 0,
                'clk': 'sortbytime',  # newest results first
                'pn': page,  # result offset, stepped by 20 below
            }
            url = 'http://news.baidu.com/ns?' + urllib.urlencode(data)
            headers = {
                'Host': 'news.baidu.com',
                'User-Agent': USER_AGENT,
            }
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()
            soup = BeautifulSoup(content, fromEncoding='gbk')
            news_tables = soup.findAll('table', attrs={'cellspacing': '0',
                                                       'cellpadding': '2'})
            count = count + len(news_tables)
            if len(news_tables) == 0:
                break
            for news_table in news_tables:
                url = news_table.tr.td.a['href']
                title = news_table.tr.td.a.text
                source_and_date = news_table.find(
                    'font', attrs={'color': '#666666'}).text.split()
                content = news_table.find('font', attrs={'size': '-1'}).text
                source_name = source_and_date[0]
                # Expect [source, date, time]; skip entries in any other shape.
                if len(source_and_date) == 3:
                    date = source_and_date[1] + ' ' + source_and_date[2]
                else:
                    continue
                created_at = baidu_date_str_to_datetime(date)
                # Results are sorted by time, so anything older than the
                # previous run means this keyword is done.
                if created_at < last_time:
                    finished = True
                    break
                add_news_to_session(url, source_name, title, content,
                                    BAIDU_NEWS_INFO_SOURCE_ID, created_at,
                                    keyword.id)
            time.sleep(5)  # throttle requests
            page = page + 20
    current_real_count = session.query(News).count()
    sql_job.fetched_info_count = count
    sql_job.real_fetched_info_count = current_real_count - previous_real_count
    session.add(sql_job)
    session.flush()
    session.commit()
def wiki_date_str_to_datetime(date_str):
    """Normalize a wiki date string: replace a leading u'今天' ("today")
    with today's date so the shared parser can handle it."""
    if date_str[0:2] == u'今天':
        date_str = datetime.today().strftime('%Y-%m-%d') + date_str[2:]
    return baidu_date_str_to_datetime(date_str)
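# baidu_date_str_to_datetime is defined elsewhere in this project. As a
# rough illustration only — an assumption, not the original implementation —
# a parser covering the formats the callers in this file appear to feed it
# might look like the sketch below (the name is ours):
def _sketch_baidu_date_str_to_datetime(date_str):
    """Parse u'2013年1月5日 12:30'-style and '2013-01-05 ...' date strings."""
    # Normalize the Chinese date markers to dashes: u'2013年1月5日' -> '2013-1-5'.
    date_str = (date_str.replace(u'年', '-').replace(u'月', '-')
                        .replace(u'日', '').strip())
    for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M', '%Y-%m-%d'):
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    raise ValueError('unrecognized date string: %r' % date_str)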
def inner_search_for_baidu_news_posts(inner_url, count, last_time, keyword,
                                      info_source_id):
    """Walk Baidu News result pages starting from inner_url, following the
    'next' link until it runs out, and return the updated result count."""
    finished = False
    next_url = inner_url
    while not finished:
        headers = {
            'Host': 'news.baidu.com',
            'User-Agent': USER_AGENT,
        }
        req = urllib2.Request(next_url, headers=headers)
        response = urllib2.urlopen(req)
        content = response.read()
        soup = BeautifulSoup(content, fromEncoding='gbk')
        news_tables = soup.findAll('table', attrs={'cellspacing': '0',
                                                   'cellpadding': '2'})
        count = count + len(news_tables)
        if len(news_tables) == 0:
            break
        for news_table in news_tables:
            url = news_table.tr.td.a['href']
            title = news_table.tr.td.a.text
            source_and_date = news_table.find(
                'font', attrs={'color': '#666666'}).text.split()
            content = news_table.find('font', attrs={'size': '-1'}).text
            source_name = source_and_date[0]
            # Expect [source, date, time]; skip entries in any other shape.
            if len(source_and_date) == 3:
                date = source_and_date[1] + ' ' + source_and_date[2]
            else:
                continue
            created_at = baidu_date_str_to_datetime(date)
            if info_source_id == BAIDU_NEWS_INFO_SOURCE_ID:
                add_news_to_session(url, source_name, title, content,
                                    info_source_id, created_at, keyword)
            else:
                add_opponent_news_to_session(url, source_name, title, content,
                                             info_source_id, created_at,
                                             keyword)
        time.sleep(5)  # throttle requests
        # Stop when there is no pagination block or no 'next' link.
        page_nav = soup.find('div', attrs={'class': 'page-nav'})
        if page_nav is None:
            finished = True
            break
        next_url_a = page_nav.find('a', attrs={'class': 'next'})
        if next_url_a is None:
            finished = True
            break
        next_url = 'http://news.baidu.com' + next_url_a['href']
    return count
def search_for_sina_blog_posts():
    """Crawl Sina blog search results for every keyword, newest first,
    stopping per keyword once results older than the last run appear."""
    last_time = session.query(Job).filter(
        Job.info_source_id == SINA_BLOG_INFO_SOURCE_ID).order_by(
        Job.id.desc()).first().previous_executed
    previous_real_count = session.query(BlogPost).filter(
        BlogPost.info_source_id == SINA_BLOG_INFO_SOURCE_ID).count()
    count = 0
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = SINA_BLOG_INFO_SOURCE_ID
    for keyword in KEYWORDS:
        finished = False
        page = 1
        while not finished:
            data = {
                'q': keyword.str.encode('gbk'),
                'c': 'blog',
                'range': 'article',
                'by': 'title',
                'sort': 'time',  # newest results first
                'page': page,
            }
            url = 'http://search.sina.com.cn/?' + urllib.urlencode(data)
            headers = {
                'Host': 'search.sina.com.cn',
                'User-Agent': USER_AGENT,
            }
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()
            soup = BeautifulSoup(content.decode('gbk', 'ignore'))
            posts = soup.findAll('div', attrs={'class': 'r-info r-info2'})
            count = count + len(posts)
            if len(posts) == 0:
                break
            for post in posts:
                url = post.a['href']
                title = post.a.text
                blog_user_screen_name = post.find(
                    'a', attrs={'class': 'fblue'}).text
                created_at = baidu_date_str_to_datetime(
                    post.find('span', attrs={'class': 'fgreen time'}).text)
                content = post.p.text
                counts = get_count_from_url(url)
                read_count = counts['read_count']
                comment_count = counts['comment_count']
                # Results are sorted by time, so anything older than the
                # previous run means this keyword is done.
                if created_at < last_time:
                    finished = True
                    break
                store_blog_post(url, blog_user_screen_name, title, content,
                                SINA_BLOG_INFO_SOURCE_ID, keyword.id,
                                created_at, read_count, comment_count)
                time.sleep(2)  # throttle per-post requests
            time.sleep(10)  # throttle per-page requests
            page = page + 1
    current_real_count = session.query(BlogPost).filter(
        BlogPost.info_source_id == SINA_BLOG_INFO_SOURCE_ID).count()
    sql_job.fetched_info_count = count
    sql_job.real_fetched_info_count = current_real_count - previous_real_count
    session.add(sql_job)
    session.flush()
    session.commit()
def search_for_baidu_tieba_posts():
    """Crawl Baidu Tieba search results for every keyword, newest first,
    stopping per keyword once results older than the last run appear."""
    last_time = session.query(Job).filter(
        Job.info_source_id == BAIDU_TIEBA_INFO_SOURCE_ID).order_by(
        Job.id.desc()).first().previous_executed
    previous_real_count = session.query(BBSPost).filter(
        BBSPost.info_source_id == BAIDU_TIEBA_INFO_SOURCE_ID).count()
    count = 0
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = BAIDU_TIEBA_INFO_SOURCE_ID
    for keyword in KEYWORDS:
        finished = False
        page = 1
        while not finished:
            data = {
                'qw': keyword.str.encode('gbk'),
                'isnew': 1,
                'rn': 20,  # results per page
                'sm': 1,   # sort by time
                'pn': page,
            }
            url = 'http://tieba.baidu.com/f/search/res?' + urllib.urlencode(data)
            headers = {
                'Host': 'tieba.baidu.com',
                'User-Agent': USER_AGENT,
            }
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()
            soup = BeautifulSoup(content.decode('gbk', 'ignore'))
            posts = soup.findAll('div', attrs={'class': 's_post'})
            count = count + len(posts)
            for post in posts:
                url = 'http://tieba.baidu.com' + post.a['href']
                title = post.a.text
                content = post.find('div', attrs={'class': 'p_content'}).text
                bbs_a_tags = post.findAll('a')
                bbs_user_screen_name = bbs_a_tags[-1].text
                created_at_str = post.findAll(
                    'font', attrs={'class': 'p_green'})[-1].text
                created_at = baidu_date_str_to_datetime(created_at_str)
                if created_at < last_time:
                    finished = True
                    break
                # Skip reply posts, whose titles are prefixed with u'回复:'.
                if title[0:3] == u'回复:':
                    continue
                comment_count = get_comment_count(url)
                read_count = 0  # Tieba search results expose no read count
                store_bbs_post(url, bbs_user_screen_name, title, content,
                               BAIDU_TIEBA_INFO_SOURCE_ID, keyword.id,
                               created_at, read_count, comment_count)
                time.sleep(5)  # throttle per-post requests
            time.sleep(10)  # throttle per-page requests
            page = page + 1
            if len(posts) == 0:
                break
    current_real_count = session.query(BBSPost).filter(
        BBSPost.info_source_id == BAIDU_TIEBA_INFO_SOURCE_ID).count()
    sql_job.fetched_info_count = count
    sql_job.real_fetched_info_count = current_real_count - previous_real_count
    session.add(sql_job)
    session.flush()
    session.commit()
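# How these crawl jobs might be driven — a hypothetical harness, not part of
# the original module; the name run_all_search_jobs is ours. Running each
# search in sequence and catching failures keeps one broken source from
# aborting the whole crawl.
def run_all_search_jobs():
    jobs = [
        search_for_baidu_news_posts,
        search_for_sina_blog_posts,
        search_for_baidu_tieba_posts,
    ]
    for job in jobs:
        try:
            job()
        except Exception, e:  # Python 2 syntax, matching this module
            print 'job %s failed: %s' % (job.__name__, e)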