def main():
    try:
        search_for_google_news_posts(OPPONENT_KEYWORDS, OPPONENT_GOOGLE_NEWS_INFO_SOURCE_ID)
    except Exception as e:
        news_logger.exception(e)
        store_error(OPPONENT_GOOGLE_NEWS_INFO_SOURCE_ID)
def main():
    try:
        search_for_tianya_bbs_posts()
        search_for_tianya_bbs_315_posts()
    except Exception as e:
        store_error(TIANYA_INFO_SOURCE_ID)
        bbs_logger.exception(e)
def main():
    try:
        search_for_new_statuses()
        refresh_monitoring_status()
    except Exception as e:
        store_error(QQ_WEIBO_INFO_SOURCE_ID)
        weibo_logger.exception(e)
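# `store_error` is called from every entry point in this section but is not
# defined here. A minimal sketch of what it plausibly does, assuming the
# failure is recorded against the info source so a broken crawler can be
# flagged later; the `ErrorRecord` model is hypothetical, not the project's
# actual schema.
def store_error(info_source_id):
    error = ErrorRecord()
    error.info_source_id = info_source_id
    error.occurred_at = datetime.now()
    session.add(error)
    session.commit()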
def search_for_baidu_zhidao_posts():
    previous_real_count = session.query(WikiPost).filter(
        WikiPost.info_source_id == BAIDU_ZHIDAO_INFO_SOURCE_ID).count()
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = BAIDU_ZHIDAO_INFO_SOURCE_ID
    count = 0
    last_count = 0
    for keyword in KEYWORDS:
        data = {
            'word': keyword.str.encode('utf8'),
            'ie': 'utf-8',
            'sort': 1,
            'lm': 0,
            'date': 2,
            'oa': 0,
            'sites': -1,
        }
        url = "http://zhidao.baidu.com/search?" + urllib.urlencode(data)
        headers = {
            'Host': 'zhidao.baidu.com',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
        }
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        content = response.read()
        soup = BeautifulSoup(content)
        posts = soup.findAll('dl', attrs={'class': "dl"})
        count += len(posts)
        for post in posts:
            try:
                url = post.dt.a['href']
                comment_count = 0
                answer_link = post.find('a', attrs={'log': "pos:ans"})
                if answer_link:
                    # the link text reads like u'3个回答' ("3 answers")
                    comment_count_str = answer_link.text
                    tail = comment_count_str.find(u'个回答')
                    comment_count = int(comment_count_str[:tail])
                store_by_wiki_url(url, comment_count, keyword.id)
            except Exception as e:
                store_error(BAIDU_ZHIDAO_INFO_SOURCE_ID)
                wiki_logger.exception(e)
        # back off for 30 minutes once more than 30 new posts have
        # accumulated since the last pause
        if count - last_count > 30:
            time.sleep(1800)
            last_count = count
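# `store_by_wiki_url` is not defined in this section either. A sketch under
# the assumption that it deduplicates on URL: insert a new WikiPost the first
# time a URL is seen, otherwise just refresh its answer count. The column
# names here are guesses, not the project's actual schema.
def store_by_wiki_url(url, comment_count, keyword_id):
    post = session.query(WikiPost).filter(WikiPost.url == url).first()
    if post is None:
        post = WikiPost()
        post.url = url
        post.keyword_id = keyword_id
        post.info_source_id = BAIDU_ZHIDAO_INFO_SOURCE_ID
        post.created_at = datetime.now()
        session.add(post)
    post.comment_count = comment_count
    session.commit()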
def main():
    try:
        start_time = datetime.now()
        search_for_baidu_tieba_posts()
        end_time = datetime.now()
        consume_time = end_time - start_time
        bbs_logger.info("baidu tieba consume time: " + str(consume_time))
    except Exception as e:
        store_error(BAIDU_TIEBA_INFO_SOURCE_ID)
        bbs_logger.exception(e)
def main():
    try:
        start_time = datetime.now()
        do_login('yoyo_worms', 'bI9eK4NF')
        search_for_baidu_zhidao_posts()
        end_time = datetime.now()
        consume_time = end_time - start_time
        wiki_logger.info("baidu zhidao consume time: " + str(consume_time))
    except Exception as e:
        store_error(BAIDU_ZHIDAO_INFO_SOURCE_ID)
        wiki_logger.exception(e)
def main():
    try:
        start_time = datetime.now()
        search_for_soso_wenwen_posts()
        end_time = datetime.now()
        consume_time = end_time - start_time
        wiki_logger.info("soso wenwen consume time: " + str(consume_time))
    except Exception as e:
        store_error(SOSO_WENWEN_INFO_SOURCE_ID)
        wiki_logger.exception(e)
def update_by_weibo_id(id, origin_id):
    try:
        api_status = api.tweet.show(origin_id)
        sql_status = session.query(Status).get(id)
        sql_status.repost_count = api_status.count
        sql_status.comment_count = api_status.mcount
        session.commit()
        store_category('weibo', str(sql_status.id))
    except Exception as e:  # typically an APIError from the QQ weibo SDK
        store_error(QQ_WEIBO_INFO_SOURCE_ID)
        weibo_logger.exception(e)
def main():
    try:
        search_for_tudou_video_posts()
    except Exception as e:
        store_error(TUDOU_INFO_SOURCE_ID)
        video_logger.exception(e)
def search_for_tudou_video_posts():
    previous_real_count = session.query(VideoPost).filter(
        VideoPost.info_source_id == TUDOU_INFO_SOURCE_ID).count()
    count = 0
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = TUDOU_INFO_SOURCE_ID
    html_parser = HTMLParser.HTMLParser()
    for keyword in KEYWORDS:
        page = 1
        finished = False
        while not finished and page <= 10:
            url = ("http://www.soku.com/t/nisearch/"
                   + urllib.quote_plus(keyword.str.encode('utf8'))
                   + '/_cid__sort_date_display_album_time_0_page_'
                   + str(page) + '?sfilter=1')
            page += 1
            headers = {
                'Host': 'www.soku.com',
                'Referer': 'http://www.soku.com/search_video/',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
            }
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()
            soup = BeautifulSoup(content)
            posts = soup.findAll('div', attrs={'class': "v"})
            count += len(posts)
            if len(posts) == 0:
                finished = True
                break
            for post in posts:
                try:
                    video_user_screen_name = post.find(
                        'span', attrs={'class': "username"}).text
                    deltatime = post.find('span', attrs={'class': "pub"}).text
                    v_meta_title = post.find('div', attrs={'class': "v-meta-title"})
                    title = html_parser.unescape(v_meta_title.a['title'])
                    url = v_meta_title.a['href']
                    try:
                        v_meta_entry = post.find('div', attrs={'class': "v-meta-entry"})
                        v_meta_datas = v_meta_entry.findAll('div', attrs={'class': "v-meta-data"})
                        playcount = v_meta_datas[1].text
                        playcount = playcount[playcount.find(":") + 1:]
                        playcount = int(playcount.replace(',', ''))
                    except Exception:
                        playcount = 0
                    # re-filter by keyword, since the search is fuzzy
                    if not recheck_title(keyword, title):
                        continue
                    try:
                        created_at = convertTime(deltatime)
                        if created_at == -1:
                            continue
                    except Exception:
                        created_at = datetime.now()
                    store_by_tudou_video_url(url, keyword.id, title,
                                             video_user_screen_name,
                                             created_at, playcount)
                    time.sleep(5)
                except Exception as e:
                    store_error(TUDOU_INFO_SOURCE_ID)
                    video_logger.exception(e)
            # throttle between result pages
            time.sleep(5)
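# `convertTime` is used by both video crawlers but not defined in this
# section. A minimal sketch, assuming Soku/Sina-style timestamps: relative
# strings such as u'3小时前' ("3 hours ago") or absolute dates such as
# u'2013-01-15'. The callers above treat a return value of -1 as
# "unparseable, skip this post", so that convention is kept here.
import re
from datetime import datetime, timedelta

def convertTime(timestr):
    now = datetime.now()
    match = re.match(u'(\d+)(分钟|小时|天)前', timestr)
    if match:
        value = int(match.group(1))
        unit = match.group(2)
        delta = {u'分钟': timedelta(minutes=value),
                 u'小时': timedelta(hours=value),
                 u'天': timedelta(days=value)}[unit]
        return now - delta
    try:
        return datetime.strptime(timestr, '%Y-%m-%d')
    except ValueError:
        return -1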
def main():
    try:
        search_for_youku_video_posts()
    except Exception as e:
        store_error(YOUKU_INFO_SOURCE_ID)
        video_logger.exception(e)
def main():
    try:
        search_for_baidu_news_posts(KEYWORDS, BAIDU_NEWS_INFO_SOURCE_ID)
    except Exception as e:
        news_logger.exception(e)
        store_error(BAIDU_NEWS_INFO_SOURCE_ID)
def main():
    try:
        search_for_sina_video_posts()
    except Exception as e:
        store_error(SINA_VIDEO_INFO_SOURCE_ID)
        video_logger.exception(e)
def search_for_sina_video_posts():
    previous_real_count = session.query(VideoPost).filter(
        VideoPost.info_source_id == SINA_VIDEO_INFO_SOURCE_ID).count()
    count = 0
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = SINA_VIDEO_INFO_SOURCE_ID
    for keyword in KEYWORDS:
        page = 1
        finished = False
        while not finished and page <= 10:
            url = ("http://video.sina.com.cn/search/index.php?k="
                   + urllib.quote_plus(keyword.str.encode('utf8'))
                   + "&m1=a&m3=a2&page=" + str(page))
            page += 1
            headers = {
                'Host': 'video.sina.com.cn',
                'Referer': 'http://video.sina.com.cn/search/index.php?',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
            }
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()
            soup = BeautifulSoup(content)
            video_list = soup.find('div', id="contentH")
            if video_list is None:
                finished = True
                break
            divs = video_list.findAll('div')
            if len(divs) == 0:
                finished = True
                break
            tr_arr = video_list.findAll('tr')
            temp_arr = []
            # results come in row pairs: even rows carry title/link,
            # the following odd row carries uploader, date and play count
            for i, tr in enumerate(tr_arr):
                try:
                    if i % 2 == 0:
                        temp_arr = []
                        td_divs = tr.findAll('div', attrs={'class': "v_Info"})
                        for j, td_div in enumerate(td_divs):
                            name_div = td_div.find('div', attrs={'class': 'name'})
                            a_tag = name_div.findAll('a')[1]
                            temp_arr.append({
                                'video_url': a_tag['href'],
                                'video_title': a_tag['title'],
                            })
                    else:
                        td_divs = tr.findAll('div', attrs={'class': "v_Info"})
                        for j, td_div in enumerate(td_divs):
                            li_arr = td_div.findAll('li')
                            try:
                                video_user = li_arr[0].a['title']
                            except Exception:
                                video_user = li_arr[0].a.text
                            video_createAt = li_arr[1].text
                            created_at = convertTime(video_createAt)
                            video_url = temp_arr[j]['video_url']
                            video_title = temp_arr[j]['video_title']
                            try:
                                play_count = li_arr[2].text
                                play_count = play_count[3:]
                                play_count = int(play_count.replace(',', ''))
                            except Exception:
                                play_count = 0
                            # secondary filter on keyword and time
                            if created_at != -1 and recheck_title(keyword, video_title):
                                store_by_sina_video_url(video_url, keyword.id,
                                                        video_title, video_user,
                                                        created_at, play_count)
                except Exception as e:
                    store_error(SINA_VIDEO_INFO_SOURCE_ID)
                    video_logger.exception(e)
            # throttle between result pages
            time.sleep(5)
        # throttle between keywords
        time.sleep(5)
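# `recheck_title` is the secondary keyword filter referenced in both video
# crawlers above. The search endpoints match loosely, so titles are
# re-checked before storing. A one-line sketch, assuming a plain substring
# test against the keyword text:
def recheck_title(keyword, title):
    return keyword.str in title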
def main():
    try:
        search_for_sina_blog_posts()
    except Exception as e:
        store_error(SINA_BLOG_INFO_SOURCE_ID)
        blog_logger.exception(e)
def main():
    try:
        search_for_youku_global_video_posts()
    except Exception as e:
        store_error(ALL_VIDEO_INFO_SOURCE_ID)
        video_logger.exception(e)