def catch(account_id):  # the core crawl logic lives here
    u"""
    :param target_url: https://xueqiu.com/4065977305
    :return:
    """
    mock_sleep_time = 0.5
    article_url_index_list = []
    # work out the max page number
    # url = 'http://chuansong.me/account/{}'.format(account_id)
    # front_page_content = Http.get_content(url)
    # max_page = XueQiuWorker.parse_max_page(front_page_content)
    column_info = {}
    column_info[u'column_id'] = account_id
    column_info[u'title'] = ""
    column_info['article_count'] = 0
    column_info['follower_count'] = 0
    column_info['description'] = ''
    column_info['image_url'] = ''
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    strtT = '1558513651020'  # stream cursor, e.g.
    # https://api.wallstreetcn.com/apiv1/content/themes/stream/1005680?type=newest&cursor=1558066610478&limit=20
    max_page = 2
    index_work_set = OrderedDict()
    # collect article URLs from every index page
    for raw_front_page_index in range(1, max_page):
        resuorcecatch(account_id, strtT)  # helper defined elsewhere in the project
def catch(account_id):  # the core crawl logic lives here
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # build the column record
    column_info = FileColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = "毛泽东军事文选"
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    star_page = 0
    max_page = 1
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # walk the local folder and parse every HTML file in it
    path = '/Users/ink/Desktop/ht'
    file_list = os.listdir(path)  # every file and directory under the folder
    for i in file_list:
        if str(i).endswith('htm') or str(i).endswith('html'):
            filename = u'/Users/ink/Desktop/ht/{}'.format(i)
            convert_encoding(filename, 'utf-8')  # normalize gb2312 sources to utf-8
            f = open(filename)
            contents = f.read()
            article_info = FileArticleParser(contents).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = i
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            f.close()
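# A minimal sketch of the convert_encoding helper called above, which is not
# defined in this file. It rewrites a file in place as the target encoding,
# trying UTF-8 first and falling back to GB2312/GBK, matching the
# "gb2312 -> utf-8" intent of the call site. The signature is inferred from
# the call site, so treat this as an assumption, not the project's helper.
import codecs

def convert_encoding(filename, target_encoding, source_encodings=('utf-8', 'gb2312', 'gbk')):
    raw = open(filename, 'rb').read()
    for encoding in source_encodings:
        try:
            text = raw.decode(encoding)  # first encoding that decodes cleanly wins
            break
        except UnicodeDecodeError:
            continue
    else:
        return  # nothing decoded cleanly; leave the file untouched
    with codecs.open(filename, 'w', target_encoding) as f:
        f.write(text)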
def catch(account_id):  # the core crawl logic lives here
    article_url_index_list = []
    # work out the max page number
    url = 'http://www.taoguba.com.cn/Article/' + account_id + '/1'
    front_page_content = Http.get_content(url)
    star_page = 1
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            if str(line).__contains__('#'):
                split_url = line.split('#')[0]
                trgId = split_url.split('/')[-2]
                if trgId == account_id:
                    pg = split_url.split('/')[-1]
                    print pg
                    star_page = int(pg)
                    if star_page == 0:
                        star_page = 1
                    else:
                        print star_page
    max_page = 2
    dom = BeautifulSoup(front_page_content, "lxml")
    list_pcyc_l_ = dom.find_all('div', class_="left t_page01")
    try:
        for tgo_tgo_ in list_pcyc_l_:
            linkl = tgo_tgo_.findAll('a')
            tarUrl = linkl[0].get('href')
            max_page = int(tarUrl.split('/')[3])
    except IndexError:
        max_page = 1
    column_info = TGBColumnParser(front_page_content).get_column_info()
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect article URLs from every index page
    # star_page = 100
    for raw_front_page_index in range(star_page, max_page + 1):
        request_url = 'http://www.taoguba.com.cn/Article/' + account_id + '/' + str(raw_front_page_index)
        article_url_index_list.append(request_url)
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links collected: {article_count} articles to fetch".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = TGBArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
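# Several workers in this file re-parse ReadList.txt inline, each slicing
# line.split('#') a little differently. The lines appear to follow a
# "<url>#<title>#<extra>..." convention whose trailing fields vary per site
# (cover image URL, max page, type flag). This helper is an inferred sketch
# of that convention, not a documented format.
def parse_read_list_line(line):
    parts = line.strip('\n').split('#')
    return {
        'url': parts[0],
        'title': parts[1] if len(parts) > 1 else '',
        'extras': parts[2:],  # site-specific: image URL, max page, type flag...
    }

def find_read_list_entry(account_id, path='ReadList.txt'):
    # usage: pick the entry whose URL tail matches the account id
    with open(path, 'r') as f:
        for line in f:
            entry = parse_read_list_line(line)
            if entry['url'].split('/')[-1] == account_id:
                return entry
    return None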
def catch(account_id):  # the core crawl logic lives here
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # work out the max page number
    column_info = Todo3ColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = "新能源汽车"
    column_info['article_count'] = 0
    column_info['follower_count'] = 0
    column_info['description'] = ''
    column_info['image_url'] = ''
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    star_page = 1
    max_page = 1
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect article URLs from every index page
    for raw_front_page_index in range(star_page, max_page):
        request_url = u'https://post.smzdm.com/fenlei/xinnengyuanche/p{}/'.format(raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            catch_counter += 1
            Debug.logger.info(u"Crawl pass {}".format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"Collecting article links on page {raw_front_page_index}, {max_page} pages left".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('div', class_='list-border clearfix')
            for p in list_p_list:
                list_pcyc_li = p.find_all('a')
                li = list_pcyc_li[0]
                tarUrl = li.get('href')
                ttt = str(tarUrl).split("#")[-1]
                print ttt
                if not (ttt is None):
                    article_url_index_list.append(ttt)
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links collected: {article_count} articles to fetch".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = Todo3ArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def catch(account_id):  # the core crawl logic lives here
    u"""
    :param target_url: https://xueqiu.com/4065977305
    :return:
    """
    mock_sleep_time = 0.5
    article_url_index_list = []
    # work out the max page number
    # url = 'http://chuansong.me/account/{}'.format(account_id)
    # front_page_content = Http.get_content(url)
    # max_page = XueQiuWorker.parse_max_page(front_page_content)
    # type parameter: '' = all, 2 = original posts, 5 = replies
    # _url = "http://xueqiu.com/v4/statuses/user_timeline.json?user_id={0}&page={1}&type=2"
    _url = "https://xueqiu.com/v4/statuses/user_timeline.json?user_id={0}&page={1}&type=0"
    first = _url.format(account_id, 1)
    r = Http.get_json_content(first)
    max_page = 1
    try:
        jdata = json.loads(r.text, encoding='utf-8')
        max_page = jdata['maxPage'] + 1
    except KeyError:
        print 'Request failed >>>>>>> check the Cookie'
        # max_page = 1
    # parse the page content and store it in the database
    # the profile page may require a captcha
    content_profile = Http.get_content(u'https://xueqiu.com/u/{}/profile'.format(account_id))
    column_info = XueQiuColumnParser(content_profile).get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = ""
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if split_url.split('/')[-1] == account_id:
                column_info[u'title'] = line.split('#')[1]
                column_info[u'image_url'] = str(line.split('#')[2]).strip('\n')
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect article URLs from every index page
    for raw_front_page_index in range(1, max_page):
        request_url = _url.format(account_id, raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            content = Http.get_content(request_url)
            if not content:
                return
            jdata = json.loads(content)
            articles = jdata['statuses']
            for article in articles:
                article_info = XueQiuArticleParser(article).get_article_info()
                if len(article_info) > 0:
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
            Debug.logger.debug(u"Finished fetching {}".format(request_url))
    return
def catch(account_id):  # the core crawl logic lives here
    mock_sleep_time = 0.5
    base_sleep_time = 1
    max_sleep_time = 1
    article_url_index_list = []
    # work out the max page number
    url = 'http://www.jintiankansha.me/tag/{}?page=1'.format(account_id)
    column_info = JinWanKanSaEmptColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    dt = datetime.datetime.now()
    column_info[u'title'] = u"AI_{}".format(dt.strftime("%Y-%m-%d"))
    max_page = 1
    typeToTry = 'tag'
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if split_url.split('/')[-1] == account_id:
                dt = datetime.datetime.now()
                column_info[u'title'] = u"{}_{}".format(line.split('#')[1], dt.strftime("%Y-%m-%d"))
                max_page = int(line.split('#')[2])
                typeToTry = str(int(line.split('#')[-1])).strip('\n')
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect article URLs from every index page
    for raw_front_page_index in range(0, max_page + 1):
        # request_url = u'http://www.jintiankansha.me/column/{}?page={}'.format(account_id, raw_front_page_index)
        request_url = u'http://www.jintiankansha.me/{}/{}?page={}'.format(typeToTry, account_id, raw_front_page_index)
        print request_url
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            catch_counter += 1
            Debug.logger.info(u"Crawl pass {}".format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"Collecting article links on page {raw_front_page_index}, {max_page} pages left".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'html.parser')
            list_p_list = soup.find_all('span', class_="item_title")
            for tgo_right in list_p_list:
                for link in tgo_right.findAll('a'):
                    ttt = str(link.get('href'))
                    print ttt
                    if not (ttt is None):
                        article_url_index_list.append(ttt)
            del index_work_set[raw_front_page_index]
    # article_url_index_list.append('http://www.jintiankansha.me/t/u8MygoqKI8')
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links collected: {article_count} articles to fetch".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            time.sleep(mock_sleep_time)
            if len(request_url_content) == 0:
                random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                Debug.logger.info(u"Random sleep for {} seconds".format(random_sleep_time))
                time.sleep(random_sleep_time)
                continue
            article_info = JinWanKanSaArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
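# The fetch / sleep / retry dance above recurs in most workers in this file.
# A sketch of it factored into one helper, assuming Http.get_content returns
# an empty string on failure (which is how the call sites above treat it).
# The helper name and the jitter arithmetic are illustrative, not project API.
import random
import time

def get_content_with_backoff(url, retries=20, base_sleep=1.0, jitter=1):
    for _ in range(retries):
        content = Http.get_content(url)
        if content:
            return content
        random_sleep_time = base_sleep + random.randint(0, jitter * 100) / 100.0
        Debug.logger.info(u"Random sleep for {} seconds".format(random_sleep_time))
        time.sleep(random_sleep_time)
    return ''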
def catch(account_id):  # the core crawl logic lives here
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # work out the max page number
    url = 'http://www.gushequ.com/{}/'.format(account_id)
    front_page_content = Http.get_content(url)
    column_info = TodoColumnParser(front_page_content).get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = "股社区"
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    # page windows of the archive, one per year
    star_page = 0
    max_page = 24
    if account_id == '2018':
        star_page = 0
        max_page = 24
    elif account_id == '2017':
        star_page = 24
        max_page = 58
    elif account_id == '2016':
        star_page = 58
        max_page = 92
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect article URLs from every index page
    for raw_front_page_index in range(star_page, max_page):
        request_url = u'http://www.gushequ.com/page/{}/'.format(raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            catch_counter += 1
            Debug.logger.info(u"Crawl pass {}".format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"Collecting article links on page {raw_front_page_index}, {max_page} pages left".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('article')
            for p in list_p_list:
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    tarUrl = li.get('href')
                    ttt = str(tarUrl).split("#")[-1]
                    print ttt
                    if not (ttt is None):
                        article_url_index_list.append(ttt)
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links collected: {article_count} articles to fetch".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = TodoArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def catch(account_id):  # the core crawl logic lives here
    u"""
    :param target_url: https://xueqiu.com/4065977305
    :return:
    """
    mock_sleep_time = 0.5
    base_sleep_time = 5
    max_sleep_time = 30
    article_url_index_list = []
    # https://xueqiu.com/statuses/search.json?count=10&comment=0&symbol=SZ000333&hl=0&source=all&sort=&page=1&q=
    _url = "https://xueqiu.com/statuses/search.json?count=10&comment=0&symbol={0}&hl=0&source=all&sort=alpha&page={1}&q="
    # keyword search variant, e.g. for 霍华德·马克斯 (Howard Marks):
    # _url = "https://xueqiu.com/statuses/search.json?sort=relevance&source=all&q={0}&count=10&page={1}"
    first = _url.format(account_id, 1)
    r = Http.get_json_content(first)
    max_page = 1
    try:
        jdata = json.loads(r.text, encoding='utf-8')
        max_page = jdata['maxPage'] + 1
    except KeyError:
        print 'Request failed >>>>>>> check the Cookie'
        # max_page = 1
    # parse the page content and store it in the database
    # the search endpoint may require a captcha
    max_page = 1
    column_info = XueQiuColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = ""
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if split_url.split('/')[-1] == account_id:
                dt = datetime.datetime.now()
                tit = line.split('#')[1]
                column_info[u'title'] = u"{}_{}".format(tit, dt.strftime("%Y-%m-%d"))
                column_info[u'image_url'] = str(line.split('#')[2]).strip('\n')
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect article URLs from every index page
    for raw_front_page_index in range(1, max_page):
        request_url = _url.format(account_id, raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            print request_url
            content = Http.get_content(request_url)
            if not content:
                random_sleep_time = base_sleep_time + random.randint(2, max_sleep_time) / 10.0
                Debug.logger.info(u"Random sleep for {} seconds".format(random_sleep_time))
                time.sleep(random_sleep_time)
                continue
            # rate-limit response looks like:
            # {"error_description":"您的请求过于频繁,请稍后再试","error_uri":"/statuses/search.json","error_code":"22612"}
            jdata = json.loads(content)
            if 'error_code' in jdata:
                random_sleep_time = base_sleep_time + random.randint(3, max_sleep_time) / 10.0
                Debug.logger.info(u"error_description {}".format(jdata['error_description']))
                time.sleep(random_sleep_time)
                continue
            articles = jdata['list']
            for article in articles:
                article_info = XueQiuArticleParser(article).get_article_info()
                if len(article_info) > 0:
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
            random_sleep_time = 1 + random.randint(3, max_sleep_time) / 10.0
            Debug.logger.info(u"Random sleep for {} seconds".format(random_sleep_time))
            time.sleep(random_sleep_time)
            Debug.logger.debug(u"Finished fetching {}".format(request_url))
    return
def catch(account_id):  # the core crawl logic lives here
    mock_sleep_time = 28
    base_sleep_time = 62
    max_sleep_time = 80
    article_url_index_list = []
    # work out the max page number
    url = 'http://chuansong.me/account/{}'.format(account_id)
    # front_page_content = Http.get_content(url)
    front_page_content = ''
    # max_page = WechatWorker.parse_max_page(front_page_content)
    # if max_page > 200:
    #     max_page = 200
    max_page = 0
    # parse the page content and store it in the database
    column_info = WechatColumnParser(front_page_content).get_column_info()
    column_info[u'column_id'] = account_id
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if str(split_url).__contains__(account_id):
                column_info[u'title'] = str(line.split('#')[1])
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect article URLs from every index page (12 posts per page)
    for raw_front_page_index in range(0, max_page):
        front_page_index = raw_front_page_index * 12
        request_url = url + '?start={}'.format(front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            catch_counter += 1
            Debug.logger.info(u"Crawl pass {}".format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"Collecting article links on page {raw_front_page_index}, {max_page} pages left".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            time.sleep(mock_sleep_time)
            if len(request_url_content) == 0 or catch_counter % 5 == 0:
                random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                Debug.logger.info(u"Random sleep for {} seconds".format(random_sleep_time))
                time.sleep(random_sleep_time)
                continue
            random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 10.0
            Debug.logger.info(u"Random sleep for {} seconds".format(random_sleep_time))
            time.sleep(random_sleep_time)
            article_url_index_list += Match.wechat_article_index(content=request_url_content)
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links collected: {article_count} articles to fetch".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = {}'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = 'http://chuansong.me/n/{}'.format(article_url_index)
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            time.sleep(mock_sleep_time)
            if len(request_url_content) == 0:
                random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                Debug.logger.info(u"Random sleep for {} seconds".format(random_sleep_time))
                time.sleep(random_sleep_time)
                continue
            random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 10.0
            Debug.logger.info(u"Random sleep for {} seconds".format(random_sleep_time))
            time.sleep(random_sleep_time)
            article_info = WechatArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
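# Every worker above checks for an already-saved article by interpolating the
# id straight into the SQL string (and the chuansong variant above even leaves
# it unquoted). A sketch of the same check with a bound parameter, assuming
# DB.query_row can forward a parameter tuple to the underlying sqlite3 cursor;
# if it cannot, the same query works on a raw sqlite3 connection. This is a
# suggested hardening, not the project's current API.
def article_exists(article_id):
    row = DB.query_row(
        'select count(*) as article_count from Article where article_id = ?',
        (article_id,))
    return row['article_count'] > 0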
def catch(account_id):  # the core crawl logic lives here
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # work out the max page number
    url = 'https://www.wuxiareview.com/category/{}'.format(account_id)
    front_page_content = Http.get_content(url)
    column_info = WuXiaColumnParser(front_page_content).get_column_info()
    column_info[u'column_id'] = account_id
    max_page = 2
    if account_id == 'daidai':
        column_info[u'title'] = "吃瓜群众岱岱"
        max_page = 1
    elif account_id == 'gzmdzst':
        column_info[u'title'] = "顾子明的政事堂"
        max_page = 1
    else:
        column_info[u'title'] = "时文"
        max_page = 2
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect article URLs from every index page
    for raw_front_page_index in range(0, max_page):
        request_url = u'https://www.wuxiareview.com/category/{}/{}/'.format(account_id, raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            catch_counter += 1
            Debug.logger.info(u"Crawl pass {}".format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"Collecting article links on page {raw_front_page_index}, {max_page} pages left".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('article', class_="excerpt")
            for p in list_p_list:
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    tarUrl = li.get('href')
                    ttt = str(tarUrl).split("#")[-1]
                    print ttt
                    if not (ttt is None):
                        article_url_index_list.append(ttt)
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links collected: {article_count} articles to fetch".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = WuXiaArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def catch(account_id):  # the core crawl logic lives here
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # work out the max page number
    star_page = 1
    max_page = 1
    column_info = Todo1ColumnParser("").get_column_info()
    column_info[u'column_id'] = account_id
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if str(split_url).__contains__(account_id):
                # Config.now_id_likeName = line.split('#')[1]
                max_page = int(line.split('#')[-1]) + 1
                column_info[u'title'] = str(line.split('#')[1])
    # max_page = 1
    print max_page
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect article URLs from every index page
    for raw_front_page_index in range(star_page, max_page):
        request_url = u'https://www.guancha.cn/{}/list_{}.shtml'.format(account_id, raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            catch_counter += 1
            Debug.logger.info(u"Crawl pass {}".format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"Collecting article links on page {raw_front_page_index}, {max_page} pages left".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('h4', class_="module-title")
            for p in list_p_list:
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    ttt = li.get('href')
                    print ttt
                    if not (ttt is None):
                        # rewrite /xxx.shtml to the full single-page view xxx_s.shtml
                        ss = str(ttt).split('.')
                        article_url_index_list.append(u"https://www.guancha.cn{}_s.{}".format(ss[0], ss[1]))
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links collected: {article_count} articles to fetch".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = Todo1ArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def catch(account_id):  # the core crawl logic lives here
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    url = 'http://xinsheng.huawei.com/{}'.format(account_id)
    front_page_content = Http.get_content(url)
    column_info = HuaWeiColumnParser(front_page_content).get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = "华为家事"
    column_info[u'image_url'] = 'file:///Users/ex-liyan010/Desktop/share/hcover.jpeg'
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    max_page = 0
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect article URLs from every index page
    for raw_front_page_index in range(0, max_page):
        request_url = u'http://xinsheng.huawei.com/cn/index.php?app=forum&mod=List&act=index&class=461&order=cTime&type=&sign=&special=&cate=155&p={}'.format(raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            catch_counter += 1
            Debug.logger.info(u"Crawl pass {}".format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"Collecting article links on page {raw_front_page_index}, {max_page} pages left".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, "lxml")
            content_dom = soup.find_all('div', class_="bbs_list")[0]
            font_box_dom = content_dom.find_all('div', class_="font_box")
            for xx in font_box_dom:
                linkl = xx.findAll('a')
                tarUrl = linkl[0].get('href')
                print tarUrl
                article_url_index_list.append(tarUrl)
            del index_work_set[raw_front_page_index]
    # hand-picked posts that are always included
    article_url_index_list.append('http://xinsheng.huawei.com/cn/index.php?app=forum&mod=Detail&act=index&id=4343641')
    article_url_index_list.append('http://xinsheng.huawei.com/cn/index.php?app=forum&mod=Detail&act=index&id=4340813')
    article_url_index_list.append('http://xinsheng.huawei.com/cn/index.php?app=group&mod=Bbs&act=detail&tid=4346331')
    article_url_index_list.append('http://xinsheng.huawei.com/cn/index.php?app=group&mod=Bbs&act=detail&tid=4347493')
    article_url_index_list.append('http://xinsheng.huawei.com/cn/index.php?app=group&mod=Bbs&act=detail&tid=4342141')
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links collected: {article_count} articles to fetch".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = HuaWeiArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def write_txt(self):
    try:
        if self.filter:
            result_header = u"\n\nOriginal weibo posts:\n"
        else:
            result_header = u"\n\nWeibo posts:\n"
        result = (u"User info\nScreen name: " + self.username +
                  u"\nUser id: " + str(self.user_id) +
                  u"\nPost count: " + str(self.weibo_num) +
                  u"\nFollowing: " + str(self.following) +
                  u"\nFollowers: " + str(self.followers) +
                  result_header)
        column_info = {}
        column_info[u'column_id'] = str(self.user_id)
        column_info[u'title'] = self.username
        column_info['article_count'] = str(self.weibo_num)
        column_info['follower_count'] = str(self.following)
        column_info['description'] = ''
        column_info['image_url'] = ''
        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])
        for i in range(1, self.weibo_num2 + 1):
            text = (str(i) + ":" + self.weibo_content[i - 1] + "\n" +
                    u"Location: " + self.weibo_place[i - 1] + "\n" +
                    u"Published: " + self.publish_time[i - 1] + "\n" +
                    u"Likes: " + str(self.up_num[i - 1]) +
                    u" Reposts: " + str(self.retweet_num[i - 1]) +
                    u" Comments: " + str(self.comment_num[i - 1]) + "\n" +
                    u"Client: " + self.publish_tool[i - 1] + "\n\n")
            result = result + text
            article_info = {}
            article_info['article_id'] = u'{}'.format(i)
            article_info['column_id'] = str(self.user_id)
            article_info['title'] = u'{}'.format(self.weibo_num2 + 1 - i)  # reverse numbering
            article_info['content'] = self.weibo_content[i - 1]
            article_info['updated_time'] = self.publish_time[i - 1]
            article_info['voteup_count'] = ""
            article_info['comment_count'] = ""
            article_info['image_url'] = ''
            article_info['author_id'] = str(self.user_id)
            article_info['author_name'] = self.username
            article_info['author_headline'] = ''
            article_info['author_avatar_url'] = 'https://pic4.zhimg.com/v2-38a89e42b40baa7d26d99cab9a451623_xl.jpg'
            article_info['author_gender'] = '0'
            Worker.save_record_list(u'Article', [article_info])
        # file_dir = os.path.split(os.path.realpath(__file__))[0] + os.sep + "weibo"
        # if not os.path.isdir(file_dir):
        #     os.mkdir(file_dir)
        # file_path = file_dir + os.sep + "%d" % self.user_id + ".txt"
        # f = open(file_path, "wb")
        # f.write(result.encode(sys.stdout.encoding))
        # f.close()
        # print(file_path)
        print(u"Weibo posts written out")
    except Exception as e:
        print("Error: ", e)
        traceback.print_exc()
def catch(account_id):  # the core crawl logic lives here
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # work out the max page number
    column_info = Todo2ColumnParser("").get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = "纽约时报"
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    star_page = 0
    max_page = 0
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect article URLs from every index page
    for raw_front_page_index in range(star_page, max_page):
        request_url = u'https://cn.nytimes.com/search/data/?query=DAVID%20BARBOZA&lang=&dt=json&from={}&size=10'.format(raw_front_page_index * 10)
        # request_url = u'https://cn.nytimes.com/real-estate/{}/'.format(raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            catch_counter += 1
            Debug.logger.info(u"Crawl pass {}".format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"Collecting article links on page {raw_front_page_index}, {max_page} pages left".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            content = Http.get_content(request_url)  # the JSON response carries the article URLs
            if not content:
                return
            jdata = json.loads(content)
            articles = jdata['items']
            for article in articles:
                print article['headline']
                uur = article['web_url_with_host']
                print uur
                article_url_index_list.append(uur)
            # an earlier HTML-scraping variant parsed h3.regularSummaryHeadline
            # anchors and prefixed relative links with https://cn.nytimes.com
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links collected: {article_count} articles to fetch".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = Todo2ArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def catch(account_id):  # the core crawl logic lives here
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # work out the max page number
    url = 'http://www.360doc.com/userhome/{}'.format(account_id)
    front_page_content = Http.get_content(url)
    column_info = Doc360ColumnParser(front_page_content).get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = "明公"
    max_page = 2
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect article URLs from every index page (paged AJAX endpoint)
    for raw_front_page_index in range(0, max_page):
        request_url = u"http://www.360doc.com/ajax/getUserArticle.aspx?pagenum=50&curnum={}&icid=13&ishowabstract=null&word=&userid={}&isoriginal=0&_={}"
        urequest_url = request_url.format(raw_front_page_index, account_id, int(time.time() * 1000))
        index_work_set[raw_front_page_index] = urequest_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            catch_counter += 1
            Debug.logger.info(u"Crawl pass {}".format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"Collecting article links on page {raw_front_page_index}, {max_page} pages left".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('div', class_="list listwz1 font14")
            for p in list_p_list:
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    tarUrl = li.get('href')  # deep level
                    print tarUrl
                    article_url_index_list.append(tarUrl)
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links collected: {article_count} articles to fetch".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = Doc360ArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def catch(account_id):  # the core crawl logic lives here
    mock_sleep_time = 0.5
    base_sleep_time = 1
    max_sleep_time = 1
    article_url_index_list = []
    # build the column record; the article URLs come from a local list file,
    # so the ReadList.txt lookup used by other workers is disabled here
    column_info = WeiXinColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = account_id
    column_info[u'image_url'] = 'https://wpimg.wallstcn.com/3598b719-ab0d-4be7-bc09-30c3ae29a3cc.jpg?imageView2/1/w/240/h/240'
    max_page = 1
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    # single-URL testing hooks:
    # article_url_index_list.append('https://mp.weixin.qq.com/s?__biz=MjM5MjczNDc0Mw==&mid=2650847984&idx=2&sn=b7b111e5964d2f2fb568ba0d419e3edf&chksm=bd55d1888a22589e2f3bab0613b346427079efc6b82fac869d4f78244a500c3e5cc8cb8402ed&scene=21#wechat_redirect')
    # article_url_index_list.append('https://mp.weixin.qq.com/s/yj1BT3jWyxLjlEnzz0vEtQ')
    with open('/Users/0/Desktop/list.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            article_url_index_list.append(str(line).strip('\n'))
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links collected: {article_count} articles to fetch".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            time.sleep(mock_sleep_time)
            if len(request_url_content) == 0:
                random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                Debug.logger.info(u"Random sleep for {} seconds".format(random_sleep_time))
                time.sleep(random_sleep_time)
                continue
            # swap in the parser matching the source of the listed URLs:
            # Todo2ArticleParser / HuXiuArticleParser / WallStreetArticleParser
            article_info = WeiXinArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def catch(account_id):  # the core crawl logic lives here
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # work out the max page number
    url = 'https://www.huxiu.com/{}'.format(account_id)
    front_page_content = Http.get_content(url)
    column_info = HuXiuColumnParser(front_page_content).get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = account_id
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    u_result = urllib.quote(account_id.decode(sys.stdin.encoding).encode('utf8'))
    print account_id
    max_page = 2
    idds = ''
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if split_url.split('/')[-1] == account_id:
                # Config.now_id_likeName = line.split('#')[1]
                max_page = int(line.split('#')[-1]) + 1
                idds = str(line.split('#')[1])
    print max_page
    max_page = -1  # search paging disabled; only the hand-picked URL below is fetched
    # parse the page content and store it in the database
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect article URLs from every index page
    for raw_front_page_index in range(0, max_page + 1):
        # https://www.huxiu.com/search.html?s=%E5%B7%B4%E8%8F%B2%E7%89%B9&sort=dateline:desc
        request_url = u'https://www.huxiu.com/search.html?s={}&sort=dateline%3Adesc&per_page={}'.format(u_result, raw_front_page_index)
        # request_url = u'https://www.huxiu.com/member/{}/article/{}.html'.format(idds, raw_front_page_index)
        # request_url = 'https://www.huxiu.com/member/1872007.html'
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
            catch_counter += 1
            Debug.logger.info(u"Crawl pass {}".format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"Collecting article links on page {raw_front_page_index}, {max_page} pages left".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, "lxml")
            list_pcyc_l_ = soup.find_all('li')
            # list_pcyc_l_ = soup.find_all('div', class_='mob-ctt')
            for tgo_right in list_pcyc_l_:
                for link in tgo_right.findAll('a'):
                    hre = str(link.get('href'))
                    if hre.startswith('/article/', 0, 10):
                        print u'https://www.huxiu.com{}'.format(link.get('href'))
                        article_url_index_list.append('https://www.huxiu.com{}'.format(link.get('href')))
            del index_work_set[raw_front_page_index]
    article_url_index_list.append('https://www.huxiu.com/article/299355.html')
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links collected: {article_count} articles to fetch".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = HuXiuArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def catch(account_id):  # the core crawl logic lives here
    mock_sleep_time = 0.5
    article_url_index_list = []
    # work out the max page number
    url = 'http://blog.sina.com.cn/s/articlelist_{}_11_1.html'.format(account_id)
    front_page_content = Http.get_content(url)
    article_num = SinaWorker.parse_max_page(front_page_content)
    href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(account_id)
    content_profile = Http.get_content(href_profile)
    column_info = SinaColumnParser(content_profile).get_column_info()
    column_info[u'column_id'] = account_id
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    # 50 hrefs per listing page
    if article_num % 50 != 0:
        page_num = article_num / 50 + 1
    else:
        page_num = article_num / 50
    Debug.logger.info(u"Max page count found: {max_page} pages in total".format(max_page=page_num))
    index_work_set = OrderedDict()
    for page in range(page_num):
        url = 'http://blog.sina.com.cn/s/articlelist_{}_11_{}.html'.format(account_id, page + 1)
        content_article_list = Http.get_content(url)
        soup = BeautifulSoup(content_article_list, "lxml")
        article_list = soup.select('span.atc_title a')
        for item in range(len(article_list)):
            article_title = ParserTools.get_attr(article_list[item], 'href')
            index_work_set[item] = article_title
        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set.keys():  # keys() copy: safe to delete during iteration
                request_url = index_work_set[article_url_index]
                Debug.logger.info(u"Fetching article {countert}, {article_count} left".format(
                    countert=article_url_index, article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)
                time.sleep(mock_sleep_time)
                if len(request_url_content) == 0:
                    Debug.logger.info(u"Sleeping {} seconds".format(mock_sleep_time))
                    time.sleep(mock_sleep_time)
                    continue
                article_info = SinaArticleParser(request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
    return
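# Every catch() above repeats the same retry skeleton: an OrderedDict of
# pending work, up to 20 passes, entries deleted on success. This is a
# consolidation sketch of that shared pattern, not code from the project;
# process(url) is any callable that returns True when an item succeeded.
def drain_work_set(work_set, process, max_passes=20):
    passes = 0
    while len(work_set) > 0 and passes <= max_passes:
        passes += 1
        for key in work_set.keys():  # keys() copy: safe to delete while iterating (Python 2)
            if process(work_set[key]):
                del work_set[key]

# usage sketch:
#   index_work_set = OrderedDict((url, url) for url in article_url_index_list)
#   drain_work_set(index_work_set, lambda url: bool(Http.get_content(url)))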