def __get_question_list(self):
    question_list = [DB.wrap('question', x) for x in DB.get_result_list(self.sql.question)]
    answer_list = [DB.wrap('answer', x) for x in DB.get_result_list(self.sql.get_answer_sql())]

    def merge_answer_into_question():
        question_dict = {x['question_id']: {'question': x.copy(), 'answer_list': [], 'agree': 0} for x in question_list}
        for answer in answer_list:
            question_dict[answer['question_id']]['answer_list'].append(answer)
        return question_dict.values()

    def add_property(question):
        agree_count = 0
        char_count = 0
        for answer in question['answer_list']:
            answer['char_count'] = len(answer['content'])
            answer['agree_count'] = answer['agree']
            answer['update_date'] = answer['edit_date']
            agree_count += answer['agree']
            char_count += answer['char_count']
        question['answer_count'] = len(question['answer_list'])
        question['agree_count'] = agree_count
        question['char_count'] = char_count
        return question

    question_list = [add_property(x) for x in merge_answer_into_question() if len(x['answer_list'])]
    return question_list
def login(self, account, password, captcha=''):
    content = Http.get_content('https://www.zhihu.com/')
    xsrf = Match.xsrf(content)
    if not xsrf:
        Debug.logger.info(u'登陆失败')
        Debug.logger.info(u'敲击回车重新发送登陆请求')
        return False
    xsrf = xsrf.split('=')[1]
    # add xsrf as cookie into cookieJar
    cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
    self.cookieJar.set_cookie(cookie)
    if captcha:
        post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True, 'captcha': captcha}
    else:
        post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True}
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate',  # Key header: as long as this request looks browser-like, Zhihu does not treat it as coming from a script
        'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
        'Host': 'www.zhihu.com',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://www.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
    }
    result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
    if not result:
        Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
        return False
    response = json.loads(result)
    if response['r'] == 0:
        print u'登陆成功!'
        print u'登陆账号:', account
        print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认'
        if raw_input() == 'yes':
            Config.account, Config.password, Config.remember_account = account, password, True
            print u'帐号密码已保存,可通过修改config.json修改设置'
        else:
            Config.account, Config.password, Config.remember_account = '', '', False
            print u'跳过保存环节,进入下一流程'
        Config._save()
        cookie = self.get_cookie()
        data = {}
        data['account'] = account
        data['password'] = password
        data['recordDate'] = ExtraTools.get_today()
        data['cookieStr'] = cookie
        DB.save(data, 'LoginRecord')
        DB.commit()
        return True
    else:
        print u'登陆失败'
        Debug.print_dict(response)
        return False
def clear_index(self):
    topic_id_tuple = tuple(set(x['topic_id'] for x in self.topic_index_list))
    sql = 'DELETE from TopicIndex where topic_id in ({})'.format((' ?,' * len(topic_id_tuple))[:-1])
    DB.cursor.execute(sql, topic_id_tuple)
    DB.commit()
    return
def save_record_list(table_name, record_list):
    """
    Save the records into the database
    :return:
    """
    for record in record_list:
        DB.save(record, table_name)
    DB.commit()
    return
def save(self):
    self.clear_index()
    save_config = self.create_save_config()
    for key in save_config:
        for item in save_config[key]:
            if item:
                DB.save(item, key)
    DB.commit()
    return
def query_answer_list_by_author_page_id(self, author_page_id):
    # Look up the corresponding author_id first
    author_info = DB.query_row(u'select author_id from Author where author_page_id="{author_page_id}"'.format(author_page_id=author_page_id))
    author_id = author_info[u'author_id']
    raw_answer_list = DB.query_all(u'select * from Answer where author_id="{author_id}" {order_by} '.format(author_id=author_id, order_by=Config.answer_order_by))
    answer_list = []
    for raw_answer in raw_answer_list:
        answer_list.append(self.format_answer(raw_answer))
    return answer_list
def __init__(self):
    # Initialize the directory structure
    Path.init_base_path()
    Path.init_work_directory()
    # Initialize the database connection
    DB.init_database()
    # Initialize the configuration
    Config.init_config()
    return
def __get_article_list(self):
    def add_property(article):
        article['char_count'] = len(article['content'])
        article['agree_count'] = article['agree']
        article['update_date'] = article['publish_date']
        article['answer_count'] = 1
        return article

    article_list = [DB.wrap(Type.article, x) for x in DB.get_result_list(self.sql.get_answer_sql())]
    article_list = [add_property(x) for x in article_list]
    return article_list
def __get_article_list(self):
    def add_property(article):
        article['char_count'] = len(article['content'])
        article['answer_count'] = 1
        # TODO
        if self.kind in [Type.jianshu_author, Type.jianshu_collection, Type.jianshu_notebooks, Type.sinablog_author, Type.csdnblog_author]:
            article['agree_count'] = "没有赞同数"  # article['agree']
        else:
            article['agree_count'] = article['agree']
        article['update_date'] = article['publish_date']
        return article

    if self.kind in [Type.jianshu_author, Type.jianshu_collection, Type.jianshu_notebooks]:
        article_list = [DB.wrap(Type.jianshu_article, x) for x in DB.get_result_list(self.sql.get_answer_sql())]
    elif self.kind == Type.sinablog_author:
        article_list = [DB.wrap(Type.sinablog_article, x) for x in DB.get_result_list(self.sql.get_answer_sql())]
    elif self.kind == Type.csdnblog_author:
        article_list = [DB.wrap(Type.csdnblog_article, x) for x in DB.get_result_list(self.sql.get_answer_sql())]
    else:
        article_list = [DB.wrap(Type.article, x) for x in DB.get_result_list(self.sql.get_answer_sql())]
    article_list = [add_property(x) for x in article_list]
    return article_list
def query_answer_list_by_question_id(self, question_id):
    raw_answer_list = DB.query_all(u'select * from Answer where question_id="{question_id}" {order_by} '.format(question_id=question_id, order_by=Config.answer_order_by))
    answer_list = []
    for raw_answer in raw_answer_list:
        answer_list.append(self.format_answer(raw_answer))
    return answer_list
def catch_article_book_info(self, sql):
    info_list = DB.cursor.execute(self.sql.info).fetchall()
    info_list = [DB.wrap(Type.article, item) for item in info_list]
    info = {}
    info['title'] = '_'.join([str(item['title']) for item in info_list])
    info['id'] = '_'.join([str(item['article_id']) for item in info_list])
    return info
def query_question(self, question_id):
    """
    :rtype: Question_Info
    """
    question = DB.query_row(u'select * from Question where question_id in ({question_id})'.format(question_id=question_id))
    question = self.format_question(question)  # Wrap into the standard info format
    return question
def query_question_list(self, question_id_list):
    raw_question_list = DB.query_all(u'select * from Question where question_id in ({question_id_list})'.format(question_id_list=','.join(question_id_list)))
    question_list = []
    for raw_question in raw_question_list:
        question_list.append(self.format_question(raw_question))
    return question_list
def catch_info(self):
    info = {}
    if self.property.sql.info:
        info = DB.cursor.execute(self.property.sql.info).fetchone()
        info = DB.wrap(Type.info_table[self.kind], info)
    self.set_info(info)
    return
def catch_question_book_info(self, sql):
    info_list = DB.cursor.execute(self.sql.info).fetchall()
    info_list = [DB.wrap(Type.question, item) for item in info_list]
    info = dict()
    info['title'] = '_'.join([str(item['title']) for item in info_list])
    # There may be multiple questions; join their ids together
    info['id'] = '_'.join([str(item['question_id']) for item in info_list])
    return info
def query_article_list_by_column_id(self, column_id):
    # Fetch the article list ordered by publish time (ascending) for easier browsing
    raw_article_list = DB.query_all(u'select * from Article where column_id="{column_id}" {order_by}'.format(column_id=column_id, order_by=Config.article_order_by))
    article_list = []
    for raw_article in raw_article_list:
        article = self.format_article(raw_article)
        article_list.append(article)
    return article_list
def query_article_list_by_column_id(self, column_id):
    # Fetch the article list ordered by publish time (ascending) for easier browsing
    print u'select * from Article where column_id = "{column_id}" {order_by}'.format(column_id=column_id, order_by=Config.article_order_by)
    raw_article_list = DB.query_all(u'select * from Article where column_id = "{column_id}" {order_by}'.format(column_id=column_id, order_by=Config.article_order_by))
    article_list = []
    for raw_article in raw_article_list:
        article = self.format_article(raw_article)
        article_list.append(article)
    return article_list
def query_answer(self, answer_id):
    """
    :type answer_id: int
    :return:
    """
    answer = DB.query_row(u'select * from Answer where answer_id in ({answer_id})'.format(answer_id=answer_id))
    answer = self.format_answer(answer)
    return answer
def catch_csdnblog_book_info(self):
    u"""
    :return:
    """
    info_list = DB.cursor.execute(self.sql.info).fetchall()
    info_list = [DB.wrap(Type.csdnblog_info, item) for item in info_list]
    info = dict()
    info['creator_name'] = '_'.join([str(item['creator_name']) for item in info_list])
    info['creator_id'] = '_'.join([str(item['creator_id']) for item in info_list])
    return info
def catch_jianshu_book_info(self):
    u"""
    :return: info
    """
    info_list = DB.cursor.execute(self.sql.info).fetchall()
    info_list = [DB.wrap(Type.jianshu_info, item) for item in info_list]
    info = dict()
    # Multiple blogs can be combined together  TODO: remove???
    info['creator_name'] = '_'.join([str(item['creator_name']) for item in info_list])
    info['creator_id'] = '_'.join([str(item['creator_id']) for item in info_list])
    return info
def catch_info(self):
    info = {}
    if self.sql.info:
        if self.kind in [Type.question, Type.answer]:
            info = self.catch_question_book_info(self.sql.info)
        elif self.kind == Type.article:
            info = self.catch_article_book_info(self.sql.info)
        else:
            info = DB.cursor.execute(self.sql.info).fetchone()
            info = DB.wrap(Type.info_table[self.kind], info)
    self.set_info(info)
    return
def catch_jianshu_book_info(self):
    u"""
    :return: info
    """
    info_list = DB.cursor.execute(self.sql.info).fetchall()
    info_list = [DB.wrap(Type.jianshu_info, item) for item in info_list]
    info = {}
    info['creator_name'] = '_'.join([str(item['creator_name']) for item in info_list])  # Multiple blogs can be combined together
    info['creator_id'] = '_'.join([str(item['creator_id']) for item in info_list])
    return info
def catch_sinablog_book_info(self):
    u"""
    :return: info
    """
    info_list = DB.cursor.execute(self.sql.info).fetchall()
    info_list = [DB.wrap(Type.sinablog_info, item) for item in info_list]
    info = dict()
    info['creator_name'] = '_'.join([str(item['creator_name']) for item in info_list])  # Multiple blogs can be combined together
    info['creator_id'] = '_'.join([str(item['creator_id']) for item in info_list])
    return info
def init_database():
    if Path.is_file(Path.db_path):
        DB.set_conn(sqlite3.connect(Path.db_path))
    else:
        DB.set_conn(sqlite3.connect(Path.db_path))
        with open(Path.sql_path) as sql_script:
            DB.cursor.executescript(sql_script.read())
        DB.commit()
def extract_topic(self):
    raw_topic = DB.query_row(u'select * from Topic where topic_id="{topic_id}"'.format(topic_id=self.task.topic_id))
    self.info_page = Topic_Info(raw_topic)
    answer_list = self.query_answer_list(self.info_page.best_answer_id_list.split(','))
    question_id_dict = OrderedDict()
    # Fetch the corresponding Question object for each answer
    for answer in answer_list:
        if answer.question_id not in question_id_dict:
            question_id_dict[answer.question_id] = Question(self.query_question(answer.question_id))
        question_id_dict[answer.question_id].append_answer(answer)
    for question_id in question_id_dict:
        self.question_list.append(question_id_dict[question_id])
    return
def extract_collection(self):
    raw_collection = DB.query_row(u'select * from Collection where collection_id="{collection_id}"'.format(collection_id=self.task.collection_id))
    self.info_page = Collection_Info(raw_collection)
    answer_list = self.query_answer_list(self.info_page.collected_answer_id_list.split(','))
    question_id_dict = OrderedDict()
    # Fetch the corresponding Question object for each answer
    for answer in answer_list:
        if answer.question_id not in question_id_dict:
            question_id_dict[answer.question_id] = Question(self.query_question(answer.question_id))
        question_id_dict[answer.question_id].append_answer(answer)
    for question_id in question_id_dict:
        self.question_list.append(question_id_dict[question_id])
    return
def init_database():
    if Path.is_file(Path.db_path):
        Debug.logger.debug(u"Connect to the database...")
        Debug.logger.debug(u"db_path: " + str(Path.db_path))
        DB.set_conn(sqlite3.connect(Path.db_path))
    else:
        Debug.logger.debug(u"Create db file...")
        DB.set_conn(sqlite3.connect(Path.db_path))
        with open(Path.sql_path) as sql_script:
            DB.cursor.executescript(sql_script.read())
        DB.commit()
def extract_author(self):
    raw_author = DB.query_row(u'select * from Author where author_page_id="{author_page_id}" '.format(author_page_id=self.task.author_page_id))
    self.info_page = Author_Info(raw_author)
    answer_list = self.query_answer_list_by_author_page_id(self.info_page.author_page_id)
    question_id_dict = OrderedDict()
    # Fetch the corresponding Question object for each answer
    for answer in answer_list:
        if answer.question_id not in question_id_dict:
            db_question_info = self.query_question(answer.question_id)
            if not db_question_info:
                # Skip the answer when the query returns nothing
                continue
            question_id_dict[answer.question_id] = Question(db_question_info)
        question_id_dict[answer.question_id].append_answer(answer)
    for question_id in question_id_dict:
        self.question_list.append(question_id_dict[question_id])
    return
def catch_info(self):
    u"""
    Fetch the blog info and pass it to set_info
    :return:
    """
    info = {}
    if self.sql.info:
        if self.kind == Type.csdnblog_author:
            info = self.catch_csdnblog_book_info()
        elif self.kind == Type.jianshu_author:
            info = self.catch_jianshu_book_info()
        elif self.kind == Type.sinablog_author:
            info = self.catch_sinablog_book_info()
        elif self.kind in [Type.question, Type.answer]:
            info = self.catch_question_book_info(self.sql.info)
        elif self.kind == Type.article:
            info = self.catch_article_book_info(self.sql.info)
        else:
            info = DB.cursor.execute(self.sql.info).fetchone()
            info = DB.wrap(Type.info_table[self.kind], info)
    self.set_info(info)
    return
def query_article(self, article_id):
    raw_article = DB.query_row(u'select * from Article where article_id="{article_id}" '.format(article_id=article_id))
    article = self.format_article(raw_article)
    return article
def query_column(self, column_id):
    raw_column = DB.query_row(u'select * from Column where column_id="{column_id}"'.format(column_id=column_id))
    column = self.format_column(raw_column)  # Wrap into the standard info format
    return column
def catch(account_id):
    # This is the key part
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # Get the maximum page number
    url = 'http://www.gushequ.com/{}/'.format(account_id)
    front_page_content = Http.get_content(url)
    column_info = TodoColumnParser(front_page_content).get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = "股社区"
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    star_page = 0
    max_page = 24
    if account_id == '2018':
        star_page = 0
        max_page = 24
    elif account_id == '2017':
        star_page = 24
        max_page = 58
    elif account_id == '2016':
        star_page = 58
        max_page = 92
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    index_work_set = OrderedDict()
    # Collect the article links from every index page
    for raw_front_page_index in range(star_page, max_page):
        request_url = u'http://www.gushequ.com/page/{}/'.format(raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set:
            catch_counter += 1
            Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('article')
            for p in list_p_list:
                # print p
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    tarUrl = li.get('href')
                    ttt = str(tarUrl).split("#")[-1]
                    print ttt
                    if not (ttt is None):
                        article_url_index_list.append(ttt)
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row('select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set:
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = TodoArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def catch(account_id):
    # This is the key part
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # Get the maximum page number
    url = 'https://www.huxiu.com/{}'.format(account_id)
    front_page_content = Http.get_content(url)
    # Config.now_id_likeName = account_id
    # Config.save()
    column_info = HuXiuColumnParser(front_page_content).get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = account_id
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    u_result = urllib.quote(account_id.decode(sys.stdin.encoding).encode('utf8'))
    print account_id
    max_page = 2
    idds = ''
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if split_url.split('/')[-1] == account_id:
                # Config.now_id_likeName = line.split('#')[1]
                max_page = int(line.split('#')[-1]) + 1
                idds = str(line.split('#')[1])
    print max_page
    max_page = -1  # With max_page forced to -1 the search-page loop below is skipped; articles are appended explicitly afterwards
    # Parse the page content and store it in the database
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    index_work_set = OrderedDict()
    # Collect the article links from every index page
    for raw_front_page_index in range(0, max_page + 1):
        # https://www.huxiu.com/search.html?s=%E5%B7%B4%E8%8F%B2%E7%89%B9&sort=dateline:desc
        request_url = u'https://www.huxiu.com/search.html?s={}&sort=dateline%3Adesc&per_page={}'.format(u_result, raw_front_page_index)
        # request_url = u'https://www.huxiu.com/member/{}/article/{}.html'.format(idds, raw_front_page_index)
        # request_url = 'https://www.huxiu.com/member/1872007.html'
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set:
            catch_counter += 1
            Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, "lxml")
            list_pcyc_l_ = soup.find_all('li')
            # list_pcyc_l_ = soup.find_all('div', class_='mob-ctt')
            for tgo_right in list_pcyc_l_:
                for link in tgo_right.findAll('a'):
                    hre = str(link.get('href'))
                    if hre.startswith('/article/', 0, 10):
                        print u'https://www.huxiu.com{}'.format(link.get('href'))
                        article_url_index_list.append('https://www.huxiu.com{}'.format(link.get('href')))
            del index_work_set[raw_front_page_index]
    article_url_index_list.append('https://www.huxiu.com/article/299355.html')
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row('select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set:
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = HuXiuArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def catch(account_id):
    # This is the key part
    mock_sleep_time = 0.5
    base_sleep_time = 1
    max_sleep_time = 1
    article_url_index_list = []
    # Get the maximum page number
    column_info = WeiXinColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = account_id
    column_info[u'image_url'] = 'https://wpimg.wallstcn.com/3598b719-ab0d-4be7-bc09-30c3ae29a3cc.jpg?imageView2/1/w/240/h/240'
    max_page = 1
    # with open('ReadList.txt', 'r') as read_list:
    #     read_list = read_list.readlines()
    #     for line in read_list:
    #         split_url = line.split('#')[0]
    #         if str(split_url).__contains__(account_id):
    #             # Config.now_id_likeName = line.split('#')[1]
    #             max_page = int(line.split('#')[-1]) + 1
    #             column_info[u'title'] = str(line.split('#')[1])
    #
    # max_page = 1
    # print max_page
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    # article_url_index_list.append('https://mp.weixin.qq.com/s?__biz=MjM5MjczNDc0Mw==&mid=2650847984&idx=2&sn=b7b111e5964d2f2fb568ba0d419e3edf&chksm=bd55d1888a22589e2f3bab0613b346427079efc6b82fac869d4f78244a500c3e5cc8cb8402ed&scene=21#wechat_redirect')
    # article_url_index_list.append('https://mp.weixin.qq.com/s/yj1BT3jWyxLjlEnzz0vEtQ')
    with open('/Users/0/Desktop/list.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            article_url_index_list.append(str(line).strip('\n'))
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row('select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set:
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取 {countert} 号文章,剩余{article_count}篇".format(countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            time.sleep(mock_sleep_time)
            if len(request_url_content) == 0:
                random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                time.sleep(random_sleep_time)
                continue
            # article_info = Todo2ArticleParser(request_url_content).get_article_info()
            # article_info = HuXiuArticleParser(request_url_content).get_article_info()
            article_info = WeiXinArticleParser(request_url_content).get_article_info()
            # article_info = WallStreetArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def catch(account_id):
    # This is the key part
    article_url_index_list = []
    # Get the maximum page number
    url = 'http://www.taoguba.com.cn/Article/' + account_id + '/1'
    front_page_content = Http.get_content(url)
    star_page = 1
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            if str(line).__contains__('#'):
                split_url = line.split('#')[0]
                trgId = split_url.split('/')[-2]
                if trgId == account_id:
                    pg = (split_url.split('/')[-1])
                    print pg
                    star_page = int(pg)
                    if star_page == 0:
                        star_page = 1
                    else:
                        print star_page
    max_page = 2
    dom = BeautifulSoup(front_page_content, "lxml")
    list_pcyc_l_ = dom.find_all('div', class_="left t_page01")
    try:
        for tgo_tgo_ in list_pcyc_l_:
            linkl = tgo_tgo_.findAll('a')
            tarUrl = linkl[0].get('href')
            max_page = int(tarUrl.split('/')[3])
    except IndexError as e:
        max_page = 1
    column_info = TGBColumnParser(front_page_content).get_column_info()
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    index_work_set = OrderedDict()
    # Collect the article page URLs
    # star_page = 100
    for raw_front_page_index in range(star_page, max_page + 1):
        request_url = 'http://www.taoguba.com.cn/Article/' + account_id + '/' + str(raw_front_page_index)
        article_url_index_list.append(request_url)
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row('select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set:
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = TGBArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def clear_index(self):
    topic_id_tuple = tuple(set(x['topic_id'] for x in self.topic_index_list))
    sql = 'DELETE from TopicIndex where topic_id in ({})'.format((' ?,' * len(topic_id_tuple))[:-1])
    DB.cursor.execute(sql, topic_id_tuple)
    DB.commit()
    return
def login(self, account, password, captcha=''):
    content = Http.get_content('https://www.zhihu.com/')
    xsrf = Match.xsrf(content)
    if not xsrf:
        Debug.logger.info(u'登陆失败')
        Debug.logger.info(u'敲击回车重新发送登陆请求')
        return False
    xsrf = xsrf.split('=')[1]
    # add xsrf as cookie into cookieJar
    cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
    self.cookieJar.set_cookie(cookie)
    if captcha:
        post_data = {
            '_xsrf': xsrf,
            'email': account,
            'password': password,
            'remember_me': True,
            'captcha': captcha
        }
    else:
        post_data = {
            '_xsrf': xsrf,
            'email': account,
            'password': password,
            'remember_me': True
        }
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate',  # Key header: as long as this request looks browser-like, Zhihu does not treat it as coming from a script
        'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
        'Host': 'www.zhihu.com',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://www.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
    }
    result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
    if not result:
        Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
        return False
    response = json.loads(result)
    if response['r'] == 0:
        print u'登陆成功!'
        print u'登陆账号:', account
        print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认'
        if raw_input() == 'yes':
            Config.account, Config.password, Config.remember_account = account, password, True
            print u'帐号密码已保存,可通过修改config.json修改设置'
        else:
            Config.account, Config.password, Config.remember_account = '', '', False
            print u'跳过保存环节,进入下一流程'
        Config._save()
        cookie = self.get_cookie()
        DB.execute('delete from LoginRecord')  # After a successful login, clear old login records so stale ones are not picked up next time
        data = {}
        data['account'] = account
        data['password'] = password
        data['recordDate'] = ExtraTools.get_today()
        data['cookieStr'] = cookie
        DB.save(data, 'LoginRecord')
        DB.commit()
        return True
    else:
        print u'登陆失败'
        Debug.print_dict(response)
        return False
def catch(account_id):
    # This is the key part
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # Get the maximum page number
    column_info = Todo3ColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = "新能源汽车"
    column_info['article_count'] = 0
    column_info['follower_count'] = 0
    column_info['description'] = ''
    column_info['image_url'] = ''
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    star_page = 1
    max_page = 1
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    index_work_set = OrderedDict()
    # Collect the article links from every index page
    for raw_front_page_index in range(star_page, max_page):
        request_url = u'https://post.smzdm.com/fenlei/xinnengyuanche/p{}/'.format(raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set:
            catch_counter += 1
            Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('div', class_='list-border clearfix')
            for p in list_p_list:
                # print p
                list_pcyc_li = p.find_all('a')
                li = list_pcyc_li[0]
                tarUrl = li.get('href')
                ttt = str(tarUrl).split("#")[-1]
                print ttt
                if not (ttt is None):
                    article_url_index_list.append(ttt)
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row('select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set:
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = Todo3ArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def catch(account_id):
    # This is the key part
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # Get the maximum page number
    url = 'https://www.wuxiareview.com/category/{}'.format(account_id)
    front_page_content = Http.get_content(url)
    column_info = WuXiaColumnParser(front_page_content).get_column_info()
    column_info[u'column_id'] = account_id
    max_page = 2
    if account_id == 'daidai':
        column_info[u'title'] = "吃瓜群众岱岱"
        max_page = 1
    elif account_id == 'gzmdzst':
        column_info[u'title'] = "顾子明的政事堂"
        max_page = 1
    else:
        column_info[u'title'] = "时文"
        max_page = 2
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    index_work_set = OrderedDict()
    # Collect the article links from every index page
    for raw_front_page_index in range(0, max_page):
        request_url = u'https://www.wuxiareview.com/category/{}/{}/'.format(account_id, raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set:
            catch_counter += 1
            Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('article', class_="excerpt")
            for p in list_p_list:
                # print p
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    # print li.text
                    tarUrl = li.get('href')
                    ttt = str(tarUrl).split("#")[-1]
                    print ttt
                    if not (ttt is None):
                        article_url_index_list.append(ttt)
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row('select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set:
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = WuXiaArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def login(self, account, password, captcha=''):
    if self.recipe_kind == 'zhihu':
        # Zhihu expects the r parameter here to be a 13-digit unix timestamp (milliseconds)
        unix_time_stp = str(int(1000 * time.time()))[0:13]
        content = Http.get_content('https://www.zhihu.com/')
    else:
        Debug.logger.error(u"登录中...未知网站类型错误")
        return
    xsrf = Match.xsrf(content)
    if not xsrf:
        Debug.logger.info(u'登陆失败')
        Debug.logger.info(u'敲击回车重新发送登陆请求')
        return False
    xsrf = xsrf.split('=')[1]
    # add xsrf as cookie into cookieJar
    cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
    self.cookieJar.set_cookie(cookie)
    if captcha:
        post_data = {
            '_xsrf': xsrf,
            'email': account,
            'password': password,
            'remember_me': True,
            'captcha': captcha
        }
    else:
        post_data = {
            '_xsrf': xsrf,
            'email': account,
            'password': password,
            'remember_me': True
        }
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate',  # Key header: as long as this request looks browser-like, Zhihu does not treat it as coming from a script
        'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
        'Host': 'www.zhihu.com',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://www.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
    }
    result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
    if not result:
        Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
        return False
    response = json.loads(result)
    if response['r'] == 0:
        print u'\nlogin successfully...'
        print u'account:', account
        if self.from_ui is not True:
            print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认'
            remenber_account = raw_input()
        else:
            # When logging in from the GUI, remember the password by default
            remenber_account = 'yes'
        if remenber_account == 'yes':
            Config.account, Config.password, Config.remember_account = account, password, True
            print u'帐号密码已保存,可通过修改config.json修改设置'
        else:
            Config.account, Config.password, Config.remember_account_set = '', '', False
            print u'跳过保存环节,进入下一流程'
        Config._save()
        cookie = self.get_cookie()
        DB.execute('delete from LoginRecord')  # After a successful login, clear old login records so stale ones are not picked up next time
        data = dict()
        data['account'] = account
        data['password'] = password
        data['recordDate'] = ExtraTools.get_today()
        data['cookieStr'] = cookie
        DB.save(data, 'LoginRecord')
        DB.commit()
        return True
    else:
        print u'登陆失败'
        Debug.print_dict(response)
        return False
def catch(account_id):
    # This is the key part
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # Get the maximum page number
    star_page = 1
    max_page = 1
    column_info = Todo1ColumnParser("").get_column_info()
    column_info[u'column_id'] = account_id
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if str(split_url).__contains__(account_id):
                # Config.now_id_likeName = line.split('#')[1]
                max_page = int(line.split('#')[-1]) + 1
                column_info[u'title'] = str(line.split('#')[1])
    # max_page = 1
    print max_page
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    index_work_set = OrderedDict()
    # Collect the article links from every index page
    for raw_front_page_index in range(star_page, max_page):
        request_url = u'https://www.guancha.cn/{}/list_{}.shtml'.format(account_id, raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set:
            catch_counter += 1
            Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('h4', class_="module-title")
            for p in list_p_list:
                # print p
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    ttt = li.get('href')
                    print ttt
                    if not (ttt is None):
                        ss = str(ttt).split('.')
                        article_url_index_list.append(u"https://www.guancha.cn{}_s.{}".format(ss[0], ss[1]))
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row('select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set:
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = Todo1ArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def query_article(self, article_id):
    print u"query_article {}".format(article_id)
    raw_article = DB.query_row(u'select * from Article where article_id="{article_id}" '.format(article_id=article_id))
    article = self.format_article(raw_article)
    return article
def catch(account_id):
    # This is the key part
    mock_sleep_time = 0.5
    base_sleep_time = 1
    max_sleep_time = 1
    article_url_index_list = []
    # Get the maximum page number
    url = 'http://www.jintiankansha.me/tag/{}?page=1'.format(account_id)
    column_info = JinWanKanSaEmptColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    dt = datetime.datetime.now()
    column_info[u'title'] = u"AI_{}".format(dt.strftime("%Y-%m-%d"))
    max_page = 1
    typeToTry = 'tag'
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if split_url.split('/')[-1] == account_id:
                dt = datetime.datetime.now()
                column_info[u'title'] = u"{}_{}".format(line.split('#')[1], dt.strftime("%Y-%m-%d"))
                max_page = int(line.split('#')[2])
                typeToTry = str(int(line.split('#')[-1])).strip('\n')
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    index_work_set = OrderedDict()
    # Collect the article links from every index page
    for raw_front_page_index in range(0, max_page + 1):
        # request_url = u'http://www.jintiankansha.me/column/{}?page={}'.format(account_id, raw_front_page_index)
        request_url = u'http://www.jintiankansha.me/{}/{}?page={}'.format(typeToTry, account_id, raw_front_page_index)
        print request_url
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set:
            catch_counter += 1
            Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'html.parser')
            list_p_list = soup.find_all('span', class_="item_title")
            for tgo_right in list_p_list:
                for link in tgo_right.findAll('a'):
                    ttt = str(link.get('href'))
                    print ttt
                    if not (ttt is None):
                        article_url_index_list.append(ttt)
            del index_work_set[raw_front_page_index]
    # article_url_index_list.append('http://www.jintiankansha.me/t/u8MygoqKI8')
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row('select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set:
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取 {countert} 号文章,剩余{article_count}篇".format(countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            time.sleep(mock_sleep_time)
            if len(request_url_content) == 0:
                random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                time.sleep(random_sleep_time)
                continue
            article_info = JinWanKanSaArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return