def init_config(recipe_kind):
    if recipe_kind == 'zhihu':  # TODO: if another site ever needs login, remove this hard-coding
        login = Login(recipe_kind='zhihu')
    else:
        return
    # !!!!! Set Config.remember_account to false before release !!!!!
    # Login is only needed the first time; afterwards the saved cookie is reused.
    if Config.remember_account_set:
        # A saved configuration was found, reuse the previous settings.
        Debug.logger.info(u'检测到有设置文件,直接使用之前的设置')
        Config.picture_quality = 1
        try:
            Http.set_cookie()  # sinablog, jianshu: not needed
        except TypeError:
            print u"没有找到登录成功的cookie记录, 请重新登录"
            login.start()
    else:
        log.warning_log(u"Please login...")
        login.start()
        Config.picture_quality = 1
        Config.remember_account_set = True
    Config._save()  # save config
    return
def login(self, account, password, captcha=''):
    content = Http.get_content('https://www.zhihu.com/')
    xsrf = Match.xsrf(content)
    if not xsrf:
        Debug.logger.info(u'登陆失败')
        Debug.logger.info(u'敲击回车重新发送登陆请求')
        return False
    xsrf = xsrf.split('=')[1]
    # Add the xsrf token to the cookieJar as a cookie.
    cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
    self.cookieJar.set_cookie(cookie)
    post_data = {
        '_xsrf': xsrf,
        'email': account,
        'password': password,
        'remember_me': True,
    }
    if captcha:
        post_data['captcha'] = captcha
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
        'Host': 'www.zhihu.com',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
        'Connection': 'keep-alive',
        # Key header: with it present Zhihu treats the request as coming from a browser, not a script.
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://www.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
    }
    result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
    if not result:
        Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
        return False
    response = json.loads(result)
    if response['r'] == 0:
        print u'登陆成功!'
        print u'登陆账号:', account
        print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认'
        if raw_input() == 'yes':
            Config.account, Config.password, Config.remember_account = account, password, True
            print u'帐号密码已保存,可通过修改config.json修改设置'
        else:
            Config.account, Config.password, Config.remember_account = '', '', False
            print u'跳过保存环节,进入下一流程'
        Config._save()
        cookie = self.get_cookie()
        # Clear the old login records after a successful login so the next login
        # does not pick up a stale record.
        DB.execute('delete from LoginRecord')
        data = {
            'account': account,
            'password': password,
            'recordDate': ExtraTools.get_today(),
            'cookieStr': cookie,
        }
        DB.save(data, 'LoginRecord')
        DB.commit()
        return True
    else:
        print u'登陆失败'
        Debug.print_dict(response)
        return False
def create_work_set(self, target_url):
    u"""
    From the blog home page url, first extract the blog id via a regex, then use the content of
    the blog's "About me" page to fill SinaBlog_Info (this arguably does not belong in this
    function and could be refactored out), and finally walk the blog index pages to collect the
    url of every post into work_set.
    :param target_url: url of the blog home page
    :return:
    """
    Debug.logger.debug(u"target_url是:" + str(target_url))
    if target_url in self.task_complete_set:
        return
    result = Match.SinaBlog(target_url)
    SinaBlog_author_id = int(result.group('SinaBlog_people_id'))
    href_article_list = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(SinaBlog_author_id)
    href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(SinaBlog_author_id)

    # ### The part below belongs in SinaBlogAuthorWorker (it writes SinaBlog_Info);
    # kept here for now, to be refactored later.
    content_profile = Http.get_content(href_profile)
    parser = SinaBlogParser(content_profile)
    self.question_list += parser.get_SinaBlog_info_list()
    # ### End of the part that belongs in SinaBlogAuthorWorker.

    content_article_list = Http.get_content(href_article_list)
    article_num = int(self.parse_article_num(content_article_list))
    Debug.logger.debug(u"article_num:" + str(article_num))
    if article_num % 50 != 0:
        page_num = article_num / 50 + 1  # each blog index page lists 50 posts
    else:
        page_num = article_num / 50
    # The "About me" page does not expose the article count, so it is stored on the first row;
    # as a result only one Sina blog address can be handled per line.
    self.question_list[0]['article_num'] = article_num
    self.task_complete_set.add(target_url)
    for page in range(page_num):
        url = 'http://blog.sina.com.cn/s/articlelist_{}_0_{}.html'.format(SinaBlog_author_id, page + 1)
        content_article_list = Http.get_content(url)
        article_list = self.parse_get_article_list(content_article_list)
        for item in article_list:
            self.work_set.add(item)
    return
def __init__(self, task_list):
    self.task_set = set(task_list)
    self.work_set = set()  # pool of urls waiting to be crawled
    self.answer_list = []
    self.question_list = []
    self.thread_pool = ThreadPool(Config.max_thread)
    self.info_list = []
    self.extra_index_list = []
    self.info_url_set = self.task_set.copy()
    self.add_property()  # attach extended attributes
    Http.set_cookie()
def get_sinablog_question_list(self, author_id):
    u"""
    Fetch sinablog_info and the article count.
    :param author_id:
    :return: article_num
    """
    href_article_list = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(author_id)
    href_profile = 'http://blog.sina.com.cn/s/profile_{}.html'.format(author_id)
    content_profile = Http.get_content(href_profile)
    parser = SinaBlogParser(content_profile)
    self.question_list += parser.get_extra_info()
    content_article_list = Http.get_content(href_article_list)
    article_num = int(self.parse_article_num(content_article_list))
    return article_num
def worker(self, target_url):
    content = Http.get_content(target_url)
    if not content:
        return
    self.work_set.discard(target_url)
    self.parse_content(content)
    return
def create_work_set(self, target_url):
    u"""
    From the content of target_url (e.g. http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles),
    first extract creator_id, then derive the page count from the number of articles, open each
    page in turn and add every article url to work_set.
    :param target_url:
    :return:
    """
    if target_url in self.task_complete_set:
        return
    id_result = Match.jianshu_author(target_url)
    jianshu_id = id_result.group('jianshu_id')
    article_num, article_list = self.get_jianshu_question_list(target_url)
    self.task_complete_set.add(target_url)
    if article_num % 9 != 0:
        page_num = article_num / 9 + 1  # 9 article links per page
    else:
        page_num = article_num / 9
    for item in article_list:
        self.work_set.add(item)
    for page in range(page_num - 1):  # start from page 2; the first page is already parsed
        url = 'http://www.jianshu.com/users/{}/latest_articles?page={}'.format(jianshu_id, page + 2)
        content_article_list = Http.get_content(url)
        article_list = self.parse_get_article_list(content_article_list)
        for item in article_list:
            self.work_set.add(item)
    return
def create_work_set(self, target_url):
    if target_url in self.task_complete_set:
        return
    result = Match.column(target_url)
    self.column_id = result.group("column_id")
    content = Http.get_content("https://zhuanlan.zhihu.com/api/columns/" + self.column_id)
    if not content:
        return
    raw_info = json.loads(content)
    info = {}
    info["creator_id"] = raw_info["creator"]["slug"]
    info["creator_hash"] = raw_info["creator"]["hash"]
    info["creator_sign"] = raw_info["creator"]["bio"]
    info["creator_name"] = raw_info["creator"]["name"]
    info["creator_logo"] = (
        raw_info["creator"]["avatar"]["template"]
        .replace("{id}", raw_info["creator"]["avatar"]["id"])
        .replace("_{size}", "")
    )
    info["column_id"] = raw_info["slug"]
    info["name"] = raw_info["name"]
    # The column logo uses the column's own avatar template; the original mixed the
    # creator's template with the column's avatar id.
    info["logo"] = (
        raw_info["avatar"]["template"]
        .replace("{id}", raw_info["avatar"]["id"])
        .replace("_{size}", "")
    )
    info["article"] = raw_info["postsCount"]
    info["follower"] = raw_info["followersCount"]
    info["description"] = raw_info["description"]
    self.info_list.append(info)
    self.task_complete_set.add(target_url)
    detect_url = "https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset=".format(self.column_id)
    for i in range(info["article"] / 10 + 1):
        self.work_set.add(detect_url + str(i * 10))
    return
def get_captcha():
    # The r parameter Zhihu expects here is a 13-digit unix timestamp in milliseconds.
    unix_time_stp = str(int(1000 * time.time()))[0:13]
    content = Http.get_content(
        'https://www.zhihu.com/captcha.gif?r={}&type=login'.format(unix_time_stp))  # fetch the captcha
    captcha_path = Path.base_path + u'/我是登陆知乎时的验证码.gif'
    with open(captcha_path, 'wb') as image:
        image.write(content)
    print u'请输入您所看到的验证码'
    print u'验证码在助手所处的文件夹中'
    print u'验证码位置:'
    print captcha_path
    if platform.system() == "Darwin":
        os.system(u'open "{}" &'.format(captcha_path).encode(sys.stdout.encoding))
    else:
        webbrowser.get().open_new_tab(u'file:///' + captcha_path)
    print u'如果不需要输入验证码可点按回车跳过此步'
    captcha = raw_input()
    return captcha
def __init__(self, task_list):
    self.task_set = set(task_list)
    self.task_complete_set = set()
    self.work_set = set()  # pool of urls waiting to be crawled
    self.work_complete_set = set()  # pool of urls already crawled
    self.content_list = []  # holds the crawled content
    self.answer_list = []
    self.question_list = []
    self.info_list = []
    self.extra_index_list = []
    self.info_url_set = self.task_set.copy()
    self.info_url_complete_set = set()
    self.add_property()  # attach extended attributes
    Http.set_cookie()
def create_work_set(self, target_url):
    if target_url in self.task_complete_set:
        return
    result = Match.column(target_url)
    column_id = result.group('column_id')
    content = Http.get_content('https://zhuanlan.zhihu.com/api/columns/' + column_id)
    if not content:
        return
    raw_info = json.loads(content)
    info = {}
    info['creator_id'] = raw_info['creator']['slug']
    info['creator_hash'] = raw_info['creator']['hash']
    info['creator_sign'] = raw_info['creator']['bio']
    info['creator_name'] = raw_info['creator']['name']
    info['creator_logo'] = raw_info['creator']['avatar']['template'].replace(
        '{id}', raw_info['creator']['avatar']['id']).replace('_{size}', '')
    info['column_id'] = raw_info['slug']
    info['name'] = raw_info['name']
    # The column logo uses the column's own avatar template, not the creator's.
    info['logo'] = raw_info['avatar']['template'].replace(
        '{id}', raw_info['avatar']['id']).replace('_{size}', '')
    info['article'] = raw_info['postsCount']
    info['follower'] = raw_info['followersCount']
    info['description'] = raw_info['description']
    self.info_list.append(info)
    self.task_complete_set.add(target_url)
    detect_url = 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset='.format(column_id)
    for i in range(info['article'] / 10 + 1):
        self.work_set.add(detect_url + str(i * 10))
    return
def check_update():  # forced update check
    u"""
    * Purpose
        * Check for updates.
        * When a new version is detected on the server, open the download page in the browser.
        * A request timeout or an up-to-date version number silently skips the check.
    * Input
        * none
    * Returns
        * none
    """
    print u"检查更新。。。"
    if Config.debug:
        # Skip the update check in debug mode.
        return
    try:
        content = Http.get_content(u"https://www.yaozeyuan.online/zhihuhelp/upgrade.txt")
        if not content:
            raise Exception(u'HttpError')
        time, url = [x.strip() for x in content.strip('\n').split('\n')]
        if time == Config.update_time:
            return
        print u"发现新版本,\n更新日期:{} ,点按回车进入更新页面".format(time)
        print u'新版本下载地址:' + url
        raw_input()
        import webbrowser
        webbrowser.open_new_tab(url)
    except Exception:
        # Swallow any exception: the update check must never block startup.
        return
def create_work_set(self, target_url):
    u"""
    From the blog home page url, first extract the blog id via a regex, then use the content of
    the blog's "About me" page to fill sinablog_info (this arguably does not belong in this
    function and could be refactored out), and finally walk the blog index pages to collect the
    url of every post into work_set.
    :param target_url: url of the blog home page
    :return:
    """
    if target_url in self.task_complete_set:
        return
    result = Match.sinablog_author(target_url)
    sinablog_author_id = int(result.group('sinablog_people_id'))
    article_num = self.get_sinablog_question_list(sinablog_author_id)
    if article_num % 50 != 0:
        page_num = article_num / 50 + 1  # 50 article links per page
    else:
        page_num = article_num / 50
    # The "About me" page does not expose the article count, so it is stored on the first row.
    self.question_list[0]['article_num'] = article_num
    self.task_complete_set.add(target_url)
    for page in range(page_num):
        url = 'http://blog.sina.com.cn/s/articlelist_{}_0_{}.html'.format(sinablog_author_id, page + 1)
        content_article_list = Http.get_content(url)
        article_list = self.parse_get_article_list(content_article_list)
        for item in article_list:
            self.work_set.add(item)
    return
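# The `article_num % 50` / division dance above recurs in every worker, each with a different
# page size (50 for Sina blogs, 9 for Jianshu, 10 for Zhihu columns). A minimal sketch of the
# shared computation as a helper; `page_count` is a hypothetical name, not part of the
# existing codebase.
def page_count(item_num, per_page):
    u"""Number of index pages needed to list item_num items, per_page items per page."""
    if item_num <= 0:
        return 0
    return (item_num + per_page - 1) // per_page  # ceiling division without a float round-trip

# Usage sketch: page_num = page_count(article_num, 50)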
def check_update():  # forced update check
    u"""
    * Purpose
        * Check for updates.
        * When a new version is detected on the server, open the download page in the browser.
        * A request timeout or an up-to-date version number silently skips the check.
    * Input
        * none
    * Returns
        * none
    """
    print u"检查更新。。。"
    try:
        content = Http.get_content(u"http://zhihuhelpbyyzy-zhihu.stor.sinaapp.com/ZhihuHelpUpdateTime.txt")
        if not content:
            raise Exception("HttpError")
    except:
        return
    time, url = [x.strip() for x in content.split("\n")]
    if time == Config.update_time:
        return
    print u"发现新版本,\n更新日期:{} ,点按回车进入更新页面".format(time)
    print u"新版本下载地址:" + url
    raw_input()
    import webbrowser
    webbrowser.open_new_tab(url)
    return
def download(self, index):
    image = self.container[index]
    filename = image['filename']
    href = image['href']
    if os.path.isfile(self.save_path + '/' + filename):
        return
    print 'Downloading picture:' + href + ' filename ' + filename
    if len(str(href)) < 300 and Match.isUrlOk(href):
        # Strip any image-processing suffix after '@' to get the real url.
        real_url = str(href).split('@')[0]
        content = Http.get_content(url=real_url, timeout=Config.timeout_download_picture)
    else:
        Debug.print_in_single_line(u"Href of the picture seems wrong...")
        content = None
    if not content:
        return
    with open(self.save_path + '/' + filename, 'wb') as image:
        image.write(content)
    return
def create_work_set(self, target_url):
    if target_url in self.task_complete_set:
        return
    content = Http.get_content(target_url)
    if not content:
        return
    self.task_complete_set.add(target_url)
    notebooks_id = self.info_list[0]['notebooks_id']
    page_num = self.parse_max_page(content)
    for page in range(page_num):
        # Build the index url from notebooks_id; the original hard-coded notebook 627726 here.
        url = 'http://www.jianshu.com/notebooks/{}/latest?page={}'.format(notebooks_id, page + 1)
        content = Http.get_content(url)
        article_list = JianshuNotebooksParser(content).get_article_list()
        self.add_notebooks_index(notebooks_id, article_list)
        for item in article_list:
            self.work_set.add(item)
    return
def catch_info(self, target_url):
    content = Http.get_content(target_url)
    if not content:
        return
    self.info_url_set.discard(target_url)
    parser = CollectionParser(content)
    self.info_list.append(parser.get_extra_info())
    return
def worker(self, target_url):
    Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
    content = Http.get_content(target_url)
    if not content:
        return
    self.work_set.discard(target_url)
    self.parse_content(content)
    return
def catch_info(self, target_url):
    content = Http.get_content(target_url + '/about')
    if not content:
        return
    self.info_url_set.discard(target_url)
    parser = AuthorParser(content)
    self.info_list.append(parser.get_extra_info())
    return
def init_config():
    login = Login()
    if Config.remember_account:
        print u'检测到有设置文件,是否直接使用之前的设置?(帐号、密码、图片质量)'
        print u'点按回车使用之前设置,敲入任意字符后点按回车进行重新设置'
        if raw_input():
            login.start()
            Config.picture_quality = guide.set_picture_quality()
        else:
            Http.set_cookie()
    else:
        login.start()
        Config.picture_quality = guide.set_picture_quality()
    # save settings
    Config._save()
    return
def catch_info(self, target_url):
    content = Http.get_content(target_url + '/top-answers')
    if not content:
        return
    self.info_url_set.discard(target_url)
    parser = TopicParser(content)
    self.info_list.append(parser.get_extra_info())
    return
def create_work_set(self, target_url):
    if target_url in self.task_complete_set:
        return
    self.task_complete_set.add(target_url)
    url = target_url + '?page=2'  # this page carries the total page count
    content = Http.get_content(url)
    page_num = self.parse_max_page(content)
    for page in range(int(page_num)):  # renamed from the shadowed `item` loop variable
        url = target_url + '?page={}'.format(page + 1)
        content = Http.get_content(url)
        parser = CnblogsAuthorParser(content)
        article_url_list = parser.get_article_list()
        for item in article_url_list:
            self.work_set.add(item)
    return
def create_work_set(self, target_url):
    content = Http.get_content(target_url + '/answers?order_by=vote_num')
    if not content:
        return
    self.task_set.discard(target_url)
    max_page = self.parse_max_page(content)
    for page in range(max_page):
        url = '{}/answers?order_by=vote_num&page={}'.format(target_url, page + 1)
        self.work_set.add(url)
    return
def create_work_set(self, target_url):
    content = Http.get_content(target_url + '?nr=1&sort=created')
    if not content:
        return
    self.task_set.discard(target_url)
    max_page = self.parse_max_page(content)
    for page in range(max_page):
        url = '{}?nr=1&sort=created&page={}'.format(target_url, page + 1)
        self.work_set.add(url)
    return
def catch_info(self, target_url):
    if target_url in self.info_url_complete_set:
        return
    content = Http.get_content(target_url)
    if not content:
        return
    self.info_url_complete_set.add(target_url)
    parser = CollectionParser(content)
    self.info_list.append(parser.get_extra_info())
    return
def catch_info(self, target_url):
    if target_url in self.info_url_complete_set:
        return
    content = Http.get_content(target_url + '/top-answers')
    if not content:
        return
    self.info_url_complete_set.add(target_url)
    parser = TopicParser(content)
    self.info_list.append(parser.get_extra_info())
    return
def create_work_set(self, target_url):
    if target_url in self.task_complete_set:
        return
    content = Http.get_content(target_url)
    if not content:
        return
    self.task_complete_set.add(target_url)
    real_id = self.info_list[0]['collection_real_id']
    fake_id = self.info_list[0]['collection_fake_id']
    page_num = self.parse_max_page(content)
    for page in range(page_num):
        url = 'http://www.jianshu.com/collections/{}/notes?order_by=added_at&page={}'.format(real_id, page + 1)
        content = Http.get_content(url)
        article_list = JianshuCollectionParser(content).get_article_list()
        self.add_collection_index(fake_id, article_list)
        for item in article_list:
            self.work_set.add(item)
    return
def catch_info(self, target_url):
    if target_url in self.info_url_complete_set:
        return
    content = Http.get_content(target_url)
    if not content:
        return
    self.info_url_complete_set.add(target_url)
    parser = YiibaiParser(content)
    self.info_list.append(parser.get_extra_info())
    return
def create_work_set(self, target_url):
    if target_url in self.task_complete_set:
        return
    content = Http.get_content(target_url)
    article_list = YiibaiParser(content).get_article_list()
    self.task_complete_set.add(target_url)
    for item in article_list:
        self.work_set.add(item)
    return
def reCreate_work_set(self, target_url, subUrl="", pageFlag="?"):
    if target_url in self.task_complete_set:
        return
    content = Http.get_content(target_url + subUrl)
    if not content:
        return
    self.task_complete_set.add(target_url)
    max_page = self.parse_max_page(content)
    for page in range(max_page):
        url = "{}{}{}page={}".format(target_url, subUrl, pageFlag, page + 1)
        self.work_set.add(url)
    return
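# reCreate_work_set generalizes the create_work_set variants around it: the question worker and
# the topic worker below differ only in the url suffix and the query separator. A sketch of how
# the specialized versions could delegate to it (assuming they live in the same class):
#
#   # question pages: target_url + '?nr=1&sort=created', extra pages join with '&'
#   self.reCreate_work_set(target_url, subUrl='?nr=1&sort=created', pageFlag='&')
#   # topic pages: target_url + '/top-answers', extra pages join with '?'
#   self.reCreate_work_set(target_url, subUrl='/top-answers', pageFlag='?')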
def create_work_set(self, target_url):
    if target_url in self.task_complete_set:
        return
    content = Http.get_content(target_url + "?nr=1&sort=created")
    if not content:
        return
    self.task_complete_set.add(target_url)
    max_page = self.parse_max_page(content)
    for page in range(max_page):
        url = "{}?nr=1&sort=created&page={}".format(target_url, page + 1)
        self.work_set.add(url)
    return
def create_work_set(self, target_url):
    if target_url in self.task_complete_set:
        return
    content = Http.get_content(target_url + '/top-answers')
    if not content:
        return
    self.task_complete_set.add(target_url)
    max_page = self.parse_max_page(content)
    for page in range(max_page):
        url = '{}/top-answers?page={}'.format(target_url, page + 1)
        self.work_set.add(url)
    return
def findPDFUrl(self, originUrl):
    '''
    The links returned from findAnnalReports are not the PDF links themselves;
    each origin link has to be fetched again to dig out the true PDF url.
    '''
    r = Http.get_content(originUrl)
    bsObj = BeautifulSoup(r, "html.parser")
    result = bsObj.findAll("a")
    # Take the first anchor whose href contains "PDF".
    pdf = [link["href"] for link in result if "PDF" in link["href"]][0]
    return pdf
def get_captcha():
    content = Http.get_content('http://www.zhihu.com/captcha.gif')  # fetch the captcha
    captcha_path = Path.base_path + u'/我是登陆知乎时的验证码.gif'
    with open(captcha_path, 'wb') as image:
        image.write(content)
    print u'请输入您所看到的验证码'
    print u'验证码在助手所处的文件夹中'
    print u'验证码位置:'
    print captcha_path
    print u'如果不需要输入验证码可点按回车跳过此步'
    captcha = raw_input()
    return captcha
def downloadPDF(self, name, url, isDesktop=True):
    if isDesktop:
        fileName = "/ink/work/62/ink/{name}.pdf".format(name=name)
    else:
        fileName = "{name}.pdf".format(name=name)
    if not os.path.exists(fileName):
        pdf = Http.get_content(url, timeout=180)
        with open(fileName, "wb") as f:
            # Http.get_content returns the whole body, so write it in one go instead of
            # iterating byte by byte as the original did.
            f.write(pdf)
        print("Done!")
def get_jianshu_question_list(self, target_url):
    u"""
    Fetch jianshu_info, the article count and the article list.
    :param target_url:
    :return: (article_num, article_list)
    """
    index_content = Http.get_content(target_url)
    parser = JianshuAuthorParser(index_content)
    self.question_list += parser.get_extra_info()
    article_num = self.question_list[0]['article_num']  # single author only, not a collection
    article_list = self.parse_get_article_list(index_content)
    return article_num, article_list
def findAnnalReports(self, stockid):
    url = u"http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/{stockid}/page_type/ndbg.phtml".format(
        stockid=stockid)
    r = Http.get_content(url)
    # The page is served as GBK; re-encode to UTF-8 before parsing.
    r = unicode(r, 'GBK').encode('UTF-8')
    bsObj = BeautifulSoup(r, "html.parser")
    result = bsObj.findAll("div", {"class": "datelist"})[0]
    dateList = re.findall(r'\d{4}-\d{2}-\d{2}', result.text)
    returnList = []
    for index, value in enumerate(result.findAll("a")):
        returnList.append([dateList[index], value.text,
                           "http://vip.stock.finance.sina.com.cn" + value["href"]])
    return returnList
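# Both this scraper and the JRJ report scraper further below re-encode GBK pages by hand. A
# minimal sketch of that recurring step as a helper (hypothetical name, assuming
# Http.get_content returns a raw byte string):
def gbk_to_utf8(raw):
    u"""Decode a GBK byte string and re-encode it as UTF-8 for BeautifulSoup."""
    return unicode(raw, 'GBK', 'ignore').encode('UTF-8')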
def get_captcha():
    content = Http.get_content('https://www.zhihu.com/captcha.gif')  # fetch the captcha
    captcha_path = Path.base_path + u'/我是登陆知乎时的验证码.gif'
    with open(captcha_path, 'wb') as image:
        image.write(content)
    print u'请输入您所看到的验证码'
    print u'验证码在助手所处的文件夹中'
    print u'验证码位置:'
    print captcha_path
    print u'如果不需要输入验证码可点按回车跳过此步'
    captcha = raw_input()
    return captcha
def create_work_set(self, target_url):
    u"""
    From the content of target_url (e.g. http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles),
    first extract creator_id, then derive the page count from the number of articles, open each
    page in turn and add every article url to work_set.
    :param target_url:
    :return:
    """
    if target_url in self.task_complete_set:
        return
    id_result = Match.jianshu(target_url)
    jianshu_id = id_result.group('jianshu_id')
    # ### The part below belongs in JianshuAuthorInfo (it fills jianshu_info); kept here for now.
    # TODO: only one address per line can be handled this way.
    content_profile = Http.get_content(target_url)
    parser = JianshuParser(content_profile)
    self.question_list += parser.get_jianshu_info_list()
    # ### End of the part that belongs in JianshuAuthorInfo.
    self.task_complete_set.add(target_url)
    article_num = self.question_list[0]['article_num']  # TODO: hard-coded; one address per line
    if article_num % 9 != 0:
        page_num = article_num / 9 + 1  # 9 article links per index page
    else:
        page_num = article_num / 9
    article_list = self.parse_get_article_list(content_profile)
    for item in article_list:
        self.work_set.add(item)
    for page in range(page_num - 1):  # the first page is already parsed
        url = 'http://www.jianshu.com/users/{}/latest_articles?page={}'.format(jianshu_id, page + 2)
        content_article_list = Http.get_content(url)
        article_list = self.parse_get_article_list(content_article_list)
        for item in article_list:
            self.work_set.add(item)
    return
def create_work_set(self, target_url):
    if target_url in self.task_complete_set:
        return
    content = Http.get_content(target_url + '/answers?order_by=vote_num')
    if not content:
        return
    self.task_complete_set.add(target_url)
    max_page = self.parse_max_page(content)
    for page in range(max_page):
        url = '{}/answers?order_by=vote_num&page={}'.format(target_url, page + 1)
        self.work_set.add(url)
    return
def worker(self, target_url):
    if target_url in self.work_complete_set:
        # Skip urls that have already been crawled successfully.
        return
    Debug.logger.info(u'开始抓取{}的内容'.format(target_url))
    content = Http.get_content(target_url)
    if not content:
        return
    content = Match.fix_html(content)  # fix <br> tags to avoid blowing the parser stack
    self.content_list.append(content)
    Debug.logger.debug(u'{}的内容抓取完成'.format(target_url))
    self.work_complete_set.add(target_url)
    return
def download(self, index):
    image = self.container[index]
    filename = image['filename']
    href = image['href']
    if os.path.isfile(self.save_path + '/' + filename):
        return
    Debug.logger.debug(u'开始下载图片{}'.format(href))
    content = Http.get_content(url=href, timeout=Config.timeout_download_picture)
    if not content:
        return
    with open(self.save_path + '/' + filename, 'wb') as image:
        image.write(content)
    return
def get_captcha():
    content = Http.get_content('https://www.zhihu.com/captcha.gif')  # fetch the captcha
    captcha_path = Path.base_path + u'/我是登陆知乎时的验证码.gif'
    with open(captcha_path, 'wb') as image:
        image.write(content)
    print u'请输入您所看到的验证码'
    print u'验证码在助手所处的文件夹中'
    print u'验证码位置:'
    print captcha_path
    if platform.system() == "Darwin":
        os.system(u'open "{}" &'.format(captcha_path).encode(sys.stdout.encoding))
    else:
        webbrowser.get().open_new_tab(u'file:///' + captcha_path)
    print u'如果不需要输入验证码可点按回车跳过此步'
    captcha = raw_input()
    return captcha
def download(self, index):
    image = self.container[index]
    filename = image['filename']
    href = image['href']
    # Swap https for http when downloading images to speed up the transfer.
    href = href.replace('https://', 'http://')
    if os.path.isfile(self.save_path + '/' + filename):
        return
    Debug.print_in_single_line(u'开始下载图片{}'.format(href))
    if href:
        content = Http.get_content(url=href, timeout=Config.timeout_download_picture)
        if not content:
            Debug.logger.debug(u'图片『{}』下载失败'.format(href))
            content = ''
        else:
            Debug.print_in_single_line(u'图片{}下载完成'.format(href))
    else:
        # An empty href means there is nothing to download.
        content = ''
    with open(self.save_path + '/' + filename, 'wb') as image:
        image.write(content)
    return
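# The download variants above differ only in how they mangle the href before fetching (strip an
# '@' suffix, rewrite https to http, or nothing). A sketch of the shared fetch-and-save step as
# a helper; `save_image` is a hypothetical name, assuming the same Http.get_content API:
def save_image(href, path, timeout):
    u"""Fetch href and write the body to path; returns True on success."""
    content = Http.get_content(url=href, timeout=timeout)
    if not content:
        return False
    with open(path, 'wb') as image_file:
        image_file.write(content)
    return True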
def catch(account_id):
    # The heart of the crawler.
    mock_sleep_time = 0.5
    base_sleep_time = 1
    max_sleep_time = 1
    article_url_index_list = []
    # Determine the maximum page number from ReadList.txt.
    url = 'http://www.jintiankansha.me/tag/{}?page=1'.format(account_id)
    column_info = JinWanKanSaEmptColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    dt = datetime.datetime.now()
    column_info[u'title'] = u"AI_{}".format(dt.strftime("%Y-%m-%d"))
    max_page = 1
    typeToTry = 'tag'
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if split_url.split('/')[-1] == account_id:
                dt = datetime.datetime.now()
                column_info[u'title'] = u"{}_{}".format(line.split('#')[1], dt.strftime("%Y-%m-%d"))
                max_page = int(line.split('#')[2])
                typeToTry = str(int(line.split('#')[-1])).strip('\n')
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    index_work_set = OrderedDict()
    # Collect the url of every index page.
    for raw_front_page_index in range(0, max_page + 1):
        request_url = u'http://www.jintiankansha.me/{}/{}?page={}'.format(typeToTry, account_id, raw_front_page_index)
        print request_url
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # iterate over a copy so deletion is safe
            catch_counter += 1
            Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'html.parser')
            list_p_list = soup.find_all('span', class_="item_title")
            for tgo_right in list_p_list:
                for link in tgo_right.findAll('a'):
                    href = link.get('href')
                    print href
                    if href is not None:
                        article_url_index_list.append(str(href))
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        index_work_set[article_url_index] = article_url_index
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():  # iterate over a copy so deletion is safe
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取 {countert} 号文章,剩余{article_count}篇".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            time.sleep(mock_sleep_time)
            if len(request_url_content) == 0:
                random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                time.sleep(random_sleep_time)
                continue
            article_info = JinWanKanSaArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
    return
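# The while/re_catch_counter loops in the catch() functions here and below all follow the same
# pattern: retry every pending url up to 20 rounds, removing each one once it is handled. A
# minimal sketch of that pattern as a reusable helper (hypothetical name `retry_catch`, not part
# of the existing codebase); `handle` returns True when the url has been processed successfully:
def retry_catch(work_set, handle, max_rounds=20):
    u"""Keep retrying every url in work_set until handled or max_rounds is exhausted."""
    round_counter = 0
    while len(work_set) > 0 and round_counter < max_rounds:
        round_counter += 1
        for key in work_set.keys():  # iterate over a copy so deletion is safe
            if handle(work_set[key]):
                del work_set[key]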
def catch(account_id):
    # The heart of the crawler.
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # Determine the page range from ReadList.txt.
    star_page = 1
    max_page = 1
    column_info = Todo1ColumnParser("").get_column_info()
    column_info[u'column_id'] = account_id
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if account_id in str(split_url):
                max_page = int(line.split('#')[-1]) + 1
                column_info[u'title'] = str(line.split('#')[1])
    print max_page
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    index_work_set = OrderedDict()
    # Collect the url of every index page.
    for raw_front_page_index in range(star_page, max_page):
        request_url = u'https://www.guancha.cn/{}/list_{}.shtml'.format(account_id, raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # iterate over a copy so deletion is safe
            catch_counter += 1
            Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('h4', class_="module-title")
            for p in list_p_list:
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    ttt = li.get('href')
                    print ttt
                    if ttt is not None:
                        ss = str(ttt).split('.')
                        article_url_index_list.append(u"https://www.guancha.cn{}_s.{}".format(ss[0], ss[1]))
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        index_work_set[article_url_index] = article_url_index
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():  # iterate over a copy so deletion is safe
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = Todo1ArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
    return
def start(self):
    print 'start JRJ_Report'
    stockList = []
    file_name = 'annual.txt'
    with open(file_name, 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            splits = line.split('#')
            code = str(splits[0])
            stock_name = str(splits[1]).strip()
            print stock_name
            stockList.append({'URL': code, 'NAME': stock_name})
    for xx in stockList:
        for raw_front_page_index in range(1, 8):
            fileN = str(xx['NAME']).strip()
            uux = xx['URL']
            sdPath = '/ink/work/62/ink/{}'.format(fileN)
            Path.mkdir(sdPath)
            url = u"http://istock.jrj.com.cn/yanbao_{}_p{}.html"
            request_url = url.format(uux, raw_front_page_index)
            content = Http.get_content(request_url)
            soup = BeautifulSoup(content, 'html.parser')
            list_p_list = soup.find_all('td', class_="left")
            for p in list_p_list:
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    xxurl = li.get('href')
                    if not 'http://istock.jrj.com.cn/list,yanbao.html' == xxurl:
                        time.sleep(1)
                        result = Http.get_content(xxurl)
                        result = unicode(str(result), 'GBK').encode('UTF-8')  # the page is GBK-encoded
                        xxsoup = BeautifulSoup(result, 'html.parser')
                        xxlist_p_list = xxsoup.find_all('p', class_='title')[0]
                        xxlist_ds = xxsoup.find_all('span', class_='fr')[0]
                        # Strip the floated date span from the title block, then parse the rest.
                        realu = str(xxlist_p_list).replace(str(xxlist_ds), '', 1)
                        realuxsoup = BeautifulSoup(realu, 'html.parser')
                        sp = str(realuxsoup.text).split(' ')
                        ttime = sp[1]
                        if '发表于' in ttime:
                            ttime = sp[2]
                        all_main = xxsoup.find_all('div', class_='main')[0]
                        realuxsoup = BeautifulSoup(str(all_main), 'html.parser')
                        reaupp = realuxsoup.find_all('p')
                        for pp in reaupp:
                            list_pcyc_li = pp.find_all('a')
                            for li in list_pcyc_li:
                                print li.text
                                ttt = li.get('href')
                                print ttt
                                fileName = u"{}_{}.pdf".format(ttime, str(li.text).replace('/', ""))
                                print fileName
                                basePath = '/ink/work/62/ink/{}/{}'.format(fileN, fileName)
                                Path.mkdirAndPath(basePath)
                                Debug.print_in_single_line(u'开始下载 {}'.format(ttt))
                                if ttt:
                                    content = Http.get_content(url=ttt, timeout=180)
                                    if not content:
                                        Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                        content = ''
                                    else:
                                        Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                                else:
                                    # An empty download url means there is nothing to fetch.
                                    content = ''
                                if len(content) > 10:
                                    with open(basePath, "wb") as pdf:
                                        pdf.write(content)
def catch(account_id):
    # The heart of the crawler.
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # Fetch the front page and build the column record.
    url = 'https://www.wuxiareview.com/category/{}'.format(account_id)
    front_page_content = Http.get_content(url)
    column_info = WuXiaColumnParser(front_page_content).get_column_info()
    column_info[u'column_id'] = account_id
    if account_id == 'daidai':
        column_info[u'title'] = "吃瓜群众岱岱"
        max_page = 1
    elif account_id == 'gzmdzst':
        column_info[u'title'] = "顾子明的政事堂"
        max_page = 1
    else:
        column_info[u'title'] = "时文"
        max_page = 2
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    index_work_set = OrderedDict()
    # Collect the url of every index page.
    for raw_front_page_index in range(0, max_page):
        request_url = u'https://www.wuxiareview.com/category/{}/{}/'.format(account_id, raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # iterate over a copy so deletion is safe
            catch_counter += 1
            Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('article', class_="excerpt")
            for p in list_p_list:
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    tarUrl = li.get('href')
                    ttt = str(tarUrl).split("#")[-1]
                    print ttt
                    if tarUrl is not None:
                        article_url_index_list.append(ttt)
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        index_work_set[article_url_index] = article_url_index
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():  # iterate over a copy so deletion is safe
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = WuXiaArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
    return
def catch(account_id):
    # The heart of the crawler.
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # Fetch the front page and build the column record.
    url = 'https://www.huxiu.com/{}'.format(account_id)
    front_page_content = Http.get_content(url)
    column_info = HuXiuColumnParser(front_page_content).get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = account_id
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    u_result = urllib.quote(account_id.decode(sys.stdin.encoding).encode('utf8'))
    print account_id
    max_page = 2
    idds = ''
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if split_url.split('/')[-1] == account_id:
                max_page = int(line.split('#')[-1]) + 1
                idds = str(line.split('#')[1])
    print max_page
    max_page = -1  # skip the search pages entirely; only the url appended below is crawled
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    index_work_set = OrderedDict()
    # Collect the url of every search result page,
    # e.g. https://www.huxiu.com/search.html?s=%E5%B7%B4%E8%8F%B2%E7%89%B9&sort=dateline:desc
    for raw_front_page_index in range(0, max_page + 1):
        request_url = u'https://www.huxiu.com/search.html?s={}&sort=dateline%3Adesc&per_page={}'.format(
            u_result, raw_front_page_index)
        # Alternative: u'https://www.huxiu.com/member/{}/article/{}.html'.format(idds, raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # iterate over a copy so deletion is safe
            catch_counter += 1
            Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, "lxml")
            list_pcyc_l_ = soup.find_all('li')
            for tgo_right in list_pcyc_l_:
                for link in tgo_right.findAll('a'):
                    hre = str(link.get('href'))
                    if hre.startswith('/article/', 0, 10):
                        print u'https://www.huxiu.com{}'.format(link.get('href'))
                        article_url_index_list.append('https://www.huxiu.com{}'.format(link.get('href')))
            del index_work_set[raw_front_page_index]
    article_url_index_list.append('https://www.huxiu.com/article/299355.html')
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        index_work_set[article_url_index] = article_url_index
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():  # iterate over a copy so deletion is safe
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = HuXiuArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
    return
def catch(account_id):
    # The heart of the crawler.
    mock_sleep_time = 0.5
    base_sleep_time = 1
    max_sleep_time = 1
    article_url_index_list = []
    column_info = WeiXinColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = account_id
    column_info[u'image_url'] = 'https://wpimg.wallstcn.com/3598b719-ab0d-4be7-bc09-30c3ae29a3cc.jpg?imageView2/1/w/240/h/240'
    max_page = 1
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    # The article urls are read from a local list file, one url per line.
    with open('/Users/0/Desktop/list.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            article_url_index_list.append(str(line).strip('\n'))
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        index_work_set[article_url_index] = article_url_index
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():  # iterate over a copy so deletion is safe
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取 {countert} 号文章,剩余{article_count}篇".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            time.sleep(mock_sleep_time)
            if len(request_url_content) == 0:
                random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                time.sleep(random_sleep_time)
                continue
            # Other parsers (Todo2ArticleParser, HuXiuArticleParser, WallStreetArticleParser)
            # can be swapped in here; this version targets WeChat articles.
            article_info = WeiXinArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
    return
def catch(account_id):
    # The heart of the crawler.
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    column_info = Todo3ColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = "新能源汽车"
    column_info['article_count'] = 0
    column_info['follower_count'] = 0
    column_info['description'] = ''
    column_info['image_url'] = ''
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    star_page = 1
    max_page = 1
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    index_work_set = OrderedDict()
    # Collect the url of every index page.
    for raw_front_page_index in range(star_page, max_page):
        request_url = u'https://post.smzdm.com/fenlei/xinnengyuanche/p{}/'.format(raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # iterate over a copy so deletion is safe
            catch_counter += 1
            Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('div', class_='list-border clearfix')
            for p in list_p_list:
                list_pcyc_li = p.find_all('a')
                li = list_pcyc_li[0]
                tarUrl = li.get('href')
                ttt = str(tarUrl).split("#")[-1]
                print ttt
                if tarUrl is not None:
                    article_url_index_list.append(ttt)
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        index_work_set[article_url_index] = article_url_index
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():  # iterate over a copy so deletion is safe
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = Todo3ArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
    return
def catch(account_id):
    # The heart of the crawler.
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # Fetch the front page and build the column record.
    url = 'http://www.gushequ.com/{}/'.format(account_id)
    front_page_content = Http.get_content(url)
    column_info = TodoColumnParser(front_page_content).get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = "股社区"
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    # Each account_id maps to a year, i.e. a page range of the shared archive.
    star_page = 0
    max_page = 24
    if account_id == '2018':
        star_page = 0
        max_page = 24
    elif account_id == '2017':
        star_page = 24
        max_page = 58
    elif account_id == '2016':
        star_page = 58
        max_page = 92
    Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
    index_work_set = OrderedDict()
    # Collect the url of every index page.
    for raw_front_page_index in range(star_page, max_page):
        request_url = u'http://www.gushequ.com/page/{}/'.format(raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for raw_front_page_index in index_work_set.keys():  # iterate over a copy so deletion is safe
            catch_counter += 1
            Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                raw_front_page_index=raw_front_page_index, max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('article')
            for p in list_p_list:
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    tarUrl = li.get('href')
                    ttt = str(tarUrl).split("#")[-1]
                    print ttt
                    if tarUrl is not None:
                        article_url_index_list.append(ttt)
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        index_work_set[article_url_index] = article_url_index
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in index_work_set.keys():  # iterate over a copy so deletion is safe
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = TodoArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
    return