def download(self, index):
    image = self.container[index]
    filename = image['filename']
    href = image['href']
    # filename = href.split('/')[-1]
    if os.path.isfile(self.save_path + '/' + filename):
        return
    print 'Downloading picture:' + href + ' filename ' + filename
    # urllib.urlretrieve(href, self.save_path + '/' + filename, cbk)
    if len(str(href)) < 300 and Match.isUrlOk(href):
        # Debug.print_in_single_line(u'Downloading picture: {}'.format(href))
        rely_url = str(href).split('@')[0]
        content = Http.get_content(url=rely_url, timeout=Config.timeout_download_picture)
    else:
        Debug.print_in_single_line(u"Href of the Picture seems wrong...")
        content = None
    if not content:
        return
    with open(self.save_path + '/' + filename, 'wb') as image_file:
        image_file.write(content)
    return
def login(self, account, password, captcha=''):
    content = Http.get_content('https://www.zhihu.com/')
    xsrf = Match.xsrf(content)
    if not xsrf:
        Debug.logger.info(u'登陆失败')
        Debug.logger.info(u'敲击回车重新发送登陆请求')
        return False
    xsrf = xsrf.split('=')[1]
    # add xsrf as a cookie into the cookieJar
    cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
    self.cookieJar.set_cookie(cookie)
    if captcha:
        post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True, 'captcha': captcha}
    else:
        post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True}
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate',
        # key header: as long as it is present, Zhihu treats the request as browser traffic rather than a script
        'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
        'Host': 'www.zhihu.com',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://www.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
    }
    result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
    if not result:
        Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
        return False
    response = json.loads(result)
    if response['r'] == 0:
        print u'登陆成功!'
        print u'登陆账号:', account
        print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认'
        if raw_input() == 'yes':
            Config.account, Config.password, Config.remember_account = account, password, True
            print u'帐号密码已保存,可通过修改config.json修改设置'
        else:
            Config.account, Config.password, Config.remember_account = '', '', False
            print u'跳过保存环节,进入下一流程'
        Config._save()
        cookie = self.get_cookie()
        # after a successful login, clear the old login records so a stale cookie is not picked up next time
        DB.execute('delete from LoginRecord')
        data = {}
        data['account'] = account
        data['password'] = password
        data['recordDate'] = ExtraTools.get_today()
        data['cookieStr'] = cookie
        DB.save(data, 'LoginRecord')
        DB.commit()
        return True
    else:
        print u'登陆失败'
        Debug.print_dict(response)
        return False
def download(self, index):
    image = self.container[index]
    filename = image['filename']
    href = image['href']
    # switch https to http so that the picture download is not slowed down by TLS
    href = href.replace('https://', 'http://')
    # skip the download when the picture already exists
    if os.path.isfile(self.save_path + '/' + filename):
        return
    Debug.print_in_single_line(u'开始下载图片 {}'.format(href))
    if href:
        content = Http.get_content(url=href, timeout=Config.timeout_download_picture)
        if not content:
            Debug.logger.debug(u'图片『{}』下载失败'.format(href))
            content = ''
        else:
            Debug.print_in_single_line(u'图片{}下载完成'.format(href))
    else:
        # nothing to download when the href is empty
        content = ''
    if len(content) > 10:
        with open(self.save_path + '/' + filename, 'wb') as image_file:
            image_file.write(content)
    return
def Help_ZipToEpub(Dir='.'):
    for p in os.listdir(Dir):
        if p == targetFileName or p == 'mimetype':
            Debug.print_in_single_line(u'该文件已添加,自动跳过')
            continue
        filepath = os.path.join(Dir, p)
        if not os.path.isfile(filepath):
            if p == '.' or p == '..':
                continue
            Help_ZipToEpub(Dir=filepath)
        else:
            Debug.print_in_single_line(u'将{}添加至电子书内'.format(filepath))
            epub.write(filepath, compress_type=zipfile.ZIP_STORED)
def start_worker(self): a = list(self.work_set) a.sort() argv = {"func": self.worker, "iterable": a} # 所有待存入数据库中的数据都应当是list Control.control_center(argv, self.work_set) Debug.logger.info(u"所有内容抓取完毕,开始对页面进行解析") i = 0 for content in self.content_list: i += 1 Debug.print_in_single_line(u"正在解析第{}/{}张页面".format(i, self.content_list.__len__())) self.parse_content(content) Debug.logger.info(u"网页内容解析完毕") return
def download(self, index):
    image = self.container[index]
    filename = image['filename']
    href = image['href']
    if os.path.isfile(self.save_path + '/' + filename):
        return
    Debug.print_in_single_line(u'开始下载图片{}'.format(href))
    content = Http.get_content(url=href, timeout=Config.timeout_download_picture)
    if not content:
        return
    with open(self.save_path + '/' + filename, 'wb') as image_file:
        image_file.write(content)
    return
def zip_to_epub(self):
    epub_name = self.title + u'.epub'
    file_path = EpubPath.output_path + '/' + epub_name
    EpubPath.reset_path()
    epub = zipfile.ZipFile(file=file_path, mode='w', compression=zipfile.ZIP_STORED, allowZip64=True)
    epub.write('./mimetype')
    for parent, dirnames, filenames in os.walk('.'):
        for filename in filenames:
            if filename in [epub_name, 'mimetype']:
                continue
            Debug.print_in_single_line(u'将{}添加至电子书内'.format(filename))
            epub.write(parent + '/' + filename, compress_type=zipfile.ZIP_STORED)
    epub.close()
    return
def zip_to_epub(self):
    epub_name = self.title + u'.epub'
    file_path = EpubPath.output_path + '/' + epub_name
    EpubPath.reset_path()
    epub = zipfile.ZipFile(file=file_path, mode='w', compression=zipfile.ZIP_STORED, allowZip64=True)
    epub.write('./mimetype')
    for parent, dirnames, filenames in os.walk('.'):
        for filename in filenames:
            if filename in [epub_name, 'mimetype']:
                continue
            Debug.print_in_single_line(u'add {} to e-book'.format(filename))
            epub.write(parent + '/' + filename, compress_type=zipfile.ZIP_STORED)
    epub.close()
    print u'\n\n', u'e-book', epub_name, u"has been built successfully!"
    return
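# Hedged aside (not part of the project code): the EPUB container format expects the
# 'mimetype' entry to be the very first file in the archive and to be stored without
# compression, which is why zip_to_epub() writes './mimetype' before walking the
# directory tree. A minimal standalone illustration using only the standard library;
# the file name 'minimal.epub' and the stub container.xml content are made up here.
import zipfile

def build_minimal_epub(path='minimal.epub'):
    book = zipfile.ZipFile(path, 'w', zipfile.ZIP_STORED, allowZip64=True)
    # first entry, stored (uncompressed), exactly as EPUB readers expect
    book.writestr('mimetype', 'application/epub+zip')
    # every other entry may follow in any order
    book.writestr('META-INF/container.xml', '<?xml version="1.0"?><container/>')
    book.close()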
def start_worker(self):
    a = list(self.work_set)
    a.sort()
    argv = {
        'func': self.worker,
        'iterable': a,  # everything that is going to be stored in the database should be a list
    }
    Control.control_center(argv, self.work_set)
    Debug.logger.info(u"所有内容抓取完毕,开始对页面进行解析")
    i = 0
    for content in self.content_list:
        i += 1
        Debug.print_in_single_line(u"正在解析第{}/{}张页面".format(i, len(self.content_list)))
        self.parse_content(content)
    Debug.logger.info(u"网页内容解析完毕")
    return
def start_worker(self): u""" work_set是所有的需要抓取的页面 :return: """ a = list(self.work_set) a.sort() argv = {'func': self.worker, # 所有待存入数据库中的数据都应当是list 'iterable': a, } Control.control_center(argv, self.work_set) Debug.logger.info(u"所有内容抓取完毕,开始对页面进行解析") i = 0 for content in self.content_list: i += 1 Debug.print_in_single_line(u"正在解析第{}/{}张页面".format(i, self.content_list.__len__())) self.parse_content(content) Debug.logger.info(u"网页内容解析完毕") return
def start(self):
    print ' 中文研报 '
    stockList = []
    for raw_front_page_index in range(1, 251):
        fileN = '策略'
        sdPath = '/ink/work/62/ink/{}'.format(fileN)
        Path.mkdir(sdPath)
        # example: http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HGYJ&cmd=4&code=&ps=50&p=2&js=var%20UxmjGoYW={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&
        burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=CLBG&cmd=4&code=&ps=50&p="
        # burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HGYJ&cmd=4&code=&ps=50&p="
        uu = u"&js=var%20GdYXcAjX={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&"
        url = '%s%s%s' % (burl, str(raw_front_page_index), uu)
        # print url
        content = Http.get_content(url)
        if content:
            try:
                jsonD = str(content).split('=')[-1]
                jdata = json.loads(jsonD)
                articles = jdata['data']
                for article in articles:
                    xxxs = str(article).split(',')
                    rticlet = xxxs[0]
                    preTitle = xxxs[5]
                    if str(preTitle).__contains__('川财') or str(preTitle).__contains__('或'):
                        continue
                    # if str(preTitle).__contains__('历史') or str(preTitle).__contains__('周期') or str(preTitle).__contains__('成长'):
                    # if str(preTitle).__contains__('政治') or str(preTitle).__contains__('中央经济') or str(preTitle).__contains__('贸易战'):
                    if str(preTitle).__contains__('日本'):
                        print preTitle
                        date_time = datetime.datetime.strptime(rticlet, '%Y/%m/%d %H:%M:%S')
                        infoCode = xxxs[1]
                        destU = u"http://data.eastmoney.com/report/{}/cl,{}.html".format(date_time.strftime('%Y%m%d'), infoCode)
                        print destU
                        result = Http.get_content(destU)
                        result = unicode(result, 'GBK').encode('UTF-8')
                        xxsoup = BeautifulSoup(result, 'html.parser')
                        title_tationl = xxsoup.find_all('h1')
                        tt = str(title_tationl[0].text).strip()
                        xxlist_p_list = xxsoup.find_all('div', class_='report-infos')[0]
                        sp = xxlist_p_list.find_all('span')
                        ttime = str((sp[1]).text)
                        date_time = datetime.datetime.strptime(ttime, '%Y年%m月%d日 %H:%M')
                        # print date_time.strftime('%Y-%m-%d')
                        ttime = date_time.strftime('%Y-%m-%d')
                        # print (sp[2]).text
                        # print (sp[3]).text
                        title = Match.replace_specile_chars(tt)
                        title = title.replace('/', '', 100)
                        # file name layout: date _ brokerage _ title _ author
                        fileName = u"{}_{}_{}_{}.pdf".format(ttime, (sp[2]).text, title, (sp[3]).text)
                        print fileName
                        urlsp = sp[-1]
                        basePath = '{}/{}'.format(sdPath, fileName)
                        # print basePath
                        # create the target directory before writing the pdf
                        list_pcyc_li = urlsp.find_all('a')
                        for li in list_pcyc_li:
                            ttt = li.get('href')
                            Path.mkdirAndPath(basePath)
                            print ttt
                            Debug.print_in_single_line(u'开始下载 {}'.format(ttt))
                            if ttt:
                                content = Http.get_content(url=ttt, timeout=180)
                                if not content:
                                    Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                    content = ''
                                else:
                                    Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                            else:
                                # no point downloading when the link is empty
                                content = ''
                            if len(content) > 10:
                                with open(basePath, "wb") as pdf:
                                    pdf.write(content)
            except Exception as e:
                print('next')
def login(self, account, password, captcha=''):
    content = Http.get_content('https://www.zhihu.com/')
    xsrf = Match.xsrf(content)
    if not xsrf:
        Debug.logger.info(u'登陆失败')
        Debug.logger.info(u'敲击回车重新发送登陆请求')
        return False
    xsrf = xsrf.split('=')[1]
    # add xsrf as a cookie into the cookieJar
    cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
    self.cookieJar.set_cookie(cookie)
    if captcha:
        post_data = {
            '_xsrf': xsrf,
            'email': account,
            'password': password,
            'remember_me': True,
            'captcha': captcha
        }
    else:
        post_data = {
            '_xsrf': xsrf,
            'email': account,
            'password': password,
            'remember_me': True
        }
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate',
        # key header: as long as it is present, Zhihu treats the request as browser traffic rather than a script
        'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
        'Host': 'www.zhihu.com',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://www.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
    }
    result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
    if not result:
        Debug.logger.info(u'登陆失败,请敲击回车重新登陆')
        return False
    response = json.loads(result)
    if response['r'] == 0:
        print u'登陆成功!'
        print u'登陆账号:', account
        print u'请问是否需要记住帐号密码?输入yes记住,输入其它任意字符跳过,回车确认'
        if raw_input() == 'yes':
            Config.account, Config.password, Config.remember_account = account, password, True
            print u'帐号密码已保存,可通过修改config.json修改设置'
        else:
            Config.account, Config.password, Config.remember_account = '', '', False
            print u'跳过保存环节,进入下一流程'
        Config._save()
        cookie = self.get_cookie()
        # after a successful login, clear the old login records so a stale cookie is not picked up next time
        DB.execute('delete from LoginRecord')
        data = {}
        data['account'] = account
        data['password'] = password
        data['recordDate'] = ExtraTools.get_today()
        data['cookieStr'] = cookie
        DB.save(data, 'LoginRecord')
        DB.commit()
        return True
    else:
        print u'登陆失败'
        Debug.print_dict(response)
        return False
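# Hedged usage sketch (hypothetical driver, not taken from the project): login()
# returns True on success and False otherwise, so a caller can retry once with a
# captcha. ensure_logged_in and its prompt are made-up names; fetching and showing
# the captcha image to the user is assumed to happen elsewhere in the project.
def ensure_logged_in(client, account, password):
    if client.login(account, password):
        return True
    captcha = raw_input('captcha: ')
    return client.login(account, password, captcha=captcha)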
        'src_info': './unit_html/collection.html',
        'parser': CollectionParser,
    },
    'private_collection': {
        'src_answer': './unit_html/private_collection.html',
        'src_info': './unit_html/private_collection.html',
        'parser': CollectionParser,
    },
}
if is_info:
    src = unit[kind]['src_info']
else:
    src = unit[kind]['src_answer']
content = open(src, 'r').read()
parser = unit[kind]['parser'](content)
if is_info:
    Debug.print_dict(parser.get_extra_info())
    print '----------------------'
    print '=========================='
else:
    for answer in parser.get_answer_list():
        Debug.print_dict(answer)
        print '----------------------'
    print '=========================='
    for question in parser.get_question_info_list():
        Debug.print_dict(question)
        print '----------------------'
        'parser': CollectionParser,
    },
    'private_collection': {
        'src_answer': './unit_html/private_collection.html',
        'src_info': './unit_html/private_collection.html',
        'parser': CollectionParser,
    },
}
if is_info:
    src = unit[kind]['src_info']
else:
    src = unit[kind]['src_answer']
content = open(src, 'r').read()
parser = unit[kind]['parser'](content)
if is_info:
    Debug.print_dict(parser.get_extra_info())
    print '----------------------'
    print '=========================='
else:
    for answer in parser.get_answer_list():
        Debug.print_dict(answer)
        print '----------------------'
    print '=========================='
    for question in parser.get_question_info_list():
        Debug.print_dict(question)
        print '----------------------'
def start(self):
    print 'start JRJ_Report'
    stockList = []
    file_name = 'annual.txt'
    with open(file_name, 'r') as read_list:
        read_list = read_list.readlines()
        resultsL = len(read_list)
        for x in range(0, resultsL):
            line = read_list[x]
            splits = line.split('#')
            code = str(splits[0])
            fieName = str(splits[1]).strip()
            print fieName
            stockList.append({'URL': code, 'NAME': fieName})
    for xx in stockList:
        for raw_front_page_index in range(1, 8):
            fileN = str(xx['NAME']).strip()
            uux = xx['URL']
            sdPath = '/ink/work/62/ink/{}'.format(fileN)
            Path.mkdir(sdPath)
            url = u"http://istock.jrj.com.cn/yanbao_{}_p{}.html"
            request_url = url.format(uux, raw_front_page_index)
            content = Http.get_content(request_url)
            soup = BeautifulSoup(content, 'html.parser')
            list_p_list = soup.find_all('td', class_="left")
            for p in list_p_list:
                # print p
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    xxurl = li.get('href')
                    # print xxurl
                    if xxurl != 'http://istock.jrj.com.cn/list,yanbao.html':
                        time.sleep(1)
                        result = Http.get_content(xxurl)
                        result = unicode(str(result), 'GBK').encode('UTF-8')
                        xxsoup = BeautifulSoup(result, 'html.parser')
                        # title_tationl = xxsoup.find_all('h1')
                        # tt = str(title_tationl[0].text).strip()
                        xxlist_p_list = xxsoup.find_all('p', class_='title')[0]
                        xxlist_ds = xxsoup.find_all('span', class_='fr')[0]
                        realu = str(xxlist_p_list).replace(str(xxlist_ds), '', 1)
                        realuxsoup = BeautifulSoup(realu, 'html.parser')
                        sp = str(realuxsoup.text).split(' ')
                        ttime = sp[1]
                        if ttime.__contains__('发表于'):
                            ttime = sp[2]
                        # print ttime
                        all_main = xxsoup.find_all('div', class_='main')[0]
                        realuxsoup = BeautifulSoup(str(all_main), 'html.parser')
                        reaupp = realuxsoup.find_all('p')
                        for pp in reaupp:
                            list_pcyc_li = pp.find_all('a')
                            for li in list_pcyc_li:
                                print li.text
                                ttt = li.get('href')
                                print ttt
                                fileName = u"{}_{}.pdf".format(ttime, str(li.text).replace('/', ""))
                                print fileName
                                basePath = '/ink/work/62/ink/{}/{}'.format(fileN, fileName)
                                Path.mkdirAndPath(basePath)
                                Debug.print_in_single_line(u'开始下载 {}'.format(ttt))
                                if ttt:
                                    content = Http.get_content(url=ttt, timeout=180)
                                    if not content:
                                        Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                        content = ''
                                    else:
                                        Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                                else:
                                    # no point downloading when the link is empty
                                    content = ''
                                if len(content) > 10:
                                    with open(basePath, "wb") as pdf:
                                        pdf.write(content)
def create_book(self):
    # work out the output file name
    title = Match.fix_filename(self.book_title)
    if self.is_split:
        title = self.book_title + u'_卷{}'.format(self.chapter_no)
    # switch to the temporary e-book resource directory first
    Path.chdir(Path.book_pool_path)
    epub = Epub(title)
    for task_result in self.task_result_list:
        chapter_src = ''
        # info_page
        if task_result.task.task_type == Type.question:
            chapter_src = self.generate_question_info_page(task_result.info_page)
        elif task_result.task.task_type == Type.answer:
            chapter_src = self.generate_question_info_page(task_result.info_page)
        elif task_result.task.task_type == Type.collection:
            chapter_src = self.generate_collection_info_page(task_result.info_page)
        elif task_result.task.task_type == Type.topic:
            chapter_src = self.generate_topic_info_page(task_result.info_page)
        elif task_result.task.task_type == Type.author:
            chapter_src = self.generate_author_info_page(task_result.info_page)
        elif task_result.task.task_type == Type.column:
            task_result.info_page.article_count = len(task_result.column_list[0].article_list)
            chapter_src = self.generate_column_info_page(task_result.info_page)
        elif task_result.task.task_type == Type.article:
            chapter_src = self.generate_article_info_page(task_result.info_page)
        epub.create_chapter(chapter_src, task_result.get_title())
        for question in task_result.question_list:
            # add the image files used by the question
            for filename in question.img_filename_list:
                epub.add_image(Path.image_pool_path + '/' + filename)
            question_src = self.generate_question_page(question)
            epub.add_html(question_src, question.question_info.title)
        for column in task_result.column_list:
            # add the image files used by the column
            for filename in column.img_filename_list:
                epub.add_image(Path.image_pool_path + '/' + filename)
            for article in column.article_list:
                article_src = self.generate_article_page(article)
                epub.add_html(article_src, article.title)
        epub.finish_chapter()
    href = self.task_result_list[0].info_page.image_url
    if len(href) > 0:
        print href
    if href:
        content = Http.get_content(url=href, timeout=Config.timeout_download_picture)
        if not content:
            Debug.logger.debug(u'图片『{}』下载失败'.format(href))
            content = ''
        else:
            Debug.print_in_single_line(u'图片{}下载完成'.format(href))
    else:
        # nothing to download when the cover url is empty
        content = ''
    if len(content) > 10:
        filename = Path.image_pool_path + '/' + 'cover.jpg'
        with open(filename, 'wb') as image:
            image.write(content)
        epub.add_cover_image(filename)
    else:
        epub.add_cover_image('/Users/ex-liyan010/Desktop/cover.png')
    epub.set_creator(u'macbookpro2100')
    epub.set_language(u'zh-cn')
    epub.set_book_id()
    epub.set_output_path(Path.result_path)
    epub.add_css(Path.base_path + u'/www/css/markdown.css')
    epub.add_css(Path.base_path + u'/www/css/customer.css')
    epub.add_css(Path.base_path + u'/www/css/normalize.css')
    epub.add_css(Path.base_path + u'/www/css/bootstrap.css')
    epub.create()
    Path.reset_path()
    return
def start(self):
    print 'start JRJ_Report'
    stockList = []
    stockList.append({'URL': '1', 'NAME': '宏观研究'})
    # stockList.append({'URL': '8', 'NAME': '策略趋势'})
    for xx in stockList:
        for raw_front_page_index in range(5, 50):
            print '开始第' + str(raw_front_page_index) + '页面 下载'
            fileN = str(xx['NAME']).strip()
            uux = xx['URL']
            sdPath = '/Volumes/MacintoshHD/File/{}'.format(fileN)
            Path.mkdir(sdPath)
            url = u"http://istock.jrj.com.cn/yanbao_{}_p{}.html"
            request_url = url.format(uux, raw_front_page_index)
            content = Http.get_content(request_url)
            soup = BeautifulSoup(content, 'html.parser')
            list_p_list = soup.find_all('div', class_="yb_con")
            for p in list_p_list:
                # print p
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    xxurl = li.get('href')
                    # print xxurl
                    if xxurl != 'http://istock.jrj.com.cn/list,yanbao.html':
                        try:
                            result = Http.get_content(xxurl)
                            result = unicode(str(result), 'GBK').encode('UTF-8')
                            xxsoup = BeautifulSoup(result, 'html.parser')
                            # title_tationl = xxsoup.find_all('h1')
                            # tt = str(title_tationl[0].text).strip()
                            xxlist_p_list = xxsoup.find_all('p', class_='title')[0]
                            xxlist_ds = xxsoup.find_all('span', class_='fr')[0]
                            realu = str(xxlist_p_list).replace(str(xxlist_ds), '', 1)
                            realuxsoup = BeautifulSoup(realu, 'html.parser')
                            sp = str(realuxsoup.text).split(' ')
                            ttime = sp[1]
                            if ttime.__contains__('发表于'):
                                ttime = sp[2]
                            # print ttime
                            all_main = xxsoup.find_all('div', class_='main')[0]
                            realuxsoup = BeautifulSoup(str(all_main), 'html.parser')
                            reaupp = realuxsoup.find_all('p')
                            for pp in reaupp:
                                list_pcyc_li = pp.find_all('a')
                                for li in list_pcyc_li:
                                    print li.text
                                    ttt = li.get('href')
                                    # print ttt
                                    ftype = 'pdf'
                                    if str(ttt).endswith('.xlsx'):
                                        ftype = 'xlsx'
                                    fileName = u"{}_{}.{}".format(ttime, str(li.text).replace('/', ""), ftype)
                                    print fileName
                                    basePath = '/ink/work/62/ink/{}/{}'.format(fileN, fileName)
                                    Path.mkdirAndPath(basePath)
                                    Debug.print_in_single_line(u'开始下载 {} '.format(ttt))
                                    if ttt:
                                        content = Http.get_content(url=ttt, timeout=180)
                                        if not content:
                                            # Debug.logger.debug(u'文件『{}』下载失败'.format(ttt))
                                            content = ''
                                        else:
                                            Debug.print_in_single_line(u'文件 {} 下载完成'.format(ttt))
                                    else:
                                        # no point downloading when the link is empty
                                        content = ''
                                    if not os.path.exists(basePath):
                                        if len(content) > 10:
                                            with open(basePath, "wb") as pdf:
                                                pdf.write(content)
                        except Exception as e:
                            print 'Exception ' + e.message
def start(self):
    print 'start 东财研报'
    stockList = []
    file_name = 'annual.txt'
    with open(file_name, 'r') as read_list:
        read_list = read_list.readlines()
        resultsL = len(read_list)
        for x in range(0, resultsL):
            line = read_list[x]
            splits = line.split('#')
            code = str(splits[0])
            fieName = str(splits[1]).strip()
            print fieName
            stockList.append({'URL': code, 'NAME': fieName})
    for xx in stockList:
        for raw_front_page_index in range(1, 5):
            fileN = str(xx['NAME']).strip()
            uux = xx['URL']
            sdPath = '/ink/work/62/ink/{}'.format(fileN)
            Path.mkdir(sdPath)
            # example: http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20LhAYbcgn={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&ps=25&p=1&code=000333&rt=51734025
            burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20LhAYbcgn={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&ps=25&"
            uu = u"p={0}&code={1}&rt="
            url = '%s%s' % (burl, uu.format(raw_front_page_index, uux))
            content = Http.get_content(url)
            if content:
                jsonD = str(content).split('=')[-1]
                jdata = json.loads(jsonD)
                articles = jdata['data']
                for article in articles:
                    rticlet = article['datetime']
                    date_time = datetime.datetime.strptime(rticlet, '%Y-%m-%dT%H:%M:%S')
                    destU = u"http://data.eastmoney.com/report/{}/{}.html".format(date_time.strftime('%Y%m%d'), article['infoCode'])
                    result = Http.get_content(destU)
                    result = unicode(result, 'GBK').encode('UTF-8')
                    xxsoup = BeautifulSoup(result, 'html.parser')
                    title_tationl = xxsoup.find_all('h1')
                    tt = str(title_tationl[0].text).strip()
                    xxlist_p_list = xxsoup.find_all('div', class_='report-infos')[0]
                    sp = xxlist_p_list.find_all('span')
                    ttime = str((sp[1]).text)
                    date_time = datetime.datetime.strptime(ttime, '%Y年%m月%d日 %H:%M')
                    # print date_time.strftime('%Y-%m-%d')
                    ttime = date_time.strftime('%Y-%m-%d')
                    # print (sp[2]).text
                    # print (sp[3]).text
                    title = Match.replace_specile_chars(tt)
                    title = title.replace('/', '', 100)
                    # file name layout: date _ brokerage _ title _ author
                    fileName = u"{}_{}_{}_{}.pdf".format(ttime, (sp[2]).text, title, (sp[3]).text)
                    print fileName
                    urlsp = sp[-1]
                    basePath = '{}/{}'.format(sdPath, fileName)
                    # print basePath
                    # create the target directory before writing the pdf
                    list_pcyc_li = urlsp.find_all('a')
                    for li in list_pcyc_li:
                        ttt = li.get('href')
                        Path.mkdirAndPath(basePath)
                        print ttt
                        Debug.print_in_single_line(u'开始下载 {}'.format(ttt))
                        if ttt:
                            content = Http.get_content(url=ttt, timeout=180)
                            if not content:
                                Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                content = ''
                            else:
                                Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                        else:
                            # no point downloading when the link is empty
                            content = ''
                        if len(content) > 10:
                            with open(basePath, "wb") as pdf:
                                pdf.write(content)
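# Hedged aside: the datainterface.eastmoney.com js.aspx endpoint answers with a
# JavaScript assignment rather than bare JSON, roughly of the form
#   var LhAYbcgn={"data":[...],"pages":"2","count":"40"}
# which is why start() splits the response body on '=' before calling json.loads().
# A standalone illustration with a made-up payload:
import json

sample = 'var LhAYbcgn={"data": [{"infoCode": "AP000000001", "datetime": "2019-01-02T00:00:00"}], "pages": "1"}'
payload = json.loads(sample.split('=', 1)[-1])  # drop the 'var NAME' prefix, keep the JSON object
print payload['data'][0]['infoCode']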
def start(self):
    print 'start 东财股吧 研报'
    stockList = []
    file_name = 'annual.txt'
    with open(file_name, 'r') as read_list:
        read_list = read_list.readlines()
        resultsL = len(read_list)
        for x in range(0, resultsL):
            line = read_list[x]
            splits = line.split('#')
            code = str(splits[0])
            fieName = str(splits[1]).strip()
            print fieName
            stockList.append({'URL': code, 'NAME': fieName})
    for xx in stockList:
        for raw_front_page_index in range(1, 3):
            fileN = str(xx['NAME']).strip()
            uux = xx['URL']
            sdPath = '/ink/work/62/ink/{}'.format(fileN)
            Path.mkdir(sdPath)
            burl = u"http://guba.eastmoney.com/list,{},2,f_{}.html"
            content = Http.get_content(burl.format(uux, raw_front_page_index))
            xxsoup = BeautifulSoup(content, 'html.parser')
            tagrt = xxsoup.find_all('div', id='articlelistnew')[0]
            ols = tagrt.find_all('div', class_='articleh normal_post')
            olss = tagrt.find_all('div', class_='articleh normal_post odd')
            splicy = []
            for xxos in ols:
                splicy.append(xxos)
            for xxos in olss:
                splicy.append(xxos)
            for inkl in splicy:
                try:
                    inklinkl = BeautifulSoup(str(inkl), 'html.parser')
                    spp = inklinkl.find_all('span', class_='l3')[0]
                    list_pcyc_li = spp.find_all('a')
                    for li in list_pcyc_li:
                        ttt = li.get('href')
                        print ttt
                        destU = u'http://guba.eastmoney.com{}'.format(ttt)
                        result = Http.get_content(destU)
                        # result = unicode(result, 'GBK').encode('UTF-8')
                        xxsoup = BeautifulSoup(result, 'html.parser')
                        title_tationl = xxsoup.find_all('div', id='zwconttbt')
                        tt = str(title_tationl[0].text).strip()
                        print tt
                        title = Match.replace_specile_chars(tt)
                        title = title.replace('/', '', 100)
                        title = title.replace('查看原文', '')
                        ttime = xxsoup.find_all('p', class_='publishdate')[0]
                        tttttime = str(ttime.text)[-10:]
                        print tttttime
                        date_time = datetime.datetime.strptime(tttttime, '%Y-%m-%d')
                        # print date_time.strftime('%Y-%m-%d')
                        ttime = date_time.strftime('%Y-%m-%d')
                        # file name layout: date _ title
                        fileName = u"{}_{}.pdf".format(ttime, title)
                        print fileName
                        basePath = '{}/{}'.format(sdPath, fileName)
                        # print basePath
                        # create the target directory before writing the pdf
                        spx = xxsoup.find_all('span', class_='zwtitlepdf')[0]
                        pdfu = spx.find_all('a')
                        for li in pdfu:
                            ttt = li.get('href')
                            print ttt
                            Path.mkdirAndPath(basePath)
                            Debug.print_in_single_line(u'开始下载 {}'.format(ttt))
                            if ttt:
                                content = Http.get_content(url=ttt, timeout=180)
                                if not content:
                                    Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                    content = ''
                                else:
                                    Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                            else:
                                # no point downloading when the link is empty
                                content = ''
                            if len(content) > 10:
                                with open(basePath, "wb") as pdf:
                                    pdf.write(content)
                except Exception as e:
                    print('next')