def download(self, index):
        image = self.container[index]
        filename = image['filename']
        href = image['href']
        # filename=href.split('/')[-1]

        if os.path.isfile(self.save_path + '/' + filename):
            return
        print 'Downloading picture:' + href + '   filename   ' + filename

        # urllib.urlretrieve(href, self.save_path + '/' + filename, cbk)

        if len(str(href)) < 300 and Match.isUrlOk(href):
            # Debug.print_in_single_line(u'Downloading picture: {}'.format(href))
            rely_url = str(href).split('@')[0]
            content = Http.get_content(url=rely_url,
                                       timeout=Config.timeout_download_picture)
        else:
            Debug.print_in_single_line(u"Href of the Picture seems wrong...")
            content = None
        if not content:
            return
        with open(self.save_path + '/' + filename, 'wb') as image:
            image.write(content)
        return
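The download methods in these examples all follow the same skip-if-exists pattern: bail out early when the target file is already on disk, fetch the image with a timeout, and only write non-empty content. A minimal stand-alone sketch of that pattern using just the Python 2 standard library (the Http, Config and Match helpers above are project-specific; every name below is illustrative):

import os
import urllib2

def download_image(save_dir, filename, href, timeout=30):
    """Fetch href into save_dir/filename unless the file is already there."""
    target = os.path.join(save_dir, filename)
    if os.path.isfile(target):
        return  # already downloaded, nothing to do
    try:
        content = urllib2.urlopen(href, timeout=timeout).read()
    except Exception:
        return  # treat network errors as "skip this picture", like the code above
    if content:
        with open(target, 'wb') as image:
            image.write(content)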
Example #2
    def login(self, account, password, captcha=''):
        content = Http.get_content('https://www.zhihu.com/')
        xsrf = Match.xsrf(content)
        if not xsrf:
            Debug.logger.info(u'Login failed')
            Debug.logger.info(u'Press Enter to resend the login request')
            return False
        xsrf = xsrf.split('=')[1]
        # add xsrf as cookie into cookieJar,
        cookie = Http.make_cookie(name='_xsrf', value=xsrf, domain='www.zhihu.com')
        self.cookieJar.set_cookie(cookie)
        if captcha:
            post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True,
                         'captcha': captcha}
        else:
            post_data = {'_xsrf': xsrf, 'email': account, 'password': password, 'remember_me': True}

        header = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip,deflate',  # key header: with it present, Zhihu treats the request as coming from a browser rather than a script
            'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
            'Host': 'www.zhihu.com',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
            'Origin': 'https://www.zhihu.com',
            'Referer': 'https://www.zhihu.com/',
        }
        result = Http.get_content(url=r'https://www.zhihu.com/login/email', data=post_data, extra_header=header)
        if not result:
            Debug.logger.info(u'Login failed, press Enter to log in again')
            return False
        response = json.loads(result)

        if response['r'] == 0:
            print u'Login succeeded!'
            print u'Logged-in account:', account
            print u'Remember this account and password? Type yes to save, anything else to skip, then press Enter'
            if raw_input() == 'yes':
                Config.account, Config.password, Config.remember_account = account, password, True
                print u'Account and password saved; edit config.json to change this setting'
            else:
                Config.account, Config.password, Config.remember_account = '', '', False
                print u'Skipping the save step, moving on'
            Config._save()
            cookie = self.get_cookie()
            DB.execute('delete from LoginRecord')  # after a successful login, clear old login records so the next login does not pick up stale ones
            data = {}
            data['account'] = account
            data['password'] = password
            data['recordDate'] = ExtraTools.get_today()
            data['cookieStr'] = cookie
            DB.save(data, 'LoginRecord')
            DB.commit()
            return True
        else:
            print u'Login failed'
            Debug.print_dict(response)
            return False
    def download(self, index):
        image = self.container[index]
        filename = image['filename']
        href = image['href']
        #   switch https to http before downloading, to speed the image download up
        href = href.replace('https://', 'http://')
        # skip the download if the image already exists
        if os.path.exists(self.save_path + '/' + filename):
            return
        else:
            if os.path.isfile(self.save_path + '/' + filename):
                return

            Debug.print_in_single_line(u'Start downloading image  {}'.format(href))
            if href:
                content = Http.get_content(
                    url=href, timeout=Config.timeout_download_picture)
                if not content:
                    Debug.logger.debug(u'Image "{}" failed to download'.format(href))
                    content = ''
                else:
                    Debug.print_in_single_line(u'Image {} downloaded'.format(href))
            else:
                #   when the download address is empty there is nothing to download
                content = ''
            if len(content) > 10:
                with open(self.save_path + '/' + filename, 'wb') as image:
                    image.write(content)
            return
Example #4
def Help_ZipToEpub(Dir='.'):
    for p in os.listdir(Dir):
        if p == targetFileName or p == 'mimetype':
            Debug.print_in_single_line(u'File already added, skipping it')
            continue
        filepath = os.path.join(Dir, p)
        if not os.path.isfile(filepath):
            if p == '.' or p == '..':
                continue
            Help_ZipToEpub(Dir=filepath)
        else:
            Debug.print_in_single_line(u'Adding {} to the e-book'.format(filepath))
            epub.write(filepath, compress_type=zipfile.ZIP_STORED)
Example #5
    def start_worker(self):
        a = list(self.work_set)
        a.sort()
        argv = {"func": self.worker, "iterable": a}  # everything that will be stored in the database should be a list
        Control.control_center(argv, self.work_set)
        Debug.logger.info(u"All content has been fetched, starting to parse the pages")
        i = 0
        for content in self.content_list:
            i += 1
            Debug.print_in_single_line(u"Parsing page {}/{}".format(i, len(self.content_list)))
            self.parse_content(content)
        Debug.logger.info(u"Page content parsed")
        return
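Control.control_center is a project-specific scheduler, but the overall shape of start_worker is generic: fetch every item in work_set concurrently, then walk the collected pages and parse them one by one. A rough stand-alone sketch of that shape with a standard-library thread pool (all names are illustrative, and unlike the code above the worker here returns each page instead of storing it on self):

from multiprocessing.dummy import Pool  # thread pool; fine for I/O-bound fetching

def crawl_and_parse(work_set, worker, parse_content, threads=8):
    pool = Pool(threads)
    try:
        content_list = pool.map(worker, sorted(work_set))  # fetch all pages concurrently
    finally:
        pool.close()
        pool.join()
    for i, content in enumerate(content_list, 1):
        print('parsing page {}/{}'.format(i, len(content_list)))
        parse_content(content)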
Example #6
    def download(self, index):
        image = self.container[index]
        filename = image['filename']
        href = image['href']

        if os.path.isfile(self.save_path + '/' + filename):
            return
        Debug.print_in_single_line(u'Start downloading image {}'.format(href))
        content = Http.get_content(url=href, timeout=Config.timeout_download_picture)
        if not content:
            return
        with open(self.save_path + '/' + filename, 'wb') as image:
            image.write(content)
        return
Example #7
    def zip_to_epub(self):
        epub_name = self.title + u'.epub'
        file_path = EpubPath.output_path + '/' + epub_name
        EpubPath.reset_path()
        epub = zipfile.ZipFile(file=file_path, mode='w', compression=zipfile.ZIP_STORED, allowZip64=True)
        epub.write('./mimetype')
        for parent, dirnames, filenames in os.walk('.'):
            for filename in filenames:
                if filename in [epub_name, 'mimetype']:
                    continue
                Debug.print_in_single_line(u'Adding {} to the e-book'.format(filename))
                epub.write(parent + '/' + filename, compress_type=zipfile.ZIP_STORED)
        epub.close()
        return
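The mimetype handling in zip_to_epub is dictated by the EPUB container format: 'mimetype' must be the very first entry in the archive and must be stored uncompressed, which is why it is written before the os.walk loop and why ZIP_STORED is used. A minimal sketch of building such an archive from scratch (paths and names are illustrative; the examples above keep every entry uncompressed, although the non-mimetype entries could safely use ZIP_DEFLATED):

import os
import zipfile

def build_epub(epub_path, source_dir):
    epub = zipfile.ZipFile(epub_path, 'w', zipfile.ZIP_STORED, allowZip64=True)
    # the uncompressed mimetype entry has to come first
    epub.writestr('mimetype', 'application/epub+zip')
    for parent, dirnames, filenames in os.walk(source_dir):
        for filename in filenames:
            if filename == 'mimetype':
                continue
            full_path = os.path.join(parent, filename)
            arcname = os.path.relpath(full_path, source_dir)
            epub.write(full_path, arcname, compress_type=zipfile.ZIP_STORED)
    epub.close()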
Example #8
    def download(self, index):
        image = self.container[index]
        filename = image['filename']
        href = image['href']

        if os.path.isfile(self.save_path + '/' + filename):
            return
        Debug.print_in_single_line(u'Start downloading image {}'.format(href))
        content = Http.get_content(url=href,
                                   timeout=Config.timeout_download_picture)
        if not content:
            return
        with open(self.save_path + '/' + filename, 'wb') as image:
            image.write(content)
        return
Example #9
    def zip_to_epub(self):
        epub_name = self.title + u'.epub'
        file_path = EpubPath.output_path + '/' + epub_name
        EpubPath.reset_path()
        epub = zipfile.ZipFile(file=file_path, mode='w', compression=zipfile.ZIP_STORED, allowZip64=True)
        epub.write('./mimetype')
        for parent, dirnames, filenames in os.walk('.'):
            for filename in filenames:
                if filename in [epub_name, 'mimetype']:
                    continue
                Debug.print_in_single_line(u'add {} to e-book'.format(filename))
                epub.write(parent + '/' + filename, compress_type=zipfile.ZIP_STORED)
        epub.close()

        print u'\n\n', u'e-book', epub_name, u"has been built successfully!"
        return
Example #10
    def start_worker(self):
        a = list(self.work_set)
        a.sort()
        argv = {
            'func': self.worker,  # everything that will be stored in the database should be a list
            'iterable': a,
        }
        Control.control_center(argv, self.work_set)
        Debug.logger.info(u"All content has been fetched, starting to parse the pages")
        i = 0
        for content in self.content_list:
            i += 1
            Debug.print_in_single_line(u"Parsing page {}/{}".format(
                i, len(self.content_list)))
            self.parse_content(content)
        Debug.logger.info(u"Page content parsed")
        return
Example #11
    def zip_to_epub(self):
        epub_name = self.title + u'.epub'
        file_path = EpubPath.output_path + '/' + epub_name
        EpubPath.reset_path()
        epub = zipfile.ZipFile(file=file_path,
                               mode='w',
                               compression=zipfile.ZIP_STORED,
                               allowZip64=True)
        epub.write('./mimetype')
        for parent, dirnames, filenames in os.walk('.'):
            for filename in filenames:
                if filename in [epub_name, 'mimetype']:
                    continue
                Debug.print_in_single_line(u'Adding {} to the e-book'.format(filename))
                epub.write(parent + '/' + filename,
                           compress_type=zipfile.ZIP_STORED)
        epub.close()
        return
Example #12
    def start_worker(self):
        u"""
        work_set is the full set of pages that need to be crawled
        :return:
        """
        a = list(self.work_set)
        a.sort()
        argv = {'func': self.worker,  # everything that will be stored in the database should be a list
                'iterable': a, }
        Control.control_center(argv, self.work_set)
        Debug.logger.info(u"All content has been fetched, starting to parse the pages")
        i = 0
        for content in self.content_list:
            i += 1
            Debug.print_in_single_line(u"Parsing page {}/{}".format(i, len(self.content_list)))
            self.parse_content(content)
        Debug.logger.info(u"Page content parsed")
        return
    def start(self):
        print ' Chinese research reports '

        stockList = []


        for raw_front_page_index in range(1, 251):
            fileN = '策略'  # 'Strategy' report category; used as the download folder name

            sdPath = '/ink/work/62/ink/{}'.format(fileN)
            Path.mkdir(sdPath)

       #    http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HGYJ&cmd=4&code=&ps=50&p=2&js=var%20UxmjGoYW={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&
            burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=CLBG&cmd=4&code=&ps=50&p="
            # burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HGYJ&cmd=4&code=&ps=50&p="
            uu = u"&js=var%20GdYXcAjX={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&"

            url = '%s%s%s' % (burl, str(raw_front_page_index), uu)

            # print url

            content = Http.get_content(url)

            if content:
                try:
                    jsonD = str(content).split('=')[-1]

                    jdata = json.loads(jsonD)
                    articles = jdata['data']
                    for article in articles:

                        xxxs = str(article).split(',')
                        rticlet = xxxs[0]

                        preTitle = xxxs[5]

                        if str(preTitle).__contains__('川财') or str(preTitle).__contains__('或'):  # skip titles mentioning '川财' (Chuancai Securities) or containing '或'
                            continue

                        # if str(preTitle).__contains__('历史') or str(preTitle).__contains__('周期')or str(preTitle).__contains__('成长'):
                        # if str(preTitle).__contains__('政治') or str(preTitle).__contains__('中央经济')or str(preTitle).__contains__('贸易战'):
                        if str(preTitle).__contains__('日本'):  # only keep reports whose title mentions '日本' (Japan)
                            print preTitle
                            date_time = datetime.datetime.strptime(rticlet, '%Y/%m/%d %H:%M:%S')

                            infoCode = xxxs[1]
                            destU = u"http://data.eastmoney.com/report/{}/cl,{}.html ".format(
                                date_time.strftime('%Y%m%d'), infoCode)

                            print destU

                            result = Http.get_content(destU)
                            result = unicode(result, 'GBK').encode('UTF-8')

                            xxsoup = BeautifulSoup(result, 'html.parser')

                            title_tationl = xxsoup.find_all('h1')
                            tt = str(title_tationl[0].text).strip()

                            xxlist_p_list = xxsoup.find_all('div', class_='report-infos')[0]

                            sp = xxlist_p_list.find_all('span')

                            ttime = str((sp[1]).text)

                            date_time = datetime.datetime.strptime(ttime, '%Y年%m月%d日 %H:%M')

                            # print date_time.strftime('%Y-%m-%d')

                            ttime = date_time.strftime('%Y-%m-%d')

                            # print (sp[2]).text
                            # print (sp[3]).text

                            title = Match.replace_specile_chars(tt)
                            title = title.replace('/', '', 100)

                            fileName = u"{}_{}_{}_{}.pdf".format(ttime, (sp[2]).text, title, (sp[3]).text)
                            # time, brokerage, report title, author

                            print fileName

                            urlsp = sp[-1]

                            basePath = '{}/{}'.format(sdPath, fileName)

                            # print basePath

                            # create the target directory

                            list_pcyc_li = urlsp.find_all('a')
                            for li in list_pcyc_li:
                                ttt = li.get('href')
                                Path.mkdirAndPath(basePath)
                                print ttt

                                Debug.print_in_single_line(u'Start downloading   {}'.format(ttt))
                                if ttt:
                                    content = Http.get_content(url=ttt, timeout=180)
                                    if not content:
                                        Debug.logger.debug(u'pdf "{}" failed to download'.format(ttt))
                                        content = ''
                                    else:
                                        Debug.print_in_single_line(u'pdf {} downloaded'.format(ttt))
                                else:
                                    #   when the download address is empty there is nothing to download
                                    content = ''
                                if len(content) > 10:
                                    with open(basePath, "wb") as pdf:
                                        pdf.write(content)



                except Exception as e:
                    print('next')
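The datainterface.eastmoney.com endpoint used above does not return plain JSON; it answers with a JavaScript assignment of roughly the form var GdYXcAjX={"data":[...],"pages":"...","update":"...","count":"..."}, which is why the code strips everything up to the '=' before calling json.loads. A small illustration of that unwrapping step (the payload below is made up; splitting with a maxsplit of 1 is slightly safer than split('=')[-1], since the JSON body could itself contain an '='):

import json

raw = 'var GdYXcAjX={"data":["2020/01/01 00:00:00,CODE,x,x,x,Some title"],"pages":"10","update":"x","count":"250"}'
payload = json.loads(raw.split('=', 1)[1])  # drop the "var NAME=" prefix
print(payload['pages'])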
Example #14
    def login(self, account, password, captcha=''):
        content = Http.get_content('https://www.zhihu.com/')
        xsrf = Match.xsrf(content)
        if not xsrf:
            Debug.logger.info(u'Login failed')
            Debug.logger.info(u'Press Enter to resend the login request')
            return False
        xsrf = xsrf.split('=')[1]
        # add xsrf as cookie into cookieJar,
        cookie = Http.make_cookie(name='_xsrf',
                                  value=xsrf,
                                  domain='www.zhihu.com')
        self.cookieJar.set_cookie(cookie)
        if captcha:
            post_data = {
                '_xsrf': xsrf,
                'email': account,
                'password': password,
                'remember_me': True,
                'captcha': captcha
            }
        else:
            post_data = {
                '_xsrf': xsrf,
                'email': account,
                'password': password,
                'remember_me': True
            }

        header = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip,deflate',  # key header: with it present, Zhihu treats the request as coming from a browser rather than a script
            'Accept-Language': 'zh,zh-CN;q=0.8,en-GB;q=0.6,en;q=0.4',
            'Host': 'www.zhihu.com',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36(KHTML, like Gecko)Chrome/34.0.1847.116 Safari/537.36',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
            'Origin': 'https://www.zhihu.com',
            'Referer': 'https://www.zhihu.com/',
        }
        result = Http.get_content(url=r'https://www.zhihu.com/login/email',
                                  data=post_data,
                                  extra_header=header)
        if not result:
            Debug.logger.info(u'Login failed, press Enter to log in again')
            return False
        response = json.loads(result)

        if response['r'] == 0:
            print u'Login succeeded!'
            print u'Logged-in account:', account
            print u'Remember this account and password? Type yes to save, anything else to skip, then press Enter'
            if raw_input() == 'yes':
                Config.account, Config.password, Config.remember_account = account, password, True
                print u'Account and password saved; edit config.json to change this setting'
            else:
                Config.account, Config.password, Config.remember_account = '', '', False
                print u'Skipping the save step, moving on'
            Config._save()
            cookie = self.get_cookie()
            DB.execute(
                'delete from LoginRecord')  # after a successful login, clear old login records so the next login does not pick up stale ones
            data = {}
            data['account'] = account
            data['password'] = password
            data['recordDate'] = ExtraTools.get_today()
            data['cookieStr'] = cookie
            DB.save(data, 'LoginRecord')
            DB.commit()
            return True
        else:
            print u'Login failed'
            Debug.print_dict(response)
            return False
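Both login examples implement the same flow: GET the home page, extract the _xsrf token, plant it as a cookie, POST it together with the credentials to /login/email, and treat r == 0 in the JSON reply as success. A compressed sketch of that flow, assuming the third-party requests library and using a regex as a rough stand-in for Match.xsrf (illustrative only, not a drop-in replacement for the code above):

import json
import re
import requests

def login(account, password, captcha=''):
    session = requests.Session()  # keeps the _xsrf cookie between the two requests
    home = session.get('https://www.zhihu.com/').text
    match = re.search(r'name="_xsrf" value="([^"]+)"', home)  # stand-in for Match.xsrf
    if not match:
        return False
    post_data = {'_xsrf': match.group(1), 'email': account,
                 'password': password, 'remember_me': True}
    if captcha:
        post_data['captcha'] = captcha
    reply = session.post('https://www.zhihu.com/login/email', data=post_data,
                         headers={'X-Requested-With': 'XMLHttpRequest'})
    return json.loads(reply.text).get('r') == 0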
Example #15
        'src_info': './unit_html/collection.html',
        'parser': CollectionParser,
    },
    'private_collection': {
        'src_answer': './unit_html/private_collection.html',
        'src_info': './unit_html/private_collection.html',
        'parser': CollectionParser,
    },
}
if is_info:
    src = unit[kind]['src_info']
else:
    src = unit[kind]['src_answer']

content = open(src, 'r').read()
parser = unit[kind]['parser'](content)

if is_info:
    Debug.print_dict(parser.get_extra_info())
    print '----------------------'
    print '=========================='
else:
    for answer in parser.get_answer_list():
        Debug.print_dict(answer)
        print '----------------------'
    print '=========================='

    for question in parser.get_question_info_list():
        Debug.print_dict(question)
        print '----------------------'
Example #16
        'parser': CollectionParser,
    },
    'private_collection': {
        'src_answer':'./unit_html/private_collection.html',
        'src_info':'./unit_html/private_collection.html',
        'parser':CollectionParser,
    },
}
if is_info:
    src = unit[kind]['src_info']
else:
    src = unit[kind]['src_answer']

content = open(src, 'r').read()
parser = unit[kind]['parser'](content)


if is_info:
    Debug.print_dict(parser.get_extra_info())
    print '----------------------'
    print '=========================='
else:
    for answer in parser.get_answer_list():
        Debug.print_dict(answer)
        print '----------------------'
    print '=========================='

    for question in parser.get_question_info_list():
        Debug.print_dict(question)
        print '----------------------'
    def start(self):
        print 'start JRJ_Report'

        stockList = []
        file_name = 'annual.txt'

        with open(file_name, 'r') as read_list:
            read_list = read_list.readlines()

            resultsL = read_list.__len__()
            for x in range(0, resultsL):
                line = read_list[x]
                splits = line.split('#')
                code = (str)(splits[0])
                fieName = (str)(splits[1]).strip()
                print fieName
                stockList.append({'URL': code, 'NAME': fieName})

        for xx in stockList:

            for raw_front_page_index in range(1, 8):

                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/ink/work/62/ink/{}'.format(fileN)

                Path.mkdir(sdPath)

                url = u"http://istock.jrj.com.cn/yanbao_{}_p{}.html"

                request_url = url.format(uux, raw_front_page_index)
                content = Http.get_content(request_url)

                soup = BeautifulSoup(content, 'html.parser')

                list_p_list = soup.find_all('td', class_="left")

                for p in list_p_list:
                    # print p

                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        xxurl = li.get('href')
                        # print xxurl

                        if not 'http://istock.jrj.com.cn/list,yanbao.html' == xxurl:

                            time.sleep(1)
                            result = Http.get_content(xxurl)
                            result = unicode(str(result),
                                             'GBK').encode('UTF-8')

                            xxsoup = BeautifulSoup(result, 'html.parser')

                            # title_tationl = xxsoup.find_all('h1')
                            # tt = str(title_tationl[0].text).strip()

                            xxlist_p_list = xxsoup.find_all('p',
                                                            class_='title')[0]
                            xxlist_ds = xxsoup.find_all('span', class_='fr')[0]

                            realu = str(xxlist_p_list).replace(
                                str(xxlist_ds), '', 1)

                            realuxsoup = BeautifulSoup(realu, 'html.parser')

                            sp = str(realuxsoup.text).split(' ')

                            ttime = sp[1]

                            if ttime.__contains__('发表于'):
                                ttime = sp[2]

                            # print (sp[2]).text
                            # print (sp[3]).text

                            # print ttime

                            all_main = xxsoup.find_all('div', class_='main')[0]

                            realuxsoup = BeautifulSoup(str(all_main),
                                                       'html.parser')

                            reaupp = realuxsoup.find_all('p')

                            for pp in reaupp:
                                list_pcyc_li = pp.find_all('a')

                                for li in list_pcyc_li:
                                    print li.text
                                    ttt = li.get('href')

                                    print ttt

                                    fileName = u"{}_{}.pdf".format(
                                        ttime,
                                        str(li.text).replace('/', ""))

                                    print fileName

                                    basePath = '/ink/work/62/ink/{}/{}'.format(
                                        fileN, fileName)

                                    Path.mkdirAndPath(basePath)

                                    Debug.print_in_single_line(u'Start downloading   {}'.format(ttt))
                                    if ttt:
                                        content = Http.get_content(url=ttt, timeout=180)
                                        if not content:
                                            Debug.logger.debug(u'pdf "{}" failed to download'.format(ttt))
                                            content = ''
                                        else:
                                            Debug.print_in_single_line(u'pdf {} downloaded'.format(ttt))
                                    else:
                                        #   when the download address is empty there is nothing to download
                                        content = ''
                                    if len(content) > 10:
                                        with open(basePath, "wb") as pdf:
                                            pdf.write(content)
Example #18
    def create_book(self):
        #   determine the file info
        title = Match.fix_filename(self.book_title)
        if self.is_split:
            title = self.book_title + u'_卷{}'.format(self.chapter_no)  # '卷' = volume; appended when the book is split into parts

        #   first switch to the e-book's temporary resource directory
        Path.chdir(Path.book_pool_path)
        epub = Epub(title)
        for task_result in self.task_result_list:
            chapter_src = ''
            # info_page
            if task_result.task.task_type == Type.question:
                chapter_src = self.generate_question_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.answer:
                chapter_src = self.generate_question_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.collection:
                chapter_src = self.generate_collection_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.topic:
                chapter_src = self.generate_topic_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.author:
                chapter_src = self.generate_author_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.column:

                task_result.info_page.article_count = (
                    task_result.column_list[0].article_list).__len__()

                chapter_src = self.generate_column_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.article:
                chapter_src = self.generate_article_info_page(
                    task_result.info_page)
            epub.create_chapter(chapter_src, task_result.get_title())
            for question in task_result.question_list:
                #   add the image files
                for filename in question.img_filename_list:
                    epub.add_image(Path.image_pool_path + '/' + filename)
                question_src = self.generate_question_page(question)
                epub.add_html(question_src, question.question_info.title)

            for column in task_result.column_list:
                #   add the image files
                for filename in column.img_filename_list:
                    epub.add_image(Path.image_pool_path + '/' + filename)
                for article in column.article_list:
                    article_src = self.generate_article_page(article)
                    epub.add_html(article_src, article.title)
            epub.finish_chapter()

        href = self.task_result_list[0].info_page.image_url
        if len(href) > 0:
            print href

            if href:
                content = Http.get_content(
                    url=href, timeout=Config.timeout_download_picture)
                if not content:
                    Debug.logger.debug(u'Image "{}" failed to download'.format(href))
                    content = ''
                else:
                    Debug.print_in_single_line(u'Image {} downloaded'.format(href))
            else:
                #   when the download address is empty there is nothing to download
                content = ''
            if len(content) > 10:
                filename = Path.image_pool_path + '/' + 'cover.jpg'
                with open(filename, 'wb') as image:
                    image.write(content)

                epub.add_cover_image(filename)

        else:
            epub.add_cover_image('/Users/ex-liyan010/Desktop/cover.png')
            # epub.add_cover_image('/Users/ex-liyan010/Desktop/cover.png')

        epub.set_creator(u'macbookpro2100')
        epub.set_language(u'zh-cn')
        epub.set_book_id()
        epub.set_output_path(Path.result_path)
        epub.add_css(Path.base_path + u'/www/css/markdown.css')
        epub.add_css(Path.base_path + u'/www/css/customer.css')
        epub.add_css(Path.base_path + u'/www/css/normalize.css')
        epub.add_css(Path.base_path + u'/www/css/bootstrap.css')
        epub.create()

        Path.reset_path()
        return
    def start(self):
        print 'start JRJ_Report'

        stockList = []

        stockList.append({'URL': '1', 'NAME': '宏观研究'})
        # stockList.append({'URL': '8', 'NAME': '策略趋势'})

        for xx in stockList:

            for raw_front_page_index in range(5, 50):

                print 'Start downloading page ' + str(raw_front_page_index)

                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/Volumes/MacintoshHD/File/{}'.format(fileN)

                Path.mkdir(sdPath)

                url = u"http://istock.jrj.com.cn/yanbao_{}_p{}.html"

                request_url = url.format(uux, raw_front_page_index)
                content = Http.get_content(request_url)

                soup = BeautifulSoup(content, 'html.parser')

                list_p_list = soup.find_all('div', class_="yb_con")

                for p in list_p_list:
                    # print p

                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        xxurl = li.get('href')
                        # print xxurl

                        if not 'http://istock.jrj.com.cn/list,yanbao.html' == xxurl:

                            try:

                                result = Http.get_content(xxurl)
                                result = unicode(str(result), 'GBK').encode('UTF-8')

                                xxsoup = BeautifulSoup(result, 'html.parser')

                                # title_tationl = xxsoup.find_all('h1')
                                # tt = str(title_tationl[0].text).strip()

                                xxlist_p_list = xxsoup.find_all('p', class_='title')[0]
                                xxlist_ds = xxsoup.find_all('span', class_='fr')[0]

                                realu = str(xxlist_p_list).replace(str(xxlist_ds), '', 1)

                                realuxsoup = BeautifulSoup(realu, 'html.parser')

                                sp = str(realuxsoup.text).split(' ')

                                ttime = sp[1]

                                if ttime.__contains__('发表于'):
                                    ttime = sp[2]

                                # print (sp[2]).text
                                # print (sp[3]).text

                                # print ttime

                                all_main = xxsoup.find_all('div', class_='main')[0]

                                realuxsoup = BeautifulSoup(str(all_main), 'html.parser')

                                reaupp = realuxsoup.find_all('p')

                                for pp in reaupp:
                                    list_pcyc_li = pp.find_all('a')

                                    for li in list_pcyc_li:
                                        print li.text
                                        ttt = li.get('href')

                                        # print ttt

                                        ftype = 'pdf'

                                        if str(ttt).endswith('.xlsx'):
                                            ftype = 'xlsx'

                                        fileName = u"{}_{}.{}".format(ttime, str(li.text).replace('/', ""), ftype)

                                        print fileName

                                        basePath = '/ink/work/62/ink/{}/{}'.format(fileN, fileName)

                                        Path.mkdirAndPath(basePath)

                                        Debug.print_in_single_line(u'Start downloading   {}  '.format(ttt))
                                        if ttt:
                                            content = Http.get_content(url=ttt, timeout=180)
                                            if not content:
                                                # Debug.logger.debug(u'File "{}" failed to download'.format(ttt))
                                                content = ''
                                            else:
                                                Debug.print_in_single_line(u'File {} downloaded'.format(ttt))
                                        else:
                                            #   when the download address is empty there is nothing to download
                                            content = ''
                                        if not os.path.exists(basePath):  # skip files that have already been saved
                                            if len(content) > 10:
                                                with open(basePath, "wb") as pdf:
                                                    pdf.write(content)
                            except Exception as e:
                                print 'Exception ' + str(e)
    def start(self):
        print 'start EastMoney research reports'

        stockList = []
        file_name = 'annual.txt'

        with open(file_name, 'r') as read_list:
            read_list = read_list.readlines()

            resultsL = read_list.__len__()
            for x in range(0, resultsL):
                line = read_list[x]
                splits = line.split('#')
                code = (str)(splits[0])
                fieName = (str)(splits[1]).strip()
                print fieName
                stockList.append({'URL': code, 'NAME': fieName})

        for xx in stockList:
            for raw_front_page_index in range(1, 5):
                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/ink/work/62/ink/{}'.format(fileN)
                Path.mkdir(sdPath)
                # url = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20LhAYbcgn={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&ps=25&p=1&code=000333&rt=51734025"

                burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20LhAYbcgn={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&ps=25&"
                uu = u"p={0}&code={1}&rt="

                url = '%s%s' % (burl, uu.format(raw_front_page_index, uux))

                content = Http.get_content(url)

                if content:
                    jsonD = str(content).split('=')[-1]

                    jdata = json.loads(jsonD)
                    articles = jdata['data']
                    for article in articles:
                        rticlet = article['datetime']

                        date_time = datetime.datetime.strptime(rticlet, '%Y-%m-%dT%H:%M:%S')
                        destU = u"http://data.eastmoney.com/report/{}/{}.html ".format(date_time.strftime('%Y%m%d'),
                                                                                       article['infoCode'])

                        result = Http.get_content(destU)
                        result = unicode(result, 'GBK').encode('UTF-8')

                        xxsoup = BeautifulSoup(result, 'html.parser')

                        title_tationl = xxsoup.find_all('h1')
                        tt = str(title_tationl[0].text).strip()

                        xxlist_p_list = xxsoup.find_all('div', class_='report-infos')[0]

                        sp = xxlist_p_list.find_all('span')

                        ttime = str((sp[1]).text)

                        date_time = datetime.datetime.strptime(ttime, '%Y年%m月%d日 %H:%M')

                        # print date_time.strftime('%Y-%m-%d')

                        ttime = date_time.strftime('%Y-%m-%d')

                        # print (sp[2]).text
                        # print (sp[3]).text

                        title = Match.replace_specile_chars(tt)
                        title = title.replace('/', '', 100)

                        fileName = u"{}_{}_{}_{}.pdf".format(ttime, (sp[2]).text, title, (sp[3]).text)
                        # time, brokerage, report title, author

                        print fileName

                        urlsp = sp[-1]

                        basePath = '{}/{}'.format(sdPath, fileName)

                        # print basePath

                        # create the target directory

                        list_pcyc_li = urlsp.find_all('a')
                        for li in list_pcyc_li:
                            ttt = li.get('href')
                            Path.mkdirAndPath(basePath)
                            print ttt

                            Debug.print_in_single_line(u'Start downloading   {}'.format(ttt))
                            if ttt:
                                content = Http.get_content(url=ttt, timeout=180)
                                if not content:
                                    Debug.logger.debug(u'pdf "{}" failed to download'.format(ttt))
                                    content = ''
                                else:
                                    Debug.print_in_single_line(u'pdf {} downloaded'.format(ttt))
                            else:
                                #   when the download address is empty there is nothing to download
                                content = ''
                            if len(content) > 10:
                                with open(basePath, "wb") as pdf:
                                    pdf.write(content)
    def start(self):
        print 'start EastMoney Guba research reports'

        stockList = []
        file_name = 'annual.txt'

        with open(file_name, 'r') as read_list:
            read_list = read_list.readlines()

            resultsL = read_list.__len__()
            for x in range(0, resultsL):
                line = read_list[x]
                splits = line.split('#')
                code = (str)(splits[0])
                fieName = (str)(splits[1]).strip()
                print fieName
                stockList.append({'URL': code, 'NAME': fieName})

        for xx in stockList:
            for raw_front_page_index in range(1, 3):
                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/ink/work/62/ink/{}'.format(fileN)
                Path.mkdir(sdPath)

                burl = u"http://guba.eastmoney.com/list,{},2,f_{}.html"

                content = Http.get_content(burl.format(uux, raw_front_page_index))

                xxsoup = BeautifulSoup(content, 'html.parser')

                tagrt = xxsoup.find_all('div', id='articlelistnew')[0]

                ols = tagrt.find_all('div', class_='articleh normal_post')
                olss = tagrt.find_all('div', class_='articleh normal_post odd')

                splicy = []

                for xxos in ols:
                    splicy.append(xxos)
                for xxos in olss:  # fixed: the original loop reused the stale xxos and shadowed the outer xx
                    splicy.append(xxos)

                for inkl in splicy:

                    try:

                        inklinkl = BeautifulSoup(str(inkl), 'html.parser')

                        spp = inklinkl.find_all('span', class_='l3')[0]

                        list_pcyc_li = spp.find_all('a')
                        for li in list_pcyc_li:
                            ttt = li.get('href')

                            print ttt

                            destU = u'http://guba.eastmoney.com{}'.format(ttt)

                            result = Http.get_content(destU)
                            # result = unicode(result, 'GBK').encode('UTF-8')

                            xxsoup = BeautifulSoup(result, 'html.parser')

                            title_tationl = xxsoup.find_all('div', id='zwconttbt')
                            tt = str(title_tationl[0].text).strip()
                            print tt

                            title = Match.replace_specile_chars(tt)
                            title = title.replace('/', '', 100)

                            title = title.replace('查看原文', '')  # strip the "view original" label from the title

                            ttime = xxsoup.find_all('p', class_='publishdate')[0]

                            tttttime = str(ttime.text)[-10:]

                            print  tttttime

                            date_time = datetime.datetime.strptime(tttttime, '%Y-%m-%d')

                            # print date_time.strftime('%Y-%m-%d')

                            ttime = date_time.strftime('%Y-%m-%d')

                            fileName = u"{}_{}.pdf".format(ttime, title)
                            # time, brokerage, report title, author

                            print fileName

                            basePath = '{}/{}'.format(sdPath, fileName)

                            # print basePath

                            # create the target directory
                            #

                            spx = xxsoup.find_all('span', class_='zwtitlepdf')[0]

                            pdfu = spx.find_all('a')
                            for li in pdfu:
                                ttt = li.get('href')

                                print ttt
                                Path.mkdirAndPath(basePath)

                                Debug.print_in_single_line(u'Start downloading   {}'.format(ttt))
                                if ttt:
                                    content = Http.get_content(url=ttt, timeout=180)
                                    if not content:
                                        Debug.logger.debug(u'pdf "{}" failed to download'.format(ttt))
                                        content = ''
                                    else:
                                        Debug.print_in_single_line(u'pdf {} downloaded'.format(ttt))
                                else:
                                    #   when the download address is empty there is nothing to download
                                    content = ''
                                if len(content) > 10:
                                    with open(basePath, "wb") as pdf:
                                        pdf.write(content)
                    except Exception as e:
                        print('next')
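Every scraper above ends with the same tail: check that the href is non-empty, fetch it with a long timeout, and only write the file when the body is longer than 10 bytes. That tail could be factored into a single helper; a sketch with the project's Http/Debug helpers swapped for the Python 2 standard library (all names here are illustrative):

import os
import urllib2

def save_report(href, base_path, timeout=180, min_size=10):
    """Fetch href and write it to base_path; skip empty links and tiny/failed responses."""
    if not href:
        return False  # no download address, nothing to do
    try:
        content = urllib2.urlopen(href, timeout=timeout).read()
    except Exception:
        content = ''
    if len(content) <= min_size:
        return False  # treat very small bodies as failed downloads, like the checks above
    parent = os.path.dirname(base_path)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)
    with open(base_path, 'wb') as pdf:
        pdf.write(content)
    return True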