def _get_rank_fund_info(self):
        '''
        Collect the rank_fund records of the funds listed on Tiantian Fund
        (fund.eastmoney.com), one ranking page at a time.

        Pages self.page_num_start (inclusive) to self.page_num_end (exclusive)
        are requested through self.my_phantomjs, with a 2.5 s pause after each
        parsed page.

        :return: a list of dicts, one per fund, keyed by the column names
                 listed in field_index below. A page whose rankData payload
                 cannot be located is skipped; a payload without a 'datas'
                 entry ends the crawl and the records gathered so far are
                 returned.
        '''
        url = 'http://fund.eastmoney.com/data/rankhandler.aspx'

        # Loop invariants: compile the patterns once instead of once per page.
        body_pattern = re.compile('<body>(.*)</body>')
        rank_data_pattern = re.compile(r'rankData = (.*);')

        # (output key, position inside the comma separated record)
        field_index = (
            ('基金代码', 0),
            ('基金简称', 1),
            ('当天日期', 3),
            ('单位净值', 4),
            ('累计净值', 5),
            ('日增长率', 6),
            ('近1周', 7),
            ('近1月', 8),
            ('近3月', 9),
            ('近6月', 10),
            ('近1年', 11),
            ('近2年', 12),
            ('近3年', 13),
            ('今年来', 14),
            ('成立来', 15),
            ('手续费', 20),
        )

        # Query window: the same calendar day one year ago up to today.
        # The clock is read once so that every page uses the same window and
        # the year/month/day parts cannot come from different instants.
        now = get_shanghai_time()
        end_date = str(now)[:10]
        try:
            one_year_ago = datetime.datetime(year=now.year - 1,
                                             month=now.month,
                                             day=now.day)
        except ValueError:
            # Only Feb 29 can fail here: the previous year has no such day,
            # so fall back to Feb 28.
            one_year_ago = datetime.datetime(year=now.year - 1,
                                             month=now.month,
                                             day=now.day - 1)
        start_date = str(one_year_ago)[:10]

        rank_fund_list = []
        for page_num in range(self.page_num_start, self.page_num_end):
            print('正在抓取第{0}页的基金信息...'.format(page_num))
            print('开始时间: {0}, 结束时间: {1}'.format(start_date, end_date))

            params = (
                ('op', 'ph'),
                ('dt', 'kf'),
                ('ft', 'all'),
                ('rs', ''),
                ('gs', '0'),
                ('sc', 'zzf'),
                ('st', 'desc'),
                ('sd', start_date),  # e.g. '2017-07-10'
                ('ed', end_date),  # e.g. '2018-07-10'
                ('qdii', ''),
                ('tabSubtype', ',,,,,'),
                ('pi', str(page_num)),  # page index of the ranking data
                ('pn', '50'),
                ('dx', '1'),
            )

            # TODO: a plain requests call is answered with 502 here, which is
            # why the page is fetched through the phantomjs driver.
            body = self.my_phantomjs.get_url_body(
                url=_get_url_contain_params(url, params))

            try:
                body = body_pattern.findall(body)[0]
                this_page_rank_data = rank_data_pattern.findall(body)[0]
            except IndexError:
                print('在获取this_page_rank_data时索引异常!请检查!')
                continue

            # The payload is a JS object literal with unquoted property names;
            # json.loads rejects it ("Expecting property name enclosed in
            # double quotes"), so demjson is used instead.
            this_page_rank_data = demjson.decode(this_page_rank_data).get(
                'datas', {})
            if this_page_rank_data == {}:
                # No 'datas' entry: stop crawling, but keep the pages that
                # were already collected instead of discarding them.
                break

            for item in this_page_rank_data:
                fields = item.split(',')
                try:
                    fund = {name: fields[idx] for name, idx in field_index}
                except IndexError:
                    # A truncated record must not abort the whole crawl.
                    print('基金数据字段不足, 已跳过: {0}'.format(item))
                    continue
                rank_fund_list.append(fund)

            sleep(2.5)

        print('\n抓取完毕!\n')

        return rank_fund_list
    def _get_one_fund_info(self, fund_code):
        '''
        Download the pingzhongdata script of one fund and pass it on for
        processing.

        :param fund_code: fund code inserted into the pingzhongdata URL
        :return: True, once the body has been handed to
                 self._get_this_fund_info
        '''
        # NOTE(review): presumably captured from a browser session. The
        # decoded result is not sent with the driver request below; it only
        # mattered for the plain-requests variant.
        cookie_pairs = [
            ('st_pvi', '11586003301354'),
            ('st_si', '46806950936799'),
            ('ASP.NET_SessionId', 'fhllwae2zicg00o0x4ub1fxs'),
            ('EMFUND1', 'null'),
            ('EMFUND0', 'null'),
            ('EMFUND2', '07-10 18:01:38@#$华润元大现金通货币B@#$002884'),
            ('EMFUND3', '07-10 18:01:48@#$天弘现金管家货币B@#$420106'),
            ('EMFUND4', '07-10 18:11:53@#$方正富邦保险主题指数分级@#$167301'),
            ('EMFUND5', '07-10 18:04:32@#$招商中证银行指数分级@#$161723'),
            ('EMFUND6', '07-10 18:05:13@#$天弘中证银行指数C@#$001595'),
            ('EMFUND7', '07-10 18:06:13@#$天弘中证银行指数A@#$001594'),
            ('EMFUND8', '07-10 18:11:22@#$申万菱信多策略灵活配置混合A@#$001148'),
            ('EMFUND9', '07-10 18:12:26@#$广发生物科技指数(QDII)@#$001092'),
        ]
        decoded_cookies = unquote_cookies(dict(cookie_pairs))

        # Not used by the driver request either; kept next to the cookies.
        request_headers = dict([
            ('Accept-Encoding', 'gzip, deflate'),
            ('Accept-Language', 'zh-CN,zh;q=0.9'),
            ('Accept', '*/*'),
            ('Proxy-Connection', 'keep-alive'),
        ])

        # Current Shanghai time with its separators removed,
        # e.g. 2018-07-10 18:30:46 -> 20180710183046
        timestamp = re.sub(r'-| |:', '', str(get_shanghai_time()))
        query = (('v', timestamp), )

        script_url = 'http://fund.eastmoney.com/pingzhongdata/{0}.js'.format(fund_code)

        # The page is fetched through the phantomjs driver.
        full_url = _get_url_contain_params(script_url, query)
        script_body = self.my_phantomjs.get_url_body(url=full_url)
        self._get_this_fund_info(body=script_body)

        return True
Example #3
0
    def get_home_page_info_by_page_num(self, page_num: int) -> list:
        """
        Fetch one page of the doutula article list and parse its entries.

        :param page_num: page number sent as the ``page`` query parameter
        :return: list of dicts with the keys 'title', 'create_time' and
            'article_img_list' (a list of {'img_name', 'img_url'} dicts);
            empty when nothing could be parsed
        """
        def parse_page_info(body) -> list:
            """
            Extract the article entries from the page body.

            Entries with an empty title, an empty creation time or no images
            are dropped.

            :param body: page source returned by the driver
            :return: list of parsed entries (see the outer docstring)
            """
            # one node per article entry
            li_sel = {
                'method': 'css',
                'selector': 'div.center-wrap a.random_list',
            }
            title_sel = {
                'method': 'css',
                'selector': 'div.random_title ::text',
            }
            create_time_sel = {
                'method': 'css',
                'selector': 'div.date ::text',
            }
            article_img_url_sel = {
                'method': 'css',
                'selector': 'div.random_article img ::attr("data-original")',
            }
            article_img_name_sel = {
                'method': 'css',
                'selector': 'div.random_article img ::attr("alt")',
            }
            li_list = parse_field(
                parser=li_sel,
                target_obj=body,
                is_first=False,
            )
            res = []
            for item in li_list:
                # Explicit checks instead of `assert`: assertions are removed
                # under `python -O`, which would silently disable this
                # filtering. The except clause is kept for errors raised by
                # parse_field itself.
                try:
                    title = parse_field(
                        parser=title_sel,
                        target_obj=item,
                    )
                    if title == '':
                        continue
                    create_time = parse_field(
                        parser=create_time_sel,
                        target_obj=item,
                    )
                    if create_time == '':
                        continue
                    article_img_url_list = parse_field(
                        parser=article_img_url_sel,
                        target_obj=item,
                        is_first=False,
                    )
                    if article_img_url_list == []:
                        continue
                    article_img_name_list = parse_field(
                        parser=article_img_name_sel,
                        target_obj=item,
                        is_first=False,
                    )
                    if article_img_name_list == []:
                        continue
                except (AssertionError, IndexError):
                    continue

                # Names and urls are paired by position; zip stops at the
                # shorter of the two lists.
                article_img_list = [{
                    'img_name': img_name,
                    'img_url': img_url,
                } for img_name, img_url in zip(article_img_name_list,
                                               article_img_url_list)]

                res.append({
                    'title': title,
                    'create_time': create_time,
                    'article_img_list': article_img_list,
                })

            return res

        # NOTE(review): these headers are only consumed by the plain-requests
        # variant, which returned garbled text; the driver request below does
        # not receive them. The call is kept in case it has side effects.
        headers = self.get_random_phone_headers()
        headers.update({
            'authority': 'www.doutula.com',
            'referer': 'https://www.doutula.com/',
        })
        params = (('page', str(page_num)), )
        url = 'https://www.doutula.com/article/list/'

        # The page is fetched through a driver instead of requests.
        d = BaseDriver(ip_pool_type=tri_ip_pool, user_agent_type=PHONE)
        try:
            body = d.get_url_body(
                url=_get_url_contain_params(url=url, params=params))
        finally:
            # Drop the driver reference even when the fetch raises.
            # Presumably BaseDriver releases its browser when collected.
            del d

        res = parse_page_info(body=body)
        print('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))
        collect()

        return res
Example #4
0
    def _get_goods_data(self, goods_id):
        '''
        Fetch the detail page of one goods item and assemble the wanted data.

        :param goods_id: id of the goods; an empty string is rejected
        :return: the assembled data dict (also stored in self.result_data);
                 on any failure, whatever self._get_data_error_init() returns
        '''
        if goods_id == '':
            self.my_lg.error('获取到的goods_id为空值!此处跳过!')
            return self._get_data_error_init()

        # The mobile site is the one being crawled.
        base_url = 'http://m.you.163.com/item/detail'
        params = self._get_params(goods_id=goods_id)

        page_url = base_url + '?id={0}'.format(goods_id)
        self.my_lg.info('------>>>| 正在抓取严选地址为: {0}'.format(page_url))

        # Suffix appended to error messages so the failing item can be found.
        err_ctx = '出错goods_id:{0}, 出错地址: {1}'.format(goods_id, page_url)

        # Plain requests gets redirected endlessly, so phantomjs is used.
        full_url = _get_url_contain_params(url=base_url, params=params)
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=full_url)
        if body == '':
            self.my_lg.error('获取到的body为空值!'+err_ctx)
            return self._get_data_error_init()

        json_pattern = re.compile('var jsonData=(.*?),policyList=')
        try:
            body = json_pattern.findall(body)[0]
        except IndexError:
            self.my_lg.error('获取body时索引异常!'+err_ctx, exc_info=True)
            return self._get_data_error_init()

        body = nonstandard_json_str_handle(json_str=body)
        raw = json_2_dict(json_str=body, logger=self.my_lg)
        if raw == {}:
            self.my_lg.error('获取到的data为空dict!'+err_ctx)
            return self._get_data_error_init()

        raw = self._wash_data(raw)
        try:
            title = self._wash_sensitive_info(self._get_title(data=raw))
            sub_title = self._wash_sensitive_info(self._get_sub_title(data=raw))
            all_img_url = self._get_all_img_url(data=raw)
            p_info = self._get_p_info(data=raw)
            div_desc = self._get_div_desc(data=raw)
            sell_time = self._get_sell_time(data=raw)
            detail_name_list = self._get_detail_name_list(data=raw.get('skuSpecList', []))
            price_info_list = self._get_price_info_list(data=raw.get('skuList', []))

            goods = {
                'title': title,
                'sub_title': sub_title,
                'shop_name': '',
                'all_img_url': all_img_url,
                'p_info': p_info,
                'div_desc': div_desc,
                'sell_time': sell_time,
                'detail_name_list': detail_name_list,
                'price_info_list': price_info_list,
            }
            goods['price'], goods['taobao_price'] = self._get_price_and_taobao_price(
                price_info_list=price_info_list
            )

            # A zero price marks a sold-out item, which is flagged as deleted.
            sold_out = goods['price'] == 0 or goods['taobao_price'] == 0
            goods['is_delete'] = 1 if sold_out else self._get_is_delete(
                price_info_list=goods['price_info_list'], data=goods, other=raw)

        except Exception:
            self.my_lg.error('遇到错误:', exc_info=True)
            self.my_lg.error(err_ctx)
            return self._get_data_error_init()

        if goods == {}:
            self.my_lg.info('data为空值')
            return self._get_data_error_init()

        self.result_data = goods
        return goods
Example #5
0
    def _get_comment_data(self, type: int, goods_id):
        '''
        Collect the first three comment pages of one goods item.

        :param type: goods type, forwarded to self._get_seller_id (the name
                     shadows the builtin but is kept for caller compatibility)
        :param goods_id: id of the goods whose comments are wanted
        :return: a CommentItem holding 'goods_id', 'create_time',
                 'modify_time' and '_comment_list' (also stored in
                 self.result_data); {} when an argument is empty or any step
                 fails
        '''
        if goods_id == '' or type == '':
            self.result_data = {}
            return {}
        self.lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        # The seller id is needed first.
        try:
            seller_id = self._get_seller_id(type=type, goods_id=goods_id)
        except (AssertionError, IndexError) as e:
            # `except AssertionError or IndexError` would only catch
            # AssertionError, because the `or` is evaluated first.
            self.lg.error('出错goods_id: %s' % str(goods_id))
            self.lg.error(e.args[0] if e.args else repr(e))
            self.result_data = {}
            self.random_sku_info_list = []
            return {}

        # Then the sku info list.
        try:
            self.random_sku_info_list = self._get_random_sku_info_list()
        except Exception as e:
            self.lg.error('出错goods_id: %s' % str(goods_id))
            self.lg.exception(e)
            self.result_data = {}
            self.random_sku_info_list = []
            return {}

        rate_url = 'https://rate.tmall.com/list_detail_rate.htm'
        # The response wraps its JSON in "(...)"; raw string so that `\(` is
        # a regex escape and not an invalid string escape.
        wrapped_json = re.compile(r'\((.*)\)')

        _tmp_comment_list = []
        for current_page in range(1, 4):
            self.lg.info('------>>>| 正在抓取第 {0} 页的评论...'.format(
                str(current_page)))

            params = self._set_params(goods_id=goods_id,
                                      seller_id=seller_id,
                                      current_page=current_page)
            self.headers.update({
                'referer':
                'https://detail.m.tmall.com/item.htm?id=' + str(goods_id)
            })

            # Requests through a proxy returned no data because cookies were
            # missing, so the api is fetched with phantomjs instead.
            _url = _get_url_contain_params(url=rate_url, params=params)

            body = self.driver.use_phantomjs_to_get_url_body(url=_url)
            if body == '':
                self.lg.error('获取到的body为空str! 出错type:{0}, goods_id:{1}'.format(
                    str(type), goods_id))
                self.result_data = {}
                return {}

            data = []
            try:
                payload = wrapped_json.findall(body)[0]
            except IndexError:
                payload = None
                self.lg.error('索引异常! 出错type:{0}, goods_id:{1}'.format(
                    str(type), goods_id))

            if payload is not None:
                try:
                    # `or []` covers a JSON null rateList.
                    data = json.loads(payload).get('rateDetail', {}).get(
                        'rateList', []) or []
                except Exception:
                    # Best effort: a page that cannot be decoded contributes
                    # nothing, the other pages are still used.
                    data = []
                    self.lg.error(
                        'json.loads转换_出错! 出错type:{0}, goods_id:{1}'.format(
                            str(type), goods_id))

            _tmp_comment_list += data
            sleep(self.comment_page_switch_sleep_time)

        try:
            _comment_list = self._get_comment_list(
                _tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.lg.error('出错type:{0}, goods_id:{1}'.format(
                str(type), goods_id))
            self.lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r

        return self.result_data