def parse_product_detail(self, response):
    item = BankproductItem()
    item['bankCode'] = 'cmb'
    item['channel'] = 'app'
    item['proCode'] = re.search(
        r'Code=(\d+)',
        str(response.request.body, encoding='utf-8')).group(1)
    item['proName'] = self.__get_re_value(response.text, 'prdname: "(.*?)"', 1)
    item['proType'] = self.__get_re_value(response.text, 'prdtype: "(.*?)"', 1)
    item['incomeRateName'] = self.__get_xpath_value(
        response, "//span[@id='ctl00_cphBody_RatBre']/text()").strip()
    item['incomeRate'] = self.__get_xpath_value(
        response, "//span[@id='ctl00_cphBody_PrdRat']/text()").strip()
    item['cycleTime'] = self.__get_xpath_value(
        response, "//span[@id='ctl00_cphBody_TerDay']/text()").strip()
    item['riskLevel'] = self.__get_xpath_value(
        response, "//span[@id='ctl00_cphBody_RiskLvl2']/text()").strip()
    item['currentPurchases'] = self.__get_xpath_value(
        response, "//span[@id='ctl00_cphBody_SalAmt']/text()").strip()
    item['firstSubMinAmount'] = self.__get_xpath_value(
        response, "//span[@id='ctl00_cphBody_SbsUqt']/text()").strip()
    self.form_data3['Code'] = item['proCode']
    self.form_data3['behavior_prodcode'] = item['proCode']
    yield scrapy.FormRequest(self.rule_url,
                             method="POST",
                             meta={'item': item},
                             formdata=self.form_data3,
                             callback=self.parse_product_rules,
                             dont_filter=True)
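
# These spiders call two name-mangled private helpers, __get_re_value and
# __get_xpath_value, whose definitions are not included in this section.
# The class below is a hypothetical sketch of what they are assumed to do,
# based on how they are used above: return the requested regex group or the
# first extracted XPath value, and an empty string on no match so the chained
# .strip() calls do not fail. It is not the original implementation.
import re

import scrapy


class ExampleBankSpider(scrapy.Spider):
    """Hypothetical spider showing assumed helper implementations."""
    name = 'example_bank'

    def __get_re_value(self, text, pattern, group_index):
        # Return the requested capture group, or '' when nothing matches.
        match = re.search(pattern, text)
        return match.group(group_index) if match else ''

    def __get_xpath_value(self, selector, xpath_expr):
        # Return the first value extracted by the XPath expression, or ''.
        values = selector.xpath(xpath_expr).extract()
        return values[0] if values else ''
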
def parse(self, response):
    # Parse the individual products
    content = self.__get_response_content(response)
    # The response body contains a collection of product objects
    product_list = json.loads(content)['rows']
    for product_item in product_list:
        item = BankproductItem()
        item['bankCode'] = 'spd'
        item['channel'] = 'web'
        item['proCode'] = product_item['finance_no']
        item['proName'] = product_item['finance_allname']
        # Product type
        params = urllib.parse.unquote(
            str(response.request.body, encoding='utf-8'))
        proAttr = self.__get_re_value(params, r'product_type=(\d+)', 1)
        item['proAttr'] = proAttr.replace('0', '私行专属').replace('2', "净值类").replace('3', "固定期限") \
            .replace('4', "现金管理类") if proAttr else "汇理财"
        item['incomeRate'] = product_item['finance_anticipate_rate']
        item['riskLevel'] = product_item['finance_risklevel'].replace('A', "低风险等级").replace('B', "较低风险等级") \
            .replace('C', "中风险等级").replace('D', "较高风险等级").replace('E', "高风险等级")
        item['firstAmount'] = product_item['finance_indi_ipominamnt']
        item['nextOpenDate'] = product_item.get('finance_next_openday', "")
        item['instructionUrl'] = product_item.get('product_attr', "")
        item['status'] = product_item['finance_state']
        channelid = re.search(r'channelid=(\d+)',
                              str(response.request.body,
                                  encoding='utf-8')).group(1)
        if channelid == '266906':
            product_item_detail_url = '{}{}'.format(self.detail_url,
                                                    item['proCode'])
        elif channelid == '263468':
            product_item_detail_url = '{}{}'.format(self.detail2_url,
                                                    item['proCode'])
        yield scrapy.Request(product_item_detail_url,
                             meta={'item': item},
                             callback=self.parse_product_detail,
                             dont_filter=True)
    # Check whether there is a next page of data
    exist_data = re.search(r'"rows":\[([\S\s]+)\]', response.text)
    if exist_data:
        current_page = int(
            re.search(r'page=(\d+)',
                      str(response.request.body,
                          encoding='utf-8')).group(1)) + 1
        current_form_data = self.fromData2Dict(
            str(response.request.body, encoding='utf-8'))
        current_form_data['page'] = str(current_page)
        yield scrapy.FormRequest(self.start_url,
                                 method="POST",
                                 formdata=current_form_data,
                                 dont_filter=True)
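
# The spd parse() above also relies on fromData2Dict (rebuilds a form dict
# from the URL-encoded request body so the next page can be re-posted) and
# __get_response_content (returns the response payload as text). Neither is
# defined in this section; the class below is a sketch under those
# assumptions, with the plain response.text passthrough being the
# speculative part.
from urllib.parse import parse_qsl


class SpdSpiderHelpers:
    """Sketch of helpers assumed by the spd parse() method above."""

    def fromData2Dict(self, body):
        # Turn "page=1&channelid=266906&..." back into {'page': '1', ...}
        # so it can be resubmitted via scrapy.FormRequest.
        return dict(parse_qsl(body, keep_blank_values=True))

    def __get_response_content(self, response):
        # Assumed to return the JSON payload as text; here simply the
        # decoded body, stripped of surrounding whitespace.
        return response.text.strip()
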
def parse_organization(self, response):
    product_attr = response.xpath(
        "//div[@id='content']//div[@class='top']").css(
            "p>a:last-child::text").extract()[0]
    product_ul = response.xpath(
        "//div[@id='content']//div[@class='middle']/ul")
    for product_item in product_ul.xpath("li"):
        item = BankproductItem()
        item['bankCode'] = 'cib'
        item['channel'] = 'web'
        item['proAttr'] = product_attr
        item['proName'] = self.__get_xpath_value(product_item, "a/text()")
        time_str = self.__get_xpath_value(product_item,
                                          "span[@class='time']/text()")
        year = datetime.strptime(time_str, '%Y-%m-%d').year
        # Only follow entries published in the current year
        if year == datetime.now().year:
            next_page_url = "{}{}".format(
                self.base_url,
                self.__get_xpath_value(product_item, "a/@href").strip())
            yield scrapy.Request(next_page_url,
                                 meta={'item': item},
                                 callback=self.product_announcement,
                                 dont_filter=True)
        else:
            yield item
def parse(self, response):
    selector = Selector(response)
    for product_item in selector.xpath(
            "//*[@id='content']//li[@name='pageli']"):
        item = BankproductItem()
        item['bankCode'] = 'hxb'
        item['channel'] = 'web'
        item['proName'] = self.__get_xpath_value(
            product_item,
            "div[@class='pro_box']/p[@class='box_title']/a/text()").strip()
        item['cycleTime'] = self.__get_xpath_value(
            product_item,
            "div[@class='pro_box']/ul//span[@class='highlight']/text()"
        ).strip()
        title = self.__get_xpath_value(
            product_item,
            "div[@class='pro_box']/div[@class='box_lf']/p[2]/text()").strip()
        if title == '预期最高年化收益率':
            item['incomeRateName'] = title
            item['incomeRate'] = self.__get_xpath_value(
                product_item,
                "div[@class='pro_box']//p[@class='box_num']/text()").strip()
        else:
            item['proNetValue'] = self.__get_xpath_value(
                product_item,
                "div[@class='pro_box']//p[@class='box_num']/text()").strip()
        item['startDate'] = re.search(
            '(.*?)至',
            self.__get_xpath_value(
                product_item,
                "div[@class='pro_box']//span[text()='发售日期']/../span[2]/text()"
            )).group(1)
        item['endDate'] = re.search(
            '至(.*)',
            self.__get_xpath_value(
                product_item,
                "div[@class='pro_box']//span[text()='发售日期']/../span[2]/text()"
            )).group(1)
        item['firstSubMinAmount'] = self.__get_xpath_value(
            product_item,
            "div[@class='pro_box']//span[@class='amt']/text()"
        ).strip() + self.__get_xpath_value(
            product_item,
            "div[@class='pro_box']//span[@class='amt']/following-sibling::text()"
        ).strip()
        sellChannel = self.__get_xpath_value(
            product_item,
            "div[@class='pro_box']//span[text()='购买渠道']/../span[2]/text()"
        ).strip()
        # Strip invisible characters (CR, LF, tabs, spaces)
        item['sellChannel'] = re.sub(r'[\r\n\t\s]', '', sellChannel)
        yield item
def parse_private(self, response):
    selector = Selector(response)
    # Iterate over the product types
    for product_type in selector.xpath(
            "//*[@id='content']//div[@class='middle']/p[not(@align)]"):
        # The table that follows each type heading is its sibling node
        table_selector = product_type.xpath("following-sibling::table[1]")
        title_num = len(table_selector.xpath("tbody/tr[1]").css("td"))
        # Iterate over every row except the header row
        for index, product_item in enumerate(
                table_selector.xpath(
                    "tbody/tr[not(contains(td,'产品名称'))]")):
            child_num = len(product_item.xpath("td"))
            item = BankproductItem()
            item['bankCode'] = 'cib'
            item['channel'] = 'web'
            item['proType'] = self.__get_xpath_value(
                product_type, "strong/text()").strip()[2:]
            # Extract product name and code (XPath turns a <br/> into two text nodes)
            name_code = product_item.xpath("td[1]/text()").extract()
            if len(name_code) == 1:
                matches = re.findall(r"[\((][^\((]+[\))]",
                                     name_code[0].strip())
                # Several bracketed segments: take the last one as the code
                if len(matches) > 1:
                    product_code = matches[-1]
                else:
                    product_code = re.search(r"[\((][^\((]+[\))]",
                                             name_code[0].strip()).group(0)
            else:
                product_code = name_code[1].strip()[1:-1]
            item['proCode'] = product_code
            item['proName'] = name_code[0][0:name_code[0].find(str(item['proCode'])) - 1] \
                if len(name_code) == 1 else name_code[0].strip()
            # Does the row contain merged cells?
            if child_num == title_num:
                item['cycleTime'] = self.__get_xpath_value(
                    product_item, "td[3]/text()").strip()
                item['nextIncomeRate'] = self.__get_xpath_value(
                    product_item, "td[4]/text()").strip()
            else:
                item['cycleTime'] = self.__get_xpath_value(
                    product_item, "td[2]/text()").strip()
                item['nextIncomeRate'] = self.__get_xpath_value(
                    product_item, "td[3]/text()").strip()
            yield item
def parse(self, response):
    productItems = json.loads(response.text)['respData']['list']
    for productItem in productItems:
        item = BankproductItem()
        item['bankCode'] = 'cmbc'
        item['channel'] = 'app'
        item['proCode'] = productItem['PRD_CODE'].strip()
        item['proName'] = productItem['PRD_NAME'].strip()
        item['proAttr'] = productItem['PRD_ATTR'].strip()
        # PRD_TYPE (0: daily, 1: scheduled open, 2: closed, 3: income type,
        # 4: NAV cyclical, 5: demand)
        item['proType'] = productItem['PRD_TYPE_NAME'].strip()
        item['incomeRateName'] = productItem['INCOME_TYPE']
        item['incomeRate'] = productItem['INCOME_RATE']
        item['nextIncomeRate'] = productItem['NEXT_INCOME_RATE']
        item['proNetValue'] = productItem['NAV']
        item['openDate'] = productItem['START_DATE'].strip()
        item['realEndDate'] = productItem['REALEND_DATE'].strip()
        item['cycleTime'] = productItem['LIV_TIME_UNIT_NAME'].strip()
        item['firstAmount'] = productItem['FIRST_AMT']
        # CURR_TYPE (156: CNY, 840: USD)
        item['currency'] = productItem['CURR_TYPE_NAME'].strip()
        self.request_data2['prdCode'] = item['proCode']
        yield scrapy.Request(self.start_url,
                             method="POST",
                             body=json.dumps(self.request_data2),
                             meta={'item': item},
                             headers={'Content-Type': 'application/json'},
                             callback=self.parse_product_detail,
                             dont_filter=True)
    # Check whether there is a next page of data
    exist_data = re.search('"PRD_CODE":"(.*?)"', response.text)
    if exist_data:
        current_startId = int(
            self.__get_re_value(
                str(response.request.body, encoding='utf-8'),
                '"startId": "(.*?)"', 1))
        pagesize = int(
            self.__get_re_value(
                str(response.request.body, encoding='utf-8'),
                r'"pageSize": "(\d+)"', 1))
        next_startId = current_startId + pagesize
        self.request_data['startId'] = str(next_startId)
        yield scrapy.Request(self.start_url,
                             method="POST",
                             body=json.dumps(self.request_data),
                             headers={'Content-Type': 'application/json'},
                             dont_filter=True)
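
# The cmbc app spider pages by re-posting self.request_data with an advanced
# startId, and requests details by filling prdCode into self.request_data2.
# Only those keys are visible in the code above, so the payloads below are
# illustrative assumptions, not the bank's actual API contract; any other
# fields the real endpoint expects are unknown here.
request_data = {
    "startId": "0",    # offset of the first row of the current page
    "pageSize": "20",  # rows per page; added to startId for the next page
}
request_data2 = {
    "prdCode": "",     # filled in per product before requesting its detail
}
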
def parse_retail_cash(self, response):
    table = response.xpath(
        "//*[@id='content']//div[@class='middle']/table")
    proAttr = self.__get_xpath_value(
        response,
        "//*[@id='content']//div[@class='middle']/h1/text()").strip()
    for product_item in table.xpath("tbody/tr[not(contains(td,'产品名称'))]"):
        item = BankproductItem()
        item['bankCode'] = 'cib'
        item['channel'] = 'web'
        item['proAttr'] = proAttr
        item['proName'] = self.__get_xpath_value(
            product_item, "td[1]/strong/a/text()|td[1]/a/text()").strip()
        item['sellArea'] = self.__get_xpath_value(
            product_item, "td[2]/text()|td[2]/strong/text()").strip()
        item['currency'] = self.__get_xpath_value(
            product_item, "td[3]/text()|td[3]/strong/text()").strip()
        item['cycleTime'] = self.__get_xpath_value(
            product_item, "td[4]/text()|td[4]/strong/text()").strip()
        item['proType'] = self.__get_xpath_value(
            product_item, "td[5]/text()|td[5]/strong/text()").strip()
        item['firstAmount'] = self.__get_xpath_value(
            product_item, "td[6]/text()|td[6]/strong/text()").strip()
        item['incomeRateName'] = self.__get_xpath_value(
            table,
            "tbody/tr[1]/td[7]/strong/text()|tbody/tr[1]/td[7]/text()"
        ).strip()
        item['incomeRate'] = self.__get_xpath_value(
            product_item, "td[7]/text()|td[7]/strong/text()").strip()
        product_pic = product_item.xpath("td[8]/img/@src")
        item['proCode'] = re.search(
            'lccp(.*?).png',
            product_pic.extract()[0]).group(1) if product_pic else ''
        # Follow the product link when the name cell contains one
        href_num = len(
            product_item.xpath(
                'td[1]/strong/a/@href|td[1]/a/@href').extract())
        if href_num:
            next_page_url = self.__get_xpath_value(
                product_item, "td[1]/strong/a/@href|td[1]/a/@href")
            yield scrapy.Request(next_page_url,
                                 meta={'item': item},
                                 callback=self.product_announcement,
                                 dont_filter=True)
        else:
            yield item
def parse(self, response):
    # Parse each product entry
    # for product_item in re.findall('lccpsj/(.*?)/index', response.text):
    for product_item in response.xpath(
            "//div[@class='lccp_main_content_tx']/ul/li"):
        item = BankproductItem()
        item['bankCode'] = 'ceb'
        item['channel'] = 'web'
        item['incomeRate'] = self.__get_xpath_value(
            product_item,
            "p[@class='lccp_syl']/span[@class='lccp_ll fc_box']/text()"
        ).strip()
        product_item_url = "{}{}".format(
            self.base_url,
            self.__get_xpath_value(product_item, "a/@href").strip())
        yield scrapy.Request(product_item_url,
                             callback=self.parse_product_detail,
                             meta={'item': item},
                             dont_filter=True)
    # Check whether there is a next page of data
    exist_data = re.search('cpmc-(.*?)', response.text)
    if exist_data:
        # response.request.body holds the POST body of the request that
        # produced this response
        page_index = int(
            re.search(r'page=(\d+)',
                      str(response.request.body,
                          encoding='utf-8')).group(1)) + 1
        self.form_data['page'] = str(page_index)
        yield scrapy.FormRequest(self.start_url,
                                 method='POST',
                                 formdata=self.form_data,
                                 dont_filter=True)
def parse_retail_index(self, response):
    selector = Selector(response)
    for product_item in selector.xpath("//tbody/tr"):
        item = BankproductItem()
        item['bankCode'] = 'cib'
        item['channel'] = 'web'
        item['proCode'] = re.search(
            'lccp(.*?).png',
            product_item.xpath('td[9]/img/@src').extract()[0]).group(1)
        item['proAttr'] = '零售理财'
        # The name cell may or may not be wrapped in a hyperlink
        proName = re.search(r'<a[\S\s]*>(.*?)</a>',
                            product_item.xpath('td[1]').extract()[0])
        item['proName'] = proName.group(1) if proName is not None \
            else product_item.xpath('td[1]/text()').extract()[0]
        item['incomeRate'] = product_item.xpath(
            'td[7]/text()').extract()[0].strip()
        item['currency'] = product_item.xpath(
            'td[4]/text()').extract()[0].strip()
        item['startDate'] = product_item.xpath(
            'td[2]/text()').extract()[0].strip()
        item['endDate'] = product_item.xpath(
            'td[3]/text()').extract()[0].strip()
        # Reference net yield for large-amount clients
        # (stored in next_income_rate at the client's request)
        item['nextIncomeRate'] = product_item.xpath(
            'td[8]/text()').extract()[0].strip()
        # Follow the detail page when the name cell contains a link
        href_num = len(product_item.xpath('td[1]/a/@href').extract())
        if href_num > 0:
            next_page_url = "{}{}".format(
                self.base_url,
                product_item.xpath('td[1]/a/@href').extract()[0])
            yield scrapy.Request(next_page_url,
                                 meta={'item': item},
                                 callback=self.parse_product_detail,
                                 dont_filter=True)
        else:
            yield item
def parse_product_detail(self, response):
    url = response.url
    content = self.__get_response_content(response)
    item = BankproductItem()
    item['bankCode'] = 'cmbc'
    item['channel'] = 'web'
    product = json.loads(content)['returnData']
    item['proCode'] = product['PRD_CODE'].strip()
    item['proName'] = product['PRD_NAME'].strip()
    item['proAttr'] = product['PRD_ATTR_NAME'].strip()
    item['proType'] = product['PRD_TYPE_NAME'].strip()
    item['sellObject'] = product['SELLDIR'].strip()
    item['status'] = product['STATUS_NAME'].strip()
    item['currency'] = product['CURR_TYPE_NAME'].strip()
    item['crFlag'] = product['CRFLAGNAME'].strip()
    item['startDate'] = product['IPO_START_DATE'].strip()
    item['endDate'] = product['IPO_END_DATE'].strip()
    item['openDate'] = product['START_DATE'].strip()
    item['nextOpenDate'] = product['PRD_NEXT_DATE'].strip()
    item['nextEndDate'] = product['EDDATE'].strip()
    item['realEndDate'] = product['REALEND_DATE'].strip()
    item['cycleTime'] = product['LIV_TIME_UNIT_NAME'].strip()
    item['incomeRate'] = product['INCOME_RATE'].strip()
    item['nextIncomeRate'] = product['Next_Income_Rate'].strip()
    item['interestType'] = product['INTEREST_TYPE_NAME'].strip()
    item['riskLevel'] = product['RISK_LEVEL_NAME'].strip()
    item['openTime'] = product['OPEN_TIME'].strip()
    item['closeTime'] = product['CLOSE_TIME'].strip()
    item['firstSubMinAmount'] = product['PFIRST_AMT'].strip()
    item['minRedBalance'] = product['PRED_UNIT'].strip()
    item['minSubUnit'] = product['PSUB_UNIT'].strip()
    item['maxSingleSub'] = product['PMAX_AMT'].strip()
    item['maxSingleRed'] = product['PMAX_RED'].strip()
    item['maxOneDaySub'] = product['PDAY_MAX'].strip()
    item['plainHold'] = product['PMIN_HOLD'].strip()
    item['proNetValue'] = product['NAV'].strip()
    yield item
def parse_product_detail(self, response):
    item = BankproductItem()
    item['bankCode'] = 'czb'
    item['channel'] = 'web'
    product_name = self.__get_xpath_value(
        response, "div[contains(@class,'nameLC')]/h3/text()").strip()
    item['proCode'] = re.search('型(.*)', product_name).group(1) \
        if re.search('型(.*)', product_name) else product_name
    item['proName'] = product_name
    firstAmount = self.__get_xpath_value(
        response, "div[contains(@class,'nameLC')]/p/text()").strip()
    item['firstAmount'] = re.search('(.*)起购', firstAmount).group(1)
    item['incomeRateName'] = self.__get_xpath_value(
        response,
        "div[contains(@class,'num_det')]/div[@class='fl_num']/p[@class='num_txt']/text()"
    )
    item['incomeRate'] = self.__get_xpath_value(
        response,
        "div[contains(@class,'num_det')]/div[@class='fl_num']/p[1]/text()"
    ).strip()
    item['currency'] = self.__get_xpath_value(
        response,
        "div[contains(@class,'num_det')]/div[@class='mid_date' and contains(p[@class='num_txt'],'币种')]/p[1]/text()"
    ).strip()
    item['cycleTime'] = self.__get_xpath_value(
        response,
        "div[contains(@class,'num_det')]/div[@class='mid_date' and contains(p[@class='num_txt'],'理财期限')]/p[1]/text()"
    ).strip()
    item['endDate'] = self.__get_xpath_value(
        response,
        "div[contains(@class,'num_det')]/div[@class='fr_date' and contains(p[@class='num_txt'],'认购截止日')]/p[1]/text()"
    ).strip()
    item['openTime'] = self.__get_xpath_value(
        response,
        "div[contains(@class,'num_det')]/div[@class='fr_date' and contains(p[@class='num_txt'],'申购时间')]/p[1]/text()"
    ).strip()
    return item
def parse_retail_zyb(self, response):
    table = response.xpath(
        "//*[@id='content']//div[@class='middle']/table")
    proAttr = self.__get_xpath_value(
        response,
        "//*[@id='content']//div[@class='middle']/h1/text()").strip()
    for product_item in table.xpath("tbody/tr[not(contains(td,'产品名称'))]"):
        item = BankproductItem()
        item['bankCode'] = 'cib'
        item['channel'] = 'web'
        item['proAttr'] = proAttr
        item['proName'] = self.__get_xpath_value(product_item,
                                                 "td[1]/text()").strip()
        item['sellArea'] = self.__get_xpath_value(product_item,
                                                  "td[2]/text()").strip()
        item['proType'] = self.__get_xpath_value(product_item,
                                                 "td[3]/text()").strip()
        item['firstAmount'] = self.__get_xpath_value(
            product_item, "td[4]/text()").strip()
        # Fund-raising period; the end part omits the year,
        # so it is re-attached from startDate below
        raise_date = self.__get_xpath_value(product_item,
                                            "td[5]/text()").strip()
        item['startDate'] = self.__get_re_value(raise_date, "(.*)-(.*)",
                                                1).strip()
        year = datetime.strptime(item['startDate'], '%Y年%m月%d日').year
        item['endDate'] = str(year) + "年" + self.__get_re_value(
            raise_date, "(.*)-(.*)", 2).strip()
        item['incomeRateName'] = self.__get_xpath_value(
            table, "tbody/tr[1]/td[6]/text()").strip()
        item['incomeRate'] = self.__get_xpath_value(
            product_item, "td[6]/text()").strip()
        product_pic = product_item.xpath("td[9]/img/@src")
        item['proCode'] = re.search(
            'lccp(.*?).png',
            product_pic.extract()[0]).group(1) if product_pic else ''
        yield item
def parse_product_detail(self, response):
    selector = Selector(response)
    item = BankproductItem()
    item['bankCode'] = 'hfb'
    item['channel'] = 'web'
    item['proCode'] = self.__get_xpath_value(
        response, "//table[@class='con2']/tbody/tr[1]/td[2]/text()").strip()
    item['proName'] = self.__get_xpath_value(
        response, "//table[@class='con2']/tbody/tr[1]/td[4]/text()").strip()
    item['openDate'] = self.__get_xpath_value(
        response, "//table[@class='con2']/tbody/tr[2]/td[2]/text()").strip()
    item['realEndDate'] = self.__get_xpath_value(
        response, "//table[@class='con2']/tbody/tr[2]/td[4]/text()").strip()
    item['currency'] = self.__get_xpath_value(
        response, "//table[@class='con2']/tbody/tr[3]/td[2]/text()").strip()
    item['riskLevel'] = self.__get_xpath_value(
        response, "//table[@class='con2']/tbody/tr[3]/td[4]/text()").strip()
    item['cycleTime'] = self.__get_xpath_value(
        response, "//table[@class='con2']/tbody/tr[5]/td[2]/text()").strip()
    item['endDate'] = self.__get_xpath_value(
        response, "//table[@class='con2']/tbody/tr[5]/td[4]/text()").strip()
    sellChannel = self.__get_xpath_value(
        response, "//table[@class='con2']/tbody/tr[6]/td[2]/text()").strip()
    item['sellChannel'] = re.sub(r'[\r\n\t\s]', '', sellChannel)
    item['incomeRateName'] = self.__get_xpath_value(
        response,
        "//div[@class='con1 of']/table/tbody/tr[1]/td[1]/text()").strip()
    item['incomeRate'] = self.__get_xpath_value(
        response,
        "//div[@class='con1 of']/table/tbody/tr[1]/td[2]/p/text()").strip()
    item['firstAmount'] = self.__get_xpath_value(
        response, "//table[@class='con2']/tbody/tr[4]/td[2]/text()").strip()
    minSubUnit = self.__get_xpath_value(
        response, "//table[@class='con2']/tbody/tr[4]/td[4]/text()").strip()
    item['minSubUnit'] = re.search(r'((\d)+.(\d)+)', minSubUnit).group(0)
    # Download URL of the product prospectus
    instructionUrl = self.__get_xpath_value(
        response, "//a[@class='download' and contains(@href,'说明书')]/@href")
    if instructionUrl:
        strs = self.start_url.split('/')
        index = self.start_url.find(strs[len(strs) - 1], 0)
        item['instructionUrl'] = '{}{}'.format(self.start_url[0:index],
                                               instructionUrl)
    # Download URL of the risk disclosure statement
    riskDisclosureUrl = self.__get_xpath_value(
        response, "//a[@class='download' and contains(@href,'风险')]/@href")
    if riskDisclosureUrl:
        strs = self.start_url.split('/')
        index = self.start_url.find(strs[len(strs) - 1], 0)
        item['riskDisclosureUrl'] = '{}{}'.format(self.start_url[0:index],
                                                  riskDisclosureUrl)
    yield item
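
# The prefix computation above (splitting self.start_url and re-finding its
# last path segment) effectively resolves a relative download href against
# the listing page's directory. A possible simplification, assuming the
# hrefs really are plain relative paths, is urllib.parse.urljoin; the helper
# below is a sketch, not part of the original spider.
from urllib.parse import urljoin


def resolve_download_url(page_url, href):
    # Equivalent to the manual prefix slicing for simple relative hrefs
    # (e.g. a bare filename or 'subdir/file.pdf'); urljoin additionally
    # handles leading '/' and '../' forms, which the manual version does not.
    return urljoin(page_url, href)
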
def parse(self, response):
    # Parse each product row
    for product_item in response.xpath(
            "//*[@id='product_tab']//tr[@class='bg2']"):
        item = BankproductItem()
        item['bankCode'] = 'cgb'
        item['channel'] = 'web'
        item['proName'] = self.__get_xpath_value(
            product_item, "td[@class='name']/a/text()").strip()
        item_url = self.__get_xpath_value(
            product_item, "td[@class='name']/a/@href").strip()
        item['proCode'] = self.__get_re_value(item_url, 'productno=(.*)', 1)
        item['currency'] = self.__get_xpath_value(product_item,
                                                  "td[2]/text()").strip()
        item['cycleTime'] = self.__get_xpath_value(
            product_item, "td[3]/text()").strip().replace('　', '')
        item['firstAmount'] = self.__get_xpath_value(
            product_item, "td[4]/text()").strip()
        item['incomeRateName'] = self.__get_xpath_value(
            response, "//*[@id='product_tab']/tr[1]/th[5]/text()").strip()
        item['incomeRate'] = self.__get_xpath_value(
            product_item, "td[5]/b/text()").strip()
        item['riskLevel'] = self.__get_xpath_value(product_item,
                                                   "td[6]/text()").strip()
        recruitment_period = self.__get_xpath_value(
            product_item, "td[7]/text()").strip()
        if recruitment_period != '长   期':
            item['startDate'] = self.__get_re_value(recruitment_period,
                                                    "(.*?)至", 1)
            item['endDate'] = self.__get_re_value(recruitment_period,
                                                  "至(.*)", 1)
        product_item_url = "{}{}".format(self.base_url, item_url)
        yield scrapy.Request(product_item_url,
                             meta={'item': item},
                             callback=self.parse_product_detail,
                             dont_filter=True)
    # Check whether there is a next page of data
    exist_data = response.xpath(
        "//*[@id='product_tab']//tr[@class='bg2']/td[@class='name']/a/@href")
    if exist_data:
        # Current page number
        currPage = int(
            re.search(r'currPage=(\d+)',
                      str(response.request.body,
                          encoding='utf-8')).group(1)) + 1
        # Rows per page
        pageSize = int(
            re.search(r'rowsPerpage=(\d+)',
                      str(response.request.body,
                          encoding='utf-8')).group(1))
        # Offset of the first row on the next page
        turnPageBeginPos = int(
            re.search(r'turnPageBeginPos=(\d+)',
                      str(response.request.body,
                          encoding='utf-8')).group(1)) + pageSize
        self.form_data['currPage'] = str(currPage)
        self.form_data['turnPageBeginPos'] = str(turnPageBeginPos)
        yield scrapy.FormRequest(self.start_url,
                                 method="POST",
                                 formdata=self.form_data,
                                 dont_filter=True)
def parse_retail_open(self, response):
    selector = Selector(response)
    # Iterate over the product types
    for product_type in selector.xpath(
            "//*[@id='content']//div[@class='middle']/p"):
        # The table that follows each type heading is its sibling node
        table_selector = product_type.xpath("following-sibling::table[1]")
        title_num = len(table_selector.xpath("tbody/tr[1]").css("td"))
        proAttr = self.__get_xpath_value(product_type,
                                         "strong/text()").strip()
        # Iterate over every row except the header row
        for index, product_item in enumerate(
                table_selector.xpath(
                    "tbody/tr[not(contains(td,'产品名称'))]")):
            item = BankproductItem()
            item['bankCode'] = 'cib'
            item['channel'] = 'web'
            item['proAttr'] = proAttr[0:proAttr.find('产品')]
            if title_num == 10:
                item['proName'] = self.__get_xpath_value(
                    product_item, "td[1]/text()").strip()
                item['startDate'] = self.__get_xpath_value(
                    product_item, "td[2]/text()").strip()
                item['endDate'] = self.__get_xpath_value(
                    product_item, "td[3]/text()").strip()
                item['sellArea'] = self.__get_xpath_value(
                    product_item, "td[4]/text()").strip()
                item['currency'] = self.__get_xpath_value(
                    product_item, "td[5]/text()").strip()
                item['cycleTime'] = self.__get_xpath_value(
                    product_item, "td[6]/text()").strip()
                item['proType'] = self.__get_xpath_value(
                    product_item, "td[7]/text()").strip()
                item['firstAmount'] = self.__get_xpath_value(
                    product_item, "td[8]/text()").strip()
                item['incomeRateName'] = '业绩比较基准'
                # Several XPath paths can be combined with "|"
                item['incomeRate'] = self.__get_xpath_value(
                    product_item, "td[9]/strong/text()|td[9]/text()").strip()
                product_pic = product_item.xpath('td[10]/img/@src')
                item['proCode'] = re.search(
                    'lccp(.*?).png',
                    product_pic.extract()[0]).group(1) if product_pic else ''
            else:
                item['proName'] = self.__get_xpath_value(
                    product_item, "td[1]/text()").strip()
                item['openTime'] = self.__get_xpath_value(
                    product_item, "td[2]/text()").strip()
                item['sellArea'] = self.__get_xpath_value(
                    product_item, "td[3]/text()").strip()
                item['currency'] = self.__get_xpath_value(
                    product_item, "td[4]/text()").strip()
                item['cycleTime'] = self.__get_xpath_value(
                    product_item, "td[5]/text()").strip()
                item['proType'] = self.__get_xpath_value(
                    product_item, "td[6]/text()").strip()
                item['firstAmount'] = self.__get_xpath_value(
                    product_item, "td[7]/text()").strip()
                item['incomeRateName'] = '业绩比较基准'
                item['incomeRate'] = self.__get_xpath_value(
                    product_item, "td[8]/strong/text()|td[8]/text()").strip()
                product_pic = product_item.xpath('td[9]/img/@src')
                item['proCode'] = re.search(
                    'lccp(.*?).png',
                    product_pic.extract()[0]).group(1) if product_pic else ''
            yield item
def parse_product_detail(self, response):
    item = BankproductItem()
    item['bankCode'] = 'cmb'
    item['channel'] = 'web'
    proCode = self.__get_xpath_value(
        response, "//li[contains(text(),'产品代码')]/span/text()").strip()
    item['proCode'] = proCode
    item['proName'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '产品简称')]/following-sibling::td[1]/text()"
    ).strip()
    item['proAttr'] = self.__get_xpath_value(
        response, "//li[contains(text(),'产品类别')]/span/text()").strip()
    item['proType'] = self.__get_xpath_value(
        response, "//li[contains(text(),'投资类型')]/span/text()").strip()
    item['currency'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '币种')]/following-sibling::td[1]/text()"
    ).strip()
    item['riskLevel'] = self.__get_xpath_value(
        response, "//li[contains(text(),'风险评级')]/span/text()").strip()
    item['startDate'] = self.__get_xpath_value(
        response, "//li[contains(text(),'发售起始日期')]/span/text()").strip()
    item['endDate'] = self.__get_xpath_value(
        response, "//li[contains(text(),'发售截止日期')]/span/text()").strip()
    item['realEndDate'] = self.__get_xpath_value(
        response, "//li[contains(text(),'产品到期日')]/span/text()").strip()
    item['sellChannel'] = self.__get_xpath_value(
        response, "//li[contains(text(),'销售渠道')]/span/text()").strip()
    # Fields not extracted on this page: sellObject, status, crFlag,
    # cycleTime, incomeRateName, incomeRate, nextIncomeRate, interestType,
    # redRule, buyRule, openDate, nextOpenDate, nextEndDate, openTime,
    # closeTime, proManager, sellArea, currentPurchases, maxOneDaySub,
    # plainHold, proNetValue, allowedResRed, allowedRelRed.
    item['firstAmount'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '认购价格')]/following-sibling::td[1]/text()"
    ).strip()
    item['firstSubMinAmount'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '首次认购下限')]/following-sibling::td[1]/text()"
    ).strip()
    item['minPurBalance'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '最低申购余额')]/following-sibling::td[1]/text()"
    ).strip()
    item['minRedBalance'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '最低赎回余额')]/following-sibling::td[1]/text()"
    ).strip()
    item['minSubUnit'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '认购基数')]/following-sibling::td[1]/text()"
    ).strip()
    item['minPurUnit'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '申购基数')]/following-sibling::td[1]/text()"
    ).strip()
    item['minRedUnit'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '赎回基数')]/following-sibling::td[1]/text()"
    ).strip()
    item['maxSingleSub'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '认购单笔上限')]/following-sibling::td[1]/text()"
    ).strip()
    item['maxSinglePur'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '申购单笔上限')]/following-sibling::td[1]/text()"
    ).strip()
    item['maxSingleRed'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '赎回单笔上限')]/following-sibling::td[1]/text()"
    ).strip()
    item['minSingleSub'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '认购单笔下限')]/following-sibling::td[1]/text()"
    ).strip()
    item['minSinglePur'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '申购单笔下限')]/following-sibling::td[1]/text()"
    ).strip()
    item['minSingleRed'] = self.__get_xpath_value(
        response,
        "//td[contains(text(), '赎回单笔下限')]/following-sibling::td[1]/text()"
    ).strip()
    # Build the URLs of the per-product sub-pages; only the "type" query
    # parameter differs between them.
    detail_base = ("http://www.cmbchina.com/cfweb/Personal/"
                   "productdetail.aspx?code=")
    item['overviewUrl'] = '{}{}{}'.format(detail_base, proCode,
                                          "&type=prodintro")
    item['infoUrl'] = '{}{}{}'.format(detail_base, proCode, "&type=prodinfo")
    item['noticeUrl'] = '{}{}{}'.format(detail_base, proCode,
                                        "&type=prodnotice")
    item['netWorthUrl'] = '{}{}{}'.format(detail_base, proCode,
                                          "&type=prodvalue")
    item['reportUrl'] = '{}{}{}'.format(detail_base, proCode, "&type=prodir")
    item['commentUrl'] = '{}{}{}'.format(detail_base, proCode,
                                         "&type=prodcomment")
    item['instructionUrl'] = '{}{}{}'.format(detail_base, proCode,
                                             "&type=prodexplain")
    item['riskDisclosureUrl'] = '{}{}{}'.format(detail_base, proCode,
                                                "&type=prodrisk")
    # The *DownloadUrl counterparts of these fields are not populated here.
    yield scrapy.Request(item['instructionUrl'],
                         meta={'item': item},
                         callback=self.parse_product_detail_2,
                         dont_filter=True)
def parse_product_detail(self, response):
    item = BankproductItem()
    item['bankCode'] = 'citic'
    item['channel'] = 'web'
    item['proCode'] = self.__get_xpath_value(
        response, "//td[text()='产品代码']/../td[2]/text()").strip()
    item['proName'] = self.__get_re_value(
        response.text, "<div class=\"title_l\">(.*?)<span>", 1).strip()
    cycleTime = self.__get_xpath_value(
        response, "//td[text()='产品期限']/../td[2]/script/text()").strip()
    item['cycleTime'] = self.__get_re_value(cycleTime, "'(.*?)'", 1) + '天'
    item['firstAmount'] = self.__get_xpath_value(
        response, "//td[text()='购买起点']/../td[2]/@startpoint").strip()
    # Currency
    currency_flag = self.__get_xpath_value(
        response, "//span[text()='币种']/../span[2]/@curr_type")
    if currency_flag == '001':
        item['currency'] = "人民币"
    elif currency_flag == '014':
        item['currency'] = "美元"
    # Risk level
    riskLevel_flag = self.__get_xpath_value(
        response, "//span[text()='风险等级']/../span[2]/@risklevel")
    risk_levels = {
        '0': '无风险',
        '1': '低风险',
        '2': '较低风险',
        '3': '中等风险',
        '4': '较高风险',
        '5': '高风险',
    }
    if riskLevel_flag in risk_levels:
        item['riskLevel'] = risk_levels[riskLevel_flag]
    # Product status
    status_flag = self.__get_xpath_value(
        response, "//span[text()='产品状态']/../span[2]/@prod_state")
    statuses = {'0': '开放期', '1': '募集期', '3': '发行失败', '4': '停止交易'}
    if status_flag in statuses:
        item['status'] = statuses[status_flag]
    # Managing institution
    proManager_code = self.__get_xpath_value(
        response, "//span[text()='管理机构']/../span[2]/@prdmanager")
    if proManager_code == '008':
        item['proManager'] = '中信银行'
    item['openDate'] = self.__get_xpath_value(
        response, "//span[text()='起息日']/../span[2]/text()").strip()
    item['realEndDate'] = self.__get_xpath_value(
        response, "//span[text()='到期日']/../span[2]/text()").strip()
    item['nextOpenDate'] = self.__get_xpath_value(
        response, "//span[text()='下一开放日']/../span[2]/text()").strip()
    # Target customer groups
    sellObject = self.__get_xpath_value(
        response, "//span[text()='产品面向客户群']/../span[2]/script/text()")
    sellObject = self.__get_re_value(sellObject, '"(.*?)"', 1)
    item['sellObject'] = sellObject.replace('0', "个人普通客户 ").replace('1', "个人金卡客户 ") \
        .replace('2', "个人白金客户 ").replace('4', "个人钻石客户")
    # Sales area
    item['sellArea'] = self.__get_xpath_value(
        response, "//span[text()='销售区域']/../span[2]/text()").strip()
    item['incomeRate'] = self.__get_xpath_value(
        response,
        "//div[@class='col-lg-4 col-md-4 col-sm-4 lc_text_m']/div/span/@finagains")
    proNetValue = self.__get_xpath_value(
        response, "//span[text()='产品净值']/../span[2]/script/text()")
    item['proNetValue'] = self.__get_re_value(proNetValue, '"(.*?)"', 1)

    def amount_of(label):
        # The amount fields are embedded in inline <script> blocks;
        # default to '0.00' when the field is missing or empty.
        raw = self.__get_xpath_value(
            response,
            "//span[text()='{}']/../span[2]/script/text()".format(label))
        value = self.__get_re_value(raw, '"(.*?)"', 1)
        return value if value != '' else '0.00'

    item['maxSingleRed'] = amount_of('赎回单笔上限')
    item['minSingleRed'] = amount_of('赎回单笔下限')
    item['maxSingleSub'] = amount_of('认购单笔上限')
    item['minSingleSub'] = amount_of('认购单笔下限')
    item['maxSinglePur'] = amount_of('申购单笔上限')
    item['minSinglePur'] = amount_of('申购单笔下限')
    item['minRedUnit'] = amount_of('赎回基数')
    item['minSubUnit'] = amount_of('认购基数')
    item['minPurUnit'] = amount_of('申购基数')
    allowedResRed = self.__get_xpath_value(
        response, "//span[text()='是否允许预约赎回']/../span[2]/@dataisbit")
    item['allowedResRed'] = allowedResRed.replace('0', "否").replace('1', "是")
    allowedRelRed = self.__get_xpath_value(
        response, "//span[text()='是否允许实时赎回']/../span[2]/@dataisbit")
    item['allowedRelRed'] = allowedRelRed.replace('0', "否").replace('1', "是")
    item['instructionUrl'] = self.__get_xpath_value(
        response, "//div[@class='title_r']/ul/li[1]/a/@href")
    yield item