Beispiel #1
0
 def parse(self, response):
     """Scrape the summary fields of a loan page into one ``PaipaidaiItem``.

     Every field is stored as the list returned by ``extract()`` for its
     XPath, so a field whose XPath matches nothing holds an empty list.
     Yields exactly one item per response.
     """
     selector = Selector(response)
     # (item field, XPath) pairs, evaluated in this order. Each field is
     # scraped independently of the others, so entries that are not needed
     # can simply be commented out.
     field_queries = (
         ('username', '//a[@class="username"]/text()'),
         ('process', '//div[@class="item w260"]/text()'),
         ('bidinfo', '//span[@class="bidinfo"]/text()'),
         ('money',
          '//div[@class="newLendDetailMoneyLeft"]/dl[1]/dd/text()'),
         ('rate',
          '//div[@class="newLendDetailMoneyLeft"]/dl[2]/dd/text()'),
         ('timelimit',
          '//div[@class="newLendDetailMoneyLeft"]/dl[3]/dd/text()'),
         ('finishtime',
          '//span[@class="countdown_row countdown_amount"]/text()'),
     )
     item = PaipaidaiItem()
     for field, query in field_queries:
         item[field] = selector.xpath(query).extract()
     yield item
Beispiel #2
0
    def parse_user(self, response):
        """Parse a loan detail page and yield one populated ``PaipaidaiItem``.

        Headline fields (user, rating, amount, progress, bidders, borrower
        profile) are read with XPath. The borrower's history statistics are
        pulled with regexes from a copy of the page text that has had its
        tags stripped. A statistic whose label or value cannot be found is
        stored as ``""``.

        ``item['state']`` is "0" (bidding ended), "0.5" (still in progress),
        "1" (loan succeeded) or "-1" (status could not be determined). The
        item is yielded only when the state is not "-1".

        :param response: the Scrapy response of the detail page; its URL is
            expected to contain ``id=<digits>``.
        :raises IndexError: if ``response.url`` contains no ``id=<digits>``.
        :raises ValueError: if the headline block does not yield exactly
            three ``dd`` texts, or the borrower block does not yield exactly
            six ``span`` texts (the page layout is not the expected one).
        """
        item = PaipaidaiItem()

        # Listing id. Deliberately strict: without it the item is useless.
        item_id = re.findall(r"id=(\d+)", response.url)[0]
        item['id'] = item_id

        # User name: last path segment of the avatar link. extract_first()
        # yields None when nothing matches, hence the ``or ""`` guards below.
        user_href = response.xpath(".//a[@class='userface']/@href").extract_first()
        item['userName'] = (user_href or "").split("/")[-1]

        # User rating: whatever remains of the class attribute once the
        # 'creditRating' marker is removed.
        rating_class = response.xpath(
            ".//span[contains(@class,'creditRating')]/@class").extract_first()
        item['userRate'] = (rating_class or "").replace('creditRating', "").strip()

        # Amount, annual rate, term. Strict unpacking doubles as a layout
        # check: any other number of values raises ValueError.
        amount, year_rate, time_limit = response.xpath(
            ".//div[@class='newLendDetailMoneyLeft']//dd/text()").extract()
        item['amount'] = amount.replace(",", "")
        item['year_rate'] = year_rate
        item['timeLimit'] = time_limit

        # Progress bar percentage, e.g. "85%".
        progress_text = "".join(response.xpath(
            ".//div[@class='part clearfix']/div[@class='item w260']/text()").extract())
        item['progressBar'] = self._first_match(r"\d+%", progress_text)

        # Number of bidders.
        bidders_text = response.xpath(".//div[@class='item w164']/text()").extract_first()
        item['bidders'] = self._first_match(r"\d+", bidders_text or "")

        # Remaining time (stored as scraped; None when the node is absent).
        item['end_time'] = response.xpath(".//span[@id='leftTime']//text()").extract_first()

        # Borrower profile. Strict unpacking: exactly six values expected.
        male, age, register_time, degree_education, college, learn_form = response.xpath(
            ".//div[@class='lender-info']//div[@class='flex']//span/text()").extract()
        item['male'] = male
        item['age'] = age
        item['registerTime'] = register_time
        item['degree_education'] = degree_education
        item['college'] = college
        item['learn_form'] = learn_form

        # Verification records, joined with "|".
        item['authentication_information'] = "|".join(
            response.xpath(".//ul[@class='record-info']/li/text()").extract())

        # Plain-text view of the page for the label-based statistics below.
        html = self._strip_markup(response.text)
        first = self._first_match

        item['succeed_borrow'] = first(r"成功借款次数.*?(\d+).*?次", html)
        item['first_borrow_time'] = first(
            r"第一次成功借款时间.*?(\d+/\d+/\d+).*?$", html, re.M)
        # The \u escapes are resolved by the string literal itself, so this
        # pattern stays non-raw and only the regex backslash is doubled.
        item['borrow_history'] = first(u"历史记录: ([\u4e00-\u9fa5|\\d+|,]*)", html)
        item['succeed_repay'] = first(r"成功还款次数:.*?(\d+).*?次", html)
        item['normal_repay'] = first(r"正常还清次数:.*?(\d+).*?次", html).strip()
        item['overtime_less_15'] = first(r"逾期\(0-15天\)还清次数:.*?(\d+).*?次", html)
        item['overtime_more_15'] = first(r"逾期\(15天以上\)还清次数:.*?(\d+).*?次", html)

        # Monetary statistics, e.g. "47,570.00" -> "47570.00".
        item['cumulative_amount_of_borrowing'] = first(
            r"累计借款金额:((?:\d+,?)*(?:\d+)?(?:\.\d+)?)", html).replace(",", "")
        item['to_be_repay'] = first(
            r"待还金额:((?:\d+,?)*(?:\d+)?(?:\.\d+)?)", html).replace(",", "")
        item['to_be_gather'] = first(
            r"待收金额:\s*?((?:\d+,)*\d+\.\d+)", html, re.S).replace(",", "")
        item['max_borrow_amount'] = first(
            r"单笔最高借款金额:((?:\d+,?)*(?:\d+)?(?:\.\d+)?)", html).replace(",", "")
        item['max_liabilities'] = first(
            r"历史最高负债:((?:\d+,?)*(?:\d+)?(?:\.\d+)?)", html).replace(",", "")

        # Last point of the first "data: [...]" array found on a page that
        # shows the debt chart. The field is set only when a value exists.
        if '负债曲线图' in html:
            series = first(r"data:\s*\[([\s\S]*?)\]", html)
            points = [point.strip() for point in series.split(",") if point.strip()]
            if points:
                item['last_fuzhai'] = points[-1]

        # Summary tables without a style attribute. In each, the first half
        # of the cells is skipped and the second half is read as numbers.
        summary_tables = response.xpath(
            ".//table[@class='lendDetailTab_tabContent_table1 normal' and not(@style)]")
        sum_of_to_pay = max_overtime = ""
        for table in summary_tables:
            headers = table.xpath(".//th/text()").extract()
            if '金额' in headers:
                # Amounts due over the next six months.
                sum_of_to_pay = sum(
                    self._second_half_numbers(table.xpath(".//td/text()").extract()))
            if '最大逾期天数' in headers:
                # Overdue days for the past six months with repayments.
                overdue_days = self._second_half_numbers(
                    table.xpath(".//td/text()").extract())
                if overdue_days:  # max() of an empty list would raise
                    max_overtime = max(overdue_days)
        item['sum_of_to_pay'] = sum_of_to_pay
        item['max_overtime'] = max_overtime

        # Historical successful loans: average rate, term (days) and amount.
        history_rows = response.xpath(
            ".//table[@class='lendDetailTab_tabContent_table1 normal' and @style]"
            "//tr[@class='tab-list']")
        avg_rate = avg_data = avg_amount = 0
        if history_rows:
            for row in history_rows:
                cells = [text.strip() for text in row.xpath(".//text()").extract()
                         if text.strip()]
                # Rate cell ends with a one-character unit, e.g. "12%".
                avg_rate += float(cells[1][:-1]) * 0.01
                # Term: months are counted as 30 days each.
                months = re.findall(r"(\d+)个月", cells[2])
                if months:
                    avg_data += int(months[0]) * 30
                days = re.findall(r"(\d+)天", cells[2])
                if days:
                    avg_data += int(days[0])
                avg_amount += float(cells[3].replace(",", ""))
            row_count = len(history_rows)
            avg_rate = avg_rate / row_count
            avg_data = avg_data / row_count
            avg_amount = avg_amount / row_count
        item['avg_rate'] = avg_rate
        item['avg_data'] = avg_data
        item['avg_amount'] = avg_amount
        item['update_time'] = datetime.datetime.now().strftime("%Y-%m-%d")

        # Status
        if response.xpath(".//div[@class='newbidstatus_lb']"):  # bidding has ended
            state = "0"
        elif response.xpath(".//div[@class='restMoney']"):  # still in progress
            state = "0.5"
        elif response.xpath(".//div[@class='wrapNewLendDetailInfoRight']//img[@alt='借款成功']"):
            state = "1"
        elif "借款成功" in response.text:
            state = '1'
        else:
            print("item state error:%s" % item_id, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            self.log("tem state error:%s" % item_id, logging.INFO)
            state = "-1"
        item['state'] = state
        if state != '-1':
            if state == '1':
                self.log("ITEM fiinish(OneDown):%s" % item_id, logging.INFO)
            else:
                self.log("item finish:%s" % item_id, logging.INFO)
            print("item finish:%s" % item_id, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            yield item

    @staticmethod
    def _first_match(pattern, text, flags=0, default=""):
        """Return the first ``re.findall`` result for *pattern*, or *default*."""
        found = re.findall(pattern, text, flags)
        return found[0] if found else default

    @staticmethod
    def _strip_markup(html):
        """Return *html* without "¥", script/style blocks, comments and tags."""
        html = html.replace("¥", "")
        # Script blocks (only those whose body contains no "<").
        html = re.sub(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', '', html, flags=re.I)
        # HTML comments.
        html = re.sub(r'<!--[^>]*-->', '', html)
        # Style blocks.
        html = re.sub(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', '', html, flags=re.I)
        # Any remaining opening or closing tag.
        return re.sub(r"</?\w+[^>]*>", "", html)

    @staticmethod
    def _second_half_numbers(cells):
        """Return the second half of *cells* as floats ("¥" and "," removed)."""
        values = cells[len(cells) // 2:]
        return [float(value.replace("¥", "").replace(",", "")) for value in values]