def spider_auction_list_and_insert(self, url, court_id, category_id, status_id, mysql_instance, table_name):
    """Scrape every auction linked from a listing page and upsert each one.

    The listing page at ``url`` is fetched and decoded as GBK, every
    ``"//sf-item.taobao.com/sf_item/<name>.htm`` link is extracted, and for
    each link ``self.get_auction_json`` is called and its result is handed
    to ``mysql_instance.upsert_auction`` together with ``table_name``.

    ``court_id``, ``category_id`` and ``status_id`` are forwarded unchanged
    to ``get_auction_json``.

    Returns the number of links found on the page (duplicates included).
    """
    item_html = UrlUtil.get_html_with_proxy(url, False)
    # The dots are escaped and the captured part is lazy and may not contain
    # a quote, so two links that sit in one whitespace-free run (for example
    # compact JSON) are returned separately instead of fused into one match.
    url_partial_list = re.findall(
        r'"//sf-item\.taobao\.com/sf_item/([^\s"]+?\.htm)',
        item_html.decode('gbk'))
    for url_partial in url_partial_list:
        # Use a separate name so the listing-page ``url`` is not overwritten.
        detail_url = 'https://sf-item.taobao.com/sf_item/' + url_partial
        auction_json = self.get_auction_json(detail_url, court_id, category_id, status_id)
        mysql_instance.upsert_auction(auction_json, table_name)
    return len(url_partial_list)
 def spider_auction_and_insert(self, url, court_id, category_id, status_id,
                               mysql_instance):
     """Scrape every auction linked from a listing page and upsert each one.

     The page at ``url`` is fetched and decoded as UTF-8. Every
     ``<a href="...item2.do?...">`` anchor that directly wraps an ``<img``
     contributes its query string, which is appended to
     ``http://www.gpai.net/sf/item2.do`` to form the detail URL. Each detail
     page is converted with ``self.get_auction_json`` and passed to
     ``mysql_instance.upsert_auction``.

     ``court_id``, ``category_id`` and ``status_id`` are forwarded unchanged
     to ``get_auction_json``. Nothing is returned.
     """
     item_html = UrlUtil.get_html_with_proxy(url)
     # The previous pattern spelled this as ``item2.do?(\S*?)``, where ``?``
     # was a quantifier on "o" and the literal "?" only ended up in the
     # captured group by accident. Here the "." and "?" are matched
     # literally and the single group is the query string including its
     # leading "?", so links without a query string are no longer picked up.
     query_list = re.findall(
         r'<a href="\S*?item2\.do(\?\S*?)"><img',
         item_html.decode('utf8'))
     for query in query_list:
         # Use a separate name so the listing-page ``url`` is not overwritten.
         detail_url = 'http://www.gpai.net/sf/item2.do' + query
         auction_json = self.get_auction_json(detail_url, court_id,
                                              category_id, status_id)
         mysql_instance.upsert_auction(auction_json)
Example #3
0
    def get_auction_json(self, url, court_id, category_id, status_id):
        """Fetch one auction detail page and flatten it into a dict.

        The page at ``url`` is downloaded and parsed with BeautifulSoup
        (``html.parser``, GBK). Fields are read from fixed tags and CSS
        classes; a missing mandatory tag surfaces as ``AttributeError`` or
        ``IndexError`` from the lookup itself.

        ``AuctionModel``, ``AuctionType``, ``SellingPeriod`` and
        ``PaymentAdvance`` are always set to an empty string here.
        ``court_id``, ``category_id`` and ``status_id`` are copied verbatim
        into ``CourtId``, ``CategoryId`` and ``StatusId``.

        Returns the populated dict.
        """
        auction_json = {}
        html = UrlUtil.get_html_with_proxy(url, False)
        soup = BeautifulSoup(html, 'html.parser', from_encoding='gbk')

        auction_json['AuctionModel'] = ""
        auction_json['AuctionType'] = ""
        auction_json['SellingPeriod'] = ""

        # The "delay-td" cell feeds two fields, so look it up once.
        delay_td = soup.find('td', class_='delay-td')
        # Second <span> of the cell, minus its first character.
        auction_json['AuctionTimes'] = delay_td.find_all('span')[1].text[1:]
        auction_json['OnlineCycle'] = soup.find('span', class_='pay-mark').text
        auction_json['DelayCycle'] = delay_td.text.replace('\n', '').strip()

        auction_json['CashDeposit'] = ""
        auction_json['PaymentAdvance'] = ""

        # Price cells are addressed by position inside the J_HoverShow body;
        # only the cells whose values are stored are looked up.
        tds = soup.find('tbody', id='J_HoverShow').find_all('td')
        start_price_span = tds[0].find_all('span')[2]
        increment_span = tds[1].find_all('span')[2]
        cash_deposit_span = tds[3].find_all('span')[1].span
        access_price_span = tds[6].find_all('span')[1].span

        # NOTE(review): the meaning of the trailing True is defined by
        # assign_auction_property, which is not visible here.
        self.assign_auction_property(auction_json, 'StartPrice', start_price_span, True)
        self.assign_auction_property(auction_json, 'FareIncrease', increment_span, True)
        self.assign_auction_property(auction_json, 'CashDeposit', cash_deposit_span, True)
        self.assign_auction_property(auction_json, 'AccessPrice', access_price_span, True)

        # Bullets and non-breaking spaces in the title become plain spaces.
        auction_json['Title'] = soup.find('h1').text.replace(u"\u2022", u" ").replace(u"\xa0", u" ").strip()
        auction_json['CurrentPrice'] = soup.find('span', class_='pm-current-price').text.replace(',', '').strip()
        auction_json['CorporateAgent'] = soup.find('span', class_='item-announcement').text.strip()
        auction_json['Phone'] = soup.find('div', class_='contact-unit').find('p', class_='contact-line').find('span', class_='c-text').text

        # Optional tags: each is looked up once and replaced by a default
        # ('' or 0) when the page does not contain it.
        bid_user = soup.find('span', class_='current-bid-user')
        auction_json['BiddingRecord'] = bid_user.text.strip() if bid_user else ''
        reminder = soup.find('span', class_='pm-reminder')
        auction_json['SetReminders'] = reminder.find('em').text if reminder else 0
        surround = soup.find('span', class_='pm-surround')
        auction_json['Onlookers'] = surround.find('em').text if surround else 0
        auction_json['Enrollment'] = soup.find('em', class_='J_Applyer').text

        auction_json['Url'] = url
        auction_json['datetime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # Drops the first 35 and the last 4 characters of the URL; presumably
        # the fixed "https://sf-item.taobao.com/sf_item/" prefix and ".htm".
        auction_json['AuctionId'] = url[35:-4]
        auction_json['CourtId'] = court_id
        auction_json['CategoryId'] = category_id
        auction_json['StatusId'] = status_id
        return auction_json
 def get_user_id(self, url):
     """Return the digits of the first hidden ``userId`` input on the page.

     The page at ``url`` is fetched and decoded as GBK. The value is
     returned as a string. ``IndexError`` is raised when the page contains
     no matching ``<input type="hidden" name="userId" ...>`` element.
     """
     raw_page = UrlUtil.get_html_with_proxy(url, False)
     page_text = raw_page.decode('gbk')
     matches = re.findall(
         r'<input type="hidden" name="userId" value="(\d+)"', page_text)
     return matches[0]
 def get_total_count(self, url):
     """Return the number shown in the first ``<em class="count">`` tag.

     The page at ``url`` is fetched and decoded as GBK. ``IndexError`` is
     raised when the page contains no such tag.
     """
     raw_page = UrlUtil.get_html_with_proxy(url, False)
     page_text = raw_page.decode('gbk')
     count_matches = re.findall(r'<em class="count">(\d+)</em>', page_text)
     first_count = count_matches[0]
     return int(first_count)
    def get_auction_json(self, url, court_id, category_id, status_id):
        """Fetch one auction detail page and flatten it into a dict.

        The page at ``url`` is downloaded and parsed with lxml. Summary
        fields are read from cells of the first table inside the
        ``d-m-tb`` div; the remaining fields are extracted through
        ``self.assign_auction_property`` (regex on the raw HTML) and
        ``self.assign_auction_property_et`` (XPath on the parsed tree).

        Deposit values are stored under ``CashDeposit`` / ``PaymentAdvance``
        and, for backward compatibility, also under the legacy lowercase
        keys ``cash_deposit`` / ``payment_advance``.

        ``court_id``, ``category_id`` and ``status_id`` are copied verbatim
        into ``CourtId``, ``CategoryId`` and ``StatusId``.

        Returns the populated dict.
        """
        auction_json = {}
        html = UrlUtil.get_html_with_proxy(url)
        et = etree.HTML(html)

        def cell_xpath(row, col):
            # XPath selecting the text nodes of one cell of the summary table.
            return '//div[@class="d-m-tb"]/table[1]/tr[%d]/td[%d]/text()' % (row, col)

        def cell_text(row, col):
            # First text node of the cell, or None when the cell has none.
            nodes = et.xpath(cell_xpath(row, col))
            return nodes[0] if nodes else None

        auction_json['AuctionModel'] = ""
        auction_model = cell_text(1, 1)
        if auction_model is not None:
            # Longer cells drop 7 leading characters, shorter ones 5;
            # presumably this strips a label prefix -- TODO confirm.
            offset = 7 if len(auction_model) > 7 else 5
            auction_json['AuctionModel'] = auction_model[offset:]

        # Compare text with text. The previous str(value.encode('utf-8'))
        # comparison yields "b'...'" on Python 3 and therefore never matched.
        is_sell_off = auction_json['AuctionModel'] == u'变卖'

        auction_json['SellingPeriod'] = ""
        auction_json['AuctionTimes'] = ""
        auction_times = cell_text(1, 2)
        if auction_times is not None:
            # The same cell feeds a different key depending on the model.
            if is_sell_off:
                auction_json['SellingPeriod'] = auction_times[4:]
            else:
                auction_json['AuctionTimes'] = auction_times[5:]

        self.assign_auction_property_et(
            auction_json, 'AuctionType', et, cell_xpath(1, 3), 5)

        auction_json['OnlineCycle'] = ""
        online_cycle = cell_text(2, 1)
        if online_cycle is not None:
            offset = 6 if len(online_cycle) > 8 else 4
            auction_json['OnlineCycle'] = online_cycle[offset:]

        self.assign_auction_property_et(
            auction_json, 'DelayCycle', et, cell_xpath(2, 2), 5)
        self.assign_auction_property(auction_json, 'FareIncrease', html,
                                     r'<span id="Price_Step">(.*?)</span>',
                                     True)
        self.assign_auction_property(auction_json, 'StartPrice', html,
                                     r'<span id="Price_Start">(.*?)</span>',
                                     True)

        auction_json['CashDeposit'] = ""
        auction_json['PaymentAdvance'] = ""
        if is_sell_off:
            # In this model row 3 holds the advance payment in column 2 and
            # the deposit in column 3.
            payment_advance = cell_text(3, 2)
            cash_deposit = cell_text(3, 3)
        else:
            payment_advance = None
            cash_deposit = cell_text(3, 2)
        # Each cell is guarded on its own so a missing deposit cell no longer
        # raises IndexError when the advance-payment cell is present.
        if cash_deposit is not None:
            deposit_value = cash_deposit[4:].replace(",", "")
            auction_json['CashDeposit'] = deposit_value
            auction_json['cash_deposit'] = deposit_value
        if payment_advance is not None:
            advance_value = payment_advance[6:].replace(",", "")
            auction_json['PaymentAdvance'] = advance_value
            auction_json['payment_advance'] = advance_value

        auction_json['AccessPrice'] = ""
        access_price = cell_text(4, 1)
        if access_price is not None:
            # Thousands separators and tab characters are removed.
            auction_json['AccessPrice'] = access_price[4:].replace(
                ",", "").replace("\t", "")

        self.assign_auction_property(auction_json, 'Title', html,
                                     r'class="d-m-title"><b>(.*?)</b>', True)
        self.assign_auction_property_et(
            auction_json, 'Enrollment', et,
            '//div[@class="peoples-infos"]/span[1]/b[1]/text()')
        self.assign_auction_property_et(
            auction_json, 'SetReminders', et,
            '//div[@class="peoples-infos"]/span[2]/b[1]/text()')
        self.assign_auction_property_et(
            auction_json, 'Onlookers', et,
            '//div[@class="peoples-infos"]/span[3]/b[1]/text()')
        self.assign_auction_property(auction_json, 'CourtName', html,
                                     r"<td nowrap class='pr7'>(.*?)</td>",
                                     False, 5)
        self.assign_auction_property(auction_json, 'CorporateAgent', html,
                                     r"<td valign='top'>(.*?)</td>", False, 4)
        self.assign_auction_property(auction_json, 'Phone', html,
                                     r"<td colspan='2'>(.*?)</td>", False, 5)
        self.assign_auction_property(auction_json, 'BiddingRecord', html,
                                     r"id='html_Bid_Shu'>(.*?)</span>", True)
        self.assign_auction_property(auction_json, 'CurrentPrice', html,
                                     r"<b class='price-red'>(.*?)</b>", True)

        auction_json['Url'] = url
        auction_json['datetime'] = time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # Everything after the first 44 characters of the URL; presumably the
        # fixed detail-page prefix up to and including the id parameter name.
        auction_json['AuctionId'] = url[44:]
        auction_json['CourtId'] = court_id
        auction_json['CategoryId'] = category_id
        auction_json['StatusId'] = status_id
        return auction_json
 def get_total_count(self, url):
     """Return the content of the first ``<label>`` tag on the page as an int.

     The page at ``url`` is fetched and decoded as UTF-8. ``IndexError`` is
     raised when the page has no ``<label>...</label>`` pair, and
     ``ValueError`` when the first label's content is not an integer.
     """
     raw_page = UrlUtil.get_html_with_proxy(url, False)
     page_text = raw_page.decode('utf8')
     label_contents = re.findall(r'<label>(.*?)</label>', page_text)
     first_label = label_contents[0]
     return int(first_label)