def spider_auction_list_and_insert(self, url, court_id, category_id, status_id, mysql_instance, table_name):
    """Scrape every auction linked from a listing page and upsert each one.

    The listing page is fetched with ``UrlUtil.get_html_with_proxy(url, False)``,
    decoded as GBK, and scanned for ``//sf-item.taobao.com/sf_item/<...>.htm``
    links. Each link is expanded to an absolute https URL, turned into a
    record by ``self.get_auction_json`` and passed to
    ``mysql_instance.upsert_auction(record, table_name)``.

    Args:
        url: Listing page URL.
        court_id, category_id, status_id: Forwarded unchanged to
            ``get_auction_json`` for every item.
        mysql_instance: Object exposing ``upsert_auction(record, table_name)``.
        table_name: Forwarded to ``upsert_auction``.

    Returns:
        The number of item links matched on the page (every regex match is
        counted, repeated links included).
    """
    listing_bytes = UrlUtil.get_html_with_proxy(url, False)
    item_link_pattern = re.compile(r'"\/\/sf-item.taobao.com\/sf_item\/(\S+.htm)')
    item_paths = item_link_pattern.findall(listing_bytes.decode('gbk'))

    for item_path in item_paths:
        item_url = 'https://sf-item.taobao.com/sf_item/' + item_path
        record = self.get_auction_json(item_url, court_id, category_id, status_id)
        mysql_instance.upsert_auction(record, table_name)

    return len(item_paths)
def spider_auction_and_insert(self, url, court_id, category_id, status_id, mysql_instance):
    """Scrape every auction linked from a listing page and upsert each one.

    The page is fetched with ``UrlUtil.get_html_with_proxy(url)``, decoded as
    UTF-8 and scanned for ``<a href="...item2.do...">`` anchors that wrap an
    ``<img``. For each anchor the detail URL is rebuilt on
    ``http://www.gpai.net/sf/item2.do``, scraped with
    ``self.get_auction_json`` and handed to ``mysql_instance.upsert_auction``.

    Args:
        url: Listing page URL.
        court_id, category_id, status_id: Forwarded unchanged to
            ``get_auction_json`` for every item.
        mysql_instance: Object exposing ``upsert_auction(record)``.

    Returns:
        None.
    """
    listing_bytes = UrlUtil.get_html_with_proxy(url)
    # NOTE(review): in `item2.do?` the `?` is a regex quantifier (optional
    # `o`), not a literal. The literal "?" of the link therefore lands at the
    # start of the second group, which the concatenation below relies on.
    anchor_pattern = re.compile(r'<a href="(\S*?)item2.do?(\S*?)"><img')

    for _href_prefix, query_part in anchor_pattern.findall(listing_bytes.decode('utf8')):
        item_url = 'http://www.gpai.net/sf/item2.do' + query_part
        record = self.get_auction_json(item_url, court_id, category_id, status_id)
        mysql_instance.upsert_auction(record)
def get_auction_json(self, url, court_id, category_id, status_id):
    """Scrape one auction detail page into a flat dict of auction fields.

    The page is fetched with ``UrlUtil.get_html_with_proxy(url, False)`` and
    parsed with BeautifulSoup (``html.parser``, ``from_encoding='gbk'``).

    Args:
        url: Detail page URL, stored under ``Url``. ``AuctionId`` is taken as
            ``url[35:-4]``, which matches URLs of the form
            ``https://sf-item.taobao.com/sf_item/<id>.htm`` (35-character
            prefix, 4-character suffix) -- TODO confirm all callers pass
            that form.
        court_id: Stored unchanged under ``CourtId``.
        category_id: Stored unchanged under ``CategoryId``.
        status_id: Stored unchanged under ``StatusId``.

    Returns:
        dict with the keys ``AuctionModel``, ``AuctionType``,
        ``SellingPeriod`` and ``PaymentAdvance`` (always ``""`` here),
        ``AuctionTimes``, ``OnlineCycle``, ``DelayCycle``, ``CashDeposit``,
        ``Title``, ``CurrentPrice``, ``CorporateAgent``, ``Phone``,
        ``BiddingRecord``, ``SetReminders``, ``Onlookers``, ``Enrollment``,
        ``Url``, ``datetime``, ``AuctionId``, ``CourtId``, ``CategoryId``,
        ``StatusId``, plus whatever ``self.assign_auction_property`` stores
        for ``StartPrice``, ``FareIncrease``, ``CashDeposit`` and
        ``AccessPrice``.

    Raises:
        AttributeError, IndexError: If a required element is missing from the
            page. Only the current bidder, reminder and onlooker elements are
            treated as optional.
    """
    html = UrlUtil.get_html_with_proxy(url, False)
    soup = BeautifulSoup(html, 'html.parser', from_encoding='gbk')

    auction_json = {}
    auction_json['AuctionModel'] = ""
    auction_json['AuctionType'] = ""
    auction_json['SellingPeriod'] = ""

    # The same <td class="delay-td"> cell supplies both AuctionTimes and DelayCycle.
    delay_td = soup.find('td', class_='delay-td')
    auction_json['AuctionTimes'] = delay_td.find_all('span')[1].text[1:]
    auction_json['OnlineCycle'] = soup.find('span', class_='pay-mark').text
    auction_json['DelayCycle'] = delay_td.text.replace('\n', '').strip()
    auction_json['CashDeposit'] = ""
    auction_json['PaymentAdvance'] = ""

    # Summary table: only cells 0, 1, 3 and 6 are read.
    tds = soup.find('tbody', id='J_HoverShow').find_all('td')
    start_price_span = tds[0].find_all('span')[2]
    increment_span = tds[1].find_all('span')[2]
    cash_deposit_span = tds[3].find_all('span')[1].span
    access_price_span = tds[6].find_all('span')[1].span
    # assign_auction_property is defined elsewhere; presumably it stores the
    # span's cleaned text under the given key -- verify against the helper.
    self.assign_auction_property(auction_json, 'StartPrice', start_price_span, True)
    self.assign_auction_property(auction_json, 'FareIncrease', increment_span, True)
    self.assign_auction_property(auction_json, 'CashDeposit', cash_deposit_span, True)
    self.assign_auction_property(auction_json, 'AccessPrice', access_price_span, True)

    # Bullet and non-breaking-space characters in the title become plain spaces.
    auction_json['Title'] = soup.find('h1').text.replace(u"\u2022", u" ").replace(u"\xa0", u" ").strip()
    auction_json['CurrentPrice'] = soup.find('span', class_='pm-current-price').text.replace(',', '').strip()
    auction_json['CorporateAgent'] = soup.find('span', class_='item-announcement').text.strip()
    contact_line = soup.find('div', class_='contact-unit').find('p', class_='contact-line')
    auction_json['Phone'] = contact_line.find('span', class_='c-text').text

    # Optional elements: look each one up once and fall back when absent.
    bid_user = soup.find('span', class_='current-bid-user')
    auction_json['BiddingRecord'] = bid_user.text.strip() if bid_user is not None else ''
    reminder = soup.find('span', class_='pm-reminder')
    auction_json['SetReminders'] = reminder.find('em').text if reminder is not None else 0
    surround = soup.find('span', class_='pm-surround')
    auction_json['Onlookers'] = surround.find('em').text if surround is not None else 0
    auction_json['Enrollment'] = soup.find('em', class_='J_Applyer').text

    auction_json['Url'] = url
    # strftime without a time tuple formats the current local time.
    auction_json['datetime'] = time.strftime('%Y-%m-%d %H:%M:%S')
    auction_json['AuctionId'] = url[35:-4]
    auction_json['CourtId'] = court_id
    auction_json['CategoryId'] = category_id
    auction_json['StatusId'] = status_id
    return auction_json
def get_user_id(self, url):
    """Return the ``userId`` hidden-input value found on the page at *url*.

    The page is fetched with ``UrlUtil.get_html_with_proxy(url, False)`` and
    decoded as GBK.

    Returns:
        The digits of the first ``<input type="hidden" name="userId" ...>``
        match, as a string.

    Raises:
        IndexError: If the page contains no such input.
    """
    page_bytes = UrlUtil.get_html_with_proxy(url, False)
    user_id_pattern = re.compile(r'<input type="hidden" name="userId" value="(\d+)"')
    found_ids = user_id_pattern.findall(page_bytes.decode('gbk'))
    return found_ids[0]
def get_total_count(self, url):
    """Return the number shown in the first ``<em class="count">`` on the page.

    The page is fetched with ``UrlUtil.get_html_with_proxy(url, False)`` and
    decoded as GBK.

    Returns:
        The first matched count as an ``int``.

    Raises:
        IndexError: If the page has no ``<em class="count">`` with digits.
    """
    page_bytes = UrlUtil.get_html_with_proxy(url, False)
    count_pattern = re.compile(r'<em class="count">(\d+)</em>')
    matched_counts = count_pattern.findall(page_bytes.decode('gbk'))
    first_count = matched_counts[0]
    return int(first_count)
def get_auction_json(self, url, court_id, category_id, status_id):
    """Scrape one auction detail page into a flat dict of auction fields.

    The page is fetched with ``UrlUtil.get_html_with_proxy(url)`` and parsed
    with ``etree.HTML``. Summary values come from the first table inside
    ``<div class="d-m-tb">``; the remaining fields are extracted by
    ``self.assign_auction_property`` (regex on the raw HTML) and
    ``self.assign_auction_property_et`` (XPath), both defined elsewhere.

    Args:
        url: Detail page URL, stored under ``Url``. ``AuctionId`` is taken as
            ``url[44:]``, which matches URLs of the form
            ``http://www.gpai.net/sf/item2.do?Web_Item_ID=<id>`` (44-character
            prefix) -- TODO confirm all callers pass that form.
        court_id: Stored unchanged under ``CourtId``.
        category_id: Stored unchanged under ``CategoryId``.
        status_id: Stored unchanged under ``StatusId``.

    Returns:
        dict of scraped fields. ``CashDeposit`` and ``PaymentAdvance`` hold
        the parsed amounts (``""`` when absent); the same values are also
        stored under the older ``cash_deposit`` / ``payment_advance`` keys
        when present, in case existing consumers read those spellings.
    """
    summary_table = '//div[@class="d-m-tb"]/table[1]'
    html = UrlUtil.get_html_with_proxy(url)
    et = etree.HTML(html)
    auction_json = {}

    # Cell texts are trimmed by fixed offsets -- presumably to drop a leading
    # label whose length varies with the value; verify against a live page.
    auction_json['AuctionModel'] = ""
    model_cells = et.xpath(summary_table + '/tr[1]/td[1]/text()')
    if model_cells:
        model_text = model_cells[0]
        auction_json['AuctionModel'] = model_text[7:] if len(model_text) > 7 else model_text[5:]

    # Compare as text. Wrapping the encoded bytes in str() yields the
    # "b'...'" repr on Python 3, which can never equal the literal.
    is_sell_off = auction_json['AuctionModel'] == u'变卖'

    auction_json['SellingPeriod'] = ""
    auction_json['AuctionTimes'] = ""
    times_cells = et.xpath(summary_table + '/tr[1]/td[2]/text()')
    if times_cells:
        times_text = times_cells[0]
        # The same cell holds the selling period for sell-off items and the
        # auction round otherwise.
        if is_sell_off:
            auction_json['SellingPeriod'] = times_text[4:]
        else:
            auction_json['AuctionTimes'] = times_text[5:]

    self.assign_auction_property_et(
        auction_json, 'AuctionType', et, summary_table + '/tr[1]/td[3]/text()', 5)

    auction_json['OnlineCycle'] = ""
    cycle_cells = et.xpath(summary_table + '/tr[2]/td[1]/text()')
    if cycle_cells:
        cycle_text = cycle_cells[0]
        auction_json['OnlineCycle'] = cycle_text[6:] if len(cycle_text) > 8 else cycle_text[4:]

    self.assign_auction_property_et(
        auction_json, 'DelayCycle', et, summary_table + '/tr[2]/td[2]/text()', 5)
    self.assign_auction_property(
        auction_json, 'FareIncrease', html, r'<span id="Price_Step">(.*?)</span>', True)
    self.assign_auction_property(
        auction_json, 'StartPrice', html, r'<span id="Price_Start">(.*?)</span>', True)

    auction_json['CashDeposit'] = ""
    auction_json['PaymentAdvance'] = ""
    if is_sell_off:
        # Sell-off pages: td[2] is the advance payment, td[3] the deposit.
        advance_cells = et.xpath(summary_table + '/tr[3]/td[2]/text()')
        deposit_cells = et.xpath(summary_table + '/tr[3]/td[3]/text()')
        if advance_cells:
            # The deposit cell may be missing even when the advance is present.
            if deposit_cells:
                deposit = deposit_cells[0][4:].replace(",", "")
                auction_json['CashDeposit'] = deposit
                auction_json['cash_deposit'] = deposit
            advance = advance_cells[0][6:].replace(",", "")
            auction_json['PaymentAdvance'] = advance
            auction_json['payment_advance'] = advance
    else:
        # Other pages: the deposit sits in td[2] and there is no advance.
        deposit_cells = et.xpath(summary_table + '/tr[3]/td[2]/text()')
        if deposit_cells:
            deposit = deposit_cells[0][4:].replace(",", "")
            auction_json['CashDeposit'] = deposit
            auction_json['cash_deposit'] = deposit

    auction_json['AccessPrice'] = ""
    access_cells = et.xpath(summary_table + '/tr[4]/td[1]/text()')
    if access_cells:
        auction_json['AccessPrice'] = access_cells[0][4:].replace(",", "").replace(" ", "")

    self.assign_auction_property(
        auction_json, 'Title', html, r'class="d-m-title"><b>(.*?)</b>', True)
    self.assign_auction_property_et(
        auction_json, 'Enrollment', et, '//div[@class="peoples-infos"]/span[1]/b[1]/text()')
    self.assign_auction_property_et(
        auction_json, 'SetReminders', et, '//div[@class="peoples-infos"]/span[2]/b[1]/text()')
    self.assign_auction_property_et(
        auction_json, 'Onlookers', et, '//div[@class="peoples-infos"]/span[3]/b[1]/text()')
    self.assign_auction_property(
        auction_json, 'CourtName', html, r"<td nowrap class='pr7'>(.*?)</td>", False, 5)
    self.assign_auction_property(
        auction_json, 'CorporateAgent', html, r"<td valign='top'>(.*?)</td>", False, 4)
    self.assign_auction_property(
        auction_json, 'Phone', html, r"<td colspan='2'>(.*?)</td>", False, 5)
    self.assign_auction_property(
        auction_json, 'BiddingRecord', html, r"id='html_Bid_Shu'>(.*?)</span>", True)
    self.assign_auction_property(
        auction_json, 'CurrentPrice', html, r"<b class='price-red'>(.*?)</b>", True)

    auction_json['Url'] = url
    # strftime without a time tuple formats the current local time.
    auction_json['datetime'] = time.strftime('%Y-%m-%d %H:%M:%S')
    auction_json['AuctionId'] = url[44:]
    auction_json['CourtId'] = court_id
    auction_json['CategoryId'] = category_id
    auction_json['StatusId'] = status_id
    return auction_json
def get_total_count(self, url):
    """Return the integer held by the first ``<label>`` element on the page.

    The page is fetched with ``UrlUtil.get_html_with_proxy(url, False)`` and
    decoded as UTF-8.

    Returns:
        The text of the first ``<label>...</label>`` match converted with
        ``int``.

    Raises:
        IndexError: If the page has no ``<label>`` element.
        ValueError: If the first label's text is not an integer.
    """
    page_bytes = UrlUtil.get_html_with_proxy(url, False)
    label_pattern = re.compile(r'<label>(.*?)</label>')
    label_texts = label_pattern.findall(page_bytes.decode('utf8'))
    first_label = label_texts[0]
    return int(first_label)