def parse_ads_images(self, response):
    """Yield one AdsImage item bundling image requests for a single ad page.

    HACK: images get their own dedicated request because scraping ads and
    their images from the same page makes the backend raise foreign key
    exceptions (it tries to save images before the ad row exists).
    Not clean, but it works.
    """
    ad_identifier = self.get_ad_id(response.url)
    found_urls = response.css(
        'section#main .product figure a::attr(href)').extract()
    if not found_urls:
        return
    image_item = items.AdsImage(image_urls=[])
    image_item['ads_id'] = ad_identifier
    for single_url in found_urls:
        image_item['image_urls'].append(
            self.make_request('image', url=single_url))
    yield image_item
def parse_product(self, response):
    # Parse one product page (French-language market): yield a single Ads
    # item followed by one AdsImage item per product picture.
    # The whole body sits in one broad try so a malformed page only logs a
    # warning instead of crashing the crawl (all-or-nothing per page).
    try:
        # Offer id is the last path segment of the product URL.
        offer_id = re.search(r'/product/([\w-]+)/$', response.url).group(1)
        # The title is embedded in the header as: produit «<title>».
        title = re.search(r'produit «(.+)»', self.get_text(response.css('div.card-header h2'))).group(1)
        ads_item = items.Ads()
        ads_item['offer_id'] = offer_id
        ads_item['title'] = title
        ads_item['relativeurl'] = self.get_relative_url(response.url)
        ads_item['fullurl'] = response.url
        # Details table: one <th> label / <td> value row per property.
        trs = response.css('table.m-0 tr')
        for tr in trs:
            key = self.get_text(tr.css('th')).lower()
            value = self.get_text(tr.css('td'))
            if key == 'prix en ฿':  # price in BTC
                ads_item['price_btc'] = value
            elif key == 'catégorie':  # category
                ads_item['category'] = value
            elif key == 'vendeur':  # vendor: username taken from profile link
                value = tr.xpath(".//a/@href").extract_first()
                value = re.search("/account/(.*)/$", value).group(1)
                ads_item['vendor_username'] = value
            elif key == 'escrow':
                ads_item['escrow'] = value
            elif key == 'description':
                ads_item['description'] = value
            elif key in ['prix en €']:  # price in EUR
                ads_item['price_eur'] = value
            else:
                # Unknown row label — log it so new page fields get noticed.
                self.logger.warning("Found a new piece of product information, '%s', with value '%s' at %s" % (key, value, response.url))
        yield ads_item
        # One AdsImage item per picture, each linked back to the offer.
        images_url = response.css('.card-img-top a img::attr(src)').extract()
        for url in images_url:
            if url:
                ads_image = items.AdsImage(image_urls = [])
                ads_image['image_urls'].append(self.make_request('image', url=url))
                ads_image['ads_id'] = offer_id
                yield ads_image
    except Exception as error:
        self.logger.warning("Failed to yield ads at %s because '%s'" % (response.url, error))
def parse_listing(self, response):
    """Parse a listing page: yield the Ads item, an AdsImage item, then one
    ProductRating item per feedback entry.

    The ad and image sections are isolated in their own try blocks so a
    failure in one does not prevent the others from being yielded. The
    ratings loop still assumes the ad section managed to set 'offer_id'.
    """
    title = response.xpath(
        ".//section[@id='content1']//div[@class='listing_right']/span/text()"
    ).extract_first(default="").strip()
    username = response.xpath(
        ".//section[@id='content1']//div[@class='listing_right']//a[@class='greenlink']/text()"
    ).extract_first(default="").strip()
    if title == "" and username == "":
        self.logger.warning("Found what is likely an empty page at %s." %
                            response.url)
    else:
        # ---- Ads item -------------------------------------------------
        try:
            ads_item = items.Ads()
            ads_item['title'] = title
            ads_item['vendor_username'] = username
            ads_item['relativeurl'] = self.get_relative_url(response.url)
            ads_item['fullurl'] = response.url
            # A child listing id ('clid') takes precedence over 'lid'.
            if 'clid' in response.url:
                ads_item['offer_id'] = self.get_url_param(
                    response.url, 'clid')
            else:
                ads_item['offer_id'] = self.get_url_param(
                    response.url, 'lid')
            ads_item['category'] = response.xpath(
                ".//section[@id='content1']//div[@class='listing_right']/br/following-sibling::span/text()"
            ).extract_first(default="").strip()
            ads_item['ships_from'] = response.xpath(
                ".//section[@id='content1']//div[@class='listing_right']//b[contains(text(),'Shipping From:')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            ads_item['ships_to'] = response.xpath(
                ".//section[@id='content1']//div[@class='listing_right']//b[contains(text(),'Shipping To:')]/following-sibling::span/text()"
            ).extract_first(default="").strip()
            ads_item['description'] = self.get_text(
                response.xpath(".//section[@id='content1']/p"))
            ads_item['escrow'] = self.get_text(
                response.xpath(
                    ".//section[@id='content1']//div[@class='listing_right']/div/span[@style='float:right']/span"
                ))
            # Multisig is flagged by the presence of an icon, not by text.
            ads_item['multisig'] = bool(response.xpath(
                ".//section[@id='content1']//div[@class='listing_right']/div/span[@style='float:right']/img[@alt='Multisig']"
            ))
            ads_item['stock'] = self.get_text(
                response.xpath(
                    ".//section[@id='content1']//div[@class='listing_right']/div/span[not(@style='float:right')]/span"
                ))
            ads_item['shipping_options'] = self.get_shipping_options(
                response)
            ads_item['accepted_currencies'] = self.get_accepted_currencies(
                response)
            prices_text = self.get_text(
                response.xpath(
                    ".//section[@id='content1']//div[@class='listing_right']/p"
                ))
            price_usd = re.search(r"\$\s*([\d\.]+)", prices_text,
                                  re.M | re.I)
            price_btc = re.search(r"([\d\.]+)\s*฿", prices_text,
                                  re.M | re.I)
            price_xmr = re.search(r"([\d\.]+)\s*XMR", prices_text,
                                  re.M | re.I)
            if price_usd:
                ads_item["price_usd"] = price_usd.group(1)
            else:
                self.logger.warning("No price_usd found on %s" %
                                    response.url)
            if price_xmr:
                ads_item["price_xmr"] = price_xmr.group(1)
            if price_btc:
                ads_item["price_btc"] = price_btc.group(1)
            yield ads_item
        except Exception as error:
            self.logger.warning("Couldn't yield ad from %s (Error: %s)" %
                                (response.url, error))
        # ---- Images ---------------------------------------------------
        try:
            image_urls = response.xpath(
                ".//section[@id='content1']//div[@class='listing_image']/img/@src"
            ).extract()
            if image_urls:
                img_item = items.AdsImage(image_urls=[])
                for img_url in image_urls:
                    img_item['image_urls'].append(
                        self.make_request(reqtype='image', url=img_url))
                img_item['ads_id'] = ads_item['offer_id']
                yield img_item
        except Exception as error:
            self.logger.warning(
                "Couldn't yield ad images from %s (Error: %s)" %
                (response.url, error))
        # ---- Product ratings ------------------------------------------
        # Note: the price is also available on the ad itself.
        feedbacks = response.xpath(
            ".//section[@id='content2']//div[@class='feedback']")
        for feedback in feedbacks:
            rating = items.ProductRating()
            rating["ads_id"] = ads_item["offer_id"]
            rating["submitted_on_string"] = feedback.xpath(
                "div[@class='feedback_header']/span/text()").extract_first(
                    default="").strip()
            rating["submitted_on"] = self.parse_datetime(
                rating["submitted_on_string"])
            # Fix: extract_first() returns None when no USD price is shown;
            # calling .replace() on None used to abort the whole callback
            # (this loop is not wrapped in a try), losing all ratings.
            price_usd_raw = feedback.xpath(
                "div[@class='feedback_subheader']/div/span/text()[contains(., 'USD')]"
            ).extract_first()
            if price_usd_raw is not None:
                rating['price_usd'] = price_usd_raw.replace(
                    "~", "").replace("USD", "").replace(" ", "")
            rating_star = feedback.xpath(
                "div[@class='feedback_subheader']//div[contains(@style,'img/star.png')]/@style"
            ).extract_first(default="")
            rating_star = re.search(r"width:(\d+)px;height", rating_star,
                                    re.M | re.S)
            if rating_star:
                # The star bar is 120px wide at a full 5/5 score.
                rating_star = float(rating_star.group(1))
                rating['rating'] = rating_star / 120 * 5
            warning = feedback.xpath(
                "div[@class='feedback_subheader']/div/span")
            if warning and len(warning) > 1:
                rating['warnings'] = self.get_text(warning[0])
            rating["comment"] = self.get_text(feedback.xpath("p"))
            rating["submitted_by"] = feedback.xpath(
                "div[@class='feedback_header']//span[@class='feedbackScore']/../text()"
            ).extract_first(default="").strip()
            rating["submitter_rating"] = self.get_text(
                feedback.xpath(
                    "div[@class='feedback_header']//span[@class='feedbackScore']/sup"
                ))
            rating["submitted_by_number_transactions"] = self.get_text(
                feedback.xpath(
                    "div[@class='feedback_header']//span[@class='feedbackScore']/sub"
                ))
            yield rating
ads_item['fullurl'] = response.url parsed_url = urlparse(response.url) ads_item['relativeurl'] = "%s?%s" % (parsed_url.path, (parsed_url.query)) yield ads_item elif listing_not_found == 'Listing not found': self.logger.warning('Listing not found at %s' % response.url) else: self.logger.warning('Unknown listing status %s' % response.url) ## ===================== IMAGES ===================== images_url = response.css('img.productImage::attr(src)').extract() for url in images_url: img_item = items.AdsImage(image_urls=[]) img_item['image_urls'].append(self.make_request('image', url=url)) img_item['ads_id'] = ads_item['offer_id'] yield img_item ## ===================== Product Ratings (feedback) ========= rating_lines = response.css('.ratings table tr') for tr in rating_lines: try: rating_item = items.ProductRating() age = self.get_text(tr.css('td.age')) m = re.search('(\d+)d', age) if m: days_offset = m.group(1) # A sanity check. Dream has some dates which are in 1969 and 1970..
def parse_product(self, response):
    """Parse a product page: yield the Ads item, then AdsImage items for
    the thumbnails and the featured image.

    Each half runs in its own try so a broken ad section does not stop the
    image section (which reuses offer_id set in the first try).
    """
    try:
        username = self.get_text(response.css('h4.media-heading a'))
        # Offer id is the last path segment of the listing URL.
        offer_id = re.search(r'/listings/[\w-]+/([\w-]+)$',
                             response.url).group(1)
        title = self.get_text(response.css('h3.m-b-15'))
        ads_item = items.Ads()
        ads_item['vendor_username'] = username
        ads_item['offer_id'] = offer_id
        ads_item['title'] = title
        ads_item['relativeurl'] = self.get_relative_url(response.url)
        ads_item['fullurl'] = response.url
        # Prices are scraped from the raw footer HTML.
        prices = response.xpath(
            ".//div[@class='panel-footer text-center']").extract_first()
        ads_item['accepted_currencies'] = []
        price_usd = re.search("([0-9\.]*) USD", prices)
        price_xmr = re.search("([0-9\.]*) XMR", prices)
        price_btc = re.search("([0-9\.]*) BTC\n", prices)
        if price_usd:
            ads_item['price_usd'] = price_usd.group(1)
        if price_xmr:
            ads_item['price_xmr'] = price_xmr.group(1)
            ads_item['accepted_currencies'].append("xmr")
        if price_btc:
            ads_item['price_btc'] = price_btc.group(1)
            ads_item['accepted_currencies'].append("btc")
        # Details list: <dt> label followed by its <dd> value.
        dts = response.css("dl.dl-horizontal dt")
        for dt in dts:
            key = self.get_text(dt).lower()
            value = self.get_text(dt.xpath('following-sibling::dd[1]'))
            if key == 'sold':
                ads_item['already_sold'] = re.search(r'(\d+)',
                                                     value).group(1)
            elif key == 'ships from':
                ads_item['ships_from'] = value
            elif key == 'ships to':
                ads_item['ships_to'] = value
            elif key == 'payment type':
                ads_item['escrow'] = value
                if 'multisig' in value.lower():
                    ads_item['multisig'] = True
            elif key == 'product type':
                ads_item['category'] = value
            elif key in [
                    'sold by', 'trust rating', 'creation date', 'starts from'
            ]:
                pass  # Redundant with data collected elsewhere.
            else:
                # Fix: the message used to say "use profile page" (typo and
                # copy-paste from the profile parser); this is the product
                # page, matching the sibling parser's wording.
                self.logger.warning(
                    'New information found on product page : %s' % key)
        ads_item['shipping_options'] = [
            self.get_text(option)
            for option in response.css('select#shipping_method option')
        ]
        ads_item['description'] = self.get_text(response.css('p.break-me'))
        yield ads_item
    except Exception as error:
        self.logger.warning("Failed to yield ads at %s because '%s'" %
                            (response.url, error))
    try:
        # Yield images in thumbnail.
        # NOTE(review): these image requests use reqtype='regular' while the
        # rest of this project uses reqtype='image' — confirm deliberate.
        images_url = response.css('a.thumbnail img::attr(src)').extract()
        for url in images_url:
            if url:
                ads_image = items.AdsImage(image_urls=[])
                ads_image['ads_id'] = offer_id
                ads_image['image_urls'].append(
                    self.make_request(reqtype='regular', url=url))
                yield ads_image
        # Yield feature image.
        image_url = response.css(
            'img.featured-image::attr(src)').extract_first()
        if image_url:
            ads_image = items.AdsImage(image_urls=[])
            ads_image['ads_id'] = offer_id
            ads_image['image_urls'].append(
                self.make_request(reqtype='regular', url=image_url))
            yield ads_image
    except Exception as error:
        self.logger.warning("Failed to yield images at %s because '%s'" %
                            (response.url, error))
def parse_listing(self, response):
    """Parse a listing page: yield the Ads item, its AdsImage, one
    ProductRating per feedback entry, plus follow-up requests for the
    remaining feedback pages and the vendor profile.
    """
    ads = items.Ads()
    ads_img = items.AdsImage()
    listing_content = response.css("#content1")   # First tab.
    feedback_content = response.css("#content2")  # Second tab.
    ads['title'] = self.get_text_first(response.css('.listing_right span'))
    try:
        ads['offer_id'] = self.get_url_param(response.url, 'lid')
    except Exception:
        # Fix: the old handler re-called the same failing helper inside the
        # except clause, which would just re-raise uncaught. Record an empty
        # id instead, as the log message promises.
        self.logger.warning(
            "Ran into a URL parameter issue at URL: %s. Offer_ID is not recorded."
            % (response.url))
        ads['offer_id'] = ''
    ads['relativeurl'] = response.meta['relativeurl']
    ads['fullurl'] = self.make_url(ads['relativeurl'])
    user_url = response.css('.listing_right').xpath(
        './/a[contains(@href, "page=profile")]/@href').extract_first()
    # Some items don't have an associated vendor.
    try:
        ads['vendor_username'] = self.get_url_param(user_url, 'user')
    except Exception:
        self.logger.warning(
            'No seller available at URL: %s. Seller is noted as \'\'. Inspect the URL post-crawl.'
            % (response.url))
        ads['vendor_username'] = ''
    ads['category'] = response.meta['category']
    # Two types of ads: multilisting (price grid in a <select>) or plain.
    multilisting_select = listing_content.css(
        'select[name="multilistingChild"]')
    if not multilisting_select:
        ads['multilisting'] = False
        listing_right_p = self.get_text(
            listing_content.css(".listing_right p"))
        # \xe0\xb8\xbf appear to be the UTF-8 bytes of the bitcoin sign ฿
        # (mojibake, Python 2 heritage) — patterns kept byte-identical.
        m = re.search(r'\((\d+(\.\d+)?)\s*\xe0\xb8\xbf\)', listing_right_p)
        m2 = re.search(r'([0-9.]{1,10}) \xe0\xb8\xbf', listing_right_p)
        if m:
            ads['price'] = m.group(1)
        elif m2 is not None:
            # Fallback regex in case the first one misses the price.
            ads['price'] = m2.group(1)
    else:
        ads['multilisting'] = True
        options = [
            self.get_text(option)
            for option in multilisting_select.xpath('.//option[@value!=""]')
        ]
        ads['price'] = json.dumps(options)
    # Regexes over the raw HTML — the tags act as separators.
    listing_right_html = self.get_text(
        listing_content.css('.listing_right').extract_first())
    listing_right_span_text = self.get_text(
        listing_content.css('.listing_right span'))
    m = re.search('<b>shipping from\s*:\s*</b>\s*([^<]+)',
                  listing_right_html, re.IGNORECASE)
    if m:
        ads['ships_from'] = m.group(1)
    m = re.search('<b>shipping to\s*:\s*</b>\s*([^<]+)',
                  listing_right_html, re.IGNORECASE)
    if m:
        ads['ships_to'] = m.group(1)
    shipping_options = [
        self.get_text(option) for option in listing_content.css(
            '.listing_right form select[name="shipment"] option[value!=""]::text'
        ).extract()
    ]
    ads['shipping_options'] = json.dumps(shipping_options)
    ads['description'] = self.get_text(listing_content.xpath('./p'))
    # Stock level is one of a fixed set of phrases within the span text.
    for possibility in [
            'Excellent stock', 'Good stock', 'Low stock', 'Very low stock'
    ]:
        if possibility in listing_right_span_text:
            ads['stock'] = possibility
            break
    yield ads
    # Ads Image.
    ads_img['ads_id'] = ads['offer_id']
    ads_img['image_urls'] = [
        self.make_request(
            'image',
            url=listing_content.css(
                ".listing_image img::attr(src)").extract_first(),
            referer=response.url)
    ]
    yield ads_img
    # Handling listing feedbacks.
    for feedback in feedback_content.css(".feedback"):
        try:
            rating = items.ProductRating()
            rating['ads_id'] = ads['offer_id']
            rating['comment'] = self.get_text(feedback.css('p'))
            try:
                username = feedback.css('.feedback_header span a').xpath(
                    "./text()")[0].extract().strip()
            except Exception:
                username = ''
                self.logger.warning(
                    'Found a review with no username. URL: %s' % response.url)
            rating['submitted_on'] = self.parse_timestr(
                self.get_text(
                    feedback.css('.feedback_header').xpath(
                        'span/text()').extract_first()))
            rating['submitted_by'] = username
            star_styles = feedback.css('.feedback_subheader').xpath(
                './div/div')[0].extract()
            m = re.search(r'width:(\d+)px', star_styles)
            if m:
                width = int(m.group(1))
                # One star is 24 px wide.
                rating['rating'] = '%d/5' % (width // 24)
            else:
                self.logger.warning('Cannot find product rating score.')
            yield rating
        except Exception as e:
            self.logger.warning(
                'Could not get listing feedback at %s. Error %s' %
                (response.url, e))
    # If there are several pages of feedback, feedback_buffer_middleware
    # buffers them until we have them all, then sends them down the pipeline.
    for url in feedback_content.css(
            'div.pagination a::attr(href)').extract():
        if self.get_url_param(url, 'pg') != '1':
            yield self.make_request(
                'listing',
                url=url,
                relativeurl=response.meta['relativeurl'],
                ads_id=ads['offer_id'],
                category=response.meta['category'])
    # Fix: was "is not ''" (string identity comparison, a SyntaxWarning on
    # modern Python); use inequality. Skips the vendor-profile request when
    # the item has no vendor.
    if ads['vendor_username'] != '':
        yield self.make_request('userprofile', url=user_url)
def parse_offer(self, response):
    # Parse an offer page (or its /refund sub-page). Yields: the Ads item,
    # a follow-up 'offer-refund' request when on the main page, one AdsImage
    # per thumbnail, and one ProductRating per feedback table row.
    ads = items.Ads()
    ads['offer_id'] = self.get_offer_id_from_url(response.url)
    layout = 'unknown'
    info_block = response.xpath(
        '//h1[text()="Info"]/..'
    )  # Two known layout. Try first, fallback on second
    if len(info_block) == 1:
        layout = 'with_headings'
    else:
        layout = 'without_headings'
    if layout == 'without_headings':
        info_block = response.xpath(
            '//h1[contains(@class, "fheading")]/..')
    ads['title'] = self.get_text(response.css('h1.fheading'))
    ads['vendor_username'] = self.get_text(
        info_block.xpath('.//a[contains(@href, "profile")]'))
    # Category is forwarded through request meta when known.
    if 'category' in response.meta and response.meta[
            'category'] is not None:
        ads['category'] = response.meta['category']
    else:
        ads['category'] = None
    # The main page and the /refund page both map to the same ad URL.
    ads['fullurl'] = response.url.replace('/refund', '')
    ads['relativeurl'] = "/offer/%s" % ads['offer_id']
    # ===== Info block 1 - Ships from/to, escrow, multisig, etc ==========
    # We determine the type of info by the icon in front of it. Most
    # reliable way to do it as layout changes freely between listings.
    if layout == 'with_headings':
        p = info_block.xpath('./p[1]')
    elif layout == 'without_headings':
        # NOTE(review): both branches are identical here; presumably kept
        # for symmetry with the info-block-2 handling below — confirm.
        p = info_block.xpath('./p[1]')
    for line in p.extract_first().split('<br>'):
        linesel = scrapy.Selector(text=line)
        line_txt = self.get_text(linesel)
        if len(linesel.css(".ion-log-out")) > 0:  # Ships From icon
            m = re.search('ships from:(.+)', line_txt, re.IGNORECASE)
            if m:
                ads['ships_from'] = self.get_text(m.group(1))
        elif len(linesel.css(".ion-log-in")) > 0:  # Ships To icon
            m = re.search('only ships to certain countries\s*\(([^\)]+)\)',
                          line_txt, re.IGNORECASE)
            if m:
                ads['ships_to'] = json.dumps([
                    self.get_text(x.upper()) for x in m.group(1).split(',')
                ])
            elif 'Worldwide' in line_txt:
                ads['ships_to'] = 'Worldwide'
                m = re.search('with Exceptions\s*\(([^\)]+)\)', line_txt,
                              re.IGNORECASE)
                if m:
                    ads['ships_to_except'] = json.dumps([
                        self.get_text(x.upper())
                        for x in m.group(1).split(',')
                    ])
            else:
                self.logger.warning(
                    "New format of 'ships_to' string (%s) at %s" %
                    (line_txt, response.url))
        elif len(linesel.css(
                ".ion-android-share-alt")) > 0:  # Properties icons
            if line_txt:
                line_txt = line_txt.lower()
                ads['multisig'] = True if 'multisig' in line_txt else False
                ads['escrow'] = True if 'escrow' in line_txt else False
        elif len(linesel.css(
                ".ion-android-checkmark-circle")) > 0:  # Auto Accept icon
            if line_txt:
                line_txt = line_txt.lower()
                ads['auto_accept'] = True if 'auto-accept' in line_txt else False
        elif len(linesel.css(
                ".ion-ios-monitor-outline")) > 0:  # Digital Good icon
            pass
        else:
            icontype = linesel.css('.ionicons')
            if icontype:
                iconclass = icontype[0].xpath('@class').extract_first()
                self.logger.warning(
                    'Unhandled information available with icon of type (%s) in offer page at %s'
                    % (iconclass, response.url))
    # =========================================
    ## ============= Prices Options =======
    price_opt_table = response.xpath(
        ".//h4[contains(text(), 'Prices')]/../table")
    options = []
    for line in price_opt_table.css('tbody tr'):
        option = {}
        option['amount'] = self.get_text(line.css('td:nth-child(1)'))
        option['price_btc'] = self.get_text(line.css('td:nth-child(3)'))
        options.append(option)
    if len(options) > 0:
        ads['price_options'] = json.dumps(options)
        # A single price option doubles as the ad's flat price.
        if len(options) == 1:
            m = re.search('(\d+(\.\d+)?) BTC.+', options[0]['price_btc'])
            if m:
                ads['price'] = m.group(1)
    ## ==============
    ## ============ Shipping Options ========
    shipping_opt_table = response.xpath(
        ".//h4[contains(text(), 'Shipping Options')]/../table")
    options = []
    for line in shipping_opt_table.css('tbody tr'):
        option = {}
        option['name'] = self.get_text(line.css('td:nth-child(1)'))
        amount_raw = line.css('td:nth-child(2)').extract_first()
        amount_raw = amount_raw.replace(
            '<i class="ionicons ion-ios-infinite"></i>',
            'inf')  # Infinity icon means unlimited amount
        option['amount'] = self.get_text(scrapy.Selector(text=amount_raw))
        option['price_btc'] = self.get_text(
            line.css('td:nth-child(4)')).replace(' BTC', '')
        options.append(option)
    if len(options) > 0:
        ads['shipping_options'] = json.dumps(options)
    ## =====================
    # =================== Info block 2. List of key/value with key in bold.
    if layout == 'with_headings':
        p = response.xpath(
            './/h4[contains(text(), "Information")]/..').extract_first()
        if p is None:
            # BUG P IS NONE — fall back to "" so the split below yields
            # nothing instead of raising.
            self.logger.warning(
                "Invalid layout, could not find h4 element with text 'Information' on url "
                + response.url)
            p = ""
        p = re.sub('<h4>[^<]+</h4>', '', p)
    elif layout == 'without_headings':
        # NOTE(review): extract_first() may return None here, which would
        # make p.split() below raise — confirm this page always has ./p[2].
        p = info_block.xpath('./p[2]').extract_first()
    for line in p.split('<br>'):
        line_txt = self.get_text(scrapy.Selector(text=line))
        known = False
        m = re.search('minimum amount per order:?\s*(.+)', line_txt,
                      re.IGNORECASE)
        if m:
            ads['minimum_order'] = m.group(1)
            known = True
        m = re.search('maximum amount per order:?\s*(.+)', line_txt,
                      re.IGNORECASE)
        if m:
            ads['maximum_order'] = m.group(1)
            known = True
        m = re.search('views:?\s*(.+)', line_txt, re.IGNORECASE)
        if m:
            ads['views'] = m.group(1)
            known = True
        m = re.search('Quantity in stock:?\s*(.+)', line_txt,
                      re.IGNORECASE)
        if m:
            ads['stock'] = m.group(1)
            known = True
        m = re.search('Already sold:?\s*(.+)', line_txt, re.IGNORECASE)
        if m:
            ads['already_sold'] = m.group(1)
            known = True
        m = re.search('Country:?\s*(.+)', line_txt, re.IGNORECASE)
        if m:
            ads['country'] = m.group(1)
            known = True
        m = re.search('Replace-Time:?\s*(.+)', line_txt, re.IGNORECASE)
        if m:
            ads['replace_time'] = m.group(1)
            known = True
        m = re.search('Category', line_txt, re.IGNORECASE)
        if m:
            known = True
            # Only fill the category from the page when meta didn't have it.
            if ads['category'] is None:
                # Category levels are separated by icon tags; turn them
                # into '/' before reading the text.
                splitted_html = re.sub('\s*<i[^\>]+>\s*</i>\s*', '/', line)
                line_txt2 = self.get_text(
                    scrapy.Selector(text=splitted_html))
                m = re.search('Category:\s*(.+)\s*', line_txt2,
                              re.IGNORECASE)
                if m:
                    ads['category'] = m.group(1)
                    known = True
        if not known:
            self.logger.warning(
                'Unknown information type (%s) in ads at %s' %
                (line_txt, response.url))
    if response.url.endswith('refund'):
        ads['terms_and_conditions'] = self.get_text(
            response.css("#tabcontent"))
    else:
        #ads['description'] = self.get_text(response.css("#tabcontent"));
        ads['description'] = self.get_text(response.css("#tabcontent"))
        # Also fetch the refund policy sub-page for this offer.
        yield self.make_request('offer-refund',
                                url=response.url + '/refund',
                                category=ads['category'])
    yield ads
    #=================================================
    #if response.url.endswith('refund'):
    #    ads['terms_and_conditions'] = self.get_text(response.css("#tabcontent"))
    #else:
    #    #ads['description'] = self.get_text(response.css("#tabcontent"));
    #    ads['description'] = self.get_text(response.css("#tabcontent"))
    #    yield self.make_request('offer-refund', url=response.url + '/refund', category=ads['category'])
    ## ===================== IMAGES =====================
    images_url = response.css('img.img-thumbnail::attr(src)').extract()
    for url in images_url:
        if url:
            img_item = items.AdsImage(image_urls=[])
            img_item['image_urls'].append(
                self.make_request('image', url=url)
            )  # Need Scrapy > 1.4.0 for this to work (base64 url encoded data).
            img_item['ads_id'] = ads['offer_id']
            yield img_item
    ## ============================
    ## ========== Feedbacks =====
    feedback_table = response.xpath(
        './/h3[contains(text(), "Feedback")]/../table')
    for line in feedback_table.css('tbody tr'):
        try:
            rating = items.ProductRating()
            score = self.get_text(line.css('td:nth-child(1) .text-muted'))
            m = re.search('\((\d+(.\d+)?)\)', score)
            if not m:
                self.logger.warning('Cannot read feedback score %s' % score)
                continue
            rating['rating'] = "%s/5" % m.group(1)
            #rating['comment'] = self.get_text(line.css('td:nth-child(2)'))
            comment = line.xpath('./td[2]/text()')[0].extract().strip()
            # NOTE(review): strip() never returns None so this branch looks
            # dead, and 'url' below is the leftover loop variable from the
            # images section above (undefined when there were no images) —
            # confirm intent.
            if comment is None:
                self.logger.warning(
                    "Couldn't find the review. Inserting an empty string at URL: %s"
                    % url)
            else:
                rating['comment'] = comment
            rating['ads_id'] = ads['offer_id']
            rating['submitted_by'] = self.get_text(
                line.css('td:nth-child(3)'))
            rating['submitted_on'] = self.parse_timestr(
                self.get_text(line.css('td:nth-child(4)')))
            yield rating
        except Exception as e:
            self.logger.warning(
                "Could not get product feedback. Error : %s" % e)
def parse_listing(self, response):
    """Parse a listing page.

    Yields the Ads item, a userprofile request for the vendor, AdsImage
    items for the thumbnails, and follow-up requests for the other tabs
    (details / terms & conditions / feedback). Unparseable pages raise
    WarningException internally and are reported as warnings; anything
    else propagates.
    """
    try:
        ads = items.Ads()
        # Breadcrumb: every <li> but the last is a category level; the
        # last one is the listing title.
        # Fix: was a list comprehension used purely for its append side
        # effect ([title_cat.append(...) for li in lis]).
        title_cat = [
            self.get_text(li) for li in response.css('.container ol li')
        ]
        if not title_cat:
            raise WarningException("Cannot determine title of listing.")
        m = re.search(r'listing\/(\d+)', response.url)
        if not m:
            raise WarningException('Cannot find listing ID')
        ads['offer_id'] = m.group(1)
        ads['relativeurl'] = '/listing/%s' % ads['offer_id']
        ads['fullurl'] = self.make_url(ads['relativeurl'])
        ads['title'] = title_cat[-1]  # Last breadcrumb item.
        ads['category'] = '/'.join(title_cat[:-1])  # All but the last item.
        ads['price'] = self.get_text(
            response.css(".listing-price .fa-btc").xpath('..'))
        # Property table: anchor on the row containing "Vendor" and walk
        # all of its sibling rows.
        lines = response.css(".container form table").xpath(
            './/td[contains(text(), "Vendor")]/../../tr')
        for line in lines:
            tds = line.css('td')
            if len(tds) != 2:
                raise WarningException(
                    "Listing property table line does not have two cells.")
            prop = self.get_text(tds[0]).lower()
            val = tds[1]
            if prop == 'vendor':
                vendor_link = val.xpath('a[contains(@href, "vendor")]')
                yield self.make_request(
                    'userprofile',
                    url=vendor_link.xpath('@href').extract_first())
                ads['vendor_username'] = self.get_text(vendor_link)
            elif prop == 'class':
                ads['ads_class'] = self.get_text(val)
            elif prop == 'ships from':
                ads['ships_from'] = self.get_text(val)
            elif prop == 'ships to':
                ads['ships_to'] = self.get_text(val)
            elif prop == 'except':
                ads['ships_to_except'] = self.get_text(val)
            elif prop == 'delivery':
                ads['shipping_options'] = json.dumps([self.get_text(val)])
            else:
                self.logger.warning("New property found : %s. Listing at %s"
                                    % (prop, response.url))
        # A populated shipping selector overrides the single 'delivery' row.
        shipping_options = [
            self.get_text(element) for element in response.css(
                'select[name="shippingID"] option:not([value="0"])')
        ]
        if shipping_options:
            ads['shipping_options'] = json.dumps(shipping_options)
        ads['description'] = self.get_presentation_text(response, 'Details')
        ads['terms_and_conditions'] = self.get_presentation_text(
            response, 'Terms & Conditions')
        ads['in_stock'] = bool(
            response.css(".listing-stock .label-success"))
        stock_text = self.get_text(
            response.css(".listing-stock .label-success")).lower()
        m = re.search(r'(\d+) in stock', stock_text)
        if m:
            ads['stock'] = m.group(1)
        yield ads
        ## ===================== IMAGES =====================
        images_url = response.css('img.img-thumbnail::attr(src)').extract()
        for url in images_url:
            if url:
                img_item = items.AdsImage(image_urls=[])
                # Need Scrapy > 1.4.0 for this to work (base64 url
                # encoded data).
                img_item['image_urls'].append(
                    self.make_request('image', url=url))
                img_item['ads_id'] = ads['offer_id']
                yield img_item
        ## =========
        # Follow the tab links (details / terms / feedback).
        presentations = response.css("li[role='presentation'] a")
        for presentation in presentations:
            name = self.get_text(presentation).lower()
            link = presentation.xpath('@href').extract_first()
            if name in ['details', 'terms & conditions']:
                yield self.make_request('listing', url=link)
            elif name == 'feedback':
                yield self.make_request('listing_feedback',
                                        url=link,
                                        listing_id=ads['offer_id'])
            else:
                self.logger.warning(
                    'Encountered an unknown tab %s. Listing at %s' %
                    (name, response.url))
    except WarningException as e:
        self.logger.warning("Cannot parse listing. %s" % e)
def parse_ads(self, response):
    """Parse a Flugsvamp product page.

    Yields an Ads item and a User item for the vendor, then one AdsImage
    item per product picture. Removed/empty pages are only logged.
    """
    title = response.xpath(".//div[@id='main']/h1/text()").extract_first()
    if title is None and response.xpath(
            ".//div[contains(text(), 'Produkten finns inte.')]"):
        # "Produkten finns inte." = Swedish for "The product does not exist."
        self.logger.warning(
            "Found what is likely an empty page at %s. Flugsvamp writes: %s"
            % (response.url,
               response.xpath(
                   ".//div[contains(text(), 'Produkten finns inte.')]/text()"
               ).extract_first().strip()))
    else:
        ads_item = items.Ads()
        user_item = items.User()
        ads_item['title'] = title
        ads_item['offer_id'] = response.url.split("=")[-1]
        ads_item['fullurl'] = response.url
        ads_item['relativeurl'] = self.get_relative_url(response.url)
        # The description block carries a "Beskrivning:" (description)
        # label; strip it so only the actual text remains.
        description = self.get_text(
            response.xpath(
                '//strong[contains(text(), "Beskrivning:")]/parent::div')
        ).replace('Beskrivning:', '')
        if description:
            ads_item['description'] = description
        try:
            keys = response.xpath(".//div[@class='lightrow']")
            for key_ele in keys:
                key = key_ele.xpath("strong/text()").extract_first()
                # Fix: was 'key == None'; use an identity check for None.
                if key is None:
                    continue
                key = key.lower()
                if "omd" in key:  # "omdömen" = ratings
                    value = key_ele.xpath(
                        './/span[@class="grey"]/text()').extract_first()
                    m = re.search(r'(.*?)\ \((.*?)\ omd', value,
                                  re.M | re.I | re.S)
                    if m:
                        ads_item['product_rating'] = m.group(1)
                        ads_item['already_sold'] = m.group(2)
                elif "ljare" in key:  # "säljare" = vendor
                    ads_item['vendor_username'] = key_ele.xpath(
                        './/a/text()').extract_first()
                    user_item['username'] = ads_item['vendor_username']
                    user_item['relativeurl'] = key_ele.xpath(
                        './/a/@href').extract_first()
                    user_item['fullurl'] = response.urljoin(
                        user_item['relativeurl'])
                    value = key_ele.xpath(
                        './/span[@class="grey"]/text()').extract_first()
                    m = re.search(r'(.*?)\ \((.*?)\ omd', value,
                                  re.M | re.I | re.S)
                    if m:
                        user_item['average_rating'] = m.group(1)
                        user_item['feedback_received'] = m.group(2)
                elif key == "kategori:":  # category
                    ads_item['category'] = key_ele.xpath(
                        './/a/text()').extract_first()
                elif key == "kvantitet:":  # quantity
                    ads_item['quantity'] = self.get_text(
                        key_ele.xpath('span[@class="float-right"]'))
                elif key == "ditt pris inkl. frakt:":  # your price incl. shipping
                    value = self.get_text(
                        key_ele.xpath('.//span[@class="float-right"]'))
                    m = re.search(r'(.*?)\ \((.*?)\)', value,
                                  re.M | re.I | re.S)
                    if m:
                        ads_item['price_btc'] = m.group(2)
                elif key == "pristabell:":  # price table, one option per <br>
                    price_options = [
                        self.get_text(scrapy.Selector(text=list_item))
                        for list_item in key_ele.xpath(
                            './/span[@class="float-right"]').extract_first(
                            ).split('<br>')
                    ]
                    if len(price_options) > 0:
                        ads_item['price_options'] = price_options
                else:
                    self.logger.warning(
                        "Found a new piece of product information, '%s', at %s"
                        % (key, response.url))
            yield ads_item
            # NOTE(review): user_item is yielded even when no vendor row was
            # found, i.e. possibly with no fields set — confirm downstream.
            yield user_item
        except Exception as error:
            self.logger.warning(
                "Failed to parse listing (Error: '%s'). See URL %s" %
                (error, response.url))
        # ===================== IMAGES =====================
        images_url = response.css('img.float-right::attr(src)').extract()
        for url in images_url:
            if url:
                img_item = items.AdsImage(image_urls=[])
                img_item['image_urls'].append(
                    self.make_request(reqtype='image',
                                      url=url,
                                      headers=self.tor_browser))
                img_item['ads_id'] = ads_item['offer_id']
                yield img_item
def parse_product(self, response):
    """Parse a product page into an Ads item plus its main image.

    The ad and the image are yielded from independent try blocks so a
    failure in one does not lose the other.
    """
    ads_id = re.search(r'code=([\w]+)', response.url).group(1)
    try:
        ads_item = items.Ads()
        ads_item['offer_id'] = ads_id
        ads_item['title'] = self.get_text(
            response.xpath(".//div[@class='col-md-8']/h2"))
        ads_item['relativeurl'] = self.get_relative_url(response.url)
        ads_item['fullurl'] = response.url
        ads_item['accepted_currencies'] = response.css(
            '.well input[name="currency"]::attr(value)').extract()
        ads_item['description'] = self.get_text(
            response.css('ul.nav-tabs').xpath('following-sibling::p'))
        # Stock counter may be absent; guard before regexing —
        # re.search(pattern, None) raises TypeError and would abort the ad.
        stock_text = response.css(
            '.listing-stock span::text').extract_first()
        in_stock_match = re.search(r'(\d+)',
                                   stock_text) if stock_text else None
        if in_stock_match:
            ads_item['in_stock'] = in_stock_match.group(1)
        ads_item['shipping_options'] = []
        for option in response.css(
                'select[name="shipping_option"] option'):
            ads_item['shipping_options'].append(self.get_text(option))
        item_sold_match = re.search(
            r'(\d+) items sold since ([\d\-: ]+)',
            self.get_text(
                response.css('table.table-condensed tr:last-child td')))
        if item_sold_match:
            ads_item['already_sold'] = item_sold_match.group(1)
        # Key/value rows of the product-detail table.
        trs = response.css('div.col-sm-7 table.table-condensed tr')
        for tr in trs:
            tds = tr.css('td')
            if len(tds) == 2:
                key = self.get_text(tds[0]).lower()
                value = self.get_text(tds[1])
                if key == 'vendor':
                    ads_item['vendor_username'] = tds.xpath(
                        ".//a/text()").extract_first()
                elif key == 'class':
                    ads_item['ads_class'] = value
                elif key == 'escrow type':
                    ads_item['escrow'] = value
                elif key == 'ships from':
                    ads_item['ships_from'] = value
                else:
                    self.logger.warning(
                        'New information found on product page : %s' % key)
        prices = self.get_text(
            response.xpath(".//div[@class='listing-price']"))
        price_eur = re.search(r"([0-9\.]*) EUR", prices)
        price_usd = re.search(r"([0-9\.]*) USD", prices)
        if price_usd:
            ads_item['price_usd'] = price_usd.group(1)
        if price_eur:
            ads_item['price_eur'] = price_eur.group(1)
        # The BTC price lives in a <span> inside the price div. The span
        # may be missing entirely, so guard against None before 'in'.
        price_btc = response.xpath(
            ".//div[@class='listing-price']/span").extract_first()
        if price_btc and 'btc' in price_btc:
            ads_item['price_btc'] = self.get_text(
                response.xpath(".//div[@class='listing-price']/span"))
        else:
            self.logger.warning(
                "Couldn't match BTC price. There might be another currency available. Please inspect %s"
                % response.url)
        yield ads_item
    except Exception as error:
        self.logger.warning("Failed to yield ads at %s because '%s'" %
                            (response.url, error))
    try:
        image_url = response.css(
            'div.index-img img::attr(src)').extract_first()
        if image_url:
            ads_image = items.AdsImage(image_urls=[])
            ads_image['ads_id'] = ads_id
            ads_image['image_urls'].append(
                self.make_request('image', url=self.make_url(image_url)))
            yield ads_image
    except Exception as error:
        self.logger.warning(
            "Failed to yield ad image at %s because '%s'" %
            (response.url, error))
def parse_listing(self, response):
    """Parse a listing page.

    Yields the Ads item, one ProductRating per feedback row (when the
    Feedback tab is active), and an AdsImage item for listing pictures.
    """
    try:
        ads_item = items.Ads()
        ads_item["offer_id"] = re.search(r"ls_id=(\d+)", response.url,
                                         re.M | re.I).group(1)
        ads_item["vendor_username"] = self.get_text(
            response.xpath("//small/a[contains(@href,'user.php?u_id=')]"))
        # Vendor link text is "name (rating ...)" — keep only the name.
        ads_item["vendor_username"] = ads_item["vendor_username"].split(
            "(")[0].strip()
        ads_item["fullurl"] = response.url.split("&")[0]
        ads_item["relativeurl"] = self.get_relative_url(
            ads_item["fullurl"])
        ads_item["title"] = response.xpath(
            ".//div[@class='col-sm-12']/a[contains(@href, 'ls_id')]/text()"
        ).extract_first()
        ads_item["ships_to"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Ship To :')]/ancestor::small")
        ).replace("Ship To :", "").strip()
        if ads_item["ships_to"] == "":
            # Fallback: some layouts put the value in the next <small>.
            ads_item["ships_to"] = self.get_text(
                response.xpath(
                    "//small//b[contains(text(),'Ship To :')]/ancestor::small/following-sibling::small[1]"
                ))
        ads_item["ships_from"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Origin Country :')]/ancestor::small"
            )).replace("Origin Country :", "").strip()
        ads_item["ads_class"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Product class :')]/ancestor::small"
            )).replace("Product class :", "").strip()
        ads_item["quantity"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Quantity :')]/ancestor::small"
            )).replace("Quantity :", "").strip()
        accepted_currencies = []
        sale_price = self.get_text(
            response.xpath(
                "//form//span[contains(text(),'Sale Price :')]")).replace(
                    "Sale Price :", "").strip()
        if "USD" in sale_price:
            ads_item["price_usd"] = re.search(r"([\d\.]+)\s*USD",
                                              sale_price, re.M | re.I)
            ads_item["price_usd"] = ads_item["price_usd"].group(
                1) if ads_item["price_usd"] else None
            # NOTE(review): "USD" is deliberately not appended to
            # accepted_currencies (only BTC is) — confirm intended.
        if "BTC" in sale_price:
            ads_item["price_btc"] = re.search(r"([\d\.]+)\s*BTC",
                                              sale_price, re.M | re.I)
            ads_item["price_btc"] = ads_item["price_btc"].group(
                1) if ads_item["price_btc"] else None
            accepted_currencies.append("BTC")
        ads_item["accepted_currencies"] = ",".join(accepted_currencies)
        ads_item["shipping_options"] = self.get_shipping_options(response)
        # new fields
        ads_item["escrow"] = self.get_text(
            response.xpath(
                "//small//b[contains(text(),'Payment :')]/ancestor::small")
        ).replace("Payment :", "").strip()
        active_tab = self.get_text(
            response.xpath(
                "//ul[@class='nav nav-tabs']/li[@class='active']/a"))
        if "Product Description" in active_tab:
            ads_item['description'] = self.get_text(
                response.xpath("//div[@class='tab-content']"))
        elif "Refund Policy" in active_tab:
            ads_item['refund_policy'] = self.get_text(
                response.xpath("//div[@class='tab-content']"))
        elif "Product Tags" in active_tab:
            pass
        elif "Feedback" in active_tab:
            feedbacks = response.xpath(
                "//div[@class='tab-content']//table/tbody/tr")
            for feedback in feedbacks:
                rating = items.ProductRating()
                rating["ads_id"] = ads_item["offer_id"]
                rating["submitted_by"] = self.get_text(
                    feedback.xpath("td[3]/small"))
                rating["submitted_on_string"] = self.get_text(
                    feedback.xpath("td[5]/small")).replace(
                        "View Item", "").strip()
                rating["submitted_on"] = self.parse_datetime(
                    rating["submitted_on_string"])
                rating["comment"] = self.get_text(
                    feedback.xpath("td[2]/small"))
                rating["price_usd"] = self.get_text(
                    feedback.xpath("td[4]/small"))
                # new fields
                # Score is a ballot-box glyph (☑ / ☒ / ☐). Accept both the
                # real unicode code points and the legacy UTF-8-bytes-as-str
                # values the original py2-era comparisons used.
                score = self.get_text(feedback.xpath("td[1]"))
                if score in ("\u2611", "\xe2\x98\x91"):
                    rating["rating"] = "Positive"
                elif score in ("\u2612", "\xe2\x98\x92"):
                    rating["rating"] = "Negative"
                elif score in ("\u2610", "\xe2\x98\x90"):
                    rating["rating"] = "Neutral"
                else:
                    # BUG FIX: previously logged rating["rating"], which is
                    # unset in this branch and raised KeyError, aborting
                    # the whole ad.
                    self.logger.warning(
                        "Unknown rating type '%s' at %s" %
                        (score, response.url))
                yield rating
        else:
            self.logger.warning("Unknown tab: %s at %s" %
                                (active_tab, response.url))
        yield ads_item
    except Exception as error:
        self.logger.warning("Couldn't yield Ad (Error %s) at %s." %
                            (error, response.url))
    if self.is_listing_tab_page(response) is False:
        # self.requests_from_listing_page(response)
        image_urls = response.xpath(
            "//img[@class='pull-left']/@src").extract()
        # offer_id may be missing if parsing failed above; skip images then
        # instead of raising an uncaught KeyError.
        if len(image_urls) > 0 and 'offer_id' in ads_item:
            img_item = items.AdsImage(image_urls=[])
            for img_url in image_urls:
                # e.g. uploads/9bc5f18d5667081890e8972def13da2f_100_100.png
                # -> uploads/9bc5f18d5667081890e8972def13da2f.png
                img_url = re.sub(r"_\d+_\d+\.", ".", img_url)
                img_item['image_urls'].append(
                    self.make_request(reqtype='image', url=img_url))
            img_item['ads_id'] = ads_item['offer_id']
            yield img_item
def parse_listing(self, response):
    """Parse an item page: yield the ad, then its images, then its reviews."""
    # --- The ad. ---
    listing = items.Ads()
    listing["offer_id"] = re.search(r"/item/([^/]+)", response.url,
                                    re.M | re.I)
    if not listing["offer_id"]:
        self.logger.warning("offer_id is None at %s" % response.url)
        return
    listing["offer_id"] = listing["offer_id"].group(1)
    vendor_match = re.search(r"/user/([^/]+)", response.url, re.M | re.I)
    listing["vendor_username"] = vendor_match
    if vendor_match:
        listing["vendor_username"] = vendor_match.group(1)
    # Canonical URL: everything up to and including the offer id.
    listing["fullurl"] = response.url.split(
        listing["offer_id"])[0] + listing["offer_id"]
    listing["relativeurl"] = self.get_relative_url(listing["fullurl"])
    # Title, images and rating all live in the same header segment.
    header = ".//div[@class='ui segment inverted t-item-image secondary']"
    listing["title"] = "".join(
        response.xpath(header + "/h3/text()").extract()).strip()
    listing["description"] = self.get_text(
        response.xpath(
            ".//div[@class='ui segment']/h3[contains(text(),'About')]"
            "/following-sibling::div"))
    listing["shipping_options"] = self.get_shipping_options(response)
    listing["product_rating"] = response.xpath(
        header + "/h3//i[@class='icon thumbs up']"
        "/following-sibling::span/text()").extract_first(
            default="").strip()
    yield listing
    # --- The images. ---
    picture_urls = response.xpath(header + "/img/@src").extract()
    if picture_urls:
        image_item = items.AdsImage(image_urls=[])
        for picture_url in picture_urls:
            image_item['image_urls'].append(
                self.make_request(reqtype='image', url=picture_url))
        image_item['ads_id'] = listing['offer_id']
        yield image_item
    # --- The reviews. ---
    comments = response.xpath(
        ".//div[@class='ui segment']/h3[contains(text(),'Reviews')]"
        "/following-sibling::div[@class='ui comments']"
        "/div[@class='comment']")
    for comment in comments:
        review = items.ProductRating()
        review["ads_id"] = listing["offer_id"]
        review["submitted_by"] = comment.xpath(
            ".//a[@class='author']/text()").extract_first(
                default="").strip().replace("@", "")
        review["submitted_on_string"] = comment.xpath(
            ".//span[@class='date']/text()").extract_first(
                default="").strip()
        review["submitted_on"] = self.parse_datetime(
            review["submitted_on_string"])
        review["comment"] = self.get_text(
            comment.xpath(".//pre[@class='text']"))
        review["rating"] = comment.xpath(
            ".//i[@class='icon thumbs up']/following-sibling::span/text()"
        ).extract_first(default="").strip()
        yield review
def parse_product(self, response):
    """Parse a product page into an Ads item (price options serialized as
    JSON) and an AdsImage item for the product picture."""
    ads = items.Ads()
    # Vendor name is either forwarded via request meta or read from the
    # page header.
    if 'username' in response.meta:
        username = response.meta['username']
    else:
        username = self.get_username_from_header(response)
    ads['offer_id'] = self.get_product_id_from_url(response.url)
    ads['relativeurl'] = 'product/%s' % ads['offer_id']
    ads['fullurl'] = self.make_url(ads['relativeurl'])
    ads['vendor_username'] = username
    if 'category' in response.meta and response.meta['category'] is not None:
        ads['category'] = response.meta['category']
    ads['title'] = self.get_text_first(response.css('.container h1'))
    ads['description'] = self.get_text(
        response.css('article.internal-product-desc'))
    ads['escrow'] = False
    ads['in_stock'] = False
    price_options = []
    # For each table line: first cell is the quantity; a line can hold
    # several options, each marked by its own Buy button.
    for tr in response.css('.internal-product-varieties table tr'):
        qty = self.get_text(tr.xpath('./td[1]'))
        for btn in tr.css('.btn'):
            option = {}
            option['qty'] = qty
            btn_txt = self.get_text(btn)
            btn_text_lower = btn_txt.lower()
            if 'escrow' in btn_text_lower:
                option['method'] = 'escrow'
                ads['in_stock'] = True
            elif 'direct-pay' in btn_text_lower:
                option['method'] = 'direct-pay'
                ads['in_stock'] = True
            elif 'out of stock' in btn_text_lower:
                option['method'] = 'out of stock'
            else:
                option['method'] = ''
                self.logger.warning(
                    'Unknown price payment method from string "%s" at %s' %
                    (btn_text_lower, response.url))
            # Scan the line for the BTC price that precedes this button:
            # the last price seen before the button's cell belongs to it.
            last_price_seen = ''
            for td in tr.css('td'):
                td_txt = self.get_text(td)
                # BUG FIX: the dot was unescaped ('(\d+(.\d+)?)'), letting
                # any character act as the decimal separator.
                m = re.search(r'BTC\s*(\d+(\.\d+)?)', td_txt,
                              re.IGNORECASE)
                if m:
                    last_price_seen = m.group(1)
                btn_in_cell = td.css('.btn')
                if btn_in_cell:
                    if self.get_text(btn_in_cell) == btn_txt:
                        option['price'] = last_price_seen
                        break
            if 'price' not in option:
                self.logger.warning(
                    'Could not find price for product at %s' % response.url)
                option['price'] = ''
            if option['method'] == 'escrow':
                ads['escrow'] = True
            price_options.append(option)
    ads['price_options'] = json.dumps(price_options)
    # With a single option, expose its price directly on the ad.
    if len(price_options) == 1:
        ads['price'] = price_options[0]['price']
    ads_img = items.AdsImage()
    ads_img['ads_id'] = ads['offer_id']
    img_src = response.css('.main-content').xpath(
        './/img[contains(@src, "product_images")]/@src').extract_first()
    if img_src:
        ads_img['image_urls'] = [self.make_request('image', url=img_src)]
        yield ads_img
    yield ads