    def parse_ads_images(self, response):
        # Somehow CGMC doesn't like it when ad images and ads are scraped from the same page and keeps
        # throwing foreign-key exceptions (trying to save images before the ad is saved).
        # Hack: make a new request only for the images. Not clean, but it works :\
        ads_id = self.get_ad_id(response.url)
        image_urls = response.css(
            'section#main .product figure a::attr(href)').extract()
        if len(image_urls) > 0:
            img_item = items.AdsImage(image_urls=[])
            for img_url in image_urls:
                img_item['image_urls'].append(
                    self.make_request('image', url=img_url))
            img_item['ads_id'] = ads_id
            yield img_item
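Example #1 relies on two spider helpers that are not shown in the snippet, get_ad_id() and make_request(). A minimal, hypothetical sketch of what they might look like (the names appear in the example, but the URL pattern, callbacks, and signatures below are assumptions, not the real spider's code):

    # Hypothetical sketch only -- not the real spider's implementation.
    # Assumes `import re` and `import scrapy` at module level.
    def get_ad_id(self, url):
        # Assume the ad id is a path segment such as /ads/<id>; adjust the
        # pattern to the market's actual URL scheme.
        m = re.search(r'/ads/([\w-]+)', url)
        return m.group(1) if m else None

    def make_request(self, reqtype, url, **kwargs):
        # Wrap scrapy.Request so that 'image' requests can be appended to an
        # AdsImage item and fetched by an images pipeline, while other request
        # types go back to a parse callback.
        if reqtype == 'image':
            return scrapy.Request(url)
        return scrapy.Request(url, callback=self.parse, meta=kwargs)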
Example #2
    def parse_product(self, response):
        try:
            offer_id    = re.search(r'/product/([\w-]+)/$', response.url).group(1)
            title       = re.search(r'produit «(.+)»', self.get_text(response.css('div.card-header h2'))).group(1)

            ads_item                    = items.Ads()
            ads_item['offer_id']        = offer_id
            ads_item['title']           = title
            ads_item['relativeurl']     = self.get_relative_url(response.url)
            ads_item['fullurl']         = response.url

            trs = response.css('table.m-0 tr')

            for tr in trs:
                key     = self.get_text(tr.css('th')).lower()
                value   = self.get_text(tr.css('td'))
                
                if key == 'prix en ฿':
                    ads_item['price_btc'] = value
                elif key == 'catégorie':
                    ads_item['category'] = value
                elif key == 'vendeur':
                    value = tr.xpath(".//a/@href").extract_first()
                    value = re.search("/account/(.*)/$", value).group(1)
                    ads_item['vendor_username'] = value
                elif key == 'escrow':
                    ads_item['escrow'] = value
                elif key == 'description':
                    ads_item['description'] = value
                elif key == 'prix en €':
                    ads_item['price_eur'] = value
                else:
                    self.logger.warning("Found a new piece of product information, '%s', with value '%s' at %s" % (key, value, response.url))

            yield ads_item

            images_url = response.css('.card-img-top a img::attr(src)').extract()

            for url in images_url:
                if url:
                    ads_image = items.AdsImage(image_urls = [])
                    ads_image['image_urls'].append(self.make_request('image', url=url))
                    ads_image['ads_id'] = offer_id
                    yield ads_image

        except Exception as error:
            self.logger.warning("Failed to yield ads at %s because '%s'" % (response.url, error))
Example #3
    def parse_listing(self, response):
        title = response.xpath(
            ".//section[@id='content1']//div[@class='listing_right']/span/text()"
        ).extract_first(default="").strip()
        username = response.xpath(
            ".//section[@id='content1']//div[@class='listing_right']//a[@class='greenlink']/text()"
        ).extract_first(default="").strip()
        if title == "" and username == "":
            self.logger.warning("Found what is likely an empty page at %s." %
                                response.url)
        else:
            # Try to yield ads.
            try:
                ads_item = items.Ads()
                ads_item['title'] = title
                ads_item['vendor_username'] = username
                ads_item['relativeurl'] = self.get_relative_url(response.url)
                ads_item['fullurl'] = response.url
                if 'clid' in response.url:
                    ads_item['offer_id'] = self.get_url_param(
                        response.url, 'clid')
                else:
                    ads_item['offer_id'] = self.get_url_param(
                        response.url, 'lid')
                ads_item['category'] = response.xpath(
                    ".//section[@id='content1']//div[@class='listing_right']/br/following-sibling::span/text()"
                ).extract_first(default="").strip()
                ads_item['ships_from'] = response.xpath(
                    ".//section[@id='content1']//div[@class='listing_right']//b[contains(text(),'Shipping From:')]/following-sibling::span/text()"
                ).extract_first(default="").strip()
                ads_item['ships_to'] = response.xpath(
                    ".//section[@id='content1']//div[@class='listing_right']//b[contains(text(),'Shipping To:')]/following-sibling::span/text()"
                ).extract_first(default="").strip()
                ads_item['description'] = self.get_text(
                    response.xpath(".//section[@id='content1']/p"))
                ads_item['escrow'] = self.get_text(
                    response.xpath(
                        ".//section[@id='content1']//div[@class='listing_right']/div/span[@style='float:right']/span"
                    ))
                ads_item['multisig'] = response.xpath(
                    ".//section[@id='content1']//div[@class='listing_right']/div/span[@style='float:right']/img[@alt='Multisig']"
                )
                ads_item['multisig'] = True if ads_item['multisig'] else False
                ads_item['stock'] = self.get_text(
                    response.xpath(
                        ".//section[@id='content1']//div[@class='listing_right']/div/span[not(@style='float:right')]/span"
                    ))
                ads_item['shipping_options'] = self.get_shipping_options(
                    response)
                ads_item['accepted_currencies'] = self.get_accepted_currencies(
                    response)

                prices_text = self.get_text(
                    response.xpath(
                        ".//section[@id='content1']//div[@class='listing_right']/p"
                    ))
                price_usd = re.search(r"\$\s*([\d\.]+)", prices_text,
                                      re.M | re.I)
                price_btc = re.search(r"([\d\.]+)\s*฿", prices_text,
                                      re.M | re.I)
                price_xmr = re.search(r"([\d\.]+)\s*XMR", prices_text,
                                      re.M | re.I)

                if price_usd:
                    ads_item["price_usd"] = price_usd.group(1)
                else:
                    self.logger.warning("No price_usd found on %s" %
                                        response.url)
                if price_xmr:
                    ads_item["price_xmr"] = price_xmr.group(1)
                if price_btc:
                    ads_item["price_btc"] = price_btc.group(1)

                yield ads_item
            except Exception as error:
                self.logger.warning("Couldn't yield ad from %s (Error: %s)" %
                                    (response.url, error))
            # Try to yield images.
            try:
                image_urls = response.xpath(
                    ".//section[@id='content1']//div[@class='listing_image']/img/@src"
                ).extract()
                if len(image_urls) > 0:
                    img_item = items.AdsImage(image_urls=[])
                    for img_url in image_urls:
                        img_item['image_urls'].append(
                            self.make_request(reqtype='image', url=img_url))
                    img_item['ads_id'] = ads_item['offer_id']
                    yield img_item
            except Exception as error:
                self.logger.warning(
                    "Couldn't yield ad images from %s (Error: %s)" %
                    (response.url, error))

        # Yield product ratings.
        # Note that the price is also available in ads.
        feedbacks = response.xpath(
            ".//section[@id='content2']//div[@class='feedback']")
        if feedbacks:
            for feedback in feedbacks:
                rating = items.ProductRating()
                rating["ads_id"] = ads_item["offer_id"]
                rating["submitted_on_string"] = feedback.xpath(
                    "div[@class='feedback_header']/span/text()").extract_first(
                        default="").strip()
                rating["submitted_on"] = self.parse_datetime(
                    rating["submitted_on_string"])
                rating['price_usd'] = feedback.xpath(
                    "div[@class='feedback_subheader']/div/span/text()[contains(., 'USD')]"
                ).extract_first()
                if rating['price_usd']:
                    rating['price_usd'] = rating['price_usd'].replace(
                        "~", "").replace("USD", "").replace(" ", "")
                rating_star = feedback.xpath(
                    "div[@class='feedback_subheader']//div[contains(@style,'img/star.png')]/@style"
                ).extract_first(default="")
                rating_star = re.search(r"width:(\d+)px;height", rating_star,
                                        re.M | re.S)
                if rating_star:
                    rating_star = float(rating_star.group(1))
                    rating['rating'] = rating_star / 120 * 5
                warning = feedback.xpath(
                    "div[@class='feedback_subheader']/div/span")
                if warning and len(warning) > 1:
                    rating['warnings'] = self.get_text(warning[0])
                rating["comment"] = self.get_text(feedback.xpath("p"))
                rating["submitted_by"] = feedback.xpath(
                    "div[@class='feedback_header']//span[@class='feedbackScore']/../text()"
                ).extract_first(default="").strip()
                rating["submitter_rating"] = self.get_text(
                    feedback.xpath(
                        "div[@class='feedback_header']//span[@class='feedbackScore']/sup"
                    ))
                rating["submitted_by_number_transactions"] = self.get_text(
                    feedback.xpath(
                        "div[@class='feedback_header']//span[@class='feedbackScore']/sub"
                    ))

                yield rating
            ads_item['fullurl'] = response.url
            parsed_url = urlparse(response.url)
            ads_item['relativeurl'] = "%s?%s" % (parsed_url.path,
                                                 (parsed_url.query))

            yield ads_item
        # NOTE: `listing_not_found` is not defined in this snippet; in the full
        # spider it is presumably extracted from the page before this check.
        elif listing_not_found == 'Listing not found':
            self.logger.warning('Listing not found at %s' % response.url)
        else:
            self.logger.warning('Unknown listing status %s' % response.url)

        ## ===================== IMAGES =====================
        images_url = response.css('img.productImage::attr(src)').extract()
        for url in images_url:
            img_item = items.AdsImage(image_urls=[])
            img_item['image_urls'].append(self.make_request('image', url=url))
            img_item['ads_id'] = ads_item['offer_id']
            yield img_item

        ## ===================== Product Ratings (feedback) =========
        rating_lines = response.css('.ratings table tr')
        for tr in rating_lines:
            try:
                rating_item = items.ProductRating()

                age = self.get_text(tr.css('td.age'))
                m = re.search('(\d+)d', age)
                if m:
                    days_offset = m.group(1)
                    # A sanity check. Dream has some dates which are in 1969 and 1970..
Example #5
    def parse_product(self, response):
        try:
            username = self.get_text(response.css('h4.media-heading a'))
            offer_id = re.search(r'/listings/[\w-]+/([\w-]+)$',
                                 response.url).group(1)
            title = self.get_text(response.css('h3.m-b-15'))
            ads_item = items.Ads()
            ads_item['vendor_username'] = username
            ads_item['offer_id'] = offer_id
            ads_item['title'] = title
            ads_item['relativeurl'] = self.get_relative_url(response.url)
            ads_item['fullurl'] = response.url
            prices = response.xpath(
                ".//div[@class='panel-footer text-center']").extract_first()
            ads_item['accepted_currencies'] = []
            price_usd = re.search("([0-9\.]*) USD", prices)
            price_xmr = re.search("([0-9\.]*) XMR", prices)
            price_btc = re.search("([0-9\.]*) BTC\n", prices)
            if price_usd:
                ads_item['price_usd'] = price_usd.group(1)
            if price_xmr:
                ads_item['price_xmr'] = price_xmr.group(1)
                ads_item['accepted_currencies'].append("xmr")
            if price_btc:
                ads_item['price_btc'] = price_btc.group(1)
                ads_item['accepted_currencies'].append("btc")

            dts = response.css("dl.dl-horizontal dt")
            for dt in dts:
                key = self.get_text(dt).lower()
                value = self.get_text(dt.xpath('following-sibling::dd[1]'))
                if key == 'sold':
                    ads_item['already_sold'] = re.search(r'(\d+)',
                                                         value).group(1)
                elif key == 'ships from':
                    ads_item['ships_from'] = value
                elif key == 'ships to':
                    ads_item['ships_to'] = value
                elif key == 'payment type':
                    ads_item['escrow'] = value
                    if 'multisig' in value.lower():
                        ads_item['multisig'] = True
                elif key == 'product type':
                    ads_item['category'] = value
                elif key in [
                        'sold by', 'trust rating', 'creation date',
                        'starts from'
                ]:
                    pass
                else:
                    self.logger.warning(
                        'New information found on user profile page: %s' % key)

            ads_item['shipping_options'] = []
            for option in response.css('select#shipping_method option'):
                ads_item['shipping_options'].append(self.get_text(option))

            ads_item['description'] = self.get_text(response.css('p.break-me'))

            yield ads_item
        except Exception as error:
            self.logger.warning("Failed to yield ads at %s because '%s'" %
                                (response.url, error))

        try:
            # Yield images in thumbnail.
            images_url = response.css('a.thumbnail img::attr(src)').extract()
            for url in images_url:
                if url:
                    ads_image = items.AdsImage(image_urls=[])
                    ads_image['ads_id'] = offer_id
                    ads_image['image_urls'].append(
                        self.make_request(reqtype='regular', url=url))
                    yield ads_image

            # Yield feature image.
            image_url = response.css(
                'img.featured-image::attr(src)').extract_first()

            if image_url:
                ads_image = items.AdsImage(image_urls=[])
                ads_image['ads_id'] = offer_id
                ads_image['image_urls'].append(
                    self.make_request(reqtype='regular', url=image_url))
                yield ads_image

        except Exception as error:
            self.logger.warning("Failed to yield images at %s because '%s'" %
                                (response.url, error))
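Example #5 and several later examples use get_relative_url(), and others rely on get_url_param(); neither helper is part of the snippets. A minimal sketch of plausible implementations built on Python's urllib.parse (hypothetical, including the choice to raise when a parameter is missing, which would explain the try/except around get_url_param in Example #6):

    # Assumes `from urllib.parse import urlparse, parse_qs` at module level.
    def get_relative_url(self, url):
        # Hypothetical: keep path and query, drop scheme and host.
        parsed = urlparse(url)
        return "%s?%s" % (parsed.path, parsed.query) if parsed.query else parsed.path

    def get_url_param(self, url, name):
        # Hypothetical: raises KeyError when the parameter is absent.
        return parse_qs(urlparse(url).query)[name][0]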
Example #6
    def parse_listing(self, response):
        ads = items.Ads()
        ads_img = items.AdsImage()

        listing_content = response.css("#content1")  # Tabs
        feedback_content = response.css("#content2")  # Tabs

        ads['title'] = self.get_text_first(response.css('.listing_right span'))
        #ads['offer_id']		= self.get_url_param(response.url, 'lid')
        try:
            ads['offer_id'] = self.get_url_param(response.url, 'lid')
        except Exception:
            self.logger.warning(
                "Ran into a URL parameter issue at URL: %s. Offer_ID is not recorded."
                % (response.url))
            ads['offer_id'] = ''
        ads['relativeurl'] = response.meta['relativeurl']
        ads['fullurl'] = self.make_url(ads['relativeurl'])
        user_url = response.css('.listing_right').xpath(
            './/a[contains(@href, "page=profile")]/@href').extract_first()
        # Some items don't have an associated vendor.
        try:
            ads['vendor_username'] = self.get_url_param(user_url, 'user')
        except:
            self.logger.warning(
                'No seller available at URL: %s. Seller is noted as \'\'. Inspect the URL post-crawl.'
                % (response.url))
            ads['vendor_username'] = ''

        ads['category'] = response.meta['category']

        multilisting_select = listing_content.css(
            'select[name="multilistingChild"]'
        )  # 2 types of Ads. Multilisting or not.

        if not multilisting_select:
            ads['multilisting'] = False
            listing_right_p = self.get_text(
                listing_content.css(".listing_right p"))
            m = re.search(
                r'\((\d+(\.\d+)?)\s*\xe0\xb8\xbf\)', listing_right_p
            )  # Search for the bitcoin icon; \xe0\xb8\xbf is the UTF-8 byte sequence for ฿ (the baht sign, used as the bitcoin symbol)
            m2 = re.search(r'([0-9.]{1,10}) \xe0\xb8\xbf', listing_right_p)
            if m:
                ads['price'] = m.group(1)
            # minor error handling in case the previous regex written by Pier-Yver doesn't catch bitcoin prices.
            elif m is None and m2 is not None:
                ads['price'] = m2.group(1)
                #self.logger.warning('Encountered an error with the old price-regex. Using RM\'s regex at URL: %s' % (response.url))
        else:
            ads['multilisting'] = True
            options = []
            # Just added @ below which should fix everything.
            for option in multilisting_select.xpath('.//option[@value!=""]'):
                options.append(self.get_text(option))

            ads['price'] = json.dumps(options)

        #Bunches of regex to parse the page.
        listing_right_html = self.get_text(
            listing_content.css('.listing_right').extract_first()
        )  # Read HTML. We need tags as separator.
        listing_right_span_text = self.get_text(
            listing_content.css('.listing_right span'))
        m = re.search('<b>shipping from\s*:\s*</b>\s*([^<]+)',
                      listing_right_html, re.IGNORECASE)
        if m:
            ads['ships_from'] = m.group(1)

        m = re.search('<b>shipping to\s*:\s*</b>\s*([^<]+)',
                      listing_right_html, re.IGNORECASE)
        if m:
            ads['ships_to'] = m.group(1)
        shipping_options = []
        for option in listing_content.css(
                '.listing_right form select[name="shipment"] option[value!=""]::text'
        ).extract():
            shipping_options.append(self.get_text(option))
        ads['shipping_options'] = json.dumps(shipping_options)
        ads['description'] = self.get_text(listing_content.xpath('./p'))
        stocks_possibilities = [
            'Excellent stock', 'Good stock', 'Low stock', 'Very low stock'
        ]
        for possibility in stocks_possibilities:
            if possibility in listing_right_span_text:
                ads['stock'] = possibility
                break

        yield ads

        # Ads Image.
        ads_img['ads_id'] = ads['offer_id']
        ads_img['image_urls'] = [
            self.make_request(
                'image',
                url=listing_content.css(
                    ".listing_image img::attr(src)").extract_first(),
                referer=response.url)
        ]
        yield ads_img

        # Handling listing feedbacks
        for feedback in feedback_content.css(".feedback"):
            try:
                rating = items.ProductRating()
                rating['ads_id'] = ads['offer_id']
                rating['comment'] = self.get_text(feedback.css('p'))
                #rating['submitted_by'] 	= self.get_text(feedback.css('.feedback_header span a'))
                try:
                    username = feedback.css('.feedback_header span a').xpath(
                        "./text()")[0].extract().strip()
                except:
                    username = ''
                    self.logger.warning(
                        'Found a review with no username. URL: %s' %
                        response.url)
                rating['submitted_on'] = self.parse_timestr(
                    self.get_text(
                        feedback.css('.feedback_header').xpath(
                            'span/text()').extract_first()))
                rating['submitted_by'] = username
                #star_styles = feedback.css('.feedback_subheader').xpath('./div[1]/@style').extract_first()
                star_styles = feedback.css('.feedback_subheader').xpath(
                    './div/div')[0].extract()
                m = re.search(r'width:(\d+)px', star_styles)
                if m:
                    width = int(m.group(1))
                    rating['rating'] = '%d/5' % (width // 24
                                                 )  # One star is 24 px wide
                else:
                    self.logger.warning('Cannot find product rating score.')
                yield rating
            except Exception as e:
                self.logger.warning(
                    'Could not get listing feedback at %s. Error %s' %
                    (response.url, e))

        # If there are several pages of feedback, feedback_buffer_middleware will buffer them until we have them all and then send them further down the pipeline.
        for url in feedback_content.css(
                'div.pagination a::attr(href)').extract():
            if self.get_url_param(url, 'pg') != '1':
                yield self.make_request(
                    'listing',
                    url=url,
                    relativeurl=response.meta['relativeurl'],
                    ads_id=ads['offer_id'],
                    category=response.meta['category'])
        # If statement to avoid requesting vendors pages when there is no vendor associated with an item.
        if ads['vendor_username'] != '':
            yield self.make_request('userprofile', url=user_url)
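Both feedback parsers in this file recover a numeric rating from the pixel width of a star bar: Example #3 divides the width by 120 px (five stars) and multiplies by 5, while the parser above assumes one star is 24 px wide. A small standalone check of that arithmetic (the widths come from the examples' own comments; the function name is made up for illustration):

import re

def rating_from_star_width(style, px_per_five_stars=120):
    # e.g. "width:96px;height:24px" -> 96 / 120 * 5 = 4.0 stars
    m = re.search(r"width:(\d+)px", style)
    if not m:
        return None
    return float(m.group(1)) / px_per_five_stars * 5

assert rating_from_star_width("width:96px;height:24px") == 4.0
assert rating_from_star_width("width:120px;height:24px") == 5.0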
    def parse_offer(self, response):
        ads = items.Ads()
        ads['offer_id'] = self.get_offer_id_from_url(response.url)

        layout = 'unknown'
        info_block = response.xpath(
            '//h1[text()="Info"]/..'
        )  # Two known layouts: try the first, fall back on the second.

        if len(info_block) == 1:
            layout = 'with_headings'
        else:
            layout = 'without_headings'

        if layout == 'without_headings':
            info_block = response.xpath(
                '//h1[contains(@class, "fheading")]/..')

        ads['title'] = self.get_text(response.css('h1.fheading'))
        ads['vendor_username'] = self.get_text(
            info_block.xpath('.//a[contains(@href, "profile")]'))
        if 'category' in response.meta and response.meta[
                'category'] is not None:
            ads['category'] = response.meta['category']
        else:
            ads['category'] = None

        ads['fullurl'] = response.url.replace('/refund', '')
        ads['relativeurl'] = "/offer/%s" % ads['offer_id']

        # =====  Info block 1 - Ships from/to, escrow, multisig, etc ==========
        # We determine the type of info by the icon in front of it; this is the most reliable way, as the layout changes freely between listings.

        # Both layouts keep this information in the first <p> of the info block.
        p = info_block.xpath('./p[1]')

        for line in p.extract_first().split('<br>'):
            linesel = scrapy.Selector(text=line)
            line_txt = self.get_text(linesel)

            if len(linesel.css(".ion-log-out")) > 0:  # Ships From icon
                m = re.search('ships from:(.+)', line_txt, re.IGNORECASE)
                if m:
                    ads['ships_from'] = self.get_text(m.group(1))
            elif len(linesel.css(".ion-log-in")) > 0:  # Ships To icon
                m = re.search('only ships to certain countries\s*\(([^\)]+)\)',
                              line_txt, re.IGNORECASE)
                if m:
                    ads['ships_to'] = json.dumps([
                        self.get_text(x.upper()) for x in m.group(1).split(',')
                    ])
                elif 'Worldwide' in line_txt:
                    ads['ships_to'] = 'Worldwide'
                    m = re.search('with Exceptions\s*\(([^\)]+)\)', line_txt,
                                  re.IGNORECASE)
                    if m:
                        ads['ships_to_except'] = json.dumps([
                            self.get_text(x.upper())
                            for x in m.group(1).split(',')
                        ])
                else:
                    self.logger.warning(
                        "New format of 'ships_to' string  (%s) at %s" %
                        (line_txt, response.url))
            elif len(linesel.css(
                    ".ion-android-share-alt")) > 0:  # Properties icons
                if line_txt:
                    line_txt = line_txt.lower()
                    ads['multisig'] = True if 'multisig' in line_txt else False
                    ads['escrow'] = True if 'escrow' in line_txt else False
            elif len(linesel.css(
                    ".ion-android-checkmark-circle")) > 0:  # Auto Accept icon
                if line_txt:
                    line_txt = line_txt.lower()
                    ads['auto_accept'] = True if 'auto-accept' in line_txt else False
            elif len(linesel.css(
                    ".ion-ios-monitor-outline")) > 0:  # Digital Good icon
                pass
            else:
                icontype = linesel.css('.ionicons')
                if icontype:
                    iconclass = icontype[0].xpath('@class').extract_first()
                    self.logger.warning(
                        'Unhandled information available with icon of type (%s) in offer page at %s'
                        % (iconclass, response.url))
        # =========================================

        ## ============= Prices Options =======
        price_opt_table = response.xpath(
            ".//h4[contains(text(), 'Prices')]/../table")
        options = []
        for line in price_opt_table.css('tbody tr'):
            option = {}
            option['amount'] = self.get_text(line.css('td:nth-child(1)'))
            option['price_btc'] = self.get_text(line.css('td:nth-child(3)'))
            options.append(option)

        if len(options) > 0:
            ads['price_options'] = json.dumps(options)
            if len(options) == 1:
                m = re.search('(\d+(\.\d+)?) BTC.+', options[0]['price_btc'])
                if m:
                    ads['price'] = m.group(1)

        ## ==============

        ## ============ Shipping Options ========
        shipping_opt_table = response.xpath(
            ".//h4[contains(text(), 'Shipping Options')]/../table")
        options = []
        for line in shipping_opt_table.css('tbody tr'):
            option = {}
            option['name'] = self.get_text(line.css('td:nth-child(1)'))
            amount_raw = line.css('td:nth-child(2)').extract_first()
            amount_raw = amount_raw.replace(
                '<i class="ionicons ion-ios-infinite"></i>', 'inf')  # Infinity
            option['amount'] = self.get_text(scrapy.Selector(text=amount_raw))
            option['price_btc'] = self.get_text(
                line.css('td:nth-child(4)')).replace(' BTC', '')
            options.append(option)

        if len(options) > 0:
            ads['shipping_options'] = json.dumps(options)
        ## =====================

        # ===================   Info block 2. List of key/value with key in bold.
        if layout == 'with_headings':
            p = response.xpath(
                './/h4[contains(text(), "Information")]/..').extract_first()
            if p is None:  # BUG P IS NONE
                self.logger.warning(
                    "Invalid layout, could not find h4 element with text 'Information' on url "
                    + response.url)
                p = ""
            p = re.sub('<h4>[^<]+</h4>', '', p)
        elif layout == 'without_headings':
            p = info_block.xpath('./p[2]').extract_first()

        for line in p.split('<br>'):
            line_txt = self.get_text(scrapy.Selector(text=line))
            known = False
            m = re.search('minimum amount per order:?\s*(.+)', line_txt,
                          re.IGNORECASE)
            if m:
                ads['minimum_order'] = m.group(1)
                known = True

            m = re.search('maximum amount per order:?\s*(.+)', line_txt,
                          re.IGNORECASE)
            if m:
                ads['maximum_order'] = m.group(1)
                known = True

            m = re.search('views:?\s*(.+)', line_txt, re.IGNORECASE)
            if m:
                ads['views'] = m.group(1)
                known = True

            m = re.search('Quantity in stock:?\s*(.+)', line_txt,
                          re.IGNORECASE)
            if m:
                ads['stock'] = m.group(1)
                known = True

            m = re.search('Already sold:?\s*(.+)', line_txt, re.IGNORECASE)
            if m:
                ads['already_sold'] = m.group(1)
                known = True

            m = re.search('Country:?\s*(.+)', line_txt, re.IGNORECASE)
            if m:
                ads['country'] = m.group(1)
                known = True

            m = re.search('Replace-Time:?\s*(.+)', line_txt, re.IGNORECASE)
            if m:
                ads['replace_time'] = m.group(1)
                known = True

            m = re.search('Category', line_txt, re.IGNORECASE)
            if m:
                known = True
                if ads['category'] is None:
                    splitted_html = re.sub('\s*<i[^\>]+>\s*</i>\s*', '/', line)
                    line_txt2 = self.get_text(
                        scrapy.Selector(text=splitted_html))

                    m = re.search('Category:\s*(.+)\s*', line_txt2,
                                  re.IGNORECASE)
                    if m:
                        ads['category'] = m.group(1)
                        known = True

            if not known:
                self.logger.warning(
                    'Unknown information type (%s) in ads at %s' %
                    (line_txt, response.url))
        if response.url.endswith('refund'):
            ads['terms_and_conditions'] = self.get_text(
                response.css("#tabcontent"))
        else:
            #ads['description']				= self.get_text(response.css("#tabcontent"));
            ads['description'] = self.get_text(response.css("#tabcontent"))
            yield self.make_request('offer-refund',
                                    url=response.url + '/refund',
                                    category=ads['category'])
        yield ads
        #=================================================

        #if response.url.endswith('refund'):
        #	ads['terms_and_conditions']		= self.get_text(response.css("#tabcontent"))
        #else:
        #	#ads['description']				= self.get_text(response.css("#tabcontent"));
        #	ads['description']				= self.get_text(response.css("#tabcontent"))
        #	yield self.make_request('offer-refund', url=response.url + '/refund', category=ads['category'])

        ## ===================== IMAGES =====================
        images_url = response.css('img.img-thumbnail::attr(src)').extract()
        for url in images_url:
            if url:
                img_item = items.AdsImage(image_urls=[])
                img_item['image_urls'].append(
                    self.make_request('image', url=url)
                )  # Need Scrapy > 1.4.0 for this to work (base64 url encoded data).
                img_item['ads_id'] = ads['offer_id']
                yield img_item
        ## ============================

        ## ========== Feedbacks =====

        feedback_table = response.xpath(
            './/h3[contains(text(), "Feedback")]/../table')

        for line in feedback_table.css('tbody tr'):
            try:
                rating = items.ProductRating()
                score = self.get_text(line.css('td:nth-child(1) .text-muted'))
                m = re.search('\((\d+(.\d+)?)\)', score)
                if not m:
                    self.logger.warning('Cannot read feedback score %s' %
                                        score)
                    continue

                rating['rating'] = "%s/5" % m.group(1)
                #rating['comment'] 		= self.get_text(line.css('td:nth-child(2)'))
                comment = line.xpath('./td[2]/text()').extract_first()
                if comment is None:
                    rating['comment'] = ''
                    self.logger.warning(
                        "Couldn't find the review. Inserting an empty string at URL: %s"
                        % response.url)
                else:
                    rating['comment'] = comment.strip()
                rating['ads_id'] = ads['offer_id']
                rating['submitted_by'] = self.get_text(
                    line.css('td:nth-child(3)'))
                rating['submitted_on'] = self.parse_timestr(
                    self.get_text(line.css('td:nth-child(4)')))
                yield rating
            except Exception as e:
                self.logger.warning(
                    "Could not get product feedback. Error : %s" % e)
	def parse_listing(self, response):
		try:
			ads = items.Ads()
			lis = response.css('.container ol li')
			title_cat = [self.get_text(li) for li in lis]
			if (len(title_cat) < 1):
				raise WarningException("Cannot determine title of listing.")

			m = re.search('listing\/(\d+)', response.url)
			if not m:
				raise WarningException('Cannot find listing ID')

			ads['offer_id'] = m.group(1)
			ads['relativeurl'] = '/listing/%s' % ads['offer_id']
			ads['fullurl'] = self.make_url(ads['relativeurl'])		
			ads['title'] = title_cat[-1]  # Last item
			ads['category'] = '/'.join(title_cat[:-1])  # Everything except the last item
			ads['price'] = self.get_text(response.css(".listing-price .fa-btc").xpath('..'))

			lines  = response.css(".container form table").xpath('.//td[contains(text(), "Vendor")]/../../tr')
			for line in lines:
				tds = line.css('td')

				if len(tds) != 2:
					raise WarningException("Listing property table line does not have two cells.")

				prop = self.get_text(tds[0]).lower()
				val = tds[1]
				if prop == 'vendor':
					vendor_link = val.xpath('a[contains(@href, "vendor")]')
					yield self.make_request('userprofile', url=vendor_link.xpath('@href').extract_first())
					ads['vendor_username'] = self.get_text(vendor_link)
				elif prop == 'class':
					ads['ads_class'] = self.get_text(val)
				elif prop == 'ships from':
					ads['ships_from'] = self.get_text(val)
				elif prop == 'ships to':
					ads['ships_to'] = self.get_text(val)
				elif prop == 'except':
					ads['ships_to_except'] = self.get_text(val)
				elif prop == 'delivery':
					ads['shipping_options'] = json.dumps([self.get_text(val)])
				else:
					self.logger.warning("New property found : %s. Listing at %s" % (prop, response.url))

			shipping_options_elements = response.css('select[name="shippingID"] option:not([value="0"])')
			shipping_options = []
			for element in shipping_options_elements:
				shipping_options.append(self.get_text(element))

			if len(shipping_options) > 0:
				ads['shipping_options'] = json.dumps(shipping_options)
					
			ads['description'] 			= self.get_presentation_text(response, 'Details')
			ads['terms_and_conditions'] = self.get_presentation_text(response, 'Terms & Conditions')

			ads['in_stock'] = True if len(response.css(".listing-stock .label-success")) > 0 else False
			stock_text = self.get_text(response.css(".listing-stock .label-success")).lower()
			m = re.search('(\d+) in stock', stock_text)
			if m:
				ads['stock'] = m.group(1)

			yield ads


			## ===================== IMAGES =====================
			images_url = response.css('img.img-thumbnail::attr(src)').extract()
			for url in images_url:
				if url:
					img_item = items.AdsImage(image_urls = [])
					img_item['image_urls'].append(self.make_request('image', url=url))	# Need Scrapy > 1.4.0 for this to work (base64 url encoded data).
					img_item['ads_id'] = ads['offer_id']
					yield img_item

			#self.dao.flush(dbmodels.AdsImage)
			## =========


			presentations = response.css("li[role='presentation'] a")
			for presentation in presentations:
				name = self.get_text(presentation).lower()
				link = presentation.xpath('@href').extract_first()
				if name in ['details', 'terms & conditions'] : 
					yield self.make_request('listing', url=link)
				elif name == 'feedback':
					yield self.make_request('listing_feedback', url=link, listing_id=ads['offer_id'])
				else:
					self.logger.warning('Encountered an unknown tab %s. Listing at %s' % (name, response.url))

		except WarningException as e:
			self.logger.warning("Cannot parse listing.  %s" % e) 
		except:
			raise
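The parse_listing above raises and catches a WarningException that is not defined in the snippet; presumably it is a small Exception subclass used to downgrade recoverable parsing problems to log warnings. A minimal sketch under that assumption:

class WarningException(Exception):
    """Raised for recoverable parse failures that should only be logged."""
    pass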
    def parse_ads(self, response):
        title = response.xpath(".//div[@id='main']/h1/text()").extract_first()
        if title is None and response.xpath(
                ".//div[contains(text(), 'Produkten finns inte.')]"):
            self.logger.warning(
                "Found what is likely an empty page at %s. Flugsvamp writes: %s"
                %
                (response.url,
                 response.xpath(
                     ".//div[contains(text(), 'Produkten finns inte.')]/text()"
                 ).extract_first().strip()))
        else:
            ads_item = items.Ads()
            user_item = items.User()
            ads_item['title'] = title
            ads_item['offer_id'] = response.url.split("=")[-1]
            ads_item['fullurl'] = response.url
            ads_item['relativeurl'] = self.get_relative_url(response.url)

            # The description has no container of its own, so take the parent div
            # of the "Beskrivning:" ("Description:") label and strip out the label text.
            description = self.get_text(
                response.xpath(
                    '//strong[contains(text(), "Beskrivning:")]/parent::div')
            ).replace('Beskrivning:', '')
            if description:
                ads_item['description'] = description
            try:
                keys = response.xpath(".//div[@class='lightrow']")
                for key_ele in keys:
                    key = key_ele.xpath("strong/text()").extract_first()
                    if key is None:
                        continue
                    key = key.lower()
                    if "omd" in key:
                        value = key_ele.xpath(
                            './/span[@class="grey"]/text()').extract_first()
                        m = re.search('(.*?)\ \((.*?)\ omd', value,
                                      re.M | re.I | re.S)
                        if m:
                            ads_item['product_rating'] = m.group(1)
                            ads_item['already_sold'] = m.group(2)
                    elif "ljare" in key:
                        ads_item['vendor_username'] = key_ele.xpath(
                            './/a/text()').extract_first()
                        user_item['username'] = ads_item['vendor_username']
                        user_item['relativeurl'] = key_ele.xpath(
                            './/a/@href').extract_first()
                        user_item['fullurl'] = response.urljoin(
                            user_item['relativeurl'])
                        value = key_ele.xpath(
                            './/span[@class="grey"]/text()').extract_first()
                        m = re.search('(.*?)\ \((.*?)\ omd', value,
                                      re.M | re.I | re.S)
                        if m:
                            user_item['average_rating'] = m.group(1)
                            user_item['feedback_received'] = m.group(2)
                    elif key == "kategori:":
                        ads_item['category'] = key_ele.xpath(
                            './/a/text()').extract_first()
                    elif key == "kvantitet:":
                        ads_item['quantity'] = self.get_text(
                            key_ele.xpath('span[@class="float-right"]'))
                    elif key == "ditt pris inkl. frakt:":
                        value = self.get_text(
                            key_ele.xpath('.//span[@class="float-right"]'))
                        m = re.search('(.*?)\ \((.*?)\)', value,
                                      re.M | re.I | re.S)
                        if m:
                            ads_item['price_btc'] = m.group(2)
                    elif key == "pristabell:":
                        price_options = []
                        priceList = key_ele.xpath(
                            './/span[@class="float-right"]').extract_first(
                            ).split('<br>')
                        for list_item in priceList:
                            linesel = scrapy.Selector(text=list_item)
                            line_txt = self.get_text(linesel)
                            price_options.append(line_txt)
                        if len(price_options) > 0:
                            ads_item['price_options'] = price_options
                    else:
                        self.logger.warning(
                            "Found a new piece of product information, '%s', at %s"
                            % (key, response.url))
                yield ads_item
                yield user_item
            except Exception as error:
                self.logger.warning(
                    "Failed to parse listing (Error: '%s'). See URL %s" %
                    (error, response.url))

        # ===================== IMAGES =====================
        images_url = response.css('img.float-right::attr(src)').extract()
        for url in images_url:
            if url:
                img_item = items.AdsImage(image_urls=[])
                img_item['image_urls'].append(
                    self.make_request(reqtype='image',
                                      url=url,
                                      headers=self.tor_browser))
                img_item['ads_id'] = ads_item['offer_id']
                yield img_item
    def parse_product(self, response):
        ads_id = re.search('code=([\w]+)', response.url).group(1)
        try:
            ads_item = items.Ads()
            ads_item['offer_id'] = ads_id
            ads_item['title'] = self.get_text(
                response.xpath(".//div[@class='col-md-8']/h2"))
            ads_item['relativeurl'] = self.get_relative_url(response.url)
            ads_item['fullurl'] = response.url
            ads_item['accepted_currencies'] = response.css(
                '.well input[name="currency"]::attr(value)').extract()
            ads_item['description'] = self.get_text(
                response.css('ul.nav-tabs').xpath('following-sibling::p'))

            in_stock_match = re.search(
                '(\d+)',
                response.css('.listing-stock span::text').extract_first())
            if in_stock_match:
                ads_item['in_stock'] = in_stock_match.group(1)
            ads_item['shipping_options'] = []
            for option in response.css(
                    'select[name="shipping_option"] option'):
                ads_item['shipping_options'].append(self.get_text(option))
            item_sold_match = re.search(
                '(\d+) items sold since ([\d\-: ]+)',
                self.get_text(
                    response.css('table.table-condensed tr:last-child td')))
            if item_sold_match:
                ads_item['already_sold'] = item_sold_match.group(1)

            trs = response.css('div.col-sm-7 table.table-condensed tr')
            for tr in trs:
                tds = tr.css('td')
                if len(tds) == 2:
                    key = self.get_text(tds[0]).lower()
                    value = self.get_text(tds[1])
                    if key == 'vendor':
                        ads_item['vendor_username'] = tds.xpath(
                            ".//a/text()").extract_first()
                    elif key == 'class':
                        ads_item['ads_class'] = value
                    elif key == 'escrow type':
                        ads_item['escrow'] = value
                    elif key == 'ships from':
                        ads_item['ships_from'] = value
                    else:
                        self.logger.warning(
                            'New information found on product page : %s' % key)

            prices = self.get_text(
                response.xpath(".//div[@class='listing-price']"))
            price_eur = re.search("([0-9\.]*) EUR", prices)
            price_usd = re.search("([0-9\.]*) USD", prices)
            if price_usd:
                ads_item['price_usd'] = price_usd.group(1)
            if price_eur:
                ads_item['price_eur'] = price_eur.group(1)
            price_btc = response.xpath(
                ".//div[@class='listing-price']/span").extract_first()
            if price_btc and 'btc' in price_btc:
                ads_item['price_btc'] = self.get_text(
                    response.xpath(".//div[@class='listing-price']/span"))
            else:
                self.logger.warning(
                    "Couldn't match BTC price. There might be another currency available. Please inspect %s"
                    % response.url)

            yield ads_item
        except Exception as error:
            self.logger.warning("Failed to yield ads at %s because '%s'" %
                                (response.url, error))

        try:
            image_url = response.css(
                'div.index-img img::attr(src)').extract_first()
            if image_url:
                ads_image = items.AdsImage(image_urls=[])
                ads_image['ads_id'] = ads_id
                ads_image['image_urls'].append(
                    self.make_request('image', url=self.make_url(image_url)))
                yield ads_image
        except Exception as error:
            self.logger.warning("Failed to yield ad image at %s because '%s'" %
                                (response.url, error))
    def parse_listing(self, response):
        try:
            ads_item = items.Ads()
            ads_item["offer_id"] = re.search(r"ls_id=(\d+)", response.url,
                                             re.M | re.I).group(1)
            ads_item["vendor_username"] = self.get_text(
                response.xpath("//small/a[contains(@href,'user.php?u_id=')]"))
            ads_item["vendor_username"] = ads_item["vendor_username"].split(
                "(")[0].strip()
            ads_item["fullurl"] = response.url.split("&")[0]
            ads_item["relativeurl"] = self.get_relative_url(
                ads_item["fullurl"])
            ads_item["title"] = response.xpath(
                ".//div[@class='col-sm-12']/a[contains(@href, 'ls_id')]/text()"
            ).extract_first()
            ads_item["ships_to"] = self.get_text(
                response.xpath(
                    "//small//b[contains(text(),'Ship To :')]/ancestor::small")
            ).replace("Ship To :", "").strip()
            if ads_item["ships_to"] == "":
                #self.logger.warning("Fallback to other shipping to field at %s." % response.url)
                ads_item["ships_to"] = self.get_text(
                    response.xpath(
                        "//small//b[contains(text(),'Ship To :')]/ancestor::small/following-sibling::small[1]"
                    ))
            ads_item["ships_from"] = self.get_text(
                response.xpath(
                    "//small//b[contains(text(),'Origin Country :')]/ancestor::small"
                )).replace("Origin Country :", "").strip()
            ads_item["ads_class"] = self.get_text(
                response.xpath(
                    "//small//b[contains(text(),'Product class :')]/ancestor::small"
                )).replace("Product class :", "").strip()
            ads_item["quantity"] = self.get_text(
                response.xpath(
                    "//small//b[contains(text(),'Quantity :')]/ancestor::small"
                )).replace("Quantity :", "").strip()

            accepted_currencies = []
            sale_price = self.get_text(
                response.xpath(
                    "//form//span[contains(text(),'Sale Price :')]")).replace(
                        "Sale Price :", "").strip()
            if "USD" in sale_price:
                ads_item["price_usd"] = re.search(r"([\d\.]+)\s*USD",
                                                  sale_price, re.M | re.I)
                ads_item["price_usd"] = ads_item["price_usd"].group(
                    1) if ads_item["price_usd"] else None
            if "BTC" in sale_price:
                ads_item["price_btc"] = re.search(r"([\d\.]+)\s*BTC",
                                                  sale_price, re.M | re.I)
                ads_item["price_btc"] = ads_item["price_btc"].group(
                    1) if ads_item["price_btc"] else None
                accepted_currencies.append("BTC")
            ads_item["accepted_currencies"] = ",".join(accepted_currencies)
            ads_item["shipping_options"] = self.get_shipping_options(response)

            # new fields
            ads_item["escrow"] = self.get_text(
                response.xpath(
                    "//small//b[contains(text(),'Payment :')]/ancestor::small")
            ).replace("Payment :", "").strip()
            active_tab = self.get_text(
                response.xpath(
                    "//ul[@class='nav nav-tabs']/li[@class='active']/a"))

            if "Product Description" in active_tab:
                ads_item['description'] = self.get_text(
                    response.xpath("//div[@class='tab-content']"))
            elif "Refund Policy" in active_tab:
                ads_item['refund_policy'] = self.get_text(
                    response.xpath("//div[@class='tab-content']"))
            elif "Product Tags" in active_tab:
                pass
            elif "Feedback" in active_tab:
                feedbacks = response.xpath(
                    "//div[@class='tab-content']//table/tbody/tr")
                if feedbacks:
                    for feedback in feedbacks:
                        rating = items.ProductRating()
                        rating["ads_id"] = ads_item["offer_id"]
                        rating["submitted_by"] = self.get_text(
                            feedback.xpath("td[3]/small"))
                        rating["submitted_on_string"] = self.get_text(
                            feedback.xpath("td[5]/small")).replace(
                                "View Item", "").strip()
                        rating["submitted_on"] = self.parse_datetime(
                            rating["submitted_on_string"])
                        rating["comment"] = self.get_text(
                            feedback.xpath("td[2]/small"))
                        rating["price_usd"] = self.get_text(
                            feedback.xpath("td[4]/small"))
                        # new fields
                        score = self.get_text(feedback.xpath("td[1]"))
                        if score == "\xe2\x98\x91":  # UTF-8 bytes for ☑
                            rating["rating"] = "Positive"
                        elif score == "\xe2\x98\x92":  # UTF-8 bytes for ☒
                            rating["rating"] = "Negative"
                        elif score == "\xe2\x98\x90":  # UTF-8 bytes for ☐
                            rating["rating"] = "Neutral"
                        else:
                            self.logger.warning(
                                "Unknown rating type '%s' at %s" %
                                (score, response.url))
                        yield rating
            else:
                self.logger.warning("Unknown tab: %s at %s" %
                                    (active_tab, response.url))
            yield ads_item
        except Exception as error:
            self.logger.warning("Couldn't yield Ad (Error %s) at %s." %
                                (error, response.url))

        if self.is_listing_tab_page(response) is False:
            #     self.requests_from_listing_page(response)
            image_urls = response.xpath(
                "//img[@class='pull-left']/@src").extract()
            if len(image_urls) > 0:
                img_item = items.AdsImage(image_urls=[])
                for img_url in image_urls:
                    # e.g. uploads/9bc5f18d5667081890e8972def13da2f_100_100.png
                    #      -> uploads/9bc5f18d5667081890e8972def13da2f.png
                    img_url = re.sub(r"_\d+_\d+\.", ".", img_url)
                    img_item['image_urls'].append(
                        self.make_request(reqtype='image', url=img_url))
                img_item['ads_id'] = ads_item['offer_id']
                yield img_item
 def parse_listing(self, response):
     # The ad.
     ads_item = items.Ads()
     ads_item["offer_id"] = re.search(r"/item/([^/]+)", response.url,
                                      re.M | re.I)
     if ads_item["offer_id"]:
         ads_item["offer_id"] = ads_item["offer_id"].group(1)
     else:
         self.logger.warning("offer_id is None at %s" % response.url)
         return
     ads_item["vendor_username"] = re.search(r"/user/([^/]+)", response.url,
                                             re.M | re.I)
     if ads_item["vendor_username"]:
         ads_item["vendor_username"] = ads_item["vendor_username"].group(1)
     ads_item["fullurl"] = response.url.split(
         ads_item["offer_id"])[0] + ads_item["offer_id"]
     ads_item["relativeurl"] = self.get_relative_url(ads_item["fullurl"])
     ads_item["title"] = "".join(
         response.xpath(
             ".//div[@class='ui segment inverted t-item-image secondary']/h3/text()"
         ).extract()).strip()
     ads_item["description"] = self.get_text(
         response.xpath(
             ".//div[@class='ui segment']/h3[contains(text(),'About')]/following-sibling::div"
         ))
     ads_item["shipping_options"] = self.get_shipping_options(response)
     ads_item["product_rating"] = response.xpath(
         ".//div[@class='ui segment inverted t-item-image secondary']/h3//i[@class='icon thumbs up']/following-sibling::span/text()"
     ).extract_first(default="").strip()
     yield ads_item
     # The images.
     image_urls = response.xpath(
         ".//div[@class='ui segment inverted t-item-image secondary']/img/@src"
     ).extract()
     if len(image_urls) > 0:
         img_item = items.AdsImage(image_urls=[])
         for img_url in image_urls:
             img_item['image_urls'].append(
                 self.make_request(reqtype='image', url=img_url))
         img_item['ads_id'] = ads_item['offer_id']
         yield img_item
     # The reviews.
     feedbacks = response.xpath(
         ".//div[@class='ui segment']/h3[contains(text(),'Reviews')]/following-sibling::div[@class='ui comments']/div[@class='comment']"
     )
     if feedbacks:
         for feedback in feedbacks:
             rating = items.ProductRating()
             rating["ads_id"] = ads_item["offer_id"]
             rating["submitted_by"] = feedback.xpath(
                 ".//a[@class='author']/text()").extract_first(
                     default="").strip().replace("@", "")
             rating["submitted_on_string"] = feedback.xpath(
                 ".//span[@class='date']/text()").extract_first(
                     default="").strip()
             rating["submitted_on"] = self.parse_datetime(
                 rating["submitted_on_string"])
             rating["comment"] = self.get_text(
                 feedback.xpath(".//pre[@class='text']"))
             rating["rating"] = feedback.xpath(
                 ".//i[@class='icon thumbs up']/following-sibling::span/text()"
             ).extract_first(default="").strip()
             yield rating
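Several of these spiders convert a scraped date string with parse_datetime() or parse_timestr(); the implementations are not shown. A hedged sketch using python-dateutil, assuming the scraped strings are absolute timestamps that dateutil can parse (real spiders likely need per-market formats or relative-date handling):

from dateutil import parser as dateutil_parser

def parse_datetime(self, timestr):
    # Hypothetical helper: return a datetime, or None if the string is
    # missing or unparseable.
    if not timestr:
        return None
    try:
        return dateutil_parser.parse(timestr)
    except (ValueError, OverflowError):
        return None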
Example #13
    def parse_product(self, response):
        ads = items.Ads()

        if 'username' in response.meta:
            username = response.meta['username']
        else:
            username = self.get_username_from_header(response)

        ads['offer_id'] = self.get_product_id_from_url(response.url)
        ads['relativeurl'] = 'product/%s' % ads['offer_id']
        ads['fullurl'] = self.make_url(ads['relativeurl'])
        ads['vendor_username'] = username

        if 'category' in response.meta and response.meta['category'] != None:
            ads['category'] = response.meta['category']

        ads['title'] = self.get_text_first(response.css('.container h1'))
        ads['description'] = self.get_text(
            response.css('article.internal-product-desc'))

        ads['escrow'] = False
        ads['in_stock'] = False
        price_options = []
        # For each table line.
        for tr in response.css('.internal-product-varieties table tr'):
            qty = self.get_text(tr.xpath('./td[1]'))
            for btn in tr.css(
                    '.btn'
            ):  # Many options per line possible, separated by a Buy button
                option = {}
                option['qty'] = qty
                btn_txt = self.get_text(btn)
                btn_text_lower = btn_txt.lower()
                if 'escrow' in btn_text_lower:
                    option['method'] = 'escrow'
                    ads['in_stock'] = True
                elif 'direct-pay' in btn_text_lower:
                    option['method'] = 'direct-pay'
                    ads['in_stock'] = True
                elif 'out of stock' in btn_text_lower:
                    option['method'] = 'out of stock'
                else:
                    option['method'] = ''
                    self.logger.warning(
                        'Unknown price payment method from string "%s" at %s' %
                        (btn_text_lower, response.url))

                # Scan the row's cells to find the price that precedes the actual button
                last_price_seen = ''
                for td in tr.css('td'):
                    td_txt = self.get_text(td)
                    m = re.search('BTC\s*(\d+(.\d+)?)', td_txt, re.IGNORECASE)
                    if m:
                        last_price_seen = m.group(
                            1
                        )  # We found a price in bitcoin, if we encounter the actual button, this price will be the right one

                    btn_in_cell = td.css('.btn')
                    if btn_in_cell:
                        if self.get_text(btn_in_cell) == btn_txt:
                            option['price'] = last_price_seen
                            break

                if 'price' not in option:
                    self.logger.warning(
                        'Could not find price for product at %s' %
                        response.url)
                    option['price'] = ''

                if option['method'] == 'escrow':
                    ads['escrow'] = True

                price_options.append(option)

        ads['price_options'] = json.dumps(price_options)

        if len(price_options) == 1:
            ads['price'] = price_options[0]['price']

        ads_img = items.AdsImage()
        ads_img['ads_id'] = ads['offer_id']
        img_src = response.css('.main-content').xpath(
            './/img[contains(@src, "product_images")]/@src').extract_first()
        if img_src:
            ads_img['image_urls'] = [self.make_request('image', url=img_src)]
        yield ads_img

        yield ads
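All of these examples fill items.Ads, items.AdsImage, items.ProductRating and items.User with dictionary-style access, which in Scrapy means Item subclasses whose fields are declared with scrapy.Field(). A minimal sketch of such an items module (the field list is abridged to names used above and is an assumption, not the real project's items.py):

import scrapy

class Ads(scrapy.Item):
    offer_id = scrapy.Field()
    title = scrapy.Field()
    vendor_username = scrapy.Field()
    category = scrapy.Field()
    relativeurl = scrapy.Field()
    fullurl = scrapy.Field()
    price_btc = scrapy.Field()
    description = scrapy.Field()
    # ...plus the other market-specific fields referenced above.

class AdsImage(scrapy.Item):
    ads_id = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()  # Filled in by the images pipeline.

class ProductRating(scrapy.Item):
    ads_id = scrapy.Field()
    rating = scrapy.Field()
    comment = scrapy.Field()
    submitted_by = scrapy.Field()
    submitted_on = scrapy.Field()
    submitted_on_string = scrapy.Field()
    price_usd = scrapy.Field()

class User(scrapy.Item):
    username = scrapy.Field()
    relativeurl = scrapy.Field()
    fullurl = scrapy.Field()
    average_rating = scrapy.Field()
    feedback_received = scrapy.Field()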