Ejemplo n.º 1
0
 def __init__(self, response):
     """Extract microdata from *response*, retrying with latin-1 on decode failure.

     The extracted items are stored on ``self.data``.
     """
     mde = MicrodataExtractor()
     try:
         self.data = mde.extract(response.body, response.url)
     except UnicodeDecodeError:
         # The raw body is not valid UTF-8; latin-1 maps every byte, so this
         # fallback cannot fail to decode.  (Original used a bare `except:`,
         # which hid unrelated errors.)
         self.data = mde.extract(response.body.decode('latin-1'),
                                 response.url)
Ejemplo n.º 2
0
    def test_w3c_5_2(self):
        """Non-nested strict extraction matches the W3C 5.2 flat fixture."""
        html = get_testdata('w3c', 'microdata.5.2.html')
        wanted = json.loads(
            get_testdata('w3c', 'microdata.5.2.flat.json').decode('UTF-8'))
        extractor = MicrodataExtractor(nested=False, strict=True)
        self.assertDictEqual(extractor.extract(html), wanted)
Ejemplo n.º 3
0
    def test_w3c_5_2(self):
        """Extraction with text content enabled matches the 5.2 fixture."""
        html = get_testdata('w3c', 'microdata.5.2.html')
        wanted = json.loads(
            get_testdata('w3c', 'microdata.5.2.withtext.json').decode('UTF-8'))
        extractor = MicrodataExtractor(add_text_content=True)
        self.assertDictEqual(extractor.extract(html), wanted)
Ejemplo n.º 4
0
    def test_w3c_5_5(self):
        """Strict extraction matches the W3C 5.5 fixture."""
        html = get_testdata("w3c", "microdata.5.5.html")
        wanted = json.loads(
            get_testdata("w3c", "microdata.5.5.json").decode("UTF-8"))
        extractor = MicrodataExtractor(strict=True)
        self.assertDictEqual(extractor.extract(html), wanted)
Ejemplo n.º 5
0
 def parse_listing(self, response):
     """Yield one dict describing the listing, built from page microdata
     plus a handful of CSS-selected fields; yields nothing when the page
     carries no microdata items."""
     extractor = MicrodataExtractor()
     items = extractor.extract(response.body)['items']
     if not items:
         return
     listing = {}
     # First microdata item is the shop, second the product.
     listing['shop'] = items[0]['properties']
     product = items[1]['properties']
     listing.update(product['offerDetails']['properties'])
     listing['name'] = product['name']
     listing['url'] = response.url
     # Keep only free-form property rows, dropping known boilerplate lines.
     boilerplate = ['materials', 'feedback', 'favorited', 'ships']
     listing['properties'] = [
         text
         for text in response.css('#item-overview .properties li::text').extract()
         if all(word not in text.lower() for word in boilerplate)
     ]
     listing['materials'] = e0(response.css('#overview-materials::text'))
     listing['origin'] = e0(response.css('.origin::text'))
     listing['imgs'] = response.css('#image-carousel img::attr("src")').extract()
     listing['description'] = e0(response.css("#description-text"))
     listing['tags'] = response.css('#listing-tag-list li a::text').extract()
     listing['fineprints'] = [
         line.strip()
         for line in response.css('#fineprint li::text').extract()[:4]
     ]
     listing['rating'] = response.css('.review-rating meta::attr("content")').extract()
     yield listing
Ejemplo n.º 6
0
 def parse(self, response):
     """Yield every microdata item found in the response.

     NOTE(review): the original body referenced an undefined name
     ``html_content`` and did ``yield {beacon_data}`` — a one-element set
     literal containing a dict, which raises TypeError because dicts are
     unhashable.  Both defects are fixed here; confirm the intended input
     was ``response.body``.
     """
     mde = MicrodataExtractor()
     beacon_data = mde.extract(response.body)
     for beacon_item in beacon_data:
         yield beacon_item
 def parse(self, response):
     """Yield an indexing record for each microdata item whose type is in
     ``self.target_types``."""
     extractor = MicrodataExtractor()
     for entry in extractor.extract(response.body):
         if entry['type'] not in self.target_types:
             continue
         yield {
             'indexed_date': datetime.date.today().isoformat(),
             'url': response.url,
             'body': entry,
         }
Ejemplo n.º 8
0
    def test_w3c_7_1(self):
        """Strict extraction with a base URL matches the W3C 7.1 fixture."""
        html = get_testdata('w3c', 'microdata.7.1.html')
        wanted = json.loads(
            get_testdata('w3c', 'microdata.7.1.json').decode('UTF-8'))
        extractor = MicrodataExtractor(strict=True)
        self.assertDictEqual(
            extractor.extract(html, 'http://blog.example.com/progress-report'),
            wanted)
Ejemplo n.º 9
0
    def test_w3c_object_element(self):
        """Strict extraction of the object-element fixture matches its JSON."""
        html = get_testdata('w3c', 'microdata.object.html')
        wanted = json.loads(
            get_testdata('w3c', 'microdata.object.json').decode('UTF-8'))
        extractor = MicrodataExtractor(strict=True)
        self.assertDictEqual(
            extractor.extract(html, 'http://www.example.com/microdata/test'),
            wanted)
Ejemplo n.º 10
0
    def test_w3c_data_element(self):
        """Strict extraction of the data-element fixture matches its JSON."""
        html = get_testdata('w3c', 'microdata.4.2.data.html')
        wanted = json.loads(
            get_testdata('w3c', 'microdata.4.2.data.json').decode('UTF-8'))
        extractor = MicrodataExtractor(strict=True)
        self.assertDictEqual(extractor.extract(html), wanted)
Ejemplo n.º 11
0
    def test_w3c_7_1(self):
        """Strict extraction with a base URL matches the W3C 7.1 fixture."""
        html = get_testdata("w3c", "microdata.7.1.html")
        wanted = json.loads(
            get_testdata("w3c", "microdata.7.1.json").decode("UTF-8"))
        extractor = MicrodataExtractor(strict=True)
        self.assertDictEqual(
            extractor.extract(html, "http://blog.example.com/progress-report"),
            wanted)
Ejemplo n.º 12
0
    def test_w3c_5_2(self):
        """Extraction with text content enabled matches the 5.2 fixture."""
        html = get_testdata('w3c', 'microdata.5.2.html')
        wanted = json.loads(
            get_testdata('w3c', 'microdata.5.2.withtext.json').decode('UTF-8'))
        extractor = MicrodataExtractor(add_text_content=True)
        self.assertEqual(extractor.extract(html), wanted)
Ejemplo n.º 13
0
def get_microdata_extruct_items(htmltext):
    """Return the microdata items extracted from *htmltext*, or ``None``
    when the markup cannot be parsed at all."""
    extractor = MicrodataExtractor()
    try:
        return extractor.extract(htmltext)
    except XMLSyntaxError:
        # Unparseable markup — nothing to extract.
        return None
Ejemplo n.º 14
0
    def test_join_none(self):
        """The product-ref fixture extracts to its expected JSON."""
        html = get_testdata('schema.org', 'product-ref.html')
        wanted = json.loads(
            get_testdata('schema.org', 'product-ref.json').decode('UTF-8'))
        self.assertEqual(MicrodataExtractor().extract(html), wanted)
Ejemplo n.º 15
0
    def test_w3c_textContent_values(self):
        """String-valued properties match the W3C 4.2 strings fixture."""
        html = get_testdata('w3c', 'microdata.4.2.strings.html')
        wanted = json.loads(
            get_testdata('w3c', 'microdata.4.2.strings.json').decode('UTF-8'))
        self.assertEqual(MicrodataExtractor(strict=True).extract(html), wanted)
Ejemplo n.º 16
0
    def test_w3c_5_5(self):
        """Strict extraction matches the W3C 5.5 fixture."""
        html = get_testdata('w3c', 'microdata.5.5.html')
        wanted = json.loads(
            get_testdata('w3c', 'microdata.5.5.json').decode('UTF-8'))
        self.assertEqual(MicrodataExtractor(strict=True).extract(html), wanted)
Ejemplo n.º 17
0
    def test_w3c_meter_element(self):
        """Strict extraction of the meter-element fixture matches its JSON."""
        html = get_testdata('w3c', 'microdata.4.2.meter.html')
        wanted = json.loads(
            get_testdata('w3c', 'microdata.4.2.meter.json').decode('UTF-8'))
        extractor = MicrodataExtractor(strict=True)
        self.assertDictEqual(extractor.extract(html), wanted)
Ejemplo n.º 18
0
    def test_w3c_object_element(self):
        """Strict extraction of the object-element fixture matches its JSON."""
        html = get_testdata('w3c', 'microdata.object.html')
        wanted = json.loads(
            get_testdata('w3c', 'microdata.object.json').decode('UTF-8'))
        extractor = MicrodataExtractor(strict=True)
        self.assertEqual(
            extractor.extract(html, 'http://www.example.com/microdata/test'),
            wanted)
Ejemplo n.º 19
0
    def test_w3c_7_1(self):
        """Non-nested strict extraction matches the W3C 7.1 flat fixture."""
        html = get_testdata('w3c', 'microdata.7.1.html')
        wanted = json.loads(
            get_testdata('w3c', 'microdata.7.1.flat.json').decode('UTF-8'))
        extractor = MicrodataExtractor(nested=False, strict=True)
        self.assertEqual(
            extractor.extract(html, 'http://blog.example.com/progress-report'),
            wanted)
Ejemplo n.º 20
0
    def test_schemaorg_MusicRecording(self):
        """Each MusicRecording fixture extracts to its expected JSON."""
        for idx in [1]:
            html = get_testdata('schema.org',
                                'MusicRecording.{:03d}.html'.format(idx))
            wanted = json.loads(
                get_testdata('schema.org',
                             'MusicRecording.{:03d}.json'.format(idx)).decode('UTF-8'))
            self.assertDictEqual(MicrodataExtractor().extract(html), wanted)
Ejemplo n.º 21
0
    def test_schemaorg_Event(self):
        """Each Event fixture extracts to its expected JSON."""
        for idx in [1, 2, 3, 4, 8]:
            html = get_testdata("schema.org", "Event.{:03d}.html".format(idx))
            wanted = json.loads(
                get_testdata("schema.org",
                             "Event.{:03d}.json".format(idx)).decode("UTF-8"))
            self.assertDictEqual(MicrodataExtractor().extract(html), wanted)
Ejemplo n.º 22
0
    def parse_product(self, response):
        """Build and yield a Product item from a product page.

        Primary values come from the page microdata; ``vgoogle_ecommProd``
        variables embedded inline in the body serve as fallbacks.  The two
        bare ``except:`` clauses of the original were narrowed so unrelated
        errors are no longer swallowed.
        """
        mde = MicrodataExtractor()
        data = mde.extract(response.body)

        # Second microdata item is assumed to hold the product properties
        # (the first appears to be a breadcrumb list) — TODO confirm.
        product_data = data['items'][1]['properties']

        # Fallback values scraped from inline "vgoogle_ecommProd*" lines.
        extra_data = {}
        for l in response.body.split('\n'):
            if 'vgoogle_ecommProd' in l:
                line_data = l.strip()
                key = line_data.split(':')[0].strip().replace(
                    'vgoogle_ecommProd', '')
                value = line_data.split(':')[1][3:-3]
                if key not in extra_data:
                    extra_data[key] = value

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier',
                         product_data.get('sku', extra_data['ID']))
        loader.add_value('sku', product_data.get('sku', extra_data['ID']))
        loader.add_value('name', product_data.get('name', extra_data['Name']))
        loader.add_value('url', response.url)
        if 'price' in product_data:
            loader.add_value('price', product_data['price'])
        elif 'Price' in extra_data:
            loader.add_value('price', extra_data['Price'])
        else:
            # Last resort: scrape the displayed price from the page markup.
            price = response.xpath('//*[(contains(@class, "product-item") and '
                                   'contains(@class, "product-price")) or @id="price-amount"]//text()')\
                            .re(r'[\d\.,]+')
            loader.add_value('price', price)
        try:
            loader.add_value(
                'category', data['items'][0]['properties']['itemListElement']
                [1]['properties']['name'])
        except (KeyError, IndexError, TypeError):
            # Breadcrumb microdata missing or malformed; use the JS fallback.
            loader.add_value('category', extra_data['Cat'])
        loader.add_value('brand',
                         product_data.get('manufacturer', extra_data['Brand']))
        try:
            loader.add_value(
                'image_url',
                response.urljoin(
                    response.xpath(
                        '//div[@id="prod-img-placehold"]/img/@srcset').re(
                            r'(.*\.jpg)')[0].split(',')[-1].strip()))
        except IndexError:
            # No srcset match; the item simply has no image_url.
            pass

        item = loader.load_item()

        metadata = SpecSaversMeta()
        promotional_data = response.xpath(
            '//div[@class="arrow-container"]/div/text()').extract()
        metadata['promotion'] = promotional_data[0].strip(
        ) if promotional_data else ''
        item['metadata'] = metadata
        yield item
Ejemplo n.º 23
0
    def test_join_custom_url(self):
        """Extraction with an explicit base_url matches the custom-URL fixture."""
        html = get_testdata('schema.org', 'product.html')
        wanted = json.loads(
            get_testdata('schema.org',
                         'product_custom_url.json').decode('UTF-8'))
        extractor = MicrodataExtractor()
        self.assertEqual(
            extractor.extract(html, base_url='http://some-example.com'),
            wanted)
Ejemplo n.º 24
0
    def parse(self, response):
        """Yield each microdata item's properties, then follow pagination."""
        extractor = MicrodataExtractor()
        extracted = extractor.extract(response.text, response.url)
        for entry in extracted['items']:
            yield entry['properties']

        next_url = response.css("li.next > a::attr(href)").extract_first()
        if next_url is not None:
            yield scrapy.Request(response.urljoin(next_url))
Ejemplo n.º 25
0
 def parse(self, response):
     """Fill the item carried in ``response.meta`` from the page's first
     microdata item and yield it.

     Fix: the original did ``properties.get('name').replace(...)``, which
     raises AttributeError when the 'name' property is absent; an empty
     string is now used as the fallback.
     """
     extractor = MicrodataExtractor()
     properties = extractor.extract(response.body_as_unicode()).get('items')[0].get('properties', {})
     item = response.meta.get('item', {})
     item['url'] = response.url
     item['title'] = (properties.get('name') or '').replace('Details about', '').strip()
     item['price'] = float(
         properties.get('offers', {}).get('properties', {}).get('price', 0)
     )
     yield item
Ejemplo n.º 26
0
    def test_if_punctuations_in_description_are_correctly_formatted(self):
        """Description text extracts exactly as in the expected JSON fixture."""
        html = get_testdata('websites', 'microdata-with-description.html')
        wanted = json.loads(
            get_testdata('websites',
                         'microdata-with-description.json').decode('UTF-8'))
        self.assertEqual(MicrodataExtractor().extract(html), wanted)
Ejemplo n.º 27
0
    def test_schemaorg_Event(self):
        """Each Event fixture extracts to its expected JSON."""
        for idx in [1, 2, 3, 4, 8]:
            html = get_testdata('schema.org', 'Event.{:03d}.html'.format(idx))
            wanted = json.loads(
                get_testdata('schema.org',
                             'Event.{:03d}.json'.format(idx)).decode('UTF-8'))
            self.assertEqual(MicrodataExtractor().extract(html), wanted)
Ejemplo n.º 28
0
    def parse_item(self, response):
        """Parse the recipe page into a RecipeItem.

        Tries microdata first; falls back to JSON-LD when no microdata is
        found.  A fixed set of recipe properties is then copied into the
        item, with missing properties set to None.
        """
        schema_type = "mde"
        mde = MicrodataExtractor()
        data = mde.extract(response.body)
        # print('response.body:', response.body)
        # print('data:', data)
        # NOTE(review): `len(data) == 0` treats extract() as returning a
        # list, while the jsonld branch below indexes data['items'] — these
        # assume different extruct API versions; confirm which is in use.
        if len(data) == 0:
            jslde = JsonLdExtractor()
            data = jslde.extract(response.body)
            schema_type = "jsonld"

        if schema_type == "mde":
            # Assumes the first microdata item is the recipe — TODO confirm.
            recipe = data[0]['properties']
            # recipe_output_item = RecipeItem()
            # recipe_output_item['recipe_name'] = recipe['name']
            # recipe_output_item['ingredients'] = [
            #     ingredient for ingredient in recipe['ingredients']
            #     if ingredient not in ['', 'Add all ingredients to list']
            # ]
            # recipe_output_item['tags'] = [tag['properties']['title']
            #                               for tag in data['items'][1:]]
            # try:
            #   recipe_output_item['description'] = recipe['description']
            # except KeyError:
            #   recipe_output_item['description'] = None
            # recipe_output_item['url'] = recipe['url']
        elif schema_type == "jsonld":
            recipe = data['items'][0]
            # recipe_output_item = RecipeItem()
            # recipe_output_item['recipe_name'] = recipe['name']
            # recipe_output_item['ingredients'] = recipe['ingredients']
            # recipe_output_item['tags'] = [tag['properties']['title']
            #                               for tag in data['items'][1:]]
            # try:
            #   recipe_output_item['description'] = recipe['description']
            # except KeyError:
            #   recipe_output_item['description'] = None
            # recipe_output_item['url'] = recipe['url']

        # Copy each known property into the output item, defaulting to None.
        properties = [
            'totalTime', 'nutrition', 'name', 'author', 'url', 'image',
            'recipeIngredient', 'aggregateRating', 'recipeYield',
            'recipeInstructions', 'video', 'mainEntityOfPage', 'cookTime',
            'recipeCategory', 'review', 'prepTime', 'description'
        ]
        recipe_output_item = RecipeItem()
        for prop in properties:
            try:
                recipe_output_item[prop] = recipe[prop]
            except KeyError:
                recipe_output_item[prop] = None

        yield recipe_output_item
Ejemplo n.º 29
0
    def test_schemaorg_LocalBusiness(self):
        """Each LocalBusiness fixture extracts to its expected JSON."""
        for idx in [2, 3]:
            html = get_testdata('schema.org',
                                'LocalBusiness.{:03d}.html'.format(idx))
            wanted = json.loads(
                get_testdata(
                    'schema.org',
                    'LocalBusiness.{:03d}.json'.format(idx)).decode('UTF-8'))
            self.assertEqual(MicrodataExtractor().extract(html), wanted)
Ejemplo n.º 30
0
    def test_schemaorg_MusicRecording(self):
        """Each MusicRecording fixture extracts to its expected JSON."""
        for idx in [1]:
            html = get_testdata('schema.org',
                                'MusicRecording.{:03d}.html'.format(idx))
            wanted = json.loads(
                get_testdata(
                    'schema.org',
                    'MusicRecording.{:03d}.json'.format(idx)).decode('UTF-8'))
            self.assertDictEqual(MicrodataExtractor().extract(html), wanted)
Ejemplo n.º 31
0
    def parse(self, response):
        """Yield targeted microdata items, then recurse into /events links."""
        extractor = MicrodataExtractor()
        extracted = extractor.extract(response.body)
        for entry in extracted['items']:
            if entry['type'] not in self.target_types:
                continue
            entry['indexed_date'] = datetime.date.today().isoformat()
            entry['url'] = response.url
            yield entry

        for href in response.xpath('//a/@href').extract():
            if '/events' in href:
                yield scrapy.Request(response.urljoin(href),
                                     callback=self.parse)
Ejemplo n.º 32
0
    def extract(self, html_text: str,
                extract_title: bool = False,
                extract_meta: bool = False,
                extract_microdata: bool = False,
                microdata_base_url: str = "",
                extract_json_ld: bool = False,
                extract_rdfa: bool = False,
                rdfa_base_url: str = "") \
            -> List[Extraction]:
        """Run the requested extractors over *html_text*.

        Args:
            html_text (str): input html string to be extracted
            extract_title (bool): extract the 'title' tag as { "title": "..." }
            extract_meta (bool): extract 'meta' tags as { "meta": {...} }
            extract_microdata (bool): extract microdata as { "microdata": [...] }
            microdata_base_url (str): base namespace url for microdata ("" for none)
            extract_json_ld (bool): extract json-ld as { "json-ld": [...] }
            extract_rdfa (bool): extract rdfa as { "rdfa": [...] }
            rdfa_base_url (str): base namespace url for rdfa ("" for none)

        Returns:
            List[Extraction]: the extractions, or an empty list when nothing matched.
        """
        results = []
        soup = BeautifulSoup(html_text, 'html.parser')

        if extract_title and soup.title:
            decoded_title = soup.title.string.encode('utf-8').decode('utf-8')
            results.append(self._wrap_data("title", decoded_title))

        # NOTE(review): meta extraction is gated on the page having a
        # <title>, mirroring the title branch — confirm this is intentional.
        if extract_meta and soup.title:
            meta_content = self._wrap_meta_content(soup.find_all("meta"))
            results.append(self._wrap_data("meta", meta_content))

        if extract_microdata:
            results.append(self._wrap_data(
                "microdata",
                MicrodataExtractor().extract(html_text, microdata_base_url)))

        if extract_json_ld:
            results.append(self._wrap_data(
                "json-ld", JsonLdExtractor().extract(html_text)))

        if extract_rdfa:
            results.append(self._wrap_data(
                "rdfa", RDFaExtractor().extract(html_text, rdfa_base_url)))

        return results
Ejemplo n.º 33
0
    def parse_product(self, response):
        """Yield one Product item per colour/size variant of a product page.

        Fixes: the ``filter(...)[0]`` indexing was Python-2-only (a filter
        object is not subscriptable on Python 3, and the resulting TypeError
        was silently swallowed by a bare ``except:``); both constructs were
        replaced with list comprehensions and a narrowed except clause.
        """
        mde = MicrodataExtractor()
        try:
            micro_data = mde.extract(response.body)['items']
            gen_data = [a for a in micro_data
                        if a['type'] == 'http://schema.org/Product'][0]['properties']
            categories = [c['properties']['title']
                          for c in micro_data
                          if c['type'] == 'http://data-vocabulary.org/Breadcrumb'][1:]
        except (KeyError, IndexError, TypeError):
            # Page lacks the expected Product/Breadcrumb microdata.
            self.log('WARNING => Wrong product page in %s' % response.url)
            return

        main_name = gen_data['name']
        if isinstance(main_name, list):
            main_name = main_name[0]
        main_brand = gen_data.get('brand', '')
        if isinstance(main_brand, list):
            main_brand = main_brand[0]

        # Variant data is embedded as HTML-escaped JSON in a hidden input.
        variants = response.xpath('//input[@name="ctl00$cphMain$ctl00$hidProductVariants"]/@value').extract()
        if variants:
            data = json.loads(self.html_parser.unescape(variants[0]))
            for d in data:
                for var in d['Variants']:
                    for size_data in var['Variants']:
                        color_name = size_data.get('Article', dict()).get('ColorName', '')
                        size_data = size_data['Article']
                        url = self.product_url % size_data
                        identifier = size_data['ItemOfferId']
                        name = main_name + ', ' + color_name + ', ' + size_data['FriendlySize']
                        price = size_data['WebInfo']['ArticlePriceDisplay']['FormattedSalePriceAfterWithCharges']
                        shipping_cost = size_data['FormattedDeliveryFee']
                        loader = ProductLoader(item=Product(), response=response)
                        loader.add_value('name', name)
                        loader.add_value('url', url)
                        loader.add_value('identifier', identifier)
                        loader.add_value('sku', size_data['ProductId'])
                        loader.add_value('price', extract_price_eu(price))
                        if shipping_cost:
                            loader.add_value('shipping_cost', extract_price_eu(shipping_cost))
                        loader.add_value('image_url', gen_data['image'][-1])
                        # 'L' appears to mean in stock — TODO confirm code table.
                        if size_data['AvailabilityCode'] != 'L':
                            loader.add_value('stock', 0)
                        loader.add_value('category', categories)
                        if main_brand:
                            loader.add_value('brand', main_brand)
                        yield loader.load_item()
        else:
            self.log('WARNING: Variants not found in => %s' % response.url)
    def parse_item(self, response):
        """Parse the recipe to get title and ingredients.

        Tries microdata first; falls back to JSON-LD when the page exposes
        no microdata items.  Yields a populated RecipeItem.
        """
        schema_type = "mde"
        mde = MicrodataExtractor()
        data = mde.extract(response.body)
        if len(data['items']) == 0:
            jslde = JsonLdExtractor()
            data = jslde.extract(response.body)
            schema_type = "jsonld"

        if schema_type == "mde":
            # Assumes the third microdata item is the recipe — TODO confirm
            # against the target site's markup.
            recipe = data['items'][2]['properties']
            recipe_output_item = RecipeItem()
            recipe_output_item['recipe_name'] = recipe['name']
            # Drop empty strings and the site's "Add all..." helper row.
            recipe_output_item['ingredients'] = [
                ingredient for ingredient in recipe['ingredients']
                if ingredient not in ['', 'Add all ingredients to list']
            ]
            recipe_tags = recipe['recipeCategory']
            if 'recipeCuisine' in recipe.keys():
                recipe_tags.append(recipe['recipeCuisine'])
            recipe_output_item['tags'] = recipe_tags
            try:
                recipe_output_item['description'] = recipe['description']
            except KeyError:
                recipe_output_item['description'] = None
            recipe_output_item['url'] = recipe['url']
        elif schema_type == "jsonld":
            recipe = data['items'][0]
            recipe_output_item = RecipeItem()
            recipe_output_item['recipe_name'] = recipe['name']
            recipe_output_item['ingredients'] = recipe['ingredients']
            # NOTE(review): tags are read from the remaining *microdata-style*
            # items even on the JSON-LD path — confirm the data shape.
            recipe_output_item['tags'] = [
                tag['properties']['title'] for tag in data['items'][1:]
            ]
            try:
                recipe_output_item['description'] = recipe['description']
            except KeyError:
                recipe_output_item['description'] = None
            recipe_output_item['url'] = recipe['url']

        yield recipe_output_item
Ejemplo n.º 35
0
  def parse_item(self, response):
    """Parse the recipe to get title and ingredients.

    Tries microdata first; falls back to JSON-LD when the page exposes no
    microdata items.  Yields a populated RecipeItem.
    """
    schema_type = "mde"
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    if len(data['items']) == 0:
      jslde = JsonLdExtractor()
      data = jslde.extract(response.body)
      schema_type = "jsonld"

    if schema_type == "mde":
      # Assumes the third microdata item is the recipe — TODO confirm.
      recipe = data['items'][2]['properties']
      recipe_output_item = RecipeItem()
      recipe_output_item['recipe_name'] = recipe['name']
      # Drop empty strings and the site's "Add all..." helper row.
      recipe_output_item['ingredients'] = [
          ingredient for ingredient in recipe['ingredients']
          if ingredient not in ['', 'Add all ingredients to list']
      ]
      recipe_tags = recipe['recipeCategory']
      if 'recipeCuisine' in recipe.keys():
        recipe_tags.append(recipe['recipeCuisine'])
      recipe_output_item['tags'] = recipe_tags
      try:
        recipe_output_item['description'] = recipe['description']
      except KeyError:
        recipe_output_item['description'] = None
      recipe_output_item['url'] = recipe['url']
    elif schema_type == "jsonld":
      recipe = data['items'][0]
      recipe_output_item = RecipeItem()
      recipe_output_item['recipe_name'] = recipe['name']
      recipe_output_item['ingredients'] = recipe['ingredients']
      # NOTE(review): tags read microdata-shaped entries even on the
      # JSON-LD path — confirm the data shape.
      recipe_output_item['tags'] = [tag['properties']['title']
                                    for tag in data['items'][1:]]
      try:
        recipe_output_item['description'] = recipe['description']
      except KeyError:
        recipe_output_item['description'] = None
      recipe_output_item['url'] = recipe['url']

    yield recipe_output_item
Ejemplo n.º 36
0
 def parse_listing(self, response):
     """Yield a dict describing the listing built from page microdata.

     Fix: the original's first body line was indented with a tab while the
     rest used spaces, which raises TabError under Python 3; indentation is
     normalized to spaces here.
     """
     mde = MicrodataExtractor()
     data = mde.extract(response.body)['items']
     if data:
         it = {}
         # First microdata item is the shop, second the product.
         it['shop'] = data[0]['properties']
         prod = data[1]['properties']
         it.update(prod['offerDetails']['properties'])
         it['name'] = prod['name']
         it['url'] = response.url
         # Keep only free-form property rows, dropping boilerplate lines.
         it['properties'] = [x for x in response.css('#item-overview .properties li::text').extract()
             if all(y not in x.lower() for y in ['materials','feedback', 'favorited', 'ships'])]
         it['materials'] = e0(response.css('#overview-materials::text'))
         it['origin'] = e0(response.css('.origin::text'))
         it['imgs'] = response.css('#image-carousel img::attr("src")').extract()
         it['description'] = e0(response.css("#description-text"))
         it['tags'] = response.css('#listing-tag-list li a::text').extract()
         it['fineprints'] = [x.strip() for x in response.css('#fineprint li::text').extract()[:4]]
         it['rating'] = response.css('.review-rating meta::attr("content")').extract()
         #it['html'] = response.body
         yield it
Ejemplo n.º 37
0
    def extract(self, html_text: str,
                extract_title: bool = False,
                extract_meta: bool = False,
                extract_microdata: bool = False,
                extract_json_ld: bool = False,
                extract_rdfa: bool = False) \
            -> List[Extraction]:
        """Run the requested extractors over *html_text* and return the
        wrapped extractions (empty list when nothing matched)."""
        results = []
        soup = BeautifulSoup(html_text, 'html.parser')

        if extract_title and soup.title:
            decoded_title = soup.title.string.encode('utf-8').decode('utf-8')
            results.append(self.wrap_data("title", decoded_title))

        # NOTE(review): meta extraction is gated on the page having a
        # <title>, mirroring the title branch — confirm this is intentional.
        if extract_meta and soup.title:
            meta_content = self.wrap_meta_content(soup.find_all("meta"))
            results.append(self.wrap_data("meta", meta_content))

        if extract_microdata:
            results.append(self.wrap_data(
                "microdata", MicrodataExtractor().extract(html_text)))

        if extract_json_ld:
            results.append(self.wrap_data(
                "json-ld", JsonLdExtractor().extract(html_text)))

        if extract_rdfa:
            results.append(self.wrap_data(
                "rdfa", RDFaExtractor().extract(html_text)))

        return results
Ejemplo n.º 38
0
	def parse(self, response):
		"""Extract all microdata items from the page and print them.

		NOTE(review): Python 2 code (``print`` statement); ``selector`` is
		built but never used — presumably leftover scaffolding.
		"""
		selector = Selector(response=response)

		extractor = MicrodataExtractor()
		items = extractor.extract(response.body_as_unicode(), response.url)
		print items
Ejemplo n.º 39
0
    def parse_products(self, response):
        """Yield a Product item for every schema.org/Product on a listing
        page, then follow the pagination links.

        Fix: the bare ``except:`` around the offers lookup was narrowed so
        unrelated errors are no longer swallowed.
        """
        mde = MicrodataExtractor()
        data = mde.extract(response.body)

        category = response.meta['category']

        # Product microdata items are assumed to line up 1:1 with the
        # "Products_*" divs on the page — TODO confirm.
        selectors = response.xpath('//div[contains(@id, "Products_")]')
        products = filter(lambda d: d['type'] == 'http://schema.org/Product',
                          data['items'])
        for product_data, product_xs in zip(products, selectors):
            properties = product_data['properties']
            try:
                offer = properties['offers']['properties']
            except (KeyError, TypeError):
                # Product without offer microdata — skip it.
                self.log('Offers are not found for %s => %s' %
                         (properties['name'], response.url))
                continue
            brand = product_xs.xpath(
                './/div[@class="Image"]//img[contains(@alt, "View more ")]/@alt'
            ).re(r'View more (.*) products')
            product_url = product_xs.xpath(
                './/div[@class="Info"]//h2/a[contains(@href, "/products/")]/@href'
            ).extract()
            if not product_url:
                self.log('Not product url in => %s' % response.url)
                continue
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', properties['mpn'])
            loader.add_value('url', response.urljoin(product_url[0]))
            loader.add_value('name', properties['name'])
            # Listed price includes 20% VAT; store the ex-VAT value.
            loader.add_value(
                'price',
                round(
                    Decimal(offer['price'].replace(',', '')) / Decimal('1.2'),
                    2))
            loader.add_value('sku', properties['mpn'])
            loader.add_value('category', category)
            loader.add_value(
                'image_url',
                urljoin_rfc('http://www.broadbandbuyer.com/images/products/',
                            properties['image']))
            if brand:
                loader.add_value('brand', brand[0])
            loader.add_value('shipping_cost', '13')

            in_stock = (offer['availability'] == 'http://schema.org/InStock')
            if not in_stock:
                loader.add_value('stock', 0)
            else:
                stock_no = product_xs.xpath(
                    './/div[@class="Info"]//span[@class="Stock3"]/text()').re(
                        r'(\d+)')
                if stock_no:
                    loader.add_value('stock', stock_no[0])

            item = loader.load_item()

            self.yield_item(item)

        page_urls = set(
            response.xpath(
                '//div[@class="pages"]/a[not(contains(@class, "active"))'
                ' and contains(@href, "page=")]/@href').extract())
        for url in page_urls:
            yield Request(response.urljoin(url),
                          callback=self.parse_products,
                          meta={
                              'cookiejar': response.meta['cookiejar'],
                              'category': response.meta['category']
                          })
Ejemplo n.º 40
0
def get_review_items_from_microdata(spider,
                                    review_type,
                                    response,
                                    product,
                                    reviews_xpath=None,
                                    pros_xpath=None,
                                    cons_xpath=None):
    '''
    Get all reviews from a page, useful for user review pages with microdata.

    :param spider: the spider we use to scrape the site
    :param review_type: type of the reviews to scrape, should be either USER or PRO
    :param response: an instance of Scrapy's Response object where reviews will be scraped from
    :param product: the product item the reviews are written for
    :param reviews_xpath: the xpath to extract review selectors from 'response'
    :param pros_xpath: the xpath to extract pros from review selectors
    :param cons_xpath: the xpath to extract cons from review selectors
    :return: list of all review items extracted
    '''
    mde = MicrodataExtractor()
    try:
        items = mde.extract(response.text)
    except XMLSyntaxError:
        return []  # Page body is not parseable markup; nothing to do here.

    # Use .get() so microdata items without a 'type' key are skipped
    # instead of raising KeyError.
    all_review_extracts = [
        i for i in items if i.get('type') == "http://schema.org/Review"
    ]
    all_pros = []
    all_cons = []

    # Pros/cons are not part of the microdata; when an xpath for the review
    # containers is given, extract them separately and pair them with the
    # microdata reviews by position.
    if reviews_xpath:
        add_pros_and_cons = True
        all_reviews = response.xpath(reviews_xpath)
        for single_review in all_reviews:
            if pros_xpath:
                pros = spider.extract_all(single_review.xpath(pros_xpath),
                                          separator=' ; ')
            else:
                pros = ''
            if cons_xpath:
                cons = spider.extract_all(single_review.xpath(cons_xpath),
                                          separator=' ; ')
            else:
                cons = ''
            all_pros.append(pros)
            all_cons.append(cons)

        # Positional pairing is only safe if the counts line up; otherwise
        # fall back to plain reviews without pros/cons.
        if len(all_pros) != len(all_review_extracts) or len(all_cons) != len(
                all_review_extracts):
            spider.logger.warning(
                "Number of reviews extracted from xpath is different from number of review microdata."
            )
            add_pros_and_cons = False
    else:
        add_pros_and_cons = False

    review_items = []
    for index, item in enumerate(all_review_extracts):
        if add_pros_and_cons:
            review = review_microdata_extruct(item,
                                              product=product,
                                              tp=review_type,
                                              pros=all_pros[index],
                                              cons=all_cons[index])
        else:
            review = review_microdata_extruct(item,
                                              product=product,
                                              tp=review_type)
        review_items.append(review)

    return review_items
Ejemplo n.º 41
0
class RISJMetadataExtractor(object):
    """An extruct-based metadata extractor.

    Runs the requested extruct extractors (microdata, JSON-LD, RDFa) over a
    Scrapy response at construction time and exposes the raw results plus a
    convenience method to pull out schema.org NewsArticle metadata.
    """

    # TODO: Extend to microdata and RDFa, replacing bespoke xpath code. Then
    #       test on body of crawlers!
    def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
        """
        :param response: Scrapy Response to extract metadata from
        :param microdata: run the W3C microdata extractor
        :param jsonld: run the JSON-LD extractor
        :param rdfa: run the RDFa extractor
        """
        self.response = response
        self.microdata = microdata
        self.jsonld = jsonld
        self.rdfa = rdfa

        # Always initialise the result containers: extraction may fail
        # (the except branches below) or be skipped entirely, yet
        # extract_newsarticle_schemaorg() can still be asked to iterate
        # them via its override arguments.
        self.rdfadata = []
        self.mdedata = []
        self.jldata = []

        if rdfa:
            try:
                self.rdfae = RDFaExtractor()
                self.rdfadata = self.rdfae.extract(self.response.text,
                                                   url=self.response.url)
            except JSONDecodeError:
                pass  # Broken embedded data: keep the empty default.
        if microdata:
            try:
                self.mde = MicrodataExtractor()
                self.mdedata = self.mde.extract(self.response.text)
            except JSONDecodeError:
                pass  # Broken embedded data: keep the empty default.
        if jsonld:
            try:
                self.jlde = JsonLdExtractor()
                self.jldata = self.jlde.extract(self.response.text)
            except (JSONDecodeError, TypeError):
                self.jldata = []
            finally:
                # Sometimes we get this in the meta dict from RISJExtractJSONLD
                self.jldata.extend(self.response.meta.get('json-ld', []))

    def extract_newsarticle_schemaorg(self,
                                      microdata=None,
                                      jsonld=None,
                                      rdfa=None):
        """Extract schema.org NewsArticle metadata, encoded using any
           supported metadata format. Note that we only try to extract the
           *first* block of NewsArticle data for each method (which is then
           combined with the first extracted from other methods if more than
           one is selected.

           Each keyword argument defaults to the flag given at construction
           time; pass True/False to override per call.

        :returns: dict of merged NewsArticle properties (empty if none found)
        :raises NotImplementedError: if RDFa extraction is requested
        """
        if microdata is None:
            microdata = self.microdata
        if jsonld is None:
            jsonld = self.jsonld
        if rdfa is None:
            rdfa = self.rdfa

        outd = {}
        if jsonld:
            for d in self.jldata:
                #                logger.debug('Analysing JSON-LD data: '+pformat(d))
                try:
                    # Accept both http and https schema.org context URLs.
                    if (re.match(r'https?://schema.org/?', d['@context'])
                            and d['@type'] == 'NewsArticle'):
                        outd.update(d)
                except (KeyError, TypeError):
                    continue
        if microdata:
            for d in self.mdedata:
                logger.debug('Analysing W3C microdata: ' + pformat(d))
                if re.match(r'https?://schema.org/NewsArticle/?',
                            d.get('type', '')):
                    outd.update(d)
        if rdfa:
            raise NotImplementedError
#        logger.debug('Returning schema.org NewsArticle: '+pformat(outd))
        return outd