def test_w3c_object_element(self):
    """W3C <object> element sample must match its JSON fixture (strict mode)."""
    page = get_testdata('w3c', 'microdata.object.html')
    fixture = get_testdata('w3c', 'microdata.object.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page, 'http://www.example.com/microdata/test')
    self.assertDictEqual(result, expected)
def test_w3c_5_5(self):
    """W3C spec section 5.5 sample must match its JSON fixture (strict mode)."""
    page = get_testdata("w3c", "microdata.5.5.html")
    fixture = get_testdata("w3c", "microdata.5.5.json")
    expected = json.loads(fixture.decode("UTF-8"))
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page)
    self.assertDictEqual(result, expected)
def test_w3c_7_1(self):
    """W3C spec section 7.1 sample resolved against a base URL (strict mode)."""
    page = get_testdata("w3c", "microdata.7.1.html")
    fixture = get_testdata("w3c", "microdata.7.1.json")
    expected = json.loads(fixture.decode("UTF-8"))
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page, "http://blog.example.com/progress-report")
    self.assertDictEqual(result, expected)
def test_w3c_7_1(self):
    """W3C spec section 7.1 sample resolved against a base URL (strict mode)."""
    page = get_testdata('w3c', 'microdata.7.1.html')
    fixture = get_testdata('w3c', 'microdata.7.1.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page, 'http://blog.example.com/progress-report')
    self.assertDictEqual(result, expected)
def test_w3c_data_element(self):
    """W3C <data> element sample (spec 4.2) must match its JSON fixture."""
    page = get_testdata('w3c', 'microdata.4.2.data.html')
    fixture = get_testdata('w3c', 'microdata.4.2.data.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page)
    self.assertDictEqual(result, expected)
def test_w3c_5_2(self):
    """Section 5.2 sample with add_text_content=True matches the withtext fixture."""
    page = get_testdata('w3c', 'microdata.5.2.html')
    fixture = get_testdata('w3c', 'microdata.5.2.withtext.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor(add_text_content=True)
    result = extractor.extract(page)
    self.assertDictEqual(result, expected)
def test_w3c_5_2(self):
    """Section 5.2 sample with nested=False matches the flat fixture."""
    page = get_testdata('w3c', 'microdata.5.2.html')
    fixture = get_testdata('w3c', 'microdata.5.2.flat.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor(nested=False, strict=True)
    result = extractor.extract(page)
    self.assertDictEqual(result, expected)
def test_w3c_object_element(self):
    """W3C <object> element sample must match its JSON fixture (strict mode)."""
    page = get_testdata('w3c', 'microdata.object.html')
    fixture = get_testdata('w3c', 'microdata.object.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page, 'http://www.example.com/microdata/test')
    self.assertEqual(result, expected)
def test_w3c_meter_element(self):
    """W3C <meter> element sample (spec 4.2) must match its JSON fixture."""
    page = get_testdata('w3c', 'microdata.4.2.meter.html')
    fixture = get_testdata('w3c', 'microdata.4.2.meter.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page)
    self.assertDictEqual(result, expected)
def test_w3c_5_2(self):
    """Section 5.2 sample with add_text_content=True matches the withtext fixture."""
    page = get_testdata('w3c', 'microdata.5.2.html')
    fixture = get_testdata('w3c', 'microdata.5.2.withtext.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor(add_text_content=True)
    result = extractor.extract(page)
    self.assertEqual(result, expected)
def test_join_none(self):
    """Product sample extracted without a base URL matches the default fixture."""
    page = get_testdata('schema.org', 'product.html')
    fixture = get_testdata('schema.org', 'product.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor()
    result = extractor.extract(page)
    self.assertEqual(result, expected)
def test_w3c_textContent_values(self):
    """String-valued properties (spec 4.2 strings sample) match the fixture."""
    page = get_testdata('w3c', 'microdata.4.2.strings.html')
    fixture = get_testdata('w3c', 'microdata.4.2.strings.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page)
    self.assertEqual(result, expected)
def test_w3c_5_3(self):
    """W3C spec section 5.3 sample must match its JSON fixture (strict mode)."""
    page = get_testdata('w3c', 'microdata.5.3.html')
    fixture = get_testdata('w3c', 'microdata.5.3.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    result = extractor.extract(page)
    self.assertEqual(result, expected)
def test_schemaorg_MusicRecording(self):
    """Each numbered MusicRecording sample matches its numbered JSON fixture."""
    for sample_id in [1]:
        page = get_testdata('schema.org', 'MusicRecording.{:03d}.html'.format(sample_id))
        fixture = get_testdata('schema.org', 'MusicRecording.{:03d}.json'.format(sample_id))
        expected = json.loads(fixture.decode('UTF-8'))
        extractor = MicrodataExtractor()
        result = extractor.extract(page)
        self.assertDictEqual(result, expected)
def test_w3c_7_1(self):
    """Section 7.1 sample with nested=False matches the flat fixture."""
    page = get_testdata('w3c', 'microdata.7.1.html')
    fixture = get_testdata('w3c', 'microdata.7.1.flat.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor(nested=False, strict=True)
    result = extractor.extract(page, 'http://blog.example.com/progress-report')
    self.assertEqual(result, expected)
def test_schemaorg_Event(self):
    """Each numbered Event sample matches its numbered JSON fixture."""
    for sample_id in [1, 2, 3, 4, 8]:
        page = get_testdata("schema.org", "Event.{:03d}.html".format(sample_id))
        fixture = get_testdata("schema.org", "Event.{:03d}.json".format(sample_id))
        expected = json.loads(fixture.decode("UTF-8"))
        extractor = MicrodataExtractor()
        result = extractor.extract(page)
        self.assertDictEqual(result, expected)
def parse_product(self, response):
    """Scrape one product page into a Product item.

    Primary data comes from the page's microdata (second item's properties);
    a secondary source is inline JS variables named 'vgoogle_ecommProd*',
    scraped line-by-line from the raw body and used as fallbacks.
    Yields a single loaded Product with attached SpecSaversMeta.
    """
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    # Item [0] is assumed to be the breadcrumb list, item [1] the product.
    # NOTE(review): hard-coded index — verify against the site's markup.
    product_data = data['items'][1]['properties']
    extra_data = {}
    # Harvest 'vgoogle_ecommProd<Key> : "<value>"' lines as a fallback dict.
    for l in response.body.split('\n'):
        if 'vgoogle_ecommProd' in l:
            line_data = l.strip()
            key = line_data.split(':')[0].strip().replace(
                'vgoogle_ecommProd', '')
            # [3:-3] strips the surrounding quote/space decoration around the value.
            value = line_data.split(':')[1][3:-3]
            if key not in extra_data:
                extra_data[key] = value
    loader = ProductLoader(item=Product(), response=response)
    # Microdata value preferred; inline-JS value used when the key is absent.
    loader.add_value('identifier', product_data.get('sku', extra_data['ID']))
    loader.add_value('sku', product_data.get('sku', extra_data['ID']))
    loader.add_value('name', product_data.get('name', extra_data['Name']))
    loader.add_value('url', response.url)
    # Price: microdata, then inline JS, then a last-resort xpath scrape.
    if 'price' in product_data:
        loader.add_value('price', product_data['price'])
    elif 'Price' in extra_data:
        loader.add_value('price', extra_data['Price'])
    else:
        price = response.xpath('//*[(contains(@class, "product-item") and '
                               'contains(@class, "product-price")) or @id="price-amount"]//text()')\
            .re(r'[\d\.,]+')
        loader.add_value('price', price)
    # Category from the second breadcrumb element; fall back to inline JS.
    try:
        loader.add_value(
            'category', data['items'][0]['properties']['itemListElement']
            [1]['properties']['name'])
    except:  # noqa: bare except kept — any lookup failure falls back to 'Cat'
        loader.add_value('category', extra_data['Cat'])
    loader.add_value('brand', product_data.get('manufacturer', extra_data['Brand']))
    # Image: last (largest) entry of the srcset; best-effort, missing image is OK.
    try:
        loader.add_value(
            'image_url', response.urljoin(
                response.xpath(
                    '//div[@id="prod-img-placehold"]/img/@srcset').re(
                    r'(.*\.jpg)')[0].split(',')[-1].strip()))
    except:  # noqa: bare except kept — image is optional
        pass
    item = loader.load_item()
    metadata = SpecSaversMeta()
    promotional_data = response.xpath(
        '//div[@class="arrow-container"]/div/text()').extract()
    metadata['promotion'] = promotional_data[0].strip(
    ) if promotional_data else ''
    item['metadata'] = metadata
    yield item
def test_join_custom_url(self):
    """Relative URLs joined against an explicit base_url match the custom fixture."""
    page = get_testdata('schema.org', 'product.html')
    fixture = get_testdata('schema.org', 'product_custom_url.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor()
    result = extractor.extract(page, base_url='http://some-example.com')
    self.assertEqual(result, expected)
def parse(self, response):
    """Yield the properties of every microdata item, then follow pagination."""
    extracted = MicrodataExtractor().extract(response.text, response.url)
    for entry in extracted['items']:
        yield entry['properties']
    # Follow the "next" link if the page has one.
    next_href = response.css("li.next > a::attr(href)").extract_first()
    if next_href is not None:
        yield scrapy.Request(response.urljoin(next_href))
def parse(self, response):
    """Fill the item passed via meta with title/price from the first microdata item."""
    extractor = MicrodataExtractor()
    extracted = extractor.extract(response.body_as_unicode())
    properties = extracted.get('items')[0].get('properties', {})
    item = response.meta.get('item', {})
    item['url'] = response.url
    # Listing titles are prefixed with "Details about"; strip it off.
    item['title'] = properties.get('name').replace('Details about', '').strip()
    offer_props = properties.get('offers', {}).get('properties', {})
    item['price'] = float(offer_props.get('price', 0))
    yield item
def test_if_punctuations_in_description_are_correctly_formatted(self):
    """Punctuation inside description text must survive extraction unchanged."""
    page = get_testdata('websites', 'microdata-with-description.html')
    fixture = get_testdata('websites', 'microdata-with-description.json')
    expected = json.loads(fixture.decode('UTF-8'))
    extractor = MicrodataExtractor()
    result = extractor.extract(page)
    self.assertEqual(result, expected)
def test_schemaorg_Event(self):
    """Each numbered Event sample matches its numbered JSON fixture."""
    for sample_id in [1, 2, 3, 4, 8]:
        page = get_testdata('schema.org', 'Event.{:03d}.html'.format(sample_id))
        fixture = get_testdata('schema.org', 'Event.{:03d}.json'.format(sample_id))
        expected = json.loads(fixture.decode('UTF-8'))
        extractor = MicrodataExtractor()
        result = extractor.extract(page)
        self.assertEqual(result, expected)
def parse_item(self, response):
    """Parse the recipe to get title and ingredients.

    Tries microdata first; if the page yields no microdata, falls back to
    JSON-LD. Copies a fixed set of schema.org Recipe properties into a
    RecipeItem, using None for any property the page does not provide.
    """
    schema_type = "mde"
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    # print('response.body:', response.body)
    # print('data:', data)
    # Empty microdata result -> retry with JSON-LD.
    # NOTE(review): `len(data) == 0` assumes extract() returns a list here
    # (newer extruct); older versions returned a dict — confirm the version.
    if len(data) == 0:
        jslde = JsonLdExtractor()
        data = jslde.extract(response.body)
        schema_type = "jsonld"
    if schema_type == "mde":
        # First microdata item is assumed to be the recipe.
        recipe = data[0]['properties']
        # Earlier per-field mapping, kept for reference:
        # recipe_output_item = RecipeItem()
        # recipe_output_item['recipe_name'] = recipe['name']
        # recipe_output_item['ingredients'] = [
        #     ingredient for ingredient in recipe['ingredients']
        #     if ingredient not in ['', 'Add all ingredients to list']
        # ]
        # recipe_output_item['tags'] = [tag['properties']['title']
        #                               for tag in data['items'][1:]]
        # try:
        #     recipe_output_item['description'] = recipe['description']
        # except KeyError:
        #     recipe_output_item['description'] = None
        # recipe_output_item['url'] = recipe['url']
    elif schema_type == "jsonld":
        recipe = data['items'][0]
        # Earlier per-field mapping, kept for reference:
        # recipe_output_item = RecipeItem()
        # recipe_output_item['recipe_name'] = recipe['name']
        # recipe_output_item['ingredients'] = recipe['ingredients']
        # recipe_output_item['tags'] = [tag['properties']['title']
        #                               for tag in data['items'][1:]]
        # try:
        #     recipe_output_item['description'] = recipe['description']
        # except KeyError:
        #     recipe_output_item['description'] = None
        # recipe_output_item['url'] = recipe['url']
    # schema.org Recipe properties copied verbatim when present.
    properties = [
        'totalTime', 'nutrition', 'name', 'author', 'url', 'image',
        'recipeIngredient', 'aggregateRating', 'recipeYield',
        'recipeInstructions', 'video', 'mainEntityOfPage', 'cookTime',
        'recipeCategory', 'review', 'prepTime', 'description'
    ]
    recipe_output_item = RecipeItem()
    for prop in properties:
        try:
            recipe_output_item[prop] = recipe[prop]
        except KeyError:
            # Missing properties are stored as None rather than omitted.
            recipe_output_item[prop] = None
    yield recipe_output_item
def test_schemaorg_LocalBusiness(self):
    """Each numbered LocalBusiness sample matches its numbered JSON fixture."""
    for sample_id in [2, 3]:
        page = get_testdata('schema.org', 'LocalBusiness.{:03d}.html'.format(sample_id))
        fixture = get_testdata('schema.org', 'LocalBusiness.{:03d}.json'.format(sample_id))
        expected = json.loads(fixture.decode('UTF-8'))
        extractor = MicrodataExtractor()
        result = extractor.extract(page)
        self.assertEqual(result, expected)
def test_schemaorg_MusicRecording(self):
    """Each numbered MusicRecording sample matches its numbered JSON fixture."""
    for sample_id in [1]:
        page = get_testdata('schema.org', 'MusicRecording.{:03d}.html'.format(sample_id))
        fixture = get_testdata('schema.org', 'MusicRecording.{:03d}.json'.format(sample_id))
        expected = json.loads(fixture.decode('UTF-8'))
        extractor = MicrodataExtractor()
        result = extractor.extract(page)
        self.assertDictEqual(result, expected)
def parse(self, response):
    """Yield microdata items of the targeted types, then crawl /events links."""
    extracted = MicrodataExtractor().extract(response.body)
    for entry in extracted['items']:
        if entry['type'] not in self.target_types:
            continue
        # Stamp each yielded item with crawl date and source URL.
        entry['indexed_date'] = datetime.date.today().isoformat()
        entry['url'] = response.url
        yield entry
    # Recurse into any link that looks like an events page.
    for href in response.xpath('//a/@href').extract():
        if '/events' in href:
            yield scrapy.Request(response.urljoin(href), callback=self.parse)
def extract(self, html_text: str, extract_title: bool = False,
            extract_meta: bool = False,
            extract_microdata: bool = False,
            microdata_base_url: str = "",
            extract_json_ld: bool = False,
            extract_rdfa: bool = False,
            rdfa_base_url: str = "") \
        -> List[Extraction]:
    """
    Args:
        html_text (str): input html string to be extracted
        extract_title (bool): True if string of 'title' tag needs to be extracted, return as { "title": "..." }
        extract_meta (bool): True if string of 'meta' tags needs to be extracted, return as { "meta": { "author": "...", ...}}
        extract_microdata (bool): True if microdata needs to be extracted, returns as { "microdata": [...] }
        microdata_base_url (str): base namespace url for microdata, empty string if no base url is specified
        extract_json_ld (bool): True if json-ld needs to be extracted, return as { "json-ld": [...] }
        extract_rdfa (bool): True if rdfs needs to be extracted, returns as { "rdfa": [...] }
        rdfa_base_url (str): base namespace url for rdfa, empty string if no base url is specified

    Returns:
        List[Extraction]: the list of extraction or the empty list if there are no matches.
    """
    res = list()
    soup = BeautifulSoup(html_text, 'html.parser')
    if soup.title and extract_title:
        # encode/decode round-trip normalizes the title string to utf-8 text.
        title = self._wrap_data("title", soup.title.string.encode('utf-8').decode('utf-8'))
        res.append(title)
    # NOTE(review): meta extraction is also gated on soup.title being present
    # — looks unintentional (pages without <title> lose their <meta> data);
    # confirm before changing.
    if soup.title and extract_meta:
        meta_content = self._wrap_meta_content(soup.find_all("meta"))
        meta_data = self._wrap_data("meta", meta_content)
        res.append(meta_data)
    if extract_microdata:
        mde = MicrodataExtractor()
        mde_data = self._wrap_data("microdata", mde.extract(html_text, microdata_base_url))
        res.append(mde_data)
    if extract_json_ld:
        jslde = JsonLdExtractor()
        jslde_data = self._wrap_data("json-ld", jslde.extract(html_text))
        res.append(jslde_data)
    if extract_rdfa:
        rdfae = RDFaExtractor()
        rdfae_data = self._wrap_data("rdfa", rdfae.extract(html_text, rdfa_base_url))
        res.append(rdfae_data)
    return res
def parse_product(self, response):
    """Scrape a product page and yield one Product item per size/colour variant.

    General product data and breadcrumb categories come from microdata;
    the variant matrix is read from a JSON blob in a hidden form input.
    """
    mde = MicrodataExtractor()
    try:
        micro_data = mde.extract(response.body)['items']
        # NOTE(review): filter(...)[0] only works on Python 2 (py3 filter is
        # lazy) — this module appears to be py2 (html_parser.unescape below).
        gen_data = filter(lambda a: a['type'] == 'http://schema.org/Product', micro_data)[0]['properties']
        # Breadcrumb titles minus the first element (assumed to be "Home").
        categories = [c['properties']['title'] for c in filter(lambda d: d['type'] == 'http://data-vocabulary.org/Breadcrumb', micro_data)][1:]
    except:  # noqa: bare except kept — any failure means this isn't a product page
        self.log('WARNING => Wrong product page in %s' % response.url)
        return
    # Microdata may store these as single values or lists; take the first.
    main_name = gen_data['name']
    if isinstance(main_name, list):
        main_name = main_name[0]
    main_brand = gen_data.get('brand', '')
    if isinstance(main_brand, list):
        main_brand = main_brand[0]
    # Variant matrix lives HTML-escaped inside a hidden ASP.NET input.
    variants = response.xpath('//input[@name="ctl00$cphMain$ctl00$hidProductVariants"]/@value').extract()
    if variants:
        data = json.loads(self.html_parser.unescape(variants[0]))
        for d in data:
            for var in d['Variants']:
                for size_data in var['Variants']:
                    color_name = size_data.get('Article', dict()).get('ColorName', '')
                    size_data = size_data['Article']
                    url = self.product_url % size_data
                    identifier = size_data['ItemOfferId']
                    name = main_name + ', ' + color_name + ', ' + size_data['FriendlySize']
                    price = size_data['WebInfo']['ArticlePriceDisplay']['FormattedSalePriceAfterWithCharges']
                    shipping_cost = size_data['FormattedDeliveryFee']
                    loader = ProductLoader(item=Product(), response=response)
                    loader.add_value('name', name)
                    loader.add_value('url', url)
                    loader.add_value('identifier', identifier)
                    loader.add_value('sku', size_data['ProductId'])
                    loader.add_value('price', extract_price_eu(price))
                    if shipping_cost:
                        loader.add_value('shipping_cost', extract_price_eu(shipping_cost))
                    loader.add_value('image_url', gen_data['image'][-1])
                    # Availability code 'L' appears to mean in stock — TODO confirm.
                    if size_data['AvailabilityCode'] != 'L':
                        loader.add_value('stock', 0)
                    loader.add_value('category', categories)
                    if main_brand:
                        loader.add_value('brand', main_brand)
                    yield loader.load_item()
    else:
        self.log('WARNING: Variants not found in => %s' % response.url)
def parse_item(self, response):
    """Collect microdata (converted to a JSON-LD-like shape) and JSON-LD items;
    yield items belonging to this URL, and follow items pointing elsewhere.
    """
    items = []

    def microdata2jsonld(md):
        # Flatten a microdata item to its properties dict, carrying the
        # itemtype over as '@type'. Returns None for items with no properties.
        if md.get('properties'):
            item = md['properties']
            item['@type'] = md.get('type')
            return item

    items += map(microdata2jsonld, MicrodataExtractor().extract(
        response.body_as_unicode(), response.url)['items'])
    items += JsonLdExtractor().extract(
        response.body_as_unicode(), response.url)['items']
    if not items:
        self.logger.debug("No Microdata items found for %s", response.url)
    self.logger.debug("Checking URL for item: %s", items)
    for item in items:
        # microdata2jsonld may have produced None entries; skip those and
        # any item without a canonical URL.
        if not item or not item.get('url'):
            self.logger.debug("No URL for item: %s", item)
            continue
        if item['url'] != response.url:
            # Item canonically lives on another page — crawl it there instead.
            self.logger.debug("Not in main URL, go there..")
            yield Request(item['url'], callback=self.parse_item)
        else:
            item['@type'] = item.get('type')
            self.logger.debug("Parsed microdata: %s" % item)
            yield item
def async_extruct(url, microdata=True, jsonld=True):
    """Fetch *url* and return its microdata and/or JSON-LD as a JSON-able dict.

    Keys: 'url', 'status', and (when enabled) 'microdata' and 'json-ld'.
    """
    # `response` is a global here (Bottle-style thread-local response object,
    # presumably) — TODO confirm the web framework in use.
    response.content_type = 'application/json'
    resp = requests.get(url, timeout=30)
    # Parse with the encoding reported by the HTTP response.
    parser = lxml.html.HTMLParser(encoding=resp.encoding)
    lxmldoc = lxml.html.fromstring(resp.content, parser=parser)
    result = {'url': url, 'status': 'ok'}
    if microdata:
        mde = MicrodataExtractor(nested=True)
        result['microdata'] = mde.extract_items(lxmldoc, url)
    if jsonld:
        jsonlde = JsonLdExtractor()
        result['json-ld'] = jsonlde.extract_items(lxmldoc)
    return result
def extract(htmlstring, url='http://www.example.com/', encoding="UTF-8"):
    """Parse *htmlstring* once and run the JSON-LD, microdata and RDFa
    extractors over it, returning {syntax_name: extracted_items}."""
    parser = XmlDomHTMLParser(encoding=encoding)
    document = fromstring(htmlstring, parser=parser)
    extractors = (
        ('json-ld', JsonLdExtractor()),
        ('microdata', MicrodataExtractor()),
        ('rdfa', RDFaExtractor()),
    )
    results = {}
    for syntax, extractor in extractors:
        results[syntax] = extractor.extract_items(document, url=url)
    return results
def parse_item(self, response):
    """Parse the recipe to get title and ingredients.

    Microdata is tried first; if the page exposes no microdata items,
    JSON-LD is used instead. Yields one populated RecipeItem.
    """
    schema_type = "mde"
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    if len(data['items']) == 0:
        jslde = JsonLdExtractor()
        data = jslde.extract(response.body)
        schema_type = "jsonld"
    if schema_type == "mde":
        # NOTE(review): hard-coded index [2] assumes the recipe is the third
        # microdata item on the page — verify against the target site.
        recipe = data['items'][2]['properties']
        recipe_output_item = RecipeItem()
        recipe_output_item['recipe_name'] = recipe['name']
        # Drop empty entries and the site's "Add all ingredients" pseudo-row.
        recipe_output_item['ingredients'] = [
            ingredient for ingredient in recipe['ingredients']
            if ingredient not in ['', 'Add all ingredients to list']
        ]
        recipe_tags = recipe['recipeCategory']
        if 'recipeCuisine' in recipe.keys():
            recipe_tags.append(recipe['recipeCuisine'])
        recipe_output_item['tags'] = recipe_tags
        try:
            recipe_output_item['description'] = recipe['description']
        except KeyError:
            recipe_output_item['description'] = None
        recipe_output_item['url'] = recipe['url']
    elif schema_type == "jsonld":
        recipe = data['items'][0]
        recipe_output_item = RecipeItem()
        recipe_output_item['recipe_name'] = recipe['name']
        recipe_output_item['ingredients'] = recipe['ingredients']
        # Remaining items are assumed to be tag entries — TODO confirm.
        recipe_output_item['tags'] = [
            tag['properties']['title'] for tag in data['items'][1:]
        ]
        try:
            recipe_output_item['description'] = recipe['description']
        except KeyError:
            recipe_output_item['description'] = None
        recipe_output_item['url'] = recipe['url']
    yield recipe_output_item
def parse_item(self, response):
    """Parse the recipe to get title and ingredients.

    Microdata is tried first; if the page exposes no microdata items,
    JSON-LD is used instead. Yields one populated RecipeItem.
    """
    schema_type = "mde"
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    if len(data['items']) == 0:
        jslde = JsonLdExtractor()
        data = jslde.extract(response.body)
        schema_type = "jsonld"
    if schema_type == "mde":
        # NOTE(review): hard-coded index [2] assumes the recipe is the third
        # microdata item on the page — verify against the target site.
        recipe = data['items'][2]['properties']
        recipe_output_item = RecipeItem()
        recipe_output_item['recipe_name'] = recipe['name']
        # Drop empty entries and the site's "Add all ingredients" pseudo-row.
        recipe_output_item['ingredients'] = [
            ingredient for ingredient in recipe['ingredients']
            if ingredient not in ['', 'Add all ingredients to list']
        ]
        recipe_tags = recipe['recipeCategory']
        if 'recipeCuisine' in recipe.keys():
            recipe_tags.append(recipe['recipeCuisine'])
        recipe_output_item['tags'] = recipe_tags
        try:
            recipe_output_item['description'] = recipe['description']
        except KeyError:
            recipe_output_item['description'] = None
        recipe_output_item['url'] = recipe['url']
    elif schema_type == "jsonld":
        recipe = data['items'][0]
        recipe_output_item = RecipeItem()
        recipe_output_item['recipe_name'] = recipe['name']
        recipe_output_item['ingredients'] = recipe['ingredients']
        # Remaining items are assumed to be tag entries — TODO confirm.
        recipe_output_item['tags'] = [tag['properties']['title']
                                      for tag in data['items'][1:]]
        try:
            recipe_output_item['description'] = recipe['description']
        except KeyError:
            recipe_output_item['description'] = None
        recipe_output_item['url'] = recipe['url']
    yield recipe_output_item
def async_extruct(url, microdata=True, jsonld=True):
    """Fetch *url* and return its microdata and/or JSON-LD as a dict.

    Syntax keys are only included when that syntax produced items.
    """
    resp = requests.get(url, timeout=30)
    # Parse with the encoding reported by the HTTP response.
    parser = lxml.html.HTMLParser(encoding=resp.encoding)
    lxmldoc = lxml.html.fromstring(resp.content, parser=parser)
    result = {"url": url, "status": "ok"}
    if microdata:
        mde = MicrodataExtractor(nested=True)
        # Note: rebinds the `microdata` parameter to the extraction result.
        microdata = mde.extract_items(lxmldoc, url)
        if microdata.get("items", []):
            result["microdata"] = microdata
    if jsonld:
        jsonlde = JsonLdExtractor()
        jsonldata = jsonlde.extract_items(lxmldoc)
        if jsonldata.get("items", []):
            result["json-ld"] = jsonldata
    return result
def parse_listing(self, response):
    """Scrape a single listing page into one flat dict, combining microdata
    (shop, product, offer details) with CSS-scraped page fields."""
    mde = MicrodataExtractor()
    data = mde.extract(response.body)['items']
    if data:
        it = {}
        # Item [0] assumed to be the shop, item [1] the product — TODO confirm.
        it['shop'] = data[0]['properties']
        prod = data[1]['properties']
        # Flatten offer details (price, currency, ...) into the top level.
        it.update(prod['offerDetails']['properties'])
        it['name'] = prod['name']
        it['url'] = response.url
        # Overview bullet points, minus boilerplate rows.
        it['properties'] = [x for x in response.css('#item-overview .properties li::text').extract() \
            if all(y not in x.lower() for y in ['materials','feedback', 'favorited', 'ships'])]
        it['materials'] = e0(response.css('#overview-materials::text'))
        it['origin'] = e0(response.css('.origin::text'))
        it['imgs'] = response.css('#image-carousel img::attr("src")').extract()
        it['description'] = e0(response.css("#description-text"))
        it['tags'] = response.css('#listing-tag-list li a::text').extract()
        # Only the first four fine-print rows are kept.
        it['fineprints'] = [x.strip() for x in response.css('#fineprint li::text').extract()[:4]]
        it['rating'] = response.css('.review-rating meta::attr("content")').extract()
        #it['html'] = response.body
        yield it
def async_extruct(url, microdata=True, jsonld=True):
    """Fetch *url* and return its microdata and/or JSON-LD as a JSON response
    payload. Syntax keys are only included when that syntax produced items.
    """
    # `response` is a global here (Bottle-style thread-local response object,
    # presumably) — TODO confirm the web framework in use.
    response.content_type = 'application/json'
    resp = requests.get(url, timeout=30)
    # Parse with the encoding reported by the HTTP response.
    parser = lxml.html.HTMLParser(encoding=resp.encoding)
    lxmldoc = lxml.html.fromstring(resp.content, parser=parser)
    result = {'url': url, 'status': 'ok'}
    if microdata:
        mde = MicrodataExtractor(nested=True)
        # Note: rebinds the `microdata` parameter to the extraction result.
        microdata = mde.extract_items(lxmldoc, url)
        if microdata.get('items', []):
            result['microdata'] = microdata
    if jsonld:
        jsonlde = JsonLdExtractor()
        jsonldata = jsonlde.extract_items(lxmldoc)
        if jsonldata.get('items', []):
            result['json-ld'] = jsonldata
    return result
def extract(self, html_text: str, extract_title: bool = False,
            extract_meta: bool = False,
            extract_microdata: bool = False,
            extract_json_ld: bool = False,
            extract_rdfa: bool = False) \
        -> List[Extraction]:
    """Extract the requested structured-data flavours from *html_text*.

    Each enabled flavour contributes one wrapped entry ("title", "meta",
    "microdata", "json-ld", "rdfa") to the returned list.
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    results = list()
    # NOTE(review): both title and meta extraction require a <title> tag to
    # be present — preserved as-is; confirm this gating is intentional.
    if soup.title and extract_title:
        normalized_title = soup.title.string.encode('utf-8').decode('utf-8')
        results.append(self.wrap_data("title", normalized_title))
    if soup.title and extract_meta:
        wrapped_meta = self.wrap_meta_content(soup.find_all("meta"))
        results.append(self.wrap_data("meta", wrapped_meta))
    if extract_microdata:
        results.append(
            self.wrap_data("microdata", MicrodataExtractor().extract(html_text)))
    if extract_json_ld:
        results.append(
            self.wrap_data("json-ld", JsonLdExtractor().extract(html_text)))
    if extract_rdfa:
        results.append(
            self.wrap_data("rdfa", RDFaExtractor().extract(html_text)))
    return results
def get_review_items_from_microdata(spider, review_type, response, product,
                                    reviews_xpath=None, pros_xpath=None,
                                    cons_xpath=None):
    '''
    Get all reviews from a page, useful for user review pages with microdata
    :param spider: the spider we use to scrape the site
    :param review_type: type of the reviews to scrape, should be either USER or PRO
    :param response: an instance of Scrapy's Response object where reviews will be scraped from
    :param product: the product item the reviews are written for
    :param reviews_xpath: the xpath to extract review selectors from 'response'
    :param pros_xpath: the xpath to extract pros from review selectors
    :param cons_xpath: the xpath to extract cons from review selectors
    :return: list of all review items extracted
    '''
    mde = MicrodataExtractor()
    try:
        items = mde.extract(response.text)
    except XMLSyntaxError:
        return []  # Nothing to do here...
    # Keep only schema.org Review items.
    # NOTE(review): iterating `items` directly assumes extract() returns a
    # list of items (newer extruct); older versions returned a dict — confirm.
    all_review_extracts = [
        i for i in items if i['type'] == "http://schema.org/Review"
    ]
    all_pros = []
    all_cons = []
    if reviews_xpath:
        add_pros_and_cons = True
        all_reviews = response.xpath(reviews_xpath)
        # Scrape pros/cons positionally, one pair per review selector.
        for single_review in all_reviews:
            if pros_xpath:
                pros = spider.extract_all(single_review.xpath(pros_xpath),
                                          separator=' ; ')
            else:
                pros = ''
            if cons_xpath:
                cons = spider.extract_all(single_review.xpath(cons_xpath),
                                          separator=' ; ')
            else:
                cons = ''
            all_pros.append(pros)
            all_cons.append(cons)
        # Positional matching only works when counts line up; otherwise skip
        # pros/cons rather than attach them to the wrong reviews.
        if len(all_pros) != len(all_review_extracts) or len(all_cons) != len(
                all_review_extracts):
            spider.logger.warning(
                "Number of reviews extracted from xpath is different from number of review microdata."
            )
            add_pros_and_cons = False
    else:
        add_pros_and_cons = False
    review_items = []
    for index, item in enumerate(all_review_extracts):
        if add_pros_and_cons:
            review = review_microdata_extruct(item, product=product,
                                              tp=review_type,
                                              pros=all_pros[index],
                                              cons=all_cons[index])
        else:
            review = review_microdata_extruct(item, product=product,
                                              tp=review_type)
        review_items.append(review)
    return review_items
class RISJMetadataExtractor(object): """An extruct-based metadata extractor""" # TODO: Extend to microdata and RDFa, replacing bespoke xpath code. Then # test on body of crawlers! def __init__(self, response, microdata=False, jsonld=False, rdfa=False): self.response = response self.microdata = microdata self.jsonld = jsonld self.rdfa = rdfa if rdfa: try: self.rdfae = RDFaExtractor() self.rdfadata = self.rdfae.extract(self.response.text, url=self.response.url) except JSONDecodeError: pass if microdata: try: self.mde = MicrodataExtractor() self.mdedata = self.mde.extract(self.response.text) except JSONDecodeError: pass if jsonld: try: self.jlde = JsonLdExtractor() self.jldata = self.jlde.extract(self.response.text) except (JSONDecodeError, TypeError): self.jldata = [] finally: # Sometimes we get this in the meta dict from RISJExtractJSONLD self.jldata.extend(self.response.meta.get('json-ld', [])) def extract_newsarticle_schemaorg(self, microdata=None, jsonld=None, rdfa=None): """Extract schema.org NewsArticle metadata, encoded using any supported metadata format. Note that we only try to extract the *first* block of NewsArticle data for each method (which is then combined with the first extracted from other methods if more than one is selected.""" if microdata is None: microdata = self.microdata if jsonld is None: jsonld = self.jsonld if rdfa is None: rdfa = self.rdfa outd = {} if jsonld: for d in self.jldata: # logger.debug('Analysing JSON-LD data: '+pformat(d)) try: if (re.match(r'https?://schema.org/?', d['@context']) and d['@type'] == 'NewsArticle'): outd.update(d) except (KeyError, TypeError): continue if microdata: for d in self.mdedata: logger.debug('Analysing W3C microdata: ' + pformat(d)) if re.match(r'https?://schema.org/NewsArticle/?', d.get('type', '')): outd.update(d) if rdfa: raise NotImplementedError # logger.debug('Returning schema.org NewsArticle: '+pformat(outd)) return outd
def extract(htmlstring, base_url=None, encoding="UTF-8",
            syntaxes=SYNTAXES, errors='strict', uniform=False,
            return_html_node=False, schema_context='http://schema.org',
            with_og_array=False, **kwargs):
    """htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
               or 'strict'(default) to raise them
       uniform: if True uniform output format of all syntaxes to a list of
                dicts. Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* All other the properties in keys here */
                }
       return_html_node: if True, it includes into the result a HTML node of
                         respective embedded metadata under 'htmlNode' key.
                         The feature is supported only by microdata syntax.
                         Each node is of `lxml.etree.Element` type.
       schema_context: schema's context for current page"""
    # Accept the deprecated 'url' keyword as an alias for base_url.
    if base_url is None and 'url' in kwargs:
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning, stacklevel=2)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES
                                               for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    # Parse once; all tree-based extractors share this document.
    try:
        tree = parse_xmldom_html(htmlstring, encoding=encoding)
    except Exception as e:
        if errors == 'ignore':
            return {}
        if errors == 'log':
            logger.exception('Failed to parse html, raises {}'.format(e))
            return {}
        if errors == 'strict':
            raise
    # Each processor: (syntax name, bound extract_items callable, input).
    # Microformat works on the raw string; the rest use the parsed tree.
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata',
             MicrodataExtractor(add_html_node=return_html_node).extract_items,
             tree))
    if 'json-ld' in syntaxes:
        processors.append((
            'json-ld',
            JsonLdExtractor().extract_items,
            tree,
        ))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append((
            'rdfa',
            RDFaExtractor().extract_items,
            tree,
        ))
    if 'dublincore' in syntaxes:
        processors.append((
            'dublincore',
            DublinCoreExtractor().extract_items,
            tree,
        ))
    output = {}
    for syntax, extract, document in processors:
        try:
            output[syntax] = list(extract(document, base_url=base_url))
        except Exception as e:
            # Per-syntax failures honour the requested error policy.
            if errors == 'log':
                logger.exception('Failed to extract {}, raises {}'.format(
                    syntax, e))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise
    if uniform:
        # Second pass: normalize each syntax's raw output to a uniform
        # list-of-dicts shape via its dedicated uniformer.
        uniform_processors = []
        if 'microdata' in syntaxes:
            uniform_processors.append((
                'microdata',
                _umicrodata_microformat,
                output['microdata'],
                schema_context,
            ))
        if 'microformat' in syntaxes:
            uniform_processors.append((
                'microformat',
                _umicrodata_microformat,
                output['microformat'],
                'http://microformats.org/wiki/',
            ))
        if 'opengraph' in syntaxes:
            uniform_processors.append((
                'opengraph',
                _uopengraph,
                output['opengraph'],
                None,
            ))
        if 'dublincore' in syntaxes:
            uniform_processors.append((
                'dublincore',
                _udublincore,
                output['dublincore'],
                None,
            ))
        for syntax, uniform, raw, schema_context in uniform_processors:
            try:
                # opengraph/dublincore uniformers have distinct signatures.
                if syntax == 'opengraph':
                    output[syntax] = uniform(raw, with_og_array=with_og_array)
                elif syntax == 'dublincore':
                    output[syntax] = uniform(raw)
                else:
                    output[syntax] = uniform(raw, schema_context)
            except Exception as e:
                if errors == 'ignore':
                    output[syntax] = []
                if errors == 'log':
                    output[syntax] = []
                    logger.exception(
                        'Failed to uniform extracted for {}, raises {}'.format(
                            syntax, e))
                if errors == 'strict':
                    raise
    return output
def parse(self, response):
    """Extract all microdata items from the page and print them.

    Fix: the original used the Python-2-only statement form `print items`;
    `print(items)` is valid on both Python 2 and 3.
    """
    selector = Selector(response=response)  # NOTE(review): unused — kept in case of side effects; consider removing
    extractor = MicrodataExtractor()
    items = extractor.extract(response.body_as_unicode(), response.url)
    print(items)
def extract(htmlstring, base_url=None, encoding="UTF-8",
            syntaxes=SYNTAXES, errors='strict', uniform=False,
            schema_context='http://schema.org', **kwargs):
    """htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
               or 'strict'(default) to raise them
       uniform: if True uniform output format of all syntaxes to a list of
                dicts. Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* All other the properties in keys here */
                }
       schema_context: schema's context for current page"""
    # Accept the deprecated 'url' keyword as an alias for base_url.
    if base_url is None and 'url' in kwargs:
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES
                                               for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    # Parse once; all tree-based extractors share this document.
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    # Each processor: (syntax name, bound extract_items callable, input).
    # Microformat works on the raw string; the rest use the parsed tree.
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata', MicrodataExtractor().extract_items, tree))
    if 'json-ld' in syntaxes:
        processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append(('rdfa', RDFaExtractor().extract_items, tree))
    output = {}
    for label, extract, document in processors:
        try:
            output[label] = list(extract(document, base_url=base_url))
        except Exception:
            # Per-syntax failures honour the requested error policy.
            if errors == 'log':
                logger.exception('Failed to extract {}'.format(label))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise
    if uniform:
        # Normalize raw outputs to a uniform list-of-dicts shape.
        if 'microdata' in syntaxes:
            output['microdata'] = _umicrodata_microformat(
                output['microdata'], schema_context=schema_context)
        if 'microformat' in syntaxes:
            output['microformat'] = _umicrodata_microformat(
                output['microformat'],
                schema_context='http://microformats.org/wiki/')
        if 'opengraph' in syntaxes:
            output['opengraph'] = _uopengraph(output['opengraph'])
    return output
import os.path import re import io, shutil from extruct.w3cmicrodata import MicrodataExtractor import sys from subprocess import * from shlex import split from pprint import pprint as pp from zipfile import ZipFile, BadZipFile from bs4 import BeautifulSoup mde = MicrodataExtractor() def microdata(html): microdata = mde.extract(html) microdata = microdata['items'][0]['properties'] def attrget(item, key): keys = key.split('.') for key in keys: item = item.get(key, {}) if item == {}: return None return item keys = ('url', 'name', 'version', 'aggregateRating.properties.ratingCount', 'aggregateRating.properties.ratingValue', 'image',