def main():
    url = "http://matfystutor.dk/rus/holdtutorer/"
    r = requests.get(url)
    item, = microdata.get_items(r.text)
    year = item.year
    rusclass_list = []
    for rusclass in item.get_all('rusclass'):
        tutors = []
        for tutor in rusclass.get_all("tutor"):
            name = tutor.name
            phone = strip_prefix(str(tutor.phone), "tel:")
            tutors.append({
                'name': name,
                'phone': phone,
            })
        rusclass_list.append({
            'name': rusclass.name,
            'tutors': tutors,
        })
    template_name = 'templates/rusclass/tutorhold.tex'
    context = {'rusclass_list': rusclass_list, 'year': year}
    print(render_to_string(template_name, context))
def parse(self, source, sink, **kwargs):
    """
    Pass in a file or file-like object containing html5 microdata
    and populate the sink graph with triples.
    """
    for item in microdata.get_items(source.getByteStream()):
        self._add_item(item, sink)
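# A minimal sketch (not from the source) of how a parser plugin like the one
# above is typically wired into rdflib; the module path "mdparser" and class
# name "MicrodataParser" are assumed names for illustration only.
import rdflib
from rdflib.parser import Parser

rdflib.plugin.register("microdata", Parser, "mdparser", "MicrodataParser")

g = rdflib.Graph()
g.parse("page.html", format="microdata")  # parse() above fills g with triples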
def update_price(self):
    # Check if this is an Amazon product.
    if self.distributor.name == 'Amazon':
        amazon = AmazonAPI(settings.AWS_ACCESS_KEY_ID,
                           settings.AWS_SECRET_ACCESS_KEY,
                           settings.AWS_ASSOCIATE_TAG)
        try:
            product = amazon.lookup(ItemId=self.part.asin)
            price = product.price_and_currency
            return price[0]
        except:
            pass
    else:
        import urllib
        import urllib2
        from lxml import etree
        import microdata

        # First try schema.org microdata embedded in the page.
        items = microdata.get_items(urllib.urlopen(self.url))
        for i in items:
            if i.offers:
                return "%s (md)" % i.offers.price.strip().replace("$", "")
        # Fall back to a configured XPath expression.
        html = urllib2.urlopen(self.url).read()
        tree = etree.HTML(html)
        price = tree.xpath("%s/text()[1]" % self.xpath)
        try:
            return "%s (xp)" % price[0].replace("$", "")
        except:
            return "N/A"
def get_from_html(html_text, url):
    soup = BeautifulSoup(html_text, "html.parser")
    # First try ld+json, as it is the most common format.
    for ld in soup.find_all('script', type='application/ld+json'):
        try:
            ld_json = json.loads(ld.string.replace('\n', ''))
            if type(ld_json) != list:
                ld_json = [ld_json]
            for ld_json_item in ld_json:
                # Recipe objects might be wrapped in an @graph container.
                if '@graph' in ld_json_item:
                    for x in ld_json_item['@graph']:
                        if '@type' in x and x['@type'] == 'Recipe':
                            ld_json_item = x
                if '@type' in ld_json_item and ld_json_item['@type'] == 'Recipe':
                    return find_recipe_json(ld_json_item, url)
        except JSONDecodeError:
            return JsonResponse(
                {
                    'error': True,
                    'msg': _('The requested site provided malformed data and cannot be read.')
                },
                status=400)
    # Now fall back to microdata.
    items = microdata.get_items(html_text)
    for i in items:
        md_json = json.loads(i.json())
        if 'schema.org/Recipe' in str(md_json['type']):
            return find_recipe_json(md_json['properties'], url)
    return JsonResponse(
        {
            'error': True,
            'msg': _('The requested site does not provide any recognized data format to import the recipe from.')
        },
        status=400)
def parseMicrodata(url):
    req = urllib2.Request(url)
    try:
        response = urllib2.urlopen(req)
    except urllib2.URLError as err:
        logger.error("Error while fetching %s: %s" % (url, err.reason))
        raise
    items = microdata.get_items(response)
    event_list = []
    for ev in filter(
            lambda x: microdata.URI("http://schema.org/Event") in x.itemtype,
            items):
        start = datetime.strptime(ev.startdate, "%Y-%m-%dT%H:%M:%SZ")
        start = start.replace(tzinfo=timezone("UTC")).astimezone(tz)
        if ev.enddate:
            # Parse the end date; fall back to one hour after the start.
            end = datetime.strptime(ev.enddate, "%Y-%m-%dT%H:%M:%SZ")
            end = end.replace(tzinfo=timezone("UTC"))
        else:
            end = start + timedelta(hours=1)
        event_data = {
            "title": ev.name,
            "description": ev.name,
            "start": start.strftime(dt_format),
            "end": end.strftime(dt_format),
            "location": ev.location.name,
            "url": urljoin(url, str(ev.url)),
        }
        event_list.append(event_data)
    return event_list
def bookmarklet(self, request):
    """
    Fetches the recipe for the url, saves the recipe, and returns a
    response to the chrome extension.
    """
    u = ICurrentUser(request)
    url = request.args['uri'][0]
    pageSource = yield treq.get(url).addCallback(treq.content)
    items = microdata.get_items(pageSource)
    recipesSaved = []
    for i in items:
        itemTypeArray = [x.string for x in i.itemtype]
        if RECIPE_SCHEMA in itemTypeArray:
            recipe = i
            saveItem = Recipe.fromMicrodata(recipe, u.email)
            Recipe.saveOnlyOnce(saveItem)
            recipesSaved.append({
                "name": saveItem.name,
                "urlKey": saveItem.urlKey,
            })
            break
    if len(recipesSaved) == 0:
        defer.returnValue(
            ClipResponse(status=RS.error, message=ResponseMsg.noRecipe))
    defer.returnValue(ClipResponse(status=RS.ok, recipes=recipesSaved))
def extract_microdata_from_html(html_str):
    import microdata
    try:
        items = microdata.get_items(html_str)
        return [item.json_dict() for item in items]
    except Exception as e:
        return [{'extraction_error': str(e)}]
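# Hypothetical driver for extract_microdata_from_html above; the inline HTML
# is illustrative only.
html = ('<div itemscope itemtype="http://schema.org/Person">'
        '<span itemprop="name">Jane Doe</span></div>')
for d in extract_microdata_from_html(html):
    print(d)  # {'type': ['http://schema.org/Person'], 'properties': {...}}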
def test_parse_multiple_props(self):
    with open("test-data/multiple-props.html") as f:
        items = get_items(f)
    self.assertEqual(len(items), 2)

    item = items[0]
    i = json.loads(item.json())

    # both names "John Doe" and "Jane Dun" should appear under the
    # author and creator props
    self.assertEqual(
        len(i["properties"]["author"][0]["properties"]["name"]), 2)
    self.assertEqual(i["properties"]["author"][0]["properties"]["name"],
                     ["John Doe", "Jane Dun"])
    self.assertEqual(
        len(i["properties"]["creator"][0]["properties"]["name"]), 2)
    self.assertEqual(i["properties"]["creator"][0]["properties"]["name"],
                     ["John Doe", "Jane Dun"])

    # nested multiple props
    self.assertEqual(item.author.affiliation.name, "Stanford University")
    self.assertEqual(item.creator.affiliation.name, "Stanford University")
    self.assertEqual(item.author.alumniOf.name, "Stanford University")
    self.assertEqual(item.creator.alumniOf.name, "Stanford University")

    item = items[1]
    i = json.loads(item.json())

    # test case for original issue #3
    self.assertEqual(i["properties"]["favorite-color"][0], "orange")
    self.assertEqual(i["properties"]["favorite-fruit"][0], "orange")
def get_comments(recipe=None):
    url = recipe['source_url']
    title = recipe['title']
    items = microdata.get_items(urllib.urlopen(url))
    for item in items:
        if len(item.get_all('ingredients')) > 0:
            try:
                ingredients = [' '.join(i.replace('\n', '').split())
                               for i in item.get_all('ingredients')]
            except TypeError:
                return None
            for ing in ingredients:
                found = False
                ings = ing.split(' ')
                for i, word in enumerate(ings):
                    for m in measurements:
                        if word.startswith(m):
                            # Drop everything up to and including the measurement.
                            ings = ings[i + 1:]
                            found = bool(ings)
                if found:
                    final = ' '.join(ings)
                    mesg = u"{}: {} {} with {}. {} ".format(
                        title,
                        random.sample(starts, 1)[0],
                        final,
                        random.sample(foods, 1)[0],
                        random.sample(comments, 1)[0],
                    )
                    if len(mesg) > (140 - 23 - 23):
                        # This message is too long for a tweet with two links.
                        print "Message was too long; retrying"
                        return None
                    else:
                        return mesg + url
def find_rating(self, title):
    tt_uri = self.IMDB_TITLE_URI % title
    try:
        page = microdata.get_items(urlopen(tt_uri))
        return page[0].aggregateRating.ratingValue
    except (AttributeError, IndexError) as e:
        self.l.debug("Parsed microdata content: " + str(page))
        self.l.error("Error parsing IMDB microdata: " + str(e))
def get(htmldoc):
    """Get page data."""
    data = {}
    items = microdata.get_items(htmldoc)
    movie_item = items[0]
    data['microdata'] = movie_item
    return data
def get_comments_from_article(guardian_article_url):
    read_url = urllib.urlopen(guardian_article_url)
    microdata_entities = microdata.get_items(read_url)
    entities = [json.loads(entity.json()) for entity in microdata_entities]
    comments = [
        entity['properties'] for entity in entities
        if 'http://schema.org/Comment' in entity['type']
    ]
    return comments
def from_microdata(content):
    result = []
    for item in microdata.get_items(content):
        if item.itemtype == [microdata.URI('http://schema.org/JobPosting')]:
            job_posting = JobPosting()
            job_posting.title = item.title
            job_posting._original_format = 'microdata'
            result.append(job_posting)
    return result
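# Hypothetical usage of from_microdata above; the inline HTML is illustrative
# and the JobPosting class is whatever the surrounding module defines.
html = ('<div itemscope itemtype="http://schema.org/JobPosting">'
        '<span itemprop="title">Data Engineer</span></div>')
for posting in from_microdata(html):
    print(posting.title)  # "Data Engineer"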
def download(self, url):
    self.url = url
    items = microdata.get_items(urllib.request.urlopen(self.url))
    item = items[0]
    self.set_text(item.articleBody)
    self.set_title(item.alternativeHeadline)
    self.set_thumbnailUrl(item.thumbnailUrl)
    # self.set_summary(item.articleBody)
    self.json = item.json()
def microdata_extract(url_or_resource, itemprop):
    """
    Extracts a value via microdata scraping.

    Make sure to add two scrapers/packages in case one of them fails;
    use try-except around the scraping itself.
    """
    resource = (isinstance(url_or_resource, Resource) and url_or_resource
                or Resource(url_or_resource))
    try:
        # Call get_contents_as_file to get the html contents.
        html_content = resource.get_contents_as_file()
        items = microdata_library.get_items(html_content)
    except ValueError:
        return None
    try:
        # Get json data from the first item.
        item = items[0]
        item_data = item.json()
        json_item_data = json.loads(item_data)
        try:
            itemprop = itemprop.split('/')
        except AttributeError:
            return None
        # Remove empty path segments.
        itemprop = [key for key in itemprop if key]
        itemprop_list = []
        for tag in itemprop:
            try:
                # Numeric segments index into lists.
                itemprop_list.append(int(tag))
            except ValueError:
                itemprop_list.append(tag)
        try:
            def f(iterable, key):
                return iterable[key]
            # Reduce the json data down to the requested value.
            return reduce(f, itemprop_list, json_item_data)
        except (IndexError, KeyError):
            return None
    # There was no microdata in the html content.
    except IndexError:
        return None
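# Illustrative call; the URL is hypothetical and Resource comes from the
# surrounding module. The slash-separated itemprop path is reduced over the
# item's parsed json, so numeric segments index lists and the rest are dict
# keys: "properties/name/0" would return the first name value.
value = microdata_extract("https://example.com/movie.html",
                          "properties/name/0")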
def scrape_recipe(recipe_url):
    if recipe_exists(recipe_url):
        # print "Not Scraping " + recipe_url + ", already exists"
        return True
    else:
        # print "Scraping recipe microdata: " + recipe_url
        items = microdata.get_items(urllib.urlopen(recipe_url))
        for item in items:
            print "Scraping: " + item.name + " from " + recipe_url
            recipe_model = {
                "url": recipe_url,
                "name": item.name,
                "recipe": item.json(),
            }
            scraperwiki.sqlite.save(unique_keys=["url"],
                                    table_name="recipes",
                                    data=recipe_model)
        return True
def parse_content_microdata(parse_url):
    # https://developers.google.com/structured-data/testing-tool/
    # Does not work with
    # http://www.bonprix.de/produkt/maxi-jerseykleid-dunkelblau-bedruckt-958483/
    # even though Google considers its microformat markup valid.
    print parse_url
    items = microdata.get_items(urllib.urlopen(parse_url))
    data = [i.json_dict() for i in items]
    pp.pprint(data)
    return data
def scrapSoftwareApplicationSchema(self, html):
    # Extract microdata items from the html.
    items = microdata.get_items(html)
    softwareApps = []
    for item in items:
        # From all itemscope elements, keep only SoftwareApplication items.
        for itemtype in item.itemtype:
            if itemtype.string == SoftwareApp.ENTITY_TYPE:
                props = self._getEntityPropertyValuesFromMicroItem(item)
                softwareApp = SoftwareApp(props)
                softwareApps.append(softwareApp)
                break
    return softwareApps
def load(self):
    """Retrieves the data for this object from the WikiTree server.
    This happens automatically when any of the properties are accessed.

    >>> p = Person('Sloan-518')
    >>> p.load()
    """
    items = microdata.get_items(urllib.request.urlopen(self.url))
    data = items[0].json_dict()['properties']
    self.__dict__ = self.__process_microdata__(None, data)
    self.__data__ = data
    self.__loaded__ = True
def test_parse_nested(self):
    # parse the html for microdata
    with open("test-data/example-nested.html") as f:
        items = get_items(f)

    # this html should have just one main item
    self.assertEqual(len(items), 1)

    item = items[0]

    # item's type should be set
    self.assertEqual(item.itemtype, [URI("http://schema.org/Event")])

    # test case of a nested itemprop
    self.assertEqual(
        item.name.strip(),
        "Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1)")

    # test case of a nested itemscope
    self.assertTrue(isinstance(item.location, Item))
    self.assertEqual(item.location.itemtype,
                     [URI("http://schema.org/Place")])
    self.assertEqual(item.location.url, URI("wells-fargo-center.html"))

    # address should be a nested item
    self.assertTrue(isinstance(item.location.address, Item))
    self.assertEqual(item.location.address.itemtype,
                     [URI("http://schema.org/PostalAddress")])
    self.assertEqual(item.location.address.addressLocality, "Philadelphia")

    # json
    i = json.loads(item.json())
    self.assertEqual(
        i["properties"]["name"][0].strip(),
        "Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1)")
    self.assertEqual(i["type"], ["http://schema.org/Event"])
    self.assertEqual(i["properties"]["url"],
                     ["nba-miami-philidelphia-game3.html"])
    self.assertTrue(isinstance(i["properties"]["location"][0], dict))
    self.assertEqual(
        i["properties"]["location"][0]["properties"]["url"][0],
        "wells-fargo-center.html")
    self.assertTrue(
        isinstance(
            i["properties"]["location"][0]["properties"]["address"][0],
            dict))
    self.assertEqual(
        i["properties"]["location"][0]["properties"]["address"][0]
        ["properties"]["addressLocality"][0], "Philadelphia")
def get_article_body(nytimes_article_url):
    try:
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(nytimes_article_url)
        html = response.read()
        microdata_entities = microdata.get_items(html)
        entities = [json.loads(entity.json())
                    for entity in microdata_entities]
        body = []
        for entity in entities:
            body += entity[u'properties'][u'articleBody']
    except:
        return []
    return body
def get_article_body(guardian_article_url):
    try:
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(guardian_article_url)
        html = response.read()
        microdata_entities = microdata.get_items(html)
        entities = [json.loads(entity.json())
                    for entity in microdata_entities]
        body = []
        for entity in entities:
            if entity[u'type'] == [u'http://schema.org/NewsArticle']:
                return entity[u'properties']
    except:
        return []
    return body
def get_data(url):
    """
    Uses the microdata module to parse the metadata from the provided URL.
    """
    try:
        request = requests.get(url)
        request.raise_for_status()
    except (requests.exceptions.HTTPError,
            requests.exceptions.ConnectionError) as e:
        raise ParseError(e)
    items = microdata.get_items(request.text)
    for item in items:
        if item.itemtype == [microdata.URI("http://schema.org/Recipe")]:
            return item
    raise ParseError("No recipe data found")
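# Example call, assuming a page with schema.org/Recipe microdata at this
# hypothetical URL; get_data returns a microdata.Item on success.
recipe = get_data("https://example.com/some-recipe")
print(recipe.name)
print(recipe.get_all("recipeIngredient"))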
def parseHtml(url):
    """
    Parses the structured data present in the html using the microdata
    package (https://github.com/edsu/microdata) together with the libs
    html5lib (https://github.com/html5lib/html5lib-python) and
    lxml (http://lxml.de/index.html).
    """
    location = urllib.urlopen(url)
    html = lhtml.fromstring(location.read())
    softwareHtmlString = html.get_element_by_id("software")
    items = microdata.get_items(lhtml.tostring(softwareHtmlString))
    if len(items):
        return items[0]
    else:
        return None
def microdata_filter(site_id):
    products = []
    schema_product_type = 'http://schema.org/Product'
    data_file_path = config.URL_CRAWLED_DATA_DIR + str(site_id)
    if not os.path.exists(data_file_path):
        return False, None, None, None
    with open(data_file_path, 'rb') as f:
        encoding = chardet.detect(f.read())['encoding']
        f.seek(0)  # rewind after the chardet pass so get_items sees the data
        items = microdata.get_items(f, encoding)
    if not items:
        return False, None, None, None
    for item in items:
        item = json.loads(item.json())
        if (item.get('type')[0] == schema_product_type
                and item.get('properties').get('offers')):
            product_price = None
            product_currency = None
            try:
                product_price = item.get('properties').get('offers')[0].get(
                    'properties').get('price')[0]
            except Exception as e:
                print(e)
            try:
                product_currency = item.get('properties').get('offers')[0].get(
                    'properties').get('priceCurrency')[0]
            except Exception as e:
                print(e)
            if product_price:
                product = {
                    'price': price_formatter(product_price)[0]
                    if product_price else None,
                    'currency': product_currency,
                }
                products.append(product)
    if len(products) == 0:
        return False, None, None, None
    else:
        product = products[0]
        return (True, product.get('price'), product.get('currency'),
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
def test_parse(self):
    # parse the html for microdata
    with open('test-data/example.html') as f:
        items = get_items(f)

    # this html should have just one main item
    self.assertEqual(len(items), 1)

    item = items[0]

    # item's type should be set
    self.assertEqual(item.itemtype, [URI("http://schema.org/Person")])

    # test simple case of a single valued property
    self.assertEqual(item.name, "Jane Doe")

    # but object properties can have multiple values ...

    # basic accessor returns the first value
    self.assertEqual(item.colleagues,
                     URI("http://www.xyz.edu/students/alicejones.html"))

    # and get_all, well, gets them all of course :)
    self.assertEqual(item.get_all("colleagues"), [
        URI("http://www.xyz.edu/students/alicejones.html"),
        URI("http://www.xyz.edu/students/bobsmith.html")
    ])

    # address should be another item
    self.assertTrue(isinstance(item.address, Item))
    self.assertEqual(item.address.itemtype,
                     [URI("http://schema.org/PostalAddress")])
    self.assertEqual(item.address.addressLocality, "Seattle")

    # <script> tag should be ignored in the content text
    self.assertFalse("Unrelated text" in item.address.streetAddress)

    # json
    i = json.loads(item.json())
    self.assertEqual(i["properties"]["name"][0], "Jane Doe")
    self.assertEqual(i["type"], ["http://schema.org/Person"])
    self.assertEqual(i["id"], "http://www.xyz.edu/~jane")
    self.assertTrue(isinstance(i["properties"]["address"][0], dict))
    self.assertEqual(
        i["properties"]["address"][0]["properties"]["addressLocality"][0],
        "Seattle")
def test_parse_unlinked(self):
    with open("test-data/unlinked.html") as f:
        items = get_items(f)
    self.assertEqual(len(items), 2)

    i = items[0]
    self.assertEqual(i.itemtype, [URI("http://schema.org/Person")])
    self.assertEqual(i.name, "Jane Doe")
    self.assertEqual(i.streetAddress, None)

    # this PostalAddress is enclosed within the Person but it is
    # not linked via the streetAddress itemprop. This particular example
    # would represent a bug in the markup, but technically items can appear
    # within other items without them being related together with an
    # itemprop.
    i = items[1]
    self.assertEqual(i.itemtype, [URI("http://schema.org/PostalAddress")])
    self.assertTrue("Whitworth" in i.streetAddress)
def get_recipe_from_file(self, file):
    html_text = file.getvalue().decode("utf-8")
    items = microdata.get_items(html_text)
    for i in items:
        md_json = json.loads(i.json())
        if 'schema.org/Recipe' in str(md_json['type']):
            recipe_json = find_recipe_json(md_json['properties'], '')
            recipe = Recipe.objects.create(
                name=recipe_json['name'].strip(),
                created_by=self.request.user,
                internal=True)
            step = Step.objects.create(
                instruction=recipe_json['recipeInstructions'])
            for ingredient in recipe_json['recipeIngredient']:
                f, created = Food.objects.get_or_create(
                    name=ingredient['ingredient']['text'])
                u, created = Unit.objects.get_or_create(
                    name=ingredient['unit']['text'])
                step.ingredients.add(
                    Ingredient.objects.create(
                        food=f,
                        unit=u,
                        amount=ingredient['amount'],
                        note=ingredient['note']))
            recipe.steps.add(step)
            soup = BeautifulSoup(html_text, "html.parser")
            image = soup.find('img')
            image_name = image.attrs['src'].strip().replace('Images/', '')
            for f in self.files:
                if '.zip' in f.name:
                    import_zip = ZipFile(f.file)
                    for z in import_zip.filelist:
                        if re.match(f'^Recipes/Images/{image_name}$',
                                    z.filename):
                            self.import_recipe_image(
                                recipe,
                                BytesIO(import_zip.read(z.filename)))
            return recipe
def get_microdata_items(self, response):
    items = microdata.get_items(response.text)
    result = []
    for item in items:
        if 'ClaimReview' in str(item.itemtype[-1]):
            rr = item.get('reviewRating')
            img = item.get('image')
            ir = item.get('itemReviewed')
            url = str(item.get('url'))
            result.append(
                dict(
                    type=str(item.itemtype[-1]),
                    datePublished=self.parse_date(item.get('datePublished')),
                    dateModified=self.parse_date(item.get('dateModified')),
                    url=url,
                    author=self.microdata_authors_from(item),
                    image=dict(type=str(img.itemtype[-1]),
                               url=str(img.get('url')),
                               width=img.get('width'),
                               height=img.get('height')) if img else None,
                    claimReviewed=item.get('claimReviewed'),
                    reviewRating=dict(
                        type=str(rr.itemtype[-1]),
                        ratingValue=rr.get('ratingValue'),
                        bestRating=rr.get('bestRating'),
                        worstRating=rr.get('worstRating'),
                        alternateName=rr.get('alternateName')
                        or rr.get('name')),
                    itemReviewed=dict(
                        type=str(ir.itemtype[-1]),
                        author=self.microdata_authors_from(ir),
                        datePublished=self.parse_date(
                            ir.get('datePublished')),
                        sameAs=[str(s) for s in ir.get_all('sameAs')]),
                    keywords=str(item.get('keywords'))
                    if item.get('keywords') else None,
                ))
    return result
def scrape(url):
    try:
        parsed_uri = urlparse(url)
        response = urlopen(url)
        html = response.read()
        items = microdata.get_items(html)
        recipe = None
        if len(items) > 0:
            for item in items:
                if str(item.itemtype[0]).endswith("/Recipe"):
                    instructions = []
                    ingredients = []
                    if item.recipeInstructions is not None:
                        for instruction in item.get_all("recipeInstructions"):
                            splitted = instruction.split("\n")
                            for line in splitted:
                                clean = line.strip()
                                if clean != "":
                                    instructions.append(clean)
                    if item.ingredients is not None:
                        ingredients = item.get_all("ingredients")
                    if item.recipeIngredient is not None:
                        ingredients = item.get_all("recipeIngredient")
                    if len(ingredients) > 0 and len(instructions) > 0:
                        recipe = {
                            "name": item.name,
                            "ingredients": ingredients,
                            "instructions": instructions,
                        }
        soup = BeautifulSoup(markup=html, features="html5lib")
        hrefs = []
        for link in soup.findAll('a'):
            href = link.get('href')
            parsed_sub_uri = urlparse(href)
            if (parsed_sub_uri.netloc == ""
                    or parsed_sub_uri.netloc == parsed_uri.netloc):
                new_url = (parsed_uri.scheme + "://" + parsed_uri.netloc
                           + parsed_sub_uri.path)
                if new_url != url:
                    hrefs.append(new_url)
        return set(hrefs), recipe
    except Exception as e:
        print(e)
        return set([]), None
def get(self):
    url = self.request.get('url')
    if not url:
        self.redirect('/')
        return
    extracted = {}
    extracted['items'] = items = []
    url_contents = urllib.urlopen(url).read()
    for item in microdata.get_items(url_contents):
        items.append(item.json_dict())
    context = {
        "url": url,
        "request_url": self.request.url,
        "extracted": json.dumps(extracted, indent=4),
        "items": items,
        "access_date": datetime.date.today(),
        "show_wikipedia": self.request.get('wikipedia', 'off') == 'on',
    }
    url_parts = urlsplit(url)
    site_name = url_parts.netloc
    if site_name.endswith('wdl.org'):
        site_name = 'WDL'
        wiki_site_name = '[[World Digital Library]]'
    else:
        wiki_site_name = site_name
    context['site_name'] = site_name
    context['wiki_site_name'] = wiki_site_name
    best_match = self.request.accept.best_match(
        ['application/json', 'text/html'])
    if best_match == 'application/json':
        self.response.content_type = 'application/json'
        self.response.write(context['extracted'])
    else:
        template = JINJA_ENVIRONMENT.get_template('index.html')
        self.response.write(template.render(context))
def parse_recipes(response, data={}):
    recipes = []
    items = microdata.get_items(response.body)
    for item in items:
        # log.msg(item.json(), level=log.DEBUG)
        recipe = {}
        if item.itemtype == [URI("http://data-vocabulary.org/Recipe")]:
            recipe = handle_data_vocab(item, data)
        elif item.itemtype == [URI("http://schema.org/Recipe")]:
            recipe = handle_schema_org(item, data)
        else:
            log.msg('could not determine microdata type', level=log.ERROR)
        if 'image' in recipe:
            img = recipe['image']
            fb_image = extract_facebook_images(response)
            if isinstance(img, (str, unicode)) and img.startswith('//'):
                recipe['image'] = extract_facebook_images(response)
            elif type(img) is URI and img.string.startswith('//'):
                recipe['image'] = extract_facebook_images(response)
            # favor the facebook image when it differs
            if img != fb_image and fb_image is not None:
                recipe['image'] = fb_image
            if recipe['image'] is not None and recipe['image'].startswith('//'):
                recipe['image'] = 'http:' + recipe['image']
        recipe['source'] = data['source']
        recipes.append(recipe)
    return recipes
def parse_page(page_dump):
    ''' Parse page '''
    parsed_microdata = microdata.get_items(page_dump)
    if parsed_microdata:
        items = [i for i in parsed_microdata if 'name' in i.props]
        if items:
            item = items[0]
        else:
            print "No items with property 'name'"
            return (None, '')
    else:
        print "Empty microdata"
        return (None, '')
    try:
        soup = BeautifulSoup(page_dump)
        price = extract_price(soup)
    except Exception, e:
        logging.debug("Can't extract price for %s", item.name.strip())
        price = None
    return (item.name.strip(), price)
def consumeData(self, data):
    """
    Parse the microdata into structured data.
    """
    ret = []
    soup = BeautifulSoup(StringIO(data))

    # Split ingredient and instruction blobs into separate elements so
    # each one becomes its own property value.
    ingredientses = soup.find_all(None, itemprop='ingredients')
    for ing in ingredientses:
        separateByClass(soup, ing, "ingredient")
        separateByTag(soup, ing, ['br', 'tr', 'li'])

    instructionses = soup.find_all(None, itemprop="recipeInstructions")
    for ins in instructionses:
        separateByClass(soup, ins, "instruction")
        separateByTag(soup, ins, ['br', 'tr', 'li'])

    workingDocument = StringIO(soup.encode('utf-8'))
    items = microdata.get_items(workingDocument)
    for i in items:
        for typ in i.itemtype:
            if typ.string == MICROFORMAT_RECIPE:
                ret.append(i.json())
                break
    return map(json.loads, ret)
def get_all_program_urls():
    all_program_urls = []
    page = 1
    while True:
        url = f"https://www.rtp.pt/play/bg_l_pg/?listtype=az&page={page}&type=all"
        logging.info(f"Fetching {url}")
        r = requests.get(url)
        r.raise_for_status()
        items = microdata.get_items(r.text)
        if len(items) == 0:
            break
        for item in items:
            assert item.itemtype[0] == microdata.URI(
                "http://schema.org/VideoObject")
            program_url = urljoin("https://www.rtp.pt/", item.url.string)
            all_program_urls.append(program_url)
        page += 1
    return all_program_urls
def bookmarklet(self, request):
    """
    Fetches the recipe for the url, saves the recipe, and returns a
    response to the chrome extension.
    """
    def returnResponse(status, recipes, message):
        """
        Return the appropriate data structure to the http response.
        """
        data = {'status': status, 'recipes': recipes, 'message': message}
        defer.returnValue(json.dumps(data))

    userEmail = self.user(request).email
    if not userEmail:
        returnResponse(status="error", recipes=[],
                       message=ResponseMsg.not_logged_in)

    url = request.args['uri'][0]
    pageSource = yield treq.get(url).addCallback(treq.content)
    items = microdata.get_items(pageSource)
    recipeSaved = []
    for i in items:
        itemTypeArray = [x.string for x in i.itemtype]
        if RECIPE_SCHEMA in itemTypeArray:
            recipe = i
            saveItem = Recipe.fromMicrodata(recipe, userEmail)
            Recipe.saveOnlyOnce(saveItem)
            recipeSaved.append({"name": saveItem.name,
                                "urlKey": saveItem.urlKey})
            break
    if len(recipeSaved) == 0:
        returnResponse(status="error", recipes=[],
                       message=ResponseMsg.no_recipe)
    returnResponse(status="ok", recipes=recipeSaved,
                   message=ResponseMsg.blank)
def items_from_str(self, html_str):
    self.items = microdata.get_items(html_str)
    self.inspect_items()
def get_microdata_author(author, instance):
    try:
        author_url = author.props['url'][0].string
    except:
        LOGGER.error(u'Microdata author has no URL?!?')
        return None

    if author_url.startswith(u'/'):
        # Make a full absolute URL, and let things flow.
        try:
            proto, host_and_port, remaining = split_url(instance.url)
        except:
            LOGGER.error(u'schema.org-extractor: could not split “%s” '
                         u'to get schema/host parts, author_url “%s” '
                         u'could be unusable.', instance.url, author_url)
        else:
            author_url = '{0}://{1}{2}'.format(proto, host_and_port,
                                               author_url)

    if not author_url.startswith(u'http'):
        # We already have a full name.
        return author_url

    response = requests.get(author_url)
    response.encoding = detect_encoding_from_requests_response(response)

    try:
        items = microdata.get_items(response.text.encode('utf-8'))
    except:
        LOGGER.warning(u'schema.org-extractor: could not extract author '
                       u'microdata from %s', author_url)
        return author_url

    author = {}

    for item in items:
        schema_properties = item.props

        email = schema_properties.get('email', None)
        if email is not None:
            # microdata items are always lists…
            author['email'] = email[0]

        name = schema_properties.get('name', None)
        if name is not None:
            # microdata items are always lists…
            author['name'] = name[0]

        if item.type == 'http://schema.org/Person':
            # Note: the schema.org property is spelled familyName.
            familly_name = schema_properties.get('familyName', None)
            given_name = schema_properties.get('givenName', None)

            if given_name is not None and familly_name is not None:
                # intended overwrite
                author['name'] = u'{0} {1}'.format(given_name, familly_name)

        # implicit:
        # elif item.type == 'http://schema.org/Organization':
        #     but we already have all the needed data.

        if bool(author):
            return author

    LOGGER.warning(u'schema.org-extractor: no Person/Organization found '
                   u'in author page %s.', author_url)
    return author_url
output_scr = open("/tmp/get_digikey_data.scr", "w+")

if len(sys.argv) != 2:
    print "error: invalid number of inputs"
    print "usage: python get_digikey_data.py [url]"
    sys.exit(1)

url = sys.argv[1]
headers = {'User-Agent': 'Mozilla/5.0'}
postdata = None

# if not url.endswith(".html"):
#     url += ".html"

try:
    req = urllib2.Request(url, postdata, headers)
    data = urllib2.urlopen(req).read()
    items = microdata.get_items(data)
except:
    print "error: invalid url or unable to connect"
    sys.exit(1)

d = next(item for item in items
         if item.itemtype[0] == microdata.URI("http://schema.org/WebPage")).json_dict()
# d = item.json_dict()

DIST_NAME = "Digi-Key"
DIST_PN = d['properties']['mainEntity'][0]['properties']['productID'][0][4:] \
    .encode('ascii', 'ignore').strip()
MFG_NAME = d['properties']['mainEntity'][0]['properties']['manufacturer'][0] \
    .encode('ascii', 'ignore').strip()
MFG_PN = d['properties']['mainEntity'][0]['properties']['model'][0] \
    .encode('ascii', 'ignore').strip()
DESC = d['properties']['mainEntity'][0]['properties']['description'][0] \
    .encode('ascii', 'ignore').strip().replace('\n', '<br>')

print "DIST_NAME: " + DIST_NAME
print "DIST_PN: " + DIST_PN
print "MFG_NAME: " + MFG_NAME
print "MFG_PN: " + MFG_PN
    elif re.search(r"^\d+$", v):
        return int(v)
    else:
        return v

defaults = {}
if args.add is not None:
    for p in args.add:
        n, v = p.split("=", 1)
        defaults[n] = parse_value(v)

# print(defaults)
# sys.exit()

if args.output == "-":
    out = sys.stdout
else:
    out = open(args.output, "w")

data = {}
data['items'] = items = []

for i in args.input:
    with open(i) as f:
        for item in microdata.get_items(f):
            for n, v in defaults.items():
                item.set(n, v)
            items.append(item.json_dict())

print(json.dumps(data, indent=2), file=out)
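# Hypothetical invocation of the script above, with flag names inferred from
# the args.* attributes it reads (--add, --output, positional inputs):
#
#   python extract.py --add source=manual --add year=2014 \
#       --output items.json page1.html page2.html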
        chunks.append(_text(child))
    return ''.join(chunks)


if __name__ == "__main__":
    import urllib

    if len(sys.argv) < 2:
        print "Usage: %s URL [...]" % sys.argv[0]
        sys.exit(1)

    for url in sys.argv[1:]:
        sys.stderr.write(url + "\n")
        microdata = {}
        microdata['items'] = items = []
        for item in get_items(urllib.urlopen(url)):
            items.append(item.json_dict())
        print json.dumps(microdata, indent=2)


# ---- test.py ----

try:
    import json
except ImportError:
    import simplejson as json

import unittest

from microdata import get_items, Item, URI
def process(self, instance, parameters=None, verbose=True, commit=True,
            **kwargs):
    """ See source code. """

    CONTENT_TYPES = models.CONTENT_TYPES
    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    # Only used in accepts() code.
    # repair = parameters.get('repair', False)

    if instance.content_type == CONTENT_TYPES.HTML:
        html_to_work_on = instance.content
    else:
        # The existence of this has already been tested in accepts().
        # We cannot run process() if the instance is not HTML or not
        # repairing it with a known HTML history version.
        html_to_work_on = instance.history.filter(
            content_type=CONTENT_TYPES.HTML).earliest('history_date').content

    try:
        # The microdata parser expects an utf-8 encoded string… too bad.
        items = microdata.get_items(html_to_work_on.encode('utf-8'))
    except:
        LOGGER.warning(u'schema.org-extractor: could not extract microdata '
                       u'from %s %s', instance_name, instance_id)
        return

    need_save = False

    # ————————————————————————————————————————————————————————————— Extract

    attributes = OrderedDict()

    for item in items:
        schema_properties = item.props
        # LOGGER.info(u'item %s', item.json())

        # Common attributes to all types we handle in 1flow.
        name = schema_properties.get('name', None)

        # Do not overwrite with a less specific value if
        # name was already set via 'Article::headline'.
        if name is not None and attributes.get('name', None) is not None:
            attributes['name'] = get_property(name)

        date_published = schema_properties.get('datePublished', None)
        if date_published is not None:
            attributes['date_published'] = get_property(date_published)

        excerpt = schema_properties.get('description', None)
        if excerpt is not None:
            attributes['excerpt'] = get_property(excerpt)

        tags = schema_properties.get('keywords', None)
        if tags is not None:
            attributes['tags'] = extract_tags(tags)

        image_url = schema_properties.get('thumbnailUrl', None)
        if image_url is not None:
            attributes['image_url'] = get_property(image_url)

        authors = schema_properties.get('author', None)
        # Author can be a link to the author page, which
        # will give us a Person or Organization schema.
        if authors is not None:
            found_authors = get_microdata_authors(authors, instance)
            if found_authors:
                attributes['authors'] = found_authors

        genre = schema_properties.get('genre', None)
        if genre is not None:
            if 'tags' not in attributes:
                attributes['tags'] = []
            for one_genre in genre:
                attributes['tags'].extend(extract_tags(one_genre))

        if item.type == 'http://schema.org/VideoObject':
            if instance.content_type != CONTENT_TYPES.VIDEO:
                instance.content_type = CONTENT_TYPES.VIDEO
                need_save = True
                LOGGER.info(u'schema.org-extractor: Set %s %s content type '
                            u'to VIDEO.', instance_name, instance_id)

        elif item.type in (
            'http://schema.org/Article',
            'http://schema.org/NewsArticle',
            'http://schema.org/TechArticle',
            'http://schema.org/BlogPosting',
            'http://schema.org/WebPage',
            'http://schema.org/CreativeWork',
        ):
            # headline overwrites name, it's more specific.
            attributes['name'] = get_property(
                schema_properties.get('headline', None))

            attributes['language'] = get_property(
                schema_properties.get('inLanguage', None))

            attributes['word_count'] = get_property(
                schema_properties.get('wordCount', None))

            creators = schema_properties.get('creator', None)
            # Creator can be a link to the creator page, which
            # will give us a Person or Organization schema.
            if creators is not None:
                creators = get_microdata_authors(creators, instance)
                if creators:
                    if 'authors' in attributes:
                        attributes['authors'].extend(creators)
                    else:
                        attributes['authors'] = creators

            # TODO:
            # citation
            # comment
            # articleBody → content
            # articleSection → Tags
            #
            # News:
            # dateline → ?
            #
            # Tech:
            # dependencies
            # proficiencyLevel
            #
            # WebPage:
            # specialy → ?
            # significantLink → crawl ?
            # reviewedBy → ?
            # lastReviewed → ?
            # relatedLink → ?
            # primaryImageOfPage

    # —————————————————————————————————————————————— Transform & assign
    # Turn attributes into their python / 1flow native-internals formats.

    if attributes.get('date_published', None) is not None:
        try:
            attributes['date_published'] = datetime(
                *datetime_extended_parser(attributes['date_published'])[:6])
        except:
            LOGGER.exception(u'schema.org-extractor: unparseable date “%s”',
                             attributes['date_published'])
            # Be sure we don't try to use it below.
            attributes['date_published'] = None

    if attributes.get('language', None) is not None:
        try:
            attributes['language'] = models.Language.get_by_code(
                attributes['language'])
        except:
            LOGGER.exception(u'schema.org-extractor: unable to get '
                             u'language “%s”', attributes['language'])
            # Be sure we don't try to use it below.
            attributes['language'] = None

    if attributes.get('word_count', None) is not None:
        attributes['word_count'] = int(attributes['word_count'])

    if attributes.get('tags', None) is not None:
        # We pop() tags to avoid trying to setattr() it below.
        tags = models.SimpleTag.get_tags_set(attributes.pop('tags'),
                                             origin=instance)
        instance.tags.add(*tags)

        if verbose:
            LOGGER.info(u'schema.org-extractor: added tags %s to %s %s.',
                        u', '.join(tag.name for tag in tags),
                        instance_name, instance_id)

    if attributes.get('authors', None) is not None:
        # We pop() authors to avoid trying to setattr() it below.
        authors = attributes.pop('authors')
        # LOGGER.info(authors)

        # This will implicitly add() the author to the instance.
        authors = models.Author.get_authors_from_name_emails_and_article(
            authors, origin_article=instance)
        # LOGGER.info(authors)

        LOGGER.info(u'schema.org-extractor: added author(s) %s to %s %s.',
                    u', '.join(unicode(a) for a in authors),
                    instance_name, instance_id)

    # if verbose:
    #     LOGGER.debug(u'schema.org-extractor: %s', attributes)

    for attribute, value in attributes.items():
        if value is None:
            continue

        if getattr(instance, attribute) is None:
            setattr(instance, attribute, value)
            need_save = True

            if verbose:
                LOGGER.info(u'schema.org-extractor: Set %s %s to %s %s.',
                            attribute, value, instance_name, instance_id)

    if need_save and commit:
        instance.save()
def items_from_URL(self, doc_url):
    self.items = microdata.get_items(urllib2.urlopen(doc_url).read())
    self.inspect_items()
def test_skip_level(self):
    with open("test-data/skip-level.html") as f:
        items = get_items(f)
    self.assertEqual(len(items), 1)
    self.assertEqual(items[0].name, "Jane Doe")