def scrape_collection_files(topic, url):
    assets = json.loads(downloader.read(url))['data']
    images = []

    for asset in assets:
        if asset['attributes']['extension'] == 'png':
            images.append({
                'url': asset['attributes']['thumbnail_url'].replace('element.png', '*****@*****.**'),
                'caption': asset['attributes']['name']
            })

        elif asset['attributes']['extension'] == 'mp4':
            video_data = json.loads(downloader.read(FILE_STORAGE_URL.format(id=asset['id'])))
            video = video_data['data'][0]['attributes']
            topic.add_child(nodes.VideoNode(
                source_id=video['url'],
                title=asset['attributes']['name'],
                license=LICENSE,
                files=[
                    files.VideoFile(video['url']),
                    files.ThumbnailFile(video['thumbnail_url'])
                ]))

        else:
            LOGGER.warning('Unable to add {} from {}'.format(asset['attributes']['extension'], url))

    # Add images to slideshow node
    if len(images):
        topic.add_child(create_slideshow(images, url, topic.title, 'English'))
def scrape_resource(url, topic):
    resource = BeautifulSoup(downloader.read(url), 'html5lib')
    LOGGER.info('    {}'.format(resource.find('h2').text))

    filepath = download_resource(resource.find('div', {'class': 'decargas'}).find('a')['href'])
    license = None
    author = ''
    for data_section in resource.find('div', {'class': 'datos_generales'}).find_all('h4'):
        if 'Licencia' in data_section.text:
            try:
                license = LICENSE_MAP[data_section.find_next_sibling('p').text](copyright_holder="Ceibal")
            except KeyError as e:
                LOGGER.error(str(e))
                # Fall back to CC BY if the license text isn't in LICENSE_MAP
                license = licenses.CC_BYLicense(copyright_holder="Ceibal")
        elif 'Autor' in data_section.text:
            author = data_section.find_next_sibling('p').text

    if filepath:
        thumbnail = resource.find('div', {'class': 'img-recurso'}).find('img')['src']
        if thumbnail.endswith('.gif'):
            # Save gif thumbnails locally under a .png filename
            thumbnail = os.path.sep.join([DOWNLOAD_DIRECTORY, thumbnail.split('/')[-1].replace('.gif', '.png')])
            with open(thumbnail, 'wb') as fobj:
                fobj.write(downloader.read(resource.find('div', {'class': 'img-recurso'}).find('img')['src']))

        topic.add_child(nodes.HTML5AppNode(
            title=resource.find('h2').text,
            source_id=url,
            license=license,
            author=author,
            description=resource.find('form').find_all('p')[1].text,
            thumbnail=thumbnail,
            tags=[tag.text[:30] for tag in resource.find_all('a', {'class': 'tags'})],
            files=[files.HTMLZipFile(path=filepath)],
        ))
def scrape_multilanguage_slideshows(channel):
    LOGGER.info('Scraping multi-language content...')
    contents = BeautifulSoup(downloader.read(SLIDESHOWS_URL), 'html5lib')
    collection_key = get_collection_key(contents)

    languages_selection = contents.find('div', {'class': 'asset-list'}).find('div')
    language_list = json.loads(languages_selection['data-react-props'])['sections']

    for language in language_list:
        asset_url = SLIDESHOW_ASSETS_URL.format(collection='qac6i4-foozd4-68u325', section=language['section_key'])
        slide_data = json.loads(downloader.read(asset_url))['data']
        translated_name = languages.getlang(LANGUAGE_MAP[language['name']]).native_name \
            if LANGUAGE_MAP[language['name']] else language['name']
        LOGGER.info('    {}'.format(translated_name.encode('utf-8')))

        slides = [
            {'url': slide['attributes']['thumbnail_url'].replace('element.png', '*****@*****.**')}
            for slide in slide_data
        ]
        if len(slides):
            channel.add_child(create_slideshow(slides, asset_url, translated_name, language['name']))
def scrape_student_resources():
    """
    Scrape student resources from the main page
    http://edsitement.neh.gov/student-resources
    """
    STUDENT_RESOURCES_URL = urljoin(BASE_URL, "student-resources/")
    subject_ids = [25, 21, 22, 23]
    levels = ["Student Resources"]
    for subject in subject_ids[STUDENT_RESOURCE_SUBJECT_INIT:STUDENT_RESOURCE_SUBJECT_END]:
        params_url = "all?grade=All&subject={}&type=All".format(subject)
        page_url = urljoin(STUDENT_RESOURCES_URL, params_url)
        LOGGER.info("Scraping: " + page_url)
        page_contents = downloader.read(page_url, session=sess)
        page = BeautifulSoup(page_contents, 'html.parser')
        resource_links = page.find_all(lambda tag: tag.name == "a" and tag.findParent("h3"))
        for link in resource_links[STUDENT_RESOURCE_INIT:STUDENT_RESOURCE_END]:
            time.sleep(TIME_SLEEP)
            if link["href"].rfind("/student-resource/") != -1:
                student_resource_url = urljoin(BASE_URL, link["href"])
                try:
                    page_contents = downloader.read(student_resource_url, session=sess)
                except requests.exceptions.HTTPError as e:
                    LOGGER.info("Error: {}".format(e))
                page = BeautifulSoup(page_contents, 'html.parser')
                topic_name = student_resource_url.split("/")[-1]
                student_resource = StudentResourceIndex(
                    page,
                    filename="/tmp/student-resource-" + topic_name + ".zip",
                    levels=levels)
                student_resource.to_file()
def download_content(self):
    self.load_tree_data()
    assert self.channel_tree

    def get_filename(url):
        return url.split('/')[-1].split('?')[0]

    for lang in self.channel_tree:
        for class_name in self.channel_tree[lang]:
            for subject in self.channel_tree[lang][class_name]:
                for item in self.channel_tree[lang][class_name][subject]['items']:
                    url = get_column(item, 'url')
                    url = url.replace('?dl=0', '?dl=1')
                    filename = get_filename(url)
                    if url:
                        download_path = os.path.join(self.ARCHIVE_DIR, lang, class_name, subject, filename)
                        os.makedirs(os.path.dirname(download_path), exist_ok=True)
                        if not os.path.exists(download_path):
                            content = downloader.read(url)
                            with open(download_path, 'wb') as f:
                                f.write(content)
                        item['file'] = download_path

                    icon = get_column(item, 'icon')
                    icon = icon.replace('?dl=0', '?dl=1')
                    if icon:
                        icon_filename = get_filename(icon)
                        icon_path = os.path.join(self.ARCHIVE_DIR, lang, class_name, subject, icon_filename)
                        content = downloader.read(icon)
                        with open(icon_path, 'wb') as f:
                            f.write(content)
                        item['thumbnail'] = icon_path
def get_scraper(self):
    from pages import DEFAULT_PAGE_HANDLERS
    for handler in (DEFAULT_PAGE_HANDLERS + self.extra_scrapers):
        if handler.test(self.link):
            return handler
    downloader.read(self.link)  # Will raise an error if this is broken
    raise UnscrapableSourceException
def scrape_resource_list(url, topic):
    resource_list_page = BeautifulSoup(downloader.read(url), 'html5lib')

    # Go through pages, omitting Previous and Next buttons
    for page in range(len(resource_list_page.find_all('a', {'class': 'page-link'})[1:-1])):
        # Use page numbers instead of the url, as the links on the site are also broken
        resource_list = BeautifulSoup(downloader.read("{}&page={}".format(url, page + 1)), 'html5lib')
        for resource in resource_list.find_all('a', {'class': 'card-link'}):
            resource_file = scrape_resource(resource['href'], topic)
def get_scraper(self):
    scraper = guess_scraper(self.link,
                            scrapers=self.extra_scrapers,
                            locale=self.locale,
                            triaged=self.triaged,
                            zipper=self.zipper)
    if not scraper:
        downloader.read(self.link)  # Will raise an error if this is broken
        raise UnscrapableSourceException
    return scraper
def get_thumbnail(url):
    filename, _ext = os.path.splitext(os.path.basename(url))
    img_path = os.path.sep.join([THUMBNAILS_DIRECTORY, "{}.png".format(filename)])
    svg_path = os.path.sep.join([THUMBNAILS_DIRECTORY, "{}.svg".format(filename)])

    # This thumbnail gets converted with an error, so download it separately for now
    if "US_history" in filename:
        return files.ThumbnailFile(path="US_history.png")

    # Copy pngs to local storage
    if url.endswith("png"):
        with open(img_path, 'wb') as pngobj:
            pngobj.write(downloader.read(url))

    elif url.endswith("svg"):
        with open(svg_path, 'wb') as svgobj:
            # renderPM doesn't read <style> tags, so add style to individual elements
            svg_contents = BeautifulSoup(downloader.read(url), 'html.parser')
            svg_contents = BeautifulSoup(svg_contents.find('svg').prettify(), 'html.parser')

            if svg_contents.find('style'):
                sheet = cssutils.parseString(svg_contents.find('style').string)
                for rule in sheet:
                    rectangles = svg_contents.find_all('rect', {'class': rule.selectorText.lstrip('.')})
                    paths = svg_contents.find_all('path', {'class': rule.selectorText.lstrip('.')})
                    polygons = svg_contents.find_all('polygon', {'class': rule.selectorText.lstrip('.')})
                    for el in rectangles + paths + polygons:
                        el['style'] = ""
                        for prop in rule.style:
                            el['style'] += "{}:{};".format(prop.name, prop.value)

            # BeautifulSoup autocorrects some attributes to be all lowercase, so undo the correction
            autocorrected_fields = ["baseProfile", "viewBox"]
            svg = svg_contents.find('svg')
            for field in autocorrected_fields:
                if svg.get(field.lower()):
                    svg[field] = svg[field.lower()]
                    del svg[field.lower()]

            svgobj.write(svg_contents.renderContents())

        drawing = svg2rlg(svg_path)
        renderPM.drawToFile(drawing, img_path)
    else:
        import pdb
        pdb.set_trace()

    return files.ThumbnailFile(path=img_path)
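# Minimal usage sketch for get_thumbnail() above. The URL, source_id, and title are
# illustrative placeholders (not values from the source), and this assumes the node
# classes accept a ThumbnailFile for the thumbnail argument as they do elsewhere here.
def _example_get_thumbnail():
    thumbnail_file = get_thumbnail("https://example.org/images/sample_chart.svg")
    return nodes.TopicNode(source_id="sample-topic", title="Sample Topic", thumbnail=thumbnail_file)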
def read(url):
    """ Read contents from url

        Args:
            url (str): url to read

        Returns contents from url
    """
    return downloader.read(format_url(url))
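# Hedged usage sketch for the read() wrapper above, assuming format_url() resolves the
# path against the site's base URL; the path and parsing shown are illustrative only.
def _example_read():
    page = BeautifulSoup(read('/student-resources/'), 'html.parser')
    return [link.get('href') for link in page.find_all('a')]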
def process(self):
    if 'fonts' in self.link:
        # Omit google fonts
        self.tag.decompose()
        return

    # Parse urls in css (using parseString because it is much faster than parseUrl)
    style_sheet = downloader.read(self.link).decode('utf-8-sig', errors='ignore')
    sheet = cssutils.parseString(style_sheet)
    for css_url in cssutils.getUrls(sheet):
        if not css_url.startswith('data:image') and not css_url.startswith('data:application'):
            try:
                style_sheet = style_sheet.replace(
                    css_url,
                    os.path.basename(self.write_url(css_url, url=self.link, default_ext='.png')))
            except BROKEN_EXCEPTIONS as e:
                LOGGER.warn('Unable to download stylesheet url at {} ({})'.format(self.url, str(e)))

    self.tag[self.attribute] = self.format_url(self.write_contents(self.get_filename(self.link), style_sheet))
    return self.tag[self.attribute]
def _download_file(self, write_to_path):
    video_id = self.url.split('#')[1]
    with open(write_to_path, 'wb') as fobj:
        fobj.write(downloader.read('https://www.wevideo.com/api/2/media/{}/content'.format(video_id)))
def process(self):
    # Using html.parser as it is better at handling special characters
    contents = BeautifulSoup(downloader.read(self.url, loadjs=self.loadjs), 'html.parser')

    # If a main area is specified, replace body contents with main area
    if self.main_area_selector:
        body = self.create_tag('body')
        body.append(contents.find(*self.main_area_selector))
        contents.body.replaceWith(body)

    # Remove any items to omit
    for item in self.omit_list:
        for element in contents.find_all(*item):
            element.decompose()

    self.preprocess(contents)

    # Scrape tags
    for tag_class in (self.extra_tags + COMMON_TAGS):
        for tag in contents.find_all(*tag_class.selector):
            scraper = tag_class(tag,
                                self.url,
                                zipper=self.zipper,
                                scrape_subpages=self.scrape_subpages,
                                triaged=self.triaged,
                                locale=self.locale,
                                extra_scrapers=self.scrapers,
                                color=self.color)
            scraper.scrape()

    self.postprocess(contents)

    return contents.prettify(formatter="minimal").encode('utf-8-sig', 'ignore')
def process(self):
    # Read URL and generate slideshow html
    contents = BeautifulSoup(downloader.read(self.url, loadjs=self.loadjs), 'html.parser')
    images = []
    for img in contents.find_all(*self.img_selector):
        images.append(self.write_url(img[self.img_attr], directory="slides"))
    return self.generate_slideshow(images)
def run(self, limit_page=1, page_number=1):
    total_items = None
    counter = 0
    try:
        page_contents = downloader.read(self.resource_url, loadjs=False)
    except requests.exceptions.HTTPError as e:
        LOGGER.info("Error: {}".format(e))
    else:
        page = BeautifulSoup(page_contents, 'html.parser')
        states = page.find("div", class_=["lm-filter-course"])
        states_tree = self.get_state_lang(states)
        subjects = page.find("div", class_=["lm-filter-subject"])
        subjects_tree = self.get_subjects(subjects)
        levels = page.find("div", class_=["lm-filter-level"])
        levels_tree = self.get_levels(levels)
        pages_params = self.build_page_params(states_tree, subjects_tree, levels_tree)
        for page_params in pages_params:
            url = self.build_url(
                page_params["course_tid"],
                page_params["subject_tid"],
                page_params.get("educational_level_tid", None))
            yield dict(url=url,
                       subject_name=page_params["subject_name"],
                       state_lang=page_params["state_lang"],
                       level_name=page_params.get("level_name", None))
            LOGGER.info("CRAWLING : URL {}".format(url))
            time.sleep(TIME_SLEEP)
def postprocess(self, contents):
    for script in contents.find_all('script'):
        if script.string:
            script.string = script.text.replace('background="HalfBakedBG.gif"', '')

            # Rewrite image sources referenced inside the script to point at downloaded copies
            for match in re.finditer(r'(?:src)=(?:\'|\")([^\'\"]+)(?:\'|\")', script.string, re.MULTILINE):
                img_filename = match.group(1).split('?')[0].split('/')[-1][-20:]
                script.string = script.text.replace(
                    match.group(1),
                    self.write_url(match.group(1), directory="webimg"))

            # Scrape pages referenced in onclick handlers and rewrite the links to the zipped copies
            for match in re.finditer(r"onclick=\\(?:'|\")parent\.location\s*=\s*(?:'|\")([^'\"]+)(?:'|\")",
                                     script.string, re.MULTILINE):
                page_filename = 'recursostic-{}'.format(match.group(1).split('?')[0].split('/')[-1])
                page = BeautifulSoup(downloader.read(self.get_relative_url(match.group(1))), 'html5lib')
                page_link = RecursosticScraper(
                    self.get_relative_url(match.group(1)),
                    zipper=self.zipper,
                    locale=self.locale).to_zip()
                script.string = script.text.replace(match.group(1), page_link)
def to_file(self, description, filepath):
    try:
        page_contents = downloader.read(self.resource_url, session=sess)
    except requests.exceptions.HTTPError as e:
        LOGGER.info("Error: {}".format(e))
        return None
    else:
        metadata_dict = {
            "description": description,
            "language": "en",
            "license": licenses.CC_BY,
            "copyright_holder": "National Endowment for the Humanities",
            "author": "",
            "source_id": self.resource_url}

        page = BeautifulSoup(page_contents, 'html.parser')
        LOGGER.info("COPYRIGHT {}".format(has_copyright(page)))
        content = page.find("div", id="content")
        if self.swf_content(content):
            return

        files = self.remove_external_links(content)
        images = self.find_local_images(content)
        for file_ in files:
            metadata_files = metadata_dict.copy()
            metadata_files["source_id"] = file_
            self.add_resources_files(file_, metadata_files)
        #for img in images:
        #    self.add_resources_files(img)
        self.write('<html><head><meta charset="UTF-8"></head><body>' + str(content) + '</body></html>', filepath)
        return metadata_dict
def scrape_channel(channel):
    # Read from Categorias dropdown menu
    page = BeautifulSoup(downloader.read(BASE_URL), 'html5lib')
    dropdown = page.find('a', {'id': 'btn-categorias'}).find_next_sibling('ul')

    # Go through dropdown and generate topics and subtopics
    for category_list in dropdown.find_all('li', {'class': 'has-children'}):

        # Parse categories
        for category in category_list.find_all('li', {'class': 'has-children'}):
            # Add this topic to channel when scraping entire channel
            category_name = category.find('a').text
            topic = nodes.TopicNode(title=category_name, source_id=get_source_id(category_name))
            channel.add_child(topic)
            LOGGER.info(topic.title)

            # Parse subcategories
            for subcategory in category.find_all('li'):
                if not subcategory.attrs.get('class') or 'go-back' not in subcategory.attrs['class']:
                    # Get rid of this check to scrape entire site
                    subcategory_name = subcategory.find('a').text
                    subcategory_link = subcategory.find('a')['href']
                    LOGGER.info('    {}'.format(subcategory_name))
                    subtopic = nodes.TopicNode(title=subcategory_name, source_id=get_source_id(subcategory_link))
                    topic.add_child(subtopic)

                    # Parse resources
                    scrape_subcategory(subcategory_link, subtopic)
def scrape_english_collection(channel):
    LOGGER.info('Scraping English collection...')
    english_topic = nodes.TopicNode(source_id=ENGLISH_COLLECTION_URL, title="English")
    channel.add_child(english_topic)

    contents = BeautifulSoup(downloader.read(ENGLISH_COLLECTION_URL), 'html5lib')
    collection_key = get_collection_key(contents)

    topic_selection = contents.find('div', {'class': 'asset-list'}).find('div')
    topic_list = [
        t for t in json.loads(topic_selection['data-react-props'])['sections']
        if t['id'] not in EXCLUDED_TOPIC_IDS
    ]

    for topic in topic_list:
        LOGGER.info('    {}'.format(topic['name'].encode('utf-8')))
        topic_node = nodes.TopicNode(source_id=topic['section_key'], title=topic['name'])
        english_topic.add_child(topic_node)

        # Scrape items in the topic
        url = ENGLISH_ASSETS_URL.format(collection=collection_key, section=topic['section_key'])
        scrape_collection_files(topic_node, url)
def _download_file(self, write_to_path):
    audio_id = re.search(r'(?:player_ek_)([^_]+)(?:_2_1\.html)', self.url).group(1)
    with open(write_to_path, 'wb') as fobj:
        fobj.write(downloader.read(
            'http://www.ivoox.com/listenembeded_mn_{}_1.m4a?source=EMBEDEDHTML5'.format(audio_id)))
def write_url(self, url, filename, directory=None):
    """ write_url: Write contents from url to filename in zip
        Args:
            url: (str) url to file to download
            filename: (str) name of file in zip
            directory: (str) directory in zipfile to write file to (optional)
        Returns: path to file in zip
    """
    return self.write_contents(filename, read(url), directory=directory)
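# Illustrative sketch of calling write_url() from scraper code. The image URL and
# filename are placeholders, and "img" is only an assumed zip subdirectory name.
def _example_write_url(zipper):
    zip_path = zipper.write_url('https://example.org/logo.png', 'logo.png', directory='img')
    return zip_path  # path of the file inside the zip, e.g. 'img/logo.png'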
def save_thumbnail(url, save_as, sess):
    THUMB_DATA_DIR = build_path([DATA_DIR, 'thumbnail'])
    filepath = os.path.join(THUMB_DATA_DIR, save_as)
    try:
        document = downloader.read(url, loadjs=False, session=sess)
    except requests.exceptions.ConnectionError as e:
        return None
    else:
        with open(filepath, 'wb') as f:
            f.write(document)
        return filepath
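# Minimal usage sketch for save_thumbnail(); the thumbnail URL and filename are
# illustrative placeholders, and the session setup assumes plain requests is enough here.
def _example_save_thumbnail():
    sess = requests.Session()
    local_path = save_thumbnail('https://example.org/thumbs/lesson1.png', 'lesson1.png', sess)
    return local_path  # None if the connection failed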
def scrape_subcategory(link, topic):
    url = "{}{}".format(BASE_URL, link.lstrip("/"))
    resource_page = BeautifulSoup(downloader.read(url), 'html5lib')

    # Skip "All" category
    for resource_filter in resource_page.find('div', {'class': 'menu-filtro'}).find_all('a')[1:]:
        LOGGER.info('  {}'.format(resource_filter.text))
        source_id = get_source_id('{}/{}'.format(topic.title, resource_filter.text))
        filter_topic = nodes.TopicNode(title=resource_filter.text, source_id=source_id)
        scrape_resource_list(url + resource_filter['href'], filter_topic)
        topic.add_child(filter_topic)
def get_available_languages():
    contents = BeautifulSoup(downloader.read(BASE_URL.format(language='en', endpoint='')), 'html5lib')
    languages = []
    for lang in contents.find('ul', {'class': 'sf-lang-selector'}).findAll('li'):
        languages.append(
            re.search(r"openLinkWithTranslation\('([^\']+)'\)", lang.find('a')['onclick']).group(1))
    return languages
def preprocess(self, contents):
    # Hide certain elements from the page
    style_tag = self.create_tag('style')
    style_tag.string = '.genially-view-logo { pointer-events: none;} .genially-view-navigation-actions,'\
        ' .genially-view-navigation-actions-toggle-button{display: none !important; pointer-events:none;}'
    contents.head.append(style_tag)

    # Prefetch API response and replace script content accordingly
    genial_id = self.url.split('/')[-1]
    response = requests.get('https://view.genial.ly/api/view/{}'.format(genial_id))

    for script in contents.find_all('script'):
        if script.get('src') and 'main' in script['src']:
            script_contents = downloader.read(self.get_relative_url(script['src'])).decode('utf-8')
            genial_data = json.loads(response.content)

            if len(genial_data['Videos']) or len(genial_data['Audios']):
                LOGGER.error('Unhandled genial.ly video or audio at {}'.format(self.url))

            if genial_data['Genially']['ImageRender']:
                genial_data['Genially']['ImageRender'] = self.write_url(
                    genial_data['Genially']['ImageRender'], directory='webimg')
            for image in genial_data['Images']:
                image['Source'] = self.write_url(image['Source'], directory='webimg')
            for slide in genial_data['Slides']:
                slide['Background'] = self.write_url(slide['Background'], directory='webimg')
            for code in genial_data['Contents']:
                code_contents = BeautifulSoup(code['HtmlCode'], 'html.parser')
                for img in code_contents.find_all('img'):
                    try:
                        img['src'] = self.write_url(img['src'], directory='webimg')
                    except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError) as e:
                        LOGGER.warning("Error processing genial.ly at {} ({})".format(self.url, str(e)))
                code['HtmlCode'] = code_contents.prettify()

            script_contents = script_contents.replace(
                'r.a.get(c).then(function(e){return n(e.data)})',
                'n({})'.format(json.dumps(genial_data)))
            script['class'] = ['skip-scrape']
            script['src'] = self.write_contents(
                'genial-{}-embed.js'.format(genial_id), script_contents, directory="js")
def write_url(self, url, filename, directory=None):
    """ write_url: Write contents from url to filename in zip
        Args:
            url: (str) url to file to download
            filename: (str) name of file in zip
            directory: (str) directory in zipfile to write file to (optional)
        Returns: path to file in zip
    """
    filepath = "{}/{}".format(directory.rstrip("/"), filename) if directory else filename
    if not self.contains(filepath):
        self._write_to_zipfile(filepath, read(url))
    return filepath
def scrape_video_page(self, url, title):
    """ Creates a video topic with all the videos on the page """
    IGNORED_VIDEOS = ['google', 'facebook']
    VIDEO_SCRAPERS = [who.WHOWebVideoScraper, who.WHOVideoScraper]

    video_topic = nodes.TopicNode(source_id=url, title=title)
    contents = BeautifulSoup(downloader.read(url), 'html.parser')

    # Scrape youtube embeds
    # e.g. https://www.who.int/emergencies/diseases/novel-coronavirus-2019/advice-for-public/videos
    for iframe in contents.findAll('iframe'):
        if not any([test for test in IGNORED_VIDEOS if test in iframe['src']]):
            header = iframe.find_parent('div', {'class': 'sf_colsIn'}).find('div', {'class': 'section-heading'}).text.strip()
            LOGGER.info('      - Downloading {}'.format(header.encode('utf-8')))
            scraper = guess_scraper(iframe['src'], scrapers=VIDEO_SCRAPERS)  # Might be native or youtube video
            video_node = scraper.to_contentnode(header, license=LICENSE, directory="videos")
            video_topic.add_child(video_node)

    # Scrape native videos
    # e.g. https://www.who.int/zh/emergencies/diseases/novel-coronavirus-2019/advice-for-public/videos
    for video in contents.findAll('div', {'class': 'sf-multimedia-item__video'}):
        header = video.find('h3').text.strip()
        LOGGER.info('      - Downloading {}'.format(header.encode('utf-8')))
        video_matches = re.search(r"\(\s*\"(.+)\"\,\s*\"(.+)\"\)", video.find('a')['onclick'])

        # Embedded youtube videos here refer to playlists, so skip them
        if 'YoutubeVideo' == video_matches.group(1):
            continue

        scraper = who.WHOVideoScraper(video_matches.group(2))
        video_node = scraper.to_contentnode(header, license=LICENSE, directory="videos")
        video_topic.add_child(video_node)

    return video_topic
def add_file(self, path, title, download_url, write_data=True, ext=None, license=None,
             copyright_holder=None, **node_data):
    """ add_file: Creates file in csv and writes file to zip
        Args:
            path: (str) where in zip to write file
            title: (str) content's title
            download_url: (str) url or local path to download from
            write_data: (boolean) indicates whether to add as a csv entry (optional)
            ext: (str) extension to use for file
            license (str): content's license
            copyright_holder (str): holder of content's license (required except for PUBLIC_DOMAIN)
            license_description (str): description of content's license (optional)
            source_id: (str) content's original id (optional)
            description: (str) description of content (optional)
            author (str): who created the content (optional)
            language (str): language of content (optional)
            thumbnail (str): path to thumbnail in zip (optional)
        Returns: path to file in zip
    """
    if write_data:
        assert license, "Files must have a license"
        copyright_holder = None if not copyright_holder or copyright_holder.strip() == '' else copyright_holder
        assert license in NO_COPYRIGHT_HOLDER_REQUIRED or copyright_holder, \
            "Licenses must have a copyright holder if they are not public domain"

    self._parse_path(path)
    if not ext:
        _name, ext = os.path.splitext(download_url or "")
        ext = ext.lower()  # normalize to lowercase extensions inside zip archive
    filepath = "{}/{}{}".format(path, title, ext)
    if download_url and filepath:
        self._write_to_zip(filepath, read(download_url))
        if write_data:
            self._commit(filepath, title, license=license, copyright_holder=copyright_holder, **node_data)
        return filepath
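# Hedged usage sketch for add_file(). The path, title, URL, and copyright holder are
# placeholders, and it assumes licenses.CC_BY is an accepted license string constant
# for this writer (the docstring above declares license as a str).
def _example_add_file(writer):
    return writer.add_file(
        'Channel/Topic A',
        'Sample worksheet',
        'https://example.org/files/worksheet.pdf',
        license=licenses.CC_BY,
        copyright_holder='Example Organization')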
def open(self, update=False):
    """ Opens pdf file to read from. """
    filename = os.path.basename(self.source_path)
    folder, _ext = os.path.splitext(filename)
    self.path = os.path.sep.join([self.directory, folder, filename])
    if not os.path.exists(os.path.dirname(self.path)):
        os.makedirs(os.path.dirname(self.path))

    # Download full pdf if it hasn't already been downloaded
    if not os.path.isfile(self.path) or update:
        with open(self.path, "wb") as fobj:
            fobj.write(read(self.source_path))

    self.file = open(self.path, 'rb')
    self.pdf = CustomPDFReader(self.file)
def open(self):
    """ Opens the specified PDF file for editing. If the path is a URL, it will first download the file. """
    filename = os.path.basename(self.download_url)
    folder, _ext = os.path.splitext(filename)
    self.path = os.path.sep.join([self.directory, folder, filename])
    if not os.path.exists(os.path.dirname(self.path)):
        os.makedirs(os.path.dirname(self.path))

    # Download full pdf if it hasn't already been downloaded
    if not os.path.isfile(self.path):
        with open(self.path, "wb") as fobj:
            fobj.write(read(self.download_url))

    self.file = open(self.path, 'rb')
    self.pdf = CustomPDFReader(self.file)