def main(): """main driver""" args = commandline_args_setup() # handle verbose option if args.verbose is True: # setup a logging handler for the command line console = logging.StreamHandler() # stream=sys.stdout console.setLevel(logging.INFO) formatter = logging.Formatter('%(funcName)s: %(message)s') console.setFormatter(formatter) # add the handler to the verbose logger verbose = logging.getLogger('verbose') verbose.setLevel(logging.INFO) verbose.addHandler(console) verbose.info('Running verbose.') base_url = args.base_url session = parse.Session() html = session.html_at_url(base_url) # this web page is defined as a URL and some HTML page = parse.Page(base_url, html) logging.getLogger('verbose').info(page.content) links = page.links logging.getLogger('verbose').info(links) # store the file if opted sm_arg = args.sitemap_dest to_save = _prompt_sitemap_save(base_url, sm_arg) if to_save: sm = Sitemap(links) sm.export(sm_arg)
def main(): """main driver""" args = commandline_args_setup() # handle verbose option if args.verbose is True: # setup a logging handler to the command line console = logging.StreamHandler() # stream=sys.stdout console.setLevel(logging.INFO) formatter = logging.Formatter('{funcName}: {message}') # TODO verify console.setFormatter(formatter) # add the handler to the verbose logger verbose = logging.getLogger('verbose') verbose.addHandler(console) verbose.info('Running verbose.') base_url = args.base_url session = parse.Session() html = session.html_at_url(base_url) # this web page is defined as a URL and some HTML page = parse.Page(base_url, html) logging.getLogger('verbose').info(page.content) links = page.links logging.getLogger('verbose').info(links) # store the file if opted sm_arg = args.sitemap_dest to_save = _prompt_sitemap_save(base_url, sm_arg) if to_save: sm = Sitemap(links) sm.export(sm_arg)
def index_as_xml(self, **kwargs):
    """Return XML serialization of this list taken to be sitemapindex entries"""
    self.default_capability_and_modified()
    s = Sitemap(**kwargs)
    return s.resources_as_xml(self, sitemapindex=True)
def as_xml(self, **kwargs):
    """Return XML serialization of this list

    This code does not support the case where the list is too big
    for a single XML document.
    """
    self.default_capability_and_modified()
    s = Sitemap(**kwargs)
    return s.resources_as_xml(self, sitemapindex=self.sitemapindex)
def write(self, basename="/tmp/resynclist.xml", **kwargs): """Write a single sitemap or sitemapindex XML document Must be overridden to support multi-file lists. """ self.default_capability_and_modified() fh = open(basename, "w") s = Sitemap(**kwargs) s.resources_as_xml(self, fh=fh, sitemapindex=self.sitemapindex) fh.close()
def write_zip(self, inventory=None, dumpfile=None):
    """Write a ZIP dump file"""
    compression = (ZIP_DEFLATED if self.compress else ZIP_STORED)
    zf = ZipFile(dumpfile, mode="w", compression=compression, allowZip64=True)
    # Write inventory first
    s = Sitemap(pretty_xml=True, allow_multifile=False)
    zf.writestr('manifest.xml', s.inventory_as_xml(inventory))
    # Add all files in the inventory
    for resource in inventory:
        zf.write(resource.uri)
    zf.close()
    zipsize = os.path.getsize(dumpfile)
    print("Wrote ZIP file dump %s with size %d bytes" % (dumpfile, zipsize))
def write_zip(self, inventory=None, dumpfile=None):
    """Write a ZIP dump file"""
    compression = (ZIP_DEFLATED if self.compress else ZIP_STORED)
    zf = ZipFile(dumpfile, mode="w", compression=compression, allowZip64=True)
    # Write inventory first
    s = Sitemap(pretty_xml=True, allow_multifile=False)
    zf.writestr('manifest.xml', s.resources_as_xml(inventory))
    # Add all files in the inventory
    for resource in inventory:
        zf.write(resource.uri)
    zf.close()
    zipsize = os.path.getsize(dumpfile)
    print("Wrote ZIP file dump %s with size %d bytes" % (dumpfile, zipsize))
def parse(self, uri=None, fh=None, **kwargs):
    """Parse a single XML document for this list

    Does not handle the case of sitemapindex+sitemaps ResourceList
    """
    if uri is not None:
        try:
            fh = URLopener().open(uri)
        except IOError as e:
            raise Exception(
                "Failed to load sitemap/sitemapindex from %s (%s)" % (uri, str(e)))
    if fh is None:
        raise Exception("Nothing to parse")
    s = Sitemap(**kwargs)
    s.parse_xml(fh=fh, resources=self, capability=self.capability_md, sitemapindex=False)
    self.parsed_index = s.parsed_index
def read(self, uri=None, resources=None, capability=None, index_only=False):
    """Read sitemap from a URI including handling sitemapindexes

    If index_only is True then individual sitemaps referenced in a
    sitemapindex will not be read. This will result in no resources
    being returned and is useful only to read the metadata and links
    listed in the sitemapindex.

    Includes the subtlety that if the input URI is a local file and is a
    sitemapindex which contains URIs for the individual sitemaps, then
    these are mapped to the filesystem also.
    """
    try:
        fh = URLopener().open(uri)
        self.num_files += 1
    except IOError as e:
        raise IOError("Failed to load sitemap/sitemapindex from %s (%s)" % (uri, str(e)))
    # Get the Content-Length if we can (works fine for local files)
    try:
        self.content_length = int(fh.info()['Content-Length'])
        self.bytes_read += self.content_length
        self.logger.debug("Read %d bytes from %s" % (self.content_length, uri))
    except KeyError:
        # If we don't get a length then c'est la vie
        self.logger.debug("Read ????? bytes from %s" % (uri))
    self.logger.info("Read sitemap/sitemapindex from %s" % (uri))
    s = Sitemap()
    s.parse_xml(fh=fh, resources=self, capability='resourcelist')
    # what did we read? sitemap or sitemapindex?
    if s.parsed_index:
        # sitemapindex
        if not self.allow_multifile:
            raise ListBaseIndexError(
                "Got sitemapindex from %s but support for sitemapindex disabled" % (uri))
        self.logger.info("Parsed as sitemapindex, %d sitemaps" % (len(self.resources)))
        sitemapindex_is_file = self.is_file_uri(uri)
        if index_only:
            # don't read the component sitemaps
            self.sitemapindex = True
            return
        # now loop over all entries to read each sitemap and add to resources
        sitemaps = self.resources
        self.resources = self.resources_class()
        self.logger.info("Now reading %d sitemaps" % len(sitemaps.uris()))
        for sitemap_uri in sorted(sitemaps.uris()):
            self.read_component_sitemap(uri, sitemap_uri, s, sitemapindex_is_file)
    else:
        # sitemap
        self.logger.info("Parsed as sitemap, %d resources" % (len(self.resources)))
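# A hedged sketch of how read() is typically driven. The ResourceList class name
# and the URI are assumptions for illustration; only attributes referenced in the
# method above (resources, num_files) are used.
rl = ResourceList()
rl.read(uri='http://example.com/resourcelist-index.xml')
print(len(rl.resources), "resources read from", rl.num_files, "files")

# read only the sitemapindex itself: no component sitemaps are fetched, so the
# list ends up with metadata and links but no resources
index = ResourceList()
index.read(uri='http://example.com/resourcelist-index.xml', index_only=True)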
def test_sitemap():
    """ Test of Sitemap() """
    base_url = 'https://example.com'
    page1 = 'example-%s.html' % randint(0, 999999)
    page2 = 'example-%s.html' % randint(0, 999999)
    page3 = 'example-%s.html' % randint(0, 999999)

    sitemap = Sitemap(base_url)

    # New sitemap should include no pages
    assert not sitemap.pages

    sitemap.append(page1)

    # after appending one page, sitemap should contain that page and only that page
    assert len(sitemap.pages) == 1
    assert sitemap.pages[0] == "%s/%s" % (base_url, page1)

    sitemap.append(page2)
    sitemap.append(page3)

    # after appending two more pages, sitemap should contain 3 pages
    assert len(sitemap.pages) == 3
    assert sitemap.pages[0] == "%s/%s" % (base_url, page1)
    assert sitemap.pages[1] == "%s/%s" % (base_url, page2)
    assert sitemap.pages[2] == "%s/%s" % (base_url, page3)

    sitemap.append('index.html')

    # index.html should be included as '/', without 'index.html'
    assert len(sitemap.pages) == 4
    assert sitemap.pages[3] == "%s/" % base_url

    sitemap.write(TEST_WEBSITE.config.value('output_path'))

    with open(TEST_WEBSITE.config.value('output_path') + 'sitemap.txt', 'r') as my_file:
        sitemap_from_file = my_file.read().splitlines()

    # sitemap written to file should contain all 4 pages
    assert len(sitemap_from_file) == 4
    assert sitemap_from_file[0] == "%s/%s" % (base_url, page1)
    assert sitemap_from_file[1] == "%s/%s" % (base_url, page2)
    assert sitemap_from_file[2] == "%s/%s" % (base_url, page3)
    assert sitemap_from_file[3] == "%s/" % base_url
def write(self, basename='/tmp/sitemap.xml', **kwargs):
    """Write one or a set of sitemap files to disk

    resources is a ResourceContainer that may be a ResourceList or a
    ChangeList. This may be a generator so data is read as needed and
    length is determined at the end.

    basename is used as the name of the single sitemap file or the
    sitemapindex for a set of sitemap files.

    Uses self.max_sitemap_entries to determine whether the resource_list
    can be written as one sitemap. If there are more entries and
    self.allow_multifile is set true then a set of sitemap files, with a
    sitemapindex, will be written.
    """
    # Access resources through iterator only
    resources_iter = iter(self.resources)
    (chunk, next) = self.get_resources_chunk(resources_iter)
    s = Sitemap(**kwargs)
    if next is not None:
        # Have more than self.max_sitemap_entries => sitemapindex
        if not self.allow_multifile:
            raise ListBaseIndexError("Too many entries for a single sitemap but multifile disabled")
        # Work out how to name the sitemaps, attempt to add %05d before ".xml$", else append
        sitemap_prefix = basename
        sitemap_suffix = '.xml'
        if basename[-4:] == '.xml':
            sitemap_prefix = basename[:-4]
        # Use iterator over all resources and count off sets of
        # max_sitemap_entries to go into each sitemap, store the
        # names of the sitemaps as we go
        sitemaps = ListBase()
        while len(chunk) > 0:
            file = sitemap_prefix + ("%05d" % (len(sitemaps))) + sitemap_suffix
            self.logger.info("Writing sitemap %s..." % (file))
            f = open(file, 'w')
            s.resources_as_xml(chunk, fh=f)
            f.close()
            # Record information about this sitemap for index
            r = Resource(uri=self.mapper.dst_to_src(file),
                         path=file,
                         timestamp=os.stat(file).st_mtime,
                         md5=compute_md5_for_file(file))
            sitemaps.add(r)
            # Get next chunk
            (chunk, next) = self.get_resources_chunk(resources_iter, next)
        self.logger.info("Wrote %d sitemaps" % (len(sitemaps)))
        f = open(basename, 'w')
        self.logger.info("Writing sitemapindex %s..." % (basename))
        s.resources_as_xml(resources=sitemaps, sitemapindex=True, fh=f)
        f.close()
        self.logger.info("Wrote sitemapindex %s" % (basename))
    else:
        f = open(basename, 'w')
        self.logger.info("Writing sitemap %s..." % (basename))
        s.resources_as_xml(chunk, fh=f)
        f.close()
        self.logger.info("Wrote sitemap %s" % (basename))
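# A hedged sketch of the multi-file path above. ResourceList, Resource and Mapper
# are assumed from the surrounding library, the Mapper constructor arguments are
# an assumption, and the 2-entry limit is artificially small so the split into
# sitemaps plus a sitemapindex actually happens.
rl = ResourceList()
rl.max_sitemap_entries = 2
rl.allow_multifile = True
# write() uses self.mapper.dst_to_src() to turn local sitemap paths back into URIs
rl.mapper = Mapper(['http://example.com/=/tmp/'])

for n in range(5):
    rl.add(Resource(uri='http://example.com/res%d' % n))

# 5 resources with a 2-entry limit => sitemap00000.xml, sitemap00001.xml,
# sitemap00002.xml plus a sitemapindex written to /tmp/sitemap.xml
rl.write(basename='/tmp/sitemap.xml')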
def get(self, url, inventory=None):
    """Get an inventory from url

    Will either create a new Inventory object or add to one supplied.
    """
    # Either use inventory passed in or make a new one
    if inventory is None:
        inventory = Inventory()
    inventory_fh = URLopener().open(url)
    Sitemap().inventory_parse_xml(fh=inventory_fh, inventory=inventory)
    return inventory
@classmethod
def from_robots(cls, robots_url):
    try:
        response = requests.get(robots_url, timeout=30)
        # requests.get() does not raise for HTTP error codes on its own
        response.raise_for_status()
    except requests.HTTPError as e:
        raise cls.RobotsLoadError(
            "robots.txt from {} returned {} status code".format(robots_url, e.response.status_code))
    robots_txt = response.text
    sitemaps = cls.ROBOTS_SITEMAP_RE.findall(robots_txt)
    if not sitemaps:
        raise cls.RobotsParseError("no sitemaps found in robots.txt from {}".format(robots_url))
    sitemaps = set(s.strip() for s in sitemaps)
    return [Sitemap.from_url(url) for url in sitemaps]
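# A hedged sketch of the context from_robots() assumes. The owning class name, the
# regular expression and the exception classes are illustrative assumptions; only
# ROBOTS_SITEMAP_RE, RobotsLoadError and RobotsParseError are actually referenced
# by the classmethod above.
import re
import requests

class SitemapDiscovery:
    # matches lines such as "Sitemap: https://example.com/sitemap.xml"
    ROBOTS_SITEMAP_RE = re.compile(r'^Sitemap:\s*(\S+)', re.IGNORECASE | re.MULTILINE)

    class RobotsLoadError(Exception):
        pass

    class RobotsParseError(Exception):
        pass

    # from_robots(cls, robots_url) as defined above would sit here

# sitemaps = SitemapDiscovery.from_robots('https://example.com/robots.txt')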
class Website:
    """ Class representing a website """

    WEBSITE_TEMPLATE = '_website_template.html'
    LISTING_PAGE_TEMPLATE_FILENAME = '_listing_page_template.html'
    ARTICLE_PAGE_TEMPLATE_FILENAME = '_article_page_template.html'
    STATIC_PAGE_TEMPLATE_FILENAME = '_static_page_template.html'
    ARTICLE_ITEM_FOOTER_TEMPLATE_FILENAME = '_article_item_footer_template.html'
    STATIC_ITEM_FOOTER_TEMPLATE_FILENAME = '_static_item_footer_template.html'

    tag = {
        'content': '<!-- MAGNETIZER_CONTENT -->',
        'page_content': '<!-- MAGNETIZER_PAGE_CONTENT -->',
        'meta': '<!-- MAGNETIZER_META -->',
        'item_footer': '<!-- MAGNETIZER_ITEM_FOOTER -->',
        'date': '<!-- MAGNETIZER_DATE -->',
        'page_class': '<!-- MAGNETIZER_PAGE_CLASS -->',
        'pagination': '<!-- MAGNETIZER_PAGINATION -->',
        'cc_here': '<!-- MAGNETIZER_CC -->',
        'break': '<!-- BREAK -->',
        'noindex': '<!-- NOINDEX -->',
        'creative_commons': '<!-- CC -->'
    }

    def __init__(self, config_file_name):
        self.config = Config(config_file_name)
        self.sitemap = Sitemap(self.config.value('website_base_url'))
        self.template = Template(
            self.tag['content'],
            self.config.value('template_path') + Website.WEBSITE_TEMPLATE)
        self.refresh()

    def refresh(self):
        """ Run as part of init to populate the Website elements

        - reads item footer templates
        - calculates cache busting checksum for css filename
        """
        self.article_footer_html = Website.read_file(
            self.config.value('template_path'),
            self.ARTICLE_ITEM_FOOTER_TEMPLATE_FILENAME)
        self.static_footer_html = Website.read_file(
            self.config.value('template_path'),
            self.STATIC_ITEM_FOOTER_TEMPLATE_FILENAME)

        css_contents = Website.read_file(
            self.config.value('resources_path'),
            self.config.value('website_css_filename'))
        css_hash = hashlib.md5(bytes(css_contents, encoding='utf-8')).hexdigest()
        self.css_filename = self.config.value('website_css_filename') + '?' + css_hash

    def include(self, filename):
        """ Reads the contents of an include file

        Parameters:
        - filename

        Returns:
        - the contents of the include file or an error message
        """
        if path.isfile(path.join(self.config.value('template_path'), filename)):
            return Website.read_file(self.config.value('template_path'), filename)

        print(COLOUR_ERROR + ' (!) ' + COLOUR_END
              + "Include '%s' does not exist!" % filename)
        return "[ ERROR: Include '%s' does not exist! ]" % filename

    def copy_resources(self):
        """ Copies resource files from the resources directory to the output
        directory, e.g. css, images etc. Files not included in
        approved_filetypes will be ignored.
        """
        print("Copying resources --> %s " % self.config.value('output_path'))
        copied = 0
        ignored = 0

        for filename in listdir(self.config.value('resources_path')):
            if path.isfile(self.config.value('resources_path') + filename):
                extension = filename.split('.')[-1]
                if '.' + extension in self.config.value('approved_filetypes'):
                    shutil.copyfile(
                        self.config.value('resources_path') + filename,
                        self.config.value('output_path') + filename)
                    copied += 1
                else:
                    ignored += 1

        message = COLOUR_OK + ' --> ' + COLOUR_END + 'Copied %s files, ignored %s'
        print(message % (copied, ignored))

    def wipe(self):
        """ Delete the files from the output directory, typically when
        regenerating the site. Ignores files not ending with .html or not in
        approved_filetypes.
        """
        print('Deleting previous files from %s ' % self.config.value('output_path'))
        deleted = 0
        ignored = 0

        for filename in listdir(self.config.value('output_path')):
            if path.isfile(self.config.value('output_path') + filename):
                extension = '.' + filename.split('.')[-1]
                if extension == '.html' or extension in self.config.value('approved_filetypes'):
                    remove(self.config.value('output_path') + filename)
                    deleted += 1
                else:
                    ignored += 1

        self.sitemap.clear()

        message = COLOUR_OK + ' --> ' + COLOUR_END + 'Deleted %s files, ignored %s'
        print(message % (deleted, ignored))

    @staticmethod
    def read_file(directory, filename):
        """ Helper method for reading files. """
        with open(directory + filename, 'r') as myfile:
            return myfile.read()
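# A hedged sketch of how the Website class above is typically driven. The config
# file name and the call order are assumptions based only on the methods shown.
site = Website('magnetizer.cfg')
site.wipe()               # clear previous output files and reset the sitemap
site.copy_resources()     # copy approved css/image files into the output directory
print(site.css_filename)  # css filename with the cache-busting hash appended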
def new_sitemap(self):
    """Create new Sitemap object with default settings"""
    return Sitemap(pretty_xml=self.pretty_xml)
if URL_STABILIZATION.search(file_read(page)):
    rePage = URL_STABILIZATION.sub("", file_read(page))
    link_href = re.compile('href=".*?"')
    for href in link_href.findall(rePage):
        # check for absolute links
        if not href[6:10] == "http":
            rePage = re.sub(href, 'href="' + PATH_site_name + href[6:], rePage)
    link_src = re.compile('src=".*?"')
    for src in link_src.findall(rePage):
        if not src[5:9] == "http":
            rePage = re.sub(src, 'src="' + PATH_site_name + src[5:], rePage)
    file_write(page, rePage)

# create the sitemap
sm = Sitemap(changefreq=SITEMAP_all_update)
for page in l_page_create:
    sm.add(PATH_site_name + page[7:])
file_write(PATH_site_out + "sitemap.xml", sm.get_as_string())
l_page_create.append(PATH_site_out + "sitemap.xml")

# create robots.txt
file_write(PATH_site_out + "robots.txt", ROBOT_TXT)
l_page_create.append(PATH_site_out + "robots.txt")

print("Content-Type: text/html\n\n")
print("<html>")
print("<br>Modules found for inclusion:<br>")
for page in os.listdir(PATH_module):
    print(page + "<br>")
print("<br>Pages created:<br>")
for page in l_page_create:
    print(page + "<br>")
def GET(self):
    m = Sitemap()
    m.add_url('http://evervolv.com', priority='1.0')
    m.add_url('http://evervolv.com/about/', priority='0.8')
    m.add_url('http://evervolv.com/chat/', priority='0.3')
    m.add_url('http://evervolv.com/devices/', priority='0.8')
    m.add_url('http://evervolv.com/news/', priority='0.3')
    m.add_url('http://evervolv.com/source/', priority='0.6')
    m.add_url('http://evervolv.com/features/', priority='0.6')
    for d in devices():
        m.add_url('http://evervolv.com/devices/%s' % d,
                  changefreq='daily', priority='0.1')
    return m.write()
usage = """\ python3 __main__.py https://www.femsense.com """ def exit_wrong_usage(): print(usage) sys.exit(1) def format_result(result: typing.Union[CheckResult, CheckResults]): symbol = "✅" if result.valid else "❌" return f"\t{symbol} {check.msg}" if __name__ == "__main__": logging.basicConfig(level=logging.INFO) if len(sys.argv) < 2: exit_wrong_usage() site = sys.argv[1] crawler = Crawler(site) sitemap = Sitemap(site) results = [*crawler.crawl()] print("Dere!") for n, check in enumerate(results): # if not check.valid: print(f"{n: 4d} {format_result(check)}")
def test_invalid_sitemap_input(self):
    with pytest.raises(TypeError):
        Sitemap(loc=self.loc, lastmod=123)
def test_happy_path(self):
    sitemap = Sitemap(loc=self.loc, lastmod=self.lastmod)
    assert isinstance(sitemap, Sitemap)
def from_url(cls, url):
    return Sitemap.from_url(url)
def main(args):
    is_release = "release" in args

    Path("../out/news").mkdir(parents=True, exist_ok=True)
    copytree("./news", "../out/news", ignore=ignore_patterns("*.md"))

    news_list = find_news()

    env = Environment(
        loader=PackageLoader("generate", "./templates"),
        autoescape=select_autoescape(["html", "xml"]),
    )

    news_items = [
        {
            "title": Markup(news.render_title_link("/" + news.path.strip("/") + "/")),
            "date": format_date(news.date),
            "path": news.path,
        }
        for news in news_list
    ]

    sitemap = Sitemap(BASE_URL)
    sitemap.add_url("/")
    sitemap.add_url("/news/")

    for news in news_list:
        render_template(
            env,
            "news.html",
            f"../out/{news.path.strip('/')}/index.html",
            release=is_release,
            meta_title=news.render_title(),
            meta_description=news.description,
            meta_canonical=f'{BASE_URL.rstrip("/")}/{news.path.strip("/")}/',
            content=Markup(news.html),
            date=format_date(news.date),
            other_news=[on for on in news_items if news.path != on["path"]][:3],
        )
        sitemap.add_url(news.path)

    render_template(
        env,
        "news-index.html",
        "../out/news/index.html",
        release=is_release,
        news=news_items,
        meta_title="Новости",
        meta_description=(
            "Новости либертарианства и Либертарианской Партии России в Екатеринбурге и Свердловской области"
        ),
        meta_canonical=f'{BASE_URL.rstrip("/")}/news/',
    )

    render_template(
        env,
        "home.html",
        "../out/index.html",
        release=is_release,
        news=news_items[:3],
        meta_description=(
            "Выступаем за свободную экономику, независимое местное самоуправление, "
            "суверенитет личности и против цензуры в интернете. Присоединяйся!"
        ),
    )

    render_template(env, "sitemap.xml", "../out/sitemap.xml", urls=sitemap.urls)

    copytree("./static", "../out")