Exemple #1
0
def main():
    """Main driver: fetch a page, log its content and links, optionally save a sitemap."""
    args = commandline_args_setup()

    # Handle the verbose option.  Plain truthiness is used instead of
    # `is True`, which would break if argparse ever returned a non-bool.
    if args.verbose:
        # Set up a logging handler for the command line.
        console = logging.StreamHandler()  # stream=sys.stdout
        console.setLevel(logging.INFO)
        formatter = logging.Formatter('%(funcName)s: %(message)s')
        console.setFormatter(formatter)
        # Add the handler to the dedicated 'verbose' logger and lower its
        # level so .info() records are actually emitted.
        verbose = logging.getLogger('verbose')
        verbose.setLevel(logging.INFO)
        verbose.addHandler(console)
        verbose.info('Running verbose.')

    base_url = args.base_url
    session = parse.Session()
    html = session.html_at_url(base_url)

    # This web page is defined as a URL and some HTML.
    page = parse.Page(base_url, html)
    logging.getLogger('verbose').info(page.content)

    links = page.links
    logging.getLogger('verbose').info(links)

    # Store the sitemap file if the user opted in.
    sm_arg = args.sitemap_dest
    to_save = _prompt_sitemap_save(base_url, sm_arg)
    if to_save:
        sm = Sitemap(links)
        sm.export(sm_arg)
Exemple #2
0
def main():
    """Main driver: fetch a page, log its content and links, optionally save a sitemap."""
    args = commandline_args_setup()

    # Handle the verbose option (plain truthiness, not `is True`).
    if args.verbose:
        # Set up a logging handler to the command line.
        console = logging.StreamHandler()  # stream=sys.stdout
        console.setLevel(logging.INFO)
        # '{'-style placeholders require style='{'; the default '%' style
        # would emit '{funcName}: {message}' literally instead of the values.
        formatter = logging.Formatter('{funcName}: {message}', style='{')
        console.setFormatter(formatter)
        # Add the handler to the verbose logger; the logger level must also
        # be lowered to INFO or .info() records are filtered out by the
        # default WARNING level.
        verbose = logging.getLogger('verbose')
        verbose.setLevel(logging.INFO)
        verbose.addHandler(console)
        verbose.info('Running verbose.')

    base_url = args.base_url
    session = parse.Session()
    html = session.html_at_url(base_url)

    # This web page is defined as a URL and some HTML.
    page = parse.Page(base_url, html)
    logging.getLogger('verbose').info(page.content)

    links = page.links
    logging.getLogger('verbose').info(links)

    # Store the sitemap file if the user opted in.
    sm_arg = args.sitemap_dest
    to_save = _prompt_sitemap_save(base_url, sm_arg)
    if to_save:
        sm = Sitemap(links)
        sm.export(sm_arg)
    def index_as_xml(self, **kwargs):
        """Serialize this list as sitemapindex XML.

        Keyword arguments are forwarded to the Sitemap constructor.
        """
        self.default_capability_and_modified()
        serializer = Sitemap(**kwargs)
        return serializer.resources_as_xml(self, sitemapindex=True)
Exemple #4
0
    def __init__(self, config_file_name):
        """Load the configuration, build sitemap and template, then refresh."""
        self.config = Config(config_file_name)
        self.sitemap = Sitemap(self.config.value('website_base_url'))
        # The outer template wraps every generated page.
        template_file = self.config.value('template_path') + Website.WEBSITE_TEMPLATE
        self.template = Template(self.tag['content'], template_file)
        self.refresh()
Exemple #5
0
    def as_xml(self, **kwargs):
        """Serialize this list as a single XML document.

        Lists too large for one XML document are not supported by this
        method; keyword arguments are forwarded to the Sitemap constructor.
        """
        self.default_capability_and_modified()
        serializer = Sitemap(**kwargs)
        return serializer.resources_as_xml(self, sitemapindex=self.sitemapindex)
Exemple #6
0
    def write(self, basename="/tmp/resynclist.xml", **kwargs):
        """Write a single sitemap or sitemapindex XML document to `basename`.

        Must be overridden to support multi-file lists.  Keyword arguments
        are forwarded to the Sitemap constructor.
        """
        self.default_capability_and_modified()
        s = Sitemap(**kwargs)
        # `with` guarantees the file is closed even if serialization raises;
        # the original leaked the handle on error.
        with open(basename, "w") as fh:
            s.resources_as_xml(self, fh=fh, sitemapindex=self.sitemapindex)
Exemple #7
0
 def write_zip(self, inventory=None, dumpfile=None):
     """Write a ZIP dump file containing the inventory manifest plus all files."""
     compression = (ZIP_DEFLATED if self.compress else ZIP_STORED)
     zf = ZipFile(dumpfile, mode="w", compression=compression, allowZip64=True)
     try:
         # Write the inventory manifest first so readers can locate it.
         s = Sitemap(pretty_xml=True, allow_multifile=False)
         zf.writestr('manifest.xml', s.inventory_as_xml(inventory))
         # Add all files in the inventory.
         for resource in inventory:
             zf.write(resource.uri)
     finally:
         # Always close so the archive is flushed even if a write fails.
         zf.close()
     zipsize = os.path.getsize(dumpfile)
     # Parenthesized single-argument print is valid under both Python 2
     # (where the original bare `print` statement lived) and Python 3.
     print("Wrote ZIP file dump %s with size %d bytes" % (dumpfile, zipsize))
Exemple #8
0
 def write_zip(self, inventory=None, dumpfile=None):
     """Write a ZIP dump file containing the resource manifest plus all files."""
     compression = (ZIP_DEFLATED if self.compress else ZIP_STORED)
     zf = ZipFile(dumpfile, mode="w", compression=compression, allowZip64=True)
     try:
         # Write the inventory manifest first so readers can locate it.
         s = Sitemap(pretty_xml=True, allow_multifile=False)
         zf.writestr('manifest.xml', s.resources_as_xml(inventory))
         # Add all files in the inventory.
         for resource in inventory:
             zf.write(resource.uri)
     finally:
         # Always close so the archive is flushed even if a write fails.
         zf.close()
     zipsize = os.path.getsize(dumpfile)
     # Parenthesized single-argument print is valid under both Python 2
     # (where the original bare `print` statement lived) and Python 3.
     print("Wrote ZIP file dump %s with size %d bytes" % (dumpfile, zipsize))
Exemple #9
0
    def parse(self, uri=None, fh=None, **kwargs):
        """Parse a single XML document (sitemap) into this list.

        The sitemapindex+sitemaps ResourceList case is not handled here.
        Either `uri` or an already-open `fh` must be supplied.
        """
        if uri is not None:
            try:
                fh = URLopener().open(uri)
            except IOError as e:
                raise Exception("Failed to load sitemap/sitemapindex from %s (%s)" % (uri, str(e)))
        if fh is None:
            raise Exception("Nothing to parse")
        sitemap = Sitemap(**kwargs)
        sitemap.parse_xml(fh=fh, resources=self,
                          capability=self.capability_md, sitemapindex=False)
        # Remember whether the parsed document turned out to be an index.
        self.parsed_index = sitemap.parsed_index
    def read(self, uri=None, resources=None, capability=None, index_only=False):
        """Read sitemap from a URI including handling sitemapindexes

        If index_only is True then individual sitemaps references in a sitemapindex
        will not be read. This will result in no resources being returned and is
        useful only to read the metadata and links listed in the sitemapindex.

        Includes the subtlety that if the input URI is a local file and is a 
        sitemapindex which contains URIs for the individual sitemaps, then these
        are mapped to the filesystem also.

        NOTE(review): the `resources` and `capability` parameters are unused
        in this body — confirm whether they are intended for subclasses.
        """
        # Fetch the document; failure is re-raised as IOError with the URI.
        try:
            fh = URLopener().open(uri)
            self.num_files += 1
        except IOError as e:
            raise IOError("Failed to load sitemap/sitemapindex from %s (%s)" % (uri,str(e)))
        # Get the Content-Length if we can (works fine for local files)
        try:
            self.content_length = int(fh.info()['Content-Length'])
            self.bytes_read += self.content_length
            self.logger.debug( "Read %d bytes from %s" % (self.content_length,uri) )
        except KeyError:
            # If we don't get a length then c'est la vie
            self.logger.debug( "Read ????? bytes from %s" % (uri) )
            pass
        self.logger.info( "Read sitemap/sitemapindex from %s" % (uri) )
        # Parse the document directly into self.resources.
        s = Sitemap()
        s.parse_xml(fh=fh,resources=self,capability='resourcelist')
        # what did we read? sitemap or sitemapindex?
        if (s.parsed_index):
            # sitemapindex
            if (not self.allow_multifile):
                raise ListBaseIndexError("Got sitemapindex from %s but support for sitemapindex disabled" % (uri))
            self.logger.info( "Parsed as sitemapindex, %d sitemaps" % (len(self.resources)) )
            sitemapindex_is_file = self.is_file_uri(uri)
            if (index_only):
                # don't read the component sitemaps
                self.sitemapindex = True
                return
            # now loop over all entries to read each sitemap and add to resources
            # (the index entries are moved aside and replaced with a fresh
            # container that the component sitemaps are accumulated into)
            sitemaps = self.resources
            self.resources = self.resources_class()
            self.logger.info( "Now reading %d sitemaps" % len(sitemaps.uris()) )
            for sitemap_uri in sorted(sitemaps.uris()):
                self.read_component_sitemap(uri,sitemap_uri,s,sitemapindex_is_file)
        else:
            # sitemap
            self.logger.info( "Parsed as sitemap, %d resources" % (len(self.resources)) )
Exemple #11
0
def test_sitemap():
    """Test Sitemap: appending pages, index.html handling, and file output."""

    base_url = 'https://example.com'

    # Randomized page names avoid collisions between test runs.
    page1 = 'example-%s.html' % randint(0, 999999)
    page2 = 'example-%s.html' % randint(0, 999999)
    page3 = 'example-%s.html' % randint(0, 999999)

    sitemap = Sitemap(base_url)

    # New sitemap should include no pages
    assert not sitemap.pages

    sitemap.append(page1)

    # after appending one page, sitemap should contain that page and only that page
    assert len(sitemap.pages) == 1
    assert sitemap.pages[0] == "%s/%s" % (base_url, page1)

    sitemap.append(page2)
    sitemap.append(page3)

    # after appending two more pages, sitemap should contain 3 pages
    assert len(sitemap.pages) == 3
    assert sitemap.pages[0] == "%s/%s" % (base_url, page1)
    assert sitemap.pages[1] == "%s/%s" % (base_url, page2)
    assert sitemap.pages[2] == "%s/%s" % (base_url, page3)

    sitemap.append('index.html')

    # index.html should be included as '/', without 'index.html'
    assert len(sitemap.pages) == 4
    assert sitemap.pages[3] == "%s/" % base_url

    sitemap.write(TEST_WEBSITE.config.value('output_path'))

    with open(TEST_WEBSITE.config.value('output_path') + 'sitemap.txt',
              'r') as my_file:
        sitemap_from_file = my_file.read().splitlines()

    # sitemap written to file should contain all 4 pages
    # (bug fix: assert on the file contents, not the in-memory list again)
    assert len(sitemap_from_file) == 4
    assert sitemap_from_file[0] == "%s/%s" % (base_url, page1)
    assert sitemap_from_file[1] == "%s/%s" % (base_url, page2)
    assert sitemap_from_file[2] == "%s/%s" % (base_url, page3)
    assert sitemap_from_file[3] == "%s/" % base_url
    def write(self, basename='/tmp/sitemap.xml', **kwargs):
        """Write one or a set of sitemap files to disk

        resources is a ResourceContainer that may be an ResourceList or
        a ChangeList. This may be a generator so data is read as needed
        and length is determined at the end.

        basename is used as the name of the single sitemap file or the
        sitemapindex for a set of sitemap files.

        Uses self.max_sitemap_entries to determine whether the resource_list can
        be written as one sitemap. If there are more entries and
        self.allow_multifile is set true then a set of sitemap files,
        with an sitemapindex, will be written.
        """
        # Access resources through iterator only.
        resources_iter = iter(self.resources)
        # `carry` holds the first resource of the NEXT chunk (None when the
        # iterator is exhausted); the name `next` would shadow the builtin.
        (chunk, carry) = self.get_resources_chunk(resources_iter)
        s = Sitemap(**kwargs)
        if carry is not None:
            # Have more than self.max_sitemap_entries => sitemapindex.
            if not self.allow_multifile:
                raise ListBaseIndexError("Too many entries for a single sitemap but multifile disabled")
            # Work out how to name the sitemaps: insert %05d before a
            # trailing ".xml", else append the counter and ".xml".
            sitemap_prefix = basename
            sitemap_suffix = '.xml'
            if basename[-4:] == '.xml':
                sitemap_prefix = basename[:-4]
            # Use iterator over all resources and count off sets of
            # max_sitemap_entries to go into each sitemap, storing the
            # names of the sitemaps as we go.
            sitemaps = ListBase()
            while len(chunk) > 0:
                # `filename` avoids shadowing the `file` builtin.
                filename = sitemap_prefix + ("%05d" % (len(sitemaps))) + sitemap_suffix
                self.logger.info("Writing sitemap %s..." % (filename))
                # `with` closes the file even if serialization raises.
                with open(filename, 'w') as f:
                    s.resources_as_xml(chunk, fh=f)
                # Record information about this sitemap for the index.
                r = Resource(uri=self.mapper.dst_to_src(filename),
                             path=filename,
                             timestamp=os.stat(filename).st_mtime,
                             md5=compute_md5_for_file(filename))
                sitemaps.add(r)
                # Get next chunk.
                (chunk, carry) = self.get_resources_chunk(resources_iter, carry)
            self.logger.info("Wrote %d sitemaps" % (len(sitemaps)))
            with open(basename, 'w') as f:
                self.logger.info("Writing sitemapindex %s..." % (basename))
                s.resources_as_xml(resources=sitemaps, sitemapindex=True, fh=f)
            self.logger.info("Wrote sitemapindex %s" % (basename))
        else:
            with open(basename, 'w') as f:
                self.logger.info("Writing sitemap %s..." % (basename))
                s.resources_as_xml(chunk, fh=f)
            self.logger.info("Wrote sitemap %s" % (basename))
Exemple #13
0
    def get(self, url, inventory=None):
        """Fetch an inventory from `url`.

        Either creates a new Inventory object or adds to the one supplied.
        """
        # Use the inventory passed in, or start a fresh one.
        if inventory is None:
            inventory = Inventory()

        fh = URLopener().open(url)
        Sitemap().inventory_parse_xml(fh=fh, inventory=inventory)
        return inventory
Exemple #14
0
    def from_robots(cls, robots_url):
        """Build Sitemap objects from the `Sitemap:` entries of a robots.txt.

        Raises:
            cls.RobotsLoadError: when robots.txt returns an HTTP error status.
            cls.RobotsParseError: when no Sitemap: entries are found.
        """
        try:
            response = requests.get(robots_url, timeout=30)
            # requests does not raise for HTTP error codes on its own, so the
            # original except clause was dead; raise_for_status() turns
            # 4xx/5xx responses into requests.HTTPError.
            response.raise_for_status()
        except requests.HTTPError as e:
            raise cls.RobotsLoadError("robots.txt from {} return {} status code".format(robots_url, e.response.status_code))

        # Use the decoded text: `.content` is bytes, which would not match a
        # str regex pattern under Python 3.
        robots_txt = response.text
        sitemaps = cls.ROBOTS_SITEMAP_RE.findall(robots_txt)
        if not sitemaps:
            raise cls.RobotsParseError("robots.txt from {} sitemaps not found".format(robots_url))

        # De-duplicate after stripping surrounding whitespace.
        sitemaps = {s.strip() for s in sitemaps}

        return [Sitemap.from_url(url) for url in sitemaps]
Exemple #15
0
class Website:
    """ Class representing a website

    Wraps the site configuration, sitemap and outer HTML template, and
    provides helpers for copying resources and wiping generated output.
    """

    # Outer HTML template that wraps every generated page.
    WEBSITE_TEMPLATE = '_website_template.html'

    # Per-page-type templates.
    LISTING_PAGE_TEMPLATE_FILENAME = '_listing_page_template.html'
    ARTICLE_PAGE_TEMPLATE_FILENAME = '_article_page_template.html'
    STATIC_PAGE_TEMPLATE_FILENAME = '_static_page_template.html'

    # Footer fragments appended to article/static items (read by refresh()).
    ARTICLE_ITEM_FOOTER_TEMPLATE_FILENAME = "_article_item_footer_template.html"
    STATIC_ITEM_FOOTER_TEMPLATE_FILENAME = "_static_item_footer_template.html"

    def __init__(self, config_file_name):
        # Load the config, then derive the sitemap and outer template.
        # `self.tag` is the class attribute defined below __init__; the
        # class body executes before any instance exists, so this is safe.
        self.config = Config(config_file_name)
        self.sitemap = Sitemap(self.config.value('website_base_url'))
        self.template = Template(
            self.tag['content'],
            self.config.value('template_path') + Website.WEBSITE_TEMPLATE)
        self.refresh()

    # Placeholder HTML comments substituted during page generation.
    tag = {
        'content': '<!-- MAGNETIZER_CONTENT -->',
        'page_content': '<!-- MAGNETIZER_PAGE_CONTENT -->',
        'meta': '<!-- MAGNETIZER_META -->',
        'item_footer': '<!-- MAGNETIZER_ITEM_FOOTER -->',
        'date': '<!-- MAGNETIZER_DATE -->',
        'page_class': '<!-- MAGNETIZER_PAGE_CLASS -->',
        'pagination': '<!-- MAGNETIZER_PAGINATION -->',
        'cc_here': '<!-- MAGNETIZER_CC -->',
        'break': '<!-- BREAK -->',
        'noindex': '<!-- NOINDEX -->',
        'creative_commons': '<!-- CC -->'
    }

    def refresh(self):
        """ Run as part of init to populate the Website elements

        - reads item footer templates
        - calculates cache bursting checksum for css filename
        """

        self.article_footer_html = Website.read_file(
            self.config.value('template_path'),
            self.ARTICLE_ITEM_FOOTER_TEMPLATE_FILENAME)
        self.static_footer_html = Website.read_file(
            self.config.value('template_path'),
            self.STATIC_ITEM_FOOTER_TEMPLATE_FILENAME)

        # Append an MD5 of the CSS contents as a query string so browsers
        # re-fetch the stylesheet whenever it changes (cache busting).
        css_contents = Website.read_file(
            self.config.value('resources_path'),
            self.config.value('website_css_filename'))
        css_hash = hashlib.md5(bytes(css_contents,
                                     encoding='utf-8')).hexdigest()
        self.css_filename = self.config.value(
            'website_css_filename') + '?' + css_hash

    def include(self, filename):
        """ Reads the contents of an include file

        Parameters:
        - filename

        Returns:
        - the contents of the include file or an error message
        """

        if path.isfile(path.join(self.config.value('template_path'),
                                 filename)):
            return Website.read_file(self.config.value('template_path'),
                                     filename)

        # Missing include: warn on the console and embed an error marker
        # in the generated output instead of raising.
        print(COLOUR_ERROR + ' (!) ' + COLOUR_END +
              "Include '%s' does not exist!" % filename)
        return "[ ERROR: Include '%s' does not exist! ]" % filename

    def copy_resources(self):
        """ Copies resource files from the resources directory to the output directory,
        e.g. css, images etc. Files not included in approved_filetypes will be ignored.

        """

        print("Copying resources --> %s " % self.config.value('output_path'))
        copied = 0
        ignored = 0

        for filename in listdir(self.config.value('resources_path')):

            if path.isfile(self.config.value('resources_path') + filename):

                # Only copy files whose extension is explicitly approved.
                extension = filename.split('.')[-1]

                if '.' + extension in self.config.value('approved_filetypes'):
                    shutil.copyfile(
                        self.config.value('resources_path') + filename,
                        self.config.value('output_path') + filename)
                    copied += 1
                else:
                    ignored += 1

        message = COLOUR_OK + ' --> ' + COLOUR_END + 'Copied %s files, ignored %s'
        print(message % (copied, ignored))

    def wipe(self):
        """ Delete the files from the output directory, typically when regenerating the site.
        Ignores files not ending with .html or not in approved_filetypes.
        """

        print('Deleting previous files from %s ' %
              self.config.value('output_path'))
        deleted = 0
        ignored = 0

        for filename in listdir(self.config.value('output_path')):

            if path.isfile(self.config.value('output_path') + filename):
                extension = '.' + filename.split('.')[-1]

                # Delete generated pages and approved resource types only.
                if extension == '.html' or extension in self.config.value(
                        'approved_filetypes'):
                    remove(self.config.value('output_path') + filename)
                    deleted += 1
                else:
                    ignored += 1

        # The sitemap is rebuilt on regeneration, so clear it too.
        self.sitemap.clear()

        message = COLOUR_OK + ' --> ' + COLOUR_END + 'Deleted %s files, ignored %s'
        print(message % (deleted, ignored))

    @staticmethod
    def read_file(directory, filename):
        """ Helper method for reading files.

        NOTE(review): opens with the platform default encoding — confirm
        whether UTF-8 should be forced for template/CSS files.
        """

        with open(directory + filename, 'r') as myfile:
            return myfile.read()
Exemple #16
0
 def new_sitemap(self):
     """Build a fresh Sitemap configured with this object's pretty_xml flag."""
     sitemap = Sitemap(pretty_xml=self.pretty_xml)
     return sitemap
Exemple #17
0
    # NOTE(review): this `if` appears to be the body of a loop over `page`
    # whose header lies outside this excerpt.
    if (URL_STABILIZATION.search(file_read(page))):
        rePage = URL_STABILIZATION.sub("", file_read(page))
        link_href = re.compile('href=".*?"')
        for href in link_href.findall(rePage):
            # Check for absolute links: prefix relative href targets with
            # the site base path, leaving http(s) URLs untouched.
            if (not href[6:10] == "http"):
                rePage = re.sub(href, 'href="' + PATH_site_name + href[6:],
                                rePage)
        link_src = re.compile('src=".*?"')
        for src in link_src.findall(rePage):
            # Same rewrite for src attributes.
            if (not src[5:9] == "http"):
                rePage = re.sub(src, 'src="' + PATH_site_name + src[5:],
                                rePage)
        file_write(page, rePage)
# Create the sitemap (Python 2 CGI script; bare `print` statements below).
sm = Sitemap(changefreq=SITEMAP_all_update)
for page in l_page_create:
    sm.add(PATH_site_name + page[7:])
file_write(PATH_site_out + "sitemap.xml", sm.get_as_string())
l_page_create.append(PATH_site_out + "sitemap.xml")
# Create robots.txt
file_write(PATH_site_out + "robots.txt", ROBOT_TXT)
l_page_create.append(PATH_site_out + "robots.txt")
# Emit a small HTML report listing the modules found and pages created.
print "Content-Type: text/html\n\n"
print "<html>"
print "<br>Модулей найдено для подключения:<br>"
for page in os.listdir(PATH_module):
    print page + "<br>"
print "<br>Страниц создано:<br>"
for page in l_page_create:
    print page + "<br>"
Exemple #18
0
 def GET(self):
     """Render the sitemap: fixed site pages plus one entry per device."""
     sitemap = Sitemap()
     # Static pages with their crawl priorities, added in a fixed order.
     static_pages = [
         ('http://evervolv.com', '1.0'),
         ('http://evervolv.com/about/', '0.8'),
         ('http://evervolv.com/chat/', '0.3'),
         ('http://evervolv.com/devices/', '0.8'),
         ('http://evervolv.com/news/', '0.3'),
         ('http://evervolv.com/source/', '0.6'),
         ('http://evervolv.com/features/', '0.6'),
     ]
     for url, weight in static_pages:
         sitemap.add_url(url, priority=weight)
     # Device pages change often but matter little individually.
     for d in devices():
         sitemap.add_url('http://evervolv.com/devices/%s' % d,
                         changefreq='daily', priority='0.1')
     return sitemap.write()
Exemple #19
0
usage = """\
python3 __main__.py https://www.femsense.com
"""


def exit_wrong_usage():
    """Print the usage banner and terminate with exit status 1."""
    # Equivalent to print(usage) followed by sys.exit(1): sys.exit simply
    # raises SystemExit with its argument.
    sys.stdout.write(usage + "\n")
    raise SystemExit(1)


def format_result(result: "typing.Union[CheckResult, CheckResults]") -> str:
    """Format a check result as a tab-indented line with a pass/fail symbol.

    Bug fix: the original read the module-global `check` (leaked by the
    __main__ loop) instead of the `result` parameter, so it only worked by
    accident when called from that loop.
    """
    symbol = "✅" if result.valid else "❌"
    return f"\t{symbol} {result.msg}"


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Require exactly one positional argument: the site URL to check.
    if len(sys.argv) < 2:
        exit_wrong_usage()

    site = sys.argv[1]

    crawler = Crawler(site)
    # NOTE(review): `sitemap` is created but never used below — confirm intent.
    sitemap = Sitemap(site)

    results = [*crawler.crawl()]
    print("Dere!")  # NOTE(review): looks like leftover debug output — confirm.
    for n, check in enumerate(results):
        # if not check.valid:
        print(f"{n: 4d} {format_result(check)}")
Exemple #20
0
 def test_invalid_sitemap_input(self):
     """A non-string `lastmod` (here an int) must raise TypeError."""
     with pytest.raises(TypeError):
         Sitemap(loc=self.loc, lastmod=123)
Exemple #21
0
 def test_happy_path(self):
     """Constructing with valid loc/lastmod yields a Sitemap instance."""
     sitemap = Sitemap(loc=self.loc, lastmod=self.lastmod)
     assert isinstance(sitemap, Sitemap)
Exemple #22
0
 def from_url(cls, url):
     """Thin wrapper: delegate construction to Sitemap.from_url."""
     return Sitemap.from_url(url)
Exemple #23
0
def main(args):
    """Generate the static site: news pages, index pages, sitemap, resources.

    Passing "release" in `args` enables release mode for the templates.
    """
    is_release = "release" in args

    # Ensure the news output directory exists and copy its static assets;
    # markdown sources are skipped because they are rendered below instead.
    Path("../out/news").mkdir(parents=True, exist_ok=True)
    copytree("./news", "../out/news", ignore=ignore_patterns("*.md"))

    news_list = find_news()

    env = Environment(
        loader=PackageLoader("generate", "./templates"),
        autoescape=select_autoescape(["html", "xml"]),
    )

    # Summaries used on listing pages; Markup marks pre-rendered HTML safe.
    news_items = [
        {
            "title": Markup(news.render_title_link("/" + news.path.strip("/") + "/")),
            "date": format_date(news.date),
            "path": news.path,
        }
        for news in news_list
    ]

    sitemap = Sitemap(BASE_URL)
    sitemap.add_url("/")
    sitemap.add_url("/news/")

    # Render one page per news item and register it in the sitemap.
    for news in news_list:
        render_template(
            env,
            "news.html",
            f"../out/{news.path.strip('/')}/index.html",
            release=is_release,
            meta_title=news.render_title(),
            meta_description=news.description,
            meta_canonical=f'{BASE_URL.rstrip("/")}/{news.path.strip("/")}/',
            content=Markup(news.html),
            date=format_date(news.date),
            # Up to three other items, excluding the current one.
            other_news=[on for on in news_items if news.path != on["path"]][:3],
        )

        sitemap.add_url(news.path)

    render_template(
        env,
        "news-index.html",
        "../out/news/index.html",
        release=is_release,
        news=news_items,
        meta_title="Новости",
        meta_description=(
            "Новости либертарианства и Либертарианской Партии России в Екатеринбурге и Свердловской области"
        ),
        meta_canonical=f'{BASE_URL.rstrip("/")}/news/',
    )

    render_template(
        env,
        "home.html",
        "../out/index.html",
        release=is_release,
        news=news_items[:3],
        meta_description=(
            "Выступаем за свободную экономику, независимое местное самоуправление, "
            "суверенитет личности и против цензуры в интернете. Присоединяйся!"
        ),
    )

    # The sitemap template receives the accumulated URL list.
    render_template(env, "sitemap.xml", "../out/sitemap.xml", urls=sitemap.urls)

    # NOTE(review): copytree into an existing directory fails before
    # Python 3.8 (no dirs_exist_ok here) — confirm "../out" handling.
    copytree("./static", "../out")