Example #1
File: cron.py  Project: psbots/kuma
def build_sitemaps():
    sitemap_element = "<sitemap><loc>%s</loc><lastmod>%s</lastmod></sitemap>"
    sitemap_index = ("<sitemapindex xmlns=\"http://www.sitemaps.org/"
                     "schemas/sitemap/0.9\">")
    for locale in settings.MDN_LANGUAGES:
        # The second exclusion was masked ('******') in the scraped source;
        # reconstructed here as a second title-prefix exclude.
        queryset = (Document.objects.filter(
            is_template=False, locale=locale, is_redirect=False).exclude(
                title__startswith='User:').exclude(
                title__startswith='Talk:'))
        if len(queryset) > 0:
            info = {'queryset': queryset, 'date_field': 'modified'}
            sitemap = GenericSitemap(info, priority=0.5)
            urls = sitemap.get_urls(page=1)
            xml = smart_str(
                loader.render_to_string('wiki/sitemap.xml', {'urlset': urls}))
            xml = xml.replace('http://developer.mozilla.org',
                              'https://developer.mozilla.org')
            directory = '%s/sitemaps/%s' % (settings.MEDIA_ROOT, locale)
            if not os.path.exists(directory):
                os.makedirs(directory)
            f = open('%s/sitemap.xml' % directory, 'w')
            f.write(xml)
            f.close()

            sitemap_url = ("https://%s/sitemaps/%s/sitemap.xml" %
                           (Site.objects.get_current().domain, locale))
            sitemap_index = sitemap_index + sitemap_element % (
                sitemap_url,
                time.strftime('%Y-%m-%dT%H:%M:%S+00:00', time.gmtime()))

    sitemap_index = sitemap_index + "</sitemapindex>"
    index_file = open('%s/sitemap.xml' % settings.MEDIA_ROOT, 'w')
    index_file.write(parseString(sitemap_index).toxml())
    index_file.close()
Example #2
def build_sitemaps():
    sitemap_element = "<sitemap><loc>%s</loc><lastmod>%s</lastmod></sitemap>"
    sitemap_index = "<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"
    for locale in settings.MDN_LANGUAGES:
        # Part of the first exclusion was masked ('******') in the scraped
        # source; reconstructed here as a title prefix plus a redirect-title
        # regex.
        queryset = (Document.objects
                        .filter(is_template=False, locale=locale)
                        .exclude(title__startswith='User:')
                        .exclude(title__iregex=r'Redirect [0-9]+$')
                        .exclude(html__iregex=r'^(<p>)?(#)?REDIRECT')
                        .exclude(slug__icontains='Talk:')
                    )
        if len(queryset) > 0:
            info = {'queryset': queryset, 'date_field': 'modified'}
            sitemap = GenericSitemap(info, priority=0.5)
            urls = sitemap.get_urls(page=1)
            xml = smart_str(loader.render_to_string('sitemap.xml',
                                                    {'urlset': urls}))
            xml = xml.replace('http://', 'https://')
            directory = '%s/sitemaps/%s' % (settings.MEDIA_ROOT, locale)
            if not os.path.exists(directory):
                os.makedirs(directory)
            f = open('%s/sitemap.xml' % directory, 'w')
            f.write(xml)
            f.close()

            sitemap_url = ("https://%s/sitemaps/%s/sitemap.xml" % (
                Site.objects.get_current().domain, locale))
            sitemap_index = sitemap_index + sitemap_element % (sitemap_url,
                time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime()))

    sitemap_index = sitemap_index + "</sitemapindex>"
    index_file = open('%s/sitemap.xml' % settings.MEDIA_ROOT, 'w')
    index_file.write(parseString(sitemap_index).toxml())
    index_file.close()
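The two build_sitemaps() examples above assemble the sitemap index by plain string concatenation and then run the result through xml.dom.minidom before writing it out. A minimal, self-contained sketch of just that index-building step (the URL and timestamp below are placeholder values, not taken from either project):

from time import gmtime, strftime
from xml.dom.minidom import parseString

sitemap_element = "<sitemap><loc>%s</loc><lastmod>%s</lastmod></sitemap>"
sitemap_index = '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'

# One <sitemap> entry per per-locale file; the URL here is a placeholder.
sitemap_index += sitemap_element % (
    "https://example.com/sitemaps/en-US/sitemap.xml",
    strftime("%Y-%m-%dT%H:%M:%S+00:00", gmtime()),
)
sitemap_index += "</sitemapindex>"

# parseString() validates the concatenated XML; toxml() serialises it back out.
print(parseString(sitemap_index).toxml())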
Example #3
    def test_sitemap_item(self):
        """
        Check to make sure that the raw item is included with each
        Sitemap.get_url() url result.
        """
        user_sitemap = GenericSitemap({'queryset': User.objects.all()})
        def is_user(url):
            return isinstance(url['item'], User)
        item_in_url_info = all(map(is_user, user_sitemap.get_urls()))
        self.assertTrue(item_in_url_info)
Example #4
    def test_sitemap_item(self):
        """
        Check to make sure that the raw item is included with each
        Sitemap.get_url() url result.
        """
        test_sitemap = GenericSitemap({'queryset': TestModel.objects.all()})
        def is_testmodel(url):
            return isinstance(url['item'], TestModel)
        item_in_url_info = all(map(is_testmodel, test_sitemap.get_urls()))
        self.assertTrue(item_in_url_info)
Example #5
    def test_sitemap_item(self):
        """
        Check to make sure that the raw item is included with each
        Sitemap.get_url() url result.
        """
        test_sitemap = GenericSitemap({'queryset': TestModel.objects.order_by('pk').all()})

        def is_testmodel(url):
            return isinstance(url['item'], TestModel)
        item_in_url_info = all(map(is_testmodel, test_sitemap.get_urls()))
        self.assertTrue(item_in_url_info)
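The three tests above all assert the same thing: each dict returned by Sitemap.get_urls() carries the original model instance under the 'item' key, alongside 'location', 'lastmod', 'changefreq' and 'priority'. A minimal sketch of reading those entries outside a test (the model and date field are placeholder assumptions):

from django.contrib.sitemaps import GenericSitemap
from blog.models import Entry  # placeholder model

sitemap = GenericSitemap({"queryset": Entry.objects.all(), "date_field": "pub_date"})
# get_urls() needs django.contrib.sites installed (or an explicit site=...)
# to build absolute URLs.
for url_info in sitemap.get_urls():
    print(url_info["location"], url_info["lastmod"], url_info["item"])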
Example #6
def generate():
    sitemap = GenericSitemap({'queryset': models.Post.objects.filter(type__in=const.POST_TOPLEVEL).exclude(type=const.POST_BLOG), })
    urlset = sitemap.get_urls()
    text = loader.render_to_string('sitemap.xml', {'urlset': urlset})
    text = smart_str(text)
    site = Site.objects.get_current()
    fname = path(settings.EXPORT_DIR, 'sitemap.xml')
    print '*** writing sitemap for %s to %s' % (site, fname)
    fp = open(fname, 'wt')
    fp.write(text)
    fp.close()
    print '*** done'
Example #7
def generate_sitemap():
    sitemap = GenericSitemap({
        'queryset': Post.objects.filter(type__in=Post.TOP_LEVEL).exclude(type=Post.BLOG),
    })
    urlset = sitemap.get_urls()
    text = loader.render_to_string('sitemap.xml', {'urlset': urlset})
    text = smart_str(text)
    site = Site.objects.get_current()
    fname = path(settings.STATIC_ROOT, 'sitemap.xml')
    logger.info('*** writing sitemap for %s to %s' % (site, fname))
    fp = open(fname, 'wt')
    fp.write(text)
    fp.close()
    logger.info('*** done')
Example #8
def generate_sitemap():
    sitemap = GenericSitemap({
        'queryset':
        Post.objects.filter(type__in=Post.TOP_LEVEL).exclude(type=Post.BLOG),
    })
    urlset = sitemap.get_urls()
    text = loader.render_to_string('sitemap.xml', {'urlset': urlset})
    text = smart_str(text)
    site = Site.objects.get_current()
    fname = path(settings.STATIC_ROOT, 'sitemap.xml')
    logger.info('*** writing sitemap for %s to %s' % (site, fname))
    fp = open(fname, 'wt')
    fp.write(text)
    fp.close()
    logger.info('*** done')
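Functions like generate() and generate_sitemap() above are typically run from a cron job or a management command rather than inside the request cycle. A hedged sketch of a management-command wrapper (the command name and module layout are assumptions, not part of the original projects):

from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Regenerate the static sitemap.xml"

    def handle(self, *args, **options):
        # generate_sitemap() is the function from the examples above,
        # assumed to be importable in this project.
        generate_sitemap()
        self.stdout.write("sitemap written")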
Example #9
def build_locale_sitemap(locale):
    """
    For the given locale build the appropriate sitemap file and
    returns the locale, the file names written and timestamp of the
    build.
    """
    now = datetime.utcnow()
    timestamp = "%s+00:00" % now.replace(microsecond=0).isoformat()

    directory = os.path.join(settings.MEDIA_ROOT, "sitemaps", locale)
    if not os.path.isdir(directory):
        os.makedirs(directory)

    # Add any non-document URLs, which will always include the home page.
    other_urls = [
        {
            "location": absolutify(reverse("home", locale=locale)),
            "lastmod": None,
            "changefreq": None,
            "priority": None,
        }
    ]
    make = [("sitemap_other.xml", other_urls)]

    # We *could* use the `Document.objects.filter_for_list()` manager,
    # but its list of `.only()` columns isn't right for this, it hardcodes
    # a list of slug prefixes, and it forces an ORDER BY on 'slug', which
    # is slow and not needed in this context.
    queryset = Document.objects.filter(locale=locale, is_redirect=False,).exclude(
        html=""
    )
    # Be explicit about exactly only the columns we need.
    queryset = queryset.only("id", "locale", "slug", "modified")

    # The page-rendering logic does various checks on each document to
    # decide whether it should be excluded from robots, e.g. a Jinja
    # template does `{% if reasons... %}noindex, nofollow{% endif %}`.
    # Some of those evaluations are complex and depend on the request.
    # That's too complex to reproduce here, but we can at least do some
    # low-hanging-fruit filtering.
    queryset = queryset.exclude(current_revision__isnull=True,)
    q = Q(slug__startswith=EXPERIMENT_TITLE_PREFIX)
    for legacy_mindtouch_namespace in LEGACY_MINDTOUCH_NAMESPACES:
        q |= Q(slug__startswith="{}:".format(legacy_mindtouch_namespace))
    for slug_start in NOINDEX_SLUG_PREFIXES:
        q |= Q(slug__startswith=slug_start)
    queryset = queryset.exclude(q)

    # We have to make the queryset ordered. Otherwise the GenericSitemap
    # generator might throw this perfectly valid warning:
    #
    #    UnorderedObjectListWarning:
    #     Pagination may yield inconsistent results with an unordered
    #     object_list: <class 'kuma.wiki.models.Document'> QuerySet.
    #
    # Any order is fine. Use something that is definitely indexed; the
    # ordering is needed by the paginator that GenericSitemap uses.
    queryset = queryset.order_by("id")

    # To avoid an extra query just to see if the queryset is empty, start
    # iterating and create the sitemap files from whatever pages are found.
    # Note how we check that 'urls' is truthy before adding it.
    sitemap = GenericSitemap(
        {"queryset": queryset, "date_field": "modified"}, protocol="https", priority=0.5
    )
    for page in range(1, sitemap.paginator.num_pages + 1):
        urls = sitemap.get_urls(page=page)
        if page == 1:
            name = "sitemap.xml"
        else:
            name = "sitemap_%s.xml" % page
        if urls:
            make.append((name, urls))

    # Make the sitemap files.
    for name, urls in make:
        rendered = smart_str(render_to_string("wiki/sitemap.xml", {"urls": urls}))
        path = os.path.join(directory, name)
        with open(path, "w") as sitemap_file:
            sitemap_file.write(rendered)

    return locale, [name for name, _ in make], timestamp
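All of the examples above render GenericSitemap output to static files. For reference, the more common pattern is to wire GenericSitemap directly into a URLconf and let django.contrib.sitemaps serve it per request; a minimal sketch, with the model and date field as placeholder assumptions:

from django.contrib.sitemaps import GenericSitemap
from django.contrib.sitemaps.views import sitemap
from django.urls import path

from blog.models import Entry  # placeholder model

info_dict = {"queryset": Entry.objects.all(), "date_field": "pub_date"}

urlpatterns = [
    # Serves /sitemap.xml on demand instead of writing it to MEDIA_ROOT.
    path(
        "sitemap.xml",
        sitemap,
        {"sitemaps": {"blog": GenericSitemap(info_dict, priority=0.6)}},
        name="django.contrib.sitemaps.views.sitemap",
    ),
]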