Exemple #1
0
    def test_readfile_path_metadata_implicit_dates(self):
        # With DEFAULT_DATE='fs', both 'date' and 'modified' are expected to
        # equal the fixture file's modification time on disk.
        fixture = 'article_with_metadata_implicit_dates.html'
        page = self.read_file(path=fixture, DEFAULT_DATE='fs')

        file_mtime = SafeDatetime.fromtimestamp(
            os.stat(_path(fixture)).st_mtime)
        expected = {'date': file_mtime, 'modified': file_mtime}

        self.assertDictHasSubset(page.metadata, expected)
Exemple #2
0
def path_metadata(full_path, source_path, settings=None):
    """Return metadata derived from a file's path and the given settings.

    :param full_path: filesystem path that is stat'ed when
        ``DEFAULT_DATE`` is ``"fs"``.
    :param source_path: key looked up in ``EXTRA_PATH_METADATA``.
    :param settings: mapping of settings; when falsy, no metadata is
        produced.
    :return: a new dict. It holds ``"date"`` when ``DEFAULT_DATE == "fs"``,
        updated with the ``EXTRA_PATH_METADATA`` entry for ``source_path``
        (so an explicit entry overrides the filesystem date).
    """
    metadata = {}
    if settings:
        if settings.get("DEFAULT_DATE", None) == "fs":
            # Use the modification time, not st_ctime: on POSIX, ctime is
            # the inode-change time and moves on chmod/rename/checkout,
            # which does not reflect when the content was last written.
            metadata["date"] = SafeDatetime.fromtimestamp(os.stat(full_path).st_mtime)
        metadata.update(settings.get("EXTRA_PATH_METADATA", {}).get(source_path, {}))
    return metadata
Exemple #3
0
def path_metadata(full_path, source_path, settings=None):
    """Build the metadata implied by a file's path.

    When ``settings`` is falsy an empty dict is returned. Otherwise the
    result holds a ``'date'`` taken from the file's modification time if
    ``DEFAULT_DATE`` is ``'fs'``, updated with whatever
    ``EXTRA_PATH_METADATA`` declares for ``source_path``.
    """
    if not settings:
        return {}

    result = {}
    if settings.get('DEFAULT_DATE') == 'fs':
        mtime = os.stat(full_path).st_mtime
        result['date'] = SafeDatetime.fromtimestamp(mtime)

    # Explicit per-path entries are applied last, so they win over the
    # filesystem-derived date.
    extra = settings.get('EXTRA_PATH_METADATA', {})
    result.update(extra.get(source_path, {}))
    return result
Exemple #4
0
def path_metadata(full_path, source_path, settings=None):
    """Collect metadata that depends only on where a file lives.

    :param full_path: path stat'ed for the modification time when
        ``DEFAULT_DATE`` is ``'fs'``.
    :param source_path: key used to look up ``EXTRA_PATH_METADATA``.
    :param settings: settings mapping; a falsy value yields ``{}``.
    :return: a new dict of metadata.
    """
    found = {}
    if settings:
        use_fs_date = settings.get('DEFAULT_DATE', None) == 'fs'
        if use_fs_date:
            stat_result = os.stat(full_path)
            found['date'] = SafeDatetime.fromtimestamp(stat_result.st_mtime)

        # Per-path overrides are merged after the filesystem date.
        overrides_by_path = settings.get('EXTRA_PATH_METADATA', {})
        overrides = overrides_by_path.get(source_path, {})
        found.update(overrides)
    return found
Exemple #5
0
def path_metadata(full_path, source_path, settings=None):
    """Collect path-derived metadata, including parent-directory entries.

    :param full_path: path stat'ed for the modification time when
        ``DEFAULT_DATE`` is ``'fs'``.
    :param source_path: path matched against ``EXTRA_PATH_METADATA`` keys.
    :param settings: settings mapping; a falsy value yields ``{}``.
    :return: a new dict of metadata.
    """
    if not settings:
        return {}

    collected = {}
    if settings.get('DEFAULT_DATE', None) == 'fs':
        mtime = os.stat(full_path).st_mtime
        collected['date'] = SafeDatetime.fromtimestamp(mtime)

    # Walk EXTRA_PATH_METADATA keys in sorted order so that an entry for a
    # parent directory is applied before the entry for anything beneath it;
    # the most specific path therefore wins on conflicting keys.
    extra_path_metadata = settings.get('EXTRA_PATH_METADATA', {})
    for candidate in sorted(extra_path_metadata):
        # A trailing separator stops "foo" from matching "foobar/...".
        as_directory = os.path.join(candidate, '')
        is_exact = source_path == candidate
        is_parent = source_path.startswith(as_directory)
        if is_exact or is_parent:
            collected.update(extra_path_metadata[candidate])

    return collected
def tumblr2fields(api_key, blogname):
    """Import Tumblr posts (API v2), yielding one tuple of fields per post.

    Posts are fetched page by page, advancing the offset by the size of the
    previous page, until a page comes back empty.

    :param api_key: Tumblr API key, interpolated into the request URL.
    :param blogname: blog name; ``<blogname>.tumblr.com`` is queried.
    :return: generator of ``(title, content, slug, date, blog_name,
        [post_type], tags, status, kind, post_format)`` tuples, where
        ``date`` is formatted as ``%Y-%m-%d %H:%M:%S`` and ``slug`` is
        prefixed with the post date.
    """
    try:
        # py3k import
        import json
    except ImportError:
        # py2 import
        import simplejson as json

    try:
        # py3k import
        import urllib.request as urllib_request
    except ImportError:
        # py2 import
        import urllib2 as urllib_request

    def get_tumblr_posts(api_key, blogname, offset=0):
        """Fetch the page of raw posts starting at ``offset``."""
        url = ("http://api.tumblr.com/v2/blog/%s.tumblr.com/"
               "posts?api_key=%s&offset=%d&filter=raw") % (blogname, api_key,
                                                           offset)
        request = urllib_request.Request(url)
        handle = urllib_request.urlopen(request)
        try:
            posts = json.loads(handle.read().decode('utf-8'))
        finally:
            # Release the connection even if reading or decoding fails.
            handle.close()
        return posts.get('response').get('posts')

    offset = 0
    posts = get_tumblr_posts(api_key, blogname, offset)
    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    while posts:
        for post in posts:
            title = \
                post.get('title') or \
                post.get('source_title') or \
                post.get('type').capitalize()
            slug = post.get('slug') or slugify(title, regex_subs=subs)
            tags = post.get('tags')
            # Convert the timestamp once; it feeds both the date and the slug.
            posted = SafeDatetime.fromtimestamp(int(post.get('timestamp')))
            date = posted.strftime("%Y-%m-%d %H:%M:%S")
            slug = posted.strftime("%Y-%m-%d-") + slug
            post_format = post.get('format')
            # Default content; the typed branches below override it.
            content = post.get('body')
            post_type = post.get('type')
            if post_type == 'photo':
                if post_format == 'markdown':
                    fmtstr = '![%s](%s)'
                else:
                    fmtstr = '<img alt="%s" src="%s" />'
                # One image tag per photo, newline-separated. Joining a
                # single formatted string would put a newline between every
                # character of the tag.
                content = '\n'.join(
                    fmtstr % (photo.get('caption'),
                              photo.get('original_size').get('url'))
                    for photo in post.get('photos'))
                content += '\n\n' + post.get('caption')
            elif post_type == 'quote':
                if post_format == 'markdown':
                    fmtstr = '\n\n&mdash; %s'
                else:
                    fmtstr = '<p>&mdash; %s</p>'
                content = post.get('text') + fmtstr % post.get('source')
            elif post_type == 'link':
                if post_format == 'markdown':
                    fmtstr = '[via](%s)\n\n'
                else:
                    fmtstr = '<p><a href="%s">via</a></p>\n'
                content = fmtstr % post.get('url') + post.get('description')
            elif post_type == 'audio':
                if post_format == 'markdown':
                    fmtstr = '[via](%s)\n\n'
                else:
                    fmtstr = '<p><a href="%s">via</a></p>\n'
                content = fmtstr % post.get('source_url') + \
                    post.get('caption') + \
                    post.get('player')
            elif post_type == 'video':
                if post_format == 'markdown':
                    fmtstr = '[via](%s)\n\n'
                else:
                    fmtstr = '<p><a href="%s">via</a></p>\n'
                source = fmtstr % post.get('source_url')
                caption = post.get('caption')
                players = '\n'.join(
                    player.get('embed_code') for player in post.get('player'))
                content = source + caption + players
            elif post_type == 'answer':
                title = post.get('question')
                # The href takes the asker's URL and the link text takes the
                # asker's name.
                content = ('<p>'
                           '<a href="%s" rel="external nofollow">%s</a>'
                           ': %s'
                           '</p>\n'
                           ' %s' %
                           (post.get('asking_url'), post.get('asking_name'),
                            post.get('question'), post.get('answer')))

            content = content.rstrip() + '\n'
            kind = 'article'
            status = 'published'  # TODO: Find a way for draft posts

            yield (title, content, slug, date, post.get('blog_name'),
                   [post_type], tags, status, kind, post_format)

        offset += len(posts)
        posts = get_tumblr_posts(api_key, blogname, offset)
Exemple #7
0
    def parse(self):
        """Import Tumblr posts (API v2), yielding one tuple per post.

        Pages are requested through ``self._get_tumblr_posts(offset)``,
        advancing the offset by the size of the previous page, until a page
        comes back empty.

        :return: generator of ``(title, content, slug, date, blog_name,
            [post_type], tags, status, kind, post_format)`` tuples, where
            ``date`` is formatted as ``%Y-%m-%d %H:%M:%S`` and ``slug`` is
            prefixed with the post date.
        """
        offset = 0
        posts = self._get_tumblr_posts(offset)
        settings = read_settings()
        subs = settings["SLUG_REGEX_SUBSTITUTIONS"]
        while posts:
            for post in posts:
                title = (
                    post.get("title")
                    or post.get("source_title")
                    or post.get("type").capitalize()
                )
                slug = post.get("slug") or slugify(title, regex_subs=subs)
                tags = post.get("tags")
                # Convert the timestamp once; it feeds both date and slug.
                posted = SafeDatetime.fromtimestamp(
                    int(post.get("timestamp"))
                )
                date = posted.strftime("%Y-%m-%d %H:%M:%S")
                slug = posted.strftime("%Y-%m-%d-") + slug
                post_format = post.get("format")
                # Default content; the typed branches below override it.
                content = post.get("body")
                post_type = post.get("type")
                if post_type == "photo":
                    if post_format == "markdown":
                        fmtstr = "![%s](%s)"
                    else:
                        fmtstr = '<img alt="%s" src="%s" />'
                    # One image tag per photo, newline-separated. Joining a
                    # single formatted string would put a newline between
                    # every character of the tag.
                    content = "\n".join(
                        fmtstr
                        % (
                            photo.get("caption"),
                            photo.get("original_size").get("url"),
                        )
                        for photo in post.get("photos")
                    )
                    content += "\n\n" + post.get("caption")
                elif post_type == "quote":
                    if post_format == "markdown":
                        fmtstr = "\n\n&mdash; %s"
                    else:
                        fmtstr = "<p>&mdash; %s</p>"
                    content = post.get("text") + fmtstr % post.get("source")
                elif post_type == "link":
                    if post_format == "markdown":
                        fmtstr = "[via](%s)\n\n"
                    else:
                        fmtstr = '<p><a href="%s">via</a></p>\n'
                    content = fmtstr % post.get("url") + post.get(
                        "description"
                    )
                elif post_type == "audio":
                    if post_format == "markdown":
                        fmtstr = "[via](%s)\n\n"
                    else:
                        fmtstr = '<p><a href="%s">via</a></p>\n'
                    content = (
                        fmtstr % post.get("source_url")
                        + post.get("caption")
                        + post.get("player")
                    )
                elif post_type == "video":
                    if post_format == "markdown":
                        fmtstr = "[via](%s)\n\n"
                    else:
                        fmtstr = '<p><a href="%s">via</a></p>\n'
                    source = fmtstr % post.get("source_url")
                    caption = post.get("caption")
                    players = "\n".join(
                        player.get("embed_code")
                        for player in post.get("player")
                    )
                    content = source + caption + players
                elif post_type == "answer":
                    title = post.get("question")
                    # The href takes the asker's URL and the link text takes
                    # the asker's name.
                    content = (
                        "<p>"
                        '<a href="%s" rel="external nofollow">%s</a>'
                        ": %s"
                        "</p>\n"
                        " %s"
                        % (
                            post.get("asking_url"),
                            post.get("asking_name"),
                            post.get("question"),
                            post.get("answer"),
                        )
                    )

                content = content.rstrip() + "\n"
                kind = "article"
                status = "published"  # TODO: Find a way for draft posts

                yield (
                    title,
                    content,
                    slug,
                    date,
                    post.get("blog_name"),
                    [post_type],
                    tags,
                    status,
                    kind,
                    post_format,
                )

            offset += len(posts)
            posts = self._get_tumblr_posts(offset)
Exemple #8
0
def tumblr2fields(api_key, blogname):
    """Import Tumblr posts (API v2), yielding one tuple of fields per post.

    Posts are fetched page by page, advancing the offset by the size of the
    previous page, until a page comes back empty.

    :param api_key: Tumblr API key, interpolated into the request URL.
    :param blogname: blog name; ``<blogname>.tumblr.com`` is queried.
    :return: generator of ``(title, content, slug, date, blog_name,
        [post_type], tags, status, kind, post_format)`` tuples, where
        ``date`` is formatted as ``%Y-%m-%d %H:%M:%S`` and ``slug`` is
        prefixed with the post date.
    """
    try:
        # py3k import
        import json
    except ImportError:
        # py2 import
        import simplejson as json

    try:
        # py3k import
        import urllib.request as urllib_request
    except ImportError:
        # py2 import
        import urllib2 as urllib_request

    def get_tumblr_posts(api_key, blogname, offset=0):
        """Fetch the page of raw posts starting at ``offset``."""
        url = ("http://api.tumblr.com/v2/blog/%s.tumblr.com/"
               "posts?api_key=%s&offset=%d&filter=raw") % (
            blogname, api_key, offset)
        request = urllib_request.Request(url)
        handle = urllib_request.urlopen(request)
        try:
            posts = json.loads(handle.read().decode('utf-8'))
        finally:
            # Release the connection even if reading or decoding fails.
            handle.close()
        return posts.get('response').get('posts')

    offset = 0
    posts = get_tumblr_posts(api_key, blogname, offset)
    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    while posts:
        for post in posts:
            title = \
                post.get('title') or \
                post.get('source_title') or \
                post.get('type').capitalize()
            slug = post.get('slug') or slugify(title, regex_subs=subs)
            tags = post.get('tags')
            # Convert the timestamp once; it feeds both the date and the slug.
            posted = SafeDatetime.fromtimestamp(int(post.get('timestamp')))
            date = posted.strftime("%Y-%m-%d %H:%M:%S")
            slug = posted.strftime("%Y-%m-%d-") + slug
            post_format = post.get('format')
            # Default content; the typed branches below override it.
            content = post.get('body')
            post_type = post.get('type')
            if post_type == 'photo':
                if post_format == 'markdown':
                    fmtstr = '![%s](%s)'
                else:
                    fmtstr = '<img alt="%s" src="%s" />'
                # One image tag per photo, newline-separated. Joining a
                # single formatted string would put a newline between every
                # character of the tag.
                content = '\n'.join(
                    fmtstr % (photo.get('caption'),
                              photo.get('original_size').get('url'))
                    for photo in post.get('photos'))
                content += '\n\n' + post.get('caption')
            elif post_type == 'quote':
                if post_format == 'markdown':
                    fmtstr = '\n\n&mdash; %s'
                else:
                    fmtstr = '<p>&mdash; %s</p>'
                content = post.get('text') + fmtstr % post.get('source')
            elif post_type == 'link':
                if post_format == 'markdown':
                    fmtstr = '[via](%s)\n\n'
                else:
                    fmtstr = '<p><a href="%s">via</a></p>\n'
                content = fmtstr % post.get('url') + post.get('description')
            elif post_type == 'audio':
                if post_format == 'markdown':
                    fmtstr = '[via](%s)\n\n'
                else:
                    fmtstr = '<p><a href="%s">via</a></p>\n'
                content = fmtstr % post.get('source_url') + \
                    post.get('caption') + \
                    post.get('player')
            elif post_type == 'video':
                if post_format == 'markdown':
                    fmtstr = '[via](%s)\n\n'
                else:
                    fmtstr = '<p><a href="%s">via</a></p>\n'
                source = fmtstr % post.get('source_url')
                caption = post.get('caption')
                players = '\n'.join(player.get('embed_code')
                                    for player in post.get('player'))
                content = source + caption + players
            elif post_type == 'answer':
                title = post.get('question')
                # The href takes the asker's URL and the link text takes the
                # asker's name.
                content = ('<p>'
                           '<a href="%s" rel="external nofollow">%s</a>'
                           ': %s'
                           '</p>\n'
                           ' %s' % (post.get('asking_url'),
                                    post.get('asking_name'),
                                    post.get('question'),
                                    post.get('answer')))

            content = content.rstrip() + '\n'
            kind = 'article'
            status = 'published'  # TODO: Find a way for draft posts

            yield (title, content, slug, date, post.get('blog_name'),
                   [post_type], tags, status, kind, post_format)

        offset += len(posts)
        posts = get_tumblr_posts(api_key, blogname, offset)
Exemple #9
0
def to_date(s):
    """Convert the timestamp ``s`` to a ``SafeDatetime``.

    The time-of-day component is kept as-is; nothing is truncated to
    midnight.
    """
    return SafeDatetime.fromtimestamp(s)