Beispiel #1
0
def test_parse_datetimes():
    """Timezone-aware ISO 8601 strings parse into aware datetimes."""
    def check(text, naive, offset):
        # Verify both the wall-clock time and the UTC offset of the result.
        parsed = mf2util.parse_datetime(text)
        assert parsed.replace(tzinfo=None) == naive
        assert parsed.utcoffset() == offset

    utc = timedelta(hours=0)
    pacific = timedelta(hours=-7)

    # waterpigs.co.uk -- utc time
    check('2014-05-10T10:48:28+00:00', datetime(2014, 5, 10, 10, 48, 28), utc)
    # same as above with Zulu time
    check('2014-05-10T10:48:28Z', datetime(2014, 5, 10, 10, 48, 28), utc)
    # snarfed.org -- pacific time
    check('2014-05-05T09:59:08-07:00', datetime(2014, 5, 5, 9, 59, 8), pacific)
    # same as above, no colon in tz
    check('2014-05-05T09:59:08-0700', datetime(2014, 5, 5, 9, 59, 8), pacific)

    # cannot read timezones by name
    with pytest.raises(ValueError):
        mf2util.parse_datetime('2013-07-04T11:22 PST')
Beispiel #2
0
def test_parse_datetimes():
    """parse_datetime handles the common timezone notations correctly."""
    cases = [
        # (input string, expected naive datetime, expected offset hours)
        ('2014-05-10T10:48:28+00:00',
         datetime(2014, 5, 10, 10, 48, 28), 0),   # waterpigs.co.uk -- utc
        ('2014-05-10T10:48:28Z',
         datetime(2014, 5, 10, 10, 48, 28), 0),   # Zulu time
        ('2014-05-05T09:59:08-07:00',
         datetime(2014, 5, 5, 9, 59, 8), -7),     # snarfed.org -- pacific
        ('2014-05-05T09:59:08-0700',
         datetime(2014, 5, 5, 9, 59, 8), -7),     # no colon in tz
    ]
    for text, naive, hours in cases:
        dt = mf2util.parse_datetime(text)
        assert dt.replace(tzinfo=None) == naive
        assert dt.utcoffset() == timedelta(hours=hours)

    # cannot read timezones by name
    with pytest.raises(ValueError):
        mf2util.parse_datetime('2013-07-04T11:22 PST')
Beispiel #3
0
def test_parse_dates():
    """Bare dates parse to date objects; malformed dates raise ValueError."""
    good = {
        '2014-04-27': date(2014, 4, 27),
        '2014-9-2': date(2014, 9, 2),      # single-digit month/day accepted
        '1982-11-24': date(1982, 11, 24),
    }
    for text, expected in good.items():
        assert mf2util.parse_datetime(text) == expected

    for bad in ('2014-24-11',   # day/month switched
                '14-09-27'):    # 2-character year
        with pytest.raises(ValueError):
            mf2util.parse_datetime(bad)
Beispiel #4
0
def test_parse_dates():
    """Date-only strings become date objects; invalid forms raise."""
    assert date(1982, 11, 24) == mf2util.parse_datetime('1982-11-24')
    assert date(2014, 4, 27) == mf2util.parse_datetime('2014-04-27')
    # single-digit month and day are accepted
    assert date(2014, 9, 2) == mf2util.parse_datetime('2014-9-2')

    # day/month switched
    with pytest.raises(ValueError):
        mf2util.parse_datetime('2014-24-11')

    # 2-character year
    with pytest.raises(ValueError):
        mf2util.parse_datetime('14-09-27')
Beispiel #5
0
def test_parse_datetimes_no_tz():
    """Datetimes without a timezone parse to naive datetime objects."""
    expectations = [
        # tantek.com -- no seconds, no timezone
        ('2014-05-09T17:53', datetime(2014, 5, 9, 17, 53)),
        # same as above without 'T'
        ('2014-05-09 17:53', datetime(2014, 5, 9, 17, 53)),
        # Homebrew Website Club
        ('2014-04-23T18:30', datetime(2014, 4, 23, 18, 30)),
    ]
    for text, expected in expectations:
        assert mf2util.parse_datetime(text) == expected

    # hour only
    with pytest.raises(ValueError):
        mf2util.parse_datetime('2012-09-01T12')

    # invalid hour minute
    with pytest.raises(ValueError):
        mf2util.parse_datetime('2014-04-23T30:90')
Beispiel #6
0
def _populate_cache_for_url(url, ctx):
    """Populate the post cache for each selected silo with entries from url.

    Parses the page at `url` as microformats2, walks its h-entry items, and
    records each entry's URL in the cache for every silo selected by the
    command-line arguments.  When ``ctx.args.until`` is given, entries
    published after that date are skipped.

    Args:
      url: string, the page to parse for mf2 entries
      ctx: command context; must provide ``silos``, ``args`` and ``cache``
    """
    import mf2util
    import dateutil.parser

    silos = get_named_silos(ctx.silos, ctx.args.silo)

    # Optional cut-off: only cache entries published on or before this date.
    until_dt = None
    if ctx.args.until:
        until_dt = dateutil.parser.parse(ctx.args.until).date()
        logger.debug("Populating cache until: %s" % until_dt)

    mf_obj = parse_mf2(url)
    mf_dict = mf_obj.to_dict()
    for entry in mf_dict.get('items', []):
        entry_props = entry.get('properties')
        if not entry_props:
            logger.warning("Found entry without any properties.")
            continue

        entry_url = entry_props.get('url')
        if not entry_url:
            logger.warning("Found entry without any URL.")
            continue

        # mf2 property values are usually lists; use the first URL.
        if isinstance(entry_url, list):
            entry_url = entry_url[0]

        if until_dt:
            entry_published = entry_props.get('published')
            if not entry_published:
                # Without a published date we can't compare against the
                # cut-off, so skip the entry.
                logger.warning("Entry '%s' has no published date." %
                               entry_url)
                continue

            if isinstance(entry_published, list):
                entry_published = entry_published[0]

            entry_published_dt = mf2util.parse_datetime(entry_published)
            if entry_published_dt and entry_published_dt.date() > until_dt:
                continue

        logger.debug("Adding entry to cache: %s" % entry_url)
        for silo in silos:
            ctx.cache.addPost(silo.name, entry_url)
Beispiel #7
0
def test_parse_datetimes_no_tz():
    """Naive datetime strings (no tz offset) parse to naive datetimes."""
    parse = mf2util.parse_datetime

    # tantek.com -- no seconds, no timezone
    assert parse('2014-05-09T17:53') == datetime(2014, 5, 9, 17, 53)
    # same as above without 'T'
    assert parse('2014-05-09 17:53') == datetime(2014, 5, 9, 17, 53)
    # Homebrew Website Club
    assert parse('2014-04-23T18:30') == datetime(2014, 4, 23, 18, 30)

    for invalid in ('2012-09-01T12',      # hour only
                    '2014-04-23T30:90'):  # invalid hour minute
        with pytest.raises(ValueError):
            parse(invalid)
Beispiel #8
0
def from_activities(activities,
                    actor=None,
                    title=None,
                    feed_url=None,
                    home_page_url=None,
                    hfeed=None):
    """Converts ActivityStreams activities to an RSS 2.0 feed.

  Args:
    activities: sequence of ActivityStreams activity dicts
    actor: ActivityStreams actor dict, the author of the feed
    title: string, the feed title
    feed_url: string, the URL for this RSS feed
    home_page_url: string, the home page URL
    hfeed: dict, parsed mf2 h-feed, if available

  Returns:
    unicode string with RSS 2.0 XML
  """
    # Reject non-iterables up front with a clearer message than feedgen's.
    try:
        iter(activities)
    except TypeError:
        raise TypeError('activities must be iterable')

    # dicts and strings are iterable but would produce nonsense feed items.
    if isinstance(activities, (dict, str)):
        raise TypeError('activities may not be a dict or string')

    fg = FeedGenerator()
    fg.id(feed_url)
    assert feed_url
    fg.link(href=feed_url, rel='self')
    if home_page_url:
        fg.link(href=home_page_url, rel='alternate')
    # TODO: parse language from lang attribute:
    # https://github.com/microformats/mf2py/issues/150
    fg.language('en')
    fg.generator('granary', uri='https://granary.io/')

    hfeed = hfeed or {}
    actor = actor or {}
    # Feed image: prefer the h-feed's photo, fall back to the actor's image.
    image = (util.get_url(hfeed.get('properties', {}), 'photo')
             or util.get_url(actor, 'image'))
    if image:
        fg.image(image)

    # Feed description and title are required by RSS 2.0; '-' is the
    # placeholder when neither content nor summary is available.
    props = hfeed.get('properties') or {}
    content = microformats2.get_text(util.get_first(props, 'content', ''))
    summary = util.get_first(props, 'summary', '')
    desc = content or summary or '-'
    fg.description(desc)  # required
    fg.title(title or util.ellipsize(desc))  # required

    latest = None  # most recent published datetime, for lastBuildDate
    feed_has_enclosure = False
    for activity in activities:
        obj = activity.get('object') or activity
        # Skip bare person objects; they aren't feed entries.
        if obj.get('objectType') == 'person':
            continue

        item = fg.add_entry()
        url = obj.get('url')
        id = obj.get('id') or url
        item.id(id)
        item.link(href=url)
        item.guid(url, permalink=True)

        # title (required)
        title = (obj.get('title') or obj.get('displayName')
                 or util.ellipsize(obj.get('content', '-')))
        # strip HTML tags
        title = util.parse_html(title).get_text('').strip()
        item.title(title)

        content = microformats2.render_content(obj,
                                               include_location=True,
                                               render_attachments=True,
                                               render_image=True)
        if not content:
            content = obj.get('summary')
        if content:
            # CDATA keeps the rendered HTML intact inside the XML.
            item.content(content, type='CDATA')

        # Tags become RSS categories, except reactions/shares and
        # mention-like object types.
        categories = [
            {
                'term': t['displayName']
            } for t in obj.get('tags', [])
            if t.get('displayName') and t.get('verb') not in ('like', 'react',
                                                              'share')
            and t.get('objectType') not in ('article', 'person', 'mention')
        ]
        item.category(categories)

        author = obj.get('author', {})
        author = {
            'name': author.get('displayName') or author.get('username'),
            'uri': author.get('url'),
            # RSS requires an email for the author element; '-' is a stub.
            'email': author.get('email') or '-',
        }
        item.author(author)

        published = obj.get('published') or obj.get('updated')
        if published and isinstance(published, str):
            try:
                dt = mf2util.parse_datetime(published)
                # parse_datetime may return a bare date; promote to datetime.
                if not isinstance(dt, datetime):
                    dt = datetime.combine(dt, time.min)
                # feedgen requires timezone-aware datetimes.
                if not dt.tzinfo:
                    dt = dt.replace(tzinfo=util.UTC)
                item.published(dt)
                if not latest or dt > latest:
                    latest = dt
            except ValueError:  # bad datetime string
                pass

        # RSS allows at most one enclosure per item; take the first
        # audio/video attachment and log-skip the rest.
        item_has_enclosure = False
        for att in obj.get('attachments', []):
            stream = util.get_first(att, 'stream') or att
            if not stream:
                continue

            url = stream.get('url') or ''
            mime = mimetypes.guess_type(url)[0] or ''
            if (att.get('objectType') in ENCLOSURE_TYPES
                    or mime and mime.split('/')[0] in ENCLOSURE_TYPES):
                if item_has_enclosure:
                    logging.info(
                        'Warning: item %s already has an RSS enclosure, skipping additional enclosure %s',
                        id, url)
                    continue

                item_has_enclosure = feed_has_enclosure = True
                item.enclosure(url=url,
                               type=mime,
                               length=str(stream.get('size', '')))
                item.load_extension('podcast')
                duration = stream.get('duration')
                if duration:
                    item.podcast.itunes_duration(duration)

    # NOTE(review): `author` and `categories` below are leftovers from the
    # last loop iteration, so the podcast metadata reflects the final item
    # only — looks intentional-ish but worth confirming upstream.
    if feed_has_enclosure:
        fg.load_extension('podcast')
        fg.podcast.itunes_author(
            actor.get('displayName') or actor.get('username'))
        if summary:
            fg.podcast.itunes_summary(summary)
        fg.podcast.itunes_explicit('no')
        fg.podcast.itunes_block(False)
        name = author.get('name')
        if name:
            fg.podcast.itunes_author(name)
        if image:
            fg.podcast.itunes_image(image)
        fg.podcast.itunes_category(categories)

    if latest:
        fg.lastBuildDate(latest)

    return fg.rss_str(pretty=True).decode('utf-8')
Beispiel #9
0
def test_none():
    """parse_datetime passes None through instead of raising."""
    result = mf2util.parse_datetime(None)
    assert result is None
Beispiel #10
0
def _interpret_common_properties(
    parsed,
    source_url,
    base_href,
    hentry,
    use_rel_syndication,
    want_json,
    fetch_mf2_func,
):
    """Interpret the properties an h-entry shares with other microformats.

    Args:
      parsed: dict, the full parsed mf2 document
      source_url: string, URL the document was fetched from
      base_href: string or None, base href for resolving relative paths
      hentry: dict, the microformat item to interpret
      use_rel_syndication: bool, merge rel=syndication links into the
        entry's syndication property
      want_json: bool, if True keep date properties as strings instead of
        parsing them into date/datetime objects
      fetch_mf2_func: callable used by find_author to fetch remote mf2

    Returns:
      dict of interpreted properties
    """
    result = {}
    props = hentry["properties"]

    # Single-valued plain-text properties.
    # BUG FIX: the tuple previously read ("featured" "logo") -- implicit
    # string concatenation collapsed the two keys into "featuredlogo", so
    # neither "featured" nor "logo" was ever extracted.
    for prop in ("url", "uid", "photo", "featured", "logo"):
        value = util.get_plain_text(props.get(prop))
        if value:
            result[prop] = value

    # Date-like properties: always keep the raw string; additionally parse
    # into a date/datetime unless the caller asked for JSON-safe values.
    for prop in ("start", "end", "published", "updated", "deleted"):
        date_str = util.get_plain_text(props.get(prop))
        if date_str:
            if want_json:
                result[prop] = date_str
            else:
                result[prop + "-str"] = date_str
                try:
                    date = util.parse_datetime(date_str)
                    if date:
                        result[prop] = date
                except ValueError:
                    util.logging.warn("Failed to parse datetime %s", date_str)

    author = util.find_author(parsed, source_url, hentry, fetch_mf2_func)
    if author:
        result["author"] = author

    # Content: e-content is a dict with "html" and "value"; a bare string
    # is used for both. HTML gets its relative URLs made absolute.
    content_prop = props.get("content")
    content_value = None
    if content_prop:
        if isinstance(content_prop[0], dict):
            content_html = content_prop[0].get("html", "").strip()
            content_value = content_prop[0].get("value", "").strip()
        else:
            content_value = content_html = content_prop[0]
        result["content"] = util.convert_relative_paths_to_absolute(
            source_url, base_href, content_html
        )
        result["content-plain"] = content_value

    summary_prop = props.get("summary")
    if summary_prop:
        if isinstance(summary_prop[0], dict):
            result["summary"] = summary_prop[0]["value"]
        else:
            result["summary"] = summary_prop[0]

    # Collect location objects, then follow this algorithm to consolidate
    # their properties:
    # //indieweb.org/location#How_to_determine_the_location_of_a_microformat
    location_stack = [props]

    for prop in "location", "adr":
        vals = props.get(prop)
        if vals:
            if isinstance(vals[0], util.string_type):
                location_stack.append({"name": vals})
            else:
                location_stack.append(vals[0].get("properties", {}))

    geo = props.get("geo")
    if geo:
        if isinstance(geo[0], dict):
            location_stack.append(geo[0].get("properties", {}))
        else:
            if geo[0].startswith("geo:"):
                # a geo: URL. try to parse it.
                # //tools.ietf.org/html/rfc5870
                parts = geo[0][len("geo:") :].split(";")[0].split(",")
                if len(parts) >= 2:
                    location_stack.append(
                        {
                            "latitude": [parts[0]],
                            "longitude": [parts[1]],
                            "altitude": [parts[2]] if len(parts) >= 3 else [],
                        }
                    )

    # First location object on the stack that has a given property wins;
    # the entry's own "name" is excluded so titles don't become locations.
    for prop in util.LOCATION_PROPERTIES:
        for obj in location_stack:
            if obj and obj.get(prop) and not (obj == props and prop == "name"):
                result.setdefault("location", {})[prop] = obj[prop][0]

    if use_rel_syndication:
        result["syndication"] = list(
            set(
                parsed.get("rels", {}).get("syndication", [])
                + hentry["properties"].get("syndication", [])
            )
        )
    else:
        result["syndication"] = hentry["properties"].get("syndication", [])

    # TODO patch start
    checkin_prop = props.get("checkin")
    if checkin_prop:
        if isinstance(checkin_prop[0], dict):
            # BUG FIX: this previously rebound `props`, which made the
            # category lookup below read the checkin's properties instead
            # of the entry's own.
            checkin_props = checkin_prop[0]["properties"]
            result["checkin"] = {"name": checkin_props["name"][0]}
            try:
                result.update(
                    {
                        "latitude": checkin_props["latitude"][0],
                        "longitude": checkin_props["longitude"][0],
                    }
                )
            except KeyError:
                pass
        else:
            result["checkin"] = checkin_prop[0]

    categories = props.get("category")
    if categories:
        result["category"] = categories
    # TODO patch end

    return result
Beispiel #11
0
def from_activities(activities, actor=None, title=None, feed_url=None,
                    home_page_url=None, hfeed=None):
  """Converts ActivityStreams activities to an RSS 2.0 feed.

  Args:
    activities: sequence of ActivityStreams activity dicts
    actor: ActivityStreams actor dict, the author of the feed
    title: string, the feed title
    feed_url: string, the URL for this RSS feed
    home_page_url: string, the home page URL
    hfeed: dict, parsed mf2 h-feed, if available

  Returns:
    unicode string with RSS 2.0 XML
  """
  # Reject non-iterables up front with a clearer message than feedgen's.
  try:
    iter(activities)
  except TypeError:
    raise TypeError('activities must be iterable')

  # NOTE(review): `basestring` implies this is Python 2 code; on Python 3
  # this line raises NameError. Dicts/strings are iterable but would
  # produce nonsense feed items, hence the explicit rejection.
  if isinstance(activities, (dict, basestring)):
    raise TypeError('activities may not be a dict or string')

  fg = FeedGenerator()
  fg.id(feed_url)
  assert feed_url
  fg.link(href=feed_url, rel='self')
  if home_page_url:
    fg.link(href=home_page_url, rel='alternate')
  # TODO: parse language from lang attribute:
  # https://github.com/microformats/mf2py/issues/150
  fg.language('en')
  fg.generator('granary', uri='https://granary.io/')

  hfeed = hfeed or {}
  actor = actor or {}
  # Feed image: prefer the h-feed's image, fall back to the actor's.
  image = util.get_url(hfeed, 'image') or util.get_url(actor, 'image')
  if image:
    fg.image(image)

  # Description and title are required by RSS 2.0; '-' is the placeholder
  # when neither content nor summary is available.
  props = hfeed.get('properties') or {}
  content = microformats2.get_text(util.get_first(props, 'content', ''))
  summary = util.get_first(props, 'summary', '')
  desc = content or summary or '-'
  fg.description(desc)  # required
  fg.title(title or util.ellipsize(desc))  # required

  latest = None  # most recent published datetime, for lastBuildDate
  enclosures = False
  for activity in activities:
    obj = activity.get('object') or activity
    # Skip bare person objects; they aren't feed entries.
    if obj.get('objectType') == 'person':
      continue

    item = fg.add_entry()
    url = obj.get('url')
    item.id(obj.get('id') or url)
    item.link(href=url)
    item.guid(url, permalink=True)

    item.title(obj.get('title') or obj.get('displayName') or '-')  # required
    content = microformats2.render_content(
      obj, include_location=True, render_attachments=False) or obj.get('summary')
    if content:
      # CDATA keeps the rendered HTML intact inside the XML.
      item.content(content, type='CDATA')

    # Tags become RSS categories, excluding reaction/share verbs.
    item.category(
      [{'term': t['displayName']} for t in obj.get('tags', [])
       if t.get('displayName') and t.get('verb') not in ('like', 'react', 'share')])

    author = obj.get('author', {})
    item.author({
      'name': author.get('displayName') or author.get('username'),
      'uri': author.get('url'),
    })

    published = obj.get('published') or obj.get('updated')
    if published:
      try:
        dt = mf2util.parse_datetime(published)
        # parse_datetime may return a bare date; promote to datetime.
        if not isinstance(dt, datetime):
          dt = datetime.combine(dt, time.min)
        # feedgen requires timezone-aware datetimes.
        if not dt.tzinfo:
          dt = dt.replace(tzinfo=util.UTC)
        item.published(dt)
        if not latest or dt > latest:
          latest = dt
      except ValueError:  # bad datetime string
        pass


    # Audio/video attachments become RSS enclosures.
    for att in obj.get('attachments', []):
      stream = util.get_first(att, 'stream') or att
      if not stream:
        continue

      url = stream.get('url') or ''
      mime = mimetypes.guess_type(url)[0] or ''
      if (att.get('objectType') in ENCLOSURE_TYPES or
          mime and mime.split('/')[0] in ENCLOSURE_TYPES):
        enclosures = True
        # 'REMOVEME' is a placeholder for the required length attribute;
        # it is stripped from the serialized XML at the bottom.
        item.enclosure(url=url, type=mime, length='REMOVEME') # TODO: length (bytes)

        item.load_extension('podcast')
        duration = stream.get('duration')
        if duration:
          item.podcast.itunes_duration(duration)

  if enclosures:
    fg.load_extension('podcast')
    fg.podcast.itunes_author(actor.get('displayName') or actor.get('username'))
    if summary:
      fg.podcast.itunes_summary(summary)
    fg.podcast.itunes_explicit('no')
    fg.podcast.itunes_block(False)

  if latest:
    fg.lastBuildDate(latest)

  # Strip the placeholder enclosure length injected above.
  return fg.rss_str(pretty=True).decode('utf-8').replace(' length="REMOVEME"', '')
Beispiel #12
0
def test_none():
    """A None input yields None rather than raising an exception."""
    assert None is mf2util.parse_datetime(None)