Example #1
  def get(self):
    expected_inputs = ('activitystreams', 'html', 'json-mf2')
    input = util.get_required_param(self, 'input')
    if input not in expected_inputs:
      raise exc.HTTPBadRequest('Invalid input: %s, expected one of %r' %
                               (input, expected_inputs))
    url = util.get_required_param(self, 'url')

    # check if request is cached
    cache = self.request.get('cache', '').lower() != 'false'
    cache_key = 'U %s' % url
    cached = memcache.get(cache_key) if cache else None

    if cached:
      logging.info('Serving cached response %r', cache_key)
      url = cached['url']
      body = cached['body']
    else:
      # fetch url
      try:
        resp = util.urlopen(url)
      except (ValueError, httplib.InvalidURL) as e:
        self.abort(400, str(e))
        # other exceptions are handled by webutil.handlers.handle_exception(),
        # which uses interpret_http_exception(), etc.

      if url != resp.geturl():
        url = resp.geturl()
        logging.info('Redirected to %s', url)
      body = resp.read()

      if cache:
        logging.info('Caching response in %r', cache_key)
        memcache.set(cache_key, {'url': url, 'body': body}, URL_CACHE_TIME)

    # decode data
    mf2 = None
    if input == 'html':
      mf2 = mf2py.parse(doc=body, url=url)
    elif input == 'json-mf2':
      mf2 = json.loads(body)
      mf2.setdefault('rels', {})  # mf2util expects rels

    actor = None
    title = None
    if mf2:
      actor = microformats2.find_author(
        mf2, fetch_mf2_func=lambda url: mf2py.parse(url=url))
      title = mf2util.interpret_feed(mf2, url).get('name')

    if input == 'activitystreams':
      activities = json.loads(body)
    elif input == 'html':
      activities = microformats2.html_to_activities(body, url, actor)
    elif input == 'json-mf2':
      activities = [microformats2.json_to_object(item, actor=actor)
                    for item in mf2.get('items', [])]

    self.write_response(source.Source.make_activities_base_response(activities),
                        url=url, actor=actor, title=title)
Example #2
def html_to_atom(html, url=None, fetch_author=False, reader=True):
  """Converts microformats2 HTML to an Atom feed.

  Args:
    html: string
    url: string URL html came from, optional
    fetch_author: boolean, whether to make HTTP request to fetch rel-author link
    reader: boolean, whether the output will be rendered in a feed reader.
      Currently just includes location if True, not otherwise.

  Returns:
    unicode string with Atom XML
  """
  if fetch_author:
    assert url, 'fetch_author=True requires url!'

  parsed = mf2py.parse(doc=html, url=url)
  actor = microformats2.find_author(
    parsed, fetch_mf2_func=lambda url: mf2py.parse(url=url))

  return activities_to_atom(
    microformats2.html_to_activities(html, url, actor),
    actor,
    title=mf2util.interpret_feed(parsed, url).get('name'),
    xml_base=util.base_url(url),
    host_url=url,
    reader=reader)
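
For context, here is a minimal usage sketch for the html_to_atom helper above. It is an illustration only, not taken from the original project; the sample HTML and URL are hypothetical, and the granary helpers the function relies on are assumed to be importable.

# Hypothetical usage sketch (not from the original source): convert an
# h-entry page to an Atom feed string with the html_to_atom defined above.
html = '<article class="h-entry"><h1 class="p-name">Hello world</h1></article>'
atom_xml = html_to_atom(html, url='https://example.com/', fetch_author=False)
print(atom_xml)  # unicode string containing the Atom XML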
Example #3
def add_subscription(origin, feed_url, type, tags=None):
    feed = Feed.query.filter_by(feed=feed_url, type=type).first()

    if not feed:
        name = None
        if type == "html":
            flask.current_app.logger.debug("mf2py parsing %s", feed_url)
            resp = util.requests_get(feed_url)
            feed_text = resp.text if "charset" in resp.headers.get("content-type", "") else resp.content
            parsed = mf2util.interpret_feed(mf2py.parse(doc=feed_text, url=feed_url), feed_url)
            name = parsed.get("name")
        elif type == "xml":
            flask.current_app.logger.debug("feedparser parsing %s", feed_url)
            parsed = feedparser.parse(feed_url, agent=util.USER_AGENT)
            if parsed.feed:
                name = parsed.feed.get("title")
        else:
            flask.current_app.logger.error("unknown feed type %s", type)
            flask.abort(400)

        if not name:
            p = urllib.parse.urlparse(origin)
            name = p.netloc + p.path
        feed = Feed(name=name[:140], origin=origin, feed=feed_url, type=type)

    if feed:
        db.session.add(feed)

        flask_login.current_user.subscriptions.append(Subscription(feed=feed, name=feed.name, tags=tags))

        db.session.commit()
        # go ahead and update the feed
        tasks.q.enqueue(tasks.update_feed, feed.id)
    return feed
Example #4
def convert_mf2util():
    def dates_to_string(json):
        if isinstance(json, dict):
            return {k: dates_to_string(v) for (k, v) in json.items()}
        if isinstance(json, list):
            return [dates_to_string(v) for v in json]
        if isinstance(json, datetime.date) or isinstance(json, datetime.datetime):
            return json.isoformat()
        return json

    url = request.args.get('url')
    as_feed = request.args.get('as-feed')
    op = request.args.get('op')
    if url:
        try:
            d = mf2py.parse(url=url)
            if op == 'post-type-discovery':
                entry = mf2util.find_first_entry(d, ['h-entry', 'h-event'])
                return jsonify({'type': mf2util.post_type_discovery(entry)})
                
            if as_feed == 'true' or mf2util.find_first_entry(d, ['h-feed']):
                json = mf2util.interpret_feed(d, url)
            else:
                json = mf2util.interpret(d, url)
            return jsonify(dates_to_string(json))
        except:
            current_app.logger.exception('running mf2util service')
            return jsonify({'error': str(sys.exc_info()[0])})

    return """
Example #5
def get_title(mf2):
    """Returns the author of a page as a ActivityStreams actor dict.

  Args:
    mf2: dict, parsed mf2 object (ie return value from mf2py.parse())

  Returns: string title, possibly ellipsized
  """
    lines = mf2util.interpret_feed(mf2, '').get('name', '').splitlines()
    if lines:
        return util.ellipsize(lines[0])

    return ''
Example #6
def get_title(mf2):
  """Returns an mf2 object's title, ie its name.

  Args:
    mf2: dict, parsed mf2 object (ie return value from mf2py.parse())

  Returns: string title, possibly ellipsized
  """
  lines = mf2util.interpret_feed(mf2, '').get('name', '').splitlines()
  if lines:
    return util.ellipsize(lines[0])

  return ''
Example #7
def get_title(mf2):
    """Returns an mf2 object's title, ie its name.

  Args:
    mf2: dict, parsed mf2 object (ie return value from mf2py.parse())

  Returns: string title, possibly ellipsized
  """
    lines = mf2util.interpret_feed(mf2, '').get('name', '').splitlines()
    if lines:
        return util.ellipsize(lines[0])

    return ''
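
As a rough illustration of the get_title helper shown above (a sketch only; the sample markup and URL are hypothetical):

# Hypothetical usage sketch: parse a page with mf2py, then pull the feed title.
import mf2py

parsed = mf2py.parse(doc='<div class="h-feed"><h1 class="p-name">My Feed</h1></div>',
                     url='https://example.com/')
print(get_title(parsed))  # expected to print something like 'My Feed'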
Example #8
def process_html_feed_for_new_entries(feed, content, backfill, now):
    # strip noscript tags before parsing, since we definitely aren't
    # going to preserve js
    content = re.sub('</?noscript[^>]*>', '', content)

    parsed = mf2util.interpret_feed(
        mf2py.parse(url=feed.feed, doc=content), feed.feed)
    hfeed = parsed.get('entries', [])

    for hentry in hfeed:
        entry = hentry_to_entry(hentry, feed, backfill, now)
        if entry:
            current_app.logger.debug('built entry: %s', entry.permalink)
            yield entry
Example #9
def html_to_atom(html, url=None, **kwargs):
    """Converts microformats2 HTML to an Atom feed.

  Args:
    html: string
    url: string URL html came from, optional

  Returns: unicode string with Atom XML
  """
    parsed = mf2py.parse(doc=html, url=url)
    return activities_to_atom(microformats2.html_to_activities(html, url),
                              microformats2.find_author(parsed),
                              title=mf2util.interpret_feed(parsed,
                                                           url).get('name'),
                              xml_base=util.base_url(url),
                              host_url=url)
Example #10
    def get(self):
        expected_inputs = ('activitystreams', 'html', 'json-mf2', 'jsonfeed')
        input = util.get_required_param(self, 'input')
        if input not in expected_inputs:
            raise exc.HTTPBadRequest('Invalid input: %s, expected one of %r' %
                                     (input, expected_inputs))
        url, body = self._urlopen(util.get_required_param(self, 'url'))

        # decode data
        mf2 = None
        if input == 'html':
            mf2 = mf2py.parse(doc=body, url=url)
        elif input == 'json-mf2':
            mf2 = json.loads(body)
            mf2.setdefault('rels', {})  # mf2util expects rels

        actor = None
        title = None
        if mf2:

            def fetch_mf2_func(url):
                _, doc = self._urlopen(url)
                return mf2py.parse(doc=doc, url=url)

            actor = microformats2.find_author(mf2,
                                              fetch_mf2_func=fetch_mf2_func)
            title = mf2util.interpret_feed(mf2, url).get('name')

        if input == 'activitystreams':
            activities = json.loads(body)
        elif input == 'html':
            activities = microformats2.html_to_activities(body, url, actor)
        elif input == 'json-mf2':
            activities = [
                microformats2.json_to_object(item, actor=actor)
                for item in mf2.get('items', [])
            ]
        elif input == 'jsonfeed':
            activities, actor = jsonfeed.jsonfeed_to_activities(
                json.loads(body))

        self.write_response(
            source.Source.make_activities_base_response(activities),
            url=url,
            actor=actor,
            title=title)
Example #11
def find_possible_feeds(origin):
    # scrape an origin source to find possible alternative feeds
    try:
        resp = util.requests_get(origin)
    except requests.exceptions.RequestException as e:
        flask.flash("Error fetching source {}".format(repr(e)))
        flask.current_app.logger.warn("Subscribe failed for %s with error %s", origin, repr(e))
        return None

    feeds = []

    xml_feed_types = [
        "application/rss+xml",
        "application/atom+xml",
        "application/rdf+xml",
        "application/xml",
        "text/xml",
    ]
    xml_mime_types = xml_feed_types + ["text/xml", "text/rss+xml", "text/atom+xml"]

    content_type = resp.headers["content-type"]
    content_type = content_type.split(";", 1)[0].strip()
    if content_type in xml_mime_types:
        feeds.append({"origin": origin, "feed": origin, "type": "xml", "title": "untitled xml feed"})

    elif content_type == "text/html":
        parsed = mf2py.parse(doc=resp.text, url=origin)
        # if text/html, then parse and look for h-entries
        hfeed = mf2util.interpret_feed(parsed, origin)
        if hfeed.get("entries"):
            ftitle = hfeed.get("name") or "untitled h-feed"
            feeds.append({"origin": origin, "feed": resp.url, "type": "html", "title": ftitle[:140]})

        # look for link="feed"
        for furl in parsed.get("rels", {}).get("feed", []):
            fprops = parsed.get("rel-urls", {}).get(furl, {})
            if not fprops.get("type") or fprops.get("type") == "text/html":
                feeds.append({"origin": origin, "feed": furl, "type": "html", "title": fprops.get("title")})

        # then look for link rel="alternate"
        for link in parsed.get("alternates", []):
            if link.get("type") in xml_feed_types:
                feeds.append({"origin": origin, "feed": link.get("url"), "type": "xml", "title": link.get("title")})

    return feeds
Example #12
def test_h_feed_excludes_rel_syndication():
    """Represents a feed that (incorrectly) includes page-scoped
    rel=syndication values in the feed itself. If we're not careful,
    these values will be slurped into every entry in the feed.
    """
    parsed = {
        "items":[{
            "type": ["h-entry"], "properties": {
                "name": ["First Post"],
                "url": ["http://example.com/first-post"],
                "content": [{
                    "html": "This is the body of the first post",
                    "value": "This is the body of the first post",
                }],
                "syndication": [
                    "https://twitter.com/example_com/123456",
                    "https://www.facebook.com/example.com/123456",
                ],
            },
        }, {
            "type": ["h-event"], "properties": {
                "name": ["Second Post"],
                "url": ["http://example.com/second-post"],
                "content": [{
                    "html": "This is the body of the second post",
                    "value": "This is the body of the second post",
                }],
                "syndication": [
                    "https://twitter.com/example_com/7891011",
                    "https://www.facebook.com/example.com/7891011",
                ],
            },
        }], "rels": {
            "syndication": [
                "https://twitter.com/example_com/123456",
                "https://twitter.com/example_com/7891011",
                "https://www.facebook.com/example.com/123456",
                "https://www.facebook.com/example.com/7891011"
            ],
        }
    }
    result = mf2util.interpret_feed(parsed, 'http://example.com')
    assert result['entries'][0]['syndication'] == ["https://twitter.com/example_com/123456", "https://www.facebook.com/example.com/123456"]
    assert result['entries'][1]['syndication'] == ["https://twitter.com/example_com/7891011", "https://www.facebook.com/example.com/7891011"]
Example #13
def convert_mf2util():
    def dates_to_string(json):
        if isinstance(json, dict):
            return {k: dates_to_string(v) for (k, v) in json.items()}
        if isinstance(json, list):
            return [dates_to_string(v) for v in json]
        if isinstance(json, datetime.date) or isinstance(json, datetime.datetime):
            return json.isoformat()
        return json

    url = request.args.get('url')
    if url:
        d = mf2py.Parser(url=url).to_dict()
        if mf2util.find_first_entry(d, ['h-feed']):
            json = mf2util.interpret_feed(d, url)
        else:
            json = mf2util.interpret(d, url)
        return jsonify(dates_to_string(json))
    return """
Example #14
def process_html_feed_for_new_entries(feed, content, backfill, now, fetch_mf2_func):
    # strip noscript tags before parsing, since we definitely aren't
    # going to preserve js
    content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)

    # look for a <base> element
    doc = bs4.BeautifulSoup(content, 'html5lib')
    base_el = doc.find('base')
    base_href = base_el.get('href') if base_el else None

    parsed = mf2util.interpret_feed(
        mf2py.parse(doc, feed.feed),
        source_url=feed.feed, base_href=base_href,
        fetch_mf2_func=fetch_mf2_func)
    hfeed = parsed.get('entries', [])

    for hentry in hfeed:
        current_app.logger.debug('building entry: %s', hentry.get('url'))
        entry = hentry_to_entry(hentry, feed, backfill, now)
        if entry:
            current_app.logger.debug('built entry: %s', entry.permalink)
            yield entry
Example #15
def add_subscription(origin, feed_url, type, tags=None):
    feed = Feed.query.filter_by(feed=feed_url, type=type).first()

    if not feed:
        name = None
        if type == 'html':
            flask.current_app.logger.debug('mf2py parsing %s', feed_url)
            resp = util.requests_get(feed_url)
            feed_text = resp.text if 'charset' in resp.headers.get(
                'content-type', '') else resp.content
            parsed = mf2util.interpret_feed(
                mf2py.parse(doc=feed_text, url=feed_url), feed_url)
            name = parsed.get('name')
        elif type == 'xml':
            flask.current_app.logger.debug('feedparser parsing %s', feed_url)
            parsed = feedparser.parse(feed_url, agent=util.USER_AGENT)
            if parsed.feed:
                name = parsed.feed.get('title')
        else:
            flask.current_app.logger.error('unknown feed type %s', type)
            flask.abort(400)

        if not name:
            p = urllib.parse.urlparse(origin)
            name = p.netloc + p.path
        feed = Feed(name=name[:140], origin=origin, feed=feed_url, type=type)

    if feed:
        db.session.add(feed)

        flask_login.current_user.subscriptions.append(
            Subscription(feed=feed, name=feed.name, tags=tags))

        db.session.commit()
        # go ahead and update the feed
        tasks.q.enqueue(tasks.update_feed, feed.id)
    return feed
Example #16
def html_to_atom(html, url=None, fetch_author=False):
  """Converts microformats2 HTML to an Atom feed.

  Args:
    html: string
    url: string URL html came from, optional
    fetch_author: boolean, whether to make HTTP request to fetch rel-author link

  Returns:
    unicode string with Atom XML
  """
  if fetch_author:
    assert url, 'fetch_author=True requires url!'

  parsed = mf2py.parse(doc=html, url=url)
  actor = microformats2.find_author(
    parsed, fetch_mf2_func=lambda url: mf2py.parse(url=url))

  return activities_to_atom(
    microformats2.html_to_activities(html, url, actor),
    actor,
    title=mf2util.interpret_feed(parsed, url).get('name'),
    xml_base=util.base_url(url),
    host_url=url)
Example #17
  def get(self):
    expected_inputs = ('activitystreams', 'html', 'json-mf2')
    input = util.get_required_param(self, 'input')
    if input not in expected_inputs:
      raise exc.HTTPBadRequest('Invalid input: %s, expected one of %r' %
                               (input, expected_inputs))

    # fetch url
    url = util.get_required_param(self, 'url')
    resp = util.urlopen(url)
    if url != resp.geturl():
      url = resp.geturl()
      logging.info('Redirected to %s', url)
    body = resp.read()

    # decode data
    mf2 = None
    if input == 'activitystreams':
      activities = json.loads(body)
    elif input == 'html':
      activities = microformats2.html_to_activities(body, url)
      mf2 = mf2py.parse(doc=body, url=url)
    elif input == 'json-mf2':
      mf2 = json.loads(body)
      mf2['rels'] = {}  # mf2util expects rels
      activities = [microformats2.json_to_object(item)
                    for item in mf2.get('items', [])]

    author = None
    title = None
    if mf2:
      author = microformats2.find_author(mf2)
      title = mf2util.interpret_feed(mf2, url).get('name')

    self.write_response(source.Source.make_activities_base_response(activities),
                        url=url, actor=author, title=title)
Example #18
def convert_mf2util():
    def dates_to_string(json):
        if isinstance(json, dict):
            return {k: dates_to_string(v) for (k, v) in json.items()}
        if isinstance(json, list):
            return [dates_to_string(v) for v in json]
        if isinstance(json, datetime.date) or isinstance(json, datetime.datetime):
            return json.isoformat()
        return json

    url = request.args.get('url')
    as_feed = request.args.get('as-feed')
    if url:
        try:
            d = mf2py.parse(url=url)
            if as_feed == 'true' or mf2util.find_first_entry(d, ['h-feed']):
                json = mf2util.interpret_feed(d, url)
            else:
                json = mf2util.interpret(d, url)
            return jsonify(dates_to_string(json))
        except:
            return jsonify({'error': str(sys.exc_info()[0])})

    return """
Example #19
  def get(self):
    expected_inputs = ('activitystreams', 'html', 'json-mf2')
    input = util.get_required_param(self, 'input')
    if input not in expected_inputs:
      raise exc.HTTPBadRequest('Invalid input: %s, expected one of %r' %
                               (input, expected_inputs))
    url = util.get_required_param(self, 'url')

    # check if request is cached
    cache = self.request.get('cache', '').lower() != 'false'
    cache_key = 'U %s' % url
    cached = memcache.get(cache_key) if cache else None

    if cached:
      logging.info('Serving cached response %r', cache_key)
      url = cached['url']
      body = cached['body']
    else:
      # fetch url
      try:
        resp = util.urlopen(url)
      except (ValueError, httplib.InvalidURL) as e:
        self.abort(400, str(e))
      except Exception as e:
        if util.is_connection_failure(e):
          # HTTP 504 Gateway Timeout
          self.abort(504, str(e))
        raise

      if url != resp.geturl():
        url = resp.geturl()
        logging.info('Redirected to %s', url)
      body = resp.read()

      if cache:
        logging.info('Caching response in %r', cache_key)
        memcache.set(cache_key, {'url': url, 'body': body}, URL_CACHE_TIME)

    # decode data
    mf2 = None
    if input == 'html':
      mf2 = mf2py.parse(doc=body, url=url)
    elif input == 'json-mf2':
      mf2 = json.loads(body)
      mf2.setdefault('rels', {})  # mf2util expects rels

    actor = None
    title = None
    if mf2:
      actor = microformats2.find_author(
        mf2, fetch_mf2_func=lambda url: mf2py.parse(url=url))
      title = mf2util.interpret_feed(mf2, url).get('name')

    if input == 'activitystreams':
      activities = json.loads(body)
    elif input == 'html':
      activities = microformats2.html_to_activities(body, url, actor)
    elif input == 'json-mf2':
      activities = [microformats2.json_to_object(item, actor=actor)
                    for item in mf2.get('items', [])]

    self.write_response(source.Source.make_activities_base_response(activities),
                        url=url, actor=actor, title=title)
Example #20
def find_possible_feeds(origin):
    # scrape an origin source to find possible alternative feeds
    try:
        resp = util.requests_get(origin)
    except requests.exceptions.RequestException as e:
        flask.flash('Error fetching source {}'.format(repr(e)))
        flask.current_app.logger.warn(
            'Subscribe failed for %s with error %s', origin, repr(e))
        return None

    feeds = []

    xml_feed_types = [
        'application/rss+xml',
        'application/atom+xml',
        'application/rdf+xml',
        'application/xml',
        'text/xml',
    ]
    xml_mime_types = xml_feed_types + [
        'text/xml',
        'text/rss+xml',
        'text/atom+xml',
    ]
    html_feed_types = [
        'text/html',
        'application/xhtml+xml',
    ]

    content_type = resp.headers['content-type']
    content_type = content_type.split(';', 1)[0].strip()
    if content_type in xml_mime_types:
        feeds.append({
            'origin': origin,
            'feed': origin,
            'type': 'xml',
            'title': 'untitled xml feed',
        })

    elif content_type in html_feed_types:
        parsed = mf2py.parse(doc=resp.text, url=origin)
        # if text/html, then parse and look for h-entries
        hfeed = mf2util.interpret_feed(parsed, origin)
        if hfeed.get('entries'):
            ftitle = hfeed.get('name') or 'untitled h-feed'
            feeds.append({
                'origin': origin,
                'feed': resp.url,
                'type': 'html',
                'title': ftitle[:140]
            })

        # look for link="feed"
        for furl in parsed.get('rels', {}).get('feed', []):
            fprops = parsed.get('rel-urls', {}).get(furl, {})
            if not fprops.get('type') or fprops.get('type') in html_feed_types:
                feeds.append({
                    'origin': origin,
                    'feed': furl,
                    'type': 'html',
                    'title': fprops.get('title'),
                })

        # then look for link rel="alternate"
        for link in parsed.get('alternates', []):
            if link.get('type') in xml_feed_types:
                feeds.append({
                    'origin': origin,
                    'feed': link.get('url'),
                    'type': 'xml',
                    'title': link.get('title'),
                })

    return feeds
Example #21
def find_possible_feeds(origin):
    # scrape an origin source to find possible alternative feeds
    try:
        resp = util.requests_get(origin)
    except requests.exceptions.RequestException as e:
        flask.flash('Error fetching source {}'.format(repr(e)))
        flask.current_app.logger.warn('Subscribe failed for %s with error %s',
                                      origin, repr(e))
        return None

    feeds = []

    xml_feed_types = [
        'application/rss+xml',
        'application/atom+xml',
        'application/rdf+xml',
        'application/xml',
        'text/xml',
    ]
    xml_mime_types = xml_feed_types + [
        'text/xml',
        'text/rss+xml',
        'text/atom+xml',
    ]
    html_feed_types = [
        'text/html',
        'application/xhtml+xml',
    ]

    content_type = resp.headers['content-type']
    content_type = content_type.split(';', 1)[0].strip()
    if content_type in xml_mime_types:
        feeds.append({
            'origin': origin,
            'feed': origin,
            'type': 'xml',
            'title': 'untitled xml feed',
        })

    elif content_type in html_feed_types:
        parsed = mf2py.parse(doc=resp.text, url=origin)
        # if text/html, then parse and look for h-entries
        hfeed = mf2util.interpret_feed(parsed, origin)
        if hfeed.get('entries'):
            ftitle = hfeed.get('name') or 'untitled h-feed'
            feeds.append({
                'origin': origin,
                'feed': resp.url,
                'type': 'html',
                'title': ftitle[:140]
            })

        # look for link="feed"
        for furl in parsed.get('rels', {}).get('feed', []):
            fprops = parsed.get('rel-urls', {}).get(furl, {})
            if not fprops.get('type') or fprops.get('type') in html_feed_types:
                feeds.append({
                    'origin': origin,
                    'feed': furl,
                    'type': 'html',
                    'title': fprops.get('title'),
                })

        # then look for link rel="alternate"
        for link in parsed.get('alternates', []):
            if link.get('type') in xml_feed_types:
                feeds.append({
                    'origin': origin,
                    'feed': link.get('url'),
                    'type': 'xml',
                    'title': link.get('title'),
                })

    return feeds