Example #1
def html_to_atom(html, url=None, fetch_author=False):
    """Converts microformats2 HTML to an Atom feed.

  Args:
    html: string
    url: string URL html came from, optional
    fetch_author: boolean, whether to make HTTP request to fetch rel-author link

  Returns:
    unicode string with Atom XML
  """
    if fetch_author:
        assert url, 'fetch_author=True requires url!'

    parsed = mf2py.parse(doc=html, url=url)
    actor = microformats2.find_author(
        parsed, fetch_mf2_func=lambda url: mf2py.parse(url=url))

    return activities_to_atom(microformats2.html_to_activities(
        html, url, actor),
                              actor,
                              title=mf2util.interpret_feed(parsed,
                                                           url).get('name'),
                              xml_base=util.base_url(url),
                              host_url=url)
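A minimal usage sketch (hypothetical HTML and URL; assumes the source file's module-level imports such as mf2py, mf2util, microformats2, util, and activities_to_atom are in scope):

html = """
<div class="h-entry">
  <a class="u-url p-name" href="http://example.com/post">Hello world</a>
</div>
"""
atom_xml = html_to_atom(html, url='http://example.com/post')
print(atom_xml)  # unicode string containing the Atom feed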
Example #2
def publish(source, target, endpoint, **kwargs):
    data = kwargs.get('data', {})
    data['_id'] = slugify(u'mention-{0}'.format(source))

    verified = data['verified'].get('state', False)

    if isinstance(target, list):
        real_target = target[-1]['url']
    else:
        real_target = data.pop('real_target', target)
    post_id = post_id_from_url(real_target)

    if verified:
        content = kwargs.get('body', None)
        if content is not None:
            mfdata = mf2py.parse(doc=content, html_parser="html5lib")
            #mentions = mention_from_doc(content)
        else:
            mfdata = mf2py.parse(url=source, html_parser="html5lib")
            #mentions = mention_from_url(source)

        # sanitize and attach the parsed mf2 only for verified mentions;
        # mfdata is undefined otherwise
        mfdata['items'] = [bleachify(item) for item in mfdata['items']]
        data['data'] = mfdata

    data.update({'post_id': post_id, 'type': 'mention', 'format': 'mf2py'})

    res = update_record(endpoint.format(data['_id']), data)
    return res
Example #3
  def get(self):
    expected_inputs = ('activitystreams', 'html', 'json-mf2')
    input = util.get_required_param(self, 'input')
    if input not in expected_inputs:
      raise exc.HTTPBadRequest('Invalid input: %s, expected one of %r' %
                               (input, expected_inputs))
    url = util.get_required_param(self, 'url')

    # check if request is cached
    cache = self.request.get('cache', '').lower() != 'false'
    cache_key = 'U %s' % url
    cached = memcache.get(cache_key) if cache else None

    if cached:
      logging.info('Serving cached response %r', cache_key)
      url = cached['url']
      body = cached['body']
    else:
      # fetch url
      try:
        resp = util.urlopen(url)
      except (ValueError, httplib.InvalidURL) as e:
        self.abort(400, str(e))
        # other exceptions are handled by webutil.handlers.handle_exception(),
        # which uses interpret_http_exception(), etc.

      if url != resp.geturl():
        url = resp.geturl()
        logging.info('Redirected to %s', url)
      body = resp.read()

      if cache:
        logging.info('Caching response in %r', cache_key)
        memcache.set(cache_key, {'url': url, 'body': body}, URL_CACHE_TIME)

    # decode data
    mf2 = None
    if input == 'html':
      mf2 = mf2py.parse(doc=body, url=url)
    elif input == 'json-mf2':
      mf2 = json.loads(body)
      mf2.setdefault('rels', {})  # mf2util expects rels

    actor = None
    title = None
    if mf2:
      actor = microformats2.find_author(
        mf2, fetch_mf2_func=lambda url: mf2py.parse(url=url))
      title = mf2util.interpret_feed(mf2, url).get('name')

    if input == 'activitystreams':
      activities = json.loads(body)
    elif input == 'html':
      activities = microformats2.html_to_activities(body, url, actor)
    elif input == 'json-mf2':
      activities = [microformats2.json_to_object(item, actor=actor)
                    for item in mf2.get('items', [])]

    self.write_response(source.Source.make_activities_base_response(activities),
                        url=url, actor=actor, title=title)
Example #4
def html_to_atom(html, url=None, fetch_author=False, reader=True):
    """Converts microformats2 HTML to an Atom feed.

  Args:
    html: string
    url: string URL html came from, optional
    fetch_author: boolean, whether to make HTTP request to fetch rel-author link
    reader: boolean, whether the output will be rendered in a feed reader.
      Currently just includes location if True, not otherwise.

  Returns:
    unicode string with Atom XML
  """
    if fetch_author:
        assert url, 'fetch_author=True requires url!'

    parsed = mf2py.parse(doc=html, url=url)
    actor = microformats2.find_author(
        parsed, fetch_mf2_func=lambda url: mf2py.parse(url=url))

    return activities_to_atom(microformats2.html_to_activities(
        html, url, actor),
                              actor,
                              title=microformats2.get_title(parsed),
                              xml_base=util.base_url(url),
                              host_url=url,
                              reader=reader)
Example #5
def html_to_atom(html, url=None, fetch_author=False, reader=True):
  """Converts microformats2 HTML to an Atom feed.

  Args:
    html: string
    url: string URL html came from, optional
    fetch_author: boolean, whether to make HTTP request to fetch rel-author link
    reader: boolean, whether the output will be rendered in a feed reader.
      Currently just includes location if True, not otherwise.

  Returns:
    unicode string with Atom XML
  """
  if fetch_author:
    assert url, 'fetch_author=True requires url!'

  parsed = mf2py.parse(doc=html, url=url, img_with_alt=True)
  actor = microformats2.find_author(
    parsed, fetch_mf2_func=lambda url: mf2py.parse(url=url, img_with_alt=True))

  return activities_to_atom(
    microformats2.html_to_activities(html, url, actor),
    actor,
    title=microformats2.get_title(parsed),
    xml_base=util.base_url(url),
    host_url=url,
    reader=reader)
Example #6
def get_access_token(u, scopes):
    """Initiate an IndieAuth Authorization flow to get an acess token (for talking to the Miropub endpoint)."""
    # Guess the identity from the URL
    me = urllib.parse.urlparse(u)._replace(path="/").geturl()

    # Fetch the 3 endpoints needed:
    # TODO(tsileo): clean error if missing
    dat = mf2py.parse(url=u)
    auth_endpoint = dat["rels"]["authorization_endpoint"][0]
    tok_endpoint = dat["rels"]["token_endpoint"][0]
    micropub_endpoint = dat["rels"]["micropub"][0]

    # Generate a random state
    state = binascii.hexlify(os.urandom(6)).decode()

    # Actually initiate the Authorization flow
    auth_url = (auth_endpoint + "?" +
                urllib.parse.urlencode({
                    "me": me,
                    "response_type": "code",
                    "state": state,
                    "redirect_uri": REDIRECT_URI,
                    "scope": " ".join(scopes),
                    "client_id": CLIENT_ID,
                }))

    # Open the URL in a tab
    webbrowser.open_new_tab(auth_url)

    click.echo("waiting for the IndieAuth callback...")
    tok = _wait_for_access_token(me, tok_endpoint)
    click.echo("success")

    # And wait for the callback via the redirect_uri
    return (me, micropub_endpoint, tok)
Example #7
    def do_whois(self, url):
        parsed = mf2py.parse(url=url)

        props = []
        for rel in 'authorization_endpoint', 'token_endpoint', 'micropub':
            for val in parsed['rels'].get(rel, []):
                props.append((rel, val))

        # top-level h-card first, then top-level h-* with .author
        hcard = None
        for item in parsed['items']:
            if 'h-card' in item['type']:
                hcard = item
                break

        if not hcard:
            for item in parsed['items']:
                if 'author' in item['properties']:
                    hcard = item['properties']['author'][0]
                    break
        if hcard:
            if isinstance(hcard, dict):
                for prop in 'name', 'photo', 'url':
                    for val in hcard['properties'].get(prop, []):
                        props.append((prop, val))
            else:
                props.append(('name', hcard))

        return ("Here's everything I could find about %s\n  " % url) + '\n  '.join(
            "%s: %s" % (k, v) for k, v in props)
Example #8
    def post(self):
        logging.info('(Params: %s )', self.request.params.items())

        # fetch source page
        source = util.get_required_param(self, 'source')
        source_resp = common.requests_get(source)
        self.source_url = source_resp.url or source
        self.source_domain = urlparse.urlparse(self.source_url).netloc.split(':')[0]
        self.source_mf2 = mf2py.parse(source_resp.text, url=self.source_url, img_with_alt=True)
        # logging.debug('Parsed mf2 for %s: %s', source_resp.url, json.dumps(self.source_mf2, indent=2))

        # check for backlink to bridgy fed (for webmention spec and to confirm
        # source's intent to federate to mastodon)
        if (self.request.host_url not in source_resp.text and
            urllib.quote(self.request.host_url, safe='') not in source_resp.text):
            common.error(self, "Couldn't find link to %s" % self.request.host_url)

        # convert source page to ActivityStreams
        entry = mf2util.find_first_entry(self.source_mf2, ['h-entry'])
        if not entry:
            common.error(self, 'No microformats2 found on %s' % self.source_url)

        logging.info('First entry: %s', json.dumps(entry, indent=2))
        # make sure it has url, since we use that for AS2 id, which is required
        # for ActivityPub.
        props = entry.setdefault('properties', {})
        if not props.get('url'):
            props['url'] = [self.source_url]

        self.source_obj = microformats2.json_to_object(entry, fetch_mf2=True)
        logging.info('Converted to AS1: %s', json.dumps(self.source_obj, indent=2))

        self.try_activitypub() or self.try_salmon()
Example #9
    def __init__(self, request: utils.RequestResult):
        """ Given a request object and retrieved text, parse out the feed """
        text = request.text
        md5 = hashlib.md5(text.encode('utf-8'))
        self.digest = md5.digest()

        self.url = str(request.url)
        self.caching = caching.make_headers(request.headers)

        self.feed = feedparser.parse(text)
        if 'bozo_exception' in self.feed:
            # feedparser couldn't handle this, so maybe it's mf2
            self.mf2 = mf2py.parse(text)
        else:
            self.mf2 = None

        self.status = request.status
        self.links: typing.DefaultDict[
            str, typing.Set[str]] = collections.defaultdict(set)

        try:
            for link in self.feed.feed.links:
                # conveniently this also contains the rel links from HTML
                # documents, so no need to handle the mf2 version (if any)
                href = link.get('href')
                rel = link.get('rel')

                if rel and href:
                    self.links[rel].add(href)
        except (AttributeError, KeyError):
            pass

        self.schema = SCHEMA_VERSION
Example #10
def parse_with_mf2py(url):
    result = mf2py.parse(url=url)

    if not result:
        return None

    if len(result.get('items', [])) == 0:
        return None

    item = result['items'][0]

    if not item['properties'].get('name'):
        return None

    if not item['properties'].get('content'):
        return None

    mf2 = {
        'type': ['h-entry'],
        'properties': {
            'name': item['properties']['name'],
            'content': item['properties']['content'],
            'syndication': [url],
            'url': [url]
        }
    }

    if item['properties'].get('author'):
        mf2['properties']['author'] = item['properties']['author']
    if item['properties'].get('published'):
        mf2['properties']['published'] = item['properties']['published']

    return mf2
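For illustration, calling this against a page whose first top-level item is a named h-entry returns a trimmed h-entry dict (the URL here is hypothetical):

result = parse_with_mf2py('http://example.com/post')
if result:
    # result is shaped roughly like:
    # {'type': ['h-entry'],
    #  'properties': {'name': [...], 'content': [...],
    #                 'syndication': ['http://example.com/post'],
    #                 'url': ['http://example.com/post']}}
    print(result['properties']['name'][0])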
Example #11
def convert_mf2():
    strip_rel_urls = request.args.get('strip_rel_urls') or request.form.get('strip_rel_urls')
    url = request.args.get('url') or request.form.get('url')
    doc = request.args.get('doc') or request.form.get('doc')
    doc = doc and doc.strip()

    if url and not doc:
        parsed = urllib.parse.urlparse(url)
        if parsed.fragment:
            r = requests.get(url)
            r.raise_for_status()
            doc = BeautifulSoup(
                r.text if 'charset' in r.headers.get('content-type', '') 
                else r.content)
            doc = doc.find(id=parsed.fragment)

    if url or doc:
        try:
            json = mf2py.parse(url=url, doc=doc)
            if strip_rel_urls:
                json.pop('rel-urls', None)
            return jsonify(json)
        except:
            return jsonify({'error': str(sys.exc_info()[0])})

    return """
Example #12
def build_user_json(me, resp=None):
  """user_json contains an h-card, rel-me links, and "me"

  Args:
    me: string, URL of the user, returned by
    resp: requests.Response (optional), re-use response if it's already been fetched

  Returns:
    dict, with 'me', the URL for this person; 'h-card', the representative h-card
      for this page; 'rel-me', a list of rel-me URLs found at this page
  """
  user_json = {'me': me}

  resp = resp or util.requests_get(me)
  if resp.status_code // 100 != 2:
    logging.warning(
      'could not fetch user url "%s". got response code: %d',
      me, resp.status_code)
    return user_json
  # Requests doesn't look at the HTML body to find <meta charset>
  # tags, so if the character encoding isn't given in a header, then
  # we pass on the raw bytes and let BS4 deal with it.
  p = mf2py.parse(doc=resp.text
                  if 'charset' in resp.headers.get('content-type', '')
                  else resp.content, url=me)
  user_json['rel-me'] = p.get('rels', {}).get('me')
  user_json['h-card'] = mf2util.representative_hcard(p, me)
  logging.debug('built user-json %r', user_json)
  return util.trim_nulls(user_json)
Example #13
def fetch_reply_contexts(reply_pairs, now, fetch_mf2_func):
    old_contexts = {}
    in_reply_tos = [url for _, url in reply_pairs]
    if in_reply_tos:
        for entry in (Entry.query
                      .join(Entry.feed)
                      .filter(Entry.permalink.in_(in_reply_tos),
                              Feed.type == 'html')):
            old_contexts[entry.permalink] = entry

    for entry, in_reply_to in reply_pairs:
        context = old_contexts.get(in_reply_to)
        if not context:
            current_app.logger.info('fetching in-reply-to: %s', in_reply_to)
            try:
                proxied_reply_url = proxy_url(in_reply_to)
                parsed = mf2util.interpret(
                    mf2py.parse(url=proxied_reply_url), in_reply_to,
                    fetch_mf2_func=fetch_mf2_func)
                if parsed:
                    context = hentry_to_entry(parsed, None, False, now)
            except requests.exceptions.RequestException as err:
                current_app.logger.warn(
                    '%s fetching reply context: %s for entry: %s',
                    type(err).__name__, proxied_reply_url, entry.permalink)

        if context:
            db.session.add(context)
            entry.reply_context.append(context)
Example #14
def posse_post_discovery(original, regex):
    """Given an original URL and a permalink regex, looks for
    silo-specific syndication URLs. If the original is a silo url,
    that url is returned; otherwise we fetch the source and attempt to
    look for u-syndication URLs.
    """
    if not hasattr(regex, 'match'):
        regex = re.compile(regex)

    if regex.match(original):
        return original

    try:
        d = mf2py.parse(url=original)
        urls = d['rels'].get('syndication', [])
        for item in d['items']:
            if 'h-entry' in item['type']:
                urls += item['properties'].get('syndication', [])
        for url in urls:
            if regex.match(url):
                return url
    except HTTPError:
        current_app.logger.exception('Could not fetch original')
    except SSLError:
        current_app.logger.exception('SSL Error')
    except Exception as e:
        current_app.logger.exception('MF2 Parser error: %s', e)
Example #16
def add_subscription(origin, feed_url, type, tags=None):
    feed = Feed.query.filter_by(feed=feed_url, type=type).first()

    if not feed:
        name = None
        if type == "html":
            flask.current_app.logger.debug("mf2py parsing %s", feed_url)
            resp = util.requests_get(feed_url)
            feed_text = resp.text if "charset" in resp.headers.get("content-type", "") else resp.content
            parsed = mf2util.interpret_feed(mf2py.parse(doc=feed_text, url=feed_url), feed_url)
            name = parsed.get("name")
        elif type == "xml":
            flask.current_app.logger.debug("feedparser parsing %s", feed_url)
            parsed = feedparser.parse(feed_url, agent=util.USER_AGENT)
            if parsed.feed:
                name = parsed.feed.get("title")
        else:
            flask.current_app.logger.error("unknown feed type %s", type)
            flask.abort(400)

        if not name:
            p = urllib.parse.urlparse(origin)
            name = p.netloc + p.path
        feed = Feed(name=name[:140], origin=origin, feed=feed_url, type=type)

    if feed:
        db.session.add(feed)

        flask_login.current_user.subscriptions.append(Subscription(feed=feed, name=feed.name, tags=tags))

        db.session.commit()
        # go ahead and update the feed
        tasks.q.enqueue(tasks.update_feed, feed.id)
    return feed
Example #17
    def get(self, domain):
        url = 'http://%s/' % domain
        resp = common.requests_get(url)
        mf2 = mf2py.parse(resp.text, url=resp.url, img_with_alt=True)
        # logging.info('Parsed mf2 for %s: %s', resp.url, json.dumps(mf2, indent=2))

        hcard = mf2util.representative_hcard(mf2, resp.url)
        logging.info('Representative h-card: %s', json.dumps(hcard, indent=2))
        if not hcard:
            common.error(
                self, """\
Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s"""
                % resp.url)

        key = MagicKey.get_or_create(domain)
        obj = common.postprocess_as2(as2.from_as1(
            microformats2.json_to_object(hcard)),
                                     key=key)
        obj.update({
            'inbox':
            '%s/%s/inbox' % (appengine_config.HOST_URL, domain),
            'outbox':
            '%s/%s/outbox' % (appengine_config.HOST_URL, domain),
            'following':
            '%s/%s/following' % (appengine_config.HOST_URL, domain),
            'followers':
            '%s/%s/followers' % (appengine_config.HOST_URL, domain),
        })
        logging.info('Returning: %s', json.dumps(obj, indent=2))

        self.response.headers.update({
            'Content-Type': common.CONTENT_TYPE_AS2,
            'Access-Control-Allow-Origin': '*',
        })
        self.response.write(json.dumps(obj, indent=2))
Example #18
def convert_mf2util():
    def dates_to_string(json):
        if isinstance(json, dict):
            return {k: dates_to_string(v) for (k, v) in json.items()}
        if isinstance(json, list):
            return [dates_to_string(v) for v in json]
        if isinstance(json, datetime.date) or isinstance(json, datetime.datetime):
            return json.isoformat()
        return json

    url = request.args.get('url')
    as_feed = request.args.get('as-feed')
    op = request.args.get('op')
    if url:
        try:
            d = mf2py.parse(url=url)
            if op == 'post-type-discovery':
                entry = mf2util.find_first_entry(d, ['h-entry', 'h-event'])
                return jsonify({'type': mf2util.post_type_discovery(entry)})
                
            if as_feed == 'true' or mf2util.find_first_entry(d, ['h-feed']):
                json = mf2util.interpret_feed(d, url)
            else:
                json = mf2util.interpret(d, url)
            return jsonify(dates_to_string(json))
        except:
            current_app.logger.exception('running mf2util service')
            return jsonify({'error': str(sys.exc_info()[0])})

    return """
Example #19
    def fetch_mf2(url):
        testname = url
        prefix = 'http://example.com/'
        if testname.startswith(prefix):
            testname = testname[len(prefix):]

        with open('tests/authorship/' + testname) as f:
            return mf2py.parse(url=url, doc=f.read())
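The stub resolves fake example.com URLs to local fixture files while still passing the original URL to mf2py, so relative URLs in the fixtures resolve as they would on the web. A sketch with a hypothetical fixture name:

# reads tests/authorship/h_entry.html but parses it as if it had been
# fetched from http://example.com/h_entry.html
parsed = fetch_mf2('http://example.com/h_entry.html')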
Example #20
def mf2py_parse(input, url):
    """Uses mf2py to parse an input HTML string or BeautifulSoup input."""
    if isinstance(input, basestring):
        input = beautifulsoup_parse(input)

    # instrumenting, disabled for now:
    # with cache_time('mf2py', 1):
    return mf2py.parse(url=url, doc=input)
Example #22
def mf2py_parse(input, url):
  """Uses mf2py to parse an input HTML string or BeautifulSoup input."""
  if isinstance(input, basestring):
    input = beautifulsoup_parse(input)

  # instrumenting, disabled for now:
  # with cache_time('mf2py', 1):
  return mf2py.parse(url=url, doc=input, img_with_alt=True)
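With img_with_alt=True, mf2py returns an image property value as a {'value': ..., 'alt': ...} dict when the <img> has alt text, and a plain URL string otherwise, so downstream code has to handle both shapes. A small sketch using mf2py directly (hypothetical input):

html = '<div class="h-card"><img class="u-photo" src="/me.jpg" alt="me"></div>'
parsed = mf2py.parse(doc=html, url='http://example.com/', img_with_alt=True)
photo = parsed['items'][0]['properties']['photo'][0]
# dict when alt text is present, bare URL string otherwise
photo_url = photo.get('value') if isinstance(photo, dict) else photo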
Example #23
def login_callback():
    current_app.logger.debug('callback fields: %s', request.args)

    state = request.args.get('state')
    next_url = state or url_for('views.index')
    # TODO rediscover these endpoints based on 'me'. Assuming
    # they are the same is not totally safe.
    auth_url, token_url, micropub_url = session['endpoints']

    if not auth_url:
        flash('Login failed: No authorization URL in session')
        return redirect(next_url)

    code = request.args.get('code')
    client_id = get_settings().site_url
    redirect_uri = url_for('.login_callback', _external=True)

    current_app.logger.debug('callback with auth endpoint %s', auth_url)
    response = requests.post(auth_url, data={
        'code': code,
        'client_id': client_id,
        'redirect_uri': redirect_uri,
        'state': state,
    })

    rdata = urllib.parse.parse_qs(response.text)
    if response.status_code != 200:
        current_app.logger.debug('call to auth endpoint failed %s', response)
        flash('Login failed {}: {}'.format(rdata.get('error'),
                                           rdata.get('error_description')))
        return redirect(next_url)

    current_app.logger.debug('verify response %s', response.text)
    if 'me' not in rdata:
        current_app.logger.debug('Verify response missing required "me" field')
        flash('Verify response missing required "me" field {}'.format(
            response.text))
        return redirect(next_url)

    me = rdata.get('me')[0]
    scopes = rdata.get('scope')

    try_micropub_config(token_url, micropub_url, scopes, code, me,
                        redirect_uri, client_id, state)

    cred = Credential.query.get(('indieauth', me))
    if not cred:
        cred = Credential(type='indieauth', value=me, display=me)
        db.session.add(cred)
        db.session.commit()

    # offer to associate credential with existing user or create a new user
    p = mf2py.parse(url=me)
    hcard = mf2util.representative_hcard(p, me)
    author = hcard and mf2util.parse_author(hcard)

    return do_login(cred, author and author.get('name'), next_url)
Example #24
def test_posts_by_type(client, silly_posts):
    text = client.get('/likes/').get_data(as_text=True)
    p = mf2py.parse(doc=text)
    feed = p['items'][0]['children']

    for item, expected in zip(feed, [
            'https://buf.fy/summers/',
            'https://mal.colm/reynolds',
    ]):
        assert item['properties']['like-of'][0]['properties']['url'][0] == expected
Example #25
def find_screen_name(url):
    try:
        print('fetching', url)
        r = requests.get(url, timeout=10)
        p = mf2py.parse(url=url)
        for me in p.get('rels', {}).get('me', []):
            m = re.match(r'https?://(?:www.)?twitter.com/@?([\w]+)/?', me)
            if m:
                return m.group(1)
    except:
        logging.error('problem fetching %s', url)
Example #27
  def test_find_author(self):
    self.assert_equals({
    'displayName': 'my name',
    'url': 'http://li/nk',
    'image': {'url': 'http://pic/ture'},
  }, microformats2.find_author(mf2py.parse(doc="""\
<body class="p-author h-card">
<a href="http://li/nk">my name</a>
<img class="u-photo" src="http://pic/ture" />
<div class="h-entry"></div>
</body>
""", url='http://123')))
Example #28
def _get_mention_info_from_mf2(base_url, bs_html):
    import mf2py
    from urllib.parse import urljoin

    mf2 = mf2py.parse(bs_html)
    mf2_items = mf2.get('items')
    if not mf2_items:
        return None

    hentry = next(filter(
        lambda i: 'h-entry' in i['type'],
        mf2_items), None)
    if not hentry:
        return None

    info = {}
    hentry_props = hentry['properties']

    pnames = hentry_props.get('name')
    if pnames:
        info['name'] = pnames[0]

    urls = hentry_props.get('url')
    if urls:
        info['url'] = urljoin(base_url, urls[0])

    pubdates = hentry_props.get('published')
    if pubdates:
        info['published'] = pubdates[0]

    contents = hentry_props.get('content')
    if contents:
        info['content'] = contents[0]['html']

    authors = hentry_props.get('author')
    if authors:
        hcard = next(filter(
            lambda i: 'h-card' in i['type'],
            authors), None)
        if hcard:
            hcard_props = hcard['properties']
            hcard_names = hcard_props.get('name')
            if hcard_names:
                info['author_name'] = hcard_names[0]
            hcard_photos = hcard_props.get('photo')
            if hcard_photos:
                info['author_photo'] = urljoin(base_url, hcard_photos[0])
            hcard_urls = hcard_props.get('url')
            if hcard_urls:
                info['author_url'] = urljoin(base_url, hcard_urls[0])

    return info
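A self-contained usage sketch, with inline HTML standing in for a fetched page and a hypothetical base URL:

html = '''
<article class="h-entry">
  <h1 class="p-name"><a class="u-url" href="/reply">A reply</a></h1>
  <div class="e-content">Nice <b>post</b>!</div>
  <a class="p-author h-card" href="/me">Jane Doe</a>
</article>
'''
info = _get_mention_info_from_mf2('http://example.com/', html)
# info comes back roughly as:
# {'name': 'A reply', 'url': 'http://example.com/reply',
#  'content': 'Nice <b>post</b>!', 'author_name': 'Jane Doe',
#  'author_url': 'http://example.com/me'}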
Example #29
def get_client_id_data(url):
    # FIXME(tsileo): ensure not localhost via `little_boxes.urlutils.is_url_valid`
    data = mf2py.parse(url=url)
    for item in data["items"]:
        if "h-x-app" in item["type"] or "h-app" in item["type"]:
            props = item.get("properties", {})
            print(props)
            return dict(
                logo=_get_prop(props, "logo"),
                name=_get_prop(props, "name"),
                url=_get_prop(props, "url"),
            )
    return dict(logo=None, name=url, url=url)
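The _get_prop helper isn't shown; a plausible minimal version (an assumption, not the original code) just unwraps the first value of an mf2 property list:

def _get_prop(props, name, default=None):
    # assumed behavior: return the first value of the mf2 property, if any
    values = props.get(name)
    return values[0] if values else default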
Example #30
 def fetch_mf2_func(url):
     if util.domain_or_parent_in(
             urlparse.urlparse(url).netloc, SILO_DOMAINS):
         return {
             'items': [{
                 'type': ['h-card'],
                 'properties': {
                     'url': [url]
                 }
             }]
         }
     _, doc = self._fetch(url)
     return mf2py.parse(doc=doc, url=url)
Example #31
def test_mf2tests():
    allfiles = glob.glob(os.path.join('..', 'mf2tests', 'tests', 'microformats-v2', 'h-card', '*.json'))
    for jsonfile in allfiles:
        htmlfile = jsonfile[:-4] + 'html'
        with open(htmlfile) as f:
            p = mf2py.parse(doc=f, url='http://example.com')
            yield check_unicode, htmlfile, p
        with open(jsonfile) as jsonf:
            try:
                s = json.load(jsonf)
            except:
                s = "bad file: " + jsonfile + sys.exc_info()[0]
        yield check_mf2, htmlfile, p, s
Example #32
def handle_child(child):
    full_child = mf2py.parse(url=child['url'], html_parser='lxml')

    result = [
        item for item in full_child['items'] if item['type'][0] == 'h-entry'
    ]

    if len(result):
        result = result[0]
        result['properties']['url'] = [child['url']]
        return result

    return None
Example #33
def html_to_activities(html, url=None):
  """Converts a microformats2 HTML h-feed to ActivityStreams activities.

  Args:
    html: string HTML
    url: optional string URL that HTML came from

  Returns: list of ActivityStreams activity dicts
  """
  parsed = mf2py.parse(doc=html, url=url)
  hfeed = find_first_entry(parsed, ['h-feed'])
  items = hfeed.get('children', []) if hfeed else parsed.get('items', [])
  return [{'object': json_to_object(item)} for item in items]
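A minimal usage sketch (hypothetical feed HTML; assumes find_first_entry and json_to_object from the same module are in scope):

feed_html = '''
<div class="h-feed">
  <article class="h-entry"><p class="p-name">First post</p></article>
  <article class="h-entry"><p class="p-name">Second post</p></article>
</div>
'''
activities = html_to_activities(feed_html, url='http://example.com/')
# [{'object': {...first post...}}, {'object': {...second post...}}]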
Example #34
def html_to_activities(html, url=None):
    """Converts a microformats2 HTML h-feed to ActivityStreams activities.

  Args:
    html: string HTML
    url: optional string URL that HTML came from

  Returns: list of ActivityStreams activity dicts
  """
    parsed = mf2py.parse(doc=html, url=url)
    hfeed = mf2util.find_first_entry(parsed, ['h-feed'])
    items = hfeed.get('children', []) if hfeed else parsed.get('items', [])
    return [{'object': json_to_object(item)} for item in items]
Example #35
def process_html_feed_for_new_entries(feed, content, backfill, now):
    # strip noscript tags before parsing, since we definitely aren't
    # going to preserve js
    content = re.sub('</?noscript[^>]*>', '', content)

    parsed = mf2util.interpret_feed(
        mf2py.parse(url=feed.feed, doc=content), feed.feed)
    hfeed = parsed.get('entries', [])

    for hentry in hfeed:
        entry = hentry_to_entry(hentry, feed, backfill, now)
        if entry:
            current_app.logger.debug('built entry: %s', entry.permalink)
            yield entry
Example #36
def _extract(text, url):
    mf_doc = mf2py.parse(text)
    if mf_doc.get('items'):
        LOGGER.info("Found mf2 document")
        return [_extract_mf(item, url) for item in mf_doc['items']]

    # no valid mf2, so let's extract from DOM instead
    dom = BeautifulSoup(text, features='html.parser')
    articles = (dom.find_all('article') or dom.find_all(class_='entry')
                or dom.find_all(class_='article') or [dom])

    LOGGER.info("Attempting to extract from ad-hoc HTML")

    return [_extract_dom(item, dom, url) for item in articles]
Example #37
def test_mf2tests():
    allfiles = glob.glob(
        os.path.join('.', 'testsuite', 'tests', '*', '*', '*.json'))
    for jsonfile in allfiles:
        htmlfile = jsonfile[:-4] + 'html'
        with open(htmlfile) as f:
            p = mf2py.parse(doc=f, url='http://example.com')
            yield check_unicode, htmlfile, p
        with open(jsonfile) as jsonf:
            try:
                s = json.load(jsonf)
            except:
                s = "bad file: " + jsonfile + sys.exc_info()[0]
        yield check_mf2, htmlfile, p, s
Example #38
 def test_complete(self, mock_get):
     actual = sites_to_bigquery.generate('orig.com')
     self.assertEqual({
         'domain': 'orig.com',
         'urls': [
             'http://foo.com/',
             'http://foo.com/canonical',
             'http://foo.com/ogp',
             'http://foo.com/twitter',
         ],
         'names': [
             'My Name',
             'A Title',
             'OGP title',
             'OGP site name',
             'Twitter title',
         ],
         'descriptions': [
             'About me',
             'A meta description',
             'An OGP description',
             'Twitter description',
         ],
         'pictures': [
             'http://foo.com/hcard.jpg',
             'http://foo.com/icon.jpg',
             'http://foo.com/ogp.jpg',
             'http://foo.com/ogp2.jpg',
             'https://foo.com/ogp.jpg',
             'http://foo.com/twitter.jpg',
         ],
         'hcard': json.dumps({
             'type': ['h-card'],
             'properties': {
                 'name': ['My Name'],
                 'url': ['http://foo.com/'],
                 'note': [{
                     'html': 'About <br/>me   ',
                     'value': 'About me   ',
                 }],
                 'photo': ['http://foo.com/hcard.jpg'],
             },
         }, sort_keys=True),
         'mf2': json.dumps(mf2py.parse(doc=HTML, url='http://foo.com'),
                           sort_keys=True),
         'rel_mes': ['http://a/silo', 'http://b/silo'],
         'html': HTML,
         'fetch_time': actual['fetch_time']
     }, actual)
Example #39
def extract_mf2_context(context, doc, url):
    """ Gets Microformats2 data from the given document
    """
    cached_mf2 = {}

    # used by authorship algorithm
    def fetch_mf2(url):
        if url in cached_mf2:
            return cached_mf2[url]
        p = mf2py.parse(url=url)
        cached_mf2[url] = p
        return p

    blob = mf2py.parse(doc=doc, url=url)
    cached_mf2[url] = blob

    if blob:
        current_app.logger.debug('parsed successfully by mf2py: %s', url)
        entry = mf2util.interpret(blob, url, fetch_mf2_func=fetch_mf2)
        if entry:
            current_app.logger.debug(
                'parsed successfully by mf2util: %s', url)
            published = entry.get('published')
            content = util.clean_foreign_html(entry.get('content', ''))
            content_plain = util.format_as_text(
                content, link_fn=lambda a: a)

            title = entry.get('name')
            if title and len(title) > 512:
                # FIXME is there a db setting to do this automatically?
                title = title[:512]
            author_name = entry.get('author', {}).get('name', '')
            author_image = entry.get('author', {}).get('photo')

            permalink = entry.get('url')
            if not permalink or not isinstance(permalink, str):
                permalink = url

            context.url = url
            context.permalink = permalink
            context.author_name = author_name
            context.author_url = entry.get('author', {}).get('url', '')
            context.author_image = author_image
            context.content = content
            context.content_plain = content_plain
            context.published = published
            context.title = title

    return context
Example #40
def html_to_atom(html, url=None, fetch_author=False):
  """Converts microformats2 HTML to an Atom feed.

  Args:
    html: string
    url: string URL html came from, optional
    fetch_author: boolean, whether to make HTTP request to fetch rel-author link

  Returns:
    unicode string with Atom XML
  """
  if fetch_author:
    assert url, 'fetch_author=True requires url!'

  parsed = mf2py.parse(doc=html, url=url)
  actor = microformats2.find_author(
    parsed, fetch_mf2_func=lambda url: mf2py.parse(url=url))

  return activities_to_atom(
    microformats2.html_to_activities(html, url, actor),
    actor,
    title=mf2util.interpret_feed(parsed, url).get('name'),
    xml_base=util.base_url(url),
    host_url=url)
Example #41
    def user_to_actor(self, resp):
        """Convert a Flickr user dict into an ActivityStreams actor.
    """
        person = resp.get('person', {})
        username = person.get('username', {}).get('_content')
        obj = util.trim_nulls({
            'objectType':
            'person',
            'displayName':
            person.get('realname', {}).get('_content') or username,
            'image': {
                'url':
                self.get_user_image(person.get('iconfarm'),
                                    person.get('iconserver'),
                                    person.get('nsid')),
            },
            'id':
            self.tag_uri(username),
            # numeric_id is our own custom field that always has the source's numeric
            # user id, if available.
            'numeric_id':
            person.get('nsid'),
            'location': {
                'displayName': person.get('location', {}).get('_content'),
            },
            'username':
            username,
            'description':
            person.get('description', {}).get('_content'),
        })

        # fetch profile page to get url(s)
        profile_url = person.get('profileurl', {}).get('_content')
        if profile_url:
            try:
                logging.debug('fetching flickr profile page %s', profile_url)
                resp = urllib2.urlopen(profile_url,
                                       timeout=appengine_config.HTTP_TIMEOUT)
                profile_json = mf2py.parse(doc=resp, url=profile_url)
                # personal site is likely the first non-flickr url
                urls = profile_json.get('rels', {}).get('me', [])
                obj['urls'] = [{'value': u} for u in urls]
                obj['url'] = next(
                    (u for u in urls
                     if not u.startswith('https://www.flickr.com/')), None)
            except urllib2.URLError:
                logging.warning('could not fetch user homepage %s',
                                profile_url)
Example #42
def html_to_atom(html, url=None, **kwargs):
    """Converts microformats2 HTML to an Atom feed.

  Args:
    html: string
    url: string URL html came from, optional

  Returns: unicode string with Atom XML
  """
    parsed = mf2py.parse(doc=html, url=url)
    return activities_to_atom(microformats2.html_to_activities(html, url),
                              microformats2.find_author(parsed),
                              title=mf2util.interpret_feed(parsed,
                                                           url).get('name'),
                              xml_base=util.base_url(url),
                              host_url=url)
Example #43
def convert_mf2():
    strip_rel_urls = request.args.get('strip_rel_urls') or request.form.get('strip_rel_urls')
    url = request.args.get('url') or request.form.get('url')
    doc = request.args.get('doc') or request.form.get('doc')
    doc = doc and doc.strip()

    if url or doc:
        try:
            json = mf2py.parse(url=url, doc=doc)
            if strip_rel_urls:
                json.pop('rel-urls', None)
            return jsonify(json)
        except:
            return jsonify({'error': str(sys.exc_info()[0])})

    return """
Example #44
def get_client_id_data(url):
    data = mf2py.parse(url=url)
    for item in data['items']:
        if 'h-x-app' in item['type'] or 'h-app' in item['type']:
            props = item.get('properties', {})
            print(props)
            return dict(
                logo=_get_prop(props, 'logo'),
                name=_get_prop(props, 'name'),
                url=_get_prop(props, 'url'),
            )
    return dict(
        logo=None,
        name=url,
        url=url,
    )
Example #45
def html_to_activities(html, url=None, actor=None):
  """Converts a microformats2 HTML h-feed to ActivityStreams activities.

  Args:
    html: string HTML
    url: optional string URL that HTML came from
    actor: optional author AS actor object for all activities. usually comes
      from a rel="author" link.

  Returns:
    list of ActivityStreams activity dicts
  """
  parsed = mf2py.parse(doc=html, url=url)
  hfeed = mf2util.find_first_entry(parsed, ['h-feed'])
  items = hfeed.get('children', []) if hfeed else parsed.get('items', [])
  return [{'object': json_to_object(item, actor=actor)} for item in items]
Example #47
def lookup(url: str) -> ap.BaseActivity:
    """Try to find an AP object related to the given URL."""
    try:
        if url.startswith("@"):
            actor_url = get_actor_url(url)
            if actor_url:
                return ap.fetch_remote_activity(actor_url)
    except NotAnActivityError:
        pass
    except requests.HTTPError:
        # Some websites may return 404, 503 or other errors when they don't support
        # WebFinger, and we're just taking a guess when performing the lookup.
        pass
    except requests.RequestException as err:
        raise RemoteServerUnavailableError(f"failed to fetch {url}: {err!r}")

    backend = ap.get_backend()
    try:
        resp = requests.head(
            url,
            timeout=10,
            allow_redirects=True,
            headers={"User-Agent": backend.user_agent()},
        )
    except requests.RequestException as err:
        raise RemoteServerUnavailableError(f"failed to HEAD {url}: {err!r}")

    try:
        resp.raise_for_status()
    except Exception:
        return ap.fetch_remote_activity(url)

    # If the page is HTML, maybe it contains an alternate link pointing to an AP object
    for alternate in mf2py.parse(resp.text).get("alternates", []):
        if alternate.get("type") == "application/activity+json":
            return ap.fetch_remote_activity(alternate["url"])

    try:
        # Maybe the page was JSON-LD?
        data = resp.json()
        return ap.parse_activity(data)
    except Exception:
        pass

    # Try content negotiation (retry with the AP Accept header)
    return ap.fetch_remote_activity(url)
Example #48
def find_possible_feeds(origin):
    # scrape an origin source to find possible alternative feeds
    try:
        resp = util.requests_get(origin)
    except requests.exceptions.RequestException as e:
        flask.flash("Error fetching source {}".format(repr(e)))
        flask.current_app.logger.warn("Subscribe failed for %s with error %s", origin, repr(e))
        return None

    feeds = []

    xml_feed_types = [
        "application/rss+xml",
        "application/atom+xml",
        "application/rdf+xml",
        "application/xml",
        "text/xml",
    ]
    xml_mime_types = xml_feed_types + ["text/rss+xml", "text/atom+xml"]

    content_type = resp.headers["content-type"]
    content_type = content_type.split(";", 1)[0].strip()
    if content_type in xml_mime_types:
        feeds.append({"origin": origin, "feed": origin, "type": "xml", "title": "untitled xml feed"})

    elif content_type == "text/html":
        parsed = mf2py.parse(doc=resp.text, url=origin)
        # if text/html, then parse and look for h-entries
        hfeed = mf2util.interpret_feed(parsed, origin)
        if hfeed.get("entries"):
            ftitle = hfeed.get("name") or "untitled h-feed"
            feeds.append({"origin": origin, "feed": resp.url, "type": "html", "title": ftitle[:140]})

        # look for link="feed"
        for furl in parsed.get("rels", {}).get("feed", []):
            fprops = parsed.get("rel-urls", {}).get(furl, {})
            if not fprops.get("type") or fprops.get("type") == "text/html":
                feeds.append({"origin": origin, "feed": furl, "type": "html", "title": fprops.get("title")})

        # then look for link rel="alternate"
        for link in parsed.get("alternates", []):
            if link.get("type") in xml_feed_types:
                feeds.append({"origin": origin, "feed": link.get("url"), "type": "xml", "title": link.get("title")})

    return feeds
Example #49
def callback(info):
    if info.error:
        flash('Micropub failure: {}'.format(info.error))
    else:
        flash('Micropub success! Authorized {}'.format(info.me))

    p = mf2py.parse(url=info.me)

    current_app.logger.debug('found author info %s', info.me)
    target = PosseTarget(
        uid=info.me,
        name=info.me,
        style='microblog',
        micropub_endpoint=info.micropub_endpoint,
        access_token=info.access_token)

    current_user.posse_targets.append(target)
    db.session.commit()
    return redirect(url_for('.edit', target_id=target.id))
Example #50
def fetch_reply_context(entry_id, in_reply_to, now):
    with flask_app():
        entry = Entry.query.get(entry_id)
        context = Entry.query\
                       .join(Entry.feed)\
                       .filter(Entry.permalink==in_reply_to, Feed.type == 'html')\
                       .first()

        if not context:
            current_app.logger.info('fetching in-reply-to url: %s',
                                    in_reply_to)
            parsed = mf2util.interpret(
                mf2py.parse(url=proxy_url(in_reply_to)), in_reply_to)
            if parsed:
                context = hentry_to_entry(parsed, in_reply_to, False, now)

        if context:
            entry.reply_context.append(context)
            db.session.commit()
Example #51
  def user_to_actor(self, resp):
    """Convert a Flickr user dict into an ActivityStreams actor.
    """
    person = resp.get('person', {})
    username = person.get('username', {}).get('_content')
    obj = util.trim_nulls({
      'objectType': 'person',
      'displayName': person.get('realname', {}).get('_content') or username,
      'image': {
        'url': self.get_user_image(person.get('iconfarm'),
                                   person.get('iconserver'),
                                   person.get('nsid')),
      },
      'id': self.tag_uri(username),
      # numeric_id is our own custom field that always has the source's numeric
      # user id, if available.
      'numeric_id': person.get('nsid'),
      'location': {
        'displayName': person.get('location', {}).get('_content'),
      },
      'username': username,
      'description': person.get('description', {}).get('_content'),
    })

    # fetch profile page to get url(s)
    profile_url = person.get('profileurl', {}).get('_content')
    if profile_url:
      try:
        logging.debug('fetching flickr profile page %s', profile_url)
        resp = urllib2.urlopen(
          profile_url, timeout=appengine_config.HTTP_TIMEOUT)
        profile_json = mf2py.parse(doc=resp, url=profile_url)
        # personal site is likely the first non-flickr url
        urls = profile_json.get('rels', {}).get('me', [])
        obj['urls'] = [{'value': u} for u in urls]
        obj['url'] = next(
          (u for u in urls if not u.startswith('https://www.flickr.com/')),
          None)
      except urllib2.URLError:
        logging.warning('could not fetch user homepage %s', profile_url)
Example #52
  def user_to_actor(self, resp):
    """Convert a Flickr user dict into an ActivityStreams actor.
    """
    person = resp.get('person', {})
    username = person.get('username', {}).get('_content')
    obj = util.trim_nulls({
      'objectType': 'person',
      'displayName': person.get('realname', {}).get('_content') or username,
      'image': {
        'url': self.get_user_image(person.get('iconfarm'),
                                   person.get('iconserver'),
                                   person.get('nsid')),
      },
      'id': self.tag_uri(username),
      # numeric_id is our own custom field that always has the source's numeric
      # user id, if available.
      'numeric_id': person.get('nsid'),
      'location': {
        'displayName': person.get('location', {}).get('_content'),
      },
      'username': username,
      'description': person.get('description', {}).get('_content'),
    })

    # fetch profile page to get url(s)
    profile_url = person.get('profileurl', {}).get('_content')
    if profile_url:
      try:
        resp = util.urlopen(profile_url)
        profile_json = mf2py.parse(doc=resp, url=profile_url, img_with_alt=True)
        urls = profile_json.get('rels', {}).get('me', [])
        if urls:
          obj['url'] = urls[0]
        if len(urls) > 1:
          obj['urls'] = [{'value': u} for u in urls]
      except urllib_error.URLError:
        logging.warning('could not fetch user homepage %s', profile_url)

    return self.postprocess_object(obj)
Example #53
def process_html_feed_for_new_entries(feed, content, backfill, now, fetch_mf2_func):
    # strip noscript tags before parsing, since we definitely aren't
    # going to preserve js
    content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)

    # look for a <base> element
    doc = bs4.BeautifulSoup(content, 'html5lib')
    base_el = doc.find('base')
    base_href = base_el.get('href') if base_el else None

    parsed = mf2util.interpret_feed(
        mf2py.parse(doc, feed.feed),
        source_url=feed.feed, base_href=base_href,
        fetch_mf2_func=fetch_mf2_func)
    hfeed = parsed.get('entries', [])

    for hentry in hfeed:
        current_app.logger.debug('building entry: %s', hentry.get('url'))
        entry = hentry_to_entry(hentry, feed, backfill, now)
        if entry:
            current_app.logger.debug('built entry: %s', entry.permalink)
            yield entry
Example #54
def fetch_context():
    url = request.args.get('url')
    if not url:
        return make_response(jsonify({
            'error': 'missing_url',
            'message': "Missing 'url' query parameter",
        }), 400)

    # TODO cache everything. check newer urls more frequently than
    # older urls. be careful not to overwrite previous good responses
    # with failure.

    url = maybe_proxy(url)
    resp = fetch(url)

    if resp.status_code // 100 != 2:
        return make_response(jsonify({
            'error': 'fetch_failed',
            'message': 'Failed to fetch resource at ' + url,
            'response': resp.text,
            'code': resp.status_code,
        }), resp.status_code)

    # if no charset is declared in the headers, pass raw bytes and let the
    # HTML parser sniff the encoding
    parsed = mf2py.parse(
        doc=resp.text if 'charset' in resp.headers.get('content-type', '')
        else resp.content,
        url=url)
    entry = mf2util.interpret(parsed, url, want_json=True)

    blob = {}
    if entry:
        blob['data'] = entry

    cb = request.args.get('callback')
    if cb:  # jsonp
        resp = make_response('{}({})'.format(cb, json.dumps(blob)))
        resp.headers['content-type'] = 'application/javascript; charset=utf-8'
        return resp

    return jsonify(blob)
Example #55
def _find_feed_items(feed_url, feed_doc):
  """Extract feed items from a given URL and document. If the top-level
  h-* item is an h-feed, return its children. Otherwise, returns the
  top-level items.

  Args:
    feed_url: a string. the URL passed to mf2py parser
    feed_doc: a string or BeautifulSoup object. document is passed to
      mf2py parser

  Returns:
    a list of dicts, each one representing an mf2 h-* item
  """
  parsed = mf2py.parse(url=feed_url, doc=feed_doc)

  feeditems = parsed['items']
  hfeed = mf2util.find_first_entry(parsed, ('h-feed',))
  if hfeed:
    feeditems = hfeed.get('children', [])
  else:
    logging.debug('No h-feed found, fallback to top-level h-entrys.')
  return feeditems
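A minimal usage sketch, with an inline document standing in for a fetched feed page (hypothetical URL):

doc = '''
<div class="h-feed">
  <article class="h-entry"><a class="u-url" href="/1">One</a></article>
  <article class="h-entry"><a class="u-url" href="/2">Two</a></article>
</div>
'''
items = _find_feed_items('http://example.com/', doc)
# the two h-entry dicts, i.e. the children of the top-level h-feed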