Example 1
def _find_feed_items(feed_url, feed_doc):
    """Extract feed items from a given URL and document. If the top-level
    h-* item is an h-feed, return its children. Otherwise, return the
    top-level items.

    Args:
      feed_url: a string, the URL passed to the mf2py parser
      feed_doc: a string or BeautifulSoup object, the document passed to the
        mf2py parser

    Returns:
      a list of dicts, each one representing an mf2 h-* item
    """
    parsed = util.mf2py_parse(feed_doc, feed_url)

    feeditems = parsed['items']
    hfeeds = mf2util.find_all_entries(parsed, ('h-feed', ))
    if hfeeds:
        feeditems = list(
            itertools.chain.from_iterable(
                hfeed.get('children', []) for hfeed in hfeeds))
    else:
        logging.debug('No h-feed found, falling back to top-level h-entries.')

    if len(feeditems) > MAX_FEED_ENTRIES:
        logging.info('%s has %s entries! only processing the first %s.',
                     feed_url, len(feeditems), MAX_FEED_ENTRIES)
        feeditems = feeditems[:MAX_FEED_ENTRIES]

    return feeditems
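
Note: util.mf2py_parse above appears to be a thin wrapper around mf2py.parse. As a rough, self-contained sketch of the same idea, using mf2py directly, a simplified top-level h-feed check in place of mf2util.find_all_entries' recursive search, and a hypothetical max_entries cap standing in for MAX_FEED_ENTRIES:

import itertools

import mf2py


def find_feed_items_sketch(feed_url, feed_doc, max_entries=200):
    """Return the children of any top-level h-feed, else the top-level items."""
    parsed = mf2py.parse(doc=feed_doc, url=feed_url)
    items = parsed.get('items', [])
    # simplified: only checks top-level items for an h-feed
    hfeeds = [item for item in items if 'h-feed' in item.get('type', [])]
    if hfeeds:
        items = list(itertools.chain.from_iterable(
            hfeed.get('children', []) for hfeed in hfeeds))
    return items[:max_entries]  # illustrative cap, not Bridgy's MAX_FEED_ENTRIES
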
Example 2
  def resolve_profile_url(url, resolve=True):
    """Resolves a profile URL to be added to a source.

    Args:
      url: string
      resolve: boolean, whether to make HTTP requests to follow redirects, etc.

    Returns: string, resolved URL, or None
    """
    final, _, ok = util.get_webmention_target(url, resolve=resolve)
    if not ok:
      return None

    final = final.lower()
    if util.schemeless(final).startswith(util.schemeless(url.lower())):
      # redirected to a deeper path. use the original higher level URL. #652
      final = url

    # If final has a path segment check if root has a matching rel=me.
    match = re.match(r'^(https?://[^/]+)/.+', final)
    if match and resolve:
      root = match.group(1)
      try:
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      except requests.RequestException:
        logging.warning("Couldn't fetch %s, preserving path in %s",
                        root, final, exc_info=True)

    return final
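
Note: the root rel=me check above can be illustrated standalone with requests and mf2py in place of the project's util helpers. A hedged sketch, not the actual implementation:

import re

import mf2py
import requests


def collapse_to_root_if_rel_me(url):
    """If url has a path and its root page lists url as rel=me, return the root."""
    match = re.match(r'^(https?://[^/]+)/.+', url)
    if not match:
        return url
    root = match.group(1)
    try:
        resp = requests.get(root, timeout=30)
        resp.raise_for_status()
        parsed = mf2py.parse(doc=resp.text, url=root)
    except requests.RequestException:
        return url  # couldn't fetch the root; keep the original URL
    return root if url in parsed.get('rels', {}).get('me', []) else url
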
Example 3
def _find_feed_items(feed_url, feed_doc):
  """Extract feed items from a given URL and document. If the top-level
  h-* item is an h-feed, return its children. Otherwise, return the
  top-level items.

  Args:
    feed_url: a string, the URL passed to the mf2py parser
    feed_doc: a string or BeautifulSoup object, the document passed to the
      mf2py parser

  Returns:
    a list of dicts, each one representing an mf2 h-* item
  """
  parsed = util.mf2py_parse(feed_doc, feed_url)

  feeditems = parsed['items']
  hfeeds = mf2util.find_all_entries(parsed, ('h-feed',))
  if hfeeds:
    feeditems = list(itertools.chain.from_iterable(
      hfeed.get('children', []) for hfeed in hfeeds))
  else:
    logging.debug('No h-feed found, falling back to top-level h-entries.')

  if len(feeditems) > MAX_FEED_ENTRIES:
    logging.info('%s has %s entries! only processing the first %s.',
                 feed_url, len(feeditems), MAX_FEED_ENTRIES)
    feeditems = feeditems[:MAX_FEED_ENTRIES]

  return feeditems
Example 4
    def _urls_and_domains(self, auth_entity, user_url):
        """Returns this user's valid (not webmention-blacklisted) URLs and domains.

        Converts the auth entity's user_json to an ActivityStreams actor and uses
        its 'urls' and 'url' fields. May be overridden by subclasses.

        Args:
          auth_entity: :class:`oauth_dropins.models.BaseAuth`
          user_url: string, optional URL passed in when authorizing

        Returns:
          ([string url, ...], [string domain, ...])
        """
        actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
        logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

        candidates = util.trim_nulls(
            util.uniquify([user_url] + microformats2.object_urls(actor)))

        if len(candidates) > MAX_AUTHOR_URLS:
            logging.info(
                'Too many profile links! Only resolving the first %s: %s',
                MAX_AUTHOR_URLS, candidates)

        urls = []
        for i, url in enumerate(candidates):
            final, domain, ok = util.get_webmention_target(
                url, resolve=i < MAX_AUTHOR_URLS)
            if ok:
                final = final.lower()
                if util.schemeless(final).startswith(
                        util.schemeless(url.lower())):
                    # redirected to a deeper path. use the original higher level URL. #652
                    final = url
                # If final has a path segment check if root has a matching rel=me.
                match = re.match(r'^(https?://[^/]+)/.+', final)
                if match and i < MAX_AUTHOR_URLS:
                    root = match.group(1)
                    resp = util.requests_get(root)
                    resp.raise_for_status()
                    data = util.mf2py_parse(resp.text, root)
                    me_urls = data.get('rels', {}).get('me', [])
                    if final in me_urls:
                        final = root
                urls.append(final)

        urls = util.dedupe_urls(urls)  # normalizes domains to lower case
        domains = [util.domain_from_link(url) for url in urls]
        return urls, domains
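
Note: one piece of the loop above in isolation is the "redirected to a deeper path" guard, which keeps the original URL when the resolved one merely adds a path under it. Sketch only; schemeless below is an assumption about what util.schemeless does (strip the http(s):// prefix):

import re


def schemeless(url):
    """Assumed behavior of util.schemeless: drop the leading http(s)://."""
    return re.sub(r'^https?://', '', url)


def prefer_original_on_deeper_redirect(original, resolved):
    # e.g. original example.com resolving to example.com/home: keep example.com
    if schemeless(resolved.lower()).startswith(schemeless(original.lower())):
        return original
    return resolved
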
Example 5
    def test_user_page_publish_url_with_unicode_char(self):
        """Check the custom mf2 we render on social user pages."""
        self.sources[0].features = ['publish']
        self.sources[0].put()

        url = u'https://ptt.com/ransomw…ocks-user-access/'
        Publish(parent=PublishedPage(id=url.encode('utf-8')).key,
                source=self.sources[0].key).put()

        user_url = self.sources[0].bridgy_path()
        resp = app.application.get_response(user_url)
        self.assertEquals(200, resp.status_int)

        parsed = util.mf2py_parse(resp.body, user_url)
        publish = parsed['items'][0]['children'][0]
Example 6
  def test_user_page_publish_url_with_unicode_char(self):
    """Check the custom mf2 we render on social user pages."""
    self.sources[0].features = ['publish']
    self.sources[0].put()

    url = u'https://ptt.com/ransomw…ocks-user-access/'
    Publish(parent=PublishedPage(id=url.encode('utf-8')).key,
            source=self.sources[0].key).put()

    user_url = self.sources[0].bridgy_path()
    resp = app.application.get_response(user_url)
    self.assertEquals(200, resp.status_int)

    parsed = util.mf2py_parse(resp.body, user_url)
    publish = parsed['items'][0]['children'][0]
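
Note: the parsed['items'][0]['children'][0] access in this test relies on the shape of mf2py output, where nested microformats that aren't properties of their parent land in a 'children' list. An illustrative snippet (invented HTML, not Bridgy's actual template):

import mf2py

html = """
<div class="h-card">
  <a class="p-name u-url" href="http://fa.ke/profile/url">Fake User</a>
  <div class="h-bridgy-publish">
    <a class="u-url" href="https://ptt.com/example-post/">post</a>
  </div>
</div>
"""
parsed = mf2py.parse(doc=html, url='http://localhost/fake/user')
hcard = parsed['items'][0]           # {'type': ['h-card'], 'properties': {...}, 'children': [...]}
publish = hcard['children'][0]       # {'type': ['h-bridgy-publish'], ...}
print(publish['properties']['url'])  # ['https://ptt.com/example-post/']
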
Example 7
  def _urls_and_domains(self, auth_entity, user_url):
    """Returns this user's valid (not webmention-blacklisted) URLs and domains.

    Converts the auth entity's user_json to an ActivityStreams actor and uses
    its 'urls' and 'url' fields. May be overridden by subclasses.

    Args:
      auth_entity: :class:`oauth_dropins.models.BaseAuth`
      user_url: string, optional URL passed in when authorizing

    Returns:
      ([string url, ...], [string domain, ...])
    """
    actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
    logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

    candidates = util.trim_nulls(util.uniquify(
        [user_url] + microformats2.object_urls(actor)))

    if len(candidates) > MAX_AUTHOR_URLS:
      logging.info('Too many profile links! Only resolving the first %s: %s',
                   MAX_AUTHOR_URLS, candidates)

    urls = []
    for i, url in enumerate(candidates):
      final, domain, ok = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
      if ok:
        final = final.lower()
        if util.schemeless(final).startswith(util.schemeless(url.lower())):
          # redirected to a deeper path. use the original higher level URL. #652
          final = url
        # If final has a path segment check if root has a matching rel=me.
        match = re.match(r'^(https?://[^/]+)/.+', final)
        if match and i < MAX_AUTHOR_URLS:
          root = match.group(1)
          resp = util.requests_get(root)
          resp.raise_for_status()
          data = util.mf2py_parse(resp.text, root)
          me_urls = data.get('rels', {}).get('me', [])
          if final in me_urls:
            final = root
        urls.append(final)

    urls = util.dedupe_urls(urls)  # normalizes domains to lower case
    domains = [util.domain_from_link(url) for url in urls]
    return urls, domains
Example 8
  def test_social_user_page_mf2(self):
    """Check the custom mf2 we render on social user pages."""
    self.sources[0].features = ['listen', 'publish']
    self.sources[0].put()
    for entity in self.responses + self.publishes + self.blogposts:
      entity.put()

    user_url = self.sources[0].bridgy_path()
    resp = app.application.get_response(user_url)
    self.assertEquals(200, resp.status_int)

    parsed = util.mf2py_parse(resp.body, user_url)
    hcard = parsed.get('items', [])[0]
    self.assertEquals(['h-card'], hcard['type'])
    self.assertEquals(
      ['Fake User'], hcard['properties'].get('name'))
    self.assertEquals(
      ['http://fa.ke/profile/url'], hcard['properties'].get('url'))
    self.assertEquals(
      ['enabled'], hcard['properties'].get('bridgy-account-status'))
    self.assertEquals(
      ['enabled'], hcard['properties'].get('bridgy-listen-status'))
    self.assertEquals(
      ['enabled'], hcard['properties'].get('bridgy-publish-status'))

    expected_resps = self.responses[:10]
    for item, resp in zip(hcard['children'], expected_resps):
      self.assertIn('h-bridgy-response', item['type'])
      props = item['properties']
      self.assertEquals([resp.status], props['bridgy-status'])
      self.assertEquals([json.loads(resp.activities_json[0])['url']],
                        props['bridgy-original-source'])
      self.assertEquals(resp.unsent, props['bridgy-target'])

    publish = hcard['children'][len(expected_resps)]
    self.assertIn('h-bridgy-publish', publish['type'])
    props = publish['properties']
    self.assertEquals([self.publishes[0].key.parent().id()], props['url'])
    self.assertEquals([self.publishes[0].status], props['bridgy-status'])
Example 9
    def test_social_user_page_mf2(self):
        """Check the custom mf2 we render on social user pages."""
        self.sources[0].features = ['listen', 'publish']
        self.sources[0].put()
        for entity in self.responses + self.publishes + self.blogposts:
            entity.put()

        user_url = self.sources[0].bridgy_path()
        resp = app.application.get_response(user_url)
        self.assertEquals(200, resp.status_int)

        parsed = util.mf2py_parse(resp.body, user_url)
        hcard = parsed.get('items', [])[0]
        self.assertEquals(['h-card'], hcard['type'])
        self.assertEquals(['Fake User'], hcard['properties'].get('name'))
        self.assertEquals(['http://fa.ke/profile/url'],
                          hcard['properties'].get('url'))
        self.assertEquals(['enabled'],
                          hcard['properties'].get('bridgy-account-status'))
        self.assertEquals(['enabled'],
                          hcard['properties'].get('bridgy-listen-status'))
        self.assertEquals(['enabled'],
                          hcard['properties'].get('bridgy-publish-status'))

        expected_resps = self.responses[:10]
        for item, resp in zip(hcard['children'], expected_resps):
            self.assertIn('h-bridgy-response', item['type'])
            props = item['properties']
            self.assertEquals([resp.status], props['bridgy-status'])
            self.assertEquals([json.loads(resp.activities_json[0])['url']],
                              props['bridgy-original-source'])
            self.assertEquals(resp.unsent, props['bridgy-target'])

        publish = hcard['children'][len(expected_resps)]
        self.assertIn('h-bridgy-publish', publish['type'])
        props = publish['properties']
        self.assertEquals([self.publishes[0].key.parent().id()], props['url'])
        self.assertEquals([self.publishes[0].status], props['bridgy-status'])
Example 10
    def expand_target_urls(self, activity):
        """Expand the inReplyTo or object fields of an ActivityStreams object
        by fetching the original and looking for rel=syndication URLs.

        This method modifies the dict in place.

        Args:
          activity: an ActivityStreams dict of the activity being published
        """
        for field in ('inReplyTo', 'object'):
            # microformats2.json_to_object de-dupes, no need to do it here
            objs = activity.get(field)
            if not objs:
                continue

            if isinstance(objs, dict):
                objs = [objs]

            augmented = list(objs)
            for obj in objs:
                url = obj.get('url')
                if not url:
                    continue

                # get_webmention_target weeds out silos and non-HTML targets
                # that we wouldn't want to download and parse
                url, _, ok = util.get_webmention_target(url)
                if not ok:
                    continue

                # fetch_mf2 raises a fuss if it can't fetch a mf2 document;
                # easier to just grab this ourselves than add a bunch of
                # special-cases to that method
                logging.debug('expand_target_urls fetching field=%s, url=%s',
                              field, url)
                try:
                    resp = util.requests_get(url)
                    resp.raise_for_status()
                    data = util.mf2py_parse(resp.text, url)
                except AssertionError:
                    raise  # for unit tests
                except BaseException:
                    # it's not a big deal if we can't fetch an in-reply-to url
                    logging.warning(
                        'expand_target_urls could not fetch field=%s, url=%s',
                        field,
                        url,
                        exc_info=True)
                    continue

                synd_urls = data.get('rels', {}).get('syndication', [])

                # look for syndication urls in the first h-entry
                queue = collections.deque(data.get('items', []))
                while queue:
                    item = queue.popleft()
                    item_types = set(item.get('type', []))
                    if 'h-feed' in item_types and 'h-entry' not in item_types:
                        queue.extend(item.get('children', []))
                        continue

                    # these can be urls or h-cites
                    synd_urls += microformats2.get_string_urls(
                        item.get('properties', {}).get('syndication', []))

                logging.debug(
                    'expand_target_urls found rel=syndication for url=%s: %r',
                    url, synd_urls)
                augmented += [{'url': u} for u in synd_urls]

            activity[field] = augmented
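
Note: the syndication-link walk above, condensed into a standalone helper over mf2py output. It simplifies microformats2.get_string_urls to a plain string filter, so embedded h-cite dicts are skipped rather than unwrapped:

import collections


def find_syndication_urls(parsed_mf2):
    """Collect rel=syndication plus u-syndication URLs, descending into h-feeds."""
    urls = list(parsed_mf2.get('rels', {}).get('syndication', []))
    queue = collections.deque(parsed_mf2.get('items', []))
    while queue:
        item = queue.popleft()
        types = set(item.get('type', []))
        if 'h-feed' in types and 'h-entry' not in types:
            queue.extend(item.get('children', []))
            continue
        for value in item.get('properties', {}).get('syndication', []):
            if isinstance(value, str):  # h-cite dicts would need their 'url' property
                urls.append(value)
    return urls
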
Example 11
    def fetch_mf2(self, url, require_mf2=True, raise_errors=False):
        """Fetches a URL and extracts its mf2 data.

        Side effects: sets :attr:`entity`\ .html on success, calls :attr:`error()`
        on errors.

        Args:
          url: string
          require_mf2: boolean, whether to return an error if no mf2 is found
          raise_errors: boolean, whether to let error exceptions propagate up or
            handle them

        Returns:
          (:class:`requests.Response`, mf2 data dict) on success, None on failure
        """
        try:
            fetched = util.requests_get(url)
            fetched.raise_for_status()
        except BaseException as e:
            if raise_errors:
                raise
            util.interpret_http_exception(e)  # log exception
            return self.error('Could not fetch source URL %s' % url)

        if self.entity:
            self.entity.html = fetched.text

        # .text is decoded unicode string, .content is raw bytes. if the HTTP
        # headers didn't specify a charset, pass raw bytes to BeautifulSoup so it
        # can look for a <meta> tag with a charset and decode.
        text = (fetched.text if 'charset' in fetched.headers.get(
            'content-type', '') else fetched.content)
        doc = util.beautifulsoup_parse(text)

        # parse microformats
        data = util.mf2py_parse(doc, fetched.url)

        # special case tumblr's markup: div#content > div.post > div.copy
        # convert to mf2 and re-parse
        if not data.get('items'):
            contents = doc.find_all(id='content')
            if contents:
                post = contents[0].find_next(class_='post')
                if post:
                    post['class'] = 'h-entry'
                    copy = post.find_next(class_='copy')
                    if copy:
                        copy['class'] = 'e-content'
                    photo = post.find_next(class_='photo-wrapper')
                    if photo:
                        img = photo.find_next('img')
                        if img:
                            img['class'] = 'u-photo'
                    doc = unicode(post)
                    data = util.mf2py_parse(doc, fetched.url)

        logging.debug('Parsed microformats2: %s', json.dumps(data, indent=2))
        items = data.get('items', [])
        if require_mf2 and (not items or not items[0]):
            return self.error('No microformats2 data found in ' + fetched.url,
                              data=data,
                              html="""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="%s">%s</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""" % (fetched.url, util.pretty_link(fetched.url)))

        return fetched, data
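
Note: the charset handling above in isolation: when the response declares a charset, use requests' decoded .text; otherwise hand BeautifulSoup the raw bytes so it can sniff a <meta> charset itself. Sketch only; util.beautifulsoup_parse may choose a different parser than html.parser:

import requests
from bs4 import BeautifulSoup


def fetch_and_soup(url):
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    if 'charset' in resp.headers.get('content-type', ''):
        markup = resp.text     # already decoded using the declared charset
    else:
        markup = resp.content  # raw bytes; let BeautifulSoup detect the encoding
    return resp, BeautifulSoup(markup, 'html.parser')
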
Example 12
  def expand_target_urls(self, activity):
    """Expand the inReplyTo or object fields of an ActivityStreams object
    by fetching the original and looking for rel=syndication URLs.

    This method modifies the dict in place.

    Args:
      activity: an ActivityStreams dict of the activity being published
    """
    for field in ('inReplyTo', 'object'):
      # microformats2.json_to_object de-dupes, no need to do it here
      objs = activity.get(field)
      if not objs:
        continue

      if isinstance(objs, dict):
        objs = [objs]

      augmented = list(objs)
      for obj in objs:
        url = obj.get('url')
        if not url:
          continue

        # get_webmention_target weeds out silos and non-HTML targets
        # that we wouldn't want to download and parse
        url, _, ok = util.get_webmention_target(url)
        if not ok:
          continue

        # fetch_mf2 raises a fuss if it can't fetch a mf2 document;
        # easier to just grab this ourselves than add a bunch of
        # special-cases to that method
        logging.debug('expand_target_urls fetching field=%s, url=%s', field, url)
        try:
          resp = util.requests_get(url)
          resp.raise_for_status()
          data = util.mf2py_parse(resp.text, url)
        except AssertionError:
          raise  # for unit tests
        except BaseException:
          # it's not a big deal if we can't fetch an in-reply-to url
          logging.warning('expand_target_urls could not fetch field=%s, url=%s',
                          field, url, exc_info=True)
          continue

        synd_urls = data.get('rels', {}).get('syndication', [])

        # look for syndication urls in the first h-entry
        queue = collections.deque(data.get('items', []))
        while queue:
          item = queue.popleft()
          item_types = set(item.get('type', []))
          if 'h-feed' in item_types and 'h-entry' not in item_types:
            queue.extend(item.get('children', []))
            continue

          # these can be urls or h-cites
          synd_urls += microformats2.get_string_urls(
            item.get('properties', {}).get('syndication', []))

        logging.debug('expand_target_urls found rel=syndication for url=%s: %r', url, synd_urls)
        augmented += [{'url': u} for u in synd_urls]

      activity[field] = augmented
Example 13
def process_entry(source,
                  permalink,
                  feed_entry,
                  refetch,
                  preexisting,
                  store_blanks=True):
    """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`.

    Args:
      source:
      permalink: url of the unprocessed post
      feed_entry: the h-feed version of the h-entry dict, often contains
        a partial version of the h-entry at the permalink
      refetch: boolean, whether to refetch and process entries we've seen before
      preexisting: list of previously discovered :class:`models.SyndicatedPost`\ s
        for this permalink
      store_blanks: boolean, whether we should store blank
        :class:`models.SyndicatedPost`\ s when we don't find a relationship

    Returns:
      a dict from syndicated url to a list of new :class:`models.SyndicatedPost`\ s
    """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # if we're refetching and this one is blank, do not return.
        # if there is a blank entry, it should be the one and only entry,
        # but go ahead and check 'all' of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logging.debug(
                'previously found relationship(s) for original %s: %s',
                permalink, synds)

    # first try with the h-entry from the h-feed. if we find the syndication url
    # we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    if usynd:
        logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
    results = _process_syndication_urls(
        source, permalink,
        set(url for url in usynd if isinstance(url, basestring)), preexisting)
    success = True

    if results:
        source.updates['last_feed_syndication_url'] = util.now_fn()
    elif not source.last_feed_syndication_url or not feed_entry:
        # fetch the full permalink page if we think it might have more details
        parsed = None
        try:
            logging.debug('fetching post permalink %s', permalink)
            if type_ok:
                resp = util.requests_get(permalink)
                resp.raise_for_status()
                parsed = util.mf2py_parse(resp.text, permalink)
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logging.info('Could not fetch permalink %s',
                         permalink,
                         exc_info=True)
            success = False

        if parsed:
            syndication_urls = set()
            relsynd = parsed.get('rels', {}).get('syndication', [])
            if relsynd:
                logging.debug('rel-syndication links: %s', relsynd)
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, basestring))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in parsed['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logging.debug('u-syndication links: %s', usynd)
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, basestring))
            results = _process_syndication_urls(source, permalink,
                                                syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        # list, not iterator: membership is checked repeatedly in the loop below
        result_syndposts = list(itertools.chain(*results.values()))
        for syndpost in list(preexisting):
            if syndpost.syndication and syndpost not in result_syndposts:
                logging.info('deleting relationship that disappeared: %s',
                             syndpost)
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logging.debug('no syndication links from %s to current source %s.',
                      permalink, source.label())
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logging.debug(
                'saving empty relationship so that %s will not be '
                'searched again', permalink)
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.iteritems():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logging.debug('discovered relationships %s', new_results)
    return new_results
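
Note: the final filtering step above in isolation: only relationships that weren't already in preexisting are returned. A plain-dict sketch without the ndb model types, using .items() instead of the Python 2 .iteritems() seen above:

def only_new_results(results, preexisting):
    new_results = {}
    for synd_url, syndposts in results.items():
        for syndpost in syndposts:
            if syndpost not in preexisting:
                new_results.setdefault(synd_url, []).append(syndpost)
    return new_results
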
Example 14
    def test_social_user_page_mf2(self):
        """Check the custom mf2 we render on social user pages."""
        self.sources[0].features = ['listen', 'publish']
        self.sources[0].put()

        # test invite with missing object and content
        resp = json.loads(self.responses[8].response_json)
        resp['verb'] = 'invite'
        resp.pop('object', None)
        resp.pop('content', None)
        self.responses[8].response_json = json.dumps(resp)

        # test that invites render the invitee, not the inviter
        # https://github.com/snarfed/bridgy/issues/754
        self.responses[9].type = 'rsvp'
        self.responses[9].response_json = json.dumps({
            'id': 'tag:fa.ke,2013:111',
            'objectType': 'activity',
            'verb': 'invite',
            'url': 'http://fa.ke/event',
            'actor': {
                'displayName': 'Mrs. Host',
                'url': 'http://fa.ke/host',
            },
            'object': {
                'objectType': 'person',
                'displayName': 'Ms. Guest',
                'url': 'http://fa.ke/guest',
            },
        })

        for entity in self.responses + self.publishes + self.blogposts:
            entity.put()

        user_url = self.sources[0].bridgy_path()
        response = app.application.get_response(user_url)
        self.assertEquals(200, response.status_int)

        parsed = util.mf2py_parse(response.body, user_url)
        hcard = parsed.get('items', [])[0]
        self.assertEquals(['h-card'], hcard['type'])
        self.assertEquals(['Fake User'], hcard['properties'].get('name'))
        self.assertEquals(['http://fa.ke/profile/url'],
                          hcard['properties'].get('url'))
        self.assertEquals(['enabled'],
                          hcard['properties'].get('bridgy-account-status'))
        self.assertEquals(['enabled'],
                          hcard['properties'].get('bridgy-listen-status'))
        self.assertEquals(['enabled'],
                          hcard['properties'].get('bridgy-publish-status'))

        expected_resps = self.responses[:10]
        for item, resp in zip(hcard['children'], expected_resps):
            self.assertIn('h-bridgy-response', item['type'])
            props = item['properties']
            self.assertEquals([resp.status], props['bridgy-status'])
            self.assertEquals([json.loads(resp.activities_json[0])['url']],
                              props['bridgy-original-source'])
            self.assertEquals(resp.unsent, props['bridgy-target'])

        # check invite
        invite = hcard['children'][-1]['properties']
        self.assertIn('Ms. Guest is invited.', response.body)
        self.assertNotIn('Mrs. Host is invited.', response.body)

        publish = hcard['children'][len(expected_resps)]
        self.assertIn('h-bridgy-publish', publish['type'])
        props = publish['properties']
        self.assertEquals([self.publishes[0].key.parent().id()], props['url'])
        self.assertEquals([self.publishes[0].status], props['bridgy-status'])
Example 15
  def fetch_mf2(self, url):
    """Fetches a URL and extracts its mf2 data.

    Side effects: sets :attr:`entity`\ .html on success, calls :attr:`error()`
    on errors.

    Args:
      url: string

    Returns:
      (:class:`requests.Response`, mf2 data dict) on success, None on failure
    """
    try:
      fetched = util.requests_get(url)
      fetched.raise_for_status()
    except BaseException as e:
      util.interpret_http_exception(e)  # log exception
      return self.error('Could not fetch source URL %s' % url)

    if self.entity:
      self.entity.html = fetched.text

    # .text is decoded unicode string, .content is raw bytes. if the HTTP
    # headers didn't specify a charset, pass raw bytes to BeautifulSoup so it
    # can look for a <meta> tag with a charset and decode.
    text = (fetched.text if 'charset' in fetched.headers.get('content-type', '')
            else fetched.content)
    doc = util.beautifulsoup_parse(text)

    # parse microformats, convert to ActivityStreams
    data = util.mf2py_parse(doc, fetched.url)

    # special case tumblr's markup: div#content > div.post > div.copy
    # convert to mf2 and re-parse
    if not data.get('items'):
      contents = doc.find_all(id='content')
      if contents:
        post = contents[0].find_next(class_='post')
        if post:
          post['class'] = 'h-entry'
          copy = post.find_next(class_='copy')
          if copy:
            copy['class'] = 'e-content'
          photo = post.find_next(class_='photo-wrapper')
          if photo:
            img = photo.find_next('img')
            if img:
              img['class'] = 'u-photo'
          doc = unicode(post)
          data = util.mf2py_parse(doc, fetched.url)

    logging.debug('Parsed microformats2: %s', json.dumps(data, indent=2))
    items = data.get('items', [])
    if not items or not items[0]:
      return self.error('No microformats2 data found in ' + fetched.url,
                        data=data, html="""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="%s">%s</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""" % (fetched.url, util.pretty_link(fetched.url)))

    return fetched, data
Example 16
def process_entry(source, permalink, feed_entry, refetch, preexisting,
                  store_blanks=True):
  """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`.

  Args:
    source:
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: list of previously discovered :class:`models.SyndicatedPost`\ s
      for this permalink
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # if we're refetching and this one is blank, do not return.
    # if there is a blank entry, it should be the one and only entry,
    # but go ahead and check 'all' of them to be safe.
    if not refetch:
      return {}
    synds = [s.syndication for s in preexisting if s.syndication]
    if synds:
      logging.debug('previously found relationship(s) for original %s: %s',
                    permalink, synds)

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  permalink, _, type_ok = util.get_webmention_target(permalink)
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  if usynd:
    logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
  results = _process_syndication_urls(source, permalink, set(
    url for url in usynd if isinstance(url, basestring)), preexisting)
  success = True

  if results:
    source.updates['last_feed_syndication_url'] = util.now_fn()
  elif not source.last_feed_syndication_url or not feed_entry:
    # fetch the full permalink page if we think it might have more details
    parsed = None
    try:
      logging.debug('fetching post permalink %s', permalink)
      if type_ok:
        resp = util.requests_get(permalink)
        resp.raise_for_status()
        parsed = util.mf2py_parse(resp.text, permalink)
    except AssertionError:
      raise  # for unit tests
    except BaseException:
      # TODO limit the number of allowed failures
      logging.info('Could not fetch permalink %s', permalink, exc_info=True)
      success = False

    if parsed:
      syndication_urls = set()
      relsynd = parsed.get('rels', {}).get('syndication', [])
      if relsynd:
        logging.debug('rel-syndication links: %s', relsynd)
      syndication_urls.update(url for url in relsynd
                              if isinstance(url, basestring))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case.
      for hentry in (item for item in parsed['items']
                     if 'h-entry' in item['type']):
        usynd = hentry.get('properties', {}).get('syndication', [])
        if usynd:
          logging.debug('u-syndication links: %s', usynd)
        syndication_urls.update(url for url in usynd
                                if isinstance(url, basestring))
      results = _process_syndication_urls(
        source, permalink, syndication_urls, preexisting)

  # detect and delete SyndicatedPosts that were removed from the site
  if success:
    # list, not iterator: membership is checked repeatedly in the loop below
    result_syndposts = list(itertools.chain(*results.values()))
    for syndpost in list(preexisting):
      if syndpost.syndication and syndpost not in result_syndposts:
        logging.info('deleting relationship that disappeared: %s', syndpost)
        syndpost.key.delete()
        preexisting.remove(syndpost)

  if not results:
    logging.debug('no syndication links from %s to current source %s.',
                  permalink, source.label())
    results = {}
    if store_blanks and not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logging.debug('saving empty relationship so that %s will not be '
                    'searched again', permalink)
      SyndicatedPost.insert_original_blank(source, permalink)

  # only return results that are not in the preexisting list
  new_results = {}
  for syndurl, syndposts_for_url in results.iteritems():
    for syndpost in syndposts_for_url:
      if syndpost not in preexisting:
        new_results.setdefault(syndurl, []).append(syndpost)

  if new_results:
    logging.debug('discovered relationships %s', new_results)
  return new_results
Example 17
  def test_social_user_page_mf2(self):
    """Check the custom mf2 we render on social user pages."""
    self.sources[0].features = ['listen', 'publish']
    self.sources[0].put()

    # test invite with missing object and content
    resp = json.loads(self.responses[8].response_json)
    resp['verb'] = 'invite'
    resp.pop('object', None)
    resp.pop('content', None)
    self.responses[8].response_json = json.dumps(resp)

    # test that invites render the invitee, not the inviter
    # https://github.com/snarfed/bridgy/issues/754
    self.responses[9].type = 'rsvp'
    self.responses[9].response_json = json.dumps({
      'id': 'tag:fa.ke,2013:111',
      'objectType': 'activity',
      'verb': 'invite',
      'url': 'http://fa.ke/event',
      'actor': {
        'displayName': 'Mrs. Host',
        'url': 'http://fa.ke/host',
      },
      'object': {
        'objectType': 'person',
        'displayName': 'Ms. Guest',
        'url': 'http://fa.ke/guest',
      },
    })

    for entity in self.responses + self.publishes + self.blogposts:
      entity.put()

    user_url = self.sources[0].bridgy_path()
    response = app.application.get_response(user_url)
    self.assertEquals(200, response.status_int)

    parsed = util.mf2py_parse(response.body, user_url)
    hcard = parsed.get('items', [])[0]
    self.assertEquals(['h-card'], hcard['type'])
    self.assertEquals(
      ['Fake User'], hcard['properties'].get('name'))
    self.assertEquals(
      ['http://fa.ke/profile/url'], hcard['properties'].get('url'))
    self.assertEquals(
      ['enabled'], hcard['properties'].get('bridgy-account-status'))
    self.assertEquals(
      ['enabled'], hcard['properties'].get('bridgy-listen-status'))
    self.assertEquals(
      ['enabled'], hcard['properties'].get('bridgy-publish-status'))

    expected_resps = self.responses[:10]
    for item, resp in zip(hcard['children'], expected_resps):
      self.assertIn('h-bridgy-response', item['type'])
      props = item['properties']
      self.assertEquals([resp.status], props['bridgy-status'])
      self.assertEquals([json.loads(resp.activities_json[0])['url']],
                        props['bridgy-original-source'])
      self.assertEquals(resp.unsent, props['bridgy-target'])

    # check invite
    html = response.body.decode('utf-8')
    self.assertIn('Ms. Guest is invited.', html)
    self.assertNotIn('Mrs. Host is invited.', html)

    publish = hcard['children'][len(expected_resps)]
    self.assertIn('h-bridgy-publish', publish['type'])
    props = publish['properties']
    self.assertEquals([self.publishes[0].key.parent().id()], props['url'])
    self.assertEquals([self.publishes[0].status], props['bridgy-status'])