def new(handler, auth_entity=None, **kwargs):
  """Creates and returns a WordPress for the logged in user.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.wordpress.WordPressAuth
  """
  auth_domain = auth_entity.key.id()
  site_info = WordPress.get_site_info(handler, auth_entity)
  if site_info is None:
    return

  urls = util.dedupe_urls(util.trim_nulls(
    [site_info.get('URL'), auth_entity.blog_url]))
  domains = [util.domain_from_link(u) for u in urls]

  avatar = (json.loads(auth_entity.user_json).get('avatar_URL')
            if auth_entity.user_json else None)
  return WordPress(id=domains[0],
                   auth_entity=auth_entity.key,
                   name=auth_entity.user_display_name(),
                   picture=avatar,
                   superfeedr_secret=util.generate_secret(),
                   url=urls[0],
                   domain_urls=urls,
                   domains=domains,
                   site_info=site_info,
                   **kwargs)
def _url_and_domain(self, auth_entity):
  """Returns this source's URL and domain.

  Uses the auth entity user_json 'url' field by default. May be overridden
  by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth

  Returns:
    (string url, string domain, boolean ok) tuple
  """
  user_json = json.loads(auth_entity.user_json)
  actor = self.as_source.user_to_actor(user_json)
  urls = util.trim_nulls([actor.get('url')] +
                         # also look at G+'s urls field
                         [u.get('value') for u in user_json.get('urls', [])])

  first_url = first_domain = None
  for url in urls:
    # TODO: fully support multiple urls
    for url in url.split():
      url, domain, ok = util.get_webmention_target(url)
      if ok:
        domain = domain.lower()
        return url, domain, True
      elif not first_url:
        first_url = url
        first_domain = domain

  return first_url, first_domain, False
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.warning('Too many profile links! Only resolving the first %s: %s',
                    MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    url, domain, send = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
    if send:
      urls.append(url)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def get_or_save(self, source, restart=False):
  resp = super(Response, self).get_or_save()

  if (self.type != resp.type or
      source.gr_source.activity_changed(json_loads(resp.response_json),
                                        json_loads(self.response_json),
                                        log=True)):
    logging.info('Response changed! Re-propagating. Original: %s' % resp)

    resp.old_response_jsons = resp.old_response_jsons[:10] + [resp.response_json]

    response_json_to_append = json_loads(self.response_json)
    source.gr_source.append_in_reply_to(json_loads(resp.response_json),
                                        response_json_to_append)
    self.response_json = json_dumps(util.trim_nulls(response_json_to_append))
    resp.response_json = self.response_json
    resp.restart(source)
  elif restart and resp is not self:  # ie it already existed
    resp.restart(source)

  return resp
def tweet_to_activity(self, tweet):
  """Converts a tweet to an activity.

  Args:
    tweet: dict, a decoded JSON tweet

  Returns:
    an ActivityStreams activity dict, ready to be JSON-encoded
  """
  object = self.tweet_to_object(tweet)
  activity = {
    'verb': 'post',
    'published': object.get('published'),
    'id': object.get('id'),
    'url': object.get('url'),
    'actor': object.get('author'),
    'object': object,
    }

  # yes, the source field has an embedded HTML link. bleh.
  # https://dev.twitter.com/docs/api/1/get/statuses/show/
  parsed = re.search('<a href="([^"]+)".*>(.+)</a>', tweet.get('source', ''))
  if parsed:
    url, name = parsed.groups()
    activity['generator'] = {'displayName': name, 'url': url}

  return util.trim_nulls(activity)
def process_webmention_links(self, e):
  """Generates pretty HTML for the links in a :class:`Webmentions` entity.

  Args:
    e: :class:`Webmentions` subclass (:class:`Response` or :class:`BlogPost`)
  """
  link = lambda url, g: util.pretty_link(
    url, glyphicon=g, attrs={'class': 'original-post u-bridgy-target'},
    new_tab=True)
  return util.trim_nulls({
    'Failed': set(link(url, 'exclamation-sign') for url in e.error + e.failed),
    'Sending': set(link(url, 'transfer') for url in e.unsent
                   if url not in e.error),
    'Sent': set(link(url, None) for url in e.sent
                if url not in (e.error + e.unsent)),
    'No <a href="http://indiewebify.me/#send-webmentions">webmention</a> '
    'support': set(link(url, None) for url in e.skipped),
  })
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  urls = []
  for url in util.trim_nulls(util.uniquify(
      [user_url] + [actor.get('url')] +
      [u.get('value') for u in actor.get('urls', [])])):
    domain = util.domain_from_link(url)
    if domain and not util.in_webmention_blacklist(domain.lower()):
      urls.append(url)

  urls = util.dedupe_urls(urls)
  domains = [util.domain_from_link(url).lower() for url in urls]
  return urls, domains
def post_to_activity(self, post):
  """Converts a post to an activity.

  Args:
    post: dict, a decoded JSON post

  Returns:
    an ActivityStreams activity dict, ready to be JSON-encoded
  """
  object = self.post_to_object(post)
  activity = {
    'verb': 'post',
    'published': object.get('published'),
    'updated': object.get('updated'),
    'id': object.get('id'),
    'url': object.get('url'),
    'actor': object.get('author'),
    'object': object,
    }

  application = post.get('application')
  if application:
    activity['generator'] = {
      'displayName': application.get('name'),
      'id': self.tag_uri(application.get('id')),
      }
  return util.trim_nulls(activity)
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.warning('Too many profile links! Only resolving the first %s: %s',
                    MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    url, domain, send = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
    if send:
      urls.append(url)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def finish(self, auth_entity, state=None):
  if 'target_url' in self.decode_state_parameter(state):
    # this is an interactive publish
    return self.redirect(util.add_query_params(
      '/publish/instagram/finish',
      util.trim_nulls({'auth_entity': auth_entity.key.urlsafe(),
                       'state': state})))

  self.maybe_add_or_delete_source(Instagram, auth_entity, state)
def finish(self, auth_entity, state=None):
  if auth_entity:
    user_json = json.loads(auth_entity.user_json)

    # find instagram profile URL
    urls = user_json.get('rel-me', [])
    logging.info('rel-mes: %s', urls)
    for url in util.trim_nulls(urls):
      if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
        username = urllib.parse.urlparse(url).path.strip('/')
        break
    else:
      self.messages.add(
        'No Instagram profile found. Please <a href="https://indieauth.com/setup">add an Instagram rel-me link</a>, then try again.')
      return self.redirect('/')

    # check that instagram profile links to web site
    try:
      actor = gr_instagram.Instagram(scrape=True).get_actor(
        username, ignore_rate_limit=True)
    except Exception as e:
      code, _ = util.interpret_http_exception(e)
      if code in Instagram.RATE_LIMIT_HTTP_CODES:
        self.messages.add(
          '<a href="https://github.com/snarfed/bridgy/issues/665#issuecomment-524977427">Apologies, Instagram is temporarily blocking us.</a> Please try again later!')
        return self.redirect('/')
      else:
        raise

    if not actor:
      self.messages.add(
        "Couldn't find Instagram user '%s'. Please check your site's rel-me link and your Instagram account." % username)
      return self.redirect('/')

    canonicalize = util.UrlCanonicalizer(redirects=False)
    website = canonicalize(auth_entity.key.id())
    urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
    logging.info('Looking for %s in %s', website, urls)
    if website not in urls:
      self.messages.add(
        "Please add %s to your Instagram profile's website or bio field and try again." % website)
      return self.redirect('/')

    # check that the instagram account is public
    if not gr_source.Source.is_public(actor):
      self.messages.add(
        'Your Instagram account is private. Bridgy only supports public accounts.')
      return self.redirect('/')

    self.maybe_add_or_delete_source(Instagram, auth_entity, state, actor=actor)
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    final, domain, ok = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
    if ok:
      final = final.lower()
      if util.schemeless(final).startswith(util.schemeless(url.lower())):
        # redirected to a deeper path. use the original higher level URL. #652
        final = url
      # If final has a path segment check if root has a matching rel=me.
      match = re.match(r'^(https?://[^/]+)/.+', final)
      if match and i < MAX_AUTHOR_URLS:
        root = match.group(1)
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      urls.append(final)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def urls_and_domains(self, auth_entity, user_url, actor=None,
                     resolve_source_domain=True):
  """Returns this user's valid (not webmention-blocklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing
    actor: dict, optional AS actor for the user. If provided, overrides
      auth_entity
    resolve_source_domain: boolean, whether to follow redirects on URLs on
      this source's domain

  Returns:
    ([string url, ...], [string domain, ...])
  """
  if not actor:
    actor = self.gr_source.user_to_actor(json_loads(auth_entity.user_json))
  logger.debug(f'Extracting URLs and domains from actor: {json_dumps(actor, indent=2)}')

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logger.info(f'Too many profile links! Only resolving the first {MAX_AUTHOR_URLS}: {candidates}')

  urls = []
  for i, url in enumerate(candidates):
    on_source_domain = util.domain_from_link(url) == self.gr_source.DOMAIN
    resolve = ((resolve_source_domain or not on_source_domain)
               and i < MAX_AUTHOR_URLS)
    resolved = self.resolve_profile_url(url, resolve=resolve)
    if resolved:
      urls.append(resolved)

  final_urls = []
  domains = []
  for url in util.dedupe_urls(urls):  # normalizes domains to lower case
    # skip links on this source's domain itself. only currently needed for
    # Mastodon; the other silo domains are in the webmention blocklist.
    domain = util.domain_from_link(url)
    if domain != self.gr_source.DOMAIN:
      final_urls.append(url)
      domains.append(domain)

  return final_urls, domains
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  user = json_loads(auth_entity.user_json)
  actor = (user.get('actor')  # for Instagram; its user_json is IndieAuth
           or self.gr_source.user_to_actor(user))
  logging.debug('Extracting URLs and domains from actor: %s',
                json_dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    resolved = self.resolve_profile_url(url, resolve=i < MAX_AUTHOR_URLS)
    if resolved:
      urls.append(resolved)

  final_urls = []
  domains = []
  for url in util.dedupe_urls(urls):  # normalizes domains to lower case
    # skip links on this source's domain itself. only currently needed for
    # Mastodon; the other silo domains are in the webmention blacklist.
    domain = util.domain_from_link(url)
    if domain != self.gr_source.DOMAIN:
      final_urls.append(url)
      domains.append(domain)

  return final_urls, domains
def process_webmention_links(self, e):
  """Generates pretty HTML for the links in a BlogWebmention entity.

  Args:
    e: BlogWebmention subclass (Response or BlogPost)
  """
  link = lambda url, g: util.pretty_link(
    url, glyphicon=g, attrs={'class': 'original-post u-bridgy-target'},
    new_tab=True)
  return util.trim_nulls({
    'Failed': set(link(url, 'exclamation-sign') for url in e.error + e.failed),
    'Sending': set(link(url, 'transfer') for url in e.unsent
                   if url not in e.error),
    'Sent': set(link(url, None) for url in e.sent
                if url not in (e.error + e.unsent)),
    'No <a href="http://indiewebify.me/#send-webmentions">webmention</a> '
    'support': set(link(url, None) for url in e.skipped),
  })
def finish(self, auth_entity, state=None):
  if auth_entity:
    user_json = json.loads(auth_entity.user_json)

    # find instagram profile URL
    urls = user_json.get('rel-me', [])
    logging.info('rel-mes: %s', urls)
    for url in util.trim_nulls(urls):
      if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
        username = urlparse.urlparse(url).path.strip('/')
        break
    else:
      self.messages.add(
        'No Instagram profile found. Please <a href="https://indieauth.com/setup">'
        'add an Instagram rel-me link</a>, then try again.')
      return self.redirect('/')

    # check that instagram profile links to web site
    actor = gr_instagram.Instagram(scrape=True).get_actor(
      username, ignore_rate_limit=True)
    if not actor:
      self.messages.add(
        "Couldn't find Instagram user '%s'. Please check your site's rel-me "
        "link and your Instagram account." % username)
      return self.redirect('/')

    canonicalize = util.UrlCanonicalizer(redirects=False)
    website = canonicalize(auth_entity.key.id())
    urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
    logging.info('Looking for %s in %s', website, urls)
    if website not in urls:
      self.messages.add(
        "Please add %s to your Instagram profile's website or "
        'bio field and try again.' % website)
      return self.redirect('/')

    # check that the instagram account is public
    if not gr_source.Source.is_public(actor):
      self.messages.add('Your Instagram account is private. '
                        'Bridgy only supports public accounts.')
      return self.redirect('/')

    self.maybe_add_or_delete_source(Instagram, auth_entity, state, actor=actor)
def canonicalize_url(self, url, activity=None, **kwargs):
  """Facebook-specific standardization of syndicated urls.

  Canonical form is https://www.facebook.com/USERID/posts/POSTID

  Args:
    url: a string, the url of the syndicated content
    activity: the activity this URL came from. If it has an fb_object_id,
      we'll use that instead of fetching the post from Facebook
    kwargs: unused

  Return:
    a string, the canonical form of the syndication url
  """
  if util.domain_from_link(url) != self.gr_source.DOMAIN:
    return None

  def post_url(id):
    return 'https://www.facebook.com/%s/posts/%s' % (self.key.id(), id)

  parsed = urllib.parse.urlparse(url)
  params = urllib.parse.parse_qs(parsed.query)
  path = parsed.path.strip('/').split('/')
  url_id = self.gr_source.post_id(url)
  ids = params.get('story_fbid') or params.get('fbid')

  if ids:
    url = post_url(ids[0])
  elif url_id:
    if path and path[0] == 'notes':
      url = post_url(url_id)
    else:
      object_id = self.cached_resolve_object_id(url_id, activity=activity)
      if object_id:
        url = post_url(object_id)
      elif path and len(path) > 1 and path[1] == 'posts':
        url = post_url(url_id)

  for alternate_id in util.trim_nulls(itertools.chain(
      (self.username or self.inferred_username,), self.inferred_user_ids)):
    url = url.replace('facebook.com/%s/' % alternate_id,
                      'facebook.com/%s/' % self.key.id())

  return super(FacebookPage, self).canonicalize_url(url)
def tweet_to_object(self, tweet):
  """Converts a tweet to an object.

  Args:
    tweet: dict, a decoded JSON tweet

  Returns:
    an ActivityStreams object dict, ready to be JSON-encoded
  """
  object = {}

  id = tweet.get('id')
  if not id:
    return {}

  object = {
    'objectType': 'note',
    'published': self.rfc2822_to_iso8601(tweet.get('created_at')),
    'content': tweet.get('text'),
    }

  user = tweet.get('user')
  if user:
    object['author'] = self.user_to_actor(user)
    username = object['author'].get('username')
    if username:
      object['id'] = self.tag_uri(id)
      object['url'] = 'http://twitter.com/%s/status/%d' % (username, id)

  # currently the media list will only have photos. if that changes, though,
  # we'll need to make this conditional on media.type.
  # https://dev.twitter.com/docs/tweet-entities
  media_url = tweet.get('entities', {}).get('media', [{}])[0].get('media_url')
  if media_url:
    object['image'] = {'url': media_url}

  place = tweet.get('place')
  if place:
    object['location'] = {
      'displayName': place.get('full_name'),
      'id': place.get('id'),
      'url': place.get('url'),
      }

  return util.trim_nulls(object)
def finish(self, auth_entity, state=None):
  if auth_entity:
    user_json = json.loads(auth_entity.user_json)

    # find instagram profile URL
    urls = user_json.get('rel-me', [])
    logging.info('rel-mes: %s', urls)
    for url in util.trim_nulls(urls):
      if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
        username = urlparse.urlparse(url).path.strip('/')
        break
    else:
      self.messages.add(
        'No Instagram profile found. Please <a href="https://indieauth.com/setup">'
        'add an Instagram rel-me link</a>, then try again.')
      return self.redirect_home_or_user_page(state)

    # check that instagram profile links to web site
    actor = gr_instagram.Instagram(scrape=True).get_actor(username)
    if not actor:
      self.messages.add(
        "Couldn't find Instagram user '%s'. Please check your site's rel-me "
        "link and your Instagram account." % username)
      return self.redirect_home_or_user_page(state)

    canonicalize = util.UrlCanonicalizer(redirects=False)
    website = canonicalize(auth_entity.key.id())
    urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
    logging.info('Looking for %s in %s', website, urls)
    if website not in urls:
      self.messages.add("Please add %s to your Instagram profile's website or "
                        'bio field and try again.' % website)
      return self.redirect_home_or_user_page(state)

    # check that the instagram account is public
    if not gr_source.Source.is_public(actor):
      self.messages.add('Your Instagram account is private. '
                        'Bridgy only supports public accounts.')
      return self.redirect_home_or_user_page(state)

    source = self.maybe_add_or_delete_source(Instagram, auth_entity, state,
                                             actor=actor)
def post_to_object(self, post):
  """Converts a post to an object.

  Args:
    post: dict, a decoded JSON post

  Returns:
    an ActivityStreams object dict, ready to be JSON-encoded
  """
  object = {}

  id = post.get('id')
  if not id:
    return {}

  object = {
    'id': self.tag_uri(str(id)),
    'objectType': 'note',
    'published': post.get('created_time'),
    'updated': post.get('updated_time'),
    'content': post.get('message'),
    'author': self.user_to_actor(post.get('from')),
    # FB post ids are of the form USERID_POSTID
    'url': 'http://facebook.com/' + id.replace('_', '/posts/'),
    'image': {'url': post.get('picture')},
    }

  place = post.get('place')
  if place:
    object['location'] = {
      'displayName': place.get('name'),
      'id': place.get('id'),
      }
    location = place.get('location', {})
    lat = location.get('latitude')
    lon = location.get('longitude')
    if lat and lon:
      # ISO 6709 location string. details: http://en.wikipedia.org/wiki/ISO_6709
      object['location']['position'] = '%+f%+f/' % (lat, lon)

  return util.trim_nulls(object)
def canonicalize_url(self, url, activity=None, **kwargs):
  """Facebook-specific standardization of syndicated urls.

  Canonical form is https://www.facebook.com/USERID/posts/POSTID

  Args:
    url: a string, the url of the syndicated content
    activity: the activity this URL came from. If it has an fb_object_id,
      we'll use that instead of fetching the post from Facebook
    kwargs: unused

  Return:
    a string, the canonical form of the syndication url
  """
  if util.domain_from_link(url) != self.gr_source.DOMAIN:
    return None

  def post_url(id):
    return 'https://www.facebook.com/%s/posts/%s' % (self.key.id(), id)

  parsed = urlparse.urlparse(url)
  params = urlparse.parse_qs(parsed.query)
  url_id = self.gr_source.post_id(url)
  ids = params.get('story_fbid') or params.get('fbid')

  if ids:
    url = post_url(ids[0])
  elif url_id:
    if parsed.path.startswith('/notes/'):
      url = post_url(url_id)
    else:
      object_id = self.cached_resolve_object_id(url_id, activity=activity)
      if object_id:
        url = post_url(object_id)

  for alternate_id in util.trim_nulls(itertools.chain(
      (self.username or self.inferred_username,), self.inferred_user_ids)):
    url = url.replace('facebook.com/%s/' % alternate_id,
                      'facebook.com/%s/' % self.key.id())

  return super(FacebookPage, self).canonicalize_url(url)
def user_to_actor(self, user):
  """Converts a Twitter user to an ActivityStreams actor.

  Args:
    user: dict, a decoded JSON Twitter user

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  username = user.get('screen_name')
  if not username:
    return {}

  return util.trim_nulls({
    'displayName': user.get('name'),
    'image': {'url': user.get('profile_image_url')},
    'id': self.tag_uri(username) if username else None,
    'published': self.rfc2822_to_iso8601(user.get('created_at')),
    'url': 'http://twitter.com/%s' % username,
    'location': {'displayName': user.get('location')},
    'username': username,
    'description': user.get('description'),
    })
def user_to_actor(self, user):
  """Converts a user to an actor.

  Args:
    user: dict, a decoded JSON Facebook user

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  if not user:
    return {}

  id = user.get('id')
  username = user.get('username')
  handle = username or id
  if not handle:
    return {}

  # facebook implements this as a 302 redirect
  image_url = 'http://graph.facebook.com/%s/picture?type=large' % handle
  actor = {
    'displayName': user.get('name'),
    'image': {'url': image_url},
    'id': self.tag_uri(handle),
    'updated': user.get('updated_time'),
    'url': user.get('link'),
    'username': username,
    'description': user.get('bio'),
    }

  location = user.get('location')
  if location:
    actor['location'] = {'id': location.get('id'),
                         'displayName': location.get('name')}

  return util.trim_nulls(actor)
def test_trim_nulls(self):
  # basic
  self.assertEqual(None, util.trim_nulls(None))
  self.assertEqual('foo', util.trim_nulls('foo'))
  self.assertEqual([], util.trim_nulls([]))
  self.assertEqual({}, util.trim_nulls({}))
  self.assertEqual(set(), util.trim_nulls(set()))
  self.assertEqual((), util.trim_nulls(()))
  self.assertEqual({1: 0}, util.trim_nulls({1: 0}))  # numeric zero

  # lists
  self.assertEqual([{'xyz': 3}], util.trim_nulls([{'abc': None, 'xyz': 3}]))
  self.assertEqual({'a': ['b'], 'd': ['e']}, util.trim_nulls(
    {'a': ['b'], 'c': [None], 'd': [None, 'e', None], 'f': [[{}], {'a': []}]}))
  self.assertEqual({}, util.trim_nulls({1: None, 2: [], 3: {}, 4: set(),
                                        5: frozenset()}))

  # sets
  self.assertEqual(set((1, 2)), util.trim_nulls(set((1, None, 2))))
  self.assertEqual({'a': set(['b']), 'd': set(['e'])}, util.trim_nulls(
    {'a': set(['b']), 'c': set([None]), 'd': set([None, 'e', None])}))
  self.assertEqual(set(), util.trim_nulls(set((None,))))

  # dicts
  self.assertEqual({1: 2, 3: 4}, util.trim_nulls({1: 2, 3: 4}))
  self.assertEqual({3: 4, 2: 9}, util.trim_nulls({1: None, 3: 4, 5: [], 2: 9}))
  self.assertEqual({1: {3: 4}}, util.trim_nulls({1: {2: [], 3: 4}, 5: {6: None}}))

  # iterator and generator
  self.assertEqual(['a', 'b'], list(util.trim_nulls(iter(['a', None, 'b']))))
  self.assertEqual(['a', 'b'], list(util.trim_nulls(x for x in ['a', None, 'b'])))
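For reference, the assertions above pin down trim_nulls()'s contract: recursively drop None and empty values from nested dicts, lists, tuples, sets, and iterators, while keeping falsy-but-meaningful values like numeric zero. Below is a minimal sketch written only from those assertions; the real helper lives in the webutil util module used by these projects, and its implementation may differ in details, so treat this as illustrative rather than the actual code.

def trim_nulls(value):
  """Sketch: recursively removes entries whose trimmed value is None or an
  empty container/string. Numeric zero and False are kept; scalars and None
  pass through unchanged. Assumed semantics, inferred from the tests above."""
  def is_null(v):
    # assumption: "null" means None or an empty string/container
    return v is None or v in ('', [], (), {}, set(), frozenset())

  if isinstance(value, dict):
    trimmed = {k: trim_nulls(v) for k, v in value.items()}
    return {k: v for k, v in trimmed.items() if not is_null(v)}
  elif isinstance(value, (list, tuple, set, frozenset)):
    trimmed = (trim_nulls(v) for v in value)
    return type(value)(v for v in trimmed if not is_null(v))
  elif hasattr(value, '__iter__') and not isinstance(value, (str, bytes)):
    # plain iterators and generators: trim lazily, preserving iterator-ness,
    # which is why the tests wrap these results in list()
    return (v for v in (trim_nulls(x) for x in value) if not is_null(v))
  return value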
def backfeed(self, source, responses=None, activities=None):
  """Processes responses and activities and generates propagate tasks.

  Stores property names and values to update in source.updates.

  Args:
    source: Source
    responses: dict mapping AS response id to AS object
    activities: dict mapping AS activity id to AS object
  """
  if responses is None:
    responses = {}
  if activities is None:
    activities = {}

  # Cache to make sure we only fetch the author's h-feed(s) the
  # first time we see it
  fetched_hfeeds = set()

  # narrow down to just public activities
  public = {}
  private = {}
  for id, activity in activities.items():
    (public if source.is_activity_public(activity) else private)[id] = activity
  logging.info('Found %d public activities: %s', len(public), public.keys())
  logging.info('Found %d private activities: %s', len(private), private.keys())

  last_public_post = (source.last_public_post or util.EPOCH).isoformat()
  public_published = util.trim_nulls([a.get('published') for a in public.values()])
  if public_published:
    max_published = max(public_published)
    if max_published > last_public_post:
      last_public_post = max_published
      source.updates['last_public_post'] = \
        util.as_utc(util.parse_iso8601(max_published))

  source.updates['recent_private_posts'] = \
    len([a for a in private.values()
         if a.get('published', util.EPOCH_ISO) > last_public_post])

  #
  # Step 2: extract responses, store their activities in response['activities']
  #
  # WARNING: this creates circular references in link posts found by search
  # queries in step 1, since they are their own activity. We use
  # prune_activity() and prune_response() in step 4 to remove these before
  # serializing to JSON.
  #
  for id, activity in public.items():
    obj = activity.get('object') or activity

    # handle user mentions
    user_id = source.user_tag_id()
    if obj.get('author', {}).get('id') != user_id:
      for tag in obj.get('tags', []):
        urls = tag.get('urls')
        if tag.get('objectType') == 'person' and tag.get('id') == user_id and urls:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)
          activity['mentions'].update(u.get('value') for u in urls)
          responses[id] = activity
          break

    # handle quote mentions
    for att in obj.get('attachments', []):
      if (att.get('objectType') in ('note', 'article') and
          att.get('author', {}).get('id') == source.user_tag_id()):
        # now that we've confirmed that one exists, OPD will dig
        # into the actual attachments
        if 'originals' not in activity or 'mentions' not in activity:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)
        responses[id] = activity
        break

    # extract replies, likes, reactions, reposts, and rsvps
    replies = obj.get('replies', {}).get('items', [])
    tags = obj.get('tags', [])
    likes = [t for t in tags if Response.get_type(t) == 'like']
    reactions = [t for t in tags if Response.get_type(t) == 'react']
    reposts = [t for t in tags if Response.get_type(t) == 'repost']
    rsvps = Source.get_rsvps_from_event(obj)

    # coalesce responses. drop any without ids
    for resp in replies + likes + reactions + reposts + rsvps:
      id = resp.get('id')
      if not id:
        logging.error('Skipping response without id: %s',
                      json.dumps(resp, indent=2))
        continue

      if source.is_blocked(resp):
        logging.info('Skipping response by blocked user: %s',
                     json.dumps(resp.get('author') or resp.get('actor'), indent=2))
        continue

      resp.setdefault('activities', []).append(activity)

      # when we find two responses with the same id, the earlier one may have
      # come from a link post or user mention, and this one is probably better
      # since it probably came from the user's activity, so prefer this one.
      # background: https://github.com/snarfed/bridgy/issues/533
      existing = responses.get(id)
      if existing:
        if source.gr_source.activity_changed(resp, existing, log=True):
          logging.warning('Got two different versions of same response!\n%s\n%s',
                          existing, resp)
        resp['activities'].extend(existing.get('activities', []))

      responses[id] = resp

  #
  # Step 3: filter out responses we've already seen
  #
  # seen responses (JSON objects) for each source are stored in its entity.
  unchanged_responses = []
  if source.seen_responses_cache_json:
    for seen in json.loads(source.seen_responses_cache_json):
      id = seen['id']
      resp = responses.get(id)
      if resp and not source.gr_source.activity_changed(seen, resp, log=True):
        unchanged_responses.append(seen)
        del responses[id]

  #
  # Step 4: store new responses and enqueue propagate tasks
  #
  pruned_responses = []
  for id, resp in responses.items():
    resp_type = Response.get_type(resp)
    activities = resp.pop('activities', [])
    if not activities and resp_type == 'post':
      activities = [resp]
    too_long = set()
    urls_to_activity = {}
    for i, activity in enumerate(activities):
      # we'll usually have multiple responses for the same activity, and the
      # objects in resp['activities'] are shared, so cache each activity's
      # discovered webmention targets inside its object.
      if 'originals' not in activity or 'mentions' not in activity:
        activity['originals'], activity['mentions'] = \
          original_post_discovery.discover(
            source, activity, fetch_hfeed=True,
            include_redirect_sources=False,
            already_fetched_hfeeds=fetched_hfeeds)

      targets = original_post_discovery.targets_for_response(
        resp, originals=activity['originals'], mentions=activity['mentions'])
      if targets:
        logging.info('%s has %d webmention target(s): %s', activity.get('url'),
                     len(targets), ' '.join(targets))
      for t in targets:
        if len(t) <= _MAX_STRING_LENGTH:
          urls_to_activity[t] = i
        else:
          logging.info('Giving up on target URL over %s chars! %s',
                       _MAX_STRING_LENGTH, t)
          too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

    # store/update response entity. the prune_*() calls are important to
    # remove circular references in link responses, which are their own
    # activities. details in the step 2 comment above.
    pruned_response = util.prune_response(resp)
    pruned_responses.append(pruned_response)
    resp_entity = Response(
      id=id,
      source=source.key,
      activities_json=[json.dumps(util.prune_activity(a, source))
                       for a in activities],
      response_json=json.dumps(pruned_response),
      type=resp_type,
      unsent=list(urls_to_activity.keys()),
      failed=list(too_long),
      original_posts=resp.get('originals', []))
    if urls_to_activity and len(activities) > 1:
      resp_entity.urls_to_activity = json.dumps(urls_to_activity)
    resp_entity.get_or_save(source, restart=self.RESTART_EXISTING_TASKS)

  # update cache
  if pruned_responses:
    source.updates['seen_responses_cache_json'] = json.dumps(
      pruned_responses + unchanged_responses)
def get_activities_response(self, **kwargs):
  # TODO: use batch API to get photos, events, etc in one request
  # https://developers.facebook.com/docs/graph-api/making-multiple-requests
  try:
    resp = self.gr_source.get_activities_response(group_id=SELF, **kwargs)

    # if it's requesting one specific activity, then we're done
    if 'activity_id' in kwargs:
      return resp

    # also get uploaded photos manually since facebook sometimes collapses
    # multiple photos into albums, and the album post object won't have the
    # post content, comments, etc. from the individual photo posts.
    # http://stackoverflow.com/questions/12785120
    #
    # TODO: save and use ETag for all of these extra calls
    photos = self.get_data(API_PHOTOS)

    # also get events and RSVPs
    # https://developers.facebook.com/docs/graph-api/reference/user/events/
    # https://developers.facebook.com/docs/graph-api/reference/event#edges
    # TODO: also fetch and use API_USER_RSVPS_DECLINED
    user_rsvps = self.get_data(API_USER_RSVPS)

    # have to re-fetch the events because the user rsvps response doesn't
    # include the event description, which we need for original post links.
    events = [self.gr_source.urlopen(API_EVENT % r['id'])
              for r in user_rsvps if r.get('id')]

    # also, only process events that the user is the owner of. avoids (but
    # doesn't prevent) processing big non-indieweb events with tons of
    # attendees that put us over app engine's instance memory limit. details:
    # https://github.com/snarfed/bridgy/issues/77
    events_and_rsvps = [(e, self.get_data(API_EVENT_RSVPS % e['id']))
                        for e in events
                        if e.get('owner', {}).get('id') == self.key.id()]

  except urllib2.HTTPError as e:
    # Facebook API error details:
    # https://developers.facebook.com/docs/graph-api/using-graph-api/#receiving-errorcodes
    # https://developers.facebook.com/docs/reference/api/errors/
    exc_type, _, exc_traceback = sys.exc_info()
    body = e.read()
    exc_copy = exc_type(e.filename, e.code, e.msg, e.hdrs, cStringIO.StringIO(body))

    try:
      body_json = json.loads(body)
    except:
      logging.exception('Non-JSON response body: %s', body)
      # response isn't JSON. ignore and re-raise the original exception
      raise exc_type, exc_copy, exc_traceback

    error = body_json.get('error', {})
    if error.get('code') in (102, 190):
      subcode = error.get('error_subcode')
      if subcode == 458:  # revoked
        raise models.DisableSource()
      elif subcode in (463, 460):  # expired, changed password
        # ask the user to reauthenticate
        self.gr_source.create_notification(
          self.key.id(),
          "Brid.gy's access to your account has expired. Click here to renew it now!",
          'https://www.brid.gy/facebook/start')
        raise models.DisableSource()

    # other error. re-raise original exception
    raise exc_type, exc_copy, exc_traceback

  # add photos. they show up as both a post and a photo, each with a separate
  # id. the post's object_id field points to the photo's id. de-dupe by
  # switching the post to use the fb_object_id when it's provided.
  activities = resp.setdefault('items', [])
  activities_by_fb_id = {}
  for activity in activities:
    obj = activity.get('object', {})
    fb_id = obj.get('fb_object_id')
    if not fb_id:
      continue

    activities_by_fb_id[fb_id] = activity
    for x in activity, obj:
      parsed = util.parse_tag_uri(x.get('id', ''))
      if parsed:
        _, orig_id = parsed
        x['id'] = self.gr_source.tag_uri(fb_id)
        x['url'] = x.get('url', '').replace(orig_id, fb_id)

  # merge comments and likes from existing photo objects, and add new ones.
  for photo in photos:
    photo_activity = self.gr_source.post_to_activity(photo)
    existing = activities_by_fb_id.get(photo.get('id'))
    if existing:
      existing['object'].setdefault('replies', {}).setdefault('items', []).extend(
        photo_activity['object'].get('replies', {}).get('items', []))
      existing['object'].setdefault('tags', []).extend(
        [t for t in photo_activity['object'].get('tags', [])
         if t.get('verb') == 'like'])
    else:
      activities.append(photo_activity)

  # add events
  activities += [self.gr_source.event_to_activity(e, rsvps=r)
                 for e, r in events_and_rsvps]

  # TODO: remove once we're confident in our id parsing. (i'm going to canary
  # with just a few users before i do it for everyone.)
  #
  # discard objects with ids with colons in them. Background:
  # https://github.com/snarfed/bridgy/issues/305
  def remove_bad_ids(objs, label):
    ret = []
    for o in objs:
      id = util.parse_tag_uri(o.get('id') or o.get('object', {}).get('id') or '')
      if id and ':' in id[1]:
        logging.warning('Cowardly ignoring %s with bad id: %s', label, id[1])
      else:
        ret.append(o)
    return ret

  resp['items'] = remove_bad_ids(activities, 'activity')
  for activity in resp['items']:
    obj = activity.get('object', {})
    obj['tags'] = remove_bad_ids(obj.setdefault('tags', []), 'tag/like')
    replies = obj.get('replies', {})
    items = replies.get('items')
    if items:
      replies['items'] = remove_bad_ids(items, 'comment')
      replies['totalItems'] = len(replies['items'])

  return util.trim_nulls(resp)
class Poll(webapp2.RequestHandler):
  """Task handler that fetches and processes new responses from a single source.

  Request parameters:
    source_key: string key of source entity
    last_polled: timestamp, YYYY-MM-DD-HH-MM-SS

  Inserts a propagate task for each response that hasn't been seen before.
  """

  def post(self, *path_args):
    logging.debug('Params: %s', self.request.params)

    key = self.request.params['source_key']
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    last_polled = self.request.params['last_polled']
    if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
      logging.warning('duplicate poll task! deferring to the other task.')
      return

    logging.info('Last poll: %s/log?start_time=%s&key=%s',
                 self.request.host_url,
                 calendar.timegm(source.last_poll_attempt.utctimetuple()),
                 source.key.urlsafe())

    # mark this source as polling
    source.updates = {
      'poll_status': 'polling',
      'last_poll_attempt': util.now_fn(),
    }
    source = models.Source.put_updates(source)

    source.updates = {}
    try:
      self.poll(source)
    except models.DisableSource:
      # the user deauthorized the bridgy app, so disable this source.
      # let the task complete successfully so that it's not retried.
      source.updates['status'] = 'disabled'
      logging.warning('Disabling source!')
    except:
      source.updates['poll_status'] = 'error'
      raise
    finally:
      source = models.Source.put_updates(source)

    # add new poll task. randomize task ETA to within +/- 20% to try to spread
    # out tasks and prevent thundering herds.
    task_countdown = source.poll_period().total_seconds() * random.uniform(.8, 1.2)
    util.add_poll_task(source, countdown=task_countdown)

    # feeble attempt to avoid hitting the instance memory limit
    source = None
    gc.collect()

  def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
      logging.debug('Using ETag %s, last activity id %s',
                    source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
      cache.update(json.loads(source.last_activities_cache_json))

    try:
      # search for links first so that the user's activities and responses
      # override them if they overlap
      links = source.search_for_links()

      # this user's own activities (and user mentions)
      resp = source.get_activities_response(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        fetch_mentions=True, count=50, etag=source.last_activities_etag,
        min_id=source.last_activity_id, cache=cache)
      etag = resp.get('etag')  # used later
      user_activities = resp.get('items', [])

      # these map ids to AS objects
      responses = {a['id']: a for a in links}
      activities = {a['id']: a for a in links + user_activities}

    except Exception, e:
      code, body = util.interpret_http_exception(e)
      if code == '401':
        msg = 'Unauthorized error: %s' % e
        logging.warning(msg, exc_info=True)
        source.updates['poll_status'] = 'ok'
        raise models.DisableSource(msg)
      elif code in util.HTTP_RATE_LIMIT_CODES:
        logging.warning('Rate limited. Marking as error and finishing. %s', e)
        source.updates.update({'poll_status': 'error', 'rate_limited': True})
        return
      elif (code and int(code) / 100 == 5) or util.is_connection_failure(e):
        logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                      code, body, e)
        self.abort(ERROR_HTTP_RETURN_CODE)
      else:
        raise

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
      # maybe replace stored last activity id
      parsed = util.parse_tag_uri(id)
      if parsed:
        id = parsed[1]
      silo_activity_ids.add(id)
      try:
        # try numeric comparison first
        greater = int(id) > int(last_activity_id)
      except (TypeError, ValueError):
        greater = id > last_activity_id
      if greater:
        last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
      source.updates['last_activity_id'] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates['last_activities_cache_json'] = json.dumps(
      {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids})

    # Cache to make sure we only fetch the author's h-feed(s) the
    # first time we see it
    fetched_hfeeds = set()

    # narrow down to just public activities
    public = {}
    private = {}
    for id, activity in activities.items():
      (public if source.is_activity_public(activity) else private)[id] = activity
    logging.info('Found %d public activities: %s', len(public), public.keys())
    logging.info('Found %d private activities: %s', len(private), private.keys())

    last_public_post = (source.last_public_post or util.EPOCH).isoformat()
    public_published = util.trim_nulls([a.get('published') for a in public.values()])
    if public_published:
      max_published = max(public_published)
      if max_published > last_public_post:
        last_public_post = max_published
        source.updates['last_public_post'] = \
          util.as_utc(util.parse_iso8601(max_published))

    source.updates['recent_private_posts'] = \
      len([a for a in private.values()
           if a.get('published', util.EPOCH_ISO) > last_public_post])

    #
    # Step 2: extract responses, store their activities in response['activities']
    #
    # WARNING: this creates circular references in link posts found by search
    # queries in step 1, since they are their own activity. We use
    # prune_activity() and prune_response() in step 4 to remove these before
    # serializing to JSON.
    #
    for id, activity in public.items():
      obj = activity.get('object') or activity

      # handle user mentions
      user_id = source.user_tag_id()
      if obj.get('author', {}).get('id') != user_id:
        for tag in obj.get('tags', []):
          urls = tag.get('urls')
          if tag.get('objectType') == 'person' and tag.get('id') == user_id and urls:
            activity['originals'], activity['mentions'] = \
              original_post_discovery.discover(
                source, activity, fetch_hfeed=True,
                include_redirect_sources=False,
                already_fetched_hfeeds=fetched_hfeeds)
            activity['mentions'].update(u.get('value') for u in urls)
            responses[id] = activity
            break

      # handle quote mentions
      for att in obj.get('attachments', []):
        if (att.get('objectType') in ('note', 'article') and
            att.get('author', {}).get('id') == source.user_tag_id()):
          # now that we've confirmed that one exists, OPD will dig
          # into the actual attachments
          if 'originals' not in activity or 'mentions' not in activity:
            activity['originals'], activity['mentions'] = \
              original_post_discovery.discover(
                source, activity, fetch_hfeed=True,
                include_redirect_sources=False,
                already_fetched_hfeeds=fetched_hfeeds)
          responses[id] = activity
          break

      # extract replies, likes, reactions, reposts, and rsvps
      replies = obj.get('replies', {}).get('items', [])
      tags = obj.get('tags', [])
      likes = [t for t in tags if Response.get_type(t) == 'like']
      reactions = [t for t in tags if Response.get_type(t) == 'react']
      reposts = [t for t in tags if Response.get_type(t) == 'repost']
      rsvps = Source.get_rsvps_from_event(obj)

      # coalesce responses. drop any without ids
      for resp in replies + likes + reactions + reposts + rsvps:
        id = resp.get('id')
        if not id:
          logging.error('Skipping response without id: %s',
                        json.dumps(resp, indent=2))
          continue

        resp.setdefault('activities', []).append(activity)

        # when we find two responses with the same id, the earlier one may have
        # come from a link post or user mention, and this one is probably better
        # since it probably came from the user's activity, so prefer this one.
        # background: https://github.com/snarfed/bridgy/issues/533
        existing = responses.get(id)
        if existing:
          if source.gr_source.activity_changed(resp, existing, log=True):
            logging.warning('Got two different versions of same response!\n%s\n%s',
                            existing, resp)
          resp['activities'].extend(existing.get('activities', []))

        responses[id] = resp

    #
    # Step 3: filter out responses we've already seen
    #
    # seen responses (JSON objects) for each source are stored in its entity.
    unchanged_responses = []
    if source.seen_responses_cache_json:
      for seen in json.loads(source.seen_responses_cache_json):
        id = seen['id']
        resp = responses.get(id)
        if resp and not source.gr_source.activity_changed(seen, resp, log=True):
          unchanged_responses.append(seen)
          del responses[id]

    #
    # Step 4: store new responses and enqueue propagate tasks
    #
    pruned_responses = []
    for id, resp in responses.items():
      resp_type = Response.get_type(resp)
      activities = resp.pop('activities', [])
      if not activities and resp_type == 'post':
        activities = [resp]
      too_long = set()
      urls_to_activity = {}
      for i, activity in enumerate(activities):
        # we'll usually have multiple responses for the same activity, and the
        # objects in resp['activities'] are shared, so cache each activity's
        # discovered webmention targets inside its object.
        if 'originals' not in activity or 'mentions' not in activity:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)

        targets = original_post_discovery.targets_for_response(
          resp, originals=activity['originals'], mentions=activity['mentions'])
        if targets:
          logging.info('%s has %d webmention target(s): %s', activity.get('url'),
                       len(targets), ' '.join(targets))
        for t in targets:
          if len(t) <= _MAX_STRING_LENGTH:
            urls_to_activity[t] = i
          else:
            logging.warning('Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
            too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

      # store/update response entity. the prune_*() calls are important to
      # remove circular references in link responses, which are their own
      # activities. details in the step 2 comment above.
      pruned_response = util.prune_response(resp)
      pruned_responses.append(pruned_response)
      resp_entity = Response(
        id=id,
        source=source.key,
        activities_json=[json.dumps(util.prune_activity(a, source))
                         for a in activities],
        response_json=json.dumps(pruned_response),
        type=resp_type,
        unsent=list(urls_to_activity.keys()),
        failed=list(too_long),
        original_posts=resp.get('originals', []))
      if urls_to_activity and len(activities) > 1:
        resp_entity.urls_to_activity = json.dumps(urls_to_activity)
      resp_entity.get_or_save(source)

    # update cache
    if pruned_responses:
      source.updates['seen_responses_cache_json'] = json.dumps(
        pruned_responses + unchanged_responses)

    source.updates.update({'last_polled': source.last_poll_attempt,
                           'poll_status': 'ok'})
    if etag and etag != source.last_activities_etag:
      source.updates['last_activities_etag'] = etag

    #
    # Step 5. possibly refetch updated syndication urls
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
      logging.info('refetching h-feed for source %s', source.label())
      relationships = original_post_discovery.refetch(source)

      now = util.now_fn()
      source.updates['last_hfeed_refetch'] = now

      if relationships:
        logging.info('refetch h-feed found new rel=syndication relationships: %s',
                     relationships)
        try:
          self.repropagate_old_responses(source, relationships)
        except BaseException, e:
          if (isinstance(e, (datastore_errors.BadRequestError,
                             datastore_errors.Timeout)) or
              util.is_connection_failure(e)):
            logging.info('Timeout while repropagating responses.', exc_info=True)
          else:
            raise
def backfeed(self, source, responses=None, activities=None): """Processes responses and activities and generates propagate tasks. Stores property names and values to update in source.updates. Args: source: Source responses: dict mapping AS response id to AS object activities: dict mapping AS activity id to AS object """ if responses is None: responses = {} if activities is None: activities = {} # Cache to make sure we only fetch the author's h-feed(s) the # first time we see it fetched_hfeeds = set() # narrow down to just public activities public = {} private = {} for id, activity in activities.items(): (public if source.is_activity_public(activity) else private)[id] = activity logging.info('Found %d public activities: %s', len(public), public.keys()) logging.info('Found %d private activities: %s', len(private), private.keys()) last_public_post = (source.last_public_post or util.EPOCH).isoformat() public_published = util.trim_nulls( [a.get('published') for a in public.values()]) if public_published: max_published = max(public_published) if max_published > last_public_post: last_public_post = max_published source.updates['last_public_post'] = \ util.as_utc(util.parse_iso8601(max_published)) source.updates['recent_private_posts'] = \ len([a for a in private.values() if a.get('published', util.EPOCH_ISO) > last_public_post]) # # Step 2: extract responses, store their activities in response['activities'] # # WARNING: this creates circular references in link posts found by search # queries in step 1, since they are their own activity. We use # prune_activity() and prune_response() in step 4 to remove these before # serializing to JSON. # for id, activity in public.items(): obj = activity.get('object') or activity # handle user mentions user_id = source.user_tag_id() if obj.get( 'author', {}).get('id') != user_id and activity.get('verb') != 'share': for tag in obj.get('tags', []): urls = tag.get('urls') if tag.get('objectType') == 'person' and tag.get( 'id') == user_id and urls: activity['originals'], activity['mentions'] = \ original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds) activity['mentions'].update( u.get('value') for u in urls) responses[id] = activity break # handle quote mentions for att in obj.get('attachments', []): if (att.get('objectType') in ('note', 'article') and att.get( 'author', {}).get('id') == source.user_tag_id()): # now that we've confirmed that one exists, OPD will dig # into the actual attachments if 'originals' not in activity or 'mentions' not in activity: activity['originals'], activity['mentions'] = \ original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds) responses[id] = activity break # extract replies, likes, reactions, reposts, and rsvps replies = obj.get('replies', {}).get('items', []) tags = obj.get('tags', []) likes = [t for t in tags if Response.get_type(t) == 'like'] reactions = [t for t in tags if Response.get_type(t) == 'react'] reposts = [t for t in tags if Response.get_type(t) == 'repost'] rsvps = Source.get_rsvps_from_event(obj) # coalesce responses. 
drop any without ids for resp in replies + likes + reactions + reposts + rsvps: id = resp.get('id') if not id: logging.error('Skipping response without id: %s', json_dumps(resp, indent=2)) continue if source.is_blocked(resp): logging.info( 'Skipping response by blocked user: %s', json_dumps(resp.get('author') or resp.get('actor'), indent=2)) continue resp.setdefault('activities', []).append(activity) # when we find two responses with the same id, the earlier one may have # come from a link post or user mention, and this one is probably better # since it probably came from the user's activity, so prefer this one. # background: https://github.com/snarfed/bridgy/issues/533 existing = responses.get(id) if existing: if source.gr_source.activity_changed(resp, existing, log=True): logging.warning( 'Got two different versions of same response!\n%s\n%s', existing, resp) resp['activities'].extend(existing.get('activities', [])) responses[id] = resp # # Step 3: filter out responses we've already seen # # seen responses (JSON objects) for each source are stored in its entity. unchanged_responses = [] if source.seen_responses_cache_json: for seen in json_loads(source.seen_responses_cache_json): id = seen['id'] resp = responses.get(id) if resp and not source.gr_source.activity_changed( seen, resp, log=True): unchanged_responses.append(seen) del responses[id] # # Step 4: store new responses and enqueue propagate tasks # pruned_responses = [] source.blocked_ids = None for id, resp in responses.items(): resp_type = Response.get_type(resp) activities = resp.pop('activities', []) if not activities and resp_type == 'post': activities = [resp] too_long = set() urls_to_activity = {} for i, activity in enumerate(activities): # we'll usually have multiple responses for the same activity, and the # objects in resp['activities'] are shared, so cache each activity's # discovered webmention targets inside its object. if 'originals' not in activity or 'mentions' not in activity: activity['originals'], activity['mentions'] = \ original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds) targets = original_post_discovery.targets_for_response( resp, originals=activity['originals'], mentions=activity['mentions']) if targets: logging.info('%s has %d webmention target(s): %s', activity.get('url'), len(targets), ' '.join(targets)) # new response to propagate! load block list if we haven't already if source.blocked_ids is None: source.load_blocklist() for t in targets: if len(t) <= _MAX_STRING_LENGTH: urls_to_activity[t] = i else: logging.info( 'Giving up on target URL over %s chars! %s', _MAX_STRING_LENGTH, t) too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...') # store/update response entity. the prune_*() calls are important to # remove circular references in link responses, which are their own # activities. details in the step 2 comment above. 
pruned_response = util.prune_response(resp) pruned_responses.append(pruned_response) resp_entity = Response(id=id, source=source.key, activities_json=[ json_dumps( util.prune_activity(a, source)) for a in activities ], response_json=json_dumps(pruned_response), type=resp_type, unsent=list(urls_to_activity.keys()), failed=list(too_long), original_posts=resp.get('originals', [])) if urls_to_activity and len(activities) > 1: resp_entity.urls_to_activity = json_dumps(urls_to_activity) resp_entity.get_or_save(source, restart=self.RESTART_EXISTING_TASKS) # update cache if pruned_responses: source.updates['seen_responses_cache_json'] = json_dumps( pruned_responses + unchanged_responses)
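# Minimal standalone sketch of the coalescing step in backfeed() above: dedupe
# responses by id, prefer the later copy (usually the one found via the user's
# own activity), and merge their 'activities' lists so no parent activity is
# lost. coalesce_responses() is a hypothetical helper written for illustration,
# not part of Bridgy; the blocked-user and activity_changed() checks are
# omitted for brevity.
def coalesce_responses(candidates):
    """candidates: iterable of (AS response dict, parent AS activity dict) pairs."""
    responses = {}
    for resp, activity in candidates:
        id = resp.get('id')
        if not id:
            continue  # backfeed() logs and skips responses without ids
        resp.setdefault('activities', []).append(activity)
        existing = responses.get(id)
        if existing:
            # keep the newer copy, but carry over the activities seen earlier
            resp['activities'].extend(existing.get('activities', []))
        responses[id] = resp
    return responses

# the same like discovered under a link post and under the user's own post
# collapses to one response with both parent activities attached
like = {'id': 'tag:fa.ke,2013:like_1', 'objectType': 'activity', 'verb': 'like'}
merged = coalesce_responses([
    (dict(like), {'id': 'tag:fa.ke,2013:link_post'}),
    (dict(like), {'id': 'tag:fa.ke,2013:own_post'}),
])
assert list(merged.keys()) == ['tag:fa.ke,2013:like_1']
assert len(merged['tag:fa.ke,2013:like_1']['activities']) == 2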
def test_none(self): self.assertEqual(None, util.trim_nulls(None))
def poll(self, source): """Actually runs the poll. Stores property names and values to update in source.updates. """ if source.last_activities_etag or source.last_activity_id: logging.debug("Using ETag %s, last activity id %s", source.last_activities_etag, source.last_activity_id) # # Step 1: fetch activities: # * posts by the user # * search all posts for the user's domain URLs to find links # cache = util.CacheDict() if source.last_activities_cache_json: cache.update(json.loads(source.last_activities_cache_json)) # search for links first so that the user's activities and responses # override them if they overlap links = source.search_for_links() # this user's own activities (and user mentions) resp = source.get_activities_response( fetch_replies=True, fetch_likes=True, fetch_shares=True, fetch_mentions=True, count=50, etag=source.last_activities_etag, min_id=source.last_activity_id, cache=cache, ) etag = resp.get("etag") # used later user_activities = resp.get("items", []) # these map ids to AS objects responses = {a["id"]: a for a in links} activities = {a["id"]: a for a in links + user_activities} # extract silo activity ids, update last_activity_id silo_activity_ids = set() last_activity_id = source.last_activity_id for id, activity in activities.items(): # maybe replace stored last activity id parsed = util.parse_tag_uri(id) if parsed: id = parsed[1] silo_activity_ids.add(id) try: # try numeric comparison first greater = int(id) > int(last_activity_id) except (TypeError, ValueError): greater = id > last_activity_id if greater: last_activity_id = id if last_activity_id and last_activity_id != source.last_activity_id: source.updates["last_activity_id"] = last_activity_id # trim cache to just the returned activity ids, so that it doesn't grow # without bound. (WARNING: depends on get_activities_response()'s cache key # format, e.g. 'PREFIX ACTIVITY_ID'!) source.updates["last_activities_cache_json"] = json.dumps( {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids} ) # Cache to make sure we only fetch the author's h-feed(s) the # first time we see it fetched_hfeeds = set() # narrow down to just public activities public = {} private = {} for id, activity in activities.items(): (public if source.is_activity_public(activity) else private)[id] = activity logging.info("Found %d public activities: %s", len(public), public.keys()) logging.info("Found %d private activities: %s", len(private), private.keys()) last_public_post = (source.last_public_post or util.EPOCH).isoformat() public_published = util.trim_nulls([a.get("published") for a in public.values()]) if public_published: max_published = max(public_published) if max_published > last_public_post: last_public_post = max_published source.updates["last_public_post"] = util.as_utc(util.parse_iso8601(max_published)) source.updates["recent_private_posts"] = len( [a for a in private.values() if a.get("published", util.EPOCH_ISO) > last_public_post] ) # # Step 2: extract responses, store their activities in response['activities'] # # WARNING: this creates circular references in link posts found by search # queries in step 1, since they are their own activity. We use # prune_activity() and prune_response() in step 4 to remove these before # serializing to JSON. 
# for id, activity in public.items(): obj = activity.get("object") or activity # handle user mentions user_id = source.user_tag_id() if obj.get("author", {}).get("id") != user_id: for tag in obj.get("tags", []): urls = tag.get("urls") if tag.get("objectType") == "person" and tag.get("id") == user_id and urls: activity["originals"], activity["mentions"] = original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds, ) activity["mentions"].update(u.get("value") for u in urls) responses[id] = activity break # handle quote mentions for att in obj.get("attachments", []): if ( att.get("objectType") in ("note", "article") and att.get("author", {}).get("id") == source.user_tag_id() ): # now that we've confirmed that one exists, OPD will dig # into the actual attachments if "originals" not in activity or "mentions" not in activity: activity["originals"], activity["mentions"] = original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds, ) responses[id] = activity break # extract replies, likes, reactions, reposts, and rsvps replies = obj.get("replies", {}).get("items", []) tags = obj.get("tags", []) likes = [t for t in tags if Response.get_type(t) == "like"] reactions = [t for t in tags if Response.get_type(t) == "react"] reposts = [t for t in tags if Response.get_type(t) == "repost"] rsvps = Source.get_rsvps_from_event(obj) # coalesce responses. drop any without ids for resp in replies + likes + reactions + reposts + rsvps: id = resp.get("id") if not id: logging.error("Skipping response without id: %s", json.dumps(resp, indent=2)) continue resp.setdefault("activities", []).append(activity) # when we find two responses with the same id, the earlier one may have # come from a link post or user mention, and this one is probably better # since it probably came from the user's activity, so prefer this one. # background: https://github.com/snarfed/bridgy/issues/533 existing = responses.get(id) if existing: if source.gr_source.activity_changed(resp, existing, log=True): logging.warning("Got two different versions of same response!\n%s\n%s", existing, resp) resp["activities"].extend(existing.get("activities", [])) responses[id] = resp # # Step 3: filter out responses we've already seen # # seen responses (JSON objects) for each source are stored in its entity. unchanged_responses = [] if source.seen_responses_cache_json: for seen in json.loads(source.seen_responses_cache_json): id = seen["id"] resp = responses.get(id) if resp and not source.gr_source.activity_changed(seen, resp, log=True): unchanged_responses.append(seen) del responses[id] # # Step 4: store new responses and enqueue propagate tasks # pruned_responses = [] for id, resp in responses.items(): resp_type = Response.get_type(resp) activities = resp.pop("activities", []) if not activities and resp_type == "post": activities = [resp] too_long = set() urls_to_activity = {} for i, activity in enumerate(activities): # we'll usually have multiple responses for the same activity, and the # objects in resp['activities'] are shared, so cache each activity's # discovered webmention targets inside its object. 
if "originals" not in activity or "mentions" not in activity: activity["originals"], activity["mentions"] = original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds, ) targets = original_post_discovery.targets_for_response( resp, originals=activity["originals"], mentions=activity["mentions"] ) if targets: logging.info( "%s has %d webmention target(s): %s", activity.get("url"), len(targets), " ".join(targets) ) for t in targets: if len(t) <= _MAX_STRING_LENGTH: urls_to_activity[t] = i else: logging.warning("Giving up on target URL over %s chars! %s", _MAX_STRING_LENGTH, t) too_long.add(t[: _MAX_STRING_LENGTH - 4] + "...") # store/update response entity. the prune_*() calls are important to # remove circular references in link responses, which are their own # activities. details in the step 2 comment above. pruned_response = util.prune_response(resp) pruned_responses.append(pruned_response) resp_entity = Response( id=id, source=source.key, activities_json=[json.dumps(util.prune_activity(a, source)) for a in activities], response_json=json.dumps(pruned_response), type=resp_type, unsent=list(urls_to_activity.keys()), failed=list(too_long), original_posts=resp.get("originals", []), ) if urls_to_activity and len(activities) > 1: resp_entity.urls_to_activity = json.dumps(urls_to_activity) resp_entity.get_or_save(source) # update cache if pruned_responses: source.updates["seen_responses_cache_json"] = json.dumps(pruned_responses + unchanged_responses) source.updates.update({"last_polled": source.last_poll_attempt, "poll_status": "ok"}) if etag and etag != source.last_activities_etag: source.updates["last_activities_etag"] = etag # # Step 5. possibly refetch updated syndication urls # # if the author has added syndication urls since the first time # original_post_discovery ran, we'll miss them. this cleanup task will # periodically check for updated urls. only kicks in if the author has # *ever* published a rel=syndication url if source.should_refetch(): logging.info("refetching h-feed for source %s", source.label()) relationships = original_post_discovery.refetch(source) now = util.now_fn() source.updates["last_hfeed_refetch"] = now if relationships: logging.info("refetch h-feed found new rel=syndication relationships: %s", relationships) try: self.repropagate_old_responses(source, relationships) except BaseException, e: if isinstance( e, (datastore_errors.BadRequestError, datastore_errors.Timeout) ) or util.is_connection_failure(e): logging.info("Timeout while repropagating responses.", exc_info=True) else: raise
def test_nested_dict_with_nones(self): self.assertEqual({1: {3: 4}}, util.trim_nulls({1: {2: [], 3: 4}, 5: {6: None}}))
def test_string(self): self.assertEqual('foo', util.trim_nulls('foo'))
def test_simple_dict_with_nones(self): self.assertEqual({3: 4, 2: 9}, util.trim_nulls({1: None, 3: 4, 5: [], 2: 9}))
def test_simple_dict(self): self.assertEqual({1: 2, 3: 4}, util.trim_nulls({1: 2, 3: 4}))
def test_simple_dict_with_nulls(self): self.assertEqual({}, util.trim_nulls({1: None, 2: [], 3: {}}))
def test_empty_dict(self): self.assertEqual({}, util.trim_nulls({}))
def test_empty_list(self): self.assertEqual([], util.trim_nulls([]))
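# Minimal re-implementation of the behavior these trim_nulls tests pin down,
# and that poll()/backfeed() above rely on when they drop None entries from
# lists of URLs and 'published' timestamps. This is a sketch of the observed
# behavior only, not the real util.trim_nulls; how it treats values like empty
# strings is a guess.
def trim_nulls(value):
    NULLS = (None, {}, [], ())
    if isinstance(value, dict):
        trimmed = {k: trim_nulls(v) for k, v in value.items()}
        return {k: v for k, v in trimmed.items() if v not in NULLS}
    elif isinstance(value, (list, tuple)):
        trimmed = [trim_nulls(v) for v in value]
        return type(value)(v for v in trimmed if v not in NULLS)
    return value

assert trim_nulls({1: {2: [], 3: 4}, 5: {6: None}}) == {1: {3: 4}}
assert trim_nulls(['http://example.com/', None]) == ['http://example.com/']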