Example #1
    def test_insert_replaces_blanks(self):
        """Make sure we replace original=None with original=something
    when it is discovered"""

        # add a blank for the original too
        SyndicatedPost.insert_original_blank(
            self.source, 'http://original/newly-discovered')

        self.assertTrue(
            SyndicatedPost.query(
                SyndicatedPost.syndication == 'http://silo/no-original',
                SyndicatedPost.original == None,
                ancestor=self.source.key).get())

        self.assertTrue(
            SyndicatedPost.query(
                SyndicatedPost.original == 'http://original/newly-discovered',
                SyndicatedPost.syndication == None,
                ancestor=self.source.key).get())

        r = SyndicatedPost.insert(self.source, 'http://silo/no-original',
                                  'http://original/newly-discovered')
        self.assertIsNotNone(r)
        self.assertEqual('http://original/newly-discovered', r.original)

        # make sure it's in NDB
        rs = SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/no-original',
            ancestor=self.source.key).fetch()
        self.assertEqual(1, len(rs))
        self.assertEqual('http://original/newly-discovered', rs[0].original)
        self.assertEqual('http://silo/no-original', rs[0].syndication)

        # and the blanks have been removed
        self.assertFalse(
            SyndicatedPost.query(
                SyndicatedPost.syndication == 'http://silo/no-original',
                SyndicatedPost.original == None,
                ancestor=self.source.key).get())

        self.assertFalse(
            SyndicatedPost.query(
                SyndicatedPost.original == 'http://original/newly-discovered',
                SyndicatedPost.syndication == None,
                ancestor=self.source.key).get())
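To make the blank-replacement semantics concrete, here is a minimal sketch of the pattern this test exercises. The property layout and the insert logic are assumptions for illustration, not Bridgy's actual model code; only the class and method names come from the test above.

from google.appengine.ext import ndb

class SyndicatedPost(ndb.Model):
  """Sketch only: a (source, original, syndication) relationship where a None
  field marks a 'blank', i.e. a lookup that found nothing."""
  syndication = ndb.StringProperty()  # None = no silo copy found yet
  original = ndb.StringProperty()     # None = no original found yet

  @classmethod
  def insert(cls, source, syndication, original):
    # hypothetical implementation: drop any blank placeholders this pair
    # supersedes, then store the fully-specified relationship
    blanks = [e.key for e in cls.query(
                ndb.OR(cls.syndication == syndication,
                       cls.original == original),
                ancestor=source.key)
              if e.syndication is None or e.original is None]
    ndb.delete_multi(blanks)
    entity = cls(parent=source.key, syndication=syndication, original=original)
    entity.put()
    return entity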
Example #2
  def test_insert_replaces_blanks(self):
    """Make sure we replace original=None with original=something
    when it is discovered"""

    # add a blank for the original too
    SyndicatedPost.insert_original_blank(
      self.source, 'http://original/newly-discovered')

    self.assertTrue(
      SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        SyndicatedPost.original == None, ancestor=self.source.key).get())

    self.assertTrue(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://original/newly-discovered',
        SyndicatedPost.syndication == None, ancestor=self.source.key).get())

    r = SyndicatedPost.insert(
        self.source, 'http://silo/no-original',
        'http://original/newly-discovered')
    self.assertIsNotNone(r)
    self.assertEqual('http://original/newly-discovered', r.original)

    # make sure it's in NDB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        ancestor=self.source.key
    ).fetch()
    self.assertEqual(1, len(rs))
    self.assertEqual('http://original/newly-discovered', rs[0].original)
    self.assertEqual('http://silo/no-original', rs[0].syndication)

    # and the blanks have been removed
    self.assertFalse(
      SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        SyndicatedPost.original == None, ancestor=self.source.key).get())

    self.assertFalse(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://original/newly-discovered',
        SyndicatedPost.syndication == None, ancestor=self.source.key).get())
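Both versions of the test lean on fixtures the snippet doesn't show: a datastore stub and a self.source entity with a preexisting blank syndication row. A minimal setUp along these lines would make them runnable; the testbed calls are standard App Engine SDK, while the Source construction and the blank-seeding call are assumptions about the test harness.

import unittest

from google.appengine.ext import testbed

class SyndicatedPostTest(unittest.TestCase):
  def setUp(self):
    # standard App Engine unit-test scaffolding
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_datastore_v3_stub()
    self.testbed.init_memcache_stub()
    # hypothetical fixture: the Source entity the assertions are rooted under
    self.source = Source(id='test-source')
    self.source.put()
    # the test expects a preexisting blank for the syndication side, e.g.:
    SyndicatedPost.insert_syndication_blank(
      self.source, 'http://silo/no-original')

  def tearDown(self):
    self.testbed.deactivate()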
Example #3
def _process_entry(source, permalink, feed_entry, refetch, preexisting,
                   store_blanks=True):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the
  DB if successful.

  Args:
    source: models.Source subclass
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: a list of previously discovered models.SyndicatedPosts
      for this permalink
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new models.SyndicatedPosts
  """
  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # if we're refetching and this one is blank, do not return.
    # if there is a blank entry, it should be the one and only entry,
    # but go ahead and check 'all' of them to be safe.
    if not refetch:
      return {}
    synds = [s.syndication for s in preexisting if s.syndication]
    if synds:
      logging.debug('previously found relationship(s) for original %s: %s',
                    permalink, synds)

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  permalink, _, type_ok = util.get_webmention_target(permalink)
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  if usynd:
    logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
  results = _process_syndication_urls(source, permalink, set(
    url for url in usynd if isinstance(url, basestring)), preexisting)
  success = True

  # fetch the full permalink page, which often has more detailed information
  if not results:
    parsed = None
    try:
      logging.debug('fetching post permalink %s', permalink)
      if type_ok:
        resp = util.requests_get(permalink)
        resp.raise_for_status()
        parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
    except AssertionError:
      raise  # for unit tests
    except BaseException:
      # TODO limit the number of allowed failures
      logging.warning('Could not fetch permalink %s', permalink, exc_info=True)
      success = False

    if parsed:
      syndication_urls = set()
      relsynd = parsed.get('rels', {}).get('syndication', [])
      if relsynd:
        logging.debug('rel-syndication links: %s', relsynd)
      syndication_urls.update(url for url in relsynd
                              if isinstance(url, basestring))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case.
      for hentry in (item for item in parsed['items']
                     if 'h-entry' in item['type']):
        usynd = hentry.get('properties', {}).get('syndication', [])
        if usynd:
          logging.debug('u-syndication links: %s', usynd)
        syndication_urls.update(url for url in usynd
                                if isinstance(url, basestring))
      results = _process_syndication_urls(
        source, permalink, syndication_urls, preexisting)

  # detect and delete SyndicatedPosts that were removed from the site
  if success:
    result_syndposts = list(itertools.chain(*results.values()))  # list, not iterator: checked repeatedly below
    for syndpost in list(preexisting):
      if syndpost.syndication and syndpost not in result_syndposts:
        logging.info('deleting relationship that disappeared: %s', syndpost)
        syndpost.key.delete()
        preexisting.remove(syndpost)

  if not results:
    logging.debug('no syndication links from %s to current source %s.',
                  permalink, source.label())
    results = {}
    if store_blanks and not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logging.debug('saving empty relationship so that %s will not be '
                    'searched again', permalink)
      SyndicatedPost.insert_original_blank(source, permalink)

  # only return results that are not in the preexisting list
  new_results = {}
  for syndurl, syndposts_for_url in results.iteritems():
    for syndpost in syndposts_for_url:
      if syndpost not in preexisting:
        new_results.setdefault(syndurl, []).append(syndpost)

  if new_results:
    logging.debug('discovered relationships %s', new_results)
  return new_results
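For orientation, a hedged sketch of a call site: the source object and feed entry are placeholders, but the dict follows the microformats2 parse shape the function reads via feed_entry.get('properties').

# hypothetical call during an h-feed crawl
feed_entry = {
  'type': ['h-entry'],
  'properties': {
    'url': ['http://original/post'],
    'syndication': ['http://silo/post'],
  },
}
new = _process_entry(source, 'http://original/post', feed_entry,
                     refetch=False, preexisting=[], store_blanks=True)
# => {'http://silo/post': [<SyndicatedPost>]} when a relationship is found,
#    {} when nothing new turns up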
Example #4
def process_entry(source,
                  permalink,
                  feed_entry,
                  refetch,
                  preexisting,
                  store_blanks=True):
    """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`.

  Args:
    source:
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: list of previously discovered :class:`models.SyndicatedPost`\ s
      for this permalink
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new :class:`models.SyndicatedPost`\ s
  """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # if we're refetching and this one is blank, do not return.
        # if there is a blank entry, it should be the one and only entry,
        # but go ahead and check 'all' of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logger.debug(
                f'previously found relationship(s) for original {permalink}: {synds}'
            )

    # first try with the h-entry from the h-feed. if we find the syndication url
    # we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    usynd_urls = {url for url in usynd if isinstance(url, str)}
    if usynd_urls:
        logger.debug(
            f'u-syndication links on the h-feed h-entry: {usynd_urls}')
    results = _process_syndication_urls(source, permalink, usynd_urls,
                                        preexisting)
    success = True

    if results:
        source.updates['last_feed_syndication_url'] = util.now_fn()
    elif not source.last_feed_syndication_url or not feed_entry:
        # fetch the full permalink page if we think it might have more details
        mf2 = None
        try:
            if type_ok:
                logger.debug(f'fetching post permalink {permalink}')
                mf2 = util.fetch_mf2(permalink)
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logger.info(f'Could not fetch permalink {permalink}',
                        exc_info=True)
            success = False

        if mf2:
            syndication_urls = set()
            relsynd = mf2['rels'].get('syndication', [])
            if relsynd:
                logger.debug(f'rel-syndication links: {relsynd}')
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, str))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in mf2['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logger.debug(f'u-syndication links: {usynd}')
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, str))
            results = _process_syndication_urls(source, permalink,
                                                syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        result_syndposts = list(itertools.chain(*results.values()))
        for syndpost in list(preexisting):  # copy; the loop removes from preexisting
            if syndpost.syndication and syndpost not in result_syndposts:
                logger.info(
                    f'deleting relationship that disappeared: {syndpost}')
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logger.debug(
            f'no syndication links from {permalink} to current source {source.label()}.'
        )
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logger.debug(
                f'saving empty relationship so that {permalink} will not be searched again'
            )
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.items():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logger.debug(f'discovered relationships {new_results}')
    return new_results
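Every version above delegates to _process_syndication_urls, which these examples never show. A plausible sketch of its contract, under the assumption that it keeps only urls belonging to this source's silo and records one SyndicatedPost per match; the canonicalize_url helper is an assumption, not a confirmed API.

def _process_syndication_urls(source, permalink, syndication_urls, preexisting):
    """Sketch only: match candidate syndication urls against this source's
    silo and store each original -> syndication relationship."""
    results = {}
    for url in syndication_urls:
        # assumed helper: normalizes silo urls and returns None for urls
        # that point at some other silo
        syndication_url = source.canonicalize_url(url)
        if syndication_url:
            relationship = SyndicatedPost.insert(
                source, syndication_url, permalink)
            results.setdefault(syndication_url, []).append(relationship)
    return results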
Example #5
def _process_entry(source,
                   permalink,
                   feed_entry,
                   refetch,
                   preexisting,
                   store_blanks=True):
    """Fetch and process an h-entry, saving a new SyndicatedPost to the
  DB if successful.

  Args:
    source:
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: a list of previously discovered models.SyndicatedPosts
      for this permalink
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new models.SyndicatedPosts
  """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # if we're refetching and this one is blank, do not return.
        # if there is a blank entry, it should be the one and only entry,
        # but go ahead and check 'all' of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logging.debug(
                'previously found relationship(s) for original %s: %s',
                permalink, synds)

    # first try with the h-entry from the h-feed. if we find the syndication url
    # we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    if usynd:
        logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
    results = _process_syndication_urls(
        source, permalink,
        set(url for url in usynd if isinstance(url, basestring)), preexisting)
    success = True

    if results:
        source.updates['last_feed_syndication_url'] = util.now_fn()
    elif not source.last_feed_syndication_url:
        # fetch the full permalink page if we think it might have more details
        parsed = None
        try:
            logging.debug('fetching post permalink %s', permalink)
            if type_ok:
                resp = util.requests_get(permalink)
                resp.raise_for_status()
                parsed = util.mf2py_parse(resp.text, permalink)
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logging.warning('Could not fetch permalink %s',
                            permalink,
                            exc_info=True)
            success = False

        if parsed:
            syndication_urls = set()
            relsynd = parsed.get('rels', {}).get('syndication', [])
            if relsynd:
                logging.debug('rel-syndication links: %s', relsynd)
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, basestring))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in parsed['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logging.debug('u-syndication links: %s', usynd)
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, basestring))
            results = _process_syndication_urls(source, permalink,
                                                syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        result_syndposts = list(itertools.chain(*results.values()))  # list, not iterator: checked repeatedly below
        for syndpost in list(preexisting):
            if syndpost.syndication and syndpost not in result_syndposts:
                logging.info('deleting relationship that disappeared: %s',
                             syndpost)
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logging.debug('no syndication links from %s to current source %s.',
                      permalink, source.label())
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logging.debug(
                'saving empty relationship so that %s will not be '
                'searched again', permalink)
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.iteritems():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logging.debug('discovered relationships %s', new_results)
    return new_results
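The notable change in this version over Example #3 is the last_feed_syndication_url guard, which skips the per-permalink fetch once the h-feed itself has ever yielded a syndication link. A hedged sketch of the two call modes the refetch flag supports; the query is simplified and the caller is hypothetical.

# initial discovery pass: nothing known for this permalink yet
first = _process_entry(source, permalink, feed_entry,
                       refetch=False, preexisting=[])

# later refetch pass: hand back what's stored so blanks can be replaced and
# relationships that disappeared from the page can be deleted
stored = SyndicatedPost.query(SyndicatedPost.original == permalink,
                              ancestor=source.key).fetch()
again = _process_entry(source, permalink, feed_entry,
                       refetch=True, preexisting=stored)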
Example #6
def _process_entry(source, permalink, feed_entry, refetch_blanks, preexisting):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the
  DB if successful.

  Args:
    source: models.Source subclass
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch_blanks: boolean whether we should ignore blank preexisting
      SyndicatedPosts
    preexisting: a list of previously discovered models.SyndicatedPosts
      for this permalink

  Returns:
    a dict from syndicated url to a list of new models.SyndicatedPosts
  """
  results = {}

  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # if we're refetching blanks and this one is blank, do not return.
    # if there is a blank entry, it should be the one and only entry,
    # but go ahead and check 'all' of them to be safe.
    if refetch_blanks and all(not p.syndication for p in preexisting):
      logging.debug('ignoring blank relationship for original %s', permalink)
    else:
      return results

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
  results = _process_syndication_urls(source, permalink, set(
    url for url in usynd if isinstance(url, basestring)))

  # fetch the full permalink page, which often has more detailed information
  if not results:
    parsed = None
    try:
      logging.debug('fetching post permalink %s', permalink)
      permalink, _, type_ok = util.get_webmention_target(permalink)
      if type_ok:
        resp = requests.get(permalink, timeout=HTTP_TIMEOUT)
        resp.raise_for_status()
        parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
    except BaseException:
      # TODO limit the number of allowed failures
      logging.warning('Could not fetch permalink %s', permalink, exc_info=True)

    if parsed:
      syndication_urls = set()
      relsynd = parsed.get('rels', {}).get('syndication', [])
      logging.debug('rel-syndication links: %s', relsynd)
      syndication_urls.update(url for url in relsynd
                              if isinstance(url, basestring))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case.
      for hentry in (item for item in parsed['items']
                     if 'h-entry' in item['type']):
        usynd = hentry.get('properties', {}).get('syndication', [])
        logging.debug('u-syndication links: %s', usynd)
        syndication_urls.update(url for url in usynd
                                if isinstance(url, basestring))
      results = _process_syndication_urls(source, permalink,
                                          syndication_urls)

  if not results:
    logging.debug('no syndication links from %s to current source %s.',
                  permalink, source.label())
    if not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logging.debug('saving empty relationship so that %s will not be '
                    'searched again', permalink)
      SyndicatedPost.insert_original_blank(source, permalink)

  logging.debug('discovered relationships %s', results)
  return results
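This earliest version gates reprocessing on refetch_blanks plus the all(not p.syndication ...) check. A tiny worked illustration of that predicate with hypothetical entities:

# hypothetical preexisting rows for one permalink
blank = SyndicatedPost(original='http://original/post', syndication=None)
real = SyndicatedPost(original='http://original/post',
                      syndication='http://silo/post')

all(not p.syndication for p in [blank])        # True: blank only, reprocess
all(not p.syndication for p in [blank, real])  # False: already resolved, skip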