def _process_syndication_urls(source, permalink, syndication_urls):
  """Find syndication URLs that belong to the given source and store them.

  Each candidate URL is resolved through redirects and canonicalized by the
  source before its domain is compared against the source's silo domain.
  Matching URLs are persisted as SyndicatedPost relationships.

  Args:
    source: a models.Source subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list
      of syndication urls

  Returns:
    dict mapping canonical syndication url to list of stored SyndicatedPosts
  """
  discovered = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for candidate in syndication_urls:
    # resolve redirects so we compare the canonical silo URL -- gives the
    # best chance of finding a match
    resolved = util.follow_redirects(candidate).url
    # silo-specific normalization (e.g. replace a facebook username with
    # its numeric id)
    resolved = source.canonicalize_syndication_url(resolved)
    # skip URLs that don't belong to this source's silo.
    # TODO save future lookups by saving results for other sources too
    # (note: query the appropriate source subclass by author.domains,
    # rather than author.domain_urls)
    if util.domain_from_link(resolved) != source.AS_CLASS.DOMAIN:
      continue
    logging.debug('saving discovered relationship %s -> %s',
                  resolved, permalink)
    rel = SyndicatedPost.insert(
      source, syndication=resolved, original=permalink)
    discovered.setdefault(resolved, []).append(rel)
  return discovered
# Example #2 (score: 0)
    def test_insert_no_duplicates(self):
        """Re-inserting an existing syndication/original pair must not add a
        second row to the datastore."""
        result = SyndicatedPost.insert(self.source, 'http://silo/post/url',
                                       'http://original/post/url')
        self.assertIsNotNone(result)
        self.assertEqual('http://original/post/url', result.original)

        # the pair already existed, so the DB should still hold exactly one
        matches = SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/post/url',
            SyndicatedPost.original == 'http://original/post/url',
            ancestor=self.source.key).fetch()

        self.assertEqual(1, len(matches))
# Example #3 (score: 0)
  def test_insert_no_duplicates(self):
    """Inserting the same syndication/original pair twice must leave only
    one row in the datastore."""
    result = SyndicatedPost.insert(
      self.source, 'http://silo/post/url', 'http://original/post/url')
    self.assertIsNotNone(result)
    self.assertEqual('http://original/post/url', result.original)

    # the relationship already existed, so there should be exactly one row
    matches = SyndicatedPost.query(
      SyndicatedPost.syndication == 'http://silo/post/url',
      SyndicatedPost.original == 'http://original/post/url',
      ancestor=self.source.key).fetch()

    self.assertEqual(1, len(matches))
# Example #4 (score: 0)
  def test_retry(self):
    """POST /retry on a complete response should reset it to 'new', merge
    in originals discovered via SyndicatedPost, re-enqueue a propagate
    task, and drop the cached webmention endpoint.
    """
    self.assertEqual([], self.taskqueue_stub.GetTasks('propagate'))

    source = self.sources[0]
    source.domain_urls = ['http://orig']
    # a recent refetch time lets the final assertion check that retry
    # doesn't trigger another h-feed refetch
    source.last_hfeed_refetch = last_hfeed_refetch = \
        testutil.NOW - datetime.timedelta(minutes=1)
    source.put()

    resp = self.responses[0]
    resp.status = 'complete'
    resp.unsent = ['http://unsent']
    resp.sent = ['http://sent']
    resp.error = ['http://error']
    resp.failed = ['http://failed']
    resp.skipped = ['https://skipped']

    # SyndicatedPost with new target URLs
    resp.activities_json = [
      json.dumps({'object': {'url': 'https://fa.ke/1'}}),
      json.dumps({'url': 'https://fa.ke/2', 'object': {'unused': 'ok'}}),
      json.dumps({'url': 'https://fa.ke/3'}),
    ]
    resp.put()
    # map each silo URL above to an original, so retry can merge the
    # originals into unsent below
    SyndicatedPost.insert(source, 'https://fa.ke/1', 'https://orig/1')
    SyndicatedPost.insert(source, 'https://fa.ke/2', 'http://orig/2')
    SyndicatedPost.insert(source, 'https://fa.ke/3', 'http://orig/3')

    # cached webmention endpoint
    memcache.set('W https skipped /', 'asdf')

    key = resp.key.urlsafe()
    response = app.application.get_response(
      '/retry', method='POST', body=native_str(urllib.parse.urlencode({'key': key})))
    # assertEqual, not the deprecated assertEquals alias
    self.assertEqual(302, response.status_int)
    self.assertEqual(source.bridgy_url(self.handler),
                     response.headers['Location'].split('#')[0])
    params = testutil.get_task_params(self.taskqueue_stub.GetTasks('propagate')[0])
    self.assertEqual(key, params['response_key'])

    # status and URLs should be refreshed
    got = resp.key.get()
    self.assertEqual('new', got.status)
    # NOTE(review): assertItemsEqual is Python 2 unittest API; presumably
    # provided by the test base class here -- confirm before porting to py3
    self.assertItemsEqual(
      ['http://unsent/', 'http://sent/', 'https://skipped/', 'http://error/',
       'http://failed/', 'https://orig/1', 'http://orig/2', 'http://orig/3'],
      got.unsent)
    for field in got.sent, got.skipped, got.error, got.failed:
      self.assertEqual([], field)

    # webmention endpoints for URL domains should be refreshed
    self.assertIsNone(memcache.get('W https skipped /'))

    # shouldn't have refetched h-feed
    self.assertEqual(last_hfeed_refetch, source.key.get().last_hfeed_refetch)
# Example #5 (score: 0)
  def test_retry(self):
    """POST /retry on a complete response should reset it to 'new', merge
    in originals discovered via SyndicatedPost, re-enqueue a propagate
    task, and drop the cached webmention endpoint.
    """
    self.assertEqual([], self.taskqueue_stub.GetTasks('propagate'))

    source = self.sources[0]
    source.domain_urls = ['http://orig']
    # a recent refetch time lets the final assertion check that retry
    # doesn't trigger another h-feed refetch
    source.last_hfeed_refetch = last_hfeed_refetch = \
        testutil.NOW - datetime.timedelta(minutes=1)
    source.put()

    resp = self.responses[0]
    resp.status = 'complete'
    resp.unsent = ['http://unsent']
    resp.sent = ['http://sent']
    resp.error = ['http://error']
    resp.failed = ['http://failed']
    resp.skipped = ['https://skipped']

    # SyndicatedPost with new target URLs
    resp.activities_json = [
      json.dumps({'object': {'url': 'https://fa.ke/1'}}),
      json.dumps({'url': 'https://fa.ke/2', 'object': {'unused': 'ok'}}),
      json.dumps({'url': 'https://fa.ke/3'}),
    ]
    resp.put()
    # map each silo URL above to an original, so retry can merge the
    # originals into unsent below
    SyndicatedPost.insert(source, 'https://fa.ke/1', 'https://orig/1')
    SyndicatedPost.insert(source, 'https://fa.ke/2', 'http://orig/2')
    SyndicatedPost.insert(source, 'https://fa.ke/3', 'http://orig/3')

    # cached webmention endpoint
    memcache.set('W https skipped /', 'asdf')

    key = resp.key.urlsafe()
    response = app.application.get_response(
      '/retry', method='POST', body=urllib.urlencode({'key': key}))
    # assertEqual, not the deprecated assertEquals alias
    self.assertEqual(302, response.status_int)
    self.assertEqual(source.bridgy_url(self.handler),
                     response.headers['Location'].split('#')[0])
    params = testutil.get_task_params(self.taskqueue_stub.GetTasks('propagate')[0])
    self.assertEqual(key, params['response_key'])

    # status and URLs should be refreshed
    got = resp.key.get()
    self.assertEqual('new', got.status)
    # NOTE(review): assertItemsEqual is Python 2 unittest API -- keep while
    # the codebase runs on py2 (urllib.urlencode above is py2-only too)
    self.assertItemsEqual(
      ['http://unsent/', 'http://sent/', 'https://skipped/', 'http://error/',
       'http://failed/', 'https://orig/1', 'http://orig/2', 'http://orig/3'],
      got.unsent)
    for field in got.sent, got.skipped, got.error, got.failed:
      self.assertEqual([], field)

    # webmention endpoints for URL domains should be refreshed
    self.assertIsNone(memcache.get('W https skipped /'))

    # shouldn't have refetched h-feed
    self.assertEqual(last_hfeed_refetch, source.key.get().last_hfeed_refetch)
# Example #6 (score: 0)
    def test_insert_replaces_blanks(self):
        """Make sure we replace original=None with original=something
        when it is discovered.

        NOTE: the `== None` comparisons below are NDB query-filter
        expressions, not boolean tests -- do not rewrite them as `is None`.
        """

        # add a blank for the original too
        SyndicatedPost.insert_original_blank(
            self.source, 'http://original/newly-discovered')

        # precondition: a blank row for the syndication URL exists
        # (presumably created by test setup -- not visible here)
        self.assertTrue(
            SyndicatedPost.query(
                SyndicatedPost.syndication == 'http://silo/no-original',
                SyndicatedPost.original == None,
                ancestor=self.source.key).get())

        # precondition: the blank row for the original we just added exists
        self.assertTrue(
            SyndicatedPost.query(
                SyndicatedPost.original == 'http://original/newly-discovered',
                SyndicatedPost.syndication == None,
                ancestor=self.source.key).get())

        # inserting the real pair should succeed and return it
        r = SyndicatedPost.insert(self.source, 'http://silo/no-original',
                                  'http://original/newly-discovered')
        self.assertIsNotNone(r)
        self.assertEqual('http://original/newly-discovered', r.original)

        # make sure it's in NDB
        rs = SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/no-original',
            ancestor=self.source.key).fetch()
        self.assertEqual(1, len(rs))
        self.assertEqual('http://original/newly-discovered', rs[0].original)
        self.assertEqual('http://silo/no-original', rs[0].syndication)

        # and the blanks have been removed
        self.assertFalse(
            SyndicatedPost.query(
                SyndicatedPost.syndication == 'http://silo/no-original',
                SyndicatedPost.original == None,
                ancestor=self.source.key).get())

        self.assertFalse(
            SyndicatedPost.query(
                SyndicatedPost.original == 'http://original/newly-discovered',
                SyndicatedPost.syndication == None,
                ancestor=self.source.key).get())
# Example #7 (score: 0)
    def test_insert_auguments_existing(self):
        """A new original URL for a known syndication URL should be stored
        alongside the existing originals, not replace them."""
        rel = SyndicatedPost.insert(self.source, 'http://silo/post/url',
                                    'http://original/different/url')
        self.assertIsNotNone(rel)
        self.assertEqual('http://original/different/url', rel.original)

        # all three originals should now be present for the syndication URL
        stored = SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/post/url',
            ancestor=self.source.key).fetch()

        self.assertCountEqual([
            'http://original/post/url', 'http://original/another/post',
            'http://original/different/url'
        ], [sp.original for sp in stored])
def _process_syndication_urls(source, permalink, syndication_urls,
                              preexisting):
    """Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new :class:`models.SyndicatedPost`
  in the db.

  Args:
    source: a :class:`models.Source` subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list
      of syndication urls
    preexisting: a list of previously discovered :class:`models.SyndicatedPost`\ s

  Returns:
    dict mapping string syndication url to list of :class:`models.SyndicatedPost`\ s
  """
    by_url = {}
    # save the results (or lack thereof) to the db, and put them in a
    # map for immediate use
    for raw_url in syndication_urls:
        # silo-specific normalization (e.g. replace a facebook username
        # with its numeric id); None means it doesn't belong to this silo
        canonical = source.canonicalize_url(raw_url)
        if not canonical:
            continue

        # TODO: save future lookups by saving results for other sources too
        # (note: query the appropriate source subclass by author.domains,
        # rather than author.domain_urls)
        #
        # reuse the first previously discovered relationship for this pair,
        # saving a DB lookup
        relationship = None
        for existing in preexisting:
            if (existing.syndication == canonical and
                    existing.original == permalink):
                relationship = existing
                break
        if relationship is None:
            logging.debug('saving discovered relationship %s -> %s',
                          canonical, permalink)
            relationship = SyndicatedPost.insert(
                source, syndication=canonical, original=permalink)
        by_url.setdefault(canonical, []).append(relationship)

    return by_url
# Example #9 (score: 0)
  def test_insert_replaces_blanks(self):
    """Make sure we replace original=None with original=something
    when it is discovered.

    NOTE: the `== None` comparisons below are NDB query-filter expressions,
    not boolean tests -- do not rewrite them as `is None`.
    """

    # add a blank for the original too
    SyndicatedPost.insert_original_blank(
      self.source, 'http://original/newly-discovered')

    # precondition: both blank placeholder rows exist
    self.assertTrue(
      SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        SyndicatedPost.original == None, ancestor=self.source.key).get())

    self.assertTrue(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://original/newly-discovered',
        SyndicatedPost.syndication == None, ancestor=self.source.key).get())

    r = SyndicatedPost.insert(
        self.source, 'http://silo/no-original',
        'http://original/newly-discovered')
    self.assertIsNotNone(r)
    # assertEqual, not the deprecated assertEquals alias
    self.assertEqual('http://original/newly-discovered', r.original)

    # make sure it's in NDB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        ancestor=self.source.key
    ).fetch()
    self.assertEqual(1, len(rs))
    self.assertEqual('http://original/newly-discovered', rs[0].original)
    self.assertEqual('http://silo/no-original', rs[0].syndication)

    # and the blanks have been removed
    self.assertFalse(
      SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        SyndicatedPost.original == None, ancestor=self.source.key).get())

    self.assertFalse(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://original/newly-discovered',
        SyndicatedPost.syndication == None, ancestor=self.source.key).get())
def _process_syndication_urls(source, permalink, syndication_urls,
                              preexisting):
  """Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new :class:`models.SyndicatedPost`
  in the db.

  Args:
    source: a :class:`models.Source` subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list
      of syndication urls
    preexisting: a list of previously discovered :class:`models.SyndicatedPost`\ s

  Returns:
    dict mapping string syndication url to list of :class:`models.SyndicatedPost`\ s
  """
  discovered = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for raw_url in syndication_urls:
    # silo-specific normalization (e.g. replace a facebook username with
    # its numeric id); a falsy result means the URL isn't for this silo
    canonical = source.canonicalize_url(raw_url)
    if not canonical:
      continue

    # TODO: save future lookups by saving results for other sources too (note:
    # query the appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    #
    # reuse the first previously discovered relationship for this pair to
    # save a DB lookup
    relationship = None
    for existing in preexisting:
      if existing.syndication == canonical and existing.original == permalink:
        relationship = existing
        break
    if relationship is None:
      logging.debug('saving discovered relationship %s -> %s', canonical,
                    permalink)
      relationship = SyndicatedPost.insert(
        source, syndication=canonical, original=permalink)
    discovered.setdefault(canonical, []).append(relationship)

  return discovered
# Example #11 (score: 0)
  def test_insert_auguments_existing(self):
    """Make sure we add newly discovered urls for a given syndication url,
    rather than overwrite them
    """
    r = SyndicatedPost.insert(
        self.source, 'http://silo/post/url',
        'http://original/different/url')
    self.assertIsNotNone(r)
    # assertEqual, not the deprecated assertEquals alias
    self.assertEqual('http://original/different/url', r.original)

    # make sure they're both in the DB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/post/url',
        ancestor=self.source.key
    ).fetch()

    # NOTE(review): assertItemsEqual is Python 2 unittest API; keep while
    # the codebase runs on py2 (assertCountEqual is the py3 equivalent)
    self.assertItemsEqual(['http://original/post/url',
                           'http://original/another/post',
                           'http://original/different/url'],
                          [rel.original for rel in rs])
# Example #12 (score: 0)
    def test_retry(self):
        """POST /retry should reset a complete response to 'new', pull in
        originals from SyndicatedPost, re-enqueue a propagate task, and
        evict the cached webmention endpoint.
        """
        source = self.sources[0]
        source.domain_urls = ['http://orig']
        # a recent refetch time lets the final assertion verify that retry
        # does not trigger another h-feed refetch
        source.last_hfeed_refetch = last_hfeed_refetch = testutil.NOW - timedelta(
            minutes=1)
        source.put()

        resp = self.responses[0]
        resp.status = 'complete'
        resp.unsent = ['http://unsent']
        resp.sent = ['http://sent']
        resp.error = ['http://error']
        resp.failed = ['http://failed']
        resp.skipped = ['https://skipped']

        # SyndicatedPost with new target URLs
        resp.activities_json = [
            json_dumps({'object': {
                'url': 'https://fa.ke/1'
            }}),
            json_dumps({
                'url': 'https://fa.ke/2',
                'object': {
                    'unused': 'ok'
                }
            }),
            json_dumps({'url': 'https://fa.ke/3'}),
        ]
        resp.put()
        # map each silo URL above to an original, so retry can merge the
        # originals into unsent below
        SyndicatedPost.insert(source, 'https://fa.ke/1', 'https://orig/1')
        SyndicatedPost.insert(source, 'https://fa.ke/2', 'http://orig/2')
        SyndicatedPost.insert(source, 'https://fa.ke/3', 'http://orig/3')

        # expect the propagate task to be re-enqueued with this response key
        key = resp.key.urlsafe().decode()
        self.expect_task('propagate', response_key=key)
        self.mox.ReplayAll()

        # cached webmention endpoint
        util.webmention_endpoint_cache['W https skipped /'] = 'asdf'

        response = self.client.post('/retry', data={'key': key})
        self.assertEqual(302, response.status_code)
        self.assertEqual(self.source_bridgy_url, response.headers['Location'])

        # status and URLs should be refreshed
        got = resp.key.get()
        self.assertEqual('new', got.status)
        self.assertCountEqual([
            'http://unsent/', 'http://sent/', 'https://skipped/',
            'http://error/', 'http://failed/', 'https://orig/1',
            'http://orig/2', 'http://orig/3'
        ], got.unsent)
        for field in got.sent, got.skipped, got.error, got.failed:
            self.assertEqual([], field)

        # webmention endpoints for URL domains should be refreshed
        self.assertNotIn('W https skipped /', util.webmention_endpoint_cache)

        # shouldn't have refetched h-feed
        self.assertEqual(last_hfeed_refetch,
                         source.key.get().last_hfeed_refetch)