Example #1
0
  def test_query_by_syndication_url(self):
    """Exercise the query_by_syndication helper for both a syndication
    URL with a stored original and one stored as a blank placeholder."""
    # A syndication URL with a known original resolves to that original.
    found = SyndicatedPost.query_by_syndication(self.source,
                                                'http://silo/post/url')
    self.assertIsNotNone(found)
    self.assertEquals('http://original/post/url', found.original)

    # A blank record: the lookup still succeeds, but original is None.
    blank = SyndicatedPost.query_by_syndication(self.source,
                                                'http://silo/no-original')
    self.assertIsNotNone(blank)
    self.assertIsNone(blank.original)
def _posse_post_discovery(source, activity, author_url, syndication_url,
                          fetch_hfeed):
  """Performs the actual meat of the posse-post-discovery. It was split
  out from discover() so that it can be done inside of a transaction.

  Args:
    source: models.Source subclass
    activity: activity dict
    author_url: author's url configured in their silo profile
    syndication_url: url of the syndicated copy for which we are
                     trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
                 author's feed if we don't have a previously stored
                 relationship.

  Returns:
    the activity, updated with original post urls if any are found
  """
  logging.info(
      'starting posse post discovery with author %s and syndicated %s',
      author_url, syndication_url)

  relationship = SyndicatedPost.query_by_syndication(source, syndication_url)
  if not relationship and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's
    # h-feed to see if we can find it.
    results = _process_author(source, author_url)
    # dict.get already defaults to None; no need to pass it explicitly
    relationship = results.get(syndication_url)

  if not relationship:
    # No relationship was found. Remember that we've seen this
    # syndicated post (by storing a blank entry) to avoid refetching
    # and reparsing the h-feed every time it comes up.
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    SyndicatedPost.get_or_insert_by_syndication_url(
        source, syndication_url, None)
    return activity

  logging.debug('posse post discovery found relationship %s -> %s',
                syndication_url, relationship.original)

  if relationship.original:
    # attach the original url to the activity object (falling back to
    # the activity itself when there is no nested 'object')
    obj = activity.get('object') or activity
    obj.setdefault('upstreamDuplicates', []).append(relationship.original)

  return activity
    def test_additional_requests_do_not_require_rework(self):
        """Test that original post discovery fetches and stores all entries up
    front so that it does not have to reparse the author's h-feed for
    every new post. Test that original post discovery does the reverse
    lookup to scan author's h-feed for rel=syndication links
    """
        # Give each activity a distinct syndicated url and content with no
        # backlinks, so discovery must rely on the author's h-feed.
        for idx, activity in enumerate(self.activities):
            activity['object']['content'] = 'post content without backlinks'
            activity['object']['url'] = 'http://fa.ke/post/url%d' % (idx + 1)

        # h-feed listing three permalinks; expected to be fetched once up
        # front and once more later (see the third activity below).
        author_feed = """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink1"></a>
      </div>
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink2"></a>
      </div>
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink3"></a>
      </div>
    </html>"""

        source = self.sources[0]
        source.domain_urls = ['http://author']

        self.expect_requests_get('http://author', author_feed)

        # first post is syndicated
        self.expect_requests_get(
            'http://author/post/permalink1', """
    <div class="h-entry">
      <a class="u-url" href="http://author/post/permalink1"></a>
      <a class="u-syndication" href="http://fa.ke/post/url1"></a>
    </div>""").InAnyOrder()

        # second post is syndicated
        self.expect_requests_get(
            'http://author/post/permalink2', """
    <div class="h-entry">
      <a class="u-url" href="http://author/post/permalink2"></a>
      <a class="u-syndication" href="http://fa.ke/post/url2"></a>
    </div>""").InAnyOrder()

        # third post is not syndicated
        self.expect_requests_get(
            'http://author/post/permalink3', """
    <div class="h-entry">
      <a class="u-url" href="http://author/post/permalink3"></a>
    </div>""").InAnyOrder()

        # the second activity lookup should not make any HTTP requests

        # the third activity lookup will fetch the author's h-feed one more time
        self.expect_requests_get('http://author', author_feed).InAnyOrder()

        self.mox.ReplayAll()

        # first activity should trigger all the lookups and storage
        original_post_discovery.discover(source, self.activities[0])

        self.assertEquals(['http://author/post/permalink1'],
                          self.activities[0]['object']['upstreamDuplicates'])

        # make sure things are where we want them
        # NOTE(review): the stored syndication urls are queried as https://
        # while the activity urls above are http:// — presumably discovery
        # normalizes the scheme when storing; confirm against the
        # implementation.
        r = SyndicatedPost.query_by_original(source,
                                             'http://author/post/permalink1')
        self.assertEquals('https://fa.ke/post/url1', r.syndication)
        r = SyndicatedPost.query_by_syndication(source,
                                                'https://fa.ke/post/url1')
        self.assertEquals('http://author/post/permalink1', r.original)

        r = SyndicatedPost.query_by_original(source,
                                             'http://author/post/permalink2')
        self.assertEquals('https://fa.ke/post/url2', r.syndication)
        r = SyndicatedPost.query_by_syndication(source,
                                                'https://fa.ke/post/url2')
        self.assertEquals('http://author/post/permalink2', r.original)

        # permalink3 has no syndication link, stored with syndication=None
        r = SyndicatedPost.query_by_original(source,
                                             'http://author/post/permalink3')
        self.assertEquals(None, r.syndication)

        # second lookup should require no additional HTTP requests.
        # the second syndicated post should be linked up to the second permalink.
        original_post_discovery.discover(source, self.activities[1])
        self.assertEquals(['http://author/post/permalink2'],
                          self.activities[1]['object']['upstreamDuplicates'])

        # third activity lookup.
        # since we didn't find a back-link for the third syndicated post,
        # it should fetch the author's feed again, but seeing no new
        # posts, it should not follow any of the permalinks

        original_post_discovery.discover(source, self.activities[2])
        # should have found no new syndication link
        self.assertNotIn('upstreamDuplicates', self.activities[2]['object'])

        # should have saved a blank to prevent subsequent checks of this
        # syndicated post from fetching the h-feed again
        r = SyndicatedPost.query_by_syndication(source,
                                                'https://fa.ke/post/url3')
        self.assertEquals(None, r.original)

        # confirm that we do not fetch the h-feed again for the same
        # syndicated post
        original_post_discovery.discover(source, self.activities[2])
        # should be no new syndication link
        self.assertNotIn('upstreamDuplicates', self.activities[2]['object'])