def testPurgeOldTweetsDeletedLessThanExpected(
            self, estimateNumTweetsToDeleteMock, queryCandidateRowsMock,
            deleteRowsMock):
        """Delete mock reports fewer rows than requested for one batch.

        The estimate is three full batches; the delete mock reports a full
        batch, half a batch, then a full batch. The returned total must be
        the sum of the reported counts, with four candidate queries and
        three delete calls observed.
        """
        batchSize = purge_old_tweets._MAX_DELETE_BATCH_SIZE
        expectedEstimate = batchSize * 3

        estimateNumTweetsToDeleteMock.return_value = expectedEstimate

        # Shared iterator, so successive queries hand out successive uids
        candidateUids = iter(xrange(expectedEstimate))

        def fetchCandidates(limit, **kwargs):
            return tuple(itertools.islice(candidateUids, limit))

        queryCandidateRowsMock.side_effect = fetchCandidates

        # Count reported by each successive delete call
        reportedCounts = [batchSize, batchSize // 2, batchSize]
        deleteRowsMock.side_effect = iter(reportedCounts)

        # Execute
        totalDeleted = purge_old_tweets.purgeOldTweets(thresholdDays=90)

        # Verify
        self.assertEqual(totalDeleted, sum(reportedCounts))
        self.assertEqual(estimateNumTweetsToDeleteMock.call_count, 1)
        self.assertEqual(queryCandidateRowsMock.call_count, 4)
        self.assertEqual(deleteRowsMock.call_count, 3)
    def testPurgeOldTweetsStopAtEstimated(self, estimateNumTweetsToDeleteMock,
                                          queryCandidateRowsMock,
                                          deleteRowsMock):
        """Purge must stop once the estimated number of rows is deleted.

        One more candidate uid than the estimate is made available; after
        the purge, exactly that one uid must remain unconsumed.
        """
        expectedEstimate = purge_old_tweets._MAX_DELETE_BATCH_SIZE * 2
        estimateNumTweetsToDeleteMock.return_value = expectedEstimate

        # One surplus uid beyond the estimate
        candidateUids = iter(xrange(expectedEstimate + 1))

        def fetchCandidates(limit, **kwargs):
            return tuple(itertools.islice(candidateUids, limit))

        def deleteAll(uids, **kwargs):
            # Report every requested uid as deleted
            return len(uids)

        queryCandidateRowsMock.side_effect = fetchCandidates
        deleteRowsMock.side_effect = deleteAll

        # Execute
        totalDeleted = purge_old_tweets.purgeOldTweets(thresholdDays=90)

        # Verify
        self.assertEqual(totalDeleted, expectedEstimate)
        self.assertEqual(estimateNumTweetsToDeleteMock.call_count, 1)
        self.assertEqual(queryCandidateRowsMock.call_count, 2)
        self.assertEqual(deleteRowsMock.call_count, 2)

        # Make sure it didn't try to retrieve candidates beyond estimated
        # number: only the surplus uid may be left in the iterator
        leftoverUids = tuple(candidateUids)
        self.assertEqual(len(leftoverUids), 1)
  def testPurgeOldTweetsDeletedLessThanExpected(self,
                                                estimateNumTweetsToDeleteMock,
                                                queryCandidateRowsMock,
                                                deleteRowsMock):
    """The middle delete call reports only half a batch as deleted.

    With an estimate of three full batches, the purge result must equal the
    sum of the counts reported by the delete mock; four candidate queries
    and three delete calls are expected.
    """
    maxBatch = purge_old_tweets._MAX_DELETE_BATCH_SIZE
    numEstimated = maxBatch * 3
    estimateNumTweetsToDeleteMock.return_value = numEstimated

    # Successive queries draw successive uids from this one iterator
    uidSource = iter(xrange(numEstimated))

    def takeCandidates(limit, **kwargs):
      return tuple(itertools.islice(uidSource, limit))

    queryCandidateRowsMock.side_effect = takeCandidates

    # Full batch, half batch, full batch
    countsPerDelete = [maxBatch, maxBatch // 2, maxBatch]
    deleteRowsMock.side_effect = iter(countsPerDelete)

    # Execute
    numDeleted = purge_old_tweets.purgeOldTweets(thresholdDays=90)

    # Verify
    self.assertEqual(numDeleted, sum(countsPerDelete))
    self.assertEqual(estimateNumTweetsToDeleteMock.call_count, 1)
    self.assertEqual(queryCandidateRowsMock.call_count, 4)
    self.assertEqual(deleteRowsMock.call_count, 3)
  def testPurgeOldTweetsStopAtEstimated(
      self,
      estimateNumTweetsToDeleteMock,
      queryCandidateRowsMock,
      deleteRowsMock):
    """No candidates are fetched past the estimated row count.

    The uid iterator holds one element more than the estimate; that single
    element must still be in the iterator when the purge returns.
    """
    numEstimated = purge_old_tweets._MAX_DELETE_BATCH_SIZE * 2
    estimateNumTweetsToDeleteMock.return_value = numEstimated

    # Estimate plus one extra uid that must stay untouched
    uidSource = iter(xrange(numEstimated + 1))

    def takeCandidates(limit, **kwargs):
      return tuple(itertools.islice(uidSource, limit))

    def reportAllDeleted(uids, **kwargs):
      return len(uids)

    queryCandidateRowsMock.side_effect = takeCandidates
    deleteRowsMock.side_effect = reportAllDeleted

    # Execute
    numDeleted = purge_old_tweets.purgeOldTweets(thresholdDays=90)

    # Verify
    self.assertEqual(numDeleted, numEstimated)
    self.assertEqual(estimateNumTweetsToDeleteMock.call_count, 1)
    self.assertEqual(queryCandidateRowsMock.call_count, 2)
    self.assertEqual(deleteRowsMock.call_count, 2)

    # Make sure it didn't try to retrieve candidates beyond estimated number
    unconsumed = tuple(uidSource)
    self.assertEqual(len(unconsumed), 1)
  def testPurgeOldTweetsFewerCandidatesThanExpected(
      self,
      estimateNumTweetsToDeleteMock,
      queryCandidateRowsMock,
      deleteRowsMock):
    """Only half as many candidate uids exist as were estimated.

    The purge result must equal the number of uids actually available, with
    two candidate queries and a single delete call observed.
    """
    numEstimated = purge_old_tweets._MAX_DELETE_BATCH_SIZE * 2
    numAvailable = numEstimated // 2

    estimateNumTweetsToDeleteMock.return_value = numEstimated

    # The candidate source runs dry after half the estimated count
    uidSource = iter(xrange(numAvailable))

    def takeCandidates(limit, **kwargs):
      return tuple(itertools.islice(uidSource, limit))

    def reportAllDeleted(uids, **kwargs):
      return len(uids)

    queryCandidateRowsMock.side_effect = takeCandidates
    deleteRowsMock.side_effect = reportAllDeleted

    # Execute
    numDeleted = purge_old_tweets.purgeOldTweets(thresholdDays=90)

    # Verify
    self.assertEqual(numDeleted, numAvailable)
    self.assertEqual(estimateNumTweetsToDeleteMock.call_count, 1)
    self.assertEqual(queryCandidateRowsMock.call_count, 2)
    self.assertEqual(deleteRowsMock.call_count, 1)
Example #6
0
    def testPurgeOldTweets(self):
        """Purge against a temporary repository removes only old tweets.

        Inserts two rows older than the threshold and three younger ones,
        runs the purge, then checks the returned count and the uids that
        remain in the tweets table.
        """
        gcThresholdDays = 90
        now = datetime.utcnow()

        def makeRow(ageInDays):
            # Tweet row created `ageInDays` days before `now`
            return dict(uid=uuid.uuid1().hex,
                        created_at=now - timedelta(days=ageInDays),
                        retweet=False,
                        lang="en-us")

        # Rows past the threshold: expected to be purged
        oldRows = [makeRow(gcThresholdDays + extraDays)
                   for extraDays in (1, 2)]

        # Rows within the threshold: expected to survive
        youngRows = [makeRow(0)]
        youngRows.extend(makeRow(gcThresholdDays - fewerDays)
                         for fewerDays in (1, 2))

        allRows = oldRows + youngRows

        # Patch collectorsdb config to use a temporary database
        with collectorsdb_test_utils.ManagedTempRepository("purgetweets"):
            engine = collectorsdb.engineFactory()

            insertResult = engine.execute(
                schema.twitterTweets.insert(),  # pylint: disable=E1120
                allRows)

            self.assertEqual(insertResult.rowcount, len(allRows))

            # Execute
            numDeleted = purge_old_tweets.purgeOldTweets(gcThresholdDays)

            # Verify
            self.assertEqual(numDeleted, len(oldRows))

            # Verify that only the old tweets got purged
            uidQuery = sql.select([schema.twitterTweets.c.uid])
            remainingRows = engine.execute(uidQuery).fetchall()

            self.assertEqual(len(remainingRows), len(youngRows))

            expectedUids = [row["uid"] for row in youngRows]
            actualUids = [row.uid for row in remainingRows]  # pylint: disable=E1101
            self.assertItemsEqual(expectedUids, actualUids)
    def testPurgeOldTweetsWithoutOldRecords(self,
                                            estimateNumTweetsToDeleteMock,
                                            queryCandidateRowsMock,
                                            deleteRowsMock):
        """With an estimate of zero, nothing is queried or deleted."""
        estimateNumTweetsToDeleteMock.return_value = 0

        # These should not be called in this test; an empty side_effect
        # sequence leaves them nothing to return
        for unexpectedMock in (queryCandidateRowsMock, deleteRowsMock):
            unexpectedMock.side_effect = []

        # Execute
        totalDeleted = purge_old_tweets.purgeOldTweets(thresholdDays=90)

        # Verify
        self.assertEqual(totalDeleted, 0)
        self.assertEqual(estimateNumTweetsToDeleteMock.call_count, 1)
        self.assertEqual(queryCandidateRowsMock.call_count, 0)
        self.assertEqual(deleteRowsMock.call_count, 0)
  def testPurgeOldTweetsWithoutOldRecords(self,
                                          estimateNumTweetsToDeleteMock,
                                          queryCandidateRowsMock,
                                          deleteRowsMock):
    """A zero estimate must yield zero deletions and no query/delete calls."""
    # These should not be called in this test
    deleteRowsMock.side_effect = []
    queryCandidateRowsMock.side_effect = []

    estimateNumTweetsToDeleteMock.return_value = 0

    # Execute
    purgeResult = purge_old_tweets.purgeOldTweets(thresholdDays=90)

    # Verify
    self.assertEqual(purgeResult, 0)
    self.assertEqual(estimateNumTweetsToDeleteMock.call_count, 1)
    self.assertEqual(queryCandidateRowsMock.call_count, 0)
    self.assertEqual(deleteRowsMock.call_count, 0)
    def testPurgeOldTweetsFewerCandidatesThanExpected(
            self, estimateNumTweetsToDeleteMock, queryCandidateRowsMock,
            deleteRowsMock):
        """Candidate query yields only half of the estimated rows.

        The estimate is two full batches but only one batch worth of uids
        is available. The purge must report that smaller number, with two
        candidate queries and one delete call observed.
        """
        expectedEstimate = purge_old_tweets._MAX_DELETE_BATCH_SIZE * 2
        availableCount = expectedEstimate // 2

        estimateNumTweetsToDeleteMock.return_value = expectedEstimate

        # Fewer uids than estimated
        candidateUids = iter(xrange(availableCount))

        def fetchCandidates(limit, **kwargs):
            return tuple(itertools.islice(candidateUids, limit))

        def deleteAll(uids, **kwargs):
            # Report every requested uid as deleted
            return len(uids)

        queryCandidateRowsMock.side_effect = fetchCandidates
        deleteRowsMock.side_effect = deleteAll

        # Execute
        totalDeleted = purge_old_tweets.purgeOldTweets(thresholdDays=90)

        # Verify
        self.assertEqual(totalDeleted, availableCount)
        self.assertEqual(estimateNumTweetsToDeleteMock.call_count, 1)
        self.assertEqual(queryCandidateRowsMock.call_count, 2)
        self.assertEqual(deleteRowsMock.call_count, 1)
  def testPurgeOldTweets(self):
    """Purge against a temporary repository removes only the old tweets.

    Inserts two rows older than the threshold and three rows younger than
    it, runs the purge, and verifies both the returned deletion count and
    the set of uids left in the tweets table.
    """
    gcThresholdDays = 90

    now = datetime.utcnow()

    # Rows older than the threshold: expected to be purged
    oldRows = [
      dict(
        uid=uuid.uuid1().hex,
        created_at=now - timedelta(days=gcThresholdDays + 1),
        retweet=False,
        lang="en-us"
      ),

      dict(
        uid=uuid.uuid1().hex,
        created_at=now - timedelta(days=gcThresholdDays + 2),
        retweet=False,
        lang="en-us"
      ),
    ]

    # Rows younger than the threshold: expected to survive the purge
    youngRows = [
      dict(
        uid=uuid.uuid1().hex,
        created_at=now,
        retweet=False,
        lang="en-us"
      ),

      dict(
        uid=uuid.uuid1().hex,
        created_at=now - timedelta(days=gcThresholdDays - 1),
        retweet=False,
        lang="en-us"
      ),

      dict(
        uid=uuid.uuid1().hex,
        created_at=now - timedelta(days=gcThresholdDays - 2),
        retweet=False,
        lang="en-us"
      ),
    ]

    allRows = oldRows + youngRows

    # Patch collectorsdb config to use a temporary database name
    with collectorsdb_test_utils.ManagedTempRepository("purgetweets"):
      engine = collectorsdb.engineFactory()

      numInserted = engine.execute(
        schema.twitterTweets.insert(),  # pylint: disable=E1120
        allRows
      ).rowcount

      self.assertEqual(numInserted, len(allRows))

      # Execute
      numDeleted = purge_old_tweets.purgeOldTweets(gcThresholdDays)

      # Verify the reported deletion count, not just the surviving rows
      self.assertEqual(numDeleted, len(oldRows))

      # Verify that only the old tweets got purged
      rows = engine.execute(
        sql.select([schema.twitterTweets.c.uid])).fetchall()

      self.assertEqual(len(rows), len(youngRows))

      self.assertItemsEqual([row["uid"] for row in youngRows],
                            [row.uid for row in rows])