Example #1
 def test_incremental_crawl_failure(self, bucket_mock, conn_mock, crawl_mock):
     def failure_feed(url):
         if '/feed' in url:
             return {'notdata': [{'ooga': 'booga'}]}
     self.facebook_patch = patch(
         'targetshare.integration.facebook.client.urllib2.urlopen',
         crawl_mock(1, 250, failure_feed)
     )
     self.facebook_patch.start()
     the_past = epoch.from_date(timezone.now() - timedelta(days=365))
     # Test runs in under a second typically, so we need to be slightly
     # behind present time, so that we could detect fbm.incremental_epoch
     # being (incorrectly) updated; here it should remain unchanged
     present = epoch.from_date(timezone.now() - timedelta(seconds=30))
     fbm = models.FBSyncMap.items.create(
         fbid_primary=self.fbid, fbid_secondary=self.fbid, token=self.token.token,
         back_filled=False, back_fill_epoch=the_past,
         incremental_epoch=present,
         status=models.FBSyncMap.COMPLETE, bucket='test_bucket_0'
     )
     existing_key = Mock()
     existing_key.get_contents_as_string.return_value = '{"updated": 1, "data": [{"test": "testing"}]}'
     bucket_mock.return_value = existing_key
     conn_mock.return_value = s3_feed.BucketManager()
     tasks.incremental_crawl(fbm.fbid_primary, fbm.fbid_secondary)
     new_fbm = models.FBSyncMap.items.get_item(
         fbid_primary=self.fbid, fbid_secondary=self.fbid)
     self.assertEqual(new_fbm.status, fbm.COMPLETE)
     self.assertEqual(int(new_fbm.incremental_epoch), present)
     self.assertFalse(existing_key.set_contents_from_string.called)
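Example #1 starts the urlopen patch with self.facebook_patch.start() but never stops it inside the test; presumably the test case's tearDown takes care of that. A minimal sketch of such a tearDown (the class name used here is an assumption):

def tearDown(self):
    # Stop the urlopen patch started in the test so it does not leak into
    # other tests; the class name TestFBSyncTasks is hypothetical.
    self.facebook_patch.stop()
    super(TestFBSyncTasks, self).tearDown()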
Example #2
 def test_incremental_crawl(self, bucket_mock, conn_mock):
     the_past = epoch.from_date(timezone.now() - timedelta(days=365))
     # Test runs in under a second typically, so we need to be slightly
     # behind present time, so that we can see fbm.incremental_epoch
     # get updated
     present = epoch.from_date(timezone.now() - timedelta(seconds=30))
     fbm = models.FBSyncMap.items.create(
         fbid_primary=self.fbid, fbid_secondary=self.fbid, token=self.token.token,
         back_filled=False, back_fill_epoch=the_past,
         incremental_epoch=present,
         status=models.FBSyncMap.COMPLETE, bucket='test_bucket_0'
     )
     existing_key = Mock()
     existing_key.data = {"updated": 1, "data": [{"test": "testing"}]}
     bucket_mock.return_value = existing_key
     conn_mock.return_value = s3_feed.BucketManager()
     tasks.incremental_crawl(fbm.fbid_primary, fbm.fbid_secondary)
     new_fbm = models.FBSyncMap.items.get_item(
         fbid_primary=self.fbid, fbid_secondary=self.fbid)
     self.assertEqual(new_fbm.status, fbm.COMPLETE)
     self.assertGreater(int(new_fbm.incremental_epoch), present)
     self.assertTrue(existing_key.extend_s3_data.called)
     self.assertSequenceEqual(
         existing_key.extend_s3_data.call_args_list[0][0],
         (False,)
     )
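For reference, mock records each invocation on a Mock attribute, and call_args_list[0][0] is the positional-argument tuple of the first call, which is what the assertSequenceEqual above compares against the tuple (False,). A minimal, self-contained illustration:

from mock import Mock

checked = Mock()
checked.extend_s3_data(False)
# call_args_list holds one entry per call; [0][0] is the positional args
# of the first call.
assert checked.extend_s3_data.call_args_list[0][0] == (False,)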
Example #3
 def save_to_s3(self):
     """ Commits the current populated FeedKey to s3 """
     self.data["updated"] = epoch.from_date(timezone.now())
     with TemporaryFile() as tmp_file:
         json.dump(self.data, tmp_file)
         tmp_file.seek(0)
         self.set_contents_from_file(tmp_file)
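save_to_s3 above follows a common boto pattern: serialize to a temporary file, rewind, and hand the open file to the key. A minimal standalone sketch of the same pattern, assuming key is a boto S3 Key; upload_json is a hypothetical helper, not part of this project:

import json
from tempfile import TemporaryFile

def upload_json(key, payload):
    # Dump the payload to a temp file, rewind, and let boto stream it to S3.
    with TemporaryFile() as tmp:
        json.dump(payload, tmp)
        tmp.seek(0)
        key.set_contents_from_file(tmp)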
Example #4
 def test_crawl_comments_and_likes(self, bucket_mock, conn_mock, fb_mock):
     the_past = epoch.from_date(timezone.now() - timedelta(days=365))
     fbm = models.FBSyncMap.items.create(
         fbid_primary=self.fbid, fbid_secondary=self.fbid, token=self.token.token,
         back_filled=False, back_fill_epoch=the_past,
         incremental_epoch=epoch.from_date(timezone.now()),
         status=models.FBSyncMap.COMMENT_CRAWL, bucket='test_bucket_0'
     )
     fb_mock.side_effect = [
         {"data": [
             {
                 "id": "10151910724132946_11479371",
                 "from": {
                     "name": "Alex Tevlin",
                     "id": "794333711"
                 },
                 "message": "Should've stayed at Fulham to begin with!",
                 "can_remove": False,
                 "created_time": "2013-12-20T16:25:26+0000",
                 "like_count": 0,
                 "user_likes": False
             },
         ]},
         {"data": [
             {
                 "id": "100002382106641",
                 "name": "Joseph Orozco"
             },
         ]},
     ]
     with open(os.path.join(DATA_PATH, 'user_feed.json')) as feed_file:
         user_feed = json.load(feed_file)
     existing_key = Mock()
     with open(os.path.join(DATA_PATH, 'user_feed.json')) as feed_file:
         existing_key.data = json.load(feed_file)
     bucket_mock.return_value = existing_key
     conn_mock.return_value = s3_feed.BucketManager()
     self.assertEqual(len(user_feed['data'][0]['comments']['data']), 1)
     self.assertEqual(len(user_feed['data'][0]['likes']['data']), 3)
     tasks.crawl_comments_and_likes(fbm.fbid_primary, fbm.fbid_secondary)
     self.assertEqual(len(existing_key.data['data'][0]['comments']['data']), 2)
     self.assertEqual(len(existing_key.data['data'][0]['likes']['data']), 4)
     fbm = models.FBSyncMap.items.get_item(fbid_primary=self.fbid,
                                           fbid_secondary=self.fbid)
     self.assertEqual(fbm.status, fbm.COMPLETE)
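Note that fb_mock.side_effect is assigned a list above, so successive calls to the patched Facebook client return the comments payload first and the likes payload second. A minimal illustration of that mock behaviour:

from mock import Mock

fb = Mock(side_effect=[{'data': ['comments']}, {'data': ['likes']}])
assert fb() == {'data': ['comments']}  # first call
assert fb() == {'data': ['likes']}     # second call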
Example #5
def incremental_crawl(self, primary, secondary):
    sync_map = models.FBSyncMap.items.get_item(
        fbid_primary=primary, fbid_secondary=secondary)
    logger.info('Starting incremental crawl of %s', sync_map.s3_key_name)
    sync_map.save_status(models.FBSyncMap.INCREMENTAL)
    try:
        bucket = S3_CONN.get_or_create_bucket(sync_map.bucket)
        s3_key, created = bucket.get_or_create_key(sync_map.s3_key_name)
        s3_key.retrieve_fb_feed(
            sync_map.fbid_secondary, sync_map.token,
            sync_map.incremental_epoch, epoch.from_date(timezone.now())
        )
    except (facebook.client.OAuthException):
        rvn_logger.info('Failed incremental crawl due to expired token for %s',
                        sync_map.s3_key_name)
        return
    except (ValueError, IOError):
        try:
            self.retry()
        except MaxRetriesExceededError:
            # We'll get 'em next time, boss.
            rvn_logger.info('Failed incremental crawl of %s', sync_map.s3_key_name)
    else:
        try:
            s3_key.crawl_pagination()
        except (facebook.client.OAuthException):
            rvn_logger.info('Failed incremental crawl due to expired token for %s',
                            sync_map.s3_key_name)
            return

        if 'data' in s3_key.data:
            # If we have new data, save it and then hand off to
            # crawl_comments_and_likes; if not, we'll pick up that
            # incremental data on a later run
            try:
                s3_key.extend_s3_data(False)
            except HTTPException as exc:
                self.retry(exc=exc)
            sync_map.incremental_epoch = epoch.from_date(timezone.now())
            sync_map.save()
            crawl_comments_and_likes.apply_async(
                args=[sync_map.fbid_primary, sync_map.fbid_secondary],
                countdown=DELAY_INCREMENT
            )

    sync_map.save_status(models.FBSyncMap.COMPLETE)
    logger.info('Completed incremental crawl of %s', sync_map.s3_key_name)
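incremental_crawl calls self.retry() and catches MaxRetriesExceededError, so it is evidently registered as a bound Celery task. A minimal sketch of such a registration; the decorator and its arguments shown here are assumptions, not taken from this project:

from celery import shared_task

@shared_task(bind=True, default_retry_delay=300, max_retries=3)  # settings assumed
def incremental_crawl(self, primary, secondary):
    ...  # task body as above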
Example #6
 def test_back_fill_crawl(self, bucket_mock, conn_mock, crawl_mock):
     the_past = epoch.from_date(timezone.now() - timedelta(days=365))
     fbm = models.FBSyncMap.items.create(
         fbid_primary=self.fbid, fbid_secondary=self.fbid, token=self.token.token,
         back_filled=False, back_fill_epoch=the_past,
         incremental_epoch=epoch.from_date(timezone.now()),
         status=models.FBSyncMap.BACK_FILL, bucket='test_bucket_0'
     )
     existing_key = Mock()
     existing_key.data = {"updated": 1, "data": [{"test": "testing"}]}
     bucket_mock.return_value = existing_key
     conn_mock.return_value = s3_feed.BucketManager()
     tasks.back_fill_crawl(fbm.fbid_primary, fbm.fbid_secondary)
     fbm = models.FBSyncMap.items.get_item(
         fbid_primary=self.fbid, fbid_secondary=self.fbid)
     self.assertEqual(fbm.status, fbm.COMMENT_CRAWL)
     assert fbm.back_fill_epoch
     assert fbm.back_filled
     assert fbm.incremental_epoch
     assert crawl_mock.apply_async.called
     self.assertTrue(existing_key.extend_s3_data.called)
Example #7
def initial_crawl(self, primary, secondary):
    sync_map = models.FBSyncMap.items.get_item(
        fbid_primary=primary, fbid_secondary=secondary)
    logger.info('Starting initial crawl of %s', sync_map.s3_key_name)
    sync_map.save_status(models.FBSyncMap.INITIAL_CRAWL)
    past_epoch = epoch.from_date(timezone.now() - timedelta(days=365))
    now_epoch = epoch.from_date(timezone.now())
    try:
        bucket = S3_CONN.get_or_create_bucket(sync_map.bucket)
        s3_key, _ = bucket.get_or_create_key(sync_map.s3_key_name)
        s3_key.retrieve_fb_feed(
            sync_map.fbid_secondary, sync_map.token, past_epoch, now_epoch
        )
    except (facebook.client.OAuthException):
        rvn_logger.info('Failed initial crawl due to expired token for %s',
                        sync_map.s3_key_name)
        return
    except (ValueError, IOError, HTTPException):
        try:
            self.retry()
        except MaxRetriesExceededError:
            sync_map.save_status(models.FBSyncMap.WAITING)
            return

    s3_key.data['updated'] = now_epoch
    try:
        s3_key.save_to_s3()
    except HTTPException as exc:
        self.retry(exc=exc)

    sync_map.back_fill_epoch = past_epoch
    sync_map.incremental_epoch = now_epoch
    sync_map.save_status(models.FBSyncMap.PAGE_LIKES)
    retrieve_page_likes.apply_async(
        args=[sync_map.fbid_primary, sync_map.fbid_secondary],
        countdown=DELAY_INCREMENT
    )
    logger.info('Completed initial crawl of %s', sync_map.s3_key_name)
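The crawl window handed to Facebook is a pair of Unix timestamps produced by epoch.from_date. A minimal sketch of what that helper presumably does; this is an assumption about faraday.utils.epoch, not its actual source:

import calendar

def from_date(dt):
    # Convert a UTC datetime to integer seconds since the Unix epoch.
    return calendar.timegm(dt.utctimetuple())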
Example #8
 def extend_s3_data(self, append=True):
     """ Extends the data we have in S3, typically in incremental or
     back_fill jobs. Append flag lets you dictate if the new data ends up
     in front or in back of the existing data
     """
     with TemporaryFile() as s3_file, TemporaryFile() as json_file:
         self.get_contents_to_file(s3_file)
         s3_file.seek(0)
         full_data = json.load(s3_file)
         existing_data = full_data.setdefault("data", [])
         if append:
             existing_data.extend(self.data["data"])
             self.data = full_data
         else:
             self.data["data"].extend(existing_data)
         self.data["updated"] = epoch.from_date(timezone.now())
         json.dump(self.data, json_file)
         json_file.seek(0)
         self.set_contents_from_file(json_file)
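The effect of the append flag above is purely list ordering: with append=True the freshly crawled entries end up behind the entries already stored in S3, and with append=False they end up in front. In plain Python terms:

stored = [{'id': 1}, {'id': 2}]  # entries already in S3
fresh = [{'id': 3}]              # entries just crawled

assert stored + fresh == [{'id': 1}, {'id': 2}, {'id': 3}]  # append=True ordering
assert fresh + stored == [{'id': 3}, {'id': 1}, {'id': 2}]  # append=False ordering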
Example #9
import json
import urllib
from datetime import datetime, timedelta

import mock
from faraday.utils import epoch

from targetshare import models
from targetshare.tasks.integration import facebook

from .. import EdgeFlipTestCase


DEBUG_TOKEN_MOCK = json.dumps({
    'data': {
        'is_valid': True,
        'user_id': 100,
        'expires_at': epoch.from_date(datetime(2013, 5, 15, 12, 1, 1)),
    }
})

EXTEND_TOKEN_MOCK = urllib.urlencode([
    ('access_token', 'tok1'),
    ('expires', str(60 * 60 * 24 * 60)), # 60 days in seconds
])


class TestStoreOpenAuthToken(EdgeFlipTestCase):

    fixtures = ('test_data',)
    frozen_time = '2013-01-01'

    requests_patch = mock.patch('requests.get', **{'return_value.content': 'access_token=TOKZ'})
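The requests_patch above configures the replacement mock via keyword arguments, so after patching, requests.get(...).content yields the canned token string. The same nested-attribute configuration works on a bare Mock:

from mock import Mock

fake_get = Mock(**{'return_value.content': 'access_token=TOKZ'})
assert fake_get('https://example.invalid').content == 'access_token=TOKZ'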