def get_archive(src_url, key, preferred_region):
    """Return an S3 location for an archive derived from ``src_url``.

    sub-dir: hg.mozilla.org supports archives of sub directories within a
    repository. This flexibility allows for creating archives of only a
    portion of what would normally be an entire repo archive.

    logic flow: if there is already a key within s3, a re-direct link is
    given for the s3 location. If the key does not exist, download the
    archive from src url, upload it to s3 for each region supported and
    return all uploaded s3 url locations. When the key does not exist, the
    remaining work will be assigned to a celery background task with a url
    location returned immediately for obtaining task state updates.

    :param src_url: url the archive is fetched from when not already in s3
    :param key: s3 key the archive is (or will be) stored under
    :param preferred_region: region used to sign the GET url when it is one
        of the configured buckets; otherwise a configured region is chosen
        at random
    :returns: a flask redirect to a signed s3 GET url when the key exists;
        otherwise ``({}, 202, {'Location': ...})`` pointing at the
        task-status endpoint, or ``({}, 500)`` if the celery task could not
        be created
    """
    buckets = current_app.config['ARCHIVER_S3_BUCKETS']
    # list() so indexing works on both py2 and py3 — on py3, dict.keys()
    # returns a non-subscriptable view, so buckets.keys()[...] would raise
    regions = list(buckets)
    random_region = regions[randint(0, len(regions) - 1)]
    # use preferred region if available otherwise choose a valid one at random
    region = preferred_region if preferred_region and preferred_region in buckets else random_region
    bucket = buckets[region]
    s3 = current_app.aws.connect_to('s3', region)
    session = current_app.db.session('relengapi')

    # first, see if the key exists
    if not s3.get_bucket(bucket).get_key(key):
        task_id = key.replace('/', '_')  # keep things simple and avoid slashes in task url
        # can't use unique support:
        # api.pub.build.mozilla.org/docs/development/databases/#unique-row-support-get-or-create
        # because we want to know when the row doesn't exist before creating it
        tracker = tables.ArchiverTask.query.filter(tables.ArchiverTask.task_id == task_id).first()
        if tracker and tracker.state in FINISHED_STATES:
            log = logger.bind(archiver_task=task_id, archiver_task_state=tracker.state)
            log.info('Task tracker: {} exists but finished with state: '
                     '{}'.format(task_id, tracker.state))
            # remove tracker and try celery task again
            delete_tracker(tracker)
            tracker = None
        if not tracker:
            log = logger.bind(archiver_task=task_id)
            log.info("Creating new celery task and task tracker for: {}".format(task_id))
            task = create_and_upload_archive.apply_async(args=[src_url, key], task_id=task_id)
            if task and task.id:
                pending_expires_at = now() + datetime.timedelta(seconds=PENDING_EXPIRES_IN)
                session.add(tables.ArchiverTask(task_id=task.id, s3_key=key, created_at=now(),
                                                pending_expires_at=pending_expires_at,
                                                src_url=src_url, state="PENDING"))
                session.commit()
            else:
                # celery did not hand back a task; surface a server error
                return {}, 500
        return {}, 202, {'Location': url_for('archiver.task_status', task_id=task_id)}

    logger.info("generating GET URL to {}, expires in {}s".format(key, GET_EXPIRES_IN))
    # return 302 pointing to s3 url with archive
    signed_url = s3.generate_url(
        method='GET', expires_in=GET_EXPIRES_IN, bucket=bucket, key=key
    )
    return redirect(signed_url)
def create_fake_tracker_row(app, id, s3_key='key', created_at=None, pending_expires_at=None,
                            src_url='https://foo.com', state="PENDING"):
    """Insert an ArchiverTask row for tests, filling in frozen-time defaults.

    Any field not supplied defaults to a fixed timestamp (created_at) or
    that timestamp plus 60 seconds (pending_expires_at).
    """
    frozen_now = datetime.datetime(2015, 7, 14, 23, 19, 42, tzinfo=pytz.UTC)  # freeze time
    created_at = created_at or frozen_now
    pending_expires_at = pending_expires_at or frozen_now + datetime.timedelta(seconds=60)

    row = tables.ArchiverTask(task_id=id, s3_key=s3_key, created_at=created_at,
                              pending_expires_at=pending_expires_at, src_url=src_url,
                              state=state)
    session = app.db.session(tables.DB_DECLARATIVE_BASE)
    session.add(row)
    session.commit()
def test_tracker_is_deleted_when_task_status_shows_task_complete(app, client):
    """A finished celery task should cause its tracker row to be removed."""
    with app.app_context():
        task_id = 'foo'
        frozen_now = datetime.datetime(2015, 7, 14, 23, 19, 42, tzinfo=pytz.UTC)  # freeze time
        db_session = app.db.session(tables.DB_DECLARATIVE_BASE)
        db_session.add(tables.ArchiverTask(task_id=task_id, s3_key='key',
                                           created_at=frozen_now,
                                           pending_expires_at=frozen_now + datetime.timedelta(seconds=60),
                                           src_url='https://foo.com', state="PENDING"))
        db_session.commit()

        # make the celery task report success, then poll the status endpoint
        with mock.patch("relengapi.blueprints.archiver.create_and_upload_archive") as caua:
            caua.AsyncResult.return_value = fake_successful_task_status()
            client.get('/archiver/status/{task_id}'.format(task_id=task_id))
            tracker = tables.ArchiverTask.query.filter(
                tables.ArchiverTask.task_id == task_id).first()
            eq_(tracker, None, "tracker was not deleted even though celery task completed.")