Beispiel #1
0
class ForkHandler(InstanceHandler):
    """
    Triggers the webhook lambda if new repositories are found.
    """
    def __init__(self, event, context, **kwargs):
        """
        Reads the stage configuration from the event and wires up the
        collaborators used to discover repositories and trigger the webhook.

        The following keyword arguments may be given to inject dependencies;
        anything that is not supplied falls back to the default implementation:
        `dynamodb_handler`, `gogs_client`, `boto_handler`, `logger`.
        :param event: the event payload; must provide 'stage-variables'
        :param context: the context handed on to the base handler
        :param kwargs: optional dependency overrides
        """
        super(ForkHandler, self).__init__(event, context)

        env_label = 'Environment Vars'
        self.stage_vars = self.retrieve(self.event, 'stage-variables',
                                        'payload')
        stage_vars = self.stage_vars

        gogs_token = self.retrieve(stage_vars, 'gogs_token', env_label)
        self.gogs_url = self.retrieve(stage_vars, 'gogs_url', env_label)
        self.gogs_org = self.retrieve(stage_vars, 'gogs_org', env_label)
        self.from_email = self.retrieve(stage_vars, 'from_email', env_label)
        self.to_email = self.retrieve(stage_vars, 'to_email', env_label)
        self.stage = self.retrieve(stage_vars, 'stage', env_label)

        # optional settings with stage specific defaults
        default_db_name = '{}d43-catalog-in-progress'.format(
            self.stage_prefix())
        in_progress_db = self.retrieve_with_default(
            stage_vars, 'in_progress_db', default_db_name)
        default_webhook_name = '{}d43-catalog_webhook'.format(self.stage)
        self.catalog_webhook = self.retrieve_with_default(
            stage_vars, 'catalog_webhook_lambda', default_webhook_name)

        # injected dependencies win over the defaults
        if 'dynamodb_handler' not in kwargs:
            self.progress_table = DynamoDBHandler(
                in_progress_db)  # pragma: no cover
        else:
            self.progress_table = kwargs['dynamodb_handler']

        if 'gogs_client' not in kwargs:
            self.gogs_client = GogsClient  # pragma: no cover
        else:
            self.gogs_client = kwargs['gogs_client']

        if 'boto_handler' not in kwargs:
            self.boto = boto3  # pragma: no cover
        else:
            self.boto = kwargs['boto_handler']

        # the logger is only replaced when one is explicitly given
        if 'logger' in kwargs:
            self.logger = kwargs['logger']

        client = self.gogs_client
        self.gogs_api = client.GogsApi(self.gogs_url)
        self.gogs_auth = client.Token(gogs_token)

    def _run(self, **kwargs):
        """
        :param kwargs:
        :return:
        """
        client = self.boto.client("lambda")  # pragma: no cover
        repos = self.get_new_repos()  # pragma: no cover
        self._trigger_webhook(client, repos)  # pragma: no cover
        return True

    def _trigger_webhook(self, client, repos):
        """
        Triggers the webhook in each repo in the list
        :param client boto3.client('lambda'): the lambda client
        :param repos list: an array of repos
        :return:
        """
        if not repos or not len(repos):
            self.logger.info('No new repositories found')
            return
        for repo in repos:
            try:
                payload = self.make_hook_payload(repo)
            except Exception as e:
                self.logger.error(
                    'Failed to retrieve master branch for {0}: {1}'.format(
                        repo.full_name, e))
                continue
            try:
                self.logger.info('Simulating Webhook for {}'.format(
                    repo.full_name))
                client.invoke(FunctionName=self.catalog_webhook,
                              InvocationType='Event',
                              Payload=json.dumps(payload))
                time.sleep(.5)
            except Exception as e:
                self.logger.error('Failed to trigger webhook {0}: {1}'.format(
                    repo.full_name, e))
                continue

    def make_hook_payload(self, repo):
        """
        Generates a webhook payload for the repo
        :param repo:
        :return:
        """

        branch = self.gogs_api.get_branch(self.gogs_auth, self.gogs_org,
                                          repo.name, repo.default_branch)
        return {
            "stage-variables": self.event['stage-variables'],
            "context": self.event['context'],
            "body-json": {
                "after":
                branch.commit.id,
                "commits": [{
                    "id":
                    branch.commit.id,
                    "message":
                    branch.commit.message,
                    "timestamp":
                    branch.commit.timestamp,
                    "url":
                    '{0}/{1}/{2}/commit/{3}'.format(
                        self.gogs_url, self.gogs_org, repo.name, branch.commit.
                        id)  # branch.commit.url <-- not implemented yet
                }],
                "repository": {
                    "owner": {
                        "username": self.gogs_org
                    },
                    "name": repo.name
                }
            },
        }

    def get_new_repos(self):
        """
        Compares the organization repos with what's in progress
        and returns those that are new or updated.
        :return:
        """
        org_repos = self.gogs_api.get_user_repos(None, self.gogs_org)
        items = self.progress_table.query_items()

        new_repos = []
        for repo in org_repos:
            repo_name = repo.full_name.split("/")[-1]
            matching_item = self.__get_obj_in_array('repo_name', repo_name,
                                                    items)
            if not matching_item or ('dirty' in matching_item
                                     and matching_item['dirty']):
                new_repos.append(repo)
            else:
                # check if changed
                # TODO: the branch API is currently broken so this code won't run
                try:
                    branch = self.gogs_api.get_branch(None, self.gogs_org,
                                                      repo_name, 'master')
                    if branch:
                        commit_id = branch.commit.id[:10]
                        for item in items:
                            if item['repo_name'] == repo_name and item[
                                    'commit_id'] != commit_id:
                                new_repos.append(repo)
                except Exception as e:
                    # TRICKY: with the api broken this would create a lot of noise
                    # print('WARNING: failed to detect changes: {}'.format(e))
                    pass  # pragma: no cover

        return new_repos

    def __get_obj_in_array(self, key, value, array):
        """
        Retrieves the first item in an array if the key matches the value
        :param key:
        :param value:
        :param array:
        :return:
        """
        for item in array:
            if item[key] == value: return item
        return None
class CatalogHandler(InstanceHandler):

    def __init__(self, event, context, **kwargs):
        """
        Reads the stage configuration from the event and prepares the
        tables and helpers used to build the catalog.

        Dependencies may be injected through keyword arguments:
        `dynamodb_handler`, `s3_handler`, `ses_handler` and
        `consistency_checker` are called to create the instances, while
        `get_url_handler` and `url_exists_handler` are stored as given.
        :param event: the event payload; must provide 'stage-variables'
        :param context: the context handed on to the base handler
        :param kwargs: optional dependency overrides
        """
        super(CatalogHandler, self).__init__(event, context)

        env_vars = self.retrieve(event, 'stage-variables', 'payload')
        # urls are stored without a trailing slash
        self.cdn_url = self.retrieve(env_vars, 'cdn_url').rstrip('/')
        self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket')
        self.api_bucket = self.retrieve(env_vars, 'api_bucket')
        self.api_url = self.retrieve(env_vars, 'api_url').rstrip('/')
        self.to_email = self.retrieve(env_vars, 'to_email')
        self.from_email = self.retrieve(env_vars, 'from_email')
        self.api_version = self.retrieve(env_vars, 'version')

        # every table is created by the same factory
        if 'dynamodb_handler' in kwargs:
            table_factory = kwargs['dynamodb_handler']
        else:
            table_factory = DynamoDBHandler
        self.progress_table = table_factory(
            '{}d43-catalog-in-progress'.format(self.stage_prefix()))
        self.status_table = table_factory(
            '{}d43-catalog-status'.format(self.stage_prefix()))
        self.errors_table = table_factory(
            '{}d43-catalog-errors'.format(self.stage_prefix()))

        self.catalog = {"languages": []}

        if 's3_handler' in kwargs:
            s3_factory = kwargs['s3_handler']
        else:
            s3_factory = S3Handler
        self.api_handler = s3_factory(self.api_bucket)

        if 'ses_handler' in kwargs:
            ses_factory = kwargs['ses_handler']
        else:
            ses_factory = SESHandler
        self.ses_handler = ses_factory()

        # an injected checker is created without arguments
        if 'consistency_checker' not in kwargs:
            self.checker = ConsistencyChecker(
                self.cdn_bucket, self.api_bucket)  # pragma: no cover
        else:
            self.checker = kwargs['consistency_checker']()

        if 'get_url_handler' not in kwargs:
            self.get_url = get_url  # pragma: no cover
        else:
            self.get_url = kwargs['get_url_handler']

        if 'url_exists_handler' not in kwargs:
            self.url_exists = url_exists  # pragma: no cover
        else:
            self.url_exists = kwargs['url_exists_handler']

    def get_language(self, language):
        """
        Gets the existing language or creates a new one
        :param language:
        :return:
        """
        found_lang = None
        for lang in self.catalog['languages']:
            if lang['identifier'] == language['identifier']:
                found_lang = lang
                break
        if not found_lang:
            self.catalog['languages'].append(language)
        else:
            language = found_lang
        if 'resources' not in language:
            language['resources'] = []
        return language

    def _run(self):
        """
        Builds the catalog from the items in the progress table and uploads
        it when something was built and it differs from the published one.

        :return: a dictionary with the keys
            'success'    - True if the catalog was uploaded or no changes were detected
            'incomplete' - True if the checker had errors after the items were processed
            'message'    - a description of the outcome, or the checker errors on failure
            'catalog'    - the generated catalog, or None on failure
        """
        completed_items = 0
        items = self.progress_table.query_items()

        for item in items:
            repo_name = item['repo_name']
            self.logger.info('Processing {}'.format(repo_name))
            try:
                package = json.loads(item['package'])
            except Exception as e:
                self.report_error('Skipping {}. Bad Manifest: {}'.format(repo_name, e))
                continue
            # a few repo names carry special content, everything else is treated as an RC
            if repo_name == "catalogs":
                self.catalog['catalogs'] = package
            elif repo_name == 'localization':
                self._build_localization(package)
            elif repo_name == 'versification':
                # TODO: we have not yet determined what to do with versification
                pass
            else:
                # only RCs that were added to the catalog count as completed
                if self._build_rc(item, package, self.checker):
                    completed_items += 1

        # remove empty languages
        condensed_languages = []
        for lang in self.catalog['languages']:
            if 'resources' in lang and len(lang['resources']) > 0:
                condensed_languages.append(lang)
        self.catalog['languages'] = condensed_languages

        # NOTE: 'incomplete' is evaluated here, before the upload is attempted
        response = {
            'success': False,
            'incomplete': len(self.checker.all_errors) > 0,
            'message': None,
            'catalog': self.catalog
        }

        if completed_items > 0:
            status = self._read_status()
            # skip the deployment when the published catalog is complete and identical
            if status and status['state'] == 'complete' and not self._catalog_has_changed(self.catalog):
                response['success'] = True
                response['message'] = 'No changes detected. Catalog not deployed'
            else:
                # compact, key-sorted json; _catalog_has_changed serializes the same way
                cat_str = json.dumps(self.catalog, sort_keys=True, separators=(',',':'))
                try:
                    catalog_path = os.path.join(tempfile.gettempdir(), 'catalog.json')
                    write_file(catalog_path, cat_str)
                    c_stats = os.stat(catalog_path)
                    self.logger.info('New catalog built: {} Kilobytes'.format(c_stats.st_size * 0.001))

                    self.api_handler.upload_file(catalog_path, 'v{0}/catalog.json'.format(self.api_version), cache_time=0)
                    # TRICKY: only mark as complete when there are no errors
                    if len(self.checker.all_errors):
                        self._publish_status('incomplete')
                    else:
                        self._publish_status()

                    response['success'] = True
                    response['message'] = 'Uploaded new catalog to {0}/v{1}/catalog.json'.format(self.api_url, self.api_version)
                except Exception as e:
                    # a failed save is recorded with the checker; 'success' stays False
                    self.checker.log_error('Unable to save catalog: {0}'.format(e)) # pragma: no cover

        if len(self.checker.all_errors) > 0:
            self.report_error(self.checker.all_errors)

        # NOTE: this error is logged after the report_error call above, so it is
        # not part of that call but does appear in the failure message below
        if completed_items == 0:
            self.checker.log_error('There were no formats to process')

        # on failure the catalog is withheld and the checker errors become the message
        if not response['success']:
            response['catalog'] = None
            response['message'] = '{0}'.format(self.checker.all_errors)

        if(response['success']):
            self.logger.info(response['message'])
        else:
            self.logger.error('Catalog was not published due to errors')

        return response

    def _read_status(self):
        """
        Retrieves the recorded status of the catalog
        :return:
        """
        results = self.status_table.query_items({'api_version': self.api_version})
        if not results:
            return None
        else:
            return results[0]

    def _publish_status(self, state='complete'):
        """
        Updates the catalog status
        :param state: the state of completion the catalog is in
        :return:
        """
        self.logger.debug('Recording catalog status: "{}"'.format(state))
        self.status_table.update_item(
            {'api_version': self.api_version},
            {
                'state': state,
                'timestamp': time.strftime("%Y-%m-%dT%H:%M:%SZ"),
                'catalog_url': '{0}/v{1}/catalog.json'.format(self.api_url, self.api_version)
            }
        )

    def _build_rc(self, item, manifest, checker):
        """
        Builds a RC entry in the catalog.

        The entry is only added when the item passes `checker.check` and at
        least one of the manifest's resource level formats passes
        `checker.check_format`.

        NOTE: the manifest is modified in place: build rules are stripped from
        its formats, and its projects lose their 'path' and are stored in the
        catalog entry as the same objects.
        :param item: the progress table record the manifest was read from
        :param manifest: the parsed manifest; 'dublin_core', 'formats',
            'projects' and 'checking' are read
        :param checker: provides `check` and `check_format`
        :return: True if the entry was successfully added otherwise False
        """
        errors = checker.check(item)
        if errors:
            return False
        dc = manifest['dublin_core']
        language = dc['language']
        language = self.get_language(language)  # gets the existing language container or creates a new one

        # keep only the resource level formats that pass the check
        formats = []
        for fmt in manifest['formats']:
            errors = checker.check_format(fmt, item)
            if not errors:
                self._strip_build_rules(fmt)
                formats.append(fmt)

        if len(formats) > 0:
            # the resource starts as a copy of the dublin core without the
            # fields removed below
            resource = copy.deepcopy(dc)
            resource['projects'] = []
            del resource['conformsto']
            del resource['format']
            del resource['language']
            del resource['type']
            resource['checking'] = copy.deepcopy(manifest['checking'])
            # normalize a falsy relation to an empty list
            if not resource['relation']:
                resource['relation'] = []

            # store projects
            for project in manifest['projects']:
                if 'formats' in project:
                    for fmt in project['formats']:
                        self._strip_build_rules(fmt)
                        # NOTE(review): the result is not used here, so project
                        # formats are kept even if the check reports errors - confirm
                        checker.check_format(fmt, item)
                # normalize falsy categories to an empty list
                if not project['categories']:
                    project['categories'] = []
                del project['path']
                resource['projects'].append(project)

            # store formats
            # TRICKY: Bible usfm bundles should always be at the resource level
            is_bible = dc['identifier'] == 'ulb' or dc['identifier'] == 'udb'
            if len(manifest['projects']) == 1 and not (is_bible and self.has_usfm_bundle(formats)):
                # single-project RCs store formats in projects for backwards compatibility.
                if 'formats' in resource['projects'][0]:
                    formats = formats + resource['projects'][0]['formats']
                resource['projects'][0]['formats'] = formats

            # multi-project RCs store formats in resource
            # (single-project RCs receive the merged list from above as well)
            resource['formats'] = formats

            if 'comment' not in resource: resource['comment'] = ''

            language['resources'].append(resource)
            return True

        return False

    def _strip_build_rules(self, obj):
        """
        Recursively removes 'build_tools' from an object
        :param obj:
        :return:
        """
        if 'build_rules' in obj:
            del obj['build_rules']
        if 'projects' in obj:
            for project in obj['projects']:
                self._strip_build_rules(project)
        if 'formats' in obj:
            for format in obj['formats']:
                self._strip_build_rules(format)
        if 'chapters' in obj:
            for chapter in obj['chapters']:
                self._strip_build_rules(chapter)


    def has_usfm_bundle(self, formats):
        """
        Checks if an array of formats contains a format that is a usfm bundle
        :param formats:
        :return:
        """
        for format in formats:
            if 'text/usfm' in format['format'] and 'type=bundle' in format['format']:
                return True
        return False

    def _build_versification(self, package, checker):
        """
        DEPRECATED

        Adds versification chunks to projects in the catalog.
        Note: this may not do anything if no languages have been generated yet.
        self._build_rc will pick up the slack in that case.
        :param package:
        :return: False if errors were encountered
        """
        dict = {}


        for project in package:
            dict[project['identifier']] = project
            if not self.url_exists(project['chunks_url']):
                checker.log_error('{} does not exist'.format(project['chunks_url']))
                # for performance's sake we'll fail on a single error
                return False

        # inject into existing projects
        for lang in self.catalog['languages']:
            if 'resources' not in lang: continue
            for res in lang['resources']:
                if 'projects' not in res: continue
                for proj in res['projects']:
                    if proj['identifier'] in dict and proj['versification']:
                        proj.update(dict[proj['identifier']])

        return True

    def _build_localization(self, package):
        """
        Adds localization to the catalog
        :param package:
        :return:
        """
        for lang in package:
            localization = package[lang]
            language = localization['language']
            del localization['language']
            language = self.get_language(language)  # gets the existing language container or creates a new one
            language.update(localization)

    def _catalog_has_changed(self, catalog):
        """
        Checks if the catalog has changed compared to the given catalog
        :param catalog:
        :return:
        """
        try:
            catalog_url = '{0}/v{1}/catalog.json'.format(self.api_url, self.api_version)
            self.logger.debug('Comparing new catalog against old ({})'.format(catalog_url))
            old_catalog_str = self.get_url(catalog_url, True)
            new_catalog_str = json.dumps(catalog, sort_keys=True, separators=(',',':'))

            old_hash = hashlib.md5(old_catalog_str.encode('utf-8')).hexdigest()
            new_hash = hashlib.md5(new_catalog_str.encode('utf-8')).hexdigest()
            self.logger.debug('Old catalog hash: {}'.format(old_hash))
            self.logger.debug('New catalog hash: {}'.format(new_hash))
            return old_hash != new_hash
        except Exception as e:
            return True
Beispiel #3
0
class SigningHandler(InstanceHandler):
    max_file_size = 400000000  # 400mb

    def __init__(self, event, context, logger, signer, **kwargs):
        """
        Reads the stage configuration from the event, creates a temporary
        working directory and wires up the helpers used for signing.

        Dependencies may be injected through keyword arguments and are stored
        as given: `s3_handler`, `dynamodb_handler`, `download_handler`,
        `url_exists_handler`, `url_headers_handler`.
        :param event: the event payload; must provide 'stage-variables'
        :param context: the context handed on to the base handler
        :param logger: the logger to write to
        :param signer: provides `sign_file` and `verify_signature`
        :param kwargs: optional dependency overrides
        """
        super(SigningHandler, self).__init__(event, context)

        env_label = 'Environment Vars'
        env_vars = self.retrieve(event, 'stage-variables', 'payload')
        self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket', env_label)
        self.cdn_url = self.retrieve(env_vars, 'cdn_url', env_label)
        self.from_email = self.retrieve(env_vars, 'from_email', env_label)
        self.to_email = self.retrieve(env_vars, 'to_email', env_label)
        self.api_version = self.retrieve(env_vars, 'version', env_label)
        self.api_bucket = self.retrieve(env_vars, 'api_bucket', env_label)
        self.logger = logger  # type: logging._loggerClass
        self.signer = signer

        default_db_name = '{}d43-catalog-in-progress'.format(
            self.stage_prefix())
        self.in_progress_db = self.retrieve_with_default(
            env_vars, 'in_progress_db', default_db_name)

        if 's3_handler' not in kwargs:
            self.cdn_handler = S3Handler(self.cdn_bucket)  # pragma: no cover
        else:
            self.cdn_handler = kwargs['s3_handler']

        # files are downloaded into this directory before they are signed
        self.temp_dir = tempfile.mkdtemp(prefix='signing_')

        if 'dynamodb_handler' not in kwargs:
            self.db_handler = DynamoDBHandler(
                self.in_progress_db)  # pragma: no cover
        else:
            self.db_handler = kwargs['dynamodb_handler']

        if 'download_handler' not in kwargs:
            self.download_file = download_file  # pragma: no cover
        else:
            self.download_file = kwargs['download_handler']

        if 'url_exists_handler' not in kwargs:
            self.url_exists = url_exists  # pragma: no cover
        else:
            self.url_exists = kwargs['url_exists_handler']

        if 'url_headers_handler' not in kwargs:
            self.url_headers = url_headers  # pragma: no cover
        else:
            self.url_headers = kwargs['url_headers_handler']

    def __del__(self):
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _safe_url_exists(self, url):
        """
        Safely checks if a url exists.
        :param url:
        :return:
        """
        try:
            return self.url_exists(url)
        except Exception as e:
            self.report_error('Failed to read url "{}": {}'.format(
                url, e.message))
            return False

    def _run(self):
        items = self.db_handler.query_items({'signed': False})
        try:
            for item in items:
                repo_name = item['repo_name']
                try:
                    package = json.loads(item['package'])
                except Exception as e:
                    self.report_error('Skipping {}. Bad Manifest: {}'.format(
                        repo_name, e))
                    continue

                if repo_name != "catalogs" and repo_name != 'localization' and repo_name != 'versification':
                    self.process_db_item(item, package)

            found_items = len(items) > 0
            if not found_items and self.logger:
                self.logger.info('No items found for signing')
            return found_items
        except Exception as e:
            self.report_error('Failed processing an item: {}'.format(
                e.message))
            raise Exception, Exception(e), sys.exc_info()[2]
        finally:
            if os.path.isdir(self.temp_dir):
                shutil.rmtree(self.temp_dir, ignore_errors=True)

    def process_db_item(self, item, package):
        """
        Signs the formats of a single progress table item and records the
        result.

        Resource formats, project formats and the chapters of project formats
        are each passed to `process_format`. Chapters without a reachable url
        are dropped from the package. The package is modified in place and is
        written back to the database when something was newly signed or when
        everything is signed.
        :param item: the progress table record; 'repo_name' is read here
        :param package: the parsed package of the item
        :return:
        """
        # was_signed: at least one format was signed during this call
        # fully_signed: no format was left without a signature
        was_signed = False
        fully_signed = True
        self.logger.info('Processing {}'.format(item['repo_name']))
        if 'formats' in package:
            for format in package['formats']:
                # process resource formats
                (already_signed,
                 newly_signed) = self.process_format(item,
                                                     package['dublin_core'],
                                                     None, format)
                if newly_signed:
                    was_signed = True
                if not (already_signed or newly_signed):
                    fully_signed = False
        for project in package['projects']:
            if 'formats' in project:
                for format in project['formats']:
                    # process project formats
                    (already_signed, newly_signed) = self.process_format(
                        item, package['dublin_core'], project, format)
                    if newly_signed:
                        was_signed = True
                    if not (already_signed or newly_signed):
                        fully_signed = False

                    # process format chapters
                    if 'chapters' in format:
                        sanitized_chapters = []
                        for chapter in format['chapters']:
                            # TRICKY: only process/keep chapters that actually have a valid url
                            if 'url' not in chapter or not self._safe_url_exists(
                                    chapter['url']):
                                if 'url' not in chapter:
                                    missing_url = 'empty url'
                                else:
                                    missing_url = chapter['url']
                                self.logger.warning(
                                    'Skipping chapter {}:{} missing url {}'.
                                    format(project['identifier'],
                                           chapter['identifier'], missing_url))
                                continue

                            # chapters are signed the same way as formats
                            (already_signed,
                             newly_signed) = self.process_format(
                                 item, package['dublin_core'], project,
                                 chapter)
                            sanitized_chapters.append(chapter)
                            if newly_signed:
                                was_signed = True
                            if not (already_signed or newly_signed):
                                fully_signed = False

                        format['chapters'] = sanitized_chapters
                        # update format
                        # a zip without a declared content type gets one based on
                        # the file extension of its first chapter
                        if sanitized_chapters and not 'content=' in format[
                                'format'] and format['url'].endswith('zip'):
                            if format['chapters'][0]['url'].endswith('.mp3'):
                                format[
                                    'format'] = 'application/zip; content=audio/mp3'
                            if format['chapters'][0]['url'].endswith('.mp4'):
                                format[
                                    'format'] = 'application/zip; content=video/mp4'

        # persist when there is something new, or to flag the item as signed
        if was_signed or fully_signed:
            self.logger.debug('recording signatures')
            record_keys = {'repo_name': item['repo_name']}
            self.db_handler.update_item(
                record_keys, {
                    'package': json.dumps(package, sort_keys=True),
                    'signed': fully_signed
                })

    @staticmethod
    def _strip_stage_prefix(value, prefix):
        """
        Removes `prefix` from the start of `value` if it is present.
        Unlike `str.lstrip` this removes the exact prefix only, not every
        leading character that happens to occur in the prefix.
        """
        if prefix and value.startswith(prefix):
            return value[len(prefix):]
        return value

    def process_format(self, item, dublin_core, project, format):
        """
        Performs the signing on the format object.
        Files outside of the cdn will not be signed

        The format is modified in place: 'signature', 'size', 'modified',
        'length' (mp3/mp4 only) and a missing 'format' are filled in, and the
        'url' of html formats is rewritten.
        :param item: the progress table record; 'repo_name' and 'commit_id' are read
        :param dublin_core: the dublin core of the package
        :param project: this may be None.
        :param format: the format (or chapter) to sign
        :return: (already_signed, newly_signed)
        """
        if 'signature' in format and format['signature']:
            return (True, False)
        else:
            self.logger.debug('Signing {}'.format(format['url']))

        base_name = os.path.basename(format['url'])
        file_to_sign = os.path.join(self.temp_dir, base_name)

        # extract cdn key from url
        url_info = urlparse.urlparse(format['url'])
        src_key = url_info.path.lstrip('/')
        sig_key = '{}.sig'.format(src_key)

        build_rules = get_build_rules(format, 'signing')
        stage_prefix = self.stage_prefix()

        # TRICKY: allow dev environments to download from prod environment
        # RS: I added the s3 bucket here because it isn't yet accessible via urls
        valid_hosts = [
            self.cdn_bucket, self.cdn_bucket + ".s3.us-east-2.amazonaws.com"
        ]
        if stage_prefix:
            if not self.cdn_bucket.startswith(stage_prefix):
                self.logger.warning(
                    'Expected `cdn_bucket` to begin with the stage prefix ({}) but found {}'
                    .format(stage_prefix, self.cdn_bucket))
            prod_cdn_bucket = self._strip_stage_prefix(self.cdn_bucket,
                                                       stage_prefix)
            valid_hosts.append(prod_cdn_bucket)
            # TRICKY: force dev environments to handle prod content as external files
            # if format['url'].startswith(prod_cdn_url):
            #     build_rules.append('sign_given_url')

        # TRICKY: some html content is on the api
        if 'html_format' in build_rules:
            valid_hosts.append(self.api_bucket)
            prod_api_bucket = self._strip_stage_prefix(self.api_bucket,
                                                       stage_prefix)
            valid_hosts.append(prod_api_bucket)

        # verify url is on the cdn
        if url_info.hostname not in valid_hosts:
            # TODO: external media should be imported if it's not too big
            # This allows media to be hosted on third party servers
            format['signature'] = ''  #'{}.sig'.format(format['url'])
            self.logger.warning(
                'cannot sign files outside of the cdn: {}'.format(
                    format['url']))
            self.logger.warning('valid hosts are: {}'.format(
                ", ".join(valid_hosts)))
            # external files are reported as handled so they do not block the item
            return (True, True)

        try:
            headers = self.url_headers(format['url'])
        except Exception as e:
            self.report_error('Could not read headers from {}: {}'.format(
                format['url'], e))
            return (False, False)

        # skip files that are too large
        size = int(headers.get('content-length', 0))
        if size > SigningHandler.max_file_size:
            sig_url = '{}.sig'.format(format['url'])
            if not self._safe_url_exists(sig_url):
                # wait for signature to be manually uploaded
                self.report_error('File is too large to sign {}'.format(
                    format['url']))
                return (False, False)

            # finish with manually uploaded signature
            format['size'] = size
            # a format without a 'modified' entry is treated like an empty one
            if not format.get('modified'):
                format['modified'] = str_to_timestamp(
                    datetime.datetime.now().isoformat())
            format['signature'] = sig_url
            return (False, True)

        # download file
        try:
            if 'sign_given_url' in build_rules or 'html_format' in build_rules:
                # report error if response is 400+
                if headers.status >= 400:
                    self.report_error('Resource not available at {}'.format(
                        format['url']))
                    return (False, False)

                self.download_file(format['url'], file_to_sign)
            else:
                # TRICKY: most files to be signed are stored in a temp directory
                src_temp_key = 'temp/{}/{}/{}'.format(item['repo_name'],
                                                      item['commit_id'],
                                                      src_key)
                self.cdn_handler.download_file(src_temp_key, file_to_sign)
        except Exception as e:
            self.report_error(
                'The file "{}" could not be downloaded: {}'.format(
                    base_name, e))
            return (False, False)

        # strip print script from html
        if 'html_format' in build_rules:
            self.logger.debug('Removing print script from {} html'.format(
                item['repo_name']))
            self._strip_print_script(file_to_sign)

        # sign file
        sig_file = self.signer.sign_file(file_to_sign)
        try:
            self.signer.verify_signature(file_to_sign, sig_file)
        except RuntimeError:
            if self.logger:
                self.logger.warning(
                    'The signature was not successfully verified.')
            return (False, False)

        # TRICKY: re-format html urls
        if 'html_format' in build_rules:
            html_name = dublin_core['identifier']
            if project:
                html_name = project['identifier']
            src_key = '{}/{}/v{}/media/html/{}.html'.format(
                dublin_core['language']['identifier'],
                dublin_core['identifier'], self.api_version, html_name)
            sig_key = '{}.sig'.format(src_key)
            format['url'] = '{}/{}'.format(self.cdn_url, src_key)

        # upload files
        if 'sign_given_url' not in build_rules or 'html_format' in build_rules:
            # TRICKY: upload temp files to production
            self.cdn_handler.upload_file(file_to_sign, src_key)
        self.cdn_handler.upload_file(sig_file, sig_key)

        # add the url of the sig file to the format
        format['signature'] = '{}.sig'.format(format['url'])

        # read modified date from file
        stats = os.stat(file_to_sign)
        if not format.get('modified'):
            modified = headers.get('last-modified')
            if modified:
                # TRICKY: http header gives an odd date format
                date = datetime.datetime.strptime(modified,
                                                  '%a, %d %b %Y %H:%M:%S %Z')
                modified = str_to_timestamp(date.isoformat())
            else:
                modified = unix_to_timestamp(stats.st_mtime)
            format['modified'] = modified
        format['size'] = stats.st_size

        # retrieve playback time from multimedia files
        _, ext = os.path.splitext(file_to_sign)
        if ext == '.mp3':
            audio = MP3(file_to_sign)
            format['length'] = audio.info.length
        elif ext == '.mp4':
            video = MP4(file_to_sign)
            format['length'] = video.info.length

        # add file format if missing
        if 'format' not in format or not format['format']:
            try:
                mime = ext_to_mime(ext)
                format['format'] = mime
            except Exception as e:
                if self.logger:
                    # log the exception itself; `message` is not a reliable attribute
                    self.logger.error('{}'.format(e))

        # clean up disk space
        os.remove(file_to_sign)

        return (False, True)

    @staticmethod
    def _strip_print_script(file_to_sign):
        """
        Rewrites the file with every 'window.print()' call removed.
        :param file_to_sign: path of the html file to clean
        :return:
        """
        cleaned_html = read_file(file_to_sign).replace('window.print()', '')
        write_file(file_to_sign, cleaned_html)
Beispiel #4
0
class UwV2CatalogHandler(InstanceHandler):

    cdn_root_path = 'v2/uw'
    api_version = 'uw.2'

    def __init__(self, event, context, logger, **kwargs):
        """
        Reads the stage settings and wires up the collaborators used to
        build the uW v2 catalog.

        :param event: the triggering event; the stage variables are retrieved from it
        :param context: the invocation context, passed on to the parent handler
        :param logger: the logger used by this handler
        :param kwargs: optional collaborator overrides: 's3_handler',
                       'dynamodb_handler', 'url_handler', 'download_handler'
                       and 'signing_handler'
        """
        super(UwV2CatalogHandler, self).__init__(event, context)

        stage_vars = self.retrieve(event, 'stage-variables', 'payload')
        self.cdn_bucket = self.retrieve(stage_vars, 'cdn_bucket',
                                        'Environment Vars')
        cdn_url = self.retrieve(stage_vars, 'cdn_url', 'Environment Vars')
        self.cdn_url = cdn_url.rstrip('/')
        self.from_email = self.retrieve(stage_vars, 'from_email',
                                        'Environment Vars')
        self.to_email = self.retrieve(stage_vars, 'to_email',
                                      'Environment Vars')
        self.logger = logger  # type: logging._loggerClass
        self.temp_dir = tempfile.mkdtemp(suffix='', prefix='uw_v2', dir=None)

        # An injected collaborator always wins; the default is only
        # constructed when no override was supplied.
        self.cdn_handler = (kwargs['s3_handler']
                            if 's3_handler' in kwargs else
                            S3Handler(self.cdn_bucket))

        self.db_handler = (kwargs['dynamodb_handler']
                           if 'dynamodb_handler' in kwargs else
                           DynamoDBHandler('{}d43-catalog-status'.format(
                               self.stage_prefix())))

        self.get_url = (kwargs['url_handler']
                        if 'url_handler' in kwargs else get_url)

        self.download_file = (kwargs['download_handler']
                              if 'download_handler' in kwargs else
                              download_file)

        self.signer = (kwargs['signing_handler']
                       if 'signing_handler' in kwargs else
                       Signer(ENC_PRIV_PEM_PATH))

    def __del__(self):
        try:
            shutil.rmtree(self.temp_dir)
        finally:
            pass

    def _run(self):
        """
        Generates the v2 catalog
        :return:
        """
        try:
            return self.__execute()
        except Exception as e:
            self.report_error(e.message)
            raise Exception, Exception(e), sys.exc_info()[2]

    def __execute(self):
        """
        We wrap this in a separate function to more easily handle errors
        :return:
        """
        uploads = []

        result = self._get_status()
        if not result:
            return False
        else:
            (status, source_status) = result

        # check if build is complete
        if status['state'] == 'complete':
            if self.logger:
                self.logger.debug('Catalog already generated')
            return True

        # retrieve the latest catalog
        catalog_content = self.get_url(source_status['catalog_url'], True)
        if not catalog_content:
            if self.logger:
                self.logger.error("{0} does not exist".format(
                    source_status['catalog_url']))
            return False
        try:
            self.latest_catalog = json.loads(catalog_content)
        except Exception as e:
            if self.logger:
                self.logger.error(
                    "Failed to load the catalog json: {0}".format(e))
            return False

        catalog = self.convert_v3_to_v2(self.latest_catalog, status)

        catalog_upload = self._prep_json_upload('catalog.json', catalog)
        uploads.append(catalog_upload)
        # TRICKY: also upload to legacy path for backwards compatibility
        uploads.append({
            'key': '/uw/txt/2/catalog.json',
            'path': catalog_upload['path']
        })

        # upload files
        for upload in uploads:
            if not upload['key'].startswith('/'):
                key = '{}/{}'.format(UwV2CatalogHandler.cdn_root_path,
                                     upload['key'])
            else:
                key = upload['key'].lstrip('/')
            self.cdn_handler.upload_file(upload['path'], key)

        status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
        status['state'] = 'complete'
        self.db_handler.update_item(
            {'api_version': UwV2CatalogHandler.api_version}, status)

    def convert_v3_to_v2(self, v3_catalog, status):
        """
        Builds a v2 catalog for the uW api endpoint.
        This uses the v3 catalog as the source.

        Source texts that are not yet listed in ``status['processed']`` are
        converted, uploaded and signed along the way, and the build status
        is saved after each of them.

        :param v3_catalog: the v3 catalog
        :param status: the build status retrieved from AWS. It is updated in place.
        :return: the complete v2 catalog
        """
        v2_catalog = {'obs': {}, 'bible': {}}
        title_map = {'bible': 'Bible', 'obs': 'Open Bible Stories'}
        last_modified = 0

        for lang in v3_catalog['languages']:
            lid = lang['identifier']
            if self.logger:
                self.logger.info('Processing {}'.format(lid))
            for res in lang['resources']:
                rid = res['identifier']
                cat_key = 'obs' if rid == 'obs' else 'bible'

                mod = str_to_unix_time(res['modified'])
                last_modified = max(last_modified, int(mod))

                # TRICKY: we are not processing the resource formats

                toc = []
                for proj in res['projects']:
                    toc_item = self._build_toc_item(lid, rid, res, proj, mod,
                                                    status)
                    if toc_item is not None:
                        toc.append(toc_item)

                if not toc:
                    continue

                # TRICKY: not all manifests have a source text
                if 'source' in res and len(res['source']):
                    source = res['source'][0]
                else:
                    source = {'language': '', 'version': ''}

                # TRICKY: maintain legacy slug formatting for backwards compatibility
                legacy_slug = '{}-{}'.format(rid, lid)
                res_v2_id = rid
                if legacy_slug in self.legacy_slugs or rid == 'obs':
                    res_v2_id = legacy_slug

                res_v2 = {
                    'slug': res_v2_id,
                    'name': res['title'],
                    'mod': mod,
                    'status': {
                        'checking_entity':
                        '; '.join(res['checking']['checking_entity']),
                        'checking_level':
                        res['checking']['checking_level'],
                        'comments':
                        res['comment'] if 'comment' in res else '',
                        'contributors':
                        '; '.join(res['contributor']),
                        'publish_date':
                        res['issued'],
                        'source_text':
                        source['language'],
                        'source_text_version':
                        source['version'],
                        'version':
                        res['version']
                    },
                    'toc': toc
                }

                if lid not in v2_catalog[cat_key]:
                    v2_catalog[cat_key][lid] = {
                        'lc': lid,
                        'mod': mod,
                        'vers': []
                    }
                v2_catalog[cat_key][lid]['vers'].append(res_v2)

        # condense catalog
        catalog = {'cat': [], 'mod': last_modified}
        for cat_slug in v2_catalog:
            catalog['cat'].append({
                'slug': cat_slug,
                'title': title_map[cat_slug],
                'langs': list(v2_catalog[cat_slug].values())
            })
        return catalog

    def _build_toc_item(self, lid, rid, res, proj, mod, status):
        """
        Builds the v2 table-of-contents entry for a single v3 project.

        :param lid: the language identifier
        :param rid: the resource identifier
        :param res: the v3 resource that contains the project
        :param proj: the v3 project
        :param mod: the modified time of the resource
        :param status: the build status. It is updated in place.
        :return: the toc entry, or None if the project has no formats or no book text
        """
        pid = proj['identifier']
        if not ('formats' in proj and proj['formats']):
            return None

        source = None
        pdf = None
        media = {'audio': {'src_dict': {}}, 'video': {'src_dict': {}}}

        for fmt in proj['formats']:
            # skip media formats that do not match the source version
            if 'source_version' in fmt and fmt['source_version'] != res[
                    'version']:
                if self.logger:
                    self.logger.warning(
                        '{}_{}_{}: media format "{}" does not match source version "{}" and will be excluded.'
                        .format(lid, rid, pid, fmt['url'], res['version']))
                continue

            mime = fmt['format']
            if rid == 'obs' and 'type=book' in mime:
                # TRICKY: obs must be converted to json
                process_id = '_'.join([lid, rid, pid])
                obs_key = '{}/{}/{}/{}/v{}/source.json'.format(
                    self.cdn_root_path, pid, lid, rid, res['version'])
                if process_id not in status['processed']:
                    obs_json = index_obs(lid, rid, fmt, self.temp_dir,
                                         self.download_file)
                    upload = self._prep_json_upload(obs_key, obs_json)
                    self._upload_signed_source(upload, process_id, status)
                source = {
                    'url': '{}/{}'.format(self.cdn_url, obs_key),
                    'signature': '{}/{}.sig'.format(self.cdn_url, obs_key)
                }
            elif rid != 'obs' and mime == 'text/usfm':
                # process bible
                process_id = '_'.join([lid, rid, pid])
                bible_key = '{0}/{1}/{2}/{3}/v{4}/{1}.usfm'.format(
                    self.cdn_root_path, pid, lid, rid, res['version'])
                if process_id not in status['processed']:
                    usfm = self._process_usfm(fmt)
                    upload = self._prep_text_upload(bible_key, usfm)
                    self._upload_signed_source(upload, process_id, status)
                source = {
                    'url': '{}/{}'.format(self.cdn_url, bible_key),
                    'signature': '{}/{}.sig'.format(self.cdn_url, bible_key)
                }
            elif 'content=audio/mp3' in mime or 'content=video/mp4' in mime:
                # process media
                self._merge_media_format(media, fmt)
            elif 'application/pdf' == mime:
                # only the url is used below, so a missing source version
                # must not abort the build
                pdf = {
                    'url': fmt['url'],
                    'source_version': fmt.get('source_version')
                }

        # build catalog
        if not source:
            if self.logger:
                self.logger.debug('No book text found in {}_{}_{}'.format(
                    lid, rid, pid))
            return None

        # iterate over a copy of the keys because empty containers are
        # deleted while looping
        for key in list(media):
            if media[key]['src_dict']:
                media[key]['src_list'] = list(media[key]['src_dict'].values())
                del media[key]['src_dict']
            else:
                del media[key]

        toc_item = {
            'desc': '',
            'media': media,
            'mod': mod,
            'slug': proj['identifier'],
            'src': source['url'],
            'src_sig': source['signature'],
            'title': proj['title'],
        }
        if rid == 'obs':
            del toc_item['slug']
        if pdf:
            toc_item['pdf'] = pdf['url']
        if not media:
            del toc_item['media']
        return toc_item

    def _upload_signed_source(self, upload, process_id, status):
        """
        Uploads a prepared source file together with its signature and
        records the process as finished in the build status.

        :param upload: the prepared upload containing a 'key' and a 'path'
        :param process_id: the id under which the work is recorded
        :param status: the build status. It is updated in place and saved.
        """
        self.cdn_handler.upload_file(upload['path'], upload['key'])

        # sign file
        sig_file = self.signer.sign_file(upload['path'])
        try:
            self.signer.verify_signature(upload['path'], sig_file)
            self.cdn_handler.upload_file(sig_file,
                                         '{}.sig'.format(upload['key']))
        except RuntimeError:
            if self.logger:
                self.logger.warning(
                    'Could not verify signature {}'.format(sig_file))

        status['processed'].update({process_id: []})
        # the trailing "Z" denotes UTC, so format the UTC time explicitly
        status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ",
                                            time.gmtime())
        self.db_handler.update_item(
            {'api_version': UwV2CatalogHandler.api_version}, status)

    def _merge_media_format(self, media, fmt):
        """
        Merges an audio or video format and its chapters into the matching
        media container.

        :param media: the dictionary holding the 'audio' and 'video' containers
        :param fmt: the v3 media format
        """
        quality_value, quality_suffix = self.__parse_media_quality(
            fmt['quality'])
        if 'content=audio/mp3' in fmt['format']:
            media_container = media['audio']
            quality_key = 'bitrate'
            quality_short_key = 'br'
        else:
            media_container = media['video']
            quality_key = 'resolution'
            quality_short_key = 'res'

        # build chapter src
        # the quality inside the chapter urls is swapped for a placeholder
        placeholder = '{bitrate}' + quality_suffix
        src_dict = {}
        if 'chapters' in fmt:
            for chapter in fmt['chapters']:
                src_dict[chapter['identifier']] = {
                    quality_short_key: [{
                        quality_key:
                        int(quality_value),
                        'mod':
                        int(str_to_unix_time(chapter['modified'])),
                        'size':
                        chapter['size']
                    }],
                    'chap':
                    chapter['identifier'],
                    'length':
                    int(math.ceil(chapter['length'])),
                    'src':
                    chapter['url'].replace(fmt['quality'], placeholder),
                    'src_sig':
                    chapter['signature'].replace(fmt['quality'], placeholder)
                }

        merge_dict(
            media_container, {
                'contributors': ',\\n'.join(fmt['contributor']),
                'rev': fmt['version'],
                'txt_ver': fmt['source_version'],
                'src_dict': src_dict
            })

    def _process_usfm(self, format):
        """
        Downloads a usfm file and prepares its content for the v2 catalog.

        :param format: the v3 format; only its 'url' is read
        :return: the downloaded usfm after strip_word_data and
                 convert_chunk_markers have been applied
        """
        url = format['url']
        # hash the utf-8 bytes so that unicode urls (e.g. from json.loads)
        # with non-ascii characters do not break the digest
        url_bytes = url if isinstance(url, bytes) else url.encode('utf-8')
        usfm_file = os.path.join(self.temp_dir, md5(url_bytes).hexdigest())
        self.download_file(url, usfm_file)
        usfm = read_file(usfm_file)
        return convert_chunk_markers(strip_word_data(usfm))

    def _get_status(self):
        """
        Retrieves the catalog status from AWS.

        :return: A tuple containing the status object of the target and source catalogs, or False if the source is not ready
        """
        status_results = self.db_handler.query_items({
            'api_version': {
                'condition': 'is_in',
                'value': ['3', UwV2CatalogHandler.api_version]
            }
        })
        source_status = None
        status = None
        for s in status_results:
            if s['api_version'] == '3':
                source_status = s
            elif s['api_version'] == UwV2CatalogHandler.api_version:
                status = s
        if not source_status:
            if self.logger:
                self.logger.debug('Source catalog status not found')
            return False
        if source_status['state'] != 'complete':
            if self.logger:
                self.logger.debug('Source catalog is not ready for use')
            return False
        if not status or status['source_timestamp'] != source_status[
                'timestamp']:
            # begin or restart process
            status = {
                'api_version': UwV2CatalogHandler.api_version,
                'catalog_url': '{}/uw/txt/2/catalog.json'.format(self.cdn_url),
                'source_api': source_status['api_version'],
                'source_timestamp': source_status['timestamp'],
                'state': 'in-progress',
                'processed': {}
            }

        return (status, source_status)

    def _prep_json_upload(self, key, data):
        """
        Serializes some data to json in the temp directory so that it can
        be uploaded to s3.

        :param key: the upload key; also the file's path relative to the temp directory
        :param data: the json-serializable data to write
        :return: a dictionary with the 'key' and the 'path' of the written file
        """
        serialized = json.dumps(data, sort_keys=True)
        return self._prep_text_upload(key, serialized)

    def _prep_text_upload(self, key, data):
        """
        Writes some text to the temp directory so that it can be uploaded
        to s3.

        :param key: the upload key; also the file's path relative to the temp directory
        :param data: the text to write
        :return: a dictionary with the 'key' and the 'path' of the written file
        """
        destination = os.path.join(self.temp_dir, key)
        write_file(destination, data)
        return dict(key=key, path=destination)

    def __parse_media_quality(self, quality):
        """
        Returns the value and suffix from the quality
        :param quality:
        :return:
        """
        abc = 'abcdefghijklmnopqrstufwxyz'
        value = quality.rstrip('{}{}'.format(abc, abc.upper()))
        suffix = quality[len(value):]
        return value, suffix

    # 'legacy_slugs' contains a list of legacy slugs for resources 'vers'. Legacy slugs are formatted as `res-lang`
    legacy_slugs = [
        "ulb-ceb", "udb-ceb", "ulb-ee", "ulb-en", "udb-en", "ulb-hna",
        "ulb-ilo", "ulb-kbp", "ulb-kpo", "ulb-las", "ulb-lpx"
    ]
Beispiel #5
0
class TsV2CatalogHandler(InstanceHandler):

    cdn_root_path = 'v2/ts'
    api_version = 'ts.2'

    def __init__(self, event, context, logger, **kwargs):
        """
        Reads the stage settings and wires up the collaborators used to
        build the tS v2 catalog.

        :param event: the triggering event; the stage variables are retrieved from it
        :param context: the invocation context, passed on to the parent handler
        :param logger: the logger used by this handler
        :param kwargs: optional collaborator overrides: 's3_handler',
                       'dynamodb_handler', 'url_handler', 'download_handler'
                       and 'url_exists_handler'
        """
        super(TsV2CatalogHandler, self).__init__(event, context)

        stage_vars = self.retrieve(event, 'stage-variables', 'payload')
        self.cdn_bucket = self.retrieve(stage_vars, 'cdn_bucket',
                                        'Environment Vars')
        cdn_url = self.retrieve(stage_vars, 'cdn_url', 'Environment Vars')
        self.cdn_url = cdn_url.rstrip('/')
        self.from_email = self.retrieve(stage_vars, 'from_email',
                                        'Environment Vars')
        self.to_email = self.retrieve(stage_vars, 'to_email',
                                      'Environment Vars')
        max_usfm_size = self.retrieve_with_default(stage_vars,
                                                   'max_usfm_size', '2000000')
        self.max_usfm_size = int(max_usfm_size)

        default_status_db = '{}d43-catalog-status'.format(self.stage_prefix())
        self.status_db = self.retrieve_with_default(stage_vars, 'status_db',
                                                    default_status_db)

        self.logger = logger  # type: logging._loggerClass

        # An injected collaborator always wins; the default is only
        # constructed when no override was supplied.
        self.cdn_handler = (kwargs['s3_handler']
                            if 's3_handler' in kwargs else
                            S3Handler(self.cdn_bucket))

        if 'dynamodb_handler' not in kwargs:
            self.db_handler = DynamoDBHandler(
                self.status_db)  # pragma: no cover
            # the default db handler follows this handler's log level
            if self.db_handler.logger:
                self.db_handler.logger.setLevel(logger.level)
        else:
            self.db_handler = kwargs['dynamodb_handler']

        self.get_url = (kwargs['url_handler']
                        if 'url_handler' in kwargs else get_url)
        self.download_file = (kwargs['download_handler']
                              if 'download_handler' in kwargs else
                              download_file)
        self.url_exists = (kwargs['url_exists_handler']
                           if 'url_exists_handler' in kwargs else url_exists)

        self.temp_dir = tempfile.mkdtemp(suffix='', prefix='tsv2', dir=None)

    def __del__(self):
        try:
            shutil.rmtree(self.temp_dir)
        finally:
            pass

    def _run(self):
        """
        Generates the v2 catalog.

        Any error is reported through ``report_error`` and then re-raised
        with its original type and traceback.

        :return: whatever the catalog build returns
        """
        try:
            self.logger.debug('Temp directory {} contents {}'.format(
                '/tmp', get_subdirs('/tmp/')))
            return self.__execute()
        except Exception as e:
            # `message` is empty (py2) or missing (py3) for some exceptions,
            # so fall back to the string form of the error.
            self.report_error(getattr(e, 'message', None) or str(e))
            # a bare raise keeps the original exception type and traceback
            raise

    def __execute(self):
        cat_keys = []
        cat_dict = {}
        supplemental_resources = []

        result = self._get_status()
        if not result:
            return False
        else:
            (self.status, source_status) = result

        # check if build is complete
        if self.status['state'] == 'complete':
            self.logger.debug('Catalog already generated')
            return True

        # retrieve the latest catalog
        self.logger.debug("Catalog url {0}".format(
            source_status['catalog_url']))
        catalog_content = self.get_url(source_status['catalog_url'], True)
        if not catalog_content:
            self.logger.error("{0} does not exist".format(
                source_status['catalog_url']))
            return False
        try:
            self.latest_catalog = json.loads(catalog_content)
        except Exception as e:
            self.logger.error("Failed to load the catalog json: {0}".format(e))
            return False

        # walk v3 catalog
        for lang in self.latest_catalog['languages']:
            lid = TsV2CatalogHandler.sanitize_identifier(lang['identifier'],
                                                         lower=False)
            self.logger.info('Processing {}'.format(lid))
            for res in lang['resources']:
                rid = TsV2CatalogHandler.sanitize_identifier(res['identifier'])
                self.logger.debug('Processing {}_{}'.format(lid, rid))

                rc_format = None

                self.logger.debug('Temp directory {} contents {}'.format(
                    self.temp_dir, get_subdirs(self.temp_dir)))
                res_temp_dir = os.path.join(self.temp_dir, lid, rid)
                os.makedirs(res_temp_dir)

                if 'formats' in res:
                    for format in res['formats']:
                        finished_processes = {}
                        if not rc_format and get_rc_type(format):
                            # locate rc_format (for multi-project RCs)
                            rc_format = format
                        #res is resource, rid is resource id, lid is language id
                        process_id = '_'.join([lid, rid, 'usfm'])
                        if process_id not in self.status['processed']:
                            self._process_usfm(lid, rid, res, format,
                                               res_temp_dir)
                            finished_processes[process_id] = []

                        # TRICKY: bible notes and questions are in the resource
                        if rid != 'obs':
                            process_id = '_'.join([lid, rid, 'notes'])
                            if process_id not in self.status['processed']:
                                self.logger.info(
                                    'Processing notes {}_{}'.format(lid, rid))
                                tn = self._index_note_files(
                                    lid, rid, format, process_id, res_temp_dir)
                                if tn:
                                    self._upload_all(tn)
                                    finished_processes[process_id] = tn.keys()
                                    cat_keys = cat_keys + tn.keys()
                            else:
                                cat_keys = cat_keys + self.status['processed'][
                                    process_id]

                            process_id = '_'.join([lid, rid, 'questions'])
                            if process_id not in self.status['processed']:
                                self.logger.info(
                                    'Processing questions {}_{}'.format(
                                        lid, rid))
                                tq = self._index_question_files(
                                    lid, rid, format, process_id, res_temp_dir)
                                if tq:
                                    self._upload_all(tq)
                                    finished_processes[process_id] = tq.keys()
                                    cat_keys = cat_keys + tq.keys()
                            else:
                                cat_keys = cat_keys + self.status['processed'][
                                    process_id]

                        # TRICKY: update the finished processes once per format to limit db hits
                        if finished_processes:
                            self.status['processed'].update(finished_processes)
                            self.status['timestamp'] = time.strftime(
                                "%Y-%m-%dT%H:%M:%SZ")
                            self.db_handler.update_item(
                                {
                                    'api_version':
                                    TsV2CatalogHandler.api_version
                                }, self.status)

                for project in res['projects']:
                    pid = TsV2CatalogHandler.sanitize_identifier(
                        project['identifier'])
                    self.logger.debug('Processing {}_{}_{}'.format(
                        lid, rid, pid))
                    if 'formats' in project:
                        for format in project['formats']:
                            finished_processes = {}
                            if not rc_format and get_rc_type(format):
                                # locate rc_format (for single-project RCs)
                                rc_format = format

                            # TRICKY: there should only be a single tW for each language
                            process_id = '_'.join([lid, 'words'])
                            if process_id not in self.status['processed']:
                                tw = self._index_words_files(
                                    lid, rid, format, process_id, res_temp_dir)
                                if tw:
                                    self._upload_all(tw)
                                    finished_processes[process_id] = tw.keys()
                                    cat_keys = cat_keys + tw.keys()
                            else:
                                cat_keys = cat_keys + self.status['processed'][
                                    process_id]

                            if rid == 'obs':
                                process_id = '_'.join([lid, rid, pid])
                                if process_id not in self.status['processed']:
                                    self.logger.debug(
                                        'Processing {}'.format(process_id))
                                    obs_json = index_obs(
                                        lid, rid, format, res_temp_dir,
                                        self.download_file)
                                    upload = prep_data_upload(
                                        '{}/{}/{}/v{}/source.json'.format(
                                            pid, lid, rid, res['version']),
                                        obs_json, res_temp_dir)
                                    self._upload(upload)
                                    finished_processes[process_id] = []
                                else:
                                    cat_keys = cat_keys + self.status[
                                        'processed'][process_id]

                            # TRICKY: obs notes and questions are in the project
                            process_id = '_'.join([lid, rid, pid, 'notes'])
                            if process_id not in self.status['processed']:
                                tn = self._index_note_files(
                                    lid, rid, format, process_id, res_temp_dir)
                                if tn:
                                    self._upload_all(tn)
                                    finished_processes[process_id] = tn.keys()
                                    cat_keys = cat_keys + tn.keys()
                            else:
                                cat_keys = cat_keys + self.status['processed'][
                                    process_id]

                            process_id = '_'.join([lid, rid, pid, 'questions'])
                            if process_id not in self.status['processed']:
                                tq = self._index_question_files(
                                    lid, rid, format, process_id, res_temp_dir)
                                if tq:
                                    self._upload_all(tq)
                                    finished_processes[process_id] = tq.keys()
                                    cat_keys = cat_keys + tq.keys()
                            else:
                                cat_keys = cat_keys + self.status['processed'][
                                    process_id]

                            # TRICKY: update the finished processes once per format to limit db hits
                            if finished_processes:
                                self.status['processed'].update(
                                    finished_processes)
                                self.status['timestamp'] = time.strftime(
                                    "%Y-%m-%dT%H:%M:%SZ")
                                self.db_handler.update_item(
                                    {
                                        'api_version':
                                        TsV2CatalogHandler.api_version
                                    }, self.status)

                    if not rc_format:
                        raise Exception(
                            'Could not find a format for {}_{}_{}'.format(
                                lid, rid, pid))

                    modified = make_legacy_date(rc_format['modified'])
                    rc_type = get_rc_type(rc_format)

                    self.logger.debug(
                        'Resource container type is {}'.format(rc_type))

                    if modified is None:
                        modified = time.strftime('%Y%m%d')
                        self.logger.warning(
                            'Could not find date modified for {}_{}_{} from "{}"'
                            .format(lid, rid, pid, rc_format['modified']))

                    if rc_type == 'book' or rc_type == 'bundle':
                        self._build_catalog_node(cat_dict, lang, res, project,
                                                 modified)
                    else:
                        # store supplementary resources for processing after catalog nodes have been fully built
                        supplemental_resources.append({
                            'language': lang,
                            'resource': res,
                            'project': project,
                            'modified': modified,
                            'rc_type': rc_type
                        })

                # cleanup resource directory
                remove_tree(res_temp_dir)
            # cleanup language directory
            remove_tree(os.path.join(self.temp_dir, lid))
        # inject supplementary resources
        for s in supplemental_resources:
            self._add_supplement(cat_dict, s['language'], s['resource'],
                                 s['project'], s['modified'], s['rc_type'])

        api_uploads = []

        # normalize catalog nodes
        root_cat = []
        for pid in cat_dict:
            project = cat_dict[pid]
            lang_cat = []
            for lid in project['_langs']:
                lang = project['_langs'][lid]
                res_cat = []
                for rid in lang['_res']:
                    res = lang['_res'][rid]

                    # disable missing catalogs

                    # disable tN
                    if '_'.join([lid, '*', pid, 'tn']) not in cat_keys:
                        res['notes'] = ''

                    # disable tQ
                    if '_'.join([lid, '*', pid, 'tq']) not in cat_keys:
                        res['checking_questions'] = ''

                    # disable tW
                    if '_'.join([lid, '*', '*', 'tw']) not in cat_keys:
                        res['terms'] = ''

                    res_cat.append(res)
                api_uploads.append(
                    prep_data_upload('{}/{}/resources.json'.format(pid, lid),
                                     res_cat, self.temp_dir))

                del lang['_res']
                if ('project' in lang):
                    # skip empty artifacts
                    lang_cat.append(lang)
                else:
                    self.logger.warning(
                        'Excluding empty language artifact in {}'.format(pid))
            api_uploads.append(
                prep_data_upload('{}/languages.json'.format(pid), lang_cat,
                                 self.temp_dir))

            del project['_langs']
            if len(lang_cat) != 0:
                root_cat.append(project)
        catalog_upload = prep_data_upload('catalog.json', root_cat,
                                          self.temp_dir)
        api_uploads.append(catalog_upload)
        # TRICKY: also upload to legacy path for backwards compatibility
        api_uploads.append({
            'key': '/ts/txt/2/catalog.json',
            'path': catalog_upload['path']
        })

        # upload files
        for upload in api_uploads:
            if not upload['key'].startswith('/'):
                key = '{}/{}'.format(TsV2CatalogHandler.cdn_root_path,
                                     upload['key'])
            else:
                key = upload['key'].lstrip('/')
            self.cdn_handler.upload_file(upload['path'], key)

        self.status['state'] = 'complete'
        self.status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
        self.db_handler.update_item(
            {'api_version': TsV2CatalogHandler.api_version}, self.status)

    def _get_status(self):
        """
        Retrieves the catalog status from AWS or generates a new status object

        :return: A tuple containing the status object of the target and source catalogs, or False if the source is not ready
        """
        status_results = self.db_handler.query_items({
            'api_version': {
                'condition': 'is_in',
                'value': ['3', TsV2CatalogHandler.api_version]
            }
        })
        source_status = None
        status = None
        for s in status_results:
            if s['api_version'] == '3':
                source_status = s
            elif s['api_version'] == TsV2CatalogHandler.api_version:
                status = s
        if not source_status:
            self.logger.warning('Source catalog status not found')
            return False
        if source_status['state'] != 'complete':
            self.logger.debug('Source catalog is not ready for use')
            return False
        if not status or status['source_timestamp'] != source_status[
                'timestamp']:
            # begin or restart process
            status = {
                'api_version': TsV2CatalogHandler.api_version,
                'catalog_url': '{}/ts/txt/2/catalog.json'.format(self.cdn_url),
                'source_api': source_status['api_version'],
                'source_timestamp': source_status['timestamp'],
                'state': 'in-progress',
                'processed': {}
            }

        return (status, source_status)

    def _index_note_files(self, lid, rid, format, process_id, temp_dir):
        """

        :param lid:
        :param rid:
        :param format:
        :return: a dictionary of notes to upload
        """
        tn_uploads = {}

        format_str = format['format']
        if (rid == 'obs-tn' or rid == 'tn') and 'type=help' in format_str:
            self.logger.debug('Processing {}'.format(process_id))
            rc_dir = download_rc(lid, rid, format['url'], temp_dir,
                                 self.download_file)
            if not rc_dir: return {}

            tn_uploads = index_tn_rc(lid=lid, temp_dir=temp_dir, rc_dir=rc_dir)
            remove_tree(rc_dir, True)

        return tn_uploads

    def _index_question_files(self, lid, rid, format, process_id, temp_dir):
        question_re = re.compile('^#+([^#\n]+)#*([^#]*)',
                                 re.UNICODE | re.MULTILINE | re.DOTALL)
        tq_uploads = {}

        format_str = format['format']
        if (rid == 'obs-tq' or rid == 'tq') and 'type=help' in format_str:
            self.logger.debug('Processing {}'.format(process_id))
            rc_dir = download_rc(lid, rid, format['url'], temp_dir,
                                 self.download_file)
            if not rc_dir: return {}

            manifest = yaml.load(
                read_file(os.path.join(rc_dir, 'manifest.yaml')))
            dc = manifest['dublin_core']

            for project in manifest['projects']:
                pid = TsV2CatalogHandler.sanitize_identifier(
                    project['identifier'])
                question_dir = os.path.normpath(
                    os.path.join(rc_dir, project['path']))
                question_json = []

                if not os.path.isdir(question_dir):
                    self.logger.warning(
                        'Missing directory at {}. Is the manifest out of date?'
                        .format(question_dir))
                    continue

                chapters = os.listdir(question_dir)
                for chapter in chapters:
                    if chapter in ['.', '..']: continue
                    unique_questions = {}
                    chapter_dir = os.path.join(question_dir, chapter)
                    chunks = os.listdir(chapter_dir)
                    for chunk in chunks:
                        if chunk in ['.', '..']: continue
                        chunk_file = os.path.join(chapter_dir, chunk)
                        chunk = chunk.split('.')[0]
                        chunk_body = read_file(chunk_file)

                        for question in question_re.findall(chunk_body):
                            hasher = hashlib.md5()
                            hasher.update(question[1].strip().encode('utf-8'))
                            question_hash = hasher.hexdigest()
                            if question_hash not in unique_questions:
                                # insert unique question
                                unique_questions[question_hash] = {
                                    'q': question[0].strip(),
                                    'a': question[1].strip(),
                                    'ref': [u'{}-{}'.format(chapter, chunk)]
                                }
                            else:
                                # append new reference
                                unique_questions[question_hash]['ref'].append(
                                    '{}-{}'.format(chapter, chunk))

                    question_array = []
                    for hash in unique_questions:
                        question_array.append(unique_questions[hash])
                    if question_array:
                        question_json.append({
                            'id': chapter,
                            'cq': question_array
                        })

                if question_json:
                    tq_key = '_'.join([lid, '*', pid, 'tq'])
                    question_json.append(
                        {'date_modified': dc['modified'].replace('-', '')})
                    upload = prep_data_upload(
                        '{}/{}/questions.json'.format(pid, lid), question_json,
                        temp_dir)
                    tq_uploads[tq_key] = upload
            remove_tree(rc_dir, True)
        return tq_uploads

    def _index_words_files(self, lid, rid, format, process_id, temp_dir):
        """
        Returns an array of markdown files found in a tW dictionary
        :param lid:
        :param rid:
        :param format:
        :return:
        """
        word_title_re = re.compile('^#([^#\n]*)#*', re.UNICODE)
        h2_re = re.compile('^##([^#\n]*)#*', re.UNICODE)
        obs_example_re = re.compile('\_*\[([^\[\]]+)\]\(([^\(\)]+)\)_*(.*)',
                                    re.UNICODE | re.IGNORECASE)
        block_re = re.compile('^##', re.MULTILINE | re.UNICODE)
        word_links_re = re.compile(
            '\[([^\[\]]+)\]\(\.\.\/(kt|other)\/([^\(\)]+)\.md\)',
            re.UNICODE | re.IGNORECASE)
        ta_html_re = re.compile(
            '(<a\s+href="(:[a-z-_0-9]+:ta:vol\d:[a-z-\_]+:[a-z-\_]+)"\s*>([^<]+)<\/a>)',
            re.UNICODE | re.IGNORECASE)

        words = []
        format_str = format['format']
        if rid == 'tw' and 'type=dict' in format_str:
            self.logger.debug('Processing {}'.format(process_id))
            rc_dir = download_rc(lid, rid, format['url'], temp_dir,
                                 self.download_file)
            if not rc_dir: return {}

            manifest = yaml.load(
                read_file(os.path.join(rc_dir, 'manifest.yaml')))
            dc = manifest['dublin_core']

            # TRICKY: there should only be one project
            for project in manifest['projects']:
                pid = TsV2CatalogHandler.sanitize_identifier(
                    project['identifier'])
                content_dir = os.path.normpath(
                    os.path.join(rc_dir, project['path']))
                categories = os.listdir(content_dir)
                for cat in categories:
                    if cat in ['.', '..']: continue
                    cat_dir = os.path.join(content_dir, cat)
                    if not os.path.isdir(cat_dir): continue
                    word_files = os.listdir(cat_dir)
                    for word in word_files:
                        if word in ['.', '..', '.DS_Store']: continue
                        word_path = os.path.join(cat_dir, word)
                        word_id = word.split('.md')[0]
                        try:
                            word_content = read_file(word_path)
                        except Exception as e:
                            self.report_error(
                                'Failed to read file {}: {}'.format(
                                    word_path, e.message))
                            raise

                        # TRICKY: the title is always at the top
                        title_match = word_title_re.match(word_content)
                        if title_match:
                            title = title_match.group(1)
                        else:
                            self.report_error(
                                'missing title in {}'.format(word_path))
                            continue
                        word_content = word_title_re.sub('',
                                                         word_content).strip()

                        # TRICKY: the definition title is always after the title
                        def_title = ''
                        def_title_match = h2_re.match(word_content)
                        if def_title_match:
                            def_title = def_title_match.group(1).strip()
                            word_content = h2_re.sub('', word_content).strip()
                        else:
                            self.report_error(
                                'missing definition title in {}'.format(
                                    word_path))

                        # find obs examples
                        blocks = block_re.split(word_content)
                        cleaned_blocks = []
                        examples = []
                        for block in blocks:
                            if 'examples from the bible stories' in block.lower(
                            ):
                                for link in obs_example_re.findall(block):
                                    if 'obs' not in link[1]:
                                        self.logger.error(
                                            'non-obs link found in passage examples: {}'
                                            .format(link[1]))
                                    else:
                                        examples.append({
                                            'ref':
                                            link[0].replace(':', '-'),
                                            'text':
                                            markdown.markdown(link[2].strip())
                                        })
                            else:
                                cleaned_blocks.append(block)
                        word_content = '##'.join(cleaned_blocks)

                        # find all tW links and use them in related words
                        related_words = [
                            w[2] for w in word_links_re.findall(word_content)
                        ]

                        # convert links to legacy form. TODO: we should convert links after converting to html so we don't have to do it twice.
                        word_content = convert_rc_links(word_content)
                        word_content = markdown.markdown(word_content)
                        # convert html links back to dokuwiki links
                        # TRICKY: we converted the ta urls, but now we need to format them as dokuwiki links
                        # e.g. [[en:ta:vol1:translate:translate_unknown | How to Translate Unknowns]]
                        for ta_link in ta_html_re.findall(word_content):
                            new_link = u'[[{} | {}]]'.format(
                                ta_link[1], ta_link[2])
                            word_content = word_content.replace(
                                ta_link[0], new_link)

                        words.append({
                            'aliases': [
                                a.strip() for a in title.split(',')
                                if a.strip() != word_id
                                and a.strip() != title.strip()
                            ],
                            'cf':
                            related_words,
                            'def':
                            word_content,
                            'def_title':
                            def_title.rstrip(':'),
                            'ex':
                            examples,
                            'id':
                            word_id,
                            'sub':
                            '',
                            'term':
                            title.strip()
                        })

            remove_tree(rc_dir, True)

            if words:
                words.append({
                    'date_modified':
                    dc['modified'].replace('-', '').split('T')[0]
                })
                upload = prep_data_upload('bible/{}/words.json'.format(lid),
                                          words, temp_dir)
                return {'_'.join([lid, '*', '*', 'tw']): upload}
        return {}

    def _process_usfm(self, lid, rid, resource, format, temp_dir):
        """
        Converts a USFM bundle into usx, loads the data into json and uploads it.
        Returns an array of usx file paths.
        :param lid:
        :param rid:
        :param format:
        :return: an array of json blobs
        """

        format_str = format['format']
        if 'application/zip' in format_str and 'usfm' in format_str:
            self.logger.debug('Downloading {}'.format(format['url']))
            rc_dir = download_rc(lid, rid, format['url'], temp_dir,
                                 self.download_file)
            if not rc_dir: return

            manifest = yaml.load(
                read_file(os.path.join(rc_dir, 'manifest.yaml')))
            usx_dir = os.path.join(rc_dir, 'usx')
            for project in manifest['projects']:
                pid = TsV2CatalogHandler.sanitize_identifier(
                    project['identifier'])
                # pid is project identifier, lid is language id, rid is resourceid
                process_id = '_'.join([lid, rid, pid])

                if process_id not in self.status['processed']:
                    self.logger.debug(
                        'Processing usfm for {}'.format(process_id))

                    # copy usfm project file
                    usfm_dir = os.path.join(temp_dir,
                                            '{}_usfm'.format(process_id))
                    if not os.path.exists(usfm_dir):
                        os.makedirs(usfm_dir)
                    usfm_dest_file = os.path.normpath(
                        os.path.join(usfm_dir, project['path']))
                    usfm_src_file = os.path.normpath(
                        os.path.join(rc_dir, project['path']))

                    if os.path.getsize(usfm_src_file) < self.max_usfm_size:

                        shutil.copyfile(usfm_src_file, usfm_dest_file)

                        # transform usfm to usx
                        build_usx(usfm_dir, usx_dir, self.logger)

                        # convert USX to JSON
                        path = os.path.normpath(
                            os.path.join(usx_dir,
                                         '{}.usx'.format(pid.upper())))
                        source = build_json_source_from_usx(
                            path, format['modified'], self)
                        upload = prep_data_upload(
                            '{}/{}/{}/v{}/source.json'.format(
                                pid, lid, rid, resource['version']),
                            source['source'], temp_dir)
                        self.logger.debug('Uploading {}/{}/{}'.format(
                            self.cdn_bucket, TsV2CatalogHandler.cdn_root_path,
                            upload['key']))
                        self.cdn_handler.upload_file(
                            upload['path'],
                            '{}/{}'.format(TsV2CatalogHandler.cdn_root_path,
                                           upload['key']))

                        self.status['processed'][process_id] = []
                    else:
                        self.logger.warn(
                            "Skipping {} because it is too big".format(
                                process_id))
                        self.status['processed'][process_id] = ['skipped']

                    self.status['timestamp'] = time.strftime(
                        "%Y-%m-%dT%H:%M:%SZ")
                    self.db_handler.update_item(
                        {'api_version': TsV2CatalogHandler.api_version},
                        self.status)
                else:
                    self.logger.debug(
                        'USFM for {} has already been processed'.format(
                            process_id))

            # clean up download
            remove_tree(rc_dir, True)

    def _upload_all(self, uploads):
        """
        Uploads an array or object of uploads
        :param uploads:
        :return:
        """
        for upload in uploads:
            if isinstance(upload, dict):
                self._upload(upload)
            elif upload in uploads and isinstance(uploads[upload], dict):
                self._upload(uploads[upload])
            else:
                raise Exception('invalid upload object')

    def _upload(self, upload):
        """
        Uploads an upload
        :param upload:
        :return:
        """
        path = upload['path']
        key = '{}/{}'.format(TsV2CatalogHandler.cdn_root_path, upload['key'])
        self.logger.debug('Uploading {}/{}'.format(path, key))
        self.cdn_handler.upload_file(path, key)

    def _add_supplement(self, catalog, language, resource, project, modified,
                        rc_type):
        """
        Adds supplementary helps to the catalog nodes
        :param catalog:
        :param language:
        :param resource:
        :param project:
        :param modified:
        :param rc_type:
        :return:
        """
        lid = TsV2CatalogHandler.sanitize_identifier(language['identifier'],
                                                     lower=False)

        if rc_type == 'help':
            pid = TsV2CatalogHandler.sanitize_identifier(project['identifier'])

            # tricky some languages may only have supplementary resources and no books
            # so no catalog node will have been built. Therefore we init them here.
            TsV2CatalogHandler._init_catalog_node(catalog, pid, lid)

            for rid in catalog[pid]['_langs'][lid]['_res']:
                res = catalog[pid]['_langs'][lid]['_res'][rid]
                if 'tn' in TsV2CatalogHandler.sanitize_identifier(
                        resource['identifier']):
                    res.update({
                        'notes':
                        '{}/{}/{}/{}/notes.json?date_modified={}'.format(
                            self.cdn_url, TsV2CatalogHandler.cdn_root_path,
                            pid, lid, modified)
                    })
                elif 'tq' in self.sanitize_identifier(resource['identifier']):
                    res.update({
                        'checking_questions':
                        '{}/{}/{}/{}/questions.json?date_modified={}'.format(
                            self.cdn_url, TsV2CatalogHandler.cdn_root_path,
                            pid, lid, modified)
                    })
        elif rc_type == 'dict':
            for pid in catalog:
                # tricky some languages may only have supplementary resources and no books
                # so no catalog node will have been built. Therefore we init them here.
                TsV2CatalogHandler._init_catalog_node(catalog, pid, lid)

                for rid in catalog[pid]['_langs'][lid]['_res']:
                    res = catalog[pid]['_langs'][lid]['_res'][rid]
                    # TRICKY: obs and Bible now use the same words
                    res.update({
                        'terms':
                        '{}/{}/bible/{}/words.json?date_modified={}'.format(
                            self.cdn_url, TsV2CatalogHandler.cdn_root_path,
                            lid, modified)
                    })

    @staticmethod
    def _init_catalog_node(catalog, pid, lid=None, rid=None):
        """
        Initializes a node in the catalog.
        :param catalog: the v2 catalog dictionary
        :param pid: the project id to include in the catalog
        :param lid: the language id to include in the catalog
        :param rid: the resource id to include in the catalog
        :return:
        """
        if pid not in catalog: catalog[pid] = {'_langs': {}}
        if lid is not None:
            if lid not in catalog[pid]['_langs']:
                catalog[pid]['_langs'][lid] = {'_res': {}, 'language': {}}
        if lid is not None and rid is not None:
            if rid not in catalog[pid]['_langs'][lid]['_res']:
                catalog[pid]['_langs'][lid]['_res'][rid] = {}

    def _build_catalog_node(self, catalog, language, resource, project,
                            modified):
        """
        Creates/updates a node in the catalog
        :param catalog: the v2 catalog dictionary
        :param language: the v3 language catalog object
        :param resource: the v3 resource catalog object
        :param project: the v3 project catalog object
        :param modified:
        :return:
        """
        lid = TsV2CatalogHandler.sanitize_identifier(language['identifier'],
                                                     lower=False)
        rid = TsV2CatalogHandler.sanitize_identifier(resource['identifier'])
        pid = TsV2CatalogHandler.sanitize_identifier(project['identifier'])

        # TRICKY: v2 api sorted obs with 1
        if pid == 'obs': project['sort'] = 1

        TsV2CatalogHandler._init_catalog_node(catalog, pid, lid, rid)

        # TRICKY: we must process the modified date in the order of resource, language, project to propagate dates correctly

        # resource
        res = catalog[pid]['_langs'][lid]['_res'][rid]
        r_modified = max_modified_date(
            res, modified)  # TRICKY: dates bubble up from project
        comments = ''  # TRICKY: comments are not officially supported in RCs but we use them if available
        if 'comment' in resource: comments = resource['comment']

        # add chunks to non-obs projects
        chunks_url = ''
        if rid != 'obs':
            chunks_url = 'https://api.unfoldingword.org/bible/txt/1/{}/chunks.json'.format(
                pid)
            # if not self.url_exists(chunks_url) and 'chunks_url' in project:
            # Use the v3 api chunks url if the legacy version cannot be found
            # chunks_url = project['chunks_url']

        source_url = '{}/{}/{}/{}/{}/v{}/source.json?date_modified={}'.format(
            self.cdn_url, TsV2CatalogHandler.cdn_root_path, pid, lid, rid,
            resource['version'], r_modified)
        source_text = ''
        source_text_version = ''
        if resource['source']:
            # TRICKY: some resources don't have a source
            source_text = resource['source'][0]['language']
            source_text_version = resource['source'][0]['version']
        else:
            self.report_error('Missing source translation in {} {}'.format(
                lid, rid))
        res.update({
            'date_modified': r_modified,
            'name': resource['title'],
            'notes': '',
            'slug': rid,
            'status': {
                'checking_entity':
                ', '.join(resource['checking']['checking_entity']),
                'checking_level':
                resource['checking']['checking_level'],
                'comments':
                comments,
                'contributors':
                '; '.join(resource['contributor']),
                'publish_date':
                resource['issued'],
                'source_text':
                source_text,  # v2 can only handle one source
                'source_text_version':
                source_text_version,  # v2 can only handle one source
                'version':
                resource['version']
            },
            'checking_questions': '',
            'chunks': chunks_url,
            'source': source_url,
            'terms': '',
            'tw_cat': ''
        })

        res.update({
            'tw_cat':
            '{}/{}/{}/{}/tw_cat.json?date_modified={}'.format(
                self.cdn_url, TsV2CatalogHandler.cdn_root_path, pid, lid,
                r_modified)
        })

        # bible projects have usfm
        if pid != 'obs':
            if 'formats' in project:
                for format in project['formats']:
                    if 'text/usfm' == format['format']:
                        res.update({
                            'usfm':
                            '{}?date_modified={}'.format(
                                format['url'], r_modified)
                        })
                        break

        # language
        lang = catalog[pid]['_langs'][lid]
        l_modified = max_modified_date(
            lang['language'],
            r_modified)  # TRICKY: dates bubble up from resource
        description = ''
        if rid == 'obs': description = resource['description']
        project_meta = list(project['categories'])  # default to category ids
        if 'category_labels' in language:
            project_meta = []
            for cat_id in project['categories']:
                if cat_id in language['category_labels']:
                    project_meta.append(language['category_labels'][cat_id])
                else:
                    project_meta.append(cat_id)

        cat_lang = {
            'language': {
                'date_modified': l_modified,
                'direction': language['direction'],
                'name': language['title'],
                'slug': lid
            },
            'project': {
                'desc': description,
                'meta': project_meta,
                'name': project['title']
            },
            'res_catalog':
            '{}/{}/{}/{}/resources.json?date_modified={}'.format(
                self.cdn_url, TsV2CatalogHandler.cdn_root_path, pid, lid,
                l_modified)
        }
        if 'ulb' == rid or 'udb' == rid:
            cat_lang['project']['sort'] = '{}'.format(project['sort'])
        lang.update(cat_lang)

        # project
        p_modified = max_modified_date(catalog[pid], l_modified)
        catalog[pid].update({
            'date_modified':
            p_modified,
            'lang_catalog':
            '{}/{}/{}/languages.json?date_modified={}'.format(
                self.cdn_url, TsV2CatalogHandler.cdn_root_path, pid,
                p_modified),
            'meta':
            project['categories'],
            'slug':
            pid,
            'sort':
            '{}'.format(project['sort']).zfill(2)
        })