Example #1
def set_lambda_running(context,
                       dbname,
                       lambda_suffix=None,
                       dynamodb_handler=None):
    """
    Sets the process information for this lambda.
    This is used to indicate the lambda is currently running.
    :param context:
    :param string dbname: the database holding a list of lambda run times.
    :param string lambda_suffix: name of the lambda handler
    :return:
    """
    if not context:
        return
    if dynamodb_handler:
        db = dynamodb_handler(dbname)
    else:
        db = DynamoDBHandler(dbname)

    lambda_name = context.function_name
    if lambda_suffix:
        lambda_name = '{}.{}'.format(lambda_name, lambda_suffix)

    db.insert_item({
        "lambda": lambda_name,
        "request_id": context.aws_request_id,
        "started_at": arrow.utcnow().isoformat(),
        "expires": context.get_remaining_time_in_millis()
    })
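
A minimal usage sketch for the example above: calling set_lambda_running from a Lambda entry point with an injected in-memory handler so no real DynamoDB table is touched. FakeDynamoDBHandler, the table name, and the handle() entry point are hypothetical names for illustration only, assuming set_lambda_running is importable from the module shown.

class FakeDynamoDBHandler(object):
    """In-memory stand-in for DynamoDBHandler (illustration only)."""

    def __init__(self, dbname):
        self.dbname = dbname
        self.items = []

    def insert_item(self, item):
        # mimic DynamoDBHandler.insert_item by recording the item locally
        self.items.append(item)


def handle(event, context):
    # record that this invocation is running before doing any other work
    set_lambda_running(context, 'running-db',
                       lambda_suffix='webhook',
                       dynamodb_handler=FakeDynamoDBHandler)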
Example #2
class WebhookHandler(Handler):
    def __init__(self, event, context, logger, **kwargs):
        super(WebhookHandler, self).__init__(event, context)

        env_vars = self.retrieve(event, 'stage-variables', 'payload')
        self.gogs_url = self.retrieve(env_vars, 'gogs_url', 'Environment Vars')
        self.gogs_org = self.retrieve(env_vars, 'gogs_org', 'Environment Vars')
        self.cdn_bucket = self.retrieve(env_vars, 'cdn_bucket',
                                        'Environment Vars')
        self.cdn_url = self.retrieve(env_vars, 'cdn_url', 'Environment Vars')
        self.from_email = self.retrieve(env_vars, 'from_email',
                                        'Environment Vars')
        self.to_email = self.retrieve(env_vars, 'to_email', 'Environment Vars')
        self.api_url = self.retrieve(env_vars, 'api_url', 'Environment Vars')
        self.repo_commit = self.retrieve(event, 'body-json', 'payload')
        self.api_version = self.retrieve(env_vars, 'version')

        self.in_progress_db = self.retrieve_with_default(
            env_vars, 'in_progress_db',
            '{}d43-catalog-in-progress'.format(self.stage_prefix()))

        if 'pull_request' in self.repo_commit:
            self.__parse_pull_request(self.repo_commit)
        else:
            self.__parse_push(self.repo_commit)

        self.resource_id = None  # set in self._build
        self.logger = logger  # type: logging.Logger

        if 'dynamodb_handler' in kwargs:
            self.db_handler = kwargs['dynamodb_handler']
        else:
            self.logger.debug(
                "Creating Dynamodb handler pointing to {}".format(
                    self.in_progress_db))
            self.db_handler = DynamoDBHandler(
                self.in_progress_db)  # pragma: no cover

        if 's3_handler' in kwargs:
            self.s3_handler = kwargs['s3_handler']
        else:
            self.s3_handler = S3Handler(self.cdn_bucket)  # pragma: no cover

        if 'download_handler' in kwargs:
            self.download_file = kwargs['download_handler']
        else:
            self.download_file = download_file  # pragma: no cover

    def __parse_pull_request(self, payload):
        """
        Parses a pull request payload and stores its commit details
        :param payload:
        :return:
        """

        pull_request = self.retrieve(payload, 'pull_request', 'payload')

        self.repo_owner = payload['repository']['owner']['username']
        self.repo_name = payload['repository']['name']
        self.temp_dir = tempfile.mkdtemp('', self.repo_name, None)
        self.repo_file = os.path.join(self.temp_dir, self.repo_name + '.zip')
        # TRICKY: gogs gives a lower case name to the folder in the zip archive
        self.repo_dir = os.path.join(self.temp_dir, self.repo_name.lower())

        commit_sha = self.retrieve(pull_request, 'merge_commit_sha',
                                   'pull_request')
        self.timestamp = str_to_timestamp(
            self.retrieve(pull_request, 'merged_at', 'pull_request'))
        repository = self.retrieve(payload, 'repository', 'payload')
        url = self.retrieve(repository, 'html_url', 'repository').rstrip('/')
        self.commit_url = '{}/commit/{}'.format(url, commit_sha)
        if commit_sha:
            self.commit_id = commit_sha[:10]
        else:
            self.commit_id = None

    def __parse_push(self, payload):
        """
        Parses a regular push commit
        :param payload:
        :return:
        """
        self.repo_owner = payload['repository']['owner']['username']
        self.repo_name = payload['repository']['name']
        self.temp_dir = tempfile.mkdtemp('', self.repo_name, None)
        self.repo_file = os.path.join(self.temp_dir, self.repo_name + '.zip')
        # TRICKY: gogs gives a lower case name to the folder in the zip archive
        self.repo_dir = os.path.join(self.temp_dir, self.repo_name.lower())

        self.commit_id = payload['after']
        commit = None
        for commit in payload['commits']:
            if commit['id'] == self.commit_id:
                break
        self.commit_url = commit['url']
        self.timestamp = str_to_timestamp(commit['timestamp'])
        self.commit_id = self.commit_id[:10]

    def _run(self):
        if not self.commit_url.startswith(self.gogs_url):
            raise Exception(
                'Only accepting webhooks from {0} but found {1}'.format(
                    self.gogs_url, self.commit_url))  # pragma: no cover

        if self.repo_owner.lower() != self.gogs_org.lower():
            raise Exception(
                "Only accepting repos from the {0} organization. Organization sent is {1}"
                .format(self.gogs_org, self.repo_owner))  # pragma: no cover

        # skip un-merged pull requests
        if 'pull_request' in self.repo_commit:
            pr = self.repo_commit['pull_request']
            if not pr['merged']:
                raise Exception('Skipping un-merged pull request')

        try:
            # build catalog entry
            data = self._build()
            if data:
                # upload data
                if 'uploads' in data:
                    self.logger.debug('Uploading files for "{}"'.format(
                        self.repo_name))
                    for upload in data['uploads']:
                        self.logger.debug('^...{}'.format(upload['key']))
                        self.logger.debug("Uploading to {0} {1}".format(
                            upload["path"], upload["key"]))
                        self.s3_handler.upload_file(upload['path'],
                                                    upload['key'])
                    del data['uploads']
                else:
                    self.logger.debug(
                        'No upload-able content found in "{}"'.format(
                            self.repo_name))
                self.db_handler.insert_item(data)
            else:
                self.logger.debug('No data found in {}'.format(self.repo_name))
        except Exception as e:
            self.report_error(e.message)
            raise Exception, Exception(e), sys.exc_info()[2]
        finally:
            # clean
            if self.temp_dir and os.path.isdir(self.temp_dir):
                shutil.rmtree(self.temp_dir, ignore_errors=True)

        return {
            "success": True,
            "message": "Successfully added {0} ({1}) to the catalog".format(
                self.repo_name, self.commit_id)
        }

    def _build(self):
        """
        Constructs a new catalog entry from the repository
        :return: the constructed object
        """

        self.download_repo(self.commit_url, self.repo_file)
        self.unzip_repo_file(self.repo_file, self.temp_dir)

        if not os.path.isdir(self.repo_dir):
            raise Exception('Was not able to find {0}'.format(
                self.repo_dir))  # pragma: no cover

        self.logger.info('Processing repository "{}"'.format(self.repo_name))
        data = {}
        if self.repo_name == 'localization':
            data = self._build_localization()
        elif self.repo_name == 'catalogs':
            data = self._build_catalogs()
        elif self.repo_name == 'versification':
            # TODO: we do not yet know what to do with versification
            return None
        else:
            data = self._build_rc()

        return data

    def _build_rc(self):
        """
        Builds a Resource Container following the RC0.2 spec
        :return:
        """
        manifest_path = os.path.join(self.repo_dir, 'manifest.yaml')
        if not os.path.isfile(manifest_path):
            raise Exception(
                'Repository {0} does not have a manifest.yaml file'.format(
                    self.repo_name))
        try:
            manifest = WebhookHandler.load_yaml_file(manifest_path)
        except Exception as e:
            raise Exception('Bad Manifest: {0}'.format(e))

        try:
            ConsistencyChecker.check_manifest(manifest)
        except Exception as e:
            raise Exception('Bad Manifest: {0}'.format(e))

        # identifiers must be lowercase
        manifest['dublin_core']['identifier'] = self.sanitize_identifier(
            manifest['dublin_core']['identifier'])
        # resource version must be string
        manifest['dublin_core']['version'] = '{}'.format(
            manifest['dublin_core']['version'])

        # build media formats
        media_path = os.path.join(self.repo_dir, 'media.yaml')
        resource_formats = []
        project_formats = {}
        if os.path.isfile(media_path):
            try:
                media = WebhookHandler.load_yaml_file(media_path)
            except Exception as e:
                raise Exception('Bad Media: {0}'.format(e))
            project_chapters = self._listChapters(self.repo_dir, manifest)
            try:
                resource_formats, project_formats = parse_media(
                    media=media,
                    content_version=manifest['dublin_core']['version'],
                    project_chapters=project_chapters)
            except Exception as e:
                self.report_error('Failed to parse media in {}. {}'.format(
                    self.repo_name, e.message))

        stats = os.stat(self.repo_file)

        # normalize dates
        try:
            manifest['dublin_core']['modified'] = str_to_timestamp(
                manifest['dublin_core']['modified'])
        except Exception as e:
            self.logger.warning('Invalid datetime detected: {}'.format(
                e.message))
        try:
            manifest['dublin_core']['issued'] = str_to_timestamp(
                manifest['dublin_core']['issued'])
        except Exception as e:
            self.logger.warning('Invalid datetime detected: {}'.format(
                e.message))

        # TRICKY: single-project RCs get named after the project to avoid conflicts with multi-project RCs.
        if len(manifest['projects']) == 1:
            zip_name = manifest['projects'][0]['identifier'].lower()
        else:
            zip_name = manifest['dublin_core']['identifier']

        resource_key = '{}/{}/v{}/{}.zip'.format(
            manifest['dublin_core']['language']['identifier'],
            manifest['dublin_core']['identifier'].split('-')[-1],
            manifest['dublin_core']['version'], zip_name)
        url = '{}/{}'.format(self.cdn_url, resource_key)

        file_info = {
            'size': stats.st_size,
            'modified': self.timestamp,
            'format': 'application/zip; type={0} content={1} conformsto={2}'.format(
                manifest['dublin_core']['type'],
                manifest['dublin_core']['format'],
                manifest['dublin_core']['conformsto']),
            'url': url,
            'signature': ""
        }
        manifest['formats'] = [file_info]

        uploads = [{
            'key': self.make_upload_key(resource_key),
            'path': self.repo_file
        }]

        # split usfm bundles
        if (manifest['dublin_core']['type'] == 'bundle'
                and manifest['dublin_core']['format'] == 'text/usfm'):
            for project in manifest['projects']:
                pid = self.sanitize_identifier(project['identifier'])
                if 'formats' not in project:
                    project['formats'] = []
                resource_id = manifest['dublin_core']['identifier'].split(
                    '-')[-1]
                project_key = '{}/{}/v{}/{}.usfm'.format(
                    manifest['dublin_core']['language']['identifier'],
                    resource_id, manifest['dublin_core']['version'], pid)
                project_url = '{}/{}'.format(self.cdn_url, project_key)
                p_file_path = os.path.join(self.repo_dir,
                                           project['path'].lstrip('./'))
                p_stats = os.stat(p_file_path)
                try:
                    resource_mtime = str_to_timestamp(
                        manifest['dublin_core']['modified'])
                except Exception as e:
                    self.logger.warning('Invalid datetime detected: {}'.format(
                        e.message))
                    resource_mtime = manifest['dublin_core']['modified']
                project['formats'].append({
                    'format': 'text/usfm',
                    'modified': resource_mtime,
                    'signature': '',
                    'size': p_stats.st_size,
                    'url': project_url
                })
                uploads.append({
                    'key': self.make_upload_key(project_key),
                    'path': p_file_path
                })

        # add media to projects
        for project in manifest['projects']:
            pid = self.sanitize_identifier(project['identifier'])
            if pid in project_formats:
                if 'formats' not in project:
                    project['formats'] = []
                project['formats'] = project['formats'] + project_formats[pid]

        # add media to resource
        manifest['formats'] = manifest['formats'] + resource_formats

        # add html format
        # TRICKY: these URLs are only available in prod
        # for project in manifest['projects']:
        #     pid = self.sanitize_identifier(project['identifier'])
        #     html_url = ''
        #     if manifest['dublin_core']['identifier'] == 'obs':
        #         # obs html
        #         html_url = 'https://api.door43.org/tx/print?id={}/{}/{}'.format(self.gogs_org, self.repo_name, self.commit_id)
        #     elif manifest['dublin_core']['identifier'] == 'ta':
        #         # ta html
        #         sort_slug = '{}'.format(int(project['sort']) + 1).zfill(2)
        #         html_url = 'https://cdn.door43.org/u/Door43-Catalog/{}/{}/{}-{}.html'.format(self.repo_name, self.commit_id, sort_slug, pid)
        #     elif manifest['dublin_core']['identifier'] not in ['tq', 'tn', 'tw', 'obs-tn', 'obs-tq']:
        #         # we also have html for Bible resources
        #         name, _ = os.path.splitext(os.path.basename(project['path']))
        #         html_url = 'https://cdn.door43.org/u/Door43-Catalog/{}/{}/{}.html'.format(self.repo_name, self.commit_id, name)
        #
        #     if html_url and url_exists(html_url):
        #         self.logger.info('Injecting {} html url: {}'.format(manifest['dublin_core']['identifier'], html_url))
        #         if 'formats' not in project: project['formats'] = []
        #         project['formats'].append({
        #             'format': 'text/html',
        #             'modified': '',
        #             'signature': '',
        #             'size': '',
        #             'url': html_url,
        #             'build_rules': [
        #                 'signing.html_format'
        #             ]
        #         })
        #     else:
        #         self.logger.warning('Missing html format for {}_{} at {}'.format(self.repo_name, pid, html_url))

        return {
            'repo_name': self.repo_name,
            'commit_id': self.commit_id,
            'language': manifest['dublin_core']['language']['identifier'],
            'timestamp': self.timestamp,
            'added_at': arrow.utcnow().isoformat(),
            'package': json.dumps(manifest, sort_keys=True),
            'signed': False,
            'dirty': False,
            'uploads': uploads
        }

    def _listChapters(self, rc_dir, manifest):
        """
        Builds a dictionary of chapter ids for each project
        :param rc_dir:
        :param manifest:
        :return:
        """
        chapters = {}
        if manifest['dublin_core']['type'] == 'book':
            for project in manifest['projects']:
                pid = self.sanitize_identifier(project['identifier'])
                project_path = os.path.normpath(
                    os.path.join(rc_dir, project['path']))
                files = os.listdir(project_path)
                for chapter in files:
                    if chapter in [
                            '.', '..', 'toc.yaml', 'config.yaml', 'back',
                            'front'
                    ]:
                        continue
                    chapter = chapter.split('.')[0]
                    if pid not in chapters:
                        chapters[pid] = []
                    chapters[pid].append(chapter)
        else:
            rc_id = '_'.join([
                manifest['dublin_core']['language']['identifier'],
                manifest['dublin_core']['identifier'],
                manifest['dublin_core']['type']
            ])
            self.logger.warning(
                'Failed to generate media chapters. Only book RCs are currently supported. {}'
                .format(rc_id))
        return chapters

    def _build_versification(self):
        """
        DEPRECATED

        We are no longer processing versification.
        :return:
        """
        bible_dir = os.path.join(self.repo_dir, 'bible')
        versification_dirs = os.listdir(bible_dir)
        books = {}
        package = []
        uploads = []

        # group by project
        for vrs_dir in versification_dirs:
            vrs_id = os.path.basename(vrs_dir)
            book_files = sorted(
                glob(os.path.join(bible_dir, vrs_dir, 'chunks', '*.json')))
            for b in book_files:
                self.logger.debug('Reading "{}" versification for "{}"'.format(
                    vrs_id, b))
                b_id = os.path.splitext(os.path.basename(b))[0]
                try:
                    book_vrs = json.loads(read_file(b))
                except Exception as e:
                    raise Exception, Exception(
                        'Bad JSON: {0}'.format(e)), sys.exc_info()[2]
                book = WebhookHandler.retrieve_or_make(books, b_id, {
                    'identifier': b_id,
                    'chunks_url': '{0}/bible/{1}/{2}/v{3}/chunks.json'.format(
                        self.cdn_url, vrs_id, b_id, self.api_version),
                    'chunks': {}
                })
                book['chunks'][vrs_id] = book_vrs
        temp_dir = os.path.join(self.temp_dir, 'versification')
        if not os.path.isdir(temp_dir):
            os.mkdir(temp_dir)
        for book in books:
            book = books[book]

            # write chunks
            chunk_file = os.path.join(temp_dir, book['identifier'] + '.json')
            write_file(chunk_file, json.dumps(book['chunks'], sort_keys=True))
            # for now we bypass signing and upload chunks directly
            upload_key = 'bible/{}/v{}/chunks.json'.format(
                book['identifier'], self.api_version)
            uploads.append({'key': upload_key, 'path': chunk_file})

            # build package
            del book['chunks']
            package.append(book)

        return {
            'repo_name': self.repo_name,
            'commit_id': self.commit_id,
            'timestamp': self.timestamp,
            'package': json.dumps(package, sort_keys=True),
            'uploads': uploads,
            'dirty': False
        }

    def _build_localization(self):
        """
        Builds the localization for various components in the catalog
        :return:
        """
        files = sorted(glob(os.path.join(self.repo_dir, '*.json')))
        localization = {}
        for f in files:
            self.logger.debug("Reading {0}...".format(f))
            language = os.path.splitext(os.path.basename(f))[0]
            try:
                localization[language] = json.loads(read_file(f))
            except Exception as e:
                raise Exception('Bad JSON: {0}'.format(e))
        return {
            'repo_name': self.repo_name,
            'commit_id': self.commit_id,
            'timestamp': self.timestamp,
            'package': json.dumps(localization, sort_keys=True),
            'dirty': False
        }

    def _build_catalogs(self):
        """
        Builds the global catalogs
        :return:
        """
        catalogs_path = os.path.join(self.repo_dir, 'catalogs.json')
        package = read_file(catalogs_path)
        return {
            'repo_name': self.repo_name,
            'commit_id': self.commit_id,
            'timestamp': self.timestamp,
            'package': package,
            'dirty': False
        }

    def make_upload_key(self, path):
        """
        Generates an upload key that conforms to the format `temp/<repo_name>/<commit>/<path>`.
        This allows further processing to associate files with an entry in DynamoDB.
        :param path:
        :return:
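        Example (hypothetical values): with repo_name 'en_obs' and commit_id
        'abc123def4', make_upload_key('en/obs/v4/obs.zip') returns
        'temp/en_obs/abc123def4/en/obs/v4/obs.zip'.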
        """
        return 'temp/{0}/{1}/{2}'.format(self.repo_name, self.commit_id, path)

    @staticmethod
    def retrieve_or_make(dictionary, key, default=None):
        """
        Retrieves a value from a dictionary.
        If the key does not exist it will be created with the default value
        :param dict dictionary:
        :param any key:
        :param default:
        :return:
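
        In effect this mirrors dict.setdefault(key, default).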
        """
        if key not in dictionary:
            dictionary[key] = default
        return dictionary[key]

    @staticmethod
    def load_yaml_file(file_name, default=None):
        """
        Deserializes <file_name> into a Python object
        :param str|unicode file_name: The name of the file to read
        :param default: The value to return if the file is not found
        """
        if not os.path.isfile(file_name):
            return default

        # use utf-8-sig in case the file has a Byte Order Mark
        with codecs.open(file_name, 'r', 'utf-8-sig') as stream:
            return yaml.load(stream)

    def get_url(self, url):
        return get_url(url)

    def download_repo(self, commit_url, repo_file):
        repo_zip_url = commit_url.replace('commit', 'archive') + '.zip'
        self.logger.debug('Downloading {0}...'.format(repo_zip_url))
        if not os.path.isfile(repo_file):
            self.download_file(repo_zip_url, repo_file)

    def unzip_repo_file(self, repo_file, repo_dir):
        self.logger.debug('Unzipping {0}...'.format(repo_file))
        unzip(repo_file, repo_dir)