Example #1
0
    def __parse_pull_request(self, payload):
        """
        Reads repository and merge-commit details out of a pull request
        payload and stores them on this handler.

        Sets `repo_owner`, `repo_name`, `temp_dir`, `repo_file`, `repo_dir`,
        `timestamp`, `commit_url` and `commit_id`. A temporary directory is
        created on disk as a side effect.

        NOTE(review): earlier documentation promised a boolean return value
        ("True if the pull request should be processed"), but nothing is
        returned by this code - confirm what callers expect.

        :param payload: the webhook payload; its `pull_request` and
            `repository` entries are read
        :return: None
        """

        pr_data = self.retrieve(payload, 'pull_request', 'payload')

        repo_info = payload['repository']
        self.repo_owner = repo_info['owner']['username']
        self.repo_name = repo_info['name']

        # working area that will hold the repository archive
        self.temp_dir = tempfile.mkdtemp(
            suffix='', prefix=self.repo_name, dir=None)
        archive_name = self.repo_name + '.zip'
        self.repo_file = os.path.join(self.temp_dir, archive_name)
        # TRICKY: gogs gives a lower case name to the folder in the zip archive
        extracted_name = self.repo_name.lower()
        self.repo_dir = os.path.join(self.temp_dir, extracted_name)

        merge_sha = self.retrieve(pr_data, 'merge_commit_sha', 'pull_request')
        merged_at = self.retrieve(pr_data, 'merged_at', 'pull_request')
        self.timestamp = str_to_timestamp(merged_at)

        repo_data = self.retrieve(payload, 'repository', 'payload')
        base_url = self.retrieve(repo_data, 'html_url',
                                 'repository').rstrip('/')
        self.commit_url = '{}/commit/{}'.format(base_url, merge_sha)

        # the short commit id is the first 10 characters of the merge sha
        self.commit_id = merge_sha[:10] if merge_sha else None
Example #2
0
    def __parse_push(self, payload):
        """
        Parses a regular push commit
        :param payload:
        :return:
        """
        self.repo_owner = payload['repository']['owner']['username']
        self.repo_name = payload['repository']['name']
        self.temp_dir = tempfile.mkdtemp('', self.repo_name, None)
        self.repo_file = os.path.join(self.temp_dir, self.repo_name + '.zip')
        # TRICKY: gogs gives a lower case name to the folder in the zip archive
        self.repo_dir = os.path.join(self.temp_dir, self.repo_name.lower())

        self.commit_id = payload['after']
        commit = None
        for commit in payload['commits']:
            if commit['id'] == self.commit_id:
                break
        self.commit_url = commit['url']
        self.timestamp = str_to_timestamp(commit['timestamp'])
        self.commit_id = self.commit_id[:10]
Example #3
0
    def process_format(self, item, dublin_core, project, format):
        """
        Performs the signing on the format object.
        Files outside of the cdn will not be signed.

        On success the format object is updated in place with its
        `signature`, `size`, `modified` (only when not already set), `length`
        (mp3/mp4 only) and `format` (only when missing) values, and the signed
        file plus its signature are uploaded through the cdn handler.

        :param item: the catalog item being signed; `repo_name` and
            `commit_id` are read from it
        :param dublin_core: the resource's dublin core; used to build the
            html key when the `html_format` build rule is present
        :param project: this may be None. When given, its identifier names
            the html file instead of the resource identifier.
        :param format: the format object to sign; must contain `url`
        :return: (already_signed, newly_signed)
            (True, False)  - the format already had a signature
            (True, True)   - the url is outside of the cdn; the signature is
                             set to an empty string
            (False, True)  - the file was signed, or a manually uploaded
                             signature was found for an oversized file
            (False, False) - signing failed or must be retried later
        """
        if 'signature' in format and format['signature']:
            return (True, False)
        self.logger.debug('Signing {}'.format(format['url']))

        base_name = os.path.basename(format['url'])
        file_to_sign = os.path.join(self.temp_dir, base_name)

        # extract cdn key from url
        url_info = urlparse.urlparse(format['url'])
        src_key = url_info.path.lstrip('/')
        sig_key = '{}.sig'.format(src_key)

        build_rules = get_build_rules(format, 'signing')

        stage_prefix = self.stage_prefix()

        def strip_stage_prefix(bucket):
            # str.lstrip() removes a *set of characters*, not a prefix, so it
            # would also eat leading characters of the real bucket name
            # (e.g. 'dev-door43.org'.lstrip('dev-') == 'oor43.org').
            if stage_prefix and bucket.startswith(stage_prefix):
                return bucket[len(stage_prefix):]
            return bucket

        # TRICKY: allow dev environments to download from prod environment
        # RS: I added the s3 bucket here because it isn't yet accessible via urls
        valid_hosts = [
            self.cdn_bucket, self.cdn_bucket + ".s3.us-east-2.amazonaws.com"
        ]
        if stage_prefix:
            if not self.cdn_bucket.startswith(stage_prefix):
                self.logger.warning(
                    'Expected `cdn_bucket` to begin with the stage prefix ({}) but found {}'
                    .format(stage_prefix, self.cdn_bucket))
            valid_hosts.append(strip_stage_prefix(self.cdn_bucket))
            # TRICKY: force dev environments to handle prod content as external files
            # if format['url'].startswith(prod_cdn_url):
            #     build_rules.append('sign_given_url')

        # TRICKY: some html content is on the api
        if 'html_format' in build_rules:
            valid_hosts.append(self.api_bucket)
            valid_hosts.append(strip_stage_prefix(self.api_bucket))

        # verify url is on the cdn
        if url_info.hostname not in valid_hosts:
            # TODO: external media should be imported if it's not too big
            # This allows media to be hosted on third party servers
            format['signature'] = ''  #'{}.sig'.format(format['url'])
            self.logger.warning(
                'cannot sign files outside of the cdn: {}'.format(
                    format['url']))
            self.logger.warning('valid hosts are: {}'.format(
                ", ".join(valid_hosts)))
            return (True, True)

        try:
            headers = self.url_headers(format['url'])
        except Exception as e:
            self.report_error('Could not read headers from {}: {}'.format(
                format['url'], e))
            return (False, False)

        # skip files that are too large
        size = int(headers.get('content-length', 0))
        if size > SigningHandler.max_file_size:
            sig_url = '{}.sig'.format(format['url'])
            if not self._safe_url_exists(sig_url):
                # wait for signature to be manually uploaded
                self.report_error('File is too large to sign {}'.format(
                    format['url']))
                return (False, False)

            # finish with manually uploaded signature
            format['size'] = size
            if not format.get('modified'):
                format['modified'] = str_to_timestamp(
                    datetime.datetime.now().isoformat())
            format['signature'] = sig_url
            return (False, True)

        # download file
        try:
            if 'sign_given_url' in build_rules or 'html_format' in build_rules:
                # report error if response is 400+
                if headers.status >= 400:
                    self.report_error('Resource not available at {}'.format(
                        format['url']))
                    return (False, False)

                self.download_file(format['url'], file_to_sign)
            else:
                # TRICKY: most files to be signed are stored in a temp directory
                src_temp_key = 'temp/{}/{}/{}'.format(item['repo_name'],
                                                      item['commit_id'],
                                                      src_key)
                self.cdn_handler.download_file(src_temp_key, file_to_sign)
        except Exception as e:
            self.report_error(
                'The file "{}" could not be downloaded: {}'.format(
                    base_name, e))
            return (False, False)

        # strip print script from html
        if 'html_format' in build_rules:
            self.logger.debug('Removing print script from {} html'.format(
                item['repo_name']))
            self._strip_print_script(file_to_sign)

        # sign file
        sig_file = self.signer.sign_file(file_to_sign)
        try:
            self.signer.verify_signature(file_to_sign, sig_file)
        except RuntimeError:
            if self.logger:
                self.logger.warning(
                    'The signature was not successfully verified.')
            return (False, False)

        # TRICKY: re-format html urls
        if 'html_format' in build_rules:
            html_name = dublin_core['identifier']
            if project:
                html_name = project['identifier']
            src_key = '{}/{}/v{}/media/html/{}.html'.format(
                dublin_core['language']['identifier'],
                dublin_core['identifier'], self.api_version, html_name)
            sig_key = '{}.sig'.format(src_key)
            format['url'] = '{}/{}'.format(self.cdn_url, src_key)

        # upload files
        if 'sign_given_url' not in build_rules or 'html_format' in build_rules:
            # TRICKY: upload temp files to production
            self.cdn_handler.upload_file(file_to_sign, src_key)
        self.cdn_handler.upload_file(sig_file, sig_key)

        # add the url of the sig file to the format
        format['signature'] = '{}.sig'.format(format['url'])

        # read modified date from file
        stats = os.stat(file_to_sign)
        if not format.get('modified'):
            modified = headers.get('last-modified')
            if modified:
                # TRICKY: http header gives an odd date format
                date = datetime.datetime.strptime(modified,
                                                  '%a, %d %b %Y %H:%M:%S %Z')
                modified = str_to_timestamp(date.isoformat())
            else:
                modified = unix_to_timestamp(stats.st_mtime)
            format['modified'] = modified
        format['size'] = stats.st_size

        # retrieve playback time from multimedia files
        _, ext = os.path.splitext(file_to_sign)
        if ext == '.mp3':
            audio = MP3(file_to_sign)
            format['length'] = audio.info.length
        elif ext == '.mp4':
            video = MP4(file_to_sign)
            format['length'] = video.info.length

        # add file format if missing
        if 'format' not in format or not format['format']:
            try:
                mime = ext_to_mime(ext)
                format['format'] = mime
            except Exception as e:
                if self.logger:
                    # `e.message` does not exist on Python 3 and is empty for
                    # multi-argument exceptions; format the exception itself.
                    self.logger.error('{}'.format(e))

        # clean up disk space
        os.remove(file_to_sign)

        return (False, True)
Example #4
0
    def _build_rc(self):
        """
        Builds a Resource Container following the RC0.2 spec
        :return:
        """
        manifest_path = os.path.join(self.repo_dir, 'manifest.yaml')
        if not os.path.isfile(manifest_path):
            raise Exception(
                'Repository {0} does not have a manifest.yaml file'.format(
                    self.repo_name))
        try:
            manifest = WebhookHandler.load_yaml_file(manifest_path)
        except Exception as e:
            raise Exception('Bad Manifest: {0}'.format(e))

        try:
            ConsistencyChecker.check_manifest(manifest)
        except Exception as e:
            raise Exception('Bad Manifest: {0}'.format(e))

        # identifiers must be lowercase
        manifest['dublin_core']['identifier'] = self.sanitize_identifier(
            manifest['dublin_core']['identifier'])
        # resource version must be string
        manifest['dublin_core']['version'] = '{}'.format(
            manifest['dublin_core']['version'])

        # build media formats
        media_path = os.path.join(self.repo_dir, 'media.yaml')
        resource_formats = []
        project_formats = {}
        if os.path.isfile(media_path):
            try:
                media = WebhookHandler.load_yaml_file(media_path)
            except Exception as e:
                raise Exception('Bad Media: {0}'.format(e))
            project_chapters = self._listChapters(self.repo_dir, manifest)
            try:
                resource_formats, project_formats = parse_media(
                    media=media,
                    content_version=manifest['dublin_core']['version'],
                    project_chapters=project_chapters)
            except Exception as e:
                self.report_error('Failed to parse media in {}. {}'.format(
                    self.repo_name, e.message))

        stats = os.stat(self.repo_file)

        # normalize dates
        try:
            manifest['dublin_core']['modified'] = str_to_timestamp(
                manifest['dublin_core']['modified'])
        except Exception as e:
            self.logger.warning('Invalid datetime detected: {}'.format(
                e.message))
        try:
            manifest['dublin_core']['issued'] = str_to_timestamp(
                manifest['dublin_core']['issued'])
        except Exception as e:
            self.logger.warning('Invalid datetime detected: {}'.format(
                e.message))

        # TRICKY: single-project RCs get named after the project to avoid conflicts with multi-project RCs.
        if len(manifest['projects']) == 1:
            zip_name = manifest['projects'][0]['identifier'].lower()
        else:
            zip_name = manifest['dublin_core']['identifier']

        resource_key = '{}/{}/v{}/{}.zip'.format(
            manifest['dublin_core']['language']['identifier'],
            manifest['dublin_core']['identifier'].split('-')[-1],
            manifest['dublin_core']['version'], zip_name)
        url = '{}/{}'.format(self.cdn_url, resource_key)

        file_info = {
            'size':
            stats.st_size,
            'modified':
            self.timestamp,
            'format':
            'application/zip; type={0} content={1} conformsto={2}'.format(
                manifest['dublin_core']['type'],
                manifest['dublin_core']['format'],
                manifest['dublin_core']['conformsto']),
            'url':
            url,
            'signature':
            ""
        }
        manifest['formats'] = [file_info]

        uploads = [{
            'key': self.make_upload_key(resource_key),
            'path': self.repo_file
        }]

        # split usfm bundles
        if manifest['dublin_core']['type'] == 'bundle' and manifest[
                'dublin_core']['format'] == 'text/usfm':
            for project in manifest['projects']:
                pid = self.sanitize_identifier(project['identifier'])
                if 'formats' not in project:
                    project['formats'] = []
                resource_id = manifest['dublin_core']['identifier'].split(
                    '-')[-1]
                project_key = '{}/{}/v{}/{}.usfm'.format(
                    manifest['dublin_core']['language']['identifier'],
                    resource_id, manifest['dublin_core']['version'], pid)
                project_url = '{}/{}'.format(self.cdn_url, project_key)
                p_file_path = os.path.join(self.repo_dir,
                                           project['path'].lstrip('\.\/'))
                p_stats = os.stat(p_file_path)
                try:
                    resource_mtime = str_to_timestamp(
                        manifest['dublin_core']['modified'])
                except Exception as e:
                    self.logger.warning('Invalid datetime detected: {}'.format(
                        e.message))
                    resource_mtime = manifest['dublin_core']['modified']
                project['formats'].append({
                    'format': 'text/usfm',
                    'modified': resource_mtime,
                    'signature': '',
                    'size': p_stats.st_size,
                    'url': project_url
                })
                uploads.append({
                    'key': self.make_upload_key(project_key),
                    'path': p_file_path
                })

        # add media to projects
        for project in manifest['projects']:
            pid = self.sanitize_identifier(project['identifier'])
            if pid in project_formats:
                if 'formats' not in project: project['formats'] = []
                project['formats'] = project['formats'] + project_formats[pid]

        # add media to resource
        manifest['formats'] = manifest['formats'] + resource_formats

        # add html format
        # TRICKY: these URLS are only available in prod
        # for project in manifest['projects']:
        #     pid = self.sanitize_identifier(project['identifier'])
        #     html_url = ''
        #     if manifest['dublin_core']['identifier'] == 'obs':
        #         # obs html
        #         html_url = 'https://api.door43.org/tx/print?id={}/{}/{}'.format(self.gogs_org, self.repo_name, self.commit_id)
        #     elif manifest['dublin_core']['identifier'] == 'ta':
        #         # ta html
        #         sort_slug = '{}'.format(int(project['sort']) + 1).zfill(2)
        #         html_url = 'https://cdn.door43.org/u/Door43-Catalog/{}/{}/{}-{}.html'.format(self.repo_name, self.commit_id, sort_slug, pid)
        #     elif manifest['dublin_core']['identifier'] not in ['tq', 'tn', 'tw', 'obs-tn', 'obs-tq']:
        #         # we also have html for Bible resources
        #         name, _ = os.path.splitext(os.path.basename(project['path']))
        #         html_url = 'https://cdn.door43.org/u/Door43-Catalog/{}/{}/{}.html'.format(self.repo_name, self.commit_id, name)
        #
        #     if html_url and url_exists(html_url):
        #         self.logger.info('Injecting {} html url: {}'.format(manifest['dublin_core']['identifier'], html_url))
        #         if 'formats' not in project: project['formats'] = []
        #         project['formats'].append({
        #             'format': 'text/html',
        #             'modified': '',
        #             'signature': '',
        #             'size': '',
        #             'url': html_url,
        #             'build_rules': [
        #                 'signing.html_format'
        #             ]
        #         })
        #     else:
        #         self.logger.warning('Missing html format for {}_{} at {}'.format(self.repo_name, pid, html_url))

        return {
            'repo_name': self.repo_name,
            'commit_id': self.commit_id,
            'language': manifest['dublin_core']['language']['identifier'],
            'timestamp': self.timestamp,
            'added_at': arrow.utcnow().isoformat(),
            'package': json.dumps(manifest, sort_keys=True),
            'signed': False,
            'dirty': False,
            'uploads': uploads
        }