def test_processing_hbo(self):
        """
        Test downloading and processing the Hebrew (hbo) UHB resource
        container: each project listed in the manifest is copied, converted
        from USFM to USX and then loaded as JSON.

        NOTE: the bare ``return`` right below this docstring short-circuits
        the test, so none of the remaining statements currently execute.
        :return:
        """
        return
        resource_url = 'https://cdn.door43.org/hbo/uhb/v2.1.1/uhb.zip'
        rc_dir = download_rc('hbo', 'uhb', resource_url, self.temp_dir)

        manifest_path = os.path.join(rc_dir, 'manifest.yaml')
        manifest = yaml.load(read_file(manifest_path))
        usx_dir = os.path.join(rc_dir, 'usx')
        for project in manifest['projects']:
            book_id = project['identifier']

            # stage the project's usfm file in a scratch directory
            usfm_dir = os.path.join(self.temp_dir, 'usfm')
            if not os.path.exists(usfm_dir):
                os.makedirs(usfm_dir)
            staged_usfm = os.path.normpath(
                os.path.join(usfm_dir, project['path']))
            original_usfm = os.path.normpath(
                os.path.join(rc_dir, project['path']))
            shutil.copyfile(original_usfm, staged_usfm)

            # usfm -> usx
            build_usx(usfm_dir, usx_dir)

            # drop the staged copy once it has been converted
            remove(staged_usfm, True)

            # usx -> json
            usx_name = '{}.usx'.format(book_id.upper())
            usx_path = os.path.normpath(os.path.join(usx_dir, usx_name))
            source = build_json_source_from_usx(
                usx_path, 'hbo', book_id, '2019')
Example #2
0
    def test_transform_usfm_with_word_data(self, mock_reporter):
        """
        Convert the 'usfm_sources' test resources to USX and compare the
        generated 1JN.usx with the expected fixture in 'expected_usx'.

        :param mock_reporter: not used in the body; presumably injected by a
            patch decorator that sits outside this block (confirm).
        """
        # Local import keeps this method self-contained; shutil is only
        # needed to clean up the scratch directory.
        import shutil

        mock_s3 = MockS3Handler()
        mock_s3._load_path(os.path.join(self.resources_dir, 'usfm_sources'))

        usx_dir = tempfile.mkdtemp('-usx_output')
        # mkdtemp() never deletes what it creates. Register the cleanup
        # immediately so the directory is removed even if a later step or
        # the assertion fails.
        self.addCleanup(shutil.rmtree, usx_dir, ignore_errors=True)

        build_usx(mock_s3.temp_dir, usx_dir)
        expected_usx_file = os.path.join(self.resources_dir,
                                         'expected_usx/1JN.usx')
        out_file = os.path.join(usx_dir, '1JN.usx')

        expected_usx = read_file(expected_usx_file)
        output = read_file(out_file)
        self.assertEqual(expected_usx, output)
 def test_build_usx(self, mock_reporter):
     """
     Run build_usx from the 'usfm' resources into a 'usx' directory under
     the test's temp dir and check that the reporter mock was never called.
     """
     build_usx(
         os.path.join(self.resources_dir, 'usfm'),
         os.path.join(self.temp_dir, 'usx'),
     )
     # TODO: evaluate output
     assert not mock_reporter.called
Example #4
0
    def _process_usfm(self, lid, rid, resource, format, temp_dir):
        """
        Converts a zipped USFM bundle into usx, loads each project into json
        and uploads the result to the CDN.

        Formats whose ``format`` string does not mention both
        'application/zip' and 'usfm' are ignored. Projects already present in
        ``self.status['processed']`` are skipped. Each newly handled project
        is recorded there (``[]`` when uploaded, ``['skipped']`` when the
        source file is not smaller than ``self.max_usfm_size``) and the
        status is persisted through ``self.db_handler``.

        :param lid: language id
        :param rid: resource id
        :param resource: resource dict; ``resource['version']`` is used in
            the upload key
        :param format: format dict; reads ``format['format']``,
            ``format['url']`` and ``format['modified']``
        :param temp_dir: working directory for the download, the staged usfm
            files and the prepared upload
        :return: None
        """

        format_str = format['format']
        if 'application/zip' not in format_str or 'usfm' not in format_str:
            return

        self.logger.debug('Downloading {}'.format(format['url']))
        rc_dir = download_rc(lid, rid, format['url'], temp_dir,
                             self.download_file)
        if not rc_dir:
            return

        # NOTE(review): the manifest comes from a downloaded archive and is
        # parsed with yaml.load() without an explicit Loader. Consider
        # yaml.safe_load(); newer PyYAML releases require a Loader argument.
        manifest = yaml.load(
            read_file(os.path.join(rc_dir, 'manifest.yaml')))
        usx_dir = os.path.join(rc_dir, 'usx')
        for project in manifest['projects']:
            pid = TsV2CatalogHandler.sanitize_identifier(
                project['identifier'])
            # pid is project identifier, lid is language id, rid is resourceid
            process_id = '_'.join([lid, rid, pid])

            if process_id in self.status['processed']:
                self.logger.debug(
                    'USFM for {} has already been processed'.format(
                        process_id))
                continue

            self.logger.debug('Processing usfm for {}'.format(process_id))

            # copy usfm project file into a directory of its own so that
            # build_usx only sees this project
            usfm_dir = os.path.join(temp_dir, '{}_usfm'.format(process_id))
            if not os.path.exists(usfm_dir):
                os.makedirs(usfm_dir)
            usfm_dest_file = os.path.normpath(
                os.path.join(usfm_dir, project['path']))
            usfm_src_file = os.path.normpath(
                os.path.join(rc_dir, project['path']))

            if os.path.getsize(usfm_src_file) < self.max_usfm_size:
                shutil.copyfile(usfm_src_file, usfm_dest_file)

                # transform usfm to usx
                build_usx(usfm_dir, usx_dir, self.logger)

                # convert USX to JSON
                path = os.path.normpath(
                    os.path.join(usx_dir, '{}.usx'.format(pid.upper())))
                source = build_json_source_from_usx(
                    path, format['modified'], self)
                upload = prep_data_upload(
                    '{}/{}/{}/v{}/source.json'.format(
                        pid, lid, rid, resource['version']),
                    source['source'], temp_dir)
                self.logger.debug('Uploading {}/{}/{}'.format(
                    self.cdn_bucket, TsV2CatalogHandler.cdn_root_path,
                    upload['key']))
                self.cdn_handler.upload_file(
                    upload['path'],
                    '{}/{}'.format(TsV2CatalogHandler.cdn_root_path,
                                   upload['key']))

                self.status['processed'][process_id] = []
            else:
                # Logger.warn is a deprecated alias; warning() is the
                # documented method.
                self.logger.warning(
                    "Skipping {} because it is too big".format(process_id))
                self.status['processed'][process_id] = ['skipped']

            # The trailing 'Z' marks the value as UTC, but strftime() without
            # a time tuple formats local time, so format gmtime() explicitly.
            self.status['timestamp'] = time.strftime(
                "%Y-%m-%dT%H:%M:%SZ", time.gmtime())
            self.db_handler.update_item(
                {'api_version': TsV2CatalogHandler.api_version},
                self.status)

        # clean up download
        remove_tree(rc_dir, True)