def test_processing_hbo(self):
    """
    Test downloading and processing some hebrew.

    Currently disabled: it requires network access to cdn.door43.org and
    a large download.  Reported as skipped (rather than silently passing
    via a bare ``return``) so the suite shows its true status.
    :return:
    """
    self.skipTest('disabled: requires network access to cdn.door43.org')

    # --- unreachable until the skipTest above is removed ---
    rc_dir = download_rc('hbo', 'uhb',
                         'https://cdn.door43.org/hbo/uhb/v2.1.1/uhb.zip',
                         self.temp_dir)
    manifest = yaml.load(read_file(os.path.join(rc_dir, 'manifest.yaml')))
    usx_dir = os.path.join(rc_dir, 'usx')
    for project in manifest['projects']:
        pid = project['identifier']

        # copy usfm project file
        usfm_dir = os.path.join(self.temp_dir, 'usfm')
        if not os.path.exists(usfm_dir):
            os.makedirs(usfm_dir)
        usfm_dest_file = os.path.normpath(
            os.path.join(usfm_dir, project['path']))
        usfm_src_file = os.path.normpath(
            os.path.join(rc_dir, project['path']))
        shutil.copyfile(usfm_src_file, usfm_dest_file)

        # transform usfm to usx
        build_usx(usfm_dir, usx_dir)

        # clean up converted usfm file
        remove(usfm_dest_file, True)

        # convert USX to JSON
        path = os.path.normpath(
            os.path.join(usx_dir, '{}.usx'.format(pid.upper())))
        source = build_json_source_from_usx(path, 'hbo', pid, '2019')
def test_transform_usfm_with_word_data(self, mock_reporter):
    """
    Converts the usfm fixtures to usx and compares the 1JN output with
    the expected usx fixture.

    The mkdtemp output directory is now removed in a ``finally`` block;
    previously it leaked on every run.
    """
    mockS3 = MockS3Handler()
    mockS3._load_path(os.path.join(self.resources_dir, 'usfm_sources'))
    usx_dir = tempfile.mkdtemp('-usx_output')
    try:
        build_usx(mockS3.temp_dir, usx_dir)

        expected_usx_file = os.path.join(self.resources_dir,
                                         'expected_usx/1JN.usx')
        out_file = os.path.join(usx_dir, '1JN.usx')
        expected_usx = read_file(expected_usx_file)
        output = read_file(out_file)
        self.assertEqual(expected_usx, output)
    finally:
        # always clean up the scratch directory, even on failure
        shutil.rmtree(usx_dir, ignore_errors=True)
def test_build_usx(self, mock_reporter):
    """Run the usfm-to-usx conversion over the fixtures and assert no errors were reported."""
    source_dir = os.path.join(self.resources_dir, 'usfm')
    output_dir = os.path.join(self.temp_dir, 'usx')
    build_usx(source_dir, output_dir)
    # TODO: evaluate output
    assert not mock_reporter.called
def _process_usfm(self, lid, rid, resource, format, temp_dir):
    """
    Converts a USFM bundle into usx, loads the data into json and uploads
    the json source to the cdn, recording progress in ``self.status``.

    Nothing is returned; the work product is the uploaded source.json and
    the updated processing status persisted via ``self.db_handler``.

    :param lid: language identifier
    :param rid: resource identifier
    :param resource: resource record; only ``resource['version']`` is read here
    :param format: format record; ``'format'``, ``'url'`` and ``'modified'``
        keys are read
    :param temp_dir: scratch directory for the download and per-project
        usfm staging directories
    :return: None
    """
    format_str = format['format']
    # only zipped usfm bundles are handled by this method
    if 'application/zip' in format_str and 'usfm' in format_str:
        self.logger.debug('Downloading {}'.format(format['url']))
        rc_dir = download_rc(lid, rid, format['url'], temp_dir,
                             self.download_file)
        if not rc_dir:
            # download failed; nothing to process
            return

        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # and unsafe on untrusted input — consider yaml.safe_load.
        manifest = yaml.load(
            read_file(os.path.join(rc_dir, 'manifest.yaml')))
        usx_dir = os.path.join(rc_dir, 'usx')
        for project in manifest['projects']:
            pid = TsV2CatalogHandler.sanitize_identifier(
                project['identifier'])
            # pid is project identifier, lid is language id, rid is resource id
            process_id = '_'.join([lid, rid, pid])

            if process_id not in self.status['processed']:
                self.logger.debug(
                    'Processing usfm for {}'.format(process_id))

                # copy usfm project file into a per-project staging
                # directory so build_usx sees only this project's file
                usfm_dir = os.path.join(temp_dir,
                                        '{}_usfm'.format(process_id))
                if not os.path.exists(usfm_dir):
                    os.makedirs(usfm_dir)
                usfm_dest_file = os.path.normpath(
                    os.path.join(usfm_dir, project['path']))
                usfm_src_file = os.path.normpath(
                    os.path.join(rc_dir, project['path']))

                # oversized usfm files are skipped rather than converted
                if os.path.getsize(usfm_src_file) < self.max_usfm_size:
                    shutil.copyfile(usfm_src_file, usfm_dest_file)

                    # transform usfm to usx
                    build_usx(usfm_dir, usx_dir, self.logger)

                    # convert USX to JSON
                    path = os.path.normpath(
                        os.path.join(usx_dir,
                                     '{}.usx'.format(pid.upper())))
                    source = build_json_source_from_usx(
                        path, format['modified'], self)
                    upload = prep_data_upload(
                        '{}/{}/{}/v{}/source.json'.format(
                            pid, lid, rid, resource['version']),
                        source['source'], temp_dir)
                    self.logger.debug('Uploading {}/{}/{}'.format(
                        self.cdn_bucket,
                        TsV2CatalogHandler.cdn_root_path,
                        upload['key']))
                    self.cdn_handler.upload_file(
                        upload['path'],
                        '{}/{}'.format(TsV2CatalogHandler.cdn_root_path,
                                       upload['key']))

                    # empty list marks this project as fully processed
                    self.status['processed'][process_id] = []
                else:
                    self.logger.warn(
                        "Skipping {} because it is too big".format(
                            process_id))
                    self.status['processed'][process_id] = ['skipped']

                # checkpoint progress after each project so an interrupted
                # run can resume without redoing completed projects
                self.status['timestamp'] = time.strftime(
                    "%Y-%m-%dT%H:%M:%SZ")
                self.db_handler.update_item(
                    {'api_version': TsV2CatalogHandler.api_version},
                    self.status)
            else:
                self.logger.debug(
                    'USFM for {} has already been processed'.format(
                        process_id))

        # clean up download
        remove_tree(rc_dir, True)