def test_map_dir(self):
    rc = factory.load(os.path.join(self.resources_dir, 'tw_rc'))
    out_dir = os.path.join(self.temp_dir, 'mapped_usfm')
    maptwtousfm3.mapDir(os.path.join(self.resources_dir, 'usfm'), rc, out_dir)
    mapped_usfm = read_file(os.path.join(out_dir, '41-MAT.usfm'))
    expected_usfm = read_file(
        os.path.join(self.resources_dir, 'mapped_mat.usfm'))
    self.assertEqual(mapped_usfm, expected_usfm)
def test_strip_word_data_from_file(self):
    """
    This ensures we are correctly converting content to be used in the uW api.
    This content wasn't getting converted correctly in the past.
    :return:
    """
    input = read_file(os.path.join(self.resources_dir, 'apiv3_1ch.usfm'))
    expected = read_file(os.path.join(self.resources_dir, 'uwapi_1ch.usfm'))
    output = strip_word_data(input)
    self.assertEqual(expected, output)
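# Illustrative sketch (not part of the fixtures): strip_word_data removes the
# USFM3 \w ...\w* word-level attribute markup and keeps only the visible text,
# so a line like
#     \v 1 \w Adam|strong="H121"\w*, \w Seth|strong="H8352"\w*,
# should reduce to roughly
#     \v 1 Adam, Seth,
# The exact whitespace handling is an assumption; the authoritative
# input/output pairs are the apiv3_1ch.usfm / uwapi_1ch.usfm fixtures above.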
def test_usfm3_file_to_usfm2(self):
    """
    This ensures we are correctly converting a USFM 3 file into its USFM 2
    equivalent for use in the uW api. This content wasn't getting converted
    correctly in the past.
    :return:
    """
    input = read_file(os.path.join(self.resources_dir, 'fr_gen.usfm3'))
    expected = read_file(os.path.join(self.resources_dir, 'fr_gen.usfm2'))
    output = strip_word_data(input)
    self.assertEqual(expected, output)
def test_transform_usfm_with_word_data(self, mock_reporter):
    mockS3 = MockS3Handler()
    mockS3._load_path(os.path.join(self.resources_dir, 'usfm_sources'))
    usx_dir = tempfile.mkdtemp('-usx_output')
    build_usx(mockS3.temp_dir, usx_dir)
    expected_usx_file = os.path.join(self.resources_dir, 'expected_usx/1JN.usx')
    out_file = os.path.join(usx_dir, '1JN.usx')
    expected_usx = read_file(expected_usx_file)
    output = read_file(out_file)
    self.assertEqual(expected_usx, output)
def test_map_usfm_by_occurrence(self):
    usfm = read_file(os.path.join(self.resources_dir, 'usfm/41-MAT.usfm'))
    rc = factory.load(os.path.join(self.resources_dir, 'tw_rc'))
    words_index = maptwtousfm3.indexWordsLocation(rc)
    category_index = maptwtousfm3.indexWordsCategory(rc)
    mappedUSFM = maptwtousfm3.mapUSFMByOccurrence(
        usfm=usfm,
        words_rc=rc,
        words_index=words_index['occurrences'],
        words_category_index=category_index)
    expected_usfm = read_file(
        os.path.join(self.resources_dir, 'mapped_mat.usfm'))
    self.assertEqual(mappedUSFM, expected_usfm)
def test_map_usfm_by_global_search(self):
    usfm = read_file(os.path.join(self.resources_dir, 'usfm/41-MAT.usfm'))
    rc = factory.load(os.path.join(self.resources_dir, 'tw_rc'))
    locations_index = maptwtousfm3.indexWordsLocation(rc)
    strongs_index = maptwtousfm3.indexWordByStrongs(rc)
    category_index = maptwtousfm3.indexWordsCategory(rc)
    mappedUSFM = maptwtousfm3.mapUSFMByGlobalSearch(
        usfm=usfm,
        words_strongs_index=strongs_index,
        words_false_positives_index=locations_index['false_positives'],
        words_category_index=category_index)
    expected_usfm = read_file(
        os.path.join(self.resources_dir, 'mapped_mat_global.usfm'))
    self.assertEqual(mappedUSFM, expected_usfm)
def test_processing_hbo(self):
    """
    Test downloading and processing some Hebrew
    :return:
    """
    # TRICKY: this integration test downloads from the live CDN, so it is
    # disabled. Remove the early return below to run it manually.
    return
    rc_dir = download_rc('hbo', 'uhb',
                         'https://cdn.door43.org/hbo/uhb/v2.1.1/uhb.zip',
                         self.temp_dir)
    manifest = yaml.load(read_file(os.path.join(rc_dir, 'manifest.yaml')))
    usx_dir = os.path.join(rc_dir, 'usx')
    for project in manifest['projects']:
        pid = project['identifier']

        # copy usfm project file
        usfm_dir = os.path.join(self.temp_dir, 'usfm')
        if not os.path.exists(usfm_dir):
            os.makedirs(usfm_dir)
        usfm_dest_file = os.path.normpath(
            os.path.join(usfm_dir, project['path']))
        usfm_src_file = os.path.normpath(
            os.path.join(rc_dir, project['path']))
        shutil.copyfile(usfm_src_file, usfm_dest_file)

        # transform usfm to usx
        build_usx(usfm_dir, usx_dir)

        # clean up converted usfm file
        remove(usfm_dest_file, True)

        # convert USX to JSON
        path = os.path.normpath(
            os.path.join(usx_dir, '{}.usx'.format(pid.upper())))
        source = build_json_source_from_usx(path, 'hbo', pid, '2019')
def build_usx(usfm_dir, usx_dir):
    """
    Builds the usx from usfm after performing some custom processing
    :param usfm_dir:
    :param usx_dir:
    :return:
    """
    # strip word data
    files = os.listdir(usfm_dir)
    usfm2_dir = tempfile.mkdtemp(prefix='usfm2')
    try:
        for name in files:
            if name == '.DS_Store':
                continue
            f = os.path.join(usfm_dir, name)
            usfm3 = read_file(f)
            usfm2 = usfm3_to_usfm2(usfm3)
            out_f = os.path.join(usfm2_dir, name)
            write_file(out_f, usfm2)
        UsfmTransform.buildUSX(usfm2_dir, usx_dir, '', True)
    finally:
        try:
            shutil.rmtree(usfm2_dir)
        except OSError:
            # best-effort cleanup of the temp directory
            pass
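# Minimal usage sketch for the variant above (paths are hypothetical): every
# USFM3 file in the source directory is downgraded to USFM2 in a temp
# directory, and UsfmTransform then emits USX from the downgraded files.
#
#     build_usx('/tmp/uhb/usfm', '/tmp/uhb/usx')
#     # expected result: one .usx file per book in /tmp/uhb/usx,
#     # e.g. GEN.usx (the exact naming comes from UsfmTransform)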
def _process_usfm(self, format):
    url = format['url']
    usfm_file = os.path.join(self.temp_dir, md5(url).hexdigest())
    self.download_file(url, usfm_file)
    usfm = read_file(usfm_file)
    # strip word data, normalize chunk markers, then drop unknown markers
    return remove_unknown_markers(
        convert_chunk_markers(strip_word_data(usfm)))
def test_strip_word_data_large_string(self):
    input = u'''\\id 1CH
\\h PREMIER LIVRE DES CHRONIQUES
\\toc1 PREMIER LIVRE DES CHRONIQUES
\\toc2 1 Chroniques
\\toc3 1 Ch
\\mt1 LES LIVRES DES CHRONIQUES
\\mt1 PREMIER LIVRE DES CHRONIQUES
\\s5
\\c 1
\\p
\\v 1 \\w Adam|strong="H121"\\w*, \\w Seth|strong="H8352"\\w*, \\w Énosch|strong="H583"\\w*,
\\v 2 \\w Kénan|strong="H7018"\\w*, \\w Mahalaleel|strong="H4111"\\w*, \\w Jéred|strong="H3382"\\w*,
\\v 3 \\w Hénoc|strong="H2585"\\w*, \\w Metuschélah|strong="H4968"\\w*, \\w Lémec|strong="H3929"\\w*,
\\v 4 \\w Noé|strong="H5146"\\w*, \\w Sem|strong="H8035"\\w*, \\w Cham|strong="H2526"\\w* et \\w Japhet|strong="H3315"\\w*.
\\s5
\\v 5 \\w Fils|strong="H1121"\\w* de \\w Japhet|strong="H3315"\\w*: \\w Gomer|strong="H1586"\\w*, \\w Magog|strong="H4031"\\w*, \\w Madaï|strong="H4074"\\w*, \\w Javan|strong="H3120"\\w*, \\w Tubal|strong="H8422"\\w*, \\w Méschec|strong="H4902"\\w* et \\w Tiras|strong="H8494"\\w*. -
\\v 6 \\w Fils|strong="H1121"\\w* de \\w Gomer|strong="H1586"\\w*: \\w Aschkenaz|strong="H813"\\w*, \\w Diphat|strong="H7384"\\w* et \\w Togarma|strong="H8425"\\w*. -
\\v 7 \\w Fils|strong="H1121"\\w* de \\w Javan|strong="H3120"\\w*: \\w Élischa|strong="H473"\\w*, \\w Tarsisa|strong="H8659"\\w*, \\w Kittim|strong="H3794"\\w* et \\w Rodanim|strong="H1721"\\w*.
\\s5
\\v 8 \\w Fils|strong="H1121"\\w* de \\w Cham|strong="H2526"\\w*: \\w Cusch|strong="H3568"\\w*, \\w Mitsraïm|strong="H4714"\\w*, \\w Puth|strong="H6316"\\w* et \\w Canaan|strong="H3667"\\w*. -
\\v 9 \\w Fils|strong="H1121"\\w* de \\w Cusch|strong="H3568"\\w*: \\w Saba|strong="H5434"\\w*, \\w Havila|strong="H2341"\\w*, \\w Sabta|strong="H5454"\\w*, \\w Raema|strong="H7484"\\w* et \\w Sabteca|strong="H5455"\\w*. -\\w Fils|strong="H1121"\\w* de \\w Raema|strong="H7484"\\w*: \\w Séba|strong="H7614"\\w* et \\w Dedan|strong="H1719"\\w*.
\\v 10 \\w Cusch|strong="H3568"\\w* \\w engendra|strong="H3205" x-morph="strongMorph:TH8804"\\w* \\w Nimrod|strong="H5248"\\w*; c'est lui qui \\w commença|strong="H2490" x-morph="strongMorph:TH8689"\\w* à être \\w puissant|strong="H1368"\\w* sur la \\w terre|strong="H776"\\w*. -
'''
    expected = read_file(os.path.join(self.resources_dir, 'uwapi_1ch.usfm'))
    output = strip_word_data(input)
    self.assertEqual(expected, output)
def test_tw_phrase_print(self):
    phrase = tWPhrase(1)
    phrase.addLine(
        u'\w Ἰησοῦ|lemma="Ἰησοῦς" strong="G24240" x-morph="Gr,N,,,,,GMS," x-tw="rc://*/tw/dict/bible/kt/jesus" \w*')
    phrase.addLine(
        u'\w Χριστοῦ|lemma="χριστός" strong="G55470" x-morph="Gr,N,,,,,GMS," x-tw="rc://*/tw/dict/bible/kt/christ" x-tw="rc://*/tw/dict/bible/kt/jesus" \w*,')

    expected = read_file(os.path.join(self.resources_dir, 'usfm_milestone.usfm'))
    self.assertEqual(unicode(expected), unicode(phrase))
def test_convert_file(self):
    usfm = osistousfm3.convertFile(
        osis_file=os.path.join(self.resources_dir, 'osis/Hag.xml'),
        lexicon=self.lexicon)
    expected_usfm = read_file(
        os.path.join(self.resources_dir, 'usfm/37-HAG.usfm'))
    self.assertEqual(expected_usfm, usfm)
def test_convert_osis_with_book_key_migration(self):
    usfm = osistousfm3.convertFile(
        osis_file=os.path.join(self.resources_dir, 'osis/2Sam.xml'),
        lexicon=self.lexicon)
    expected_usfm = read_file(
        os.path.join(self.resources_dir, 'usfm/10-2SA.usfm'))
    self.assertEqual(expected_usfm, usfm)
def test_index_tn_tsv_rc(self, mock_reporter):
    tmp = os.path.join(self.temp_dir, 'index_tn_rc')
    rc = os.path.join(self.resources_dir, 'en_tn_tsv')
    expected_file = os.path.join(self.resources_dir,
                                 'en_tn_tsv/expected_gen_notes.json')
    converted_file = '{}/gen/en/notes.json'.format(tmp)
    expected = {
        'en_*_gen_tn': {
            'key': 'gen/en/notes.json',
            'path': converted_file
        }
    }
    to_upload = index_tn_rc('en', tmp, rc)
    self.assertEqual(expected, to_upload)
    self.assertEqual(read_file(expected_file), read_file(converted_file))
def test_titus_multiple_word_match(self):
    """
    Ensures we are correctly finding multiple word matches in Titus.
    :return:
    """
    usfm = read_file(os.path.join(self.resources_dir, 'usfm/57-TIT.usfm'))
    rc = factory.load(os.path.join(self.resources_dir, 'tw_rc'))
    words_index = maptwtousfm3.indexWordsLocation(rc)
    category_index = maptwtousfm3.indexWordsCategory(rc)
    mappedUSFM = maptwtousfm3.mapUSFMByOccurrence(
        usfm=usfm,
        words_rc=rc,
        words_index=words_index['occurrences'],
        words_category_index=category_index)
    expected_usfm = read_file(
        os.path.join(self.resources_dir, 'mapped_tit.usfm'))
    self.assertEqual(mappedUSFM, expected_usfm)
def mapDir(usfm_dir, words_rc, output_dir, global_search=False, map_phrases=True):
    """
    Maps tW to words within each USFM file found in the directory.
    :param usfm_dir: a directory containing USFM files generated by `csvtousfm3`
    :param words_rc: the tW resource container
    :type words_rc: ResourceContainer.RC
    :param output_dir: a directory where the newly mapped usfm will be saved
    :param global_search: performs a global word-by-word search in addition to
        the search by occurrence
    :param map_phrases: when True, runs `mapPhrases` over the mapped USFM
    :return:
    """
    usfm_files = []
    strongs_index = {}
    for root, dirs, files in os.walk(usfm_dir):
        usfm_files.extend(files)
        break

    print('Generating occurrences index')
    location_index = indexWordsLocation(words_rc)
    category_index = indexWordsCategory(words_rc)

    if map_phrases:
        print('Phrase mapping enabled.')
    if global_search:
        print('Global search enabled.')
        print('Generating strongs index.')
        strongs_index = indexWordByStrongs(words_rc)

    for file_name in usfm_files:
        if not file_name.endswith('.usfm'):
            continue
        file = os.path.join(usfm_dir, file_name)
        print('{}'.format(file_name))
        usfm = read_file(file)
        usfm = mapUSFMByOccurrence(usfm=usfm,
                                   words_rc=words_rc,
                                   words_index=location_index['occurrences'],
                                   words_category_index=category_index)
        if map_phrases:
            usfm = mapPhrases(usfm)

        if global_search:
            usfm = mapUSFMByGlobalSearch(
                usfm=usfm,
                words_strongs_index=strongs_index,
                words_false_positives_index=location_index['false_positives'],
                words_category_index=category_index)
            # NOTE: if we need to add phrase mapping to global search
            # un-comment these lines
            # if map_phrases:
            #     usfm = mapPhrases(usfm)

        outfile = os.path.join(output_dir, os.path.basename(file))
        write_file(outfile, usfm)
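# Minimal usage sketch for mapDir (paths here are hypothetical; the tests
# above exercise the same calls against fixture directories):
#
#     rc = factory.load('/path/to/tw_rc')           # tW resource container
#     mapDir('/path/to/usfm', rc, '/path/to/out')   # occurrence mapping only
#     mapDir('/path/to/usfm', rc, '/path/to/out',
#            global_search=True)                     # plus strongs-based search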
def _build_catalogs(self):
    """
    Builds the global catalogs
    :return:
    """
    catalogs_path = os.path.join(self.repo_dir, 'catalogs.json')
    package = read_file(catalogs_path)
    return {
        'repo_name': self.repo_name,
        'commit_id': self.commit_id,
        'timestamp': self.timestamp,
        'package': package,
        'dirty': False
    }
def test_convert_file(self):
    usfm = csvtousfm3.convert(
        lang='Gr',
        csv_file=os.path.join(self.resources_dir, 'input.csv'))
    self.assertIsInstance(usfm, list)
    self.assertEqual(2, len(usfm))
    for book in usfm:
        self.assertIsInstance(book['usfm'], unicode)
        expected_usfm = read_file(
            os.path.join(self.resources_dir,
                         '{}_output.usfm'.format(book['id'])))
        self.assertIsInstance(expected_usfm, unicode)
        self.assertMultiLineEqual(expected_usfm, book['usfm'])
def build_usx(usfm_dir, usx_dir):
    """
    Builds the usx from usfm after performing some custom processing
    :param usfm_dir:
    :param usx_dir:
    :return:
    """
    # strip word data
    files = os.listdir(usfm_dir)
    for name in files:
        f = os.path.join(usfm_dir, name)
        usfm = read_file(f)
        write_file(f, convert_chunk_markers(strip_word_data(usfm)))

    UsfmTransform.buildUSX(usfm_dir, usx_dir, '', True)
def test_unsigned_external_content(self, mock_reporter):
    format = {
        'format': '',
        'modified': '',
        'size': '',
        'url': 'https://google.com',
        'signature': ''
    }
    row = json.loads(
        read_file(
            os.path.join(self.resources_dir,
                         'progress_db/no_sig_external_content-row.json')))
    checker = ConsistencyChecker('cdn.door43.org', 'api.door43.org')
    errors = checker.check_format(format, row)
    self.assertEqual([], errors)
def build_usx(usfm_dir, usx_dir, logger=None):
    """
    Builds the usx from usfm after performing some custom processing
    :param usfm_dir:
    :param usx_dir:
    :param logger: an optional logger for debug output
    :return:
    """
    # strip word data
    files = os.listdir(usfm_dir)
    for name in files:
        f = os.path.join(usfm_dir, name)
        usfm = read_file(f)
        write_file(f, remove_unknown_markers(
            convert_chunk_markers(strip_word_data(usfm))))

    if logger:
        logger.debug("Actual USX conversion into {}".format(usx_dir))
    UsfmTransform.buildUSX(usfm_dir, usx_dir, '', True)
def test_unsigned_local_content(self, mock_reporter):
    format = {
        'format': '',
        'modified': '',
        'size': '',
        'url': 'https://api.door43.org',
        'signature': ''
    }
    row = json.loads(
        read_file(
            os.path.join(self.resources_dir,
                         'progress_db/no_sig_external_content-row.json')))
    checker = ConsistencyChecker('cdn.door43.org', 'api.door43.org')
    errors = checker.check_format(format, row)
    self.assertIn(
        "Consistency Check Failed: en_obs: url 'https://api.door43.org' has not been signed yet",
        errors)
def _build_localization(self):
    """
    Builds the localization for various components in the catalog
    :return:
    """
    files = sorted(glob(os.path.join(self.repo_dir, '*.json')))
    localization = {}
    for f in files:
        self.logger.debug("Reading {0}...".format(f))
        language = os.path.splitext(os.path.basename(f))[0]
        try:
            localization[language] = json.loads(read_file(f))
        except Exception as e:
            raise Exception('Bad JSON: {0}'.format(e))
    return {
        'repo_name': self.repo_name,
        'commit_id': self.commit_id,
        'timestamp': self.timestamp,
        'package': json.dumps(localization, sort_keys=True),
        'dirty': False
    }
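# Sketch of the package shape _build_localization produces, inferred from the
# code above: keys come from the JSON file names (e.g. a hypothetical en.json
# and fr.json in the repo), and values are whatever those files contain.
#
#     {
#         "en": { ...contents of en.json... },
#         "fr": { ...contents of fr.json... }
#     }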
def index_tn_rc(lid, temp_dir, rc_dir, reporter=None):
    """
    Converts a v3 tN into its v2 equivalent.
    This will write a bunch of files and return a list of files to be uploaded.

    Chunk definitions will be used to validate the note organization.

    :param lid: the language id of the notes
    :param temp_dir: the directory where all the files will be written
    :param rc_dir: the directory of the resource container
    :param reporter: a lambda handler used for reporting
    :type reporter: Handler
    :return: a list of note files to upload
    """
    manifest = yaml.load(read_file(os.path.join(rc_dir, 'manifest.yaml')))
    content_format = manifest['dublin_core']['format']
    if content_format == 'text/markdown':
        return tn_md_to_json_file(lid, temp_dir, rc_dir, manifest, reporter)
    elif content_format == 'text/tsv':
        return tn_tsv_to_json_file(lid, temp_dir, rc_dir, manifest, reporter)
    elif reporter:
        reporter.report_error(
            "Unsupported content type '{}' found in {}".format(content_format, rc_dir))
    raise Exception(
        "Unsupported content type '{}' found in {}".format(content_format, rc_dir))
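# Sketch of the manifest fields the dispatcher above relies on. This is a
# hypothetical minimal manifest.yaml (real RC manifests carry many more
# fields, and the project path shown is illustrative):
#
#     dublin_core:
#       format: 'text/tsv'        # or 'text/markdown'
#       modified: '2019-01-01'
#     projects:
#       - identifier: 'gen'
#         path: './en_tn_01-GEN.tsv'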
def test_complex_usfm3_to_usfm2(self):
    usfm3 = read_file(os.path.join(self.resources_dir, 'usfm/57-TIT.usfm'))
    expected_usfm2 = read_file(
        os.path.join(self.resources_dir, 'complex_tit.usfm2'))
    usfm2 = usfm3_to_usfm2(usfm3)
    self.assertEqual(expected_usfm2, usfm2)
def test_usfm3_to_usfm2(self):
    usfm3 = read_file(os.path.join(self.resources_dir, 'usfm3_sample.usfm'))
    expected_usfm2 = read_file(
        os.path.join(self.resources_dir, 'usfm2_sample.usfm'))
    usfm2 = usfm3_to_usfm2(usfm3)
    self.assertEqual(expected_usfm2, usfm2)
def _strip_print_script(file_to_sign):
    html = read_file(file_to_sign)
    html = html.replace('window.print()', '')
    write_file(file_to_sign, html)
def _build_versification(self):
    """
    DEPRECATED we are no longer processing versification.
    :return:
    """
    bible_dir = os.path.join(self.repo_dir, 'bible')
    versification_dirs = os.listdir(bible_dir)
    books = {}
    package = []
    uploads = []

    # group by project
    for vrs_dir in versification_dirs:
        vrs_id = os.path.basename(vrs_dir)
        book_files = sorted(
            glob(os.path.join(bible_dir, vrs_dir, 'chunks', '*.json')))
        for b in book_files:
            self.logger.debug('Reading "{}" versification for "{}"'.format(
                vrs_id, b))
            b_id = os.path.splitext(os.path.basename(b))[0]
            try:
                book_vrs = json.loads(read_file(b))
            except Exception as e:
                raise Exception, Exception(
                    'Bad JSON: {0}'.format(e)), sys.exc_info()[2]
            book = WebhookHandler.retrieve_or_make(
                books, b_id, {
                    'identifier': b_id,
                    'chunks_url': '{}/bible/{}/{}/v{}/chunks.json'.format(
                        self.cdn_url, vrs_id, b_id, self.api_version),
                    'chunks': {}
                })
            book['chunks'][vrs_id] = book_vrs

    temp_dir = os.path.join(self.temp_dir, 'versification')
    if not os.path.isdir(temp_dir):
        os.mkdir(temp_dir)
    for book in books:
        book = books[book]
        # write chunks
        chunk_file = os.path.join(temp_dir, book['identifier'] + '.json')
        write_file(chunk_file, json.dumps(book['chunks'], sort_keys=True))
        # for now we bypass signing and upload chunks directly
        upload_key = 'bible/{}/v{}/chunks.json'.format(
            book['identifier'], self.api_version)
        uploads.append({'key': upload_key, 'path': chunk_file})

        # build package
        del book['chunks']
        package.append(book)

    return {
        'repo_name': self.repo_name,
        'commit_id': self.commit_id,
        'timestamp': self.timestamp,
        'package': json.dumps(package, sort_keys=True),
        'uploads': uploads,
        'dirty': False
    }
def test_inprogress(self, mock_reporter):
    mockV3Api = MockAPI(os.path.join(self.resources_dir, 'v3_api'),
                        'https://api.door43.org/')
    mockV3Api.add_host(os.path.join(self.resources_dir, 'v3_cdn'),
                       'https://test-cdn.door43.org/')
    mockV2Api = MockAPI(os.path.join(self.resources_dir, 'ts_api'),
                        'https://test')
    mockS3 = MockS3Handler('ts_bucket')
    mockDb = MockDynamodbHandler()
    mockDb._load_db(
        os.path.join(TestTsV2Catalog.resources_dir, 'ready_inprogress_db.json'))
    mockLog = MockLogger()
    event = self.make_event()
    converter = TsV2CatalogHandler(event=event,
                                   context=None,
                                   logger=mockLog,
                                   s3_handler=mockS3,
                                   dynamodb_handler=mockDb,
                                   url_handler=mockV3Api.get_url,
                                   download_handler=mockV3Api.download_file,
                                   url_exists_handler=lambda url: True)
    converter.run()

    assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/catalog.json')
    assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/obs/languages.json')
    assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/obs/en/resources.json')
    self.assertNotIn('v2/ts/obs/en/obs/source.json', mockS3._recent_uploads)
    # assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/obs/en/obs/source.json')
    assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/obs/en/notes.json')
    assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/obs/en/questions.json')
    # we have frozen tw_cat
    self.assertNotIn('v2/ts/obs/en/tw_cat.json', mockS3._recent_uploads)
    assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/1ch/languages.json')
    assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/1ch/en/resources.json')
    assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/1ch/en/ulb/v7/source.json')
    self.assertNotIn('v2/ts/1ch/en/notes.json', mockS3._recent_uploads)
    # assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/1ch/en/notes.json')
    assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/1ch/en/questions.json')
    self.assertNotIn('v2/ts/1ch/en/tw_cat.json', mockS3._recent_uploads)
    # assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/1ch/en/tw_cat.json')
    assert_s3_equals_api_json(self, mockS3, mockV2Api, 'v2/ts/bible/en/words.json')

    # validate urls in generated catalogs match the generated output paths
    root_url = '{}/'.format(
        event['stage-variables']['cdn_url'].rstrip('/'))
    catalog = json.loads(
        read_file(mockS3._recent_uploads['v2/ts/catalog.json']))
    url_err_msg = 'url in catalog does not match upload path: {}'
    for project in catalog:
        lang_catalog_path = project['lang_catalog'].replace(
            root_url, '').split('?')[0]
        self.assertIn(lang_catalog_path, mockS3._recent_uploads,
                      url_err_msg.format(lang_catalog_path))
        lang_catalog = json.loads(
            read_file(mockS3._recent_uploads[lang_catalog_path]))
        for language in lang_catalog:
            res_catalog_path = language['res_catalog'].replace(
                root_url, '').split('?')[0]
            self.assertIn(res_catalog_path, mockS3._recent_uploads,
                          url_err_msg.format(res_catalog_path))
            res_catalog = json.loads(
                read_file(mockS3._recent_uploads[res_catalog_path]))
            for resource in res_catalog:
                questions_path = resource['checking_questions'].replace(
                    root_url, '').split('?')[0]
                # notes_path = resource['notes'].replace(root_url, '').split('?')[0]
                # source_path = resource['source'].replace(root_url, '').split('?')[0]
                terms_path = resource['terms'].replace(root_url, '').split('?')[0]
                # terms_map_path = resource['tw_cat'].replace(root_url, '').split('?')[0]
                if questions_path:
                    self.assertIn(questions_path, mockS3._recent_uploads,
                                  url_err_msg.format(questions_path))
                # if notes_path:
                #     self.assertIn(notes_path, mockS3._uploads,
                #                   url_err_msg.format(notes_path))
                # if source_path:
                #     self.assertIn(source_path, mockS3._uploads,
                #                   url_err_msg.format(source_path))
                if terms_path:
                    self.assertIn(terms_path, mockS3._recent_uploads,
                                  url_err_msg.format(terms_path))
def tn_md_to_json_file(lid, temp_dir, rc_dir, manifest, reporter=None):
    """
    Converts a markdown tN to json.
    This will write a bunch of files and return a list of files to be uploaded.

    Chunk definitions will be used to validate the note organization.

    :param lid: the language id of the notes
    :param temp_dir: the directory where all the files will be written
    :param rc_dir: the directory of the resource container
    :param manifest: the rc manifest data
    :param reporter: a lambda handler used for reporting
    :type reporter: Handler
    :return: a list of note files to upload
    """
    dc = manifest['dublin_core']
    note_general_re = re.compile('^([^#]+)', re.UNICODE)
    note_re = re.compile('^#+([^#\n]+)#*([^#]*)',
                         re.UNICODE | re.MULTILINE | re.DOTALL)
    tn_uploads = {}

    for project in manifest['projects']:
        pid = Handler.sanitize_identifier(project['identifier'])
        chunk_json = {}
        if pid != 'obs':
            try:
                chunk_json = index_chunks(download_chunks(pid))
            except:
                if reporter:
                    reporter.report_error(
                        'Failed to retrieve chunk information for {}-{}'.format(lid, pid))
                continue

        note_dir = os.path.normpath(os.path.join(rc_dir, project['path']))
        note_json = []
        if not os.path.exists(note_dir):
            raise Exception(
                'Could not find translationNotes directory at {}'.format(note_dir))
        chapters = os.listdir(note_dir)

        for chapter in chapters:
            if chapter in ['.', '..', 'front', '.DS_Store']:
                continue
            chapter_dir = os.path.join(note_dir, chapter)
            verses = os.listdir(chapter_dir)
            verses.sort()

            # zero pad chapter to match chunking scheme
            chapter = pad_to_match(chapter, chunk_json)
            chapter_chunk_json = chunk_json.get(chapter, {})

            # validate chapters
            if pid != 'obs' and chapter not in chunk_json:
                raise Exception(
                    'Missing chapter "{}" key in chunk json while reading chunks for {}. RC: {}'
                    .format(chapter, pid, rc_dir))

            notes = []
            firstvs = None
            note_hashes = []
            for verse in verses:
                if verse in ['.', '..', 'intro.md', '.DS_Store']:
                    continue

                # notes = []
                verse_file = os.path.join(chapter_dir, verse)
                verse = verse.split('.')[0]
                try:
                    verse_body = read_file(verse_file)
                except Exception as e:
                    if reporter:
                        reporter.report_error('Failed to read file {}'.format(verse_file))
                    raise e

                verse_body = convert_rc_links(verse_body)
                general_notes = note_general_re.search(verse_body)

                # zero pad verse to match chunking scheme
                verse = pad_to_match(verse, chapter_chunk_json)

                # close chunk
                if firstvs is not None and (pid != 'obs' and not chapter_chunk_json):
                    if reporter:
                        reporter.report_error(
                            'Could not find chunk data for {} {} {}'.format(rc_dir, pid, chapter))
                if firstvs is not None and (pid == 'obs' or verse in chapter_chunk_json):
                    note_json.append({
                        'id': '{}-{}'.format(chapter, firstvs),
                        'tn': notes
                    })
                    firstvs = verse
                    notes = []
                elif firstvs is None:
                    firstvs = verse

                if general_notes:
                    verse_body = note_general_re.sub('', verse_body)
                    notes.append({
                        'ref': 'General Information',
                        'text': general_notes.group(0).strip()
                    })

                for note in note_re.findall(verse_body):
                    # TRICKY: do not include translation words in the list of notes
                    if note[0].strip().lower() != 'translationwords':
                        hasher = hashlib.md5()
                        hasher.update(note[0].strip().lower().encode('utf-8'))
                        note_hash = hasher.hexdigest()
                        if note_hash not in note_hashes:
                            note_hashes.append(note_hash)
                            notes.append({
                                'ref': note[0].strip(),
                                'text': note[1].strip()
                            })

            # close last chunk
            if firstvs is not None:
                note_json.append({
                    'id': '{}-{}'.format(chapter, firstvs),
                    'tn': notes
                })

        if note_json:
            tn_key = '_'.join([lid, '*', pid, 'tn'])
            note_json.append({'date_modified': str(dc['modified']).replace('-', '')})
            note_upload = prep_data_upload(
                '{}/{}/notes.json'.format(pid, lid), note_json, temp_dir)
            tn_uploads[tn_key] = note_upload

    return tn_uploads
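# Sketch of the per-project notes.json this function emits, derived from the
# append calls above (the ids, note text, and date are illustrative values):
#
#     [
#         {"id": "01-01", "tn": [{"ref": "General Information", "text": "..."},
#                                {"ref": "a note heading", "text": "..."}]},
#         {"id": "01-03", "tn": [...]},
#         {"date_modified": "20190101"}
#     ]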