def test_ta_preprocessor(self):
    """Preprocess the en_ta fixture and verify the generated manual files."""
    repo_name = 'en_ta'
    file_name = os.path.join('raw_sources', 'en_ta.zip')
    rc, repo_dir, self.temp_dir = self.extractFiles(file_name, repo_name)
    self.out_dir = tempfile.mkdtemp(prefix='Door43_test_output_')
    do_preprocess('Translation_Academy', 'dummyOwner', 'dummyURL', rc,
                  repo_dir, self.out_dir)

    # One Markdown file must have been written per manual
    manual_names = ('01-intro.md', '02-process.md', '03-translate.md',
                    '04-checking.md')
    manual_paths = [os.path.join(self.out_dir, name) for name in manual_names]
    for manual_path in manual_paths:
        self.assertTrue(os.path.isfile(manual_path))
    intro, process, translate, checking = [
        read_file(manual_path) for manual_path in manual_paths
    ]

    # Render the checking manual so its structure can be inspected
    checking_html = markdown2.markdown(
        checking, extras=['markdown-in-html', 'tables'])
    soup = BeautifulSoup(checking_html, 'html.parser')
    self.assertEqual(soup.h1.text, "Checking Manual")
    self.assertIsNotNone(soup.find("a", {"id": "accurate"}))
    self.assertEqual(len(soup.find_all('li')), 350)

    # Test links have been converted
    for href in ("#accuracy-check", "03-translate.html#figs-explicit"):
        self.assertIsNotNone(soup.find("a", {"href": href}))

    # make sure no old links exist
    for manual_text in (checking, intro, process, translate):
        self.assertTrue('../' not in manual_text)

    for generated_name in ('04-checking-toc.yaml',
                           '04-checking-config.yaml'):
        self.assertTrue(
            os.path.isfile(os.path.join(self.out_dir, generated_name)))

    # get_title returns the supplied title, else one derived from the link
    preprocessor = TaPreprocessor('dummyURL', rc, 'dummyOwner', repo_dir,
                                  self.out_dir)
    self.assertEqual(
        preprocessor.get_title(rc.project('checking'), 'fake-link',
                               'My Title'), 'My Title')
    self.assertEqual(
        preprocessor.get_title(rc.project('checking'), 'fake-link'),
        'Fake Link')
def test_tn_preprocessor_short(self):
    """Preprocess the two-book tN fixture and check only EXO and LEV are produced."""
    # given
    repo_name = 'en_tn_2books'
    file_name = os.path.join('raw_sources', repo_name + '.zip')
    rc, repo_dir, self.temp_dir = self.extractFiles(file_name, repo_name)
    repo_dir = os.path.join(repo_dir)
    self.out_dir = tempfile.mkdtemp(prefix='Door43_test_output_')

    def out_path(name):
        return os.path.join(self.out_dir, name)

    # when
    do_preprocess('Translation_Notes', 'dummyOwner', 'dummyURL', rc,
                  repo_dir, self.out_dir)

    # then
    # NOTE: the is_multiple_jobs() / get_book_list() checks are disabled here
    self.assertTrue(os.path.isfile(out_path('index.json')))
    for absent_book in ('01-GEN.md', '67-REV.md'):
        self.assertFalse(os.path.isfile(out_path(absent_book)))
    for present_book in ('02-EXO.md', '03-LEV.md'):
        self.assertTrue(os.path.isfile(out_path(present_book)))

    # The index only has to be readable; its content is not inspected
    read_file(out_path('index.json'))
    for book_file in ('02-EXO.md', '03-LEV.md'):
        book_text = read_file(out_path(book_file))
        self.assertGreater(len(book_text), 1000)
def test_PhpValidWithFormattingTags(self):
    """Linting PHP chapter 2 rewritten with formatting tags must give no warnings."""
    out_dir = self.copy_resource(self.php_repo_path)
    example_path = os.path.join(self.resources_dir, 'formatting_example.txt')
    formatted_text = read_file(example_path)
    # replace v1 through v13 of chapter 2 with the formatting example
    self.replace_verse(out_dir,
                       self.php_file_name,
                       chapter=2,
                       start_vs=1,
                       end_vs=13,
                       replace=formatted_text + ' ')
    linter = self.run_linter(out_dir)
    self.verify_results_counts(0, linter)
def get_chapter_title(self, chapter):
    """Return the chapter's title.

    The stripped content of ``<source_dir>/<chapter>/title.txt`` is used when
    that file exists; otherwise the chapter id without leading zeros,
    followed by ``'. '``.
    """
    title_file = os.path.join(self.source_dir, chapter, 'title.txt')
    if not os.path.exists(title_file):
        return chapter.lstrip('0') + '. '
    return read_file(title_file).strip()
def populate_book_data(self, bible_id, lang_code=None):
    """Build per-verse USFM and HTML for the current book of one Bible.

    The book's USFM is read from the repo dir of ``self.resources[bible_id]``,
    unaligned, rendered to HTML, and then both forms are split on verse
    markers. The result is stored in ``self.book_data[bible_id]`` as
    ``{chapter: {verse: {'usfm': ..., 'html': ...}}}``.

    :param bible_id: key into ``self.resources`` and the Bible's folder name
    :param lang_code: language folder to look in; defaults to ``self.lang_code``

    Logs an error and exits with status 1 when the Bible folder or a version
    inside it cannot be found.
    """
    if not lang_code:
        lang_code = self.lang_code
    bible_path = os.path.join(self.resources_dir, lang_code, 'bibles',
                              bible_id)
    # A joined path string is always truthy, so the path itself must be
    # checked on disk for the "not found" branch to be reachable.
    if not os.path.exists(bible_path):
        self.logger.error(f'{bible_path} not found!')
        exit(1)
    bible_version_path = get_latest_version_path(bible_path)
    if not bible_version_path:
        self.logger.error(f'No versions found in {bible_path}!')
        exit(1)

    book_data = OrderedDict()
    book_file = os.path.join(
        self.resources[bible_id].repo_dir,
        f'{self.book_number}-{self.project_id.upper()}.usfm')
    book_usfm = read_file(book_file)
    unaligned_usfm = unalign_usfm(book_usfm)
    self.logger.info(
        f'Converting {self.project_id.upper()} from USFM to HTML...')
    book_html, _warnings = SingleFilelessHtmlRenderer({
        self.project_id.upper(): unaligned_usfm
    }).render()

    # The pattern has three capture groups, so re.split yields repeating
    # groups of four: [text, full span tag, chapter, verse(s)], ...
    html_verse_splits = re.split(
        r'(<span id="[^"]+-ch-0*(\d+)-v-(\d+(?:-\d+)?)" class="v-num">)',
        book_html)
    usfm_chapter_splits = re.split(r'\\c ', unaligned_usfm)
    usfm_verse_splits = None
    chapter_verse_index = 0
    for i in range(1, len(html_verse_splits), 4):
        chapter = html_verse_splits[i + 1]
        verses = html_verse_splits[i + 2]
        if chapter not in book_data:
            # First verse of a new chapter: re-split that chapter's USFM and
            # restart the running verse counter.
            book_data[chapter] = OrderedDict()
            usfm_chapter = f'\\c {usfm_chapter_splits[int(chapter)]}'
            usfm_verse_splits = re.split(r'\\v ', usfm_chapter)
            chapter_verse_index = 0
        chapter_verse_index += 1
        verse_usfm = f'\\v {usfm_verse_splits[chapter_verse_index]}'
        verse_html = html_verse_splits[i] + html_verse_splits[i + 3]
        # remove next chapter since only split on verses
        verse_html = re.split('<h2', verse_html)[0]
        verse_soup = BeautifulSoup(verse_html, 'html.parser')
        # Drop tags with no text, except those that are meaningful when empty
        for tag in verse_soup.find_all():
            if (not tag.contents or len(tag.get_text(strip=True)) <= 0
                    ) and tag.name not in ['br', 'img']:
                tag.decompose()
        verse_html = str(verse_soup)
        # A verse span such as "1-2" is stored under each of its numbers
        verses = re.findall(r'\d+', verses)
        for verse in verses:
            verse = verse.lstrip('0')
            book_data[chapter][verse] = {
                'usfm': verse_usfm,
                'html': verse_html
            }
    self.book_data[bible_id] = book_data
def get_chapter_reference(self, chapter):
    """Return the stripped content of the chapter's ``reference.txt``.

    An empty string is returned when
    ``<source_dir>/<chapter>/reference.txt`` does not exist.
    """
    reference_file = os.path.join(self.source_dir, chapter, 'reference.txt')
    if not os.path.exists(reference_file):
        return ''
    return read_file(reference_file).strip()
def get_chapter_reference(project_path: str, chapter: str) -> str:
    """Return the chapter's reference text, or '' when it has none.

    The text is the stripped content of
    ``<project_path>/<chapter>/reference.txt``.
    """
    reference_file = os.path.join(project_path, chapter, 'reference.txt')
    if not os.path.exists(reference_file):
        return ''
    return read_file(reference_file).strip()
def title(self) -> str:
    """Return the project's title, trying several sources in order.

    Order: a non-empty ``title`` in the project dict, a non-empty ``name``,
    a ``title.txt`` inside the project folder, a ``title.txt`` at the
    container root, and finally the resource title. A title read from a
    file is cached in ``self.project['title']``.
    """
    for key in ('title', 'name'):
        if key in self.project and self.project[key]:
            return self.project[key]

    if self.rc.path:
        project_title_file = os.path.join(self.rc.path, self.path,
                                          'title.txt')
        if os.path.isfile(project_title_file):
            self.project['title'] = read_file(project_title_file)
            return self.project['title']

        root_title_file = os.path.join(self.rc.path, 'title.txt')
        if os.path.isfile(root_title_file):
            self.project['title'] = read_file(root_title_file)
            return self.project['title']

    return self.rc.resource.title
def mock_requests_post(url, json=None, headers=None):
    """Stand-in for ``requests.post`` used by the pipeline tests.

    Prints the URL. For the tX job endpoint it returns a
    ``TestPipeline.JsonObject`` wrapping a 200 status and the canned job
    response; for any other URL it returns ``None``.
    """
    print('Mock posting {}'.format(url))
    if url != 'unit_test_api_url/tx/job':
        return None
    canned_response_path = os.path.join(TestPipeline.resources_dir,
                                        'en-obs-job-resp.json')
    return TestPipeline.JsonObject({
        'status_code': 200,
        'text': read_file(canned_response_path)
    })
def get_strings(self) -> dict:
    """Read every source file and return its text keyed by relative filename.

    Files come from ``self.get_files(relative_paths=True)`` and are resolved
    against ``self.source_dir``. A file that cannot be read is reported with
    ``self.log.warning`` and left out of the result.

    :return: mapping of relative filename to file content
    """
    strings = {}
    for filename in self.get_files(relative_paths=True):
        filepath = os.path.join(self.source_dir, filename)
        try:
            # Assign inside the try so a failed read never stores an unbound
            # name or the previous file's text under this filename.
            strings[filename] = read_file(filepath)
        except Exception as e:
            self.log.warning(f"Error reading {filename}: {e}")
    return strings
def get_chapter_frames(self, chapter):
    """Return the frames of one chapter as ``{'id', 'text'}`` dicts.

    Every entry of ``<source_dir>/<chapter>`` that is not listed in
    ``self.framesIgnoreFiles`` is read. The frame id is
    ``<chapter>-<file name without a trailing '.txt'>``. Entries are
    processed in sorted name order so the result is deterministic.

    :param chapter: chapter folder name
    :return: list of frame dicts
    """
    frames = []
    chapter_dir = os.path.join(self.source_dir, chapter)
    for frame in sorted(os.listdir(chapter_dir)):
        if frame in self.framesIgnoreFiles:
            continue
        # str.strip('.txt') removes any of the characters '.', 't', 'x' from
        # both ends (e.g. 'text.txt' -> 'e'); drop only the real suffix.
        frame_id = frame[:-len('.txt')] if frame.endswith('.txt') else frame
        text = read_file(os.path.join(chapter_dir, frame))
        frames.append({'id': chapter + '-' + frame_id, 'text': text})
    return frames
def replace_verse_to_end(self, out_dir, file_name, chapter, start_vs, replace):
    """Overwrite a book from a given verse through the end of the file.

    Everything before the ``\\v <start_vs> `` marker that follows the
    ``\\c <chapter>`` marker is kept; ``replace`` is appended in place of
    the rest, and the file is rewritten.
    """
    book_path = os.path.join(out_dir, file_name)
    usfm = read_file(book_path)
    chapter_at = usfm.find(f'\\c {chapter}')
    before_chapter, from_chapter = usfm[:chapter_at], usfm[chapter_at:]
    verse_at = from_chapter.find(f'\\v {start_vs} ')
    write_file(book_path, before_chapter + from_chapter[:verse_at] + replace)
def replace_chapter(self, out_dir, file_name, start_ch, end_ch, replace):
    """Swap a range of chapters in a book file for ``replace``.

    The text from the ``\\c <start_ch>`` marker up to, but not including,
    the ``\\c <end_ch>`` marker is replaced and the file rewritten.
    """
    book_path = os.path.join(out_dir, file_name)
    usfm = read_file(book_path)
    range_start = usfm.find(f'\\c {start_ch}')
    range_end = usfm.find(f'\\c {end_ch}')
    write_file(book_path, usfm[:range_start] + replace + usfm[range_end:])
def replace_tag(self, out_dir, file_name, tag, replace):
    """Replace the first occurrence of a USFM tag and its content.

    The span runs from ``\\<tag>`` up to, but not including, the next
    backslash; it is swapped for ``replace`` and the file rewritten.
    """
    book_path = os.path.join(out_dir, file_name)
    usfm = read_file(book_path)
    tag_at = usfm.find(f'\\{tag}')
    # Search from one past the tag's own backslash for the next marker
    next_marker_at = usfm.find('\\', tag_at + 1)
    write_file(book_path, usfm[:tag_at] + replace + usfm[next_marker_at:])
def get_chapter_title(project_path: str, chapter) -> str:
    """Return the title of a chapter.

    When ``<project_path>/<chapter>/title.txt`` is missing, the chapter
    number without leading zeros followed by a period and a space is
    returned instead.
    """
    title_filepath = os.path.join(project_path, chapter, 'title.txt')
    if not os.path.exists(title_filepath):
        return chapter.lstrip('0') + '. '
    # NOTE: the title is only stripped; check_and_clean_title is not applied
    return read_file(title_filepath).strip()
def get_chapter_frames(project_path: str,
                       chapter: str) -> List[Dict[str, Any]]:
    """Return the frames of one chapter, in sorted file-name order.

    Every entry of ``<project_path>/<chapter>`` not listed in
    ``ObsPreprocessor.ignoreFiles`` is read into a dict with an ``id`` of
    ``<chapter>-<file name without a trailing '.txt'>`` and the file's
    ``text``.

    :param project_path: folder that contains the chapter folders
    :param chapter: chapter folder name
    :return: list of ``{'id': ..., 'text': ...}`` dicts
    """
    frames: List[Dict[str, Any]] = []
    chapter_dir = os.path.join(project_path, chapter)
    for frame in sorted(os.listdir(chapter_dir)):
        if frame in ObsPreprocessor.ignoreFiles:
            continue
        # str.strip('.txt') removes any of the characters '.', 't', 'x' from
        # both ends (e.g. 'text.txt' -> 'e'); drop only the real suffix.
        frame_id = frame[:-len('.txt')] if frame.endswith('.txt') else frame
        text = read_file(os.path.join(chapter_dir, frame))
        frames.append({'id': chapter + '-' + frame_id, 'text': text})
    return frames
def run(self) -> Tuple[int, List[str]]:
    """
    Default Preprocessor

    Each project of the resource container is handled as one of:
    Case #1: the project path is a file, which is copied to the output dir
             under a numbered name
    Case #2: the project path holds files with the resource's extension,
             which are copied over unless ignored or already present
    Case #3: the project is split into chapters and chunks, which are
             pieced together into a single numbered output file

    :return: number of files written, and the collected errors and warnings
             (followed by messages when there is at least one of either)
    """
    for idx, project in enumerate(self.rc.projects):
        project_path = os.path.join(self.source_dir, project.path)

        # Case #1: a single file
        if os.path.isfile(project_path):
            filename = f'{str(idx + 1).zfill(2)}-{project.identifier}.{self.rc.resource.file_ext}'
            copy(project_path, os.path.join(self.output_dir, filename))
            self.num_files_written += 1
            continue

        # Case #2: It's a directory of files, so we copy them over to the output directory
        files = glob(
            os.path.join(project_path, f'*.{self.rc.resource.file_ext}'))
        if files:
            for file_path in files:
                output_file_path = os.path.join(
                    self.output_dir, os.path.basename(file_path))
                wanted = (os.path.isfile(file_path)
                          and not os.path.exists(output_file_path)
                          and os.path.basename(file_path)
                          not in self.ignoreFiles)
                if wanted:
                    copy(file_path, output_file_path)
                    self.num_files_written += 1
            continue

        # Case #3: The project path is multiple chapters, so we piece them together
        chapters = self.rc.chapters(project.identifier)
        if not chapters:
            continue
        text = ''
        for chapter in chapters:
            text = self.mark_chapter(project.identifier, chapter, text)
            for chunk in self.rc.chunks(project.identifier, chapter):
                text = self.mark_chunk(project.identifier, chapter, chunk,
                                       text)
                chunk_path = os.path.join(project_path, chapter, chunk)
                text += read_file(chunk_path) + "\n\n"
        filename = f'{str(idx + 1).zfill(2)}-{project.identifier}.{self.rc.resource.file_ext}'
        write_file(os.path.join(self.output_dir, filename), text)
        self.num_files_written += 1

    if self.num_files_written == 0:
        self.errors.append("No source files discovered")
    extra_messages = self.messages if self.errors or self.warnings else []
    return self.num_files_written, self.errors + self.warnings + extra_messages
def populate_verse_usfm(self, bible_id, lang_code=None):
    """Build per-verse USFM and HTML for the current book of one Bible.

    The book's USFM is read from the repo dir of ``self.resources[bible_id]``,
    unaligned and split by chapter and verse. Verse text is taken from the
    chapter's verse-object JSON file when it has an entry for the verse (or
    verse range); otherwise the raw USFM of the verse is used. The result is
    stored in ``self.verse_usfm[bible_id]`` as
    ``{chapter: {verse: {'usfm': ..., 'html': ...}}}``.

    :param bible_id: key into ``self.resources`` and the Bible's folder name
    :param lang_code: language folder to look in; defaults to ``self.lang_code``

    Logs an error and exits with status 1 when the Bible folder or a version
    inside it cannot be found.
    """
    if not lang_code:
        lang_code = self.lang_code
    bible_path = os.path.join(self.working_dir, 'resources', lang_code,
                              'bibles', bible_id)
    # A joined path string is always truthy, so the path itself must be
    # checked on disk for the "not found" branch to be reachable.
    if not os.path.exists(bible_path):
        self.logger.error(f'{bible_path} not found!')
        exit(1)
    bible_version_path = get_latest_version_path(bible_path)
    if not bible_version_path:
        self.logger.error(f'No versions found in {bible_path}!')
        exit(1)

    book_data = OrderedDict()
    book_file = os.path.join(
        self.resources[bible_id].repo_dir,
        f'{self.book_number}-{self.project_id.upper()}.usfm')
    book_usfm = read_file(book_file)
    unaligned_usfm = unalign_usfm(book_usfm)
    chapters = unaligned_usfm.split(r'\c ')
    # chapters[0] is whatever precedes the first chapter marker
    for chapter_usfm in chapters[1:]:
        chapter = re.findall(r'(\d+)', chapter_usfm)[0]
        book_data[chapter] = OrderedDict()
        chapter_usfm = r'\c ' + chapter_usfm
        chapter_vo_file = os.path.join(bible_version_path, self.project_id,
                                       f'{chapter}.json')
        chapter_verse_objects = load_json_object(chapter_vo_file)
        verses = chapter_usfm.split(r'\v ')
        for verse_usfm in verses[1:]:
            # A verse marker is either a single number or a "from-to" range
            from_verse, to_verse = re.findall(r'^(\d+)(?:-(\d+))*',
                                              verse_usfm)[0]
            if not to_verse:
                to_verse = from_verse
            from_to_verse = f'{from_verse}-{to_verse}'
            # Every verse number in a range gets its own entry
            for verse in range(int(from_verse), int(to_verse) + 1):
                verse = str(verse)
                if from_to_verse in chapter_verse_objects:
                    usfm = rf'\v {from_to_verse} {self.get_text_from_verse_objects(chapter_verse_objects[from_to_verse])}'
                elif verse in chapter_verse_objects:
                    usfm = rf'\v {verse} {self.get_text_from_verse_objects(chapter_verse_objects[verse]["verseObjects"])}'
                else:
                    usfm = rf'\v {verse_usfm}'
                html = self.get_verse_html(usfm, bible_id, chapter, verse)
                book_data[chapter][verse] = {
                    'usfm': usfm.strip(),
                    'html': html.strip()
                }
    self.verse_usfm[bible_id] = book_data
def init_items(self):
    """Fill ``self.items`` with three fixture manifests keyed by ``user/repo``."""
    manifest_path = os.path.join(self.resources_dir, 'obs_manifest.yaml')
    timestamp_format = '%Y-%m-%dT%H:%M:%SZ'

    def make_item(user_name, repo_name, lang_code, resource_id,
                  resource_type, title, views, last_updated):
        # The same manifest fixture is read afresh for every entry
        return {
            'repo_name': repo_name,
            'user_name': user_name,
            'lang_code': lang_code,
            'resource_id': resource_id,
            'resource_type': resource_type,
            'title': title,
            'views': views,
            'last_updated': last_updated,
            'manifest': read_file(manifest_path),
        }

    self.items = {
        'Door43/en_obs':
        make_item('Door43', 'en_obs', 'en', 'obs', 'book',
                  'Open Bible Stories', 2, datetime.utcnow()),
        'JohnDoe/en_obs':
        make_item('JohnDoe', 'en_obs', 'en', 'obs', 'book',
                  'Open Bible Stories', 2,
                  datetime.strptime('2016-12-21T05:23:01Z',
                                    timestamp_format)),
        'francis/fr_ulb':
        make_item('francis', 'fr_ulb', 'fr', 'ulb', 'bundle',
                  'Unlocked Literal Bible', 12,
                  datetime.strptime('2017-02-11T15:43:11Z',
                                    timestamp_format)),
    }
def lint(self) -> bool:
    """
    Checks for issues with translationNotes

    Link problems are reported by ``find_invalid_links`` for every .md file,
    book folders with no files in any subfolder are reported with
    ``self.log.warning``, and then the Markdown linter of the parent class
    is run.

    self.source_dir is the directory of source files (.md)
    :return boolean: result of the parent class's lint()
    """
    self.source_dir = os.path.abspath(self.source_dir)
    if self.single_dir:
        walk_root = os.path.join(self.source_dir, self.single_dir)
    else:
        walk_root = self.source_dir

    for folder, _subdirs, filenames in os.walk(walk_root):
        for filename in filenames:
            if os.path.splitext(filename)[1] != '.md':
                continue
            contents = file_utils.read_file(os.path.join(folder, filename))
            self.find_invalid_links(folder, filename, contents)

    for book_dir in BOOK_NUMBERS:
        if self.single_dir and book_dir != self.single_dir:
            continue
        AppSettings.logger.debug(f"Processing folder {book_dir}")
        book_path = os.path.join(self.source_dir, book_dir)
        # Files directly in the book folder do not count, only those below it
        found_files = any(
            names for subfolder, _subdirs, names in os.walk(book_path)
            if subfolder != book_path)
        # Many repos are intentionally just one book
        if not found_files \
                and 'OBS' not in self.repo_subject \
                and len(self.rc.projects) != 1:
            self.log.warning(f"Missing tN book: '{book_dir}'")

    # Runs checks on Markdown, using the markdown linter
    results = super(TnLinter, self).lint()
    if not results:
        AppSettings.logger.debug(
            f"Error running MD linter on {self.repo_subject}")
    return results
def convert_obs(self) -> None:
    """Convert the OBS Markdown files to HTML in ``self.output_dir``.

    Each ``.md`` file under ``self.files_dir`` is rendered with ``markdown``
    and wrapped in ``templates/template.html`` (located next to this
    module). A Markdown file that cannot be read is logged as an error and
    skipped. Any other file is copied across unchanged unless a file of
    that name already exists in the output directory.
    """
    self.log.info("Converting OBS markdown files…")

    # Collect every file that is not excluded
    files = get_files(directory=self.files_dir, exclude=self.EXCLUDED_FILES)

    current_dir = os.path.dirname(os.path.realpath(__file__))
    template_path = os.path.join(current_dir, 'templates', 'template.html')
    # Explicit encoding so the template reads the same on every platform
    with open(template_path, encoding='utf-8') as template_file:
        html_template = string.Template(template_file.read())

    for filepath in sorted(files):
        if filepath.endswith('.md'):
            # Convert files that are markdown files
            base_name_part = os.path.splitext(os.path.basename(filepath))[0]
            try:
                md = read_file(filepath)
            except Exception as e:
                self.log.error(f"Error reading {base_name_part+'.md'}: {e}")
                continue
            html = markdown.markdown(md)
            html = html_template.safe_substitute(
                title=self.repo_subject.replace('_', ' '), content=html)
            html_filename = base_name_part + '.html'
            output_filepath = os.path.join(self.output_dir, html_filename)
            write_file(output_filepath, html)
            self.log.info(
                f"Converted {os.path.basename(filepath)} to {os.path.basename(html_filename)}."
            )
        else:
            # Directly copy over files that are not markdown files
            try:
                output_filepath = os.path.join(self.output_dir,
                                               os.path.basename(filepath))
                if not os.path.exists(output_filepath):
                    copyfile(filepath, output_filepath)
            except OSError:
                # Copying extras is best-effort: a file-system failure must
                # not abort the conversion, but other errors still surface.
                pass
    self.log.info("Finished processing OBS markdown files.")
def lint(self):
    """
    Checks for issues with translationWords

    Link problems in every .md file under the source directory are reported
    by ``find_invalid_links``, then the Markdown linter of the parent class
    is run.

    self.source_dir is the directory of source files (.md)
    :return bool: result of the parent class's lint()
    """
    self.source_dir = os.path.abspath(self.source_dir)
    for folder, _subdirs, filenames in os.walk(self.source_dir):
        for filename in filenames:
            _stem, extension = os.path.splitext(filename)
            if extension != '.md':
                continue
            contents = file_utils.read_file(os.path.join(folder, filename))
            self.find_invalid_links(folder, filename, contents)
    # Runs checks on Markdown, using the markdown linter
    return super(TwLinter, self).lint()
def testTemplaterTwComplete(self):
    """Template the converted en_tw project and check the kt.html navigation."""
    test_folder_name = os.path.join('converted_projects',
                                    'en_tw_converted.zip')
    expect_success = True
    extracted_path = self.extractZipFiles(test_folder_name)
    test_file_path = os.path.join(extracted_path, 'en_tw_converted')
    success = self.doTemplater('Translation_Words', test_file_path)
    self.verifyTaTemplater(success, expect_success, self.out_dir,
                           ['kt.html', 'names.html', 'other.html'])

    # Verify sidebar nav generated
    kt_html = read_file(os.path.join(self.out_dir, 'kt.html'))
    soup = BeautifulSoup(kt_html, 'html.parser')
    sidebar = soup.find('nav', {'id': 'right-sidebar-nav'})
    self.assertEqual(len(sidebar.find_all('li')), 1020)
    content = soup.find('div', {'id': 'content'})
    section_headers = content.find_all(re.compile(r'h\d+'),
                                       {'class': 'section-header'})
    self.assertEqual(len(section_headers), 212)
def testTemplaterTaComplete(self):
    """Template the complete en_ta project and check the checking.html navigation."""
    test_folder_name = os.path.join('converted_projects',
                                    'en_ta-complete.zip')
    expect_success = True
    test_file_path = self.extractZipFiles(test_folder_name)
    success = self.doTemplater('Translation_Academy', test_file_path)
    expected_pages = [
        'checking.html', 'intro.html', 'process.html', 'translate.html'
    ]
    self.verifyTaTemplater(success, expect_success, self.out_dir,
                           expected_pages)

    # Verify sidebar nav generated
    checking_html = read_file(os.path.join(self.out_dir, 'checking.html'))
    soup = BeautifulSoup(checking_html, 'html.parser')
    sidebar = soup.find('nav', {'id': 'right-sidebar-nav'})
    self.assertEqual(len(sidebar.find_all('li')), 49)
    content = soup.find('div', {'id': 'content'})
    section_headers = content.find_all(re.compile(r'h\d+'),
                                       {'class': 'section-header'})
    self.assertEqual(len(section_headers), 44)
def test_manifest_last_modified_not_auto_updating(self):
    """``last_updated`` must change only when it is set explicitly."""
    timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
    sometime = datetime.strptime('2017-02-11T15:43:11Z', timestamp_format)
    manifest_fields = {
        'repo_name': 'es_ulb',
        'user_name': 'franco',
        'lang_code': 'es',
        'resource_id': 'ulb',
        'resource_type': 'bundle',
        'title': 'Unlocked Literal Bible',
        'views': 12,
        'last_updated': sometime,
        'manifest': read_file(
            os.path.join(self.resources_dir, 'obs_manifest.yaml'))
    }
    manifest = TxManifest(**manifest_fields)

    # Inserting keeps the supplied timestamp
    manifest.insert()
    stored = TxManifest.get(manifest.id)
    self.assertEqual(stored.last_updated, sometime)

    # Updating an unrelated field leaves the timestamp alone
    manifest.views += 1
    manifest.update()
    stored = TxManifest.get(manifest.id)
    self.assertEqual(stored.last_updated, sometime)

    # Setting the timestamp explicitly is persisted
    manifest.last_updated = datetime.strptime('2018-03-12T15:43:11Z',
                                              timestamp_format)
    manifest.update()
    stored = TxManifest.get(manifest.id)
    self.assertNotEqual(stored.last_updated, sometime)
def test_read_file(self):
    """``file_utils.read_file`` returns exactly what was written to the file."""
    handle, self.tmp_file = tempfile.mkstemp(prefix='Door43_test_')
    # mkstemp returns an already-open OS-level descriptor; writing through it
    # lets the ``with`` block close it instead of leaking it.
    with open(handle, "w") as tmpf:
        tmpf.write("hello world")
    self.assertEqual(file_utils.read_file(self.tmp_file), "hello world")
def prepend_text(self, out_dir, file_name, prefix):
    """Rewrite ``<out_dir>/<file_name>`` with ``prefix`` inserted at the start."""
    target_path = os.path.join(out_dir, file_name)
    write_file(target_path, prefix + read_file(target_path))
def replace_text(self, out_dir, file_name, match, replace):
    """Replace every ``match`` in a file with ``replace`` and rewrite it.

    The test fails if the replacement leaves the text unchanged.
    """
    target_path = os.path.join(out_dir, file_name)
    original_text = read_file(target_path)
    updated_text = original_text.replace(match, replace)
    # Guard against a fixture in which the text to replace is absent
    self.assertNotEqual(original_text, updated_text)
    write_file(target_path, updated_text)
def get_chapter(self, chapter):
    """Concatenate the files of a chapter in sorted path order.

    ``chapter`` is an iterable of file paths. Each file's content is
    followed by a newline, and two more newlines end the chapter.
    """
    pieces = [read_file(chunk_path) + '\n' for chunk_path in sorted(chapter)]
    return ''.join(pieces) + '\n\n'
def get_title(self):
    """Return the content of the first ``title.txt`` found under the source dir.

    The directory tree is walked with ``os.walk``. When no such file
    exists, the project name from the manifest is returned instead.
    """
    for folder, _subdirs, names in os.walk(self.source_dir):
        title_files = fnmatch.filter(names, 'title.txt')
        if title_files:
            return read_file(os.path.join(folder, title_files[0]))
    return self.manifest.project['name']