def test_tn_preprocessor_short(self):
    """tN preprocessor on a 2-book repo: emits per-book markdown plus index.json.

    Books absent from the source repo (GEN, REV) must not be generated.
    """
    # given
    repo_name = 'en_tn_2books'
    file_name = os.path.join('raw_sources', repo_name + '.zip')
    rc, repo_dir, self.temp_dir = self.extractFiles(file_name, repo_name)
    self.out_dir = tempfile.mkdtemp(prefix='output_')

    # when
    results, preproc = do_preprocess(rc, repo_dir, self.out_dir)

    # then
    self.assertTrue(preproc.is_multiple_jobs())
    # assertEquals is a deprecated alias; use assertEqual
    self.assertEqual(len(preproc.get_book_list()), 2)
    self.assertTrue(os.path.isfile(os.path.join(self.out_dir, 'index.json')))
    # books not present in the source must not be emitted
    self.assertFalse(os.path.isfile(os.path.join(self.out_dir, '01-GEN.md')))
    self.assertFalse(os.path.isfile(os.path.join(self.out_dir, '67-REV.md')))
    self.assertTrue(os.path.isfile(os.path.join(self.out_dir, '02-EXO.md')))
    self.assertTrue(os.path.isfile(os.path.join(self.out_dir, '03-LEV.md')))
    # smoke-check that outputs are readable and non-trivial
    read_file(os.path.join(self.out_dir, 'index.json'))
    exo = read_file(os.path.join(self.out_dir, '02-EXO.md'))
    self.assertGreater(len(exo), 1000)
    lev = read_file(os.path.join(self.out_dir, '03-LEV.md'))
    self.assertGreater(len(lev), 1000)
def test_tw_preprocessor(self):
    """tW preprocessor merges term files into kt/names/other markdown plus index.json."""
    # given
    repo_name = 'en_tw'
    file_name = os.path.join('raw_sources', repo_name + '.zip')
    rc, repo_dir, self.temp_dir = self.extractFiles(file_name, repo_name)
    self.out_dir = tempfile.mkdtemp(prefix='output_')

    # when
    results, preproc = do_preprocess(rc, repo_dir, self.out_dir)

    # then
    self.assertTrue(os.path.isfile(os.path.join(self.out_dir, 'index.json')))
    self.assertTrue(os.path.isfile(os.path.join(self.out_dir, 'kt.md')))
    self.assertTrue(os.path.isfile(os.path.join(self.out_dir, 'names.md')))
    self.assertTrue(os.path.isfile(os.path.join(self.out_dir, 'other.md')))
    kt = read_file(os.path.join(self.out_dir, 'kt.md'))
    names = read_file(os.path.join(self.out_dir, 'names.md'))
    other = read_file(os.path.join(self.out_dir, 'other.md'))
    soup = BeautifulSoup(
        markdown2.markdown(kt, extras=['markdown-in-html', 'tables']),
        'html.parser')
    self.assertEqual(soup.h1.text, 'Key Terms')
    self.assertEqual(soup.h2.text, 'abomination, abominations, abominable')
    self.assertIsNotNone(soup.find('a', {'id': 'adoption'}))
    self.assertEqual(len(soup.find_all('li')), 4009)
    self.assertTrue(os.path.isfile(os.path.join(self.out_dir, 'manifest.yaml')))
    # make sure no old rc:// or relative links remain in any section
    for section_text in (kt, names, other):
        self.assertNotIn('(rc:', section_text)
        self.assertNotIn('../', section_text)
def test_tn_preprocessor_short(self):
    """tN preprocessor on a 2-book repo: emits per-book markdown plus index.json.

    Books absent from the source repo (GEN, REV) must not be generated.
    """
    # given
    repo_name = 'en_tn_2books'
    file_name = os.path.join('raw_sources', repo_name + '.zip')
    rc, repo_dir, self.temp_dir = self.extractFiles(file_name, repo_name)
    self.out_dir = tempfile.mkdtemp(prefix='output_')

    # when
    results, preproc = do_preprocess(rc, repo_dir, self.out_dir)

    # then
    self.assertTrue(preproc.is_multiple_jobs())
    # assertEquals is a deprecated alias; use assertEqual
    self.assertEqual(len(preproc.get_book_list()), 2)
    self.assertTrue(os.path.isfile(os.path.join(self.out_dir, 'index.json')))
    # books not present in the source must not be emitted
    self.assertFalse(os.path.isfile(os.path.join(self.out_dir, '01-GEN.md')))
    self.assertFalse(os.path.isfile(os.path.join(self.out_dir, '67-REV.md')))
    self.assertTrue(os.path.isfile(os.path.join(self.out_dir, '02-EXO.md')))
    self.assertTrue(os.path.isfile(os.path.join(self.out_dir, '03-LEV.md')))
    # smoke-check that outputs are readable and non-trivial
    read_file(os.path.join(self.out_dir, 'index.json'))
    exo = read_file(os.path.join(self.out_dir, '02-EXO.md'))
    self.assertGreater(len(exo), 1000)
    lev = read_file(os.path.join(self.out_dir, '03-LEV.md'))
    self.assertGreater(len(lev), 1000)
def title(self):
    """Return this project's title, resolving it lazily.

    Resolution order: explicit 'title' in the project dict, then 'name',
    then a title.txt beside the project (cached back into the dict), then
    a title.txt at the RC root (also cached), and finally the
    resource-level title as a last resort.
    """
    if 'title' in self.project and self.project['title']:
        return self.project['title']
    if 'name' in self.project and self.project['name']:
        return self.project['name']
    if self.rc.path:
        # look for a title.txt beside the project first, then at the RC root
        for title_path in (os.path.join(self.rc.path, self.path, 'title.txt'),
                           os.path.join(self.rc.path, 'title.txt')):
            if os.path.isfile(title_path):
                self.project['title'] = read_file(title_path)
                return self.project['title']
    return self.rc.resource.title
def test_PhpValidWithFormattingTags(self):
    """Linting Philippians with formatting tags inserted must produce no warnings."""
    target_dir = self.copy_resource(self.php_repo_path)
    formatting_text = read_file(
        os.path.join(self.resources_dir, 'formatting_example.txt'))
    # replace v1
    self.replace_verse(target_dir, self.php_file_name, chapter=2,
                       start_vs=1, end_vs=13, replace=formatting_text + ' ')
    expected_warnings = 0
    linter = self.run_linter(target_dir)
    self.verify_results_counts(expected_warnings, linter)
def print_project(self, project_id):
    """
    Build (or retrieve) a single print-ready HTML page for a whole project.

    :param string project_id: '<user>/<repo>/<commit>' identifying the project
    :return string: the combined print_all.html contents
    """
    self.project_id = project_id
    # project_id must be exactly a user/repo/commit triple
    if len(project_id.split('/')) != 3:
        raise Exception('Project not found.')
    user_name, repo_name, commit_id = project_id.split('/')
    source_path = 'u/{0}'.format(project_id)
    print_all_key = '{0}/print_all.html'.format(source_path)
    print_all_file = tempfile.mktemp(prefix='print_all_')
    # Generate the combined page only if it isn't already cached on the CDN
    if not App.cdn_s3_handler().key_exists(print_all_key):
        files_dir = tempfile.mkdtemp(prefix='files_')
        App.cdn_s3_handler().download_dir(source_path, files_dir)
        project_dir = os.path.join(files_dir, source_path.replace('/', os.path.sep))
        if not os.path.isdir(project_dir):
            raise Exception('Project not found.')
        rc = RC(project_dir, repo_name)
        with codecs.open(print_all_file, 'w', 'utf-8-sig') as print_all:
            # Page shell; each project div gets a CSS page break for printing
            print_all.write("""
<html lang="{0}" dir="{1}">
<head>
    <meta charset="UTF-8"/>
    <title>{2}: {3}</title>
    <style type="text/css">
        body > div {{
            page-break-after: always;
        }}
    </style>
</head>
<body onLoad="window.print()">
    <h1>{2}: {3}</h1>
""".format(rc.resource.language.identifier, rc.resource.language.direction,
           rc.resource.language.title, rc.resource.title))
            for fname in sorted(glob(os.path.join(project_dir, '*.html')),
                                key=self.front_to_back):
                with codecs.open(fname, 'r', 'utf-8-sig') as f:
                    soup = BeautifulSoup(f, 'html.parser')
                # get the body of the raw html file
                content = soup.div
                if not content:
                    # placeholder div so the page structure stays consistent
                    content = BeautifulSoup(
                        '<div>No content</div>',
                        'html.parser').find('div').extract()
                content['id'] = os.path.basename(fname)
                print_all.write(unicode(content))  # NOTE: Python 2 `unicode`
            print_all.write("""
</body>
</html>
""")
        App.cdn_s3_handler().upload_file(print_all_file, print_all_key, cache_time=0)
        html = read_file(print_all_file)
    else:
        # cached: serve the previously generated page straight from S3
        html = App.cdn_s3_handler().get_file_contents(print_all_key)
    return html
def get_chapter_reference(project_path, chapter):
    """Get the chapters reference text"""
    ref_path = os.path.join(project_path, chapter, 'reference.txt')
    if os.path.exists(ref_path):
        return read_file(ref_path).strip()
    return ''
def title(self):
    """Return this project's title, resolving it lazily.

    Resolution order: explicit 'title' in the project dict, then 'name',
    then a title.txt beside the project (cached back into the dict), then
    a title.txt at the RC root (also cached), and finally the
    resource-level title as a last resort.
    """
    if 'title' in self.project and self.project['title']:
        return self.project['title']
    if 'name' in self.project and self.project['name']:
        return self.project['name']
    if self.rc.path:
        # look for a title.txt beside the project first, then at the RC root
        for title_path in (os.path.join(self.rc.path, self.path, 'title.txt'),
                           os.path.join(self.rc.path, 'title.txt')):
            if os.path.isfile(title_path):
                self.project['title'] = read_file(title_path)
                return self.project['title']
    return self.rc.resource.title
def test_ta_preprocessor(self):
    """tA preprocessor produces one markdown file per manual plus toc/config yaml."""
    file_name = os.path.join('raw_sources', 'en_ta.zip')
    repo_name = 'en_ta'
    rc, repo_dir, self.temp_dir = self.extractFiles(file_name, repo_name)
    self.out_dir = tempfile.mkdtemp(prefix='output_')
    do_preprocess(rc, repo_dir, self.out_dir)
    manuals = ['01-intro.md', '02-process.md', '03-translate.md', '04-checking.md']
    for manual in manuals:
        self.assertTrue(os.path.isfile(os.path.join(self.out_dir, manual)))
    intro, process, translate, checking = [
        read_file(os.path.join(self.out_dir, manual)) for manual in manuals]
    soup = BeautifulSoup(
        markdown2.markdown(checking, extras=['markdown-in-html', 'tables']),
        'html.parser')
    self.assertEqual(soup.h1.text, "Checking Manual")
    self.assertIsNotNone(soup.find("a", {"id": "accurate"}))
    self.assertEqual(len(soup.find_all('li')), 350)
    # Test links have been converted
    self.assertIsNotNone(soup.find("a", {"href": "#accuracy-check"}))
    self.assertIsNotNone(
        soup.find("a", {"href": "03-translate.html#figs-explicit"}))
    # make sure no old links exist
    for contents in (checking, intro, process, translate):
        self.assertTrue('../' not in contents)
    self.assertTrue(
        os.path.isfile(os.path.join(self.out_dir, '04-checking-toc.yaml')))
    self.assertTrue(
        os.path.isfile(os.path.join(self.out_dir, '04-checking-config.yaml')))
    preprocessor = TaPreprocessor(rc, repo_dir, self.out_dir)
    self.assertEqual(
        preprocessor.get_title(rc.project('checking'), 'fake-link', 'My Title'),
        'My Title')
    self.assertEqual(
        preprocessor.get_title(rc.project('checking'), 'fake-link'),
        'Fake Link')
def run(self):
    """Copy or merge each project's source into the output directory.

    Three layouts are handled per project:
      1. project path is a single file -> copied under a standard book filename
      2. project path is a directory of files -> each file copied as-is
      3. project path holds chapter directories of chunks -> stitched into one file
    :return bool: always True
    """
    for idx, project in enumerate(self.rc.projects):
        project_path = os.path.join(self.source_dir, project.path)
        if os.path.isfile(project_path):
            # Case #1: Project path is a file, then we copy the file over to the output dir
            if project.identifier.lower() in BOOK_NUMBERS:
                # known Bible book: use its canonical number prefix (e.g. 01-GEN)
                filename = '{0}-{1}.{2}'.format(
                    BOOK_NUMBERS[project.identifier.lower()],
                    project.identifier.upper(), self.rc.resource.file_ext)
            else:
                # unknown book: fall back to the project's manifest position
                filename = '{0}-{1}.{2}'.format(
                    str(idx + 1).zfill(2), project.identifier,
                    self.rc.resource.file_ext)
            copy(project_path, os.path.join(self.output_dir, filename))
        else:
            # Case #2: It's a directory of files, so we copy them over to the output directory
            files = glob(
                os.path.join(project_path,
                             '*.{0}'.format(self.rc.resource.file_ext)))
            if len(files):
                for file_path in files:
                    output_file_path = os.path.join(
                        self.output_dir, os.path.basename(file_path))
                    # never overwrite an existing output file and skip ignored files
                    if os.path.isfile(file_path) and not os.path.exists(output_file_path) \
                            and os.path.basename(file_path) not in self.ignoreFiles:
                        copy(file_path, output_file_path)
            else:
                # Case #3: The project path is multiple chapters, so we piece them together
                chapters = self.rc.chapters(project.identifier)
                App.logger.debug("Merging chapters in '{0}'".format(
                    project.identifier))
                if len(chapters):
                    text = ''
                    for chapter in chapters:
                        # chapter/chunk markers are injected before each chunk body
                        text = self.mark_chapter(project.identifier, chapter, text)
                        for chunk in self.rc.chunks(project.identifier, chapter):
                            text = self.mark_chunk(project.identifier, chapter,
                                                   chunk, text)
                            text += read_file(
                                os.path.join(project_path, chapter, chunk)) + "\n\n"
                    if project.identifier.lower() in BOOK_NUMBERS:
                        filename = '{0}-{1}.{2}'.format(
                            BOOK_NUMBERS[project.identifier.lower()],
                            project.identifier.upper(),
                            self.rc.resource.file_ext)
                    else:
                        filename = '{0}-{1}.{2}'.format(
                            str(idx + 1).zfill(2), project.identifier,
                            self.rc.resource.file_ext)
                    write_file(os.path.join(self.output_dir, filename), text)
    return True
def replace_verse_to_end(self, out_dir, file_name, chapter, start_vs, replace):
    """Replace everything from verse `start_vs` of `chapter` to the end of the book."""
    usfm_path = os.path.join(out_dir, file_name)
    original = read_file(usfm_path)
    chapter_pos = original.find('\\c {0:02d}'.format(chapter))
    head = original[:chapter_pos]
    tail = original[chapter_pos:]
    verse_pos = tail.find('\\v {0} '.format(start_vs))
    write_file(usfm_path, head + tail[:verse_pos] + replace)
def replace_chapter(self, out_dir, file_name, start_ch, end_ch, replace):
    """Replace the text from the start of chapter `start_ch` up to (not
    including) the start of chapter `end_ch`."""
    usfm_path = os.path.join(out_dir, file_name)
    original = read_file(usfm_path)
    start_pos = original.find('\\c {0:02d}'.format(start_ch))
    end_pos = original.find('\\c {0:02d}'.format(end_ch))
    write_file(usfm_path, original[:start_pos] + replace + original[end_pos:])
def replace_tag(self, out_dir, file_name, tag, replace):
    """Replace a single USFM tag (from `\\tag` up to the next backslash) with `replace`."""
    usfm_path = os.path.join(out_dir, file_name)
    original = read_file(usfm_path)
    tag_pos = original.find('\\{0}'.format(tag))
    # the tag's content ends where the next USFM marker begins
    next_marker_pos = original.find('\\', tag_pos + 1)
    write_file(usfm_path, original[:tag_pos] + replace + original[next_marker_pos:])
def get_chapter_frames(project_path, chapter):
    """Return a sorted list of {'id', 'text'} dicts for each frame file in a chapter.

    The frame id is '<chapter>-<frame basename without extension>'.
    """
    frames = []
    chapter_dir = os.path.join(project_path, chapter)
    for frame in sorted(os.listdir(chapter_dir)):
        if frame in ObsPreprocessor.ignoreFiles:
            continue
        text = read_file(os.path.join(project_path, chapter, frame))
        # BUG FIX: str.strip('.txt') strips any of the characters '.', 't', 'x'
        # from BOTH ends (e.g. 'first.txt' -> 'firs'), not the suffix; use
        # os.path.splitext to drop the extension safely.
        frames.append({
            'id': chapter + '-' + os.path.splitext(frame)[0],
            'text': text
        })
    return frames
def testTemplaterTaComplete(self):
    """Templating a complete tA conversion must generate all manuals plus sidebar nav."""
    zip_name = os.path.join('converted_projects', 'en_ta-complete.zip')
    project_path = self.extractZipFiles(zip_name)
    success = self.doTemplater('ta', project_path)
    expect_success = True
    self.verifyTaTemplater(success, expect_success, self.out_dir,
                           ['checking.html', 'intro.html', 'process.html', 'translate.html'])
    # Verify sidebar nav generated
    checking_soup = BeautifulSoup(
        read_file(os.path.join(self.out_dir, 'checking.html')), 'html.parser')
    nav_items = checking_soup.find('nav', {'id': 'right-sidebar-nav'}).findAll('li')
    self.assertEqual(len(nav_items), 49)
    section_headers = checking_soup.find('div', {'id': 'content'}).findAll(
        re.compile(r'h\d+'), {'class': 'section-header'})
    self.assertEqual(len(section_headers), 44)
def testTemplaterTwComplete(self):
    """Templating a complete tW conversion must generate all pages plus sidebar nav."""
    zip_name = os.path.join('converted_projects', 'en_tw_converted.zip')
    extracted = self.extractZipFiles(zip_name)
    project_path = os.path.join(extracted, 'en_tw_converted')
    success = self.doTemplater('tw', project_path)
    expect_success = True
    self.verifyTaTemplater(success, expect_success, self.out_dir,
                           ['kt.html', 'names.html', 'other.html'])
    # Verify sidebar nav generated
    kt_soup = BeautifulSoup(
        read_file(os.path.join(self.out_dir, 'kt.html')), 'html.parser')
    nav_items = kt_soup.find('nav', {'id': 'right-sidebar-nav'}).findAll('li')
    self.assertEqual(len(nav_items), 1020)
    section_headers = kt_soup.find('div', {'id': 'content'}).findAll(
        re.compile(r'h\d+'), {'class': 'section-header'})
    self.assertEqual(len(section_headers), 212)
def get_chapter_title(project_path, chapter):
    """
    Get a chapter title.
    if the title file does not exist, it will hand back the number with a period only.
    """
    title_path = os.path.join(project_path, chapter, 'title.txt')
    if not os.path.exists(title_path):
        # fall back to e.g. '1. ' when there is no title file
        return chapter.lstrip('0') + '. '
    return read_file(title_path).strip()
def test_PhpValidWithFormattingTags(self):
    """Linting Philippians with formatting tags inserted must produce no warnings."""
    target_dir = self.copy_resource(self.php_repo_path)
    formatting_text = read_file(
        os.path.join(self.resources_dir, 'formatting_example.txt'))
    # replace v1
    self.replace_verse(target_dir, self.php_file_name, chapter=2,
                       start_vs=1, end_vs=13, replace=formatting_text + ' ')
    expected_warnings = 0
    linter = self.run_linter(target_dir)
    self.verify_results_counts(expected_warnings, linter)
def run(self):
    """Compile each tW section (e.g. kt/names/other) into one markdown file.

    Also builds index.json mapping output html filenames to section titles,
    term titles and book codes, and copies the project's config.yaml into
    the output directory when present.
    :return bool: always True
    """
    index_json = {
        'titles': {},
        'chapters': {},
        'book_codes': {}
    }
    # matches a markdown H1 line; group(1) is the title text
    title_re = re.compile('^# +(.*?) *#*$', flags=re.MULTILINE)
    # matches any markdown header; used below to demote headers one level
    headers_re = re.compile('^(#+) +(.+?) *#*$', flags=re.MULTILINE)
    for idx, project in enumerate(self.rc.projects):
        term_text = {}
        section_dirs = sorted(glob(os.path.join(self.source_dir, project.path, '*')))
        for section_dir in section_dirs:
            section = os.path.basename(section_dir)
            # only known sections (self.section_titles) are processed
            if section not in self.section_titles:
                continue
            key = '{0}.html'.format(section)
            index_json['titles'][key] = self.section_titles[section]
            index_json['chapters'][key] = {}
            index_json['book_codes'][key] = section
            term_files = sorted(glob(os.path.join(section_dir, '*.md')))
            for term_file in term_files:
                term = os.path.splitext(os.path.basename(term_file))[0]
                text = read_file(term_file)
                if title_re.search(text):
                    title = title_re.search(text).group(1)
                    text = title_re.sub(r'# <a id="{0}"/>\1 #'.format(term), text)  # inject the term by the title
                else:
                    title = os.path.splitext(os.path.basename(term_file))[0]  # No title found, so using term
                # demote all remaining headers one level
                text = headers_re.sub(r'#\1 \2', text)
                index_json['chapters'][key][term] = title
                term_text[term] = text
            # Sort terms by title and add to markdown
            markdown = ''
            titles = index_json['chapters'][key]
            terms_sorted_by_title = sorted(titles, key=lambda i: titles[i].lower())
            for term in terms_sorted_by_title:
                if markdown:
                    # separate consecutive terms with a horizontal rule
                    markdown += '<hr>\n\n'
                markdown += term_text[term] + '\n\n'
            # prepend the section header with its anchor, then rewrite links
            markdown = '# <a id="tw-section-{0}"/>{1}\n\n'.format(
                section, self.section_titles[section]) + markdown
            markdown = self.fix_links(markdown, section)
            output_file = os.path.join(self.output_dir, '{0}.md'.format(section))
            write_file(output_file, markdown)
        config_file = os.path.join(self.source_dir, project.path, 'config.yaml')
        if os.path.isfile(config_file):
            copy(config_file, os.path.join(self.output_dir, 'config.yaml'))
    output_file = os.path.join(self.output_dir, 'index.json')
    write_file(output_file, index_json)
    return True
def init_items(self):
    """Seed self.items with three sample manifest records keyed by '<user>/<repo>'."""
    # All three sample records use the same manifest fixture; read it once
    # instead of three times.
    manifest = read_file(os.path.join(self.resources_dir, 'obs_manifest.yaml'))
    self.items = {
        'Door43/en_obs': {
            'repo_name': 'en_obs',
            'user_name': 'Door43',
            'lang_code': 'en',
            'resource_id': 'obs',
            'resource_type': 'book',
            'title': 'Open Bible Stories',
            'views': 2,
            'last_updated': datetime.utcnow(),
            'manifest': manifest,
        },
        'JohnDoe/en_obs': {
            'repo_name': 'en_obs',
            'user_name': 'JohnDoe',
            'lang_code': 'en',
            'resource_id': 'obs',
            'resource_type': 'book',
            'title': 'Open Bible Stories',
            'views': 2,
            'last_updated': datetime.strptime('2016-12-21T05:23:01Z', '%Y-%m-%dT%H:%M:%SZ'),
            'manifest': manifest,
        },
        'francis/fr_ulb': {
            'repo_name': 'fr_ulb',
            'user_name': 'francis',
            'lang_code': 'fr',
            'resource_id': 'ulb',
            'resource_type': 'bundle',
            'title': 'Unlocked Literal Bible',
            'views': 12,
            'last_updated': datetime.strptime('2017-02-11T15:43:11Z', '%Y-%m-%dT%H:%M:%SZ'),
            'manifest': manifest,
        },
    }
def get_title(self, project, link, alt_title=None):
    """Return the title for `link`: its title.md contents if found, else
    `alt_title`, else a title-cased version of the link itself."""
    matching = None
    if link in project.config():
        matching = project
    else:
        # NOTE: intentionally no break — the LAST project containing the
        # link wins, matching the original behavior
        for candidate in self.rc.projects:
            if link in candidate.config():
                matching = candidate
    if matching:
        title_file = os.path.join(self.source_dir, matching.path, link, 'title.md')
        if os.path.isfile(title_file):
            return read_file(title_file)
    return alt_title if alt_title else link.replace('-', ' ').title()
def get_strings(self):
    """Read every .md file under source_dir (or only single_dir when set).

    :return dict: {relative_path: file_contents}
    """
    if self.single_dir:
        dir_path = os.path.join(self.source_dir, self.single_dir)
        sub_files = sorted(get_files(directory=dir_path, relative_paths=True,
                                     exclude=self.EXCLUDED_FILES, extensions=['.md']))
        # re-prefix the relative paths with the single dir name
        files = [os.path.join(self.single_dir, f) for f in sub_files]
    else:
        files = sorted(get_files(directory=self.source_dir, relative_paths=True,
                                 exclude=self.EXCLUDED_FILES, extensions=['.md']))
    return {f: read_file(os.path.join(self.source_dir, f)) for f in files}
def run(self):
    """Compile tQ chunks into one markdown file per Bible book.

    Emits book/chapter/chunk anchors, records chapter anchors in index.json,
    and writes index.json last.
    :return bool: always True
    """
    index_json = {
        'titles': {},
        'chapters': {},
        'book_codes': {}
    }
    # matches any markdown header; used below to demote chunk headers
    headers_re = re.compile('^(#+) +(.+?) *#*$', flags=re.MULTILINE)
    for idx, project in enumerate(self.rc.projects):
        if project.identifier in BOOK_NAMES:
            markdown = ''
            book = project.identifier.lower()
            html_file = '{0}-{1}.html'.format(BOOK_NUMBERS[book], book.upper())
            index_json['book_codes'][html_file] = book
            name = BOOK_NAMES[book]
            index_json['titles'][html_file] = name
            chapter_dirs = sorted(glob(os.path.join(self.source_dir, project.path, '*')))
            markdown += '# <a id="tq-{0}"/> {1}\n\n'.format(book, name)
            index_json['chapters'][html_file] = []
            for chapter_dir in chapter_dirs:
                chapter = os.path.basename(chapter_dir)
                link = 'tq-chapter-{0}-{1}'.format(book, chapter.zfill(3))
                index_json['chapters'][html_file].append(link)
                markdown += '## <a id="{0}"/> {1} {2}\n\n'.format(
                    link, name, chapter.lstrip('0'))
                chunk_files = sorted(glob(os.path.join(chapter_dir, '*.md')))
                for chunk_idx, chunk_file in enumerate(chunk_files):
                    start_verse = os.path.splitext(os.path.basename(chunk_file))[0].lstrip('0')
                    if chunk_idx < len(chunk_files)-1:
                        # end verse is one less than the next chunk's start verse
                        end_verse = str(int(os.path.splitext(
                            os.path.basename(chunk_files[chunk_idx+1]))[0])-1)
                    else:
                        # last chunk runs to the end of the chapter
                        end_verse = BOOK_CHAPTER_VERSES[book][chapter.lstrip('0')]
                    link = 'tq-chunk-{0}-{1}-{2}'.format(
                        book, str(chapter).zfill(3), str(start_verse).zfill(3))
                    markdown += '### <a id="{0}"/>{1} {2}:{3}{4}\n\n'.\
                        format(link, name, chapter.lstrip('0'), start_verse,
                               '-'+end_verse if start_verse != end_verse else '')
                    text = read_file(chunk_file) + '\n\n'
                    text = headers_re.sub(r'\1### \2', text)  # This will bump any header down 3 levels
                    markdown += text
            file_path = os.path.join(self.output_dir,
                                     '{0}-{1}.md'.format(BOOK_NUMBERS[book], book.upper()))
            write_file(file_path, markdown)
        else:
            App.logger.debug('TqPreprocessor: extra project found: {0}'.format(
                project.identifier))
    # Write out index.json
    output_file = os.path.join(self.output_dir, 'index.json')
    write_file(output_file, index_json)
    return True
def lint(self):
    """
    Checks for issues with translationWords

    Use self.log.warning("message") to log any issues.
    self.source_dir is the directory of source files (.md)
    :return bool:
    """
    self.source_dir = os.path.abspath(self.source_dir)
    for root, dirs, files in os.walk(self.source_dir):
        for file_name in files:
            if os.path.splitext(file_name)[1] != '.md':
                continue
            contents = file_utils.read_file(os.path.join(root, file_name))
            self.find_invalid_links(root, file_name, contents)
    # Runs checks on Markdown, using the markdown linter
    return super(TwLinter, self).lint()
def run(self):
    """Copy or merge each project's source into the output directory.

    Three layouts are handled per project:
      1. project path is a single file -> copied under a standard book filename
      2. project path is a directory of files -> each file copied as-is
      3. project path holds chapter directories of chunks -> stitched into one file
    :return bool: always True
    """
    for idx, project in enumerate(self.rc.projects):
        project_path = os.path.join(self.source_dir, project.path)
        if os.path.isfile(project_path):
            # Case #1: Project path is a file, then we copy the file over to the output dir
            if project.identifier.lower() in BOOK_NUMBERS:
                # known Bible book: use its canonical number prefix (e.g. 01-GEN)
                filename = '{0}-{1}.{2}'.format(BOOK_NUMBERS[project.identifier.lower()],
                                                project.identifier.upper(),
                                                self.rc.resource.file_ext)
            else:
                # unknown book: fall back to the project's manifest position
                filename = '{0}-{1}.{2}'.format(str(idx + 1).zfill(2),
                                                project.identifier,
                                                self.rc.resource.file_ext)
            copy(project_path, os.path.join(self.output_dir, filename))
        else:
            # Case #2: It's a directory of files, so we copy them over to the output directory
            files = glob(os.path.join(project_path,
                                      '*.{0}'.format(self.rc.resource.file_ext)))
            if len(files):
                for file_path in files:
                    output_file_path = os.path.join(self.output_dir,
                                                    os.path.basename(file_path))
                    # never overwrite an existing output file and skip ignored files
                    if os.path.isfile(file_path) and not os.path.exists(output_file_path) \
                            and os.path.basename(file_path) not in self.ignoreFiles:
                        copy(file_path, output_file_path)
            else:
                # Case #3: The project path is multiple chapters, so we piece them together
                chapters = self.rc.chapters(project.identifier)
                App.logger.debug("Merging chapters in '{0}'".format(project.identifier))
                if len(chapters):
                    text = ''
                    for chapter in chapters:
                        # chapter/chunk markers are injected before each chunk body
                        text = self.mark_chapter(project.identifier, chapter, text)
                        for chunk in self.rc.chunks(project.identifier, chapter):
                            text = self.mark_chunk(project.identifier, chapter, chunk, text)
                            text += read_file(os.path.join(project_path, chapter, chunk))+"\n\n"
                    if project.identifier.lower() in BOOK_NUMBERS:
                        filename = '{0}-{1}.{2}'.format(BOOK_NUMBERS[project.identifier.lower()],
                                                        project.identifier.upper(),
                                                        self.rc.resource.file_ext)
                    else:
                        filename = '{0}-{1}.{2}'.format(str(idx+1).zfill(2),
                                                        project.identifier,
                                                        self.rc.resource.file_ext)
                    write_file(os.path.join(self.output_dir, filename), text)
    return True
def lint(self):
    """
    Checks for issues with translationNotes

    Use self.log.warning("message") to log any issues.
    self.source_dir is the directory of source files (.md)
    :return boolean:
    """
    self.source_dir = os.path.abspath(self.source_dir)
    source_dir = self.source_dir if not self.single_dir \
        else os.path.join(self.source_dir, self.single_dir)
    for root, dirs, files in os.walk(source_dir):
        for f in files:
            file_path = os.path.join(root, f)
            parts = os.path.splitext(f)
            if parts[1] == '.md':
                contents = file_utils.read_file(file_path)
                self.find_invalid_links(root, f, contents)
    # warn about any book folder that has no content files at all
    for book_dir in BOOK_NUMBERS:  # renamed from `dir`, which shadowed the builtin
        if self.single_dir and (book_dir != self.single_dir):
            continue
        App.logger.debug("Processing folder {0}".format(book_dir))
        file_path = os.path.join(self.source_dir, book_dir)
        found_files = False
        for root, dirs, files in os.walk(file_path):
            if root == file_path:
                continue  # skip book folder
            if len(files) > 0:
                found_files = True
                break
        if not found_files:
            msg = "missing book: '{0}'".format(book_dir)
            self.log.warnings.append(msg)
            App.logger.debug(msg)
    results = super(TnLinter, self).lint()  # Runs checks on Markdown, using the markdown linter
    if not results:
        App.logger.debug("Error running MD linter on {0}".format(self.s3_results_key))
    return results
def testTemplaterTwComplete(self):
    """Templating a complete tW conversion must generate all pages plus sidebar nav."""
    zip_name = os.path.join('converted_projects', 'en_tw_converted.zip')
    extracted = self.extractZipFiles(zip_name)
    project_path = os.path.join(extracted, 'en_tw_converted')
    success = self.doTemplater('tw', project_path)
    expect_success = True
    self.verifyTaTemplater(success, expect_success, self.out_dir,
                           ['kt.html', 'names.html', 'other.html'])
    # Verify sidebar nav generated
    kt_soup = BeautifulSoup(
        read_file(os.path.join(self.out_dir, 'kt.html')), 'html.parser')
    nav_items = kt_soup.find('nav', {'id': 'right-sidebar-nav'}).findAll('li')
    self.assertEqual(len(nav_items), 1020)
    section_headers = kt_soup.find('div', {'id': 'content'}).findAll(
        re.compile(r'h\d+'), {'class': 'section-header'})
    self.assertEqual(len(section_headers), 212)
def get_strings(self):
    """Read every .md file under source_dir (or only single_dir when set).

    :return dict: {relative_path: file_contents}
    """
    if self.single_dir:
        dir_path = os.path.join(self.source_dir, self.single_dir)
        sub_files = sorted(get_files(directory=dir_path, relative_paths=True,
                                     exclude=self.EXCLUDED_FILES, extensions=['.md']))
        # re-prefix the relative paths with the single dir name
        files = [os.path.join(self.single_dir, f) for f in sub_files]
    else:
        files = sorted(get_files(directory=self.source_dir, relative_paths=True,
                                 exclude=self.EXCLUDED_FILES, extensions=['.md']))
    return {f: read_file(os.path.join(self.source_dir, f)) for f in files}
def testTemplaterTaComplete(self):
    """Templating a complete tA conversion must generate all manuals plus sidebar nav."""
    zip_name = os.path.join('converted_projects', 'en_ta-complete.zip')
    project_path = self.extractZipFiles(zip_name)
    success = self.doTemplater('ta', project_path)
    expect_success = True
    self.verifyTaTemplater(success, expect_success, self.out_dir,
                           ['checking.html', 'intro.html', 'process.html', 'translate.html'])
    # Verify sidebar nav generated
    checking_soup = BeautifulSoup(
        read_file(os.path.join(self.out_dir, 'checking.html')), 'html.parser')
    nav_items = checking_soup.find('nav', {'id': 'right-sidebar-nav'}).findAll('li')
    self.assertEqual(len(nav_items), 49)
    section_headers = checking_soup.find('div', {'id': 'content'}).findAll(
        re.compile(r'h\d+'), {'class': 'section-header'})
    self.assertEqual(len(section_headers), 44)
def test_manifest_last_modified_not_auto_updating(self):
    """last_updated must only change when explicitly set, not on every update()."""
    sometime = datetime.strptime('2017-02-11T15:43:11Z', '%Y-%m-%dT%H:%M:%SZ')
    record = {
        'repo_name': 'es_ulb',
        'user_name': 'franco',
        'lang_code': 'es',
        'resource_id': 'ulb',
        'resource_type': 'bundle',
        'title': 'Unlocked Literal Bible',
        'views': 12,
        'last_updated': sometime,
        'manifest': read_file(os.path.join(self.resources_dir, 'obs_manifest.yaml')),
    }
    manifest = TxManifest(**record)
    manifest.insert()
    self.assertEqual(TxManifest.get(manifest.id).last_updated, sometime)
    # bumping an unrelated field must not touch last_updated
    manifest.views = manifest.views + 1
    manifest.update()
    self.assertEqual(TxManifest.get(manifest.id).last_updated, sometime)
    # an explicit change to last_updated must persist
    manifest.last_updated = datetime.strptime('2018-03-12T15:43:11Z', '%Y-%m-%dT%H:%M:%SZ')
    manifest.update()
    self.assertNotEqual(TxManifest.get(manifest.id).last_updated, sometime)
def test_manifest_last_modified_not_auto_updating(self):
    """last_updated must only change when explicitly set, not on every update()."""
    sometime = datetime.strptime('2017-02-11T15:43:11Z', '%Y-%m-%dT%H:%M:%SZ')
    record = {
        'repo_name': 'es_ulb',
        'user_name': 'franco',
        'lang_code': 'es',
        'resource_id': 'ulb',
        'resource_type': 'bundle',
        'title': 'Unlocked Literal Bible',
        'views': 12,
        'last_updated': sometime,
        'manifest': read_file(os.path.join(self.resources_dir, 'obs_manifest.yaml')),
    }
    manifest = TxManifest(**record)
    manifest.insert()
    self.assertEqual(TxManifest.get(manifest.id).last_updated, sometime)
    # bumping an unrelated field must not touch last_updated
    manifest.views = manifest.views + 1
    manifest.update()
    self.assertEqual(TxManifest.get(manifest.id).last_updated, sometime)
    # an explicit change to last_updated must persist
    manifest.last_updated = datetime.strptime('2018-03-12T15:43:11Z', '%Y-%m-%dT%H:%M:%SZ')
    manifest.update()
    self.assertNotEqual(TxManifest.get(manifest.id).last_updated, sometime)
def replace_text(self, out_dir, file_name, match, replace):
    """Replace `match` with `replace` in the file, asserting something actually changed."""
    target = os.path.join(out_dir, file_name)
    original = read_file(target)
    updated = original.replace(match, replace)
    self.assertNotEqual(original, updated)
    write_file(target, updated)
def get_content(self, project, slug):
    """Return the contents of <project>/<slug>/01.md, or None when absent."""
    path = os.path.join(self.source_dir, project.path, slug, '01.md')
    return read_file(path) if os.path.isfile(path) else None
def append_text(self, out_dir, file_name, append):
    """Append `append` to the end of the given file."""
    target = os.path.join(out_dir, file_name)
    write_file(target, read_file(target) + append)
def read_last_uploaded_file(self, match):
    """Return the contents of the most recently uploaded file matching `match`, or None."""
    uploaded_path = self.get_last_uploaded_file(match)
    if not uploaded_path:
        return None
    return read_file(uploaded_path)
def print_file(self, file_name, file_path):
    """Log the first 200 characters of a file for debugging."""
    preview = file_utils.read_file(file_path)[:200]  # get the start of the file
    App.logger.debug("\nOutput file (" + file_name + "): " + preview + "\n")
def get_question(self, project, slug):
    """Return the sub-title text for `slug`, or None when sub-title.md is absent."""
    path = os.path.join(self.source_dir, project.path, slug, 'sub-title.md')
    return read_file(path) if os.path.isfile(path) else None
def run(self):
    """Assemble USFM books into the output directory under canonical filenames.

    Handles three source layouts per project:
      1. a single USFM file
      2. a directory of USFM files
      3. chapter directories of chunk files, stitched into one USFM book
    Every produced filename is recorded in self.books.
    :return bool: always True
    """
    for idx, project in enumerate(self.rc.projects):
        project_path = os.path.join(self.source_dir, project.path)
        file_format = '{0}-{1}.usfm'
        # Case #1: The project path is a file, and thus is one book of the Bible, copy to standard filename
        if os.path.isfile(project_path):
            if project.identifier.lower() in BOOK_NUMBERS:
                filename = file_format.format(BOOK_NUMBERS[project.identifier.lower()],
                                              project.identifier.upper())
            else:
                # unknown book: fall back to the project's manifest position
                filename = file_format.format(str(idx+1).zfill(2), project.identifier.upper())
            copy(project_path, os.path.join(self.output_dir, filename))
            self.books.append(filename)
        else:
            # Case #2: Project path is a dir with one or more USFM files, is one or more books of the Bible
            usfm_files = glob(os.path.join(project_path, '*.usfm'))
            if len(usfm_files):
                for usfm_path in usfm_files:
                    # book code is the last dash-separated part of the basename
                    book_code = os.path.splitext(os.path.basename(usfm_path))[0].split('-')[-1].lower()
                    if book_code in BOOK_NUMBERS:
                        filename = file_format.format(BOOK_NUMBERS[book_code], book_code.upper())
                    else:
                        filename = '{0}.usfm'.format(os.path.splitext(os.path.basename(usfm_path))[0])
                    output_file_path = os.path.join(self.output_dir, filename)
                    # never overwrite an existing output file
                    if os.path.isfile(usfm_path) and not os.path.exists(output_file_path):
                        copy(usfm_path, output_file_path)
                    self.books.append(filename)
            else:
                # Case #3: Project path is a dir with one or more chapter dirs with chunk & title files
                chapters = self.rc.chapters(project.identifier)
                if len(chapters):
                    # Piece the USFM file together
                    title_file = os.path.join(project_path, chapters[0], 'title.txt')
                    if os.path.isfile(title_file):
                        title = read_file(title_file)
                        # drop a trailing chapter number from the title (e.g. "Genesis 01")
                        title = re.sub(r' \d+$', '', title).strip()
                    else:
                        title = project.title
                    if not title and os.path.isfile(os.path.join(project_path, 'title.txt')):
                        title = read_file(os.path.join(project_path, 'title.txt'))
                    # USFM book header
                    usfm = """
\\id {0} {1}
\\ide UTF-8
\\h {2}
\\toc1 {2}
\\toc2 {2}
\\mt {2}
""".format(project.identifier.upper(), self.rc.resource.title, title)
                    for chapter in chapters:
                        if chapter in self.ignoreDirectories:
                            continue
                        chapter_num = chapter.lstrip('0')
                        chunks = self.rc.chunks(project.identifier, chapter)
                        if not len(chunks):
                            continue
                        first_chunk = read_file(os.path.join(project_path, chapter, chunks[0]))
                        usfm += "\n\n"
                        # add a \c marker only when the first chunk doesn't already carry one
                        if '\\c {0}'.format(chapter_num) not in first_chunk:
                            usfm += "\\c {0}\n".format(chapter_num)
                        if os.path.isfile(os.path.join(project_path, chapter, 'title.txt')):
                            translated_title = read_file(os.path.join(project_path, chapter, 'title.txt'))
                            book_name = re.sub(r' \d+$', '', translated_title).strip()
                            # emit a chapter label only when it differs from the book title
                            if book_name.lower() != title.lower():
                                usfm += "\cl {0}\n".format(translated_title)
                        for chunk in chunks:
                            if chunk in self.ignoreFiles:
                                continue
                            chunk_num = os.path.splitext(chunk)[0].lstrip('0')
                            chunk_content = read_file(os.path.join(project_path, chapter, chunk))
                            # ensure the chunk opens with its \v marker
                            if '\\v {0} '.format(chunk_num) not in chunk_content:
                                chunk_content = '\\v {0} '.format(chunk_num) + chunk_content
                            usfm += chunk_content+"\n"
                    if project.identifier.lower() in BOOK_NUMBERS:
                        filename = file_format.format(BOOK_NUMBERS[project.identifier.lower()],
                                                      project.identifier.upper())
                    else:
                        filename = file_format.format(str(idx + 1).zfill(2),
                                                      project.identifier.upper())
                    write_file(os.path.join(self.output_dir, filename), usfm)
                    self.books.append(filename)
    return True
def run(self):
    """Convert translationNotes chapter/chunk markdown into one .md file per book.

    For every project whose identifier is a known book (BOOK_NAMES), chapter and
    chunk files are concatenated into a single markdown document with anchor
    links (tn-<book>, tn-chapter-..., tn-chunk-...), written to the output dir
    as '<book-number>-<BOOK>.md'. An index.json mapping html filenames to
    titles, book codes, and chapter anchor lists is written at the end.
    Always returns True.
    """
    index_json = {'titles': {}, 'chapters': {}, 'book_codes': {}}
    # Matches a markdown ATX header line, capturing the #-run and the header text
    headers_re = re.compile('^(#+) +(.+?) *#*$', flags=re.MULTILINE)
    for idx, project in enumerate(self.rc.projects):
        App.logger.debug('TnPreprocessor: processing project: {0}'.format(
            project.identifier))
        if project.identifier in BOOK_NAMES:
            markdown = ''
            book = project.identifier.lower()
            html_file = '{0}-{1}.html'.format(BOOK_NUMBERS[book], book.upper())
            index_json['book_codes'][html_file] = book
            name = BOOK_NAMES[book]
            index_json['titles'][html_file] = name
            chapter_dirs = sorted(
                glob(os.path.join(self.source_dir, project.path, '*')))
            markdown += '# <a id="tn-{0}"/> {1}\n\n'.format(book, name)
            index_json['chapters'][html_file] = []
            # 'front' and 'intro' dirs are pulled ahead of the numeric chapters
            for move_str in ['front', 'intro']:
                self.move_to_front(chapter_dirs, move_str)
            for chapter_dir in chapter_dirs:
                chapter = os.path.basename(chapter_dir)
                link = 'tn-chapter-{0}-{1}'.format(book, chapter.zfill(3))
                index_json['chapters'][html_file].append(link)
                markdown += '## <a id="{0}"/> {1} {2}\n\n'.format(
                    link, name, chapter.lstrip('0'))
                chunk_files = sorted(
                    glob(os.path.join(chapter_dir, '*.md')))
                chunk_files_txt = sorted(
                    glob(os.path.join(chapter_dir, '*.txt')))
                # If there are txt files in chapter folders, convert them to md format
                if len(chunk_files_txt):
                    if txt2md(chapter_dir):
                        # Source tree changed on disk: restart the whole pass
                        return self.run()
                for move_str in ['front', 'intro']:
                    self.move_to_front(chunk_files, move_str)
                for chunk_idx, chunk_file in enumerate(chunk_files):
                    # Chunk file stem is the starting verse of the chunk
                    start_verse = os.path.splitext(
                        os.path.basename(chunk_file))[0].lstrip('0')
                    if chunk_idx < len(chunk_files) - 1:
                        # End verse = (next chunk's start verse) - 1, when numeric
                        base_file_name = os.path.splitext(
                            os.path.basename(chunk_files[chunk_idx + 1]))[0]
                        if base_file_name.isdigit():
                            end_verse = str(int(base_file_name) - 1)
                        else:
                            end_verse = start_verse
                    else:
                        # Last chunk: end at the chapter's final verse, if known
                        chapter_str = chapter.lstrip('0')
                        chapter_verses = BOOK_CHAPTER_VERSES[book]
                        end_verse = chapter_verses[
                            chapter_str] if chapter_str in chapter_verses else start_verse
                    start_verse_str = str(start_verse).zfill(
                        3) if start_verse.isdigit() else start_verse
                    link = 'tn-chunk-{0}-{1}-{2}'.format(
                        book, str(chapter).zfill(3), start_verse_str)
                    # Header shows a verse range only when start != end
                    markdown += '### <a id="{0}"/>{1} {2}:{3}{4}\n\n'. \
                        format(link, name, chapter.lstrip('0'), start_verse,
                               '-'+end_verse if start_verse != end_verse else '')
                    text = read_file(chunk_file) + '\n\n'
                    text = headers_re.sub(
                        r'\1## \2', text)  # This will bump any header down 2 levels
                    markdown += text
            markdown = self.fix_links(markdown)
            book_file_name = '{0}-{1}.md'.format(BOOK_NUMBERS[book], book.upper())
            self.books.append(book_file_name)
            file_path = os.path.join(self.output_dir, book_file_name)
            write_file(file_path, markdown)
        else:
            # Non-book projects are skipped with a debug log
            App.logger.debug(
                'TnPreprocessor: extra project found: {0}'.format(
                    project.identifier))
    # Write out index.json
    output_file = os.path.join(self.output_dir, 'index.json')
    write_file(output_file, index_json)
    return True
def run(self):
    """Combine translationWords term files into one markdown file per section.

    Each known section dir (keys of self.section_titles, e.g. kt/names/other)
    is gathered: every term file contributes its text (title tagged with an
    anchor for the term, other headers bumped down a level), terms are sorted
    by title, joined with <hr> separators under a section header, link-fixed,
    and written as '<section>.md'. A project-level config.yaml is copied
    through if present, and index.json (titles/chapters/book_codes per
    section) is written at the end. Always returns True.
    """
    index_json = {'titles': {}, 'chapters': {}, 'book_codes': {}}
    # Matches a level-1 markdown header, capturing the title text
    title_re = re.compile('^# +(.*?) *#*$', flags=re.MULTILINE)
    # Matches any ATX header, capturing the #-run and the header text
    headers_re = re.compile('^(#+) +(.+?) *#*$', flags=re.MULTILINE)
    for idx, project in enumerate(self.rc.projects):
        term_text = {}
        section_dirs = sorted(
            glob(os.path.join(self.source_dir, project.path, '*')))
        for section_dir in section_dirs:
            section = os.path.basename(section_dir)
            # Only process the sections this preprocessor knows titles for
            if section not in self.section_titles:
                continue
            key = '{0}.html'.format(section)
            index_json['titles'][key] = self.section_titles[section]
            index_json['chapters'][key] = {}
            index_json['book_codes'][key] = section
            term_files = sorted(glob(os.path.join(section_dir, '*.md')))
            term_files_txt = sorted(
                glob(os.path.join(section_dir, '*.txt')))
            # If there are txt files in section folders, convert them to md format
            if len(term_files_txt):
                if txt2md(section_dir):
                    # Source tree changed on disk: restart the whole pass
                    return self.run()
            for term_file in term_files:
                term = os.path.splitext(os.path.basename(term_file))[0]
                text = read_file(term_file)
                if title_re.search(text):
                    title = title_re.search(text).group(1)
                    text = title_re.sub(
                        r'# <a id="{0}"/>\1 #'.format(term),
                        text)  # inject the term by the title
                else:
                    title = os.path.splitext(os.path.basename(term_file))[
                        0]  # No title found, so using term
                # Bump every header down one level (adds one '#')
                text = headers_re.sub(r'#\1 \2', text)
                index_json['chapters'][key][term] = title
                term_text[term] = text
            # Sort terms by title and add to markdown
            markdown = ''
            titles = index_json['chapters'][key]
            terms_sorted_by_title = sorted(titles, key=lambda i: titles[i].lower())
            for term in terms_sorted_by_title:
                if markdown:
                    markdown += '<hr>\n\n'
                markdown += term_text[term] + '\n\n'
            # Prepend the anchored section header
            markdown = '# <a id="tw-section-{0}"/>{1}\n\n'.format(
                section, self.section_titles[section]) + markdown
            markdown = self.fix_links(markdown, section)
            output_file = os.path.join(self.output_dir,
                                       '{0}.md'.format(section))
            write_file(output_file, markdown)
        # Pass the project's config.yaml through to the output, if present
        config_file = os.path.join(self.source_dir, project.path, 'config.yaml')
        if os.path.isfile(config_file):
            copy(config_file, os.path.join(self.output_dir, 'config.yaml'))
    output_file = os.path.join(self.output_dir, 'index.json')
    write_file(output_file, index_json)
    return True
def prepend_text(self, out_dir, file_name, prefix): file_path = os.path.join(out_dir, file_name) text = read_file(file_path) new_text = prefix + text write_file(file_path, new_text)
def test_read_file(self): _, self.tmp_file = tempfile.mkstemp() with open(self.tmp_file, "w") as tmpf: tmpf.write("hello world") self.assertEqual(file_utils.read_file(self.tmp_file), "hello world")
def print_file(self, file_name, file_path): text = file_utils.read_file(file_path)[:200] # get the start of the file App.logger.debug("\nOutput file (" + file_name + "): " + text + "\n")