def test_get_files(self):
    """
    Exercise file_utils.get_files() against a small temporary tree.

    The tree holds two files at the top level plus a 'subdir' directory
    containing a single '.md' file. Three calls are checked:

    * absolute paths with directories included (4 entries expected),
    * filtering on the '.md' extension (1 entry expected),
    * relative paths with directories included (4 entries expected).

    The temporary directory is stored on self.tmp_dir.
    NOTE(review): presumably tearDown() removes self.tmp_dir -- confirm.
    """
    self.tmp_dir = tempfile.mkdtemp(prefix='Door43_test_file_utils_')

    # mkstemp() returns an already-open OS-level descriptor along with the
    # path; close each one immediately so the test does not leak handles.
    fd, tmp_file1 = tempfile.mkstemp(dir=self.tmp_dir)
    os.close(fd)
    fd, tmp_file2 = tempfile.mkstemp(dir=self.tmp_dir)
    os.close(fd)
    tmp_subdir = os.path.join(self.tmp_dir, 'subdir')
    os.mkdir(tmp_subdir)
    fd, tmp_file3 = tempfile.mkstemp(dir=tmp_subdir, suffix=".md")
    os.close(fd)

    created = (tmp_file1, tmp_file2, tmp_subdir, tmp_file3)

    # Absolute paths, directories included.
    files = file_utils.get_files(self.tmp_dir,
                                 relative_paths=False,
                                 include_directories=True)
    self.assertEqual(len(files), 4)
    for expected in created:
        self.assertTrue(any(self.paths_equal(expected, d) for d in files))

    # Only the markdown file inside the sub-directory matches '.md'.
    files = file_utils.get_files(self.tmp_dir, extensions=['.md'])
    self.assertEqual(len(files), 1)

    # Relative paths, directories included.
    files = file_utils.get_files(self.tmp_dir,
                                 relative_paths=True,
                                 include_directories=True)
    self.assertEqual(len(files), 4)
    for expected in created:
        relative = os.path.relpath(expected, self.tmp_dir)
        self.assertTrue(any(self.paths_equal(relative, d) for d in files))
def get_files(self, relative_paths: bool) -> List[str]:
    """
    Return the sorted list of .md files to be processed.

    When self.single_dir is set, only that sub-directory of
    self.source_dir is searched and every result is joined back onto
    self.single_dir; otherwise the whole of self.source_dir is searched.

    relative_paths (True or False) is passed straight through to the
    module-level get_files() helper, as is self.EXCLUDED_FILES.
    """
    if self.single_dir:
        search_dir = os.path.join(self.source_dir, self.single_dir)
    else:
        search_dir = self.source_dir

    found = get_files(directory=search_dir,
                      relative_paths=relative_paths,
                      exclude=self.EXCLUDED_FILES,
                      extensions=['.md'])
    ordered = sorted(found)

    if not self.single_dir:
        return ordered
    return [os.path.join(self.single_dir, name) for name in ordered]
def test_get_files(self):
    """
    Exercise file_utils.get_files() against a small temporary tree.

    The tree holds two files at the top level plus a 'subdir' directory
    containing one more file. Both the absolute-path and the relative-path
    variants are expected to report all four entries when directories are
    included.
    """
    import shutil  # local import: only needed for the cleanup below

    tmp_dir = tempfile.mkdtemp()
    try:
        # mkstemp() returns an already-open OS-level descriptor along with
        # the path; close each one so the test does not leak handles (and
        # so the tree can be removed on platforms that lock open files).
        fd, tmp_file1 = tempfile.mkstemp(dir=tmp_dir)
        os.close(fd)
        fd, tmp_file2 = tempfile.mkstemp(dir=tmp_dir)
        os.close(fd)
        tmp_subdir = os.path.join(tmp_dir, 'subdir')
        os.mkdir(tmp_subdir)
        fd, tmp_file3 = tempfile.mkstemp(dir=tmp_subdir)
        os.close(fd)

        created = (tmp_file1, tmp_file2, tmp_subdir, tmp_file3)

        # Absolute paths, directories included.
        files = file_utils.get_files(tmp_dir,
                                     relative_paths=False,
                                     include_directories=True)
        self.assertEqual(len(files), 4)
        for expected in created:
            self.assertTrue(
                any(self.paths_equal(expected, d) for d in files))

        # Relative paths, directories included.
        files = file_utils.get_files(tmp_dir,
                                     relative_paths=True,
                                     include_directories=True)
        self.assertEqual(len(files), 4)
        for expected in created:
            relative = os.path.relpath(expected, tmp_dir)
            self.assertTrue(
                any(self.paths_equal(relative, d) for d in files))
    finally:
        # Nothing else keeps a reference to tmp_dir, so remove it here
        # rather than leaving a stray directory behind after every run.
        shutil.rmtree(tmp_dir, ignore_errors=True)
def update_from_files(self, path):
    """
    Fill in missing format / resource / generator details from files on disk.

    * If self.format is empty, the files returned by get_files(path) are
      scanned and the format is set to 'usfm', 'markdown' or 'html', in that
      order of priority. When USFM is found and self.resource['id'] is
      empty, the resource id/name are set to 'bible' / 'Bible'.
    * If self.generator['name'] is empty, it is set to 'ts' as soon as an
      immediate sub-directory of path is found whose name parses as a
      non-zero integer and which directly contains at least one '.txt' file.

    Values that are already set are never overwritten.

    :param path: directory to inspect; trailing '/' characters are ignored.
    """
    path = path.rstrip('/')

    if not self.format:
        found_usfm = found_markdown = found_html = False
        for f in get_files(path):
            # NOTE(review): 'usfm' is matched without a leading dot, unlike
            # the other extensions -- kept as-is; confirm it is intentional.
            if f.endswith('usfm'):
                found_usfm = True
            elif f.endswith('.md'):
                found_markdown = True
            elif f.endswith('.html'):
                found_html = True

        if found_usfm:
            self.format = 'usfm'
            if not self.resource['id']:
                self.resource['id'] = 'bible'
                self.resource['name'] = 'Bible'
        elif found_markdown:
            self.format = 'markdown'
        elif found_html:
            self.format = 'html'

    if not self.generator['name']:
        for subdir in glob(os.path.join(path, '*')):
            if not os.path.isdir(subdir):
                continue
            # basename() is used rather than slicing off len(path) + 1
            # characters, which breaks when path is empty or contains
            # glob wildcards.
            dir_name = os.path.basename(subdir)
            try:
                numbered = bool(int(dir_name))  # a value of 0 does not count
            except ValueError:
                continue  # not a numeric directory name
            if numbered and glob(os.path.join(subdir, '*.txt')):
                self.generator['name'] = 'ts'
                break
def convert_obs(self) -> None:
    """
    Convert the OBS markdown files in self.files_dir to HTML.

    Every file returned by get_files() (minus self.EXCLUDED_FILES) is
    handled in sorted order:

    * '.md' files are rendered with markdown.markdown(), wrapped in
      templates/template.html (title = self.repo_subject with underscores
      replaced by spaces) and written to self.output_dir as
      '<base name>.html'. A file that cannot be read is logged as an error
      and skipped.
    * Any other file is copied into self.output_dir under its base name,
      unless a file of that name already exists there.
    """
    self.log.info("Converting OBS markdown files…")

    # Gather every non-excluded file under files_dir.
    files = get_files(directory=self.files_dir, exclude=self.EXCLUDED_FILES)

    current_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(current_dir, 'templates',
                           'template.html')) as template_file:
        html_template = string.Template(template_file.read())

    for filepath in sorted(files):
        if filepath.endswith('.md'):
            # Convert files that are markdown files
            base_name_part = os.path.splitext(os.path.basename(filepath))[0]
            try:
                md = read_file(filepath)
            except Exception as e:
                self.log.error(
                    f"Error reading {base_name_part+'.md'}: {e}")
                continue
            html = markdown.markdown(md)
            html = html_template.safe_substitute(
                title=self.repo_subject.replace('_', ' '), content=html)
            html_filename = base_name_part + '.html'
            output_filepath = os.path.join(self.output_dir, html_filename)
            write_file(output_filepath, html)
            self.log.info(
                f"Converted {os.path.basename(filepath)} to {os.path.basename(html_filename)}."
            )
        else:
            # Directly copy over files that are not markdown files.
            # Copying is best-effort: a file that cannot be copied is
            # skipped, but only filesystem errors are swallowed so that
            # KeyboardInterrupt / SystemExit and real bugs still surface.
            try:
                output_filepath = os.path.join(self.output_dir,
                                               os.path.basename(filepath))
                if not os.path.exists(output_filepath):
                    copyfile(filepath, output_filepath)
            except OSError:
                pass

    self.log.info("Finished processing OBS markdown files.")
def mock_s3_bible_project(self, test_file_name, project_key, multi_part=False):
    """
    Unzip a converted test project and upload its files to the mock buckets.

    The archive <resources_dir>/converted_projects/<test_file_name> is
    unzipped into <temp_dir>/<archive name without '.zip'>. Each extracted
    file is uploaded through AppSettings.cdn_s3_handler() under
    '<project_key>/<path relative to the project folder>'.

    When multi_part is true, each file is also uploaded through
    AppSettings.door43_s3_handler() under '<project_key>/<base name>'; files
    whose base name contains '.html' first get a
    <div id="right-sidebar"> appended to their body and are rewritten on
    disk as ASCII with XML character references.

    Finally templates/project-page.html is uploaded through the Door43
    handler. self.project_files and self.project_key are set as a side
    effect.
    """
    archive_stem = test_file_name.split('.zip')[0]
    archive_path = os.path.join(self.resources_dir, 'converted_projects',
                                test_file_name)
    extract_dir = os.path.join(self.temp_dir, archive_stem)
    unzip(archive_path, extract_dir)

    project_prefix = os.path.join(extract_dir, archive_stem) + os.path.sep
    self.project_files = file_utils.get_files(extract_dir)
    self.project_key = project_key

    for local_path in self.project_files:
        # Bucket keys always use forward slashes, whatever the local OS uses.
        relative_part = local_path.split(project_prefix)[1]
        bucket_sub_path = relative_part.replace(os.path.sep, '/')
        AppSettings.cdn_s3_handler().upload_file(
            local_path, '{0}/{1}'.format(project_key, bucket_sub_path))

        if not multi_part:
            continue

        # Multi-part projects are additionally copied to the door43 bucket.
        leaf_name = os.path.basename(local_path)
        if '.html' in leaf_name:
            with open(local_path, 'r') as html_handle:
                page = BeautifulSoup(html_handle, 'html.parser')
            # Add the nav container to the page before uploading it.
            sidebar = page.new_tag('div', id='right-sidebar')
            page.body.append(sidebar)
            rendered = str(page)
            file_utils.write_file(
                local_path, rendered.encode('ascii', 'xmlcharrefreplace'))
        AppSettings.door43_s3_handler().upload_file(
            local_path, '{0}/{1}'.format(project_key, leaf_name))

    template_source = os.path.join(self.resources_dir, 'templates',
                                   'project-page.html')
    AppSettings.door43_s3_handler().upload_file(
        template_source, 'templates/project-page.html')
def convert(self) -> bool:
    """
    Main function to convert info in TSV files into HTML files.

    Steps:

    1. Collect every non-excluded file under self.files_dir.
    2. Reset self.manifest_dict to None and hand the first path containing
       'manifest.yaml' to self.process_manifest().
    3. For each file, in sorted order:
       * '.tsv' files are converted with self.buildSingleHtml(); the body of
         the result replaces the contents of the template's
         <div id="content"> and the page is written to self.output_dir as
         '<base name>.html'. If the converted HTML has no body, an error
         placeholder is written instead and the book is counted as failed.
       * any other file is copied to self.output_dir unless a file of that
         name already exists there.

    Returns True in all cases; an error is logged if every book failed.
    """
    AppSettings.logger.debug(
        "Tsv2HtmlConverter processing the TSV files …")

    # Gather every non-excluded file under files_dir.
    filepaths = get_files(directory=self.files_dir,
                          exclude=self.EXCLUDED_FILES)
    # Placeholder for an "exclusive convert" list; while it is empty every
    # TSV file is converted.
    convert_only_list = []

    # Process the manifest file
    self.manifest_dict = None
    for source_filepath in filepaths:
        if 'manifest.yaml' in source_filepath:
            self.process_manifest(source_filepath)
            break

    current_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(current_dir, 'templates',
                           'template.html')) as template_file:
        # Simple HTML template which includes $title and $content fields
        template_html = template_file.read()

    # Convert tsv files and copy across other files
    num_successful_books = num_failed_books = 0
    for source_filepath in sorted(filepaths):
        base_name = os.path.basename(source_filepath)

        if not source_filepath.endswith('.tsv'):
            # Directly copy over files that are not TSV files.
            # Copying is best-effort, but only filesystem errors are
            # swallowed so KeyboardInterrupt / SystemExit still propagate.
            try:
                output_filepath = os.path.join(self.output_dir, base_name)
                if not os.path.exists(output_filepath):
                    copyfile(source_filepath, output_filepath)
            except OSError:
                pass
            continue

        if convert_only_list and base_name not in convert_only_list:
            continue  # not one of the files we were asked to convert

        # Convert the TSV file
        self.log.info(
            f"Tsv2HtmlConverter converting TSV file: {base_name} …"
        )  # Logger also issues DEBUG msg
        filebase = os.path.splitext(base_name)[0]

        # Do the actual TSV -> HTML conversion
        converted_html = self.buildSingleHtml(source_filepath)

        # Drop the converted body into a fresh copy of the page template.
        template_soup = BeautifulSoup(template_html, 'html.parser')
        template_soup.head.title.string = self.repo_subject
        converted_soup = BeautifulSoup(converted_html, 'html.parser')
        content_div = template_soup.find('div', id='content')
        content_div.clear()
        if converted_soup and converted_soup.body:
            content_div.append(converted_soup.body)
            # Keep the body's children but not the <body> tag itself.
            content_div.body.unwrap()
            num_successful_books += 1
        else:
            content_div.append('ERROR! NOT CONVERTED!')
            self.log.warning(
                f"TSV parsing or conversion error for {base_name}")
            if not converted_soup:
                AppSettings.logger.debug("No converted_soup")
            elif not converted_soup.body:
                AppSettings.logger.debug("No converted_soup.body")
            num_failed_books += 1

        html_filename = filebase + '.html'
        output_filepath = os.path.join(self.output_dir, html_filename)
        write_file(output_filepath, str(template_soup))
        self.log.info(f"Converted {base_name} to {html_filename}.")

    if num_failed_books and not num_successful_books:
        self.log.error("Conversion of all books failed!")
    self.log.info("Finished processing TSV files.")
    return True
def convert(self):
    """
    Convert the Bible USFM files in self.files_dir into HTML files.

    For each non-excluded file, in sorted order:

    * '.usfm' files are copied alone into a fresh scratch directory and
      converted there with UsfmTransform.buildSingleHtml(); any returned
      warnings are logged. The body of the generated HTML replaces the
      contents of the template's <div id="content"> and the page is written
      to self.output_dir as '<base name>.html'. If the generated HTML has no
      body, an error placeholder is written and the book counts as failed.
    * any other file is copied to self.output_dir unless a file of that name
      already exists there.

    The scratch directory is removed afterwards, including when conversion
    raises. It is kept only when the final page is less than 67% of the
    size of the converted HTML and both `prefix` and `debug_mode_flag` are
    truthy.

    Returns True in all cases; an error is logged if every book failed.
    """
    AppSettings.logger.debug("Processing the Bible USFM files …")

    # Gather every non-excluded file under files_dir.
    files = get_files(directory=self.files_dir, exclude=self.EXCLUDED_FILES)
    # Placeholder for an "exclusive convert" list; while it is empty every
    # USFM file is converted.
    convert_only_list = []

    current_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(current_dir, 'templates',
                           'template.html')) as template_file:
        # Simple HTML template which includes $title and $content fields
        template_html = template_file.read()

    # Convert usfm files and copy across other files
    num_successful_books = num_failed_books = 0
    for filename in sorted(files):
        base_name = os.path.basename(filename)

        if not filename.endswith('.usfm'):
            # Directly copy over files that are not USFM files.
            # Copying is best-effort, but only filesystem errors are
            # swallowed so KeyboardInterrupt / SystemExit still propagate.
            try:
                output_filepath = os.path.join(self.output_dir, base_name)
                if not os.path.exists(output_filepath):
                    copyfile(filename, output_filepath)
            except OSError:
                pass
            continue

        if convert_only_list and base_name not in convert_only_list:
            continue  # not one of the files we were asked to convert

        # Convert the USFM file
        self.log.info(f"Converting Bible USFM file: {base_name} …"
                      )  # Logger also issues DEBUG msg

        # Copy just the single file to be converted into a single scratch folder
        scratch_dir = tempfile.mkdtemp(prefix='tX_convert_usfm_scratch_')
        delete_scratch_dir_flag = True  # Set to False for debugging this code
        try:
            copyfile(filename, os.path.join(scratch_dir, base_name))
            filebase = os.path.splitext(base_name)[0]

            # Do the actual USFM -> HTML conversion
            warning_list = UsfmTransform.buildSingleHtml(
                scratch_dir, scratch_dir, filebase)
            if warning_list:
                for warning_msg in warning_list:
                    self.log.warning(f"{filebase} - {warning_msg}")

            # Read back the HTML file that the conversion left in scratch_dir.
            html_filename = filebase + '.html'
            with open(os.path.join(scratch_dir, html_filename),
                      'rt',
                      encoding='utf-8') as html_file:
                converted_html = html_file.read()
            converted_html_length = len(converted_html)
            if '</p></p></p>' in converted_html:
                AppSettings.logger.debug(
                    f"Usfm2HtmlConverter got multiple consecutive paragraph closures in converted {html_filename}"
                )

            # Drop the converted body into a fresh copy of the page template.
            template_soup = BeautifulSoup(template_html, 'html.parser')
            template_soup.head.title.string = self.repo_subject
            converted_soup = BeautifulSoup(converted_html, 'html.parser')
            content_div = template_soup.find('div', id='content')
            content_div.clear()
            if converted_soup and converted_soup.body:
                content_div.append(converted_soup.body)
                # Keep the body's children but not the <body> tag itself.
                content_div.body.unwrap()
                num_successful_books += 1
            else:
                content_div.append("ERROR! NOT CONVERTED!")
                self.log.warning(
                    f"USFM parsing or conversion error for {base_name}")
                AppSettings.logger.debug(
                    f"Got converted html: {converted_html[:600]}{' …' if len(converted_html)>600 else ''}"
                )
                if not converted_soup:
                    AppSettings.logger.debug("No converted_soup")
                elif not converted_soup.body:
                    AppSettings.logger.debug("No converted_soup.body")
                num_failed_books += 1

            output_filepath = os.path.join(self.output_dir, html_filename)
            template_soup_string = str(template_soup)
            write_file(output_filepath, template_soup_string)
            template_soup_string_length = len(template_soup_string)
            if '</p></p></p>' in template_soup_string:
                AppSettings.logger.warning(
                    f"Usfm2HtmlConverter got multiple consecutive paragraph closures in {html_filename}"
                )

            # A final page much smaller than the converted HTML suggests
            # that content was lost while re-parsing it.
            # (converted_html_length is necessarily > 0 inside this branch,
            # so the percentage below cannot divide by zero.)
            if template_soup_string_length < converted_html_length * 0.67:
                AppSettings.logger.debug(
                    f"### Usfm2HtmlConverter wrote souped-up html of length {template_soup_string_length:,} from {converted_html_length:,} = {template_soup_string_length*100.0/converted_html_length}%"
                )
                self.log.warning(
                    f"Usfm2HtmlConverter possibly lost converted html for {html_filename}"
                )
                AppSettings.logger.info(
                    f"Usfm2HtmlConverter {html_filename} was {converted_html_length:,} now {template_soup_string_length:,}"
                )
                # Keep a copy of the final page next to the raw conversion
                # so the two can be compared when debugging.
                write_file(
                    os.path.join(scratch_dir, filebase + '.converted.html'),
                    template_soup_string)
                if prefix and debug_mode_flag:
                    delete_scratch_dir_flag = False
        finally:
            # Clean up even when conversion raised, so failed runs do not
            # leave scratch directories behind.
            if delete_scratch_dir_flag:
                remove_tree(scratch_dir)

    if num_failed_books and not num_successful_books:
        self.log.error("Conversion of all books failed!")
    self.log.info(
        f"Finished processing {num_successful_books} Bible USFM files.")
    return True
def convert_markdown(self) -> None:
    """
    Convert the generic markdown files in self.files_dir to HTML.

    For each non-excluded file, in sorted order:

    * '.md' files are rendered to HTML -- with markdown2 (extras
      'markdown-in-html' and 'tables') when self.repo_subject is
      'Translation_Academy', otherwise with markdown.markdown(). The result
      is wrapped in templates/template.html, passed through
      fix_naked_urls(), has header anchors such as
      <h1><a id="verbs"/>Verbs</h1> rewritten to
      <h1 id="verbs" class="section-header">Verbs</h1>, and is written to
      self.output_dir as '<base name>.html'. A file that cannot be read is
      logged as an error and skipped.
    * any other file is copied to self.output_dir unless a file of that name
      already exists there.

    When both `prefix` and `debug_mode_flag` are truthy, intermediate HTML
    is also written to self.debug_dir as '<base name>.1.html' (Translation
    Academy only), '.2.html' and '.3.html'.
    """
    self.log.info("Converting Markdown files…")

    # Gather every non-excluded file under files_dir.
    files = get_files(directory=self.files_dir, exclude=self.EXCLUDED_FILES)
    # Placeholder for an "exclusive convert" list; while it is empty every
    # markdown file is converted.
    convert_only_list = []

    current_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(current_dir, 'templates',
                           'template.html')) as template_file:
        # Just a very simple template with $title and $content place-holders
        html_template = string.Template(template_file.read())

    header_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    for filepath in sorted(files):
        if not filepath.endswith('.md'):
            # Directly copy over files that are not markdown files.
            # Copying is best-effort, but only filesystem errors are
            # swallowed so KeyboardInterrupt / SystemExit still propagate.
            try:
                output_file = os.path.join(self.output_dir,
                                           os.path.basename(filepath))
                if not os.path.exists(output_file):
                    copyfile(filepath, output_file)
            except OSError:
                pass
            continue

        base_name_part = os.path.splitext(os.path.basename(filepath))[0]
        filename = base_name_part + '.md'
        if convert_only_list and filename not in convert_only_list:
            continue  # not one of the files we were asked to convert

        html_filename = base_name_part + '.html'
        AppSettings.logger.debug(
            f"Converting '{filename}' to '{html_filename}' …")

        # Convert files that are markdown files
        try:
            md = read_file(filepath)
        except Exception as e:
            self.log.error(f"Error reading {filename}: {e}")
            continue

        if self.repo_subject == 'Translation_Academy':
            html = markdown2.markdown(
                md, extras=['markdown-in-html', 'tables'])
            if prefix and debug_mode_flag:
                write_file(
                    os.path.join(self.debug_dir,
                                 base_name_part + '.1.html'), html)
        else:
            html = markdown.markdown(md)

        if len(html) < len(md):  # Seems created html was too short
            AppSettings.logger.error(
                f"Converter error: {filename} HTML ended up smaller than the original markdown!"
            )
            self.log.info(f"Markdown was {md[:20]} …… {md[-200:]}")
            self.log.info(f"HTML is {html[:20]} ' …… ', {html[-200:]}")

        html = html_template.safe_substitute(
            title=self.repo_subject.replace('_', ' '), content=html)
        if prefix and debug_mode_flag:
            write_file(
                os.path.join(self.debug_dir, base_name_part + '.2.html'),
                html)

        html = fix_naked_urls(html)
        if prefix and debug_mode_flag:
            write_file(
                os.path.join(self.debug_dir, base_name_part + '.3.html'),
                html)

        # Change headers like <h1><a id="verbs"/>Verbs</h1> to <h1 id="verbs">Verbs</h1>
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup.find_all('a', {'id': True}):
            if tag.parent and tag.parent.name in header_tags:
                tag.parent['id'] = tag['id']
                tag.parent['class'] = tag.parent.get(
                    'class', []) + ['section-header']
                tag.extract()
        html = str(soup)

        # Write the file
        output_file = os.path.join(self.output_dir, html_filename)
        write_file(output_file, html)
        self.log.info(f"Converted {filename} to {html_filename}.")

    self.log.info("Finished processing generic markdown files.")