Esempio n. 1
0
    def test_get_files(self):
        """
        Exercise file_utils.get_files() on a small temporary tree.

        The tree holds two files at the top level plus a 'subdir' directory
        that contains one '.md' file. The listing is checked with absolute
        paths, with an extension filter, and with relative paths.
        """
        # Stored on self (presumably so tearDown can remove it — confirm).
        self.tmp_dir = tempfile.mkdtemp(prefix='Door43_test_file_utils_')
        tmp_subdir = os.path.join(self.tmp_dir, 'subdir')
        os.mkdir(tmp_subdir)

        created_files = []
        for parent_dir, suffix in ((self.tmp_dir, ''), (self.tmp_dir, ''),
                                   (tmp_subdir, '.md')):
            # mkstemp() returns an OPEN descriptor along with the path; close
            # it straight away so the test does not leak file handles.
            handle, file_path = tempfile.mkstemp(dir=parent_dir, suffix=suffix)
            os.close(handle)
            created_files.append(file_path)
        expected_paths = created_files + [tmp_subdir]

        # Absolute paths, directories included: 3 files + 1 directory.
        files = file_utils.get_files(self.tmp_dir,
                                     relative_paths=False,
                                     include_directories=True)
        self.assertEqual(len(files), 4)
        for expected in expected_paths:
            self.assertTrue(any(self.paths_equal(expected, d) for d in files))

        # Only the single '.md' file inside 'subdir' matches the filter.
        files = file_utils.get_files(self.tmp_dir, extensions=['.md'])
        self.assertEqual(len(files), 1)

        # Same tree again, this time expecting paths relative to tmp_dir.
        files = file_utils.get_files(self.tmp_dir,
                                     relative_paths=True,
                                     include_directories=True)
        self.assertEqual(len(files), 4)
        for expected in expected_paths:
            relative = os.path.relpath(expected, self.tmp_dir)
            self.assertTrue(any(self.paths_equal(relative, d) for d in files))
Esempio n. 2
0
    def get_files(self, relative_paths: bool) -> List[str]:
        """
        Return the sorted list of .md files to be processed.

        relative_paths (True or False) is passed straight through to the
        module-level get_files() helper.

        When self.single_dir is set, only that sub-directory of
        self.source_dir is searched and every result is joined onto
        self.single_dir; otherwise the whole of self.source_dir is searched.
        """
        search_root = self.source_dir
        if self.single_dir:
            search_root = os.path.join(self.source_dir, self.single_dir)

        found = get_files(directory=search_root,
                          relative_paths=relative_paths,
                          exclude=self.EXCLUDED_FILES,
                          extensions=['.md'])
        ordered = sorted(found)

        if self.single_dir:
            # Sorting happens before the prefix is applied, as before.
            return [os.path.join(self.single_dir, name) for name in ordered]
        return ordered
Esempio n. 3
0
    def test_get_files(self):
        """
        Exercise file_utils.get_files() on a small temporary tree.

        The tree holds two files at the top level plus a 'subdir' directory
        that contains one file. The listing is checked once with absolute
        paths and once with paths relative to the temporary directory.
        """
        import shutil  # local import: only needed for the cleanup below

        tmp_dir = tempfile.mkdtemp()
        # Remove the tree when the test ends, whether it passes or fails.
        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)

        tmp_subdir = os.path.join(tmp_dir, 'subdir')
        os.mkdir(tmp_subdir)

        created_files = []
        for parent_dir in (tmp_dir, tmp_dir, tmp_subdir):
            # mkstemp() returns an OPEN descriptor along with the path; close
            # it straight away so the test does not leak file handles.
            handle, file_path = tempfile.mkstemp(dir=parent_dir)
            os.close(handle)
            created_files.append(file_path)
        expected_paths = created_files + [tmp_subdir]

        # Absolute paths, directories included: 3 files + 1 directory.
        files = file_utils.get_files(tmp_dir,
                                     relative_paths=False,
                                     include_directories=True)
        self.assertEqual(len(files), 4)
        for expected in expected_paths:
            self.assertTrue(any(self.paths_equal(expected, d) for d in files))

        # Same tree again, this time expecting paths relative to tmp_dir.
        files = file_utils.get_files(tmp_dir,
                                     relative_paths=True,
                                     include_directories=True)
        self.assertEqual(len(files), 4)
        for expected in expected_paths:
            relative = os.path.relpath(expected, tmp_dir)
            self.assertTrue(any(self.paths_equal(relative, d) for d in files))
Esempio n. 4
0
    def update_from_files(self, path):
        """
        Fill in missing format/resource/generator details from files on disk.

        If self.format is empty, the files returned by get_files(path) are
        scanned and the format is set to 'usfm', 'markdown' or 'html' (in that
        order of priority). Finding USFM also defaults an empty resource id
        and name to 'bible' / 'Bible'.

        If self.generator['name'] is empty, it is set to 'ts' when path has a
        direct sub-directory whose name is a non-zero integer and which
        contains at least one '.txt' file.

        Args:
            path: directory to inspect; trailing '/' characters are ignored.
        """
        path = path.rstrip('/')

        if not self.format:
            found_usfm = found_markdown = found_html = False
            for f in get_files(path):
                # NOTE(review): 'usfm' is matched without a leading dot, so
                # any name ending in those letters counts — confirm intended.
                if f.endswith('usfm'):
                    found_usfm = True
                elif f.endswith('.md'):
                    found_markdown = True
                elif f.endswith('.html'):
                    found_html = True

            if found_usfm:
                self.format = 'usfm'
                if not self.resource['id']:
                    self.resource['id'] = 'bible'
                    self.resource['name'] = 'Bible'
            elif found_markdown:
                self.format = 'markdown'
            elif found_html:
                self.format = 'html'

        if not self.generator['name']:
            for subdir in glob(os.path.join(path, '*')):
                if not os.path.isdir(subdir):
                    continue
                # basename() instead of slicing by len(path): slicing drops a
                # character when path is empty after the rstrip above.
                dir_name = os.path.basename(subdir)
                try:
                    is_numbered = bool(int(dir_name))
                except ValueError:
                    # Not a numeric directory name.
                    continue
                # A directory named e.g. '0' or '00' evaluates to zero and is
                # deliberately not treated as numbered (unchanged behaviour).
                if is_numbered and glob(os.path.join(subdir, '*.txt')):
                    self.generator['name'] = 'ts'
                    break
    def convert_obs(self) -> None:
        """
        Convert the OBS markdown files to HTML and copy everything else.

        Each '.md' file returned by get_files() for self.files_dir is
        rendered with markdown.markdown(), wrapped in the bundled
        'templates/template.html' and written to self.output_dir as
        '<name>.html'. Any other file is copied into self.output_dir under
        its base name unless a file of that name is already there.
        """
        self.log.info("Converting OBS markdown files…")

        files = get_files(directory=self.files_dir,
                          exclude=self.EXCLUDED_FILES)

        # The template lives next to this source file.
        current_dir = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(current_dir, 'templates',
                               'template.html')) as template_file:
            html_template = string.Template(template_file.read())

        for filepath in sorted(files):
            base_name = os.path.basename(filepath)

            if not filepath.endswith('.md'):
                # Directly copy over files that are not markdown files.
                output_filepath = os.path.join(self.output_dir, base_name)
                try:
                    if not os.path.exists(output_filepath):
                        copyfile(filepath, output_filepath)
                except OSError:
                    # Copying is best-effort. Only file-system errors are
                    # ignored, so KeyboardInterrupt/SystemExit still
                    # propagate (the old bare 'except' swallowed them).
                    pass
                continue

            # Convert files that are markdown files.
            base_name_part = os.path.splitext(base_name)[0]
            try:
                md = read_file(filepath)
            except Exception as e:
                self.log.error(f"Error reading {base_name_part}.md: {e}")
                continue

            html = markdown.markdown(md)
            html = html_template.safe_substitute(
                title=self.repo_subject.replace('_', ' '), content=html)
            html_filename = base_name_part + '.html'
            output_filepath = os.path.join(self.output_dir, html_filename)
            write_file(output_filepath, html)
            self.log.info(f"Converted {base_name} to {html_filename}.")

        self.log.info("Finished processing OBS markdown files.")
    def mock_s3_bible_project(self,
                              test_file_name,
                              project_key,
                              multi_part=False):
        """
        Unzip a converted test project and upload its files via the handlers.

        The archive 'converted_projects/<test_file_name>' under
        self.resources_dir is unzipped into self.temp_dir. Every extracted
        file is uploaded through AppSettings.cdn_s3_handler() under
        '<project_key>/<path relative to the project folder>'.

        When multi_part is true, each file is also uploaded through
        AppSettings.door43_s3_handler() under '<project_key>/<base name>';
        files whose base name contains '.html' first get a
        <div id="right-sidebar"> appended to their body and are rewritten.

        Finally the project-page template is uploaded through the door43
        handler. Sets self.project_files and self.project_key.
        """
        archive_dir = os.path.join(self.resources_dir, 'converted_projects')
        archive_stem = test_file_name.split('.zip')[0]
        archive_path = os.path.join(archive_dir, test_file_name)
        extract_dir = os.path.join(self.temp_dir, archive_stem)
        unzip(archive_path, extract_dir)

        # Files sit inside a folder named after the archive; this prefix is
        # stripped to obtain the path used as the key suffix.
        project_prefix = os.path.join(extract_dir, archive_stem) + os.path.sep
        self.project_files = file_utils.get_files(extract_dir)
        self.project_key = project_key

        for local_path in self.project_files:
            # Make sure it is a bucket path (forward slashes only).
            relative_part = local_path.split(project_prefix)[1]
            bucket_sub_path = relative_part.replace(os.path.sep, '/')
            AppSettings.cdn_s3_handler().upload_file(
                local_path, f'{project_key}/{bucket_sub_path}')

            if not multi_part:
                continue

            # Copy files from cdn to door43.
            leaf_name = os.path.basename(local_path)
            if '.html' in leaf_name:
                with open(local_path, 'r') as f:
                    soup = BeautifulSoup(f, 'html.parser')

                # Append the right-sidebar div to the document body.
                sidebar = soup.new_tag('div', id='right-sidebar')
                soup.body.append(sidebar)
                encoded = str(soup).encode('ascii', 'xmlcharrefreplace')
                file_utils.write_file(local_path, encoded)

            AppSettings.door43_s3_handler().upload_file(
                local_path, f'{project_key}/{leaf_name}')

        template_path = os.path.join(self.resources_dir, 'templates',
                                     'project-page.html')
        AppSettings.door43_s3_handler().upload_file(
            template_path, 'templates/project-page.html')
Esempio n. 7
0
    def convert(self) -> bool:
        """
        Main function to convert info in TSV files into HTML files.

        The first path containing 'manifest.yaml' is handed to
        self.process_manifest(). Every '.tsv' file returned by get_files()
        for self.files_dir is then converted with self.buildSingleHtml(),
        placed inside the 'content' div of the bundled
        'templates/template.html' and written to self.output_dir as
        '<name>.html'. Any other file is copied into self.output_dir under
        its base name unless a file of that name is already there.

        Returns:
            Always True; conversion problems are reported through the logs.
        """
        AppSettings.logger.debug(
            "Tsv2HtmlConverter processing the TSV files …")

        filepaths = get_files(directory=self.files_dir,
                              exclude=self.EXCLUDED_FILES)
        # Exclusive-convert filtering is currently disabled: an empty list
        # means every TSV file is converted.
        convert_only_list = []

        # Process the manifest file
        self.manifest_dict = None
        for source_filepath in filepaths:
            if 'manifest.yaml' in source_filepath:
                self.process_manifest(source_filepath)
                break

        # The template lives next to this source file.
        current_dir = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(current_dir, 'templates',
                               'template.html')) as template_file:
            template_html = template_file.read()

        # Convert tsv files and copy across other files
        num_successful_books = num_failed_books = 0
        for source_filepath in sorted(filepaths):
            base_name = os.path.basename(source_filepath)

            if not source_filepath.endswith('.tsv'):
                # Directly copy over files that are not TSV files.
                output_filepath = os.path.join(self.output_dir, base_name)
                try:
                    if not os.path.exists(output_filepath):
                        copyfile(source_filepath, output_filepath)
                except OSError as e:
                    # Copying is best-effort. Only file-system errors are
                    # tolerated, so KeyboardInterrupt/SystemExit still
                    # propagate (the old bare 'except' swallowed them).
                    AppSettings.logger.debug(
                        f"Could not copy {base_name} to output folder: {e}")
                continue

            # See if this is a file we are to convert.
            if convert_only_list and base_name not in convert_only_list:
                continue

            # Convert the TSV file
            self.log.info(
                f"Tsv2HtmlConverter converting TSV file: {base_name} …"
            )  # Logger also issues DEBUG msg
            filebase = os.path.splitext(base_name)[0]
            converted_html = self.buildSingleHtml(source_filepath)

            # Put the converted body inside the template's content div.
            template_soup = BeautifulSoup(template_html, 'html.parser')
            template_soup.head.title.string = self.repo_subject
            converted_soup = BeautifulSoup(converted_html, 'html.parser')
            content_div = template_soup.find('div', id='content')
            content_div.clear()
            if converted_soup and converted_soup.body:
                content_div.append(converted_soup.body)
                # Keep the body's children but drop the <body> tag itself.
                content_div.body.unwrap()
                num_successful_books += 1
            else:
                content_div.append('ERROR! NOT CONVERTED!')
                self.log.warning(
                    f"TSV parsing or conversion error for {base_name}")
                if not converted_soup:
                    AppSettings.logger.debug("No converted_soup")
                elif not converted_soup.body:
                    AppSettings.logger.debug("No converted_soup.body")
                num_failed_books += 1

            html_filename = filebase + '.html'
            output_filepath = os.path.join(self.output_dir, html_filename)
            write_file(output_filepath, str(template_soup))
            self.log.info(f"Converted {base_name} to {html_filename}.")

        if num_failed_books and not num_successful_books:
            self.log.error("Conversion of all books failed!")
        self.log.info("Finished processing TSV files.")
        return True
    def convert(self):
        """
        Convert the Bible USFM files to HTML and copy everything else.

        Each '.usfm' file returned by get_files() for self.files_dir is
        copied alone into a fresh scratch directory, converted there with
        UsfmTransform.buildSingleHtml(), placed inside the 'content' div of
        the bundled 'templates/template.html' and written to self.output_dir
        as '<name>.html'. Any other file is copied into self.output_dir under
        its base name unless a file of that name is already there.

        Returns:
            Always True; conversion problems are reported through the logs.
        """
        AppSettings.logger.debug("Processing the Bible USFM files …")

        files = get_files(directory=self.files_dir,
                          exclude=self.EXCLUDED_FILES)
        # Exclusive-convert filtering is currently disabled: an empty list
        # means every USFM file is converted.
        convert_only_list = []

        # The template lives next to this source file.
        current_dir = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(current_dir, 'templates',
                               'template.html')) as template_file:
            template_html = template_file.read()

        # Convert usfm files and copy across other files
        num_successful_books = num_failed_books = 0
        for filename in sorted(files):
            base_name = os.path.basename(filename)

            if not filename.endswith('.usfm'):
                # Directly copy over files that are not USFM files.
                output_filepath = os.path.join(self.output_dir, base_name)
                try:
                    if not os.path.exists(output_filepath):
                        copyfile(filename, output_filepath)
                except OSError as e:
                    # Copying is best-effort. Only file-system errors are
                    # tolerated, so KeyboardInterrupt/SystemExit still
                    # propagate (the old bare 'except' swallowed them).
                    AppSettings.logger.debug(
                        f"Could not copy {base_name} to output folder: {e}")
                continue

            # See if this is a file we are to convert.
            if convert_only_list and base_name not in convert_only_list:
                continue

            # Convert the USFM file
            self.log.info(f"Converting Bible USFM file: {base_name} …"
                          )  # Logger also issues DEBUG msg
            # Copy just the single file to be converted into a scratch folder
            scratch_dir = tempfile.mkdtemp(prefix='tX_convert_usfm_scratch_')
            delete_scratch_dir_flag = True  # Set to False for debugging this code
            try:
                copyfile(filename, os.path.join(scratch_dir, base_name))
                filebase = os.path.splitext(base_name)[0]
                # Do the actual USFM -> HTML conversion
                warning_list = UsfmTransform.buildSingleHtml(
                    scratch_dir, scratch_dir, filebase)
                if warning_list:
                    for warning_msg in warning_list:
                        self.log.warning(f"{filebase} - {warning_msg}")

                # Read back the HTML file written into the scratch folder.
                html_filename = filebase + '.html'
                with open(os.path.join(scratch_dir, html_filename),
                          'rt',
                          encoding='utf-8') as html_file:
                    converted_html = html_file.read()
                converted_html_length = len(converted_html)
                if '</p></p></p>' in converted_html:
                    AppSettings.logger.debug(
                        f"Usfm2HtmlConverter got multiple consecutive paragraph closures in converted {html_filename}"
                    )

                # Put the converted body inside the template's content div.
                template_soup = BeautifulSoup(template_html, 'html.parser')
                template_soup.head.title.string = self.repo_subject
                converted_soup = BeautifulSoup(converted_html, 'html.parser')
                content_div = template_soup.find('div', id='content')
                content_div.clear()
                if converted_soup and converted_soup.body:
                    content_div.append(converted_soup.body)
                    # Keep the body's children but drop the <body> tag.
                    content_div.body.unwrap()
                    num_successful_books += 1
                else:
                    content_div.append("ERROR! NOT CONVERTED!")
                    self.log.warning(
                        f"USFM parsing or conversion error for {base_name}")
                    AppSettings.logger.debug(
                        f"Got converted html: {converted_html[:600]}{' …' if len(converted_html)>600 else ''}"
                    )
                    if not converted_soup:
                        AppSettings.logger.debug("No converted_soup")
                    elif not converted_soup.body:
                        AppSettings.logger.debug("No converted_soup.body")
                    num_failed_books += 1

                output_filepath = os.path.join(self.output_dir, html_filename)
                template_soup_string = str(template_soup)
                write_file(output_filepath, template_soup_string)
                template_soup_string_length = len(template_soup_string)
                if '</p></p></p>' in template_soup_string:
                    AppSettings.logger.warning(
                        f"Usfm2HtmlConverter got multiple consecutive paragraph closures in {html_filename}"
                    )
                # Sanity check: flag output much shorter than the raw
                # conversion (under 67% of its length).
                if template_soup_string_length < converted_html_length * 0.67:
                    AppSettings.logger.debug(
                        f"### Usfm2HtmlConverter wrote souped-up html of length {template_soup_string_length:,} from {converted_html_length:,} = {template_soup_string_length*100.0/converted_html_length}%"
                    )
                    self.log.warning(
                        f"Usfm2HtmlConverter possibly lost converted html for {html_filename}"
                    )
                    AppSettings.logger.info(
                        f"Usfm2HtmlConverter {html_filename} was {converted_html_length:,} now {template_soup_string_length:,}"
                    )
                    write_file(
                        os.path.join(scratch_dir,
                                     filebase + '.converted.html'),
                        template_soup_string)
                    if prefix and debug_mode_flag:
                        # Keep the scratch folder for inspection.
                        delete_scratch_dir_flag = False
            finally:
                # Clean up even when the conversion raised, so failed runs
                # do not leave scratch folders behind.
                if delete_scratch_dir_flag:
                    remove_tree(scratch_dir)

        if num_failed_books and not num_successful_books:
            self.log.error("Conversion of all books failed!")
        self.log.info(
            f"Finished processing {num_successful_books} Bible USFM files.")
        return True
    def convert_markdown(self) -> None:
        """
        Convert generic markdown files to HTML and copy everything else.

        Each '.md' file returned by get_files() for self.files_dir is
        rendered to HTML (markdown2 with extras for the 'Translation_Academy'
        subject, markdown.markdown() otherwise), substituted into the bundled
        'templates/template.html', passed through fix_naked_urls(), has
        heading anchors folded into the headings, and is written to
        self.output_dir as '<name>.html'. Any other file is copied into
        self.output_dir under its base name unless a file of that name is
        already there.

        When both 'prefix' and 'debug_mode_flag' are truthy, intermediate
        HTML stages are also written into self.debug_dir.
        """
        self.log.info("Converting Markdown files…")

        files = get_files(directory=self.files_dir,
                          exclude=self.EXCLUDED_FILES)
        # Exclusive-convert filtering is currently disabled: an empty list
        # means every markdown file is converted.
        convert_only_list = []

        # The template lives next to this source file.
        current_dir = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(current_dir, 'templates',
                               'template.html')) as template_file:
            # Just a very simple template with $title and $content place-holders
            html_template = string.Template(template_file.read())

        for filepath in sorted(files):
            if not filepath.endswith('.md'):
                # Directly copy over files that are not markdown files.
                copy_name = os.path.basename(filepath)
                output_file = os.path.join(self.output_dir, copy_name)
                try:
                    if not os.path.exists(output_file):
                        copyfile(filepath, output_file)
                except OSError as e:
                    # Copying is best-effort. Only file-system errors are
                    # tolerated, so KeyboardInterrupt/SystemExit still
                    # propagate (the old bare 'except' swallowed them).
                    AppSettings.logger.debug(
                        f"Could not copy {copy_name} to output folder: {e}")
                continue

            base_name_part = os.path.splitext(os.path.basename(filepath))[0]
            filename = base_name_part + '.md'
            # See if this is a file we are to convert.
            if convert_only_list and filename not in convert_only_list:
                continue
            html_filename = base_name_part + '.html'
            AppSettings.logger.debug(
                f"Converting '{filename}' to '{html_filename}' …")

            # Convert files that are markdown files
            try:
                md = read_file(filepath)
            except Exception as e:
                self.log.error(f"Error reading {filename}: {e}")
                continue

            if self.repo_subject in [
                    'Translation_Academy',
            ]:
                html = markdown2.markdown(
                    md, extras=['markdown-in-html', 'tables'])
                if prefix and debug_mode_flag:
                    write_file(
                        os.path.join(self.debug_dir,
                                     base_name_part + '.1.html'), html)
            else:
                html = markdown.markdown(md)

            if len(html) < len(md):  # Seems created html was too short
                AppSettings.logger.error(
                    f"Converter error: {filename} HTML ended up smaller than the original markdown!"
                )
                self.log.info(f"Markdown was {md[:20]} …… {md[-200:]}")
                # Same layout as the line above; the old message carried
                # stray quote and comma characters.
                self.log.info(f"HTML is {html[:20]} …… {html[-200:]}")

            html = html_template.safe_substitute(
                title=self.repo_subject.replace('_', ' '), content=html)
            if prefix and debug_mode_flag:
                write_file(
                    os.path.join(self.debug_dir, base_name_part + '.2.html'),
                    html)

            html = fix_naked_urls(html)
            if prefix and debug_mode_flag:
                write_file(
                    os.path.join(self.debug_dir, base_name_part + '.3.html'),
                    html)

            # Change headers like <h1><a id="verbs"/>Verbs</h1> to <h1 id="verbs">Verbs</h1>
            soup = BeautifulSoup(html, 'html.parser')
            for tag in soup.find_all('a', {'id': True}):
                if tag.parent and tag.parent.name in [
                        'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
                ]:
                    tag.parent['id'] = tag['id']
                    tag.parent['class'] = tag.parent.get(
                        'class', []) + ['section-header']
                    tag.extract()
            html = str(soup)

            # Write the file
            output_file = os.path.join(self.output_dir, html_filename)
            write_file(output_file, html)
            self.log.info(f"Converted {filename} to {html_filename}.")

        self.log.info("Finished processing generic markdown files.")