Example #1
0
def test_allow_unclosed_non_curly_braces():
    """Check that delimiters other than curly braces may be left unclosed.

    Such delimiters only cause parse errors while the arguments of a
    command are being parsed; everywhere else they are accepted as-is.
    """
    # Two stray delimiters are reported as two pieces of content.
    stray = TexSoup("[)")
    assert len(list(stray.contents)) == 2

    document = TexSoup(r"""
    \documentclass{article}
        \usepackage[utf8]{inputenc}
    \begin{document}
        \textbf{[}
    \end{document}
    """)
    assert document.textbf.string == '['

    bracketed = list(TexSoup("[regular text]").contents)
    assert isinstance(bracketed[0], str)

    trailing = list(TexSoup("{regular text}[").contents)
    assert isinstance(trailing[1], str)
Example #2
0
def test_unclosed_math_environments():
    """Check that a math environment that is never closed raises EOFError."""
    unterminated = (
        r"""$$\min_x \|Xw-y\|_2^2""",
        r"""$\min_x \|Xw-y\|_2^2""",
    )
    for source in unterminated:
        with pytest.raises(EOFError):
            TexSoup(source)
Example #3
0
 def get_sglinks(cls, textext, vertex):
     """Parse ``textext`` and replace each ``sglink`` node by a unique id.

     Returns a 3-tuple ``(soup, links, error)``:

     * ``soup`` is the parsed tree, or ``None`` when parsing failed;
     * ``links`` holds the ``SGlink`` objects accepted so far;
     * ``error`` is a message for the first failure, or ``b''`` when every
       link was processed.
     """
     try:
         soup = TexSoup(textext)
     except Exception as exc:
         return None, [], 'Could not parse LaTeX code. {}'.format(exc)

     links = []
     counters = {}  # edge id -> last index handed out for that edge
     for node in soup.find_all(name='sglink'):
         link = SGlink(vertex, node)
         if not link.valid:
             return soup, links, link.errortext

         # The first link on an edge gets index 0, later ones count upwards.
         counters[link.edge_id] = counters.get(link.edge_id, -1) + 1
         link.set_id(counters[link.edge_id])
         if not link.valid:
             return soup, links, link.errortext

         # Skip ids that already occur literally in the user's text.
         while link.id in textext:
             counters[link.edge_id] += 1
             link.set_id(counters[link.edge_id])
             if not link.valid:
                 return soup, links, link.errortext

         node.replace_with(link.id)
         links.append(link)
     return soup, links, b''
Example #4
0
    def read(self, path):
        """Collect the PDF figures declared in a .tex file.

        Every ``figure`` environment is inspected. Its last
        ``includegraphics`` argument is resolved relative to the directory
        of the .tex file; non-PDF graphics are ignored. For the remaining
        figures the ``pyneascript``, ``pyneacommand`` and ``pyneadata``
        commands supply the script, the command and the whitespace-separated
        data files.

        :param path: Path of the .tex file to read.
        :return: A list of ``Figure`` objects, one per usable figure.
        """
        figures = []
        # Computed once from the .tex path. Reusing ``path`` for each figure
        # made every later figure resolve against the previous figure's
        # directory instead of the document's.
        tex_dir = os.path.dirname(path)

        with open(path, "r") as tex_file:
            tex = TexSoup(tex_file)

            # Find all figure environments and loop over them.
            for fig in list(tex.find_all("figure")):

                # Get the path to the figure.
                try:
                    graphic = fig.includegraphics.args[-1]
                except AttributeError:
                    # No includegraphics in this figure: nothing to track.
                    continue
                figure_path = os.path.abspath(os.path.join(tex_dir, graphic))

                # Only consider PDF figures.
                if figure_path.split(".")[-1] != "pdf":
                    continue

                # Parse the remaining Pynea parameters.
                try:
                    script = Script(os.path.abspath(fig.pyneascript.args[0]))
                    command = fig.pyneacommand.args[0]
                    data = [Data(os.path.abspath(d))
                            for d in fig.pyneadata.args[0].split()]

                    figures.append(Figure(figure_path, script, command, data))
                except AttributeError:
                    print("Warning: No script, command and/or data files specified for figure %s. Skipping..." % (figure_path))
                    continue

        return figures
Example #5
0
    def extract_math_envs(self):
        """
        Collect the math environments found in ``self.file``.

        ``equation`` environments come first, then ``align`` environments,
        then ``$`` math.
        :return: A list with the string form of every math environment.
        """
        def handle_special_chars(math_env):
            """
            Preprocess a math environment for FormulaSplitter() in
            latexformlaidentifiers.py, which has trouble detecting certain
            characters (e.g. 'S_i' becomes 'S_{i}').

            NOTE: this helper is not called anywhere in this method, and the
            replacement wraps every occurrence of the subscript character in
            the environment, not only the one after the underscore.
            :param math_env: The math environment that is currently being processed.
            :return: A tuple (original, preprocessed); both entries are the
                     original when no underscore-subscript is present.
            """
            if not re.search(r'_(\w|\d)', math_env):
                return (math_env, math_env)
            found_char = re.search(r'(?<=_)(\w|\d)', math_env).group()[0]
            math_env_new = math_env.replace(found_char,
                                            '{' + found_char + '}')
            return (math_env, math_env_new)

        tex_soup = TexSoup(self.file)
        math_envs = []
        for env_name in ('equation', 'align', '$'):
            math_envs.extend(tex_soup.find_all(env_name))
        return [str(env) for env in math_envs]
Example #6
0
class Latex(object):
    """Hold a LaTeX string as a TexSoup tree and clean it up via ``config``."""

    def __init__(self, latex_string):
        """Parse ``latex_string`` and keep the tree in ``self.soup``."""
        self.soup = TexSoup(latex_string)

    def _delete_commands(self):
        """Delete every node named in ``config.del_commands`` from the tree."""
        for name in config.del_commands:
            try:
                for node in self.soup.find_all(name):
                    node.delete()
            except ValueError:
                # Best effort: give up on this command, go on with the next.
                continue

    def _replace_commands(self):
        """Rewrite commands as described by ``config.replace_commands``.

        Each entry is indexed as ``(name, template)``. Every occurrence of
        the command's source text is replaced by the template, in which
        ``S_T_R`` stands for the command's ``string``. The tree is re-parsed
        after each entry.
        """
        for entry in config.replace_commands:
            print(entry[0])
            matches = self.soup.find_all(entry[0])
            self.content = str(self.soup)
            for match in matches:
                target = str(match)
                replacement = entry[1].replace("S_T_R", match.string)
                self.content = self.content.replace(target, replacement)
            self.soup = TexSoup(self.content)

    def process(self):
        """Run deletion and replacement, then trim inline-math contents.

        Returns the resulting source, which is also stored in
        ``self.content``.
        """
        self._delete_commands()
        self._replace_commands()
        self.content = str(self.soup)
        for inline in self.soup.find_all("$"):
            # Strip surrounding whitespace from the first descendant's text.
            first = str(list(inline.descendants)[0])
            self.content = self.content.replace(first, first.strip())

        return self.content
Example #7
0
def main():
    """Write the abstract of every LaTeX file under ``indir`` to a CSV.

    Each file is parsed with TexSoup, LaTeX markup is stripped from its
    ``abstract`` node with regular expressions, and the result is flattened
    to one line. Rows of ``(file, abstract)`` go to ``all_abstracts.csv``.
    Files that cannot be read or parsed, or that contain no abstract, are
    skipped.
    """
    file_abstract = []
    indir = '/Akamai_scratch/arxiv/outdir3'

    for root, _dirs, filenames in os.walk(indir):
        for f in filenames:
            try:
                # Parse inside the context manager so the handle is closed
                # even when parsing fails.
                with open(os.path.join(root, f)) as tex_file:
                    soup = TexSoup(tex_file)
                    # Test the node before converting it: str(None) is the
                    # string 'None', which is never None.
                    abstract_node = soup.find('abstract')
                    if abstract_node is None:
                        continue
                    abstract = str(abstract_node)

                # TODO FIXME: comment out once we want to generate math
                # delete Latex formatting
                abstract = re.sub(r'\\begin\{.*?}(\[.*?\])?({.*?})?', '',
                                  abstract)
                abstract = re.sub(r'\\end\{.*?}', '', abstract)
                # remove custom named latex commands while keeping the
                # stuff inside the braces
                abstract = re.sub(r'\\.*?{(.*?)}', r'\1', abstract)

                # make abstract one line before append
                abstract = abstract.replace('\n', ' ') \
                    .replace('\r', '') \
                    .replace('  ', ' ')  # strip double spaces

                file_abstract.append((f, abstract))
            except Exception:
                # Best effort over a large corpus: skip files that fail, but
                # let KeyboardInterrupt / SystemExit through.
                continue
    with open('all_abstracts.csv', 'w') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['file', 'abstract'])
        for row in file_abstract:
            csv_out.writerow(row)
Example #8
0
def main():
    """Count ``bibitem`` and ``cite`` commands per file and write a CSV.

    Every file under ``indir`` is parsed with TexSoup. The number of
    ``bibitem`` nodes plus the number of ``cite`` nodes is written next to
    the file name in ``all_references.csv``. Files that cannot be read or
    parsed are skipped.
    """
    file_counts = []
    indir = '/Akamai_scratch/arxiv/outdir3'
    for root, _dirs, filenames in os.walk(indir):
        for f in filenames:
            try:
                # Parse and count inside the context manager so the handle
                # is closed even when parsing fails.
                with open(os.path.join(root, f)) as tex_file:
                    soup = TexSoup(tex_file)
                    # count bibliography entries and citations
                    count = sum(1 for _ in soup.find_all('bibitem'))
                    count += sum(1 for _ in soup.find_all('cite'))
                file_counts.append((f, count))
            except Exception:
                # Best effort over a large corpus: skip files that fail, but
                # let KeyboardInterrupt / SystemExit through.
                continue
    with open('all_references.csv', 'w') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['file', 'reference_count'])
        for row in file_counts:
            csv_out.writerow(row)
Example #9
0
def test_commands_without_arguments_searchable():
    """Check that a command given no arguments is still found by name."""
    source = r"""\Question (10 points)
This is the question here.

\Question (6 points)"""
    questions = list(TexSoup(source).find_all('Question'))
    assert len(questions) == 2
Example #10
0
def test_math_environment_weirdness():
    """Check how inline math interacts with surrounding environments."""
    # Math that follows an environment must stay outside of it.
    after_env = TexSoup(r"""\begin{a} \end{a}$ b$""")
    assert '$' not in str(after_env.a), 'Math env snuck into begin env.'

    # Math written inside an environment is the first thing it contains.
    inside_env = TexSoup(r"""\begin{a} $ b$ \end{a}""")
    first_piece = next(inside_env.a.contents)
    assert '$' in str(first_piece), 'Math env not found in begin env'

    # A lone dollar sign inside verbatim must not break parsing.
    in_verbatim = TexSoup(r"""\begin{verbatim} $ \end{verbatim}""")
    assert in_verbatim.verbatim is not None
Example #11
0
def test_list_search():
    """Check that searching with a list of names returns every match."""
    source = r"""
    \section*{Chikin Tales}
    \subsection{Chikin Fly}
    \section{Chikin Sequel}
    """
    matches = list(TexSoup(source).find_all(['section', 'section*']))
    assert len(matches) == 2
Example #12
0
def test_load_edit_save(pancake):
    """Check that a document can be loaded, edited and written back out.

    Deleting the ``emph`` node must give exactly the source with that
    command removed.
    """
    document = TexSoup(pancake)
    document.find('emph').delete()
    expected = pancake.replace(r'\emph{Enjoy your meal!}', '')
    assert str(document) == expected
Example #13
0
def test_item_parsing():
    """Check that ``item`` commands keep the text that belongs to them."""
    bare_source = r"""\item aaa {\bbb} ccc"""
    assert str(TexSoup(bare_source).item) == r'\item aaa {\bbb} ccc'

    listed = TexSoup(r"""\begin{itemize}
\item hello $\alpha$
\end{itemize}""")
    assert str(listed.item) == r'\item hello $\alpha$'
Example #14
0
def test_commands_with_one_or_more_arguments():
    """Check that commands which take arguments can be searched by name."""
    document = TexSoup(r"""
    \section{Chikin Tales}
    \subsection{Chikin Fly}
    \section{Chikin Sequel}
    """)
    sections = list(document.find_all('section'))
    assert len(sections) == 2
    # A name that does not occur yields None rather than an error.
    assert document.find('title') is None
Example #15
0
 def _replace_commands(self):
     """Rewrite commands as described by ``config.replace_commands``.

     Each entry is indexed as ``(name, template)``. Every occurrence of the
     command's source text is replaced by the template, in which ``S_T_R``
     stands for the command's ``string``. The tree is re-parsed after each
     entry.
     """
     for entry in config.replace_commands:
         print(entry[0])
         matches = self.soup.find_all(entry[0])
         self.content = str(self.soup)
         for match in matches:
             target = str(match)
             replacement = entry[1].replace("S_T_R", match.string)
             self.content = self.content.replace(target, replacement)
         self.soup = TexSoup(self.content)
Example #16
0
def test_command_env_name_parse():
    """Check that whitespace between ``begin`` and its name is tolerated."""
    # Spaces before the environment name.
    spaced = TexSoup(r"""\begin            {itemize}\end{itemize}""")
    assert len(list(spaced.contents)) == 1

    # A line break before the environment name.
    broken_line = TexSoup(r"""\begin
{itemize}\end{itemize}""")
    assert len(list(broken_line.contents)) == 1
Example #17
0
def test_commands_with_one_or_more_arguments():
    """Check that commands which take arguments can be searched by name."""
    document = TexSoup(r"""
    \section{Chikin Tales}
    \subsection{Chikin Fly}
    \section{Chikin Sequel}
    """)
    sections = list(document.find_all('section'))
    assert len(sections) == 2
    # A name that does not occur yields None rather than an error.
    assert document.find('title') is None
Example #18
0
 def __init__(self, code):
     """Preprocess ``code``, parse it and set up empty bookkeeping state.

     ``code`` is passed through ``_replace_display_math`` and then
     ``remove_comments``. The result is parsed with TexSoup, and the same
     processed text is handed to ``get_all_listings``.
     """
     cleaned = remove_comments(_replace_display_math(code))
     self.soup = TexSoup(cleaned)
     self.listings = get_all_listings(cleaned)
     self._listings_count = 0
     self.info = {}
     self.cells = []
     self.current = None
Example #19
0
def test_unclosed_commands():
    """Check that a command whose argument is never closed raises TypeError."""
    unterminated = (
        r"""\textit{hello""",
        # In the next two sources the closing brace comes after a `%`.
        r"""\textit{hello %}""",
        r"""\textit{hello \\%}""",
    )
    for source in unterminated:
        with pytest.raises(TypeError):
            TexSoup(source)
Example #20
0
 def soupTab(self):
     """Use TexSoup for tabular and tabularx.

     Each ``tabular`` node found in ``self.contenu`` has its contents joined
     into one string, converted by ``self.processTab``, and substituted for
     the node's source text in ``self.contenu``.

     NOTE(review): only the name ``tabular`` is searched for here; whether
     that also covers ``tabularx`` is not visible from this method.
     """
     soup = TexSoup(self.contenu)
     for tabu in soup.find_all('tabular'):
         print(tabu)
         inner = "".join(str(piece) for piece in tabu.contents)
         converted = self.processTab(inner)
         self.contenu = self.contenu.replace(repr(tabu), converted)
Example #21
0
def test_skip_envs():
    """Check that the contents of skipped environments are not parsed."""
    # Outside any skipped environment the unclosed brace is an error.
    with pytest.raises(TypeError):
        TexSoup(r"""will raise error \textbf{aaaaa""")

    # verbatim needs no configuration: no error is raised.
    TexSoup(r"""\begin{verbatim} \textbf{aaaaa \end{verbatim}""")

    # foobar is passed via skip_envs: no error is raised either.
    TexSoup(r"""\begin{foobar} \textbf{aaaaa \end{foobar}""",
            skip_envs=('foobar', ))
Example #22
0
def test_non_punctuation_command_structure():
    """Tests that normal commands do not include punctuation in the command.

    However, the asterisk is one exception.
    """
    soup = TexSoup(r"""\mycommand, hello""")
    contents = list(soup.contents)
    # Raw string: a plain '\m' is an invalid escape sequence that only
    # happens to keep its backslash and triggers a warning on newer Pythons.
    assert r'\mycommand' == str(contents[0]), \
        'Comma considered part of the command.'
    soup2 = TexSoup(r"""\hspace*{0.2in} hello \hspace*{2in} world""")
    assert len(list(soup2.contents)) == 4, '* not recognized as part of command.'
def get_flushright(filename):
    """Return the third ``flushright`` node of a LaTeX file.

    :param filename: Path of the LaTeX file to parse.
    :return: The node at index 2 among all ``flushright`` matches, or
             ``None`` when the file has fewer than three of them.
    """
    with open(filename) as raw:
        soup = TexSoup(raw.read())

    flushright = list(soup.find_all('flushright'))

    # Explicit bounds check instead of a bare ``except`` combined with
    # ``return`` inside ``finally``, which discards any in-flight exception
    # (including KeyboardInterrupt).
    return flushright[2] if len(flushright) > 2 else None
Example #24
0
def test_multiline_args():
    """Check that an argument on the following line still belongs to the
    macro. See Issue #31."""
    one_break = TexSoup(r"""\mytitle{Essay title}
{Essay subheading.}""")
    assert "Essay subheading." in one_break.mytitle.args

    # A blank line in between detaches the second group from the macro.
    two_breaks = TexSoup(r"""\mytitle{Essay title}

{Essay subheading.}""")
    title_args = two_breaks.mytitle.args
    assert "Essay subheading." not in title_args
    assert "Essay title" in two_breaks.mytitle.args
Example #25
0
def test_math_environment_whitespace():
    """Check that the text inside math environments is left untouched."""
    display = TexSoup(r"""$$\lambda
    \Sigma$$ But don't mind me \$3.00""")
    children = list(display.children)
    contents = list(display.contents)
    assert '\n' in str(children[0]), 'Whitesapce not preserved in math env.'
    assert len(children) == 1 and children[0].name == '$$', 'Math env wrong'
    assert r'\$' == contents[2], 'Dollar sign not escaped!'

    # Commands inside and outside environments must round-trip verbatim.
    mixed = TexSoup(r"""\gamma = \beta\begin{notescaped}\gamma = \beta\end{notescaped}
    \begin{equation*}\beta = \gamma\end{equation*}""")
    starred = mixed.find('equation*')
    assert str(starred) == r'\begin{equation*}\beta = \gamma\end{equation*}'
    assert str(mixed).startswith(r'\gamma = \beta')
    assert str(mixed.notescaped) == r'\begin{notescaped}\gamma = \beta\end{notescaped}'
Example #26
0
def count(tex):
    """Count how many times each label in ``tex`` is referenced.

    All ``label`` commands are collected first; then, for each distinct
    label, the ``ref`` commands naming it are counted. Files pulled in via
    ``include`` commands are not followed.

    :param tex: LaTeX source accepted by ``TexSoup``.
    :return: A dict mapping each label string to its number of references.
    """

    # soupify
    soup = TexSoup(tex)

    # extract all unique labels
    labels = set(label.string for label in soup.find_all('label'))

    # Gather the target of every ref once. The previous search pattern was
    # built from the non-raw literal '\ref{%s}', which starts with a
    # carriage return instead of a backslash, and the search result was
    # returned in place of a count.
    referenced = [ref.string for ref in soup.find_all('ref')]

    # create dictionary mapping label to number of references
    return {label: referenced.count(label) for label in labels}
Example #27
0
def test_math_environment_whitespace():
    """Check that the text inside math environments is left untouched."""
    display = TexSoup(r"""$$\lambda
    \Sigma$$ But don't mind me \$3.00""")
    children = list(display.children)
    contents = list(display.contents)
    assert '\n' in str(children[0]), 'Whitesapce not preserved in math env.'
    assert len(children) == 1 and children[0].name == '$$', 'Math env wrong'
    assert r'\$' in contents[1], 'Dollar sign not escaped!'

    # Commands inside and outside environments must round-trip verbatim.
    mixed = TexSoup(r"""\gamma = \beta\begin{notescaped}\gamma = \beta\end{notescaped}
    \begin{equation*}\beta = \gamma\end{equation*}""")
    starred = mixed.find('equation*')
    assert str(starred) == r'\begin{equation*}\beta = \gamma\end{equation*}'
    assert str(mixed).startswith(r'\gamma = \beta')
    assert str(mixed.notescaped) == r'\begin{notescaped}\gamma = \beta\end{notescaped}'
Example #28
0
def test_math_environment_weirdness():
    """Check how inline math interacts with surrounding environments."""
    # Math that follows an environment must stay outside of it.
    after_env = TexSoup(r"""\begin{a} \end{a}$ b$""")
    assert '$' not in str(after_env.a), 'Math env snuck into begin env.'

    # Math written inside an environment is the first thing it contains.
    inside_env = TexSoup(r"""\begin{a} $ b$ \end{a}""")
    first_piece = inside_env.a.contents[0]
    assert '$' in str(first_piece), 'Math env not found in begin env'

    # A lone dollar sign inside verbatim must not break parsing.
    in_verbatim = TexSoup(r"""\begin{verbatim} $ \end{verbatim}""")
    assert in_verbatim.verbatim is not None

    # GH48
    after_break = TexSoup(r"""a\\$a$""")
    assert '$' in str(after_break), 'Math env not correctly parsed after \\\\'

    # GH55
    break_in_env = TexSoup(r"""\begin{env} text\\$formula$ \end{env}""")
    assert '$' in str(break_in_env.env), 'Math env not correctly parsed after \\\\'
Example #29
0
def test_commands_without_any_sort_arguments():
    """Check that commands used without any argument can be searched."""
    document = TexSoup(r"""
    \Question \textbf{Question Title}

    Here is what chickens do:

    \sol{They fly!}

    \Question
    \textbf{Question 2 Title}
    """)
    questions = list(document.find_all('Question'))
    assert len(questions) == 2
    # A name that does not occur yields None rather than an error.
    assert document.find('section') is None
Example #30
0
def test_commands_without_any_sort_arguments():
    """Check that commands used without any argument can be searched."""
    document = TexSoup(r"""
    \Question \textbf{Question Title}

    Here is what chickens do:

    \sol{They fly!}

    \Question
    \textbf{Question 2 Title}
    """)
    questions = list(document.find_all('Question'))
    assert len(questions) == 2
    # A name that does not occur yields None rather than an error.
    assert document.find('section') is None
Example #31
0
def read_source_archi1(path):
    """Extract the text of every ``item`` command from a LaTeX file.

    The path is printed first. Items whose source mentions ``epsfig`` or
    ``includegraphics`` are recorded as ``None``. For all other items the
    leading item command and any minipage begin/end markers are removed, a
    bigskip line is inserted before each backslash-``a`` sequence, and the
    result is stripped.

    :param path: Path of the LaTeX file to read.
    :return: A list with one entry (string or ``None``) per item, in
             document order.
    """
    print(path)
    testi = []
    with open(path) as infile:
        soup = TexSoup(infile)
        for it in soup.find_all('item'):
            # Render the node once instead of once per check.
            text = str(it)
            if "epsfig" in text or "includegraphics" in text:
                testi.append(None)
                continue
            # Raw string: a plain "\i" is an invalid escape sequence.
            text = text.replace(r"\item", "", 1)
            text = re.sub(r"\\begin{minipage}\[.\]{.*?}", "", text)
            text = re.sub(r"\\end{minipage}", "", text)
            text = re.sub(r"\\a", r"\\bigskip\n\\a", text)
            testi.append(text.strip())
    return testi
Example #32
0
def test_toc_latex_urllink(cli: CliRunner, temp_with_override,
                           file_regression):
    """Build the url-link TOC book with pdflatex and check its LaTeX."""
    out_dir = temp_with_override.joinpath("mybook").absolute()
    # Regular TOC should work
    toc_dir = path_books.joinpath("toc")
    toc_file = toc_dir.joinpath("_toc_urllink.yml")
    build_args = [
        toc_dir.as_posix(),
        "--path-output",
        out_dir.as_posix(),
        "--toc",
        toc_file.as_posix(),
        "--builder",
        "pdflatex",
    ]
    result = cli.invoke(build, build_args)
    assert result.exit_code == 0, result.output

    # Parse the generated .tex file and compare its document environment
    # with the stored regression file.
    tex_path = out_dir.joinpath("_build", "latex", "python.tex")
    parsed = TexSoup(tex_path.read_text())
    file_regression.check(str(parsed.document),
                          extension=".tex",
                          encoding="utf8")
Example #33
0
def test_toc_latex_parts(cli: CliRunner, temp_with_override, file_regression):
    """Build the parts TOC book with pdflatex and check its LaTeX."""
    in_dir = temp_with_override.joinpath("mybook_input").absolute()
    out_dir = temp_with_override.joinpath("mybook").absolute()
    # Regular TOC should work
    toc_dir = path_books.joinpath("toc")
    shutil.copytree(toc_dir, in_dir)

    # Replace the single subpage by four numbered ones.
    subfolder = in_dir / "subfolder"
    (subfolder / "asubpage.md").unlink()
    for number in range(1, 5):
        (subfolder / f"asubpage{number}.md").write_text(
            f"# A subpage {number}\n", encoding="utf8")

    toc_file = in_dir.joinpath("_toc_parts.yml")
    build_args = [
        in_dir.as_posix(),
        "--path-output",
        out_dir.as_posix(),
        "--toc",
        toc_file.as_posix(),
        "--builder",
        "pdflatex",
        "-W",
    ]
    result = cli.invoke(build, build_args)
    assert result.exit_code == 0, result.output

    # Parse the generated .tex file and compare its document environment
    # with the stored regression file.
    tex_path = out_dir.joinpath("_build", "latex", "python.tex")
    parsed = TexSoup(tex_path.read_text())
    file_regression.check(str(parsed.document),
                          extension=".tex",
                          encoding="utf8")
Example #34
0
def resolve(tex):
    """Resolve all imports and update the parse tree.

    Every ``subimport``, ``import`` and ``include`` command is replaced by
    the recursively resolved contents of the file it names.

    :param tex: LaTeX source (or an open file) accepted by ``TexSoup``.
    :return: The parse tree with all imports replaced in place.
    """

    def load(path):
        # Materialise the contents while the file is still open, then let
        # the context manager close it; previously the handle was leaked.
        with open(path) as included:
            return list(resolve(included).contents)

    # soupify
    soup = TexSoup(tex)

    # Each search is turned into a list up front so that replacing nodes
    # does not modify the tree while it is still being walked.

    # resolve subimports
    for subimport in list(soup.find_all('subimport')):
        path = subimport.args[0] + subimport.args[1]
        subimport.replace_with(*load(path))

    # resolve imports
    for _import in list(soup.find_all('import')):
        _import.replace_with(*load(_import.args[0]))

    # resolve includes
    for include in list(soup.find_all('include')):
        include.replace_with(*load(include.args[0]))

    return soup
Example #35
0
def test_item_parsing():
    """Check that ``item`` commands pick up exactly the text they own."""
    # An item outside of any list environment.
    bare = TexSoup(r"""\item aaa {\bbb} ccc""")
    assert str(bare.item) == r'\item aaa {\bbb} ccc'

    # An item inside an itemize environment.
    listed = TexSoup(r"""\begin{itemize}
    \item hello $\alpha$
    \end{itemize}""")
    assert str(listed.item).strip() == r'\item hello $\alpha$'

    # An empty item followed directly by another item.
    empty_first = TexSoup(r"""\begin{itemize}
    \item
    \item first item
    \end{itemize}""")
    assert not list(empty_first.item.contents), \
        "Zeroth item should have no contents"

    # Line breaks before and within the text of an item.
    with_breaks = TexSoup(r"""\begin{itemize}
    \item second item
    \item


    third item
    with third item

    floating text
    \end{itemize}""")
    second_item = list(with_breaks.find_all('item'))[1]
    content = next(second_item.contents)
    assert 'third item' in content, 'Item does not tolerate starting line breaks (as it should)'
    assert 'with' in content, 'Item does not tolerate line break in middle (as it should)'
    assert 'floating' not in content, 'Item should not tolerate multiple line breaks in middle'

    # An environment nested inside an item.
    with_code = TexSoup(r"""\begin{itemize}
    \item This item contains code!
    \begin{lstlisting}
    Code code code
    \end{lstlisting}
    \item hello
    \end{itemize}""")
    assert ' Code code code' in str(with_code.item.lstlisting), 'Item does not correctly parse contained environments.'
    assert '\n    Code code code\n    ' in with_code.item.lstlisting.expr.contents

    # A command placed immediately after the item.
    with_label = TexSoup(r"""\begin{itemize}
    \item\label{some-label} waddle
    \item plop
    \end{itemize}""")
    assert str(with_label.item.label) == r'\label{some-label}'
Example #36
0
 def extractFromTex(self, text):
     """Parse LaTeX ``text`` and store all of its ``answer`` nodes.

     Returns ``self`` so that calls can be chained.
     """
     answers = TexSoup(text).find_all('answer')
     self.__answers = list(answers)
     return self