def read_wmt_bib() -> List[Paper]:
    """Parse the WMT 2020 bib file and return its papers as Paper objects.

    Skips the volume-level "book" entry; every other entry is converted,
    with LaTeX markup in title/abstract rendered to plain text.

    Returns:
        List[Paper]: one Paper per workshop paper in the bib file.
    """
    result = []
    # Build the LaTeX-to-text converter once instead of once per field.
    latex = LatexNodes2Text()
    with open("downloads/2020.wmt-1.0.bib") as f:
        bib = database.parse_file(f)
    for entry in bib.entries.values():
        if entry.type == "book":
            # The proceedings volume itself is not an individual paper.
            continue
        title = latex.latex_to_text(entry.fields["title"])
        url = entry.fields["url"]
        abstract = latex.latex_to_text(entry.fields["abstract"])
        # Persons come back as "Last, First"; flip to "First Last" and
        # join all authors with "|".
        author = "|".join(
            " ".join(reversed(str(e).split(", ")))
            for e in entry.persons["author"]
        )
        # The anthology URL tail doubles as the paper's uid.
        uid = url.replace("https://www.aclweb.org/anthology/", "")
        url = "https://www.statmt.org/wmt20/pdf/" + uid + ".pdf"
        result.append(
            Paper(
                uid=f"WS-2.{uid}",
                ws_id="WS-2",
                title=title,
                authors=author,
                abstract=abstract,
                track="WS-2",
                kind="workshop",
                link=url,
            )
        )
    return result
def test_repl_doc_title(self):
    # Test that \title/\author/\date work and produce something reasonable
    # (exact output might change in the future).
    self.assertEqualUpToWhitespace(
        LatexNodes2Text().latex_to_text(r"""
\title{The Title}
\author{The Author(s)}
\date{July 4, 2020}
\maketitle
"""),
        r"""
The Title
The Author(s)
July 4, 2020
=================
""")

    # Missing all of \title, \author, \date: placeholders are substituted
    # and today's date is filled in automatically.
    today = '{dt:%B} {dt.day}, {dt.year}'.format(
        dt=datetime.datetime.now())
    # The horizontal rule is as wide as the widest substituted line
    # plus a 4-character margin.
    eqhrule = '=' * max(4 + len(r'[NO \author GIVEN]'), 4 + len(today))
    self.assertEqualUpToWhitespace(
        LatexNodes2Text().latex_to_text(r"""
\maketitle
"""),
        r"""
[NO \title GIVEN]
[NO \author GIVEN]
%(today)s
%(eqhrule)s
""" % {'today': today, 'eqhrule': eqhrule})
def test_input(self):
    # \input{...} with an explicit .tex extension, resolved relative to
    # the configured TeX input directory.
    latex = r'''ABCDEF fdksanfkld safnkd anfklsa

\input{test_input_1.tex}

MORENKFDNSN'''
    correct_text = r'''ABCDEF fdksanfkld safnkd anfklsa

hi there! This is an equation:

    x + y i = 0

where i is the imaginary unit.

MORENKFDNSN'''

    testdir = os.path.realpath(os.path.abspath(os.path.dirname(__file__)))
    l2t = LatexNodes2Text()
    l2t.set_tex_input_directory(testdir)

    output = l2t.nodelist_to_text(LatexWalker(latex).get_latex_nodes()[0])
    self.assertEqualUpToWhitespace(output, correct_text)

    # Same file, referenced without the .tex extension.
    latex = r'''ABCDEF fdksanfkld safnkd anfklsa

\input{test_input_1}

MORENKFDNSN'''
    self.assertEqualUpToWhitespace(
        l2t.nodelist_to_text(LatexWalker(latex).get_latex_nodes()[0]),
        correct_text)

    # A parent-directory reference: blocked by default (strict input),
    # allowed when strict_input=False.
    latex = r'''ABCDEF fdksanfkld safnkd anfklsa

\input{../test_input_1}

MORENKFDNSN'''
    correct_text_unsafe = correct_text  # as before
    correct_text_safe = r'''ABCDEF fdksanfkld safnkd anfklsa

MORENKFDNSN'''

    # make sure that the \input{} directive failed to include the file.
    l2t = LatexNodes2Text()
    l2t.set_tex_input_directory(os.path.join(testdir, 'dummy'))
    self.assertEqualUpToWhitespace(
        l2t.nodelist_to_text(LatexWalker(latex).get_latex_nodes()[0]),
        correct_text_safe)

    # but without the strict_input flag, it can access it.
    l2t.set_tex_input_directory(os.path.join(testdir, 'dummy'),
                                strict_input=False)
    self.assertEqualUpToWhitespace(
        l2t.nodelist_to_text(LatexWalker(latex).get_latex_nodes()[0]),
        correct_text_unsafe)
def parse_steps(steps_page_source):
    """Walk the children of the steps container and collect their text.

    Args:
        steps_page_source: iterable of BeautifulSoup elements (the
            children of the solution-steps section).

    Returns:
        list[str]: "Steps" followed by the plain text of each recognized
        step element (LaTeX rendered to unicode).
    """
    main_list = ["Steps"]
    converter = LatexNodes2Text()  # build once; reused for every child

    def _extract(child):
        # Pull the selectable text of a step element and render its LaTeX.
        text = child.find('span', class_='selectable').text
        return converter.latex_to_text(text)

    for child in steps_page_source:
        # Stop as soon as we hit the plotting section; it is not a step.
        if main_list[-1] == "Plotting:":
            main_list = main_list[:-1]
            break
        try:
            if 'solution_step_result' in child['class']:
                main_list.append(_extract(child))
                continue
        except Exception as e:
            # Elements without a 'class' attribute (or without the span)
            # land here; keep the original diagnostic print.
            print(e)
        try:
            if 'mathquill-embedded-latex' in child['class']:
                main_list.append(_extract(child))
                continue
        except Exception:
            pass
        # NOTE(review): bs4 usually returns child['class'] as a *list*, so
        # equality against a plain string may never match — confirm intent.
        # (The original had this block duplicated verbatim; the second copy
        # was unreachable in effect and has been removed.)
        try:
            if child['class'] == 'solution_step_list_item':
                main_list.append(_extract(child))
                continue
        except Exception:
            pass
        try:
            if child['class'] == 'solution_step_explanation':
                main_list.append(_extract(child))
                continue
        except Exception:
            pass
    return main_list
def test_accents(self):
    # Accent macros must collapse into the precomposed unicode characters.
    def to_text(tex):
        nodelist = LatexWalker(tex).get_latex_nodes()[0]
        return LatexNodes2Text().nodelist_to_text(nodelist)

    self.assertEqual(
        to_text(r"Fran\c cais"),
        'Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais')
    self.assertEqual(
        to_text(r"Fr\'en{\'{e}}tique"),
        'Fr\N{LATIN SMALL LETTER E WITH ACUTE}n'
        '\N{LATIN SMALL LETTER E WITH ACUTE}tique')
def parseJsonFile(self):
    """Convert each raw record in self.data into a normalized article dict
    and append it to self.ListOfArticles.

    'FirstAuthor' and 'Year' are always present (None / 0 when missing);
    every other field is only set when the source record provides it.
    """
    converter = LatexNodes2Text()  # one converter for all LaTeX fields
    for dic in self.data:
        # Fresh dict per record instead of the shared copy()/clear() dance.
        article = {}

        article['FirstAuthor'] = self.getAuthor(dic)

        authorCount = self.getAuthorCount(dic)
        if authorCount is not None:
            article['AuthorCount'] = authorCount

        journal = self.getJournal(dic)
        if journal is not None:
            article['Journal'] = journal

        title = self.getTitle(dic)
        if title is not None:
            article['Title'] = converter.latex_to_text(title)

        year = self.getYear(dic)
        article['Year'] = int(year) if year is not None else 0

        doi = self.getDoi(dic)
        if doi is not None:
            article['Doi'] = doi

        collaboration = self.getCollaboration(dic)
        if collaboration is not None:
            article['Collaboration'] = collaboration

        pages = self.getPages(dic)
        if pages is not None:
            article['Pages'] = pages

        volume = self.getVolume(dic)
        if volume is not None:
            article['Volume'] = volume

        eprint = self.getEprint(dic)
        if eprint is not None:
            article['Eprint'] = eprint

        abstract = self.getAbstract(dic)
        if abstract is not None:
            article['Summary'] = converter.latex_to_text(abstract)

        record_id = self.getID(dic)  # renamed: 'id' shadows the builtin
        if record_id is not None:
            article['Source'] = f'https://inspirehep.net/literature/{record_id}'

        citationCount = self.getCitationCount(dic)
        if citationCount is not None:
            article['CitationCount'] = citationCount

        article['Bibtex'] = self.convertToBibtex(article)
        article['DB'] = "https://inspirehep.net/"
        self.ListOfArticles.append(article)
def test_math_alphabets(self):
    # Each math-alphabet macro must map A-Z a-z onto the corresponding
    # Unicode Mathematical Alphanumeric Symbols range.
    template = r"""
%s{-ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz-}
""".strip()
    expectations = [
        ('mathbf',
         '-𝐀𝐁𝐂𝐃𝐄𝐅𝐆𝐇𝐈𝐉𝐊𝐋𝐌𝐍𝐎𝐏𝐐𝐑𝐒𝐓𝐔𝐕𝐖𝐗𝐘𝐙 𝐚𝐛𝐜𝐝𝐞𝐟𝐠𝐡𝐢𝐣𝐤𝐥𝐦𝐧𝐨𝐩𝐪𝐫𝐬𝐭𝐮𝐯𝐰𝐱𝐲𝐳-'),
        ('mathit',
         '-𝐴𝐵𝐶𝐷𝐸𝐹𝐺𝐻𝐼𝐽𝐾𝐿𝑀𝑁𝑂𝑃𝑄𝑅𝑆𝑇𝑈𝑉𝑊𝑋𝑌𝑍 𝑎𝑏𝑐𝑑𝑒𝑓𝑔ℎ𝑖𝑗𝑘𝑙𝑚𝑛𝑜𝑝𝑞𝑟𝑠𝑡𝑢𝑣𝑤𝑥𝑦𝑧-'),
        ('mathsf',
         '-𝖠𝖡𝖢𝖣𝖤𝖥𝖦𝖧𝖨𝖩𝖪𝖫𝖬𝖭𝖮𝖯𝖰𝖱𝖲𝖳𝖴𝖵𝖶𝖷𝖸𝖹 𝖺𝖻𝖼𝖽𝖾𝖿𝗀𝗁𝗂𝗃𝗄𝗅𝗆𝗇𝗈𝗉𝗊𝗋𝗌𝗍𝗎𝗏𝗐𝗑𝗒𝗓-'),
        ('mathbb',
         '-𝔸𝔹ℂ𝔻𝔼𝔽𝔾ℍ𝕀𝕁𝕂𝕃𝕄ℕ𝕆ℙℚℝ𝕊𝕋𝕌𝕍𝕎𝕏𝕐ℤ 𝕒𝕓𝕔𝕕𝕖𝕗𝕘𝕙𝕚𝕛𝕜𝕝𝕞𝕟𝕠𝕡𝕢𝕣𝕤𝕥𝕦𝕧𝕨𝕩𝕪𝕫-'),
        ('mathtt',
         '-𝙰𝙱𝙲𝙳𝙴𝙵𝙶𝙷𝙸𝙹𝙺𝙻𝙼𝙽𝙾𝙿𝚀𝚁𝚂𝚃𝚄𝚅𝚆𝚇𝚈𝚉 𝚊𝚋𝚌𝚍𝚎𝚏𝚐𝚑𝚒𝚓𝚔𝚕𝚖𝚗𝚘𝚙𝚚𝚛𝚜𝚝𝚞𝚟𝚠𝚡𝚢𝚣-'),
        ('mathcal',
         '-𝒜ℬ𝒞𝒟ℰℱ𝒢ℋℐ𝒥𝒦ℒℳ𝒩𝒪𝒫𝒬ℛ𝒮𝒯𝒰𝒱𝒲𝒳𝒴𝒵 𝒶𝒷𝒸𝒹ℯ𝒻ℊ𝒽𝒾𝒿𝓀𝓁𝓂𝓃ℴ𝓅𝓆𝓇𝓈𝓉𝓊𝓋𝓌𝓍𝓎𝓏-'),
        ('mathscr',
         '-𝒜ℬ𝒞𝒟ℰℱ𝒢ℋℐ𝒥𝒦ℒℳ𝒩𝒪𝒫𝒬ℛ𝒮𝒯𝒰𝒱𝒲𝒳𝒴𝒵 𝒶𝒷𝒸𝒹ℯ𝒻ℊ𝒽𝒾𝒿𝓀𝓁𝓂𝓃ℴ𝓅𝓆𝓇𝓈𝓉𝓊𝓋𝓌𝓍𝓎𝓏-'),
        ('mathfrak',
         '-𝔄𝔅ℭ𝔇𝔈𝔉𝔊ℌℑ𝔍𝔎𝔏𝔐𝔑𝔒𝔓𝔔ℜ𝔖𝔗𝔘𝔙𝔚𝔛𝔜ℨ 𝔞𝔟𝔠𝔡𝔢𝔣𝔤𝔥𝔦𝔧𝔨𝔩𝔪𝔫𝔬𝔭𝔮𝔯𝔰𝔱𝔲𝔳𝔴𝔵𝔶𝔷-'),
    ]
    for macroname, uni in expectations:
        self.assertEqual(
            LatexNodes2Text().latex_to_text(template % ('\\' + macroname)),
            uni)
def tex_to_plain(tex):
    '''Try hard converting tex to unicode plain text.'''
    # Rewrite sub/superscript constructs (_{..}, ^{..}, _x, ^x) using the
    # Unicode sub/superscript character tables when every character in the
    # group has an entry; otherwise the matched TeX is left untouched.
    patterns = (
        (r'_\{([^}]*?)\}', subscripts),
        (r'[\^]\{([^}]*?)\}', superscripts),
        (r'_(.)', subscripts),
        (r'[\^](.)', superscripts),
    )
    for reg, table in patterns:
        def replace(match, table=table):
            chars = match.group(1)
            if all_in(chars, table):
                return ''.join(table[c] for c in chars)
            # Not fully translatable: keep the original TeX fragment.
            return match.group(0)
        tex = re.sub(reg, replace, tex, flags=re.DOTALL | re.UNICODE)
    # Whatever remains is handed to pylatexenc for full LaTeX rendering.
    return LatexNodes2Text().latex_to_text(tex)
def window_results(results_list, keywords):
    """Display search results in a scrollable Tk text window.

    results_list: sequence of result tuples aligned with `keywords`;
    the first field of each result is treated as its title.
    """
    root_tk = tk.Tk()
    scrollbar = tk.Scrollbar()
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    window = tk.Text(root_tk, font=24, width=100, height=40, spacing2=4,
                     padx=10, pady=10, wrap=tk.WORD)

    # Numbered table of contents: one line per result title.
    for number, result in enumerate(results_list, start=1):
        window.insert(tk.END, str(number) + ". " + result[0] + "\n")
    window.insert(tk.END, "\n")

    # Full listing: every field of every result, LaTeX rendered to text,
    # one separator line per result.
    for number, result in enumerate(results_list, start=1):
        for keyword, data in zip(keywords, result):
            if keyword == keywords[0]:
                window.insert(tk.END, str(number) + ". " + keyword + "\n")
            else:
                window.insert(tk.END, "--> " + keyword + "\n")
            window.insert(tk.END,
                          LatexNodes2Text().latex_to_text(data) + "\n")
        window.insert(tk.END, "================================= \n")

    window.pack(side=tk.LEFT, fill=tk.BOTH)
    scrollbar.config(command=window.yview)
    tk.mainloop()
def do_test(tex, uni, strict_latex_spaces=None, keep_comments=None, **kwargs):
    # Deliberately passes keep_inline_math=False to the constructor and
    # keep_inline_math=True to latex_to_text (plus **kwargs to both) to
    # exercise how the two settings interact.
    l2t = LatexNodes2Text(strict_latex_spaces=strict_latex_spaces,
                          keep_comments=keep_comments,
                          keep_inline_math=False, **kwargs)
    result = l2t.latex_to_text(tex, keep_inline_math=True, **kwargs)
    self.assertEqual(result, uni)
def test_accents(self):
    # Accented characters must come out as precomposed unicode.
    def convert(tex, **kwargs):
        nodelist = LatexWalker(tex).get_latex_nodes()[0]
        return LatexNodes2Text(**kwargs).nodelist_to_text(nodelist)

    self.assertEqual(
        convert(r"Fran\c cais"),
        'Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais')
    self.assertEqual(
        convert(r"Fr\'en{\'{e}}tique"),
        'Fr\N{LATIN SMALL LETTER E WITH ACUTE}n'
        '\N{LATIN SMALL LETTER E WITH ACUTE}tique')
    # \not= in math mode renders as '=' combined with a long solidus
    # overlay, NFC-normalized.
    expected_neq = unicodedata.normalize(
        'NFC', "=\N{COMBINING LONG SOLIDUS OVERLAY}")
    self.assertEqual(
        convert(r"$1 \not= 2$", math_mode='with-delimiters'),
        '$1 {} 2$'.format(expected_neq))
def test_empty_pars(self): self.assertEqual( LatexNodes2Text(fill_text=10, strict_latex_spaces=True).latex_to_text(r""" A car once was very fast. Another car came by. And then some space: Note the few space tokens in the otherwise empty line above. """), r"""A car once was very fast. Another car came by. And then some space: Note the few space tokens in the otherwise empty line above. """)
def _get(self,key,bibentry,compile_latex): if key in bibentry: value = bibentry[key] if compile_latex: return LatexNodes2Text().latex_to_text(value) return value else: raise KeyError(key)
def test(self):
    # Deliberately unbalanced \emph{ plus an equation environment: the
    # converter should still produce readable text.
    source = r"""\textbf{Hi there!} Here is \emph{an equation

\begin{equation}
    \zeta = x + i y
\end{equation}

where $i$ is the imaginary unit.
"""
    converter = LatexNodes2Text()
    return converter.latex_to_text(source)
def do_test(tex, uni, math_mode=None):
    # Only forward math_mode when explicitly requested, so the converter's
    # own default stays in effect otherwise.
    extra = {} if math_mode is None else {'math_mode': math_mode}
    converted = LatexNodes2Text(strict_latex_spaces=True,
                                **extra).latex_to_text(tex)
    self.assertEqual(converted, uni, msg="For TeX=r'{}'".format(tex))
def test_keep_braced_groups(self):
    # Braced groups are kept verbatim (braces included) when they meet
    # the configured minimum length.
    cases = [
        (dict(keep_braced_groups=True),
         r"\textit{Voil\`a du texte}. Il est \'{e}crit {en fran{\c{c}}ais}",
         'Voil\N{LATIN SMALL LETTER A WITH GRAVE} du texte. '
         'Il est \N{LATIN SMALL LETTER E WITH ACUTE}crit '
         '{en fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais}'),
        (dict(keep_braced_groups=True, keep_braced_groups_minlen=4),
         r"A{XYZ}{ABCD}",
         'AXYZ{ABCD}'),
        (dict(keep_braced_groups=True, keep_braced_groups_minlen=0),
         r"{A}{XYZ}{ABCD}",
         '{A}{XYZ}{ABCD}'),
    ]
    for options, tex, expected in cases:
        nodelist = LatexWalker(tex).get_latex_nodes()[0]
        self.assertEqual(
            LatexNodes2Text(**options).nodelist_to_text(nodelist),
            expected)
def test_text_filling_InitEndPar(self):
    # Leading blank lines in the source produce a leading paragraph break
    # ("\n\n") in the filled output; without them the text starts
    # immediately. Trailing blank lines always yield a trailing break.
    self.assertEqual(
        LatexNodes2Text(fill_text=True, strict_latex_spaces=True)
        .latex_to_text(r"""

Hello \emph{world}. % comment
more text.

"""),
        "\n\nHello world. more text.\n\n")
    self.assertEqual(
        LatexNodes2Text(fill_text=True, strict_latex_spaces=True)
        .latex_to_text(r"""Hello \emph{world}. % comment
more text.

"""),
        "Hello world. more text.\n\n")
def get_title_info(entry):
    """Try to guess title information from a publication.

    Args:
        entry: either a Path to a PDF file or a pybtex database Entry.

    Returns:
        The title string, or None if no title could be found.

    Raises:
        NotImplementedError: for any other entry type.
    """
    if isinstance(entry, Path):
        pdf = PyPDF2.PdfFileReader(str(entry))
        # PDF metadata may simply lack a title.
        if '/Title' in pdf.documentInfo:
            return pdf.documentInfo['/Title']
        return None
    elif isinstance(entry, pybtex.database.Entry):
        # isinstance instead of `type(...) is ...`: also accepts
        # subclasses of Entry, which is backward-compatible.
        if 'title' in entry.fields:
            return LatexNodes2Text().latex_to_text(entry.fields['title'])
        if 'booktitle' in entry.fields:
            return LatexNodes2Text().latex_to_text(entry.fields['booktitle'])
        return None
    raise NotImplementedError(
        "Can only handle pdf or bib objects (was %s)" % type(entry))
def _preprocess(self):
    """Clean and normalize the raw PubMed dataframe in place.

    Replaces NaNs with None, drops unused columns, normalizes ids,
    titles and abstracts, converts keywords and authors into tuples of
    cleaned values, and renames columns to the project's common schema.
    """
    # Defining the "None" value for the "NaN" values.
    self._dataframe.replace({np.nan: None}, inplace=True)

    # Removing unnecessary columns.
    columns_drop = ["methods", "conclusions", "results", "copyrights",
                    "xml", "isbn", "language", "publication_type",
                    "sections", "publisher", "publisher_location"]
    self._dataframe.drop(axis=1, columns=columns_drop, inplace=True)

    # Getting the PubMed ID for each paper.
    self._dataframe.pubmed_id = self._dataframe.pubmed_id.apply(
        lambda x: x.split()[0].strip())

    # Normalizing the features "abstract" and "title".
    # NOTE(review): "%" appears to be escaped before the LaTeX pass so it
    # is not treated as a comment marker — confirm intent.
    self._dataframe.abstract = self._dataframe.abstract.apply(
        lambda x: LatexNodes2Text().latex_to_text(
            re.sub(r"\s+", " ", re.sub("%", "\\%", x)))
        if x and len(x) > 0 else None)
    self._dataframe.title = self._dataframe.title.apply(
        lambda x: x.replace("\n", " ") if x and len(x) > 0 else None)

    # Setting the feature "keywords" as a tuple of keywords and
    # normalizing the keywords for each paper.
    # NOTE(review): eval() assumes the column holds trusted Python list
    # literals (e.g. a CSV round-trip); ast.literal_eval would be safer.
    self._dataframe.keywords.loc[self._dataframe.keywords.notnull()] = [
        tuple([ProcessPubmed.__clean_text(keyword)
               for keyword in eval(keywords)])
        if eval(keywords) else None
        for keywords in self._dataframe.keywords[
            self._dataframe.keywords.notnull()]]

    # Correcting the feature "authors".
    for idx, authors in enumerate(self._dataframe.authors):
        if not eval(authors):
            self._dataframe.authors[idx] = None
        else:
            list_authors = []
            for author in eval(authors):
                auth = {}
                # Prefer "first last"; fall back to whichever part exists.
                if author["firstname"] and author["lastname"]:
                    auth["name"] = ProcessPubmed.__clean_text(
                        "{} {}".format(author["firstname"],
                                       author["lastname"]))
                elif author["firstname"] and not author["lastname"]:
                    auth["name"] = ProcessPubmed.__clean_text(
                        author["firstname"])
                elif not author["firstname"] and author["lastname"]:
                    auth["name"] = ProcessPubmed.__clean_text(
                        author["lastname"])
                if "affiliation" in author:
                    auth["affiliation"] = ProcessPubmed.__clean_text(
                        author["affiliation"])
                else:
                    auth["affiliation"] = None
                # Authors with neither name part are dropped entirely.
                if "name" in auth:
                    list_authors.append(auth)
            if list_authors:
                self._dataframe.authors[idx] = tuple(list_authors)
            else:
                self._dataframe.authors[idx] = None

    # Renaming the features "authors", "keywords" and "journal".
    self._dataframe.rename(columns={"authors": "author_affil",
                                    "keywords": "auth_keywords",
                                    "journal": "vehicle_name"},
                           inplace=True)
def parse_LaTEX(t):
    """Translate f from LaTeX into 'plain' text.

    Renders the LaTeX source to unicode, then strips integral signs,
    equals signs, differentials (ds/dx/...) and "f(x)"-style labels so
    only the bare integrand expression remains.
    """
    t = LatexNodes2Text().latex_to_text(t)
    t = t.replace("∫", "")  # also drop a few more symbols
    t = t.replace("=", "")
    t = t.replace("ds", "")
    t = t.replace("dx", "")
    # Raw strings for the patterns: "\(" in a plain literal only worked
    # because Python leaves unknown escapes alone, and it raises a
    # DeprecationWarning (SyntaxWarning on newer versions).
    t = regex.sub(r"f\([a-z]\)", "", t)
    t = regex.sub(r"d[a-z]", "", t)
    print("t:", t)  # debug output kept from the original
    return t
def make_plain(text):
    """
    Detexify and asciify text

    :param str text: Text to make plain
    :returns: Text with all LaTeX sequences rendered to text and unicode\
    characters replaced
    :rtype: str
    """
    # First render LaTeX to unicode, then fold the unicode down to ASCII.
    detexed = LatexNodes2Text().latex_to_text(text)
    return unidecode(detexed)
def test_repl_matrix_environment(self):
    # All matrix-like environments render in the same bracketed form with
    # semicolon-separated rows.
    cases = (('array', '{lll}'), ('pmatrix', ''),
             ('bmatrix', ''), ('smallmatrix', ''))
    for env, arg in cases:
        tex = (r"\begin{" + env + "}" + arg
               + r"1 & 2 & abcdef\\ 3 & 4\end{" + env + "}")
        self.assertEqualUpToWhitespace(
            LatexNodes2Text().latex_to_text(tex),
            "[ 1 2 abcdef; 3 4 ]")
def test_repl_eqn(self):
    # Display-math environments (starred or not) reduce to their contents.
    environments = ('equation', 'equation*', 'eqnarray', 'eqnarray*',
                    'align', 'align*', 'multline', 'multline*',
                    'gather', 'gather*', 'dmath', 'dmath*')
    for env in environments:
        tex = "\\begin{%s} e \\approx 2.718 \\end{%s}" % (env, env)
        self.assertEqualUpToWhitespace(
            LatexNodes2Text(
                strict_latex_spaces='except-in-equations'
            ).latex_to_text(tex),
            u"e ≈ 2.718")
def format_title(title):
    """Format the publication title.

    Applies a series of ordered LaTeX clean-ups to the raw title, renders
    it to unicode text with pylatexenc (falling back to the raw string on
    parse errors), then normalizes spacing and punctuation.
    """
    logger.info(f"... formatting title \"{title}\"")
    # Ordered LaTeX repairs: each replace targets a known formatting quirk
    # seen in upstream titles; order matters for the overlapping patterns.
    title = title.replace("\\sqrt s", "\\sqrt{s}")
    title = title.replace(" sqrts ", " \\sqrt{s} ")
    title = title.replace(" \\bar{", "\\bar{")
    title = title.replace("\\smash[b]", "")
    title = title.replace("\\smash [b]", "")
    title = title.replace("\\mbox{", "{")
    title = title.replace("{\\rm ", "{")
    title = title.replace("{\\rm\\scriptscriptstyle ", "{")
    title = title.replace("\\kern -0.1em ", "")
    title = title.replace("$~\\mathrm{", "~$\\mathrm{")
    # Ensure a space after \rightarrow when it is glued to the next token.
    if re.search(r"rightarrow\S", title):
        title = title.replace("rightarrow", "rightarrow ")
    # fix overline without space
    overline = re.search(r"overline\s([a-zA-Z])", title)
    if overline:
        title = title.replace(f"overline {overline.group(1)}",
                              "overline{%s}" % overline.group(1))
        title = title.replace(" \\overline{", "\\overline{")
    # fix "{\mathrm XXX}" to "\mathrm{XXX}"
    mathrm = re.search(r"{\\mathrm (.*)}", title)
    if mathrm:
        title = title.replace(f"\\mathrm {mathrm.group(1)}",
                              "\\mathrm{%s}" % mathrm.group(1))
    # overline{D} gives problems when in mathrm
    title = title.replace("\\overline{D", "\\bar{D")
    try:
        text_title = LatexNodes2Text().latex_to_text(title)
    except LatexWalkerError as identifier:
        logger.error(f"LatexWalkerError in {identifier}")
        # Fall back to the (cleaned) raw title when parsing fails.
        text_title = title
    logger.debug(f"... text title {text_title}")
    # Convert some of remaining text to unicode
    text_title = convert_to_unicode(text_title)
    # insert spaces before and after the following characters
    char_with_spaces = ["=", "→"]
    for my_char in char_with_spaces:
        pat = re.compile(r"\s?%s\s?" % my_char)
        text_title = re.sub(pat, " %s " % my_char, text_title)
    # insert space before eV/keV/MeV/GeV/TeV in case of wrong formatting
    text_title = re.sub(r"(\d)([kMGT]?eV)", r"\1 \2", text_title)
    # reduce all spaces to a maximum of one
    text_title = re.sub(r"\s+", " ", text_title)
    # reduce all underscores to a maximum of one
    text_title = re.sub(r"_+", "_", text_title)
    # reduce all hyphens to a maximum of one
    text_title = re.sub(r"-+", "-", text_title)
    # remove space before comma
    text_title = text_title.replace(" ,", ",")
    # merge s_NN
    text_title = text_title.replace("s_ NN", "s_NN").strip()
    return text_title
def read_findings_bib():
    """Parse the Findings-of-EMNLP bib file and dump its papers to CSV.

    Reads downloads/2020.findings-EMNLP.0.bib, skips the volume-level
    "book" entry, renders LaTeX titles/abstracts to plain text, and
    writes yamls/findings_papers.csv with columns
    UID/title/abstract/authors/pdf_url.
    """
    with open("downloads/2020.findings-EMNLP.0.bib") as f:
        bib = database.parse_file(f)
    # Build the LaTeX-to-text converter once instead of once per field.
    latex = LatexNodes2Text()
    uids = []
    titles = []
    abstracts = []
    authors = []
    urls = []
    for i, entry in enumerate(bib.entries.values()):
        if entry.type == "book":
            # The proceedings volume itself is not an individual paper.
            continue
        title = latex.latex_to_text(entry.fields["title"])
        abstract = latex.latex_to_text(entry.fields["abstract"])
        # Persons come back as "Last, First"; flip to "First Last" and
        # join all authors with "|".
        author = "|".join(
            " ".join(reversed(str(e).split(", ")))
            for e in entry.persons["author"]
        )
        uids.append(f"findings.{i}")
        titles.append(title)
        abstracts.append(abstract)
        authors.append(author)
        urls.append(entry.fields["url"])
    df = pd.DataFrame({
        "UID": uids,
        "title": titles,
        "abstract": abstracts,
        "authors": authors,
        "pdf_url": urls,
    })
    df.to_csv("yamls/findings_papers.csv", index=False)
def decode(entry: Entry) -> Entry:
    """Decode a dictionary with LaTeX strings into a dictionary with unicode strings."""
    translator = LatexNodes2Text()
    # Work on a deep copy so the caller's entry is never mutated.
    decoded = deepcopy(entry)
    assert decoded.fields is not None
    for key in decoded.fields:
        if key == "url":
            # The url can contain special LaTeX characters (like %) and that's fine
            continue
        decoded.fields[key] = translator.latex_to_text(decoded.fields[key])
    return decoded
def fix_utf8_field(entry, field, args):
    """Normalize one field of a bib entry in place.

    If `field` names the args.utf8 field, its LaTeX is rendered to
    unicode; if it names the args.latex field, its unicode is converted
    to LaTeX escapes. Entries lacking the field are returned untouched.

    Returns:
        The (possibly modified) entry.
    """
    if field not in entry:
        return entry
    value = entry[field]
    # Bug fix: compare strings with ==, not `is` — the identity test only
    # worked by accident of CPython string interning.
    if field == args.utf8:
        value = LatexNodes2Text().latex_to_text(value)
    elif field == args.latex:
        value = unicode_to_latex(value)
    entry[field] = value
    return entry
def print(self, line, params):
    """Render a template line: substitute %{type:name} placeholders from
    `params`, then convert the result from LaTeX to plain text.

    Note: this method intentionally shadows the builtin print().
    """
    def to_str(name, kind):
        # Format one parameter; 'degree' values get their dedicated
        # conversion, everything else plain str().
        obj = params[name]
        if kind == 'degree':
            return degree_to_string(obj)
        return str(obj)

    # NOTE(review): matches are found on the *original* line but replaced
    # in the mutated one, and str.replace substitutes every occurrence of
    # the placeholder text — assumes replacements never produce new
    # placeholder-shaped text; confirm with callers.
    for match in re.finditer('%{(?P<type>[^:}]*):(?P<name>[^}]*)}', line):
        line = line.replace(
            match.group(0),
            to_str(match.group('name'), match.group('type')))
    return LatexNodes2Text().latex_to_text(line)
def decode_latex(latex_text):
    """Decode latex text.

    Args:
        latex_text (str): a latex text.

    Returns:
        str: the latex text decoded.
    """
    # Coerce bytes (or other non-text input) to unicode before conversion.
    text = (latex_text if isinstance(latex_text, text_type)
            else text_type(latex_text, 'utf8'))
    return LatexNodes2Text().latex_to_text(text)
def __convert(self, lecture):
    # Read the lecture's LaTeX source from disk and attach the rendered
    # plain text to lecture.content. Returns the lecture on success, or
    # None when the file is missing or conversion fails.
    # (Python 2 syntax: uses print statements.)
    path = self.prefix + lecture.path
    if not os.path.exists(path):
        print "File not found: {}".format(path)
        return
    try:
        with open(path, 'r') as f:
            lecture.content = LatexNodes2Text().latex_to_text(f.read())
        # Progress output: which lecture was just converted.
        print lecture.url
    except Exception as e:
        # Best effort: skip lectures whose LaTeX cannot be read/parsed.
        print "Skipping due to {}".format(e)
        return
    return lecture