def convert_pdf(song, path, output):
    """Parse and convert a PDF into text, including metadata.

    Populates ``song`` (a dict) in place with author/creator/producer/title
    fields read from the PDF metadata, then shells out to ``pdftotext`` to
    extract the text content and cleans up common conversion artifacts.

    :param song: dict that receives the metadata fields (mutated in place)
    :param path: path to the source PDF file
    :param output: path where pdftotext writes the extracted text
    :return: the cleaned-up text contents of the PDF
    """
    meta = PdfReader(str(path)).Info
    song['author'] = strip_brackets(meta.Author)
    song['creator'] = strip_brackets(meta.Creator)
    song['producer'] = strip_brackets(meta.Producer)
    if meta.Title:
        # explicit title in metadata, use that. Fairly rare.
        song['title'] = strip_brackets(meta.Title.strip())
    else:
        # multiline titles get mangled when converting to text, so we use
        # a library that uses heuristics to guess the title.
        # It is slow, though.
        try:
            title = pdftitle.get_title_from_file(str(path)).strip()
        except Exception:
            # best-effort: leave song['title'] unset if the heuristic fails
            pass
        else:
            # re-insert spaces lost at word boundaries ("FooBar" -> "Foo Bar")
            song['title'] = re.sub(r'([a-z])([A-Z])', r'\1 \2', title)
    cmd = ['pdftotext', '-layout', '-enc', 'UTF-8', '-eol', 'unix', '-nopgbrk']
    subprocess.run(cmd + [str(path), output])
    # pdftotext was told to emit UTF-8 above, so read it back explicitly as
    # UTF-8 instead of relying on the locale default encoding.
    with open(output, 'r', encoding='utf-8') as f:
        contents = f.read()
    # fix various issues
    contents = clean_encoding(contents)
    # TODO: we probably enforce ASCII, stripping unicode?
    # sometimes a chord like D(4) is extracted as D(4 which is hard to parse.
    # BUG FIX: the original pattern used the character class [ $], which
    # matches a literal dollar sign, not end-of-line; use an alternation
    # with re.MULTILINE so the missing ')' is restored at line ends too.
    contents = re.sub(r'([A-JZ]\(\d)( |$)', r'\1)\2', contents,
                      flags=re.MULTILINE)
    return contents
def metadata(path: Path) -> Metadata:
    """
    Reads a given PDF file and produces a Metadata object.

    :param path: path to a PDF file
    :return: the metadata extracted from the PDF file
    """
    with path.open('rb') as handle:
        reader = PdfFileReader(handle)
        info = reader.getDocumentInfo()
        page_count = reader.getNumPages()
        typer.echo(f'PDF metadata: {info}', err=True)

        # Possible titles, gathered from three sources:
        #   - the title annotated in the PDF metadata
        #   - the title read by pdftitle (largest text on the first page)
        #   - the file name without extension
        pdftitle_title = pdftitle.get_title_from_file(str(path))
        typer.echo(f'Title according to pdftitle: {pdftitle_title}', err=True)
        candidates = [
            candidate
            for candidate in (info.title, pdftitle_title, path.stem)
            if candidate is not None
        ]

        # Heuristic: of the available candidates, the longest one wins.
        return Metadata(
            title=max(candidates, key=len),
            author=info.author,
            page_count=page_count,
        )
def changePdfName(filename):
    """Rename a PDF file to its detected title (with a '.pdf' extension).

    Uses pdftitle to guess the document's title, strips characters that are
    not allowed in file names, and renames the file in place (it stays in
    the same directory as the original).

    :param filename: path to the PDF file to rename
    """
    print('changePdfName: original filename {}'.format(filename))
    title = pdftitle.get_title_from_file(filename)
    print('original title: ', title)
    # strip characters that are forbidden in file names
    forbidChars = ['/', ':']
    for ch in forbidChars:
        title = title.replace(ch, "")
    newTitle = title + '.pdf'
    print('new title: ', newTitle)
    # BUG FIX: passing the bare new name to os.rename moved the file into
    # the current working directory; join with the original directory so
    # the file is renamed where it lives. (os.path.dirname('') == '', so a
    # cwd-relative filename behaves exactly as before.)
    os.rename(filename, os.path.join(os.path.dirname(filename), newTitle))
def segment(self, uri: str, buffer: bytes, mime_type: str, *args, **kwargs) -> List[Dict]:
    """
    Segments PDF files and extracts data from them.

    Checks if the input is a string of the filename, or if it's the file
    in bytes. It will then extract the data from the file, creating a
    list for images, and text.

    :param uri: File name of PDF
    :type uri: str
    :param buffer: PDF file in bytes
    :type buffer: bytes
    :param mime_type: the type of data
    :returns: A list of documents with the extracted data
    :rtype: List[Dict]
    """
    chunks = []
    if mime_type != 'application/pdf':
        return chunks
    if uri:
        try:
            pdf_img = fitz.open(str(uri))
            pdf_content = pdfplumber.open(uri)
            title = get_title_from_file(uri)
        except Exception as ex:
            self.logger.error(f'Failed to open {uri}: {ex}')
            return chunks
    elif buffer:
        try:
            pdf_img = fitz.open(stream=buffer, filetype='pdf')
            pdf_content = pdfplumber.open(io.BytesIO(buffer), password=b"")
            title = get_title_from_io(io.BytesIO(buffer))
        except Exception as ex:
            # BUG FIX: the original message was an f-string with no
            # placeholder, so the caught exception was never reported;
            # include it, mirroring the uri branch above.
            self.logger.error(f'Failed to load from buffer: {ex}')
            return chunks
    else:
        self.logger.warning('No value found in `buffer` or `uri`')
        return chunks

    # Extract images
    self._extract_img(pdf_img, chunks)
    # Extract text
    self._extract_text(pdf_content, chunks, title)
    return chunks
def find_possible_titles(path):
    """
    Given a valid path to a pdf file, it tries to extract a list of
    possible titles.

    In the current implementation it looks for titles by 1) looking for
    the outcome of the pdftitle library, 2) looking in the dictionary
    returned by the PyPDF library and 3) looking in the filename.

    Parameters
    ----------
    path : string
        A valid path to a pdf file

    Returns
    -------
    titles : list of strings, or None
        Possible titles of the paper. None is returned when no PDF info
        dictionary could be read at all.
    """
    titles = []

    # (1) pdftitle heuristic — best-effort, it can raise on malformed PDFs.
    # BUG FIX: narrowed from a bare `except:`, which also swallowed
    # SystemExit and KeyboardInterrupt.
    try:
        title = pdftitle.get_title_from_file(path)
    except Exception:
        title = ''
    # Check that the title found is neither empty nor just a few characters
    if len(title.strip()) > 12:
        titles.append(title)

    # (2) PDF metadata dictionary
    info = get_pdf_info(path)
    if not info:
        return None
    for key, value in info.items():
        if 'title' in key.lower():
            # Check that the title found is neither empty nor just a few
            # characters
            if isinstance(value, str) and len(value.strip()) > 12:
                titles.append(value)

    # (3) file name — higher threshold, short file names are rarely titles
    title = os.path.basename(path)
    if len(title.strip()) > 30:
        titles.append(title)

    return titles
def force_index_file(path, index, title):  #uses crossref; deprecated
    """Index a PDF into the module-level gPapers list via a Crossref lookup.

    Resolves a title (argument -> pdftitle -> PDF /Title metadata), queries
    Crossref for bibliographic details, and appends to or replaces an entry
    in ``gPapers``. A desktop notification (notify2) reports whether the
    parse succeeded fully, partially, or not at all.

    :param path: path to the PDF file being indexed
    :param index: position in gPapers to overwrite, or -1 to append
    :param title: known title, or "" to auto-detect it from the file
    """
    print('force index', path, index, title)
    if title == "":
        # No title supplied: try the pdftitle heuristic first, then fall
        # back to the /Title entry of the PDF metadata.
        # NOTE(review): both bare `except:` clauses also swallow
        # KeyboardInterrupt/SystemExit — consider `except Exception:`.
        try:
            title = pdftitle.get_title_from_file(path)
            print("Title: ", title, path)
        except:
            try:
                title = get_info(path).get('/Title')
                print("Title: ", title, path)
                if title == "" or title == None:
                    print("Unable to get title of file", path)
            except:
                print("Unable to get title of file", path)
    if title != "" and title != None:
        try:
            # search_query_scholar = scholarly.search_pubs_query(title)
            # search_result_scholar = next(search_query_scholar)
            # print(search_result_scholar)
            # Take the single most relevant Crossref hit for this title.
            search_query_crossref = gWorks.query(
                bibliographic=title).sort('relevance')
            search_result_crossref = []
            if search_query_crossref.count() != 0:
                for item in search_query_crossref:
                    search_result_crossref = item
                    break
            else:
                print('ERROR: crossref search did not return any results',
                      title)
                raise ValueError
            # Prefer Crossref's canonical title over the locally detected one.
            title = search_result_crossref['title'][0]
            print(title)
            print(type(title))
            assert (isinstance(title, str))
            # print(search_result_scholar.bib)
            # print(type(search_result_scholar.bib.get('title')))
            # title2 = search_result_scholar.bib.get('title')
            # if title.lower() != title2.lower():
            #     print("ERROR: title doesnt match")
            #     # print("ERROR: title doesnt match from two queries, crossref:",title,"scholar:",title2)
            #     # print(title)
            #     # print(title2)
            #     raise ValueError
            # abstract = search_result_scholar.bib.get('abstract')
            authors = search_result_crossref['author']
            url = search_result_crossref['URL']
            # Publication date as [year, month, day]; zeros when unknown.
            date = [0, 0, 0]
            if 'published-online' in search_result_crossref:
                date = search_result_crossref['published-online'][
                    'date-parts'][0]
            elif 'published-print' in search_result_crossref:
                date = search_result_crossref['published-print']['date-parts'][
                    0]
            # search_result.get_citedby()
            # Citation data came from the (now disabled) scholarly lookup,
            # so these stay at their defaults.
            cited_by_url = ""
            citations = 0
            # if hasattr(search_result_scholar, 'id_scholarcitedby'):
            #     cited_by_url = search_result_scholar.id_scholarcitedby
            # if hasattr(search_result_scholar, 'citedby'):
            #     citations = search_result_scholar.citedby
            if index == -1:
                gPapers.append({
                    'path': path,
                    'title': title,
                    'authors': authors,
                    'tags': [],
                    # 'abstract': abstract,
                    'date': date,
                    'url': url,
                    'citations': citations,
                    'cited_by_url': cited_by_url
                })
            else:
                gPapers[index] = {
                    'path': path,
                    'title': title,
                    'authors': authors,
                    'tags': [],
                    # 'abstract': abstract,
                    'date': date,
                    'url': url,
                    'citations': citations,
                    'cited_by_url': cited_by_url
                }
            n = notify2.Notification("Document Parsed Fully", title,
                                     "package-install")
            n.show()
        except Exception as e:
            # Crossref lookup failed: store a minimal record with whatever
            # title we already had.
            print(e)
            if index == -1:
                gPapers.append({'path': path, 'title': title, 'tags': []})
            else:
                gPapers[index] = {'path': path, 'title': title, 'tags': []}
            n = notify2.Notification("Document Parsed Partially", title,
                                     "package-installed-outdated")
            n.show()
    else:
        # No title could be determined at all: store a placeholder record.
        if index == -1:
            gPapers.append({
                'path': path,
                'title': 'Untitled Document',
                'tags': []
            })
        else:
            gPapers[index] = {
                'path': path,
                'title': 'Untitled Document',
                'tags': []
            }
        n = notify2.Notification("Unable to get title",
                                 "Untitled Document - " + path,
                                 "package-broken")
        n.show()
"Ex- pression" : "Expression", "\u2013" : "-", ",and" : ", and" } # name fixes author_fix = { "NAIK" : "Naik"} pdfs = glob.glob("final project reports/*.pdf") titles = {} for f in pdfs: f_clean = re.sub("LATE_", "", f) bn = os.path.basename(f_clean) ss = bn.split("_") user_id = int(ss[1]) title = problems[user_id] if (user_id in problems) else pdftitle.get_title_from_file(f) for k,v in toremove.items(): title = re.sub(k, v, title) titles[user_id] = title htmls = glob.glob("video_links/*.html") parser = MyHTMLParser() # Project groups can be downloaded from Courseworks (People -> Project Groups -> Import(!) -> Download Course Roster) groups = pd.read_csv("Project groups.csv") poster_data = {} for i,f in enumerate(htmls): f_clean = re.sub("LATE_", "", f) bn = os.path.basename(f_clean) _,user_id,_ = bn.split("_")