Example #1
def convert_pdf(song, path, output):
    """Parse and convert a pdf into text, including metadata.

    Deals with various common conversion errors."""

    meta = PdfReader(str(path)).Info
    song['author'] = strip_brackets(meta.Author)
    song['creator'] = strip_brackets(meta.Creator)
    song['producer'] = strip_brackets(meta.Producer)
    if meta.Title:
        # explicit title in metadata, use that. Fairly rare.
        song['title'] = strip_brackets(meta.Title.strip())
    else:
        # multiline titles get mangled when converting to text, so we use
        # a library that uses heuristics to guess the title.
        # It is slow, though
        try:
            title = pdftitle.get_title_from_file(str(path)).strip()
        except Exception:
            pass
        else:
            song['title'] = re.sub(r'([a-z])([A-Z])', r'\1 \2', title)

    cmd = ['pdftotext', '-layout', '-enc', 'UTF-8', '-eol', 'unix', '-nopgbrk']
    subprocess.run(cmd + [str(path), output])
    with open(output, 'r', encoding='UTF-8') as f:
        contents = f.read()
    # fix various issues
    contents = clean_encoding(contents)
    # TODO: should we enforce ASCII, stripping unicode?
    # sometimes a chord like D(4) is extracted as D(4 which is hard to parse
    contents = re.sub(r'([A-JZ]\([\d])([ $])', r'\1)\2', contents)
    return contents
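A minimal driver for this function might look like the sketch below; the song dict and file names are illustrative, strip_brackets and clean_encoding are project-local helpers defined elsewhere, and PdfReader is assumed to be pdfrw's (whose Info object carries the bracketed Author/Creator/Producer strings). Note that subprocess.run is not passed check=True, so a pdftotext failure would only surface later as a missing or stale output file.

import re
import subprocess

import pdftitle
from pdfrw import PdfReader  # assumption: the PdfReader used above is pdfrw's

song = {}  # record to be filled with metadata
contents = convert_pdf(song, 'songs/wonderwall.pdf', '/tmp/wonderwall.txt')
print(song.get('title'), song.get('author'))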
Example #2
def metadata(path: Path) -> Metadata:
    """
    Reads a given PDF file and produces a Metadata object.

    :param path: path to a PDF file
    :return: the metadata extracted from the PDF file
    """
    with path.open('rb') as f:
        reader = PdfFileReader(f)
        info = reader.getDocumentInfo()
        page_count = reader.getNumPages()

    typer.echo(f'PDF metadata: {info}', err=True)

    # Decide which possible title to use:
    # - the title annotated in the PDF metadata
    # - the title read by pdftitle (largest text on the first page)
    # - the file name without extension
    pdftitle_title = pdftitle.get_title_from_file(str(path))
    typer.echo(f'Title according to pdftitle: {pdftitle_title}', err=True)

    title_candidates = [t for t in [info.title, pdftitle_title, path.stem] if t is not None]

    # The current heuristic is just to use the longest of the three candidates
    title = max(title_candidates, key=len)

    return Metadata(
        title=title,
        author=info.author,
        page_count=page_count
    )
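This example is written against the legacy PyPDF2 1.x API; PdfFileReader, getDocumentInfo() and getNumPages() were renamed in PyPDF2 2.x and its successor pypdf. A rough equivalent of the read step on current pypdf would be:

from pathlib import Path

from pypdf import PdfReader

def basic_pdf_info(path: Path):
    # PdfReader replaces PdfFileReader; .metadata and len(.pages)
    # replace getDocumentInfo() and getNumPages()
    reader = PdfReader(str(path))
    return reader.metadata, len(reader.pages)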
Example #3
def changePdfName(filename):
    print('changePdfName: original filename {}'.format(filename))
    title = pdftitle.get_title_from_file(filename)
    print('original title: ', title)

    forbidChars = ['/', ':']
    for ch in forbidChars:
        title = title.replace(ch, "")

    newTitle = title + '.pdf'
    print('new title: ', newTitle)

    os.rename(filename, newTitle)
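One caveat: os.rename is given a bare file name, so the renamed PDF always lands in the current working directory rather than next to the original. A hypothetical variant that keeps the file in place:

import os

import pdftitle

def changePdfNameInPlace(filename):
    # hypothetical variant of changePdfName that preserves the directory
    title = pdftitle.get_title_from_file(filename)
    for ch in ['/', ':']:
        title = title.replace(ch, "")
    newPath = os.path.join(os.path.dirname(filename) or '.', title + '.pdf')
    os.rename(filename, newPath)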
Example #4
    def segment(self, uri: str, buffer: bytes, mime_type: str, *args,
                **kwargs) -> List[Dict]:
        """
        Segments PDF files and extracts data from them.

        Accepts either the file name as a string (``uri``) or the raw
        file as bytes (``buffer``), then extracts the images and the
        text into a list of document chunks.

        :param uri: File name of PDF
        :type uri: str
        :param buffer: PDF file in bytes
        :type buffer: bytes
        :param mime_type: the type of data
        :returns: A list of documents with the extracted data
        :rtype: List[Dict]
        """
        chunks = []
        if mime_type != 'application/pdf':
            return chunks

        if uri:
            try:
                pdf_img = fitz.open(str(uri))
                pdf_content = pdfplumber.open(uri)
                title = get_title_from_file(uri)
            except Exception as ex:
                self.logger.error(f'Failed to open {uri}: {ex}')
                return chunks
        elif buffer:
            try:
                pdf_img = fitz.open(stream=buffer, filetype='pdf')
                pdf_content = pdfplumber.open(io.BytesIO(buffer), password=b"")
                title = get_title_from_io(io.BytesIO(buffer))
            except Exception as ex:
                self.logger.error(f'Failed to load from buffer: {ex}')
                return chunks
        else:
            self.logger.warning('No value found in `buffer` or `uri`')
            return chunks
        # Extract images
        self._extract_img(pdf_img, chunks)
        # Extract text
        self._extract_text(pdf_content, chunks, title)
        return chunks
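The import lines sit outside this excerpt; the usual spellings for the three PDF libraries involved are shown below (fitz is PyMuPDF's import name, and pdftitle exposes both the file-path and the stream entry points used above):

import io
from typing import Dict, List

import fitz  # PyMuPDF
import pdfplumber
from pdftitle import get_title_from_file, get_title_from_io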
Example #5
def find_possible_titles(path):
    """
    Given a valid path to a pdf file, it tries to extract a list of possible titles.
    In the current implementation it looks for titles via 1) the output of the pdftitle
    library, 2) the document-info dictionary returned by the PyPDF library and 3) the filename.

    Parameters
    ----------
    path : string
        A valid path to a pdf file
    
    Returns
    -------
    titles : list of strings
        Possible titles of the paper.
    """
    titles = []
    # (1)
    try:
        title = pdftitle.get_title_from_file(path)
    except Exception:
        title = ''
    if len(title.strip()) > 12:
        # check that the title found is neither empty nor just a few characters
        titles.append(title)

    # (2)
    info = get_pdf_info(path)
    if not info:
        return None

    for key, value in info.items():
        if 'title' in key.lower():
            if isinstance(value, str) and len(value.strip()) > 12:
                # same non-trivial-length check as above
                titles.append(value)
    # (3)
    title = os.path.basename(path)
    if len(title.strip()) > 30:
        # filenames get a stricter length check before being accepted as a title
        titles.append(title)

    return titles
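A quick way to exercise this helper (the path is illustrative, and get_pdf_info is the project's wrapper around the PyPDF document-info dictionary, defined elsewhere):

candidates = find_possible_titles('papers/some-paper.pdf')
if candidates:  # may be None when no document info is available
    # candidates are ordered: pdftitle output, then metadata, then filename
    print(candidates[0])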
Example #6
def force_index_file(path, index, title):  #uses crossref; deprecated
    print('force index', path, index, title)

    if title == "":
        try:
            title = pdftitle.get_title_from_file(path)
            print("Title: ", title, path)
        except Exception:
            try:
                title = get_info(path).get('/Title')
                print("Title: ", title, path)

                if not title:
                    print("Unable to get title of file", path)
            except Exception:
                print("Unable to get title of file", path)

    if title:
        try:
            # search_query_scholar = scholarly.search_pubs_query(title)
            # search_result_scholar = next(search_query_scholar)
            # print(search_result_scholar)

            search_query_crossref = gWorks.query(bibliographic=title).sort('relevance')
            search_result_crossref = []
            if search_query_crossref.count() != 0:
                for item in search_query_crossref:
                    search_result_crossref = item
                    break
            else:
                print('ERROR: crossref search did not return any results', title)
                raise ValueError

            title = search_result_crossref['title'][0]
            print(title)
            print(type(title))
            assert isinstance(title, str)
            # print(search_result_scholar.bib)
            # print(type(search_result_scholar.bib.get('title')))
            # title2 = search_result_scholar.bib.get('title')

            # if title.lower() != title2.lower():
            # 	print("ERROR: title doesnt match")
            # 	# print("ERROR: title doesnt match from two queries, crossref:",title,"scholar:",title2)
            # 	# print(title)
            # 	# print(title2)
            # 	raise ValueError

            # abstract = search_result_scholar.bib.get('abstract')
            authors = search_result_crossref['author']
            url = search_result_crossref['URL']

            date = [0, 0, 0]
            if 'published-online' in search_result_crossref:
                date = search_result_crossref['published-online']['date-parts'][0]
            elif 'published-print' in search_result_crossref:
                date = search_result_crossref['published-print']['date-parts'][0]

            # search_result.get_citedby()
            cited_by_url = ""
            citations = 0

            # if hasattr(search_result_scholar, 'id_scholarcitedby'):
            # 	cited_by_url = search_result_scholar.id_scholarcitedby

            # if hasattr(search_result_scholar, 'citedby'):
            # 	citations = search_result_scholar.citedby

            if index == -1:
                gPapers.append({
                    'path': path,
                    'title': title,
                    'authors': authors,
                    'tags': [],
                    # 'abstract': abstract,
                    'date': date,
                    'url': url,
                    'citations': citations,
                    'cited_by_url': cited_by_url
                })
            else:
                gPapers[index] = {
                    'path': path,
                    'title': title,
                    'authors': authors,
                    'tags': [],
                    # 'abstract': abstract,
                    'date': date,
                    'url': url,
                    'citations': citations,
                    'cited_by_url': cited_by_url
                }

            n = notify2.Notification("Document Parsed Fully", title,
                                     "package-install")
            n.show()
        except Exception as e:
            print(e)
            if index == -1:
                gPapers.append({'path': path, 'title': title, 'tags': []})
            else:
                gPapers[index] = {'path': path, 'title': title, 'tags': []}

            n = notify2.Notification("Document Parsed Partially", title,
                                     "package-installed-outdated")
            n.show()
    else:
        if index == -1:
            gPapers.append({
                'path': path,
                'title': 'Untitled Document',
                'tags': []
            })
        else:
            gPapers[index] = {
                'path': path,
                'title': 'Untitled Document',
                'tags': []
            }

        n = notify2.Notification("Unable to get title",
                                 "Untitled Document - " + path,
                                 "package-broken")
        n.show()
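The snippet relies on module-level state defined outside the excerpt. Under the assumption that gWorks is a crossrefapi Works client and gPapers a plain list, the surrounding setup would look roughly like:

import notify2
import pdftitle
from crossref.restful import Works

notify2.init('paper-indexer')  # hypothetical application name
gWorks = Works()               # assumption: crossrefapi client
gPapers = []                   # in-memory index of parsed papers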
Example #7
    "Ex- pression" : "Expression",
    "\u2013" : "-",
    ",and" : ", and" }

# name fixes
author_fix = { "NAIK" : "Naik"}


pdfs = glob.glob("final project reports/*.pdf")
titles = {}
for f in pdfs: 
    f_clean = re.sub("LATE_", "", f)
    bn = os.path.basename(f_clean)
    ss = bn.split("_")
    user_id = int(ss[1])
    title = problems[user_id] if (user_id in problems) else pdftitle.get_title_from_file(f)
    for k,v in toremove.items(): 
        title = re.sub(k, v, title)
    titles[user_id] = title

htmls = glob.glob("video_links/*.html")

parser = MyHTMLParser()

# Project groups can be downloaded from Courseworks (People -> Project Groups -> Import(!) -> Download Course Roster)
groups = pd.read_csv("Project groups.csv")
poster_data = {}
for i,f in enumerate(htmls):
    f_clean = re.sub("LATE_", "", f)
    bn = os.path.basename(f_clean)
    _,user_id,_ = bn.split("_")