Beispiel #1
0
    def get_pdf_metadata(self, pdf_file_stream):
        """Extract author, title and year from a PDF's document info.

        Args:
            pdf_file_stream: File-like object passed straight to
                ``PDFParser``.

        Returns:
            dict with the keys ``'author'``, ``'title'`` and ``'year'``.
            A key keeps its ``'UNKNOWN_*'`` placeholder when the matching
            info entry (``Author``, ``Title``, ``ModDate``) is missing or
            converts to an empty value.
        """
        metadata = {
            'author': 'UNKNOWN_AUTHOR',
            'title': 'UNKNOWN_TITLE',
            'year': 'UNKNOWN_YEAR'
        }

        pdf_parser = PDFParser(pdf_file_stream)
        pdf_doc = PDFDocument(pdf_parser)

        # A PDF without a document-info dictionary yields an empty ``info``
        # list; return the placeholders instead of raising IndexError.
        if not pdf_doc.info:
            return metadata
        pdf_metadata = pdf_doc.info[0]

        author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
        if author:
            metadata['author'] = author

        title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
        if title:
            metadata['title'] = title

        # The year is derived from the modification date entry.
        year = pdf_metadata_moddate_to_year(
            make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
        if year:
            metadata['year'] = year

        return metadata
Beispiel #2
0
    def get_pdf_metadata(pdf):
        """Get PDF metadata with PDF content.

        Extraction is best-effort: any error raised while reading the
        document info is logged at debug level and the values collected so
        far (or the ``'UNKNOWN_*'`` placeholders) are returned.

        Args:
            pdf: PDF content (in bytes)

        Returns:
            metadata: PDF metadata dictionary with the keys ``'author'``,
                ``'title'`` and ``'year'``

        """
        import logging

        metadata = {
            'author': 'UNKNOWN_AUTHOR',
            'title': 'UNKNOWN_TITLE',
            'year': 'UNKNOWN_YEAR'
        }

        # The context manager guarantees the temporary file is closed even
        # when writing or parser construction raises.
        with tempfile.TemporaryFile() as temp_pdf_file:
            temp_pdf_file.write(pdf)
            # Rewind so reading starts at the first byte regardless of
            # whether the parser seeks on its own.
            temp_pdf_file.seek(0)

            pdf_parser = PDFParser(temp_pdf_file)

            try:
                pdf_doc = PDFDocument(pdf_parser)
                pdf_metadata = pdf_doc.info[0]

                author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
                if author:
                    metadata['author'] = author

                title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
                if title:
                    metadata['title'] = title

                # The year is derived from the modification date entry.
                year = pdf_metadata_moddate_to_year(
                    make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
                if year:
                    metadata['year'] = year
            except Exception:
                # Deliberately broad: malformed PDFs must not abort the
                # caller, but the failure should still be traceable.
                logging.getLogger(__name__).debug(
                    'Could not extract PDF metadata', exc_info=True)

        return metadata