-
Notifications
You must be signed in to change notification settings - Fork 0
/
PDF_manipulator.py
396 lines (344 loc) · 15 KB
/
PDF_manipulator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
from PyPDF2 import PdfFileReader
import os
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
import bibtexparser
import requests
import re
from pathlib import Path
import shutil
class RenamePDF:
    """Rename PDF files based on bibliographic metadata resolved via DOI.

    Pipeline: scan each PDF's text for a DOI, resolve it through
    Crossref.org for BibTeX metadata, then rename the file to
    "<Author> <Year> - <short title>.pdf" and move it to the destination
    directory.
    """

    def __init__(self, path, destination, fail):
        """
        :param path: source file, or directory containing PDFs
        :param destination: directory renamed files are moved into
        :param fail: path intended for files that could not be processed
        """
        self.path = path
        self.destination = destination
        self.fail_path = fail

    def pdfs(self):
        """
        Collect the PDF path(s) referenced by ``self.path``.

        A single file (or a non-existent path) is passed through unchanged
        as a one-element list, which lets individual files be run through
        the renamer directly. A directory is scanned for ``*.pdf`` entries.
        :return: list of full file paths
        """
        print(f"pdfs called: {self.path}")
        source = Path(self.path)
        if source.is_file() or not source.exists():
            return [self.path]
        # os.path.join works whether or not self.path has a trailing
        # separator (the original raw concatenation required one).
        return [os.path.join(self.path, name)
                for name in os.listdir(self.path)
                if name.endswith(".pdf")]

    def info(self, filename):
        """
        Read basic document metadata with PyPDF2's PdfFileReader.

        Publisher-embedded metadata is very inconsistent, so this approach
        was ultimately abandoned in favor of DOI lookup; kept for reference.
        :param filename: filename as full path
        :return: PyPDF2 document-info object, or None on failure
        """
        try:
            with open(filename, 'rb') as f:
                return PdfFileReader(f).getDocumentInfo()
        except Exception as e:
            # Best-effort: record the failing file and carry on.
            # TODO: derive the log location from self.fail_path instead of
            # this hard-coded path.
            print(f"Could not read metadata from {filename}: {e}")
            with open("../../SortedFiles/text/pdf/2020/07/PDF_error_log.txt", "a") as log:
                log.write(filename + "\n")
            return None

    def get_first_page(self, path):
        """
        Extract the first page of a PDF together with its text.

        NOTE: this was based on the incorrect assumption that the DOI is
        printed on the first page; it can in fact appear anywhere in the
        document, so this helper is of limited use.
        :param path: filename given as the full file path
        :return: (page, text) tuple, or None on failure
        """
        try:
            with open(path, 'rb') as f:
                pdf = PdfFileReader(f)
                # Pages are zero-indexed; getPage(1) previously returned
                # the *second* page.
                first_page = pdf.getPage(0)
                return first_page, first_page.extractText()
        except Exception as e:
            print(f"error getting page1: {e}")
            return None

    def extract_all_text(self, path, doi_only=False):
        """
        Pull all of the text out of a PDF as a single string.

        :param path: path to the pdf being evaluated
        :param doi_only: if True, only pages whose text contains "doi" are
            included (probably going away; not that helpful)
        :return: entire text of the pdf as a string, or None on failure
        """
        try:
            with open(path, 'rb') as f:
                pdf = PdfFileReader(f)
                pages = (page.extractText() for page in pdf.pages)
                if doi_only:
                    pages = (text for text in pages if "doi" in text)
                return "".join(pages)
        except Exception as e:
            print(f"error getting the file open or something: {e}")
            return None

    def get_doi(self, path):
        """
        Search a PDF's extracted text for a DOI.

        A master regex catches the vast majority; a handful of specialized
        per-journal patterns are tried when it fails.
        :param path: full path to the pdf
        :return: matched DOI string, or None
        """
        # TODO: alter structure so that each doi match then tries crossref
        # TODO: if match is successful, ultimately return the title
        # TODO: if match is unsuccessful, try the next strategy
        # TODO: add searching metadata info for doi tag
        print(path)
        # Alternative doi registrars that don't resolve through Crossref.
        exclude_doi_source = ["zenodo"]
        text = self.extract_all_text(path)
        if text is None:
            return None
        doi = None
        # Master pattern. Raw strings are used so \b is a real word
        # boundary (in the original non-raw string it was a literal
        # backspace, so the "doing" exclusion never worked).
        master_patterns = [
            r"(?:https?://.{0,5})?"    # optional scheme plus up to 5 chars
            r"doi(?!ng\b)"             # "doi" but not "doing"
            r"(:?\n)?"                 # possible ":" and/or newline
            r"(?:\.org)?"              # possible ".org"
            r"(?:(\S)(?!Article|Download|Wiley|1Department|Department|$))*"
            # non-whitespace chars minus known journal boilerplate; the
            # keyword blacklist is probably not sustainable, as it can
            # fail in unique ways for each PDF encountered
            r"(?:[a-z0-9]|-\n+?"       # wrap-around: hyphen before newline
            r"(?:(\S)(?!Division|$))+)"  # characters after a "-\n"
        ]
        # Specialized patterns for particularly wonky journals.
        special_patterns = [r"\S+sciadv.\d+", r"\S+nature+\d+",
                            r"\S+/science\.\S+", r"DOI: \S+/science\.\S+"]
        for pattern in master_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match is not None:
                doi = match[0]
                if any(source in doi for source in exclude_doi_source):
                    doi = None
        if doi is None:
            for pattern in special_patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match is not None:
                    doi = match[0]
        return doi

    def get_crossref_metadata(self, doi):
        """
        Retrieve BibTeX metadata for a DOI from the Crossref database.

        The DOI must first be normalized into a valid https://doi.org/ URL;
        the branches below handle the various shapes the regexes extract.
        Crossref.org supports multiple Accept headers returning different
        formats — changing the header here would break this function.
        :param doi: document object identifier string (or None)
        :return: BibTeX text from Crossref, or None if doi is absent/malformed
        """
        if doi is None:
            return None
        # Strip newlines and whitespace picked up during text extraction.
        doi = doi.replace("\n", "").replace(" ", "")
        if "DOI:http://" in doi:
            url = "https://" + doi[11:]
        elif any(prefix in doi for prefix in
                 ["https://dx.doi.org", "http://dx.doi.org", "https://doi.org"]):
            url = doi
        elif any(prefix in doi for prefix in ["doi.org/", "DOI.org/"]):
            print(doi)
            url = "https://" + doi
        elif any(prefix in doi for prefix in ["DOI:", "doi:"]):
            url = "https://doi.org/" + doi[4:]
        # NOTE: the original also checked for "DOI "/"doi ", but spaces
        # are stripped above, so that branch was unreachable and is gone.
        elif any(prefix in doi for prefix in ["DOI", "doi"]):
            url = "https://doi.org/" + doi[3:]
        elif "sciadv" in doi:
            # Special patterns that return only the bare unique identifier.
            url = "https://doi.org/" + doi
        else:
            print("maybe the doi is malformed?")
            return None
        # This requires being online and depends on crossref.org continuing
        # to provide the service; given its longevity, a reasonable bet.
        headers = {
            'accept': 'text/bibliography; style=bibtex',
        }
        print(f"url: {url}")
        # timeout so a hung request cannot stall the whole batch forever
        crossref_request = requests.get(url, headers=headers, timeout=30)
        return crossref_request.text

    def make_bibtex_entries(self, meta_data):
        """
        Persist Crossref metadata and parse it into a bibliography database.

        temp.bib is overwritten on each call; repository.bib accumulates
        over time, so a bibliography builds up automatically as papers are
        imported.
        :param meta_data: raw BibTeX text from Crossref.org
        :return: parsed bibtexparser database, or None
        """
        # TODO: base location of temp.bib and repository.bib on destination path
        if meta_data is None:
            return None
        with open("../Scratch/temp.bib", 'w', encoding='utf-8') as bibfile:
            bibfile.write(meta_data)
        with open("../Scratch/repository.bib", 'a', encoding='utf-8') as bibfile:
            bibfile.write(meta_data)
        with open("../Scratch/temp.bib", encoding='utf-8') as bibtexfile:
            return bibtexparser.load(bibtexfile)

    def make_titles(self, bib_database):
        """
        Build a simplified filename from bibliographic data.

        Format: "<first author surname> <year> - <up to 4 title words>.pdf",
        with common prepositions/articles and punctuation stripped.
        :param bib_database: bibliographic formatted metadata (has .entries)
        :return: filename string, or None if metadata is missing/unreadable
        """
        if bib_database is None:
            return None
        if not bib_database.entries:
            print("File not readable")
            return None
        entry = bib_database.entries[0]
        author = entry['author'].split(',')[0]
        year = entry['year']
        prune_words = {"of", "and", "the", "The", "with", "at", "by",
                       "in", "for", "after", "against", "instead",
                       "to", "between", "over"}
        prune_punctuation = ["&", "!", "@", ":", ",", "$", "#",
                             "%", "*", "?", ";", "/", "\\", "'", '"',
                             "\x80", "\x99"]
        # Drop every occurrence of a prune word (list.remove in the
        # original only dropped the first), then keep at most 4 words.
        words = [w for w in entry['title'].split(' ') if w not in prune_words]
        title_as_string = ' '.join(words[:4])
        for character in prune_punctuation:
            title_as_string = title_as_string.replace(character, "")
        return f"{author} {year} - {title_as_string}.pdf"

    def rename(self):
        """
        Rename every source PDF and move it into the destination directory.

        Runs the full pipeline (find PDFs -> DOI -> Crossref metadata ->
        bibtex -> title). If any title could not be derived the whole
        batch is aborted (returns None). If a file of the same name already
        exists at the destination, a numeric suffix is appended and
        incremented until the name is unique, so no file is overwritten.
        """
        # TODO: determine best location of these self.* attributes
        self.pdf_list_as_path = self.pdfs()
        self.doi_list = [self.get_doi(p) for p in self.pdf_list_as_path]
        self.pdf_info = [self.info(p) for p in self.pdf_list_as_path]
        self.meta_list = [self.get_crossref_metadata(d) for d in self.doi_list]
        self.bibtex_entries = [self.make_bibtex_entries(m) for m in self.meta_list]
        self.final_titles = [self.make_titles(b) for b in self.bibtex_entries]
        if None in self.final_titles:
            return None
        print("rename called")
        destination_path = Path(self.destination)
        for file, title in zip(self.pdf_list_as_path, self.final_titles):
            final_name = Path(title)
            new_name = destination_path / final_name
            increment = 0
            # Number the file until the target name is unique.
            while new_name.exists():
                increment += 1
                new_name = destination_path / f'{final_name.stem}_{increment}{final_name.suffix}'
            shutil.move(file, new_name)
class GetImages:
    """Placeholder for future image-extraction support.

    Intended scope:
      - pull all of the images out of a PDF
      - put those images into a .pptx
      - decide whether this belongs in a separate library

    Attempted so far:
      - minecart: raised Attribute errors (may need reinstall?)
      - PyMuPDF: did not export all images
    """
    pass
# if __name__ == '__main__':
#
# path="C:/Users/gregoryn/test/"
# destination="C:/Users/gregoryn/test2/"
# pdf=RenamePDF(path, destination)
# pdf.rename()
# #doi_page=pdf.extract_all_text(pdf.pdf_list_as_path[0], doi_only=True)
# url="https://doi.org/10.1186/s12984-019-0535-7"
# headers={
# 'accept':'text/bibliography; style=bibtex',
# }
# r=requests.get(url, headers=headers)
# print(r.text)
# headers={
# 'accept':'application/citeproc+json'
# }
#
# r=requests.get(url, headers=headers)
# print(r.text)
#
# # file_list=PyPDF.pdfs(".")
# r=requests.get(url, headers=headers)
# for x in file_list:
# PyPDF.info(x)
# file_list = get_pdfs(".")
# for x in file_list:
# PyPDF2_info(x)
# print(x)
#
#
# remote_list=get_pdfs(path)
# print(remote_list)
# for x in remote_list:
# PyPDF2_info(x)
# print(x)
# data=bibtex(path)
# data.info()