forked from brandongalbraith/springer-dl
/
springer_dl.py
executable file
·89 lines (73 loc) · 3.16 KB
/
springer_dl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python3
import argparse
import os
import sys
import urllib.parse
from lib.util import GithubParser, Parser, request, download_file, print_book
if __name__ == '__main__':
desc = 'springer-dl: download the set of books Springer released for free '\
'during the 2020 COVID-19 outbreak'
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('--path', dest='path', type=str,
help='path to download books', required=True)
parser.add_argument('-t', '--filter-title', help="Keywords to filter category")
parser.add_argument('-c', '--filter-category', help="Keywords to filter book title")
parser.add_argument('--pretty', action='store_true', help="Pretty print the books.")
parser.add_argument('--dryrun', default=False, action="store_true",
help="Print the list of links and titles. Don't download.")
args = parser.parse_args()
dl_path = args.path
os.makedirs(dl_path, exist_ok=True)
HEADERS = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like '\
'Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) '\
'CriOS/81.0.4044.124 Mobile/15E148 Safari/604.1'}
BOOK_PAGE = "https://hnarayanan.github.io/springer-books/"
r = request(BOOK_PAGE, headers=HEADERS)
html = r.read().decode('utf-8')
p = GithubParser()
p.feed(html)
prefix = 'http://link.springer.com/openurl'
books = [book for book in p.links if book.href.startswith(prefix)]
if args.filter_title:
text = args.filter_title.lower()
books = [b for b in books if text in b.title.lower()]
if args.filter_category:
text = args.filter_category.lower()
books = [b for b in books if text in b.category.lower()]
if args.dryrun:
for b in books:
if args.pretty:
print_book(b)
else:
print(b)
sys.exit()
for url, isbn, category, title in books:
r = request(url, HEADERS)
end_url = r.url
html = r.read().decode('utf-8')
p = Parser()
p.feed(html)
links = [x for x in p.links if 'content/pdf' in x or 'download/epub' in x]
links = [urllib.parse.urljoin(end_url, x) + '?javascript-disabled=true' for x in links]
book_title = p.title[0].split(' |')[0]
book_path = os.path.join(dl_path, book_title)
os.makedirs(book_path, exist_ok=True)
if 'epub' in links[1]:
links = links[:2]
else:
links = links[:1]
for link in links:
filename = '%s - %s' % (book_title, isbn) + os.path.splitext(
urllib.parse.urlparse(link).path)[-1]
filepath = os.path.join(book_path, filename)
if os.path.exists(filepath):
continue
try:
print('[+] %s' % filename)
download_file(link, HEADERS, filepath)
except KeyboardInterrupt:
if os.path.exists(filepath):
os.remove(filepath)
sys.exit()
except:
print('[x] error downloading %s, skipping...' % filename)