/
mangascrapper.py
99 lines (82 loc) · 3.74 KB
/
mangascrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import time
import os
import requests
import re
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
from urllib.parse import urlparse
from Notifier import Notifier
from ListManager import ListManager
from Manga import Manga
# instantiate a chrome options object so you can set the size and headless preference
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920x1080")
driver = webdriver.Chrome(chrome_options=chrome_options,
executable_path='./chromedriver')
time.sleep(1)
# Notifier to notify the user when a new chapter of a manga is downloaded
notifier = Notifier()
liste_manager = ListManager()
for entry in liste_manager.lirescan_list:
manga = Manga(entry["name"], entry["url"], entry["last_chapter"])
driver.get(manga.url)
# TODO appeler le bon scrapper correspondant au site du manga actuel
# check if manga dir exist, if not, creates it
dir_name = './images/'+manga.dir_name+'/'
if not os.path.isdir(dir_name):
os.mkdir(dir_name)
is_last_chapter = False
previous_chapter = manga.chapter
while is_last_chapter == False:
# locating the chapter selector
chapter_selector = Select(driver.find_element_by_name('chapitres'))
options = chapter_selector.options
if int(options[0].text) == previous_chapter:
is_last_chapter = True
else:
# finding the next chapter not downloaded
for i in reversed(options):
if int(i.text) > previous_chapter:
previous_chapter = int(i.text)
liste_manager.updateLastChapter(manga, previous_chapter)
chapter = i.text
i.click()
break
# getting the correct number of pages in the chapter
page_select = driver.find_elements_by_class_name(
'page-link')
for i in range(len(page_select)):
if page_select[i].text == 'Suiv':
nb_pages = page_select[i-1].text
break
# checking if the image is png or jpg
src = driver.find_element_by_id(
'image_scan').get_attribute('src')
parsed = urlparse(src)
root, ext = os.path.splitext(parsed.path)
# checking if the images are correctly named. This is to prevent downloading us version of scan
image_checker = root.rsplit('/', 6)[6]
if re.search(r'^\d\d', image_checker):
print('test')
# creating the dir to put the images sorted by chapters
dir_name = './images/'+manga.dir_name+'/'+chapter+'/'
if not os.path.isdir(dir_name):
os.mkdir(dir_name)
# getting the generic link to the asset
src = driver.find_element_by_id(
'image_scan').get_attribute('src').rsplit('/', 1)[0]
# downloading all the pages for the chapter
page = 0
for i in range(int(nb_pages)+1):
page_nb = "%02d" % page
picture_name = manga.name+chapter+"_"+str(page)+ext
# preventing missnamed files on the website (ie : 00.png instead of 01.png)
response = requests.get(src+'/'+page_nb+ext)
if response.status_code == 200:
with open(dir_name+picture_name, "wb") as f:
f.write(requests.get(src+'/'+page_nb+ext).content)
page += 1
notifier.prepare(manga.name)
print('pas de nouveau chapitres')