forked from onstash/PyCyanide
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pycyanide.py
140 lines (118 loc) · 4.22 KB
/
pycyanide.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""Script to download Cyanide & Happiness comics."""
__author__ = "Santosh Venkatraman<santosh.venk@gmail.com>"
from os import path, getcwd, mkdir, makedirs
import sys
import time
import logging
from PIL import Image
from StringIO import StringIO
from argparse import ArgumentParser
from dateutil.parser import parse as date_parse
from lxml.etree import HTML
from requests import get
from requests.exceptions import ReadTimeout, ConnectionError, \
ConnectTimeout
# Optional year cutoff for crawling; set from the --year CLI argument in main.
YEAR = None
# All downloaded comics are stored under ./comics relative to the working dir.
COMICS_DIR = path.join(getcwd(), "comics")
if not path.exists(COMICS_DIR):
    try:
        # makedirs creates intermediate directories as needed.  The original
        # mkdir with an ``except BaseException`` fallback caught far too much
        # (including KeyboardInterrupt); catch only OSError, which covers a
        # lost creation race with another process.
        makedirs(COMICS_DIR)
    except OSError:
        pass
def get_tree(url):
    """Fetch *url* and parse the response body into an lxml HTML tree.

    Returns None when the request times out or the connection fails;
    the error is logged rather than raised.
    """
    try:
        response = get(url, timeout=3)
    except (ReadTimeout, ConnectTimeout, ConnectionError) as error:
        logging.error(error)
        return None
    return HTML(response.content)
def fetch_data(url):
    """Scrape one comic page and return its metadata.

    Returns a dict with keys "number", "image", "permalink" and
    "metadata" ({"date": (year, month, day), "author": name}), or None
    when the page could not be fetched.  Exits the process when the
    comic predates the global YEAR cutoff.
    """
    page_tree = get_tree(url)
    if page_tree is None:
        return None
    # Publish date lives in the page's meta-data header.
    date = page_tree.xpath("//div[@class='meta-data']/div/h3/a/text()")[0]
    date = date_parse(date)
    if YEAR and date.year < YEAR:
        # Comics are crawled newest-first, so anything older than the
        # cutoff means the crawl is finished.
        sys.exit(0)
    image_link = page_tree.xpath("//img[@id='main-comic']/@src")[0]
    # Protocol-relative URLs ("//files...") need an explicit scheme.
    if image_link.startswith("//"):
        image_link = "http:{}".format(image_link)
    permalink = page_tree.xpath("//input[@id='permalink']/@value")[0]
    author = page_tree.xpath(
        "//small[@class='author-credit-name']/text()"
    )[0].strip()
    # BUG FIX: str.strip("by ") strips the *characters* 'b', 'y' and ' '
    # from both ends, mangling author names such as "Bobby".  Remove
    # only a leading "by " prefix instead.
    if author.startswith("by "):
        author = author[3:]
    comic_number = permalink.strip("/").split("/")[-1]
    return {
        "number": comic_number,
        "image": image_link,
        "permalink": permalink,
        "metadata": {
            "date": (date.year, date.month, date.day),
            "author": author
        }
    }
def generate_comic_link(number):
    """Build the explosm.net URL for comic *number*."""
    return "http://explosm.net/comics/{number}".format(number=number)
def process_comic(url):
    """Fetch metadata for the comic page at *url* and download its image."""
    download_comic(fetch_data(url))
def download_comic(data):
    """Save the comic described by *data* under COMICS_DIR/year/month/day.

    *data* is the dict produced by fetch_data (or falsy, in which case
    nothing happens).  Files already on disk are skipped.
    """
    if not data:
        return
    year, month, day = data.get("metadata").get("date")
    target_dir = path.join(COMICS_DIR, str(year), str(month), str(day))
    if not path.exists(target_dir):
        makedirs(target_dir)
    destination = path.join(
        target_dir, "{}.png".format(data.get("number"))
    )
    if path.exists(destination):
        print("Already downloaded comic - {destination}".format(
            destination=destination)
        )
        return
    # Fetch the image bytes and re-encode through PIL as PNG.
    response = get(data.get("image"), stream=True)
    Image.open(StringIO(response.content)).save(destination)
    print("Downloaded comic - {destination}".format(destination=destination))
def fetch_latest_comic():
    """Return the number of the most recent comic as an int.

    Exits with an error message when the latest-comic page cannot be
    fetched (i.e. fetch_data returned nothing).
    """
    latest = fetch_data("http://explosm.net/comics/latest")
    if latest:
        return int(latest.get("number"))
    sys.exit("Internet connection is ded!")
def generate_limits(arguments):
    """Derive (start, stop) crawl bounds from parsed CLI arguments.

    A missing --start falls back to the latest comic number; a missing
    --end falls back to 0 (crawl all the way back).
    """
    start = arguments.start if arguments.start else fetch_latest_comic()
    stop = arguments.end if arguments.end else 0
    return start, stop
def process_all_links(links):
    """Download every comic URL in *links*, retrying parse failures.

    URLs whose page markup could not be parsed (IndexError raised by
    the xpath lookups in fetch_data) are collected and retried
    recursively until none fail.  Ctrl-C aborts the whole run.
    """
    error_links = []
    # BUG FIX: the loop previously iterated the module-level
    # ``comic_links`` variable instead of the ``links`` parameter, so
    # the recursive retry call re-crawled everything rather than just
    # the failed URLs.
    for url in links:
        try:
            process_comic(url)
        except KeyboardInterrupt:
            sys.exit()
        except IndexError:
            error_links.append(url)
    if error_links:
        process_all_links(error_links)
if __name__ == '__main__':
    # CLI: choose the comic-number range and an optional year cutoff.
    argument_parser = ArgumentParser()
    argument_parser.add_argument("-s", "--start", type=int,
        help="Indicate starting comic number for crawling")
    argument_parser.add_argument("-e", "--end", type=int,
        help="Indicate ending comic number for crawling")
    argument_parser.add_argument("-y", "--year", type=int,
        help="Indicate year for crawling")
    arguments = argument_parser.parse_args()
    # BUG FIX: a ``global YEAR`` statement at module level is a no-op
    # (module scope *is* the global scope); plain assignment suffices.
    YEAR = arguments.year
    start, stop = generate_limits(arguments)
    # Crawl newest-first: comic numbers descend from start down to
    # (but excluding) stop.
    comic_links = map(generate_comic_link, xrange(start, stop, -1))
    process_all_links(comic_links)