/
vapic.py
executable file
·202 lines (171 loc) · 6.12 KB
/
vapic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import os
import sys
import defaultencoding
import htmltool
import cjk
import urllib,urllib2
import re
import getopt
import shutil
from collections import Counter
import hashlib
from PIL import Image
import StringIO
def get_image_from_buff(buff):
    """Open the raw image bytes in *buff* as a PIL image.

    The bytes are wrapped in an in-memory file object and handed to
    ``Image.open``; the resulting image object is returned.
    """
    stream = StringIO.StringIO(buff)
    return Image.open(stream)
def get_web_content_with_cache(url,debug=False):
    """Return the raw body of *url*, served from an on-disk cache when possible.

    Cache entries live in ``/tmp/vahelper/cache`` and are named after the
    first 8 hex digits of the MD5 of the URL.  An existing entry is returned
    as-is; nothing in this function ever expires or refreshes it.

    Args:
        url: Address to fetch.  Text (unicode) URLs are encoded as UTF-8
            before hashing so the cache key does not depend on the
            interpreter's default codec.
        debug: When true, report cache hits through ``printu``.

    Returns:
        The response body as a byte string.

    Raises:
        OSError: If the cache directory is missing and cannot be created.
        Any error raised by ``urllib2.urlopen`` or by writing the cache file
        propagates to the caller.
    """
    cache_dir = "/tmp/vahelper/cache"

    # hashlib needs bytes; encode text explicitly rather than implicitly.
    url_bytes = url if isinstance(url, bytes) else url.encode("utf-8")
    cache_hash = hashlib.md5(url_bytes).hexdigest()[0:8]
    cache_file = os.path.join(cache_dir, cache_hash)

    try:
        os.makedirs(cache_dir)
    except OSError:
        # "Already exists" is the normal case; anything else (permissions,
        # a plain file in the way, ...) must not be silently ignored.
        if not os.path.isdir(cache_dir):
            raise

    ## read cached content
    if os.path.isfile(cache_file):
        if debug:
            printu("read from cache file: " + cache_file)
        with open(cache_file, "rb") as f:
            return f.read()

    ## read from the internet
    # Google rejects requests that do not carry a valid User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0'}
    html_request = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(html_request)
    try:
        web_content = response.read()
    finally:
        # urllib2 responses are not context managers on Python 2.
        response.close()

    ## write cached content
    with open(cache_file, "wb") as f:
        f.write(web_content)
    return web_content
def get_google_content_pic_search(keyword,urlbase="https://www.google.com/search?",width=800,height=500,debug=False):
    """Fetch the Google image-search result page for *keyword*.

    The query string is URL-encoded, appended to *urlbase* and fetched through
    ``get_web_content_with_cache`` (*debug* is forwarded to it).  *width* and
    *height* are sent as the ``biw`` and ``bih`` query parameters.

    Returns the page decoded from UTF-8, i.e. as unicode text.
    """
    params = {
        'q': keyword,
        'ie': "utf-8",
        'oe': "utf-8",
        'safe': "off",          # safe-search filter disabled
        'filter': "1",          # duplicate filter
        'num': "30",            # result count
        'tbm': "isch",          # image search
        'biw': "%s" % width,    # taken from the width argument
        'bih': "%s" % height,   # taken from the height argument
        'sa': "N",
        'tab': "wi",
        'hl': "zh-TW",
        'um': "1",
    }
    query = urllib.urlencode(params)
    raw_page = get_web_content_with_cache(urlbase + query, debug)
    return raw_page.decode("utf-8")
def get_vaid(string):
    """Extract an ID such as ``abc-123`` from *string*.

    An ID is a run of ASCII letters, an optional hyphen, then one or more
    digits.  The first such occurrence is returned; when there is none the
    input is returned unchanged.
    """
    found = re.search(r"([a-zA-Z]+[-]?\d+)", string, re.IGNORECASE)
    if found is None:
        return string
    return found.group(1)
def get_vapic(keyword,path=os.path.abspath(os.path.curdir.decode()),num=3,height=700,width=500,verbose=False,debug=False):
    """Search Google Images for *keyword* and save the first large JPEGs found.

    Args:
        keyword: Search term passed to ``get_google_content_pic_search``.
        path: Where to save the images.  If it names a regular file, the
            images are written to that file's directory.
        num: Number of images to save; the loop stops once this many have
            been written.
        height: Accepted for interface compatibility.
        width: Accepted for interface compatibility.
            NOTE(review): neither *height* nor *width* is consulted; the size
            filter below is hard-coded to 700 x 500.  Wiring them in would
            change which images the defaults accept, so confirm the intended
            orientation first.
        verbose: When true, print each URL tried and any per-URL error.
        debug: When true, print the processed result page; also forwarded to
            the search helper.

    Each saved image is reported on stdout as ``(w,h)[file] <= [url]``.
    Failures on a single URL are skipped (best effort) and only reported when
    *verbose* is set.
    """
    # main() accepts a regular file for --path; joining a file name onto a
    # file would make every save fail, so fall back to its directory.
    if os.path.isfile(path):
        path = os.path.dirname(path)

    html = get_google_content_pic_search(keyword, debug=debug)
    # Presumably: resolve HTML entities, then replace tags with "||" so the
    # URLs embedded in attributes/text are easy to match -- see htmltool.
    html = htmltool.decode_entity(html)
    html = htmltool.remove_tags(html, repl="||")
    if debug:
        printu(html)

    # The dot is escaped so only a literal ".jpg" suffix ends the match.
    imgurls = re.findall(r"imgurl=([^&]*?\.jpg)", html, flags=re.I)
    for url in imgurls:
        if num == 0:
            break
        if verbose:
            printu("try url: %s" % url)
        try:
            ## get image from the internet (or the cache)
            content = get_web_content_with_cache(url)
            if not content:
                continue
            ## check image size -- PIL reports size as (width, height)
            image = get_image_from_buff(content)
            if image.size[0] < 700 or image.size[1] < 500:
                continue  ## skip small image
            ## save image to path
            filepath = os.path.join(path, os.path.basename(url))
            with open(filepath, "wb") as f:
                f.write(content)
            num = num - 1
            printu("(%4d,%4d)[%-32s] <= [%s]" % (image.size[0], image.size[1], os.path.relpath(filepath), url))
        except Exception as e:
            # Best effort: one bad URL must not abort the remaining downloads.
            if verbose:
                printu("Error:%s: %s: %s" % (type(e), str(e), url))
def printu(unistr):
    """Print *unistr* after encoding it with the interpreter's default codec."""
    encoded = unistr.encode()
    print(encoded)
def usage():
    """Print the command-line help text through ``printu``."""
    # The --debug line describes what the flag actually does: it dumps the
    # result page and reports cache hits, it does not stop the run.
    printu(
"""
Usage:
-k keyword | --keyword=keyword # Keyword to query
-p path | --path=path # Path to query
-n num | --num=num # number of image to download
-h pixel | --height=pixel # height pixel of image
-w pixel | --width=pixel # width pixel of image
-d | --debug # Dump html and report cache hits
-v | --verbose # verbose
"""
    )
def main():
    """Parse the command line and download pictures for the requested ID.

    Options are read with ``getopt`` (see ``usage`` for the list).  When a
    path is given, the search keyword is ``--keyword`` if present, otherwise
    the ID extracted from the path's base name by ``get_vaid``.  Without a
    path the usage text is printed.

    Exits with status 2 on an unknown option or a non-integer value for
    ``-n``/``-h``/``-w``, and with status 1 when the path does not exist.
    """
    ## handle parameters
    try:
        options, _ = getopt.getopt(
            sys.argv[1:],
            "k:p:n:h:w:dv",
            ["keyword=", "path=", "num=", "height=", "width=", "debug", "verbose"],
        )
    except getopt.GetoptError as e:
        # str(e) reads like "option -a not recognized"
        printu("Error: " + str(e))
        usage()
        sys.exit(2)

    opts = {
        "keyword": None,
        "path": None,
        "num": 2,
        "height": 0,
        "width": 0,
        "debug": False,
        "verbose": False,
    }
    # Options whose argument must be an integer, mapped to their opts key.
    int_options = {
        "-n": "num", "--num": "num",
        "-h": "height", "--height": "height",
        "-w": "width", "--width": "width",
    }
    for opt, arg in options:
        if opt in ("-k", "--keyword"):
            opts["keyword"] = arg.decode()
        elif opt in ("-p", "--path"):
            upath = os.path.normpath(arg.decode(sys.getdefaultencoding()))
            rpath = upath.encode()
            if os.path.isdir(rpath) or os.path.isfile(rpath):
                opts["path"] = os.path.abspath(rpath)
            else:
                printu("Error path:" + upath)
                sys.exit(1)
        elif opt in int_options:
            try:
                opts[int_options[opt]] = int(arg)
            except ValueError:
                # Report bad input like any other usage error instead of
                # dying with a traceback.
                printu("Error: option %s expects an integer, got %r" % (opt, arg))
                usage()
                sys.exit(2)
        elif opt in ("-d", "--debug"):
            opts["debug"] = True
        elif opt in ("-v", "--verbose"):
            opts["verbose"] = True

    # Fill in whichever dimension was not given.
    # NOTE(review): the fallback ratio (width:height = 800:536) does not match
    # the default pair (536 wide, 800 high), and the resulting values are
    # never passed to get_vapic() -- confirm the intended orientation.
    if opts["width"] == 0 and opts["height"] == 0:
        opts["width"] = 536
        opts["height"] = 800
    elif opts["width"] == 0:
        opts["width"] = opts["height"] * 800 // 536
    elif opts["height"] == 0:
        opts["height"] = opts["width"] * 536 // 800

    if not opts["path"]:
        usage()
        return

    vaid = opts["keyword"] if opts["keyword"] else get_vaid(os.path.basename(opts["path"]))
    printu(vaid)
    get_vapic(vaid, path=opts["path"], num=opts["num"], verbose=opts["verbose"], debug=opts["debug"])
if __name__ == "__main__":
    # Script entry point: Ctrl-C ends the run quietly instead of printing a
    # KeyboardInterrupt traceback.
    try:
        main()
    except KeyboardInterrupt:
        pass