class WebURLCrawler(object):

    HELP_MENU = (
        '==================================',
        '  WebURLCrawler help',
        '==================================',
        'option:',
        '  -s path: path to be collected',
        '  -t file: file to save urls',
    )

    pr = Print('WebURLCrawler')

    def __init__(self, name=None):
        self._name = name
        self._src = None
        self._tgt = None

    def collect_web_url(self):
        urls = list()
        # read all of the web_url.txt files under the source tree.
        for rt, ds, fs in os.walk(self._src):
            if fs:
                for f in fs:
                    if f == 'web_url.txt':
                        f = os.path.join(rt, f)
                        with open(f, 'r') as fd:
                            # first line is the title, second line is the url.
                            fd.readline()
                            urls.append(fd.readline().strip())
        # write the collected urls to the target file.
        with open(self._tgt, 'w') as fd:
            for url in urls:
                fd.write('%s\n' % url)

    def get_user_input(self):
        args = Base.get_user_input('hs:t:')
        if '-h' in args:
            Base.print_help(self.HELP_MENU)
        if '-s' in args:
            self._src = re.sub('/$', '', args['-s'])
        if '-t' in args:
            self._tgt = Path.get_abs_path(args['-t'])
        return args

    def main(self):
        self.get_user_input()
        if not self._src:
            Base.print_exit('no -s, -h for help!')
        if not self._tgt:
            self._tgt = '%s/%s.txt' % (self._src, os.path.basename(self._src))
        # collect urls.
        self.collect_web_url()
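
# Usage sketch (assumption, not part of the original module): WebURLCrawler is
# normally driven from the command line, e.g.
#   ./weburlcrawler.py -s ~/Downloads/WebImage -t ~/Downloads/all_urls.txt
# It can also be reused programmatically by setting the source and target the
# same way main() does and calling collect_web_url(); the paths below are
# illustrative placeholders only.
#
#   crawler = WebURLCrawler('WebURLCrawler')
#   crawler._src = '/tmp/WebImage'        # tree containing web_url.txt files
#   crawler._tgt = '/tmp/web_urls.txt'    # output file, one url per line
#   crawler.collect_web_url()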
class WebContent(object):

    CONTEXT_UNVERIFIED = ssl._create_unverified_context()
    CONTEXT_TLSv1 = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    WEB_URL_FILE = r'web_url.txt'

    pr = Print('WebContent')

    @classmethod
    def url_is_https(cls, url):
        if re.match('https://', url):
            return True
        else:
            return False

    @classmethod
    def get_url_charset(cls, html=None, content_type=None):
        charset = None
        pattern = re.compile('charset=[a-z0-9-]*', flags=re.I)
        if content_type:
            charset = pattern.search(
                re.sub('charset=(\"|\')', 'charset=', content_type))
        if all((html, not charset)):
            charset = pattern.search(
                re.sub('charset=(\"|\')', 'charset=', str(html)))
        # get data
        if charset:
            charset = charset.group()
            charset = charset[len('charset='):].upper()
        return charset

    @classmethod
    def get_html(cls, url, context=None, retry_times=3, view=False):
        if view:
            cls.pr.pr_info('Downloading: %s' % url)
        html_content = None
        while all((retry_times, not html_content)):
            retry_times -= 1
            url_charset = None
            req = Request(url, headers=URL_HEADER)
            try:
                html = urlopen(req, context=context)
            except URLError as e:
                cls.pr.pr_warn(str(e))
                html_content = None
            else:
                content_type = html.getheader('Content-Type')
                if content_type:
                    url_charset = cls.get_url_charset(content_type=content_type)
                data = html.read()
                encoding = html.getheader('Content-Encoding')
                if encoding == 'gzip':
                    # the gzipped body is bytes, so wrap it in BytesIO.
                    data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
                if data:
                    for charset in CHARSETS:
                        if url_charset:
                            html_content = data.decode(url_charset, 'ignore').encode('utf-8')
                            break
                        else:
                            html_content = data.decode(charset, 'ignore').encode('utf-8')
                            if html_content:
                                url_charset = cls.get_url_charset(html_content)
                                if not url_charset:
                                    url_charset = DEFAULT_CHARSET
                                elif charset == url_charset:
                                    break
                else:
                    #cls.pr.pr_err('Error: fail to get data from html')
                    html_content = None
        return html_content

    @classmethod
    def get_url_content(cls, url, retry_times=3, view=True, path=None):
        if cls.url_is_https(url):
            content = cls.get_html(url=url,
                                   context=cls.CONTEXT_UNVERIFIED,
                                   retry_times=retry_times,
                                   view=view)
        else:
            content = cls.get_html(url=url, retry_times=retry_times, view=view)
        # save content to path.
        if all((content, path)):
            Path.make_path(path)
            f = '%s/%s' % (path, cls.convert_url_to_title(url))
            if File.get_exname(f) != '.html':
                f = f + '.html'
            # content is utf-8 encoded bytes, so write in binary mode.
            with open(f, 'wb') as fd:
                fd.write(content)
        return content

    @classmethod
    def urlretrieve_callback(cls, blocknum, blocksize, totalsize):
        if not totalsize:
            return
        percent = 100.0 * blocknum * blocksize / totalsize
        if percent > 100:
            percent = 100
        cls.pr.pr_dbg("%.2f%%" % percent)

    @classmethod
    def retrieve_url_file(cls, url, path, view=False):
        fname = os.path.join(path, url.split('/')[-1])
        if not os.path.exists(fname):
            if view:
                cls.pr.pr_info('retrieve: %s' % fname)
                try:
                    urllib.request.urlretrieve(url, fname, cls.urlretrieve_callback)
                except (socket.error, ZeroDivisionError) as e:
                    cls.pr.pr_info('urlretrieve error: %s' % str(e))
            else:
                try:
                    urllib.request.urlretrieve(url, fname)
                except socket.error as e:
                    cls.pr.pr_warn('%s, retrieve %s failed.' % (str(e), url))

    @classmethod
    def urlopen_get_url_file(cls, url, path, ssl=False, headers=None, view=False):
        fname = os.path.join(path, url.split('/')[-1])
        if not os.path.exists(fname):
            req = Request(url=url)
            if headers:
                for key, value in headers.items():
                    req.add_header(key, value)
            if ssl:
                context = cls.CONTEXT_UNVERIFIED
            else:
                context = None
            try:
                r = urlopen(req, context=context)
            except (URLError, HTTPError) as e:
                cls.pr.pr_warn('%s, uget %s failed.' % (str(e), url))
            else:
                try:
                    data = r.read()
                except ConnectionResetError as e:
                    cls.pr.pr_err(str(e))
                else:
                    with open(fname, 'wb') as f:
                        if view:
                            cls.pr.pr_info('uget: %s' % fname)
                        f.write(data)

    @classmethod
    def requests_get_url_file(cls, url, path, view=False):
        fname = os.path.join(path, url.split('/')[-1])
        if not os.path.exists(fname):
            r = requests.get(url)
            with open(fname, 'wb') as f:
                if view:
                    cls.pr.pr_info('requests get: %s' % fname)
                f.write(r.content)

    @classmethod
    def wget_url_file(cls, url, path,
                      config='-c -t 3 -T 10 -U \'%s\'' % USER_AGENTS['Kubuntu'],
                      view=False):
        if view:
            cmd = 'wget %s -P %s %s -nv' % (config, path, url)
        else:
            cmd = 'wget %s -P %s %s -q' % (config, path, url)
        try:
            cls.pr.pr_dbg('wget cmd: %s' % cmd)
            return subprocess.check_output(cmd, shell=True)
        except subprocess.CalledProcessError:
            return None

    @classmethod
    def get_url_title(cls, html_content, pattern=None):
        if not pattern:
            pattern = re.compile(b'<title>.+</title>')
        data = pattern.search(html_content)
        if data:
            data = data.group()
            return data[len('<title>'):len(data) - len('</title>')]
        else:
            return None

    @classmethod
    def get_url_pages(cls, html, pattern=None):
        if not pattern:
            # find all of \d+/\d+
            pattern = re.compile('\d+/\d+')
        data = pattern.findall(str(html))
        if data:
            # match ^1/\d+$ to get the number of pages.
            pattern = re.compile('^1/\d+$')
            for d in data:
                d = pattern.match(d)
                if d:
                    # get the number of pages and return an int.
                    pages = int(re.compile('\d+').findall(d.group())[1])
                    break
            else:
                pages = None
        else:
            pages = None
        return pages

    @classmethod
    def get_url_base_and_num(cls, url):
        base = None
        num = None
        # pure numbers.
        num = re.compile('^\d+$').search(url)
        if num:
            num = num.group()
        else:
            num = re.compile('(\d)+(/)?(.html)?$').search(url)
            if num:
                num = re.compile('\d+').search(num.group()).group()
                base = re.sub(num, 'URLID', url)
        return base, num

    @classmethod
    def set_url_base_and_num(cls, base, num):
        if base:
            return re.sub('URLID', str(num), base)
        else:
            return num

    @classmethod
    def convert_url_to_title(cls, url):
        return File.reclaim_name(re.sub('/$', '', url))
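
# Minimal sketch (assumption, not part of the original module): how the
# WebContent helpers above compose when called directly from other code.
# The url and path below are placeholders.
#
#   url = 'https://example.com/page/1.html'
#   html = WebContent.get_url_content(url, view=True)       # utf-8 bytes or None
#   if html:
#       title = WebContent.get_url_title(html)               # bytes between <title> tags
#       pages = WebContent.get_url_pages(html)                # int or None
#       base, num = WebContent.get_url_base_and_num(url)      # ('.../page/URLID.html', '1')
#       WebContent.wget_url_file(url, '/tmp/dl', view=True)   # shell out to wget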
    '    wget: using wget to download file',
    '    rtrv: using retrieve to download file',
    '    rget: using requests to download file',
    '    uget: using urlopen to download file',
    '    html: download html of url',
    '  -v:',
    '    view info of webcontent.',
)

path = None
url = None
df = None
view = False
wc = WebContent()
pr = Print(wc.__class__.__name__)

args = Base.get_user_input('hp:u:d:v')
if '-h' in args:
    Base.print_help(HELP_MENU)
if '-p' in args:
    path = Path.get_abs_path(args['-p'])
if '-u' in args:
    url = args['-u']
if '-v' in args:
    view = True
    wc.pr.set_pr_level(0x07)
if '-d' in args:
    df_funcs = {
        'wget': wc.wget_url_file,
        'rtrv': wc.retrieve_url_file,
    '  -c img: check if img is an image file',
    '    img: the path of image file',
    '  -r path,(w,h): remove small-size images',
    '    path: path of dir or file',
    '  -R path: reclaim image format',
    '    path: path of dir or file',
    '  -o path,rename,nz: rename images in order',
    '    path: path of images',
    '    rename: the format of image to be renamed',
    '    nz: True sets %0d, False sets %d',
    '  -i img: show detail info of image file',
    '    img: the path of image file',
)

Img = Image()
pr = Print(Img.__class__.__name__)
xval = None

args = Base.get_user_input('hc:r:R:x:o:i:')
if '-h' in args:
    Base.print_help(HELP_MENU)
if '-c' in args:
    result = Img.image_file(Path.get_abs_path(args['-c']))
    pr.pr_info(result)
if '-r' in args:
    data = args['-r'].split(',')
    path = data[0]
    # path, w and h are expected, so three fields are required.
    if len(data) >= 3:
        w = data[1]
        h = data[2]
        Img.remove_small_image(path, int(w), int(h))
    else:
class WebImage(object):

    help_menu = (
        '==================================',
        '  WebImage help',
        '==================================',
        'option:',
        '  -u url:',
        '    url of web to be downloaded',
        '  -n num:',
        '    number of webs to be downloaded',
        '  -p path:',
        '    root path to store images.',
        '  -v:',
        '    view info while downloading.',
        '  -x val:',
        '    val for expand cmd.',
        '  -m mode:',
        '    wget: using wget to download images',
        '    rtrv: using retrieve to download images',
        '    rget: using requests to download images',
        '    uget: using urlopen to download images',
        '  -R file:',
        '    re config file for re_image_url.',
        '  -t num:',
        '    set max number of threads to download webs.',
    )

    def __init__(self, name=None):
        self._name = name
        self._com = None
        self._url_base = None
        self._url = None
        self._num = 1
        self._path = '%s/%s' % (Base.DEFAULT_DWN_PATH, self.__class__.__name__)
        self._re_image_url = [
            #re.compile('src=[\'|\"]?(http[s]?://.+\.(?:jpg|png|gif|bmp|jpeg))[\'|\"]?', re.I),
            #re.compile('src=[\'|\"]?(/.+\.(?:jpg|png|gif|bmp|jpeg))[\'|\"]?', re.I),
            re.compile('src=[\'|\"]?(http[s]?://[a-z0-9\./-]+\.(?:jpg|png|gif|bmp|jpeg))[\'|\"]?', re.I),
            re.compile('src=[\'|\"]?(/[a-z0-9\./-]+\.(?:jpg|png|gif|bmp|jpeg))[\'|\"]?', re.I),
        ]
        self._ex_re_image_url = None
        self._title = None
        self._remove_small_image = True
        self._view = False
        self._xval = None
        self._dl_image = self.urlopen_get_url_image
        self._redundant_title = None
        self._pr = Print(self.__class__.__name__)
        self.__dbg = 0
        self._thread_max = 5
        self._thread_queue = None

    def get_image_url(self, html):
        pattern = self._re_image_url
        imgs = list()
        # find image urls with one or more compiled patterns.
        try:
            if type(pattern) == list:
                for pt in pattern:
                    imgs = imgs + pt.findall(str(html))
            else:
                imgs = pattern.findall(str(html))
            #self._pr.pr_dbg('%s' % imgs)
        except TypeError as e:
            self._pr.pr_err('%s: failed to findall image url' % str(e))
        return imgs

    def get_image_url_of_pages(self, pages, header_content=None):
        limg = list()
        url_pages = self.get_url_of_pages(pages)
        for index in range(len(url_pages)):
            if all((index == 0, header_content)):
                url_content = header_content
            else:
                url_content = self.get_url_content(url_pages[index])
            if not url_content:
                self._pr.pr_err('failed to download %s sub web' % url_pages[index])
                continue
            imgs = self.get_image_url(url_content)
            for img in imgs:
                limg.append(img)
        return limg

    def get_image_raw_url(self, url):
        # relative urls are prefixed with the site base.
        if not re.match('http(s)?:', url):
            url = '%s%s' % (self._com, url)
        return url

    def retrieve_url_image(self, url, path, view=False):
        return WebContent.retrieve_url_file(url, path, view=view)

    def wget_url_image(self, url, path, view=False):
        return WebContent.wget_url_file(
            url, path,
            config="-c -t 3 -T 10 -U \'%s\'" % USER_AGENTS['AppleWebKit/537.36'],
            view=view)

    def requests_get_url_image(self, url, path, view=False):
        return WebContent.requests_get_url_file(url, path, view=view)

    def urlopen_get_url_image(self, url, path, view=False):
        headers = {
            'User-Agent': '%s' % USER_AGENTS['AppleWebKit/537.36'],
            'GET': url,
            'Referer': self._com,
        }
        return WebContent.urlopen_get_url_file(
            url, path,
            ssl=WebContent.url_is_https(url),
            headers=headers,
            view=view)

    # download image of url.
    def download_image(self, url, path):
        if self._dl_image:
            Path.make_path(path)
            # __dbg doubles as the view flag of the downloader.
            self._dl_image(url, path, self.__dbg)

    def get_url_content(self, url, view=False):
        return WebContent.get_url_content(url=url, view=view)

    def get_title(self, html, pattern=None):
        title = WebContent.get_url_title(html, pattern)
        # no <title> found: let the caller fall back to the url-based title.
        if not title:
            return None
        title = title.decode()
        if self._redundant_title:
            for rt in self._redundant_title:
                title = title.replace(rt, '')
        return title

    def get_pages(self, html, pattern=None):
        return WebContent.get_url_pages(html, pattern)

    def download_images(self, imgs, path):
        for img in imgs:
            self.download_image(self.get_image_raw_url(img), path)

    def get_url_of_pages(self, num):
        # map() returns an iterator in Python 3, so build a list before insert().
        url = list(map(lambda x: WebContent.set_url_base_and_num(
            self._url_base, '%s/%d' % (int(self._url), x)), range(2, num + 1)))
        url.insert(0, WebContent.set_url_base_and_num(self._url_base, self._url))
        return url

    def get_url_address(self, url_base, url):
        return WebContent.set_url_base_and_num(url_base, url)

    def convert_url_to_title(self, url):
        return WebContent.convert_url_to_title(url)

    def store_web_info(self, path, title, url):
        with open('%s/%s' % (path, WebContent.WEB_URL_FILE), 'w') as fd:
            fd.write('%s\n%s' % (title, url))

    def store_url_of_images(self, path, urls):
        with open('%s/%s' % (path, WebContent.WEB_URL_FILE), 'a') as fd:
            fd.write('\n')
            fd.write('\n')
            fd.write('url of imgs:\n')
            for url in urls:
                fd.write('%s\n' % url)

    def output_image_exists(self, path):
        for rt, ds, fs in os.walk(path):
            if fs:
                for f in fs:
                    f = os.path.join(rt, f)
                    if Image.image_file2(f):
                        return True
        return False

    def add_external_re_image_url(self):
        if self._ex_re_image_url:
            relist = list()
            try:
                with open(self._ex_re_image_url, 'r') as fd:
                    lines = fd.readlines()
                    # compile all of the lines.
                    for line in lines:
                        line = re.sub('\n', '', line)
                        relist.append(re.compile(line))
                    # append the built-in patterns.
                    if type(self._re_image_url) == list:
                        for r in self._re_image_url:
                            relist.append(r)
                    else:
                        relist.append(self._re_image_url)
                    # update re_image_url
                    self._re_image_url = relist
            except IOError as e:
                self._pr.pr_err('%s, failed to open %s' % (str(e), self._ex_re_image_url))

    # process images of a url web.
    def process_url_web(self, url, data=None):
        # get header web
        header_content = self.get_url_content(url, view=False)
        if not header_content:
            if self._thread_queue:
                self._thread_queue.get()
            self._pr.pr_err('failed to download %s header web.' % url)
            return
        # get url title.
        title = self.get_title(header_content, self._title)
        if not title:
            title = self.convert_url_to_title(url)
        self._pr.pr_dbg('title: %s' % title)
        # create path of title to store data.
        subpath = os.path.join(self._path, title)
        self._pr.pr_dbg('subpath: %s' % subpath)
        # get count of pages
        pages = self.get_pages(header_content)
        self._pr.pr_dbg('get pages: %s' % pages)
        if not pages:
            limg = self.get_image_url(header_content)
        else:
            limg = self.get_image_url_of_pages(pages, header_content)
        # filter duplicate images
        limg = set(limg)
        # self._pr.pr_dbg('image url list: %s' % limg)
        # download images
        if limg:
            # download all of the images.
            self.download_images(limg, subpath)
            # write web info
            self.store_web_info(subpath, title, url)
            # reclaim images, remove small images
            if self._remove_small_image:
                Image.reclaim_path_images(subpath, xfunc=Image.remove_small_image)
            else:
                Image.reclaim_path_images(subpath)
            # show output info.
            if self._view:
                if self.output_image_exists(subpath):
                    self._pr.pr_info('output: %s' % (subpath))
                else:
                    self._pr.pr_info('output no images: %s' % (subpath))
            # save url of images if it is full debug.
            if self.__dbg >= 2:
                self.store_url_of_images(subpath, limg)
        # release queue
        if self._thread_queue:
            self._thread_queue.get()
        if data:
            self._pr.pr_info('%d/%d: process %s done!' % (data[0], data[1], url))
        return subpath

    def get_user_input(self, args=None):
        if not args:
            args = Base.get_user_input('hu:n:p:x:m:i:R:t:vdD')
        if '-h' in args:
            Base.print_help(self.help_menu)
        if '-u' in args:
            self._url = re.sub('/$', '', args['-u'])
        if '-n' in args:
            self._num = int(args['-n'])
        if '-p' in args:
            self._path = os.path.abspath(args['-p'])
        if '-R' in args:
            self._ex_re_image_url = os.path.abspath(args['-R'])
        if '-t' in args:
            try:
                n = int(args['-t'])
            except ValueError as e:
                Base.print_exit('%s, -h for help!' % str(e))
            if n:
                self._thread_max = n
        if '-v' in args:
            self._view = True
            self._pr.set_pr_level(self._pr.get_pr_level() | Print.PR_LVL_WARN)
        if '-x' in args:
            self._xval = args['-x']
        if '-m' in args:
            dl_image_funcs = {
                'wget': self.wget_url_image,
                'rtrv': self.retrieve_url_image,
                'rget': self.requests_get_url_image,
                'uget': self.urlopen_get_url_image,
            }
            if args['-m'] in dl_image_funcs.keys():
                self._dl_image = dl_image_funcs[args['-m']]
        if '-d' in args:
            self.__dbg = 1
            self._pr.set_pr_level(self._pr.get_pr_level() | Print.PR_LVL_ALL)
        if '-D' in args:
            self.__dbg = 2
            self._pr.set_pr_level(self._pr.get_pr_level() | Print.PR_LVL_DBG)
            WebContent.pr.set_pr_level(self._pr.get_pr_level() | Print.PR_LVL_DBG)
        # check url
        if self._url:
            base, num = WebContent.get_url_base_and_num(self._url)
            if base:
                self._url_base = base
            if num:
                self._url = num
            self._pr.pr_dbg('get base: %s, url: %s' % (base, self._url))
        else:
            Base.print_exit('[WebImage] Error, no url set, -h for help!')
        if self._url_base:
            www_com = re.match('http[s]?://.+\.(com|cn|net)', self._url_base)
            if www_com:
                self._com = www_com.group()
        return args

    def main(self, args=None):
        self.get_user_input(args)
        # get external re file.
        if self._ex_re_image_url:
            self.add_external_re_image_url()
        # create queue.
        if self._num > self._thread_max:
            self._thread_queue = queue.Queue(self._thread_max)
        # get webs now.
        for index in range(self._num):
            # get the first page.
            if self._url_base:
                url = self.get_url_address(self._url_base, int(self._url) + index)
            else:
                url = self.get_url_address(None, self._url)
            if self._thread_queue:
                # create thread and put to queue.
                t = threading.Thread(target=self.process_url_web,
                                     args=(url, (index + 1, self._num)))
                self._thread_queue.put(url)
                t.start()
            else:
                return self.process_url_web(url)
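
# Usage sketch (assumption, not part of the original module): WebImage.main()
# accepts a pre-parsed args dict in the form Base.get_user_input() returns,
# which is also how WebImageCrawler drives it below. The url is a placeholder.
#
#   wi = WebImage('WebImage')
#   wi.main({'-u': 'https://example.com/gallery/12345',
#            '-n': '3',        # crawl 12345, 12346, 12347
#            '-m': 'uget',     # download with urlopen
#            '-v': ''})        # verbose output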
class WebImageCrawler(WebContent):

    HELP_MENU = (
        '==================================',
        '  WebImageCrawler help',
        '==================================',
        'option:',
        '  -u url:',
        '    url of web to be downloaded',
        '  -n num:',
        '    number of webs to be downloaded',
        '  -p path:',
        '    root path to store images.',
        '  -v:',
        '    view info while downloading.',
        '  -x val:',
        '    xgmn: xgmn of girlsky',
        '    swmn: swmn of girlsky',
        '    wgmn: wgmn of girlsky',
        '    zpmn: zpmn of girlsky',
        '    mnxz: mnxz of girlsky',
        '    rtys: rtys of girlsky',
        '    jpmn: jpmn of girlsky',
        '    gzmn: gzmn of girlsky',
        '    pstatp: pstatp of toutiao',
        '    toutiao: toutiao of toutiao',
        '    meizitu: meizitu of meizitu',
        '    mzitu: mzitu of mzitu',
        '  -m mode:',
        '    wget: using wget to download images',
        '    rtrv: using retrieve to download images',
        '    rget: using requests to download images',
        '    uget: using urlopen to download images',
        '  -R file:',
        '    re config file for re_image_url.',
        '  -t num:',
        '    set number of threads to download images.',
    )

    def __init__(self, name=None):
        self._name = name
        self._web_base = None
        self._url_base = None
        self._url_file = None
        self._url = None
        self._xval = None
        self._pr = Print(self.__class__.__name__)
        self._class = None
        self._thread_max = 5
        self._thread_queue = None
        self._run_ui = None

    def get_input(self, args=None):
        if not args:
            args = Base.get_user_input('hu:n:p:x:m:R:t:UvDd')
        if '-h' in args:
            Base.print_help(self.HELP_MENU)
        if '-U' in args:
            self._run_ui = True
        if '-u' in args:
            if os.path.isfile(args['-u']):
                self._url_file = Path.get_abs_path(args['-u'])
            else:
                self._url = re.sub('/$', '', args['-u'])
        if '-x' in args:
            self._xval = args['-x']
        if '-d' in args:
            self._pr.set_pr_level(self._pr.get_pr_level() | Print.PR_LVL_DBG)
        # get url_base from xval
        if self._xval:
            if self._xval in URL_BASE:
                self._url_base = list(URL_BASE[self._xval])[0]
                self._class = URL_BASE[self._xval][self._url_base]
            else:
                Base.print_exit('[WebImageCrawler] Error, invalid -x val!')
        # get base from url
        if self._url:
            base, num = self.get_url_base_and_num(self._url)
            if base:
                self._url_base = base
        # get class from url_base
        if all((not self._class, self._url_base)):
            for dict_url_base in URL_BASE.values():
                if self._url_base == list(dict_url_base)[0]:
                    self._class = dict_url_base[self._url_base]
                    break
        return args

    def process_input(self, args=None, info=None):
        # hdr stays None if the class name is unknown, so the error branch
        # below can report a missing handler instead of raising NameError.
        hdr = None
        if self._class:
            if self._class == 'girlsky':
                hdr = Girlsky('Girlsky')
            elif self._class == 'pstatp':
                hdr = Pstatp('Pstatp')
            elif self._class == 'meizitu':
                hdr = Meizitu('Meizitu')
            elif self._class == 'mzitu':
                hdr = Mzitu('Mzitu')
        else:
            hdr = WebImage('WebImage')
        if hdr:
            hdr.main(args)
        else:
            self._pr.pr_err('[WebImageCrawler] Error, handler not found!')
        # release queue
        if self._thread_queue:
            self._thread_queue.get()
        if info:
            index = info[0]
            total = info[1]
            self._pr.pr_info('process %d/%d input file done' % (index, total))

    def process_file_input(self, args=None):
        if self._url_file:
            with open(self._url_file, 'r') as fd:
                lines = set(fd.readlines())
            self._thread_queue = queue.Queue(self._thread_max)
            total = len(lines)
            index = 1
            # delete -u arg.
            if '-u' in args:
                del args['-u']
            # process all of the urls.
            for url in lines:
                self._class = None
                # remove invalid chars.
                for key, value in {'/$': '', '\n$': ''}.items():
                    url = re.sub(key, value, url)
                # get base and num
                base, num = self.get_url_base_and_num(url)
                if base:
                    for dict_url_base in URL_BASE.values():
                        if base == list(dict_url_base)[0]:
                            self._class = dict_url_base[base]
                            break
                if self._class:
                    url_args = {'-u': url}
                    url_args.update(args)
                    info = (index, total)
                    # create thread and put to queue.
                    t = threading.Thread(target=self.process_input,
                                         args=(url_args, info))
                    self._thread_queue.put(url)
                    t.start()
                    index = index + 1

    def main(self, args=None):
        if not args:
            args = self.get_input()
        if self._run_ui:
            cwd = os.path.dirname(os.path.realpath(__file__))
            os.system('%s/webimagecrawlerUI.py' % cwd)
        elif self._url_file:
            self.process_file_input(args)
        else:
            self.process_input()
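
# Command-line sketch (assumption: the module is launched from a script entry
# point, e.g. an if __name__ == '__main__' block elsewhere in the file; the
# urls and file names below are placeholders):
#
#   ./webimagecrawler.py -u https://example.com/gallery/12345 -n 5 -m uget -v
#   ./webimagecrawler.py -u url_list.txt -t 8    # file with one url per line, threaded
#   ./webimagecrawler.py -U                      # launch webimagecrawlerUI.py instead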