Example #1
    def __init__(self, name=None):
        self._name = name
        self._web_base = None
        self._url_base = None
        self._url_file = None
        self._url = None
        self._xval = None
        self._pr = Print(self.__class__.__name__)
        self._class = None
        self._thread_max = 5
        self._thread_queue = None
        self._run_ui = None
Example #2
class WebURLCrawler(object):

    HELP_MENU = (
        '==================================',
        '    WebURLCrawler help',
        '==================================',
        'option:',
        '  -s path: source path to collect web_url.txt files from',
        '  -t file: target file to save the collected urls to',
    )

    pr = Print('WebURLCrawler')

    def __init__(self, name=None):
        self._name = name
        self._src = None
        self._tgt = None

    def collect_web_url(self):
        urls = list()
        # read all of web_url.txt
        for rt, ds, fs in os.walk(self._src):
            if fs:
                for f in fs:
                    if f == 'web_url.txt':
                        f = os.path.join(rt, f)
                        with open(f, 'r') as fd:
                            # first line is the title, second line is the url.
                            fd.readline()
                            urls.append(fd.readline())
        # write to file
        with open(self._tgt, 'w') as fd:
            for url in urls:
                fd.write('%s\n' % url)

    def get_user_input(self):
        args = Base.get_user_input('hs:t:')
        if '-h' in args:
            Base.print_help(self.HELP_MENU)
        if '-s' in args:
            self._src = re.sub('/$', '', args['-s'])
        if '-t' in args:
            self._tgt = Path.get_abs_path(args['-t'])
        return args

    def main(self):
        self.get_user_input()
        if not self._src:
            Base.print_exit('no -s, -h for help!')
        if not self._tgt:
            self._tgt = '%s/%s.txt' % (self._src, os.path.basename(self._src))
        # collect urls.
        self.collect_web_url()
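
This example only defines the class; a minimal, hypothetical entry point (a sketch only, assuming the module is run as a script and that Base.get_user_input() parses sys.argv) might look like:

if __name__ == '__main__':
    # e.g. ./weburlcrawler.py -s /path/to/downloads -t all_urls.txt  (hypothetical paths)
    crawler = WebURLCrawler('WebURLCrawler')
    crawler.main()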
Example #3
    def __init__(self, name=None):
        self._name = name
        self._com = None
        self._url_base = None
        self._url = None
        self._num = 1
        self._path = '%s/%s' % (Base.DEFAULT_DWN_PATH, self.__class__.__name__)
        self._re_image_url = [
            #re.compile('src=[\'|\"]?(http[s]?://.+\.(?:jpg|png|gif|bmp|jpeg))[\'|\"]?', re.I),
            #re.compile('src=[\'|\"]?(/.+\.(?:jpg|png|gif|bmp|jpeg))[\'|\"]?', re.I),
            re.compile(r'src=[\'|\"]?(http[s]?://[a-z0-9\./-]+\.(?:jpg|png|gif|bmp|jpeg))[\'|\"]?', re.I),
            re.compile(r'src=[\'|\"]?(/[a-z0-9\./-]+\.(?:jpg|png|gif|bmp|jpeg))[\'|\"]?', re.I),
        ]
        self._ex_re_image_url = None
        self._title = None
        self._remove_small_image = True
        self._view = False
        self._xval = None
        self._dl_image = self.urlopen_get_url_image
        self._redundant_title = None
        self._pr = Print(self.__class__.__name__)
        self.__dbg = 0
        self._thread_max = 5
        self._thread_queue = None
Example #4
class WebContent(object):

    CONTEXT_UNVERIFIED = ssl._create_unverified_context()
    CONTEXT_TLSv1 = ssl.SSLContext(ssl.PROTOCOL_TLSv1)

    WEB_URL_FILE = r'web_url.txt'

    pr = Print('WebContent')

    @classmethod
    def url_is_https(cls, url):
        if re.match('https://', url):
            return True
        else:
            return False

    @classmethod
    def get_url_charset(cls, html=None, content_type=None):
        charset = None
        pattern = re.compile('charset=[a-z0-9-]*', flags=re.I)
        if content_type:
            charset = pattern.search(
                re.sub('charset=(\"|\')', 'charset=', content_type))
        if all((html, not charset)):
            charset = pattern.search(
                re.sub('charset=(\"|\')', 'charset=', str(html)))
        # get data
        if charset:
            charset = charset.group()
            charset = charset[len('charset='):].upper()
        return charset

    @classmethod
    def get_html(cls, url, context=None, retry_times=3, view=False):
        if view:
            cls.pr.pr_info('Downloading: %s' % url)
        html_content = None
        while all((retry_times, not html_content)):
            retry_times -= 1
            url_charset = None
            req = Request(url, headers=URL_HEADER)
            try:
                html = urlopen(req, context=context)
            except URLError as e:
                cls.pr.pr_warn(str(e))
                html_content = None
            else:
                content_type = html.getheader('Content-Type')
                if content_type:
                    url_charset = cls.get_url_charset(
                        content_type=content_type)
                data = html.read()
                encoding = html.getheader('Content-Encoding')
                if encoding == 'gzip':
                    data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
                if data:
                    for charset in CHARSETS:
                        if url_charset:
                            html_content = data.decode(
                                url_charset, 'ignore').encode('utf-8')
                            break
                        else:
                            html_content = data.decode(
                                charset, 'ignore').encode('utf-8')
                            if html_content:
                                url_charset = cls.get_url_charset(html_content)
                                if not url_charset:
                                    url_charset = DEFAULT_CHARSET
                                elif charset == url_charset:
                                    break
                else:
                    #cls.pr.pr_err('Error: fail to get data from html')
                    html_content = None
        return html_content

    @classmethod
    def get_url_content(cls, url, retry_times=3, view=True, path=None):
        if cls.url_is_https(url):
            content = cls.get_html(url=url,
                                   context=cls.CONTEXT_UNVERIFIED,
                                   retry_times=retry_times,
                                   view=view)
        else:
            content = cls.get_html(url=url, retry_times=retry_times, view=view)
        # save content to path.
        if all((content, path)):
            Path.make_path(path)
            f = '%s/%s' % (path, cls.convert_url_to_title(url))
            if File.get_exname(f) != '.html':
                f = f + '.html'
            with open(f, 'wb') as fd:
                fd.write(content)
        return content

    @classmethod
    def urlretrieve_callback(cls, blocknum, blocksize, totalsize):
        if not totalsize:
            return
        percent = 100.0 * blocknum * blocksize / totalsize
        if percent > 100:
            percent = 100
        cls.pr.pr_dbg("%.2f%%" % percent)

    @classmethod
    def retrieve_url_file(cls, url, path, view=False):
        fname = os.path.join(path, url.split('/')[-1])
        if not os.path.exists(fname):
            if view:
                cls.pr.pr_info('retrieve: %s' % fname)
                try:
                    urllib.request.urlretrieve(url, fname, cls.urlretrieve_callback)
                except (socket.error, ZeroDivisionError) as e:
                    cls.pr.pr_info('urlretrieve error: %s' % e.errno)
            else:
                try:
                    urllib.request.urlretrieve(url, fname)
                except socket.error as e:
                    cls.pr.pr_warn('%s, retrieve %s failed.' % (str(e), url))

    @classmethod
    def urlopen_get_url_file(cls,
                             url,
                             path,
                             ssl=False,
                             headers=None,
                             view=False):
        fname = os.path.join(path, url.split('/')[-1])
        if not os.path.exists(fname):
            req = Request(url=url)
            if headers:
                for key, value in headers.items():
                    req.add_header(key, value)
            if ssl:
                context = cls.CONTEXT_UNVERIFIED
            else:
                context = None
            try:
                r = urlopen(req, context=context)
            except (URLError, HTTPError) as e:
                cls.pr.pr_warn('%s, uget %s failed.' % (str(e), url))
            else:
                try:
                    data = r.read()
                except ConnectionResetError as e:
                    cls.pr.pr_err(str(e))
                else:
                    with open(fname, 'wb') as f:
                        if view:
                            cls.pr.pr_info('uget: %s' % fname)
                        f.write(data)

    @classmethod
    def requests_get_url_file(cls, url, path, view=False):
        fname = os.path.join(path, url.split('/')[-1])
        if not os.path.exists(fname):
            r = requests.get(url)
            with open(fname, 'wb') as f:
                if view:
                    cls.pr.pr_info('requests get: %s' % fname)
                f.write(r.content)

    @classmethod
    def wget_url_file(cls,
                      url,
                      path,
                      config='-c -t 3 -T 10 -U \'%s\'' %
                      USER_AGENTS['Kubuntu'],
                      view=False):
        if view:
            cmd = 'wget %s -P %s %s -nv' % (config, path, url)
        else:
            cmd = 'wget %s -P %s %s -q' % (config, path, url)
        try:
            cls.pr.pr_dbg('wget cmd: %s' % cmd)
            return subprocess.check_output(cmd, shell=True)
        except subprocess.CalledProcessError:
            return None

    @classmethod
    def get_url_title(cls, html_content, pattern=None):
        if not pattern:
            pattern = re.compile(b'<title>.+</title>')
        data = pattern.search(html_content)
        if data:
            data = data.group()
            return data[len('<title>'):len(data) - len('</title>')]
        else:
            return None

    @classmethod
    def get_url_pages(cls, html, pattern=None):
        if not pattern:
            # find all of \d+/\d+
            pattern = re.compile(r'\d+/\d+')
        data = pattern.findall(str(html))
        if data:
            # match ^1/\d+$ to get number of pages.
            pattern = re.compile(r'^1/\d+$')
            for d in data:
                d = pattern.match(d)
                if d:
                    # get number of pages and return int.
                    pages = int(re.compile(r'\d+').findall(d.group())[1])
                    break
                else:
                    pages = None
        else:
            pages = None
        return pages

    @classmethod
    def get_url_base_and_num(cls, url):
        base = None
        num = None
        # case 1: the url itself is a bare number.
        num = re.compile(r'^\d+$').search(url)
        if num:
            num = num.group()
        else:
            # case 2: the url ends with a number, optionally followed by '/' or '.html'.
            num = re.compile(r'(\d+)(/)?(\.html)?$').search(url)
            if num:
                num = re.compile(r'\d+').search(num.group()).group()
                base = re.sub(num, 'URLID', url)
        return base, num

    @classmethod
    def set_url_base_and_num(cls, base, num):
        if base:
            return re.sub('URLID', str(num), base)
        else:
            return num

    @classmethod
    def convert_url_to_title(cls, url):
        return File.reclaim_name(re.sub('/$', '', url))
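
As a rough, hedged illustration of how the classmethods above combine (not part of the original example; the example.com url is hypothetical, and module-level helpers such as URL_HEADER and CHARSETS are assumed to be imported elsewhere in the module):

url = 'https://example.com/gallery/123'
html = WebContent.get_url_content(url, view=True)
# base becomes 'https://example.com/gallery/URLID' and num becomes '123'.
base, num = WebContent.get_url_base_and_num(url)
if base and num:
    next_url = WebContent.set_url_base_and_num(base, int(num) + 1)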
Example #5
        '    wget: using wget to download file',
        '    rtrv: using retrieve to download file',
        '    rget: using requests to download file',
        '    uget: using urlopen to download file',
        '    html: download html of url',
        '  -v:',
        '    view info of webcontent.',
    )

    path = None
    url = None
    df = None
    view = False

    wc = WebContent()
    pr = Print(wc.__class__.__name__)

    args = Base.get_user_input('hp:u:d:v')
    if '-h' in args:
        Base.print_help(HELP_MENU)
    if '-p' in args:
        path = Path.get_abs_path(args['-p'])
    if '-u' in args:
        url = args['-u']
    if '-v' in args:
        view = True
        wc.pr.set_pr_level(0x07)
    if '-d' in args:
        df_funcs = {
            'wget': wc.wget_url_file,
            'rtrv': wc.retrieve_url_file,
Example #6
        '  -c img: check img is a image file',
        '    img: the path of image file',
        '  -r path,(w,h): remove small images',
        '    path: path of dir or file',
        '  -R path: reclaim image format',
        '    path: path of dir or file',
        '  -o path,rename,nz: rename images in order',
        '    path: path of images',
        '    rename: the name format used to rename images',
        '    nz: True to use %0d numbering, False to use %d',
        '  -i img: show detail info of image file',
        '    img: the path of image file',
    )

    Img = Image()
    pr = Print(Img.__class__.__name__)
    xval = None
    args = Base.get_user_input('hc:r:R:x:o:i:')
    if '-h' in args:
        Base.print_help(HELP_MENU)
    if '-c' in args:
        result = Img.image_file(Path.get_abs_path(args['-c']))
        pr.pr_info(result)
    if '-r' in args:
        data = args['-r'].split(',')
        path = data[0]
        if len(data) >= 3:
            w = data[1]
            h = data[2]
            Img.remove_small_image(path, int(w), int(h))
        else:
Example #7
class WebImage(object):

    help_menu = (
        '==================================',
        '    WebImage help',
        '==================================',
        'option:',
        '  -u url:',
        '    url of web to download',
        '  -n num:',
        '    number of webs to download',
        '  -p path:',
        '    root path to store images.',
        '  -v:',
        '    view info while download.',
        '  -x val:',
        '    val for expand cmd.',
        '  -m mode:',
        '    wget: using wget to download images',
        '    rtrv: using retrieve to download images',
        '    rget: using requests to download images',
        '    uget: using urlopen to download images',
        '  -R file:',
        '    re config file for re_image_url.',
        '  -t num:',
        '    set max number of threads to download webs.'
    )

    def __init__(self, name=None):
        self._name = name
        self._com = None
        self._url_base = None
        self._url = None
        self._num = 1
        self._path = '%s/%s' % (Base.DEFAULT_DWN_PATH, self.__class__.__name__)
        self._re_image_url = [
            #re.compile('src=[\'|\"]?(http[s]?://.+\.(?:jpg|png|gif|bmp|jpeg))[\'|\"]?', re.I),
            #re.compile('src=[\'|\"]?(/.+\.(?:jpg|png|gif|bmp|jpeg))[\'|\"]?', re.I),
            re.compile(r'src=[\'|\"]?(http[s]?://[a-z0-9\./-]+\.(?:jpg|png|gif|bmp|jpeg))[\'|\"]?', re.I),
            re.compile(r'src=[\'|\"]?(/[a-z0-9\./-]+\.(?:jpg|png|gif|bmp|jpeg))[\'|\"]?', re.I),
        ]
        self._ex_re_image_url = None
        self._title = None
        self._remove_small_image = True
        self._view = False
        self._xval = None
        self._dl_image = self.urlopen_get_url_image
        self._redundant_title = None
        self._pr = Print(self.__class__.__name__)
        self.__dbg = 0
        self._thread_max = 5
        self._thread_queue = None

    def get_image_url(self, html):
        pattern = self._re_image_url
        imgs = list()
        # find image.
        try:
            if type(pattern) == list:
                for pt in pattern:
                    imgs = imgs + pt.findall(str(html))
            else:
                imgs = pattern.findall(str(html))
            #self._pr.pr_dbg('%s' % imgs)
        except TypeError as e:
            self._pr.pr_err('%s: failed to findall image url' % str(e))
        return imgs

    def get_image_url_of_pages(self, pages, header_content=None):
        limg = list()
        url_pages = self.get_url_of_pages(pages)
        for index in range(len(url_pages)):
            if all((index == 0, header_content)):
                url_content = header_content
            else:
                url_content = self.get_url_content(url_pages[index])
            if not url_content:
                self._pr.pr_err('failed to download %s sub web' % url_pages[index])
                continue
            imgs = self.get_image_url(url_content)
            for img in imgs:
                limg.append(img)
        return limg

    def get_image_raw_url(self, url):
        if not re.match('http(s)?:', url):
            url = '%s%s' % (self._com, url)
        return url

    def retrieve_url_image(self, url, path, view=False):
        return WebContent.retrieve_url_file(url, path, view=view)

    def wget_url_image(self, url, path, view=False):
        return WebContent.wget_url_file(url, path,
                                        config="-c -t 3 -T 10 -U \'%s\'" % USER_AGENTS['AppleWebKit/537.36'],
                                        view=view)

    def requests_get_url_image(self, url, path, view=False):
        return WebContent.requests_get_url_file(url, path, view=view)

    def urlopen_get_url_image(self, url, path, view=False):
        headers = {
            'User-Agent': '%s' % USER_AGENTS['AppleWebKit/537.36'],
            'GET' : url,
            'Referer' : self._com,
        }
        return WebContent.urlopen_get_url_file(url, path,
                                               ssl=WebContent.url_is_https(url),
                                               headers=headers, view=view)

    # download image of url.
    def download_image(self, url, path):
        if self._dl_image:
            Path.make_path(path)
            self._dl_image(url, path, self.__dbg)

    def get_url_content(self, url, view=False):
        return WebContent.get_url_content(url=url, view=view)

    def get_title(self, html, pattern=None):
        # get_url_title() returns None when no <title> tag is found.
        title = WebContent.get_url_title(html, pattern)
        if title:
            title = title.decode()
            if self._redundant_title:
                for rt in self._redundant_title:
                    title = title.replace(rt, '')
        return title

    def get_pages(self, html, pattern=None):
        return WebContent.get_url_pages(html, pattern)

    def download_images(self, imgs, path):
        for img in imgs:
            self.download_image(self.get_image_raw_url(img), path)

    def get_url_of_pages(self, num):
        # map() returns an iterator in Python 3, so build a list before insert().
        urls = [WebContent.set_url_base_and_num(self._url_base,
                                                '%s/%d' % (int(self._url), x))
                for x in range(2, num + 1)]
        urls.insert(0, WebContent.set_url_base_and_num(self._url_base, self._url))
        return urls

    def get_url_address(self, url_base, url):
        return WebContent.set_url_base_and_num(url_base, url)

    def convert_url_to_title(self, url):
        return WebContent.convert_url_to_title(url)

    def store_web_info(self, path, title, url):
        with open('%s/%s' % (path, WebContent.WEB_URL_FILE), 'w') as fd:
            fd.write('%s\n%s' % (title, url))

    def store_url_of_images(self, path, urls):
        with open('%s/%s' % (path, WebContent.WEB_URL_FILE), 'a') as fd:
            fd.write('\n')
            fd.write('\n')
            fd.write('url of imgs:\n')
            for url in urls:
                fd.write('%s\n' % url)

    def output_image_exists(self, path):
        for rt, ds, fs in os.walk(path):
            if fs:
                for f in fs:
                    f = os.path.join(rt, f)
                    if Image.image_file2(f):
                        return True
        return False

    def add_external_re_image_url(self):
        if self._ex_re_image_url:
            relist = list()
            try:
                with open(self._ex_re_image_url, 'r') as fd:
                    lines = fd.readlines()
                # list all of lines.
                for line in lines:
                    line = re.sub('\n', '', line)
                    relist.append(re.compile(line))
                # add old lines.
                if type(self._re_image_url) == list:
                    for r in self._re_image_url:
                        relist.append(r)
                else:
                    relist.append(self._re_image_url)
                # update re_image_url
                self._re_image_url = relist
            except IOError as e:
                self._pr.pr_err('%s, failed to open %s' % (str(e), self._ex_re_image_url))

    # process url web images.
    def process_url_web(self, url, data=None):
        # get header web
        header_content = self.get_url_content(url, view=False)
        if not header_content:
            if self._thread_queue:
                self._thread_queue.get()
            self._pr.pr_err('failed to download %s header web.' % url)
            return
        # get url title.
        title = self.get_title(header_content, self._title)
        if not title:
            title = self.convert_url_to_title(url)
        self._pr.pr_dbg('title: %s' % title)
        # create path of title to store data.
        subpath = os.path.join(self._path, title)
        self._pr.pr_dbg('subpath: %s' % subpath)
        # get count of pages
        pages = self.get_pages(header_content)
        self._pr.pr_dbg('get pages: %s' % pages)
        if not pages:
            limg = self.get_image_url(header_content)
        else:
            limg = self.get_image_url_of_pages(pages, header_content)
        # filter images
        limg = set(limg)
        # self._pr.pr_dbg('image url list: %s' % limg)
        # download images
        if limg:
            # download all of images.
            self.download_images(limg, subpath)
            # write web info
            self.store_web_info(subpath, title, url)
            # reclaim image, remove small image
            if self._remove_small_image:
                Image.reclaim_path_images(subpath, xfunc=Image.remove_small_image)
            else:
                Image.reclaim_path_images(subpath)
            # show output info.
            if self._view:
                if self.output_image_exists(subpath):
                    self._pr.pr_info('output: %s' % (subpath))
                else:
                    self._pr.pr_info('output no images: %s' % (subpath))
            # save url of images if it is full debug.
            if self.__dbg >= 2:
                self.store_url_of_images(subpath, limg)
        # release queue
        if self._thread_queue:
            self._thread_queue.get()
        if data:
            self._pr.pr_info('%d/%d: process %s done!' % (data[0], data[1], url))
        return subpath

    def get_user_input(self, args=None):
        if not args:
            args = Base.get_user_input('hu:n:p:x:m:i:R:t:vdD')
        if '-h' in args:
            Base.print_help(self.help_menu)
        if '-u' in args:
            self._url = re.sub('/$', '', args['-u'])
        if '-n' in args:
            self._num = int(args['-n'])
        if '-p' in args:
            self._path = os.path.abspath(args['-p'])
        if '-R' in args:
            self._ex_re_image_url = os.path.abspath(args['-R'])
        if '-t' in args:
            try:
                n = int(args['-t'])
            except ValueError as e:
                Base.print_exit('%s, -h for help!' % str(e))
            if n:
                self._thread_max = n
        if '-v' in args:
            self._view = True
            self._pr.set_pr_level(self._pr.get_pr_level() | Print.PR_LVL_WARN)
        if '-x' in args:
            self._xval = args['-x']
        if '-m' in args:
            dl_image_funcs = {
                'wget': self.wget_url_image,
                'rtrv' : self.retrieve_url_image,
                'rget' : self.requests_get_url_image,
                'uget' : self.urlopen_get_url_image,
            }
            if args['-m'] in dl_image_funcs.keys():
                self._dl_image = dl_image_funcs[args['-m']]
        if '-d' in args:
            self.__dbg = 1
            self._pr.set_pr_level(self._pr.get_pr_level() | Print.PR_LVL_ALL)
        if '-D' in args:
            self.__dbg = 2
            self._pr.set_pr_level(self._pr.get_pr_level() | Print.PR_LVL_DBG)
            WebContent.pr.set_pr_level(self._pr.get_pr_level() | Print.PR_LVL_DBG)
        # check url
        if self._url:
            base, num = WebContent.get_url_base_and_num(self._url)
            if base:
                self._url_base = base
            if num:
                self._url = num
            self._pr.pr_dbg('get base: %s, url: %s' % (base, self._url))
        else:
            Base.print_exit('[WebImage] Error, no set url, -h for help!')
        if self._url_base:
            www_com = re.match(r'http[s]?://.+\.(com|cn|net)', self._url_base)
            if www_com:
                self._com = www_com.group()
        return args

    def main(self, args=None):
        self.get_user_input(args)
        # get external re file.
        if self._ex_re_image_url:
            self.add_external_re_image_url()
        # create queue.
        if self._num > self._thread_max:
            self._thread_queue = queue.Queue(self._thread_max)
        # get web now.
        for index in range(self._num):
            # get the first page.
            if self._url_base:
                url = self.get_url_address(self._url_base, int(self._url) + index)
            else:
                url = self.get_url_address(None, self._url)
            if self._thread_queue:
                # create thread and put to queue.
                t = threading.Thread(target=self.process_url_web, args=(url, (index + 1, self._num)))
                self._thread_queue.put(url)
                t.start()
            else:
                return self.process_url_web(url)
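
Since main() accepts a pre-parsed option dict, a hypothetical caller (a sketch only; the keys mirror the help_menu above and the url is made up) could drive WebImage without touching sys.argv:

wi = WebImage('WebImage')
# download two consecutive webs starting at the given url, with verbose output.
wi.main({'-u': 'https://example.com/gallery/123', '-n': '2', '-v': True})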
Example #8
class WebImageCrawler(WebContent):

    HELP_MENU = (
        '==================================',
        '    WebImageCrawler help',
        '==================================',
        'option:',
        '  -u url:',
        '    url of web to download',
        '  -n num:',
        '    number of webs to download',
        '  -p path:',
        '    root path to store images.',
        '  -v:',
        '    view info while download.',
        '  -x val:',
        '    xgmn:    xgmn of girlsky',
        '    swmn:    swmn of girlsky',
        '    wgmn:    wgmn of girlsky',
        '    zpmn:    zpmn of girlsky',
        '    mnxz:    mnxz of girlsky',
        '    rtys:    rtys of girlsky',
        '    jpmn:    jpmn of girlsky',
        '    gzmn:    gzmn of girlsky',
        '    pstatp:  pstatp of toutiao',
        '    toutiao: toutiao of toutiao',
        '    meizitu: meizitu of meizitu',
        '    mzitu:   mzitu of mzitu',
        '  -m mode:',
        '    wget: using wget to download images',
        '    rtrv: using retrieve to download images',
        '    rget: using requests to download images',
        '    uget: using urlopen to download images',
        '  -R file:',
        '    re config file for re_image_url.',
        '  -t num:',
        '    set number of threads to download images.',
    )

    def __init__(self, name=None):
        self._name = name
        self._web_base = None
        self._url_base = None
        self._url_file = None
        self._url = None
        self._xval = None
        self._pr = Print(self.__class__.__name__)
        self._class = None
        self._thread_max = 5
        self._thread_queue = None
        self._run_ui = None

    def get_input(self, args=None):
        if not args:
            args = Base.get_user_input('hu:n:p:x:m:R:t:UvDd')
        if '-h' in args:
            Base.print_help(self.HELP_MENU)
        if '-U' in args:
            self._run_ui = True
        if '-u' in args:
            if os.path.isfile(args['-u']):
                self._url_file = Path.get_abs_path(args['-u'])
            else:
                self._url = re.sub('/$', '', args['-u'])
        if '-x' in args:
            self._xval = args['-x']
        if '-d' in args:
            self._pr.set_pr_level(self._pr.get_pr_level() | Print.PR_LVL_DBG)
        # get url_base from xval
        if self._xval:
            if self._xval in URL_BASE:
                self._url_base = list(URL_BASE[self._xval])[0]
                self._class = URL_BASE[self._xval][self._url_base]
            else:
                Base.print_exit('[WebImageCrawler] Error, invalid -x val!')
        # get class from url
        if self._url:
            base, num = self.get_url_base_and_num(self._url)
            if base:
                self._url_base = base
        # get class from url_base
        if all((not self._class, self._url_base)):
            for dict_url_base in URL_BASE.values():
                if self._url_base == list(dict_url_base)[0]:
                    self._class = dict_url_base[self._url_base]
                    break
        return args

    def process_input(self, args=None, info=None):
        # hdr stays None if self._class is set but matches no known handler.
        hdr = None
        if self._class:
            if self._class == 'girlsky':
                hdr = Girlsky('Girlsky')
            elif self._class == 'pstatp':
                hdr = Pstatp('Pstatp')
            elif self._class == 'meizitu':
                hdr = Meizitu('Meizitu')
            elif self._class == 'mzitu':
                hdr = Mzitu('Mzitu')
        else:
            hdr = WebImage('WebImage')
        if hdr:
            hdr.main(args)
        else:
            self._pr.pr_err('[WebImageCrawler] Error, no handler found!')
        # release queue
        if self._thread_queue:
            self._thread_queue.get()
        if info:
            index = info[0]
            total = info[1]
            self._pr.pr_info('process %d/%d input file done' % (index, total))

    def process_file_input(self, args=None):
        if self._url_file:
            with open(self._url_file, 'r') as fd:
                lines = set(fd.readlines())
            self._thread_queue = queue.Queue(self._thread_max)
            total = len(lines)
            index = 1
            # delete -u arg.
            if '-u' in args:
                del args['-u']
            # process all of url.
            for url in lines:
                self._class = None
                # remove invalid chars.
                for key, value in {'/$': '', '\n$': ''}.items():
                    url = re.sub(key, value, url)
                # get base and num
                base, num = self.get_url_base_and_num(url)
                if base:
                    for dict_url_base in URL_BASE.values():
                        if base == list(dict_url_base)[0]:
                            self._class = dict_url_base[base]
                            break
                if self._class:
                    url_args = {'-u': url}
                    url_args.update(args)
                    info = (index, total)
                    # create thread and put to queue.
                    t = threading.Thread(target=self.process_input,
                                         args=(url_args, info))
                    self._thread_queue.put(url)
                    t.start()
                index = index + 1

    def main(self, args=None):
        if not args:
            args = self.get_input()
        if self._run_ui:
            cwd = os.path.dirname(os.path.realpath(__file__))
            os.system('%s/webimagecrawlerUI.py' % cwd)
        elif self._url_file:
            self.process_file_input(args)
        else:
            self.process_input(args)
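
A hypothetical entry point for the crawler, sketched under the assumption that the helper modules (Base, Path, Print, the URL_BASE table and the site-specific handlers) are imported at the top of the file:

if __name__ == '__main__':
    WebImageCrawler('WebImageCrawler').main()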