Example #1
def saveFileName(_imgUrl):
    # Take the last path component of the image URL as the file name.
    fName = os.path.split(_imgUrl)[1]
    getLogger().debug('file name ::{}'.format(fName))

    # Append a default '.jpg' extension when no known image extension is present.
    ext = '.jpg'
    reg = re.compile(r'[.](jpg|png|bmp|gif)$')
    mo = reg.search(fName)
    if mo is None:
        return fName + ext
    return fName
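
The extension check above only recognizes .jpg, .png, .bmp and .gif; any other name gets '.jpg' appended. A minimal standalone sketch of the same logic (the logger call is omitted and the function name is illustrative):

import os
import re

def derive_name(img_url, default_ext='.jpg'):
    # The last path component of the URL becomes the file name.
    name = os.path.split(img_url)[1]
    # Append a default extension when no known image extension is present.
    if re.search(r'[.](jpg|png|bmp|gif)$', name) is None:
        return name + default_ext
    return name

print(derive_name('https://example.com/images/cat.png'))     # cat.png
print(derive_name('https://example.com/images/photo?id=1'))  # photo?id=1.jpg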
Example #2
    def __nextDepth__(self, _depUrl, _soup, sSaveDir, _maxDepth, _depth):
        print('depth:' + str(_depth) + '#' * 100)
        # Collect the anchors on the current page.
        depthElms = _soup.select('a')
        getLogger().debug('depthElms count:{}'.format(len(depthElms)))
        if len(depthElms) > 0:
            hrefl = []
            for link in depthElms:
                href = link.get('href')
                # Skip missing, empty, relative and self-referencing links.
                if href is None or len(href) == 0 or not href.startswith('http'):
                    continue
                if _depUrl == href:
                    continue
                hrefl.append(href)

            linkLen = len(hrefl)
            for index in range(0, linkLen):
                print('-' * 100)
                getLogger().info('Dep {}/{}'.format(_depth, _maxDepth))
                # One sub directory per link, named after its last path component.
                subDir = sSaveDir + '/Depth_' + str(_depth) + os.path.split(
                    hrefl[index])[1]
                if not os.path.isdir(subDir):
                    os.mkdir(subDir)
                getLogger().info('{}/{} -- {}'.format(index, linkLen,
                                                      hrefl[index]))
                getLogger().info('dir -- {}'.format(subDir))
                print('-' * 100)

                # Crawl the linked page into its own sub directory.
                self.__crawring(hrefl[index], subDir, _maxDepth, _depth)
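
Only absolute links ('http…') that differ from the page being crawled survive the filter above. A small standalone sketch of that filter against a literal HTML snippet, assuming bs4 is installed (names and URLs are illustrative):

import bs4

html = '''
<a href="https://example.com/page2">next</a>
<a href="/relative/path">relative</a>
<a href="">empty</a>
<a href="https://example.com/page1">self</a>
'''
current_url = 'https://example.com/page1'

soup = bs4.BeautifulSoup(html, 'html.parser')
links = []
for a in soup.select('a'):
    href = a.get('href')
    # Skip missing, empty, relative and self-referencing links.
    if href is None or len(href) == 0 or not href.startswith('http'):
        continue
    if href == current_url:
        continue
    links.append(href)

print(links)  # ['https://example.com/page2']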
Example #3
    def __init__(self,
                 lUrls=None,
                 lSaveDirs=None,
                 bMp=0,
                 bZip=False,
                 maxDepth=0,
                 proxy=False):
        getLogger().info('__init__ called!!')
        # Target URLs and the matching save directories.
        self.__lUrls = lUrls
        self.__lSaveDirs = lSaveDirs
        self.__lProcs = []
        # Worker count, zip flag and maximum crawl depth.
        self.__bMp = bMp
        self.__bZip = bZip
        self.__maxDepth = maxDepth
        self.__nJobCount = 0
        # Pick a proxy up front; it is only applied when the proxy flag is set.
        self.__proxies = self.__select_proxy__(self.__get_proxies__())
        self.__isProxy = proxy
Example #4
def main():
    init()
    getOption()
    loadProperties()

    getLogger().debug(d['dirs'])
    getLogger().debug(d['urls'])
    getLogger().debug('mp : {} , z:{} , d:{}'.format(getMPOption(),
                                                     getZipOption(),
                                                     getMaxDepth()))

    getLogger().info('-- Crawling ... start ::{}'.format(getDefCurrentTime()))

    # RUN
    crawler = mcrawler.MCrawler(d['urls'], d['dirs'], getMPOption(),
                                getZipOption(), getMaxDepth(),
                                getProxyOption())
    crawler.run()

    getLogger().info('-- Crawling ... end ::{}'.format(getDefCurrentTime()))
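
Putting Examples #3 and #4 together, the crawler can also be driven directly without the option helpers. A hedged sketch, assuming the mcrawler module from these examples is importable and using illustrative URLs and directories:

import mcrawler

# Constructor arguments follow the signature shown in Example #3.
crawler = mcrawler.MCrawler(
    lUrls=['https://example.com'],  # illustrative target
    lSaveDirs=['./downloads'],      # illustrative save directory
    bMp=4,                          # number of download workers
    bZip=True,                      # zip each save directory afterwards
    maxDepth=1,                     # follow links one level deep
    proxy=False)                    # no proxy
crawler.run()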
Example #5
    def __download_image(self, _list, sSaveDir):
        getLogger().info('-- pid:{} --  image download start'.format(
            os.getpid()))
        session = requests.session()
        for downUrl in _list:
            try:
                if self.__isProxy:
                    session.proxies = self.__proxies
                downUrlRes = session.get(downUrl)
                downUrlRes.raise_for_status()

                # Build the local file name and stream the response to disk in chunks.
                nFileName = self.__getSaveFileName__(downUrl)
                wFileName = sSaveDir + '/' + nFileName
                getLogger().debug('save file path :' + wFileName)
                print('Downloading...%s' % (nFileName))
                with open(wFileName, 'wb') as fw:
                    for chunk in downUrlRes.iter_content(100000):
                        fw.write(chunk)
            except Exception:
                print('Download Error...%s' % (os.path.split(downUrl)[1]))
        getLogger().info('-- pid:{} --  image download end'.format(
            os.getpid()))
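
The loop above streams each image to disk in 100 kB chunks rather than holding the whole body in memory. A minimal standalone sketch of that download pattern, assuming requests is installed (the URL and directory are placeholders):

import os
import requests

def download(url, save_dir):
    res = requests.get(url, timeout=10)
    res.raise_for_status()
    # Name the file after the last path component of the URL.
    path = os.path.join(save_dir, os.path.split(url)[1])
    # Write the body in 100 kB chunks instead of loading it all at once.
    with open(path, 'wb') as fw:
        for chunk in res.iter_content(100000):
            fw.write(chunk)
    return path

# download('https://example.com/images/cat.png', '.')  # illustrative call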
Example #6
    def __crawring(self, sUrl, sSaveDir, _maxDepth, _depth):
        # getLogger().info('pid -- {}:{}'.format(os.getpid(),sUrl))
        getLogger().info('thread -- {}'.format(sUrl))
        try:
            session = requests.session()
            # Apply the proxy settings if a proxy was requested.
            if self.__isProxy:
                print('#' * 100)
                print('proxy ip :{}'.format(self.__proxies))
                print('#' * 100)
                session.proxies = self.__proxies

            res = session.get(sUrl)
            res.raise_for_status()
            getLogger().info('text :{}'.format(res.text))
            soup = bs4.BeautifulSoup(res.text, 'html.parser')
            imgElms = soup.select('img')

            getLogger().info('len :{}'.format(len(imgElms)))
            validUrl = []
            if len(imgElms) > 0:
                for elm in imgElms:
                    downUrl = elm.get('src')

                    # -- Validate the URL.
                    if downUrl is None:
                        continue

                    # Resolve relative ('./', '../') and protocol-relative ('//') sources.
                    rgx = re.compile('^[.]{1,2}')
                    mo = rgx.search(downUrl)
                    if mo is not None:
                        rgx = re.compile('^https?.+(com|net|edu|org)')
                        mo = rgx.search(sUrl)
                        if mo is not None:
                            downUrl = str(mo.group()) + '/' + downUrl[2:]
                    elif not downUrl.startswith('http'):
                        downUrl = 'https:' + downUrl

                    # Keep the valid URL for download.
                    validUrl.append(downUrl)

                # -- File download and save
                getLogger().info('-url:{} , mp count : {}'.format(
                    len(validUrl), self.__bMp))
                nMPCount = self.__bMp
                if nMPCount > 1 and len(validUrl) > 5:
                    # -- If the worker count exceeds the URL count, cap it at the URL count.
                    if len(validUrl) < nMPCount:
                        nMPCount = len(validUrl) - 1

                    threads = []
                    bit = int(len(validUrl) / nMPCount)

                    # Split the URL list into roughly equal slices, one per worker thread.
                    with concurrent.futures.ThreadPoolExecutor(
                            max_workers=nMPCount) as te:
                        for index in range(0, nMPCount):
                            start = index * bit
                            end = start + bit
                            if index == (nMPCount - 1):
                                end = len(validUrl)
                            th = te.submit(self.__download_image,
                                           validUrl[start:end], sSaveDir)
                            threads.append(th)

                        for th in concurrent.futures.as_completed(threads):
                            print('{}'.format(th.result()))

                else:
                    self.__download_image(validUrl, sSaveDir)

                # Zip the save directory if compression was requested.
                if self.__bZip:
                    mzip.MZip.compress(sSaveDir, logging)

            # NEXT DEPTH
            getLogger().info('_maxDepth :{} , _depth:{}'.format(
                _maxDepth, _depth))
            if _maxDepth != 0 and _depth < _maxDepth:
                _depth += 1
                self.__nextDepth__(sUrl, soup, sSaveDir, _maxDepth, _depth)
        except Exception as ex:
            getLogger().info('{}'.format(ex))

        return '{} ==> complete'.format(sUrl)
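
The fan-out above divides validUrl into nMPCount contiguous slices, handing the remainder to the last worker. A standalone sketch of that slicing and submission pattern, with a dummy worker in place of __download_image:

import concurrent.futures

def worker(chunk):
    # Stand-in for the real download routine; just reports its slice size.
    return 'downloaded {} urls'.format(len(chunk))

urls = ['url_{}'.format(i) for i in range(13)]
n_workers = 4
size = int(len(urls) / n_workers)  # 3 URLs per slice

futures = []
with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as te:
    for index in range(n_workers):
        start = index * size
        # The last worker picks up whatever is left over.
        end = len(urls) if index == n_workers - 1 else start + size
        futures.append(te.submit(worker, urls[start:end]))

    for th in concurrent.futures.as_completed(futures):
        print(th.result())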
Example #7
def run():
    global d

    getLogger().debug('run start')
    url = d['urls'][0]
    getLogger().debug('-- url:{}'.format(url))
    res = requests.get(url, timeout=3)
    getLogger().debug(res.text)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    elms = soup.select('img')
    getLogger().debug('-- step1 len:{}'.format(len(elms)))

    # Keep only absolute image URLs.
    downUrls = []
    for e in elms:
        imgUrl = e.get('src')
        # getLogger().debug('-- step2 imgUrl:{}'.format(imgUrl))
        if imgUrl is None or not imgUrl.startswith('http'):
            continue
        downUrls.append(imgUrl)

    getLogger().debug('-- downUrls len:{}'.format(len(downUrls)))
    for durl in downUrls:
        # Derive the file name up front so the error message can use it.
        fName = saveFileName(durl)
        fw = None
        try:
            getLogger().debug('-- step2 durl:{}'.format(durl))
            downRes = requests.get(durl, timeout=3)
            downRes.raise_for_status()

            getLogger().debug('-- fName :{}'.format(fName))

            # Stream the image into the configured save directory in chunks.
            fw = open(d['dirs'][0] + '/' + fName, 'wb')
            print('Downloading.. {}'.format(fName))
            for chunk in downRes.iter_content(100000):
                fw.write(chunk)
        except Exception:
            print('Downloading Failed.. {}'.format(fName))
        finally:
            if fw is not None:
                fw.close()