Beispiel #1
0
class CWebParserSite(CWebParserMultiUrl):
    """Parser for a model-showcase site: each index page lists models,
    each model page lists products.

    @author: chenzf
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # page fetching helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parsing
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # url de-duplication store

    def parse_page(self, url):
        """Yield one data dict per product found under *url*.

        Yields None and stops when *url* is empty, already recorded in
        the db, or an error occurs; records *url* once fully parsed.
        """
        try:
            if not url:
                yield None
                return  # fix: original fell through and fetched None

            if self.dbUtils.get_db_url(url):
                yield None
                return  # fix: original ignored the de-dup check and re-scraped

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # model thumbnails on the index page
                items = a(
                    'div.ts-responsive-wrap div.tshowcase-inner-box div.tshowcase-box-photo > a'
                )

                for item in items.items():
                    modelurl = item.attr('href')
                    name = item('img').attr('title')
                    board = item('img').attr('src')

                    html = self.utils.get_page(modelurl)
                    if html:
                        b = pq(html)
                        products = b('div.home_tall_box > a')
                        for product in products.items():
                            data_p = self.common.parse_item(product)
                            # base metadata; parse_item output wins on key clash
                            data_t = {
                                'name': name,
                                'url': modelurl,
                                'board': board,
                                'refurl': modelurl
                            }

                            data = dict(data_t, **data_p)
                            yield data

                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
            self.log('error in parse url %s' % url)
            yield None

        yield None  # end-of-stream sentinel
Beispiel #2
0
class CWebParserSite(CWebParserMultiUrl):
    """Parser for a site whose index pages list models ('div.modelItem'),
    each linking to a page of items.

    @author: chenzf
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # page fetching helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parsing
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # url de-duplication store

    def parse_page(self, url):
        """Yield one data dict per item found via the models listed on *url*.

        Yields None and stops when *url* is None, already recorded, or an
        error occurs; records *url* once fully parsed.
        """
        try:
            if url is None:
                yield None
                return  # fix: original fell through and fetched None

            if self.dbUtils.get_db_url(url):
                yield None
                return  # fix: original ignored the de-dup check and re-scraped

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # model cards on the index page
                items = a('div.content div.modelItem')

                for item in items.items():
                    model_url = item('a.title').attr('href')
                    model = item('a.title').text()
                    board = item('img').attr('src')

                    html2 = self.utils.get_page(model_url)
                    if html2:
                        b = pq(html2)
                        items_model = b('div.content div.item')
                        for item_model in items_model.items():
                            data_p = self.common.parse_item(item_model)
                            # base metadata; parse_item output wins on key clash
                            data_t = {
                                'name': model,
                                'url': model_url,
                                'board': board,
                                'refurl': url
                            }

                            data = dict(data_t, **data_p)
                            yield data

                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
            self.log('error in parse url %s' % url)
            yield None

        yield None  # end-of-stream sentinel
Beispiel #3
0
class CWebParserSite(CWebParserMultiUrl):
    """Parser for a post-table site paged in steps of 24 items.

    @author: chenzf
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.utils.verify = False  # site has a broken/self-signed TLS cert
        self.common = CWebParserSiteCommon(self)               # shared per-item parsing
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # url de-duplication store

    def parse_page(self, url):
        """Yield one data dict per post on *url*.

        The url is only recorded as done when every item parsed cleanly
        AND at least one item was produced.
        """
        try:
            if url is None:
                yield None
                return  # fix: original fell through and fetched None

            if self.dbUtils.get_db_url(url):
                yield None
                return  # fix: original ignored the de-dup check and re-scraped

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # post links in the listing table
                items = a('#posts > div.post-table > div > a')
                parse_succeed = True
                icount = 0  # number of items successfully yielded
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': data_p.get('brief').get('model'),
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }

                        data = dict(data_t, **data_p)
                        yield data
                        icount += 1
                    except Exception:  # fix: was a bare except
                        parse_succeed = False
                        continue
                if parse_succeed and icount > 0:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
            self.log('error in parse url %s' % url)
            yield None

        yield None  # end-of-stream sentinel

    def urls_genarator(self):
        """Yield page urls with a 24-item offset per page, then None."""
        for i in range(self.start, self.end + 1):
            yield self.url.format(page=i * 24)
        yield None
Beispiel #4
0
class CWebParserSite(CWebParserMultiUrl):
    """Parser for a thumbnail-index site paged in steps of 30 items.

    @author: chenzf
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # page fetching helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parsing
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # url de-duplication store

    def parse_page(self, url):
        """Yield one data dict per thumbnail on *url*.

        Records *url* as done only when every item parsed cleanly.
        """
        try:
            if url is None:
                yield None
                return  # fix: original fell through and fetched None

            if self.dbUtils.get_db_url(url) is not None:
                yield None
                return  # fix: original ignored the de-dup check and re-scraped

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # thumbnail links on the index page
                items = a(
                    '#inner-center-content div.index-thumb div.index-thumb-wrap div.thumb-wrap a'
                )
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': data_p.get('brief').get('name'),
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }

                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:  # fix: was a bare except
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
            self.log('error in parse url %s' % url)
            yield None

        yield None  # end-of-stream sentinel

    def urls_genarator(self):
        """Yield page urls with a 30-item offset per page, then None."""
        for i in range(self.start, self.end + 1):
            yield self.url.format(page=i * 30)
        yield None
Beispiel #5
0
class CWebParserSite(CWebParserMultiUrl):
    """Parser for an item-grid site that requires explicit request headers.

    @author: chenzf
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # page fetching helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parsing
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # url de-duplication store

    def parse_page(self, url):
        """Yield one data dict per grid item on *url*.

        Records *url* as done only when every item parsed cleanly.
        """
        try:
            if url is None:
                yield None
                return  # fix: original fell through and fetched None

            if self.dbUtils.get_db_url(url):
                yield None
                return  # fix: original ignored the de-dup check and re-scraped

            # Explicit (empty) headers are sent deliberately — presumably to
            # satisfy the server's request filtering; confirm before changing.
            html = self.utils.get_page(url, headers={"User-Agent": "",
                                                     "Accept": "",
                                                     "Accept-Encoding": "",
                                                     "Accept-Language": "",
                                                     "Cache-Control": "",
                                                     'Connection': ''})
            if html:
                a = pq(html)
                # item links in the grid
                items = a('div.inner-box-container > div.row div.item-col.col > div.item-inner-col.inner-col >a')
                parse_successed = True
                for item in items.items():
                    data_p = self.common.parse_item(item)
                    if not data_p:
                        parse_successed = False
                        continue

                    data_t = {
                        # fix: key was a duplicate 'url'; the name entry was
                        # silently overwritten (siblings use 'name' here)
                        'name': data_p.get('brief').get('name'),
                        'url': data_p.get('brief').get('url'),
                        'refurl': url
                    }

                    data = dict(data_t, **data_p)
                    yield data
                if parse_successed:
                    self.log('parsed url %s' % url)  # consistency with sibling parsers
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
            self.log('error in parse url %s' % url)
            yield None

        yield None  # end-of-stream sentinel
Beispiel #6
0
class CWebParserSite(CWebParserMultiUrl):
    """Parser for a gallery-thumbnail site; every item is filed under the
    fixed name "Galleries".

    @author: chenzf
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # page fetching helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parsing
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # url de-duplication store

    def parse_page(self, url):
        """Yield one data dict per gallery thumbnail on *url*.

        Records *url* as done only when every item parsed cleanly.
        """
        try:
            if url is None:
                yield None
                return  # fix: original fell through and fetched None

            if self.dbUtils.get_db_url(url):
                yield None
                return  # fix: original ignored the de-dup check and re-scraped

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                items = a(
                    'div.gallery-section div.thumb-list.gallery.thumb-list--sidebar div.thumb-list__item.gallery-thumb a'
                )
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': "Galleries",
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }

                        data = dict(data_t, **data_p)
                        yield data

                    except Exception:  # fix: was a bare except
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
            self.log('error in parse url %s' % url)
            yield None

        yield None  # end-of-stream sentinel
Beispiel #7
0
class CWebParserSite(CWebParserSingleUrl):
    """Single-url parser for a bootstrap-grid product listing.

    @author: chenzf
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # page fetching helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parsing
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # url de-duplication store

    def parse_page(self, url):
        """Yield one data dict per product on *url*.

        Records *url* as done only when every item parsed cleanly.
        """
        try:
            if not url:
                yield None
                return  # fix: original fell through and fetched None

            if self.dbUtils.get_db_url(url):
                yield None
                return  # fix: original ignored the de-dup check and re-scraped

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # product cells in the grid
                items = a('.products .contain .grid .col-sm-12')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': data_p.get('brief').get('name'),
                            'url': data_p.get('brief').get('url'),
                            # 'board': data_p.get('brief').get('board'),
                            'refurl': url
                        }

                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:  # fix: was a bare except
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.dbUtils.put_db_url(url)
            else:
                self.log('html none in parse url %s' % url)
        except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
            self.log('error in parse url %s' % url)
            yield None

        yield None  # end-of-stream sentinel
Beispiel #8
0
class CWebParserHunterSingleUrl(CWebParserSingleUrl):
    """Single-url 'hunter' parser: parse_page drives its own url
    generator instead of receiving a url argument."""

    def __init__(self, url, savePath, parseOnly):
        # NOTE(review): super(CWebParserSingleUrl, self).__init__ resolves to
        # the base *of* CWebParserSingleUrl, skipping CWebParserSingleUrl's
        # own __init__ entirely — confirm this is intentional.
        super(CWebParserSingleUrl, self).__init__(url)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)
        self.common = CWebParserHunterCommon(savePath, parseOnly)
        # parseOnly == 1 means "parse metadata only, skip downloads"
        self.parseOnly = parseOnly
    '''
    parse_page
    
    @author: chenzf
    '''
    def parse_page(self):
        """Iterate urls from urls_genarator() and yield one parsed item
        per gallery entry; yields None on errors or exhausted urls."""
        urlsGen = self.urls_genarator()
        while True:
            try:
                url = next(urlsGen)
                if not url:
                    # NOTE(review): this yield does not stop the generator —
                    # execution continues and fetches None, which is then
                    # caught by the outer except. Likely missing a return.
                    yield None

                html = self.utils.get_page(url)
                if html:
                    a = pq(html)
                    # gallery entries on the page
                    items = a('ul.gallery-a li')

                    for item in items.items():
                        try:
                            yield self.common.parse_item(item)
                        except:
                            self.log('error in parse item %s' % url)
                            continue
                    self.log('parsed url %s' % url)
                else:
                    self.log('request %s error' %url)

            except:
                self.log('error in parse url %s' % url)
                yield None

        yield None  # NOTE(review): unreachable — the while True above never breaks

    '''
    process_image
    
    @author: chenzf
    '''
    def process_data(self, data):
        """Hand *data* to the common processor unless in parse-only mode."""
        if self.parseOnly == 1:
           return
        self.common.process_data(data)
Beispiel #9
0
class CWebParserHunterSingleUrl(CWebParserSingleUrl):
    """Single-url 'hunter' parser: selects gallery entries preceding the
    first h2 in #content.

    @author: chenzf
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # page fetching helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parsing
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # url de-duplication store

    def parse_page(self, url):
        """Yield one data dict per gallery entry on *url*.

        Records *url* as done only when every item parsed cleanly.
        """
        try:
            if not url:
                yield None
                return  # fix: original fell through and fetched None

            if self.dbUtils.get_db_url(url):
                yield None
                return  # fix: original ignored the de-dup check and re-scraped

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # gallery entries before the first h2 section header
                items = a('#content h2').prev_all('ul.gallery-a li')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }

                        data = dict(data_t, **data_p)
                        yield data

                    except Exception:  # fix: was a bare except
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
            self.log('error in parse url %s' % url)
            yield None

        yield None  # end-of-stream sentinel
Beispiel #10
0
class CWebParserSite(CWebParserSingleUrl):
    """Single-url parser: walks artwork links to model pages and yields
    their film and massage items.

    @author: chenzf
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # page fetching helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parsing
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # url de-duplication store

    def parse_page(self, url):
        """Yield one data dict per film/massage item for each model linked
        from *url*.

        Each model url is recorded as done only when all of its items
        parsed cleanly; *url* itself is recorded once fully walked.
        """
        try:
            if url is None:
                yield None
                return  # fix: original fell through and fetched None

            html = self.utils.get_page(url)
            if html:
                a = pq(html, parser='html')
                # artwork links, one per model
                items = a('a.artwork')
                for item in items.items():
                    modelurl = urljoin('http://www.hegre.com/',
                                       item.attr('href').strip())
                    board = item('img').attr('src')
                    name = item.attr('title')

                    if self.dbUtils.get_db_url(modelurl):
                        continue  # model already parsed

                    html2 = self.utils.get_page(modelurl)
                    if html2:
                        # parse once (original re-parsed html2 per section)
                        b = pq(html2, parser='html')
                        parse_succeed = True
                        # films and massages sections share the same layout,
                        # so one loop over both selectors replaces the two
                        # byte-identical loops of the original
                        for selector in ('#films-wrapper div.item',
                                         '#massages-wrapper div.item'):
                            for item_model in b(selector).items():
                                try:
                                    data_p = self.common.parse_item(item_model)
                                    data_t = {
                                        'name': self.utils.format_name(name),
                                        'url': modelurl,
                                        'board': board,
                                        'refurl': modelurl
                                    }

                                    data = dict(data_t, **data_p)
                                    yield data
                                except Exception:  # fix: was a bare except
                                    parse_succeed = False
                                    continue

                        self.log('parsed url %s' % modelurl)
                        if parse_succeed:
                            self.dbUtils.put_db_url(modelurl)

                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
            self.log('error in parse url %s' % url)
            yield None

        yield None  # end-of-stream sentinel
Beispiel #11
0
class CWebParserSite(CWebParserMultiUrl):
    """BeautifulSoup-based parser: yields item dicts (with a fetched
    'detail' sub-dict) and downloads cover images plus an info.json."""

    def __init__(self, url, start, end, savePath):
        # NOTE(review): super(CWebParserMultiUrl, self).__init__ resolves to
        # the base *of* CWebParserMultiUrl, skipping CWebParserMultiUrl's own
        # __init__ entirely — confirm this is intentional.
        super(CWebParserMultiUrl, self).__init__(url, start, end)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)

    '''
    parse_page
    
    @author: chenzf
    '''

    def parse_page(self):
        """Walk urls from urls_genarator() and yield one data dict per
        item, including name, image urls and a fetched detail dict."""
        try:
            urlsGen = self.urls_genarator()
            while True:
                url = next(urlsGen)
                if not url:
                    # NOTE(review): this yield does not stop the generator —
                    # execution continues and fetches None. Likely missing a
                    # return/break here.
                    yield None
                html = self.utils.get_page(url)

                if html:
                    soup = BeautifulSoup(html, 'lxml')
                    items = soup.find_all('div', class_="item")

                    for item in items:
                        # small poster image url
                        posterImg = item.find(
                            'div',
                            class_='img-holder').find('img').attrs['src']

                        # optional mid-size lightbox image
                        mid = item.find('div', class_='cover-links').find(
                            'a',
                            attrs={'data-lightbox': "lightbox--posterImg"})
                        if not mid:
                            midUrl = None
                        else:
                            midUrl = mid.attrs['href']

                        # optional large board image
                        large = item.find('div', class_='cover-links').find(
                            'a',
                            attrs={'data-lightbox': "lightbox--board_image"})
                        if not large:
                            largeUrl = None
                        else:
                            largeUrl = large.attrs['href']

                        name = item.find('a', class_='open-in-content-overlay'
                                         ).attrs['title'].strip()
                        # NOTE(review): rebinds the page-level `url` to the
                        # item url — subsequent error messages refer to the
                        # item, not the page being walked.
                        url = urljoin(
                            'http://www.hegre.com/',
                            item.find('a', class_='open-in-content-overlay').
                            attrs['href'])
                        data = {
                            'name': self.utils.format_name(name),
                            'small': posterImg,
                            'mid': midUrl,
                            'large': largeUrl,
                            'url': url,
                            # fetches the item page synchronously; may be None
                            'detail': self.process_data_detail(url)
                        }
                        yield data
        except:
            # NOTE(review): bare except also swallows KeyboardInterrupt
            print('error in parse %s' % url)
            yield None

        yield None

    '''
    parse_page
    
    @author: chenzf
    '''

    def parse_page_detail(self, html):
        """Parse an item page: extract the board image url (from the
        non-members div's inline style), zip download links and date."""
        data = {}

        soup = BeautifulSoup(html, 'lxml')

        board = None
        item = soup.find('div', class_="content-overlay-wrapper")
        if item:
            # board image is embedded as a CSS background url(...)
            style_text = item.select_one(
                'div[class="non-members"]').attrs['style']
            board = re.search("url\((.*?)\)", style_text, re.S).group(1)
        data['board'] = board

        DownLoad = []
        items = soup.find_all('div', class_="gallery-zips")
        for item in items:
            DownLoad.append(item.find('a').attrs['href'])
        data['download'] = DownLoad

        data['date'] = soup.find('span', class_="date").string
        return data

    '''
    process_data_detail
    
    @author: chenzf
    '''

    def process_data_detail(self, url):
        """Fetch *url* and return its parsed detail dict, or None when the
        request fails."""
        detail = None
        html = self.utils.get_page(url)
        if html:
            detail = self.parse_page_detail(html)

        return detail

    '''
    process_data
    
    @author: chenzf
    '''

    #     @vthread.pool(8)
    def process_data(self, data):
        """Persist one item: write info.json, download the best available
        image and the cover board."""
        # NOTE(review): '\\' path separators are Windows-only; os.path.join
        # would be portable.
        dir_name = self.savePath.format(filePath=data.get('name'))
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        with open(dir_name + '\\info.json', 'w') as f:
            json.dump(data, f)

        # download the best-quality image available, largest first
        for subkeys in ['large', 'mid', 'small']:
            url = data.get(subkeys)
            if url:
                self.utils.download_file(
                    url, '%s\\%s' % (data.get('name'), data.get('name')))
                break

        # NOTE(review): detail may be None (see process_data_detail) — the
        # .get call below would then raise AttributeError; confirm callers
        # filter such items.
        detail = data.get('detail')
        board = detail.get('board')
        if board:
            self.utils.download_file(board,
                                     '%s\\%s' % (data.get('name'), 'cover'))
        elif data.get('mid'):
            board = data.get('mid')
            self.utils.download_file(board,
                                     '%s\\%s' % (data.get('name'), 'cover'))
Beispiel #12
0
class CWebParserSite(CWebParserSingleUrl):
    """Single-url parser: walks A–Z index pages of models and yields the
    items from each model's thumbnail page.

    @author: chenzf
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # page fetching helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parsing
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # url de-duplication store

    def parse_page(self, url):
        """Yield one data dict per thumbnail of each model linked on *url*.

        A model url is recorded as done only when all of its items parsed
        cleanly and at least one was produced.
        """
        try:
            if url is None:
                yield None
                return  # fix: original fell through and fetched None

            html = self.utils.get_page(url,
                                       headers={
                                           "Host": "godsartnudes.com",
                                           "Upgrade-Insecure-Requests": "1"
                                       })
            if html:
                a = pq(html)
                # last link in each model thumb is the model page
                items = a(
                    'div.row.gan-central div.col-xxs-12.col-xs-6.col-sm-4.col-md-3 div.Thumb a:last-of-type'
                )
                for item in items.items():
                    name = item.text()
                    # board = item('a img').attr('lsrc') + '.jpg'
                    model_url = urljoin('http://godsartnudes.com',
                                        item.attr('href'))

                    if self.dbUtils.get_db_url(model_url):
                        continue  # model already parsed

                    html2 = self.utils.get_page(model_url)
                    if html2:
                        b = pq(html2)
                        modelitems = b(
                            'div.row.spacetop div.col-xxs-12.col-xs-6.col-sm-4.col-md-3 div.thumbImage > a:first-child'
                        )
                        parse_succeed = True
                        processNum = 0  # items successfully yielded for this model
                        for modelitem in modelitems.items():
                            # fix: dropped the no-op `parse_succeed &= True`
                            try:
                                data_p = self.common.parse_item(modelitem)
                                data_t = {
                                    'name': name,
                                    'url': model_url,
                                    'refurl': url
                                }

                                data = dict(data_t, **data_p)
                                yield data
                                processNum += 1
                            except Exception:  # fix: was a bare except
                                parse_succeed = False
                                continue

                        if parse_succeed and processNum > 0:
                            self.log('parsed url %s' % model_url)
                            self.dbUtils.put_db_url(model_url)
            else:
                self.log('request %s error' % url)
        except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
            self.log('error in parse url %s' % url)
            yield None

        yield None  # end-of-stream sentinel

    def urls_genarator(self):
        """Yield one index url per letter A..Z, then None."""
        for url in range(ord("A"), ord("Z") + 1):
            yield self.url.format(page=chr(url))
        yield None
Beispiel #13
0
class CWebParserSite(CWebParserMultiUrl):
    """Parser for a Drupal-style grid site: index pages list models, each
    model page lists gallery items.

    @author: chenzf
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # page fetching helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parsing
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # url de-duplication store

    def parse_page(self, url):
        """Yield one data dict per gallery item of every model on *url*.

        Only grid cells whose 'about' attribute mentions 'galleries' are
        parsed; records *url* once fully walked.
        """
        try:
            if url is None:
                yield None
                return  # fix: original fell through and fetched None

            if self.dbUtils.get_db_url(url):
                yield None
                return  # fix: original ignored the de-dup check and re-scraped

            html = self.utils.get_page(url)
            if html:
                a = pq(html, parser='html')
                # model grid cells on the index page
                items = a('#block-system-main .node-grid')

                for item in items.items():
                    board = item('div.field-type-image img').attr('src')
                    name = item('.grid-meta a').text()
                    modelurl = urljoin('http://www.hegregirls.com/',
                                       item('.grid-meta a').attr('href'))

                    html2 = self.utils.get_page(modelurl)
                    if html2:
                        b = pq(html2, parser='html')
                        items_model = b(
                            '#main-content .content .content .grid-4')
                        for item_model in items_model.items():
                            try:
                                # only gallery cells are wanted
                                if not re.search('galleries',
                                                 item_model.attr('about')):
                                    continue

                                data_p = self.common.parse_item(item_model)
                                data_t = {
                                    'name': name,
                                    'url': modelurl,
                                    'board': board,
                                    'refurl': url
                                }

                                data = dict(data_t, **data_p)
                                yield data
                            except Exception:  # fix: was a bare except
                                continue

                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
            self.log('error in parse url %s' % url)
            yield None

        yield None  # end-of-stream sentinel
Beispiel #14
0
class CWebParserSite(CWebParserSingleUrl):
    """Single-url parser that follows 'Next Page' pagination itself,
    yielding one data dict per model item on each visited page."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    
    @author: chenzf
    '''

    def parse_page(self, url):
        """Starting at *url*, follow the 'Next Page' link until it runs
        out; yield one data dict per item of every model on each page."""
        try:
            if url is None:
                # NOTE(review): this yield does not stop the generator —
                # execution continues with url=None. Likely missing a return.
                yield None

            while True:
                html = self.utils.get_page(url)
                if html:
                    if self.dbUtils.get_db_url(url):
                        # page already recorded; still fall through below to
                        # advance to the next page
                        pass
                    else:
                        a = pq(html)
                        # model entries on the listing page
                        items = a('ul.set.sluts_main li')
                        parse_succeed = True
                        for item in items.items():
                            try:
                                name = item('b a').text()
                                board = item('a img').attr('lsrc') + '.jpg'
                                model_url = urljoin('https://www.hqsluts.com/',
                                                    item('b a').attr('href'))

                                html2 = self.utils.get_page(model_url)
                                if html2:
                                    b = pq(html2)
                                    modelitems = b('ul.set.s**t li')
                                    for modelitem in modelitems.items():
                                        try:
                                            data_p = self.common.parse_item(
                                                modelitem)
                                            # parse_item output wins on clash
                                            data_t = {
                                                'name':
                                                self.utils.format_name(name),
                                                'url':
                                                model_url,
                                                'board':
                                                board,
                                                'refurl':
                                                url
                                            }

                                            data = dict(data_t, **data_p)
                                            yield data
                                        except:
                                            parse_succeed = False
                                            continue
                            except:
                                parse_succeed = False
                                continue
                        if parse_succeed:
                            self.log('parsed url %s' % url)
                            self.dbUtils.put_db_url(url)

                    # pagination: follow the 'Next Page' link, if any
                    next_url = a('#pages li a[count="Next Page"]')
                    if next_url:
                        url = urljoin('https://www.hqsluts.com/',
                                      next_url.attr('href'))
                        self.log('request %s' % url)
                    else:
                        break
                else:
                    self.log('request %s error' % url)
                    # NOTE(review): retries the same url in a tight loop with
                    # no backoff or attempt limit — a permanently failing url
                    # spins forever. Consider a retry cap.
                    continue
        except:
            # NOTE(review): bare except also swallows KeyboardInterrupt
            self.log('error in parse url %s' % url)
            yield None

        yield None  # end-of-stream sentinel
Beispiel #15
0
class CWebParserSite(CWebParserMultiUrl):
    """Scraper for a pornstar-index site.

    Walks the model links on a listing page, probes the URL de-dup database
    to resume each model's pagination where a previous run stopped, then
    yields one dict per parsed item. ``None`` is yielded as a terminator.
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        # Page-fetching / file helpers rooted at the configured save path.
        self.utils = CWebSpiderUtils(self.savePath)
        # Shared per-item parsing logic for this site family.
        self.common = CWebParserSiteCommon(self)
        # URL de-duplication store; 'database' names the backing DB.
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page

    @author: chenzf
    '''

    def parse_page(self, url):
        """Generator: yield item dicts scraped from listing page ``url``.

        Yields ``None`` on errors and as the end-of-stream marker; callers
        are expected to treat ``None`` as a terminator.
        """
        try:
            if url is None:
                # NOTE(review): this yields None but does not return, so on
                # the next resume the code below still runs with url=None --
                # confirm the consumer stops iterating on None.
                yield None

            html = self.utils.get_page(url)
            if html:
                if self.dbUtils.get_db_url(url):
                    # Listing page already fully processed in a prior run.
                    pass
                else:
                    a = pq(html)
                    # Each model's name anchor -> model page URL + name.
                    items = a(
                        'body > div.main-wrap > div.best-list-block.hide-on-search > div.width-wrap > div.thumb-container div.pornstar-thumb-container div.pornstar-thumb-container__info div.pornstar-thumb-container__info-title a')
                    for item in items.items():
                        model_url_origin = item.attr('href')
                        name = item.text()

                        # Probe the DB for the highest page index already
                        # recorded so a resumed crawl restarts near where the
                        # previous run stopped.
                        index = 1
                        while True:
                            model_url = "%s/%s" % (model_url_origin, index)
                            if index == 1:
                                # Page 1 is stored under the bare model URL.
                                if self.dbUtils.get_db_url(model_url_origin):
                                    index = index + 1
                                    continue
                            elif self.dbUtils.get_db_url(model_url):
                                index = index + 1
                                continue

                            break

                        if index > 2:
                            # Step back to the last page known to be done;
                            # its "next" link points at the first unseen page.
                            index = index - 1
                            model_url = "%s/%s" % (model_url_origin, index)
                        else:
                            model_url = model_url_origin

                        # Follow the model's pagination until no next link.
                        while True:
                            self.log('request %s' % model_url)
                            html2 = self.utils.get_page(model_url)
                            if html2:
                                if self.dbUtils.get_db_url(model_url):
                                    # Page already parsed; still fall through
                                    # to read the next-page link below.
                                    pass
                                else:
                                    board = pq(html2)('div.pornstar-logo img').attr('src')
                                    data_ps, parse_res = self.parse_sub_page(html2)
                                    for data_p in data_ps:
                                        data_t = {
                                            'name': name,
                                            'url': model_url,
                                            'board': board,
                                            'refurl': url
                                        }

                                        # Item fields win over the template
                                        # on key collisions.
                                        data = dict(data_t, **data_p)
                                        yield data

                                    if parse_res:
                                        # Mark done only when every item on
                                        # the page parsed without error.
                                        self.log('parsed url %s' % model_url)
                                        self.dbUtils.put_db_url(model_url)

                                next_url = pq(html2)('li.next a').attr("href")
                                if next_url:
                                    model_url = next_url
                                else:
                                    break
                            else:
                                break;
            else:
                self.log('request %s error' % url)
        except:
            # Broad catch: any scraping error logs and ends this generator.
            self.log('error in parse url %s' % url)
            yield None

        yield None

    def parse_sub_page(self, html):
        """Parse one model page; return ``(item_dicts, success_flag)``.

        ``success_flag`` is ``None`` when the page has no items, ``True`` or
        ``False`` otherwise (see NOTE below on its exact meaning).
        """
        b = pq(html)
        items = b(
            'body > div.main-wrap > main > div > article > div.index-videos.mixed-section > div.thumb-list.thumb-list--sidebar.thumb-list--recent > div.thumb-list__item.video-thumb a.video-thumb-info__name')

        sub_datas = []
        parse_successed = None
        for item in items.items():
            try:
                data_p = self.common.parse_item(item)
                sub_datas.append(data_p)

                if not parse_successed:
                    # NOTE(review): False is falsy, so a success AFTER a
                    # failure resets the flag to True -- effectively only the
                    # last item's outcome survives. Verify this is intended.
                    parse_successed = True
                else:
                    parse_successed = True & parse_successed
            except:
                parse_successed = False

        return sub_datas, parse_successed
Beispiel #16
0
class CWebParserSite(CWebParserSingleUrl):
    """Scraper for babesmachine.com model listings.

    parse_page

    @author: chenzf
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))
        # This site needs TLS certificate verification disabled.
        self.utils.verify = False

    def parse_page(self, url):
        """Yield one item dict per post of every model linked from ``url``.

        ``None`` is yielded on errors and as the end-of-stream marker.
        """
        try:
            if url is None:
                yield None

            html = self.utils.get_page(url)
            if not html:
                self.log('request %s error' % url)
            else:
                listing = pq(html)
                for link in listing('#models tr td a').items():
                    model_name = link.attr('title')
                    model_url = urljoin('https://www.babesmachine.com',
                                        link.attr('href'))

                    if self.dbUtils.get_db_url(model_url):
                        yield None

                    model_html = self.utils.get_page(model_url)
                    if not model_html:
                        self.log('request %s error' % model_url)
                        continue

                    all_ok = True
                    for post in pq(model_html)('#posts tr td a').items():
                        try:
                            parsed = self.common.parse_item(post)
                            record = dict({
                                'name': model_name,
                                'url': model_url,
                                'refurl': url
                            }, **parsed)
                            yield record
                        except:
                            all_ok = False
                            continue

                    if all_ok:
                        # Mark the model done only if every post parsed.
                        self.log('parsed url %s' % model_url)
                        self.dbUtils.put_db_url(model_url)
        except:
            self.log('error in parse url %s' % url)
            yield None

        yield None
Beispiel #17
0
class CWebParserSite(CWebParserSingleUrl):
    """Scraper for a paginated album-category site.

    Category URLs come from ``urls_genarator``; each category's albums are
    fetched through the site's async block endpoint and yielded as dicts.
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        # Page-fetching / file helpers rooted at the configured save path.
        self.utils = CWebSpiderUtils(self.savePath)
        # Shared per-item parsing logic for this site family.
        self.common = CWebParserSiteCommon(self)
        # URL de-duplication store; 'database' names the backing DB.
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    
    @author: chenzf
    '''

    def parse_page(self, url):
        """Generator: yield one dict per album in category ``url``.

        ``None`` is yielded on errors and as the end-of-stream marker.
        """
        try:
            if not url:
                # NOTE(review): yields None but does not return; execution
                # resumes below on the consumer's next next() call.
                yield None

            if self.dbUtils.get_db_url(url):
                # Category already fully processed in a prior run.
                yield None

            end_pos = url.rfind('/') - 1  # one char before the last '/'
            start_pos = url.rfind(
                '/', 0,
                end_pos)  # first '/' from the right within [0, end_pos), i.e. the second-to-last '/' of the URL
            name = url[start_pos + 1:]  # category name: everything after the second-to-last '/'

            # Number of async album blocks; defaults to 1 when the page has
            # no load-more button (or the request fails).
            data_total = 1
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                data_total = a('button.js-load-more').attr('data-total')
                if not data_total:
                    data_total = 1

            if int(data_total) > 0:
                for page in range(1, int(data_total) + 1):
                    try:
                        # Each block is served by the site's async endpoint.
                        cate_url = '%s?mode=async&action=get_block&block_id=list_albums_common_albums_list&from=%s' % (
                            url, page)

                        if self.dbUtils.get_db_url(cate_url):
                            continue

                        html = self.utils.get_page(cate_url)
                        if html:
                            b = pq(html)

                            items = b('div.masonry_item >a')
                            for item in items.items():
                                data_p = self.common.parse_item(item)
                                data_t = {
                                    'name': name,
                                    'url': data_p.get('brief').get('url'),
                                    'refurl': cate_url
                                }

                                # Item fields win over the template on key
                                # collisions.
                                data = dict(data_t, **data_p)
                                yield data
                            # Mark this block done after all items parsed.
                            self.dbUtils.put_db_url(cate_url)
                    except:
                        # Any failure in one block skips to the next block.
                        continue
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except:
            self.log('error in parse url %s' % url)
            yield None

        yield None

    '''
    urls_genarator
    
    @author: chenzf
    '''

    def urls_genarator(self):
        """Yield category links from the root listing; ``None`` terminates.

        NOTE(review): yields (href, title) tuples, unlike the bare-URL yields
        of sibling generators -- confirm the consumer expects a pair here.
        """
        html = self.utils.get_page(self.url)
        if html:
            a = pq(html)
            categorys = a('div.masonry_item a')
            for category in categorys.items():
                yield category.attr('href'), category.attr('title')
        yield None
Beispiel #18
0
class CWebParserSite(CWebParserSingleUrl):
    """Scraper for xnudegirls.com galleries.

    parse_page

    @author: chenzf
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Yield an item dict for every thumb on gallery page ``url``.

        ``None`` is yielded on errors and as the end-of-stream marker.
        """
        try:
            if not url:
                yield None

            if self.dbUtils.get_db_url(url):
                yield None

            html = self.utils.get_page(url)
            if not html:
                self.log('request %s error' % url)
            else:
                # Gallery name is the second-to-last path segment.
                gallery = url.split('/')[-2]

                all_ok = True
                for thumb in pq(html)('#content div.wrap.wrap2 div.thumbs a').items():
                    try:
                        parsed = self.common.parse_item(thumb)
                        record = dict({
                            'name': gallery,
                            'url': parsed.get('brief').get('url'),
                            'refurl': url
                        }, **parsed)
                        yield record
                    except:
                        all_ok = False
                        continue

                if all_ok:
                    # Mark the page done only if every thumb parsed cleanly.
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
        except:
            self.log('error in parse url %s' % url)
            yield None

        yield None

    def urls_genarator(self):
        """Yield absolute gallery URLs from the configured root listing;
        a final ``None`` terminates the stream.
        """
        listing = self.utils.get_page(self.url)
        if listing:
            for anchor in pq(listing)('ul.bottomLists2  ul li a').items():
                yield urljoin('http://xnudegirls.com/', anchor.attr('href'))
        yield None