Example #1
0
class CWebParserSite(CWebParserMultiUrl):
    """Multi-URL gallery parser: yields one data dict per gallery thumbnail.

    ``None`` is yielded as an end-of-stream sentinel.
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Parse one listing page and yield item dicts, then ``None``.

        @author: chenzf

        :param url: page URL to fetch; ``None``/already-seen URLs yield
                    only the sentinel.
        """
        try:
            # Fix: the original yielded the sentinel but fell through and
            # kept executing; ``return`` makes the guard actually stop.
            if url is None:
                yield None
                return

            if self.dbUtils.get_db_url(url):
                # URL already processed in a previous run.
                yield None
                return

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                items = a(
                    'div.gallery-section div.thumb-list.gallery.thumb-list--sidebar div.thumb-list__item.gallery-thumb a'
                )
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': "Galleries",
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }

                        data = dict(data_t, **data_p)
                        yield data

                    except Exception:
                        # Fix: a bare ``except:`` here would also swallow
                        # GeneratorExit and break generator close().
                        parse_succeed = False
                        continue
                if parse_succeed:
                    # Mark done only when every item parsed cleanly, so a
                    # partial failure is retried next run.
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None
Example #2
0
class CWebParserHunterSingleUrl(CWebParserSingleUrl):
    """Single-URL parser: yields item dicts from 'ul.gallery-a li' entries."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Parse one page of galleries and yield item dicts, then ``None``.

        @author: chenzf

        :param url: page URL; falsy or already-seen URLs yield only the
                    ``None`` sentinel.
        """
        try:
            # Fix: original guards yielded the sentinel but fell through;
            # ``return`` is required to actually stop the generator.
            if not url:
                yield None
                return

            if self.dbUtils.get_db_url(url):
                yield None
                return

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('#content h2').prev_all('ul.gallery-a li')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }

                        data = dict(data_t, **data_p)
                        yield data

                    except Exception:
                        # Fix: bare ``except:`` would also trap GeneratorExit.
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None
Example #3
0
class CWebParserSite(CWebParserSingleUrl):
    """Single-URL parser for '.products' grid pages; yields item dicts."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Parse one product-grid page and yield item dicts, then ``None``.

        @author: chenzf

        :param url: page URL; falsy or already-seen URLs yield only the
                    ``None`` sentinel.
        """
        try:
            # Fix: guards must ``return`` after yielding the sentinel,
            # otherwise execution falls through into the fetch.
            if not url:
                yield None
                return

            if self.dbUtils.get_db_url(url):
                yield None
                return

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('.products .contain .grid .col-sm-12')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': data_p.get('brief').get('name'),
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }

                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:
                        # Fix: bare ``except:`` would also trap GeneratorExit.
                        parse_succeed = False
                        continue
                if parse_succeed:
                    # Record the URL only on a fully clean parse.
                    self.dbUtils.put_db_url(url)
            else:
                self.log('html none in parse url %s' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None
Example #4
0
class CWebParserSite(CWebParserMultiUrl):
    """Multi-URL parser for profile listings ('.listProfiles li').

    ``common.parse_item`` is itself a generator here; each profile may
    produce several data dicts.
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Parse one profile-list page and yield item dicts, then ``None``.

        @author: chenzf
        """
        try:
            # Fix: guards must ``return`` after yielding the sentinel.
            if not url:
                yield None
                return

            if self.dbUtils.get_db_url(url):
                yield None
                return

            html = self.utils.get_page(url)
            if html:
                soup = pq(html)
                items = soup('.listProfiles li')

                for item in items.items():
                    try:
                        # Fix: the original called next() until a falsy value,
                        # so plain exhaustion raised StopIteration and was
                        # mislogged as an item error.  Iterate directly and
                        # stop early on a falsy result.
                        for data in self.common.parse_item(item):
                            if not data:
                                break
                            yield data
                    except Exception:
                        # Exception (not bare except): keep GeneratorExit
                        # propagating so close() works.
                        self.log('error in item in url %s' % url)
                        continue
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
Example #5
0
class CWebParserSite(CWebParserMultiUrl):
    """Multi-URL parser for 'ul.gallery-d li' listings; yields item dicts."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Parse one gallery page and yield item dicts, then ``None``.

        @author: chenzf

        :param url: page URL; falsy or already-seen URLs yield only the
                    ``None`` sentinel.
        """
        try:
            # Fix: guards must ``return`` after yielding the sentinel,
            # otherwise the fetch below runs anyway.
            if not url:
                yield None
                return

            if self.dbUtils.get_db_url(url):
                yield None
                return

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('ul.gallery-d li')

                for item in items.items():
                    data_p = self.common.parse_item(item)
                    data_t = {
                        'name': data_p.get('brief').get('name'),
                        'url': data_p.get('brief').get('url'),
                        'board': data_p.get('brief').get('board'),
                        'refurl': url
                    }

                    data = dict(data_t, **data_p)
                    yield data

                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            # Fix: bare ``except:`` around yields would also trap
            # GeneratorExit and break generator close().
            self.log('error in parse url %s' % url)
            yield None

        yield None
Example #6
0
class CWebParserSite(CWebParserMultiUrl):
    """Multi-URL parser for 'li.g1-collection-item' entries."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Parse one collection page and yield item dicts, then ``None``.

        @author: chenzf
        """
        try:
            # Fix: guards must ``return`` after yielding the sentinel.
            if not url:
                yield None
                return

            if self.dbUtils.get_db_url(url):
                yield None
                return

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('li.g1-collection-item')

                for item in items.items():
                    data = self.common.parse_item(item)
                    yield data

                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            # Fix: bare ``except:`` around yields would also trap
            # GeneratorExit and break generator close().
            self.log('error in parse url %s' % url)
            yield None

        yield None
Example #7
0
class CWebParserSite(CWebParserSingleUrl):
    """Category crawler: walks each category's paginated video listings.

    Resume-aware — pages already recorded in the URL DB are skipped by
    fast-forwarding the page index before crawling.
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Parse one category, yielding video item dicts, then ``None``.

        @author: chenzf
        """
        try:
            # Fix: the original yielded the sentinel but fell through;
            # ``return`` makes the guard actually stop the generator.
            if not url:
                yield None
                return

            url_origin = url
            # Fast-forward past pages already stored in the DB; page 1 is
            # keyed by the bare category URL.
            index = 1
            while True:
                search_url = "%s?page=%s" % (url_origin, index)
                if index == 1:
                    if self.dbUtils.get_db_url(url_origin):
                        index = index + 1
                        continue
                elif self.dbUtils.get_db_url(search_url):
                    index = index + 1
                    continue
                break

            # Re-visit the last known-done page so its 'next' link is found.
            if index > 2:
                index = index - 1
                search_url = "%s?page=%s" % (url_origin, index)
            else:
                search_url = url_origin

            while True:
                self.log('request %s' % search_url)
                html2 = self.utils.get_page(search_url)
                if not html2:
                    break

                if not self.dbUtils.get_db_url(search_url):
                    a = pq(html2)
                    items = a('div.js_video_row  div.video-box  a.video-box-image')
                    parse_succeed = True
                    for item in items.items():
                        try:
                            data_p = self.common.parse_item(item)
                            if not data_p:
                                parse_succeed = False
                                continue
                            elif self.common.parse_detail_fr_brief_duplicate(data_p):
                                # Detail already stored; skip silently.
                                continue

                            data_t = {
                                'name': 'Categories',
                                'url': data_p.get('brief').get('url'),
                            }

                            data = dict(data_t, **data_p)
                            yield data
                        except Exception:
                            # Fix: bare ``except:`` would trap GeneratorExit.
                            parse_succeed = False
                            continue

                    if parse_succeed:
                        self.log('parsed url %s' % search_url)
                        self.dbUtils.put_db_url(search_url)
                    else:
                        self.log('request %s error' % search_url)

                # Advance via the page number embedded in the pager link.
                next_url = pq(html2)('#next .prev-next a').attr("data-page-number")
                if next_url:
                    search_url = "%s?page=%s" % (url_origin, next_url)
                else:
                    break
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None

    def urls_genarator(self):
        """Yield each category URL from the site index, then ``None``."""
        html = self.utils.get_page(self.url)
        if html:
            a = pq(html)
            categorys = a('#categoryList a.categoryBox')
            for category in categorys.items():
                yield urljoin("https://www.youporn.com", category.attr('href'))
        yield None
Example #8
0
class CWebParserSite(CWebParserMultiUrl):
    """Pornstar-index crawler: walks each model's paginated listing pages."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Parse one index page; for each model, crawl all listing pages.

        @author: chenzf
        """
        try:
            # Fix: the original yielded the sentinel but fell through;
            # ``return`` makes the guard actually stop the generator.
            if url is None:
                yield None
                return

            html = self.utils.get_page(url)
            if html:
                if not self.dbUtils.get_db_url(url):
                    a = pq(html)
                    # items
                    items = a('#content > ul  li.pornstars a')
                    for item in items.items():
                        name = item('a img').attr('alt')
                        board = item('a img').attr('src')
                        model_url_origin = urljoin(
                            'https://www.thumbzilla.com/', item.attr('href'))

                        # Fast-forward past pages already in the DB; page 1
                        # is keyed by the bare model URL.
                        index = 1
                        while True:
                            model_url = "%s?page=%s" % (model_url_origin,
                                                        index)
                            if index == 1:
                                if self.dbUtils.get_db_url(model_url_origin):
                                    index = index + 1
                                    continue
                            elif self.dbUtils.get_db_url(model_url):
                                index = index + 1
                                continue

                            break

                        # Re-visit the last done page so its 'next' link is
                        # available for pagination.
                        if index > 2:
                            index = index - 1
                            model_url = "%s?page=%s" % (model_url_origin,
                                                        index)
                        else:
                            model_url = model_url_origin

                        while True:
                            self.log('request %s' % model_url)
                            html2 = self.utils.get_page(model_url)
                            if not html2:
                                break

                            if not self.dbUtils.get_db_url(model_url):
                                data_ps, parse_res = self.parse_sub_page(
                                    html2)
                                for data_p in data_ps:
                                    data_t = {
                                        'name': name,
                                        'url': model_url,
                                        'board': board,
                                        'refurl': url
                                    }

                                    data = dict(data_t, **data_p)
                                    yield data

                                if parse_res:
                                    self.log('parsed url %s' % model_url)
                                    self.dbUtils.put_db_url(model_url)

                            next_url = pq(html2)('li.page_next a')
                            if next_url:
                                model_url = urljoin(
                                    'https://www.thumbzilla.com',
                                    next_url.attr('href'))
                            else:
                                break
            else:
                self.log('request %s error' % url)
        except Exception:
            # Fix: bare ``except:`` around yields would also trap
            # GeneratorExit and break generator close().
            self.log('error in parse url %s' % url)
            yield None

        yield None

    def parse_sub_page(self, html):
        """Parse one model listing page.

        :param html: raw page HTML.
        :returns: ``(sub_datas, parse_successed)`` where ``parse_successed``
                  is ``None`` for an empty page, ``False`` if any item
                  failed, ``True`` if at least one item parsed and none
                  failed (same tri-state as the original bookkeeping).
        """
        b = pq(html)
        items = b('#content ul li:gt(4) a')

        sub_datas = []
        parse_successed = None
        for item in items.items():
            try:
                sub_datas.append(self.common.parse_item(item))
                # First success flips None -> True; a prior failure sticks.
                if parse_successed is None:
                    parse_successed = True
            except Exception:
                parse_successed = False

        return sub_datas, parse_successed
Example #9
0
class CWebParserSite(CWebParserSingleUrl):
    """Model-showcase crawler: follows each model's paginated product pages."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Parse the showcase index; crawl every model's product pages.

        @author: chenzf

        Bug fixed: the original did ``continue`` when a page was already in
        the DB, which skipped the ``link[rel=next]`` lookup and re-fetched
        the same page forever (infinite loop).
        """
        try:
            # Fix: guard must ``return`` after yielding the sentinel.
            if not url:
                yield None
                return

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a(
                    'div.ts-responsive-wrap div.tshowcase-inner-box div.tshowcase-box-photo > a'
                )

                for item in items.items():
                    modelurl = item.attr('href')
                    modelsearch = modelurl
                    name = item('img').attr('title')
                    board = item('img').attr('src')

                    try:
                        while modelsearch is not None:
                            html = self.utils.get_page(modelsearch)
                            if not html:
                                modelsearch = None
                                continue

                            b = pq(html)
                            if not self.dbUtils.get_db_url(modelsearch):
                                products = b('div.home_box > a')
                                for product in products.items():
                                    data_p = self.common.parse_item(
                                        product)
                                    data_t = {
                                        'name':
                                        self.utils.format_name(name),
                                        'url': modelurl,
                                        'board': board,
                                        'refurl': modelurl
                                    }

                                    data = dict(data_t, **data_p)
                                    yield data

                                self.dbUtils.put_db_url(modelsearch)

                            # Always advance to the next page, even when the
                            # current one was already recorded.
                            nexturl = b('link[rel=next]')
                            if nexturl:
                                modelsearch = nexturl.attr('href')
                            else:
                                modelsearch = None
                    except Exception:
                        # Fix: bare ``except:`` would also trap GeneratorExit.
                        continue

                self.log('parsed url %s' % url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None
Example #10
0
class CWebParserSite(CWebParserSingleUrl):
    """Single-URL crawler: walks a link index, then each model's page."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Parse the link index and each model page; yield dicts, then ``None``.

        @author: chenzf
        """
        try:
            # Fix: guard must ``return`` after yielding the sentinel.
            if not url:
                yield None
                return

            # Empty Accept-Encoding: request uncompressed content.
            html = self.utils.get_page(url, headers={
                "Accept-Encoding": "",
            })
            if html:
                a = pq(html)
                # items
                items = a('ul.links li.hideli')

                for item in items.items():
                    modelurl = item('a').attr('href')
                    name = item('a').attr('title')

                    if self.dbUtils.get_db_url(modelurl):
                        continue

                    html = self.utils.get_page(modelurl,
                                               headers={
                                                   "Accept-Encoding": "",
                                               })
                    if html:
                        b = pq(html)
                        products = b('li.box-shadow')
                        try:
                            for product in products.items():
                                data_p = self.common.parse_item(product)
                                data_t = {
                                    'name': name,
                                    'url': modelurl,
                                    'refurl': modelurl
                                }

                                data = dict(data_t, **data_p)
                                yield data
                        except Exception:
                            # Fix: bare ``except:`` would trap GeneratorExit.
                            # On failure skip put_db_url so the model retries.
                            continue

                        self.dbUtils.put_db_url(modelurl)

                self.log('parsed url %s' % url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)

        yield None
Example #11
0
class CWebParserSite(CWebParserSingleUrl):
    """Model-table crawler; TLS verification disabled for this site."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))
        self.utils.verify = False

    def parse_page(self, url):
        """Parse the model table; crawl each model's posts.

        @author: chenzf

        Bug fixed: an already-seen model URL previously did ``yield None``
        (emitting the end-of-stream sentinel mid-stream) and then fetched
        the page anyway; it now skips the model with ``continue``, matching
        the sibling parsers.
        """
        try:
            # Fix: guard must ``return`` after yielding the sentinel.
            if url is None:
                yield None
                return

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('#models tr td a')
                for item in items.items():
                    name = item.attr('title')
                    model_url = urljoin('https://www.babesmachine.com',
                                        item.attr('href'))

                    if self.dbUtils.get_db_url(model_url):
                        continue

                    html2 = self.utils.get_page(model_url)
                    if html2:
                        b = pq(html2)
                        modelitems = b('#posts tr td a')
                        parse_succeed = True
                        for modelitem in modelitems.items():
                            try:
                                data_p = self.common.parse_item(modelitem)
                                data_t = {
                                    'name': name,
                                    'url': model_url,
                                    'refurl': url
                                }

                                data = dict(data_t, **data_p)
                                yield data
                            except Exception:
                                # Fix: bare ``except:`` would also trap
                                # GeneratorExit.
                                parse_succeed = False
                                continue

                        if parse_succeed:
                            self.log('parsed url %s' % model_url)
                            self.dbUtils.put_db_url(model_url)
                    else:
                        self.log('request %s error' % model_url)
                        continue
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None
Example #12
0
class CWebParserSite(CWebParserSingleUrl):
    """Album-category crawler: pages via the site's async block endpoint."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Parse one category, paging through its async album blocks.

        @author: chenzf
        """
        try:
            # Fix: guards must ``return`` after yielding the sentinel.
            if not url:
                yield None
                return

            if self.dbUtils.get_db_url(url):
                yield None
                return

            # Derive the category name from the path segment after the
            # second-to-last '/' in the URL.
            end_pos = url.rfind('/') - 1  # last '/' position, shifted left one
            start_pos = url.rfind(
                '/', 0,
                end_pos)  # first '/' from the right before end_pos, i.e. the second-to-last '/'
            name = url[start_pos + 1:]  # everything after the second-to-last '/'

            # Total page count advertised by the load-more button; defaults
            # to 1 when missing so at least the first block is fetched.
            data_total = 1
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                data_total = a('button.js-load-more').attr('data-total')
                if not data_total:
                    data_total = 1

            if int(data_total) > 0:
                for page in range(1, int(data_total) + 1):
                    try:
                        cate_url = '%s?mode=async&action=get_block&block_id=list_albums_common_albums_list&from=%s' % (
                            url, page)

                        if self.dbUtils.get_db_url(cate_url):
                            continue

                        html = self.utils.get_page(cate_url)
                        if html:
                            b = pq(html)

                            items = b('div.masonry_item >a')
                            for item in items.items():
                                data_p = self.common.parse_item(item)
                                data_t = {
                                    'name': name,
                                    'url': data_p.get('brief').get('url'),
                                    'refurl': cate_url
                                }

                                data = dict(data_t, **data_p)
                                yield data
                            self.dbUtils.put_db_url(cate_url)
                    except Exception:
                        # Fix: bare ``except:`` would also trap GeneratorExit.
                        continue
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None

    def urls_genarator(self):
        """Yield ``(href, title)`` for each category link, then ``None``."""
        html = self.utils.get_page(self.url)
        if html:
            a = pq(html)
            categorys = a('div.masonry_item a')
            for category in categorys.items():
                yield category.attr('href'), category.attr('title')
        yield None
Example #13
0
class CWebParserSite(CWebParserMultiUrl):
    """Thumbnail-index crawler; TLS verification disabled for this site."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.utils.verify = False
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Parse one index page; crawl each girl's thumbnail gallery.

        @author: chenzf
        """
        try:
            # Fix: guards must ``return`` after yielding the sentinel.
            if url is None:
                yield None
                return

            if self.dbUtils.get_db_url(url):
                yield None
                return

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('div.thumbs div.thumb > a')
                processNum = 0
                parse_succeed = True
                for item in items.items():
                    try:
                        name = item.text()
                        model_url = urljoin('https://www.erosberry.com/',
                                            item.attr('href'))

                        html2 = self.utils.get_page(model_url)
                        if html2:
                            b = pq(html2)
                            board = urljoin('https://www.erosberry.com/',
                                            b('div.info > img').attr('src'))
                            modelitems = b('div.girl_thumbs div.container > a')
                            for modelitem in modelitems.items():
                                try:
                                    data_p = self.common.parse_item(modelitem)
                                    data_t = {
                                        'name': name,
                                        'url': model_url,
                                        'board': board,
                                        'refurl': url
                                    }

                                    data = dict(data_t, **data_p)
                                    yield data
                                    processNum += 1
                                except Exception:
                                    # Fix: bare ``except:`` would also trap
                                    # GeneratorExit around the yield.
                                    parse_succeed = False
                                    continue
                    except Exception:
                        parse_succeed = False
                        continue
                # Record the page only if everything parsed and at least one
                # item was actually produced.
                if parse_succeed and processNum > 0:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None

    def urls_genarator(self):
        """Yield page URLs (44 items per page offset), then ``None``."""
        for i in range(self.start, self.end + 1):
            yield self.url.format(page=i * 44)
        yield None
Example #14
0
class CWebParserSite(CWebParserSingleUrl):
    """Artwork crawler: parses films and massages sections per model."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Parse the artwork index; crawl each model's two item sections.

        @author: chenzf
        """
        try:
            # Fix: guard must ``return`` after yielding the sentinel.
            if url is None:
                yield None
                return

            html = self.utils.get_page(url)
            if html:
                a = pq(html, parser='html')
                # items
                items = a('a.artwork')
                for item in items.items():
                    modelurl = urljoin('http://www.hegre.com/',
                                       item.attr('href').strip())
                    board = item('img').attr('src')
                    name = item.attr('title')

                    if self.dbUtils.get_db_url(modelurl):
                        continue

                    parse_succeed = True
                    html2 = self.utils.get_page(modelurl)
                    if html2:
                        b = pq(html2, parser='html')
                        # The films and massages sections were two
                        # copy-pasted loops; fold them into one.
                        for selector in ('#films-wrapper div.item',
                                         '#massages-wrapper div.item'):
                            for item_model in b(selector).items():
                                try:
                                    data_p = self.common.parse_item(item_model)
                                    data_t = {
                                        'name': self.utils.format_name(name),
                                        'url': modelurl,
                                        'board': board,
                                        'refurl': modelurl
                                    }

                                    data = dict(data_t, **data_p)
                                    yield data
                                except Exception:
                                    # Fix: bare ``except:`` would also trap
                                    # GeneratorExit around the yield.
                                    parse_succeed = False
                                    continue

                        self.log('parsed url %s' % modelurl)
                        if parse_succeed:
                            self.dbUtils.put_db_url(modelurl)

                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None
Example #15
0
class CWebParserSite(CWebParserSingleUrl):
    """Spider for godsartnudes.com.

    Walks one alphabetical index page per call, visits each model page
    found there and yields one data dict per gallery thumbnail.  A bare
    ``None`` is yielded as the end/error sentinel.
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # fetch/file helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parser
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # visited-URL store

    '''
    parse_page

    @author: chenzf
    '''

    def parse_page(self, url):
        """Yield one merged data dict per gallery item of every model on *url*.

        Yields None as a sentinel for "no url", on unexpected errors, and at
        the end of normal processing.
        """
        try:
            if url is None:
                yield None
                # FIX: stop the generator here; previously a resume after the
                # sentinel would try to fetch a None url.
                return

            html = self.utils.get_page(url,
                                       headers={
                                           "Host": "godsartnudes.com",
                                           "Upgrade-Insecure-Requests": "1"
                                       })
            if html:
                a = pq(html)
                # model thumbnails on the index page
                items = a(
                    'div.row.gan-central div.col-xxs-12.col-xs-6.col-sm-4.col-md-3 div.Thumb a:last-of-type'
                )
                for item in items.items():
                    name = item.text()
                    model_url = urljoin('http://godsartnudes.com',
                                        item.attr('href'))

                    # skip models already recorded as fully parsed
                    if self.dbUtils.get_db_url(model_url):
                        continue

                    html2 = self.utils.get_page(model_url)
                    if html2:
                        b = pq(html2)
                        modelitems = b(
                            'div.row.spacetop div.col-xxs-12.col-xs-6.col-sm-4.col-md-3 div.thumbImage > a:first-child'
                        )
                        parse_succeed = True
                        processNum = 0
                        for modelitem in modelitems.items():
                            try:
                                data_p = self.common.parse_item(modelitem)
                                data_t = {
                                    'name': name,
                                    'url': model_url,
                                    'refurl': url
                                }

                                data = dict(data_t, **data_p)
                                yield data
                                processNum += 1
                            except Exception:
                                # FIX: was a bare ``except:`` which also caught
                                # GeneratorExit when the consumer closed us.
                                parse_succeed = False
                                continue

                        # mark the model done only when every item parsed
                        if parse_succeed and processNum > 0:
                            self.log('parsed url %s' % model_url)
                            self.dbUtils.put_db_url(model_url)
            else:
                self.log('request %s error' % url)
        except Exception:
            # FIX: narrowed from a bare except; yielding from a GeneratorExit
            # handler raises "generator ignored GeneratorExit".
            self.log('error in parse url %s' % url)
            yield None

        yield None

    def urls_genarator(self):
        """Yield one index URL per letter A-Z, then a None sentinel.

        (Method name kept as-is: it implements the base-class hook.)
        """
        for letter_code in range(ord("A"), ord("Z") + 1):
            yield self.url.format(page=chr(letter_code))
        yield None
# Example #16
# 0
class CWebParserSite(CWebParserMultiUrl):
    """Spider for hegregirls.com.

    Parses one listing page per call, follows each model link and yields one
    data dict per gallery item.  ``None`` is the end/error sentinel.
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # fetch/file helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parser
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # visited-URL store

    '''
    parse_page

    @author: chenzf
    '''

    def parse_page(self, url):
        """Yield one merged data dict per gallery item of every model on *url*.

        Yields None as a sentinel for "no url", already-visited urls,
        unexpected errors, and at the end of normal processing.
        """
        try:
            if url is None:
                yield None
                # FIX: stop here; previously a resume after the sentinel
                # would try to fetch a None url.
                return

            if self.dbUtils.get_db_url(url):
                yield None
                # FIX: stop here; previously execution fell through and
                # re-fetched and re-yielded an already-visited page.
                return

            html = self.utils.get_page(url)
            if html:
                a = pq(html, parser='html')
                # model grid on the listing page
                items = a('#block-system-main .node-grid')

                for item in items.items():
                    board = item('div.field-type-image img').attr('src')
                    name = item('.grid-meta a').text()
                    modelurl = urljoin('http://www.hegregirls.com/',
                                       item('.grid-meta a').attr('href'))

                    html2 = self.utils.get_page(modelurl)
                    if html2:
                        b = pq(html2, parser='html')
                        items_model = b(
                            '#main-content .content .content .grid-4')
                        for item_model in items_model.items():
                            try:
                                # only gallery tiles carry 'galleries' in
                                # their 'about' attribute
                                if not re.search('galleries',
                                                 item_model.attr('about')):
                                    continue

                                data_p = self.common.parse_item(item_model)
                                data_t = {
                                    'name': name,
                                    'url': modelurl,
                                    'board': board,
                                    'refurl': url
                                }

                                data = dict(data_t, **data_p)
                                yield data
                            except Exception:
                                # FIX: was a bare except (also caught
                                # GeneratorExit); an item with no 'about'
                                # attr raises TypeError and is skipped here.
                                continue

                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            # FIX: narrowed from a bare except; yielding from a GeneratorExit
            # handler raises "generator ignored GeneratorExit".
            self.log('error in parse url %s' % url)
            yield None

        yield None
# Example #17
# 0
class CWebParserSite(CWebParserSingleUrl):
    """Spider for xnudegirls.com.

    ``urls_genarator`` enumerates model pages from the start page; each
    call to ``parse_page`` yields one data dict per gallery thumbnail.
    ``None`` is the end/error sentinel.
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # fetch/file helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parser
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # visited-URL store

    '''
    parse_page

    @author: chenzf
    '''

    def parse_page(self, url):
        """Yield one merged data dict per thumbnail on the model page *url*.

        Yields None as a sentinel for "no url", already-visited urls,
        unexpected errors, and at the end of normal processing.
        """
        try:
            if not url:
                yield None
                # FIX: stop here; previously a resume after the sentinel
                # continued with a None/empty url.
                return

            if self.dbUtils.get_db_url(url):
                yield None
                # FIX: stop here; previously execution fell through and
                # re-parsed an already-visited page.
                return

            html = self.utils.get_page(url)
            if html:
                # model name is the second-to-last path segment of the url
                name = url.split('/')[-2]

                a = pq(html)
                # gallery thumbnails
                items = a('#content div.wrap.wrap2 div.thumbs a')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': name,
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }

                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:
                        # FIX: was a bare except, which also caught
                        # GeneratorExit when the consumer closed us.
                        parse_succeed = False
                        continue

                # mark the page done only when every item parsed
                if parse_succeed:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            # FIX: narrowed from a bare except; yielding from a GeneratorExit
            # handler raises "generator ignored GeneratorExit".
            self.log('error in parse url %s' % url)
            yield None

        yield None

    def urls_genarator(self):
        """Yield the absolute URL of each model page listed on ``self.url``,
        then a None sentinel.  (Name kept: it implements the base-class hook.)
        """
        html = self.utils.get_page(self.url)
        if html:
            a = pq(html)
            models = a('ul.bottomLists2  ul li a')
            for model in models.items():
                yield urljoin('http://xnudegirls.com/', model.attr('href'))
        yield None
# Example #18
# 0
class CWebParserSite(CWebParserMultiUrl):
    """Paginated pornstar-index spider.

    For each star on the index page it resumes from the last recorded
    pagination index, then walks forward page by page via the "next" link,
    yielding one data dict per video thumbnail.  ``None`` is the end/error
    sentinel.
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)            # fetch/file helpers
        self.common = CWebParserSiteCommon(self)               # shared per-item parser
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))  # visited-URL store

    '''
    parse_page

    @author: chenzf
    '''

    def parse_page(self, url):
        """Yield one merged data dict per video thumbnail of every star on *url*.

        Yields None as a sentinel for "no url", on unexpected errors, and at
        the end of normal processing.
        """
        try:
            if url is None:
                yield None
                # FIX: stop here; previously a resume after the sentinel
                # would try to fetch a None url.
                return

            html = self.utils.get_page(url)
            if html:
                if self.dbUtils.get_db_url(url):
                    pass
                else:
                    a = pq(html)
                    items = a(
                        'body > div.main-wrap > div.best-list-block.hide-on-search > div.width-wrap > div.thumb-container div.pornstar-thumb-container div.pornstar-thumb-container__info div.pornstar-thumb-container__info-title a')
                    for item in items.items():
                        model_url_origin = item.attr('href')
                        name = item.text()

                        # Find the first pagination index not yet in the DB;
                        # index 1 is represented by the bare origin url.
                        index = 1
                        while True:
                            model_url = "%s/%s" % (model_url_origin, index)
                            if index == 1:
                                if self.dbUtils.get_db_url(model_url_origin):
                                    index = index + 1
                                    continue
                            elif self.dbUtils.get_db_url(model_url):
                                index = index + 1
                                continue

                            break

                        # Resume from the last *visited* page (it may have
                        # gained new items), not the first unvisited one.
                        if index > 2:
                            index = index - 1
                            model_url = "%s/%s" % (model_url_origin, index)
                        else:
                            model_url = model_url_origin

                        # Walk forward through pagination via the "next" link.
                        while True:
                            self.log('request %s' % model_url)
                            html2 = self.utils.get_page(model_url)
                            if html2:
                                if self.dbUtils.get_db_url(model_url):
                                    pass
                                else:
                                    board = pq(html2)('div.pornstar-logo img').attr('src')
                                    data_ps, parse_res = self.parse_sub_page(html2)
                                    for data_p in data_ps:
                                        data_t = {
                                            'name': name,
                                            'url': model_url,
                                            'board': board,
                                            'refurl': url
                                        }

                                        data = dict(data_t, **data_p)
                                        yield data

                                    if parse_res:
                                        self.log('parsed url %s' % model_url)
                                        self.dbUtils.put_db_url(model_url)

                                next_url = pq(html2)('li.next a').attr("href")
                                if next_url:
                                    model_url = next_url
                                else:
                                    break
                            else:
                                break
            else:
                self.log('request %s error' % url)
        except Exception:
            # FIX: narrowed from a bare except; yielding from a GeneratorExit
            # handler raises "generator ignored GeneratorExit".
            self.log('error in parse url %s' % url)
            yield None

        yield None

    def parse_sub_page(self, html):
        """Parse all video thumbnails out of one model page.

        Returns ``(sub_datas, parse_successed)`` where *parse_successed* is
        None when the page had no items, True when every item parsed, and
        False when any item failed (so the caller won't mark the page done).
        """
        b = pq(html)
        items = b(
            'body > div.main-wrap > main > div > article > div.index-videos.mixed-section > div.thumb-list.thumb-list--sidebar.thumb-list--recent > div.thumb-list__item.video-thumb a.video-thumb-info__name')

        sub_datas = []
        parse_successed = None
        for item in items.items():
            try:
                data_p = self.common.parse_item(item)
                sub_datas.append(data_p)

                # FIX: only upgrade None -> True.  The original
                # ``if not parse_successed`` also reset False -> True, so a
                # later success erased an earlier failure and the page was
                # wrongly marked fully parsed.
                if parse_successed is None:
                    parse_successed = True
            except Exception:
                # FIX: was a bare except (also caught GeneratorExit etc.)
                parse_successed = False

        return sub_datas, parse_successed
# Example #19
# 0
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        """Wire up the fetch, shared-parse and visited-URL helpers."""
        super().__init__(**kwArgs)
        database = kwArgs.get('database')
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(database)

    '''
    parse_page
    
    @author: chenzf
    '''

    def parse_page(self, url):
        """Walk hqsluts.com listing pages starting at *url*, following the
        "Next Page" link, and yield one merged data dict per model item.

        Yields None as a sentinel for "no url", on unexpected errors, and at
        the end of normal processing.
        """
        try:
            if url is None:
                yield None
                # FIX: stop here; previously a resume after the sentinel
                # would try to fetch a None url.
                return

            while True:
                html = self.utils.get_page(url)
                if html:
                    # FIX: parse before the visited-URL check.  The "next
                    # page" lookup below needs `a`; previously, when the
                    # first page was already in the DB, `a` was unbound and
                    # the lookup raised a NameError that the bare except
                    # silently swallowed, aborting pagination.
                    a = pq(html)
                    if self.dbUtils.get_db_url(url):
                        pass
                    else:
                        # model listing items
                        items = a('ul.set.sluts_main li')
                        parse_succeed = True
                        for item in items.items():
                            try:
                                name = item('b a').text()
                                # NOTE(review): attr('lsrc') may be None,
                                # making this raise TypeError; it is treated
                                # as a per-item failure below.
                                board = item('a img').attr('lsrc') + '.jpg'
                                model_url = urljoin('https://www.hqsluts.com/',
                                                    item('b a').attr('href'))

                                html2 = self.utils.get_page(model_url)
                                if html2:
                                    b = pq(html2)
                                    modelitems = b('ul.set.s**t li')
                                    for modelitem in modelitems.items():
                                        try:
                                            data_p = self.common.parse_item(
                                                modelitem)
                                            data_t = {
                                                'name':
                                                self.utils.format_name(name),
                                                'url':
                                                model_url,
                                                'board':
                                                board,
                                                'refurl':
                                                url
                                            }

                                            data = dict(data_t, **data_p)
                                            yield data
                                        except Exception:
                                            # FIX: was a bare except, which
                                            # also caught GeneratorExit.
                                            parse_succeed = False
                                            continue
                            except Exception:
                                # FIX: was a bare except (see above)
                                parse_succeed = False
                                continue
                        # mark the page done only when every item parsed
                        if parse_succeed:
                            self.log('parsed url %s' % url)
                            self.dbUtils.put_db_url(url)

                    next_url = a('#pages li a[count="Next Page"]')
                    if next_url:
                        url = urljoin('https://www.hqsluts.com/',
                                      next_url.attr('href'))
                        self.log('request %s' % url)
                    else:
                        break
                else:
                    self.log('request %s error' % url)
                    # NOTE(review): this retries the same URL forever on a
                    # persistent fetch failure — consider a retry cap.
                    continue
        except Exception:
            # FIX: narrowed from a bare except; yielding from a GeneratorExit
            # handler raises "generator ignored GeneratorExit".
            self.log('error in parse url %s' % url)
            yield None

        yield None