Example #1
0
class CWebParserSite(CWebParserSingleUrl):
    """Parser for a single listing URL: walks each model link on the page
    and yields one data dict per product found on the model's page."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Generator: parse the listing page at *url* and yield item dicts.

        Yields None as an end/error sentinel (callers appear to rely on it).
        @author: chenzf
        """
        try:
            if not url:
                yield None
                return  # bug fix: original fell through and fetched a None URL

            html = self.utils.get_page(url, headers={
                "Accept-Encoding": "",
            })
            if html:
                a = pq(html)
                for item in a('a.list_model').items():
                    modelurl = item.attr('href')
                    name = item('b').text()
                    board = item('img').attr('src')

                    # Skip models already recorded in the dedup database.
                    if self.dbUtils.get_db_url(modelurl):
                        continue

                    model_html = self.utils.get_page(modelurl)
                    if model_html:
                        b = pq(model_html)
                        try:
                            for product in b('a.list_model2').items():
                                data_p = self.common.parse_item(product)
                                data_t = {
                                    'name': self.utils.format_name(name),
                                    'url': modelurl,
                                    'board': board,
                                    'refurl': modelurl
                                }
                                yield dict(data_t, **data_p)
                        except Exception:
                            # One failing model page must not stop the others
                            # (narrowed from a bare except).
                            continue

                        # Mark the model done only after all its products parsed.
                        self.dbUtils.put_db_url(modelurl)
                self.log('parsed url %s' % url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None
Example #2
0
 def __init__(self, **kwArgs):
     """Set up spider utilities (SSL verification off), the shared item
     parser, and the URL dedup database."""
     super().__init__(**kwArgs)
     database = kwArgs.get('database')
     spider_utils = CWebSpiderUtils(self.savePath)
     spider_utils.verify = False
     self.utils = spider_utils
     self.common = CWebParserSiteCommon(self)
     self.dbUtils = CWebDataDbUtis(database)
Example #3
0
class CWebParserSite(CWebParserSingleUrl):
    """Parser that pages through hqbabes.com listing pages and yields one
    data dict per model item found."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Generator: walk listing pages starting at *url*, following the
        "Next Page" link, and yield item dicts.  Yields None as an
        end/error sentinel.

        @author: chenzf
        """
        try:
            if url is None:
                yield None
                return  # bug fix: original fell through and fetched a None URL

            while True:
                html = self.utils.get_page(url)
                if not html:
                    self.log('request %s error' % url)
                    # NOTE(review): retries the same URL indefinitely on
                    # persistent failure; kept for behavior compatibility.
                    continue

                # Bug fix: parse the page before the dedup check.  The
                # original only assigned `a` in the not-yet-parsed branch,
                # so a URL already in the database raised NameError at the
                # "next page" lookup (silently eaten by the bare except).
                a = pq(html)
                if not self.dbUtils.get_db_url(url):
                    parse_succeed = True
                    for item in a('ul.set.babes_main li').items():
                        try:
                            name = item('b a').text()
                            board = 'https:' + item('a img').attr(
                                'lsrc') + '.jpg'
                            model_url = urljoin('https://www.hqbabes.com/',
                                                item('b a').attr('href'))

                            html2 = self.utils.get_page(model_url)
                            if html2:
                                b = pq(html2)
                                for modelitem in b('ul.set.babe li').items():
                                    try:
                                        data_p = self.common.parse_item(
                                            modelitem)
                                        data_t = {
                                            'name':
                                            self.utils.format_name(name),
                                            'url': model_url,
                                            'board': board,
                                            'refurl': url
                                        }
                                        yield dict(data_t, **data_p)
                                    except Exception:
                                        self.log('parsed error 1 %s_%s' %
                                                 (url, modelitem))
                                        parse_succeed = False
                                        continue
                        except Exception:
                            self.log('parsed error 2 %s_%s' % (url, item))
                            parse_succeed = False
                            continue
                    # Only record the URL when every item parsed cleanly.
                    if parse_succeed:
                        self.log('parsed url %s' % url)
                        self.dbUtils.put_db_url(url)

                next_url = a('#pages li a[count="Next Page"]')
                if not next_url:
                    break
                url = urljoin('https://www.hqbabes.com/',
                              next_url.attr('href'))
                self.log('request %s' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None
Example #4
0
class CWebParserSite(CWebParserMultiUrl):
    """Parser for erosberry.com listing pages, driven by a generated list
    of paged URLs (44 thumbnails per page)."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.utils.verify = False
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Generator: parse one listing page at *url* and yield one data
        dict per item on each model's page.  Yields None as an end/error
        sentinel.

        @author: chenzf
        """
        try:
            if url is None:
                yield None
                return  # bug fix: original fell through with a None URL

            if self.dbUtils.get_db_url(url):
                yield None
                return  # bug fix: original re-fetched an already-parsed URL

            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                processNum = 0
                parse_succeed = True
                for item in a('div.thumbs div.thumb > a').items():
                    try:
                        name = item.text()
                        model_url = urljoin('https://www.erosberry.com/',
                                            item.attr('href'))

                        html2 = self.utils.get_page(model_url)
                        if html2:
                            b = pq(html2)
                            board = urljoin('https://www.erosberry.com/',
                                            b('div.info > img').attr('src'))
                            modelitems = b('div.girl_thumbs div.container > a')
                            for modelitem in modelitems.items():
                                try:
                                    data_p = self.common.parse_item(modelitem)
                                    data_t = {
                                        'name': name,
                                        'url': model_url,
                                        'board': board,
                                        'refurl': url
                                    }
                                    yield dict(data_t, **data_p)
                                    processNum += 1
                                except Exception:
                                    parse_succeed = False
                                    continue
                    except Exception:
                        parse_succeed = False
                        continue
                # Record the URL only when everything parsed and at least
                # one item was actually produced.
                if parse_succeed and processNum > 0:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None

    def urls_genarator(self):
        """Yield one listing URL per page (44 items each), then a None
        sentinel.  (Name typo kept: callers use this spelling.)"""
        for i in range(self.start, self.end + 1):
            yield self.url.format(page=i * 44)
        yield None
Example #5
0
class CWebParserSite(CWebParserSingleUrl):
    """Parser that walks youporn.com category pages, resuming after the
    pages already recorded in the URL database."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Generator: resume paging at the first unparsed page under *url*
        and yield one data dict per video item.  Yields None as an
        end/error sentinel.

        @author: chenzf
        """
        try:
            if not url:
                yield None
                return  # bug fix: original fell through with an empty URL

            url_origin = url

            # Skip forward past pages already in the dedup database.
            index = 1
            while True:
                search_url = "%s?page=%s" % (url_origin, index)
                if index == 1:
                    if self.dbUtils.get_db_url(url_origin):
                        index += 1
                        continue
                elif self.dbUtils.get_db_url(search_url):
                    index += 1
                    continue
                break

            # Back up one page: re-request the last recorded page so its
            # "next" link can be followed.
            if index > 2:
                index -= 1
                search_url = "%s?page=%s" % (url_origin, index)
            else:
                search_url = url_origin

            while True:
                self.log('request %s' % search_url)
                html2 = self.utils.get_page(search_url)
                if not html2:
                    break

                if not self.dbUtils.get_db_url(search_url):
                    a = pq(html2)
                    items = a('div.js_video_row  div.video-box  a.video-box-image')
                    parse_successed = True
                    for item in items.items():
                        try:
                            data_p = self.common.parse_item(item)
                            if not data_p:
                                parse_successed = False
                                continue
                            elif self.common.parse_detail_fr_brief_duplicate(data_p):
                                # Already seen this detail item; skip silently.
                                continue

                            data_t = {
                                'name': 'Categories',
                                'url': data_p.get('brief').get('url'),
                            }
                            yield dict(data_t, **data_p)
                        except Exception:
                            parse_successed = False
                            continue

                    if parse_successed:
                        self.log('parsed url %s' % search_url)
                        self.dbUtils.put_db_url(search_url)
                    else:
                        self.log('request %s error' % search_url)

                next_page = pq(html2)('#next .prev-next a').attr("data-page-number")
                if not next_page:
                    break
                search_url = "%s?page=%s" % (url_origin, next_page)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None

    def urls_genarator(self):
        """Yield one category URL per category box on the root page, then a
        None sentinel.  (Name typo kept: callers use this spelling.)"""
        html = self.utils.get_page(self.url)
        if html:
            a = pq(html)
            for category in a('#categoryList a.categoryBox').items():
                yield urljoin("https://www.youporn.com", category.attr('href'))
        yield None
Example #6
0
class CWebParserSite(CWebParserMultiUrl):
    """Parser that walks a pornstar index page, then pages through each
    model's video listing, resuming after pages already recorded."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        """Generator: parse the pornstar index at *url* and yield one data
        dict per video of each model.  Yields None as an end/error sentinel.

        @author: chenzf
        """
        try:
            if url is None:
                yield None
                return  # bug fix: original fell through with a None URL

            html = self.utils.get_page(url)
            if html:
                if not self.dbUtils.get_db_url(url):
                    a = pq(html)
                    for item in a('#popular_pornstars_wrapper div.pornstar a').items():
                        name = item('img').attr('alt')
                        board = item('img').attr('src')
                        model_url_origin = item.attr('href')

                        # Skip forward past model pages already parsed.
                        index = 1
                        while True:
                            model_url = "%spage/%s/" % (model_url_origin,
                                                        index)
                            if index == 1:
                                if self.dbUtils.get_db_url(model_url_origin):
                                    index += 1
                                    continue
                            elif self.dbUtils.get_db_url(model_url):
                                index += 1
                                continue
                            break

                        # Back up one page so its "next" link can be followed.
                        if index > 2:
                            index -= 1
                            model_url = "%spage/%s/" % (model_url_origin,
                                                        index)
                        else:
                            model_url = model_url_origin

                        while True:
                            self.log('request %s' % model_url)
                            html2 = self.utils.get_page(model_url)
                            if not html2:
                                break

                            if not self.dbUtils.get_db_url(model_url):
                                data_ps, parse_res = self.parse_sub_page(html2)
                                for data_p in data_ps:
                                    data_t = {
                                        'name': name,
                                        'url': model_url,
                                        'board': board,
                                        'refurl': url
                                    }
                                    yield dict(data_t, **data_p)

                                if parse_res:
                                    self.log('parsed url %s' % model_url)
                                    self.dbUtils.put_db_url(model_url)

                            next_url = pq(html2)('#pagination_next')
                            if not next_url:
                                break
                            model_url = next_url.attr('href')
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None

        yield None

    def parse_sub_page(self, html):
        """Parse one model listing page.

        Returns (sub_datas, parse_successed): the parsed item dicts and a
        flag — None when the page had no items, True when every item
        parsed, False when any item failed.  Bug fix: the original reset
        the flag back to True whenever an item *after* a failure parsed
        successfully, so partially-failed pages were marked fully parsed.
        """
        b = pq(html)
        sub_datas = []
        parse_successed = None
        for item in b('a.video-thumb-link').items():
            try:
                sub_datas.append(self.common.parse_item(item))
                if parse_successed is None:
                    parse_successed = True
            except Exception:
                parse_successed = False
        return sub_datas, parse_successed