Exemple #1
0
class Page(Item):
    """Pagination item declared against http://www.allitebooks.com."""
    __base_url__ = "http://www.allitebooks.com"

    pages = Css('.pagination>.pages')
    current_page = Css('.pagination>.current')
    next_page = Css('.pagination>a', attr='href')

    def clean_next_page(self, value):
        """Rewrite pagination links so they point at the local server.

        :param value: either a list of elements exposing ``get('href')`` or
            a single value that is appended to the local host prefix.
        :return: a list of rewritten URL strings.
        """
        # Single (non-list) value: just prefix it with the local host.
        if not isinstance(value, list):
            return ['http://127.0.0.1:5000/' + value]

        rewritten = []
        for anchor in value:
            href = anchor.get('href')
            rewritten.append(
                href.replace('http://www.allitebooks.com/',
                             'http://127.0.0.1:5000/allitebooks/'))
        return rewritten

    class Meta:
        # No source selector is set for this item.
        source = None
        # Mapping of '/allitebooks/...' paths to site-relative paths.
        route = {
            '/allitebooks/': '/',
            '/allitebooks/?s=:keyword': '/?s=:keyword',
            '/allitebooks/:keyword/': '/:keyword/',
            '/allitebooks/page/:path': '/page/:path'
        }
Exemple #2
0
class Post(Item):
    """Item declaring a ``url`` and a ``title`` field via CSS selectors."""
    # 'href' attribute of the anchor that is a direct child of div.hd.
    url = Css('div.hd>a', attr='href')
    # Selected with the 'span.title' rule (no attribute requested).
    title = Css('span.title')

    class Meta:
        # Selector over 'div.item'; presumably each match yields one Post --
        # TODO confirm against the Item base class.
        source = Css('div.item', attr='target')
        # Single route string rather than a mapping.
        route = '/'
Exemple #3
0
class IndexArticle(Item):
    """Article section item declared against http://wufazhuce.com."""
    __base_url__ = "http://wufazhuce.com"

    # Article
    one_article_index = Css(".fp-one-articulo p.one-titulo")
    one_article_title = Css(".fp-one-articulo p.one-articulo-titulo a")
    one_article_url = Css(".fp-one-articulo p.one-articulo-titulo a", attr='href')
    one_article_list = Css(".fp-one-articulo ul li")

    def clean_one_article_index(self, one_article_index):
        """Return the index text without surrounding whitespace."""
        trimmed = one_article_index.strip()
        return trimmed

    def clean_one_article_title(self, one_article_title):
        """Return the title text without surrounding whitespace."""
        trimmed = one_article_title.strip()
        return trimmed

    def clean_one_article_url(self, one_article_url):
        """Swap the site origin for ``one`` and prefix the local host."""
        local_path = one_article_url.replace("http://wufazhuce.com", "one")
        return 'http://127.0.0.1:5000/' + local_path

    def clean_one_article_list(self, one_article_list):
        """Build one dict (index, title, rewritten URL) per list element.

        Each element must support ``cssselect`` and contain at least one
        ``span`` and one ``a``; the first match of each is used.
        """
        return [
            {
                'one_index': entry.cssselect('span')[0].text.strip(),
                'one_title': entry.cssselect('a')[0].text.strip(),
                'one_article_url': 'http://127.0.0.1:5000/' + entry.cssselect('a')[0].get('href').replace(
                    "http://wufazhuce.com", "one"),
            }
            for entry in one_article_list
        ]

    class Meta:
        # No source selector is set for this item.
        source = None
        route = {
            '/one/': '/'
        }
Exemple #4
0
class IndexQuestion(Item):
    """Question section item declared against http://wufazhuce.com."""
    __base_url__ = "http://wufazhuce.com"

    # Question
    one_question_index = Css(".fp-one-cuestion p.one-titulo")
    one_question_title = Css(".fp-one-cuestion p.one-cuestion-titulo a")
    one_question_url = Css(".fp-one-cuestion p.one-cuestion-titulo a", attr='href')
    one_question_list = Css(".fp-one-cuestion ul li")

    def clean_one_question_index(self, one_question_index):
        """Return the index text without surrounding whitespace."""
        trimmed = one_question_index.strip()
        return trimmed

    def clean_one_question_title(self, one_question_title):
        """Return the title text without surrounding whitespace."""
        trimmed = one_question_title.strip()
        return trimmed

    def clean_one_question_url(self, one_question_url):
        """Swap the site origin for ``one`` and prefix the local host."""
        local_path = one_question_url.replace("http://wufazhuce.com", "one")
        return 'http://127.0.0.1:5000/' + local_path

    def clean_one_question_list(self, one_question_list):
        """Build one dict (index, title, rewritten URL) per list element.

        Each element must support ``cssselect`` and contain at least one
        ``span`` and one ``a``; the first match of each is used.
        """
        return [
            {
                'one_index': entry.cssselect('span')[0].text.strip(),
                'one_title': entry.cssselect('a')[0].text.strip(),
                'one_question_url': 'http://127.0.0.1:5000/' + entry.cssselect('a')[0].get(
                    'href').replace("http://wufazhuce.com", "one"),
            }
            for entry in one_question_list
        ]

    class Meta:
        # No source selector is set for this item.
        source = None
        route = {
            '/one/': '/'
        }
Exemple #5
0
class Question(Item):
    """Question detail item declared against http://wufazhuce.com."""
    __base_url__ = "http://wufazhuce.com"

    title = Css("div.one-cuestion > h4")
    editor = Css("div.one-cuestion p.cuestion-editor")
    content = Css("div.one-cuestion div.cuestion-contenido")

    def clean_title(self, title):
        """Return the title as one stripped string.

        A list is treated as elements whose stripped ``.text`` values are
        concatenated; anything else is stripped directly.
        """
        if not isinstance(title, list):
            return title.strip()
        return ''.join(element.text.strip() for element in title)

    def clean_content(self, abstract):
        """Return the content as a list of strings.

        For a list input, each element contributes one string made of all
        its text fragments, each stripped and joined without a separator.
        A non-list input is stripped and wrapped in a one-element list.
        """
        if not isinstance(abstract, list):
            return [abstract.strip()]

        return [
            ''.join(fragment.strip() for fragment in element.itertext())
            for element in abstract
        ]

    class Meta:
        # No source selector is set for this item.
        source = None
        route = {
            '/one/question/:path': '/question/:path'
        }
Exemple #6
0
class Baidu(Bing):
    """Subclass of ``Bing`` that overrides the name, base URL, fields and Meta."""
    __name__ = 'baidu'
    __base_url__ = 'http://www.baidu.com'

    # Both fields target the same anchor: one reads 'href', the other does not
    # request an attribute.
    url = Css('h3.t a', attr='href')
    title = Css('h3.t a')

    class Meta:
        # Selector over 'div.result'; presumably one item per match --
        # TODO confirm against the Item base class.
        source = Css('div.result')
        # ':wd' from the left-hand path is substituted into the right-hand query.
        route = {'/:wd': '/s?wd=:wd&ie=utf-8&vf_bl=1'}
Exemple #7
0
class Post(Item):
    """Item declaring a ``url`` and a ``title`` field via CSS selectors."""
    url = Css('div.hd>a', attr='href')
    title = Css('span.title')

    class Meta:
        source = Css('div.item', attr='target')
        route = '/'

    def clean_title(self, title):
        """Return the title with non-breaking spaces removed.

        :param title: a text string, or an iterable of elements whose
            ``.text`` values are stripped and concatenated.
        :return: the cleaned title string.
        """
        # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3,
        # so the check no longer raises NameError on Python 3.
        text_type = type(u'')
        if isinstance(title, text_type):
            return title.replace(u'\xa0', '')
        return ''.join(
            i.text.strip().replace(u'\xa0', '') for i in title)
Exemple #8
0
class Recipe(Item):
    """Recipe list entry declaring ``url``, ``name`` and ``cover`` fields."""
    url = Css('div.recipe > a', attr='href')
    name = Css('div.recipe > div.info > p.name > a')
    cover = Css('div.recipe > a > div.cover > img', attr='data-src')

    def clean_name(self, name):
        """Split the raw name on single space characters.

        :return: the list of pieces produced by ``name.split(' ')``.
        """
        separator = ' '
        pieces = name.split(separator)
        return pieces

    class Meta:
        # XPath selecting the <li> entries of the recipe list in the main panel.
        source = XPath('//div[contains(@class, "main-panel")]//div[@class="normal-recipe-list"]/ul[@class="list"]/li')
        route = {
            '/category/:cat/': '/category/:cat/',
            '/category/:cat/?page=:page': '/category/:cat/?page=:page',
            '/search/:keyword': '/search/?keyword=:keyword&cat=1001'
        }
Exemple #9
0
class Book(Item):
    """Book listing item declared against http://www.allitebooks.com."""
    __base_url__ = "http://www.allitebooks.com"

    book_list = Css('article>div.entry-body>header>.entry-title>a',
                    attr='href')

    def clean_book_list(self, book_list):
        """Turn the matched title links into a list of book dicts.

        :param book_list: a list of anchor elements, or a single value that
            is appended to the local host prefix.
        :return: a list of dicts with ``id``, ``name`` and ``url`` keys. The
            single-value branch also keeps the legacy ``href`` key.
        """
        if isinstance(book_list, list):
            return [{
                'id': str(index),
                "name": value.text,
                "url": value.get('href').replace(
                    'http://www.allitebooks.com/',
                    'http://127.0.0.1:5000/allitebooks-info/')
            } for index, value in enumerate(book_list)]

        link = 'http://127.0.0.1:5000/' + book_list
        return [{
            'id': '0',
            'name': '',
            # Same key as the list branch, so consumers see one schema.
            'url': link,
            # Kept so existing consumers of the old key keep working.
            'href': link
        }]

    class Meta:
        source = None
        # Shares the route table declared on Page.
        route = Page.Meta.route
class Category(Item):
    """Category tree item built from ``div.cates-list`` nodes."""
    categories = Css('div.cates-list')

    def clean_categories(self, nodes):
        """Convert category nodes into nested topic/group/type dicts.

        For every node the text of ``div/h3`` names the topic. Inside the
        node's third ``div`` child, the i-th ``h4`` names a group and the
        i-th ``ul`` supplies that group's ``li/a`` links.

        :return: list of ``{'name', 'list'}`` dicts, where each ``list`` entry
            is ``{'name', 'types'}`` and each type is ``{'name', 'link'}``.
        """
        result = []
        for container in nodes:
            topic_name = container.findtext('div/h3').strip()
            section = container.find('div[3]')
            headings = section.findall('h4')
            link_lists = section.findall('ul')

            groups = []
            for position, heading in enumerate(headings):
                group_name = heading.text.strip()
                # Headings and lists are paired by position.
                links = [
                    {
                        'name': anchor.text.strip(),
                        'link': anchor.get('href', '#')
                    }
                    for anchor in link_lists[position].findall('li/a')
                ]
                groups.append({'name': group_name, 'types': links})

            result.append({'name': topic_name, 'list': groups})
        return result

    class Meta:
        source = Css('div.category-container > div')
        route = { '/category/': '/category/' }
Exemple #11
0
class IndexOne(Item):
    """Carousel item declared against http://wufazhuce.com."""
    __base_url__ = "http://wufazhuce.com"

    # One
    one_item_list = Css("div#carousel-one div.item")

    def clean_one_item_list(self, one_item):
        """Build one dict per carousel element.

        Each dict carries ``one_index``, ``one_type``, ``one_url`` (rewritten
        to the local host), ``one_abstract`` (text fragments of the quote
        link, each stripped and separated by four spaces) and ``date`` (the
        texts of ``p.dom`` and ``p.may`` joined by one space).
        """
        collected = []
        for slide in one_item:
            entry = {
                'one_index': slide.cssselect('div.fp-one-titulo-pubdate p.titulo')[0].text.strip(),
                'one_type': slide.cssselect('div.fp-one-imagen-footer')[0].text.strip(),
                'one_url': 'http://127.0.0.1:5000/' + slide.cssselect('div.fp-one-cita a')[0].get(
                    'href').replace("http://wufazhuce.com", "one"),
            }

            fragments = [
                fragment.strip() + "    "
                for fragment in slide.cssselect('div.fp-one-cita a')[0].itertext()
            ]
            # The outer strip removes the separator left after the last piece.
            entry['one_abstract'] = ''.join(fragments).strip()

            entry['date'] = slide.cssselect('div.fp-one-titulo-pubdate p.dom')[0].text + " " + \
                slide.cssselect('div.fp-one-titulo-pubdate p.may')[0].text
            collected.append(entry)
        return collected

    class Meta:
        # No source selector is set for this item.
        source = None
        route = {
            '/one/': '/'
        }
Exemple #12
0
 class Meta:
     """
     URL: http://127.0.0.1:5000/250/
     Des: Douban Top 250 movies API
     Params:
         start: eg: http://127.0.0.1:5000/250/?start=25
     """
     # Selector over 'div.item'; presumably one item per match -- TODO confirm.
     source = Css('div.item', attr='target')
     # Ordered pairs; the '?start=' pattern is listed before the bare '/250/'.
     # Presumably (exposed path, upstream path) -- verify against the router.
     route = (('/250/?start=:start', '/?start=:start'), ('/250/', '/'))
Exemple #13
0
class Detail(Item):
    """Book detail item declared against http://www.allitebooks.com."""
    __base_url__ = "http://www.allitebooks.com"

    title = Css('.single-title')
    abstract = Css('.entry-header>h4')
    cover = Css('.entry-body-thumbnail>a>img', attr='src')
    description = Css('.entry-content')
    pdf_url = Css('span.download-links>a', attr='href')

    def clean_pdf_url(self, pdf_url):
        """Return a single download link.

        A list input yields the ``href`` of its first element; any other
        input is returned unchanged.
        """
        if not isinstance(pdf_url, list):
            return pdf_url
        first_link = pdf_url[0]
        return first_link.get('href')

    class Meta:
        # No source selector is set for this item.
        source = None
        route = {'/allitebooks-info/:keyword': '/:keyword/'}
Exemple #14
0
class ImageInfo(Item):
    """Item reading Open Graph / instapp ``<meta>`` tags from the page head."""
    image_url = Css('head > meta[property="og:image"]', attr='content')
    description = Css('head > meta[property="og:description"]', attr='content')
    source_url = Css('head > meta[property="og:url"]', attr='content')
    user_id = Css('head > meta[property="instapp:owner_user_id"]',
                  attr='content')
    # Same selector as user_id; turned into an API URL by clean_user_info_url.
    user_info_url = Css('head > meta[property="instapp:owner_user_id"]',
                        attr='content')

    def clean_user_info_url(self, user_info_url):
        """Format the first element of the parsed value into the user-info URL.

        NOTE(review): if the parsed value is a plain string, index 0 is only
        its first character -- confirm what ``Css.parse`` returns here.
        """
        leading = user_info_url[0]
        template = "https://i.instagram.com/api/v1/users/{}/info/"
        return template.format(leading)

    class Meta:
        # No source selector is set for this item.
        source = None
        route = '/p/.*?'

        web = {"with_ajax": False}
Exemple #15
0
class Page(Item):
    """Pager item exposing the link of the ``a.next`` element."""
    # 'href' attribute of the anchor with class 'next'.
    next = Css('a.next', attr='href')

    class Meta:
        # XPath selecting the div whose class attribute is exactly "pager".
        source = XPath('//div[@class="pager"]')
        # Category, paged-category and search paths; the search route adds a
        # fixed 'cat=1001' query parameter on the right-hand side.
        route = {
            '/category/:cat/': '/category/:cat/',
            '/category/:cat/?page=:page': '/category/:cat/?page=:page',
            '/search/:keyword': '/search/?keyword=:keyword&cat=1001'
        }
Exemple #16
0
class Article(Item):
    """Article detail item declared against http://wufazhuce.com."""
    __base_url__ = "http://wufazhuce.com"

    title = Css("h2.articulo-titulo")
    author = Css("p.articulo-autor")
    abstract = Css("div.comilla-cerrar")
    content = Css("div.articulo-contenido")

    def clean_title(self, title):
        """Return the title without surrounding whitespace."""
        trimmed = title.strip()
        return trimmed

    def clean_author(self, author):
        """Return the author line without surrounding whitespace."""
        trimmed = author.strip()
        return trimmed

    def clean_abstract(self, abstract):
        """Return the abstract without surrounding whitespace."""
        trimmed = abstract.strip()
        return trimmed

    class Meta:
        # No source selector is set for this item.
        source = None
        route = {'/one/article/:path': '/article/:path'}
Exemple #17
0
def test_css():
    """Check ``Css.parse`` for a plain rule and for an ``attr='html'`` rule."""
    title_field = Css(rule="head title", attr=None)
    html_field = Css(rule="p.p1", attr='html')
    title_value = title_field.parse(html)
    html_value = html_field.parse(html)
    # The attr='html' field is expected to parse to a list.
    assert isinstance(html_value, list)
    assert title_value == "toapi"
Exemple #18
0
class One(Item):
    """Single 'one' entry item declared against http://wufazhuce.com."""
    __base_url__ = "http://wufazhuce.com"

    index = Css("div.tab-content div.one-titulo")
    image = Css("div.tab-content div.one-imagen img", attr='src')
    abstract = Css("div.tab-content div.one-cita")
    type = Css("div.tab-content div.one-imagen-leyenda")
    date = Css("div.tab-content div.one-pubdate p")

    def clean_index(self, index):
        """Return the index text without surrounding whitespace."""
        return index.strip()

    def clean_abstract(self, abstract):
        """Return the abstract text without surrounding whitespace."""
        return abstract.strip()

    def clean_date(self, date):
        """Return the publication date as one string.

        :param date: a list of elements whose stripped ``.text`` values are
            joined with single spaces, or a single text string.
        :return: the joined or stripped date, or ``None`` for any other input.
        """
        if isinstance(date, list):
            return ' '.join(i.text.strip() for i in date)
        # A single match used to fall through and be dropped as None.
        if isinstance(date, str):
            return date.strip()
        return None

    class Meta:
        source = None
        route = {'/one/one/:path': '/one/:path'}
Exemple #19
0
class Bing(Item):
    """Search-result item declared against https://www.bing.com."""
    __name__ = 'bing'
    __base_url__ = 'https://www.bing.com'

    url = Css('h2 a', attr='href')
    title = Css('h2 a')

    def clean_url(self, url):
        """Return the result link, or ``''`` when nothing usable was parsed.

        A non-empty list is reduced to the ``href`` of its first element.
        """
        if isinstance(url, list) and len(url) > 0:
            first_anchor = url[0]
            url = first_anchor.get('href')
        return url or ''

    def clean_title(self, title):
        """Return the result title, or ``''`` when nothing usable was parsed.

        A non-empty list is reduced to the concatenated, stripped text of its
        first element.
        """
        if isinstance(title, list) and len(title) > 0:
            first_anchor = title[0]
            title = ''.join(first_anchor.itertext()).strip()
        return title or ''

    class Meta:
        source = Css('li.b_algo')
        # ':wd' from the left-hand path is substituted into the right-hand query.
        route = {'/:wd': '/search?q=:wd&ensearch=1'}
Exemple #20
0
class Post(Item):
    """Item declaring a ``url`` and a ``title`` field via CSS selectors."""
    url = Css('div.hd>a', attr='href')
    title = Css('span.title')

    class Meta:
        """
        URL: http://127.0.0.1:5000/250/
        Des: Douban Top 250 movies API
        Params:
            start: eg: http://127.0.0.1:5000/250/?start=25
        """
        source = Css('div.item', attr='target')
        # Ordered pairs; the '?start=' pattern is listed before the bare '/250/'.
        route = (('/250/?start=:start', '/?start=:start'), ('/250/', '/'))

    def clean_title(self, title):
        """Return the title with non-breaking spaces removed.

        :param title: a text string, or an iterable of elements whose
            ``.text`` values are stripped and concatenated.
        :return: the cleaned title string.
        """
        # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3,
        # so the check no longer raises NameError on Python 3.
        text_type = type(u'')
        if isinstance(title, text_type):
            return title.replace(u'\xa0', '')
        return ''.join(
            i.text.strip().replace(u'\xa0', '') for i in title)

    def clean_url(self, value):
        """Return the parsed URL unchanged."""
        return value
Exemple #21
0
 class Meta:
     # Selector over 'div.g'; presumably one item per match -- TODO confirm.
     source = Css('div.g')
     # ':wd' from the left-hand path is substituted into the right-hand query.
     route = {
         '/:wd': '/search?hl=en&q=:wd&btnG=Search&gbv=1',
     }
     # Fetch settings: AJAX handling is off and the request carries a fixed
     # User-Agent plus HTTP/HTTPS proxy entries.
     web = {
         "with_ajax": False,
         "request_config": {
             'headers': {
                 'User-Agent':
                 "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)"
             },
             # NOTE(review): both schemes point at 0.0.0.0:8118 -- presumably
             # a proxy on the local machine; confirm before deploying.
             'proxies': {
                 'http': '0.0.0.0:8118',
                 'https': '0.0.0.0:8118'
             }
         },
         "headers": None
     }
Exemple #22
0
class Content(Item):
    """Recipe detail item: name, cover, grade, materials, steps and tip."""
    name = Css('h1.page-title[itemprop="name"]')
    cover = Css('div.recipe-show > div.cover > img', attr='src')
    grade = Css(
        'div.recipe-show > div.container > div.stats > div.score > span.number'
    )
    materials = Css('div.recipe-show > div.ings > table tr')
    steps = Css('div.steps > ol li', attr='html')
    tip = Css('div.tip')

    def clean_name(self, name):
        """Return the recipe name without surrounding whitespace."""
        return name.strip()

    def clean_materials(self, nodes):
        """Convert ingredient table rows into ``{'name', 'unit'}`` dicts.

        The name comes from the text of ``td.name`` or, when that is blank,
        from the ``<a>`` inside it. Rows with no name cell text at all, or
        with no unit cell, are skipped instead of raising on ``None``.

        :param nodes: iterable of row elements supporting ``findtext``.
        :return: list of material dicts.
        """
        materials = []
        for node in nodes:
            plain_name = node.findtext('td[@class="name"]')
            linked_name = node.findtext('td[@class="name"]/a')
            unit = node.findtext('td[@class="unit"]')
            if (plain_name is None and linked_name is None) or unit is None:
                continue
            materials.append({
                'name': (plain_name or '').strip()
                        or (linked_name or '').strip(),
                'unit': unit.strip()
            })
        return materials

    def clean_steps(self, nodes):
        """Convert step elements into ``{'step', 'desc', 'img'}`` dicts.

        ``desc`` is the serialized ``<p>`` child with its ``<p>`` tags removed
        and ``<br>`` tags replaced by newlines. ``img`` is the ``src`` of the
        step image, or ``''`` when the step has none.

        :param nodes: iterable of step elements, each containing a ``<p>``.
        :return: list of step dicts numbered from 1.
        """
        # Raw strings: '\s' in a plain literal is an invalid escape sequence.
        # HTML tag <p/>
        re_p = re.compile(r'</?p[^>]*>')
        # HTML tag <br/>
        re_br = re.compile(r'<br\s*?/?>')
        steps = []
        for idx, node in enumerate(nodes):
            markup = etree.tounicode(node.find('p')).strip()
            image = node.find('img')
            steps.append({
                'step': idx + 1,
                'desc': re_br.sub('\n', re_p.sub('', markup)).strip(),
                'img': image.get('src') if image is not None else ''
            })
        return steps

    def clean_tip(self, tip):
        """Return the tip text without surrounding whitespace."""
        return tip.strip()

    class Meta:
        source = XPath('//div[contains(@class,"main-panel")]/div[1]')
        route = {'/recipe/:no/': '/recipe/:no/'}
 class Meta:
     # Selector over the direct <div> children of 'div.category-container'.
     source = Css('div.category-container > div')
     # The same '/category/' path appears on both sides of the mapping.
     route = { '/category/': '/category/' }
Exemple #24
0
 class Meta:
     # Selector over 'li.b_algo'; presumably one item per match -- TODO confirm.
     source = Css('li.b_algo')
     # ':wd' from the left-hand path is substituted into the right-hand query.
     route = {'/:wd': '/search?q=:wd&ensearch=1'}
Exemple #25
0
 class Meta:
     # Selector over 'div.item'; presumably one item per match -- TODO confirm.
     source = Css('div.item', attr='target')
     # Ordered pairs; the '?start=' pattern is listed before the bare '/250/'.
     route = (('/250/?start=:start', '/?start=:start'), ('/250/', '/'))
Exemple #26
0
 class Meta:
     # Selector over 'div.item'; presumably one item per match -- TODO confirm.
     source = Css('div.item', attr='target')
     # Mapping form of the route table, with and without the '?start=' query.
     route = {'/250/?start=:start': '/?start=:start', '/250/': '/'}
Exemple #27
0
class Google(Item):
    """Search-result item declared against https://www.google.com."""
    __name__ = 'google'
    __base_url__ = 'https://www.google.com'

    url = Css('h3.r > a', attr='href')
    title = Css('h3.r > a')

    def clean_url(self, url):
        """Return the filtered result link, or ``''`` when there is none.

        A non-empty list is reduced to the ``href`` of its first element
        before the link is passed through :meth:`filter_link`.
        """
        if isinstance(url, list) and len(url):
            url = url[0].get('href')
        return self.filter_link(link=url) if url else ''

    def clean_title(self, title):
        """Return the result title, or ``''`` when nothing usable was parsed.

        A non-empty list is reduced to the concatenated, stripped text of its
        first element.
        """
        if isinstance(title, list) and len(title):
            title = ''.join(title[0].itertext()).strip()
        return title if title else ''

    @classmethod
    def filter_link(cls, link):
        """Return ``link`` as an absolute URL, or ``''`` if it is not usable.

        Taken from https://github.com/MarioVilas/google

        :param link: raw ``href`` value of a result anchor.
        :return: the link itself when it already has a network location, the
            decoded ``q`` parameter of a ``/url?`` redirect when that has
            one, and ``''`` in every other case (including parse errors).
        """
        try:
            # A link with a network location is already absolute.
            parsed = urlparse(link, 'http')
            if parsed.netloc:
                return link
            # Decode hidden URLs.
            if link.startswith('/url?'):
                link = parse_qs(parsed.query)['q'][0]
                parsed = urlparse(link, 'http')
                if parsed.netloc:
                    return link
        except Exception:
            # Best effort: a malformed link is treated as "no result".
            return ''
        # Always return a string; this path previously fell through to None.
        return ''

    class Meta:
        source = Css('div.g')
        route = {
            '/:wd': '/search?hl=en&q=:wd&btnG=Search&gbv=1',
        }
        web = {
            "with_ajax": False,
            "request_config": {
                'headers': {
                    'User-Agent':
                    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)"
                },
                'proxies': {
                    'http': '0.0.0.0:8118',
                    'https': '0.0.0.0:8118'
                }
            },
            "headers": None
        }
 class Meta:
     # Selector over 'div.item'; presumably one item per match -- TODO confirm.
     source = Css('div.item', attr='target')
     # Single route string rather than a mapping.
     route = '/'
Exemple #29
0
def test_css_attr():
    """Check that an ``attr='href'`` rule parses to the link target string."""
    link_field = Css(rule="p a.test_link", attr='href')
    expected = "https://github.com/gaojiuli/toapi"
    assert link_field.parse(html) == expected
Exemple #30
0
class Content(Item):
    """Recipe detail item: name, cover, stats, materials, steps and tip."""
    name = Css('h1.page-title[itemprop="name"]')
    cover = Css('div.recipe-show > div.cover > img', attr='src')
    grade = Css('div.recipe-show > div.container > div.stats > div.score > span.number')
    cooked = Css("div.recipe-show > div.container > div.stats > div.cooked > span.number")
    materials = Css('div.recipe-show > div.ings > table tr')
    steps = Css('div.steps > ol li', attr='html')
    tip = Css('div.tip')

    def clean_name(self, name):
        """Return the recipe name without surrounding whitespace."""
        assert name is not None
        return name.strip()

    def clean_materials(self, nodes):
        """Convert ingredient table rows into ``{'name', 'unit'}`` dicts.

        The name comes from the text of ``td.name`` or, when that is blank,
        from the ``<a>`` inside it. Rows with no name cell text at all, or
        with no unit cell, are skipped.

        :param nodes: iterable of row elements supporting ``findtext``.
        :return: list of material dicts.
        """
        assert nodes is not None
        materials = []
        for node in nodes:
            plain_name = node.findtext('td[@class="name"]')
            linked_name = node.findtext('td[@class="name"]/a')
            unit = node.findtext('td[@class="unit"]')
            if (plain_name is None and linked_name is None) or unit is None:
                continue
            name = (plain_name or "").strip() or (linked_name or "").strip()
            # Key and value were swapped here ({unit: "unit"}), which keyed
            # each entry by the unit text.
            materials.append({"name": name, "unit": unit.strip()})
        return materials

    def clean_steps(self, nodes):
        """Convert step elements into ``{'step', 'desc', 'img'}`` dicts.

        ``desc`` is the serialized ``<p>`` child with its ``<p>`` tags removed
        and ``<br>`` tags replaced by newlines. ``img`` is the ``src`` of the
        step image, or ``''`` when the step has none.

        :param nodes: iterable of step elements, each containing a ``<p>``.
        :return: list of step dicts numbered from 1.
        """
        # Raw strings: '\s' in a plain literal is an invalid escape sequence.
        # HTML tag <p/>
        re_p = re.compile(r'</?p[^>]*>')
        # HTML tag <br/>
        re_br = re.compile(r'<br\s*?/?>')
        steps = []
        for idx, node in enumerate(nodes):
            markup = etree.tounicode(node.find('p')).strip()
            image = node.find('img')
            steps.append({
                'step': idx + 1,
                'desc': re_br.sub('\n', re_p.sub('', markup)).strip(),
                'img': image.get('src') if image is not None else ''
            })
        return steps

    def clean_tip(self, tip):
        """Return the stripped tip, or ``''`` when the tip is not a string."""
        if not isinstance(tip, str):
            tip = ""
        return tip.strip()

    class Meta:
        source = XPath('//div[contains(@class,"main-panel")]/div[1]')
        route = { '/recipe/:no/': '/recipe/:no/' }