Example #1
    # Needs: import time; from selenium import webdriver; from pyquery import PyQuery as pQ
    def response_modifier(self, app_iter):
        '''
        Replace every iframe that has a 'src' attribute
        with its rendered HTML.
        '''

        browser = webdriver.PhantomJS()
        # A WSGI app_iter yields bytes: join the chunks and decode before parsing
        html = b''.join(app_iter).decode('utf-8')

        dom = pQ(html)
        iframes = dom('iframe[src]')
        for iframe in iframes:
            dom_node = pQ(iframe)
            start_time = time.time()
            browser.get(dom_node.attr('src'))
            end_time = time.time()

            WAIT_SEC = end_time - start_time
            TRIES_LEFT = 5
            # Wait for all AJAX scripts to finish: every time the length of
            # the page source changes, the counter is reset. Each iteration
            # waits as long as the initial page load took.
            last_length = len(browser.page_source)
            while TRIES_LEFT:
                time.sleep(WAIT_SEC)
                new_length = len(browser.page_source)
                if new_length == last_length:
                    TRIES_LEFT -= 1
                else:
                    last_length = new_length
                    TRIES_LEFT = 5

            rendered_iframe = browser.page_source

            div = pQ(rendered_iframe)
            # Keep only the content of the body tag
            div = div.children('body')
            div.insertAfter(dom_node)
            dom_node.remove()

        browser.quit()
        ret = dom.outerHtml()

        # Return the body as a list of UTF-8 bytes, as WSGI expects
        return [ret.encode('utf-8')]
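
A sketch of how a method like this could be wired into a WSGI stack; the class name and wiring below are illustrative assumptions, not taken from the source:

import time
from pyquery import PyQuery as pQ
from selenium import webdriver

class IframeRenderingMiddleware:  # hypothetical name
    def __init__(self, app):
        self.app = app

    def __call__(self, environ, start_response):
        app_iter = self.app(environ, start_response)
        # Post-process the wrapped app's body; a real setup would also
        # have to fix up Content-Length after rewriting the HTML
        return self.response_modifier(app_iter)

    # response_modifier() as defined above would live here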
Example #2
def get_html_from_son_element():
    doc = pQ(html)
    items = doc('.list')
    # print(items)
    # lis = items.find('li')  # find() matches any descendant, not just direct children
    lis = items.children('.active')  # children() matches direct child elements only
    print(lis)
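
Examples #2-#4 and #8-#11 all reference a module-level html string that is not shown. A plausible reconstruction that satisfies the selectors they use (#container, .wrap, .list, .item-0.active) is:

from pyquery import PyQuery as pQ

# Hypothetical sample document, reconstructed to match the selectors
# used in these examples; not taken from the source project
html = '''
<div class="wrap">
  <div id="container">
    <ul class="list">
      <li class="item-0">first item</li>
      <li class="item-1"><a href="link2.html">second item</a></li>
      <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
  </div>
</div>
'''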
Example #3
def get_html_from_parents():
    doc = pQ(html)
    items = doc('.list')
    # parents = items.parents()
    # print(parents)  # prints each ancestor node in turn
    parent = items.parents('.wrap')  # keep only the ancestor with class 'wrap'
    print(parent)
Example #4
def get_html_text():
    doc = pQ(html)
    a = doc('.item-0.active a')
    print(a)
    txt = a.text()  # text inside the <a> tag, with HTML tags stripped
    print(txt)
    htm = a.html()  # raw HTML inside the <a> tag
    print(htm)
Example #5
def url_to_context(page_url):
    response = requests.get(url=page_url, headers=tc_head, timeout=3)
    if response and response.status_code == 200:
        response.encoding = "UTF-8"
        response_ = get_num(response)
        context = pQ(response_)
        return context
    else:
        print('Failed to operate!')
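
This example and the next both depend on tc_head (request headers) and get_num (a project-specific response preprocessor) defined elsewhere in the source. Hypothetical stand-ins that make the snippet self-contained:

import requests
from pyquery import PyQuery as pQ

tc_head = {'User-Agent': 'Mozilla/5.0'}  # hypothetical request headers

def get_num(response):
    # Hypothetical stand-in: the real get_num presumably decodes or
    # cleans the response text before it is parsed
    return response.text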
Example #6
def url_to_context(page_url):
    response = requests.get(url=page_url, headers=tc_head)
    print(response.text)
    if response and response.status_code == 200:
        response.encoding = "UTF-8"
        response_ = get_num(response)
        context = pQ(response_)
        return context
    else:
        print('fail')
Example #7
def parse_href(current_url):
    response = requests.get(url=current_url, headers=xm_fish_head)
    if response.status_code == 200:
        response.encoding = "UTF-8"
        doc = pQ(response.text)
        #  Decode the page's obfuscated content before the real scraping starts
        lis = doc('#select_tab2 > ul > li')  # match the target elements via their CSS path
        for value in lis.items():
            name = value('div.list-word > h3 > a').text()
            name = " ".join(name.split(','))
            info = value('div.list-word > span.list-attr').text()
            floor = info.split('第')[1].split('层')[0]  # floor number sits between '第' and '层'
            add = value('div.list-word > span.list-addr').text()
            village = value('div.list-word > span.list-addr > em:nth-child(1) > a').text()
            district = add.split('[')[1].split('-')[0]
            rent = value('div.list-word > span.list-price').text().split('元/月')[0]
            branded = value('div.list-word > div > span > i').text()
            is_branded = branded.find('品牌公寓')
            if is_branded == -1:  # not a branded apartment
                info = " ".join(info.split('朝'))
                rent_type = info.split()[0]
                house_type = info.split()[1]
                layout = info.split()[2]
                square = info.split()[3].split('平米')[0]
                direction = info.split()[4]
                trim_type = info.split()[5]
                update_time = value('div.list-word > span:nth-child(7)').text()
                sub_page_url = 'http://fangzi.xmfish.com' + value('div.list-word > h3 > a').attr('href')
                response_sub = requests.get(url=sub_page_url, headers=xm_fish_head)
                if response_sub.status_code == 200:
                    response_sub.encoding = "UTF-8"
                    sub_doc = pQ(response_sub.text)
                    describe = sub_doc('#info1 > div.bd > div').text()
                    describe = "".join(describe.split())
                    describe = " ".join(describe.split(','))
                    describe = " ".join(describe.split(','))
                    with open(filename, 'a', encoding='UTF-8') as f:
                        f.write('{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10},{11},{12},{13}\n '
                                .format(name, rent_type, house_type, layout, square, floor, direction, trim_type,
                                        add, district, rent, village, describe, update_time))
                    with open('for_the_word_cloud.csv', 'a', encoding='utf-8') as f:  # append; mode 'w' would clobber earlier listings
                        f.write('{0},{1}\n'.format(name, describe))
            else:  # a branded apartment ('品牌公寓')
                info = " ".join(info.split('第'))
                rent_type = info.split()[0]
                house_type = info.split()[1]
                layout = info.split()[2]
                square = info.split()[3].split('平米')[0]
                direction = 'null'
                trim_type = 'null'
                update_time = value('div.list-word > span.list-square').text()
                sub_page_url = 'http://fangzi.xmfish.com' + value('div.list-word > h3 > a').attr('href')
                response_sub = requests.get(url=sub_page_url, headers=xm_fish_head)
                if response_sub.status_code == 200:
                    response_sub.encoding = "UTF-8"
                    sub_doc = pQ(response_sub.text)
                    describe = sub_doc('body > div.bck > div > div > div.col-md-5').text()
                    describe = " ".join(describe.split('\n'))
                    describe = " ".join(describe.split(','))
                    describe = " ".join(describe.split(','))
                    describe = "".join(describe.split('房源描述'))
                    describe = "".join(describe.split('收起'))
                    with open(filename, 'a', encoding='UTF-8') as f:
                        f.write('{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10},{11},{12},{13}\n '
                                .format(name, rent_type, house_type, layout, square, floor, direction, trim_type,
                                        add, district, rent, village, describe, update_time))
                    with open('for_the_word_cloud.csv', 'a', encoding='utf-8') as f:  # append; mode 'w' would clobber earlier listings
                        f.write('{0},{1}\n'.format(name, describe))
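
The string slicing above is easier to follow against a concrete (made-up) list-attr value:

# Hypothetical info string shaped like what this parser expects
info = '整租 高层公寓 2室1厅 80平米 朝南 精装 第5层/共20层'
floor = info.split('第')[1].split('层')[0]  # -> '5' (between '第' and '层')
parts = " ".join(info.split('朝')).split()  # -> ['整租', '高层公寓', '2室1厅', '80平米', '南', '精装', '第5层/共20层']
square = parts[3].split('平米')[0]  # -> '80'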
Example #8
def get_html_attribute():
    doc = pQ(html)
    a = doc('.item-0.active a')
    print(a)
    print(a.attr('href'))  # call form
    print(a.attr.href)  # attribute-access form, equivalent
Example #9
def get_html_from_brother():
    doc = pQ(html)
    items = doc('.list .item-0.active')
    li = items.siblings()  # all sibling li elements of the selected node
    print(li)
Example #10
def get_html_from_parent():
    doc = pQ(html)
    items = doc('.list')
    container = items.parent('#container')  # direct parent only, filtered by the '#container' selector
    print(container)
Example #11
def get_html_from_css_base():
    doc = pQ(html)
    print(doc('#container .list li'))
Example #12
def get_html_from_file():
    # pQ(filename='1.txt') had unresolved issues with Chinese text,
    # so fall back to a plain f.read()
    txt = read_file('1.txt')
    doc = pQ(txt)
    print(doc('li'))  # select the li elements
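
read_file is a helper from the source project; a plausible stand-in:

def read_file(path):
    # Hypothetical helper: read the whole file as UTF-8 text
    with open(path, encoding='utf-8') as f:
        return f.read()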
Example #13
def get_html_from_url(url):
    doc = pQ(url=url, encoding="utf-8")  # pass encoding="utf-8" to avoid garbled Chinese text
    # print(doc('head'))
    return doc
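
When given a url keyword, pQ fetches the page itself before parsing. A quick usage sketch (the target URL is a placeholder):

doc = get_html_from_url('https://example.com')
print(doc('title').text())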
Example #14
def get_html_li():
    doc = pQ(html)
    print(doc('li'))