Example #1
    # Needs: import time; from selenium import webdriver; from pyquery import PyQuery as pQ
    def response_modifier(self, app_iter):
        '''
        Replace every iframe that has a 'src' attribute
        with its rendered HTML.
        '''

        browser = webdriver.PhantomJS()
        # A WSGI app_iter yields bytes: join the chunks and decode before parsing
        html = b''.join(app_iter).decode('utf-8')

        dom = pQ(html)
        iframes = dom('iframe[src]')
        for iframe in iframes:
            dom_node = pQ(iframe)
            start_time = time.time()
            browser.get(dom_node.attr('src'))
            end_time = time.time()

            WAIT_SEC = end_time - start_time
            TRIES_LEFT = 5
            # Wait for all AJAX scripts to finish: every time the length of
            # the page source changes, the counter is reset. Each iteration
            # waits as long as the initial page load took.
            last_length = len(browser.page_source)
            while TRIES_LEFT:
                time.sleep(WAIT_SEC)
                new_length = len(browser.page_source)
                if new_length == last_length:
                    TRIES_LEFT -= 1
                else:
                    last_length = new_length
                    TRIES_LEFT = 5

            rendered_iframe = browser.page_source

            div = pQ(rendered_iframe)
            # Keep only the content of the body tag
            div = div.children('body')
            div.insertAfter(dom_node)
            dom_node.remove()

        browser.quit()
        ret = dom.outerHtml()

        # Return the body as a list of UTF-8 bytes, as WSGI expects
        return [ret.encode('utf-8')]
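
A sketch of how a method like this could be wired into a WSGI stack; the class name and wiring below are illustrative assumptions, not taken from the source:

import time
from pyquery import PyQuery as pQ
from selenium import webdriver

class IframeRenderingMiddleware:  # hypothetical name
    def __init__(self, app):
        self.app = app

    def __call__(self, environ, start_response):
        app_iter = self.app(environ, start_response)
        # Post-process the wrapped app's body; a real setup would also
        # have to fix up Content-Length after rewriting the HTML
        return self.response_modifier(app_iter)

    # response_modifier() as defined above would live here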
Example #2
def get_html_from_son_element():
    doc = pQ(html)
    items = doc('.list')
    # print(items)
    # lis = items.find('li')  # find() matches any descendant, not just direct children
    lis = items.children('.active')  # children() matches direct child elements only
    print(lis)
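
Examples #2-#4 and #8-#11 all reference a module-level html string that is not shown. A plausible reconstruction that satisfies the selectors they use (#container, .wrap, .list, .item-0.active) is:

from pyquery import PyQuery as pQ

# Hypothetical sample document, reconstructed to match the selectors
# used in these examples; not taken from the source project
html = '''
<div class="wrap">
  <div id="container">
    <ul class="list">
      <li class="item-0">first item</li>
      <li class="item-1"><a href="link2.html">second item</a></li>
      <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
  </div>
</div>
'''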
Example #3
def get_html_from_parents():
    doc = pQ(html)
    items = doc('.list')
    # parents = items.parents()
    # print(parents)  # prints each ancestor node in turn
    parent = items.parents('.wrap')  # keep only the ancestor with class 'wrap'
    print(parent)
Example #4
def get_html_text():
    doc = pQ(html)
    a = doc('.item-0.active a')
    print(a)
    txt = a.text()  # text inside the <a> tag, with HTML tags stripped
    print(txt)
    htm = a.html()  # raw HTML inside the <a> tag
    print(htm)
Example #5
def url_to_context(page_url):
    response = requests.get(url=page_url, headers=tc_head, timeout=3)
    if response and response.status_code == 200:
        response.encoding = "UTF-8"
        response_ = get_num(response)
        context = pQ(response_)
        return context
    else:
        print('Failed to operate!')
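
This example and the next both depend on tc_head (request headers) and get_num (a project-specific response preprocessor) defined elsewhere in the source. Hypothetical stand-ins that make the snippet self-contained:

import requests
from pyquery import PyQuery as pQ

tc_head = {'User-Agent': 'Mozilla/5.0'}  # hypothetical request headers

def get_num(response):
    # Hypothetical stand-in: the real get_num presumably decodes or
    # cleans the response text before it is parsed
    return response.text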
Example #6
def url_to_context(page_url):
    response = requests.get(url=page_url, headers=tc_head)
    print(response.text)
    if response and response.status_code == 200:
        response.encoding = "UTF-8"
        response_ = get_num(response)
        context = pQ(response_)
        return context
    else:
        print('fail')
Example #7
def parse_href(current_url):
    response = requests.get(url=current_url, headers=xm_fish_head)
    if response.status_code == 200:
        response.encoding = "UTF-8"
        doc = pQ(response.text)
        #  Decode the page's obfuscated content before the real scraping starts
        lis = doc('#select_tab2 > ul > li')  # match the target elements via their CSS path
        for value in lis.items():
            name = value('div.list-word > h3 > a').text()
            name = " ".join(name.split(','))
            info = value('div.list-word > span.list-attr').text()
            floor = info.split('第')[1].split('层')[0]  # floor number sits between '第' and '层'
            add = value('div.list-word > span.list-addr').text()
            village = value('div.list-word > span.list-addr > em:nth-child(1) > a').text()
            district = add.split('[')[1].split('-')[0]
            rent = value('div.list-word > span.list-price').text().split('元/月')[0]
            branded = value('div.list-word > div > span > i').text()
            is_branded = branded.find('品牌公寓')
            if is_branded == -1:  # not a branded apartment
                info = " ".join(info.split('朝'))
                rent_type = info.split()[0]
                house_type = info.split()[1]
                layout = info.split()[2]
                square = info.split()[3].split('平米')[0]
                direction = info.split()[4]
                trim_type = info.split()[5]
                update_time = value('div.list-word > span:nth-child(7)').text()
                sub_page_url = 'http://fangzi.xmfish.com' + value('div.list-word > h3 > a').attr('href')
                response_sub = requests.get(url=sub_page_url, headers=xm_fish_head)
                if response_sub.status_code == 200:
                    response_sub.encoding = "UTF-8"
                    sub_doc = pQ(response_sub.text)
                    describe = sub_doc('#info1 > div.bd > div').text()
                    describe = "".join(describe.split())
                    describe = " ".join(describe.split(','))
                    describe = " ".join(describe.split(','))
                    with open(filename, 'a', encoding='UTF-8') as f:
                        f.write('{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10},{11},{12},{13}\n '
                                .format(name, rent_type, house_type, layout, square, floor, direction, trim_type,
                                        add, district, rent, village, describe, update_time))
                    with open('for_the_word_cloud.csv', 'a', encoding='utf-8') as f:  # append; mode 'w' would clobber earlier listings
                        f.write('{0},{1}\n'.format(name, describe))
            else:  # a branded apartment ('品牌公寓')
                info = " ".join(info.split('第'))
                rent_type = info.split()[0]
                house_type = info.split()[1]
                layout = info.split()[2]
                square = info.split()[3].split('平米')[0]
                direction = 'null'
                trim_type = 'null'
                update_time = value('div.list-word > span.list-square').text()
                sub_page_url = 'http://fangzi.xmfish.com' + value('div.list-word > h3 > a').attr('href')
                response_sub = requests.get(url=sub_page_url, headers=xm_fish_head)
                if response_sub.status_code == 200:
                    response_sub.encoding = "UTF-8"
                    sub_doc = pQ(response_sub.text)
                    describe = sub_doc('body > div.bck > div > div > div.col-md-5').text()
                    describe = " ".join(describe.split('\n'))
                    describe = " ".join(describe.split(','))
                    describe = " ".join(describe.split(','))
                    describe = "".join(describe.split('房源描述'))
                    describe = "".join(describe.split('收起'))
                    with open(filename, 'a', encoding='UTF-8') as f:
                        f.write('{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10},{11},{12},{13}\n '
                                .format(name, rent_type, house_type, layout, square, floor, direction, trim_type,
                                        add, district, rent, village, describe, update_time))
                    with open('for_the_word_cloud.csv', 'a', encoding='utf-8') as f:  # append; mode 'w' would clobber earlier listings
                        f.write('{0},{1}\n'.format(name, describe))
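
The string slicing above is easier to follow against a concrete (made-up) list-attr value:

# Hypothetical info string shaped like what this parser expects
info = '整租 高层公寓 2室1厅 80平米 朝南 精装 第5层/共20层'
floor = info.split('第')[1].split('层')[0]  # -> '5' (between '第' and '层')
parts = " ".join(info.split('朝')).split()  # -> ['整租', '高层公寓', '2室1厅', '80平米', '南', '精装', '第5层/共20层']
square = parts[3].split('平米')[0]  # -> '80'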
Example #8
def get_html_attribute():
    doc = pQ(html)
    a = doc('.item-0.active a')
    print(a)
    print(a.attr('href'))  # call form
    print(a.attr.href)  # attribute-access form, equivalent
Example #9
def get_html_from_brother():
    doc = pQ(html)
    items = doc('.list .item-0.active')
    li = items.siblings()  # all sibling li elements of the selected node
    print(li)
Example #10
def get_html_from_parent():
    doc = pQ(html)
    items = doc('.list')
    container = items.parent('#container')  # direct parent only, filtered by the '#container' selector
    print(container)
Example #11
def get_html_from_css_base():
    doc = pQ(html)
    print(doc('#container .list li'))
Example #12
def get_html_from_file():
    # pQ(filename='1.txt') had unresolved issues with Chinese text,
    # so fall back to a plain f.read()
    txt = read_file('1.txt')
    doc = pQ(txt)
    print(doc('li'))  # select the li elements
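
read_file is a helper from the source project; a plausible stand-in:

def read_file(path):
    # Hypothetical helper: read the whole file as UTF-8 text
    with open(path, encoding='utf-8') as f:
        return f.read()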
Example #13
def get_html_from_url(url):
    doc = pQ(url=url, encoding="utf-8")  # pass encoding="utf-8" to avoid garbled Chinese text
    # print(doc('head'))
    return doc
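
When given a url keyword, pQ fetches the page itself before parsing. A quick usage sketch (the target URL is a placeholder):

doc = get_html_from_url('https://example.com')
print(doc('title').text())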
Example #14
def get_html_li():
    doc = pQ(html)
    print(doc('li'))