Ejemplo n.º 1
0
    def parse_ask(self, response):
        """Yield one ``ZhihuAskItem`` per record on a profile "asks" page.

        The username is the second-to-last ``/``-separated segment of
        ``response.url``.  Every ``div`` directly under the element whose id
        is ``zh-profile-ask-list`` is treated as one record, and its fields
        are read with XPath expressions relative to that ``div``.

        If any exception is raised while records are being extracted, the
        loop stops, the raw response body is written to
        ``error_pages/asks<username>.html`` and the error is printed.  The
        exception is not re-raised, so the remaining records of that page
        are skipped.
        """
        selector = Selector(response)
        username = response.url.split('/')[-2]

        try:
            for record in selector.xpath(r"id('zh-profile-ask-list')/div"):
                view_num = record.xpath(r'span/div[1]/text()')[0].extract()
                title = record.xpath(r"div/h2/a/text()")[0].extract()
                # The two counters are the text nodes following each <span>;
                # only the first space-separated token is kept, with
                # newlines stripped out.
                answer_num = record.xpath(
                    r"div/div/span[1]/following-sibling::text()")[0].extract(
                    ).split(' ')[0].replace('\n', '')
                follower_num = record.xpath(
                    r"div/div/span[2]/following-sibling::text()")[0].extract(
                    ).split(' ')[0].replace('\n', '')
                url = host + record.xpath(r"div/h2/a/@href")[0].extract()
                print(url)
                # The absolute question URL doubles as the item's _id.
                yield ZhihuAskItem(_id=url,
                                   username=username,
                                   url=url,
                                   view_num=view_num,
                                   title=title,
                                   answer_num=answer_num,
                                   follower_num=follower_num)
        except Exception as e:
            # Keep the offending page for later inspection.  The body is raw
            # bytes, so write it in binary mode, and use a context manager so
            # the handle is closed even if the write itself fails.
            dump_path = 'error_pages/asks' + username + '.html'
            with open(dump_path, 'wb') as dump_file:
                dump_file.write(response.body)
            print('=' * 10 + str(e))
Ejemplo n.º 2
0
                        summary = ''.join(record.xpath(r"div/div[4]/div/text()").extract()).replace("\n","").strip()  #TODO
                        content = ''.join(record.xpath(r"div/div[4]/textarea/text()").extract()).replace("\n","").strip()

                        comment_num = record.xpath(r"div/div[5]/div/a[2]/text()")[1].extract() #'添加评论'或者'3 条评论'
                        comment_num = comment_num.split(' ')[0] #取数字
                        if comment_num.startswith(u'添加评论'):
                            comment_num = '0'

                        yield ZhihuAnswerItem(_id=url,username = username,url = url, ask_title = ask_title, \
                                              ask_url = ask_url, agree_num = agree_num, summary = summary
                                              , content = content, comment_num = comment_num)
                except Exception, e:
                    open('error_pages/answers_' + response.url.split('/')[-2]+'.html', 'w').write(response.body)
                    print '='*10 + str(e)

            elif typeinfo.startswith('asks'):
                username = response.url.split('/')[-2]
                try:
                    for record in selector.xpath(r"id('zh-profile-ask-list')/div"):
                        view_num = record.xpath(r'span/div[1]/text()')[0].extract()
                        title = record.xpath(r"div/h2/a/text()")[0].extract()
                        answer_num = record.xpath(r"div/div/span[1]/following-sibling::text()")[0].extract().split(' ')[0].replace('\n','')
                        follower_num = record.xpath(r"div/div/span[2]/following-sibling::text()")[0].extract().split(' ')[0].replace('\n','')
                        url = host+record.xpath(r"div/h2/a/@href")[0].extract()

                        yield ZhihuAskItem(_id=url,username = username,url = url, view_num = view_num, title = title, answer_num = answer_num, follower_num = follower_num)
                except Exception, e:
                    open('error_pages/asks' + response.url.split('/')[-2]+'.html', 'w').write(response.body)
                    print '='*10 + str(e)