Code example #1
    def parse(self, response):
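        # Walk the proxy-list table located by the XPaths passed in
        # response.meta['p'], validate each ip:port pair and yield the
        # usable proxies as IpItem objects.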
        ipItem = IpItem()

        proxy = response.meta['p']
        rootPath = proxy['root']
        table = response.xpath(rootPath).extract()

        for sel in Selector(response=response).xpath(rootPath):
            ipPath = proxy['ip']
            portPath = proxy['port']

            ipList = sel.xpath(ipPath).extract()
            portList = sel.xpath(portPath).extract()

            ip = Utility.listToStr(ipList)
            port = Utility.listToStr(portList)
            # keep only strings that look like an IPv4 address
            regex = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
            if re.match(regex, ip):
                print ip
                v = ValidateIp()
                protocol, anonymous, speed = v.validate(ip, port)
                if protocol != -1:
                    ipItem['ip'] = ip
                    ipItem['port'] = port
                    print ipItem['ip'], ':', ipItem['port']
                    yield ipItem
Code example #2
File: accountSpider.py  Project: rexbean/spider
    def parseAccount(self, response):
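        # Parse a WeChat account's article-list page: print the title, url,
        # date and abstract of every entry and return the article URLs.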
        urls = []

        articleListItem = ArticleListItem()
        title = response.xpath('/html/head/title//text()').extract()
        print Utility.listToStr(title)
        # account = response.xpath('//*[@class="profile_info"]//text()').extract()[0].strip()
        print '----------------------account-------------------------'
        for articlePath in Selector(response=response).xpath(
                '//*[@class="weui_media_box appmsg"]/div'):
            # title
            title = articlePath.xpath('./h4//text()').extract()[0].strip()
            articleListItem['title'] = title
            print articleListItem['title']
            # url
            url = articlePath.xpath('./h4//@hrefs').extract()[0]
            url = "https://mp.weixin.qq.com" + url
            articleListItem['url'] = url

            print articleListItem['url']
            # date
            date = articlePath.xpath(
                './/*[@class="weui_media_extra_info"]//text()').extract()[0]
            articleListItem['date'] = date
            print articleListItem['date']
            # abstract
            abstract = articlePath.xpath(
                './/*[@class="weui_media_desc"]//text()').extract()
            articleListItem['abstract'] = Utility.listToStr(abstract)
            print articleListItem['abstract']

            # collect the URL of every article found on the page
            urls.append(url)
        return urls
Code example #3
File: articleSpider.py  Project: rexbean/spider
    def parseArticle(self, response):
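        # Extract the title, date, author, account name/id and body of a
        # single article page and yield them as an ArticleItem.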
        articleItem = ArticleItem()

        title = response.xpath(
            '//*[@id="activity-name"]//text()').extract()[0].strip()
        date = response.xpath('//*[@id="post-date"]//text()').extract()[0]
        author = response.xpath(
            '//*[@id="img-content"]/div[1]/em[2]').extract()
        account = response.xpath('//*[@id="post-user"]//text()').extract()[0]
        accountId = response.xpath(
            '//*[@id="js_profile_qrcode"]/div/p[1]/span//text()').extract()[0]
        content = response.xpath('//*[@id="js_content"]').extract()

        articleItem['title'] = title.encode('utf-8')
        articleItem['date'] = date.encode('utf-8')
        articleItem['account'] = account.encode('utf-8')
        articleItem['accountId'] = accountId.encode('utf-8')
        articleItem['content'] = Utility.listToStr(content).encode('utf-8')

        if len(author) > 0:
            author = author[0].encode('utf-8')
            articleItem['author'] = author
        else:
            articleItem['author'] = ''

        print articleItem['title']
        # Utility.writeArticleToFile(articleItem)
        yield articleItem
Code example #4
File: articleSpider.py  Project: rexbean/spider
    def parse(self, response):
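        # Parse a Sogou search-result page: each result is addressed by an id
        # of the form "sogou_vr_<num>_title_<n>"; print the list entry and
        # follow every article link with parseArticle as the callback.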
        for n in range(9):
            articleListItem = ArticleListItem()
            num = self.articleListCon
            rootPath = '//*[@id="sogou_vr_' + num

            titlePath = rootPath + '_title_' + str(n) + '"]'
            contentPath = rootPath + '_summary_' + str(n) + '"]'

            title = response.xpath(titlePath + '//text()').extract()
            url = response.xpath(titlePath + '/@href').extract()[0]
            content = response.xpath(contentPath + '//text()').extract()

            articleListItem['title'] = Utility.listToStr(title)
            articleListItem['url'] = url
            articleListItem['abstract'] = Utility.listToStr(content)

            print 'title==', articleListItem['title']
            print 'href==', articleListItem['url']
            print 'content=='
            print articleListItem['abstract']

            yield scrapy.Request(url, callback=self.parseArticle)
Code example #5
File: accountSpider.py  Project: rexbean/spider
    def parse(self, response):
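        # Parse a Sogou account search result: read the account name, id and
        # profile URL, render the profile page and hand it to parseAccount.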
        for n in range(1):
            accountListItem = AccountListItem()
            num = self.accountListCon

            rootPath = '//*[@id="sogou_vr_' + num + '_box_' + str(n) + '"]'
            accountIdPath = './div/div[2]/p[2]/label//text()'

            txtBox = response.xpath(rootPath)
            accountId = txtBox.xpath(accountIdPath).extract()

            accountName = txtBox.xpath('./div/div[2]/p[1]/a//text()').extract()

            url = txtBox.xpath('./div/div[2]/p[1]/a/@href').extract()[0]

            accountListItem['account'] = Utility.listToStr(accountName)
            accountListItem['url'] = url
            accountListItem['accountId'] = accountId[0].encode('utf-8')

            print accountListItem['account']
            print accountListItem['url']
            print accountListItem['accountId']

            url = url.replace('http', 'https')

            cmd = "phantomjs spider/getBody.js '%s'" % url
            print url
            print cmd
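            # the account page is built with JavaScript, so render it with
            # PhantomJS; spider/getBody.js writes the rendered HTML to stdout,
            # which is then wrapped in an HtmlResponse below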
            stdout, stderr = subprocess.Popen(
                cmd,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE).communicate()
            r = HtmlResponse(url=url, body=stdout)

            articleUrls = self.parseAccount(r)
            for url in articleUrls:
                yield scrapy.Request(url,
                                     callback=ArticleSpider().parseArticle)