Code Example #1
    def parse(self, response):
        logger.info('fetch : ' + response.url)

        # Build a mapping of state abbreviation -> full state name from the
        # <option> elements of the states dropdown.
        states = f_xpath(response, '//select[@id="states-dropdown"]').xpath('./option')

        sts = {}
        for st in states:
            st_short = fx_extract(st, './@value')
            st_name = fx_extract(st, './text()')

            # Skip placeholder options with an empty value attribute.
            if not st_short:
                continue

            if st_short not in sts:
                sts[st_short] = st_name

        # Each <ul class="hide clearfix"> groups one state's area links; its
        # id attribute is the state abbreviation.
        states = xpath(response, '//ul[contains(@class, "hide")'
                                 ' and contains(@class, "clearfix")]')

        for st in states:
            st_short = fx_extract(st, './@id')
            locs = st.xpath('./li')
            for loc in locs:
                url = fx_extract(loc, './a/@href')
                area = fx_extract(loc, './a/text()')

                # Ignore groups whose id is not a known state abbreviation.
                if st_short not in sts:
                    continue

                d = FindnsaveAreaItem()
                d['area'] = area
                d['short'] = st_short
                d['state'] = sts[st_short]
                d['url'] = url

                yield d
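
The examples on this page call project-local helpers (f_xpath, fx_extract,
xpath, and xpath_extract in the next example) plus the FindnsaveAreaItem
class, none of which appear in the listings. The sketch below is a plausible
reconstruction, assuming the helpers are thin wrappers over Scrapy's selector
API of that era (the project's real definitions may differ):

    import scrapy

    def xpath(sel, path):
        # All nodes matching path, as a selector list.
        return sel.xpath(path)

    def f_xpath(sel, path):
        # First node matching path, or None if nothing matched.
        nodes = sel.xpath(path)
        return nodes[0] if nodes else None

    def xpath_extract(sel, path):
        # All matches of path, extracted to strings.
        return sel.xpath(path).extract()

    def fx_extract(sel, path):
        # First match of path as a string, or None.
        texts = sel.xpath(path).extract()
        return texts[0] if texts else None

    class FindnsaveAreaItem(scrapy.Item):
        # Field names inferred from the assignments in parse() above.
        area = scrapy.Field()
        short = scrapy.Field()
        state = scrapy.Field()
        url = scrapy.Field()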
Code Example #2
File: amazon_spider.py Project: haiyandeng/iNT_fb
    def parse(self, response):
        logger.info('fetch : ' + response.url)

        # The product id is the last path segment of the URL, with any query
        # string stripped off first.
        prdid = response.url.split('?')[0].split('/')[-1]

        review = f_xpath(response, '//table[@id="productReviews"]/tr/td')
        if review is None:
            # No review table on this page: queue the next product and stop.
            yield scrapy.http.Request(url=next_product_url(),
                                      callback=self.parse)
            return

        rids = xpath_extract(review, './a/@name')
        details = xpath(review, './div')

        # Review anchors and detail blocks pair up positionally; iterate over
        # the shorter of the two lists in case the markup is ragged.
        length = min(len(rids), len(details))
        for i in range(length):

            rdetail = details[i]
            divs = xpath(rdetail, './div')

            # len(divs) is at most 7 (indices 0 - 6):
            # 0 : number of helpful votes
            # 1 : star rating, helpful text, date
            # 2 : reviewer, reviewer location
            # 3 : "this review is from ..."
            # 4 : free-product note
            # 5 : review text
            # 6 : "was this review helpful?" footer

            d = self.empty_item()
            d['prdid'] = prdid
            d['rid'] = rids[i]
            d['text'] = ' '.join(
                xpath_extract(rdetail, './div[@class="reviewText"]/text()'))

            # Consume the divs front to back: each recognized block fills in
            # its field, and the first unrecognized block ends the scan.
            while len(divs) > 0:
                div = divs[0]
                divs = divs[1:]

                text = div.extract()

                if 'people found the following review helpful' in text:
                    d['num_help_review'] = self.parse_num_help_review(div)
                    continue

                if 'out of' in text and 'stars' in text and '</nobr>' in text:
                    d['star'], d['help_text'], d['date'] = \
                        self.parse_star_help_date(div)
                    continue

                if 'By' in text and 'See all my reviews' in text:
                    d['reviewer'], d['reviewer_from'] = \
                        self.parse_reviewer_from(div)
                    continue

                if 'This review is from' in text:
                    d['rfrom'] = self.parse_from(div)

                break

            yield d

        # Follow the review list's pagination if there is a next page;
        # otherwise move on to the next product.
        next_url = self.next_page(response) or next_product_url()

        # see http://doc.scrapy.org/en/latest/topics/request-response.html
        yield scrapy.http.Request(url=next_url, callback=self.parse,
                                  dont_filter=True)
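
The review spider also leans on two navigation helpers that the listing does
not show: self.next_page(), which should return the next review page's URL or
a falsy value on the last page, and a module-level next_product_url(), which
hands out the next product's review URL. A minimal sketch under those
assumptions (the URL, the queue, and the XPath below are illustrative guesses,
not the project's actual code):

    # Hypothetical queue of product review URLs; the real project presumably
    # builds this elsewhere, e.g. from a product-listing crawl.
    _product_urls = iter([
        'http://www.amazon.com/product-reviews/XXXXXXXXXX',
    ])

    def next_product_url():
        # Next product review URL, or None once the queue is exhausted.
        return next(_product_urls, None)

    def next_page(self, response):
        # URL of the review list's "Next" link, if any; the XPath is a guess
        # at the old review-page markup.
        return fx_extract(response, '//a[contains(text(), "Next")]/@href')

Note that dont_filter=True bypasses Scrapy's duplicate-request filter, which
matters here because pagination and the product queue can legitimately revisit
URLs the scheduler has already seen.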