Code Example #1
File: parsers.py Project: GRSEB9S/rfpow
    def next(self, parse_each=True):
        """Return the next batch (at most 10) of parsed RFPs.

        Returns a list of dictionaries, one per RFP. If parse_each is
        True, the dedicated page of each RFP is parsed to extract
        additional metadata; otherwise only the original ID, title,
        and permanent URI of each RFP are returned."""

        rfp_list = []

        # load HTML for page with list of RFPs
        self.load( self.get_list_uri() )

        if self.request is None:
            raise IOError( 'Request object not initialized. Run load() first' )

        try:
            s = self.request.read()
            self.doc = pq( s )
        except lxml.etree.XMLSyntaxError:
            logging.error( 'Could not parse URI: %s' % self.list_uri )
            # Bail out rather than run parse_list() on a stale or missing document
            return rfp_list

        parse_list = self.parse_list()

        # Don't parse individual RFPs if not instructed to
        if not parse_each: 
            return parse_list

        # Load and parse each RFP's dedicated page to grab more detailed
        # information about each one
        for l in parse_list:
            rfp_list.append( self.parse_details(l) )

        return rfp_list
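
A minimal usage sketch of this method, assuming MerxParser (the class name visible in Code Example #3) is the class it belongs to and takes no constructor arguments; both assumptions are not confirmed by these snippets:

# Hypothetical usage; MerxParser's constructor signature is an assumption.
parser = MerxParser()

# Summary fields only: original ID, title, and permanent URI
summaries = parser.next(parse_each=False)

# Full metadata: each RFP's dedicated page is loaded and parsed as well
for rfp in parser.next():
    print rfp['title'], rfp['uri']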
Code Example #2
File: parsers.py Project: GRSEB9S/rfpow
    def setup(self, uri):
        """Fetch the search-list page at uri and parse it into self.doc."""
        # retrieve search list
        request = urllib2.Request(uri, headers = self.headers)
        response = urllib2.urlopen(request).read()

        try:
            self.doc = pq(response)
        except lxml.etree.XMLSyntaxError:
            logging.error('Could not parse URI: %s' % uri)
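
For context, pq here is pyquery's PyQuery class, which parses markup with lxml; malformed input therefore surfaces as lxml.etree.XMLSyntaxError. A minimal, self-contained sketch of the parse-or-log pattern used throughout these snippets (parse_or_log is a hypothetical helper, not part of the project):

import logging
import lxml.etree
from pyquery import PyQuery as pq

def parse_or_log(markup, uri):
    """Return a parsed pyquery document, or None if the markup is unusable."""
    try:
        return pq(markup)
    except lxml.etree.XMLSyntaxError:
        logging.error('Could not parse URI: %s' % uri)
        return None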
Code Example #3
File: parsers.py Project: GRSEB9S/rfpow
    def parse_list(self):
        """Parse page with list of RFPs

        Assumes self.doc contains parsed DOM of list of RFPs page
        """

        self.parsed_list = []
        # parse the DOM looking for table rows of RFPs
        rows = self.doc(".BoxBody_Center.FullSearchListBody_Center table tr")

        # Drop the header row; the final row holds the pagination links,
        # which are handled separately below
        rows.pop(0)
        pagination = rows.pop()

        logging.info( "Got %s rows from Merx" % len(rows) )

        # extract RFP titles and links
        for i in range(len(rows)):
            link = rows.eq(i).find('td').eq(5).find('a')
            uri = MerxParser.domain + link.attr( "href" )
            id_search = self.id_pattern.search(uri)

            rfp = {
                'title'       : link.text(),
                'uri'         : uri,
                'original_id' : id_search.group(1) if id_search else '',
                'origin'      : 'merx'
            }

            self.parsed_list.append( rfp )

        pagination_links = pq( pagination ).find( '.NavLinkStyleLink' )
        next_page = pagination_links.eq( len(pagination_links)-1 )

        # This is the last page. Mark it for future reference
        # XXX: test stopping at last page
        if next_page.text().strip() != "Next":
            self.page = -1
            logging.debug( 'Reached the last page of RFP listings' )
        else:
            self.page += 1
            # strip the JavaScript wrapper around the URI in the onclick handler
            self.pagination_uri = next_page.parent().attr('onclick')[14:-3]
            # more of Merx's magic form values
            self.pagination_data[ 'search_profile' ] = self.doc( 'input' ).eq(0).val()

        return self.parsed_list
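
The id_pattern regex itself is not shown in these snippets; all parse_list() requires is that a successful search() exposes the RFP's original ID as group(1). A hypothetical sketch, assuming the ID is carried as an 'id=' query parameter:

import re

# Assumption: the ID appears as an 'id=' query parameter; the real
# pattern used by the parser is not shown in these snippets.
id_pattern = re.compile(r'id=(\d+)')

m = id_pattern.search('http://www.merx.com/listing?id=12345')  # illustrative URI
original_id = m.group(1) if m else ''                          # '12345'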
Code Example #4
File: parsers_test.py Project: dmkc/rfpow
    def testParseList(self):
        self.parser.load(self.parser.get_list_uri())

        try:
            s = self.parser.request.read()
            self.parser.doc = pq( s )
        except lxml.etree.XMLSyntaxError:
            logging.error( 'Could not parse URI: %s' % self.parser.list_uri )

        parsed_list = self.parser.parse_list()

        # 10 RFPs should have been parsed
        self.assertEqual(10, len(parsed_list))

        for rfp in parsed_list:
            self.assertEqual('merx', rfp['origin'])
            self.assertTrue(rfp['uri'].startswith('http://www.merx.com/'))
            self.assertNotEqual('', rfp['title'])
            self.assertNotEqual('', rfp['original_id'])
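
The test references self.parser but the fixture that builds it is not included in the snippet. A hedged sketch of what such a setUp might look like (the test-case class name is a guess; only MerxParser appears in the source):

import unittest

class MerxParserTest(unittest.TestCase):   # hypothetical test-case name
    def setUp(self):
        # The assertions above use self.parser, so the fixture
        # presumably constructs the parser under test here
        self.parser = MerxParser()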
Code Example #5
File: parsers.py Project: GRSEB9S/rfpow
    def parse_details(self, l):
        """Load the dedicated page of RFP l and return its full metadata."""
        try:
            self.load( (l['uri'], '') )
            s = self.request.read()
            self.doc = pq( s )

            # Parse page's data and stash results in a dictionary
            rfp = self.parse_rfp()
            rfp['title'] = l['title']
            rfp['original_id'] = l['original_id']
            rfp['origin'] = l['origin']
            rfp['uri'] = l['uri']

        except lxml.etree.XMLSyntaxError:
            logging.error( 'Could not parse RFP: %s' % l['uri'] )
            raise

        return rfp
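
parse_details() consumes the dictionaries that parse_list() (Code Example #3) produces, so its input is expected to look like the following. All values are made up for illustration; only the field names are taken from the source:

# Illustrative input for parse_details(); field names match the dict
# built in parse_list(), values are invented for the example
l = {
    'title'       : 'Snow removal services',
    'uri'         : 'http://www.merx.com/listing?id=12345',
    'original_id' : '12345',
    'origin'      : 'merx',
}
rfp = parser.parse_details(l)   # returns the dict enriched by parse_rfp()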
Code Example #6
File: parsers.py Project: GRSEB9S/rfpow
    def setup(self, uri):
        """Log in, capture the session cookie, then fetch the search list."""
        # login
        request = urllib2.Request( self.domain + self.login_url,
                                   urllib.urlencode(self.login_parameters),
                                   self.headers )
        response = urllib2.urlopen(request)

        # Pull the session cookie out of the Set-Cookie response header:
        # drop the 'Set-Cookie: ' prefix, the '; path=/' attribute, and
        # the trailing CRLF, keeping only the name=value pair
        for header in response.info().headers:
            if header.lower().startswith('set-cookie'):
                self.headers['Cookie'] = header[12:].replace('; path=/', '')[:-2]

        # retrieve search list
        request = urllib2.Request(uri, headers = self.headers)
        response = urllib2.urlopen(request).read()

        try:
            self.doc = pq(response)
        except lxml.etree.XMLSyntaxError:
            logging.error('Could not parse URI: %s' % uri)
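
The manual Set-Cookie slicing above is brittle: it hard-codes the prefix length and the attribute layout of the header. A sketch of the same login-then-fetch flow using the standard library's cookielib instead, assuming the same self.domain, self.login_url, and self.login_parameters attributes and the same imports as the original method:

import cookielib

    def setup(self, uri):
        """Sketch: same flow, with cookie handling delegated to cookielib."""
        jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))

        # Log in; the jar captures the session cookie automatically
        opener.open(self.domain + self.login_url,
                    urllib.urlencode(self.login_parameters))

        # Subsequent requests through the same opener replay the cookie
        response = opener.open(uri).read()

        try:
            self.doc = pq(response)
        except lxml.etree.XMLSyntaxError:
            logging.error('Could not parse URI: %s' % uri)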