def extract(self, link):
        """Download ``link`` and build a document record from the page.

        The title and body come from ``readability_extract``. The date is
        produced by the instance method named ``parse_<leader>_date``, where
        ``<leader>`` is ``self.extra['leader']``; that method receives the
        body, the raw page content and the link.

        Args:
            link: URL of the page to fetch.

        Returns:
            dict with the keys ``url``, ``title``, ``text``, ``date`` and
            ``source`` (taken from ``self.sources[self.index]``).

        Raises:
            requests.exceptions.Timeout: if the server does not respond
                within the timeout.
            AttributeError: if no ``parse_<leader>_date`` method exists.
        """
        # Without an explicit timeout, requests waits indefinitely on an
        # unresponsive server, which would stall the whole crawl.
        response = requests.get(link, timeout=30).content
        title, body = readability_extract(response)

        # The date parser is chosen at runtime from the configured leader.
        date_parser = getattr(self, 'parse_%s_date' % self.extra['leader'])
        date = date_parser(body, response, link)

        return {
            'url': link,
            'title': title,
            'text': body,
            'date': date,
            'source': self.sources[self.index],
        }
    def extract(self, response, link):
        """Turn an already-fetched page into a document record.

        Args:
            response: Page content, handed to ``readability_extract`` and to
                the date parser unchanged.
            link: URL the page came from; stored under ``url``.

        Returns:
            dict with the keys ``url``, ``title``, ``text``, ``date`` and
            ``source`` (taken from ``self.sources[self.index]``).
        """
        title, body = readability_extract(response)

        # Open question from the original author:
        # how to find out where speaker links start?

        # Look up the leader-specific parser, e.g. ``parse_<leader>_date``.
        parser_name = 'parse_%s_date' % self.extra['leader']
        date_parser = getattr(self, parser_name)
        date = date_parser(body, response, link)

        record = dict(url=link, title=title, text=body, date=date)
        record['source'] = self.sources[self.index]
        return record
    def extract(self, html, link):
        """Build a document record from an already-downloaded page.

        Args:
            html: Page markup as text; it is encoded to UTF-8 before being
                parsed with lxml.
            link: URL the page came from; stored under ``url``.

        Returns:
            dict with the keys ``url``, ``title``, ``text``, ``date`` and
            ``source``. ``date`` is the result of ``parse`` applied to the
            text of the page's ``td.createdate`` cell, or ``None`` when the
            page does not contain exactly one such cell or the cell is empty.
        """
        title, body = readability_extract(html)

        document = lxml.html.fromstring(html.encode('utf-8'))
        date_cells = document.cssselect('td.createdate')
        # Only trust the date when exactly one cell matches the selector.
        if len(date_cells) == 1:
            raw_date = date_cells[0].text_content().strip()
        else:
            raw_date = None

        # NOTE(review): ``parse`` is presumably dateutil's parser, which
        # raises on None or empty input; skip it so a page without a usable
        # date yields date=None instead of aborting the extraction.
        date = parse(raw_date) if raw_date else None

        return {
            'url': link,
            'title': title,
            'text': body,
            'date': date,
            'source': 'ACGA News & Views',
        }
Example #4
0
    def extract(self, html, link):
        """Extract title, text and creation date from a downloaded page.

        Args:
            html: Page markup as text; encoded to UTF-8 before lxml parses it.
            link: Source URL, copied into the record under ``url``.

        Returns:
            dict with the keys ``url``, ``title``, ``text``, ``date`` and
            ``source``. ``date`` is ``None`` unless the page has exactly one
            non-empty ``td.createdate`` cell, in which case it is the result
            of ``parse`` applied to that cell's stripped text.
        """
        (title, body) = readability_extract(html)

        document = lxml.html.fromstring(html.encode('utf-8'))
        date_cells = document.cssselect('td.createdate')

        date = None
        # Zero or several matches are treated as "no date available".
        if len(date_cells) == 1:
            date_text = date_cells[0].text_content().strip()
            # NOTE(review): ``parse`` is presumably dateutil's parser, which
            # rejects None and empty strings; only call it on real text so a
            # missing date no longer crashes the extraction.
            if date_text:
                date = parse(date_text)

        doc = {
            'url': link,
            'title': title,
            'text': body,
            'date': date,
            'source': 'ACGA News & Views'
        }
        return doc
    def extract(self, response, link):
        """Assemble a document record for a page that was fetched elsewhere.

        Args:
            response: Page content; passed as-is to ``readability_extract``
                and to the date parser.
            link: URL of the page; stored under ``url``.

        Returns:
            dict with the keys ``url``, ``title``, ``text``, ``date`` and
            ``source`` (taken from ``self.sources[self.index]``).
        """
        title, body = readability_extract(response)

        # Open question from the original author:
        # how to find out where speaker links start?

        # The parser method name depends on the configured leader.
        leader = self.extra['leader']
        parse_date = getattr(self, 'parse_%s_date' % leader)
        date = parse_date(body, response, link)

        source = self.sources[self.index]
        return {
            'url': link,
            'title': title,
            'text': body,
            'date': date,
            'source': source,
        }