def extract(self, link, timeout=30):
    """Download the page at ``link`` and build a document dict from it.

    The raw response bytes are handed to ``readability_extract``, which
    yields the title and the body. The date is produced by the method on
    ``self`` named ``parse_<leader>_date``, where ``<leader>`` is the value
    of ``self.extra['leader']``.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait on the server, forwarded to
            ``requests.get``. Defaults to 30.

    Returns:
        A dict with the keys ``url``, ``title``, ``text``, ``date`` and
        ``source``; ``source`` is ``self.sources[self.index]``.
    """
    # requests applies no timeout unless one is given, so a stalled server
    # would otherwise block this call indefinitely.
    response = requests.get(link, timeout=timeout).content
    (title, body) = readability_extract(response)

    # The date parser is chosen at run time from the configured leader name.
    date_parser = getattr(self, 'parse_%s_date' % self.extra['leader'])
    date = date_parser(body, response, link)

    doc = {
        'url': link,
        'title': title,
        'text': body,
        'date': date,
        'source': self.sources[self.index],
    }
    return doc
def extract(self, response, link):
    """Build a document dict from an already-fetched ``response``.

    ``readability_extract`` supplies the title and the body. The date is
    produced by the method on ``self`` named ``parse_<leader>_date``, where
    ``<leader>`` is the value of ``self.extra['leader']``.

    Args:
        response: Page content, passed to ``readability_extract`` and to
            the date parser unchanged.
        link: URL of the page, stored under ``url`` and also passed to the
            date parser.

    Returns:
        A dict with the keys ``url``, ``title``, ``text``, ``date`` and
        ``source``; ``source`` is ``self.sources[self.index]``.
    """
    title, body = readability_extract(response)

    # Open question: how to find out where speaker links start?
    parser_name = 'parse_%s_date' % self.extra['leader']
    date_parser = getattr(self, parser_name)
    date = date_parser(body, response, link)

    return {
        'url': link,
        'title': title,
        'text': body,
        'date': date,
        'source': self.sources[self.index],
    }
def extract(self, html, link):
    """Build a document dict from an already-downloaded page.

    ``readability_extract`` supplies the title and the body. The date is
    read from the page's ``td.createdate`` cell and run through ``parse``.

    Args:
        html: Page markup as text; it is UTF-8 encoded before being handed
            to ``lxml.html.fromstring``.
        link: URL of the page, stored unchanged under ``url``.

    Returns:
        A dict with the keys ``url``, ``title``, ``text``, ``date`` and
        ``source``. ``date`` is ``None`` when the page does not contain
        exactly one ``td.createdate`` cell.
    """
    (title, body) = readability_extract(html)

    document = lxml.html.fromstring(html.encode('utf-8'))
    date_cells = document.cssselect('td.createdate')

    # Only a single, unambiguous cell is trusted. With zero or several
    # matches there is no date text, and parse() must not be handed None.
    if len(date_cells) == 1:
        date = parse(date_cells[0].text_content().strip())
    else:
        date = None

    doc = {
        'url': link,
        'title': title,
        'text': body,
        'date': date,
        'source': 'ACGA News & Views',
    }
    return doc
def extract(self, html, link):
    """Build a document dict from an already-downloaded page.

    ``readability_extract`` supplies the title and the body. The date is
    read from the page's ``td.createdate`` cell and run through ``parse``.

    Args:
        html: Page markup as text; it is UTF-8 encoded before being handed
            to ``lxml.html.fromstring``.
        link: URL of the page, stored unchanged under ``url``.

    Returns:
        A dict with the keys ``url``, ``title``, ``text``, ``date`` and
        ``source``. ``date`` is ``None`` when the page does not contain
        exactly one ``td.createdate`` cell.
    """
    (title, body) = readability_extract(html)

    document = lxml.html.fromstring(html.encode('utf-8'))
    date_cells = document.cssselect('td.createdate')

    # Only a single, unambiguous cell is trusted. With zero or several
    # matches there is no date text, and parse() must not be handed None.
    if len(date_cells) == 1:
        date = parse(date_cells[0].text_content().strip())
    else:
        date = None

    doc = {
        'url': link,
        'title': title,
        'text': body,
        'date': date,
        'source': 'ACGA News & Views',
    }
    return doc
def extract(self, response, link):
    """Build a document dict from an already-fetched ``response``.

    ``readability_extract`` supplies the title and the body. The date is
    produced by the method on ``self`` named ``parse_<leader>_date``, where
    ``<leader>`` is the value of ``self.extra['leader']``.

    Args:
        response: Page content, passed to ``readability_extract`` and to
            the date parser unchanged.
        link: URL of the page, stored under ``url`` and also passed to the
            date parser.

    Returns:
        A dict with the keys ``url``, ``title``, ``text``, ``date`` and
        ``source``; ``source`` is ``self.sources[self.index]``.
    """
    title, body = readability_extract(response)

    # Open question: how to find out where speaker links start?
    parser_name = 'parse_%s_date' % self.extra['leader']
    date_parser = getattr(self, parser_name)
    date = date_parser(body, response, link)

    return {
        'url': link,
        'title': title,
        'text': body,
        'date': date,
        'source': self.sources[self.index],
    }