def get_entries(self, soup):
    """
    Yield entry dicts for each article link found in *soup*.

    Each entry has 'url' (self.url joined with the anchor's href),
    'headline' (stripped link text), and 'created_at' (current time).
    """
    anchors = soup.find_all('a', {'class': 'link-article'})
    for anchor in anchors:
        href = anchor.attrs.get('href')
        entry = {
            'url': self.url + href,
            'headline': anchor.text.strip(),
            'created_at': utils.now(),
        }
        yield entry
def get_created(self, obj):
    """
    Return the earliest candidate timestamp for *obj*, or the current time.

    Candidates are extracted via DATE_CANDIDATE_JSONPATH; when at least one
    exists, the earliest (a struct_time) is converted to a timestamp.
    Falls back to utils.now() when no date candidates are found.
    """
    candidates = self.get_candidates(obj, DATE_CANDIDATE_JSONPATH)
    if candidates:
        # min() finds the earliest in O(n); no need to fully sort
        # just to take the first element.
        return utils.struct_time_to_ts(min(candidates))
    return utils.now()
def get_entries(self, soup=None):
    """
    Yield entry dicts for each captioned article block in *soup*.

    :param soup: optional parsed page; defaults to self.soup when omitted.

    Bug fix: the original unconditionally overwrote *soup* with self.soup,
    silently ignoring a caller-supplied soup. Fall back only when None,
    which keeps the no-argument call path behaviorally identical.
    """
    if soup is None:
        soup = self.soup
    for div in soup.find_all('div', {'class': ['caption', 'caption-dark']}):
        a_tag = div.find('a')
        yield {
            'url': self.url + a_tag.attrs.get('href'),
            # Caption text may span lines; the headline is the first line.
            'headline': a_tag.text.strip().split('\n')[0].strip(),
            'created_at': utils.now()
        }
def get_entries(self, soup):
    """
    Yield entry dicts for each 'post-box' div in *soup*.

    The URL comes from the box's first anchor (already absolute — no
    self.url prefix), the headline from its first <h4>, and 'created_at'
    is stamped with the current time.
    """
    for box in soup.find_all('div', {'class': 'post-box'}):
        link = box.find('a')
        heading = box.find('h4')
        yield {
            'url': link.attrs.get('href'),
            'headline': heading.text.strip(),
            'created_at': utils.now(),
        }
def parse_entry(self, entry):
    """
    Parse a single RSS feed entry into an entry dict.

    Returns a dict with 'url', 'headline' (the entry title), and
    'created_at' stamped with the current time.
    """
    return {
        'url': self.get_url(entry),
        'headline': self.get_title(entry),
        'created_at': utils.now(),
    }
def parse_entries(self, soup):
    """
    Yield entry dicts parsed from *soup*.

    Each 'box-link' anchor supplies the (absolute) URL; its nested
    <strong> title element supplies the headline; 'created_at' is
    stamped with the current time.
    """
    links = soup.find_all('a', {'class': 'box-link'})
    title_classes = ['post-title', 'title-news']
    for link in links:
        title = link.find('strong', {'class': title_classes})
        yield {
            'url': link.attrs.get('href'),
            'headline': title.text.strip(),
            'created_at': utils.now(),
        }