def get_links_from_soup(self, soup, baseurl):
    """Collect article URLs from the preview boxes of a parsed page.

    Every ``div`` with class ``article-preview-top`` is examined first,
    then every ``div`` with class ``article-preview``; the first ``a``
    element found in each contributes one link.  If the document head
    contains a ``<base>`` element with a non-empty ``href``, that value
    replaces ``baseurl`` when resolving links.

    Args:
        soup: Parsed HTML document (BeautifulSoup-style object exposing
            ``head`` and ``findAll``).
        baseurl: URL passed to ``url_to_absolute`` when the page declares
            no usable ``<base href>``.

    Returns:
        List of URLs as returned by ``url_to_absolute``; empty when the
        page has no matching preview boxes.
    """
    # Documents without a <head> would otherwise crash on ``None.find``.
    head = soup.head
    base = head.find('base') if head is not None else None
    # <base> may legally carry only ``target``; only honour a real href.
    base_href = base.get('href') if base is not None else None
    if base_href:
        baseurl = base_href

    previews = (
        soup.findAll('div', attrs={'class': 'article-preview-top'})
        + soup.findAll('div', attrs={'class': 'article-preview'})
    )

    links = []
    for preview in previews:
        anchor = preview.find('a')
        if anchor is None:
            continue
        # Skip named anchors and similar <a> elements that have no href.
        href = anchor.get('href')
        if href is not None:
            links.append(url_to_absolute(href, baseurl))
    return links
def get_links_from_soup(self, soup, baseurl):
    """Collect headline URLs from the ``div#contentw`` section of a page.

    For each ``h3`` inside the ``div`` whose id is ``contentw``, the anchor
    with class ``mainHeadline`` is used if present, otherwise the first
    ``a`` element in that heading.

    Args:
        soup: Parsed HTML document (BeautifulSoup-style object exposing
            ``find``).
        baseurl: URL passed to ``url_to_absolute`` to resolve each link.

    Returns:
        List of URLs as returned by ``url_to_absolute``; empty when the
        content wrapper is missing or holds no linked headings.
    """
    container = soup.find('div', attrs={'id': 'contentw'})
    if container is None:
        # Page without the expected wrapper: nothing to extract.
        return []

    links = []
    for heading in container.findAll('h3'):
        # Prefer the anchor flagged as the main headline, else any anchor.
        anchor = heading.find('a', attrs={'class': 'mainHeadline'})
        if anchor is None:
            anchor = heading.find('a')
        if anchor is None:
            continue
        # Skip <a> elements that have no href instead of raising KeyError.
        href = anchor.get('href')
        if href is not None:
            links.append(url_to_absolute(href, baseurl))
    return links