# Extracts outbound links from a parsed page (BeautifulSoup tree).
# check_if_seed_hostname and db (a MongoDB handle used for error logging) are
# assumed to be provided by the rest of the crawler; they are not defined here.
import sys
import traceback
from urlparse import urlparse, urljoin


def links(content_soup, url):
    links = []
    try:
        content_soup_all_a = content_soup.find_all('a')
        #content_soup_all_base = content_soup.find_all('base')
        for a in content_soup_all_a:
            href = a.get('href')
            if href is None or href == "":
                # anchor with no usable target
                continue
            parse_url = urlparse(href)
            if parse_url.fragment and not parse_url.scheme:
                # in-page fragment such as "#section"
                continue
            elif parse_url.scheme:
                # absolute URL: keep it only if it stays on the seed hostname
                if check_if_seed_hostname(parse_url):
                    links.append(str(href))
            else:
                # convert relative URLs into absolute URLs
                absolute_link = urljoin(url, href.replace('../', '').replace('./', ''))
                links.append(absolute_link)
    except Exception:
        print 'Error in get_all_links.py ' + url
        print sys.exc_info()
        traceback.print_exc()
        try:
            db.crawler_error_log.insert({'error_type': str(sys.exc_info()),
                                         'from_module': str(__file__)})
        except Exception:
            pass
        return links
    # de-duplicate before returning
    links = list(set(links))
    return links
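# check_if_seed_hostname is imported from elsewhere in the crawler and is not
# shown here. A minimal sketch of what such a check might look like, assuming a
# hypothetical SEED_HOSTNAME constant holding the host of the seed URL:
#
#   def check_if_seed_hostname(parse_url):
#       # keep only URLs whose host matches the crawl's seed host
#       return parse_url.hostname == SEED_HOSTNAME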
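# Minimal usage sketch for running this module standalone, assuming bs4 and a
# reachable network; check_if_seed_hostname and db still have to be supplied by
# the rest of the crawler for seed-host filtering and error logging to work.
if __name__ == '__main__':
    import urllib2
    from bs4 import BeautifulSoup

    page = urllib2.urlopen('http://www.google.com').read()
    print links(BeautifulSoup(page, 'html.parser'), 'http://www.google.com')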