Example 1
import sys
import traceback
from urllib.parse import urljoin, urlparse

# `check_if_seed_hostname` and the `db` handle (a MongoDB-style error-log
# collection) are assumed to be provided by the surrounding crawler module.

def links(content_soup, url):
    """Collect links from a parsed page, keeping only URLs on the seed hostname."""
    found_links = []
    try:
        for a in content_soup.find_all('a'):
            href = a.get('href')

            # Skip anchors with no usable href (parsing None would raise).
            if href is None or href == "":
                continue

            parse_url = urlparse(href)

            # Skip pure fragment links such as "#section": they point back
            # into the current page.
            if parse_url.fragment and not parse_url.scheme:
                continue

            if parse_url.scheme:
                # Absolute URL: keep it only if it stays on the seed host.
                if check_if_seed_hostname(parse_url):
                    found_links.append(str(href))
            else:
                # Relative URL: drop '../' and './' segments, then resolve
                # against the page URL to get an absolute link.
                absolute_link = urljoin(url, href.replace('../', '').replace('./', ''))
                found_links.append(absolute_link)

    except Exception:
        print('Error in get_all_links.py ' + url)
        print(sys.exc_info())
        traceback.print_exc()
        # Record the failure; if logging itself fails, still return what we have.
        try:
            db.crawler_error_log.insert({'error_type': str(sys.exc_info()),
                                         'from_module': str(__file__)})
        except Exception:
            pass
        return found_links

    # Deduplicate before returning.
    return list(set(found_links))
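
The `check_if_seed_hostname` helper is not defined in the snippet; it decides whether an absolute URL stays on the site being crawled. A minimal sketch, assuming the crawler keeps the seed's hostname in a module-level variable (the `seed_hostname` name and its example value are hypothetical):

# Hypothetical helper -- the real crawler defines this elsewhere.
seed_hostname = 'www.example.com'  # assumed module-level seed hostname

def check_if_seed_hostname(parse_url):
    # Keep the crawl on the seed site by comparing parsed hostnames.
    return parse_url.hostname == seed_hostname
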
Example 2
# Uses the same imports and assumed helpers as Example 1.

def links(content_soup, url):
    """Variant that resolves every relative href with urljoin directly."""
    found_links = []
    try:
        for a in content_soup.find_all('a'):
            href = a.get('href')

            # Skip anchors with no usable href.
            if href is None or href == "":
                continue

            parse_url = urlparse(href)

            # Skip pure in-page fragment links.
            if parse_url.fragment and not parse_url.scheme:
                continue

            if parse_url.scheme:
                # Absolute URL: keep only links on the seed host.
                if check_if_seed_hostname(parse_url):
                    found_links.append(str(href))
            else:
                # Relative URL: resolve against the page URL.
                found_links.append(urljoin(url, str(href)))

    except Exception:
        print('Error in get_all_links.py ' + url)
        return found_links

    # Deduplicate before returning.
    return list(set(found_links))


# Example usage against a live page (requires network access):
#
#   from urllib.request import urlopen
#   from bs4 import BeautifulSoup
#
#   page = urlopen('http://www.google.com').read()
#   print(links(BeautifulSoup(page, 'html.parser'), 'http://www.google.com'))
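
For a self-contained check that needs no network access, the function can be exercised on an inline HTML string. This sketch assumes the hypothetical `check_if_seed_hostname` helper above, with `seed_hostname` set to 'www.example.com':

from bs4 import BeautifulSoup

sample_html = '''
<a href="http://www.example.com/a">absolute, on the seed host</a>
<a href="http://other.example.org/b">absolute, off the seed host</a>
<a href="sub/page.html">relative</a>
<a href="#top">fragment only</a>
'''

soup = BeautifulSoup(sample_html, 'html.parser')
print(links(soup, 'http://www.example.com/index.html'))
# Expected output (order may vary because of the set() deduplication):
# ['http://www.example.com/a', 'http://www.example.com/sub/page.html']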