Example #1
def get_links(site_url, tags):
    """Collect usable link URLs from the parsed page, making relative links absolute."""
    links = []
    for link in tags.find_all('a'):
        url = link.get('href')
        # Check whether we have a bunch of relative links, or tags, or scripts, or just garbage.
        if validator.non_shitty_link(url):
            url = validator.clean_crappy_link(url)
            if validator.relative_link(url) or not (validator.has_http(url) or validator.has_https(url)):
                url = validator.make_non_relative_link(site_url, url)  # this seems inefficient...
            links.append(url)
    return links
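
A minimal usage sketch, assuming validator is the project's own helper module and that the page has already been parsed with BeautifulSoup; requests, site_url, and the example domain below are illustrative, not part of the original:

import requests
from bs4 import BeautifulSoup

site_url = "http://example.com"  # hypothetical seed page
soup = BeautifulSoup(requests.get(site_url).text, "html.parser")
for found in get_links(site_url, soup):
    print(found)
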
Example #2
import sys

def robotize_url(url):
    """Build the robots.txt URL for the domain that the given URL belongs to."""
    replacement = ""
    print("attempting to robotize %s" % url)
    if validator.has_http(url):
        replacement = "http://"
        url = validator.chop_http(url)
    elif validator.has_https(url):
        replacement = "https://"
        url = validator.chop_https(url)
    else:
        # We were given a bad link and need to shut down and figure out what went wrong;
        # links coming into this class should already be clean.
        print("Was given link:", url, "now dying...")
        sys.exit("robotize_url was given a link without a scheme")
    path = url.split('/')[0]
    return replacement + path + "/robots.txt"
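
Under the assumed semantics of the validator helpers (has_http/has_https detect the scheme and chop_http/chop_https strip it), the function can be sanity-checked like this; the domains are placeholders:

print(robotize_url("http://example.com/some/page"))   # expected: http://example.com/robots.txt
print(robotize_url("https://example.org"))            # expected: https://example.org/robots.txt
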
Example #3
def get_links(site_url, tags):
    """Collect external, non-file link URLs from the parsed page."""
    links = []
    top_url = validator.get_top_level_url(site_url)
    for link in tags.find_all('a'):
        url = link.get('href')
        # print("Top:%s ---- and siteURL: %s ---- " % (top_url, site_url))
        # Skip links back into this site, links to files, and anything the validator says to ignore.
        if url and str(top_url) not in str(url) and not is_a_file(url) and not validator.skip_this_link(url):
            # Check whether we have a bunch of relative links, or tags, or scripts, or just garbage.
            if validator.non_shitty_link(url):
                url = validator.clean_crappy_link(url)
                if validator.relative_link(url) or not (validator.has_http(url) or validator.has_https(url)):
                    continue  # url = validator.make_non_relative_link(site_url, url)
                links.append(url)
        # else:
        #     print("EXCLUDING %s for similarity/file issue. " % url)
    return links
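
To make the difference from Example #1 concrete: under the assumed validator semantics, that version rewrites relative links to absolute URLs, while this one drops them and also excludes links that already contain the site's top-level URL. A rough sketch (BeautifulSoup and the HTML snippet are illustrative):

from bs4 import BeautifulSoup

html = '<a href="/about">About</a> <a href="http://other.org/page">Other</a>'
soup = BeautifulSoup(html, "html.parser")
links = get_links("http://example.com", soup)
# Example #1 would likely return both links (with /about rewritten to http://example.com/about);
# this version should return only ["http://other.org/page"], assuming is_a_file and
# skip_this_link pass it through.
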