import validator  # project-local module providing the link helpers used below (exact import path assumed)


def get_links(site_url, tags):
    links = []
    for link in tags.find_all('a'):
        url = link.get('href')
        # check whether we have a bunch of relative links, or tags, or scripts, or even just garbage
        if validator.non_shitty_link(url):
            url = validator.clean_crappy_link(url)
            if validator.relative_link(url) or not (validator.has_http(url) or validator.has_https(url)):
                url = validator.make_non_relative_link(site_url, url)  # this seems inefficient...
            links.append(url)
    return links
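
# Sketch only, not from the original source: the validator helpers are defined
# elsewhere in this project, but make_non_relative_link() plausibly amounts to
# resolving a relative href against the page URL, which the standard library's
# urljoin already does. Names ending in _sketch are hypothetical.
from urllib.parse import urljoin


def _make_non_relative_link_sketch(site_url, url):
    # e.g. urljoin("http://example.com/a/", "../b.html") -> "http://example.com/b.html"
    return urljoin(site_url, url)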
import sys  # needed for sys.exit() below


def robotize_url(url):
    replacement = ""
    print("attempting to robotize %s" % url)
    if validator.has_http(url):
        replacement = "http://"
        url = validator.chop_http(url)
    elif validator.has_https(url):
        replacement = "https://"
        url = validator.chop_https(url)
    else:
        # We were given a bad link and need to shut down and figure out what went wrong.
        # We should have clean links coming into this class.
        print("Was given link:", url, "now dying...")
        sys.exit("What the f")
    path = url.split('/')[0]
    return replacement + path + "/robots.txt"
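
# A minimal usage sketch, not from the original source. Assuming the validator
# helpers behave as their names suggest (has_https()/chop_https() detect and strip
# the scheme), a call like robotize_url("https://example.com/some/page") returns
# "https://example.com/robots.txt". One plausible downstream use (my assumption)
# is feeding that URL to the standard library's robots.txt parser:
from urllib.robotparser import RobotFileParser


def _can_fetch_sketch(page_url):
    parser = RobotFileParser()
    parser.set_url(robotize_url(page_url))  # e.g. "https://example.com/robots.txt"
    parser.read()
    return parser.can_fetch("*", page_url)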
def get_links(site_url, tags):
    links = []
    top_url = validator.get_top_level_url(site_url)
    for link in tags.find_all('a'):
        url = link.get('href')
        # print("Top:%s ---- and siteURL: %s ---- " % (top_url, site_url))
        if url and str(top_url) not in str(url) and not is_a_file(url) and not validator.skip_this_link(url):
            # check whether we have a bunch of relative links, or tags, or scripts, or even just garbage
            if validator.non_shitty_link(url):
                url = validator.clean_crappy_link(url)
                if validator.relative_link(url) or not (validator.has_http(url) or validator.has_https(url)):
                    continue  # url = validator.make_non_relative_link(site_url, url)
                links.append(url)
        else:
            pass  # print("EXCLUDING %s for similarity/file issue." % url)
    return links
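
# A minimal calling sketch, not from the original source. tags.find_all('a') and
# link.get('href') above suggest that `tags` is a parsed BeautifulSoup document;
# the requests/bs4 usage below and the example.com URL are my assumptions.
import requests
from bs4 import BeautifulSoup


def _crawl_page_sketch(site_url="http://example.com"):
    response = requests.get(site_url)
    soup = BeautifulSoup(response.text, "html.parser")
    return get_links(site_url, soup)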