def get_child_list(url, depth, dInstant):
    """Collect all the .bib links contained in a URL and print them, preceded
    by a series of characters indicating the depth level of the link.

    Keyword arguments:
    url -- The URL to analyze
    depth -- The crawling depth being analyzed, needed for printing
    dInstant -- if True, download each .bib file as soon as it is found
    """
    global file_locations
    global links_watched

    bibtmp = []
    url_list = get_url_list(url)

    if not dInstant:
        for l in url_list:
            if url_is_http(l):
                if l.endswith(".bib") or l.endswith(".bib.gz"):
                    biblist.append(l)
                    bibtmp.append(l)
            else:
                # Relative link: resolve it against the current URL
                if l.endswith(".bib") or l.endswith(".bib.gz"):
                    biblist.append(urljoin(url, l))
                    bibtmp.append(urljoin(url, l))
            links_watched = links_watched + 1
    else:
        for l in url_list:
            if url_is_http(l):
                if l.endswith(".bib") or l.endswith(".bib.gz"):
                    file_locations.append(bib_download(l, True))
                    bibtmp.append(l)
            else:
                # Relative link: resolve it against the current URL
                if l.endswith(".bib") or l.endswith(".bib.gz"):
                    file_locations.append(bib_download(urljoin(url, l), True))
                    bibtmp.append(urljoin(url, l))
            links_watched = links_watched + 1

    logger.info("Links watched in total: %i" % links_watched)

    for l in bibtmp:
        print_depth_point(depth)
        print(" %s" % l)
def recursive_bib_crawl(url, depth, max_level, dInstant):
    """Recursive function that crawls at the given depth level and, if
    max_level has not been reached, continues analyzing the next level.

    Keyword arguments:
    url -- A string with the URL to analyze
    depth -- The current crawling depth
    max_level -- The maximum depth of crawling
    dInstant -- if True, download .bib files as soon as they are found
    """
    url_list2 = []
    url_list = get_url_list(url)
    for l in url_list:
        if url_is_http(l):
            url_list2.append(l)
        else:
            # Relative link: resolve it against the current URL
            url_list2.append(urljoin(url, l))
    if depth <= max_level:
        for l in url_list2:
            get_child_list(l, depth, dInstant)
        for l in url_list2:
            recursive_bib_crawl(l, depth + 1, max_level, dInstant)
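# A minimal usage sketch (assumptions: the helper functions above exist in this
# module and the URL is purely illustrative):
#
#     # Starting at depth 2, list the .bib links found on the child pages of
#     # the given URL and on their children, stopping once depth exceeds
#     # max_level.
#     recursive_bib_crawl("https://example.org/publications", 2, 3, dInstant=False)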
def print_child_list(url, depth):
    """Print all the links contained in a URL, preceded by a series of
    characters indicating the depth level of the link being printed.

    Keyword arguments:
    url -- A string with the URL to analyze
    depth -- The crawling depth being analyzed, needed for printing
    """
    url_list = get_url_list(url)
    for l in url_list:
        if url_is_http(l):
            print_depth_point(depth)
            print(" %s" % l)
def bib_crawl(url, max_level=2, dInstant=False):
    """Receive a URL, a crawling depth and a download flag, and crawl for
    .bib files up to the specified depth. If dInstant is True, the files are
    downloaded as soon as they are found.

    Keyword arguments:
    url -- A string with the URL to analyze
    max_level -- The maximum depth of crawling for .bib files
    dInstant -- if True, download the files right away
    """
    if not url_is_http(url):
        exit_error("ERROR: URL provided must have HTTP/HTTPS scheme", 1)
    else:
        # First process all the child links of the URL
        get_child_list(url, 1, dInstant)
        # Then process level 2 links and recurse among their links until the
        # maximum level is reached
        recursive_bib_crawl(url, 2, max_level, dInstant)
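# Example usage, as a sketch (assumes the module-level globals biblist,
# file_locations and links_watched are initialized elsewhere; the URL is
# illustrative only):
#
#     # Crawl two levels deep and download every .bib / .bib.gz file found.
#     bib_crawl("https://example.org/publications", max_level=2, dInstant=True)
#     # Paths of the downloaded files are accumulated in file_locations.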
def print_links_to_level(url, max_depth):
    """arsespyder main function. Receives a URL and the crawling depth and
    prints on screen the links of the URL, the links of the links of the URL,
    and so on, up to max_depth.

    Keyword arguments:
    url -- A string with the URL to analyze
    max_depth -- The maximum depth of link analysis
    """
    if not url_is_http(url):
        exit_error("ERROR: URL provided must have HTTP/HTTPS scheme", 1)
    else:
        # First print all the child links (links on the URL)
        print_child_list(url, 1)
        # Then print level 2 links and recurse among their links until the
        # maximum depth is reached
        recursive_analyze_links(url, 2, max_depth)
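# Example usage, as a sketch (the URL is illustrative only):
#
#     # Print the links of the page, of its linked pages, etc., three levels deep.
#     print_links_to_level("https://example.org", 3)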