def get_child_list(url, depth, dInstant):
    """
    Function to get all the links contained in a url, together with a
    series of characters indicating the depth level of the link
    being printed

    Keyword arguments:
    url -- The URL to analyze
    depth -- The crawling depth being analyzed, needed for printing stuff
    """
    bibtmp = []
    url_list = get_url_list(url)
    global file_locations
    global links_watched

    if not dInstant:
        for l in url_list:
            if url_is_http(l):
                # Absolute http(s) link
                if l.endswith(".bib") or l.endswith(".bib.gz"):
                    biblist.append(l)
                    bibtmp.append(l)
            else:
                # Treat anything else as a link relative to the current URL
                if l.endswith(".bib") or l.endswith(".bib.gz"):
                    biblist.append(urljoin(url, l))
                    bibtmp.append(urljoin(url, l))
            links_watched += 1
    else:
        for l in url_list:
            if url_is_http(l):
                # Absolute http(s) link: download the bib file right away
                if l.endswith(".bib") or l.endswith(".bib.gz"):
                    file_locations.append(bib_download(l, True))
                    bibtmp.append(l)
            else:
                # Treat anything else as a link relative to the current URL
                if l.endswith(".bib") or l.endswith(".bib.gz"):
                    file_locations.append(bib_download(urljoin(url, l), True))
                    bibtmp.append(urljoin(url, l))
            links_watched += 1
        logger.info("Links watched in total: %i", links_watched)

    """
    for l in url_list2:
        print_depth_point(depth)
        print(" %s" % (l))
    """

    for l in bibtmp:
        print_depth_point(depth)
        print(" %s" % l)
def recursive_bib_crawl(url, depth, max_level, dInstant):
    """
    Recursive function that crawl at level of depth,
    and if the max_level has not been reached, continues analyzing to the
    next level.

    Keyword arguments:
    url -- A string with the URL to analyze
    depth -- The current crawling depth
    max_level -- The maximum depth of crawling
    """

    url_list2 = []
    url_list = get_url_list(url)

    for l in url_list:
        if url_is_http(l):
            url_list2.append(l)
        else:
            # Treat anything else as a relative link (originally guarded by
            # url_is_relative(l))
            url_list2.append(urljoin(url, l))

    if depth <= max_level:
        for l in url_list2:
            get_child_list(l, depth, dInstant)

        for l in url_list2:
            recursive_bib_crawl(l, depth+1, max_level, dInstant)
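
# Hedged illustration (inferred from the code above, not from the original
# excerpt): with max_level=2, a crawl starting at some page `url` triggers
# calls roughly like
#
#     get_child_list(url, 1, dInstant)                    # bib links on the start page
#     recursive_bib_crawl(url, 2, 2, dInstant)
#         get_child_list(child_url, 2, dInstant)          # bib links one level deeper
#         recursive_bib_crawl(child_url, 3, 2, dInstant)  # depth > max_level, recursion stops
#
# for every child_url linked from the start page.
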
def print_child_list(url, depth):
    """ 
    Function to print all the links contained in a url, together with a 
    series of characters indicating the depth level of the link 
    being printed
   
    Keyword arguments:
    url -- A string with the URL to analyze
    depth -- The crawling depth being analyzed, needed for printing stuff

    """
    url_list = get_url_list(url)
    for l in url_list:
        if url_is_http(l):
            print_depth_point(depth)
            print " %s" % (l)
def bib_crawl(url, max_level=2, dInstant=False):
    """
    Receives a URL and a specified crawling depth and a download flag. It will crawl for bib files until the specified
    deepness. If dInstant is true this files will be download right away.

    Keyword arguments:
    url -- A string with the URL to analyze
    max_level -- The maximum depth of crawling for bib files
    dInstant -- if true it will download the files right away

    """
    if not url_is_http(url):
        exit_error("ERROR: URL provided must have HTTP/HTTPS scheme", 1)
    else:
        # First collect (and print) the bib links found directly on the URL
        get_child_list(url, 1, dInstant)

        # Then crawl the level-2 links and recurse among their links until the
        # maximum level is reached
        recursive_bib_crawl(url, 2, max_level, dInstant)
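
# Hedged usage sketch (hypothetical URL and helper name, not from the original
# excerpt; assumes the module-level state sketched after get_child_list()):
# crawl two levels deep and download the .bib files as soon as they are found,
# then report where they were stored.
def _demo_bib_crawl():
    bib_crawl("https://example.org/publications.html", max_level=2, dInstant=True)
    for path in file_locations:
        logger.info("Downloaded bib file: %s", path)
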
def print_links_to_level(url, max_depth):
    """ 
    arsespyder main function. Receives a URL and the crawling depth
    and prints on screen the links of the URL, the links of those links,
    and so on, up to max_depth.
   
    Keyword arguments:
    url -- A string with the URL to analyze
    max_depth -- The maximum depth of link analysis

    """
    if not url_is_http(url):
        exit_error("ERROR: URL provided must have HTTP/HTTPS scheme", 1)
    else:
        # First print all the child links (links on the URL)
        print_child_list(url, 1)

        # Then print the level-2 links and recurse among their links until the
        # maximum depth is reached
        recursive_analyze_links(url, 2, max_depth)
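
# Hedged usage sketch (hypothetical URL; recursive_analyze_links is assumed to
# be defined elsewhere in the original module): print the link tree of a page
# down to three levels.
#
#     print_links_to_level("https://example.org/", 3)
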