Example #1
def config(fname):
    print "Trying to load configuration from %s" % fname
    cp = ConfigParser.SafeConfigParser()
    try:
        with open(fname, "r") as fp:
            cp.readfp(fp)
    except IOError as ie:
        if ie.errno == errno.ENOENT:
            return
        raise
    for section in cp.sections():
        if not section.lower().startswith("issuer "):
            continue
        if 'issuer' not in cp.options(section):
            print "Ignoring section %s as it has no `issuer` option set." % section
            continue
        if 'base_path' not in cp.options(section):
            print "Ignoring section %s as it has no `base_path` option set." % section
            continue
        issuer = cp.get(section, 'issuer')
        base_path = cp.get(section, 'base_path')
        base_path = urltools.normalize(base_path)
        issuer_info = g_authorized_issuers.setdefault(issuer, {})
        issuer_info['base_path'] = base_path
        if 'map_subject' in cp.options(section):
            issuer_info['map_subject'] = cp.getboolean(section, 'map_subject')
        print "Configured token access for %s (issuer %s): %s" % (
            section, issuer, str(issuer_info))
Example #2
def parseLink(url):  #be aware example.com is malformed
    arr = []
    baseUrl = base(url)
    page = getPage(url)
    if (page is not None):
        soup = BeautifulSoup(page, 'html.parser')
        for x in soup.find_all('a'):
            link = x.get('href')
            if (link is not None and link[0:4] == "http"):
                arr.append(link)
            elif (link is not None and len(link) >= 1 and link[0] == "/"):
                arr.append(baseUrl + link)
            elif (link is not None and link[0:4] == "www."):
                arr.append("http://" + link)
        arr2 = [urltools.normalize(x) for x in arr]
        arr3 = [transform(x) for x in arr2 if checkLinkStr(x)]
        terms = clearHtml(page)
        if (terms == None):
            return None
        return {
            'url': url,
            'html': page,
            'links': arr3,
            'terms': terms,
            'title': soup.title.text if (soup.title is not None) else ""
        }
    return None
Example #3
def normalize(url, parent):
    try:
        if '#' in url:
            url = url.split('#')[0]

        if url.startswith('http') and '//' not in url:
            url = url.split(':', 1)[-1]  # drop the malformed scheme prefix (lstrip strips characters, not a prefix)

        if url:
            url = urlparse.urljoin(parent, url)
            url = urltools.normalize(url)

        if url.startswith('https'):
            url = url.replace('https','http')

        if re.match(r'http:/', url):
            url = url.replace('https', 'http')

        if url.endswith('jpg') or url.endswith('png') or url.endswith('jpeg'):
            url = None

        url = url.rstrip('/')
    except:
        url = None
    return url
Example #4
    def urlchecker(self, url):
        if url is None:
            return False
        normalized_url = urltools.normalize(url)
        robotparser = urllib.robotparser.RobotFileParser()

        try:
            url_comp = urlparse(normalized_url)
            base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
        except:
            self.logger.error("Cannot parse: " + url)
        try:
            robotparser.set_url(base_url + "robots.txt")
            robotparser.read()
            if not robotparser.can_fetch("*", normalized_url):
                self.logger.error(url + " is excluded due to protocol")
                return False
        except:
            self.logger.error("Cannot determine robots exclusion protocol: " +
                              url)

        if normalized_url in self.visited_urls:
            self.logger.debug(url + " Has been visited before! ")
            return False
        elif base_url in self.sites_times and self.sites_times[base_url] > int(
                self.limit):
            #
            self.logger.debug(
                url + " Times visiting this site have reached the limit ")
            return False
        elif 'cgi' in normalized_url:
            return False
        else:
            return True
Example #5
def build_url(link):
    """
    Create valid URL from link.

    :param link: `str` relative url.
    :returns: `str` absolute url.
    """
    return urltools.normalize(''.join((constants.HOST, link)))
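A minimal usage sketch of the same idea, assuming a hypothetical HOST constant standing in for constants.HOST; joining the host with the relative link and then normalizing collapses dot segments and duplicate slashes:

import urltools

HOST = "http://example.com"  # hypothetical stand-in for constants.HOST

def build_url(link):
    # Concatenate the host and the relative link, then normalize the result
    return urltools.normalize(''.join((HOST, link)))

# build_url("/a/./b") would be expected to return "http://example.com/a/b"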
Example #6
 def save(self, *args, **kwargs):
     """
     Normalises the URL and generates the correct unique id
     :type args: []
     :type kwargs: {}
     """
     self.url = urltools.normalize(self.url)
     super(ShortUrl, self).save(*args, **kwargs)
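A hedged sketch of how such a Django model might look around this save override; the field definition and uniqueness constraint are assumptions for illustration, not taken from the original project:

import urltools
from django.db import models


class ShortUrl(models.Model):
    # Assumed field; the real project may store the URL differently
    url = models.URLField(max_length=500, unique=True)

    def save(self, *args, **kwargs):
        # Normalize before saving so equivalent spellings map to the same row
        self.url = urltools.normalize(self.url)
        super(ShortUrl, self).save(*args, **kwargs)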
Example #7
def canonicalize_url(url):
    # Canonicalize URL
    url = urltools.normalize(url)

    # Remove fragment
    url = urllib.parse.urldefrag(url).url

    return url
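A short, self-contained usage sketch, assuming urltools is installed; normalize lower-cases the host, drops the default port and resolves dot segments, and urldefrag then removes the fragment:

import urllib.parse
import urltools

url = urltools.normalize("http://Example.COM:80/a/../b#sec")   # expected: "http://example.com/b#sec"
print(urllib.parse.urldefrag(url).url)                          # expected: "http://example.com/b"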
Example #8
 def validate_path(self, values):
     if isinstance(values, str) or isinstance(values, unicode):
         values = [values]
     for value in values:
         if not value.startswith("/"):
             return False
         self.paths.add(urltools.normalize(value))
     return True
Example #9
def scrape_url(root_url, url):
    url = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(url, 'html.parser')
    link_tags = soup.find_all('a')
    result = deque()
    for link in link_tags:
        current_link = link['href']
        if current_link.startswith(root_url):
            result.append(normalize(current_link))
    return result
Example #10
def _standardize_url(url):
    """Takes in a url and returns a clean, consistent format. For example:
    example.com, http://example.com, example.com/ all are http://example.com/
    Returns None if the url is somehow invalid."""
    parts = parse.urlparse(url, "http") #default scheme is http if omitted
    standard = parts.geturl()
    standard = urltools.normalize(standard)
    if not url_regex.match(standard):
        return None
    return standard
Example #11
def urlchecker (url, limit):
	normalized_url = urltools.normalize(url)
	url_comp = urlparse(url)
	if visited_urls.has_key(url):
		return False
	elif times_visiting_site[url_comp.netloc] > limit:
		visited_urls[url] = True
		return False
	else:
		return True
Example #12
def urlnorm(url):
    u = urllib.parse.urlparse(urltools.normalize(url))
    path = u.path

    if len(path) > 0:
        if path[-1] == '/':
            path = path[:-1]

    v = (u.scheme, u.netloc, path, u.params, '', '')
    return urllib.parse.urlunparse(v)
Example #13
    def extract_urls(self, r):
        urls = set()
        tree = lxml.html.fromstring(r.text)

        for element, attribute, link, pos in tree.iterlinks():
            url = urltools.normalize(urljoin(r.url, link))
            urls.add(url)

        # self.stats['urls'] += len(urls)
        self.stats['processed'] += 1

        return urls
Example #14
    def extract_urls(self, r):
        urls = set()
        tree = lxml.html.fromstring(r.text)

        for element, attribute, link, pos in tree.iterlinks():
            url = urltools.normalize(urljoin(r.url, link))
            urls.add(url)

        # self.stats['urls'] += len(urls)
        self.stats['processed'] += 1

        return urls
Example #15
    def __init__(self, website, site_ctx=None, debug=False):
        self.uri = urltools.normalize(website)
        self.parsed = urltools.parse(website)
        self.domain = ".".join(self.parsed[4:6]).lstrip("www.")

        self.robots = None
        self.sitemap = None  # list of documents
        self.error = {}
        self.debug = debug

        self.__session = None
        self.load_domain_state()
Example #16
def pick():
    global timeout
    pair = sortedList.pop()
    if doLiveCheck:
        while (time.time() < 3 + timeout):  #This should be moved to fetch
            pass
        timeout = time.time()
        links = parseLink(wikiPrefix + pair[1])

        for link in links:
            if pair[2] in urltools.normalize(link):
                return pick()
    return pair
Example #17
    def _normalize_url(url):
        if url.startswith("//"):
            url = "http:" + url

        parsed_url = urlparse.urlparse(url)

        url_path = parsed_url.path
        url_path = urllib.quote(url_path, safe="%/:=&?~#+!$,;'@()*[]")

        url = urlparse.urlunparse(
            (parsed_url.scheme, parsed_url.netloc, url_path, "", "", ""))
        url = urltools.normalize(url)

        return url
Example #18
 def fetch(self, method, endpoint, params):
     api_endpoint = normalize(self.api_base + endpoint)
     if method.lower() in ["get", "delete"]:
         content = self.oauth.request(
             method, api_endpoint, params=params, headers={"User-Agent": "Semantics3 Python Lib/0.2"}
         )
     else:
         content = self.oauth.request(
             method,
             api_endpoint,
             data=json.dumps(params),
             headers={"User-Agent": "Semantics3 Python Lib/0.2", "Content-Type": "application/json"},
         )
     return content
Example #19
    def crawl(self):
        try:
            harvest_rate_accum = 0
            while self.webpages_crawled < int(self.webpages_limit):
                print(self.webpages_crawled)
                try:
                    url = self.priority_queue.pop()
                except Exception:
                    print("cannot pop")
                print(url)
                if self.urlchecker(url):
                    try:
                        content = self.downloader.download(url).decode('utf-8')
                        if content is not None:
                            self.webpages_crawled += 1
                            rel = self.relevance.relevance(content, self.query)
                            harvest_rate_accum += rel
                            self.crawled_log(" Harvest rate: " +
                                             str(harvest_rate_accum /
                                                 self.webpages_crawled))
                    except:
                        print("Failed in downloading")
                        continue
                    normalized_url = urltools.normalize(url)
                    try:
                        url_comp = urlparse(normalized_url)
                        base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
                    except:
                        self.logger.error("Cannot parse: " + url)

                    if base_url in self.sites_times:
                        self.sites_times[base_url] += 1
                    else:
                        self.sites_times[base_url] = 1
                    self.visited_urls.add(normalized_url)

                    if rel < 0.2:
                        continue
                    for link in self.parser.extract_all_links(content):
                        full_link = self.parser.parse_links(url, link)
                        if full_link is not None:
                            link_promise = self.calculator.link_promise(
                                full_link) + rel

                        try:
                            self.priority_queue.additem(
                                full_link, link_promise)
                        except:
                            pass
        except KeyError:
            print("Queue is empty now")
Example #20
def register_url():
    url_param = request.args.get('url')
    if not url_param:
        return make_response("url param is missing", 400)  # bad request

    # TODO: advanced input validation
    # https://validators.readthedocs.io/en/latest/#module-validators.url
    # https://github.com/django/django/blob/master/django/core/validators.py#L74

    clean_url = urltools.normalize(url_param)

    # create page
    page = store.create_page(clean_url)
    if page:
        return make_response(str(page.id), 201)  # Created

    return jsonify({'status': 'Url already exist'})
Example #21
 def fetch(self, method, endpoint, params):
     api_endpoint = normalize(self.api_base + endpoint)
     if method.lower() in ['get', 'delete']:
         content = self.oauth.request(
             method,
             api_endpoint,
             params=params,
             headers={'User-Agent': 'Semantics3 Python Lib/0.2'})
     else:
         content = self.oauth.request(method,
                                      api_endpoint,
                                      data=json.dumps(params),
                                      headers={
                                          'User-Agent':
                                          'Semantics3 Python Lib/0.2',
                                          'Content-Type': 'application/json'
                                      })
     return content
Example #22
def parseLink(url):  #be aware example.com is malformed
    arr = []
    baseUrl = base(url)
    page = getPage(url)
    if (page is not None):
        soup = BeautifulSoup(page, 'html.parser')
        for x in soup.find_all('a'):
            link = x.get('href')
            if (link is not None and link[0:4] == "http"):
                arr.append(link)
            elif (link is not None and len(link) >= 1 and link[0] == "/"):
                arr.append(baseUrl + link)
            elif (link is not None and link[0:4] == "www."):
                arr.append("http://" + link)
        arr2 = [urltools.normalize(x) for x in arr]
        arr3 = [transform(x) for x in arr2]
        return arr3
    return None
Example #23
def canonicalize_url(url, keep_params=False, keep_fragments=False):
    """Canonicalize the given url by applying the following procedures:

    # a. sort query arguments, first by key, then by value
    # b. percent-encode paths and query arguments; non-ASCII characters are
    #    percent-encoded using UTF-8 (RFC 3986)
    # c. normalize all spaces (in query arguments) to '+' (plus symbol)
    # d. normalize the case of percent encodings (%2f -> %2F)
    # e. remove query arguments with blank values (unless site in NONCANONIC_SITES)
    # f. remove fragments (unless #!)
    # g. remove username/password at front of domain
    # h. remove port if 80, keep if not
    # i. remove query arguments (unless site in USEFUL_QUERY_KEYS)

    The url passed can be a str or unicode, while the url returned is always a
    str.
    """
    if keep_params:
        # Preserve all query params
        parsed = extract(norm(url))
    else:
        # Remove unwanted params
        parsed = extract(url_query_cleaner(normalize(url), parameterlist=config.USEFUL_QUERY_KEYS))

    # Sort params, remove blank if not wanted
    query = urllib.urlencode(sorted(urlparse.parse_qsl(parsed.query, keep_blank_values=keep_params)))
    fragment = getFragment(url, keep_fragments)

    # The following is to remove orphaned '=' from query string params with no values
    query = re.sub(r"=$", "", query.replace("=&", "&"))

    # Reconstruct URL, escaping apart from safe chars
    # See http://stackoverflow.com/questions/2849756/list-of-valid-characters-for-the-fragment-identifier-in-an-url
    # http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
    safe = "/.-_~!$&'()*+,;=:@"
    newurl = construct(URL(parsed.scheme, '', '', parsed.subdomain, parsed.domain, parsed.tld, parsed.port, quote(parsed.path, safe=safe), query, quote(fragment, safe=safe), ''))
    return newurl.rstrip('/')
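The implementation above relies on project helpers (extract, construct, URL, url_query_cleaner, config.USEFUL_QUERY_KEYS) that are not shown here. As a rough, stdlib-only sketch of two of the documented steps, query-argument sorting and blank-value removal can be done like this:

import urllib.parse

def sort_and_clean_query(url):
    # Sort query arguments by key then value, and drop arguments with blank values
    parts = urllib.parse.urlsplit(url)
    pairs = sorted(urllib.parse.parse_qsl(parts.query))   # blank values dropped by default
    query = urllib.parse.urlencode(pairs)
    return urllib.parse.urlunsplit(parts._replace(query=query))

# sort_and_clean_query("http://example.com/a?y=2&x=1&z=")
# is expected to return "http://example.com/a?x=1&y=2"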
Example #24
def _main(args):
    synsets = {}
    lines_read = 0

    urldict = {}
    dup_count = 0

    #urllist_file = codecs.open('../fall11_urls.txt',
    #                           errors='ignore',
    #                            encoding='utf-8')
    urllist_file = open(args.url_file, 'r', encoding="latin-1")
    for line in urllist_file:
        #line = repr(line)
        lines_read += 1

        wnid, url = re.split(r'\s+', line, maxsplit=1)

        url = url.strip()
        url = url.strip('\n')
        url_norm = ut.normalize(url)

        if args.normalized and (url != url_norm):
            print('NORMALIZED URL:')
            print('   original:  ', url)
            print('   normalized:', url_norm)

        if url_norm not in urldict:
            urldict[url_norm] = line
        else:
            dup_count += 1
            print('DUPLICATE URL:')
            print('   ', urldict[url_norm])
            print('   ', line)

    print(dup_count, 'duplicate URLs found')
    exit()
Example #25
def test_normalize__malformed():
    assert normalize('http://example.com/?foo') == 'http://example.com/'
    assert normalize('http://example.com?foo') == 'http://example.com/'
    assert normalize('http://example.com/foo//bar') == 'http://example.com/foo/bar'
    assert normalize('http://example.com?') == 'http://example.com/'
    assert normalize('http://example.com/?') == 'http://example.com/'
    assert normalize('http://example.com//?') == 'http://example.com/'
    assert normalize('http://example.com/foo/?http://example.com/bar/?x=http://examle.com/y/z') == 'http://example.com/foo/?http://example.com/bar/?x=http://examle.com/y/z'
    assert normalize('http://example.com/#foo?bar') == 'http://example.com/#foo?bar'
    assert normalize('http://example.com/#foo/bar/blub.html?x=1') == 'http://example.com/#foo/bar/blub.html?x=1'
    assert normalize('http://example.com/foo#?=bar') == 'http://example.com/foo#?=bar'
    assert normalize('http://example.com/foo/bar/http://example.com') == 'http://example.com/foo/bar/http:/example.com'
Example #26
def test_normalize__ip6():
    assert normalize('[::1]') == '[::1]'
    assert normalize('http://[::1]') == 'http://[::1]/'
    assert normalize('[::1]:8080') == '[::1]:8080'
    assert normalize('http://[::1]:8080') == 'http://[::1]:8080/'
Example #27
def test_normalize__malformed():
    assert normalize('http://example.com/?foo') == 'http://example.com/'
    assert normalize('http://example.com?foo') == 'http://example.com/'
    assert normalize(
        'http://example.com/foo//bar') == 'http://example.com/foo/bar'
    assert normalize('http://example.com?') == 'http://example.com/'
    assert normalize('http://example.com/?') == 'http://example.com/'
    assert normalize('http://example.com//?') == 'http://example.com/'
    assert normalize(
        'http://example.com/foo/?http://example.com/bar/?x=http://examle.com/y/z'
    ) == 'http://example.com/foo/?http://example.com/bar/?x=http://examle.com/y/z'
    assert normalize(
        'http://example.com/#foo?bar') == 'http://example.com/#foo?bar'
    assert normalize('http://example.com/#foo/bar/blub.html?x=1'
                     ) == 'http://example.com/#foo/bar/blub.html?x=1'
    assert normalize(
        'http://example.com/foo#?=bar') == 'http://example.com/foo#?=bar'
    assert normalize('http://example.com/foo/bar/http://example.com'
                     ) == 'http://example.com/foo/bar/http:/example.com'
Example #28
def test_normalize__ip6():
    assert normalize('[::1]') == '[::1]'
    assert normalize('http://[::1]') == 'http://[::1]/'
    assert normalize('[::1]:8080') == '[::1]:8080'
    assert normalize('http://[::1]:8080') == 'http://[::1]:8080/'
Example #29
def test_normalize():
    assert normalize("http://example.com") == "http://example.com/"
    assert normalize("http://example.com/") == "http://example.com/"
    assert normalize("https://example.com/") == "https://example.com/"
    assert normalize("hTTp://example.com/") == "http://example.com/"
    assert normalize("http://ExAMPLe.COM/") == "http://example.com/"
    assert normalize("http://example.com./") == "http://example.com/"
    assert normalize("http://example.com:80/") == "http://example.com/"
    assert normalize("http://example.com:/") == "http://example.com/"
    assert normalize("http://example.com/#") == "http://example.com/"

    assert normalize("http://example.com:8080/") == "http://example.com:8080/"

    assert normalize("http://www.example.com/") == "http://www.example.com/"
    assert normalize("http://www.example.com") == "http://www.example.com/"
    assert normalize("http://foo.bar.example.com/") == "http://foo.bar.example.com/"

    assert normalize("http://example.com/a") == "http://example.com/a"
    assert normalize("http://example.com/a/b/c") == "http://example.com/a/b/c"

    assert normalize("http://example.com/?x=1") == "http://example.com/?x=1"
    assert normalize("http://example.com/a?x=1") == "http://example.com/a?x=1"
    assert normalize("http://example.com/a?x=1&y=2") == "http://example.com/a?x=1&y=2"

    assert normalize("http://example.com/#abc") == "http://example.com/#abc"
    assert normalize("http://example.com/a/b/c#abc") == "http://example.com/a/b/c#abc"
    assert normalize("http://example.com/a/b/c?x=1#abc") == "http://example.com/a/b/c?x=1#abc"

    assert normalize("http://example.com/a/./b/././c") == "http://example.com/a/b/c"
    assert normalize("http://example.com/a/../b") == "http://example.com/b"

    assert normalize("eXAmplE.com") == "example.com"
    assert normalize("example.com/a/../b") == "example.com/b"

    assert normalize("http://www.example.com") == "http://www.example.com/"
    assert normalize("www.example.com") == "www.example.com"
Example #30
if len(sys.argv) <= 3:
	print "arguments illegal: " + str(sys.argv) 
	sys.exit(1)

html_file = sys.argv[1]
url=sys.argv[2]
suffix = sys.argv[3]

html_path = url.rsplit('/', 1)[0] + '/'
html_text = open(html_file, 'r').read()
soup = BeautifulSoup(html_text, "html.parser")

contents = set()
for link in soup.findAll('a'):
	content = link.get('href')
	if content is None:
		continue
	if suffix not in content:
		continue

	if not is_absolute(content):
		content = html_path + content

	content = urltools.normalize(content)

	if content in contents:
		continue
	print content
	contents.add(content)
Example #31
def test_normalize__ip4():
    assert normalize('http://192.168.1.1/') == 'http://192.168.1.1/'
    assert normalize(
        'http://192.168.1.1:8088/foo?x=1') == 'http://192.168.1.1:8088/foo?x=1'
    assert normalize('192.168.1.1') == '192.168.1.1'
    assert normalize('192.168.1.1:8080/foo/bar') == '192.168.1.1:8080/foo/bar'
Example #32
	def url(self, page): 
		url_pv = pv.PV(self.prefix + ":URL:" + page)
		url = url_pv.get()
		url_pv.disconnect()
		return urltools.normalize(str(url))
Example #33
def test_normalize__no_scheme():
    assert normalize('eXAmplE.com') == 'example.com'
    assert normalize('example.com/a/../b') == 'example.com/b'
    assert normalize('www.example.com') == 'www.example.com'
Example #34
def test_normalize():
    assert normalize('') == ''
    assert normalize('http://example.com') == 'http://example.com/'
    assert normalize('http://example.com/') == 'http://example.com/'
    assert normalize('    http://example.com/      ') == 'http://example.com/'
    assert normalize('https://example.com/') == 'https://example.com/'
    assert normalize('hTTp://example.com/') == 'http://example.com/'
    assert normalize('http://ExAMPLe.COM/') == 'http://example.com/'
    assert normalize('http://example.com./') == 'http://example.com/'
    assert normalize('http://example.com:/') == 'http://example.com/'
    assert normalize('http://example.com/#') == 'http://example.com/'

    # subdomain
    assert normalize('http://www.example.com/') == 'http://www.example.com/'
    assert normalize('http://www.example.com') == 'http://www.example.com/'
    assert normalize('http://foo.bar.example.com/') == 'http://foo.bar.example.com/'

    # port
    assert normalize('http://example.com:80/') == 'http://example.com/'
    assert normalize('https://example.com:443/') == 'https://example.com/'
    assert normalize('ws://example.com:80/') == 'ws://example.com/'
    assert normalize('http://example.com:8080/') == 'http://example.com:8080/'

    # path
    assert normalize('http://example.com/a') == 'http://example.com/a'
    assert normalize('http://example.com/a/b/c') == 'http://example.com/a/b/c'
    assert normalize('http://example.com/foo/') == 'http://example.com/foo/'
    assert normalize('http://example.com/a/./b/././c') == 'http://example.com/a/b/c'
    assert normalize('http://example.com/a/../b') == 'http://example.com/b'
    assert normalize('http://example.com/./b') == 'http://example.com/b'
    assert normalize('http://example.com/../b') == 'http://example.com/b'
    assert normalize('http://example.com/////////foo') == 'http://example.com/foo'
    assert normalize('http://example.com/foo/.../bar') == 'http://example.com/foo/.../bar'
    assert normalize('http://example.com/foo+bar') == 'http://example.com/foo+bar'
    assert normalize('http://example.com/.') == 'http://example.com/'
    assert normalize('http://example.com/..') == 'http://example.com/'
    assert normalize('http://example.com/./') == 'http://example.com/'
    assert normalize('http://example.com/../') == 'http://example.com/'
    assert normalize('http://example.com/a/..') == 'http://example.com/'
    assert normalize('http://example.com/a/../') == 'http://example.com/'

    # encoded path
    assert normalize('http://example.com/%25%32%35') == 'http://example.com/%25'
    assert normalize('http://example.com/foo%25%32%35bar') == 'http://example.com/foo%25bar'
    assert normalize('http://example.com/foo/%25%32%35/bar') == 'http://example.com/foo/%25/bar'
    assert normalize('http://example.com/%7Efoo') == 'http://example.com/~foo'
    assert normalize('http://example.com/foo%23bar') == 'http://example.com/foo%23bar'  # %23 = #

    # query
    assert normalize('http://example.com/?x=1') == 'http://example.com/?x=1'
    assert normalize('http://example.com?x=1') == 'http://example.com/?x=1'
    assert normalize('http://example.com/a?x=1') == 'http://example.com/a?x=1'
    assert normalize('http://example.com/a/?x=1') == 'http://example.com/a/?x=1'
    assert normalize('http://example.com/a?x=1&y=2') == 'http://example.com/a?x=1&y=2'
    assert normalize('http://example.com/a?y=2&x=1') == 'http://example.com/a?x=1&y=2'
    assert normalize('http://example.com/a?x=&y=2') == 'http://example.com/a?y=2'

    # fragment
    assert normalize('http://example.com/#abc') == 'http://example.com/#abc'
    assert normalize('http://example.com/a/b/c#abc') == 'http://example.com/a/b/c#abc'
    assert normalize('http://example.com/a/b/c?x=1#abc') == 'http://example.com/a/b/c?x=1#abc'

    # username/password
    assert normalize('http://*****:*****@example.com') == 'http://*****:*****@example.com/'
    assert normalize('http://*****:*****@exaMPLE.COM/') == 'http://*****:*****@example.com/'

    # scheme without //
    assert normalize('mailto:[email protected]') == 'mailto:[email protected]'
    assert normalize('mailto:[email protected]') == 'mailto:[email protected]'
Example #35
def test_normalize__idn():
    assert normalize('http://xn--e1afmkfd.xn--p1ai/') == u'http://пример.рф/'
Example #36
	def current_url(self):
		return urltools.normalize(self.browser.current_url)
Example #37
def test_normalize__no_scheme():
    assert normalize('eXAmplE.com') == 'example.com'
    assert normalize('example.com/a/../b') == 'example.com/b'
    assert normalize('www.example.com') == 'www.example.com'
Example #38
def test_normalize__idn():
    assert normalize('http://xn--e1afmkfd.xn--p1ai/') == u'http://пример.рф/'
Example #39
 def _normalize_url(self, url):
     return urltools.normalize(url)
Example #40
def test_normalize__ip4():
    assert normalize('http://192.168.1.1/') == 'http://192.168.1.1/'
    assert normalize('http://192.168.1.1:8088/foo?x=1') == 'http://192.168.1.1:8088/foo?x=1'
    assert normalize('192.168.1.1') == '192.168.1.1'
    assert normalize('192.168.1.1:8080/foo/bar') == '192.168.1.1:8080/foo/bar'
Example #41
def crawl(original_url):
    tic = time.time()
    parent_list = [original_url]
    url_to_check = get_base_url(original_url)
    print(url_to_check)
    url_to_check = str(url_to_check)

    layer_stop = 1
    layer = 0

    #initializing all the required lists---------------------------------------------
    visited_all = [original_url]
    visited_current_layer = []
    child_list = []
    child_list_filtered = []
    #columns = ['Link','Parent Link', 'Layer']
    df = pd.DataFrame()

    # Main execution of scraper----------------------------------------------------

    #looping through layers
    while layer < layer_stop:

        #looping through URLs in parent-list
        for url in parent_list:

            #scraping the children from the parent url----------------------------
            if href_scrapper(url) != 0:
                child_list = href_scrapper(url)

            for child in child_list:
                if child != None:
                    #if child link is of the form "index.php/blahblah" and parent ends with '/'
                    #---> "parentlink/index.php/blahblah"
                    if child.startswith('/'):
                        child = str(url) + str(child)

                    if url.endswith('/') and url_to_check not in child:
                        child = str(url) + str(child)

                    #normalize the child links-------------------------------------
                    child = urltools.normalize(child)

                    #filtering out based on 1) External 2) Repeating 3) Invalid links---------------------------
                    if url_to_check in child and child not in visited_all and does_page_exist(
                            child) == 1:
                        child_list_filtered.append(child)

                    #adding everything to visited all--------------------
                    if child not in visited_all:
                        child_slash = child + '/'
                        visited_all.append(child)
                        visited_all.append(child_slash)

            #adding  the visited and filtered children into the "current visited layer" ------
            for child_filtered in child_list_filtered:
                visited_current_layer.append(child_filtered)

            #creating a Pandas dataframe to store everything for download----------
            layer_number = [layer + 1] * len(child_list_filtered)
            parent_of_child = [url] * len(child_list_filtered)

            df_child = pd.DataFrame(child_list_filtered)
            df_parent = pd.DataFrame(parent_of_child)
            df_layer = pd.DataFrame(layer_number)

            df_to_be_added = pd.concat([df_child, df_parent, df_layer], axis=1)
            df = pd.concat([df, df_to_be_added], ignore_index=True, axis=0)
            #----------------------------------------------------------------------

            #emptying the child lists
            child_list = []
            child_list_filtered = []

        #condition to stop filtering-----------------------------------------------
        if not visited_current_layer:
            layer_stop = layer_stop
        else:
            layer_stop += 1

        #child layer is now parent layer--------------------------------------------
        parent_list = []

        #we just don't add .png, .jpg, .pdf to the new parent layer
        for visited_current in visited_current_layer:
            print(visited_current)
            if (not visited_current.endswith('.png')
                    and not visited_current.endswith('.jpg')
                    and not visited_current.endswith('.pdf')):
                parent_list.append(visited_current)

        #displaying the links in different layers----------------------------------
        #print("Links in LAYER:" + str(layer+1))
        print("No of links = " + str(len(visited_current_layer)))
        #print(visited_current_layer)
        print("\n")
        visited_current_layer = []
        #updating the layer number
        layer += 1

    return df
Example #42
def _sanitize(url):
    ret = url
    ret = _urlsplit(ret)
    ret = urlunsplit(ret)
    ret = urltools.normalize(ret)
    return ret
Example #43
 def get_urls(self):
     return {
         normalize(get_absolute_url(self.url, link.get("href")))
         for link in self._page.find_all(name="a")
     }
Example #44
def _main(args):
    ## print(args.image_dir, args.url_file, args.shopping_file, args.dryrun)

    # First step is to read the "shopping list".  This is the list of synsets we
    # want to download images for.  By convention, if this list is empty we will
    # download all synsets.
    #
    # This file contains one synset per line, but the synsets can be in either
    # of two formats:
    #   1)  A "synset name" such as "benthos.n.02" or,
    #   2)  A "wordnet ID" or "offset" such as "n00004475"
    # These two formats are interchangeable.  For every synset name there is an offset
    # and vice versa.
    #
    # The software below figures out which form is used in the files (forms can be
    # mixed within a file).
    #

    # Dictionary of acceptable image file extensions and what we will use as
    # the extension when we save the file locally
    file_ext_whitelist = {
        'jpg': 'jpg',
        'png': 'png',
        'jpeg': 'jpg',
        'JPG': 'jpg',
        'PNG': 'png',
        'JPEG': 'jpg'
    }
    file_ext_gif = {'gif': 'gif', 'GIF': 'gif'}

    synsetdict = {}
    lines = 0
    shoppinglist_file = open(args.shopping_file, 'r', encoding="utf-8")
    for line in shoppinglist_file:
        lines += 1
        line = line.strip()
        line = line.strip('\n')

        if line[0] == 'n' and line[1:2].isnumeric():
            # We have a wordnet ID
            wnid = line

            pos = line[0]
            offset = int(line[1:])
            ss = wn.synset_from_pos_and_offset(pos, offset)
            synsetdict[offset] = ss

        elif line[0:2].isalpha():
            # We have a synset name

            ss = wn.synset(line)
            offset = int(ss.offset())
            synsetdict[offset] = ss
        else:
            # We can't figure out what is in the file
            print('ERROR shoppinglist.txt, line', lines, 'unrecognised format',
                  line)
            exit()

    if args.verbose:
        print('INFO: Processing URLs from the following shopping list',
              synsetdict)

    # Make sure we have a directory for every synset; these may or may not already exist
    for offset in synsetdict:
        ssstr = str(synsetdict[offset])[8:-2]
        path = args.image_dir + ssstr
        if not os.path.exists(path):
            os.makedirs(path)

    # if we are going to allow GIF files, append to the whitelist
    if args.gif_ok:
        file_ext_whitelist.update(file_ext_gif)
        if args.verbose:
            print('INFO: allowing gif files')

    # read the URL list file end to end and process only those lines that
    # match synsets in our shopping list
    lines_read = 0
    files_downloaded = 0
    files_existing = 0
    dup_count = 0
    urldict = {}
    urllist_file = open(args.url_file, 'r', encoding="latin-1")
    for line in urllist_file:
        lines_read += 1

        wnid, url = re.split(r'\s+', line, maxsplit=1)

        # Normalize the URL
        url = url.strip()
        url = url.strip('\n')
        url = ut.normalize(url)

        pos_offset, serial = wnid.split('_')
        pos = pos_offset[0]
        offset = int(pos_offset[1:])

        ss = wn.synset_from_pos_and_offset(pos, offset)
        ssstr = str(ss)[8:-2]

        # If synset is not on our shopping list we don't want it
        if offset not in synsetdict:
            continue

        # Attempt to find the file extension.  If we can't find it, skip the URL;
        # if we do find it, normalise the extension to lower case and three characters
        urlparts = urlparse(url)
        urlpath = urlparts.path

        try:
            _f, urlextension = urlpath.rsplit(sep='.', maxsplit=1)
        except (ValueError):
            print('WARNING No file extension, URL skipped:', line)
            continue

        if urlextension not in file_ext_whitelist:

            # did not find filename extension in path, perhaps it is a parameter
            for ext in file_ext_whitelist:
                dotext = '.' + ext
                if (dotext in urlparts.params) or (dotext in urlparts.query):
                    file_extension = file_ext_whitelist[ext]
                    break
                else:
                    file_extension = ''
                    print('WARNING No file extension found, URL skipped:', line)
                    break
            if '' == file_extension:
                continue

        else:
            file_extension = file_ext_whitelist[urlextension]

        # Have we already downloaded this URL?  Don't waste time doing it again.
        if url not in urldict:
            urldict[url] = line
        else:
            dup_count += 1
            print(
                'WARNING DUPLICATE URL this jpg file will NOT be downloaded again:'
            )
            print('   ', urldict[url])
            print('   ', line)
            continue

        # create the file name
        image_filename = args.image_dir + ssstr + '/' + ssstr + '-' + serial + '.' + file_extension

        # If we already have this file, we don't need to get it
        if Path(image_filename).is_file():
            files_existing += 1
            if args.verbose:
                print('INFO: File exists, not downloading again',
                      image_filename)
            continue

        try:
            response = urllib.request.urlopen(url)
            imagedata = response.read()

        except urllib.error.URLError as e:
            print(e.reason, wnid, ssstr, ' at line', lines_read, url)
            continue
        except:
            print('WARNING unknown error while downloading data at line',
                  lines_read, url)
            continue

        ext_by_magic = check_magic(imagedata)
        if ext_by_magic not in file_ext_whitelist:
            print('WARNING Downloaded file signature is wrong, not saved',
                  line)
            continue
        if ext_by_magic != file_extension:
            print("WARNING Downloaded file signature", ext_by_magic,
                  "does not match URL", line)
            continue

        newfile = open(image_filename, 'wb')
        newfile.write(imagedata)
        newfile.close()
        files_downloaded += 1

        # Crude progress bar
        print('.', end='')

    # after loop end, print a summary of what was done then exit
    print('downloaded', files_downloaded, 'skipped', files_existing,
          'existing files', 'did not download', dup_count, 'duplicate URLs')
    exit()
Example #45
def normalize(url):
    return urltools.normalize(url).rstrip('/')
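A brief usage note, assuming the behaviour shown in the tests below: urltools.normalize adds a trailing slash to bare hosts, and the rstrip('/') then removes it, so the two common spellings compare equal.

import urltools

# Both spellings are expected to collapse to "http://example.com" (no trailing slash)
print(urltools.normalize("http://example.com").rstrip('/') ==
      urltools.normalize("http://example.com/").rstrip('/'))   # expected: True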
Example #46
def test_normalize():
    assert normalize("") == ""
    assert normalize("http://example.com") == "http://example.com/"
    assert normalize("http://example.com/") == "http://example.com/"
    assert normalize("    http://example.com/      ") == "http://example.com/"
    assert normalize("https://example.com/") == "https://example.com/"
    assert normalize("hTTp://example.com/") == "http://example.com/"
    assert normalize("http://ExAMPLe.COM/") == "http://example.com/"
    assert normalize("http://example.com./") == "http://example.com/"
    assert normalize("http://example.com:/") == "http://example.com/"
    assert normalize("http://example.com/#") == "http://example.com/"

    # port
    assert normalize("http://example.com:80/") == "http://example.com/"
    assert normalize("https://example.com:443/") == "https://example.com/"
    assert normalize("ws://example.com:80/") == "ws://example.com/"
    assert normalize("http://example.com:8080/") == "http://example.com:8080/"

    # subdomain
    assert normalize("http://www.example.com/") == "http://www.example.com/"
    assert normalize("http://www.example.com") == "http://www.example.com/"
    assert normalize("http://foo.bar.example.com/") == "http://foo.bar.example.com/"

    # ip
    assert normalize("http://192.168.1.1/") == "http://192.168.1.1/"
    assert normalize("http://192.168.1.1:8088/foo?x=1") == "http://192.168.1.1:8088/foo?x=1"
    assert normalize("192.168.1.1") == "192.168.1.1"
    assert normalize("192.168.1.1:8080/foo/bar") == "192.168.1.1:8080/foo/bar"

    # ip6
    assert normalize("[::1]") == "[::1]"
    assert normalize("http://[::1]") == "http://[::1]/"
    assert normalize("[::1]:8080") == "[::1]:8080"
    assert normalize("http://[::1]:8080") == "http://[::1]:8080/"

    # path
    assert normalize("http://example.com/a") == "http://example.com/a"
    assert normalize("http://example.com/a/b/c") == "http://example.com/a/b/c"
    assert normalize("http://example.com/foo/") == "http://example.com/foo/"
    assert normalize("http://example.com/a/./b/././c") == "http://example.com/a/b/c"
    assert normalize("http://example.com/a/../b") == "http://example.com/b"
    assert normalize("http://example.com/./b") == "http://example.com/b"
    assert normalize("http://example.com/../b") == "http://example.com/b"
    assert normalize("http://example.com/////////foo") == "http://example.com/foo"
    assert normalize("http://example.com/foo/.../bar") == "http://example.com/foo/.../bar"
    assert normalize("http://example.com/foo+bar") == "http://example.com/foo+bar"
    assert normalize("http://example.com/.") == "http://example.com/"
    assert normalize("http://example.com/..") == "http://example.com/"
    assert normalize("http://example.com/./") == "http://example.com/"
    assert normalize("http://example.com/../") == "http://example.com/"
    assert normalize("http://example.com/a/..") == "http://example.com/"
    assert normalize("http://example.com/a/../") == "http://example.com/"

    # encoded path
    assert normalize("http://example.com/%25%32%35") == "http://example.com/%25"
    assert normalize("http://example.com/foo%25%32%35bar") == "http://example.com/foo%25bar"
    assert normalize("http://example.com/foo/%25%32%35/bar") == "http://example.com/foo/%25/bar"
    assert normalize("http://example.com/%7Efoo") == "http://example.com/~foo"
    assert normalize("http://example.com/foo%23bar") == "http://example.com/foo%23bar" # %23 = #

    # query
    assert normalize("http://example.com/?x=1") == "http://example.com/?x=1"
    assert normalize("http://example.com?x=1") == "http://example.com/?x=1"
    assert normalize("http://example.com/a?x=1") == "http://example.com/a?x=1"
    assert normalize("http://example.com/a/?x=1") == "http://example.com/a/?x=1"
    assert normalize("http://example.com/a?x=1&y=2") == "http://example.com/a?x=1&y=2"
    assert normalize("http://example.com/a?y=2&x=1") == "http://example.com/a?x=1&y=2"
    assert normalize("http://example.com/a?x=&y=2") == "http://example.com/a?y=2"

    # fragment
    assert normalize("http://example.com/#abc") == "http://example.com/#abc"
    assert normalize("http://example.com/a/b/c#abc") == "http://example.com/a/b/c#abc"
    assert normalize("http://example.com/a/b/c?x=1#abc") == "http://example.com/a/b/c?x=1#abc"

    # no scheme
    assert normalize("eXAmplE.com") == "example.com"
    assert normalize("example.com/a/../b") == "example.com/b"
    assert normalize("www.example.com") == "www.example.com"

    # username/password
    assert normalize("http://*****:*****@example.com") == "http://*****:*****@example.com/"
    assert normalize("http://*****:*****@exaMPLE.COM/") == "http://*****:*****@example.com/"

    # scheme without //
    assert normalize("mailto:[email protected]") == "mailto:[email protected]"
    assert normalize("mailto:[email protected]") == "mailto:[email protected]"

    # IDN
    assert normalize("http://xn--e1afmkfd.xn--p1ai/") == "http://пример.рф/"

    # malformed urls
    assert normalize("http://example.com/?foo") == "http://example.com/"
    assert normalize("http://example.com?foo") == "http://example.com/"
    assert normalize("http://example.com/foo//bar") == "http://example.com/foo/bar"
    assert normalize("http://example.com?") == "http://example.com/"
    assert normalize("http://example.com/?") == "http://example.com/"
    assert normalize("http://example.com//?") == "http://example.com/"
    assert normalize("http://example.com/foo/?http://example.com/bar/?x=http://examle.com/y/z") == "http://example.com/foo/?http://example.com/bar/?x=http://examle.com/y/z"
    assert normalize("http://example.com/#foo?bar") == "http://example.com/#foo?bar"
    assert normalize("http://example.com/#foo/bar/blub.html?x=1") == "http://example.com/#foo/bar/blub.html?x=1"
    assert normalize("http://example.com/foo#?=bar") == "http://example.com/foo#?=bar"
    assert normalize("http://example.com/foo/bar/http://example.com") == "http://example.com/foo/bar/http:/example.com"
Example #47
def normalize(url, strip=False):
    "RFC3986 normalize URL & Optionally removing url-query/fragment string"
    if strip:
        p = _urltools.parse(url)
        url = p.scheme + '://' + p.subdomain + p.domain + p.path
    return _urltools.normalize(url)
Example #48
def urls_equal(url1, url2):
    return urltools.normalize(url1) == urltools.normalize(url2)
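A short usage sketch, assuming the normalization rules exercised by the tests below (case folding, default-port removal, dot-segment resolution):

import urltools

def urls_equal(url1, url2):
    return urltools.normalize(url1) == urltools.normalize(url2)

print(urls_equal("http://ExAMPLe.COM:80/a/../b", "http://example.com/b"))   # expected: True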
Example #49
def test_normalize():
    assert normalize("") == ""
    assert normalize("http://example.com") == "http://example.com/"
    assert normalize("http://example.com/") == "http://example.com/"
    assert normalize("    http://example.com/      ") == "http://example.com/"
    assert normalize("https://example.com/") == "https://example.com/"
    assert normalize("hTTp://example.com/") == "http://example.com/"
    assert normalize("http://ExAMPLe.COM/") == "http://example.com/"
    assert normalize("http://example.com./") == "http://example.com/"
    assert normalize("http://example.com:/") == "http://example.com/"
    assert normalize("http://example.com/#") == "http://example.com/"

    # port
    assert normalize("http://example.com:80/") == "http://example.com/"
    assert normalize("https://example.com:443/") == "https://example.com/"
    assert normalize("ws://example.com:80/") == "ws://example.com/"
    assert normalize("http://example.com:8080/") == "http://example.com:8080/"

    # subdomain
    assert normalize("http://www.example.com/") == "http://www.example.com/"
    assert normalize("http://www.example.com") == "http://www.example.com/"
    assert normalize(
        "http://foo.bar.example.com/") == "http://foo.bar.example.com/"

    # ip
    assert normalize("http://192.168.1.1/") == "http://192.168.1.1/"
    assert normalize(
        "http://192.168.1.1:8088/foo?x=1") == "http://192.168.1.1:8088/foo?x=1"
    assert normalize("192.168.1.1") == "192.168.1.1"
    assert normalize("192.168.1.1:8080/foo/bar") == "192.168.1.1:8080/foo/bar"

    # ip6
    assert normalize("[::1]") == "[::1]"
    assert normalize("http://[::1]") == "http://[::1]/"
    assert normalize("[::1]:8080") == "[::1]:8080"
    assert normalize("http://[::1]:8080") == "http://[::1]:8080/"

    # path
    assert normalize("http://example.com/a") == "http://example.com/a"
    assert normalize("http://example.com/a/b/c") == "http://example.com/a/b/c"
    assert normalize("http://example.com/foo/") == "http://example.com/foo/"
    assert normalize(
        "http://example.com/a/./b/././c") == "http://example.com/a/b/c"
    assert normalize("http://example.com/a/../b") == "http://example.com/b"
    assert normalize("http://example.com/./b") == "http://example.com/b"
    assert normalize("http://example.com/../b") == "http://example.com/b"
    assert normalize(
        "http://example.com/////////foo") == "http://example.com/foo"
    assert normalize(
        "http://example.com/foo/.../bar") == "http://example.com/foo/.../bar"
    assert normalize(
        "http://example.com/foo+bar") == "http://example.com/foo+bar"
    assert normalize("http://example.com/.") == "http://example.com/"
    assert normalize("http://example.com/..") == "http://example.com/"
    assert normalize("http://example.com/./") == "http://example.com/"
    assert normalize("http://example.com/../") == "http://example.com/"
    assert normalize("http://example.com/a/..") == "http://example.com/"
    assert normalize("http://example.com/a/../") == "http://example.com/"

    # encoded path
    assert normalize(
        "http://example.com/%25%32%35") == "http://example.com/%25"
    assert normalize(
        "http://example.com/foo%25%32%35bar") == "http://example.com/foo%25bar"
    assert normalize("http://example.com/foo/%25%32%35/bar"
                     ) == "http://example.com/foo/%25/bar"
    assert normalize("http://example.com/%7Efoo") == "http://example.com/~foo"
    assert normalize("http://example.com/foo%23bar"
                     ) == "http://example.com/foo%23bar"  # %23 = #

    # query
    assert normalize("http://example.com/?x=1") == "http://example.com/?x=1"
    assert normalize("http://example.com?x=1") == "http://example.com/?x=1"
    assert normalize("http://example.com/a?x=1") == "http://example.com/a?x=1"
    assert normalize(
        "http://example.com/a/?x=1") == "http://example.com/a/?x=1"
    assert normalize(
        "http://example.com/a?x=1&y=2") == "http://example.com/a?x=1&y=2"
    assert normalize(
        "http://example.com/a?y=2&x=1") == "http://example.com/a?x=1&y=2"
    assert normalize(
        "http://example.com/a?x=&y=2") == "http://example.com/a?y=2"

    # fragment
    assert normalize("http://example.com/#abc") == "http://example.com/#abc"
    assert normalize(
        "http://example.com/a/b/c#abc") == "http://example.com/a/b/c#abc"
    assert normalize("http://example.com/a/b/c?x=1#abc"
                     ) == "http://example.com/a/b/c?x=1#abc"

    # no scheme
    assert normalize("eXAmplE.com") == "example.com"
    assert normalize("example.com/a/../b") == "example.com/b"
    assert normalize("www.example.com") == "www.example.com"

    # username/password
    assert normalize(
        "http://*****:*****@example.com") == "http://*****:*****@example.com/"
    assert normalize(
        "http://*****:*****@exaMPLE.COM/") == "http://*****:*****@example.com/"

    # scheme without //
    assert normalize("mailto:[email protected]") == "mailto:[email protected]"
    assert normalize("mailto:[email protected]") == "mailto:[email protected]"

    # IDN
    assert normalize("http://xn--e1afmkfd.xn--p1ai/") == "http://пример.рф/"

    # malformed urls
    assert normalize("http://example.com/?foo") == "http://example.com/"
    assert normalize("http://example.com?foo") == "http://example.com/"
    assert normalize(
        "http://example.com/foo//bar") == "http://example.com/foo/bar"
    assert normalize("http://example.com?") == "http://example.com/"
    assert normalize("http://example.com/?") == "http://example.com/"
    assert normalize("http://example.com//?") == "http://example.com/"
    assert normalize(
        "http://example.com/foo/?http://example.com/bar/?x=http://examle.com/y/z"
    ) == "http://example.com/foo/?http://example.com/bar/?x=http://examle.com/y/z"
    assert normalize(
        "http://example.com/#foo?bar") == "http://example.com/#foo?bar"
    assert normalize("http://example.com/#foo/bar/blub.html?x=1"
                     ) == "http://example.com/#foo/bar/blub.html?x=1"
    assert normalize(
        "http://example.com/foo#?=bar") == "http://example.com/foo#?=bar"
    assert normalize("http://example.com/foo/bar/http://example.com"
                     ) == "http://example.com/foo/bar/http:/example.com"
Example #50
def crawl(original_url, num_Id, output_file):

    # Accepting input of URL and depth----------------------------------------------
    tic = time.time()

    parent_list = [original_url]
    url_to_check = get_base_url(original_url)
    print(url_to_check)
    url_to_check = str(url_to_check)

    layer_stop = 1
    layer = 0


    #initializing all the required lists---------------------------------------------
    visited_all  = [original_url]
    visited_current_layer = []
    child_list =[]
    child_list_filtered = [original_url]
    #columns = ['Link','Parent Link', 'Layer']
    df = pd.DataFrame()

    Di={}

    # Main execution of scraper----------------------------------------------------

    #looping through layers
    while layer < layer_stop:

        #looping through URLs in parent-list
        for url in parent_list:

            #scraping the children from the parent url----------------------------
            href_url = href_scrapper(url)
            if href_url != 0:
                child_list = href_url


            for child in child_list:
                if child != None:
                    ch=child
                    #if child link is of the form "index.php/blahblah" and parent ends with '/'
                    #---> "parentlink/index.php/blahblah"
                    #if child.startswith('/'):
                        #child= str(url) + str(child)
                    #if url.endswith('/') and url_to_check not in child:
                        #child = str(url) + str(child)

                    child = urljoin(url,child)

                #normalize the child links-------------------------------------
                    child=urltools.normalize(child)
                    social_media = ['facebook.com','google.com','reddit.com','linkedin.com','github.com','twitter.com','digg.com','.png', '.jpg','.jpeg', '.pdf', '.css']

                    #filtering out based on 1) External 2) Repeating 3) Invalid  4) social media + pdf + css ---------------------------
                    #if url_to_check in child and child not in visited_all and does_page_exist(child)==1 and ch not in Di:
                    if url_to_check in child and child not in visited_all and ch not in Di and all(social not in child for social in social_media):
                        child_list_filtered.append(child)
                        Di[ch]=1

                    #adding everything to visited all--------------------
                    if child not in visited_all:
                        child_slash = child + '/'
                        visited_all.append(child)
                        visited_all.append(child_slash)

                #sleep-------------------------------------------------------
                time.sleep(0.250)

            #adding  the visited and filtered children into the "current visited layer" ------
            for child_filtered in child_list_filtered:
                visited_current_layer.append(child_filtered)

            #creating a Pandas dataframe to store everything for download----------

            layer_number = [layer+1]*len(child_list_filtered)
            parent_of_child = [url]*len(child_list_filtered)

            df_child = pd.DataFrame(child_list_filtered)
            df_parent = pd.DataFrame(parent_of_child)
            df_layer = pd.DataFrame(layer_number)


            df_to_be_added = pd.concat([df_child,df_parent,df_layer], axis=1)
            df = pd.concat([df,df_to_be_added],ignore_index=True, axis = 0)


            #----------------------------------------------------------------------

            #emptying the child lists
            child_list = []
            child_list_filtered = []

        #condition to stop filtering-----------------------------------------------
        if not visited_current_layer :
            layer_stop = layer_stop
        else:
            layer_stop += 1


        #child layer is now parent layer--------------------------------------------
        parent_list = []

        for visited_current in visited_current_layer:
            print(visited_current)
            #if(not visited_current.endswith(unwanted_extensions)):
            parent_list.append(visited_current)


        #displaying the links in different layers----------------------------------
        #print("Links in LAYER:" + str(layer+1))
        print("No of links = " + str(len(visited_current_layer)))
        #print(visited_current_layer)
        print("\n")
        visited_current_layer = []
        #updating the layer number
        layer +=1
    df.to_csv(output_file + '/' + str(num_Id) + '_' + str(url_to_check) +  '.csv', sep=',', encoding='utf-8')
    return df, num_Id
Example #51
def test_normalize():
    assert normalize('') == ''
    assert normalize('http://example.com') == 'http://example.com/'
    assert normalize('http://example.com/') == 'http://example.com/'
    assert normalize('    http://example.com/      ') == 'http://example.com/'
    assert normalize('https://example.com/') == 'https://example.com/'
    assert normalize('hTTp://example.com/') == 'http://example.com/'
    assert normalize('http://ExAMPLe.COM/') == 'http://example.com/'
    assert normalize('http://example.com./') == 'http://example.com/'
    assert normalize('http://example.com:/') == 'http://example.com/'
    assert normalize('http://example.com/#') == 'http://example.com/'

    # subdomain
    assert normalize('http://www.example.com/') == 'http://www.example.com/'
    assert normalize('http://www.example.com') == 'http://www.example.com/'
    assert normalize(
        'http://foo.bar.example.com/') == 'http://foo.bar.example.com/'

    # port
    assert normalize('http://example.com:80/') == 'http://example.com/'
    assert normalize('https://example.com:443/') == 'https://example.com/'
    assert normalize('ws://example.com:80/') == 'ws://example.com/'
    assert normalize('http://example.com:8080/') == 'http://example.com:8080/'

    # path
    assert normalize('http://example.com/a') == 'http://example.com/a'
    assert normalize('http://example.com/a/b/c') == 'http://example.com/a/b/c'
    assert normalize('http://example.com/foo/') == 'http://example.com/foo/'
    assert normalize(
        'http://example.com/a/./b/././c') == 'http://example.com/a/b/c'
    assert normalize('http://example.com/a/../b') == 'http://example.com/b'
    assert normalize('http://example.com/./b') == 'http://example.com/b'
    assert normalize('http://example.com/../b') == 'http://example.com/b'
    assert normalize(
        'http://example.com/////////foo') == 'http://example.com/foo'
    assert normalize(
        'http://example.com/foo/.../bar') == 'http://example.com/foo/.../bar'
    assert normalize(
        'http://example.com/foo+bar') == 'http://example.com/foo+bar'
    assert normalize('http://example.com/.') == 'http://example.com/'
    assert normalize('http://example.com/..') == 'http://example.com/'
    assert normalize('http://example.com/./') == 'http://example.com/'
    assert normalize('http://example.com/../') == 'http://example.com/'
    assert normalize('http://example.com/a/..') == 'http://example.com/'
    assert normalize('http://example.com/a/../') == 'http://example.com/'

    # encoded path
    assert normalize(
        'http://example.com/%25%32%35') == 'http://example.com/%25'
    assert normalize(
        'http://example.com/foo%25%32%35bar') == 'http://example.com/foo%25bar'
    assert normalize('http://example.com/foo/%25%32%35/bar'
                     ) == 'http://example.com/foo/%25/bar'
    assert normalize('http://example.com/%7Efoo') == 'http://example.com/~foo'
    assert normalize('http://example.com/foo%23bar'
                     ) == 'http://example.com/foo%23bar'  # %23 = #

    # query
    assert normalize('http://example.com/?x=1') == 'http://example.com/?x=1'
    assert normalize('http://example.com?x=1') == 'http://example.com/?x=1'
    assert normalize('http://example.com/a?x=1') == 'http://example.com/a?x=1'
    assert normalize(
        'http://example.com/a/?x=1') == 'http://example.com/a/?x=1'
    assert normalize(
        'http://example.com/a?x=1&y=2') == 'http://example.com/a?x=1&y=2'
    assert normalize(
        'http://example.com/a?y=2&x=1') == 'http://example.com/a?x=1&y=2'
    assert normalize(
        'http://example.com/a?x=&y=2') == 'http://example.com/a?y=2'

    # fragment
    assert normalize('http://example.com/#abc') == 'http://example.com/#abc'
    assert normalize(
        'http://example.com/a/b/c#abc') == 'http://example.com/a/b/c#abc'
    assert normalize('http://example.com/a/b/c?x=1#abc'
                     ) == 'http://example.com/a/b/c?x=1#abc'

    # username/password
    assert normalize(
        'http://*****:*****@example.com') == 'http://*****:*****@example.com/'
    assert normalize(
        'http://*****:*****@exaMPLE.COM/') == 'http://*****:*****@example.com/'

    # scheme without //
    assert normalize('mailto:[email protected]') == 'mailto:[email protected]'
    assert normalize('mailto:[email protected]') == 'mailto:[email protected]'