def normalize_url(base_url, url):
    myfile3 = open('normalization_log', 'a')
    myfile3.write("base url:{0}\n".format(base_url))
    myfile3.write("url:{0}\n".format(url))
    myfile3.close()
    result = ''
    # if url starts with http:// or https://
    allowed_scheme = ['http', 'https']
    url_scheme = urlparse(url).scheme
    if url_scheme in allowed_scheme:
        return urlnorm.norm(url)
    elif url_scheme == 'mailto':
        return False
    elif len(url_scheme) == 0:
        # check if URL starts with ../ or ./
        if (url[:3] == '../') or (url[:2] == './'):
            return urlnorm.norm(base_url + '/' + url)
        elif url[0] == '/':
            # e.g. /page/page
            # That means it's the domain + url
            url_obj = urlparse(base_url)
            new_url = url_obj.scheme + "://" + url_obj.netloc + url
            return urlnorm.norm(new_url)
        else:
            # URL should be just an html page, e.g. research.html,
            # so we need to replace the last part.
            # If base_url is 'http://www.test.com/page/page/12345',
            # the split result is ['http://www.test.com/page/page', '12345']
            parts = base_url.rsplit('/', 1)
            return urlnorm.norm(parts[0] + '/' + url)
    result = url
    return result
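# Hypothetical usage sketch for normalize_url() above (not part of the original
# module). It assumes `from urlparse import urlparse` and `import urlnorm` are in
# scope, just as the function itself does; expected results are approximate.
#
#   normalize_url('http://example.com/a/b', '../c.html')   # -> 'http://example.com/a/c.html'
#   normalize_url('http://example.com/a/b', '/c.html')     # -> 'http://example.com/c.html'
#   normalize_url('http://example.com/a/b', 'mailto:x@y')  # -> False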
def main():
    if (len(sys.argv) < 3):
        print "usage: python ll-print.py <url> <search term>"
        print "example: python ll-print.py http://www.hunch.com 'hunch team'"
        exit(0)
    root_URL = sys.argv[1]
    search_term = sys.argv[2]
    if (not validate_search_term(search_term)):
        print "Invalid search term. Please only use valid url characters and spaces."
        exit(1)
    first_letter = search_term[0]
    first_letter_match = root_URL.find(first_letter.lower())
    if (first_letter_match != -1):
        try:
            br = mechanize.Browser()
            br._factory.is_html = True
            result = []
            br.open(root_URL)
            # print "visiting: " + urlnorm.norm(br.geturl())
            visited = set([urlnorm.norm(br.geturl()), urlnorm.norm(root_URL)])
            result = find_matching_links(br, search_term, result, visited)
            if (result):
                max_index = max(result, key=lambda u: u[1])[1]
                for l, i, c in result:
                    print_url(l, i, max_index)
        except urlnorm.InvalidUrl:
            print "Invalid root URL"
        except urllib2.URLError, e:
            print "Error opening root URL"
            print e
        except Exception, e:
            print e
def crawl(db, url, urls_crawled={}):
    # Make sure we don't loop forever: keep track of which urls we've already crawled.
    # TODO pull this from Phoenix

    # Crawl this website, get all of the outbound URLs
    urls_to_crawl = crawl_one(db, url)

    # Record that we crawled this url
    urls_crawled[url] = None

    for url_to_crawl in urls_to_crawl:
        try:
            url_to_crawl = urlnorm.norm(url_to_crawl)
        except urlnorm.InvalidUrl:
            # Try to convert it to an absolute url
            url_to_crawl = urlnorm.norm("%s%s" % (url, url_to_crawl))

        # Don't re-record
        if url_to_crawl in urls_crawled:
            print 'Skipping %s as already crawled' % (url_to_crawl)
        # Only crawl my site
        elif url_to_crawl.startswith('https://penguinsinabox.com'):
            crawl(db, url_to_crawl, urls_crawled)
        else:
            # A website not owned by me
            print 'Skipping %s as not a self-controlled site' % (url_to_crawl)

    print "Finished processing children of %s" % (url)
def find_matching_links(br, target_word, result, visited):
    if (not target_word):
        return result
    else:
        current_URL = urlnorm.norm(br.geturl())
        current_letter = target_word[0].lower()
        if (current_letter.isspace()):
            return find_matching_links(br, target_word[1:], result + [('', -1, ' ')], visited)
        else:
            # Skip the leading "http://" when searching for the letter.
            matching_index = current_URL[7:].find(current_letter)
            if (matching_index == -1):
                return []
            else:
                new_result = result + [(current_URL, matching_index + 7, current_letter)]
                links = list(br.links())
                for link in links:
                    try:
                        link_URL = urlnorm.norm(link.absolute_url)
                        if (link_URL not in visited):
                            br.open(link_URL)
                            new_visited = visited.copy()
                            new_visited.add(link_URL)
                            # print "visiting: " + urlnorm.norm(br.geturl())
                            new_visited.add(urlnorm.norm(br.geturl()))
                            child_result = find_matching_links(br, target_word[1:], new_result, new_visited)
                            if (child_result):
                                return child_result
                    except Exception, e:
                        continue
                # If no outgoing link leads to a full match, fall through and
                # implicitly return None (treated as falsy by the caller).
def fetchOutlinks(ahrefs):
    newOutLinks = set()
    base_url = "https://en.wikipedia.org"
    for a in ahrefs:
        try:
            ahref = a['href'].lower()
            not_parseable_ressources = (".avi", ".mkv", ".mp4", ".jpg", ".jpeg", ".png", ".gif",
                                        ".pdf", ".iso", ".rar", ".tar", ".tgz", ".zip", ".dmg", ".exe")
            if not urlparse.urlparse(ahref).path.endswith(not_parseable_ressources):
                if "wiki" in ahref:
                    if "#" in ahref:
                        # Finding and removing URLs with # in them
                        ahref = ahref[:ahref.find("#")]
                        pass
                    elif "?" in ahref:
                        # Finding and removing URLs with ? in them
                        ahref = ahref[:ahref.find("?")]
                        pass
                    elif ":" in ahref:
                        # Finding and removing URLs with : in them
                        ahref = ahref[:ahref.find(":")]
                        pass
                    elif "//" in ahref:
                        # Finding and removing URLs with // in them
                        ahref = ahref[:ahref.find("//")]
                        pass
                    elif ahref == "/wiki/Main_Page":
                        # Finding and removing URLs of Main page of Wiki
                        pass
                    elif "united" in ahref or "states" in ahref or "u.s" in ahref or "illegal" in ahref \
                            or "immig" in ahref or "donald" in ahref or "trump" in ahref:
                        newUrl = a['href']
                        finalUrl = base_url + newUrl
                        finalUrl = urlnorm.norm(finalUrl).encode("utf-8", "ignore")
                        newOutLinks.add(finalUrl)
                else:
                    if ahref[:2] == "//":
                        # Finding and removing URLs with // in them
                        pass
                    elif "index" in ahref or "youtube" in ahref or "rgu" in ahref or "book" in ahref or "american" in ahref:
                        pass
                    elif "#" in ahref:
                        # Finding and removing URLs with # in them
                        ahref = ahref[:ahref.find("#")]
                        pass
                    elif "united" in ahref or "states" in ahref or "u.s" in ahref or "illegal" in ahref \
                            or "immig" in ahref or "donald" in ahref or "trump" in ahref:
                        newUrl = a['href']
                        finalUrl = newUrl
                        print "outlinks:", finalUrl
                        finalUrl = urlnorm.norm(finalUrl).encode("utf-8", "ignore")
                        newOutLinks.add(finalUrl)
        except KeyError, e:
            pass
    # The original snippet ended without a return; the collected outlinks are
    # presumably handed back to the caller.
    return newOutLinks
def __init__(self, url, previous=None, **info):
    # Apply the simple idempotent optimizations to all urls (no need to
    # ever deal with "HTTP://.."). This means case-sensitivity, and a
    # whole lot of other things that the urlnorm library will do for us.
    # We call this the original url, even though it is a bit of a lie.
    try:
        self.original_url = urlnorm.norm(url)
    except urlnorm.InvalidUrl as e:
        raise urlnorm.InvalidUrl('{}: {}'.format(e, url))

    # For the normalized url that we'll be exposing, remove the
    # fragment, and treat https and http the same.
    url, fragment = urldefrag(self.original_url)
    self.lossy_url_data = {'fragment': fragment}
    if url.startswith('https:'):
        url = 'http' + url[5:]
        self.lossy_url_data.update({'protocol': 'https'})
    self.url = url

    self.set_previous(previous)
    self.info = info
    self.post = None

    # Runtime data
    self.response = None
    self.exception = None
    self.retries = 0
def treat_url(url, logger=None):
    """
    Remove "weird" artifacts from the given URL. Collapse adjacent '.'s, apply '..', etc.

    :param str url: URL to clear.
    :param gluetool.log.ContextAdapter logger: logger to use for logging.
    :rtype: str
    :returns: Treated URL.
    """

    logger = logger or Logging.get_logger()

    logger.debug("treating a URL '{}'".format(url))

    try:
        url = str(urlnorm.norm(url))
    except urlnorm.InvalidUrl as exc:
        # urlnorm cannot handle localhost: https://github.com/jehiah/urlnorm/issues/3
        if exc.message == "host u'localhost' is not valid":
            pass
        else:
            raise exc

    return url.strip()
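# Hypothetical usage sketch for treat_url() above, not part of the original
# module; it assumes urlnorm is installed and gluetool's default logger is
# available, and the expected output is approximate.
#
#   treat_url('HTTP://Example.COM/a/./b/../c')  # -> roughly 'http://example.com/a/c'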
def _canonize(self):
    if self.is_absolute:
        self.canonical_scheme, self.canonical_netloc, self.canonical_path, self.canonical_query, self.canonical_fragment \
            = urlsplit(urlnorm.norm(self.raw))
    else:
        self.canonical_scheme, self.canonical_netloc, self.canonical_path, self.canonical_query, self.canonical_fragment \
            = self.raw_scheme, self.raw_netloc, self.raw_path, self.raw_query, self.raw_fragment

    self.canonical_scheme = self.canonical_scheme.lower()

    if self.raw_scheme.endswith('s'):
        self.canonical_netloc, _ = rstrip_string(self.canonical_netloc, ':443')
    else:
        self.canonical_netloc, _ = rstrip_string(self.canonical_netloc, ':80')
    self.canonical_netloc = self.canonical_netloc.lower()

    self.canonical_path = '' if self.canonical_path == '/' else self.canonical_path

    params = parse_qsl(self.canonical_query, True)
    self.canonical_query_params = [(k, v) for (k, v) in sorted(params)]

    self.canonical = urlunsplit(
        (self.canonical_scheme, self.canonical_netloc, self.canonical_path, self.canonical_query, ''))
def processPage():
    while not urls.counter > urlcount:
        try:
            link = urlpool.get()
            newurl = urlparse.urljoin(link.base_url, link.url)  # Converting relative URLs to absolute ones
            newurl = unicode(urlnorm.norm(newurl))  # Normalizing URL
            print "out: " + newurl
            disassembled = urlparse.urlsplit(newurl)
            filename, file_ext = splitext(basename(disassembled.path))  # Finding file extension for filtering exclusions
            file_ext = file_ext.lower()
            if filename == 'index':
                newurl = newurl[:-len(filename + file_ext)]
            if (file_ext not in excludedExtensions
                    and disassembled.scheme in ['http', 'https']
                    and disassembled.fragment == ''):
                print "in : " + newurl
                if newurl not in visited:  # Checking to see if URL has already been queued once
                    visited.add(newurl)
                    if urlContains(newurl, searchTags) > 0:
                        urls.put(newurl, 1)
                    else:
                        priority = priorityCalculator.searchPage(newurl, searchTags)
                        if priority < len(searchTags) + 1:
                            urls.put(newurl, priority)  # Adding URL to queue with calculated priority
        except UnicodeEncodeError:
            print "UnicodeEncodeError"
        except:
            print "Invalid URL"
def test_invalid_urls(url):
    try:
        output = urlnorm.norm(url)
        print '%r' % output
    except urlnorm.InvalidUrl:
        return
    assert 1 == 0, "this should have raised an InvalidUrl exception"
def canonicalize(url):
    """Canonicalize a URL in just a few easy steps:

        1. Resolve any redirects
        2. Normalize the URL
        3. Strip any superfluous query params
        4. Sort any remaining query params
        5. Profit!

    This relies on the urlnorm module for normalization, and, at the
    moment, just removes utm_* query params.

    TODO: Special case normalization for major sites (e.g. youtube)?
    """
    url = urlnorm.norm(resolve(url))
    url_parts = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = url_parts

    params = []
    for key, value in cgi.parse_qs(query).iteritems():
        if exclude_param(url_parts, key, value):
            continue
        if isinstance(value, list):
            params.extend((key, v) for v in value)
        else:
            params.append((key, value))
    query = urllib.urlencode(sorted(params), doseq=1)

    return urlparse.urlunsplit((scheme, netloc, path, query, ''))
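# Hypothetical usage sketch for canonicalize() above, not part of the original
# module; it assumes resolve() returns the URL unchanged (no redirects) and
# that exclude_param() drops utm_* keys as the docstring describes.
#
#   canonicalize('http://Example.com/a?utm_source=feed&b=2&a=1')
#   # -> roughly 'http://example.com/a?a=1&b=2'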
def task_listener_crawler(gearman_worker, gearman_job):
    url = gearman_job.data
    url_frontier.add(url)
    urls = urlparse.urlparse(url)
    print "Crawling ", url
    # NOTE: the original passed crawler_headers positionally, which requests
    # treats as query params; headers= is assumed to be the intent here.
    response = requests.get(url, headers=crawler_headers)
    print 'Downloaded page'
    if response.status_code == 200:
        raw_data = response.text
        if response.encoding != 'utf8':
            raw_data = response.text.encode(response.encoding).decode('utf8')
        r.table(raw_result_table).insert({'url': url, 'raw': raw_data, 'status': 200},
                                         conflict="replace").run(rethink)
        links = linkregex.findall(raw_data)
        for link in (links.pop(0) for _ in xrange(len(links))):
            pre_norm_url = url_pre_norm(link, urls)
            norm_url = urlnorm.norm(pre_norm_url)
            norm_parts = urlparse.urlparse(norm_url)
            ext_url = norm_parts.path.split(".")[-1].lower()
            if ext_url not in except_url_suffixes and url_frontier.add(norm_url):
                print "Add ", norm_url, " to redis queue"
                redis_client.rpush("urls:enqueued", norm_url)
        print "Done"
        return "ok"
    else:
        r.table(raw_result_table).insert({'url': url, 'status': response.status_code},
                                         conflict="replace").run(rethink)
        return "fail"
def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        norm = urlnorm.norm(url)
        norm, _ = urldefrag(norm)
        return norm.rstrip('/')
    except:
        return None
def normalize_url(url):
    norm_url = urlnorm.norm(url)
    if norm_url.startswith("https://"):
        return norm_url[8:]
    elif norm_url.startswith("http://"):
        return norm_url[7:]
    else:
        return norm_url
def clean(self): """Ensures that URLs are canonized before saving""" self.value = refang(self.value.strip()) try: if re.match(r"[^:]+://", self.value) is None: # if no schema is specified, assume http:// self.value = u"http://{}".format(self.value) self.value = urlnorm.norm(self.value) except urlnorm.InvalidUrl: raise ObservableValidationError("Invalid URL: {}".format(self.value))
def fetch(self, method, endpoint, params):
    api_endpoint = norm(self.api_base + endpoint)
    content = self.oauth.request(
        method,
        api_endpoint,
        params=params,
        headers={'User-Agent': 'Semantics3 Python Lib/0.2'})
    print(content)
    return content
def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        url = urlnorm.norm(url)
        url, _ = urldefrag(url)
        url = url.rstrip("/")
        return url
    except:
        return None
def new(cls, *args, **kwargs):
    obj = cls(*args)
    obj.source = kwargs['source']
    obj.duplicates = 0
    obj.priority = 0
    # normalize url
    if hasattr(obj, 'url'):
        obj.url = urlnorm.norm(obj.url)
    return obj
def googleSearch(searchString):
    g = pygoogle(searchString)
    g.pages = 2
    urls = g.get_urls()
    urls = urls[:10]
    for i in range(len(urls)):
        urls[i] = unicode(urlnorm.norm(urls[i]))
    return urls
def createMetaResources(md5v, dataset):
    with Timer(key='createMetaResources'):
        res = getDistributionAccessURLs(dataset) + getDistributionDownloadURLs(dataset)
        bulk_mr = []
        uris = []
        for uri in res:
            valid = True
            try:
                uri = urlnorm.norm(uri.strip())
            except Exception as e:
                log.debug("URIFormat", uri=uri, md5=md5v, msg=e.message)
                uri = uri
                valid = False

            f = getDistributionFormatWithURL(dataset, uri)
            m = getDistributionMediaTypeWithURL(dataset, uri)
            s = getDistributionSizeWithURL(dataset, uri)
            c = getDistributionCreationDateWithURL(dataset, uri)
            mod = getDistributionModificationDateWithURL(dataset, uri)

            try:
                s_uri = safe_url_string(uri, 'utf-8')
                uri = escape_ajax(s_uri)
            except Exception as exc:
                # NOTE: the original passed md5=md5, an undefined name; md5v is assumed here.
                ErrorHandler.handleError(log, "safe_url_string", exception=exc, md5=md5v, uri=uri, exc_info=True)
                uri = uri

            if uri in uris:
                log.debug("WARNING, duplicate URI", dataset=dataset.id, md5=md5v, uri=uri, format=f, media=m)
                continue

            try:
                s = int(float(s)) if s is not None else None
            except Exception as e:
                s = None

            MR = MetaResource(uri=uri, md5=md5v, media=m, valid=valid, format=normaliseFormat(f),
                              size=s, created=toDatetime(c), modified=toDatetime(mod))
            bulk_mr.append(MR)
            uris.append(uri)
    return bulk_mr
def dl_html(page):
    url = "http://en.wiktionary.org/wiki/%s" % page
    url = urlnorm.norm(url)

    # we should be able to crawl any page from the links we obtained
    # and we're obeying crawling delays here
    response = urllib2.urlopen(url.encode("utf8"), timeout=5)
    time.sleep(config.page_crawl_delay)
    return response.read()
def canonizeurl(url):
    split = urlsplit(urlnorm.norm(url))
    path = split[2].split(" ")[0]
    while path.startswith("/.."):
        path = path[3:]
    while path.endswith("%20"):
        path = path[:-3]
    # qs = urlencode(sorted(parse_qsl(split.query)))
    qs = ""
    return urlunsplit((split.scheme, split.netloc, path, qs, ""))
def __init__(self, url):
    """Construct from a string or Django request."""
    # If given a Django request, work with its full path. The original code
    # ran this check only after normalization, when the value is always a
    # plain string and can never have get_full_path().
    if hasattr(url, 'get_full_path'):
        url = url.get_full_path()
    # NOTE: the original encoded to UTF-16 before normalizing, which produces
    # bytes urlnorm cannot parse; UTF-8 is assumed to be the intent.
    nurl = urlnorm.norm(url.encode('utf-8').lower())
    self.scheme, self.netloc, self.path, self.params, \
        self.query, self.fragment = urlparse.urlparse(nurl)
    filename, self.ftype = os.path.splitext(self.path)
    self.args = dict(cgi.parse_qsl(self.query))
def normalize_url(url, path=None):
    try:
        if path:
            url = urljoin(url, path)
        url = urlnorm.norm(url)
        # force HTTP protocol
        if url.startswith('http'):
            return url
    except urlnorm.InvalidUrl:
        pass
def dl_xml(params):
    url = "http://en.wiktionary.org/w/api.php?format=xml"
    for key, val in params.iteritems():
        url += "&%s=%s" % (key, val)
    url = urlnorm.norm(url)

    # We're permitted to crawl any page with the API regardless
    # of robots.txt since we're using the API
    response = urllib2.urlopen(url.encode("utf8"), timeout=5)
    time.sleep(config.api_crawl_delay)
    return response.read()
def getImage(self, opener, url, data, wait_time):
    """
    Directly get an Image using URLLib. Errors Must be handled.

    *Optional Parameters*

    :param opener: urllib opener to use (use GetPage for setup)
    :param url: url address to use
    :param data: data to use in request (like that passed to urlencode)
    :param wait_time: time to wait for request
    """
    return opener.open(urlnorm.norm(url), data, wait_time).read()
def clean(self): """Ensures that URLs are canonized before saving""" self.value = refang(self.value.strip()) try: if re.match(r"[^:]+://", self.value) is None: # if no schema is specified, assume http:// self.value = u"http://{}".format(self.value) self.value = urlnorm.norm(self.value) self.parse() except urlnorm.InvalidUrl: raise ObservableValidationError("Invalid URL: {}".format(self.value)) except UnicodeDecodeError: raise ObservableValidationError("Invalid URL (UTF-8 decode error): {}".format(self.value))
def getFileName(self, url, folder=None):
    url_norm = urlnorm.norm(url.strip())
    url_fname = urllib.quote_plus(url_norm)
    if folder:
        submit_path = os.path.join(self.submit_folder[folder], url_fname)
        if os.path.exists(submit_path):
            return os.readlink(submit_path)
    else:
        for f in self.submit_folder:
            submit_path = os.path.join(self.submit_folder[f], url_fname)
            if os.path.exists(submit_path):
                return os.readlink(submit_path)
    return None
def extract_links(body):
    links = []
    for link in HTML_TAG_PATTERN.findall(body):
        try:
            link = link[2]
            netloc = urlparse.urlparse(link).netloc
            if (netloc in domains_of_interest):
                link = urlnorm.norm(link)
                links.append(link)
        except:
            pass
    return links
def _clean_url(self, url):
    """
    Canonicalizes the url, as it is done in Scrapy, and keeps only
    USEFUL_QUERY_KEYS. It also strips the trailing slash to help
    identify dupes.
    """
    # TODO: Turn this into regex
    if not url.startswith('http') or url.endswith('}}') or 'nojs_router' in url:
        return None

    if site(norm(url).lower()) in config.NONCANONIC_SITES:
        clean_url = canonicalize_url(url, keep_params=True)
    else:
        clean_url = canonicalize_url(url)

    return clean_url
def normalize_canonical_url(url, use_url_norm=True):
    try:
        if use_url_norm:
            url = urlnorm.norm(url)
        scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
        host = urlparse.urlunparse((scheme, netloc, '', '', '', ''))
        path = urlparse.urlunparse(('', '', path, params, query, fragment))
        path = shebang_regex.sub('/', path)
        url = host + path
        return url.rstrip('/')
    except Exception:
        return None
def normalize(self, url):
    parsed = urlparse(url.encode('utf-8'))
    if '//' not in url:
        # no scheme supplied; assume http:// and re-parse
        url = '%s%s' % ('http://', url)
        parsed = urlparse(url)
    # NOTE: the original condition was `parsed.scheme == "http" or "https"`,
    # which is always truthy; a membership test is the apparent intent.
    if parsed.scheme in ("http", "https"):
        try:
            normalized_url = urlnorm.norm(url)
        except:
            return None
        return normalized_url
    else:
        return None
def normalize(self): self.value = refang(self.value) try: if re.match(r"[^:]+://", self.value) is None: # if no schema is specified, assume http:// self.value = u"http://{}".format(self.value) self.value = urlnorm.norm(self.value).replace(' ', '%20') self.parse() except urlnorm.InvalidUrl: raise ObservableValidationError("Invalid URL: {}".format( self.value)) except UnicodeDecodeError: raise ObservableValidationError( "Invalid URL (UTF-8 decode error): {}".format(self.value))
def getImageSpynner(self, baseurl, spynner, iser, wait_time, proxy):
    """
    Directly get an Image with Spynner.

    *Required Parameters*

    :param baseurl: base url to use with link (a blank string is nothing)
    :param spynner: spynner instance
    :param iser: selector for image
    :param wait_time: time to wait in acquiring an image
    :param proxy: String proxy
    """
    br = spynner
    print "Downloading..." + str(iser["src"])
    return br.download(urlnorm.norm(baseurl + iser["src"]), outfd=None, timeout=wait_time, proxy_url=proxy)
def normalize(self): self.value = refang(self.value) try: if re.match(r"[^:]+://", self.value) is None: # if no schema is specified, assume http:// self.value = u"http://{}".format(self.value) self.value = urlnorm.norm(self.value).replace(' ', '%20') self.parse() except urlnorm.InvalidUrl: raise ObservableValidationError( "Invalid URL: {}".format(self.value)) except UnicodeDecodeError: raise ObservableValidationError( "Invalid URL (UTF-8 decode error): {}".format(self.value))
def parse_url(text):
    """Clean and verify a URL."""
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    url = stringify(text)
    if url is not None:
        if url.startswith('//'):
            url = 'http:' + url
        elif '://' not in url:
            url = 'http://' + url
        try:
            norm = urlnorm.norm(url)
            norm, _ = urldefrag(norm)
            return norm
        except:
            return None
    return None
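# Hypothetical usage sketch for parse_url() above (not from the original module);
# it assumes stringify() returns plain str input unchanged, and the result is approximate.
#
#   parse_url('example.com/page#section')  # -> roughly 'http://example.com/page'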
def norm_url(url):
    url = uni(url).encode('utf-8')
    try:
        return urlnorm.norm(url)
    except urlnorm.InvalidUrl:
        # Happens when the URL is relative. Call path normalization directly.
        try:
            return urlnorm.norm_path('', url)
        except UnicodeDecodeError:
            return url
    except UnicodeDecodeError:
        # work around for bug in urlnorm on unicode url
        return url
    except:
        traceback.print_exc()
        return None
def cert_chain_url_valid(cert_url):
    """
    Ensure that the provided URL for the certificate chain is valid, by
    checking that:

    * it's HTTPS
    * the host is s3.amazonaws.com
    * the port, if specified, is 443
    * the path starts with '/echo.api/'
    """
    normalized = urlnorm.norm(cert_url)
    parsed = urlparse.urlparse(normalized)
    url_checks = {
        'scheme': parsed.scheme == 'https',
        'hostname': parsed.hostname == 's3.amazonaws.com',
        'port': parsed.port in (443, None),
        'path': parsed.path.startswith('/echo.api/'),
    }
    all_checks_pass = all(url_checks.values())
    return all_checks_pass
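# Hypothetical usage sketch for cert_chain_url_valid() above (not part of the
# original module); it relies only on urlnorm and the stdlib urlparse that the
# function already uses.
#
#   cert_chain_url_valid('https://s3.amazonaws.com/echo.api/echo-api-cert.pem')  # -> True
#   cert_chain_url_valid('http://s3.amazonaws.com/echo.api/echo-api-cert.pem')   # -> False (not HTTPS)
#   cert_chain_url_valid('https://s3.amazonaws.com:8443/echo.api/cert.pem')      # -> False (wrong port)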
def _prepareURL(self, apiQueryURI):
    """
    If the URI (actually just a partial URL, usually the path part) doesn't
    begin with the base URL for the API, concatenate the two into a new URL
    and return it.

    :param apiQueryURI: URI (actually, just a partial URL, usually the path
        part) for an API entry point.
    :type apiQueryURI: str

    :return: URL for the API query, ready for use
    :rtype: str
    """
    assert isinstance(apiQueryURI, str)
    assert not util.stringContainsAllCharacters(apiQueryURI, '{}'), \
        'apiQueryURI contains unformatted arguments: "%s"' % apiQueryURI

    if apiQueryURI.startswith(self.apiBaseURL):
        return apiQueryURI

    return urlnorm.norm(self.apiBaseURL + '/' + apiQueryURI)
def extract_urls(text, regex):
    results = dict()
    for i in regex.finditer(text):
        # Bind the raw match before the try block so the except handlers can
        # always log the URL that failed.
        url = i.group(1).strip()
        try:
            url = urlnorm.norm(url)
            url_parsed = url_parser(url)
            if results.get(url_parsed.host):
                results[url_parsed.host].add(url)
            else:
                # NOTE: the original used set(url), which builds a set of the
                # URL's individual characters; a one-element set is intended.
                results[url_parsed.host] = set([url])
            log.debug("Parsed domain: {}".format(url_parsed.host))
        except urlnorm.InvalidUrl:
            log.warning("Parsing invalid url: {}".format(url))
        except:
            log.exception("Failed parsing url: {}".format(url))
    return results
def on_data(self, data):
    tweet_data = json.loads(data)
    if 'limit' in tweet_data:
        print("Limit:" + str(tweet_data["limit"]))
    else:
        # insert into tweet db
        tweet = tweet_data["text"]
        username = tweet_data["user"]["screen_name"]
        #lat = tweet_data[]
        #long = tweet_data[]
        c.execute("INSERT INTO tweet (time, username, tweet) VALUES (%s,%s,%s)",
                  (time.time(), username, tweet))
        tweet_id = c.lastrowid

        # insert full urls into DB
        for url in tweet_data["entities"]["urls"]:
            # process URL
            norm_url = urlnorm.norm(url["expanded_url"])
            norm_url_tuple = urlparse.urlparse(norm_url)
            # unshorten URLs for common URL minimizer services
            if norm_url_tuple[1] in URL_SHORTENERS:
                norm_url = unshorten_url(norm_url)
                norm_url_tuple = urlparse.urlparse(norm_url)
            md5_url = hashlib.md5()
            md5_url.update(norm_url.encode("utf-8"))
            c.execute("INSERT INTO url (url, domain, url_hash) VALUES (%s,%s,%s)",
                      (norm_url, norm_url_tuple[1], md5_url.hexdigest()))
            url_id = c.lastrowid
            c.execute("INSERT INTO tweet_urls (tweet_id, url_id) VALUES (%s,%s)",
                      (tweet_id, url_id))

        conn.commit()
        self.tweet_count += 1
        if self.tweet_count % 1000 == 0:
            print self.tweet_count
    return True
def scrape(self):
    stories = self._scrape()
    # If we've scraped the same canonical URL twice, we will just choose the
    # first one. Collect the keepers in a new list rather than calling
    # stories.remove() inside the loop, which skips elements while iterating.
    urls = set()
    deduped = []
    for story in stories:
        try:
            url = urlnorm.norm(story.url)
        except:
            # If we've scraped a bad UTF-8 character here, this might fail
            url = story.url
        if url in urls:
            continue
        urls.add(url)
        story.url = url
        story.title = story.title.strip()
        deduped.append(story)
    return deduped
def storeURL(url, path, max_file_size):
    # download URL and send fileID
    log.debug("downloading url", url=url, max_file_size=max_file_size)
    try:
        r = requests.get(url, stream=True)
        size = 0
        ctt = StringIO()
        sig = hashlib.md5()
        for chunk in r.iter_content(2048):
            size += len(chunk)
            ctt.write(chunk)
            sig.update(chunk)
            if size > max_file_size:
                r.close()
                raise RequestEntityTooLarge()

        md5 = sig.hexdigest()
        ctt.seek(0)
        fpath = os.path.join(path, md5)
        if os.path.exists(fpath):
            print 'file exists', fpath
            return md5

        log.debug("storing url", url=url, file=fpath)
        with open(fpath, 'w') as fd:
            t = ctt.read(1048576)
            while t:
                fd.write(t)
                t = ctt.read(1048576)

        url_norm = urlnorm.norm(url.strip())
        url_fname = urllib.quote_plus(url_norm)
        f = os.path.join(path, url_fname)
        os.symlink(fpath, f)
        log.debug("url stored", url=url, file=fpath)
        return md5
    except Exception as e:
        raise e
def save_link(cls, title, url, body="", tags=[], clicks=0, unread=True): url = norm(url) id = mmh3.hash(url) key = ndb.Key(LinkModel, id) domain = urlparse(url).netloc if len(domain) > 4 and domain.startswith('www.'): domain = domain[4:] link = LinkModel(key=key, title=title, url=url, domain=domain, body=body, tags=tags, clicks=clicks, unread=unread) link.put() id = str(link.id) doc = cls._buildDoc(id, title, body, domain, tags) cls.add(doc) return cls(doc, link)
def main(): print "\n.: BUCKLEGRIPPER v0.1 https://github.com/hadojae/DATA :." parser = argparse.ArgumentParser(description='Visit a suspected phishing page, screenshot it and pillage it for phishing archives') parser.add_argument('-u','--url', help='Url to visit',required=False,default=False) parser.add_argument('-s','--source', help='Apply a source to where this url came from',required=False,default="bucklegripper") parser.add_argument('-r','--readfile', help='Read in a file of URLs one per line',required=False,default=False) parser.add_argument('-a','--useragent', help='Custom User-Agent',required=False,default="Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36") args = parser.parse_args() user_agent = args.useragent full = args.url source = args.source readfile = args.readfile if full == False and readfile == False: print bcolors.FAIL + "\n[-] You have to enter either a url with '-u' to analyze or specify a file with urls in it with '-r'\n" + bcolors.ENDC sys.exit() # "setup fake ua for urllib2 requests" headers = { 'User-Agent' : user_agent } if readfile == False: mainloop(full, headers, user_agent, source) sys.exit() else: print "\n[+] Beginning processing of " + readfile with open(readfile) as f: content = f.readlines() for line in content: #catch bad url try: full = urlnorm.norm(line).strip('\n') except Exception: print bcolors.FAIL + "[-] " + line + " is a Malformed URI" + bcolors.ENDC continue mainloop(full, headers, user_agent, source) print "\n[+] Finished processing " + readfile + '\n' sys.exit()
def enqueue(self, url, *args):
    # Each entry also stores an 'explored' flag; links that have not been
    # explored yet can still have their priority averaged with new scores.
    normalizedURL = url
    if (len(args) != 1):
        crawlerLogger.error("Expected exactly one priority argument but more were supplied")
    priority = args[0]
    try:
        normalizedURL = urlnorm.norm(url)
        if (normalizedURL not in self._linkDict):
            self._linkDict[normalizedURL] = (priority, False)
        else:
            # Average the two scores if found
            prevPriority, explored = self._linkDict[normalizedURL]
            if (not explored):
                self._linkDict[normalizedURL] = ((prevPriority + priority) / 2, False)
    except Exception as e:
        crawlerLogger.warn("Normalization Issues. Not Enqueing " + url)
    self._buildHeap()