Code example #1
File: crawler3.py Project: sigmundc/CS6965
def normalize_url(base_url, url):
	myfile3 = open('normalization_log', 'a')
	myfile3.write("base url:{0}\n".format(base_url))
	myfile3.write("url:{0}\n".format(url))
	myfile3.close()
	result = ''

	# if url starts with http:// or https://
	allowed_scheme = ['http', 'https']
	url_scheme = urlparse(url).scheme
	if url_scheme in allowed_scheme:
		return urlnorm.norm(url)
	elif url_scheme == 'mailto':
		return False
	elif len(url_scheme) == 0:
		# check if URL starts with ../
		if (url[:3] == '../') or (url[:2] == './'):
			return urlnorm.norm(base_url+'/'+url)
		elif url[0] == '/': # e.g. /page/page
			# That means it's the domain + url
			url_obj = urlparse(base_url)
			new_url = url_obj.scheme + "://" + url_obj.netloc + url
			return urlnorm.norm(new_url)

		else: # URL should be just html page e.g. research.html
			# so we need to replace the last part
			# if URL is 'http://www.test.com/page/page/12345':
			# results will be ['http://www.test.com/page/page', '12345']
			parts = base_url.rsplit('/', 1)
			return urlnorm.norm(parts[0]+'/'+url)
	result = url
	return result
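A minimal usage sketch for the function above (base URL and page names are invented for illustration, and it assumes the module's own urlparse/urlnorm imports are in place; the outputs are what the branches above suggest, not verified results):

# Hypothetical calls to normalize_url() from the example above.
base = 'http://www.test.com/page/page/12345'

normalize_url(base, 'mailto:someone@test.com')
# -> False (mailto links are rejected)

normalize_url(base, '/about/team')
# -> expected: 'http://www.test.com/about/team' (scheme + domain + absolute path)

normalize_url(base, 'research.html')
# -> expected: 'http://www.test.com/page/page/research.html' (last path segment replaced)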
Code example #2
File: llc.py Project: rickychang/letter-link-crawl
def main():
	if (len(sys.argv) < 3 ):
		print "usage: python ll-print.py <url> <search term>"
		print "example: python ll-print.py http://www.hunch.com 'hunch team'"
		exit(0)
	root_URL = sys.argv[1]
	search_term = sys.argv[2]
	if (not validate_search_term(search_term)):
		print "Invalid search term.  Please only use valid url characters and spaces."
		exit(1)
	first_letter = search_term[0]
	first_letter_match = root_URL.find(first_letter.lower())
	if (first_letter_match != -1):
		try:
			br = mechanize.Browser()
			br._factory.is_html = True
			result = []
			br.open(root_URL)
			# print "visiting: " + urlnorm.norm(br.geturl())
			visited = set([urlnorm.norm(br.geturl()), urlnorm.norm(root_URL)])
			result = find_matching_links(br, search_term, result, visited)
			if (result):
				max_index = max(result, key=lambda u: u[1])[1]
				for l, i, c in result:
					print_url(l, i, max_index)
		except urlnorm.InvalidUrl:
			print "Invalid root URL"
		except urllib2.URLError, e:
			print "Error opening root URL"
			print e
		except Exception, e:
			print e
Code example #3
File: crawler.py Project: joshelser/phoenixcon-2018
def crawl(db, url, urls_crawled={}):
    # Make sure we don't infinite loop, keep track of what we urls_crawled
    # TODO pull this from Phoenix

    # Crawl this website, get all of the outbound URLs
    urls_to_crawl = crawl_one(db, url)
    # Record that we crawled this url
    urls_crawled[url] = None

    for url_to_crawl in urls_to_crawl:
        try:
            url_to_crawl = urlnorm.norm(url_to_crawl)
        except urlnorm.InvalidUrl:
            # Try to convert it to an absolute url
            url_to_crawl = urlnorm.norm("%s%s" % (url, url_to_crawl))

        # Don't re-record
        if url_to_crawl in urls_crawled:
            print 'Skipping %s as already crawled' % (url_to_crawl)
        # Only crawl my site
        elif url_to_crawl.startswith('https://penguinsinabox.com'):
            crawl(db, url_to_crawl, urls_crawled)
        else:
            # A website not owned by me
            print 'Skipping %s as not a self-controlled site' % (url_to_crawl)
    print "Finished processing children of %s" % (url)
Code example #4
File: llc.py Project: rickychang/letter-link-crawl
def find_matching_links(br, target_word, result, visited):
	if (not target_word):
		return result
	else:
		current_URL = urlnorm.norm(br.geturl())
		current_letter = target_word[0].lower()
		if (current_letter.isspace()):
			return find_matching_links(br, target_word[1:], result + [('', -1, ' ')], visited)
		else:
			matching_index = current_URL[7:].find(current_letter)
			if (matching_index == -1):
				return []
			else:
				new_result = result + [(current_URL, matching_index + 7, current_letter)]
				links = list(br.links())
				for link in links:
					try:
						link_URL = urlnorm.norm(link.absolute_url)
						if (link_URL not in visited):
							br.open(link_URL)
							new_visited = visited.copy()
							new_visited.add(link_URL)
							# print "visiting: " + urlnorm.norm(br.geturl())
							new_visited.add(urlnorm.norm(br.geturl()))
							child_result = find_matching_links(br, target_word[1:], new_result, new_visited)
							if (child_result):
								return child_result
					except Exception, e:
						continue
Code example #5
def fetchOutlinks(ahrefs):
    newOutLinks = set()
    base_url = "https://en.wikipedia.org"
    for a in ahrefs:
        try:
            ahref = a['href'].lower()
            not_parseable_ressources = (".avi", ".mkv", ".mp4", ".jpg",
                                        ".jpeg", ".png", ".gif", ".pdf",
                                        ".iso", ".rar", ".tar", ".tgz", ".zip",
                                        ".dmg", ".exe")
            if not urlparse.urlparse(ahref).path.endswith(
                    not_parseable_ressources):
                if "wiki" in ahref:
                    if "#" in ahref:  # Finding and removing URLs with # in them
                        ahref = ahref[:ahref.find("#")]
                        pass
                    elif "?" in ahref:  # Finding and removing URLs with ? in them
                        ahref = ahref[:ahref.find("?")]
                        pass
                    elif ":" in ahref:  # Finding and removing URLs with : in them
                        ahref = ahref[:ahref.find(":")]
                        pass
                    elif "//" in ahref:  # Finding and removing URLs with // in them
                        ahref = ahref[:ahref.find("//")]
                        pass
                    elif ahref == "/wiki/Main_Page":  # Finding and removing URLs of Main page of Wiki
                        pass
                    elif "united" in ahref or "states" in ahref or "u.s" in ahref or "illegal" in ahref or "immig" in ahref or "donald" in ahref or "trump" in ahref:
                        newUrl = a['href']
                        finalUrl = base_url + newUrl
                        finalUrl = urlnorm.norm(finalUrl).encode(
                            "utf-8", "ignore")
                        newOutLinks.add(finalUrl)
                    else:
                        if ahref[:
                                 2] == "//":  # Finding and removing URLs with // in them
                            pass
                        elif "index" in ahref or "youtube" in ahref or "rgu" in ahref or "book" in ahref or "american" in ahref:
                            pass
                        elif "#" in ahref:  # Finding and removing URLs with # in them
                            ahref = ahref[:ahref.find("#")]
                            pass
                        elif "united" in ahref or "states" in ahref or "u.s" in ahref or "illegal" in ahref or "immig" in ahref or "donald" in ahref or "trump" in ahref:
                            newUrl = a['href']
                            finalUrl = newUrl
                            print "outlinks:", finalUrl
                            finalUrl = urlnorm.norm(finalUrl).encode(
                                "utf-8", "ignore")
                            newOutLinks.add(finalUrl)
        except KeyError, e:
            pass
Code example #6
    def __init__(self, url, previous=None, **info):
        # Apply the simple idempotent optimizations to all urls (no need to
        # ever deal with "HTTP://.."). This means case-sensitivity, and a
        # whole lot of other things that the urlnorm library will do for us.
        # We call this the original url, even though it is a bit of a lie.
        try:
            self.original_url = urlnorm.norm(url)
        except urlnorm.InvalidUrl as e:
            raise urlnorm.InvalidUrl('{}: {}'.format(e, url))

        # For the normalized url that we'll be exposing, remove the
        # fragment, and treat https and http the same.
        url, fragment = urldefrag(self.original_url)
        self.lossy_url_data = {'fragment': fragment}
        if url.startswith('https:'):
            url = 'http' + url[5:]
            self.lossy_url_data.update({'protocol': 'https'})
        self.url = url

        self.set_previous(previous)
        self.info = info
        self.post = None

        # Runtime data
        self.response = None
        self.exception = None
        self.retries = 0
Code example #7
File: utils.py Project: jpopelka/gluetool
def treat_url(url, logger=None):
    """
    Remove "weird" artifacts from the given URL. Collapse adjacent '.'s, apply '..', etc.

    :param str url: URL to clear.
    :param gluetool.log.ContextAdapter logger: logger to use for logging.
    :rtype: str
    :returns: Treated URL.
    """

    logger = logger or Logging.get_logger()

    logger.debug("treating a URL '{}'".format(url))

    try:
        url = str(urlnorm.norm(url))

    except urlnorm.InvalidUrl as exc:
        # urlnorm cannot handle localhost: https://github.com/jehiah/urlnorm/issues/3
        if exc.message == "host u'localhost' is not valid":
            pass

        else:
            raise exc

    return url.strip()
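For context, the cleanup that treat_url delegates to urlnorm looks roughly like this; the URL is invented and the result shown is the kind of output urlnorm's dot-segment and default-port handling is expected to produce, not a verified value:

import urlnorm

# Illustrative only: urlnorm collapses '.'/'..' segments, lowercases the
# scheme and host, and drops default ports.
print(urlnorm.norm('HTTP://Example.COM:80/foo/./bar/../baz'))
# expected: 'http://example.com/foo/baz'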
Code example #8
    def _canonize(self):
        if self.is_absolute:
            self.canonical_scheme, self.canonical_netloc, self.canonical_path, self.canonical_query, self.canonical_fragment \
                = urlsplit(urlnorm.norm(self.raw))
        else:
            self.canonical_scheme, self.canonical_netloc, self.canonical_path, self.canonical_query, self.canonical_fragment \
                = self.raw_scheme, self.raw_netloc, self.raw_path, self.raw_query, self.raw_fragment

        self.canonical_scheme = self.canonical_scheme.lower()

        if self.raw_scheme.endswith('s'):
            self.canonical_netloc, _ = rstrip_string(self.canonical_netloc,
                                                     ':443')
        else:
            self.canonical_netloc, _ = rstrip_string(self.canonical_netloc,
                                                     ':80')

        self.canonical_netloc = self.canonical_netloc.lower()

        self.canonical_path = '' if self.canonical_path == '/' else self.canonical_path

        params = parse_qsl(self.canonical_query, True)
        self.canonical_query_params = [(k, v) for (k, v) in sorted(params)]

        self.canonical = urlunsplit(
            (self.canonical_scheme, self.canonical_netloc, self.canonical_path,
             self.canonical_query, ''))
Code example #9
File: Crawler.py Project: srujans1/FocusedWebCrawler
def processPage():
    while not urls.counter > urlcount:
        try:
            link = urlpool.get()
            newurl = urlparse.urljoin(
                link.base_url,
                link.url)  # Converting relative URLs to Absolute ones
            newurl = unicode(urlnorm.norm(newurl))  # Normalizing URL
            print "out: " + newurl
            disassembled = urlparse.urlsplit(newurl)
            filename, file_ext = splitext(
                basename(disassembled.path
                         ))  # Finding file extension for filtering exclusions
            file_ext = file_ext.lower()
            if filename == 'index':
                newurl = newurl[:-len(filename + file_ext)]
            if (file_ext not in excludedExtensions
                    and disassembled.scheme in ['http', 'https']
                    and disassembled.fragment == ''):
                print "in : " + newurl
                if newurl not in visited:  # Checking to see if URL has already been queued once
                    visited.add(newurl)
                    if urlContains(newurl, searchTags) > 0:
                        urls.put(newurl, 1)
                    else:
                        priority = priorityCalculator.searchPage(
                            newurl, searchTags)
                        if priority < len(searchTags) + 1:
                            urls.put(
                                newurl, priority
                            )  # Adding URL to queue with calculated priority
        except UnicodeEncodeError:
            print "UnicodeEncodeError"
        except:
            print "Invalid URL"
Code example #10
File: Crawler.py Project: Walliee/FocusedCrawler
def processPage():
    while not urls.counter > urlcount:
        try:
            link = urlpool.get()
            newurl = urlparse.urljoin(link.base_url, link.url) # Converting relative URLs to Absolute ones
            newurl = unicode(urlnorm.norm(newurl)) # Normalizing URL
            print "out: " + newurl
            disassembled = urlparse.urlsplit(newurl)
            filename, file_ext = splitext(basename(disassembled.path)) # Finding file extension for filtering exclusions
            file_ext = file_ext.lower()
            if filename == 'index':
                newurl = newurl[:-len(filename + file_ext)]
            if (file_ext not in excludedExtensions and disassembled.scheme in ['http', 'https'] and disassembled.fragment == ''):
                print "in : " + newurl
                if newurl not in visited: # Checking to see if URL has already been queued once
                    visited.add(newurl)
                    if urlContains(newurl, searchTags) > 0:
                        urls.put(newurl, 1)
                    else:
                        priority = priorityCalculator.searchPage(newurl, searchTags)
                        if priority < len(searchTags) + 1:
                            urls.put(newurl, priority) # Adding URL to queue with calculated priority
        except UnicodeEncodeError:
            print "UnicodeEncodeError"
        except:
            print "Invalid URL"
Code example #11
def test_invalid_urls(url):
    try:
        output = urlnorm.norm(url)
        print '%r' % output
    except urlnorm.InvalidUrl:
        return
    assert 1 == 0, "this should have raised an InvalidUrl exception"
Code example #12
File: urlwork.py Project: MattLeMay/thresholderbot
def canonicalize(url):
    """Canonicalize a URL in just a few easy steps:

        1. Resolve any redirects
        2. Normalize the URL
        3. Strip any superflous query params
        4. Sort any remaining query params
        5. Profit!

    This relies on the urlnorm module for normalization, and, at the moment,
    just removes utm_* query params.

    TODO: Special case normalization for major sites (e.g. youtube)?
    """
    url = urlnorm.norm(resolve(url))
    url_parts = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = url_parts

    params = []
    for key, value in cgi.parse_qs(query).iteritems():
        if exclude_param(url_parts, key, value):
            continue
        if isinstance(value, list):
            params.extend((key, v) for v in value)
        else:
            params.append((key, value))

    query = urllib.urlencode(sorted(params), doseq=1)
    return urlparse.urlunsplit((scheme, netloc, path, query, ''))
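The query handling in steps 3 and 4 can be illustrated on its own, without the resolve() and exclude_param() helpers defined elsewhere in the module (URL and parameter names are made up):

import urllib
import urlparse

# Strip utm_* params and sort the rest, as canonicalize() does above.
parts = urlparse.urlsplit('http://example.com/post?utm_source=feed&b=2&a=1')
params = [(k, v) for k, v in urlparse.parse_qsl(parts.query)
          if not k.startswith('utm_')]
query = urllib.urlencode(sorted(params))
print(urlparse.urlunsplit((parts.scheme, parts.netloc, parts.path, query, '')))
# -> http://example.com/post?a=1&b=2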
Code example #13
File: test_urlnorm.py Project: jehiah/urlnorm
def test_invalid_urls(url):
    try:
        output = urlnorm.norm(url)
        print '%r' % output
    except urlnorm.InvalidUrl:
        return
    assert 1 == 0, "this should have raised an InvalidUrl exception"
Code example #14
def task_listener_crawler(gearman_worker, gearman_job):
	url = gearman_job.data
	url_frontier.add(url)
	urls = urlparse.urlparse(url)
	print "Crawling ", url
	response = requests.get(url, headers=crawler_headers)
	print 'Downloaded page'
	if response.status_code == 200:
		raw_data = response.text
		if response.encoding != 'utf8':
			raw_data = response.text.encode(response.encoding).decode('utf8')
		r.table(raw_result_table).insert({'url': url, 'raw': raw_data, 'status': 200}, conflict="replace").run(rethink)

		links = linkregex.findall(raw_data)
		for link in (links.pop(0) for _ in xrange(len(links))):
			pre_norm_url = url_pre_norm(link, urls)
			norm_url = urlnorm.norm(pre_norm_url)
			norm_parts = urlparse.urlparse(norm_url)
			ext_url = norm_parts.path.split(".")[-1].lower()
			if ext_url not in except_url_suffixes and url_frontier.add(norm_url):
				print "Add ", norm_url, " to redis queue"
				redis_client.rpush("urls:enqueued", norm_url)
		print "Done"
		return "ok"
	else:
		r.table(raw_result_table).insert({'url': url, 'status': response.status_code}, conflict="replace").run(rethink)
	return "fail"
Code example #15
File: urlwork.py Project: vdeleon/thresholderbot
def canonicalize(url):
    """Canonicalize a URL in just a few easy steps:

        1. Resolve any redirects
        2. Normalize the URL
        3. Strip any superflous query params
        4. Sort any remaining query params
        5. Profit!

    This relies on the urlnorm module for normalization, and, at the moment,
    just removes utm_* query params.

    TODO: Special case normalization for major sites (e.g. youtube)?
    """
    url = urlnorm.norm(resolve(url))
    url_parts = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = url_parts

    params = []
    for key, value in cgi.parse_qs(query).iteritems():
        if exclude_param(url_parts, key, value):
            continue
        if isinstance(value, list):
            params.extend((key, v) for v in value)
        else:
            params.append((key, value))

    query = urllib.urlencode(sorted(params), doseq=1)
    return urlparse.urlunsplit((scheme, netloc, path, query, ''))
Code example #16
File: urls.py Project: adamchainz/aleph
def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        norm = urlnorm.norm(url)
        norm, _ = urldefrag(norm)
        return norm.rstrip('/')
    except:
        return None
Code example #17
File: feedimporter.py Project: osks/komfeeder
def normalize_url(url):
    norm_url = urlnorm.norm(url)
    if norm_url.startswith("https://"):
        return norm_url[8:]
    elif norm_url.startswith("http://"):
        return norm_url[7:]
    else:
        return norm_url
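Illustration of the intent (feed URL invented; it assumes urlnorm lowercases the host): after normalization the scheme prefix is stripped, so http and https variants of a feed compare equal.

# normalize_url('HTTPS://Example.com/feed.xml')  -> expected: 'example.com/feed.xml'
# normalize_url('http://example.com/feed.xml')   -> expected: 'example.com/feed.xml'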
Code example #18
File: url.py Project: carriercomm/yeti
 def clean(self):
     """Ensures that URLs are canonized before saving"""
     self.value = refang(self.value.strip())
     try:
         if re.match(r"[^:]+://", self.value) is None:  # if no schema is specified, assume http://
             self.value = u"http://{}".format(self.value)
         self.value = urlnorm.norm(self.value)
     except urlnorm.InvalidUrl:
         raise ObservableValidationError("Invalid URL: {}".format(self.value))
Code example #19
 def fetch(self, method, endpoint, params):
     api_endpoint = norm(self.api_base + endpoint)
     content = self.oauth.request(
         method,
         api_endpoint,
         params=params,
         headers={'User-Agent': 'Semantics3 Python Lib/0.2'})
     print(content)
     return content
Code example #20
File: url.py Project: rlugojr/krauler
def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        url = urlnorm.norm(url)
        url, _ = urldefrag(url)
        url = url.rstrip("/")
        return url
    except:
        return None
Code example #21
File: base.py Project: axknightroad/metasearch
 def new(cls, *args, **kwargs):
     obj = cls(*args)
     obj.source = kwargs['source']
     obj.duplicates = 0
     obj.priority = 0
     # normalize url
     if hasattr(obj, 'url'):
         obj.url = urlnorm.norm(obj.url)
     return obj
Code example #22
File: gQuery.py Project: srujans1/FocusedWebCrawler
def googleSearch ( searchString ):
    g = pygoogle(searchString)
    g.pages = 2
    urls = g.get_urls()
    urls = urls[:10]
    for i in range(len(urls)):
        urls[i]=unicode(urlnorm.norm(urls[i]))

    return urls
Code example #23
 def new(cls, *args, **kwargs):
     obj = cls(*args)
     obj.source = kwargs['source']
     obj.duplicates = 0
     obj.priority = 0
     # normalize url
     if hasattr(obj, 'url'):
         obj.url = urlnorm.norm(obj.url)
     return obj
Code example #24
File: fetch_insert.py Project: thlor/portalmonitor
def createMetaResources(md5v, dataset):
    with Timer(key='createMetaResources'):
        res = getDistributionAccessURLs(dataset) + getDistributionDownloadURLs(
            dataset)
        bulk_mr = []
        uris = []
        for uri in res:
            valid = True
            try:
                uri = urlnorm.norm(uri.strip())
            except Exception as e:
                log.debug("URIFormat", uri=uri, md5=md5v, msg=e.message)
                uri = uri
                valid = False

            f = getDistributionFormatWithURL(dataset, uri)
            m = getDistributionMediaTypeWithURL(dataset, uri)
            s = getDistributionSizeWithURL(dataset, uri)
            c = getDistributionCreationDateWithURL(dataset, uri)
            mod = getDistributionModificationDateWithURL(dataset, uri)
            try:
                s_uri = safe_url_string(uri, 'utf-8')
                uri = escape_ajax(s_uri)
            except Exception as exc:
                ErrorHandler.handleError(log,
                                         "safe_url_string",
                                         exception=exc,
                                         md5=md5v,
                                         uri=uri,
                                         exc_info=True)
                uri = uri

            if uri in uris:
                log.debug("WARNING, duplicate URI",
                          dataset=dataset.id,
                          md5=md5v,
                          uri=uri,
                          format=f,
                          media=m)
                continue
            try:
                s = int(float(s)) if s is not None else None
            except Exception as e:
                s = None

            MR = MetaResource(uri=uri,
                              md5=md5v,
                              media=m,
                              valid=valid,
                              format=normaliseFormat(f),
                              size=s,
                              created=toDatetime(c),
                              modified=toDatetime(mod))
            bulk_mr.append(MR)
            uris.append(uri)
        return bulk_mr
Code example #25
 def fetch(self, method, endpoint, params):
     api_endpoint = norm(self.api_base + endpoint)
     content = self.oauth.request(
                 method,
                 api_endpoint,
                 params = params,
                 headers={'User-Agent':'Semantics3 Python Lib/0.2'}
               )
     print(content)
     return content
Code example #26
def dl_html(page):
    url = "http://en.wiktionary.org/wiki/%s" % page
    url = urlnorm.norm(url)

    # we should be able to crawl any page from the links we obtained
    # and we're obeying crawling delays here
    response = urllib2.urlopen(url.encode("utf8"), timeout=5)

    time.sleep(config.page_crawl_delay)
    return response.read()
Code example #27
File: crawler.py Project: Leeyp/WiktionaryCrawler
def dl_html(page):
	url = "http://en.wiktionary.org/wiki/%s" % page
	url = urlnorm.norm(url)

	# we should be able to crawl any page from the links we obtained
	# and we're obeying crawling delays here
	response = urllib2.urlopen(url.encode("utf8"), timeout=5)

	time.sleep(config.page_crawl_delay)
	return response.read()
Code example #28
def canonizeurl(url):
    split = urlsplit(urlnorm.norm(url))
    path = split[2].split(" ")[0]
    while path.startswith("/.."):
        path = path[3:]
    while path.endswith("%20"):
        path = path[:-3]
    # qs = urlencode(sorted(parse_qsl(split.query)))
    qs = ""
    return urlunsplit((split.scheme, split.netloc, path, qs, ""))
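A hedged example of what this variant produces for a tracking-style link (URL invented): the query string is dropped entirely rather than sorted, and the fragment is discarded by urlunsplit.

# canonizeurl('http://Example.com/a/b?utm_source=x#frag')
#   -> expected: 'http://example.com/a/b'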
Code example #29
File: url.py Project: file-citas/pyhtoncrawler
	def __init__(self, url):
		"""Construct from a string or Django request."""
		nurl = urlnorm.norm(url.encode('utf-16').lower())
		if hasattr(nurl, 'get_full_path'):
			nurl = nurl.get_full_path()

		self.scheme, self.netloc, self.path, self.params, \
			self.query, self.fragment = urlparse.urlparse(nurl)
		filename, self.ftype = os.path.splitext(self.path)
		self.args = dict(cgi.parse_qsl(self.query))
Code example #30
def normalize_url(url, path=None):
    try:
        if path:
            url = urljoin(url, path)
        url = urlnorm.norm(url)
        # force HTTP protocol
        if url.startswith('http'):
            return url
    except urlnorm.InvalidUrl:
        pass
Code example #31
def dl_xml(params):
    url = "http://en.wiktionary.org/w/api.php?format=xml"
    for key, val in params.iteritems():
        url += "&%s=%s" % (key, val)
    url = urlnorm.norm(url)

    # We're permitted to crawl any page with the API regardless
    # of robots.txt since we're using the API
    response = urllib2.urlopen(url.encode("utf8"), timeout=5)

    time.sleep(config.api_crawl_delay)
    return response.read()
Code example #32
File: crawler.py Project: Leeyp/WiktionaryCrawler
def dl_xml(params):
	url = "http://en.wiktionary.org/w/api.php?format=xml"
	for key, val in params.iteritems():
		url += "&%s=%s" % (key, val)
	url = urlnorm.norm(url)

	# We're permitted to crawl any page with the API regardless
	# of robots.txt since we're using the API
	response = urllib2.urlopen(url.encode("utf8"), timeout=5)

	time.sleep(config.api_crawl_delay)
	return response.read()
Code example #33
File: GetImage.py Project: asevans48/CrawlerAids
 def getImage(self,opener,url,data,wait_time):
     """
     Directly get an Image using URLLib. Errors Must be handled.
     
     *Optional Parameters*
     
     :param opener: urllib opener to use (use GetPage for setup)
     :param url: url address to use
     :param data: data to use in request (like that passed to urlencode)
     :param wait_time: time to wait for request
     
     """
     return opener.open(urlnorm.norm(url),data,wait_time).read()
Code example #34
 def clean(self):
     """Ensures that URLs are canonized before saving"""
     self.value = refang(self.value.strip())
     try:
         if re.match(r"[^:]+://", self.value) is None:
             # if no schema is specified, assume http://
             self.value = u"http://{}".format(self.value)
         self.value = urlnorm.norm(self.value)
         self.parse()
     except urlnorm.InvalidUrl:
         raise ObservableValidationError("Invalid URL: {}".format(self.value))
     except UnicodeDecodeError:
         raise ObservableValidationError("Invalid URL (UTF-8 decode error): {}".format(self.value))
Code example #35
File: data_cache.py Project: ODInfoBiz/csvengine-ui
 def getFileName(self, url, folder=None):
     url_norm = urlnorm.norm(url.strip())
     url_fname = urllib.quote_plus(url_norm)
     if folder:
         submit_path = os.path.join(self.submit_folder[folder], url_fname)
         if os.path.exists(submit_path):
             return os.readlink(submit_path)
     else:
         for f in self.submit_folder:
             submit_path = os.path.join(self.submit_folder[f], url_fname)
             if os.path.exists(submit_path):
                 return os.readlink(submit_path)
     return None
Code example #36
def extract_links(body):
  links = []
  for link in HTML_TAG_PATTERN.findall(body):
    try:
      link = link[2]
      netloc = urlparse.urlparse(link).netloc
      if (netloc in domains_of_interest):
        link = urlnorm.norm(link)
        links.append(link)
    except:
      pass

  return links
Code example #37
File: __init__.py Project: svven/summary
 def _clean_url(self, url):
     """
     Canonicalizes the url, as it is done in Scrapy.
     And keeps only USEFUL_QUERY_KEYS. It also strips the 
     trailing slash to help identifying dupes.
     """
     # TODO: Turn this into regex
     if not url.startswith('http') or url.endswith('}}') or 'nojs_router' in url:
         return None
     if site(norm(url).lower()) in config.NONCANONIC_SITES:
         clean_url = canonicalize_url(url, keep_params=True)
     else:
         clean_url = canonicalize_url(url)
     return clean_url
Code example #38
def normalize_canonical_url(url, use_url_norm=True):
    try:
        if use_url_norm:
            url = urlnorm.norm(url)

        scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
        host = urlparse.urlunparse((scheme, netloc, '', '', '', ''))
        path = urlparse.urlunparse(('', '', path, params, query, fragment))
        path = shebang_regex.sub('/', path)
        url = host + path

        return url.rstrip('/')
    except Exception:
        return None
Code example #39
File: py_crawler.py Project: holmes0078/Web-Crawler
    def normalize(self, url):
        # Prepend a scheme first so that bare domains parse with a usable scheme.
        if '//' not in url:
            url = '%s%s' % ('http://', url)

        parsed = urlparse(url.encode('utf-8'))

        if parsed.scheme in ("http", "https"):
            try:
                normalized_url = urlnorm.norm(url)
            except:
                return None
            return normalized_url
        else:
            return None
Code example #40
def normalize_canonical_url(url, use_url_norm=True):
    try:
        if use_url_norm:
            url = urlnorm.norm(url)

        scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
        host = urlparse.urlunparse((scheme, netloc, '', '', '', ''))
        path = urlparse.urlunparse(('', '', path, params, query, fragment))
        path = shebang_regex.sub('/', path)
        url = host + path
        
        return url.rstrip('/')
    except Exception:
        return None
Code example #41
File: url.py Project: yuankeyang/yeti
    def normalize(self):
        self.value = refang(self.value)

        try:
            if re.match(r"[^:]+://", self.value) is None:
                # if no schema is specified, assume http://
                self.value = u"http://{}".format(self.value)
            self.value = urlnorm.norm(self.value).replace(' ', '%20')
            self.parse()
        except urlnorm.InvalidUrl:
            raise ObservableValidationError("Invalid URL: {}".format(
                self.value))
        except UnicodeDecodeError:
            raise ObservableValidationError(
                "Invalid URL (UTF-8 decode error): {}".format(self.value))
Code example #42
File: GetImage.py Project: asevans48/CrawlerAids
 def getImageSpynner(self,baseurl,spynner,iser,wait_time,proxy):
     """
     Directly get an Image with Spynner.
     
     *Required Parameters*
     
     :param baseurl: base url to use  with link (a blank string is nothing)
     :param spynner: spynner instance
     :param iser: selector for image
     :param wait_time: time to wait in acquiring an image
     :param proxy: String proxy
     """
     br=spynner
     print "Downloading..."+str(iser["src"])
     return br.download(urlnorm.norm(baseurl+iser["src"]),outfd=None,timeout=wait_time,proxy_url=proxy)
Code example #43
File: url.py Project: raymundl/yeti
    def normalize(self):
        self.value = refang(self.value)

        try:
            if re.match(r"[^:]+://", self.value) is None:
                # if no schema is specified, assume http://
                self.value = u"http://{}".format(self.value)
            self.value = urlnorm.norm(self.value).replace(' ', '%20')
            self.parse()
        except urlnorm.InvalidUrl:
            raise ObservableValidationError(
                "Invalid URL: {}".format(self.value))
        except UnicodeDecodeError:
            raise ObservableValidationError(
                "Invalid URL (UTF-8 decode error): {}".format(self.value))
Code example #44
File: parse.py Project: wilbrodn/aleph
def parse_url(text):
    """Clean and verify a URL."""
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    url = stringify(text)
    if url is not None:
        if url.startswith('//'):
            url = 'http:' + url
        elif '://' not in url:
            url = 'http://' + url
        try:
            norm = urlnorm.norm(url)
            norm, _ = urldefrag(norm)
            return norm
        except:
            return None
    return None
Code example #45
File: basic.py Project: mrG7/dossier.models
def norm_url(url):
    url = uni(url).encode('utf-8')
    try:
        return urlnorm.norm(url)
    except urlnorm.InvalidUrl:
        # Happens when the URL is relative. Call path normalization directly.
        try:
            return urlnorm.norm_path('', url)
        except UnicodeDecodeError:
            return url

    except UnicodeDecodeError:
        # work around for bug in urlnorm on unicode url
        return url
    except:
        traceback.print_exc()
    return None
Code example #46
def norm_url(url):
    url = uni(url).encode('utf-8')
    try:
        return urlnorm.norm(url)
    except urlnorm.InvalidUrl:
        # Happens when the URL is relative. Call path normalization directly.
        try:
            return urlnorm.norm_path('', url)
        except UnicodeDecodeError:
            return url

    except UnicodeDecodeError:
        # work around for bug in urlnorm on unicode url
        return url
    except:
        traceback.print_exc()
    return None
Code example #47
File: signatures.py Project: martivic/alexa-genie
def cert_chain_url_valid(cert_url):
    """
    Ensure that the provided URL for the certificate chain is valid, by checking that:
    * it's HTTPS
    * the host is s3.amazonaws.com
    * the port, if specified, is 443
    * the path starts with '/echo.api/'
    """
    normalized = urlnorm.norm(cert_url)
    parsed = urlparse.urlparse(normalized)
    url_checks = {
        'scheme': parsed.scheme == 'https',
        'hostname': parsed.hostname == 's3.amazonaws.com',
        'port': parsed.port in (443, None),
        'path': parsed.path.startswith('/echo.api/'),
    }
    all_checks_pass = all(url_checks.values())
    return all_checks_pass
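A hedged usage sketch (certificate URLs invented; the first follows the documented s3.amazonaws.com/echo.api layout, the second fails the port check):

cert_chain_url_valid('https://s3.amazonaws.com/echo.api/echo-api-cert.pem')
# -> True: https scheme, s3.amazonaws.com host, default port, path under /echo.api/

cert_chain_url_valid('https://s3.amazonaws.com:8443/echo.api/echo-api-cert.pem')
# -> False: explicit port 8443 fails the port check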
Code example #48
    def _prepareURL(self, apiQueryURI):
        """
        If the URI (actually just a partial URL, usually the path part) doesn't begin with
        the base URL for the API, concatenate the two into a new URL and return it.

        :param apiQueryURI: URI (actually, just a partial URL, usually the path part) for an API entry point.
        :type apiQueryURI: str
        :return: URL for the API query, ready for use
        :rtype: str
        """
        assert isinstance(apiQueryURI, str)
        assert not util.stringContainsAllCharacters(apiQueryURI, '{}'), \
            'apiQueryURI contains unformatted arguments: "%s"' % apiQueryURI

        if apiQueryURI.startswith(self.apiBaseURL):
            return apiQueryURI

        return urlnorm.norm(self.apiBaseURL + '/' + apiQueryURI)
Code example #49
def extract_urls(text, regex):
    results = dict()

    for i in regex.finditer(text):
        try:
            url = urlnorm.norm(i.group(1).strip())
            url_parsed = url_parser(url)
            if results.get(url_parsed.host):
                results[url_parsed.host].add(url)
            else:
                results[url_parsed.host] = set([url])
            log.debug("Parsed domain: {}".format(url_parsed.host))
        except urlnorm.InvalidUrl:
            log.warning("Parsing invalid url: {}".format(url))
        except:
            log.exception("Failed parsing url: {}".format(url))

    return results
Code example #50
    def on_data(self, data):
        tweet_data = json.loads(data)
        if 'limit' in tweet_data:
            print("Limit:" + str(tweet_data["limit"]))
        else:
            #insert into tweet db
            tweet = tweet_data["text"]
            username = tweet_data["user"]["screen_name"]
            #lat = tweet_data[]
            #long = tweet_data[]

            c.execute("INSERT INTO tweet (time, username, tweet) VALUES (%s,%s,%s)",
                (time.time(), username, tweet))
            tweet_id = c.lastrowid

            # insert full urls into DB
            for url in tweet_data["entities"]["urls"]:

                # process URL
                norm_url = urlnorm.norm(url["expanded_url"])
                norm_url_tuple = urlparse.urlparse(norm_url)

                # unshorten URLs for common URL minimizer services
                if norm_url_tuple[1] in URL_SHORTENERS:
                    norm_url = unshorten_url(norm_url)
                    norm_url_tuple = urlparse.urlparse(norm_url)

                md5_url = hashlib.md5()
                md5_url.update(norm_url.encode("utf-8"))

                c.execute("INSERT INTO url (url, domain, url_hash) VALUES (%s,%s,%s)",
                          (norm_url, norm_url_tuple[1], md5_url.hexdigest()))
                url_id = c.lastrowid
                c.execute("INSERT INTO tweet_urls (tweet_id, url_id) VALUES (%s,%s)",
                          (tweet_id, url_id))



            conn.commit()
            self.tweet_count += 1
            if self.tweet_count % 1000 == 0:
                print self.tweet_count

        return True
Code example #51
File: scrapers.py Project: marekjs/progscrape
    def scrape(self):
        stories = self._scrape()
        # If we've scraped the same canonical URL twice, we will just choose the first one
        urls = set()
        # Iterate over a copy so stories.remove() below doesn't skip entries
        for story in list(stories):
            try:
                url = urlnorm.norm(story.url)
            except:
                # If we've scraped a bad UTF-8 character here, this might fail
                url = story.url

            if url in urls:
                stories.remove(story)
            else:
                urls.add(url)
                story.url = url
                story.title = story.title.strip()

        return stories
Code example #52
File: data_cache.py Project: ODInfoBiz/csvengine-ui
def storeURL(url, path, max_file_size):
    #download URL and send fileID
    log.debug("downloading url", url=url, max_file_size=max_file_size )
    try:
        r = requests.get(url, stream=True)
        size = 0
        ctt = StringIO()
    
        sig = hashlib.md5()
        for chunk in r.iter_content(2048):
            size += len(chunk)
            ctt.write(chunk)
            sig.update(chunk)
            if size >  max_file_size:
                r.close()
                raise RequestEntityTooLarge()
    
        md5 = sig.hexdigest()
        ctt.seek(0)
        
        fpath=os.path.join(path, md5)
        if os.path.exists(fpath):
            print 'file exists', fpath
            return md5
        log.debug("storing url", url=url, file=fpath)
        with open (fpath,'w') as fd:
            t = ctt.read(1048576)
            while t:
                fd.write(t)
                t = ctt.read(1048576)
        
        url_norm = urlnorm.norm(url.strip())
        url_fname = urllib.quote_plus(url_norm)
        f = os.path.join(path, url_fname)

        
        os.symlink(fpath,f)
        log.debug("url stored", url=url, file=fpath)
        
        return md5
    except Exception as e:
        raise e
Code example #53
File: docs.py Project: yiransheng/yiransbookmark
 def save_link(cls, title, url, body="", tags=[], clicks=0, unread=True):
     url = norm(url)
     id = mmh3.hash(url)
     key = ndb.Key(LinkModel, id)
     domain = urlparse(url).netloc
     if len(domain) > 4 and domain.startswith('www.'):
         domain = domain[4:]
     link = LinkModel(key=key,
                      title=title,
                      url=url,
                      domain=domain,
                      body=body,
                      tags=tags,
                      clicks=clicks,
                      unread=unread)
     link.put()
     id = str(link.id)
     doc = cls._buildDoc(id, title, body, domain, tags)
     cls.add(doc)
     return cls(doc, link)
Code example #54
File: bucklegripper.py Project: KNIGHTTH0R/DATA-1
def main():

    print "\n.: BUCKLEGRIPPER v0.1 https://github.com/hadojae/DATA :."

    parser = argparse.ArgumentParser(description='Visit a suspected phishing page, screenshot it and pillage it for phishing archives')
    parser.add_argument('-u','--url', help='Url to visit',required=False,default=False)
    parser.add_argument('-s','--source', help='Apply a source to where this url came from',required=False,default="bucklegripper")
    parser.add_argument('-r','--readfile', help='Read in a file of URLs one per line',required=False,default=False)
    parser.add_argument('-a','--useragent', help='Custom User-Agent',required=False,default="Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36")

    args = parser.parse_args()
    user_agent = args.useragent
    full = args.url
    source = args.source
    readfile = args.readfile

    if full == False and readfile == False:
        print bcolors.FAIL + "\n[-] You have to enter either a url with '-u' to analyze or specify a file with urls in it with '-r'\n" + bcolors.ENDC
        sys.exit() 

    # "setup fake ua for urllib2 requests"
    headers = { 'User-Agent' : user_agent }

    if readfile == False:
        mainloop(full, headers, user_agent, source)
        sys.exit()
    else:
        print "\n[+] Beginning processing of " + readfile
        with open(readfile) as f:
            content = f.readlines()
            for line in content:
                #catch bad url
                try:
                    full = urlnorm.norm(line).strip('\n')
                except Exception:
                    print bcolors.FAIL + "[-] " + line + " is a Malformed URI" + bcolors.ENDC
                    continue 
                mainloop(full, headers, user_agent, source)
        print "\n[+] Finished processing " + readfile + '\n'
        sys.exit() 
Code example #55
    def enqueue(self, url, *args):
        # We add explored bool too. Since links that
        # are not explored only can average prev scores
        # normalizedURL = url
        if (len(args) != 1):
            crawlerLogger.error("Required was Priority but more args supplied")

        priority = args[0]
        try:
            normalizedURL = urlnorm.norm(url)

            if (normalizedURL not in self._linkDict):
                self._linkDict[normalizedURL] = (priority, False)
            else:
                # Average the two scores if found
                prevPriority, explored = self._linkDict[normalizedURL]
                if (not explored):
                    self._linkDict[normalizedURL] = (
                        (prevPriority + priority) / 2, False)
        except Exception as e:
            crawlerLogger.warn("Normalization Issues. Not Enqueing " + url)
        self._buildHeap()
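A behavioural sketch of the averaging above (the frontier instance and URLs are hypothetical; it assumes urlnorm maps both spellings to the same key):

# frontier.enqueue('HTTP://Example.com/page', 4)  # stored as (4, False)
# frontier.enqueue('http://example.com/page', 2)  # same normalized key, still unexplored -> averaged to (3, False)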