def normalize_url(base_url, url):
    myfile3 = open('normalization_log', 'a')
    myfile3.write("base url:{0}\n".format(base_url))
    myfile3.write("url:{0}\n".format(url))
    myfile3.close()
    result = ''
    # if url starts with http:// or https://
    allowed_scheme = ['http', 'https']
    url_scheme = urlparse(url).scheme
    if url_scheme in allowed_scheme:
        return urlnorm.norm(url)
    elif url_scheme == 'mailto':
        return False
    elif len(url_scheme) == 0:
        # check if URL starts with ../ or ./
        if (url[:3] == '../') or (url[:2] == './'):
            return urlnorm.norm(base_url + '/' + url)
        elif url[0] == '/':
            # e.g. /page/page
            # That means it's the domain + url
            url_obj = urlparse(base_url)
            new_url = url_obj.scheme + "://" + url_obj.netloc + url
            return urlnorm.norm(new_url)
        else:
            # URL should be just an html page, e.g. research.html,
            # so we need to replace the last part.
            # If base_url is 'http://www.test.com/page/page/12345',
            # the split result is ['http://www.test.com/page/page', '12345']
            parts = base_url.rsplit('/', 1)
            return urlnorm.norm(parts[0] + '/' + url)
    result = url
    return result
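# Hypothetical usage sketch for normalize_url() above (not part of the original
# module). It assumes `from urlparse import urlparse` and `import urlnorm` are in
# scope, just as the function itself does; expected results are approximate.
#
#   normalize_url('http://example.com/a/b', '../c.html')   # -> 'http://example.com/a/c.html'
#   normalize_url('http://example.com/a/b', '/c.html')     # -> 'http://example.com/c.html'
#   normalize_url('http://example.com/a/b', 'mailto:x@y')  # -> False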
def main():
    if (len(sys.argv) < 3):
        print "usage: python ll-print.py <url> <search term>"
        print "example: python ll-print.py http://www.hunch.com 'hunch team'"
        exit(0)
    root_URL = sys.argv[1]
    search_term = sys.argv[2]
    if (not validate_search_term(search_term)):
        print "Invalid search term. Please only use valid url characters and spaces."
        exit(1)
    first_letter = search_term[0]
    first_letter_match = root_URL.find(first_letter.lower())
    if (first_letter_match != -1):
        try:
            br = mechanize.Browser()
            br._factory.is_html = True
            result = []
            br.open(root_URL)
            # print "visiting: " + urlnorm.norm(br.geturl())
            visited = set([urlnorm.norm(br.geturl()), urlnorm.norm(root_URL)])
            result = find_matching_links(br, search_term, result, visited)
            if (result):
                max_index = max(result, key=lambda u: u[1])[1]
                for l, i, c in result:
                    print_url(l, i, max_index)
        except urlnorm.InvalidUrl:
            print "Invalid root URL"
        except urllib2.URLError, e:
            print "Error opening root URL"
            print e
        except Exception, e:
            print e
def crawl(db, url, urls_crawled={}):
    # Make sure we don't loop forever: keep track of which urls we've already crawled.
    # TODO pull this from Phoenix

    # Crawl this website, get all of the outbound URLs
    urls_to_crawl = crawl_one(db, url)

    # Record that we crawled this url
    urls_crawled[url] = None

    for url_to_crawl in urls_to_crawl:
        try:
            url_to_crawl = urlnorm.norm(url_to_crawl)
        except urlnorm.InvalidUrl:
            # Try to convert it to an absolute url
            url_to_crawl = urlnorm.norm("%s%s" % (url, url_to_crawl))

        # Don't re-record
        if url_to_crawl in urls_crawled:
            print 'Skipping %s as already crawled' % (url_to_crawl)
        # Only crawl my site
        elif url_to_crawl.startswith('https://penguinsinabox.com'):
            crawl(db, url_to_crawl, urls_crawled)
        else:
            # A website not owned by me
            print 'Skipping %s as not a self-controlled site' % (url_to_crawl)

    print "Finished processing children of %s" % (url)
def find_matching_links(br, target_word, result, visited):
    if (not target_word):
        return result
    else:
        current_URL = urlnorm.norm(br.geturl())
        current_letter = target_word[0].lower()
        if (current_letter.isspace()):
            return find_matching_links(br, target_word[1:], result + [('', -1, ' ')], visited)
        else:
            # Skip the leading "http://" when searching for the letter.
            matching_index = current_URL[7:].find(current_letter)
            if (matching_index == -1):
                return []
            else:
                new_result = result + [(current_URL, matching_index + 7, current_letter)]
                links = list(br.links())
                for link in links:
                    try:
                        link_URL = urlnorm.norm(link.absolute_url)
                        if (link_URL not in visited):
                            br.open(link_URL)
                            new_visited = visited.copy()
                            new_visited.add(link_URL)
                            # print "visiting: " + urlnorm.norm(br.geturl())
                            new_visited.add(urlnorm.norm(br.geturl()))
                            child_result = find_matching_links(br, target_word[1:], new_result, new_visited)
                            if (child_result):
                                return child_result
                    except Exception, e:
                        continue
                # If no outgoing link leads to a full match, fall through and
                # implicitly return None (treated as falsy by the caller).
def fetchOutlinks(ahrefs):
    newOutLinks = set()
    base_url = "https://en.wikipedia.org"
    for a in ahrefs:
        try:
            ahref = a['href'].lower()
            not_parseable_ressources = (".avi", ".mkv", ".mp4", ".jpg", ".jpeg", ".png", ".gif",
                                        ".pdf", ".iso", ".rar", ".tar", ".tgz", ".zip", ".dmg", ".exe")
            if not urlparse.urlparse(ahref).path.endswith(not_parseable_ressources):
                if "wiki" in ahref:
                    if "#" in ahref:
                        # Finding and removing URLs with # in them
                        ahref = ahref[:ahref.find("#")]
                        pass
                    elif "?" in ahref:
                        # Finding and removing URLs with ? in them
                        ahref = ahref[:ahref.find("?")]
                        pass
                    elif ":" in ahref:
                        # Finding and removing URLs with : in them
                        ahref = ahref[:ahref.find(":")]
                        pass
                    elif "//" in ahref:
                        # Finding and removing URLs with // in them
                        ahref = ahref[:ahref.find("//")]
                        pass
                    elif ahref == "/wiki/Main_Page":
                        # Finding and removing URLs of Main page of Wiki
                        pass
                    elif "united" in ahref or "states" in ahref or "u.s" in ahref or "illegal" in ahref \
                            or "immig" in ahref or "donald" in ahref or "trump" in ahref:
                        newUrl = a['href']
                        finalUrl = base_url + newUrl
                        finalUrl = urlnorm.norm(finalUrl).encode("utf-8", "ignore")
                        newOutLinks.add(finalUrl)
                else:
                    if ahref[:2] == "//":
                        # Finding and removing URLs with // in them
                        pass
                    elif "index" in ahref or "youtube" in ahref or "rgu" in ahref or "book" in ahref or "american" in ahref:
                        pass
                    elif "#" in ahref:
                        # Finding and removing URLs with # in them
                        ahref = ahref[:ahref.find("#")]
                        pass
                    elif "united" in ahref or "states" in ahref or "u.s" in ahref or "illegal" in ahref \
                            or "immig" in ahref or "donald" in ahref or "trump" in ahref:
                        newUrl = a['href']
                        finalUrl = newUrl
                        print "outlinks:", finalUrl
                        finalUrl = urlnorm.norm(finalUrl).encode("utf-8", "ignore")
                        newOutLinks.add(finalUrl)
        except KeyError, e:
            pass
    # The original snippet ended without a return; the collected outlinks are
    # presumably handed back to the caller.
    return newOutLinks
def __init__(self, url, previous=None, **info):
    # Apply the simple idempotent optimizations to all urls (no need to
    # ever deal with "HTTP://.."). This means case-sensitivity, and a
    # whole lot of other things that the urlnorm library will do for us.
    # We call this the original url, even though it is a bit of a lie.
    try:
        self.original_url = urlnorm.norm(url)
    except urlnorm.InvalidUrl as e:
        raise urlnorm.InvalidUrl('{}: {}'.format(e, url))

    # For the normalized url that we'll be exposing, remove the
    # fragment, and treat https and http the same.
    url, fragment = urldefrag(self.original_url)
    self.lossy_url_data = {'fragment': fragment}
    if url.startswith('https:'):
        url = 'http' + url[5:]
        self.lossy_url_data.update({'protocol': 'https'})
    self.url = url

    self.set_previous(previous)
    self.info = info
    self.post = None

    # Runtime data
    self.response = None
    self.exception = None
    self.retries = 0
def treat_url(url, logger=None):
    """
    Remove "weird" artifacts from the given URL. Collapse adjacent '.'s, apply '..', etc.

    :param str url: URL to clear.
    :param gluetool.log.ContextAdapter logger: logger to use for logging.
    :rtype: str
    :returns: Treated URL.
    """

    logger = logger or Logging.get_logger()

    logger.debug("treating a URL '{}'".format(url))

    try:
        url = str(urlnorm.norm(url))
    except urlnorm.InvalidUrl as exc:
        # urlnorm cannot handle localhost: https://github.com/jehiah/urlnorm/issues/3
        if exc.message == "host u'localhost' is not valid":
            pass
        else:
            raise exc

    return url.strip()
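# Hypothetical usage sketch for treat_url() above, not part of the original
# module; it assumes urlnorm is installed and gluetool's default logger is
# available, and the expected output is approximate.
#
#   treat_url('HTTP://Example.COM/a/./b/../c')  # -> roughly 'http://example.com/a/c'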
def _canonize(self):
    if self.is_absolute:
        self.canonical_scheme, self.canonical_netloc, self.canonical_path, self.canonical_query, self.canonical_fragment \
            = urlsplit(urlnorm.norm(self.raw))
    else:
        self.canonical_scheme, self.canonical_netloc, self.canonical_path, self.canonical_query, self.canonical_fragment \
            = self.raw_scheme, self.raw_netloc, self.raw_path, self.raw_query, self.raw_fragment

    self.canonical_scheme = self.canonical_scheme.lower()

    if self.raw_scheme.endswith('s'):
        self.canonical_netloc, _ = rstrip_string(self.canonical_netloc, ':443')
    else:
        self.canonical_netloc, _ = rstrip_string(self.canonical_netloc, ':80')
    self.canonical_netloc = self.canonical_netloc.lower()

    self.canonical_path = '' if self.canonical_path == '/' else self.canonical_path

    params = parse_qsl(self.canonical_query, True)
    self.canonical_query_params = [(k, v) for (k, v) in sorted(params)]

    self.canonical = urlunsplit(
        (self.canonical_scheme, self.canonical_netloc, self.canonical_path, self.canonical_query, ''))
def processPage():
    while not urls.counter > urlcount:
        try:
            link = urlpool.get()
            newurl = urlparse.urljoin(link.base_url, link.url)  # Converting relative URLs to absolute ones
            newurl = unicode(urlnorm.norm(newurl))  # Normalizing URL
            print "out: " + newurl
            disassembled = urlparse.urlsplit(newurl)
            filename, file_ext = splitext(basename(disassembled.path))  # Finding file extension for filtering exclusions
            file_ext = file_ext.lower()
            if filename == 'index':
                newurl = newurl[:-len(filename + file_ext)]
            if (file_ext not in excludedExtensions
                    and disassembled.scheme in ['http', 'https']
                    and disassembled.fragment == ''):
                print "in : " + newurl
                if newurl not in visited:  # Checking to see if URL has already been queued once
                    visited.add(newurl)
                    if urlContains(newurl, searchTags) > 0:
                        urls.put(newurl, 1)
                    else:
                        priority = priorityCalculator.searchPage(newurl, searchTags)
                        if priority < len(searchTags) + 1:
                            urls.put(newurl, priority)  # Adding URL to queue with calculated priority
        except UnicodeEncodeError:
            print "UnicodeEncodeError"
        except:
            print "Invalid URL"
def test_invalid_urls(url):
    try:
        output = urlnorm.norm(url)
        print '%r' % output
    except urlnorm.InvalidUrl:
        return
    assert 1 == 0, "this should have raised an InvalidUrl exception"
def canonicalize(url):
    """Canonicalize a URL in just a few easy steps:

        1. Resolve any redirects
        2. Normalize the URL
        3. Strip any superfluous query params
        4. Sort any remaining query params
        5. Profit!

    This relies on the urlnorm module for normalization, and, at the
    moment, just removes utm_* query params.

    TODO: Special case normalization for major sites (e.g. youtube)?
    """
    url = urlnorm.norm(resolve(url))
    url_parts = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = url_parts

    params = []
    for key, value in cgi.parse_qs(query).iteritems():
        if exclude_param(url_parts, key, value):
            continue
        if isinstance(value, list):
            params.extend((key, v) for v in value)
        else:
            params.append((key, value))
    query = urllib.urlencode(sorted(params), doseq=1)

    return urlparse.urlunsplit((scheme, netloc, path, query, ''))
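# Hypothetical usage sketch for canonicalize() above, not part of the original
# module; it assumes resolve() returns the URL unchanged (no redirects) and
# that exclude_param() drops utm_* keys as the docstring describes.
#
#   canonicalize('http://Example.com/a?utm_source=feed&b=2&a=1')
#   # -> roughly 'http://example.com/a?a=1&b=2'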
def task_listener_crawler(gearman_worker, gearman_job):
    url = gearman_job.data
    url_frontier.add(url)
    urls = urlparse.urlparse(url)
    print "Crawling ", url
    # NOTE: the original passed crawler_headers positionally, which requests
    # treats as query params; headers= is assumed to be the intent here.
    response = requests.get(url, headers=crawler_headers)
    print 'Downloaded page'
    if response.status_code == 200:
        raw_data = response.text
        if response.encoding != 'utf8':
            raw_data = response.text.encode(response.encoding).decode('utf8')
        r.table(raw_result_table).insert({'url': url, 'raw': raw_data, 'status': 200},
                                         conflict="replace").run(rethink)
        links = linkregex.findall(raw_data)
        for link in (links.pop(0) for _ in xrange(len(links))):
            pre_norm_url = url_pre_norm(link, urls)
            norm_url = urlnorm.norm(pre_norm_url)
            norm_parts = urlparse.urlparse(norm_url)
            ext_url = norm_parts.path.split(".")[-1].lower()
            if ext_url not in except_url_suffixes and url_frontier.add(norm_url):
                print "Add ", norm_url, " to redis queue"
                redis_client.rpush("urls:enqueued", norm_url)
        print "Done"
        return "ok"
    else:
        r.table(raw_result_table).insert({'url': url, 'status': response.status_code},
                                         conflict="replace").run(rethink)
        return "fail"
def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        norm = urlnorm.norm(url)
        norm, _ = urldefrag(norm)
        return norm.rstrip('/')
    except:
        return None
def normalize_url(url):
    norm_url = urlnorm.norm(url)
    if norm_url.startswith("https://"):
        return norm_url[8:]
    elif norm_url.startswith("http://"):
        return norm_url[7:]
    else:
        return norm_url
def clean(self): """Ensures that URLs are canonized before saving""" self.value = refang(self.value.strip()) try: if re.match(r"[^:]+://", self.value) is None: # if no schema is specified, assume http:// self.value = u"http://{}".format(self.value) self.value = urlnorm.norm(self.value) except urlnorm.InvalidUrl: raise ObservableValidationError("Invalid URL: {}".format(self.value))
def fetch(self, method, endpoint, params):
    api_endpoint = norm(self.api_base + endpoint)
    content = self.oauth.request(
        method,
        api_endpoint,
        params=params,
        headers={'User-Agent': 'Semantics3 Python Lib/0.2'})
    print(content)
    return content
def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        url = urlnorm.norm(url)
        url, _ = urldefrag(url)
        url = url.rstrip("/")
        return url
    except:
        return None
def new(cls, *args, **kwargs):
    obj = cls(*args)
    obj.source = kwargs['source']
    obj.duplicates = 0
    obj.priority = 0
    # normalize url
    if hasattr(obj, 'url'):
        obj.url = urlnorm.norm(obj.url)
    return obj
def googleSearch(searchString):
    g = pygoogle(searchString)
    g.pages = 2
    urls = g.get_urls()
    urls = urls[:10]
    for i in range(len(urls)):
        urls[i] = unicode(urlnorm.norm(urls[i]))
    return urls
def createMetaResources(md5v, dataset):
    with Timer(key='createMetaResources'):
        res = getDistributionAccessURLs(dataset) + getDistributionDownloadURLs(dataset)
        bulk_mr = []
        uris = []
        for uri in res:
            valid = True
            try:
                uri = urlnorm.norm(uri.strip())
            except Exception as e:
                log.debug("URIFormat", uri=uri, md5=md5v, msg=e.message)
                uri = uri
                valid = False

            f = getDistributionFormatWithURL(dataset, uri)
            m = getDistributionMediaTypeWithURL(dataset, uri)
            s = getDistributionSizeWithURL(dataset, uri)
            c = getDistributionCreationDateWithURL(dataset, uri)
            mod = getDistributionModificationDateWithURL(dataset, uri)

            try:
                s_uri = safe_url_string(uri, 'utf-8')
                uri = escape_ajax(s_uri)
            except Exception as exc:
                # NOTE: the original passed md5=md5, an undefined name; md5v is assumed here.
                ErrorHandler.handleError(log, "safe_url_string", exception=exc, md5=md5v, uri=uri, exc_info=True)
                uri = uri

            if uri in uris:
                log.debug("WARNING, duplicate URI", dataset=dataset.id, md5=md5v, uri=uri, format=f, media=m)
                continue

            try:
                s = int(float(s)) if s is not None else None
            except Exception as e:
                s = None

            MR = MetaResource(uri=uri, md5=md5v, media=m, valid=valid, format=normaliseFormat(f),
                              size=s, created=toDatetime(c), modified=toDatetime(mod))
            bulk_mr.append(MR)
            uris.append(uri)
    return bulk_mr
def dl_html(page):
    url = "http://en.wiktionary.org/wiki/%s" % page
    url = urlnorm.norm(url)

    # we should be able to crawl any page from the links we obtained
    # and we're obeying crawling delays here
    response = urllib2.urlopen(url.encode("utf8"), timeout=5)
    time.sleep(config.page_crawl_delay)
    return response.read()
def canonizeurl(url):
    split = urlsplit(urlnorm.norm(url))
    path = split[2].split(" ")[0]
    while path.startswith("/.."):
        path = path[3:]
    while path.endswith("%20"):
        path = path[:-3]
    # qs = urlencode(sorted(parse_qsl(split.query)))
    qs = ""
    return urlunsplit((split.scheme, split.netloc, path, qs, ""))
def __init__(self, url):
    """Construct from a string or Django request."""
    # If given a Django request, work with its full path. The original code
    # ran this check only after normalization, when the value is always a
    # plain string and can never have get_full_path().
    if hasattr(url, 'get_full_path'):
        url = url.get_full_path()
    # NOTE: the original encoded to UTF-16 before normalizing, which produces
    # bytes urlnorm cannot parse; UTF-8 is assumed to be the intent.
    nurl = urlnorm.norm(url.encode('utf-8').lower())
    self.scheme, self.netloc, self.path, self.params, \
        self.query, self.fragment = urlparse.urlparse(nurl)
    filename, self.ftype = os.path.splitext(self.path)
    self.args = dict(cgi.parse_qsl(self.query))
def normalize_url(url, path=None):
    try:
        if path:
            url = urljoin(url, path)
        url = urlnorm.norm(url)
        # force HTTP protocol
        if url.startswith('http'):
            return url
    except urlnorm.InvalidUrl:
        pass
def dl_xml(params):
    url = "http://en.wiktionary.org/w/api.php?format=xml"
    for key, val in params.iteritems():
        url += "&%s=%s" % (key, val)
    url = urlnorm.norm(url)

    # We're permitted to crawl any page with the API regardless
    # of robots.txt since we're using the API
    response = urllib2.urlopen(url.encode("utf8"), timeout=5)
    time.sleep(config.api_crawl_delay)
    return response.read()
def getImage(self, opener, url, data, wait_time):
    """
    Directly get an Image using URLLib. Errors Must be handled.

    *Optional Parameters*

    :param opener: urllib opener to use (use GetPage for setup)
    :param url: url address to use
    :param data: data to use in request (like that passed to urlencode)
    :param wait_time: time to wait for request
    """
    return opener.open(urlnorm.norm(url), data, wait_time).read()
def clean(self): """Ensures that URLs are canonized before saving""" self.value = refang(self.value.strip()) try: if re.match(r"[^:]+://", self.value) is None: # if no schema is specified, assume http:// self.value = u"http://{}".format(self.value) self.value = urlnorm.norm(self.value) self.parse() except urlnorm.InvalidUrl: raise ObservableValidationError("Invalid URL: {}".format(self.value)) except UnicodeDecodeError: raise ObservableValidationError("Invalid URL (UTF-8 decode error): {}".format(self.value))
def getFileName(self, url, folder=None):
    url_norm = urlnorm.norm(url.strip())
    url_fname = urllib.quote_plus(url_norm)
    if folder:
        submit_path = os.path.join(self.submit_folder[folder], url_fname)
        if os.path.exists(submit_path):
            return os.readlink(submit_path)
    else:
        for f in self.submit_folder:
            submit_path = os.path.join(self.submit_folder[f], url_fname)
            if os.path.exists(submit_path):
                return os.readlink(submit_path)
    return None
def extract_links(body):
    links = []
    for link in HTML_TAG_PATTERN.findall(body):
        try:
            link = link[2]
            netloc = urlparse.urlparse(link).netloc
            if (netloc in domains_of_interest):
                link = urlnorm.norm(link)
                links.append(link)
        except:
            pass
    return links
def _clean_url(self, url):
    """
    Canonicalizes the url, as it is done in Scrapy, and keeps only
    USEFUL_QUERY_KEYS. It also strips the trailing slash to help
    identify dupes.
    """
    # TODO: Turn this into regex
    if not url.startswith('http') or url.endswith('}}') or 'nojs_router' in url:
        return None

    if site(norm(url).lower()) in config.NONCANONIC_SITES:
        clean_url = canonicalize_url(url, keep_params=True)
    else:
        clean_url = canonicalize_url(url)

    return clean_url
def normalize_canonical_url(url, use_url_norm=True):
    try:
        if use_url_norm:
            url = urlnorm.norm(url)
        scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
        host = urlparse.urlunparse((scheme, netloc, '', '', '', ''))
        path = urlparse.urlunparse(('', '', path, params, query, fragment))
        path = shebang_regex.sub('/', path)
        url = host + path
        return url.rstrip('/')
    except Exception:
        return None
def normalize(self, url):
    parsed = urlparse(url.encode('utf-8'))
    if '//' not in url:
        # no scheme supplied; assume http:// and re-parse
        url = '%s%s' % ('http://', url)
        parsed = urlparse(url)
    # NOTE: the original condition was `parsed.scheme == "http" or "https"`,
    # which is always truthy; a membership test is the apparent intent.
    if parsed.scheme in ("http", "https"):
        try:
            normalized_url = urlnorm.norm(url)
        except:
            return None
        return normalized_url
    else:
        return None
def normalize(self): self.value = refang(self.value) try: if re.match(r"[^:]+://", self.value) is None: # if no schema is specified, assume http:// self.value = u"http://{}".format(self.value) self.value = urlnorm.norm(self.value).replace(' ', '%20') self.parse() except urlnorm.InvalidUrl: raise ObservableValidationError("Invalid URL: {}".format( self.value)) except UnicodeDecodeError: raise ObservableValidationError( "Invalid URL (UTF-8 decode error): {}".format(self.value))
def getImageSpynner(self, baseurl, spynner, iser, wait_time, proxy):
    """
    Directly get an Image with Spynner.

    *Required Parameters*

    :param baseurl: base url to use with link (a blank string is nothing)
    :param spynner: spynner instance
    :param iser: selector for image
    :param wait_time: time to wait in acquiring an image
    :param proxy: String proxy
    """
    br = spynner
    print "Downloading..." + str(iser["src"])
    return br.download(urlnorm.norm(baseurl + iser["src"]), outfd=None, timeout=wait_time, proxy_url=proxy)
def normalize(self): self.value = refang(self.value) try: if re.match(r"[^:]+://", self.value) is None: # if no schema is specified, assume http:// self.value = u"http://{}".format(self.value) self.value = urlnorm.norm(self.value).replace(' ', '%20') self.parse() except urlnorm.InvalidUrl: raise ObservableValidationError( "Invalid URL: {}".format(self.value)) except UnicodeDecodeError: raise ObservableValidationError( "Invalid URL (UTF-8 decode error): {}".format(self.value))
def parse_url(text):
    """Clean and verify a URL."""
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    url = stringify(text)
    if url is not None:
        if url.startswith('//'):
            url = 'http:' + url
        elif '://' not in url:
            url = 'http://' + url
        try:
            norm = urlnorm.norm(url)
            norm, _ = urldefrag(norm)
            return norm
        except:
            return None
    return None
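# Hypothetical usage sketch for parse_url() above (not from the original module);
# it assumes stringify() returns plain str input unchanged, and the result is approximate.
#
#   parse_url('example.com/page#section')  # -> roughly 'http://example.com/page'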
def norm_url(url):
    url = uni(url).encode('utf-8')
    try:
        return urlnorm.norm(url)
    except urlnorm.InvalidUrl:
        # Happens when the URL is relative. Call path normalization directly.
        try:
            return urlnorm.norm_path('', url)
        except UnicodeDecodeError:
            return url
    except UnicodeDecodeError:
        # work around for bug in urlnorm on unicode url
        return url
    except:
        traceback.print_exc()
        return None
def cert_chain_url_valid(cert_url):
    """
    Ensure that the provided URL for the certificate chain is valid, by
    checking that:

    * it's HTTPS
    * the host is s3.amazonaws.com
    * the port, if specified, is 443
    * the path starts with '/echo.api/'
    """
    normalized = urlnorm.norm(cert_url)
    parsed = urlparse.urlparse(normalized)
    url_checks = {
        'scheme': parsed.scheme == 'https',
        'hostname': parsed.hostname == 's3.amazonaws.com',
        'port': parsed.port in (443, None),
        'path': parsed.path.startswith('/echo.api/'),
    }
    all_checks_pass = all(url_checks.values())
    return all_checks_pass
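# Hypothetical usage sketch for cert_chain_url_valid() above (not part of the
# original module); it relies only on urlnorm and the stdlib urlparse that the
# function already uses.
#
#   cert_chain_url_valid('https://s3.amazonaws.com/echo.api/echo-api-cert.pem')  # -> True
#   cert_chain_url_valid('http://s3.amazonaws.com/echo.api/echo-api-cert.pem')   # -> False (not HTTPS)
#   cert_chain_url_valid('https://s3.amazonaws.com:8443/echo.api/cert.pem')      # -> False (wrong port)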
def _prepareURL(self, apiQueryURI):
    """
    If the URI (actually just a partial URL, usually the path part) doesn't
    begin with the base URL for the API, concatenate the two into a new URL
    and return it.

    :param apiQueryURI: URI (actually, just a partial URL, usually the path
        part) for an API entry point.
    :type apiQueryURI: str

    :return: URL for the API query, ready for use
    :rtype: str
    """
    assert isinstance(apiQueryURI, str)
    assert not util.stringContainsAllCharacters(apiQueryURI, '{}'), \
        'apiQueryURI contains unformatted arguments: "%s"' % apiQueryURI

    if apiQueryURI.startswith(self.apiBaseURL):
        return apiQueryURI

    return urlnorm.norm(self.apiBaseURL + '/' + apiQueryURI)
def extract_urls(text, regex):
    results = dict()
    for i in regex.finditer(text):
        # Bind the raw match before the try block so the except handlers can
        # always log the URL that failed.
        url = i.group(1).strip()
        try:
            url = urlnorm.norm(url)
            url_parsed = url_parser(url)
            if results.get(url_parsed.host):
                results[url_parsed.host].add(url)
            else:
                # NOTE: the original used set(url), which builds a set of the
                # URL's individual characters; a one-element set is intended.
                results[url_parsed.host] = set([url])
            log.debug("Parsed domain: {}".format(url_parsed.host))
        except urlnorm.InvalidUrl:
            log.warning("Parsing invalid url: {}".format(url))
        except:
            log.exception("Failed parsing url: {}".format(url))
    return results
def on_data(self, data):
    tweet_data = json.loads(data)
    if 'limit' in tweet_data:
        print("Limit:" + str(tweet_data["limit"]))
    else:
        # insert into tweet db
        tweet = tweet_data["text"]
        username = tweet_data["user"]["screen_name"]
        #lat = tweet_data[]
        #long = tweet_data[]
        c.execute("INSERT INTO tweet (time, username, tweet) VALUES (%s,%s,%s)",
                  (time.time(), username, tweet))
        tweet_id = c.lastrowid

        # insert full urls into DB
        for url in tweet_data["entities"]["urls"]:
            # process URL
            norm_url = urlnorm.norm(url["expanded_url"])
            norm_url_tuple = urlparse.urlparse(norm_url)
            # unshorten URLs for common URL minimizer services
            if norm_url_tuple[1] in URL_SHORTENERS:
                norm_url = unshorten_url(norm_url)
                norm_url_tuple = urlparse.urlparse(norm_url)
            md5_url = hashlib.md5()
            md5_url.update(norm_url.encode("utf-8"))
            c.execute("INSERT INTO url (url, domain, url_hash) VALUES (%s,%s,%s)",
                      (norm_url, norm_url_tuple[1], md5_url.hexdigest()))
            url_id = c.lastrowid
            c.execute("INSERT INTO tweet_urls (tweet_id, url_id) VALUES (%s,%s)",
                      (tweet_id, url_id))

        conn.commit()
        self.tweet_count += 1
        if self.tweet_count % 1000 == 0:
            print self.tweet_count
    return True
def scrape(self):
    stories = self._scrape()
    # If we've scraped the same canonical URL twice, we will just choose the
    # first one. Collect the keepers in a new list rather than calling
    # stories.remove() inside the loop, which skips elements while iterating.
    urls = set()
    deduped = []
    for story in stories:
        try:
            url = urlnorm.norm(story.url)
        except:
            # If we've scraped a bad UTF-8 character here, this might fail
            url = story.url
        if url in urls:
            continue
        urls.add(url)
        story.url = url
        story.title = story.title.strip()
        deduped.append(story)
    return deduped
def storeURL(url, path, max_file_size):
    # download URL and send fileID
    log.debug("downloading url", url=url, max_file_size=max_file_size)
    try:
        r = requests.get(url, stream=True)
        size = 0
        ctt = StringIO()
        sig = hashlib.md5()
        for chunk in r.iter_content(2048):
            size += len(chunk)
            ctt.write(chunk)
            sig.update(chunk)
            if size > max_file_size:
                r.close()
                raise RequestEntityTooLarge()

        md5 = sig.hexdigest()
        ctt.seek(0)
        fpath = os.path.join(path, md5)
        if os.path.exists(fpath):
            print 'file exists', fpath
            return md5

        log.debug("storing url", url=url, file=fpath)
        with open(fpath, 'w') as fd:
            t = ctt.read(1048576)
            while t:
                fd.write(t)
                t = ctt.read(1048576)

        url_norm = urlnorm.norm(url.strip())
        url_fname = urllib.quote_plus(url_norm)
        f = os.path.join(path, url_fname)
        os.symlink(fpath, f)
        log.debug("url stored", url=url, file=fpath)
        return md5
    except Exception as e:
        raise e
def save_link(cls, title, url, body="", tags=[], clicks=0, unread=True): url = norm(url) id = mmh3.hash(url) key = ndb.Key(LinkModel, id) domain = urlparse(url).netloc if len(domain) > 4 and domain.startswith('www.'): domain = domain[4:] link = LinkModel(key=key, title=title, url=url, domain=domain, body=body, tags=tags, clicks=clicks, unread=unread) link.put() id = str(link.id) doc = cls._buildDoc(id, title, body, domain, tags) cls.add(doc) return cls(doc, link)
def main(): print "\n.: BUCKLEGRIPPER v0.1 https://github.com/hadojae/DATA :." parser = argparse.ArgumentParser(description='Visit a suspected phishing page, screenshot it and pillage it for phishing archives') parser.add_argument('-u','--url', help='Url to visit',required=False,default=False) parser.add_argument('-s','--source', help='Apply a source to where this url came from',required=False,default="bucklegripper") parser.add_argument('-r','--readfile', help='Read in a file of URLs one per line',required=False,default=False) parser.add_argument('-a','--useragent', help='Custom User-Agent',required=False,default="Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36") args = parser.parse_args() user_agent = args.useragent full = args.url source = args.source readfile = args.readfile if full == False and readfile == False: print bcolors.FAIL + "\n[-] You have to enter either a url with '-u' to analyze or specify a file with urls in it with '-r'\n" + bcolors.ENDC sys.exit() # "setup fake ua for urllib2 requests" headers = { 'User-Agent' : user_agent } if readfile == False: mainloop(full, headers, user_agent, source) sys.exit() else: print "\n[+] Beginning processing of " + readfile with open(readfile) as f: content = f.readlines() for line in content: #catch bad url try: full = urlnorm.norm(line).strip('\n') except Exception: print bcolors.FAIL + "[-] " + line + " is a Malformed URI" + bcolors.ENDC continue mainloop(full, headers, user_agent, source) print "\n[+] Finished processing " + readfile + '\n' sys.exit()
def enqueue(self, url, *args):
    # Each entry also stores an 'explored' flag; links that have not been
    # explored yet can still have their priority averaged with new scores.
    normalizedURL = url
    if (len(args) != 1):
        crawlerLogger.error("Expected exactly one priority argument but more were supplied")
    priority = args[0]
    try:
        normalizedURL = urlnorm.norm(url)
        if (normalizedURL not in self._linkDict):
            self._linkDict[normalizedURL] = (priority, False)
        else:
            # Average the two scores if found
            prevPriority, explored = self._linkDict[normalizedURL]
            if (not explored):
                self._linkDict[normalizedURL] = ((prevPriority + priority) / 2, False)
    except Exception as e:
        crawlerLogger.warn("Normalization Issues. Not Enqueing " + url)
    self._buildHeap()