def consolidationThread():
    global mydata
    global remoteHosts
    print "Starting consolidationThread"
    while True:
        for host in remoteHosts:
            try:
                host = "//" + host  # must be cleaner way to do this
                remote_address = (urlparse(host).hostname, urlparse(host).port)
                #print "Trying: ", remote_address
                sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                sock.settimeout(5)  # could be longer if not on local network
                sock.connect(remote_address)
                sock.sendall("{\"numberOfRecords\":1,\"version\":1}\n")
                data = sock.recv(1024)
                decoded_data = json.loads(data)
                if int(decoded_data['RelativeTime']) < ((int(time.time()) * 1000) - int(mydata['CaptureDateTime'])):
                    #print "Received NEWER: ", data
                    print "NEWEST FROM: ", remote_address
                    mydata = decoded_data
                sock.close()
            except Exception, e:
                print e, remote_address
        time.sleep(10)
def vk(url):
    try:
        try:
            oid, id = urlparse.parse_qs(urlparse.urlparse(url).query)['oid'][0], urlparse.parse_qs(urlparse.urlparse(url).query)['id'][0]
        except:
            oid, id = re.compile('\/video(.*)_(.*)').findall(url)[0]
        try:
            hash = urlparse.parse_qs(urlparse.urlparse(url).query)['hash'][0]
        except:
            hash = vk_hash(oid, id)

        u = 'http://api.vk.com/method/video.getEmbed?oid=%s&video_id=%s&embed_hash=%s' % (oid, id, hash)

        result = client.request(u)
        result = re.sub(r'[^\x00-\x7F]+', ' ', result)

        try:
            result = json.loads(result)['response']
        except:
            result = vk_private(oid, id)

        url = []
        try:
            url += [{'quality': 'HD', 'url': result['url720']}]
        except:
            pass
        try:
            url += [{'quality': 'SD', 'url': result['url540']}]
        except:
            pass
        try:
            url += [{'quality': 'SD', 'url': result['url480']}]
        except:
            pass
        if not url == []:
            return url
        try:
            url += [{'quality': 'SD', 'url': result['url360']}]
        except:
            pass
        if not url == []:
            return url
        try:
            url += [{'quality': 'SD', 'url': result['url240']}]
        except:
            pass
        if not url == []:
            return url
    except:
        return
def _format_url(self, url):
    if not url:
        return self.root_url
    dir_sep = self.options['directory_separator']
    parsed_url = urlparse(url)
    if not parsed_url.netloc and not parsed_url.scheme:
        url = self.root_url + dir_sep + url.lstrip(dir_sep)
        parsed_url = urlparse(url)
    formatted_url = parsed_url.scheme + '://'
    domain = parsed_url.netloc
    for subdomain in self.options['redundant_subdomains']:
        domain = re.sub(r'^' + subdomain + r'\.', '', domain, flags=re.IGNORECASE)
    formatted_url += domain
    path = parsed_url.path.rstrip(dir_sep)
    formatted_url += path
    return formatted_url
def sitemap_parse(sitemap_option, astring, google_results, website_url):
    not_indexed = []
    not_sitemap = []
    error = ''
    sitemap_results = []
    website_host = urlparse(website_url).scheme
    if website_host != '':
        website_url = urlparse(website_url).scheme + "://" + urlparse(website_url).netloc
    if website_url[-1] != '/':
        website_url += '/'
    if astring != '':
        if sitemap_option == 'sitemap':
            resp = requests.get(astring)
            soup = Soup(resp.content)
        elif sitemap_option == 'upload_sitemap':
            soup = Soup(astring)
        urls = soup.findAll('url')
        for u in urls:
            loc = u.find('loc').string
            sitemap_results.append(loc)
            if loc not in google_results:
                not_indexed.append(loc)
        for loc in google_results:
            if loc not in sitemap_results:
                not_sitemap.append(loc)
    return not_indexed, not_sitemap, error
def __call__(self, url, count_of_crawler):
    """
    Function which fetch the content from the given URL and collect
    all the URL in the content and pass the first url of the page to
    fetch the content.
    """
    try:
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page.read())
        links_on_page = map(lambda anchor: anchor.get('href'), soup.find_all('a'))
        cleaned_url = map(lambda link: link if urlparse(link).scheme and urlparse(url).netloc
                          else (urlparse(url).scheme + "://" + urlparse(url).netloc + link
                                if link[0] == "/" else url + link),
                          links_on_page)
        visited_url.append(url)
        total_collected_url.append(cleaned_url)
        next_url_to_visit = [next_url for next_url in cleaned_url
                             if not next_url in visited_url and not "#" in next_url][0]
        if count_of_crawler and next_url_to_visit:
            count_of_crawler = crawler(next_url_to_visit, count_of_crawler - 1)
    except:
        print "It seems there is some issue in URL " + url
    return count_of_crawler
def local_(masteruri, org_masteruri, uri):
    '''
    Test whether the node runs on the same machine as the ROS master and
    whether ``masteruri`` and ``org_masteruri`` are equal.

    :param masteruri: The URI of the ROS master currently tested.
    :type masteruri: str
    :param org_masteruri: The URI of the ROS master, where the node was originally registered.
    :type org_masteruri: str
    :param uri: The URI of the node.
    :type uri: str
    :rtype: bool
    '''
    result = False
    try:
        from urlparse import urlparse
        om = urlparse(masteruri)
        on = urlparse(uri)
        result = (om.hostname == on.hostname) and (masteruri == org_masteruri)
    except:
        pass
    return result
def ConfigureHostnames(config):
    """This configures the hostnames stored in the config."""
    if flags.FLAGS.external_hostname:
        hostname = flags.FLAGS.external_hostname
    else:
        try:
            hostname = socket.gethostname()
        except (OSError, IOError):
            print "Sorry, we couldn't guess your hostname.\n"

        hostname = RetryQuestion("Please enter your hostname e.g. "
                                 "grr.example.com", "^[\\.A-Za-z0-9-]+$", hostname)

    print """\n\n-=Server URL=-
The Server URL specifies the URL that the clients will connect to
communicate with the server. For best results this should be publicly
accessible. By default this will be port 8080 with the URL ending in /control.
"""
    frontend_url = RetryQuestion("Frontend URL", "^http://.*/$",
                                 "http://%s:8080/" % hostname)
    config.Set("Client.server_urls", [frontend_url])

    frontend_port = urlparse.urlparse(frontend_url).port or config_lib.CONFIG.Get(
        "Frontend.bind_port")
    config.Set("Frontend.bind_port", frontend_port)

    print """\n\n-=AdminUI URL=-:
The UI URL specifies where the Administrative Web Interface can be found.
"""
    ui_url = RetryQuestion("AdminUI URL", "^http[s]*://.*$",
                           "http://%s:8000" % hostname)
    config.Set("AdminUI.url", ui_url)

    ui_port = urlparse.urlparse(ui_url).port or config_lib.CONFIG.Get(
        "AdminUI.port")
    config.Set("AdminUI.port", ui_port)
def test_enketo_remote_server_responses(self): #just in case if we want to shift the testing back to the main server testing_enketo_url = settings.ENKETO_URL #testing_enketo_url = 'http://enketo-dev.formhub.org' form_id = "test_%s" % re.sub(re.compile("\."), "_", str(time())) server_url = "%s/%s" % (self.base_url, self.user.username) enketo_url = '%slaunch/launchSurvey' % testing_enketo_url values = { 'format': 'json', 'form_id': form_id, 'server_url': server_url } data = urllib.urlencode(values) req = urllib2.Request(enketo_url, data) try: response = urllib2.urlopen(req) response = json.loads(response.read()) success = response['success'] if not success and 'reason' in response: fail_msg = "This enketo installation is for use by "\ "formhub.org users only." if response['reason'].startswith(fail_msg): raise SkipTest return_url = response['url'] success = response['success'] self.assertTrue(success) enketo_base_url = urlparse(settings.ENKETO_URL).netloc return_base_url = urlparse(return_url).netloc self.assertIn(enketo_base_url, return_base_url) except urllib2.URLError: self.assertTrue(False) #second time req2 = urllib2.Request(enketo_url, data) try: response2 = urllib2.urlopen(req2) response2 = json.loads(response2.read()) return_url_2 = response2['url'] success2 = response2['success'] reason2 = response2['reason'] self.assertEqual(return_url, return_url_2) self.assertFalse(success2) self.assertEqual(reason2, "existing") except urllib2.URLError: self.assertTrue(False) #error message values['server_url'] = "" data = urllib.urlencode(values) req3 = urllib2.Request(enketo_url, data) try: response3 = urllib2.urlopen(req3) response3 = json.loads(response3.read()) success3 = response3['success'] reason3 = response3['reason'] self.assertFalse(success3) self.assertEqual(reason3, "empty") except urllib2.URLError: self.assertTrue(False)
def checkRedir(self, orig_path):
    # old_url = portal_url+item['_orig_path']  # XXX: referers to target and not portal
    old_url = self.target + orig_path
    # this downloads file. We need a way to do this without the download
    _, host, targetpath, _, _, _ = urlparse.urlparse(self.target)
    if "@" in host:
        auth, host = host.split("@")
    else:
        auth = None
    conn = httplib.HTTPConnection(host)
    headers = {}
    if auth:
        auth = "Basic " + string.strip(base64.encodestring(auth))
        headers["Authorization"] = auth
    # /view is a hack as zope seems to send all content on head request
    conn.request("HEAD", targetpath + orig_path, headers=headers)
    res = conn.getresponse()
    redir = res.status == 301
    if redir and res.getheader("location"):
        _, _, oldpath, _, _, _ = urlparse.urlparse(res.getheader("location"))
        parts = oldpath.split("/")
        if parts[-1] == "view":
            parts = parts[:-1]
        return "/".join(parts)
    if res.status == 200:
        return orig_path
    return None
def _cal_depth(self, url):
    # calculate depth of a given URL, return tuple (url, depth)
    if url.find('#') >= 0:
        url = url[:url.find('#')]  # cut off fragment
    if url.find('?') >= 0:
        url = url[:url.find('?')]  # cut off query string
    if url.startswith('//'):
        return '', 10000  # //www.baidu.com/index.php, ignored
    if not urlparse.urlparse(url, 'http').scheme.startswith('http'):
        return '', 10000  # no HTTP protocol, ignored
    if url.startswith('http'):
        _ = urlparse.urlparse(url, 'http')
        if _.netloc == self.host:  # same hostname
            url = _.path
        else:
            return '', 10000  # not same hostname, ignored
    while url.find('//') >= 0:
        url = url.replace('//', '/')
    if not url:
        return '/', 1  # http://www.example.com
    if url[0] != '/':
        url = '/' + url
    url = url[: url.rfind('/') + 1]
    depth = url.count('/')
    return url, depth
def absolute_url(url, base_href):
    """
    >>> absolute_url('foo', 'http://base/whatever/ooo/fdsh')
    'http://base/whatever/ooo/foo'

    >>> absolute_url('foo/bar/', 'http://base')
    'http://base/foo/bar/'

    >>> absolute_url('/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'

    >>> absolute_url('\\n/foo/bar', 'http://base/whatever/fdskf')
    'http://base/foo/bar'

    >>> absolute_url('http://localhost/foo', 'http://base/whatever/fdskf')
    'http://localhost/foo'
    """
    url = url.strip()
    proto = urlparse(url)[0]
    if proto:
        return url

    base_url_parts = urlparse(base_href)
    base_server = '://'.join(base_url_parts[:2])
    if url.startswith('/'):
        return base_server + url
    else:
        path = base_url_parts[2]
        if '/' in path:
            path = path.rsplit('/', 1)[0] + '/'
        else:
            path = '/'
        return base_server + path + url
def scan_locs_task():
    """Yield a task to calculate the dependencies of the sitemap.

    Other tasks can depend on this output, instead of having
    to scan locations.
    """
    scan_locs()

    # Generate a list of file dependencies for the actual generation
    # task, so rebuilds are triggered. (Issue #1032)
    output = kw["output_folder"]
    file_dep = []

    for i in urlset.keys():
        p = os.path.join(output, urlparse(i).path.replace(base_path, '', 1))
        if not p.endswith('sitemap.xml') and not os.path.isdir(p):
            file_dep.append(p)
        if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
            file_dep.append(p + 'index.html')

    for i in sitemapindex.keys():
        p = os.path.join(output, urlparse(i).path.replace(base_path, '', 1))
        if not p.endswith('sitemap.xml') and not os.path.isdir(p):
            file_dep.append(p)
        if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
            file_dep.append(p + 'index.html')

    return {'file_dep': file_dep}
def getRepStr(self):
    urlList1 = [i for i in self.url1.replace('http://', '').split('/') if i]
    urlList2 = [i for i in self.url2.replace('http://', '').split('/') if i]
    # print urlList1
    # print urlList2
    n = 0
    while True:
        if urlList1[:n] == urlList2[:n]:
            n += 1
            if n > 10:
                break
            continue
        break
    urlPart = 'http://' + '/'.join(urlList1[:n - 1])
    if urlparse(urlPart).netloc and ('.' not in urlparse(urlPart).path):
        urlPart += '/'
    urlListLen = len(urlList1[n - 1:])
    if urlListLen < 1:
        return (urlPart, './')
    if urlListLen >= 1:
        return (urlPart, urlListLen * '../', self.url1, self.url2)
def snapshot(self, newURLs, fetchedURLFragments=[]):
    """ main method that crawls and saves html snapshots by recursive calls """
    if newURLs is None or len(newURLs) == 0:
        return 0
    fetchedURLFragments = fetchedURLFragments[:]  # cloned to avoid mutating the caller's list
    # logging.debug("URLs: %s, fetchedURLFragments: %s" % (newURLs, fetchedURLFragments))
    processedURLCount = 0
    if len(newURLs) == 0:
        return 0
    if self.domain == None:
        self.domain = urlparse(newURLs[0]).netloc
        logging.debug("first URL is '%s', setting valid domain as '%s'" % (newURLs[0], self.domain))
    newURLFragments = [urlparse(newURL).fragment[1:] for newURL in newURLs
                       if newURL.find("#!") != -1 and
                       (urlparse(newURL).netloc == self.domain or urlparse(newURL).netloc == '')]
    logging.debug("found %d valid URLs to process" % (len(newURLFragments)))
    # ignored url path, because we don't need it for now
    # stripped ! character from fragment
    if len(newURLFragments) == 0:
        logging.warn("only URLs with #! hashbang are valid!")
        return 0
    for newURLFragment in newURLFragments:
        if newURLFragment in fetchedURLFragments:
            logging.debug("URL-'%s' was fetched before", newURLFragment)
            continue
        newURL = "http://" + self.domain + "#!" + newURLFragment
        logging.info("fetching URL-'%s'" % (newURL))
        fetchedURLFragments.append(newURLFragment)
        response = os.popen(self.snapshot_cmd % (newURL)).read()
        self.saveResponse(newURLFragment, response)
        foundURLs = self.extractHrefsFromHTML(response)
        self.snapshot(foundURLs, fetchedURLFragments)
        processedURLCount += 1
    return processedURLCount
def __init__(self, layer, mapfile, fonts=None):
    """ Initialize Mapnik provider with layer and mapfile.

        XML mapfile keyword arg comes from TileStache config,
        and is an absolute path by the time it gets here.
    """
    maphref = urljoin(layer.config.dirpath, mapfile)
    scheme, h, path, q, p, f = urlparse(maphref)

    if scheme in ('file', ''):
        self.mapfile = path
    else:
        self.mapfile = maphref

    self.layer = layer
    self.mapnik = None

    engine = mapnik.FontEngine.instance()

    if fonts:
        fontshref = urljoin(layer.config.dirpath, fonts)
        scheme, h, path, q, p, f = urlparse(fontshref)
        if scheme not in ('file', ''):
            raise Exception('Fonts from "%s" can\'t be used by Mapnik' % fontshref)
        for font in glob(path.rstrip('/') + '/*.ttf'):
            engine.register_font(str(font))
def provision():
    if exec_ctx == 'spark_ec2':
        eggo.spark_ec2.provision()
    elif exec_ctx == 'director':
        eggo.director.provision()
    # at this point, get_master() should be valid

    # if the DFS is on the local fs, the directories may need to be created
    url = urlparse(eggo_config.get('dfs', 'dfs_root_url'))
    if url.scheme == 'file':
        local('mkdir -p {0}'.format(url.path))
        url = urlparse(eggo_config.get('dfs', 'dfs_raw_data_url'))
        local('mkdir -p {0}'.format(url.path))
        url = urlparse(eggo_config.get('dfs', 'dfs_tmp_data_url'))
        local('mkdir -p {0}'.format(url.path))

    # tag all the provisioned instances
    if exec_ctx in ['spark_ec2', 'director']:
        conn = connect_to_region(eggo_config.get(exec_ctx, 'region'))
        instances = conn.get_only_instances(
            filters={'key-name': [eggo_config.get('aws', 'ec2_key_pair')]})
        for instance in instances:
            instance.add_tag('owner', getuser())
            instance.add_tag('stack_name', eggo_config.get(exec_ctx, 'stack_name'))
def __init__(self, uri, consumer, extra_headers=None): asyncore.dispatcher_with_send.__init__(self) # turn the uri into a valid request scheme, host, path, params, query, fragment = urlparse.urlparse(uri) # use origin host self.host = host # get proxy settings, if any proxy = self.proxies.get(scheme) if proxy: scheme, host, x, x, x, x = urlparse.urlparse(proxy) assert scheme == "http", "only supports HTTP requests (%s)" % scheme if not path: path = "/" if params: path = path + ";" + params if query: path = path + "?" + query if proxy: path = scheme + "://" + self.host + path self.path = path # get port number try: host, port = host.split(":", 1) port = int(port) except (TypeError, ValueError): port = 80 # default port self.consumer = consumer self.status = None self.header = None self.bytes_in = 0 self.bytes_out = 0 self.content_type = None self.content_length = None self.content_encoding = None self.transfer_encoding = None self.data = "" self.chunk_size = None self.timestamp = time.time() self.extra_headers = extra_headers self.create_socket(socket.AF_INET, socket.SOCK_STREAM) try: self.connect((host, port)) except socket.error: self.consumer.http(0, self, sys.exc_info())
def get_outgoing_url(url):
    """
    Bounce a URL off an outgoing URL redirector, such as
    outgoing.prod.mozaws.net.
    """
    if not settings.REDIRECT_URL:
        return url

    parsed_url = urlparse(url)
    url_netloc = parsed_url.netloc

    # This prevents a link like javascript://addons.mozilla.org...
    # being returned unchanged since the netloc matches the
    # safe list see bug 1251023
    if parsed_url.scheme not in ['http', 'https']:
        return '/'

    # No double-escaping, and some domain names are excluded.
    if (url_netloc == urlparse(settings.REDIRECT_URL).netloc
            or url_netloc in settings.REDIRECT_URL_ALLOW_LIST):
        return url

    url = force_bytes(jinja2.utils.Markup(url).unescape())
    sig = hmac.new(settings.REDIRECT_SECRET_KEY,
                   msg=url, digestmod=hashlib.sha256).hexdigest()
    # Let '&=' through so query params aren't escaped.  We probably shouldn't
    # bother to quote the query part at all.
    return '/'.join([settings.REDIRECT_URL.rstrip('/'), sig,
                     urllib.quote(url, safe='/&=')])
def create(self, dbg, uri, name, description, configuration):
    u = urlparse.urlparse(uri)
    # sometimes a user can believe that a device exists because
    # they've just created it, but they don't realise that the actual
    # device will be created by a queued udev event. Make the client's
    # life easier by waiting for outstanding udev events to complete.
    code = subprocess.call(["udevadm", "settle"])
    # if that fails then log and continue
    if code != 0:
        log.info("udevadm settle exited with code %d" % code)
    p = subprocess.Popen(["mkfs.btrfs", u.path, "-f"],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode != 0:
        raise xapi.storage.api.volume.Unimplemented("mkfs.btrfs failed on %s" % u.path)
    local_uri = self.attach(dbg, uri)
    with open(urlparse.urlparse(local_uri).path + "/.json", "w") as fd:
        meta = {
            "name": name,
            "description": description
        }
        json.dump(meta, fd)
        fd.write("\n")
    self.detach(dbg, local_uri)
    return
def get_object(self):
    user = self.request.user
    manual_redirect_uri = self.request.auth_data.pop('redirect_uri', None)
    manual_redirect_uri = self.get_redirect_uri(manual_redirect_uri)
    if manual_redirect_uri:
        self.request.backend.redirect_uri = manual_redirect_uri
    elif DOMAIN_FROM_ORIGIN:
        origin = self.request.strategy.request.META.get('HTTP_ORIGIN')
        if origin:
            relative_path = urlparse(self.request.backend.redirect_uri).path
            url = urlparse(origin)
            origin_scheme_host = "%s://%s" % (url.scheme, url.netloc)
            location = urljoin(origin_scheme_host, relative_path)
            self.request.backend.redirect_uri = iri_to_uri(location)
    is_authenticated = user_is_authenticated(user)
    user = is_authenticated and user or None
    # skip checking state by setting following params to False
    # it is responsibility of front-end to check state
    # TODO: maybe create an additional resource, where front-end will
    # store the state before making a call to oauth provider
    # so server can save it in session and consequently check it before
    # sending request to acquire access token.
    # In case of token authentication we need a way to store an anonymous
    # session to do it.
    self.request.backend.REDIRECT_STATE = False
    self.request.backend.STATE_PARAMETER = False
    user = self.request.backend.complete(user=user)
    return user
def process_page_links(self, raw_html, url):
    """ simply extracts html links using awesome beautifulsoup """
    beautiful_html = BeautifulSoup(raw_html)
    links = [a.get('href') for a in beautiful_html.find_all('a')]
    links = [link for link in links if link is not None]
    for link in links:
        link_info = urlparse.urlparse(link)
        if not link_info.scheme and not link_info.netloc:
            link = urlparse.urljoin(url, link)
            link_info = urlparse.urlparse(link)
        if 'http' not in link_info.scheme:
            continue
        if self.domain not in link_info.netloc:
            if not self.allow_external:
                continue  # throwing out external link
            else:
                priority = 2  # insert external link with low priority
        else:
            priority = 1
        self.unparsed_urls.add(link, priority)
def parseImgLinks(self, depth=1):
    url_response = None
    try:
        url_response = urllib2.urlopen(self.scrap_url, timeout=self._timeout)
    except Exception as e:
        print(" [ERROR]: Could not open {0}: {1}".format(self.scrap_url, e.reason))
        return self.img_list

    html_parse = BeautifulSoup(url_response)
    unique_images_found = 0
    total_images_found = 0
    self.visited[self.scrap_url] = 1

    for img in html_parse.findAll('img'):
        try:
            abs_url = urljoin(self.scrap_url, img['src']) if urlparse(img['src']).netloc == "" else img['src']
            if abs_url not in self.img_list:
                self.img_list.add(abs_url)
                unique_images_found += 1
            total_images_found += 1
        except:
            pass

    print(" [Found %d images / %d new]: %s" % (total_images_found, unique_images_found, self.scrap_url))

    if depth > 1:
        for a in html_parse.findAll('a'):
            try:
                if (urlparse(a['href']).netloc == "") or (urlparse(self.scrape_url_orig).netloc == urlparse(a['href']).netloc):
                    self.scrap_url = urljoin(self.scrape_url_orig, a['href'])
                    if self.scrap_url in self.visited:
                        continue
                    self.parseImgLinks(depth - 1)
            except:
                pass

    return self.img_list
def test_that_checks_redirect_using_incorrect_query_values(self, base_url):
    param = {
        'product': 'firefox-31.0',
        'lang': 'kitty_language',
        'os': 'stella'
    }
    response = self._head_request(base_url, params=param)

    assert requests.codes.not_found == response.status_code, \
        self.response_info_failure_message(base_url, param, response)

    parsed_url = urlparse(response.url)

    assert 'http' == parsed_url.scheme, \
        'Failed to redirect to the correct scheme. %s' % \
        self.response_info_failure_message(base_url, param, response)

    assert urlparse(base_url).netloc == parsed_url.netloc, \
        self.response_info_failure_message(base_url, param, response)

    assert urlencode(param) == parsed_url.query, \
        self.response_info_failure_message(base_url, param, response)

    assert 'Unknown' != self.get_x_backend_server(response), \
        'Failed, x-backend-server was not in the response object. %s' % \
        self.response_info_failure_message(base_url, param, response)
def get_show(self, imdb, tvdb, show, show_alt, year):
    try:
        query = self.search_link
        post = urllib.urlencode({'searchquery': show, 'searchin': '2'})

        result = ''
        links = [self.link_1, self.link_3]
        for base_link in links:
            result = client.source(urlparse.urljoin(base_link, query), post=post, headers=self.headers)
            if 'widget search-page' in str(result):
                break

        result = client.parseDOM(result, "div", attrs={"class": "widget search-page"})[0]
        result = client.parseDOM(result, "td")

        shows = [cleantitle.tv(show), cleantitle.tv(show_alt)]
        years = ['(%s)' % str(year), '(%s)' % str(int(year)+1), '(%s)' % str(int(year)-1)]
        result = [(client.parseDOM(i, "a", ret="href")[-1], client.parseDOM(i, "a")[-1]) for i in result]
        result = [i for i in result if any(x == cleantitle.tv(i[1]) for x in shows)]
        result = [i[0] for i in result if any(x in i[1] for x in years)][0]

        url = client.replaceHTMLCodes(result)
        try:
            url = urlparse.parse_qs(urlparse.urlparse(url).query)['u'][0]
        except:
            pass
        url = urlparse.urlparse(url).path
        url = url.encode('utf-8')
        return url
    except:
        return
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []

        if url == None:
            return sources

        url = urlparse.urljoin(self.base_link, url)

        r = proxy.request(url, 'tv shows')

        links = client.parseDOM(r, 'a', ret='href', attrs={'target': '.+?'})
        links = [x for y, x in enumerate(links) if x not in links[:y]]

        for i in links:
            try:
                url = i
                url = proxy.parse(url)
                url = urlparse.parse_qs(urlparse.urlparse(url).query)['r'][0]
                url = url.decode('base64')
                url = client.replaceHTMLCodes(url)
                url = url.encode('utf-8')

                host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0]
                if not host in hostDict:
                    raise Exception()
                host = host.encode('utf-8')

                sources.append({'source': host, 'quality': 'SD', 'language': 'en', 'url': url, 'direct': False, 'debridonly': False})
            except:
                pass

        return sources
    except:
        return sources
def get_download_url_ssl(self):
    """
    SSL-enabled links should be used for the specific versions,
    except the Windows stub installers.
    """
    # SSL-enabled links won't be used for 26.0
    url = firefox_details.get_download_url("OS X", "pt-BR", "26.0")
    self.assertListEqual(
        parse_qsl(urlparse(url).query),
        [("product", "firefox-26.0"), ("os", "osx"), ("lang", "pt-BR")]
    )

    # SSL-enabled links won't be used for 27.0 Windows builds (but SSL
    # download is enabled by default for stub installers)
    url = firefox_details.get_download_url("Windows", "pt-BR", "27.0")
    self.assertListEqual(
        parse_qsl(urlparse(url).query),
        [("product", "firefox-27.0"), ("os", "win"), ("lang", "pt-BR")]
    )

    # SSL-enabled links will be used for 27.0 OS X builds
    url = firefox_details.get_download_url("OS X", "pt-BR", "27.0")
    self.assertListEqual(
        parse_qsl(urlparse(url).query),
        [("product", "firefox-27.0-SSL"), ("os", "osx"), ("lang", "pt-BR")]
    )

    # SSL-enabled links will be used for 27.0 Linux builds
    url = firefox_details.get_download_url("Linux", "pt-BR", "27.0")
    self.assertListEqual(
        parse_qsl(urlparse(url).query),
        [("product", "firefox-27.0-SSL"), ("os", "linux"), ("lang", "pt-BR")]
    )
def mangle_url(self, url):
    self.check_connection()

    try:
        endpoint_url = urlparse.urlparse(url)
    except Exception as e:
        script_unknown("you must provide an endpoint_url in the form"
                       + "<scheme>://<url>/ (%s)\n" % e)

    scheme = endpoint_url.scheme
    if scheme is None:
        script_unknown("you must provide an endpoint_url in the form"
                       + "<scheme>://<url>/ (%s)\n" % e)

    catalog_url = None
    try:
        catalog_url = urlparse.urlparse(
            self.nova_client.client.management_url)
    except Exception as e:
        script_unknown("unknown error parsing the catalog url : %s\n" % e)

    port = endpoint_url.port
    if port is None:
        if catalog_url.port is None:
            port = 8774
        else:
            port = catalog_url.port
    netloc = "%s:%i" % (endpoint_url.hostname, port)

    url = urlparse.urlunparse([scheme, netloc, catalog_url.path,
                               catalog_url.params, catalog_url.query,
                               catalog_url.fragment])
    self.nova_client.client.set_management_url(url)
def generateUrls(url):
    baseulp = urlparse(url)
    host = baseulp.netloc

    paths = getCrawlerPaths(url)
    #pprint(paths)

    urls = []
    rulefile = BASEDIR + '/lib/db/compresed_file.rule'
    for eachpath in paths:
        eachulp = urlparse(eachpath)
        if eachulp.path == '':
            host = eachulp.netloc
            domain = GetFirstLevelDomain(host)
            args = {'host': host, 'com': domain}
        else:
            pos = eachulp.path.rfind('/')
            tmp = eachulp.path[pos+1:]
            args = {'com': tmp}

        rf = RuleFile(rulefile, args)
        rf._getRules()
        for i in rf.ret:
            urls.append(eachpath + '/' + i)

    ret = list(set(urls))
    ret.sort()
    return ret
def get_all_href_list(root_my_url, soup, file_encode):
    root_parse = urlparse.urlparse(root_my_url.get_abs_url())
    href_list = []
    if not root_parse.hostname:
        return href_list
    # get tags' href
    tag_list = soup.find_all(['a', 'img', 'link'])
    href_filter = r'#|\n|(mailto:)'
    for tag in tag_list:
        add_my_url = DownloadUrl(None, None, root_my_url.get_abs_path())
        if tag.get('href') and not re.search(href_filter, tag.get('href')):
            add_my_url.url = tag.get('href')
        elif tag.get('src'):
            add_my_url.url = tag.get('src')
        if add_my_url.url:
            temp_parse = urlparse.urlparse(add_my_url.url)
            if temp_parse.hostname:
                add_my_url.host = temp_parse.hostname
            else:
                add_my_url.host = root_parse.hostname
            href_list.append(add_my_url)
    return href_list
def do_POST(self):
    try:
        refer = self.headers.getheader('Referer')
        netloc = urlparse.urlparse(refer).netloc
        if not netloc.startswith("127.0.0.1") and not netloc.startswith("localhost"):
            xlog.warn("web control ref:%s refuse", netloc)
            return
    except:
        pass

    xlog.debug('GAEProxy web_control %s %s %s ', self.address_string(), self.command, self.path)

    try:
        ctype, pdict = cgi.parse_header(self.headers.getheader('content-type'))
        if ctype == 'multipart/form-data':
            self.postvars = cgi.parse_multipart(self.rfile, pdict)
        elif ctype == 'application/x-www-form-urlencoded':
            length = int(self.headers.getheader('content-length'))
            self.postvars = urlparse.parse_qs(self.rfile.read(length), keep_blank_values=1)
        else:
            self.postvars = {}
    except:
        self.postvars = {}

    path = urlparse.urlparse(self.path).path
    if path == '/deploy':
        return self.req_deploy_handler()
    elif path == "/config":
        return self.req_config_handler()
    elif path == "/scan_ip":
        return self.req_scan_ip_handler()
    elif path.startswith("/importip"):
        return self.req_importip_handler()
    else:
        self.wfile.write(b'HTTP/1.1 404\r\nContent-Type: text/plain\r\nConnection: close\r\n\r\n404 Not Found')
        xlog.info('%s "%s %s HTTP/1.1" 404 -', self.address_string(), self.command, self.path)
__PathdirtyDataFile = '/home/kostas/AraxniProject/scripts/spiderOutput.csv' __PathcleanDataFile = '/home/kostas/AraxniProject/scripts/sqlErrorCheck/input.txt' #pass to sqlErrorCheck.py goodLinks = [] index = 0 idx=0 print 'Starting IceFilter, long waiting time process.', sys.stdout.flush() for inputLink in open(__PathdirtyDataFile,'r').readlines(): regexBanned = re.search(r'google|facebook.com|youtube.com|yahoo.com|baidu.com|wikipedia.org|live.com|twitter.com|qq.com|msn.com|yahoo.co.jp|linkedin.com|taobao.com|google.co.in|sina.com.cn|amazon.com|wordpress.com|google.com.hk|google.de|bing.com|google.co.uk|yandex.ru|ebay.com|163.com|google.co.jp|google.fr|microsoft.com|paypal.com|google.com.br|mail.ru|craigslist.org|fc2.com|google.it|apple.com|google.es|imdb.com|google.ru|weibo.com|vkontakte.ru|sohu.com|bbc.co.uk|ask.com|tumblr.com|livejasmin.com|xvideos.com|go.com|youku.com|bp.blogspot.com|cnn.com|soso.com|google.ca|aol.com|tudou.com|xhamster.com|ifeng.com|megaupload.com|mediafire.com|zedo.com|ameblo.jp|pornhub.com|google.co.id|godaddy.com|adobe.com|about.com|rakuten.co.jp|espn.go.com|alibaba.com|conduit.com|ebay.de|4shared.com|wordpress.org|livejournal.com|google.com.mx|google.com.tr|livedoor.com|yieldmanager.com|google.com.au|blogger.com|youporn.com|renren.com|cnet.com|uol.com.br|google.pl|myspace.com|ebay.co.uk|chinaz.com|nytimes.com|thepiratebay.org|doubleclick.com',inputLink) if regexBanned is None: flagStepOver = False for index,goodLink in enumerate(goodLinks): inputLinkNetloc = urlparse(inputLink).netloc goodLinkNetloc = urlparse(goodLink).netloc if inputLinkNetloc == goodLinkNetloc: goodlinkQuery = urlparse(goodLink).query inputLinkQuery = urlparse(inputLink).query if len(inputLinkQuery) > len(goodlinkQuery): #replace link idx = idx - 1 del goodLinks[idx] goodLinks.append(inputLink) print '[',inputLink,'] inserting/replacing...' idx = idx + 1 flagStepOver = True #gia na min ektelesti to if sto telos else: #print ' (len)inputLinkQuery < (len)goodlinkQuery DO NOTHING' flagStepOver = True if flagStepOver == False:
def get_canonical_string(self, url, headers, method): parsedurl = urlparse(url) objectkey = parsedurl.path[1:] query_args = sorted(parsedurl.query.split('&')) bucket = parsedurl.netloc[:-len(self.service_base_url)] if len(bucket) > 1: # remove last dot bucket = bucket[:-1] interesting_headers = { 'content-md5': '', 'content-type': '', 'date': '' } for key in headers: lk = key.lower() try: lk = lk.decode('utf-8') except: pass if headers[key] and (lk in interesting_headers.keys() or lk.startswith('x-amz-')): interesting_headers[lk] = headers[key].strip() # If x-amz-date is used it supersedes the date header. if not py3k: if 'x-amz-date' in interesting_headers: interesting_headers['date'] = '' else: if 'x-amz-date' in interesting_headers: interesting_headers['date'] = '' buf = '%s\n' % method for key in sorted(interesting_headers.keys()): val = interesting_headers[key] if key.startswith('x-amz-'): buf += '%s:%s\n' % (key, val) else: buf += '%s\n' % val # append the bucket if it exists if bucket != '': buf += '/%s' % bucket # add the objectkey. even if it doesn't exist, add the slash buf += '/%s' % objectkey params_found = False # handle special query string arguments for q in query_args: k = q.split('=')[0] if k in self.special_params: buf += '&' if params_found else '?' params_found = True try: k, v = q.split('=', 1) except ValueError: buf += q else: # Riak CS multipart upload ids look like this, `TFDSheOgTxC2Tsh1qVK73A==`, # is should be escaped to be included as part of a query string. # # A requests mp upload part request may look like # resp = requests.put( # 'https://url_here', # params={ # 'partNumber': 1, # 'uploadId': 'TFDSheOgTxC2Tsh1qVK73A==' # }, # data='some data', # auth=S3Auth('access_key', 'secret_key') # ) # # Requests automatically escapes the values in the `params` dict, so now # our uploadId is `TFDSheOgTxC2Tsh1qVK73A%3D%3D`, # if we sign the request with the encoded value the signature will # not be valid, we'll get 403 Access Denied. # So we unquote, this is no-op if the value isn't encoded. buf += '{key}={value}'.format(key=k, value=unquote(v)) return buf
def get_host_ip_addr():
    xs_url = urlparse.urlparse(FLAGS.xenapi_connection_url)
    return xs_url.netloc
def parse_qs():
    url = 'https://www.google.com.hk/#newwindow=1&safe=strict&q=bekk-garch%E6%A8%A1%E5%9E%8B'
    result = urlparse.urlparse(url)
    print(result)
    pass
def signOut():
    print 'Signing out user'
    response.delete_cookie("account")
    redirect(urlparse.parse_qs(urlparse.urlparse(request.url).query)['redirectUrl'][0])
def getSignin():
    try:
        redirectUrl = urlparse.parse_qs(urlparse.urlparse(request.url).query)['redirectUrl'][0]
    except:
        redirectUrl = 'home'
    return bottle.template('signin', redirectUrl=redirectUrl)
def getRegistration():
    try:
        redirectUrl = urlparse.parse_qs(urlparse.urlparse(request.url).query)['redirectUrl'][0]
    except:
        redirectUrl = 'home'
    return bottle.template('register', redirectUrl=redirectUrl)
def get_data(self, already_downloaded_sources): """ Descarga los archivos webbug.log y almacena las detecciones en el archivo wb-[servidor]-[nombreusuario]-events.log """ self.fetched_data = [] if not self.enabled: return False for ftpsite in self.__webbug_log: addr = urlparse.urlparse(ftpsite) url_downloaded = [ downloaded[0] for downloaded in already_downloaded_sources ] sremoteaddr = addr.scheme + "://" + addr.hostname + addr.path if url_downloaded.count( sremoteaddr ): # Si el fichero ya ha sido descargado por otro plugin, se copia already_downloaded_file = already_downloaded_sources[ url_downloaded.index(sremoteaddr)] self.fetched_data.append(already_downloaded_file) self.__logger.info('Reusado %s de %s descargado recientemente', addr.path, addr.hostname) else: # Si no es así, se descarga filename = "temp/wb-" + addr.hostname + '-' + self.person + "-access.log" localfile = open(filename, 'wb') try: connftp = ftplib.FTP(addr.hostname) connftp.login(addr.username, addr.password) connftp.retrbinary('RETR ' + addr.path, localfile.write) self.fetched_data.append([ addr.scheme + "://" + addr.hostname + addr.path, filename ]) self.__logger.info('Descargado webbug.log de %s', addr.hostname) void_file = open("temp/void_file", 'wb') void_file.close() void_file = open("temp/void_file", "rb") connftp.storbinary("STOR " + addr.path, void_file) # Vaciamos el webbug.log void_file.close() connftp.quit() localfile.close() except: self.__logger.error( 'Error conectando al servidor [%s] de %s', addr.hostname, self.person) localfile.close() line_parser = apache_log_parser.make_parser( self.__webbug_log_format.decode('string_escape')) for fdownloaded in self.fetched_data: ftemp_accesslog = open(fdownloaded[1], 'r') addr = urlparse.urlparse(fdownloaded[0]) fevents_filename = "data/wb-" + addr.hostname + '-' + self.person + "-events.log" try: # lee la fecha y hora de la última linea, si es que existe el archivo with open(fevents_filename, "rb") as sf: sfirstline = sf.readline() sf.seek(-2, 2) try: while sf.read(1) != "\n": sf.seek(-2, 1) slastline = sf.readline() except IOError: slastline = sfirstline last_event_logged = line_parser(slastline) last_event_logged_time = last_event_logged[ 'time_received_datetimeobj'] except IOError: last_event_logged_time = datetime(1, 1, 1, 1, 1, 1) fevents = open(fevents_filename, 'a') while True: linea = ftemp_accesslog.readline() if not linea: break log_line_data = line_parser(linea) referer_url = log_line_data['request_header_referer'] web_bug_location = log_line_data['request_first_line'] ref = Referer(referer_url) if (last_event_logged_time < log_line_data['time_received_datetimeobj']) and (web_bug_location != "-")\ and ((self.__weight_visit > 0) or (ref.medium == 'search')): if ref.search_term is not None: ref_list = unidecode( ref.search_term.decode('utf-8')).replace( "\"", "").split() if eval(self.__eval_expression): fevents.write(linea) else: fevents.write(linea) ftemp_accesslog.close() fevents.close() if self.fetched_data: return True
def eval_data(self, time_frame, analyzed_time, given_time, confirmed_ips): """ Devuelve una lista con un elemento por cada uno de los últimos 'check_interval' minutos antes de la hora 'given_time'. Cada elemento de la lista devuelta contiene el valor acumulado de las detecciones durante los 'time_frame' minutos anteriores. """ eval_time = time_frame + analyzed_time detect_list = [0] * eval_time acum_list = [0] * analyzed_time if not self.enabled: return acum_list time_now_utc = datetime(given_time.year, given_time.month, given_time.day, given_time.hour, given_time.minute) line_parser = apache_log_parser.make_parser( self.__webbug_log_format.decode('string_escape')) for remoteaddr in self.__webbug_log: addr = urlparse.urlparse( remoteaddr) # Se obtiene el nombre del fichero de eventos filename = "data/wb-" + addr.hostname + '-' + self.person + "-events.log" with open(filename, 'r') as f: linea = f.readline( ) # Detección de zona horaria en la primera linea del log if linea: p = re.compile(r"[\+|-]\d\d\d\d\]") tz = p.findall(linea)[0] timezone = timedelta(hours=int(tz[0:3]), minutes=int(tz[0] + tz[3:5])) visiting_ips = [] while linea: log_line_data = line_parser(linea) current_ip = log_line_data['remote_host'] if confirmed_ips.count(current_ip): l = log_line_data['time_received_datetimeobj'] line_time_utc = datetime(l.year, l.month, l.day, l.hour, l.minute) - timezone if line_time_utc > time_now_utc: break i = int( (time_now_utc - line_time_utc).total_seconds() / 60) # Conversión hora a índice if i < eval_time: ref = Referer( log_line_data['request_header_referer']) origin = urlparse.urlparse( log_line_data['request_first_line']) if (ref.medium == 'search') and ( ref.search_term is not None): # Una búsqueda con términos detect_list[eval_time - i - 1] += self.__weight elif (ref.medium == 'search') and ( ref.search_term is None): # Una búsqueda sin términos detect_list[eval_time - i - 1] += self.__weight_no_search_terms elif (self.__weight_visit > 0) and \ (not visiting_ips.count([current_ip, origin.hostname])): # Una simple visita visiting_ips.append([ current_ip, origin.hostname ]) # Solo puntuan una vez por ip/origen detect_list[eval_time - i - 1] += self.__weight_visit linea = f.readline() for i in range( 1, analyzed_time + 1 ): # Acumulacción de pesos de detección para los rangos dados #print "acumulado", analyzed_time - i, "= suma desde", eval_time - time_frame - i, "hasta", eval_time - i, "=", detect_list[eval_time - time_frame - i:eval_time - i + 1], "=", sum(detect_list[eval_time - time_frame - i:eval_time - i]) acum_list[analyzed_time - i] = sum( detect_list[eval_time - time_frame - i:eval_time - i + 1]) return acum_list
def parse_endpoint(endpoint):
    return urlparse.urlparse(endpoint)
def test_home_page(self):
    response = self.client.get(url_for('main.index'))
    self.assertEqual(
        urlparse(response.location).path,
        url_for('auth.login'))
def get_report_data(self, time_frame, given_time, confirmed_ips): """ Devuelve una lista con cada una de las detecciones durante los 'time_frame' minutos previos a la hora 'given_time'. Cada elemento contiene la hora de la detección, el sitio donde se detectó, la ip del footprinter, la puntuación y un texto descriptivo sobre la misma. """ report_list = [] if not self.enabled: return report_list delta_frame = timedelta(minutes=time_frame) line_parser = apache_log_parser.make_parser( self.__webbug_log_format.decode('string_escape')) for remoteaddr in self.__webbug_log: addr = urlparse.urlparse( remoteaddr) # Se obtiene el nombre del fichero de eventos filename = "data/wb-" + addr.hostname + '-' + self.person + "-events.log" with open(filename, 'r') as f: linea = f.readline( ) # Detección de zona horaria en la primera linea del log if linea: p = re.compile(r"[\+|-]\d\d\d\d\]") tz = p.findall(linea)[0] timezone = timedelta(hours=int(tz[0:3]), minutes=int(tz[0] + tz[3:5])) simple_visits = [] while True: if not linea: break log_line_data = line_parser(linea) ip = log_line_data['remote_host'] if confirmed_ips.count(ip): line_time_utc = log_line_data[ 'time_received_datetimeobj'] - timezone if line_time_utc > given_time: break if line_time_utc > given_time - delta_frame: origin = log_line_data['request_first_line'] ref = Referer( log_line_data['request_header_referer']) origin_hostname = urlparse.urlparse( origin).hostname if (ref.medium == 'search') and ( ref.search_term is not None): # Una búsqueda con términos sterms = ref.search_term.decode('utf-8') sengine = ref.referer.decode('utf-8') description = u'Una busqueda desde [' + sengine + u'] con los terminos: [' + sterms +\ u'] ha llegado a [' + origin + ']' report_list.append([ line_time_utc, log_line_data['remote_host'], description, 'Web Bug' ]) elif (ref.medium == 'search') and ( ref.search_term is None): # Una búsqueda sin términos sengine = ref.referer.decode('utf-8') description = u'Una busqueda desde [' + sengine + u'] ha llegado a [' + origin + ']' report_list.append([ line_time_utc, log_line_data['remote_host'], description, 'Web Bug' ]) elif (self.__weight_visit > 0) and (not simple_visits.count( [ip, origin_hostname])): simple_visits.append([ip, origin_hostname ]) # Una simple visita description = u'Una visita ha llegado a [' + origin + ']' report_list.append([ line_time_utc, log_line_data['remote_host'], description, 'Web Bug' ]) linea = f.readline() if report_list: return sorted(report_list, key=itemgetter(0)) else: return report_list
def keywords(stmt):
    kw = {k.arg: k.value.s for k in stmt.keywords if k.arg in KEYS}
    path = kw.get("importpath", kw.get("remote"))
    u = urlparse(path)
    return u.netloc + u.path, kw["name"]
def install_commands_ubuntu(package_name, distribution, package_source, base_url): """ Install Flocker package on Ubuntu. The ClusterHQ repo is added for downloading latest releases. If ``package_source`` contains a branch, then a BuildBot repo will also be added to the package search path, to use in-development packages. Note, the ClusterHQ repo is always enabled, to provide dependencies. :param bytes distribution: The distribution the node is running. :param PackageSource package_source: The source from which to install the package. :param base_url: URL of repository, or ``None`` if we're not using development branch. :return: a sequence of commands to run on the distribution """ flocker_version = package_source.version if not flocker_version: # support empty values other than None, as '' sometimes used to # indicate latest version, due to previous behaviour flocker_version = get_installable_version(version) commands = [ # Minimal images often have cleared apt caches and are missing # packages that are common in a typical release. These commands # ensure that we start from a good base system with the required # capabilities, particularly that the add-apt-repository command # is available, and HTTPS URLs are supported. run_from_args(["apt-get", "update"]), run_from_args([ "apt-get", "-y", "install", "apt-transport-https", "software-properties-common" ]), # Add ClusterHQ repo for installation of Flocker packages. run(command='add-apt-repository -y "deb {} /"'.format( get_repository_url(distribution=distribution, flocker_version=flocker_version))) ] if base_url is not None: # Add BuildBot repo for running tests commands.append( run_from_args( ["add-apt-repository", "-y", "deb {} /".format(base_url)])) # During a release, the ClusterHQ repo may contain packages with # a higher version number than the Buildbot repo for a branch. # Use a pin file to ensure that any Buildbot repo has higher # priority than the ClusterHQ repo. We only add the Buildbot # repo when a branch is specified, so it wil not interfere with # attempts to install a release (when no branch is specified). buildbot_host = urlparse(package_source.build_server).hostname commands.append( put( dedent('''\ Package: * Pin: origin {} Pin-Priority: 700 '''.format(buildbot_host)), '/tmp/apt-pref')) commands.append( run_from_args( ['mv', '/tmp/apt-pref', '/etc/apt/preferences.d/buildbot-700'])) # Update to read package info from new repos commands.append(run_from_args(["apt-get", "update"])) os_version = package_source.os_version() if os_version: # Set the version of the top-level package package_name += '=%s' % (os_version, ) # If a specific version is required, ensure that the version for # all ClusterHQ packages is consistent. This prevents conflicts # between the top-level package, which may depend on a lower # version of a dependency, and apt, which wants to install the # most recent version. Note that this trumps the Buildbot # pinning above. commands.append( put( dedent('''\ Package: clusterhq-* Pin: version {} Pin-Priority: 900 '''.format(os_version)), '/tmp/apt-pref')) commands.append( run_from_args([ 'mv', '/tmp/apt-pref', '/etc/apt/preferences.d/clusterhq-900' ])) # Install package and all dependencies commands.append( run_from_args( ['apt-get', '-y', '--force-yes', 'install', package_name])) return sequence(commands)
parser = argparse.ArgumentParser()
parser.add_argument('db', help='leveldb root directory')
args = parser.parse_args(sys.argv[1:])

db = leveldb.LevelDB(args.db)

stats = defaultdict(lambda: defaultdict(int))

header = None
domain = None
# valid_languages = [l.lower() for l in args.lang]

for line in sys.stdin:
    if line.startswith(magic_number):
        header = parse_line(line)
        domain = urlparse(uri).netloc
        continue
    lang, percent, confidence = line.split()
    percent = int(percent)
    if percent < args.minpercent:
        continue
    if valid_languages and lang.lower() not in valid_languages:
        continue
    bytes_in_lang = header["bytes"] * percent / 100
    if bytes_in_lang >= args.minbytes:
        stats[full_domain][lang] += bytes_in_lang
        'USER': '******',
        'PASSWORD': '******',
        'HOST': '',
        'PORT': '',
    }
}

urlparse.uses_netloc.append('postgres')
urlparse.uses_netloc.append('mysql')

try:
    if 'DATABASES' not in locals():
        DATABASES = {}
    if 'DATABASE_URL' in os.environ:
        url = urlparse.urlparse(os.environ['DATABASE_URL'])
        # Ensure default database exists.
        DATABASES['default'] = DATABASES.get('default', {})
        # Update with environment configuration.
        DATABASES['default'].update({
            'NAME': url.path[1:],
            'USER': url.username,
            'PASSWORD': url.password,
            'HOST': url.hostname,
            'PORT': url.port,
        })
        if url.scheme == 'postgres':
            DATABASES['default']['ENGINE'] = 'django.db.backends.postgresql_psycopg2'
def sources(self, url, hostDict, hostprDict): try: sources = [] if url == None: return sources if debrid.status() == False: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) try: if not 'tvshowtitle' in data: raise Exception() links = [] f = ['S%02dE%02d' % (int(data['season']), int(data['episode']))] t = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', '', data['tvshowtitle']) t = t.replace("&", "") q = self.search_link + urllib.quote_plus('%s %s' % (t, f[0])) q = urlparse.urljoin(self.base_link, q) result = client.request(q) result = json.loads(result) result = result['results'] except: links = result = [] for i in result: try: if not cleantitle.get(t) == cleantitle.get(i['showName']): raise Exception() y = i['release'] y = re.compile('[\.|\(|\[|\s](\d{4}|S\d*E\d*)[\.|\)|\]|\s]').findall(y)[-1] y = y.upper() if not any(x == y for x in f): raise Exception() quality = i['quality'] quality = quality.upper() size = i['size'] size = float(size)/1024 size = '%.2f GB' % size if any(x in quality for x in ['HEVC', 'X265', 'H265']): info = '%s | HEVC' % size else: info = size if '1080P' in quality: quality = '1080p' elif '720P' in quality: quality = 'HD' else: quality = 'SD' url = i['links'] #for x in url.keys(): links.append({'url': url[x], 'quality': quality, 'info': info}) links = [] for x in url.keys(): links.append({'url': url[x], 'quality': quality}) for link in links: try: url = link['url'] quality2 = link['quality'] #url = url[1] #url = link if len(url) > 1: raise Exception() url = url[0].encode('utf-8') host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0] if not host in hostprDict: raise Exception() host = host.encode('utf-8') sources.append({'source': host, 'quality': quality2, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True}) except: pass except: pass return sources except: return sources
def download(args): if args.arch: config.arch = args.arch installed_build = builds.get_installed_build() def build_suffix(build): if build > installed_build: symbol = '+' elif build < installed_build: symbol = '-' else: symbol = '=' return symbol build_sources = sources.build_sources() if args.source: source_name = args.source try: build_source = build_sources[source_name] except KeyError: parsed = urlparse(source_name) if parsed.scheme in ('http', 'https') and parsed.netloc: if args.releases: build_url = builds.BuildsURL( source_name, extractor=builds.ReleaseLinkExtractor) else: build_url = builds.BuildsURL(source_name) else: print( '"{}" is not in the list of available sources ' 'and is not a valid HTTP URL').format(args.source) print 'Valid options are:\n\t{}'.format("\n\t".join( build_sources.keys())) sys.exit(1) else: source_name = get_choice(build_sources.keys()) build_source = build_sources[source_name] print print "Arch: {}".format(config.arch) print "Installed build: {}".format(installed_build) try: links = build_source.builds() except requests.RequestException as e: print str(e) except builds.BuildURLError as e: print str(e) else: if links: build = get_choice(links, build_suffix, reverse=True) remote = build.remote_file() file_path = os.path.join(libreelec.UPDATE_DIR, build.filename) print print "Downloading {0} ...".format(build.url) try: with open(file_path, 'w') as out: process(remote, out, build.size) except KeyboardInterrupt: os.remove(file_path) print print "Download cancelled" sys.exit() if build.compressed: tar_path = os.path.join(libreelec.UPDATE_DIR, build.tar_name) size = os.path.getsize(file_path) print print "Decompressing {0} ...".format(file_path) with open(file_path, 'r') as fin, open(tar_path, 'w') as fout: process(fin, fout, size, decompress) os.remove(file_path) funcs.create_notify_file(source_name, build) print print "The update is ready to be installed. Please reboot." else: print print "No builds available"
def run(self): def download(): return [] result = cache.bennu_download_get(download, 600000000, table='rel_dl') for item in result: self.name = item['name'] ; self.image = item['image'] ; self.url = item['url'] sysname = self.name.translate(None, '\/:*?"<>|').strip('.') url = self.url.split('|')[0] try: headers = dict(urlparse.parse_qsl(self.url.rsplit('|', 1)[1])) except: headers = dict('') ext = os.path.splitext(urlparse.urlparse(url).path)[1][1:].lower() hdlr = re.compile('.+? ([(]\d{4}[)]|S\d*E\d*)$').findall(self.name) if len(hdlr) == 0: self.content = 'Uncategorised' if ext in ['m4a', 'mp3', 'aac']: self.content = 'Music' hdlr = re.compile('.+? (S\d*E\d*)$').findall(self.name) if len(hdlr) > 0: self.content = 'TVShows' hdlr = re.compile('.+? [(](\d{4})[)]$').findall(self.name) if len(hdlr) > 0: self.content = 'Movies' if self.content == 'Movies': dest = os.path.join(downloadPath, self.content) control.makeFile(dest) dest = os.path.join(dest, sysname) control.makeFile(dest) elif self.content == 'TVShows': d = re.compile('(.+?) S(\d*)E(\d*)$').findall(sysname)[0] dest = os.path.join(downloadPath, self.content) control.makeFile(dest) dest = os.path.join(dest, d[0]) control.makeFile(dest) dest = os.path.join(dest, 'Season %01d' % int(d[1])) control.makeFile(dest) else: dest = os.path.join(downloadPath, self.content) control.makeFile(dest) if not ext in ['mp4', 'm4a', 'mp3', 'aac', 'mkv', 'flv', 'avi', 'mpg']: ext = 'mp4' dest = os.path.join(dest, sysname + '.' + ext) control.infoDialog(self.name + ' Is Downloading', 'Downloads Started', self.image, time=7000) try: req = urllib2.Request(url, headers=headers) resp = urllib2.urlopen(req, timeout=30) except Exception,e: removeDownload(self.url) print '%s ERROR - File Failed To Open' % (dest) continue try: self.size = int(resp.headers['Content-Length']) except: self.size = 0 if self.size < 1: removeDownload(self.url) print '%s Unknown filesize - Unable to download' % (dest) continue try: resumable = 'bytes' in resp.headers['Accept-Ranges'].lower() except: resumable = False size = 1024 * 1024 if self.size < size: size = self.size gb = '%.2f GB' % (float(self.size) / 1073741824) start = time.clock() total = 0 ; notify = 0 ; errors = 0 ; count = 0 ; resume = 0 ; sleep = 0 self.clear() control.window.setProperty(property + '.status', 'downloading') control.window.setProperty(property + '.name', str(self.name)) control.window.setProperty(property + '.image', str(self.image)) control.window.setProperty(property + '.size', str(gb)) f = control.openFile(dest, 'wb') chunk = None chunks = [] while True: downloaded = total for c in chunks: downloaded += len(c) percent = min(100 * downloaded / self.size, 100) self.speed = str(int((downloaded / 1024) / (time.clock() - start))) + ' KB/s' self.percent = str(percent) + '%' control.window.setProperty(property + '.percent', str(self.percent)) control.window.setProperty(property + '.speed', str(self.speed)) if percent >= notify: control.infoDialog('Downloaded %s' % self.percent, self.name, self.image, time=5000) notify += 10 chunk = None error = False try: chunk = resp.read(size) if not chunk: if self.percent < 99: error = True else: while len(chunks) > 0: c = chunks.pop(0) f.write(c) del c f.close() print '%s download complete' % (dest) break except Exception, e: print str(e) error = True sleep = 10 errno = 0 if hasattr(e, 'errno'): errno = e.errno if errno == 10035: # 'A non-blocking socket operation could not be completed immediately' pass if errno == 10054: #'An existing connection was forcibly closed by 
the remote host' errors = 10 #force resume sleep = 30 if errno == 11001: # 'getaddrinfo failed' errors = 10 #force resume sleep = 30 if chunk: errors = 0 chunks.append(chunk) if len(chunks) > 5: c = chunks.pop(0) f.write(c) total += len(c) del c if error: errors += 1 count += 1 print '%d Error(s) whilst downloading %s' % (count, dest) control.sleep(sleep*1000) if (resumable and errors > 0) or errors >= 10: if (not resumable and resume >= 50) or resume >= 500: #Give up! print '%s download canceled - too many error whilst downloading' % (dest) break resume += 1 errors = 0 if resumable: chunks = [] #create new response print 'Download resumed (%d) %s' % (resume, dest) h = headers ; h['Range'] = 'bytes=%d-' % int(total) try: resp = urllib2.urlopen(urllib2.Request(url, headers=h), timeout=10) except: resp = None else: #use existing response pass if control.window.getProperty(property + '.status') == 'stop': control.infoDialog('Process Complete', 'Downloads', time=5000) return self.clear()
/usr/lib/spark/bin/spark-submit --conf spark.hadoop.yarn.resourcemanager.connect.max-wait.ms=60000 --conf spark.hadoop.fs.defaultFS=hdfs://ip-172-31-38-180.us-west-2.compute.internal:8020 --conf spark.hadoop.yarn.resourcemanager.address=ip-172-31-38-180.us-west-2.compute.internal:8032 --conf spark.dynamicAllocation.enabled=true --conf spark.shuffle.service.enabled=true --conf spark.dynamicAllocation.minExecutors=1 --conf spark.dynamicAllocation.maxExecutors=18 --conf spark.executor.memory=5g --conf spark.executor.cores=4 --name tape --master yarn --deploy-mode cluster --jars /opt/amazon/superjar/glue-assembly.jar --files /tmp/glue-default.conf,/tmp/glue-override.conf,/opt/amazon/certs/InternalAndExternalAndAWSTrustStore.jks,/opt/amazon/certs/rds-combined-ca-bundle.pem,/tmp/g-ef1db6367ac2ca9900a1ec51e0610890dd85420b-2014450836307118787/script_2018-01-16-19-19-57.py --py-files /tmp/PyGlue.zip --driver-memory 5g --executor-memory 5g /tmp/runscript.py script_2018-01-16-19-19-57.py --JOB_NAME PropertyDim_Full_Refresh --JOB_ID j_50471421e9761b8bb5ab038777ad7d47ca763b820c11175075f667160acf650b --s3_bucket_sql_file move-dataeng-lstp-dev --s3_prefix_sql_file edw/propertydim_temp_scripts/property_dim_dedupe.sql --JOB_RUN_ID jr_9b9536625270bf934b4fd16afb589ac5019e1d1792b9ff228b8f6084ad403e08 --job-bookmark-option job-bookmark-disable --temp_table_name property_dim_All --s3_source_path s3://move-dataeng-lstp-prod/edw/processed-data-xact/property_dim/year=2011/month=04/day=21/hour=07 --TempDir s3://move-dataeng-temp-dev/glue-results/mk/propertyfullrefresh/ """ # Extract the Glue Job Arguments args = getResolvedOptions(sys.argv, [ 'JOB_NAME', 's3_source_path', 's3_target_path', 'source_sql_file_path', 'temp_table_name' ]) print "Job Name is: ", args['JOB_NAME'] print "S3 Source File Path: ", args['s3_source_path'] print "S3 Target File Path: ", args['s3_target_path'] print "Source SQL File Path: ", args['source_sql_file_path'] print "Temp Table Name: ", args['temp_table_name'] #Parse Bucket and Prefix of the SQL File Path source_sql_path = urlparse(args['source_sql_file_path']) s3_bucket_sql_file = source_sql_path.netloc s3_prefix_sql_file = source_sql_path.path.lstrip('/') # Method to read S3 file as a string def getStringFromFile(bucket_name, key): s3_client = boto3.client('s3', region_name='us-west-2') response = s3_client.get_object(Bucket=bucket_name, Key=key) data = response['Body'].read() return data #1. Variables #s3://move-dataeng-lstp-prod/edw/processed-data-xact/property_dim
def replaceurl(url, port):
    parsed = urlparse(url)
    newurl = 'https://' + parsed.hostname + ':' + str(port)
    return newurl
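# Illustrative example (hypothetical URL): only the hostname is kept; the scheme is
# forced to https and any path, query string or credentials are dropped.
print replaceurl('http://user:pass@example.com/some/path?q=1', 8443)  # https://example.com:8443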
def GetChangeIdForReview(self, review_url):  # pragma: no cover
    u = urlparse.urlparse(review_url)
    return u.path.split('/')[-1]
def req_config_handler(self):
    req = urlparse.urlparse(self.path).query
    reqs = urlparse.parse_qs(req, keep_blank_values=True)
    data = ''
    appid_updated = False

    try:
        if reqs['cmd'] == ['get_config']:
            data = json.dumps(user_config.user_special, default=lambda o: o.__dict__)
        elif reqs['cmd'] == ['set_config']:
            appids = self.postvars['appid'][0]
            if appids != user_config.user_special.appid:
                if appids and ip_manager.good_ip_num:
                    fail_appid_list = test_appid.test_appids(appids)
                    if len(fail_appid_list):
                        fail_appid = "|".join(fail_appid_list)
                        return self.send_response_nc('text/html', '{"res":"fail", "reason":"appid fail:%s"}' % fail_appid)
                appid_updated = True
                user_config.user_special.appid = appids

            user_config.user_special.proxy_enable = self.postvars['proxy_enable'][0]
            user_config.user_special.proxy_type = self.postvars['proxy_type'][0]
            user_config.user_special.proxy_host = self.postvars['proxy_host'][0]
            user_config.user_special.proxy_port = self.postvars['proxy_port'][0]
            try:
                user_config.user_special.proxy_port = int(user_config.user_special.proxy_port)
            except:
                user_config.user_special.proxy_port = 0
            user_config.user_special.proxy_user = self.postvars['proxy_user'][0]
            user_config.user_special.proxy_passwd = self.postvars['proxy_passwd'][0]
            user_config.user_special.host_appengine_mode = self.postvars['host_appengine_mode'][0]

            use_ipv6 = int(self.postvars['use_ipv6'][0])
            if user_config.user_special.use_ipv6 != use_ipv6:
                if use_ipv6:
                    if not check_local_network.check_ipv6():
                        xlog.warn("IPv6 was enabled, but check failed.")
                        return self.send_response_nc('text/html', '{"res":"fail", "reason":"IPv6 fail"}')
                user_config.user_special.use_ipv6 = use_ipv6

            user_config.save()

            config.load()
            appid_manager.reset_appid()
            import connect_manager
            connect_manager.load_proxy_config()
            connect_manager.https_manager.load_config()
            if appid_updated:
                http_dispatch.close_all_worker()
            ip_manager.reset()
            check_ip.load_proxy_config()

            data = '{"res":"success"}'
            self.send_response_nc('text/html', data)
            # http_request("http://127.0.0.1:8085/init_module?module=gae_proxy&cmd=restart")
            return
    except Exception as e:
        xlog.exception("req_config_handler except:%s", e)
        data = '{"res":"fail", "except":"%s"}' % e
    self.send_response_nc('text/html', data)
def __init__(self, username=None, password=None, security_token=None,
             session_id=None, instance=None, instance_url=None,
             organizationId=None, sandbox=False, version=DEFAULT_API_VERSION,
             proxies=None, session=None, client_id=None):
    """Initialize the instance with the given parameters.

    Available kwargs

    Password Authentication:

    * username -- the Salesforce username to use for authentication
    * password -- the password for the username
    * security_token -- the security token for the username
    * sandbox -- True if you want to login to `test.salesforce.com`, False
                 if you want to login to `login.salesforce.com`.

    Direct Session and Instance Access:

    * session_id -- Access token for this session

    Then either
    * instance -- Domain of your Salesforce instance, i.e.
                  `na1.salesforce.com`
    OR
    * instance_url -- Full URL of your instance i.e.
                      `https://na1.salesforce.com`

    Universal Kwargs:
    * version -- the version of the Salesforce API to use, for example
                 `29.0`
    * proxies -- the optional map of scheme to proxy server
    * session -- Custom requests session, created in calling code. This
                 enables the use of requests Session features not otherwise
                 exposed by simple_salesforce.
    """

    # Determine if the user passed in the optional version and/or sandbox
    # kwargs
    super(Salesforce, self).__init__(session=session)
    self.sf_version = version
    self.sandbox = sandbox
    self.proxies = self.session.proxies
    # override custom session proxies dance
    if proxies is not None:
        if not session:
            self.session.proxies = self.proxies = proxies
        else:
            logger.warning(
                'Proxies must be defined on custom session object, '
                'ignoring proxies: %s', proxies)

    # Determine if the user wants to use our username/password auth or pass
    # in their own information
    if all(arg is not None for arg in (username, password, security_token)):
        self.auth_type = "password"

        # Pass along the username/password to our login helper
        self.session_id, self.sf_instance = SalesforceLogin(
            session=self.session,
            username=username,
            password=password,
            security_token=security_token,
            sandbox=self.sandbox,
            sf_version=self.sf_version,
            proxies=self.proxies,
            client_id=client_id)

    elif all(arg is not None for arg in (session_id, instance or instance_url)):
        self.auth_type = "direct"
        self.session_id = session_id

        # If the user provides the full url (as returned by the OAuth
        # interface for example) extract the hostname (which we rely on)
        if instance_url is not None:
            self.sf_instance = urlparse(instance_url).hostname
        else:
            self.sf_instance = instance

    elif all(arg is not None for arg in (username, password, organizationId)):
        self.auth_type = 'ipfilter'

        # Pass along the username/password to our login helper
        self.session_id, self.sf_instance = SalesforceLogin(
            session=self.session,
            username=username,
            password=password,
            organizationId=organizationId,
            sandbox=self.sandbox,
            sf_version=self.sf_version,
            proxies=self.proxies,
            client_id=client_id)

    else:
        raise TypeError(
            'You must provide login information or an instance and token')

    if self.sandbox:
        self.auth_site = 'https://test.salesforce.com'
    else:
        self.auth_site = 'https://login.salesforce.com'

    self.base_url = ('https://{instance}/services/data/v{version}/'
                     .format(instance=self.sf_instance, version=self.sf_version))
    self.apex_url = ('https://{instance}/services/apexrest/'
                     .format(instance=self.sf_instance))
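# Hedged usage sketch (all credential and instance values below are placeholders, not
# from the original source): the three auth paths described in the docstring map onto
# these constructor calls.
sf = Salesforce(username='user@example.com', password='password',
                security_token='token')                                       # username/password
sf = Salesforce(session_id='<access token>', instance='na1.salesforce.com')   # direct session
sf = Salesforce(username='user@example.com', password='password',
                organizationId='OrgId')                                       # IP-filter auth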
# Patch admin site for stats application
patch(admin.site)

admin.autodiscover()

handler404 = 'shared.views.view_404'
handler500 = 'shared.views.view_500'

urlpatterns = patterns(
    '',
    (r'', include('shared.urls')),
    (r'', include('badges.urls')),
    (r'', include('banners.urls')),
    (r'^accounts/', include('users.urls')),
    (r'^browserid/', include('browserid.urls')),
    (r'^fb/', include('facebook.urls')),
    (r'^admin/', include('smuggler.urls')),
    (r'^admin/', include(admin.site.urls)),
)

# In DEBUG mode, serve media files through Django.
if settings.DEBUG or settings.SERVE_MEDIA:
    # Remove host, leading and trailing slashes so the regex matches.
    media_url = urlparse(settings.MEDIA_URL).path.lstrip('/').rstrip('/')
    urlpatterns += patterns(
        '',
        (r'^%s/(?P<path>.*)$' % media_url, 'django.views.static.serve',
         {'document_root': settings.MEDIA_ROOT}),
    )
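# Worked example of the MEDIA_URL stripping above (hypothetical setting, shown for
# clarity; not part of this urls.py):
#   urlparse('http://cdn.example.com/media/').path  -> '/media/'
#   '/media/'.lstrip('/').rstrip('/')               -> 'media'
#   resulting pattern: r'^media/(?P<path>.*)$'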
print '%d total emojis in master.' % len(master_list)

# Check if any emojis in master are missing from all_emojis
for phrase, img_url in master_list.items():
    # if phrase not in all_emojis and phrase.encode('utf-8') not in all_emojis:
    if not all_emojis.get(phrase) and not all_emojis.get(
            phrase.encode('utf-8')):
        print 'MISSING: ', phrase, img_url

for phrase, img_url in all_emojis.items():
    if args.debug:
        print phrase + '\t' + img_url

    if img_url.startswith(WEIBO_EMOJI_WEBROOT):
        filename = img_url.replace(WEIBO_EMOJI_WEBROOT, '')
    else:
        filename = urlparse.urlparse(img_url).path[1:]

    # to catch img urls like http://img.t.sinajs.cn/t4/appstyle/expression/ext/normal/c8/../e0/hongbao1_org.gif
    filename = re.sub(r'[^/]+/\.\./', '', filename)

    output_filename = os.path.join(args.output_folder, filename)

    # create folder if necessary
    try:
        if not args.simulate:
            if not os.path.exists(os.path.dirname(output_filename)):
                os.makedirs(os.path.dirname(output_filename))

            if not os.path.isfile(output_filename):
                # download if file does not exist
                request = urllib2.Request(
                    img_url, headers={'User-agent': 'Mozilla/5.0'})
if len(sys.argv) < 2 or len(sys.argv) > 4:
    print('%s [domain] [output]' % sys.argv[0])
    sys.exit(0)

all_links = []
domain = sys.argv[1]

try:
    filename = sys.argv[2]
except IndexError:
    filename = None

import urlparse
parse = urlparse.urlparse(domain)

if parse.netloc:
    domain = parse.netloc
elif parse.path != '' and parse.netloc == '':
    domain = parse.path
else:
    domain = domain


def archive():
    content = requests.get('http://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&collapse=urlkey' % domain).content
    c = json.loads(content)
    for i in c:
        for b in i:
            if domain in b and b.startswith('http'):
                if b not in all_links:
                    all_links.append(b)
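# Hedged sketch of a possible main flow (the original script's output handling is not
# shown here): after archive() fills all_links, write them to `filename` if one was
# given, otherwise print them.
archive()
if filename:
    with open(filename, 'w') as out:
        out.write('\n'.join(all_links))
else:
    for link in all_links:
        print link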
# Generate a list of machines to use.
machines = []
for machine in options.machines.split(','):
    if machine in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']:
        # A bare cluster letter expands to its four units, e.g. "sfxc-a0" .. "sfxc-a3".
        for unit in [0, 1, 2, 3]:
            machines.append("sfxc-" + machine + str(unit))
    else:
        machines.append(machine)

# Select input nodes.
for station in json_input["data_sources"]:
    url = urlparse.urlparse(json_input["data_sources"][station][0])
    if url.netloc:
        if url.port:
            data_socket[station] = url.port
        else:
            data_socket[station] = 8888
    elif url.path:
        data_socket[station] = os.path.dirname(url.path)
    else:
        data_socket[station] = '/tmp/mk5read'

readers = {}
reader_slots = {}
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []

        if url == None: return sources

        if debrid.status() == False: raise Exception()

        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']

        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else data['year']

        query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year'])
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

        url = self.search_link % urllib.quote_plus(query)
        url = urlparse.urljoin(self.base_link, url)

        r = client.request(url)

        posts = client.parseDOM(r, 'item')

        hostDict = hostprDict + hostDict

        items = []

        for post in posts:
            try:
                t = client.parseDOM(post, 'title')[0]
                u = re.findall('<p>(http(?:s|)://.+?)</p>', post)
                items += [(t, i) for i in u]
            except:
                pass

        for item in items:
            try:
                name = item[0]
                name = client.replaceHTMLCodes(name)

                t = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*|3D)(\.|\)|\]|\s|)(.+|)', '', name)

                if not cleantitle.get(t) == cleantitle.get(title): raise Exception()

                y = re.findall('[\.|\(|\[|\s](\d{4}|S\d*E\d*|S\d*)[\.|\)|\]|\s]', name)[-1].upper()

                if not y == hdlr: raise Exception()

                fmt = re.sub('(.+)(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*)(\.|\)|\]|\s)', '', name.upper())
                fmt = re.split('\.|\(|\)|\[|\]|\s|\-', fmt)
                fmt = [i.lower() for i in fmt]

                if any(i.endswith(('subs', 'sub', 'dubbed', 'dub')) for i in fmt): raise Exception()
                if any(i in ['extras'] for i in fmt): raise Exception()

                if '1080p' in fmt: quality = '1080p'
                elif '720p' in fmt: quality = 'HD'
                else: quality = 'SD'
                if any(i in ['dvdscr', 'r5', 'r6'] for i in fmt): quality = 'SCR'
                elif any(i in ['camrip', 'tsrip', 'hdcam', 'hdts', 'dvdcam', 'dvdts', 'cam', 'telesync', 'ts'] for i in fmt): quality = 'CAM'

                info = []

                if '3d' in fmt: info.append('3D')

                try:
                    size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [M|G]B)', name)[-1]
                    div = 1 if size.endswith(' GB') else 1024
                    size = float(re.sub('[^0-9|/.|/,]', '', size)) / div
                    size = '%.2f GB' % size
                    info.append(size)
                except:
                    pass

                if any(i in ['hevc', 'h265', 'x265'] for i in fmt): info.append('HEVC')

                info = ' | '.join(info)

                url = item[1]
                if any(x in url for x in ['.rar', '.zip', '.iso']): raise Exception()
                url = client.replaceHTMLCodes(url)
                url = url.encode('utf-8')

                host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0]
                if not host in hostDict: raise Exception()
                host = client.replaceHTMLCodes(host)
                host = host.encode('utf-8')

                sources.append({'source': host, 'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True})
            except:
                pass

        check = [i for i in sources if not i['quality'] == 'CAM']
        if check: sources = check

        return sources
    except:
        return sources
from urlparse import urlparse
import sys

for line in sys.stdin:
    data = line.strip().split("GET")
    if len(data) == 2:
        print urlparse(data[1]).path
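# Illustrative run (hypothetical access-log line and script name):
#   echo '127.0.0.1 - - [10/Oct/2018] "GET /index.html?x=1 HTTP/1.0" 200' | python parse_paths.py
# prints ' /index.html' - note the leading space, since split("GET") keeps the space
# that follows "GET" and urlparse() only strips the query string, not whitespace.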