def build_request_with_data(url, data, api_key, method):
    """Build a request with the received method."""
    http_redirect_with_data_handler = HTTPRedirectWithDataHandler(method=method)
    opener = build_opener(http_redirect_with_data_handler)
    install_opener(opener)
    url = make_url(url, api_key=api_key, args=None)
    request = Request(url,
                      headers={'Content-Type': 'application/json'},
                      data=json.dumps(data))
    request_method = request.get_method()
    if request_method != method:
        request.get_method = lambda: method
    return opener, request
def redirect_request(self, request, fp, code, msg, headers, new_url):
    request_method = request.get_method()
    if str(code) in self.redirect_codes and request_method in self.valid_methods:
        new_url = new_url.replace(' ', '%20')
        request = Request(new_url,
                          data=request.data,
                          headers=request.headers,
                          origin_req_host=request.get_origin_req_host(),
                          unverifiable=True)
        if self.method in self.valid_methods:
            if request.get_method() != self.method:
                request.get_method = lambda: self.method
        return request
    else:
        # Fall back to the default redirect handling and return its result.
        return HTTPRedirectHandler.redirect_request(
            self, request, fp, code, msg, headers, new_url)
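# HTTPRedirectWithDataHandler, used by build_request_with_data() above, is not
# defined in this collection. The following is a minimal, hypothetical sketch of
# such a handler, assuming the attribute names (`method`, `redirect_codes`,
# `valid_methods`) that redirect_request() relies on; the real class may differ.
from six.moves.urllib.request import HTTPRedirectHandler


class HTTPRedirectWithDataHandler(HTTPRedirectHandler):
    """Redirect handler that re-sends the request body and preserves the HTTP verb."""

    def __init__(self, method):
        # HTTP verb to preserve across redirects.
        self.method = method
        # Redirect status codes handled here (kept as strings, as compared above).
        self.redirect_codes = ('301', '302', '303', '307', '308')
        # Verbs allowed to carry a body across a redirect.
        self.valid_methods = ('GET', 'POST', 'PUT', 'DELETE', 'PATCH')

    # redirect_request() as defined above would be a method of this class.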
def execute(cls, uri, http_verb, extra_headers=None, batch=False, _body=None, **kw): """ if batch == False, execute a command with the given parameters and return the response JSON. If batch == True, return the dictionary that would be used in a batch command. """ if batch: urlsplitter = urlparse(API_ROOT).netloc ret = {"method": http_verb, "path": uri.split(urlsplitter, 1)[1]} if kw: ret["body"] = kw return ret if not ('app_id' in ACCESS_KEYS and 'rest_key' in ACCESS_KEYS): raise core.ParseError('Missing connection credentials') app_id = ACCESS_KEYS.get('app_id') rest_key = ACCESS_KEYS.get('rest_key') master_key = ACCESS_KEYS.get('master_key') url = uri if uri.startswith(API_ROOT) else cls.ENDPOINT_ROOT + uri if _body is None: data = kw and json.dumps(kw, default=date_handler) or "{}" else: data = _body if http_verb == 'GET' and data: url += '?%s' % urlencode(kw) data = None else: data = data headers = { 'Content-type': 'application/json', 'X-Parse-Application-Id': app_id, 'X-Parse-REST-API-Key': rest_key } headers.update(extra_headers or {}) request = Request(url.encode('utf-8'), data, headers) if ACCESS_KEYS.get('session_token'): request.add_header('X-Parse-Session-Token', ACCESS_KEYS.get('session_token')) elif master_key: request.add_header('X-Parse-Master-Key', master_key) request.get_method = lambda: http_verb try: response = urlopen(request, timeout=CONNECTION_TIMEOUT) except HTTPError as e: exc = { 400: core.ResourceRequestBadRequest, 401: core.ResourceRequestLoginRequired, 403: core.ResourceRequestForbidden, 404: core.ResourceRequestNotFound }.get(e.code, core.ParseError) raise exc(e.read()) return json.loads(response.read().decode('utf-8'))
def get_genome_space_launch_apps( atm_url, url_opener, file_url, file_type ):
    gs_request = Request( "%s/%s/webtool/descriptor" % ( atm_url, GENOMESPACE_API_VERSION_STRING ) )
    gs_request.get_method = lambda: 'GET'
    opened_gs_request = url_opener.open( gs_request )
    webtool_descriptors = json.loads( opened_gs_request.read() )
    webtools = []
    for webtool in webtool_descriptors:
        webtool_name = webtool.get( 'name' )
        base_url = webtool.get( 'baseUrl' )
        use_tool = False
        for param in webtool.get( 'fileParameters', [] ):
            for format in param.get( 'formats', [] ):
                if format.get( 'name' ) == file_type:
                    use_tool = True
                    break
            if use_tool:
                file_param_name = param.get( 'name' )
                # file_name_delimiters = param.get( 'nameDelimiters' )
                if '?' in base_url:
                    url_delimiter = "&"
                else:
                    url_delimiter = "?"
                launch_url = "%s%s%s" % ( base_url, url_delimiter, urlencode( [ ( file_param_name, file_url ) ] ) )
                webtools.append( ( launch_url, webtool_name ) )
                break
    return webtools
def _pd_api(self, url, data=None, method='GET'):
    url = '%s/%s' % (PD_API_BASE, url)
    request_args = {
        'headers': dict(self._pd_headers)
    }
    if six.PY3:  # pragma: no cover
        request_args['method'] = method
    if data is not None:
        request_args['data'] = json.dumps(data).encode('utf-8')
        request_args['headers']['Content-Type'] = APPLICATION_JSON
    request = Request(url, **request_args)
    if six.PY2:  # pragma: no cover
        request.get_method = lambda: method
    try:
        response = urlopen(request)
        return json.loads(response.read().decode('utf-8'))
    except HTTPError as e:
        response = e.read().decode('utf-8')
        logger.warning("API error: %s", response)
        if method == 'GET' and e.code == 404:
            return None
        else:
            raise e
def _request(self, url, data=None, headers=None, checker=None, method=None):
    if not headers:
        headers = {}
    if self.token:
        headers["X-API-Key"] = self.token
    self.log.debug("Request: %s %s %s", method if method else 'GET', url,
                   data[:self.logger_limit] if data else None)
    # .encode("utf-8") is probably better
    data = data.encode() if isinstance(data, six.text_type) else data
    request = Request(url, data, headers)
    if method:
        request.get_method = lambda: method

    response = urlopen(request, timeout=self.timeout)

    if checker:
        checker(response)

    resp = response.read()
    if not isinstance(resp, str):
        resp = resp.decode()

    self.log.debug("Response: %s", resp[:self.logger_limit] if resp else None)
    return json.loads(resp) if len(resp) else {}
def set_genomespace_format_identifiers( url_opener, dm_site ):
    gs_request = Request( "%s/%s/dataformat/list" % ( dm_site, GENOMESPACE_API_VERSION_STRING ) )
    gs_request.get_method = lambda: 'GET'
    opened_gs_request = url_opener.open( gs_request )
    genomespace_formats = json.loads( opened_gs_request.read() )
    for format in genomespace_formats:
        GENOMESPACE_FORMAT_IDENTIFIER_TO_GENOMESPACE_EXT[ format['url'] ] = format['name']
def __del(api_key, url, data):
    """ Do the actual DELETE """
    url = make_url(api_key, url)
    req = Request(url,
                  headers={'Content-Type': 'application/json'},
                  data=json.dumps(data))
    req.get_method = lambda: 'DELETE'
    return json.loads(urlopen(req).read())
def create_directory( url_opener, directory_dict, new_dir, dm_url ):
    payload = { "isDirectory": True }
    for dir_slice in new_dir:
        if dir_slice in ( '', '/', None ):
            continue
        url = '/'.join( ( directory_dict['url'], quote( dir_slice.replace( '/', '_' ), safe='' ) ) )
        new_dir_request = Request( url, headers={ 'Content-Type': 'application/json', 'Accept': 'application/json' }, data=json.dumps( payload ) )
        new_dir_request.get_method = lambda: 'PUT'
        directory_dict = json.loads( url_opener.open( new_dir_request ).read() )
    return directory_dict
def clear_all_queries(self, cluster_name=DEFAULT_CLUSTER):
    """
    Clear all the primed queries from a particular cluster
    :param cluster_name: cluster to clear queries from
    """
    opener = build_opener(HTTPHandler)
    request = Request("http://{0}/{1}/{2}".format(
        self.admin_addr, "prime", cluster_name))
    request.get_method = lambda: 'DELETE'
    connection = opener.open(request)
    return connection.read()
def test_upload_no_boundary(self):
    with self.assertRaises(HTTPError) as handler:
        data = b""
        request = Request(self._url('top/middle/'), data=data)
        request.add_header("Content-Length", len(data))
        request.add_header("Content-Type", "multipart/form-data")
        request.get_method = lambda: "POST"
        urlopen(request)

    self.assertEqual(handler.exception.code, 400)
    self.assertEqual(handler.exception.reason,
                     "'Content-Type' header does not contain a boundary")
def get_system(token, method, api_cmd, api_cmd_headers=None, api_cmd_payload=None, timeout=10): """ Make a rest-api request Returns: response as a dictionary """ LOG.debug("%s cmd:%s hdr:%s payload:%s" % (method, api_cmd, api_cmd_headers, api_cmd_payload)) response = None try: request_info = Request(api_cmd) request_info.get_method = lambda: method if token: request_info.add_header("X-Auth-Token", token.get_id()) request_info.add_header("Accept", "application/json") if api_cmd_headers is not None: for header_type, header_value in api_cmd_headers.items(): request_info.add_header(header_type, header_value) if api_cmd_payload is not None: request_info.add_data(api_cmd_payload) request = urlopen(request_info, timeout=timeout) response = request.read() if response == "": response = json.loads("{}") else: response = json.loads(response) request.close() except HTTPError as e: if 401 == e.code: if token: token.set_expired() LOG.warn("HTTP Error e.code=%s e=%s" % (e.code, e)) if hasattr(e, 'msg') and e.msg: response = json.loads(e.msg) else: response = json.loads("{}") raise except URLError: LOG.error("Cannot access %s" % api_cmd) raise finally: return response
def req(self, path, data=None, method=None):
    url = self.server + path
    if data:
        req = Request(url,
                      headers={'Content-Type': 'application/json'},
                      data=json.dumps(data))
    else:
        req = Request(url, headers={'Content-Type': 'application/json'})
    if method:
        req.get_method = lambda: method
    res = self.opener.open(req)
    print('==> at %s (%s)' % (url, method or 'GET'))
    assert res.getcode() == 200, url
    return res
def submit_request(self, query):
    opener = build_opener(HTTPHandler)
    data = json.dumps(query.fetch_json()).encode('utf8')
    request = Request("http://{}/{}{}".format(
        self.admin_addr, query.path, query.fetch_url_params()),
        data=data)
    request.get_method = lambda: query.method
    request.add_header("Content-Type", 'application/json')
    request.add_header("Content-Length", len(data))
    connection = opener.open(request)
    return connection.read().decode('utf-8')
def submit_request(self, query):
    opener = build_opener(HTTPHandler)
    data = json.dumps(query.fetch_json()).encode('utf8')
    request = Request("http://{}/{}{}".format(
        self.admin_addr, query.path, query.fetch_url_params()),
        data=data)
    request.get_method = lambda: 'POST'
    request.add_header("Content-Type", 'application/json')
    request.add_header("Content-Length", len(data))
    connection = opener.open(request)
    return connection.read().decode('utf-8')
def execute(cls, uri, http_verb, extra_headers=None, batch=False, **kw): """ if batch == False, execute a command with the given parameters and return the response JSON. If batch == True, return the dictionary that would be used in a batch command. """ if batch: ret = {"method": http_verb, "path": uri.split("parse.com", 1)[1]} if kw: ret["body"] = kw return ret if not ('app_id' in ACCESS_KEYS and 'rest_key' in ACCESS_KEYS): raise core.ParseError('Missing connection credentials') app_id = ACCESS_KEYS.get('app_id') rest_key = ACCESS_KEYS.get('rest_key') master_key = ACCESS_KEYS.get('master_key') headers = extra_headers or {} url = uri if uri.startswith(API_ROOT) else cls.ENDPOINT_ROOT + uri data = kw and json.dumps(kw) or "{}" if http_verb == 'GET' and data: url += '?%s' % urlencode(kw) data = None else: data = data.encode('utf-8') request = Request(url, data, headers) request.add_header('Content-type', 'application/json') request.add_header('X-Parse-Application-Id', app_id) request.add_header('X-Parse-REST-API-Key', rest_key) if master_key and 'X-Parse-Session-Token' not in headers.keys(): request.add_header('X-Parse-Master-Key', master_key) request.get_method = lambda: http_verb try: response = urlopen(request) except HTTPError as e: exc = { 400: core.ResourceRequestBadRequest, 401: core.ResourceRequestLoginRequired, 403: core.ResourceRequestForbidden, 404: core.ResourceRequestNotFound }.get(e.code, core.ParseError) raise exc(e.read()) return json.loads(response.read().decode('utf-8'))
def set_genomespace_format_identifiers(url_opener, dm_site):
    gs_request = Request("%s/%s/dataformat/list" %
                         (dm_site, GENOMESPACE_API_VERSION_STRING))
    gs_request.get_method = lambda: 'GET'
    opened_gs_request = url_opener.open(gs_request)
    genomespace_formats = json.loads(opened_gs_request.read())
    for format in genomespace_formats:
        GENOMESPACE_FORMAT_IDENTIFIER_TO_GENOMESPACE_EXT[
            format['url']] = format['name']
    global GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN
    GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN = dict(
        (x[1], x[0])
        for x in GENOMESPACE_FORMAT_IDENTIFIER_TO_GENOMESPACE_EXT.items()).get(
            GENOMESPACE_UNKNOWN_FORMAT_KEY, GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN)
def rest_api_request(token, method, api_cmd, api_cmd_payload=None, timeout=10): """ Make a rest-api request Returns: response as a dictionary """ api_cmd_headers = dict() api_cmd_headers['Content-type'] = "application/json" api_cmd_headers['User-Agent'] = "cert-mon/1.0" try: request_info = Request(api_cmd) request_info.get_method = lambda: method if token: request_info.add_header("X-Auth-Token", token.get_id()) request_info.add_header("Accept", "application/json") if api_cmd_headers is not None: for header_type, header_value in api_cmd_headers.items(): request_info.add_header(header_type, header_value) if api_cmd_payload is not None: request_info.add_data(api_cmd_payload) request = None try: request = urlopen(request_info, timeout=timeout) response = request.read() finally: if request: request.close() if response == "": response = json.loads("{}") else: response = json.loads(response) except HTTPError as e: if 401 == e.code: if token: token.set_expired() raise except URLError: LOG.error("Cannot access %s" % api_cmd) raise return response
def upload(self, filename):
    if not self.cdash_upload_url:
        return

    # Compute md5 checksum for the contents of this file.
    md5sum = checksum(hashlib.md5, filename, block_size=8192)

    opener = build_opener(HTTPHandler)
    with open(filename, 'rb') as f:
        url = "{0}&MD5={1}".format(self.cdash_upload_url, md5sum)
        request = Request(url, data=f)
        request.add_header('Content-Type', 'text/xml')
        request.add_header('Content-Length', os.path.getsize(filename))
        # By default, urllib2 only supports GET and POST.
        # CDash expects this file to be uploaded via PUT.
        request.get_method = lambda: 'PUT'
        url = opener.open(request)
def create_directory(url_opener, directory_dict, new_dir, dm_url):
    payload = {"isDirectory": True}
    for dir_slice in new_dir:
        if dir_slice in ('', '/', None):
            continue
        url = '/'.join(
            (directory_dict['url'], quote(dir_slice.replace('/', '_'), safe='')))
        new_dir_request = Request(url, headers={
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        }, data=json.dumps(payload))
        new_dir_request.get_method = lambda: 'PUT'
        directory_dict = json.loads(url_opener.open(new_dir_request).read())
    return directory_dict
def try_del(httpd, querystr):
    """Try DEL calls to the server."""
    num_requests["del_handler"] = 0
    opener = build_opener(HTTPHandler)
    request = Request(httpd_url(httpd, "/api/resource/1", querystr))
    request.get_method = lambda: "DEL"
    f = opener.open(request)

    assert f.getcode() == 200
    assert json.loads(f.read()) == {
        "called": 1,
        "id": "1",
        "query": querystr
    }
    assert num_requests["del_handler"] == 1
def populate_buildgroup(job_names, group_name, project, site, credentials, cdash_url): url = "{0}/api/v1/buildgroup.php".format(cdash_url) headers = { 'Authorization': 'Bearer {0}'.format(credentials), 'Content-Type': 'application/json', } opener = build_opener(HTTPHandler) parent_group_id = _create_buildgroup(opener, headers, url, project, group_name, 'Daily') group_id = _create_buildgroup(opener, headers, url, project, 'Latest {0}'.format(group_name), 'Latest') if not parent_group_id or not group_id: msg = 'Failed to create or retrieve buildgroups for {0}'.format( group_name) raise SpackError(msg) data = { 'project': project, 'buildgroupid': group_id, 'dynamiclist': [{ 'match': name, 'parentgroupid': parent_group_id, 'site': site } for name in job_names] } enc_data = json.dumps(data).encode('utf-8') request = Request(url, data=enc_data, headers=headers) request.get_method = lambda: 'PUT' response = opener.open(request) response_code = response.getcode() if response_code != 200: msg = 'Error response code ({0}) in populate_buildgroup'.format( response_code) raise SpackError(msg)
def recurse_directory_dict( url_opener, cur_options, url ): cur_directory = Request( url, headers={ 'Content-Type': 'application/json', 'Accept': 'application/json, text/plain' } ) cur_directory.get_method = lambda: 'GET' # get url to upload to try: cur_directory = url_opener.open( cur_directory ).read() except HTTPError as e: log.debug( 'GenomeSpace export tool failed reading a directory "%s": %s' % ( url, e ) ) return # bad url, go to next cur_directory = json.loads( cur_directory ) directory = cur_directory.get( 'directory', {} ) contents = cur_directory.get( 'contents', [] ) if directory.get( 'isDirectory', False ): selected = directory.get( 'path' ) == value cur_options.append( { 'name': directory.get( 'name' ), 'value': directory.get( 'path'), 'options': [], 'selected': selected } ) for sub_dir in contents: if sub_dir.get( 'isDirectory', False ): recurse_directory_dict( url_opener, cur_options[-1]['options'], sub_dir.get( 'url' ) )
def http_request(self, url, method, data="", headers=None, timeout=None):
    if url[0:7].lower() != "http://":
        url = "http://%s" % url
    if hasattr(self, 'logger') and self.logger is not None:
        self.logger.debug("Sending http request. Url: %s, Data: %s, Headers: %s"
                          % (url, str(data), str(headers)))
    req = Request(url, data, headers)
    req.get_method = lambda: method
    # The timeout parameter in urllib2.urlopen has strange behavior, and
    # seems to raise errors when set to a number. Using an opener works however.
    opener = build_opener()
    if timeout is None:
        response = opener.open(req)
    else:
        response = opener.open(req, timeout=timeout)
    return response
def http_call(method, url, data=None):
    """Utility method for making HTTP requests."""
    LOG.debug("http_call(): Calling %s %s" % (method, url))
    opener = build_opener(HTTPHandler)
    if data:
        data = simplejson.dumps(data)
        LOG.debug("http_call(): With body: %s" % data)
    request = Request(url, data)
    request.add_header('Accept', 'application/json')
    if data:
        request.add_header('Content-Type', 'application/json')
    request.get_method = lambda: method
    resp = opener.open(request)
    if resp.getcode() >= 400:
        raise exceptions.RomanaException("Error in %s %s with payload %s: %s",
                                         method, url, data, resp)
    body = resp.read()
    data = simplejson.loads(body)
    return data
def get_directory( url_opener, dm_url, path ):
    url = dm_url
    i = None
    dir_dict = {}
    for i, sub_path in enumerate( path ):
        url = "%s/%s" % ( url, sub_path )
        dir_request = Request( url, headers={ 'Content-Type': 'application/json', 'Accept': 'application/json' } )
        dir_request.get_method = lambda: 'GET'
        try:
            dir_dict = json.loads( url_opener.open( dir_request ).read() )
        except HTTPError:
            # print "e", e, url
            # punting, assuming lack of permissions at this low of a level...
            continue
        break
    if i is not None:
        path = path[i + 1:]
    else:
        path = []
    return ( dir_dict, path )
def try_del(self, server_port, querystr):
    self.resource_del_called = 0
    opener = build_opener(HTTPHandler)
    request = Request(
        self.get_url('/api/resource/1', server_port, querystr))
    request.get_method = lambda: 'DEL'
    f = opener.open(request)

    try:
        self.assertEqual(f.getcode(), 200)
    except AttributeError:
        pass  # python 2.4
    self.assertEqual(json.loads(f.read()), {
        'called': 1,
        'id': str(1),
        'query': querystr
    })
    self.assertEqual(self.resource_del_called, 1)
def revoke_token(cls, token, uid):
    if not cls.REVOKE_TOKEN_URL:
        return
    url = cls.REVOKE_TOKEN_URL.format(token=token, uid=uid)
    params = cls.revoke_token_params(token, uid) or {}
    headers = cls.revoke_token_headers(token, uid) or {}
    data = None

    if cls.REVOKE_TOKEN_METHOD == 'GET':
        url = '{}?{}'.format(url, urlencode(params))
    else:
        data = urlencode(params)

    request = Request(url, data=data, headers=headers)
    if cls.REVOKE_TOKEN_METHOD.lower() not in ('get', 'post'):
        # Patch get_method to return the needed method
        request.get_method = lambda: cls.REVOKE_TOKEN_METHOD
    response = dsa_urlopen(request)
    return cls.process_revoke_token_response(response)
def revoke_token(cls, token, uid):
    if not cls.REVOKE_TOKEN_URL:
        return
    url = cls.REVOKE_TOKEN_URL.format(token=token, uid=uid)
    params = cls.revoke_token_params(token, uid) or {}
    headers = cls.revoke_token_headers(token, uid) or {}
    data = None

    if cls.REVOKE_TOKEN_METHOD == "GET":
        url = u"{}?{}".format(url, urlencode(params))
    else:
        data = urlencode(params)

    request = Request(url, data=data, headers=headers)
    if cls.REVOKE_TOKEN_METHOD.lower() not in ("get", "post"):
        # Patch get_method to return the needed method
        request.get_method = lambda: cls.REVOKE_TOKEN_METHOD
    response = dsa_urlopen(request)
    return cls.process_revoke_token_response(response)
def test_nonexistent_resources(httpd_no_urlhandlers):
    # GET: Return 404 for non-existent endpoint
    with pytest.raises(HTTPError) as excinfo:
        urlopen(httpd_url(httpd_no_urlhandlers, "/api/resource/"))
    assert excinfo.value.code == 404

    # POST: POST should also return 404
    with pytest.raises(HTTPError) as excinfo:
        urlopen(httpd_url(httpd_no_urlhandlers, "/api/resource/"),
                data=json.dumps({}))
    assert excinfo.value.code == 404

    # DEL: DEL should also return 404
    opener = build_opener(HTTPHandler)
    request = Request(httpd_url(httpd_no_urlhandlers, "/api/resource/"))
    request.get_method = lambda: "DEL"
    with pytest.raises(HTTPError) as excinfo:
        opener.open(request)
    assert excinfo.value.code == 404
def request(url, data=None, headers=None, params=None):
    u'''Simple HTTP Client'''
    if params is not None:
        query = urlencode(params)
        url = '%s?%s' % (url, query)
    req = Request(url, headers=headers)
    if data is not None:
        req.add_data(data)
    try:
        logging.debug("%s %s", req.get_method(), url)
        res = urlopen(req)
        return json.loads(res.read())
    except HTTPError as err:
        logging.error("%s. Client error GET %s with status %d.",
                      err.reason, url, err.code)
    except URLError as err:
        logging.exception(err)
    except (ValueError, TypeError) as err:
        logging.error(err)
    return None
def wait_for_spark_workers(num_of_expected_workers, timeout):
    """
    This queries the spark master and checks for the expected number of workers
    """
    start_time = time.time()
    while True:
        opener = build_opener(HTTPHandler)
        request = Request("http://{0}:7080".format(CASSANDRA_IP))
        request.get_method = lambda: 'GET'
        connection = opener.open(request)
        match = re.search(r'Alive Workers:.*(\d+)</li>',
                          connection.read().decode('utf-8'))
        num_workers = int(match.group(1))
        if num_workers == num_of_expected_workers:
            match = True
            break
        elif time.time() - start_time > timeout:
            # Timed out before the expected number of workers appeared.
            match = False
            break
        time.sleep(1)
    return match
def test_nonexistent_resources(self):
    # Create a server with a placeholder handler so we don't fall back
    # to serving local files
    httpd = mozhttpd.MozHttpd(port=0)
    httpd.start(block=False)
    server_port = httpd.httpd.server_port

    # GET: Return 404 for non-existent endpoint
    exception_thrown = False
    try:
        urlopen(self.get_url('/api/resource/', server_port, None))
    except HTTPError as e:
        self.assertEqual(e.code, 404)
        exception_thrown = True
    self.assertTrue(exception_thrown)

    # POST: POST should also return 404
    exception_thrown = False
    try:
        urlopen(
            self.get_url('/api/resource/', server_port, None),
            data=json.dumps({}),
        )
    except HTTPError as e:
        self.assertEqual(e.code, 404)
        exception_thrown = True
    self.assertTrue(exception_thrown)

    # DEL: DEL should also return 404
    exception_thrown = False
    try:
        opener = build_opener(HTTPHandler)
        request = Request(self.get_url('/api/resource/', server_port, None))
        request.get_method = lambda: 'DEL'
        opener.open(request)
    except HTTPError as e:
        self.assertEqual(e.code, 404)
        exception_thrown = True
    self.assertTrue(exception_thrown)
def invoke_storlet_on_copy_dest(self):
    # No COPY in swiftclient. Using urllib instead...
    url = '%s/%s/%s' % (self.url, self.container, self.storlet_file)
    headers = {'X-Auth-Token': self.token,
               'X-Run-Storlet': self.storlet_name,
               'X-Object-Meta-Name': 'thumbnail',
               'Destination': '%s/gen_thumb_on_copy_.jpg' % self.container}
    headers.update(self.additional_headers)
    req = Request(url, headers=headers)
    req.get_method = lambda: 'COPY'
    conn = urlopen(req, timeout=10)
    status = conn.getcode()
    self.assertIn(status, [201, 202])

    headers = c.head_object(self.url, self.token,
                            self.container,
                            'gen_thumb_on_copy_.jpg')
    self.assertLess(int(headers['content-length']), 1087318)
    self.assertEqual('thumbnail', headers['x-object-meta-name'])
    self.assertTrue('x-object-meta-x-timestamp' not in headers)
    self.assertTrue('x-timestamp' in headers)
def upload(self, filename): if not self.cdash_upload_url: return # Compute md5 checksum for the contents of this file. md5sum = checksum(hashlib.md5, filename, block_size=8192) opener = build_opener(HTTPHandler) with open(filename, 'rb') as f: params_dict = { 'build': self.buildname, 'site': self.site, 'stamp': self.buildstamp, 'MD5': md5sum, } encoded_params = urlencode(params_dict) url = "{0}&{1}".format(self.cdash_upload_url, encoded_params) request = Request(url, data=f) request.add_header('Content-Type', 'text/xml') request.add_header('Content-Length', os.path.getsize(filename)) if self.authtoken: request.add_header('Authorization', 'Bearer {0}'.format(self.authtoken)) try: # By default, urllib2 only support GET and POST. # CDash needs expects this file to be uploaded via PUT. request.get_method = lambda: 'PUT' response = opener.open(request) if self.current_package_name not in self.buildIds: resp_value = response.read() if isinstance(resp_value, bytes): resp_value = resp_value.decode('utf-8') match = self.buildid_regexp.search(resp_value) if match: buildid = match.group(1) self.buildIds[self.current_package_name] = buildid except Exception as e: print("Upload to CDash failed: {0}".format(e))
def download_from_genomespace_file_browser( json_parameter_file, genomespace_site, gs_toolname ): json_params = json.loads( open( json_parameter_file, 'r' ).read() ) datasource_params = json_params.get( 'param_dict' ) username = datasource_params.get( "gs-username", None ) token = datasource_params.get( "gs-token", None ) assert None not in [ username, token ], "Missing GenomeSpace username or token." output_filename = datasource_params.get( "output", None ) dataset_id = json_params['output_data'][0]['dataset_id'] hda_id = json_params['output_data'][0]['hda_id'] url_opener = get_cookie_opener( username, token, gs_toolname=gs_toolname ) # load and set genomespace format ids to galaxy exts genomespace_site_dict = get_genomespace_site_urls()[ genomespace_site ] set_genomespace_format_identifiers( url_opener, genomespace_site_dict['dmServer'] ) file_url_prefix = "fileUrl" file_type_prefix = "fileFormat" metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' ) # setup datatypes registry for sniffing datatypes_registry = Registry() datatypes_registry.load_datatypes( root_dir=json_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config=json_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] ) file_numbers = [] for name in datasource_params.keys(): if name.startswith( file_url_prefix ): name = name[len( file_url_prefix ):] file_numbers.append( int( name ) ) if not file_numbers: if output_filename: open( output_filename, 'wb' ) # erase contents of file raise Exception( "You must select at least one file to import into Galaxy." ) file_numbers.sort() used_filenames = [] for file_num in file_numbers: url_key = "%s%i" % ( file_url_prefix, file_num ) download_url = datasource_params.get( url_key, None ) if download_url is None: break filetype_key = "%s%i" % ( file_type_prefix, file_num ) filetype_url = datasource_params.get( filetype_key, None ) galaxy_ext = get_galaxy_ext_from_genomespace_format_url( url_opener, filetype_url ) formatted_download_url = "%s?%s" % ( download_url, urlencode( [ ( 'dataformat', filetype_url ) ] ) ) new_file_request = Request( formatted_download_url ) new_file_request.get_method = lambda: 'GET' target_download_url = url_opener.open( new_file_request ) filename = None if 'Content-Disposition' in target_download_url.info(): # If the response has Content-Disposition, try to get filename from it content_disposition = dict( x.strip().split('=') if '=' in x else ( x.strip(), '' ) for x in target_download_url.info()['Content-Disposition'].split( ';' ) ) if 'filename' in content_disposition: filename = content_disposition[ 'filename' ].strip( "\"'" ) if not filename: parsed_url = urlparse( download_url ) filename = unquote_plus( parsed_url[2].split( '/' )[-1] ) if not filename: filename = download_url metadata_dict = None original_filename = filename if output_filename is None: filename = ''.join( c in FILENAME_VALID_CHARS and c or '-' for c in filename ) while filename in used_filenames: filename = "-%s" % filename used_filenames.append( filename ) output_filename = os.path.join( os.getcwd(), 'primary_%i_%s_visible_%s' % ( hda_id, filename, galaxy_ext ) ) metadata_dict = dict( type='new_primary_dataset', base_dataset_id=dataset_id, ext=galaxy_ext, filename=output_filename, name="GenomeSpace import on %s" % ( original_filename ) ) else: if dataset_id is not None: metadata_dict = dict( type='dataset', dataset_id=dataset_id, ext=galaxy_ext, name="GenomeSpace import on %s" % ( filename ) ) output_file = open( output_filename, 'wb' ) 
chunk_write( target_download_url, output_file ) output_file.close() if ( galaxy_ext == AUTO_GALAXY_EXT or filetype_url == GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN ) and metadata_dict: # try to sniff datatype try: galaxy_ext = sniff.handle_uploaded_dataset_file( output_filename, datatypes_registry ) except: # sniff failed galaxy_ext = original_filename.rsplit( '.', 1 )[-1] if galaxy_ext not in datatypes_registry.datatypes_by_extension: galaxy_ext = DEFAULT_GALAXY_EXT metadata_dict[ 'ext' ] = galaxy_ext output_filename = None # only have one filename available # write out metadata info if metadata_dict: metadata_parameter_file.write( "%s\n" % json.dumps( metadata_dict ) ) metadata_parameter_file.close() return True
def send_file_to_genomespace( genomespace_site, username, token, source_filename, target_directory, target_filename, file_type, content_type, log_filename, gs_toolname ): target_filename = target_filename.replace( '/', '-' ) # Slashes no longer allowed in filenames url_opener = get_cookie_opener( username, token, gs_toolname=gs_toolname ) genomespace_site_dict = get_genomespace_site_urls()[ genomespace_site ] dm_url = genomespace_site_dict['dmServer'] # get default directory if target_directory and target_directory[0] == '/': directory_dict, target_directory = get_directory( url_opener, dm_url, [ "%s/%s/%s" % ( GENOMESPACE_API_VERSION_STRING, 'file', target_directory[1] ) ] + target_directory[2:] ) directory_dict = directory_dict['directory'] else: directory_dict = get_personal_directory( url_opener, dm_url )['directory'] # this is the base for the auto-generated galaxy export directories # what directory to stuff this in target_directory_dict = create_directory( url_opener, directory_dict, target_directory, dm_url ) content_length = os.path.getsize( source_filename ) input_file = open( source_filename, 'rb' ) if content_length > TARGET_SIMPLE_PUT_UPLOAD_SIZE: # Determine sizes of each part. split_count = content_length / TARGET_SPLIT_SIZE last_size = content_length - ( split_count * TARGET_SPLIT_SIZE ) sizes = [ TARGET_SPLIT_SIZE ] * split_count if last_size: if last_size < MIN_MULTIPART_UPLOAD_SIZE: if sizes: sizes[-1] = sizes[-1] + last_size else: sizes = [ last_size ] else: sizes.append( last_size ) print("Performing multi-part upload in %i parts." % ( len( sizes ) )) # get upload url upload_url = "uploadinfo" upload_url = "%s/%s/%s%s/%s" % ( dm_url, GENOMESPACE_API_VERSION_STRING, upload_url, target_directory_dict['path'], quote( target_filename, safe='' ) ) upload_request = Request( upload_url, headers={ 'Content-Type': 'application/json', 'Accept': 'application/json' } ) upload_request.get_method = lambda: 'GET' upload_info = json.loads( url_opener.open( upload_request ).read() ) conn = S3Connection( aws_access_key_id=upload_info['amazonCredentials']['accessKey'], aws_secret_access_key=upload_info['amazonCredentials']['secretKey'], security_token=upload_info['amazonCredentials']['sessionToken'] ) # Cannot use conn.get_bucket due to permissions, manually create bucket object bucket = boto.s3.bucket.Bucket( connection=conn, name=upload_info['s3BucketName'] ) mp = bucket.initiate_multipart_upload( upload_info['s3ObjectKey'] ) for i, part_size in enumerate( sizes, start=1 ): fh = tempfile.TemporaryFile( 'wb+' ) while part_size: if CHUNK_SIZE > part_size: read_size = part_size else: read_size = CHUNK_SIZE chunk = input_file.read( read_size ) fh.write( chunk ) part_size = part_size - read_size fh.flush() fh.seek(0) mp.upload_part_from_file( fh, i ) fh.close() upload_result = mp.complete_upload() else: print('Performing simple put upload.') upload_url = "uploadurl" content_md5 = hashlib.md5() chunk_write( input_file, content_md5, target_method="update" ) input_file.seek( 0 ) # back to start, for uploading upload_params = { 'Content-Length': content_length, 'Content-MD5': base64.standard_b64encode( content_md5.digest() ), 'Content-Type': content_type } upload_url = "%s/%s/%s%s/%s?%s" % ( dm_url, GENOMESPACE_API_VERSION_STRING, upload_url, target_directory_dict['path'], quote( target_filename, safe='' ), urlencode( upload_params ) ) new_file_request = Request( upload_url ) # , headers = { 'Content-Type': 'application/json', 'Accept': 'application/text' } ) #apparently 
http://www.genomespace.org/team/specs/updated-dm-rest-api:"Every HTTP request to the Data Manager should include the Accept header with a preference for the media types application/json and application/text." is not correct new_file_request.get_method = lambda: 'GET' # get url to upload to target_upload_url = url_opener.open( new_file_request ).read() # upload file to determined url upload_headers = dict( upload_params ) # upload_headers[ 'x-amz-meta-md5-hash' ] = content_md5.hexdigest() upload_headers[ 'Accept' ] = 'application/json' upload_file_request = Request( target_upload_url, headers=upload_headers, data=input_file ) upload_file_request.get_method = lambda: 'PUT' upload_result = urlopen( upload_file_request ).read() result_url = "%s/%s" % ( target_directory_dict['url'], quote( target_filename, safe='' ) ) # determine available gs launch apps web_tools = get_genome_space_launch_apps( genomespace_site_dict['atmServer'], url_opener, result_url, file_type ) if log_filename: log_file = open( log_filename, 'wb' ) log_file.write( "<html><head><title>File uploaded to GenomeSpace from Galaxy</title></head><body>\n" ) log_file.write( '<p>Uploaded <a href="%s">%s/%s</a> to GenomeSpace.</p>\n' % ( result_url, target_directory_dict['path'], target_filename ) ) if web_tools: log_file.write( "<p>You may open this file directly in the following applications:</p>\n" ) log_file.write( '<p><ul>\n' ) for web_tool in web_tools: log_file.write( '<li><a href="%s">%s</a></li>\n' % ( web_tool ) ) log_file.write( '</p></ul>\n' ) else: log_file.write( '<p>There are no GenomeSpace applications available for file type: %s</p>\n' % ( file_type ) ) log_file.write( "</body></html>\n" ) return upload_result
def builtwith(url, headers=None, html=None, user_agent='builtwith'): """Detect the technology used to build a website FIXME: test data (maybe compare against node wappalyzer-cli)? """ techs = {} # check URL for app_name, app_spec in data['apps'].items(): if 'url' in app_spec: if contains(url, app_spec['url']): add_app(techs, app_name, app_spec) # download content if None in (headers, html): try: request = Request(url, None, {'User-Agent': user_agent}) if html: # already have HTML so just need to make HEAD request for headers request.get_method = lambda : 'HEAD' response = urlopen(request) if headers is None: headers = response.headers if html is None: html = response.read().decode('utf-8') except Exception as e: print('Error:', e) request = None # check headers if headers: for app_name, app_spec in data['apps'].items(): if 'headers' in app_spec: if contains_dict(headers, app_spec['headers']): add_app(techs, app_name, app_spec) # check html if html: # node version only looks in script tag itself script_tags = RE_SCRIPTS.findall(html) + RE_LINKS.findall(html) for app_name, app_spec in data['apps'].items(): for s_tag in script_tags: snippets = app_spec.get('script', []) if not isinstance(snippets, list): snippets = [snippets] for snippet in snippets: if contains(s_tag, snippet): add_app(techs, app_name, app_spec) break snippets = app_spec.get('html', []) if not isinstance(snippets, list): snippets = [snippets] for snippet in snippets: if contains(html, snippet): add_app(techs, app_name, app_spec) break # check meta # XXX add proper meta data parsing metas = dict(RE_META.findall(html)) for app_name, app_spec in data['apps'].items(): for name, content in app_spec.get('meta', {}).items(): if name in metas: if contains(metas[name], content): add_app(techs, app_name, app_spec) break return techs
def execute(cls, uri, http_verb, extra_headers=None, batch=False, _body=None, **kw): """ if batch == False, execute a command with the given parameters and return the response JSON. If batch == True, return the dictionary that would be used in a batch command. """ if batch: urlsplitter = urlparse(API_ROOT).netloc ret = {"method": http_verb, "path": uri.split(urlsplitter, 1)[1]} if kw: ret["body"] = kw return ret if not ('app_id' in ACCESS_KEYS and 'rest_key' in ACCESS_KEYS): raise core.ParseError('Missing connection credentials') app_id = ACCESS_KEYS.get('app_id') rest_key = ACCESS_KEYS.get('rest_key') master_key = ACCESS_KEYS.get('master_key') url = uri if uri.startswith(API_ROOT) else cls.ENDPOINT_ROOT + uri if _body is None: data = kw and json.dumps(kw, default=date_handler) or "{}" else: data = _body if http_verb == 'GET' and data: url += '?%s' % urlencode(kw) data = None else: if cls.__name__ == 'File': data = data else: data = data.encode('utf-8') headers = { 'Content-type': 'application/json', 'X-Parse-Application-Id': app_id, 'X-Parse-REST-API-Key': rest_key } headers.update(extra_headers or {}) if cls.__name__ == 'File': #request = Request(url.encode('utf-8'), data, headers) request = Request(url, data, headers) else: request = Request(url, data, headers) if ACCESS_KEYS.get('session_token'): request.add_header('X-Parse-Session-Token', ACCESS_KEYS.get('session_token')) elif master_key: request.add_header('X-Parse-Master-Key', master_key) request.get_method = lambda: http_verb try: response = urlopen(request, timeout=CONNECTION_TIMEOUT) except HTTPError as e: exc = { 400: core.ResourceRequestBadRequest, 401: core.ResourceRequestLoginRequired, 403: core.ResourceRequestForbidden, 404: core.ResourceRequestNotFound }.get(e.code, core.ParseError) raise exc(e.read()) return json.loads(response.read().decode('utf-8'))
def _spider(url, visited, root, depth, max_depth, raise_on_error): """Fetches URL and any pages it links to up to max_depth. depth should initially be zero, and max_depth is the max depth of links to follow from the root. Prints out a warning only if the root can't be fetched; it ignores errors with pages that the root links to. Returns a tuple of: - pages: dict of pages visited (URL) mapped to their full text. - links: set of links encountered while visiting the pages. """ pages = {} # dict from page URL -> text content. links = set() # set of all links seen on visited pages. # root may end with index.html -- chop that off. if root.endswith('/index.html'): root = re.sub('/index.html$', '', root) try: context = None verify_ssl = spack.config.get('config:verify_ssl') pyver = sys.version_info if (pyver < (2, 7, 9) or (3,) < pyver < (3, 4, 3)): if verify_ssl: tty.warn("Spack will not check SSL certificates. You need to " "update your Python to enable certificate " "verification.") elif verify_ssl: # We explicitly create default context to avoid error described in # https://blog.sucuri.net/2016/03/beware-unverified-tls-certificates-php-python.html context = ssl.create_default_context() else: context = ssl._create_unverified_context() # Make a HEAD request first to check the content type. This lets # us ignore tarballs and gigantic files. # It would be nice to do this with the HTTP Accept header to avoid # one round-trip. However, most servers seem to ignore the header # if you ask for a tarball with Accept: text/html. req = Request(url) req.get_method = lambda: "HEAD" resp = _urlopen(req, timeout=_timeout, context=context) if "Content-type" not in resp.headers: tty.debug("ignoring page " + url) return pages, links if not resp.headers["Content-type"].startswith('text/html'): tty.debug("ignoring page " + url + " with content type " + resp.headers["Content-type"]) return pages, links # Do the real GET request when we know it's just HTML. req.get_method = lambda: "GET" response = _urlopen(req, timeout=_timeout, context=context) response_url = response.geturl() # Read the page and and stick it in the map we'll return page = response.read().decode('utf-8') pages[response_url] = page # Parse out the links in the page link_parser = LinkParser() subcalls = [] link_parser.feed(page) while link_parser.links: raw_link = link_parser.links.pop() abs_link = urljoin(response_url, raw_link.strip()) links.add(abs_link) # Skip stuff that looks like an archive if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES): continue # Skip things outside the root directory if not abs_link.startswith(root): continue # Skip already-visited links if abs_link in visited: continue # If we're not at max depth, follow links. if depth < max_depth: subcalls.append((abs_link, visited, root, depth + 1, max_depth, raise_on_error)) visited.add(abs_link) if subcalls: pool = NonDaemonPool(processes=len(subcalls)) try: results = pool.map(_spider_wrapper, subcalls) for sub_pages, sub_links in results: pages.update(sub_pages) links.update(sub_links) finally: pool.terminate() pool.join() except URLError as e: tty.debug(e) if hasattr(e, 'reason') and isinstance(e.reason, ssl.SSLError): tty.warn("Spack was unable to fetch url list due to a certificate " "verification problem. You can try running spack -k, " "which will not check SSL certificates. Use this at your " "own risk.") if raise_on_error: raise NoNetworkConnectionError(str(e), url) except HTMLParseError as e: # This error indicates that Python's HTML parser sucks. 
msg = "Got an error parsing HTML." # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing. if sys.version_info[:3] < (2, 7, 3): msg += " Use Python 2.7.3 or newer for better HTML parsing." tty.warn(msg, url, "HTMLParseError: " + str(e)) except Exception as e: # Other types of errors are completely ignored, except in debug mode. tty.debug("Error in _spider: %s:%s" % (type(e), e), traceback.format_exc()) return pages, links
def download_from_genomespace_importer(username, token, json_parameter_file, genomespace_site, gs_toolname): json_params = json.loads(open(json_parameter_file, 'r').read()) datasource_params = json_params.get('param_dict') assert None not in [username, token], "Missing GenomeSpace username or token." output_filename = datasource_params.get("output_file1", None) dataset_id = base_dataset_id = json_params['output_data'][0]['dataset_id'] hda_id = json_params['output_data'][0]['hda_id'] url_opener = get_cookie_opener(username, token, gs_toolname=gs_toolname) # load and set genomespace format ids to galaxy exts genomespace_site_dict = get_genomespace_site_urls()[genomespace_site] set_genomespace_format_identifiers(url_opener, genomespace_site_dict['dmServer']) file_url_name = "URL" metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb') # setup datatypes registry for sniffing datatypes_registry = Registry() datatypes_registry.load_datatypes( root_dir=json_params['job_config']['GALAXY_ROOT_DIR'], config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE']) url_param = datasource_params.get(file_url_name, None) used_filenames = [] for download_url in url_param.split(','): using_temp_file = False parsed_url = urlparse(download_url) query_params = parse_qs(parsed_url[4]) # write file to disk new_file_request = Request(download_url) new_file_request.get_method = lambda: 'GET' target_download_url = url_opener.open(new_file_request) filename = None if 'Content-Disposition' in target_download_url.info(): content_disposition = dict( x.strip().split('=') if '=' in x else (x.strip(), '') for x in target_download_url.info()['Content-Disposition'].split(';')) if 'filename' in content_disposition: filename = content_disposition['filename'].strip("\"'") if not filename: parsed_url = urlparse(download_url) query_params = parse_qs(parsed_url[4]) filename = unquote_plus(parsed_url[2].split('/')[-1]) if not filename: filename = download_url if output_filename is None: # need to use a temp file here, because we do not know the ext yet using_temp_file = True output_filename = tempfile.NamedTemporaryFile( prefix='tmp-genomespace-importer-').name output_file = open(output_filename, 'wb') chunk_write(target_download_url, output_file) output_file.close() # determine file format file_type = None if 'dataformat' in query_params: # this is a converted dataset file_type = query_params['dataformat'][0] file_type = get_galaxy_ext_from_genomespace_format_url( url_opener, file_type) else: try: # get and use GSMetadata object download_file_path = download_url.split( "%s/file/" % (genomespace_site_dict['dmServer']), 1 )[-1] # FIXME: This is a very bad way to get the path for determining metadata. 
There needs to be a way to query API using download URLto get to the metadata object metadata_request = Request( "%s/%s/filemetadata/%s" % (genomespace_site_dict['dmServer'], GENOMESPACE_API_VERSION_STRING, download_file_path)) metadata_request.get_method = lambda: 'GET' metadata_url = url_opener.open(metadata_request) file_metadata_dict = json.loads(metadata_url.read()) metadata_url.close() file_type = file_metadata_dict.get('dataFormat', None) if file_type and file_type.get('url'): file_type = file_type.get('url') file_type = get_galaxy_ext_from_genomespace_format_url( url_opener, file_type, default=None) except: pass if file_type is None: # try to sniff datatype try: file_type = sniff.handle_uploaded_dataset_file( output_filename, datatypes_registry) except: pass # sniff failed if file_type is None and '.' in parsed_url[2]: # still no known datatype, fall back to using extension file_type = parsed_url[2].rsplit('.', 1)[-1] file_type = GENOMESPACE_EXT_TO_GALAXY_EXT.get(file_type, file_type) if file_type is None: # use default extension (e.g. 'data') file_type = DEFAULT_GALAXY_EXT # save json info for single primary dataset if dataset_id is not None: metadata_parameter_file.write("%s\n" % json.dumps( dict(type='dataset', dataset_id=dataset_id, ext=file_type, name="GenomeSpace importer on %s" % (filename)))) # if using tmp file, move the file to the new file path dir to get scooped up later if using_temp_file: original_filename = filename filename = ''.join(c in FILENAME_VALID_CHARS and c or '-' for c in filename) while filename in used_filenames: filename = "-%s" % filename used_filenames.append(filename) target_output_filename = os.path.join( os.getcwd(), 'primary_%i_%s_visible_%s' % (hda_id, filename, file_type)) shutil.move(output_filename, target_output_filename) metadata_parameter_file.write("%s\n" % json.dumps( dict(type='new_primary_dataset', base_dataset_id=base_dataset_id, ext=file_type, filename=target_output_filename, name="GenomeSpace importer on %s" % (original_filename)))) dataset_id = None # only one primary dataset available output_filename = None # only have one filename available metadata_parameter_file.close() return True
def rest_api_request(token, method, api_cmd, api_cmd_headers=None, api_cmd_payload=None, timeout=10): """ Make a rest-api request Returns: response as a dictionary """ # signal.signal(signal.SIGALRM, _timeout_handler) # if hasattr(signal, 'SIGALRM'): # signal.alarm(timeout) LOG.info("%s cmd:%s hdr:%s payload:%s" % (method, api_cmd, api_cmd_headers, api_cmd_payload)) response = None try: request_info = Request(api_cmd) request_info.get_method = lambda: method if token: request_info.add_header("X-Auth-Token", token.get_id()) request_info.add_header("Accept", "application/json") if api_cmd_headers is not None: for header_type, header_value in api_cmd_headers.items(): request_info.add_header(header_type, header_value) if api_cmd_payload is not None: request_info.add_data(api_cmd_payload) request = urlopen(request_info, timeout=timeout) response = request.read() if response == "": response = json.loads("{}") else: response = json.loads(response) request.close() LOG.info("Response=%s" % response) except HTTPError as e: if 401 == e.code: if token: token.set_expired() LOG.warn("HTTP Error e.code=%s e=%s" % (e.code, e)) if hasattr(e, 'msg') and e.msg: response = json.loads(e.msg) else: response = json.loads("{}") LOG.info("HTTPError response=%s" % (response)) raise OpenStackRestAPIException(e.message, e.code, "%s" % e) except URLError as e: LOG.warn("URLError Error e=%s" % (e)) raise OpenStackException(e.message, "%s" % e) except si_exception.SysInvSignalTimeout as e: LOG.warn("Timeout Error e=%s" % (e)) raise OpenStackException(e.message, "%s" % e) finally: signal.alarm(0) return response
def _spider(url, visited, root, depth, max_depth, raise_on_error): """Fetches URL and any pages it links to up to max_depth. depth should initially be zero, and max_depth is the max depth of links to follow from the root. Prints out a warning only if the root can't be fetched; it ignores errors with pages that the root links to. Returns a tuple of: - pages: dict of pages visited (URL) mapped to their full text. - links: set of links encountered while visiting the pages. """ pages = {} # dict from page URL -> text content. links = set() # set of all links seen on visited pages. # root may end with index.html -- chop that off. if root.endswith('/index.html'): root = re.sub('/index.html$', '', root) try: context = None if sys.version_info < (2, 7, 9) or \ ((3,) < sys.version_info < (3, 4, 3)): if not spack.insecure: tty.warn("Spack will not check SSL certificates. You need to " "update your Python to enable certificate " "verification.") else: # We explicitly create default context to avoid error described in # https://blog.sucuri.net/2016/03/beware-unverified-tls-certificates-php-python.html context = ssl._create_unverified_context() \ if spack.insecure \ else ssl.create_default_context() # Make a HEAD request first to check the content type. This lets # us ignore tarballs and gigantic files. # It would be nice to do this with the HTTP Accept header to avoid # one round-trip. However, most servers seem to ignore the header # if you ask for a tarball with Accept: text/html. req = Request(url) req.get_method = lambda: "HEAD" resp = _urlopen(req, timeout=_timeout, context=context) if "Content-type" not in resp.headers: tty.debug("ignoring page " + url) return pages, links if not resp.headers["Content-type"].startswith('text/html'): tty.debug("ignoring page " + url + " with content type " + resp.headers["Content-type"]) return pages, links # Do the real GET request when we know it's just HTML. req.get_method = lambda: "GET" response = _urlopen(req, timeout=_timeout, context=context) response_url = response.geturl() # Read the page and and stick it in the map we'll return page = response.read().decode('utf-8') pages[response_url] = page # Parse out the links in the page link_parser = LinkParser() subcalls = [] link_parser.feed(page) while link_parser.links: raw_link = link_parser.links.pop() abs_link = urljoin(response_url, raw_link.strip()) links.add(abs_link) # Skip stuff that looks like an archive if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES): continue # Skip things outside the root directory if not abs_link.startswith(root): continue # Skip already-visited links if abs_link in visited: continue # If we're not at max depth, follow links. if depth < max_depth: subcalls.append((abs_link, visited, root, depth + 1, max_depth, raise_on_error)) visited.add(abs_link) if subcalls: pool = NonDaemonPool(processes=len(subcalls)) try: results = pool.map(_spider_wrapper, subcalls) for sub_pages, sub_links in results: pages.update(sub_pages) links.update(sub_links) finally: pool.terminate() pool.join() except URLError as e: tty.debug(e) if isinstance(e.reason, ssl.SSLError): tty.warn("Spack was unable to fetch url list due to a certificate " "verification problem. You can try running spack -k, " "which will not check SSL certificates. Use this at your " "own risk.") if raise_on_error: raise NoNetworkConnectionError(str(e), url) except HTMLParseError as e: # This error indicates that Python's HTML parser sucks. msg = "Got an error parsing HTML." 
# Pre-2.7.3 Pythons in particular have rather prickly HTML parsing. if sys.version_info[:3] < (2, 7, 3): msg += " Use Python 2.7.3 or newer for better HTML parsing." tty.warn(msg, url, "HTMLParseError: " + str(e)) except Exception as e: # Other types of errors are completely ignored, except in debug mode. tty.debug("Error in _spider: %s:%s" % (type(e), e), traceback.format_exc()) return pages, links
def read_from_url(url, accept_content_type=None): url = url_util.parse(url) context = None verify_ssl = spack.config.get('config:verify_ssl') # Don't even bother with a context unless the URL scheme is one that uses # SSL certs. if uses_ssl(url): if verify_ssl: if __UNABLE_TO_VERIFY_SSL: # User wants SSL verification, but it cannot be provided. warn_no_ssl_cert_checking() else: # User wants SSL verification, and it *can* be provided. context = ssl.create_default_context() # novm else: # User has explicitly indicated that they do not want SSL # verification. if not __UNABLE_TO_VERIFY_SSL: context = ssl._create_unverified_context() url_scheme = url.scheme url = url_util.format(url) if sys.platform == "win32" and url_scheme == "file": url = convert_to_posix_path(url) req = Request(url) content_type = None is_web_url = url_scheme in ('http', 'https') if accept_content_type and is_web_url: # Make a HEAD request first to check the content type. This lets # us ignore tarballs and gigantic files. # It would be nice to do this with the HTTP Accept header to avoid # one round-trip. However, most servers seem to ignore the header # if you ask for a tarball with Accept: text/html. req.get_method = lambda: "HEAD" resp = _urlopen(req, timeout=_timeout, context=context) content_type = get_header(resp.headers, 'Content-type') # Do the real GET request when we know it's just HTML. req.get_method = lambda: "GET" try: response = _urlopen(req, timeout=_timeout, context=context) except URLError as err: raise SpackWebError('Download failed: {ERROR}'.format(ERROR=str(err))) if accept_content_type and not is_web_url: content_type = get_header(response.headers, 'Content-type') reject_content_type = (accept_content_type and (content_type is None or not content_type.startswith(accept_content_type))) if reject_content_type: tty.debug("ignoring page {0}{1}{2}".format( url, " with content type " if content_type is not None else "", content_type or "")) return None, None, None return response.geturl(), response.headers, response
def download_from_genomespace_importer( username, token, json_parameter_file, genomespace_site, gs_toolname ): json_params = json.loads( open( json_parameter_file, 'r' ).read() ) datasource_params = json_params.get( 'param_dict' ) assert None not in [ username, token ], "Missing GenomeSpace username or token." output_filename = datasource_params.get( "output_file1", None ) dataset_id = base_dataset_id = json_params['output_data'][0]['dataset_id'] hda_id = json_params['output_data'][0]['hda_id'] url_opener = get_cookie_opener( username, token, gs_toolname=gs_toolname ) # load and set genomespace format ids to galaxy exts genomespace_site_dict = get_genomespace_site_urls()[ genomespace_site ] set_genomespace_format_identifiers( url_opener, genomespace_site_dict['dmServer'] ) file_url_name = "URL" metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' ) # setup datatypes registry for sniffing datatypes_registry = Registry() datatypes_registry.load_datatypes( root_dir=json_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config=json_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] ) url_param = datasource_params.get( file_url_name, None ) used_filenames = [] for download_url in url_param.split( ',' ): using_temp_file = False parsed_url = urlparse( download_url ) query_params = parse_qs( parsed_url[4] ) # write file to disk new_file_request = Request( download_url ) new_file_request.get_method = lambda: 'GET' target_download_url = url_opener.open( new_file_request ) filename = None if 'Content-Disposition' in target_download_url.info(): content_disposition = dict( x.strip().split('=') if '=' in x else ( x.strip(), '' ) for x in target_download_url.info()['Content-Disposition'].split( ';' ) ) if 'filename' in content_disposition: filename = content_disposition[ 'filename' ].strip( "\"'" ) if not filename: parsed_url = urlparse( download_url ) query_params = parse_qs( parsed_url[4] ) filename = unquote_plus( parsed_url[2].split( '/' )[-1] ) if not filename: filename = download_url if output_filename is None: # need to use a temp file here, because we do not know the ext yet using_temp_file = True output_filename = tempfile.NamedTemporaryFile( prefix='tmp-genomespace-importer-' ).name output_file = open( output_filename, 'wb' ) chunk_write( target_download_url, output_file ) output_file.close() # determine file format file_type = None if 'dataformat' in query_params: # this is a converted dataset file_type = query_params[ 'dataformat' ][0] file_type = get_galaxy_ext_from_genomespace_format_url( url_opener, file_type ) else: try: # get and use GSMetadata object download_file_path = download_url.split( "%s/file/" % ( genomespace_site_dict['dmServer'] ), 1)[-1] # FIXME: This is a very bad way to get the path for determining metadata. 
There needs to be a way to query API using download URLto get to the metadata object metadata_request = Request( "%s/%s/filemetadata/%s" % ( genomespace_site_dict['dmServer'], GENOMESPACE_API_VERSION_STRING, download_file_path ) ) metadata_request.get_method = lambda: 'GET' metadata_url = url_opener.open( metadata_request ) file_metadata_dict = json.loads( metadata_url.read() ) metadata_url.close() file_type = file_metadata_dict.get( 'dataFormat', None ) if file_type and file_type.get( 'url' ): file_type = file_type.get( 'url' ) file_type = get_galaxy_ext_from_genomespace_format_url( url_opener, file_type, default=None ) except: pass if file_type is None: # try to sniff datatype try: file_type = sniff.handle_uploaded_dataset_file( output_filename, datatypes_registry ) except: pass # sniff failed if file_type is None and '.' in parsed_url[2]: # still no known datatype, fall back to using extension file_type = parsed_url[2].rsplit( '.', 1 )[-1] file_type = GENOMESPACE_EXT_TO_GALAXY_EXT.get( file_type, file_type ) if file_type is None: # use default extension (e.g. 'data') file_type = DEFAULT_GALAXY_EXT # save json info for single primary dataset if dataset_id is not None: metadata_parameter_file.write( "%s\n" % json.dumps( dict( type='dataset', dataset_id=dataset_id, ext=file_type, name="GenomeSpace importer on %s" % ( filename ) ) ) ) # if using tmp file, move the file to the new file path dir to get scooped up later if using_temp_file: original_filename = filename filename = ''.join( c in FILENAME_VALID_CHARS and c or '-' for c in filename ) while filename in used_filenames: filename = "-%s" % filename used_filenames.append( filename ) target_output_filename = os.path.join( os.getcwd(), 'primary_%i_%s_visible_%s' % ( hda_id, filename, file_type ) ) shutil.move( output_filename, target_output_filename ) metadata_parameter_file.write( "%s\n" % json.dumps( dict( type='new_primary_dataset', base_dataset_id=base_dataset_id, ext=file_type, filename=target_output_filename, name="GenomeSpace importer on %s" % ( original_filename ) ) ) ) dataset_id = None # only one primary dataset available output_filename = None # only have one filename available metadata_parameter_file.close() return True