def __init__(self, base_url, username=None, password=None,
             default_prefix='/rest/v1', biospassword=None, sessionkey=None):
    """Initialization of the base class RestClientBase

    :param base_url: The url of the remote system
    :type base_url: str
    :param username: The username used for authentication
    :type username: str
    :param password: The password used for authentication
    :type password: str
    :param default_prefix: The default root point
    :type default_prefix: str
    :param biospassword: biospassword for base_url if needed
    :type biospassword: str
    :param sessionkey: sessionkey for the current login of base_url
    :type sessionkey: str

    """
    self.__base_url = base_url
    self.__username = username
    self.__password = password
    self.__biospassword = biospassword
    self.__url = urlparse2.urlparse(base_url)
    self.__session_key = sessionkey
    self.__authorization_key = None
    self.__session_location = None
    self._conn = None
    self._conn_count = 0
    self.login_url = None
    self.default_prefix = default_prefix

    self.__init_connection()
    self.get_root_object()
    self.__destroy_connection()
def fillDetails(self):
    sep = urlparse(self.accused)

    # protocol
    if sep.scheme != '':
        self.details["protocol"] = sep.scheme

    # hostname
    if bool(re.search(r'\d.\d.', self.without(sep.netloc))):
        self.containsIp = 1
        self.details["ipaddress"] = sep.netloc
    else:
        self.details['hostname'] = sep.netloc
        self.length['hostnameLength'] = len(sep.netloc)
        self.details['topLevelDomain'] = get_tld(self.without(sep.netloc),
                                                 fix_protocol=True)
        self.length['topLevelDomainLength'] = len(
            self.details['topLevelDomain'])
        self.details['primaryDomain'] = get_fld(self.without(sep.netloc),
                                                fix_protocol=True)
        self.length['primaryDomainLength'] = len(
            self.details['primaryDomain'])
        try:
            self.details["ipaddress"] = socket.gethostbyname(sep.netloc)
        except:
            pass

    # path & query
    self.details['path'] = sep.path
    self.details['query'] = sep.query
    self.details['noOfQuery'] = len(sep.query.split('&'))
    if sep.query == '':
        self.details['noOfQuery'] -= 1
    self.length['pathLength'] = len(self.details['path']) + len(
        self.details['query'])
    self.tokens()
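# Hedged illustration (not part of the class above): a minimal sketch of the
# urlparse fields that fillDetails() reads, using Python 2's stdlib urlparse
# and a hypothetical URL.
from urlparse import urlparse

sep = urlparse('http://sub.example.com/a/b?x=1&y=2')
print sep.scheme   # 'http'            -> details['protocol']
print sep.netloc   # 'sub.example.com' -> details['hostname']
print sep.path     # '/a/b'            -> details['path']
print sep.query    # 'x=1&y=2'         -> details['query'], split on '&' for noOfQuery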
def __init__(self, base_url, username=None, password=None,
             default_prefix='/redfish/v1/', sessionkey=None):
    """Initialization of the base class RestClientBase

    :param base_url: The URL of the remote system
    :type base_url: str
    :param username: The user name used for authentication
    :type username: str
    :param password: The password used for authentication
    :type password: str
    :param default_prefix: The default root point
    :type default_prefix: str
    :param sessionkey: session key for the current login of base_url
    :type sessionkey: str

    """
    self.__base_url = base_url
    self.__username = username
    self.__password = password
    self.__url = urlparse2.urlparse(base_url)
    self.__session_key = sessionkey
    self.__authorization_key = None
    self.__session_location = None
    self._conn = None
    self._conn_count = 0
    self.login_url = None
    self.default_prefix = default_prefix

    self.__init_connection()
    self.get_root_object()
    self.__destroy_connection()
def _extract_url(self):
    u"""Parse links from a fetched page.

    Args:
        url: URL of the page to analyse.
    Returns:
        List of same-domain URLs found in that page.
    """
    while int(self.client.get('image_max_num')) != 0:
        if self.client.llen('web_url_goto') == 0:
            time.sleep(1)
        else:
            url = self.client.rpop('web_url_goto')
            try:
                html = urllib2.urlopen(url).read()
            except:
                logging.warning("url cant open: %s" % url)
                continue
            domain = urlparse2.urlparse(url).netloc
            web_url_list = self._extract_web_url(html, url, domain)
            image_url_list = self._extract_img_url(html, domain)
            for web_url in web_url_list:
                if int(self.client.sismember('web_url_visited', web_url)) == 0:
                    self.client.sadd('web_url_visited', web_url)
                    self.client.lpush('web_url_goto', web_url)
            for image_url in image_url_list:
                if int(self.client.sismember('image_url_visited',
                                             image_url)) == 0:
                    self.client.sadd('image_url_visited', image_url)
                    self.client.lpush('image_url_goto', image_url)
                    logging.info("%s--->%s" % (url, image_url))
def _download_request(self, request, spider):
    proxies = {}
    proxy = request.meta.get('proxy', '')
    if proxy:
        for p in self.proxies:
            if p.find(proxy) != -1:
                scheme = urlparse(p).scheme
                proxies[scheme] = p
                break
    timeout = request.meta.get('download_timeout', self.timeout)
    url = request.url
    method = request.method
    headers = headers_scrapy2dict(request.headers)
    data = request.body
    session = self._session or requests.sessions.Session()
    st = time.time()
    requests_response = session.request(method, url, headers=headers,
                                        data=data, timeout=timeout,
                                        proxies=proxies)
    et = time.time()
    cost = et - st
    request.meta['download_latency'] = cost
    headers = Headers(dict(requests_response.headers))
    respcls = responsetypes.from_args(headers=headers,
                                      url=requests_response.url,
                                      body=requests_response.content)
    response_url = requests_response.url.encode(requests_response.encoding)
    response = respcls(url=response_url,
                       status=requests_response.status_code,
                       headers=headers,
                       body=requests_response.content,
                       )
    return response
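# Hedged sketch of the proxy-mapping step in _download_request(): the proxy URL
# is hypothetical, and requests expects a {scheme: proxy_url} dict.
from urlparse import urlparse

proxy = 'http://10.0.0.5:8080'
proxies = {urlparse(proxy).scheme: proxy}
print proxies  # {'http': 'http://10.0.0.5:8080'}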
def __init__(self, url, nr_sockets, counter):
    super(Striker, self).__init__()

    self.counter = counter
    self.nr_socks = nr_sockets

    parsedUrl = urlparse2.urlparse(url)
    if parsedUrl.scheme == 'https':
        self.ssl = True

    self.host = parsedUrl.netloc.split(':')[0]
    self.url = parsedUrl.path

    self.port = parsedUrl.port
    if not self.port:
        self.port = 80 if not self.ssl else 443

    self.referers = [
        'http://www.google.com/',
        'http://www.bing.com/',
        'http://www.baidu.com/',
        'http://www.yandex.com/',
        'http://' + self.host + '/'
    ]
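# Hedged sketch (standalone illustration with stdlib urlparse and a
# hypothetical URL) of the scheme/host/port defaulting done in Striker.__init__.
from urlparse import urlparse

parsed = urlparse('https://example.com/path')
use_ssl = parsed.scheme == 'https'
host = parsed.netloc.split(':')[0]
port = parsed.port or (443 if use_ssl else 80)
print host, port, use_ssl  # example.com 443 True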
def get_json(url, wjson):
    parsed_u = urlparse(url)
    if wjson == True:
        # it's a specific .json? link
        url = "{}://{}{}.json?{}".format(parsed_u.scheme, parsed_u.netloc,
                                         parsed_u.path, parsed_u.query)
    print "Requesting {}".format(url)

    content = requests.get(url, headers=REQUEST_HEADERS)
    data = json.loads(content.content)
    return data
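# Hedged usage sketch for get_json() above; the endpoint is hypothetical and is
# assumed to return JSON directly (so wjson=False skips the ".json?" rewrite).
data = get_json('https://api.example.com/items?limit=10', wjson=False)
print data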
def parseFromSite(address):
    print(address)
    req = urllib2.Request(address, headers={'User-Agent': "Magic Browser"})
    url = urllib2.urlopen(req).read()
    soup = BeautifulSoup(url, "lxml")
    for line in soup.find_all('a'):
        o = urlparse(address)
        try:
            if o.hostname in line.get('href') or \
                    ('.' not in line.get('href') and line.get('href') != ""):
                if o.hostname in line.get('href'):
                    parseFromPage(line.get('href'))
                else:
                    parseFromPage(address + line.get('href'))
        except:
            pass
def _extract_web_url(self, html, url, domain):
    u"""Extract the same-domain URLs from an HTML document.

    Args:
        html: content to parse.
        url: address of the crawled page.
        domain: domain of the current site.
    Return:
        Same-domain URLs found in the HTML content.
    """
    url_list = []
    content = BeautifulSoup(html).findAll('a')
    for item in content:
        href = item.get('href')
        ans = urlparse2.urljoin(url, href)
        ans_netloc = urlparse2.urlparse(ans).netloc
        if domain == ans_netloc:
            url_list.append(ans)
    return url_list
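# Hedged sketch of the same-domain filter used in _extract_web_url(), with
# stdlib urlparse/urljoin and hypothetical URLs standing in for urlparse2.
from urlparse import urljoin, urlparse

base = 'http://example.com/list.html'
domain = urlparse(base).netloc
for href in ['/page/2', 'http://other.com/x', 'item.html']:
    absolute = urljoin(base, href)
    print absolute, urlparse(absolute).netloc == domain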
def _get_msg_id_from_url(self, url):
    msg_id = ''
    try:
        msg_id = os.path.split(urlparse(url).path)[1]
    finally:
        return msg_id
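# Hedged illustration of the path-splitting in _get_msg_id_from_url(), using a
# hypothetical message URL; os.path.split keeps only the last path segment.
import os
from urlparse import urlparse

path = urlparse('http://weibo.com/1234567890/ABCdef123').path
print os.path.split(path)[1]  # 'ABCdef123'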
def getID(url):
    pUrl = urlparse2.urlparse(url)
    return urlparse2.parse_qs(pUrl.query)['id'][0]
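# Hedged usage sketch for getID(); the URL is hypothetical and urlparse2 is
# assumed importable here. parse_qs returns a dict of lists, so ['id'][0]
# picks the first value of the "id" parameter.
print getID('http://example.com/view.php?id=42&lang=en')  # '42'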
def _rest_request(self, path, method='GET', args=None, body=None,
                  headers=None):
    """Rest request main function

    :param path: path within tree
    :type path: str
    :param method: method to be implemented
    :type method: str
    :param args: the arguments for method
    :type args: dict
    :param body: body payload for the rest call
    :type body: dict
    :param headers: provide additional headers
    :type headers: dict
    :returns: returns a RestResponse object

    """
    headers = self._get_req_headers(headers)
    reqpath = path.replace('//', '/')

    if body is not None:
        if isinstance(body, dict) or isinstance(body, list):
            headers['Content-Type'] = u'application/json'
            body = json.dumps(body)
        else:
            headers['Content-Type'] = u'application/x-www-form-urlencoded'
            body = urllib.urlencode(body)

        if method == 'PUT':
            resp = self._rest_request(path=path)

            try:
                if resp.getheader('content-encoding') == 'gzip':
                    buf = StringIO()
                    gfile = gzip.GzipFile(mode='wb', fileobj=buf)

                    try:
                        gfile.write(str(body))
                    finally:
                        gfile.close()

                    compresseddata = buf.getvalue()
                    if compresseddata:
                        data = bytearray()
                        data.extend(buffer(compresseddata))
                        body = data
            except BaseException as excp:
                LOGGER.error('Error occur while compressing body: %s', excp)
                raise

        headers['Content-Length'] = len(body)

    if args:
        if method == 'GET':
            reqpath += '?' + urllib.urlencode(args)
        elif method == 'PUT' or method == 'POST' or method == 'PATCH':
            headers['Content-Type'] = u'application/x-www-form-urlencoded'
            body = urllib.urlencode(args)

    restreq = RestRequest(reqpath, method=method, body=body)

    attempts = 0
    while attempts < self.MAX_RETRY:
        if logging.getLogger().isEnabledFor(logging.DEBUG):
            try:
                LOGGER.debug('HTTP REQUEST: %s\n\tPATH: %s\n\tBODY: %s' %
                             (restreq.method, restreq.path, restreq.body))
            except:
                LOGGER.debug('HTTP REQUEST: %s\n\tPATH: %s\n\tBODY: %s' %
                             (restreq.method, restreq.path, 'binary body'))

        attempts = attempts + 1
        LOGGER.info('Attempt %s of %s', attempts, path)

        try:
            while True:
                if self._conn is None:
                    self.__init_connection()

                self._conn.request(method.upper(), reqpath, body=body,
                                   headers=headers)
                self._conn_count += 1

                inittime = time.clock()
                resp = self._conn.getresponse()
                endtime = time.clock()
                LOGGER.info('Response Time to %s: %s seconds.' %
                            (restreq.path, str(endtime - inittime)))

                if resp.getheader('Connection') == 'close':
                    self.__destroy_connection()
                if resp.status not in range(300, 399) or \
                        resp.status == 304:
                    break

                newloc = resp.getheader('location')
                newurl = urlparse2.urlparse(newloc)

                reqpath = newurl.path
                self.__init_connection(newurl)

            restresp = RestResponse(restreq, resp)

            try:
                if restresp.getheader('content-encoding') == "gzip":
                    compressedfile = StringIO(restresp.text)
                    decompressedfile = gzip.GzipFile(fileobj=compressedfile)
                    restresp.text = decompressedfile.read()
            except Exception as excp:
                LOGGER.error('Error occur while decompressing body: %s',
                             excp)
                raise DecompressResponseError()
        except Exception as excp:
            if isinstance(excp, DecompressResponseError):
                raise

            LOGGER.info('Retrying %s [%s]' % (path, excp))
            time.sleep(1)

            self.__init_connection()
            continue
        else:
            break

    self.__destroy_connection()
    if attempts < self.MAX_RETRY:
        if logging.getLogger().isEnabledFor(logging.DEBUG):
            headerstr = ''

            for header in restresp._http_response.msg.headers:
                headerstr += '\t' + header.rstrip() + '\n'

            try:
                LOGGER.debug('HTTP RESPONSE for %s:\nCode: %s\nHeaders:\n'
                             '%s\nBody Response of %s: %s' %
                             (restresp.request.path,
                              str(restresp._http_response.status) + ' ' +
                              restresp._http_response.reason,
                              headerstr, restresp.request.path,
                              restresp.read))
            except:
                LOGGER.debug('HTTP RESPONSE:\nCode:%s', (restresp))

        return restresp
    else:
        raise RetriesExhaustedError()
def _rest_request(self, path, method='GET', args=None, body=None, headers=None, optionalpassword=None, providerheader=None): """Rest request main function :param path: path within tree :type path: str :param method: method to be implemented :type method: str :param args: the arguments for method :type args: dict :param body: body payload for the rest call :type body: dict :param headers: provide additional headers :type headers: dict :param optionalpassword: provide password for authentication :type optionalpassword: str :param provideheader: provider id for the header :type providerheader: str """ headers = self._get_req_headers(headers, providerheader, \ optionalpassword) reqpath = path.replace('//', '/') if body: if isinstance(body, dict) or isinstance(body, list): headers['Content-Type'] = u'application/json' body = json.dumps(body) else: headers['Content-Type'] = u'application/x-www-form-urlencoded' body = urllib.urlencode(body) if method == 'PUT': resp = self._rest_request(path=path) try: if resp.getheader('content-encoding') == 'gzip': buf = StringIO() gfile = gzip.GzipFile(mode='wb', fileobj=buf) try: gfile.write(str(body)) finally: gfile.close() compresseddata = buf.getvalue() if compresseddata: data = bytearray() data.extend(buffer(compresseddata)) body = data except BaseException as excp: LOGGER.error('Error occur while compressing body: %s', excp) raise headers['Content-Length'] = len(body) if args: if method == 'GET': reqpath += '?' + urllib.urlencode(args) elif method == 'PUT' or method == 'POST' or method == 'PATCH': headers['Content-Type'] = u'application/x-www-form-urlencoded' body = urllib.urlencode(args) restreq = RestRequest(reqpath, method=method, body=body) attempts = 0 while attempts < self.MAX_RETRY: if logging.getLogger().isEnabledFor(logging.DEBUG): LOGGER.debug('REQ %s', (restreq)) attempts = attempts + 1 try: while True: if self._conn is None: self.__init_connection() self._conn.request(method.upper(), reqpath, body=body, \ headers=headers) self._conn_count += 1 resp = self._conn.getresponse() if resp.getheader('Connection') == 'close': self.__destroy_connection() if resp.status not in range(300, 399): break newloc = resp.getheader('location') newurl = urlparse2.urlparse(newloc) reqpath = newurl.path self.__init_connection(newurl) restresp = RestResponse(restreq, resp) try: if restresp.getheader('content-encoding') == "gzip": compressedfile = StringIO(restresp.text) decompressedfile = gzip.GzipFile(fileobj=compressedfile) restresp.text = decompressedfile.read() except Exception as excp: LOGGER.error('Error occur while decompressing body: %s', \ excp) raise DecompressResponseError() except Exception as excp: if isinstance(excp, DecompressResponseError): raise LOGGER.info('Retrying [%s]', excp) time.sleep(1) self.__init_connection() continue else: break self.__destroy_connection() if attempts < self.MAX_RETRY: if logging.getLogger().isEnabledFor(logging.DEBUG): LOGGER.debug('RESP %s', (restresp)) return restresp else: raise RetriesExhaustedError()
def _rest_request(self, path='', method="GET", args=None, body=None, headers=None, optionalpassword=None, providerheader=None): """Rest request for blob store client :param path: path within tree :type path: str :param method: method to be implemented :type method: str :param args: the arguments for method :type args: dict :param body: body payload for the rest call :type body: dict :param headers: provide additional headers :type headers: dict :param optionalpassword: provide password for authentication :type optionalpassword: str :param provideheader: provider id for the header :type providerheader: str """ headers = self._get_req_headers(headers, providerheader, optionalpassword) if (not self.is_redfish and self.default_prefix in path and path[-1] == '/'): path = path[0:-1] elif (self.is_redfish and self.default_prefix in path and path[-1] != '/'): path = path + '/' else: pass reqpath = path.replace('//', '/') if body: if isinstance(body, dict) or isinstance(body, list): headers['Content-Type'] = u'application/json' body = json.dumps(body) else: headers['Content-Type'] = u'application/x-www-form-urlencoded' body = urllib.urlencode(body) if method == 'PUT': resp = self._rest_request(path=path) try: if resp.getheader('content-encoding') == 'gzip': buf = StringIO() gfile = gzip.GzipFile(mode='wb', fileobj=buf) try: gfile.write(str(body)) finally: gfile.close() compresseddata = buf.getvalue() if compresseddata: data = bytearray() data.extend(buffer(compresseddata)) body = data except BaseException as excp: LOGGER.error('Error occur while compressing body: %s', excp) raise headers['Content-Length'] = len(body) self._method = method str1 = '%s %s %s\r\n' % (method, reqpath,\ Blobstore2RestClient._http_vsn_str) str1 += 'Host: \r\n' str1 += 'Accept-Encoding: identity\r\n' for header, value in headers.iteritems(): str1 += '%s: %s\r\n' % (header, value) str1 += '\r\n' if body and len(body) > 0: if isinstance(body, bytearray): str1 = str1.encode("ASCII") + body else: str1 += body bs2 = BlobStore2() if not isinstance(str1, bytearray): str1 = str1.encode("ASCII") resp_txt = bs2.rest_immediate(str1) #Dummy response to support a bad host response if len(resp_txt) == 0: resp_txt = "HTTP/1.1 500 Not Found\r\nAllow: " \ "GET\r\nCache-Control: no-cache\r\nContent-length: " \ "0\r\nContent-type: text/html\r\nDate: Tues, 1 Apr 2025 " \ "00:00:01 GMT\r\nServer: " \ "HP-iLO-Server/1.30\r\nX_HP-CHRP-Service-Version: 1.0.3\r\n\r\n\r\n" restreq = RestRequest(reqpath, method=method, body=body) rest_response = RisRestResponse(restreq, resp_txt) try: if rest_response.getheader('content-encoding') == 'gzip': compressedfile = StringIO(rest_response.text) decompressedfile = gzip.GzipFile(fileobj=compressedfile) rest_response.text = decompressedfile.read() except StandardError: pass if rest_response.status in range(300, 399): newloc = rest_response.getheader("location") newurl = urlparse2.urlparse(newloc) rest_response = self._rest_request(newurl.path, \ method, args, body, headers, \ optionalpassword, providerheader) return rest_response
def _load(self, path, skipcrawl=False, originaluri=None, includelogs=False,\ skipinit=False, loadtype='href', loadcomplete=False): """Helper function to main load function. :param path: path to start load from. :type path: str. :param skipcrawl: flag to determine if load should traverse found links. :type skipcrawl: boolean. :param originaluri: variable to assist in determining originating path. :type originaluri: str. :param includelogs: flag to determine if logs should be downloaded also. :type includelogs: boolean. :param skipinit: flag to determine if first run of load. :type skipinit: boolean. :param loadtype: flag to determine if load is meant for only href items. :type loadtype: str. :param loadcomplete: flag to download the entire monolith :type loadcomplete: boolean """ if path.endswith("?page=1"): return elif not includelogs: if "/Logs/" in path: return #TODO: need to find a better way to support non ascii characters path = path.replace("|", "%7C") #remove fragments newpath = urlparse2.urlparse(path) newpath.fragment = '' path = urlparse2.urlunparse(newpath) LOGGER.debug(u'_loading %s', path) if not self.reload: if path.lower() in self._visited_urls: return resp = self._client.get(path) if resp.status != 200: path = path + '/' resp = self._client.get(path) if resp.status == 401: raise SessionExpiredRis("Invalid session. Please logout and "\ "log back in or include credentials.") elif resp.status != 200: return self.queue.put((resp, path, skipinit, self)) if loadtype == 'href': #follow all the href attributes jsonpath_expr = jsonpath_rw.parse(u"$..'@odata.id'") matches = jsonpath_expr.find(resp.dict) if 'links' in resp.dict and 'NextPage' in resp.dict['links']: if originaluri: next_link_uri = originaluri + '?page=' + \ str(resp.dict['links']['NextPage']['page']) href = u'%s' % next_link_uri self._load(href, originaluri=originaluri, \ includelogs=includelogs, skipcrawl=skipcrawl, \ skipinit=skipinit) else: next_link_uri = path + '?page=' + \ str(resp.dict['links']['NextPage']['page']) href = u'%s' % next_link_uri self._load(href, originaluri=path, includelogs=includelogs,\ skipcrawl=skipcrawl, skipinit=skipinit) if not skipcrawl: for match in matches: if str(match.full_path) == "*****@*****.**" or \ str(match.full_path) == "*****@*****.**": continue if match.value == path: continue href = u'%s' % match.value self._load(href, skipcrawl=skipcrawl, \ originaluri=originaluri, includelogs=includelogs, \ skipinit=skipinit) if loadcomplete: for match in matches: self._load(match.value, skipcrawl=skipcrawl, originaluri=\ originaluri, includelogs=includelogs, skipinit=skipinit)
def _rest_request(self, path='', method="GET", args=None, body=None, headers=None, optionalpassword=None, providerheader=None): """Rest request for blob store client :param path: path within tree :type path: str :param method: method to be implemented :type method: str :param args: the arguments for method :type args: dict :param body: body payload for the rest call :type body: dict :param headers: provide additional headers :type headers: dict :param optionalpassword: provide password for authentication :type optionalpassword: str :param provideheader: provider id for the header :type providerheader: str :return: returns a RestResponse object """ headers = self._get_req_headers(headers, providerheader, \ optionalpassword) if not self.is_redfish and self.default_prefix in path and \ path[-1] == '/': path = path[0:-1] elif self.is_redfish and self.default_prefix in path and \ path[-1] != '/': path = path + '/' else: pass reqpath = path.replace('//', '/') if body is not None: if isinstance(body, dict) or isinstance(body, list): headers['Content-Type'] = u'application/json' body = json.dumps(body) else: headers['Content-Type'] = u'application/x-www-form-urlencoded' body = urllib.urlencode(body) if method == 'PUT': resp = self._rest_request(path=path) try: if resp.getheader('content-encoding') == 'gzip': buf = StringIO() gfile = gzip.GzipFile(mode='wb', fileobj=buf) try: gfile.write(str(body)) finally: gfile.close() compresseddata = buf.getvalue() if compresseddata: data = bytearray() data.extend(buffer(compresseddata)) body = data except BaseException as excp: LOGGER.error('Error occur while compressing body: %s', excp) raise headers['Content-Length'] = len(body) if args: if method == 'GET': reqpath += '?' + urllib.urlencode(args) elif method == 'PUT' or method == 'POST' or method == 'PATCH': headers['Content-Type'] = u'application/x-www-form-urlencoded' body = urllib.urlencode(args) str1 = '%s %s %s\r\n' % (method, reqpath, \ Blobstore2RestClient._http_vsn_str) str1 += 'Host: \r\n' str1 += 'Accept-Encoding: identity\r\n' for header, value in headers.iteritems(): str1 += '%s: %s\r\n' % (header, value) str1 += '\r\n' if body and len(body) > 0: if isinstance(body, bytearray): str1 = str1.encode("ASCII") + body else: str1 += body bs2 = BlobStore2() if not isinstance(str1, bytearray): str1 = str1.encode("ASCII") if logging.getLogger().isEnabledFor(logging.DEBUG): LOGGER.debug('Blobstore REQUEST: %s\n\tPATH: %s\n\tBODY: %s'% \ (method, path, body)) inittime = time.clock() resp_txt = bs2.rest_immediate(str1) endtime = time.clock() LOGGER.info("iLO Response Time to %s: %s secs."% \ (path, str(endtime-inittime))) #Dummy response to support a bad host response if len(resp_txt) == 0: resp_txt = "HTTP/1.1 500 Not Found\r\nAllow: " \ "GET\r\nCache-Control: no-cache\r\nContent-length: " \ "0\r\nContent-type: text/html\r\nDate: Tues, 1 Apr 2025 " \ "00:00:01 GMT\r\nServer: " \ "HP-iLO-Server/1.30\r\nX_HP-CHRP-Service-Version: 1.0.3\r\n\r\n\r\n" restreq = RestRequest(reqpath, method=method, body=body) rest_response = RisRestResponse(restreq, resp_txt) if rest_response.status in range(300, 399) and \ rest_response.status != 304: newloc = rest_response.getheader("location") newurl = urlparse2.urlparse(newloc) rest_response = self._rest_request(newurl.path, method, args, \ body, headers, optionalpassword, providerheader) try: if rest_response.getheader('content-encoding') == 'gzip': compressedfile = StringIO(rest_response.text) decompressedfile = gzip.GzipFile(fileobj=compressedfile) rest_response.text = 
decompressedfile.read() except StandardError: pass if logging.getLogger().isEnabledFor(logging.DEBUG): headerstr = '' for header in rest_response._http_response.msg.headers: headerstr += '\t' + header.rstrip() + '\n' try: LOGGER.debug('Blobstore RESPONSE for %s:\nCode: %s\nHeaders:\n%s'\ '\nBody of %s: %s'%\ (rest_response.request.path,\ str(rest_response._http_response.status)+ ' ' + \ rest_response._http_response.reason, \ headerstr, rest_response.request.path, rest_response.read)) except: LOGGER.debug('Blobstore RESPONSE for %s:\nCode:%s'% \ (rest_response.request.path, rest_response)) return rest_response
def run_test_case(s3_client, my_test_case): # ----------------------- # Download test case query # ----------------------- with tempfile.NamedTemporaryFile(mode='w+b', delete=True) as test_case_object: logger.info("Downloading test case from: " + 's3://' + s3_testcases_bucket + '/' + s3_testcases_path + '/' + my_test_case['query']) try: s3_client.download_fileobj(s3_testcases_bucket, s3_testcases_path + '/' + my_test_case['query'], test_case_object) except Exception as e: logger.error("Failed to download S3 file object " + my_test_case['query'] + " because of error: %s" % e) raise e test_case_object.seek(0) test_query = test_case_object.read().replace('\n', '') logger.debug('Downloaded test case using temp file: ' + test_case_object.name) logger.debug('Test query: ' + test_query) # ----------------------- # Download test fixture # ----------------------- with tempfile.NamedTemporaryFile(mode='w+b', delete=True) as test_fixture_object: logger.info("Downloading test result from: " + 's3://' + s3_testcases_bucket + '/' + s3_testcases_path + '/' + my_test_case['fixture']) try: s3_client.download_fileobj(s3_testcases_bucket, s3_testcases_path + '/' + my_test_case['fixture'], test_fixture_object) except Exception as e: logger.error("Failed to download S3 file object " + my_test_case['fixture'] + " because of error: %s" % e) raise e test_fixture_object.seek(0) test_fixture = test_fixture_object.read() logger.debug('Downloaded test fixture using temp file: ' + test_fixture_object.name) logger.debug('Test fixture: ' + test_fixture) # ----------------------- # Open Athena Connection # ----------------------- try: logger.debug("Attempting to open connection to Athena") athena_client = boto3.client('athena') logger.debug("Connection to Athena successfully opened") except ClientError as e: logger.error("Failed to connect to Athena because of error: %s" % e) raise e except Exception as e: logger.error("Failed to connect to Athena because of error: %s" % e) raise e # ----------------------- # Execute Athena query # ----------------------- try: logger.debug("Attempting to submit query to Athena") response = athena_client.start_query_execution( QueryString=test_query, ResultConfiguration={'OutputLocation': 's3://' + s3_testruns_bucket + '/' + s3_testruns_path} ) logger.debug("Query submitted to Athena successfully") except ClientError as e: if e.response['Error']['Code'] == 'InternalServerException': logger.error("Query failed submission to Athena due to an InternalServerException") raise e else: logger.error("Query failed submission to Athena due to an unexpected error: %s" % e) raise e finally: # Check status and log progress query_id = response['QueryExecutionId'] if response['ResponseMetadata']['HTTPStatusCode'] != 200: logger.error("HTTP error response code: " + str(response['ResponseMetadata']['HTTPStatusCode'])) logger.info("Query execution id: " + query_id) # ----------------------- # Lookup Athena query information # to get exact output_location # ----------------------- try: logger.debug("Attempting to query information about query: " + query_id) response = athena_client.get_query_execution( QueryExecutionId=query_id ) logger.debug("Retrieved information about query: " + query_id) # Check status and log progress # if response['ResponseMetadata']['HTTPStatusCode'] != 200: # logger.error("HTTP error response code: " + str(response['ResponseMetadata']['HTTPStatusCode'])) output_location = response['QueryExecution']['ResultConfiguration']['OutputLocation'] logger.debug("Athena query output location: 
" + output_location) output_url = urlparse(output_location) output_bucket = output_url.netloc output_object = output_url.path.strip("/") logger.debug("Parsed Athena output: Bucket=" + output_bucket + " Object=" + output_object) except ClientError as e: if e.response['Error']['Code'] == 'InternalServerException': logger.error("Failed to retrieve information about query: " + query_id + "due to InternalServerException") raise e else: logger.error("Failed to retrieve information about query: " + query_id + "due to unexpected error: %s" % e) raise e # ----------------------- # Wait for Query Execution in S3 # ----------------------- logger.info("Begin waiting for Bucket=" + output_bucket + " Object=" + output_object) try: logger.debug('Creating S3 Waiter client object') waiter = s3_client.get_waiter('object_exists') logger.debug('Done Creating S3 Waiter client object') except ClientError as e: logger.error("Failed to create waiter client client because of error: %s" % e) raise e except Exception as e: logger.error("Failed to create waiter client client because of error: %s" % e) raise e try: logger.debug( "Creating waiter for S3 Object: Bucket=" + output_bucket + " Object=" + output_object) waiter.wait( Bucket=output_bucket, Key=output_object, WaiterConfig={ 'Delay': float(waiter_delay), 'MaxAttempts': waiter_attempts } ) except ClientError as e: logger.error("Failed to create waiter because of error: %s" % e) raise e except Exception as e: logger.error("Failed to create waiter because of error: %s" % e) raise e logger.debug( "Finished waiting for S3 Object: Bucket=" + output_bucket + " Object=" + output_object) # ----------------------- # Download test result # ----------------------- with tempfile.NamedTemporaryFile(mode='w+b', delete=True) as test_result_object: logger.debug( "Downloading test result from: Bucket=" + output_bucket + " Object=" + output_object) try: s3_client.download_fileobj(output_bucket, output_object, test_result_object) test_result_object.seek(0) test_result = test_result_object.read() except ClientError as e: logger.error("Failed to download S3 file object because of error: %s" % e) raise e except Exception as e: logger.error("Failed to download S3 file object because of error: %s" % e) raise e logger.debug('Downloaded test result using temp file: ' + test_result_object.name) logger.debug('Test results: ' + test_result) # Determine Diffs if test_fixture == test_result: logger.log(STATUS, 'Query \"' + my_test_case['query'] + "\" with fixture \"" + my_test_case[ 'fixture'] + '\" test passes validation') return 0 else: logger.error('Query \"' + my_test_case['query'] + "\" with fixture \"" + my_test_case[ 'fixture'] + '\" test fails validation') deepdiff = DeepDiff(test_fixture, test_result) print ("------------------------ Begin Diff ------------------------") print (deepdiff["values_changed"]["root"]["diff"].encode('utf-8')) print ("------------------------- End Diff -------------------------") return 1
def get_cache_dirname(self):
    """The rest client's current base url converted to path"""
    parts = urlparse2.urlparse(self.get_base_url())
    pathstr = '%s/%s' % (parts.netloc, parts.path)
    return pathstr.replace('//', '/')
def parse_domain(url):
    parsed = urlparse(url)
    if parsed.scheme:
        return parsed.hostname
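# Hedged usage sketch for parse_domain(); it returns the hostname only when the
# URL carries an explicit scheme, otherwise None. URLs are hypothetical.
print parse_domain('https://www.example.com/path')  # 'www.example.com'
print parse_domain('www.example.com/path')          # None (no scheme)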
def _url2path(url):
    """ Function to convert given url to path """
    parts = urlparse2.urlparse(url)
    pathstr = '%s/%s' % (parts.netloc, parts.path)
    return pathstr.replace('//', '/')
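# Hedged usage sketch for _url2path(); the URL is hypothetical. The double-slash
# cleanup keeps the result usable as a relative cache path.
print _url2path('https://10.0.0.1/redfish/v1/')  # '10.0.0.1/redfish/v1/'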
def _load(self, path, skipcrawl=False, originaluri=None, includelogs=False,\ skipinit=False, loadtype='href', loadcomplete=False): """Helper function to main load function. :param path: path to start load from. :type path: str. :param skipcrawl: flag to determine if load should traverse found links. :type skipcrawl: boolean. :param originaluri: variable to assist in determining originating path. :type originaluri: str. :param includelogs: flag to determine if logs should be downloaded also. :type includelogs: boolean. :param skipinit: flag to determine if first run of load. :type skipinit: boolean. :param loadtype: flag to determine if load is meant for only href items. :type loadtype: str. :param loadcomplete: flag to download the entire monolith :type loadcomplete: boolean """ if path.endswith("?page=1"): return elif not includelogs: if "/Logs/" in path: return #TODO: need to find a better way to support non ascii characters path = path.replace("|", "%7C") #remove fragments newpath = urlparse2.urlparse(path) newpath.fragment = '' path = urlparse2.urlunparse(newpath) LOGGER.debug(u'_loading %s', path) if not self.reload: if path.lower() in self._visited_urls: return resp = self._client.get(path) if resp.status != 200 and path.lower() == self._client.typepath.defs.\ biospath: raise BiosUnregisteredError() elif resp.status != 200: path = path + '/' resp = self._client.get(path) if resp.status == 401: raise SessionExpiredRis("Invalid session. Please logout and "\ "log back in or include credentials.") elif resp.status != 200: return if loadtype == "ref": self.parse_schema(resp) self.queue.put((resp, path, skipinit, self)) if loadtype == 'href': #follow all the href attributes if self.is_redfish: jsonpath_expr = jsonpath_rw.parse(u"$..'@odata.id'") else: jsonpath_expr = jsonpath_rw.parse(u'$..href') matches = jsonpath_expr.find(resp.dict) if 'links' in resp.dict and 'NextPage' in resp.dict['links']: if originaluri: next_link_uri = originaluri + '?page=' + \ str(resp.dict['links']['NextPage']['page']) href = u'%s' % next_link_uri self._load(href, originaluri=originaluri, \ includelogs=includelogs, skipcrawl=skipcrawl, \ skipinit=skipinit) else: next_link_uri = path + '?page=' + \ str(resp.dict['links']['NextPage']['page']) href = u'%s' % next_link_uri self._load(href, originaluri=path, includelogs=includelogs,\ skipcrawl=skipcrawl, skipinit=skipinit) (newversion, dirmatch) = self.check_for_directory(matches) if not newversion and not skipcrawl: for match in matches: if path == "/rest/v1": if str(match.full_path) == "links.Schemas.href" or \ str(match.full_path) == "links.Registries.href": continue else: if str(match.full_path) == "*****@*****.**" or \ str(match.full_path) == "*****@*****.**": continue if match.value == path: continue href = u'%s' % match.value self._load(href, skipcrawl=skipcrawl, \ originaluri=originaluri, includelogs=includelogs, \ skipinit=skipinit) elif not skipcrawl: href = u'%s' % dirmatch.value self._load(href, skipcrawl=skipcrawl, originaluri=originaluri, \ includelogs=includelogs, skipinit=skipinit) if loadcomplete: for match in matches: self._load(match.value, skipcrawl=skipcrawl, originaluri=\ originaluri, includelogs=includelogs, skipinit=skipinit)
def parse_list_page(self, response):
    multi_xpath = '/html/body/div[@id and @class="c"]'
    html5_response = response_html5parse(response)
    hxs = HtmlXPathSelector(html5_response)
    multi_hxs = hxs.select(multi_xpath)
    list_url = response.url
    query = response.meta.get('query')
    for hxs in multi_hxs:
        nick = ''.join(hxs.select('./div[1]/a//text()').extract())
        user_url = ''.join(hxs.select('./div[1]/a/@href').extract())
        user_url = urllib.unquote(user_url).strip()
        user_url_up = urlparse(user_url)
        user_url_up.query = ''
        user_url = urlunparse(user_url_up)
        div3 = hxs.select('./div[3]')
        if div3:
            content = ''.join(div3.select('.//text()').extract()[1:-10])
        else:
            content = ''.join(hxs.select('./div[1]/span//text()').extract())
        misc1 = hxs.select('.//a//text()')
        zan_count, zhuanfa_count, pinglun_count = self._ana_misc1(misc1)
        misc2 = hxs.select('.//span[@class="ct"]//text()')
        time, from_info = self._ana_misc2(misc2)
        misc3 = hxs.select('.//a[@class="cc"]/@href')
        own_msg_id, forward_msg_id = self._get_msg_id(misc3)
        own_user_id, forward_user_id = self._get_user_id(misc3)
        if forward_msg_id and forward_user_id:
            is_forward = True
            forward_msg_url1 = 'http://weibo.com/%s/%s' % (forward_user_id,
                                                           forward_msg_id)
            forward_msg_url2 = 'http://weibo.cn/%s/%s' % (forward_user_id,
                                                          forward_msg_id)
        else:
            is_forward = False
            forward_msg_url1 = ''
            forward_msg_url2 = ''
        doc = {
            'data_source': '新浪微博搜索',
            'nick': nick,
            'user_url': user_url,
            'content': content,
            'zan_count': zan_count,
            'zhuanfa_count': zhuanfa_count,
            'pinglun_count': pinglun_count,
            'time': time,
            'from_info': from_info,
            'own_user_id': own_user_id,
            'own_msg_id': own_msg_id,
            'own_msg_url1': 'http://weibo.com/%s/%s' % (own_user_id, own_msg_id),
            'own_msg_url2': 'http://weibo.cn/%s/%s' % (own_user_id, own_msg_id),
            'forward_user_id': forward_user_id,
            'forward_msg_id': forward_msg_id,
            'forward_msg_url1': forward_msg_url1,
            'forward_msg_url2': forward_msg_url2,
            'is_forward': is_forward,
            'sort': self.sort,
        }
        # Do not handle the weibo user's homepage avatar for now
        # user_homepage = user_url
        # if not user_homepage:
        #     next_request = None
        # else:
        #     next_request = Request(user_homepage, callback=self.parse_user_homepage)
        item = WeiboItem(doc=doc, next_request=None,
                         list_url=list_url, query=query)
        yield self.item_or_request(item)

# Do not handle the weibo user's homepage avatar for now
# def parse_user_homepage(self, response):
#     item = response.meta['item']
#     item['doc']['detail'] = response.body_as_unicode()
#     yield self.item_or_request(item)
def _rest_request(self, path='', method="GET", args=None, body=None,
                  headers=None, optionalpassword=None, providerheader=None):
    """Rest request for blob store client

    :param path: path within tree
    :type path: str
    :param method: method to be implemented
    :type method: str
    :param args: the arguments for method
    :type args: dict
    :param body: body payload for the rest call
    :type body: dict
    :param headers: provide additional headers
    :type headers: dict
    :param optionalpassword: provide password for authentication
    :type optionalpassword: str
    :param provideheader: provider id for the header
    :type providerheader: str
    :return: returns a RestResponse object

    """
    headers = self._get_req_headers(headers, providerheader,
                                    optionalpassword)

    reqpath = path.replace('//', '/')
    oribody = body

    if body is not None:
        if isinstance(body, dict) or isinstance(body, list):
            headers['Content-Type'] = u'application/json'
            body = json.dumps(body)
        else:
            headers['Content-Type'] = u'application/x-www-form-urlencoded'
            body = urllib.urlencode(body)

        if method == 'PUT':
            resp = self._rest_request(path=path)

            try:
                if resp.getheader('content-encoding') == 'gzip':
                    buf = StringIO()
                    gfile = gzip.GzipFile(mode='wb', fileobj=buf)

                    try:
                        gfile.write(str(body))
                    finally:
                        gfile.close()

                    compresseddata = buf.getvalue()
                    if compresseddata:
                        data = bytearray()
                        data.extend(buffer(compresseddata))
                        body = data
            except BaseException as excp:
                LOGGER.error('Error occur while compressing body: %s', excp)
                raise

        headers['Content-Length'] = len(body)

    if args:
        if method == 'GET':
            reqpath += '?' + urllib.urlencode(args)
        elif method == 'PUT' or method == 'POST' or method == 'PATCH':
            headers['Content-Type'] = u'application/x-www-form-urlencoded'
            body = urllib.urlencode(args)

    str1 = '%s %s %s\r\n' % (method, reqpath,
                             Blobstore2RestClient._http_vsn_str)
    str1 += 'Host: \r\n'
    str1 += 'Accept-Encoding: identity\r\n'
    for header, value in headers.iteritems():
        str1 += '%s: %s\r\n' % (header, value)

    str1 += '\r\n'

    if body and len(body) > 0:
        if isinstance(body, bytearray):
            str1 = str1.encode("ASCII") + body
        else:
            str1 += body

    bs2 = BlobStore2()

    if not isinstance(str1, bytearray):
        str1 = str1.encode("ASCII")

    if logging.getLogger().isEnabledFor(logging.DEBUG):
        try:
            LOGGER.debug('Blobstore REQUEST: %s\n\tPATH: %s\n\tBODY: %s' %
                         (method, path, body))
        except:
            LOGGER.debug('Blobstore REQUEST: %s\n\tPATH: %s\n\tBODY: %s' %
                         (method, path, 'binary body'))

    inittime = time.clock()
    resp_txt = bs2.rest_immediate(str1)
    endtime = time.clock()

    bs2.channel.close()

    LOGGER.info("iLO Response Time to %s: %s secs." %
                (path, str(endtime - inittime)))

    # Dummy response to support a bad host response
    if len(resp_txt) == 0:
        resp_txt = "HTTP/1.1 500 Not Found\r\nAllow: " \
                   "GET\r\nCache-Control: no-cache\r\nContent-length: " \
                   "0\r\nContent-type: text/html\r\nDate: Tues, 1 Apr 2025 " \
                   "00:00:01 GMT\r\nServer: " \
                   "HP-iLO-Server/1.30\r\nX_HP-CHRP-Service-Version: 1.0.3\r\n\r\n\r\n"

    restreq = RestRequest(reqpath, method=method, body=body)
    rest_response = RisRestResponse(restreq, resp_txt)

    if rest_response.status in range(300, 399) and \
            rest_response.status != 304:
        newloc = rest_response.getheader("location")
        newurl = urlparse2.urlparse(newloc)

        rest_response = self._rest_request(newurl.path, method, args,
                                           oribody, headers,
                                           optionalpassword, providerheader)

    try:
        if rest_response.getheader('content-encoding') == 'gzip':
            compressedfile = StringIO(rest_response.text)
            decompressedfile = gzip.GzipFile(fileobj=compressedfile)
            rest_response.text = decompressedfile.read()
    except StandardError:
        pass

    if logging.getLogger().isEnabledFor(logging.DEBUG):
        headerstr = ''

        for header in rest_response._http_response.msg.headers:
            headerstr += '\t' + header.rstrip() + '\n'

        try:
            LOGGER.debug('Blobstore RESPONSE for %s:\nCode: %s\nHeaders:\n%s'
                         '\nBody of %s: %s' %
                         (rest_response.request.path,
                          str(rest_response._http_response.status) + ' ' +
                          rest_response._http_response.reason,
                          headerstr, rest_response.request.path,
                          rest_response.read))
        except:
            LOGGER.debug('Blobstore RESPONSE for %s:\nCode:%s' %
                         (rest_response.request.path, rest_response))

    return rest_response
def get_cache_dirname(self):
    """The rest client's current base URL converted to path"""
    parts = urlparse2.urlparse(self.get_base_url())
    pathstr = '%s/%s' % (parts.netloc, parts.path)
    return pathstr.replace('//', '/')
def parse_url(url):
    parsed = urlparse(url)
    if parsed.scheme:
        return url
    else:
        pass
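# Hedged usage sketch for parse_url(); it echoes the URL back only when a scheme
# is present, otherwise it falls through and returns None. URLs are hypothetical.
print parse_url('http://example.com/a')  # 'http://example.com/a'
print parse_url('example.com/a')         # None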