def test_init(har_data):
    """HarParser must reject non-dict input and expose the HAR metadata."""
    # Make sure we only tolerate valid input.
    # NOTE: the original kept `har_parser = ...; assert har_parser` inside
    # this block; the assert was unreachable because the constructor raises
    # on the first statement, so both the binding and the assert are dropped.
    with pytest.raises(ValueError):
        HarParser('please_dont_work')

    har_data = har_data('humanssuck.net.har')
    har_parser = HarParser(har_data)

    # Every parsed page must come back as a HarPage object
    for page in har_parser.pages:
        assert isinstance(page, HarPage)

    # Top-level HAR metadata from the fixture file
    assert har_parser.browser == {'name': 'Firefox', 'version': '25.0.1'}
    assert har_parser.version == '1.1'
    assert har_parser.creator == {'name': 'Firebug', 'version': '1.12'}
def test_init(har_data):
    """
    Test the object loading
    """
    with pytest.raises(ValueError):
        page = HarPage(PAGE_ID)

    init_data = har_data('humanssuck.net.har')

    # Throws PageNotFoundException with bad page ID
    with pytest.raises(PageNotFoundError):
        page = HarPage(BAD_PAGE_ID, har_data=init_data)

    # Make sure it can load with either har_data or a parser
    page = HarPage(PAGE_ID, har_data=init_data)
    assert isinstance(page, HarPage)
    parser = HarParser(init_data)
    page = HarPage(PAGE_ID, har_parser=parser)
    assert isinstance(page, HarPage)

    assert len(page.entries) == 4

    # Make sure that the entries are actually in order: parse every start
    # time once, then compare each neighbouring pair.
    timestamps = [dateutil.parser.parse(e['startedDateTime'])
                  for e in page.entries]
    for earlier, later in zip(timestamps, timestamps[1:]):
        assert earlier <= later
def save_har_to_csv(test, testname, service_list, desc_list):
    """Flatten a captured HAR into a CSV of actual vs expected service calls.

    :param test: test definition dict; ``test['api']`` holds the expected
        entries (description, servicename, status_code, expectedresponseinms)
    :param testname: base name of the ``<testname>.har`` / ``<testname>.csv``
        files in the repo-level ``temp`` directory
    :param service_list: request URLs to extract from the HAR
    :param desc_list: descriptions aligned index-for-index with service_list
    :return: path of the CSV file that was written
    """
    import csv

    # Hoisted: the repo-root/temp directory was previously computed twice.
    temp_dir = os.path.join(
        os.path.abspath(os.path.join(
            os.path.abspath(os.path.dirname(__file__)), os.pardir)),
        'temp')
    harname = os.path.join(temp_dir, testname + '.har')
    csv_name = os.path.join(temp_dir, testname + '.csv')

    # mode='x' below refuses to overwrite, so clear any stale file first
    if os.path.exists(csv_name):
        os.remove(csv_name)

    with open(harname, 'r') as f:
        har_parser = HarParser(json.loads(f.read()))

    with open(csv_name, mode='x') as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=',', lineterminator='\n')
        csv_writer.writerow(['desc', 'url', 'status',
                             'response_type', 'time', 'starttime'])
        # Actual results observed in the HAR capture
        for x in har_parser.har_data['entries']:
            url = x['request']['url']
            if url in service_list:
                desc = desc_list[service_list.index(url)]
                csv_writer.writerow([desc, url, x['response']['status'],
                                     'actual', x['time'],
                                     x['startedDateTime']])
        # write expected to csv
        for x in test['api']:
            csv_writer.writerow([x['description'], x['servicename'],
                                 x['status_code'], 'expected',
                                 x['expectedresponseinms'], 0])
    return csv_name
def test_create_asset_timeline(har_data):
    """
    Tests the asset timeline function by making sure that it inserts one
    object correctly.
    """
    init_data = har_data('humanssuck.net.har')
    har_parser = HarParser(init_data)

    entry = har_data('single_entry.har')
    # Start of the entry plus its total load time bound the timeline
    time_key = dateutil.parser.parse(entry['startedDateTime'])
    load_time = int(entry['time'])

    asset_timeline = har_parser.create_asset_timeline([entry])

    # One timeline bucket per millisecond of load time
    assert len(asset_timeline) == load_time

    one_ms = datetime.timedelta(milliseconds=1)
    for _ in range(1, load_time):
        assert time_key in asset_timeline
        assert len(asset_timeline[time_key]) == 1
        # Each bucket must hold an exact copy of the original entry
        for key, value in iteritems(entry):
            assert asset_timeline[time_key][0][key] == entry[key]
        time_key = time_key + one_ms
def check_service_in_har(har_data, service_name):
    """Return True if *service_name* appears as a request URL in the HAR.

    :param har_data: raw HAR content as a JSON string
    :param service_name: exact request URL to look for
    :return: True when a matching entry exists, False otherwise
    """
    logging.info('Checking for service -->' + service_name)
    har_parser = HarParser(json.loads(har_data))
    for entry in har_parser.har_data['entries']:
        if entry['request']['url'] == service_name:
            logging.info('got service -> ' + service_name)
            return True
    # Previously fell off the end returning None; make the miss explicit.
    return False
def scan_files(path):
    """Compute the total load time, in milliseconds, of every HAR in *path*.

    For each file, the total is the span from the first entry's start time
    to the latest end time (start + duration) of any entry on the first page.

    :param path: directory containing only HAR (JSON) files
    :return: list of per-file totals in milliseconds
    """
    data = []
    # Parse all files in directory
    for filename in os.listdir(path):
        with open(os.path.join(path, filename), 'r') as f:
            har_parser = HarParser(json.loads(f.read()))
        start_time = dateutil.parser.parse(
            har_parser.pages[0].entries[0]["startedDateTime"])
        latest_time = start_time
        # Parse all resources HTML, CSS, JS...
        for entry in har_parser.pages[0].entries:
            # Fixed: identity comparison (`is None`), not `== None`.
            # A null "time" in the HAR is treated as zero duration.
            if entry["time"] is None:
                s = 0
            else:
                s = float(entry["time"]) / 1000
            current_time = (dateutil.parser.parse(entry["startedDateTime"])
                            + datetime.timedelta(seconds=s))
            if current_time > latest_time:
                latest_time = current_time
        total = latest_time - start_time
        data.append(total.total_seconds() * 1000)
    return data
def harparser(self):
    """
    Captures the har and converts to a HarParser object
    :return: HarParser object, a page from har capture
    """
    # Round-trip through JSON to snapshot the live capture as plain dicts.
    snapshot = json.dumps(self._client.har, ensure_ascii=False)
    parser = HarParser(json.loads(snapshot))
    return parser.pages[0]
def __get_page_content_from_har(self):
    """Return the body text of the first HTML file on the first HAR page."""
    with open(self.har_path, "r") as f:
        parser = HarParser(json.loads(f.read()))
    # Only the first page is of interest; the slice tolerates zero pages.
    for page in parser.pages[:1]:
        for html_file in page.html_files:
            return html_file["response"]["content"]["text"]
    raise Exception("Unable to access HAR file.")
def get_pages(self):
    """Return the HAR pages, parsing and caching them on first access."""
    if self.pages:
        return self.pages
    try:
        if 'har' not in self:
            return []
        self.pages = HarParser(self['har']).pages
        return self.pages
    except Exception as e:
        # Malformed HAR data must not crash callers; log and degrade.
        logging.warning('Saw exception when parsing HAR: {}'.format(e))
        return []
def setHeadersFromHarFile(self, fileName, urlMustContain):
    """Load request headers from a .har capture or a plain JSON headers file.

    Uses the first request whose URL contains *urlMustContain*, strips
    pseudo-headers and hop-by-hop fields, and stores the rest on
    ``self.headers`` as an OrderedDict.
    """
    if not os.path.exists(fileName):
        return
    try:
        from pathlib import Path
        headersList = []
        if Path(fileName).suffix == '.har':
            from haralyzer import HarParser
            file = helpers.getFile(fileName)
            j = json.loads(file)
            har_page = HarParser(har_data=j)
            # find the right url -- stop at the FIRST matching entry.
            # Fixed: the original bare `break` only exited the inner loop,
            # so a match on a later page silently overwrote the first one.
            found = False
            for page in har_page.pages:
                for entry in page.entries:
                    if urlMustContain in entry['request']['url']:
                        headersList = entry['request']['headers']
                        found = True
                        break
                if found:
                    break
        else:
            headersList = helpers.getJsonFile(fileName)
            headersList = get(headersList, 'headers')
        headers = []
        for header in headersList:
            name = header.get('name', '')
            value = header.get('value', '')
            # ignore pseudo-headers
            if name.startswith(':'):
                continue
            if name.lower() == 'content-length' or name.lower() == 'host':
                continue
            # otherwise response will stay compressed and unreadable
            if name.lower() == 'accept-encoding' and not self.hasBrotli:
                value = value.replace(', br', '')
            newHeader = (name, value)
            headers.append(newHeader)
        self.headers = OrderedDict(headers)
    except Exception as e:
        helpers.handleException(e)
def parse_file(f):
    """Summarize a HAR (JSON string) by x-cache header and content size.

    Returns a dict with total bytes, hosts that served an x-cache header,
    per-cache-tag byte percentages, and a per-entry row table.
    """
    har_parser = HarParser(json.loads(f))
    # Table rows: one header row, then one row per HAR entry
    rows = [['X-CACHE-HEADER', 'BYTES', 'URL']]
    hosts = {}   # netloc -> 1 for every host that sent an x-cache header
    size = {}    # x-cache value (or None) -> accumulated content bytes
    total_bytes = 0 #total bytes for all content across the entire thing
    for page in har_parser.pages:
        assert isinstance(page, HarPage)
        for entry in page.entries:
            cdn = []
            headers = entry['response']['headers']
            #print(entry['response'], file=sys.stderr)
            cdn_str = None
            total_bytes += entry['response']['content']['size']
            #pp.pprint(entry['request'])
            url = urlparse(entry['request']['url'])
            # Take the last 'x-cache' response header, if any.
            # NOTE(review): header-name match is case-sensitive -- confirm
            # that captures always emit lowercase 'x-cache'.
            for h in headers:
                if (h['name'] == 'x-cache'):
                    hosts[url.netloc] = 1
                    #print(url, file=sys.stderr)
                    cdn_str = h['value']
            # Accumulate this entry's size under its cache tag (None when
            # no x-cache header was present).
            cdn.append(cdn_str)
            if (cdn_str in size):
                size[cdn_str] = size[cdn_str] + entry['response']['content'][
                    'size']
            else:
                size[cdn_str] = entry['response']['content']['size']
            print("\t".join([
                str(cdn),
                str(entry['response']['content']['size']),
                entry['request']['url'], url.netloc
            ]))
            rows.append([
                cdn, entry['response']['content']['size'],
                linkify(entry['request']['url'])
            ])
    # Percentage-of-bytes breakdown per cache tag
    bysize = [['CACHE TAG', '% OF BYTES']]
    for sk in size.keys():
        bysize.append([sk, "{:.1%}".format(size[sk] / total_bytes)])
    # Transposed variants; bysize_t is computed but not returned below --
    # presumably kept for a caller/debugging elsewhere.
    bysize_t = list(map(list, zip(*bysize)))
    hosts_t = list(map(list, zip(*[hosts.keys()])))
    return {
        'total_bytes': total_bytes,
        'hosts_t': hosts_t,
        'bysize': bysize,
        'rows': rows
    }
def main(args):
    """Load the HAR archive named by *args.archive*, visit it, summarize."""
    logging.basicConfig(level=args.level)
    from visitors import HttpArchiveVisitor

    with open(args.archive, "r", encoding="utf-8") as archive:
        parser = HarParser(json.load(archive))

    visitor = HttpArchiveVisitor()
    visitor.visit(parser)
    visitor.summarize()
def get_entries(filename: str, entry_id: int = None) -> (dict, list):
    """Gets either all the entries or a certain one"""
    har_path = os.path.join(os.getenv("UPLOAD_FOLDER", "/tmp"), filename)  # nosec
    with open(har_path, "r", encoding="utf-8") as process_file:
        pages = HarParser(json.loads(process_file.read())).pages

    # Flatten every page's entries into one ordered list
    entries = []
    for page in pages:
        entries.extend(page.entries)

    if isinstance(entry_id, int):
        return entries[entry_id]
    return entries
def capture_url_traffic(self, url, wait_time=0):
    """
    Capture the har for a given url
    :param str url: url to capture traffic for
    :param int wait_time: time to wait after the page load
    :return: HarParser object, a page from har capture
    """
    self._client.new_har(options={'captureHeaders': True})
    self._driver.goto_url(url, absolute=True)
    time.sleep(wait_time)
    # Round-trip through JSON to snapshot the capture as plain dicts.
    snapshot = json.dumps(self._client.har, ensure_ascii=False)
    return HarParser(json.loads(snapshot)).pages[0]
def test_init_entry_with_no_pageref(har_data):
    '''
    If we find an entry with no pageref it should end up in a HarPage object
    with page ID of unknown
    '''
    har_parser = HarParser(har_data('missing_pageref.har'))
    # We should have two pages. One is defined in the pages key of the har
    # file but has no entries. The other should be our unknown page, with a
    # single entry
    assert len(har_parser.pages) == 2
    unknown_pages = [p for p in har_parser.pages if p.page_id == 'unknown']
    assert len(unknown_pages[0].entries) == 1
def test_match_status_code(har_data):
    """
    Tests the ability of the parser to match status codes.
    """
    har_parser = HarParser(har_data('humanssuck.net.har'))
    entry = har_data('single_entry.har')

    # Regex matching (the default behaviour)
    assert har_parser.match_status_code(entry, '2.*')
    assert not har_parser.match_status_code(entry, '3.*')

    # Literal string matching
    assert har_parser.match_status_code(entry, '200', regex=False)
    assert not har_parser.match_status_code(entry, '201', regex=False)
def test_http_version(har_data):
    """
    Tests the ability of the parser to match status codes.
    """
    har_parser = HarParser(har_data('humanssuck.net.har'))
    entry = har_data('single_entry.har')

    # Regex matching (the default behaviour)
    assert har_parser.match_http_version(entry, '.*1.1')
    assert not har_parser.match_http_version(entry, '.*2')

    # Literal string matching
    assert har_parser.match_http_version(entry, 'HTTP/1.1', regex=False)
    assert not har_parser.match_http_version(entry, 'HTTP/2.0', regex=False)
def setHeadersFromHarFile(self, fileName, urlMustContain):
    """Load request headers from a .har capture or a plain JSON headers file.

    Uses the first request whose URL contains *urlMustContain*, strips
    pseudo-headers and hop-by-hop fields, and stores the rest on
    ``self.headers`` as an OrderedDict.
    """
    try:
        from pathlib import Path
        headersList = []
        if Path(fileName).suffix == '.har':
            from haralyzer import HarParser
            file = helpers.getFile(fileName)
            j = json.loads(file)
            har_page = HarParser(har_data=j)
            # find the right url -- stop at the FIRST matching entry.
            # Fixed: the original bare `break` only exited the inner loop,
            # so a match on a later page silently overwrote the first one.
            found = False
            for page in har_page.pages:
                for entry in page.entries:
                    if urlMustContain in entry['request']['url']:
                        headersList = entry['request']['headers']
                        found = True
                        break
                if found:
                    break
        else:
            headersList = helpers.getJsonFile(fileName)
            headersList = get(headersList, 'headers')
        headers = []
        for header in headersList:
            name = header.get('name', '')
            # ignore pseudo-headers
            if name.startswith(':'):
                continue
            if name.lower() == 'content-length' or name.lower() == 'host':
                continue
            newHeader = (name, header.get('value', ''))
            headers.append(newHeader)
        self.headers = OrderedDict(headers)
    except Exception as e:
        helpers.handleException(e)
def test_match_request_type(har_data):
    """
    Tests the ability of the parser to match a request type.
    """
    # The HarParser does not work without a full har file, but we only want
    # to test a piece, so this initial load is just so we can get the object
    # loaded, we don't care about the data in that HAR file.
    har_parser = HarParser(har_data('humanssuck.net.har'))
    entry = har_data('single_entry.har')

    # Regex matching (the default behaviour)
    assert har_parser.match_request_type(entry, '.*ET')
    assert not har_parser.match_request_type(entry, '.*ST')

    # Literal string matching
    assert har_parser.match_request_type(entry, 'GET', regex=False)
    assert not har_parser.match_request_type(entry, 'POST', regex=False)
def extract_adobe_from_har(file_path_to_har_file):
    """Extract Adobe Analytics hits from a HAR capture.

    Scans every page for GET and POST requests sent to the Adobe collection
    endpoint, parses each hit's payload (post body for POST, query string
    for GET) with ``parse_query_string`` and returns the hits sorted by
    their 't' parameter.

    :param file_path_to_har_file: path to the .har file on disk
    :return: list of parsed-hit dicts sorted by key 't'
    """
    # Hoisted: the endpoint literal was previously duplicated in both loops.
    adobe_endpoint = "https://woolworthsfoodgroup.sc.omtrdc"
    list_to_print = []
    with open(file_path_to_har_file, "r") as f:
        har_parser = HarParser(json.loads(f.read()))
    for har_page in har_parser.pages:
        ## POST requests: the payload travels in the request body
        for request in har_page.post_requests:
            if adobe_endpoint in request["request"]["url"]:
                query = parse_query_string(request["request"]["postData"]["text"])
                list_to_print.append(query)
        ## GET requests: the payload travels in the query string
        for request in har_page.get_requests:
            if adobe_endpoint in request["request"]["url"]:
                parsed = urllib.parse.urlparse(request["request"]["url"])
                query = parse_query_string(parsed.query)
                list_to_print.append(query)
    # (Removed an unused `data_sent` unquote and commented-out debug prints.)
    return sorted(list_to_print, key=lambda k: k["t"])
def parse_har_file(har_file):
    """
    Parse a HAR file into a list of request objects

    This currently filters requests by content_type (text/html)
    """
    har_parser = HarParser(json.load(har_file))
    requests = []
    for page in har_parser.pages:
        for entry in page.filter_entries(content_type='text/html'):
            entry_request = entry['request']
            base_url = "{0.scheme}://{0.netloc}".format(
                urlsplit(entry_request['url']))
            request = {
                'method': entry_request['method'],
                'url': entry_request['url'].replace(base_url, ""),
                'datetime': dateutil.parser.parse(entry['startedDateTime']),
            }
            if entry_request['method'] == 'POST':
                data = {}
                for item in entry_request['postData']['params']:
                    data[unquote_plus(item['name'])] = unquote_plus(item['value'])
                data.pop('csrf_token', None)
                request['data'] = data
            requests.append(request)

    # Order chronologically, then drop the sort key before returning.
    requests.sort(key=itemgetter('datetime'))
    for request in requests:
        request.pop('datetime', None)

    return {'requests': requests}
def get_info_from_har(file_path):
    """Extract method, url, headers, query string and cookies of the
    actual page request from a HAR file."""
    with open(file_path, 'r', encoding='UTF8') as f:
        har_parser = HarParser(json.loads(f.read()))

    # Look the request dict up once instead of per field.
    request = har_parser.pages[0].actual_page['request']
    headers = {h['name']: h['value'] for h in request['headers']}

    return {
        'method': request['method'],
        'url': request['url'],
        'headers': headers,
        'queryString': request['queryString'],
        'cookies': request['cookies'],
    }
def get_response_contents_from_har(har_path):
    """Collect response bodies keyed by base URL, concatenating duplicates.

    Image, font and css responses are skipped; unreadable HAR files yield
    an empty mapping.
    """
    response_contents = defaultdict(str)
    with open(har_path, 'r') as f:
        try:
            har_parser = HarParser(json.loads(f.read()))
        except ValueError:
            # Unparseable HAR -> empty result rather than an exception
            return response_contents
    for page in har_parser.pages:
        for entry in page.entries:
            try:
                url = entry["request"]["url"]
                base_url = url.split("?")[0].split("#")[0]
                mime_type = entry["response"]["content"]["mimeType"]
                skip = ("image" in mime_type or "font" in mime_type
                        or "css" in mime_type)
                if skip:
                    continue
                body = entry["response"]["content"]["text"]
                response_contents[base_url] += ("\n======\n" + body)
            except Exception:
                # Best effort: entries without a body (or with an odd
                # shape) are silently skipped.
                pass
    return response_contents
'WWW-Authenticate', 'X-Frame-Options', 'A-IM', 'Accept', 'Accept-Charset', 'Accept-Datetime', 'Accept-Encoding', 'Accept-Language', 'Access-Control-Request-Method', 'Access-Control-Request-Headers', 'Authorization', 'Cache-Control', 'Connection', 'Content-Length', 'Content-MD5', 'Content-Type', 'Cookie', 'Date', 'Expect', 'Forwarded', 'From', 'Host', 'HTTP2-Settings', 'If-Match', 'If-Modified-Since', 'If-None-Match', 'If-Range', 'If-Unmodified-Since', 'Max-Forwards', 'Origin', 'Pragma', 'Proxy-Authorization', 'Range', 'Referer', 'TE', 'Upgrade', 'User-Agent', 'Via', 'Warning' ] FIELDS = [] for a in FIELDSs: FIELDS.append(a.lower()) with open('arcCSP.har', 'r') as f: data = HarParser(json.loads(f.read())) for page in data.pages: toprint = "" toprint = toprint + "=========================\n" + str(page) print(toprint) for entry in page.entries: tab = entry['request']['headers'] toprinta = "" toprinta = toprinta + entry['request']['url'] + "\n" + entry[ 'request']['httpVersion'] + "\n" #print(entry['request']['url']) #print(entry['request']['httpVersion']) #print(' ') i = 0 for aa in tab:
from haralyzer import HarParser, HarPage from numpy import trapz import pandas as pd import asciiplotlib as apl # import matplotlib.pyplot as plt # Handle too many or not enough inputs if len(sys.argv) < 2: raise Exception("Error: need a path to HAR file as command-line argument") elif len(sys.argv) > 2: raise Exception("Error: gave too many command-line arguments") # Get HAR archive File name (as command-line argument) har = sys.argv[1] with open(har, 'r') as f: har_parser = HarParser(json.loads(f.read())) # Get onLoad per page load page_onLoad = [] for item in har_parser.har_data["pages"]: page_onLoad.append(item.get("pageTimings").get("onLoad")) # Get total in bytes for _bytesIn and _objectSize numPages = 0 total_bytesIn = [] total_objectSize = [] for page in har_parser.pages: numPages += 1 byteSize = objSize = 0 for entry in page.entries: byteSize += int(entry["_bytesIn"])
url = "https://www.instagram.com/p/%s/" % shortcode ts = int(time) utc = datetime.utcfromtimestamp(ts).strftime( '%Y-%m-%d %H:%M:%S') g.write("%s,%s,%s,%s,%s,%s\n" % (shortcode, url, time, utc, likes, comments)) except Exception as e: #print(e) pass #return shortcode_list2 if __name__ == "__main__": with open(sys.argv[1], 'rb') as f: har = f.read() har_parser = HarParser(json.loads(har)) har_page = HarPage('page_4', har_data=json.loads(har)) x = len(har_page.entries) for i in range(0, x): resource_type = har_page.entries[i]['_resourceType'] #print(resource_type) req_url = har_page.entries[i]['request']['url'] if req_url == "https://www.instagram.com/katyperry/": #First 12 posts res = har_page.entries[0]['response']['content']['text'] #print(res) first_12_posts = get_shortcode_first(res) elif resource_type == "xhr" and req_url.startswith( "https://www.instagram.com/graphql/query/?query_hash="): #for other posts res = har_page.entries[i]['response']['content']['text']
def read_har(harfile):
    """Read *harfile* from disk and return a haralyzer parser for it."""
    # Read harfile and return haralyzer parser
    with open(harfile, 'r') as har:
        parsed = json.loads(har.read())
    return HarParser(parsed)
def test_match_headers(har_data):
    """Exercise header matching: invalid section names must raise, and both
    regex and literal matching must accept/reject the expected values."""
    # The HarParser does not work without a full har file, but we only want
    # to test a piece, so this initial load is just so we can get the object
    # loaded, we don't care about the data in that HAR file.
    init_data = har_data('humanssuck.net.har')
    har_parser = HarParser(init_data)

    raw_headers = har_data('single_entry.har')

    # Make sure that bad things happen if we don't give it response/request
    test_data = {
        'captain beefheart': {
            'accept': '.*text/html,application/xhtml.*',
            'host': 'humanssuck.*',
            'accept-encoding': '.*deflate',
        },
    }
    with pytest.raises(ValueError):
        _headers_test(har_parser, raw_headers, test_data, True, True)

    # TEST THE REGEX FEATURE FIRST #

    # These should all be True
    test_data = {
        'request': {
            'accept': '.*text/html,application/xhtml.*',
            'host': 'humanssuck.*',
            'accept-encoding': '.*deflate',
        },
        'response': {
            'server': 'nginx',
            'content-type': 'text.*',
            'connection': '.*alive',
        },
    }
    _headers_test(har_parser, raw_headers, test_data, True, True)

    # Regex patterns that must NOT match
    test_data = {
        'request': {
            'accept': '.*text/xml,application/xhtml.*',
            'host': 'humansrule.*',
            'accept-encoding': 'i dont accept that',
        },
        'response': {
            'server': 'apache',
            'content-type': 'application.*',
            'connection': '.*dead',
        },
    }
    _headers_test(har_parser, raw_headers, test_data, False, True)

    # Test literal string matches #

    # These should all be True
    test_data = {
        'request': {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'host': 'humanssuck.net',
            'accept-encoding': 'gzip, deflate',
        },
        'response': {
            'server': 'nginx',
            'content-type': 'text/html; charset=UTF-8',
            'connection': 'keep-alive',
        },
    }
    _headers_test(har_parser, raw_headers, test_data, True, False)

    # Literal values that must NOT match
    test_data = {
        'request': {
            'accept': 'I accept nothing',
            'host': 'humansrule.guru',
            'accept-encoding': 'i dont accept that',
        },
        'response': {
            'server': 'apache',
            'content-type': 'your mom',
            'connection': 'not keep-alive',
        },
    }
    _headers_test(har_parser, raw_headers, test_data, False, False)
def __init__(self, pydict):
    """Store the raw HAR dict and wrap it in a HarParser."""
    # pydict: a HAR archive already decoded from JSON into a dict
    self.__har = pydict
    self.__har_parser = HarParser(pydict)
if args.encode == "1": #IE and firefox encode has strange way to encode none english charater. "头" utf8 hex is e5a4b4, but IE and firefox change it to c3a5c2a4c2b4, so need to fix it here hex = har_text.encode("hex") n = 2 hex = ' '.join([hex[j:j + n] for j in range(0, len(hex), n)]) hex_r = re.sub(r'c3 a(\w) c2 (\w{2}) c2 (\w{2})', r'e\1\2\3', hex).replace(' ', '') har_text = hex_r.decode('hex') encodlist = os.popen( 'grep \'"name": "Content-Type"\' ' + i + ' -A 1|grep -iPo \'charset=.*"\'|sort -u|grep -iv "utf-8"' ).readlines() for encode in encodlist: replacestring = encode.strip().replace('"', '').split('=')[1] har_text = har_text.replace(replacestring, 'UTF-8') har_parser = HarParser(json.loads(har_text)) #for page in har_parser.pages: for entry in har_parser.har_data['entries']: entries.append(entry) print "Found requests number: " + str(len(entries)) #generates dic has entries list ID, started_time and url. Then can always use entries list ID to match request and response. #{0: ['2019-01-31T01:51:06.305Z', 'POST:/dvwa/vulnerabilities/xss_r/?name=test','username=123&passowrd=123']} start_time_dict = {} for i in range(len(entries)): start_time_dict[i] = [] start_time_dict[i].append(str(entries[i]['startedDateTime'])) #print entries[i]['request']['url'] url_match = re.search(r'(http://|https://)(.*?\/)(.*)', str(entries[i]['request']['url']))