def test_init(har_data):
    """Test HarPage object loading.

    Verifies that construction fails without data, fails with an unknown
    page ID, succeeds with either raw HAR data or a HarParser, and that
    the page's entries are ordered chronologically.
    """
    # A page ID alone is not enough -- har_data or a parser is required.
    with pytest.raises(ValueError):
        HarPage(PAGE_ID)
    init_data = har_data('humanssuck.net.har')
    # Throws PageNotFoundException with bad page ID
    with pytest.raises(PageNotFoundError):
        HarPage(BAD_PAGE_ID, har_data=init_data)
    # Make sure it can load with either har_data or a parser
    page = HarPage(PAGE_ID, har_data=init_data)
    assert isinstance(page, HarPage)
    parser = HarParser(init_data)
    page = HarPage(PAGE_ID, har_parser=parser)
    assert isinstance(page, HarPage)
    assert len(page.entries) == 4
    # Entries must be in chronological order: compare each adjacent pair
    # instead of the original index arithmetic with a manual bounds guard.
    for current, following in zip(page.entries, page.entries[1:]):
        current_date = dateutil.parser.parse(current['startedDateTime'])
        next_date = dateutil.parser.parse(following['startedDateTime'])
        assert current_date <= next_date
def test_get_load_time(har_data):
    """Tests HarPage.get_load_time()."""
    init_data = har_data('humanssuck.net.har')
    page = HarPage(PAGE_ID, har_data=init_data)
    assert page.get_load_time(request_type='GET') == 463
    # BUG FIX: 'async' became a reserved keyword in Python 3.7, so
    # `async=False` is a SyntaxError; haralyzer renamed the parameter
    # to 'asynchronous'.
    assert page.get_load_time(request_type='GET', asynchronous=False) == 843
    assert page.get_load_time(content_type='image.*') == 304
    assert page.get_load_time(status_code='2.*') == 463
def test_filter_entries_load_time(har_data):
    """Tests ability to filter entries by load time."""
    page = HarPage(PAGE_ID,
                   har_data=har_data('humanssuck.net_duplicate_url.har'))
    # Raising the threshold must shrink the matching set accordingly.
    for threshold, expected_count in ((100, 4), (300, 3), (500, 0)):
        matched = page.filter_entries(load_time__gt=threshold)
        assert len(matched) == expected_count
def test_time_to_first_byte(har_data):
    """Tests that TTFB is correctly reported as a property of the page."""
    page = HarPage(PAGE_ID, har_data=har_data('humanssuck.net.har'))
    assert page.time_to_first_byte == 153
def test_hostname(har_data):
    """Makes sure that the correct hostname is returned."""
    page = HarPage(PAGE_ID, har_data=har_data('humanssuck.net.har'))
    assert page.hostname == 'humanssuck.net'
def test_url(har_data):
    """Makes sure that the correct URL is returned."""
    page = HarPage(PAGE_ID, har_data=har_data('humanssuck.net.har'))
    assert page.url == 'http://humanssuck.net/'
def test_no_title(har_data):
    """A page with no title should expose an empty string for the title
    property instead of raising an exception."""
    page = HarPage(PAGE_ID, har_data=har_data('no_title.har'))
    assert page.title == ''
def parse_har_file(path):
    """Parse the HAR file at *path* and return its key timings.

    :param str path: filesystem path to a HAR (JSON) file
    :return: tuple ``(time_to_first_byte, page_load_time)``, or
        ``(None, None)`` if the file cannot be read or parsed.
    """
    try:
        with open(path, 'r') as f:
            # json.load reads the stream directly; no need for read()+loads.
            har_page = HarPage('page_1', har_data=json.load(f))
        return har_page.time_to_first_byte, har_page.page_load_time
    except Exception as exc:
        # Deliberate best-effort boundary, but never a bare `except:`
        # (that would also swallow KeyboardInterrupt/SystemExit), and
        # always log the reason instead of hiding it.
        print(f'Failed to parse HAR file from {path}: {exc}')
        return None, None
def test_duplicate_urls_count(har_data):
    """Checks the reported count of URLs appearing more than once in the HAR."""
    page = HarPage(PAGE_ID,
                   har_data=har_data('humanssuck.net_duplicate_url.har'))
    expected = {'http://humanssuck.net/jquery-1.7.1.min.js': 2}
    assert page.duplicate_url_request == expected
def GetResources(url, site, initial_url):
    """Read ``<url>.har``, dump per-entry resource info to ``<url>.json``,
    and print one JSON summary record per domain seen in the page.

    :param str url: basename used for both the input .har and output .json
    :param str site: site label embedded in each per-domain summary record
    :param str initial_url: unused; kept for interface compatibility
    """
    with open(url + '.har', 'r', encoding='UTF-8') as f_1:
        temp = json.loads(f_1.read())  # parsed HAR archive (dict)
    page_id = temp['log']['pages'][0]['id']  # id of the first page
    har_page = HarPage(page_id, har_data=temp)

    resource_list = []  # renamed: never shadow the builtin 'list'
    for num, entry in enumerate(har_page.entries, start=1):
        # Use a distinct name instead of clobbering the 'url' parameter.
        entry_url = entry['request']['url']
        domains = re.findall(r"//(.+?)/", entry_url)
        resource_list.append({
            'num': num,
            'url': entry_url,
            'ip': entry['serverIPAddress'],
            'mimeType': entry['response']['content']['mimeType'],
            'size': entry['response']['bodySize'],  # bytes, as before
            'domain': domains[0],
        })

    resources = json.dumps(resource_list, indent=4)
    # BUG FIX: the original called `f_2.close` without parentheses, so the
    # output file was never closed; a `with` block guarantees it.
    with open(url + '.json', 'w') as f_2:
        print(resources, file=f_2)

    # Distinct domains, preserving first-seen order; iterate the list we
    # already have instead of re-parsing our own JSON dump.
    domain_list = []
    for item in resource_list:
        if item['domain'] not in domain_list:
            domain_list.append(item['domain'])

    for host in domain_list:
        size = 0
        ip = None
        for item in resource_list:
            if item['domain'] == host:
                ip = item['ip']  # last matching entry's IP, as before
                size += item['size']
        expired = "2086-03-09T21:00:00+08:00"
        data = {"site": site, 'domain': host, 'ip': ip, 'expired': expired,
                'size': size}
        print(json.dumps(data))
def test_sizes_trans(har_data):
    """Checks the transferred sizes reported for cnn-chrome.har."""
    page = HarPage('page_1', har_data=har_data('cnn-chrome.har'))
    expected = {
        'page_size_trans': 2609508,
        'text_size_trans': 569814,
        'css_size_trans': 169573,
        'js_size_trans': 1600321,
        'image_size_trans': 492950,
        # TODO - Get test data for audio and video
        'audio_size_trans': 0,
        'video_size_trans': 0,
    }
    for attr, size in expected.items():
        assert getattr(page, attr) == size
def test_sizes(har_data):
    """Checks the asset sizes reported for humanssuck.net.har."""
    page = HarPage(PAGE_ID, har_data=har_data('humanssuck.net.har'))
    expected = {
        'page_size': 62204,
        'text_size': 246,
        'css_size': 8,
        'js_size': 38367,
        'image_size': 23591,
        # TODO - Get test data for audio and video
        'audio_size': 0,
        'video_size': 0,
    }
    for attr, size in expected.items():
        assert getattr(page, attr) == size
def test_request_types(har_data):
    """Test request type filters."""
    page = HarPage(PAGE_ID, har_data=har_data('humanssuck.net.har'))
    # Every entry in each request-type list must carry the matching verb.
    assert all(r['request']['method'] == 'GET' for r in page.get_requests)
    assert all(r['request']['method'] == 'POST' for r in page.post_requests)
def test_file_types(har_data):
    """Test file type properties."""
    init_data = har_data('cnn.har')
    page = HarPage(PAGE_ID, har_data=init_data)

    file_types = {'image_files': ['image'],
                  'css_files': ['css'],
                  'js_files': ['javascript'],
                  'audio_files': ['audio'],
                  'video_files': ['video', 'flash'],
                  'text_files': ['text'],
                  'html_files': ['html']}

    # .items() replaces six's iteritems (Python 3 only).
    for attr, expected in file_types.items():
        # No default on getattr: a missing property should raise
        # AttributeError here rather than the original's confusing
        # TypeError from iterating None.
        for asset in getattr(page, attr):
            assert _correct_file_type(asset, expected)
def test_filter_entries(har_data):
    """Tests ability to filter entries, with or without regex."""
    init_data = har_data('humanssuck.net.har')
    page = HarPage(PAGE_ID, har_data=init_data)

    # Filter by request type only
    entries = page.filter_entries(request_type='.*ET')
    assert len(entries) == 4
    for entry in entries:
        assert entry['request']['method'] == 'GET'

    # Filter by request type and content_type
    entries = page.filter_entries(request_type='.*ET', content_type='image.*')
    assert len(entries) == 1
    for entry in entries:
        assert entry['request']['method'] == 'GET'
        for header in entry['response']['headers']:
            if header['name'] == 'Content-Type':
                assert re.search('image.*', header['value'])

    # Filter by request type, content type, and status code
    entries = page.filter_entries(request_type='.*ET', content_type='image.*',
                                  status_code='2.*')
    assert len(entries) == 1
    for entry in entries:
        assert entry['request']['method'] == 'GET'
        assert re.search('2.*', str(entry['response']['status']))
        for header in entry['response']['headers']:
            if header['name'] == 'Content-Type':
                assert re.search('image.*', header['value'])

    # Filters that should match nothing.
    entries = page.filter_entries(request_type='.*ST')
    assert len(entries) == 0
    entries = page.filter_entries(request_type='.*ET', content_type='video.*')
    assert len(entries) == 0
    entries = page.filter_entries(request_type='.*ET', content_type='image.*',
                                  status_code='3.*')
    # BUG FIX: the original computed this filter but never asserted on it.
    assert len(entries) == 0
def _pre_parse(self, har_file_data, har_file, page_url, sitemap_url): """ Prepare the data to be parsed :param HarFileData har_file_data: :param dict har_file: :param str page_url: :param str sitemap_url: :return: """ har_file_data.page_url = page_url if har_file['log']['entries']: har_file_data.log_entries = har_file['log']['entries'] page_id = har_file['log']['pages'][0].get('id') if page_id: har_file_data.har_page = HarPage(page_id, har_data=har_file) har_file_data.lower_datetime = datetime.now() + timedelta(hours=1) har_file_data.higher_datetime = datetime.now() - timedelta(hours=1) har_file_data.sitemap_domain = urlparse(sitemap_url).netloc
def test_load_times(har_data):
    """
    This whole test really needs better sample data. I need to make a
    web page with like 2-3 of each asset type to really test the load times.
    """
    page = HarPage(PAGE_ID, har_data=har_data('humanssuck.net.har'))

    # The initial request should be the page itself.
    assert page.actual_page['request']['url'] == 'http://humanssuck.net/'

    # Initial page load timings.
    assert page.initial_load_time == 153
    assert page.content_load_time == 543

    # Content type browser (async) load times.
    expected_times = {
        'image_load_time': 304,
        'css_load_time': 76,
        'js_load_time': 310,
        'html_load_time': 153,
        'page_load_time': 567,
        # TODO - Need to get sample data for these types
        'audio_load_time': 0,
        'video_load_time': 0,
    }
    for prop, value in expected_times.items():
        assert getattr(page, prop) == value
def test_entries(har_data):
    """Every entry of a page must reference that page's ID."""
    page = HarPage(PAGE_ID, har_data=har_data('humanssuck.net.har'))
    assert all(e['pageref'] == page.page_id for e in page.entries)
Rename all files from 0 to n, e.g. 0.har, 1.har, and so on. Specify the maximum number as the limit in the for loop: for x in range(0, n). Make sure the CSV file exists and is empty, as the loop appends to it. ''' for x in range(0, 86): with open(os.path.join('har_files', f'{x}.har'), 'r') as f: har_page = HarPage('page_1', har_data=json.loads(f.read())) entries = har_page.filter_entries() with open("output.csv", "a") as csv_file: csv_app = csv.writer(csv_file) csv_app.writerow([ har_page.url, har_page.hostname, har_page.page_load_time, har_page.time_to_first_byte, har_page.page_size, har_page.initial_load_time, har_page.content_load_time, har_page.html_load_time, har_page.css_load_time, har_page.js_load_time, har_page.image_load_time, har_page.audio_load_time, har_page.video_load_time ]) ''' ### WORK WITH LOAD TIMES (all load times are in ms) ###
def parser(websites, repetition):
    """
    Print size/count statistics for each site over several HAR captures.

    For every domain in *websites*, reads ``<i><dom>har.har`` for each
    ``i`` in ``[0, repetition)`` plus a matching
    ``<i><dom>Performance.har`` navigation-timing file, then prints
    max / min / average sizes and request counts.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source; as written, the statistics prints run on EVERY iteration (on
    the lists accumulated so far) -- confirm against the original file's
    indentation.

    :param websites: iterable of domain strings used to build file names
    :param int repetition: number of captures per site
    """
    for dom in websites:
        print(
            '################################################################')
        print(dom)
        # Per-site accumulators; one value appended per repetition.
        jssize = []
        cssize = []
        imagesize = []
        videosize = []
        pagesize = []
        jsnum = []
        csnum = []
        imagenum = []
        videonum = []
        pagenum = []
        for i in range(0, repetition):
            # General har files
            string = str(i) + dom + 'har.har'
            with open(string, 'r') as f:
                har_parser = HarPage(dom, har_data=json.loads(f.read()))
            # for each har file of the web site under review
            #print ("Image Load time: " + str(har_parser.image_load_time))
            #print ("Html Load time: " + str(har_parser.html_load_time))
            # Combines all the different har files generated for that
            # specific web site and shows average/min/max.
            # Sizes of the different asset classes:
            jssize.append(har_parser.js_size)
            cssize.append(har_parser.css_size)
            imagesize.append(har_parser.image_size)
            videosize.append(har_parser.video_size)
            pagesize.append(har_parser.page_size)
            # Counts of the different asset classes (successful responses):
            jsnum.append(
                len(
                    har_parser.filter_entries(content_type='js.*',
                                              status_code='2.*')))
            csnum.append(
                len(
                    har_parser.filter_entries(content_type='css.*',
                                              status_code='2.*')))
            imagenum.append(
                len(
                    har_parser.filter_entries(content_type='image.*',
                                              status_code='2.*')))
            videonum.append(
                len(
                    har_parser.filter_entries(content_type='video.*',
                                              status_code='2.*')))
            pagenum.append(len(har_parser.filter_entries(status_code='2.*')))
            # NOTE(review): 'h' is assigned but never used.
            h = har_parser.filter_entries(content_type='text.*',
                                          status_code='2.*')
            print("Max page size: " + str(max(pagesize)), end='\n')
            print("Max js size: " + str(max(jssize)), end='\n')
            print("Max css size: " + str(max(cssize)), end='\n')
            print("Max image size: " + str(max(imagesize)), end='\n')
            print("Max video size: " + str(max(videosize)), end='\n')
            print("Max page number: " + str(max(pagenum)), end='\n')
            print("Max js number: " + str(max(jsnum)), end='\n')
            print("Max css number: " + str(max(csnum)), end='\n')
            print("Max image number: " + str(max(imagenum)), end='\n')
            print("Max video number: " + str(max(videonum)), end='\n')
            print(
                '--------------------------------------------------------------------'
            )
            print("Min page size: " + str(min(pagesize)), end='\n')
            print("Min js size: " + str(min(jssize)), end='\n')
            print("Min css size: " + str(min(cssize)), end='\n')
            print("Min image size: " + str(min(imagesize)), end='\n')
            print("Min video size: " + str(min(videosize)), end='\n')
            print("Min page number: " + str(min(pagenum)), end='\n')
            print("Min js number: " + str(min(jsnum)), end='\n')
            print("Min css number: " + str(min(csnum)), end='\n')
            print("Min image number: " + str(min(imagenum)), end='\n')
            print("Min video number: " + str(min(videonum)), end='\n')
            print(
                '--------------------------------------------------------------------'
            )
            print("Average page size: " + str(mean(pagesize)), end='\n')
            print("Average js size: " + str(mean(jssize)), end='\n')
            print("Average css size: " + str(mean(cssize)), end='\n')
            print("Average image size: " + str(mean(imagesize)), end='\n')
            print("Average video size: " + str(mean(videosize)), end='\n')
            print("Average page number: " + str(mean(pagenum)), end='\n')
            print("Average js number: " + str(mean(jsnum)), end='\n')
            print("Average css number: " + str(mean(csnum)), end='\n')
            print("Average image number: " + str(mean(imagenum)), end='\n')
            print("Average video number: " + str(mean(videonum)), end='\n')
            print(
                '--------------------------------------------------------------------'
            )
            # har files with delay specification
            string2 = str(i) + dom + 'Performance.har'
            with open(string2) as data_file:
                data = json.load(data_file)
            # All in milliseconds
            # Calculate the total time required to load a page
            print("Total time required to load the page: " +
                  str(data["timing"]["loadEventEnd"] -
                      data["timing"]["navigationStart"]))
            # Calculate request response times
            print("Request response times: " +
                  str(data["timing"]["responseEnd"] -
                      data["timing"]["requestStart"]))
            # connectStart: the moment the request to open a connection is
            # sent to the network
            # responseStart: the moment the browser received the first byte
            # of the response
            print("Time to first byte: " +
                  str(data["timing"]["responseStart"] -
                      data["timing"]["connectStart"]))
import json
import hashlib
from haralyzer import HarParser, HarPage
import requests

with open('www.reddit.com.har', 'r', encoding="utf8") as f:
    har_page = HarPage('page_1', har_data=json.loads(f.read()))

# For every cookie-less, successful GET resource, fetch the URL twice and
# print it when the body is identical across fetches.
for entry in har_page.entries:
    if entry.request.method != 'GET':
        continue
    print(entry.response.status)
    if entry.response.status not in (0, 200):
        continue
    if entry.request.cookies:
        continue
    first_body = requests.get(entry.url).content
    second_body = requests.get(entry.url).content
    if first_body == second_body:
        print(entry.url)
import json
from haralyzer import HarPage

# Load the HAR archive and build a page object for its first page.
with open('b.har', 'r') as f:
    har_page = HarPage('page_1', har_data=json.loads(f.read()))

### WORK WITH LOAD TIMES (all load times are in ms) ###
# Link http://pythonhosted.org/haralyzer/haralyzer.html

# No filter arguments: returns all entries of the page.
entries = har_page.filter_entries()
#print(entries)

print(har_page.get_total_size(entries))
print(har_page.get_load_time(request_type='GET'))
# NOTE(review): 'total_load_time' is not visible elsewhere in this file --
# confirm it exists on HarPage in the installed haralyzer version.
print(har_page.total_load_time(request_type='GET'))

# NOTE(review): the string below is opened but never closed in the visible
# source -- it presumably ends further down in the original file.
'''
# Audio Load Time
print(har_page.audio_load_time)

#Video Load Time
print(har_page.video_load_time)

#JS Load Time
print(har_page.js_load_time)

#Hostname
print(har_page.hostname)
ts = int(time) utc = datetime.utcfromtimestamp(ts).strftime( '%Y-%m-%d %H:%M:%S') g.write("%s,%s,%s,%s,%s,%s\n" % (shortcode, url, time, utc, likes, comments)) except Exception as e: #print(e) pass #return shortcode_list2 if __name__ == "__main__": with open(sys.argv[1], 'rb') as f: har = f.read() har_parser = HarParser(json.loads(har)) har_page = HarPage('page_4', har_data=json.loads(har)) x = len(har_page.entries) for i in range(0, x): resource_type = har_page.entries[i]['_resourceType'] #print(resource_type) req_url = har_page.entries[i]['request']['url'] if req_url == "https://www.instagram.com/katyperry/": #First 12 posts res = har_page.entries[0]['response']['content']['text'] #print(res) first_12_posts = get_shortcode_first(res) elif resource_type == "xhr" and req_url.startswith( "https://www.instagram.com/graphql/query/?query_hash="): #for other posts res = har_page.entries[i]['response']['content']['text'] #print(res)