Example #1
def test_init(har_data):
    """
    Test the object loading
    """
    with pytest.raises(ValueError):
        page = HarPage(PAGE_ID)

    init_data = har_data('humanssuck.net.har')

    # Throws PageNotFoundException with bad page ID
    with pytest.raises(PageNotFoundError):
        page = HarPage(BAD_PAGE_ID, har_data=init_data)

    # Make sure it can load with either har_data or a parser
    page = HarPage(PAGE_ID, har_data=init_data)
    assert isinstance(page, HarPage)
    parser = HarParser(init_data)
    page = HarPage(PAGE_ID, har_parser=parser)
    assert isinstance(page, HarPage)

    assert len(page.entries) == 4
    # Make sure that the entries are actually in order. Going a little bit
    # old school here.
    for index in range(0, len(page.entries)):
        if index != len(page.entries) - 1:
            current_date = dateutil.parser.parse(
                page.entries[index]['startedDateTime'])
            next_date = dateutil.parser.parse(
                page.entries[index + 1]['startedDateTime'])
            assert current_date <= next_date
Example #2
def test_get_load_time(har_data):
    """
    Tests HarPage.get_load_time()
    """
    init_data = har_data('humanssuck.net.har')
    page = HarPage(PAGE_ID, har_data=init_data)

    assert page.get_load_time(request_type='GET') == 463
    # 'async' became a reserved word in Python 3.7; newer haralyzer releases
    # rename this keyword argument to 'asynchronous'
    assert page.get_load_time(request_type='GET', asynchronous=False) == 843
    assert page.get_load_time(content_type='image.*') == 304
    assert page.get_load_time(status_code='2.*') == 463
Example #3
def test_filter_entries_load_time(har_data):
    """
    Tests ability to filter entries by load time
    """
    init_data = har_data('humanssuck.net_duplicate_url.har')
    page = HarPage(PAGE_ID, har_data=init_data)

    entries = page.filter_entries(load_time__gt=100)
    assert len(entries) == 4
    entries = page.filter_entries(load_time__gt=300)
    assert len(entries) == 3
    entries = page.filter_entries(load_time__gt=500)
    assert len(entries) == 0
Example #4
def test_time_to_first_byte(har_data):
    """
    Tests that TTFB is correctly reported as a property of the page.
    """
    init_data = har_data('humanssuck.net.har')
    page = HarPage(PAGE_ID, har_data=init_data)
    assert page.time_to_first_byte == 153
Example #5
def test_hostname(har_data):
    """
    Makes sure that the correct hostname is returned.
    """
    init_data = har_data('humanssuck.net.har')
    page = HarPage(PAGE_ID, har_data=init_data)
    assert page.hostname == 'humanssuck.net'
Example #6
def test_url(har_data):
    """
    Makes sure that the correct URL is returned.
    """
    init_data = har_data('humanssuck.net.har')
    page = HarPage(PAGE_ID, har_data=init_data)
    assert page.url == 'http://humanssuck.net/'
Example #7
def test_no_title(har_data):
    '''
    A page with no title should set the title property to an empty string
    instead of raising an exception.
    '''
    init_data = har_data('no_title.har')
    page = HarPage(PAGE_ID, har_data=init_data)
    assert page.title == ''
Example #8
def parse_har_file(path):
    with open(path, 'r') as f:
        try:
            har_page = HarPage('page_1', har_data=json.loads(f.read()))
            return har_page.time_to_first_byte, har_page.page_load_time
        except Exception:
            print(f'Failed to parse HAR file from {path}')

    return None, None
Example #9
def test_duplicate_urls_count(har_data):
    """
    Makes sure the correct counts are reported for URLs that appear more than once in the HAR.
    """
    init_data = har_data('humanssuck.net_duplicate_url.har')
    page = HarPage(PAGE_ID, har_data=init_data)
    assert page.duplicate_url_request == {
        'http://humanssuck.net/jquery-1.7.1.min.js': 2
    }
Example #10
def GetResources(url, site, initial_url):
    filename = url + '.har'
    with open(filename, 'r', encoding='UTF-8') as f_1:
        temp = json.loads(f_1.read())
        # parse the HAR JSON into a dict
        page_id = temp['log']['pages'][0]['id']
        # grab the page id
        har_page = HarPage(page_id, har_data=temp)

    filename = url + '.json'
    f_2 = open(filename, 'w')
    resource_list = []
    for i in range(len(har_page.entries)):
        num = i + 1
        url = har_page.entries[i]['request']['url']
        domain_list = re.findall(r"//(.+?)/", url)
        ip = har_page.entries[i]['serverIPAddress']
        mimeType = har_page.entries[i]['response']['content']['mimeType']
        # size_kb = har_page.entries[i]['response']['content']['size'] / 1024
        # size = round(size_kb, 1)  # keep one decimal place
        size_b = har_page.entries[i]['response']['bodySize']
        size_kb = har_page.entries[i]['response']['bodySize'] / 1024
        size_kb = round(size_kb, 1)  # keep one decimal place
        data = {'num': num, 'url': url, 'ip': ip, 'mimeType': mimeType, 'size': size_b, 'domain': domain_list[0]}
        resource_list.append(data)
    resources = json.dumps(resource_list, indent=4)
    print(resources, file=f_2)
    #print(resources)
    f_2.close()

    # work out how many distinct domains appear
    domain_list = []
    for item in json.loads(resources):
        # print(item)
        domain = item['domain']

        if domain in domain_list:
            continue
        else:
            domain_list.append(domain)
    # print(domain_list)

    for host in domain_list:
        size = 0
        for item in json.loads(resources):
            domain = item['domain']
            if domain == host:
                ip = item['ip']
                size = size + item['size']
            else:
                continue
        expired = "2086-03-09T21:00:00+08:00"
        data = {"site": site, 'domain': host, 'ip': ip, 'expired': expired , 'size':size}
        data_json = json.dumps(data)
        print(data_json)
Example #11
def test_sizes_trans(har_data):
    init_data = har_data('cnn-chrome.har')
    page = HarPage('page_1', har_data=init_data)

    assert page.page_size_trans == 2609508
    assert page.text_size_trans == 569814
    assert page.css_size_trans == 169573
    assert page.js_size_trans == 1600321
    assert page.image_size_trans == 492950
    # TODO - Get test data for audio and video
    assert page.audio_size_trans == 0
    assert page.video_size_trans == 0
Example #12
def test_sizes(har_data):
    init_data = har_data('humanssuck.net.har')
    page = HarPage(PAGE_ID, har_data=init_data)

    assert page.page_size == 62204
    assert page.text_size == 246
    assert page.css_size == 8
    assert page.js_size == 38367
    assert page.image_size == 23591
    # TODO - Get test data for audio and video
    assert page.audio_size == 0
    assert page.video_size == 0
Example #13
def test_request_types(har_data):
    """
    Test request type filters
    """
    init_data = har_data('humanssuck.net.har')
    page = HarPage(PAGE_ID, har_data=init_data)

    # Check request type lists
    for req in page.get_requests:
        assert req['request']['method'] == 'GET'

    for req in page.post_requests:
        assert req['request']['method'] == 'POST'
Example #14
def test_file_types(har_data):
    """
    Test file type properties
    """
    init_data = har_data('cnn.har')
    page = HarPage(PAGE_ID, har_data=init_data)

    file_types = {'image_files': ['image'], 'css_files': ['css'],
                  'js_files': ['javascript'], 'audio_files': ['audio'],
                  'video_files': ['video', 'flash'], 'text_files': ['text'],
                  'html_files': ['html']}

    for k, v in iteritems(file_types):
        for asset in getattr(page, k, None):
            assert _correct_file_type(asset, v)
Example #15
def test_filter_entries(har_data):
    """
    Tests ability to filter entries, with or without regex
    """
    init_data = har_data('humanssuck.net.har')
    page = HarPage(PAGE_ID, har_data=init_data)

    # Filter by request type only
    entries = page.filter_entries(request_type='.*ET')
    assert len(entries) == 4
    for entry in entries:
        assert entry['request']['method'] == 'GET'

    # Filter by request type and content_type
    entries = page.filter_entries(request_type='.*ET', content_type='image.*')
    assert len(entries) == 1
    for entry in entries:
        assert entry['request']['method'] == 'GET'
        for header in entry['response']['headers']:
            if header['name'] == 'Content-Type':
                assert re.search('image.*', header['value'])

    # Filter by request type, content type, and status code
    entries = page.filter_entries(request_type='.*ET',
                                  content_type='image.*',
                                  status_code='2.*')
    assert len(entries) == 1
    for entry in entries:
        assert entry['request']['method'] == 'GET'
        assert re.search('2.*', str(entry['response']['status']))
        for header in entry['response']['headers']:
            if header['name'] == 'Content-Type':
                assert re.search('image.*', header['value'])

    entries = page.filter_entries(request_type='.*ST')
    assert len(entries) == 0
    entries = page.filter_entries(request_type='.*ET', content_type='video.*')
    assert len(entries) == 0
    entries = page.filter_entries(request_type='.*ET',
                                  content_type='image.*',
                                  status_code='3.*')
    assert len(entries) == 0
Example #16
    def _pre_parse(self, har_file_data, har_file, page_url, sitemap_url):
        """
        Prepare the data to be parsed

        :param HarFileData har_file_data:
        :param dict har_file:
        :param str page_url:
        :param str sitemap_url:
        :return:
        """
        har_file_data.page_url = page_url
        if har_file['log']['entries']:
            har_file_data.log_entries = har_file['log']['entries']
        page_id = har_file['log']['pages'][0].get('id')
        if page_id:
            har_file_data.har_page = HarPage(page_id, har_data=har_file)
        har_file_data.lower_datetime = datetime.now() + timedelta(hours=1)
        har_file_data.higher_datetime = datetime.now() - timedelta(hours=1)
        har_file_data.sitemap_domain = urlparse(sitemap_url).netloc
Example #17
def test_load_times(har_data):
    """
    This whole test really needs better sample data. I need to make a
    web page with like 2-3 of each asset type to really test the load times.
    """
    init_data = har_data('humanssuck.net.har')
    page = HarPage(PAGE_ID, har_data=init_data)
    # Check initial page load
    assert page.actual_page['request']['url'] == 'http://humanssuck.net/'

    # Check initial page load times
    assert page.initial_load_time == 153
    assert page.content_load_time == 543
    # Check content type browser (async) load times
    assert page.image_load_time == 304
    assert page.css_load_time == 76
    assert page.js_load_time == 310
    assert page.html_load_time == 153
    assert page.page_load_time == 567
    # TODO - Need to get sample data for these types
    assert page.audio_load_time == 0
    assert page.video_load_time == 0
Example #18
def test_entries(har_data):
    init_data = har_data('humanssuck.net.har')
    page = HarPage(PAGE_ID, har_data=init_data)

    for entry in page.entries:
        assert entry['pageref'] == page.page_id
Example #19
Rename all the files from 0 to n, e.g. 0.har, 1.har and so on.

Specify the max number as the limit in the for loop: for x in range(0, n).

Make sure the CSV file exists and is empty, since the loop appends to it.
'''
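
# Prep sketch (hypothetical, not part of the original script): the docstring above
# assumes the captures have already been renamed 0.har .. n.har and that output.csv
# starts out empty. Assuming the .har files live in the same 'har_files' directory
# the loop below reads from, and that no names collide, that step could look like:
import os

har_captures = sorted(name for name in os.listdir('har_files') if name.endswith('.har'))
for n, name in enumerate(har_captures):
    os.rename(os.path.join('har_files', name),
              os.path.join('har_files', f'{n}.har'))
open('output.csv', 'w').close()  # truncate the CSV so the loop below appends to a clean file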

for x in range(0, 86):

    with open(os.path.join('har_files', f'{x}.har'), 'r') as f:
        har_page = HarPage('page_1', har_data=json.loads(f.read()))

    entries = har_page.filter_entries()

    with open("output.csv", "a") as csv_file:
        csv_app = csv.writer(csv_file)
        csv_app.writerow([
            har_page.url, har_page.hostname, har_page.page_load_time,
            har_page.time_to_first_byte, har_page.page_size,
            har_page.initial_load_time, har_page.content_load_time,
            har_page.html_load_time, har_page.css_load_time,
            har_page.js_load_time, har_page.image_load_time,
            har_page.audio_load_time, har_page.video_load_time
        ])
'''
### WORK WITH LOAD TIMES (all load times are in ms) ###
Example #20
def parser(websites, repetition):
    for dom in websites:
        print(
            '################################################################')
        print(dom)
        jssize = []
        cssize = []
        imagesize = []
        videosize = []
        pagesize = []
        jsnum = []
        csnum = []
        imagenum = []
        videonum = []
        pagenum = []
        for i in range(0, repetition):
            #General har files
            string = str(i) + dom + 'har.har'
            with open(string, 'r') as f:
                har_parser = HarPage(dom, har_data=json.loads(f.read()))

            #for each har file of the web site under review
            #print ("Image Load time: " + str(har_parser.image_load_time))
            #print ("Html Load time: " + str(har_parser.html_load_time))

            #combines all the different har files generated for that specific web site and shows average/min/max
            #for the size of the different files
            jssize.append(har_parser.js_size)
            cssize.append(har_parser.css_size)
            imagesize.append(har_parser.image_size)
            videosize.append(har_parser.video_size)
            pagesize.append(har_parser.page_size)

            #for the number of different files
            jsnum.append(
                len(
                    har_parser.filter_entries(content_type='js.*',
                                              status_code='2.*')))
            csnum.append(
                len(
                    har_parser.filter_entries(content_type='css.*',
                                              status_code='2.*')))
            imagenum.append(
                len(
                    har_parser.filter_entries(content_type='image.*',
                                              status_code='2.*')))
            videonum.append(
                len(
                    har_parser.filter_entries(content_type='video.*',
                                              status_code='2.*')))
            pagenum.append(len(har_parser.filter_entries(status_code='2.*')))
            h = har_parser.filter_entries(content_type='text.*',
                                          status_code='2.*')

        print("Max page size: " + str(max(pagesize)), end='\n')
        print("Max js size: " + str(max(jssize)), end='\n')
        print("Max css size: " + str(max(cssize)), end='\n')
        print("Max image size: " + str(max(imagesize)), end='\n')
        print("Max video size: " + str(max(videosize)), end='\n')

        print("Max page number: " + str(max(pagenum)), end='\n')
        print("Max js number: " + str(max(jsnum)), end='\n')
        print("Max css number: " + str(max(csnum)), end='\n')
        print("Max image number: " + str(max(imagenum)), end='\n')
        print("Max video number: " + str(max(videonum)), end='\n')
        print(
            '--------------------------------------------------------------------'
        )

        print("Min page size: " + str(min(pagesize)), end='\n')
        print("Min js size: " + str(min(jssize)), end='\n')
        print("Min css size: " + str(min(cssize)), end='\n')
        print("Min image size: " + str(min(imagesize)), end='\n')
        print("Min video size: " + str(min(videosize)), end='\n')

        print("Min page number: " + str(min(pagenum)), end='\n')
        print("Min js number: " + str(min(jsnum)), end='\n')
        print("Min css number: " + str(min(csnum)), end='\n')
        print("Min image number: " + str(min(imagenum)), end='\n')
        print("Min video number: " + str(min(videonum)), end='\n')
        print(
            '--------------------------------------------------------------------'
        )
        print("Average page size: " + str(mean(pagesize)), end='\n')
        print("Average js size: " + str(mean(jssize)), end='\n')
        print("Average css size: " + str(mean(cssize)), end='\n')
        print("Average image size: " + str(mean(imagesize)), end='\n')
        print("Average video size: " + str(mean(videosize)), end='\n')

        print("Average page number: " + str(mean(pagenum)), end='\n')
        print("Average js number: " + str(mean(jsnum)), end='\n')
        print("Average css number: " + str(mean(csnum)), end='\n')
        print("Average image number: " + str(mean(imagenum)), end='\n')
        print("Average video number: " + str(mean(videonum)), end='\n')
        print(
            '--------------------------------------------------------------------'
        )

        #har files with delay specification
        string2 = str(i) + dom + 'Performance.har'
        with open(string2) as data_file:
            data = json.load(data_file)
        #All in milliseconds
        #Calculate the total time required to load a page
        print("Total time required to load the page: " +
              str(data["timing"]["loadEventEnd"] -
                  data["timing"]["navigationStart"]))
        #Calculate request response times
        print("Request response times: " + str(data["timing"]["responseEnd"] -
                                               data["timing"]["requestStart"]))
        #connectStart: represents the moment the request to open a connection is sent to the network
        #responseStart: represents the moment the browser received the first byte of the response
        print("Time to first byte: " + str(data["timing"]["responseStart"] -
                                           data["timing"]["connectStart"]))
Example #21
import json
import hashlib
from haralyzer import HarParser, HarPage
import requests

with open('www.reddit.com.har', 'r', encoding="utf8") as f:
    har_page = HarPage('page_1', har_data=json.loads(f.read()))

for entry in har_page.entries:
    if entry.request.method == 'GET':
        print(entry.response.status)
        if entry.response.status in [0, 200]:
            if not entry.request.cookies:
                # fetch the URL twice and report it when the body comes back identical
                t1 = requests.get(entry.url).content
                t2 = requests.get(entry.url).content
                if t1 == t2:
                    print(entry.url)
Example #22
import json
from haralyzer import HarPage

with open('b.har', 'r') as f:
    har_page = HarPage('page_1', har_data=json.loads(f.read()))

### WORK WITH LOAD TIMES (all load times are in ms) ###
# Link http://pythonhosted.org/haralyzer/haralyzer.html

entries = har_page.filter_entries()

#print(entries)

print(har_page.get_total_size(entries))

print(har_page.get_load_time(request_type='GET'))

print(har_page.total_load_time(request_type='GET'))
'''
# Audio Load Time
print(har_page.audio_load_time)

#Video Load Time
print(har_page.video_load_time)

#JS Load Time
print(har_page.js_load_time)

#Hostname
print(har_page.hostname)
Example #23
                ts = int(time)
                utc = datetime.utcfromtimestamp(ts).strftime(
                    '%Y-%m-%d %H:%M:%S')
                g.write("%s,%s,%s,%s,%s,%s\n" %
                        (shortcode, url, time, utc, likes, comments))
        except Exception as e:
            #print(e)
            pass
    #return shortcode_list2


if __name__ == "__main__":
    with open(sys.argv[1], 'rb') as f:
        har = f.read()
        har_parser = HarParser(json.loads(har))
        har_page = HarPage('page_4', har_data=json.loads(har))
    x = len(har_page.entries)
    for i in range(0, x):
        resource_type = har_page.entries[i]['_resourceType']
        #print(resource_type)
        req_url = har_page.entries[i]['request']['url']
        if req_url == "https://www.instagram.com/katyperry/":
            #First 12 posts
            res = har_page.entries[0]['response']['content']['text']
            #print(res)
            first_12_posts = get_shortcode_first(res)
        elif resource_type == "xhr" and req_url.startswith(
                "https://www.instagram.com/graphql/query/?query_hash="):
            #for other posts
            res = har_page.entries[i]['response']['content']['text']
            #print(res)