Example #1
0
for line in all_lines:
	all_nodes.append(logbasic.NodeFromLog(line))
all_nodes.sort(lambda x,y: cmp(x,y), lambda x: x.start_time, False)

all_pages = []
last_page = None
for node in all_nodes:
	if node.is_root():
		new_page = WebPage()
		new_page.add_obj(node, root = True)
		all_pages.append(new_page)
		last_page = new_page
	else:
		if last_page is not None:
			last_page.add_obj(node)

print len(all_nodes)
print len(all_pages)

all_urls = [i.root.url for i in all_pages]
ofile = open(detected_pageurl, 'wb')
ofile.write('\n'.join(all_urls))
ofile.close()

page_gt = input_file.split('.')[0]+'.page'
cmd = 'python tools/check_urls.py "{0}" "{1}"'.format(detected_pageurl, page_gt)
f = Popen(cmd, shell=True, stdout=PIPE).stdout
for line in f:
    log_h.log(line.strip(" \r\n"))

log_h.close()
Example #2
0
###### read HTTP log
print 'Reading log...'
all_lines = logbasic.read(input_file)

print 'Processing rrp...'
# One node per log line; nodes are then ordered by start time.
all_nodes = []
for line in all_lines:
	all_nodes.append(logbasic.NodeFromLog(line))
# Python 2 sort(cmp, key, reverse) called positionally: the key extracts
# start_time and the cmp function compares the extracted keys, so this
# sorts ascending by start_time.
all_nodes.sort(lambda x,y: cmp(x,y), lambda x: x.start_time, False)

print len(all_nodes)

# Sweep the page-boundary time threshold t over 0.2s .. 19.8s in 0.2s steps.
T = [i/10.0 for i in range(2, 200, 2)]
for t in T:
	log_h.log('########################\n')
	log_h.log(str(t))
	all_pages = []
	last_page = None
	last_node = None
	for node in all_nodes:
		# The first root node seen opens the first page.
		if last_page is None and node.is_root():
			new_page = WebPage()
			all_pages.append(new_page)
			new_page.add_obj(node, root = True)
			last_page = new_page
		else:
			# A later root node opens a new page only when it starts at
			# least t seconds after the previous node.
			# NOTE(review): this excerpt is truncated mid-branch here.
			if node.is_root() and \
node.start_time - last_node.start_time >= datetime.timedelta(seconds=t):
				new_page = WebPage()
				all_pages.append(new_page)
Example #3
0
print 'Reading log...'
all_lines = logbasic.read(input_file)

print 'Processing rrp...'
# Build one node per log line, then order them by start time.
all_nodes = []
for line in all_lines:
    all_nodes.append(logbasic.NodeFromLog(line))
# Python 2 sort(cmp, key, reverse) called positionally: the key extracts
# start_time and the cmp function compares the extracted keys, so this
# sorts ascending by start_time.
all_nodes.sort(lambda x, y: cmp(x, y), lambda x: x.start_time, False)

print len(all_nodes)

# Candidate page-gap thresholds: 0.2s .. 20.0s in 0.2s steps.
T = [i / 10.0 for i in range(2, 202, 2)]

for t in T:
    log_h.log('########################\n')
    all_pages = []
    last_page = None
    last_node = None
    for node in all_nodes:
        if last_page is None:
            # The very first node always opens the first page (no root
            # check in this variant).
            new_page = WebPage()
            all_pages.append(new_page)
            new_page.add_obj(node, root=True)
            last_page = new_page
        else:
            # A gap of at least t seconds since the previous node opens
            # a new page. NOTE(review): excerpt is truncated here.
            if node.start_time - last_node.start_time >= datetime.timedelta(
                    seconds=t):
                new_page = WebPage()
                all_pages.append(new_page)
                new_page.add_obj(node, root=True)
Example #4
0
    # (excerpt begins mid-loop: one parsed node is appended per log line)
    all_nodes.append(logbasic.NodeFromLog(line))
# Python 2 sort(cmp, key, reverse) called positionally: the key extracts
# start_time and the cmp function compares the extracted keys, so this
# sorts ascending by start_time.
all_nodes.sort(lambda x, y: cmp(x, y), lambda x: x.start_time, False)

# Group nodes into pages: every root node opens a new page, and every
# non-root node attaches to the most recently opened page (if any).
all_pages = []
last_page = None
for node in all_nodes:
    if node.is_root():
        new_page = WebPage()
        new_page.add_obj(node, root=True)
        all_pages.append(new_page)
        last_page = new_page
    else:
        if last_page is not None:
            last_page.add_obj(node)

print len(all_nodes)
print len(all_pages)

# Write the detected page root URLs, one per line.
all_urls = [i.root.url for i in all_pages]
ofile = open(detected_pageurl, 'wb')
ofile.write('\n'.join(all_urls))
ofile.close()

# Run the URL checker against the ground-truth .page file and forward its
# output to the log.
# NOTE(review): shell=True with interpolated file names is injection-prone;
# a list-form Popen without a shell would be safer.
page_gt = input_file.split('.')[0] + '.page'
cmd = 'python tools/check_urls.py "{0}" "{1}"'.format(detected_pageurl,
                                                      page_gt)
f = Popen(cmd, shell=True, stdout=PIPE).stdout
for line in f:
    log_h.log(line.strip(" \r\n"))

log_h.close()