import datetime
from subprocess import Popen, PIPE

###### read HTTP log
print 'Reading log...'
all_lines = logbasic.read(input_file)

print 'Processing rrp...'
all_nodes = []
for line in all_lines:
    all_nodes.append(logbasic.NodeFromLog(line))
# Sort requests chronologically by start time.
all_nodes.sort(key=lambda x: x.start_time)

###### page detection: every root node starts a new page
all_pages = []
last_page = None
for node in all_nodes:
    if node.is_root():
        # A root node opens a new page; later non-root nodes attach to it.
        new_page = WebPage()
        new_page.add_obj(node, root=True)
        all_pages.append(new_page)
        last_page = new_page
    elif last_page is not None:
        last_page.add_obj(node)

print len(all_nodes)
print len(all_pages)

# Write out the detected page URLs and score them against the ground truth.
all_urls = [i.root.url for i in all_pages]
ofile = open(detected_pageurl, 'wb')
ofile.write('\n'.join(all_urls))
ofile.close()

page_gt = input_file.split('.')[0] + '.page'
cmd = 'python tools/check_urls.py "{0}" "{1}"'.format(detected_pageurl, page_gt)
f = Popen(cmd, shell=True, stdout=PIPE).stdout
for line in f:
    log_h.log(line.strip(" \r\n"))
log_h.close()
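# The listings in this script lean on names defined elsewhere in the project
# (logbasic, WebPage, log_h, input_file, detected_pageurl) that are not shown
# here. The stubs below are a minimal sketch of the interfaces the code above
# appears to assume, inferred purely from usage -- the real definitions in
# logbasic and the surrounding script may differ.
class Node(object):
    def __init__(self, url, start_time, is_root_flag):
        self.url = url                # request URL
        self.start_time = start_time  # datetime when the request started
        self._is_root = is_root_flag  # True if this request loads a page root

    def is_root(self):
        return self._is_root


class WebPage(object):
    def __init__(self):
        self.root = None   # the root Node of the page
        self.objs = []     # every Node assigned to the page

    def add_obj(self, node, root=False):
        if root:
            self.root = node
        self.objs.append(node)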
###### read HTTP log
print 'Reading log...'
all_lines = logbasic.read(input_file)

print 'Processing rrp...'
all_nodes = []
for line in all_lines:
    all_nodes.append(logbasic.NodeFromLog(line))
all_nodes.sort(key=lambda x: x.start_time)
print len(all_nodes)

###### page detection: a root node after an idle gap of at least t seconds starts a new page
# Sweep the gap threshold t from 0.2s to 19.8s in steps of 0.2s.
T = [i / 10.0 for i in range(2, 200, 2)]
for t in T:
    log_h.log('########################\n')
    log_h.log(str(t))
    all_pages = []
    last_page = None
    last_node = None
    for node in all_nodes:
        if last_page is None and node.is_root():
            # The first root node opens the first page.
            new_page = WebPage()
            all_pages.append(new_page)
            new_page.add_obj(node, root=True)
            last_page = new_page
        else:
            if node.is_root() and \
                    node.start_time - last_node.start_time >= datetime.timedelta(seconds=t):
                # A root node far enough from the previous request opens a new page.
                new_page = WebPage()
                all_pages.append(new_page)
                new_page.add_obj(node, root=True)
                last_page = new_page
            elif last_page is not None:
                # Everything else attaches to the current page.
                last_page.add_obj(node)
        last_node = node
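# As a quick sanity check of the gap rule, here is a toy timeline built on the
# hypothetical Node stub sketched earlier (made-up URLs and timestamps): with
# t = 0.5 the 2.9-second pause before the second root exceeds the threshold,
# so the combined heuristic (root node AND long idle gap) starts a second page.
t0 = datetime.datetime(2024, 1, 1, 12, 0, 0)
toy = [
    Node('http://a.example/', t0, True),
    Node('http://a.example/logo.png', t0 + datetime.timedelta(seconds=0.1), False),
    Node('http://b.example/', t0 + datetime.timedelta(seconds=3.0), True),
]
gap = toy[2].start_time - toy[1].start_time
print gap >= datetime.timedelta(seconds=0.5)  # True -> the gap rule fires here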
print 'Reading log...'
all_lines = logbasic.read(input_file)

print 'Processing rrp...'
all_nodes = []
for line in all_lines:
    all_nodes.append(logbasic.NodeFromLog(line))
all_nodes.sort(key=lambda x: x.start_time)
print len(all_nodes)

###### page detection: any idle gap of at least t seconds starts a new page, root or not
# Sweep the gap threshold t from 0.2s to 20.0s in steps of 0.2s.
T = [i / 10.0 for i in range(2, 202, 2)]
for t in T:
    log_h.log('########################\n')
    all_pages = []
    last_page = None
    last_node = None
    for node in all_nodes:
        if last_page is None:
            # The very first node opens the first page.
            new_page = WebPage()
            all_pages.append(new_page)
            new_page.add_obj(node, root=True)
            last_page = new_page
        else:
            if node.start_time - last_node.start_time >= datetime.timedelta(seconds=t):
                # A long enough idle gap opens a new page at this node.
                new_page = WebPage()
                all_pages.append(new_page)
                new_page.add_obj(node, root=True)
                last_page = new_page
            else:
                last_page.add_obj(node)
        last_node = node
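# Both sweep listings log a separator per threshold but stop short of scoring
# the result. Presumably each threshold's detected pages are written out and
# checked the same way as in the first listing; the sketch below mirrors that
# evaluation step inside the sweep. The evaluate() helper and the
# per-threshold output filename are assumptions, not part of the original.
def evaluate(all_pages, t):
    out_path = '{0}.t{1}'.format(detected_pageurl, t)  # hypothetical naming
    ofile = open(out_path, 'wb')
    ofile.write('\n'.join([p.root.url for p in all_pages]))
    ofile.close()
    page_gt = input_file.split('.')[0] + '.page'
    cmd = 'python tools/check_urls.py "{0}" "{1}"'.format(out_path, page_gt)
    f = Popen(cmd, shell=True, stdout=PIPE).stdout
    for line in f:
        log_h.log(line.strip(" \r\n"))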