Example #1
def process_log(logfile):
    ###### preprocess log
    print "Processing HTTP logs..."
    all_lines = basic.read(logfile)
    all_nodes = []
    for line in all_lines:
        all_nodes.append(basic.NodeFromLog(line))
    all_nodes.sort(key=lambda x: x.start_time)
    return all_nodes
Example #2
def process_log(logfile):
    ###### preprocess log
    print('Processing HTTP logs...')
    all_lines = basic.read(logfile)
    all_nodes = []
    for line in all_lines:
        all_nodes.append(basic.NodeFromLog(line))
    all_nodes.sort(key=lambda x: x.start_time)
    return all_nodes
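
The two snippets above assume a project-local 'basic' module that turns each log line (one request/response pair per line) into a node object exposing a start_time attribute. As a rough, self-contained illustration only, here is a hypothetical stand-in for that module; the line layout and field names are assumptions, not the project's actual schema.

# Hypothetical stand-in for the project-local 'basic' module, useful only for
# exercising process_log in isolation; the "<start_time> <url>" line layout
# is an assumption.
from collections import namedtuple

LogNode = namedtuple('LogNode', ['start_time', 'url'])

class basic:
    @staticmethod
    def read(logfile):
        # return the non-empty lines of the log file
        with open(logfile) as f:
            return [line.rstrip('\n') for line in f if line.strip()]

    @staticmethod
    def NodeFromLog(line):
        # assumed layout: "<start_time> <url>"
        ts, url = line.split(None, 1)
        return LogNode(start_time=float(ts), url=url)

# With such a stub in place, the processing step boils down to:
nodes = [basic.NodeFromLog(l) for l in basic.read('access.log')]
nodes.sort(key=lambda x: x.start_time)  # oldest request first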
Example #3
import argparse
import sys

from lib.myWeb import WebPage, WebObject
from lib.utilities import Logger

parser = argparse.ArgumentParser(description='Page reconstruction from weblog using a type-based approach.')
parser.add_argument('logfile', type=str, help='log file containing the request/response pair')
args = parser.parse_args()
input_file = args.logfile
detected_pageurl = input_file + '.page.tmp'

###### logging
this_log = './log/' + sys.argv[0].replace('.', '_') + '.log'
log_h = Logger(this_log)
print('log file: %s' % this_log)

print('Reading log...')
all_lines = logbasic.read(input_file)

print('Processing rrp...')
all_nodes = []
for line in all_lines:
    all_nodes.append(logbasic.NodeFromLog(line))
all_nodes.sort(key=lambda x: x.start_time)

all_pages = []
last_page = None
for node in all_nodes:
    if node.is_root():
        new_page = WebPage()
        new_page.add_obj(node, root=True)
        all_pages.append(new_page)
        last_page = new_page
Example #4
parser.add_argument('logfile',
                    type=str,
                    help='log file containing the request/response pair')
args = parser.parse_args()
input_file = args.logfile
detected_pageurl = input_file + '.page.tmp'

print('detected pages: %s' % detected_pageurl)

###### logging
this_log = './log/' + sys.argv[0].replace('.', '_') + '.log'
log_h = Logger(this_log)
print('log file: %s' % this_log)

print('Reading log...')
all_lines = logbasic.read(input_file)

print('Processing rrp...')
all_nodes = []
for line in all_lines:
    all_nodes.append(logbasic.NodeFromLog(line))
all_nodes.sort(key=lambda x: x.start_time)

print(len(all_nodes))

T = [i / 10.0 for i in range(2, 202, 2)]

for t in T:
    log_h.log('########################\n')
    all_pages = []
    last_page = None
Example #5
def main():
    parser = argparse.ArgumentParser(description='Page reconstruction from weblog using the StreamStructure algorithm proposed by S. Ihm at IMC 2011.')
    parser.add_argument('-k', type=int, default=2, help='K parameter')
    parser.add_argument('-t', type=int, default=5, help='T parameter')
    parser.add_argument('logfile', type=str, help='log file containing the request/response pair')
    args = parser.parse_args()
    log_file = args.logfile
    detected_pageurl = log_file + '.page.tmp'
    K = [args.k]
    T = [args.t]

    print('Reading log...')
    all_lines = basic.read(log_file)

    print('Processing rrp...')
    all_nodes = []
    for line in all_lines:
        all_nodes.append(basic.NodeFromLog(line))
    all_nodes.sort(key=lambda x: x.start_time)

    ###### construct trees

    print('Creating graph...')
    new_graph = Graph()
    for node in all_nodes:
        new_graph.add_node(node)
    trees = new_graph.all_trees()
    junk_nodes = new_graph.junk_nodes
    # little trick: treat a tree with a single node
    # as invalid and add its nodes to 'junk_nodes'
    valid_trees = []
    for tree in trees:
        if len(tree.nodes) > 1:
            valid_trees.append(tree)
        else:
            junk_nodes += tree.nodes

    print('valid trees: {0}, junk_nodes: {1}'.format(len(valid_trees), len(junk_nodes)))

    ###### cut pages
    K = [1]
    T = [i / 10.0 for i in range(2, 200, 2)]

    for k in K:
        for t in T:
            log('#############')
            log('K = %d, T = %.2f' % (k, t))

            all_pages = []
            for tree in valid_trees:
                all_pages += process_tree(tree, k, t)

            log('Pages:%d' % len(all_pages))

            all_urls = [i.root.url for i in all_pages]
            ofile = open(detected_pageurl, 'w')
            ofile.write('\n'.join(all_urls))
            ofile.close()

            page_gt = log_file.split('.')[0] + '.page'
            cmd = 'python tools/check_urls.py "{0}" "{1}"'.format(detected_pageurl, page_gt)
            f = Popen(cmd, shell=True, stdout=PIPE, universal_newlines=True).stdout
            for line in f:
                log(line.strip(" \r\n"))
Example #6
def main():
    parser = argparse.ArgumentParser(
        description='Page reconstruction from weblog using the StreamStructure '
        'algorithm proposed by S. Ihm at IMC 2011.')
    parser.add_argument('-k', type=int, default=2, help='K parameter')
    parser.add_argument('-t', type=int, default=5, help='T parameter')
    parser.add_argument('logfile',
                        type=str,
                        help='log file containing the request/response pair')
    args = parser.parse_args()
    log_file = args.logfile
    detected_pageurl = log_file + '.page.tmp'
    K = [args.k]
    T = [args.t]

    print('Reading log...')
    all_lines = basic.read(log_file)

    print('Processing rrp...')
    all_nodes = []
    for line in all_lines:
        all_nodes.append(basic.NodeFromLog(line))
    all_nodes.sort(key=lambda x: x.start_time)

    ###### construct trees

    print('Creating graph...')
    new_graph = Graph()
    for node in all_nodes:
        new_graph.add_node(node)
    trees = new_graph.all_trees()
    junk_nodes = new_graph.junk_nodes
    # little trick: treat a tree with a single node
    # as invalid and add its nodes to 'junk_nodes'
    valid_trees = []
    for tree in trees:
        if len(tree.nodes) > 1:
            valid_trees.append(tree)
        else:
            junk_nodes += tree.nodes

    print('valid trees: {0}, junk_nodes: {1}'.format(len(valid_trees),
                                                     len(junk_nodes)))

    ###### cut pages
    K = [1]
    T = [i / 10.0 for i in range(2, 200, 2)]

    for k in K:
        for t in T:
            log('#############')
            log('K = %d, T = %.2f' % (k, t))

            all_pages = []
            for tree in valid_trees:
                all_pages += process_tree(tree, k, t)

            log('Pages:%d' % len(all_pages))

            all_urls = [i.root.url for i in all_pages]
            ofile = open(detected_pageurl, 'w')
            ofile.write('\n'.join(all_urls))
            ofile.close()

            page_gt = log_file.split('.')[0] + '.page'
            cmd = 'python tools/check_urls.py "{0}" "{1}"'.format(
                detected_pageurl, page_gt)
            f = Popen(cmd, shell=True, stdout=PIPE, universal_newlines=True).stdout
            for line in f:
                log(line.strip(" \r\n"))
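
Examples #5 and #6 evaluate each (K, T) setting by writing the detected page URLs to a temporary file and shelling out to an external tools/check_urls.py script, whose exact behavior is not shown here. If that script simply compares the detected URL list against the ground-truth '.page' file, an equivalent in-process check could look like the sketch below; the set-based precision/recall reported here is an assumption about what check_urls.py computes, not a description of it.

# Hedged sketch: compare detected page URLs against a ground-truth file,
# assuming one URL per line in both files.
def compare_urls(detected_path, ground_truth_path):
    with open(detected_path) as f:
        detected = {line.strip() for line in f if line.strip()}
    with open(ground_truth_path) as f:
        truth = {line.strip() for line in f if line.strip()}
    hits = detected & truth
    precision = len(hits) / len(detected) if detected else 0.0
    recall = len(hits) / len(truth) if truth else 0.0
    return precision, recall

# e.g. inside the K/T sweep, instead of spawning a subprocess:
# p, r = compare_urls(detected_pageurl, page_gt)
# log('precision=%.3f recall=%.3f' % (p, r))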