Example 1
import codecs
import json
import os

# NodeFromHar, WebPage, mod_tree, utilities and the NewTreeNeeded exception
# are project-specific helpers assumed to be provided by the surrounding code.


def parse_pages_har(harfolder):
    print 'Processing har files...'
    # Process every HAR file under the folder
    all_real_pages = []
    all_objects = []
    for root, dirs, files in os.walk(harfolder):
        for file in files:
            if not file.endswith('.har'):
                continue

            inputfile = os.path.join(root, file)
            # Open the HAR file and pull out its 'log' section
            with codecs.open(inputfile, 'rb', 'utf-8') as har_file:
                har_log = json.load(har_file)['log']
            har_pages = har_log['pages']
            har_objects = har_log['entries']

            # Extract web objects and order them in time
            allnodes = []
            for i in har_objects:
                new_node = NodeFromHar(i)  # new node
                allnodes.append(new_node)
            allnodes.sort(key=lambda x: x.start_time)

            all_objects += allnodes

            # Find valid trees from raw web objects
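            # Linking rule: trees are scanned in creation order and, within
            # each tree, nodes newest-first; an object is attached under the
            # first node whose URL matches the object's referrer.  Objects
            # with no match either start a new tree (root-type objects with
            # status 200) or are collected in 'junk_nodes'.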
            trees = []
            junk_nodes = []  # nodes whose referrer cannot be found and which are not root-type
            tot = 0
            for new_node in allnodes:
                tot += 1

                try:
                    # Start linking
                    linked_flag = False
                    for tree in trees:
                        pred_id = None
                        if new_node.referrer:
                            for item in tree.nodes[::-1]:
                                if utilities.cmp_url(new_node.referrer,
                                                     item.url, 'strict'):
                                    pred_id = item.identifier
                                    break
                        if pred_id:
                            # Predecessor found...
                            tree.add_node(new_node, pred_id)
                            linked_flag = True
                            break
                    # After all trees have been checked:
                    if not linked_flag:
                        raise NewTreeNeeded

                except NewTreeNeeded:
                    if new_node.is_root():
                        if new_node.status == 200:
                            new_tree = mod_tree.Tree()  # new tree
                            new_tree.add_node(new_node, None)
                            linked_flag = True
                            trees.append(new_tree)
                    else:
                        junk_nodes.append(new_node)

            # Sort trees in the order of ascending time
            trees.sort(key=lambda t: t[t.root].start_time)

            # little trick: treat a tree with only one node as invalid
            # and add its nodes to 'junk_nodes'
            valid_trees = []
            for tree in trees:
                if len(tree.nodes) > 1:
                    valid_trees.append(tree)
                else:
                    junk_nodes += tree.nodes

            #log('{0} {1} {2}'.format(tot, len(junk_nodes), input))

            # find real page(s) from valid trees.
            real_pages = []
            for tree in valid_trees:
                # one tree -> one page
                new_page = WebPage()
                new_page.root = tree[tree.root]
                new_page.objs = tree.nodes
                real_pages.append(new_page)

            # Optional: process junk web objects by attaching each one
            # to the nearest earlier page in 'real_pages'
            junk2 = 0
            for node in junk_nodes:
                for page in real_pages[::-1]:
                    # attach the object to the most recent page that started before it
                    if page.root.start_time < node.start_time:
                        page.objs.append(node)
                        break
                else:
                    junk2 += 1

            all_real_pages += real_pages[0:1]
            # little trick: with foreknowledge, the first page is the real one,
            # so we keep it and drop the others as invalid.

    return all_real_pages, all_objects
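
A minimal driver sketch (not part of the original example), assuming parse_pages_har and its project-specific helpers are importable from one module; the folder name 'har_samples' is purely illustrative.

# Hypothetical usage (Python 2):
pages, objects = parse_pages_har('har_samples')
print '#pages:', len(pages), '#objects:', len(objects)
for page in pages:
    # each WebPage exposes its root object and the objects grouped under it
    print page.root.url, len(page.objs)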
Example 2
def parse_pages_svm(all_nodes, valid_urls):

    print '#Total nodes:', len(all_nodes)
    print '#Valid urls:', len(valid_urls)

    all_nodes.sort(key=lambda x: x.start_time)

    ###### construct link trees
    print 'Building referrer trees...'
    new_graph = Graph()
    for node in all_nodes:
        new_graph.add_node(node)
    trees = new_graph.all_trees()
    junk_nodes = new_graph.junk_nodes

    # little trick: treat a tree with only one node as invalid
    # and add its nodes to 'junk_nodes'
    valid_trees = []
    for tree in trees:
        if len(tree.nodes) > 1:
            valid_trees.append(tree)
        else:
            junk_nodes += tree.nodes

    print('#Valid trees: {0}\n#Junk_nodes: {1}'.format(len(valid_trees),
                                                       len(junk_nodes)))

    ###### parse page candidates
    print 'Constructing page-level objects...'
    all_pages = []
    for tree in valid_trees:
        ###### Detect valid HTML elements as Main Object Candidates (MOCs)
        mocs = []
        for node in tree.expand_tree(mode=_WIDTH):  # must be _WIDTH
            if tree[node].is_root() and int(tree[node].status) == 200:
                mocs.append(node)

        tmp = []
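        # Prune candidates bottom-up: the tree root is always kept, and any
        # other candidate is kept only if the subtree it would claim
        # (excluding subtrees of candidates already kept) has more than one
        # node.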
        for moc in mocs[::-1]:
            bp = tree[moc].bpointer
            if bp is None:
                tmp.append(moc)
            else:
                valid_nodes = 0
                for i in tree.expand_tree(
                        moc, filter=lambda x: x == moc or x not in tmp):
                    valid_nodes += 1
                # little trick: do not split off a subtree with only one node
                if valid_nodes > 1:
                    tmp.append(moc)
        mocs = tmp

        ###### parse pages
        # for rootid in mocs[:]:
        # 	new_page = WebPage()
        # 	all_pages.append(new_page)
        # 	for nodeid in tree.expand_tree(rootid, filter = lambda x: x==rootid or x not in mocs):
        # 		if nodeid == rootid:
        # 			new_page.add_obj(tree[nodeid], root=True)
        # 		else:
        # 			new_page.add_obj(tree[nodeid])
        # 	if utilities.search_url(new_page.root.url, valid_urls) is True:
        # 		new_page.isvalid = True
        # 	if tree[rootid].bpointer is not None:
        # 		new_page.ref = tree[tree[rootid].bpointer]

        ###### parse pages as described in the paper
        # keep only the candidates whose URL appears in the valid URL list
        vurl_arr = [i[0] for i in valid_urls]
        real = []
        for moc in mocs:
            if utilities.search_url(tree[moc].url, vurl_arr) is True:
                real.append(moc)

        for rootid in mocs[:]:
            new_page = WebPage()
            all_pages.append(new_page)
            for nodeid in tree.expand_tree(
                    rootid, filter=lambda x: x == rootid or x not in real):
                if nodeid == rootid:
                    new_page.add_obj(tree[nodeid], root=True)
                else:
                    new_page.add_obj(tree[nodeid])
            if new_page.root.identifier in real:
                new_page.isvalid = True
            if tree[rootid].bpointer is not None:
                new_page.ref = tree[tree[rootid].bpointer]

    all_pages.sort(key=lambda x: x.root.start_time)
    print('#Page-level objs: %d' % len(all_pages))

    return valid_trees, all_pages, junk_nodes
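
A sketch of how the two examples might be chained (hypothetical, not taken from the original source): parse_pages_har from Example 1 supplies the flat object list, and valid_urls only needs items whose first element is a URL, matching the i[0] indexing above.

# Hypothetical driver (Python 2); the folder name and label values are illustrative.
real_pages, objects = parse_pages_har('har_samples')
valid_urls = [(page.root.url, 1) for page in real_pages]  # assumed (url, label) shape
trees, pages, junk = parse_pages_svm(objects, valid_urls)
print '#trees:', len(trees), '#pages:', len(pages), '#junk objects:', len(junk)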