Example #1
0
    def add_node(self, new_node):
        """Link `new_node` into the referrer trees of its subgraph.

        The subgraph is selected (or created) by the node's source IP.
        Within the subgraph, trees are indexed by user agent
        (``ua_trees_d``: user-agent -> list of trees).  The node is
        attached under the most recent tree whose last node is within a
        15-minute session window and which contains a URL matching the
        node's referrer.  A node that cannot be linked starts a new tree
        if it is root-like with HTTP status 200; otherwise it is parked
        in ``self.junk_nodes``.
        """
        # A request/response pair without a source IP cannot be grouped.
        if new_node.user_ip is None:
            print('Source IP is lost in request/response pair.')
            exit(-1)

        subgraph = self.get_subgraph(new_node.user_ip)
        if subgraph is None:
            subgraph = SubGraph(new_node.user_ip)
            self.subgraphs.append(subgraph)

        try:
            # Start linking: only nodes carrying a user agent are linkable.
            if new_node.user_agent is not None:
                if new_node.user_agent in subgraph.ua_trees_d:
                    linked_flag = False
                    # Prefer the most recently created trees first.
                    for tree in subgraph.ua_trees_d[new_node.user_agent][::-1]:
                        # Session idle time of 15 minutes used
                        if new_node.start_time - tree.nodes[
                                -1].start_time <= datetime.timedelta(
                                    minutes=15):
                            # Find its predecessor: the latest node in the
                            # tree whose URL matches the new node's referrer.
                            pred_id = None
                            if new_node.referrer:
                                for item in tree.nodes[::-1]:
                                    if utilities.cmp_url(
                                            new_node.referrer, item.url,
                                            'strict'):
                                        pred_id = item.identifier
                                        break
                            if pred_id is not None:
                                # Predecessor found...
                                tree.add_node(new_node, pred_id)
                                linked_flag = True
                                break
                    # After all the trees are checked:
                    if not linked_flag:
                        raise NewTreeNeeded
                else:
                    # New user agent index and new tree needed.
                    raise NewTreeNeeded

        except NewTreeNeeded:
            # Only a successful (200) root-like request may start a tree.
            if new_node.is_root():
                if int(new_node.status) == 200:
                    new_tree = mod_tree.Tree()
                    new_tree.add_node(new_node, parent=None)
                    # Update the graph.  setdefault() creates the index
                    # entry for a first-seen user agent, instead of the
                    # old bare except that swallowed every error here.
                    subgraph.ua_trees_d.setdefault(
                        new_node.user_agent, []).append(new_tree)
            else:
                self.junk_nodes.append(new_node)
Example #2
0
	def add_node(self, new_node):
		""" Link `new_node` into the referrer trees of its subgraph.

			The subgraph is selected (or created) by the node's source IP.
			Trees are indexed by user agent (``ua_trees_d``: user-agent ->
			list of trees).  The node is attached under the most recent
			tree whose last node is within a 15-minute session window and
			which contains a URL matching the node's referrer.  Nodes that
			cannot be linked either start a new tree (root-like, HTTP 200)
			or are parked in ``self.junk_nodes``.
		"""
		# A request/response pair without a source IP cannot be grouped.
		if new_node.user_ip is None:
			print('Source IP is lost in request/response pair.')
			exit(-1)

		subgraph = self.get_subgraph(new_node.user_ip)
		if subgraph is None:
			subgraph = SubGraph(new_node.user_ip)
			self.subgraphs.append(subgraph)

		try:
			# Start linking: only nodes carrying a user agent are linkable.
			if new_node.user_agent is not None:
				if new_node.user_agent in subgraph.ua_trees_d:
					linked_flag = False
					# Prefer the most recently created trees first.
					for tree in subgraph.ua_trees_d[new_node.user_agent][::-1]:
						# Session idle time of 15 minutes used
						if new_node.start_time - tree.nodes[-1].start_time <= datetime.timedelta(minutes=15):
							# Find its predecessor: the latest node in the
							# tree whose URL matches the new node's referrer.
							pred_id = None
							if new_node.referrer:
								for item in tree.nodes[::-1]:
									if utilities.cmp_url(new_node.referrer, item.url, 'strict'):
										pred_id = item.identifier
										break
							if pred_id is not None:
								# Predecessor found...
								tree.add_node(new_node, pred_id)
								linked_flag = True
								break
					# After all the trees are checked:
					if not linked_flag:
						raise NewTreeNeeded
				else:
					# New user agent index and new tree needed.
					raise NewTreeNeeded

		except NewTreeNeeded:
			# Only a successful (200) root-like request may start a tree.
			if new_node.is_root():
				if int(new_node.status) == 200:
					new_tree = mod_tree.Tree()
					new_tree.add_node(new_node, parent=None)
					# Update the graph.  setdefault() creates the index
					# entry for a first-seen user agent, instead of the
					# old bare except that swallowed every error here.
					subgraph.ua_trees_d.setdefault(new_node.user_agent, []).append(new_tree)
			else:
				self.junk_nodes.append(new_node)
Example #3
0
def parse_pages_har(harfolder):
    """Parse every ``.har`` file under `harfolder` into web pages.

    Walks the folder tree, loads each HAR log, builds referrer trees from
    the entries, and keeps the first multi-node tree of each file as the
    real page (with foreknowledge that the first page is the real one).

    Returns a tuple ``(all_real_pages, all_objects)`` where
    ``all_real_pages`` is a list of WebPage instances and ``all_objects``
    is the time-ordered list of all extracted nodes.
    """
    print('Processing har files...')
    # Processing all HAR files under the folder
    all_real_pages = []
    all_objects = []
    for root, dirs, files in os.walk(harfolder):
        for file in files:
            # Only process *.har files.  The old rsplit('.', 1)[1]
            # raised IndexError on filenames without a dot.
            if not file.endswith('.har'):
                continue

            inputfile = os.path.join(root, file)
            # Open HAR file; always release the handle, even if the
            # JSON is malformed.
            fp = codecs.open(inputfile, 'rb', 'utf-8')
            try:
                har_log = json.load(fp)['log']
            finally:
                fp.close()
            har_pages = har_log['pages']  # validates presence of 'pages'
            har_objects = har_log['entries']

            # Extract web objects and order them in time.  key-based sort
            # is equivalent to the old cmp-based call (stable, ascending).
            allnodes = [NodeFromHar(i) for i in har_objects]
            allnodes.sort(key=lambda node: node.start_time)

            all_objects += allnodes

            # Find valid trees from raw web objects
            trees = []
            # Nodes that can't find a referrer and are not root-like.
            junk_nodes = []
            tot = 0
            for new_node in allnodes:
                tot += 1

                try:
                    # Start linking
                    linked_flag = False
                    for tree in trees:
                        # Latest node whose URL matches the referrer.
                        pred_id = None
                        if new_node.referrer:
                            for item in tree.nodes[::-1]:
                                if utilities.cmp_url(new_node.referrer,
                                                     item.url, 'strict'):
                                    pred_id = item.identifier
                                    break
                        if pred_id:
                            # Predecessor found...
                            tree.add_node(new_node, pred_id)
                            linked_flag = True
                            break
                    # After all the trees are checked:
                    if not linked_flag:
                        raise NewTreeNeeded

                except NewTreeNeeded:
                    # Only successful (200) root-like requests start trees.
                    if new_node.is_root():
                        if new_node.status == 200:
                            new_tree = mod_tree.Tree()  # new tree
                            new_tree.add_node(new_node, None)
                            linked_flag = True
                            trees.append(new_tree)
                    else:
                        junk_nodes.append(new_node)

            # Sort trees by their root's start time (ascending, stable —
            # same ordering as the old cmp-based sort).
            trees.sort(key=lambda t: t[t.root].start_time)

            # little trick: treat a tree with one node as invalid
            # and recycle its nodes into 'junk_nodes'
            valid_trees = []
            for tree in trees:
                if len(tree.nodes) > 1:
                    valid_trees.append(tree)
                else:
                    junk_nodes += tree.nodes

            # Find real page(s) from valid trees: one tree -> one page.
            real_pages = []
            for tree in valid_trees:
                new_page = WebPage()
                new_page.root = tree[tree.root]
                new_page.objs = tree.nodes
                real_pages.append(new_page)

            # Optional: attach each junk object to the nearest earlier
            # page of 'real_pages'; count the unattachable ones.
            junk2 = 0
            for node in junk_nodes:
                found_flag = False
                for page in real_pages[::-1]:
                    if page.root.start_time < node.start_time:
                        found_flag = True
                        break
                if found_flag:
                    page.objs.append(node)
                else:
                    junk2 += 1

            all_real_pages += real_pages[0:1]
            # little trick: with foreknowledge, the first page is the real
            # page, so keep the first one and drop the others as invalid.

    return all_real_pages, all_objects
Example #4
0
def parse_pages_har(harfolder):
	"""Parse every ``.har`` file under `harfolder` into web pages.

	Walks the folder tree, loads each HAR log, builds referrer trees
	from the entries, and keeps the first multi-node tree of each file
	as the real page (with foreknowledge that the first page is real).

	Returns a tuple ``(all_real_pages, all_objects)``: the WebPage list
	and the time-ordered list of all extracted nodes.
	"""
	print('Processing har files...')
	# Processing all HAR files under the folder
	all_real_pages = []
	all_objects = []
	for root, dirs, files in os.walk(harfolder):
		for file in files:
			# Only process *.har files.  The old rsplit('.', 1)[1]
			# raised IndexError on filenames without a dot.
			if not file.endswith('.har'):
				continue

			inputfile = os.path.join(root, file)
			# Open HAR file; always release the handle, even if the
			# JSON is malformed.
			fp = codecs.open(inputfile, 'rb', 'utf-8')
			try:
				har_log = json.load(fp)['log']
			finally:
				fp.close()
			har_pages = har_log['pages']  # validates presence of 'pages'
			har_objects = har_log['entries']

			# Extract web objects and order them in time.  key-based sort
			# is equivalent to the old cmp-based call (stable, ascending).
			allnodes = [NodeFromHar(i) for i in har_objects]
			allnodes.sort(key=lambda node: node.start_time)

			all_objects += allnodes

			# Find valid trees from raw web objects
			trees = []
			# Nodes that can't find a referrer and are not root-like.
			junk_nodes = []
			tot = 0
			for new_node in allnodes:
				tot += 1

				try:
					# Start linking
					linked_flag = False
					for tree in trees:
						# Latest node whose URL matches the referrer.
						pred_id = None
						if new_node.referrer:
							for item in tree.nodes[::-1]:
								if utilities.cmp_url(new_node.referrer, item.url, 'strict'):
									pred_id = item.identifier
									break
						if pred_id:
							# Predecessor found...
							tree.add_node(new_node, pred_id)
							linked_flag = True
							break
					# After all the trees are checked:
					if not linked_flag:
						raise NewTreeNeeded

				except NewTreeNeeded:
					# Only successful (200) root-like requests start trees.
					if new_node.is_root():
						if new_node.status == 200:
							new_tree = mod_tree.Tree()		# new tree
							new_tree.add_node(new_node, None)
							linked_flag = True
							trees.append(new_tree)
					else:
						junk_nodes.append(new_node)

			# Sort trees by their root's start time (ascending, stable —
			# same ordering as the old cmp-based sort).
			trees.sort(key=lambda t: t[t.root].start_time)

			# little trick: treat a tree with one node as invalid
			# and recycle its nodes into 'junk_nodes'
			valid_trees = []
			for tree in trees:
				if len(tree.nodes) > 1:
					valid_trees.append(tree)
				else:
					junk_nodes += tree.nodes

			# Find real page(s) from valid trees: one tree -> one page.
			real_pages = []
			for tree in valid_trees:
				new_page = WebPage()
				new_page.root = tree[tree.root]
				new_page.objs = tree.nodes
				real_pages.append(new_page)

			# Optional: attach each junk object to the nearest earlier
			# page of 'real_pages'; count the unattachable ones.
			junk2 = 0
			for node in junk_nodes:
				found_flag = False
				for page in real_pages[::-1]:
					if page.root.start_time < node.start_time:
						found_flag = True
						break
				if found_flag:
					page.objs.append(node)
				else:
					junk2 += 1

			all_real_pages += real_pages[0:1]
			# little trick: with foreknowledge, the first page is the real
			# page, so keep the first one and drop the others as invalid.

	return all_real_pages, all_objects