import codecs
import json
import os

# Project-local dependencies (defined elsewhere in this repository):
# utilities, mod_tree, Graph, NodeFromHar, WebPage, _WIDTH.

class NewTreeNeeded(Exception):
    """Raised when a node cannot be linked into any existing referrer tree."""
    pass


def parse_pages_har(harfolder):
    """Process every HAR file under 'harfolder' and extract the real pages."""
    print 'Processing har files...'
    all_real_pages = []
    all_objects = []
    for root, dirs, files in os.walk(harfolder):
        for file in files:
            if not file.endswith('.har'):
                continue
            inputfile = os.path.join(root, file)

            # Open the HAR file (a UTF-8 encoded JSON document)
            f = codecs.open(inputfile, 'rb', 'utf-8')
            try:
                har_log = json.load(f)['log']
            finally:
                f.close()
            har_pages = har_log['pages']  # HAR page records (unused; pages are rebuilt from entries)
            har_objects = har_log['entries']

            # Extract web objects and order them in time
            allnodes = [NodeFromHar(i) for i in har_objects]
            allnodes.sort(key=lambda x: x.start_time)
            all_objects += allnodes

            # Find valid trees from raw web objects
            trees = []
            junk_nodes = []  # nodes that have no referrer and are not roots
            for new_node in allnodes:
                try:
                    # Start linking: try to attach the node to an existing tree
                    linked_flag = False
                    for tree in trees:
                        pred_id = None
                        if new_node.referrer:
                            # Scan the most recent nodes first
                            for item in tree.nodes[::-1]:
                                if utilities.cmp_url(new_node.referrer, item.url, 'strict'):
                                    pred_id = item.identifier
                                    break
                        if pred_id:
                            # Predecessor found: attach under it
                            tree.add_node(new_node, pred_id)
                            linked_flag = True
                            break
                    # After all the trees are checked:
                    if not linked_flag:
                        raise NewTreeNeeded
                except NewTreeNeeded:
                    if new_node.is_root():
                        # Only a root with status 200 starts a new tree;
                        # non-200 roots are dropped
                        if new_node.status == 200:
                            new_tree = mod_tree.Tree()
                            new_tree.add_node(new_node, None)
                            trees.append(new_tree)
                    else:
                        junk_nodes.append(new_node)

            # Sort trees in order of ascending root start time
            trees.sort(key=lambda x: x[x.root].start_time)

            # Little trick: treat a one-node tree as invalid and
            # move its nodes to 'junk_nodes'
            valid_trees = []
            for tree in trees:
                if len(tree.nodes) > 1:
                    valid_trees.append(tree)
                else:
                    junk_nodes += tree.nodes

            # Find real page(s) from valid trees: one tree -> one page
            real_pages = []
            for tree in valid_trees:
                new_page = WebPage()
                new_page.root = tree[tree.root]
                new_page.objs = tree.nodes
                real_pages.append(new_page)

            # Optional: attach each junk object to the nearest
            # earlier page in 'real_pages'
            for node in junk_nodes:
                for page in real_pages[::-1]:
                    if page.root.start_time < node.start_time:
                        page.objs.append(node)
                        break

            # Little trick: with foreknowledge, the first page is the real
            # page, so keep it and drop the rest as invalid.
            all_real_pages += real_pages[0:1]
    return all_real_pages, all_objects
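# A minimal usage sketch for parse_pages_har. The folder path below is a
# hypothetical example, not part of the original module; the printed
# attributes ('.root.url', '.objs') follow the WebPage fields used above.
def demo_parse_har():
    pages, objects = parse_pages_har('./har_captures')  # hypothetical path
    print '#Real pages:', len(pages), '#Web objects:', len(objects)
    for page in pages:
        # Each WebPage carries its main object in .root and all objects in .objs
        print page.root.url, len(page.objs)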
def parse_pages_svm(all_nodes, valid_urls):
    """Group 'all_nodes' into page-level objects by building referrer trees."""
    print '#Total nodes:', len(all_nodes)
    print '#Valid urls:', len(valid_urls)
    all_nodes.sort(key=lambda x: x.start_time)

    ###### Construct link trees
    print 'Building referrer trees...'
    new_graph = Graph()
    for node in all_nodes:
        new_graph.add_node(node)
    trees = new_graph.all_trees()
    junk_nodes = new_graph.junk_nodes

    # Little trick: treat a one-node tree as invalid and
    # move its nodes to 'junk_nodes'
    valid_trees = []
    for tree in trees:
        if len(tree.nodes) > 1:
            valid_trees.append(tree)
        else:
            junk_nodes += tree.nodes
    print '#Valid trees: {0}\n#Junk nodes: {1}'.format(len(valid_trees), len(junk_nodes))

    ###### Parse page candidates
    print 'Constructing page-level objects...'
    all_pages = []
    for tree in valid_trees:
        ###### Detect valid HTML elements as Main Object Candidates (MOCs)
        mocs = []
        for node in tree.expand_tree(mode=_WIDTH):  # must be breadth-first (_WIDTH)
            if tree[node].is_root() and int(tree[node].status) == 200:
                mocs.append(node)

        # Prune MOCs whose remaining subtree is too small, scanning bottom-up
        tmp = []
        for moc in mocs[::-1]:
            bp = tree[moc].bpointer
            if bp is None:
                tmp.append(moc)
            else:
                valid_nodes = 0
                for i in tree.expand_tree(moc, filter=lambda x: x == moc or x not in tmp):
                    valid_nodes += 1
                # Little trick: do not cut a subtree with only one node
                if valid_nodes > 1:
                    tmp.append(moc)
        mocs = tmp

        ###### Parse pages according to the paper
        # (An earlier variant built a page from every MOC, cutting at all
        # MOCs and validating each page by looking up its root URL in
        # 'valid_urls'; it is superseded by the version below.)
        real = []
        vurl_arr = [i[0] for i in valid_urls]
        for moc in mocs:
            if utilities.search_url(tree[moc].url, vurl_arr) is True:
                real.append(moc)
        for rootid in mocs[:]:
            new_page = WebPage()
            all_pages.append(new_page)
            for nodeid in tree.expand_tree(rootid, filter=lambda x: x == rootid or x not in real):
                if nodeid == rootid:
                    new_page.add_obj(tree[nodeid], root=True)
                else:
                    new_page.add_obj(tree[nodeid])
            if new_page.root.identifier in real:
                new_page.isvalid = True
            if tree[rootid].bpointer is not None:
                new_page.ref = tree[tree[rootid].bpointer]

    all_pages.sort(key=lambda x: x.root.start_time)
    print '#Page-level objects: %d' % len(all_pages)
    return valid_trees, all_pages, junk_nodes
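# A minimal driver sketch tying the two parsers together: the raw objects
# collected from a HAR folder are fed to the SVM-style page parser. The
# folder path and the shape of 'valid_urls' (a list of (url, label) pairs,
# matching the i[0] indexing above) are assumptions for illustration.
if __name__ == '__main__':
    real_pages, raw_objects = parse_pages_har('./har_captures')
    valid_urls = [(p.root.url, 1) for p in real_pages]  # hypothetical labels
    trees, pages, junk = parse_pages_svm(raw_objects, valid_urls)
    print '#Trees: %d  #Pages: %d  #Junk: %d' % (len(trees), len(pages), len(junk))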