def get_svm_pages(all_objects, valid_urls, predicted_file):
    """Re-cut the parsed page trees around the pages the SVM predicted positive.

    The pages returned by ``svm.parse_pages_svm`` are paired, in order, with
    the labels read from ``predicted_file`` (one label per line).  A page
    whose label is ``'1'`` is a predicted positive: a true positive when
    ``page.isvalid`` is truthy, a false positive otherwise.  Every tree is
    then split so that each predicted-positive root starts a new ``WebPage``,
    the pages are ordered by root start time, and each junk node is attached
    to the latest-starting page whose root started strictly before it.

    Args:
        all_objects: Passed unchanged to ``svm.parse_pages_svm``.
        valid_urls: Passed unchanged to ``svm.parse_pages_svm``.
        predicted_file: Path of the label file; it must contain exactly one
            line per page returned by ``svm.parse_pages_svm``.

    Returns:
        A tuple ``(recut_pos_pages, recut_tp_pages)``: all re-cut pages
        sorted by ``root.start_time``, and the subset whose root identifier
        belongs to a true-positive page.

    Raises:
        AssertionError: If the number of labels differs from the number of
            parsed pages.
    """
    (valid_trees, all_pages, junk_nodes) = svm.parse_pages_svm(
        all_objects, valid_urls)

    # Read predicted labels; the context manager closes the file handle,
    # which the previous implementation leaked.
    with open(predicted_file, 'rb') as label_file:
        all_labels = [line.rstrip(' \r\n') for line in label_file]

    print('%d %d' % (len(all_pages), len(all_labels)))
    assert len(all_pages) == len(all_labels)

    tp_pages = []
    fp_pages = []
    for page, label in zip(all_pages, all_labels):
        if label == '1':
            if page.isvalid:
                tp_pages.append(page)
            else:
                fp_pages.append(page)

    # Sets give O(1) membership tests inside the tree filters below.
    # NOTE(review): assumes root identifiers are hashable (they are already
    # used to index the tree via ``tree[root]``) -- confirm.
    tp_roots = set(page.root.identifier for page in tp_pages)
    pos_roots = tp_roots.union(page.root.identifier for page in fp_pages)

    # Re-cut trees using the predicted page candidates.
    print('Predicted pos: %d' % (len(tp_pages) + len(fp_pages)))
    recut_pos_pages = []
    for tree in valid_trees:
        # Keep the traversal order as a list; use a set only for lookups.
        local_pos_roots = list(
            tree.expand_tree(filter=lambda x: x in pos_roots))
        local_root_set = set(local_pos_roots)
        for root in local_pos_roots:
            new_page = WebPage()
            new_page.add_obj(tree[root], root=True)
            # Walk the subtree under ``root`` but stop at any other predicted
            # root, which starts its own page.
            # NOTE(review): expand_tree presumably yields ``root`` itself as
            # well, so the root object may reach add_obj twice -- confirm
            # that WebPage.add_obj tolerates this.
            for node in tree.expand_tree(
                    root,
                    filter=lambda x, root=root: (
                        x == root or x not in local_root_set)):
                new_page.add_obj(tree[node])
            recut_pos_pages.append(new_page)

    recut_pos_pages.sort(key=lambda page: page.root.start_time)

    # Attach each junk node to the latest page that started before it; nodes
    # with no such page are left unattached.
    for node in junk_nodes:
        for page in reversed(recut_pos_pages):
            if page.root.start_time < node.start_time:
                page.junk_objs.append(node)
                break

    recut_tp_pages = [page for page in recut_pos_pages
                      if page.root.identifier in tp_roots]
    return recut_pos_pages, recut_tp_pages
def gen_instances(all_nodes, valid_urls):
    """Turn the parsed pages into labelled SVM instances.

    Pages come from ``svm.parse_pages_svm``.  Junk nodes are first attached
    to pages, then every page is converted with
    ``PageFeature(page).assemble_instance(label)`` where the label is ``1``
    when ``page.isvalid`` is truthy and ``-1`` otherwise.  The page count is
    printed and the positive/negative totals are written through
    ``log_h.log``.

    Args:
        all_nodes: Passed unchanged to ``svm.parse_pages_svm``.
        valid_urls: Passed unchanged to ``svm.parse_pages_svm``.

    Returns:
        A tuple ``(all_instances, instance_pos_url)``: one instance per page
        in page order, and the root URLs of the pages labelled ``1``.
    """
    # Parse pages for SVM (the returned trees are not needed here).
    _trees, all_pages, junk_nodes = svm.parse_pages_svm(all_nodes, valid_urls)

    # Add junk: scanning the pages from the end, a node goes to the first
    # page whose root started before it; otherwise it is dropped.
    # NOTE(review): presumably all_pages is already in start-time order --
    # confirm against svm.parse_pages_svm.
    for junk in junk_nodes:
        for candidate in reversed(all_pages):
            if cmp(candidate.root.start_time, junk.start_time) < 0:
                candidate.junk_objs.append(junk)
                break

    # Extract instances.
    print(len(all_pages))
    all_instances = []
    instance_pos_url = []
    tally = {1: 0, -1: 0}
    for page in all_pages:
        feature = PageFeature(page)
        label = 1 if page.isvalid else -1
        if label == 1:
            instance_pos_url.append(page.root.url)
        tally[label] += 1
        all_instances.append(feature.assemble_instance(label))

    log_h.log("#Page:{0}\n#Non-page:{1}".format(tally[1], tally[-1]))
    return all_instances, instance_pos_url
def gen_instances(all_nodes, valid_urls):
    """Build one labelled SVM instance per parsed page.

    The pages and junk nodes are obtained from ``svm.parse_pages_svm``.
    After junk nodes have been attached to pages, each page is wrapped in a
    ``PageFeature`` and assembled into an instance labelled ``1`` (when
    ``page.isvalid`` is truthy) or ``-1``.  The number of pages is printed
    and the page / non-page totals are reported via ``log_h.log``.

    Args:
        all_nodes: Forwarded as-is to ``svm.parse_pages_svm``.
        valid_urls: Forwarded as-is to ``svm.parse_pages_svm``.

    Returns:
        ``(all_instances, instance_pos_url)`` -- the assembled instances in
        page order, and the root URL of every page labelled ``1``.
    """
    parsed = svm.parse_pages_svm(all_nodes, valid_urls)
    all_pages = parsed[1]
    junk_nodes = parsed[2]

    # Attach junk: walk the pages backwards and stop at the first one whose
    # root start time precedes the node's; unmatched nodes are skipped.
    for node in junk_nodes:
        owner = None
        for page in all_pages[::-1]:
            if cmp(page.root.start_time, node.start_time) < 0:
                owner = page
                break
        if owner is not None:
            owner.junk_objs.append(node)

    # Extract instances.
    print(len(all_pages))
    all_instances = []
    instance_pos_url = []
    for page in all_pages:
        page_feature = PageFeature(page)
        if page.isvalid:
            instance_pos_url.append(page.root.url)
            label = 1
        else:
            label = -1
        all_instances.append(page_feature.assemble_instance(label))

    # Exactly one URL is recorded per positive page, so the two totals can
    # be read off the list lengths.
    positives = len(instance_pos_url)
    negatives = len(all_instances) - positives
    log_h.log('#Page:{0}\n#Non-page:{1}'.format(positives, negatives))
    return all_instances, instance_pos_url
def get_svm_pages(all_objects, valid_urls, predicted_file):
    """Re-cut the parsed page trees around the pages the SVM predicted positive.

    The pages returned by ``svm.parse_pages_svm`` are paired, in order, with
    the labels read from ``predicted_file`` (one label per line).  A page
    whose label is ``'1'`` is a predicted positive: a true positive when
    ``page.isvalid`` is truthy, a false positive otherwise.  Every tree is
    then split so that each predicted-positive root starts a new ``WebPage``,
    the pages are ordered by root start time, and each junk node is attached
    to the latest-starting page whose root started strictly before it.

    Args:
        all_objects: Passed unchanged to ``svm.parse_pages_svm``.
        valid_urls: Passed unchanged to ``svm.parse_pages_svm``.
        predicted_file: Path of the label file; it must contain exactly one
            line per page returned by ``svm.parse_pages_svm``.

    Returns:
        A tuple ``(recut_pos_pages, recut_tp_pages)``: all re-cut pages
        sorted by ``root.start_time``, and the subset whose root identifier
        belongs to a true-positive page.

    Raises:
        AssertionError: If the number of labels differs from the number of
            parsed pages.
    """
    (valid_trees, all_pages, junk_nodes) = svm.parse_pages_svm(
        all_objects, valid_urls)

    # Read predicted labels; the context manager closes the file handle,
    # which the previous implementation leaked.
    with open(predicted_file, 'rb') as label_file:
        all_labels = [line.rstrip(' \r\n') for line in label_file]

    print('%d %d' % (len(all_pages), len(all_labels)))
    assert len(all_pages) == len(all_labels)

    tp_pages = []
    fp_pages = []
    for page, label in zip(all_pages, all_labels):
        if label == '1':
            if page.isvalid:
                tp_pages.append(page)
            else:
                fp_pages.append(page)

    # Sets give O(1) membership tests inside the tree filters below.
    # NOTE(review): assumes root identifiers are hashable (they are already
    # used to index the tree via ``tree[root]``) -- confirm.
    tp_roots = set(page.root.identifier for page in tp_pages)
    pos_roots = tp_roots.union(page.root.identifier for page in fp_pages)

    # Re-cut trees using the predicted page candidates.
    print('Predicted pos: %d' % (len(tp_pages) + len(fp_pages)))
    recut_pos_pages = []
    for tree in valid_trees:
        # Keep the traversal order as a list; use a set only for lookups.
        local_pos_roots = list(
            tree.expand_tree(filter=lambda x: x in pos_roots))
        local_root_set = set(local_pos_roots)
        for root in local_pos_roots:
            new_page = WebPage()
            new_page.add_obj(tree[root], root=True)
            # Walk the subtree under ``root`` but stop at any other predicted
            # root, which starts its own page.
            # NOTE(review): expand_tree presumably yields ``root`` itself as
            # well, so the root object may reach add_obj twice -- confirm
            # that WebPage.add_obj tolerates this.
            for node in tree.expand_tree(
                    root,
                    filter=lambda x, root=root: (
                        x == root or x not in local_root_set)):
                new_page.add_obj(tree[node])
            recut_pos_pages.append(new_page)

    recut_pos_pages.sort(key=lambda page: page.root.start_time)

    # Attach each junk node to the latest page that started before it; nodes
    # with no such page are left unattached.
    for node in junk_nodes:
        for page in reversed(recut_pos_pages):
            if page.root.start_time < node.start_time:
                page.junk_objs.append(node)
                break

    recut_tp_pages = [page for page in recut_pos_pages
                      if page.root.identifier in tp_roots]
    return recut_pos_pages, recut_tp_pages