Example no. 1
0
def get_svm_pages(all_objects, valid_urls, predicted_file):

	(valid_trees, all_pages, junk_nodes) = svm.parse_pages_svm(all_objects, valid_urls)

	# read pridicted lables
	all_labels = [i.rstrip(' \r\n') for i in open(predicted_file, 'rb')]
	tp_pages = []
	fp_pages = []

	print len(all_pages), len(all_labels)
	assert len(all_pages) == len(all_labels)

	for i in range(0, len(all_pages)):
		if all_labels[i] == '1':
			if all_pages[i].isvalid:
				tp_pages.append(all_pages[i])
			else:
				fp_pages.append(all_pages[i])

	pos_pages = tp_pages + fp_pages
	tp_roots = [i.root.identifier for i in tp_pages]
	fp_roots = [i.root.identifier for i in fp_pages]
	pos_roots = [i.root.identifier for i in pos_pages]

	# recut trees using predicted page candidates
	print 'Predicted pos:', len(pos_roots)
	recut_pos_pages = []
	for tree in valid_trees:
		local_pos_roots = [i for i in tree.expand_tree(filter = lambda x: x in pos_roots)]
		for root in local_pos_roots:
			new_page = WebPage()
			new_page.add_obj(tree[root], root=True)
			for node in tree.expand_tree(root, filter = lambda x: x==root or x not in local_pos_roots):
				new_page.add_obj(tree[node])
			recut_pos_pages.append(new_page)
			

	recut_pos_pages.sort(lambda x,y: cmp(x,y), lambda x: x.root.start_time, False)

	# add junk nodes to recut pos pages
	junk2 = len(junk_nodes)
	for node in junk_nodes:
		found_flag = False
		for page in recut_pos_pages[::-1]:
			if cmp(page.root.start_time, node.start_time) < 0:
				found_flag = True
				break
		if found_flag:
			page.junk_objs.append(node)
			junk2 -= 1

	recut_tp_pages = []
	recut_fp_pages = []
	for page in recut_pos_pages:
		if page.root.identifier in tp_roots:
			recut_tp_pages.append(page)
		elif page.root.identifier in fp_roots:
			recut_fp_pages.append(page)

	return recut_pos_pages, recut_tp_pages
Example no. 2
0
def gen_instances(all_nodes, valid_urls):
    global log_h

    # Parse pages for SVM
    (valid_trees, all_pages, junk_nodes) = svm.parse_pages_svm(all_nodes, valid_urls)

    ###### add junk
    junk2 = len(junk_nodes)
    for node in junk_nodes:
        found_flag = False
        for page in all_pages[::-1]:
            if cmp(page.root.start_time, node.start_time) < 0:
                found_flag = True
                break
        if found_flag:
            page.junk_objs.append(node)
            junk2 -= 1

            ###### extract instances

    print len(all_pages)

    all_instances = []
    instance_pos_url = []
    pos_cnt = 0
    neg_cnt = 0
    for page in all_pages:
        pf = PageFeature(page)
        if page.isvalid:
            # log('{0} {1}'.format(page.root.url, len(page.objs)))
            instance_pos_url.append(page.root.url)
            label = 1
            pos_cnt += 1
        else:
            label = -1
            neg_cnt += 1
        instance = pf.assemble_instance(label)
        all_instances.append(instance)

    log_h.log("#Page:{0}\n#Non-page:{1}".format(pos_cnt, neg_cnt))

    return all_instances, instance_pos_url
Example no. 3
0
def gen_instances(all_nodes, valid_urls):
	"""Build labelled SVM training instances from candidate pages.

	Parses all_nodes into candidate page trees, folds each junk node
	into the page that most recently started before it, then emits one
	feature instance per candidate page: label +1 when the page root is
	ground-truth valid, -1 otherwise.

	Returns:
		(all_instances, instance_pos_url): the labelled instances and
		the root URLs of the positively-labelled pages.
	"""
	global log_h

	# Parse pages for SVM
	(valid_trees, all_pages, junk_nodes) = svm.parse_pages_svm(all_nodes, valid_urls)
	
	###### add junk
	# Scan pages newest-first; the first page that started before this
	# junk node adopts it.  Note `page` deliberately leaks out of the
	# inner loop and is used after the break (Python 2 scoping).
	junk2 = len(junk_nodes)
	for node in junk_nodes:
		found_flag = False
		for page in all_pages[::-1]:
			if cmp(page.root.start_time, node.start_time) < 0:
				found_flag = True
				break
		if found_flag:
			page.junk_objs.append(node)
			junk2 -= 1  # junk2 ends as the count of unassigned junk nodes (unused)

	###### extract instances

	print len(all_pages)

	all_instances = []
	instance_pos_url = []
	pos_cnt = 0
	neg_cnt = 0	
	for page in all_pages:
		pf = PageFeature(page)
		if page.isvalid:
			#log('{0} {1}'.format(page.root.url, len(page.objs)))
			instance_pos_url.append(page.root.url)
			label = 1
			pos_cnt += 1
		else:
			label = -1
			neg_cnt += 1
		instance = pf.assemble_instance(label)
		all_instances.append(instance)

	log_h.log('#Page:{0}\n#Non-page:{1}'.format(pos_cnt, neg_cnt))

	return all_instances, instance_pos_url
Example no. 4
0
def get_svm_pages(all_objects, valid_urls, predicted_file):

    (valid_trees, all_pages,
     junk_nodes) = svm.parse_pages_svm(all_objects, valid_urls)

    # read pridicted lables
    all_labels = [i.rstrip(' \r\n') for i in open(predicted_file, 'rb')]
    tp_pages = []
    fp_pages = []

    print len(all_pages), len(all_labels)
    assert len(all_pages) == len(all_labels)

    for i in range(0, len(all_pages)):
        if all_labels[i] == '1':
            if all_pages[i].isvalid:
                tp_pages.append(all_pages[i])
            else:
                fp_pages.append(all_pages[i])

    pos_pages = tp_pages + fp_pages
    tp_roots = [i.root.identifier for i in tp_pages]
    fp_roots = [i.root.identifier for i in fp_pages]
    pos_roots = [i.root.identifier for i in pos_pages]

    # recut trees using predicted page candidates
    print 'Predicted pos:', len(pos_roots)
    recut_pos_pages = []
    for tree in valid_trees:
        local_pos_roots = [
            i for i in tree.expand_tree(filter=lambda x: x in pos_roots)
        ]
        for root in local_pos_roots:
            new_page = WebPage()
            new_page.add_obj(tree[root], root=True)
            for node in tree.expand_tree(
                    root,
                    filter=lambda x: x == root or x not in local_pos_roots):
                new_page.add_obj(tree[node])
            recut_pos_pages.append(new_page)

    recut_pos_pages.sort(lambda x, y: cmp(x, y), lambda x: x.root.start_time,
                         False)

    # add junk nodes to recut pos pages
    junk2 = len(junk_nodes)
    for node in junk_nodes:
        found_flag = False
        for page in recut_pos_pages[::-1]:
            if cmp(page.root.start_time, node.start_time) < 0:
                found_flag = True
                break
        if found_flag:
            page.junk_objs.append(node)
            junk2 -= 1

    recut_tp_pages = []
    recut_fp_pages = []
    for page in recut_pos_pages:
        if page.root.identifier in tp_roots:
            recut_tp_pages.append(page)
        elif page.root.identifier in fp_roots:
            recut_fp_pages.append(page)

    return recut_pos_pages, recut_tp_pages