Example #1
0
def get_svm_pages(all_objects, valid_urls, predicted_file):
	"""Re-cut page trees at the candidates that were predicted positive.

	Candidate pages come from ``svm.parse_pages_svm``. ``predicted_file``
	holds one label per line, matched to the candidates by position. Every
	candidate labelled ``'1'`` becomes a cut point: each valid tree is split
	at those roots, the resulting pages are sorted by root start time, and
	each junk node is attached to the latest page whose root started
	strictly before it (junk nodes with no such page are dropped).

	Args:
		all_objects: forwarded to ``svm.parse_pages_svm``.
		valid_urls: forwarded to ``svm.parse_pages_svm``.
		predicted_file: path of the file with one predicted label per line.

	Returns:
		Tuple ``(recut_pos_pages, recut_tp_pages)``: every re-cut page, and
		the subset whose root identifier belongs to a positive candidate
		with a truthy ``isvalid``.

	Raises:
		AssertionError: if the label count differs from the candidate count.
	"""
	(valid_trees, all_pages, junk_nodes) = svm.parse_pages_svm(all_objects, valid_urls)

	# Read predicted labels. The context manager closes the handle even if
	# reading fails; the previous code never closed it.
	with open(predicted_file, 'rb') as label_file:
		all_labels = [line.rstrip(' \r\n') for line in label_file]

	print('%d %d' % (len(all_pages), len(all_labels)))
	assert len(all_pages) == len(all_labels)

	# Split the positively labelled candidates into true / false positives.
	tp_pages = []
	fp_pages = []
	for page, label in zip(all_pages, all_labels):
		if label != '1':
			continue
		if page.isvalid:
			tp_pages.append(page)
		else:
			fp_pages.append(page)

	# Sets make the membership tests inside the tree filters O(1).
	# NOTE(review): assumes node identifiers are hashable -- confirm.
	tp_roots = set(page.root.identifier for page in tp_pages)
	pos_roots = tp_roots.union(page.root.identifier for page in fp_pages)

	# Recut trees using predicted page candidates.
	print('Predicted pos: %d' % (len(tp_pages) + len(fp_pages)))
	recut_pos_pages = []
	for tree in valid_trees:
		local_pos_roots = list(tree.expand_tree(filter=lambda x: x in pos_roots))
		local_root_set = set(local_pos_roots)
		for root in local_pos_roots:
			new_page = WebPage()
			new_page.add_obj(tree[root], root=True)
			# Traverse from this root but do not descend into other
			# positive roots, which own their own pages.
			# NOTE(review): the filter admits `root` itself, so if
			# expand_tree yields its start node the root is handed to
			# add_obj a second time -- kept as in the original; confirm.
			for node in tree.expand_tree(
					root,
					filter=lambda x: x == root or x not in local_root_set):
				new_page.add_obj(tree[node])
			recut_pos_pages.append(new_page)

	recut_pos_pages.sort(key=lambda page: page.root.start_time)

	# Attach each junk node to the latest page that started before it.
	for node in junk_nodes:
		for page in reversed(recut_pos_pages):
			if page.root.start_time < node.start_time:
				page.junk_objs.append(node)
				break

	recut_tp_pages = [
		page for page in recut_pos_pages
		if page.root.identifier in tp_roots
	]
	return recut_pos_pages, recut_tp_pages
Example #2
0
def process_tree(tree, k, t):
	"""Split one tree into pages rooted at the candidates that qualify.

	Candidates are the node ids, collected with ``expand_tree(mode=_WIDTH)``,
	whose node reports ``is_root()`` and whose ``status`` converts to 200.
	They are examined in reverse collection order:

	* a candidate whose ``bpointer`` is ``None`` is always kept;
	* any other candidate is kept only when the traversal started at it
	  (not entering candidates already kept) yields more than ``k`` ids
	  and it started at least ``t`` seconds after the node named by its
	  ``bpointer``.

	Args:
		tree: tree object supporting ``expand_tree`` and lookup by node id.
		k: a non-top candidate needs strictly more than this many ids.
		t: minimum gap in seconds between a candidate and its predecessor.

	Returns:
		List of ``WebPage`` objects, one per kept candidate, in candidate
		collection order.
	"""
	# Traversal mode must be _WIDTH.
	candidates = [
		nid for nid in tree.expand_tree(mode=_WIDTH)
		if tree[nid].is_root() and int(tree[nid].status) == 200
	]

	kept = []
	for cand in reversed(candidates):
		cand_node = tree[cand]
		parent_id = cand_node.bpointer
		if parent_id is None:
			kept.append(cand)
			continue

		parent_node = tree[parent_id]
		# Count what this candidate would own, stopping at kept candidates.
		reachable = list(tree.expand_tree(
			cand, filter=lambda x: x == cand or x not in kept))
		if len(reachable) <= k:
			continue
		gap = cand_node.start_time - parent_node.start_time
		if gap >= datetime.timedelta(seconds=t):
			kept.append(cand)

	# Build one page per kept candidate.
	pages = []
	for root_id in reversed(kept):
		page = WebPage()
		page.add_obj(tree[root_id], True)
		for member_id in tree.expand_tree(
				root_id, filter=lambda x: x == root_id or x not in kept):
			page.add_obj(tree[member_id])
		pages.append(page)

	return pages
Example #3
0
def get_timetype_pages(all_objects, valid_pages, t=1.2):
    """Group objects into pages using both the root flag and a time gap.

    ``all_objects`` is sorted in place by ``start_time``. A root object
    (``is_root()``) opens a new page when no page exists yet, or when it
    started at least ``t`` seconds after the previously visited object.
    Every other object joins the current page. Objects seen before the
    first page exists are skipped; they used to raise ``AttributeError``
    because there was no page to add them to.

    Args:
        all_objects: list of objects exposing ``start_time`` and
            ``is_root()``; reordered in place.
        valid_pages: container of root identifiers counted as true pages.
        t: minimum gap in seconds for a root to open a new page.

    Returns:
        Tuple ``(all_pages, tp_pages)``: every page built, and those whose
        ``root.identifier`` is in ``valid_pages``.
    """
    all_objects.sort(key=lambda obj: obj.start_time)
    min_gap = datetime.timedelta(seconds=t)

    all_pages = []
    last_page = None
    last_node = None
    for node in all_objects:
        # The gap is measured against the previous object of any kind,
        # not against the previous root.
        starts_page = node.is_root() and (
            last_page is None
            or node.start_time - last_node.start_time >= min_gap)
        if starts_page:
            last_page = WebPage()
            all_pages.append(last_page)
            last_page.add_obj(node, root=True)
        elif last_page is not None:
            last_page.add_obj(node)
        # else: no page exists yet to attach this object to, so skip it.
        last_node = node

    tp_pages = [
        page for page in all_pages if page.root.identifier in valid_pages
    ]
    return all_pages, tp_pages
def get_timetype_pages(all_objects, valid_pages, t=1.2):
	"""Group objects into pages using both the root flag and a time gap.

	``all_objects`` is sorted in place by ``start_time``. A root object
	(``is_root()``) opens a new page when no page exists yet, or when it
	started at least ``t`` seconds after the previously visited object.
	Every other object joins the current page. Objects seen before the
	first page exists are skipped; they used to raise ``AttributeError``
	because there was no page to add them to.

	Args:
		all_objects: list of objects exposing ``start_time`` and
			``is_root()``; reordered in place.
		valid_pages: container of root identifiers counted as true pages.
		t: minimum gap in seconds for a root to open a new page.

	Returns:
		Tuple ``(all_pages, tp_pages)``: every page built, and those whose
		``root.identifier`` is in ``valid_pages``.
	"""
	all_objects.sort(key=lambda obj: obj.start_time)
	min_gap = datetime.timedelta(seconds=t)

	all_pages = []
	last_page = None
	last_node = None
	for node in all_objects:
		# The gap is measured against the previous object of any kind,
		# not against the previous root.
		starts_page = node.is_root() and (
			last_page is None
			or node.start_time - last_node.start_time >= min_gap)
		if starts_page:
			last_page = WebPage()
			all_pages.append(last_page)
			last_page.add_obj(node, root=True)
		elif last_page is not None:
			last_page.add_obj(node)
		# else: no page exists yet to attach this object to, so skip it.
		last_node = node

	tp_pages = [
		page for page in all_pages if page.root.identifier in valid_pages
	]
	return all_pages, tp_pages
Example #5
0
def process_tree(tree, k, t):
    """Split one tree into pages rooted at the candidates that qualify.

    Candidates are the node ids, collected with ``expand_tree(mode=_WIDTH)``,
    whose node reports ``is_root()`` and whose ``status`` converts to 200.
    They are examined in reverse collection order:

    * a candidate whose ``bpointer`` is ``None`` is always kept;
    * any other candidate is kept only when the traversal started at it
      (not entering candidates already kept) yields more than ``k`` ids
      and it started at least ``t`` seconds after the node named by its
      ``bpointer``.

    Args:
        tree: tree object supporting ``expand_tree`` and lookup by node id.
        k: a non-top candidate needs strictly more than this many ids.
        t: minimum gap in seconds between a candidate and its predecessor.

    Returns:
        List of ``WebPage`` objects, one per kept candidate, in candidate
        collection order.
    """
    # Traversal mode must be _WIDTH.
    candidates = [
        nid for nid in tree.expand_tree(mode=_WIDTH)
        if tree[nid].is_root() and int(tree[nid].status) == 200
    ]

    kept = []
    for cand in reversed(candidates):
        cand_node = tree[cand]
        parent_id = cand_node.bpointer
        if parent_id is None:
            kept.append(cand)
            continue

        parent_node = tree[parent_id]
        # Count what this candidate would own, stopping at kept candidates.
        reachable = list(tree.expand_tree(
            cand, filter=lambda x: x == cand or x not in kept))
        if len(reachable) <= k:
            continue
        gap = cand_node.start_time - parent_node.start_time
        if gap >= datetime.timedelta(seconds=t):
            kept.append(cand)

    # Build one page per kept candidate.
    pages = []
    for root_id in reversed(kept):
        page = WebPage()
        page.add_obj(tree[root_id], True)
        for member_id in tree.expand_tree(
                root_id, filter=lambda x: x == root_id or x not in kept):
            page.add_obj(tree[member_id])
        pages.append(page)

    return pages
Example #6
0
def get_type_pages(all_nodes, valid_pages):
	"""Group nodes into pages, opening a new page at every root node.

	``all_nodes`` is sorted in place by ``start_time``. Each node whose
	``is_root()`` is true starts a new page; every other node is added to
	the most recently started page. Non-root nodes seen before the first
	root are dropped.

	Args:
		all_nodes: list of nodes exposing ``start_time`` and ``is_root()``;
			reordered in place.
		valid_pages: container of root identifiers counted as true pages.

	Returns:
		Tuple ``(all_pages, tp_pages)``: every page built, and those whose
		``root.identifier`` is in ``valid_pages``.
	"""
	# A plain key sort orders identically to the old cmp-wrapper call and
	# does not rely on the positional cmp argument removed in Python 3.
	all_nodes.sort(key=lambda node: node.start_time)

	all_pages = []
	last_page = None
	for node in all_nodes:
		if node.is_root():
			last_page = WebPage()
			last_page.add_obj(node, root=True)
			all_pages.append(last_page)
		elif last_page is not None:
			last_page.add_obj(node)

	tp_pages = [
		page for page in all_pages if page.root.identifier in valid_pages
	]
	return all_pages, tp_pages
Example #7
0
def get_type_pages(all_nodes, valid_pages):
    """Group nodes into pages, opening a new page at every root node.

    ``all_nodes`` is sorted in place by ``start_time``. Each node whose
    ``is_root()`` is true starts a new page; every other node is added to
    the most recently started page. Non-root nodes seen before the first
    root are dropped.

    Args:
        all_nodes: list of nodes exposing ``start_time`` and ``is_root()``;
            reordered in place.
        valid_pages: container of root identifiers counted as true pages.

    Returns:
        Tuple ``(all_pages, tp_pages)``: every page built, and those whose
        ``root.identifier`` is in ``valid_pages``.
    """
    # A plain key sort orders identically to the old cmp-wrapper call and
    # does not rely on the positional cmp argument removed in Python 3.
    all_nodes.sort(key=lambda node: node.start_time)

    all_pages = []
    last_page = None
    for node in all_nodes:
        if node.is_root():
            last_page = WebPage()
            last_page.add_obj(node, root=True)
            all_pages.append(last_page)
        elif last_page is not None:
            last_page.add_obj(node)

    tp_pages = [
        page for page in all_pages if page.root.identifier in valid_pages
    ]
    return all_pages, tp_pages
Example #8
0
def get_svm_pages(all_objects, valid_urls, predicted_file):
    """Re-cut page trees at the candidates that were predicted positive.

    Candidate pages come from ``svm.parse_pages_svm``. ``predicted_file``
    holds one label per line, matched to the candidates by position. Every
    candidate labelled ``'1'`` becomes a cut point: each valid tree is split
    at those roots, the resulting pages are sorted by root start time, and
    each junk node is attached to the latest page whose root started
    strictly before it (junk nodes with no such page are dropped).

    Args:
        all_objects: forwarded to ``svm.parse_pages_svm``.
        valid_urls: forwarded to ``svm.parse_pages_svm``.
        predicted_file: path of the file with one predicted label per line.

    Returns:
        Tuple ``(recut_pos_pages, recut_tp_pages)``: every re-cut page, and
        the subset whose root identifier belongs to a positive candidate
        with a truthy ``isvalid``.

    Raises:
        AssertionError: if the label count differs from the candidate count.
    """
    (valid_trees, all_pages,
     junk_nodes) = svm.parse_pages_svm(all_objects, valid_urls)

    # Read predicted labels. The context manager closes the handle even if
    # reading fails; the previous code never closed it.
    with open(predicted_file, 'rb') as label_file:
        all_labels = [line.rstrip(' \r\n') for line in label_file]

    print('%d %d' % (len(all_pages), len(all_labels)))
    assert len(all_pages) == len(all_labels)

    # Split the positively labelled candidates into true / false positives.
    tp_pages = []
    fp_pages = []
    for page, label in zip(all_pages, all_labels):
        if label != '1':
            continue
        if page.isvalid:
            tp_pages.append(page)
        else:
            fp_pages.append(page)

    # Sets make the membership tests inside the tree filters O(1).
    # NOTE(review): assumes node identifiers are hashable -- confirm.
    tp_roots = set(page.root.identifier for page in tp_pages)
    pos_roots = tp_roots.union(page.root.identifier for page in fp_pages)

    # Recut trees using predicted page candidates.
    print('Predicted pos: %d' % (len(tp_pages) + len(fp_pages)))
    recut_pos_pages = []
    for tree in valid_trees:
        local_pos_roots = list(
            tree.expand_tree(filter=lambda x: x in pos_roots))
        local_root_set = set(local_pos_roots)
        for root in local_pos_roots:
            new_page = WebPage()
            new_page.add_obj(tree[root], root=True)
            # Traverse from this root but do not descend into other
            # positive roots, which own their own pages.
            # NOTE(review): the filter admits `root` itself, so if
            # expand_tree yields its start node the root is handed to
            # add_obj a second time -- kept as in the original; confirm.
            for node in tree.expand_tree(
                    root,
                    filter=lambda x: x == root or x not in local_root_set):
                new_page.add_obj(tree[node])
            recut_pos_pages.append(new_page)

    recut_pos_pages.sort(key=lambda page: page.root.start_time)

    # Attach each junk node to the latest page that started before it.
    for node in junk_nodes:
        for page in reversed(recut_pos_pages):
            if page.root.start_time < node.start_time:
                page.junk_objs.append(node)
                break

    recut_tp_pages = [
        page for page in recut_pos_pages
        if page.root.identifier in tp_roots
    ]
    return recut_pos_pages, recut_tp_pages
Example #9
0
# NOTE(review): this_log, input_file and detected_pageurl are not defined in
# this fragment; they are expected to be set earlier in the script.
print('log file: %s' % this_log)

print('Reading log...')
all_lines = logbasic.read(input_file)

print('Processing rrp...')
# One node per log line, ordered by start time.
all_nodes = [logbasic.NodeFromLog(line) for line in all_lines]
all_nodes.sort(key=lambda x: x.start_time)

# Every root node opens a new page; other nodes join the latest page.
# Non-root nodes seen before the first root are dropped.
all_pages = []
last_page = None
for node in all_nodes:
	if node.is_root():
		new_page = WebPage()
		new_page.add_obj(node, root=True)
		all_pages.append(new_page)
		last_page = new_page
	elif last_page is not None:
		last_page.add_obj(node)

print(len(all_nodes))
print(len(all_pages))

# Write the root URL of every detected page, one per line. The context
# manager closes the file even if the write raises.
all_urls = [i.root.url for i in all_pages]
with open(detected_pageurl, 'wb') as ofile:
	ofile.write('\n'.join(all_urls))
Example #10
0
# NOTE(review): all_nodes, all_lines and log_h are not defined in this
# fragment; they are expected to be set earlier in the script.
for line in all_lines:
    all_nodes.append(logbasic.NodeFromLog(line))
# A plain key sort orders identically to the old cmp-wrapper call.
all_nodes.sort(key=lambda x: x.start_time)

print(len(all_nodes))

# Gap thresholds in seconds: 0.2, 0.4, ..., 20.0.
T = [i / 10.0 for i in range(2, 202, 2)]

for t in T:
    log_h.log('########################\n')
    min_gap = datetime.timedelta(seconds=t)  # loop-invariant, built once
    all_pages = []
    last_page = None
    last_node = None
    for node in all_nodes:
        # Open a new page for the first node, and whenever a node starts
        # at least t seconds after the node visited just before it.
        if (last_page is None
                or node.start_time - last_node.start_time >= min_gap):
            new_page = WebPage()
            all_pages.append(new_page)
            new_page.add_obj(node, root=True)
            last_page = new_page
        else:
            last_page.add_obj(node)
        last_node = node

    print('Page count: %d' % len(all_pages))