Example No. 1
0
def main():
    """Compute (real, detected) page-timing pairs and dump them to m_pagetime.txt."""
    parser = argparse.ArgumentParser(
        description=
        'Extracting features as the input of LIBSVM from log. Output = logfile.instance'
    )
    parser.add_argument('harfolder', type=str, help='')
    parser.add_argument('predictfile', type=str, help='')
    args = parser.parse_args()

    # Ground truth: page trees plus the flat list of every object.
    all_real_pages, all_objects = har.parse_pages_har(args.harfolder)
    valid_urls = [page.root.url for page in all_real_pages]

    # Detach each object from its tree so pages can be re-cut from scratch.
    for obj in all_objects:
        obj.bpointer = None
        obj.fpointer = []

    # Pages re-cut from the SVM predictions (all positives, true positives).
    recut_pos_pages, recut_tp_pages = get_svm_pages(
        all_objects, valid_urls, args.predictfile)

    timetuple = pagetime(all_real_pages, recut_tp_pages)

    # Keep only pairs where both timings are strictly positive.
    pagetimings = ['{0} {1}'.format(t[0], t[1])
                   for t in timetuple
                   if t[0] > 0 and t[1] > 0]
    with open('m_pagetime.txt', 'wb') as ofile:
        ofile.write('\n'.join(pagetimings))
Example No. 2
0
def main():
    """Write 'real detected' page-time pairs (both positive) to m_pagetime.txt."""
    cli = argparse.ArgumentParser(description='Extracting features as the input of LIBSVM from log. Output = logfile.instance')
    cli.add_argument('harfolder', type=str, help='')
    cli.add_argument('predictfile', type=str, help='')
    options = cli.parse_args()

    # Ground truth parsed from the HAR folder.
    real_pages, objects = har.parse_pages_har(options.harfolder)
    urls = [p.root.url for p in real_pages]

    # Clear any existing tree links on every object before re-cutting pages.
    for item in objects:
        item.bpointer = None
        item.fpointer = []

    # SVM-based page detection: all positives and the true positives.
    pos_pages, tp_pages = get_svm_pages(objects, urls, options.predictfile)

    pairs = pagetime(real_pages, tp_pages)

    # Serialize only the strictly-positive timing pairs, one per line.
    lines = []
    for pair in pairs:
        if pair[0] > 0 and pair[1] > 0:
            lines.append('{0} {1}'.format(pair[0], pair[1]))
    out = open('m_pagetime.txt', 'wb')
    out.write('\n'.join(lines))
    out.close()
Example No. 3
0
def main():
    """Extract real page URLs (with start times) from HAR files as ground truth.

    Writes one "url<TAB>start_time" line per real page to the output file and
    prints how many pages were written.
    """
    # BUG FIX: the original used backslash line-continuations *inside* the
    # string literals, which embedded the next line's leading tabs into the
    # description/help text.  Implicit string concatenation avoids that.
    parser = argparse.ArgumentParser(
        description='This program extracts real page '
                    'urls of HAR files as groundtruth.')
    parser.add_argument('harfolder', type=str,
                        help='File folder containing HAR '
                             'file(s). All the HAR files under '
                             'this folder will be processed.')
    parser.add_argument('output', type=str, help='Output data.')

    args = parser.parse_args()
    harfolder = args.harfolder
    dumpfile = args.output

    (all_pages, all_objs) = HAR.parse_pages_har(harfolder)

    # Remove any stale output first; note mode 'wb' would truncate anyway,
    # so this is belt-and-braces.
    if os.path.exists(dumpfile):
        os.remove(dumpfile)
    ofile = open(dumpfile, 'wb')

    count = 0  # pages actually written (only those with a root object)
    for p in all_pages:
        if p.root:
            count += 1
            ofile.write("{0}\t{1}\n".format(p.root.url, p.root.start_time))

    print('write {0} real pages to: {1}'.format(count, dumpfile))
    ofile.flush()
    ofile.close()
Example No. 4
0
def main():
    """Detect pages among HAR objects and print classification statistics.

    Parses the HAR folder into ground-truth pages, re-cuts pages with
    get_ss_pages, then prints how many objects were classified right /
    wrong / missed, broken down by coarse content type.
    Python 2 script (uses print statements).
    """
    parser = argparse.ArgumentParser(
        description=
        'Extracting features as the input of LIBSVM from log. Output = logfile.instance'
    )
    parser.add_argument('harfolder', type=str, help='')

    args = parser.parse_args()
    harfolder = args.harfolder

    # Ground truth: page trees plus the flat list of every object.
    (all_real_pages, all_objects) = HAR.parse_pages_har(harfolder)
    valid_pages = [i.root.identifier for i in all_real_pages]

    # Detach each object from its tree so pages can be re-cut from scratch.
    for node in all_objects:
        node.bpointer = None
        node.fpointer = []

    # Pages detected by get_ss_pages: pos_pages = all detections,
    # tp_pages = true positives.
    # NOTE(review): exact semantics of get_ss_pages are defined elsewhere -- confirm.
    (pos_pages, tp_pages) = get_ss_pages(all_objects, valid_pages)

    # objects status
    print 'real pages:', len(all_real_pages)
    print 'detected pages:', len(pos_pages)

    # Split objects into right / wrong / missed w.r.t. ground-truth pages.
    (classified_right, classified_wrong, missed) = \
      check_objects(all_real_pages, pos_pages)

    print 'right {0} wrong {1} missed {2}'.format(len(classified_right),
                                                  len(classified_wrong),
                                                  len(missed))

    def which_type(obj):
        """Map obj.type (a MIME-like string) to a coarse category name."""
        subtype_re = {
            r'.*(jpeg|jpg|gif|png|bmp|ppm|pgm|pbm|pnm|tiff|exif|cgm|svg).*':
            'image',
            r'.*(flash|flv).*': 'flash',
            r'.*(css).*': 'css',
            r'.*(javascript|js).*': 'js',
            r'.*(html|htm).*': 'html',
        }
        # NOTE(review): dict iteration order is arbitrary in Python 2, so a
        # type string matching several patterns may classify nondeterministically.
        if obj.type != None:
            for regex in subtype_re.keys():
                if re.match(re.compile(regex, re.I), obj.type):
                    return subtype_re[regex]
                else:
                    continue
        return 'others'

    def stat(objects):
        """Print per-category object counts for the given object list."""
        html = 0
        js = 0
        css = 0
        flash = 0
        image = 0
        others = 0
        for obj in objects:
            objtype = which_type(obj)
            if objtype == 'html':
                html += 1
            elif objtype == 'js':
                js += 1
            elif objtype == 'css':
                css += 1
            elif objtype == 'flash':
                flash += 1
            elif objtype == 'image':
                image += 1
            elif objtype == 'others':
                others += 1
        print 'html {0} js {1} css {2} flash {3} image {4} others{5}'.\
          format(html, js, css, flash,image,others)

    # Per-type breakdown for each classification bucket.
    stat(classified_right)
    stat(classified_wrong)
    stat(missed)
Example No. 5
0
def main():
	"""Detect pages with the time/type heuristic and print classification stats.

	Parses the HAR folder into ground-truth pages, re-cuts pages with
	get_timetype_pages, then prints how many objects were classified
	right / wrong / missed, broken down by coarse content type.
	Python 2 script (uses print statements).
	"""
	parser = argparse.ArgumentParser(description='Extracting features as the input of LIBSVM from log. Output = logfile.instance')
	parser.add_argument('harfolder', type=str, help= '')

	args = parser.parse_args()
	harfolder = args.harfolder

	# Ground truth: page trees plus the flat list of every object.
	(all_real_pages, all_objects) = HAR.parse_pages_har(harfolder)
	valid_pages = [i.root.identifier for i in all_real_pages]

	# Detach each object from its tree so pages can be re-cut from scratch.
	for node in all_objects:
		node.bpointer = None
		node.fpointer = []

	# Pages detected by the time/type heuristic: pos_pages = all detections,
	# tp_pages = true positives.
	# NOTE(review): exact semantics of get_timetype_pages are defined elsewhere -- confirm.
	(pos_pages, tp_pages) = get_timetype_pages(all_objects, valid_pages)

	# objects status
	print 'real pages:',len(all_real_pages)
	print 'detected pages:', len(pos_pages)

	# Split objects into right / wrong / missed w.r.t. ground-truth pages.
	(classified_right, classified_wrong, missed) = \
			check_objects(all_real_pages, pos_pages)

	print 'right {0} wrong {1} missed {2}'.format(len(classified_right), len(classified_wrong), len(missed))

	def which_type(obj):
		"""Map obj.type (a MIME-like string) to a coarse category name."""
		subtype_re = {
			r'.*(jpeg|jpg|gif|png|bmp|ppm|pgm|pbm|pnm|tiff|exif|cgm|svg).*': 'image',
			r'.*(flash|flv).*': 'flash',
			r'.*(css).*': 'css',
			r'.*(javascript|js).*': 'js',
			r'.*(html|htm).*': 'html',
		}
		# NOTE(review): dict iteration order is arbitrary in Python 2, so a
		# type string matching several patterns may classify nondeterministically.
		if obj.type != None:
			for regex in subtype_re.keys():
				if re.match(re.compile(regex, re.I), obj.type):
					return subtype_re[regex]
				else:
					continue
		return 'others'

	def stat(objects):
		"""Print per-category object counts for the given object list."""
		html = 0
		js = 0
		css = 0
		flash = 0
		image = 0
		others = 0
		for obj in objects:
			objtype = which_type(obj)
			if objtype == 'html':
				html += 1
			elif objtype == 'js':
				js += 1
			elif objtype =='css':
				css += 1
			elif objtype == 'flash':
				flash += 1
			elif objtype == 'image':
				image += 1
			elif objtype == 'others':
				others += 1
		print 'html {0} js {1} css {2} flash {3} image {4} others{5}'.\
				format(html, js, css, flash,image,others)

	# Per-type breakdown for each classification bucket.
	stat(classified_right)
	stat(classified_wrong)
	stat(missed)
Example No. 6
0
def main():
	"""Dump ground-truth page times, re-cut pages with SVM predictions, and
	print object-classification statistics.

	Python 2 script (uses print statements).
	"""
	parser = argparse.ArgumentParser(description='Extracting features as the input of LIBSVM from log. Output = logfile.instance')
	parser.add_argument('harfolder', type=str, help= '')
	parser.add_argument('predictfile', type=str, help= '')

	args = parser.parse_args()
	harfolder = args.harfolder
	predicted_file = args.predictfile

	# Ground truth: page trees plus the flat list of every object.
	(all_real_pages, all_objects) = har.parse_pages_har(harfolder)
	valid_urls = [i.root.url for i in all_real_pages]

	# Detach each object from its tree so pages can be re-cut from scratch.
	for node in all_objects:
		node.bpointer = None
		node.fpointer = []

	# Dump the ground-truth page duration (seconds) of every real page.
	dumpfile = 'pagetime_gt.txt'
	ofile = open(dumpfile, 'wb')
	for page in all_real_pages:
		ofile.write(str(page.total_seconds())+'\n')
	ofile.close()

	# Pages re-cut from the SVM predictions (all positives, true positives).
	(recut_pos_pages, recut_tp_pages) = get_svm_pages(all_objects, valid_urls, predicted_file)

	# page timings
	# pagetimings = [str(i.total_seconds()) for i in recut_pos_pages if i.total_seconds() > 0]
	# ofile = open('pagetime_svm_pos.txt', 'wb')
	# ofile.write('\n'.join(pagetimings))
	# ofile.close()

	# pagetimings = [str(i.total_seconds()) for i in recut_tp_pages if i.total_seconds() > 0]
	# ofile = open('pagetime_svm_tp.txt', 'wb')
	# ofile.write('\n'.join(pagetimings))
	# ofile.close()

	# objects status: split into right / wrong / missed w.r.t. ground truth.
	(classified_right, classified_wrong, missed) = \
			check_objects(all_real_pages, recut_pos_pages)

	print 'right {0} wrong {1} missed {2}'.format(len(classified_right), len(classified_wrong), len(missed))

	def which_type(obj):
		"""Map obj.type (a MIME-like string) to a coarse category name."""
		subtype_re = {
			r'.*(jpeg|jpg|gif|png|bmp|ppm|pgm|pbm|pnm|tiff|exif|cgm|svg).*': 'image',
			r'.*(flash|flv).*': 'flash',
			r'.*(css).*': 'css',
			r'.*(javascript|js).*': 'js',
			r'.*(html|htm).*': 'html',
		}
		# NOTE(review): dict iteration order is arbitrary in Python 2, so a
		# type string matching several patterns may classify nondeterministically.
		if obj.type != None:
			for regex in subtype_re.keys():
				if re.match(re.compile(regex, re.I), obj.type):
					return subtype_re[regex]
				else:
					continue
		return 'others'

	def stat(objects):
		"""Print per-category object counts for the given object list."""
		html = 0
		js = 0
		css = 0
		flash = 0
		image = 0
		others = 0
		for obj in objects:
			objtype = which_type(obj)
			if objtype == 'html':
				html += 1
			elif objtype == 'js':
				js += 1
			elif objtype =='css':
				css += 1
			elif objtype == 'flash':
				flash += 1
			elif objtype == 'image':
				image += 1
			elif objtype == 'others':
				others += 1
		print 'html {0} js {1} css {2} flash {3} image {4} others{5}'.\
				format(html, js, css, flash,image,others)

	# Per-type breakdown for each classification bucket.
	stat(classified_right)
	stat(classified_wrong)
	stat(missed)
Example No. 7
0
def main():
    """Dump ground-truth page times, re-cut pages with SVM predictions, and
    print object-classification statistics.

    Python 2 script (uses print statements).
    """
    parser = argparse.ArgumentParser(
        description=
        'Extracting features as the input of LIBSVM from log. Output = logfile.instance'
    )
    parser.add_argument('harfolder', type=str, help='')
    parser.add_argument('predictfile', type=str, help='')

    args = parser.parse_args()
    harfolder = args.harfolder
    predicted_file = args.predictfile

    # Ground truth: page trees plus the flat list of every object.
    (all_real_pages, all_objects) = har.parse_pages_har(harfolder)
    valid_urls = [i.root.url for i in all_real_pages]

    # Detach each object from its tree so pages can be re-cut from scratch.
    for node in all_objects:
        node.bpointer = None
        node.fpointer = []

    # Dump the ground-truth page duration (seconds) of every real page.
    dumpfile = 'pagetime_gt.txt'
    ofile = open(dumpfile, 'wb')
    for page in all_real_pages:
        ofile.write(str(page.total_seconds()) + '\n')
    ofile.close()

    # Pages re-cut from the SVM predictions (all positives, true positives).
    (recut_pos_pages, recut_tp_pages) = get_svm_pages(all_objects, valid_urls,
                                                      predicted_file)

    # page timings
    # pagetimings = [str(i.total_seconds()) for i in recut_pos_pages if i.total_seconds() > 0]
    # ofile = open('pagetime_svm_pos.txt', 'wb')
    # ofile.write('\n'.join(pagetimings))
    # ofile.close()

    # pagetimings = [str(i.total_seconds()) for i in recut_tp_pages if i.total_seconds() > 0]
    # ofile = open('pagetime_svm_tp.txt', 'wb')
    # ofile.write('\n'.join(pagetimings))
    # ofile.close()

    # objects status: split into right / wrong / missed w.r.t. ground truth.
    (classified_right, classified_wrong, missed) = \
      check_objects(all_real_pages, recut_pos_pages)

    print 'right {0} wrong {1} missed {2}'.format(len(classified_right),
                                                  len(classified_wrong),
                                                  len(missed))

    def which_type(obj):
        """Map obj.type (a MIME-like string) to a coarse category name."""
        subtype_re = {
            r'.*(jpeg|jpg|gif|png|bmp|ppm|pgm|pbm|pnm|tiff|exif|cgm|svg).*':
            'image',
            r'.*(flash|flv).*': 'flash',
            r'.*(css).*': 'css',
            r'.*(javascript|js).*': 'js',
            r'.*(html|htm).*': 'html',
        }
        # NOTE(review): dict iteration order is arbitrary in Python 2, so a
        # type string matching several patterns may classify nondeterministically.
        if obj.type != None:
            for regex in subtype_re.keys():
                if re.match(re.compile(regex, re.I), obj.type):
                    return subtype_re[regex]
                else:
                    continue
        return 'others'

    def stat(objects):
        """Print per-category object counts for the given object list."""
        html = 0
        js = 0
        css = 0
        flash = 0
        image = 0
        others = 0
        for obj in objects:
            objtype = which_type(obj)
            if objtype == 'html':
                html += 1
            elif objtype == 'js':
                js += 1
            elif objtype == 'css':
                css += 1
            elif objtype == 'flash':
                flash += 1
            elif objtype == 'image':
                image += 1
            elif objtype == 'others':
                others += 1
        print 'html {0} js {1} css {2} flash {3} image {4} others{5}'.\
          format(html, js, css, flash,image,others)

    # Per-type breakdown for each classification bucket.
    stat(classified_right)
    stat(classified_wrong)
    stat(missed)