def _parseHTML(contents):
    """Parse *contents* with the html5lib tree builder and return the soup.

    If the first ``BeautifulSoup`` call raises, the error is printed and the
    parse is retried once on ``contents.decode('utf-8')``. An exception raised
    by that retry is not caught and propagates to the caller.
    """
    try:
        return BeautifulSoup(contents, "html5lib")
    except Exception as e:
        # Deliberately broad: any parser failure triggers the single
        # decode-and-retry attempt below. The message (including the double
        # space) matches what the former print statement emitted.
        print("Error parsing DOM using html5  %s" % str(e))
        return BeautifulSoup(contents.decode('utf-8'), "html5lib")


def calcTwoHTMLDistance(contents1, contents2):
    """Compare two HTML documents and return the result of ``mmdiffR``.

    Both inputs are parsed with html5lib, each ``<html>`` element is walked
    into a fresh ``Node("doc")`` tree via ``traverseDOMTree``, and each tree is
    converted with ``getLDPairRepr`` into a triple of
    ``(ld, script_hosts, script_contents)``. ``mmdiff`` is run on the two
    ``ld`` values and its result, together with both triples, is handed to
    ``mmdiffR``.

    Args:
        contents1: Markup of the first document.
        contents2: Markup of the second document.
            NOTE(review): the ``.decode('utf-8')`` fallback suggests byte
            strings are expected -- confirm against callers.

    Returns:
        Whatever ``mmdiffR`` returns (presumably the distance between the two
        documents, going by this function's name -- TODO confirm).
    """
    # Parse both documents before building any tree so that parse errors for
    # either input surface before traversal starts.
    soup1 = _parseHTML(contents1)
    soup2 = _parseHTML(contents2)

    node1 = Node("doc")
    node2 = Node("doc")
    traverseDOMTree(soup1.html, node1, 0)
    traverseDOMTree(soup2.html, node2, 0)

    ld1, ld1_script_hosts, ld1_script_contents = getLDPairRepr(node1)
    ld2, ld2_script_hosts, ld2_script_contents = getLDPairRepr(node2)
    print("script length for ld1: %d %d " % (
        len(ld1_script_hosts), len(ld1_script_contents)))
    print("script length for ld2: %d %d " % (
        len(ld2_script_hosts), len(ld2_script_contents)))

    D = mmdiff(ld1, ld2)
    return mmdiffR(ld1, ld2, D,
                   ld1_script_hosts, ld1_script_contents,
                   ld2_script_hosts, ld2_script_contents)
def extractScriptFromContents(contents):
    """Extract script hosts and script contents from an HTML document.

    The markup is parsed with html5lib, its ``<html>`` element is walked into
    a fresh ``Node("doc")`` tree via ``traverseDOMTree``, and that tree is
    passed to ``extractScriptFromDOMTree``.

    Args:
        contents: Markup of the document. ``None`` or an empty value
            short-circuits without parsing.
            NOTE(review): the ``.decode('utf-8')`` fallback suggests a byte
            string is expected -- confirm against callers.

    Returns:
        A ``(script_hosts, script_contents)`` tuple as produced by
        ``extractScriptFromDOMTree``, or ``(None, None)`` when *contents* is
        ``None`` or empty.
    """
    # Identity check for None; the length check keeps rejecting non-sized
    # inputs with a TypeError as before.
    if contents is None or len(contents) == 0:
        return None, None

    try:
        soup = BeautifulSoup(contents, "html5lib")
    except Exception as e:
        # Deliberately broad: any parser failure triggers one retry on the
        # UTF-8-decoded input. A failure of the retry propagates. The message
        # (including the double space) matches the former print statement.
        print("Error parsing DOM using html5  %s" % str(e))
        soup = BeautifulSoup(contents.decode('utf-8'), "html5lib")

    node = Node("doc")
    traverseDOMTree(soup.html, node, 0)
    script_hosts, script_contents = extractScriptFromDOMTree(node)
    return script_hosts, script_contents
def extractScriptFromContents(contents):
    """Return the script hosts and script contents found in an HTML document.

    Steps: parse *contents* with html5lib, walk the resulting ``<html>``
    element into a new ``Node("doc")`` tree with ``traverseDOMTree``, then let
    ``extractScriptFromDOMTree`` collect the two results from that tree.

    Args:
        contents: Markup of the document. ``None`` or an empty value is
            answered immediately without invoking the parser.
            NOTE(review): presumably a byte string, given the
            ``.decode('utf-8')`` retry -- verify against callers.

    Returns:
        ``(script_hosts, script_contents)`` from ``extractScriptFromDOMTree``,
        or ``(None, None)`` when there is nothing to parse.
    """
    # ``is None`` rather than ``== None`` so a custom ``__eq__`` on the input
    # cannot influence the check.
    if contents is None or len(contents) == 0:
        return None, None

    try:
        soup = BeautifulSoup(contents, "html5lib")
    except Exception as e:
        # Any failure of the first parse is reported and followed by a single
        # retry on the UTF-8-decoded input; errors from the retry are not
        # caught. The text printed is the same as before, double space
        # included.
        print("Error parsing DOM using html5  %s" % str(e))
        soup = BeautifulSoup(contents.decode('utf-8'), "html5lib")

    node = Node("doc")
    traverseDOMTree(soup.html, node, 0)
    script_hosts, script_contents = extractScriptFromDOMTree(node)
    return script_hosts, script_contents
def _parseHTML(contents):
    """Return a html5lib-backed ``BeautifulSoup`` object for *contents*.

    When the first parse attempt raises, the error is printed and one retry is
    made on ``contents.decode('utf-8')``. Exceptions from the retry are left
    uncaught.
    """
    try:
        return BeautifulSoup(contents, "html5lib")
    except Exception as e:
        # Broad on purpose so that every parser failure gets the one
        # decode-and-retry attempt. Output text is unchanged from the former
        # two-argument print statement (note the double space).
        print("Error parsing DOM using html5  %s" % str(e))
        return BeautifulSoup(contents.decode('utf-8'), "html5lib")


def calcTwoHTMLDistance(contents1, contents2):
    """Run the ``mmdiff`` / ``mmdiffR`` comparison on two HTML documents.

    Each input is parsed with html5lib and its ``<html>`` element is walked
    into a new ``Node("doc")`` tree by ``traverseDOMTree``. ``getLDPairRepr``
    turns each tree into ``(ld, script_hosts, script_contents)``. The two
    ``ld`` values go through ``mmdiff``, and ``mmdiffR`` receives both ``ld``
    values, the ``mmdiff`` result and the four script lists.

    Args:
        contents1: Markup of the first document.
        contents2: Markup of the second document.
            NOTE(review): presumably byte strings, given the
            ``.decode('utf-8')`` retry -- verify against callers.

    Returns:
        The value returned by ``mmdiffR`` (by this function's name, presumably
        a distance between the documents -- TODO confirm).
    """
    # Both parses happen first, then both traversals, so any parse error is
    # raised before a tree is built.
    soup1 = _parseHTML(contents1)
    soup2 = _parseHTML(contents2)

    node1 = Node("doc")
    node2 = Node("doc")
    traverseDOMTree(soup1.html, node1, 0)
    traverseDOMTree(soup2.html, node2, 0)

    ld1, ld1_script_hosts, ld1_script_contents = getLDPairRepr(node1)
    ld2, ld2_script_hosts, ld2_script_contents = getLDPairRepr(node2)
    print("script length for ld1: %d %d " % (
        len(ld1_script_hosts), len(ld1_script_contents)))
    print("script length for ld2: %d %d " % (
        len(ld2_script_hosts), len(ld2_script_contents)))

    D = mmdiff(ld1, ld2)
    # Arguments sit inside the call's parentheses, so no backslash
    # continuation is needed.
    return mmdiffR(ld1, ld2, D,
                   ld1_script_hosts, ld1_script_contents,
                   ld2_script_hosts, ld2_script_contents)