def matchTreesFromDomainWithScriptsFromURLListS2(domain, url_list_path): treedict = getTreesForDomainFromDB(domain) if treedict == None or len(treedict) == 0: print "failed to fetch trees for domain ", domain return None, None passed_sc = [] failed_sc = [] f = open(url_list_path) for line in f: url = line.strip() print "process url "+url hosts, inlines = fetchScripts(url) if inlines==None or len(inlines) ==0: print "no inlines for "+url continue for inline in inlines: passed, failed = matchTreesFromDomainWithScript(domain, inline, treedict) if passed == None: print "failed for inline [S] ", inline[:100],' [E]' else: passed_sc += passed failed_sc += failed rate = float(len(passed_sc))/float(len(passed_sc)+len(failed_sc)) print "passed %d; failed: %d; rate:%f" %(len(passed_sc), len(failed_sc), rate) print "match details : ", str(global_count)
def main():
    """Run the DOM analyzer over a saved page and log how long it takes.

    Reads the HTML file named by ``sys.argv[1]``, parses it with BeautifulSoup
    (``html5lib`` first, ``lxml`` if that raises) and runs
    ``DOMAnalyzer.process()`` on it, using the trees fetched for the domain of
    the hard-coded URL. Timings are written to ``logger`` at debug level.
    Finally a script tag pointing at the client library is inserted into the
    head of the analyzer's soup.
    """
    t1 = time.time()
    url = 'https://www.cnn.com'
    # NOTE(review): the extractor built here is discarded, so it has no visible
    # effect on the extract() call below -- confirm whether it was meant to be
    # the extractor actually used.
    tldextract.TLDExtract(suffix_list_url=False)
    o = tldextract.extract(url)
    domain = o.domain + '.' + o.suffix
    trees = {domain: getTreesForDomainFromDB(domain)}

    # Close the input file as soon as its contents have been read.
    with open(sys.argv[1]) as page_file:
        contents = page_file.read()
    logger.debug('url:%s domain:%s' %(url, domain))

    t2 = time.time()
    try:
        soup = BeautifulSoup(contents, "html5lib")
    except Exception:
        # Best-effort fallback: retry with the second parser on any failure.
        soup = BeautifulSoup(contents, 'lxml')
    analyzer = DOMAnalyzer(
        soup,
        'https://localhost:4433/allowed-resources/',
        './js_repository/',
        trees,
        url)
    analyzer.process()
    t3 = time.time()
    logger.debug("time difference: DOM:%f, whole:%f" %((t3-t2), (t3-t1)))

    new_tag = soup.new_tag("script", src="https://localhost:4433/libs/client_lib.js")
    analyzer.soup.head.insert(1, new_tag)
def matchScriptWithDomainTemplate(domain, script, treedict=None): if treedict == None: treedict = getTreesForDomainFromDB(domain) if treedict == None or len(treedict) == 0: print "failed to fetch trees for domain ", domain return None, None #print "fetched %d trees for domain" %(len(treedict)) is_json = False rs, sc = analyzeJSCodesFinerBlock(script) if rs == None: rs = analyzeJSON(script) is_json = True if rs == None: print "no script nor json" return [], [] allowed_sc = [] failed_sc = [] t1 = time() if is_json: tree = TemplateTree(rs, None) if simpleCompare(treedict, tree): #if compare(treedict, tree): allowed_sc.append(rs) print "JSON allowed " else: failed_sc.append(rs) print "JSON failed " else: print "generate %d subtrees for target script" % (len(rs)) for index in range(len(rs)): seq = rs[index] tree = TemplateTree(seq, None) key = tree.key #if simpleCompare(treedict, tree): if compare(treedict, tree): allowed_sc.append(sc[index]) else: failed_sc.append(sc[index]) print "allowed %d blocks, failed %d blocks" % (len(allowed_sc), len(failed_sc)) t2 = time() total_time = t2 - t1 total_size = len(allowed_sc) + len(failed_sc) if total_size != 0: avg_time = total_time / total_size print "MATCH_TIME: %f " % (avg_time) return allowed_sc, failed_sc
def matchScriptWithDomainTemplate(domain, script, treedict = None): if treedict == None: treedict = getTreesForDomainFromDB(domain) if treedict == None or len(treedict) == 0: print "failed to fetch trees for domain ", domain return None, None #print "fetched %d trees for domain" %(len(treedict)) is_json = False rs, sc = analyzeJSCodesFinerBlock(script) if rs == None: rs = analyzeJSON(script) is_json = True if rs == None: print "no script nor json" return [], [] allowed_sc = [] failed_sc = [] t1 = time() if is_json: tree = TemplateTree(rs, None) if simpleCompare(treedict, tree): #if compare(treedict, tree): allowed_sc.append(rs) print "JSON allowed " else: failed_sc.append(rs) print "JSON failed " else: print "generate %d subtrees for target script" %(len(rs)) for index in range(len(rs)): seq = rs[index] tree = TemplateTree(seq, None) key = tree.key #if simpleCompare(treedict, tree): if compare(treedict, tree): allowed_sc.append(sc[index]) else: failed_sc.append(sc[index]) print "allowed %d blocks, failed %d blocks" %(len(allowed_sc), len(failed_sc)) t2 = time() total_time = t2 - t1 total_size = len(allowed_sc) + len(failed_sc) if total_size != 0: avg_time = total_time / total_size print "MATCH_TIME: %f " %(avg_time) return allowed_sc, failed_sc
def matchScriptsFromURLFileWithDomainTemplate(domain, url_list_path): treedict = getTreesForDomainFromDB(domain) if treedict == None or len(treedict) == 0: print "failed to fetch trees for domain ", domain return None, None passed_sc = [] failed_sc = [] passed_dict = {} failed_dict = {} f_pass = open('passlist', 'w') f_empty = open('emptylist', 'w') f_fail = open('faillist', 'w') f = open(url_list_path) for line in f: url = line.strip() print "process url " + url hosts, inlines = fetchScripts(url) if inlines == None or len(inlines) == 0: print "no inlines for " + url f_empty.write(url + '\n') continue for inline in inlines: passed, failed = matchScriptWithDomainTemplate( domain, inline, treedict) if len(failed) == 0: for fa in passed: passed_dict[fa] = 1 f_pass.write(fa + '\n') else: for fa in failed: failed_dict[fa] = 1 f_fail.write(fa + '\n') if passed == None: print "failed for inline [S] ", inline[:100], ' [E]' else: passed_sc += passed failed_sc += failed rate = float(len(passed_sc)) / float(len(passed_sc) + len(failed_sc)) rate2 = float( len(passed_dict)) / float(len(passed_dict) + len(failed_dict)) print "passed %d; failed: %d; rate:%f" % (len(passed_dict), len(failed_dict), rate2) print "passed %d; failed: %d; rate:%f" % (len(passed_sc), len(failed_sc), rate) print "match details : ", str(global_count)
def matchTreesFromDomainWithScript(domain, script, treedict = None): if treedict == None: treedict = getTreesForDomainFromDB(domain) if treedict == None or len(treedict) == 0: print "failed to fetch trees for domain ", domain return None, None #print "fetched %d trees for domain" %(len(treedict)) is_json = False rs, sc = analyzeJSCodesFinerBlock(script) if rs == None: rs = analyzeJSON(script) is_json = True if rs == None: print "no script nor json" return [], [] allowed_sc = [] failed_sc = [] if is_json: tree = TemplateTree(rs, None) #if simpleCompare(treedict, tree): if compare(treedict, tree): allowed_sc.append(rs) print "JSON allowed " else: failed_sc.append(rs) print "JSON failed " else: print "generate %d subtrees for target script" %(len(rs)) for index in range(len(rs)): seq = rs[index] tree = TemplateTree(seq, None) key = tree.key if simpleCompare(treedict, tree): #if compare(treedict, tree): allowed_sc.append(sc[index]) else: failed_sc.append(sc[index]) print "allowed %d blocks, failed %d blocks" %(len(allowed_sc), len(failed_sc)) return allowed_sc, failed_sc
def matchScriptsFromURLFileWithDomainTemplate(domain, url_list_path): treedict = getTreesForDomainFromDB(domain) if treedict == None or len(treedict) == 0: print "failed to fetch trees for domain ", domain return None, None passed_sc = [] failed_sc = [] passed_dict = {} failed_dict = {} f_pass = open('passlist','w') f_empty = open('emptylist','w') f_fail = open('faillist','w') f = open(url_list_path) for line in f: url = line.strip() print "process url "+url hosts, inlines = fetchScripts(url) if inlines==None or len(inlines) ==0: print "no inlines for "+url f_empty.write(url+'\n') continue for inline in inlines: passed, failed = matchScriptWithDomainTemplate(domain, inline, treedict) if len(failed) == 0: for fa in passed: passed_dict[fa] = 1 f_pass.write(fa+'\n') else: for fa in failed: failed_dict[fa] = 1 f_fail.write(fa+'\n') if passed == None: print "failed for inline [S] ", inline[:100],' [E]' else: passed_sc += passed failed_sc += failed rate = float(len(passed_sc))/float(len(passed_sc)+len(failed_sc)) rate2 = float(len(passed_dict))/float(len(passed_dict)+len(failed_dict)) print "passed %d; failed: %d; rate:%f" %(len(passed_dict), len(failed_dict), rate2) print "passed %d; failed: %d; rate:%f" %(len(passed_sc), len(failed_sc), rate) print "match details : ", str(global_count)
def completeMatchTreesFromDomainWithScriptsFromURLList(domain, url_list_path): treedict = getTreesForDomainFromDB(domain) if treedict == None or len(treedict) == 0: print "failed to fetch trees for domain ", domain return print "fetched %d trees for domain" %(len(treedict)) scriptdict, count_dict, json_count = extractScriptsAndGenerateASTNodesFromURLListFinerBlock(url_list_path) match_script = 0 match_uniq_script = 0 nonmatch_script = 0 nonmatch_uniq_script = 0 nonmatch_tree = 0 nomatch_list = [] for key in scriptdict: if key in treedict: flag = True for item in scriptdict[key]: target_tree = item[2] if not treedict[key].match(target_tree): print "Matching failure " print " template_tree: %s " %(treedict[key].debug()) print " target_tree: %s " %(target_tree.debug()) nonmatch_uniq_script += len(scriptdict[key]) nonmatch_script += count_dict[key] flag = False break if flag: match_uniq_script += len(scriptdict[key]) match_script += count_dict[key] else: nonmatch_uniq_script += len(scriptdict[key]) nonmatch_script += count_dict[key] nonmatch_tree += 1 print "non match script: %s " %(scriptdict[key][0][0]) print "matched scripts:%d[%d] \n nonmatched scripts:%d[%d] nonmatch_tree:%d" \ %(match_uniq_script,match_script, nonmatch_uniq_script, nonmatch_script, nonmatch_tree)
def matchTreesFromDomainWithScriptsFromURLList(domain, url_list_path): treedict = getTreesForDomainFromDB(domain) if treedict == None or len(treedict) == 0: print "failed to fetch trees for domain ", domain return print "fetched %d trees for domain" %(len(treedict)) scriptdict, count_dict, json_count = extractScriptsAndGenerateASTNodesFromURLListFinerBlock(url_list_path) match_script = 0 match_uniq_script = 0 nonmatch_script = 0 nonmatch_uniq_script = 0 nonmatch_tree = 0 nomatch_list = [] for key in scriptdict: if key in treedict: match_uniq_script += len(scriptdict[key]) match_script += count_dict[key] else: nonmatch_uniq_script += len(scriptdict[key]) nonmatch_script += count_dict[key] nonmatch_tree += 1 print "non match script: %s " %(scriptdict[key][0][0]) print "matched scripts:%d[%d] \n nonmatched scripts:%d[%d] nonmatch_tree:%d" \ %(match_uniq_script,match_script, nonmatch_uniq_script, nonmatch_script, nonmatch_tree)