Example #1
0
def matchTreesFromDomainWithScriptsFromURLListS2(domain, url_list_path):
  treedict = getTreesForDomainFromDB(domain)
  if treedict == None or len(treedict) == 0:
    print "failed to fetch trees for domain ", domain
    return None, None
  passed_sc = []
  failed_sc = []
  f = open(url_list_path)
  for line in f:
    url = line.strip()
    print "process url "+url
    hosts, inlines = fetchScripts(url)
    if inlines==None or len(inlines) ==0:
      print "no inlines for "+url
      continue
    for inline in inlines:
      passed, failed = matchTreesFromDomainWithScript(domain, inline, treedict)
      if passed == None:
        print "failed for inline [S] ", inline[:100],' [E]'
      else:
        passed_sc += passed
        failed_sc += failed
  rate = float(len(passed_sc))/float(len(passed_sc)+len(failed_sc))
  print "passed %d; failed: %d; rate:%f" %(len(passed_sc), len(failed_sc), rate)
  print "match details : ", str(global_count)
Example #2
0
def main():
	"""Run DOMAnalyzer over a saved HTML page and inject the client library tag.

	The page is read from the path given as the first command-line argument
	and analysed as if it had been served from the hard-coded ``url``. Elapsed
	times for the DOM phase and for the whole run are logged at debug level.
	After processing, a ``<script>`` tag pointing at the client library is
	inserted into the document head at index 1.
	"""
	t1 = time.time()
	url = 'https://www.cnn.com'
	# Use the configured extractor. The original built this instance and
	# discarded it, so tldextract.extract() ran with the module defaults.
	# NOTE(review): suffix_list_url=False presumably disables the remote
	# suffix-list download -- confirm against the installed tldextract version.
	extractor = tldextract.TLDExtract(suffix_list_url=False)
	o = extractor(url)
	domain = o.domain + '.' + o.suffix
	domain_trees = getTreesForDomainFromDB(domain)
	trees = {domain: domain_trees}
	# Close the input file promptly instead of leaking the handle.
	with open(sys.argv[1]) as page:
		contents = page.read()
	logger.debug('url:%s domain:%s' %(url, domain))
	t2 = time.time()
	try:
		soup = BeautifulSoup(contents, "html5lib")
	except Exception:
		# Fall back to lxml whenever building the html5lib soup fails.
		soup = BeautifulSoup(contents, 'lxml')
	analyzer = DOMAnalyzer(soup,
		'https://localhost:4433/allowed-resources/', './js_repository/', trees, url)
	analyzer.process()
	t3 = time.time()
	logger.debug("time difference: DOM:%f, whole:%f" %((t3-t2), (t3-t1)))
	new_tag = soup.new_tag("script", src="https://localhost:4433/libs/client_lib.js")
	analyzer.soup.head.insert(1, new_tag)
Example #3
0
def matchScriptWithDomainTemplate(domain, script, treedict=None):
    if treedict == None:
        treedict = getTreesForDomainFromDB(domain)
    if treedict == None or len(treedict) == 0:
        print "failed to fetch trees for domain ", domain
        return None, None
    #print "fetched %d trees for domain" %(len(treedict))

    is_json = False
    rs, sc = analyzeJSCodesFinerBlock(script)
    if rs == None:
        rs = analyzeJSON(script)
        is_json = True
    if rs == None:
        print "no script nor json"
        return [], []

    allowed_sc = []
    failed_sc = []

    t1 = time()
    if is_json:
        tree = TemplateTree(rs, None)
        if simpleCompare(treedict, tree):
            #if compare(treedict, tree):
            allowed_sc.append(rs)
            print "JSON allowed "
        else:
            failed_sc.append(rs)
            print "JSON failed "
    else:
        print "generate %d subtrees for target script" % (len(rs))
        for index in range(len(rs)):
            seq = rs[index]
            tree = TemplateTree(seq, None)
            key = tree.key

            #if simpleCompare(treedict, tree):
            if compare(treedict, tree):
                allowed_sc.append(sc[index])
            else:
                failed_sc.append(sc[index])

        print "allowed %d blocks, failed %d blocks" % (len(allowed_sc),
                                                       len(failed_sc))
    t2 = time()
    total_time = t2 - t1
    total_size = len(allowed_sc) + len(failed_sc)
    if total_size != 0:
        avg_time = total_time / total_size
        print "MATCH_TIME: %f " % (avg_time)
    return allowed_sc, failed_sc
Example #4
0
def matchScriptWithDomainTemplate(domain, script, treedict = None):
  if treedict == None:
    treedict = getTreesForDomainFromDB(domain)
  if treedict == None or len(treedict) == 0:
    print "failed to fetch trees for domain ", domain
    return None, None
  #print "fetched %d trees for domain" %(len(treedict))
  
  is_json = False
  rs, sc = analyzeJSCodesFinerBlock(script)
  if rs == None:
    rs = analyzeJSON(script)
    is_json = True
  if rs == None:
    print "no script nor json"
    return [], []

  allowed_sc = []
  failed_sc = []

  t1 = time()
  if is_json:
    tree = TemplateTree(rs, None)
    if simpleCompare(treedict, tree):
    #if compare(treedict, tree):
      allowed_sc.append(rs)
      print "JSON allowed "
    else:
      failed_sc.append(rs)
      print "JSON failed "
  else:
    print "generate %d subtrees for target script" %(len(rs))
    for index in range(len(rs)):
      seq = rs[index]
      tree = TemplateTree(seq, None)
      key = tree.key

      #if simpleCompare(treedict, tree):
      if compare(treedict, tree):
        allowed_sc.append(sc[index])
      else:
        failed_sc.append(sc[index])

    print "allowed %d blocks, failed %d blocks" %(len(allowed_sc), len(failed_sc))
  t2 = time()
  total_time = t2 - t1
  total_size = len(allowed_sc) + len(failed_sc)
  if total_size != 0:
    avg_time = total_time / total_size
    print "MATCH_TIME: %f " %(avg_time)  
  return allowed_sc, failed_sc
Example #5
0
def matchScriptsFromURLFileWithDomainTemplate(domain, url_list_path):
    treedict = getTreesForDomainFromDB(domain)
    if treedict == None or len(treedict) == 0:
        print "failed to fetch trees for domain ", domain
        return None, None
    passed_sc = []
    failed_sc = []
    passed_dict = {}
    failed_dict = {}
    f_pass = open('passlist', 'w')
    f_empty = open('emptylist', 'w')
    f_fail = open('faillist', 'w')

    f = open(url_list_path)
    for line in f:
        url = line.strip()
        print "process url " + url
        hosts, inlines = fetchScripts(url)
        if inlines == None or len(inlines) == 0:
            print "no inlines for " + url
            f_empty.write(url + '\n')
            continue
        for inline in inlines:
            passed, failed = matchScriptWithDomainTemplate(
                domain, inline, treedict)
            if len(failed) == 0:
                for fa in passed:
                    passed_dict[fa] = 1
                    f_pass.write(fa + '\n')
            else:
                for fa in failed:
                    failed_dict[fa] = 1
                    f_fail.write(fa + '\n')
            if passed == None:
                print "failed for inline [S] ", inline[:100], ' [E]'
            else:
                passed_sc += passed
                failed_sc += failed
    rate = float(len(passed_sc)) / float(len(passed_sc) + len(failed_sc))
    rate2 = float(
        len(passed_dict)) / float(len(passed_dict) + len(failed_dict))
    print "passed %d; failed: %d; rate:%f" % (len(passed_dict),
                                              len(failed_dict), rate2)
    print "passed %d; failed: %d; rate:%f" % (len(passed_sc), len(failed_sc),
                                              rate)
    print "match details : ", str(global_count)
Example #6
0
def matchTreesFromDomainWithScript(domain, script, treedict = None):
  if treedict == None:
    treedict = getTreesForDomainFromDB(domain)
  if treedict == None or len(treedict) == 0:
    print "failed to fetch trees for domain ", domain
    return None, None
  #print "fetched %d trees for domain" %(len(treedict))
  
  is_json = False
  rs, sc = analyzeJSCodesFinerBlock(script)
  if rs == None:
    rs = analyzeJSON(script)
    is_json = True
  if rs == None:
    print "no script nor json"
    return [], []

  allowed_sc = []
  failed_sc = []

  if is_json:
    tree = TemplateTree(rs, None)
    #if simpleCompare(treedict, tree):
    if compare(treedict, tree):
      allowed_sc.append(rs)
      print "JSON allowed "
    else:
      failed_sc.append(rs)
      print "JSON failed "
  else:
    print "generate %d subtrees for target script" %(len(rs))
    for index in range(len(rs)):
      seq = rs[index]
      tree = TemplateTree(seq, None)
      key = tree.key

      if simpleCompare(treedict, tree):
      #if compare(treedict, tree):
        allowed_sc.append(sc[index])
      else:
        failed_sc.append(sc[index])

    print "allowed %d blocks, failed %d blocks" %(len(allowed_sc), len(failed_sc))
  return allowed_sc, failed_sc
Example #7
0
def matchScriptsFromURLFileWithDomainTemplate(domain, url_list_path):
  treedict = getTreesForDomainFromDB(domain)
  if treedict == None or len(treedict) == 0:
    print "failed to fetch trees for domain ", domain
    return None, None
  passed_sc = []
  failed_sc = []
  passed_dict = {}
  failed_dict = {}
  f_pass = open('passlist','w') 
  f_empty = open('emptylist','w') 
  f_fail = open('faillist','w') 

  f = open(url_list_path)
  for line in f:
    url = line.strip()
    print "process url "+url
    hosts, inlines = fetchScripts(url)
    if inlines==None or len(inlines) ==0:
      print "no inlines for "+url
      f_empty.write(url+'\n')
      continue
    for inline in inlines:
      passed, failed = matchScriptWithDomainTemplate(domain, inline, treedict)
      if len(failed) == 0:
        for fa in passed:
          passed_dict[fa] = 1
          f_pass.write(fa+'\n')
      else:
        for fa in failed:
          failed_dict[fa] = 1
          f_fail.write(fa+'\n')
      if passed == None:
        print "failed for inline [S] ", inline[:100],' [E]'
      else:
        passed_sc += passed
        failed_sc += failed
  rate = float(len(passed_sc))/float(len(passed_sc)+len(failed_sc))
  rate2 = float(len(passed_dict))/float(len(passed_dict)+len(failed_dict))
  print "passed %d; failed: %d; rate:%f" %(len(passed_dict), len(failed_dict), rate2)
  print "passed %d; failed: %d; rate:%f" %(len(passed_sc), len(failed_sc), rate)
  print "match details : ", str(global_count)
Example #8
0
def completeMatchTreesFromDomainWithScriptsFromURLList(domain, url_list_path):
  treedict = getTreesForDomainFromDB(domain)
  if treedict == None or len(treedict) == 0:
    print "failed to fetch trees for domain ", domain
    return
  print "fetched %d trees for domain" %(len(treedict))
  scriptdict, count_dict, json_count = extractScriptsAndGenerateASTNodesFromURLListFinerBlock(url_list_path)
  match_script = 0
  match_uniq_script = 0
  nonmatch_script = 0
  nonmatch_uniq_script = 0
  nonmatch_tree = 0
  nomatch_list = []
  for key in scriptdict:
    if key in treedict:
      flag = True
      for item in scriptdict[key]:
        target_tree = item[2]
        if not treedict[key].match(target_tree):
          print "Matching failure "
          print "  template_tree: %s " %(treedict[key].debug())
          print "  target_tree:   %s " %(target_tree.debug())
          nonmatch_uniq_script += len(scriptdict[key])
          nonmatch_script += count_dict[key]
          flag = False
          break
      if flag:
        match_uniq_script += len(scriptdict[key])
        match_script += count_dict[key]
    else:
      nonmatch_uniq_script += len(scriptdict[key])
      nonmatch_script += count_dict[key]
      nonmatch_tree += 1
      print "non match script: %s " %(scriptdict[key][0][0])
  print "matched scripts:%d[%d] \n nonmatched scripts:%d[%d] nonmatch_tree:%d" \
    %(match_uniq_script,match_script, nonmatch_uniq_script, nonmatch_script, nonmatch_tree)
Example #9
0
def matchTreesFromDomainWithScriptsFromURLList(domain, url_list_path):
  treedict = getTreesForDomainFromDB(domain)
  if treedict == None or len(treedict) == 0:
    print "failed to fetch trees for domain ", domain
    return
  print "fetched %d trees for domain" %(len(treedict))
  scriptdict, count_dict, json_count = extractScriptsAndGenerateASTNodesFromURLListFinerBlock(url_list_path)
  match_script = 0
  match_uniq_script = 0
  nonmatch_script = 0
  nonmatch_uniq_script = 0
  nonmatch_tree = 0
  nomatch_list = []
  for key in scriptdict:
    if key in treedict:
      match_uniq_script += len(scriptdict[key])
      match_script += count_dict[key]
    else:
      nonmatch_uniq_script += len(scriptdict[key])
      nonmatch_script += count_dict[key]
      nonmatch_tree += 1
      print "non match script: %s " %(scriptdict[key][0][0])
  print "matched scripts:%d[%d] \n nonmatched scripts:%d[%d] nonmatch_tree:%d" \
    %(match_uniq_script,match_script, nonmatch_uniq_script, nonmatch_script, nonmatch_tree)