Ejemplo n.º 1
0
def arrayToDict(arr):
  """Group the elements of an array into a dict of value lists.

  Each element of arr is handled by type:
    * strings are collected under the synthetic key 'basestring_';
    * lists are flattened with extractArrayValues, converted recursively
      and handed to mergeTwoArrayDict together with the running result;
    * for dicts, every value is appended to the list stored under its key.
  Elements of any other type are ignored.

  Returns a dict mapping key -> list of values.  An empty dict is returned
  when arr is None.  If processing raises, the error is reported through
  displayErrorMsg and whatever was collected up to that point is returned.
  """
  rs = {}
  if arr is None:
    # Parenthesised single-argument form prints the same text on Python 2.
    print("ERROR: arr is NULL in arrayToDict")
    return rs
  try:
    for obj in arr:
      if isinstance(obj, basestring):
        rs.setdefault('basestring_', []).append(obj)
      elif isinstance(obj, list):
        subrs = arrayToDict(extractArrayValues(obj))
        mergeTwoArrayDict(rs, subrs)
      elif isinstance(obj, dict):
        for k in obj:
          rs.setdefault(k, []).append(obj[k])
  except Exception as e:
    displayErrorMsg('arrayToDict', str(e) + ' ' + str(arr))
  # Fix: the function used to fall off the end here and implicitly return
  # None, so the collected dict was lost for every non-None input.
  return rs
Ejemplo n.º 2
0
def extractObjectValues(data):
  """Collect the values of a dict into a key -> list-of-values dict.

  For every key of data:
    * a dict value is converted recursively and handed to
      mergeTwoArrayDict together with the running result;
    * a list/tuple value is flattened with extractArrayValues; dict items
      of the flattened list are handed to mergeTwoArrayDict, string/int/
      float items are appended to the key's list, anything else is
      reported through displayErrorMsg;
    * a string/int/float value is appended to the key's list;
    * any other value type is reported through displayErrorMsg.

  Keys whose list is still empty at the end are removed.  Returns {} when
  data is None.
  """
  if data is None:
    return {}

  collected = {}  # key : [val1, val2]
  for name in data:
    collected.setdefault(name, [])
    value = data[name]
    if isinstance(value, dict):
      mergeTwoArrayDict(collected, extractObjectValues(value))
    elif isinstance(value, (list, tuple)):
      for element in extractArrayValues(value):
        if isinstance(element, dict):
          mergeTwoArrayDict(collected, element)
        elif isinstance(element, (basestring, int, float)):
          collected[name].append(element)
        else:
          displayErrorMsg('extractObjectValues',
                          "shouldn't have other types" + str(type(element)))
    elif isinstance(value, (basestring, int, float)):
      collected[name].append(value)
    else:
      displayErrorMsg('extractObjectValues',
                      "unknown type:" + str(type(value)))

  # Drop the keys that never received a value.
  for empty_key in [k for k in collected if len(collected[k]) == 0]:
    del collected[empty_key]
  return collected
Ejemplo n.º 3
0
def extractObjectValues(data):
    """Collect the values of a dict into a key -> list-of-values dict.

    For every key of data:
      * a dict value is converted recursively and handed to
        mergeTwoArrayDict together with the running result;
      * a list/tuple value is flattened with extractArrayValues; dict
        items of the flattened list are handed to mergeTwoArrayDict,
        string/int/float items are appended to the key's list, anything
        else is reported through displayErrorMsg;
      * a string/int/float value is appended to the key's list;
      * any other value type is reported through displayErrorMsg.

    Keys whose list is still empty at the end are removed.  Returns {}
    when data is None.
    """
    if data is None:
        return {}

    collected = {}  # key : [val1, val2]
    for name in data:
        collected.setdefault(name, [])
        value = data[name]
        if isinstance(value, dict):
            mergeTwoArrayDict(collected, extractObjectValues(value))
        elif isinstance(value, (list, tuple)):
            for element in extractArrayValues(value):
                if isinstance(element, dict):
                    mergeTwoArrayDict(collected, element)
                elif isinstance(element, (basestring, int, float)):
                    collected[name].append(element)
                else:
                    displayErrorMsg(
                        'extractObjectValues',
                        "shouldn't have other types" + str(type(element)))
        elif isinstance(value, (basestring, int, float)):
            collected[name].append(value)
        else:
            displayErrorMsg('extractObjectValues',
                            "unknown type:" + str(type(value)))

    # Drop the keys that never received a value.
    for empty_key in [k for k in collected if len(collected[k]) == 0]:
        del collected[empty_key]
    return collected
Ejemplo n.º 4
0
def arrayToDict(arr):
    """Group the elements of an array into a dict of value lists.

    Each element of arr is handled by type:
      * strings are collected under the synthetic key 'basestring_';
      * lists are flattened with extractArrayValues, converted recursively
        and handed to mergeTwoArrayDict together with the running result;
      * for dicts, every value is appended to the list stored under its
        key.
    Elements of any other type are ignored.

    Returns a dict mapping key -> list of values.  An empty dict is
    returned when arr is None.  If processing raises, the error is reported
    through displayErrorMsg and whatever was collected up to that point is
    returned.
    """
    rs = {}
    if arr is None:
        # Parenthesised single-argument form prints the same text on
        # Python 2.
        print("ERROR: arr is NULL in arrayToDict")
        return rs
    try:
        for obj in arr:
            if isinstance(obj, basestring):
                rs.setdefault('basestring_', []).append(obj)
            elif isinstance(obj, list):
                subrs = arrayToDict(extractArrayValues(obj))
                mergeTwoArrayDict(rs, subrs)
            elif isinstance(obj, dict):
                for k in obj:
                    rs.setdefault(k, []).append(obj[k])
    except Exception as e:
        displayErrorMsg('arrayToDict', str(e) + ' ' + str(arr))
    # Fix: the function used to fall off the end here and implicitly return
    # None, so the collected dict was lost for every non-None input.
    return rs
Ejemplo n.º 5
0
    def match(self, target_tree):
        """Check whether target_tree conforms to this template tree.

        The trees must have equal keys and equal types.  Two trees of type
        'json' match as soon as their keys are equal.  Otherwise every node
        of target_tree (indexed 0..len(target_tree.nodes)-1) is compared
        with the node at the same index of this tree:
          * the tags must be equal;
          * a 'String' value must be accepted by self.string_types[str(i)];
          * for 'Object' / 'Array' nodes the value is converted with
            extractObjectValues (arrays go through arrayToDict first) and
            every resulting key must exist in self.object_types[str(i)] /
            self.array_types[str(i)] with a pattern accepting each value.
            An 'Array' node whose target value is None is skipped.

        Returns True on a match, False otherwise.  Any exception raised
        while comparing a node is reported through displayErrorMsg and
        treated as a mismatch.
        """
        if not isinstance(target_tree, TemplateTree):
            # Fix: the comma used to sit inside the first string literal, so
            # both literals were concatenated and displayErrorMsg received a
            # single argument instead of (source, message).
            displayErrorMsg(
                'TemplateTree.match',
                "matching tree, target tree should be TemplateTree")
            return False

        if self.key != target_tree.key:
            return False

        if self.type == 'json' and target_tree.type == 'json':
            return True
        elif self.type != target_tree.type:
            return False

        length = len(target_tree.nodes)
        for i in range(length):
            try:
                if self.nodes[i].tag != target_tree.nodes[i].tag:
                    return False
                if self.nodes[i].tag == 'String':
                    if not self.string_types[str(i)].match(
                            target_tree.nodes[i].value):
                        return False
                elif self.nodes[i].tag == 'Object':
                    target_obj = extractObjectValues(
                        target_tree.nodes[i].value)
                    for k in target_obj:
                        if k not in self.object_types[str(i)]:
                            return False
                        if isinstance(target_obj[k], list):
                            for item in target_obj[k]:
                                if not self.object_types[str(i)][k].match(
                                        item):
                                    return False
                        else:
                            if not self.object_types[str(i)][k].match(
                                    target_obj[k]):
                                return False
                elif self.nodes[i].tag == 'Array':
                    if target_tree.nodes[i].value is None:
                        continue
                    target_obj = arrayToDict(target_tree.nodes[i].value)
                    target_obj = extractObjectValues(target_obj)
                    for k in target_obj:
                        if k not in self.array_types[str(i)]:
                            return False
                        if isinstance(target_obj[k], list):
                            for item in target_obj[k]:
                                if not self.array_types[str(i)][k].match(item):
                                    return False
                        else:
                            if not self.array_types[str(i)][k].match(
                                    target_obj[k]):
                                return False
            except Exception as e:
                # Covers e.g. a shorter self.nodes or a missing pattern
                # index: both count as "does not match".
                displayErrorMsg('TemplateTree.match', str(e))
                return False
        return True
Ejemplo n.º 6
0
  def match(self, target_tree):
    """Check whether target_tree conforms to this template tree.

    The trees must have equal keys and equal types.  Two trees of type
    'json' match as soon as their keys are equal.  Otherwise every node of
    target_tree (indexed 0..len(target_tree.nodes)-1) is compared with the
    node at the same index of this tree:
      * the tags must be equal;
      * a 'String' value must be accepted by self.string_types[str(i)];
      * for 'Object' / 'Array' nodes the value is converted with
        extractObjectValues (arrays go through arrayToDict first) and every
        resulting key must exist in self.object_types[str(i)] /
        self.array_types[str(i)] with a pattern accepting each value.
        An 'Array' node whose target value is None is skipped.

    Returns True on a match, False otherwise.  Any exception raised while
    comparing a node is reported through displayErrorMsg and treated as a
    mismatch.
    """
    if not isinstance(target_tree, TemplateTree):
      # Fix: the comma used to sit inside the first string literal, so both
      # literals were concatenated and displayErrorMsg received a single
      # argument instead of (source, message).
      displayErrorMsg('TemplateTree.match',
        "matching tree, target tree should be TemplateTree")
      return False

    if self.key != target_tree.key:
      return False

    if self.type == 'json' and target_tree.type == 'json':
      return True
    elif self.type != target_tree.type:
      return False

    length = len(target_tree.nodes)
    for i in range(length):
      try:
        if self.nodes[i].tag != target_tree.nodes[i].tag:
          return False
        if self.nodes[i].tag == 'String':
          if not self.string_types[str(i)].match(target_tree.nodes[i].value):
            return False
        elif self.nodes[i].tag == 'Object':
          target_obj = extractObjectValues(target_tree.nodes[i].value)
          for k in target_obj:
            if k not in self.object_types[str(i)]:
              return False
            if isinstance(target_obj[k], list):
              for item in target_obj[k]:
                if not self.object_types[str(i)][k].match(item):
                  return False
            else:
              if not self.object_types[str(i)][k].match(target_obj[k]):
                return False
        elif self.nodes[i].tag == 'Array':
          if target_tree.nodes[i].value is None:
            continue
          target_obj = arrayToDict(target_tree.nodes[i].value)
          target_obj = extractObjectValues(target_obj)
          for k in target_obj:
            if k not in self.array_types[str(i)]:
              return False
            if isinstance(target_obj[k], list):
              for item in target_obj[k]:
                if not self.array_types[str(i)][k].match(item):
                  return False
            else:
              if not self.array_types[str(i)][k].match(target_obj[k]):
                return False
      except Exception as e:
        # Covers e.g. a shorter self.nodes or a missing pattern index:
        # both count as "does not match".
        displayErrorMsg('TemplateTree.match', str(e))
        return False
    return True
Ejemplo n.º 7
0
def extractArrayValues(data):
  """Flatten a nested sequence into a single list.

  Strings, ints and floats are kept as they are, nested lists/tuples are
  flattened recursively, and dicts are replaced by the result of
  extractObjectValues.  Items of any other type are reported through
  displayErrorMsg and left out.
  """
  flat = []
  for element in data:
    if isinstance(element, (basestring, int, float)):
      flat.append(element)
    elif isinstance(element, (list, tuple)):
      flat.extend(extractArrayValues(element))
    elif isinstance(element, dict):
      flat.append(extractObjectValues(element))
    else:
      displayErrorMsg('extractArrayValues',
                      "unknown type " + str(type(element)))
  return flat
Ejemplo n.º 8
0
 def dumps(self):
     """Serialize this tree to a JSON string.

     The JSON object carries 'type', 'tree' and the three *_types_str
     attributes.  For type 'json' the nodes are stored as they are; for any
     other type 'tree' is the comma-joined base64 encoding of every node's
     tag.  If anything raises, the error is reported through
     displayErrorMsg and None is returned.
     """
     try:
         if self.type == 'json':
             tree_repr = self.nodes
         else:
             tree_repr = ','.join(b64encode(node.tag) for node in self.nodes)
         payload = {'type': self.type, 'tree': tree_repr}
         payload['string_types_str'] = self.string_types_str
         payload['object_types_str'] = self.object_types_str
         payload['array_types_str'] = self.array_types_str
         return json.dumps(payload)
     except Exception as err:
         displayErrorMsg("TemplateTree.dumps", str(err))
         return None
Ejemplo n.º 9
0
 def dumps(self):
   """Serialize this tree to a JSON string.

   The JSON object carries 'type', 'tree' and the three *_types_str
   attributes.  For type 'json' the nodes are stored as they are; for any
   other type 'tree' is the comma-joined base64 encoding of every node's
   tag.  If anything raises, the error is reported through displayErrorMsg
   and None is returned.
   """
   try:
     if self.type == 'json':
       tree_repr = self.nodes
     else:
       tree_repr = ','.join(b64encode(node.tag) for node in self.nodes)
     payload = {'type': self.type, 'tree': tree_repr}
     payload['string_types_str'] = self.string_types_str
     payload['object_types_str'] = self.object_types_str
     payload['array_types_str'] = self.array_types_str
     return json.dumps(payload)
   except Exception as err:
     displayErrorMsg("TemplateTree.dumps", str(err))
     return None
Ejemplo n.º 10
0
def extractArrayValues(data):
    """Flatten a nested sequence into a single list.

    Strings, ints and floats are kept as they are, nested lists/tuples are
    flattened recursively, and dicts are replaced by the result of
    extractObjectValues.  Items of any other type are reported through
    displayErrorMsg and left out.
    """
    flat = []
    for element in data:
        if isinstance(element, (basestring, int, float)):
            flat.append(element)
        elif isinstance(element, (list, tuple)):
            flat.extend(extractArrayValues(element))
        elif isinstance(element, dict):
            flat.append(extractObjectValues(element))
        else:
            displayErrorMsg('extractArrayValues',
                            "unknown type " + str(type(element)))
    return flat
Ejemplo n.º 11
0
def getTreesForDomainFromDB(domain):
    """Load the stored template trees of a domain.

    fetchTrees(domain) is iterated and indexed by key here, i.e. it is
    treated as a mapping key -> serialized tree.

    Returns None when fetchTrees returns None; otherwise a dict mapping
    key -> TemplateTree that contains only the trees that were
    deserialized successfully.
    """
    tree_strings = fetchTrees(domain)
    if tree_strings is None:
        return None

    tree_dict = {}  # {key : TemplateTree}
    for key in tree_strings:
        tree = TemplateTree(None, None)
        try:
            loaded = tree.loads(tree_strings[key])
        except Exception as e:
            displayErrorMsg("getTreesForDomainFromDB", str(e))
            continue
        # Fix: TemplateTree.loads swallows its own exceptions and signals
        # failure by returning False; the result used to be ignored, so
        # half-initialised trees ended up in the returned dict.
        if loaded:
            tree_dict[key] = tree
        else:
            displayErrorMsg("getTreesForDomainFromDB",
                            "skipping tree that failed to load: %r" % (key,))

    return tree_dict
Ejemplo n.º 12
0
def getTreesForDomainFromDB(domain):
  """Load the stored template trees of a domain.

  fetchTrees(domain) is iterated and indexed by key here, i.e. it is
  treated as a mapping key -> serialized tree.

  Returns None when fetchTrees returns None; otherwise a dict mapping
  key -> TemplateTree that contains only the trees that were deserialized
  successfully.
  """
  tree_strings = fetchTrees(domain)
  if tree_strings is None:
    return None

  tree_dict = {}  # {key : TemplateTree}
  for key in tree_strings:
    tree = TemplateTree(None, None)
    try:
      loaded = tree.loads(tree_strings[key])
    except Exception as e:
      displayErrorMsg("getTreesForDomainFromDB", str(e))
      continue
    # Fix: TemplateTree.loads swallows its own exceptions and signals
    # failure by returning False; the result used to be ignored, so
    # half-initialised trees ended up in the returned dict.
    if loaded:
      tree_dict[key] = tree
    else:
      displayErrorMsg("getTreesForDomainFromDB",
                      "skipping tree that failed to load: %r" % (key,))

  return tree_dict
Ejemplo n.º 13
0
  def loads(self, obj_str):
    """Restore this tree from the JSON string obj_str.

    Sets self.type and self.nodes (for type 'js' the comma-separated,
    base64-encoded tags are decoded into ASTOutputNode objects), calls
    self.calc_key(), and rebuilds a NodePattern for every serialized entry
    of string_types_str, object_types_str and array_types_str.

    Returns True on success.  If anything raises, the error is reported
    through displayErrorMsg and False is returned; attributes assigned
    before the failure keep their new values.
    """
    try:
      parsed = json.loads(obj_str)
      self.type = parsed['type']
      if self.type == 'json':
        self.nodes = parsed['tree']
      elif self.type == 'js':
        encoded_tags = parsed['tree'].split(',')
        self.nodes = [ASTOutputNode(b64decode(tag)) for tag in encoded_tags]
      self.calc_key()

      # String patterns: node index -> serialized pattern.
      self.string_types_str = parsed['string_types_str']
      for node_index in self.string_types_str:
        pattern = NodePattern()
        pattern.loads(self.string_types_str[node_index])
        self.string_types[node_index] = pattern

      # Object patterns: node index -> {key -> serialized pattern}.
      self.object_types_str = parsed['object_types_str']
      for node_index in self.object_types_str:
        self.object_types[node_index] = {}
        for name in self.object_types_str[node_index]:
          pattern = NodePattern()
          pattern.loads(self.object_types_str[node_index][name])
          self.object_types[node_index][name] = pattern

      # Array patterns: same layout as the object patterns.
      self.array_types_str = parsed['array_types_str']
      for node_index in self.array_types_str:
        self.array_types[node_index] = {}
        for name in self.array_types_str[node_index]:
          pattern = NodePattern()
          pattern.loads(self.array_types_str[node_index][name])
          self.array_types[node_index][name] = pattern

      return True
    except Exception as err:
      displayErrorMsg("TemplateTree.loads", str(err))
      return False
Ejemplo n.º 14
0
    def loads(self, obj_str):
        """Restore this tree from the JSON string obj_str.

        Sets self.type and self.nodes (for type 'js' the comma-separated,
        base64-encoded tags are decoded into ASTOutputNode objects), calls
        self.calc_key(), and rebuilds a NodePattern for every serialized
        entry of string_types_str, object_types_str and array_types_str.

        Returns True on success.  If anything raises, the error is reported
        through displayErrorMsg and False is returned; attributes assigned
        before the failure keep their new values.
        """
        try:
            parsed = json.loads(obj_str)
            self.type = parsed['type']
            if self.type == 'json':
                self.nodes = parsed['tree']
            elif self.type == 'js':
                encoded_tags = parsed['tree'].split(',')
                self.nodes = [
                    ASTOutputNode(b64decode(tag)) for tag in encoded_tags
                ]
            self.calc_key()

            # String patterns: node index -> serialized pattern.
            self.string_types_str = parsed['string_types_str']
            for node_index in self.string_types_str:
                pattern = NodePattern()
                pattern.loads(self.string_types_str[node_index])
                self.string_types[node_index] = pattern

            # Object patterns: node index -> {key -> serialized pattern}.
            self.object_types_str = parsed['object_types_str']
            for node_index in self.object_types_str:
                self.object_types[node_index] = {}
                for name in self.object_types_str[node_index]:
                    pattern = NodePattern()
                    pattern.loads(self.object_types_str[node_index][name])
                    self.object_types[node_index][name] = pattern

            # Array patterns: same layout as the object patterns.
            self.array_types_str = parsed['array_types_str']
            for node_index in self.array_types_str:
                self.array_types[node_index] = {}
                for name in self.array_types_str[node_index]:
                    pattern = NodePattern()
                    pattern.loads(self.array_types_str[node_index][name])
                    self.array_types[node_index][name] = pattern

            return True
        except Exception as err:
            displayErrorMsg("TemplateTree.loads", str(err))
            return False
Ejemplo n.º 15
0
 def calc_key(self):
     """Compute self.key, an MD5 hex digest of the tree's structure.

     For type "json" the digest covers the sorted keys of self.nodes, for
     type "js" the tags of the nodes in order.  self.key is set to None
     (after reporting through displayErrorMsg) when self.nodes is None or
     the type is neither "json" nor "js".
     """
     # Fix: this path used to fall through to `self.key = key` with `key`
     # never assigned, raising UnboundLocalError.
     if self.nodes is None:
         displayErrorMsg('TemplateTree.calc_key', "Nodes are None")
         self.key = None
         return

     if self.type == "json":
         parts = sorted(self.nodes.keys())
     elif self.type == "js":
         parts = [node.tag for node in self.nodes]
     else:
         # The first node is only used for the diagnostic; tolerate
         # containers that cannot be indexed with 0.
         try:
             first = self.nodes[0]
         except (IndexError, KeyError, TypeError):
             first = None
         debug_msg = "TemplateTree nodes format error: %s %d %d" \
           %(first.__class__, id(type(first)), id(ASTOutputNode))
         displayErrorMsg('TemplateTree.calc_key', debug_msg)
         # Fix: None used to be overwritten right away by the digest of an
         # empty MD5.
         self.key = None
         return

     m = hashlib.md5()
     for part in parts:
         # Hash text as UTF-8: identical digests for ASCII input, and
         # non-ASCII text no longer depends on the implicit ASCII codec.
         if isinstance(part, type(u'')):
             part = part.encode('utf-8')
         m.update(part)
     self.key = m.hexdigest()
Ejemplo n.º 16
0
 def calc_key(self):
   """Compute self.key, an MD5 hex digest of the tree's structure.

   For type "json" the digest covers the sorted keys of self.nodes, for
   type "js" the tags of the nodes in order.  self.key is set to None
   (after reporting through displayErrorMsg) when self.nodes is None or the
   type is neither "json" nor "js".
   """
   # Fix: this path used to fall through to `self.key = key` with `key`
   # never assigned, raising UnboundLocalError.
   if self.nodes is None:
     displayErrorMsg('TemplateTree.calc_key', "Nodes are None")
     self.key = None
     return

   if self.type == "json":
     parts = sorted(self.nodes.keys())
   elif self.type == "js":
     parts = [node.tag for node in self.nodes]
   else:
     # The first node is only used for the diagnostic; tolerate containers
     # that cannot be indexed with 0.
     try:
       first = self.nodes[0]
     except (IndexError, KeyError, TypeError):
       first = None
     debug_msg = "TemplateTree nodes format error: %s %d %d" \
       %(first.__class__, id(type(first)), id(ASTOutputNode))
     displayErrorMsg('TemplateTree.calc_key', debug_msg)
     # Fix: None used to be overwritten right away by the digest of an
     # empty MD5.
     self.key = None
     return

   m = hashlib.md5()
   for part in parts:
     # Hash text as UTF-8: identical digests for ASCII input, and non-ASCII
     # text no longer depends on the implicit ASCII codec.
     if isinstance(part, type(u'')):
       part = part.encode('utf-8')
     m.update(part)
   self.key = m.hexdigest()
Ejemplo n.º 17
0
def generateTemplateBasedOnURLsFromFile(path, dst_path):
    """Build and store template trees for the inline scripts of listed URLs.

    path     -- text file that is read line by line; every stripped line is
                passed to fetchScripts as a URL.
    dst_path -- directory receiving the output files: 'debug' (one example
                script per tree key), one '<count>_<key>' file per tree key,
                'trees' (serialized trees) and 'jsons' (created, left empty).

    Steps:
      1. Every inline script is analysed with analyzeJSCodesFinerBlock,
         falling back to analyzeJSON when that returns None, and grouped by
         the key of the resulting TemplateTree.
      2. For every key whose trees all have the same number of nodes, the
         String/Object/Array nodes are turned into node patterns that are
         stored (serialized) on the first tree of the group.  Trees of type
         "json" are kept without node analysis.
      3. Every resulting tree is passed to storeTree and written to 'trees'.

    Returns a dict mapping tree key -> list of URLs for groups in which at
    least one node pattern reported is_insufficient().
    """
    scriptdict = {}  # tree key -> [(script, url, tree, index)]
    debug_dict = {}  # tree key -> every script seen for that key
    total_script_blocks = 0
    total_uniq_script_blocks = 0

    static_scripts = 0
    dynamic_scripts = 0

    # `with` closes the URL list, which used to stay open.
    with open(path) as f:
        for line in f:
            url = line.strip()
            print("process url " + url)
            hosts, inlines = fetchScripts(url)
            if inlines is None or len(inlines) == 0:
                print("no inlines for " + url)
                continue
            for inline in inlines:
                is_json = False
                rs, sc = analyzeJSCodesFinerBlock(inline)
                if rs is None:
                    rs = analyzeJSON(inline)
                    is_json = True
                if rs is None:
                    continue

                if is_json:
                    tree = TemplateTree(rs, None)
                    # Fix: this branch looked up scriptdict[key] although
                    # `key` was only ever assigned in the JavaScript branch
                    # (stale value, or unbound on the first JSON inline).
                    key = tree.key
                    if key not in scriptdict:
                        scriptdict[key] = [(inline, url, tree, -1)]
                        debug_dict[key] = [inline]
                    else:
                        debug_dict[key].append(inline)
                        contents = [x[0] for x in scriptdict[key]]
                        if inline not in contents:
                            scriptdict[key].append((inline, url, tree, -1))
                            total_uniq_script_blocks += 1
                    total_script_blocks += 1

                else:
                    for index in range(len(rs)):
                        total_script_blocks += 1
                        tree = TemplateTree(rs[index], None)
                        key = tree.key
                        script = sc[index]
                        if key not in scriptdict:
                            debug_dict[key] = [script]
                            scriptdict[key] = [(script, url, tree, index)]
                            print("  add key  %s" % key)
                        else:
                            contents = [x[0] for x in scriptdict[key]]
                            debug_dict[key].append(script)
                            if script not in contents:
                                scriptdict[key].append(
                                    (script, url, tree, index))
                                print("  item %s has %d unique scripts" % (
                                    key, len(scriptdict[key])))
                                total_uniq_script_blocks += 1

    with open(os.path.join(dst_path, 'debug'), 'w') as fw:
        for k in debug_dict:
            examples = debug_dict[k]
            fw.write("%d %s \n" % (len(examples), k))
            fw.write("  --EXAMPLE-- %s\n" % examples[0])

    # start to analyze trees
    # scriptdict[tree_key] = [(script, url, tree, index)]
    trees = []
    insufficient_urls = {}
    keys = sorted(scriptdict.keys(), key=lambda k: len(scriptdict[k]))
    for key in keys:
        is_static = True
        script_list = scriptdict[key]
        name = "%d_%s" % (len(script_list), key)
        # `with` closes the per-key file on every exit path (continue,
        # normal end, exception).
        with open(os.path.join(dst_path, name), 'w') as fw:
            for item in script_list:
                fw.write(item[1] + "||" + str(item[3]) + "  " +
                         str(item[0]) + "\n")

            # make sure all template trees with the same key are the same
            lengths = [len(item[2].nodes) for item in script_list]
            if min(lengths) != max(lengths):
                fw.write("[ALERT] seq length is not consistent")
                continue
            seq_length = lengths[0]

            # only handle JavaScript for now
            tree = script_list[0][2]
            if tree.type == "json":
                print("the inline is json!")
                fw.write("[TODO]: the inline is json. This is next step\n")
                trees.append(tree)
                continue

            # process String/Object/Array nodes
            fw.write("start analyzeing values\n")
            script_length = len(script_list)
            urls = [item[1] for item in script_list]

            for i in range(seq_length):
                node = tree.nodes[i]
                try:
                    if node.tag == "String":
                        vals = [item[2].nodes[i].value for item in script_list]
                        encoded_val = [b64encode(x) for x in vals]
                        tree.strings[i] = vals
                        node_pattern = generateNodePattern(vals)
                        # The group counts as dynamic as soon as one pattern
                        # is neither constant nor insufficient.
                        if is_static and \
                                node_pattern.tp != StringType.CONST and \
                                node_pattern.tp != StringType.INSUFFICIENT:
                            is_static = False
                            dynamic_scripts += script_length

                        tree.string_types_str[str(i)] = node_pattern.dumps()
                        if node_pattern.is_insufficient():
                            insufficient_urls.setdefault(key, []).extend(urls)
                        print("STRING%d: [TYPE:%s] [VALUE:%s]" % (
                            i, tree.string_types_str[str(i)],
                            ','.join(encoded_val)))
                    elif node.tag in ("Object", "Array"):
                        # Objects and arrays are analysed identically; only
                        # the helper, the log label and the attributes the
                        # result is stored in differ.
                        is_object = node.tag == "Object"
                        if is_object:
                            rs = analyzeObjectResultHelper(script_list, i)
                        else:
                            rs = analyzeArrayResultHelper(script_list, i)
                        rs = extractObjectValues(rs)
                        label = "OBJECT" if is_object else "ARRAY"
                        type_dict = {}
                        for k in rs:
                            encoded_val = [b64encode(x) for x in rs[k]]
                            node_pattern = generateNodePattern(rs[k])
                            if is_static and \
                                    node_pattern.tp != StringType.CONST and \
                                    node_pattern.tp != StringType.INSUFFICIENT:
                                is_static = False
                                dynamic_scripts += script_length

                            type_dict[k] = node_pattern.dumps()
                            if node_pattern.is_insufficient():
                                insufficient_urls.setdefault(
                                    key, []).extend(urls)
                            print("%s%d: [TYPE:%s] [KEY:%s][VALUE:%s]" % (
                                label, i, type_dict[k], k,
                                ','.join(encoded_val)))
                        if is_object:
                            tree.objects[i] = rs
                            tree.object_types_str[str(i)] = type_dict
                        else:
                            tree.arrays[i] = rs
                            tree.array_types_str[str(i)] = type_dict
                except Exception as e:
                    # Fix: the report used to name a different function
                    # and misspelled "exception".
                    displayErrorMsg("generateTemplateBasedOnURLsFromFile",
                       "exception in analyzing node %d %s " % (i, str(e)))

            if is_static:
                static_scripts += script_length

            print("Done writing %d items for file %s " % (
                len(script_list), name))
            trees.append(tree)

    # store trees
    trees = sorted(trees, key=lambda x: x.get_length())
    # 'jsons' is only created here; nothing is written to it.
    with open(os.path.join(dst_path, "trees"), 'w') as fw, \
            open(os.path.join(dst_path, "jsons"), 'w'):
        for i, tree in enumerate(trees):
            tree_val = tree.dumps()
            url = scriptdict[tree.key][0][1]
            storeTree(url, tree.key, tree_val)
            fw.write("1 %.3d: %s\n" % (i, tree_val))
            # Round-trip through loads() and log the restored nodes.
            new_tree = TemplateTree(None, None)
            new_tree.loads(tree_val)

            if tree.type == "js":
                fw.write("2 %.3d: %s\n" % (i, getTreeSeq(new_tree.nodes)))
            elif tree.type == 'json':
                fw.write("2 %.3d: %s\n" % (i, json.dumps(new_tree.nodes)))

    print("generate %d trees for %d scripts uniqe[%d]" % (
        len(trees), total_script_blocks, total_uniq_script_blocks))

    print("static_scripts:%d  dynamic_scripts:%d" % (static_scripts,
                                                     dynamic_scripts))

    return insufficient_urls
Ejemplo n.º 18
0
def generateTemplateBasedOnURLsFromFile(path, dst_path):
  f = open(path)
  scriptdict = {}
  total_script_blocks = 0
  total_uniq_script_blocks = 0
  debug_dict = {}

  static_scripts = 0
  dynamic_scripts = 0

  for line in f:
    url = line.strip()
    print "process url "+url
    hosts, inlines = fetchScripts(url)
    if inlines==None or len(inlines) ==0:
      print "no inlines for "+url
      continue
    for inline in inlines:
      is_json = False
      rs, sc = analyzeJSCodesFinerBlock(inline)
      if rs == None:
        rs = analyzeJSON(inline)
        is_json = True
      if rs == None:
        continue
      
      if is_json:
        tree = TemplateTree(rs, None)
        if not tree.key in scriptdict:
          scriptdict[tree.key] = [(inline, url, tree, -1)]
          debug_dict[tree.key] = [inline]
        else:
          debug_dict[tree.key].append(inline)
          contents = [x[0] for x in scriptdict[key]]
          if not inline in contents:
            scriptdict[tree.key].append((inline, url, tree, -1))
            total_uniq_script_blocks += 1
        total_script_blocks += 1

      else:
        for index in range(len(rs)):
          total_script_blocks += 1
          seq = rs[index]
          tree = TemplateTree(seq, None)
          key = tree.key
          if not key in scriptdict:
            debug_dict[key] = [sc[index]]
            scriptdict[key] = [(sc[index], url, tree, index)]
            print "  add key  %s" %key
          else:
            contents = [x[0] for x in scriptdict[key]]
            debug_dict[key].append(sc[index])
            if not sc[index] in contents: 
              scriptdict[key].append((sc[index],url, tree, index))
              print "  item %s has %d unique scripts" %(key, len(scriptdict[key]))
              total_uniq_script_blocks += 1
 
  fw = open(os.path.join(dst_path,'debug'),'w')
  for k in debug_dict:
    vals = debug_dict[k]
    fw.write("%d %s \n" %(len(vals),k)) 
    fw.write("  --EXAMPLE-- %s\n" %vals[0])
  fw.close()
  #start to analyze trees
  #scriptdict[tree_key] = [(script, url, tree, index)]
  trees = []
  insufficient_urls = {}
  keys = sorted(scriptdict.keys(), key=lambda k:len(scriptdict[k]))
  for key in keys:
    is_static = True
    name = "%d_%s" %(len(scriptdict[key]),key)
    fw = open(os.path.join(dst_path,name), 'w')
    for item in scriptdict[key]:
      fw.write(item[1]+"||"+str(item[3])+"  "+str(item[0])+"\n")
    
    #make sure all template trees with the same key are the same
    script_list = scriptdict[key]
    length_list = sorted([len(item[2].nodes) for item in script_list])
    seq_length = 0
    if length_list[0] != length_list[-1]:
      fw.write("[ALERT] seq length is not consistent")
      fw.close()
      continue
    else:
      seq_length = length_list[0]

    #only handle JavaScript for now
    tree = script_list[0][2]
    if tree.type == "json":
      print "the inline is json!"
      fw.write("[TODO]: the inline is json. This is next step\n")
      fw.close()
      trees.append(tree)
      continue  
    
    #process String/Object/Array nodes
    #script_list: [(script, url, tree, index)]
    fw.write("start analyzeing values\n")    
    script_length = len(script_list)

    for i in range(seq_length):
      node = script_list[0][2].nodes[i]
      try:
        if node.tag == "String":
          vals = [item[2].nodes[i].value for item in script_list]
          encoded_val = [b64encode(x) for x in vals]
          #item = 'string%d: %s' %(i, ','.join(encoded_val))
          #fw.write(item+"\n")
          tree.strings[i] = vals
          node_pattern = generateNodePattern(vals)
          if is_static and \
            ((node_pattern.tp!=StringType.CONST) and (node_pattern.tp!=StringType.INSUFFICIENT)):
            is_static = False
            dynamic_scripts += script_length

          tree.string_types_str[str(i)] = node_pattern.dumps()
          if node_pattern.is_insufficient():
            if not key in insufficient_urls:
              insufficient_urls[key] = \
                [item[1] for item in script_list]
            else:
              insufficient_urls[key] += [item[1] for item in script_list]
          # testing
          #node_pattern = NodePattern()
          #r = node_pattern.loads(tree.string_types_str[i])
          #if r == False:
          #  print "node_pattern failed to load: "+tree.string_types_str[i]
          #else:
          #  print "successfully loaded tree: "+tree.string_types_str[i]
          print "STRING%d: [TYPE:%s] [VALUE:%s]" \
            %(i, tree.string_types_str[str(i)],','.join(encoded_val))
        if node.tag == "Object":
          #debug = "tag:%s val:%s" \
          #  %(script_list[0][2].nodes[i].tag,str(script_list[0][2].nodes[i].value))
          #print "DEBUG: %s" %debug
          rs = analyzeObjectResultHelper(script_list, i)
          rs = extractObjectValues(rs)
          type_dict = {}
          for k in rs:
            encoded_val = [b64encode(x) for x in rs[k]]
            node_pattern = generateNodePattern(rs[k])
            if is_static and \
              ((node_pattern.tp!=StringType.CONST) and (node_pattern.tp!=StringType.INSUFFICIENT)):
              is_static = False
              dynamic_scripts += script_length

            type_dict[k] = node_pattern.dumps()
            if node_pattern.is_insufficient():
              if not key in insufficient_urls:
                insufficient_urls[key] = \
                  [item[1] for item in script_list]
              else:
                insufficient_urls[key] += [item[1] for item in script_list]
            #testing
            #node_pattern = NodePattern()
            #r = node_pattern.loads(type_dict[k])
            #if r == False:
            #  print "node_pattern failed to load: "+type_dict[k]
            #else:
            #  print "successfully loaded tree: "+type_dict[k]
            print "OBJECT%d: [TYPE:%s] [KEY:%s][VALUE:%s]" \
              %(i, type_dict[k], k, ','.join(encoded_val))
          tree.objects[i] = rs
          tree.object_types_str[str(i)] = type_dict
        if node.tag == "Array":
          rs = analyzeArrayResultHelper(script_list, i)
          rs = extractObjectValues(rs)
          type_dict = {}
          for k in rs:
            encoded_val = [b64encode(x) for x in rs[k]]
            #fw.write("array%d: %s:%s\n" % (i, k, ','.join(encoded_val)) )
            node_pattern = generateNodePattern(rs[k])
            if is_static and \
              ((node_pattern.tp!=StringType.CONST) and (node_pattern.tp!=StringType.INSUFFICIENT)):
              is_static = False
              dynamic_scripts += script_length

            type_dict[k] = node_pattern.dumps()
            if node_pattern.is_insufficient():
              if not key in insufficient_urls:
                insufficient_urls[key] = \
                  [item[1] for item in script_list]
              else:
                insufficient_urls[key] += [item[1] for item in script_list]
            #testing
            #node_pattern = NodePattern()
            #r = node_pattern.loads(type_dict[k])
            #if r == False:
            #  print "node_pattern failed to load: "+type_dict[k]
            #else:
            #  print "successfully loaded tree: "+type_dict[k]
            print "ARRAY%d: [TYPE:%s] [KEY:%s][VALUE:%s]" \
              %(i, type_dict[k], k, ','.join(encoded_val))
          tree.arrays[i] = rs
          tree.array_types_str[str(i)] = type_dict
      except Exception as e:
        displayErrorMsg("fetchAndProcessScriptsOfURLsFromFile",\
           "excpetion in analyzing node %d %s " %(i, str(e))) 
    
    if is_static:
      static_scripts += script_length

    print "Done writing %d items for file %s " %(len(scriptdict[key]), name)
    trees.append(tree)
    
    fw.close()
  
  #store trees
  trees = sorted(trees, key=lambda x:x.get_length())
  fw = open(os.path.join(dst_path,"trees"), 'w')
  fw_json = open(os.path.join(dst_path,"jsons"), 'w')
  for i in range(len(trees)):
    tree_val = trees[i].dumps()
    url = scriptdict[trees[i].key][0][1]
    storeTree(url,trees[i].key, tree_val)
    fw.write( "1 %.3d: %s\n" %(i, tree_val))
    new_tree = TemplateTree(None, None)
    new_tree.loads(tree_val)

    if trees[i].type == "js":
      fw.write( "2 %.3d: %s\n" %(i, getTreeSeq(new_tree.nodes)))
    elif trees[i].type == 'json':
      fw.write("2 %.3d: %s\n" % (i, json.dumps(new_tree.nodes)))
  fw.close()
  fw_json.close()
  print "generate %d trees for %d scripts uniqe[%d]" \
    %(len(trees), total_script_blocks, total_uniq_script_blocks)

  print "static_scripts:%d  dynamic_scripts:%d" %(static_scripts, dynamic_scripts)

  return insufficient_urls