def arrayToDict(arr):
    """Group the elements of ``arr`` into a ``{key: [values]}`` dict.

    * string elements are collected under the key ``'basestring_'``;
    * list elements are flattened with ``extractArrayValues``, converted
      recursively, and handed to ``mergeTwoArrayDict`` together with the
      result built so far;
    * dict elements contribute each of their values to the list stored
      under the same key.

    Elements of any other type are skipped.

    Returns the dict that was built.  ``None`` input prints an error and
    yields ``{}``.  If converting raises, the error is reported through
    ``displayErrorMsg`` and whatever was collected so far is returned.
    """
    rs = {}
    if arr is None:
        print("ERROR: arr is NULL in arrayToDict")
        return rs
    try:
        for obj in arr:
            if isinstance(obj, basestring):
                rs.setdefault('basestring_', []).append(obj)
            elif isinstance(obj, list):
                subarr = extractArrayValues(obj)
                subrs = arrayToDict(subarr)
                mergeTwoArrayDict(rs, subrs)
            elif isinstance(obj, dict):
                for k in obj:
                    rs.setdefault(k, []).append(obj[k])
    except Exception as e:
        displayErrorMsg('arrayToDict', str(e) + ' ' + str(arr))
    # Without this return the function handed None back to its callers
    # (including its own recursive call above) for every non-None input.
    return rs
def extractObjectValues(data):
    """Collect the scalar values found in ``data`` into ``{key: [values]}``.

    For every key of ``data``:

    * a dict value is converted recursively and passed to
      ``mergeTwoArrayDict`` together with the result;
    * a list/tuple value is flattened with ``extractArrayValues``; dict
      items go through ``mergeTwoArrayDict``, string/int/float items are
      appended to the key's list, anything else is reported;
    * a string/int/float value is appended to the key's list;
    * any other value is reported through ``displayErrorMsg``.

    Keys whose list ends up empty are dropped.  ``None`` yields ``{}``.
    """
    if data is None:
        return {}

    collected = {}  # key : [val1, val2]
    for name in data:
        if name not in collected:
            collected[name] = []
        value = data[name]

        if isinstance(value, dict):
            mergeTwoArrayDict(collected, extractObjectValues(value))
        elif isinstance(value, (list, tuple)):
            for element in extractArrayValues(value):
                if isinstance(element, dict):
                    mergeTwoArrayDict(collected, element)
                elif isinstance(element, (basestring, int, float)):
                    collected[name].append(element)
                else:
                    displayErrorMsg(
                        'extractObjectValues',
                        "shouldn't have other types" + str(type(element)))
        elif isinstance(value, (basestring, int, float)):
            collected[name].append(value)
        else:
            displayErrorMsg('extractObjectValues',
                            "unknown type:" + str(type(value)))

    # Snapshot the empty keys first: the dict cannot shrink while iterated.
    for name in [k for k in collected if len(collected[k]) == 0]:
        del collected[name]
    return collected
def extractObjectValues(data):
    """Collect the scalar values found in ``data`` into ``{key: [values]}``.

    For every key of ``data``:

    * a dict value is converted recursively and passed to
      ``mergeTwoArrayDict`` together with the result;
    * a list/tuple value is flattened with ``extractArrayValues``; dict
      items go through ``mergeTwoArrayDict``, string/int/float items are
      appended to the key's list, anything else is reported;
    * a string/int/float value is appended to the key's list;
    * any other value is reported through ``displayErrorMsg``.

    Keys whose list ends up empty are dropped.  ``None`` yields ``{}``.
    """
    if data is None:
        return {}

    collected = {}  # key : [val1, val2]
    for name in data:
        if name not in collected:
            collected[name] = []
        value = data[name]

        if isinstance(value, dict):
            mergeTwoArrayDict(collected, extractObjectValues(value))
        elif isinstance(value, (list, tuple)):
            for element in extractArrayValues(value):
                if isinstance(element, dict):
                    mergeTwoArrayDict(collected, element)
                elif isinstance(element, (basestring, int, float)):
                    collected[name].append(element)
                else:
                    displayErrorMsg(
                        'extractObjectValues',
                        "shouldn't have other types" + str(type(element)))
        elif isinstance(value, (basestring, int, float)):
            collected[name].append(value)
        else:
            displayErrorMsg('extractObjectValues',
                            "unknown type:" + str(type(value)))

    # Snapshot the empty keys first: the dict cannot shrink while iterated.
    for name in [k for k in collected if len(collected[k]) == 0]:
        del collected[name]
    return collected
def arrayToDict(arr):
    """Group the elements of ``arr`` into a ``{key: [values]}`` dict.

    * string elements are collected under the key ``'basestring_'``;
    * list elements are flattened with ``extractArrayValues``, converted
      recursively, and handed to ``mergeTwoArrayDict`` together with the
      result built so far;
    * dict elements contribute each of their values to the list stored
      under the same key.

    Elements of any other type are skipped.

    Returns the dict that was built.  ``None`` input prints an error and
    yields ``{}``.  If converting raises, the error is reported through
    ``displayErrorMsg`` and whatever was collected so far is returned.
    """
    rs = {}
    if arr is None:
        print("ERROR: arr is NULL in arrayToDict")
        return rs
    try:
        for obj in arr:
            if isinstance(obj, basestring):
                rs.setdefault('basestring_', []).append(obj)
            elif isinstance(obj, list):
                subarr = extractArrayValues(obj)
                subrs = arrayToDict(subarr)
                mergeTwoArrayDict(rs, subrs)
            elif isinstance(obj, dict):
                for k in obj:
                    rs.setdefault(k, []).append(obj[k])
    except Exception as e:
        displayErrorMsg('arrayToDict', str(e) + ' ' + str(arr))
    # Without this return the function handed None back to its callers
    # (including its own recursive call above) for every non-None input.
    return rs
def match(self, target_tree):
    """Return True when ``target_tree`` conforms to this template tree.

    The result is False when ``target_tree`` is not a ``TemplateTree``,
    when the keys or types differ, when two nodes at the same index carry
    different tags, or when a String/Object/Array value is rejected by the
    pattern stored in ``string_types`` / ``object_types`` /
    ``array_types`` for that node index.  Two trees of type ``'json'``
    with equal keys match without any per-node check.

    Any exception raised while comparing a node is reported through
    ``displayErrorMsg`` and treated as a mismatch.
    """
    if not isinstance(target_tree, TemplateTree):
        # The comma used to sit inside the first literal, so both strings
        # were concatenated into a single argument.
        displayErrorMsg('TemplateTree.match',
                        "matching tree, target tree should be TemplateTree")
        return False
    if self.key != target_tree.key:
        return False
    if self.type == 'json' and target_tree.type == 'json':
        return True
    if self.type != target_tree.type:
        return False

    def values_match(pattern_table, index, values):
        # pattern_table[index] is looked up only once a value needs it, so
        # an empty ``values`` never touches the table.
        for k in values:
            if k not in pattern_table[index]:
                return False
            value = values[k]
            candidates = value if isinstance(value, list) else [value]
            for item in candidates:
                if not pattern_table[index][k].match(item):
                    return False
        return True

    for i in range(len(target_tree.nodes)):
        try:
            own = self.nodes[i]
            other = target_tree.nodes[i]
            if own.tag != other.tag:
                return False
            if own.tag == 'String':
                if not self.string_types[str(i)].match(other.value):
                    return False
            elif own.tag == 'Object':
                target_obj = extractObjectValues(other.value)
                if not values_match(self.object_types, str(i), target_obj):
                    return False
            elif own.tag == 'Array':
                if other.value is None:
                    continue
                target_obj = arrayToDict(other.value)
                target_obj = extractObjectValues(target_obj)
                if not values_match(self.array_types, str(i), target_obj):
                    return False
        except Exception as e:
            displayErrorMsg('TemplateTree.match', str(e))
            return False
    return True
def match(self, target_tree):
    """Return True when ``target_tree`` conforms to this template tree.

    The result is False when ``target_tree`` is not a ``TemplateTree``,
    when the keys or types differ, when two nodes at the same index carry
    different tags, or when a String/Object/Array value is rejected by the
    pattern stored in ``string_types`` / ``object_types`` /
    ``array_types`` for that node index.  Two trees of type ``'json'``
    with equal keys match without any per-node check.

    Any exception raised while comparing a node is reported through
    ``displayErrorMsg`` and treated as a mismatch.
    """
    if not isinstance(target_tree, TemplateTree):
        # The comma used to sit inside the first literal, so both strings
        # were concatenated into a single argument.
        displayErrorMsg('TemplateTree.match',
                        "matching tree, target tree should be TemplateTree")
        return False
    if self.key != target_tree.key:
        return False
    if self.type == 'json' and target_tree.type == 'json':
        return True
    if self.type != target_tree.type:
        return False

    def values_match(pattern_table, index, values):
        # pattern_table[index] is looked up only once a value needs it, so
        # an empty ``values`` never touches the table.
        for k in values:
            if k not in pattern_table[index]:
                return False
            value = values[k]
            candidates = value if isinstance(value, list) else [value]
            for item in candidates:
                if not pattern_table[index][k].match(item):
                    return False
        return True

    for i in range(len(target_tree.nodes)):
        try:
            own = self.nodes[i]
            other = target_tree.nodes[i]
            if own.tag != other.tag:
                return False
            if own.tag == 'String':
                if not self.string_types[str(i)].match(other.value):
                    return False
            elif own.tag == 'Object':
                target_obj = extractObjectValues(other.value)
                if not values_match(self.object_types, str(i), target_obj):
                    return False
            elif own.tag == 'Array':
                if other.value is None:
                    continue
                target_obj = arrayToDict(other.value)
                target_obj = extractObjectValues(target_obj)
                if not values_match(self.array_types, str(i), target_obj):
                    return False
        except Exception as e:
            displayErrorMsg('TemplateTree.match', str(e))
            return False
    return True
def extractArrayValues(data):
    """Flatten ``data`` into a single list.

    String/int/float items are kept as they are, nested lists and tuples
    are flattened recursively, and dict items are replaced by the result
    of ``extractObjectValues``.  Items of any other type are reported
    through ``displayErrorMsg`` and left out.
    """
    flattened = []
    for element in data:
        if isinstance(element, (basestring, int, float)):
            flattened.append(element)
        elif isinstance(element, (list, tuple)):
            flattened.extend(extractArrayValues(element))
        elif isinstance(element, dict):
            flattened.append(extractObjectValues(element))
        else:
            displayErrorMsg('extractArrayValues',
                            "unknown type " + str(type(element)))
    return flattened
def dumps(self):
    """Serialize this tree to a JSON string.

    The ``'tree'`` entry holds ``self.nodes`` unchanged for trees of type
    ``'json'``; for every other type it is the comma-joined list of the
    base64-encoded node tags.  The three ``*_types_str`` attributes are
    stored under their own names.

    Returns the JSON string, or ``None`` after reporting the error through
    ``displayErrorMsg`` when serialization fails.
    """
    try:
        if self.type == 'json':
            tree_repr = self.nodes
        else:
            tree_repr = ','.join(b64encode(node.tag) for node in self.nodes)
        payload = {
            'type': self.type,
            'tree': tree_repr,
            'string_types_str': self.string_types_str,
            'object_types_str': self.object_types_str,
            'array_types_str': self.array_types_str,
        }
        return json.dumps(payload)
    except Exception as e:
        displayErrorMsg("TemplateTree.dumps", str(e))
        return None
def dumps(self):
    """Serialize this tree to a JSON string.

    The ``'tree'`` entry holds ``self.nodes`` unchanged for trees of type
    ``'json'``; for every other type it is the comma-joined list of the
    base64-encoded node tags.  The three ``*_types_str`` attributes are
    stored under their own names.

    Returns the JSON string, or ``None`` after reporting the error through
    ``displayErrorMsg`` when serialization fails.
    """
    try:
        if self.type == 'json':
            tree_repr = self.nodes
        else:
            tree_repr = ','.join(b64encode(node.tag) for node in self.nodes)
        payload = {
            'type': self.type,
            'tree': tree_repr,
            'string_types_str': self.string_types_str,
            'object_types_str': self.object_types_str,
            'array_types_str': self.array_types_str,
        }
        return json.dumps(payload)
    except Exception as e:
        displayErrorMsg("TemplateTree.dumps", str(e))
        return None
def extractArrayValues(data):
    """Flatten ``data`` into a single list.

    String/int/float items are kept as they are, nested lists and tuples
    are flattened recursively, and dict items are replaced by the result
    of ``extractObjectValues``.  Items of any other type are reported
    through ``displayErrorMsg`` and left out.
    """
    flattened = []
    for element in data:
        if isinstance(element, (basestring, int, float)):
            flattened.append(element)
        elif isinstance(element, (list, tuple)):
            flattened.extend(extractArrayValues(element))
        elif isinstance(element, dict):
            flattened.append(extractObjectValues(element))
        else:
            displayErrorMsg('extractArrayValues',
                            "unknown type " + str(type(element)))
    return flattened
def getTreesForDomainFromDB(domain):
    """Load the stored template trees of ``domain``.

    ``fetchTrees(domain)`` supplies a mapping of key -> serialized tree.
    Each value is loaded into a fresh ``TemplateTree``.

    Returns ``None`` when ``fetchTrees`` returns ``None``; otherwise a dict
    ``{key: TemplateTree}`` containing only the trees that loaded
    successfully.
    """
    tree_strings = fetchTrees(domain)
    if tree_strings is None:
        return None

    tree_dict = {}  # {key : TemplateTree}
    for key in tree_strings:
        tree = TemplateTree(None, None)
        try:
            loaded = tree.loads(tree_strings[key])
        except Exception as e:
            displayErrorMsg("getTreesForDomainFromDB", str(e))
            continue
        # TemplateTree.loads reports a failure by returning False instead
        # of raising, so the return value has to be checked; otherwise a
        # partially initialised tree would be handed out as a template.
        if loaded:
            tree_dict[key] = tree
        else:
            displayErrorMsg("getTreesForDomainFromDB",
                            "failed to load tree for key " + str(key))
    return tree_dict
def getTreesForDomainFromDB(domain):
    """Load the stored template trees of ``domain``.

    ``fetchTrees(domain)`` supplies a mapping of key -> serialized tree.
    Each value is loaded into a fresh ``TemplateTree``.

    Returns ``None`` when ``fetchTrees`` returns ``None``; otherwise a dict
    ``{key: TemplateTree}`` containing only the trees that loaded
    successfully.
    """
    tree_strings = fetchTrees(domain)
    if tree_strings is None:
        return None

    tree_dict = {}  # {key : TemplateTree}
    for key in tree_strings:
        tree = TemplateTree(None, None)
        try:
            loaded = tree.loads(tree_strings[key])
        except Exception as e:
            displayErrorMsg("getTreesForDomainFromDB", str(e))
            continue
        # TemplateTree.loads reports a failure by returning False instead
        # of raising, so the return value has to be checked; otherwise a
        # partially initialised tree would be handed out as a template.
        if loaded:
            tree_dict[key] = tree
        else:
            displayErrorMsg("getTreesForDomainFromDB",
                            "failed to load tree for key " + str(key))
    return tree_dict
def loads(self, obj_str):
    """Restore this tree from the JSON string ``obj_str``.

    Reads ``type`` and ``tree`` (kept as-is for ``'json'``; split on commas
    and base64-decoded into ``ASTOutputNode`` objects for ``'js'``),
    recomputes the key with ``calc_key()``, then rebuilds a ``NodePattern``
    for every entry of ``string_types_str``, ``object_types_str`` and
    ``array_types_str``.

    Returns True on success.  Any exception is reported through
    ``displayErrorMsg`` and turns the result into False; attributes
    assigned before the failure keep their new values.
    """
    def restore_pattern(serialized):
        pattern = NodePattern()
        pattern.loads(serialized)
        return pattern

    try:
        parsed = json.loads(obj_str)
        self.type = parsed['type']
        if self.type == 'json':
            self.nodes = parsed['tree']
        elif self.type == 'js':
            encoded_tags = parsed['tree'].split(',')
            self.nodes = [ASTOutputNode(b64decode(tag))
                          for tag in encoded_tags]
        self.calc_key()

        # String patterns: one pattern per node index.
        self.string_types_str = parsed['string_types_str']
        for index in self.string_types_str:
            self.string_types[index] = restore_pattern(
                self.string_types_str[index])

        # Object patterns: node index -> {property name: pattern}.
        self.object_types_str = parsed['object_types_str']
        for index in self.object_types_str:
            self.object_types[index] = {}
            for name in self.object_types_str[index]:
                self.object_types[index][name] = restore_pattern(
                    self.object_types_str[index][name])

        # Array patterns: node index -> {key: pattern}.
        self.array_types_str = parsed['array_types_str']
        for index in self.array_types_str:
            self.array_types[index] = {}
            for name in self.array_types_str[index]:
                self.array_types[index][name] = restore_pattern(
                    self.array_types_str[index][name])

        return True
    except Exception as e:
        displayErrorMsg("TemplateTree.loads", str(e))
        return False
def calc_key(self):
    """Compute ``self.key`` from the structure of ``self.nodes``.

    * type ``"json"``: MD5 hex digest over the sorted keys of ``nodes``;
    * type ``"js"``: MD5 hex digest over the ``tag`` of every node, in
      order;
    * ``nodes`` is ``None`` or the type is anything else: the problem is
      reported through ``displayErrorMsg`` and ``self.key`` becomes
      ``None``.
    """
    # Bound up front: the error paths used to reach the final assignment
    # with ``key`` unbound (nodes is None) or had their None overwritten
    # by the digest of empty input (unknown type).
    key = None
    if self.nodes is None:
        displayErrorMsg('TemplateTree.calc_key', "Nodes are None")
    elif self.type == "json":
        m = hashlib.md5()
        for k in sorted(self.nodes.keys()):
            m.update(k)
        key = m.hexdigest()
    elif self.type == "js":
        m = hashlib.md5()
        for node in self.nodes:
            m.update(node.tag)
        key = m.hexdigest()
    else:
        # The message describes the first node; tolerate containers that
        # have none so that reporting the error cannot itself raise.
        try:
            first = self.nodes[0]
        except (IndexError, KeyError, TypeError):
            first = None
        debug_msg = "TemplateTree nodes format error: %s %d %d" \
            % (first.__class__, id(type(first)), id(ASTOutputNode))
        displayErrorMsg('TemplateTree.calc_key', debug_msg)
    self.key = key
def generateTemplateBasedOnURLsFromFile(path, dst_path):
    """Build template trees from the inline scripts of the URLs in ``path``.

    Every line of ``path`` is stripped and passed to ``fetchScripts``.
    Each inline script returned is parsed with ``analyzeJSCodesFinerBlock``
    (falling back to ``analyzeJSON``), wrapped in ``TemplateTree`` objects
    and grouped by tree key.

    Files written to ``dst_path``:

    * ``debug``          - per key: number of scripts seen and one example;
    * ``<count>_<key>``  - the distinct scripts grouped under that key;
    * ``trees``          - every serialized tree plus its reloaded form;
    * ``jsons``          - created empty.

    For JavaScript trees a node pattern is generated for every
    String/Object/Array node and stored on the tree.  All trees are sorted
    by ``get_length()``, serialized with ``dumps()`` and handed to
    ``storeTree``.

    Returns a dict ``{tree key: [url, ...]}`` listing the URLs of the
    scripts behind every node pattern for which ``is_insufficient()`` was
    true.
    """
    (scriptdict, debug_dict,
     total_script_blocks, total_uniq_script_blocks) = \
        _collectScriptsByTreeKey(path)

    with open(os.path.join(dst_path, 'debug'), 'w') as fw:
        for k in debug_dict:
            vals = debug_dict[k]
            fw.write("%d %s \n" % (len(vals), k))
            fw.write(" --EXAMPLE-- %s\n" % vals[0])

    # start to analyze trees
    # scriptdict[tree_key] = [(script, url, tree, index)]
    trees = []
    insufficient_urls = {}
    static_scripts = 0
    dynamic_scripts = 0
    for key in sorted(scriptdict.keys(), key=lambda k: len(scriptdict[k])):
        script_list = scriptdict[key]
        name = "%d_%s" % (len(script_list), key)
        with open(os.path.join(dst_path, name), 'w') as fw:
            for item in script_list:
                fw.write(item[1] + "||" + str(item[3]) + " " +
                         str(item[0]) + "\n")

            # make sure all template trees with the same key are the same
            lengths = [len(item[2].nodes) for item in script_list]
            if min(lengths) != max(lengths):
                fw.write("[ALERT] seq length is not consistent")
                continue
            seq_length = lengths[0]

            # only handle JavaScript for now
            tree = script_list[0][2]
            if tree.type == "json":
                print("the inline is json!")
                fw.write("[TODO]: the inline is json. This is next step\n")
                trees.append(tree)
                continue

            # process String/Object/Array nodes
            fw.write("start analyzeing values\n")
            is_static = _analyzeTreeValues(tree, key, script_list,
                                           seq_length, insufficient_urls)
            if is_static:
                static_scripts += len(script_list)
            else:
                dynamic_scripts += len(script_list)
            print("Done writing %d items for file %s "
                  % (len(script_list), name))
            trees.append(tree)

    # store trees
    trees = sorted(trees, key=lambda x: x.get_length())
    # Nothing is ever written to "jsons"; it is still created (truncated)
    # so the set of output files stays the same.
    open(os.path.join(dst_path, "jsons"), 'w').close()
    with open(os.path.join(dst_path, "trees"), 'w') as fw:
        for i, tree in enumerate(trees):
            tree_val = tree.dumps()
            url = scriptdict[tree.key][0][1]
            storeTree(url, tree.key, tree_val)
            fw.write("1 %.3d: %s\n" % (i, tree_val))
            # Reload the serialized form so the second line shows what a
            # consumer of the stored value gets back.
            new_tree = TemplateTree(None, None)
            new_tree.loads(tree_val)
            if tree.type == "js":
                fw.write("2 %.3d: %s\n" % (i, getTreeSeq(new_tree.nodes)))
            elif tree.type == 'json':
                fw.write("2 %.3d: %s\n" % (i, json.dumps(new_tree.nodes)))

    print("generate %d trees for %d scripts uniqe[%d]"
          % (len(trees), total_script_blocks, total_uniq_script_blocks))
    print("static_scripts:%d dynamic_scripts:%d"
          % (static_scripts, dynamic_scripts))
    return insufficient_urls


def _collectScriptsByTreeKey(path):
    """Fetch and parse the inline scripts of every URL listed in ``path``.

    Returns ``(scriptdict, debug_dict, total, unique)`` where
    ``scriptdict[key]`` is the list of distinct ``(script, url, tree,
    index)`` entries for a tree key, ``debug_dict[key]`` lists every script
    seen for the key, ``total`` counts all processed script blocks and
    ``unique`` counts the entries stored in ``scriptdict``.
    """
    scriptdict = {}
    debug_dict = {}
    counters = {'total': 0, 'unique': 0}

    def register(script, url, tree, index):
        # Record one script block; True when it was stored as a new entry.
        key = tree.key
        counters['total'] += 1
        if key not in scriptdict:
            scriptdict[key] = [(script, url, tree, index)]
            debug_dict[key] = [script]
            counters['unique'] += 1
            return True
        debug_dict[key].append(script)
        # Always look the group up by this tree's own key; the JSON path
        # used to read a ``key`` variable that was unbound or left over
        # from an earlier JavaScript block.
        if script in [entry[0] for entry in scriptdict[key]]:
            return False
        scriptdict[key].append((script, url, tree, index))
        counters['unique'] += 1
        return True

    with open(path) as f:
        for line in f:
            url = line.strip()
            print("process url " + url)
            _, inlines = fetchScripts(url)
            if inlines is None or len(inlines) == 0:
                print("no inlines for " + url)
                continue
            for inline in inlines:
                rs, sc = analyzeJSCodesFinerBlock(inline)
                if rs is None:
                    # Not parsable as JavaScript blocks: try JSON.
                    rs = analyzeJSON(inline)
                    if rs is None:
                        continue
                    register(inline, url, TemplateTree(rs, None), -1)
                    continue
                for index in range(len(rs)):
                    tree = TemplateTree(rs[index], None)
                    key = tree.key
                    is_new_key = key not in scriptdict
                    if register(sc[index], url, tree, index):
                        if is_new_key:
                            print(" add key %s" % key)
                        else:
                            print(" item %s has %d unique scripts"
                                  % (key, len(scriptdict[key])))

    return scriptdict, debug_dict, counters['total'], counters['unique']


def _analyzeTreeValues(tree, key, script_list, seq_length, insufficient_urls):
    """Generate the value patterns of every String/Object/Array node.

    ``tree`` is the tree of the first entry of ``script_list``; the
    patterns are stored on it.  Keys whose patterns are insufficient get
    the URLs of ``script_list`` appended to ``insufficient_urls[key]``.

    Returns True when every generated pattern has type
    ``StringType.CONST`` or ``StringType.INSUFFICIENT``.
    """
    state = {'is_static': True}

    def summarize(values):
        # Build the pattern for ``values``, update the bookkeeping and
        # return the serialized pattern.
        node_pattern = generateNodePattern(values)
        if (node_pattern.tp != StringType.CONST and
                node_pattern.tp != StringType.INSUFFICIENT):
            state['is_static'] = False
        dumped = node_pattern.dumps()
        if node_pattern.is_insufficient():
            insufficient_urls.setdefault(key, []).extend(
                item[1] for item in script_list)
        return dumped

    def summarize_keyed(label, i, rs):
        # One pattern per key of ``rs``; returns {key: serialized pattern}.
        type_dict = {}
        for k in rs:
            encoded_val = [b64encode(x) for x in rs[k]]
            type_dict[k] = summarize(rs[k])
            print("%s%d: [TYPE:%s] [KEY:%s][VALUE:%s]"
                  % (label, i, type_dict[k], k, ','.join(encoded_val)))
        return type_dict

    for i in range(seq_length):
        node = tree.nodes[i]
        try:
            if node.tag == "String":
                vals = [item[2].nodes[i].value for item in script_list]
                encoded_val = [b64encode(x) for x in vals]
                tree.strings[i] = vals
                tree.string_types_str[str(i)] = summarize(vals)
                print("STRING%d: [TYPE:%s] [VALUE:%s]"
                      % (i, tree.string_types_str[str(i)],
                         ','.join(encoded_val)))
            elif node.tag == "Object":
                rs = extractObjectValues(
                    analyzeObjectResultHelper(script_list, i))
                type_dict = summarize_keyed("OBJECT", i, rs)
                tree.objects[i] = rs
                tree.object_types_str[str(i)] = type_dict
            elif node.tag == "Array":
                rs = extractObjectValues(
                    analyzeArrayResultHelper(script_list, i))
                type_dict = summarize_keyed("ARRAY", i, rs)
                tree.arrays[i] = rs
                tree.array_types_str[str(i)] = type_dict
        except Exception as e:
            # A failing node is reported and skipped; the remaining nodes
            # are still analysed.
            displayErrorMsg("generateTemplateBasedOnURLsFromFile",
                            "excpetion in analyzing node %d %s "
                            % (i, str(e)))
    return state['is_static']
def generateTemplateBasedOnURLsFromFile(path, dst_path):
    """Build template trees from the inline scripts of the URLs in ``path``.

    Every line of ``path`` is stripped and passed to ``fetchScripts``.
    Each inline script returned is parsed with ``analyzeJSCodesFinerBlock``
    (falling back to ``analyzeJSON``), wrapped in ``TemplateTree`` objects
    and grouped by tree key.

    Files written to ``dst_path``:

    * ``debug``          - per key: number of scripts seen and one example;
    * ``<count>_<key>``  - the distinct scripts grouped under that key;
    * ``trees``          - every serialized tree plus its reloaded form;
    * ``jsons``          - created empty.

    For JavaScript trees a node pattern is generated for every
    String/Object/Array node and stored on the tree.  All trees are sorted
    by ``get_length()``, serialized with ``dumps()`` and handed to
    ``storeTree``.

    Returns a dict ``{tree key: [url, ...]}`` listing the URLs of the
    scripts behind every node pattern for which ``is_insufficient()`` was
    true.
    """
    (scriptdict, debug_dict,
     total_script_blocks, total_uniq_script_blocks) = \
        _collectScriptsByTreeKey(path)

    with open(os.path.join(dst_path, 'debug'), 'w') as fw:
        for k in debug_dict:
            vals = debug_dict[k]
            fw.write("%d %s \n" % (len(vals), k))
            fw.write(" --EXAMPLE-- %s\n" % vals[0])

    # start to analyze trees
    # scriptdict[tree_key] = [(script, url, tree, index)]
    trees = []
    insufficient_urls = {}
    static_scripts = 0
    dynamic_scripts = 0
    for key in sorted(scriptdict.keys(), key=lambda k: len(scriptdict[k])):
        script_list = scriptdict[key]
        name = "%d_%s" % (len(script_list), key)
        with open(os.path.join(dst_path, name), 'w') as fw:
            for item in script_list:
                fw.write(item[1] + "||" + str(item[3]) + " " +
                         str(item[0]) + "\n")

            # make sure all template trees with the same key are the same
            lengths = [len(item[2].nodes) for item in script_list]
            if min(lengths) != max(lengths):
                fw.write("[ALERT] seq length is not consistent")
                continue
            seq_length = lengths[0]

            # only handle JavaScript for now
            tree = script_list[0][2]
            if tree.type == "json":
                print("the inline is json!")
                fw.write("[TODO]: the inline is json. This is next step\n")
                trees.append(tree)
                continue

            # process String/Object/Array nodes
            fw.write("start analyzeing values\n")
            is_static = _analyzeTreeValues(tree, key, script_list,
                                           seq_length, insufficient_urls)
            if is_static:
                static_scripts += len(script_list)
            else:
                dynamic_scripts += len(script_list)
            print("Done writing %d items for file %s "
                  % (len(script_list), name))
            trees.append(tree)

    # store trees
    trees = sorted(trees, key=lambda x: x.get_length())
    # Nothing is ever written to "jsons"; it is still created (truncated)
    # so the set of output files stays the same.
    open(os.path.join(dst_path, "jsons"), 'w').close()
    with open(os.path.join(dst_path, "trees"), 'w') as fw:
        for i, tree in enumerate(trees):
            tree_val = tree.dumps()
            url = scriptdict[tree.key][0][1]
            storeTree(url, tree.key, tree_val)
            fw.write("1 %.3d: %s\n" % (i, tree_val))
            # Reload the serialized form so the second line shows what a
            # consumer of the stored value gets back.
            new_tree = TemplateTree(None, None)
            new_tree.loads(tree_val)
            if tree.type == "js":
                fw.write("2 %.3d: %s\n" % (i, getTreeSeq(new_tree.nodes)))
            elif tree.type == 'json':
                fw.write("2 %.3d: %s\n" % (i, json.dumps(new_tree.nodes)))

    print("generate %d trees for %d scripts uniqe[%d]"
          % (len(trees), total_script_blocks, total_uniq_script_blocks))
    print("static_scripts:%d dynamic_scripts:%d"
          % (static_scripts, dynamic_scripts))
    return insufficient_urls


def _collectScriptsByTreeKey(path):
    """Fetch and parse the inline scripts of every URL listed in ``path``.

    Returns ``(scriptdict, debug_dict, total, unique)`` where
    ``scriptdict[key]`` is the list of distinct ``(script, url, tree,
    index)`` entries for a tree key, ``debug_dict[key]`` lists every script
    seen for the key, ``total`` counts all processed script blocks and
    ``unique`` counts the entries stored in ``scriptdict``.
    """
    scriptdict = {}
    debug_dict = {}
    counters = {'total': 0, 'unique': 0}

    def register(script, url, tree, index):
        # Record one script block; True when it was stored as a new entry.
        key = tree.key
        counters['total'] += 1
        if key not in scriptdict:
            scriptdict[key] = [(script, url, tree, index)]
            debug_dict[key] = [script]
            counters['unique'] += 1
            return True
        debug_dict[key].append(script)
        # Always look the group up by this tree's own key; the JSON path
        # used to read a ``key`` variable that was unbound or left over
        # from an earlier JavaScript block.
        if script in [entry[0] for entry in scriptdict[key]]:
            return False
        scriptdict[key].append((script, url, tree, index))
        counters['unique'] += 1
        return True

    with open(path) as f:
        for line in f:
            url = line.strip()
            print("process url " + url)
            _, inlines = fetchScripts(url)
            if inlines is None or len(inlines) == 0:
                print("no inlines for " + url)
                continue
            for inline in inlines:
                rs, sc = analyzeJSCodesFinerBlock(inline)
                if rs is None:
                    # Not parsable as JavaScript blocks: try JSON.
                    rs = analyzeJSON(inline)
                    if rs is None:
                        continue
                    register(inline, url, TemplateTree(rs, None), -1)
                    continue
                for index in range(len(rs)):
                    tree = TemplateTree(rs[index], None)
                    key = tree.key
                    is_new_key = key not in scriptdict
                    if register(sc[index], url, tree, index):
                        if is_new_key:
                            print(" add key %s" % key)
                        else:
                            print(" item %s has %d unique scripts"
                                  % (key, len(scriptdict[key])))

    return scriptdict, debug_dict, counters['total'], counters['unique']


def _analyzeTreeValues(tree, key, script_list, seq_length, insufficient_urls):
    """Generate the value patterns of every String/Object/Array node.

    ``tree`` is the tree of the first entry of ``script_list``; the
    patterns are stored on it.  Keys whose patterns are insufficient get
    the URLs of ``script_list`` appended to ``insufficient_urls[key]``.

    Returns True when every generated pattern has type
    ``StringType.CONST`` or ``StringType.INSUFFICIENT``.
    """
    state = {'is_static': True}

    def summarize(values):
        # Build the pattern for ``values``, update the bookkeeping and
        # return the serialized pattern.
        node_pattern = generateNodePattern(values)
        if (node_pattern.tp != StringType.CONST and
                node_pattern.tp != StringType.INSUFFICIENT):
            state['is_static'] = False
        dumped = node_pattern.dumps()
        if node_pattern.is_insufficient():
            insufficient_urls.setdefault(key, []).extend(
                item[1] for item in script_list)
        return dumped

    def summarize_keyed(label, i, rs):
        # One pattern per key of ``rs``; returns {key: serialized pattern}.
        type_dict = {}
        for k in rs:
            encoded_val = [b64encode(x) for x in rs[k]]
            type_dict[k] = summarize(rs[k])
            print("%s%d: [TYPE:%s] [KEY:%s][VALUE:%s]"
                  % (label, i, type_dict[k], k, ','.join(encoded_val)))
        return type_dict

    for i in range(seq_length):
        node = tree.nodes[i]
        try:
            if node.tag == "String":
                vals = [item[2].nodes[i].value for item in script_list]
                encoded_val = [b64encode(x) for x in vals]
                tree.strings[i] = vals
                tree.string_types_str[str(i)] = summarize(vals)
                print("STRING%d: [TYPE:%s] [VALUE:%s]"
                      % (i, tree.string_types_str[str(i)],
                         ','.join(encoded_val)))
            elif node.tag == "Object":
                rs = extractObjectValues(
                    analyzeObjectResultHelper(script_list, i))
                type_dict = summarize_keyed("OBJECT", i, rs)
                tree.objects[i] = rs
                tree.object_types_str[str(i)] = type_dict
            elif node.tag == "Array":
                rs = extractObjectValues(
                    analyzeArrayResultHelper(script_list, i))
                type_dict = summarize_keyed("ARRAY", i, rs)
                tree.arrays[i] = rs
                tree.array_types_str[str(i)] = type_dict
        except Exception as e:
            # A failing node is reported and skipped; the remaining nodes
            # are still analysed.
            displayErrorMsg("generateTemplateBasedOnURLsFromFile",
                            "excpetion in analyzing node %d %s "
                            % (i, str(e)))
    return state['is_static']