def processFile(js_file_name):
    candidates = []

    lexer = Lexer(js_file_name)
    iBuilder = IndexBuilder(lexer.tokenList)

    scopeAnalyst = ScopeAnalyst(js_file_name)
    nameOrigin = scopeAnalyst.nameOrigin
    isGlobal = scopeAnalyst.isGlobal
    nameDefScope2pos = scopeAnalyst.nameDefScope2pos

    for (name, def_scope) in nameOrigin.iterkeys():
        pos = nameDefScope2pos[(name, def_scope)]
        (lin, col) = iBuilder.revFlatMat[pos]
        scope = iBuilder.revTokMap[(lin, col)]
        glb = isGlobal.get((name, pos), True)

        if name != 'TOKEN_LITERAL_STRING' and \
                name != 'TOKEN_LITERAL_NUMBER':
            candidates.append((scope, name, pos, (lin, col), glb, def_scope))

    print
    print

    for c in sorted(candidates, key=lambda e: e[0]):
        (scope, name, pos, (lin, col), glb, def_scope) = c
        if name == 'n' or name == 'calendarEventId':
            print '\t', scope, name, pos, (lin, col), glb
            print '\t\t', def_scope
def testFiles(self):
    tf = [1, 5, 6, 7, 8, 9, 10, 11]
    #tf = [11]
    for i in tf:
        print("-----------------------------------------------------")
        lexed = Lexer(self.fileList[i - 1])
        ib = IndexBuilder(lexed.tokenList)
        #print(ib)
        sa = ScopeAnalyst(self.fileList[i - 1])
        print(sa)
        nameCount = {}
        #TODO: Grab only the non-globals to look at (get the start key and look it up)
        for variable in sa.nameDefScope2pos.keys():
            start = sa.nameDefScope2pos[variable]
            name = variable[0]
            if (not sa.isGlobal[(name, start)]):
                if (name in nameCount):
                    nameCount[name] += 1
                else:
                    nameCount[name] = 1
                print(str(name) + " : " + str(sa.nameDefScope2pos[variable]) +
                      " -> " + str(ib.revFlatMat[sa.nameDefScope2pos[variable]]) +
                      " Manual: " + str(self.file_definitions[i][name]))
                assert (ib.revFlatMat[sa.nameDefScope2pos[variable]][0]
                        in self.file_definitions[i][name])
        #Finally make sure that the count of definitions matches our manual check.
        for name, count in nameCount.iteritems():
            print(name + " : " + str(count) + " =?= " +
                  str(len(self.file_definitions[i][name])))
            assert (len(self.file_definitions[i][name]) == count)
def load(pth):
    lexer = Lexer(pth)
    iBuilder = IndexBuilder(lexer.tokenList)
    scopeAnalyst = ScopeAnalyst(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), pth))
    return (iBuilder, scopeAnalyst)
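# A minimal usage sketch for load(), not part of the original scripts: it
# assumes the same Lexer/IndexBuilder/ScopeAnalyst imports as the snippets
# around it and a hypothetical input path 'example.js'. Only attributes that
# appear elsewhere in this code base are touched.
def demoLoad():
    (iBuilder, scopeAnalyst) = load('example.js')
    for (name, def_scope), pos in scopeAnalyst.nameDefScope2pos.iteritems():
        (lin, col) = iBuilder.revFlatMat[pos]
        glb = scopeAnalyst.isGlobal.get((name, pos), True)
        print('%s defined at %s, global: %s' % (name, str((lin, col)), str(glb)))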
def testFiles(self):
    #Known bugs: The definitions of sum and numberEquals in test_file1 seem
    #to be pointing to the wrong instance...
    i = 1
    lexed = Lexer(self.fileList[0])
    ib = IndexBuilder(lexed.tokenList)
    sa = ScopeAnalyst(self.fileList[0])
    for variable in sa.nameDefScope2pos.keys():
        print(str(variable[0]) + " : " + str(sa.nameDefScope2pos[variable]) +
              " -> " + str(ib.revFlatMat[sa.nameDefScope2pos[variable]]))
def testMinifiableLines(self):
    expected = {}
    expected[0] = set([1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 15, 16, 17, 20])
    expected[5] = set([8, 9])
    for i in [0, 5]:
        ib = IndexBuilder(self.clearLexed[i].tokenList)
        sa = ScopeAnalyst(self.clearTextFiles[i])
        lines = sa.getMinifiableLines(ib)
        print("i:" + str(i))
        print(lines)
        print(expected[i])
        self.assertTrue(lines == expected[i])
        text = ib.get_text_on_lines_wo_literals(lines)
        print(text)
        print(len(text.split("\n")))
        print(len(expected[i]))
        self.assertTrue(len(text.split("\n")) == len(expected[i]))
def processFile(l):
    js_file_name = l
    candidates = []
    if (True):  #try:
        print(js_file_name)
        lexer = Lexer(js_file_name)
        return IndexBuilder(lexer.tokenList)
def processFile(l):
    js_file_name = l
    candidates = []

    try:
        lexer = Lexer(os.path.join(results_path, js_file_name))
        iBuilder = IndexBuilder(lexer.tokenList)

        scopeAnalyst = ScopeAnalyst(os.path.join(results_path, js_file_name))
        nameOrigin = scopeAnalyst.nameOrigin
        isGlobal = scopeAnalyst.isGlobal
        nameDefScope2pos = scopeAnalyst.nameDefScope2pos

        for (name, def_scope) in nameOrigin.iterkeys():
            pos = nameDefScope2pos[(name, def_scope)]
            (lin, col) = iBuilder.revFlatMat[pos]
            scope = iBuilder.revTokMap[(lin, col)]
            glb = isGlobal.get((name, pos), True)

            # print name, def_scope, pos, scope, glb #, (lin,col)

            # if not isGlobal.get((name, pos), True):
            #     scope = def_scope.replace("\"","")
            #     i = scope.find('[variables][_values]')
            #     if i > -1:
            #         scope = scope[:i+len('[variables][_values]')]
            #     i = scope.find('[functions][_values]')
            #     if i > -1:
            #         scope = scope[:i+len('[functions][_values]')]

            if name != 'TOKEN_LITERAL_STRING' and \
                    name != 'TOKEN_LITERAL_NUMBER':
                candidates.append((scope, name, glb))

    except:
        return (js_file_name, None, 'ScopeAnalyst fail')

    # print 'candidates------------------'
    # for candidate in candidates:
    #     print candidate

    return (js_file_name, 'OK', candidates)
def summarizeUnscopedTranslation(renaming_map, f_path, translation_strategy,
                                 output_path, base_name, name_candidates,
                                 name_positions, iBuilder):
    nc = []

    f_base = os.path.basename(f_path)
    training_strategy = f_base.split('.')[1]
    tmp_path = '%s.%s.js' % (f_base[:-3], translation_strategy)
    o_path = '%s.%s.unscoped.%s.js' % (base_name, training_strategy,
                                       translation_strategy)

    # print f_path, f_base, training_strategy, tmp_path, o_path, base_name

    writeTmpLines(renameHashed(iBuilder, name_positions, renaming_map), tmp_path)

    clear = Beautifier()
    ok = clear.run(tmp_path, os.path.join(output_path, o_path))
    if not ok:
        return False

    try:
        lexer = Lexer(os.path.join(output_path, o_path))
        iBuilder_local = IndexBuilder(lexer.tokenList)
        scopeAnalyst_local = ScopeAnalyst(os.path.join(output_path, o_path))
    except:
        return False

    nameOrigin = scopeAnalyst_local.nameOrigin
    isGlobal = scopeAnalyst_local.isGlobal

    for (name, def_scope) in nameOrigin.iterkeys():
        pos = scopeAnalyst_local.nameDefScope2pos[(name, def_scope)]

        if not False:  #isGlobal.get((name, pos), True):
            (lin, col) = iBuilder_local.revFlatMat[pos]
            (tok_lin, tok_col) = iBuilder_local.revTokMap[(lin, col)]

            nc.append(('%s.unscoped.%s' % (training_strategy,
                                           translation_strategy),
                       def_scope, tok_lin, tok_col,
                       isGlobal.get((name, pos), True),
                       name, '', ''))

    return nc
def processFile(l):
    js_file_path = l[0]
    pid = int(multiprocessing.current_process().ident)

    try:
        # Temp files to be created during processing
        path_tmp = 'tmp_%d.js' % pid
        path_tmp_b = 'tmp_%d.b.js' % pid

        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(path_tmp)
        except:
            cleanup(pid)
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        ok = clear.run(path_tmp, path_tmp_b)
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Beautifier fail')

        try:
            iBuilder_clear = IndexBuilder(Lexer(path_tmp_b).tokenList)
        except:
            cleanup(pid)
            return (js_file_path, None, 'IndexBuilder fail')

        n_lines = len(iBuilder_clear.tokens)
        max_line_len = max([len(l) for l in iBuilder_clear.tokens])

        cleanup(pid)
        return (js_file_path, n_lines, max_line_len)

    except Exception, e:
        cleanup(pid)
        return (js_file_path, None, str(e))
def processFile(js_file_path):
    try:
        js_text = open(os.path.join(files_root, js_file_path), 'r').read()

        # Strip comments, replace literals, etc
        try:
            prepro = WebLMPreprocessor(js_text)
            prepro_text = str(prepro)
        except:
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        try:
            lex_clear = WebLexer(beautified_text)
            tok1 = lex_clear.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        try:
            iBuilder1 = IndexBuilder(tok1)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        orig = []
        for _line_idx, line in enumerate(iBuilder1.tokens):
            orig.append(' '.join([t for (_tt, t) in line]) + "\n")

        return (js_file_path, orig)

    except Exception, e:
        return (js_file_path, None, str(e))
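# A sketch of how these per-file processFile() workers are typically driven,
# written here as an assumption rather than the project's actual driver: a
# multiprocessing.Pool maps processFile over a list of corpus paths and the
# per-file status tuples are collected for reporting. 'file_list' and
# 'driveCorpus' are hypothetical names.
import multiprocessing

def driveCorpus(file_list):
    pool = multiprocessing.Pool(processes=4)
    results = []
    for result in pool.imap_unordered(processFile, file_list):
        # Each result starts with the file path; the remaining fields are
        # either the payload or (None, error_message), depending on the variant.
        results.append(result)
    pool.close()
    pool.join()
    return results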
def testfileDebug(self):
    for f in self.fileList:
        print("---------------------------------- " + f +
              " ----------------------------------")
        orig = f + ".js"
        min = f + ".u.js"
        lo = Lexer(orig)
        lm = Lexer(min)
        print("---------------------------------- original text ----------------------------------")
        print(lo.programText)
        print("---------------------------------- minified text ----------------------------------")
        print(lm.programText)
        for id in self.ids:
            to_read = f + id + ".js"
            print("---------------------------------- " + to_read +
                  " ----------------------------------")
            lexed = Lexer(to_read)
            print("---------------------------------- text ----------------------------------")
            print(lexed.programText)
            print("---------------------------------- tokenlist ----------------------------------")
            print(lexed.tokenList)
            ib = IndexBuilder(lexed.tokenList)
            print("---------------------------------- IndexBuilder ----------------------------------")
            print(ib)
            sa = ScopeAnalyst(to_read)
            print("---------------------------------- ScopeAnalyst ----------------------------------")
            print(sa)
def processFile(l):
    base_name = l[0]
    js_file_path = l[1]
    print(base_name)
    print(js_file_path)
    #if(True):
    try:
        lexed = Lexer(js_file_path)
        ib = IndexBuilder(lexed.tokenList)
        sa = ScopeAnalyst(js_file_path)
        #num globals = all in is_global == True + all unique names
        #in name2CharPositions not in is_global
        base_global = set([name for name, value in sa.isGlobal.iteritems()
                           if value == True])
        #Get all known names in the file.
        known_names = set([name for name, value in sa.isGlobal.iteritems()])
        for name, loc in ib.name2CharPositions.iteritems():
            if (name not in known_names):  #if never seen, it's a global
                base_global.add(name)
        return [base_name, len(base_global)]
    except:
        return [base_name, None]
def processFile(l): js_file_path = l[0] # if True: try: js_text = open(os.path.join(corpus_root, js_file_path), 'r').read() # Strip comments, replace literals, etc try: prepro = WebLMPreprocessor(js_text) prepro_text = str(prepro) except: return (js_file_path, None, 'Preprocessor fail') # Pass through beautifier to fix layout clear = Beautifier() (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text) if not ok: return (js_file_path, None, 'Beautifier fail') # Minify ugly = Uglifier() (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text) if not ok: return (js_file_path, None, 'Uglifier fail') # Align minified and clear files, in case the beautifier # did something weird try: aligner = Aligner() (aligned_clear, aligned_minified) = aligner.web_align( WebLexer(tmp_beautified_text).tokenList, WebLexer(tmp_minified_text).tokenList) except: return (js_file_path, None, 'Aligner fail') # Pass through beautifier to fix layout (ok, beautified_text, _err) = clear.web_run(aligned_clear) if not ok: return (js_file_path, None, 'Beautifier fail') (ok, minified_text, _err) = clear.web_run(aligned_minified) if not ok: return (js_file_path, None, 'Beautifier fail') # Num tokens before vs after try: lex_clear = WebLexer(beautified_text) tok_clear = lex_clear.tokenList lex_ugly = WebLexer(minified_text) tok_ugly = lex_ugly.tokenList except: return (js_file_path, None, 'Lexer fail') # For now only work with minified files that have # the same number of tokens as the originals if not len(tok_clear) == len(tok_ugly): return (js_file_path, None, 'Num tokens mismatch') if beautified_text == minified_text: return (js_file_path, None, 'Not minified') try: iBuilder_ugly = IndexBuilder(lex_ugly.tokenList) except: return (js_file_path, None, 'IndexBuilder fail') try: scopeAnalyst = WebScopeAnalyst(minified_text) except: return (js_file_path, None, 'ScopeAnalyst fail') processed = [] # Try different renaming strategies (hash, etc) for r_strategy in RS.all(): try: # if True: # Rename input prior to translation preRen = PreRenamer() after_text = preRen.rename(r_strategy, iBuilder_ugly, scopeAnalyst) (ok, beautified_after_text, _err) = clear.web_run(after_text) if not ok: return (js_file_path, None, 'Beautifier fail') processed.append((r_strategy, beautified_after_text)) except: return (js_file_path, None, 'Renaming fail') with open(os.path.join(output_path, 'orig', js_file_path), 'w') as f: f.write(beautified_text) for (r_strategy, text) in processed: with open(os.path.join(output_path, r_strategy, js_file_path), 'w') as f: f.write(text) return (js_file_path, 'OK', None) except Exception, e: return (js_file_path, None, str(e).replace("\n", ""))
def getMosesTranslation(proxy, r_strategy, RS, a_beautifier, iBuilder_ugly, scopeAnalyst_ugly, debug_mode=False): """ A helper function so that we can run multiple different renaming strategies through moses in a more modular and hopefully parallelizable manner. It performs hashing/no hashing preparation of the file for the renaming strategy specified by r_stategy, and then calls the appropriate moses_server. Parameters ---------- proxy: A pointer to which port the appropriate moses server is listening in on for this particular renaming strategy. r_strategy: One of the renaming strategies from RenamingStrategies RS: A renaming strategies object. a_beautifier: a beautify object to make sure the renamed text is cleanly formatted. iBuilder_ugly: Index Builder for the minified file. scopeAnalyst_ugly: Scope Analyst for the minified file. start: The starting time for the preprocessing step. Used for performance metrics. debug_mode: Print debug information? (True/False - defaults to False) Returns ------- (status, error, translation, name_candidates, a_iBuilder, a_scopeAnalyst, a_name_positions, a_position_names, a_use_scopes, hash_name_map, pre_time, rn_time, m_time, post_start) status: Did this complete without error? If False, then the rest of the output besides error will be empty/null. error: What is the reason for the failure? If status is True (successful completion) this is "". translation: The raw Moses output name_candidates: The set of Moses suggestions for this renaming a_iBuilder,a_scopeAnalyst: Index Builder and Scope Analyst for this renaming a_name_positions, a_posistion_names, a_use_scopes: Addition tracking info hash_name_map: a map from the hashed names to the original minified names rn_time, m_time, lex_time, post_start: The duration of the renaming, Moses translation steps, and lexing steps along with the start time for the postprocessing of the Moses output. """ rn_start = time.time() #We need both the base_text and the hashed_text. preRen = PreRenamer() if (debug_mode): print("Tokens-------------------") print(iBuilder_ugly.tokens) print("Tokens-------------------") #We always need the non hashed names as a fallback. try: after_text = preRen.rename(r_strategy, iBuilder_ugly, scopeAnalyst_ugly) except: return (False, "Renaming failed for " + str(r_strategy), "", {}, None, None, {}, {}, {}, {}, 0, 0, 0, 0) (ok, beautified_after_text, _err) = a_beautifier.web_run(after_text) if not ok: return (False, "Beautifier failed on the renamed text for " + str(r_strategy), "", {}, None, None, {}, {}, {}, {}, 0, 0, 0, 0) # Align hashed and non hashed files, in case the beautifier # line wrapped the extended lines. try: aligner = Aligner() (aligned_after, aligned_before) = aligner.web_align( WebLexer(beautified_after_text).tokenList, WebLexer(iBuilder_ugly.get_text()).tokenList) except: return (False, "Aligner failed on the renamed text for " + str(r_strategy), "", {}, None, None, {}, {}, {}, {}, 0, 0, 0, 0) #print("--------Aligned After-------") #print(aligned_after) #print("----------------------------") a_lexer = WebLexer(aligned_after) a_iBuilder = IndexBuilder(a_lexer.tokenList) a_scopeAnalyst = WebScopeAnalyst(aligned_after) hash_name_map = {} if (r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO): #Something below here is buggy... 
orderedVarsMin = sorted(scopeAnalyst_ugly.name2defScope.keys(), key=lambda x: x[1]) orderedVarsHash = sorted(a_scopeAnalyst.name2defScope.keys(), key=lambda x: x[1]) #print("Min len: " + str(len(orderedVarsMin))) #print("Hash len: " + str(len(orderedVarsHash))) if (len(orderedVarsMin) != len(orderedVarsHash)): return (False, "Mismatch between minified and hashed names.", "", {}, a_iBuilder, a_scopeAnalyst, {}, {}, {}, {}, 0, 0, 0, 0) for i in range(0, len(orderedVarsHash)): name_hash = orderedVarsHash[i][0] def_scope_hash = a_scopeAnalyst.name2defScope[orderedVarsHash[i]] name_min = orderedVarsMin[i][0] def_scope_min = scopeAnalyst_ugly.name2defScope[orderedVarsMin[i]] hash_name_map[(name_hash, def_scope_hash)] = (name_min, def_scope_min) if (debug_mode): print("HASH NAME MAP LEN: " + str(len(hash_name_map))) # We can switch this back once we train models on a corpus with literals # lx = WebLexer(a_iBuilder.get_text()) lx = WebLexer(a_iBuilder.get_text_wo_literals()) #print("-----------------Moses In ----------------------") #print(lx) #print("------------------------------------------------") #print(a_iBuilder.charPosition2Name) #print("------------------------------------------------") #line_subset = a_scopeAnalyst.getMinifiableLines(a_iBuilder) #line_list = sorted(list(line_subset)) #line_map = {} #m_line = 0 #for next_line in line_list: # line_map[m_line] = next_line # m_line += 1 #lx = WebLexer(a_iBuilder.get_text_on_lines_wo_literals(line_subset)) #Performance measures -> wrap up the preprocessing/ renaming #phases end = time.time() rn_time = end - rn_start m_start = time.time() #if(debug_mode): # print("Invoking Moses.") # print(lx.collapsedText) # Translate renamed input #md = WebMosesDecoder(proxy) #(ok, translation, _err) = md.run(lx.collapsedText) (ok, translation, _err) = segmentedTranslation(lx, SEGMENTED_TRANS_SIZE, proxy, debug_mode) if not ok: return (False, "Moses server failed for " + str(r_strategy), translation, {}, a_iBuilder, a_scopeAnalyst, {}, {}, {}, hash_name_map, 0, 0, 0, 0) m_end = time.time() m_time = m_end - m_start post_start = time.time() (a_name_positions, a_position_names, a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst) if translation is not None: # Parse moses output mp = MosesParser() if (debug_mode): print(translation) name_candidates = mp.parse(translation, a_iBuilder, a_position_names) #, #a_scopeAnalyst) #A slightly modified version of parse to remap the moses #output lines to the correct original lines. #name_candidates = mp.parse_subset(translation, # a_iBuilder, # a_position_names, # line_map) lex_time = lx.build_time + a_lexer.build_time return (True, "", translation, name_candidates, a_iBuilder, a_scopeAnalyst, a_name_positions, a_position_names, a_use_scopes, hash_name_map, rn_time, m_time, lex_time, post_start)
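# A hedged caller sketch for getMosesTranslation(), based only on the
# signature and docstring above; the surrounding setup (proxy, r_strategy,
# RS, clear, iBuilder_ugly, scopeAnalyst_ugly) is assumed to exist as in the
# other snippets in this file.
(status, error, translation, name_candidates,
 a_iBuilder, a_scopeAnalyst,
 a_name_positions, a_position_names, a_use_scopes,
 hash_name_map, rn_time, m_time, lex_time, post_start) = \
    getMosesTranslation(proxy, r_strategy, RS, clear,
                        iBuilder_ugly, scopeAnalyst_ugly)
if not status:
    print(error)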
def processFile(l): js_file_path = l[0] base_name = os.path.splitext(os.path.basename(js_file_path))[0] temp_files = { 'orig': '%s.js' % base_name, 'minified': '%s.u.js' % base_name, 'n2p': '%s.n2p.js' % base_name } for r_strategy in RS.all(): temp_files['%s' % (r_strategy)] = \ '%s.%s.js' % (base_name, r_strategy) for c_strategy in CS.all(): temp_files['%s_%s' % (r_strategy, c_strategy)] = \ '%s.%s.%s.js' % (base_name, r_strategy, c_strategy) for k, v in temp_files.iteritems(): temp_files[k] = os.path.join(output_path, v) candidates = [] #Minified Name -> Original Name (name, def_scope) -> (name, def_scope) min_name_map = {} #Hashed Name -> Minified Name (name, def_scope) -> (name, def_scope) hash_name_map = {} #Minified Name -> jsnice name (name, def_scope) -> (name, def_scope) jsnice_name_map = {} #Output Lines for the suggestoin_model.csv model_rows = [] try: js_text = open(os.path.join(corpus_root, js_file_path), 'r').read() # Strip comments, replace literals, etc try: prepro = WebLMPreprocessor(js_text) prepro_text = str(prepro) except: return (js_file_path, None, 'Preprocessor fail') # Pass through beautifier to fix layout clear = Beautifier() (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text) if not ok: return (js_file_path, None, 'Beautifier fail') # Minify ugly = Uglifier() (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text) if not ok: return (js_file_path, None, 'Uglifier fail') # Align minified and clear files, in case the beautifier # did something weird try: aligner = Aligner() (aligned_clear, aligned_minified) = aligner.web_align( WebLexer(tmp_beautified_text).tokenList, WebLexer(tmp_minified_text).tokenList) except: return (js_file_path, None, 'Aligner fail') # Pass through beautifier to fix layout (ok, beautified_text, _err) = clear.web_run(aligned_clear) if not ok: return (js_file_path, None, 'Beautifier fail') (ok, minified_text, _err) = clear.web_run(aligned_minified) if not ok: return (js_file_path, None, 'Beautifier fail') # Num tokens before vs after try: lex_clear = WebLexer(beautified_text) tok_clear = lex_clear.tokenList lex_ugly = WebLexer(minified_text) tok_ugly = lex_ugly.tokenList except: return (js_file_path, None, 'Lexer fail') # For now only work with minified files that have # the same number of tokens as the originals if not len(tok_clear) == len(tok_ugly): return (js_file_path, None, 'Num tokens mismatch') if beautified_text == minified_text: return (js_file_path, None, 'Not minified') #try: # iBuilder_clear = IndexBuilder(lex_clear.tokenList) #except: # return (js_file_path, None, "IndexBuilder fail on original file.") try: iBuilder_ugly = IndexBuilder(lex_ugly.tokenList) except: return (js_file_path, None, 'IndexBuilder fail') with open(temp_files['orig'], 'w') as f: f.write(beautified_text) with open(temp_files['minified'], 'w') as f: f.write(minified_text) # try: # orig_lexer = WebLexer(beautified_text) # orig_iBuilder = IndexBuilder(orig_lexer.tokenList) # orig_scopeAnalyst = WebScopeAnalyst(beautified_text) # except: # return (js_file_path, None, 'IndexBuilder/Scoper fail on original') ######################## # Nice2Predict ######################## # BV: Next block left out until I figure out the pipe issue # BV: Update: I couldn't pipe input to N2P. 
TODO: FIX # Run the JSNice from http://www.nice2predict.org unuglifyJS = UnuglifyJS() (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified']) if not ok: return (js_file_path, None, 'Nice2Predict fail') (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text) if not ok: return (js_file_path, None, 'Beautifier fail') with open(temp_files['n2p'], 'w') as f: f.write(n2p_text_beautified) try: n2p_lexer = WebLexer(n2p_text_beautified) n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList) n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified) except: return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail') # Save some translation stats to compare different methods ts = TranslationSummarizer() candidates += [['n2p', ''] + x for x in ts.compute_summary_unscoped( n2p_iBuilder, n2p_scopeAnalyst)] ################################################ # All other JSNaughty variants ################################################ try: scopeAnalyst = WebScopeAnalyst(minified_text) except: return (js_file_path, None, 'ScopeAnalyst minified fail') try: scopeAnalyst_clear = WebScopeAnalyst(beautified_text) except: return (js_file_path, None, 'ScopeAnalyst clear fail') #if(not check(iBuilder_clear, scopeAnalyst_clear, n2p_iBuilder, n2p_scopeAnalyst)): # return (js_file_path, None, 'JsNice restructured file. Skipping..') #Map the original names to the minified counterparts. orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(), key=lambda x: x[1]) orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(), key=lambda x: x[1]) orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(), key=lambda x: x[1]) if (len(orderedVarsOld) != len(orderedVarsNew)): return (js_file_path, None, "Old and New Name lists different length") if (len(orderedVarsOld) != len(orderedVarsN2p)): return (js_file_path, None, "JsNice and Old Name lists different length") for i in range(0, len(orderedVarsOld)): name_old = orderedVarsOld[i][0] def_scope_old = scopeAnalyst_clear.name2defScope[orderedVarsOld[i]] name_new = orderedVarsNew[i][0] def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]] min_name_map[(name_new, def_scope_new)] = (name_old, def_scope_old) name_n2p = orderedVarsN2p[i][0] def_scope_n2p = scopeAnalyst.name2defScope[orderedVarsNew[i]] jsnice_name_map[(name_new, def_scope_new)] = (name_n2p, def_scope_n2p) #Once we have the scopeAnalyst, iBuilder, and tokenlist for the minified #version, we can get the name properties # vm = VariableMetrics(scopeAnalyst, iBuilder_ugly, lex_ugly.tokenList) # variableKeySet = vm.getVariables() # for variableKey in variableKeySet: # name_features[variableKey] = vm.getNameMetrics(variableKey) (_name_positions, \ position_names, _use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst) # Try different renaming strategies (hash, etc) for r_strategy, proxy in proxies: # Rename input prior to translation preRen = PreRenamer() after_text = preRen.rename(r_strategy, iBuilder_ugly, scopeAnalyst) (ok, beautified_after_text, _err) = clear.web_run(after_text) if not ok: return (js_file_path, None, 'Beautifier fail') # Save renamed input to disk for future inspection with open(temp_files['%s' % (r_strategy)], 'w') as f: f.write(beautified_after_text) a_lexer = WebLexer(beautified_after_text) a_iBuilder = IndexBuilder(a_lexer.tokenList) a_scopeAnalyst = WebScopeAnalyst(beautified_after_text) if (r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO): # try: # scopeAnalyst_hash = WebScopeAnalyst(beautified_after_text) #This should be beautified_after_text instead of after_text # 
except: # return (js_file_path, None, "ScopeAnalyst hash fail") #Map the hashed names to the minified counterparts. orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(), key=lambda x: x[1]) orderedVarsHash = sorted(a_scopeAnalyst.name2defScope.keys(), key=lambda x: x[1]) if (len(orderedVarsMin) != len(orderedVarsHash)): return (js_file_path, None, "Hash and Min lists different length") for i in range(0, len(orderedVarsHash)): name_hash = orderedVarsHash[i][0] def_scope_hash = a_scopeAnalyst.name2defScope[ orderedVarsHash[i]] name_min = orderedVarsMin[i][0] def_scope_min = scopeAnalyst.name2defScope[ orderedVarsMin[i]] hash_name_map[(name_hash, def_scope_hash)] = (name_min, def_scope_min) # We can switch this back once we train models on a corpus with literals # lx = WebLexer(a_iBuilder.get_text()) lx = WebLexer(a_iBuilder.get_text_wo_literals()) # Translate renamed input md = WebMosesDecoder(proxy) (ok, translation, _err) = md.run(lx.collapsedText) if not ok: return (js_file_path, None, 'Moses translation fail') (a_name_positions, a_position_names, a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst) nc = [] if translation is not None: # Parse moses output mp = MosesParser() name_candidates = mp.parse(translation, a_iBuilder, a_position_names) # name_candidates is a dictionary of dictionaries: # keys are (name, def_scope) tuples; # values are suggested translations with the sets # of line numbers on which they appear. # Update name_candidates with some default values # (in this case the translation without any renaming) # if the translation is empty if r_strategy == RS.NONE: # RS.NONE should always be first, by construction name_candidates_default = name_candidates scopeAnalyst_default = a_scopeAnalyst iBuilder_default = a_iBuilder else: for key_default, suggestions in name_candidates_default.iteritems( ): # (name_default, def_scope_default) = key_default pos_default = scopeAnalyst_default.nameDefScope2pos[ key_default] (lin, col) = iBuilder_default.revFlatMat[pos_default] (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)] (name, def_scope) = a_position_names[line_num][line_idx] key = (name, def_scope) for name_translation, lines in suggestions.iteritems(): name_candidates.setdefault(key, {}) name_candidates[key].setdefault( name_translation, set([])) name_candidates[key][name_translation].update( lines) # **** BV: This might be all we need to combine Naughty & Nice name_candidates_copy = deepcopy(name_candidates) for key, suggestions in name_candidates_copy.iteritems(): if r_strategy == RS.NONE: (name_n2p, def_scope_n2p) = jsnice_name_map[key] else: (name_n2p, def_scope_n2p) = jsnice_name_map[hash_name_map.get( key, key)] for name_translation, lines in suggestions.iteritems(): name_candidates.setdefault(key, {}) name_candidates[key].setdefault(name_n2p, set([])) name_candidates[key][name_n2p].update(lines) cc = ConsistencyController(debug_mode=False) ts = TranslationSummarizer() # An identifier may have been translated inconsistently # across different lines (Moses treats each line independently). # Try different strategies to resolve inconsistencies, if any for c_strategy in CS.all(): # Compute renaming map (x -> length, y -> width, ...) 
# Note that x,y here are names after (hash) renaming (temp_renaming_map, seen) = cc.computeRenaming( c_strategy, name_candidates, a_name_positions, a_use_scopes, a_iBuilder, lm_path, {}, hash_name_map) # After computeRenaming, we have both the entropies stored # if we are in LMDrop strategy and have the suggestions # frequency from name_candidates. Fill in suggestion_Features # if(c_strategy == CS.LMDROP and r_strategy not in suggestion_features): # assert(cc.suggestion_cache != None) # suggestion_features[r_strategy] = {} # """ # name_candidates: dict # name_candidates[(name, def_scope)][name_translation] # = set of line numbers in the translation # """ # for variableKey, suggestionDictionary in name_candidates.iteritems(): # for suggestionName, linesSuggested in suggestionDictionary.iteritems(): # # # I need to revert variableKey[0] in the suggestion from its hash to its original minified name. # if(r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO): # unhashedKey = hash_name_map[variableKey] # suggestionKey = (unhashedKey[0], unhashedKey[1], suggestionName) # else: # suggestionKey = (variableKey[0], variableKey[1], suggestionName) # # entropyVals = cc.suggestion_cache.getEntropyStats(variableKey, suggestionName) # if(entropyVals != (ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR)): # suggestionValue = [len(linesSuggested)] + \ # list(getSuggestionStats(suggestionName)) + \ # list(entropyVals) # # suggestion_features[r_strategy][suggestionKey] = suggestionValue # Fall back on original names in input, if # no translation was suggested postRen = PostRenamer() renaming_map = postRen.updateRenamingMap( a_name_positions, position_names, a_use_scopes, temp_renaming_map, seen, r_strategy) # Apply renaming map and save output for future inspection renamed_text = postRen.applyRenaming( a_iBuilder, a_name_positions, renaming_map) (ok, beautified_renamed_text, _err) = clear.web_run(renamed_text) if not ok: return (js_file_path, None, 'Beautifier fail') with open(temp_files['%s_%s' % (r_strategy, c_strategy)], 'w') as f: f.write(beautified_renamed_text) # Save some stats about which names were renamed to what # This is what enables the comparison between the different # methods. r = [[c_strategy] + x for x in ts.compute_summary_scoped( renaming_map, name_candidates, a_iBuilder, a_scopeAnalyst)] if not r: return (js_file_path, None, 'Compute summary failed') nc += r if nc: candidates += [[r_strategy] + x for x in nc] #create the rows for the suggestion_model.csv # for r_strategy in RS.all(): # for suggestionKey, s_feat in suggestion_features[r_strategy].iteritems(): # variableKey = (suggestionKey[0], suggestionKey[1]) # original_name = min_name_map[variableKey][0] # js_nice_name = jsnice_name_map[variableKey][0] # n_feat = list(name_features[variableKey]) # #Convert the def_scope to an equivalent, but smaller, easier to read key: (line_num, token_num) # newKey = scopeAnalyst.nameDefScope2pos[variableKey] # (keyLine, keyToken) = iBuilder_ugly.revFlatMat[newKey] # model_rows.append([original_name, r_strategy, suggestionKey[0], keyLine, keyToken, suggestionKey[2], js_nice_name] + n_feat + s_feat) return (js_file_path, 'OK', candidates, model_rows) except Exception, e: return (js_file_path, None, str(e).replace("\n", ""), model_rows)
input_file = os.path.abspath(sys.argv[1])
output_file = os.path.abspath(sys.argv[2])
mode = int(sys.argv[3])

prepro = Preprocessor(input_file)
prepro.write_temp_file('tmp.js')

clear = Beautifier()
ok = clear.run('tmp.js', 'tmp.b.js')

lexer = Lexer('tmp.b.js')
iBuilder = IndexBuilder(lexer.tokenList)

scopeAnalyst = ScopeAnalyst(os.path.join(
    os.path.dirname(os.path.realpath(__file__)), 'tmp.b.js'))

hash_renaming = renameUsingHashDefLine(scopeAnalyst,
                                       iBuilder,
                                       twoLines=False,
                                       debug=mode)

with open(output_file, 'w') as f:
    f.writelines(hash_renaming)

# writeTmpLines(hash_renaming, output_file)
# clear = Beautifier()
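# Hypothetical invocation of the script above (the actual script name is not
# shown here); it takes an input JS file, an output path, and an integer
# debug-mode flag, in that order:
#
#     python <this_script>.py input.js output.hashed.js 0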
def processFile(l): js_file_path = l[0] base_name = os.path.splitext(os.path.basename(js_file_path))[0] pid = int(multiprocessing.current_process().ident) temp_files = { 'path_tmp': 'tmp_%d.js' % pid, 'path_tmp_b': 'tmp_%d.b.js' % pid, 'path_tmp_b_1': 'tmp_%d.b.1.js' % pid, 'path_tmp_b_2': 'tmp_%d.b.2.js' % pid, 'path_tmp_b_a': 'tmp_%d.b.a.js' % pid, 'path_tmp_u': 'tmp_%d.u.js' % pid, 'path_tmp_u_a': 'tmp_%d.u.a.js' % pid, 'path_tmp_unugly': 'tmp_%d.n2p.js' % pid, 'path_tmp_unugly_1': 'tmp_%d.n2p.1.js' % pid, 'path_tmp_unugly_2': 'tmp_%d.n2p.2.js' % pid, 'path_tmp_jsnice': 'tmp_%d.jsnice.js' % pid, 'f2': 'tmp_%d.no_renaming.js' % pid, # 'f3': 'tmp_%d.basic_renaming.js' % pid, # 'f4': 'tmp_%d.hash_renaming.js' % pid, 'f5': 'tmp_%d.hash_def_one_renaming.js' % pid, # 'f6': 'tmp_%d.hash_def_two_renaming.js' % pid, 'f7': 'tmp_%d.hash_def_one_renaming_fb.js' % pid, 'path_orig': os.path.join(output_path, '%s.js' % base_name), 'path_ugly': os.path.join(output_path, '%s.u.js' % base_name), 'path_unugly': os.path.join(output_path, '%s.n2p.js' % base_name), 'path_jsnice': os.path.join(output_path, '%s.jsnice.js' % base_name) } # for strategy in ['js', 'lm.js', 'len.js', 'freqlen.js']: # for renaming in ['no_renaming', 'hash_def_one_renaming']: # temp_files['path_tmp_%s_%s' % (renaming, strategy)] = \ # 'tmp_%d.%s.%s' % (pid, renaming, strategy) candidates = [] # if True: try: # Strip comments, replace literals, etc try: prepro = Preprocessor(os.path.join(corpus_root, js_file_path)) prepro.write_temp_file(temp_files['path_tmp']) except: cleanup(temp_files) return (js_file_path, None, 'Preprocessor fail') # Pass through beautifier to fix layout clear = Beautifier() ok = clear.run(temp_files['path_tmp'], temp_files['path_tmp_b']) if not ok: cleanup(temp_files) return (js_file_path, None, 'Beautifier fail') # # Pass through beautifier to fix layout # clear = Beautifier() # ok = clear.run(temp_files['path_tmp'], # temp_files['path_tmp_b_1']) # if not ok: # cleanup(temp_files) # return (js_file_path, None, 'Beautifier fail') # # jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename']) # # (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp_b_1'], # temp_files['path_tmp_b_2']) # if not ok: # cleanup(temp_files) # print js_file_path, _err # return (js_file_path, None, 'JSNice Beautifier fail') # # ok = clear.run(temp_files['path_tmp_b_2'], # temp_files['path_tmp_b']) # if not ok: # cleanup(temp_files) # return (js_file_path, None, 'Beautifier fail') # # # # Weird JSNice renamings despite --no-rename # try: # before = set([token for (token, token_type) in # Lexer(temp_files['path_tmp_b_1']).tokenList # if is_token_subtype(token_type, Token.Name)]) # after = set([token for (token, token_type) in # Lexer(temp_files['path_tmp_b']).tokenList # if is_token_subtype(token_type, Token.Name)]) # # if not before == after: # return (js_file_path, None, 'Weird JSNice renaming') # # except: # cleanup(temp_files) # return (js_file_path, None, 'Lexer fail') # Minify ugly = Uglifier() ok = ugly.run(temp_files['path_tmp_b'], temp_files['path_tmp_u']) if not ok: cleanup(temp_files) return (js_file_path, None, 'Uglifier fail') # Num tokens before vs after try: tok_clear = Lexer(temp_files['path_tmp_b']).tokenList tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList except: cleanup(temp_files) return (js_file_path, None, 'Lexer fail') # For now only work with minified files that have # the same number of tokens as the originals if not len(tok_clear) == len(tok_ugly): cleanup(temp_files) return (js_file_path, None, 
'Num tokens mismatch') # Align minified and clear files, in case the beautifier # did something weird try: aligner = Aligner() # This is already the baseline corpus, no (smart) renaming yet aligner.align(temp_files['path_tmp_b'], temp_files['path_tmp_u']) except: cleanup(temp_files) return (js_file_path, None, 'Aligner fail') if open(temp_files['path_tmp_b']).read() == \ open(temp_files['path_tmp_u']).read(): cleanup(temp_files) return (js_file_path, None, 'Not minified') try: lex_ugly = Lexer(temp_files['path_tmp_u_a']) iBuilder_ugly = IndexBuilder(lex_ugly.tokenList) except: cleanup(temp_files) return (js_file_path, None, 'IndexBuilder fail') ############################################################ # From now on only work with path_tmp_b_a and path_tmp_u_a ############################################################ # Store original and uglified versions ok = clear.run(temp_files['path_tmp_b_a'], temp_files['path_orig']) if not ok: cleanup(temp_files) return (js_file_path, None, 'Beautifier fail') ok = clear.run(temp_files['path_tmp_u_a'], temp_files['path_ugly']) if not ok: cleanup(temp_files) return (js_file_path, None, 'Beautifier fail') # Run the JSNice from http://www.nice2predict.org unuglifyJS = UnuglifyJS() (ok, _out, _err) = unuglifyJS.run(temp_files['path_tmp_u_a'], temp_files['path_tmp_unugly']) if not ok: cleanup(temp_files) return (js_file_path, None, 'Nice2Predict fail') ok = clear.run(temp_files['path_tmp_unugly'], temp_files['path_unugly']) if not ok: cleanup(temp_files) return (js_file_path, None, 'Beautifier fail') # ok = clear.run(temp_files['path_tmp_unugly'], # temp_files['path_tmp_unugly_1']) # if not ok: # cleanup(temp_files) # return (js_file_path, None, 'Beautifier fail') # # (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp_unugly_1'], # temp_files['path_tmp_unugly_2']) # if not ok: # cleanup(temp_files) # print js_file_path, _err # return (js_file_path, None, 'JSNice Beautifier fail') # # ok = clear.run(temp_files['path_tmp_unugly_2'], # temp_files['path_unugly']) # if not ok: # cleanup(temp_files) # return (js_file_path, None, 'Beautifier fail') try: lexer = Lexer(temp_files['path_unugly']) iBuilder = IndexBuilder(lexer.tokenList) except: cleanup(temp_files) return (js_file_path, None, 'IndexBuilder fail') try: scopeAnalyst = ScopeAnalyst( os.path.join(os.path.dirname(os.path.realpath(__file__)), temp_files['path_unugly'])) nameOrigin = scopeAnalyst.nameOrigin isGlobal = scopeAnalyst.isGlobal for (name, def_scope) in nameOrigin.iterkeys(): pos = scopeAnalyst.nameDefScope2pos[(name, def_scope)] (lin, col) = iBuilder.revFlatMat[pos] (tok_lin, tok_col) = iBuilder.revTokMap[(lin, col)] candidates.append(('Nice2Predict', def_scope, tok_lin, tok_col, isGlobal.get((name, pos), True), name, '', '')) except: cleanup(temp_files) return (js_file_path, None, 'ScopeAnalyst fail') # # Run the JSNice from http://www.jsnice.org # jsNice = JSNice() # (ok, _out, _err) = jsNice.run(temp_files['path_tmp_u_a'], # temp_files['path_tmp_jsnice']) # if not ok: # cleanup(temp_files) # return (js_file_path, None, 'JSNice fail') # # ok = clear.run(temp_files['path_tmp_jsnice'], # temp_files['path_jsnice']) # if not ok: # cleanup(temp_files) # return (js_file_path, None, 'Beautifier fail') # # try: # lexer = Lexer(temp_files['path_jsnice']) # iBuilder = IndexBuilder(lexer.tokenList) # except: # cleanup(temp_files) # return (js_file_path, None, 'IndexBuilder fail') # # try: # scopeAnalyst = ScopeAnalyst(os.path.join( # os.path.dirname(os.path.realpath(__file__)), # 
temp_files['path_jsnice'])) # nameOrigin = scopeAnalyst.nameOrigin # isGlobal = scopeAnalyst.isGlobal # # for (name, def_scope) in nameOrigin.iterkeys(): # # pos = scopeAnalyst.nameDefScope2pos[(name, def_scope)] # (lin,col) = iBuilder.revFlatMat[pos] # (tok_lin,tok_col) = iBuilder.revTokMap[(lin,col)] # # candidates.append(('JSNice', def_scope, # tok_lin, tok_col, # isGlobal.get((name, pos), True), # name, '','')) # except: # cleanup(temp_files) # return (js_file_path, None, 'ScopeAnalyst fail') # Compute scoping: name2scope is a dictionary where keys # are (name, start_index) tuples and values are scope identifiers. # Note: start_index is a flat (unidimensional) index, # not a (line_chr_idx, col_chr_idx) index. try: scopeAnalyst = ScopeAnalyst( os.path.join(os.path.dirname(os.path.realpath(__file__)), temp_files['path_tmp_u_a'])) except: cleanup(temp_files) return (js_file_path, None, 'ScopeAnalyst fail') # Baseline translation: No renaming, no scoping no_renaming = [] for _line_idx, line in enumerate(iBuilder_ugly.tokens): no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n") with open(temp_files['f2'], 'w') as f_no_renaming: f_no_renaming.writelines(no_renaming) moses = MosesDecoder(ini_path=os.path.join(ini_path, \ 'train.no_renaming', 'tuning', 'moses.ini')) (_moses_ok, translation_no_renaming, _err) = moses.run(temp_files['f2']) nc = processTranslationUnscoped(translation_no_renaming, iBuilder_ugly, lm_path, temp_files['f2'], output_path, base_name) if nc: candidates += nc # translation, iBuilder, lm_path, # f_path, output_path, base_name # Default translation: No renaming # no_renaming = [] # for _line_idx, line in enumerate(iBuilder_ugly.tokens): # no_renaming.append(' '.join([t for (_tt,t) in line]) + "\n") # # with open(temp_files['f2'], 'w') as f_no_renaming: # f_no_renaming.writelines(no_renaming) # # moses = MosesDecoder(ini_path=os.path.join(ini_path, \ # 'train.no_renaming', 'tuning', 'moses.ini')) # (_moses_ok, translation, _err) = moses.run(temp_files['f2']) nc = processTranslationScoped(translation_no_renaming, iBuilder_ugly, scopeAnalyst, lm_path, temp_files['f2'], output_path, base_name) if nc: candidates += nc # More complicated renaming: collect the context around # each name (global variables, API calls, punctuation) # and build a hash of the concatenation. 
hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst, iBuilder_ugly, twoLines=False, debug=False) with open(temp_files['f5'], 'w') as f_hash_def_one_renaming: f_hash_def_one_renaming.writelines(hash_def_one_renaming) # moses = MosesDecoder(ini_path=os.path.join(ini_path, \ # 'train.hash_def_one_renaming', 'tuning', 'moses.ini')) # (_moses_ok, # translation_hash_renaming, # _err) = moses.run(temp_files['f5']) mosesParams = {} mosesParams["text"] = hash_def_one_renaming #lex_ugly.collapsedText #mosesParams["align"] = "true" #mosesParams["report-all-factors"] = "true" mresults = proxy.translate( mosesParams) # __request("translate", mosesParams) rawText = Postprocessor(mresults["nbest"]) translation_hash_renaming = rawText.getProcessedOutput() nc = processTranslationScoped(translation_hash_renaming, iBuilder_ugly, scopeAnalyst, lm_path, temp_files['f5'], output_path, base_name) if nc: candidates += nc # nc = processTranslationScopedFallback(translation_hash_renaming, # translation_no_renaming, # iBuilder_ugly, # scopeAnalyst, # lm_path, # temp_files['f7'], # output_path, # base_name) # if nc: # candidates += nc cleanup(temp_files) cleanupRenamed(pid) return (js_file_path, 'OK', candidates) except Exception, e: cleanup(temp_files) cleanupRenamed(pid) return (js_file_path, None, str(e).replace("\n", ""))
def testIndexBuilder(self):
    '''
    Check that the index builder has correct values for the test files.
    '''
    ib1 = IndexBuilder(self.clearLexed[0].tokenList)
    '''
    # - map from (line,col) position to name
    self.charPosition2Name = {}
    # - map from name to list of (line,col) positions
    self.name2CharPositions = {}
    # - map from (line,col) position to flat position
    self.flatMap = {}
    # - map from flat position to (line,col)
    self.revFlatMat = {}
    # - map from (token_line, token_column) position in the
    #   bidimensional list of tokens to (line,col) text position
    self.tokMap = {}
    # - map from (line,col) position to (token_line, token_column)
    #   position in the bidimensional list of tokens
    self.revTokMap = {}
    '''
    print([item[1] for item in self.clearLexed[0].tokenList])
    print(ib1.charPosition2Name)
    print(len(ib1.charPosition2Name))
    #print(len(ib1.charPosition2Name) == 53)
    #for i in range(0,22):
    #    linecount = 0
    #    for j in range(0, 110):
    #        if((i,j) in ib1.charPosition2Name):
    #            linecount += 1
    #    print("Line " + str(i+1) + " has " + str(linecount) + " variables.")

    #Test charPosition2Name
    self.assertTrue(len(ib1.charPosition2Name) == 53)
    self.assertTrue(ib1.charPosition2Name[(0, 4)] == u'geom2d')
    self.assertTrue(ib1.charPosition2Name[(2, 8)] == u'a')
    self.assertTrue(ib1.charPosition2Name[(15, 13)] == u'mix')
    self.assertTrue(ib1.charPosition2Name[(16, 17)] == u'k')

    #Test name2charPositions
    self.assertTrue(len(ib1.name2CharPositions) == 16)
    self.assertTrue(sum([len(value) for key, value
                         in ib1.name2CharPositions.iteritems()]) == 53)
    self.assertTrue(len(ib1.name2CharPositions[u'x']) == 7)
    self.assertTrue(len(ib1.name2CharPositions[u'Vector2d']) == 4)

    #Test flatMap
    self.assertTrue(ib1.flatMap[(1, 8)] == 34)
    self.assertTrue(ib1.flatMap[(3, 22)] == 128)

    #Test revFlatMap
    #Typo Bug: revFlatMat or revFlatMap?
    self.assertTrue(len(ib1.flatMap) == len(ib1.revFlatMat))
    for key, value in ib1.flatMap.iteritems():
        self.assertTrue(ib1.revFlatMat[value] == key)

    #Test tokMap and revTokMap
    #These are supposed to be different? Yes, includes maps to whitespace.
    #(so leading whitespace also maps to identifiers)
    print(len(ib1.tokMap))
    print(len(ib1.revTokMap))
    #self.assertTrue(len(ib1.tokMap) == len(ib1.revTokMap))
    #i = 0
    for key, value in ib1.tokMap.iteritems():
        if (value in ib1.revTokMap.keys()):
            self.assertTrue(ib1.revTokMap[value] == key)
def processFile(row): js_file_path = os.path.join(corpus_root, row[0]) pid = int(multiprocessing.current_process().ident) base_name = os.path.splitext(os.path.basename(js_file_path))[0] # Temp files to be created during processing temp_files = { 'path_tmp': 'tmp_%d.js' % pid, 'path_tmp_b': 'tmp_%d.b.js' % pid, 'path_tmp_b_a': 'tmp_%d.b.a.js' % pid, 'path_tmp_u': 'tmp_%d.u.js' % pid, 'path_tmp_u_a': 'tmp_%d.u.a.js' % pid } try: # Pass through beautifier to fix layout: # # - once through JSNice without renaming # jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename']) # # (ok, _out, _err) = jsNiceBeautifier.run(js_file_path, # temp_files['path_tmp']) # if not ok: # cleanup(temp_files) # return (js_file_path, False, 'JSNice Beautifier fail') # # # # Weird JSNice renamings despite --no-rename # try: # before = set([token for (token, token_type) in # Lexer(js_file_path).tokenList # if is_token_subtype(token_type, Token.Name)]) # after = set([token for (token, token_type) in # Lexer(temp_files['path_tmp']).tokenList # if is_token_subtype(token_type, Token.Name)]) # # if not before == after: # return (js_file_path, False, 'Weird JSNice renaming') # # except: # cleanup(temp_files) # return (js_file_path, False, 'Lexer fail') # Strip comments, replace literals, etc try: prepro = Preprocessor(os.path.join(corpus_root, js_file_path)) prepro.write_temp_file(temp_files['path_tmp']) except: cleanup(temp_files) return (js_file_path, None, 'Preprocessor fail') # - and another time through uglifyjs pretty print only clear = Beautifier() ok = clear.run(temp_files['path_tmp'], temp_files['path_tmp_b']) if not ok: cleanup(temp_files) return (js_file_path, False, 'Beautifier fail') # Minify ugly = Uglifier() ok = ugly.run(temp_files['path_tmp_b'], temp_files['path_tmp_u']) if not ok: cleanup(temp_files) return (js_file_path, False, 'Uglifier fail') # Num tokens before vs after try: tok_clear = Lexer(temp_files['path_tmp_b']).tokenList tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList except: cleanup(temp_files) return (js_file_path, False, 'Lexer fail') # For now only work with minified files that have # the same number of tokens as the originals if not len(tok_clear) == len(tok_ugly): cleanup(temp_files) return (js_file_path, False, 'Num tokens mismatch') # Align minified and clear files, in case the beautifier # did something weird try: aligner = Aligner() # This is already the baseline corpus, no (smart) renaming yet aligner.align(temp_files['path_tmp_b'], temp_files['path_tmp_u']) except: cleanup(temp_files) return (js_file_path, False, 'Aligner fail') # Check if minification resulted in any change # It's not very interesting otherwise if open(temp_files['path_tmp_b_a']).read() == \ open(temp_files['path_tmp_u_a']).read(): cleanup(temp_files) return (js_file_path, False, 'Not minified') try: lex_ugly = Lexer(temp_files['path_tmp_u_a']) _iBuilder_ugly = IndexBuilder(lex_ugly.tokenList) except: cleanup(temp_files) return (js_file_path, False, 'IndexBuilder fail') # Store original and uglified versions ok = clear.run(temp_files['path_tmp_b_a'], os.path.join(output_path, '%s.js' % base_name)) if not ok: cleanup(temp_files) cleanupProcessed(base_name) return (js_file_path, False, 'Beautifier fail') ok = clear.run(temp_files['path_tmp_u_a'], os.path.join(output_path, '%s.u.js' % base_name)) if not ok: cleanup(temp_files) cleanupProcessed(base_name) return (js_file_path, False, 'Beautifier fail') cleanup(temp_files) return (js_file_path, True, 'OK') except Exception, e: cleanup(temp_files) return 
(js_file_path, False, str(e))
def processFile(js_file_path): # js_file_path = l[0] base_name = os.path.splitext(os.path.basename(js_file_path))[0] if dbg: print js_file_path temp_files = {'orig': '%s.js' % base_name, 'minified': '%s.u.js' % base_name, 'n2p': '%s.n2p.js' % base_name} for r_strategy in RS.all(): temp_files['%s' % (r_strategy)] = \ '%s.%s.js' % (base_name, r_strategy) for c_strategy in CS.all(): temp_files['%s_%s' % (r_strategy, c_strategy)] = \ '%s.%s.%s.js' % (base_name, r_strategy, c_strategy) for k,v in temp_files.iteritems(): temp_files[k] = os.path.join(output_path, v) candidates = [] #Minified Name -> Original Name (name, def_scope) -> (name, def_scope) min_name_map = {} #Hashed Name -> Minified Name (name, def_scope) -> (name, def_scope) hash_name_map = {} #Minified Name -> jsnice name (name, def_scope) -> (name, def_scope) jsnice_name_map = {} #Data for the suggestion model.csv #Map of variable (name, def_scope) -> results of variableMetrics features function name_features = {} #Map of maps of variable-suggestion (name, def_scope, suggestion) -> suggestion line counts + suggestionMetrics features function #The first key is the renaming strategy #Ultimately, we will iterate over this to get the keys out of name_features and build model_rows suggestion_features = {} #Output Lines for the suggestoin_model.csv model_rows = [] if True: # try: # js_text = open(os.path.join(corpus_root, js_file_path), 'r').read() js_text = open(js_file_path, 'r').read() # Strip comments, replace literals, etc # if True: # try: prepro = WebLMPreprocessor(js_text) prepro_text = str(prepro) # except: # return (js_file_path, None, 'Preprocessor fail') # print 'Preprocessor' # print prepro_text # Pass through beautifier to fix layout clear = Beautifier() (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text) print '\nOK:', ok, 'ERR:', _err print tmp_beautified_text if not ok: return (js_file_path, None, 'Beautifier fail') # Minify ugly = Uglifier() (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text) # print '\nOK:', ok, 'ERR:', _err # print tmp_minified_text if not ok: return (js_file_path, None, 'Uglifier fail') # Align minified and clear files, in case the beautifier # did something weird try: aligner = Aligner() (aligned_clear, aligned_minified) = aligner.web_align(WebLexer(tmp_beautified_text).tokenList, WebLexer(tmp_minified_text).tokenList) except: return (js_file_path, None, 'Aligner fail') # print '\nAligned clear' # print aligned_clear # print '\nAligned minified' # print aligned_minified # print # Pass through beautifier to fix layout (ok, beautified_text, _err) = clear.web_run(aligned_clear) if not ok: return (js_file_path, None, 'Beautifier fail') (ok, minified_text, _err) = clear.web_run(aligned_minified) if not ok: return (js_file_path, None, 'Beautifier fail') # print beautified_text # print # print minified_text # Num tokens before vs after try: lex_clear = WebLexer(beautified_text) tok_clear = lex_clear.tokenList lex_ugly = WebLexer(minified_text) tok_ugly = lex_ugly.tokenList except: return (js_file_path, None, 'Lexer fail') # For now only work with minified files that have # the same number of tokens as the originals if not len(tok_clear) == len(tok_ugly): return (js_file_path, None, 'Num tokens mismatch') if beautified_text == minified_text: return (js_file_path, None, 'Not minified') try: iBuilder_clear = IndexBuilder(lex_clear.tokenList) except: return (js_file_path, None, "IndexBuilder fail on original file.") try: iBuilder_ugly = IndexBuilder(lex_ugly.tokenList) except: return 
(js_file_path, None, 'IndexBuilder fail on minified file.') # print 'Writing' with open(temp_files['orig'], 'w') as f: f.write(beautified_text) with open(temp_files['minified'], 'w') as f: f.write(minified_text) ######################## # Nice2Predict ######################## # BV: Next block left out until I figure out the pipe issue # BV: Update: I couldn't pipe input to N2P. TODO: FIX # Run the JSNice from http://www.nice2predict.org unuglifyJS = UnuglifyJS() (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified']) if not ok: return (js_file_path, None, 'Nice2Predict fail') (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text) if not ok: return (js_file_path, None, 'Beautifier fail') with open(temp_files['n2p'], 'w') as f: f.write(n2p_text_beautified) if(True): #try: n2p_lexer = WebLexer(n2p_text_beautified) n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList) n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified) #except: # return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail') # print 'n2p' # Save some translation stats to compare different methods ts = TranslationSummarizer() candidates += [['n2p', ''] + x for x in ts.compute_summary_unscoped(n2p_iBuilder, n2p_scopeAnalyst)] ################################################ # All other JSNaughty variants ################################################ try: scopeAnalyst = WebScopeAnalyst(minified_text) except: return (js_file_path, None, 'ScopeAnalyst minified fail') try: scopeAnalyst_clear = WebScopeAnalyst(beautified_text) except: return (js_file_path, None, 'ScopeAnalyst clear fail') if(not check(iBuilder_clear, scopeAnalyst_clear, n2p_iBuilder, n2p_scopeAnalyst)): return (js_file_path, None, 'JsNice restructured file. Skipping..') #Map the original names to the minified counterparts and minified ones to jsnice renamings orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(), key = lambda x: x[1]) orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(), key = lambda x: x[1]) orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(), key = lambda x: x[1]) if(len(orderedVarsOld) != len(orderedVarsNew)): return (js_file_path, None, "Old and New Name lists different length") if(len(orderedVarsOld) != len(orderedVarsN2p)): return (js_file_path, None, "JsNice and Old Name lists different length") for i in range(0, len(orderedVarsOld)): name_old = orderedVarsOld[i][0] def_scope_old = scopeAnalyst_clear.name2defScope[orderedVarsOld[i]] name_new = orderedVarsNew[i][0] def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]] min_name_map[(name_new, def_scope_new)] = (name_old, def_scope_old) name_n2p = orderedVarsN2p[i][0] def_scope_n2p = scopeAnalyst.name2defScope[orderedVarsNew[i]] jsnice_name_map[(name_new, def_scope_new)] = (name_n2p, def_scope_n2p) #Once we have the scopeAnalyst, iBuilder, and tokenlist for the minified #version, we can get the name properties vm = VariableMetrics(scopeAnalyst, iBuilder_ugly, lex_ugly.tokenList) variableKeySet = vm.getVariables() for variableKey in variableKeySet: name_features[variableKey] = vm.getNameMetrics(variableKey) (name_positions, \ position_names, use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst) # print 'Helpers' # Try different renaming strategies (hash, etc) for r_strategy, proxy in proxies: if dbg: print '\n=====================' print r_strategy print '=====================\n' # try: # if True: # Rename input prior to translation preRen = PreRenamer() after_text = preRen.rename(r_strategy, iBuilder_ugly, scopeAnalyst) # print 'After text:' # print 
after_text # print (ok, beautified_after_text, _err) = clear.web_run(after_text) if not ok: return (js_file_path, None, 'Beautifier fail') # print 'Beautified:' # print beautified_after_text # print if(r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO): try: scopeAnalyst_hash = WebScopeAnalyst(after_text) except: return (js_file_path, None, "ScopeAnalyst hash fail") #Map the hashed names to the minified counterparts. orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(), key = lambda x: x[1]) orderedVarsHash = sorted(scopeAnalyst_hash.name2defScope.keys(), key = lambda x: x[1]) if(len(orderedVarsMin) != len(orderedVarsHash)): return (js_file_path, None, "Hash and Min lists different length") for i in range(0, len(orderedVarsHash)): name_hash = orderedVarsHash[i][0] def_scope_hash = scopeAnalyst_hash.name2defScope[orderedVarsHash[i]] name_min = orderedVarsMin[i][0] def_scope_min = scopeAnalyst.name2defScope[orderedVarsMin[i]] hash_name_map[(name_hash, def_scope_hash)] = (name_min, def_scope_min) # Save renamed input to disk for future inspection with open(temp_files['%s' % (r_strategy)], 'w') as f: f.write(beautified_after_text) a_lexer = WebLexer(beautified_after_text) a_iBuilder = IndexBuilder(a_lexer.tokenList) a_scopeAnalyst = WebScopeAnalyst(beautified_after_text) # except: # return (js_file_path, None, 'Renaming fail') # print 'Lexing' # lx = WebLexer(a_iBuilder.get_text()) lx = WebLexer(a_iBuilder.get_text_wo_literals()) # print a_iBuilder.get_text_wo_literals() # Translate renamed input md = WebMosesDecoder(proxy) (ok, translation, _err) = md.run(lx.collapsedText) if not ok: return (js_file_path, None, 'Moses translation fail') # print '\ntranslation-------------' # print translation # if r_strategy == RS.HASH_ONE: # exit() (a_name_positions, a_position_names, a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst) nc = [] if translation is not None: # Parse moses output mp = MosesParser() if dbg: print '\nr_strategy-----------', r_strategy name_candidates = mp.parse(translation, a_iBuilder, a_position_names) # name_candidates is a dictionary of dictionaries: # keys are (name, None) (if scopeAnalyst=None) or # (name, def_scope) tuples (otherwise); # values are suggested translations with the sets # of line numbers on which they appear. 
# print '\nname_candidates before ----------' # for key, suggestions in name_candidates.iteritems(): # print key[0], key[1][-50:] # # for use_scope, suggestions in val.iteritems(): # # print '\t...', use_scope[-50:] # for name_translation, lines in suggestions.iteritems(): # print '\t', name_translation, lines # Update name_candidates with some default values # (in this case the translation without any renaming) # if the translation is empty if r_strategy == RS.NONE: # RS.NONE should always be first, by construction name_candidates_default = name_candidates scopeAnalyst_default = a_scopeAnalyst iBuilder_default = a_iBuilder else: for key_default, suggestions in name_candidates_default.iteritems(): # (name_default, def_scope_default) = key_default pos_default = scopeAnalyst_default.nameDefScope2pos[key_default] (lin, col) = iBuilder_default.revFlatMat[pos_default] (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)] (name, def_scope) = a_position_names[line_num][line_idx] key = (name, def_scope) for name_translation, lines in suggestions.iteritems(): name_candidates.setdefault(key, {}) name_candidates[key].setdefault(name_translation, set([])) name_candidates[key][name_translation].update(lines) # for use_scope, suggestions in val.iteritems(): # for name_translation, lines in suggestions.iteritems(): # # key = preRen.simple_direct_map.get(key_default, key_default) # # name_candidates.setdefault(key, {}) # name_candidates[key].setdefault(use_scope, {}) # name_candidates[key][use_scope].setdefault(name_translation, set([])) # name_candidates[key][use_scope][name_translation].update(lines) # print '\nname_candidates after ----------' # for key, suggestions in name_candidates.iteritems(): # print key[0], key[1][-50:] # # for use_scope, suggestions in val.iteritems(): # # print '\t...', use_scope[-50:] # for name_translation, lines in suggestions.iteritems(): # print '\t', name_translation, lines cc = ConsistencyController(debug_mode=True) ts = TranslationSummarizer() # An identifier may have been translated inconsistently # across different lines (Moses treats each line independently). # Try different strategies to resolve inconsistencies, if any for c_strategy in CS.all(): if dbg: print '\nc_strategy----------', c_strategy #assert(hash_name_map != {}) # Compute renaming map (x -> length, y -> width, ...) # Note that x,y here are names after renaming (temp_renaming_map, seen) = cc.computeRenaming(c_strategy, name_candidates, a_name_positions, a_use_scopes, a_iBuilder, lm_path, vm, hash_name_map) #After computeRenaming, we have both the entropies stored #if we are in LMDrop strategy and have the suggestions #frequency from name_candidates. Fill in suggestion_Features if(c_strategy == CS.LMDROP and r_strategy not in suggestion_features): assert(cc.suggestion_cache != None) suggestion_features[r_strategy] = {} #Need some way of iterating over all name, suggestion groups... """ name_candidates: dict name_candidates[(name, def_scope)][name_translation] = set of line numbers in the translation """ for variableKey, suggestionDictionary in name_candidates.iteritems(): for suggestionName, linesSuggested in suggestionDictionary.iteritems(): # I need to revert variableKey[0] in the suggestion from its hash to its original minified name. 
if(r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO): unhashedKey = hash_name_map[variableKey] suggestionKey = (unhashedKey[0], unhashedKey[1], suggestionName) else: suggestionKey = (variableKey[0], variableKey[1], suggestionName) entropyVals = cc.suggestion_cache.getEntropyStats(variableKey, suggestionName) if(True): #eval_dbg only #if(entropyVals != (ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR)): suggestionValue = [len(linesSuggested)] + \ list(getSuggestionStats(suggestionName)) + \ list(entropyVals) suggestion_features[r_strategy][suggestionKey] = suggestionValue if dbg: print '\ntemp_renaming_map-------------' for (name, def_scope), renaming in temp_renaming_map.iteritems(): print (name, def_scope[-50:]), renaming # Fall back on original names in input, if # no translation was suggested postRen = PostRenamer() renaming_map = postRen.updateRenamingMap(a_name_positions, position_names, a_use_scopes, temp_renaming_map, seen, r_strategy) # new_name_candidates = {} # # for (name, def_scope), renaming in temp_renaming_map.iteritems(): # (line_num, line_idx) = a_name_positions[(name, def_scope)][0] # (old_name, old_def_scope) = position_names[line_num][line_idx] # # new_name_candidates.setdefault((old_name, old_def_scope), {}) # new_name_candidates[(old_name, old_def_scope)][renaming] = set([1]) # tmp_renamed_text = postRen.applyRenaming(a_iBuilder, # a_name_positions, # temp_renaming_map) # (ok, tmp_beautified_renamed_text, _err) = clear.web_run(tmp_renamed_text) # if not ok: # return (js_file_path, None, 'Beautifier fail') # # tmp_lexer = WebLexer(tmp_beautified_renamed_text) # tmp_iBuilder = IndexBuilder(tmp_lexer.tokenList) # tmp_scopeAnalyst = WebScopeAnalyst(tmp_beautified_renamed_text) # # (tmp_name_positions, # tmp_position_names, # tmp_use_scopes) = prepHelpers(tmp_iBuilder, tmp_scopeAnalyst) # renaming_map = postRen.updateRenamingMap(tmp_name_positions, # position_names, # temp_renaming_map, # r_strategy) # # renaming_map = cc.computeRenaming(CS.FREQLEN, # new_name_candidates, # name_positions, # use_scopes, # iBuilder_ugly, # lm_path) # # Fall back on original names in input, if # # no translation was suggested # postRen = PostRenamer() # renaming_map = postRen.updateRenamingMap(a_name_positions, # position_names, # temp_renaming_map, # r_strategy) if dbg: print '\nrenaming_map-------------' for (name, def_scope), renaming in renaming_map.iteritems(): print (name, def_scope[-50:]), renaming, '(%s)' % temp_renaming_map[(name, def_scope)] # Apply renaming map and save output for future inspection renamed_text = postRen.applyRenaming(a_iBuilder, a_name_positions, renaming_map) print '\nrenamed_text--------------' print renamed_text print (ok, beautified_renamed_text, _err) = clear.web_run(renamed_text) if not ok: return (js_file_path, None, 'Beautifier fail') with open(temp_files['%s_%s' % (r_strategy, c_strategy)], 'w') as f: f.write(beautified_renamed_text) # Save some stats about which names were renamed to what # This is what enables the comparison between the different # methods. 
r = [[c_strategy] + x for x in ts.compute_summary_scoped(renaming_map, name_candidates, a_iBuilder, a_scopeAnalyst)] if not r: return (js_file_path, None, 'Compute summary failed') nc += r if nc: candidates += [[r_strategy] + x for x in nc] #create the rows for the suggestion_model.csv for r_strategy in RS.all(): for suggestionKey, s_feat in suggestion_features[r_strategy].iteritems(): variableKey = (suggestionKey[0], suggestionKey[1]) original_name = min_name_map[variableKey][0] js_nice_name = jsnice_name_map[variableKey][0] if(variableKey in name_features): #eval_dbg only n_feat = list(name_features[variableKey]) #Convert the def_scope to an equivalent, but smaller, easier to read key: (line_num, token_num) newKey = scopeAnalyst.nameDefScope2pos[variableKey] (keyLine, keyToken) = iBuilder_ugly.revFlatMat[newKey] model_rows.append([original_name, r_strategy, suggestionKey[0], keyLine, keyToken, suggestionKey[2], js_nice_name] + n_feat + s_feat) return (js_file_path, 'OK', candidates, model_rows)
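# --- Illustrative sketch, not part of the pipeline above ---
# A minimal, self-contained example of the name_candidates structure described
# in the comments above: keys are (name, def_scope) tuples; values map each
# suggested translation to the set of line numbers on which Moses proposed it.
# merge_suggestion() mirrors the setdefault/update pattern used when the
# RS.NONE defaults are folded into the current candidates.
def merge_suggestion(name_candidates, key, translation, lines):
    # key is a (name, def_scope) tuple; lines is an iterable of line numbers
    name_candidates.setdefault(key, {})
    name_candidates[key].setdefault(translation, set())
    name_candidates[key][translation].update(lines)
    return name_candidates

if __name__ == '__main__':
    candidates_demo = {}
    merge_suggestion(candidates_demo, ('a', 'scope1'), 'length', [3, 7])
    merge_suggestion(candidates_demo, ('a', 'scope1'), 'size', [12])
    merge_suggestion(candidates_demo, ('a', 'scope1'), 'length', [12])
    # {('a', 'scope1'): {'length': set([3, 7, 12]), 'size': set([12])}}
    print(candidates_demo)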
def processFile(l): def localCleanup(output_path, base_names): for base_name in base_names: tryRemove(os.path.join(output_path, base_name)) js_file_path = l[0] base_name = os.path.splitext(os.path.basename(js_file_path))[0] pid = int(multiprocessing.current_process().ident) candidates = [] try: # if True: # Temp files to be created during processing path_tmp = 'tmp_%d.js' % (pid) path_tmp_b = 'tmp_%d.b.js' % (pid) path_tmp_b_a = 'tmp_%d.b.a.js' % (pid) path_tmp_u = 'tmp_%d.u.js' % (pid) path_tmp_u_a = 'tmp_%d.u.a.js' % (pid) path_tmp_unugly = 'tmp_%d.n2p.js' % (pid) path_tmp_jsnice = 'tmp_%d.jsnice.js' % (pid) f2 = 'tmp_%d.no_renaming.js' % (pid) f3 = 'tmp_%d.basic_renaming.js' % (pid) f4 = 'tmp_%d.hash_renaming.js' % (pid) f5 = 'tmp_%d.hash_def_one_renaming.js' % (pid) f6 = 'tmp_%d.hash_def_two_renaming.js' % (pid) path_orig = '%s.js' % (base_name) path_ugly = '%s.u.js' % (base_name) path_unugly = '%s.n2p.js' % (base_name) path_jsnice = '%s.jsnice.js' % (base_name) # Strip comments, replace literals, etc try: prepro = Preprocessor(os.path.join(corpus_root, js_file_path)) prepro.write_temp_file(path_tmp) except: cleanup(pid) return (js_file_path, None, 'Preprocessor fail') # Pass through beautifier to fix layout clear = Beautifier() ok = clear.run(path_tmp, path_tmp_b+'.tmp1') if not ok: cleanup(pid) return (js_file_path, None, 'Beautifier 1 fail') jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename']) (ok, _out, _err) = jsNiceBeautifier.run(path_tmp_b+'.tmp1', path_tmp_b+'.tmp2') if not ok: cleanup(pid) return (js_file_path, None, 'JSNice Beautifier 1 fail') ok = clear.run(path_tmp_b+'.tmp2', path_tmp_b) if not ok: cleanup(pid) return (js_file_path, None, 'Beautifier 1 fail') # Minify ugly = Uglifier() ok = ugly.run(path_tmp_b, path_tmp_u) if not ok: cleanup(pid) return (js_file_path, None, 'Uglifier fail') # Num tokens before vs after try: tok_clear = Lexer(path_tmp_b).tokenList tok_ugly = Lexer(path_tmp_u).tokenList except: cleanup(pid) return (js_file_path, None, 'Lexer fail') # For now only work with minified files that have # the same number of tokens as the originals if not len(tok_clear) == len(tok_ugly): cleanup(pid) return (js_file_path, None, 'Num tokens mismatch') # Align minified and clear files, in case the beautifier # did something weird try: aligner = Aligner() # This is already the baseline corpus, no (smart) renaming yet aligner.align(path_tmp_b, path_tmp_u) except: cleanup(pid) return (js_file_path, None, 'Aligner fail') try: # iBuilder_clear = IndexBuilder(Lexer(path_tmp_b_a).tokenList) iBuilder_ugly = IndexBuilder(Lexer(path_tmp_u_a).tokenList) except: cleanup(pid) return (js_file_path, None, 'IndexBuilder fail') # Store original and uglified versions ok = clear.run(path_tmp_u_a, os.path.join(output_path, path_ugly)) if not ok: cleanup(pid) localCleanup(output_path, [path_ugly]) return (js_file_path, None, 'Beautifier 2 fail') ok = clear.run(path_tmp_b_a, os.path.join(output_path, path_orig)) if not ok: cleanup(pid) localCleanup(output_path, [path_ugly, path_orig]) return (js_file_path, None, 'Beautifier 3 fail') # Run the JSNice from http://www.nice2predict.org unuglifyJS = UnuglifyJS() (ok, _out, _err) = unuglifyJS.run(path_tmp_b_a, path_tmp_unugly) if not ok: cleanup(pid) localCleanup(output_path, [path_ugly, path_orig]) return (js_file_path, None, 'Nice2Predict fail') ok = clear.run(path_tmp_unugly, path_tmp_unugly+'.tmp1') if not ok: cleanup(pid) localCleanup(output_path, [path_ugly, path_orig, path_unugly]) return (js_file_path, None, 'Beautifier 4 
fail') (ok, _out, _err) = jsNiceBeautifier.run(path_tmp_unugly+'.tmp1', path_tmp_unugly+'.tmp2') if not ok: cleanup(pid) return (js_file_path, None, 'JSNice Beautifier 2 fail') ok = clear.run(path_tmp_unugly+'.tmp2', os.path.join(output_path, path_unugly)) if not ok: cleanup(pid) localCleanup(output_path, [path_ugly, path_orig, path_unugly]) return (js_file_path, None, 'Beautifier 4 fail') try: scopeAnalyst = ScopeAnalyst(os.path.join( os.path.dirname(os.path.realpath(__file__)), path_tmp_unugly)) nameOrigin = scopeAnalyst.nameOrigin for (name, def_scope) in nameOrigin.iterkeys(): candidates.append(('Nice2Predict', def_scope, name, '', '')) except: cleanup(pid) localCleanup(output_path, [path_ugly, path_orig, path_unugly]) return (js_file_path, None, 'ScopeAnalyst fail') # Run the JSNice from http://www.jsnice.org jsNice = JSNice() (ok, _out, _err) = jsNice.run(path_tmp_b_a, path_tmp_jsnice) if not ok: cleanup(pid) localCleanup(output_path, [path_ugly, path_orig, path_unugly]) return (js_file_path, None, 'JSNice fail') ok = clear.run(path_tmp_jsnice, os.path.join(output_path, path_jsnice)) if not ok: cleanup(pid) localCleanup(output_path, [path_ugly, path_orig, \ path_unugly, path_jsnice]) return (js_file_path, None, 'Beautifier 5 fail') try: scopeAnalyst = ScopeAnalyst(os.path.join( os.path.dirname(os.path.realpath(__file__)), path_tmp_jsnice)) nameOrigin = scopeAnalyst.nameOrigin for (name, def_scope) in nameOrigin.iterkeys(): candidates.append(('JSNice', def_scope, name, '', '')) except: cleanup(pid) localCleanup(output_path, [path_ugly, path_orig, \ path_unugly, path_jsnice]) return (js_file_path, None, 'ScopeAnalyst fail') # Compute scoping: name2scope is a dictionary where keys # are (name, start_index) tuples and values are scope identifiers. # Note: start_index is a flat (unidimensional) index, # not a (line_chr_idx, col_chr_idx) index. try: scopeAnalyst = ScopeAnalyst(os.path.join( os.path.dirname(os.path.realpath(__file__)), path_tmp_u_a)) _name2defScope = scopeAnalyst.resolve_scope() _isGlobal = scopeAnalyst.isGlobal _name2useScope = scopeAnalyst.resolve_use_scope() except: cleanup(pid) localCleanup(output_path, [path_ugly, path_orig, \ path_unugly, path_jsnice]) return (js_file_path, None, 'ScopeAnalyst fail') no_renaming = [] for _line_idx, line in enumerate(iBuilder_ugly.tokens): no_renaming.append(' '.join([t for (_tt,t) in line]) + "\n") with open(f2, 'w') as f_no_renaming: f_no_renaming.writelines(no_renaming) moses = MosesDecoder(ini_path=os.path.join(ini_path, \ 'train.no_renaming', 'tuning', 'moses.ini')) (_moses_ok, translation, _err) = moses.run(f2) nc = processTranslation(translation, iBuilder_ugly, scopeAnalyst, lm_path, f2, output_path, base_name, clear) if nc: candidates += nc # Simple renaming: disambiguate overloaded names using scope id basic_renaming = renameUsingScopeId(scopeAnalyst, iBuilder_ugly) with open(f3, 'w') as f_basic_renaming: f_basic_renaming.writelines(basic_renaming) moses = MosesDecoder(ini_path=os.path.join(ini_path, \ 'train.basic_renaming', 'tuning', 'moses.ini')) (_moses_ok, translation, _err) = moses.run(f3) nc = processTranslation(translation, iBuilder_ugly, scopeAnalyst, lm_path, f3, output_path, base_name, clear) if nc: candidates += nc # More complicated renaming: collect the context around # each name (global variables, API calls, punctuation) # and build a hash of the concatenation. 
hash_renaming = renameUsingHashAllPrec(scopeAnalyst, iBuilder_ugly, debug=False) # print hash_renaming with open(f4, 'w') as f_hash_renaming: f_hash_renaming.writelines(hash_renaming) moses = MosesDecoder(ini_path=os.path.join(ini_path, \ 'train.hash_renaming', 'tuning', 'moses.ini')) (_moses_ok, translation, _err) = moses.run(f4) nc = processTranslation(translation, iBuilder_ugly, scopeAnalyst, lm_path, f4, output_path, base_name, clear) if nc: candidates += nc hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst, iBuilder_ugly, twoLines=False, debug=False) with open(f5, 'w') as f_hash_def_one_renaming: f_hash_def_one_renaming.writelines(hash_def_one_renaming) moses = MosesDecoder(ini_path=os.path.join(ini_path, \ 'train.hash_def_one_renaming', 'tuning', 'moses.ini')) (_moses_ok, translation, _err) = moses.run(f5) nc = processTranslation(translation, iBuilder_ugly, scopeAnalyst, lm_path, f5, output_path, base_name, clear) if nc: candidates += nc hash_def_two_renaming = renameUsingHashDefLine(scopeAnalyst, iBuilder_ugly, twoLines=True, debug=False) with open(f6, 'w') as f_hash_def_two_renaming: f_hash_def_two_renaming.writelines(hash_def_two_renaming) moses = MosesDecoder(ini_path=os.path.join(ini_path, \ 'train.hash_def_two_renaming', 'tuning', 'moses.ini')) (_moses_ok, translation, _err) = moses.run(f6) nc = processTranslation(translation, iBuilder_ugly, scopeAnalyst, lm_path, f6, output_path, base_name, clear) if nc: candidates += nc cleanup(pid) cleanupRenamed(pid) return (js_file_path, 'OK', candidates) except Exception, e: cleanup(pid) cleanupRenamed(pid) return (js_file_path, None, str(e).replace("\n", ""))
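# --- Refactor sketch only, not in the project ---
# The function above repeats the same steps for every renaming variant: write
# the renamed lines to a temp file, translate them with the strategy-specific
# Moses model, and post-process the translation into candidates. A helper like
# the one below could capture that pattern; it assumes the MosesDecoder,
# processTranslation, and ini_path usages exactly as they appear above.
def translateVariant(renamed_lines, temp_path, train_dir,
                     iBuilder_ugly, scopeAnalyst, lm_path,
                     output_path, base_name, clear):
    with open(temp_path, 'w') as f:
        f.writelines(renamed_lines)
    moses = MosesDecoder(ini_path=os.path.join(ini_path, train_dir,
                                               'tuning', 'moses.ini'))
    (_moses_ok, translation, _err) = moses.run(temp_path)
    return processTranslation(translation, iBuilder_ugly, scopeAnalyst,
                              lm_path, temp_path, output_path,
                              base_name, clear)

# Example use, mirroring the hash_def_one_renaming block above:
# nc = translateVariant(renameUsingHashDefLine(scopeAnalyst, iBuilder_ugly,
#                                              twoLines=False, debug=False),
#                       f5, 'train.hash_def_one_renaming',
#                       iBuilder_ugly, scopeAnalyst, lm_path,
#                       output_path, base_name, clear)
# if nc: candidates += nc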
def processFile(js_file_path): # Load in the minified file minified = open(js_file_path).read() # Create lexer lexer = get_lexer_for_filename(js_file_path) # Tokenize input and compute mappings between the different # indices used: (line, col), flat, (l,c) in token list indexBuilder = IndexBuilder(lex(minified, lexer)) tokens = indexBuilder.tokens # print 'RUNNING IndexBuilder:', len(tokens)>0 # Compute scoping: name2scope is a dictionary where keys # are (name, start_index) tuples and values are scope identifiers. # Note: start_index is a flat (unidimensional) index, # not a (line_chr_idx, col_chr_idx) index. scopeAnalyst = ScopeAnalyst(js_file_path) name2defScope = scopeAnalyst.resolve_scope() isGlobal = scopeAnalyst.isGlobal name2useScope = scopeAnalyst.name2useScope name2pth = scopeAnalyst.name2pth nameOrigin = scopeAnalyst.nameOrigin scopes = set(name2useScope.values()) print print '=== FOUND %d SCOPES ===' % len(scopes) print for scope in scopes: print 'USE SCOPE:', scope lc_list = [ indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]] for (t, pos) in name2useScope.keys() if name2useScope[(t, pos)] == scope ] highlight(tokens, lc_list) print scopes = set(name2defScope.values()) print print '=== FOUND %d NAME SCOPES ===' % len(scopes) print for scope in scopes: print 'DEF SCOPE:', scope lc_list = [ indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]] for (t, pos) in name2defScope.keys() if name2defScope[(t, pos)] == scope ] highlight(tokens, lc_list) print # Discover the path to the source map map_path = sourcemap.discover(minified) # Read and parse our sourcemap if map_path: sourcemapIndex = sourcemap.load(open(map_path)) # Cluster names by scope nameScope2Positions = {} # Index data by (name,scope) for token, l in indexBuilder.name2CharPositions.iteritems(): for (line, col) in sorted(l, key=lambda (a, b): (a, b)): pos = indexBuilder.flatMap[(line, col)] if name2defScope.has_key((token, pos)): scope = name2defScope[(token, pos)] use_scope = name2useScope[(token, pos)] pth = name2pth[(token, pos)] glb = isGlobal[(token, pos)] nameScope2Positions.setdefault((token, scope, glb), []) nameScope2Positions[(token, scope, glb)].append((line, col)) # print token, pos # print 'def:', scope # print 'use:', use_scope # print 'pth:', pth # highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]]) # print print print for (token,scope,glb), positions in sorted(nameScope2Positions.iteritems(), \ key=lambda (x,y):x[0]): pos = sorted(positions, key=lambda e: (e[0], e[1])) tt = [] line_tok_idxs = set([]) for (l, c) in pos: (tl, tc) = indexBuilder.revTokMap[(l, c)] line_tok_idxs.add(tl) p = indexBuilder.flatMap[(l, c)] if map_path: orig = sourcemapIndex.lookup(line=l, column=c).name else: orig = token print token, scope, (l, c), orig tt.append(((tl, tc), p, orig)) # t.append(orig) # if token == 'n': print '\nNAME:', token.encode( 'utf-8'), '( isGlobal =', glb, '; original =', orig, ')' # print scope # highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]]) for ((tli, tci), p, orig) in tt: scope = name2defScope[(token, p)] use_scope = name2useScope[(token, p)] pth = name2pth[(token, p)] origin = nameOrigin[(token, scope)] # print token #, p, origin # print # print 'def:', scope # print 'use:', use_scope # print 'pth:', pth # print for tl in sorted(set([tli for ((tli, tci), p, orig) in tt])): l = list(tokens[tl]) for tc in [tci for ((tli, tci), p, orig) in tt if tli == tl]: l[tc] = (l[tc][0], unichr(0x2588) + token + unichr(0x2588)) # pos = 
indexBuilder.flatMap[(line,col)] print ' ', '%d:' % (tl + 1), ' '.join( [x[1].encode('utf-8') for x in l]) print return
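# --- Stdlib-only sketch, not the project's IndexBuilder ---
# The code above relies on translating between a flat character offset and a
# (line, column) pair. flat_map / rev_flat_map below play the same role as
# IndexBuilder.flatMap and IndexBuilder.revFlatMat; the real class also maps
# (line, col) to (token_line, token_index) via revTokMap and may use different
# indexing conventions (that detail is an assumption here).
def build_char_maps(text):
    flat_map = {}      # (line, col) -> flat offset
    rev_flat_map = {}  # flat offset -> (line, col)
    flat = 0
    for line_idx, line in enumerate(text.split('\n')):
        for col_idx in range(len(line) + 1):  # +1 leaves a slot for the newline
            flat_map[(line_idx, col_idx)] = flat
            rev_flat_map[flat] = (line_idx, col_idx)
            flat += 1
    return flat_map, rev_flat_map

if __name__ == '__main__':
    fm, rfm = build_char_maps("var a = 1;\nvar b = a;")
    assert rfm[fm[(1, 4)]] == (1, 4)   # round trip
    print(fm[(1, 4)])                  # flat offset of 'b' on the second line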
def testFiles(self): #TODO: Automated checks against the files. #Known bugs: The definitions of sum and numberEquals in test_file1 seem to be pointing to the wrong instance... i = 1 for nextFile in self.fileList: print(nextFile) lexed = Lexer(nextFile) ib = IndexBuilder(lexed.tokenList) sa = ScopeAnalyst(nextFile) s_min = ScopeAnalyst( os.path.join(self.testDir.path, "test_file1.obs.js")) #print(s_min.name2defScope) #print("TokenList----------------------------------------------------------------") #print(lexed.tokenList) #print("Index Builder----------------------------------------------------------------") #print(ib) #print("Scope Analyst----------------------------------------------------------------") #print(sa) vm = VariableMetrics(sa, ib, lexed.tokenList) #print("VM----------------------------------------------------------------") #print(vm) #print("VM----------------------------------------------------------------") for var in vm.getVariables(): print(var) print( "Num Lines,Max Lines,Global Def,Global Usage,For,While,Literal Def,Literal Usage,Max Length Line,Ave Line Length" ) print vm.getNameMetrics(var) #Automated tests: csv_file = os.path.join(self.testDir.path, "test_file" + str(i) + ".csv") print(csv_file) if (os.path.exists(csv_file)): with open(csv_file, 'r') as f: csv_reader = csv.reader(f, delimiter=",") #Skip header next(csv_reader, None) for row in csv_reader: key = (row[0], row[1]) print(key) (num_lines, max_lines, external_def, external_use, in_for, in_while, literal_def, literal_use, max_length_line, ave_line_length) = vm.getNameMetrics(key) self.assertTrue(num_lines == int(row[2])) self.assertTrue(max_lines == int(row[3])) self.assertTrue(external_def == self.asBool(row[4])) self.assertTrue(external_use == int(row[5])) self.assertTrue(in_for == int(row[6])) self.assertTrue(in_while == int(row[7])) self.assertTrue(literal_def == self.asBool(row[8])) self.assertTrue(literal_use == int(row[9])) self.assertTrue(max_length_line == int(row[10])) self.assertAlmostEqual(ave_line_length, float(row[11]), places=3) else: print("no manually annotated csv file for: " + nextFile) break
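# --- Sketch of the manual-annotation csv format checked above ---
# The column order comes from the header printed in the test; as_bool is a
# stand-in for the test class's own asBool helper, whose exact behaviour is an
# assumption, and the meaning of the second key column follows the test's
# key = (row[0], row[1]) usage.
import csv

def as_bool(s):
    return s.strip().lower() in ('true', '1', 'yes')  # assumed convention

def read_annotations(csv_path):
    metrics = {}
    with open(csv_path, 'r') as f:
        reader = csv.reader(f, delimiter=',')
        next(reader, None)                  # skip header
        for row in reader:
            key = (row[0], row[1])          # (name, definition key), as in the test
            metrics[key] = (int(row[2]),    # num_lines
                            int(row[3]),    # max_lines
                            as_bool(row[4]),  # global/external def
                            int(row[5]),    # global/external use count
                            int(row[6]),    # uses inside for
                            int(row[7]),    # uses inside while
                            as_bool(row[8]),  # literal def
                            int(row[9]),    # literal use count
                            int(row[10]),   # max length of a line using it
                            float(row[11])) # average line length
    return metrics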
def processFile(l): js_file_path = l[0] if js_file_path in seen: return (js_file_path, None, 'Skipped') pid = int(multiprocessing.current_process().ident) # Temp files to be created during processing temp_files = {'path_tmp': 'tmp_%d.js' % pid, 'path_tmp_b': 'tmp_%d.b.js' % pid, 'path_tmp_b_n': 'tmp_%d.b.n.js' % pid, 'path_tmp_u': 'tmp_%d.u.js' % pid, 'path_tmp_u_n': 'tmp_%d.u.n.js' % pid, 'path_tmp_b_a': 'tmp_%d.b.a.js' % pid, 'path_tmp_u_a': 'tmp_%d.u.a.js' % pid} try: # Strip comments, replace literals, etc try: prepro = Preprocessor(os.path.join(corpus_root, js_file_path)) prepro.write_temp_file(temp_files['path_tmp']) except: cleanup(temp_files) return (js_file_path, None, 'Preprocessor fail') # Pass through beautifier to fix layout: # - once through JSNice without renaming # jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename']) # # (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp'], # temp_files['path_tmp_b_n']) # if not ok: # cleanup(temp_files) # return (js_file_path, None, 'JSNice Beautifier fail') # # - and another time through uglifyjs pretty print only # clear = Beautifier() # ok = clear.run(temp_files['path_tmp_b_n'], # temp_files['path_tmp_b']) # if not ok: # cleanup(temp_files) # return (js_file_path, None, 'Beautifier fail') # # JSNice is down! clear = Beautifier() ok = clear.run(temp_files['path_tmp'], temp_files['path_tmp_b_n']) if not ok: cleanup(temp_files) return (js_file_path, None, 'Beautifier fail') # Normalize norm = Normalizer() ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)), temp_files['path_tmp_b_n']), False, temp_files['path_tmp_b']) if not ok: cleanup(temp_files) return (js_file_path, None, 'Normalizer fail') # Minify ugly = Uglifier() ok = ugly.run(temp_files['path_tmp_b'], temp_files['path_tmp_u_n']) if not ok: cleanup(temp_files) return (js_file_path, None, 'Uglifier fail') # Normalize norm = Normalizer() ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)), temp_files['path_tmp_u_n']), False, temp_files['path_tmp_u']) if not ok: cleanup(temp_files) return (js_file_path, None, 'Normalizer fail') # Num tokens before vs after try: tok_clear = Lexer(temp_files['path_tmp_b']).tokenList tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList except: cleanup(temp_files) return (js_file_path, None, 'Lexer fail') # For now only work with minified files that have # the same number of tokens as the originals if not len(tok_clear) == len(tok_ugly): cleanup(temp_files) return (js_file_path, None, 'Num tokens mismatch') # Align minified and clear files, in case the beautifier # did something weird try: aligner = Aligner() # This is already the baseline corpus, no (smart) renaming yet aligner.align(temp_files['path_tmp_b'], temp_files['path_tmp_u']) except: cleanup(temp_files) return (js_file_path, None, 'Aligner fail') try: lex_clear = Lexer(temp_files['path_tmp_b_a']) iBuilder_clear = IndexBuilder(lex_clear.tokenList) lex_ugly = Lexer(temp_files['path_tmp_u_a']) iBuilder_ugly = IndexBuilder(lex_ugly.tokenList) except: cleanup(temp_files) return (js_file_path, None, 'IndexBuilder fail') # Normalize norm = Normalizer() ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)), temp_files['path_tmp_b']), True, temp_files['path_tmp_u_n']) if not ok: cleanup(temp_files) return (js_file_path, None, 'Normalizer fail') try: lex_norm = Lexer(temp_files['path_tmp_u_n']) iBuilder_norm = IndexBuilder(lex_norm.tokenList) except: cleanup(temp_files) return (js_file_path, None, 'IndexBuilder fail') normalized = 
[] for line_idx, line in enumerate(iBuilder_norm.tokens): normalized.append(' '.join([t for (_tt,t) in line]) + "\n") # Compute scoping: name2scope is a dictionary where keys # are (name, start_index) tuples and values are scope identifiers. # Note: start_index is a flat (unidimensional) index, # not a (line_chr_idx, col_chr_idx) index. try: scopeAnalyst = ScopeAnalyst(os.path.join( os.path.dirname(os.path.realpath(__file__)), temp_files['path_tmp_u_a'])) # _name2defScope = scopeAnalyst.resolve_scope() # _isGlobal = scopeAnalyst.isGlobal # _name2useScope = scopeAnalyst.resolve_use_scope() except: cleanup(temp_files) return (js_file_path, None, 'ScopeAnalyst fail') orig = [] no_renaming = [] for line_idx, line in enumerate(iBuilder_ugly.tokens): orig.append(' '.join([t for (_tt,t) in \ iBuilder_clear.tokens[line_idx]]) + "\n") no_renaming.append(' '.join([t for (_tt,t) in line]) + "\n") # # Simple renaming: disambiguate overloaded names using scope id basic_renaming = renameUsingScopeId(scopeAnalyst, iBuilder_ugly) # More complicated renaming: collect the context around # each name (global variables, API calls, punctuation) # and build a hash of the concatenation. # hash_renaming = renameUsingHashAllPrec(scopeAnalyst, # iBuilder_ugly, # debug=True) hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst, iBuilder_ugly, twoLines=False, debug=False) hash_def_two_renaming = renameUsingHashDefLine(scopeAnalyst, iBuilder_ugly, twoLines=True, debug=False) cleanup(temp_files) return (js_file_path, orig, no_renaming, basic_renaming, normalized, # hash_renaming, hash_def_one_renaming, hash_def_two_renaming) except Exception, e: cleanup(temp_files) return (js_file_path, None, str(e))
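# --- Stdlib-only sketch of the corpus-line construction used above ---
# Each line of the IndexBuilder token table is a list of (token_type, token_text)
# pairs; the Moses-facing corpora simply join the text fields with single spaces.
def tokens_to_corpus_lines(token_lines):
    return [' '.join(text for (_type, text) in line) + '\n'
            for line in token_lines]

if __name__ == '__main__':
    toy = [[('Keyword', 'var'), ('Name', 'a'), ('Punct', '='),
            ('Number', '1'), ('Punct', ';')]]
    print(tokens_to_corpus_lines(toy))   # ['var a = 1 ;\n']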
def deobfuscateJS(self, obfuscatedCode, transactionID): proxy = xmlrpclib.ServerProxy("http://godeep.cs.ucdavis.edu:8080/RPC2") mosesParams = {} candidates = [] baseDir = "/home/ccasal/temp/" tempFile = baseDir + str(transactionID) + "_temp.js" lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.970k/js.blm.lm" preproFile = baseDir + str(transactionID) + "_prepro.js" beautFile = baseDir + str(transactionID) + "_beaut.js" # Strip comments, replace literals, etc try: prepro = WebPreprocessor(obfuscatedCode) #TODO replace with: prepro = WebPreprocessor(text) prepro.write_temp_file(preproFile) except: cleanup([preproFile]) print("Preprocessor failed") return ("Preprocessor Failed") clear = Beautifier() #TODO: Need a text version of beautifier to avoid the file read and write. #(ok, beautText, err) = clear.webRun(preproText) ok = clear.run(preproFile, beautFile) print(ok) if (not ok): cleanup([preproFile, beautFile]) return ("Beautifier Failed") #quit() try: lex_ugly = Lexer(beautFile) iBuilder_ugly = IndexBuilder(lex_ugly.tokenList) except: cleanup([preproFile, beautFile]) print("IndexBuilder fail") return ("IndexBuilder Failed") lex_ugly.write_temp_file(tempFile) #Do Scope related tasks #a raw text version try: scopeAnalyst = ScopeAnalyst(tempFile) except: cleanup({"temp": tempFile}) print("ScopeAnalyst Fail") return ("ScopeAnalyst Failed") #Do Rename related tasks #In our case, I don't think we need to actually do anything for no_renaming #no_renaming = [] #for _line_idx, line in enumerate(iBuilder_ugly.tokens): # no_renaming.append(' '.join([t for (_tt,t) in line]) + "\n") #Hash_def_one_renaming #beautText = renameUsingHashDefLine(scopeAnalyst, # iBuilder_ugly, # twoLines=False, # debug=False) print(lex_ugly.collapsedText) mosesParams["text"] = lex_ugly.collapsedText mosesParams["align"] = "true" mosesParams["report-all-factors"] = "true" results = proxy.translate( mosesParams) # __request("translate", mosesParams) rawText = Postprocessor(results["nbest"]) translation = rawText.getProcessedOutput() #Send to output: cleanup([preproFile, beautFile, tempFile]) return (translation)
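# --- Minimal sketch of the Moses XML-RPC round trip performed above ---
# Uses Python 2 xmlrpclib, as this codebase does. The server URL is the one
# hard-coded above; the structure of results['nbest'] is whatever the Moses
# server returns and is consumed by the project's Postprocessor.
import xmlrpclib

def translate_collapsed_text(collapsed_text,
                             url="http://godeep.cs.ucdavis.edu:8080/RPC2"):
    proxy = xmlrpclib.ServerProxy(url)
    params = {"text": collapsed_text,
              "align": "true",
              "report-all-factors": "true"}
    results = proxy.translate(params)
    return results["nbest"]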
def processFile(js_file_path): try: # Num tokens before vs after try: tok1 = Lexer(os.path.join(files_root, 'orig', js_file_path)).tokenList tok2 = Lexer(os.path.join(files_root, 'no_renaming', js_file_path)).tokenList # tok3 = Lexer(os.path.join(files_root, 'basic_renaming', js_file_path)).tokenList # tok4 = Lexer(os.path.join(files_root, 'normalized', js_file_path)).tokenList tok5 = Lexer( os.path.join(files_root, 'hash_def_one_renaming', js_file_path)).tokenList tok6 = Lexer( os.path.join(files_root, 'hash_def_two_renaming', js_file_path)).tokenList except: return (js_file_path, None, 'Lexer fail') # For now only work with minified files that have # the same number of tokens as the originals if not len(set([len(tok1), len(tok2), len(tok5), len(tok6)])) == 1: return (js_file_path, None, 'Num tokens mismatch') clear = Beautifier() # Align minified and clear files, in case the beautifier # did something weird aligner = Aligner() (aligned1, aligned2) = aligner.web_align(tok1, tok2) (ok, beautified1, _err) = clear.web_run(aligned1) tok11 = WebLexer(beautified1).tokenList (ok, beautified2, _err) = clear.web_run(aligned2) tok22 = WebLexer(beautified2).tokenList (aligned5, aligned2) = aligner.web_align(tok5, tok2) (ok, beautified5, _err) = clear.web_run(aligned5) tok55 = WebLexer(beautified5).tokenList (aligned6, aligned2) = aligner.web_align(tok6, tok2) (ok, beautified6, _err) = clear.web_run(aligned6) tok66 = WebLexer(beautified6).tokenList # try: # aligner = Aligner() # # This is already the baseline corpus, no (smart) renaming yet # aligner.align(temp_files['path_tmp_b'], # temp_files['path_tmp_u']) # except: # return (js_file_path, None, 'Aligner fail') try: iBuilder1 = IndexBuilder(tok11) iBuilder2 = IndexBuilder(tok22) # iBuilder3 = IndexBuilder(tok3) # iBuilder4 = IndexBuilder(tok4) iBuilder5 = IndexBuilder(tok55) iBuilder6 = IndexBuilder(tok66) except: return (js_file_path, None, 'IndexBuilder fail') # Check that at least one variable was renamed during minification orig_names = set([ token for line in iBuilder1.tokens for (token_type, token) in line if is_token_subtype(token_type, Token.Name) ]) ugly_names = set([ token for line in iBuilder2.tokens for (token_type, token) in line if is_token_subtype(token_type, Token.Name) ]) if not len(orig_names.difference(ugly_names)): return (js_file_path, None, 'Not minified') orig = [] no_renaming = [] # basic_renaming = [] # normalized = [] hash_def_one_renaming = [] hash_def_two_renaming = [] for _line_idx, line in enumerate(iBuilder1.tokens): orig.append(' '.join([t for (_tt, t) in line]) + "\n") for _line_idx, line in enumerate(iBuilder2.tokens): no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n") # for _line_idx, line in enumerate(iBuilder3.tokens): # basic_renaming.append(' '.join([t for (_tt,t) in line]) + "\n") # for _line_idx, line in enumerate(iBuilder4.tokens): # normalized.append(' '.join([t for (_tt,t) in line]) + "\n") for _line_idx, line in enumerate(iBuilder5.tokens): hash_def_one_renaming.append(' '.join([t for (_tt, t) in line]) + "\n") for _line_idx, line in enumerate(iBuilder6.tokens): hash_def_two_renaming.append(' '.join([t for (_tt, t) in line]) + "\n") return ( js_file_path, orig, no_renaming, # basic_renaming, # normalized, hash_def_one_renaming, hash_def_two_renaming) except Exception, e: return (js_file_path, None, str(e))
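# --- Self-contained sketch of the "was anything actually renamed?" check ---
# As in the function above, collect the Token.Name identifiers of the clear and
# minified token streams with Pygments and require that at least one original
# name is missing from the minified set.
from pygments import lex
from pygments.lexers import get_lexer_by_name
from pygments.token import Token, is_token_subtype

def looks_minified(clear_js, ugly_js):
    lexer = get_lexer_by_name('javascript')
    names = lambda src: set(tok for (ttype, tok) in lex(src, lexer)
                            if is_token_subtype(ttype, Token.Name))
    return bool(names(clear_js) - names(ugly_js))

if __name__ == '__main__':
    print(looks_minified("var counter = 1;", "var a = 1;"))      # True
    print(looks_minified("var counter = 1;", "var counter=1;"))  # False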
def testHashDefRenaming(self): ''' TODO: Test the hashing functions are using the context correctly for both one and two line options. Goals are to confirm a) correct line summarization b) consistency of naming of the same variable. However, two different variables may map to the same name with insufficient context. ''' #print(self.obsfuscatedTextFiles[0]) ib1 = IndexBuilder(self.obsLexed[0].tokenList) sa1 = ScopeAnalyst(self.obsfuscatedTextFiles[0]) RS = RenamingStrategies() preRen = PreRenamer() oneLine1 = preRen.rename(RS.HASH_ONE, ib1, sa1, True) twoLine1 = preRen.rename(RS.HASH_TWO, ib1, sa1, True) # oneLine1 = renameUsingHashDefLine(sa1, ib1, False, True) # twoLine1 = renameUsingHashDefLine(sa1, ib1, True, True) #print("OneLine1------------------------------------------------") #print(oneLine1) #print("TwoLine1------------------------------------------------") #print(twoLine1) #One line tests lines = oneLine1.split("\n") self.assertTrue(lines[0] == "var geom2d = function ( ) {") #var <<var#=numeric.sum,=numeric.numberEquals;>> = numeric . sum , <<var=numeric.sum,#=numeric.numberEquals;>> = numeric . numberEquals ; self.assertTrue( lines[1] == "var <<var#=numeric.sum,=numeric.numberEquals;>> = numeric . sum , <<var=numeric.sum,#=numeric.numberEquals;>> = numeric . numberEquals ;" ) self.assertTrue( lines[3] == "function <<function#(,){>> ( <<function(#,){>> , <<function(,#){>> ) {" ) self.assertTrue(lines[4] == "this . x = <<function(#,){>> ;" ) #Why is x not transformed? Global, can't change... #print(lines[7]) self.assertTrue( lines[7] == "u ( <<function#(,){>> , {" ) #Why is u not transformed? -> Because u's hash <<function#(,){>> is ALREADY IN USE IN THE SAME SCOPE!! (This is why u can be translated in 2-lines) self.assertTrue( lines[16] == "for ( var <<for(var#in)[]=[];>> in <<function(,#){>> ) <<function(#,){>> [ <<for(var#in)[]=[];>> ] = <<function(,#){>> [ <<for(var#in)[]=[];>> ] ;" ) self.assertTrue(lines[20] == "Vector2d : <<function#(,){>>") #Two line tests (TODO) lines = twoLine1.split("\n") self.assertTrue(lines[0] == "var geom2d = function ( ) {") self.assertTrue( lines[1] == "var <<var#=numeric.sum,=numeric.numberEquals;return#([this.x*.x,this.y*.y]);>> = numeric . sum , <<var=numeric.sum,#=numeric.numberEquals;return#(this.x,.x,)&&(this.y,.y,);>> = numeric . numberEquals ;" ) # function <<function#(,){(#,{>> ( <<function(#,){this.x=#;>> , <<function(,#){this.y=#;>> ) { self.assertTrue( lines[3] == "function <<function#(,){(#,{>> ( <<function(#,){this.x=#;>> , <<function(,#){this.y=#;>> ) {" ) self.assertTrue(lines[4] == "this . x = <<function(#,){this.x=#;>> ;" ) #Why is x not transformed? Global, can't change... #u(r, { # #<<function#(,){#(,{>> ( <<function#(,){(#,{>> , { self.assertTrue( lines[7] == "<<function#(,){#(,{>> ( <<function#(,){(#,{>> , {" ) # is transformed, but order seems backwards. self.assertTrue( lines[16] == "for ( var <<for(var#in)[]=[];for(varin)[#]=[];>> in <<function(,#){for(varin#)[]=[];>> ) <<function(#,){for(varin)#[]=[];>> [ <<for(var#in)[]=[];for(varin)[#]=[];>> ] = <<function(,#){for(varin#)[]=[];>> [ <<for(var#in)[]=[];for(varin)[#]=[];>> ] ;" ) #Not really two lines, but two references? self.assertTrue(lines[20] == "Vector2d : <<function#(,){(#,{>>") self.assertTrue(True)
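# --- Toy illustration only, not the project's renameUsingHashDefLine ---
# The expected strings in the test above suggest the scheme: a variable is
# renamed to its definition line with whitespace removed, other minifiable
# names blanked out, '#' marking where the variable itself appears, and the
# result wrapped in <<...>>. A single-line approximation of that summarization:
def context_hash(line_tokens, target):
    # line_tokens: list of (kind, text) pairs, kind in {'name', 'other'}
    parts = []
    for kind, text in line_tokens:
        if kind == 'name':
            parts.append('#' if text == target else '')
        else:
            parts.append(text)
    return '<<' + ''.join(parts) + '>>'

if __name__ == '__main__':
    line = [('other', 'var'), ('name', 'r'), ('other', '='),
            ('other', 'numeric.sum'), ('other', ','), ('name', 'e'),
            ('other', '='), ('other', 'numeric.numberEquals'), ('other', ';')]
    print(context_hash(line, 'r'))
    # <<var#=numeric.sum,=numeric.numberEquals;>>  (same shape as the test's hash)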
def processFile(js_file_path): js_file_path = os.path.abspath(js_file_path) print 'READING:', js_file_path acorn = Acorn() (_stdout, acorn_ok) = acorn.run(js_file_path) print 'RUNNING Acorn:', acorn_ok # Load in the minified file minified = open(js_file_path).read() b = Beautifier() (ok, out, err) = b.web_run(minified) # print out # Create lexer lexer = get_lexer_for_filename(js_file_path) # Tokenize input and compute mappings between the different # indices used: (line, col), flat, (l,c) in token list indexBuilder = IndexBuilder(lex(minified, lexer)) tokens = indexBuilder.tokens print 'RUNNING IndexBuilder:', len(tokens) > 0 #nice1 = JSNice() #(ok, _out, _err) = nice1.run(js_file_path) #print 'RUNNING JSNice:', ok #nice2 = UnuglifyJS() #(ok, _out, _err) = nice2.run(js_file_path) #print 'RUNNING UnuglifyJS:', ok _pid = multiprocessing.current_process().ident # Compute scoping: name2scope is a dictionary where keys # are (name, start_index) tuples and values are scope identifiers. # Note: start_index is a flat (unidimensional) index, # not a (line_chr_idx, col_chr_idx) index. # scopeAnalyst = ScopeAnalyst(js_file_path) # name2defScope = scopeAnalyst.resolve_scope() # isGlobal = scopeAnalyst.isGlobal scopeAnalyst = WebScopeAnalyst(minified) name2defScope = scopeAnalyst.resolve_scope() isGlobal = scopeAnalyst.isGlobal print 'RUNNING ScopeAnalyst:', len(name2defScope) > 0 name2useScope = scopeAnalyst.name2useScope name2pth = scopeAnalyst.name2pth nameOrigin = scopeAnalyst.nameOrigin scopes = set(name2useScope.values()) for scope in scopes: print scope lc_list = [ indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]] for (t, pos) in name2useScope.keys() if name2useScope[(t, pos)] == scope ] highlight(tokens, lc_list) print # Discover the path to the source map _map_path = sourcemap.discover(minified) # Read and parse our sourcemap # sourcemapIndex = sourcemap.load(open(map_path)) # Cluster names by scope nameScope2Positions = {} # Index data by (name,scope) for token, l in indexBuilder.name2CharPositions.iteritems(): for (line, col) in sorted(l, key=lambda (a, b): (a, b)): pos = indexBuilder.flatMap[(line, col)] if name2defScope.has_key((token, pos)): scope = name2defScope[(token, pos)] use_scope = name2useScope[(token, pos)] pth = name2pth[(token, pos)] glb = isGlobal[(token, pos)] nameScope2Positions.setdefault((token, scope, glb), []) nameScope2Positions[(token, scope, glb)].append((line, col)) # print token, pos # print 'def:', scope # print 'use:', use_scope # print 'pth:', pth # highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]]) # print for (token,scope,glb), positions in sorted(nameScope2Positions.iteritems(), \ key=lambda (x,y):x[0]): if glb: continue pos = sorted(positions, key=lambda e: (e[0], e[1])) # t = [] tt = [] line_tok_idxs = set([]) for (l, c) in pos: # orig = sourcemapIndex.lookup(line=l, column=c).name (tl, tc) = indexBuilder.revTokMap[(l, c)] line_tok_idxs.add(tl) p = indexBuilder.flatMap[(l, c)] tt.append(((tl, tc), p)) # t.append(orig) # if token == 'n': print '\nNAME:', token.encode('utf-8'), 'isGlobal =', glb # print scope # highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]]) for ((tli, tci), p) in tt: scope = name2defScope[(token, p)] use_scope = name2useScope[(token, p)] pth = name2pth[(token, p)] origin = nameOrigin[(token, scope)] # print token #, p, origin # print # print 'def:', scope # print 'use:', use_scope # print 'pth:', pth # print for tl in sorted(set([tli for ((tli, tci), p) in tt])): l = list(tokens[tl]) for tc 
in [tci for ((tli, tci), p) in tt if tli == tl]: l[tc] = (l[tc][0], unichr(0x2588) + token + unichr(0x2588)) # pos = indexBuilder.flatMap[(line,col)] print ' ', '%d:' % (tl + 1), ' '.join( [x[1].encode('utf-8') for x in l]) print return
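# --- Sketch of the python-sourcemap calls used in the processFile variants above ---
# discover() finds the sourceMappingURL reference inside the minified text,
# load() parses the map, and lookup() recovers the original name at a given
# (line, column). The fallback to the token itself mirrors the behaviour above;
# paths are whatever discover() returns and may need resolving.
import sourcemap

def original_name(minified_text, line, col, token):
    map_path = sourcemap.discover(minified_text)   # None if no map reference
    if not map_path:
        return token
    with open(map_path) as f:
        index = sourcemap.load(f)
    return index.lookup(line=line, column=col).name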
def deobfuscateJS(self, obfuscatedCode, use_mix, transactionID, debug_output=False, parallel=True, use_local=True): """ Take a string representing minified javascript code and attempt to translate it into a version with better renamings. Parameters ---------- obfuscatedCode: The minified javascript text. use_mix: True/False -> should we invoke JSNice and throw the names into the language model mix? transactionID: an ID for storing temp files - used currently only to identify the input to JSNice. debug_output: should we print debugging output in this pass (TRUE/FALSE) parallel: enable parallelization performance enhancements -> such as calling the moses servers in parallel. Returns ------- A tuple: renamed_text - the renamed text jsnice_error - "" if no error, otherwise a message stating where the jsnice mixing failed Third element is a tuple of TIMING_COUNT performance times preprocess time - total time to preprocess before invoking moses servers prepre time - how long does the first step of the preprocessor take? jsnice time - part of the preprocessing, how long does it take to get and parse jsnice names renaming time - how long did the hashing steps in preprocess take lex_total_time - how long did all the lexers take, builder_time - how long did all the Index Builders take scoper_time - how long did all the scopeAnalysts take moses time - how long did the moses servers take moses_rn_parallel - total time for the parallel moses and renaming to complete postprocess time - how long did the consistency resolution and language model queries take. """ RS = RenamingStrategies() CS = ConsistencyStrategies() r_strategy = RS.HASH_ONE #c_strategy = CS.FREQLEN # or CS.LM? (CS.LM requires a language model + a querylm from moses) #c_strategy = CS.LM c_strategy = CS.LOGMODEL if (use_local == False): proxies = MosesProxy().web_proxies else: proxies = MosesProxy().web_local mosesParams = {} #lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.970k/js.blm.lm" #lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.500k/js.blm.lm" lm_path = "./phrase-tables/langmodels/js.blm.lm" #if socket.gethostname() == 'bogdan.mac': # lm_path = "/Users/bogdanv/workspace2/deobfuscator/data/lm/js.blm.lm" #elif socket.gethostname() == "Caseys-MacBook-Pro.local" or socket.gethostname() == "campus-019-136.ucdavis.edu": # lm_path = "/Users/caseycas/jsnaughty_lms/js970k.blm.lm" #Hashed Name -> Minified Name (name, def_scope) -> (name, def_scope) hash_name_map = {} #Minified Name -> jsnice name (name, def_scope) -> (name, def_scope) jsnice_name_map = {} #Record of any errors we get in the js mixing. #If this feature is enabled (to be added as a switch on the website) #it should not crash the input if there is a failure. If the query #doesn't work for some reason, then we should just use the candidate #names provided by moses. jsnice_errors = [] start = time.time() # Strip comments, replace literals, etc try: #if True: prepro = WebLMPreprocessor(obfuscatedCode) prepro_text = str(prepro) if (debug_output): print("Prepro_text----------------------------------") print(prepro_text) print("Prepro_text----------------------------------") except: return ((prepro_error, "", (0, ) * TIMING_COUNT)) prepre_end = time.time() prepre_time = prepre_end - start clear = Beautifier() (ok, beautified_text, _err) = clear.web_run(prepro_text) if (debug_output): print("Beautified Text") print(beautified_text) if (not ok): return ((beaut_error, "", (0, ) * TIMING_COUNT)) #Due to a bug? 
in the jsnice web service, we need to save the #input text as a file. min_input_file = os.path.join(self.tmpDir, str(transactionID) + ".u.js") with open(min_input_file, 'w') as f: f.write(beautified_text) try: # lex_ugly = Lexer(beautFile) lex_ugly = WebLexer(beautified_text) if (debug_output): print("Lex_ugly---------------------") print(lex_ugly.tokenList) print("Lex_ugly---------------------") iBuilder_ugly = IndexBuilder(lex_ugly.tokenList) except: return ((ib_error, "", (0, ) * TIMING_COUNT)) #Do Scope related tasks #a raw text version try: # scopeAnalyst = ScopeAnalyst(beautFile) scopeAnalyst = WebScopeAnalyst(beautified_text) except: return ((sa_error, "", (0, ) * TIMING_COUNT)) #Cut short if no variables if (not scopeAnalyst.hasMinifiableVariables()): return ((beautified_text, "No Minifiable Variables", (0, ) * TIMING_COUNT)) elif (debug_output): print("GLOBAL VAR MAP: " + str(scopeAnalyst.isGlobal)) #lex_ugly.write_temp_file(tempFile) js_start = time.time() ######################## # Nice2Predict start ######################## #Don't want a crashing failure for jsnice query. # BV: Next block left out until I figure out the pipe issue # BV: Update: I couldn't pipe input to N2P. TODO: FIX # Run the JSNice from http://www.nice2predict.org if (use_mix): unuglifyJS = UnuglifyJS() (ok, n2p_text, _err) = unuglifyJS.run(min_input_file) #ok = False #Failure test if not ok: jsnice_errors.append('Nice2Predict fail') #return (js_file_path, None, 'Nice2Predict fail') if (use_mix and jsnice_errors == []): (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text) if not ok: jsnice_errors.append('Beautifier failed for JSNice.') #return (js_file_path, None, 'Beautifier fail') if (debug_output): print("JSNice Text") print(n2p_text_beautified) try: n2p_lexer = WebLexer(n2p_text_beautified) n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList) n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified) except: jsnice_errors.append( "IndexBuilder or ScopeAnalysted failed for JSNice.") #return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail') ######################## # Nice2Predict End ######################## js_end = time.time() js_time = js_end - js_start #Do Scope related tasks (name_positions, position_names, use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst) #Map the jsnice names to the minified counterparts. if (use_mix and jsnice_errors == [] ): #only attempt if we are error free for jsnice up to this point. try: orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(), key=lambda x: x[1]) orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(), key=lambda x: x[1]) if (len(orderedVarsNew) != len(orderedVarsN2p)): jsnice_errors.append( "JSNice and minified name lists different lengths.") #raise IndexError("Length Mismatch") #Probably better to have our own defined error type, but this will do for now #return ("JsNice and New Name lists different length") for i in range(0, len(orderedVarsNew)): name_new = orderedVarsNew[i][0] def_scope_new = scopeAnalyst.name2defScope[ orderedVarsNew[i]] name_n2p = orderedVarsN2p[i][0] def_scope_n2p = scopeAnalyst.name2defScope[ orderedVarsNew[i]] jsnice_name_map[(name_new, def_scope_new)] = (name_n2p, def_scope_n2p) except: jsnice_errors.append( "JSNice to minified name map building failed.") (_name_positions, \ position_names, _use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst) #Note: we want to put these in parallel once we've tested the #serial version... 
pre_outer_end = time.time() pre_time = pre_outer_end - start if (not parallel): #Get moses output for no_renaming (status, error_msg, translation_default, name_candidates_default, iBuilder_default, scopeAnalyst_default, name_positions_default, position_names_default, use_scopes_default, hash_name_map_default, rn_time_default, m_time_default, lex_time_default, post_start_default) = getMosesTranslation(proxies[RS.NONE], RS.NONE, RS, clear, iBuilder_ugly, scopeAnalyst, debug_output) #print("MOSES NO RENAMING: " + str(m_time_default)) if (not status): return ((error_msg, "", (0, ) * TIMING_COUNT)) #Get moses output for hash_renaming (status, error_msg, translation, name_candidates, a_iBuilder, a_scopeAnalyst, a_name_positions, a_position_names, a_use_scopes, hash_name_map, rn_time, m_time, lex_time, post_start) = getMosesTranslation(proxies[r_strategy], r_strategy, RS, clear, iBuilder_ugly, scopeAnalyst, debug_output) #print("MOSES HASH RENAMING: " + str(m_time)) if (not status): return ((error_msg, "", (0, ) * TIMING_COUNT)) m_parallel_time = 0 else: #Parallel version none_wrapper = (RS.NONE, RS, clear, iBuilder_ugly, scopeAnalyst, debug_output, use_local) hash_wrapper = (r_strategy, RS, clear, iBuilder_ugly, scopeAnalyst, debug_output, use_local) wrappers = [none_wrapper, hash_wrapper] pool = multiprocessing.Pool(processes=2) m_parallel_start = time.time() for result in pool.imap(getMosesTranslationParallel, wrappers): if (result[0] == RS.NONE): #No renaming (status, error_msg, translation_default, name_candidates_default, iBuilder_default, scopeAnalyst_default, name_positions_default, position_names_default, use_scopes_default, hash_name_map_default, rn_time_default, m_time_default, lex_time_default, post_start_default) = result[1] #print("MOSES NO RENAMING: " + str(m_time_default)) if (not status): return ((error_msg, "", (0, ) * TIMING_COUNT)) else: (status, error_msg, translation, name_candidates, a_iBuilder, a_scopeAnalyst, a_name_positions, a_position_names, a_use_scopes, hash_name_map, rn_time, m_time, lex_time, post_start) = result[1] #print("MOSES HASH RENAMING: " + str(m_time)) if (not status): return ((error_msg, "", (0, ) * TIMING_COUNT)) m_parallel_time = time.time() - m_parallel_start pre_time += rn_time_default + rn_time if (debug_output): print("Serial: " + str(m_time + m_time_default + rn_time + rn_time_default)) print("Parallel: " + str(m_parallel_time)) if translation is not None and translation_default is not None: for key_default, suggestions in name_candidates_default.iteritems( ): # (name_default, def_scope_default) = key_default pos_default = scopeAnalyst_default.nameDefScope2pos[ key_default] (lin, col) = iBuilder_default.revFlatMat[pos_default] (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)] (name, def_scope) = a_position_names[line_num][line_idx] key = (name, def_scope) for name_translation, lines in suggestions.iteritems(): name_candidates.setdefault(key, {}) name_candidates[key].setdefault(name_translation, set([])) name_candidates[key][name_translation].update(lines) # name_candidates is a dictionary of dictionaries: # keys are (name, None) (if scopeAnalyst=None) or # (name, def_scope) tuples (otherwise); # values are suggested translations with the sets # of line numbers on which they appear. 
#if(True): if (debug_output): print("Name_candidates") print(name_candidates) print("jsnice_name_map") print(jsnice_name_map) print("hash_name_map") print(hash_name_map) # **** BV: This might be all we need to combine Naughty & Nice if ( use_mix and jsnice_errors == [] ): #only attempt if we are error free for jsnice up to this point. try: name_candidates_copy = deepcopy(name_candidates) for key, suggestions in name_candidates_copy.iteritems(): if (debug_output): print("Key: " + str(key)) print("Suggestions: " + str(suggestions)) if r_strategy == RS.NONE: (name_n2p, def_scope_n2p) = jsnice_name_map[key] else: (name_n2p, def_scope_n2p ) = jsnice_name_map[hash_name_map.get(key, key)] for name_translation, lines in suggestions.iteritems(): name_candidates.setdefault(key, {}) name_candidates[key].setdefault(name_n2p, set([])) name_candidates[key][name_n2p].update(lines) except: jsnice_errors.append( "Failure while adding jsnice names to candidate pool.") cr = ConsistencyController(debug_mode=debug_output) # An identifier may have been translated inconsistently # across different lines (Moses treats each line independently). # Try different strategies to resolve inconsistencies, if any # Compute renaming map (x -> length, y -> width, ...) # Note that x,y here are names after renaming #Hash error is occuring in here. try: (temp_renaming_map, seen) = cr.computeRenaming( c_strategy, name_candidates, a_name_positions, a_use_scopes, a_iBuilder, lm_path, {}, hash_name_map) except: return ("Compute renaming fail.", "", (0, ) * TIMING_COUNT) if (debug_output): print("Temp renaming map") print(temp_renaming_map) # Fall back on original names in input, if # no translation was suggested postRen = PostRenamer() renaming_map = postRen.updateRenamingMap(a_name_positions, position_names, a_use_scopes, temp_renaming_map, seen, r_strategy) if (debug_output): print("Renaming Map") print(renaming_map) # Apply renaming map and save output for future inspection renamed_text = postRen.applyRenaming(a_iBuilder, a_name_positions, renaming_map) (ok, beautified_renamed_text, _err) = clear.web_run_end(renamed_text) #print(name_candidates) #print("--------------") #print(renamed_text) #print("--------------") #print(beautified_renamed_text) #print("--------------") #print(" ".join(jsnice_errors)) if not ok: return ((beaut_error, "", (0, ) * TIMING_COUNT)) if (debug_output): print("Renamed text") print(beautified_renamed_text) #Time Calculations... (Will need to update for when it becomes parallel post_end = time.time() post_time = post_end - post_start #Record any jsnice errors (but leave output blank if there are none). jsnice_error_string = "" if (jsnice_errors != []): jsnice_error_string = "JSNice mixing attempt failed. Reporting renaming with only our method. \nJSNice Errors : \n" jsnice_error_string += "\n".join(jsnice_errors) + "\n" #Tally up the build times for the lexers, indexbuilders and scopers. 
if (not use_mix): n2pLexTime = 0 n2pBuildTime = 0 n2pSATime = 0 else: n2pLexTime = n2p_lexer.build_time n2pBuildTime = n2p_iBuilder.build_time n2pSATime = n2p_scopeAnalyst.build_time #Lexers lex_total_time = lex_time + lex_time_default + lex_ugly.build_time + n2pLexTime #IndexBuilders builder_time = iBuilder_ugly.build_time + n2pBuildTime + a_iBuilder.build_time + iBuilder_default.build_time #scopers scoper_time = n2pSATime + scopeAnalyst.build_time + scopeAnalyst_default.build_time + a_scopeAnalyst.build_time #Change the presentation of this to return performance information #and error codes as separate elements in a tuple #New return: translation, jsnice_error, preprocess time, js_time, rename_time #m_time, post_time. return ((str(beautified_renamed_text), jsnice_error_string, (pre_time, prepre_time, js_time, rn_time + rn_time_default, lex_total_time, builder_time, scoper_time, m_time + m_time_default, m_parallel_time, post_time)))
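# --- Hedged usage sketch for deobfuscateJS, based only on its docstring and
# return statements above ---
# The caller gets back the renamed text, a JSNice error string ('' when mixing
# succeeded), and a tuple of timing measurements. `service` stands in for an
# instance of whatever class defines this method.
def rename_minified(service, minified_js, transaction_id):
    (renamed, jsnice_error, timings) = service.deobfuscateJS(
        minified_js,
        use_mix=True,              # try to fold JSNice suggestions into the mix
        transactionID=transaction_id,
        debug_output=False,
        parallel=True,
        use_local=True)
    (pre_t, prepre_t, js_t, rn_t, lex_t,
     builder_t, scoper_t, moses_t, moses_par_t, post_t) = timings
    if jsnice_error:
        print(jsnice_error)        # mixing failed; output still uses Moses alone
    print('moses time: %.2fs (parallel wall clock %.2fs)' % (moses_t, moses_par_t))
    return renamed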