def processFile(js_file_path):
    pid = int(multiprocessing.current_process().ident)

    try:
        signal.alarm(600)

        prepro = Preprocessor(js_file_path)
        prepro.write_temp_file('tmp_%d.js' % pid)

        beauty = Beautifier()
        ok = beauty.run('tmp_%d.js' % pid, 'tmp_%d.b.js' % pid)

        if ok:
            mc = MiniChecker('tmp_%d.b.js' % pid)
            try:
                isMini = mc.compare(keep_mini=False)
            except Exception as e:
                isMini = str(e)

            cleanup(pid)
            return [os.path.basename(js_file_path), isMini]
        else:
            cleanup(pid)
            return [os.path.basename(js_file_path), 'Beautifier failed']

    except TimeExceededError:
        cleanup(pid)
        return [os.path.basename(js_file_path), 'Timeout']
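# The worker above assumes module-level timeout and cleanup plumbing that is
# not shown in this file: a SIGALRM handler that raises TimeExceededError
# when signal.alarm(600) fires, and a cleanup helper for the per-process
# temp files. A minimal sketch; the handler registration and the exact set
# of temp-file suffixes are assumptions, not the project's confirmed code:
import os
import signal


class TimeExceededError(Exception):
    """Raised by the SIGALRM handler when the alarm fires."""
    pass


def _alarm_handler(signum, frame):
    raise TimeExceededError('processing exceeded the time limit')

signal.signal(signal.SIGALRM, _alarm_handler)


def tryRemove(path):
    # Ignore already-deleted files so cleanup stays idempotent
    try:
        os.remove(path)
    except OSError:
        pass


def cleanup(pid):
    # Remove the per-process temp files created by processFile
    for tmp in ['tmp_%d.js' % pid, 'tmp_%d.b.js' % pid]:
        tryRemove(tmp)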
def summarizeUnscopedTranslation(renaming_map, f_path,
                                 translation_strategy,
                                 output_path, base_name,
                                 name_candidates, name_positions,
                                 iBuilder):

    nc = []

    f_base = os.path.basename(f_path)
    training_strategy = f_base.split('.')[1]
    tmp_path = '%s.%s.js' % (f_base[:-3], translation_strategy)

    o_path = '%s.%s.unscoped.%s.js' % (base_name, training_strategy,
                                       translation_strategy)
    # print f_path, f_base, training_strategy, tmp_path, o_path, base_name

    writeTmpLines(renameHashed(iBuilder, name_positions, renaming_map),
                  tmp_path)

    clear = Beautifier()
    ok = clear.run(tmp_path, os.path.join(output_path, o_path))
    if not ok:
        return False

    try:
        lexer = Lexer(os.path.join(output_path, o_path))
        iBuilder_local = IndexBuilder(lexer.tokenList)
        scopeAnalyst_local = ScopeAnalyst(os.path.join(output_path, o_path))
    except:
        return False

    nameOrigin = scopeAnalyst_local.nameOrigin
    isGlobal = scopeAnalyst_local.isGlobal

    for (name, def_scope) in nameOrigin.iterkeys():

        pos = scopeAnalyst_local.nameDefScope2pos[(name, def_scope)]

        # Global-name filter intentionally disabled; the original guard was
        # `if not isGlobal.get((name, pos), True):`
        (lin, col) = iBuilder_local.revFlatMat[pos]
        (tok_lin, tok_col) = iBuilder_local.revTokMap[(lin, col)]

        nc.append(('%s.unscoped.%s' % (training_strategy,
                                       translation_strategy),
                   def_scope, tok_lin, tok_col,
                   isGlobal.get((name, pos), True),
                   name, '', ''))

    return nc
def processFile(l):
    js_file_path = l[0]
    pid = int(multiprocessing.current_process().ident)

    try:
        # Temp files to be created during processing
        path_tmp = 'tmp_%d.js' % pid
        path_tmp_b = 'tmp_%d.b.js' % pid

        # Strip comments, replace literals, etc.
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(path_tmp)
        except:
            cleanup(pid)
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        ok = clear.run(path_tmp, path_tmp_b)
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Beautifier fail')

        try:
            iBuilder_clear = IndexBuilder(Lexer(path_tmp_b).tokenList)
        except:
            cleanup(pid)
            return (js_file_path, None, 'IndexBuilder fail')

        n_lines = len(iBuilder_clear.tokens)
        max_line_len = max([len(line) for line in iBuilder_clear.tokens])

        cleanup(pid)
        return (js_file_path, n_lines, max_line_len)

    except Exception as e:
        cleanup(pid)
        return (js_file_path, None, str(e))
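# Workers like this are typically driven from a multiprocessing pool; a
# hypothetical driver (the CSV file names, pool size, and result layout are
# assumptions consistent with the (path, n_lines, max_line_len) tuples
# returned above):
import csv
import multiprocessing


def main():
    with open('sample.csv') as f:
        rows = [row for row in csv.reader(f)]

    pool = multiprocessing.Pool(processes=4)
    with open('stats.csv', 'wb') as out:
        writer = csv.writer(out)
        # imap_unordered streams results as workers finish
        for result in pool.imap_unordered(processFile, rows):
            writer.writerow(result)
    pool.close()
    pool.join()


if __name__ == '__main__':
    main()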
def summarizeFallbackTranslation(renaming_map, fallback_renaming_map,
                                 f_path, translation_strategy,
                                 output_path, base_name,
                                 name_candidates, name_positions,
                                 iBuilder, scopeAnalyst):

    nc = []

    f_base = os.path.basename(f_path)
    training_strategy = f_base.split('.')[1]
    tmp_path = '%s.%s.js' % (f_base[:-3], translation_strategy)

    o_path = '%s.%s.%s.js' % (base_name, training_strategy,
                              translation_strategy)
    # print f_path, f_base, training_strategy, tmp_path, o_path, base_name

    isGlobal = scopeAnalyst.isGlobal

    for (name, def_scope), renaming in renaming_map.iteritems():

        pos = scopeAnalyst.nameDefScope2pos[(name, def_scope)]

        (lin, col) = iBuilder.revFlatMat[pos]
        (tok_lin, tok_col) = iBuilder.revTokMap[(lin, col)]

        nc.append(('%s.%s' % (training_strategy, translation_strategy),
                   def_scope, tok_lin, tok_col,
                   isGlobal.get((name, pos), True),
                   renaming,
                   ','.join(name_candidates[(name, def_scope)])))

    writeTmpLines(renameHashedFallback(iBuilder, name_positions,
                                       renaming_map,
                                       fallback_renaming_map),
                  tmp_path)

    clear = Beautifier()
    ok = clear.run(tmp_path, os.path.join(output_path, o_path))
    if not ok:
        return False

    return nc
def processFile(row):
    js_file_path = os.path.join(corpus_root, row[0])
    pid = int(multiprocessing.current_process().ident)
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    # Temp files to be created during processing
    temp_files = {'path_tmp': 'tmp_%d.js' % pid,
                  'path_tmp_b': 'tmp_%d.b.js' % pid,
                  'path_tmp_b_a': 'tmp_%d.b.a.js' % pid,
                  'path_tmp_u': 'tmp_%d.u.js' % pid,
                  'path_tmp_u_a': 'tmp_%d.u.a.js' % pid}

    try:
        # Pass through beautifier to fix layout:
        #
        # - once through JSNice without renaming
        # jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
        #
        # (ok, _out, _err) = jsNiceBeautifier.run(js_file_path,
        #                                         temp_files['path_tmp'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, False, 'JSNice Beautifier fail')
        #
        # # Weird JSNice renamings despite --no-rename
        # try:
        #     before = set([token for (token, token_type) in
        #                   Lexer(js_file_path).tokenList
        #                   if is_token_subtype(token_type, Token.Name)])
        #     after = set([token for (token, token_type) in
        #                  Lexer(temp_files['path_tmp']).tokenList
        #                  if is_token_subtype(token_type, Token.Name)])
        #
        #     if not before == after:
        #         return (js_file_path, False, 'Weird JSNice renaming')
        #
        # except:
        #     cleanup(temp_files)
        #     return (js_file_path, False, 'Lexer fail')

        # Strip comments, replace literals, etc.
        # (js_file_path is already rooted at corpus_root above)
        try:
            prepro = Preprocessor(js_file_path)
            prepro.write_temp_file(temp_files['path_tmp'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Preprocessor fail')

        # - and another time through uglifyjs pretty print only
        clear = Beautifier()
        ok = clear.run(temp_files['path_tmp'], temp_files['path_tmp_b'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, False, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        ok = ugly.run(temp_files['path_tmp_b'], temp_files['path_tmp_u'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, False, 'Uglifier fail')

        # Num tokens before vs after
        try:
            tok_clear = Lexer(temp_files['path_tmp_b']).tokenList
            tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList
        except:
            cleanup(temp_files)
            return (js_file_path, False, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(temp_files)
            return (js_file_path, False, 'Num tokens mismatch')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(temp_files['path_tmp_b'],
                          temp_files['path_tmp_u'])
        except:
            cleanup(temp_files)
            return (js_file_path, False, 'Aligner fail')

        # Check if minification resulted in any change;
        # it's not very interesting otherwise
        if open(temp_files['path_tmp_b_a']).read() == \
                open(temp_files['path_tmp_u_a']).read():
            cleanup(temp_files)
            return (js_file_path, False, 'Not minified')

        try:
            lex_ugly = Lexer(temp_files['path_tmp_u_a'])
            _iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, False, 'IndexBuilder fail')

        # Store original and uglified versions
        ok = clear.run(temp_files['path_tmp_b_a'],
                       os.path.join(output_path, '%s.js' % base_name))
        if not ok:
            cleanup(temp_files)
            cleanupProcessed(base_name)
            return (js_file_path, False, 'Beautifier fail')

        ok = clear.run(temp_files['path_tmp_u_a'],
                       os.path.join(output_path, '%s.u.js' % base_name))
        if not ok:
            cleanup(temp_files)
            cleanupProcessed(base_name)
            return (js_file_path, False, 'Beautifier fail')

        cleanup(temp_files)
        return (js_file_path, True, 'OK')

    except Exception as e:
        cleanup(temp_files)
        return (js_file_path, False, str(e))
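# This script passes the whole temp_files dict to cleanup, unlike the
# pid-based variants above; a plausible sketch consistent with those calls.
# cleanupProcessed's exact suffixes are an assumption, and tryRemove and
# output_path are module-level helpers as elsewhere in these scripts:
def cleanup(temp_files):
    # Remove every temp file registered for this worker
    for path in temp_files.values():
        tryRemove(path)


def cleanupProcessed(base_name):
    # Drop any partially written output for this file
    for suffix in ['.js', '.u.js']:
        tryRemove(os.path.join(output_path, base_name + suffix))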
def processFile(l):
    js_file_path = l[0]

    if js_file_path in seen:
        return (js_file_path, None, 'Skipped')

    pid = int(multiprocessing.current_process().ident)

    # Temp files to be created during processing
    temp_files = {'path_tmp': 'tmp_%d.js' % pid,
                  'path_tmp_b': 'tmp_%d.b.js' % pid,
                  'path_tmp_b_n': 'tmp_%d.b.n.js' % pid,
                  'path_tmp_u': 'tmp_%d.u.js' % pid,
                  'path_tmp_u_n': 'tmp_%d.u.n.js' % pid,
                  'path_tmp_b_a': 'tmp_%d.b.a.js' % pid,
                  'path_tmp_u_a': 'tmp_%d.u.a.js' % pid}

    try:
        # Strip comments, replace literals, etc.
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(temp_files['path_tmp'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout:
        #
        # - once through JSNice without renaming
        # jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
        #
        # (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp'],
        #                                         temp_files['path_tmp_b_n'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'JSNice Beautifier fail')
        #
        # - and another time through uglifyjs pretty print only
        # clear = Beautifier()
        # ok = clear.run(temp_files['path_tmp_b_n'],
        #                temp_files['path_tmp_b'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Beautifier fail')
        #
        # JSNice is down!
        clear = Beautifier()
        ok = clear.run(temp_files['path_tmp'], temp_files['path_tmp_b_n'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        # Normalize
        norm = Normalizer()
        ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   temp_files['path_tmp_b_n']),
                      False,
                      temp_files['path_tmp_b'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Normalizer fail')

        # Minify
        ugly = Uglifier()
        ok = ugly.run(temp_files['path_tmp_b'], temp_files['path_tmp_u_n'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Uglifier fail')

        # Normalize
        norm = Normalizer()
        ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   temp_files['path_tmp_u_n']),
                      False,
                      temp_files['path_tmp_u'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Normalizer fail')

        # Num tokens before vs after
        try:
            tok_clear = Lexer(temp_files['path_tmp_b']).tokenList
            tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(temp_files)
            return (js_file_path, None, 'Num tokens mismatch')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(temp_files['path_tmp_b'],
                          temp_files['path_tmp_u'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Aligner fail')

        try:
            lex_clear = Lexer(temp_files['path_tmp_b_a'])
            iBuilder_clear = IndexBuilder(lex_clear.tokenList)

            lex_ugly = Lexer(temp_files['path_tmp_u_a'])
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')

        # Normalize
        norm = Normalizer()
        ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   temp_files['path_tmp_b']),
                      True,
                      temp_files['path_tmp_u_n'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Normalizer fail')

        try:
            lex_norm = Lexer(temp_files['path_tmp_u_n'])
            iBuilder_norm = IndexBuilder(lex_norm.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')

        normalized = []
        for line_idx, line in enumerate(iBuilder_norm.tokens):
            normalized.append(' '.join([t for (_tt, t) in line]) + "\n")

        # Compute scoping: name2scope is a dictionary where keys
        # are (name, start_index) tuples and values are scope identifiers.
        # Note: start_index is a flat (unidimensional) index,
        # not a (line_chr_idx, col_chr_idx) index.
        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                temp_files['path_tmp_u_a']))
            # _name2defScope = scopeAnalyst.resolve_scope()
            # _isGlobal = scopeAnalyst.isGlobal
            # _name2useScope = scopeAnalyst.resolve_use_scope()
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'ScopeAnalyst fail')

        orig = []
        no_renaming = []
        for line_idx, line in enumerate(iBuilder_ugly.tokens):
            orig.append(' '.join([t for (_tt, t) in
                                  iBuilder_clear.tokens[line_idx]]) + "\n")
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

        # Simple renaming: disambiguate overloaded names using scope id
        basic_renaming = renameUsingScopeId(scopeAnalyst, iBuilder_ugly)

        # More complicated renaming: collect the context around
        # each name (global variables, API calls, punctuation)
        # and build a hash of the concatenation.
        # hash_renaming = renameUsingHashAllPrec(scopeAnalyst,
        #                                        iBuilder_ugly,
        #                                        debug=True)

        hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst,
                                                       iBuilder_ugly,
                                                       twoLines=False,
                                                       debug=False)

        hash_def_two_renaming = renameUsingHashDefLine(scopeAnalyst,
                                                       iBuilder_ugly,
                                                       twoLines=True,
                                                       debug=False)

        cleanup(temp_files)
        return (js_file_path,
                orig,
                no_renaming,
                basic_renaming,
                normalized,
                # hash_renaming,
                hash_def_one_renaming,
                hash_def_two_renaming)

    except Exception as e:
        cleanup(temp_files)
        return (js_file_path, None, str(e))
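# Downstream, the parallel corpora are assembled from these per-file tuples;
# a hypothetical consumer (the output file names are assumptions; the
# success/failure convention follows the returns above, where failures are
# 3-tuples ending in an error message):
def writeParallelCorpus(results, out_prefix):
    handles = {'orig': open(out_prefix + '.orig.txt', 'w'),
               'no_renaming': open(out_prefix + '.no_renaming.txt', 'w'),
               'normalized': open(out_prefix + '.normalized.txt', 'w')}
    for res in results:
        if len(res) == 3 and res[1] is None:
            continue  # (js_file_path, None, error_message)
        (_path, orig, no_renaming, _basic_renaming, normalized,
         _hash_def_one, _hash_def_two) = res
        # Token streams are already aligned line by line
        handles['orig'].writelines(orig)
        handles['no_renaming'].writelines(no_renaming)
        handles['normalized'].writelines(normalized)
    for h in handles.values():
        h.close()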
def processFile(l):
    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]
    pid = int(multiprocessing.current_process().ident)

    temp_files = {'path_tmp': 'tmp_%d.js' % pid,
                  'path_tmp_b': 'tmp_%d.b.js' % pid,
                  'path_tmp_b_1': 'tmp_%d.b.1.js' % pid,
                  'path_tmp_b_2': 'tmp_%d.b.2.js' % pid,
                  'path_tmp_b_a': 'tmp_%d.b.a.js' % pid,
                  'path_tmp_u': 'tmp_%d.u.js' % pid,
                  'path_tmp_u_a': 'tmp_%d.u.a.js' % pid,
                  'path_tmp_unugly': 'tmp_%d.n2p.js' % pid,
                  'path_tmp_unugly_1': 'tmp_%d.n2p.1.js' % pid,
                  'path_tmp_unugly_2': 'tmp_%d.n2p.2.js' % pid,
                  'path_tmp_jsnice': 'tmp_%d.jsnice.js' % pid,
                  'f2': 'tmp_%d.no_renaming.js' % pid,
                  # 'f3': 'tmp_%d.basic_renaming.js' % pid,
                  # 'f4': 'tmp_%d.hash_renaming.js' % pid,
                  'f5': 'tmp_%d.hash_def_one_renaming.js' % pid,
                  # 'f6': 'tmp_%d.hash_def_two_renaming.js' % pid,
                  'f7': 'tmp_%d.hash_def_one_renaming_fb.js' % pid,
                  'path_orig': os.path.join(output_path,
                                            '%s.js' % base_name),
                  'path_ugly': os.path.join(output_path,
                                            '%s.u.js' % base_name),
                  'path_unugly': os.path.join(output_path,
                                              '%s.n2p.js' % base_name),
                  'path_jsnice': os.path.join(output_path,
                                              '%s.jsnice.js' % base_name)}

    # for strategy in ['js', 'lm.js', 'len.js', 'freqlen.js']:
    #     for renaming in ['no_renaming', 'hash_def_one_renaming']:
    #         temp_files['path_tmp_%s_%s' % (renaming, strategy)] = \
    #             'tmp_%d.%s.%s' % (pid, renaming, strategy)

    candidates = []

    try:
        # if True:
        # Strip comments, replace literals, etc.
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(temp_files['path_tmp'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        ok = clear.run(temp_files['path_tmp'], temp_files['path_tmp_b'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        # # Pass through beautifier to fix layout
        # clear = Beautifier()
        # ok = clear.run(temp_files['path_tmp'],
        #                temp_files['path_tmp_b_1'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Beautifier fail')
        #
        # jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
        #
        # (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp_b_1'],
        #                                         temp_files['path_tmp_b_2'])
        # if not ok:
        #     cleanup(temp_files)
        #     print js_file_path, _err
        #     return (js_file_path, None, 'JSNice Beautifier fail')
        #
        # ok = clear.run(temp_files['path_tmp_b_2'],
        #                temp_files['path_tmp_b'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Beautifier fail')
        #
        # # Weird JSNice renamings despite --no-rename
        # try:
        #     before = set([token for (token, token_type) in
        #                   Lexer(temp_files['path_tmp_b_1']).tokenList
        #                   if is_token_subtype(token_type, Token.Name)])
        #     after = set([token for (token, token_type) in
        #                  Lexer(temp_files['path_tmp_b']).tokenList
        #                  if is_token_subtype(token_type, Token.Name)])
        #
        #     if not before == after:
        #         return (js_file_path, None, 'Weird JSNice renaming')
        #
        # except:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Lexer fail')

        # Minify
        ugly = Uglifier()
        ok = ugly.run(temp_files['path_tmp_b'], temp_files['path_tmp_u'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Uglifier fail')

        # Num tokens before vs after
        try:
            tok_clear = Lexer(temp_files['path_tmp_b']).tokenList
            tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(temp_files)
            return (js_file_path, None, 'Num tokens mismatch')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(temp_files['path_tmp_b'],
                          temp_files['path_tmp_u'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Aligner fail')

        if open(temp_files['path_tmp_b']).read() == \
                open(temp_files['path_tmp_u']).read():
            cleanup(temp_files)
            return (js_file_path, None, 'Not minified')

        try:
            lex_ugly = Lexer(temp_files['path_tmp_u_a'])
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')

        ############################################################
        # From now on only work with path_tmp_b_a and path_tmp_u_a
        ############################################################

        # Store original and uglified versions
        ok = clear.run(temp_files['path_tmp_b_a'], temp_files['path_orig'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        ok = clear.run(temp_files['path_tmp_u_a'], temp_files['path_ugly'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, _out, _err) = unuglifyJS.run(temp_files['path_tmp_u_a'],
                                          temp_files['path_tmp_unugly'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Nice2Predict fail')

        ok = clear.run(temp_files['path_tmp_unugly'],
                       temp_files['path_unugly'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        # ok = clear.run(temp_files['path_tmp_unugly'],
        #                temp_files['path_tmp_unugly_1'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Beautifier fail')
        #
        # (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp_unugly_1'],
        #                                         temp_files['path_tmp_unugly_2'])
        # if not ok:
        #     cleanup(temp_files)
        #     print js_file_path, _err
        #     return (js_file_path, None, 'JSNice Beautifier fail')
        #
        # ok = clear.run(temp_files['path_tmp_unugly_2'],
        #                temp_files['path_unugly'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Beautifier fail')

        try:
            lexer = Lexer(temp_files['path_unugly'])
            iBuilder = IndexBuilder(lexer.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')

        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                temp_files['path_unugly']))
            nameOrigin = scopeAnalyst.nameOrigin
            isGlobal = scopeAnalyst.isGlobal

            for (name, def_scope) in nameOrigin.iterkeys():

                pos = scopeAnalyst.nameDefScope2pos[(name, def_scope)]

                (lin, col) = iBuilder.revFlatMat[pos]
                (tok_lin, tok_col) = iBuilder.revTokMap[(lin, col)]

                candidates.append(('Nice2Predict', def_scope,
                                   tok_lin, tok_col,
                                   isGlobal.get((name, pos), True),
                                   name, '', ''))
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'ScopeAnalyst fail')

        # # Run the JSNice from http://www.jsnice.org
        # jsNice = JSNice()
        # (ok, _out, _err) = jsNice.run(temp_files['path_tmp_u_a'],
        #                               temp_files['path_tmp_jsnice'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'JSNice fail')
        #
        # ok = clear.run(temp_files['path_tmp_jsnice'],
        #                temp_files['path_jsnice'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'Beautifier fail')
        #
        # try:
        #     lexer = Lexer(temp_files['path_jsnice'])
        #     iBuilder = IndexBuilder(lexer.tokenList)
        # except:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'IndexBuilder fail')
        #
        # try:
        #     scopeAnalyst = ScopeAnalyst(os.path.join(
        #         os.path.dirname(os.path.realpath(__file__)),
        #         temp_files['path_jsnice']))
        #     nameOrigin = scopeAnalyst.nameOrigin
        #     isGlobal = scopeAnalyst.isGlobal
        #
        #     for (name, def_scope) in nameOrigin.iterkeys():
        #
        #         pos = scopeAnalyst.nameDefScope2pos[(name, def_scope)]
        #         (lin, col) = iBuilder.revFlatMat[pos]
        #         (tok_lin, tok_col) = iBuilder.revTokMap[(lin, col)]
        #
        #         candidates.append(('JSNice', def_scope,
        #                            tok_lin, tok_col,
        #                            isGlobal.get((name, pos), True),
        #                            name, '', ''))
        # except:
        #     cleanup(temp_files)
        #     return (js_file_path, None, 'ScopeAnalyst fail')

        # Compute scoping: name2scope is a dictionary where keys
        # are (name, start_index) tuples and values are scope identifiers.
        # Note: start_index is a flat (unidimensional) index,
        # not a (line_chr_idx, col_chr_idx) index.
        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                temp_files['path_tmp_u_a']))
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'ScopeAnalyst fail')

        # Baseline translation: no renaming, no scoping
        no_renaming = []
        for _line_idx, line in enumerate(iBuilder_ugly.tokens):
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

        with open(temp_files['f2'], 'w') as f_no_renaming:
            f_no_renaming.writelines(no_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path,
                                                   'train.no_renaming',
                                                   'tuning', 'moses.ini'))
        (_moses_ok,
         translation_no_renaming,
         _err) = moses.run(temp_files['f2'])

        # translation, iBuilder, lm_path,
        # f_path, output_path, base_name
        nc = processTranslationUnscoped(translation_no_renaming,
                                        iBuilder_ugly,
                                        lm_path,
                                        temp_files['f2'],
                                        output_path,
                                        base_name)
        if nc:
            candidates += nc

        # Default translation: No renaming
        # no_renaming = []
        # for _line_idx, line in enumerate(iBuilder_ugly.tokens):
        #     no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")
        #
        # with open(temp_files['f2'], 'w') as f_no_renaming:
        #     f_no_renaming.writelines(no_renaming)
        #
        # moses = MosesDecoder(ini_path=os.path.join(ini_path,
        #                                            'train.no_renaming',
        #                                            'tuning', 'moses.ini'))
        # (_moses_ok, translation, _err) = moses.run(temp_files['f2'])

        nc = processTranslationScoped(translation_no_renaming,
                                      iBuilder_ugly,
                                      scopeAnalyst,
                                      lm_path,
                                      temp_files['f2'],
                                      output_path,
                                      base_name)
        if nc:
            candidates += nc

        # More complicated renaming: collect the context around
        # each name (global variables, API calls, punctuation)
        # and build a hash of the concatenation.
        hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst,
                                                       iBuilder_ugly,
                                                       twoLines=False,
                                                       debug=False)
        with open(temp_files['f5'], 'w') as f_hash_def_one_renaming:
            f_hash_def_one_renaming.writelines(hash_def_one_renaming)

        # moses = MosesDecoder(ini_path=os.path.join(ini_path,
        #                                            'train.hash_def_one_renaming',
        #                                            'tuning', 'moses.ini'))
        # (_moses_ok,
        #  translation_hash_renaming,
        #  _err) = moses.run(temp_files['f5'])

        mosesParams = {}
        mosesParams["text"] = hash_def_one_renaming  # lex_ugly.collapsedText
        # mosesParams["align"] = "true"
        # mosesParams["report-all-factors"] = "true"

        mresults = proxy.translate(mosesParams)
        # __request("translate", mosesParams)
        rawText = Postprocessor(mresults["nbest"])
        translation_hash_renaming = rawText.getProcessedOutput()

        nc = processTranslationScoped(translation_hash_renaming,
                                      iBuilder_ugly,
                                      scopeAnalyst,
                                      lm_path,
                                      temp_files['f5'],
                                      output_path,
                                      base_name)
        if nc:
            candidates += nc

        # nc = processTranslationScopedFallback(translation_hash_renaming,
        #                                       translation_no_renaming,
        #                                       iBuilder_ugly,
        #                                       scopeAnalyst,
        #                                       lm_path,
        #                                       temp_files['f7'],
        #                                       output_path,
        #                                       base_name)
        # if nc:
        #     candidates += nc

        cleanup(temp_files)
        cleanupRenamed(pid)
        return (js_file_path, 'OK', candidates)

    except Exception as e:
        cleanup(temp_files)
        cleanupRenamed(pid)
        return (js_file_path, None, str(e).replace("\n", ""))
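# The module-level `proxy` used above is an XML-RPC client for a Moses
# server exposing a `translate` method; a sketch mirroring the endpoint
# used in deobfuscateJS below (whether this script points at the same host
# is an assumption):
import xmlrpclib

proxy = xmlrpclib.ServerProxy("http://godeep.cs.ucdavis.edu:8080/RPC2")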
def processFile(l):

    def localCleanup(output_path, base_names):
        for base_name in base_names:
            tryRemove(os.path.join(output_path, base_name))

    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]
    pid = int(multiprocessing.current_process().ident)

    candidates = []

    try:
        # if True:
        # Temp files to be created during processing
        path_tmp = 'tmp_%d.js' % pid
        path_tmp_b = 'tmp_%d.b.js' % pid
        path_tmp_b_a = 'tmp_%d.b.a.js' % pid
        path_tmp_u = 'tmp_%d.u.js' % pid
        path_tmp_u_a = 'tmp_%d.u.a.js' % pid
        path_tmp_unugly = 'tmp_%d.n2p.js' % pid
        path_tmp_jsnice = 'tmp_%d.jsnice.js' % pid

        f2 = 'tmp_%d.no_renaming.js' % pid
        f3 = 'tmp_%d.basic_renaming.js' % pid
        f4 = 'tmp_%d.hash_renaming.js' % pid
        f5 = 'tmp_%d.hash_def_one_renaming.js' % pid
        f6 = 'tmp_%d.hash_def_two_renaming.js' % pid

        path_orig = '%s.js' % base_name
        path_ugly = '%s.u.js' % base_name
        path_unugly = '%s.n2p.js' % base_name
        path_jsnice = '%s.jsnice.js' % base_name

        # Strip comments, replace literals, etc.
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(path_tmp)
        except:
            cleanup(pid)
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        ok = clear.run(path_tmp, path_tmp_b + '.tmp1')
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Beautifier 1 fail')

        jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
        (ok, _out, _err) = jsNiceBeautifier.run(path_tmp_b + '.tmp1',
                                                path_tmp_b + '.tmp2')
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'JSNice Beautifier 1 fail')

        ok = clear.run(path_tmp_b + '.tmp2', path_tmp_b)
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Beautifier 1 fail')

        # Minify
        ugly = Uglifier()
        ok = ugly.run(path_tmp_b, path_tmp_u)
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Uglifier fail')

        # Num tokens before vs after
        try:
            tok_clear = Lexer(path_tmp_b).tokenList
            tok_ugly = Lexer(path_tmp_u).tokenList
        except:
            cleanup(pid)
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(pid)
            return (js_file_path, None, 'Num tokens mismatch')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(path_tmp_b, path_tmp_u)
        except:
            cleanup(pid)
            return (js_file_path, None, 'Aligner fail')

        try:
            # iBuilder_clear = IndexBuilder(Lexer(path_tmp_b_a).tokenList)
            iBuilder_ugly = IndexBuilder(Lexer(path_tmp_u_a).tokenList)
        except:
            cleanup(pid)
            return (js_file_path, None, 'IndexBuilder fail')

        # Store original and uglified versions
        ok = clear.run(path_tmp_u_a, os.path.join(output_path, path_ugly))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly])
            return (js_file_path, None, 'Beautifier 2 fail')

        ok = clear.run(path_tmp_b_a, os.path.join(output_path, path_orig))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig])
            return (js_file_path, None, 'Beautifier 3 fail')

        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, _out, _err) = unuglifyJS.run(path_tmp_b_a, path_tmp_unugly)
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig])
            return (js_file_path, None, 'Nice2Predict fail')

        ok = clear.run(path_tmp_unugly, path_tmp_unugly + '.tmp1')
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'Beautifier 4 fail')

        (ok, _out, _err) = jsNiceBeautifier.run(path_tmp_unugly + '.tmp1',
                                                path_tmp_unugly + '.tmp2')
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'JSNice Beautifier 2 fail')

        ok = clear.run(path_tmp_unugly + '.tmp2',
                       os.path.join(output_path, path_unugly))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'Beautifier 4 fail')

        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                path_tmp_unugly))
            nameOrigin = scopeAnalyst.nameOrigin
            for (name, def_scope) in nameOrigin.iterkeys():
                candidates.append(('Nice2Predict', def_scope,
                                   name, '', ''))
        except:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'ScopeAnalyst fail')

        # Run the JSNice from http://www.jsnice.org
        jsNice = JSNice()
        (ok, _out, _err) = jsNice.run(path_tmp_b_a, path_tmp_jsnice)
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'JSNice fail')

        ok = clear.run(path_tmp_jsnice,
                       os.path.join(output_path, path_jsnice))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig,
                                       path_unugly, path_jsnice])
            return (js_file_path, None, 'Beautifier 5 fail')

        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                path_tmp_jsnice))
            nameOrigin = scopeAnalyst.nameOrigin
            for (name, def_scope) in nameOrigin.iterkeys():
                candidates.append(('JSNice', def_scope, name, '', ''))
        except:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig,
                                       path_unugly, path_jsnice])
            return (js_file_path, None, 'ScopeAnalyst fail')

        # Compute scoping: name2scope is a dictionary where keys
        # are (name, start_index) tuples and values are scope identifiers.
        # Note: start_index is a flat (unidimensional) index,
        # not a (line_chr_idx, col_chr_idx) index.
        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                path_tmp_u_a))
            _name2defScope = scopeAnalyst.resolve_scope()
            _isGlobal = scopeAnalyst.isGlobal
            _name2useScope = scopeAnalyst.resolve_use_scope()
        except:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig,
                                       path_unugly, path_jsnice])
            return (js_file_path, None, 'ScopeAnalyst fail')

        # Baseline translation: no renaming
        no_renaming = []
        for _line_idx, line in enumerate(iBuilder_ugly.tokens):
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

        with open(f2, 'w') as f_no_renaming:
            f_no_renaming.writelines(no_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path,
                                                   'train.no_renaming',
                                                   'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f2)

        nc = processTranslation(translation, iBuilder_ugly,
                                scopeAnalyst, lm_path, f2,
                                output_path, base_name, clear)
        if nc:
            candidates += nc

        # Simple renaming: disambiguate overloaded names using scope id
        basic_renaming = renameUsingScopeId(scopeAnalyst, iBuilder_ugly)
        with open(f3, 'w') as f_basic_renaming:
            f_basic_renaming.writelines(basic_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path,
                                                   'train.basic_renaming',
                                                   'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f3)

        nc = processTranslation(translation, iBuilder_ugly,
                                scopeAnalyst, lm_path, f3,
                                output_path, base_name, clear)
        if nc:
            candidates += nc

        # More complicated renaming: collect the context around
        # each name (global variables, API calls, punctuation)
        # and build a hash of the concatenation.
        hash_renaming = renameUsingHashAllPrec(scopeAnalyst,
                                               iBuilder_ugly,
                                               debug=False)
        # print hash_renaming
        with open(f4, 'w') as f_hash_renaming:
            f_hash_renaming.writelines(hash_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path,
                                                   'train.hash_renaming',
                                                   'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f4)

        nc = processTranslation(translation, iBuilder_ugly,
                                scopeAnalyst, lm_path, f4,
                                output_path, base_name, clear)
        if nc:
            candidates += nc

        hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst,
                                                       iBuilder_ugly,
                                                       twoLines=False,
                                                       debug=False)
        with open(f5, 'w') as f_hash_def_one_renaming:
            f_hash_def_one_renaming.writelines(hash_def_one_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path,
                                                   'train.hash_def_one_renaming',
                                                   'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f5)

        nc = processTranslation(translation, iBuilder_ugly,
                                scopeAnalyst, lm_path, f5,
                                output_path, base_name, clear)
        if nc:
            candidates += nc

        hash_def_two_renaming = renameUsingHashDefLine(scopeAnalyst,
                                                       iBuilder_ugly,
                                                       twoLines=True,
                                                       debug=False)
        with open(f6, 'w') as f_hash_def_two_renaming:
            f_hash_def_two_renaming.writelines(hash_def_two_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path,
                                                   'train.hash_def_two_renaming',
                                                   'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f6)

        nc = processTranslation(translation, iBuilder_ugly,
                                scopeAnalyst, lm_path, f6,
                                output_path, base_name, clear)
        if nc:
            candidates += nc

        cleanup(pid)
        cleanupRenamed(pid)
        return (js_file_path, 'OK', candidates)

    except Exception as e:
        cleanup(pid)
        cleanupRenamed(pid)
        return (js_file_path, None, str(e).replace("\n", ""))
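# MosesDecoder above wraps the moses binary and returns an (ok, output, err)
# triple; a minimal subprocess-based sketch (the binary name on PATH and the
# lack of extra flags are assumptions, not the project's confirmed wrapper):
import subprocess


class MosesDecoder(object):

    def __init__(self, ini_path):
        self.ini_path = ini_path

    def run(self, input_path):
        # moses reads tokenized sentences on stdin, one per line, and is
        # configured via -f <moses.ini>
        with open(input_path) as fin:
            proc = subprocess.Popen(['moses', '-f', self.ini_path],
                                    stdin=fin,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            out, err = proc.communicate()
        return (proc.returncode == 0, out, err)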
def deobfuscateJS(self, obfuscatedCode, transactionID):
    proxy = xmlrpclib.ServerProxy("http://godeep.cs.ucdavis.edu:8080/RPC2")

    mosesParams = {}
    candidates = []

    baseDir = "/home/ccasal/temp/"
    tempFile = baseDir + str(transactionID) + "_temp.js"
    lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.970k/js.blm.lm"

    preproFile = baseDir + str(transactionID) + "_prepro.js"
    beautFile = baseDir + str(transactionID) + "_beaut.js"

    # Strip comments, replace literals, etc.
    try:
        prepro = WebPreprocessor(obfuscatedCode)
        prepro.write_temp_file(preproFile)
    except:
        cleanup([preproFile])
        print("Preprocessor failed")
        return "Preprocessor Failed"

    clear = Beautifier()
    # TODO: Need a text version of the beautifier to avoid the file read
    # and write, e.g. (ok, beautText, err) = clear.webRun(preproText)
    ok = clear.run(preproFile, beautFile)
    print(ok)
    if not ok:
        cleanup([preproFile, beautFile])
        return "Beautifier Failed"

    try:
        lex_ugly = Lexer(beautFile)
        iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
    except:
        cleanup([preproFile, beautFile])
        print("IndexBuilder fail")
        return "IndexBuilder Failed"

    # Do scope-related tasks on a raw text version
    lex_ugly.write_temp_file(tempFile)
    try:
        scopeAnalyst = ScopeAnalyst(tempFile)
    except:
        cleanup([preproFile, beautFile, tempFile])
        print("ScopeAnalyst fail")
        return "ScopeAnalyst Failed"

    # Do rename-related tasks.
    # For no_renaming nothing needs to happen here:
    # no_renaming = []
    # for _line_idx, line in enumerate(iBuilder_ugly.tokens):
    #     no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")
    #
    # hash_def_one_renaming:
    # beautText = renameUsingHashDefLine(scopeAnalyst,
    #                                    iBuilder_ugly,
    #                                    twoLines=False,
    #                                    debug=False)

    print(lex_ugly.collapsedText)
    mosesParams["text"] = lex_ugly.collapsedText
    mosesParams["align"] = "true"
    mosesParams["report-all-factors"] = "true"

    results = proxy.translate(mosesParams)
    # __request("translate", mosesParams)
    rawText = Postprocessor(results["nbest"])
    translation = rawText.getProcessedOutput()

    # Send to output
    cleanup([preproFile, beautFile, tempFile])
    return translation
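# Hypothetical invocation of the endpoint above, e.g. from a web-service
# shim (the class name and sample snippet are assumptions):
# service = DeobfuscationService()
# print(service.deobfuscateJS("var a=function(e){return e+1;};", 42))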
def writeTmpLines(lines, tmp_path):
    # Reconstructed from the surrounding calls: join each token line with
    # spaces and write the result as UTF-8
    js_tmp = open(tmp_path, 'w')
    js_tmp.write('\n'.join([' '.join([t for (_tt, t) in line])
                            for line in lines]).encode('utf8'))
    js_tmp.write('\n')
    js_tmp.close()


input_file = os.path.abspath(sys.argv[1])
output_file = os.path.abspath(sys.argv[2])
mode = int(sys.argv[3])

prepro = Preprocessor(input_file)
prepro.write_temp_file('tmp.js')

clear = Beautifier()
ok = clear.run('tmp.js', 'tmp.b.js')

lexer = Lexer('tmp.b.js')
iBuilder = IndexBuilder(lexer.tokenList)

scopeAnalyst = ScopeAnalyst(os.path.join(
    os.path.dirname(os.path.realpath(__file__)), 'tmp.b.js'))

hash_renaming = renameUsingHashDefLine(scopeAnalyst,
                                       iBuilder,
                                       twoLines=False,
                                       debug=mode)

with open(output_file, 'w') as f:
    f.writelines(hash_renaming)
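# Example command line for this script (the script file name is
# hypothetical); mode toggles renameUsingHashDefLine's debug output:
#   python rename_file.py input.js output.hash.js 0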