Example #1
    def testScopeAnalyst(self):
        '''
        TODO: Check that the scope analyst works properly
        '''
        #__main__.py in tools is a useful tool for examining these.
        #print(self.obsfuscatedTextFiles[0])
        #This doesn't work when run inside pyDev for some weird reason.
        sa1 = ScopeAnalyst(self.obsfuscatedTextFiles[0])
        #print(sa1)
        #Not really sure how to test this effectively.

        #Check (using the minified file) whether an identifier name maps to different
        #variables when they are in different scopes.  Can look at __main__.py
        #Variables: geom2d,t,i,r,x,y,n,e,o,u
        #Why do x and y not appear in the variables? (Is it because they are not
        #defined anywhere in this snippet?)
        self.assertEqual(len(sa1.nameScopes[u'geom2d']), 1)
        self.assertEqual(len(sa1.nameScopes[u'numeric']), 1)
        self.assertEqual(len(sa1.nameScopes[u't']), 3)
        self.assertEqual(len(sa1.nameScopes[u'i']), 1)
        self.assertEqual(len(sa1.nameScopes[u'r']), 4)
        self.assertEqual(len(sa1.nameScopes[u'n']), 4)
        #self.assertEqual(len(sa1.nameScopes[u'x']), 2)
        #self.assertEqual(len(sa1.nameScopes[u'y']), 2)
        self.assertEqual(len(sa1.nameScopes[u'u']), 1)
        self.assertEqual(len(sa1.nameScopes[u'e']), 1)
        self.assertEqual(len(sa1.nameScopes[u'o']), 1)

        #isGlobal:
        #print("IsGlobal-----------------------------------------------")
        #print(sa1.isGlobal)
        #print("IsGlobal-----------------------------------------------")
        self.assertTrue(sa1.isGlobal[(u'geom2d', 4)])
        self.assertFalse(sa1.isGlobal[(u'i', 85)])
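
A minimal sketch of one way to inspect nameScopes when writing assertions like the ones above, assuming it maps each identifier name to a collection of scope identifiers (the helper name is illustrative):

def dumpNameScopes(sa):
    #Print how many distinct scopes each identifier binds to,
    #mirroring the len(...) checks in the test above.
    for name, scopes in sorted(sa.nameScopes.items()):
        print(name + " -> " + str(len(scopes)) + " scope(s)")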
Example #2
def processFile(js_file_name):

    candidates = []

    lexer = Lexer(js_file_name)
    iBuilder = IndexBuilder(lexer.tokenList)

    scopeAnalyst = ScopeAnalyst(js_file_name)
    nameOrigin = scopeAnalyst.nameOrigin
    isGlobal = scopeAnalyst.isGlobal
    nameDefScope2pos = scopeAnalyst.nameDefScope2pos

    for (name, def_scope) in nameOrigin.iterkeys():
        pos = nameDefScope2pos[(name, def_scope)]

        (lin, col) = iBuilder.revFlatMat[pos]
        scope = iBuilder.revTokMap[(lin, col)]

        glb = isGlobal.get((name, pos), True)


        if name != 'TOKEN_LITERAL_STRING' and \
                name != 'TOKEN_LITERAL_NUMBER':
            candidates.append((scope, name, pos, (lin, col), glb, def_scope))

    print
    print
    for c in sorted(candidates, key=lambda e: e[0]):
        (scope, name, pos, (lin, col), glb, def_scope) = c

        if name == 'n' or name == 'calendarEventId':
            print '\t', scope, name, pos, (lin, col), glb
            print '\t\t', def_scope
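
The pos -> (lin, col) -> token-index chain above is how IndexBuilder is used throughout these examples; a small sketch of that round trip under the same attribute names (semantics assumed, helper name made up):

def flatPosToTokenIndex(iBuilder, pos):
    #A flat character offset maps back to a (line, col) character position,
    #which in turn maps to an entry in the token list.
    (lin, col) = iBuilder.revFlatMat[pos]
    return iBuilder.revTokMap[(lin, col)]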
Example #3
    def testFiles(self):
        tf = [1, 5, 6, 7, 8, 9, 10, 11]
        #tf = [11]

        for i in tf:
            print("-----------------------------------------------------")
            lexed = Lexer(self.fileList[i - 1])
            ib = IndexBuilder(lexed.tokenList)
            #print(ib)
            sa = ScopeAnalyst(self.fileList[i - 1])
            print(sa)
            nameCount = {}
            #TODO: Grab only the non-globals to look at (get the start key and look it up)
            for variable in sa.nameDefScope2pos.keys():
                start = sa.nameDefScope2pos[variable]
                name = variable[0]
                if (not sa.isGlobal[(name, start)]):
                    if (name in nameCount):
                        nameCount[name] += 1
                    else:
                        nameCount[name] = 1
                    print(
                        str(name) + " : " +
                        str(sa.nameDefScope2pos[variable]) + " -> " +
                        str(ib.revFlatMat[sa.nameDefScope2pos[variable]]) +
                        " Manual: " + str(self.file_definitions[i][name]))
                    assert (ib.revFlatMat[sa.nameDefScope2pos[variable]][0]
                            in self.file_definitions[i][name])

            #Finally make sure that the count of definitions matches our manual check.
            for name, count in nameCount.iteritems():
                print(name + " : " + str(count) + " =?= " +
                      str(len(self.file_definitions[i][name])))
                assert (len(self.file_definitions[i][name]) == count)
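
The tallying above can be factored into a helper; a hedged sketch, assuming (as in the test) that nameDefScope2pos maps (name, def_scope) keys to definition positions:

from collections import Counter

def countLocalDefs(sa):
    #Count non-global definitions per name, as the test does by hand.
    counts = Counter()
    for (name, def_scope), start in sa.nameDefScope2pos.items():
        if not sa.isGlobal[(name, start)]:
            counts[name] += 1
    return counts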
Example #4
        def load(pth):
            lexer = Lexer(pth)
            iBuilder = IndexBuilder(lexer.tokenList)

            scopeAnalyst = ScopeAnalyst(
                os.path.join(os.path.dirname(os.path.realpath(__file__)), pth))

            return (iBuilder, scopeAnalyst)
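
Hypothetical usage of the loader above (the file name is illustrative):

iBuilder, scopeAnalyst = load('test_file1.js')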
Example #5
    def testMinifiableLines(self):
        expected = {}
        expected[0] = set([1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 15, 16, 17, 20])
        expected[5] = set([8, 9])

        for i in [0, 5]:
            ib = IndexBuilder(self.clearLexed[i].tokenList)
            sa = ScopeAnalyst(self.clearTextFiles[i])

            lines = sa.getMinifiableLines(ib)
            print("i:" + str(i))
            print(lines)
            print(expected[i])
            self.assertEqual(lines, expected[i])
            text = ib.get_text_on_lines_wo_literals(lines)
            print(text)
            print(len(text.split("\n")))
            print(len(expected[i]))
            self.assertEqual(len(text.split("\n")), len(expected[i]))
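
The invariant this test exercises, restated as a sketch (same two APIs; the helper name is made up):

def checkMinifiableExtraction(sa, ib):
    #Every minifiable line reported by ScopeAnalyst should come back as
    #exactly one line of literal-stripped text.
    lines = sa.getMinifiableLines(ib)
    text = ib.get_text_on_lines_wo_literals(lines)
    assert len(text.split("\n")) == len(lines)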
Example #6
 def testFiles(self):
     #Known bugs:  The definitions of sum and numberEquals in test_file1 seem to be pointing to the wrong instance...
     i = 1
     lexed = Lexer(self.fileList[0])
     ib = IndexBuilder(lexed.tokenList)
     sa = ScopeAnalyst(self.fileList[0])
     for variable in sa.nameDefScope2pos.keys():
         print(
             str(variable[0]) + " : " + str(sa.nameDefScope2pos[variable]) +
             " -> " + str(ib.revFlatMat[sa.nameDefScope2pos[variable]]))
Example #7
def summarizeUnscopedTranslation(renaming_map, f_path, translation_strategy,
                                 output_path, base_name, name_candidates,
                                 name_positions, iBuilder):

    nc = []

    f_base = os.path.basename(f_path)
    training_strategy = f_base.split('.')[1]
    tmp_path = '%s.%s.js' % (f_base[:-3], translation_strategy)
    o_path = '%s.%s.unscoped.%s.js' % (base_name, training_strategy,
                                       translation_strategy)

    #     print f_path, f_base, training_strategy, tmp_path, o_path, base_name

    writeTmpLines(renameHashed(iBuilder, name_positions, renaming_map),
                  tmp_path)

    clear = Beautifier()
    ok = clear.run(tmp_path, os.path.join(output_path, o_path))
    if not ok:
        return False

    try:
        lexer = Lexer(os.path.join(output_path, o_path))
        iBuilder_local = IndexBuilder(lexer.tokenList)

        scopeAnalyst_local = ScopeAnalyst(os.path.join(output_path, o_path))
    except:
        return False

    nameOrigin = scopeAnalyst_local.nameOrigin
    isGlobal = scopeAnalyst_local.isGlobal

    for (name, def_scope) in nameOrigin.iterkeys():

        pos = scopeAnalyst_local.nameDefScope2pos[(name, def_scope)]

        # The isGlobal filter is deliberately disabled here; keep every name.
        if True:  # original condition: not isGlobal.get((name, pos), True)
            (lin, col) = iBuilder_local.revFlatMat[pos]
            (tok_lin, tok_col) = iBuilder_local.revTokMap[(lin, col)]

            nc.append(
                ('%s.unscoped.%s' % (training_strategy, translation_strategy),
                 def_scope, tok_lin, tok_col, isGlobal.get(
                     (name, pos), True), name, '', ''))

    return nc
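
For reference, the shape of each candidate tuple returned above, inferred from the append call (field names are descriptive, values illustrative):

# (strategy_tag,              def_scope,    tok_lin, tok_col, is_global, name, '', '')
# ('no_renaming.unscoped.lm', '<scope id>', 3,       0,       False,     'n',  '', '')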
Example #8
def processFile(l):

    js_file_name = l

    candidates = []

    try:
        lexer = Lexer(os.path.join(results_path, js_file_name))
        iBuilder = IndexBuilder(lexer.tokenList)

        scopeAnalyst = ScopeAnalyst(os.path.join(results_path, js_file_name))
        nameOrigin = scopeAnalyst.nameOrigin
        isGlobal = scopeAnalyst.isGlobal
        nameDefScope2pos = scopeAnalyst.nameDefScope2pos

        for (name, def_scope) in nameOrigin.iterkeys():
            pos = nameDefScope2pos[(name, def_scope)]

            (lin, col) = iBuilder.revFlatMat[pos]
            scope = iBuilder.revTokMap[(lin, col)]

            glb = isGlobal.get((name, pos), True)

            #             print name, def_scope, pos, scope, glb #, (lin,col)

            #             if not isGlobal.get((name, pos), True):
            #                 scope = def_scope.replace("\"","")
            #                 i = scope.find('[variables][_values]')
            #                 if i > -1:
            #                     scope = scope[:i+len('[variables][_values]')]
            #                 i = scope.find('[functions][_values]')
            #                 if i > -1:
            #                     scope = scope[:i+len('[functions][_values]')]

            if name != 'TOKEN_LITERAL_STRING' and \
                    name != 'TOKEN_LITERAL_NUMBER':
                candidates.append((scope, name, glb))

    except:
        return (js_file_name, None, 'ScopeAnalyst fail')


#     print 'candidates------------------'
#     for candidate in candidates:
#         print candidate

    return (js_file_name, 'OK', candidates)
Example #9
 def testfileDebug(self):
     for f in self.fileList:
         print("---------------------------------- " + f +
               " ----------------------------------")
         orig = f + ".js"
         min = f + ".u.js"
         lo = Lexer(orig)
         lm = Lexer(min)
         print(
             "---------------------------------- original text ----------------------------------"
         )
         print(lo.programText)
         print(
             "---------------------------------- minified text ----------------------------------"
         )
         print(lm.programText)
         for id in self.ids:
             to_read = f + id + ".js"
             print("---------------------------------- " + to_read +
                   " ----------------------------------")
             lexed = Lexer(to_read)
             print(
                 "---------------------------------- text ----------------------------------"
             )
             print(lexed.programText)
             print(
                 "---------------------------------- tokenlist ----------------------------------"
             )
             print(lexed.tokenList)
             ib = IndexBuilder(lexed.tokenList)
             print(
                 "---------------------------------- IndexBuilder ----------------------------------"
             )
             print(ib)
             sa = ScopeAnalyst(to_read)
             print(
                 "---------------------------------- ScopeAnalyst ----------------------------------"
             )
             print(sa)
Example #10
def processFile(l):
    base_name = l[0]
    js_file_path = l[1]
    print(base_name)
    print(js_file_path)
    #if(True):
    try:
        lexed = Lexer(js_file_path)
        ib = IndexBuilder(lexed.tokenList)
        sa = ScopeAnalyst(js_file_path)
        #num globals = all names marked is_global == True + all unique names
        #in name2CharPositions that never appear in is_global.
        #Note: isGlobal is keyed by (name, position) tuples (see Example #1),
        #so unpack the name component before comparing against plain names.
        base_global = set([
            name for (name, pos), value in sa.isGlobal.iteritems() if value
        ])
        #Get all known names in the file.
        known_names = set([name for (name, pos) in sa.isGlobal.iterkeys()])
        for name, loc in ib.name2CharPositions.iteritems():
            if (name not in known_names):  #if never seen, it's a global
                base_global.add(name)

        return [base_name, len(base_global)]
    except:
        return [base_name, None]
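
The same counting rule as a standalone sketch (assuming, as above, that isGlobal is keyed by (name, position) tuples and name2CharPositions by plain names):

def countGlobals(sa, ib):
    #Names explicitly marked global...
    glb = set(name for (name, pos), is_g in sa.isGlobal.items() if is_g)
    known = set(name for (name, pos) in sa.isGlobal)
    #...plus names the scope analysis never saw at all.
    glb.update(name for name in ib.name2CharPositions if name not in known)
    return len(glb)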
Example #11
    def testFiles(self):
        #TODO: Automated checks against the files.
        #Known bugs:  The definitions of sum and numberEquals in test_file1 seem to be pointing to the wrong instance...
        i = 1
        for nextFile in self.fileList:
            print(nextFile)
            lexed = Lexer(nextFile)
            ib = IndexBuilder(lexed.tokenList)
            sa = ScopeAnalyst(nextFile)
            s_min = ScopeAnalyst(
                os.path.join(self.testDir.path, "test_file1.obs.js"))
            #print(s_min.name2defScope)
            #print("TokenList----------------------------------------------------------------")
            #print(lexed.tokenList)
            #print("Index Builder----------------------------------------------------------------")
            #print(ib)
            #print("Scope Analyst----------------------------------------------------------------")
            #print(sa)
            vm = VariableMetrics(sa, ib, lexed.tokenList)
            #print("VM----------------------------------------------------------------")
            #print(vm)
            #print("VM----------------------------------------------------------------")
            for var in vm.getVariables():
                print(var)
                print(
                    "Num Lines,Max Lines,Global Def,Global Usage,For,While,Literal Def,Literal Usage,Max Length Line,Ave Line Length"
                )
                print(vm.getNameMetrics(var))

            #Automated tests:
            csv_file = os.path.join(self.testDir.path,
                                    "test_file" + str(i) + ".csv")
            print(csv_file)
            if (os.path.exists(csv_file)):
                with open(csv_file, 'r') as f:
                    csv_reader = csv.reader(f, delimiter=",")
                    #Skip header
                    next(csv_reader, None)
                    for row in csv_reader:
                        key = (row[0], row[1])
                        print(key)
                        (num_lines, max_lines, external_def, external_use,
                         in_for, in_while, literal_def, literal_use,
                         max_length_line,
                         ave_line_length) = vm.getNameMetrics(key)
                        self.assertTrue(num_lines == int(row[2]))
                        self.assertTrue(max_lines == int(row[3]))
                        self.assertTrue(external_def == self.asBool(row[4]))
                        self.assertTrue(external_use == int(row[5]))
                        self.assertTrue(in_for == int(row[6]))
                        self.assertTrue(in_while == int(row[7]))
                        self.assertTrue(literal_def == self.asBool(row[8]))
                        self.assertTrue(literal_use == int(row[9]))
                        self.assertTrue(max_length_line == int(row[10]))
                        self.assertAlmostEqual(ave_line_length,
                                               float(row[11]),
                                               places=3)

            else:
                print("no manually annotated csv file for: " + nextFile)

            break
Example #12
def processFile(l):
    
    js_file_path = l[0]
    
    if js_file_path in seen:
        return (js_file_path, None, 'Skipped')
    
    pid = int(multiprocessing.current_process().ident)
    
    # Temp files to be created during processing
    temp_files = {'path_tmp': 'tmp_%d.js' % pid,
                  'path_tmp_b': 'tmp_%d.b.js' % pid,
                  'path_tmp_b_n': 'tmp_%d.b.n.js' % pid,
                  'path_tmp_u': 'tmp_%d.u.js' % pid,
                  'path_tmp_u_n': 'tmp_%d.u.n.js' % pid,
                  'path_tmp_b_a': 'tmp_%d.b.a.js' % pid,
                  'path_tmp_u_a': 'tmp_%d.u.a.js' % pid}
    
    try:        
        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(temp_files['path_tmp'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Preprocessor fail')
        
        
        # Pass through beautifier to fix layout:
        # - once through JSNice without renaming
#         jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
#         
#         (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp'], 
#                                                 temp_files['path_tmp_b_n'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'JSNice Beautifier fail')
        
        
#         # - and another time through uglifyjs pretty print only 
#         clear = Beautifier()
#         ok = clear.run(temp_files['path_tmp_b_n'], 
#                        temp_files['path_tmp_b'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Beautifier fail')
        
#         # JSNice is down! 
        clear = Beautifier()
        ok = clear.run(temp_files['path_tmp'], 
                       temp_files['path_tmp_b_n'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')
        # Normalize
        norm = Normalizer()
        ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)), 
                                 temp_files['path_tmp_b_n']),
                      False, 
                      temp_files['path_tmp_b'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Normalizer fail')
        
        
        
        # Minify
        ugly = Uglifier()
        ok = ugly.run(temp_files['path_tmp_b'], 
                      temp_files['path_tmp_u_n'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Uglifier fail')
        # Normalize
        norm = Normalizer()
        ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)), 
                                 temp_files['path_tmp_u_n']),
                      False, 
                      temp_files['path_tmp_u'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Normalizer fail')
        
        
        
        # Num tokens before vs after
        try:
            tok_clear = Lexer(temp_files['path_tmp_b']).tokenList
            tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Lexer fail')
        
        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(temp_files)
            return (js_file_path, None, 'Num tokens mismatch')
        
        
        # Align minified and clear files, in case the beautifier 
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(temp_files['path_tmp_b'], 
                          temp_files['path_tmp_u'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Aligner fail')
        
        try:
            lex_clear = Lexer(temp_files['path_tmp_b_a'])
            iBuilder_clear = IndexBuilder(lex_clear.tokenList)
            
            lex_ugly = Lexer(temp_files['path_tmp_u_a'])
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')
        
        
        
        # Normalize
        norm = Normalizer()
        ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)), 
                                 temp_files['path_tmp_b']),
                      True, 
                      temp_files['path_tmp_u_n'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Normalizer fail')
        
        try:
            lex_norm = Lexer(temp_files['path_tmp_u_n'])
            iBuilder_norm = IndexBuilder(lex_norm.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')
        
        normalized = []
        for line_idx, line in enumerate(iBuilder_norm.tokens):
            normalized.append(' '.join([t for (_tt,t) in line]) + "\n")
        
        
        
        # Compute scoping: name2scope is a dictionary where keys
        # are (name, start_index) tuples and values are scope identifiers. 
        # Note: start_index is a flat (unidimensional) index, 
        # not a (line_chr_idx, col_chr_idx) index.
        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                                 os.path.dirname(os.path.realpath(__file__)), 
                                 temp_files['path_tmp_u_a']))
#             _name2defScope = scopeAnalyst.resolve_scope()
#             _isGlobal = scopeAnalyst.isGlobal
#             _name2useScope = scopeAnalyst.resolve_use_scope()
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'ScopeAnalyst fail')
        
        orig = []
        no_renaming = []
        
        for line_idx, line in enumerate(iBuilder_ugly.tokens):
            orig.append(' '.join([t for (_tt,t) in \
                                  iBuilder_clear.tokens[line_idx]]) + "\n")
            
            no_renaming.append(' '.join([t for (_tt,t) in line]) + "\n")
            
#         # Simple renaming: disambiguate overloaded names using scope id
        basic_renaming = renameUsingScopeId(scopeAnalyst, 
                                            iBuilder_ugly)
        
        # More complicated renaming: collect the context around  
        # each name (global variables, API calls, punctuation)
        # and build a hash of the concatenation.
#         hash_renaming = renameUsingHashAllPrec(scopeAnalyst, 
#                                                 iBuilder_ugly,
#                                                 debug=True)
        
        hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst, 
                                                   iBuilder_ugly, 
                                                   twoLines=False,
                                                   debug=False)

        hash_def_two_renaming = renameUsingHashDefLine(scopeAnalyst, 
                                                    iBuilder_ugly, 
                                                    twoLines=True,
                                                    debug=False)

        cleanup(temp_files)
        return (js_file_path,
                orig, 
                no_renaming, 
                basic_renaming,
                normalized, 
#                 hash_renaming,
                hash_def_one_renaming,
                hash_def_two_renaming)
        
    except Exception, e:
        cleanup(temp_files)
        return (js_file_path, None, str(e))
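
Condensed, the happy path of the pipeline above is roughly the following (a sketch: error handling, the Normalizer passes, and temp-file cleanup are elided, and the Aligner is assumed to emit the *.b.a.js / *.u.a.js files read afterwards):

def preparePair(js_path, pid):
    tmp = 'tmp_%d.js' % pid
    tmp_b = 'tmp_%d.b.js' % pid
    tmp_u = 'tmp_%d.u.js' % pid
    Preprocessor(js_path).write_temp_file(tmp)  #strip comments, replace literals
    Beautifier().run(tmp, tmp_b)                #fix layout
    Uglifier().run(tmp_b, tmp_u)                #minify
    Aligner().align(tmp_b, tmp_u)               #line-align clear vs. minified
    iBuilder_clear = IndexBuilder(Lexer('tmp_%d.b.a.js' % pid).tokenList)
    iBuilder_ugly = IndexBuilder(Lexer('tmp_%d.u.a.js' % pid).tokenList)
    return iBuilder_clear, iBuilder_ugly, ScopeAnalyst('tmp_%d.u.a.js' % pid)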
Example #13
    def testMosesPerformance(self):
        '''
        Run the deobfuscateJS method on each of our files and record what the
        times were for each into a csv style report.
        '''
        i = 0
        restart_attempt = False
        with open("./testing/PerformanceMetrics" + str(id_start) + ".csv",
                  'w') as output_csv:
            writer = csv.writer(output_csv, delimiter=",")
            writer.writerow([
                "file", "is_parallel", "lines", "minifiable_instances",
                "local_name_count", "jsnice_status", "preprocess_time",
                "prepreprocessor_time", "jsnice_time", "renaming_time",
                "lex_time", "builder_time", "scoper_time", "moses_time_serial",
                "moses_rn_parallel", "postprocessing_time", "total_time"
            ])
            for next_file in self.clearTextFiles:
                print(next_file)
                if (i < id_start):  # Skip until at start ID (used in failure cases)
                    i += 1
                    continue
                #if("220053" not in next_file):
                #    continue
                text = open(next_file, 'r').read()
                lineCount = text.count("\n") + 1
                print(lineCount)
                #if(lineCount > 500): #Bogdan didn't count these correctly? or was counting SLOC?
                #    continue
                for is_parallel in [True, False]:
                    #if(True):
                    try:
                        sa = ScopeAnalyst(next_file)

                        local = [
                            n for n, isG in sa.isGlobal.iteritems()
                            if isG == False
                        ]
                        local_instances = [
                            n for n, def_scope in sa.name2defScope.iteritems()
                            if n in local
                        ]
                        minCount = len(local_instances)
                        uniqueCount = len(local)
                        start = time.time()
                        #result = self.client.deobfuscateJS(text,True,i,True,is_parallel,use_local) #Debug mode
                        result = self.client.deobfuscateJS(
                            text, True, i, False, is_parallel,
                            use_local)  #For timings
                        total_time = time.time() - start
                        if ("Moses server failed" in result[0]):
                            #Skip and wait for the revival script to restart the server?
                            if (not restart_attempt):
                                restart_attempt = True
                                #Wait 10 minutes for restarting script to try to boot up servers again
                                #Only do this once per server crash.
                                time.sleep(10 * 60)
                        else:
                            restart_attempt = False  #Server is working, make sure we reset restarter flag if needed
                    except:
                        minCount = 0
                        uniqueCount = 0
                        total_time = 0  #Avoid a NameError if the very first attempt fails
                        result = [
                            text, "other error.",
                            (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
                        ]

                    #Write output to a separate file.
                    file_id = str(self.getFileId(next_file))
                    output_file = file_id + str(is_parallel) + ".out.js"
                    with open(
                            os.path.join("./testing/performance_output/",
                                         output_file), "w") as f2:
                        f2.write(result[0])
                    #Write js_error + times to csv.
                    writer.writerow([
                        file_id, is_parallel, lineCount, minCount, uniqueCount,
                        result[1]
                    ] + list(result[2]) + [total_time])

                i += 1
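
The two per-file counts logged above (minifiable instances and unique local names), factored out as a sketch; as in the other examples, isGlobal and name2defScope are assumed to share (name, position) keys:

def localCounts(sa):
    local = set(key for key, is_g in sa.isGlobal.items() if not is_g)
    instances = [key for key in sa.name2defScope if key in local]
    return len(instances), len(local)  #(minCount, uniqueCount)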
Example #14
def processFile(js_file_path):

    # Load in the minified file
    minified = open(js_file_path).read()

    # Create lexer
    lexer = get_lexer_for_filename(js_file_path)

    # Tokenize input and compute mappings between the different
    # indices used: (line, col), flat, (l,c) in token list
    indexBuilder = IndexBuilder(lex(minified, lexer))
    tokens = indexBuilder.tokens
    #    print 'RUNNING IndexBuilder:', len(tokens)>0

    # Compute scoping: name2scope is a dictionary where keys
    # are (name, start_index) tuples and values are scope identifiers.
    # Note: start_index is a flat (unidimensional) index,
    # not a (line_chr_idx, col_chr_idx) index.
    scopeAnalyst = ScopeAnalyst(js_file_path)
    name2defScope = scopeAnalyst.resolve_scope()
    isGlobal = scopeAnalyst.isGlobal

    name2useScope = scopeAnalyst.name2useScope
    name2pth = scopeAnalyst.name2pth
    nameOrigin = scopeAnalyst.nameOrigin

    scopes = set(name2useScope.values())

    print
    print '=== FOUND %d SCOPES ===' % len(scopes)
    print

    for scope in scopes:
        print 'USE SCOPE:', scope
        lc_list = [
            indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]
            for (t, pos) in name2useScope.keys()
            if name2useScope[(t, pos)] == scope
        ]
        highlight(tokens, lc_list)
        print

    scopes = set(name2defScope.values())

    print
    print '=== FOUND %d NAME SCOPES ===' % len(scopes)
    print

    for scope in scopes:
        print 'DEF SCOPE:', scope
        lc_list = [
            indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]
            for (t, pos) in name2defScope.keys()
            if name2defScope[(t, pos)] == scope
        ]
        highlight(tokens, lc_list)
        print

    # Discover the path to the source map
    map_path = sourcemap.discover(minified)
    # Read and parse our sourcemap
    if map_path:
        sourcemapIndex = sourcemap.load(open(map_path))

    # Cluster names by scope
    nameScope2Positions = {}

    # Index data by (name,scope)
    for token, l in indexBuilder.name2CharPositions.iteritems():
        for (line, col) in sorted(l, key=lambda (a, b): (a, b)):
            pos = indexBuilder.flatMap[(line, col)]
            if name2defScope.has_key((token, pos)):
                scope = name2defScope[(token, pos)]
                use_scope = name2useScope[(token, pos)]
                pth = name2pth[(token, pos)]

                glb = isGlobal[(token, pos)]

                nameScope2Positions.setdefault((token, scope, glb), [])
                nameScope2Positions[(token, scope, glb)].append((line, col))

#                print token, pos
#                print 'def:', scope
#                print 'use:', use_scope
#                print 'pth:', pth
#                highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]])
#                print

    print
    print

    for (token,scope,glb), positions in sorted(nameScope2Positions.iteritems(), \
                                           key=lambda (x,y):x[0]):

        pos = sorted(positions, key=lambda e: (e[0], e[1]))
        tt = []
        line_tok_idxs = set([])
        for (l, c) in pos:
            (tl, tc) = indexBuilder.revTokMap[(l, c)]
            line_tok_idxs.add(tl)
            p = indexBuilder.flatMap[(l, c)]
            if map_path:
                orig = sourcemapIndex.lookup(line=l, column=c).name
            else:
                orig = token
            print token, scope, (l, c), orig
            tt.append(((tl, tc), p, orig))
#             t.append(orig)

#         if token == 'n':
        print '\nNAME:', token.encode(
            'utf-8'), '( isGlobal =', glb, '; original =', orig, ')'
        #         print scope
        #         highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]])

        for ((tli, tci), p, orig) in tt:
            scope = name2defScope[(token, p)]
            use_scope = name2useScope[(token, p)]
            pth = name2pth[(token, p)]
            origin = nameOrigin[(token, scope)]
#             print token #, p, origin
#             print
#             print 'def:', scope
#             print 'use:', use_scope
#             print 'pth:', pth
#             print

        for tl in sorted(set([tli for ((tli, tci), p, orig) in tt])):
            l = list(tokens[tl])
            for tc in [tci for ((tli, tci), p, orig) in tt if tli == tl]:
                l[tc] = (l[tc][0], unichr(0x2588) + token + unichr(0x2588))


#                 pos = indexBuilder.flatMap[(line,col)]

            print '  ', '%d:' % (tl + 1), ' '.join(
                [x[1].encode('utf-8') for x in l])

        print

    return
Example #15
def processFile(l):

    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    pid = int(multiprocessing.current_process().ident)

    temp_files = {
        'path_tmp': 'tmp_%d.js' % pid,
        'path_tmp_b': 'tmp_%d.b.js' % pid,
        'path_tmp_b_1': 'tmp_%d.b.1.js' % pid,
        'path_tmp_b_2': 'tmp_%d.b.2.js' % pid,
        'path_tmp_b_a': 'tmp_%d.b.a.js' % pid,
        'path_tmp_u': 'tmp_%d.u.js' % pid,
        'path_tmp_u_a': 'tmp_%d.u.a.js' % pid,
        'path_tmp_unugly': 'tmp_%d.n2p.js' % pid,
        'path_tmp_unugly_1': 'tmp_%d.n2p.1.js' % pid,
        'path_tmp_unugly_2': 'tmp_%d.n2p.2.js' % pid,
        'path_tmp_jsnice': 'tmp_%d.jsnice.js' % pid,
        'f2': 'tmp_%d.no_renaming.js' % pid,
        #                   'f3': 'tmp_%d.basic_renaming.js' % pid,
        #                   'f4': 'tmp_%d.hash_renaming.js' % pid,
        'f5': 'tmp_%d.hash_def_one_renaming.js' % pid,
        #                   'f6': 'tmp_%d.hash_def_two_renaming.js' % pid,
        'f7': 'tmp_%d.hash_def_one_renaming_fb.js' % pid,
        'path_orig': os.path.join(output_path, '%s.js' % base_name),
        'path_ugly': os.path.join(output_path, '%s.u.js' % base_name),
        'path_unugly': os.path.join(output_path, '%s.n2p.js' % base_name),
        'path_jsnice': os.path.join(output_path, '%s.jsnice.js' % base_name)
    }

    #     for strategy in ['js', 'lm.js', 'len.js', 'freqlen.js']:
    #         for renaming in ['no_renaming', 'hash_def_one_renaming']:
    #             temp_files['path_tmp_%s_%s' % (renaming, strategy)] = \
    #                     'tmp_%d.%s.%s' % (pid, renaming, strategy)

    candidates = []

    #     if True:
    try:

        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(temp_files['path_tmp'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        ok = clear.run(temp_files['path_tmp'], temp_files['path_tmp_b'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

#         # Pass through beautifier to fix layout
#         clear = Beautifier()
#         ok = clear.run(temp_files['path_tmp'],
#                        temp_files['path_tmp_b_1'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Beautifier fail')
#
#         jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
#
#         (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp_b_1'],
#                                                 temp_files['path_tmp_b_2'])
#         if not ok:
#             cleanup(temp_files)
#             print js_file_path, _err
#             return (js_file_path, None, 'JSNice Beautifier fail')
#
#         ok = clear.run(temp_files['path_tmp_b_2'],
#                        temp_files['path_tmp_b'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Beautifier fail')
#
#
#         # Weird JSNice renamings despite --no-rename
#         try:
#             before = set([token for (token, token_type) in
#                           Lexer(temp_files['path_tmp_b_1']).tokenList
#                           if is_token_subtype(token_type, Token.Name)])
#             after = set([token for (token, token_type) in
#                           Lexer(temp_files['path_tmp_b']).tokenList
#                           if is_token_subtype(token_type, Token.Name)])
#
#             if not before == after:
#                 return (js_file_path, None, 'Weird JSNice renaming')
#
#         except:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Lexer fail')

        # Minify
        ugly = Uglifier()
        ok = ugly.run(temp_files['path_tmp_b'], temp_files['path_tmp_u'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Uglifier fail')

        # Num tokens before vs after
        try:
            tok_clear = Lexer(temp_files['path_tmp_b']).tokenList
            tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(temp_files)
            return (js_file_path, None, 'Num tokens mismatch')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(temp_files['path_tmp_b'], temp_files['path_tmp_u'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Aligner fail')



        if open(temp_files['path_tmp_b']).read() == \
                open(temp_files['path_tmp_u']).read():
            cleanup(temp_files)
            return (js_file_path, None, 'Not minified')

        try:
            lex_ugly = Lexer(temp_files['path_tmp_u_a'])
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')

        ############################################################
        # From now on only work with path_tmp_b_a and path_tmp_u_a
        ############################################################

        # Store original and uglified versions
        ok = clear.run(temp_files['path_tmp_b_a'], temp_files['path_orig'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        ok = clear.run(temp_files['path_tmp_u_a'], temp_files['path_ugly'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, _out, _err) = unuglifyJS.run(temp_files['path_tmp_u_a'],
                                          temp_files['path_tmp_unugly'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Nice2Predict fail')

        ok = clear.run(temp_files['path_tmp_unugly'],
                       temp_files['path_unugly'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

#         ok = clear.run(temp_files['path_tmp_unugly'],
#                        temp_files['path_tmp_unugly_1'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Beautifier fail')
#
#         (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp_unugly_1'],
#                                                 temp_files['path_tmp_unugly_2'])
#         if not ok:
#             cleanup(temp_files)
#             print js_file_path, _err
#             return (js_file_path, None, 'JSNice Beautifier fail')
#
#         ok = clear.run(temp_files['path_tmp_unugly_2'],
#                        temp_files['path_unugly'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Beautifier fail')

        try:
            lexer = Lexer(temp_files['path_unugly'])
            iBuilder = IndexBuilder(lexer.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')

        try:
            scopeAnalyst = ScopeAnalyst(
                os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             temp_files['path_unugly']))
            nameOrigin = scopeAnalyst.nameOrigin
            isGlobal = scopeAnalyst.isGlobal

            for (name, def_scope) in nameOrigin.iterkeys():

                pos = scopeAnalyst.nameDefScope2pos[(name, def_scope)]
                (lin, col) = iBuilder.revFlatMat[pos]
                (tok_lin, tok_col) = iBuilder.revTokMap[(lin, col)]

                candidates.append(('Nice2Predict', def_scope, tok_lin, tok_col,
                                   isGlobal.get((name, pos),
                                                True), name, '', ''))
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'ScopeAnalyst fail')

#         # Run the JSNice from http://www.jsnice.org
#         jsNice = JSNice()
#         (ok, _out, _err) = jsNice.run(temp_files['path_tmp_u_a'],
#                                       temp_files['path_tmp_jsnice'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'JSNice fail')
#
#         ok = clear.run(temp_files['path_tmp_jsnice'],
#                        temp_files['path_jsnice'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Beautifier fail')
#
#         try:
#             lexer = Lexer(temp_files['path_jsnice'])
#             iBuilder = IndexBuilder(lexer.tokenList)
#         except:
#             cleanup(temp_files)
#             return (js_file_path, None, 'IndexBuilder fail')
#
#         try:
#             scopeAnalyst = ScopeAnalyst(os.path.join(
#                                  os.path.dirname(os.path.realpath(__file__)),
#                                  temp_files['path_jsnice']))
#             nameOrigin = scopeAnalyst.nameOrigin
#             isGlobal = scopeAnalyst.isGlobal
#
#             for (name, def_scope) in nameOrigin.iterkeys():
#
#                 pos = scopeAnalyst.nameDefScope2pos[(name, def_scope)]
#                 (lin,col) = iBuilder.revFlatMat[pos]
#                 (tok_lin,tok_col) = iBuilder.revTokMap[(lin,col)]
#
#                 candidates.append(('JSNice', def_scope,
#                                    tok_lin, tok_col,
#                                    isGlobal.get((name, pos), True),
#                                    name, '',''))
#         except:
#             cleanup(temp_files)
#             return (js_file_path, None, 'ScopeAnalyst fail')

        # Compute scoping: name2scope is a dictionary where keys
        # are (name, start_index) tuples and values are scope identifiers.
        # Note: start_index is a flat (unidimensional) index,
        # not a (line_chr_idx, col_chr_idx) index.
        try:
            scopeAnalyst = ScopeAnalyst(
                os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             temp_files['path_tmp_u_a']))
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'ScopeAnalyst fail')

        # Baseline translation: No renaming, no scoping
        no_renaming = []
        for _line_idx, line in enumerate(iBuilder_ugly.tokens):
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

        with open(temp_files['f2'], 'w') as f_no_renaming:
            f_no_renaming.writelines(no_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
                           'train.no_renaming', 'tuning', 'moses.ini'))
        (_moses_ok, translation_no_renaming,
         _err) = moses.run(temp_files['f2'])

        nc = processTranslationUnscoped(translation_no_renaming, iBuilder_ugly,
                                        lm_path, temp_files['f2'], output_path,
                                        base_name)
        if nc:
            candidates += nc

#  translation, iBuilder, lm_path,
#                                f_path, output_path, base_name
# Default translation: No renaming
#         no_renaming = []
#         for _line_idx, line in enumerate(iBuilder_ugly.tokens):
#             no_renaming.append(' '.join([t for (_tt,t) in line]) + "\n")
#
#         with open(temp_files['f2'], 'w') as f_no_renaming:
#             f_no_renaming.writelines(no_renaming)
#
#         moses = MosesDecoder(ini_path=os.path.join(ini_path, \
#                            'train.no_renaming', 'tuning', 'moses.ini'))
#         (_moses_ok, translation, _err) = moses.run(temp_files['f2'])

        nc = processTranslationScoped(translation_no_renaming, iBuilder_ugly,
                                      scopeAnalyst, lm_path, temp_files['f2'],
                                      output_path, base_name)
        if nc:
            candidates += nc

        # More complicated renaming: collect the context around
        # each name (global variables, API calls, punctuation)
        # and build a hash of the concatenation.
        hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst,
                                                       iBuilder_ugly,
                                                       twoLines=False,
                                                       debug=False)
        with open(temp_files['f5'], 'w') as f_hash_def_one_renaming:
            f_hash_def_one_renaming.writelines(hash_def_one_renaming)

#        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
#                           'train.hash_def_one_renaming', 'tuning', 'moses.ini'))
#        (_moses_ok,
#            translation_hash_renaming,
#            _err) = moses.run(temp_files['f5'])

        mosesParams = {}
        mosesParams["text"] = hash_def_one_renaming  #lex_ugly.collapsedText
        #mosesParams["align"] = "true"
        #mosesParams["report-all-factors"] = "true"

        mresults = proxy.translate(
            mosesParams)  # __request("translate", mosesParams)
        rawText = Postprocessor(mresults["nbest"])
        translation_hash_renaming = rawText.getProcessedOutput()

        nc = processTranslationScoped(translation_hash_renaming, iBuilder_ugly,
                                      scopeAnalyst, lm_path, temp_files['f5'],
                                      output_path, base_name)
        if nc:
            candidates += nc


#        nc = processTranslationScopedFallback(translation_hash_renaming,
#                                              translation_no_renaming,
#                                              iBuilder_ugly,
#                                              scopeAnalyst,
#                                              lm_path,
#                                              temp_files['f7'],
#                                              output_path,
#                                              base_name)
#        if nc:
#            candidates += nc

        cleanup(temp_files)
        cleanupRenamed(pid)
        return (js_file_path, 'OK', candidates)

    except Exception, e:
        cleanup(temp_files)
        cleanupRenamed(pid)
        return (js_file_path, None, str(e).replace("\n", ""))
Example #16
    def deobfuscateJS(self, obfuscatedCode, transactionID):
        proxy = xmlrpclib.ServerProxy("http://godeep.cs.ucdavis.edu:8080/RPC2")

        mosesParams = {}
        candidates = []
        baseDir = "/home/ccasal/temp/"
        tempFile = baseDir + str(transactionID) + "_temp.js"
        lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.970k/js.blm.lm"

        preproFile = baseDir + str(transactionID) + "_prepro.js"
        beautFile = baseDir + str(transactionID) + "_beaut.js"

        # Strip comments, replace literals, etc
        try:
            prepro = WebPreprocessor(obfuscatedCode)
            #TODO replace with: prepro = WebPreprocessor(text)
            prepro.write_temp_file(preproFile)
        except:
            cleanup([preproFile])
            print("Preprocessor failed")
            return ("Preprocessor Failed")

        clear = Beautifier()
        #TODO: Need a text version of beautifier to avoid the file read and write.
        #(ok, beautText, err) = clear.webRun(preproText)
        ok = clear.run(preproFile, beautFile)
        print(ok)
        if (not ok):
            cleanup([preproFile, beautFile])
            return ("Beautifier Failed")
            #quit()

        try:
            lex_ugly = Lexer(beautFile)
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            cleanup([preproFile, beautFile])
            print("IndexBuilder fail")
            return ("IndexBuilder Failed")

        lex_ugly.write_temp_file(tempFile)

        #Do Scope related tasks
        #a raw text version
        try:
            scopeAnalyst = ScopeAnalyst(tempFile)
        except:
            cleanup({"temp": tempFile})
            print("ScopeAnalyst Fail")
            return ("ScopeAnalyst Failed")

        #Do Rename related tasks
        #In our case, I don't think we need to actually do anything for no_renaming
        #no_renaming = []
        #for _line_idx, line in enumerate(iBuilder_ugly.tokens):
        #    no_renaming.append(' '.join([t for (_tt,t) in line]) + "\n")

        #Hash_def_one_renaming
        #beautText = renameUsingHashDefLine(scopeAnalyst,
        #                                               iBuilder_ugly,
        #                                               twoLines=False,
        #                                                debug=False)
        print(lex_ugly.collapsedText)
        mosesParams["text"] = lex_ugly.collapsedText
        mosesParams["align"] = "true"
        mosesParams["report-all-factors"] = "true"

        results = proxy.translate(
            mosesParams)  # __request("translate", mosesParams)
        rawText = Postprocessor(results["nbest"])
        translation = rawText.getProcessedOutput()

        #Send to output:
        cleanup([preproFile, beautFile, tempFile])
        return (translation)
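
The Moses round trip at the heart of deobfuscateJS, isolated as a sketch (same endpoint and parameters as the example; Postprocessor behavior assumed from its usage above):

import xmlrpclib

def translateText(text, url="http://godeep.cs.ucdavis.edu:8080/RPC2"):
    proxy = xmlrpclib.ServerProxy(url)
    results = proxy.translate({"text": text,
                               "align": "true",
                               "report-all-factors": "true"})
    #Postprocessor turns the raw n-best output back into source text.
    return Postprocessor(results["nbest"]).getProcessedOutput()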
Example #17
def processFile(l):
    
    def localCleanup(output_path, base_names):
        for base_name in base_names:
            tryRemove(os.path.join(output_path, base_name))
    
    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]
    
    pid = int(multiprocessing.current_process().ident)

    candidates = []
    
    try:
#     if True:
        # Temp files to be created during processing
        path_tmp = 'tmp_%d.js' % (pid)
        path_tmp_b = 'tmp_%d.b.js' % (pid)
        path_tmp_b_a = 'tmp_%d.b.a.js' % (pid)
        path_tmp_u = 'tmp_%d.u.js' % (pid)
        path_tmp_u_a = 'tmp_%d.u.a.js' % (pid)
        path_tmp_unugly = 'tmp_%d.n2p.js' % (pid)
        path_tmp_jsnice = 'tmp_%d.jsnice.js' % (pid)
        
        f2 = 'tmp_%d.no_renaming.js' % (pid)
        f3 = 'tmp_%d.basic_renaming.js' % (pid)
        f4 = 'tmp_%d.hash_renaming.js' % (pid)
        f5 = 'tmp_%d.hash_def_one_renaming.js' % (pid)
        f6 = 'tmp_%d.hash_def_two_renaming.js' % (pid)
        
        path_orig = '%s.js' % (base_name)
        path_ugly = '%s.u.js' % (base_name)
        path_unugly = '%s.n2p.js' % (base_name)
        path_jsnice = '%s.jsnice.js' % (base_name)
        
        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(path_tmp)
        except:
            cleanup(pid)
            return (js_file_path, None, 'Preprocessor fail')
        
        # Pass through beautifier to fix layout
        clear = Beautifier()
        ok = clear.run(path_tmp, path_tmp_b+'.tmp1')
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Beautifier 1 fail')
         
        jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
        
        (ok, _out, _err) = jsNiceBeautifier.run(path_tmp_b+'.tmp1', path_tmp_b+'.tmp2')
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'JSNice Beautifier 1 fail')

        ok = clear.run(path_tmp_b+'.tmp2', path_tmp_b)
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Beautifier 1 fail')
         
        # Minify
        ugly = Uglifier()
        ok = ugly.run(path_tmp_b, path_tmp_u)
        
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Uglifier fail')
        
        # Num tokens before vs after
        try:
            tok_clear = Lexer(path_tmp_b).tokenList
            tok_ugly = Lexer(path_tmp_u).tokenList
        except:
            cleanup(pid)
            return (js_file_path, None, 'Lexer fail')
       
        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(pid)
            return (js_file_path, None, 'Num tokens mismatch')
        
        # Align minified and clear files, in case the beautifier 
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(path_tmp_b, path_tmp_u)
        except:
            cleanup(pid)
            return (js_file_path, None, 'Aligner fail')
        
        try:
#             iBuilder_clear = IndexBuilder(Lexer(path_tmp_b_a).tokenList)
            iBuilder_ugly = IndexBuilder(Lexer(path_tmp_u_a).tokenList)
        except:
            cleanup(pid)
            return (js_file_path, None, 'IndexBuilder fail')
        
        
        # Store original and uglified versions
        ok = clear.run(path_tmp_u_a, os.path.join(output_path, path_ugly))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly])
            return (js_file_path, None, 'Beautifier 2 fail')
        
        ok = clear.run(path_tmp_b_a, os.path.join(output_path, path_orig))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig])
            return (js_file_path, None, 'Beautifier 3 fail')
        
        
        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, _out, _err) = unuglifyJS.run(path_tmp_b_a, path_tmp_unugly)
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig])
            return (js_file_path, None, 'Nice2Predict fail')
        
        ok = clear.run(path_tmp_unugly, path_tmp_unugly+'.tmp1')
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'Beautifier 4 fail')
        
        (ok, _out, _err) = jsNiceBeautifier.run(path_tmp_unugly+'.tmp1', path_tmp_unugly+'.tmp2')
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'JSNice Beautifier 2 fail')
    
        ok = clear.run(path_tmp_unugly+'.tmp2', os.path.join(output_path, path_unugly))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'Beautifier 4 fail')

        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                                 os.path.dirname(os.path.realpath(__file__)), 
                                 path_tmp_unugly))
            nameOrigin = scopeAnalyst.nameOrigin
            for (name, def_scope) in nameOrigin.iterkeys():
                candidates.append(('Nice2Predict', def_scope, name, '', ''))
        except:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'ScopeAnalyst fail')
    
    
    
        # Run the JSNice from http://www.jsnice.org
        jsNice = JSNice()
        (ok, _out, _err) = jsNice.run(path_tmp_b_a, path_tmp_jsnice)
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'JSNice fail')

        ok = clear.run(path_tmp_jsnice, os.path.join(output_path, path_jsnice))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, \
                                       path_unugly, path_jsnice])
            return (js_file_path, None, 'Beautifier 5 fail')
        
        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                                 os.path.dirname(os.path.realpath(__file__)), 
                                 path_tmp_jsnice))
            nameOrigin = scopeAnalyst.nameOrigin
            for (name, def_scope) in nameOrigin.iterkeys():
                candidates.append(('JSNice', def_scope, name, '', ''))
        except:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, \
                                       path_unugly, path_jsnice])
            return (js_file_path, None, 'ScopeAnalyst fail')
        
        
        
        # Compute scoping: name2scope is a dictionary where keys
        # are (name, start_index) tuples and values are scope identifiers. 
        # Note: start_index is a flat (unidimensional) index, 
        # not a (line_chr_idx, col_chr_idx) index.
        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                                 os.path.dirname(os.path.realpath(__file__)), 
                                 path_tmp_u_a))
            _name2defScope = scopeAnalyst.resolve_scope()
            _isGlobal = scopeAnalyst.isGlobal
            _name2useScope = scopeAnalyst.resolve_use_scope()
        except Exception:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, \
                                       path_unugly, path_jsnice])
            return (js_file_path, None, 'ScopeAnalyst fail')
        
        
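        # Baseline: no renaming at all; the raw (minified) token stream is
        # fed to the Moses model trained without renaming.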
        no_renaming = []
        for line in iBuilder_ugly.tokens:
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")
        
        with open(f2, 'w') as f_no_renaming:
            f_no_renaming.writelines(no_renaming)
        
        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
                           'train.no_renaming', 'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f2)

        nc = processTranslation(translation, iBuilder_ugly, 
                       scopeAnalyst, lm_path, f2,
                       output_path, base_name, clear)
        if nc:
            candidates += nc
        
        
        # Simple renaming: disambiguate overloaded names using scope id
        basic_renaming = renameUsingScopeId(scopeAnalyst, iBuilder_ugly)
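        # (Hypothetical illustration: two distinct variables that were both
        # minified to 'n' would receive distinct names, one per scope id,
        # so that Moses can translate them independently.)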
        with open(f3, 'w') as f_basic_renaming:
            f_basic_renaming.writelines(basic_renaming)
        
        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
                           'train.basic_renaming', 'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f3)
        nc = processTranslation(translation, iBuilder_ugly, 
                       scopeAnalyst, lm_path, f3,
                       output_path, base_name, clear)
        if nc:
            candidates += nc
            
        
        # More complicated renaming: collect the context around  
        # each name (global variables, API calls, punctuation)
        # and build a hash of the concatenation.
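        # The hash-renaming tests further below illustrate the general idea:
        # a name is replaced by a hash of its context, e.g. <<function(#,){>>,
        # where '#' marks the renamed identifier's position (those tests
        # exercise the definition-line variant, renameUsingHashDefLine).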
        hash_renaming = renameUsingHashAllPrec(scopeAnalyst, 
                                               iBuilder_ugly,
                                               debug=False)
#         print hash_renaming
        with open(f4, 'w') as f_hash_renaming:
            f_hash_renaming.writelines(hash_renaming)
        
        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
                           'train.hash_renaming', 'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f4)
        
        nc = processTranslation(translation, iBuilder_ugly, 
                       scopeAnalyst, lm_path, f4,
                       output_path, base_name, clear)
        if nc:
            candidates += nc
        
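        # Hash renaming restricted to the definition line only (one-line
        # context; twoLines=False).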
        hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst, 
                                                   iBuilder_ugly, 
                                                   twoLines=False,
                                                   debug=False)
        with open(f5, 'w') as f_hash_def_one_renaming:
            f_hash_def_one_renaming.writelines(hash_def_one_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
                           'train.hash_def_one_renaming', 'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f5)
        
        nc = processTranslation(translation, iBuilder_ugly, 
                       scopeAnalyst, lm_path, f5,
                       output_path, base_name, clear)
        if nc:
            candidates += nc
            

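        # The same hash renaming, but with two lines of definition context
        # (twoLines=True).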
        hash_def_two_renaming = renameUsingHashDefLine(scopeAnalyst, 
                                                   iBuilder_ugly, 
                                                   twoLines=True,
                                                   debug=False)
        with open(f6, 'w') as f_hash_def_two_renaming: 
            f_hash_def_two_renaming.writelines(hash_def_two_renaming)
        
        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
                           'train.hash_def_two_renaming', 'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f6)
        
        nc = processTranslation(translation, iBuilder_ugly, 
                       scopeAnalyst, lm_path, f6,
                       output_path, base_name, clear)
        if nc:
            candidates += nc
            
        
        cleanup(pid)
        cleanupRenamed(pid)
        return (js_file_path, 'OK', candidates)


    except Exception, e:
        cleanup(pid)
        cleanupRenamed(pid)
        return (js_file_path, None, str(e).replace("\n", ""))
Exemple #18
0
    def testHashDefRenaming(self):
        '''
        TODO: Test that the hashing functions use the context correctly for
        both the one-line and two-line options. The goals are to confirm
        (a) correct line summarization and (b) consistent naming of the same
        variable. Note that two different variables may still map to the
        same name when there is insufficient context.
        '''
        #print(self.obsfuscatedTextFiles[0])
        ib1 = IndexBuilder(self.obsLexed[0].tokenList)
        sa1 = ScopeAnalyst(self.obsfuscatedTextFiles[0])

        RS = RenamingStrategies()
        preRen = PreRenamer()
        oneLine1 = preRen.rename(RS.HASH_ONE, ib1, sa1, True)
        twoLine1 = preRen.rename(RS.HASH_TWO, ib1, sa1, True)
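        # RS.HASH_ONE / RS.HASH_TWO select one- vs. two-line hashing context;
        # the trailing True is presumably the debug flag, mirroring the
        # commented-out renameUsingHashDefLine(sa1, ib1, twoLines, debug)
        # calls below.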

        #         oneLine1 = renameUsingHashDefLine(sa1, ib1, False, True)
        #         twoLine1 = renameUsingHashDefLine(sa1, ib1, True, True)

        #print("OneLine1------------------------------------------------")
        #print(oneLine1)
        #print("TwoLine1------------------------------------------------")
        #print(twoLine1)

        #One line tests
        lines = oneLine1.split("\n")
        self.assertTrue(lines[0] == "var geom2d = function ( ) {")
        #var <<var#=numeric.sum,=numeric.numberEquals;>> = numeric . sum , <<var=numeric.sum,#=numeric.numberEquals;>> = numeric . numberEquals ;
        self.assertTrue(
            lines[1] ==
            "var <<var#=numeric.sum,=numeric.numberEquals;>> = numeric . sum , <<var=numeric.sum,#=numeric.numberEquals;>> = numeric . numberEquals ;"
        )
        self.assertTrue(
            lines[3] ==
            "function <<function#(,){>> ( <<function(#,){>> , <<function(,#){>> ) {"
        )
        self.assertTrue(lines[4] == "this . x = <<function(#,){>> ;"
                        )  # x is not transformed: it is global, so it cannot be renamed.
        #print(lines[7])
        self.assertTrue(
            lines[7] == "u ( <<function#(,){>> , {"
        )  # u is not transformed because its hash <<function#(,){>> is already in use in the same scope (this is why u CAN be renamed with the two-line option).
        self.assertTrue(
            lines[16] ==
            "for ( var <<for(var#in)[]=[];>> in <<function(,#){>> ) <<function(#,){>> [ <<for(var#in)[]=[];>> ] = <<function(,#){>> [ <<for(var#in)[]=[];>> ] ;"
        )
        self.assertTrue(lines[20] == "Vector2d : <<function#(,){>>")
        #Two line tests (TODO)
        lines = twoLine1.split("\n")
        self.assertTrue(lines[0] == "var geom2d = function ( ) {")

        self.assertTrue(
            lines[1] ==
            "var <<var#=numeric.sum,=numeric.numberEquals;return#([this.x*.x,this.y*.y]);>> = numeric . sum , <<var=numeric.sum,#=numeric.numberEquals;return#(this.x,.x,)&&(this.y,.y,);>> = numeric . numberEquals ;"
        )
        #                            function <<function#(,){(#,{>> ( <<function(#,){this.x=#;>> , <<function(,#){this.y=#;>> ) {
        self.assertTrue(
            lines[3] ==
            "function <<function#(,){(#,{>> ( <<function(#,){this.x=#;>> , <<function(,#){this.y=#;>> ) {"
        )
        self.assertTrue(lines[4] == "this . x = <<function(#,){this.x=#;>> ;"
                        )  # x is not transformed: it is global, so it cannot be renamed.

        #u(r, {
        #                            #<<function#(,){#(,{>> ( <<function#(,){(#,{>> , {
        self.assertTrue(
            lines[7] == "<<function#(,){#(,{>> ( <<function#(,){(#,{>> , {"
        )  # Here u IS transformed, though the argument order seems backwards.
        self.assertTrue(
            lines[16] ==
            "for ( var <<for(var#in)[]=[];for(varin)[#]=[];>> in <<function(,#){for(varin#)[]=[];>> ) <<function(#,){for(varin)#[]=[];>> [ <<for(var#in)[]=[];for(varin)[#]=[];>> ] = <<function(,#){for(varin#)[]=[];>> [ <<for(var#in)[]=[];for(varin)[#]=[];>> ] ;"
        )  #Not really two lines, but two references?
        self.assertTrue(lines[20] == "Vector2d : <<function#(,){(#,{>>")

        self.assertTrue(True)
Exemple #19
0
# NOTE: this snippet assumes the project-local definitions (Preprocessor,
# Beautifier, Lexer, IndexBuilder, ScopeAnalyst, renameUsingHashDefLine)
# have already been imported.
import os
import sys

# The input file is assumed to come from sys.argv[1]; the snippet as
# published started at sys.argv[2].
input_file = os.path.abspath(sys.argv[1])
output_file = os.path.abspath(sys.argv[2])
mode = int(sys.argv[3])


prepro = Preprocessor(input_file)
prepro.write_temp_file('tmp.js')

clear = Beautifier()
ok = clear.run('tmp.js',
               'tmp.b.js')
if not ok:
    sys.exit('Beautifier fail')

lexer = Lexer('tmp.b.js')
iBuilder = IndexBuilder(lexer.tokenList)

scopeAnalyst = ScopeAnalyst(os.path.join(
                         os.path.dirname(os.path.realpath(__file__)), 
                         'tmp.b.js'))

hash_renaming = renameUsingHashDefLine(scopeAnalyst, 
                                   iBuilder, 
                                   twoLines=False,
                                   debug=mode)

with open(output_file, 'w') as f:
    f.writelines(hash_renaming)
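# For reference: hash_renaming holds the source lines with local names
# replaced by context hashes such as <<function(#,){>> (see the
# hash-renaming tests above).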
# writeTmpLines(hash_renaming, output_file)
 
# clear = Beautifier()
# ok = clear.run(tmp_path, os.path.join(output_path, o_path))
# if not ok:
#     return False