Example #1
def processFile(js_file_name):

    candidates = []

    lexer = Lexer(js_file_name)
    iBuilder = IndexBuilder(lexer.tokenList)

    scopeAnalyst = ScopeAnalyst(js_file_name)
    nameOrigin = scopeAnalyst.nameOrigin
    isGlobal = scopeAnalyst.isGlobal
    nameDefScope2pos = scopeAnalyst.nameDefScope2pos

    for (name, def_scope) in nameOrigin.iterkeys():
        pos = nameDefScope2pos[(name, def_scope)]

        (lin, col) = iBuilder.revFlatMat[pos]
        scope = iBuilder.revTokMap[(lin, col)]

        glb = isGlobal.get((name, pos), True)


        if name != 'TOKEN_LITERAL_STRING' and \
                name != 'TOKEN_LITERAL_NUMBER':
            candidates.append((scope, name, pos, (lin, col), glb, def_scope))

    print
    print
    for c in sorted(candidates, key=lambda e: e[0]):
        (scope, name, pos, (lin, col), glb, def_scope) = c

        if name == 'n' or name == 'calendarEventId':
            print '\t', scope, name, pos, (lin, col), glb
            print '\t\t', def_scope
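
For context, a minimal driver for the example above; it assumes only the jsnaughty-style Lexer, IndexBuilder and ScopeAnalyst imports the snippet already relies on, and takes the target .js file from the command line:

# Hedged usage sketch; invoke as, e.g.: python example1.py some_file.js
if __name__ == '__main__':
    import sys
    # Prints the sorted (scope, name, pos, ...) candidates for the file.
    processFile(sys.argv[1])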
Example #2
    def testFiles(self):
        tf = [1, 5, 6, 7, 8, 9, 10, 11]
        #tf = [11]

        for i in tf:
            print("-----------------------------------------------------")
            lexed = Lexer(self.fileList[i - 1])
            ib = IndexBuilder(lexed.tokenList)
            #print(ib)
            sa = ScopeAnalyst(self.fileList[i - 1])
            print(sa)
            nameCount = {}
            #TODO: Grab only the non-globals to look at (get the start key and look it up)
            for variable in sa.nameDefScope2pos.keys():
                start = sa.nameDefScope2pos[variable]
                name = variable[0]
                if (not sa.isGlobal[(name, start)]):
                    if (name in nameCount):
                        nameCount[name] += 1
                    else:
                        nameCount[name] = 1
                    print(
                        str(name) + " : " +
                        str(sa.nameDefScope2pos[variable]) + " -> " +
                        str(ib.revFlatMat[sa.nameDefScope2pos[variable]]) +
                        " Manual: " + str(self.file_definitions[i][name]))
                    assert (ib.revFlatMat[sa.nameDefScope2pos[variable]][0]
                            in self.file_definitions[i][name])

            #Finally make sure that the count of definitions matches our manual check.
            for name, count in nameCount.iteritems():
                print(name + " : " + str(count) + " =?= " +
                      str(len(self.file_definitions[i][name])))
                assert (len(self.file_definitions[i][name]) == count)
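
The assertions above pin down the shape of the fixture self.file_definitions (not shown in this excerpt): it maps a 1-based file number to a dict from variable name to the collection of definition lines, which must support both membership tests and len(). A hypothetical entry consistent with that usage:

# Illustrative only; the real fixture lives in the test's setUp.
file_definitions = {
    1: {'foo': [3, 17],  # 'foo' defined on lines 3 and 17
        'bar': [5]}      # 'bar' defined once, on line 5
}
assert 3 in file_definitions[1]['foo']
assert len(file_definitions[1]['foo']) == 2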
Example #3
        def load(pth):
            lexer = Lexer(pth)
            iBuilder = IndexBuilder(lexer.tokenList)

            scopeAnalyst = ScopeAnalyst(
                os.path.join(os.path.dirname(os.path.realpath(__file__)), pth))

            return (iBuilder, scopeAnalyst)
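
Call sites can unpack the pair directly; a hedged usage sketch (the file name is a placeholder):

(iBuilder, scopeAnalyst) = load('some_file.js')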
Example #4
 def testFiles(self):
     #Known bugs:  The definitions of sum and numberEquals in test_file1 seem to be pointing to the wrong instance...
     i = 1
     lexed = Lexer(self.fileList[0])
     ib = IndexBuilder(lexed.tokenList)
     sa = ScopeAnalyst(self.fileList[0])
     for variable in sa.nameDefScope2pos.keys():
         print(
             str(variable[0]) + " : " + str(sa.nameDefScope2pos[variable]) +
             " -> " + str(ib.revFlatMat[sa.nameDefScope2pos[variable]]))
Example #5
    def testMinifiableLines(self):
        expected = {}
        expected[0] = set([1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 15, 16, 17, 20])
        expected[5] = set([8, 9])

        for i in [0, 5]:
            ib = IndexBuilder(self.clearLexed[i].tokenList)
            sa = ScopeAnalyst(self.clearTextFiles[i])

            lines = sa.getMinifiableLines(ib)
            print("i:" + str(i))
            print(lines)
            print(expected[i])
            self.assertTrue(lines == expected[i])
            text = ib.get_text_on_lines_wo_literals(lines)
            print(text)
            print(len(text.split("\n")))
            print(len(expected[i]))
            self.assertTrue(len(text.split("\n")) == len(expected[i]))
Example #6
def processFile(l):
    
    js_file_name = l
    
    candidates = []
    
    if(True):    
    #try:
        print(js_file_name)
        lexer = Lexer(js_file_name)
        return IndexBuilder(lexer.tokenList)        
Example #7
def processFile(l):

    js_file_name = l

    candidates = []

    try:
        lexer = Lexer(os.path.join(results_path, js_file_name))
        iBuilder = IndexBuilder(lexer.tokenList)

        scopeAnalyst = ScopeAnalyst(os.path.join(results_path, js_file_name))
        nameOrigin = scopeAnalyst.nameOrigin
        isGlobal = scopeAnalyst.isGlobal
        nameDefScope2pos = scopeAnalyst.nameDefScope2pos

        for (name, def_scope) in nameOrigin.iterkeys():
            pos = nameDefScope2pos[(name, def_scope)]

            (lin, col) = iBuilder.revFlatMat[pos]
            scope = iBuilder.revTokMap[(lin, col)]

            glb = isGlobal.get((name, pos), True)

            #             print name, def_scope, pos, scope, glb #, (lin,col)

            #             if not isGlobal.get((name, pos), True):
            #                 scope = def_scope.replace("\"","")
            #                 i = scope.find('[variables][_values]')
            #                 if i > -1:
            #                     scope = scope[:i+len('[variables][_values]')]
            #                 i = scope.find('[functions][_values]')
            #                 if i > -1:
            #                     scope = scope[:i+len('[functions][_values]')]

            if name != 'TOKEN_LITERAL_STRING' and \
                    name != 'TOKEN_LITERAL_NUMBER':
                candidates.append((scope, name, glb))

    except:
        return (js_file_name, None, 'ScopeAnalyst fail')


#     print 'candidates------------------'
#     for candidate in candidates:
#         print candidate

    return (js_file_name, 'OK', candidates)
Example #8
def summarizeUnscopedTranslation(renaming_map, f_path, translation_strategy,
                                 output_path, base_name, name_candidates,
                                 name_positions, iBuilder):

    nc = []

    f_base = os.path.basename(f_path)
    training_strategy = f_base.split('.')[1]
    tmp_path = '%s.%s.js' % (f_base[:-3], translation_strategy)
    o_path = '%s.%s.unscoped.%s.js' % (base_name, training_strategy,
                                       translation_strategy)

    #     print f_path, f_base, training_strategy, tmp_path, o_path, base_name

    writeTmpLines(renameHashed(iBuilder, name_positions, renaming_map),
                  tmp_path)

    clear = Beautifier()
    ok = clear.run(tmp_path, os.path.join(output_path, o_path))
    if not ok:
        return False

    try:
        lexer = Lexer(os.path.join(output_path, o_path))
        iBuilder_local = IndexBuilder(lexer.tokenList)

        scopeAnalyst_local = ScopeAnalyst(os.path.join(output_path, o_path))
    except:
        return False

    nameOrigin = scopeAnalyst_local.nameOrigin
    isGlobal = scopeAnalyst_local.isGlobal

    for (name, def_scope) in nameOrigin.iterkeys():

        pos = scopeAnalyst_local.nameDefScope2pos[(name, def_scope)]

        if not False:  #isGlobal.get((name, pos), True):
            (lin, col) = iBuilder_local.revFlatMat[pos]
            (tok_lin, tok_col) = iBuilder_local.revTokMap[(lin, col)]

            nc.append(
                ('%s.unscoped.%s' % (training_strategy, translation_strategy),
                 def_scope, tok_lin, tok_col, isGlobal.get(
                     (name, pos), True), name, '', ''))

    return nc
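
Note the asymmetric return type above: False on failure, otherwise a (possibly empty) list of candidate tuples. Since an empty list is also falsy, a caller should test identity rather than truthiness; a sketch with placeholder arguments:

nc = summarizeUnscopedTranslation(renaming_map, f_path, translation_strategy,
                                  output_path, base_name, name_candidates,
                                  name_positions, iBuilder)
if nc is not False:
    candidates += nc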
Example #9
def processFile(l):

    js_file_path = l[0]

    pid = int(multiprocessing.current_process().ident)

    try:
        # Temp files to be created during processing
        path_tmp = 'tmp_%d.js' % pid
        path_tmp_b = 'tmp_%d.b.js' % pid

        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(path_tmp)
        except:
            cleanup(pid)
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        ok = clear.run(path_tmp, path_tmp_b)

        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Beautifier fail')

        try:
            iBuilder_clear = IndexBuilder(Lexer(path_tmp_b).tokenList)
        except:
            cleanup(pid)
            return (js_file_path, None, 'IndexBuilder fail')

        n_lines = len(iBuilder_clear.tokens)
        max_line_len = max([len(l) for l in iBuilder_clear.tokens])

        cleanup(pid)
        return (js_file_path, n_lines, max_line_len)

    except Exception, e:
        cleanup(pid)
        return (js_file_path, None, str(e))
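
The cleanup(pid) helper is not part of this excerpt. A plausible sketch, assuming it simply deletes the two temp files created above:

import os

def cleanup(pid):
    # Remove this worker's temp files (tmp_<pid>.js and tmp_<pid>.b.js).
    for path in ('tmp_%d.js' % pid, 'tmp_%d.b.js' % pid):
        try:
            os.remove(path)
        except OSError:
            pass  # file already gone; nothing to do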
Example #10
def processFile(js_file_path):

    try:        
        
        js_text = open(os.path.join(files_root, js_file_path), 'r').read()
        
        # Strip comments, replace literals, etc
        try:
            prepro = WebLMPreprocessor(js_text)
            prepro_text = str(prepro)
        except:
            return (js_file_path, None, 'Preprocessor fail')
         
        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        
        try:
            lex_clear = WebLexer(beautified_text)
            tok1 = lex_clear.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')
        
        try:
            iBuilder1 = IndexBuilder(tok1)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        orig = [] 
        
        for _line_idx, line in enumerate(iBuilder1.tokens):
            orig.append(' '.join([t for (_tt,t) in line]) + "\n")
        
        return (js_file_path, orig)
        
    except Exception, e:
        return (js_file_path, None, str(e))
Example #11
 def testfileDebug(self):
     for f in self.fileList:
         print("---------------------------------- " + f +
               " ----------------------------------")
         orig = f + ".js"
         min = f + ".u.js"
         lo = Lexer(orig)
         lm = Lexer(min)
         print(
             "---------------------------------- original text ----------------------------------"
         )
         print(lo.programText)
         print(
             "---------------------------------- minified text ----------------------------------"
         )
         print(lm.programText)
         for id in self.ids:
             to_read = f + id + ".js"
             print("---------------------------------- " + to_read +
                   " ----------------------------------")
             lexed = Lexer(to_read)
             print(
                 "---------------------------------- text ----------------------------------"
             )
             print(lexed.programText)
             print(
                 "---------------------------------- tokenlist ----------------------------------"
             )
             print(lexed.tokenList)
             ib = IndexBuilder(lexed.tokenList)
             print(
                 "---------------------------------- IndexBuilder ----------------------------------"
             )
             print(ib)
             sa = ScopeAnalyst(to_read)
             print(
                 "---------------------------------- ScopeAnalyst ----------------------------------"
             )
             print(sa)
Example #12
def processFile(l):
    base_name = l[0]
    js_file_path = l[1]
    print(base_name)
    print(js_file_path)
    #if(True):
    try:
        lexed = Lexer(js_file_path)
        ib = IndexBuilder(lexed.tokenList)
        sa = ScopeAnalyst(js_file_path)
        #num globals = all in is_global == True + all unique names
        #in name2CharPositions not in is_global
        base_global = set(
            [name for name, value in sa.isGlobal.iteritems() if value == True])
        #Get all known names in the file.
        known_names = set([name for name, value in sa.isGlobal.iteritems()])
        for name, loc in ib.name2CharPositions.iteritems():
            if (name not in known_names):  #if never seen, it's a global
                base_global.add(name)

        return [base_name, len(base_global)]
    except:
        return [base_name, None]
Example #13
def processFile(l):

    js_file_path = l[0]

    #     if True:
    try:
        js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()

        # Strip comments, replace literals, etc
        try:
            prepro = WebLMPreprocessor(js_text)
            prepro_text = str(prepro)
        except:
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(
                WebLexer(tmp_beautified_text).tokenList,
                WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')

        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList

            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')

        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst fail')

        processed = []

        # Try different renaming strategies (hash, etc)
        for r_strategy in RS.all():

            try:
                #             if True:
                # Rename input prior to translation
                preRen = PreRenamer()
                after_text = preRen.rename(r_strategy, iBuilder_ugly,
                                           scopeAnalyst)

                (ok, beautified_after_text, _err) = clear.web_run(after_text)
                if not ok:
                    return (js_file_path, None, 'Beautifier fail')

                processed.append((r_strategy, beautified_after_text))

            except:
                return (js_file_path, None, 'Renaming fail')

        with open(os.path.join(output_path, 'orig', js_file_path), 'w') as f:
            f.write(beautified_text)

        for (r_strategy, text) in processed:
            with open(os.path.join(output_path, r_strategy, js_file_path),
                      'w') as f:
                f.write(text)

        return (js_file_path, 'OK', None)

    except Exception, e:
        return (js_file_path, None, str(e).replace("\n", ""))
Example #14
def getMosesTranslation(proxy,
                        r_strategy,
                        RS,
                        a_beautifier,
                        iBuilder_ugly,
                        scopeAnalyst_ugly,
                        debug_mode=False):
    """
    A helper function so that we can run multiple different renaming
    strategies through moses in a more modular and hopefully parallelizable
    manner.  It performs hashing/no hashing preparation of the file for
    the renaming strategy specified by r_stategy, and then calls the
    appropriate moses_server.
    
    Parameters
    ----------
    proxy: A pointer to which port the appropriate moses server is listening in on
    for this particular renaming strategy.

    r_strategy: One of the renaming strategies from RenamingStrategies
    
    RS: A renaming strategies object.
    
    a_beautifier: a beautify object to make sure the renamed text is 
    cleanly formatted.
   
    iBuilder_ugly: Index Builder for the minified file.
   
    scopeAnalyst_ugly: Scope Analyst for the minified file.
   
    start: The starting time for the preprocessing step.  Used for performance
    metrics.
    
    debug_mode: Print debug information? (True/False - defaults to False)
    
    Returns
    -------
    (status, error, translation, name_candidates, 
            a_iBuilder, a_scopeAnalyst, a_name_positions, 
            a_position_names, a_use_scopes, hash_name_map,
            pre_time, rn_time, m_time, post_start)
    
    status: Did this complete without error?  If False, then the rest of the output
    besides error will be empty/null.
    
    error: What is the reason for the failure?  If status is True (successful
    completion) this is "".
    
    translation: The raw Moses output
    
    name_candidates: The set of Moses suggestions for this renaming
    
    a_iBuilder,a_scopeAnalyst: Index Builder and Scope Analyst for this renaming
    
    a_name_positions, a_posistion_names, a_use_scopes: Addition tracking info
    
    hash_name_map: a map from the hashed names to the original minified names 
    
    rn_time, m_time, lex_time, post_start: The duration of the
    renaming, Moses translation steps, and lexing steps along with the start time for the
    postprocessing of the Moses output. 
    """
    rn_start = time.time()

    #We need both the base_text and the hashed_text.
    preRen = PreRenamer()
    if (debug_mode):
        print("Tokens-------------------")
        print(iBuilder_ugly.tokens)
        print("Tokens-------------------")
    #We always need the non-hashed names as a fallback.
    try:
        after_text = preRen.rename(r_strategy, iBuilder_ugly,
                                   scopeAnalyst_ugly)
    except:
        return (False, "Renaming failed for " + str(r_strategy), "", {}, None,
                None, {}, {}, {}, {}, 0, 0, 0, 0)

    (ok, beautified_after_text, _err) = a_beautifier.web_run(after_text)
    if not ok:
        return (False,
                "Beautifier failed on the renamed text for " + str(r_strategy),
                "", {}, None, None, {}, {}, {}, {}, 0, 0, 0, 0)

    # Align the hashed and non-hashed files, in case the beautifier
    # line-wrapped the extended lines.
    try:
        aligner = Aligner()
        (aligned_after, aligned_before) = aligner.web_align(
            WebLexer(beautified_after_text).tokenList,
            WebLexer(iBuilder_ugly.get_text()).tokenList)
    except:
        return (False,
                "Aligner failed on the renamed text for " + str(r_strategy),
                "", {}, None, None, {}, {}, {}, {}, 0, 0, 0, 0)

    #print("--------Aligned After-------")
    #print(aligned_after)
    #print("----------------------------")

    a_lexer = WebLexer(aligned_after)
    a_iBuilder = IndexBuilder(a_lexer.tokenList)
    a_scopeAnalyst = WebScopeAnalyst(aligned_after)

    hash_name_map = {}

    if (r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):

        #Something below here is buggy...
        orderedVarsMin = sorted(scopeAnalyst_ugly.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsHash = sorted(a_scopeAnalyst.name2defScope.keys(),
                                 key=lambda x: x[1])
        #print("Min len: " + str(len(orderedVarsMin)))
        #print("Hash len: " + str(len(orderedVarsHash)))
        if (len(orderedVarsMin) != len(orderedVarsHash)):
            return (False, "Mismatch between minified and hashed names.", "",
                    {}, a_iBuilder, a_scopeAnalyst, {}, {}, {}, {}, 0, 0, 0, 0)

        for i in range(0, len(orderedVarsHash)):
            name_hash = orderedVarsHash[i][0]
            def_scope_hash = a_scopeAnalyst.name2defScope[orderedVarsHash[i]]

            name_min = orderedVarsMin[i][0]
            def_scope_min = scopeAnalyst_ugly.name2defScope[orderedVarsMin[i]]
            hash_name_map[(name_hash, def_scope_hash)] = (name_min,
                                                          def_scope_min)

    if (debug_mode):
        print("HASH NAME MAP LEN: " + str(len(hash_name_map)))

    # We can switch this back once we train models on a corpus with literals
    # lx = WebLexer(a_iBuilder.get_text())
    lx = WebLexer(a_iBuilder.get_text_wo_literals())
    #print("-----------------Moses In ----------------------")
    #print(lx)
    #print("------------------------------------------------")
    #print(a_iBuilder.charPosition2Name)
    #print("------------------------------------------------")
    #line_subset = a_scopeAnalyst.getMinifiableLines(a_iBuilder)
    #line_list = sorted(list(line_subset))
    #line_map = {}
    #m_line = 0
    #for next_line in line_list:
    #    line_map[m_line] = next_line
    #    m_line += 1
    #lx = WebLexer(a_iBuilder.get_text_on_lines_wo_literals(line_subset))

    #Performance measures -> wrap up the preprocessing/renaming phases
    end = time.time()
    rn_time = end - rn_start
    m_start = time.time()
    #if(debug_mode):
    #    print("Invoking Moses.")
    #    print(lx.collapsedText)
    # Translate renamed input
    #md = WebMosesDecoder(proxy)
    #(ok, translation, _err) = md.run(lx.collapsedText)
    (ok, translation, _err) = segmentedTranslation(lx, SEGMENTED_TRANS_SIZE,
                                                   proxy, debug_mode)
    if not ok:
        return (False, "Moses server failed for " + str(r_strategy),
                translation, {}, a_iBuilder, a_scopeAnalyst, {}, {}, {},
                hash_name_map, 0, 0, 0, 0)

    m_end = time.time()
    m_time = m_end - m_start

    post_start = time.time()

    (a_name_positions, a_position_names,
     a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

    # name_candidates must exist even when Moses returns no translation,
    # since it is part of the return tuple below.
    name_candidates = {}
    if translation is not None:
        # Parse moses output
        mp = MosesParser()
        if (debug_mode):
            print(translation)

        name_candidates = mp.parse(translation, a_iBuilder,
                                   a_position_names)  #,
        #a_scopeAnalyst)

        #A slightly modified version of parse to remap the moses
        #output lines to the correct original lines.
        #name_candidates = mp.parse_subset(translation,
        #                                  a_iBuilder,
        #                                  a_position_names,
        #                                  line_map)

    lex_time = lx.build_time + a_lexer.build_time
    return (True, "", translation, name_candidates, a_iBuilder, a_scopeAnalyst,
            a_name_positions, a_position_names, a_use_scopes, hash_name_map,
            rn_time, m_time, lex_time, post_start)
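
Because everything funnels through one 14-slot tuple, callers should check status first; per the docstring, the other slots are empty/null on failure. A hedged call sketch (all arguments are placeholders):

(status, error, translation, name_candidates,
 a_iBuilder, a_scopeAnalyst, a_name_positions,
 a_position_names, a_use_scopes, hash_name_map,
 rn_time, m_time, lex_time, post_start) = getMosesTranslation(
     proxy, r_strategy, RS, a_beautifier, iBuilder_ugly, scopeAnalyst_ugly)
if not status:
    print(error)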
Example #15
def processFile(l):

    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    temp_files = {
        'orig': '%s.js' % base_name,
        'minified': '%s.u.js' % base_name,
        'n2p': '%s.n2p.js' % base_name
    }

    for r_strategy in RS.all():
        temp_files['%s' % (r_strategy)] = \
                    '%s.%s.js' % (base_name, r_strategy)

        for c_strategy in CS.all():
            temp_files['%s_%s' % (r_strategy, c_strategy)] = \
                    '%s.%s.%s.js' % (base_name, r_strategy, c_strategy)

    for k, v in temp_files.iteritems():
        temp_files[k] = os.path.join(output_path, v)

    candidates = []
    #Minified Name -> Original Name (name, def_scope) -> (name, def_scope)
    min_name_map = {}
    #Hashed Name -> Minified Name (name, def_scope) -> (name, def_scope)
    hash_name_map = {}
    #Minified Name -> jsnice name  (name, def_scope) -> (name, def_scope)
    jsnice_name_map = {}
    #Output Lines for the suggestion_model.csv
    model_rows = []

    try:
        js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()

        # Strip comments, replace literals, etc
        try:
            prepro = WebLMPreprocessor(js_text)
            prepro_text = str(prepro)
        except:
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(
                WebLexer(tmp_beautified_text).tokenList,
                WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')

        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList

            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')

        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        #try:
        #    iBuilder_clear = IndexBuilder(lex_clear.tokenList)
        #except:
        #    return (js_file_path, None, "IndexBuilder fail on original file.")

        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        with open(temp_files['orig'], 'w') as f:
            f.write(beautified_text)

        with open(temp_files['minified'], 'w') as f:
            f.write(minified_text)

#         try:
#             orig_lexer = WebLexer(beautified_text)
#             orig_iBuilder = IndexBuilder(orig_lexer.tokenList)
#             orig_scopeAnalyst = WebScopeAnalyst(beautified_text)
#         except:
#             return (js_file_path, None, 'IndexBuilder/Scoper fail on original')

        ########################
        #     Nice2Predict
        ########################

        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX
        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified'])
        if not ok:
            return (js_file_path, None, 'Nice2Predict fail')

        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        with open(temp_files['n2p'], 'w') as f:
            f.write(n2p_text_beautified)

        try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        except:
            return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

        # Save some translation stats to compare different methods
        ts = TranslationSummarizer()
        candidates += [['n2p', ''] + x for x in ts.compute_summary_unscoped(
            n2p_iBuilder, n2p_scopeAnalyst)]

        ################################################
        # All other JSNaughty variants
        ################################################

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst minified fail')

        try:
            scopeAnalyst_clear = WebScopeAnalyst(beautified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst clear fail')

        #if(not check(iBuilder_clear, scopeAnalyst_clear, n2p_iBuilder, n2p_scopeAnalyst)):
        #    return (js_file_path, None, 'JsNice restructured file. Skipping..')

        #Map the original names to the minified counterparts.
        orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])

        if (len(orderedVarsOld) != len(orderedVarsNew)):
            return (js_file_path, None,
                    "Old and New Name lists different length")

        if (len(orderedVarsOld) != len(orderedVarsN2p)):
            return (js_file_path, None,
                    "JsNice and Old Name lists different length")

        for i in range(0, len(orderedVarsOld)):
            name_old = orderedVarsOld[i][0]
            def_scope_old = scopeAnalyst_clear.name2defScope[orderedVarsOld[i]]

            name_new = orderedVarsNew[i][0]
            def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]
            min_name_map[(name_new, def_scope_new)] = (name_old, def_scope_old)

            name_n2p = orderedVarsN2p[i][0]
            def_scope_n2p = n2p_scopeAnalyst.name2defScope[orderedVarsN2p[i]]
            jsnice_name_map[(name_new, def_scope_new)] = (name_n2p,
                                                          def_scope_n2p)

        #Once we have the scopeAnalyst, iBuilder, and tokenlist for the minified
        #version, we can get the name properties
#        vm = VariableMetrics(scopeAnalyst, iBuilder_ugly, lex_ugly.tokenList)
#        variableKeySet = vm.getVariables()
#        for variableKey in variableKeySet:
#            name_features[variableKey] = vm.getNameMetrics(variableKey)



        (_name_positions,
         position_names,
         _use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

        # Try different renaming strategies (hash, etc)
        for r_strategy, proxy in proxies:

            # Rename input prior to translation
            preRen = PreRenamer()
            after_text = preRen.rename(r_strategy, iBuilder_ugly, scopeAnalyst)

            (ok, beautified_after_text, _err) = clear.web_run(after_text)
            if not ok:
                return (js_file_path, None, 'Beautifier fail')

            # Save renamed input to disk for future inspection
            with open(temp_files['%s' % (r_strategy)], 'w') as f:
                f.write(beautified_after_text)

            a_lexer = WebLexer(beautified_after_text)
            a_iBuilder = IndexBuilder(a_lexer.tokenList)
            a_scopeAnalyst = WebScopeAnalyst(beautified_after_text)

            if (r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                #                 try:
                #                     scopeAnalyst_hash = WebScopeAnalyst(beautified_after_text) #This should be beautified_after_text instead of after_text
                #                 except:
                #                     return (js_file_path, None, "ScopeAnalyst hash fail")

                #Map the hashed names to the minified counterparts.
                orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])
                orderedVarsHash = sorted(a_scopeAnalyst.name2defScope.keys(),
                                         key=lambda x: x[1])

                if (len(orderedVarsMin) != len(orderedVarsHash)):
                    return (js_file_path, None,
                            "Hash and Min lists different length")

                for i in range(0, len(orderedVarsHash)):
                    name_hash = orderedVarsHash[i][0]
                    def_scope_hash = a_scopeAnalyst.name2defScope[
                        orderedVarsHash[i]]

                    name_min = orderedVarsMin[i][0]
                    def_scope_min = scopeAnalyst.name2defScope[
                        orderedVarsMin[i]]
                    hash_name_map[(name_hash,
                                   def_scope_hash)] = (name_min, def_scope_min)

            # We can switch this back once we train models on a corpus with literals
            # lx = WebLexer(a_iBuilder.get_text())
            lx = WebLexer(a_iBuilder.get_text_wo_literals())

            # Translate renamed input
            md = WebMosesDecoder(proxy)
            (ok, translation, _err) = md.run(lx.collapsedText)
            if not ok:
                return (js_file_path, None, 'Moses translation fail')

            (a_name_positions, a_position_names,
             a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

            nc = []

            if translation is not None:
                # Parse moses output
                mp = MosesParser()

                name_candidates = mp.parse(translation, a_iBuilder,
                                           a_position_names)
                # name_candidates is a dictionary of dictionaries:
                # keys are (name, def_scope) tuples;
                # values are suggested translations with the sets
                # of line numbers on which they appear.
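                # For example (hypothetical values):
                #   name_candidates[('a', 'scope0')] ==
                #       {'counter': set([3, 7]), 'index': set([12])}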

                # Update name_candidates with some default values
                # (in this case the translation without any renaming)
                # if the translation is empty
                if r_strategy == RS.NONE:
                    # RS.NONE should always be first, by construction
                    name_candidates_default = name_candidates
                    scopeAnalyst_default = a_scopeAnalyst
                    iBuilder_default = a_iBuilder
                else:
                    for (key_default,
                         suggestions) in name_candidates_default.iteritems():
                        #                         (name_default, def_scope_default) = key_default

                        pos_default = scopeAnalyst_default.nameDefScope2pos[
                            key_default]
                        (lin, col) = iBuilder_default.revFlatMat[pos_default]
                        (line_num,
                         line_idx) = iBuilder_default.revTokMap[(lin, col)]

                        (name,
                         def_scope) = a_position_names[line_num][line_idx]
                        key = (name, def_scope)

                        for name_translation, lines in suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(
                                name_translation, set([]))
                            name_candidates[key][name_translation].update(
                                lines)

                # **** BV: This might be all we need to combine Naughty & Nice
                name_candidates_copy = deepcopy(name_candidates)
                for key, suggestions in name_candidates_copy.iteritems():

                    if r_strategy == RS.NONE:
                        (name_n2p, def_scope_n2p) = jsnice_name_map[key]
                    else:
                        (name_n2p,
                         def_scope_n2p) = jsnice_name_map[hash_name_map.get(
                             key, key)]

                    for name_translation, lines in suggestions.iteritems():
                        name_candidates.setdefault(key, {})
                        name_candidates[key].setdefault(name_n2p, set([]))
                        name_candidates[key][name_n2p].update(lines)

                cc = ConsistencyController(debug_mode=False)
                ts = TranslationSummarizer()

                # An identifier may have been translated inconsistently
                # across different lines (Moses treats each line independently).
                # Try different strategies to resolve inconsistencies, if any
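                # For example (hypothetical): Moses may suggest 'length' for
                # a name on one line but 'len' for the same name on another;
                # each c_strategy below picks a single winner per name.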
                for c_strategy in CS.all():

                    # Compute renaming map (x -> length, y -> width, ...)
                    # Note that x,y here are names after (hash) renaming
                    (temp_renaming_map, seen) = cc.computeRenaming(
                        c_strategy, name_candidates, a_name_positions,
                        a_use_scopes, a_iBuilder, lm_path, {}, hash_name_map)

                    # After computeRenaming, we have both the entropies stored
                    # if we are in LMDrop strategy and have the suggestions
                    # frequency from name_candidates.  Fill in suggestion_Features
                    #                    if(c_strategy == CS.LMDROP and r_strategy not in suggestion_features):
                    #                        assert(cc.suggestion_cache != None)
                    #                        suggestion_features[r_strategy] = {}
                    #                        """
                    #                        name_candidates: dict
                    #                            name_candidates[(name, def_scope)][name_translation]
                    #                            = set of line numbers in the translation
                    #                        """
                    #                        for variableKey, suggestionDictionary in name_candidates.iteritems():
                    #                            for suggestionName, linesSuggested in suggestionDictionary.iteritems():
                    #
                    #                                # I need to revert variableKey[0] in the suggestion from its hash to its original minified name.
                    #                                if(r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                    #                                    unhashedKey = hash_name_map[variableKey]
                    #                                    suggestionKey = (unhashedKey[0], unhashedKey[1], suggestionName)
                    #                                else:
                    #                                    suggestionKey = (variableKey[0], variableKey[1], suggestionName)
                    #
                    #                                entropyVals = cc.suggestion_cache.getEntropyStats(variableKey, suggestionName)
                    #                                if(entropyVals != (ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR)):
                    #                                    suggestionValue = [len(linesSuggested)] + \
                    #                                                       list(getSuggestionStats(suggestionName)) + \
                    #                                                       list(entropyVals)
                    #
                    #                                    suggestion_features[r_strategy][suggestionKey] = suggestionValue

                    # Fall back on original names in input, if
                    # no translation was suggested
                    postRen = PostRenamer()
                    renaming_map = postRen.updateRenamingMap(
                        a_name_positions, position_names, a_use_scopes,
                        temp_renaming_map, seen, r_strategy)

                    # Apply renaming map and save output for future inspection
                    renamed_text = postRen.applyRenaming(
                        a_iBuilder, a_name_positions, renaming_map)

                    (ok, beautified_renamed_text,
                     _err) = clear.web_run(renamed_text)
                    if not ok:
                        return (js_file_path, None, 'Beautifier fail')
                    with open(temp_files['%s_%s' % (r_strategy, c_strategy)],
                              'w') as f:
                        f.write(beautified_renamed_text)

                    # Save some stats about which names were renamed to what
                    # This is what enables the comparison between the different
                    # methods.
                    r = [[c_strategy] + x for x in ts.compute_summary_scoped(
                        renaming_map, name_candidates, a_iBuilder,
                        a_scopeAnalyst)]

                    if not r:
                        return (js_file_path, None, 'Compute summary failed')
                    nc += r

            if nc:
                candidates += [[r_strategy] + x for x in nc]

        #Create the rows for the suggestion_model.csv


#        for r_strategy in RS.all():
#            for suggestionKey, s_feat in suggestion_features[r_strategy].iteritems():
#                variableKey = (suggestionKey[0], suggestionKey[1])
#                original_name = min_name_map[variableKey][0]
#                js_nice_name = jsnice_name_map[variableKey][0]
#                n_feat = list(name_features[variableKey])
#                #Convert the def_scope to an equivalent, but smaller, easier to read key: (line_num, token_num)
#                newKey = scopeAnalyst.nameDefScope2pos[variableKey]
#                (keyLine, keyToken) = iBuilder_ugly.revFlatMat[newKey]
#                model_rows.append([original_name, r_strategy, suggestionKey[0], keyLine, keyToken, suggestionKey[2], js_nice_name] + n_feat + s_feat)

        return (js_file_path, 'OK', candidates, model_rows)

    except Exception, e:
        return (js_file_path, None, str(e).replace("\n", ""), model_rows)
Example #16
input_file = os.path.abspath(sys.argv[1])
output_file = os.path.abspath(sys.argv[2])
mode = int(sys.argv[3])


prepro = Preprocessor(input_file)
prepro.write_temp_file('tmp.js')

clear = Beautifier()
ok = clear.run('tmp.js', 
               'tmp.b.js')
  
lexer = Lexer('tmp.b.js')
iBuilder = IndexBuilder(lexer.tokenList)

scopeAnalyst = ScopeAnalyst(os.path.join(
                         os.path.dirname(os.path.realpath(__file__)), 
                         'tmp.b.js'))

hash_renaming = renameUsingHashDefLine(scopeAnalyst, 
                                   iBuilder, 
                                   twoLines=False,
                                   debug=mode)

with open(output_file, 'w') as f:
    f.writelines(hash_renaming)
# writeTmpLines(hash_renaming, output_file)
 
# clear = Beautifier()
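
A hedged invocation of the script above (the script file name is a placeholder): argv[1] is the input JS file, argv[2] the destination for the hash-renamed text, and argv[3] an integer debug flag passed through to renameUsingHashDefLine.

# python hash_rename.py input.js output.js 0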
Example #17
def processFile(l):

    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    pid = int(multiprocessing.current_process().ident)

    temp_files = {
        'path_tmp': 'tmp_%d.js' % pid,
        'path_tmp_b': 'tmp_%d.b.js' % pid,
        'path_tmp_b_1': 'tmp_%d.b.1.js' % pid,
        'path_tmp_b_2': 'tmp_%d.b.2.js' % pid,
        'path_tmp_b_a': 'tmp_%d.b.a.js' % pid,
        'path_tmp_u': 'tmp_%d.u.js' % pid,
        'path_tmp_u_a': 'tmp_%d.u.a.js' % pid,
        'path_tmp_unugly': 'tmp_%d.n2p.js' % pid,
        'path_tmp_unugly_1': 'tmp_%d.n2p.1.js' % pid,
        'path_tmp_unugly_2': 'tmp_%d.n2p.2.js' % pid,
        'path_tmp_jsnice': 'tmp_%d.jsnice.js' % pid,
        'f2': 'tmp_%d.no_renaming.js' % pid,
        #                   'f3': 'tmp_%d.basic_renaming.js' % pid,
        #                   'f4': 'tmp_%d.hash_renaming.js' % pid,
        'f5': 'tmp_%d.hash_def_one_renaming.js' % pid,
        #                   'f6': 'tmp_%d.hash_def_two_renaming.js' % pid,
        'f7': 'tmp_%d.hash_def_one_renaming_fb.js' % pid,
        'path_orig': os.path.join(output_path, '%s.js' % base_name),
        'path_ugly': os.path.join(output_path, '%s.u.js' % base_name),
        'path_unugly': os.path.join(output_path, '%s.n2p.js' % base_name),
        'path_jsnice': os.path.join(output_path, '%s.jsnice.js' % base_name)
    }

    #     for strategy in ['js', 'lm.js', 'len.js', 'freqlen.js']:
    #         for renaming in ['no_renaming', 'hash_def_one_renaming']:
    #             temp_files['path_tmp_%s_%s' % (renaming, strategy)] = \
    #                     'tmp_%d.%s.%s' % (pid, renaming, strategy)

    candidates = []

    #     if True:
    try:

        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(temp_files['path_tmp'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        ok = clear.run(temp_files['path_tmp'], temp_files['path_tmp_b'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

#         # Pass through beautifier to fix layout
#         clear = Beautifier()
#         ok = clear.run(temp_files['path_tmp'],
#                        temp_files['path_tmp_b_1'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Beautifier fail')
#
#         jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
#
#         (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp_b_1'],
#                                                 temp_files['path_tmp_b_2'])
#         if not ok:
#             cleanup(temp_files)
#             print js_file_path, _err
#             return (js_file_path, None, 'JSNice Beautifier fail')
#
#         ok = clear.run(temp_files['path_tmp_b_2'],
#                        temp_files['path_tmp_b'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Beautifier fail')
#
#
#         # Weird JSNice renamings despite --no-rename
#         try:
#             before = set([token for (token, token_type) in
#                           Lexer(temp_files['path_tmp_b_1']).tokenList
#                           if is_token_subtype(token_type, Token.Name)])
#             after = set([token for (token, token_type) in
#                           Lexer(temp_files['path_tmp_b']).tokenList
#                           if is_token_subtype(token_type, Token.Name)])
#
#             if not before == after:
#                 return (js_file_path, None, 'Weird JSNice renaming')
#
#         except:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Lexer fail')

        # Minify
        ugly = Uglifier()
        ok = ugly.run(temp_files['path_tmp_b'], temp_files['path_tmp_u'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Uglifier fail')

        # Num tokens before vs after
        try:
            tok_clear = Lexer(temp_files['path_tmp_b']).tokenList
            tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(temp_files)
            return (js_file_path, None, 'Num tokens mismatch')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(temp_files['path_tmp_b'], temp_files['path_tmp_u'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Aligner fail')



        if open(temp_files['path_tmp_b']).read() == \
                open(temp_files['path_tmp_u']).read():
            cleanup(temp_files)
            return (js_file_path, None, 'Not minified')

        try:
            lex_ugly = Lexer(temp_files['path_tmp_u_a'])
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')

        ############################################################
        # From now on only work with path_tmp_b_a and path_tmp_u_a
        ############################################################

        # Store original and uglified versions
        ok = clear.run(temp_files['path_tmp_b_a'], temp_files['path_orig'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        ok = clear.run(temp_files['path_tmp_u_a'], temp_files['path_ugly'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, _out, _err) = unuglifyJS.run(temp_files['path_tmp_u_a'],
                                          temp_files['path_tmp_unugly'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Nice2Predict fail')

        ok = clear.run(temp_files['path_tmp_unugly'],
                       temp_files['path_unugly'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')

#         ok = clear.run(temp_files['path_tmp_unugly'],
#                        temp_files['path_tmp_unugly_1'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Beautifier fail')
#
#         (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp_unugly_1'],
#                                                 temp_files['path_tmp_unugly_2'])
#         if not ok:
#             cleanup(temp_files)
#             print js_file_path, _err
#             return (js_file_path, None, 'JSNice Beautifier fail')
#
#         ok = clear.run(temp_files['path_tmp_unugly_2'],
#                        temp_files['path_unugly'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Beautifier fail')

        try:
            lexer = Lexer(temp_files['path_unugly'])
            iBuilder = IndexBuilder(lexer.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')

        try:
            scopeAnalyst = ScopeAnalyst(
                os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             temp_files['path_unugly']))
            nameOrigin = scopeAnalyst.nameOrigin
            isGlobal = scopeAnalyst.isGlobal

            for (name, def_scope) in nameOrigin.iterkeys():

                pos = scopeAnalyst.nameDefScope2pos[(name, def_scope)]
                (lin, col) = iBuilder.revFlatMat[pos]
                (tok_lin, tok_col) = iBuilder.revTokMap[(lin, col)]

                candidates.append(('Nice2Predict', def_scope, tok_lin, tok_col,
                                   isGlobal.get((name, pos),
                                                True), name, '', ''))
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'ScopeAnalyst fail')

#         # Run the JSNice from http://www.jsnice.org
#         jsNice = JSNice()
#         (ok, _out, _err) = jsNice.run(temp_files['path_tmp_u_a'],
#                                       temp_files['path_tmp_jsnice'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'JSNice fail')
#
#         ok = clear.run(temp_files['path_tmp_jsnice'],
#                        temp_files['path_jsnice'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Beautifier fail')
#
#         try:
#             lexer = Lexer(temp_files['path_jsnice'])
#             iBuilder = IndexBuilder(lexer.tokenList)
#         except:
#             cleanup(temp_files)
#             return (js_file_path, None, 'IndexBuilder fail')
#
#         try:
#             scopeAnalyst = ScopeAnalyst(os.path.join(
#                                  os.path.dirname(os.path.realpath(__file__)),
#                                  temp_files['path_jsnice']))
#             nameOrigin = scopeAnalyst.nameOrigin
#             isGlobal = scopeAnalyst.isGlobal
#
#             for (name, def_scope) in nameOrigin.iterkeys():
#
#                 pos = scopeAnalyst.nameDefScope2pos[(name, def_scope)]
#                 (lin,col) = iBuilder.revFlatMat[pos]
#                 (tok_lin,tok_col) = iBuilder.revTokMap[(lin,col)]
#
#                 candidates.append(('JSNice', def_scope,
#                                    tok_lin, tok_col,
#                                    isGlobal.get((name, pos), True),
#                                    name, '',''))
#         except:
#             cleanup(temp_files)
#             return (js_file_path, None, 'ScopeAnalyst fail')

        # Compute scoping: name2scope is a dictionary where keys
        # are (name, start_index) tuples and values are scope identifiers.
        # Note: start_index is a flat (unidimensional) index,
        # not a (line_chr_idx, col_chr_idx) index.
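        # For example (hypothetical): scopeAnalyst.name2scope[('a', 1042)]
        # would map the use of 'a' at flat character index 1042 to the
        # identifier of its defining scope.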
        try:
            scopeAnalyst = ScopeAnalyst(
                os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             temp_files['path_tmp_u_a']))
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'ScopeAnalyst fail')

        # Baseline translation: No renaming, no scoping
        no_renaming = []
        for _line_idx, line in enumerate(iBuilder_ugly.tokens):
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")

        with open(temp_files['f2'], 'w') as f_no_renaming:
            f_no_renaming.writelines(no_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
                           'train.no_renaming', 'tuning', 'moses.ini'))
        (_moses_ok, translation_no_renaming,
         _err) = moses.run(temp_files['f2'])

        nc = processTranslationUnscoped(translation_no_renaming, iBuilder_ugly,
                                        lm_path, temp_files['f2'], output_path,
                                        base_name)
        if nc:
            candidates += nc

#  translation, iBuilder, lm_path,
#                                f_path, output_path, base_name
# Default translation: No renaming
#         no_renaming = []
#         for _line_idx, line in enumerate(iBuilder_ugly.tokens):
#             no_renaming.append(' '.join([t for (_tt,t) in line]) + "\n")
#
#         with open(temp_files['f2'], 'w') as f_no_renaming:
#             f_no_renaming.writelines(no_renaming)
#
#         moses = MosesDecoder(ini_path=os.path.join(ini_path, \
#                            'train.no_renaming', 'tuning', 'moses.ini'))
#         (_moses_ok, translation, _err) = moses.run(temp_files['f2'])

        nc = processTranslationScoped(translation_no_renaming, iBuilder_ugly,
                                      scopeAnalyst, lm_path, temp_files['f2'],
                                      output_path, base_name)
        if nc:
            candidates += nc

        # More complicated renaming: collect the context around
        # each name (global variables, API calls, punctuation)
        # and build a hash of the concatenation.
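        # For example (hypothetical): two minified names that appear in
        # identical contexts (same surrounding globals, API calls and
        # punctuation on their definition line) would receive the same hash
        # name, which presumably gives the translation model consistent
        # tokens across files.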
        hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst,
                                                       iBuilder_ugly,
                                                       twoLines=False,
                                                       debug=False)
        with open(temp_files['f5'], 'w') as f_hash_def_one_renaming:
            f_hash_def_one_renaming.writelines(hash_def_one_renaming)

#        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
#                           'train.hash_def_one_renaming', 'tuning', 'moses.ini'))
#        (_moses_ok,
#            translation_hash_renaming,
#            _err) = moses.run(temp_files['f5'])

        mosesParams = {}
        mosesParams["text"] = hash_def_one_renaming  #lex_ugly.collapsedText
        #mosesParams["align"] = "true"
        #mosesParams["report-all-factors"] = "true"

        mresults = proxy.translate(
            mosesParams)  # __request("translate", mosesParams)
        rawText = Postprocessor(mresults["nbest"])
        translation_hash_renaming = rawText.getProcessedOutput()
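        # Assumption: 'proxy' is an XML-RPC handle to a running Moses server,
        # configured elsewhere in this script. Postprocessor then collapses
        # the returned n-best list into one best translation per line.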

        nc = processTranslationScoped(translation_hash_renaming, iBuilder_ugly,
                                      scopeAnalyst, lm_path, temp_files['f5'],
                                      output_path, base_name)
        if nc:
            candidates += nc


#        nc = processTranslationScopedFallback(translation_hash_renaming,
#                                              translation_no_renaming,
#                                              iBuilder_ugly,
#                                              scopeAnalyst,
#                                              lm_path,
#                                              temp_files['f7'],
#                                              output_path,
#                                              base_name)
#        if nc:
#            candidates += nc

        cleanup(temp_files)
        cleanupRenamed(pid)
        return (js_file_path, 'OK', candidates)

    except Exception, e:
        cleanup(temp_files)
        cleanupRenamed(pid)
        return (js_file_path, None, str(e).replace("\n", ""))
Example #18
    def testIndexBuilder(self):
        '''
        Check that the index builder has correct values for the test files.
        '''
        ib1 = IndexBuilder(self.clearLexed[0].tokenList)
        '''
        # - map from (line,col) position to name
        self.charPosition2Name = {}
        # - map from name to list of (line,col) positions
        self.name2CharPositions = {}
        # - map from (line,col) position to flat position
        self.flatMap = {}
        # - map from flat position to (line,col)
        self.revFlatMat = {}
        # - map from (token_line, token_column) position in the 
        # bidimensional list of tokens to (line,col) text position
        self.tokMap = {}
        # - map from (line,col) position to (token_line, token_column)
        # position in the bidimensional list of tokens
        self.revTokMap = {}
        '''
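        # Hedged sketch of how the maps relate (positions invented): if a
        # token starts at text position (2, 8) and sits at slot (tl, tc) in
        # the bidimensional token list, then for some flat index p:
        #   ib1.flatMap[(2, 8)] == p and ib1.revFlatMat[p] == (2, 8)
        #   ib1.tokMap[(tl, tc)] == (2, 8) and ib1.revTokMap[(2, 8)] == (tl, tc)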
        print([item[1] for item in self.clearLexed[0].tokenList])
        print(ib1.charPosition2Name)
        print(len(ib1.charPosition2Name))
        #print(len(ib1.charPosition2Name) == 53)
        #for i in range(0,22):
        #    linecount = 0
        #    for j in range(0, 110):
        #        if((i,j) in ib1.charPosition2Name):
        #            linecount += 1
        #    print("Line " + str(i+1) + " has " + str(linecount) + " variables.")

        #Test charPosition2Name
        self.assertTrue(len(ib1.charPosition2Name) == 53)
        self.assertTrue(ib1.charPosition2Name[(0, 4)] == u'geom2d')
        self.assertTrue(ib1.charPosition2Name[(2, 8)] == u'a')
        self.assertTrue(ib1.charPosition2Name[(15, 13)] == u'mix')
        self.assertTrue(ib1.charPosition2Name[(16, 17)] == u'k')

        #Test name2charPositions
        self.assertTrue(len(ib1.name2CharPositions) == 16)
        self.assertTrue(
            sum([
                len(value)
                for key, value in ib1.name2CharPositions.iteritems()
            ]) == 53)
        self.assertTrue(len(ib1.name2CharPositions[u'x']) == 7)
        self.assertTrue(len(ib1.name2CharPositions[u'Vector2d']) == 4)

        #Test flatMap
        self.assertTrue(ib1.flatMap[(1, 8)] == 34)
        self.assertTrue(ib1.flatMap[(3, 22)] == 128)

        #Test revFlatMap
        #Note: the attribute is named revFlatMat even though it is the
        #inverse of flatMap; the 'Mat' spelling looks like a typo in IndexBuilder.
        self.assertTrue(len(ib1.flatMap) == len(ib1.revFlatMat))
        for key, value in ib1.flatMap.iteritems():
            self.assertTrue(ib1.revFlatMat[value] == key)

        #Test tokMap and revTokMap
        #tokMap and revTokMap may legitimately differ in size: tokMap also maps
        #whitespace positions, so leading whitespace maps to the token that follows it.
        print(len(ib1.tokMap))
        print(len(ib1.revTokMap))
        #self.assertTrue(len(ib1.tokMap) == len(ib1.revTokMap))
        #i = 0
        for key, value in ib1.tokMap.iteritems():
            if (value in ib1.revTokMap.keys()):
                self.assertTrue(ib1.revTokMap[value] == key)
Example #19
def processFile(row):

    js_file_path = os.path.join(corpus_root, row[0])

    pid = int(multiprocessing.current_process().ident)
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    # Temp files to be created during processing
    temp_files = {
        'path_tmp': 'tmp_%d.js' % pid,
        'path_tmp_b': 'tmp_%d.b.js' % pid,
        'path_tmp_b_a': 'tmp_%d.b.a.js' % pid,
        'path_tmp_u': 'tmp_%d.u.js' % pid,
        'path_tmp_u_a': 'tmp_%d.u.a.js' % pid
    }
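    # For a worker process with pid 1234 this yields hypothetical names like
    # 'tmp_1234.js', 'tmp_1234.b.js' (beautified), 'tmp_1234.u.js' (uglified),
    # plus the '.a' aligned variants produced later by the Aligner.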

    try:
        # Pass through beautifier to fix layout:
        # - once through JSNice without renaming
        # jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
        #
        # (ok, _out, _err) = jsNiceBeautifier.run(js_file_path,
        #                                         temp_files['path_tmp'])
        # if not ok:
        #     cleanup(temp_files)
        #     return (js_file_path, False, 'JSNice Beautifier fail')
        #
        # Weird JSNice renamings despite --no-rename
        # try:
        #     before = set([token for (token, token_type) in
        #                   Lexer(js_file_path).tokenList
        #                   if is_token_subtype(token_type, Token.Name)])
        #     after = set([token for (token, token_type) in
        #                   Lexer(temp_files['path_tmp']).tokenList
        #                   if is_token_subtype(token_type, Token.Name)])
        #
        #     if not before == after:
        #         return (js_file_path, False, 'Weird JSNice renaming')
        #
        # except:
        #     cleanup(temp_files)
        #     return (js_file_path, False, 'Lexer fail')

        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(temp_files['path_tmp'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through uglifyjs pretty print only to fix layout
        clear = Beautifier()
        ok = clear.run(temp_files['path_tmp'], temp_files['path_tmp_b'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, False, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        ok = ugly.run(temp_files['path_tmp_b'], temp_files['path_tmp_u'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, False, 'Uglifier fail')

        # Num tokens before vs after
        try:
            tok_clear = Lexer(temp_files['path_tmp_b']).tokenList
            tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList
        except:
            cleanup(temp_files)
            return (js_file_path, False, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(temp_files)
            return (js_file_path, False, 'Num tokens mismatch')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(temp_files['path_tmp_b'], temp_files['path_tmp_u'])
        except:
            cleanup(temp_files)
            return (js_file_path, False, 'Aligner fail')

        # Check if minification resulted in any change
        # It's not very interesting otherwise
        if open(temp_files['path_tmp_b_a']).read() == \
                open(temp_files['path_tmp_u_a']).read():
            cleanup(temp_files)
            return (js_file_path, False, 'Not minified')

        try:
            lex_ugly = Lexer(temp_files['path_tmp_u_a'])
            _iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, False, 'IndexBuilder fail')

        # Store original and uglified versions
        ok = clear.run(temp_files['path_tmp_b_a'],
                       os.path.join(output_path, '%s.js' % base_name))
        if not ok:
            cleanup(temp_files)
            cleanupProcessed(base_name)
            return (js_file_path, False, 'Beautifier fail')

        ok = clear.run(temp_files['path_tmp_u_a'],
                       os.path.join(output_path, '%s.u.js' % base_name))
        if not ok:
            cleanup(temp_files)
            cleanupProcessed(base_name)
            return (js_file_path, False, 'Beautifier fail')

        cleanup(temp_files)
        return (js_file_path, True, 'OK')

    except Exception, e:
        cleanup(temp_files)
        return (js_file_path, False, str(e))
Example #20
def processFile(js_file_path):
    
#     js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]
    
    if dbg:
        print js_file_path
    
    temp_files = {'orig': '%s.js' % base_name,
                  'minified': '%s.u.js' % base_name,
                  'n2p': '%s.n2p.js' % base_name}
    
    for r_strategy in RS.all():
        temp_files['%s' % (r_strategy)] = \
                    '%s.%s.js' % (base_name, r_strategy)
                    
        for c_strategy in CS.all():
            temp_files['%s_%s' % (r_strategy, c_strategy)] = \
                    '%s.%s.%s.js' % (base_name, r_strategy, c_strategy)
                    
    for k,v in temp_files.iteritems():
        temp_files[k] = os.path.join(output_path, v)
    
    
    candidates = []
    #Minified Name -> Original Name (name, def_scope) -> (name, def_scope)
    min_name_map = {}
    #Hashed Name -> Minified Name (name, def_scope) -> (name, def_scope)
    hash_name_map = {}
    #Minified Name -> jsnice name  (name, def_scope) -> (name, def_scope)
    jsnice_name_map = {}
    #Data for the suggestion model.csv
    #Map of variable (name, def_scope) -> results of variableMetrics features function
    name_features = {}
    
    #Map of maps of variable-suggestion (name, def_scope, suggestion) -> suggestion line counts + suggestionMetrics features function
    #The first key is the renaming strategy
    #Ultimately, we will iterate over this to get the keys out of name_features and build model_rows
    suggestion_features = {}
    
    #Output lines for the suggestion_model.csv
    model_rows = [] 
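    # Hedged illustration of the key/value shapes (names and scopes invented):
    #   min_name_map[('a', scope_min)]        == ('counter', scope_orig)
    #   hash_name_map[('_x1f9a', scope_hash)] == ('a', scope_min)
    #   jsnice_name_map[('a', scope_min)]     == ('count', scope_n2p)
    # where each scope is whatever def-scope identifier the ScopeAnalyst uses.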
    
    if True:
#     try:
#         js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()
        js_text = open(js_file_path, 'r').read()
        
        # Strip comments, replace literals, etc
#         if True:
#         try:
        prepro = WebLMPreprocessor(js_text)
        prepro_text = str(prepro)
#         except:
#             return (js_file_path, None, 'Preprocessor fail')
        
#         print 'Preprocessor'
#         print prepro_text
        
        
        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        
        
        print '\nOK:', ok, 'ERR:', _err
        print tmp_beautified_text
        
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        
            
        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        
#         print '\nOK:', ok, 'ERR:', _err
#         print tmp_minified_text
        
        if not ok:
            return (js_file_path, None, 'Uglifier fail')
        
        
        # Align minified and clear files, in case the beautifier 
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(WebLexer(tmp_beautified_text).tokenList,
                                                                 WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')
        
#         print '\nAligned clear'
#         print aligned_clear
#         print '\nAligned minified'
#         print aligned_minified
#         print
        
        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        
#         print beautified_text
#         print
#         print minified_text
        
        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList
            
            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')
        
        
        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        try:
            iBuilder_clear = IndexBuilder(lex_clear.tokenList)
        except:
            return (js_file_path, None, "IndexBuilder fail on original file.")
            
        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail on minified file.')
        
        
#         print 'Writing'
        
        with open(temp_files['orig'], 'w') as f:
            f.write(beautified_text)
            
        with open(temp_files['minified'], 'w') as f:
            f.write(minified_text)
        
        ######################## 
        #     Nice2Predict
        ########################
        
        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX
        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified'])
        if not ok:
            return (js_file_path, None, 'Nice2Predict fail')

        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        
        with open(temp_files['n2p'], 'w') as f:
            f.write(n2p_text_beautified)
         
        if(True):
        #try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        #except:
        #    return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')
        
#         print 'n2p'
        
        # Save some translation stats to compare different methods
        ts = TranslationSummarizer()
        candidates += [['n2p', ''] + x 
                       for x in ts.compute_summary_unscoped(n2p_iBuilder, 
                                                            n2p_scopeAnalyst)]
            
        ################################################
        # All other JSNaughty variants
        ################################################
    
        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst minified fail')
        
        try:
            scopeAnalyst_clear = WebScopeAnalyst(beautified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst clear fail')
        
        if(not check(iBuilder_clear, scopeAnalyst_clear, n2p_iBuilder, n2p_scopeAnalyst)):
            return (js_file_path, None, 'JsNice restructured file. Skipping..')
        
        #Map the original names to the minified counterparts and minified ones to jsnice renamings
        orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(), key = lambda x: x[1])
        orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(), key = lambda x: x[1])
        orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(), key = lambda x: x[1])

        if(len(orderedVarsOld) != len(orderedVarsNew)):
            return (js_file_path, None, "Old and New Name lists different length")
        
        if(len(orderedVarsOld) != len(orderedVarsN2p)):
            return (js_file_path, None, "JsNice and Old Name lists different length")
        
        
        for i in range(0, len(orderedVarsOld)):
            name_old = orderedVarsOld[i][0]
            def_scope_old = scopeAnalyst_clear.name2defScope[orderedVarsOld[i]]

            name_new = orderedVarsNew[i][0]
            def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]
            min_name_map[(name_new, def_scope_new)] = (name_old, def_scope_old)
            
            name_n2p = orderedVarsN2p[i][0]
            def_scope_n2p = n2p_scopeAnalyst.name2defScope[orderedVarsN2p[i]]
            jsnice_name_map[(name_new, def_scope_new)] = (name_n2p, def_scope_n2p)

        #Once we have the scopeAnalyst, iBuilder, and tokenlist for the minified
        #version, we can get the name properties
        vm = VariableMetrics(scopeAnalyst, iBuilder_ugly, lex_ugly.tokenList)
        variableKeySet = vm.getVariables()
        for variableKey in variableKeySet:
            name_features[variableKey] = vm.getNameMetrics(variableKey)
         
        (name_positions, \
         position_names,
         use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)
          
#         print 'Helpers'

        # Try different renaming strategies (hash, etc)
        for r_strategy, proxy in proxies:
            
            if dbg:
                print '\n====================='
                print r_strategy
                print '=====================\n'
        
#             try:
#             if True:
            # Rename input prior to translation
            preRen = PreRenamer()
            after_text = preRen.rename(r_strategy, 
                                      iBuilder_ugly,
                                      scopeAnalyst)
            
#             print 'After text:'
#             print after_text
#             print
            
            (ok, beautified_after_text, _err) = clear.web_run(after_text)
            if not ok:
                return (js_file_path, None, 'Beautifier fail')
            
#             print 'Beautified:'
#             print beautified_after_text
#             print
            
            if(r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                try:
                    scopeAnalyst_hash = WebScopeAnalyst(after_text)
                except:
                    return (js_file_path, None, "ScopeAnalyst hash fail")

                #Map the hashed names to the minified counterparts.
                orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(), key = lambda x: x[1])
                orderedVarsHash = sorted(scopeAnalyst_hash.name2defScope.keys(), key = lambda x: x[1])

                if(len(orderedVarsMin) != len(orderedVarsHash)):
                    return (js_file_path, None, "Hash and Min lists different length")

                for i in range(0, len(orderedVarsHash)):
                    name_hash = orderedVarsHash[i][0]
                    def_scope_hash = scopeAnalyst_hash.name2defScope[orderedVarsHash[i]]

                    name_min = orderedVarsMin[i][0]
                    def_scope_min = scopeAnalyst.name2defScope[orderedVarsMin[i]]
                    hash_name_map[(name_hash, def_scope_hash)] = (name_min, def_scope_min)


            # Save renamed input to disk for future inspection
            with open(temp_files['%s' % (r_strategy)], 'w') as f:
                f.write(beautified_after_text)
            
            a_lexer = WebLexer(beautified_after_text)
            a_iBuilder = IndexBuilder(a_lexer.tokenList)
            a_scopeAnalyst = WebScopeAnalyst(beautified_after_text)
                
#             except:
#                 return (js_file_path, None, 'Renaming fail')
            
#             print 'Lexing'
            
#             lx = WebLexer(a_iBuilder.get_text())
            lx = WebLexer(a_iBuilder.get_text_wo_literals())
            
#             print a_iBuilder.get_text_wo_literals()
            
            # Translate renamed input
            md = WebMosesDecoder(proxy)
            (ok, translation, _err) = md.run(lx.collapsedText)
            if not ok:
                return (js_file_path, None, 'Moses translation fail')
            
#             print '\ntranslation-------------'
#             print translation
            
#             if r_strategy == RS.HASH_ONE:
#                 exit()
            
            (a_name_positions, 
             a_position_names,
             a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

            nc = []
             
            if translation is not None:
                # Parse moses output
                mp = MosesParser()
                
                if dbg:
                    print '\nr_strategy-----------', r_strategy
                
                name_candidates = mp.parse(translation,
                                           a_iBuilder,
                                           a_position_names)
                # name_candidates is a dictionary of dictionaries: 
                # keys are (name, None) (if scopeAnalyst=None) or 
                # (name, def_scope) tuples (otherwise); 
                # values are suggested translations with the sets 
                # of line numbers on which they appear.
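                # Hedged illustration (suggestions invented): an entry such as
                #   name_candidates[('a', def_scope)] ==
                #       {u'length': set([3, 7]), u'len': set([12])}
                # would mean Moses proposed 'length' for 'a' on lines 3 and 7,
                # and 'len' on line 12.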

#                 print '\nname_candidates before ----------'
#                 for key, suggestions in name_candidates.iteritems():
#                     print key[0], key[1][-50:]
# #                     for use_scope, suggestions in val.iteritems():
# #                         print '\t...', use_scope[-50:]
#                     for name_translation, lines in suggestions.iteritems():
#                         print '\t', name_translation, lines
                    
                # Update name_candidates with some default values 
                # (in this case the translation without any renaming)
                # if the translation is empty
                if r_strategy == RS.NONE:
                    # RS.NONE should always be first, by construction
                    name_candidates_default = name_candidates
                    scopeAnalyst_default = a_scopeAnalyst
                    iBuilder_default = a_iBuilder
                else:
                    for key_default, suggestions in name_candidates_default.iteritems():
#                         (name_default, def_scope_default) = key_default
                        
                        pos_default = scopeAnalyst_default.nameDefScope2pos[key_default]
                        (lin, col) = iBuilder_default.revFlatMat[pos_default]
                        (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)]
                        
                        (name, def_scope) = a_position_names[line_num][line_idx]
                        key = (name, def_scope)
                        
                        for name_translation, lines in suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(name_translation, set([]))
                            name_candidates[key][name_translation].update(lines)
                                
#                         for use_scope, suggestions in val.iteritems():
#                             for name_translation, lines in suggestions.iteritems():
# #                                 key = preRen.simple_direct_map.get(key_default, key_default)
#                                  
#                                 name_candidates.setdefault(key, {})
#                                 name_candidates[key].setdefault(use_scope, {})
#                                 name_candidates[key][use_scope].setdefault(name_translation, set([]))
#                                 name_candidates[key][use_scope][name_translation].update(lines)
                                
#                 print '\nname_candidates after ----------'
#                 for key, suggestions in name_candidates.iteritems():
#                     print key[0], key[1][-50:]
# #                     for use_scope, suggestions in val.iteritems():
# #                         print '\t...', use_scope[-50:]
#                     for name_translation, lines in suggestions.iteritems():
#                         print '\t', name_translation, lines
                                
                cc = ConsistencyController(debug_mode=True)
                ts = TranslationSummarizer()
                
                # An identifier may have been translated inconsistently
                # across different lines (Moses treats each line independently).
                # Try different strategies to resolve inconsistencies, if any
                for c_strategy in CS.all():
                    
                    if dbg:
                        print '\nc_strategy----------', c_strategy

                    #assert(hash_name_map != {})
                    
                    # Compute renaming map (x -> length, y -> width, ...)
                    # Note that x,y here are names after renaming
                    (temp_renaming_map, seen) = cc.computeRenaming(c_strategy,
                                                      name_candidates,
                                                      a_name_positions,
                                                      a_use_scopes,
                                                      a_iBuilder,
                                                      lm_path,
                                                      vm,
                                                      hash_name_map)
                    
                    
                    #After computeRenaming, we have both the entropies stored
                    #if we are in LMDrop strategy and have the suggestions
                    #frequency from name_candidates.  Fill in suggestion_Features
                    if(c_strategy == CS.LMDROP and r_strategy not in suggestion_features):
                        assert(cc.suggestion_cache != None)
                        suggestion_features[r_strategy] = {}
                        #Need some way of iterating over all name, suggestion groups...
                        """
                        name_candidates: dict
                            name_candidates[(name, def_scope)][name_translation] 
                            = set of line numbers in the translation
                        """
                        for variableKey, suggestionDictionary in name_candidates.iteritems():
                            for suggestionName, linesSuggested in suggestionDictionary.iteritems():
                                # Revert variableKey[0] from its hash back to the original minified name.
                                if(r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                                    unhashedKey = hash_name_map[variableKey]
                                    suggestionKey = (unhashedKey[0], unhashedKey[1], suggestionName)
                                else:
                                    suggestionKey = (variableKey[0], variableKey[1], suggestionName)

                                entropyVals = cc.suggestion_cache.getEntropyStats(variableKey, suggestionName)
                                
                                if(True): #eval_dbg only
                                #if(entropyVals != (ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR)):
                                    suggestionValue = [len(linesSuggested)] + \
                                                       list(getSuggestionStats(suggestionName)) + \
                                                       list(entropyVals)
                                                      
                                    suggestion_features[r_strategy][suggestionKey] = suggestionValue
                    
                    
                    if dbg:
                        print '\ntemp_renaming_map-------------'
                        for (name, def_scope), renaming in temp_renaming_map.iteritems():
                            print (name, def_scope[-50:]), renaming

                    # Fall back on original names in input, if 
                    # no translation was suggested
                    postRen = PostRenamer()
                    renaming_map = postRen.updateRenamingMap(a_name_positions, 
                                                             position_names, 
                                                             a_use_scopes,
                                                             temp_renaming_map, 
                                                             seen,
                                                             r_strategy)

#                     new_name_candidates = {}
# 
#                     for (name, def_scope), renaming in temp_renaming_map.iteritems():
#                         (line_num, line_idx) = a_name_positions[(name, def_scope)][0]
#                         (old_name, old_def_scope) = position_names[line_num][line_idx]
#                         
#                         new_name_candidates.setdefault((old_name, old_def_scope), {})
#                         new_name_candidates[(old_name, old_def_scope)][renaming] = set([1])


#                     tmp_renamed_text = postRen.applyRenaming(a_iBuilder, 
#                                                          a_name_positions, 
#                                                          temp_renaming_map)
#                     (ok, tmp_beautified_renamed_text, _err) = clear.web_run(tmp_renamed_text)
#                     if not ok:
#                         return (js_file_path, None, 'Beautifier fail')
#                     
#                     tmp_lexer = WebLexer(tmp_beautified_renamed_text)
#                     tmp_iBuilder = IndexBuilder(tmp_lexer.tokenList)
#                     tmp_scopeAnalyst = WebScopeAnalyst(tmp_beautified_renamed_text)
#                         
#                     (tmp_name_positions, 
#                      tmp_position_names,
#                      tmp_use_scopes) = prepHelpers(tmp_iBuilder, tmp_scopeAnalyst)
                    
#                     renaming_map = postRen.updateRenamingMap(tmp_name_positions, 
#                                                              position_names, 
#                                                              temp_renaming_map, 
#                                                              r_strategy)
#                     
#                     renaming_map = cc.computeRenaming(CS.FREQLEN,
#                                                       new_name_candidates,
#                                                       name_positions,
#                                                       use_scopes,
#                                                       iBuilder_ugly,
#                                                       lm_path)
                    
#                     # Fall back on original names in input, if 
#                     # no translation was suggested
#                     postRen = PostRenamer()
#                     renaming_map = postRen.updateRenamingMap(a_name_positions, 
#                                                              position_names, 
#                                                              temp_renaming_map, 
#                                                              r_strategy)
                    
                    if dbg:
                        print '\nrenaming_map-------------'
                        for (name, def_scope), renaming in renaming_map.iteritems():
                            print (name, def_scope[-50:]), renaming, '(%s)' % temp_renaming_map[(name, def_scope)]
                    
                    # Apply renaming map and save output for future inspection
                    renamed_text = postRen.applyRenaming(a_iBuilder, 
                                                         a_name_positions, 
                                                         renaming_map)
                    
                    print '\nrenamed_text--------------'
                    print renamed_text
                    print
                    
                    (ok, beautified_renamed_text, _err) = clear.web_run(renamed_text)
                    if not ok:
                        return (js_file_path, None, 'Beautifier fail')
                    with open(temp_files['%s_%s' % (r_strategy, c_strategy)], 'w') as f:
                        f.write(beautified_renamed_text)
                    
                    # Save some stats about which names were renamed to what
                    # This is what enables the comparison between the different 
                    # methods.
                    r = [[c_strategy] + x 
                         for x in ts.compute_summary_scoped(renaming_map,
                                                            name_candidates,
                                                            a_iBuilder,
                                                            a_scopeAnalyst)]
                    
                    if not r:
                        return (js_file_path, None, 'Compute summary failed')
                    nc += r
                
            if nc:
                candidates += [[r_strategy] + x for x in nc]
         

        #create the rows for the suggestion_model.csv
        for r_strategy in RS.all():
            for suggestionKey, s_feat in suggestion_features[r_strategy].iteritems():
                variableKey = (suggestionKey[0], suggestionKey[1])
                original_name = min_name_map[variableKey][0]
                js_nice_name = jsnice_name_map[variableKey][0]
                if(variableKey in name_features): #eval_dbg only
                    n_feat = list(name_features[variableKey])
                    #Convert the def_scope to an equivalent, but smaller, easier to read key: (line_num, token_num)
                    newKey = scopeAnalyst.nameDefScope2pos[variableKey]
                    (keyLine, keyToken) = iBuilder_ugly.revFlatMat[newKey]
                    model_rows.append([original_name, r_strategy, suggestionKey[0], keyLine, keyToken, suggestionKey[2], js_nice_name] + n_feat + s_feat)
 
        return (js_file_path, 'OK', candidates, model_rows)
Example #21
def processFile(l):
    
    def localCleanup(output_path, base_names):
        for base_name in base_names:
            tryRemove(os.path.join(output_path, base_name))
    
    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]
    
    pid = int(multiprocessing.current_process().ident)

    candidates = []
    
    try:
#     if True:
        # Temp files to be created during processing
        path_tmp = 'tmp_%d.js' % (pid)
        path_tmp_b = 'tmp_%d.b.js' % (pid)
        path_tmp_b_a = 'tmp_%d.b.a.js' % (pid)
        path_tmp_u = 'tmp_%d.u.js' % (pid)
        path_tmp_u_a = 'tmp_%d.u.a.js' % (pid)
        path_tmp_unugly = 'tmp_%d.n2p.js' % (pid)
        path_tmp_jsnice = 'tmp_%d.jsnice.js' % (pid)
        
        f2 = 'tmp_%d.no_renaming.js' % (pid)
        f3 = 'tmp_%d.basic_renaming.js' % (pid)
        f4 = 'tmp_%d.hash_renaming.js' % (pid)
        f5 = 'tmp_%d.hash_def_one_renaming.js' % (pid)
        f6 = 'tmp_%d.hash_def_two_renaming.js' % (pid)
        
        path_orig = '%s.js' % (base_name)
        path_ugly = '%s.u.js' % (base_name)
        path_unugly = '%s.n2p.js' % (base_name)
        path_jsnice = '%s.jsnice.js' % (base_name)
        
        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(path_tmp)
        except:
            cleanup(pid)
            return (js_file_path, None, 'Preprocessor fail')
        
        # Pass through beautifier to fix layout
        clear = Beautifier()
        ok = clear.run(path_tmp, path_tmp_b+'.tmp1')
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Beautifier 1 fail')
         
        jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
        
        (ok, _out, _err) = jsNiceBeautifier.run(path_tmp_b+'.tmp1', path_tmp_b+'.tmp2')
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'JSNice Beautifier 1 fail')

        ok = clear.run(path_tmp_b+'.tmp2', path_tmp_b)
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Beautifier 1 fail')
         
        # Minify
        ugly = Uglifier()
        ok = ugly.run(path_tmp_b, path_tmp_u)
        
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'Uglifier fail')
        
        # Num tokens before vs after
        try:
            tok_clear = Lexer(path_tmp_b).tokenList
            tok_ugly = Lexer(path_tmp_u).tokenList
        except:
            cleanup(pid)
            return (js_file_path, None, 'Lexer fail')
       
        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(pid)
            return (js_file_path, None, 'Num tokens mismatch')
        
        # Align minified and clear files, in case the beautifier 
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(path_tmp_b, path_tmp_u)
        except:
            cleanup(pid)
            return (js_file_path, None, 'Aligner fail')
        
        try:
#             iBuilder_clear = IndexBuilder(Lexer(path_tmp_b_a).tokenList)
            iBuilder_ugly = IndexBuilder(Lexer(path_tmp_u_a).tokenList)
        except:
            cleanup(pid)
            return (js_file_path, None, 'IndexBuilder fail')
        
        
        # Store original and uglified versions
        ok = clear.run(path_tmp_u_a, os.path.join(output_path, path_ugly))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly])
            return (js_file_path, None, 'Beautifier 2 fail')
        
        ok = clear.run(path_tmp_b_a, os.path.join(output_path, path_orig))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig])
            return (js_file_path, None, 'Beautifier 3 fail')
        
        
        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, _out, _err) = unuglifyJS.run(path_tmp_b_a, path_tmp_unugly)
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig])
            return (js_file_path, None, 'Nice2Predict fail')
        
        ok = clear.run(path_tmp_unugly, path_tmp_unugly+'.tmp1')
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'Beautifier 4 fail')
        
        (ok, _out, _err) = jsNiceBeautifier.run(path_tmp_unugly+'.tmp1', path_tmp_unugly+'.tmp2')
        if not ok:
            cleanup(pid)
            return (js_file_path, None, 'JSNice Beautifier 2 fail')
    
        ok = clear.run(path_tmp_unugly+'.tmp2', os.path.join(output_path, path_unugly))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'Beautifier 4 fail')

        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                                 os.path.dirname(os.path.realpath(__file__)), 
                                 path_tmp_unugly))
            nameOrigin = scopeAnalyst.nameOrigin
            for (name, def_scope) in nameOrigin.iterkeys():
                candidates.append(('Nice2Predict', def_scope, name, '', ''))
        except:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'ScopeAnalyst fail')
    
    
    
        # Run the JSNice from http://www.jsnice.org
        jsNice = JSNice()
        (ok, _out, _err) = jsNice.run(path_tmp_b_a, path_tmp_jsnice)
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, path_unugly])
            return (js_file_path, None, 'JSNice fail')

        ok = clear.run(path_tmp_jsnice, os.path.join(output_path, path_jsnice))
        if not ok:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, \
                                       path_unugly, path_jsnice])
            return (js_file_path, None, 'Beautifier 5 fail')
        
        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                                 os.path.dirname(os.path.realpath(__file__)), 
                                 path_tmp_jsnice))
            nameOrigin = scopeAnalyst.nameOrigin
            for (name, def_scope) in nameOrigin.iterkeys():
                candidates.append(('JSNice', def_scope, name, '', ''))
        except:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, \
                                       path_unugly, path_jsnice])
            return (js_file_path, None, 'ScopeAnalyst fail')
        
        
        
        # Compute scoping: name2scope is a dictionary where keys
        # are (name, start_index) tuples and values are scope identifiers. 
        # Note: start_index is a flat (unidimensional) index, 
        # not a (line_chr_idx, col_chr_idx) index.
        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                                 os.path.dirname(os.path.realpath(__file__)), 
                                 path_tmp_u_a))
            _name2defScope = scopeAnalyst.resolve_scope()
            _isGlobal = scopeAnalyst.isGlobal
            _name2useScope = scopeAnalyst.resolve_use_scope()
        except:
            cleanup(pid)
            localCleanup(output_path, [path_ugly, path_orig, \
                                       path_unugly, path_jsnice])
            return (js_file_path, None, 'ScopeAnalyst fail')
        
        
        no_renaming = []
        for _line_idx, line in enumerate(iBuilder_ugly.tokens):
            no_renaming.append(' '.join([t for (_tt,t) in line]) + "\n")
        
        with open(f2, 'w') as f_no_renaming:
            f_no_renaming.writelines(no_renaming)
        
        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
                           'train.no_renaming', 'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f2)

        nc = processTranslation(translation, iBuilder_ugly, 
                       scopeAnalyst, lm_path, f2,
                       output_path, base_name, clear)
        if nc:
            candidates += nc
        
        
        # Simple renaming: disambiguate overloaded names using scope id
        basic_renaming = renameUsingScopeId(scopeAnalyst, iBuilder_ugly)
        with open(f3, 'w') as f_basic_renaming:
            f_basic_renaming.writelines(basic_renaming)
        
        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
                           'train.basic_renaming', 'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f3)
        nc = processTranslation(translation, iBuilder_ugly, 
                       scopeAnalyst, lm_path, f3,
                       output_path, base_name, clear)
        if nc:
            candidates += nc
            
        
        # More complicated renaming: collect the context around  
        # each name (global variables, API calls, punctuation)
        # and build a hash of the concatenation.
        hash_renaming = renameUsingHashAllPrec(scopeAnalyst, 
                                               iBuilder_ugly,
                                               debug=False)
#         print hash_renaming
        with open(f4, 'w') as f_hash_renaming:
            f_hash_renaming.writelines(hash_renaming)
        
        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
                           'train.hash_renaming', 'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f4)
        
        nc = processTranslation(translation, iBuilder_ugly, 
                       scopeAnalyst, lm_path, f4,
                       output_path, base_name, clear)
        if nc:
            candidates += nc
        
        hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst, 
                                                   iBuilder_ugly, 
                                                   twoLines=False,
                                                   debug=False)
        with open(f5, 'w') as f_hash_def_one_renaming:
            f_hash_def_one_renaming.writelines(hash_def_one_renaming)

        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
                           'train.hash_def_one_renaming', 'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f5)
        
        nc = processTranslation(translation, iBuilder_ugly, 
                       scopeAnalyst, lm_path, f5,
                       output_path, base_name, clear)
        if nc:
            candidates += nc
            

        hash_def_two_renaming = renameUsingHashDefLine(scopeAnalyst, 
                                                   iBuilder_ugly, 
                                                   twoLines=True,
                                                   debug=False)
        with open(f6, 'w') as f_hash_def_two_renaming: 
            f_hash_def_two_renaming.writelines(hash_def_two_renaming)
        
        moses = MosesDecoder(ini_path=os.path.join(ini_path, \
                           'train.hash_def_two_renaming', 'tuning', 'moses.ini'))
        (_moses_ok, translation, _err) = moses.run(f6)
        
        nc = processTranslation(translation, iBuilder_ugly, 
                       scopeAnalyst, lm_path, f6,
                       output_path, base_name, clear)
        if nc:
            candidates += nc
            
        
        cleanup(pid)
        cleanupRenamed(pid)
        return (js_file_path, 'OK', candidates)


    except Exception, e:
        cleanup(pid)
        cleanupRenamed(pid)
        return (js_file_path, None, str(e).replace("\n", ""))
Example #22
def processFile(js_file_path):

    # Load in the minified file
    minified = open(js_file_path).read()

    # Create lexer
    lexer = get_lexer_for_filename(js_file_path)

    # Tokenize input and compute mappings between the different
    # indices used: (line, col), flat, (l,c) in token list
    indexBuilder = IndexBuilder(lex(minified, lexer))
    tokens = indexBuilder.tokens
    #    print 'RUNNING IndexBuilder:', len(tokens)>0

    # Compute scoping: name2scope is a dictionary where keys
    # are (name, start_index) tuples and values are scope identifiers.
    # Note: start_index is a flat (unidimensional) index,
    # not a (line_chr_idx, col_chr_idx) index.
    scopeAnalyst = ScopeAnalyst(js_file_path)
    name2defScope = scopeAnalyst.resolve_scope()
    isGlobal = scopeAnalyst.isGlobal

    name2useScope = scopeAnalyst.name2useScope
    name2pth = scopeAnalyst.name2pth
    nameOrigin = scopeAnalyst.nameOrigin

    scopes = set(name2useScope.values())

    print
    print '=== FOUND %d SCOPES ===' % len(scopes)
    print

    for scope in scopes:
        print 'USE SCOPE:', scope
        lc_list = [
            indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]
            for (t, pos) in name2useScope.keys()
            if name2useScope[(t, pos)] == scope
        ]
        highlight(tokens, lc_list)
        print

    scopes = set(name2defScope.values())

    print
    print '=== FOUND %d NAME SCOPES ===' % len(scopes)
    print

    for scope in scopes:
        print 'DEF SCOPE:', scope
        lc_list = [
            indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]
            for (t, pos) in name2defScope.keys()
            if name2defScope[(t, pos)] == scope
        ]
        highlight(tokens, lc_list)
        print

    # Discover the path to the source map
    map_path = sourcemap.discover(minified)
    # Read and parse our sourcemap
    if map_path:
        sourcemapIndex = sourcemap.load(open(map_path))

    # Cluster names by scope
    nameScope2Positions = {}

    # Index data by (name,scope)
    for token, l in indexBuilder.name2CharPositions.iteritems():
        for (line, col) in sorted(l, key=lambda (a, b): (a, b)):
            pos = indexBuilder.flatMap[(line, col)]
            if name2defScope.has_key((token, pos)):
                scope = name2defScope[(token, pos)]
                use_scope = name2useScope[(token, pos)]
                pth = name2pth[(token, pos)]

                glb = isGlobal[(token, pos)]

                nameScope2Positions.setdefault((token, scope, glb), [])
                nameScope2Positions[(token, scope, glb)].append((line, col))
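                # Hedged sketch (positions invented): after this loop,
                #   nameScope2Positions[('a', scope, False)] == [(3, 4), (7, 10)]
                # lists every (line, col) text position where the local 'a'
                # from this def scope appears in the minified source.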

#                print token, pos
#                print 'def:', scope
#                print 'use:', use_scope
#                print 'pth:', pth
#                highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]])
#                print

    print
    print

    for (token,scope,glb), positions in sorted(nameScope2Positions.iteritems(), \
                                           key=lambda (x,y):x[0]):

        pos = sorted(positions, key=lambda e: (e[0], e[1]))
        tt = []
        line_tok_idxs = set([])
        for (l, c) in pos:
            (tl, tc) = indexBuilder.revTokMap[(l, c)]
            line_tok_idxs.add(tl)
            p = indexBuilder.flatMap[(l, c)]
            if map_path:
                orig = sourcemapIndex.lookup(line=l, column=c).name
            else:
                orig = token
            print token, scope, (l, c), orig
            tt.append(((tl, tc), p, orig))
#             t.append(orig)

#         if token == 'n':
        print '\nNAME:', token.encode(
            'utf-8'), '( isGlobal =', glb, '; original =', orig, ')'
        #         print scope
        #         highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]])

        for ((tli, tci), p, orig) in tt:
            scope = name2defScope[(token, p)]
            use_scope = name2useScope[(token, p)]
            pth = name2pth[(token, p)]
            origin = nameOrigin[(token, scope)]
#             print token #, p, origin
#             print
#             print 'def:', scope
#             print 'use:', use_scope
#             print 'pth:', pth
#             print

        for tl in sorted(set([tli for ((tli, tci), p, orig) in tt])):
            l = list(tokens[tl])
            for tc in [tci for ((tli, tci), p, orig) in tt if tli == tl]:
                l[tc] = (l[tc][0], unichr(0x2588) + token + unichr(0x2588))


#                 pos = indexBuilder.flatMap[(line,col)]

            print '  ', '%d:' % (tl + 1), ' '.join(
                [x[1].encode('utf-8') for x in l])

        print

    return
Example #23
    def testFiles(self):
        #TODO: Automated checks against the files.
        #Known bugs:  The definitions of sum and numberEquals in test_file1 seem to be pointing to the wrong instance...
        i = 1
        for nextFile in self.fileList:
            print(nextFile)
            lexed = Lexer(nextFile)
            ib = IndexBuilder(lexed.tokenList)
            sa = ScopeAnalyst(nextFile)
            s_min = ScopeAnalyst(
                os.path.join(self.testDir.path, "test_file1.obs.js"))
            #print(s_min.name2defScope)
            #print("TokenList----------------------------------------------------------------")
            #print(lexed.tokenList)
            #print("Index Builder----------------------------------------------------------------")
            #print(ib)
            #print("Scope Analyst----------------------------------------------------------------")
            #print(sa)
            vm = VariableMetrics(sa, ib, lexed.tokenList)
            #print("VM----------------------------------------------------------------")
            #print(vm)
            #print("VM----------------------------------------------------------------")
            for var in vm.getVariables():
                print(var)
                print(
                    "Num Lines,Max Lines,Global Def,Global Usage,For,While,Literal Def,Literal Usage,Max Length Line,Ave Line Length"
                )
                print vm.getNameMetrics(var)

            #Automated tests:
            csv_file = os.path.join(self.testDir.path,
                                    "test_file" + str(i) + ".csv")
            print(csv_file)
            if (os.path.exists(csv_file)):
                with open(csv_file, 'r') as f:
                    csv_reader = csv.reader(f, delimiter=",")
                    #Skip header
                    next(csv_reader, None)
                    for row in csv_reader:
                        key = (row[0], row[1])
                        print(key)
                        (num_lines, max_lines, external_def, external_use,
                         in_for, in_while, literal_def, literal_use,
                         max_length_line,
                         ave_line_length) = vm.getNameMetrics(key)
                        self.assertTrue(num_lines == int(row[2]))
                        self.assertTrue(max_lines == int(row[3]))
                        self.assertTrue(external_def == self.asBool(row[4]))
                        self.assertTrue(external_use == int(row[5]))
                        self.assertTrue(in_for == int(row[6]))
                        self.assertTrue(in_while == int(row[7]))
                        self.assertTrue(literal_def == self.asBool(row[8]))
                        self.assertTrue(literal_use == int(row[9]))
                        self.assertTrue(max_length_line == int(row[10]))
                        self.assertAlmostEqual(ave_line_length,
                                               float(row[11]),
                                               places=3)

            else:
                print("no manually annotated csv file for: " + nextFile)

            break
Example #24
def processFile(l):
    
    js_file_path = l[0]
    
    if js_file_path in seen:
        return (js_file_path, None, 'Skipped')
    
    pid = int(multiprocessing.current_process().ident)
    
    # Temp files to be created during processing
    temp_files = {'path_tmp': 'tmp_%d.js' % pid,
                  'path_tmp_b': 'tmp_%d.b.js' % pid,
                  'path_tmp_b_n': 'tmp_%d.b.n.js' % pid,
                  'path_tmp_u': 'tmp_%d.u.js' % pid,
                  'path_tmp_u_n': 'tmp_%d.u.n.js' % pid,
                  'path_tmp_b_a': 'tmp_%d.b.a.js' % pid,
                  'path_tmp_u_a': 'tmp_%d.u.a.js' % pid}
    
    try:        
        # Strip comments, replace literals, etc
        try:
            prepro = Preprocessor(os.path.join(corpus_root, js_file_path))
            prepro.write_temp_file(temp_files['path_tmp'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Preprocessor fail')
        
        
        # Pass through beautifier to fix layout:
        # - once through JSNice without renaming
#         jsNiceBeautifier = JSNice(flags=['--no-types', '--no-rename'])
#         
#         (ok, _out, _err) = jsNiceBeautifier.run(temp_files['path_tmp'], 
#                                                 temp_files['path_tmp_b_n'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'JSNice Beautifier fail')
        
        
#         # - and another time through uglifyjs pretty print only 
#         clear = Beautifier()
#         ok = clear.run(temp_files['path_tmp_b_n'], 
#                        temp_files['path_tmp_b'])
#         if not ok:
#             cleanup(temp_files)
#             return (js_file_path, None, 'Beautifier fail')
        
        # JSNice is down, so beautify directly with uglifyjs instead:
        clear = Beautifier()
        ok = clear.run(temp_files['path_tmp'], 
                       temp_files['path_tmp_b_n'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Beautifier fail')
        # Normalize
        norm = Normalizer()
        ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)), 
                                 temp_files['path_tmp_b_n']),
                      False, 
                      temp_files['path_tmp_b'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Normalizer fail')
        
        
        
        # Minify
        ugly = Uglifier()
        ok = ugly.run(temp_files['path_tmp_b'], 
                      temp_files['path_tmp_u_n'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Uglifier fail')
        # Normalize
        norm = Normalizer()
        ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)), 
                                 temp_files['path_tmp_u_n']),
                      False, 
                      temp_files['path_tmp_u'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Normalizer fail')
        
        
        
        # Num tokens before vs after
        try:
            tok_clear = Lexer(temp_files['path_tmp_b']).tokenList
            tok_ugly = Lexer(temp_files['path_tmp_u']).tokenList
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Lexer fail')
        
        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            cleanup(temp_files)
            return (js_file_path, None, 'Num tokens mismatch')
        
        
        # Align minified and clear files, in case the beautifier 
        # did something weird
        try:
            aligner = Aligner()
            # This is already the baseline corpus, no (smart) renaming yet
            aligner.align(temp_files['path_tmp_b'], 
                          temp_files['path_tmp_u'])
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'Aligner fail')
        
        try:
            lex_clear = Lexer(temp_files['path_tmp_b_a'])
            iBuilder_clear = IndexBuilder(lex_clear.tokenList)
            
            lex_ugly = Lexer(temp_files['path_tmp_u_a'])
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')
        
        
        
        # Normalize
        norm = Normalizer()
        ok = norm.run(os.path.join(os.path.dirname(os.path.realpath(__file__)), 
                                 temp_files['path_tmp_b']),
                      True, 
                      temp_files['path_tmp_u_n'])
        if not ok:
            cleanup(temp_files)
            return (js_file_path, None, 'Normalizer fail')
        
        try:
            lex_norm = Lexer(temp_files['path_tmp_u_n'])
            iBuilder_norm = IndexBuilder(lex_norm.tokenList)
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'IndexBuilder fail')
        
        normalized = []
        for line_idx, line in enumerate(iBuilder_norm.tokens):
            normalized.append(' '.join([t for (_tt,t) in line]) + "\n")
        
        
        
        # Compute scoping: name2scope is a dictionary where keys
        # are (name, start_index) tuples and values are scope identifiers. 
        # Note: start_index is a flat (unidimensional) index, 
        # not a (line_chr_idx, col_chr_idx) index.
        try:
            scopeAnalyst = ScopeAnalyst(os.path.join(
                                 os.path.dirname(os.path.realpath(__file__)), 
                                 temp_files['path_tmp_u_a']))
#             _name2defScope = scopeAnalyst.resolve_scope()
#             _isGlobal = scopeAnalyst.isGlobal
#             _name2useScope = scopeAnalyst.resolve_use_scope()
        except:
            cleanup(temp_files)
            return (js_file_path, None, 'ScopeAnalyst fail')
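        # For example (with a hypothetical flat index start_index), the
        # IndexBuilder mappings recover positions as follows:
        #   (lin, col) = iBuilder_ugly.revFlatMat[start_index]
        #   (tok_line, tok_idx) = iBuilder_ugly.revTokMap[(lin, col)]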
        
        orig = []
        no_renaming = []
        
        for line_idx, line in enumerate(iBuilder_ugly.tokens):
            orig.append(' '.join([t for (_tt,t) in \
                                  iBuilder_clear.tokens[line_idx]]) + "\n")
            
            no_renaming.append(' '.join([t for (_tt,t) in line]) + "\n")
            
        # Simple renaming: disambiguate overloaded names using scope id
        basic_renaming = renameUsingScopeId(scopeAnalyst, 
                                            iBuilder_ugly)
        
        # More complicated renaming: collect the context around  
        # each name (global variables, API calls, punctuation)
        # and build a hash of the concatenation.
#         hash_renaming = renameUsingHashAllPrec(scopeAnalyst, 
#                                                 iBuilder_ugly,
#                                                 debug=True)
        
        hash_def_one_renaming = renameUsingHashDefLine(scopeAnalyst, 
                                                   iBuilder_ugly, 
                                                   twoLines=False,
                                                   debug=False)

        hash_def_two_renaming = renameUsingHashDefLine(scopeAnalyst, 
                                                    iBuilder_ugly, 
                                                    twoLines=True,
                                                    debug=False)

        cleanup(temp_files)
        return (js_file_path,
                orig, 
                no_renaming, 
                basic_renaming,
                normalized, 
#                 hash_renaming,
                hash_def_one_renaming,
                hash_def_two_renaming)
        
    except Exception, e:
        cleanup(temp_files)
        return (js_file_path, None, str(e))
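
# The cleanup() helper called throughout this example is not shown. A minimal
# sketch consistent with its call sites - which pass either a dict of labeled
# temp paths or a plain list of paths - might look like this (the real
# helper's behavior is an assumption):
import os

def cleanup_sketch(temp_files):
    # Accept either {label: path} or [path, ...], matching both call styles.
    paths = temp_files.values() if isinstance(temp_files, dict) else temp_files
    for path in paths:
        try:
            os.remove(path)
        except OSError:
            pass  # file was never created or already removed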
Example #25
0
    def deobfuscateJS(self, obfuscatedCode, transactionID):
        proxy = xmlrpclib.ServerProxy("http://godeep.cs.ucdavis.edu:8080/RPC2")

        mosesParams = {}
        candidates = []
        baseDir = "/home/ccasal/temp/"
        tempFile = baseDir + str(transactionID) + "_temp.js"
        lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.970k/js.blm.lm"

        preproFile = baseDir + str(transactionID) + "_prepro.js"
        beautFile = baseDir + str(transactionID) + "_beaut.js"

        # Strip comments, replace literals, etc
        try:
            prepro = WebPreprocessor(obfuscatedCode)
            #TODO replace with: prepro = WebPreprocessor(text)
            prepro.write_temp_file(preproFile)
        except:
            cleanup([preproFile])
            print("Preprocessor failed")
            return ("Preprocessor Failed")

        clear = Beautifier()
        #TODO: Need a text version of beautifier to avoid the file read and write.
        #(ok, beautText, err) = clear.webRun(preproText)
        ok = clear.run(preproFile, beautFile)
        print(ok)
        if (not ok):
            cleanup([preproFile, beautFile])
            return ("Beautifier Failed")
            #quit()

        try:
            lex_ugly = Lexer(beautFile)
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            cleanup([preproFile, beautFile])
            print("IndexBuilder fail")
            return ("IndexBuilder Failed")

        lex_ugly.write_temp_file(tempFile)

        #Do Scope related tasks
        #a raw text version
        try:
            scopeAnalyst = ScopeAnalyst(tempFile)
        except:
            cleanup({"temp": tempFile})
            print("ScopeAnalyst Fail")
            return ("ScopeAnalyst Failed")

        #Do Rename related tasks
        #In our case, I don't think we need to actually do anything for no_renaming
        #no_renaming = []
        #for _line_idx, line in enumerate(iBuilder_ugly.tokens):
        #    no_renaming.append(' '.join([t for (_tt,t) in line]) + "\n")

        #Hash_def_one_renaming
        #beautText = renameUsingHashDefLine(scopeAnalyst,
        #                                               iBuilder_ugly,
        #                                               twoLines=False,
        #                                                debug=False)
        print(lex_ugly.collapsedText)
        mosesParams["text"] = lex_ugly.collapsedText
        mosesParams["align"] = "true"
        mosesParams["report-all-factors"] = "true"

        results = proxy.translate(
            mosesParams)  # __request("translate", mosesParams)
        rawText = Postprocessor(results["nbest"])
        translation = rawText.getProcessedOutput()

        #Send to output:
        cleanup([preproFile, beautFile, tempFile])
        return (translation)
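
# For reference: a minimal standalone sketch of querying a Moses server over
# XML-RPC, mirroring the proxy.translate() call above. The URL is the one
# used in this example; the input snippet is hypothetical and the server
# must be running with its xml-rpc interface enabled.
import xmlrpclib

def moses_query_sketch():
    proxy = xmlrpclib.ServerProxy("http://godeep.cs.ucdavis.edu:8080/RPC2")
    mosesParams = {"text": "var a = 1 ;",
                   "align": "true",
                   "report-all-factors": "true"}
    results = proxy.translate(mosesParams)
    # 'nbest' holds the candidate translations when the server returns them
    return results.get("nbest", [])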
Example #26
0
def processFile(js_file_path):

    try:

        # Num tokens before vs after
        try:
            tok1 = Lexer(os.path.join(files_root, 'orig',
                                      js_file_path)).tokenList
            tok2 = Lexer(os.path.join(files_root, 'no_renaming',
                                      js_file_path)).tokenList
            #             tok3 = Lexer(os.path.join(files_root, 'basic_renaming', js_file_path)).tokenList
            #             tok4 = Lexer(os.path.join(files_root, 'normalized', js_file_path)).tokenList
            tok5 = Lexer(
                os.path.join(files_root, 'hash_def_one_renaming',
                             js_file_path)).tokenList
            tok6 = Lexer(
                os.path.join(files_root, 'hash_def_two_renaming',
                             js_file_path)).tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(set([len(tok1), len(tok2), len(tok5), len(tok6)])) == 1:
            return (js_file_path, None, 'Num tokens mismatch')

        clear = Beautifier()
        # Align minified and clear files, in case the beautifier
        # did something weird
        aligner = Aligner()

        (aligned1, aligned2) = aligner.web_align(tok1, tok2)

        (ok, beautified1, _err) = clear.web_run(aligned1)
        tok11 = WebLexer(beautified1).tokenList

        (ok, beautified2, _err) = clear.web_run(aligned2)
        tok22 = WebLexer(beautified2).tokenList

        (aligned5, aligned2) = aligner.web_align(tok5, tok2)

        (ok, beautified5, _err) = clear.web_run(aligned5)
        tok55 = WebLexer(beautified5).tokenList

        (aligned6, aligned2) = aligner.web_align(tok6, tok2)

        (ok, beautified6, _err) = clear.web_run(aligned6)
        tok66 = WebLexer(beautified6).tokenList

        #         try:
        #             aligner = Aligner()
        #             # This is already the baseline corpus, no (smart) renaming yet
        #             aligner.align(temp_files['path_tmp_b'],
        #                           temp_files['path_tmp_u'])
        #         except:
        #             return (js_file_path, None, 'Aligner fail')

        try:
            iBuilder1 = IndexBuilder(tok11)
            iBuilder2 = IndexBuilder(tok22)
            #             iBuilder3 = IndexBuilder(tok3)
            #             iBuilder4 = IndexBuilder(tok4)
            iBuilder5 = IndexBuilder(tok55)
            iBuilder6 = IndexBuilder(tok66)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        # Check that at least one variable was renamed during minification
        orig_names = set([
            token for line in iBuilder1.tokens for (token_type, token) in line
            if is_token_subtype(token_type, Token.Name)
        ])
        ugly_names = set([
            token for line in iBuilder2.tokens for (token_type, token) in line
            if is_token_subtype(token_type, Token.Name)
        ])
        if not len(orig_names.difference(ugly_names)):
            return (js_file_path, None, 'Not minified')

        orig = []
        no_renaming = []
        #         basic_renaming = []
        #         normalized = []
        hash_def_one_renaming = []
        hash_def_two_renaming = []

        for _line_idx, line in enumerate(iBuilder1.tokens):
            orig.append(' '.join([t for (_tt, t) in line]) + "\n")

        for _line_idx, line in enumerate(iBuilder2.tokens):
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")


#         for _line_idx, line in enumerate(iBuilder3.tokens):
#             basic_renaming.append(' '.join([t for (_tt,t) in line]) + "\n")

#         for _line_idx, line in enumerate(iBuilder4.tokens):
#             normalized.append(' '.join([t for (_tt,t) in line]) + "\n")

        for _line_idx, line in enumerate(iBuilder5.tokens):
            hash_def_one_renaming.append(' '.join([t for (_tt, t) in line]) +
                                         "\n")

        for _line_idx, line in enumerate(iBuilder6.tokens):
            hash_def_two_renaming.append(' '.join([t for (_tt, t) in line]) +
                                         "\n")

        return (
            js_file_path,
            orig,
            no_renaming,
            #                 basic_renaming,
            #                 normalized,
            hash_def_one_renaming,
            hash_def_two_renaming)

    except Exception, e:
        return (js_file_path, None, str(e))
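
# The 'Not minified' check above relies on Pygments token types. A minimal,
# self-contained illustration of collecting identifier names the same way
# (the input snippet is hypothetical):
from pygments.lexers import JavascriptLexer
from pygments.token import Token, is_token_subtype

def collect_names_sketch(source):
    names = set()
    for (token_type, token) in JavascriptLexer().get_tokens(source):
        if is_token_subtype(token_type, Token.Name):
            names.add(token)
    return names

# collect_names_sketch('var foo = bar + 1;') returns a set containing
# 'foo' and 'bar'.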
Example #27
0
    def testHashDefRenaming(self):
        '''
        TODO: Test that the hashing functions use the context correctly for
        both the one-line and two-line options. Goals are to confirm
        a) correct line summarization and b) consistent naming of the same
        variable. Note, however, that two different variables may map to the
        same name when there is insufficient context.
        '''
        #print(self.obsfuscatedTextFiles[0])
        ib1 = IndexBuilder(self.obsLexed[0].tokenList)
        sa1 = ScopeAnalyst(self.obsfuscatedTextFiles[0])

        RS = RenamingStrategies()
        preRen = PreRenamer()
        oneLine1 = preRen.rename(RS.HASH_ONE, ib1, sa1, True)
        twoLine1 = preRen.rename(RS.HASH_TWO, ib1, sa1, True)

        #         oneLine1 = renameUsingHashDefLine(sa1, ib1, False, True)
        #         twoLine1 = renameUsingHashDefLine(sa1, ib1, True, True)

        #print("OneLine1------------------------------------------------")
        #print(oneLine1)
        #print("TwoLine1------------------------------------------------")
        #print(twoLine1)

        #One line tests
        lines = oneLine1.split("\n")
        self.assertTrue(lines[0] == "var geom2d = function ( ) {")
        #var <<var#=numeric.sum,=numeric.numberEquals;>> = numeric . sum , <<var=numeric.sum,#=numeric.numberEquals;>> = numeric . numberEquals ;
        self.assertTrue(
            lines[1] ==
            "var <<var#=numeric.sum,=numeric.numberEquals;>> = numeric . sum , <<var=numeric.sum,#=numeric.numberEquals;>> = numeric . numberEquals ;"
        )
        self.assertTrue(
            lines[3] ==
            "function <<function#(,){>> ( <<function(#,){>> , <<function(,#){>> ) {"
        )
        self.assertTrue(lines[4] == "this . x = <<function(#,){>> ;"
                        )  #Why is x not transformed? Global, can't change...
        #print(lines[7])
        self.assertTrue(
            lines[7] == "u ( <<function#(,){>> , {"
        )  #Why is u not transformed? Because u's hash <<function#(,){>> is already in use in the same scope. (This is why u can be renamed in the two-line version.)
        self.assertTrue(
            lines[16] ==
            "for ( var <<for(var#in)[]=[];>> in <<function(,#){>> ) <<function(#,){>> [ <<for(var#in)[]=[];>> ] = <<function(,#){>> [ <<for(var#in)[]=[];>> ] ;"
        )
        self.assertTrue(lines[20] == "Vector2d : <<function#(,){>>")
        #Two line tests (TODO)
        lines = twoLine1.split("\n")
        self.assertTrue(lines[0] == "var geom2d = function ( ) {")

        self.assertTrue(
            lines[1] ==
            "var <<var#=numeric.sum,=numeric.numberEquals;return#([this.x*.x,this.y*.y]);>> = numeric . sum , <<var=numeric.sum,#=numeric.numberEquals;return#(this.x,.x,)&&(this.y,.y,);>> = numeric . numberEquals ;"
        )
        #                            function <<function#(,){(#,{>> ( <<function(#,){this.x=#;>> , <<function(,#){this.y=#;>> ) {
        self.assertTrue(
            lines[3] ==
            "function <<function#(,){(#,{>> ( <<function(#,){this.x=#;>> , <<function(,#){this.y=#;>> ) {"
        )
        self.assertTrue(lines[4] == "this . x = <<function(#,){this.x=#;>> ;"
                        )  #Why is x not transformed? Global, can't change...

        #u(r, {
        #                            #<<function#(,){#(,{>> ( <<function#(,){(#,{>> , {
        self.assertTrue(
            lines[7] == "<<function#(,){#(,{>> ( <<function#(,){(#,{>> , {"
        )  # is transformed, but order seems backwards.
        self.assertTrue(
            lines[16] ==
            "for ( var <<for(var#in)[]=[];for(varin)[#]=[];>> in <<function(,#){for(varin#)[]=[];>> ) <<function(#,){for(varin)#[]=[];>> [ <<for(var#in)[]=[];for(varin)[#]=[];>> ] = <<function(,#){for(varin#)[]=[];>> [ <<for(var#in)[]=[];for(varin)[#]=[];>> ] ;"
        )  #Not really two lines, but two references?
        self.assertTrue(lines[20] == "Vector2d : <<function#(,){(#,{>>")

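# A toy illustration of the idea behind the <<...>> context hashes asserted
# above (this is not the project's renameUsingHashDefLine): a variable's
# defining line is summarized by dropping the other minifiable names on it
# and marking the variable's own slot with '#'.
def toy_context_hash(def_line_tokens, target, other_locals):
    parts = []
    for tok in def_line_tokens:
        if tok == target:
            parts.append('#')
        elif tok in other_locals:
            parts.append('')
        else:
            parts.append(tok)
    return '<<' + ''.join(parts) + '>>'

# toy_context_hash(['function', 'f', '(', 'a', ',', 'b', ')', '{'],
#                  'f', set(['a', 'b'])) yields '<<function#(,){>>',
# matching the shape of the one-line hashes checked above.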
Example #28
0
def processFile(js_file_path):

    js_file_path = os.path.abspath(js_file_path)

    print 'READING:', js_file_path

    acorn = Acorn()
    (_stdout, acorn_ok) = acorn.run(js_file_path)
    print 'RUNNING Acorn:', acorn_ok

    # Load in the minified file
    minified = open(js_file_path).read()

    b = Beautifier()
    (ok, out, err) = b.web_run(minified)
    #     print out

    # Create lexer
    lexer = get_lexer_for_filename(js_file_path)

    # Tokenize input and compute mappings between the different
    # indices used: (line, col), flat, (l,c) in token list
    indexBuilder = IndexBuilder(lex(minified, lexer))
    tokens = indexBuilder.tokens
    print 'RUNNING IndexBuilder:', len(tokens) > 0

    #nice1 = JSNice()
    #(ok, _out, _err) = nice1.run(js_file_path)
    #print 'RUNNING JSNice:', ok

    #nice2 = UnuglifyJS()
    #(ok, _out, _err) = nice2.run(js_file_path)
    #print 'RUNNING UnuglifyJS:', ok

    _pid = multiprocessing.current_process().ident

    # Compute scoping: name2scope is a dictionary where keys
    # are (name, start_index) tuples and values are scope identifiers.
    # Note: start_index is a flat (unidimensional) index,
    # not a (line_chr_idx, col_chr_idx) index.
    #     scopeAnalyst = ScopeAnalyst(js_file_path)
    #     name2defScope = scopeAnalyst.resolve_scope()
    #     isGlobal = scopeAnalyst.isGlobal

    scopeAnalyst = WebScopeAnalyst(minified)
    name2defScope = scopeAnalyst.resolve_scope()
    isGlobal = scopeAnalyst.isGlobal

    print 'RUNNING ScopeAnalyst:', len(name2defScope) > 0

    name2useScope = scopeAnalyst.name2useScope
    name2pth = scopeAnalyst.name2pth
    nameOrigin = scopeAnalyst.nameOrigin

    scopes = set(name2useScope.values())

    for scope in scopes:
        print scope
        lc_list = [
            indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]
            for (t, pos) in name2useScope.keys()
            if name2useScope[(t, pos)] == scope
        ]
        highlight(tokens, lc_list)
        print

    # Discover the path to the source map
    _map_path = sourcemap.discover(minified)
    # Read and parse our sourcemap
    #     sourcemapIndex = sourcemap.load(open(map_path))
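    # With the 'sourcemap' package, discover() scans the source for a
    # //# sourceMappingURL=... comment and returns its path (or None), and
    # load() parses the map so that sourcemapIndex.lookup(line=l, column=c)
    # can recover the original name, as in the commented lookup further below.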

    # Cluster names by scope
    nameScope2Positions = {}

    # Index data by (name,scope)
    for token, l in indexBuilder.name2CharPositions.iteritems():
        for (line, col) in sorted(l, key=lambda (a, b): (a, b)):
            pos = indexBuilder.flatMap[(line, col)]
            if name2defScope.has_key((token, pos)):
                scope = name2defScope[(token, pos)]
                use_scope = name2useScope[(token, pos)]
                pth = name2pth[(token, pos)]

                glb = isGlobal[(token, pos)]

                nameScope2Positions.setdefault((token, scope, glb), [])
                nameScope2Positions[(token, scope, glb)].append((line, col))

#                 print token, pos
#                 print 'def:', scope
#                 print 'use:', use_scope
#                 print 'pth:', pth
#                 highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]])
#                 print


    for (token,scope,glb), positions in sorted(nameScope2Positions.iteritems(), \
                                           key=lambda (x,y):x[0]):

        if glb:
            continue

        pos = sorted(positions, key=lambda e: (e[0], e[1]))
        #         t = []
        tt = []
        line_tok_idxs = set([])
        for (l, c) in pos:
            #             orig = sourcemapIndex.lookup(line=l, column=c).name
            (tl, tc) = indexBuilder.revTokMap[(l, c)]
            line_tok_idxs.add(tl)
            p = indexBuilder.flatMap[(l, c)]
            tt.append(((tl, tc), p))
#             t.append(orig)

#         if token == 'n':
        print '\nNAME:', token.encode('utf-8'), 'isGlobal =', glb
        #         print scope
        #         highlight(tokens, [indexBuilder.revTokMap[indexBuilder.revFlatMat[pos]]])

        for ((tli, tci), p) in tt:
            scope = name2defScope[(token, p)]
            use_scope = name2useScope[(token, p)]
            pth = name2pth[(token, p)]
            origin = nameOrigin[(token, scope)]
#             print token #, p, origin
#             print
#             print 'def:', scope
#             print 'use:', use_scope
#             print 'pth:', pth
#             print

        for tl in sorted(set([tli for ((tli, tci), p) in tt])):
            l = list(tokens[tl])
            for tc in [tci for ((tli, tci), p) in tt if tli == tl]:
                l[tc] = (l[tc][0], unichr(0x2588) + token + unichr(0x2588))

                # pos = indexBuilder.flatMap[(line, col)]

            print '  ', '%d:' % (tl + 1), ' '.join(
                [x[1].encode('utf-8') for x in l])

        print

    return
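
# A toy version of the token highlighting used above: wrap each occurrence
# of a name in U+2588 (full block) characters so it stands out in terminal
# output. The token list below is a hypothetical stand-in for a row of
# indexBuilder.tokens:
def toy_highlight(line_tokens, name):
    marked = []
    for (_token_type, token) in line_tokens:
        if token == name:
            marked.append(unichr(0x2588) + token + unichr(0x2588))
        else:
            marked.append(token)
    return ' '.join(marked)

# toy_highlight([('kw', u'var'), ('nm', u'n'), ('op', u'='), ('num', u'1')],
#               u'n') yields u'var \u2588n\u2588 = 1'.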
Example #29
0
    def deobfuscateJS(self,
                      obfuscatedCode,
                      use_mix,
                      transactionID,
                      debug_output=False,
                      parallel=True,
                      use_local=True):
        """
        Take a string representing minified javascript code and attempt to
        translate it into a version with better renamings.
        
        Parameters
        ----------
        obfuscatedCode: The minified javascript text.
        
        use_mix: True/False -> should we invoke JSNice and throw the names into the language model mix?
        
        transactionID: an ID for storing temp files - used currently
        only to identify the input to JSNice.
        
        debug_output: should we print debugging output in this pass (TRUE/FALSE)
        
        parallel: enable parallelization performance enhancements -> such as calling the
        moses servers in parallel. 
        Returns
        -------
        A tuple:
            renamed_text - the renamed text
            jsnice_error - "" if no error, otherwise a message stating
                           where the jsnice mixing failed
            Third element is a tuple of TIMING_COUNT performance times
            preprocess time - total time to preprocess before invoking
                            moses servers
            prepre time - how long does the first step of the preprocessor take?
            jsnice time - part of the preprocessing, how long does it take
                        to get and parse jsnice names
            renaming time - how long did the hashing steps in preprocess take
            lex_total_time - how long did all the lexers take,
            builder_time - how long did all the Index Builders take
            scoper_time - how long did all the scopeAnalysts take
            moses time - how long did the moses servers take
            moses_rn_parallel - total time for the parallel moses and renaming
            to complete
            postprocess time - how long did the consistency resolution and
                            language model queries take.
        """

        RS = RenamingStrategies()
        CS = ConsistencyStrategies()

        r_strategy = RS.HASH_ONE
        #c_strategy = CS.FREQLEN # or CS.LM? (CS.LM requires a language model + a querylm from moses)
        #c_strategy = CS.LM
        c_strategy = CS.LOGMODEL

        if (not use_local):
            proxies = MosesProxy().web_proxies
        else:
            proxies = MosesProxy().web_local
        mosesParams = {}

        #lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.970k/js.blm.lm"
        #lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.500k/js.blm.lm"
        lm_path = "./phrase-tables/langmodels/js.blm.lm"

        #if socket.gethostname() == 'bogdan.mac':
        #    lm_path = "/Users/bogdanv/workspace2/deobfuscator/data/lm/js.blm.lm"
        #elif socket.gethostname() == "Caseys-MacBook-Pro.local" or socket.gethostname() == "campus-019-136.ucdavis.edu":
        #    lm_path = "/Users/caseycas/jsnaughty_lms/js970k.blm.lm"

        #Hashed Name -> Minified Name (name, def_scope) -> (name, def_scope)
        hash_name_map = {}
        #Minified Name -> jsnice name  (name, def_scope) -> (name, def_scope)
        jsnice_name_map = {}
        #Record of any errors we get in the js mixing.
        #If this feature is enabled (to be added as a switch on the website)
        #it should not crash the input if there is a failure.  If the query
        #doesn't work for some reason, then we should just use the candidate
        #names provided by moses.
        jsnice_errors = []
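        #Hypothetical entries, for illustration:
        #  hash_name_map[('<<var#=1;>>', some_scope)] = ('a', some_scope)
        #  jsnice_name_map[('a', some_scope)] = ('counter', some_scope)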

        start = time.time()
        # Strip comments, replace literals, etc
        try:
            #if True:
            prepro = WebLMPreprocessor(obfuscatedCode)
            prepro_text = str(prepro)
            if (debug_output):
                print("Prepro_text----------------------------------")
                print(prepro_text)
                print("Prepro_text----------------------------------")

        except:
            return ((prepro_error, "", (0, ) * TIMING_COUNT))

        prepre_end = time.time()
        prepre_time = prepre_end - start
        clear = Beautifier()

        (ok, beautified_text, _err) = clear.web_run(prepro_text)

        if (debug_output):
            print("Beautified Text")
            print(beautified_text)

        if (not ok):
            return ((beaut_error, "", (0, ) * TIMING_COUNT))

        #Due to a bug? in the jsnice web service, we need to save the
        #input text as a file.
        min_input_file = os.path.join(self.tmpDir,
                                      str(transactionID) + ".u.js")
        with open(min_input_file, 'w') as f:
            f.write(beautified_text)

        try:
            #             lex_ugly = Lexer(beautFile)
            lex_ugly = WebLexer(beautified_text)
            if (debug_output):
                print("Lex_ugly---------------------")
                print(lex_ugly.tokenList)
                print("Lex_ugly---------------------")
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:

            return ((ib_error, "", (0, ) * TIMING_COUNT))

        #Do Scope related tasks
        #a raw text version
        try:
            #             scopeAnalyst = ScopeAnalyst(beautFile)
            scopeAnalyst = WebScopeAnalyst(beautified_text)
        except:
            return ((sa_error, "", (0, ) * TIMING_COUNT))

        #Cut short if no variables
        if (not scopeAnalyst.hasMinifiableVariables()):
            return ((beautified_text, "No Minifiable Variables",
                     (0, ) * TIMING_COUNT))
        elif (debug_output):
            print("GLOBAL VAR MAP: " + str(scopeAnalyst.isGlobal))

        #lex_ugly.write_temp_file(tempFile)
        js_start = time.time()
        ########################
        #  Nice2Predict start
        ########################
        #Don't want a crashing failure for jsnice query.
        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX
        # Run the JSNice from http://www.nice2predict.org
        if (use_mix):
            unuglifyJS = UnuglifyJS()
            (ok, n2p_text, _err) = unuglifyJS.run(min_input_file)
            #ok = False #Failure test
            if not ok:
                jsnice_errors.append('Nice2Predict fail')
                #return (js_file_path, None, 'Nice2Predict fail')

        if (use_mix and jsnice_errors == []):
            (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
            if not ok:
                jsnice_errors.append('Beautifier failed for JSNice.')
                #return (js_file_path, None, 'Beautifier fail')

            if (debug_output):
                print("JSNice Text")
                print(n2p_text_beautified)

            try:
                n2p_lexer = WebLexer(n2p_text_beautified)
                n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
                n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
            except:
                jsnice_errors.append(
                    "IndexBuilder or ScopeAnalysted failed for JSNice.")
                #return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

        ########################
        #   Nice2Predict End
        ########################
        js_end = time.time()
        js_time = js_end - js_start
        #Do Scope related tasks

        (name_positions, position_names,
         use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)
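        # prepHelpers is defined elsewhere; judging from its uses below,
        # position_names is indexed as position_names[line_num][line_idx]
        # -> (name, def_scope), with name_positions the reverse mapping and
        # use_scopes the per-use scope information (an assumption).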

        #Map the jsnice names to the minified counterparts.
        #Only attempt if we are error free for jsnice up to this point.
        if (use_mix and jsnice_errors == []):
            try:
                orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])
                orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])

                if (len(orderedVarsNew) != len(orderedVarsN2p)):
                    jsnice_errors.append(
                        "JSNice and minified name lists different lengths.")
                    #raise IndexError("Length Mismatch") #Probably better to have our own defined error type, but this will do for now
                    #return ("JsNice and New Name lists different length")

                for i in range(0, len(orderedVarsNew)):
                    name_new = orderedVarsNew[i][0]
                    def_scope_new = scopeAnalyst.name2defScope[
                        orderedVarsNew[i]]

                    name_n2p = orderedVarsN2p[i][0]
                    def_scope_n2p = n2p_scopeAnalyst.name2defScope[
                        orderedVarsN2p[i]]
                    jsnice_name_map[(name_new,
                                     def_scope_new)] = (name_n2p,
                                                        def_scope_n2p)
            except:
                jsnice_errors.append(
                    "JSNice to minified name map building failed.")


        (_name_positions,
         position_names,
         _use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

        #Note: we want to put these in parallel once we've tested the
        #serial version...
        pre_outer_end = time.time()
        pre_time = pre_outer_end - start
        if (not parallel):
            #Get moses output for no_renaming
            (status, error_msg, translation_default, name_candidates_default,
             iBuilder_default, scopeAnalyst_default, name_positions_default,
             position_names_default, use_scopes_default, hash_name_map_default,
             rn_time_default, m_time_default, lex_time_default,
             post_start_default) = getMosesTranslation(proxies[RS.NONE],
                                                       RS.NONE, RS, clear,
                                                       iBuilder_ugly,
                                                       scopeAnalyst,
                                                       debug_output)
            #print("MOSES NO RENAMING: " + str(m_time_default))
            if (not status):
                return ((error_msg, "", (0, ) * TIMING_COUNT))

            #Get moses output for hash_renaming
            (status, error_msg, translation, name_candidates, a_iBuilder,
             a_scopeAnalyst, a_name_positions, a_position_names, a_use_scopes,
             hash_name_map, rn_time, m_time, lex_time,
             post_start) = getMosesTranslation(proxies[r_strategy], r_strategy,
                                               RS, clear, iBuilder_ugly,
                                               scopeAnalyst, debug_output)

            #print("MOSES HASH RENAMING: " + str(m_time))
            if (not status):
                return ((error_msg, "", (0, ) * TIMING_COUNT))
            m_parallel_time = 0
        else:
            #Parallel version
            none_wrapper = (RS.NONE, RS, clear, iBuilder_ugly, scopeAnalyst,
                            debug_output, use_local)
            hash_wrapper = (r_strategy, RS, clear, iBuilder_ugly, scopeAnalyst,
                            debug_output, use_local)
            wrappers = [none_wrapper, hash_wrapper]

            pool = multiprocessing.Pool(processes=2)

            m_parallel_start = time.time()
            for result in pool.imap(getMosesTranslationParallel, wrappers):
                if (result[0] == RS.NONE):  #No renaming
                    (status, error_msg, translation_default,
                     name_candidates_default, iBuilder_default,
                     scopeAnalyst_default, name_positions_default,
                     position_names_default, use_scopes_default,
                     hash_name_map_default, rn_time_default, m_time_default,
                     lex_time_default, post_start_default) = result[1]

                    #print("MOSES NO RENAMING: " + str(m_time_default))
                    if (not status):
                        return ((error_msg, "", (0, ) * TIMING_COUNT))
                else:
                    (status, error_msg, translation, name_candidates,
                     a_iBuilder, a_scopeAnalyst, a_name_positions,
                     a_position_names, a_use_scopes, hash_name_map, rn_time,
                     m_time, lex_time, post_start) = result[1]

                    #print("MOSES HASH RENAMING: " + str(m_time))
                    if (not status):
                        return ((error_msg, "", (0, ) * TIMING_COUNT))

            m_parallel_time = time.time() - m_parallel_start

        pre_time += rn_time_default + rn_time
        if (debug_output):
            print("Serial: " +
                  str(m_time + m_time_default + rn_time + rn_time_default))
            print("Parallel: " + str(m_parallel_time))

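        # Note: the code below assumes both Moses translations succeeded;
        # if either were None, names like beautified_renamed_text would be
        # unbound at the final return.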
        if translation is not None and translation_default is not None:

            for key_default, suggestions in name_candidates_default.iteritems():
                #                         (name_default, def_scope_default) = key_default

                pos_default = scopeAnalyst_default.nameDefScope2pos[
                    key_default]
                (lin, col) = iBuilder_default.revFlatMat[pos_default]
                (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)]
                (name, def_scope) = a_position_names[line_num][line_idx]
                key = (name, def_scope)

                for name_translation, lines in suggestions.iteritems():
                    name_candidates.setdefault(key, {})
                    name_candidates[key].setdefault(name_translation, set([]))
                    name_candidates[key][name_translation].update(lines)
            # name_candidates is a dictionary of dictionaries:
            # keys are (name, None) (if scopeAnalyst=None) or
            # (name, def_scope) tuples (otherwise);
            # values are suggested translations with the sets
            # of line numbers on which they appear.
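            # e.g. a hypothetical entry:
            #   name_candidates[('a', 'scope1')] = {'counter': set([3, 7]),
            #                                       'idx': set([12])}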
            #if(True):
            if (debug_output):
                print("Name_candidates")
                print(name_candidates)

                print("jsnice_name_map")
                print(jsnice_name_map)

                print("hash_name_map")
                print(hash_name_map)

            # **** BV: This might be all we need to combine Naughty & Nice
            #Only attempt if we are error free for jsnice up to this point.
            if (use_mix and jsnice_errors == []):
                try:
                    name_candidates_copy = deepcopy(name_candidates)
                    for key, suggestions in name_candidates_copy.iteritems():
                        if (debug_output):
                            print("Key: " + str(key))
                            print("Suggestions: " + str(suggestions))
                        if r_strategy == RS.NONE:
                            (name_n2p, def_scope_n2p) = jsnice_name_map[key]
                        else:
                            (name_n2p, def_scope_n2p
                             ) = jsnice_name_map[hash_name_map.get(key, key)]

                        for name_translation, lines in suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(name_n2p, set([]))
                            name_candidates[key][name_n2p].update(lines)
                except:
                    jsnice_errors.append(
                        "Failure while adding jsnice names to candidate pool.")
            cr = ConsistencyController(debug_mode=debug_output)

            # An identifier may have been translated inconsistently
            # across different lines (Moses treats each line independently).
            # Try different strategies to resolve inconsistencies, if any

            # Compute renaming map (x -> length, y -> width, ...)
            # Note that x,y here are names after renaming
            #Hash error is occurring in here.
            try:
                (temp_renaming_map, seen) = cr.computeRenaming(
                    c_strategy, name_candidates, a_name_positions,
                    a_use_scopes, a_iBuilder, lm_path, {}, hash_name_map)
            except:
                return ("Compute renaming fail.", "", (0, ) * TIMING_COUNT)

            if (debug_output):
                print("Temp renaming map")
                print(temp_renaming_map)
            # Fall back on original names in input, if
            # no translation was suggested
            postRen = PostRenamer()
            renaming_map = postRen.updateRenamingMap(a_name_positions,
                                                     position_names,
                                                     a_use_scopes,
                                                     temp_renaming_map, seen,
                                                     r_strategy)
            if (debug_output):
                print("Renaming Map")
                print(renaming_map)
            # Apply renaming map and save output for future inspection
            renamed_text = postRen.applyRenaming(a_iBuilder, a_name_positions,
                                                 renaming_map)

            (ok, beautified_renamed_text,
             _err) = clear.web_run_end(renamed_text)
            #print(name_candidates)
            #print("--------------")
            #print(renamed_text)
            #print("--------------")
            #print(beautified_renamed_text)
            #print("--------------")
            #print(" ".join(jsnice_errors))
            if not ok:
                return ((beaut_error, "", (0, ) * TIMING_COUNT))

            if (debug_output):
                print("Renamed text")
                print(beautified_renamed_text)

        #Time calculations (will need updating when this becomes parallel)
        post_end = time.time()
        post_time = post_end - post_start

        #Record any jsnice errors (but leave output blank if there are none).
        jsnice_error_string = ""
        if (jsnice_errors != []):
            jsnice_error_string = "JSNice mixing attempt failed.  Reporting renaming with only our method. \nJSNice Errors : \n"
            jsnice_error_string += "\n".join(jsnice_errors) + "\n"

        #Tally up the build times for the lexers, indexbuilders and scopers.
        if (not use_mix):
            n2pLexTime = 0
            n2pBuildTime = 0
            n2pSATime = 0
        else:
            n2pLexTime = n2p_lexer.build_time
            n2pBuildTime = n2p_iBuilder.build_time
            n2pSATime = n2p_scopeAnalyst.build_time

        #Lexers
        lex_total_time = lex_time + lex_time_default + lex_ugly.build_time + n2pLexTime
        #IndexBuilders
        builder_time = iBuilder_ugly.build_time + n2pBuildTime + a_iBuilder.build_time + iBuilder_default.build_time
        #scopers
        scoper_time = n2pSATime + scopeAnalyst.build_time + scopeAnalyst_default.build_time + a_scopeAnalyst.build_time

        #Change the presentation of this to return performance information
        #and error codes as separate elements in a tuple
        #New return: translation, jsnice_error, preprocess time, js_time, rename_time
        #m_time, post_time.
        return ((str(beautified_renamed_text), jsnice_error_string,
                 (pre_time, prepre_time, js_time, rn_time + rn_time_default,
                  lex_total_time, builder_time, scoper_time,
                  m_time + m_time_default, m_parallel_time, post_time)))