Beispiel #1
0
def processFile(js_file_path):
    """Preprocess, beautify and tokenize one JavaScript file.

    The file is read from ``files_root``/``js_file_path`` and passed, in
    order, through ``WebLMPreprocessor``, ``Beautifier.web_run``,
    ``WebLexer`` and ``IndexBuilder``.  Every token line exposed by the
    index builder is then rendered as a space-separated string.

    Args:
        js_file_path: path of the input file, relative to ``files_root``.

    Returns:
        ``(js_file_path, lines)`` on success, where ``lines`` holds one
        newline-terminated string per entry of ``IndexBuilder.tokens``.
        ``(js_file_path, None, reason)`` on failure, where ``reason`` names
        the stage that failed or is the text of an unexpected exception.
        Note that the two shapes differ in length (2 vs. 3 items).
    """
    try:
        # Context manager guarantees the handle is closed, even when
        # read() raises.
        with open(os.path.join(files_root, js_file_path), 'r') as f:
            js_text = f.read()

        # Strip comments, replace literals, etc
        try:
            prepro_text = str(WebLMPreprocessor(js_text))
        except Exception:
            # ``except Exception`` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        try:
            tok1 = WebLexer(beautified_text).tokenList
        except Exception:
            return (js_file_path, None, 'Lexer fail')

        try:
            iBuilder1 = IndexBuilder(tok1)
        except Exception:
            return (js_file_path, None, 'IndexBuilder fail')

        # Each line is a sequence of (token_type, token) pairs; only the
        # token text is kept.
        orig = [' '.join([t for (_tt, t) in line]) + "\n"
                for line in iBuilder1.tokens]

        return (js_file_path, orig)

    except Exception as e:
        # Anything unexpected (I/O errors included) is reported to the
        # caller as a failure tuple instead of being raised.
        return (js_file_path, None, str(e))
Beispiel #2
0
def processFile(js_file_path):
    """Build line-aligned token text for four renaming variants of a file.

    The same relative path is lexed under the ``orig``, ``no_renaming``,
    ``hash_def_one_renaming`` and ``hash_def_two_renaming`` sub-directories
    of ``files_root``.  The ``orig`` and both hash variants are each aligned
    against ``no_renaming``; every aligned text is then beautified, lexed
    again with ``WebLexer`` and indexed with ``IndexBuilder``.

    The ``basic_renaming`` and ``normalized`` variants are currently not
    processed.

    Args:
        js_file_path: path relative to each variant directory.

    Returns:
        On success ``(js_file_path, orig, no_renaming,
        hash_def_one_renaming, hash_def_two_renaming)``; each of the four
        trailing items is a list of newline-terminated strings, one per
        token line.
        On failure ``(js_file_path, None, reason)`` where ``reason`` names
        the failed stage/check or is the text of an unexpected exception.
    """

    def render_lines(index_builder):
        # One space-separated, newline-terminated string per token line;
        # each line is a sequence of (token_type, token) pairs.
        return [' '.join([t for (_tt, t) in line]) + "\n"
                for line in index_builder.tokens]

    try:

        # Num tokens before vs after
        try:
            tok1 = Lexer(os.path.join(files_root, 'orig',
                                      js_file_path)).tokenList
            tok2 = Lexer(os.path.join(files_root, 'no_renaming',
                                      js_file_path)).tokenList
            tok5 = Lexer(os.path.join(files_root, 'hash_def_one_renaming',
                                      js_file_path)).tokenList
            tok6 = Lexer(os.path.join(files_root, 'hash_def_two_renaming',
                                      js_file_path)).tokenList
        except Exception:
            # ``except Exception`` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if len(set(len(toks) for toks in (tok1, tok2, tok5, tok6))) != 1:
            return (js_file_path, None, 'Num tokens mismatch')

        clear = Beautifier()
        # Align minified and clear files, in case the beautifier
        # did something weird
        aligner = Aligner()

        # The beautifier's status flag is now checked after every run;
        # previously it was unpacked and ignored, so a beautifier failure
        # only surfaced later as an unrelated exception message.
        (aligned1, aligned2) = aligner.web_align(tok1, tok2)

        (ok, beautified1, _err) = clear.web_run(aligned1)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        tok11 = WebLexer(beautified1).tokenList

        (ok, beautified2, _err) = clear.web_run(aligned2)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        tok22 = WebLexer(beautified2).tokenList

        # Only the first element of the remaining alignments is used; the
        # re-aligned ``no_renaming`` side is discarded.
        (aligned5, _realigned2) = aligner.web_align(tok5, tok2)

        (ok, beautified5, _err) = clear.web_run(aligned5)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        tok55 = WebLexer(beautified5).tokenList

        (aligned6, _realigned2) = aligner.web_align(tok6, tok2)

        (ok, beautified6, _err) = clear.web_run(aligned6)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        tok66 = WebLexer(beautified6).tokenList

        try:
            iBuilder1 = IndexBuilder(tok11)
            iBuilder2 = IndexBuilder(tok22)
            iBuilder5 = IndexBuilder(tok55)
            iBuilder6 = IndexBuilder(tok66)
        except Exception:
            return (js_file_path, None, 'IndexBuilder fail')

        # Check that at least one variable was renamed during minification
        orig_names = set(
            token for line in iBuilder1.tokens for (token_type, token) in line
            if is_token_subtype(token_type, Token.Name))
        ugly_names = set(
            token for line in iBuilder2.tokens for (token_type, token) in line
            if is_token_subtype(token_type, Token.Name))
        if not orig_names.difference(ugly_names):
            return (js_file_path, None, 'Not minified')

        return (
            js_file_path,
            render_lines(iBuilder1),   # orig
            render_lines(iBuilder2),   # no_renaming
            render_lines(iBuilder5),   # hash_def_one_renaming
            render_lines(iBuilder6))   # hash_def_two_renaming

    except Exception as e:
        # Anything unexpected is reported as a failure tuple, not raised.
        return (js_file_path, None, str(e))
def processFile(l):
    """Produce the beautified original and all pre-renamed variants of a file.

    The file ``corpus_root``/``l[0]`` is preprocessed, beautified, minified
    with ``Uglifier``, aligned against its minified form and beautified
    again.  For every renaming strategy in ``RS.all()`` the minified text is
    renamed with ``PreRenamer`` and beautified.

    Side effects:
        Writes the beautified original to ``output_path/orig/<path>`` and
        each renamed variant to ``output_path/<r_strategy>/<path>``.  Files
        are written only after every strategy succeeded.

    Args:
        l: sequence whose first element is the input path relative to
            ``corpus_root``.

    Returns:
        ``(js_file_path, 'OK', None)`` on success, otherwise
        ``(js_file_path, None, reason)`` where ``reason`` names the failed
        stage/check or is the newline-stripped text of an unexpected
        exception.
    """
    js_file_path = l[0]

    try:
        # Context manager guarantees the handle is closed, even when
        # read() raises.
        with open(os.path.join(corpus_root, js_file_path), 'r') as f:
            js_text = f.read()

        # Strip comments, replace literals, etc
        try:
            prepro_text = str(WebLMPreprocessor(js_text))
        except Exception:
            # ``except Exception`` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(
                WebLexer(tmp_beautified_text).tokenList,
                WebLexer(tmp_minified_text).tokenList)
        except Exception:
            return (js_file_path, None, 'Aligner fail')

        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Num tokens before vs after
        try:
            tok_clear = WebLexer(beautified_text).tokenList
            tok_ugly = WebLexer(minified_text).tokenList
        except Exception:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if len(tok_clear) != len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')

        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        try:
            iBuilder_ugly = IndexBuilder(tok_ugly)
        except Exception:
            return (js_file_path, None, 'IndexBuilder fail')

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except Exception:
            return (js_file_path, None, 'ScopeAnalyst fail')

        processed = []

        # Try different renaming strategies (hash, etc)
        for r_strategy in RS.all():
            try:
                # Rename input prior to translation
                preRen = PreRenamer()
                after_text = preRen.rename(r_strategy, iBuilder_ugly,
                                           scopeAnalyst)

                (ok, beautified_after_text, _err) = clear.web_run(after_text)
                if not ok:
                    return (js_file_path, None, 'Beautifier fail')

                processed.append((r_strategy, beautified_after_text))

            except Exception:
                # An exception raised by the renamer *or* the beautifier
                # call above is reported as a renaming failure.
                return (js_file_path, None, 'Renaming fail')

        with open(os.path.join(output_path, 'orig', js_file_path), 'w') as f:
            f.write(beautified_text)

        for (r_strategy, text) in processed:
            with open(os.path.join(output_path, r_strategy, js_file_path),
                      'w') as f:
                f.write(text)

        return (js_file_path, 'OK', None)

    except Exception as e:
        # Newlines are stripped from the message before it is returned.
        return (js_file_path, None, str(e).replace("\n", ""))
Beispiel #4
0
def processFile(l):
    """Run the Nice2Predict and renaming/translation pipeline on one file.

    Steps, in order:

    1. Read ``corpus_root``/``l[0]``, preprocess, beautify, minify, align
       the clear and minified token streams and beautify both again.
    2. Run ``UnuglifyJS`` (Nice2Predict) on the minified temp file and
       record its summary rows.
    3. For every ``(r_strategy, proxy)`` in ``proxies``: pre-rename the
       minified input, translate it with ``WebMosesDecoder(proxy)``, merge
       the name candidates with the Nice2Predict names, and for every
       consistency strategy in ``CS.all()`` compute and apply a renaming.

    Side effects:
        Writes into ``output_path`` (``<base>`` is the input's basename
        without extension): ``<base>.js``, ``<base>.u.js``,
        ``<base>.n2p.js``, ``<base>.<r_strategy>.js`` and
        ``<base>.<r_strategy>.<c_strategy>.js``.

    Args:
        l: sequence whose first element is the input path relative to
            ``corpus_root``.

    Returns:
        ``(js_file_path, 'OK', candidates, model_rows)`` on success.
        ``candidates`` rows start with ``['n2p', '']`` for Nice2Predict and
        with ``[r_strategy, c_strategy]`` for the other variants.
        ``(js_file_path, None, reason)`` when a named stage or check fails.
        ``(js_file_path, None, message, model_rows)`` when an unexpected
        exception is caught (message has newlines stripped).
        ``model_rows`` is never filled in this function, so it is always
        an empty list.
    """
    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    temp_files = {
        'orig': '%s.js' % base_name,
        'minified': '%s.u.js' % base_name,
        'n2p': '%s.n2p.js' % base_name
    }

    for r_strategy in RS.all():
        temp_files['%s' % (r_strategy)] = \
                    '%s.%s.js' % (base_name, r_strategy)

        for c_strategy in CS.all():
            temp_files['%s_%s' % (r_strategy, c_strategy)] = \
                    '%s.%s.%s.js' % (base_name, r_strategy, c_strategy)

    # Turn every temp file name into a path under output_path
    for k, v in temp_files.iteritems():
        temp_files[k] = os.path.join(output_path, v)

    candidates = []
    # Minified Name -> Original Name (name, def_scope) -> (name, def_scope)
    min_name_map = {}
    # Hashed Name -> Minified Name (name, def_scope) -> (name, def_scope)
    hash_name_map = {}
    # Minified Name -> jsnice name  (name, def_scope) -> (name, def_scope)
    jsnice_name_map = {}
    # Output lines for the suggestion_model.csv.
    # NOTE: nothing in this function appends to it, so it is always
    # returned empty.
    model_rows = []

    try:
        # Context manager guarantees the handle is closed, even when
        # read() raises.
        with open(os.path.join(corpus_root, js_file_path), 'r') as f:
            js_text = f.read()

        # Strip comments, replace literals, etc
        try:
            prepro = WebLMPreprocessor(js_text)
            prepro_text = str(prepro)
        except Exception:
            # ``except Exception`` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(
                WebLexer(tmp_beautified_text).tokenList,
                WebLexer(tmp_minified_text).tokenList)
        except Exception:
            return (js_file_path, None, 'Aligner fail')

        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList

            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except Exception:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if len(tok_clear) != len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')

        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except Exception:
            return (js_file_path, None, 'IndexBuilder fail')

        with open(temp_files['orig'], 'w') as f:
            f.write(beautified_text)

        with open(temp_files['minified'], 'w') as f:
            f.write(minified_text)

        ########################
        #     Nice2Predict
        ########################

        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX
        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified'])
        if not ok:
            return (js_file_path, None, 'Nice2Predict fail')

        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        with open(temp_files['n2p'], 'w') as f:
            f.write(n2p_text_beautified)

        try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        except Exception:
            return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

        # Save some translation stats to compare different methods
        ts = TranslationSummarizer()
        candidates += [['n2p', ''] + x for x in ts.compute_summary_unscoped(
            n2p_iBuilder, n2p_scopeAnalyst)]

        ################################################
        # All other JSNaughty variants
        ################################################

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except Exception:
            return (js_file_path, None, 'ScopeAnalyst minified fail')

        try:
            scopeAnalyst_clear = WebScopeAnalyst(beautified_text)
        except Exception:
            return (js_file_path, None, 'ScopeAnalyst clear fail')

        # Map the original names to the minified counterparts.
        # The three key lists are sorted by the second element of each key
        # and then paired by position.
        # NOTE(review): this presumes all three analysts report the same
        # variables in the same def-scope order -- confirm.
        orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])

        if (len(orderedVarsOld) != len(orderedVarsNew)):
            return (js_file_path, None,
                    "Old and New Name lists different length")

        if (len(orderedVarsOld) != len(orderedVarsN2p)):
            return (js_file_path, None,
                    "JsNice and Old Name lists different length")

        for key_old, key_new, key_n2p in zip(orderedVarsOld, orderedVarsNew,
                                             orderedVarsN2p):
            name_old = key_old[0]
            def_scope_old = scopeAnalyst_clear.name2defScope[key_old]

            name_new = key_new[0]
            def_scope_new = scopeAnalyst.name2defScope[key_new]
            min_name_map[(name_new, def_scope_new)] = (name_old, def_scope_old)

            name_n2p = key_n2p[0]
            # Fixed: the def scope of the Nice2Predict name is now taken
            # from the Nice2Predict analyst; it was previously looked up in
            # the minified analyst (copy-paste of the line above).  Only
            # the name part of this map is consumed below.
            def_scope_n2p = n2p_scopeAnalyst.name2defScope[key_n2p]
            jsnice_name_map[(name_new, def_scope_new)] = (name_n2p,
                                                          def_scope_n2p)

        (_name_positions, \
         position_names,
         _use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

        # Try different renaming strategies (hash, etc)
        for r_strategy, proxy in proxies:

            # Rename input prior to translation
            preRen = PreRenamer()
            after_text = preRen.rename(r_strategy, iBuilder_ugly, scopeAnalyst)

            (ok, beautified_after_text, _err) = clear.web_run(after_text)
            if not ok:
                return (js_file_path, None, 'Beautifier fail')

            # Save renamed input to disk for future inspection
            with open(temp_files['%s' % (r_strategy)], 'w') as f:
                f.write(beautified_after_text)

            a_lexer = WebLexer(beautified_after_text)
            a_iBuilder = IndexBuilder(a_lexer.tokenList)
            a_scopeAnalyst = WebScopeAnalyst(beautified_after_text)

            if (r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                # Map the hashed names to the minified counterparts.
                orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])
                orderedVarsHash = sorted(a_scopeAnalyst.name2defScope.keys(),
                                         key=lambda x: x[1])

                if (len(orderedVarsMin) != len(orderedVarsHash)):
                    return (js_file_path, None,
                            "Hash and Min lists different length")

                for key_hash, key_min in zip(orderedVarsHash, orderedVarsMin):
                    name_hash = key_hash[0]
                    def_scope_hash = a_scopeAnalyst.name2defScope[key_hash]

                    name_min = key_min[0]
                    def_scope_min = scopeAnalyst.name2defScope[key_min]
                    hash_name_map[(name_hash,
                                   def_scope_hash)] = (name_min, def_scope_min)

            # We can switch this back once we train models on a corpus with literals
            # lx = WebLexer(a_iBuilder.get_text())
            lx = WebLexer(a_iBuilder.get_text_wo_literals())

            # Translate renamed input
            md = WebMosesDecoder(proxy)
            (ok, translation, _err) = md.run(lx.collapsedText)
            if not ok:
                return (js_file_path, None, 'Moses translation fail')

            (a_name_positions, a_position_names,
             a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

            nc = []

            if translation is not None:
                # Parse moses output
                mp = MosesParser()

                name_candidates = mp.parse(translation, a_iBuilder,
                                           a_position_names)
                # name_candidates is a dictionary of dictionaries:
                # keys are (name, def_scope) tuples;
                # values are suggested translations with the sets
                # of line numbers on which they appear.

                # Update name_candidates with some default values
                # (in this case the translation without any renaming)
                # if the translation is empty
                if r_strategy == RS.NONE:
                    # RS.NONE should always be first, by construction
                    name_candidates_default = name_candidates
                    scopeAnalyst_default = a_scopeAnalyst
                    iBuilder_default = a_iBuilder
                else:
                    # NOTE(review): the *_default names are only bound in
                    # the RS.NONE branch above.  If RS.NONE is not first in
                    # ``proxies`` or its translation was None, this raises
                    # and is reported through the outer handler.
                    for key_default, suggestions in name_candidates_default.iteritems(
                    ):
                        # Locate the default variable's position and find
                        # the name occupying that position after renaming.
                        pos_default = scopeAnalyst_default.nameDefScope2pos[
                            key_default]
                        (lin, col) = iBuilder_default.revFlatMat[pos_default]
                        (line_num,
                         line_idx) = iBuilder_default.revTokMap[(lin, col)]

                        (name,
                         def_scope) = a_position_names[line_num][line_idx]
                        key = (name, def_scope)

                        for name_translation, lines in suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(
                                name_translation, set([]))
                            name_candidates[key][name_translation].update(
                                lines)

                # **** BV: This might be all we need to combine Naughty & Nice
                # Iterate over a copy because name_candidates is mutated
                # inside the loop.
                name_candidates_copy = deepcopy(name_candidates)
                for key, suggestions in name_candidates_copy.iteritems():

                    if r_strategy == RS.NONE:
                        (name_n2p, def_scope_n2p) = jsnice_name_map[key]
                    else:
                        # Hashed keys are mapped back to their minified
                        # key first; other keys are used as they are.
                        (name_n2p,
                         def_scope_n2p) = jsnice_name_map[hash_name_map.get(
                             key, key)]

                    # The Nice2Predict name is added as a candidate on
                    # every line where any suggestion for this key appears.
                    for name_translation, lines in suggestions.iteritems():
                        name_candidates.setdefault(key, {})
                        name_candidates[key].setdefault(name_n2p, set([]))
                        name_candidates[key][name_n2p].update(lines)

                cc = ConsistencyController(debug_mode=False)
                ts = TranslationSummarizer()

                # An identifier may have been translated inconsistently
                # across different lines (Moses treats each line independently).
                # Try different strategies to resolve inconsistencies, if any
                for c_strategy in CS.all():

                    # Compute renaming map (x -> length, y -> width, ...)
                    # Note that x,y here are names after (hash) renaming
                    (temp_renaming_map, seen) = cc.computeRenaming(
                        c_strategy, name_candidates, a_name_positions,
                        a_use_scopes, a_iBuilder, lm_path, {}, hash_name_map)

                    # Fall back on original names in input, if
                    # no translation was suggested
                    postRen = PostRenamer()
                    renaming_map = postRen.updateRenamingMap(
                        a_name_positions, position_names, a_use_scopes,
                        temp_renaming_map, seen, r_strategy)

                    # Apply renaming map and save output for future inspection
                    renamed_text = postRen.applyRenaming(
                        a_iBuilder, a_name_positions, renaming_map)

                    (ok, beautified_renamed_text,
                     _err) = clear.web_run(renamed_text)
                    if not ok:
                        return (js_file_path, None, 'Beautifier fail')
                    with open(temp_files['%s_%s' % (r_strategy, c_strategy)],
                              'w') as f:
                        f.write(beautified_renamed_text)

                    # Save some stats about which names were renamed to what
                    # This is what enables the comparison between the different
                    # methods.
                    r = [[c_strategy] + x for x in ts.compute_summary_scoped(
                        renaming_map, name_candidates, a_iBuilder,
                        a_scopeAnalyst)]

                    if not r:
                        return (js_file_path, None, 'Compute summary failed')
                    nc += r

            if nc:
                candidates += [[r_strategy] + x for x in nc]

        return (js_file_path, 'OK', candidates, model_rows)

    except Exception as e:
        # Unexpected errors are reported (newlines stripped) together with
        # model_rows, giving a 4-tuple unlike the 3-tuple stage failures.
        return (js_file_path, None, str(e).replace("\n", ""), model_rows)
Beispiel #5
0
def processFile(js_file_path):
    
#     js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]
    
    if dbg:
        print js_file_path
    
    temp_files = {'orig': '%s.js' % base_name,
                  'minified': '%s.u.js' % base_name,
                  'n2p': '%s.n2p.js' % base_name}
    
    for r_strategy in RS.all():
        temp_files['%s' % (r_strategy)] = \
                    '%s.%s.js' % (base_name, r_strategy)
                    
        for c_strategy in CS.all():
            temp_files['%s_%s' % (r_strategy, c_strategy)] = \
                    '%s.%s.%s.js' % (base_name, r_strategy, c_strategy)
                    
    for k,v in temp_files.iteritems():
        temp_files[k] = os.path.join(output_path, v)
    
    
    candidates = []
    #Minified Name -> Original Name (name, def_scope) -> (name, def_scope)
    min_name_map = {}
    #Hashed Name -> Minified Name (name, def_scope) -> (name, def_scope)
    hash_name_map = {}
    #Minified Name -> jsnice name  (name, def_scope) -> (name, def_scope)
    jsnice_name_map = {}
    #Data for the suggestion model.csv
    #Map of variable (name, def_scope) -> results of variableMetrics features function
    name_features = {}
    
    #Map of maps of variable-suggestion (name, def_scope, suggestion) -> suggestion line counts + suggestionMetrics features function
    #The first key is the renaming strategy
    #Ultimately, we will iterate over this to get the keys out of name_features and build model_rows
    suggestion_features = {}
    
    #Output Lines for the suggestoin_model.csv
    model_rows = [] 
    
    if True:
#     try:
#         js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()
        js_text = open(js_file_path, 'r').read()
        
        # Strip comments, replace literals, etc
#         if True:
#         try:
        prepro = WebLMPreprocessor(js_text)
        prepro_text = str(prepro)
#         except:
#             return (js_file_path, None, 'Preprocessor fail')
        
#         print 'Preprocessor'
#         print prepro_text
        
        
        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        
        
        print '\nOK:', ok, 'ERR:', _err
        print tmp_beautified_text
        
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        
            
        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        
#         print '\nOK:', ok, 'ERR:', _err
#         print tmp_minified_text
        
        if not ok:
            return (js_file_path, None, 'Uglifier fail')
        
        
        # Align minified and clear files, in case the beautifier 
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(WebLexer(tmp_beautified_text).tokenList,
                                                                 WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')
        
#         print '\nAligned clear'
#         print aligned_clear
#         print '\nAligned minified'
#         print aligned_minified
#         print
        
        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        
#         print beautified_text
#         print
#         print minified_text
        
        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList
            
            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')
        
        
        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        try:
            iBuilder_clear = IndexBuilder(lex_clear.tokenList)
        except:
            return (js_file_path, None, "IndexBuilder fail on original file.")
            
        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail on minified file.')
        
        
#         print 'Writing'
        
        with open(temp_files['orig'], 'w') as f:
            f.write(beautified_text)
            
        with open(temp_files['minified'], 'w') as f:
            f.write(minified_text)
        
        ######################## 
        #     Nice2Predict
        ########################
        
        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX
        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified'])
        if not ok:
            return (js_file_path, None, 'Nice2Predict fail')

        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        
        with open(temp_files['n2p'], 'w') as f:
            f.write(n2p_text_beautified)
         
        if(True):
        #try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        #except:
        #    return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')
        
#         print 'n2p'
        
        # Save some translation stats to compare different methods
        ts = TranslationSummarizer()
        candidates += [['n2p', ''] + x 
                       for x in ts.compute_summary_unscoped(n2p_iBuilder, 
                                                            n2p_scopeAnalyst)]
            
        ################################################
        # All other JSNaughty variants
        ################################################
    
        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst minified fail')
        
        try:
            scopeAnalyst_clear = WebScopeAnalyst(beautified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst clear fail')
        
        if(not check(iBuilder_clear, scopeAnalyst_clear, n2p_iBuilder, n2p_scopeAnalyst)):
            return (js_file_path, None, 'JsNice restructured file. Skipping..')
        
        #Map the original names to the minified counterparts and minified ones to jsnice renamings
        orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(), key = lambda x: x[1])
        orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(), key = lambda x: x[1])
        orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(), key = lambda x: x[1])

        if(len(orderedVarsOld) != len(orderedVarsNew)):
            return (js_file_path, None, "Old and New Name lists different length")
        
        if(len(orderedVarsOld) != len(orderedVarsN2p)):
            return (js_file_path, None, "JsNice and Old Name lists different length")
        
        
        for i in range(0, len(orderedVarsOld)):
            name_old = orderedVarsOld[i][0]
            def_scope_old = scopeAnalyst_clear.name2defScope[orderedVarsOld[i]]

            name_new = orderedVarsNew[i][0]
            def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]
            min_name_map[(name_new, def_scope_new)] = (name_old, def_scope_old)
            
            name_n2p = orderedVarsN2p[i][0]
            def_scope_n2p = scopeAnalyst.name2defScope[orderedVarsNew[i]]
            jsnice_name_map[(name_new, def_scope_new)] = (name_n2p, def_scope_n2p)

        #Once we have the scopeAnalyst, iBuilder, and tokenlist for the minified
        #version, we can get the name properties
        vm = VariableMetrics(scopeAnalyst, iBuilder_ugly, lex_ugly.tokenList)
        variableKeySet = vm.getVariables()
        for variableKey in variableKeySet:
            name_features[variableKey] = vm.getNameMetrics(variableKey)
         
        (name_positions, \
         position_names,
         use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)
          
#         print 'Helpers'

        # Try different renaming strategies (hash, etc)
        for r_strategy, proxy in proxies:
            
            if dbg:
                print '\n====================='
                print r_strategy
                print '=====================\n'
        
#             try:
#             if True:
            # Rename input prior to translation
            preRen = PreRenamer()
            after_text = preRen.rename(r_strategy, 
                                      iBuilder_ugly,
                                      scopeAnalyst)
            
#             print 'After text:'
#             print after_text
#             print
            
            (ok, beautified_after_text, _err) = clear.web_run(after_text)
            if not ok:
                return (js_file_path, None, 'Beautifier fail')
            
#             print 'Beautified:'
#             print beautified_after_text
#             print
            
            if(r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                try:
                    scopeAnalyst_hash = WebScopeAnalyst(after_text)
                except:
                    return (js_file_path, None, "ScopeAnalyst hash fail")

                #Map the hashed names to the minified counterparts.
                orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(), key = lambda x: x[1])
                orderedVarsHash = sorted(scopeAnalyst_hash.name2defScope.keys(), key = lambda x: x[1])

                if(len(orderedVarsMin) != len(orderedVarsHash)):
                    return (js_file_path, None, "Hash and Min lists different length")

                for i in range(0, len(orderedVarsHash)):
                    name_hash = orderedVarsHash[i][0]
                    def_scope_hash = scopeAnalyst_hash.name2defScope[orderedVarsHash[i]]

                    name_min = orderedVarsMin[i][0]
                    def_scope_min = scopeAnalyst.name2defScope[orderedVarsMin[i]]
                    hash_name_map[(name_hash, def_scope_hash)] = (name_min, def_scope_min)


            # Save renamed input to disk for future inspection
            with open(temp_files['%s' % (r_strategy)], 'w') as f:
                f.write(beautified_after_text)
            
            a_lexer = WebLexer(beautified_after_text)
            a_iBuilder = IndexBuilder(a_lexer.tokenList)
            a_scopeAnalyst = WebScopeAnalyst(beautified_after_text)
                
#             except:
#                 return (js_file_path, None, 'Renaming fail')
            
#             print 'Lexing'
            
#             lx = WebLexer(a_iBuilder.get_text())
            lx = WebLexer(a_iBuilder.get_text_wo_literals())
            
#             print a_iBuilder.get_text_wo_literals()
            
            # Translate renamed input
            md = WebMosesDecoder(proxy)
            (ok, translation, _err) = md.run(lx.collapsedText)
            if not ok:
                return (js_file_path, None, 'Moses translation fail')
            
#             print '\ntranslation-------------'
#             print translation
            
#             if r_strategy == RS.HASH_ONE:
#                 exit()
            
            (a_name_positions, 
             a_position_names,
             a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

            nc = []
             
            if translation is not None:
                # Parse moses output
                mp = MosesParser()
                
                if dbg:
                    print '\nr_strategy-----------', r_strategy
                
                name_candidates = mp.parse(translation,
                                           a_iBuilder,
                                           a_position_names)
                # name_candidates is a dictionary of dictionaries: 
                # keys are (name, None) (if scopeAnalyst=None) or 
                # (name, def_scope) tuples (otherwise); 
                # values are suggested translations with the sets 
                # of line numbers on which they appear.

#                 print '\nname_candidates before ----------'
#                 for key, suggestions in name_candidates.iteritems():
#                     print key[0], key[1][-50:]
# #                     for use_scope, suggestions in val.iteritems():
# #                         print '\t...', use_scope[-50:]
#                     for name_translation, lines in suggestions.iteritems():
#                         print '\t', name_translation, lines
                    
                # Update name_candidates with some default values 
                # (in this case the translation without any renaming)
                # if the translation is empty
                if r_strategy == RS.NONE:
                    # RS.NONE should always be first, by construction
                    name_candidates_default = name_candidates
                    scopeAnalyst_default = a_scopeAnalyst
                    iBuilder_default = a_iBuilder
                else:
                    for key_default, suggestions in name_candidates_default.iteritems():
#                         (name_default, def_scope_default) = key_default
                        
                        pos_default = scopeAnalyst_default.nameDefScope2pos[key_default]
                        (lin, col) = iBuilder_default.revFlatMat[pos_default]
                        (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)]
                        
                        (name, def_scope) = a_position_names[line_num][line_idx]
                        key = (name, def_scope)
                        
                        for name_translation, lines in suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(name_translation, set([]))
                            name_candidates[key][name_translation].update(lines)
                                
#                         for use_scope, suggestions in val.iteritems():
#                             for name_translation, lines in suggestions.iteritems():
# #                                 key = preRen.simple_direct_map.get(key_default, key_default)
#                                  
#                                 name_candidates.setdefault(key, {})
#                                 name_candidates[key].setdefault(use_scope, {})
#                                 name_candidates[key][use_scope].setdefault(name_translation, set([]))
#                                 name_candidates[key][use_scope][name_translation].update(lines)
                                
#                 print '\nname_candidates after ----------'
#                 for key, suggestions in name_candidates.iteritems():
#                     print key[0], key[1][-50:]
# #                     for use_scope, suggestions in val.iteritems():
# #                         print '\t...', use_scope[-50:]
#                     for name_translation, lines in suggestions.iteritems():
#                         print '\t', name_translation, lines
                                
                cc = ConsistencyController(debug_mode=True)
                ts = TranslationSummarizer()
                
                # An identifier may have been translated inconsistently
                # across different lines (Moses treats each line independently).
                # Try different strategies to resolve inconsistencies, if any
                for c_strategy in CS.all():
                    
                    if dbg:
                        print '\nc_strategy----------', c_strategy

                    #assert(hash_name_map != {})
                    
                    # Compute renaming map (x -> length, y -> width, ...)
                    # Note that x,y here are names after renaming
                    (temp_renaming_map, seen) = cc.computeRenaming(c_strategy,
                                                      name_candidates,
                                                      a_name_positions,
                                                      a_use_scopes,
                                                      a_iBuilder,
                                                      lm_path,
                                                      vm,
                                                      hash_name_map)
                    
                    
                    #After computeRenaming, we have both the entropies stored
                    #if we are in LMDrop strategy and have the suggestions
                    #frequency from name_candidates.  Fill in suggestion_Features
                    if(c_strategy == CS.LMDROP and r_strategy not in suggestion_features):
                        assert(cc.suggestion_cache != None)
                        suggestion_features[r_strategy] = {}
                        #Need some way of iterating over all name, suggestion groups...
                        """
                        name_candidates: dict
                            name_candidates[(name, def_scope)][name_translation] 
                            = set of line numbers in the translation
                        """
                        for variableKey, suggestionDictionary in name_candidates.iteritems():
                            for suggestionName, linesSuggested in suggestionDictionary.iteritems():
                                # I need to revert variableKey[0] in the suggestion from its hash to its original minified name.
                                if(r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):
                                    unhashedKey = hash_name_map[variableKey]
                                    suggestionKey = (unhashedKey[0], unhashedKey[1], suggestionName)
                                else:
                                    suggestionKey = (variableKey[0], variableKey[1], suggestionName)

                                entropyVals = cc.suggestion_cache.getEntropyStats(variableKey, suggestionName)
                                
                                if(True): #eval_dbg only
                                #if(entropyVals != (ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR, ENTROPY_ERR)):
                                    suggestionValue = [len(linesSuggested)] + \
                                                       list(getSuggestionStats(suggestionName)) + \
                                                       list(entropyVals)
                                                      
                                    suggestion_features[r_strategy][suggestionKey] = suggestionValue
                    
                    
                    if dbg:
                        print '\ntemp_renaming_map-------------'
                        for (name, def_scope), renaming in temp_renaming_map.iteritems():
                            print (name, def_scope[-50:]), renaming

                    # Fall back on original names in input, if 
                    # no translation was suggested
                    postRen = PostRenamer()
                    renaming_map = postRen.updateRenamingMap(a_name_positions, 
                                                             position_names, 
                                                             a_use_scopes,
                                                             temp_renaming_map, 
                                                             seen,
                                                             r_strategy)

#                     new_name_candidates = {}
# 
#                     for (name, def_scope), renaming in temp_renaming_map.iteritems():
#                         (line_num, line_idx) = a_name_positions[(name, def_scope)][0]
#                         (old_name, old_def_scope) = position_names[line_num][line_idx]
#                         
#                         new_name_candidates.setdefault((old_name, old_def_scope), {})
#                         new_name_candidates[(old_name, old_def_scope)][renaming] = set([1])


#                     tmp_renamed_text = postRen.applyRenaming(a_iBuilder, 
#                                                          a_name_positions, 
#                                                          temp_renaming_map)
#                     (ok, tmp_beautified_renamed_text, _err) = clear.web_run(tmp_renamed_text)
#                     if not ok:
#                         return (js_file_path, None, 'Beautifier fail')
#                     
#                     tmp_lexer = WebLexer(tmp_beautified_renamed_text)
#                     tmp_iBuilder = IndexBuilder(tmp_lexer.tokenList)
#                     tmp_scopeAnalyst = WebScopeAnalyst(tmp_beautified_renamed_text)
#                         
#                     (tmp_name_positions, 
#                      tmp_position_names,
#                      tmp_use_scopes) = prepHelpers(tmp_iBuilder, tmp_scopeAnalyst)
                    
#                     renaming_map = postRen.updateRenamingMap(tmp_name_positions, 
#                                                              position_names, 
#                                                              temp_renaming_map, 
#                                                              r_strategy)
#                     
#                     renaming_map = cc.computeRenaming(CS.FREQLEN,
#                                                       new_name_candidates,
#                                                       name_positions,
#                                                       use_scopes,
#                                                       iBuilder_ugly,
#                                                       lm_path)
                    
#                     # Fall back on original names in input, if 
#                     # no translation was suggested
#                     postRen = PostRenamer()
#                     renaming_map = postRen.updateRenamingMap(a_name_positions, 
#                                                              position_names, 
#                                                              temp_renaming_map, 
#                                                              r_strategy)
                    
                    if dbg:
                        print '\nrenaming_map-------------'
                        for (name, def_scope), renaming in renaming_map.iteritems():
                            print (name, def_scope[-50:]), renaming, '(%s)' % temp_renaming_map[(name, def_scope)]
                    
                    # Apply renaming map and save output for future inspection
                    renamed_text = postRen.applyRenaming(a_iBuilder, 
                                                         a_name_positions, 
                                                         renaming_map)
                    
                    print '\nrenamed_text--------------'
                    print renamed_text
                    print
                    
                    (ok, beautified_renamed_text, _err) = clear.web_run(renamed_text)
                    if not ok:
                        return (js_file_path, None, 'Beautifier fail')
                    with open(temp_files['%s_%s' % (r_strategy, c_strategy)], 'w') as f:
                        f.write(beautified_renamed_text)
                    
                    # Save some stats about which names were renamed to what
                    # This is what enables the comparison between the different 
                    # methods.
                    r = [[c_strategy] + x 
                         for x in ts.compute_summary_scoped(renaming_map,
                                                            name_candidates,
                                                            a_iBuilder,
                                                            a_scopeAnalyst)]
                    
                    if not r:
                        return (js_file_path, None, 'Compute summary failed')
                    nc += r
                
            if nc:
                candidates += [[r_strategy] + x for x in nc]
         

        #create the rows for the suggestion_model.csv
        for r_strategy in RS.all():
            for suggestionKey, s_feat in suggestion_features[r_strategy].iteritems():
                variableKey = (suggestionKey[0], suggestionKey[1])
                original_name = min_name_map[variableKey][0]
                js_nice_name = jsnice_name_map[variableKey][0]
                if(variableKey in name_features): #eval_dbg only
                    n_feat = list(name_features[variableKey])
                    #Convert the def_scope to an equivalent, but smaller, easier to read key: (line_num, token_num)
                    newKey = scopeAnalyst.nameDefScope2pos[variableKey]
                    (keyLine, keyToken) = iBuilder_ugly.revFlatMat[newKey]
                    model_rows.append([original_name, r_strategy, suggestionKey[0], keyLine, keyToken, suggestionKey[2], js_nice_name] + n_feat + s_feat)
 
        return (js_file_path, 'OK', candidates, model_rows)
Beispiel #6
0
    def deobfuscateJS(self,
                      obfuscatedCode,
                      use_mix,
                      transactionID,
                      debug_output=False,
                      parallel=True,
                      use_local=True):
        """
        Take a string representing minified javascript code and attempt to
        translate it into a version with better renamings.
        
        Parameters
        ----------
        obfuscatedCode: The minified javascript text.
        
        use_mix: True/False -> should we invoke JSNice and throw the names into the language model mix?
        
        transactionID: an ID for storing temp files - used currently
        only to identify the input to JSNice.
        
        debug_output: should we print debugging output in this pass (TRUE/FALSE)
        
        parallel: enable parallelization performance enhancements -> such as calling the
        moses servers in parallel. 
        Returns
        -------
        A tuple:
            renamed_text - the renamed text
            jsnice_error - "" if no error, otherwise a message stating
                           where the jsnice mixing failed
            Third element is a tuple of TIMING_COUNT performance times
            preprocess time - total time to preprocess before invoking
                            moses servers
            prepre time - how long does the first step of the preprocessor take?
            jsnice time - part of the preprocessing, how long does it take
                        to get and parse jsnice names
            renaming time - how long did the hashing steps in preprocess take
            lex_total_time - how long did all the lexers take,
            builder_time - how long did all the Index Builders take
            scoper_time - how long did all the scopeAnalysts take
            moses time - how long did the moses servers take
            moses_rn_parallel - total time for the parallel moses and renaming
            to complete
            postprocess time - how long did the consistency resolution and
                            language model queries take.
        """

        RS = RenamingStrategies()
        CS = ConsistencyStrategies()

        r_strategy = RS.HASH_ONE
        #c_strategy = CS.FREQLEN # or CS.LM? (CS.LM requires a language model + a querylm from moses)
        #c_strategy = CS.LM
        c_strategy = CS.LOGMODEL

        if (use_local == False):
            proxies = MosesProxy().web_proxies
        else:
            proxies = MosesProxy().web_local
        mosesParams = {}

        #lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.970k/js.blm.lm"
        #lm_path = "/data/bogdanv/deobfuscator/experiments/corpora/corpus.lm.500k/js.blm.lm"
        lm_path = "./phrase-tables/langmodels/js.blm.lm"

        #if socket.gethostname() == 'bogdan.mac':
        #    lm_path = "/Users/bogdanv/workspace2/deobfuscator/data/lm/js.blm.lm"
        #elif socket.gethostname() == "Caseys-MacBook-Pro.local" or socket.gethostname() == "campus-019-136.ucdavis.edu":
        #    lm_path = "/Users/caseycas/jsnaughty_lms/js970k.blm.lm"

        #Hashed Name -> Minified Name (name, def_scope) -> (name, def_scope)
        hash_name_map = {}
        #Minified Name -> jsnice name  (name, def_scope) -> (name, def_scope)
        jsnice_name_map = {}
        #Record of any errors we get in the js mixing.
        #If this feature is enabled (to be added as a switch on the website)
        #it should not crash the input if there is a failure.  If the query
        #doesn't work for some reason, then we should just use the candidate
        #names provided by moses.
        jsnice_errors = []

        start = time.time()
        # Strip comments, replace literals, etc
        try:
            #if True:
            prepro = WebLMPreprocessor(obfuscatedCode)
            prepro_text = str(prepro)
            if (debug_output):
                print("Prepro_text----------------------------------")
                print(prepro_text)
                print("Prepro_text----------------------------------")

        except:
            return ((prepro_error, "", (0, ) * TIMING_COUNT))

        prepre_end = time.time()
        prepre_time = prepre_end - start
        clear = Beautifier()

        (ok, beautified_text, _err) = clear.web_run(prepro_text)

        if (debug_output):
            print("Beautified Text")
            print(beautified_text)

        if (not ok):
            return ((beaut_error, "", (0, ) * TIMING_COUNT))

        #Due to a bug? in the jsnice web service, we need to save the
        #input text as a file.
        min_input_file = os.path.join(self.tmpDir,
                                      str(transactionID) + ".u.js")
        with open(min_input_file, 'w') as f:
            f.write(beautified_text)

        try:
            #             lex_ugly = Lexer(beautFile)
            lex_ugly = WebLexer(beautified_text)
            if (debug_output):
                print("Lex_ugly---------------------")
                print(lex_ugly.tokenList)
                print("Lex_ugly---------------------")
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:

            return ((ib_error, "", (0, ) * TIMING_COUNT))

        #Do Scope related tasks
        #a raw text version
        try:
            #             scopeAnalyst = ScopeAnalyst(beautFile)
            scopeAnalyst = WebScopeAnalyst(beautified_text)
        except:
            return ((sa_error, "", (0, ) * TIMING_COUNT))

        #Cut short if no variables
        if (not scopeAnalyst.hasMinifiableVariables()):
            return ((beautified_text, "No Minifiable Variables",
                     (0, ) * TIMING_COUNT))
        elif (debug_output):
            print("GLOBAL VAR MAP: " + str(scopeAnalyst.isGlobal))

        #lex_ugly.write_temp_file(tempFile)
        js_start = time.time()
        ########################
        #  Nice2Predict start
        ########################
        #Don't want a crashing failure for jsnice query.
        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX
        # Run the JSNice from http://www.nice2predict.org
        if (use_mix):
            unuglifyJS = UnuglifyJS()
            (ok, n2p_text, _err) = unuglifyJS.run(min_input_file)
            #ok = False #Failure test
            if not ok:
                jsnice_errors.append('Nice2Predict fail')
                #return (js_file_path, None, 'Nice2Predict fail')

        if (use_mix and jsnice_errors == []):
            (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
            if not ok:
                jsnice_errors.append('Beautifier failed for JSNice.')
                #return (js_file_path, None, 'Beautifier fail')

            if (debug_output):
                print("JSNice Text")
                print(n2p_text_beautified)

            try:
                n2p_lexer = WebLexer(n2p_text_beautified)
                n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
                n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
            except:
                jsnice_errors.append(
                    "IndexBuilder or ScopeAnalysted failed for JSNice.")
                #return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

        ########################
        #   Nice2Predict End
        ########################
        js_end = time.time()
        js_time = js_end - js_start
        #Do Scope related tasks

        (name_positions, position_names,
         use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

        #Map the jsnice names to the minified counterparts.
        if (use_mix and jsnice_errors == []
            ):  #only attempt if we are error free for jsnice up to this point.
            try:
                orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])
                orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])

                if (len(orderedVarsNew) != len(orderedVarsN2p)):
                    jsnice_errors.append(
                        "JSNice and minified name lists different lengths.")
                    #raise IndexError("Length Mismatch") #Probably better to have our own defined error type, but this will do for now
                    #return ("JsNice and New Name lists different length")

                for i in range(0, len(orderedVarsNew)):
                    name_new = orderedVarsNew[i][0]
                    def_scope_new = scopeAnalyst.name2defScope[
                        orderedVarsNew[i]]

                    name_n2p = orderedVarsN2p[i][0]
                    def_scope_n2p = scopeAnalyst.name2defScope[
                        orderedVarsNew[i]]
                    jsnice_name_map[(name_new,
                                     def_scope_new)] = (name_n2p,
                                                        def_scope_n2p)
            except:
                jsnice_errors.append(
                    "JSNice to minified name map building failed.")


        (_name_positions, \
         position_names,
         _use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

        #Note: we want to put these in parallel once we've tested the
        #serial version...
        pre_outer_end = time.time()
        pre_time = pre_outer_end - start
        if (not parallel):
            #Get moses output for no_renaming
            (status, error_msg, translation_default, name_candidates_default,
             iBuilder_default, scopeAnalyst_default, name_positions_default,
             position_names_default, use_scopes_default, hash_name_map_default,
             rn_time_default, m_time_default, lex_time_default,
             post_start_default) = getMosesTranslation(proxies[RS.NONE],
                                                       RS.NONE, RS, clear,
                                                       iBuilder_ugly,
                                                       scopeAnalyst,
                                                       debug_output)
            #print("MOSES NO RENAMING: " + str(m_time_default))
            if (not status):
                return ((error_msg, "", (0, ) * TIMING_COUNT))

            #Get moses output for hash_renaming
            (status, error_msg, translation, name_candidates, a_iBuilder,
             a_scopeAnalyst, a_name_positions, a_position_names, a_use_scopes,
             hash_name_map, rn_time, m_time, lex_time,
             post_start) = getMosesTranslation(proxies[r_strategy], r_strategy,
                                               RS, clear, iBuilder_ugly,
                                               scopeAnalyst, debug_output)

            #print("MOSES HASH RENAMING: " + str(m_time))
            if (not status):
                return ((error_msg, "", (0, ) * TIMING_COUNT))
            m_parallel_time = 0
        else:
            #Parallel version
            none_wrapper = (RS.NONE, RS, clear, iBuilder_ugly, scopeAnalyst,
                            debug_output, use_local)
            hash_wrapper = (r_strategy, RS, clear, iBuilder_ugly, scopeAnalyst,
                            debug_output, use_local)
            wrappers = [none_wrapper, hash_wrapper]

            pool = multiprocessing.Pool(processes=2)

            m_parallel_start = time.time()
            for result in pool.imap(getMosesTranslationParallel, wrappers):
                if (result[0] == RS.NONE):  #No renaming
                    (status, error_msg, translation_default,
                     name_candidates_default, iBuilder_default,
                     scopeAnalyst_default, name_positions_default,
                     position_names_default, use_scopes_default,
                     hash_name_map_default, rn_time_default, m_time_default,
                     lex_time_default, post_start_default) = result[1]

                    #print("MOSES NO RENAMING: " + str(m_time_default))
                    if (not status):
                        return ((error_msg, "", (0, ) * TIMING_COUNT))
                else:
                    (status, error_msg, translation, name_candidates,
                     a_iBuilder, a_scopeAnalyst, a_name_positions,
                     a_position_names, a_use_scopes, hash_name_map, rn_time,
                     m_time, lex_time, post_start) = result[1]

                    #print("MOSES HASH RENAMING: " + str(m_time))
                    if (not status):
                        return ((error_msg, "", (0, ) * TIMING_COUNT))

            m_parallel_time = time.time() - m_parallel_start

        pre_time += rn_time_default + rn_time
        if (debug_output):
            print("Serial: " +
                  str(m_time + m_time_default + rn_time + rn_time_default))
            print("Parallel: " + str(m_parallel_time))

        if translation is not None and translation_default is not None:

            for key_default, suggestions in name_candidates_default.iteritems(
            ):
                #                         (name_default, def_scope_default) = key_default

                pos_default = scopeAnalyst_default.nameDefScope2pos[
                    key_default]
                (lin, col) = iBuilder_default.revFlatMat[pos_default]
                (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)]
                (name, def_scope) = a_position_names[line_num][line_idx]
                key = (name, def_scope)

                for name_translation, lines in suggestions.iteritems():
                    name_candidates.setdefault(key, {})
                    name_candidates[key].setdefault(name_translation, set([]))
                    name_candidates[key][name_translation].update(lines)
            # name_candidates is a dictionary of dictionaries:
            # keys are (name, None) (if scopeAnalyst=None) or
            # (name, def_scope) tuples (otherwise);
            # values are suggested translations with the sets
            # of line numbers on which they appear.
            #if(True):
            if (debug_output):
                print("Name_candidates")
                print(name_candidates)

                print("jsnice_name_map")
                print(jsnice_name_map)

                print("hash_name_map")
                print(hash_name_map)

            # **** BV: This might be all we need to combine Naughty & Nice
            if (
                    use_mix and jsnice_errors == []
            ):  #only attempt if we are error free for jsnice up to this point.
                try:
                    name_candidates_copy = deepcopy(name_candidates)
                    for key, suggestions in name_candidates_copy.iteritems():
                        if (debug_output):
                            print("Key: " + str(key))
                            print("Suggestions: " + str(suggestions))
                        if r_strategy == RS.NONE:
                            (name_n2p, def_scope_n2p) = jsnice_name_map[key]
                        else:
                            (name_n2p, def_scope_n2p
                             ) = jsnice_name_map[hash_name_map.get(key, key)]

                        for name_translation, lines in suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(name_n2p, set([]))
                            name_candidates[key][name_n2p].update(lines)
                except:
                    jsnice_errors.append(
                        "Failure while adding jsnice names to candidate pool.")
            cr = ConsistencyController(debug_mode=debug_output)

            # An identifier may have been translated inconsistently
            # across different lines (Moses treats each line independently).
            # Try different strategies to resolve inconsistencies, if any

            # Compute renaming map (x -> length, y -> width, ...)
            # Note that x,y here are names after renaming
            #Hash error is occuring in here.
            try:
                (temp_renaming_map, seen) = cr.computeRenaming(
                    c_strategy, name_candidates, a_name_positions,
                    a_use_scopes, a_iBuilder, lm_path, {}, hash_name_map)
            except:
                return ("Compute renaming fail.", "", (0, ) * TIMING_COUNT)

            if (debug_output):
                print("Temp renaming map")
                print(temp_renaming_map)
            # Fall back on original names in input, if
            # no translation was suggested
            postRen = PostRenamer()
            renaming_map = postRen.updateRenamingMap(a_name_positions,
                                                     position_names,
                                                     a_use_scopes,
                                                     temp_renaming_map, seen,
                                                     r_strategy)
            if (debug_output):
                print("Renaming Map")
                print(renaming_map)
            # Apply renaming map and save output for future inspection
            renamed_text = postRen.applyRenaming(a_iBuilder, a_name_positions,
                                                 renaming_map)

            (ok, beautified_renamed_text,
             _err) = clear.web_run_end(renamed_text)
            #print(name_candidates)
            #print("--------------")
            #print(renamed_text)
            #print("--------------")
            #print(beautified_renamed_text)
            #print("--------------")
            #print(" ".join(jsnice_errors))
            if not ok:
                return ((beaut_error, "", (0, ) * TIMING_COUNT))

            if (debug_output):
                print("Renamed text")
                print(beautified_renamed_text)

        #Time Calculations... (Will need to update for when it becomes parallel
        post_end = time.time()
        post_time = post_end - post_start

        #Record any jsnice errors (but leave output blank if there are none).
        jsnice_error_string = ""
        if (jsnice_errors != []):
            jsnice_error_string = "JSNice mixing attempt failed.  Reporting renaming with only our method. \nJSNice Errors : \n"
            jsnice_error_string += "\n".join(jsnice_errors) + "\n"

        #Tally up the build times for the lexers, indexbuilders and scopers.
        if (not use_mix):
            n2pLexTime = 0
            n2pBuildTime = 0
            n2pSATime = 0
        else:
            n2pLexTime = n2p_lexer.build_time
            n2pBuildTime = n2p_iBuilder.build_time
            n2pSATime = n2p_scopeAnalyst.build_time

        #Lexers
        lex_total_time = lex_time + lex_time_default + lex_ugly.build_time + n2pLexTime
        #IndexBuilders
        builder_time = iBuilder_ugly.build_time + n2pBuildTime + a_iBuilder.build_time + iBuilder_default.build_time
        #scopers
        scoper_time = n2pSATime + scopeAnalyst.build_time + scopeAnalyst_default.build_time + a_scopeAnalyst.build_time

        #Change the presentation of this to return performance information
        #and error codes as separate elements in a tuple
        #New return: translation, jsnice_error, preprocess time, js_time, rename_time
        #m_time, post_time.
        return ((str(beautified_renamed_text), jsnice_error_string,
                 (pre_time, prepre_time, js_time, rn_time + rn_time_default,
                  lex_total_time, builder_time, scoper_time,
                  m_time + m_time_default, m_parallel_time, post_time)))
Beispiel #7
0
def getMosesTranslation(proxy,
                        r_strategy,
                        RS,
                        a_beautifier,
                        iBuilder_ugly,
                        scopeAnalyst_ugly,
                        debug_mode=False):
    """
    Run one renaming strategy through Moses and collect its suggestions.

    A helper function so that we can run multiple different renaming
    strategies through moses in a more modular and hopefully parallelizable
    manner.  It renames the minified input according to r_strategy,
    beautifies and re-aligns the renamed text against the minified text,
    rebuilds the lexer / index builder / scope analyst for it, and then
    sends the literal-free text to the moses server behind ``proxy``.

    Parameters
    ----------
    proxy: Handle passed on to segmentedTranslation to reach the moses
    server for this particular renaming strategy.

    r_strategy: One of the renaming strategies from RenamingStrategies

    RS: A renaming strategies object (only RS.HASH_ONE and RS.HASH_TWO are
    read here).

    a_beautifier: a beautify object (its web_run method is used) to make
    sure the renamed text is cleanly formatted.

    iBuilder_ugly: Index Builder for the minified file.

    scopeAnalyst_ugly: Scope Analyst for the minified file.

    debug_mode: Print debug information? (True/False - defaults to False)

    Returns
    -------
    (status, error, translation, name_candidates,
            a_iBuilder, a_scopeAnalyst, a_name_positions,
            a_position_names, a_use_scopes, hash_name_map,
            rn_time, m_time, lex_time, post_start)

    status: Did this complete without error?  If False, then the rest of the
    output besides error is empty/null/zero (a_iBuilder, a_scopeAnalyst,
    translation and hash_name_map are still filled in when the failure
    happened after they were computed).

    error: What is the reason for the failure?  If status is True (successful
    completion) this is "".

    translation: The raw Moses output

    name_candidates: The set of Moses suggestions for this renaming.  Empty
    dict when Moses succeeded but produced no translation (None).

    a_iBuilder, a_scopeAnalyst: Index Builder and Scope Analyst for this
    renaming

    a_name_positions, a_position_names, a_use_scopes: Additional tracking
    info, as returned by prepHelpers

    hash_name_map: a map from the hashed (name, def_scope) pairs to the
    original minified (name, def_scope) pairs.  Empty unless r_strategy is
    RS.HASH_ONE or RS.HASH_TWO.

    rn_time, m_time, lex_time, post_start: The duration of the
    renaming, Moses translation steps, and lexing steps along with the start
    time for the postprocessing of the Moses output.
    """

    def failure(message, raw_translation="", i_builder=None,
                scope_analyst=None, name_map=None):
        # Single place that builds the 14-slot failure tuple so every early
        # exit has exactly the same shape as the success return.
        return (False, message, raw_translation, {}, i_builder, scope_analyst,
                {}, {}, {}, {} if name_map is None else name_map,
                0, 0, 0, 0)

    rn_start = time.time()

    #We need both the base_text and the hashed_text.
    preRen = PreRenamer()
    if debug_mode:
        print("Tokens-------------------")
        print(iBuilder_ugly.tokens)
        print("Tokens-------------------")

    #We always need the non hashed names as a fallback.
    # `except Exception` (rather than a bare except) so that Ctrl-C and
    # interpreter shutdown are not reported as a renaming failure.
    try:
        after_text = preRen.rename(r_strategy, iBuilder_ugly,
                                   scopeAnalyst_ugly)
    except Exception:
        return failure("Renaming failed for " + str(r_strategy))

    (ok, beautified_after_text, _err) = a_beautifier.web_run(after_text)
    if not ok:
        return failure("Beautifier failed on the renamed text for " +
                       str(r_strategy))

    # Align hashed and non hashed  files, in case the beautifier
    # line wrapped the extended lines.
    try:
        aligner = Aligner()
        (aligned_after, _aligned_before) = aligner.web_align(
            WebLexer(beautified_after_text).tokenList,
            WebLexer(iBuilder_ugly.get_text()).tokenList)
    except Exception:
        return failure("Aligner failed on the renamed text for " +
                       str(r_strategy))

    # Rebuild the index structures for the renamed, aligned text.  Guarded
    # so that a lexing/scoping failure is reported through `status` like
    # every other step instead of escaping as an exception.
    try:
        a_lexer = WebLexer(aligned_after)
        a_iBuilder = IndexBuilder(a_lexer.tokenList)
        a_scopeAnalyst = WebScopeAnalyst(aligned_after)
    except Exception:
        return failure("Lexer, IndexBuilder or ScopeAnalyst failed on the "
                       "aligned text for " + str(r_strategy))

    hash_name_map = {}

    if (r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO):

        # NOTE(review): the original author flagged this section as
        # possibly buggy.  Hashed and minified entries are paired purely by
        # their rank after sorting each key list on its second component.
        orderedVarsMin = sorted(scopeAnalyst_ugly.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsHash = sorted(a_scopeAnalyst.name2defScope.keys(),
                                 key=lambda x: x[1])
        if len(orderedVarsMin) != len(orderedVarsHash):
            return failure("Mismatch between minified and hashed names.",
                           i_builder=a_iBuilder,
                           scope_analyst=a_scopeAnalyst)

        # Lengths are equal here, so zip pairs every entry.
        for key_hash, key_min in zip(orderedVarsHash, orderedVarsMin):
            name_hash = key_hash[0]
            def_scope_hash = a_scopeAnalyst.name2defScope[key_hash]

            name_min = key_min[0]
            def_scope_min = scopeAnalyst_ugly.name2defScope[key_min]
            hash_name_map[(name_hash, def_scope_hash)] = (name_min,
                                                          def_scope_min)

    if debug_mode:
        print("HASH NAME MAP LEN: " + str(len(hash_name_map)))

    # We can switch this back once we train models on a corpus with literals
    # lx = WebLexer(a_iBuilder.get_text())
    lx = WebLexer(a_iBuilder.get_text_wo_literals())

    #Performance measures -> wrap up the preprocessing/ renaming
    #phases
    end = time.time()
    rn_time = end - rn_start
    m_start = time.time()

    # Translate renamed input (in segments of SEGMENTED_TRANS_SIZE).
    (ok, translation, _err) = segmentedTranslation(lx, SEGMENTED_TRANS_SIZE,
                                                   proxy, debug_mode)
    if not ok:
        return failure("Moses server failed for " + str(r_strategy),
                       raw_translation=translation,
                       i_builder=a_iBuilder,
                       scope_analyst=a_scopeAnalyst,
                       name_map=hash_name_map)

    m_end = time.time()
    m_time = m_end - m_start

    post_start = time.time()

    (a_name_positions, a_position_names,
     a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

    # Default to "no suggestions" so the return below never hits an unbound
    # name when Moses reports success but hands back no translation.
    name_candidates = {}
    if translation is not None:
        # Parse moses output
        mp = MosesParser()
        if debug_mode:
            print(translation)

        name_candidates = mp.parse(translation, a_iBuilder,
                                   a_position_names)

    lex_time = lx.build_time + a_lexer.build_time
    return (True, "", translation, name_candidates, a_iBuilder, a_scopeAnalyst,
            a_name_positions, a_position_names, a_use_scopes, hash_name_map,
            rn_time, m_time, lex_time, post_start)