def processTranslationUnscoped(translation, iBuilder, lm_path,
                               f_path, output_path, base_name):
    nc = []
    if translation is not None:
        (name_positions, position_names) = prepHelpers(iBuilder, None)

        # Parse moses output
        mp = MosesParser()
        name_candidates = mp.parse(translation, iBuilder, position_names)
        # name_candidates is a dictionary of dictionaries:
        # keys are (name, None) (if scopeAnalyst=None) or
        # (name, def_scope) tuples (otherwise);
        # values are suggested translations with the sets
        # of line numbers on which they appear.
        # See the sketch after this function for its shape.

        cs = ConsistencyResolver()
        ts = TranslationSummarizer()

        r = ts.compute_summary_unscoped(
                cs.computeLMRenaming(name_candidates, name_positions,
                                     iBuilder, lm_path),
                f_path, 'lm', output_path, base_name,
                name_candidates, name_positions, iBuilder)
        if not r:
            return False
        nc += r

        # (disabled) a 'len'-only variant, ranking candidates with the
        # key lambda e: -len(e[0]), was previously summarized here
        # under the tag 'len'.

        r = ts.compute_summary_unscoped(
                cs.computeFreqLenRenaming(name_candidates, name_positions,
                                          lambda e: (-e[1], -len(e[0]))),
                f_path, 'freqlen', output_path, base_name,
                name_candidates, name_positions, iBuilder)
        if not r:
            return False
        nc += r

    return nc
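
# --- Illustrative sketch (not part of the pipeline): the shape of the
# name_candidates structure described above. The names, scope string,
# and line numbers below are made up for illustration.
def _example_name_candidates():
    # One minified identifier, keyed by (name, def_scope), mapping each
    # suggested translation to the set of translated-line numbers on
    # which Moses proposed it.
    name_candidates = {
        ('a', 'function f() { ... }'): {
            'index': set([3, 7]),   # suggested on lines 3 and 7
            'count': set([12]),     # suggested on line 12
        },
    }
    for (name, def_scope), suggestions in name_candidates.iteritems():
        for translation, lines in suggestions.iteritems():
            print name, '->', translation, 'on lines', sorted(lines)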
def processTranslationScoped(translation, iBuilder, scopeAnalyst, lm_path):
    nc = []
    if translation is not None:
        (name_positions, position_names) = prepHelpers(iBuilder, scopeAnalyst)

        # Parse moses output
        mp = MosesParser()
        name_candidates = mp.parse(translation, iBuilder,
                                   position_names, scopeAnalyst)
        # name_candidates is a dictionary of dictionaries:
        # keys are (name, None) (if scopeAnalyst=None) or
        # (name, def_scope) tuples (otherwise);
        # values are suggested translations with the sets
        # of line numbers on which they appear.

        cs = ConsistencyResolver()
        ts = TranslationSummarizer()

        renaming_map = cs.computeLMRenaming(name_candidates, name_positions,
                                            iBuilder, lm_path)
        r = [['lm'] + x
             for x in ts.compute_summary_scoped(renaming_map,
                                                name_candidates,
                                                iBuilder,
                                                scopeAnalyst)]
        if not r:
            return False
        nc += r

        # (disabled) a 'len'-only variant, ranking candidates with the
        # key lambda e: -len(e[0]), was previously summarized here
        # under the tag 'len'.

        renaming_map = cs.computeFreqLenRenaming(name_candidates,
                                                 name_positions,
                                                 lambda e: (-e[1], -len(e[0])))
        r = [['freqlen'] + x
             for x in ts.compute_summary_scoped(renaming_map,
                                                name_candidates,
                                                iBuilder,
                                                scopeAnalyst)]
        if not r:
            return False
        nc += r

    return nc
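
# --- Illustrative sketch (not part of the pipeline): how the freq/len
# sort key used above ranks candidate renamings. It assumes each entry
# is a (suggestion, frequency) pair, matching the key
# lambda e: (-e[1], -len(e[0])): most frequent first, with longer names
# breaking ties.
def _example_freqlen_ranking():
    candidates = [('i', 5), ('index', 5), ('idx', 2)]
    ranked = sorted(candidates, key=lambda e: (-e[1], -len(e[0])))
    # -> [('index', 5), ('i', 5), ('idx', 2)]:
    # 'index' beats 'i' (same frequency, longer); 'idx' is least frequent.
    return ranked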
def processFile(l):
    js_file_path = l[0]
    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    temp_files = {'orig': '%s.js' % base_name,
                  'minified': '%s.u.js' % base_name,
                  'n2p': '%s.n2p.js' % base_name}
    for r_strategy in RS.all():
        temp_files['%s' % (r_strategy)] = \
            '%s.%s.js' % (base_name, r_strategy)
        for c_strategy in CS.all():
            temp_files['%s_%s' % (r_strategy, c_strategy)] = \
                '%s.%s.%s.js' % (base_name, r_strategy, c_strategy)
    for k, v in temp_files.iteritems():
        temp_files[k] = os.path.join(output_path, v)

    candidates = []

    # Minified Name -> Original Name: (name, def_scope) -> (name, def_scope)
    min_name_map = {}
    # Hashed Name -> Minified Name: (name, def_scope) -> (name, def_scope)
    hash_name_map = {}
    # Minified Name -> JSNice Name: (name, def_scope) -> (name, def_scope)
    jsnice_name_map = {}
    # Output lines for the suggestion_model.csv
    model_rows = []

    try:
        js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()

        # Strip comments, replace literals, etc
        try:
            prepro = WebLMPreprocessor(js_text)
            prepro_text = str(prepro)
        except:
            return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(
                WebLexer(tmp_beautified_text).tokenList,
                WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')

        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList
            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')

        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        with open(temp_files['orig'], 'w') as f:
            f.write(beautified_text)
        with open(temp_files['minified'], 'w') as f:
            f.write(minified_text)

        ########################
        # Nice2Predict
        ########################

        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX
        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified'])
        if not ok:
            return (js_file_path, None, 'Nice2Predict fail')

        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        with open(temp_files['n2p'], 'w') as f:
            f.write(n2p_text_beautified)

        try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        except:
            return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

        # Save some translation stats to compare different methods
        ts = TranslationSummarizer()
        candidates += [['n2p', ''] + x
                       for x in ts.compute_summary_unscoped(n2p_iBuilder,
                                                            n2p_scopeAnalyst)]

        ################################################
        # All other JSNaughty variants
        ################################################

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst minified fail')
        try:
            scopeAnalyst_clear = WebScopeAnalyst(beautified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst clear fail')

        # (disabled) a check() that JSNice did not restructure the file
        # is active in the variant of processFile below.

        # Map the original names to the minified counterparts.
        # The three name lists are aligned by sorting on definition
        # position; see the sketch after this function.
        orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])

        if len(orderedVarsOld) != len(orderedVarsNew):
            return (js_file_path, None,
                    'Old and New Name lists different length')
        if len(orderedVarsOld) != len(orderedVarsN2p):
            return (js_file_path, None,
                    'JsNice and Old Name lists different length')

        for i in range(0, len(orderedVarsOld)):
            name_old = orderedVarsOld[i][0]
            def_scope_old = scopeAnalyst_clear.name2defScope[orderedVarsOld[i]]

            name_new = orderedVarsNew[i][0]
            def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]
            min_name_map[(name_new, def_scope_new)] = (name_old, def_scope_old)

            name_n2p = orderedVarsN2p[i][0]
            # Keyed on the minified def_scope, since scope strings from
            # different texts are not directly comparable.
            def_scope_n2p = scopeAnalyst.name2defScope[orderedVarsNew[i]]
            jsnice_name_map[(name_new, def_scope_new)] = (name_n2p,
                                                          def_scope_n2p)

        # (disabled) VariableMetrics over the minified version, which
        # collects name properties, is active in the variant below.

        (_name_positions,
         position_names, _use_scopes) = prepHelpers(iBuilder_ugly,
                                                    scopeAnalyst)

        # Try different renaming strategies (hash, etc)
        for r_strategy, proxy in proxies:

            # Rename input prior to translation
            preRen = PreRenamer()
            after_text = preRen.rename(r_strategy,
                                       iBuilder_ugly,
                                       scopeAnalyst)
            (ok, beautified_after_text, _err) = clear.web_run(after_text)
            if not ok:
                return (js_file_path, None, 'Beautifier fail')

            # Save renamed input to disk for future inspection
            with open(temp_files['%s' % (r_strategy)], 'w') as f:
                f.write(beautified_after_text)

            a_lexer = WebLexer(beautified_after_text)
            a_iBuilder = IndexBuilder(a_lexer.tokenList)
            a_scopeAnalyst = WebScopeAnalyst(beautified_after_text)

            if r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO:
                # Map the hashed names to the minified counterparts,
                # again aligning the two name lists by position.
                orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])
                orderedVarsHash = sorted(a_scopeAnalyst.name2defScope.keys(),
                                         key=lambda x: x[1])

                if len(orderedVarsMin) != len(orderedVarsHash):
                    return (js_file_path, None,
                            'Hash and Min lists different length')

                for i in range(0, len(orderedVarsHash)):
                    name_hash = orderedVarsHash[i][0]
                    def_scope_hash = \
                        a_scopeAnalyst.name2defScope[orderedVarsHash[i]]

                    name_min = orderedVarsMin[i][0]
                    def_scope_min = \
                        scopeAnalyst.name2defScope[orderedVarsMin[i]]

                    hash_name_map[(name_hash, def_scope_hash)] = \
                        (name_min, def_scope_min)

            # We can switch this back once we train models
            # on a corpus with literals
            # lx = WebLexer(a_iBuilder.get_text())
            lx = WebLexer(a_iBuilder.get_text_wo_literals())

            # Translate renamed input
            md = WebMosesDecoder(proxy)
            (ok, translation, _err) = md.run(lx.collapsedText)
            if not ok:
                return (js_file_path, None, 'Moses translation fail')

            (a_name_positions,
             a_position_names,
             a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

            nc = []
            if translation is not None:

                # Parse moses output
                mp = MosesParser()
                name_candidates = mp.parse(translation,
                                           a_iBuilder,
                                           a_position_names)
                # name_candidates is a dictionary of dictionaries:
                # keys are (name, def_scope) tuples;
                # values are suggested translations with the sets
                # of line numbers on which they appear.

                # Update name_candidates with some default values
                # (in this case the translation without any renaming),
                # in case a renamed variant produced no suggestion.
                if r_strategy == RS.NONE:
                    # RS.NONE should always be first, by construction
                    name_candidates_default = name_candidates
                    scopeAnalyst_default = a_scopeAnalyst
                    iBuilder_default = a_iBuilder
                else:
                    for key_default, suggestions in \
                            name_candidates_default.iteritems():
                        pos_default = \
                            scopeAnalyst_default.nameDefScope2pos[key_default]
                        (lin, col) = iBuilder_default.revFlatMat[pos_default]
                        (line_num, line_idx) = \
                            iBuilder_default.revTokMap[(lin, col)]
                        (name, def_scope) = \
                            a_position_names[line_num][line_idx]
                        key = (name, def_scope)

                        for name_translation, lines in \
                                suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(name_translation,
                                                            set([]))
                            name_candidates[key][name_translation].update(lines)

                # **** BV: This might be all we need to combine Naughty & Nice
                name_candidates_copy = deepcopy(name_candidates)
                for key, suggestions in name_candidates_copy.iteritems():
                    if r_strategy == RS.NONE:
                        (name_n2p, def_scope_n2p) = jsnice_name_map[key]
                    else:
                        (name_n2p, def_scope_n2p) = \
                            jsnice_name_map[hash_name_map.get(key, key)]
                    for name_translation, lines in suggestions.iteritems():
                        name_candidates.setdefault(key, {})
                        name_candidates[key].setdefault(name_n2p, set([]))
                        name_candidates[key][name_n2p].update(lines)

                cc = ConsistencyController(debug_mode=False)
                ts = TranslationSummarizer()

                # An identifier may have been translated inconsistently
                # across different lines (Moses treats each line
                # independently). Try different strategies to resolve
                # inconsistencies, if any.
                for c_strategy in CS.all():

                    # Compute renaming map (x -> length, y -> width, ...)
                    # Note that x, y here are names after (hash) renaming
                    (temp_renaming_map, seen) = cc.computeRenaming(
                        c_strategy, name_candidates,
                        a_name_positions, a_use_scopes,
                        a_iBuilder, lm_path,
                        {}, hash_name_map)

                    # (disabled) after computeRenaming, the LMDrop strategy
                    # has entropies cached; a block filling in
                    # suggestion_features from name_candidates is active
                    # in the variant of processFile below.

                    # Fall back on original names in input, if
                    # no translation was suggested
                    postRen = PostRenamer()
                    renaming_map = postRen.updateRenamingMap(a_name_positions,
                                                             position_names,
                                                             a_use_scopes,
                                                             temp_renaming_map,
                                                             seen,
                                                             r_strategy)

                    # Apply renaming map and save output
                    # for future inspection
                    renamed_text = postRen.applyRenaming(a_iBuilder,
                                                         a_name_positions,
                                                         renaming_map)
                    (ok, beautified_renamed_text, _err) = \
                        clear.web_run(renamed_text)
                    if not ok:
                        return (js_file_path, None, 'Beautifier fail')

                    with open(temp_files['%s_%s' % (r_strategy, c_strategy)],
                              'w') as f:
                        f.write(beautified_renamed_text)

                    # Save some stats about which names were renamed to
                    # what. This is what enables the comparison between
                    # the different methods.
                    r = [[c_strategy] + x
                         for x in ts.compute_summary_scoped(renaming_map,
                                                            name_candidates,
                                                            a_iBuilder,
                                                            a_scopeAnalyst)]
                    if not r:
                        return (js_file_path, None, 'Compute summary failed')
                    nc += r

            if nc:
                candidates += [[r_strategy] + x for x in nc]

        # (disabled) rows for the suggestion_model.csv would be built here
        # from suggestion_features; see the variant of processFile below.

        return (js_file_path, 'OK', candidates, model_rows)

    except Exception, e:
        return (js_file_path, None, str(e).replace("\n", ""), model_rows)
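
# --- Illustrative sketch (not part of the pipeline): how two name lists
# are aligned by definition position above. Because renaming never moves
# a definition, sorting each scope analyst's (name, position) keys by
# position lines up the i-th minified name with the i-th original name.
# The keys below are made up for illustration.
def _example_position_alignment():
    # (name, def_position) keys as a WebScopeAnalyst would report them
    minified_keys = [('a', 10), ('b', 42)]
    original_keys = [('counter', 10), ('total', 42)]
    min_name_map = {}
    for (min_key, orig_key) in zip(sorted(minified_keys, key=lambda x: x[1]),
                                   sorted(original_keys, key=lambda x: x[1])):
        min_name_map[min_key] = orig_key
    # -> {('a', 10): ('counter', 10), ('b', 42): ('total', 42)}
    return min_name_map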
def processFile(js_file_path):

    base_name = os.path.splitext(os.path.basename(js_file_path))[0]

    if dbg:
        print js_file_path

    temp_files = {'orig': '%s.js' % base_name,
                  'minified': '%s.u.js' % base_name,
                  'n2p': '%s.n2p.js' % base_name}
    for r_strategy in RS.all():
        temp_files['%s' % (r_strategy)] = \
            '%s.%s.js' % (base_name, r_strategy)
        for c_strategy in CS.all():
            temp_files['%s_%s' % (r_strategy, c_strategy)] = \
                '%s.%s.%s.js' % (base_name, r_strategy, c_strategy)
    for k, v in temp_files.iteritems():
        temp_files[k] = os.path.join(output_path, v)

    candidates = []

    # Minified Name -> Original Name: (name, def_scope) -> (name, def_scope)
    min_name_map = {}
    # Hashed Name -> Minified Name: (name, def_scope) -> (name, def_scope)
    hash_name_map = {}
    # Minified Name -> JSNice Name: (name, def_scope) -> (name, def_scope)
    jsnice_name_map = {}

    # Data for the suggestion_model.csv:
    # name_features maps a variable (name, def_scope) to the result of
    # the VariableMetrics feature function; suggestion_features maps a
    # renaming strategy to a map from (name, def_scope, suggestion) to
    # suggestion line counts plus the suggestionMetrics features.
    # Ultimately we iterate over suggestion_features, pulling keys out
    # of name_features, to build model_rows (see the sketch after this
    # function).
    name_features = {}
    suggestion_features = {}
    # Output lines for the suggestion_model.csv
    model_rows = []

    if True:  # try:
        # js_text = open(os.path.join(corpus_root, js_file_path), 'r').read()
        js_text = open(js_file_path, 'r').read()

        # Strip comments, replace literals, etc
        # try:
        prepro = WebLMPreprocessor(js_text)
        prepro_text = str(prepro)
        # except:
        #     return (js_file_path, None, 'Preprocessor fail')

        # Pass through beautifier to fix layout
        clear = Beautifier()
        (ok, tmp_beautified_text, _err) = clear.web_run(prepro_text)
        if dbg:
            print '\nOK:', ok, 'ERR:', _err
            print tmp_beautified_text
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Minify
        ugly = Uglifier()
        (ok, tmp_minified_text, _err) = ugly.web_run(tmp_beautified_text)
        if not ok:
            return (js_file_path, None, 'Uglifier fail')

        # Align minified and clear files, in case the beautifier
        # did something weird
        try:
            aligner = Aligner()
            (aligned_clear, aligned_minified) = aligner.web_align(
                WebLexer(tmp_beautified_text).tokenList,
                WebLexer(tmp_minified_text).tokenList)
        except:
            return (js_file_path, None, 'Aligner fail')

        # Pass through beautifier to fix layout
        (ok, beautified_text, _err) = clear.web_run(aligned_clear)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')
        (ok, minified_text, _err) = clear.web_run(aligned_minified)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        # Num tokens before vs after
        try:
            lex_clear = WebLexer(beautified_text)
            tok_clear = lex_clear.tokenList
            lex_ugly = WebLexer(minified_text)
            tok_ugly = lex_ugly.tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(tok_clear) == len(tok_ugly):
            return (js_file_path, None, 'Num tokens mismatch')

        if beautified_text == minified_text:
            return (js_file_path, None, 'Not minified')

        try:
            iBuilder_clear = IndexBuilder(lex_clear.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail on original file.')

        try:
            iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
        except:
            return (js_file_path, None, 'IndexBuilder fail on minified file.')

        with open(temp_files['orig'], 'w') as f:
            f.write(beautified_text)
        with open(temp_files['minified'], 'w') as f:
            f.write(minified_text)

        ########################
        # Nice2Predict
        ########################

        # BV: Next block left out until I figure out the pipe issue
        # BV: Update: I couldn't pipe input to N2P. TODO: FIX
        # Run the JSNice from http://www.nice2predict.org
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(temp_files['minified'])
        if not ok:
            return (js_file_path, None, 'Nice2Predict fail')

        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            return (js_file_path, None, 'Beautifier fail')

        with open(temp_files['n2p'], 'w') as f:
            f.write(n2p_text_beautified)

        if True:  # try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        # except:
        #     return (js_file_path, None, 'IndexBuilder / ScopeAnalyst fail')

        # Save some translation stats to compare different methods
        ts = TranslationSummarizer()
        candidates += [['n2p', ''] + x
                       for x in ts.compute_summary_unscoped(n2p_iBuilder,
                                                            n2p_scopeAnalyst)]

        ################################################
        # All other JSNaughty variants
        ################################################

        try:
            scopeAnalyst = WebScopeAnalyst(minified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst minified fail')
        try:
            scopeAnalyst_clear = WebScopeAnalyst(beautified_text)
        except:
            return (js_file_path, None, 'ScopeAnalyst clear fail')

        if not check(iBuilder_clear, scopeAnalyst_clear,
                     n2p_iBuilder, n2p_scopeAnalyst):
            return (js_file_path, None, 'JsNice restructured file. Skipping..')

        # Map the original names to the minified counterparts and the
        # minified ones to the JSNice renamings, aligning the sorted
        # name lists by definition position.
        orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsOld = sorted(scopeAnalyst_clear.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                key=lambda x: x[1])

        if len(orderedVarsOld) != len(orderedVarsNew):
            return (js_file_path, None,
                    'Old and New Name lists different length')
        if len(orderedVarsOld) != len(orderedVarsN2p):
            return (js_file_path, None,
                    'JsNice and Old Name lists different length')

        for i in range(0, len(orderedVarsOld)):
            name_old = orderedVarsOld[i][0]
            def_scope_old = scopeAnalyst_clear.name2defScope[orderedVarsOld[i]]

            name_new = orderedVarsNew[i][0]
            def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]
            min_name_map[(name_new, def_scope_new)] = (name_old, def_scope_old)

            name_n2p = orderedVarsN2p[i][0]
            def_scope_n2p = scopeAnalyst.name2defScope[orderedVarsNew[i]]
            jsnice_name_map[(name_new, def_scope_new)] = (name_n2p,
                                                          def_scope_n2p)

        # Once we have the scopeAnalyst, iBuilder, and token list for the
        # minified version, we can get the name properties
        vm = VariableMetrics(scopeAnalyst, iBuilder_ugly, lex_ugly.tokenList)
        for variableKey in vm.getVariables():
            name_features[variableKey] = vm.getNameMetrics(variableKey)

        (name_positions,
         position_names, use_scopes) = prepHelpers(iBuilder_ugly,
                                                   scopeAnalyst)

        # Try different renaming strategies (hash, etc)
        for r_strategy, proxy in proxies:

            if dbg:
                print '\n====================='
                print r_strategy
                print '=====================\n'

            # Rename input prior to translation
            preRen = PreRenamer()
            after_text = preRen.rename(r_strategy,
                                       iBuilder_ugly,
                                       scopeAnalyst)

            (ok, beautified_after_text, _err) = clear.web_run(after_text)
            if not ok:
                return (js_file_path, None, 'Beautifier fail')

            if r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO:
                try:
                    # Note: this variant hashes over after_text; the
                    # variant above uses beautified_after_text instead.
                    scopeAnalyst_hash = WebScopeAnalyst(after_text)
                except:
                    return (js_file_path, None, 'ScopeAnalyst hash fail')

                # Map the hashed names to the minified counterparts.
                orderedVarsMin = sorted(scopeAnalyst.name2defScope.keys(),
                                        key=lambda x: x[1])
                orderedVarsHash = sorted(
                    scopeAnalyst_hash.name2defScope.keys(),
                    key=lambda x: x[1])

                if len(orderedVarsMin) != len(orderedVarsHash):
                    return (js_file_path, None,
                            'Hash and Min lists different length')

                for i in range(0, len(orderedVarsHash)):
                    name_hash = orderedVarsHash[i][0]
                    def_scope_hash = \
                        scopeAnalyst_hash.name2defScope[orderedVarsHash[i]]

                    name_min = orderedVarsMin[i][0]
                    def_scope_min = \
                        scopeAnalyst.name2defScope[orderedVarsMin[i]]

                    hash_name_map[(name_hash, def_scope_hash)] = \
                        (name_min, def_scope_min)

            # Save renamed input to disk for future inspection
            with open(temp_files['%s' % (r_strategy)], 'w') as f:
                f.write(beautified_after_text)

            a_lexer = WebLexer(beautified_after_text)
            a_iBuilder = IndexBuilder(a_lexer.tokenList)
            a_scopeAnalyst = WebScopeAnalyst(beautified_after_text)

            # lx = WebLexer(a_iBuilder.get_text())
            lx = WebLexer(a_iBuilder.get_text_wo_literals())

            # Translate renamed input
            md = WebMosesDecoder(proxy)
            (ok, translation, _err) = md.run(lx.collapsedText)
            if not ok:
                return (js_file_path, None, 'Moses translation fail')

            (a_name_positions,
             a_position_names,
             a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

            nc = []
            if translation is not None:

                # Parse moses output
                mp = MosesParser()
                if dbg:
                    print '\nr_strategy-----------', r_strategy
                name_candidates = mp.parse(translation,
                                           a_iBuilder,
                                           a_position_names)
                # name_candidates is a dictionary of dictionaries:
                # keys are (name, None) (if scopeAnalyst=None) or
                # (name, def_scope) tuples (otherwise);
                # values are suggested translations with the sets
                # of line numbers on which they appear.

                # Update name_candidates with some default values
                # (in this case the translation without any renaming),
                # in case a renamed variant produced no suggestion.
                if r_strategy == RS.NONE:
                    # RS.NONE should always be first, by construction
                    name_candidates_default = name_candidates
                    scopeAnalyst_default = a_scopeAnalyst
                    iBuilder_default = a_iBuilder
                else:
                    for key_default, suggestions in \
                            name_candidates_default.iteritems():
                        pos_default = \
                            scopeAnalyst_default.nameDefScope2pos[key_default]
                        (lin, col) = iBuilder_default.revFlatMat[pos_default]
                        (line_num, line_idx) = \
                            iBuilder_default.revTokMap[(lin, col)]
                        (name, def_scope) = \
                            a_position_names[line_num][line_idx]
                        key = (name, def_scope)

                        for name_translation, lines in \
                                suggestions.iteritems():
                            name_candidates.setdefault(key, {})
                            name_candidates[key].setdefault(name_translation,
                                                            set([]))
                            name_candidates[key][name_translation].update(lines)

                cc = ConsistencyController(debug_mode=True)
                ts = TranslationSummarizer()

                # An identifier may have been translated inconsistently
                # across different lines (Moses treats each line
                # independently). Try different strategies to resolve
                # inconsistencies, if any.
                for c_strategy in CS.all():

                    if dbg:
                        print '\nc_strategy----------', c_strategy

                    # Compute renaming map (x -> length, y -> width, ...)
                    # Note that x, y here are names after renaming
                    (temp_renaming_map, seen) = cc.computeRenaming(
                        c_strategy, name_candidates,
                        a_name_positions, a_use_scopes,
                        a_iBuilder, lm_path,
                        vm, hash_name_map)

                    # After computeRenaming we have the entropies cached
                    # (when in the LMDrop strategy) and the suggestion
                    # frequencies from name_candidates; fill in
                    # suggestion_features. Recall:
                    # name_candidates[(name, def_scope)][name_translation]
                    #     = set of line numbers in the translation
                    if c_strategy == CS.LMDROP and \
                            r_strategy not in suggestion_features:
                        assert cc.suggestion_cache is not None
                        suggestion_features[r_strategy] = {}

                        for variableKey, suggestionDictionary in \
                                name_candidates.iteritems():
                            for suggestionName, linesSuggested in \
                                    suggestionDictionary.iteritems():

                                # Revert variableKey[0] in the suggestion
                                # from its hash to its original minified
                                # name.
                                if r_strategy == RS.HASH_ONE or \
                                        r_strategy == RS.HASH_TWO:
                                    unhashedKey = hash_name_map[variableKey]
                                    suggestionKey = (unhashedKey[0],
                                                     unhashedKey[1],
                                                     suggestionName)
                                else:
                                    suggestionKey = (variableKey[0],
                                                     variableKey[1],
                                                     suggestionName)

                                entropyVals = \
                                    cc.suggestion_cache.getEntropyStats(
                                        variableKey, suggestionName)
                                if True:  # eval_dbg only
                                    # if entropyVals != (ENTROPY_ERR,
                                    #         ENTROPY_ERR, ENTROPY_ERR,
                                    #         ENTROPY_ERR):
                                    suggestionValue = \
                                        [len(linesSuggested)] + \
                                        list(getSuggestionStats(suggestionName)) + \
                                        list(entropyVals)
                                    suggestion_features[r_strategy][suggestionKey] = \
                                        suggestionValue

                    if dbg:
                        print '\ntemp_renaming_map-------------'
                        for (name, def_scope), renaming in \
                                temp_renaming_map.iteritems():
                            print (name, def_scope[-50:]), renaming

                    # Fall back on original names in input, if
                    # no translation was suggested
                    postRen = PostRenamer()
                    renaming_map = postRen.updateRenamingMap(a_name_positions,
                                                             position_names,
                                                             a_use_scopes,
                                                             temp_renaming_map,
                                                             seen,
                                                             r_strategy)

                    # (disabled) an alternative two-pass renaming, going
                    # through a temporary renamed text and a FREQLEN
                    # re-resolution, previously lived here.

                    if dbg:
                        print '\nrenaming_map-------------'
                        for (name, def_scope), renaming in \
                                renaming_map.iteritems():
                            print (name, def_scope[-50:]), renaming, \
                                '(%s)' % temp_renaming_map[(name, def_scope)]

                    # Apply renaming map and save output
                    # for future inspection
                    renamed_text = postRen.applyRenaming(a_iBuilder,
                                                         a_name_positions,
                                                         renaming_map)
                    if dbg:
                        print '\nrenamed_text--------------'
                        print renamed_text
                        print

                    (ok, beautified_renamed_text, _err) = \
                        clear.web_run(renamed_text)
                    if not ok:
                        return (js_file_path, None, 'Beautifier fail')

                    with open(temp_files['%s_%s' % (r_strategy, c_strategy)],
                              'w') as f:
                        f.write(beautified_renamed_text)

                    # Save some stats about which names were renamed to
                    # what. This is what enables the comparison between
                    # the different methods.
                    r = [[c_strategy] + x
                         for x in ts.compute_summary_scoped(renaming_map,
                                                            name_candidates,
                                                            a_iBuilder,
                                                            a_scopeAnalyst)]
                    if not r:
                        return (js_file_path, None, 'Compute summary failed')
                    nc += r

            if nc:
                candidates += [[r_strategy] + x for x in nc]

        # Create the rows for the suggestion_model.csv
        for r_strategy in RS.all():
            for suggestionKey, s_feat in \
                    suggestion_features[r_strategy].iteritems():
                variableKey = (suggestionKey[0], suggestionKey[1])
                original_name = min_name_map[variableKey][0]
                js_nice_name = jsnice_name_map[variableKey][0]
                if variableKey in name_features:  # eval_dbg only
                    n_feat = list(name_features[variableKey])
                    # Convert the def_scope to an equivalent, but smaller
                    # and easier to read, key: (line_num, token_num)
                    newKey = scopeAnalyst.nameDefScope2pos[variableKey]
                    (keyLine, keyToken) = iBuilder_ugly.revFlatMat[newKey]
                    model_rows.append([original_name, r_strategy,
                                       suggestionKey[0], keyLine, keyToken,
                                       suggestionKey[2], js_nice_name] +
                                      n_feat + s_feat)

        return (js_file_path, 'OK', candidates, model_rows)
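
# --- Illustrative sketch (not part of the pipeline): the shape of the
# suggestion_features structure built above and the CSV row derived from
# it. The feature values, names, and coordinates are made up for
# illustration; the real vectors come from getSuggestionStats and the
# entropy cache.
def _example_suggestion_features_row():
    r_strategy = 'HASH_ONE'
    suggestion_features = {
        r_strategy: {
            # (minified name, def_scope, suggested name) ->
            # [line count] + suggestion metrics + entropy stats
            ('a', 'function f() { ... }', 'index'): [2, 5, 0.4, -1.2],
        },
    }
    for suggestionKey, s_feat in suggestion_features[r_strategy].iteritems():
        # A model row starts with the original and minified names, the
        # strategy, the definition coordinates, the suggestion, and the
        # JSNice name, followed by the name and suggestion features.
        row = (['counter', r_strategy, suggestionKey[0], 3, 0,
                suggestionKey[2], 'count'] + s_feat)
        print row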
def renameUsingHashDefLine(scopeAnalyst, iBuilder, twoLines=False,
                           debug=False):
    '''
    Rename each local variable to a hash of the context tokens on its
    definition line (and, if twoLines is set, one additional usage line),
    so that identical definition contexts map to identical names.
    '''
    context = {}

    def traversal(scopeAnalyst, iBuilder, context, condition):
        seen = {}

        if debug:
            print("name2defScope---------------------------------------")
            print(scopeAnalyst.name2defScope)

        for line_idx, line in enumerate(iBuilder.tokens):
            if debug:
                print("Traversing: " + str(line_idx) + " ----- " + str(line))

            for token_idx, (token_type, token) in enumerate(line):
                (l, c) = iBuilder.tokMap[(line_idx, token_idx)]
                pos = iBuilder.flatMap[(l, c)]

                try:
                    if is_token_subtype(token_type, Token.Name):
                        if debug:
                            print("NAME: " + str(token))
                    def_scope = scopeAnalyst.name2defScope[(token, pos)]
                    # use_scope = scopeAnalyst.name2useScope[(token, pos)]
                    pth = scopeAnalyst.name2pth[(token, pos)]
                except KeyError:
                    if debug:
                        print("KEY ERROR! " + str(token_idx) + " -- " +
                              str(token_type) + " -- " + str(token))
                    continue

                if not isValidContextToken((token_type, token)):
                    continue
                if scopeAnalyst.isGlobal.get((token, pos), True):
                    continue

                context_tokens = []

                # If token is defined on the current line,
                # count this line towards token's context.
                if condition(pth, scopeAnalyst, token, def_scope, seen):
                    for tidx, (tt, t) in enumerate(line):
                        (tl, tc) = iBuilder.tokMap[(line_idx, tidx)]
                        p = iBuilder.flatMap[(tl, tc)]
                        if scopeAnalyst.isGlobal.get((t, p), True) or \
                                not is_token_subtype(tt, Token.Name):
                            context_tokens.append(t)
                        if t == token and p == pos and \
                                not scopeAnalyst.isGlobal.get((t, p), True):
                            context_tokens.append('#')
                    seen[(token, def_scope)] = True

                context.setdefault((token, def_scope), [])
                context[(token, def_scope)] += context_tokens

        return context

    def passOne(pth, scopeAnalyst, token, def_scope, seen):
        # First pass: the definition line of each not-yet-seen name.
        if pth == scopeAnalyst.nameOrigin.get((token, def_scope), None) and \
                not seen.get((token, def_scope), False):
            return True
        return False

    def passTwo(pth, scopeAnalyst, token, def_scope, seen):
        # Second pass: one non-definition line per not-yet-seen name.
        if pth != scopeAnalyst.nameOrigin[(token, def_scope)] and \
                not seen.get((token, def_scope), False):
            return True
        return False

    context = traversal(scopeAnalyst, iBuilder, context, passOne)

    if debug:
        print("context-------------------------------------")
        print("\n".join([key[0] + " : " + " ".join(item)
                         for key, item in context.items()]))

    if twoLines:
        context = traversal(scopeAnalyst, iBuilder, context, passTwo)

    (name_positions, _position_names) = prepHelpers(iBuilder, scopeAnalyst)

    shas = {}
    name_candidates = {}

    for (token, def_scope), context_tokens in context.iteritems():
        concat_str = ''.join(context_tokens)
        renaming = shas.setdefault(concat_str, sha(concat_str, debug))

        if debug and token == u'u':
            # One-off probe left in from a bug hunt around the
            # single-letter name u.
            print(token)
            print(concat_str)
            print(def_scope)
            print(name_positions.get((token, def_scope), "Empty"))

        name_candidates.setdefault((token, def_scope), {})

        for (line_num, line_idx) in name_positions[(token, def_scope)]:
            (l, c) = iBuilder.tokMap[(line_num, line_idx)]
            p = iBuilder.flatMap[(l, c)]
            use_scope = scopeAnalyst.name2useScope[(token, p)]

            if debug and token == u'u':
                print(u'u' + ": " + str(use_scope))

            name_candidates[(token, def_scope)].setdefault(use_scope, {})
            name_candidates[(token, def_scope)][use_scope].setdefault(
                renaming, set([]))
            name_candidates[(token, def_scope)][use_scope][renaming].add(1)

        if debug and token == u'u':
            print("Name Candidate: " +
                  str(name_candidates.get((token, def_scope), "Empty")))

    renaming_map = computeFreqLenRenaming(name_candidates, name_positions,
                                          lambda e: e)

    if debug:
        print("Renaming_map-------------------------------")
        print("\n".join([key[0][0] + " : " + item
                         for key, item in renaming_map.items()]))

    hash_renaming = rename(iBuilder, name_positions, renaming_map)

    if debug:
        print("lines---------------------------------------------------")
        print('\n'.join([' '.join([token for (_token_type, token) in line])
                         for line in hash_renaming]))
        print("lines---------------------------------------------------")

    return '\n'.join([' '.join([token for (_token_type, token) in line])
                      for line in hash_renaming]) + '\n'
def deobfuscateJS(self, obfuscatedCode, use_mix, transactionID,
                  debug_output=False, parallel=True, use_local=True):
    """
    Take a string representing minified javascript code and attempt to
    translate it into a version with better renamings.

    Parameters
    ----------
    obfuscatedCode: The minified javascript text.
    use_mix: True/False -> should we invoke JSNice and throw the names
        into the language model mix?
    transactionID: an ID for storing temp files - used currently only
        to identify the input to JSNice.
    debug_output: should we print debugging output in this pass
        (True/False)?
    parallel: enable parallelization performance enhancements, such as
        calling the moses servers in parallel.
    use_local: use the local moses proxies instead of the web ones.

    Returns
    -------
    A tuple:
    renamed_text - the renamed text
    jsnice_error - "" if no error, otherwise a message stating where
        the jsnice mixing failed
    The third element is a tuple of TIMING_COUNT performance times:
        preprocess time - total time to preprocess before invoking the
            moses servers
        prepre time - how long the first step of the preprocessor takes
        jsnice time - part of the preprocessing: how long it takes to
            get and parse the jsnice names
        renaming time - how long the hashing steps in preprocessing take
        lex_total_time - how long all the lexers take
        builder_time - how long all the IndexBuilders take
        scoper_time - how long all the scopeAnalysts take
        moses time - how long the moses servers take
        moses_rn_parallel - total time for the parallel moses and
            renaming to complete
        postprocess time - how long the consistency resolution and
            language model queries take
    """
    RS = RenamingStrategies()
    CS = ConsistencyStrategies()

    r_strategy = RS.HASH_ONE
    # c_strategy = CS.FREQLEN  # or CS.LM? (CS.LM requires a language
    #                          # model plus a querylm from moses)
    # c_strategy = CS.LM
    c_strategy = CS.LOGMODEL

    if not use_local:
        proxies = MosesProxy().web_proxies
    else:
        proxies = MosesProxy().web_local

    mosesParams = {}
    lm_path = "./phrase-tables/langmodels/js.blm.lm"

    # Hashed Name -> Minified Name: (name, def_scope) -> (name, def_scope)
    hash_name_map = {}
    # Minified Name -> JSNice Name: (name, def_scope) -> (name, def_scope)
    jsnice_name_map = {}

    # Record of any errors we get in the jsnice mixing. If this feature
    # is enabled (to be added as a switch on the website) it should not
    # crash the input on failure: if the query doesn't work for some
    # reason, we just use the candidate names provided by moses.
    jsnice_errors = []

    start = time.time()

    # Strip comments, replace literals, etc
    try:
        prepro = WebLMPreprocessor(obfuscatedCode)
        prepro_text = str(prepro)
        if debug_output:
            print("Prepro_text----------------------------------")
            print(prepro_text)
            print("Prepro_text----------------------------------")
    except:
        return ((prepro_error, "", (0,) * TIMING_COUNT))

    prepre_time = time.time() - start

    clear = Beautifier()
    (ok, beautified_text, _err) = clear.web_run(prepro_text)
    if debug_output:
        print("Beautified Text")
        print(beautified_text)
    if not ok:
        return ((beaut_error, "", (0,) * TIMING_COUNT))

    # Due to a bug(?) in the jsnice web service, we need to save the
    # input text as a file.
    min_input_file = os.path.join(self.tmpDir,
                                  str(transactionID) + ".u.js")
    with open(min_input_file, 'w') as f:
        f.write(beautified_text)

    try:
        lex_ugly = WebLexer(beautified_text)
        if debug_output:
            print("Lex_ugly---------------------")
            print(lex_ugly.tokenList)
            print("Lex_ugly---------------------")
        iBuilder_ugly = IndexBuilder(lex_ugly.tokenList)
    except:
        return ((ib_error, "", (0,) * TIMING_COUNT))

    # Do scope-related tasks on the raw text version
    try:
        scopeAnalyst = WebScopeAnalyst(beautified_text)
    except:
        return ((sa_error, "", (0,) * TIMING_COUNT))

    # Cut short if there are no minifiable variables
    if not scopeAnalyst.hasMinifiableVariables():
        return ((beautified_text, "No Minifiable Variables",
                 (0,) * TIMING_COUNT))
    elif debug_output:
        print("GLOBAL VAR MAP: " + str(scopeAnalyst.isGlobal))

    js_start = time.time()

    ########################
    # Nice2Predict start
    ########################

    # We don't want a crashing failure for the jsnice query.
    # BV: Next block left out until I figure out the pipe issue
    # BV: Update: I couldn't pipe input to N2P. TODO: FIX
    # Run the JSNice from http://www.nice2predict.org
    if use_mix:
        unuglifyJS = UnuglifyJS()
        (ok, n2p_text, _err) = unuglifyJS.run(min_input_file)
        if not ok:
            jsnice_errors.append('Nice2Predict fail')

    if use_mix and jsnice_errors == []:
        (ok, n2p_text_beautified, _err) = clear.web_run(n2p_text)
        if not ok:
            jsnice_errors.append('Beautifier failed for JSNice.')

        if debug_output:
            print("JSNice Text")
            print(n2p_text_beautified)

        try:
            n2p_lexer = WebLexer(n2p_text_beautified)
            n2p_iBuilder = IndexBuilder(n2p_lexer.tokenList)
            n2p_scopeAnalyst = WebScopeAnalyst(n2p_text_beautified)
        except:
            jsnice_errors.append(
                "IndexBuilder or ScopeAnalyst failed for JSNice.")

    ########################
    # Nice2Predict end
    ########################

    js_time = time.time() - js_start

    # Do scope-related tasks
    (name_positions,
     position_names, use_scopes) = prepHelpers(iBuilder_ugly, scopeAnalyst)

    # Map the jsnice names to the minified counterparts, but only
    # attempt it if we are error free for jsnice up to this point.
    if use_mix and jsnice_errors == []:
        try:
            orderedVarsNew = sorted(scopeAnalyst.name2defScope.keys(),
                                    key=lambda x: x[1])
            orderedVarsN2p = sorted(n2p_scopeAnalyst.name2defScope.keys(),
                                    key=lambda x: x[1])

            if len(orderedVarsNew) != len(orderedVarsN2p):
                jsnice_errors.append(
                    "JSNice and minified name lists different lengths.")

            for i in range(0, len(orderedVarsNew)):
                name_new = orderedVarsNew[i][0]
                def_scope_new = scopeAnalyst.name2defScope[orderedVarsNew[i]]

                name_n2p = orderedVarsN2p[i][0]
                def_scope_n2p = scopeAnalyst.name2defScope[orderedVarsNew[i]]

                jsnice_name_map[(name_new, def_scope_new)] = (name_n2p,
                                                              def_scope_n2p)
        except:
            jsnice_errors.append(
                "JSNice to minified name map building failed.")

    pre_outer_end = time.time()
    pre_time = pre_outer_end - start

    if not parallel:
        # Serial version: get moses output for no_renaming...
        (status, error_msg, translation_default, name_candidates_default,
         iBuilder_default, scopeAnalyst_default, name_positions_default,
         position_names_default, use_scopes_default, hash_name_map_default,
         rn_time_default, m_time_default, lex_time_default,
         post_start_default) = getMosesTranslation(proxies[RS.NONE],
                                                   RS.NONE, RS, clear,
                                                   iBuilder_ugly,
                                                   scopeAnalyst,
                                                   debug_output)
        if not status:
            return ((error_msg, "", (0,) * TIMING_COUNT))

        # ... and for hash_renaming.
        (status, error_msg, translation, name_candidates, a_iBuilder,
         a_scopeAnalyst, a_name_positions, a_position_names, a_use_scopes,
         hash_name_map, rn_time, m_time, lex_time,
         post_start) = getMosesTranslation(proxies[r_strategy],
                                           r_strategy, RS, clear,
                                           iBuilder_ugly, scopeAnalyst,
                                           debug_output)
        if not status:
            return ((error_msg, "", (0,) * TIMING_COUNT))

        m_parallel_time = 0
    else:
        # Parallel version: run both renaming variants through moses
        # in a two-process pool.
        none_wrapper = (RS.NONE, RS, clear, iBuilder_ugly, scopeAnalyst,
                        debug_output, use_local)
        hash_wrapper = (r_strategy, RS, clear, iBuilder_ugly, scopeAnalyst,
                        debug_output, use_local)
        wrappers = [none_wrapper, hash_wrapper]

        pool = multiprocessing.Pool(processes=2)

        m_parallel_start = time.time()
        for result in pool.imap(getMosesTranslationParallel, wrappers):
            if result[0] == RS.NONE:
                # No renaming
                (status, error_msg, translation_default,
                 name_candidates_default, iBuilder_default,
                 scopeAnalyst_default, name_positions_default,
                 position_names_default, use_scopes_default,
                 hash_name_map_default, rn_time_default, m_time_default,
                 lex_time_default, post_start_default) = result[1]
                if not status:
                    return ((error_msg, "", (0,) * TIMING_COUNT))
            else:
                # Hash renaming
                (status, error_msg, translation, name_candidates,
                 a_iBuilder, a_scopeAnalyst, a_name_positions,
                 a_position_names, a_use_scopes, hash_name_map,
                 rn_time, m_time, lex_time, post_start) = result[1]
                if not status:
                    return ((error_msg, "", (0,) * TIMING_COUNT))

        m_parallel_time = time.time() - m_parallel_start

    pre_time += rn_time_default + rn_time

    if debug_output:
        print("Serial: " + str(m_time + m_time_default +
                               rn_time + rn_time_default))
        print("Parallel: " + str(m_parallel_time))

    # Merge the no-renaming suggestions into the hash-renaming candidate
    # pool, mapping each default key to its position in the renamed text.
    if translation is not None and translation_default is not None:
        for key_default, suggestions in \
                name_candidates_default.iteritems():
            pos_default = scopeAnalyst_default.nameDefScope2pos[key_default]
            (lin, col) = iBuilder_default.revFlatMat[pos_default]
            (line_num, line_idx) = iBuilder_default.revTokMap[(lin, col)]
            (name, def_scope) = a_position_names[line_num][line_idx]
            key = (name, def_scope)

            for name_translation, lines in suggestions.iteritems():
                name_candidates.setdefault(key, {})
                name_candidates[key].setdefault(name_translation, set([]))
                name_candidates[key][name_translation].update(lines)

    # name_candidates is a dictionary of dictionaries:
    # keys are (name, None) (if scopeAnalyst=None) or
    # (name, def_scope) tuples (otherwise);
    # values are suggested translations with the sets
    # of line numbers on which they appear.

    if debug_output:
        print("Name_candidates")
        print(name_candidates)
        print("jsnice_name_map")
        print(jsnice_name_map)
        print("hash_name_map")
        print(hash_name_map)

    # **** BV: This might be all we need to combine Naughty & Nice
    # Only attempt if we are error free for jsnice up to this point.
    if use_mix and jsnice_errors == []:
        try:
            name_candidates_copy = deepcopy(name_candidates)
            for key, suggestions in name_candidates_copy.iteritems():
                if debug_output:
                    print("Key: " + str(key))
                    print("Suggestions: " + str(suggestions))
                if r_strategy == RS.NONE:
                    (name_n2p, def_scope_n2p) = jsnice_name_map[key]
                else:
                    (name_n2p, def_scope_n2p) = \
                        jsnice_name_map[hash_name_map.get(key, key)]
                for name_translation, lines in suggestions.iteritems():
                    name_candidates.setdefault(key, {})
                    name_candidates[key].setdefault(name_n2p, set([]))
                    name_candidates[key][name_n2p].update(lines)
        except:
            jsnice_errors.append(
                "Failure while adding jsnice names to candidate pool.")

    cr = ConsistencyController(debug_mode=debug_output)

    # An identifier may have been translated inconsistently
    # across different lines (Moses treats each line independently).
    # Try different strategies to resolve inconsistencies, if any.

    # Compute renaming map (x -> length, y -> width, ...)
    # Note that x, y here are names after renaming.
    # A hash error has been observed in here.
    try:
        (temp_renaming_map, seen) = cr.computeRenaming(c_strategy,
                                                       name_candidates,
                                                       a_name_positions,
                                                       a_use_scopes,
                                                       a_iBuilder,
                                                       lm_path,
                                                       {},
                                                       hash_name_map)
    except:
        return ("Compute renaming fail.", "", (0,) * TIMING_COUNT)

    if debug_output:
        print("Temp renaming map")
        print(temp_renaming_map)

    # Fall back on original names in input, if
    # no translation was suggested
    postRen = PostRenamer()
    renaming_map = postRen.updateRenamingMap(a_name_positions,
                                             position_names,
                                             a_use_scopes,
                                             temp_renaming_map,
                                             seen,
                                             r_strategy)
    if debug_output:
        print("Renaming Map")
        print(renaming_map)

    # Apply renaming map and save output for future inspection
    renamed_text = postRen.applyRenaming(a_iBuilder,
                                         a_name_positions,
                                         renaming_map)
    (ok, beautified_renamed_text, _err) = clear.web_run_end(renamed_text)
    if not ok:
        return ((beaut_error, "", (0,) * TIMING_COUNT))

    if debug_output:
        print("Renamed text")
        print(beautified_renamed_text)

    # Time calculations (will need updating when this becomes parallel)
    post_end = time.time()
    post_time = post_end - post_start

    # Record any jsnice errors (but leave output blank if there are none).
    jsnice_error_string = ""
    if jsnice_errors != []:
        jsnice_error_string = ("JSNice mixing attempt failed. Reporting "
                               "renaming with only our method. \n"
                               "JSNice Errors : \n")
        jsnice_error_string += "\n".join(jsnice_errors) + "\n"

    # Tally up the build times for the lexers, IndexBuilders, and
    # scopers. If the jsnice mixing failed part-way, the n2p objects may
    # not exist, so fall back to zeros in that case as well.
    if (not use_mix) or jsnice_errors:
        n2pLexTime = 0
        n2pBuildTime = 0
        n2pSATime = 0
    else:
        n2pLexTime = n2p_lexer.build_time
        n2pBuildTime = n2p_iBuilder.build_time
        n2pSATime = n2p_scopeAnalyst.build_time

    # Lexers
    lex_total_time = (lex_time + lex_time_default +
                      lex_ugly.build_time + n2pLexTime)
    # IndexBuilders
    builder_time = (iBuilder_ugly.build_time + n2pBuildTime +
                    a_iBuilder.build_time + iBuilder_default.build_time)
    # Scopers
    scoper_time = (n2pSATime + scopeAnalyst.build_time +
                   scopeAnalyst_default.build_time +
                   a_scopeAnalyst.build_time)

    # Return the translation, the jsnice error report, and the
    # performance information as separate elements of a tuple.
    return ((str(beautified_renamed_text),
             jsnice_error_string,
             (pre_time, prepre_time, js_time, rn_time + rn_time_default,
              lex_total_time, builder_time, scoper_time,
              m_time + m_time_default, m_parallel_time, post_time)))
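
# --- Hedged usage sketch (not part of the class): how a caller might
# invoke deobfuscateJS and unpack its result. The Deobfuscator class
# name and the sample input are assumptions for illustration; the
# timing tuple layout follows the docstring above (TIMING_COUNT == 10).
def _example_deobfuscate_call():
    deobf = Deobfuscator()  # hypothetical owner of deobfuscateJS
    minified = 'var a=function(b){return b.length};'
    (renamed_text, jsnice_error, timings) = deobf.deobfuscateJS(
        minified, use_mix=True, transactionID='demo42')
    (pre_time, prepre_time, js_time, rn_time, lex_total_time,
     builder_time, scoper_time, m_time, m_parallel_time,
     post_time) = timings
    if jsnice_error:
        print jsnice_error
    print renamed_text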
def getMosesTranslation(proxy, r_strategy, RS, a_beautifier,
                        iBuilder_ugly, scopeAnalyst_ugly, debug_mode=False):
    """
    A helper function so that we can run multiple different renaming
    strategies through moses in a more modular and hopefully
    parallelizable manner. It performs the hashing/no-hashing
    preparation of the file for the renaming strategy specified by
    r_strategy, and then calls the appropriate moses server.

    Parameters
    ----------
    proxy: A pointer to the port the appropriate moses server is
        listening on for this particular renaming strategy.
    r_strategy: One of the renaming strategies from RenamingStrategies.
    RS: A RenamingStrategies object.
    a_beautifier: a Beautifier object to make sure the renamed text is
        cleanly formatted.
    iBuilder_ugly: IndexBuilder for the minified file.
    scopeAnalyst_ugly: ScopeAnalyst for the minified file.
    debug_mode: Print debug information? (True/False, defaults to False)

    Returns
    -------
    (status, error, translation, name_candidates, a_iBuilder,
     a_scopeAnalyst, a_name_positions, a_position_names, a_use_scopes,
     hash_name_map, rn_time, m_time, lex_time, post_start)

    status: Did this complete without error? If False, then the rest of
        the output besides error will be empty/null.
    error: The reason for the failure; "" if status is True.
    translation: The raw moses output.
    name_candidates: The set of moses suggestions for this renaming.
    a_iBuilder, a_scopeAnalyst: IndexBuilder and ScopeAnalyst for this
        renaming.
    a_name_positions, a_position_names, a_use_scopes: Additional
        tracking info.
    hash_name_map: a map from the hashed names to the original minified
        names.
    rn_time, m_time, lex_time, post_start: the durations of the
        renaming, moses translation, and lexing steps, along with the
        start time for the postprocessing of the moses output.
    """
    rn_start = time.time()

    # We need both the base text and the hashed text; the non-hashed
    # names are always needed as a fallback.
    preRen = PreRenamer()

    if debug_mode:
        print("Tokens-------------------")
        print(iBuilder_ugly.tokens)
        print("Tokens-------------------")

    try:
        after_text = preRen.rename(r_strategy,
                                   iBuilder_ugly,
                                   scopeAnalyst_ugly)
    except:
        return (False, "Renaming failed for " + str(r_strategy), "", {},
                None, None, {}, {}, {}, {}, 0, 0, 0, 0)

    (ok, beautified_after_text, _err) = a_beautifier.web_run(after_text)
    if not ok:
        return (False,
                "Beautifier failed on the renamed text for " +
                str(r_strategy),
                "", {}, None, None, {}, {}, {}, {}, 0, 0, 0, 0)

    # Align hashed and non-hashed files, in case the beautifier
    # line-wrapped the extended lines.
    try:
        aligner = Aligner()
        (aligned_after, aligned_before) = aligner.web_align(
            WebLexer(beautified_after_text).tokenList,
            WebLexer(iBuilder_ugly.get_text()).tokenList)
    except:
        return (False,
                "Aligner failed on the renamed text for " +
                str(r_strategy),
                "", {}, None, None, {}, {}, {}, {}, 0, 0, 0, 0)

    a_lexer = WebLexer(aligned_after)
    a_iBuilder = IndexBuilder(a_lexer.tokenList)
    a_scopeAnalyst = WebScopeAnalyst(aligned_after)

    hash_name_map = {}
    if r_strategy == RS.HASH_ONE or r_strategy == RS.HASH_TWO:
        # NOTE: something in the mapping below has been flagged as buggy.
        # Map the hashed names to the minified counterparts by aligning
        # the two sorted name lists.
        orderedVarsMin = sorted(scopeAnalyst_ugly.name2defScope.keys(),
                                key=lambda x: x[1])
        orderedVarsHash = sorted(a_scopeAnalyst.name2defScope.keys(),
                                 key=lambda x: x[1])

        if len(orderedVarsMin) != len(orderedVarsHash):
            return (False, "Mismatch between minified and hashed names.",
                    "", {}, a_iBuilder, a_scopeAnalyst, {}, {}, {}, {},
                    0, 0, 0, 0)

        for i in range(0, len(orderedVarsHash)):
            name_hash = orderedVarsHash[i][0]
            def_scope_hash = a_scopeAnalyst.name2defScope[orderedVarsHash[i]]

            name_min = orderedVarsMin[i][0]
            def_scope_min = \
                scopeAnalyst_ugly.name2defScope[orderedVarsMin[i]]

            hash_name_map[(name_hash, def_scope_hash)] = (name_min,
                                                          def_scope_min)

    if debug_mode:
        print("HASH NAME MAP LEN: " + str(len(hash_name_map)))

    # We can switch this back once we train models on a corpus
    # with literals
    # lx = WebLexer(a_iBuilder.get_text())
    lx = WebLexer(a_iBuilder.get_text_wo_literals())

    # (disabled) an alternative that restricts translation to the lines
    # with minifiable variables, via getMinifiableLines and a line_map
    # passed to mp.parse_subset, previously lived here.

    # Performance measures: wrap up the preprocessing/renaming phases
    rn_time = time.time() - rn_start

    m_start = time.time()

    # Translate renamed input, in segments to keep requests small
    # md = WebMosesDecoder(proxy)
    # (ok, translation, _err) = md.run(lx.collapsedText)
    (ok, translation, _err) = segmentedTranslation(lx,
                                                   SEGMENTED_TRANS_SIZE,
                                                   proxy,
                                                   debug_mode)
    if not ok:
        return (False, "Moses server failed for " + str(r_strategy),
                translation, {}, a_iBuilder, a_scopeAnalyst, {}, {}, {},
                hash_name_map, 0, 0, 0, 0)

    m_time = time.time() - m_start
    post_start = time.time()

    (a_name_positions,
     a_position_names,
     a_use_scopes) = prepHelpers(a_iBuilder, a_scopeAnalyst)

    name_candidates = {}
    if translation is not None:
        # Parse moses output
        mp = MosesParser()
        if debug_mode:
            print(translation)
        name_candidates = mp.parse(translation,
                                   a_iBuilder,
                                   a_position_names)

    lex_time = lx.build_time + a_lexer.build_time

    return (True, "", translation, name_candidates,
            a_iBuilder, a_scopeAnalyst,
            a_name_positions, a_position_names, a_use_scopes,
            hash_name_map, rn_time, m_time, lex_time, post_start)
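
# --- Hedged usage sketch (not part of the module): unpacking the
# 14-element tuple documented above. The proxy and pipeline objects are
# assumed to come from a caller such as deobfuscateJS.
def _example_get_moses_translation(proxy, RS, clear, iBuilder_ugly,
                                   scopeAnalyst_ugly):
    result = getMosesTranslation(proxy, RS.HASH_ONE, RS, clear,
                                 iBuilder_ugly, scopeAnalyst_ugly)
    (status, error, translation, name_candidates,
     a_iBuilder, a_scopeAnalyst,
     a_name_positions, a_position_names, a_use_scopes,
     hash_name_map, rn_time, m_time, lex_time, post_start) = result
    if not status:
        print error
        return None
    return name_candidates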