Example #1
    def get_text_on_lines_wo_literals(self, lines):
        """
        Parameters
        ----------
        lines - a set of line numbers (0-based) of the text we wish to return

        Returns
        -------
        A string resembling the stored code, containing only the listed lines;
        the other lines are omitted.
        """
        tokens = []

        for _line_idx, line in enumerate(self.tokens):
            x = []
            if _line_idx not in lines:
                # Skip the line if it's not in our request list
                continue
            for (tt,t) in line:
                if is_token_subtype(tt, String):
                    x.append('TOKEN_LITERAL_STRING')
                elif is_token_subtype(tt, Number):
                    x.append('TOKEN_LITERAL_NUMBER')
                else:
                    x.append(t)

            tokens.append(' '.join(x))

        return '\n'.join(tokens)
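For context, the literal-masking idiom above depends only on is_token_subtype and a stream of (token_type, text) pairs. A minimal self-contained sketch of the same idea over a freshly lexed snippet (the helper name and lexer choice are illustrative, not part of the original class):

from pygments.lexers import JavascriptLexer
from pygments.token import String, Number, is_token_subtype

def mask_literals(code):
    # Replace string/number literals with placeholder tokens, as in the method above.
    out = []
    for ttype, text in JavascriptLexer().get_tokens(code):
        if is_token_subtype(ttype, String):
            out.append('TOKEN_LITERAL_STRING')
        elif is_token_subtype(ttype, Number):
            out.append('TOKEN_LITERAL_NUMBER')
        elif text.strip():
            out.append(text.strip())
    return ' '.join(out)

# mask_literals('var x = "hi" + 42;') masks the string and the number,
# leaving the surrounding identifiers and operators intact.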
Example #2
def handleUnicodeEscape(tokens, tokenType, technique="REMOVE"):
    '''
    The subprocess module struggles with unicode escape characters inside string literals.
    This method removes or replaces all unicode escape characters inside tokens of tokenType.
    technique: "REMOVE" or "REPLACE"; defaults to "REMOVE" if any other string is passed.
    '''
    rt = []
    if(technique == "REPLACE"): #Not implemented yet.
        for t in tokens:
            newTok = t
            if(is_token_subtype(t[0], tokenType)):
                # Decode unicode escapes, then replace any remaining non-ASCII characters with spaces
                newTok = (t[0], re.sub(r'[^\x00-\x7F]+',' ', t[1].decode("unicode_escape")))
                
            rt.append(newTok)
    else:
        for t in tokens:
            newTok = t
            if(is_token_subtype(t[0], tokenType)):
#                 print(t)
#                 newTok = (t[0], re.sub(r'[^\x00-\x7F]+','', t[1].decode("unicode_escape")))
                newTok = (t[0], unidecode(t[1]))
                
            rt.append(newTok)
    return rt
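A hedged illustration of the default REMOVE path above (it assumes re and the third-party unidecode package are imported in the module that defines handleUnicodeEscape, as the function body requires; the snippet is illustrative):

from pygments.lexers import PythonLexer
from pygments.token import String

tokens = list(PythonLexer().get_tokens('s = "café"\n'))
cleaned = handleUnicodeEscape(tokens, String)
# Non-ASCII characters inside String tokens are transliterated ("café" -> "cafe");
# all other tokens pass through unchanged.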
Example #3
 def process_config(l_num, line):
     context = {'is_config': False, 'config_group': [], 'group_ended': False, 'config_value': []}
     for token, text in inline_lexer.get_tokens(line):
         is_comment = is_token_subtype(token, Token.Comment)
         is_string = is_token_subtype(token, Token.Literal.String)
         if is_comment:
             return
         if context['is_config'] and token is Token.Punctuation and ';' in text:
             context['is_config'] = False
         if context['is_config'] and not context['group_ended'] and is_string:
             context['config_group'].append(text[1:-1])
         if context['is_config'] and context['group_ended'] and token is not Token.Text:
             context['config_value'].append((token, text))
         if context['is_config'] and token is Token.Operator and text == '=':
             context['group_ended'] = True
         if token is Token.Name.Variable and text == '$config':
             context['is_config'] = True
         # print token, (text,)
     if context['config_group']:
         value = []
         for token, val in context['config_value']:
             if token is Token.Literal.Number.Integer:
                 value.append(int(val))
             elif token is Token.Literal.Number.Float:
                 value.append(float(val))
             elif is_token_subtype(token, Token.Literal.String):
                 value.append(val[1:-1])
             elif token is Token.Keyword:
                 value.append({'true': True, 'false': False, 'null': None}.get(val))
             # else:
             #     print token, val
         group = config
         for g in context['config_group'][:-1]:
             group = group[g]
         group[context['config_group'][-1]] = value
Example #4
def hash_line(line):
    """Hashes placeholders in a line passed as a list of (token_type, token_name)
    pairs.  A hash combines the hash of the tokens to the left of a placeholder
    (excluding other placeholders) with the hash of the tokens to the right of a
    placeholder (including itself). This encodes position in addition to the
    contents of the rest of the line.
    """
    names = [
        name if not is_token_subtype(token_type, Token.Placeholder) else
        "@@PLACEHOLDER" for (token_type, name) in line
    ]

    hashed_line = []
    for index, (token_type, name) in enumerate(line):
        if is_token_subtype(token_type, Token.Placeholder):
            fst = names[:index]
            snd = names[index:]
            hashed_name = sha256()
            hashed_name.update(str(fst).encode("utf-8"))
            hashed_name.update(hashed_name.digest())
            hashed_name.update(str(snd).encode("utf-8"))
            hashed_line.append((token_type, hashed_name.hexdigest()))
        else:
            hashed_line.append((token_type, name))
    return hashed_line
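A quick check of the positional property described in the docstring, assuming hash_line as defined above (Token.Placeholder is created on the fly by pygments' token hierarchy):

from pygments.token import Token

line_a = [(Token.Keyword, 'return'), (Token.Placeholder, '@@PLACEHOLDER')]
line_b = [(Token.Placeholder, '@@PLACEHOLDER'), (Token.Keyword, 'return')]

# The same placeholder hashes differently when its left/right context differs,
# so the hash encodes the placeholder's position within the line.
assert hash_line(line_a)[1][1] != hash_line(line_b)[0][1]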
Example #5
    def test_functions(self):
        self.assertTrue(token.is_token_subtype(token.String, token.String))
        self.assertTrue(token.is_token_subtype(token.String, token.Literal))
        self.assertFalse(token.is_token_subtype(token.Literal, token.String))

        self.assertTrue(token.string_to_tokentype(token.String) is token.String)
        self.assertTrue(token.string_to_tokentype("") is token.Token)
        self.assertTrue(token.string_to_tokentype("String") is token.String)
Example #6
    def test_functions(self):
        self.assert_(token.is_token_subtype(token.String, token.String))
        self.assert_(token.is_token_subtype(token.String, token.Literal))
        self.failIf(token.is_token_subtype(token.Literal, token.String))

        self.assert_(token.string_to_tokentype(token.String) is token.String)
        self.assert_(token.string_to_tokentype('') is token.Token)
        self.assert_(token.string_to_tokentype('String') is token.String)
Example #7
def test_functions():
    assert token.is_token_subtype(token.String, token.String)
    assert token.is_token_subtype(token.String, token.Literal)
    assert not token.is_token_subtype(token.Literal, token.String)

    assert token.string_to_tokentype(token.String) is token.String
    assert token.string_to_tokentype('') is token.Token
    assert token.string_to_tokentype('String') is token.String
Example #8
    def test_functions(self):
        self.assertTrue(token.is_token_subtype(token.String, token.String))
        self.assertTrue(token.is_token_subtype(token.String, token.Literal))
        self.assertFalse(token.is_token_subtype(token.Literal, token.String))

        self.assertTrue(token.string_to_tokentype(token.String) is token.String)
        self.assertTrue(token.string_to_tokentype('') is token.Token)
        self.assertTrue(token.string_to_tokentype('String') is token.String)
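These tests pin down the semantics of the two helpers; a few extra interactive-style checks along the same lines (values reflect current pygments behaviour):

from pygments import token

# Subtype checks are directional: String.Double sits under Literal, not vice versa.
assert token.is_token_subtype(token.String.Double, token.Literal)
assert not token.is_token_subtype(token.Literal, token.String.Double)

# string_to_tokentype walks dotted names down from the root Token.
assert token.string_to_tokentype("Literal.String") is token.String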
Example #9
    def interpret_host(cls, host_node, symbol_table):
        assert host_node
        assert isinstance(host_node, AST_T_Node)
        _pos, _type, _value = host_node.data

        if token.is_token_subtype(_type, TT_VARIABLE):
            return symbol_table[_value]

        if token.is_token_subtype(_type, TT_IPv4):
            return _value
Example #10
    def traversal(scopeAnalyst, iBuilder, context, condition):

        seen = {}
        print(
            "name2defScope---------------------------------------------------")
        print(scopeAnalyst.name2defScope)
        for line_idx, line in enumerate(iBuilder.tokens):
            print("Traversing: " + str(line_idx) + " ----- " + str(line))
            for token_idx, (token_type, token) in enumerate(line):
                (l, c) = iBuilder.tokMap[(line_idx, token_idx)]
                pos = iBuilder.flatMap[(l, c)]

                #if(True):
                try:
                    if (is_token_subtype(token_type, Token.Name)):
                        print("NAME!!!!!!" + str(token))
                        def_scope = scopeAnalyst.name2defScope[(token, pos)]
                        #                       use_scope = scopeAnalyst.name2useScope[(token, pos)]
                        pth = scopeAnalyst.name2pth[(token, pos)]
                except KeyError:
                    print("KEY ERROR! " + str(token_idx) + " -- " +
                          str(token_type) + " -- " + str(token))
                    continue

                if not isValidContextToken((token_type, token)):
                    continue

                if scopeAnalyst.isGlobal.get((token, pos), True):
                    continue

                context_tokens = []

                # If token is defined on the current line,
                # count this line towards token's context.
                if condition(pth, scopeAnalyst, token, def_scope, seen):

                    for tidx, (tt, t) in enumerate(line):
                        (tl, tc) = iBuilder.tokMap[(line_idx, tidx)]
                        p = iBuilder.flatMap[(tl, tc)]

                        if scopeAnalyst.isGlobal.get((t, p), True) or \
                                not is_token_subtype(tt, Token.Name):
                            context_tokens.append(t)

                        if t == token and p == pos and \
                                not scopeAnalyst.isGlobal.get((t, p), True):
                            context_tokens.append('#')

                    seen[(token, def_scope)] = True

                context.setdefault((token, def_scope), [])
                context[(token, def_scope)] += context_tokens

        return context
Example #11
class PreRenamer:
    def __init__(self):
        self.RS = RenamingStrategies()

#         self.simple_direct_map = {}
#         self.simple_inverse_map = {}

    def __isValidContextToken(self, token_pair):
        # Unpack the (token_type, token) pair explicitly
        # (tuple parameters were Python 2-only syntax).
        token_type, _token = token_pair
        if is_token_subtype(token_type, String) or \
                is_token_subtype(token_type, Number):
            return False
        return True
Example #12
 def strip_literals(self, iBuilder):
     tokens = []
     for line in iBuilder.tokens:
         new_line = []
         for (token_type, token) in line:
             if is_token_subtype(token_type, String):
                 new_line.append('TOKEN_LITERAL_STRING')
             elif is_token_subtype(token_type, Number):
                 new_line.append('TOKEN_LITERAL_NUMBER')
             else:
                 new_line.append(token)
         tokens.append(new_line)
     return tokens
Example #13
def tokensExceptTokenType(tokens, 
                          tokenType, 
                          ignoreSubtypes = False):
    """
    @author: Naji Dmeiri
    :param tokens:          A list of `Token` objects as defined in `pygments.token`
    :param tokenType:       A `TokenType` object as defined in `pygments.token`
    :param ignoreSubtypes:  When set to True, subtypes of `tokenType` are kept in the returned list (only exact matches are removed); default is `False`.
    :returns:               An iterable of token tuples, excluding tokens of type `tokenType`.
    """
    if tokenType not in STANDARD_TYPES:
        raise ValueError("%s is not a standard Pygments token type." % tokenType)

    rt = []
    for t in tokens:
        rm = False
        if not ignoreSubtypes:
            if is_token_subtype(t[0], tokenType):
                rm = True
        else:
            if t[0] == tokenType:
                rm = True
        if not rm:
            rt.append(t)
        else:
            if t[0] == Comment.Single:
                if t[1].endswith('\n'):
                    rt.append((Token.Text, u'\n'))
    return rt
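A usage sketch for the filter above (it assumes the function as defined, plus the pygments names it references; Comment is a standard pygments type, so the guard passes):

from pygments.lexers import PythonLexer
from pygments.token import Comment

tokens = list(PythonLexer().get_tokens("x = 1  # a comment\n"))
no_comments = tokensExceptTokenType(tokens, Comment)
# Comment tokens are removed; subtypes such as Comment.Single go too,
# because ignoreSubtypes defaults to False.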
Example #14
    def processTokenList(self):
        """
        Helper: Walk through the token list and record the lines where 
        literals, for loops, and while loops occur
        
        Parameters
        ----------

        Returns
        -------

        """
        
        line_num = 0
        for token_type, token in self.tokens:
            if token_type == Token.Text and '\n' in token:
                line_num += 1
            else:
                #Check if literal
                if(is_token_subtype(token_type, Token.Literal)):
                    self.literalsOnLines.add(line_num)
                    
                #Check if for loop
                if(token_type == Token.Keyword and token.strip() == u'for'):
                    self.forOnLines.add(line_num)
                
                #Check if while loop
                if(token_type == Token.Keyword and token.strip() == u'while'):
                    self.whileOnLines.add(line_num)
Example #15
    def get_text_wo_literals(self):
        tokens = []
            
        for _line_idx, line in enumerate(self.tokens):
            x = []
            for (tt,t) in line:
                if is_token_subtype(tt, String):
                    x.append('TOKEN_LITERAL_STRING')
                elif is_token_subtype(tt, Number):
                    x.append('TOKEN_LITERAL_NUMBER')
                else:
                    x.append(t)

            tokens.append(' '.join(x))
            
        return '\n'.join(tokens)
Example #16
def test_highlight_block():
    code = ("""def sandwich(bread, cheese=True):
                   result = []
                   result.append(bread.slice())
                   if cheese:
                       result.append('cheese')
                   return result""")
    indent = 15 * ' '
    result = highlight_block('python', code, None)
    reference = \
        [Token('def', Keyword), Token(' ', Text),
           Token('sandwich', Name.Function), Token('(', Punctuation),
           Token('bread', Name), Token(',', Punctuation), Token(' ', Text),
           Token('cheese', Name), Token('=', Operator),
           Token('True', Name.Builtin.Pseudo), Token('):', Punctuation),
         Token('\n' + indent + '    ', Text), Token('result', Name),
           Token(' ', Text), Token('=', Operator), Token(' ', Text),
           Token('[]', Punctuation),
         Token('\n' + indent + '    ', Text), Token('result', Name),
           Token('.', Operator), Token('append', Name), Token('(', Punctuation),
           Token('bread', Name), Token('.', Operator), Token('slice', Name),
           Token('())', Punctuation),
         Token('\n' + indent + '    ', Text), Token('if', Keyword),
           Token(' ', Text), Token('cheese', Name), Token(':', Punctuation),
         Token('\n' + indent + '    ' + '    ', Text), Token('result', Name),
           Token('.', Operator), Token('append', Name), Token('(', Punctuation),
           Token("'cheese'", Literal.String), Token(')', Punctuation),
         Token('\n' + indent + '    ', Text), Token('return', Keyword),
           Token(' ', Text), Token('result', Name), Token('\n', Text)]
    for res, ref in zip(result, reference):
        assert res.text(None) == ref.text(None)
        assert is_token_subtype(res.type, ref.type)
Example #17
def formatTokens(tokenList):
    lines = []
    line = []
    for (token_type, token) in tokenList:
        if not is_token_subtype(token_type, Token.Text):
            line.append((token_type, token.strip()))
        elif '\n' in token:
            lines.append(line)
            line = []
    return lines
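A short sketch of feeding formatTokens a pygments token stream (the lexer choice is illustrative; Token and is_token_subtype must be imported where the helper is defined):

from pygments.lexers import JavascriptLexer

token_list = list(JavascriptLexer().get_tokens("var a = 1;\nvar b = 2;\n"))
lines = formatTokens(token_list)
# Each entry of `lines` is a list of (token_type, text) pairs for one source line,
# with whitespace tokens dropped and the remaining token text stripped.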
Example #18
    def _match_N(self, s_top_N, _token):
        _rules_LL = self.predict[s_top_N]

        _rule = None
        for term, rule in _rules_LL.items():
            if token.is_token_subtype(_token[1], term):
                _rule = rule
                break

        return _rule
Example #19
def prepareHelpers(iBuilder, scopeAnalyst=None):

    # Collect names and their locations in various formats
    # that will come in handy later:

    # Which locations [(line number, index within line)] does
    # a variable name appear at?
    name_positions = {}

    # Which variable name is at a location specified by
    # [line number][index within line]?
    position_names = {}

    for line_num, line in enumerate(iBuilder.tokens):
        position_names.setdefault(line_num, {})

        for line_idx, (token_type, token) in enumerate(line):

            if is_token_subtype(token_type, Token.Name):
                (l, c) = iBuilder.tokMap[(line_num, line_idx)]
                p = iBuilder.flatMap[(l, c)]

                #                 cond = False
                if scopeAnalyst is not None:
                    name2defScope = scopeAnalyst.resolve_scope()
                    isGlobal = scopeAnalyst.isGlobal

                    #                     if not False: #isGlobal.get((token, p), True):
                    try:
                        def_scope = name2defScope[(token, p)]

                        name_positions.setdefault((token, def_scope), [])
                        name_positions[(token, def_scope)].append(
                            (line_num, line_idx))
                        position_names[line_num][line_idx] = (token, def_scope)
                    except KeyError:
                        pass

#                         cond = True
# #                         print (token, def_scope), line_num, line_idx

                else:
                    def_scope = None

                    name_positions.setdefault((token, def_scope), [])
                    name_positions[(token, def_scope)].append(
                        (line_num, line_idx))
                    position_names[line_num][line_idx] = (token, def_scope)


#                     cond = True
#                 if cond:
#                     print (token, def_scope), line_num, line_idx

    return (name_positions, position_names)
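For reference, the two dictionaries built above end up with the following shape (illustrative values only, taking the scopeAnalyst-less branch where def_scope is None):

# name_positions: (name, def_scope) -> [(line_num, index_within_line), ...]
# position_names: line_num -> {index_within_line: (name, def_scope)}
name_positions = {('foo', None): [(0, 1), (2, 4)]}
position_names = {0: {1: ('foo', None)}, 2: {4: ('foo', None)}}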
Example #20
    def get_context(self, string):
        """ Assuming the cursor is at the end of the specified string, get the
            context (a list of names) for the symbol at cursor position.
        """
        context = []
        reversed_tokens = list(self._lexer.get_tokens(string))
        reversed_tokens.reverse()

        # Pygments often tacks on a newline when none is specified in the input.
        # Remove this newline.
        if reversed_tokens and reversed_tokens[0][1].endswith('\n') and \
                not string.endswith('\n'):
            reversed_tokens.pop(0)

        current_op = ''
        for token, text in reversed_tokens:

            if is_token_subtype(token, Token.Name):

                # Handle a trailing separator, e.g. 'foo.bar.'
                if current_op in self._name_separators:
                    if not context:
                        context.insert(0, '')

                # Handle non-separator operators and punctuation.
                elif current_op:
                    break

                context.insert(0, text)
                current_op = ''

            # Pygments doesn't understand that, e.g., '->' is a single operator
            # in C++. This is why we have to build up an operator from
            # potentially several tokens.
            elif token is Token.Operator or token is Token.Punctuation:
                # Handle a trailing separator, e.g. 'foo.bar.'
                if current_op in self._name_separators:
                    if not context:
                        context.insert(0, '')
                else:
                    current_op = text + current_op

            # Break on anything that is not an Operator, Punctuation, or Name.
            else:
                break

        return context
Example #21
    def get_context(self, string):
        """ Assuming the cursor is at the end of the specified string, get the
            context (a list of names) for the symbol at cursor position.
        """
        context = []
        reversed_tokens = list(self._lexer.get_tokens(string))
        reversed_tokens.reverse()

        # Pygments often tacks on a newline when none is specified in the input.
        # Remove this newline.
        if reversed_tokens and reversed_tokens[0][1].endswith('\n') and \
                not string.endswith('\n'):
            reversed_tokens.pop(0)

        current_op = ''
        for token, text in reversed_tokens:

            if is_token_subtype(token, Token.Name):

                # Handle a trailing separator, e.g. 'foo.bar.'
                if current_op in self._name_separators:
                    if not context:
                        context.insert(0, '')

                # Handle non-separator operators and punctuation.
                elif current_op:
                    break

                context.insert(0, text)
                current_op = ''

            # Pygments doesn't understand that, e.g., '->' is a single operator
            # in C++. This is why we have to build up an operator from
            # potentially several tokens.
            elif token is Token.Operator or token is Token.Punctuation:
                # Handle a trailing separator, e.g. 'foo.bar.'
                if current_op in self._name_separators:
                    if not context:
                        context.insert(0, '')
                else:
                    current_op = text + current_op

            # Break on anything that is not an Operator, Punctuation, or Name.
            else:
                break

        return context
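The core idea of get_context — walk the token stream in reverse, collecting Name tokens chained by a separator operator — can be sketched standalone. A minimal variant, assuming PythonLexer and a '.' separator purely for illustration (it omits the trailing-separator handling of the original):

from pygments.lexers import PythonLexer
from pygments.token import Token, is_token_subtype

def name_context(string, separators=('.',)):
    # Hypothetical standalone variant of the get_context method above.
    tokens = list(PythonLexer().get_tokens(string))
    tokens.reverse()
    if tokens and tokens[0][1].endswith('\n') and not string.endswith('\n'):
        tokens.pop(0)  # drop the newline pygments tacks on
    context, current_op = [], ''
    for ttype, text in tokens:
        if is_token_subtype(ttype, Token.Name):
            if current_op and current_op not in separators:
                break
            context.insert(0, text)
            current_op = ''
        elif ttype is Token.Operator or ttype is Token.Punctuation:
            current_op = text + current_op
        elif text.strip():
            break
    return context

# name_context("os.path.jo") typically gives ['os', 'path', 'jo'].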
Example #22
def renameUsingScopeId(scopeAnalyst, iBuilder_ugly):
    '''
    Simple renaming: disambiguate overloaded names 
    with indices: n -> n_1, n_2, n_3.
    The index is the def_scope id.
    '''

    name2defScope = scopeAnalyst.resolve_scope()
    isGlobal = scopeAnalyst.isGlobal

    # Figure out which _scope_idx suffixes are illegal
    except_ids = map(int, [
        name.split('_')[-1] for name in scopeAnalyst.nameScopes.keys()
        if name.split('_')[-1].isdigit()
    ])

    # Compute shorter def_scope identifiers
    scopes = set(name2defScope.values())
    scope2id = dict(zip(scopes, generateScopeIds(len(scopes), except_ids)))

    renaming = []

    for line_idx, line in enumerate(iBuilder_ugly.tokens):

        new_line = []
        for token_idx, (token_type, token) in enumerate(line):
            try:
                (l, c) = iBuilder_ugly.tokMap[(line_idx, token_idx)]
                pos = iBuilder_ugly.flatMap[(l, c)]
                def_scope = name2defScope[(token, pos)]
            except KeyError:
                new_line.append(token)
                continue

            if is_token_subtype(token_type, Token.Name) and \
                    scopeAnalyst.is_overloaded(token) and \
                    not isGlobal[(token, pos)]:
                # Must rename token to something else
                # Append def_scope id to name
                new_line.append('%s_%d' % (token, scope2id[def_scope]))
            else:
                new_line.append(token)

        renaming.append(' '.join(new_line) + "\n")

    return renaming
Example #23
    def get_tokens(self, var_names=Names.RAW):
        """Generate tokens from a raw_code string, skipping comments.

        Keyword arguments:
        var_names -- Which variable names to output (default RAW).
        """
        previous_string = None
        for (token_type, token) in self.tokens:
            # Pygments breaks up strings into individual tokens representing
            # things like opening quotes and escaped characters. We want to
            # collapse all of these into a single string literal token.
            if previous_string and not is_token_subtype(
                    token_type, Token.String):
                yield (Token.String, previous_string)
                previous_string = None
            if is_token_subtype(token_type, Token.String):
                if previous_string:
                    previous_string += token
                else:
                    previous_string = token
            elif is_token_subtype(token_type, Token.Number):
                yield (token_type, token)
            # Skip comments
            elif is_token_subtype(token_type, Token.Comment):
                continue
            # Skip the :: token added by HexRays
            elif is_token_subtype(token_type,
                                  Token.Operator) and token == '::':
                continue
            # Replace the text of placeholder tokens
            elif is_token_subtype(token_type, Token.Placeholder):
                yield {
                    Names.RAW: (token_type, token),
                    Names.SOURCE: (token_type, token.split('@@')[2]),
                    Names.TARGET: (token_type, token.split('@@')[3]),
                }[var_names]
            elif not is_token_subtype(token_type, Token.Text):
                yield (token_type, token.strip())
            # Skip whitespace
            elif is_token_subtype(token_type, Token.Text):
                continue
            else:
                raise TokenError(f"No token ({token_type}, {token})")
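The string-collapsing step handled by previous_string above can be isolated; a minimal sketch, assuming CLexer purely because C lexers split a quoted literal into several String subtokens:

from pygments.lexers import CLexer
from pygments.token import String, is_token_subtype

def collapse_strings(tokens):
    # Merge consecutive String subtokens (quotes, escapes, body) into one literal token.
    pending = None
    for ttype, text in tokens:
        if is_token_subtype(ttype, String):
            pending = (pending or '') + text
            continue
        if pending is not None:
            yield (String, pending)
            pending = None
        yield (ttype, text)
    if pending is not None:
        yield (String, pending)

# list(collapse_strings(CLexer().get_tokens('char *s = "a\\n";')))
# yields a single String token carrying the whole quoted literal.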
Example #24
def tokensReplaceTokenOfType(tokens, 
                             tokenType, 
                             replacementValue, 
                             ignoreSubtypes = False):
    """
    :param tokens:          A list of `Token` objects as defined in `pygments.token`
    :param tokenType:       A `TokenType` object as defined in `pygments.token`
    :param replacementValue: The value substituted for the text of each matching token.
    :param ignoreSubtypes:  When set to True, only exact matches of `tokenType` are replaced and subtypes are left unchanged; default is `False`.
    :returns:               An iterable of token tuples in which the text of every `tokenType` token is replaced by `replacementValue`.
    """
    if tokenType not in STANDARD_TYPES:
        raise ValueError("%s is not a standard Pygments token type." % tokenType)

    if not ignoreSubtypes:
        return [t if not is_token_subtype(t[0], tokenType) 
                else (t[0], replacementValue) for t in tokens]
    else:
        return [t if not t[0] == tokenType 
                else (t[0], replacementValue) for t in tokens]
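And a matching usage sketch for the replacement variant (Number is a standard pygments type, so the guard passes; the snippet is illustrative):

from pygments.lexers import PythonLexer
from pygments.token import Number

tokens = list(PythonLexer().get_tokens("total = 40 + 2\n"))
masked = tokensReplaceTokenOfType(tokens, Number, "<NUM>")
# Every Number token (Integer, Float, ...) now carries the text "<NUM>";
# all other tokens are returned unchanged.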
Example #25
 def get_lines(self, var_naming=VarNaming.NONE, var_table=None):
     line = []
     for (token_type, token) in self.tokens:
         if is_token_subtype(token_type, Token.Comment) and len(line) > 0:
             if var_naming == VarNaming.HASH:
                 line = hash_line(line)
             yield line
             line = []
         elif is_token_subtype(token_type, Token.String):
             # Pygments breaks up strings into individual tokens representing
             # things like opening quotes and escaped characters. We want to
             # collapse all of these into a single string literal token.
             if line != [] and line[-1] == (
                     Token.String,
                     "<LITERAL_STRING>",
             ):
                 continue
             line.append((Token.String, "<LITERAL_STRING>"))
         elif is_token_subtype(token_type, Token.Number):
             line.append((Token.String, "<LITERAL_NUMBER>"))
         # Skip the :: token
         elif is_token_subtype(token_type,
                               Token.Operator) and token == "::":
             continue
         # Replace placeholders if using table renaming
         elif var_naming == VarNaming.TABLE and is_token_subtype(
                 token_type, Token.Placeholder.Var):
             if not var_table:
                 raise KeyError("var_table must be set with table renaming")
             # Remove the '@@VAR_' from the beginning of the placeholder
             var_id = token[6:]
             line.append(
                 (Token.Placeholder.Var, self.var_table[var_id][var_table]))
         elif not is_token_subtype(token_type, Token.Text):
             line.append((token_type, token.strip()))
         elif "\n" in token and len(line) > 0:
             if var_naming == VarNaming.HASH:
                 line = hash_line(line)
             yield line
             line = []
Example #26
def processFile(js_file_path):

    try:

        # Num tokens before vs after
        try:
            tok1 = Lexer(os.path.join(files_root, 'orig',
                                      js_file_path)).tokenList
            tok2 = Lexer(os.path.join(files_root, 'no_renaming',
                                      js_file_path)).tokenList
            #             tok3 = Lexer(os.path.join(files_root, 'basic_renaming', js_file_path)).tokenList
            #             tok4 = Lexer(os.path.join(files_root, 'normalized', js_file_path)).tokenList
            tok5 = Lexer(
                os.path.join(files_root, 'hash_def_one_renaming',
                             js_file_path)).tokenList
            tok6 = Lexer(
                os.path.join(files_root, 'hash_def_two_renaming',
                             js_file_path)).tokenList
        except:
            return (js_file_path, None, 'Lexer fail')

        # For now only work with minified files that have
        # the same number of tokens as the originals
        if not len(set([len(tok1), len(tok2), len(tok5), len(tok6)])) == 1:
            return (js_file_path, None, 'Num tokens mismatch')

        clear = Beautifier()
        # Align minified and clear files, in case the beautifier
        # did something weird
        aligner = Aligner()

        (aligned1, aligned2) = aligner.web_align(tok1, tok2)

        (ok, beautified1, _err) = clear.web_run(aligned1)
        tok11 = WebLexer(beautified1).tokenList

        (ok, beautified2, _err) = clear.web_run(aligned2)
        tok22 = WebLexer(beautified2).tokenList

        (aligned5, aligned2) = aligner.web_align(tok5, tok2)

        (ok, beautified5, _err) = clear.web_run(aligned5)
        tok55 = WebLexer(beautified5).tokenList

        (aligned6, aligned2) = aligner.web_align(tok6, tok2)

        (ok, beautified6, _err) = clear.web_run(aligned6)
        tok66 = WebLexer(beautified6).tokenList

        #         try:
        #             aligner = Aligner()
        #             # This is already the baseline corpus, no (smart) renaming yet
        #             aligner.align(temp_files['path_tmp_b'],
        #                           temp_files['path_tmp_u'])
        #         except:
        #             return (js_file_path, None, 'Aligner fail')

        try:
            iBuilder1 = IndexBuilder(tok11)
            iBuilder2 = IndexBuilder(tok22)
            #             iBuilder3 = IndexBuilder(tok3)
            #             iBuilder4 = IndexBuilder(tok4)
            iBuilder5 = IndexBuilder(tok55)
            iBuilder6 = IndexBuilder(tok66)
        except:
            return (js_file_path, None, 'IndexBuilder fail')

        # Check that at least one variable was renamed during minification
        orig_names = set([
            token for line in iBuilder1.tokens for (token_type, token) in line
            if is_token_subtype(token_type, Token.Name)
        ])
        ugly_names = set([
            token for line in iBuilder2.tokens for (token_type, token) in line
            if is_token_subtype(token_type, Token.Name)
        ])
        if not len(orig_names.difference(ugly_names)):
            return (js_file_path, None, 'Not minified')

        orig = []
        no_renaming = []
        #         basic_renaming = []
        #         normalized = []
        hash_def_one_renaming = []
        hash_def_two_renaming = []

        for _line_idx, line in enumerate(iBuilder1.tokens):
            orig.append(' '.join([t for (_tt, t) in line]) + "\n")

        for _line_idx, line in enumerate(iBuilder2.tokens):
            no_renaming.append(' '.join([t for (_tt, t) in line]) + "\n")


#         for _line_idx, line in enumerate(iBuilder3.tokens):
#             basic_renaming.append(' '.join([t for (_tt,t) in line]) + "\n")

#         for _line_idx, line in enumerate(iBuilder4.tokens):
#             normalized.append(' '.join([t for (_tt,t) in line]) + "\n")

        for _line_idx, line in enumerate(iBuilder5.tokens):
            hash_def_one_renaming.append(' '.join([t for (_tt, t) in line]) +
                                         "\n")

        for _line_idx, line in enumerate(iBuilder6.tokens):
            hash_def_two_renaming.append(' '.join([t for (_tt, t) in line]) +
                                         "\n")

        return (
            js_file_path,
            orig,
            no_renaming,
            #                 basic_renaming,
            #                 normalized,
            hash_def_one_renaming,
            hash_def_two_renaming)

    except Exception as e:
        return (js_file_path, None, str(e))
Example #27
def tokenize(program_path, raw=False):
    lexer = RubyLexer()

    token_streams = []
    with open(program_path, "r") as f:
        program = f.readlines()

    num_of_lines = len(program)

    last_indent_count = 0

    for line in program:
        line_of_token = []
        for token_data in lexer.get_tokens(line):
            token_type = token_data[0]
            token = token_data[-1]

            if raw:
                if is_token_subtype(token_type, Comment) or is_token_subtype(
                        token_type, Literal):
                    arranged_token = replace_special_char(token, comment=True)
                else:
                    arranged_token = replace_special_char(token, comment=False)
            else:
                if is_token_subtype(token_type, Literal):
                    arranged_token = "<LITERAL>"
                elif is_token_subtype(token_type, String):
                    arranged_token = "<STRING>"
                elif is_token_subtype(token_type, Number):
                    arranged_token = "<NUMBER>"
                elif token_type == Token.Name.Operator:
                    arranged_token = "<OPERATOR>"
                elif token_type == Name and token not in reserved:
                    arranged_token = "<ID>"
                elif token_type == Name.Variable.Instance:
                    arranged_token = "<INSTANCE_VAL>"
                elif token_type == Name.Variable.Class:
                    arranged_token = "<CLASS_VAL>"
                elif token_type == Name.Constant:
                    arranged_token = "<CONSTANT_ID>"
                elif token_type == Name.Function:
                    arranged_token = "<FUNCTION>"
                elif token_type == Name.Class:
                    arranged_token = "<CLASS>"
                elif token_type == Name.Namespace:
                    arranged_token = "<NAMESPACE>"
                elif token_type == Token.Name.Variable.Global:
                    arranged_token = "<GLOBAL_VAL>"
                elif token_type == Token.Error:
                    arranged_token = "<ERROR>"  # pygments内で字句解析が失敗した際のトークン (絵文字など)
                elif is_token_subtype(token_type, Comment):
                    arranged_token = "<COMMENT>"
                else:
                    arranged_token = replace_special_char(token)
                    # if arranged_token not in reserved and "SPACE" not in arranged_token and "NEWLINE" not in arranged_token:
                    #     if token_type not in (Token.Punctuation, Token.Operator, Token.Name.Builtin, Token.Keyword.Pseudo):
                    #         print("==============")
                    #         print(program_path)
                    #         print(line.rstrip())
                    #         print("{} : {}".format(arranged_token.encode("utf-8"), token_type))
                    #         print("==============")

            line_of_token.append(arranged_token +
                                 " ")  # append a trailing space so tokens are space-separated

        # treat two leading spaces as one level of indentation
        line_of_token[0] = line_of_token[0].replace("<SPACE> <SPACE> ",
                                                    "<INDENT> ")

        # the indent is numbered relative to the previous line's indent
        indent_count = len(re.findall("<INDENT>", line_of_token[0]))

        if indent_count != 0:
            # If an indented line is blank, element 0 may contain both the indent and the newline,
            # so strip the indent marker first and then prepend the relative indent information.
            indent_char = "<INDENT{}> ".format(indent_count -
                                               last_indent_count)
            line_of_token[0] = line_of_token[0].replace("<INDENT> ", "")
            line_of_token[0] = indent_char + line_of_token[0]

        if len(line_of_token) != 1:
            last_indent_count = indent_count

        token_streams.append(line_of_token)

    return token_streams, num_of_lines
Example #28
    def compare(self, mini_js_path=None, keep_mini=True):
        pid = int(multiprocessing.current_process().ident)

        lexer = get_lexer_for_filename(self.js_path)

        # before
        tmp_b = open(self.js_path, 'r').read()
        tokens_b = list(lex(tmp_b, lexer))

        # Discover the path to the source map
        map_path = sourcemap.discover(tmp_b)
        if map_path is not None:
            # The file couldn't have a source map unless it is already minified
            return True

        # after
        if mini_js_path is None:
            uglifier = Uglifier()
            mini_js_path = os.path.abspath('tmp_%d.u.js' % pid)
            uglifyjs_ok = uglifier.run(self.js_path, mini_js_path)
            if not uglifyjs_ok:
                raise Exception('Uglifier failed')

        uglified = open(mini_js_path, 'r').read()
        tokens_u = list(lex(uglified, lexer))  # returns a generator of tuples

        if not len(tokens_b) == len(tokens_u):
            if not keep_mini:
                remove_file(mini_js_path)
            raise Exception('Different number of tokens')

        clean_names = [
            token for (token_type, token) in tokens_b
            if is_token_subtype(token_type, Token.Name)
        ]

        ugly_names = [
            token for (token_type, token) in tokens_u
            if is_token_subtype(token_type, Token.Name)
        ]

        same = [
            idx for (idx, token) in enumerate(clean_names)
            if ugly_names[idx] == token
        ]

        clean_names_n = [
            token for (idx, token) in enumerate(clean_names) if idx not in same
        ]
        ugly_names_n = [
            token for (idx, token) in enumerate(ugly_names) if idx not in same
        ]

        if not clean_names_n:
            if not keep_mini:
                remove_file(mini_js_path)
            return False

        if sum([len(v) for v in clean_names_n]) <= \
                sum([len(v) for v in ugly_names_n]):
            if not keep_mini:
                remove_file(mini_js_path)
            return True

        if not keep_mini:
            remove_file(mini_js_path)
        return False
Example #29
def intim_introspection():
    # TODO: some of these are not ABSOLUTELY needed. Make it optional for
    # the user to install them.
    from pygments.token import Token, is_token_subtype
    from pygments.lexers import python as pylex
    import os                                # for module type
    from sys import stdout                   # for default 'file'
    from types import ModuleType, MethodType # to define particular types
    from numpy import ufunc as UFuncType     # yet other particular types
    import inspect                           # to check for type types
    from enum import Enum                    # for analysing enum types
    Example_enum = Enum("Example", 'value')

    filenames = {USERSCRIPTFILES} # sed by vimscript, remove duplicates
    source = '' # concat here all these files
    for filename in filenames:
        with open(filename, 'r') as file:
            source += '\n' + file.read()


    class Type(object):
        """Type class for typing nodes of the token forest
        Can iterate over its instances for convenience
        """

        _instances = set()

        def __init__(self, id, python_type):
            """
            python_type: either
            """
            self.id = 'IntimPy' + id
            self._instances.add(self)
            self.type = python_type

        @classmethod
        def instances(cls):
            """Iterate over all instances
            """
            return iter(cls._instances)

    # Supported types
    Bool       = Type("Bool"       , type(True))
    BuiltIn    = Type("Builtin"    , type(dir))
    Class      = Type("Class"      , None) # checked while typing node
    EnumType   = Type("EnumType"   , type(Example_enum))
    EnumValue  = Type("EnumValue"  , type(Example_enum.value))
    Float      = Type("Float"      , type(1.))
    Function   = Type("Function"   , None) # checked while typing node
    Method     = Type("Method"     , None) # checked while typing node
    Instance   = Type("Instance"   , None) # instance of user's custom class
    Int        = Type("Int"        , type(1))
    Module     = Type("Module"     , type(os))
    NoneType   = Type("NoneType"   , type(None))
    String     = Type("String"     , type('a'))
    Unexistent = Type("Unexistent" , None) # node yet undefined in the session

    # Store them so that they can easily be found from actual python types
    types_map = {}
    for cls in Type.instances():  # All `None` keys override each other..
        types_map[cls.type] = cls # .. never mind.


    class Node(object):
        """Identifier and references to its parents and kids. It may have no
        parent, it is a root then.
        """

        def __init__(self, id, parent=None, type=Unexistent):
            """
            id: string the node's identifier: i.e. how it is written in
            the script.
            parent: Node its parent node in the graph, root node if None
            type: Type associated type with coloration etc
            """
            self.id = id
            self.parent = parent
            self._kids = {} # {id: Node}
            self.type = type

        @property
        def leaf(self):
            """True if has no kids
            """
            return not bool(self._kids)

        @property
        def root(self):
            """True if parent is None or a Forest
            """
            return self.parent is None or isinstance(self.parent, Forest)

        def add_node(self, node):
            """basic procedure to add a node as a kid
            """
            node.parent = self
            self._kids[node.id] = node

        def add_id(self, id):
            """Create a new kid from a string id
            if it already exists, do not erase the existing one
            return the newly created node
            """
            node = self._kids.get(id)
            if node:
                return node
            node = Node(id=id, parent=self)
            self._kids[id] = node
            return node

        @property
        def parents(self):
            """iterate backwards until a root parent is found
            """
            yield self
            if self.parent:
                yield from self.parent.parents
            else:
                raise StopIteration()

        @property
        def path(self):
            """Use backward iteration to build the full path to this node
            """
            res = [parent.id for parent in self.parents]
            return '.'.join(reversed(res))

        @property
        def kids(self):
            """iterate over kids
            """
            return iter(self._kids.values())

        @property
        def leaves(self):
            """Iterate over all leaf kids
            """
            if self.leaf:
                yield self
            else:
                for kid in self.kids:
                    yield from kid.leaves

        def __iter__(self):
            """Iterate over all nodes, top-down
            """
            yield self
            for kid in self.kids:
                yield from kid

        def _repr(self, prefix):
            """Iterate over all nodes and print full paths
            """
            res = "{}{}: {}\n".format(prefix, self.id, self.type.id)
            for kid in self.kids:
                res += kid._repr(prefix + self.id + '.')
            return res

        def __repr__(self):
            return self._repr('')

        def __len__(self):
            """Number of nodes: ourselves as a node + the weight of our kids
            """
            return 1 + sum(len(kid) for kid in self.kids)

        def type_nodes(self, prefix=''):
            """Ultimate use of this forest: evaluate our id in the
            current context to retrieve information on the current state
            of this access path
            prefix: string previous path (context) of this node
            called by the parents
            """
            path = prefix + self.id
            # analyse type of this node:
            try:
                t = eval("type({})".format(path), globals())
            except (AttributeError, NameError) as e:
                # then all subsequent nodes are unexistent
                for node in self:
                    node.type = Unexistent
                return
            # is the type available, special?
            node_type = types_map.get(t)
            if node_type:
                self.type = node_type
            else:
                # then it is just a plain valid, known node, probably
                # instance of a custom class or a function, method
                # inelegant way to get these functions into eval scope:
                if eval("f({})".format(path), globals(),
                        {'f': inspect.ismethod}):
                    self.type = Method
                elif eval("f({})".format(path), globals(),
                        {'f': lambda x: inspect.isfunction(x)
                           or type(x) is UFuncType}): # inspect misses this one
                    self.type = Function
                elif eval("f({})".format(path), globals(),
                        {'f': inspect.isclass}):
                    if eval("f({})".format(path), globals(),
                            {'f': lambda c: issubclass(c, Enum)}):
                        self.type = EnumType
                    else:
                        self.type = Class
                elif eval("f({})".format(path), globals(),
                        {'f': lambda i: isinstance(i, Enum)}):
                    self.type = EnumValue
                else:
                    self.type = Instance
            for kid in self.kids:
                kid.type_nodes(path + '.')

        def write(self, prefix, depth, file=stdout):
            """Build a vim syntax command to color this node, given
            information recursively given from above:
            prefix: string prefix to the command, build from above
            depth: int our depth within the forest, build from above
            file: send there the resulting commands: once on each node
            """
            # match expressions from the root, but only color the leaf:
            suffix = r"\>'hs=e-" + str(len(self.id) - 1)
            # allow any amount of whitespace around the '.' operator
            whitespace = r"[ \s\t\n]*\.[ \s\t\n]*"
            # for speed, provide Vim information about the items inclusions:
            if not self.root:
                suffix += " contained"
            if self.leaf:
                suffix += " contains=NONE"
            if not self.leaf:
                # watch out: here is an additional iteration on kids! **
                subgroups = {sub.type.id for sub in self.kids}
                suffix += " contains=" + ','.join(subgroups)
            # here is the full command:
            command = "syntax match " + self.type.id + prefix + suffix
            # throw it up
            print(command, file=file)
            # ask the kids to do so :)
            for kid in self.kids: # ** second iteration, could be the only one
                if kid.type is not Unexistent: # to release burden a little bit
                    kid.write(prefix + whitespace + kid.id, depth + 1, file)


    class Forest(Node):
        """A Forest is a special Node with no parent, no id, and containing
        only root nodes.
        """

        def __init__(self):
            self._kids = {}

        @property
        def parents(self):
            """A forest has no parents
            """
            raise StopIteration()

        def __repr__(self):
            if self.leaf:
                return "empty Forest."
            res = ""
            for kid in self.kids:
                res += repr(kid)
            return res

        def __len__(self):
            """Total number of nodes in the forest:
            """
            return sum(len(kid) for kid in self.kids)

        def __iter__(self):
            """Iterate over all trees, not ourselves
            """
            for kid in self.kids:
                yield from kid

        def type_nodes(self):
            """Ask each tree to type itself
            """
            for kid in self.kids:
                kid.type_nodes()

        def write(self, file=stdout):
            """Visit the forest to build an ad-hoc vim syntax file and color
            the nodes in the source file.
            """
            # The root name starts without being a subname of something else.
            root_prefix = (
                    r" '\("
                    # (exclude comments ending with a period)
                    r"\n\s\{-}[^#].\{-}"
                    # previous identifier + period
                    r"[a-zA-Z][0-9]\{-}[\s\n]\{-}\.[\s\n]\{-}"
                    # well, do not match all this.
                    r"\)\@<!\<")
            for kid in self.kids:
                if kid.type is not Unexistent: # to ease the file a little bit
                    kid.write(root_prefix + kid.id, 0, file=file)
            # signal to Intim: the syntax file may be read now!
            print('" end', file=file)


    # Start lexing!
    lx = pylex.Python3Lexer()
    g = lx.get_tokens(source)
    # gather names to color as a forest of '.' operators:
    forest = Forest()
    current = forest
    # flag to keep track of whether to add in depth or go back to the root
    last_was_a_name = True
    # Also gather misc immediate tokens.. just for fun and extensibility
    misc = {}
    for i in [Token.Name.Decorator,
              Token.Name.Namespace,
              Token.Name.Operator,
              Token.Name.Keyword,
              Token.Name.Literal,
              Token.Comment,
              ]:
        misc[i] = set()
    # iterate over type_of_token, string
    for t, i in g:
        in_misc = False
        for subtype, harvest in misc.items():
            if is_token_subtype(t, subtype):
                in_misc = True
                harvest.add(i)
                break
        if in_misc:
            # no need to go further: this token does not belong to the forest
            continue
        if is_token_subtype(t, Token.Name):
            node = forest if last_was_a_name else current
            current = node.add_id(i)
            last_was_a_name = True
        elif is_token_subtype(t, Token.Operator):
            if i == '.':
                last_was_a_name = False

    # gather information on node types
    forest.type_nodes()

    # Write the vimscript commands to the syntax file:
    filename = INTIMSYNTAXFILE # sed by vimscript
    with open(filename, 'w') as file:
        forest.write(file)
Example #30
def intim_introspection():
    # TODO: some of these are not ABSOLUTELY needed. Make it optional for
    # the user to install them.
    from pygments.token import Token, is_token_subtype
    from pygments.lexers import python as pylex
    import os  # for module type
    from sys import stdout  # for default 'file'
    from types import ModuleType, MethodType  # to define particular types
    from numpy import ufunc as UFuncType  # yet other particular types
    import inspect  # to check for type types
    from enum import Enum  # for analysing enum types
    Example_enum = Enum("Example", 'value')

    filenames = {USERSCRIPTFILES}  # sed by vimscript, remove duplicates
    source = ''  # concat here all these files
    for filename in filenames:
        with open(filename, 'r') as file:
            source += '\n' + file.read()

    class Type(object):
        """Type class for typing nodes of the token forest
        Can iterate over its instances for convenience
        """

        _instances = set()

        def __init__(self, id, python_type):
            """
            python_type: either
            """
            self.id = 'IntimPy' + id
            self._instances.add(self)
            self.type = python_type

        @classmethod
        def instances(cls):
            """Iterate over all instances
            """
            return iter(cls._instances)

    # Supported types
    Bool = Type("Bool", type(True))
    BuiltIn = Type("Builtin", type(dir))
    Class = Type("Class", None)  # checked while typing node
    EnumType = Type("EnumType", type(Example_enum))
    EnumValue = Type("EnumValue", type(Example_enum.value))
    Float = Type("Float", type(1.))
    Function = Type("Function", None)  # checked while typing node
    Method = Type("Method", None)  # checked while typing node
    Instance = Type("Instance", None)  # instance of user's custom class
    Int = Type("Int", type(1))
    Module = Type("Module", type(os))
    NoneType = Type("NoneType", type(None))
    String = Type("String", type('a'))
    Unexistent = Type("Unexistent", None)  # node yet undefined in the session

    # Store them so that they can easily be found from actual python types
    types_map = {}
    for cls in Type.instances():  # All `None` keys override each other..
        types_map[cls.type] = cls  # .. never mind.

    class Node(object):
        """Identifier and references to its parents and kids. It may have no
        parent, it is a root then.
        """
        def __init__(self, id, parent=None, type=Unexistent):
            """
            id: string the node's identifier: i.e. how it is written in
            the script.
            parent: Node its parent node in the graph, root node if None
            type: Type associated type with coloration etc
            """
            self.id = id
            self.parent = parent
            self._kids = {}  # {id: Node}
            self.type = type

        @property
        def leaf(self):
            """True if has no kids
            """
            return not bool(self._kids)

        @property
        def root(self):
            """True if parent is None or a Forest
            """
            return self.parent is None or isinstance(self.parent, Forest)

        def add_node(self, node):
            """basic procedure to add a node as a kid
            """
            node.parent = self
            self._kids[node.id] = node

        def add_id(self, id):
            """Create a new kid from a string id
            if it already exists, do not erase the existing one
            return the newly created node
            """
            node = self._kids.get(id)
            if node:
                return node
            node = Node(id=id, parent=self)
            self._kids[id] = node
            return node

        @property
        def parents(self):
            """iterate backwards until a root parent is found
            """
            yield self
            if self.parent:
                yield from self.parent.parents
            else:
                raise StopIteration()

        @property
        def path(self):
            """Use backward iteration to build the full path to this node
            """
            res = [parent.id for parent in self.parents]
            return '.'.join(reversed(res))

        @property
        def kids(self):
            """iterate over kids
            """
            return iter(self._kids.values())

        @property
        def leaves(self):
            """Iterate over all leaf kids
            """
            if self.leaf:
                yield self
            else:
                for kid in self.kids:
                    yield from kid.leaves

        def __iter__(self):
            """Iterate over all nodes, top-down
            """
            yield self
            for kid in self.kids:
                yield from kid

        def _repr(self, prefix):
            """Iterate over all nodes and print full paths
            """
            res = "{}{}: {}\n".format(prefix, self.id, self.type.id)
            for kid in self.kids:
                res += kid._repr(prefix + self.id + '.')
            return res

        def __repr__(self):
            return self._repr('')

        def __len__(self):
            """Number of nodes: ourselves as a node + the weight of our kids
            """
            return 1 + sum(len(kid) for kid in self.kids)

        def type_nodes(self, prefix=''):
            """Ultimate use of this forest: evaluate our id in the
            current context to retrieve information on the current state
            of this access path
            prefix: string previous path (context) of this node
            called by the parents
            """
            path = prefix + self.id
            # analyse type of this node:
            try:
                t = eval("type({})".format(path), globals())
            except (AttributeError, NameError) as e:
                # then this node and every node below it is marked Unexistent
                for node in self:
                    node.type = Unexistent
                return
            # is the type available, special?
            node_type = types_map.get(t)
            if node_type:
                self.type = node_type
            else:
                # then it is just a plain valid, known node, probably an
                # instance of a custom class, or a function/method
                # inelegant way to get these functions into eval's scope:
                if eval("f({})".format(path), globals(),
                        {'f': inspect.ismethod}):
                    self.type = Method
                elif eval("f({})".format(path), globals(),
                          {'f': inspect.isfunction}):
                    self.type = Function
                elif eval("f({})".format(path), globals(),
                          {'f': inspect.isclass}):
                    if eval("f({})".format(path), globals(),
                            {'f': lambda c: issubclass(c, Enum)}):
                        self.type = EnumType
                    else:
                        self.type = Class
                elif eval("f({})".format(path), globals(),
                          {'f': lambda i: isinstance(i, Enum)}):
                    self.type = EnumValue
                else:
                    self.type = Instance
            for kid in self.kids:
                kid.type_nodes(path + '.')

        def write(self, prefix, depth, file=stdout):
            """Build a vim syntax command to color this node, given
            information recursively given from above:
            prefix: string prefix to the command, build from above
            depth: int our depth within the forest, build from above
            file: send there the resulting commands: once on each node
            """
            # match expressions from the root, but only color the leaf:
            suffix = r"\>'hs=e-" + str(len(self.id) - 1)
            # allow any amount of whitespace around the '.' operator
            whitespace = r"[ \s\t\n]*\.[ \s\t\n]*"
            # for speed, provide Vim information about the items inclusions:
            if not self.root:
                suffix += " contained"
            if self.leaf:
                suffix += " contains=NONE"
            if not self.leaf:
                # watch out: here is an additional iteration on kids! **
                subgroups = {sub.type.id for sub in self.kids}
                suffix += " contains=" + ','.join(subgroups)
            # here is the full command:
            command = "syntax match " + self.type.id + prefix + suffix
            # throw it up
            print(command, file=file)
            # ask the kids to do so :)
            for kid in self.kids:  # ** second iteration, could be the only one
                if kid.type is not Unexistent:  # skip nonexistent nodes to keep the file light
                    kid.write(prefix + whitespace + kid.id, depth + 1, file)

    class Forest(Node):
        """A Forest is a special Node with no parent, no id, and containing
        only root nodes.
        """
        def __init__(self):
            self._kids = {}

        @property
        def parents(self):
            """A forest has no parents
            """
            # yield nothing instead of raising StopIteration (PEP 479)
            yield from ()

        def __repr__(self):
            if self.leaf:
                return "empty Forest."
            res = ""
            for kid in self.kids:
                res += repr(kid)
            return res

        def __len__(self):
            """Total number of nodes in the forest:
            """
            return sum(len(kid) for kid in self.kids)

        def __iter__(self):
            """Iterate over all trees, not ourselves
            """
            for kid in self.kids:
                yield from kid

        def type_nodes(self):
            """Ask each tree to type itself
            """
            for kid in self.kids:
                kid.type_nodes()

        def write(self, file=stdout):
            """Visit the forest to build an ad-hoc vim syntax file and color
            the nodes in the source file.
            """
            # the root name starts without being a subname of something else:
            root_prefix = r" '\([a-zA-Z][a-zA-Z0-9]*[ \s\t\n]*\.[ \s\t\n]*\)\@<!\<"
            for kid in self.kids:
                if kid.type is not Unexistent:  # skip nonexistent roots to keep the file light
                    kid.write(root_prefix + kid.id, 0, file=file)
            # signal to Intim: the syntax file may be read now!
            print('" end', file=file)

    # Start lexing!
    lx = pylex.Python3Lexer()
    g = lx.get_tokens(source)
    # gather names to color as a forest of '.' operators:
    forest = Forest()
    current = forest
    # flag tracking whether the next name starts a new tree at the root
    # or nests under the current node
    last_was_a_name = True
    # Also gather misc immediate tokens.. just for fun and extensibility
    misc = {}
    for i in [
            Token.Name.Decorator,
            Token.Name.Namespace,
            Token.Name.Operator,
            Token.Name.Keyword,
            Token.Name.Literal,
            Token.Comment,
    ]:
        misc[i] = set()
    # iterate over type_of_token, string
    for t, i in g:
        in_misc = False
        for subtype, harvest in misc.items():
            if is_token_subtype(t, subtype):
                in_misc = True
                harvest.add(i)
                break
        if in_misc:
            # no need to go further: this token does not belong to the forest
            continue
        if is_token_subtype(t, Token.Name):
            node = forest if last_was_a_name else current
            current = node.add_id(i)
            last_was_a_name = True
        elif is_token_subtype(t, Token.Operator):
            if i == '.':
                last_was_a_name = False

    # gather information on node types
    forest.type_nodes()

    # Write the vimscript commands to the syntax file:
    filename = INTIMSYNTAXFILE  # sed by vimscript
    with open(filename, 'w') as file:
        forest.write(file)
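
# Not part of the original example: a minimal, self-contained sketch of the
# same '.'-chaining idea, using a plain nested dict instead of the Node and
# Forest classes above. Function and variable names here are illustrative only.
from pygments.lexers import Python3Lexer
from pygments.token import Token, is_token_subtype

def dotted_name_tree(source):
    """Group Name tokens into a nested {name: {subname: ...}} dict along '.' chains."""
    tree = {}
    current = tree
    last_was_a_name = True
    for ttype, text in Python3Lexer().get_tokens(source):
        if is_token_subtype(ttype, Token.Name):
            # a name after a '.' nests under the current node,
            # otherwise it starts a new chain at the root:
            base = current if not last_was_a_name else tree
            current = base.setdefault(text, {})
            last_was_a_name = True
        elif is_token_subtype(ttype, Token.Operator) and text == '.':
            last_was_a_name = False
    return tree

# e.g. dotted_name_tree("os.path.join(a, b)")
# -> {'os': {'path': {'join': {}}}, 'a': {}, 'b': {}}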
Example #31
0
lexer = get_lexer_for_filename("jsFile.js")

f = open('really_big_file.dat')

corpora = Folder(sys.argv[1]).fullFileNames("*.js")

for path_corpus in [c for c in corpora if 'orig' in c or 'no_renaming' in c or 'hash_def_one_renaming' in c]:
    print os.path.basename(path_corpus)
    f = open(path_corpus)

    names = set([])
    
    for piece in read_in_chunks(f):
        #process_data(piece)
        
        tokens = lex(piece, lexer).tokenList
        
        names.update([token for (token_type, token) in tokens
                 if is_token_subtype(token_type, Token.Name)])
    
    cnt = Counter(names) 
    print ' ', len(cnt.keys()), 'names'
    s = stats.describe(cnt.values())
    print '  min =', s[1][0]
    print '  max =', s[1][1]
    print '  mean =', s[2]
    print '  variance =', s[3]
    print '  median =', median(cnt.values())

    print
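
# The example above relies on a read_in_chunks() helper that is not shown in
# this listing. A minimal sketch of such a generator (an assumption about the
# missing helper, not the original code) could look like:
def read_in_chunks(file_object, chunk_size=1024 * 1024):
    """Yield successive fixed-size chunks from a file object until EOF."""
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data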
Example #32
0
def processTranslation(translation, iBuilder_clear, 
                       scopeAnalyst, lm_path, f,
                       output_path, base_name, clear):
    
    nc = []
    
    def writeTmpLines(lines, out_file_path):
        js_tmp = open(out_file_path, 'w')
        js_tmp.write('\n'.join([' '.join([token for (_token_type, token) in line]) 
                                for line in lines]).encode('utf8'))
        js_tmp.write('\n')
        js_tmp.close()
        
        
    if translation is not None:

        # Compute scoping 
        try:
            # name2Xscope are dictionaries where keys are (name, start_index) 
            # tuples and values are scope identifiers. Note: start_index is a 
            # flat (unidimensional) index, not (line_chr_idx, col_chr_idx).
            name2defScope = scopeAnalyst.resolve_scope()
#             name2useScope = scopeAnalyst.resolve_use_scope()
            # isGlobal has similar structure and returns True/False
            isGlobal = scopeAnalyst.isGlobal
            # name2pth has similar structure and returns AST depths
#             name2pth = scopeAnalyst.resolve_path()
            # nameOrigin[(name, def_scope)] = depth
#             nameOrigin = scopeAnalyst.nameOrigin
        except:
            return False
    
        name_candidates = {}
        
        # Collect names and their locations in various formats
        # that will come in handy later:
        
        # Which locations [(line number, index within line)] does
        # a variable name appear at?
        name_positions = {}
        
        # Which variable name is at a location specified by 
        # [line number][index within line]?
        position_names = {}
        
        for line_num, line in enumerate(iBuilder_clear.tokens):
            position_names.setdefault(line_num, {})
            
            for line_idx, (token_type, token) in enumerate(line):
                if is_token_subtype(token_type, Token.Name):
                    (l,c) = iBuilder_clear.tokMap[(line_num, line_idx)]
                    p = iBuilder_clear.flatMap[(l,c)]
                    
                    if not isGlobal.get((token, p), True):

                        def_scope = name2defScope[(token, p)]
                    
                        name_positions.setdefault((token, def_scope), [])
                        name_positions[(token, def_scope)].append((line_num, line_idx))
                        position_names[line_num][line_idx] = (token, def_scope)
    
        # Parse moses output. 
        
        lines_translated = set([])
        translations = {}
        
        for line in translation.split('\n'):
        
            parts = line.split('|||')
            if not len(parts[0]):
                continue

            # The index of the line in the input to which this
            # translated line corresponds, starting at 0:
            n = int(parts[0])
            lines_translated.add(n)

            # The translation:
            translation = parts[1].strip()
            translation_parts = translation.split(' ')

            # Only keep translations that have exactly the same
            # number of tokens as the input; if the counts differ,
            # fall back to copying the input line
            if len(translation_parts) != len(iBuilder_clear.tokens[n]):
                translation_parts = [token for (token_type, token) \
                                        in iBuilder_clear.tokens[n]]
                translation = ' '.join(translation_parts)
            
            # An input can have identical translations, but with
            # different scores (the number of different translations
            # per input is controlled by the -n-best-list decoder
            # parameter). Keep only unique translations.
            translations.setdefault(n, set([]))
            translations[n].add(translation)
           
            #print n, translation_parts 
 
            # Which within-line indices have non-global var names? 
            line_dict = position_names.get(n, {})
            
            # For each variable name, record its candidate translation
            # and the lines (among the -n-best-list) it appears on
            for line_idx in line_dict.keys():
                
                # The original variable name
                (name, def_scope) = line_dict[line_idx]
                
                # The translated variable name
                name_translation = translation_parts[line_idx]
                
                # Record the line number (we will give more weight
                # to names that appear on many translation lines) 
                name_candidates.setdefault((name, def_scope), {})
                name_candidates[(name, def_scope)].setdefault(name_translation, set([]))
                name_candidates[(name, def_scope)][name_translation].add(n)            
  
                
#         for (name, def_scope), d in name_candidates.iteritems():
#             nc.append( (def_scope, name, ','.join(d.keys())) )
  
                #print name, name_translation, n, def_scope
        
        
        def computeFreqLenRenaming(lines, name_candidates, name_positions):
            renaming_map = {}
            seen = {}
            
            # There is no uncertainty about the translation for
            # variables that have a single candidate translation
            for ((name, def_scope), val) in [((name, def_scope), val) 
                         for (name, def_scope), val in name_candidates.items() 
                         if len(val.keys()) == 1]:
                             
                candidate_name = val.keys()[0]
                
                # Don't use the same translation for different
                # variables within the same scope.
                if not seen.has_key((candidate_name, def_scope)):
                    renaming_map[(name, def_scope)] = candidate_name
                    seen[(candidate_name, def_scope)] = True
                else:
                    renaming_map[(name, def_scope)] = name
                
            # For the remaining variables, choose the translation 
            # that has the longest name
            
            token_lines = []
            for (name, def_scope), pos in name_positions.iteritems():
                # pos is a list of tuples [(line_num, line_idx)]
                token_lines.append(((name, def_scope), \
                                len(set([line_num \
                                         for (line_num, _line_idx) in pos]))))
                
            # Sort names by how many lines they appear 
            # on in the input, descending
            token_lines = sorted(token_lines, \
                         key=lambda ((name, def_scope), num_lines): -num_lines)
            
            for (name, def_scope), _num_lines in token_lines:
                # Sort candidates by how many lines in the translation
                # they appear on, and by name length, both descending
                candidates = sorted([(name_translation, len(line_nums)) \
                                     for (name_translation,line_nums) \
                                     in name_candidates[(name, def_scope)].items()], 
                                    key=lambda e:(-e[1],-len(e[0])))
                
                if len(candidates) > 1:
                    unseen_candidates = [candidate_name 
                                         for (candidate_name, _occurs) in candidates
                                         if not seen.has_key((candidate_name, def_scope))]
                    
                    if len(unseen_candidates):
                        candidate_name = unseen_candidates[0]
                        
                        renaming_map[(name, def_scope)] = candidate_name
                        seen[(candidate_name, def_scope)] = True
                    else:
                        renaming_map[(name, def_scope)] = name
                        seen[(name, def_scope)] = True
                    
            return renaming_map
        
        
        def computeLenRenaming(lines, name_candidates, name_positions):
            renaming_map = {}
            seen = {}
            
            # There is no uncertainty about the translation for
            # variables that have a single candidate translation
            for ((name, def_scope), val) in [((name, def_scope), val) 
                         for (name, def_scope), val in name_candidates.items() 
                         if len(val.keys()) == 1]:
                
                candidate_name = val.keys()[0]
                
                if not seen.has_key((candidate_name, def_scope)):
                    renaming_map[(name, def_scope)] = candidate_name
                    seen[(candidate_name, def_scope)] = True
                else:
                    renaming_map[(name, def_scope)] = name
                
            # For the remaining variables, choose the translation that 
            # has the longest name
            token_lines = []
            
            for (name, def_scope), pos in name_positions.iteritems():
                token_lines.append(((name, def_scope), \
                                    len(set([line_num \
                                         for (line_num, _line_idx) in pos]))))
                
            # Sort names by how many lines they appear 
            # on in the input, descending
            token_lines = sorted(token_lines, 
                                 key=lambda ((name, def_scope), num_lines): -num_lines)
            
            for (name, def_scope), _num_lines in token_lines:
                
                # Sort candidates by length of translation, descending
                candidates = sorted([(name_translation, len(line_nums)) \
                                     for (name_translation,line_nums) \
                                     in name_candidates[(name, def_scope)].items()],
                                    key=lambda e:-len(e[0]))
                
                if len(candidates) > 1:
                    unseen_candidates = [candidate_name 
                                         for (candidate_name, _occurs) in candidates
                                         if not seen.has_key((candidate_name, def_scope))]
                    
                    if len(unseen_candidates):
                        candidate_name = unseen_candidates[0]
                        
                        renaming_map[(name, def_scope)] = candidate_name
                        seen[(candidate_name, def_scope)] = True
                    else:
                        renaming_map[(name, def_scope)] = name
                        seen[(name, def_scope)] = True
                    
            return renaming_map
        
        
        def computeLMRenaming(lines, name_candidates, name_positions, lm_path):
            renaming_map = {}
            seen = {}

            #print name_candidates

            # There is no uncertainty about the translation for
            # variables that have a single candidate translation
            for ((name, def_scope), val) in [((name, def_scope), val) 
                         for (name, def_scope), val in name_candidates.items() 
                         if len(val.keys()) == 1]:
                             
                candidate_name = val.keys()[0]
                
                if not seen.has_key((candidate_name, def_scope)):
                    renaming_map[(name, def_scope)] = candidate_name
                    seen[(candidate_name, def_scope)] = True
                else:
                    renaming_map[(name, def_scope)] = name
                
            # For the remaining variables, choose the translation that 
            # gives the highest language model log probability
            
            token_lines = []
            
            for (name, def_scope), pos in name_positions.iteritems():
                token_lines.append(((name, def_scope), \
                                    len(set([line_num \
                                         for (line_num, _line_idx) in pos]))))
                
            # Sort names by how many lines they appear 
            # on in the input, descending
            token_lines = sorted(token_lines, 
                                 key=lambda ((name, def_scope), num_lines): -num_lines)
            
            for (name, def_scope), _num_lines in token_lines:
                # Sort candidates by how many lines in the translation
                # they appear on, and by name length, both descending
                candidates = sorted([(name_translation, len(line_nums)) \
                                     for (name_translation,line_nums) \
                                     in name_candidates[(name, def_scope)].items()], 
                                    key=lambda e:(-e[1],-len(e[0])))
                
                if len(candidates) > 1:

                    log_probs = []
                    
                    unseen_candidates = [candidate_name 
                                         for (candidate_name, _occurs) in candidates
                                         if not seen.has_key((candidate_name, def_scope))]
                    
                    if len(unseen_candidates):
                        
                        for candidate_name in unseen_candidates:
                            line_nums = set([num \
                                for (num,idx) in name_positions[(name, def_scope)]])
                            
                            draft_lines = []
                            
                            for line_num in line_nums:
                                draft_line = [token for (token_type, token) 
                                              in lines[line_num]]
                                for line_idx in [idx 
                                                 for (num, idx) in name_positions[(name, def_scope)] 
                                                 if num == line_num]:
                                    draft_line[line_idx] = candidate_name
                                    
                                draft_lines.append(' '.join(draft_line))
                                
                                
                            line_log_probs = []
                            for line in draft_lines:
                                lmquery = LMQuery(lm_path=lm_path)
                                (lm_ok, lm_log_prob, _lm_err) = lmquery.run(line)
                                
                                #print _lm_err

                                if not lm_ok:
                                    lm_log_prob = -9999999999
                                line_log_probs.append(lm_log_prob)

                            if not len(line_log_probs):
                                lm_log_prob = -9999999999
                            else:
                                lm_log_prob = float(sum(line_log_probs)/len(line_log_probs))
            
                            log_probs.append((candidate_name, lm_log_prob))
                            #print candidate_name, log_probs
                        
                        candidate_names = sorted(log_probs, key=lambda e:-e[1])
                        candidate_name = candidate_names[0][0]
                        
                        renaming_map[(name, def_scope)] = candidate_name
                        seen[(candidate_name, def_scope)] = True
                    else:
                        renaming_map[(name, def_scope)] = name
                        seen[(name, def_scope)] = True
            #print renaming_map       
            return renaming_map

            
        def rename(lines, renaming_map):
            draft_translation = deepcopy(lines)
            
            for (name, def_scope), renaming in renaming_map.iteritems():
                for (line_num, line_idx) in name_positions[(name, def_scope)]:
                    (token_type, name) = draft_translation[line_num][line_idx]
                    draft_translation[line_num][line_idx] = (token_type, renaming)

            return draft_translation
            

#         def replaceLiterals(lines, revLiteralsMap):
#             draft_translation = deepcopy(lines)
#             # Replace back literals
#             lineLengths = [len(l) for l in lines]
#             idx = 0
#             sumIdx = 0
#             for (flatIdx, literal) in revLiteralsMap:
#                 while flatIdx > sumIdx + lineLengths[idx]:
#                     sumIdx += lineLengths[idx]
#                     idx += 1
#                 (token_type, name) = draft_translation[idx][flatIdx-sumIdx]
#                 draft_translation[idx][flatIdx-sumIdx] = (token_type, literal)
#             return draft_translation

        
        strategy = f.split('.')[1]
        
        renaming_map = computeLMRenaming(iBuilder_clear.tokens, 
                                          name_candidates, 
                                          name_positions,
                                          lm_path)
        for (name, def_scope), renaming in renaming_map.iteritems():
            nc.append( (strategy+'.lm', def_scope, renaming, name, 
                        ','.join(name_candidates[(name, def_scope)])) )
        
        lm_translation = rename(iBuilder_clear.tokens, renaming_map)

        writeTmpLines(lm_translation, f[:-3] + '.lm.js')
        ok = clear.run(f[:-3] + '.lm.js', 
                       os.path.join(output_path, 
                                    '%s.%s.lm.js' % (base_name, strategy)))
        if not ok:
            return False
        
        
        renaming_map = computeLenRenaming(iBuilder_clear.tokens, 
                                            name_candidates, 
                                            name_positions)
        for (name, def_scope), renaming in renaming_map.iteritems():
            nc.append( (strategy+'.len', def_scope, renaming, name, 
                        ','.join(name_candidates[(name, def_scope)])) )
        
        len_translation = rename(iBuilder_clear.tokens, renaming_map)
        
        writeTmpLines(len_translation, f[:-3] + '.len.js')
        ok = clear.run(f[:-3] + '.len.js', 
                       os.path.join(output_path, 
                                    '%s.%s.len.js' % (base_name, strategy)))
        if not ok:
            return False

        
        renaming_map = computeFreqLenRenaming(iBuilder_clear.tokens, 
                                            name_candidates, 
                                            name_positions)
        for (name, def_scope), renaming in renaming_map.iteritems():
            nc.append( (strategy+'.freqlen', def_scope, renaming, name, 
                        ','.join(name_candidates[(name, def_scope)])) )
        
        freqlen_translation = rename(iBuilder_clear.tokens, renaming_map)
        
        writeTmpLines(freqlen_translation, f[:-3] + '.freqlen.js')
        ok = clear.run(f[:-3] + '.freqlen.js', 
                       os.path.join(output_path, 
                                    '%s.%s.freqlen.js' % (base_name, strategy)))
        if not ok:
            return False

    return nc
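
# Not part of the original: a small sketch of the candidate-ranking rule used
# by computeFreqLenRenaming above. Candidates are sorted by how many n-best
# lines they appear on (descending), breaking ties by name length (descending).
# The sample data below is made up for illustration.
def rank_candidates(candidates):
    """candidates: dict mapping a translated name to the set of line numbers it appears on."""
    return sorted(candidates.items(),
                  key=lambda item: (-len(item[1]), -len(item[0])))

# rank_candidates({'i': {0, 1, 2}, 'index': {0, 1, 2}, 'j': {4}})
# -> [('index', {0, 1, 2}), ('i', {0, 1, 2}), ('j', {4})]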
Example #33
0
    def semantics(cls, tree_node, symbol_table, check=False):
        if isinstance(tree_node, AST_N_Node):
            if tree_node.n == '<HOST_GRP>':
                assert tree_node.data
                # if TT_EXCLMARK == tree_node.data[0][1]:  # invert
                if len(tree_node.data) == 2\
                        and isinstance(tree_node.data[0], AST_T_Node)\
                        and token.is_token_subtype(tree_node.data[0].data[1], TT_EXCLMARK):
                    _yes, _no = cls.semantics(tree_node.data[1],
                                              symbol_table,
                                              check=check)
                    return _no, _yes

                elif len(tree_node.data) >= 3\
                        and tree_node.data[1].n == '<HOST_GRP>'\
                        and tree_node.data[2].n == '<HOST_PARENS_CONTD>':
                    _gyes, _gno = cls.semantics(tree_node.data[1],
                                                symbol_table,
                                                check=check)
                    _pyes, _pno = cls.semantics(tree_node.data[2],
                                                symbol_table,
                                                check=check)
                    if check:
                        assert cls.disjoint_sets(_gyes, _gno, _pyes, _pno)

                    return _gyes | _pyes, _gno | _pno

                elif (len(tree_node.data) == 1
                      and tree_node.data[0].n == '<HOST_EXPR>'):
                    return cls.semantics(tree_node.data[0],
                                         symbol_table,
                                         check=check)
                else:
                    assert False

            elif tree_node.n == '<HOST_EXPR>':
                assert tree_node.data
                assert len(tree_node.data) == 1
                _host = cls.interpret_host(tree_node.data[0], symbol_table)
                return {_host}, set()

            elif tree_node.n == '<HOST_PARENS_CONTD>':
                if tree_node.data:
                    assert tree_node.data[1].n == '<HOST_GRP>'
                    _gyes, _gno = cls.semantics(tree_node.data[1],
                                                symbol_table,
                                                check=check)
                    if len(tree_node.data) == 3:
                        _pyes, _pno = cls.semantics(tree_node.data[2],
                                                    symbol_table,
                                                    check=check)
                        if check:
                            assert cls.disjoint_sets(_gyes, _gno, _pyes, _pno)

                        _gyes |= _pyes
                        _gno |= _pno
                    return _gyes, _gno

            else:
                assert False

        return set(), set()
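
# Not in the original: semantics() above returns a pair of host sets
# (matched, excluded), and a leading '!' simply swaps the two. A minimal
# illustration of that convention, with made-up host names:
def invert_host_group(group):
    yes, no = group
    return no, yes

assert invert_host_group(({'web1', 'web2'}, {'db1'})) == ({'db1'}, {'web1', 'web2'})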
Example #34
0
    def parse(self, tokens, ignore_ws=False):
        """
        """
        if ignore_ws:
            tokens = [
                _t for _t in tokens
                if not token.is_token_subtype(token.Token.Text.Whitespace, _t)
            ]

        tokens.extend([self.__DOLLAR])
        t_it = iter(tokens)
        t = next(t_it)

        _root_parent = AST_N_Node('<$>', parent=AST_Node.TN_NO_PARENT)
        ast_root = AST_N_Node(self.S, parent=_root_parent)
        _root_parent.data = [ast_root, self.__DOLLAR]
        top_node = None

        for top_node in ast_root:
            try:
                if top_node is self.__DOLLAR or t is self.__DOLLAR:
                    break

                elif isinstance(top_node, AST_T_Node):  # Terminal -> Match
                    term = top_node.data
                    if self._match_T(term, t):
                        top_node.data = t
                    else:
                        self.SyntaxError(node_root=ast_root,
                                         node_ptr=top_node,
                                         tokens=tokens,
                                         got=t)

                    t = next(t_it)

                elif isinstance(top_node,
                                AST_N_Node):  # Nonterminal -> Predict/Expand
                    nonterm = top_node.n
                    rule = self._match_N(nonterm, t)
                    if not rule:
                        self.SyntaxError(node_root=ast_root,
                                         node_ptr=top_node,
                                         tokens=tokens,
                                         got=t)

                    production = rule[1][:]
                    if production:
                        top_node.expand_node(production, lambda p: p in self.T)
                    else:
                        top_node.expand_node([])

                else:
                    raise Exception('Lexical Error: Unknown Lexem: {0}'.format(
                        type(top_node)))

            except StopIteration:
                break

        if top_node is self.__DOLLAR and t is self.__DOLLAR:  # ok
            pass
        elif top_node is self.__DOLLAR:  # Overflow
            self.SyntaxError(node_root=ast_root,
                             node_ptr=top_node,
                             tokens=tokens,
                             got=t)
        elif t is self.__DOLLAR:  # Underflow
            self.SyntaxError(node_root=ast_root,
                             node_ptr=top_node,
                             tokens=tokens,
                             got=t)
        else:
            raise Exception('Unknown Error: It: {}, Node: {}'.format(
                t, str(top_node)))

        _nodes = ast_root.get_nodes(depth=None)
        for _n in _nodes:
            print(str(_n.n if isinstance(_n, AST_N_Node) else _n.data))
        return ast_root
Example #35
0
    def _match_T(self, s_top_T, _token):
        return token.is_token_subtype(_token[1], s_top_T)
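
# Not part of the original: a tiny usage sketch of the _match_T idea above.
# A terminal expected by the parser matches an incoming token when the token's
# type (stored at index 1, as in _match_T) is a subtype of the expected
# pygments token type. The sample token below is made up.
from pygments.token import Token, is_token_subtype

expected_terminal = Token.Name
incoming_token = ('some_identifier', Token.Name.Function)
assert is_token_subtype(incoming_token[1], expected_terminal)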