Example #1
def hash_node(node: bblfsh.Node,
              ignore_sideness: bool = True) -> hashlib._hashlib.HASH:
    """ Hashes a node ignoring positional information """

    lroles = [
        str(i) for i in node.roles
        if i not in (bblfsh.role_id("LEFT"), bblfsh.role_id("RIGHT"))
    ]

    _hash = hashlib.md5()
    stuff = [node.internal_type, node.token] + lroles

    for prop, value in sorted(node.properties.items()):
        if ignore_sideness and ('left' in value.lower() or 'right' in value.lower()):
            continue
        stuff.append(prop)
        stuff.append(value)

    child_hashes = []
    for child in node.children:
        child_hashes.append(
            hash_node(child, ignore_sideness).hexdigest().encode('utf-8'))

    stuff.extend(sorted(child_hashes))

    for s in stuff:
        _hash.update(str(s).encode('utf-8'))

    return _hash
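A minimal usage sketch for hash_node, assuming a local bblfshd instance and the v1 Python client used throughout these examples; the file name and the XPath query are placeholders.

import bblfsh

client = bblfsh.BblfshClient("0.0.0.0:9432")  # assumes a running bblfshd
uast = client.parse("Example.java").uast      # placeholder file name

# report subtrees that hash to the same digest, e.g. duplicated expressions
seen = {}
for node in bblfsh.filter(uast, "//InfixExpression"):
    digest = hash_node(node).hexdigest()
    if digest in seen:
        print("duplicate of line", seen[digest], "at line", node.start_position.line)
    else:
        seen[digest] = node.start_position.line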
Example #2
def check(uast):
    findings = []


    binexpr_nodes = bblfsh.filter(uast, "//InfixExpression[@roleBinary and @roleExpression]")

    for node in binexpr_nodes:
        left = None
        right = None

        for c in node.children:
            if bblfsh.role_id("LEFT") in c.roles:
                left = c

            elif bblfsh.role_id("RIGHT") in c.roles:
                right = c

            elif c.token in ["=", "*", "+"]:
                left = None
                right = None
                break

            if left and right:
                break

        if not left or not right:
            continue

        if utils.hash_node(left).hexdigest() == utils.hash_node(right).hexdigest():
            findings.append({"msg": "Equal terms on both sides of binary expression, ",
                             "pos": node.start_position})

    return findings
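A hedged driver for the check above; the endpoint and file name are placeholders, and the parse call follows the v1 client used in the other examples.

import bblfsh

client = bblfsh.BblfshClient("0.0.0.0:9432")  # assumes a running bblfshd
uast = client.parse("SelfComparison.java").uast  # placeholder file name
for finding in check(uast):
    print("%s (line %d)" % (finding["msg"], finding["pos"].line))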
Example #3
    def generate_typos_fixes(self,
                             changes: Sequence[Change]) -> Iterator[TypoFix]:
        """
        Generate all data about typo fix required for any type of further processing.

        The processing can be comment generation or performance report generation.

        :param changes: The list of changes in the pointed state.
        :return: Iterator with unrendered data per comment.
        """
        base_files_by_lang = files_by_language(c.base for c in changes)
        head_files_by_lang = files_by_language(c.head for c in changes)
        for lang, head_files in head_files_by_lang.items():
            for file in filter_files(
                    files=head_files,
                    line_length_limit=self.config["line_length_limit"],
                    overall_size_limit=self.config["overall_size_limit"],
                    log=self._log):
                try:
                    prev_file = base_files_by_lang[lang][file.path]
                except KeyError:
                    lines = []
                    old_identifiers = set()
                else:
                    lines = find_new_lines(prev_file, file)
                    old_identifiers = {
                        node.token
                        for node in uast2sequence(prev_file.uast)
                        if bblfsh.role_id("IDENTIFIER") in node.roles
                        and bblfsh.role_id("IMPORT") not in node.roles
                        and node.token
                    }
                changed_nodes = extract_changed_nodes(file.uast, lines)
                new_identifiers = [
                    node for node in changed_nodes
                    if bblfsh.role_id("IDENTIFIER") in node.roles
                    and bblfsh.role_id("IMPORT") not in node.roles
                    and node.token and node.token not in old_identifiers
                ]
                if not new_identifiers:
                    continue
                suggestions = self.check_identifiers(
                    [n.token for n in new_identifiers])
                for index in suggestions.keys():
                    corrections = suggestions[index]
                    for token in corrections.keys():
                        yield TypoFix(
                            head_file=file,
                            token=new_identifiers[index].token,
                            candidates=[
                                Candidate(*c[:2]) for c in corrections[token]
                            ],
                            line_number=new_identifiers[index].start_position.line,
                        )
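A sketch of how the yielded TypoFix values might be consumed; `analyzer` and `changes` are placeholders for an IdTyposAnalyzer-like instance and the changes of a review, neither of which is constructed here.

# placeholders: `analyzer` is an IdTyposAnalyzer-like object, `changes` a Sequence[Change]
for typo_fix in analyzer.generate_typos_fixes(changes):
    print("%s:%d possible typo in %r" % (
        typo_fix.head_file.path, typo_fix.line_number, typo_fix.token))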
Example #4
def rule_chk(uast):
    findings = []
    is_left_literal = False
    is_right_literal = False

    left_node_pos = None
    right_node_pos = None

    #query = "//WhileStatement//InfixExpression"
    #query = "//*[@roleWhile and @roleBinary and @roleCondition and  @roleExpression]"
    #query = "//*[@roleWhile and @roleStatement and not(@roleBody)]"
    query = "//*[@roleWhile and @roleStatement and not(@roleBody)]//*[@roleRelational and @roleExpression and @roleBinary and @roleOperator] "
    print(query)
    node = filter_uast(uast, query)
    i = 0
    for n in node:
        i = i + 1
        print('Node :{0}'.format(i))
        is_left_literal = False
        is_right_literal = False

        left_node_pos = None
        right_node_pos = None

        j = 0
        for child in n.children:
            j = j + 1
            print('Iteration {0} for node {1} '.format(j, i))
            #print(child)

            if (bblfsh.role_id("NUMBER")
                    in child.roles) & (bblfsh.role_id("LEFT") in child.roles):
                is_left_literal = True
                left_node_pos = child.start_position.line

            if (bblfsh.role_id("NUMBER")
                    in child.roles) & (bblfsh.role_id("RIGHT") in child.roles):
                is_right_literal = True
                right_node_pos = child.start_position.line

        if is_left_literal and is_right_literal:
            findings.append({
                "msg": "Number literals found in while condition",
                "left literal at line": left_node_pos,
                "right literal at line": right_node_pos
            })

    return findings
Example #5
def check(uast):
    findings = []

    switches = bblfsh.filter(uast, "//SwitchStatement")
    for i in switches:
        cases = bblfsh.filter(i, "//SwitchCase")

        for c in cases:
            if bblfsh.role_id("DEFAULT") in c.roles:
                break
        else:
            findings.append({
                "msg": "Switch without default case",
                "pos": i.start_position
            })

    return findings
Example #6
def check(uast):
    findings = []


    switches = bblfsh.filter(uast, "//SwitchStatement")
    for i in switches:
        cases = list(bblfsh.filter(i, "//SwitchCase"))
        if not cases:
            continue

        for r in range(len(cases)):
            c = cases[r]
            if bblfsh.role_id('DEFAULT') in c.roles and r != (len(cases) - 1):
                findings.append({"msg": "'default' should be the last switch case",
                                 "pos": c.start_position})

    return findings
Example #7
def check(uast):
    findings = []

    format_calls = bblfsh.filter(
        uast, "//MethodInvocation/"
        "Identifier[@roleCall and @roleReceiver and @Name='String']/parent::MethodInvocation/"
        "Identifier[@roleCall and @roleCallee and @Name='format']/parent::MethodInvocation"
    )

    for fcall in format_calls:
        args = list(bblfsh.filter(fcall, "//*[@internalRole='arguments']"))
        if len(args) == 0:
            continue

        format_str = args[0]
        if format_str.internal_type != 'String':
            # Validating format strings assigned elsewhere in the same file is possible,
            # but we won't do it here for the sake of brevity
            continue

        # For the reason stated above, we only validate %d
        str_val = format_str.properties["Value"]
        re_res = re.findall(r'(?<!%)%d', str_val)  # lookbehind: counts "%d" at string start, skips escaped "%%d"

        # Validate number of args
        if len(re_res) != len(args[1:]):
            findings.append({
                "msg": "Format string doesn't match number of args",
                "pos": format_str.start_position
            })

        # Validate type of args (for %d it should have the NumberLiteral role)
        for arg in args[1:]:
            froles = filter(lambda x: x == bblfsh.role_id('NUMBER'), arg.roles)
            if len(list(froles)) == 0:
                findings.append({
                    "msg": "Format string argument is not numeric",
                    "pos": arg.start_position
                })

    return findings
Example #8
    def testRoleIdName(self) -> None:
        self.assertEqual(role_id(role_name(1)), 1)
        self.assertEqual(role_name(role_id("IDENTIFIER")), "IDENTIFIER")
Example #9
    def analyze(self, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
                data_service: DataService, **data) -> [Comment]:
        """
        Return the list of `Comment`-s - found typo corrections.

        :param ptr_from: The Git revision of the fork point. Exists in both the original and \
                         the forked repositories.
        :param ptr_to: The Git revision to analyze. Exists only in the forked repository.
        :param data_service: The channel to the data service in Lookout server to query for \
                             UASTs, file contents, etc.
        :param data: Extra data passed into the method. Used by the decorators to simplify \
                     the data retrieval.
        :return: List of found review suggestions. Refer to \
                 lookout/core/server/sdk/service_analyzer.proto.
        """
        log = self.log
        comments = []
        changes = list(data["changes"])
        base_files_by_lang = files_by_language(c.base for c in changes)
        head_files_by_lang = files_by_language(c.head for c in changes)
        line_length = self.config.get("line_length_limit",
                                      self.DEFAULT_LINE_LENGTH_LIMIT)
        for lang, head_files in head_files_by_lang.items():
            for file in filter_files(head_files, line_length, log):
                try:
                    prev_file = base_files_by_lang[lang][file.path]
                except KeyError:
                    lines = []
                    old_identifiers = set()
                else:
                    lines = find_new_lines(prev_file, file)
                    old_identifiers = {
                        node.token
                        for node in uast2sequence(prev_file.uast)
                        if bblfsh.role_id("IDENTIFIER") in node.roles
                        and bblfsh.role_id("IMPORT") not in node.roles
                        and node.token
                    }
                changed_nodes = extract_changed_nodes(file.uast, lines)
                new_identifiers = [
                    node for node in changed_nodes
                    if bblfsh.role_id("IDENTIFIER") in node.roles
                    and bblfsh.role_id("IMPORT") not in node.roles
                    and node.token and node.token not in old_identifiers
                ]
                if not new_identifiers:
                    continue
                suggestions = self.check_identifiers(
                    [n.token for n in new_identifiers])
                for index in suggestions.keys():
                    corrections = suggestions[index]
                    for token in corrections.keys():
                        comment = Comment()
                        comment.file = file.path
                        corrections_line = " " + ", ".join(
                            "%s (%d%%)" %
                            (candidate[0], int(candidate[1] * 100))
                            for candidate in corrections[token])
                        comment.text = """
                            Possible typo in \"%s\". Suggestions:
                        """.strip(
                        ) % new_identifiers[index].token + corrections_line
                        comment.line = new_identifiers[
                            index].start_position.line
                        comment.confidence = int(corrections[token][0][1] *
                                                 100)
                        comments.append(comment)
        return comments
Example #10
from lookout.style.typos.utils import Candidate, Columns, flatten_df_by_column, TEMPLATE_DIR

# TODO(zurk): Split TypoFix to FileFixes and TypoFix. content, path and identifiers_number should
# be in the FileFixes.
TypoFix = NamedTuple(
    "TypoFix",
    (
        ("content", str),  # file content from head revision
        ("path", str),  # file path from head revision
        ("line_number", int),  # line number for the comment
        ("identifier", str),  # identifier where typo is found
        ("candidates", Iterable[Candidate]),  # suggested identifiers
        ("identifiers_number", int),  # number of unique analyzed identifiers
    ))

IDENTIFIER = bblfsh.role_id("IDENTIFIER")
IMPORT = bblfsh.role_id("IMPORT")
IDENTIFIER_INDEX_COLUMN = "identifier_index"


class IdTyposAnalyzer(Analyzer):
    """
    Identifier typos analyzer.
    """

    _log = logging.getLogger("IdTyposAnalyzer")
    model_type = IdTyposModel
    name = "lookout.style.typos"
    vendor = "source{d}"
    version = 1
    description = "Corrector of typos in source code identifiers."
Example #11
def analyze_uast(path: str, root: bblfsh.Node, roles: set, reserved: set):
    contents = Path(path).read_text()

    # walk the tree: collect nodes with assigned tokens and build the parents map
    node_tokens = []
    parents = {}
    queue = [root]
    while queue:
        node = queue.pop()
        for child in node.children:
            parents[id(child)] = node
        queue.extend(node.children)
        if node.token or node.start_position and node.end_position and not node.children:
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(contents)
    sentinel.start_position.line = contents.count("\n")
    node_tokens.append(sentinel)

    # scan `node_tokens` and analyze the gaps and the token prefixes and suffixes
    pos = 0
    ws = re.compile(r"\s+")
    alpha = re.compile("[a-zA-Z]+")
    IDENTIFIER = bblfsh.role_id("IDENTIFIER")
    log = logging.getLogger("analyze_uast")

    def ccheck(char: str) -> bool:
        return not char.isspace() and not char.isalnum() and ord(char) < 128

    for node in node_tokens:
        token = node.token if node.token else \
            contents[node.start_position.offset:node.end_position.offset]
        if node.start_position.offset > pos:
            diff = contents[pos:node.start_position.offset]
            parts = ws.split(diff)
            for part in parts:
                if len(part) >= 8:
                    continue
                # for keyword in alpha.finditer(part):
                #    reserved.add(keyword.group())
                for nonalpha in alpha.split(part):
                    for char in nonalpha:
                        if ccheck(char):
                            reserved.add(char)
        if node is sentinel:
            break
        pos = node.end_position.offset
        if IDENTIFIER not in node.roles:
            continue
        outer = contents[node.start_position.offset:node.end_position.offset]
        if outer == token:
            continue
        pos = outer.find(token)
        if pos < 0:
            log.warning(
                "skipped %s, token offset corruption \"%s\" vs. \"%s\"", path,
                token, outer)
            break
        if pos > 0:
            for char in outer[:pos]:
                if ccheck(char):
                    reserved.add(char)
        if pos + len(token) < len(outer):
            for char in outer[pos + len(token):]:
                if ccheck(char):
                    reserved.add(char)
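A hedged driver for analyze_uast above, assuming the file exists on disk and a bblfshd instance is reachable; the path and endpoint are placeholders.

import bblfsh

client = bblfsh.BblfshClient("0.0.0.0:9432")  # assumes a running bblfshd
path = "Example.java"                          # placeholder, must exist on disk
uast = client.parse(path).uast
roles, reserved = set(), set()
analyze_uast(path, uast, roles, reserved)
print("reserved characters found:", sorted(reserved))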
Example #12
import bblfsh

IDENTIFIER = bblfsh.role_id("IDENTIFIER")
QUALIFIED = bblfsh.role_id("QUALIFIED")
LITERAL = bblfsh.role_id("LITERAL")
OPERATOR = bblfsh.role_id("OPERATOR")
EXPRESSION = bblfsh.role_id("EXPRESSION")
LEFT = bblfsh.role_id("LEFT")
BINARY = bblfsh.role_id("BINARY")
ASSIGNMENT = bblfsh.role_id("ASSIGNMENT")
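A small sketch of how role-ID constants like these are typically matched against node.roles; count_identifiers is a hypothetical helper and uast is assumed to be an already parsed bblfsh.Node.

def count_identifiers(uast: bblfsh.Node) -> int:
    """Count nodes carrying the IDENTIFIER role (illustrative sketch only)."""
    count = 0
    queue = [uast]
    while queue:
        node = queue.pop()
        if IDENTIFIER in node.roles:
            count += 1
        queue.extend(node.children)
    return count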
Example #13
import bblfsh

IDENTIFIER = bblfsh.role_id("IDENTIFIER")
QUALIFIED = bblfsh.role_id("QUALIFIED")
LITERAL = bblfsh.role_id("LITERAL")
OPERATOR = bblfsh.role_id("OPERATOR")
EXPRESSION = bblfsh.role_id("EXPRESSION")
LEFT = bblfsh.role_id("LEFT")
BINARY = bblfsh.role_id("BINARY")
ASSIGNMENT = bblfsh.role_id("ASSIGNMENT")
FUNCTION = bblfsh.role_id("FUNCTION")
DECLARATION = bblfsh.role_id("DECLARATION")
NAME = bblfsh.role_id("NAME")
Example #14
    def hash_condition(if_node):
        for child in if_node.children:
            if bblfsh.role_id("CONDITION") in child.roles:
                return utils.hash_node(child).hexdigest()

        return None
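A hedged sketch of putting hash_condition to work: flag if-statements in one file that repeat an already seen condition. The //IfStatement query and the assumption that hash_condition is available at module level are illustrative only.

def check(uast):
    findings = []
    seen = set()
    for if_node in bblfsh.filter(uast, "//IfStatement"):
        digest = hash_condition(if_node)
        if digest is None:
            continue
        if digest in seen:
            findings.append({"msg": "Duplicated if condition",
                             "pos": if_node.start_position})
        seen.add(digest)
    return findings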
Example #15
def analyze_uast(path: str, content: str, root: bblfsh.Node,
                 internal_types: dict, roles: dict, reserved: set):
    """
    Fill internal types, roles and reserved dictionaries with statistics computed from a UAST.

    :param path: Path of the analyzed file.
    :param content: Content of the analyzed file.
    :param root: UAST of the analyzed file.
    :param internal_types: Dictionary containing the internal types statistics.
    :param roles: Dictionary containing the roles statistics.
    :param reserved: Set collecting the reserved characters (or tokens) statistics.
    """
    # walk the tree: collect nodes with assigned tokens and build the parents map
    node_tokens = []
    parents = {}
    queue = [root]
    while queue:
        node = queue.pop()
        internal_types[node.internal_type] += 1
        for role in node.roles:
            roles[role] += 1
        for child in node.children:
            parents[id(child)] = node
        queue.extend(node.children)
        if node.token or node.start_position and node.end_position and not node.children:
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(content)
    sentinel.start_position.line = content.count("\n")
    node_tokens.append(sentinel)

    # scan `node_tokens` and analyze the gaps and the token prefixes and suffixes
    pos = 0
    ws = re.compile(r"\s+")
    alpha = re.compile("[a-zA-Z]+")
    IDENTIFIER = bblfsh.role_id("IDENTIFIER")
    log = logging.getLogger("analyze_uast")

    def ccheck(char: str) -> bool:
        return not char.isspace() and not char.isalnum() and ord(char) < 128

    for node in node_tokens:
        token = node.token if node.token else \
            content[node.start_position.offset:node.end_position.offset]
        if node.start_position.offset > pos:
            diff = content[pos:node.start_position.offset]
            parts = ws.split(diff)
            for part in parts:
                if len(part) >= 8:
                    log.debug("Skipping weird part in code: %s. Path: %s",
                              diff, path)
                    continue
                for nonalpha in alpha.split(part):
                    for char in nonalpha:
                        if ccheck(char):
                            reserved.add(char)
        if node is sentinel:
            break
        pos = node.end_position.offset
        if IDENTIFIER not in node.roles:
            continue
        outer = content[node.start_position.offset:node.end_position.offset]
        if outer == token:
            continue
        pos = outer.find(token)
        if pos < 0:
            log.warning(
                "skipped %s, token offset corruption \"%s\" vs. \"%s\"", path,
                token, outer)
            break
        if pos > 0:
            for char in outer[:pos]:
                if ccheck(char):
                    reserved.add(char)
        if pos + len(token) < len(outer):
            for char in outer[pos + len(token):]:
                if ccheck(char):
                    reserved.add(char)
Example #16
    def testRoleIdName(self):
        assert role_id(role_name(1)) == 1
        assert role_name(role_id("IDENTIFIER")) == "IDENTIFIER"
Example #17
```
"""
import difflib
from itertools import islice
import random
from typing import Iterator, Optional, List, Sequence, Union

import bblfsh
import numpy

from tokenizer.tokenizer import CodeTokenizer
from tokenizer.virtual_node import Position, VirtualNode

INDENTATIOS = (" ", "\n", "\t")
QUOTES = ("'", '"')
LITERAL_ID = bblfsh.role_id("LITERAL")
STRING_ID = bblfsh.role_id("STRING")


def is_indentation(node: VirtualNode):
    """
    Check if input node is indentation.
    """
    for ch in node.value:
        if ch not in INDENTATIOS:
            return False
    return True


def is_literal_string(token):
    """