Example #1
0
def extract_comments(filename):
    """Extracts a list of comments from the given Python source file.

    Each extracted comment is additionally tagged (via tag_comments) with
    the piece of source code it is associated with.

    Comments are represented with the Comment class found in the common
    module. Python comments come in two forms, single and multi-line:
        - Single-line comments begin with '#' and continue to the end of
          line.
        - Multi-line comments are enclosed within triple double quotes as
          docstrings and can span multiple lines of code.

    Args:
        filename: String name of the file to extract comments from.
    Returns:
        Python list of common.Comment, sorted by each comment's
        start_line().
    Raises:
        common.FileError: File was unable to be open or read.
    """
    comments = []
    try:
        # The 'with' statement closes the file on exit, so no explicit
        # close() is needed and the handle is released before parsing.
        with open(filename, 'r') as source_file:
            file_contents = source_file.read()

        # The single-line pass appends to 'comments' and returns the text
        # that the later passes operate on.
        file_contents = parse_single_line_comments(file_contents, comments)
        parse_multi_line_comments(file_contents, comments)

        # The two passes run independently; restore a single line ordering.
        comments.sort(key=lambda x: x.start_line())

        tag_comments(file_contents, comments)
        return comments
    except OSError as exception:
        raise common.FileError(str(exception))
def extract_comments(filename):
    """Collects the comments contained in an HTML family source file.

    A comment is all text between a '<!--' marker and the next '-->'
    marker; comments do not nest. Comment markers that sit inside a
    double-quoted literal on a single line are skipped. Each comment is
    returned as a common.Comment carrying its text and the 1-based line on
    which it starts; comments that span several lines are created with
    multiline=True.

    Args:
        filename: String name of the file to extract comments from.
    Returns:
        Python list of common.Comment in the order that they appear in the
        file.
    Raises:
        common.FileError: File was unable to be open or read.
        common.UnterminatedCommentError: A '<!--' marker was found with no
            closing '-->'.
    """
    try:
        from bisect import bisect_left
        import re

        # Alternatives are tried left to right at each position: quoted
        # literal, one-line comment, comment spanning lines, and finally an
        # opening marker that never closes.
        pattern = r"""
            (?P<literal> (\"([^\"\n])*\")+) |
            (?P<single> <!--(?P<single_content>.*?)-->) |
            (?P<multi> <!--(?P<multi_content>(.|\n)*?)?-->) |
            (?P<error> <!--(.*)?)
        """

        matcher = re.compile(pattern, re.VERBOSE | re.MULTILINE)

        with open(filename, 'r') as source_file:
            content = source_file.read()

        # Offsets of every end-of-line position; the number of entries
        # lying before an offset is the zero-based line of that offset.
        line_ends = [eol.start() for eol in re.finditer(r"$", content, re.M)]

        found = []
        for hit in matcher.finditer(content):
            group_name = hit.lastgroup
            if group_name == "error":
                raise common.UnterminatedCommentError()
            if group_name not in ("single", "multi"):
                # Quoted literal: nothing to record.
                continue

            line_number = bisect_left(line_ends, hit.start()) + 1
            if group_name == "single":
                found.append(
                    common.Comment(hit.group("single_content"), line_number))
            else:
                found.append(
                    common.Comment(hit.group("multi_content"),
                                   line_number,
                                   multiline=True))

        return found
    except OSError as exception:
        raise common.FileError(str(exception))
Example #3
0
def extract_comments(filename):
    """Collects the comments found in a source file tokenized by javalang.

    Comments are represented with the Comment class found in the common
    module and come in two forms:
        - Single-line comments begin with '//' and continue to the end of
          line.
        - Multi-line comments begin with '/*' and end with '*/' and can
          span multiple lines of code.

    A single-line comment that sits alone on its line directly after
    another such comment is passed to combine_consecutive_comments. Every
    comment's text is removed from the working copy of the source with
    remove_comment, and the resulting text is handed to tag_comments
    together with the collected comments.

    Note that this doesn't take language-specific preprocessor directives
    into consideration.

    Args:
        filename: String name of the file to extract comments from.
    Returns:
        Python list of common.Comment in the order that they appear in the
        file.
    Raises:
        common.FileError: File was unable to be open or read.
        common.UnterminatedCommentError: Encountered an unterminated
            multi-line comment (presumably surfaced by a helper or the
            tokenizer; it is not raised directly here).
    """
    try:
        with open(filename, 'r') as source_file:
            file_content = source_file.read()
            comments = []
            tokens = list(javalang.tokenizer.tokenize(file_content))

            # Text and source line of the most recent single-line comment.
            prev_line = ''
            prev_comment_text = '-'
            for token in tokens:
                if token.__class__.__name__ != 'Comment':
                    continue

                raw_text = token.value
                is_multiline = raw_text.startswith('/*')
                if is_multiline:
                    # Drop the first opening and first closing delimiter.
                    comment_text = raw_text.replace('/*', '', 1)
                    comment_text = comment_text.replace('*/', '', 1)
                    end_line = token.position[0]
                    start_line = end_line - comment_text.count('\n')
                else:
                    comment_text = raw_text.rstrip().replace('//', '', 1)
                    start_line = end_line = token.position[0] - 1

                comment = common.Comment(
                    comment_text, start_line, end_line, is_multiline)

                if not is_multiline:
                    # Look up the line at index start_line - 1 in the
                    # current (already comment-stripped) text.
                    source_lines = file_content.splitlines()
                    line_index = start_line - 1
                    if 0 <= line_index < len(source_lines):
                        line = source_lines[line_index]
                        this_alone = re.match(
                            r"^[ \t]*//" + re.escape(comment_text) + r"[ \t]*$",
                            line)
                        if this_alone and re.match(
                                r"^[ \t]*//" + re.escape(prev_comment_text) + r"[ \t]*$",
                                prev_line):
                            comment = combine_consecutive_comments(
                                comments, comment)

                        prev_comment_text = comment_text
                        prev_line = line

                file_content = remove_comment(
                    file_content, comment_text, is_multiline)
                comments.append(comment)

            tag_comments(comments, file_content,
                         eof_line_number=file_content.count('\n'))
            return comments
    except OSError as exception:
        raise common.FileError(str(exception))
Example #4
0
def extract_comments(filename):
    """Extracts a list of comments from the given C family source file.

    Comments are represented with the Comment class found in the common module.
    C family comments come in two forms, single and multi-line comments.
        - Single-line comments begin with '//' and continue to the end of line.
        - Multi-line comments begin with '/*' and end with '*/' and can span
            multiple lines of code. If a multi-line comment does not terminate
            before EOF is reached, then an exception is raised.

    Comment markers inside a double-quoted string literal are ignored. A
    backslash inside a literal escapes the following character, so an
    escaped double quote does not end the literal.

    Note that this doesn't take language-specific preprocessor directives into
    consideration.

    Args:
        filename: String name of the file to extract comments from.
    Returns:
        Python list of common.Comment in the order that they appear in the file.
    Raises:
        common.FileError: File was unable to be open or read.
        common.UnterminatedCommentError: Encountered an unterminated multi-line
            comment.
    """
    try:
        import re
        from bisect import bisect_left

        # Alternatives are tried left to right at each position. The literal
        # alternative consumes backslash escapes as a unit so that an escaped
        # quote cannot close the string early and expose its remainder to
        # the comment alternatives.
        pattern = r"""
            (?P<literal> (\"(?:[^\"\\\n]|\\.)*\")+) |
            (?P<single> //(?P<single_content>.*)?$) |
            (?P<multi> /\*(?P<multi_content>(.|\n)*?)?\*/) |
            (?P<error> /\*(.*)?)
        """

        compiled = re.compile(pattern, re.VERBOSE | re.MULTILINE)

        with open(filename, 'r') as source_file:
            content = source_file.read()

        # Offsets of every end-of-line position; the number of entries lying
        # before a match offset is the zero-based line of that match.
        lines_indexes = [
            match.start() for match in re.finditer(r"$", content, re.M)
        ]

        comments = []
        for match in compiled.finditer(content):
            kind = match.lastgroup

            start_character = match.start()
            line_no = bisect_left(lines_indexes, start_character)

            if kind == "single":
                comment_content = match.group("single_content")
                comment = common.Comment(comment_content, line_no + 1)
                comments.append(comment)
            elif kind == "multi":
                comment_content = match.group("multi_content")
                comment = common.Comment(comment_content,
                                         line_no + 1,
                                         multiline=True)
                comments.append(comment)
            elif kind == "error":
                raise common.UnterminatedCommentError()

        return comments
    except OSError as exception:
        raise common.FileError(str(exception))
Example #5
0
def extract_comments(filename):
    """Extracts a list of comments from the given C family source file.

    Comments are represented with the Comment class found in the common module.
    C family comments come in two forms, single and multi-line comments.
        - Single-line comments begin with '//' and continue to the end of line.
        - Multi-line comments begin with '/*' and end with '*/' and can span
            multiple lines of code. If a multi-line comment does not terminate
            before EOF is reached, then an exception is raised.

    Comment markers inside a double-quoted string literal are ignored; a
    backslash inside a literal escapes the following character.

    Note that this doesn't take language-specific preprocessor directives into
    consideration.

    Args:
        filename: String name of the file to extract comments from.
    Returns:
        Python list of common.Comment in the order that they appear in the file.
    Raises:
        common.FileError: File was unable to be open or read.
        common.UnterminatedCommentError: Encountered an unterminated multi-line
            comment.
    """
    # Scanner states. They are compared with '==': identity comparison of
    # integers only works by accident of interpreter caching.
    (in_code, after_slash, in_line_comment, in_block_comment,
     block_comment_star, in_string, string_escape) = range(7)

    try:
        # Read the whole file at once instead of one character per call.
        with open(filename, 'r') as source_file:
            content = source_file.read()
    except OSError as exception:
        raise common.FileError(str(exception))

    state = in_code
    pieces = []  # Characters of the comment currently being collected.
    comments = []
    line_counter = 1
    comment_start = 1
    for char in content:
        if state == in_code:
            # Waiting for comment start character or beginning of string.
            if char == '/':
                state = after_slash
            elif char == '"':
                state = in_string
        elif state == after_slash:
            # Found a '/': classify the next character to determine whether
            # a single-line or multi-line comment starts here.
            if char == '/':
                state = in_line_comment
            elif char == '*':
                comment_start = line_counter
                state = in_block_comment
            elif char == '"':
                # The '/' was an operator; a string literal begins here and
                # must not be scanned for comment markers.
                state = in_string
            else:
                state = in_code
        elif state == in_line_comment:
            # In single line comment, read characters until EOL.
            if char == '\n':
                comments.append(
                    common.Comment(''.join(pieces), line_counter))
                pieces = []
                state = in_code
            else:
                pieces.append(char)
        elif state == in_block_comment:
            # In multi-line comment, add characters until '*' encountered.
            if char == '*':
                state = block_comment_star
            else:
                pieces.append(char)
        elif state == block_comment_star:
            # In multi-line comment with asterisk found. Determine if the
            # comment is ending.
            if char == '/':
                comments.append(
                    common.Comment(''.join(pieces),
                                   comment_start,
                                   multiline=True))
                pieces = []
                state = in_code
            else:
                # The pending '*' was ordinary comment text.
                pieces.append('*')
                # For a run of '*' stay in this state: the current '*' is
                # now the pending one.
                if char != '*':
                    pieces.append(char)
                    state = in_block_comment
        elif state == in_string:
            # In string literal, expect literal end or escape char.
            if char == '"':
                state = in_code
            elif char == '\\':
                state = string_escape
        elif state == string_escape:
            # In string literal, escaping current char.
            state = in_string
        if char == '\n':
            line_counter += 1

    # End of file reached.
    if state in (in_block_comment, block_comment_star):
        raise common.UnterminatedCommentError()
    if state == in_line_comment:
        # Was in single line comment without a trailing newline.
        comments.append(common.Comment(''.join(pieces), line_counter))
    return comments
Example #6
0
def extract_comments(filename):
    """Extracts a list of comments from the given shell script.

    Comments are represented with the Comment class found in the common module.
    Shell script comments only come in one form, single-line. Single line
    comments start with an unquoted or unescaped '#' and continue on until the
    end of the line. A quoted '#' is one that is located within a pair of
    matching single or double quote marks. An escaped '#' is one that is
    immediately preceded by a backslash.

    Inside double quotes a backslash escapes the following character. Inside
    single quotes a backslash is an ordinary character, so the string ends
    at the next single quote.

    Args:
        filename: String name of the file to extract comments from.
    Returns:
        Python list of common.Comment in the order that they appear in the file.
    Raises:
        common.FileError: File was unable to be open or read.
    """
    # Scanner states. They are compared with '==': identity comparison of
    # integers only works by accident of interpreter caching.
    in_code, in_comment, in_string, string_escape, code_escape = range(5)

    try:
        # Read the whole file at once instead of one character per call.
        with open(filename, 'r') as source_file:
            content = source_file.read()
    except OSError as exception:
        raise common.FileError(str(exception))

    state = in_code
    string_char = ''  # Quote character that opened the current string.
    pieces = []  # Characters of the comment currently being collected.
    comments = []
    line_counter = 1
    for char in content:
        if state == in_code:
            # Waiting for comment start character, beginning of string,
            # or escape character.
            if char == '#':
                state = in_comment
            elif char == '"' or char == "'":
                string_char = char
                state = in_string
            elif char == '\\':
                state = code_escape
        elif state == in_comment:
            # Found comment start character. Read comment until EOL.
            if char == '\n':
                comments.append(
                    common.Comment(''.join(pieces), line_counter))
                pieces = []
                state = in_code
            else:
                pieces.append(char)
        elif state == in_string:
            # In string literal, wait for string end or escape char.
            if char == string_char:
                state = in_code
            elif char == '\\' and string_char == '"':
                # Backslash only escapes inside double quotes; within
                # single quotes it is literal and must not hide the
                # closing quote.
                state = string_escape
        elif state == string_escape:
            # Escaping current char, inside of string.
            state = in_string
        elif state == code_escape:
            # Escaping current char, outside of string.
            state = in_code
        if char == '\n':
            line_counter += 1

    # End of file reached.
    if state == in_comment:
        # Was in single line comment without a trailing newline.
        comments.append(common.Comment(''.join(pieces), line_counter))
    return comments