Ejemplo n.º 1
0
def extract_comments(filename):
    """Collects every comment found in an HTML family source file.

    A comment is all of the text between a '<!--' marker and the next '-->'
    marker; comments do not nest. Each comment is returned as a
    common.Comment carrying its text and the 1-based line on which it
    starts. Comment markers that sit inside a double-quoted literal are
    ignored.

    Args:
        filename: String name of the file to extract comments from.
    Returns:
        Python list of common.Comment in the order that they appear in the file.
    Raises:
        common.FileError: The file could not be opened or read.
        common.UnterminatedCommentError: A '<!--' marker was found without a
            closing '-->'.
    """
    try:
        from bisect import bisect_left
        import re

        # Alternatives are tried left to right, so quoted literals win over
        # comments and a properly closed comment wins over the error case.
        pattern = r"""
            (?P<literal> (\"([^\"\n])*\")+) |
            (?P<single> <!--(?P<single_content>.*?)-->) |
            (?P<multi> <!--(?P<multi_content>(.|\n)*?)?-->) |
            (?P<error> <!--(.*)?)
        """

        comment_regex = re.compile(pattern, re.VERBOSE | re.MULTILINE)

        with open(filename, 'r') as handle:
            text = handle.read()

        # Offsets of every end-of-line position; the number of those that
        # precede a match is the zero-based line the match starts on.
        line_ends = [eol.start() for eol in re.finditer(r"$", text, re.M)]

        found = []
        for hit in comment_regex.finditer(text):
            group_name = hit.lastgroup

            if group_name == "error":
                raise common.UnterminatedCommentError()
            if group_name not in ("single", "multi"):
                # Quoted literal: nothing to record.
                continue

            line_number = bisect_left(line_ends, hit.start()) + 1
            if group_name == "single":
                found.append(
                    common.Comment(hit.group("single_content"), line_number))
            else:
                found.append(
                    common.Comment(hit.group("multi_content"),
                                   line_number,
                                   multiline=True))

        return found
    except OSError as exception:
        raise common.FileError(str(exception))
Ejemplo n.º 2
0
def extract_comments(code):
    """Extracts a list of comments from the given C family source code.

    Comments are represented with the Comment class found in the common
    module. C family comments come in two forms, single and multi-line
    comments.
      - Single-line comments begin with '//' and continue to the end of line.
      - Multi-line comments begin with '/*' and end with '*/' and can span
        multiple lines of code. If a multi-line comment does not terminate
        before EOF is reached, then an exception is raised.

    Comment markers inside double-quoted string literals are ignored,
    including literals that contain backslash escapes such as '\\"'.

    Note that this doesn't take language-specific preprocessor directives
    into consideration.

    Args:
        code: String containing code to extract comments from.
    Returns:
        Python list of common.Comment in the order that they appear in the
        code.
    Raises:
        common.UnterminatedCommentError: Encountered an unterminated
            multi-line comment.
    """
    # The literal alternative must consume backslash escapes as a pair;
    # otherwise an escaped quote (\") would end the literal early and the
    # remainder of the string could be misread as a comment.
    pattern = r"""
        (?P<literal> \"(?:\\[\s\S]|[^\"\\\n])*\") |
        (?P<single> //(?P<single_content>.*)?$) |
        (?P<multi> /\*(?P<multi_content>(.|\n)*?)?\*/) |
        (?P<error> /\*(.*)?)
    """

    compiled = re.compile(pattern, re.VERBOSE | re.MULTILINE)

    # Offsets of every end-of-line position; the number of those that precede
    # a match is the zero-based line the match starts on.
    lines_indexes = [match.start() for match in re.finditer(r"$", code, re.M)]

    comments = []
    for match in compiled.finditer(code):
        kind = match.lastgroup

        start_character = match.start()
        line_no = bisect_left(lines_indexes, start_character)

        if kind == "single":
            comment_content = match.group("single_content")
            comment = common.Comment(comment_content, line_no + 1)
            comments.append(comment)
        elif kind == "multi":
            comment_content = match.group("multi_content")
            comment = common.Comment(comment_content,
                                     line_no + 1,
                                     multiline=True)
            comments.append(comment)
        elif kind == "error":
            raise common.UnterminatedCommentError()

    return comments
Ejemplo n.º 3
0
def extract_comments(code):
  """Extracts a list of comments from the given Ruby source code.

  Comments are represented with the Comment class found in the common module.

  Ruby comments come in two forms:
    - Single-line comments start with a '#' character and run to the end of
      the line.
    - Multi-line comments are enclosed between a line consisting of '=begin'
      and a line consisting of '=end'.
  See http://ruby-doc.com/docs/ProgrammingRuby.

  Comment markers inside single- or double-quoted string literals are
  ignored.

  Args:
    code: String containing code to extract comments from.
  Returns:
    Python list of common.Comment in the order that they appear in the code.
  Raises:
    common.UnterminatedCommentError: Encountered a '=begin' line that is not
      followed by any '=end' line.
  """
  # The error alternative only fires for a '=begin' line with no later line
  # starting with '=end'. The negative lookahead keeps blocks that the
  # stricter `multi` alternative does not capture (e.g. an empty block, or
  # '=end' followed by trailing text) from being reported as unterminated.
  pattern = r"""
    (?P<literal> ([\"'])((?:\\\2|(?:(?!\2)).)*)(\2)) |
    (?P<single> \#(?P<single_content>.*?)$) |
    (?P<multi> ^=begin\n(?P<multi_content>(.|\n)*?)?\n=end$) |
    (?P<error> ^=begin$(?![\s\S]*?\n=end\b))
  """
  compiled = re.compile(pattern, re.VERBOSE | re.MULTILINE)

  # Offsets of every end-of-line position; the number of those that precede a
  # match is the zero-based line the match starts on.
  lines_indexes = [match.start() for match in re.finditer(r"$", code, re.M)]

  comments = []
  for match in compiled.finditer(code):
    kind = match.lastgroup

    start_character = match.start()
    line_no = bisect_left(lines_indexes, start_character)

    if kind == "single":
      comment_content = match.group("single_content")
      comment = common.Comment(comment_content, line_no + 1)
      comments.append(comment)
    elif kind == "multi":
      comment_content = match.group("multi_content")
      comment = common.Comment(comment_content, line_no + 1, multiline=True)
      comments.append(comment)
    elif kind == "error":
      raise common.UnterminatedCommentError()
  return comments
Ejemplo n.º 4
0
def extract_comments(code):
  """Collects every comment found in a string of Javascript source code.

  Comments are returned as instances of the Comment class from the common
  module. Two comment forms are recognised:
    - Single-line comments start with '//' and run to the end of the line.
    - Multi-line comments start with '/*', end with '*/' and may span several
      lines; the reported line is the one the comment starts on.
  Comment markers that occur inside single- or double-quoted strings are not
  treated as comments.

  Args:
    code: String containing code to extract comments from.
  Returns:
    Python list of common.Comment in the order that they appear in the code.
  Raises:
    common.UnterminatedCommentError: The code ended while still inside a
      multi-line comment.
  """
  # Scanner states.
  (in_code, after_slash, in_line_comment, in_block_comment,
   after_block_star, in_string, in_string_escape) = range(7)

  mode = in_code
  pieces = []            # Characters of the comment being collected.
  found = []
  line_number = 1
  block_first_line = 1   # Line on which the current '/*' was seen.
  open_quote = ''        # Quote character that opened the current string.

  for ch in code:
    if mode == in_code:
      # Look for the start of a comment or of a string literal.
      if ch == '/':
        mode = after_slash
      elif ch == '"' or ch == "'":
        open_quote = ch
        mode = in_string
    elif mode == after_slash:
      # One '/' seen: the next character decides the comment form, if any.
      if ch == '/':
        mode = in_line_comment
      elif ch == '*':
        block_first_line = line_number
        mode = in_block_comment
      else:
        mode = in_code
    elif mode == in_line_comment:
      # A single-line comment ends at the newline.
      if ch == '\n':
        found.append(common.Comment(''.join(pieces), line_number))
        pieces = []
        mode = in_code
      else:
        pieces.append(ch)
    elif mode == in_block_comment:
      # Hold back a '*' until we know whether it closes the comment.
      if ch == '*':
        mode = after_block_star
      else:
        pieces.append(ch)
    elif mode == after_block_star:
      if ch == '/':
        found.append(
            common.Comment(''.join(pieces), block_first_line, multiline=True))
        pieces = []
        mode = in_code
      elif ch == '*':
        # Run of asterisks: emit the held one, keep holding the new one.
        pieces.append('*')
      else:
        pieces.append('*')
        pieces.append(ch)
        mode = in_block_comment
    elif mode == in_string:
      # Inside a string: watch for the closing quote or an escape.
      if ch == open_quote:
        mode = in_code
      elif ch == '\\':
        mode = in_string_escape
    else:
      # Escaped character inside a string: skip it unconditionally.
      mode = in_string

    if ch == '\n':
      line_number += 1

  # End of input.
  if mode in (in_block_comment, after_block_star):
    raise common.UnterminatedCommentError()
  if mode == in_line_comment:
    # A single-line comment on the last line has no trailing newline.
    found.append(common.Comment(''.join(pieces), line_number))
  return found
Ejemplo n.º 5
0
def extract_comments(filename):
    """Extracts a list of comments from the given C family source file.

    Comments are represented with the Comment class found in the common module.
    C family comments come in two forms, single and multi-line comments.
        - Single-line comments begin with '//' and continue to the end of line.
        - Multi-line comments begin with '/*' and end with '*/' and can span
            multiple lines of code. If a multi-line comment does not terminate
            before EOF is reached, then an exception is raised.

    Comment markers inside double-quoted string literals are ignored.

    Note that this doesn't take language-specific preprocessor directives into
    consideration.

    Args:
        filename: String name of the file to extract comments from.
    Returns:
        Python list of common.Comment in the order that they appear in the file.
    Raises:
        common.FileError: File was unable to be opened or read.
        common.UnterminatedCommentError: Encountered an unterminated multi-line
            comment.
    """
    # Only the file access can raise OSError, so keep the try block to that
    # and read the file in one call rather than one character at a time.
    try:
        with open(filename, 'r') as source_file:
            code = source_file.read()
    except OSError as exception:
        raise common.FileError(str(exception)) from exception

    # States are compared with '==': identity ('is') on integer literals
    # depends on interpreter caching and is flagged by modern Python.
    state = 0
    current_comment = ''
    comments = []
    line_counter = 1
    comment_start = 1
    for char in code:
        if state == 0:
            # Waiting for comment start character or beginning of
            # string.
            if char == '/':
                state = 1
            elif char == '"':
                state = 5
        elif state == 1:
            # Found comment start character, classify next character and
            # determine if single or multiline comment.
            if char == '/':
                state = 2
            elif char == '*':
                comment_start = line_counter
                state = 3
            else:
                state = 0
        elif state == 2:
            # In single line comment, read characters until EOL.
            if char == '\n':
                comment = common.Comment(current_comment, line_counter)
                comments.append(comment)
                current_comment = ''
                state = 0
            else:
                current_comment += char
        elif state == 3:
            # In multi-line comment, add characters until '*'
            # encountered.
            if char == '*':
                state = 4
            else:
                current_comment += char
        elif state == 4:
            # In multi-line comment with asterisk found. Determine if
            # comment is ending.
            if char == '/':
                comment = common.Comment(current_comment,
                                         comment_start,
                                         multiline=True)
                comments.append(comment)
                current_comment = ''
                state = 0
            else:
                current_comment += '*'
                # Care for multiple '*' in a row
                if char != '*':
                    current_comment += char
                    state = 3
        elif state == 5:
            # In string literal, expect literal end or escape char.
            if char == '"':
                state = 0
            elif char == '\\':
                state = 6
        elif state == 6:
            # In string literal, escaping current char.
            state = 5
        if char == '\n':
            line_counter += 1

    # EOF.
    if state in (3, 4):
        raise common.UnterminatedCommentError()
    if state == 2:
        # Was in single line comment. Create comment.
        comment = common.Comment(current_comment, line_counter)
        comments.append(comment)
    return comments