def extract_comments(filename):
    """Extracts a list of comments from the given HTML family source file.

    Comments are represented with the Comment class found in the common
    module. HTML family comments come in one form, comprising all text within
    '<!--' and '-->' markers. Comments cannot be nested.

    Double-quoted text that starts and ends on the same line is matched first
    and skipped, so a '<!--' inside such a quoted span does not start a
    comment.

    Args:
      filename: String name of the file to extract comments from.
    Returns:
      Python list of common.Comment in the order that they appear in the file.
    Raises:
      common.FileError: File was unable to be open or read.
      common.UnterminatedCommentError: Encountered an unterminated multi-line
        comment.
    """
    import re
    from bisect import bisect_left

    # Alternatives are tried in order at each position: a quoted literal
    # shadows any comment marker inside it, a same-line comment is preferred
    # over a multi-line one, and a '<!--' that neither of them could close
    # falls through to the error alternative.
    pattern = r"""
        (?P<literal> (\"([^\"\n])*\")+) |
        (?P<single> <!--(?P<single_content>.*?)-->) |
        (?P<multi> <!--(?P<multi_content>(.|\n)*?)?-->) |
        (?P<error> <!--(.*)?)
        """
    compiled = re.compile(pattern, re.VERBOSE | re.MULTILINE)

    # Only opening/reading the file can raise OSError, so keep the try body
    # to exactly that and preserve the original cause on the FileError.
    try:
        with open(filename, 'r') as source_file:
            content = source_file.read()
    except OSError as exception:
        raise common.FileError(str(exception)) from exception

    # Offsets of every end-of-line (and end-of-text). The number of offsets
    # strictly before a match start is the zero-based line of that match.
    line_ends = [match.start() for match in re.finditer(r"$", content, re.M)]

    comments = []
    for match in compiled.finditer(content):
        kind = match.lastgroup
        line_no = bisect_left(line_ends, match.start()) + 1
        if kind == "single":
            comments.append(
                common.Comment(match.group("single_content"), line_no))
        elif kind == "multi":
            comments.append(
                common.Comment(
                    match.group("multi_content"), line_no, multiline=True))
        elif kind == "error":
            raise common.UnterminatedCommentError()
        # kind == "literal": quoted text, nothing to record.
    return comments
def extract_comments(code):
    """Extracts a list of comments from the given C family source code.

    Comments are represented with the Comment class found in the common
    module. C family comments come in two forms, single and multi-line
    comments.
      - Single-line comments begin with '//' and continue to the end of line.
      - Multi-line comments begin with '/*' and end with '*/' and can span
        multiple lines of code. If a multi-line comment does not terminate
        before EOF is reached, then an exception is raised.

    Double-quoted string literals are skipped, including ones that contain
    backslash-escaped quotes, so comment markers inside a string are not
    reported as comments.

    Note that this doesn't take language-specific preprocessor directives
    into consideration.

    Args:
      code: String containing code to extract comments from.
    Returns:
      Python list of common.Comment in the order that they appear in the code.
    Raises:
      common.UnterminatedCommentError: Encountered an unterminated multi-line
        comment.
    """
    # Alternatives are tried in order at each position. The literal
    # alternative consumes a backslash together with the character it
    # escapes, so an escaped quote does not end the string early (which
    # would expose the rest of the string to the comment alternatives).
    pattern = r"""
        (?P<literal> \"(?:\\.|[^\"\\\n])*\") |
        (?P<single> //(?P<single_content>.*)?$) |
        (?P<multi> /\*(?P<multi_content>(.|\n)*?)?\*/) |
        (?P<error> /\*(.*)?)
        """
    compiled = re.compile(pattern, re.VERBOSE | re.MULTILINE)

    # Offsets of every end-of-line (and end-of-text). The number of offsets
    # strictly before a match start is the zero-based line of that match.
    line_ends = [match.start() for match in re.finditer(r"$", code, re.M)]

    comments = []
    for match in compiled.finditer(code):
        kind = match.lastgroup
        line_no = bisect_left(line_ends, match.start()) + 1
        if kind == "single":
            comments.append(
                common.Comment(match.group("single_content"), line_no))
        elif kind == "multi":
            comments.append(
                common.Comment(
                    match.group("multi_content"), line_no, multiline=True))
        elif kind == "error":
            # A '/*' that the multi alternative could not close.
            raise common.UnterminatedCommentError()
        # kind == "literal": string literal, nothing to record.
    return comments
def extract_comments(code):
    """Extracts a list of comments from the given Ruby source code.

    Comments are represented with the Comment class found in the common
    module. Ruby comments come in two forms:
      - Single-line comments start with a '#' character and run to the end
        of the line.
      - Block comments start with a line beginning with '=begin' and end with
        a line beginning with '=end'. Either marker may be followed by
        whitespace and further text on the same line; that trailing text is
        not included in the comment content.
    See http://ruby-doc.com/docs/ProgrammingRuby.

    Single- and double-quoted string literals on one line are skipped, so a
    '#' inside such a string does not start a comment.

    Args:
      code: String containing code to extract comments from.
    Returns:
      Python list of common.Comment in the order that they appear in the code.
    Raises:
      common.UnterminatedCommentError: Encountered a '=begin' block with no
        matching '=end' line.
    """
    # Alternatives are tried in order at each position.
    #  - literal: group 2 captures the opening quote; the body accepts an
    #    escaped copy of that quote or any character that is not the quote.
    #  - multi: the lazy body stops at the first line starting with '=end';
    #    the optional newline before '^=end' lets an empty block match.
    #  - error: a '=begin' line the multi alternative could not close.
    pattern = r"""
        (?P<literal> ([\"'])((?:\\\2|(?:(?!\2)).)*)(\2)) |
        (?P<single> \#(?P<single_content>.*?)$) |
        (?P<multi> ^=begin(?:[\ \t].*)?\n(?P<multi_content>(.|\n)*?)\n?^=end(?:[\ \t].*)?$) |
        (?P<error> ^=begin(?:[\ \t].*)?$)
        """
    compiled = re.compile(pattern, re.VERBOSE | re.MULTILINE)

    # Offsets of every end-of-line (and end-of-text). The number of offsets
    # strictly before a match start is the zero-based line of that match.
    line_ends = [match.start() for match in re.finditer(r"$", code, re.M)]

    comments = []
    for match in compiled.finditer(code):
        kind = match.lastgroup
        line_no = bisect_left(line_ends, match.start()) + 1
        if kind == "single":
            comments.append(
                common.Comment(match.group("single_content"), line_no))
        elif kind == "multi":
            comments.append(
                common.Comment(
                    match.group("multi_content"), line_no, multiline=True))
        elif kind == "error":
            raise common.UnterminatedCommentError()
        # kind == "literal": string literal, nothing to record.
    return comments
def extract_comments(code):
    """Extracts a list of comments from the given Javascript source code.

    Comments are represented with the Comment class found in the common
    module. Javascript comments come in two forms, single and multi-line
    comments.
      - Single-line comments begin with '//' and continue to the end of line.
      - Multi-line comments begin with '/*' and end with '*/' and can span
        multiple lines of code. If a multi-line comment does not terminate
        before EOF is reached, then an exception is raised.

    This module takes quoted strings into account when extracting comments
    from source code: text inside single-quoted, double-quoted and
    backtick (template literal) strings is never reported as a comment.
    Template literals are treated as plain strings; expressions nested in
    '${...}' are not parsed separately.

    Args:
      code: String containing code to extract comments from.
    Returns:
      Python list of common.Comment in the order that they appear in the code.
    Raises:
      common.UnterminatedCommentError: Encountered an unterminated multi-line
        comment.
    """
    # States of the character-driven scanner.
    (idle,            # Outside comments and strings.
     slash,           # Saw '/', next char decides the comment kind.
     line_comment,    # Inside a '//' comment.
     block_comment,   # Inside a '/* */' comment.
     block_star,      # Inside a block comment, just saw '*'.
     in_string,       # Inside a string literal.
     in_escape) = range(7)  # Inside a string, just saw a backslash.
    quote_chars = ('"', "'", '`')

    state = idle
    text = []  # Characters of the comment currently being collected.
    comments = []
    line_counter = 1
    comment_start = 1  # Line on which the current block comment opened.
    string_char = ''   # Quote character that opened the current string.

    for char in code:
        if state == idle:
            if char == '/':
                state = slash
            elif char in quote_chars:
                string_char = char
                state = in_string
        elif state == slash:
            if char == '/':
                state = line_comment
            elif char == '*':
                comment_start = line_counter
                state = block_comment
            else:
                # Deliberately do not open a string here even if char is a
                # quote: the character right after a lone '/' is commonly
                # the first character of a regex literal such as /"/g.
                state = idle
        elif state == line_comment:
            if char == '\n':
                comments.append(common.Comment(''.join(text), line_counter))
                text = []
                state = idle
            else:
                text.append(char)
        elif state == block_comment:
            if char == '*':
                state = block_star
            else:
                text.append(char)
        elif state == block_star:
            if char == '/':
                comments.append(
                    common.Comment(
                        ''.join(text), comment_start, multiline=True))
                text = []
                state = idle
            else:
                # The pending '*' was comment text after all. On a run of
                # asterisks stay in this state so '**/' still terminates.
                text.append('*')
                if char != '*':
                    text.append(char)
                    state = block_comment
        elif state == in_string:
            if char == string_char:
                state = idle
            elif char == '\\':
                state = in_escape
        elif state == in_escape:
            # Whatever follows the backslash is part of the string.
            state = in_string
        # Count the newline only after the character has been handled, so a
        # single-line comment is attributed to the line it sits on.
        if char == '\n':
            line_counter += 1

    # EOF.
    if state in (block_comment, block_star):
        raise common.UnterminatedCommentError()
    if state == line_comment:
        # Input ended inside a single-line comment with no trailing newline.
        comments.append(common.Comment(''.join(text), line_counter))
    return comments
def extract_comments(filename):
    """Extracts a list of comments from the given C family source file.

    Comments are represented with the Comment class found in the common
    module. C family comments come in two forms, single and multi-line
    comments.
      - Single-line comments begin with '//' and continue to the end of line.
      - Multi-line comments begin with '/*' and end with '*/' and can span
        multiple lines of code. If a multi-line comment does not terminate
        before EOF is reached, then an exception is raised.

    Text inside double-quoted string literals (with backslash escapes
    honoured) is never reported as a comment.

    Note that this doesn't take language-specific preprocessor directives
    into consideration.

    Args:
      filename: String name of the file to extract comments from.
    Returns:
      Python list of common.Comment in the order that they appear in the file.
    Raises:
      common.FileError: File was unable to be open or read.
      common.UnterminatedCommentError: Encountered an unterminated multi-line
        comment.
    """
    # Read the file in one go instead of one character per read() call; only
    # the open/read can raise OSError, so the try covers just that.
    try:
        with open(filename, 'r') as source_file:
            content = source_file.read()
    except OSError as exception:
        raise common.FileError(str(exception)) from exception

    # Scanner states:
    #   0: outside comments and strings
    #   1: saw '/', next char decides the comment kind
    #   2: inside a single-line comment
    #   3: inside a multi-line comment
    #   4: inside a multi-line comment, just saw '*'
    #   5: inside a string literal
    #   6: inside a string literal, just saw a backslash
    state = 0
    text = []  # Characters of the comment currently being collected.
    comments = []
    line_counter = 1
    comment_start = 1  # Line on which the current multi-line comment opened.

    for char in content:
        # States are compared with ==, not 'is': identity of int objects is
        # an implementation detail.
        if state == 0:
            if char == '/':
                state = 1
            elif char == '"':
                state = 5
        elif state == 1:
            if char == '/':
                state = 2
            elif char == '*':
                comment_start = line_counter
                state = 3
            else:
                state = 0
        elif state == 2:
            if char == '\n':
                comments.append(common.Comment(''.join(text), line_counter))
                text = []
                state = 0
            else:
                text.append(char)
        elif state == 3:
            if char == '*':
                state = 4
            else:
                text.append(char)
        elif state == 4:
            if char == '/':
                comments.append(
                    common.Comment(
                        ''.join(text), comment_start, multiline=True))
                text = []
                state = 0
            else:
                # The pending '*' was comment text after all. On a run of
                # asterisks stay in this state so '**/' still terminates.
                text.append('*')
                if char != '*':
                    text.append(char)
                    state = 3
        elif state == 5:
            if char == '"':
                state = 0
            elif char == '\\':
                state = 6
        elif state == 6:
            # Whatever follows the backslash is part of the string.
            state = 5
        # Count the newline only after the character has been handled, so a
        # single-line comment is attributed to the line it sits on.
        if char == '\n':
            line_counter += 1

    # EOF.
    if state in (3, 4):
        raise common.UnterminatedCommentError()
    if state == 2:
        # File ended inside a single-line comment with no trailing newline.
        comments.append(common.Comment(''.join(text), line_counter))
    return comments