def extract_comments(self, response):
    """Extract code comments from the response's content.

    Regardless of the response's content type, the content is searched for
    HTML comments '<!-- .... -->', JS line comments '//...' and JS block
    comments '/* ... */'. The discovered comments are stored in
    self.comments, keyed by the response URL's path (plus query when
    parameter links are crawled).
    """

    def parse_or_empty(mime):
        # best effort: an unterminated comment in the page yields nothing
        try:
            return comment_parser.extract_comments_from_str(response.text, mime=mime)
        except UnterminatedCommentError:
            return []

    # collect HTML comments, re-wrapped in their original delimiters
    found = [
        {"line": c.line_number(), "comment": "<!--" + c.text() + "-->"}
        for c in parse_or_empty("text/html")
    ]
    # collect JS comments; block and line comments get different delimiters
    for c in parse_or_empty("application/javascript"):
        if c.is_multiline():
            wrapped = "/*" + c.text() + "*/"
        else:
            wrapped = "//" + c.text()
        found.append({"line": c.line_number(), "comment": wrapped})

    # store the discovered comments w.r.t. the response's path & query
    if not found:
        return
    parsed_url = urllib.parse.urlparse(response.url)
    key = parsed_url.path
    if self.config["crawl_parameter_links"].lower() == "true":
        key = parsed_url.path + parsed_url.query
    self.comments[key] = found
def get_source_code_comments(repo_content, filters=None):
    """Return a dict with the comments of each source file in the repo.

    :param repo_content: bytes of a gzipped tarball of the repository
    :param filters: optional list of predicates applied to each file's
        Comment objects before the text is extracted
    :return: dict mapping file name -> list of stripped multiline comment
        strings
    """
    filters = [] if filters is None else filters
    comments = {}
    with tempfile.TemporaryFile(suffix='.tar.gz') as content_f:
        content_f.write(repo_content)
        content_f.seek(0)
        with tarfile.open(fileobj=content_f, mode='r:gz') as tar:
            for member in tar.getmembers():
                if member.isdir():
                    continue
                tokens = member.name.split('.')
                if len(tokens) == 1:
                    # no file extension, cannot determine the mime type
                    continue
                file_extension = tokens[-1]
                if file_extension not in FILE_EXTENSIONS_TO_MIME:
                    continue
                member_comments = []
                inner_f = tar.extractfile(member)
                mime = FILE_EXTENSIONS_TO_MIME[file_extension]
                try:
                    member_comments += comment_parser.extract_comments_from_str(
                        inner_f.read().decode('utf-8'), mime=mime)
                except Exception:
                    # was a bare `except:` which also swallowed SystemExit /
                    # KeyboardInterrupt; best effort: report the file, keep going
                    print(member.name)
                file_name = _extract_file_name(member.name)
                comments[file_name] = member_comments
    for key in comments:
        for f in filters:
            comments[key] = filter(f, comments[key])
        # keep only multiline (block) comments, stripped of whitespace
        comments[key] = [c.text().strip() for c in comments[key] if c.is_multiline()]
    return comments
def extract_class_comments(class_text, mime_type="text/x-java-source"):
    """Extract the class or inline comments in a given source file.

    :param class_text: the str representation of the source file
    :param mime_type: the type of source file to parse. See
        https://pypi.org/project/comment-parser/ for list of potential
        mime_types.
    :return: string of comments in class without any comment related syntax
    """
    try:
        parsed = comment_parser.extract_comments_from_str(
            str(class_text), mime_type)
    except UnsupportedError:
        return ""

    def clean(comment):
        # keep only words whose first character is a letter or a space,
        # drop comment decoration, and collapse the text onto one line
        kept = [
            word.strip()
            for word in comment.text().split(" ")
            if word and is_alpha_or_space(word[0])
        ]
        return " ".join(kept).replace("\n", "")

    return " ".join(clean(c) for c in parsed)
def lint():
    """Align trailing comments within consecutive comment groups of *file*.

    Reads the global ``file`` line by line, registers comment positions via
    getComment() into the global ``group``, and once two consecutive
    comment-free lines are seen, pads every comment in the group so they all
    start at the group's right-most comment column. The adjusted text is
    written to the global ``output``.

    NOTE(review): relies on module-level globals ``file``, ``output``,
    ``mime``, ``com``, ``spaces`` and ``group``, and on getComment()
    appending (line, column) pairs to ``group`` — confirm against the rest
    of the module.
    """
    global spaces, group
    try:
        # each line becomes a list of characters so padding can later be
        # inserted at an arbitrary column
        with open(file, 'r') as f:
            lines = [list(line) for line in f]
    except FileNotFoundError:
        sys.exit('error: \'{}\' does not exist in directory.'.format(file))
    # loop all lines in file
    for i, line in enumerate(lines):
        try:
            if comment_parser.extract_comments_from_str(''.join(line), mime=mime):
                getComment(i, line)
            else:
                # fallback: the raw comment token still appears in the line
                # even though the parser found no comment
                if com in ''.join(line):
                    getComment(i, line)
                else:
                    spaces += 1
                    if spaces == 2 and len(group) != 0:  # ignore empty groups
                        _max = max([x[1] for x in group])  # furthest most comment in group
                        for x, y in enumerate(group):
                            if y[1] != _max:
                                # pad with spaces up to the right-most column
                                lines[y[0]].insert(y[1], ' ' * int(_max - y[1]))  # insert ' 's into line
                        spaces, group = 0, []
        except Exception as e:
            # deliberate best-effort: lines the parser cannot tokenize are
            # skipped (split-up dicts/parentheses, odd multiline C/C++
            # comments, or plain-text files whose comment marker isn't "#")
            pass
            # print("error: line {}:".format(i), e)
    lines = ''.join([''.join(i) for i in lines])
    with open(output, 'w') as f:
        f.write(lines)
def author2comment(one):
    """For each commit of the given author, measure how many comment lines
    and comment words the commit's diffs added or removed.

    :param one: author identifier understood by Author()
    :return: list of (timestamp, sha, comment_lines, comment_words) tuples

    SECURITY NOTE(review): sha/blob ids are interpolated directly into shell
    commands (os.popen/os.system); callers must ensure they come from
    trusted lookup services.
    """
    results = []
    shas = Author(one).commit_shas
    for sha in shas:
        timestamp = Commit_info(sha).time_author[0]
        files = os.popen('echo ' + sha + ' | ssh da4 ~/lookup/cmputeDiff2.perl').readlines()
        for file in files:
            # each diff record is ';'-separated and ends with old;new blob shas
            old_sha = file.strip().split(';')[-2]
            new_sha = file.strip().split(';')[-1]
            os.system('echo ' + old_sha + ' | ~/lookup/showCnt blob > old')
            os.system('echo ' + new_sha + ' | ~/lookup/showCnt blob > new')
            diffs = os.popen('diff old new')
            addition = ''
            deletion = ''
            for diff in diffs:
                if diff.startswith('>'):
                    addition = addition + diff[1:]
                if diff.startswith('<'):
                    deletion = deletion + diff[1:]
            add_comment_words = 0
            add_comment_line = 0
            dele_comment_words = 0
            dele_comment_line = 0
            # line_number() and text() are methods on Comment objects and must
            # be called — the original read them as attributes, which made the
            # `+` below fail on a method object
            add_comment = comment_parser.extract_comments_from_str(addition)
            for item in add_comment:
                add_comment_line = add_comment_line + item.line_number()
                add_comment_words = add_comment_words + len(item.text().split(' '))
            dele_comment = comment_parser.extract_comments_from_str(deletion)
            for item in dele_comment:
                dele_comment_line = dele_comment_line + item.line_number()
                dele_comment_words = dele_comment_words + len(item.text().split(' '))
            comment_lines = abs(add_comment_line - dele_comment_line)
            comment_words = abs(add_comment_words - dele_comment_words)
            # fixed typos: results.apppend -> append, comment_wordso -> comment_words
            results.append((timestamp, sha, comment_lines, comment_words))
            print((comment_lines, comment_words))
    return results
def get_comments(code: str, mime: str) -> list:
    """Extract all comments from source code, splitting each multiline
    comment into one Comment per line (line numbers continue from the
    comment's starting line)."""
    result = []
    for parsed in comment_parser.extract_comments_from_str(code, mime):
        if not parsed.is_multiline():
            result.append(parsed)
            continue
        # expand a block comment into per-line Comment objects
        first_line = parsed.line_number()
        for offset, text in enumerate(parsed.text().splitlines()):
            result.append(Comment(text, first_line + offset, True))
    return result
def parse(text: str) -> StateGraph:
    """Parse given source code text into a state graph.

    Scans the C-style comments of *text* for lines starting with '@',
    parses each such line with the Lark grammar shipped next to this
    module, transforms the AST into definition elements, resolves the
    cross-references between them and builds the resulting StateGraph.

    :param text: source code to scan (comments extracted as 'text/x-c')
    :return: StateGraph built from all resolved definitions
    :raises ParseError: when an '@' line does not match the grammar
    """
    grammer_path = Path(__file__).parent / 'grammar.lark'
    parser = Lark(grammer_path.read_text())
    transformer = GrammarTransformer()
    definitions = []
    for comment in extract_comments_from_str(text, mime='text/x-c'):
        # strip the leading '*' decoration commonly used in block comments
        comment_lines = [
            line.strip(' *')
            for line in comment.text().split('\n')
        ]
        for lineno, line in enumerate(comment_lines):
            # translate the offset within the comment to a file line number
            lineno += comment.line_number()
            if not line.startswith('@'):
                continue
            try:
                ast = parser.parse(line)
                element = transformer.transform(ast)
                definitions.append(element)
            except (UnexpectedCharacters, UnexpectedToken) as ex:
                # NOTE(review): parse errors at column 1 are silently
                # ignored — presumably '@' words that are not definitions
                # at all; confirm this is intended.
                if ex.column > 1:
                    # Lark's message reads "... at line X col Y"; keep only
                    # the part before 'at'
                    try:
                        message = ex.args[0].split('at')[0]
                    except IndexError:
                        message = 'Unexpected input'
                    raise ParseError(message=message, line=lineno)
            except UnexpectedEOF:
                raise ParseError(message='Unexpected end', line=lineno)
    Resolvable.resolve_all(definitions)
    graph = StateGraph.of(definitions)
    return graph
def get_comments(nb_id):
    """Return the comments found in a notebook's python code cells.

    :param nb_id: notebook identifier understood by nb_analysis /
        get_code_cells
    :return: list of comment strings, or None when the notebook's language
        is unknown or not python
    """
    # check if notebook is in python (call get_language once, compare
    # with `is None` instead of `== None`)
    language = nb_analysis.get_language(nb_id)
    if language is None or "python" not in language:
        return None

    # iterate through the code cells and gather the comments
    comments = []
    for cell in get_code_cells(nb_id):
        # notebook formats differ: old ones keep code under 'input',
        # newer ones under 'source'; skip cells that have neither
        # (the original indexed cell[""] and relied on a bare except)
        if 'input' in cell:
            field = 'input'
        elif 'source' in cell:
            field = 'source'
        else:
            continue
        # gather all of the cell's code into a single string
        code = str("".join(cell[field]))
        try:
            comments += [
                c.text()
                for c in comment_parser.extract_comments_from_str(
                    code, mime='text/x-python')
            ]
        except Exception:
            # the comment parser will not work on syntactically incorrect
            # code; best effort: skip such cells
            continue
    return comments
def get_doc_extracts(go_file, directory, token_impl_map):
    """Yield Extract objects for documentation comment blocks in a Go file.

    :param go_file: path of the Go source file to scan
    :param directory: repository root used to relativize file names
    :param token_impl_map: maps an implementing method name to the token
        name it registers; falls back to the method name itself
    :return: generator of Extract instances
    """
    go_lines, go_code = {}, None
    # Read the contents of a file as a string and also index lines by line
    # numbers. This is required later on to process string by lines.
    with open(go_file, 'r') as file:
        for no, line in enumerate(file.readlines()):
            go_lines[no] = line
        file.seek(0)
        go_code = file.read()
    # Extract all comments in a go source file. Each line in the comment is
    # returned as Comment instance, the comments are thus, not grouped.
    comments = comment_parser.extract_comments_from_str(
        go_code,
        mime='text/x-go',
    )
    comment_groups = []
    current_line = None
    # Group the comment lines as block of comments. Lines that have consecutive
    # line numbers are assumed to belong to the same comment block.
    for comment in comments:
        new_line = comment.line_number()
        if current_line is None or new_line != current_line + 1:
            # a gap in line numbers starts a new block
            comment_groups += [[]]
        comment_groups[-1].append(comment)
        current_line = new_line
    doc_groups = []
    # Get the entity that a comment block is talking about and check if
    # it belongs to an entity that we are interested in and aggregate
    # them.
    for group in comment_groups:
        group_end = group[-1].line_number()
        # NOTE(review): comment line numbers appear to be 1-based while
        # go_lines is 0-indexed, so this fetches the line directly AFTER
        # the comment block (the documented signature) — confirm.
        comment_related_to = go_lines.get(group_end)
        if not comment_related_to:
            continue
        match = stdlib_method_signature.search(comment_related_to, )
        if match is None:
            # fall back to test-method signatures
            match = stdlib_testing_signature.search(comment_related_to)
        if match is None:
            continue
        doc_groups.append((group, match))
    # Collate all the gathered information about the extract and build
    # Extracts.
    for (group, match) in doc_groups:
        content = list(map(
            lambda comment: comment.text().strip(),
            group,
        ))
        file_name = os.path.relpath(go_file, directory)
        token_package = os.path.dirname(file_name)
        impl_name = match.group(1)
        token_name = token_impl_map.get(impl_name, None)
        if token_name is None:
            token_name = impl_name
        # TODO: token_name should be the name the method registers itself with
        # instead of the name of the method that implements it.
        yield Extract(
            content=content,
            token_name=token_name,
            token_package=token_package,
            line_number=group[-1].line_number(),
            file_name=file_name,
        )
def search_comments(self, text: str, type: str, mime: str, sess: InformationLeakageSession):
    """Extract comments of the given mime type from *text* and scan each
    comment body for leaked information.

    :param text: content to parse for comments
    :param type: leak category label forwarded to search_string
    :param mime: mime type handed to the comment parser
    :param sess: session object that collects any findings
    """
    comments = comment_parser.extract_comments_from_str(text, mime)
    for comment in comments:
        # use the public text() accessor instead of reaching into the
        # Comment object's private _text attribute
        self.search_string(comment.text(), type, [], sess)