Example #1
    def extract_comments(self, response):
        """
        Extract code comments from the response's content. Regardless of the response's
        content type, the content is searched for HTML comments '<!-- .... -->', JS line
        comments '//...' and JS block comments '/* ... */'.
        """

        # use the comment_parser package to extract HTML and JS comments
        try:
            html_comments = comment_parser.extract_comments_from_str(response.text, mime="text/html")
        except UnterminatedCommentError:
            html_comments = []
        try:
            js_comments = comment_parser.extract_comments_from_str(response.text, mime="application/javascript")
        except UnterminatedCommentError:
            js_comments = []

        # put the discovered comments together
        comments = list()
        for comment in html_comments:
            comments.append({"line": comment.line_number(), "comment": "<!--" + comment.text() + "-->"})
        for comment in js_comments:
            if comment.is_multiline():
                comments.append({"line": comment.line_number(), "comment": "/*" + comment.text() + "*/"})
            else:
                comments.append({"line": comment.line_number(), "comment": "//" + comment.text()})

        # store the discovered comments w.r.t. the response's path & query
        if comments:
            parsed_url = urllib.parse.urlparse(response.url)
            if self.config["crawl_parameter_links"].lower() == "true":
                self.comments[parsed_url.path + parsed_url.query] = comments
            else:
                self.comments[parsed_url.path] = comments
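A quick way to exercise the same extraction outside the crawler is to call comment_parser directly with both MIME types, as the method above does. A minimal sketch; the page string is made up:

from comment_parser import comment_parser

page = "<p>hello</p><!-- build 42 -->\n// js-style note\nvar x = 1;\n"
html_comments = comment_parser.extract_comments_from_str(page, mime="text/html")
js_comments = comment_parser.extract_comments_from_str(page, mime="application/javascript")
print([c.text() for c in html_comments])  # e.g. [' build 42 ']
print([c.text() for c in js_comments])    # e.g. [' js-style note']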
Example #2
def get_source_code_comments(repo_content, filters=None):
    """ Return a dict with the comments of each source file in the repo.
    """
    filters = [] if filters is None else filters
    comments = dict()
    with tempfile.TemporaryFile(suffix='.tar.gz') as content_f:
        content_f.write(repo_content)
        content_f.seek(0)
        with tarfile.open(fileobj=content_f, mode='r:gz') as tar:
            for member in tar.getmembers():
                member_comments = []
                if member.isdir():
                    continue
                
                tokens = member.name.split('.')
                if len(tokens) == 1:
                    continue

                file_extension = tokens[-1]
                if file_extension in FILE_EXTENSIONS_TO_MIME:
                    inner_f = tar.extractfile(member)
                    mime = FILE_EXTENSIONS_TO_MIME[file_extension]
                    try:
                        file_comments = comment_parser.extract_comments_from_str(inner_f.read().decode('utf-8'), mime=mime)
                        member_comments += file_comments
                    except Exception:
                        # extraction can fail on bad encodings or unterminated comments; log the file and move on
                        print(member.name)
                file_name = _extract_file_name(member.name)
                comments[file_name] = member_comments
    for key in comments.keys():
        for f in filters:
            comments[key] = filter(f, comments[key])
        comments[key] = [c.text().strip() for c in comments[key] if c.is_multiline()]
    return comments
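A hypothetical driver for the function above: build a tiny .tar.gz in memory and pass its bytes in. It assumes FILE_EXTENSIONS_TO_MIME maps 'js' to 'application/javascript' and that _extract_file_name strips the directory part; only multiline comments survive the final list comprehension, so the /* ... */ block is what comes back.

import io
import tarfile

buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode='w:gz') as tar:
    data = b"/* header note */\nvar x = 1;\n"
    info = tarfile.TarInfo(name='repo/app.js')
    info.size = len(data)
    tar.addfile(info, io.BytesIO(data))

print(get_source_code_comments(buf.getvalue()))  # e.g. {'app.js': ['header note']}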
Example #3
def extract_class_comments(class_text, mime_type="text/x-java-source"):
    """
    Extracts the class or inline comments in the given source file
    :param class_text: the str representation of the source file
    :param mime_type: the type of source file to parse.
    See https://pypi.org/project/comment-parser/ for list of potential mime_types.
    :return: string of comments in class without any comment related syntax
    """
    try:
        comments = comment_parser.extract_comments_from_str(
            str(class_text), mime_type)
    except UnsupportedError:
        return ""

    def get_clean_comment(comment):
        body = comment.text()
        words = body.split(" ")
        clean_words = []
        for word in words:
            if len(word) == 0 or not is_alpha_or_space(word[0]):
                continue
            clean_words.append(word.strip())

        return " ".join(clean_words).replace("\n", "")

    return " ".join(list(map(get_clean_comment, comments)))
Example #4
def lint():
    global spaces, group

    try:
        with open(file, 'r') as f:
            lines = [list(line) for line in f]
    except FileNotFoundError:
        sys.exit('error: \'{}\' does not exist in directory.'.format(file))

    # loop over all lines in the file
    for i, line in enumerate(lines):
        try:
            if comment_parser.extract_comments_from_str(''.join(line), mime=mime):
                getComment(i, line)
            else:
                if com in ''.join(line):
                    getComment(i, line)
                else:
                    spaces += 1
                    if spaces == 2 and len(group) != 0:     # ignore empty groups
                        _max = max([x[1] for x in group])   # right-most comment column in the group
                        for y in group:
                            if y[1] != _max:
                                lines[y[0]].insert(y[1], ' ' * (_max - y[1]))    # pad with spaces to align the comment
                        spaces, group = 0, []
        except Exception:
            # Tokenizing can fail here: a dict or parenthesized expression split
            # across lines, a C/C++ multiline comment, or a plain-text file whose
            # comment marker is not '#'. Skip such lines.
            pass

    lines = ''.join([''.join(i) for i in lines])
    with open(output, 'w') as f:
        f.write(lines)
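lint() reads everything it needs from module-level globals rather than parameters; a sketch of the setup it appears to expect, with names inferred from the body and purely illustrative values:

file = 'script.py'            # input path to lint
output = 'script.aligned.py'  # where the rewritten lines are written
mime = 'text/x-python'        # MIME hint handed to comment_parser
com = '#'                     # fallback comment marker searched for literally
spaces, group = 0, []         # alignment state shared with getComment()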
Example #5
def author2comment(one):
	results = []
	shas = Author(one).commit_shas

	for sha in shas:
		timestamp = Commit_info(sha).time_author[0]
		files = os.popen('echo '+ sha +' | ssh da4 ~/lookup/cmputeDiff2.perl').readlines()
		for file in files:
			old_sha = file.strip().split(';')[-2]
			new_sha = file.strip().split(';')[-1]
			os.system('echo '+ old_sha + ' | ~/lookup/showCnt blob > old')
			os.system('echo '+ new_sha + ' | ~/lookup/showCnt blob > new')
			diffs = os.popen('diff old new')
			addition = ''
			deletion = ''
			for diff in diffs:
				if diff.startswith('>'):
					addition = addition + diff[1:]
				if diff.startswith('<'):
					deletion = deletion + diff[1:]

			add_comment_words = 0
			add_comment_line = 0
			dele_comment_words = 0
			dele_comment_line = 0

			# mime is omitted here, so comment_parser guesses it from the buffer
			add_comment = comment_parser.extract_comments_from_str(addition)
			for item in add_comment:
				add_comment_line = add_comment_line + 1
				add_comment_words = add_comment_words + len(item.text().split(' '))

			dele_comment = comment_parser.extract_comments_from_str(deletion)
			for item in dele_comment:
				dele_comment_line = dele_comment_line + 1
				dele_comment_words = dele_comment_words + len(item.text().split(' '))

			comment_lines = abs(add_comment_line - dele_comment_line)
			comment_words = abs(add_comment_words - dele_comment_words)
			results.append((timestamp, sha, comment_lines, comment_words))
			print((comment_lines, comment_words))

	return results
Example #6
def get_comments(code: str, mime: str) -> list:
    """
    Extracts all comments from source code and does a multiline split
    """
    comments = comment_parser.extract_comments_from_str(code, mime)
    new_comments = []
    for comment in comments:
        if comment.is_multiline():
            comment_lines = comment.text().splitlines()
            for line_number, line in enumerate(comment_lines,
                                               start=comment.line_number()):
                new_comments.append(Comment(line, line_number, True))
        else:
            new_comments.append(comment)
    return new_comments
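Comment here is comment_parser's own comment class (comment_parser.parsers.common.Comment), which the snippet assumes is already imported. On a small C string, the block comment comes back as one entry per line:

from comment_parser import comment_parser
from comment_parser.parsers.common import Comment

src = "/* first line\n   second line */\nint x;  // tail\n"
for c in get_comments(src, "text/x-c"):
    print(c.line_number(), c.is_multiline(), c.text())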
Example #7
def parse(text: str) -> StateGraph:
    """Parse given source code text into a state graph.
    """
    grammar_path = Path(__file__).parent / 'grammar.lark'
    parser = Lark(grammar_path.read_text())
    transformer = GrammarTransformer()

    definitions = []

    for comment in extract_comments_from_str(text, mime='text/x-c'):
        comment_lines = [
            line.strip(' *') for line in comment.text().split('\n')
        ]

        for lineno, line in enumerate(comment_lines):
            lineno += comment.line_number()

            if not line.startswith('@'):
                continue

            try:
                ast = parser.parse(line)
                element = transformer.transform(ast)
                definitions.append(element)
            except (UnexpectedCharacters, UnexpectedToken) as ex:
                if ex.column > 1:
                    try:
                        message = ex.args[0].split('at')[0]
                    except IndexError:
                        message = 'Unexpected input'

                    raise ParseError(message=message, line=lineno)
            except UnexpectedEOF:
                raise ParseError(message='Unexpected end', line=lineno)

    Resolvable.resolve_all(definitions)
    graph = StateGraph.of(definitions)
    return graph
Example #8
def get_comments(nb_id):

    # check if notebook is in python
    language = nb_analysis.get_language(nb_id)
    if language is None or "python" not in language:
        return None

    # get the code cells
    code_cells = get_code_cells(nb_id)

    # iterate through the code cells and gather the comments
    comments = []
    for cell in code_cells:

        # look for the field that holds the code
        field = ""
        keys = cell.keys()
        if 'input' in keys:
            field = 'input'
        elif 'source' in keys:
            field = 'source'

        # gather all of the code into a single string
        code = str("".join(cell[field]))

        # get the comments
        try:
            comments += list(
                map(
                    lambda x: x.text(),
                    comment_parser.extract_comments_from_str(
                        code, mime='text/x-python')))
        except Exception:
            # the comment parser will not work on syntactically incorrect code
            continue

    return comments
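The core of the loop above works on any bare string; the sample cell body is made up:

from comment_parser import comment_parser

code = "import math\n# radius in metres\nr = 2\n"
texts = list(map(lambda x: x.text(),
                 comment_parser.extract_comments_from_str(code, mime='text/x-python')))
print(texts)  # e.g. [' radius in metres']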
Example #9
def get_doc_extracts(go_file, directory, token_impl_map):
    go_lines, go_code = {}, None

    # Read the contents of a file as a string and also index lines by line
    # numbers. This is required later on to process string by lines.
    with open(go_file, 'r') as file:
        for no, line in enumerate(file.readlines()):
            go_lines[no] = line

        file.seek(0)
        go_code = file.read()

    # Extract all comments in a go source file. Each line in the comment is
    # returned as Comment instance, the comments are thus, not grouped.
    comments = comment_parser.extract_comments_from_str(
        go_code,
        mime='text/x-go',
    )

    comment_groups = []
    current_line = None

    # Group the comment lines as block of comments. Lines that have consecutive
    # line numbers are assumed to belong to the same comment block.
    for comment in comments:
        new_line = comment.line_number()

        if current_line is None or new_line != current_line + 1:
            comment_groups += [[]]

        comment_groups[-1].append(comment)
        current_line = new_line

    doc_groups = []

    # Get the entity that a comment block is talking about and check if
    # it belongs to an entity that we are interested in and aggregate
    # them.
    for group in comment_groups:
        group_end = group[-1].line_number()
        comment_related_to = go_lines.get(group_end)

        if not comment_related_to:
            continue

        match = stdlib_method_signature.search(comment_related_to)

        if match is None:
            match = stdlib_testing_signature.search(comment_related_to)

            if match is None:
                continue

        doc_groups.append((group, match))

    # Collate all the gathered information about the extract and build
    # Extracts.
    for (group, match) in doc_groups:
        content = list(map(
            lambda comment: comment.text().strip(),
            group,
        ))

        file_name = os.path.relpath(go_file, directory)
        token_package = os.path.dirname(file_name)

        impl_name = match.group(1)
        token_name = token_impl_map.get(impl_name, None)

        if token_name is None:
            token_name = impl_name

        # TODO: token_name should be the name the method registers itself with
        # instead of the name of the method that implements it.
        yield Extract(
            content=content,
            token_name=token_name,
            token_package=token_package,
            line_number=group[-1].line_number(),
            file_name=file_name,
        )
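The consecutive-line grouping in the middle of this function is the reusable piece; in isolation it looks like this (the line numbers are made up):

def group_consecutive(line_numbers):
    groups, current = [], None
    for n in line_numbers:
        # a gap in the numbering starts a new block
        if current is None or n != current + 1:
            groups.append([])
        groups[-1].append(n)
        current = n
    return groups

print(group_consecutive([3, 4, 5, 9, 10, 14]))  # [[3, 4, 5], [9, 10], [14]]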
Example #10
    def search_comments(self, text: str, type: str, mime: str,
                        sess: InformationLeakageSession):
        comments = comment_parser.extract_comments_from_str(text, mime)
        for comment in comments:
            self.search_string(comment.text(), type, [], sess)