def test_captures(self):
    parser = Parser()
    parser.set_language(PYTHON)
    source = b"def foo():\n  bar()\ndef baz():\n  quux()\n"
    tree = parser.parse(source)
    query = PYTHON.query(
        """
        (function_definition name: (identifier) @func-def)
        (call function: (identifier) @func-call)
        """
    )
    captures = query.captures(tree.root_node)

    self.assertEqual(captures[0][0].start_point, (0, 4))
    self.assertEqual(captures[0][0].end_point, (0, 7))
    self.assertEqual(captures[0][1], "func-def")

    self.assertEqual(captures[1][0].start_point, (1, 2))
    self.assertEqual(captures[1][0].end_point, (1, 5))
    self.assertEqual(captures[1][1], "func-call")

    self.assertEqual(captures[2][0].start_point, (2, 4))
    self.assertEqual(captures[2][0].end_point, (2, 7))
    self.assertEqual(captures[2][1], "func-def")

    self.assertEqual(captures[3][0].start_point, (3, 2))
    self.assertEqual(captures[3][0].end_point, (3, 6))
    self.assertEqual(captures[3][1], "func-call")
class TreeSitter(object):
    def __init__(self, language_type, encoding='utf-8'):
        self.language_type = language_type
        self.encoding = encoding
        self.parser = Parser()
        self.parser.set_language(Language(LIB_BIN, self.language_type))
        self.UpdateBuffer([""])
        self._res = []

    def DFS(self, node, tokenModifiers: list):
        for item in node.children:
            temp = tokenModifiers
            if len(item.children) != 0:
                temp = copy.copy(tokenModifiers)
                temp.append(item.type)
                self.DFS(item, temp)
            self._res.append({
                'node': item.type,
                'tokenModifiers': tokenModifiers
            })

    def GetSematicToken(self):
        self._res = []
        self.DFS(self.tree.root_node, [])
        return self._res

    def UpdateBuffer(self, content_list):
        self.tree = self.parser.parse(
            bytes("\n".join(content_list), self.encoding))
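# Hypothetical usage sketch for the TreeSitter class above (not from the
# original source). It assumes LIB_BIN points at a compiled tree-sitter
# library that contains a 'python' grammar.
ts = TreeSitter('python')
ts.UpdateBuffer(["def foo():", "    bar()"])
for token in ts.GetSematicToken():
    print(token['node'], token['tokenModifiers'])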
def __init__(
    self,
    langs: List[str],
    added_nodes: Dict[str, Dict[str, str]],
    skip_node_types: Dict[str, List[str]],
    vendors_path: Path = Path("./vendor"),
):
    super(TreeSitterParser, self).__init__()
    vendors = []
    self.added_nodes = added_nodes
    self.skip_node_types = skip_node_types
    for lang in langs:
        vendors.append(vendors_path / f"tree-sitter-{lang}")
        if lang not in added_nodes:
            self.added_nodes[lang] = {"prefix": "", "suffix": ""}
        if lang not in skip_node_types:
            self.skip_node_types[lang] = []
    Language.build_library(
        # Store the library in the `build` directory
        "build/my-languages.so",
        # Include one or more languages
        vendors,
    )
    self.parser = Parser()
def __init__(self, code, language='python', tree_style='AST', path_style='U2D'):
    # AST | SPT || HST | HPT
    self.tree_style = tree_style
    # L2L | UD | U2D
    self.path_style = path_style
    # Use the Language.build_library method to compile these
    # into a library that's usable from Python:
    csn_so = 'scripts/build/csn.so'
    # Language.build_library(
    #     csn_so,
    #     [
    #         'vendor/tree-sitter-go',
    #         'vendor/tree-sitter-java',
    #         'vendor/tree-sitter-javascript',
    #         'vendor/tree-sitter-php',
    #         'vendor/tree-sitter-python',
    #         'vendor/tree-sitter-ruby',
    #     ]
    # )
    parser = Parser()
    # Load the languages into your app as Language objects:
    # ('go', 'java', 'javascript', 'php', 'python', 'ruby')
    parser.set_language(Language(csn_so, language))
    tree = parser.parse(code.encode())
    code_lines = code.split('\n')
    self.root, self.terminals = self.traverse(tree, code_lines)
    self.debug = True
    if self.debug:
        print(f'{language}{"@" * 9}code\n{code}')
        print(f'{language}{"@" * 9}sexp\n{tree.root_node.sexp()}')
def test_read_callback(self):
    parser = Parser()
    parser.set_language(PYTHON)
    source_lines = ["def foo():\n", " bar()"]

    def read_callback(byte_offset, point):
        row, column = point
        if row >= len(source_lines):
            return None
        if column >= len(source_lines[row]):
            return None
        return source_lines[row][column:].encode("utf8")

    tree = parser.parse(read_callback)
    self.assertEqual(
        tree.root_node.sexp(),
        trim(
            """(module (function_definition
                name: (identifier)
                parameters: (parameters)
                body: (block (expression_statement (call
                    function: (identifier)
                    arguments: (argument_list))))))"""
        ),
    )
def run(self, tmp_dir, params):
    self.ret = 0
    self.log = ''
    try:
        lib = self.find_lib()
        lang = Language(lib, 'verilog')
        parser = Parser()
        parser.set_language(lang)
    except Exception as e:
        self.log += f'{e}\n'
        self.ret = 1

    for src in params['files']:
        f = None
        try:
            f = open(src, 'rb')
        except IOError:
            self.ret = 1
            self.log_error(src, '', '', 'failed to open file')
            continue
        try:
            tree = parser.parse(f.read())
            if self.walk(tree.root_node, src):
                self.ret = 1
        except Exception as e:
            self.log_error(src, '', '', 'unknown error: ' + str(e))
            self.ret = 1

    usage = resource.getrusage(resource.RUSAGE_SELF)
    profiling_data = (usage.ru_utime, usage.ru_stime, usage.ru_maxrss)
    return (self.log, self.ret) + profiling_data
def file_parse(path, name):
    Language.build_library('../build/my-languages.so', ['../tree-sitter-python'])
    PY_LANGUAGE = Language('../build/my-languages.so', 'python')
    parser = Parser()
    parser.set_language(PY_LANGUAGE)
    code = read_file(str(path))
    encoded_code = bytes(code, "utf8")
    tree = parser.parse(encoded_code)
    cursor = tree.walk()
    root_node = tree.root_node
    Graph = nx.DiGraph()
    f = open('result_dot/' + str(name) + '.dot', 'w')
    f.write('digraph G{\n')
    f.write('rankdir="LR";\n')
    traverse(root_node, Graph, encoded_code, f)
    global import_lists
    write_together(f, import_lists)
    f.write("}")
    f.close()
    # write_in_dot(Graph)
    return None
def corpus_dataflow_match(references, candidates, lang):
    LANGUAGE = Language('parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(LANGUAGE)
    parser = [parser, dfg_function[lang]]
    match_count = 0
    total_count = 0
    for i in range(len(candidates)):
        references_sample = references[i]
        candidate = candidates[i]
        for reference in references_sample:
            try:
                candidate = remove_comments_and_docstrings(candidate, 'java')
            except:
                pass
            try:
                reference = remove_comments_and_docstrings(reference, 'java')
            except:
                pass

            cand_dfg = get_data_flow(candidate, parser)
            ref_dfg = get_data_flow(reference, parser)

            normalized_cand_dfg = normalize_dataflow(cand_dfg)
            normalized_ref_dfg = normalize_dataflow(ref_dfg)

            if len(normalized_ref_dfg) > 0:
                total_count += len(normalized_ref_dfg)
                for dataflow in normalized_ref_dfg:
                    if dataflow in normalized_cand_dfg:
                        match_count += 1
                        normalized_cand_dfg.remove(dataflow)
    score = match_count / total_count
    return score
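# Hypothetical call sketch for corpus_dataflow_match (not from the original
# source). It assumes 'parser/my-languages.so' has been built and that
# dfg_function, get_data_flow and normalize_dataflow are available, as in the
# CodeBLEU-style dataflow-match code this appears to follow.
references = [["def add(a, b):\n    return a + b"]]  # one list of references per candidate
candidates = ["def add(x, y):\n    return x + y"]
score = corpus_dataflow_match(references, candidates, 'python')
print(f"dataflow match: {score:.3f}")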
def __init__(self, **options):
    self.parser = Parser()
    self.parser.set_language(self.language)
    self.escape = options.get("escapeinside", None)
    if self.escape is not None:
        self.escape = bytes(self.escape, "utf8")
    super().__init__(**options)
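# Hypothetical subclass sketch for the mixin-style __init__ above (not from the
# original source). It assumes subclasses provide a `language` attribute holding
# a tree-sitter Language object and that the base class accepts lexer-style
# keyword options; the class names below are assumptions.
class PythonTreeSitterLexer(TreeSitterLexerBase):  # hypothetical base class
    language = Language('build/my-languages.so', 'python')

lexer = PythonTreeSitterLexer(escapeinside="||")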
def test_child_by_field_id(self):
    parser = Parser()
    parser.set_language(PYTHON)
    tree = parser.parse(b"def foo():\n bar()")
    root_node = tree.root_node
    fn_node = tree.root_node.children[0]

    self.assertEqual(PYTHON.field_id_for_name("nameasdf"), None)
    name_field = PYTHON.field_id_for_name("name")
    alias_field = PYTHON.field_id_for_name("alias")
    self.assertIsInstance(alias_field, int)
    self.assertIsInstance(name_field, int)
    self.assertEqual(root_node.child_by_field_id(alias_field), None)
    self.assertEqual(root_node.child_by_field_id(name_field), None)
    self.assertEqual(fn_node.child_by_field_id(alias_field), None)
    self.assertEqual(
        fn_node.child_by_field_id(name_field).type, "identifier")
    self.assertRaises(TypeError, root_node.child_by_field_id, "")
    self.assertRaises(TypeError, root_node.child_by_field_name, True)
    self.assertRaises(TypeError, root_node.child_by_field_name, 1)
    self.assertEqual(
        fn_node.child_by_field_name("name").type, "identifier")
    self.assertEqual(fn_node.child_by_field_name("asdfasdfname"), None)
    self.assertEqual(
        fn_node.child_by_field_name("name"),
        fn_node.child_by_field_name("name"),
    )
def checkAndGenerateAST(i, lstCFilesStep1, fopStep2, fopASTInfo, fopStep4GraphAll,
                        fopStep4GraphSimplify, fpLog, nlpObj, offsetContext, isSaveGraph):
    fpMixFileCPP = lstCFilesStep1[i]
    lenFile = len(lstCFilesStep1)
    nameOfFile = os.path.basename(fpMixFileCPP)
    nameWithoutExtension = nameOfFile.replace('.cpp', '')
    fpCompiledCPP = fopStep2 + nameOfFile
    fpASTItem = fopASTInfo + nameOfFile.replace('.cpp', '_ast.txt')
    isRunOK = False
    try:
        parser = Parser()
        parser.set_language(CPP_LANGUAGE)
        # getJsonDict(fpCPP, fpDotGraphAllText, fpDotGraphAllImage, fpDotGraphSimplifyText,
        #             fpDotGraphSimplifyImage, parser, offsetContext)
        fpDotGraphAllText = fopStep4GraphAll + nameWithoutExtension + '_all.dot'
        fpDotGraphAllImage = fopStep4GraphAll + nameWithoutExtension + '_all.png'
        fpDotGraphSimplifyText = fopStep4GraphSimplify + nameWithoutExtension + '_simplify.dot'
        fpDotGraphSimplifyImage = fopStep4GraphSimplify + nameWithoutExtension + '_simplify.png'
        f1 = open(fpMixFileCPP, 'r')
        strItem = f1.read()
        f1.close()
        start_time = time.time()
        jsonObject = getJsonDict(fpMixFileCPP, fpDotGraphAllText, fpDotGraphAllImage,
                                 fpDotGraphSimplifyText, fpDotGraphSimplifyImage,
                                 parser, nlpObj, offsetContext, isSaveGraph)
        # strASTOfFile = walker.getRepresentASTFromFile(fpCodeFileCPP, indexTu)
        end_time = time.time()
        numWordItem = len(strItem.split())
        itemTimeProcess = (end_time - start_time)
        # Both checks must hold; with 'or' the condition would always be true.
        if str(jsonObject) != 'Error' and str(jsonObject) != 'None':
            # arrContentOfFile = strContentOfFile.split('\n')
            strContentAppend = '\n'.join(
                [nameOfFile, str(jsonObject), '\n\n\n'])
            f1 = open(fpASTItem, 'w')
            f1.write(strContentAppend)
            f1.close()
            shutil.copyfile(fpMixFileCPP, fopStep2 + nameOfFile)
            f1 = open(fpLog, 'a')
            f1.write('{}\t{}\n'.format(nameOfFile, 'True'))
            f1.close()
            isRunOK = True
            # print('{}\t{}'.format(strCommand, isRunOK))
        else:
            f1 = open(fpLog, 'a')
            f1.write('{}\t{}\n'.format(nameOfFile, 'False'))
            f1.close()
            # print('{}\t{}'.format(strCommand, isRunOK))
        print('OK {}/{} {}'.format(i, len(lstCFilesStep1), fpMixFileCPP))
    except:
        print("Exception in user code:")
        print("-" * 60)
        traceback.print_exc(file=sys.stdout)
        print("-" * 60)
        print('Error: {} {}'.format(i, fpMixFileCPP))
        print('Error {}/{} {}'.format(i, len(lstCFilesStep1), fpMixFileCPP))
    return i, numWordItem, itemTimeProcess
def test_children_by_field_name(self):
    parser = Parser()
    parser.set_language(JAVASCRIPT)
    tree = parser.parse(b"<div a={1} b={2} />")
    jsx_node = tree.root_node.children[0].children[0]
    attributes = jsx_node.children_by_field_name("attribute")
    self.assertEqual([a.type for a in attributes],
                     ["jsx_attribute", "jsx_attribute"])
def test_field_name_for_child(self):
    parser = Parser()
    parser.set_language(JAVASCRIPT)
    tree = parser.parse(b"<div a={1} b={2} />")
    jsx_node = tree.root_node.children[0].children[0]

    self.assertEqual(jsx_node.field_name_for_child(0), None)
    self.assertEqual(jsx_node.field_name_for_child(1), "name")
class Code_Parser():
    def __init__(self, grammar, language="python",
                 parser_library_path='src/tree-sitter/tree-sitter-python', **kwargs):
        Language.build_library('/build/my-languages.so', [parser_library_path])
        LANGUAGE = Language('/build/my-languages.so', language)
        self.grammar = grammar
        self.TS_parser = Parser()
        self.TS_parser.set_language(LANGUAGE)
        self.node_builder = NodeBuilder(self.grammar)

    def code_to_sequence(self, code_str):
        tree = self.TS_parser.parse(bytes(code_str, "utf8"))
        root_node = tree.root_node
        sequence = self.TSTree_to_sequence(root_node, code_str)
        return sequence

    def TSTree_to_sequence(self, TSNode, code_str):
        node_sequence = [TSNode.type]
        if TSNode.type == "string":
            node_text = sub_str_from_coords(code_str, TSNode.start_point, TSNode.end_point)[1:-1]
            node_sequence += ["_string_start", '"', "<REDUCE>"]
            node_sequence += ["_string_content", node_text, "<REDUCE>"]
            node_sequence += ["_string_end", '"', "<REDUCE>"]
        elif TSNode.children == []:
            node_text = sub_str_from_coords(code_str, TSNode.start_point, TSNode.end_point)
            if TSNode.type != node_text:
                node_sequence.append(node_text)
        elif TSNode.children != []:
            for child in TSNode.children:
                node_sequence += self.TSTree_to_sequence(child, code_str)
        node_sequence.append("<REDUCE>")
        return node_sequence

    def is_valid_sequence(self, sequence):
        first_node = sequence[0]
        if first_node != "module":
            return False
        partial_tree = PartialTree(first_node, self.node_builder)
        try:
            for expansion in sequence[1:]:
                partial_tree.add_action(expansion)
        except Exception as e:
            return False
        return True

    def sequence_to_partial_tree(self, sequence):
        first_node = sequence[0]
        partial_tree = PartialTree(first_node, self.node_builder)
        try:
            for expansion in sequence[1:]:
                partial_tree.add_action(expansion)
        except Exception as e:
            print("ERROR!")
            traceback.print_exc()
            print("-------")
        return partial_tree
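# Hypothetical round-trip sketch for Code_Parser (not from the original source).
# It assumes `grammar` is defined elsewhere and is compatible with NodeBuilder,
# and that a tree-sitter-python checkout exists at the default parser_library_path.
code_parser = Code_Parser(grammar)
sequence = code_parser.code_to_sequence("x = 1\n")
if code_parser.is_valid_sequence(sequence):
    partial_tree = code_parser.sequence_to_partial_tree(sequence)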
def get_parser(language):
    language = LANGUAGE_ALIASES.get(language, language)
    if language in PARSERS:
        return PARSERS[language]
    LANGUAGE = Language(tree_sitter_build, language)
    parser = Parser()
    parser.set_language(LANGUAGE)
    PARSERS[language] = parser
    return parser
def __init__(
    self,
    SO_FILE: str,
    LANGUAGE: str,
    to_lower=True,
):
    self.parser = Parser()
    self.parser.set_language(Language(SO_FILE, LANGUAGE))
    self.LANGUAGE = LANGUAGE
    self.to_lower = to_lower
def get_parser(so_path: str = None) -> Parser:
    if so_path is None:
        so_path = JAVA_SO_PATH
    JAVA_LANGUAGE = Language(so_path, 'java')
    parser = Parser()
    parser.set_language(JAVA_LANGUAGE)
    return parser
def __init__(self) -> None:
    # assume submodules exist
    vendor_dirs = ["vendor/tree-sitter-%s" % l for l in TREE_SITTER_LANGS]
    Language.build_library(BUILD_PATH, vendor_dirs)
    self.parsers = {}
    for l in TREE_SITTER_LANGS:
        parser = Parser()
        # Bind each parser to its own language rather than a hard-coded one.
        parser.set_language(Language(BUILD_PATH, l))
        self.parsers[l] = parser
def __init__(self):
    if not os.path.exists('build/my-languages.so'):
        Language.build_library('build/my-languages.so', [
            'vendor/tree-sitter-c',
            'vendor/tree-sitter-cpp',
            'vendor/tree-sitter-c-sharp',
            'vendor/tree-sitter-rust',
            'vendor/tree-sitter-javascript',
            'vendor/tree-sitter-python'
        ])
    self.ts = Parser()
    self.tree = None
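# Hypothetical follow-up sketch (not from the original source): the constructor
# above builds the shared library but never calls set_language, so the caller
# presumably selects a grammar before parsing. The class name is an assumption.
analyzer = CodeAnalyzer()  # hypothetical name for the class defined above
analyzer.ts.set_language(Language('build/my-languages.so', 'python'))
analyzer.tree = analyzer.ts.parse(b"print('hello')\n")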
def test_tree_cursor_without_tree(self):
    parser = Parser()
    parser.set_language(PYTHON)

    def parse():
        tree = parser.parse(b"def foo():\n bar()")
        return tree.walk()

    cursor = parse()
    self.assertIs(cursor.node, cursor.node)
    for item in cursor.node.children:
        self.assertIsNotNone(item.is_named)
def test_text_predicates_errors(self):
    parser = Parser()
    parser.set_language(JAVASCRIPT)
    with self.assertRaises(RuntimeError):
        JAVASCRIPT.query("""
            ((function_declaration name: (identifier) @function-name)
             (#eq? @function-name @function-name fun1))
        """)
    with self.assertRaises(RuntimeError):
        JAVASCRIPT.query("""
            ((function_declaration name: (identifier) @function-name)
             (#eq? fun1 @function-name))
        """)
    with self.assertRaises(RuntimeError):
        JAVASCRIPT.query("""
            ((function_declaration name: (identifier) @function-name)
             (#match? @function-name @function-name fun1))
        """)
    with self.assertRaises(RuntimeError):
        JAVASCRIPT.query("""
            ((function_declaration name: (identifier) @function-name)
             (#match? fun1 @function-name))
        """)
    with self.assertRaises(RuntimeError):
        JAVASCRIPT.query("""
            ((function_declaration name: (identifier) @function-name)
             (#match? @function-name @function-name))
        """)
def __init__(self, so_file: str, language: str, operators_file: str = None):
    self.parser = Parser()
    self.parser.set_language(Language(so_file, language))
    self.language = language
    if operators_file is None:
        operators_file = os.path.join(os.path.dirname(__file__), 'operators.json')
    with open(operators_file, 'r') as reader:
        self.operators = ujson.load(reader)
def test_set_language(self):
    parser = Parser()
    parser.set_language(PYTHON)
    tree = parser.parse(b"def foo():\n bar()")
    self.assertEqual(
        tree.root_node.sexp(),
        trim("""(module (function_definition
            name: (identifier)
            parameters: (parameters)
            body: (block (expression_statement (call
                function: (identifier)
                arguments: (argument_list))))))"""),
    )
    parser.set_language(JAVASCRIPT)
    tree = parser.parse(b"function foo() {\n bar();\n}")
    self.assertEqual(
        tree.root_node.sexp(),
        trim("""(program (function_declaration
            name: (identifier)
            parameters: (formal_parameters)
            body: (statement_block (expression_statement (call_expression
                function: (identifier)
                arguments: (arguments))))))"""),
    )
def main(file):
    this_directory = os.path.dirname(__file__)
    # filename = os.path.join(this_directory, '/relative/path/to/file/you/want')

    # This code configures the Tree-sitter parsing tool
    Language.build_library(
        # Store the library in the `build` directory
        os.path.join(this_directory, 'build/my-languages.so'),
        # Include one or more languages
        [
            # 'vendor/tree-sitter-go',
            os.path.join(this_directory, 'vendor/tree-sitter-java')
            # 'vendor/tree-sitter-python'
        ])
    java_lang = Language(os.path.join(this_directory, 'build/my-languages.so'), 'java')

    # Parsing algorithm starts here
    parser = Parser()
    parser.set_language(java_lang)

    # For debugging
    tree_sitter_tree = parser.parse(read_file(file))
    # For production
    # tree_sitter_tree = parser.parse(read_file(file))

    gumtree_ast = to_gumtree_node(tree_sitter_tree.root_node)

    # everything should be inside the root tag
    root_node = doc.createElement('root')
    # in the test case they have a context tag, which is empty; not sure why it is needed
    context_node = doc.createElement('context')
    # We append our root node to the document
    doc.appendChild(root_node)
    # Append the context tag to the root node (<root> </root>)
    root_node.appendChild(context_node)
    # Append data into the <root> tag. At this stage we append the parsed code structure.
    root_node.appendChild(gumtree_ast)
    # Recursively add children nodes (if they exist)
    process_node(tree_sitter_tree.root_node, gumtree_ast)

    xml = doc.toprettyxml()
    print(xml)
def add_lcom5(df, col):
    lang_builds = create_parser_builds()
    parser = Parser()
    class_lcom5 = []
    for i in range(len(df)):
        ext = df["name"][i].split('.')[-1]
        parser.set_language(lang_builds[ext])
        enc = df["encoding"][i]
        tree = parser.parse(bytes(df["contents"][i], df["encoding"][i]))
        class_lcom5.append(calculate_lcom5(tree, ext,
                                           bytes(df["contents"][i], df["encoding"][i]),
                                           df["name"][i]))
    df["class_lcom5"] = class_lcom5
    return df
def get_parser(lang: str) -> Parser:
    """
    Initialize parser for a specific language.

    :param lang: language to use.
    :return: parser.
    """
    global PARSERS
    if lang not in PARSERS:
        parser = Parser()
        parser.set_language(Language(get_tree_sitter_so(), lang))
        PARSERS[lang] = parser
    else:
        parser = PARSERS[lang]
    return parser
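# Hypothetical usage sketch for the cached get_parser above (not from the
# original source). It assumes PARSERS is a module-level dict and that
# get_tree_sitter_so() returns a path to a library built with a 'python' grammar.
parser = get_parser("python")
tree = parser.parse(b"def foo():\n    pass\n")
assert get_parser("python") is parser  # a second call reuses the cached parser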
def test_multibyte_characters(self):
    parser = Parser()
    parser.set_language(JAVASCRIPT)
    source_code = bytes("'😎' && '🐍'", "utf8")
    tree = parser.parse(source_code)
    root_node = tree.root_node
    statement_node = root_node.children[0]
    binary_node = statement_node.children[0]
    snake_node = binary_node.children[2]

    self.assertEqual(binary_node.type, "binary_expression")
    self.assertEqual(snake_node.type, "string")
    self.assertEqual(
        source_code[snake_node.start_byte:snake_node.end_byte].decode('utf8'),
        "'🐍'",
    )
def jobs(repo_path, args):
    PARSER = Parser()
    PARSER.set_language(Language(args.tree_sitter, args.lang))
    n_file_per_commit = Counter()
    add_tokens_per_del_tokens = []
    if os.path.exists(repo_path):
        submodule = os.path.join(repo_path, '.gitmodules')
        if os.path.exists(submodule):
            os.remove(submodule)
        try:
            n_stored_commit = 0
            for commit in RepositoryMining(
                repo_path,
                only_no_merge=True,
                only_in_branch='master',
                only_modifications_with_file_types=language_ext[args.lang]
            ).traverse_commits():
                if n_stored_commit > args.max_commit_number:
                    break
                cleaned_message = message_cleaner(commit.msg)
                if not cleaned_message:
                    continue
                commit_tokens = tokenize_docstring_from_string(cleaned_message)
                if len(commit_tokens) < args.min_target_length:
                    continue
                addeds, deleteds, n_files = get_code_diff(commit, PARSER, args)
                if 1 <= n_files <= args.max_duplicate:
                    with jsonlines.open(args.output_file, mode="a") as writer:
                        writer.write(
                            {
                                "commit_tokens": commit_tokens,
                                "add_tokens": addeds[0],
                                "del_tokens": deleteds[0],
                            }
                        )
                    add_tokens_per_del_tokens.append(
                        len(addeds[0]) / len(deleteds[0])
                    )
                    n_file_per_commit.update({n_files})
                    n_stored_commit += 1
        except:
            pass
    return (n_file_per_commit, add_tokens_per_del_tokens)