def main() -> None:
    """
    Initialize tree-sitter library.

    :return: None.
    """
    # root directory for tree-sitter
    tree_sitter_dir = get_tree_sitter_dir()
    # grammar locations
    c_grammar_loc = os.path.join(tree_sitter_dir, "vendor/tree-sitter-c")
    c_sharp_grammar_loc = os.path.join(tree_sitter_dir, "vendor/tree-sitter-c-sharp")
    cpp_grammar_loc = os.path.join(tree_sitter_dir, "vendor/tree-sitter-cpp")
    java_grammar_loc = os.path.join(tree_sitter_dir, "vendor/tree-sitter-java")
    python_grammar_loc = os.path.join(tree_sitter_dir, "vendor/tree-sitter-python")
    # location for the compiled library
    bin_loc = get_tree_sitter_so()
    # build everything
    Language.build_library(
        # Store the library at `bin_loc`
        bin_loc,
        # Include languages
        [c_grammar_loc, c_sharp_grammar_loc, cpp_grammar_loc, java_grammar_loc, python_grammar_loc],
    )
    print("Parser successfully initialized.")
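# Usage sketch (not part of the original snippets): once main() above has built the shared
# library, the grammars can be loaded back with tree_sitter.Language and used for parsing.
# Assumption: get_tree_sitter_so() returns the same path that was passed to Language.build_library.
from tree_sitter import Language, Parser


def parse_python_snippet(source: str):
    # Load the compiled grammar bundle and pick the Python grammar out of it.
    py_language = Language(get_tree_sitter_so(), "python")
    parser = Parser()
    parser.set_language(py_language)
    # tree-sitter expects bytes, not str
    return parser.parse(source.encode("utf8"))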
def build_so(lib_dir, lang):
    """Build the .so file for a given language with Tree-Sitter."""
    _lib_dir = os.path.expanduser(lib_dir)
    lib_file = os.path.join(lib_dir, '{}.zip'.format(lang))
    _lib_file = os.path.join(_lib_dir, '{}.zip'.format(lang))
    if os.path.exists(_lib_file):
        LOGGER.info('Tree-Sitter so file for {} does not exist, compiling.'.format(lib_file))
        # decompress the Tree-Sitter grammar archive
        with zipfile.ZipFile(_lib_file, 'r') as zip_file:
            zip_file.extractall(path=_lib_dir)
        so_file = os.path.join(lib_dir, '{}.so'.format(lang))
        _so_file = os.path.join(_lib_dir, '{}.so'.format(lang))
        LOGGER.info('Building Tree-Sitter compiled file {}'.format(so_file))
        Language.build_library(
            # your language parser file; we recommend building a *.so file for each language
            _so_file,
            # Include one or more languages
            [os.path.join(_lib_dir, 'tree-sitter-{}-master'.format(lang))],
        )
    else:
        LOGGER.info('Tree-Sitter so file for {} already exists, skipping.'.format(lib_file))
def __init__(
    self,
    langs: List[str],
    added_nodes: Dict[str, Dict[str, str]],
    skip_node_types: Dict[str, List[str]],
    vendors_path: Path = Path("./vendor"),
):
    super(TreeSitterParser, self).__init__()
    vendors = []
    self.added_nodes = added_nodes
    self.skip_node_types = skip_node_types
    for lang in langs:
        vendors.append(vendors_path / f"tree-sitter-{lang}")
        if lang not in added_nodes:
            self.added_nodes[lang] = {"prefix": "", "suffix": ""}
        if lang not in skip_node_types:
            self.skip_node_types[lang] = []
    Language.build_library(
        # Store the library in the `build` directory
        "build/my-languages.so",
        # Include one or more languages
        vendors,
    )
    self.parser = Parser()
def main() -> None:
    """
    Initialize tree-sitter library.

    :return: None
    """
    # root directory for tree-sitter
    tree_sitter_dir = get_tree_sitter_dir()
    # grammar locations
    c_grammar_loc = os.path.join(tree_sitter_dir, "tree-sitter-c")
    c_sharp_grammar_loc = os.path.join(tree_sitter_dir, "tree-sitter-c-sharp")
    cpp_grammar_loc = os.path.join(tree_sitter_dir, "tree-sitter-cpp")
    java_grammar_loc = os.path.join(tree_sitter_dir, "tree-sitter-java")
    # location for the compiled library
    bin_loc = os.path.join(tree_sitter_dir, "build/langs.so")
    # build everything
    Language.build_library(
        # Store the library at `bin_loc`
        bin_loc,
        # Include languages
        [c_grammar_loc, c_sharp_grammar_loc, cpp_grammar_loc, java_grammar_loc],
    )
def file_parse(path, name):
    Language.build_library('../build/my-languages.so', ['../tree-sitter-python'])
    PY_LANGUAGE = Language('../build/my-languages.so', 'python')
    parser = Parser()
    parser.set_language(PY_LANGUAGE)
    code = read_file(str(path))
    encoded_code = bytes(code, "utf8")
    tree = parser.parse(encoded_code)
    cursor = tree.walk()
    root_node = tree.root_node
    Graph = nx.DiGraph()
    f = open('result_dot/' + str(name) + '.dot', 'w')
    f.write('digraph G{\n')
    f.write('rankdir="LR";\n')
    traverse(root_node, Graph, encoded_code, f)
    global import_lists
    write_together(f, import_lists)
    f.write("}")
    f.close()
    # write_in_dot(Graph)
    return None
def __init__(self):
    if not os.path.exists('build/my-languages.so'):
        Language.build_library(
            'build/my-languages.so',
            [
                'vendor/tree-sitter-c',
                'vendor/tree-sitter-cpp',
                'vendor/tree-sitter-c-sharp',
                'vendor/tree-sitter-rust',
                'vendor/tree-sitter-javascript',
                'vendor/tree-sitter-python',
            ],
        )
    self.ts = Parser()
    self.tree = None
def build_libraries(languages: List[str], path):
    # Forcing tree-sitter to create a new library
    if os.path.isfile(path):
        os.remove(path)
    Language.build_library(
        # Store the library in the `build` directory
        path,
        # Include one or more languages
        languages,
    )
def __init__(self, grammar, language="python",
             parser_library_path='src/tree-sitter/tree-sitter-python', **kwargs):
    Language.build_library('/build/my-languages.so', [parser_library_path])
    LANGUAGE = Language('/build/my-languages.so', language)
    self.grammar = grammar
    self.TS_parser = Parser()
    self.TS_parser.set_language(LANGUAGE)
    self.node_builder = NodeBuilder(self.grammar)
def __init__(self) -> None:
    # assume submodules exist
    vendor_dirs = ["vendor/tree-sitter-%s" % l for l in TREE_SITTER_LANGS]
    Language.build_library(BUILD_PATH, vendor_dirs)
    self.parsers = {}
    for l in TREE_SITTER_LANGS:
        parser = Parser()
        # bind each parser to its own language (the original hard-coded "haskell" here)
        parser.set_language(Language(BUILD_PATH, l))
        self.parsers[l] = parser
def main() -> None:
    """
    Initialize tree-sitter library.

    :return: None.
    """
    download_grammars()
    grammar_locs = get_grammar_locs()
    bin_loc = get_tree_sitter_so()
    Language.build_library(bin_loc, grammar_locs)
    logging.info("Parser successfully initialized.")
def create_parser_builds(path=None):
    Language.build_library(
        # Store the library in the `build` directory
        'build/my-languages.so',
        # Include one or more languages
        ['tree-sitter-java'],
    )
    JAVA_LANGUAGE = Language('build/my-languages.so', 'java')
    return {"java": JAVA_LANGUAGE}
def main(file):
    this_directory = os.path.dirname(__file__)
    # filename = os.path.join(this_directory, '/relative/path/to/file/you/want')

    # Configure the Tree-sitter parsing tool
    Language.build_library(
        # Store the library in the `build` directory
        os.path.join(this_directory, 'build/my-languages.so'),
        # Include one or more languages
        [
            # 'vendor/tree-sitter-go',
            os.path.join(this_directory, 'vendor/tree-sitter-java'),
            # 'vendor/tree-sitter-python'
        ])
    java_lang = Language(os.path.join(this_directory, 'build/my-languages.so'), 'java')

    # Parsing starts here
    parser = Parser()
    parser.set_language(java_lang)
    tree_sitter_tree = parser.parse(read_file(file))
    gumtree_ast = to_gumtree_node(tree_sitter_tree.root_node)

    # Everything should be inside the <root> tag
    root_node = doc.createElement('root')
    # The reference test cases contain an empty <context> tag; its purpose is unclear, but it is kept for compatibility
    context_node = doc.createElement('context')
    # Append the root node to the document
    doc.appendChild(root_node)
    # Append the context tag to the root node (<root> </root>)
    root_node.appendChild(context_node)
    # Append the parsed code structure into the <root> tag
    root_node.appendChild(gumtree_ast)
    # Recursively add child nodes (if any)
    process_node(tree_sitter_tree.root_node, gumtree_ast)
    xml = doc.toprettyxml()
    print(xml)
def main() -> None:
    """
    Initialize tree-sitter library.

    :return: None.
    """
    # root directory for tree-sitter
    tree_sitter_dir = get_tree_sitter_dir()
    # grammar locations
    javascript_grammar_loc = os.path.join(tree_sitter_dir, "vendor", "tree-sitter-javascript")
    java_grammar_loc = os.path.join(tree_sitter_dir, "vendor", "tree-sitter-java")
    python_grammar_loc = os.path.join(tree_sitter_dir, "vendor", "tree-sitter-python")
    go_grammar_loc = os.path.join(tree_sitter_dir, "vendor", "tree-sitter-go")
    cpp_grammar_loc = os.path.join(tree_sitter_dir, "vendor", "tree-sitter-cpp")
    ruby_grammar_loc = os.path.join(tree_sitter_dir, "vendor", "tree-sitter-ruby")
    typescript_grammar_loc = os.path.join(tree_sitter_dir, "vendor", "tree-sitter-typescript", "typescript")
    tsx_grammar_loc = os.path.join(tree_sitter_dir, "vendor", "tree-sitter-typescript", "tsx")
    php_grammar_loc = os.path.join(tree_sitter_dir, "vendor", "tree-sitter-php")
    c_sharp_grammar_loc = os.path.join(tree_sitter_dir, "vendor", "tree-sitter-c-sharp")
    c_grammar_loc = os.path.join(tree_sitter_dir, "vendor", "tree-sitter-c")
    bash_grammar_loc = os.path.join(tree_sitter_dir, "vendor", "tree-sitter-bash")
    rust_grammar_loc = os.path.join(tree_sitter_dir, "vendor", "tree-sitter-rust")
    # location for the compiled library
    bin_loc = get_tree_sitter_so()
    # build everything
    Language.build_library(
        # Store the library at `bin_loc`
        bin_loc,
        # Include languages
        [
            javascript_grammar_loc,
            python_grammar_loc,
            java_grammar_loc,
            go_grammar_loc,
            cpp_grammar_loc,
            ruby_grammar_loc,
            typescript_grammar_loc,
            tsx_grammar_loc,
            php_grammar_loc,
            c_sharp_grammar_loc,
            c_grammar_loc,
            bash_grammar_loc,
            rust_grammar_loc,
        ])
    print("Parser successfully initialized.")
def __init__(self, code, language='python', tree_style='AST', path_style='U2D'):
    # AST | SPT || HST | HPT
    self.tree_style = tree_style
    # L2L | UD | U2D
    self.path_style = path_style
    # Use the Language.build_library method to compile these
    # into a library that's usable from Python:
    csn_so = 'scripts/build/csn.so'
    # Language.build_library(
    #     csn_so,
    #     [
    #         'vendor/tree-sitter-go',
    #         'vendor/tree-sitter-java',
    #         'vendor/tree-sitter-javascript',
    #         'vendor/tree-sitter-php',
    #         'vendor/tree-sitter-python',
    #         'vendor/tree-sitter-ruby',
    #     ]
    # )
    parser = Parser()
    # Load the languages into your app as Language objects:
    # ('go', 'java', 'javascript', 'php', 'python', 'ruby')
    parser.set_language(Language(csn_so, language))
    tree = parser.parse(code.encode())
    code_lines = code.split('\n')
    self.root, self.terminals = self.traverse(tree, code_lines)
    self.debug = True
    if self.debug:
        print(f'{language}{"@" * 9}code\n{code}')
        print(f'{language}{"@" * 9}sexp\n{tree.root_node.sexp()}')
def run(self, tmp_dir, params):
    self.ret = 0
    self.log = ''
    try:
        lib = self.find_lib()
        lang = Language(lib, 'verilog')
        parser = Parser()
        parser.set_language(lang)
    except Exception as e:
        self.log += f'{e}\n'
        self.ret = 1
    for src in params['files']:
        f = None
        try:
            f = open(src, 'rb')
        except IOError:
            self.ret = 1
            self.log_error(src, '', '', 'failed to open file')
            continue
        try:
            tree = parser.parse(f.read())
            if self.walk(tree.root_node, src):
                self.ret = 1
        except Exception as e:
            self.log_error(src, '', '', 'unknown error: ' + str(e))
            self.ret = 1
    usage = resource.getrusage(resource.RUSAGE_SELF)
    profiling_data = (usage.ru_utime, usage.ru_stime, usage.ru_maxrss)
    return (self.log, self.ret) + profiling_data
def corpus_dataflow_match(references, candidates, lang):
    LANGUAGE = Language('parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(LANGUAGE)
    parser = [parser, dfg_function[lang]]
    match_count = 0
    total_count = 0
    for i in range(len(candidates)):
        references_sample = references[i]
        candidate = candidates[i]
        for reference in references_sample:
            try:
                candidate = remove_comments_and_docstrings(candidate, 'java')
            except:
                pass
            try:
                reference = remove_comments_and_docstrings(reference, 'java')
            except:
                pass
            cand_dfg = get_data_flow(candidate, parser)
            ref_dfg = get_data_flow(reference, parser)
            normalized_cand_dfg = normalize_dataflow(cand_dfg)
            normalized_ref_dfg = normalize_dataflow(ref_dfg)
            if len(normalized_ref_dfg) > 0:
                total_count += len(normalized_ref_dfg)
                for dataflow in normalized_ref_dfg:
                    if dataflow in normalized_cand_dfg:
                        match_count += 1
                        normalized_cand_dfg.remove(dataflow)
    score = match_count / total_count
    return score
def __init__(self, language_type, encoding='utf-8'):
    self.language_type = language_type
    self.encoding = encoding
    self.parser = Parser()
    self.parser.set_language(Language(LIB_BIN, self.language_type))
    self.UpdateBuffer([""])
    self._res = []
def language_installed(language):
    if not Path(tree_sitter_build).exists():
        return False
    try:
        Language(tree_sitter_build, language)
        return True
    except:
        return False
def get_parser(language):
    language = LANGUAGE_ALIASES.get(language, language)
    if language in PARSERS:
        return PARSERS[language]
    LANGUAGE = Language(tree_sitter_build, language)
    parser = Parser()
    parser.set_language(LANGUAGE)
    PARSERS[language] = parser
    return parser
def __init__(
    self,
    SO_FILE: str,
    LANGUAGE: str,
    to_lower=True,
):
    self.parser = Parser()
    self.parser.set_language(Language(SO_FILE, LANGUAGE))
    self.LANGUAGE = LANGUAGE
    self.to_lower = to_lower
def get_parser(so_path: str = None) -> Parser:
    if so_path is None:
        so_path = JAVA_SO_PATH
    JAVA_LANGUAGE = Language(so_path, 'java')
    parser = Parser()
    parser.set_language(JAVA_LANGUAGE)
    return parser
def install_parsers(languages=None):
    if not languages:
        languages = supported_languages
    if all(language_installed(lang) for lang in languages):
        print(f"Parsers for languages {languages} already installed.")
        return
    wd = os.getcwd()
    os.chdir(PARSER_DIR)
    for lang in languages:
        if lang not in supported_languages:
            raise ValueError(
                f"{lang} not supported. The supported languages are: {', '.join(sorted(supported_languages))}."
            )
        repo = f"tree-sitter-{lang}"
        git_clone(f"https://github.com/tree-sitter/{repo}")
    Language.build_library(
        tree_sitter_build,
        [str(PARSER_DIR / f"tree-sitter-{lang}") for lang in supported_languages],
    )
    os.chdir(wd)
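# Usage sketch (assumption, not from the original code): with the helpers above, a parser for a
# supported language is obtained in two steps. install_parsers clones the grammar repos and builds
# the shared library once; get_parser then returns a cached Parser bound to that language.
# Assumes "python" is in supported_languages.
install_parsers(["python"])
parser = get_parser("python")
tree = parser.parse(b"def add(a, b):\n    return a + b\n")
print(tree.root_node.sexp())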
def build_parser(self):
    url, folder = self.LANG_URL
    repo_dir = Path(fast_trees.__path__[0] + "/" + folder)
    if repo_dir.exists():
        print("Repo already exists, continuing.")
    else:
        print(f"Downloading repo {url} to {repo_dir}.")
        Repo.clone_from(url, repo_dir)
    build_dir = fast_trees.__path__[0] + "/" + f"{repo_dir}/build/my-languages.so"
    Language.build_library(
        # Store the library in the `build` directory
        build_dir,
        # Include one or more languages
        [repo_dir],
    )
    self.language = Language(build_dir, self.LANG)
    self.parser = Parser()
    self.parser.set_language(self.language)
def parse_full(self, lang: str, code: str) -> Tuple[List[str], Set[str]]:
    LANGUAGE = Language("build/my-languages.so", lang)
    self.parser.set_language(LANGUAGE)
    code = f"{self.added_nodes[lang]['prefix']} {code} {self.added_nodes[lang]['suffix']}"
    tree = self.parser.parse(bytes(code, "utf8"))
    cursor = tree.walk()
    tokens, special_tokens = self.breadth_first_path(
        lang, code, cursor, skip_node_types=self.skip_node_types[lang]
    )
    return tokens, special_tokens
def make_tree_sitter_lib(args, lang_repo_list):
    """Create the library from the repos."""
    lp = lib_path(args)
    lib_name = args.lib_name
    full_lib_creation_path = f"{lp}/{lib_name}"
    if Path(full_lib_creation_path).exists():
        os.remove(full_lib_creation_path)
    return Language.build_library(full_lib_creation_path, lang_repo_list)
def _get_language_library(self):
    try:
        self.ts_lang_cache_lock.acquire(timeout=300)
        lib = self._get_language_cache_dir() / "language.so"
        repo = self._get_language_repo()
        repodir = self._get_language_repo_path()
        if not lib.exists():
            log.warn(f"building library for {self}, this could take a while...")
            start = time.time()
            Language.build_library(str(lib.resolve()), [repodir])
            log.debug(
                f"library build of {self} completed after {round(time.time() - start)} seconds"
            )
        return lib
    except filelock.Timeout as e:
        log.error(f"Failed to acquire lock on TSABL {self}")
        log.debug(f"lock object is {self.ts_lang_cache_lock}")
        raise e
    finally:
        self.ts_lang_cache_lock.release()
def parse(self, lang: str, code: str, max_tokens: Optional[int] = None) -> List[str]:
    LANGUAGE = Language("build/my-languages.so", lang)
    self.parser.set_language(LANGUAGE)
    code = f"{self.added_nodes[lang]['prefix']} {code} {self.added_nodes[lang]['suffix']}"
    tree = self.parser.parse(bytes(code, "utf8"))
    cursor = tree.walk()
    tokens = self.breadth_first_path_light(
        lang, code, cursor, skip_node_types=self.skip_node_types[lang], max_tokens=max_tokens
    )
    return tokens
def __init__(self, so_file: str, language: str, operators_file: str = None):
    self.parser = Parser()
    self.parser.set_language(Language(so_file, language))
    self.language = language
    if operators_file is None:
        operators_file = os.path.join(os.path.dirname(__file__), 'operators.json')
    with open(operators_file, 'r') as reader:
        self.operators = ujson.load(reader)
def corpus_syntax_match(references, candidates, lang):
    JAVA_LANGUAGE = Language('parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(JAVA_LANGUAGE)
    match_count = 0
    total_count = 0
    for i in range(len(candidates)):
        references_sample = references[i]
        candidate = candidates[i]
        for reference in references_sample:
            try:
                candidate = remove_comments_and_docstrings(candidate, 'java')
            except:
                pass
            try:
                reference = remove_comments_and_docstrings(reference, 'java')
            except:
                pass
            candidate_tree = parser.parse(bytes(candidate, 'utf8')).root_node
            reference_tree = parser.parse(bytes(reference, 'utf8')).root_node

            def get_all_sub_trees(root_node):
                node_stack = []
                sub_tree_sexp_list = []
                depth = 1
                node_stack.append([root_node, depth])
                while len(node_stack) != 0:
                    cur_node, cur_depth = node_stack.pop()
                    sub_tree_sexp_list.append([cur_node.sexp(), cur_depth])
                    for child_node in cur_node.children:
                        if len(child_node.children) != 0:
                            depth = cur_depth + 1
                            node_stack.append([child_node, depth])
                return sub_tree_sexp_list

            cand_sexps = [x[0] for x in get_all_sub_trees(candidate_tree)]
            ref_sexps = get_all_sub_trees(reference_tree)
            # print(cand_sexps)
            # print(ref_sexps)
            for sub_tree, depth in ref_sexps:
                if sub_tree in cand_sexps:
                    match_count += 1
            total_count += len(ref_sexps)
    score = match_count / total_count
    return score
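# Usage sketch (assumption, not from the original code): corpus_syntax_match expects one list of
# reference snippets per candidate and a language name present in the prebuilt
# 'parser/my-languages.so' bundle. The score is the fraction of reference sub-trees that also
# occur in the corresponding candidate tree.
references = [["public int add(int a, int b) { return a + b; }"]]
candidates = ["public int add(int x, int y) { return x + y; }"]
print(corpus_syntax_match(references, candidates, "java"))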
def __init__(self, language: str, query_class_name: str, query_file_path: str,
             library_loc: str = None):
    if os.getenv("TS_LIB_PATH") is not None and library_loc is None:
        library_loc = os.getenv("TS_LIB_PATH")
    if not library_loc:
        raise ParserLibraryNotFoundError(
            "Parser library path is 'None'. Please either set up the environment or call the constructor with the path"
        )
    if not Path(library_loc).exists() or not Path(library_loc).is_file():
        raise ParserLibraryNotFoundError(
            f"Parser library '{library_loc}' not found. Did you set up the environment variables?"
        )
    self.language = Language(library_loc, language)
    self.parser = Parser()
    self.parser.set_language(self.language)
    self.qclass = Query.fromFile(query_file_path)
    self.QUERIES = self.qclass[query_class_name]