def checkAndGenerateAST(i, lstCFilesStep1, fopStep2, fopASTInfo, fopStep4GraphAll,
                        fopStep4GraphSimplify, fpLog, nlpObj, offsetContext, isSaveGraph):
    """Parse the i-th C++ file into an AST (plus optional dot graphs) and log the outcome.

    On success the AST dump is written under fopASTInfo, the source file is
    copied into fopStep2, and a 'name<TAB>True' line is appended to fpLog;
    on a recognized parse failure a 'name<TAB>False' line is appended instead.

    Returns:
        (i, numWordItem, itemTimeProcess): the input index, the whitespace-token
        count of the source, and the wall-clock parse time in seconds. The two
        metrics are 0 when an exception fired before they were measured.
    """
    fpMixFileCPP = lstCFilesStep1[i]
    lenFile = len(lstCFilesStep1)
    nameOfFile = os.path.basename(fpMixFileCPP)
    nameWithoutExtension = nameOfFile.replace('.cpp', '')
    fpCompiledCPP = fopStep2 + nameOfFile
    fpASTItem = fopASTInfo + nameOfFile.replace('.cpp', '_ast.txt')
    isRunOK = False
    # BUG FIX: initialize the returned metrics so the final `return` does not
    # raise UnboundLocalError when an exception fires before they are set.
    numWordItem = 0
    itemTimeProcess = 0.0
    try:
        parser = Parser()
        parser.set_language(CPP_LANGUAGE)
        fpDotGraphAllText = fopStep4GraphAll + nameWithoutExtension + '_all.dot'
        fpDotGraphAllImage = fopStep4GraphAll + nameWithoutExtension + '_all.png'
        fpDotGraphSimplifyText = fopStep4GraphSimplify + nameWithoutExtension + '_simplify.dot'
        fpDotGraphSimplifyImage = fopStep4GraphSimplify + nameWithoutExtension + '_simplify.png'
        with open(fpMixFileCPP, 'r') as f1:
            strItem = f1.read()
        start_time = time.time()
        jsonObject = getJsonDict(fpMixFileCPP, fpDotGraphAllText, fpDotGraphAllImage,
                                 fpDotGraphSimplifyText, fpDotGraphSimplifyImage,
                                 parser, nlpObj, offsetContext, isSaveGraph)
        end_time = time.time()
        numWordItem = len(strItem.split())
        itemTimeProcess = end_time - start_time
        # BUG FIX: the original tested `!= 'Error' or != 'None'`, which is
        # always True (the failure branch was unreachable). The intent is
        # "neither Error nor None".
        if str(jsonObject) != 'Error' and str(jsonObject) != 'None':
            strContentAppend = '\n'.join([nameOfFile, str(jsonObject), '\n\n\n'])
            with open(fpASTItem, 'w') as f1:
                f1.write(strContentAppend)
            shutil.copyfile(fpMixFileCPP, fpCompiledCPP)
            with open(fpLog, 'a') as f1:
                f1.write('{}\t{}\n'.format(nameOfFile, 'True'))
            isRunOK = True
        else:
            with open(fpLog, 'a') as f1:
                f1.write('{}\t{}\n'.format(nameOfFile, 'False'))
        print('OK {}/{} {}'.format(i, lenFile, fpMixFileCPP))
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C still works; the traceback
        # is printed and the default (0, 0.0) metrics are returned.
        print("Exception in user code:")
        print("-" * 60)
        traceback.print_exc(file=sys.stdout)
        print("-" * 60)
        print('Error: {} {}'.format(i, fpMixFileCPP))
        print('Error {}/{} {}'.format(i, lenFile, fpMixFileCPP))
    return i, numWordItem, itemTimeProcess
def test_field_name_for_child(self):
    """JSX element children: index 0 ('<') has no field, index 1 is 'name'."""
    parser = Parser()
    parser.set_language(JAVASCRIPT)
    tree = parser.parse(b"<div a={1} b={2} />")
    element = tree.root_node.children[0].children[0]
    for index, expected_field in [(0, None), (1, "name")]:
        self.assertEqual(element.field_name_for_child(index), expected_field)
def test_children_by_field_name(self):
    """Both jsx attributes are returned for the 'attribute' field."""
    parser = Parser()
    parser.set_language(JAVASCRIPT)
    tree = parser.parse(b"<div a={1} b={2} />")
    element = tree.root_node.children[0].children[0]
    attribute_types = [
        node.type for node in element.children_by_field_name("attribute")
    ]
    self.assertEqual(attribute_types, ["jsx_attribute", "jsx_attribute"])
class Code_Parser():
    """Wraps a tree-sitter parser and converts source code to and from flat
    action sequences, where each closed node is terminated by a <REDUCE>
    marker."""

    def __init__(self, grammar, language="python",
                 parser_library_path='src/tree-sitter/tree-sitter-python', **kwargs):
        # Compile the vendored grammar into a shared library, then load it.
        Language.build_library('/build/my-languages.so', [parser_library_path])
        LANGUAGE = Language('/build/my-languages.so', language)
        self.grammar = grammar
        self.TS_parser = Parser()
        self.TS_parser.set_language(LANGUAGE)
        self.node_builder = NodeBuilder(self.grammar)

    def code_to_sequence(self, code_str):
        """Parse *code_str* and flatten the resulting tree into a sequence."""
        root = self.TS_parser.parse(bytes(code_str, "utf8")).root_node
        return self.TSTree_to_sequence(root, code_str)

    def TSTree_to_sequence(self, TSNode, code_str):
        """Depth-first flattening: node type, then children, then <REDUCE>."""
        sequence = [TSNode.type]
        if TSNode.type == "string":
            # Strings are expanded into explicit start/content/end tokens;
            # the surrounding quotes are stripped from the content.
            inner = sub_str_from_coords(
                code_str, TSNode.start_point, TSNode.end_point)[1:-1]
            sequence += ["_string_start", '"', "<REDUCE>"]
            sequence += ["_string_content", inner, "<REDUCE>"]
            sequence += ["_string_end", '"', "<REDUCE>"]
        elif not TSNode.children:
            # Leaf: emit its text only when it differs from the node type
            # (keywords/punctuation have type == text).
            text = sub_str_from_coords(
                code_str, TSNode.start_point, TSNode.end_point)
            if TSNode.type != text:
                sequence.append(text)
        else:
            for child in TSNode.children:
                sequence += self.TSTree_to_sequence(child, code_str)
        sequence.append("<REDUCE>")
        return sequence

    def is_valid_sequence(self, sequence):
        """True iff the sequence is rooted at 'module' and replays cleanly."""
        if sequence[0] != "module":
            return False
        partial_tree = PartialTree(sequence[0], self.node_builder)
        try:
            for action in sequence[1:]:
                partial_tree.add_action(action)
        except Exception:
            return False
        return True

    def sequence_to_partial_tree(self, sequence):
        """Replay *sequence* into a PartialTree; on error the partially built
        tree is still returned after the traceback is printed."""
        partial_tree = PartialTree(sequence[0], self.node_builder)
        try:
            for action in sequence[1:]:
                partial_tree.add_action(action)
        except Exception:
            print("ERROR!")
            traceback.print_exc()
            print("-------")
        return partial_tree
def get_parser(language):
    """Return a cached tree-sitter parser for *language* (aliases resolved)."""
    language = LANGUAGE_ALIASES.get(language, language)
    if language not in PARSERS:
        # Build and memoize a parser on first use.
        parser = Parser()
        parser.set_language(Language(tree_sitter_build, language))
        PARSERS[language] = parser
    return PARSERS[language]
def get_parser(so_path: str = None) -> Parser:
    """Build a Java parser from the shared library at *so_path*
    (defaults to JAVA_SO_PATH)."""
    library = JAVA_SO_PATH if so_path is None else so_path
    parser = Parser()
    parser.set_language(Language(library, 'java'))
    return parser
def __init__(self) -> None:
    """Compile all vendored tree-sitter grammars into one library and build
    one parser per language in TREE_SITTER_LANGS.

    Assumes the vendor/ git submodules are checked out.
    """
    vendor_dirs = ["vendor/tree-sitter-%s" % lang for lang in TREE_SITTER_LANGS]
    Language.build_library(BUILD_PATH, vendor_dirs)
    self.parsers = {}
    for lang in TREE_SITTER_LANGS:
        parser = Parser()
        # BUG FIX: the original loaded the "haskell" grammar for every entry;
        # each parser must load its own language.
        parser.set_language(Language(BUILD_PATH, lang))
        self.parsers[lang] = parser
def test_node_text(self):
    """Node.text mirrors the parsed source, is invalidated by edits, and is
    governed by the keep_text parse flag."""
    parser = Parser()
    parser.set_language(PYTHON)
    source = b"[0, [1, 2, 3]]"
    tree = parser.parse(source)
    self.assertEqual(tree.text, source)

    root_node = tree.root_node
    self.assertEqual(root_node.text, source)
    exp_stmt_node = root_node.children[0]
    self.assertEqual(exp_stmt_node.text, source)
    list_node = exp_stmt_node.children[0]
    self.assertEqual(list_node.text, source)

    # Each direct child of the outer list covers its own slice of the source.
    expected_child_texts = [b'[', b'0', b',', b'[1, 2, 3]', b']']
    for child, expected in zip(list_node.children, expected_child_texts):
        self.assertEqual(child.text, expected)

    # Editing the tree invalidates the cached text.
    edit_offset = len(b"[0, [")
    tree.edit(
        start_byte=edit_offset,
        old_end_byte=edit_offset,
        new_end_byte=edit_offset + 2,
        start_point=(0, edit_offset),
        old_end_point=(0, edit_offset),
        new_end_point=(0, edit_offset + 2),
    )
    self.assertEqual(tree.text, None)
    self.assertEqual(tree.root_node.text, None)

    # keep_text=False disables text retention entirely.
    tree_without_text = parser.parse(source, keep_text=False)
    self.assertIsNone(tree_without_text.text)
    self.assertIsNone(tree_without_text.root_node.text)

    # keep_text=True retains it.
    tree_with_text = parser.parse(source, keep_text=True)
    self.assertEqual(tree_with_text.text, source)
    self.assertEqual(tree_with_text.root_node.text, source)
def test_tree_cursor_without_tree(self):
    """A cursor keeps its nodes usable after the Tree leaves scope."""
    parser = Parser()
    parser.set_language(PYTHON)

    def parse():
        # The Tree is local here; only the cursor escapes.
        return parser.parse(b"def foo():\n bar()").walk()

    cursor = parse()
    self.assertIs(cursor.node, cursor.node)
    for child in cursor.node.children:
        self.assertIsNotNone(child.is_named)
def test_text_predicates_errors(self):
    """Malformed text predicates must raise RuntimeError at query build time.

    Covers wrong arity and literal-vs-capture confusion for both #eq? and
    #match?, plus #match? whose pattern argument is a capture instead of a
    regex string.
    """
    parser = Parser()
    parser.set_language(JAVASCRIPT)
    # #eq? with three arguments (must be exactly two).
    with self.assertRaises(RuntimeError):
        JAVASCRIPT.query("""
        (
            (function_declaration
                name: (identifier) @function-name
            )
            (#eq? @function-name @function-name fun1)
        )
        """)
    # #eq? whose first argument is a literal, not a capture.
    with self.assertRaises(RuntimeError):
        JAVASCRIPT.query("""
        (
            (function_declaration
                name: (identifier) @function-name
            )
            (#eq? fun1 @function-name)
        )
        """)
    # #match? with three arguments.
    with self.assertRaises(RuntimeError):
        JAVASCRIPT.query("""
        (
            (function_declaration
                name: (identifier) @function-name
            )
            (#match? @function-name @function-name fun1)
        )
        """)
    # #match? whose first argument is a literal, not a capture.
    with self.assertRaises(RuntimeError):
        JAVASCRIPT.query("""
        (
            (function_declaration
                name: (identifier) @function-name
            )
            (#match? fun1 @function-name)
        )
        """)
    # #match? whose pattern is a capture instead of a regex string.
    with self.assertRaises(RuntimeError):
        JAVASCRIPT.query("""
        (
            (function_declaration
                name: (identifier) @function-name
            )
            (#match? @function-name @function-name)
        )
        """)
def corpus_syntax_match(references, candidates, lang):
    """Corpus-level syntax (AST sub-tree) match score.

    For every candidate/reference pair, counts how many reference sub-tree
    s-expressions also occur in the candidate's tree, and returns
    matched / total over the whole corpus.
    """
    JAVA_LANGUAGE = Language('parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(JAVA_LANGUAGE)

    def get_all_sub_trees(root_node):
        # Iteratively collect (sexp, depth) for every internal node.
        # Hoisted out of the loop: the original redefined it per iteration.
        node_stack = [[root_node, 1]]
        sub_tree_sexp_list = []
        while node_stack:
            cur_node, cur_depth = node_stack.pop()
            sub_tree_sexp_list.append([cur_node.sexp(), cur_depth])
            for child_node in cur_node.children:
                if len(child_node.children) != 0:
                    node_stack.append([child_node, cur_depth + 1])
        return sub_tree_sexp_list

    match_count = 0
    total_count = 0
    for i, candidate in enumerate(candidates):
        references_sample = references[i]
        for reference in references_sample:
            # NOTE(review): comment stripping is hard-coded to 'java' even
            # though *lang* is a parameter — kept as-is to preserve behavior,
            # but this looks wrong for other languages; confirm upstream.
            try:
                candidate = remove_comments_and_docstrings(candidate, 'java')
            except Exception:
                pass
            try:
                reference = remove_comments_and_docstrings(reference, 'java')
            except Exception:
                pass
            candidate_tree = parser.parse(bytes(candidate, 'utf8')).root_node
            reference_tree = parser.parse(bytes(reference, 'utf8')).root_node
            # Set membership instead of a list scan: O(1) per lookup.
            cand_sexps = {x[0] for x in get_all_sub_trees(candidate_tree)}
            ref_sexps = get_all_sub_trees(reference_tree)
            for sub_tree, depth in ref_sexps:
                if sub_tree in cand_sexps:
                    match_count += 1
            total_count += len(ref_sexps)
    # BUG FIX: guard the division; mirrors corpus_dataflow_match's
    # degenerate-case handling instead of raising ZeroDivisionError.
    if total_count == 0:
        return 0
    score = match_count / total_count
    return score
def get_parser(lang: str) -> Parser:
    """
    Initialize parser for a specific language, memoized in PARSERS.

    :param lang: language to use.
    :return: parser.
    """
    global PARSERS
    try:
        # EAFP: reuse the cached parser when one exists.
        return PARSERS[lang]
    except KeyError:
        parser = Parser()
        parser.set_language(Language(get_tree_sitter_so(), lang))
        PARSERS[lang] = parser
        return parser
def test_set_language(self):
    """One Parser instance can be re-targeted across languages."""
    cases = [
        (PYTHON, b"def foo():\n bar()",
         "(module (function_definition (identifier) (parameters) (expression_statement (call (identifier) (argument_list)))))"),
        (JAVASCRIPT, b"function foo() {\n bar();\n}",
         "(program (function (identifier) (formal_parameters) (statement_block (expression_statement (call_expression (identifier) (arguments))))))"),
    ]
    parser = Parser()
    for language, source, expected_sexp in cases:
        parser.set_language(language)
        tree = parser.parse(source)
        self.assertEqual(tree.root_node.sexp(), expected_sexp)
def main(file):
    """Parse *file* with tree-sitter-java and print a GumTree-style XML AST."""
    this_directory = os.path.dirname(__file__)
    library_path = os.path.join(this_directory, 'build/my-languages.so')
    # Compile the Java grammar into a loadable shared library.
    Language.build_library(
        library_path,
        [os.path.join(this_directory, 'vendor/tree-sitter-java')],
    )
    java_lang = Language(library_path, 'java')

    parser = Parser()
    parser.set_language(java_lang)
    tree_sitter_tree = parser.parse(read_file(file))
    gumtree_ast = to_gumtree_node(tree_sitter_tree.root_node)

    # XML layout: <root><context/>…AST…</root>. The empty <context> tag
    # matches the reference GumTree output, even though it carries no data.
    root_node = doc.createElement('root')
    context_node = doc.createElement('context')
    doc.appendChild(root_node)
    root_node.appendChild(context_node)
    root_node.appendChild(gumtree_ast)

    # Recursively attach the children of the parse tree under gumtree_ast.
    process_node(tree_sitter_tree.root_node, gumtree_ast)
    xml = doc.toprettyxml()
    print(xml)
def add_lcom5(df, col):
    """Append a 'class_lcom5' column to *df*.

    Each row's contents are parsed with the tree-sitter build matching the
    file extension and scored with calculate_lcom5. (*col* is unused; kept
    for interface compatibility.)
    """
    lang_builds = create_parser_builds()
    parser = Parser()
    class_lcom5 = []
    for i in range(len(df)):
        name = df["name"][i]
        ext = name.split('.')[-1]
        parser.set_language(lang_builds[ext])
        # Encode once: the original encoded the contents twice per row and
        # left the 'enc' local unused.
        content_bytes = bytes(df["contents"][i], df["encoding"][i])
        tree = parser.parse(content_bytes)
        class_lcom5.append(calculate_lcom5(tree, ext, content_bytes, name))
    df["class_lcom5"] = class_lcom5
    return df
def test_multibyte_characters(self):
    """Byte offsets stay correct when the source contains multi-byte UTF-8."""
    parser = Parser()
    parser.set_language(JAVASCRIPT)
    source_code = bytes("'😎' && '🐍'", "utf8")
    tree = parser.parse(source_code)

    binary_node = tree.root_node.children[0].children[0]
    self.assertEqual(binary_node.type, "binary_expression")

    snake_node = binary_node.children[2]
    self.assertEqual(snake_node.type, "string")
    captured = source_code[snake_node.start_byte:snake_node.end_byte]
    self.assertEqual(captured.decode('utf8'), "'🐍'")
def jobs(repo_path, args):
    """Mine one repository for (commit message, code diff) training pairs.

    Traverses non-merge commits on master that touch files of args.lang,
    cleans and tokenizes each message, extracts added/deleted token streams
    via get_code_diff, and appends accepted samples to args.output_file
    (jsonlines). Stops after args.max_commit_number stored commits.

    Returns:
        (n_file_per_commit, add_tokens_per_del_tokens): a Counter of files
        changed per stored commit, and the list of added/deleted token ratios.
    """
    PARSER = Parser()
    PARSER.set_language(Language(args.tree_sitter, args.lang))
    n_file_per_commit = Counter()
    add_tokens_per_del_tokens = []
    if os.path.exists(repo_path):
        # Drop .gitmodules so submodules are not traversed during mining.
        submodule = os.path.join(repo_path, '.gitmodules')
        if os.path.exists(submodule):
            os.remove(submodule)
        try:
            n_stored_commit = 0
            for commit in RepositoryMining(
                repo_path,
                only_no_merge=True,
                only_in_branch='master',
                only_modifications_with_file_types=language_ext[args.lang]
            ).traverse_commits():
                if n_stored_commit > args.max_commit_number:
                    break
                cleaned_message = message_cleaner(commit.msg)
                if not cleaned_message:
                    continue
                commit_tokens = tokenize_docstring_from_string(cleaned_message)
                if len(commit_tokens) < args.min_target_length:
                    continue
                addeds, deleteds, n_files = get_code_diff(commit, PARSER, args)
                # Keep commits touching at least one and at most
                # args.max_duplicate files.
                if 1 <= n_files and n_files <= args.max_duplicate:
                    with jsonlines.open(args.output_file, mode="a") as writer:
                        writer.write(
                            {
                                "commit_tokens": commit_tokens,
                                "add_tokens": addeds[0],
                                "del_tokens": deleteds[0],
                            }
                        )
                    add_tokens_per_del_tokens.append(
                        len(addeds[0]) / len(deleteds[0])
                    )
                    n_file_per_commit.update({n_files})
                    n_stored_commit += 1
        except:
            # NOTE(review): bare except silently abandons the whole repository
            # on any error (including ZeroDivisionError when deleteds[0] is
            # empty) — appears to be deliberate best-effort mining, but worth
            # logging upstream; confirm intent.
            pass
    return (n_file_per_commit, add_tokens_per_del_tokens)
def main(opt):
    """Parse opt.filename[0] with the tree-sitter language opt.language[0]
    and print the per-node-type reports produced by print_subtree.

    An optional node_types_<lang>.csv whitelist (one node type per line,
    case-insensitive) restricts which node types are reported.
    """
    parser = Parser()
    lang = Languages.get(opt.language[0])
    parser.set_language(lang)

    lang_node_types_filename = "node_types_{}.csv".format(opt.language[0])
    selected_node_types = {}
    if exists(lang_node_types_filename):
        # BUG FIX: the original leaked the file handle via open(...).read().
        with open(lang_node_types_filename, "r") as types_file:
            for lang_node_type in types_file.read().splitlines():
                selected_node_types[lang_node_type.lower()] = 1

    # BUG FIX: same leak for the source file; read it under a context manager.
    with open(opt.filename[0], "rb") as source_file:
        data = source_file.read()
    tree = parser.parse(data)

    reports = {}
    s = print_subtree(data, tree.root_node, reports, selected_node_types)
    for report in reports.values():
        print(report)
def get_parser(lang: str, so_path: str = None) -> Parser:
    """Create a fresh parser for *lang* from the shared library at *so_path*
    (falls back to SO_PATH). No caching is performed."""
    library = SO_PATH if so_path is None else so_path
    parser = Parser()
    parser.set_language(Language(library, lang))
    return parser
def test_byte_range_captures(self):
    """Captures can be restricted to a byte range of the source."""
    parser = Parser()
    parser.set_language(PYTHON)
    source = b"def foo():\n bar()\ndef baz():\n quux()\n"
    tree = parser.parse(source)
    query = PYTHON.query(
        """
        (function_definition name: (identifier) @func-def)
        (call function: (identifier) @func-call)
        """
    )
    # Bytes 10-20 cover only the first call site, not the definitions.
    captures = query.captures(tree.root_node, start_byte=10, end_byte=20)
    node, capture_name = captures[0]
    self.assertEqual(node.start_point, (1, 2))
    self.assertEqual(node.end_point, (1, 5))
    self.assertEqual(capture_name, "func-call")
def codebleu(reference, candidate, weights=(0.1, 0.1, 0.4, 0.4)):
    """Weighted combination of BLEU, weighted BLEU, AST match and
    data-flow match between *reference* and *candidate*.

    Metrics returning -1 ("not computable") are excluded and the weight
    total is renormalized over the remaining metrics.

    BUG FIXES vs. the original:
    * the default weights were a mutable list shared across calls;
    * when every metric returned -1 the normalizer was 0 and the final
      division raised ZeroDivisionError — now degenerates to 0.0.
    """
    parser = Parser()
    PY_LANGUAGE = Language('./my-languages.so', 'python')
    parser.set_language(PY_LANGUAGE)
    lattice = TypeLatticeGenerator('typingRules.json')
    scores = (
        pure_bleu(reference, candidate),
        weighted_bleu(reference, candidate),
        ast_match(reference, candidate, parser),
        dfg_match(reference, candidate, lattice),
    )
    final_score = 0.0
    norm = 0.0
    for weight, score in zip(weights, scores):
        if score != -1:  # -1 marks "metric not computable"; skip it
            final_score += score * weight
            norm += weight
    if norm == 0.0:
        return 0.0
    return final_score / norm
class TreeSitterLauncher:
    """Parses a source file with tree-sitter and exposes it as a dict tree."""

    _parser: Parser

    def __init__(self, language, library_path):
        grammar = Language(library_path, language)
        self._parser = Parser()
        self._parser.set_language(grammar)

    def _get_code_bytes(self, filepath: str) -> bytes:
        """Read *filepath* and return its contents as UTF-8 bytes."""
        # BUG FIX: the original opened the file without ever closing it.
        with open(filepath, "r") as file:
            return bytes(file.read(), "utf-8")

    def parse_file(self, filepath: str) -> TreeAsDict:
        """Parse *filepath* and return the tree rendered as a nested dict."""
        code_bytes = self._get_code_bytes(filepath)
        tree_sitter_tree = self._parser.parse(code_bytes)
        cursor = tree_sitter_tree.walk()
        return TreeBuilder(cursor, code_bytes).get_tree_as_dict()
def test_walk(self):
    """Step a TreeCursor through a tiny function definition, checking node
    identity caching, byte/point spans, and field names after each move.

    The assertions are strictly order-dependent: each goto_* call mutates
    the cursor state that the following assertions inspect.
    """
    parser = Parser()
    parser.set_language(PYTHON)
    tree = parser.parse(b"def foo():\n bar()")
    cursor = tree.walk()
    # Node always returns the same instance
    self.assertIs(cursor.node, cursor.node)
    # Cursor starts at the root (module) node.
    self.assertEqual(cursor.node.type, "module")
    self.assertEqual(cursor.node.start_byte, 0)
    self.assertEqual(cursor.node.end_byte, 18)
    self.assertEqual(cursor.node.start_point, (0, 0))
    self.assertEqual(cursor.node.end_point, (1, 7))
    self.assertEqual(cursor.current_field_name(), None)
    # Descend to the function_definition (same span as the module).
    self.assertTrue(cursor.goto_first_child())
    self.assertEqual(cursor.node.type, "function_definition")
    self.assertEqual(cursor.node.start_byte, 0)
    self.assertEqual(cursor.node.end_byte, 18)
    self.assertEqual(cursor.node.start_point, (0, 0))
    self.assertEqual(cursor.node.end_point, (1, 7))
    self.assertEqual(cursor.current_field_name(), None)
    # Descend to the anonymous "def" keyword token.
    self.assertTrue(cursor.goto_first_child())
    self.assertEqual(cursor.node.type, "def")
    self.assertEqual(cursor.node.is_named, False)
    self.assertEqual(cursor.node.sexp(), '("def")')
    self.assertEqual(cursor.current_field_name(), None)
    def_node = cursor.node
    # Node remains cached after a failure to move
    self.assertFalse(cursor.goto_first_child())
    self.assertIs(cursor.node, def_node)
    # Sibling: the function name, exposed through the "name" field.
    self.assertTrue(cursor.goto_next_sibling())
    self.assertEqual(cursor.node.type, "identifier")
    self.assertEqual(cursor.node.is_named, True)
    self.assertEqual(cursor.current_field_name(), "name")
    # Leaf: no children to descend into.
    self.assertFalse(cursor.goto_first_child())
    # Next sibling: the (empty) parameter list, field "parameters".
    self.assertTrue(cursor.goto_next_sibling())
    self.assertEqual(cursor.node.type, "parameters")
    self.assertEqual(cursor.node.is_named, True)
    self.assertEqual(cursor.current_field_name(), "parameters")
class TSParser:
    """Thin convenience wrapper around a tree-sitter parser for one language."""

    def __init__(self, lang: str):
        self.lang = lang
        self.parser = TSBaseParser()
        self.tsLang = Language(LIBRARY_PATH, lang)
        self.parser.set_language(self.tsLang)

    def parse(self, code: str) -> Node:
        """Root node of the parsed *code*."""
        return self(code).root_node

    def sexp(self, code: str) -> str:
        """S-expression rendering of the parse tree of *code*."""
        return self.parse(code).sexp()

    def query(self, query: str, code: str) -> dict[str, str]:
        """Run *query* over *code*; map each capture name to its source text."""
        captures = self.tsLang.query(query).captures(self.parse(code))
        return {name: extract(node, code) for node, name in captures}

    def __call__(self, value: str) -> Tree:
        """Parse raw source text into a Tree."""
        return self.parser.parse(bytes(value, "utf8"))
def test_children(self):
    """Root spans and child structure/named-ness for a tiny function def."""
    parser = Parser()
    parser.set_language(PYTHON)
    tree = parser.parse(b"def foo():\n bar()")

    root_node = tree.root_node
    self.assertEqual(root_node.type, "module")
    self.assertEqual(root_node.start_byte, 0)
    self.assertEqual(root_node.end_byte, 18)
    self.assertEqual(root_node.start_point, (0, 0))
    self.assertEqual(root_node.end_point, (1, 7))

    # The children list object is cached and reused.
    self.assertIs(root_node.children, root_node.children)

    fn_node = root_node.children[0]
    self.assertEqual(fn_node.type, "function_definition")
    self.assertEqual(fn_node.start_byte, 0)
    self.assertEqual(fn_node.end_byte, 18)
    self.assertEqual(fn_node.start_point, (0, 0))
    self.assertEqual(fn_node.end_point, (1, 7))

    # (type, is_named) for each direct child of the function definition.
    expected_children = [
        ("def", False),
        ("identifier", True),
        ("parameters", True),
        (":", False),
        ("block", True),
    ]
    for child, (node_type, named) in zip(fn_node.children, expected_children):
        self.assertEqual(child.type, node_type)
        self.assertEqual(child.is_named, named)

    # The identifier is a leaf.
    self.assertEqual(len(fn_node.children[1].children), 0)
def test_multibyte_characters_via_read_callback(self):
    """Parsing via a byte-at-a-time read callback handles multi-byte UTF-8."""
    parser = Parser()
    parser.set_language(JAVASCRIPT)
    source_code = bytes("'😎' && '🐍'", "utf8")

    def read(byte_position, point):
        # Feed the parser exactly one byte per call.
        return source_code[byte_position:byte_position + 1]

    tree = parser.parse(read)
    binary_node = tree.root_node.children[0].children[0]
    self.assertEqual(binary_node.type, "binary_expression")
    snake_node = binary_node.children[2]
    self.assertEqual(snake_node.type, "string")
    captured = source_code[snake_node.start_byte:snake_node.end_byte]
    self.assertEqual(captured.decode("utf8"), "'🐍'")
def test_tree(self):
    """Nodes and child lists stay usable after the Tree leaves scope."""
    code = b"def foo():\n bar()\n\ndef foo():\n bar()"
    parser = Parser()
    parser.set_language(PYTHON)

    def parse_root(bytes_):
        # Only the root node escapes; the Tree itself is local.
        return parser.parse(bytes_).root_node

    for item in parse_root(code).children:
        self.assertIsNotNone(item.is_named)

    def parse_root_children(bytes_):
        # Only the children list escapes.
        return parser.parse(bytes_).root_node.children

    for item in parse_root_children(code):
        self.assertIsNotNone(item.is_named)
def test_edit(self):
    """Insert two characters into the parameter list, check has_changes
    propagation, then reparse incrementally and verify the new sexp.

    Order-dependent: tree.edit mutates node positions in place before the
    has_changes assertions run.
    """
    parser = Parser()
    parser.set_language(PYTHON)
    tree = parser.parse(b"def foo():\n bar()")
    # Simulate typing "ab" just inside the parentheses of foo().
    edit_offset = len(b"def foo(")
    tree.edit(
        start_byte=edit_offset,
        old_end_byte=edit_offset,
        new_end_byte=edit_offset + 2,
        start_point=(0, edit_offset),
        old_end_point=(0, edit_offset),
        new_end_point=(0, edit_offset + 2),
    )
    fn_node = tree.root_node.children[0]
    self.assertEqual(fn_node.type, "function_definition")
    # The enclosing definition is marked changed; untouched siblings of the
    # edited region are not.
    self.assertTrue(fn_node.has_changes)
    self.assertFalse(fn_node.children[0].has_changes)
    self.assertFalse(fn_node.children[1].has_changes)
    self.assertFalse(fn_node.children[3].has_changes)
    params_node = fn_node.children[2]
    self.assertEqual(params_node.type, "parameters")
    self.assertTrue(params_node.has_changes)
    # The edit shifted/stretched the parameters span to cover "(ab)".
    self.assertEqual(params_node.start_point, (0, edit_offset - 1))
    self.assertEqual(params_node.end_point, (0, edit_offset + 3))
    # Incremental reparse with the edited old tree as context.
    new_tree = parser.parse(b"def foo(ab):\n bar()", tree)
    self.assertEqual(
        new_tree.root_node.sexp(),
        trim(
            """(module (function_definition name: (identifier) parameters: (parameters (identifier)) body: (block (expression_statement (call function: (identifier) arguments: (argument_list))))))"""
        ),
    )
def __init__(self, code, language='python', tree_style='SPT', path_style='L2L'):
    """Parse *code* and set up the node/path bookkeeping.

    tree_style: AST | SPT | HST | HPT; path_style: L2L | UD | U2D.
    """
    self.tree_style = tree_style
    self.path_style = path_style

    # Pre-built tree-sitter library containing the CSN grammars
    # ('go', 'java', 'javascript', 'php', 'python', 'ruby').
    csn_so = '../build/csn.so'
    parser = Parser()
    parser.set_language(Language(csn_so, language))
    tree = parser.parse(code.encode())

    code_lines = code.split('\n')
    self.root, self.terminals, self.num_eldest = self.traverse(tree, code_lines)

    # Per-category node accumulators, all starting empty.
    for attr in ('terminal_nodes', 'nonterminal_nodes',
                 'leafpath_terminal_nodes', 'leafpath_nonterminal_nodes',
                 'rootpath_terminal_nodes', 'rootpath_nonterminal_nodes'):
        setattr(self, attr, list())

    self.debug = False
    if self.debug:
        print(f'{"@" * 9}code\n{code}')
        print(f'{"@" * 9}sexp\n{tree.root_node.sexp()}')
def corpus_dataflow_match(references, candidates, lang):
    """Corpus-level data-flow match score between candidates and references."""
    ts_language = Language('parser/my-languages.so', lang)
    ts_parser = Parser()
    ts_parser.set_language(ts_language)
    # get_data_flow expects a [parser, dfg-extractor] pair.
    parser = [ts_parser, dfg_function[lang]]

    match_count = 0
    total_count = 0
    for i, candidate in enumerate(candidates):
        references_sample = references[i]
        for reference in references_sample:
            # Comment stripping is best-effort; keep the raw text on failure.
            try:
                candidate = remove_comments_and_docstrings(candidate, 'java')
            except:
                pass
            try:
                reference = remove_comments_and_docstrings(reference, 'java')
            except:
                pass
            normalized_cand_dfg = normalize_dataflow(get_data_flow(candidate, parser))
            normalized_ref_dfg = normalize_dataflow(get_data_flow(reference, parser))
            if len(normalized_ref_dfg) > 0:
                total_count += len(normalized_ref_dfg)
                for dataflow in normalized_ref_dfg:
                    if dataflow in normalized_cand_dfg:
                        match_count += 1
                        # Each candidate edge may match at most once.
                        normalized_cand_dfg.remove(dataflow)
    if total_count == 0:
        print(
            "WARNING: There is no reference data-flows extracted from the whole corpus, and the data-flow match score degenerates to 0. Please consider ignoring this score."
        )
        return 0
    return match_count / total_count