def test_captures(self):
        """Query captures return (node, capture-name) pairs in source order."""
        parser = Parser()
        parser.set_language(PYTHON)
        source = b"def foo():\n  bar()\ndef baz():\n  quux()\n"
        tree = parser.parse(source)
        query = PYTHON.query("""
            (function_definition name: (identifier) @func-def)
            (call function: (identifier) @func-call)
            """)

        # Fix: the query was executed four times in a row; once is enough.
        captures = query.captures(tree.root_node)

        self.assertEqual(captures[0][0].start_point, (0, 4))
        self.assertEqual(captures[0][0].end_point, (0, 7))
        self.assertEqual(captures[0][1], "func-def")

        self.assertEqual(captures[1][0].start_point, (1, 2))
        self.assertEqual(captures[1][0].end_point, (1, 5))
        self.assertEqual(captures[1][1], "func-call")

        self.assertEqual(captures[2][0].start_point, (2, 4))
        self.assertEqual(captures[2][0].end_point, (2, 7))
        self.assertEqual(captures[2][1], "func-def")

        self.assertEqual(captures[3][0].start_point, (3, 2))
        self.assertEqual(captures[3][0].end_point, (3, 6))
        self.assertEqual(captures[3][1], "func-call")
Esempio n. 2
0
class TreeSitter(object):
    """Parse an editor buffer with tree-sitter and flatten it into a list of
    {node type, token modifiers} records for semantic-token queries."""

    def __init__(self, language_type, encoding='utf-8'):
        """Create the parser for *language_type* and parse an empty buffer.

        :param language_type: language name passed to ``Language(LIB_BIN, ...)``.
        :param encoding: encoding used to turn buffer lines into bytes.
        """
        self.language_type = language_type
        self.encoding = encoding
        self.parser = Parser()
        self.parser.set_language(Language(LIB_BIN, self.language_type))
        # Parse an empty buffer so `self.tree` always exists.
        self.UpdateBuffer([""])
        self._res = []

    def DFS(self, node, tokenModifiers: list):
        """Walk *node* depth-first, recording every child together with the
        modifier chain (ancestor node types) it was reached through."""
        for item in node.children:
            if item.children:
                # Copy before extending so siblings do not share the child's
                # modifier chain.  (Removed the dead unconditional
                # `temp = tokenModifiers` assignment from the original.)
                child_modifiers = copy.copy(tokenModifiers)
                child_modifiers.append(item.type)
                self.DFS(item, child_modifiers)
            self._res.append({
                'node': item.type,
                'tokenModifiers': tokenModifiers
            })

    def GetSematicToken(self):
        """Return the flattened token records for the current tree.
        (Name kept for API compatibility; 'Sematic' is a historical typo.)"""
        self._res = []
        self.DFS(self.tree.root_node, [])
        return self._res

    def UpdateBuffer(self, content_list):
        """Re-parse the buffer given as a list of text lines."""
        self.tree = self.parser.parse(
            bytes("\n".join(content_list), self.encoding))
Esempio n. 3
0
    def __init__(
            self,
            langs: List[str],
            added_nodes: Dict[str, Dict[str, str]],
            skip_node_types: Dict[str, List[str]],
            vendors_path: Path = Path("./vendor"),
    ):
        """Build one combined tree-sitter library for *langs*.

        :param langs: languages whose ``tree-sitter-<lang>`` vendor checkouts
            live under *vendors_path*.
        :param added_nodes: per-language prefix/suffix text; missing languages
            get ``{"prefix": "", "suffix": ""}``.
        :param skip_node_types: per-language node types to ignore; missing
            languages get an empty list.
        """
        super(TreeSitterParser, self).__init__()

        # Fix: copy the caller's dicts before filling in defaults so the
        # constructor no longer mutates its arguments in place.
        self.added_nodes = dict(added_nodes)
        self.skip_node_types = dict(skip_node_types)

        vendors = []
        for lang in langs:
            vendors.append(vendors_path / f"tree-sitter-{lang}")
            self.added_nodes.setdefault(lang, {"prefix": "", "suffix": ""})
            self.skip_node_types.setdefault(lang, [])

        Language.build_library(
            # Store the library in the `build` directory
            "build/my-languages.so",
            # Include one or more languages
            vendors,
        )

        self.parser = Parser()
Esempio n. 4
0
File: ts.py Progetto: forest520/csn
    def __init__(self,
                 code,
                 language='python',
                 tree_style='AST',
                 path_style='U2D',
                 debug=True):
        """Parse *code* and build the internal tree/terminal representation.

        :param code: source text to parse.
        :param language: grammar name inside the pre-built csn.so bundle
            (go, java, javascript, php, python, ruby).
        :param tree_style: AST | SPT | HST | HPT.
        :param path_style: L2L | UD | U2D.
        :param debug: print the code and its s-expression after parsing.
            Generalized from the original hard-coded ``self.debug = True``;
            the default preserves the original behavior.
        """
        self.tree_style = tree_style
        self.path_style = path_style
        # The shared library is pre-compiled with Language.build_library from
        # the vendor/tree-sitter-* checkouts listed above.
        csn_so = 'scripts/build/csn.so'
        parser = Parser()
        parser.set_language(Language(csn_so, language))
        tree = parser.parse(code.encode())
        code_lines = code.split('\n')
        self.root, self.terminals = self.traverse(tree, code_lines)

        self.debug = debug
        if self.debug:
            print(f'{language}{"@" * 9}code\n{code}')
            print(f'{language}{"@" * 9}sexp\n{tree.root_node.sexp()}')
Esempio n. 5
0
    def test_read_callback(self):
        """Parser.parse accepts a chunk-reading callable in place of bytes."""
        lines = ["def foo():\n", "  bar()"]
        parser = Parser()
        parser.set_language(PYTHON)

        def read(byte_offset, point):
            # Serve the remainder of the requested line, or None at EOF.
            row, col = point
            if row < len(lines) and col < len(lines[row]):
                return lines[row][col:].encode("utf8")
            return None

        tree = parser.parse(read)
        self.assertEqual(
            tree.root_node.sexp(),
            trim(
                """(module (function_definition
                name: (identifier)
                parameters: (parameters)
                body: (block (expression_statement (call
                    function: (identifier)
                    arguments: (argument_list))))))"""
            ),
        )
Esempio n. 6
0
    def run(self, tmp_dir, params):
        """Parse every file in params['files'] with the Verilog grammar.

        Returns (log, return-code, user-time, system-time, max-RSS).
        Sets ret=1 on any setup, open, or parse failure.
        """
        self.ret = 0
        self.log = ''

        try:
            lib = self.find_lib()
            lang = Language(lib, 'verilog')
            parser = Parser()
            parser.set_language(lang)
        except Exception as e:
            self.log += f'{e}\n'
            self.ret = 1
            # Fix: the original fell through to the loop with `parser`
            # undefined, so every file failed with a NameError.  Report the
            # setup failure and return immediately instead.
            usage = resource.getrusage(resource.RUSAGE_SELF)
            return (self.log, self.ret) + (
                usage.ru_utime, usage.ru_stime, usage.ru_maxrss)

        for src in params['files']:
            try:
                # Fix: use a context manager so the handle is always closed.
                with open(src, 'rb') as f:
                    data = f.read()
            except IOError:
                self.ret = 1
                self.log_error(src, '', '', 'failed to open file')
                continue

            try:
                tree = parser.parse(data)
                if self.walk(tree.root_node, src):
                    self.ret = 1
            except Exception as e:
                self.log_error(src, '', '', 'unknown error: ' + str(e))
                self.ret = 1
        usage = resource.getrusage(resource.RUSAGE_SELF)
        profiling_data = (usage.ru_utime, usage.ru_stime, usage.ru_maxrss)

        return (self.log, self.ret) + profiling_data
Esempio n. 7
0
def file_parse(path, name):
    """Parse the Python file at *path* and write its graph to
    result_dot/<name>.dot.

    :param path: path of the source file to parse.
    :param name: basename (without extension) for the output .dot file.
    :return: None.
    """
    Language.build_library('../build/my-languages.so', ['../tree-sitter-python'])
    PY_LANGUAGE = Language('../build/my-languages.so', 'python')
    parser = Parser()
    parser.set_language(PY_LANGUAGE)
    code = read_file(str(path))
    encoded_code = bytes(code, "utf8")
    tree = parser.parse(encoded_code)
    # Fix: removed the unused `cursor = tree.walk()` local.
    root_node = tree.root_node

    Graph = nx.DiGraph()
    # Fix: context manager closes the .dot file even if traversal raises.
    with open('result_dot/' + str(name) + '.dot', 'w') as f:
        f.write('digraph G{\n')
        f.write('rankdir="LR";\n')
        traverse(root_node, Graph, encoded_code, f)
        global import_lists
        write_together(f, import_lists)
        f.write("}")

    # write_in_dot(Graph)
    return None
Esempio n. 8
0
def corpus_dataflow_match(references, candidates, lang):
    """Corpus-level dataflow-match score: the fraction of reference dataflow
    edges that also appear in the corresponding candidate.

    :param references: list (per candidate) of lists of reference snippets.
    :param candidates: list of candidate snippets.
    :param lang: language key for the grammar and the DFG extractor.
    :return: match_count / total_count, or 0 when no reference dataflow exists.
    """
    LANGUAGE = Language('parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(LANGUAGE)
    parser = [parser, dfg_function[lang]]
    match_count = 0
    total_count = 0

    for i in range(len(candidates)):
        references_sample = references[i]
        candidate = candidates[i]
        for reference in references_sample:
            # Fix: the original stripped comments with a hard-coded 'java'
            # even when scoring other languages; use *lang* instead.
            # Stripping is best-effort: unparsable snippets are kept as-is.
            try:
                candidate = remove_comments_and_docstrings(candidate, lang)
            except Exception:
                pass
            try:
                reference = remove_comments_and_docstrings(reference, lang)
            except Exception:
                pass

            cand_dfg = get_data_flow(candidate, parser)
            ref_dfg = get_data_flow(reference, parser)

            normalized_cand_dfg = normalize_dataflow(cand_dfg)
            normalized_ref_dfg = normalize_dataflow(ref_dfg)

            if len(normalized_ref_dfg) > 0:
                total_count += len(normalized_ref_dfg)
                for dataflow in normalized_ref_dfg:
                    if dataflow in normalized_cand_dfg:
                        match_count += 1
                        # Consume the matched edge so duplicates are not
                        # counted twice.
                        normalized_cand_dfg.remove(dataflow)
    # Robustness: avoid ZeroDivisionError on an empty/trivial corpus.
    if total_count == 0:
        return 0
    score = match_count / total_count
    return score
Esempio n. 9
0
 def __init__(self, **options):
     """Create the tree-sitter parser and read formatter options.

     Supported option: ``escapeinside`` — stored as UTF-8 bytes when given,
     otherwise ``None``.
     """
     # NOTE(review): `self.language` must be provided by the subclass before
     # this __init__ runs — confirm against the class hierarchy.
     self.parser = Parser()
     self.parser.set_language(self.language)
     self.escape = options.get("escapeinside", None)
     if self.escape is not None:
         self.escape = bytes(self.escape, "utf8")
     super().__init__(**options)
Esempio n. 10
0
 def __init__(self, language_type, encoding='utf-8'):
     """Set up a tree-sitter parser for *language_type* with an empty buffer.

     :param language_type: language name passed to ``Language(LIB_BIN, ...)``.
     :param encoding: encoding used when the buffer is converted to bytes.
     """
     self.language_type = language_type
     self.encoding = encoding
     self.parser = Parser()
     self.parser.set_language(Language(LIB_BIN, self.language_type))
     # Parse an empty buffer so `self.tree` exists immediately.
     self.UpdateBuffer([""])
     # Result accumulator — presumably reset by each traversal; verify caller.
     self._res = []
    def test_child_by_field_id(self):
        """Field ids and field names both resolve named children."""
        parser = Parser()
        parser.set_language(PYTHON)
        tree = parser.parse(b"def foo():\n  bar()")
        root = tree.root_node
        fn = tree.root_node.children[0]

        # Unknown field names yield None; known ones yield integer ids.
        self.assertEqual(PYTHON.field_id_for_name("nameasdf"), None)
        name_id = PYTHON.field_id_for_name("name")
        alias_id = PYTHON.field_id_for_name("alias")
        self.assertIsInstance(alias_id, int)
        self.assertIsInstance(name_id, int)

        # The module node has neither field; the function node has "name".
        self.assertEqual(root.child_by_field_id(alias_id), None)
        self.assertEqual(root.child_by_field_id(name_id), None)
        self.assertEqual(fn.child_by_field_id(alias_id), None)
        self.assertEqual(
            fn.child_by_field_id(name_id).type, "identifier")

        # Wrong argument types are rejected.
        self.assertRaises(TypeError, root.child_by_field_id, "")
        self.assertRaises(TypeError, root.child_by_field_name, True)
        self.assertRaises(TypeError, root.child_by_field_name, 1)

        self.assertEqual(
            fn.child_by_field_name("name").type, "identifier")
        self.assertEqual(fn.child_by_field_name("asdfasdfname"), None)

        self.assertEqual(
            fn.child_by_field_name("name"),
            fn.child_by_field_name("name"),
        )
def checkAndGenerateAST(i, lstCFilesStep1, fopStep2, fopASTInfo,
                        fopStep4GraphAll, fopStep4GraphSimplify, fpLog, nlpObj,
                        offsetContext, isSaveGraph):
    """Parse one C++ file into an AST/graph JSON, write it out, and log the
    result.

    :return: (i, word count of the source, processing time in seconds);
        the metrics are 0 when an exception aborts processing.
    """
    fpMixFileCPP = lstCFilesStep1[i]
    nameOfFile = os.path.basename(fpMixFileCPP)
    nameWithoutExtension = nameOfFile.replace('.cpp', '')
    fpASTItem = fopASTInfo + nameOfFile.replace('.cpp', '_ast.txt')
    # Fix: initialize the returned metrics up front — an exception raised
    # before they were computed used to cause a NameError at the return.
    numWordItem = 0
    itemTimeProcess = 0
    isRunOK = False
    try:
        parser = Parser()
        parser.set_language(CPP_LANGUAGE)
        fpDotGraphAllText = fopStep4GraphAll + nameWithoutExtension + '_all.dot'
        fpDotGraphAllImage = fopStep4GraphAll + nameWithoutExtension + '_all.png'
        fpDotGraphSimplifyText = fopStep4GraphSimplify + nameWithoutExtension + '_simplify.dot'
        fpDotGraphSimplifyImage = fopStep4GraphSimplify + nameWithoutExtension + '_simplify.png'

        with open(fpMixFileCPP, 'r') as f1:
            strItem = f1.read()
        start_time = time.time()
        jsonObject = getJsonDict(fpMixFileCPP, fpDotGraphAllText,
                                 fpDotGraphAllImage, fpDotGraphSimplifyText,
                                 fpDotGraphSimplifyImage, parser, nlpObj,
                                 offsetContext, isSaveGraph)
        end_time = time.time()
        numWordItem = len(strItem.split())
        itemTimeProcess = (end_time - start_time)
        # Fix: `!= 'Error' or != 'None'` was always True, so the failure
        # branch below was unreachable; test both failure markers properly.
        if str(jsonObject) not in ('Error', 'None'):
            strContentAppend = '\n'.join(
                [nameOfFile, str(jsonObject), '\n\n\n'])
            with open(fpASTItem, 'w') as f1:
                f1.write(strContentAppend)
            shutil.copyfile(fpMixFileCPP, fopStep2 + nameOfFile)
            with open(fpLog, 'a') as f1:
                f1.write('{}\t{}\n'.format(nameOfFile, 'True'))
            isRunOK = True
        else:
            with open(fpLog, 'a') as f1:
                f1.write('{}\t{}\n'.format(nameOfFile, 'False'))
        print('OK {}/{} {}'.format(i, len(lstCFilesStep1), fpMixFileCPP))
    except Exception:
        print("Exception in user code:")
        print("-" * 60)
        traceback.print_exc(file=sys.stdout)
        print("-" * 60)
        print('Error: {} {}'.format(i, fpMixFileCPP))
        print('Error {}/{} {}'.format(i, len(lstCFilesStep1), fpMixFileCPP))
    return i, numWordItem, itemTimeProcess
Esempio n. 13
0
    def test_children_by_field_name(self):
        """children_by_field_name returns every child bound to that field."""
        parser = Parser()
        parser.set_language(JAVASCRIPT)
        tree = parser.parse(b"<div a={1} b={2} />")
        element = tree.root_node.children[0].children[0]

        attr_types = [
            node.type for node in element.children_by_field_name("attribute")
        ]
        self.assertEqual(attr_types, ["jsx_attribute", "jsx_attribute"])
Esempio n. 14
0
    def test_field_name_for_child(self):
        """field_name_for_child maps a child index to its field name or None."""
        parser = Parser()
        parser.set_language(JAVASCRIPT)
        tree = parser.parse(b"<div a={1} b={2} />")
        element = tree.root_node.children[0].children[0]

        # Child 0 is the "<" punctuation (no field); child 1 is the tag name.
        self.assertEqual(element.field_name_for_child(0), None)
        self.assertEqual(element.field_name_for_child(1), "name")
Esempio n. 15
0
class Code_Parser():
    """Convert source code to and from a linearized tree-sitter action
    sequence (node types interleaved with "<REDUCE>" markers)."""

    def __init__(self, grammar, language="python", parser_library_path='src/tree-sitter/tree-sitter-python', **kwargs):
        """Build and load the grammar, then create the parser and node builder.

        NOTE(review): the library is (re)built at the absolute path
        '/build/my-languages.so' on every construction — confirm intended.
        """
        Language.build_library('/build/my-languages.so', [parser_library_path])
        LANGUAGE = Language('/build/my-languages.so', language)

        self.grammar = grammar
        self.TS_parser = Parser()
        self.TS_parser.set_language(LANGUAGE)
        self.node_builder = NodeBuilder(self.grammar)

    def code_to_sequence(self, code_str):
        """Parse *code_str* and return its flattened action sequence."""
        tree = self.TS_parser.parse(bytes(code_str, "utf8"))
        return self.TSTree_to_sequence(tree.root_node, code_str)

    def TSTree_to_sequence(self, TSNode, code_str):
        """Recursively linearize *TSNode*: [type, ...children..., "<REDUCE>"]."""
        node_sequence = [TSNode.type]
        if TSNode.type == "string":
            # Strings expand into explicit start/content/end triples;
            # [1:-1] strips the surrounding quote characters.
            node_text = sub_str_from_coords(code_str, TSNode.start_point, TSNode.end_point)[1:-1]
            node_sequence += ["_string_start", '"', "<REDUCE>"]
            node_sequence += ["_string_content", node_text, "<REDUCE>"]
            node_sequence += ["_string_end", '"', "<REDUCE>"]
        elif not TSNode.children:
            # Leaf: emit its text only when it differs from the node type
            # (skips punctuation/keyword tokens that just repeat the type).
            node_text = sub_str_from_coords(code_str, TSNode.start_point, TSNode.end_point)
            if TSNode.type != node_text:
                node_sequence.append(node_text)
        else:
            # Fix: this was a redundant `elif TSNode.children != []` directly
            # after `== []` — a plain else is equivalent and clearer.
            for child in TSNode.children:
                node_sequence += self.TSTree_to_sequence(child, code_str)
        node_sequence.append("<REDUCE>")
        return node_sequence

    def is_valid_sequence(self, sequence):
        """Return True iff *sequence* starts at "module" and replays cleanly
        into a partial tree."""
        first_node = sequence[0]
        if first_node != "module":
            return False
        partial_tree = PartialTree(first_node, self.node_builder)
        try:
            for expansion in sequence[1:]:
                partial_tree.add_action(expansion)
        except Exception:
            return False
        return True

    def sequence_to_partial_tree(self, sequence):
        """Replay *sequence* into a PartialTree; on error the traceback is
        printed and the partially-built tree is returned as-is."""
        first_node = sequence[0]
        partial_tree = PartialTree(first_node, self.node_builder)
        try:
            for expansion in sequence[1:]:
                partial_tree.add_action(expansion)
        except Exception:
            print("ERROR!")
            traceback.print_exc()
            print("-------")
        return partial_tree
Esempio n. 16
0
def get_parser(language):
    """Return a cached Parser for *language*, creating it on first use."""
    language = LANGUAGE_ALIASES.get(language, language)
    if language in PARSERS:
        return PARSERS[language]
    # First request for this language: build, cache, and return a parser.
    parser = Parser()
    parser.set_language(Language(tree_sitter_build, language))
    PARSERS[language] = parser
    return parser
Esempio n. 17
0
 def __init__(
     self,
     SO_FILE: str,
     LANGUAGE: str,
     to_lower=True,
 ):
     """Load the grammar *LANGUAGE* from the compiled library *SO_FILE*.

     :param SO_FILE: path to the compiled tree-sitter shared object.
     :param LANGUAGE: language name within that library.
     :param to_lower: flag stored for later use — presumably controls
         lowercasing during tokenization; confirm against callers.
     """
     self.parser = Parser()
     self.parser.set_language(Language(SO_FILE, LANGUAGE))
     self.LANGUAGE = LANGUAGE
     self.to_lower = to_lower
Esempio n. 18
0
def get_parser(so_path: str = None) -> Parser:
    """Create a Java parser from *so_path*, defaulting to JAVA_SO_PATH."""
    path = JAVA_SO_PATH if so_path is None else so_path
    parser = Parser()
    parser.set_language(Language(path, 'java'))
    return parser
Esempio n. 19
0
    def __init__(self) -> None:
        """Build the tree-sitter bundle and create one parser per language."""
        # assume submodules exist
        vendor_dirs = ["vendor/tree-sitter-%s" % lang for lang in TREE_SITTER_LANGS]
        Language.build_library(BUILD_PATH, vendor_dirs)

        self.parsers = {}
        for lang in TREE_SITTER_LANGS:
            parser = Parser()
            # Fix: the language was hard-coded to "haskell", so every entry
            # in self.parsers parsed Haskell regardless of its key.
            parser.set_language(Language(BUILD_PATH, lang))
            self.parsers[lang] = parser
Esempio n. 20
0
    def __init__(self):
        """Compile the multi-language grammar bundle (once) and make a parser."""
        lib_path = 'build/my-languages.so'
        if not os.path.exists(lib_path):
            vendor_dirs = [
                'vendor/tree-sitter-c', 'vendor/tree-sitter-cpp',
                'vendor/tree-sitter-c-sharp', 'vendor/tree-sitter-rust',
                'vendor/tree-sitter-javascript', 'vendor/tree-sitter-python'
            ]
            Language.build_library(lib_path, vendor_dirs)

        self.ts = Parser()
        self.tree = None
Esempio n. 21
0
 def __init__(self, grammar, language="python", parser_library_path='src/tree-sitter/tree-sitter-python', **kwargs):
     """Build and load a tree-sitter grammar, then wire up the node builder.

     NOTE(review): the library is rebuilt at the absolute path
     '/build/my-languages.so' on every construction — confirm intended.
     """
     Language.build_library('/build/my-languages.so',[parser_library_path])

     LANGUAGE = Language('/build/my-languages.so', language)

     self.grammar = grammar

     self.TS_parser = Parser()
     self.TS_parser.set_language(LANGUAGE)
     self.node_builder = NodeBuilder(self.grammar)
    def test_tree_cursor_without_tree(self):
        """A cursor keeps its tree alive after the tree goes out of scope."""
        parser = Parser()
        parser.set_language(PYTHON)

        def make_cursor():
            # The Tree object is local here and unreferenced after return.
            return parser.parse(b"def foo():\n  bar()").walk()

        cursor = make_cursor()
        self.assertIs(cursor.node, cursor.node)
        for child in cursor.node.children:
            self.assertIsNotNone(child.is_named)
Esempio n. 23
0
    def test_text_predicates_errors(self):
        """Malformed #eq?/#match? predicates raise RuntimeError at query time."""
        parser = Parser()
        parser.set_language(JAVASCRIPT)

        # Each pattern misuses a predicate: wrong arity, a literal where a
        # capture is required, or a capture where a literal is required.
        bad_patterns = [
            """
            (
                (function_declaration
                    name: (identifier) @function-name
                )
                (#eq? @function-name @function-name fun1)
            )
            """,
            """
            (
                (function_declaration
                    name: (identifier) @function-name
                )
                (#eq? fun1 @function-name)
            )
            """,
            """
            (
                (function_declaration
                    name: (identifier) @function-name
                )
                (#match? @function-name @function-name fun1)
            )
            """,
            """
            (
                (function_declaration
                    name: (identifier) @function-name
                )
                (#match? fun1 @function-name)
            )
            """,
            """
            (
                (function_declaration
                    name: (identifier) @function-name
                )
                (#match? @function-name @function-name)
            )
            """,
        ]
        for pattern in bad_patterns:
            with self.assertRaises(RuntimeError):
                JAVASCRIPT.query(pattern)
Esempio n. 24
0
    def __init__(self,
                 so_file: str,
                 language: str,
                 operators_file: str = None):
        """Create a parser for *language* and load its operator table.

        :param so_file: compiled tree-sitter grammar bundle.
        :param language: language name inside the bundle.
        :param operators_file: JSON operator table; defaults to
            ``operators.json`` next to this module.
        """
        self.parser = Parser()
        self.parser.set_language(Language(so_file, language))
        self.language = language

        if operators_file is None:
            operators_file = os.path.join(os.path.dirname(__file__),
                                          'operators.json')
        with open(operators_file, 'r') as reader:
            self.operators = ujson.load(reader)
 def test_set_language(self):
     """A single Parser instance can switch grammars between parses."""
     parser = Parser()
     parser.set_language(PYTHON)
     tree = parser.parse(b"def foo():\n  bar()")
     self.assertEqual(
         tree.root_node.sexp(),
         trim("""(module (function_definition
             name: (identifier)
             parameters: (parameters)
             body: (block (expression_statement (call
                 function: (identifier)
                 arguments: (argument_list))))))"""),
     )
     # Re-target the same parser at JavaScript and parse again.
     parser.set_language(JAVASCRIPT)
     tree = parser.parse(b"function foo() {\n  bar();\n}")
     self.assertEqual(
         tree.root_node.sexp(),
         trim("""(program (function_declaration
             name: (identifier)
             parameters: (formal_parameters)
             body: (statement_block
                 (expression_statement
                      (call_expression
                         function: (identifier)
                         arguments: (arguments))))))"""),
     )
Esempio n. 26
0
def main(file):
    """Parse *file* with the tree-sitter Java grammar and print a
    GumTree-style XML rendering of its AST.

    NOTE(review): relies on a module-level `doc` (an XML Document) plus the
    helpers `read_file`, `to_gumtree_node`, `process_node`.  Confirm `doc`
    is fresh per invocation, otherwise repeated calls accumulate children.
    """
    this_directory = os.path.dirname(__file__)
    # Build the grammar bundle next to this script (Java only for now;
    # removed the stale commented-out go/python vendor entries).
    Language.build_library(
        os.path.join(this_directory, 'build/my-languages.so'),
        [
            os.path.join(this_directory, 'vendor/tree-sitter-java')
        ])
    java_lang = Language(os.path.join(this_directory, 'build/my-languages.so'),
                         'java')

    parser = Parser()
    parser.set_language(java_lang)

    tree_sitter_tree = parser.parse(read_file(file))

    gumtree_ast = to_gumtree_node(tree_sitter_tree.root_node)

    # Everything lives under <root>, with an empty <context> child to
    # mirror the reference GumTree output format.
    root_node = doc.createElement('root')
    context_node = doc.createElement('context')
    doc.appendChild(root_node)
    root_node.appendChild(context_node)

    # Append the parsed code structure inside <root>.
    root_node.appendChild(gumtree_ast)

    # Recursively add children nodes (if exist)
    process_node(tree_sitter_tree.root_node, gumtree_ast)

    xml = doc.toprettyxml()
    print(xml)
Esempio n. 27
0
def add_lcom5(df, col):
    """Append a "class_lcom5" column to *df* with the LCOM5 cohesion metric
    of each file's contents.

    :param df: DataFrame with "name", "contents" and "encoding" columns.
    :param col: unused; kept for interface compatibility — TODO confirm.
    :return: the same DataFrame with the new column.
    """
    lang_builds = create_parser_builds()
    parser = Parser()
    class_lcom5 = []

    for i in range(len(df)):
        name = df["name"][i]
        ext = name.split('.')[-1]
        parser.set_language(lang_builds[ext])
        # Fix: encode the contents once (the original built the bytes twice
        # and left an unused `enc` local behind).
        data = bytes(df["contents"][i], df["encoding"][i])
        tree = parser.parse(data)
        class_lcom5.append(calculate_lcom5(tree, ext, data, name))
    df["class_lcom5"] = class_lcom5

    return df
Esempio n. 28
0
def get_parser(lang: str) -> Parser:
    """
    Initialize parser for a specific language.
    :param lang: language to use.
    :return: parser.
    """
    global PARSERS
    # Fast path: reuse a previously created parser for this language.
    if lang in PARSERS:
        return PARSERS[lang]
    parser = Parser()
    parser.set_language(Language(get_tree_sitter_so(), lang))
    PARSERS[lang] = parser
    return parser
Esempio n. 29
0
    def test_multibyte_characters(self):
        """Node byte offsets index the UTF-8 source correctly even when the
        source contains multibyte characters."""
        source_code = bytes("'😎' && '🐍'", "utf8")
        parser = Parser()
        parser.set_language(JAVASCRIPT)
        root = parser.parse(source_code).root_node
        binary = root.children[0].children[0]
        snake = binary.children[2]

        self.assertEqual(binary.type, "binary_expression")
        self.assertEqual(snake.type, "string")
        # Slicing the raw bytes with the node's offsets must recover '🐍'.
        self.assertEqual(
            source_code[snake.start_byte:snake.end_byte].decode(
                'utf8'), "'🐍'")
0
def jobs(repo_path, args):
    """Mine *repo_path* for commit-message / code-diff pairs and append them
    as JSON lines to args.output_file.

    :return: (Counter of files-per-commit, list of add/del token ratios).
    """
    PARSER = Parser()
    PARSER.set_language(Language(args.tree_sitter, args.lang))

    n_file_per_commit = Counter()
    add_tokens_per_del_tokens = []

    if os.path.exists(repo_path):
        # Drop .gitmodules so submodules are not traversed.
        submodule = os.path.join(repo_path, '.gitmodules')
        if os.path.exists(submodule):
            os.remove(submodule)

        try:
            n_stored_commit = 0
            for commit in RepositoryMining(
                repo_path,
                only_no_merge=True,
                only_in_branch='master',
                only_modifications_with_file_types=language_ext[args.lang]
            ).traverse_commits():
                if n_stored_commit > args.max_commit_number:
                    break

                cleaned_message = message_cleaner(commit.msg)
                if not cleaned_message:
                    continue
                commit_tokens = tokenize_docstring_from_string(cleaned_message)

                if len(commit_tokens) < args.min_target_length:
                    continue

                addeds, deleteds, n_files = get_code_diff(commit, PARSER, args)
                if 1 <= n_files <= args.max_duplicate:
                    with jsonlines.open(args.output_file, mode="a") as writer:
                        writer.write(
                            {
                                "commit_tokens": commit_tokens,
                                "add_tokens": addeds[0],
                                "del_tokens": deleteds[0],
                            }
                        )
                    # Fix: an empty del-token list raised ZeroDivisionError,
                    # which the broad except below swallowed — silently
                    # aborting the whole repository traversal.
                    if deleteds[0]:
                        add_tokens_per_del_tokens.append(
                            len(addeds[0]) / len(deleteds[0]))
                    n_file_per_commit.update({n_files})
                    n_stored_commit += 1
        except Exception:
            # Best-effort mining: keep whatever was collected so far.
            # NOTE(review): consider logging the exception for diagnosis.
            pass

    return (n_file_per_commit, add_tokens_per_del_tokens)