Ejemplo n.º 1
0
    def __init__(self, node_type_vocab_path, token_vocab_path,
                 subtree_vocab_path, data_path):
        """Load every compiled tree-sitter grammar from ~/.tree-sitter/bin,
        then delegate vocab/data setup to the superclass.

        Args:
            node_type_vocab_path: path to the node-type vocabulary file.
            token_vocab_path: path to the token vocabulary file.
            subtree_vocab_path: path to the subtree vocabulary file.
            data_path: path to the dataset.
        """
        home = str(Path.home())
        cd = os.getcwd()
        # Grammars are compiled as <lang>.so files under ~/.tree-sitter/bin;
        # chdir so glob("*.so") sees them.
        bin_dir = path.join(home, ".tree-sitter", "bin")
        os.chdir(bin_dir)
        self.Languages = {}
        try:
            for so_file in glob.glob("*.so"):
                lang = os.path.splitext(so_file)[0]
                try:
                    self.Languages[lang] = Language(
                        path.join(bin_dir, so_file), lang)
                except Exception:
                    # Best effort: a grammar that fails to load is skipped.
                    # (Was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    print("An exception occurred to {}".format(lang))
        finally:
            # BUG FIX: always restore the caller's working directory, even if
            # the loop raises; previously an exception left us chdir'ed away.
            os.chdir(cd)

        # Node types that should never appear in extracted trees.
        self.excluded_node_types = ["comment", "error", "'", '"']
        super().__init__(node_type_vocab_path, token_vocab_path,
                         subtree_vocab_path, data_path)
Ejemplo n.º 2
0
    def build_parser(self):
        """Clone the grammar repo for this language (if needed), compile the
        tree-sitter shared library, and initialize self.language/self.parser.
        """
        url, folder = self.LANG_URL
        repo_dir = Path(fast_trees.__path__[0] + "/" + folder)
        if repo_dir.exists():
            print("Repo already exists, continuing.")
        else:
            print(f"Downloading repo {url} to {repo_dir}.")
            Repo.clone_from(url, repo_dir)

        # BUG FIX: the library path previously prepended fast_trees.__path__[0]
        # onto repo_dir, which already starts with that prefix, producing a
        # duplicated/invalid path. Derive it from repo_dir alone.
        build_dir = str(repo_dir / "build" / "my-languages.so")
        Language.build_library(
            # Store the library in the `build` directory
            build_dir,
            # Include one or more languages
            [str(repo_dir)],
        )
        self.language = Language(build_dir, self.LANG)
        self.parser = Parser()
        self.parser.set_language(self.language)
Ejemplo n.º 3
0
 def __init__(self,
              code,
              language='python',
              tree_style='SPT',
              path_style='L2L'):
     """Parse *code* with tree-sitter and set up path-extraction state.

     Args:
         code: source text to parse.
         language: grammar name inside the prebuilt ../build/csn.so library
             ('go', 'java', 'javascript', 'php', 'python', 'ruby').
         tree_style: tree variant to use (AST | SPT | HST | HPT).
         path_style: path variant to use (L2L | UD | U2D).
     """
     self.tree_style = tree_style
     self.path_style = path_style
     # The shared library is assumed to have been prebuilt via
     # Language.build_library from the vendored tree-sitter grammars.
     csn_so = '../build/csn.so'
     ts_parser = Parser()
     ts_parser.set_language(Language(csn_so, language))
     tree = ts_parser.parse(code.encode())
     code_lines = code.split('\n')
     self.root, self.terminals, self.num_eldest = self.traverse(
         tree, code_lines)
     # Node buckets filled during later traversal/extraction passes.
     self.terminal_nodes = []
     self.nonterminal_nodes = []
     self.leafpath_terminal_nodes = []
     self.leafpath_nonterminal_nodes = []
     self.rootpath_terminal_nodes = []
     self.rootpath_nonterminal_nodes = []
     self.debug = False
     if self.debug:
         print(f'{"@" * 9}code\n{code}')
         print(f'{"@" * 9}sexp\n{tree.root_node.sexp()}')
Ejemplo n.º 4
0
def corpus_dataflow_match(references, candidates, lang):
    """Corpus-level data-flow match score between candidates and references.

    For every candidate/reference pair, extracts normalized data-flow edges
    and counts how many reference edges are matched by the candidate.

    Args:
        references: one list of reference code strings per candidate.
        candidates: list of candidate code strings.
        lang: key into dfg_function; also selects the parser grammar.

    Returns:
        match_count / total_count, or 0 when no reference data-flow exists.
    """
    LANGUAGE = Language('parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(LANGUAGE)
    parser = [parser, dfg_function[lang]]
    match_count = 0
    total_count = 0

    for candidate, references_sample in zip(candidates, references):
        for reference in references_sample:
            # BUG FIX: comments were always stripped with the 'java' rules
            # regardless of `lang`; honor the requested language instead.
            try:
                candidate = remove_comments_and_docstrings(candidate, lang)
            except Exception:
                pass  # best effort: fall back to the raw code
            try:
                reference = remove_comments_and_docstrings(reference, lang)
            except Exception:
                pass

            cand_dfg = get_data_flow(candidate, parser)
            ref_dfg = get_data_flow(reference, parser)

            normalized_cand_dfg = normalize_dataflow(cand_dfg)
            normalized_ref_dfg = normalize_dataflow(ref_dfg)

            if normalized_ref_dfg:
                total_count += len(normalized_ref_dfg)
                for dataflow in normalized_ref_dfg:
                    if dataflow in normalized_cand_dfg:
                        match_count += 1
                        # Consume the matched edge so duplicates are counted once.
                        normalized_cand_dfg.remove(dataflow)
    if total_count == 0:
        print(
            "WARNING: There is no reference data-flows extracted from the whole corpus, and the data-flow match score degenerates to 0. Please consider ignoring this score."
        )
        return 0
    score = match_count / total_count
    return score
Ejemplo n.º 5
0
    def __init__(self, SO_FILE, LANGUAGE, to_lower=False, operators_file=None):
        """Create a tree-sitter based parser wrapper.

        Args:
            SO_FILE: path to the compiled tree-sitter language library.
            LANGUAGE: language name (e.g. 'python', 'csharp').
            to_lower: whether tokens should be lower-cased downstream.
            operators_file: JSON file of operators; defaults to
                operators.json next to this module.
        """
        self.parser = Parser()
        # BUG FIX: the original used `assert cond, FileExistsError(...)`, which
        # raises AssertionError (and is removed entirely under `python -O`), so
        # the `except FileExistsError` handler could never run. Check and
        # download explicitly instead.
        if not PathManager.exists(SO_FILE):
            LOGGER.warning(
                f"{SO_FILE} does not exist, automatically download TreeSitter parse file {LANGUAGE}.so."
            )
            from ncc.hub.tree_sitter.download import download
            download(LANGUAGE)

        # tree-sitter's grammar name for C# differs from the user-facing one.
        if LANGUAGE == 'csharp':
            LANGUAGE = 'c_sharp'
        self.parser.set_language(Language(SO_FILE, LANGUAGE))
        self.LANGUAGE = LANGUAGE
        self.to_lower = to_lower

        if operators_file is None:
            operators_file = os.path.join(os.path.dirname(__file__),
                                          'operators.json')
        with open(operators_file, 'r') as reader:
            self.operators = json_io.json_load(reader)
Ejemplo n.º 6
0
def getLang(path):
    """Return the tree-sitter Language for a file, chosen by its extension.

    Args:
        path: filename or path; only the text after the last '.' is used.

    Returns:
        A Language loaded from 'build/my-languages.so'.

    Raises:
        ValueError: if the extension maps to no supported language
            (subclass of Exception, which the original raised bare).
    """
    # Extension -> grammar name inside the shared library.
    ext_to_lang = {
        'cpp': 'cpp', 'h': 'cpp', 'cc': 'cpp', 'hh': 'cpp', 'hpp': 'cpp',
        'c': 'c',
        'py': 'python',
        'cs': 'c_sharp',
        'rs': 'rust',
        'js': 'javascript',
    }
    ending = path.split('.')[-1]
    try:
        return Language('build/my-languages.so', ext_to_lang[ending])
    except KeyError:
        # More informative than the original bare `raise Exception`.
        raise ValueError(f"unsupported file extension: {ending!r}")
Ejemplo n.º 7
0
    def __init__(self,
                 language: str,
                 query_class_name: str,
                 query_file_path: str,
                 library_loc: str = None):
        """Set up a tree-sitter parser plus the named query set.

        The parser library location falls back to the TS_LIB_PATH
        environment variable when not passed explicitly.
        """
        env_loc = os.getenv("TS_LIB_PATH")
        if library_loc is None and env_loc is not None:
            library_loc = env_loc

        if not library_loc:
            raise ParserLibraryNotFoundError(
                "Parser library path is 'None'. Please either set up the environment or call the constructor with the path"
            )

        lib_path = Path(library_loc)
        if not (lib_path.exists() and lib_path.is_file()):
            raise ParserLibraryNotFoundError(
                f"Parser library '{library_loc}' not found. Did you set up the environement variables?"
            )

        self.language = Language(library_loc, language)
        self.parser = Parser()
        self.parser.set_language(self.language)
        self.qclass = Query.fromFile(query_file_path)
        self.QUERIES = self.qclass[query_class_name]
Ejemplo n.º 8
0
def corpus_dataflow_match(references, candidates, lang):
    """Corpus-level data-flow match score.

    Args:
        references: one list of reference code strings per candidate.
        candidates: list of candidate code strings.
        lang: key into dfg_function; also selects the parser grammar.

    Returns:
        Fraction of reference data-flow edges matched by the candidates,
        or 1.0 when no reference edge was extracted at all.
    """
    LANGUAGE = Language(PARSER_LOCATION, lang)
    parser = Parser()
    parser.set_language(LANGUAGE)
    parser = [parser, dfg_function[lang]]
    match_count = 0
    total_count = 0

    for candidate, references_sample in zip(candidates, references):
        for reference in references_sample:
            # BUG FIX: comments were stripped with the 'java' rules for every
            # language; honor the `lang` argument instead.
            try:
                candidate = remove_comments_and_docstrings(candidate, lang)
            except Exception:
                pass  # best effort: fall back to the raw code
            try:
                reference = remove_comments_and_docstrings(reference, lang)
            except Exception:
                pass

            cand_dfg = get_data_flow(candidate, parser)
            ref_dfg = get_data_flow(reference, parser)

            normalized_cand_dfg = normalize_dataflow(cand_dfg)
            normalized_ref_dfg = normalize_dataflow(ref_dfg)

            if normalized_ref_dfg:
                total_count += len(normalized_ref_dfg)
                for dataflow in normalized_ref_dfg:
                    if dataflow in normalized_cand_dfg:
                        match_count += 1
                        # Consume the matched edge so duplicates count once.
                        normalized_cand_dfg.remove(dataflow)

    score = match_count / total_count if total_count > 0 else 1.0
    return score
Ejemplo n.º 9
0
from parser import (remove_comments_and_docstrings, tree_to_token_index,
                    index_to_code_token, tree_to_variable_index)
from tree_sitter import Language, Parser
# Map each supported language to its data-flow-graph extraction function.
dfg_function = {
    'python': DFG_python,
    'java': DFG_java,
    'ruby': DFG_ruby,
    'go': DFG_go,
    'php': DFG_php,
    'javascript': DFG_javascript
}

#load parsers
# parsers[lang] is a [tree_sitter_parser, dfg_extractor] pair — the shape
# consumed by extract_dataflow below.
parsers = {}
for lang in dfg_function:
    LANGUAGE = Language('parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(LANGUAGE)
    parser = [parser, dfg_function[lang]]
    parsers[lang] = parser


#remove comments, tokenize code and extract dataflow
def extract_dataflow(code, parser, lang):
    #remove comments
    try:
        code = remove_comments_and_docstrings(code, lang)
    except:
        pass
    #obtain dataflow
    if lang == "php":
Ejemplo n.º 10
0
        # Clone all repos above at the given tag
        repo_dirs = []
        for lang, (url, suffix) in _LANGUAGE_REPOS.items():
            logging.warning(f"Cloning `{lang}`...")
            repo_dir = os.path.join(dir, lang)
            repo = Repo.clone_from(url, repo_dir)
            repo.git.checkout(TREE_SITTER_LANG_VER)
            repo_dirs.append(os.path.join(repo_dir, suffix))

        # Build library by pointing to each repo
        logging.warning(f"Building Tree-sitter Library...")
        Language.build_library(LIBRARY_DIR, repo_dirs)

# Load one tree-sitter Language per cloned grammar from the shared library.
_LANGUAGES = {}
for language in _LANGUAGE_REPOS:
    _LANGUAGES[language] = Language(LIBRARY_DIR, language)

# Add aliases
# Common alternative names mapped onto the canonical grammar names above.
# NOTE(review): assumes every alias target (e.g. "typescript") is present in
# _LANGUAGE_REPOS — confirm against the repo table defined above this excerpt.
_ALIASES = {
    "c++": "cpp",
    "c#": "c_sharp",
    "csharp": "c_sharp",
    "js": "javascript",
    "ts": "typescript"
}
for alias, target in _ALIASES.items():
    _LANGUAGES[alias] = _LANGUAGES[target]

def parser_for(language: str) -> Parser:
    """Return a Parser configured for `language` (canonical names or aliases)."""
    # NOTE(review): no `return parser` is visible in this excerpt — as shown,
    # the function returns None despite its annotation. This looks like a
    # truncated snippet; confirm the return statement in the full source.
    parser = Parser()
    parser.set_language(_LANGUAGES[language])
Ejemplo n.º 11
0
"""[1:-1])

# Install the grammar's npm dependencies and regenerate the parser sources.
subprocess.run(["npm", "install"], cwd=grammar_path, check=True)
subprocess.run(["npx", "tree-sitter", "generate"],
               cwd=grammar_path,
               check=True)
# Following are commented for future reference to expose playground
# Remove "--docker" if local environment matches with the container
# subprocess.run(["npx", "tree-sitter", "build-wasm", "--docker"],
#                cwd=grammar_path, check=True)

# Compile the generated grammar into a loadable shared library.
Language.build_library(grammar_path + "/build/wgsl.so", [
    grammar_path,
])

WGSL_LANGUAGE = Language(grammar_path + "/build/wgsl.so", "wgsl")

parser = Parser()
parser.set_language(WGSL_LANGUAGE)

# Accumulates failures found while scanning the examples below.
error_list = []

for key, value in scanner_components[scanner_example.name()].items():
    if "expect-error" in key:
        continue
    value = value[:]
    if "function-scope" in key:
        value = ["fn function__scope____() {"] + value + ["}"]
    if "type-scope" in key:
        # Initiailize with zero-value expression.
        value = ["let type_scope____: "] + value + ["="] + value + ["()"
Ejemplo n.º 12
0
def build_parser(language):
    """Create a Parser wired to `language` from build/my-languages.so."""
    ts_language = Language('build/my-languages.so', language)
    ts_parser = Parser()
    ts_parser.set_language(ts_language)
    return ts_parser
Ejemplo n.º 13
0
import os
from tree_sitter import Language, Parser

# Build (or rebuild) the WordProcessing tree-sitter library next to this module.
dname = os.path.dirname(__file__)
so_path = os.path.join(dname, "./tree-sitter-wp.so")

Language.build_library(so_path, [
    os.path.join(dname, 'tree-sitter-wordprocessing')])

WP_LANGUAGE = Language(so_path, 'wordprocessing')

parser = Parser()
parser.set_language(WP_LANGUAGE)
# Hard-coded data roots used by the corpus-processing steps below.
fopDataRoot = '/home/hungphd/'
fopGithub = '/home/hungphd/git/'
fopBuildFolder = fopDataRoot + 'build-tree-sitter/'
fpLanguageSo = fopBuildFolder + 'my-languages.so'

# NOTE(review): fopJsonData, fopAlonCorpus, fopDataAPICalls and
# createDirIfNotExist are not defined in this excerpt — presumably defined or
# imported earlier in the full file; confirm.
createDirIfNotExist(fopJsonData)
# createDirIfNotExist(fopFilesPerProjectData)
lstFopAlonCorpus = []
lstFopJsonData = []
lstFpLogAPICalls = []
lstFopFilesPerProjectData = []

lstFolderNames = ['training', 'test', 'validation']

# Re-point the module-level parser at the Java grammar for the corpus walk.
JAVA_LANGUAGE = Language(fpLanguageSo, 'java')
parser = Parser()
parser.set_language(JAVA_LANGUAGE)
isCollectFromStart = False

# Build per-split directory and log-file paths.
for i in range(0, len(lstFolderNames)):
    folderName = lstFolderNames[i]
    lstFopJsonData.append(fopJsonData + folderName + '/')
    lstFopAlonCorpus.append(fopAlonCorpus + folderName + '/')
    lstFpLogAPICalls.append(fopDataAPICalls + 'log_projects_' + folderName +
                            '.txt')
    # lstFopFilesPerProjectData.append(fopFilesPerProjectData+folderName+'/')

for i in range(1, len(lstFolderNames)):
    logFileLocationForEachJavaProjects(lstFopAlonCorpus[i], lstFopJsonData[i],
                                       lstFpLogAPICalls[i], parser,
Ejemplo n.º 15
0
from tree_sitter import Language, Parser

# Store the library in the `build` directory
so_file_path = 'build/python-languages.so'
# The grammar repository is the current directory.
python_repo_path = '.'

# Build the *.so file
Language.build_library(so_file_path, python_repo_path)

PY_LANGUAGE = Language(so_file_path, 'python')

parser = Parser()
parser.set_language(PY_LANGUAGE)

# Demo snippet; tree-sitter parses bytes, not str.
source_bytes = bytes(
    '''
def foo(required, optional=None):
    if bar:
        baz()
''', 'utf8')

tree = parser.parse(source_bytes)

root_node = tree.root_node

print()
print('type(tree) =', type(tree))
print('type(root_node) =', type(root_node))

print()
print('root_node.sexp() =', root_node.sexp())
Ejemplo n.º 16
0
import os
import sys
import ast
import inspect
from tree_sitter import Language, Parser

# Locate the compiled Python grammar next to this module; on first use, clone
# the upstream grammar, build the shared library, then remove the checkout.
tree_sitter_python_file = os.path.join(
    os.path.dirname(inspect.getfile(sys.modules[__name__])),
    'tree-sitter-python.so')
if not os.path.isfile(tree_sitter_python_file):
    os.system(
        'git clone --depth 1 https://github.com/tree-sitter/tree-sitter-python'
    )
    Language.build_library(tree_sitter_python_file, ['tree-sitter-python'])
    os.system('rm -rf tree-sitter-python')
PY_LANGUAGE = Language(tree_sitter_python_file, 'python')
parser = Parser()
parser.set_language(PY_LANGUAGE)

# Global registry of Operator instances keyed by identifier.
operator_registry = {}


def get_or_create_operator(identifier):
    """Return the registered Operator for `identifier`, creating one if absent."""
    try:
        return operator_registry[identifier]
    except KeyError:
        # Operator's constructor registers the new instance itself.
        return Operator(identifier)


class Operator:
    """Operator that registers itself in the global registry on construction."""
    def __init__(self, identifier):
        # Register this instance for reuse by get_or_create_operator.
        # NOTE(review): the identifier itself is not stored on the instance in
        # this excerpt — confirm later code does not expect self.identifier.
        operator_registry[identifier] = self
Ejemplo n.º 17
0
# Map each language key to its data-flow-graph extraction function.
dfg_function = {
    'python': DFG_python,
    'java': DFG_java,
    'ruby': DFG_ruby,
    'go': DFG_go,
    'csharp': DFG_csharp,
    'php': DFG_php,
    'javascript': DFG_javascript,
    'cpp': DFG_cpp,
}

# load parsers
# parsers[lang] is a [tree_sitter_parser, dfg_extractor] pair; the grammar
# library uses 'c_sharp' where this module's key is 'csharp'.
parsers = {}
for lang in dfg_function:
    LANGUAGE = Language(
        os.path.join(os.path.dirname(__file__), 'my-languages.so'),
        'c_sharp' if lang == 'csharp' else lang)
    parser = Parser()
    parser.set_language(LANGUAGE)
    parser = [parser, dfg_function[lang]]
    parsers[lang] = parser

from dataset.clcdsa.dfg.utils import (
    remove_comments_and_docstrings,
    tree_to_token_index,
    index_to_code_token,
    tree_to_variable_index,
)


# remove comments, tokenize code and extract dataflow
Ejemplo n.º 18
0
# pylint: disable=missing-docstring

import re
from unittest import TestCase
from os import path
from tree_sitter import Language, Parser

# Compile the test fixture grammars into one shared library, then load both.
LIB_PATH = path.join("build", "languages.so")
Language.build_library(
    LIB_PATH,
    [
        path.join("tests", "fixtures", "tree-sitter-python"),
        path.join("tests", "fixtures", "tree-sitter-javascript"),
    ],
)
PYTHON = Language(LIB_PATH, "python")
JAVASCRIPT = Language(LIB_PATH, "javascript")


class TestParser(TestCase):
    def test_set_language(self):
        parser = Parser()
        parser.set_language(PYTHON)
        tree = parser.parse(b"def foo():\n  bar()")
        self.assertEqual(
            tree.root_node.sexp(),
            trim("""(module (function_definition
                name: (identifier)
                parameters: (parameters)
                body: (block (expression_statement (call
                    function: (identifier)
Ejemplo n.º 19
0
def _read_source(format, upload_label):
    """Return the user's source code from the paste box or an uploaded file.

    Args:
        format: 'Paste' or 'Upload' (the selectbox value from main()).
        upload_label: prompt shown on the file uploader widget.

    Returns:
        The code as a string, or None when nothing was provided yet.
    """
    if format == 'Paste':
        code = st.text_area("Enter code")
        return code if code else None
    if format == 'Upload':
        uploaded_file = st.file_uploader(upload_label)
        if uploaded_file is not None:
            stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
            return stringio.read()
    return None


def _summarize_and_render(code, parser, pipeline, comment_prefix, lang_tag,
                          fname):
    """Summarize `code`, prepend the summary as a comment, display the result
    and offer it for download.

    Args:
        code: source code string.
        parser: tree-sitter parser for the code's language.
        pipeline: SummarizationPipeline for the language's model.
        comment_prefix: '#' for Python, '//' for the C-family languages.
        lang_tag: syntax-highlighting tag passed to st.code.
        fname: output filename for the downloadable commented code.
    """
    tree = parser.parse(bytes(code, "utf8"))
    code_list = []
    tokenized_code = my_traverse(code, tree.root_node, code_list)
    out = pipeline([tokenized_code])
    final_code = f"{comment_prefix} {out[0]['summary_text']}\n{code}"
    st.code(final_code, language=lang_tag)
    with open(fname, 'w') as f:
        f.write(final_code)
    st.markdown(get_binary_file_downloader_html(fname, 'Code'),
                unsafe_allow_html=True)


def main():
    """Streamlit app: generate a summary comment for pasted/uploaded code."""
    st.title('Code Decoder')
    language = st.sidebar.selectbox('Choose Language',
                                    ('Python', 'Java', 'Javascript', 'Go'))
    format = st.selectbox('Upload or Paste your code over', ('Upload', 'Paste'))

    # Per-language configuration: (grammar name, model name, comment prefix,
    # st.code tag, output filename, uploader prompt).
    config = {
        'Python': ('python', 't5_base_python', '#', 'python',
                   'commented_code.py', 'Upload your python file'),
        'Go': ('go', 't5_base_go', '//', 'go',
               'commented_code.go', 'Upload your go file'),
        'Javascript': ('javascript', 't5_base_javascript', '//', 'javascript',
                       'commented_code.js', 'Upload your javascript file'),
        'Java': ('java', 't5_base_java', '//', 'java',
                 'commented_code.java', 'Upload your java file'),
    }
    grammar, model_name, prefix, lang_tag, fname, upload_label = config[language]

    # BUG FIX: the original duplicated this flow per language; its
    # Javascript/Java upload branches reused the *Go* parser and pipeline, and
    # several branches wrote a `final_code` variable that was never assigned in
    # that branch (NameError at runtime). Every language now runs one code
    # path with its own parser and model.
    parser = build_parser(grammar)
    model, tokenizer = loadmodel(model_name)
    pipeline = SummarizationPipeline(model=model, tokenizer=tokenizer,
                                     device=0)

    code = _read_source(format, upload_label)
    if code is not None:
        _summarize_and_render(code, parser, pipeline, prefix, lang_tag, fname)

if __name__ == '__main__':
    main()
Ejemplo n.º 20
0
from tree_sitter import Language, Parser
import sys
import difflib
import os, sys


def resource_path(relative_path):
    """Resolve `relative_path` against the PyInstaller bundle directory when
    running as a frozen executable, otherwise against the current directory.

    PyInstaller extracts bundled data files to sys._MEIPASS at runtime.
    """
    if hasattr(sys, "_MEIPASS"):
        base_path = sys._MEIPASS
    else:
        # Not frozen: resolve relative to the working directory.
        base_path = os.path.abspath(".")
    return os.path.join(base_path, relative_path)


# Module-level C++ parser shared by the helpers below.
CPP_LANGUAGE = Language(resource_path('language.so'), 'cpp')
parser = Parser()
parser.set_language(CPP_LANGUAGE)


def tokenize(root, b_code, tokens, types):
    """Append the text and node type of every leaf under `root`, in order.

    Args:
        root: a tree-sitter node (or any node with .children/.start_byte/
            .end_byte/.type).
        b_code: the source as bytes, sliced by node byte offsets.
        tokens: output list receiving the decoded leaf texts.
        types: output list receiving the corresponding node types.
    """
    if not root.children:
        # Leaf: emit its source text and type.
        tokens.append(b_code[root.start_byte:root.end_byte].decode('utf-8'))
        types.append(root.type)
    else:
        for child in root.children:
            tokenize(child, b_code, tokens, types)


def types_diff(types1, types2):
Ejemplo n.º 21
0
# for pre-train (python and java)

import copy
import json
import json as json
import os
import random
import re
import sys

import numpy as np
from tqdm import tqdm
from tree_sitter import Language, Parser


# All three grammars live in one prebuilt shared library.
CS_LANGUAGE = Language('build_parser/languages_java_py_cs.so', 'c_sharp')
JA_LANGUAGE = Language('build_parser/languages_java_py_cs.so', 'java')
PY_LANGUAGE = Language('build_parser/languages_java_py_cs.so', 'python')

# File-extension key -> grammar object.
lang = {
    "py" : PY_LANGUAGE,
    "java" : JA_LANGUAGE,
    "cs" : CS_LANGUAGE
}
parser = Parser()

# NOTE(review): `Path` here is a plain data-directory string, not pathlib.Path.
Path = "/home/pretrain_data_code/"
savepath = "/home/pretrain_data_AST_tmp/"

# Work buffers for the AST traversal performed later in this script.
AST = []
queue = []
Ejemplo n.º 22
0
    ast_tree = delete_single_child_ndoe(ast_tree)
    ast_tree = to_binary_tree(ast_tree)  # to binary ast tree
    ast_tree = reset_indices(ast_tree)  # reset node indices
    return ast_tree


def parse_deepcom(ast_tree: dict, sbt_func: None, to_lower: bool):
    """Serialize an AST with the given structure-based-traversal function.

    Args:
        ast_tree: AST in dict form.
        sbt_func: callable invoked as sbt_func(ast_tree, root_name, to_lower).
            NOTE(review): annotated `None` in the original, but it is clearly
            a callable — consider fixing the annotation upstream.
        to_lower: forwarded to sbt_func (lower-case tokens when True).

    Returns:
        The SBT sequence produced by sbt_func.
    """
    sbt_seq = sbt_func(ast_tree, constants.ROOT_NODE_NAME, to_lower)
    return sbt_seq


if __name__ == '__main__':
    parser = Parser()
    parser.set_language(
        Language(
            '/data/wanyao/yang/ghproj_d/GitHub/naturalcodev2/dataset/parser_zips/languages.so',
            'ruby'))
    code = '''
def create_router(name, admin_state_up = true)
    data = {
        'router' =>{
            'name' => name,
            'admin_state_up' => admin_state_up,
        }   
    }
    return post_request(address("routers"), data, @token)
end 
    '''.replace('\t', '    ').replace('\n', ' ').strip()
    ast_tree = parser.parse(bytes(code, "utf8"))
    code_lines = code.split('\n')  # raw code
    # 1) build ast tree in Dict type
Ejemplo n.º 23
0
import argparse
from os.path import exists
import re
from os import path
from tree_sitter import Language, Parser
from pathlib import Path
home = str(Path.home())

import glob, os
# Load every compiled grammar (<lang>.so) from ~/.tree-sitter/bin, restoring
# the working directory afterwards.
cd = os.getcwd()
os.chdir(path.join(home, ".tree-sitter", "bin"))
Languages = {}
for file in glob.glob("*.so"):
  try:
    lang = os.path.splitext(file)[0]
    Languages[lang] = Language(path.join(home, ".tree-sitter", "bin", file), lang)
  except:
    # Best effort: grammars that fail to load are skipped.
    print("An exception occurred to {}".format(lang))
os.chdir(cd)

# Command-line interface; `parser` is rebound to a tree-sitter Parser in main().
parser = argparse.ArgumentParser()
parser.add_argument('--language', metavar='L', type=str, default="java", help='language to parse')
parser.add_argument('--filename', metavar='F', type=str, default="../examples/raw_code/104.c", help='file to parse')
#parser.add_argument('--node_types', type=str, help='a list of node types to be selected')
opt = parser.parse_args()

def main(opt):
    parser = Parser()
    print(opt.language[0])
    lang = Languages.get("java")
    parser.set_language(lang)
Ejemplo n.º 24
0
def get_language(name):
    """Return the tree-sitter Language called `name` from the module-level
    shared library at `so_path`."""
    return Language(so_path, name)
Ejemplo n.º 25
0
#!/usr/bin/env python3

from tree_sitter import Language, Parser
from pathlib import Path

import pkg_resources

# The compiled binding (binding.<abi>.so) ships inside this package directory.
LANGUAGE = Language(next(Path(__file__).parent.glob("binding.*.so")),
                    "minizinc")
# Query used to drive syntax highlighting from the packaged .scm file.
HIGHLIGHT_QUERY = LANGUAGE.query(
    pkg_resources.resource_string(__name__, "queries/highlights.scm"))

try:
    from pygments.lexer import Lexer
    from pygments import token

    class TreeSitterLexer(Lexer):
        ts_alias = {
            "comment": token.Comment,
            "type.builtin": token.Name.Builtin,
            "punctuation.delimiter": token.Punctuation,
            "function": token.Name.Function,
            "keyword": token.Keyword,
            "operator": token.Operator,
            "punctuation.bracket": token.Punctuation,
            "number": token.Number,
            "string": token.String,
            "escape": token.String.Escape,
            "constant.builtin": token.Generic,
            "variable": token.Name.Variable,
        }
Ejemplo n.º 26
0
"""

from tree_sitter import Language, Parser
from queue import SimpleQueue

# Compile the vendored C++ grammar into a shared library, then load it.
Language.build_library(
    # Store the library in the `build` directory
    'build/my-languages.so',

    # Include one or more languages
    [
        'tree-sitter-cpp',
    ])

CPP_LANGUAGE = Language('build/my-languages.so', 'cpp')
parser = Parser()
parser.set_language(CPP_LANGUAGE)

tree = parser.parse(
    bytes(
        """
#include <iostream>
#include <cstdlib>

auto main( int argc, char** argv ) -> int
{
    std::cout << "Hello world!" << std::endl;
    
    return EXIT_SUCCESS;
}
# Per-split output directories (base paths defined earlier in the full file).
fopStep4GraphSimplifyTestW = fopStep4GraphSimplify + 'testW/'
fopStep4GraphSimplifyTrain = fopStep4GraphSimplify + 'train/'

# Per-split log files.
fpLogTrain = fopCompiledFiles + 'log_train.txt'
fpLogTestP = fopCompiledFiles + 'log_testP.txt'
fpLogTestW = fopCompiledFiles + 'log_testW.txt'

# Hard-coded data roots.
fopDataRoot = '/home/hungphd/'
fopGithub = '/home/hungphd/git/'
fopBuildFolder = fopDataRoot + 'build-tree-sitter/'
fpLanguageSo = fopBuildFolder + 'my-languages.so'

from pycorenlp import StanfordCoreNLP
# NOTE(review): requires a CoreNLP server listening on localhost:9000.
nlpObj = StanfordCoreNLP('http://localhost:9000')

CPP_LANGUAGE = Language(fpLanguageSo, 'cpp')
parser = Parser()
parser.set_language(CPP_LANGUAGE)

numOmit = 30
offsetContext = 3
# Run the compile-and-graph-extraction pipeline for each split.
compileMixCCodeAndSave(fopStep1TestPMixFiles, fopStep2TestPMixFiles,
                       fopStep3TestPMixFiles, fopStep4GraphAllTestP,
                       fopStep4GraphSimplifyTestP, fpLogTestP, nlpObj,
                       offsetContext, False)
compileMixCCodeAndSave(fopStep1TestWMixFiles, fopStep2TestWMixFiles,
                       fopStep3TestWMixFiles, fopStep4GraphAllTestW,
                       fopStep4GraphSimplifyTestW, fpLogTestW, nlpObj,
                       offsetContext, False)
compileMixCCodeAndSave(fopStep1TrainMixFiles, fopStep2TrainMixFiles,
                       fopStep3TrainMixFiles, fopStep4GraphAllTrain,
Ejemplo n.º 28
0
 def _get_ts_language(self):
     """Return the tree-sitter Language for self.lang, loading it lazily and
     caching the result on the instance."""
     if self.ts_language is None:
         self.ts_language = Language(self._get_language_library(), self.lang)
     return self.ts_language
Ejemplo n.º 29
0
from javim.buffer_change import BufferChangeListener, BufferChangeDispatcher
from javim.util_classes import OffsetChain, ReplaceRangeOffsetChainUpdate, DeleteOffsetChainUpdate, DelayedAction
import javim
import treelib as tl
from tempfile import mkstemp
from os import remove
from os.path import exists
from threading import Lock

from tree_sitter import Language, Parser, Tree

# Build and load the Java grammar from a local checkout.
Language.build_library("build/langs.so", ["/home/friese/git/tree-sitter-java"])
JAVA_LANG = Language("build/langs.so", "java")
parser = Parser()
parser.set_language(JAVA_LANG)

# Temp file used to persist the rendered tree between updates.
_, tree_file = mkstemp(suffix="tree")

# Serializes access to the tree state across buffer-change callbacks.
tree_lock = Lock()


def tree2file(tree: Tree):
    if exists(tree_file): remove(tree_file)

    def gen_index():
        i = 1
        while True:
            yield i
            i += 1

    index = gen_index()
Ejemplo n.º 30
0
"""
Usage:
    parser_cli.py [options] INPUT_FILEPATH

Options:
    -h --help
    --language LANGUAGE             Language
"""
import json

from docopt import docopt
from tree_sitter import Language

from language_data import LANGUAGE_METADATA
from process import DataProcessor

if __name__ == '__main__':
    args = docopt(__doc__)

    # Wire the requested grammar into the shared DataProcessor parser.
    selected_lang = args['--language']
    DataProcessor.PARSER.set_language(
        Language('/src/build/py-tree-sitter-languages.so', selected_lang))
    metadata = LANGUAGE_METADATA[selected_lang]
    processor = DataProcessor(language=selected_lang,
                              language_parser=metadata['language_parser'])

    # Extract functions from the input file and emit them as pretty JSON.
    functions = processor.process_single_file(args['INPUT_FILEPATH'])
    print(json.dumps(functions, indent=2))