def parse_ast_tree(filename):
    """Rebuild a tree of ``ast`` nodes from a serialized tree file.

    The file is read line by line:

    * ``>node_id<TAB>NodeType`` declares a node. ``NodeType`` is matched
      case-insensitively against the node classes exposed by the ``ast``
      module, and a fresh instance is created with an empty ``children``
      list attached.
    * ``<parent_id=child_id,child_id,...`` declares the children of a node.
    * Any other line is ignored.

    After linking, every node is passed to ``AstNodes().index``; if that
    call raises, the file name is printed and processing continues
    (best-effort diagnostics, presumably flagging node types unknown to
    ``AstNodes`` -- confirm against that class).

    Args:
        filename: Path of the serialized tree file.

    Returns:
        The node whose id is ``'0'``.

    Raises:
        KeyError: If a node type is not a known ``ast`` class, a link refers
            to an undeclared node id, or no node with id ``'0'`` exists.
    """
    # Map lowercase class name -> ast node class.  Only genuine ast.AST
    # subclasses are instantiated; calling every attribute of the module
    # (as was done before) also invokes unrelated callables such as the
    # module's command-line entry point.
    node_types = {}
    for attr_name in dir(ast):
        candidate = getattr(ast, attr_name)
        if not (isinstance(candidate, type) and issubclass(candidate, ast.AST)):
            continue
        try:
            candidate()
        except TypeError:
            # Node classes that cannot be built without arguments are skipped.
            continue
        node_types[attr_name.lower()] = candidate

    nodes = {}
    links = {}
    with open(filename) as handle:
        for line in handle:
            if line.startswith("<"):
                parts = line[1:].strip("\n").split("=")
                links[parts[0]] = parts[1].split(",")
            elif line.startswith(">"):
                parts = line[1:].strip("\n").split("\t")
                node = node_types[parts[1].lower()]()
                node.children = []
                nodes[parts[0]] = node

    # Attach children in the order they were listed for each parent.
    for parent_id, child_ids in sorted(links.items()):
        for child_id in child_ids:
            nodes[parent_id].children.append(nodes[child_id])

    # Best-effort check: report the file if any node cannot be indexed.
    dot = AstNodes()
    for node in nodes.values():
        try:
            dot.index(node)
        except Exception:
            print(filename)

    return nodes['0']
def parse_src_files(basefolder, seperate_trees=False, verbose=0):
    """Load a dataset of parsed programs, dispatching on the folder suffix.

    * ``...python``: files listed by ``get_ast_src_files`` are parsed with
      ``ast_parse_file``; when ``verbose == 1`` the result is handed to
      ``dump``.
    * ``...python_trees``: files listed by ``get_ast_src_files`` are parsed
      with ``parse_ast_tree``.
    * ``...cpp``: files listed by ``get_dot_src_files`` are parsed with
      ``parse_tree(name, seperate_trees)``; every tree returned for a file
      becomes its own sample carrying that file's label.

    Args:
        basefolder: Dataset folder path; its suffix selects the loader.
        seperate_trees: Forwarded to ``parse_tree`` (``cpp`` branch only).
        verbose: ``1`` triggers ``dump`` in the ``python`` branch.

    Returns:
        ``(X, y, tags, feature_dict)`` where ``X`` and ``y`` are NumPy
        arrays, ``tags`` is the third value returned by the file lister and
        ``feature_dict`` is a new ``AstNodes`` or ``DotNodes``.  ``None`` is
        returned when the suffix matches none of the cases above.
    """
    if basefolder.endswith("python"):
        X_names, y, problems = get_ast_src_files(basefolder)
        parsed = [ast_parse_file(path) for path in tqdm(X_names)]
        X = np.array(parsed)
        y = np.array(y)
        if verbose == 1:
            dump(X, y, X_names)
        return X, y, problems, AstNodes()

    if basefolder.endswith("python_trees"):
        X_names, y, problems = get_ast_src_files(basefolder)
        parsed = [parse_ast_tree(path) for path in tqdm(X_names)]
        return np.array(parsed), np.array(y), problems, AstNodes()

    if basefolder.endswith("cpp"):
        X_names, y, problems = get_dot_src_files(basefolder)
        all_trees = []
        all_labels = []
        all_names = []
        for position, path in enumerate(tqdm(X_names)):
            program_trees = parse_tree(path, seperate_trees)
            all_trees.extend(program_trees)
            # Repeat the file's label and name once per tree it produced.
            all_labels.extend([y[position]] * len(program_trees))
            all_names.extend([path] * len(program_trees))
        return np.array(all_trees), np.array(all_labels), problems, DotNodes()

    # Unrecognised folder suffix: no dataset is loaded.
    return None
def parse_src_files(basefolder, seperate_trees=False, verbose=0):
    """Load a Python dataset as parsed AST trees.

    NOTE: this definition rebinds ``parse_src_files`` and therefore replaces
    the earlier function of the same name defined above it in this file.

    Files are listed with ``get_ast_src_files``.  When ``basefolder`` ends
    with ``"python"`` each file is parsed with ``ast_parse_file``; for any
    other folder each file is read with ``parse_ast_tree``.

    Args:
        basefolder: Dataset folder path; its suffix selects the parser.
        seperate_trees: Accepted for signature compatibility; unused here.
        verbose: Accepted for signature compatibility; unused here.

    Returns:
        ``(X, y, tags, AstNodes())`` where ``X`` and ``y`` are NumPy arrays
        and ``tags`` is the third value returned by ``get_ast_src_files``.
    """
    is_python_sources = basefolder.endswith("python")
    X_names, y, problems = get_ast_src_files(basefolder)

    if is_python_sources:
        trees = [ast_parse_file(path) for path in tqdm(X_names)]
    else:
        trees = [parse_ast_tree(path) for path in tqdm(X_names)]

    return np.array(trees), np.array(y), problems, AstNodes()
def show_embeding(model, basefolder):
    """Plot and tabulate the model's embedding weights per AST node type.

    The embedding matrix ``model.embed.W.data`` is passed through ``scale``
    and each row is labelled with the matching entry of
    ``AstNodes().nodetypes`` followed by ``AstNodes().NONE``.  The rows are
    then plotted, clustered with KMeans (10 clusters) and queried with a
    5-nearest-neighbour estimator; the tables are produced by the
    ``cluster_table`` / ``neighbors_table`` helpers.

    Args:
        model: Model exposing ``embed.W.data``.
        basefolder: Forwarded to ``cluster_plot`` as ``basefolder``.
    """
    # Word Embedding Analysis
    embedding = scale(model.embed.W.data)
    row_ids = np.arange(embedding.shape[0])
    feature_dict = AstNodes()
    node_labels = np.array(feature_dict.nodetypes + [feature_dict.NONE])

    # Previously tried estimators (disabled):
    #   KernelPCA(n_components=2, kernel="rbf"), PCA(n_components=2),
    #   TSNE(n_components=2, random_state=None),
    #   DBSCAN(eps=0.3, min_samples=10)
    # NOTE(review): confirm this data_plot call is meant to be active.
    data_plot(embedding, row_ids, node_labels)

    kmeans = KMeans(n_clusters=10, init='k-means++')
    cluster_plot(kmeans, embedding, row_ids, node_labels, basefolder=basefolder)
    print("*" * 10, " Cluster AST Node types:", "*" * 10)
    cluster_table(kmeans, embedding, row_ids, node_labels)

    nearest = NearestNeighbors(n_neighbors=5)
    print("*" * 10, " Neighbors of AST Node types:", "*" * 10)
    neighbors_table(nearest, embedding, row_ids, node_labels)
def getParams(dataset, layers, cell, units, authors):
    """Return the parameter count of a freshly constructed recursive model.

    Args:
        dataset: ``"python"`` selects ``AstNodes`` as the feature
            dictionary; any other value selects ``DotNodes``.
        layers: Forwarded to the model as ``layers``.
        cell: ``"lstm"`` builds a ``RecursiveLSTM``; ``"bilstm"`` builds a
            ``RecursiveBiLSTM``.
        units: First positional argument of the model constructor.
        authors: Second positional argument of the model constructor.

    Returns:
        Whatever ``model.params_count()`` returns for the built model.

    Raises:
        ValueError: If ``cell`` is neither ``"lstm"`` nor ``"bilstm"``.
            (Previously this case fell through with ``model`` unbound and
            surfaced as an ``UnboundLocalError``.)
    """
    # Validate first so a bad argument fails fast with a clear message.
    if cell not in ("lstm", "bilstm"):
        raise ValueError(
            "unsupported cell %r; expected 'lstm' or 'bilstm'" % (cell,))

    nodes = AstNodes() if dataset == "python" else DotNodes()

    if cell == "lstm":
        model_class = RecursiveLSTM
    else:
        model_class = RecursiveBiLSTM

    # Both model kinds are built with the inner cell fixed to "lstm".
    model = model_class(units, authors, layers=layers, dropout=0.2,
                        feature_dict=nodes, classes=None, cell="lstm",
                        residual=False)
    return model.params_count()
# path = R"C:\Users\bms\Files\current\research\stylemotry\stylemotery_code\saved_models\3_treelstm_3tree_500_70_labels1_epoch_206.my" # model = RecursiveTreeLSTM(n_children=1, n_units=500,n_label=70, dropout=0.2,feature_dict=TreeFeatures()) # serializers.load_npz(path,model) # print_model(model, depth=1, output=sys.stdout) # show_embeding(model,basefolder=os.path.join(basefolder,model_name+"_embed")) # show_authors(model,basefolder=os.path.join(basefolder,model_name+"_authors")) # lstm print("LSTM") model_name = "lstm" path = R"C:\Users\bms\Files\current\research\stylemotry\stylemotery_code\saved_models\lstm\1_lstm_100_python_70_labels1_1_epoch_409.my" model = RecursiveLSTM(n_units=100, layers=1, n_label=70, dropout=0.2, feature_dict=AstNodes()) serializers.load_npz(path, model) print_model(model, depth=1, output=sys.stdout) show_embeding(model, basefolder=os.path.join(basefolder, model_name + "_embed")) # bilstm # print("BiLSTM") model_name = "bilstm" path = R"C:\Users\bms\Files\current\research\stylemotry\stylemotery_code\saved_models\bilstm\1_bilstm_100_python_70_labels1_epoch_333.my" model = RecursiveBiLSTM(n_units=100, layers=1, n_label=70, dropout=0.2, feature_dict=AstNodes(), peephole=False)