def bblfsh_node_to_node(self, bblfsh_node: BblfshNode,
                        parent: Optional[Node]) -> Node:
    """Build a `Node` from a `BblfshNode`, attaching the given parent.

    When the Babelfish node carries offsets, the token is sliced out of
    the file content (offsets are first passed through ``binary_to_str``
    when ``convert_to_utf8`` is set — presumably a byte→str offset map,
    TODO confirm); otherwise the node's own token is used and both
    positions stay ``None``.
    """
    start_offset = bblfsh_node.start_position.offset
    end_offset = bblfsh_node.end_position.offset
    if start_offset or end_offset:
        start, end = start_offset, end_offset
        if self.convert_to_utf8:
            start = self.binary_to_str[start]
            end = self.binary_to_str[end]
        token = self.file_content[start:end]
    else:
        start = end = None
        token = bblfsh_node.token
    # Workaround https://github.com/bblfsh/javascript-driver/issues/65
    if not token and bblfsh_node.internal_type == "StringLiteralTypeAnnotation":
        token = bblfsh_node.properties["value"]
    return Node(token=token,
                internal_type=bblfsh_node.internal_type,
                roles=[role_name(rid) for rid in bblfsh_node.roles],
                parent=parent,
                start=start,
                end=end)
def __str__(self):
    """Return a one-line summary of this node's roles, token, type and properties."""
    # Single format call instead of the original chain of `+` concatenations,
    # which also crashed when token/internal_type were not str; the dead
    # commented-out variant has been removed.
    return "BaseNode(roles={}, token={}, internal_type={}, properties={})".format(
        [bblfsh.role_name(role) for role in self.roles],
        self.token,
        self.internal_type,
        self.properties)
def node_to_roles(node: bblfsh.Node):
    """
    Convert the bblfsh roles of a node into a unique string representation.

    Role ids are sorted first so that equal role sets always produce the
    same string.

    :param node: base_node
    :return: the node's role names joined by " | "
    """
    ordered_role_ids = sorted(node.roles)
    names = (bblfsh.role_name(role_id) for role_id in ordered_role_ids)
    return " | ".join(names)
def get_node_properties(tree):
    """Extract token, roles, internal type and position info from a UAST node.

    :param tree: node wrapper exposing ``get_dict``, ``roles``,
        ``internal_type``, ``token`` and ``get``
    :return: dict with keys 'token', 'roles', 'internal_type',
        'start_line', 'start_col', 'end_line', 'end_col'
    """
    D = tree.get_dict()  # cache the dict: the original called get_dict() 3 times
    node_properties = {}
    node_properties['token'] = ""
    node_properties['roles'] = []
    if '@type' in D:
        node_properties['internal_type'] = tree.internal_type
    else:
        node_properties['internal_type'] = None
    try:
        pos = D['@pos']
        node_properties['start_line'] = pos['start']['line']
        node_properties['start_col'] = pos['start']['col']
        node_properties['end_line'] = pos['end']['line']
        node_properties['end_col'] = pos['end']['col']
    except (KeyError, TypeError):
        # No (or malformed) position info: fall back to zeros.
        node_properties['start_line'] = 0
        node_properties['start_col'] = 0
        node_properties['end_line'] = 0
        node_properties['end_col'] = 0
    try:
        node_properties['roles'] = [bblfsh.role_name(r) for r in tree.roles]
    except Exception:
        # Best-effort: keep roles empty if the lookup fails for any reason.
        pass
    if '@token' in D:
        node_properties['token'] = tree.token
    if node_properties['internal_type'] == 'uast:Identifier':
        node_properties['token'] = tree.get()['Name']
    return node_properties
def save_roles(output, nodes_count):
    """Pickle per-node role-name tuples together with their counts.

    :param output: directory where ``roles_count.pickle`` is written;
        created (with parents) if it does not exist
    :param nodes_count: iterable of (role_id iterable, count) pairs
    """
    roles_count = [(tuple(bblfsh.role_name(role_id) for role_id in n), count)
                   for n, count in nodes_count]
    # exist_ok avoids the race between the original exists() check and makedirs().
    os.makedirs(output, exist_ok=True)
    with open(os.path.join(output, 'roles_count.pickle'), 'wb') as f:
        pickle.dump(roles_count, f)
def print_statistics(rules_count, nodes_count):
    """Print the most frequent rules and nodes and dump all rules to disk.

    Fixed to tolerate fewer than 20 entries (the original indexed
    ``[i]`` unconditionally and raised IndexError on short inputs).

    :param rules_count: sequence of (rule, count) pairs, most frequent first
    :param nodes_count: sequence of ((role_id, ...), instances) pairs
    """
    top = 20
    print("Top twenty rules:")
    for i in range(min(top, len(rules_count))):
        print("{}. {}\n".format(i, rules_count[i][0]))
    print("Top twenty nodes:")
    for i in range(min(top, len(nodes_count))):
        print("{}. {}\n\t{}\n".format(
            i,
            [bblfsh.role_name(role_id) for role_id in nodes_count[i][0]],
            nodes_count[i][1][0]))
    with open('../results/rules.bin', 'w') as rules_file:
        # One rule per line; the nested comprehension in the original was redundant.
        rules_file.write("\n".join(str(r) for r, _ in rules_count))
def get_rule(node):
    """Recursively derive grammar rules from a UAST node.

    A leaf produces no rules, only its BaseNode. An internal node produces
    one Rule whose RHS is the list of child BaseNodes and whose LHS
    summarizes this node, plus all rules collected from the subtree.

    :param node: node exposing ``children``, ``roles``, ``token``,
        ``internal_type`` and ``properties``
    :return: (rules, lhs) — the accumulated Rule list and this node's BaseNode
    """
    role_names = [bblfsh.role_name(i) for i in node.roles]
    if len(node.children) == 0:
        # Leaf token is "[role names] token".
        token = "{} {}".format(role_names, node.token)
        return [], BaseNode(node.roles, token, node.internal_type, node.properties)
    rules = []
    rhs = []
    # Build the LHS token with join instead of repeated += (quadratic);
    # a single space precedes every child token, as before.
    parts = ["{}".format(role_names)]
    for child in node.children:
        c_rules, c_node = get_rule(child)
        rhs.append(c_node)
        parts.append(" ")
        parts.append(c_node.token)
        rules.extend(c_rules)
    parts.append("\n")
    lhs = BaseNode(node.roles, "".join(parts), node.internal_type, node.properties)
    rules.append(Rule(rhs, lhs))
    return rules, lhs
def plot_tsne(nodes_dict):
    """Project node role-vectors to 2D with t-SNE and scatter-plot them.

    Marker size reflects the number of instances of each node; every point
    is annotated with its role names. (The original docstring referred to a
    nonexistent ``id_word_vec`` parameter.)

    :param nodes_dict: sequence of (node, instances) pairs, where each node
        is usable by ``BaseNode.roles_as_vector`` and iterable over role ids
    """
    nodes = []
    counts = []
    # Single pass instead of iterating nodes_dict twice.
    for node, instances in nodes_dict:
        nodes.append(node)
        counts.append(len(instances))
    tsne = TSNE(n_components=2)
    X_tsne = tsne.fit_transform([BaseNode.roles_as_vector(n) for n in nodes])
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], s=counts)
    for i, roles in enumerate(nodes):
        plt.annotate([bblfsh.role_name(role) for role in roles],
                     (X_tsne[i, 0], X_tsne[i, 1]))
    plt.show()
def _group_quote_predictions(
        self, vnodes_y: Sequence[VirtualNode],
        vnodes: Sequence[VirtualNode]) -> QuotedNodeTripleMapping:
    """Group (opening quote, string body, closing quote) triples.

    Scans consecutive triples of virtual nodes; a triple qualifies when
    both outer nodes are predictable (present in ``vnodes_y``), the middle
    node has a UAST node with the STRING role, and the closing quote class
    matches the opening one. The opening node maps to the triple, the
    closing node maps to None.
    """
    quote_classes = frozenset(
        (CLASS_INDEX[CLS_DOUBLE_QUOTE], CLASS_INDEX[CLS_SINGLE_QUOTE]))
    predictable = {id(vnode): i for i, vnode in enumerate(vnodes_y)}
    grouped = OrderedDict()
    triples = zip(vnodes, islice(vnodes, 1, None), islice(vnodes, 2, None))
    for left, middle, right in triples:
        # Guard order matters: left.y/right.y are only touched once both
        # outer nodes are known to be predictable.
        if (id(left) not in predictable or id(right) not in predictable
                or middle.node is None):
            continue
        opening = left.y[-1]
        if opening not in quote_classes or right.y[0] != opening:
            continue
        middle_roles = frozenset(role_name(rid) for rid in middle.node.roles)
        if "STRING" in middle_roles:
            grouped[id(left)] = left, middle, right
            grouped[id(right)] = None
    return grouped
def testRoleIdName(self) -> None:
    """role_id and role_name round-trip in both directions."""
    name_of_one = role_name(1)
    self.assertEqual(role_id(name_of_one), 1)
    identifier_id = role_id("IDENTIFIER")
    self.assertEqual(role_name(identifier_id), "IDENTIFIER")
def _compute_row(self, node: bblfsh.Node) -> Iterable[Tuple[int, int]]:
    """Yield a (1, column) pair for each of the node's selected roles."""
    index = self.selected_names_index
    names = (bblfsh.role_name(role_id) for role_id in node.roles)
    for name in names:
        if name in index:
            yield 1, index[name]
def main():
    """Entry point.

    Parses the input files in a thread pool — either source files sent to a
    Babelfish server, or pre-parsed UASTs stored in parquet files — and
    aggregates the internal types, UAST roles and reserved tokens seen,
    then calls ``generate_files`` with the results. Returns 1 if any file
    failed to parse.
    """
    args = parse_args()
    slogging.setup(args.log_level, False)
    # One Babelfish client per worker thread, created lazily on first use.
    clients = threading.local()
    pool = ThreadPoolExecutor(max_workers=args.threads)
    log = logging.getLogger("main")
    log.info("Will parse %d files in %d threads", len(args.input), args.threads)
    internal_types = defaultdict(int)
    roles = defaultdict(int)
    reserved = set()
    language = args.parquet_language
    inputs = list(handle_input_arg(args.input))
    progress = tqdm(total=len(inputs))
    progress_lock = threading.Lock()
    errors = False

    def analyze_code_file(path: str):
        # Parse one source file via Babelfish and fold its UAST statistics
        # into the shared accumulators. Runs on a pool thread.
        nonlocal errors
        if errors:
            return
        try:
            try:
                # EAFP: reuse this thread's client if it already exists.
                client = clients.client
            except AttributeError:
                client = bblfsh.BblfshClient(args.bblfsh)
                clients.client = client
            response = client.parse(path)
            nonlocal language
            if not language:
                # The first successfully parsed file fixes the expected language.
                language = response.language
            elif language != response.language:
                log.warning("dropped %s - language mismatch %s != %s",
                            path, language, response.language)
                return
            content = Path(path).read_text()
            analyze_uast(path, content, response.uast, internal_types, roles,
                         reserved)
        except:  # noqa: E722
            log.exception("Parsing %s", path)
            errors = True
        finally:
            with progress_lock:
                progress.disable = False  # this is needed, do not remove
                progress.update(1)

    def analyze_parquet_row(row: pandas.Series, filepath: str):
        # Fold the UAST statistics of one parquet row (path, content, uast)
        # into the shared accumulators. Runs on a pool thread.
        nonlocal errors
        if errors:
            return
        nonlocal language
        try:
            path = "%s:%s" % (filepath, row.path)
            analyze_uast(path, row.content.decode(errors="ignore"),
                         bblfsh.Node.FromString(row.uast), internal_types,
                         roles, reserved)
        except DecodeError as e:
            log.warning(e)
        except:  # noqa: E722
            log.exception("Parsing %s", row.path)
            errors = True
        finally:
            with progress_lock:
                progress.disable = False  # this is needed, do not remove
                progress.update(1)

    try:
        if args.parquet:
            if not language:
                raise ValueError(
                    "--parquet-language must be specified with --parquet.")
            with progress:
                for filepath in inputs:
                    try:
                        data = pandas.read_parquet(filepath)
                    except:  # noqa: E722
                        log.warning("Bad parquet file %s", filepath)
                    else:
                        analyze = partial(analyze_parquet_row,
                                          filepath=filepath)
                        for _, row in data.iterrows():
                            # Each row adds one unit of work beyond the
                            # per-file total the bar started with.
                            progress.total += 1
                            pool.submit(analyze, row)
                        progress.update(1)
        else:
            with progress:
                for filepath in inputs:
                    pool.submit(analyze_code_file, filepath)
    finally:
        pool.shutdown()
    if errors:
        return 1
    reserved.discard("")
    log.info("Internal types: %d", len(internal_types))
    log.info("UAST roles: %d", len(roles))
    log.info("Reserved: %d", len(reserved))
    # Convert numeric role ids to readable names before generating output.
    roles = {bblfsh.role_name(role_id): n for role_id, n in roles.items()}
    generate_files(args.output, internal_types, roles, reserved)
def merge_roles(roles: Iterable[int]):
    """Join the names of the given role ids, sorted, with " | "."""
    ordered = sorted(roles)
    names = (bblfsh.role_name(role) for role in ordered)
    return " | ".join(names)
def __repr__(self):
    """Render the roles as "name/id" pairs inside square brackets."""
    pairs = ["{}/{}".format(bblfsh.role_name(role), role)
             for role in self.roles]
    return "[" + " ".join(pairs) + "]"
def named_roles(self):
    """Return the human-readable names of this node's roles, in order."""
    return [bblfsh.role_name(role_id) for role_id in self.roles]
def testRoleIdName(self):
    """role_id and role_name are inverses of each other."""
    round_tripped_id = role_id(role_name(1))
    assert round_tripped_id == 1
    round_tripped_name = role_name(role_id("IDENTIFIER"))
    assert round_tripped_name == "IDENTIFIER"