Example 1
from collections import defaultdict

# TokenParser and FakeVocabulary come from the surrounding project.


class UastIds2Bag:
    """
    Converts a UAST to a bag-of-identifiers.
    """
    def __init__(self, vocabulary, token_parser=None):
        """
        :param vocabulary: The mapping from tokens to bag keys. If None, no mapping is performed.
        :param token_parser: Specify a custom token parser if you want to use one. \
            :class:`TokenParser` is used if it is not specified.
        """
        self._vocabulary = FakeVocabulary() if vocabulary is None else vocabulary
        self._token_parser = TokenParser() if token_parser is None else token_parser

    @property
    def vocabulary(self):
        return self._vocabulary

    def uast_to_bag(
            self,
            uast,
            roles_filter="//*[@roleIdentifier and not(@roleQualified)]"):
        """
        Converts a UAST to a bag-of-words. The weights are identifier frequencies.
        The identifiers are preprocessed by :class:`TokenParser`.

        :param uast: The UAST root node.
        :param roles_filter: The libuast xpath query to filter identifiers.
        :return: The bag mapping identifier keys to their frequencies.
        """
        import bblfsh
        nodes = bblfsh.filter(uast, roles_filter)
        bag = defaultdict(int)
        for node in nodes:
            for sub in self._token_parser.process_token(node.token):
                try:
                    bag[self._vocabulary[sub]] += 1
                except KeyError:
                    continue
        return bag
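A minimal usage sketch for this variant, assuming a Babelfish server is running and the older bblfsh Python client API (`BblfshClient`, whose `parse()` response carries the `uast`); the endpoint, file name, and vocabulary below are placeholders. Unknown subtokens are silently dropped by the `KeyError` handler:

import bblfsh

# Hypothetical setup; "0.0.0.0:9432" is the client's default endpoint.
client = bblfsh.BblfshClient("0.0.0.0:9432")
uast = client.parse("example.py").uast

vocabulary = {"foo": 0, "bar": 1}  # tokens outside this dict are skipped
bag = UastIds2Bag(vocabulary).uast_to_bag(uast)
print(dict(bag))  # e.g. {0: 3, 1: 1}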
Example 2
from collections import defaultdict, deque

# TokenParser, FakeVocabulary and SIMPLE_IDENTIFIER come from the surrounding project.


class UastIds2Bag:
    """
    Converts a UAST to a bag-of-identifiers.
    """
    def __init__(self, vocabulary, token_parser=None):
        """
        :param vocabulary: The mapping from tokens to bag keys. If None, no mapping is performed.
        :param token_parser: Specify a custom token parser if you want to use one. \
            :class:`TokenParser` is used if it is not specified.
        """
        self._vocabulary = vocabulary if vocabulary is not None else FakeVocabulary()
        self._token_parser = TokenParser() if token_parser is None else token_parser

    @property
    def vocabulary(self):
        return self._vocabulary

    def uast_to_bag(self, uast, role=SIMPLE_IDENTIFIER):
        """
        Converts a UAST to a bag-of-words. The weights are identifier frequencies.
        The identifiers are preprocessed by :class:`TokenParser`.

        :param uast: The UAST root node.
        :param role: The bblfsh Node role to filter by; only nodes carrying it contribute tokens.
        :return: The bag mapping identifier keys to their frequencies.
        """
        queue = deque([uast])  # breadth-first traversal; deque makes popleft() O(1)
        bag = defaultdict(int)
        while queue:
            node = queue.popleft()
            if role in node.roles:
                for sub in self._token_parser.process_token(node.token):
                    try:
                        bag[self._vocabulary[sub]] += 1
                    except KeyError:
                        continue
            queue.extend(node.children)
        return bag
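`FakeVocabulary` is not shown in these snippets; a minimal sketch consistent with how it is used (an identity mapping, so every subtoken becomes its own bag key and no `KeyError` is ever raised) could look like this:

class FakeVocabulary:
    """Identity mapping: returns the token itself as the bag key."""
    def __getitem__(self, item):
        return item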
Example 3
from collections import defaultdict, deque

# TokenParser, FakeVocabulary and SIMPLE_IDENTIFIER come from the surrounding project.


class UastIds2Bag:
    """
    Converts a UAST to a bag-of-identifiers.
    """
    def __init__(self, vocabulary):
        """
        :param vocabulary: The mapping from tokens to bag keys. \
                           If None, no mapping is performed.
        """
        self._vocabulary = vocabulary if vocabulary is not None else FakeVocabulary()
        self._token_parser = TokenParser()

    @property
    def vocabulary(self):
        return self._vocabulary

    def uast_to_bag(self, uast):
        """
        Converts a UAST to a bag-of-words. The weights are identifier frequencies.
        The identifiers are preprocessed by :class:`TokenParser`.

        :param uast: The UAST root node.
        :return: The bag mapping identifier keys to their frequencies.
        """
        queue = deque([uast])  # breadth-first traversal; deque makes popleft() O(1)
        bag = defaultdict(int)
        while queue:
            node = queue.popleft()
            if SIMPLE_IDENTIFIER in node.roles:
                for sub in self._token_parser.process_token(node.token):
                    try:
                        bag[self._vocabulary[sub]] += 1
                    except KeyError:
                        continue
            queue.extend(node.children)
        return bag
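`TokenParser.process_token()` is likewise external to these snippets. A simplified, hypothetical stand-in that splits identifiers on underscores and camelCase boundaries and lowercases the parts (zero-width regex splits require Python 3.7+) would behave like this:

import re


class TokenParser:
    """Toy splitter: snake_case and camelCase -> lowercase subtokens."""
    _camel = re.compile(r"(?<=[a-z0-9])(?=[A-Z])")

    def process_token(self, token):
        for part in token.split("_"):
            for sub in self._camel.split(part):
                if sub:
                    yield sub.lower()


print(list(TokenParser().process_token("myVar_name")))  # ['my', 'var', 'name']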
Example 4
from collections import defaultdict

import numpy
from scipy.sparse import coo_matrix

# Repo2Base, TokenParser and SIMPLE_IDENTIFIER come from the surrounding project.


class Repo2CooccBase(Repo2Base):
    """
    Converts UASTs to co-occurrence matrices.
    """

    def __init__(self, *args, **kwargs):
        super(Repo2CooccBase, self).__init__(*args, **kwargs)
        self._token_parser = TokenParser()

    def convert_uasts(self, file_uast_generator):
        word2ind = self._get_vocabulary()
        dok_matrix = defaultdict(int)
        for file_uast in file_uast_generator:
            self._traverse_uast(file_uast.response.uast, word2ind, dok_matrix)

        n_tokens = len(word2ind)
        mat = coo_matrix((n_tokens, n_tokens), dtype=numpy.float32)

        if n_tokens == 0:
            return [], mat

        mat.row = row = numpy.empty(len(dok_matrix), dtype=numpy.int32)
        mat.col = col = numpy.empty(len(dok_matrix), dtype=numpy.int32)
        mat.data = data = numpy.empty(len(dok_matrix), dtype=numpy.float32)
        for i, (coord, val) in enumerate(sorted(dok_matrix.items())):
            row[i], col[i] = coord
            data[i] = val

        return self._get_result(word2ind, mat)

    def _get_vocabulary(self):
        raise NotImplementedError

    def _get_result(self, word2ind, mat):
        raise NotImplementedError

    def _update_dict(self, generator, word2ind, tokens):
        raise NotImplementedError

    def _flatten_children(self, root):
        ids = []
        stack = list(root.children)
        # Appending to `stack` while iterating it extends the loop, so this
        # visits the whole subtree below non-identifier nodes.
        for node in stack:
            if SIMPLE_IDENTIFIER in node.roles:
                ids.append(node)
            else:
                stack.extend(node.children)
        return ids

    @staticmethod
    def _all2all(words, word2ind):
        for i in range(len(words)):
            for j in range(i + 1, len(words)):
                try:
                    wi = word2ind[words[i]]
                    wj = word2ind[words[j]]
                except KeyError:
                    continue
                yield wi, wj, 1
                yield wj, wi, 1

    def _process_node(self, root, word2ind, mat):
        children = self._flatten_children(root)

        tokens = []
        for ch in children:
            self._update_dict(self._token_parser.process_token(ch.token), word2ind, tokens)

        # str.strip() never returns None, so only the emptiness check matters.
        if root.token.strip() and SIMPLE_IDENTIFIER in root.roles:
            self._update_dict(self._token_parser.process_token(root.token), word2ind, tokens)

        for triplet in self._all2all(tokens, word2ind):
            mat[(triplet[0], triplet[1])] += triplet[2]
        return children

    def _extract_ids(self, root):
        stack = [root]  # pop() takes from the end, so this is depth-first
        while stack:
            node = stack.pop()
            if SIMPLE_IDENTIFIER in node.roles:
                yield node.token
            stack.extend(node.children)

    def _traverse_uast(self, root, word2ind, dok_mat):
        """
        Traverses UAST and extract the co-occurrence matrix.
        """
        stack = [root]
        new_stack = []

        while stack:
            for node in stack:
                children = self._process_node(node, word2ind, dok_mat)
                new_stack.extend(children)
            stack = new_stack
            new_stack = []
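The `convert_uasts` pattern above (accumulate counts in a `defaultdict` keyed by `(row, col)` pairs, then pour the sorted items into a preallocated `coo_matrix`) can be seen in isolation in this small sketch; the matrix size and entries are made up:

from collections import defaultdict

import numpy
from scipy.sparse import coo_matrix

dok = defaultdict(int)
dok[(0, 1)] += 2  # e.g. tokens 0 and 1 co-occurred twice
dok[(1, 2)] += 1

mat = coo_matrix((3, 3), dtype=numpy.float32)
mat.row = numpy.empty(len(dok), dtype=numpy.int32)
mat.col = numpy.empty(len(dok), dtype=numpy.int32)
mat.data = numpy.empty(len(dok), dtype=numpy.float32)
for i, ((r, c), val) in enumerate(sorted(dok.items())):
    mat.row[i], mat.col[i], mat.data[i] = r, c, val

print(mat.toarray())  # [[0. 2. 0.], [0. 0. 1.], [0. 0. 0.]]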
Example 5
from collections import defaultdict, deque
from itertools import permutations, product
from typing import Dict, Tuple

import numpy
from scipy.sparse import coo_matrix, diags

# Model2Base, UASTModel, Cooccurrences, TokenParser and EDGE_TYPES come from
# the surrounding project.


class ProxBase(Model2Base):
    """
    Contains common utilities for proximity matrix models.

    The proximity matrix captures structural information of the graph. If A is the adjacency
    matrix, useful proximity matrices include A^2, A(A^k - I)/(A - I), etc. To obtain
    embeddings for the nodes (the entities corresponding to proximity matrix rows), we simply
    decompose it.
    """
    MODEL_FROM_CLASS = UASTModel
    MODEL_TO_CLASS = Cooccurrences

    def __init__(self, edges=EDGE_TYPES, *args, **kwargs):
        super(ProxBase, self).__init__(*args, **kwargs)
        self.edges = set(edges)
        self._token_parser = TokenParser()
        self._clear()

    def convert_model(self, model) -> Cooccurrences:
        """
        Updates the attributes by processing the UASTs in the input model,
        then converts them into a Cooccurrences model.

        :param model: UASTModel instance.
        :return: Cooccurrences model for all UASTs in `model`.
        """
        for uast in model.uasts:
            self._traverse_uast(uast)

        roles_to_roles = defaultdict(int)
        tokens_to_tokens = defaultdict(int)
        roles_to_tokens = defaultdict(int)

        def add_permutations(edge_type, node_items_list, item_to_item):
            if edge_type in self.edges:
                for node_items in node_items_list:
                    for node_item_a, node_item_b in permutations(node_items, 2):
                        item_to_item[(node_item_a, node_item_b)] += 1

        def add_product(edge_type, items_a, items_b, item_to_item):
            if edge_type in self.edges:
                for item_a, item_b in product(items_a, items_b):
                    item_to_item[(item_a, item_b)] += 1

        add_permutations("r", self.roles, roles_to_roles)
        add_permutations("t", self.tokens, tokens_to_tokens)

        for node_roles, node_tokens in zip(self.roles, self.tokens):
            add_product("rt", node_roles, node_tokens, roles_to_tokens)

        for node_a, node_b in self.dok_matrix:
            roles_a = self.roles[node_a]
            roles_b = self.roles[node_b]
            tokens_a = self.tokens[node_a]
            tokens_b = self.tokens[node_b]

            add_product("R", roles_a, roles_b, roles_to_roles)
            add_product("T", tokens_a, tokens_b, tokens_to_tokens)
            add_product("RT", roles_a, tokens_b, roles_to_tokens)

        if roles_to_roles or roles_to_tokens:
            n_roles = len(self.role2ind)
        else:
            n_roles = 0

        if tokens_to_tokens or roles_to_tokens:
            n_tokens = len(self.token2ind)
        else:
            n_tokens = 0

        n_nodes = n_roles + n_tokens
        n_values = len(roles_to_roles) + len(tokens_to_tokens) + len(roles_to_tokens)
        mat = coo_matrix((n_nodes, n_nodes), dtype=numpy.float32)

        mat.row = row = numpy.empty(n_values, dtype=numpy.int32)
        mat.col = col = numpy.empty(n_values, dtype=numpy.int32)
        mat.data = data = numpy.empty(n_values, dtype=numpy.float32)

        def fill_mat(item_to_item, offset):
            # fill_mat.count is the running write position shared across calls;
            # `offset` shifts role/token indices into the combined matrix.
            for i, (coord, val) in enumerate(sorted(item_to_item.items())):
                row[i + fill_mat.count] = coord[0] + offset[0]
                col[i + fill_mat.count] = coord[1] + offset[1]
                data[i + fill_mat.count] = val
            fill_mat.count += len(item_to_item)

        fill_mat.count = 0

        fill_mat(roles_to_roles, (0, 0))
        fill_mat(roles_to_tokens, (0, n_roles))
        fill_mat(tokens_to_tokens, (n_roles, n_roles))

        # Symmetrize while counting the diagonal only once.
        mat = coo_matrix(mat + mat.T - diags(mat.diagonal()))
        tokens, mat = self._adj_to_feat(self.role2ind, self.token2ind, mat)
        self._clear()

        prox = Cooccurrences()
        prox.construct(tokens=tokens, matrix=mat)
        return prox

    def _adj_to_feat(self, role2ind: Dict[int, int], token2ind: Dict[int, int],
                     mat) -> Tuple:
        """
        This must be implemented in the child classes.

        :param role2ind: Mapping from roles to indices, starting with 0.
        :param token2ind: Mapping from tokens to indices, starting with 0.
        :param mat: Adjacency matrix (:class:`scipy.sparse.coo_matrix`) with rows corresponding
                    to node roles followed by node tokens.
        :return: Tuple of ("tokens", "matrix"). "tokens" are generalized tokens (usually
                 roles + tokens); "matrix" rows correspond to "tokens".
        """
        raise NotImplementedError

    def _clear(self):
        """
        Release memory.
        """
        self.roles = list()
        self.tokens = list()
        self.role2ind = dict()
        self.token2ind = dict()
        self.dok_matrix = defaultdict(int)

    def _traverse_uast(self, root) -> None:
        """
        Traverses the UAST and extracts the adjacency matrix.

        :param root: UAST root node.
        :return: None
        """
        n_nodes = len(self.roles)
        queue = deque([(root, n_nodes)])  # (node, node_idx)

        while queue:
            node, node_idx = queue.popleft()
            node_tokens = list(self._token_parser.process_token(node.token))

            for role in node.roles:
                self.role2ind.setdefault(role, len(self.role2ind))
            for token in node_tokens:
                self.token2ind.setdefault(token, len(self.token2ind))

            self.roles.append([self.role2ind[role] for role in node.roles])
            self.tokens.append(
                [self.token2ind[token] for token in node_tokens])

            for ch in node.children:
                n_nodes += 1
                self.dok_matrix[(node_idx, n_nodes)] += 1
                queue.append((ch, n_nodes))
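The `mat + mat.T - diags(mat.diagonal())` step in `convert_model` mirrors an upper-triangular accumulation into a full symmetric matrix while keeping the diagonal counted once. A tiny standalone check with made-up values:

import numpy
from scipy.sparse import coo_matrix, diags

upper = coo_matrix(numpy.array([[2., 1., 0.],
                                [0., 3., 4.],
                                [0., 0., 1.]], dtype=numpy.float32))
sym = coo_matrix(upper + upper.T - diags(upper.diagonal()))
print(sym.toarray())
# [[2. 1. 0.]
#  [1. 3. 4.]
#  [0. 4. 1.]]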