Example #1
def get_dataset_from(
        data_dirs: List[RichPath],
        use_func_names: bool = False,
        max_files_per_dir: Optional[int] = None) -> List[Dict[str, Any]]:
    data_files = sorted(
        get_data_files_from_directory(data_dirs, max_files_per_dir))
    data = list(
        chain(*chain(list(f.read_by_file_suffix()) for f in data_files)))

    if use_func_names:
        # This task tries to match the function name to the code, by setting the function name as the query
        for sample in data:
            # Replace the query tokens with the function name, broken up into its sub-tokens:
            sample['docstring_tokens'] = split_identifier_into_parts(
                sample['func_name'])

            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            sample['code_tokens'] = [
                Vocabulary.get_unk() if token == sample['func_name'] else token
                for token in sample['code_tokens']
            ]
    return data
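
A minimal, runnable sketch of the same transformation on a toy sample. The sample dict and the "%UNK%" placeholder are assumptions for illustration (the placeholder stands in for Vocabulary.get_unk()):

from dpu_utils.codeutils import split_identifier_into_parts

UNK = "%UNK%"  # stand-in for Vocabulary.get_unk()

sample = {
    "func_name": "getUserName",
    "docstring_tokens": ["returns", "the", "user", "name"],
    "code_tokens": ["def", "getUserName", "(", "self", ")", ":",
                    "return", "self", ".", "name"],
}

# Use the function name, split into sub-tokens, as the query ...
sample["docstring_tokens"] = split_identifier_into_parts(sample["func_name"])
# ... and mask every occurrence of the function name in the code:
sample["code_tokens"] = [UNK if tok == sample["func_name"] else tok
                         for tok in sample["code_tokens"]]

print(sample["docstring_tokens"])  # e.g. ['get', 'user', 'name']
print(sample["code_tokens"])       # 'getUserName' replaced by '%UNK%'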
Example #2
    def load_data_from_sample(name: str,
                              metadata: Dict[str, Any],
                              data: List[str],
                              result_holder: Dict[str, Any],
                              hyperparameters: Dict[str, Any],
                              is_train: bool = True) -> bool:
        label_embedding_style = hyperparameters[
            f'{name}_embedding_style'].lower()
        num_nodes = len(data)

        if label_embedding_style == 'token':
            # Translate node labels using the token vocabulary:
            node_labels = np.zeros((num_nodes, ), dtype=np.uint16)
            for (node, label) in enumerate(data):
                if metadata[f'{name}_vocab'].is_unk(label):
                    label = TokenEmbedder.filter_literals(
                        label
                    )  # UNKs that are literals will be converted to special symbols.
                node_labels[node] = metadata[f'{name}_vocab'].get_id_or_unk(
                    label)
            result_holder[f'{name}_token_ids'] = node_labels

        elif label_embedding_style == 'subtoken':
            max_num_subtokens = hyperparameters[f'{name}_max_subtokens']
            node_subtokens = np.zeros((num_nodes, max_num_subtokens),
                                      dtype=np.uint16)
            node_subtoken_length = np.zeros(num_nodes, dtype=np.uint8)
            for (node, label) in enumerate(data):
                filtered_label = TokenEmbedder.filter_literals(label)
                if filtered_label == label:
                    subtoken_ids = metadata[
                        f'{name}_subtoken_vocab'].get_id_or_unk_multiple(
                            split_identifier_into_parts(
                                label))[:max_num_subtokens]
                elif metadata[f'{name}_subtoken_vocab'].is_unk(label):
                    subtoken_ids = metadata[
                        f'{name}_subtoken_vocab'].get_id_or_unk_multiple(
                            [filtered_label])
                else:
                    subtoken_ids = metadata[
                        f'{name}_subtoken_vocab'].get_id_or_unk_multiple(
                            [label])
                node_subtokens[node, :len(subtoken_ids)] = subtoken_ids
                node_subtoken_length[node] = len(subtoken_ids)
            result_holder[f'{name}_subtoken_ids'] = node_subtokens
            result_holder[f'{name}_subtoken_lengths'] = node_subtoken_length

        elif label_embedding_style == 'charcnn':
            # Translate node labels into character-based representation, and make unique per context graph:
            node_label_chars = np.zeros(
                shape=(num_nodes, hyperparameters[f'{name}_char_length']),
                dtype=np.uint8)
            for (node, label) in enumerate(data):
                for (char_idx, label_char) in enumerate(
                        label[:hyperparameters[f'{name}_char_length']]):
                    node_label_chars[int(node), char_idx] = ALPHABET_DICT.get(
                        label_char, 1)
            unique_chars, node_label_unique_indices = np.unique(
                node_label_chars, axis=0, return_inverse=True)
            result_holder[f'{name}_unique_chars'] = unique_chars
            result_holder[f'{name}_unique_indices'] = node_label_unique_indices
        else:
            raise Exception("Unknown node label embedding style '%s'!" %
                            label_embedding_style)
        return True
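
The 'charcnn' branch above truncates each label to a fixed number of characters and then collapses duplicate rows with np.unique. A numpy-only sketch of that deduplication step, with a toy alphabet standing in for ALPHABET_DICT:

import numpy as np

# Toy stand-in for ALPHABET_DICT: 0 is padding, 1 is the unknown character.
alphabet = {ch: i + 2 for i, ch in enumerate("abcdefghijklmnopqrstuvwxyz")}

labels = ["foo", "bar", "foo"]
char_length = 5
node_label_chars = np.zeros((len(labels), char_length), dtype=np.uint8)
for node, label in enumerate(labels):
    for char_idx, ch in enumerate(label[:char_length]):
        node_label_chars[node, char_idx] = alphabet.get(ch, 1)

# Deduplicate identical character rows; `indices` maps each node to its unique row.
unique_chars, indices = np.unique(node_label_chars, axis=0, return_inverse=True)
print(unique_chars.shape)  # (2, 5) -- the two "foo" rows collapse into one
print(indices)             # e.g. [1 0 1]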
Example #3
    def load_data_from_sample(cls,
                              encoder_label: str,
                              hyperparameters: Dict[str, Any],
                              metadata: Dict[str, Any],
                              data_to_load: Any,
                              function_name: Optional[str],
                              result_holder: Dict[str, Any],
                              is_test: bool = True) -> bool:
        """
        Saves two versions of both the code and the query: one using the docstring as the query and the other using the
        function-name as the query, and replacing the function name in the code with an out-of-vocab token.
        Sub-tokenizes, converts, and pads both versions, and rejects empty samples.
        """
        # Save the two versions of the code and query:
        data_holder = {
            QueryType.DOCSTRING.value: data_to_load,
            QueryType.FUNCTION_NAME.value: None
        }
        # Skip samples where the function name is very short, because it probably has too little information
        # to be a good search query.
        if not is_test and hyperparameters['fraction_using_func_name'] > 0. and function_name and \
                len(function_name) >= hyperparameters['min_len_func_name_for_query']:
            if encoder_label == 'query':
                # Set the query tokens to the function name, broken up into its sub-tokens:
                data_holder[QueryType.FUNCTION_NAME.value] = \
                    split_identifier_into_parts(function_name)
            elif encoder_label == 'code':
                # In the code, replace the function name with the out-of-vocab token everywhere it appears:
                data_holder[QueryType.FUNCTION_NAME.value] = [
                    Vocabulary.get_unk() if token == function_name else token
                    for token in data_to_load
                ]

        # Sub-tokenize, convert, and pad both versions:
        for key, data in data_holder.items():
            if not data:
                result_holder[f'{encoder_label}_tokens_{key}'] = None
                result_holder[f'{encoder_label}_tokens_mask_{key}'] = None
                result_holder[f'{encoder_label}_tokens_length_{key}'] = None
                result_holder[f'{encoder_label}_tokens_str_{key}'] = None
                continue
            if hyperparameters[f'{encoder_label}_use_subtokens']:
                data = cls._to_subtoken_stream(
                    data,
                    mark_subtoken_end=hyperparameters[
                        f'{encoder_label}_mark_subtoken_end'])
            tokens, tokens_mask = \
                convert_and_pad_token_sequence(metadata['token_vocab'], list(data),
                                               hyperparameters[f'{encoder_label}_max_num_tokens'])
            # Note that we share the result_holder with different encoders, and so we need to make our identifiers
            # unique-ish
            result_holder[f'{encoder_label}_tokens_{key}'] = tokens
            result_holder[f'{encoder_label}_tokens_mask_{key}'] = tokens_mask
            result_holder[f'{encoder_label}_tokens_length_{key}'] = int(
                np.sum(tokens_mask))
            result_holder[f'{encoder_label}_tokens_str_{key}'] = list(data)

            # print(list(data))

        if result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'] is None or \
                int(np.sum(result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'])) == 0:
            return False

        return True
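
convert_and_pad_token_sequence is a CodeSearchNet helper that is not shown here; the sketch below only illustrates the shape of what the code above relies on (a fixed-length id array plus a 0/1 mask whose sum is the unpadded length), and is not the library implementation:

import numpy as np

def pad_with_mask(token_ids, max_len, pad_id=0):
    # Illustrative only: pad or truncate to max_len and return a 0/1 mask.
    # The code above recovers the true length via int(np.sum(tokens_mask)).
    ids = np.full(max_len, pad_id, dtype=np.int32)
    mask = np.zeros(max_len, dtype=np.float32)
    n = min(len(token_ids), max_len)
    ids[:n] = token_ids[:n]
    mask[:n] = 1.0
    return ids, mask

ids, mask = pad_with_mask([5, 9, 2], max_len=6)
print(ids)                # [5 9 2 0 0 0]
print(int(np.sum(mask)))  # 3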
Example #4
def get_log_samples(graph, seq_length, pad_token, vocabulary, rnn_len):

    node_table = {}
    edge_table = defaultdict(list)
    token_pointer = 0
    token_table = []
    sample_contents = []

    semi_node_ids = []

    for node in graph.node:

        node_table[node.id] = node
        if (node.type in [FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN]
                and token_pointer == 0):
            token_pointer = node.id
            token_table.append(
                vocabulary.get_id_or_unk_multiple(
                    split_identifier_into_parts(node.contents), seq_length,
                    pad_token))

    for edge in graph.edge:

        edge_table[edge.sourceId].append(edge)

    while True:
        term_flag = True
        if len(edge_table[token_pointer]) > 0:
            for edge in edge_table[token_pointer]:
                if edge.type == FeatureEdge.NEXT_TOKEN:
                    term_flag = False
                    #                    id_in_order.append(token_pointer)
                    if (node_table[token_pointer].type == FeatureNode.TOKEN
                            and node_table[token_pointer].contents == "SEMI"):
                        semi_node_ids.append(len(token_table))
                    token_pointer = edge.destinationId
                    token_table.append(
                        vocabulary.get_id_or_unk_multiple(
                            split_identifier_into_parts(
                                node_table[token_pointer].contents),
                            seq_length, pad_token))
                    break
            if term_flag:
                break
        else:
            #            print("warning: unable to find next node")
            break

    for semi_node_id in semi_node_ids:

        if semi_node_id < rnn_len:
            sample_content = [
                vocabulary.get_id_or_unk_multiple(
                    split_identifier_into_parts(" "), seq_length, pad_token)
            ] * rnn_len
            sample_content[-semi_node_id:] = token_table[:semi_node_id]
        else:
            sample_content = token_table[semi_node_id - rnn_len:semi_node_id]

        sample_contents.append(np.array([sample_content]))

    return sample_contents
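
The loop over semi_node_ids above builds a fixed-length window of the rnn_len token rows preceding each statement end, left-padding when fewer rows are available. A small sketch of just that windowing logic on toy data:

# Toy data: one encoded row per token, plus a blank row used for left-padding
# (it stands in for the padded encoding of " " in the function above).
rnn_len = 4
blank_row = [0, 0, 0]
token_table = [[i + 1] * 3 for i in range(6)]   # [[1,1,1], ..., [6,6,6]]
semi_node_ids = [2, 5]

for semi_node_id in semi_node_ids:
    if semi_node_id < rnn_len:
        window = [blank_row] * rnn_len
        window[-semi_node_id:] = token_table[:semi_node_id]
    else:
        window = token_table[semi_node_id - rnn_len:semi_node_id]
    print(window)
# First window:  [[0,0,0], [0,0,0], [1,1,1], [2,2,2]]
# Second window: [[2,2,2], [3,3,3], [4,4,4], [5,5,5]]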
Example #5
def load_data_from_sample_siamese(
    language: str,
    encoder_label: str,
    data_to_load: Any,
    function_name: Optional[str],
    tokenizer: TokenizerRecordable,
    fraction_using_func_name: float,
    min_len_func_name_for_query: int,
    use_subtokens: bool,
    mark_subtoken_end: bool,
    max_num_tokens: int,
    lang_token: str,
    query_token: str,
) -> Optional[Dict[str, np.ndarray]]:
    """
    Save two versions of both the code and the query: one using the docstring as the query and the other using the
    function-name as the query, and replacing the function name in the code with an out-of-vocab token.
    Sub-tokenizes, converts, and pads both versions, and rejects empty samples.
    """
    result_holder: Dict[str, Any] = {}
    # Save the two versions of the code and query:
    data_holder = {
        QueryType.DOCSTRING.value: data_to_load,
        QueryType.FUNCTION_NAME.value: None
    }
    # Skip samples where the function name is very short, because it probably has too little information
    # to be a good search query.
    if fraction_using_func_name > 0.0 and function_name and len(
            function_name) >= min_len_func_name_for_query:
        if encoder_label == "query":
            # Set the query tokens to the function name, broken up into its sub-tokens:
            data_holder[QueryType.FUNCTION_NAME.value] = \
                split_identifier_into_parts(function_name)
        elif encoder_label == "code":
            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            data_holder[QueryType.FUNCTION_NAME.value] = [
                tokenizer.unk_token() if token == function_name else token
                for token in data_to_load
            ]
    else:
        return None

    # Sub-tokenize, convert, and pad both versions:
    for key, data in data_holder.items():
        # if hyperparameters[f"{encoder_label}_use_subtokens"]:
        if use_subtokens:
            data = _to_subtoken_stream(data,
                                       mark_subtoken_end=mark_subtoken_end)

        logger.debug("")
        if encoder_label == "code":
            tokens, tokens_mask = convert_and_pad_token_sequence(
                tokenizer=tokenizer,
                token_sequence=list(data),
                output_tensor_size=max_num_tokens,
                token=lang_token,
                prefix=language,
            )
        elif encoder_label == "query":
            tokens, tokens_mask = convert_and_pad_token_sequence(
                tokenizer=tokenizer,
                token_sequence=list(data),
                output_tensor_size=max_num_tokens,
                token=query_token,
                prefix=None,
            )
        # Note that we share the result_holder with different encoders, and so we need to make our identifiers
        # unique-ish
        result_holder[f"{encoder_label}_tokens_{key}"] = tokens
        result_holder[f"{encoder_label}_tokens_mask_{key}"] = tokens_mask

    if (result_holder[
            f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"] is None
            or int(
                np.sum(result_holder[
                    f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"]
                       )) == 0):
        return None

    return result_holder
Example #6
def parse_data_file_ast_tokenizer(
    data_file: Path,
    data_params: DatasetParams,
    tokenizer: TokenizerRecordable,
    ast_parser: TreeSitterParser,
    query_token: str,
    pickle_path: Path,
) -> Tuple[str, pd.DataFrame]:
    logger.info(f"Reading samples from {data_file}")
    filename = os.path.basename(data_file)
    file_language = filename.split("_")[0]
    file_id = filename.split(".")[0]
    pickle_file = pickle_path / f"{file_id}.p"

    if pickle_file.exists():
        df = pd.read_pickle(pickle_path / f"{file_id}.p")
        return (file_language, df)

    samples = list(read_file_samples(data_file))

    # ds: List[Dict[str, Union[str, int]]] = []
    codes: List[List[str]] = []
    funcs: List[List[str]] = []
    docstrings: List[List[str]] = []
    for idx, raw_sample in enumerate(tqdm(samples)):
        language = raw_sample["language"]
        # In some datasets, we use 'python-2.7' and 'python-3'
        if language.startswith("python"):
            language = "python"

        if language != file_language:
            logger.error(
                f"Sample language {language} does not match filename language {file_language}"
            )
            sys.exit(
                f"Sample language {language} does not match filename language {file_language}"
            )

        function_name = raw_sample.get("func_name")

        code: List[str] = ast_parser.parse(
            language,
            raw_sample["code"],
            max_tokens=data_params.code_max_num_tokens)

        # Skip samples where the function name is very short, because it probably has too little information
        # to be a good search query.
        if (data_params.fraction_using_func_name > 0.0 and function_name and
                len(function_name) >= data_params.min_len_func_name_for_query):
            func = [query_token] + split_identifier_into_parts(function_name)
            code = [
                tokenizer.unk_token() if token == function_name else token
                for token in code
            ]
            docstring = [query_token] + [
                d.lower() for d in raw_sample["docstring_tokens"]
            ]

            codes.append(code)
            funcs.append(func)
            docstrings.append(docstring)

    code_toks: List[List[int]] = []
    code_masks: List[List[int]] = []
    func_toks: List[List[int]] = []
    func_masks: List[List[int]] = []
    docstring_toks: List[List[int]] = []
    docstring_masks: List[List[int]] = []

    for batch in batch_iter(codes, batch_size=100):
        toks, masks = tokenizer.encode_tokens(
            batch, max_length=data_params.code_max_num_tokens)
        code_toks.extend(toks)
        code_masks.extend(masks)

    for batch in batch_iter(funcs, batch_size=100):
        toks, masks = tokenizer.encode_tokens(
            batch, max_length=data_params.query_max_num_tokens)
        func_toks.extend(toks)
        func_masks.extend(masks)

    for batch in batch_iter(docstrings, batch_size=100):
        toks, masks = tokenizer.encode_tokens(
            batch, max_length=data_params.query_max_num_tokens)
        docstring_toks.extend(toks)
        docstring_masks.extend(masks)

    langs = [data_params.lang_ids[file_language]] * len(func_toks)
    similarities = [1] * len(func_toks)
    logger.debug(f"func_toks {func_toks[:2]}")
    logger.debug(f"docstring_toks {docstring_toks[:2]}")
    logger.debug(f"code_toks {code_toks[:2]}")
    logger.debug(f"langs {langs[:2]}")
    logger.debug(f"similarities {similarities[:2]}")
    df = pd.DataFrame({
        "lang": langs,
        "similarity": similarities,
        "func_tokens": func_toks,
        "func_masks": func_masks,
        "docstring_tokens": docstring_toks,
        "docstring_masks": docstring_masks,
        "code_tokens": code_toks,
        "code_masks": code_masks,
    })

    df.to_pickle(pickle_file)

    logger.debug(
        f"Saved file {data_file}: language {file_language} [{df.shape}] to {pickle_file}"
    )

    return (file_language, df)
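
batch_iter is used above but not shown. A minimal sketch of such a helper (an assumption about its behavior, not the project's actual implementation):

from typing import Iterator, List, TypeVar

T = TypeVar("T")

def batch_iter(items: List[T], batch_size: int) -> Iterator[List[T]]:
    # Yield consecutive slices of at most batch_size items.
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]

# list(batch_iter(list(range(7)), batch_size=3)) -> [[0, 1, 2], [3, 4, 5], [6]]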
Example #7
def load_data_from_sample_ast(
    language: str,
    encoder_label: str,
    data_to_load: List[str],
    function_name: Optional[str],
    tokenizer: TokenizerRecordable,
    data_params: DatasetParams,
    query_token: str,
) -> Optional[Dict[str, np.ndarray]]:
    """
    Save two versions of both the code and the query: one using the docstring as the query and the other using the
    function-name as the query, and replacing the function name in the code with an out-of-vocab token.
    Sub-tokenizes, converts, and pads both versions, and rejects empty samples.
    """
    result_holder: Dict[str, Any] = {}
    # Save the two versions of the code and query:
    data_holder = {
        QueryType.DOCSTRING.value: data_to_load,
        QueryType.FUNCTION_NAME.value: None
    }
    # Skip samples where the function name is very short, because it probably has too little information
    # to be a good search query.
    if (data_params.fraction_using_func_name > 0.0 and function_name
            and len(function_name) >= data_params.min_len_func_name_for_query):
        if encoder_label == "query":
            # Set the query tokens to the function name, broken up into its sub-tokens:
            data_holder[QueryType.FUNCTION_NAME.value] = \
                split_identifier_into_parts(function_name)
        elif encoder_label == "code":
            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            data_holder[QueryType.FUNCTION_NAME.value] = [
                tokenizer.unk_token() if token == function_name else token
                for token in data_to_load
            ]
    else:
        return None

    # Sub-tokenize, convert, and pad both versions:
    for key, data in data_holder.items():
        # if hyperparameters[f"{encoder_label}_use_subtokens"]:
        if data is not None:
            data_l: List[str] = list(data)
            if data_params.use_subtokens:
                data_l = list(
                    _to_subtoken_stream(
                        data_l,
                        mark_subtoken_end=data_params.mark_subtoken_end))

            if encoder_label == "code":
                token_ids, token_mask = tokenizer.encode_tokens(
                    [data_l], max_length=data_params.code_max_num_tokens)

            elif encoder_label == "query":
                token_sequence = [query_token] + data_l
                token_ids, token_mask = tokenizer.encode_tokens(
                    [token_sequence],
                    max_length=data_params.query_max_num_tokens)

            result_holder[f"{encoder_label}_tokens_{key}"] = token_ids[0]
            result_holder[f"{encoder_label}_tokens_mask_{key}"] = token_mask[0]

    if (result_holder[
            f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"] is None
            or int(
                np.sum(result_holder[
                    f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"]
                       )) == 0):
        return None

    return result_holder
Example #8
    def _convert_ast_into_simpler_tree_format(self, root, binary_data):
        num_nodes = 0

        queue = [root]

        root_token = ""
        root_sub_tokens = []
        # Check if the node has children or not
        if len(root.children) == 0:
            root_token = binary_data[root.start_byte:root.end_byte]
            root_token_raw = root_token.decode("utf-8")
            root_token = self.process_token(root_token_raw)

            root_sub_tokens = split_identifier_into_parts(root_token_raw)
            root_sub_tokens = self.process_list_of_sub_tokens(root_sub_tokens)

        root_sub_token_ids = []
        for sub_token in root_sub_tokens:
            root_sub_token_ids.append(
                self.look_up_for_id_from_token(sub_token))

        root_json = {
            "node_type": str(root.type),
            "node_type_id": self.look_up_for_id_from_node_type(str(root.type)),
            "node_token": root_token,
            "node_sub_tokens": root_sub_tokens,
            "node_sub_tokens_id": root_sub_token_ids,
            "children": [
            ]  # Using children = None instead of [] to avoid the error 'Python 3: maximum recursion depth exceeded'
        }

        tree_tokens = []
        tree_tokens.extend(root_sub_tokens)

        queue_json = [root_json]
        while queue:

            current_node = queue.pop(0)
            current_node_json = queue_json.pop(0)
            num_nodes += 1

            children = [x for x in current_node.children]
            queue.extend(children)

            if len(children) > 0:
                current_node_json['children'] = []

            for child_node in children:

                child_token = ""
                child_sub_tokens = []
                if len(child_node.children) == 0:
                    child_token = binary_data[
                        child_node.start_byte:child_node.end_byte]
                    child_token_raw = child_token.decode("utf-8")
                    child_token = self.process_token(child_token_raw)
                    child_sub_tokens = split_identifier_into_parts(
                        str(child_token_raw))
                    child_sub_tokens = self.process_list_of_sub_tokens(
                        child_sub_tokens)

                children_sub_token_ids = []
                for sub_token in child_sub_tokens:
                    sub_token = self.process_token(sub_token)
                    # print(sub_token)
                    sub_token_id = self.look_up_for_id_from_token(sub_token)
                    children_sub_token_ids.append(sub_token_id)

                if len(children_sub_token_ids) == 0:
                    children_sub_token_ids.append(0)

                child_json = {
                    "node_type": str(child_node.type),
                    "node_type_id": self.look_up_for_id_from_node_type(str(child_node.type)),
                    "node_token": child_token,
                    "node_sub_tokens": child_sub_tokens,
                    "node_sub_tokens_id": children_sub_token_ids,
                    "children": []
                }

                tree_tokens.extend(child_sub_tokens)

                current_node_json['children'].append(child_json)
                queue_json.append(child_json)

        tree_tokens = list(set(tree_tokens))
        return root_json, tree_tokens, num_nodes
Example #9
from dpu_utils.codeutils import split_identifier_into_parts

print(split_identifier_into_parts("eatRelationalExpression"))
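# Expected to print the identifier split on camelCase boundaries,
# e.g. ['eat', 'relational', 'expression']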
Example #10
def func_name_tokenizer(tokens):
    tokens = ujson.loads(tokens)
    return split_identifier_into_parts(tokens)
Example #11
def compute_sample_data(sub_graph,
                        identifier_token_node_ids,
                        seq_length,
                        pad_token,
                        slot_token,
                        vocabulary,
                        exception_node_ids=[]):

    used_node_types = get_used_nodes_type()
    used_edge_types = get_used_edges_type()

    node_representations = []
    id_to_index_map = {}
    ind = 0

    (sub_nodes, sub_edges) = sub_graph

    for node in sub_nodes:
        if node.type in used_node_types:
            if node.id in exception_node_ids:
                node_representation = [pad_token for _ in range(seq_length)]
                node_representation[0] = slot_token
            else:
                node_representation = vocabulary.get_id_or_unk_multiple(
                    split_identifier_into_parts(node.contents), seq_length,
                    pad_token)

            node_representations.append(node_representation)
            id_to_index_map[node.id] = ind
            ind += 1

    n_nodes = len(node_representations)
    n_types = len(used_edge_types)
    node_representations = np.array(node_representations)
    num_incoming_edges_per_type = np.zeros((n_nodes, n_types))
    num_outgoing_edges_per_type = np.zeros((n_nodes, n_types))
    adj_lists = defaultdict(list)

    for edge in sub_edges:
        if edge.type in used_edge_types \
                and edge.sourceId in id_to_index_map \
                and edge.destinationId in id_to_index_map:

            type_id = used_edge_types.index(edge.type)
            adj_lists[type_id].append([
                id_to_index_map[edge.sourceId],
                id_to_index_map[edge.destinationId]
            ])
            num_incoming_edges_per_type[id_to_index_map[edge.destinationId],
                                        type_id] += 1
            num_outgoing_edges_per_type[id_to_index_map[edge.sourceId],
                                        type_id] += 1

    final_adj_lists = {
        edge_type: np.array(sorted(adj_list), dtype=np.int32)
        for edge_type, adj_list in adj_lists.items()
    }

    # Add empty entries for types with no adjacency lists
    for i in range(len(used_edge_types)):
        if i not in final_adj_lists:
            final_adj_lists[i] = np.zeros((0, 2), dtype=np.int32)

    identifier_nodes = [
        id_to_index_map[node_id] for node_id in identifier_token_node_ids
    ]

    return (identifier_nodes, node_representations, final_adj_lists,
            num_incoming_edges_per_type, num_outgoing_edges_per_type)
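
A toy illustration of the per-edge-type bookkeeping above (adjacency lists plus incoming/outgoing edge counts), using plain indices instead of graph protobuf objects:

import numpy as np

# Edges given as (source_index, destination_index, type_id); 3 nodes, 2 edge types.
edges = [(0, 1, 0), (0, 2, 0), (2, 1, 1)]
n_nodes, n_types = 3, 2

num_incoming = np.zeros((n_nodes, n_types))
num_outgoing = np.zeros((n_nodes, n_types))
adj_lists = {t: [] for t in range(n_types)}
for src, dst, t in edges:
    adj_lists[t].append([src, dst])
    num_incoming[dst, t] += 1
    num_outgoing[src, t] += 1

print(adj_lists)     # {0: [[0, 1], [0, 2]], 1: [[2, 1]]}
print(num_incoming)  # node 1 receives one edge of each type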
Example #12
def func_name_tokenizer(tokens, **kwargs):
    tokens = ujson.loads(tokens)
    tokens = split_identifier_into_parts(tokens)[:kwargs['min_func_len']]
    return tokens
Example #13
def string_sub_tokenizer(tokens: list):
    """code from https://github.com/github/CodeSearchNet/blob/e792e1caea20fbd4fba439565fe20c10d4798435/src/encoders/seq_encoder.py#L84-L92"""
    # Wrap non-identifier tokens in a list so the chain() below keeps them whole instead of splitting them into characters.
    tokens = [split_identifier_into_parts(tok) if IDENTIFIER_TOKEN_REGEX.match(tok) else [tok] for tok in tokens]
    tokens = list(itertools.chain(*tokens))
    return tokens
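
IDENTIFIER_TOKEN_REGEX is referenced above but not defined here. A self-contained usage sketch, assuming a simple identifier pattern similar to the one in CodeSearchNet (the real regex may differ):

import itertools
import re

from dpu_utils.codeutils import split_identifier_into_parts

IDENTIFIER_TOKEN_REGEX = re.compile(r"[_a-zA-Z][_a-zA-Z0-9]*")  # assumed pattern

tokens = ["userName", "=", "get_user_name", "(", ")"]
sub_tokenized = [
    split_identifier_into_parts(tok) if IDENTIFIER_TOKEN_REGEX.match(tok) else [tok]
    for tok in tokens
]
print(list(itertools.chain(*sub_tokenized)))
# e.g. ['user', 'name', '=', 'get', 'user', 'name', '(', ')']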