Example #1
from typing import Any, Dict, Optional

import numpy as np

# QueryType, TokenizerRecordable, split_identifier_into_parts, _to_subtoken_stream
# and convert_and_pad_token_sequence are helpers from the surrounding codebase and
# are assumed to be in scope here.


def load_data_from_sample_with_lang(
    language: str,
    encoder_label: str,
    data_to_load: Any,
    function_name: Optional[str],
    tokenizer: TokenizerRecordable,
    fraction_using_func_name: float,
    min_len_func_name_for_query: int,
    use_subtokens: bool,
    mark_subtoken_end: bool,
    max_num_tokens: int,
    lang_token: str,
) -> Optional[Dict[str, np.ndarray]]:
    """
    Save two versions of both the code and the query: one using the docstring as the query and the other using the
    function-name as the query, and replacing the function name in the code with an out-of-vocab token.
    Sub-tokenizes, converts, and pads both versions, and rejects empty samples.
    """
    result_holder: Dict[str, Any] = {}
    # Save the two versions of the code and query:
    data_holder = {QueryType.DOCSTRING.value: data_to_load, QueryType.FUNCTION_NAME.value: None}
    # Skip samples where the function name is very short, because it probably has too little information
    # to be a good search query.
    if fraction_using_func_name > 0.0 and function_name and len(function_name) >= min_len_func_name_for_query:
        if encoder_label == "query":
            # Set the query tokens to the function name, broken up into its sub-tokens:
            data_holder[QueryType.FUNCTION_NAME.value] = split_identifier_into_parts(function_name)
        elif encoder_label == "code":
            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            data_holder[QueryType.FUNCTION_NAME.value] = [
                tokenizer.unk_token() if token == function_name else token for token in data_to_load
            ]
    else:
        return None

    # Sub-tokenize, convert, and pad both versions:
    for key, data in data_holder.items():
        if use_subtokens:
            data = _to_subtoken_stream(data, mark_subtoken_end=mark_subtoken_end)
        tokens, tokens_mask = convert_and_pad_token_sequence(
            tokenizer=tokenizer,
            token_sequence=list(data),
            output_tensor_size=max_num_tokens,
            language=language,
            lang_token=lang_token,
        )
        # Note that we share the result_holder with different encoders, and so we need to make our identifiers
        # unique-ish
        result_holder[f"{encoder_label}_tokens_{key}"] = tokens
        result_holder[f"{encoder_label}_tokens_mask_{key}"] = tokens_mask

    # Reject samples whose docstring variant came back empty:
    if (
        result_holder[f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"] is None
        or int(np.sum(result_holder[f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"])) == 0
    ):
        return None

    return result_holder
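
For orientation, here is a minimal usage sketch. The argument values are illustrative assumptions, and `tokenizer` stands for any TokenizerRecordable implementation; only the parameter names come from the function above.

# Hypothetical invocation of the loader above; all values are made up.
sample = load_data_from_sample_with_lang(
    language="python",
    encoder_label="query",
    data_to_load=["Fetch", "a", "user", "record", "by", "id"],  # docstring tokens
    function_name="get_user_by_id",
    tokenizer=tokenizer,  # any TokenizerRecordable implementation
    fraction_using_func_name=0.1,
    min_len_func_name_for_query=5,
    use_subtokens=False,
    mark_subtoken_end=False,
    max_num_tokens=30,
    lang_token="<lg>",
)
if sample is not None:
    # One padded token array and mask per query variant (docstring / function name).
    print(sorted(sample.keys()))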
Example #2
from typing import List

# sample_update is defined inside an enclosing function where data_params
# (a DatasetParams instance) and the _to_subtoken_stream helper are in scope.
def sample_update(tpe: str, lang: str, tokens: List[str]) -> str:
    if data_params.use_subtokens:
        tokens = list(_to_subtoken_stream(tokens, mark_subtoken_end=False))
    if tpe == "code":
        return f"{lang} <lg> {' '.join(tokens)}\r\n"
    elif tpe == "query":
        return f"<qy> {' '.join(tokens)}\r\n"
    else:
        raise ValueError("tpe must be 'code' or 'query'")
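
To make the output format concrete, a hedged illustration with data_params.use_subtokens assumed False, so tokens pass through unchanged; the `<lg>` and `<qy>` markers come directly from the f-strings above.

# Inside the enclosing function, with data_params.use_subtokens == False:
sample_update("code", "python", ["def", "add", "(", "a", ",", "b", ")"])
# -> "python <lg> def add ( a , b )\r\n"
sample_update("query", "python", ["add", "two", "numbers"])
# -> "<qy> add two numbers\r\n"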
Example #3
from typing import Any, Dict, List, Optional

import numpy as np

# QueryType, TokenizerRecordable, DatasetParams, split_identifier_into_parts and
# _to_subtoken_stream are helpers from the surrounding codebase, assumed in scope.


def load_data_from_sample_ast(
    language: str,
    encoder_label: str,
    data_to_load: List[str],
    function_name: Optional[str],
    tokenizer: TokenizerRecordable,
    data_params: DatasetParams,
    query_token: str,
) -> Optional[Dict[str, np.ndarray]]:
    """
    Save two versions of both the code and the query: one using the docstring as the query and the other using the
    function-name as the query, and replacing the function name in the code with an out-of-vocab token.
    Sub-tokenizes, converts, and pads both versions, and rejects empty samples.
    """
    result_holder: Dict[str, Any] = {}
    # Save the two versions of the code and query:
    data_holder = {
        QueryType.DOCSTRING.value: data_to_load,
        QueryType.FUNCTION_NAME.value: None
    }
    # Skip samples where the function name is very short, because it probably has too little information
    # to be a good search query.
    if (data_params.fraction_using_func_name > 0.0 and function_name
            and len(function_name) >= data_params.min_len_func_name_for_query):
        if encoder_label == "query":
            # Set the query tokens to the function name, broken up into its sub-tokens:
            data_holder[QueryType.FUNCTION_NAME.value] = split_identifier_into_parts(function_name)
        elif encoder_label == "code":
            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            data_holder[QueryType.FUNCTION_NAME.value] = [
                tokenizer.unk_token() if token == function_name else token
                for token in data_to_load
            ]
    else:
        return None

    # Sub-tokenize, convert, and pad both versions:
    for key, data in data_holder.items():
        if data is not None:
            data_l: List[str] = list(data)
            if data_params.use_subtokens:
                data_l = list(_to_subtoken_stream(data_l, mark_subtoken_end=data_params.mark_subtoken_end))

            if encoder_label == "code":
                token_ids, token_mask = tokenizer.encode_tokens([data_l], max_length=data_params.code_max_num_tokens)
            elif encoder_label == "query":
                token_ids, token_mask = tokenizer.encode_tokens(
                    [[query_token] + data_l], max_length=data_params.query_max_num_tokens
                )
            else:
                raise ValueError(f"encoder_label must be 'code' or 'query', got {encoder_label!r}")

            result_holder[f"{encoder_label}_tokens_{key}"] = token_ids[0]
            result_holder[f"{encoder_label}_tokens_mask_{key}"] = token_mask[0]

    # Reject samples whose docstring variant came back empty:
    if (
        result_holder[f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"] is None
        or int(np.sum(result_holder[f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"])) == 0
    ):
        return None

    return result_holder
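
And a matching sketch for the AST variant. `tokenizer` and `data_params` are assumed to be a TokenizerRecordable and a DatasetParams configured elsewhere; the key-lookup suffix mirrors the f-strings used in the function.

# Hypothetical call; the code tokens and function name are made up.
sample = load_data_from_sample_ast(
    language="python",
    encoder_label="code",
    data_to_load=["def", "get_user_by_id", "(", "uid", ")", ":"],
    function_name="get_user_by_id",  # replaced by the unk token in the code variant
    tokenizer=tokenizer,
    data_params=data_params,
    query_token="<qy>",
)
if sample is not None:
    ids = sample[f"code_tokens_{QueryType.DOCSTRING.value}"]
    mask = sample[f"code_tokens_mask_{QueryType.DOCSTRING.value}"]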