from typing import Any, Dict, List, Optional

import numpy as np

# QueryType, TokenizerRecordable, DatasetParams, split_identifier_into_parts,
# _to_subtoken_stream and convert_and_pad_token_sequence are project-local imports.


def load_data_from_sample_with_lang(
    language: str,
    encoder_label: str,
    data_to_load: Any,
    function_name: Optional[str],
    tokenizer: TokenizerRecordable,
    fraction_using_func_name: float,
    min_len_func_name_for_query: int,
    use_subtokens: bool,
    mark_subtoken_end: bool,
    max_num_tokens: int,
    lang_token: str,
) -> Optional[Dict[str, np.ndarray]]:
    """
    Save two versions of both the code and the query: one using the docstring as the query
    and the other using the function name as the query, replacing the function name in the
    code with an out-of-vocab token. Sub-tokenizes, converts, and pads both versions, and
    rejects empty samples.
    """
    result_holder: Dict[str, Any] = {}
    # Save the two versions of the code and query:
    data_holder = {QueryType.DOCSTRING.value: data_to_load, QueryType.FUNCTION_NAME.value: None}
    # Skip samples where the function name is very short, because it probably has too little
    # information to be a good search query.
    if fraction_using_func_name > 0.0 and function_name and len(function_name) >= min_len_func_name_for_query:
        if encoder_label == "query":
            # Set the query tokens to the function name, broken up into its sub-tokens:
            data_holder[QueryType.FUNCTION_NAME.value] = split_identifier_into_parts(function_name)
        elif encoder_label == "code":
            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            data_holder[QueryType.FUNCTION_NAME.value] = [
                tokenizer.unk_token() if token == function_name else token for token in data_to_load
            ]
        else:
            return None

    # Sub-tokenize, convert, and pad both versions:
    for key, data in data_holder.items():
        # The function-name variant may not have been populated above; record empty
        # entries so the emptiness check below can see them, then move on:
        if data is None:
            result_holder[f"{encoder_label}_tokens_{key}"] = None
            result_holder[f"{encoder_label}_tokens_mask_{key}"] = None
            continue
        if use_subtokens:
            data = _to_subtoken_stream(data, mark_subtoken_end=mark_subtoken_end)
        tokens, tokens_mask = convert_and_pad_token_sequence(
            tokenizer=tokenizer,
            token_sequence=list(data),
            output_tensor_size=max_num_tokens,
            language=language,
            lang_token=lang_token,
        )
        # The result_holder is shared between encoders, so the identifiers must be unique-ish:
        result_holder[f"{encoder_label}_tokens_{key}"] = tokens
        result_holder[f"{encoder_label}_tokens_mask_{key}"] = tokens_mask

    # Reject samples whose docstring mask is missing or empty:
    docstring_mask = result_holder[f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"]
    if docstring_mask is None or int(np.sum(docstring_mask)) == 0:
        return None

    return result_holder
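# A minimal usage sketch for load_data_from_sample_with_lang (illustrative only: the
# tokenizer instance and all argument values below are assumptions, and the
# "docstring"/"func_name" key suffixes depend on the project's QueryType enum values):
#
#   encoded = load_data_from_sample_with_lang(
#       language="python",
#       encoder_label="query",
#       data_to_load=["loads", "a", "csv", "file"],
#       function_name="load_csv_file",
#       tokenizer=tokenizer,  # any TokenizerRecordable implementation
#       fraction_using_func_name=0.1,
#       min_len_func_name_for_query=12,
#       use_subtokens=False,
#       mark_subtoken_end=False,
#       max_num_tokens=30,
#       lang_token="<lg>",
#   )
#   # On success, `encoded` holds padded ids and masks for both query variants, e.g.
#   # encoded["query_tokens_docstring"] and encoded["query_tokens_mask_docstring"];
#   # it is None when the sample is rejected.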
def sample_update(tpe: str, lang: str, tokens: List[str]) -> str:
    # Note: data_params is a free variable captured from the enclosing scope.
    if data_params.use_subtokens:
        tokens = list(_to_subtoken_stream(tokens, mark_subtoken_end=False))
    if tpe == "code":
        return f"{lang} <lg> {' '.join(tokens)}\r\n"
    elif tpe == "query":
        return f"<qy> {' '.join(tokens)}\r\n"
    else:
        raise ValueError("tpe must be 'code' or 'query'")
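# The lines sample_update emits, sketched under the assumption that
# data_params.use_subtokens is False (since data_params comes from the enclosing
# scope, these calls are illustrative rather than standalone):
#
#   sample_update("code", "python", ["def", "add", "(", "a", ",", "b", ")"])
#   # -> "python <lg> def add ( a , b )\r\n"
#   sample_update("query", "python", ["add", "two", "numbers"])
#   # -> "<qy> add two numbers\r\n"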
def load_data_from_sample_ast(
    language: str,
    encoder_label: str,
    data_to_load: List[str],
    function_name: Optional[str],
    tokenizer: TokenizerRecordable,
    data_params: DatasetParams,
    query_token: str,
) -> Optional[Dict[str, np.ndarray]]:
    """
    Save two versions of both the code and the query: one using the docstring as the query
    and the other using the function name as the query, replacing the function name in the
    code with an out-of-vocab token. Sub-tokenizes, converts, and pads both versions, and
    rejects empty samples.
    """
    result_holder: Dict[str, Any] = {}
    # Save the two versions of the code and query:
    data_holder = {QueryType.DOCSTRING.value: data_to_load, QueryType.FUNCTION_NAME.value: None}
    # Skip samples where the function name is very short, because it probably has too little
    # information to be a good search query.
    if (
        data_params.fraction_using_func_name > 0.0
        and function_name
        and len(function_name) >= data_params.min_len_func_name_for_query
    ):
        if encoder_label == "query":
            # Set the query tokens to the function name, broken up into its sub-tokens:
            data_holder[QueryType.FUNCTION_NAME.value] = split_identifier_into_parts(function_name)
        elif encoder_label == "code":
            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            data_holder[QueryType.FUNCTION_NAME.value] = [
                tokenizer.unk_token() if token == function_name else token for token in data_to_load
            ]
        else:
            return None

    # Sub-tokenize, convert, and pad both versions:
    for key, data in data_holder.items():
        if data is not None:
            data_l: List[str] = list(data)
            if data_params.use_subtokens:
                data_l = list(_to_subtoken_stream(data_l, mark_subtoken_end=data_params.mark_subtoken_end))
            if encoder_label == "code":
                token_ids, token_mask = tokenizer.encode_tokens(
                    [data_l], max_length=data_params.code_max_num_tokens
                )
            elif encoder_label == "query":
                # Prefix queries with the dedicated query token:
                token_sequence = [query_token] + data_l
                token_ids, token_mask = tokenizer.encode_tokens(
                    [token_sequence], max_length=data_params.query_max_num_tokens
                )
            else:
                # Unknown encoder label: token_ids/token_mask would be unbound below.
                return None
            result_holder[f"{encoder_label}_tokens_{key}"] = token_ids[0]
            result_holder[f"{encoder_label}_tokens_mask_{key}"] = token_mask[0]

    # Reject samples whose docstring mask is missing or empty:
    docstring_mask = result_holder[f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"]
    if docstring_mask is None or int(np.sum(docstring_mask)) == 0:
        return None

    return result_holder
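# A usage sketch for load_data_from_sample_ast (hypothetical values; data_params is any
# DatasetParams with the fields referenced above, and tokenizer is any TokenizerRecordable
# whose encode_tokens returns batched ids and masks):
#
#   encoded = load_data_from_sample_ast(
#       language="python",
#       encoder_label="code",
#       data_to_load=["def", "load_csv_file", "(", "path", ")"],
#       function_name="load_csv_file",
#       tokenizer=tokenizer,
#       data_params=data_params,
#       query_token="<qy>",
#   )
#   # For encoder_label == "code" the function name is masked with the unk token in the
#   # func_name variant; for encoder_label == "query" the token sequence is instead
#   # prefixed with query_token. Returns None for rejected (empty-docstring) samples.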