def build_language_ast(name: str, dirs: List[Path], pickle_path: Path, data_params: DatasetParams):
    """Parse all samples with tree-sitter, collecting special tokens and token-length stats."""
    start = time.time()
    if data_params.use_ast == "tree-sitter":
        parser = TreeSitterParser(
            langs=["go", "java", "javascript", "python", "php", "ruby"],
            added_nodes=data_params.ast_added_nodes,
            skip_node_types=data_params.ast_skip_node_types,
        )
        all_special_tokens: Set[str] = set()
        lengths: Dict[str, List[int]] = {
            "go": [], "java": [], "javascript": [], "python": [], "php": [], "ruby": []
        }
        for file_path in get_data_files_from_directory(dirs):
            logger.info(f"Reading {file_path}")
            for raw_sample in read_file_samples(file_path):
                lang = raw_sample["language"]
                tokens, special_tokens = parser.parse_full(lang, raw_sample["code"])
                all_special_tokens.update(special_tokens)
                lengths[lang].append(len(tokens))
        end = time.time()

        logger.debug(f"all_special_tokens ({len(all_special_tokens)}) {all_special_tokens}")
        os.makedirs(pickle_path, exist_ok=True)
        json_file = Path(pickle_path) / f"{name}_special_tokens.json"
        with open(json_file, "w") as f:
            json.dump(list(all_special_tokens), f)

        import statistics

        for lang, lgs in lengths.items():
            if len(lgs) > 0:
                max_lg = max(lgs)
                min_lg = min(lgs)
                mean_lg = statistics.mean(lgs)
                # statistics.stdev raises on fewer than two data points
                std_lg = statistics.stdev(lgs) if len(lgs) > 1 else 0.0
                logger.debug(f"{lang} [ min:{min_lg}, max:{max_lg}, mean:{mean_lg}, stddev:{std_lg} ]")

        time_p = end - start
        logger.info(f"Building AST took: {time_p} sec")
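# Usage sketch for build_language_ast (paths are hypothetical; the DatasetParams fields
# shown are assumptions inferred from the attributes read above, not a confirmed
# constructor signature):
#
#   params = DatasetParams(use_ast="tree-sitter", ast_added_nodes={}, ast_skip_node_types=[])
#   build_language_ast(
#       name="train",
#       dirs=[Path("data/train")],
#       pickle_path=Path("output/ast"),
#       data_params=params,
#   )
#   # -> writes output/ast/train_special_tokens.json and logs per-language length stats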
def load_data_from_dirs_ast(
    name: str,
    data_dirs: List[Path],
    tokenizer: TokenizerRecordable,
    ast_parser: TreeSitterParser,
    data_params: DatasetParams,
    parse_callback: Callable[
        [Path, DatasetParams, TokenizerRecordable, TreeSitterParser], Tuple[str, pd.DataFrame]
    ],
) -> Dict[str, pd.DataFrame]:
    dfs: Dict[str, pd.DataFrame] = {}
    for d in data_dirs:
        # Assumes the CodeSearchNet-style layout {lang}/final/jsonl/{split}:
        # d.parents[2] is the per-language root, so its basename is the language name.
        lg = os.path.basename(d.parents[2])
        logger.debug(f"Getting samples for lang {lg}")
        lang_samples = load_data_from_files_ast(
            data_files=list(get_data_files_from_directory([d], None)),
            data_params=data_params,
            tokenizer=tokenizer,
            ast_parser=ast_parser,
            parse_callback=parse_callback,
        )
        df = lang_samples[lg]
        logger.debug(f"lang {lg} ({df.shape[0]} samples)")
        dfs[lg] = df
    return dfs
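# Usage sketch for load_data_from_dirs_ast (tokenizer, parser, and callback are hypothetical
# placeholders mirroring the Callable annotation; directory paths follow the layout that
# d.parents[2] expects above):
#
#   dfs = load_data_from_dirs_ast(
#       name="train",
#       data_dirs=[Path("data/python/final/jsonl/train"), Path("data/java/final/jsonl/train")],
#       tokenizer=my_tokenizer,      # hypothetical TokenizerRecordable instance
#       ast_parser=my_parser,        # hypothetical TreeSitterParser instance
#       data_params=params,
#       parse_callback=my_parse_cb,  # hypothetical (Path, ..., TreeSitterParser) -> (lang, DataFrame)
#   )
#   print(dfs["python"].shape)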
def build_huggingface_token_files(
    data_dirs: List[Path],
    data_params: DatasetParams,
    output_path: Union[Path, str],
    sample_update: Callable[[str, str, List[str]], str] = default_sample_update,
) -> Tuple[List[Path], Dict[str, Path]]:
    tokenizers_path = Path(output_path)
    os.makedirs(tokenizers_path, exist_ok=True)

    # Build one query file and one code file of raw token strings per language,
    # opening each pair lazily on the first sample seen for that language.
    lang_ios: Dict[str, Tuple[IO[str], IO[str]]] = {}
    query_files: List[Path] = []
    lang_files: Dict[str, Path] = {}
    for file_path in get_data_files_from_directory(data_dirs):
        logger.info(f"Reading {file_path}")
        for raw_sample in read_file_samples(file_path):
            lang = raw_sample["language"]
            if lang not in lang_ios:
                query_file = tokenizers_path / f"{lang}_query.txt"
                code_file = tokenizers_path / f"{lang}_code.txt"
                lang_ios[lang] = (open(query_file, "w"), open(code_file, "w"))
                query_files.append(query_file)
                lang_files[lang] = code_file
            lang_ios[lang][0].write(sample_update("query", lang, raw_sample["docstring_tokens"]))
            lang_ios[lang][1].write(sample_update("code", lang, raw_sample["code_tokens"]))

    # Close every handle so buffered writes are flushed before the paths are returned.
    for query_io, code_io in lang_ios.values():
        query_io.close()
        code_io.close()

    return query_files, lang_files
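# Usage sketch for build_huggingface_token_files (paths are hypothetical; relies on the
# default_sample_update defined elsewhere in this module):
#
#   query_files, lang_files = build_huggingface_token_files(
#       data_dirs=[Path("data/python/final/jsonl/train")],
#       data_params=params,
#       output_path="output/tokenizers",
#   )
#   # query_files -> [Path("output/tokenizers/python_query.txt")]
#   # lang_files  -> {"python": Path("output/tokenizers/python_code.txt")}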
def load_data_from_dirs(
    data_dirs: List[Path],
    parse_callback: Callable[..., Tuple[str, int, Iterable[T_Single]]],  # type: ignore
    max_files_per_dir: Optional[int],
    parallelize: bool,
    *args,
) -> Dict[str, Tuple[int, Iterable[T_Single]]]:
    return load_data_from_files_raw(
        list(get_data_files_from_directory(data_dirs, max_files_per_dir)), parse_callback, parallelize, *args
    )
def load_data_from_dirs_siamese_tokenizer(
    data_dirs: List[Path],
    tokenizer: TokenizerRecordable,
    data_params: DatasetParams,
    parse_callback: Callable[[Path, DatasetParams, TokenizerRecordable], Tuple[str, int, Iterable[T_Single]]],
    max_files_per_dir: Optional[int] = None,
    parallelize: bool = True,
) -> Dict[str, Tuple[int, Iterable[T_Single]]]:
    return load_data_from_files(
        data_files=list(get_data_files_from_directory(data_dirs, max_files_per_dir)),
        data_params=data_params,
        tokenizer=tokenizer,
        parse_callback=parse_callback,
        parallelize=parallelize,
    )
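# Usage sketch for the two delegating loaders above (tokenizer and callback are hypothetical
# placeholders mirroring the Callable annotations; the underlying load_data_from_files* helpers
# are defined elsewhere in this module):
#
#   samples = load_data_from_dirs_siamese_tokenizer(
#       data_dirs=[Path("data/python/final/jsonl/train")],
#       tokenizer=my_tokenizer,      # hypothetical TokenizerRecordable
#       data_params=params,
#       parse_callback=my_parse_cb,  # (Path, DatasetParams, TokenizerRecordable) -> (lang, n, iterable)
#       max_files_per_dir=10,
#       parallelize=False,
#   )
#   for lang, (n, it) in samples.items():
#       logger.info(f"{lang}: {n} samples")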