Example #1
def build_language_ast(name: str, dirs: List[Path], pickle_path: Path,
                       data_params: DatasetParams):
    """Parse all samples under `dirs` with tree-sitter, collect the set of special
    tokens and per-language token-length statistics, and dump the special tokens
    to `{name}_special_tokens.json` under `pickle_path`."""
    start = time.time()

    if data_params.use_ast == "tree-sitter":
        parser = TreeSitterParser(
            langs=["go", "java", "javascript", "python", "php", "ruby"],
            added_nodes=data_params.ast_added_nodes,
            skip_node_types=data_params.ast_skip_node_types,
        )

        all_special_tokens: Set[str] = set()

        lengths: Dict[str, List[int]] = {
            "go": [],
            "java": [],
            "javascript": [],
            "python": [],
            "php": [],
            "ruby": []
        }

        for (idx, file_path) in enumerate(get_data_files_from_directory(dirs)):
            logger.info(f"Reading {file_path}")
            raw_samples = list(read_file_samples(file_path))
            for raw_sample in raw_samples:
                lang = raw_sample["language"]
                tokens, special_tokens = parser.parse_full(
                    lang, raw_sample["code"])

                all_special_tokens.update(special_tokens)

                lengths[lang].append(len(tokens))

        end = time.time()
        logger.debug(
            f"all_special_tokens ({len(all_special_tokens)}) {all_special_tokens}"
        )

        os.makedirs(pickle_path, exist_ok=True)

        json_file = Path(pickle_path) / f"{name}_special_tokens.json"
        with open(json_file, "w") as f:
            json.dump(list(all_special_tokens), f)

        import statistics

        for lang, lgs in lengths.items():
            if len(lgs) > 0:
                max_lg = max(lgs)
                min_lg = min(lgs)
                mean_lg = statistics.mean(lgs)
                # stdev needs at least two data points
                std_lg = statistics.stdev(lgs) if len(lgs) > 1 else 0.0
                logger.debug(
                    f"{lang} [ min:{min_lg}, max:{max_lg}, mean:{mean_lg}, stddev:{std_lg} ]"
                )

        time_p = end - start
        logger.info(f"Building AST took: {time_p} sec")
Example #2
def load_data_from_dirs_ast(
    name: str,
    data_dirs: List[Path],
    tokenizer: TokenizerRecordable,
    ast_parser: TreeSitterParser,
    data_params: DatasetParams,
    parse_callback: Callable[
        [Path, DatasetParams, TokenizerRecordable, TreeSitterParser],
        Tuple[str, pd.DataFrame]],
) -> Dict[str, pd.DataFrame]:
    dfs: Dict[str, pd.DataFrame] = {}
    for d in data_dirs:
        lg = os.path.basename(d.parents[2])
        logger.debug(f"Getting samples for lang {lg}")
        lang_samples = load_data_from_files_ast(
            data_files=list(get_data_files_from_directory([d], None)),
            data_params=data_params,
            tokenizer=tokenizer,
            ast_parser=ast_parser,
            parse_callback=parse_callback,
        )
        df = lang_samples[lg]

        logger.debug(f"lang {lg} ({df.shape[0]} samples)")

        dfs[lg] = df
    return dfs
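Note that os.path.basename(d.parents[2]) only yields a language name if each data directory sits three levels below a directory named after its language (a CodeSearchNet-style layout); a hypothetical path illustrating the assumption:

d = Path("resources/data/python/final/jsonl/train")   # hypothetical layout
lg = os.path.basename(d.parents[2])                   # -> "python"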
Example #3
def build_huggingface_token_files(
    data_dirs: List[Path],
    data_params: DatasetParams,
    output_path: Union[Path, str],
    sample_update: Callable[[str, str, List[str]],
                            str] = default_sample_update,
) -> Tuple[List[Path], Dict[str, Path]]:
    """Write one `{lang}_query.txt` and one `{lang}_code.txt` file per language under
    `output_path`; return the query file paths and a language -> code file mapping."""
    tokenizers_path = Path(output_path)
    os.makedirs(tokenizers_path, exist_ok=True)
    # build files of strings
    lang_ios: Dict[str, Tuple[IO[str], IO[str]]] = {}

    query_files: List[Path] = []
    lang_files: Dict[str, Path] = {}
    for (idx,
         file_path) in enumerate(get_data_files_from_directory(data_dirs)):
        logger.info(f"Reading {file_path}")
        for raw_sample in read_file_samples(file_path):
            lang = raw_sample["language"]
            if lang not in lang_ios:
                query_file = tokenizers_path / f"{lang}_query.txt"
                code_file = tokenizers_path / f"{lang}_code.txt"
                lang_ios[lang] = (open(query_file, "w"), open(code_file, "w"))
                query_files.append(query_file)
                lang_files[lang] = code_file
            lang_ios[lang][0].write(
                sample_update("query", lang, raw_sample["docstring_tokens"]))
            lang_ios[lang][1].write(
                sample_update("code", lang, raw_sample["code_tokens"]))

    # close the per-language files so buffered writes are flushed to disk
    for query_io, code_io in lang_ios.values():
        query_io.close()
        code_io.close()

    return query_files, lang_files
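The returned text files are presumably consumed by a HuggingFace tokenizer trainer, but that step is outside this snippet, so the follow-up below is only a sketch using ByteLevelBPETokenizer from the tokenizers package; the surrounding project may train a different tokenizer.

from tokenizers import ByteLevelBPETokenizer  # assumed consumer of the files

query_files, lang_files = build_huggingface_token_files(
    data_dirs=[Path("resources/data/python/final/jsonl/train")],  # hypothetical path
    data_params=data_params,                                      # a DatasetParams instance
    output_path="output/tokenizers",
)
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=[str(p) for p in query_files], vocab_size=10000)
tokenizer.save_model("output/tokenizers", "query")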
Example #4
def load_data_from_dirs(
    data_dirs: List[Path],
    parse_callback: Callable[..., Tuple[str, int, Iterable[T_Single]]],  # type: ignore
    max_files_per_dir: Optional[int],
    parallelize: bool,
    *args,
) -> Dict[str, Tuple[int, Iterable[T_Single]]]:
    return load_data_from_files_raw(
        list(get_data_files_from_directory(data_dirs, max_files_per_dir)), parse_callback, parallelize, *args
    )
Example #5
def load_data_from_dirs_siamese_tokenizer(
    data_dirs: List[Path],
    tokenizer: TokenizerRecordable,
    data_params: DatasetParams,
    parse_callback: Callable[[Path, DatasetParams, TokenizerRecordable], Tuple[str, int, Iterable[T_Single]]],
    max_files_per_dir: Optional[int] = None,
    parallelize: bool = True,
) -> Dict[str, Tuple[int, Iterable[T_Single]]]:
    return load_data_from_files(
        data_files=list(get_data_files_from_directory(data_dirs, max_files_per_dir)),
        data_params=data_params,
        tokenizer=tokenizer,
        parse_callback=parse_callback,
        parallelize=parallelize,
    )
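A call sketch for the wrapper above; the callback is a stand-in that only mirrors the signature demanded by the type hint, not a parser from the project, and tokenizer/data_params are assumed to exist already.

def my_parse_callback(
    path: Path, params: DatasetParams, tok: TokenizerRecordable
) -> Tuple[str, int, Iterable[T_Single]]:
    ...  # hypothetical: parse one file into (language, sample_count, samples)

samples_per_lang = load_data_from_dirs_siamese_tokenizer(
    data_dirs=[Path("resources/data/python/final/jsonl/train")],  # hypothetical path
    tokenizer=tokenizer,
    data_params=data_params,
    parse_callback=my_parse_callback,
    max_files_per_dir=2,      # e.g. cap files per directory while debugging
)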