def tokenize_wikimedia_jsonl(jsonl_in: pathlib.Path, jsonl_out: pathlib.Path, sub_process_count: int) -> None:
    """
    Tokenizes all the articles into the standard form: one sentence per line,
    with a blank line between paragraphs.

    Parameters
    ----------
    jsonl_in : pathlib.Path
        The JSONL containing all the articles
    jsonl_out : pathlib.Path
        The JSONL containing all the articles after tokenization
    sub_process_count : int
        The number of subprocesses used to transform from the in format to the out format
    """
    if jsonl_out.exists():
        jsonl_out.unlink()

    worker = mpb.EPTS(
        extract=_collect_articles, extract_args=(jsonl_in),
        transform=_tokenize_article,
        save=_save_articles_to_jsonl, save_args=(jsonl_out),
        worker_count=sub_process_count,
        show_progress=True)
    worker.start()
    worker.join()

def extract_csv_from_jsonl(jsonl_in: pathlib.Path, csv_out: pathlib.Path, extract: t.List[str], sub_process_count: int) -> None:
    """
    Extracts a `CSV` file from a `JSONL` file.

    Parameters
    ----------
    jsonl_in : pathlib.Path
        The JSONL containing all the documents
    csv_out : pathlib.Path
        The CSV file containing all the documents
    extract : List[str]
        The name(s) of the elements to extract
    sub_process_count : int
        The number of subprocesses used to transform from the in format to the out format
    """
    if csv_out.exists():
        csv_out.unlink()

    worker = mpb.EPTS(
        extract=u.list_jsonl_documents, extract_args=(jsonl_in),
        transform=_extract_document, transform_init=_passthrough, transform_init_args=(extract),
        save=_save_documents, save_args=(csv_out, extract),
        worker_count=sub_process_count,
        show_progress=True)
    worker.start()
    worker.join()

def combine_txt_to_jsonl(folder_in: pathlib.Path, jsonl_out: pathlib.Path, sub_process_count: int) -> None:
    """
    Combines a folder of `TXT` files into a single `JSONL` file.

    Parameters
    ----------
    folder_in : pathlib.Path
        Folder containing the source documents
    jsonl_out : pathlib.Path
        JSONL containing the aggregated corpus
    sub_process_count : int
        The number of subprocesses used to transform from the in format to the out format
    """
    if jsonl_out.exists():
        jsonl_out.unlink()

    worker = mpb.EPTS(
        extract=u.list_folder_documents, extract_args=(folder_in, u.is_txt_document),
        transform=_process_document,
        save=_save_documents_to_jsonl, save_args=(jsonl_out),
        worker_count=sub_process_count,
        show_progress=True)
    worker.start()
    worker.join()

def convert_txt(folder_in: pathlib.Path, folder_out: pathlib.Path, stem: str, max_lines: int, sub_process_count: int) -> None:
    """
    Converts a folder of `TXT` files into a folder of _bigger_ `TXT` files.

    Parameters
    ----------
    folder_in : pathlib.Path
        Folder containing the source documents
    folder_out : pathlib.Path
        The folder containing all the aggregated documents
    stem : str
        The output file's stem
    max_lines : int
        The maximum number of lines per output file
    sub_process_count : int
        The number of subprocesses used to transform from the in format to the out format
    """
    folder_out.mkdir(parents=True, exist_ok=True)

    worker = mpb.EPTS(
        extract=u.list_folder_documents, extract_args=(folder_in, u.is_txt_document),
        transform=_process_document,
        save=_save_documents, save_args=(folder_out, stem, max_lines),
        worker_count=sub_process_count,
        show_progress=True)
    worker.start()
    worker.join()

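# A minimal usage sketch, not called anywhere in the pipeline: it shows how the two
# TXT helpers above compose. The folder paths, the stem, and the line limit are
# illustrative assumptions, not values required by this module.
def _example_txt_helpers() -> None:
    docs = pathlib.Path('./docs_txt')  # hypothetical folder of small TXT files
    # batch the small TXT files into bigger files of at most 100,000 lines each
    convert_txt(docs, pathlib.Path('./docs_txt_batched'), stem='corpus', max_lines=100_000, sub_process_count=4)
    # aggregate the same source folder into a single JSONL corpus
    combine_txt_to_jsonl(docs, pathlib.Path('./corpus.jsonl'), sub_process_count=4)
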
def convert_jsonl(jsonl_in: pathlib.Path, jsonl_out: pathlib.Path, keep: t.List[str], sub_process_count: int) -> None:
    """
    Converts a `JSONL` file into a _smaller_ `JSONL` file by keeping only some elements.

    Parameters
    ----------
    jsonl_in : pathlib.Path
        The JSONL containing all the documents
    jsonl_out : pathlib.Path
        The JSONL file containing all the documents, keeping only the selected elements
    keep : List[str]
        The name(s) of the elements to keep
    sub_process_count : int
        The number of subprocesses used to transform from the in format to the out format
    """
    if jsonl_out.exists():
        jsonl_out.unlink()

    worker = mpb.EPTS(
        extract=u.list_jsonl_documents, extract_args=(jsonl_in),
        transform=_convert_document, transform_init=_passthrough, transform_init_args=(keep),
        save=_save_documents, save_args=(jsonl_out),
        worker_count=sub_process_count,
        show_progress=True)
    worker.start()
    worker.join()

def wikimedia_to_json(mediawiki_in: pathlib.Path, jsonl_out: pathlib.Path, sub_process_count: int) -> None:
    """
    Converts a Wikimedia dump file to a JSONL file containing all the articles minus any wiki markup.
    Articles that contain no text are removed, as are disambiguation articles.

    Parameters
    ----------
    mediawiki_in : pathlib.Path
        The XML dump file from Wikimedia
    jsonl_out : pathlib.Path
        JSONL containing all the Wikimedia articles
    sub_process_count : int
        The number of subprocesses used to transform from the in format to the out format
    """
    if jsonl_out.exists():
        jsonl_out.unlink()

    worker = mpb.EPTS(
        extract=_collect_articles, extract_args=(mediawiki_in),
        transform=_parse_article,
        save=_save_articles_to_jsonl, save_args=(jsonl_out),
        worker_count=sub_process_count,
        show_progress=True)
    worker.start()
    worker.join()

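# A minimal usage sketch of the Wikimedia path, assuming a locally downloaded XML dump:
# wikimedia_to_json strips the markup, tokenize_wikimedia_jsonl sentence-splits the result,
# and extract_csv_from_jsonl pulls out selected elements. The file names and the element
# names ('id', 'text') are illustrative assumptions.
def _example_wikimedia_pipeline() -> None:
    dump = pathlib.Path('./enwiki-pages-articles.xml')  # hypothetical Wikimedia dump
    articles = pathlib.Path('./wiki.articles.jsonl')
    sentences = pathlib.Path('./wiki.sentences.jsonl')
    wikimedia_to_json(dump, articles, sub_process_count=4)
    tokenize_wikimedia_jsonl(articles, sentences, sub_process_count=4)
    extract_csv_from_jsonl(sentences, pathlib.Path('./wiki.csv'), extract=['id', 'text'], sub_process_count=4)
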
def extract_txt_from_jsonl(jsonl_in: pathlib.Path, folder_out: pathlib.Path, id_element: str, extract: t.List[str], sub_process_count: int) -> None:
    """
    Extracts a folder of `TXT` files from a `JSONL` file.

    Parameters
    ----------
    jsonl_in : pathlib.Path
        The JSONL containing all the documents
    folder_out : pathlib.Path
        The folder containing all the documents after extraction
    id_element : str
        The name of the element to use as a file name
    extract : List[str]
        The name(s) of the elements to extract
    sub_process_count : int
        The number of subprocesses used to transform from the in format to the out format
    """
    folder_out.mkdir(parents=True, exist_ok=True)

    worker = mpb.EPTS(
        extract=u.list_jsonl_documents, extract_args=(jsonl_in),
        transform=_save_txt_document, transform_init=_passthrough, transform_init_args=(str(folder_out), id_element, extract),
        save=u.drain_iterator,
        worker_count=sub_process_count,
        show_progress=True)
    worker.start()
    worker.join()

def merge_json_folders(folders_in: t.List[pathlib.Path], folder_out: pathlib.Path, sub_process_count: int) -> None:
    """
    Merges _several_ folders of `JSON` files into a _single_ folder of `JSON` files based on their file name.

    Parameters
    ----------
    folders_in : List[pathlib.Path]
        Folders containing the documents to merge
    folder_out : pathlib.Path
        Folder containing the merged documents
    sub_process_count : int
        The number of subprocesses used to transform from the in format to the out format
    """
    folder_out.mkdir(parents=True, exist_ok=True)

    worker = mpb.EPTS(
        extract=u.list_merged_folder_documents, extract_args=(folders_in, u.is_json_document),
        transform=_merge_documents, transform_init=_passthrough, transform_init_args=(str(folder_out)),
        save=u.drain_iterator,
        worker_count=sub_process_count,
        show_progress=True)
    worker.start()
    worker.join()

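# A minimal usage sketch of the generic JSONL/JSON helpers above. The element names
# ('id', 'text') and the folder layout are illustrative assumptions; the real element
# names depend on how the corpus was produced.
def _example_jsonl_helpers() -> None:
    corpus = pathlib.Path('./corpus.jsonl')
    slim = pathlib.Path('./corpus.slim.jsonl')
    # keep only the elements of interest, producing a smaller JSONL file
    convert_jsonl(corpus, slim, keep=['id', 'text'], sub_process_count=4)
    # write one TXT file per document, named after its 'id' element
    extract_txt_from_jsonl(slim, pathlib.Path('./txt'), id_element='id', extract=['text'], sub_process_count=4)
    # merge per-run JSON folders into a single folder, matching documents by file name
    merge_json_folders([pathlib.Path('./run_a'), pathlib.Path('./run_b')], pathlib.Path('./merged'), sub_process_count=4)
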
def oas_to_jsonl(folder_in: pathlib.Path, jsonl_out: pathlib.Path, sub_process_count: int) -> None:
    """
    Converts the PMC OAS folders to a JSONL file containing all the articles minus any markup.
    Articles that contain no body are removed.

    Parameters
    ----------
    folder_in : pathlib.Path
        The root of the JATS folders
    jsonl_out : pathlib.Path
        JSONL containing all the PMC OAS articles
    sub_process_count : int
        The number of subprocesses used to transform from the in format to the out format
    """
    if jsonl_out.exists():
        jsonl_out.unlink()

    worker = mpb.EPTS(
        extract=_collect_articles, extract_args=(folder_in),
        transform=_parse_article_safe,
        save=_save_articles_to_jsonl, save_args=(jsonl_out),
        worker_count=sub_process_count,
        show_progress=True)
    worker.start()
    worker.join()

def tokenize_jsonl(jsonl_in: pathlib.Path, jsonl_out: pathlib.Path, id_element: str, tokens: t.List[t.Tuple[str, str]], sub_process_count: int) -> None:
    """
    Tokenizes all the files into the standard form: one sentence per line,
    with a blank line between paragraphs.

    Parameters
    ----------
    jsonl_in : pathlib.Path
        The JSONL containing all the files
    jsonl_out : pathlib.Path
        The JSONL containing all the files after tokenization
    id_element : str
        The name of the element used for correlation between processed files
    tokens : List[Tuple[str, str]]
        The element name pairs the tokenizer is run over
    sub_process_count : int
        The number of subprocesses used to transform from the in format to the out format
    """
    if jsonl_out.exists():
        jsonl_out.unlink()

    worker = mpb.EPTS(
        extract=_collect_documents, extract_args=(jsonl_in),
        transform=_tokenize_document, transform_init=_passthrough, transform_init_args=(id_element, tokens),
        save=_save_documents_to_jsonl, save_args=(jsonl_out),
        worker_count=sub_process_count,
        show_progress=True)
    worker.start()
    worker.join()

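# A minimal usage sketch of the PMC OAS path. Reading each pair in `tokens` as
# (source element, tokenized element) is an assumption based on the docstring above;
# _tokenize_document defines the actual contract. All names and paths are illustrative.
def _example_oas_pipeline() -> None:
    jats_root = pathlib.Path('./pmc_oas')  # hypothetical root of the JATS folders
    articles = pathlib.Path('./pmc.articles.jsonl')
    sentences = pathlib.Path('./pmc.sentences.jsonl')
    oas_to_jsonl(jats_root, articles, sub_process_count=4)
    tokenize_jsonl(
        articles, sentences,
        id_element='id',                      # assumed correlation element
        tokens=[('text', 'text_tokenized')],  # assumed (source, tokenized) element pair
        sub_process_count=4)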