Example #1
def tokenize_wikimedia_jsonl(jsonl_in: pathlib.Path, jsonl_out: pathlib.Path,
                             sub_process_count: int) -> None:
    """
    Tokenizes all the articles into the standard form: one sentence per line, with a blank line between paragraphs.

    Parameters
    ----------
    jsonl_in : pathlib.Path
        The JSONL containing all the articles
    jsonl_out : pathlib.Path
        The JSONL containing all the articles after tokenization
    sub_process_count : int
        The number of sub-processes used to transform from the in format to the out format
    """

    if jsonl_out.exists():
        jsonl_out.unlink()

    worker = mpb.EPTS(extract=_collect_articles,
                      extract_args=(jsonl_in,),
                      transform=_tokenize_article,
                      save=_save_articles_to_jsonl,
                      save_args=(jsonl_out,),
                      worker_count=sub_process_count,
                      show_progress=True)
    worker.start()
    worker.join()
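
A minimal usage sketch for the function above; the file names are hypothetical:

import pathlib

# Re-tokenize a dump of articles with four worker sub-processes.
# Note: the function first deletes jsonl_out if it already exists.
tokenize_wikimedia_jsonl(
    jsonl_in=pathlib.Path('articles.jsonl'),
    jsonl_out=pathlib.Path('articles.tokenized.jsonl'),
    sub_process_count=4)
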
Example #2
def extract_csv_from_jsonl(jsonl_in: pathlib.Path, csv_out: pathlib.Path,
                           extract: t.List[str],
                           sub_process_count: int) -> None:
    """
    Extracts a `CSV` file from a `JSONL` file. 

    Parameters
    ----------
    jsonl_in : pathlib.Path
        The JSONL containing all the documents
    csv_out : pathlib.Path
        The CSV file containing all the documents
    extract : List[str]
        The name(s) of the elements to extract
    sub_process_count : int
        The number of sub-processes used to transform from the in format to the out format
    """

    if csv_out.exists():
        csv_out.unlink()

    worker = mpb.EPTS(extract=u.list_jsonl_documents,
                      extract_args=(jsonl_in,),
                      transform=_extract_document,
                      transform_init=_passthrough,
                      transform_init_args=(extract,),
                      save=_save_documents,
                      save_args=(csv_out, extract),
                      worker_count=sub_process_count,
                      show_progress=True)
    worker.start()
    worker.join()
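
A hypothetical call; the element names 'id' and 'text' stand in for whatever keys the source JSONL actually carries:

import pathlib

# Extract the named elements from every document into the CSV.
extract_csv_from_jsonl(
    jsonl_in=pathlib.Path('documents.jsonl'),
    csv_out=pathlib.Path('documents.csv'),
    extract=['id', 'text'],
    sub_process_count=4)
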
Example #3
def combine_txt_to_jsonl(folder_in: pathlib.Path, jsonl_out: pathlib.Path,
                         sub_process_count: int) -> None:
    """
    Combines a folder of `TXT` files into a single `JSONL` file.

    Parameters
    ----------
    folder_in : pathlib.Path
        Folder containing the source documents
    jsonl_out : pathlib.Path
        JSONL containing the aggregated corpus
    sub_process_count : int
        The number of sub-processes used to transform from the in format to the out format
    """

    if jsonl_out.exists():
        jsonl_out.unlink()

    worker = mpb.EPTS(extract=u.list_folder_documents,
                      extract_args=(folder_in, u.is_txt_document),
                      transform=_process_document,
                      save=_save_documents_to_jsonl,
                      save_args=(jsonl_out,),
                      worker_count=sub_process_count,
                      show_progress=True)
    worker.start()
    worker.join()
Example #4
def convert_txt(folder_in: pathlib.Path, folder_out: pathlib.Path, stem: str,
                max_lines: int, sub_process_count: int) -> None:
    """
    Convert a folder of `TXT` files into a folder of _bigger_ `TXT` files.

    Parameters
    ----------
    folder_in : pathlib.Path
        Folder containing the source documents
    folder_out : pathlib.Path
        The folder containing all the aggregated documents
    stem : str
        The output file's stem
    max_lines : int
        The number of lines per file
    sub_process_count : int
        The number of sub-processes used to transform from the in format to the out format
    """

    folder_out.mkdir(parents=True, exist_ok=True)

    worker = mpb.EPTS(extract=u.list_folder_documents,
                      extract_args=(folder_in, u.is_txt_document),
                      transform=_process_document,
                      save=_save_documents,
                      save_args=(folder_out, stem, max_lines),
                      worker_count=sub_process_count,
                      show_progress=True)
    worker.start()
    worker.join()
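
A hypothetical call; per the docstring, stem names the output files and max_lines caps the lines per file:

import pathlib

# Roll a folder of small TXT files up into larger ones
# (the exact output naming scheme is assumed here).
convert_txt(
    folder_in=pathlib.Path('txt_small'),
    folder_out=pathlib.Path('txt_large'),
    stem='corpus',
    max_lines=100000,
    sub_process_count=4)
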
Example #5
def convert_jsonl(jsonl_in: pathlib.Path, jsonl_out: pathlib.Path,
                  keep: t.List[str], sub_process_count: int) -> None:
    """
    Converts a `JSONL` file into a _smaller_ `JSONL` file by keeping only some elements.

    Parameters
    ----------
    jsonl_in : pathlib.Path
        The JSONL containing all the documents
    jsonl_out : pathlib.Path
        The JSONL containing the documents after conversion
    keep : List[str]
        The name(s) of the elements to keep
    sub_process_count : int
        The number of sub-processes used to transform from the in format to the out format
    """

    if jsonl_out.exists():
        jsonl_out.unlink()

    worker = mpb.EPTS(extract=u.list_jsonl_documents,
                      extract_args=(jsonl_in,),
                      transform=_convert_document,
                      transform_init=_passthrough,
                      transform_init_args=(keep,),
                      save=_save_documents,
                      save_args=(jsonl_out,),
                      worker_count=sub_process_count,
                      show_progress=True)
    worker.start()
    worker.join()
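
A hypothetical call that keeps only two elements of each document:

import pathlib

# Shrink every JSONL document down to its 'id' and 'text' elements.
convert_jsonl(
    jsonl_in=pathlib.Path('documents.jsonl'),
    jsonl_out=pathlib.Path('documents.small.jsonl'),
    keep=['id', 'text'],
    sub_process_count=4)
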
Example #6
def wikimedia_to_json(mediawiki_in: pathlib.Path, jsonl_out: pathlib.Path,
                      sub_process_count: int) -> None:
    """
    Converts a Wikimedia dump file to a JSONL file containing all the articles minus any wiki markup.
    Articles that contain no text are removed.
    Disambiguation articles are removed.

    Parameters
    ----------
    mediawiki_in : pathlib.Path
        The XML dump file from Wikimedia
    jsonl_out : pathlib.Path
        The JSONL containing all the Wikimedia articles
    sub_process_count : int
        The number of sub-processes used to transform from the in format to the out format
    """

    if jsonl_out.exists():
        jsonl_out.unlink()

    worker = mpb.EPTS(extract=_collect_articles,
                      extract_args=(mediawiki_in,),
                      transform=_parse_article,
                      save=_save_articles_to_jsonl,
                      save_args=(jsonl_out,),
                      worker_count=sub_process_count,
                      show_progress=True)
    worker.start()
    worker.join()
Example #7
def extract_txt_from_jsonl(jsonl_in: pathlib.Path, folder_out: pathlib.Path,
                           id_element: str, extract: t.List[str],
                           sub_process_count: int) -> None:
    """
    Extracts a folder of `TXT` files from a `JSONL` file.

    Parameters
    ----------
    jsonl_in : pathlib.Path
        The JSONL containing all the documents
    folder_out : pathlib.Path
        The folder containing all the documents after being extracted
    id_element : str
        The name of the element to use as a file name
    extract : List[str]
        The name(s) of the elements to extract
    sub_process_count : int
        The number of sub-processes used to transform from the in format to the out format
    """

    folder_out.mkdir(parents=True, exist_ok=True)

    worker = mpb.EPTS(extract=u.list_jsonl_documents,
                      extract_args=(jsonl_in,),
                      transform=_save_txt_document,
                      transform_init=_passthrough,
                      transform_init_args=(str(folder_out), id_element,
                                           extract),
                      save=u.drain_iterator,
                      worker_count=sub_process_count,
                      show_progress=True)
    worker.start()
    worker.join()
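
A hypothetical call; id_element supplies each file's name and extract the elements written into it:

import pathlib

# One TXT file per JSONL document, named after its 'id' element.
extract_txt_from_jsonl(
    jsonl_in=pathlib.Path('documents.jsonl'),
    folder_out=pathlib.Path('txt'),
    id_element='id',
    extract=['text'],
    sub_process_count=4)
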
Example #8
def merge_json_folders(folders_in: t.List[pathlib.Path],
                       folder_out: pathlib.Path,
                       sub_process_count: int) -> None:
    """
    Merges _several_ folders of `JSON` files into a _single_ folder of `JSON` files based on their file name.

    Parameters
    ----------
    folders_in : List[pathlib.Path]
        Folders containing the documents to merge 
    folder_out : pathlib.Path
        Folder containing the merged documents
    sub_process_count : int
        The number of sub-processes used to transform from the in format to the out format
    """

    folder_out.mkdir(parents=True, exist_ok=True)

    worker = mpb.EPTS(extract=u.list_merged_folder_documents,
                      extract_args=(folders_in, u.is_json_document),
                      transform=_merge_documents,
                      transform_init=_passthrough,
                      transform_init_args=(str(folder_out),),
                      save=u.drain_iterator,
                      worker_count=sub_process_count,
                      show_progress=True)
    worker.start()
    worker.join()
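
A hypothetical call merging two folders; documents are matched on file name:

import pathlib

# JSON files sharing a name across both folders are merged into one.
merge_json_folders(
    folders_in=[pathlib.Path('json_a'), pathlib.Path('json_b')],
    folder_out=pathlib.Path('json_merged'),
    sub_process_count=4)
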
Example #9
def oas_to_jsonl(folder_in: pathlib.Path, jsonl_out: pathlib.Path,
                 sub_process_count: int) -> None:
    """
    Converts the PMC OAS folders to a JSONL file containing all the articles minus any markup.
    Articles that contain no body are removed.

    Parameters
    ----------
    folder_in : pathlib.Path
        The root of the JATS folders
    jsonl_out : pathlib.Path
        The JSONL containing all the PMC OAS articles
    sub_process_count : int
        The number of sub-processes used to transform from the in format to the out format
    """

    if jsonl_out.exists():
        jsonl_out.unlink()

    worker = mpb.EPTS(extract=_collect_articles,
                      extract_args=(folder_in,),
                      transform=_parse_article_safe,
                      save=_save_articles_to_jsonl,
                      save_args=(jsonl_out,),
                      worker_count=sub_process_count,
                      show_progress=True)
    worker.start()
    worker.join()
Example #10
def tokenize_jsonl(jsonl_in: pathlib.Path, jsonl_out: pathlib.Path,
                   id_element: str, tokens: t.List[t.Tuple[str, str]],
                   sub_process_count: int) -> None:
    """
    Tokenizes all the files into the standard form: one sentence per line, with a blank line between paragraphs.

    Parameters
    ----------
    jsonl_in : pathlib.Path
        The JSONL containing all the files
    jsonl_out : pathlib.Path
        The JSONL containing all the files after tokenization
    id_element : str
        The name of the element used for correlation between processed files
    tokens : List[Tuple[str, str]]
        The element pairs to run the tokenizer over
    sub_process_count : int
        The number of sub-processes used to transform from the in format to the out format
    """

    if jsonl_out.exists():
        jsonl_out.unlink()

    worker = mpb.EPTS(extract=_collect_documents,
                      extract_args=(jsonl_in,),
                      transform=_tokenize_document,
                      transform_init=_passthrough,
                      transform_init_args=(id_element, tokens),
                      save=_save_documents_to_jsonl,
                      save_args=(jsonl_out,),
                      worker_count=sub_process_count,
                      show_progress=True)
    worker.start()
    worker.join()
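
A hypothetical call; the docstring leaves the shape of each (str, str) pair open, so reading it as (source element, destination element) is an assumption:

import pathlib

# Assumption: tokenize the 'text' element and store the result under
# 'text_tokenized', correlating documents by their 'id' element.
tokenize_jsonl(
    jsonl_in=pathlib.Path('documents.jsonl'),
    jsonl_out=pathlib.Path('documents.tokenized.jsonl'),
    id_element='id',
    tokens=[('text', 'text_tokenized')],
    sub_process_count=4)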