Example #1
0
 def test_articles_to_jsonl(self):
     """Serialize parsed PubMedArticle objects into a jsonl file.

     Parses the sample article set, writes it out with
     ``ArticleSetParser.articles_to_jsonl`` and checks that the output
     file was actually produced.
     """
     articles: list[PubMedArticle] = ArticleSetParser.extract_articles(
         self.SAMPLE_ARTICLE_SET1_PATH)
     target_file_name = "test_articles.jsonl"
     target_path = os.path.join(get_test_output_path(), target_file_name)
     # Drop any leftover from a previous run so the assertion below
     # cannot pass on a stale file.
     if os.path.exists(target_path):
         os.remove(target_path)
     ArticleSetParser.articles_to_jsonl(articles, target_path)
     assert os.path.isfile(target_path)
Example #2
0
 def test_serialize_to_pipe_delimited(self):
     """Write the sample articles out through ``articles_to_pipe``.

     The destination is ``test_articles.csv`` inside the test output
     directory. No assertion is made on the written content; the test
     only fails if parsing or serialization raises.
     """
     parsed: list[PubMedArticle] = ArticleSetParser.extract_articles(
         self.SAMPLE_ARTICLE_SET1_PATH
     )
     output_dir = get_test_output_path()
     destination = os.path.join(output_dir, "test_articles.csv")
     ArticleSetParser.articles_to_pipe(parsed, destination)
Example #3
0
 def test_extract_articles(self):
     """Extract articles from the sample set; verify count and type.

     The sample article set is expected to yield exactly two entries,
     the first of which must be a ``PubMedArticle``.
     """
     parsed: list[PubMedArticle] = ArticleSetParser.extract_articles(
         self.SAMPLE_ARTICLE_SET1_PATH
     )
     assert parsed is not None
     article_count = len(parsed)
     assert article_count == 2
     first_article = parsed[0]
     assert isinstance(first_article, PubMedArticle)
Example #4
0
def parse_pubmed_article_set(in_path: str, out_path: str):
    """
    Convert one compressed xml article set into a compressed jsonl file.

    Decompresses the article set (unless the xml is already on disk),
    parses it, writes the articles as jsonl into out_path and gzips the
    result. The intermediate xml file and the uncompressed jsonl file are
    deleted once they are no longer needed. Files whose name is rejected by
    is_xml_article_set are skipped without doing anything.

    Arguments:
        in_path {str} -- path to a single gzip-compressed xml article set
        out_path {str} -- path to the directory in which to save the
            resulting ".jsonl.gz" file
    """
    # Local import: only needed for the streaming copy further down.
    import shutil

    filename = os.path.basename(in_path)
    if not is_xml_article_set(filename):
        return

    # Strip only the trailing ".gz"; a blanket str.replace would also mangle
    # any ".gz" that appears in a directory name or mid-filename.
    gz_suffix = ".gz"
    if in_path.endswith(gz_suffix):
        xml_file = in_path[: -len(gz_suffix)]
    else:
        # NOTE(review): here xml_file is in_path itself, so the os.remove
        # below deletes the input file -- confirm is_xml_article_set only
        # accepts ".gz" names.
        xml_file = in_path

    if not os.path.exists(xml_file):
        logging.info("Extracting %s to %s", in_path, xml_file)
        decompress_gz(in_path, xml_file)

    logging.info("Parsing %s", xml_file)
    article_list: list[PubMedArticle] = ArticleSetParser.extract_articles(xml_file)

    # Done with xml - delete to free up space
    os.remove(xml_file)

    output_file = os.path.join(out_path, filename.replace(".xml.gz", ".jsonl"))
    logging.info("Generating %s", output_file)
    ArticleSetParser.articles_to_jsonl(article_list, output_file)

    logging.info("Compressing file: %s", output_file)
    output_file_compressed = output_file + ".gz"
    # Stream the jsonl into the gzip file in chunks instead of loading the
    # whole file (and its compressed copy) into memory at once.
    with open(output_file, "rb") as jsonl_data, gzip.open(
        output_file_compressed, "wb"
    ) as out_compressed:
        shutil.copyfileobj(jsonl_data, out_compressed)

    # Done with parsed file - delete to free up space
    os.remove(output_file)

    logging.info(
        "PID: %s. File Processed: %s. Articles Processed %s",
        os.getpid(),
        output_file,
        len(article_list),
    )