def test_articles_to_jsonl(self):
    """Serialize PubMedArticle objects into a jsonl file."""
    # Annotation fixed: extract_articles returns a list of articles,
    # and `[PubMedArticle]` (a list literal) is not a valid type.
    articles: list[PubMedArticle] = ArticleSetParser.extract_articles(
        self.SAMPLE_ARTICLE_SET1_PATH)
    target_file_name = "test_articles.jsonl"
    target_path = os.path.join(get_test_output_path(), target_file_name)
    ArticleSetParser.articles_to_jsonl(articles, target_path)
    # Without this check the test passes even if nothing was written.
    assert os.path.exists(target_path)
def test_serialize_to_pipe_delimited(self):
    """Serialize PubMedArticle objects to a pipe-delimited csv file."""
    # Annotation fixed: extract_articles returns a list of articles,
    # and `[PubMedArticle]` (a list literal) is not a valid type.
    articles: list[PubMedArticle] = ArticleSetParser.extract_articles(
        self.SAMPLE_ARTICLE_SET1_PATH)
    target_file_name = "test_articles.csv"
    target_path = os.path.join(get_test_output_path(), target_file_name)
    ArticleSetParser.articles_to_pipe(articles, target_path)
    # Without this check the test passes even if nothing was written.
    assert os.path.exists(target_path)
def test_extract_articles(self):
    """Extract PubMed articles from a sample article set file."""
    # Annotation fixed: extract_articles returns a list of articles,
    # and `[PubMedArticle]` (a list literal) is not a valid type.
    articles: list[PubMedArticle] = ArticleSetParser.extract_articles(
        self.SAMPLE_ARTICLE_SET1_PATH)
    assert articles is not None
    # The sample set is expected to contain exactly two articles.
    assert len(articles) == 2
    assert isinstance(articles[0], PubMedArticle)
def parse_pubmed_article_set(in_path: str, out_path: str):
    """
    Convert a compressed PubMed XML article set into a gzipped jsonl file.

    Decompresses ``in_path`` (a ``.xml.gz`` article set) if needed, parses
    its articles, writes them as jsonl into ``out_path``, gzips the result,
    and deletes the intermediate files to free disk space.

    Arguments:
        in_path {str} -- absolute path to a compressed article set file
        out_path {str} -- absolute path to the directory for output
            jsonl.gz files
    """
    filename = os.path.basename(in_path)
    # Silently skip files that are not compressed article sets.
    if not is_xml_article_set(filename):
        return

    xml_file = in_path.replace(".gz", "")
    if not os.path.exists(xml_file):
        logging.info("Extracting %s to %s", in_path, xml_file)
        decompress_gz(in_path, xml_file)

    logging.info("Parsing %s", xml_file)
    # Annotation fixed: extract_articles returns a list of articles
    # (the original claimed a single PubMedArticle; len() is used below).
    article_list: list[PubMedArticle] = ArticleSetParser.extract_articles(
        xml_file)
    # Done with xml - delete to free up space
    os.remove(xml_file)

    output_file = os.path.join(out_path, filename.replace(".xml.gz", ".jsonl"))
    logging.info("Generating %s", output_file)
    ArticleSetParser.articles_to_jsonl(article_list, output_file)

    logging.info("Compressing file: %s", output_file)
    output_file_compressed = output_file + ".gz"
    # Stream the jsonl through gzip in 1 MiB chunks instead of reading the
    # whole file into memory (article sets can be large).
    with open(output_file, "rb") as jsonl_data, gzip.open(
            output_file_compressed, "wb") as out_compressed:
        for chunk in iter(lambda: jsonl_data.read(1 << 20), b""):
            out_compressed.write(chunk)
    # Done with parsed file - delete to free up space
    os.remove(output_file)

    logging.info(
        "PID: %s. File Processed: %s. Articles Processed %s",
        os.getpid(),
        output_file,
        len(article_list),
    )