class TestArticlesSetParser:
    """Test ArticleSetParser. i.e. xml files with array of PubMedArticles."""

    SAMPLE_ARTICLE_SET1_NAME = "sample_articleset1.xml"
    SAMPLE_ARTICLE_SET1_PATH = os.path.join(
        get_resources_path(), SAMPLE_ARTICLE_SET1_NAME
    )

    def _parse_sample_set(self):
        """Parse the sample article-set fixture into PubMedArticle objects."""
        return ArticleSetParser.extract_articles(self.SAMPLE_ARTICLE_SET1_PATH)

    @staticmethod
    def _output_path(file_name):
        """Build a target path inside the test output directory."""
        return os.path.join(get_test_output_path(), file_name)

    def test_extract_articles(self):
        """Extract pubmed articles from article set."""
        articles = self._parse_sample_set()
        assert articles is not None
        assert len(articles) == 2
        assert isinstance(articles[0], PubMedArticle)

    def test_articles_to_json(self):
        """Serialize pubmedarticle object into json file."""
        articles = self._parse_sample_set()
        ArticleSetParser.articles_to_json(
            articles, self._output_path("test_articles.json")
        )

    def test_articles_to_jsonl(self):
        """Serialize pubmedarticles objects into jsonl file."""
        articles = self._parse_sample_set()
        ArticleSetParser.articles_to_jsonl(
            articles, self._output_path("test_articles.jsonl")
        )

    def test_serialize_to_pipe_delimited(self):
        """Serialize pubmedarticles to csv file."""
        articles = self._parse_sample_set()
        ArticleSetParser.articles_to_pipe(
            articles, self._output_path("test_articles.csv")
        )
def scrape(self, chunksize: int, **kwargs) -> Generator:
    """Simulate scraping records and returning csv str."""
    csv_path = os.path.join(get_resources_path(), self.filename)
    with open(csv_path) as handle:
        # The header line is re-attached to every chunk that is yielded.
        header = handle.readline()
        while True:
            rows = []
            for _ in range(chunksize):
                row = handle.readline()
                if not row:
                    # End of file reached mid-chunk; keep what we have.
                    break
                rows.append(row)
            if not rows:
                # No rows left at all -> generator is exhausted.
                return
            yield header + "".join(rows)
def scrape(self, chunksize: int = 2, **kwargs) -> Generator:
    """Simulate scraping pubmed baseline and returning xml objs.

    Yields lists of up to ``chunksize`` ``PubmedArticle`` XML elements
    parsed from the resource file named by ``self.filename``, until the
    elements are exhausted.
    """
    xml_file_path = os.path.join(get_resources_path(), self.filename)
    xml_root: ET.Element = ET.parse(xml_file_path).getroot()
    xml_list = xml_root.findall("PubmedArticle")
    # Fix: the original also built a `pubmed_articles` list of PubMedArticle
    # objects here that was never used or yielded — dead work, removed.
    # NOTE(review): pop() consumes from the end, so chunks come out in
    # reverse document order — preserved from the original; confirm callers
    # don't depend on document order.
    while True:
        articles_chunk = []
        for _ in range(chunksize):
            # Append articles to create chunk sized array
            if xml_list:
                articles_chunk.append(xml_list.pop())
        # If no more articles return
        if not articles_chunk:
            return
        # If still articles, yield to generator
        yield articles_chunk
def create_article(article_name: str) -> PubMedArticle:
    """Create the article object from a resource xml file."""
    resource_path = os.path.join(get_resources_path(), article_name)
    root_element = ET.parse(resource_path).getroot()
    return PubMedArticle(root_element)
"""Test package configuration & yaml files.""" from pathlib import Path import geniepy.config as config from tests import get_resources_path TEST_CONFIG_NAME = "testconfig.yaml" TEST_CONFIG_PATH = Path(get_resources_path()).resolve().joinpath(TEST_CONFIG_NAME) # Override default config path for tests config.CONFIG_FILE = TEST_CONFIG_PATH def test_chunksize(): """Test retrieving chunksize.""" expected = 10 actual = config.get_chunksize() assert actual == expected