Exemple #1
0
class TestArticlesSetParser:
    """Test ArticleSetParser, i.e. xml files holding an array of PubMedArticles."""

    SAMPLE_ARTICLE_SET1_NAME = "sample_articleset1.xml"
    SAMPLE_ARTICLE_SET1_PATH = os.path.join(
        get_resources_path(), SAMPLE_ARTICLE_SET1_NAME
    )

    def _extract_sample_articles(self) -> "list[PubMedArticle]":
        """Extract the articles of the sample article set used by every test."""
        return ArticleSetParser.extract_articles(self.SAMPLE_ARTICLE_SET1_PATH)

    @staticmethod
    def _fresh_output_path(file_name: str) -> str:
        """Return the test output path for *file_name*, removing any stale copy.

        A file left behind by a previous run would make the existence check
        performed after serialization pass even if nothing was written.
        """
        target_path = os.path.join(get_test_output_path(), file_name)
        try:
            os.remove(target_path)
        except FileNotFoundError:
            # Nothing to clean up: first run or output already cleared.
            pass
        return target_path

    def test_extract_articles(self):
        """Extract pubmed articles from article set."""
        articles = self._extract_sample_articles()
        assert articles is not None
        assert len(articles) == 2
        assert isinstance(articles[0], PubMedArticle)

    def test_articles_to_json(self):
        """Serialize pubmedarticle objects into json file."""
        articles = self._extract_sample_articles()
        target_path = self._fresh_output_path("test_articles.json")
        ArticleSetParser.articles_to_json(articles, target_path)
        # Without this check the test only proves no exception was raised.
        assert os.path.isfile(target_path)

    def test_articles_to_jsonl(self):
        """Serialize pubmedarticle objects into jsonl file."""
        articles = self._extract_sample_articles()
        target_path = self._fresh_output_path("test_articles.jsonl")
        ArticleSetParser.articles_to_jsonl(articles, target_path)
        assert os.path.isfile(target_path)

    def test_serialize_to_pipe_delimited(self):
        """Serialize pubmedarticle objects into pipe delimited csv file."""
        articles = self._extract_sample_articles()
        target_path = self._fresh_output_path("test_articles.csv")
        ArticleSetParser.articles_to_pipe(articles, target_path)
        assert os.path.isfile(target_path)
Exemple #2
0
 def scrape(self, chunksize: int, **kwargs) -> Generator:
     """Simulate scraping records by yielding csv text in chunks.

     Reads the file ``self.filename`` from the directory returned by
     ``get_resources_path()``. Each yielded string is the file's first line
     (the header) followed by up to ``chunksize`` subsequent lines.
     ``kwargs`` is not used.
     """
     source_path = os.path.join(get_resources_path(), self.filename)
     with open(source_path) as handle:
         # The first line is set aside so it can prefix every chunk
         header_line = handle.readline()
         while True:
             rows = []
             for _ in range(chunksize):
                 row = handle.readline()
                 if not row:
                     # readline() gives an empty string at end of file
                     break
                 rows.append(row)
             # No rows collected means the file is exhausted
             if not rows:
                 return
             yield header_line + "".join(rows)
Exemple #3
0
    def scrape(self, chunksize: int = 2, **kwargs) -> Generator:
        """Simulate scraping pubmed baseline and returning xml objs.

        Parses the xml file ``self.filename`` located in the directory
        returned by ``get_resources_path()`` and yields its ``PubmedArticle``
        child elements in lists of at most ``chunksize`` items. Elements are
        handed out starting from the end of the document.

        Args:
            chunksize: Maximum number of elements per yielded list. Values
                below 1 produce no chunks.
            **kwargs: Ignored by this implementation.

        Yields:
            Lists of ``PubmedArticle`` xml elements.
        """
        xml_file_path = os.path.join(get_resources_path(), self.filename)
        xml_root: ET.Element = ET.parse(xml_file_path).getroot()

        # Reverse once so that slicing front-to-back hands elements out
        # last-to-first, the order in which chunks have always been produced.
        remaining = xml_root.findall("PubmedArticle")[::-1]

        # range() rejects a zero step; a non-positive chunksize yields nothing.
        if chunksize < 1:
            return

        for start in range(0, len(remaining), chunksize):
            yield remaining[start : start + chunksize]
Exemple #4
0
def create_article(article_name: str) -> PubMedArticle:
    """Build a PubMedArticle from the named file in the resources directory."""
    resources_dir = get_resources_path()
    parsed_tree = ET.parse(os.path.join(resources_dir, article_name))
    # The article is constructed from the root element of the parsed file
    return PubMedArticle(parsed_tree.getroot())
Exemple #5
0
"""Test package configuration & yaml files."""
from pathlib import Path
import geniepy.config as config
from tests import get_resources_path

# Yaml file, kept in the resources directory, that the tests read config from
TEST_CONFIG_NAME = "testconfig.yaml"
TEST_CONFIG_PATH = Path(get_resources_path()).resolve() / TEST_CONFIG_NAME

# Point the config module at the test file instead of its default config path
config.CONFIG_FILE = TEST_CONFIG_PATH


def test_chunksize():
    """Check the chunksize reported by the config module."""
    # NOTE(review): 10 is presumably the value set in testconfig.yaml - confirm
    assert config.get_chunksize() == 10