def test_get_summary(monkeypatch, req, tmpdir):
    """Exercise get_summary with the cache flag off, on, and on again.

    The test asserts that no cache file exists after a call with the flag
    off, that one exists after a call with the flag on, and that a later
    call with the flag on still yields the first body even though the mocked
    URL has been re-registered with different text.
    """
    summary_url = 'https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'

    # Point core at a per-test cache directory.
    cache_dir = tmpdir.mkdir('cache')
    monkeypatch.setattr(core, 'CACHE_DIR', str(cache_dir))
    cache_file = cache_dir.join('refseq_bacteria_assembly_summary.txt')

    def fetch(use_cache):
        # Only the last argument differs between the calls in this test.
        return core.get_summary('refseq', 'bacteria', NgdConfig.get_default('uri'), use_cache)

    req.get(summary_url, text='test')

    summary = fetch(False)
    assert summary.read() == 'test'
    assert not cache_file.check()

    summary = fetch(True)
    assert summary.read() == 'test'
    assert cache_file.check()

    # Change what the mocked URL serves; the expected body stays 'test'.
    req.get(summary_url, text='never read')
    summary = fetch(True)
    assert summary.read() == 'test'
def test_filter_entries():
    """Check filter_entries with a default config and with one accession set."""
    config = NgdConfig()

    summary_path = _get_file('assembly_status.txt')
    with open(summary_path, 'r') as handle:
        parsed = list(core.parse_summary(handle))

    # A freshly constructed config is expected to let every entry through.
    unfiltered = core.filter_entries(parsed, config)
    assert unfiltered == parsed

    # With this accession set, only the last parsed entry is expected back.
    last_only = parsed[-1:]
    config.assembly_accessions = "GCF_000203835.1"
    filtered = core.filter_entries(parsed, config)
    assert filtered == last_only
def prepare_create_downloadjob(req, tmpdir, format_map=NgdConfig._FORMATS, human_readable=False,
                               create_local_file=False):
    """Build a fake entry, config and expected download jobs for download-job tests.

    For every ``key, val`` pair in ``format_map`` a file named ``fake<val>``
    is written into ``tmpdir`` with ``key`` as its content, its URL under
    ``https://fake/genomes/FAKE0.1/`` is registered on ``req``, and a matching
    ``core.DownloadJob`` is recorded. Finally an ``md5checksums.txt`` URL
    listing all checksums is registered.

    :param req: object whose ``get(url, text=...)`` registers a fake response
        (presumably a requests-mock fixture -- confirm against conftest)
    :param tmpdir: py.path-style temporary directory
    :param format_map: mapping iterated with ``.items()``
    :param human_readable: stored on the config; when true each job also gets
        a symlink path under ``human_readable/``
    :param create_local_file: when true, the file content is also written to
        the job's local target path
    :return: tuple ``(entry, config, download_jobs)``
    """
    entry = {
        'assembly_accession': 'FAKE0.1',
        'organism_name': 'Example species',
        'infraspecific_name': 'strain=ABC 1234',
        'ftp_path': 'https://fake/genomes/FAKE0.1',
    }

    config = NgdConfig()
    outdir = tmpdir.mkdir('output')
    config.output = str(outdir)
    config.human_readable = human_readable

    download_jobs = []
    checksum_lines = []
    for content, ending in format_map.items():
        source_file = tmpdir.join('fake{}'.format(ending))
        source_file.write(content)
        digest = core.md5sum(str(source_file))
        name = path.basename(str(source_file))

        url = 'https://fake/genomes/FAKE0.1/{}'.format(name)
        target = outdir.join('refseq', 'bacteria', 'FAKE0.1', name)
        if create_local_file:
            # ensure=True creates the missing parent directories.
            target.write(source_file.read(), ensure=True)

        link = (
            str(outdir.join('human_readable', 'refseq', 'bacteria', 'Example', 'species',
                            'ABC_1234', name))
            if human_readable else None
        )

        download_jobs.append(core.DownloadJob(url, str(target), digest, link))
        checksum_lines.append('{}\t./{}\n'.format(digest, name))
        req.get(url, text=source_file.read())

    req.get('https://fake/genomes/FAKE0.1/md5checksums.txt', text=''.join(checksum_lines))

    return entry, config, download_jobs
def test_get_summary(monkeypatch, req, tmpdir):
    """Verify get_summary's results and cache-file handling across three calls.

    Call 1 passes ``False`` as the last argument and must leave no cache file.
    Call 2 passes ``True`` and must leave a cache file behind. Call 3 passes
    ``True`` after the mocked URL's body was replaced and must still read
    ``'test'``.
    """
    url = 'https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'
    summary_args = ('refseq', 'bacteria')

    cache_dir = tmpdir.mkdir('cache')
    monkeypatch.setattr(core, 'CACHE_DIR', str(cache_dir))
    cache_file = cache_dir.join('refseq_bacteria_assembly_summary.txt')

    req.get(url, text='test')

    # Call 1: last argument False.
    handle = core.get_summary(*summary_args, NgdConfig.get_default('uri'), False)
    assert handle.read() == 'test'
    assert not cache_file.check()

    # Call 2: last argument True.
    handle = core.get_summary(*summary_args, NgdConfig.get_default('uri'), True)
    assert handle.read() == 'test'
    assert cache_file.check()

    # Call 3: the URL now serves different text, yet 'test' is still expected.
    req.get(url, text='never read')
    handle = core.get_summary(*summary_args, NgdConfig.get_default('uri'), True)
    assert handle.read() == 'test'
def test_get_summary_error_handling(monkeypatch, mocker, req, tmpdir):
    """Expect get_summary to raise OSError when os.makedirs fails."""
    summary_url = 'https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'

    # The cache path is only joined here, not created with mkdir --
    # presumably so get_summary has to create it; confirm against core.
    cache_dir = tmpdir.join('cache')
    monkeypatch.setattr(core, 'CACHE_DIR', str(cache_dir))

    req.get(summary_url, text='test')

    # Replace os.makedirs with a mock that always raises a permission error.
    permission_denied = OSError(13, "Permission denied")
    fake_makedirs = mocker.MagicMock(side_effect=permission_denied)
    monkeypatch.setattr(os, 'makedirs', fake_makedirs)

    with pytest.raises(OSError):
        core.get_summary('refseq', 'bacteria', NgdConfig.get_default('uri'), True)
def prepare_create_downloadjob(req, tmpdir, format_map=NgdConfig._FORMATS, human_readable=False,
                               create_local_file=False):
    """Set up the test environment shared by the download-job tests.

    Writes one ``fake<val>`` file per ``format_map`` item into ``tmpdir``
    (content is the item's key), registers a fake response for each file URL
    and for ``md5checksums.txt`` on ``req``, and collects the
    ``core.DownloadJob`` objects built from those files.

    :param req: provides ``get(url, text=...)`` to register fake responses
        (presumably a requests-mock fixture -- confirm against conftest)
    :param tmpdir: py.path-style temporary directory
    :param format_map: mapping iterated with ``.items()``
    :param human_readable: copied onto the config; enables symlink paths
    :param create_local_file: also write each file to its local target path
    :return: ``(entry, config, download_jobs)``
    """
    base_url = 'https://fake/genomes/FAKE0.1'

    def symlink_for(outdir, filename):
        # Symlink paths are only produced when human_readable is requested.
        if not human_readable:
            return None
        return str(
            outdir.join('human_readable', 'refseq', 'bacteria', 'Example', 'species',
                        'ABC_1234', filename))

    entry = {
        'assembly_accession': 'FAKE0.1',
        'organism_name': 'Example species',
        'infraspecific_name': 'strain=ABC 1234',
        'ftp_path': 'https://fake/genomes/FAKE0.1',
    }

    config = NgdConfig()
    outdir = tmpdir.mkdir('output')
    config.output = str(outdir)
    config.human_readable = human_readable

    download_jobs = []
    checksum_rows = []

    for fmt_key, fmt_ending in format_map.items():
        seqfile = tmpdir.join('fake{}'.format(fmt_ending))
        seqfile.write(fmt_key)

        md5 = core.md5sum(str(seqfile))
        filename = path.basename(str(seqfile))
        full_url = 'https://fake/genomes/FAKE0.1/{}'.format(filename)
        local_file = outdir.join('refseq', 'bacteria', 'FAKE0.1', filename)

        if create_local_file:
            local_file.write(seqfile.read(), ensure=True)

        job = core.DownloadJob(full_url, str(local_file), md5, symlink_for(outdir, filename))
        download_jobs.append(job)

        checksum_rows.append('{}\t./{}\n'.format(md5, filename))
        req.get(full_url, text=seqfile.read())

    # One checksum listing covering every fake file registered above.
    req.get(base_url + '/md5checksums.txt', text=''.join(checksum_rows))

    return entry, config, download_jobs
def test_get_name_and_checksum():
    """Check get_name_and_checksum against two checksum listings.

    One listing uses short file names and the other uses long names that
    contain extra dots and underscores. For each listing the 'genbank',
    'fasta' and 'cds-fasta' file endings must select the expected entry.
    """
    regular_filenames = (
        {'checksum': 'd76c643ec4bbc34d2935eb0664156d99',
         'file': 'GCF_000009605.1_ASM960v1_cds_from_genomic.fna.gz'},
        {'checksum': '42c1bb1447aea2512a17aeb3645b55e9',
         'file': 'GCF_000009605.1_ASM960v1_genomic.fna.gz'},
        {'checksum': '8a685d49d826c4f0ad05152e906f3250',
         'file': 'GCF_000009605.1_ASM960v1_genomic.gbff.gz'},
        {'checksum': 'e2d9e1cfa085cb462a73d3d2d2c22be5',
         'file': 'GCF_000009605.1_ASM960v1_genomic.gff.gz'},
    )

    weird_filenames = (
        {'checksum': '4d5f39ceb7e113ad461f8370aaac4e41',
         'file': 'GCF_003583405.1_CHULA_Jazt_1.1_for_version_1.1_of_the_Jishengella_sp._nov._AZ1-13_genome_from_a_lab_in_CHULA_cds_from_genomic.fna.gz'},
        {'checksum': 'e77c1e8bf0df2c353ce6a4899ae0cb5e',
         'file': 'GCF_003583405.1_CHULA_Jazt_1.1_for_version_1.1_of_the_Jishengella_sp._nov._AZ1-13_genome_from_a_lab_in_CHULA_genomic.fna.gz'},
        {'checksum': 'c93ba924075c8b22210ac283d41207ad',
         'file': 'GCF_003583405.1_CHULA_Jazt_1.1_for_version_1.1_of_the_Jishengella_sp._nov._AZ1-13_genome_from_a_lab_in_CHULA_genomic.gbff.gz'},
        {'checksum': 'd8394d0aff594ae962c88e1192238413',
         'file': 'GCF_003583405.1_CHULA_Jazt_1.1_for_version_1.1_of_the_Jishengella_sp._nov._AZ1-13_genome_from_a_lab_in_CHULA_rna_from_genomic.fna.gz'},
    )

    # (format name, index of the listing entry that format must select)
    expected_hits = (('genbank', 2), ('fasta', 1), ('cds-fasta', 0))

    # Each case: (checksum listing, file ending to look for, expected entry).
    cases = tuple(
        (listing, NgdConfig.get_fileending(fmt), listing[position])
        for listing in (regular_filenames, weird_filenames)
        for fmt, position in expected_hits
    )

    for listing, ending, wanted in cases:
        filename, checksum = core.get_name_and_checksum(listing, ending)
        assert filename == wanted['file']
        assert checksum == wanted['checksum']
"-n", "name", help="input the phylum name or other. use ; to separate multiple ") @click.option( "-t", "taxons", help= "input the taxon id. It will retrieve all the genomes desceding to the provided taxon; to separate multiple " ) @click.option( "-F", "formats", help='Which formats to download (default: %(default)s).' 'A comma-separated list of formats is also possible. For example: "fasta,assembly-report". ' 'Choose from: {choices}'.format( choices=NgdConfig.get_choices('file_formats')), default='fasta') @click.option( "-o", "odir", help= f"Create output hierarchy in specified folder (default: {NgdConfig.get_default('output')})", default=NgdConfig.get_default('output')) @click.option("-size", "size_of_batch", help=f"The size of each batch.", default=20) @click.option("-p", "parallel", help=f"Run N downloads in parallel (default: 10)", default=5)