def test_decompress(self):
    """Download ``self.remote_1`` and check that ``file`` reports it as ascii.

    The remote file is fetched into a temporary ``gtf.txt``, the ``file``
    utility is run on it through ``run_child`` and the line it prints is
    reduced to a type by ``parse_file_line``; the test passes when that
    type is ``'ascii'``.
    """
    out_file = os.path.join(tempfile.gettempdir(), 'gtf.txt')
    try:
        ftp_down(self.remote_1, out_file)
        ftl = run_child('file %s' % out_file)
    finally:
        # Clean up even when the download or the child process fails, so a
        # broken run does not leave a stale file in the temp directory.
        try:
            os.remove(out_file)
        except FileNotFoundError:
            # Nothing was written; do not mask the original error.
            pass
    ft = parse_file_line(ftl)
    self.assertEqual(ft, 'ascii')
def fetch_bovine():
    """Fetch the Bos taurus UMD 3.1.1 genome and its gff3 annotation.

    Works inside ``DB_DIR/bovine/fasta`` (both levels are created when
    missing and the process working directory is moved there). Every
    chromosome is downloaded into one fasta file, which is then compressed
    with ``bgzip``; finally the gff3 annotation is downloaded.
    """
    target_dir = os.path.join(DB_DIR, 'bovine')
    # Enter <target_dir> first, then its 'fasta' subdirectory (relative path).
    for directory in (target_dir, 'fasta'):
        try:
            os.mkdir(directory)
        except FileExistsError:
            pass
        os.chdir(directory)

    # chr1 ... chr29 followed by the extra sequences; chrY is missing.
    autosomes = ['chr%d' % number for number in range(1, 30)]
    chromosomes = autosomes + ['chrMT', 'chrX', 'unplaced']

    logging.info('Downloading bovine genome')
    local_file_name = os.path.join(target_dir, 'fasta', 'bt_ref_Bos_taurus_UMD_3.1.1.fasta')
    # All chromosomes go to the same file, so start without a stale copy.
    if os.path.exists(local_file_name):
        os.remove(local_file_name)

    url_template = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/Bos_taurus/Assembled_chromosomes/seq/bt_ref_Bos_taurus_UMD_3.1.1_%s.fa.gz'
    for chrom in chromosomes:
        logging.debug('Downloading bovine chromosome %s', chrom)
        ftp_down(url_template % chrom, local_file_name).close()
        logging.debug('Downloaded bovine chromosome %s', chrom)
    run_child('bgzip %s' % local_file_name)

    logging.info('Downloading gff annotation file')
    gff3_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/Bos_taurus/GFF/ref_Bos_taurus_UMD_3.1.1_top_level.gff3.gz'
    ftp_down(gff3_url).close()
def test_append(self):
    """Download ``self.remote_2`` twice into one file; lines must double.

    Any existing ``README.TXT`` in the temp directory is removed first so
    the first count reflects a single download.
    """
    out_file = os.path.join(tempfile.gettempdir(), 'README.TXT')
    try:
        os.remove(out_file)
    except FileNotFoundError:
        pass

    # Count the lines in the file after each of two consecutive downloads.
    line_counts = []
    for _ in range(2):
        ftp_down(self.remote_2, out_file)
        with open(out_file) as handle:
            line_counts.append(len(handle.readlines()))

    after_first, after_second = line_counts
    self.assertEqual(after_second, 2 * after_first)
def fetch_viral(viral_mode):
    """Download the viral nucleotide or protein database and the taxonomy.

    Args:
        viral_mode: ``'n'`` to download nuccore sequences into
            ``DB_DIR/viral_nuccore`` or ``'p'`` to download protein
            sequences into ``DB_DIR/viral_protein``.

    Raises:
        ValueError: if ``viral_mode`` is neither ``'n'`` nor ``'p'``.
        AssertionError: if the accessions found in the fasta file and in
            the accession/taxid table differ; the message carries the
            symmetric difference of the two sets.

    Note:
        The process working directory is changed, first to the target
        directory and then to ``DB_DIR``.
    """
    # define the search nuccore/protein
    if viral_mode == 'n':
        logging.info('downloading viral nuccore sequences')
        target_dir = os.path.join(DB_DIR, 'viral_nuccore')
        cml_search = viral_query('n')
    elif viral_mode == 'p':
        logging.info('downloading viral protein sequences')
        target_dir = os.path.join(DB_DIR, 'viral_protein')
        cml_search = viral_query('p')
    else:
        # Fail early with a clear message instead of an UnboundLocalError
        # on target_dir further down.
        raise ValueError("viral_mode must be 'n' or 'p', got %r" % (viral_mode,))

    # run the search and download
    os.chdir(target_dir)
    run_child(cml_search)
    # NOTE(review): the search command presumably writes `ncbi_search`,
    # which the efetch commands below read from - confirm in viral_query.
    cml_fetch_fasta = 'efetch -format fasta < ncbi_search > viral_database.fasta'
    run_child(cml_fetch_fasta)
    cml_efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
    cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
    run_child(cml_efetch_xtract)
    logging.info('downloaded viral seqs info in %s', target_dir)

    logging.info('saving viral taxonomy')
    # viral_seqs_info.tsv contains Accn TaxId
    cml = 'cut -f 1,2 viral_seqs_info.tsv > viral_accn_taxid.dmp'
    run_child(cml)

    # Sanity check: both files must list exactly the same accessions.
    accs_1 = set(get_accs('viral_database.fasta'))
    with open('viral_accn_taxid.dmp') as dmp_handle:
        # Blank lines carry no accession and are skipped.
        accs_2 = {line.split()[0] for line in dmp_handle if line.strip()}
    assert accs_1 == accs_2, accs_1 ^ accs_2
    logging.info('taxonomy and fasta sequences match')

    os.chdir(DB_DIR)
    logging.info('downloading taxonomy databases')
    download_handle = ftp_down(
        'ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdb.tar.gz')
    os.remove('taxdb.tar.gz')
    download_handle = ftp_down(
        'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdump.tar.gz')
    # Drop the archive and the extracted files listed here; a missing file
    # is only reported, not treated as an error.
    for ftd in [
            'taxdump.tar.gz', 'merged.dmp', 'gencode.dmp', 'division.dmp',
            'delnodes.dmp', 'citations.dmp'
    ]:
        try:
            os.remove(ftd)
        except OSError:
            logging.warning('Could not find file %s', ftd)
def fetch_viral(viral_mode):
    """Download the viral nucleotide or protein database and the taxonomy.

    Args:
        viral_mode: ``'n'`` to download nuccore sequences into
            ``DB_DIR/viral_nuccore`` or ``'p'`` to download protein
            sequences into ``DB_DIR/viral_protein``.

    Raises:
        ValueError: if ``viral_mode`` is neither ``'n'`` nor ``'p'``.
        AssertionError: if the accessions found in the fasta file and in
            the accession/taxid table differ; the message carries the
            symmetric difference of the two sets.

    Note:
        The process working directory is changed, first to the target
        directory and then to ``DB_DIR``.
    """
    # define the search nuccore/protein
    if viral_mode == 'n':
        logging.info('downloading viral nuccore sequences')
        target_dir = os.path.join(DB_DIR, 'viral_nuccore')
        cml_search = viral_query('n')
    elif viral_mode == 'p':
        logging.info('downloading viral protein sequences')
        target_dir = os.path.join(DB_DIR, 'viral_protein')
        cml_search = viral_query('p')
    else:
        # Fail early with a clear message instead of an UnboundLocalError
        # on target_dir further down.
        raise ValueError("viral_mode must be 'n' or 'p', got %r" % (viral_mode,))

    # run the search and download
    os.chdir(target_dir)
    run_child(cml_search)
    # NOTE(review): the search command presumably writes `ncbi_search`,
    # which the efetch commands below read from - confirm in viral_query.
    cml_fetch_fasta = 'efetch -format fasta < ncbi_search > viral_database.fasta'
    run_child(cml_fetch_fasta)
    cml_efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
    cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
    run_child(cml_efetch_xtract)
    logging.info('downloaded viral seqs info in %s', target_dir)

    logging.info('saving viral taxonomy')
    # viral_seqs_info.tsv contains Accn TaxId
    cml = 'cut -f 1,2 viral_seqs_info.tsv > viral_accn_taxid.dmp'
    run_child(cml)

    # Sanity check: both files must list exactly the same accessions.
    accs_1 = set(get_accs('viral_database.fasta'))
    with open('viral_accn_taxid.dmp') as dmp_handle:
        # Blank lines carry no accession and are skipped.
        accs_2 = {line.split()[0] for line in dmp_handle if line.strip()}
    assert accs_1 == accs_2, accs_1 ^ accs_2
    logging.info('taxonomy and fasta sequences match')

    os.chdir(DB_DIR)
    logging.info('downloading taxonomy databases')
    download_handle = ftp_down('ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdb.tar.gz')
    os.remove('taxdb.tar.gz')
    download_handle = ftp_down('ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdump.tar.gz')
    # Drop the archive and the extracted files listed here; a missing file
    # is only reported, not treated as an error.
    for ftd in ['taxdump.tar.gz', 'merged.dmp', 'gencode.dmp', 'division.dmp', 'delnodes.dmp', 'citations.dmp']:
        try:
            os.remove(ftd)
        except OSError:
            logging.warning('Could not find file %s', ftd)
def fetch_bovine():
    """Fetch the Bos taurus ARS-UCD1.2 genome and its gff annotation.

    Works inside ``DB_DIR/bovine/fasta`` (both levels are created when
    missing and the process working directory is moved there). The
    assembled chromosomes, the mitochondrial sequence and the unplaced
    scaffolds are downloaded into one fasta file, which is then compressed
    with ``bgzip``; finally the gff annotation is downloaded.
    """
    target_dir = os.path.join(DB_DIR, 'bovine')
    # Enter <target_dir> first, then its 'fasta' subdirectory (relative path).
    for directory in (target_dir, 'fasta'):
        try:
            os.mkdir(directory)
        except FileExistsError:
            pass
        os.chdir(directory)

    # chr1 ... chr29 and chrX; chrY is missing.
    chromosomes = ['chr%d' % number for number in range(1, 30)]
    chromosomes.append('chrX')

    logging.info('Downloading bovine genome')
    local_file_name = os.path.join(
        target_dir, 'fasta', 'ref_Bos_taurus_GCF_002263795.1_ARS-UCD1.2.fasta')
    # All sequences go to the same file, so start without a stale copy.
    if os.path.exists(local_file_name):
        os.remove(local_file_name)

    chrom_url_template = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/%s.fna.gz'
    for chrom in chromosomes:
        logging.debug('Downloading bovine chromosome %s', chrom)
        ftp_down(chrom_url_template % chrom, local_file_name).close()
        logging.debug('Downloaded bovine chromosome %s', chrom)

    # Sequences stored outside the assembled-chromosomes directory, each
    # paired with the message logged once its download has finished.
    extra_downloads = [
        (
            'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz',
            'Downloaded bovine chromosome MT',
        ),
        (
            'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz',
            'Downloaded bovine chromosome unplaced',
        ),
    ]
    for fasta_url, done_message in extra_downloads:
        ftp_down(fasta_url, local_file_name).close()
        logging.debug(done_message)

    run_child('bgzip %s' % local_file_name)

    logging.info('Downloading gff annotation file')
    gff_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_genomic.gff.gz'
    ftp_down(gff_url).close()
def fetch_human():
    """Fetch the GRCh38 primary assembly and the Gencode v24 annotation.

    Works inside ``DB_DIR/human/fasta`` (both levels are created when
    missing and the process working directory is moved there). The gtf
    annotation is downloaded first, then the genome is downloaded as
    ``GRCh38.fasta`` and compressed with ``bgzip``.
    """
    # Enter DB_DIR/human first, then its 'fasta' subdirectory (relative path).
    for directory in (os.path.join(DB_DIR, 'human'), 'fasta'):
        try:
            os.mkdir(directory)
        except FileExistsError:
            pass
        os.chdir(directory)

    logging.info('Downloading human annotation')
    gtf_url = 'ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/gencode.v24.primary_assembly.annotation.gtf.gz'
    ftp_down(gtf_url).close()

    logging.info('Downloading human genome and bgzip compressing')
    genome_file = 'GRCh38.fasta'
    # Start without a stale copy from a previous run.
    if os.path.exists(genome_file):
        os.remove(genome_file)
    fasta_url = 'ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/GRCh38.primary_assembly.genome.fa.gz'
    ftp_down(fasta_url, genome_file).close()
    run_child('bgzip GRCh38.fasta')