Ejemplo n.º 1
0
 def test_decompress(self):
     """Download ``self.remote_1`` and check that ``file`` reports it as ascii.

     The download goes to ``gtf.txt`` in the system temporary directory and
     is deleted again before the assertion runs.
     """
     target = os.path.join(tempfile.gettempdir(), 'gtf.txt')
     ftp_down(self.remote_1, target)
     # Ask the `file` utility what kind of content ended up on disk.
     file_report = run_child('file %s' % target)
     os.remove(target)
     self.assertEqual(parse_file_line(file_report), 'ascii')
Ejemplo n.º 2
0
def fetch_bovine():
    """Download cow genome and annotations.

    Creates ``<DB_DIR>/bovine/fasta`` if needed and changes the working
    directory of the process into it.  Every chromosome is downloaded into
    one fasta file, which is then compressed with ``bgzip``; finally the
    gff3 annotation is downloaded.
    """
    bovine_dir = os.path.join(DB_DIR, 'bovine')
    # Create (if missing) and enter the bovine directory, then its fasta
    # sub-directory; 'fasta' is resolved relative to the directory just entered.
    for directory in (bovine_dir, 'fasta'):
        try:
            os.mkdir(directory)
        except FileExistsError:
            pass
        os.chdir(directory)

    ncbi_root = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/Bos_taurus/'
    # Autosomes 1-29 plus the extra sequences; chrY is missing.
    names = ['chr%d' % number for number in range(1, 30)] + ['chrMT', 'chrX', 'unplaced']
    logging.info('Downloading bovine genome')
    genome_path = os.path.join(bovine_dir, 'fasta', 'bt_ref_Bos_taurus_UMD_3.1.1.fasta')
    # All chromosomes are written to the same file, so start from a clean slate.
    # NOTE(review): presumably ftp_down appends to an existing file - confirm.
    if os.path.exists(genome_path):
        os.remove(genome_path)
    chromosome_template = ncbi_root + 'Assembled_chromosomes/seq/bt_ref_Bos_taurus_UMD_3.1.1_%s.fa.gz'
    for name in names:
        logging.debug('Downloading bovine chromosome %s', name)
        ftp_down(chromosome_template % name, genome_path).close()
        logging.debug('Downloaded bovine chromosome %s', name)
    run_child('bgzip %s' % genome_path)

    logging.info('Downloading gff annotation file')
    ftp_down(ncbi_root + 'GFF/ref_Bos_taurus_UMD_3.1.1_top_level.gff3.gz').close()
Ejemplo n.º 3
0
def fetch_bovine():
    """Download cow genome and annotations.

    Works inside ``<DB_DIR>/bovine/fasta`` (created when absent; the process
    working directory is changed to it).  The chromosomes are collected in a
    single fasta file that is bgzip-compressed afterwards, then the gff3
    annotation file is fetched.
    """
    species_dir = os.path.join(DB_DIR, 'bovine')
    # Same create-then-enter step for both directory levels; the second name
    # is relative to the first because we have already changed into it.
    for folder in (species_dir, 'fasta'):
        try:
            os.mkdir(folder)
        except FileExistsError:
            pass
        os.chdir(folder)

    # chr1 .. chr29 followed by the extra sequences (chrY is missing).
    sequence_names = ['chr%d' % index for index in range(1, 30)]
    sequence_names += ['chrMT', 'chrX', 'unplaced']

    logging.info('Downloading bovine genome')
    merged_fasta = os.path.join(species_dir, 'fasta',
                                'bt_ref_Bos_taurus_UMD_3.1.1.fasta')
    # Drop any previous result: every download below targets this one file.
    # NOTE(review): presumably ftp_down appends to an existing file - confirm.
    if os.path.exists(merged_fasta):
        os.remove(merged_fasta)

    genomes_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/Bos_taurus/'
    for sequence_name in sequence_names:
        logging.debug('Downloading bovine chromosome %s', sequence_name)
        sequence_url = genomes_url + 'Assembled_chromosomes/seq/bt_ref_Bos_taurus_UMD_3.1.1_%s.fa.gz' % sequence_name
        handle = ftp_down(sequence_url, merged_fasta)
        handle.close()
        logging.debug('Downloaded bovine chromosome %s', sequence_name)
    run_child('bgzip %s' % merged_fasta)

    logging.info('Downloading gff annotation file')
    handle = ftp_down(genomes_url + 'GFF/ref_Bos_taurus_UMD_3.1.1_top_level.gff3.gz')
    handle.close()
Ejemplo n.º 4
0
 def test_append(self):
     """Check that downloading twice to the same file doubles its line count.

     The target file is removed first (if present) so the first download
     starts from nothing.
     """
     target = os.path.join(tempfile.gettempdir(), 'README.TXT')
     try:
         os.remove(target)
     except FileNotFoundError:
         pass
     # Download the same remote file twice, counting lines after each pass.
     line_counts = []
     for _attempt in range(2):
         ftp_down(self.remote_2, target)
         with open(target) as handle:
             line_counts.append(sum(1 for _line in handle))
     first_count, second_count = line_counts
     self.assertEqual(second_count, 2 * first_count)
Ejemplo n.º 5
0
def fetch_viral(viral_mode):
    """Download nucleotide or protein database and the taxonomy databases.

    Inside the mode-specific directory under ``DB_DIR`` (which must already
    exist, it is not created here) this runs the search command returned by
    ``viral_query``, fetches the sequences into ``viral_database.fasta`` and
    their document summaries into ``viral_seqs_info.tsv``, writes the first
    two summary columns to ``viral_accn_taxid.dmp`` and checks that this
    file lists the same accessions as the fasta file.  The taxonomy archives
    are then downloaded and unpacked in ``DB_DIR``.

    The working directory of the process is changed and is ``DB_DIR`` when
    the function returns.

    Args:
        viral_mode: ``'n'`` for nuccore sequences, ``'p'`` for protein
            sequences.

    Raises:
        ValueError: if ``viral_mode`` is neither ``'n'`` nor ``'p'``.
        AssertionError: if the accessions in the fasta file and in the
            taxonomy table differ (the message is their symmetric difference).
    """
    # Reject unknown modes up front instead of failing later on an unbound
    # ``target_dir``.
    if viral_mode not in ('n', 'p'):
        raise ValueError(
            "viral_mode must be 'n' (nuccore) or 'p' (protein), got %r" % (viral_mode,))

    # define the search nuccore/protein
    if viral_mode == 'n':
        logging.info('downloading viral nuccore sequences')
        target_dir = os.path.join(DB_DIR, 'viral_nuccore')
        cml_search = viral_query('n')
    else:
        logging.info('downloading viral protein sequences')
        target_dir = os.path.join(DB_DIR, 'viral_protein')
        cml_search = viral_query('p')

    # run the search and download
    os.chdir(target_dir)
    # NOTE(review): presumably this command writes the `ncbi_search` file that
    # both efetch commands below read from - confirm against viral_query.
    run_child(cml_search)
    cml_fetch_fasta = 'efetch -format fasta < ncbi_search > viral_database.fasta'
    run_child(cml_fetch_fasta)
    cml_efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
    cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
    run_child(cml_efetch_xtract)
    logging.info('downloaded viral seqs info in %s', target_dir)

    logging.info('saving viral taxonomy')
    # viral_seqs_info.tsv contains Accn TaxId in its first two columns
    run_child('cut -f 1,2 viral_seqs_info.tsv > viral_accn_taxid.dmp')
    accs_1 = set(get_accs('viral_database.fasta'))
    # Close the table explicitly instead of leaking the file handle.
    with open('viral_accn_taxid.dmp') as taxid_table:
        accs_2 = {line.split()[0] for line in taxid_table}
    # NOTE(review): this consistency check disappears under `python -O`.
    assert accs_1 == accs_2, accs_1 ^ accs_2
    logging.info('taxonomy and fasta sequences match')

    os.chdir(DB_DIR)
    logging.info('downloading taxonomy databases')
    download_handle = ftp_down(
        'ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdb.tar.gz')
    os.remove('taxdb.tar.gz')
    download_handle = ftp_down(
        'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdump.tar.gz')
    # Remove the archive and the extracted tables listed here; a missing
    # file is only worth a warning.
    for ftd in [
            'taxdump.tar.gz', 'merged.dmp', 'gencode.dmp', 'division.dmp',
            'delnodes.dmp', 'citations.dmp'
    ]:
        try:
            os.remove(ftd)
        except OSError:
            logging.warning('Could not find file %s', ftd)
Ejemplo n.º 6
0
def fetch_viral(viral_mode):
    """Download nucleotide or protein database and the taxonomy databases.

    Steps, all driven through shell commands run with ``run_child``:

    1. change into the mode-specific directory under ``DB_DIR`` (it must
       already exist, it is not created here) and run the search command
       returned by ``viral_query``;
    2. fetch the sequences into ``viral_database.fasta`` and the document
       summaries into ``viral_seqs_info.tsv``;
    3. write the first two summary columns to ``viral_accn_taxid.dmp`` and
       check that it lists the same accessions as the fasta file;
    4. change into ``DB_DIR``, download and unpack the taxonomy archives and
       delete the files that are not kept.

    The working directory of the process is ``DB_DIR`` on return.

    Args:
        viral_mode: ``'n'`` for nuccore sequences, ``'p'`` for protein
            sequences.

    Raises:
        ValueError: if ``viral_mode`` is neither ``'n'`` nor ``'p'``.
        AssertionError: if the accessions in the fasta file and in the
            taxonomy table differ (the message is their symmetric difference).
    """
    # An unsupported mode used to surface only later, as an unbound
    # ``target_dir``; fail early with a clear message instead.
    if viral_mode not in ('n', 'p'):
        raise ValueError(
            "viral_mode must be 'n' (nuccore) or 'p' (protein), got %r" % (viral_mode,))

    # define the search nuccore/protein
    if viral_mode == 'n':
        logging.info('downloading viral nuccore sequences')
        target_dir = os.path.join(DB_DIR, 'viral_nuccore')
        cml_search = viral_query('n')
    else:
        logging.info('downloading viral protein sequences')
        target_dir = os.path.join(DB_DIR, 'viral_protein')
        cml_search = viral_query('p')

    # run the search and download
    os.chdir(target_dir)
    # NOTE(review): presumably this command writes the `ncbi_search` file that
    # both efetch commands below read from - confirm against viral_query.
    run_child(cml_search)
    cml_fetch_fasta = 'efetch -format fasta < ncbi_search > viral_database.fasta'
    run_child(cml_fetch_fasta)
    cml_efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
    cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
    run_child(cml_efetch_xtract)
    logging.info('downloaded viral seqs info in %s', target_dir)

    logging.info('saving viral taxonomy')
    # viral_seqs_info.tsv contains Accn TaxId in its first two columns
    cml = 'cut -f 1,2 viral_seqs_info.tsv > viral_accn_taxid.dmp'
    run_child(cml)
    accs_1 = set(get_accs('viral_database.fasta'))
    # Read the table inside a context manager so the file is always closed.
    with open('viral_accn_taxid.dmp') as taxid_table:
        accs_2 = {line.split()[0] for line in taxid_table}
    # NOTE(review): this consistency check disappears under `python -O`.
    assert accs_1 == accs_2, accs_1 ^ accs_2
    logging.info('taxonomy and fasta sequences match')

    os.chdir(DB_DIR)
    logging.info('downloading taxonomy databases')
    download_handle = ftp_down('ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdb.tar.gz')
    os.remove('taxdb.tar.gz')
    download_handle = ftp_down('ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdump.tar.gz')
    # Remove the archive and the extracted tables listed here; a missing
    # file is only worth a warning.
    for ftd in ['taxdump.tar.gz', 'merged.dmp', 'gencode.dmp', 'division.dmp', 'delnodes.dmp', 'citations.dmp']:
        try:
            os.remove(ftd)
        except OSError:
            logging.warning('Could not find file %s', ftd)
Ejemplo n.º 7
0
def fetch_bovine():
    """Download cow genome and annotations.

    Creates ``<DB_DIR>/bovine/fasta`` if needed and changes the working
    directory of the process into it.  Chromosomes 1-29 and X, then the
    mitochondrial sequence and the unplaced scaffolds, are all downloaded
    into one fasta file, which is then compressed with ``bgzip``.  The gff
    annotation is downloaded last.
    """
    bovine_dir = os.path.join(DB_DIR, 'bovine')
    # Create (if missing) and enter the bovine directory, then its fasta
    # sub-directory; 'fasta' is resolved relative to the directory just entered.
    for directory in (bovine_dir, 'fasta'):
        try:
            os.mkdir(directory)
        except FileExistsError:
            pass
        os.chdir(directory)

    # Every remote file lives below the same assembly directory.
    assembly_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/'
    structure_url = assembly_url + 'GCF_002263795.1_ARS-UCD1.2_assembly_structure/'

    nuclear = ['chr%d' % number for number in range(1, 30)] + ['chrX']  # chrY is missing
    logging.info('Downloading bovine genome')
    genome_path = os.path.join(
        bovine_dir, 'fasta', 'ref_Bos_taurus_GCF_002263795.1_ARS-UCD1.2.fasta')
    # All sequences are written to the same file, so start from a clean slate.
    # NOTE(review): presumably ftp_down appends to an existing file - confirm.
    if os.path.exists(genome_path):
        os.remove(genome_path)

    chromosome_template = structure_url + 'Primary_Assembly/assembled_chromosomes/FASTA/%s.fna.gz'
    for name in nuclear:
        logging.debug('Downloading bovine chromosome %s', name)
        ftp_down(chromosome_template % name, genome_path).close()
        logging.debug('Downloaded bovine chromosome %s', name)

    # Mitochondrial genome and unplaced scaffolds sit in their own remote
    # directories; each entry pairs the URL with its completion message.
    extras = (
        (structure_url + 'non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz',
         'Downloaded bovine chromosome MT'),
        (structure_url + 'Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz',
         'Downloaded bovine chromosome unplaced'),
    )
    for extra_url, done_message in extras:
        ftp_down(extra_url, genome_path).close()
        logging.debug(done_message)

    run_child('bgzip %s' % genome_path)
    logging.info('Downloading gff annotation file')
    ftp_down(assembly_url + 'GCF_002263795.1_ARS-UCD1.2_genomic.gff.gz').close()
Ejemplo n.º 8
0
def fetch_human():
    """Download human genome and annotations.

    Creates ``<DB_DIR>/human/fasta`` if needed and changes the working
    directory of the process into it.  The GENCODE release 24 annotation is
    downloaded first, then the GRCh38 primary assembly is downloaded as
    ``GRCh38.fasta`` and compressed with ``bgzip``.
    """
    # Create (if missing) and enter the human directory, then its fasta
    # sub-directory; 'fasta' is resolved relative to the directory just entered.
    for directory in (os.path.join(DB_DIR, 'human'), 'fasta'):
        try:
            os.mkdir(directory)
        except FileExistsError:
            pass
        os.chdir(directory)

    release_url = 'ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/'
    genome_url = release_url + 'GRCh38.primary_assembly.genome.fa.gz'
    annotation_url = release_url + 'gencode.v24.primary_assembly.annotation.gtf.gz'

    logging.info('Downloading human annotation')
    ftp_down(annotation_url).close()

    logging.info('Downloading human genome and bgzip compressing')
    genome_file = 'GRCh38.fasta'
    # Remove a previous download before writing to the same name again.
    # NOTE(review): presumably ftp_down appends to an existing file - confirm.
    if os.path.exists(genome_file):
        os.remove(genome_file)
    ftp_down(genome_url, genome_file).close()
    run_child('bgzip %s' % genome_file)
0
def fetch_human():
    """Download human genome and annotations.

    Works inside ``<DB_DIR>/human/fasta`` (created when absent; the process
    working directory is changed to it).  Fetches the GENCODE release 24
    annotation, then the GRCh38 primary assembly, which is saved as
    ``GRCh38.fasta`` and bgzip-compressed.
    """
    species_dir = os.path.join(DB_DIR, 'human')
    # Same create-then-enter step for both directory levels; the second name
    # is relative to the first because we have already changed into it.
    for folder in (species_dir, 'fasta'):
        try:
            os.mkdir(folder)
        except FileExistsError:
            pass
        os.chdir(folder)

    gencode_url = 'ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/'

    logging.info('Downloading human annotation')
    handle = ftp_down(gencode_url + 'gencode.v24.primary_assembly.annotation.gtf.gz')
    handle.close()

    logging.info('Downloading human genome and bgzip compressing')
    # Drop any earlier copy so the new download does not build on stale data.
    # NOTE(review): presumably ftp_down appends to an existing file - confirm.
    if os.path.exists('GRCh38.fasta'):
        os.remove('GRCh38.fasta')
    handle = ftp_down(gencode_url + 'GRCh38.primary_assembly.genome.fa.gz', 'GRCh38.fasta')
    handle.close()
    run_child('bgzip GRCh38.fasta')