def bact_fung_update(query_type=None, picked=None):
    """Update the bacterial or fungal database with new and hand-picked sequences.

    The function changes into ``DB_DIR/<query_type>``, renames the current
    ``<query_type>_refseq_info.tsv`` to ``old_<query_type>_refseq_info.tsv``,
    queries again via ``bact_fung_query`` and appends the genomes whose URLs
    are new to the bgzip-compressed fasta files under ``fasta/``.

    If ``picked`` is given, the identifiers listed in that file (one per
    line) that are not yet present in the fasta files are fetched with
    ``efetch`` and appended too; finally the compressed files are reindexed
    with ``bgzip -r``.

    :param query_type: ``'bacteria'`` or ``'fungi'``.
    :param picked: optional path of a text file with one sequence id per line.
    """
    import glob
    import itertools

    cont_dir = os.path.join(DB_DIR, query_type)
    os.chdir(cont_dir)
    logging.info('updating %s, now in %s', query_type, cont_dir)

    # read old info
    os.rename('%s_refseq_info.tsv' % query_type, 'old_%s_refseq_info.tsv' % query_type)
    old_urls = bact_fung_query(query_type=query_type, download=False,
                               info_file='old_%s_refseq_info.tsv' % query_type)
    logging.info('%d assemblies were present in refseq', len(old_urls))

    # download new info
    new_urls = bact_fung_query(query_type=query_type, download=True)
    logging.info('%d assemblies are now in refseq', len(new_urls))

    to_add = list(set(new_urls) - set(old_urls))
    if not to_add:
        logging.info('no new sequences in %s database', query_type)
        print('no new sequences in %s database' % query_type, file=sys.stderr)
    for t in to_add:
        logging.debug('genome from %s will be added', t)

    # nothing to download or append when there are no new assemblies
    if to_add:
        if query_type == 'bacteria':
            download_genomes(to_add, prefix='tmp', n_files=3)
            for i in [1, 2, 3]:
                run_child('bgzip -c fasta/tmp%d.fasta >> fasta/bact%d.fasta.gz' % (i, i))
                # remove the temporary file, NOT the database just appended to
                os.remove('fasta/tmp%d.fasta' % i)
        elif query_type == 'fungi':
            download_genomes(to_add, prefix='tmp', n_files=1)
            run_child('bgzip -c fasta/tmp1.fasta >> fasta/fungi1.fasta.gz')
            os.remove('fasta/tmp1.fasta')

    if picked is None:
        return

    present_ids = itertools.chain.from_iterable(
        [get_accs(f) for f in glob.glob('fasta/*.fasta.gz')])
    with open(picked) as handle:
        # skip blank lines so that no empty id is passed to efetch
        picked_ids = [line.strip() for line in handle if line.strip()]

    # sequences to fetch are those requested in the file and not yet present
    to_add = set(picked_ids) - set(present_ids)
    if not to_add:
        logging.info('no new sequence manually added')
        print('no new sequence manually added', file=sys.stderr)

    for i, gid in enumerate(to_add):
        # spread the new sequences round-robin over the available files
        if query_type == 'bacteria':
            fileout = 'fasta/bact%d.fasta.gz' % ((i % 3) + 1)
        elif query_type == 'fungi':
            fileout = 'fasta/fungi%d.fasta.gz' % ((i % 1) + 1)
        # process substitution <( ) is bash syntax, hence the explicit shell
        run_child('bgzip -c <(efetch -db nuccore -id %s -format fasta) >> %s' % (gid, fileout),
                  exe='/bin/bash')
    logging.info('added %d sequences from file %s', len(to_add), picked)

    # reindex the compressed files after appending
    if query_type == 'bacteria':
        for i in [1, 2, 3]:
            run_child('bgzip -r fasta/bact%d.fasta.gz' % i)
    elif query_type == 'fungi':
        run_child('bgzip -r fasta/fungi1.fasta.gz')
def fetch_viral(viral_mode):
    """Download the viral nucleotide or protein database and NCBI taxonomy.

    NOTE(review): a function with the same name and body is defined again
    right after this one in the file; the later definition is the one that
    stays bound to the name.

    :param viral_mode: ``'n'`` for nuccore or ``'p'`` for protein.
    :raises ValueError: if ``viral_mode`` is neither ``'n'`` nor ``'p'``.
    :raises AssertionError: if the accessions in the fasta file and in
        ``viral_accn_taxid.dmp`` differ.
    """
    # define the search nuccore/protein; reject anything else up front
    # instead of failing later on an unbound target_dir
    if viral_mode == 'n':
        db_name = 'viral_nuccore'
        logging.info('downloading viral nuccore sequences')
    elif viral_mode == 'p':
        db_name = 'viral_protein'
        logging.info('downloading viral protein sequences')
    else:
        raise ValueError("viral_mode must be 'n' or 'p', got %r" % (viral_mode,))
    target_dir = os.path.join(DB_DIR, db_name)
    cml_search = viral_query(viral_mode)

    # run the search and download
    os.chdir(target_dir)
    run_child(cml_search)
    cml_fetch_fasta = 'efetch -format fasta < ncbi_search > viral_database.fasta'
    run_child(cml_fetch_fasta)
    cml_efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
    cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
    run_child(cml_efetch_xtract)
    logging.info('downloaded viral seqs info in %s', target_dir)

    logging.info('saving viral taxonomy')
    # viral_seqs_info.tsv contains Accn TaxId
    cml = 'cut -f 1,2 viral_seqs_info.tsv > viral_accn_taxid.dmp'
    run_child(cml)

    # accessions in the fasta file and in the taxonomy table must agree
    accs_1 = set(get_accs('viral_database.fasta'))
    with open('viral_accn_taxid.dmp') as handle:
        accs_2 = {line.split()[0] for line in handle}
    assert accs_1 == accs_2, accs_1 ^ accs_2
    logging.info('taxonomy and fasta sequences match')

    os.chdir(DB_DIR)
    logging.info('downloading taxonomy databases')
    download_handle = ftp_down('ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdb.tar.gz')
    os.remove('taxdb.tar.gz')

    download_handle = ftp_down('ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdump.tar.gz')
    # remove the archive and the extracted files listed here (best effort)
    for ftd in ['taxdump.tar.gz', 'merged.dmp', 'gencode.dmp',
                'division.dmp', 'delnodes.dmp', 'citations.dmp']:
        try:
            os.remove(ftd)
        except OSError:
            logging.warning('Could not find file %s', ftd)
def fetch_viral(viral_mode):
    """Download the viral nucleotide or protein database and NCBI taxonomy.

    NOTE(review): this redefines the function of the same name that appears
    directly above it in the file; being the later definition, this is the
    one that stays bound to the name.

    :param viral_mode: ``'n'`` for nuccore or ``'p'`` for protein.
    :raises ValueError: if ``viral_mode`` is neither ``'n'`` nor ``'p'``.
    :raises AssertionError: if the accessions in the fasta file and in
        ``viral_accn_taxid.dmp`` differ.
    """
    # define the search nuccore/protein; an unknown mode is rejected here
    # rather than surfacing later as an unbound target_dir
    if viral_mode == 'n':
        logging.info('downloading viral nuccore sequences')
        target_dir = os.path.join(DB_DIR, 'viral_nuccore')
    elif viral_mode == 'p':
        logging.info('downloading viral protein sequences')
        target_dir = os.path.join(DB_DIR, 'viral_protein')
    else:
        raise ValueError("viral_mode must be 'n' or 'p', got %r" % (viral_mode,))
    cml_search = viral_query(viral_mode)

    # run the search and download
    os.chdir(target_dir)
    run_child(cml_search)
    cml_fetch_fasta = 'efetch -format fasta < ncbi_search > viral_database.fasta'
    run_child(cml_fetch_fasta)
    cml_efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
    cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
    run_child(cml_efetch_xtract)
    logging.info('downloaded viral seqs info in %s', target_dir)

    logging.info('saving viral taxonomy')
    # viral_seqs_info.tsv contains Accn TaxId
    cml = 'cut -f 1,2 viral_seqs_info.tsv > viral_accn_taxid.dmp'
    run_child(cml)

    # the fasta file and the taxonomy table must list the same accessions
    accs_1 = set(get_accs('viral_database.fasta'))
    with open('viral_accn_taxid.dmp') as dmp:
        accs_2 = {row.split()[0] for row in dmp}
    assert accs_1 == accs_2, accs_1 ^ accs_2
    logging.info('taxonomy and fasta sequences match')

    os.chdir(DB_DIR)
    logging.info('downloading taxonomy databases')
    download_handle = ftp_down('ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdb.tar.gz')
    os.remove('taxdb.tar.gz')

    download_handle = ftp_down('ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdump.tar.gz')
    # best-effort removal of the archive and the extracted files listed here
    for ftd in ['taxdump.tar.gz', 'merged.dmp', 'gencode.dmp',
                'division.dmp', 'delnodes.dmp', 'citations.dmp']:
        try:
            os.remove(ftd)
        except OSError:
            logging.warning('Could not find file %s', ftd)
def virupdate(viral_type, picked=None):
    """Update the viral fasta database, its info table and the taxonomy file.

    NCBI is queried again to rewrite ``viral_seqs_info.tsv``; sequences
    reported by NCBI or listed in ``picked`` that are missing from
    ``viral_database.fasta`` are appended to it, and info lines are appended
    for sequences in the fasta/picked list that NCBI did not report.
    ``viral_accn_taxid.dmp`` is then regenerated and a few consistency
    checks are run on the resulting files.

    :param viral_type: ``'n'`` for nuccore or ``'p'`` for protein.
    :param picked: optional path of a text file with one sequence id per line.
    :raises ValueError: if ``viral_type`` is neither ``'n'`` nor ``'p'``.
    :raises AssertionError: if the taxonomy file and fasta file do not list
        the same accessions.
    """
    if viral_type == 'n':
        db_type = 'nuccore'
    elif viral_type == 'p':
        db_type = 'protein'
    else:
        # fail early instead of hitting an unbound db_type below
        raise ValueError("viral_type must be 'n' or 'p', got %r" % (viral_type,))
    viral_dir = os.path.join(DB_DIR, 'viral_%s' % db_type)

    # this query downloads a new viral_seqs_info.tsv and parses the ids
    logging.info('interrogating NCBI again')
    os.chdir(viral_dir)
    cml_search = viral_query(viral_type)
    run_child(cml_search)
    efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
    efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
    run_child(efetch_xtract)
    info_file = os.path.join(viral_dir, 'viral_seqs_info.tsv')
    info_seqs = pd.read_csv(info_file, sep='\t',
                            names=['Caption', 'TaxId', 'Slen', 'Organism', 'Title'])
    new_ids = [str(acc) for acc in info_seqs['Caption'].tolist()]
    logging.info('NCBI reports %d sequences', len(new_ids))

    # read ids already present in fasta file
    fasta_db = os.path.join(viral_dir, 'viral_database.fasta')
    present_ids = get_accs(fasta_db)
    logging.info('fasta file has %d sequences', len(present_ids))

    # sequences given manually by specifying file with ids
    if picked:
        with open(picked) as handle:
            # blank lines would otherwise become empty ids passed to efetch
            manual_ids = [line.strip() for line in handle if line.strip()]
        logging.info('%d sequences specified manually', len(manual_ids))
    else:
        manual_ids = []

    # update fasta: ids to add are union of picked plus those in ncbi minus those present
    ids_to_add = (set(manual_ids) | set(new_ids)) - set(present_ids)
    if not ids_to_add:
        logging.info('no sequences to add to fasta file')
        print('no sequences to add to fasta file', file=sys.stderr)
    elif len(ids_to_add) > 2000:
        logging.error('cannot add %d sequences, exiting', len(ids_to_add))
        sys.exit('too many sequences to add: run `virmet fetch` first')
    else:
        logging.info('adding %d sequences to fasta file', len(ids_to_add))
        s_code = run_child('efetch -db %s -id ' % db_type + ','.join(ids_to_add)
                           + ' -format fasta >> %s' % fasta_db)
        logging.debug(s_code)

    # update viral_seqs_info.tsv and taxonomy: lines are needed for every
    # sequence in the fasta/picked list that the fresh NCBI query did not report
    ids_to_add = (set(present_ids) | set(manual_ids)) - set(new_ids)
    if not ids_to_add:
        logging.info('no sequences to add to viral_seqs_info')
        print('no sequences to add to viral_seqs_info', file=sys.stderr)
    else:
        logging.info('adding %d line(s) to viral_seqs_info.tsv', len(ids_to_add))
        # loop needed as efetch with format docsum only takes one id at a time
        # (change introduced in edirect 3.30, December 2015)
        # slow, but other solutions seem complicated with edirect
        for ita in ids_to_add:
            cml = 'efetch -db %s -id %s' % (db_type, ita)
            cml += ' -format docsum | xtract -pattern DocumentSummary'
            cml += ' -element Caption TaxId Slen Organism Title >> %s' % info_file
            run_child(cml)

    logging.info('updating taxonomy')
    taxid_file = os.path.join(viral_dir, 'viral_accn_taxid.dmp')
    run_child('cut -f 1,2 %s > %s' % (info_file, taxid_file))

    # perform tests
    gids_1 = Counter(get_accs(fasta_db))
    with open(taxid_file) as handle:
        gids_2 = Counter([line.split()[0] for line in handle])
    assert set(gids_1) == set(gids_2), 'taxonomy/viral_seqs_info not matching with fasta'

    duplicates = [k for k, v in gids_1.items() if v > 1]
    if duplicates:
        warnings.warn('Duplicate sequences in viral_database.fasta: %s' % ' '.join(duplicates))
        logging.warning('Duplicate sequences in viral_database.fasta: %s', ' '.join(duplicates))

    # flag lines with '>' not at the start, or with more than one '>'
    with open(fasta_db) as handle:
        for l in handle:
            if ('>' in l and not l.startswith('>')) or l.count('>') > 1:
                warnings.warn('Invalid line in viral_database.fasta: %s' % l)
                logging.warning('Invalid line in viral_database.fasta: %s', l)