def fetch_bovine():
    """Download cow genome and annotations."""
    target_dir = os.path.join(DB_DIR, 'bovine')
    try:
        os.mkdir(target_dir)
    except FileExistsError:
        pass
    os.chdir(target_dir)
    try:
        os.mkdir('fasta')
    except FileExistsError:
        pass
    os.chdir('fasta')
    chromosomes = ['chr%d' % chrom for chrom in range(1, 30)]
    chromosomes.extend(['chrMT', 'chrX', 'unplaced'])  # chrY is missing
    logging.info('Downloading bovine genome')
    local_file_name = os.path.join(target_dir, 'fasta', 'bt_ref_Bos_taurus_UMD_3.1.1.fasta')
    if os.path.exists(local_file_name):
        os.remove(local_file_name)
    for chrom in chromosomes:
        logging.debug('Downloading bovine chromosome %s', chrom)
        fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/Bos_taurus/Assembled_chromosomes/seq/bt_ref_Bos_taurus_UMD_3.1.1_%s.fa.gz' % chrom
        download_handle = ftp_down(fasta_url, local_file_name)
        download_handle.close()
        logging.debug('Downloaded bovine chromosome %s', chrom)
    run_child('bgzip %s' % local_file_name)
    logging.info('Downloading gff annotation file')
    gff3_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/Bos_taurus/GFF/ref_Bos_taurus_UMD_3.1.1_top_level.gff3.gz'
    download_handle = ftp_down(gff3_url)
    download_handle.close()
def bact_fung_update(query_type=None, picked=None):
    """Update the bacterial or fungal database: append newly listed refseq
    assemblies and, optionally, manually picked accessions."""
    import glob
    import itertools

    cont_dir = os.path.join(DB_DIR, query_type)
    os.chdir(cont_dir)
    logging.info('updating %s, now in %s', query_type, cont_dir)

    # read old info
    os.rename('%s_refseq_info.tsv' % query_type, 'old_%s_refseq_info.tsv' % query_type)
    old_urls = bact_fung_query(query_type=query_type, download=False,
                               info_file='old_%s_refseq_info.tsv' % query_type)
    logging.info('%d assemblies were present in refseq', len(old_urls))
    # download new info
    new_urls = bact_fung_query(query_type=query_type, download=True)
    logging.info('%d assemblies are now in refseq', len(new_urls))

    to_add = list(set(new_urls) - set(old_urls))
    if not to_add:
        logging.info('no new sequences in %s database', query_type)
        print('no new sequences in %s database' % query_type, file=sys.stderr)
    for t in to_add:
        logging.debug('genome from %s will be added', t)
    if query_type == 'bacteria':
        download_genomes(to_add, prefix='tmp', n_files=3)
        for i in [1, 2, 3]:
            run_child('bgzip -c fasta/tmp%d.fasta >> fasta/bact%d.fasta.gz' % (i, i))
            os.remove('fasta/tmp%d.fasta' % i)  # remove the temporary file, not the database
    elif query_type == 'fungi':
        download_genomes(to_add, prefix='tmp', n_files=1)
        run_child('bgzip -c fasta/tmp1.fasta >> fasta/fungi1.fasta.gz')
        os.remove('fasta/tmp1.fasta')  # remove the temporary file, not the database

    if picked is None:
        return
    # present_ids = itertools.chain.from_iterable([get_gids(f) for f in glob.glob('fasta/*.fasta.gz')])
    present_ids = itertools.chain.from_iterable(
        [get_accs(f) for f in glob.glob('fasta/*.fasta.gz')])
    picked_ids = [l.strip() for l in open(picked)]
    # picked accessions not yet present in the database files
    to_add = set(picked_ids) - set(present_ids)
    if not to_add:
        logging.info('no new sequence manually added')
        print('no new sequence manually added', file=sys.stderr)
    for i, gid in enumerate(to_add):
        if query_type == 'bacteria':
            # spread picked sequences round-robin over the three bacterial files
            fileout = 'fasta/bact%d.fasta.gz' % ((i % 3) + 1)
        elif query_type == 'fungi':
            fileout = 'fasta/fungi%d.fasta.gz' % ((i % 1) + 1)
        run_child('bgzip -c <(efetch -db nuccore -id %s -format fasta) >> %s' % (gid, fileout),
                  exe='/bin/bash')
    logging.info('added %d sequences from file %s', len(to_add), picked)

    if query_type == 'bacteria':
        for i in [1, 2, 3]:
            run_child('bgzip -r fasta/bact%d.fasta.gz' % i)
    elif query_type == 'fungi':
        run_child('bgzip -r fasta/fungi1.fasta.gz')
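# Toy illustration (made-up accessions, not part of the pipeline) of the
# round-robin assignment used above: manually picked sequences are spread
# evenly across the three bacterial fasta files.
def _example_round_robin():
    picked = ['NZ_0001', 'NZ_0002', 'NZ_0003', 'NZ_0004']
    # maps to bact1, bact2, bact3, then wraps around to bact1
    return {gid: 'fasta/bact%d.fasta.gz' % ((i % 3) + 1)
            for i, gid in enumerate(picked)}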
def test_blast(self):
    log_file = os.path.join(tempfile.gettempdir(), 'tmp.log')
    run_child('blastn -help > %s 2>&1' % log_file)
    with open(log_file) as f:
        n_lines = sum(1 for _ in f)
    self.assertGreater(n_lines, 6)
    os.remove(log_file)
def test_bwa_index(self):
    run_child('efetch -db nuccore -id K03455 -format fasta > %s' % self.genome_file)
    run_child('bwa index %s &> /dev/null' % self.genome_file)
    # bwa index writes its files next to the input fasta; check one exists
    # (a bare os.path.join would always be truthy)
    self.assertTrue(os.path.exists(self.genome_file + '.bwt'))
    os.remove(self.genome_file)
    for f in glob.glob('%s/HIV.*' % tempfile.gettempdir()):
        os.remove(f)
def single_bwa_index(index_params):
    '''Run a single bwa indexing job.'''
    in_fasta, index_prefix = index_params
    cml = 'bwa index -p %s %s &> %s_bwa_index.log' % (index_prefix, in_fasta, index_prefix)
    run_child(cml, exe='/bin/bash')
    return 'index %s done' % index_prefix
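# Hedged usage sketch (not part of the pipeline): since single_bwa_index takes
# one (fasta, prefix) tuple, several databases can be indexed in parallel with
# multiprocessing, mirroring the pool used in the index entry point below.
# Paths are illustrative placeholders.
def _example_parallel_index():
    import multiprocessing as mp
    pairs = [('fasta/bact1.fasta.gz', 'bwa/bact1'),
             ('fasta/bact2.fasta.gz', 'bwa/bact2')]
    with mp.Pool() as pool:
        for msg in pool.map(single_bwa_index, pairs):
            logging.info(msg)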
def fetch_fungal():
    """Download fungal sequences."""
    target_dir = os.path.join(DB_DIR, 'fungi')
    try:
        os.mkdir(target_dir)
    except FileExistsError:
        pass
    os.chdir(target_dir)
    # first download the summary file with all ftp paths and return the urls
    all_urls = bact_fung_query(query_type='fungi')
    logging.info('%d fungal genomes were found', len(all_urls))
    # then download the genomic_fna.gz files
    download_genomes(all_urls, prefix='fungi', n_files=1)
    run_child('bgzip fasta/fungi1.fasta')
def fetch_bacterial():
    """Download the three bacterial sequence databases."""
    target_dir = os.path.join(DB_DIR, 'bacteria')
    try:
        os.mkdir(target_dir)
    except FileExistsError:
        pass
    os.chdir(target_dir)
    # first download the summary file with all ftp paths and return the urls
    all_urls = bact_fung_query(query_type='bacteria')
    logging.info('%d bacterial genomes were found', len(all_urls))
    # then download the genomic_fna.gz files
    download_genomes(all_urls, prefix='bact', n_files=3)
    for j in [1, 2, 3]:
        run_child('bgzip fasta/bact%d.fasta' % j)
def test_decompress(self):
    out_file = os.path.join(tempfile.gettempdir(), 'gtf.txt')
    ftp_down(self.remote_1, out_file)
    ftl = run_child('file %s' % out_file)
    os.remove(out_file)
    ft = parse_file_line(ftl)
    self.assertEqual(ft, 'ascii')
def fetch_bovine():
    """Download cow genome and annotations."""
    target_dir = os.path.join(DB_DIR, 'bovine')
    try:
        os.mkdir(target_dir)
    except FileExistsError:
        pass
    os.chdir(target_dir)
    try:
        os.mkdir('fasta')
    except FileExistsError:
        pass
    os.chdir('fasta')
    chromosomes = ['chr%d' % chrom for chrom in range(1, 30)]
    chromosomes.extend(['chrX'])  # chrY is missing
    logging.info('Downloading bovine genome')
    local_file_name = os.path.join(target_dir, 'fasta',
                                   'ref_Bos_taurus_GCF_002263795.1_ARS-UCD1.2.fasta')
    if os.path.exists(local_file_name):
        os.remove(local_file_name)
    for chrom in chromosomes:
        logging.debug('Downloading bovine chromosome %s', chrom)
        fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/%s.fna.gz' % chrom
        download_handle = ftp_down(fasta_url, local_file_name)
        download_handle.close()
        logging.debug('Downloaded bovine chromosome %s', chrom)
    fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz'
    download_handle = ftp_down(fasta_url, local_file_name)
    download_handle.close()
    logging.debug('Downloaded bovine chromosome MT')
    fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz'
    download_handle = ftp_down(fasta_url, local_file_name)
    download_handle.close()
    logging.debug('Downloaded bovine chromosome unplaced')
    run_child('bgzip %s' % local_file_name)
    logging.info('Downloading gff annotation file')
    gff_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_genomic.gff.gz'
    download_handle = ftp_down(gff_url)
    download_handle.close()
def fetch_human():
    """Download human genome and annotations."""
    target_dir = os.path.join(DB_DIR, 'human')
    try:
        os.mkdir(target_dir)
    except FileExistsError:
        pass
    os.chdir(target_dir)
    try:
        os.mkdir('fasta')
    except FileExistsError:
        pass
    os.chdir('fasta')
    fasta_url = 'ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/GRCh38.primary_assembly.genome.fa.gz'
    gtf_url = 'ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/gencode.v24.primary_assembly.annotation.gtf.gz'
    logging.info('Downloading human annotation')
    download_handle = ftp_down(gtf_url)
    download_handle.close()
    logging.info('Downloading human genome and bgzip compressing')
    if os.path.exists('GRCh38.fasta'):
        os.remove('GRCh38.fasta')
    download_handle = ftp_down(fasta_url, 'GRCh38.fasta')
    download_handle.close()
    run_child('bgzip GRCh38.fasta')
def test_hunter(self):
    os.chdir(self.tmpdir)
    s_dir = hunter(self.reads)
    os.chdir(s_dir)
    raw_reads = run_child('gunzip -c %s | wc -l' % self.reads)
    raw_reads = int(raw_reads.strip().split()[0]) / 4
    with open('good.fastq') as f:
        good_n = sum(1 for l in f) / 4
    with open('bad.fastq') as f:
        bad_n = sum(1 for l in f) / 4
    with open('stats.tsv') as f:
        stats = dict(l.strip().split() for l in f)
    filtered_out = int(stats['low_entropy']) + int(stats['low_quality'])
    self.assertEqual(bad_n, filtered_out)
    self.assertEqual(good_n, int(stats['passing_filter']))
    self.assertEqual(raw_reads, bad_n + good_n + int(stats['trimmed_too_short']))
def victor(input_reads, contaminant):
    """Decontaminate reads by aligning against a contaminant database with bwa
    and removing the reads that have alignments."""
    import gzip
    from Bio.SeqIO.QualityIO import FastqGeneralIterator

    try:
        n_proc = min(os.cpu_count(), 16)
    except NotImplementedError:
        n_proc = 2
    rf_head = input_reads.split('.')[0]
    cont_name = os.path.split(contaminant)[1]
    sam_name = '%s_%s.sam' % (rf_head, cont_name)
    err_name = '%s_%s.err' % (rf_head, cont_name)
    clean_name = os.path.splitext(sam_name)[0] + '.fastq'

    # skip if this is a hot run
    if os.path.exists(err_name):
        logging.info('decontamination already performed, skipping')
        return clean_name

    # alignment with bwa
    cml = ('bwa mem -t %d -R \'@RG\\tID:foo\\tSM:bar\\tLB:library1\' -T 75 -M %s %s 2> %s | '
           'samtools view -h -F 4 - > %s'
           % (n_proc, contaminant, input_reads, err_name, sam_name))
    logging.debug('running bwa %s %s on %d cores', cont_name, rf_head, n_proc)
    run_child(cml)

    # read the sam file to collect reads with hits;
    # membership tests on a set are much faster than on a list
    mapped_reads = set(
        run_child('grep -v "^@" %s | cut -f 1' % sam_name).strip().split('\n'))
    try:  # if there were no matches, an empty string is present
        mapped_reads.remove('')
    except KeyError:
        pass

    oh = open('stats.tsv', 'a')
    oh.write('matching_%s\t%d\n' % (cont_name, len(mapped_reads)))
    oh.close()

    output_handle = open(clean_name, 'w')
    logging.debug('Cleaning reads in %s with alignments in %s', input_reads, sam_name)
    logging.debug('Writing to %s', clean_name)
    if input_reads.endswith('.gz'):
        # text mode so FastqGeneralIterator yields str, not bytes
        cont_handle = gzip.open(input_reads, 'rt')
    else:
        cont_handle = open(input_reads)
    c = 0
    # FastqGeneralIterator gives fast iteration over (title, seq, qual) tuples
    for title, seq, qual in FastqGeneralIterator(cont_handle):
        if title.split()[0] not in mapped_reads:
            c += 1
            output_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
            if c % 100000 == 0:
                logging.debug('written %d clean reads', c)
    logging.info('written %d clean reads', c)
    output_handle.close()
    if input_reads != 'good.fastq':
        os.remove(input_reads)
    return clean_name
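# Hedged sketch (assumed workflow, not verbatim from the source): victor
# returns the name of the cleaned fastq file, so successive decontamination
# steps can be chained, each consuming the previous output. The index
# prefixes are illustrative placeholders.
def _example_decontaminate(reads='good.fastq',
                           refs=('bwa/humanGRCh38', 'bwa/bact1', 'bwa/bt_ref')):
    for ref in refs:
        reads = victor(input_reads=reads, contaminant=ref)
    # with the placeholders above: 'good_humanGRCh38_bact1_bt_ref.fastq'
    return reads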
def hunter(fq_file):
    """Run quality filtering on a fastq file with seqtk and prinseq,
    simple parallelisation with xargs; returns the output directory."""
    # from virmet.common import prinseq_exe
    prinseq_exe = 'prinseq'  # or 'prinseq-lite.pl', depending on the installation
    try:
        n_proc = min(os.cpu_count(), 16)
        if n_proc == 1:
            n_proc = 2
    except NotImplementedError:
        n_proc = 2
    logging.debug('hunter will run on %s processors', n_proc)
    if 'L001' in fq_file:
        s_dir = '_'.join(os.path.split(fq_file)[1].split('_')[:2])
        try:
            os.mkdir(s_dir)
        except FileExistsError:
            logging.debug('entering %s already existing', s_dir)
        os.chdir(s_dir)
        s_dir = os.getcwd()
    else:
        s_dir = os.getcwd()

    # skip if this is a hot run
    if os.path.exists('prinseq.err') and os.path.exists('prinseq.log'):
        logging.info('hunter was already run in %s, skipping', s_dir)
        os.chdir(os.pardir)
        return s_dir

    # first occurrence of stats.tsv
    oh = open('stats.tsv', 'w+')

    # count raw reads
    if fq_file.endswith('gz'):
        out1 = run_child('gunzip -c %s | wc -l' % fq_file)
    else:
        out1 = run_child('wc -l %s' % fq_file)
    out1 = out1.strip().split()[0]
    n_reads = int(int(out1.strip()) / 4)
    oh.write('raw_reads\t%d\n' % n_reads)

    # trim and discard short reads, count
    logging.debug('trimming with seqtk')
    cml = 'trimfq %s | seqtk seq -L 75 - > intermediate.fastq' % fq_file
    out1 = run_child('seqtk ' + cml)
    out1 = run_child('wc -l intermediate.fastq')
    out1 = out1.strip().split()[0]
    long_reads = int(int(out1.strip()) / 4)
    short = n_reads - long_reads
    oh.write('trimmed_too_short\t%d\n' % short)

    # We want to split across n_proc processors, so each file has at most
    # (n_reads / n_proc) + 1 reads and four times as many lines.
    # This fails if there are more cpus than reads!
    max_reads_per_file = int(n_reads / n_proc) + 1
    max_l = max_reads_per_file * 4
    # split and rename; the %03d naming caps this at 1000 files/cpus
    run_child('split -l %d intermediate.fastq splitted' % max_l)
    os.remove('intermediate.fastq')
    splitted = glob.glob('splitted*')
    n_splitted = len(splitted)
    for i, spf in enumerate(sorted(splitted)):
        os.rename(spf, 'splitted%03d.fastq' % i)

    # filter with prinseq, parallelise with xargs
    logging.debug('filtering with prinseq')
    cml = ('-f %%03g 0 %d | xargs -P %d -I {} %s'
           ' -fastq splitted{}.fastq -lc_method entropy -lc_threshold 70'
           ' -log prinseq{}.log -min_qual_mean 20'
           ' -out_good ./good{} -out_bad ./bad{} > ./prinseq.err 2>&1'
           % (n_splitted - 1, n_splitted, prinseq_exe))
    run_child('/usr/bin/seq ' + cml, exe='/bin/bash')

    logging.debug('cleaning up')
    if glob.glob('good???.fastq'):
        run_child('cat good???.fastq > good.fastq')
        run_child('rm good???.fastq')
    if glob.glob('bad???.fastq'):
        run_child('cat bad???.fastq > bad.fastq')
        run_child('rm bad???.fastq')
    if glob.glob('prinseq???.log'):
        run_child('cat prinseq???.log > prinseq.log')
        run_child('rm prinseq???.log')
    run_child('rm splitted*fastq')

    # parse the number of reads deleted because of low entropy / low quality
    low_ent = 0
    min_qual = 0
    with open('prinseq.log') as f:
        for l in f:
            match_lc = re.search(r'lc_method:\s(\d*)$', l)
            match_mq = re.search(r'min_qual_mean:\s(\d*)$', l)
            if match_lc:
                low_ent += int(match_lc.group(1))
            elif match_mq:
                min_qual += int(match_mq.group(1))
    oh.write('low_entropy\t%d\n' % low_ent)
    oh.write('low_quality\t%d\n' % min_qual)

    out1 = run_child('wc -l good.fastq')
    out1 = out1.strip().split()[0]
    n_reads = int(int(out1) / 4)
    # reads unaccounted for: trimmed reads minus passing plus filtered ones
    lost_reads = long_reads - (n_reads + low_ent + min_qual)
    if lost_reads > 0:
        logging.error('%d reads were lost', lost_reads)
        warnings.warn('%d reads were lost' % lost_reads, RuntimeWarning)
    oh.write('passing_filter\t%d\n' % n_reads)
    oh.close()
    with open('sample_info.txt', 'a') as oh:
        oh.write('VirMet version: %s\n' % __version__)
    os.chdir(os.pardir)
    return s_dir
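# Worked example of hunter's chunking arithmetic (illustrative numbers only):
# split sizes are chosen so each of the n_proc chunk files holds at most one
# n_proc-th of the reads, rounded up.
def _example_split_sizes(n_reads=1_000_005, n_proc=8):
    max_reads_per_file = int(n_reads / n_proc) + 1  # 125001 reads per chunk
    max_l = max_reads_per_file * 4                  # 500004 fastq lines per chunk
    return max_reads_per_file, max_l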
def fetch_viral(viral_mode):
    """Download the viral nucleotide or protein database."""
    # define the search: nuccore or protein
    if viral_mode == 'n':
        logging.info('downloading viral nuccore sequences')
        target_dir = os.path.join(DB_DIR, 'viral_nuccore')
        cml_search = viral_query('n')
    elif viral_mode == 'p':
        logging.info('downloading viral protein sequences')
        target_dir = os.path.join(DB_DIR, 'viral_protein')
        cml_search = viral_query('p')

    # run the search and download
    os.chdir(target_dir)
    run_child(cml_search)
    cml_fetch_fasta = 'efetch -format fasta < ncbi_search > viral_database.fasta'
    run_child(cml_fetch_fasta)
    cml_efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
    cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
    run_child(cml_efetch_xtract)
    logging.info('downloaded viral seqs info in %s', target_dir)

    logging.info('saving viral taxonomy')
    # viral_seqs_info.tsv contains Accn TaxId
    cml = 'cut -f 1,2 viral_seqs_info.tsv > viral_accn_taxid.dmp'
    run_child(cml)
    accs_1 = set(get_accs('viral_database.fasta'))
    accs_2 = set([l.split()[0] for l in open('viral_accn_taxid.dmp')])
    assert accs_1 == accs_2, accs_1 ^ accs_2
    logging.info('taxonomy and fasta sequences match')

    os.chdir(DB_DIR)
    logging.info('downloading taxonomy databases')
    download_handle = ftp_down('ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdb.tar.gz')
    os.remove('taxdb.tar.gz')
    download_handle = ftp_down('ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdump.tar.gz')
    for ftd in ['taxdump.tar.gz', 'merged.dmp', 'gencode.dmp', 'division.dmp',
                'delnodes.dmp', 'citations.dmp']:
        try:
            os.remove(ftd)
        except OSError:
            logging.warning('Could not find file %s', ftd)
def main(args):
    """Extract the best species, realign reads, and run the ``covplot.R``
    script to create the plot."""
    import datetime

    outdir = args.outdir
    organism = args.organism
    assert os.path.isdir(outdir), 'Where is the output dir? Check the path.'
    org_file = os.path.join(outdir, 'orgs_list.tsv')
    best_spec = best_species(org_file, organism)

    # parse blast results
    blast_file = os.path.join(outdir, 'unique.tsv.gz')
    unique = pd.read_csv(blast_file, sep='\t', header=0, compression='gzip')
    matching_reads = unique[unique['ssciname'] == best_spec]
    best_seqids = matching_reads.groupby('sseqid').size().sort_values(ascending=False)
    try:
        dsc, acc = str(best_seqids.index.tolist()[0]).split('|')[:2]
    except ValueError:
        dsc = 'None'
        acc = str(best_seqids.index.tolist()[0])
    logging.info('Best hit in blast results: %s accession:%s', dsc, acc)

    # copy single genome, index, align viral_reads
    os.chdir(outdir)
    organism = organism.replace(' ', '_').replace('/', '_')
    try:
        os.mkdir(organism)
    except FileExistsError:
        warn('directory %s exists already: delete it to run covplot from scratch' % organism)
    os.chdir(organism)
    if os.path.exists('single.fasta'):
        warn('Reusing single.fasta')
        best_seq = SeqIO.parse('single.fasta', 'fasta')
    else:
        viral_db = os.path.join(DB_DIR, 'viral_nuccore/viral_database.fasta')
        time1 = datetime.datetime.now()
        # best_seq = [s for s in SeqIO.parse(viral_db, 'fasta') if acc in s.id]
        with open(viral_db) as handle:
            best_seq = [s for name, s in SimpleFastaParser(handle) if acc in name]
        print('best_seq found', file=sys.stderr)
        print(datetime.datetime.now() - time1, file=sys.stderr)
        sr = [SeqRecord(Seq(best_seq[0]), id=acc, description='')]
        SeqIO.write(sr, 'single.fasta', 'fasta')
    seq_len = len(list(best_seq)[0])

    bam_file = 'single_sorted.bam'
    if os.path.exists(bam_file):
        warn('Reusing alignment')
        logging.info('Refusing to rerun alignment')
    else:
        run_child('bwa index single.fasta')
        logging.info('Aligning viral reads')
        run_child('bwa mem -t 8 single.fasta ../viral_reads.fastq.gz 2> /dev/null | '
                  'samtools view -@ 2 -u - | samtools sort -@ 2 -O bam -T tmp -o %s -' % bam_file)
        run_child('samtools index %s' % bam_file)
    n_reads = int(subprocess.check_output(
        'samtools stats %s | grep ^SN | grep "reads mapped:" | cut -f 3' % bam_file,
        shell=True).strip())

    depth_file = 'depth.txt'
    if os.path.exists(depth_file):
        warn('Reusing depth file')
    else:
        run_child('samtools depth -a -q 0 -Q 0 %s > %s' % (bam_file, depth_file))
    image_name = organism + '_coverage.pdf'
    logging.info('Plotting coverage')
    perc_obs = subprocess.check_output(shlex.split(
        'Rscript %s %s %s %s %s %d' % (covpl_exe, depth_file, acc, seq_len, image_name, n_reads)))
    try:
        perc_obs_string = perc_obs.decode('ascii').split()[1]
    except IndexError:
        perc_obs_string = 'NA'
    print('acc:%s seq_len:%s n_reads:%d perc_obs:%s' % (acc, seq_len, n_reads, perc_obs_string))
    return best_spec
def test_edirect(self):
    run_child('efetch -db nuccore -id K03455 -format fasta > %s' % self.genome_file)
    self.assertTrue(os.path.isfile(self.genome_file))
    os.remove(self.genome_file)
def virupdate(viral_type, picked=None):
    if viral_type == 'n':
        db_type = 'nuccore'
    elif viral_type == 'p':
        db_type = 'protein'
    viral_dir = os.path.join(DB_DIR, 'viral_%s' % db_type)

    # this query downloads a new viral_seqs_info.tsv and parses the accessions
    logging.info('interrogating NCBI again')
    os.chdir(viral_dir)
    cml_search = viral_query(viral_type)
    run_child(cml_search)
    efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
    efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
    run_child(efetch_xtract)
    info_file = os.path.join(viral_dir, 'viral_seqs_info.tsv')
    info_seqs = pd.read_csv(info_file, sep='\t',
                            names=['Caption', 'TaxId', 'Slen', 'Organism', 'Title'])
    new_ids = [str(acc) for acc in info_seqs['Caption'].tolist()]
    logging.info('NCBI reports %d sequences', len(new_ids))

    # read ids already present in the fasta file
    fasta_db = os.path.join(viral_dir, 'viral_database.fasta')
    present_ids = get_accs(fasta_db)
    logging.info('fasta file has %d sequences', len(present_ids))

    # sequences given manually by specifying a file of accessions
    if picked:
        manual_ids = [l.strip() for l in open(picked)]
        logging.info('%d sequences specified manually', len(manual_ids))
    else:
        manual_ids = []

    # update fasta: ids to add are the union of picked ones and those in NCBI,
    # minus those already present
    ids_to_add = (set(manual_ids) | set(new_ids)) - set(present_ids)
    if not ids_to_add:
        logging.info('no sequences to add to fasta file')
        print('no sequences to add to fasta file', file=sys.stderr)
    elif len(ids_to_add) > 2000:
        logging.error('cannot add %d sequences, exiting', len(ids_to_add))
        sys.exit('too many sequences to add: run `virmet fetch` first')
    else:
        logging.info('adding %d sequences to fasta file', len(ids_to_add))
        s_code = run_child('efetch -db %s -id ' % db_type + ','.join(ids_to_add) +
                           ' -format fasta >> %s' % fasta_db)
        logging.debug(s_code)

    # update viral_seqs_info.tsv and taxonomy: ids present in the fasta file
    # (or picked) that NCBI no longer reports still need an info line
    ids_to_add = (set(present_ids) | set(manual_ids)) - set(new_ids)
    if not ids_to_add:
        logging.info('no sequences to add to viral_seqs_info')
        print('no sequences to add to viral_seqs_info', file=sys.stderr)
    else:
        logging.info('adding %d line(s) to viral_seqs_info.tsv', len(ids_to_add))
        # loop needed as efetch with format docsum only takes one id at a time
        # (change introduced in edirect 3.30, December 2015);
        # slow, but other solutions seem complicated with edirect
        for ita in ids_to_add:
            cml = 'efetch -db %s -id %s' % (db_type, ita)
            cml += (' -format docsum | xtract -pattern DocumentSummary'
                    ' -element Caption TaxId Slen Organism Title >> %s' % info_file)
            run_child(cml)
    logging.info('updating taxonomy')
    s_code = run_child('cut -f 1,2 %s > %s' %
                       (info_file, os.path.join(viral_dir, 'viral_accn_taxid.dmp')))

    # perform tests
    gids_1 = Counter(get_accs('viral_database.fasta'))
    gids_2 = Counter([l.split()[0] for l in open('viral_accn_taxid.dmp')])
    assert set(gids_1) == set(gids_2), 'taxonomy/viral_seqs_info not matching with fasta'
    duplicates = [k for k, v in gids_1.items() if v > 1]
    if duplicates:
        warnings.warn('Duplicate sequences in viral_database.fasta: %s' % ' '.join(duplicates))
        logging.warning('Duplicate sequences in viral_database.fasta: %s', ' '.join(duplicates))
    for l in open('viral_database.fasta'):
        if ('>' in l and not l.startswith('>')) or l.count('>') > 1:
            warnings.warn('Invalid line in viral_database.fasta: %s' % l)
            logging.warning('Invalid line in viral_database.fasta: %s', l)
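# Toy illustration of the set logic used in virupdate (made-up accessions):
# sequences to append to the fasta are those picked manually or newly
# reported by NCBI, minus those already present.
def _example_update_sets():
    present_ids = {'A1', 'A2'}
    new_ids = {'A2', 'A3'}
    manual_ids = {'A4'}
    return (set(manual_ids) | set(new_ids)) - set(present_ids)  # {'A3', 'A4'}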
def main(args):
    '''Only function doing all the indexing.'''
    logging.info('now in index')
    if args.viral == 'n':
        target_dir = os.path.join(DB_DIR, 'viral_nuccore')
        os.chdir(target_dir)
        dt = datetime.date.today().isoformat()
        cml = ('makeblastdb -in viral_database.fasta -dbtype nucl -hash_index'
               ' -title "Viral database indexed {}"'
               ' -out viral_db'
               ' -logfile blast.log -parse_seqids'
               ' -taxid_map viral_accn_taxid.dmp').format(dt)
        run_child(cml)
    if args.viral == 'p':
        target_dir = os.path.join(DB_DIR, 'viral_protein')
        os.chdir(target_dir)
        dt = datetime.date.today().isoformat()
        cml = ('makeblastdb -in viral_database.fasta -dbtype prot -hash_index'
               ' -title "Viral database indexed {}"'
               ' -out viral_db'
               ' -logfile blast.log -parse_seqids'
               ' -taxid_map viral_accn_taxid.dmp').format(dt)
        run_child(cml)

    index_pairs = []  # holds (fasta, index) tuples to run in parallel
    if args.bact:
        bwa_dir = os.path.join(DB_DIR, 'bacteria', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s', err)
        for i in [1, 2, 3]:
            fasta_file = os.path.join(DB_DIR, 'bacteria', 'fasta', 'bact%d.fasta.gz' % i)
            index_prefix = os.path.join(bwa_dir, 'bact%d' % i)
            index_pairs.append((fasta_file, index_prefix))
    if args.human:
        bwa_dir = os.path.join(DB_DIR, 'human', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s', err)
        fasta_file = os.path.join(DB_DIR, 'human', 'fasta', 'GRCh38.fasta.gz')
        index_prefix = os.path.join(bwa_dir, 'humanGRCh38')
        index_pairs.append((fasta_file, index_prefix))
    if args.fungal:
        bwa_dir = os.path.join(DB_DIR, 'fungi', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s', err)
        fasta_file = os.path.join(DB_DIR, 'fungi', 'fasta', 'fungi1.fasta.gz')
        index_prefix = os.path.join(bwa_dir, 'fungi1')
        index_pairs.append((fasta_file, index_prefix))
    if args.bovine:
        bwa_dir = os.path.join(DB_DIR, 'bovine', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s', err)
        fasta_file = os.path.join(DB_DIR, 'bovine', 'fasta',
                                  'bt_ref_Bos_taurus_UMD_3.1.1.fasta.gz')
        index_prefix = os.path.join(bwa_dir, 'bt_ref')
        index_pairs.append((fasta_file, index_prefix))

    # run in parallel
    # TODO: use single_process
    pool = mp.Pool()
    results = pool.map(single_bwa_index, index_pairs)
    for r in results:
        logging.info(r)
    # TODO: parallelize this too
    for fasta_file, prefix in index_pairs:
        run_child('samtools faidx %s' % fasta_file)
def viral_blast(file_in, n_proc, nodes, names):
    """Run blast against the viral database, parallelised with xargs."""
    import re
    import sys
    import warnings

    # on a hot start, blast again all decontaminated reads
    if os.path.exists('viral_reads.fastq.gz') and os.path.exists('undetermined_reads.fastq.gz'):
        run_child('zcat viral_reads.fastq.gz undetermined_reads.fastq.gz > %s' % file_in)
        os.remove('viral_reads.fastq.gz')
        os.remove('undetermined_reads.fastq.gz')

    # streams used during the execution
    oh = open('stats.tsv', 'a')
    bh = open('unique.tsv', 'w')
    bh.write('qseqid\tsseqid\tssciname\tstitle\tpident\tqcovs\tscore\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tstaxid\n')

    if not os.path.exists('hq_decont_reads.fastq'):
        os.rename(file_in, 'hq_decont_reads.fastq')
    fasta_file = 'hq_decont_reads.fasta'
    run_child('seqtk seq -A hq_decont_reads.fastq > %s' % fasta_file)
    try:
        tot_seqs = int(run_child('grep -c "^>" %s' % fasta_file).strip())
    except AttributeError:  # deals with an empty file
        tot_seqs = 0
        logging.info('No reads left after decontamination')
    oh.write('reads_to_blast\t%d\n' % tot_seqs)
    if tot_seqs == 0:
        bh.close()
        oh.write('viral_reads\t0\n')
        oh.write('undetermined_reads\t0\n')
        oh.close()
        return

    # We want to split across n_proc processors, so each file has at most
    # (tot_seqs / n_proc) + 1 reads
    max_n = int(tot_seqs / n_proc) + 1
    cml = ('awk -v "MAX_N=%d" \'BEGIN {n_seq=0;} /^>/ '
           '{if(n_seq %% %d == 0){file=sprintf("splitted_clean_%%d.fasta", n_seq/%d);} '
           'print >> file; n_seq++; next;} { print >> file; }\' %s'
           % (max_n, max_n, max_n, fasta_file))
    run_child(cml)

    # blast needs access to the taxdb files to retrieve the organism name
    os.environ['BLASTDB'] = DB_DIR
    if sys.platform.startswith('linux'):
        xargs_thread = 0  # means all available cores, caution
    elif sys.platform.startswith('darwin'):
        xargs_thread = n_proc  # darwin xargs does not accept -P 0
    else:
        logging.info('could not detect system platform: running on %d cores', n_proc)
        xargs_thread = n_proc
    cml = ('seq 0 %s | xargs -P %d -I {} blastn -task megablast'
           ' -query splitted_clean_{}.fasta -db %s'
           ' -out tmp_{}.tsv'
           ' -outfmt \'6 qseqid sseqid ssciname stitle pident qcovs score length'
           ' mismatch gapopen qstart qend sstart send staxid\''
           % (n_proc - 1, xargs_thread, os.path.join(DB_DIR, 'viral_nuccore/viral_db')))
    logging.debug('running blast now')
    run_child(cml)

    logging.debug('saving blast database info')
    # use DB_DIR rather than a hard-coded database location
    cml = shlex.split('blastdbcmd -db %s -info' % os.path.join(DB_DIR, 'viral_nuccore/viral_db'))
    with open('blast_info.txt', 'a') as boh:
        subprocess.call(cml, stdout=boh)

    logging.debug('parsing best HSP for each query sequence')
    qseqid = ''
    # write only the first (best) HSP per query to unique.tsv
    for tmpf in glob.glob('tmp_*.tsv'):
        i = tmpf.split('_')[1].split('.')[0]
        with open(tmpf) as f:
            for line in f:
                if line.split('\t')[0] != qseqid:
                    bh.write(line)
                    qseqid = line.split('\t')[0]
        os.remove(tmpf)
        os.remove('splitted_clean_%s.fasta' % i)
    bh.close()

    logging.debug('filtering and grouping by hit sequence')
    hits = pd.read_csv('unique.tsv', index_col='qseqid', delimiter='\t')
    logging.debug('found %d hits', hits.shape[0])
    # select according to identity and coverage, count occurrences
    good_hits = hits[(hits.pident > blast_ident_threshold) & (hits.qcovs > blast_cov_threshold)]
    matched_reads = good_hits.shape[0]
    logging.debug('%d hits passing coverage and identity filter', matched_reads)
    oh.write('viral_reads\t%s\n' % matched_reads)
    unknown_reads = tot_seqs - matched_reads
    oh.write('undetermined_reads\t%d\n' % unknown_reads)
    oh.close()

    if matched_reads == 0:  # deals with no good_hits
        warnings.warn('No hits')
        return

    # create a column for the accession number
    good_hits['accn'] = good_hits.apply(
        lambda row: re.search(r'([A-Z]+_?\d*)\.?\d*', row['sseqid']).group(1), axis=1)
    good_hits = good_hits.rename(columns={'staxid': 'tax_id'})
    viral_info_file = os.path.join(DB_DIR, 'viral_nuccore/viral_seqs_info.tsv')
    viral_info = pd.read_table(viral_info_file,
                               names=['accn', 'TaxId', 'seq_len', 'Organism', 'Title'])
    good_hits = pd.merge(good_hits, viral_info, on='accn')
    # if blastn gives no taxid and scientific name, fill these columns
    # from the viral_seqs_info.tsv file
    good_hits.loc[:, 'ssciname'] = good_hits.loc[:, 'ssciname'].fillna(
        good_hits['Organism']).astype(str)
    good_hits.loc[:, 'tax_id'] = good_hits.loc[:, 'tax_id'].fillna(
        good_hits['TaxId']).astype(int)
    # fill the species and the covered range on the subject sequence
    good_hits['species'] = good_hits.apply(
        lambda row: get_parent_species(row, nodes, names), axis=1)
    good_hits['covered_region'] = good_hits.apply(lambda row: span_coverage(row), axis=1)
    if good_hits.isnull().any().any():
        logging.error("There is 'nan' in the result of the blastn after selecting good hits.")

    # now summarise and write the covered region length
    ds = good_hits.groupby(['accn', 'stitle', 'ssciname', 'species', 'tax_id']).agg(
        {'covered_region': merge_coverage})
    ds['reads'] = good_hits.groupby(['accn', 'stitle', 'ssciname', 'species', 'tax_id']).size()
    ds = ds.reset_index()
    viral_info = viral_info.drop(columns=['TaxId', 'Organism', 'Title'])
    ds = pd.merge(ds, viral_info)
    # ds['covered_fraction'] = round(ds['covered_region'] / ds['seq_len'], 4)
    ds = ds.loc[:, ['species', 'reads', 'stitle', 'ssciname', 'covered_region', 'seq_len']]
    ds = ds.sort_values(by=['reads', 'covered_region'], ascending=[False, False])
    ds.to_csv('orgs_list.tsv', header=True, sep='\t', index=False)
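# For concreteness (illustrative numbers): with tot_seqs=100 and n_proc=4,
# viral_blast sets max_n to 26, so the awk one-liner above starts a new output
# file every 26 sequences, producing splitted_clean_0.fasta through
# splitted_clean_3.fasta, which xargs then feeds to parallel blastn jobs.
def _example_chunk_count(tot_seqs=100, n_proc=4):
    max_n = int(tot_seqs / n_proc) + 1  # 26
    n_chunks = -(-tot_seqs // max_n)    # ceiling division: 4 chunk files
    return max_n, n_chunks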
def cleaning_up():
    """Sift reads into viral/unknown, compress and remove files."""
    import multiprocessing as mp
    from Bio.SeqIO.QualityIO import FastqGeneralIterator

    # select reads with coverage and identity higher than the thresholds (75)
    df = pd.read_csv('unique.tsv', sep='\t')
    viral_ids = set(df[(df.qcovs > blast_cov_threshold) &
                       (df.pident > blast_ident_threshold)].qseqid)
    viral_c = 0
    undet_c = 0
    all_reads = 'hq_decont_reads.fastq'
    all_handle = open(all_reads)
    undet_handle = open('undetermined_reads.fastq', 'w')
    viral_handle = open('viral_reads.fastq', 'w')
    # FastqGeneralIterator gives fast iteration over (title, seq, qual) tuples
    for title, seq, qual in FastqGeneralIterator(all_handle):
        if title.split()[0] not in viral_ids:
            undet_c += 1
            undet_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
            if undet_c % 100000 == 0:
                logging.debug('written %d undet reads', undet_c)
        else:
            viral_c += 1
            viral_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
            if viral_c % 10000 == 0:
                logging.debug('written %d viral reads', viral_c)
    undet_handle.close()
    viral_handle.close()
    logging.info('written %d undet reads', undet_c)
    logging.info('written %d viral reads', viral_c)
    run_child('gzip -f viral_reads.fastq')
    run_child('gzip -f undetermined_reads.fastq')
    os.remove(all_reads)

    # convert sam files to sorted cram in parallel
    cmls = []
    for samfile in glob.glob('*.sam'):
        stem = os.path.splitext(samfile)[0]
        cont = stem.split('_')[-1]
        if cont == 'ref':  # hack because of the underscore in the bovine file name
            cont = 'bt_ref'
        cml = ('samtools sort -O bam -l 0 -T /tmp -@ 4 %s | '
               'samtools view -T %s -C -o %s.cram -@ 4 -' % (samfile, ref_map[cont], stem))
        cmls.append(cml)
    pool = mp.Pool()
    results = pool.map(run_child, cmls)
    for r in results:
        logging.debug(r)

    # removing and zipping
    for samfile in glob.glob('*.sam'):
        os.remove(samfile)
    for rf in ['good.fastq', 'bad.fastq', 'hq_decont_reads.fasta']:
        try:
            os.remove(rf)
        except FileNotFoundError:
            pass
    for gf in glob.glob('good_*fastq'):
        os.remove(gf)
    run_child('gzip -f unique.tsv')
def main(args):
    '''Only function doing all the indexing.'''
    logging.info('now in index')
    if args.viral == 'n':
        target_dir = os.path.join(DB_DIR, 'viral_nuccore')
        os.chdir(target_dir)
        dt = datetime.date.today().isoformat()
        cml = ('makeblastdb -in viral_database.fasta -dbtype nucl -hash_index'
               ' -title "Viral database indexed {}"'
               ' -out viral_db'
               ' -logfile blast.log -parse_seqids'
               ' -taxid_map viral_accn_taxid.dmp').format(dt)
        run_child(cml)
    if args.viral == 'p':
        target_dir = os.path.join(DB_DIR, 'viral_protein')
        os.chdir(target_dir)
        dt = datetime.date.today().isoformat()
        cml = ('makeblastdb -in viral_database.fasta -dbtype prot -hash_index'
               ' -title "Viral database indexed {}"'
               ' -out viral_db'
               ' -logfile blast.log -parse_seqids'
               ' -taxid_map viral_accn_taxid.dmp').format(dt)
        run_child(cml)

    index_pairs = []  # holds (fasta, index) tuples to run in parallel
    if args.bact:
        bwa_dir = os.path.join(DB_DIR, 'bacteria', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s', err)
        for i in range(1, N_FILES_BACT + 1):
            fasta_file = os.path.join(DB_DIR, 'bacteria', 'fasta', 'bact%d.fasta.gz' % i)
            index_prefix = os.path.join(bwa_dir, 'bact%d' % i)
            index_pairs.append((fasta_file, index_prefix))
    if args.human:
        bwa_dir = os.path.join(DB_DIR, 'human', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s', err)
        fasta_file = os.path.join(DB_DIR, 'human', 'fasta', 'GRCh38.fasta.gz')
        index_prefix = os.path.join(bwa_dir, 'humanGRCh38')
        index_pairs.append((fasta_file, index_prefix))
    if args.fungal:
        bwa_dir = os.path.join(DB_DIR, 'fungi', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s', err)
        fasta_file = os.path.join(DB_DIR, 'fungi', 'fasta', 'fungi1.fasta.gz')
        index_prefix = os.path.join(bwa_dir, 'fungi1')
        index_pairs.append((fasta_file, index_prefix))
    if args.bovine:
        bwa_dir = os.path.join(DB_DIR, 'bovine', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s', err)
        fasta_file = os.path.join(DB_DIR, 'bovine', 'fasta',
                                  'ref_Bos_taurus_GCF_002263795.1_ARS-UCD1.2.fasta.gz')
        index_prefix = os.path.join(bwa_dir, 'bt_ref')
        index_pairs.append((fasta_file, index_prefix))

    # run in parallel
    # TODO: use single_process
    pool = mp.Pool()
    results = pool.map(single_bwa_index, index_pairs)
    for r in results:
        logging.info(r)
    # TODO: parallelize this too
    for fasta_file, prefix in index_pairs:
        run_child('samtools faidx %s' % fasta_file)