Example 1
def fetch_bovine():
    """Download cow genome and annotations."""
    target_dir = os.path.join(DB_DIR, 'bovine')
    try:
        os.mkdir(target_dir)
    except FileExistsError:
        pass
    os.chdir(target_dir)
    try:
        os.mkdir('fasta')
    except FileExistsError:
        pass
    os.chdir('fasta')
    chromosomes = ['chr%d' % chrom for chrom in range(1, 30)]
    chromosomes.extend(['chrMT', 'chrX', 'unplaced'])  # chrY is missing
    logging.info('Downloading bovine genome')
    local_file_name = os.path.join(target_dir, 'fasta', 'bt_ref_Bos_taurus_UMD_3.1.1.fasta')
    if os.path.exists(local_file_name):
        os.remove(local_file_name)
    for chrom in chromosomes:
        logging.debug('Downloading bovine chromosome %s', chrom)
        fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/Bos_taurus/Assembled_chromosomes/seq/bt_ref_Bos_taurus_UMD_3.1.1_%s.fa.gz' % chrom
        download_handle = ftp_down(fasta_url, local_file_name)
        download_handle.close()
        logging.debug('Downloaded bovine chromosome %s', chrom)
    run_child('bgzip %s' % local_file_name)
    logging.info('Downloading gff annotation file')
    gff3_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/Bos_taurus/GFF/ref_Bos_taurus_UMD_3.1.1_top_level.gff3.gz'
    download_handle = ftp_down(gff3_url)
    download_handle.close()
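
The helper ftp_down is used throughout these examples but never shown. A minimal sketch of what it plausibly does, assuming it appends to the local file (so the per-chromosome downloads above accumulate into a single fasta), gunzips a compressed remote when the local name lacks a .gz suffix, and returns the open handle that callers close:

import gzip
import os
import urllib.request


def ftp_down(remote_url, local_file=None):
    """Hypothetical sketch: fetch remote_url and append it to local_file."""
    if local_file is None:
        local_file = os.path.basename(remote_url)
    handle = open(local_file, 'ab')  # append: repeated calls accumulate
    with urllib.request.urlopen(remote_url) as remote:
        data = remote.read()
    if remote_url.endswith('.gz') and not local_file.endswith('.gz'):
        data = gzip.decompress(data)  # save the remote decompressed
    handle.write(data)
    return handle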
Example 2
def bact_fung_update(query_type=None, picked=None):
    """
    """
    import glob
    import itertools

    cont_dir = os.path.join(DB_DIR, query_type)
    os.chdir(cont_dir)
    logging.info('updating %s, now in %s', query_type, cont_dir)
    # read old info
    os.rename('%s_refseq_info.tsv' % query_type, 'old_%s_refseq_info.tsv' % query_type)
    old_urls = bact_fung_query(query_type=query_type, download=False, info_file='old_%s_refseq_info.tsv' % query_type)

    logging.info('%d assemblies were present in refseq', len(old_urls))
    # download new info
    new_urls = bact_fung_query(query_type=query_type, download=True)
    logging.info('%d assemblies are now in refseq', len(new_urls))
    to_add = set(new_urls) - set(old_urls)
    to_add = list(to_add)

    if not to_add:
        logging.info('no new sequences in %s database', query_type)
        print('no new sequences in %s database' % query_type, file=sys.stderr)

    for t in to_add:
        logging.debug('genome from %s will be added', t)
    # download all new genomes once, then append them to the compressed
    # database files and remove the temporary downloads
    if to_add and query_type == 'bacteria':
        download_genomes(to_add, prefix='tmp', n_files=3)
        for i in [1, 2, 3]:
            run_child('bgzip -c fasta/tmp%d.fasta >> fasta/bact%d.fasta.gz' % (i, i))
            os.remove('fasta/tmp%d.fasta' % i)
    elif to_add and query_type == 'fungi':
        download_genomes(to_add, prefix='tmp', n_files=1)
        run_child('bgzip -c fasta/tmp1.fasta >> fasta/fungi1.fasta.gz')
        os.remove('fasta/tmp1.fasta')

    if picked is None:
        return

    # present_ids = itertools.chain.from_iterable([get_gids(f) for f in glob.glob('fasta/*.fasta.gz')])
    present_ids = itertools.chain.from_iterable([get_accs(f) for f in glob.glob('fasta/*.fasta.gz')])
    picked_ids = [l.strip() for l in open(picked)]
    to_add = set(present_ids) - set(picked_ids)

    if not to_add:
        logging.info('no new sequence manually added')
        print('no new sequence manually added', file=sys.stderr)

    for i, gid in enumerate(to_add):
        if query_type == 'bacteria':
            fileout = 'fasta/bact%d.fasta.gz' % ((i % 3) + 1)
        elif query_type == 'fungi':
            fileout = 'fasta/fungi%d.fasta.gz' % ((i % 1) + 1)
        run_child('bgzip -c <(efetch -db nuccore -id %s -format fasta) >> %s' % (gid, fileout), exe='/bin/bash')
    logging.info('added %d sequences from file %s', len(to_add), picked)
    if query_type == 'bacteria':
        for i in [1, 2, 3]:
            run_child('bgzip -r fasta/bact%d.fasta.gz' % i)
    elif query_type == 'fungi':
        run_child('bgzip -r fasta/fungi1.fasta.gz')
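
get_accs is external to the listing as well; a plausible sketch, assuming an accession is the first word of each fasta header line:

import gzip


def get_accs(fasta_file):
    """Hypothetical sketch: accessions from a plain or gzipped fasta."""
    opener = gzip.open if fasta_file.endswith('.gz') else open
    with opener(fasta_file, 'rt') as handle:
        return [line[1:].split()[0] for line in handle
                if line.startswith('>')]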
Example 3
 def test_blast(self):
     log_file = os.path.join(tempfile.gettempdir(), 'tmp.log')
     run_child('blastn -help > %s 2>&1' % log_file)
     with open(log_file) as f:
         l = sum(1 for _ in f)
     self.assertGreater(l, 6)
     os.remove(log_file)
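
Almost every example shells out through run_child. A minimal sketch consistent with how it is called here (the command runs through a shell, stdout comes back as text, and some callers pick the shell explicitly):

import logging
import subprocess


def run_child(cmd, exe='/bin/bash'):
    """Hypothetical sketch: run cmd in a shell and return its stdout."""
    try:
        output = subprocess.check_output(cmd, universal_newlines=True,
                                         shell=True, executable=exe)
    except subprocess.CalledProcessError as err:
        logging.error('child %s failed: %s', cmd, err)
        output = None
    return output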
Example 4
 def test_bwa_index(self):
     run_child('efetch -db nuccore -id K03455 -format fasta > %s' % self.genome_file)
     run_child('bwa index %s &> /dev/null' % self.genome_file)
     # os.path.join alone is always truthy; check that the index file
     # really exists (bwa index writes its output next to the input fasta)
     self.assertTrue(os.path.isfile('%s.bwt' % self.genome_file))
     os.remove(self.genome_file)
     for f in glob.glob('%s/HIV.*' % tempfile.gettempdir()):
         os.remove(f)
Example 5
def single_bwa_index(index_params):
    '''run a single bwa indexing job'''
    in_fasta, index_prefix = index_params
    cml = 'bwa index -p %s %s &> %s_bwa_index.log' % (index_prefix, in_fasta,
                                                      index_prefix)
    run_child(cml, exe='/bin/bash')
    return 'index %s done' % index_prefix
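
single_bwa_index takes one tuple argument so that it can be mapped over a pool of workers, as the indexing main() further down does. An illustrative call, with made-up file names:

import multiprocessing as mp

jobs = [('fasta/bact1.fasta.gz', 'bwa/bact1'),
        ('fasta/bact2.fasta.gz', 'bwa/bact2')]
with mp.Pool() as pool:
    for msg in pool.map(single_bwa_index, jobs):
        print(msg)  # e.g. 'index bwa/bact1 done'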
Example 6
def fetch_fungal():
    """Download fungal sequences."""
    target_dir = os.path.join(DB_DIR, 'fungi')
    try:
        os.mkdir(target_dir)
    except FileExistsError:
        pass
    os.chdir(target_dir)

    # first download summary file with all ftp paths and return urls
    all_urls = bact_fung_query(query_type='fungi')
    logging.info('%d fungal genomes were found', len(all_urls))
    # then download genomic_fna.gz files
    download_genomes(all_urls, prefix='fungi', n_files=1)
    run_child('bgzip fasta/fungi1.fasta')
Example 7
def fetch_bacterial():
    """Download the three bacterial sequence databases."""
    target_dir = os.path.join(DB_DIR, 'bacteria')
    try:
        os.mkdir(target_dir)
    except FileExistsError:
        pass
    os.chdir(target_dir)

    # first download summary file with all ftp paths and return urls
    all_urls = bact_fung_query(query_type='bacteria')
    logging.info('%d bacterial genomes were found', len(all_urls))
    # then download genomic_fna.gz files
    download_genomes(all_urls, prefix='bact', n_files=3)
    for j in [1, 2, 3]:
        run_child('bgzip fasta/bact%d.fasta' % j)
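
download_genomes is another helper outside the listing. A sketch under the assumption that it deals the RefSeq assembly URLs round-robin into n_files buckets, appending each genomic fasta to fasta/<prefix>N.fasta (it reuses the hypothetical ftp_down sketched earlier):

import os


def download_genomes(all_urls, prefix, n_files=1):
    """Hypothetical sketch: spread assembly downloads over n_files files."""
    os.makedirs('fasta', exist_ok=True)
    for i, url in enumerate(all_urls):
        n = (i % n_files) + 1
        # a RefSeq assembly directory holds <basename>_genomic.fna.gz
        fasta_url = '%s/%s_genomic.fna.gz' % (url, url.rsplit('/', 1)[1])
        handle = ftp_down(fasta_url, 'fasta/%s%d.fasta' % (prefix, n))
        handle.close()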
Example 8
 def test_decompress(self):
     out_file = os.path.join(tempfile.gettempdir(), 'gtf.txt')
     ftp_down(self.remote_1, out_file)
     ftl = run_child('file %s' % out_file)
     os.remove(out_file)
     ft = parse_file_line(ftl)
     self.assertEqual(ft, 'ascii')
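
parse_file_line, checked by the assertion above, is not shown. A sketch that would satisfy the test, assuming it reduces `file` output such as "gtf.txt: ASCII text" to a lowercase tag:

def parse_file_line(line):
    """Hypothetical sketch: 'gtf.txt: ASCII text' -> 'ascii'."""
    description = line.split(':', 1)[1].strip()
    return description.split()[0].lower()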
Example 9
def fetch_bovine():
    """Download cow genome and annotations."""
    target_dir = os.path.join(DB_DIR, 'bovine')
    try:
        os.mkdir(target_dir)
    except FileExistsError:
        pass
    os.chdir(target_dir)
    try:
        os.mkdir('fasta')
    except FileExistsError:
        pass
    os.chdir('fasta')
    chromosomes = ['chr%d' % chrom for chrom in range(1, 30)]
    chromosomes.extend(['chrX'])  # chrY is missing
    logging.info('Downloading bovine genome')
    local_file_name = os.path.join(
        target_dir, 'fasta', 'ref_Bos_taurus_GCF_002263795.1_ARS-UCD1.2.fasta')
    if os.path.exists(local_file_name):
        os.remove(local_file_name)
    for chrom in chromosomes:
        logging.debug('Downloading bovine chromosome %s', chrom)
        fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/%s.fna.gz' % chrom
        download_handle = ftp_down(fasta_url, local_file_name)
        download_handle.close()
        logging.debug('Downloaded bovine chromosome %s', chrom)
    fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz'
    download_handle = ftp_down(fasta_url, local_file_name)
    download_handle.close()
    logging.debug('Downloaded bovine chromosome MT')
    fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz'
    download_handle = ftp_down(fasta_url, local_file_name)
    download_handle.close()
    logging.debug('Downloaded bovine chromosome unplaced')

    run_child('bgzip %s' % local_file_name)
    logging.info('Downloading gff annotation file')
    gff_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_genomic.gff.gz'
    download_handle = ftp_down(gff_url)
    download_handle.close()
Example 10
def fetch_human():
    """Download human genome and annotations."""
    target_dir = os.path.join(DB_DIR, 'human')
    try:
        os.mkdir(target_dir)
    except FileExistsError:
        pass
    os.chdir(target_dir)
    try:
        os.mkdir('fasta')
    except FileExistsError:
        pass
    os.chdir('fasta')
    fasta_url = 'ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/GRCh38.primary_assembly.genome.fa.gz'
    gtf_url = 'ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/gencode.v24.primary_assembly.annotation.gtf.gz'
    logging.info('Downloading human annotation')
    download_handle = ftp_down(gtf_url)
    download_handle.close()
    logging.info('Downloading human genome and bgzip compressing')
    if os.path.exists('GRCh38.fasta'):
        os.remove('GRCh38.fasta')
    download_handle = ftp_down(fasta_url, 'GRCh38.fasta')
    download_handle.close()
    run_child('bgzip GRCh38.fasta')
Example 11
 def test_hunter(self):
     os.chdir(self.tmpdir)
     s_dir = hunter(self.reads)
     os.chdir(s_dir)
     raw_reads = run_child('gunzip -c %s | wc -l' % self.reads)
     raw_reads = int(raw_reads.strip().split()[0]) / 4
     with open('good.fastq') as f:
         good_n = sum(1 for l in f) / 4
     with open('bad.fastq') as f:
         bad_n = sum(1 for l in f) / 4
     with open('stats.tsv') as f:
         stats = dict(l.strip().split() for l in f)
     filtered_out = int(stats['low_entropy']) + int(stats['low_quality'])
     self.assertEqual(bad_n, filtered_out)
     self.assertEqual(good_n, int(stats['passing_filter']))
     self.assertEqual(raw_reads, bad_n + good_n + int(stats['trimmed_too_short']))
Example 12
def victor(input_reads, contaminant):
    """decontaminate reads by aligning against contaminants with bwa and removing
    reads with alignments
    """
    import gzip
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    try:
        n_proc = min(os.cpu_count(), 16)
    except NotImplementedError:
        n_proc = 2

    rf_head = input_reads.split('.')[0]
    cont_name = os.path.split(contaminant)[1]
    sam_name = '%s_%s.sam' % (rf_head, cont_name)
    err_name = '%s_%s.err' % (rf_head, cont_name)
    clean_name = os.path.splitext(sam_name)[0] + '.fastq'

    # skip if this is a hot run
    if os.path.exists(err_name):
        logging.info('decontamination already performed, skipping')
        return clean_name

    # alignment with bwa
    cml = 'bwa mem -t %d -R \'@RG\\tID:foo\\tSM:bar\\tLB:library1\' -T 75 -M %s %s 2> \
    %s | samtools view -h -F 4 - > %s' % (n_proc, contaminant, input_reads,
                                          err_name, sam_name)
    logging.debug('running bwa %s %s on %d cores', cont_name, rf_head, n_proc)
    run_child(cml)

    # reading sam file to remove reads with hits
    # test if an object is in set is way faster than in list
    mapped_reads = set(
        run_child('grep -v \"^@\" %s | cut -f 1' %
                  sam_name).strip().split('\n'))
    try:  # if no matches, empty string is present
        mapped_reads.remove('')
    except KeyError:
        pass

    oh = open('stats.tsv', 'a')
    oh.write('matching_%s\t%d\n' % (cont_name, len(mapped_reads)))
    oh.close()

    output_handle = open(clean_name, 'w')
    logging.debug('Cleaning reads in %s with alignments in %s', input_reads,
                  sam_name)
    logging.debug('Writing to %s', clean_name)
    if input_reads.endswith('.gz'):
        cont_handle = gzip.open(input_reads)
    else:
        cont_handle = open(input_reads)
    c = 0
    # Using FastqGeneralIterator allows fast performance
    for title, seq, qual in FastqGeneralIterator(cont_handle):
        if title.split()[0] not in mapped_reads:
            c += 1
            output_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
            if c % 100000 == 0:
                logging.debug('written %d clean reads', c)
    logging.info('written %d clean reads', c)
    output_handle.close()

    if input_reads != 'good.fastq':
        os.remove(input_reads)

    return clean_name
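
Because victor returns the name of the cleaned fastq and deletes its input (unless the input is good.fastq), decontamination against several databases can be chained. An illustrative run, with made-up index prefixes:

cleaned = 'good.fastq'
for cont in ['/data/virmet_databases/human/bwa/humanGRCh38',
             '/data/virmet_databases/bacteria/bwa/bact1']:
    cleaned = victor(cleaned, cont)
print('final decontaminated reads in', cleaned)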
Example 13
def hunter(fq_file):
    """runs quality filter on a fastq file with seqtk and prinseq,
    simple parallelisation with xargs, returns output directory
    """
    # from virmet.common import prinseq_exe
    prinseq_exe = 'prinseq'  # formerly distributed as prinseq-lite.pl

    try:
        n_proc = min(os.cpu_count(), 16)
        if n_proc == 1:
            n_proc = 2
    except NotImplementedError:
        n_proc = 2

    logging.debug('hunter will run on %s processors', n_proc)
    if 'L001' in fq_file:
        s_dir = '_'.join(os.path.split(fq_file)[1].split('_')[:2])
        try:
            os.mkdir(s_dir)
        except FileExistsError:
            logging.debug('entering %s already existing', s_dir)
        os.chdir(s_dir)
        s_dir = os.getcwd()
    else:
        s_dir = os.getcwd()

    # skip if this is a hot run
    if os.path.exists('prinseq.err') and os.path.exists('prinseq.log'):
        logging.info('hunter was already run in %s, skipping', s_dir)
        os.chdir(os.pardir)
        return s_dir

    # first occurrence of stats.tsv
    oh = open('stats.tsv', 'w+')
    # count raw reads
    if fq_file.endswith('gz'):
        out1 = run_child('gunzip -c %s | wc -l' % fq_file)
    else:
        out1 = run_child('wc -l %s' % fq_file)
    out1 = out1.strip().split()[0]
    n_reads = int(int(out1.strip()) / 4)
    oh.write('raw_reads\t%d\n' % n_reads)

    # trim and discard short reads, count
    logging.debug('trimming with seqtk')
    cml = 'trimfq %s | seqtk seq -L 75 - > intermediate.fastq' % fq_file
    out1 = run_child('seqtk ' + cml)
    out1 = run_child('wc -l intermediate.fastq')
    out1 = out1.strip().split()[0]

    long_reads = int(int(out1.strip()) / 4)
    short = n_reads - long_reads
    oh.write('trimmed_too_short\t%d\n' % short)

    # We want to split in n_proc processors, so each file has at most
    # (n_reads / n_proc) + 1 reads and 4 times as many lines
    # this fails if there are more cpus than reads!
    max_reads_per_file = int(n_reads / n_proc) + 1
    max_l = max_reads_per_file * 4
    # split and rename
    run_child('split -l %d intermediate.fastq splitted' % max_l)
    os.remove('intermediate.fastq')
    splitted = glob.glob('splitted*')
    n_splitted = len(splitted)
    for i, spf in enumerate(sorted(splitted)):
        os.rename(spf, 'splitted%03d.fastq' % i)  # works only up to 1000 files/cpus

    # filter with prinseq, parallelize with xargs
    logging.debug('filtering with prinseq')
    cml = '-f %%03g 0 %d | xargs -P %d -I {} %s \
            -fastq splitted{}.fastq -lc_method entropy -lc_threshold 70 \
            -log prinseq{}.log -min_qual_mean 20 \
            -out_good ./good{} -out_bad ./bad{} > ./prinseq.err 2>&1' % (
        n_splitted - 1, n_splitted, prinseq_exe)
    run_child('/usr/bin/seq ' + cml, exe='/bin/bash')

    logging.debug('cleaning up')
    if glob.glob('good???.fastq'):
        run_child('cat good???.fastq > good.fastq')
        run_child('rm good???.fastq')

    if glob.glob('bad???.fastq'):
        run_child('cat bad???.fastq > bad.fastq')
        run_child('rm bad???.fastq')

    if glob.glob('prinseq???.log'):
        run_child('cat prinseq???.log > prinseq.log')
        run_child('rm prinseq???.log')

    run_child('rm splitted*fastq')

    # parsing number of reads deleted because of low entropy
    low_ent = 0
    min_qual = 0
    with open('prinseq.log') as f:
        for l in f:
            match_lc = re.search(r'lc_method:\s(\d*)$', l)
            match_mq = re.search(r'min_qual_mean:\s(\d*)$', l)
            if match_lc:
                low_ent += int(match_lc.group(1))
            elif match_mq:
                min_qual += int(match_mq.group(1))
    oh.write('low_entropy\t%d\n' % low_ent)
    oh.write('low_quality\t%d\n' % min_qual)

    out1 = run_child('wc -l good.fastq')
    out1 = out1.strip().split()[0]
    n_reads = int(int(out1) / 4)
    lost_reads = n_reads + low_ent + min_qual - long_reads
    if lost_reads > 0:
        logging.error('%d reads were lost', lost_reads)
        warnings.warn('%d reads were lost' % lost_reads, RuntimeWarning)
    oh.write('passing_filter\t%d\n' % n_reads)
    oh.close()

    with open('sample_info.txt', 'a') as oh:
        oh.write('VirMet version: %s\n' % __version__)

    os.chdir(os.pardir)
    return s_dir
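
The seq/xargs filtering step above is easier to read once expanded. With n_splitted equal to 4 it hands /bin/bash roughly this pipeline (continuation whitespace collapsed):

# /usr/bin/seq -f %03g 0 3 | xargs -P 4 -I {} prinseq \
#     -fastq splitted{}.fastq -lc_method entropy -lc_threshold 70 \
#     -log prinseq{}.log -min_qual_mean 20 \
#     -out_good ./good{} -out_bad ./bad{} > ./prinseq.err 2>&1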
Example 14
def fetch_viral(viral_mode):
    """Download nucleotide or protein database."""
    # define the search nuccore/protein
    if viral_mode == 'n':
        logging.info('downloading viral nuccore sequences')
        target_dir = os.path.join(DB_DIR, 'viral_nuccore')
        cml_search = viral_query('n')
    elif viral_mode == 'p':
        logging.info('downloading viral protein sequences')
        target_dir = os.path.join(DB_DIR, 'viral_protein')
        cml_search = viral_query('p')
    else:
        raise ValueError('viral_mode must be \'n\' or \'p\'')
    # run the search and download
    os.chdir(target_dir)
    run_child(cml_search)
    cml_fetch_fasta = 'efetch -format fasta < ncbi_search > viral_database.fasta'
    run_child(cml_fetch_fasta)
    cml_efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
    cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
    run_child(cml_efetch_xtract)
    logging.info('downloaded viral seqs info in %s', target_dir)
    logging.info('saving viral taxonomy')
    # viral_seqs_info.tsv contains Accn TaxId
    cml = 'cut -f 1,2 viral_seqs_info.tsv > viral_accn_taxid.dmp'
    run_child(cml)
    accs_1 = set(get_accs('viral_database.fasta'))
    accs_2 = set([l.split()[0] for l in open('viral_accn_taxid.dmp')])
    assert accs_1 == accs_2, accs_1 ^ accs_2
    logging.info('taxonomy and fasta sequences match')

    os.chdir(DB_DIR)
    logging.info('downloading taxonomy databases')
    download_handle = ftp_down('ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdb.tar.gz')
    os.remove('taxdb.tar.gz')
    download_handle = ftp_down('ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz')
    download_handle.close()
    run_child('tar xvfz taxdump.tar.gz')
    for ftd in ['taxdump.tar.gz', 'merged.dmp', 'gencode.dmp', 'division.dmp', 'delnodes.dmp', 'citations.dmp']:
        try:
            os.remove(ftd)
        except OSError:
            logging.warning('Could not find file %s', ftd)
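
viral_query builds the esearch command whose output fetch_viral and virupdate (below) feed to efetch through the ncbi_search file. A hypothetical sketch; the taxonomy filter is illustrative, not the project's actual query:

def viral_query(viral_mode):
    """Hypothetical sketch: esearch command writing to ncbi_search."""
    db = 'nuccore' if viral_mode == 'n' else 'protein'
    query = 'txid10239[orgn]'  # 10239 is the Viruses taxonomy node
    return "esearch -db %s -query '%s' > ncbi_search" % (db, query)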
Example 15
def main(args):
    """Extract the best species, realign reads, run ``covplot.R`` script to create the plot
    """
    import datetime
    outdir = args.outdir
    organism = args.organism

    assert os.path.isdir(outdir), 'Where is the output dir? Check the path.'

    org_file = os.path.join(outdir, 'orgs_list.tsv')
    best_spec = best_species(org_file, organism)

    # parse blast results
    blast_file = os.path.join(outdir, 'unique.tsv.gz')
    unique = pd.read_csv(blast_file, sep='\t', header=0, compression='gzip')
    matching_reads = unique[unique['ssciname'] == best_spec]
    best_seqids = matching_reads.groupby('sseqid').size().sort_values(
        ascending=False)
    try:
        dsc, acc = str(best_seqids.index.tolist()[0]).split('|')[:2]
    except ValueError:
        dsc = 'None'
        acc = str(best_seqids.index.tolist()[0])
    logging.info('Best hit in blast results: %s accession:%s', dsc, acc)

    # copy single genome, index, align viral_reads
    os.chdir(outdir)
    organism = organism.replace(' ', '_').replace('/', '_')
    try:
        os.mkdir(organism)
    except FileExistsError:
        warn(
            'directory %s exists already: delete it to run covplot from scratch'
            % organism)
    os.chdir(organism)

    if os.path.exists('single.fasta'):
        warn('Reusing single.fasta')
        best_seq = SeqIO.parse('single.fasta', 'fasta')
    else:
        viral_db = os.path.join(DB_DIR, 'viral_nuccore/viral_database.fasta')
        time1 = datetime.datetime.now()
        #best_seq = [s for s in SeqIO.parse(viral_db, 'fasta') if acc in s.id]
        with open(viral_db) as handle:
            best_seq = [
                s for name, s in SimpleFastaParser(handle) if acc in name
            ]
        print('best_seq found', file=sys.stderr)
        print(datetime.datetime.now() - time1, file=sys.stderr)
        sr = [SeqRecord(Seq(best_seq[0]), id=acc, description='')]
        SeqIO.write(sr, 'single.fasta', 'fasta')

    seq_len = len(list(best_seq)[0])

    bam_file = 'single_sorted.bam'
    if os.path.exists(bam_file):
        warn('Reusing alignment')
        logging.info('Refusing to rerun alignment')
    else:
        run_child('bwa index single.fasta')
        logging.info('Aligning viral reads')
        run_child(
            'bwa mem -t 8 single.fasta ../viral_reads.fastq.gz 2> /dev/null | samtools view -@ 2 -u - | samtools sort -@ 2 -O bam -T tmp -o %s -'
            % bam_file)
        run_child('samtools index %s' % bam_file)
    n_reads = int(
        subprocess.check_output(
            "samtools stats %s | grep ^SN | grep \"reads mapped:\" | cut -f 3"
            % bam_file,
            shell=True).strip())
    depth_file = 'depth.txt'
    if os.path.exists(depth_file):
        warn('Reusing depth file')
    else:
        run_child('samtools depth -a -q 0 -Q 0 %s > %s' %
                  (bam_file, depth_file))
    image_name = organism + '_coverage.pdf'
    logging.info('Plotting coverage')
    perc_obs = subprocess.check_output(
        shlex.split(
            'Rscript %s %s %s %s %s %d' %
            (covpl_exe, depth_file, acc, seq_len, image_name, n_reads)))
    try:
        perc_obs_string = perc_obs.decode('ascii').split()[1]
    except IndexError:
        perc_obs_string = 'NA'
    print('acc:%s seq_len:%s n_reads:%d perc_obs:%s' %
          (acc, seq_len, n_reads, perc_obs_string))

    return best_spec
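
best_species is defined elsewhere; given the orgs_list.tsv layout written by viral_blast further down (species, reads, stitle, ssciname, covered_region, seq_len), a plausible sketch returns the most abundant species matching the requested organism:

import pandas as pd


def best_species(org_file, organism):
    """Hypothetical sketch: most abundant species matching organism."""
    orgs = pd.read_csv(org_file, sep='\t', header=0)
    hits = orgs[orgs['species'].str.contains(organism, case=False,
                                             regex=False)]
    return hits.sort_values(by='reads', ascending=False)['species'].iloc[0]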
Example 16
 def test_edirect(self):
     run_child('efetch -db nuccore -id K03455 -format fasta > %s' %
               self.genome_file)
     self.assertTrue(os.path.isfile(self.genome_file))
     os.remove(self.genome_file)
Example 17
def virupdate(viral_type, picked=None):
    if viral_type == 'n':
        db_type = 'nuccore'
    elif viral_type == 'p':
        db_type = 'protein'
    else:
        raise ValueError('viral_type must be \'n\' or \'p\'')
    viral_dir = os.path.join(DB_DIR, 'viral_%s' % db_type)

    # this query downloads a new viral_seqs_info.tsv and parses the accessions
    logging.info('interrogating NCBI again')
    os.chdir(viral_dir)
    cml_search = viral_query(viral_type)
    run_child(cml_search)
    efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
    efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
    run_child(efetch_xtract)
    info_file = os.path.join(viral_dir, 'viral_seqs_info.tsv')
    info_seqs = pd.read_csv(info_file, sep='\t', names=['Caption', 'TaxId', 'Slen', 'Organism', 'Title'])
    new_ids = [str(acc) for acc in info_seqs['Caption'].tolist()]
    logging.info('NCBI reports %d sequences', len(new_ids))

    # read ids already present in fasta file
    fasta_db = os.path.join(viral_dir, 'viral_database.fasta')
    present_ids = get_accs(fasta_db)
    logging.info('fasta file has %d sequences', len(present_ids))

    # sequences given manually by specifying file with GI
    if picked:
        manual_ids = [l.strip() for l in open(picked)]
        logging.info('%d sequences specified manually', len(manual_ids))
    else:
        manual_ids = []

    # update fasta: ids to add are union of picked plus those in ncbi minus those present
    ids_to_add = set(manual_ids) | set(new_ids)
    ids_to_add = ids_to_add - set(present_ids)
    if not ids_to_add:
        logging.info('no sequences to add to fasta file')
        print('no sequences to add to fasta file', file=sys.stderr)
    elif len(ids_to_add) > 2000:
        logging.error('cannot add %d sequences, exiting', len(ids_to_add))
        sys.exit('too many sequences to add: run `virmet fetch` first')
    else:
        logging.info('adding %d sequences to fasta file', len(ids_to_add))
        s_code = run_child('efetch -db %s -id ' % db_type + ','.join(ids_to_add) + ' -format fasta >> %s' % fasta_db)
        logging.debug(s_code)

    # update viral_seqs_info.tsv and taxonomy
    ids_to_add = set(present_ids) | set(manual_ids)
    ids_to_add = ids_to_add - set(new_ids)
    if not ids_to_add:
        logging.info('no sequences to add to viral_seqs_info')
        print('no sequences to add to viral_seqs_info', file=sys.stderr)
    else:
        logging.info('adding %d line(s) to viral_seqs_info.tsv', len(ids_to_add))
        # loop needed as efetch with format docsum only takes one id at a time
        # (change introduced in edirect 3.30, December 2015)
        # slow, but other solutions seem complicated with edirect
        for ita in ids_to_add:
            cml = 'efetch -db %s -id %s' % (db_type, ita)
            cml = cml + ' -format docsum | xtract -pattern DocumentSummary \
            -element Caption TaxId Slen Organism Title >> %s' % info_file
            run_child(cml)

    logging.info('updating taxonomy')
    s_code = run_child('cut -f 1,2 %s > %s' % (info_file, os.path.join(viral_dir, 'viral_accn_taxid.dmp')))

    # perform tests
    gids_1 = Counter(get_accs('viral_database.fasta'))
    gids_2 = Counter([l.split()[0] for l in open('viral_accn_taxid.dmp')])
    assert set(gids_1) == set(gids_2), 'taxonomy/viral_seqs_info not matching with fasta'
    duplicates = [k for k, v in gids_1.items() if v > 1]
    if duplicates:
        warnings.warn('Duplicate sequences in viral_database.fasta: %s' % ' '.join(duplicates))
        logging.warning('Duplicate sequences in viral_database.fasta: %s', ' '.join(duplicates))
    for l in open('viral_database.fasta'):
        if ('>' in l and not l.startswith('>')) or l.count('>') > 1:
            warnings.warn('Invalid line in viral_database.fasta: %s' % l)
            logging.warning('Invalid line in viral_database.fasta: %s', l)
Example 18
def main(args):
    '''only function doing all the indexing'''
    logging.info('now in index')

    if args.viral == 'n':
        target_dir = os.path.join(DB_DIR, 'viral_nuccore')
        os.chdir(target_dir)
        dt = datetime.date.today().isoformat()
        cml = "makeblastdb -in viral_database.fasta -dbtype nucl -hash_index \
        -title \"Viral database indexed {}\" \
        -out viral_db \
        -logfile blast.log -parse_seqids -taxid_map viral_accn_taxid.dmp".format(dt)
        run_child(cml)

    if args.viral == 'p':
        target_dir = os.path.join(DB_DIR, 'viral_protein')
        os.chdir(target_dir)
        dt = datetime.date.today().isoformat()
        cml = "makeblastdb -in viral_database.fasta -dbtype prot -hash_index \
        -title \"Viral database indexed {}\" \
        -out viral_db \
        -logfile blast.log -parse_seqids -taxid_map viral_accn_taxid.dmp".format(dt)
        run_child(cml)

    index_pairs = []  # holds (fasta, index) tuples to run in parallel
    if args.bact:
        bwa_dir = os.path.join(DB_DIR, 'bacteria', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s' % err)
        for i in [1, 2, 3]:
            fasta_file = os.path.join(DB_DIR, 'bacteria', 'fasta', 'bact%d.fasta.gz' % i)
            index_prefix = os.path.join(bwa_dir, 'bact%d' % i)
            index_pairs.append((fasta_file, index_prefix))

    if args.human:
        bwa_dir = os.path.join(DB_DIR, 'human', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s' % err)
        fasta_file = os.path.join(DB_DIR, 'human', 'fasta', 'GRCh38.fasta.gz')
        index_prefix = os.path.join(bwa_dir, 'humanGRCh38')
        index_pairs.append((fasta_file, index_prefix))

    if args.fungal:
        bwa_dir = os.path.join(DB_DIR, 'fungi', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s' % err)
        fasta_file = os.path.join(DB_DIR, 'fungi', 'fasta', 'fungi1.fasta.gz')
        index_prefix = os.path.join(bwa_dir, 'fungi1')
        index_pairs.append((fasta_file, index_prefix))

    if args.bovine:
        bwa_dir = os.path.join(DB_DIR, 'bovine', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s' % err)
        fasta_file = os.path.join(DB_DIR, 'bovine', 'fasta', 'bt_ref_Bos_taurus_UMD_3.1.1.fasta.gz')
        index_prefix = os.path.join(bwa_dir, 'bt_ref')
        index_pairs.append((fasta_file, index_prefix))

    # run in parallel
    # TODO: use single_process
    pool = mp.Pool()
    results = pool.map(single_bwa_index, index_pairs)
    for r in results:
        logging.info(r)

    # TODO parallelize this too
    for fasta_file, prefix in index_pairs:
        run_child('samtools faidx %s' % fasta_file)
Example 19
def viral_blast(file_in, n_proc, nodes, names):
    """runs blast against viral database, parallelise with xargs
    """
    import re
    import sys
    import warnings
    # on hot start, blast again all decontaminated reads
    if os.path.exists('viral_reads.fastq.gz') and os.path.exists(
            'undetermined_reads.fastq.gz'):
        run_child(
            'zcat viral_reads.fastq.gz undetermined_reads.fastq.gz > %s' %
            file_in)
        os.remove('viral_reads.fastq.gz')
        os.remove('undetermined_reads.fastq.gz')

    # streams will be used during the execution
    oh = open('stats.tsv', 'a')
    bh = open('unique.tsv', 'w')
    bh.write(
        'qseqid\tsseqid\tssciname\tstitle\tpident\tqcovs\tscore\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tstaxid\n'
    )

    if not os.path.exists('hq_decont_reads.fastq'):
        os.rename(file_in, 'hq_decont_reads.fastq')
    fasta_file = 'hq_decont_reads.fasta'
    run_child('seqtk seq -A hq_decont_reads.fastq > %s' % fasta_file)
    try:
        tot_seqs = int(run_child('grep -c \"^>\" %s' % fasta_file).strip())
    except AttributeError:  # deals with empty file
        tot_seqs = 0
        logging.info('No reads left after decontamination')

    oh.write('reads_to_blast\t%d\n' % tot_seqs)

    if tot_seqs == 0:
        bh.close()
        oh.write('viral_reads\t0\n')
        oh.write('undetermined_reads\t0\n')
        oh.close()
        return

    max_n = tot_seqs // n_proc + 1

    # We want to split in n_proc processors, so each file has at most
    # (tot_seqs / n_proc) + 1 reads
    cml = "awk -v \"MAX_N=%d\" \'BEGIN {n_seq=0;} /^>/ \
    {if(n_seq %% %d == 0){file=sprintf(\"splitted_clean_%%d.fasta\", n_seq/%d);} \
    print >> file; n_seq++; next;} { print >> file; }' %s" % (
        max_n, max_n, max_n, fasta_file)
    run_child(cml)

    # blast needs access to taxdb files to retrieve organism name
    os.environ['BLASTDB'] = DB_DIR
    if sys.platform.startswith('linux'):
        xargs_thread = 0  # means on all available cores, caution
    elif sys.platform.startswith('darwin'):
        xargs_thread = n_proc  # darwin xargs does not accept -P 0
    else:
        logging.info('could not detect system platform: runnning on %d cores',
                     n_proc)
        xargs_thread = n_proc
    # if Darwin then xargs_thread must be n_proc
    cml = 'seq 0 %s | xargs -P %d -I {} blastn -task megablast \
           -query splitted_clean_{}.fasta -db %s \
           -out tmp_{}.tsv \
           -outfmt \'6 qseqid sseqid ssciname stitle pident qcovs score length mismatch gapopen qstart qend sstart send staxid\'' \
        % (n_proc - 1, xargs_thread, os.path.join(DB_DIR, 'viral_nuccore/viral_db'))
    logging.debug('running blast now')
    run_child(cml)

    logging.debug('saving blast database info')
    cml = shlex.split('blastdbcmd -db %s -info' %
                      os.path.join(DB_DIR, 'viral_nuccore/viral_db'))
    with open('blast_info.txt', 'a') as boh:
        subprocess.call(cml, stdout=boh)

    logging.debug('parsing best HSP for each query sequence')
    qseqid = ''
    # write to unique.tsv
    for tmpf in glob.glob('tmp_*.tsv'):
        i = tmpf.split('_')[1].split('.')[0]
        with open(tmpf) as f:
            for line in f:
                if line.split('\t')[0] != qseqid:
                    bh.write(line)
                    qseqid = line.split('\t')[0]
        os.remove(tmpf)
        os.remove('splitted_clean_%s.fasta' % i)
    bh.close()

    logging.debug('filtering and grouping by hit sequence')
    hits = pd.read_csv('unique.tsv', index_col='qseqid', delimiter="\t")
    logging.debug('found %d hits', hits.shape[0])
    # select according to identity and coverage, count occurrences
    # .copy() so the accn column can be added below without triggering
    # a SettingWithCopyWarning on a slice
    good_hits = hits[(hits.pident > blast_ident_threshold)
                     & (hits.qcovs > blast_cov_threshold)].copy()
    matched_reads = good_hits.shape[0]
    logging.debug('%d hits passing coverage and identity filter',
                  matched_reads)
    oh.write('viral_reads\t%s\n' % matched_reads)
    unknown_reads = tot_seqs - matched_reads
    oh.write('undetermined_reads\t%d\n' % unknown_reads)
    oh.close()

    if matched_reads == 0:  # deals with no good_hits
        warnings.warn('No hits')
        return

    # create a column for accession number
    good_hits['accn'] = good_hits.apply(
        lambda row: re.search(r'([A-Z]+_?\d*)\.?\d*', row['sseqid']).group(1),
        axis=1)
    good_hits = good_hits.rename(columns={'staxid': 'tax_id'})

    viral_info_file = os.path.join(DB_DIR, 'viral_nuccore/viral_seqs_info.tsv')
    viral_info = pd.read_csv(
        viral_info_file, sep='\t',
        names=['accn', 'TaxId', 'seq_len', 'Organism', 'Title'])
    good_hits = pd.merge(good_hits, viral_info, on='accn')
    # if blastn gives no taxid and scientific name, fill these col from viral_seqs_info.tsv file
    good_hits.loc[:, 'ssciname'] = good_hits.loc[:, 'ssciname'].fillna(
        good_hits['Organism']).astype(str)
    good_hits.loc[:, 'tax_id'] = good_hits.loc[:, 'tax_id'].fillna(
        good_hits['TaxId']).astype(int)
    # fill the species and the covered range on subject sequence
    good_hits['species'] = good_hits.apply(
        lambda row: get_parent_species(row, nodes, names), axis=1)
    good_hits['covered_region'] = good_hits.apply(
        lambda row: span_coverage(row), axis=1)
    if good_hits.isnull().any().any():
        logging.error(
            "There is 'nan' in the result of the blastn after selecting good hits."
        )

    # now summarise and write the covered region length
    ds = good_hits.groupby(['accn', 'stitle', 'ssciname', 'species',
                            'tax_id']).agg({'covered_region': merge_coverage})
    ds['reads'] = good_hits.groupby(
        ['accn', 'stitle', 'ssciname', 'species', 'tax_id']).size()
    ds = ds.reset_index()

    viral_info = viral_info.drop(columns=['TaxId', 'Organism', 'Title'])
    ds = pd.merge(ds, viral_info)
    #ds['covered_fraction'] = round(ds['covered_region'] / ds['seq_len'], 4)
    ds = ds.loc[:, [
        'species', 'reads', 'stitle', 'ssciname', 'covered_region', 'seq_len'
    ]]
    ds = ds.sort_values(by=['reads', 'covered_region'],
                        ascending=[False, False])
    ds.to_csv('orgs_list.tsv', header=True, sep='\t', index=False)
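
span_coverage and merge_coverage come from the same module. Sketches consistent with how they are applied above (one interval per hit, then the union length per subject sequence); the interval representation is an assumption:

def span_coverage(row):
    """Hypothetical sketch: interval covered on the subject sequence."""
    return min(row['sstart'], row['send']), max(row['sstart'], row['send'])


def merge_coverage(intervals):
    """Hypothetical sketch: total length of the union of 1-based intervals."""
    covered, last_end = 0, 0
    for start, end in sorted(intervals):
        start = max(start, last_end + 1)
        if end > last_end:
            covered += end - start + 1
            last_end = end
    return covered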
Example 20
def cleaning_up():
    """sift reads into viral/unknown, compresses and removes files
    """
    import multiprocessing as mp
    from Bio.SeqIO.QualityIO import FastqGeneralIterator

    # selects reads with coverage and identity higher than 75
    df = pd.read_csv('unique.tsv', sep='\t')
    viral_ids = set(df[(df.qcovs > blast_cov_threshold)
                       & (df.pident > blast_ident_threshold)].qseqid)
    viral_c = 0
    undet_c = 0
    all_reads = 'hq_decont_reads.fastq'
    all_handle = open(all_reads)
    undet_handle = open('undetermined_reads.fastq', 'w')
    viral_handle = open('viral_reads.fastq', 'w')
    # Using FastqGeneralIterator allows fast performance
    for title, seq, qual in FastqGeneralIterator(all_handle):
        if title.split()[0] not in viral_ids:
            undet_c += 1
            undet_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
            if undet_c % 100000 == 0:
                logging.debug('written %d undet reads', undet_c)
        else:
            viral_c += 1
            viral_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
            if viral_c % 10000 == 0:
                logging.debug('written %d viral reads', viral_c)
    undet_handle.close()
    viral_handle.close()
    logging.info('written %d undet reads', undet_c)
    logging.info('written %d viral reads', viral_c)

    run_child('gzip -f viral_reads.fastq')
    run_child('gzip -f undetermined_reads.fastq')
    os.remove(all_reads)

    cmls = []
    for samfile in glob.glob('*.sam'):
        stem = os.path.splitext(samfile)[0]
        cont = stem.split('_')[-1]
        if cont == 'ref':  # hack because _ in bovine file name
            cont = 'bt_ref'
        cml = 'samtools sort -O bam -l 0 -T /tmp -@ 4 %s | \
        samtools view -T %s -C -o %s.cram -@ 4 -' % (samfile, ref_map[cont],
                                                     stem)
        cmls.append(cml)

    # run in parallel
    pool = mp.Pool()
    results = pool.map(run_child, cmls)
    for r in results:
        logging.debug(r)

    # removing and zipping
    for samfile in glob.glob('*.sam'):
        os.remove(samfile)
    for rf in ['good.fastq', 'bad.fastq', 'hq_decont_reads.fasta']:
        try:
            os.remove(rf)
        except FileNotFoundError:
            pass

    for gf in glob.glob('good_*fastq'):
        os.remove(gf)
    run_child('gzip -f unique.tsv')
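
ref_map, consulted when converting the sam files to cram, is defined at module level. A hypothetical version mapping the contaminant tag parsed from each sam file name to the fasta it was aligned against:

import os

ref_map = {
    'humanGRCh38': os.path.join(DB_DIR, 'human/fasta/GRCh38.fasta.gz'),
    'bact1': os.path.join(DB_DIR, 'bacteria/fasta/bact1.fasta.gz'),
    'bact2': os.path.join(DB_DIR, 'bacteria/fasta/bact2.fasta.gz'),
    'bact3': os.path.join(DB_DIR, 'bacteria/fasta/bact3.fasta.gz'),
    'fungi1': os.path.join(DB_DIR, 'fungi/fasta/fungi1.fasta.gz'),
    'bt_ref': os.path.join(DB_DIR, 'bovine/fasta',
                           'bt_ref_Bos_taurus_UMD_3.1.1.fasta.gz'),
}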
Example 21
def main(args):
    '''only function doing all the indexing'''
    logging.info('now in index')

    if args.viral == 'n':
        target_dir = os.path.join(DB_DIR, 'viral_nuccore')
        os.chdir(target_dir)
        dt = datetime.date.today().isoformat()
        cml = "makeblastdb -in viral_database.fasta -dbtype nucl -hash_index \
        -title \"Viral database indexed {}\" \
        -out viral_db \
        -logfile blast.log -parse_seqids -taxid_map viral_accn_taxid.dmp".format(
            dt)
        run_child(cml)

    if args.viral == 'p':
        target_dir = os.path.join(DB_DIR, 'viral_protein')
        os.chdir(target_dir)
        dt = datetime.date.today().isoformat()
        cml = "makeblastdb -in viral_database.fasta -dbtype prot -hash_index \
        -title \"Viral database indexed {}\" \
        -out viral_db \
        -logfile blast.log -parse_seqids -taxid_map viral_accn_taxid.dmp".format(
            dt)
        run_child(cml)

    index_pairs = []  # holds (fasta, index) tuples to run in parallel
    if args.bact:
        bwa_dir = os.path.join(DB_DIR, 'bacteria', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s' % err)
        for i in range(1, N_FILES_BACT + 1):
            fasta_file = os.path.join(DB_DIR, 'bacteria', 'fasta',
                                      'bact%d.fasta.gz' % i)
            index_prefix = os.path.join(bwa_dir, 'bact%d' % i)
            index_pairs.append((fasta_file, index_prefix))

    if args.human:
        bwa_dir = os.path.join(DB_DIR, 'human', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s' % err)
        fasta_file = os.path.join(DB_DIR, 'human', 'fasta', 'GRCh38.fasta.gz')
        index_prefix = os.path.join(bwa_dir, 'humanGRCh38')
        index_pairs.append((fasta_file, index_prefix))

    if args.fungal:
        bwa_dir = os.path.join(DB_DIR, 'fungi', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s' % err)
        fasta_file = os.path.join(DB_DIR, 'fungi', 'fasta', 'fungi1.fasta.gz')
        index_prefix = os.path.join(bwa_dir, 'fungi1')
        index_pairs.append((fasta_file, index_prefix))

    if args.bovine:
        bwa_dir = os.path.join(DB_DIR, 'bovine', 'bwa')
        try:
            os.mkdir(bwa_dir)
        except FileExistsError as err:
            logging.warning('FileExistsError: %s' % err)
        fasta_file = os.path.join(
            DB_DIR, 'bovine', 'fasta',
            'ref_Bos_taurus_GCF_002263795.1_ARS-UCD1.2.fasta.gz')
        index_prefix = os.path.join(bwa_dir, 'bt_ref')
        index_pairs.append((fasta_file, index_prefix))

    # run in parallel
    # TODO: use single_process
    pool = mp.Pool()
    results = pool.map(single_bwa_index, index_pairs)
    for r in results:
        logging.info(r)

    # TODO parallelize this too
    for fasta_file, prefix in index_pairs:
        run_child('samtools faidx %s' % fasta_file)