Ejemplo n.º 1
0
    def _get_from_srst2_argannot(self, outprefix):
        """Download the SRST2 ARG-ANNOT FASTA and convert it for ARIBA.

        Fetches ``ARGannot.r1.fasta`` from the SRST2 v0.2.0 tag on GitHub with
        ``wget`` into ``<outprefix>.original.fa``, then writes
        ``<outprefix>.fa`` (sequences renamed to ``<cluster_name>.<name>``) and
        ``<outprefix>.tsv`` (one tab-separated metadata row per sequence).
        The raw download is removed afterwards unless ``self.debug`` is set.

        Args:
            outprefix: Prefix of every file written by this method.

        Raises:
            ValueError: If a sequence ID is not exactly two whitespace-separated
                tokens, or its first token does not have exactly four
                ``__``-separated fields.
        """
        import shlex  # local import: only needed to quote the path for the shell

        srst2_version = '0.2.0'
        srst2_url = 'https://github.com/katholt/srst2/raw/v' + srst2_version + '/data/ARGannot.r1.fasta'
        srst2_fa = outprefix + '.original.fa'
        # Quote the output path so prefixes containing spaces or shell
        # metacharacters do not break (or alter) the command.
        command = 'wget -O ' + shlex.quote(srst2_fa) + ' ' + srst2_url
        common.syscall(command, verbose=True)

        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'

        f_out_fa = None
        f_out_meta = None
        try:
            f_out_fa = pyfastaq.utils.open_file_write(final_fasta)
            f_out_meta = pyfastaq.utils.open_file_write(final_tsv)

            for seq in pyfastaq.sequences.file_reader(srst2_fa):
                original_id = seq.id
                # The unpacking doubles as a format check on the header.
                name, _extra = seq.id.split()
                _cluster_id, cluster_name, _allele_name, _allele_id = name.split('__')
                seq.id = cluster_name + '.' + name
                print(seq, file=f_out_fa)
                print(seq.id, 1, 0, '.', '.', 'Original name: ' + original_id, sep='\t', file=f_out_meta)
        finally:
            # Close whatever was opened, even when a malformed header aborts
            # the conversion part-way through.
            for handle in (f_out_fa, f_out_meta):
                if handle is not None:
                    pyfastaq.utils.close(handle)

        if not self.debug:
            os.unlink(srst2_fa)

        print('Finished downloading and converting data. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print('"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n')
        print(argannot_ref)
        print('and in your methods say that the ARG-ANNOT sequences were used from version', srst2_version, 'of SRST2.')
Ejemplo n.º 2
0
    def _gap_fill_with_gapfiller(self):
        """Gap-fill the scaffolds with GapFiller, or pass them through unchanged.

        When ``self.gapfiller_exe`` is None, or ``self._has_gaps_to_fill``
        reports nothing to fill, the scaffolds are only renamed into
        ``self.gapfilled_scaffolds``. Otherwise GapFiller is run through
        ``perl`` inside a freshly created ``self.gapfill_dir`` and its final
        FASTA is renamed into ``self.gapfilled_scaffolds``.

        Raises:
            Error: If the scaffolds file does not exist, or the gap-fill
                directory cannot be created.
        """
        if not os.path.exists(self.scaffolder_scaffolds):
            raise Error('Cannot gap fill because scaffolds file not found: ' + self.scaffolder_scaffolds)

        if self.gapfiller_exe is None or not self._has_gaps_to_fill(self.scaffolder_scaffolds):
            self._rename_scaffolds(self.scaffolder_scaffolds, self.gapfilled_scaffolds)
            return

        try:
            os.mkdir(self.gapfill_dir)
        except OSError as err:
            raise Error('Error mkdir ' + self.gapfill_dir) from err

        cwd = os.getcwd()
        os.chdir(self.gapfill_dir)
        try:
            lib_file = 'lib'
            with open(lib_file, 'w') as f:
                print('LIB', 'bwa', self.reads1, self.reads2, self.reads_insert, self.sspace_sd, 'FR', file=f)

            cmd = ' '.join([
                'perl', self.gapfiller_exe,
                '-l', lib_file,
                '-s', self.scaffolder_scaffolds
            ])

            gapfilled_scaffolds = os.path.join(self.gapfill_dir, 'standard_output', 'standard_output.gapfilled.final.fa')
            common.syscall(cmd, verbose=self.verbose)
            self._rename_scaffolds(gapfilled_scaffolds, self.gapfilled_scaffolds)
        finally:
            # Restore the caller's working directory even if GapFiller fails,
            # so later steps do not run from inside gapfill_dir.
            os.chdir(cwd)
Ejemplo n.º 3
0
    def run(self):
        """Cluster ``self.infile`` with cd-hit-est and return the clusters.

        Works inside a temporary directory created under the current working
        directory: the input is copied with enumerated names, cd-hit-est is
        run on the copy, the cluster report is parsed back to the original
        names, and the representative sequences are written to
        ``self.outfile``. The temporary directory is removed before returning.

        Returns:
            The ``clusters`` value produced by ``self._parse_cluster_info_file``.
        """
        work_dir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
        cdhit_out = os.path.join(work_dir, 'cdhit')
        renamed_input = os.path.join(work_dir, 'input.renamed.fa')

        # cd-hit truncates all names to 19 bases in its cluster report, so
        # the sequences are temporarily given short names and mapped back
        # to the originals afterwards.
        short_to_original = self._enumerate_fasta(self.infile, renamed_input)

        args = ['cd-hit-est']
        args.extend(['-i', renamed_input])
        args.extend(['-o', cdhit_out])
        args.extend(['-c', str(self.seq_identity_threshold)])
        args.extend(['-T', str(self.threads)])
        args.extend(['-s', str(self.length_diff_cutoff)])
        args.append('-bak 1')
        common.syscall(' '.join(args), verbose=self.verbose)

        representatives = self._get_ids(cdhit_out)
        clusters, rep_to_cluster = self._parse_cluster_info_file(
            cdhit_out + '.bak.clstr',
            short_to_original,
            representatives,
        )
        self._rename_fasta(cdhit_out, self.outfile, rep_to_cluster)
        shutil.rmtree(work_dir)
        return clusters
Ejemplo n.º 4
0
    def _scaffold_with_sspace(self):
        """Scaffold the assembled contigs with SSPACE, or link them unchanged.

        Creates ``self.scaffold_dir``. When ``self.sspace_exe`` is None, the
        contigs file is symlinked under the scaffolds file name inside
        ``self.assembly_dir``. Otherwise SSPACE is run through ``perl`` from
        within ``self.scaffold_dir`` and its final scaffolds FASTA is
        symlinked (by relative path) into ``self.assembly_dir``.

        Raises:
            Error: If the contigs file does not exist, or the scaffold
                directory cannot be created.
        """
        if not os.path.exists(self.assembly_contigs):
            raise Error('Cannot scaffold because contigs file not found: ' + self.assembly_contigs)

        try:
            os.mkdir(self.scaffold_dir)
        except OSError as err:
            raise Error('Error mkdir ' + self.scaffold_dir) from err

        cwd = os.getcwd()
        try:
            if self.sspace_exe is None:
                os.chdir(self.assembly_dir)
                os.symlink(os.path.basename(self.assembly_contigs), os.path.basename(self.scaffolder_scaffolds))
                return

            os.chdir(self.scaffold_dir)
            lib_file = 'lib'
            with open(lib_file, 'w') as f:
                print('LIB', self.reads1, self.reads2, int(self.reads_insert), self.sspace_sd, 'FR', file=f)

            cmd = ' '.join([
                'perl', self.sspace_exe,
                '-k', str(self.sspace_k),
                '-l', lib_file,
                '-s', self.assembly_contigs
            ])

            # Resolve the output path while still inside scaffold_dir.
            sspace_scaffolds = os.path.abspath('standard_output.final.scaffolds.fasta')
            common.syscall(cmd, verbose=self.verbose)
            # relpath is computed from assembly_dir so the link stays relative.
            os.chdir(self.assembly_dir)
            os.symlink(os.path.relpath(sspace_scaffolds), os.path.basename(self.scaffolder_scaffolds))
        finally:
            # Restore the caller's working directory on every exit path,
            # including a failed SSPACE run or symlink.
            os.chdir(cwd)
Ejemplo n.º 5
0
    def _gap_fill_with_gapfiller(self):
        """Gap-fill the scaffolds with GapFiller, or pass them through unchanged.

        When no ``gapfiller`` executable is registered in ``self.extern_progs``,
        or ``self._has_gaps_to_fill`` reports nothing to fill, the scaffolds
        are only renamed into ``self.gapfilled_scaffolds``. Otherwise
        GapFiller is run through ``perl`` inside a freshly created
        ``self.gapfill_dir`` (logging to ``self.log_fh``) and its final FASTA
        is renamed into ``self.gapfilled_scaffolds``. When ``self.clean`` is
        set, the GapFiller directory is deleted after a successful run.

        Raises:
            Error: If the scaffolds file does not exist, or the gap-fill
                directory cannot be created.
        """
        if not os.path.exists(self.scaffolder_scaffolds):
            raise Error('Cannot gap fill because scaffolds file not found: ' + self.scaffolder_scaffolds)

        # Look the executable up once and reuse it for the check and the command.
        gapfiller_exe = self.extern_progs.exe('gapfiller')

        if gapfiller_exe is None or not self._has_gaps_to_fill(self.scaffolder_scaffolds):
            self._rename_scaffolds(self.scaffolder_scaffolds, self.gapfilled_scaffolds, self.scaff_name_prefix)
            return

        try:
            os.mkdir(self.gapfill_dir)
        except OSError as err:
            raise Error('Error mkdir ' + self.gapfill_dir) from err

        cwd = os.getcwd()
        os.chdir(self.gapfill_dir)
        try:
            lib_file = 'lib'
            with open(lib_file, 'w') as f:
                print('LIB', 'bwa', self.reads1, self.reads2, self.reads_insert, self.sspace_sd, 'FR', file=f)

            cmd = ' '.join([
                'perl', gapfiller_exe,
                '-l', lib_file,
                '-s', self.scaffolder_scaffolds
            ])

            gapfilled_scaffolds = os.path.join(self.gapfill_dir, 'standard_output', 'standard_output.gapfilled.final.fa')
            common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
            self._rename_scaffolds(gapfilled_scaffolds, self.gapfilled_scaffolds, self.scaff_name_prefix)
        finally:
            # Restore the caller's working directory even if GapFiller fails.
            os.chdir(cwd)

        # Only reached after a successful run; a failed run keeps its files.
        if self.clean:
            print('Deleting GapFiller directory', self.gapfill_dir, file=self.log_fh)
            shutil.rmtree(self.gapfill_dir)
Ejemplo n.º 6
0
Archivo: mash.py Proyecto: satta/ariba
    def _sketch(self, infile, individual):
        """Run ``mash sketch -s 100000`` on ``infile``.

        The ``-i`` flag is added to the command when ``individual`` is truthy.
        The command is echoed to ``self.log_fh``.
        """
        individual_flag = ["-i"] if individual else []
        args = [self.extern_progs.exe("mash"), "sketch", "-s 100000"] + individual_flag + [infile]
        common.syscall(" ".join(args), verbose=True, verbose_filehandle=self.log_fh)
Ejemplo n.º 7
0
    def _get_from_argannot(self, outprefix):
        """Download the ARG-ANNOT database and convert it to ARIBA input files.

        Downloads and unzips the ARG-ANNOT archive inside a temporary
        directory ``<outprefix>.tmp.download``, then writes
        ``<outprefix>.fa`` and ``<outprefix>.tsv`` from the extracted
        nucleotide sequences file. In each sequence ID only the first
        whitespace-separated token is kept, and a parenthesised part ``(X)``
        is rewritten as ``X.``. The temporary directory is removed unless
        ``self.debug`` is set.

        Args:
            outprefix: Prefix of the output files; made absolute first.

        Raises:
            Error: If the temporary directory cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        tmpdir = outprefix + '.tmp.download'
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except OSError as err:
            raise Error('Error mkdir/chdir ' + tmpdir) from err

        # Named zip_name rather than zipfile so the stdlib module name is
        # not shadowed.
        zip_name = 'arg-annot-database_doc.zip'
        try:
            common.download_file(
                'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip',
                zip_name,
                max_attempts=self.max_download_attempts,
                sleep_time=self.sleep_time,
                verbose=True)
            common.syscall('unzip ' + zip_name)
        finally:
            # Return to the caller's directory even if the download or
            # extraction fails.
            os.chdir(current_dir)
        print('Extracted files.')

        genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'

        f_out_tsv = None
        f_out_fa = None
        try:
            f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
            f_out_fa = pyfastaq.utils.open_file_write(final_fasta)

            for seq in pyfastaq.sequences.file_reader(genes_file):
                original_id = seq.id
                seq.id = re.sub(r'\((.*)\)', r'\1.', seq.id.split()[0])
                print(seq, file=f_out_fa)
                print(seq.id,
                      '1',
                      '0',
                      '.',
                      '.',
                      'Original name: ' + original_id,
                      sep='\t',
                      file=f_out_tsv)
        finally:
            # Close whatever was opened, even if reading the sequences fails.
            for handle in (f_out_tsv, f_out_fa):
                if handle is not None:
                    pyfastaq.utils.close(handle)

        if not self.debug:
            shutil.rmtree(tmpdir)

        print('Finished. Final files are:',
              final_fasta,
              final_tsv,
              sep='\n\t',
              end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv,
              'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print(argannot_ref)
Ejemplo n.º 8
0
    def _get_from_resfinder(self, outprefix):
        """Download the ResFinder database and combine it into ARIBA input files.

        Downloads ``resfinder.zip`` with ``curl`` into a temporary directory
        ``<outprefix>.tmp.download``, unzips it, and merges every ``*.fsa``
        file into ``<outprefix>.fa`` plus a metadata file ``<outprefix>.tsv``.
        In each sequence ID the first ``_`` is replaced by ``.``; IDs without
        an underscore are kept and get ``.`` as their description. A name that
        was already used gets ``_<count>`` appended. The temporary directory
        is removed unless ``self.debug`` is set.

        Args:
            outprefix: Prefix of the output files; made absolute first.

        Raises:
            Error: If the temporary directory cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'
        tmpdir = outprefix + '.tmp.download'
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except OSError as err:
            raise Error('Error mkdir/chdir ' + tmpdir) from err

        try:
            zip_name = 'resfinder.zip'
            cmd = 'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o ' + zip_name + ' https://cge.cbs.dtu.dk/cge/download_data.php'
            print('Downloading data with:', cmd, sep='\n')
            common.syscall(cmd)
            common.syscall('unzip ' + zip_name)

            print('Combining downloaded fasta files...')
            fout_fa = None
            fout_tsv = None
            used_names = {}
            try:
                fout_fa = pyfastaq.utils.open_file_write(final_fasta)
                fout_tsv = pyfastaq.utils.open_file_write(final_tsv)

                # The process is inside tmpdir here, so listdir() and the bare
                # file names below refer to the extracted files.
                for filename in os.listdir():
                    if not filename.endswith('.fsa'):
                        continue

                    print('   ', filename)
                    for seq in pyfastaq.sequences.file_reader(filename):
                        prefix, underscore, suffix = seq.id.partition('_')
                        if underscore:
                            description = 'Original name: ' + seq.id
                            seq.id = prefix + '.' + suffix
                        else:
                            description = '.'

                        # names are not unique across the files
                        if seq.id in used_names:
                            used_names[seq.id] += 1
                            seq.id += '_' + str(used_names[seq.id])
                        else:
                            used_names[seq.id] = 1

                        print(seq, file=fout_fa)
                        print(seq.id, '1', '0', '.', '.', description, sep='\t', file=fout_tsv)
            finally:
                # Close whatever was opened, even if a file fails to parse.
                for handle in (fout_fa, fout_tsv):
                    if handle is not None:
                        pyfastaq.utils.close(handle)

            print('\nFinished combining files\n')
        finally:
            # Return to the caller's directory on success and on failure.
            os.chdir(current_dir)

        if not self.debug:
            shutil.rmtree(tmpdir)
        print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print('"Identification of acquired antimicrobial resistance genes", Zankari et al 2012, PMID: 22782487\n')
Ejemplo n.º 9
0
    def _get_from_resfinder(self, outprefix):
        """Download the ResFinder database and combine it into ARIBA input files.

        Downloads ``resfinder.zip`` with ``curl`` into a temporary directory
        ``<outprefix>.tmp.download``, unzips it, and merges every ``*.fsa``
        file into ``<outprefix>.fa`` plus a metadata file ``<outprefix>.tsv``.
        In each sequence ID the first ``_`` is replaced by ``.``; IDs without
        an underscore are kept and get ``.`` as their description. A name that
        was already used gets ``_<count>`` appended. The temporary directory
        is removed unless ``self.debug`` is set.

        Args:
            outprefix: Prefix of the output files; made absolute first.

        Raises:
            Error: If the temporary directory cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'
        tmpdir = outprefix + '.tmp.download'
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except OSError as err:
            raise Error('Error mkdir/chdir ' + tmpdir) from err

        try:
            zip_name = 'resfinder.zip'
            cmd = 'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o ' + zip_name + ' https://cge.cbs.dtu.dk/cge/download_data.php'
            print('Downloading data with:', cmd, sep='\n')
            common.syscall(cmd)
            common.syscall('unzip ' + zip_name)

            print('Combining downloaded fasta files...')
            fout_fa = None
            fout_tsv = None
            used_names = {}
            try:
                fout_fa = pyfastaq.utils.open_file_write(final_fasta)
                fout_tsv = pyfastaq.utils.open_file_write(final_tsv)

                # The process is inside tmpdir here, so listdir() and the bare
                # file names below refer to the extracted files.
                for filename in os.listdir():
                    if not filename.endswith('.fsa'):
                        continue

                    print('   ', filename)
                    for seq in pyfastaq.sequences.file_reader(filename):
                        prefix, underscore, suffix = seq.id.partition('_')
                        if underscore:
                            description = 'Original name: ' + seq.id
                            seq.id = prefix + '.' + suffix
                        else:
                            description = '.'

                        # names are not unique across the files
                        if seq.id in used_names:
                            used_names[seq.id] += 1
                            seq.id += '_' + str(used_names[seq.id])
                        else:
                            used_names[seq.id] = 1

                        print(seq, file=fout_fa)
                        print(seq.id, '1', '0', '.', '.', description, sep='\t', file=fout_tsv)
            finally:
                # Close whatever was opened, even if a file fails to parse.
                for handle in (fout_fa, fout_tsv):
                    if handle is not None:
                        pyfastaq.utils.close(handle)

            print('\nFinished combining files\n')
        finally:
            # Return to the caller's directory on success and on failure.
            os.chdir(current_dir)

        if not self.debug:
            shutil.rmtree(tmpdir)
        print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print('"Identification of acquired antimicrobial resistance genes", Zankari et al 2012, PMID: 22782487\n')
Ejemplo n.º 10
0
    def _get_from_virulencefinder(self, outprefix):
        """Download the VirulenceFinder database and combine it for ARIBA.

        When ``self.version`` is ``'old'`` the zip archive is downloaded with
        ``curl`` and unzipped inside ``<outprefix>.tmp.download``; otherwise
        the database is fetched into that directory with
        ``RefGenesGetter._get_genetic_epi_database_from_bitbucket`` using
        ``self.version`` as the git commit. Each ``*.fsa`` file is passed
        through ``RefGenesGetter._fix_virulencefinder_fasta_file`` and merged
        into ``<outprefix>.fa`` and ``<outprefix>.tsv``. In each sequence ID
        the first ``_`` becomes ``.`` and spaces become ``_``; a name that was
        already used gets ``.<count>`` appended. The temporary directory is
        removed unless ``self.debug`` is set.

        Args:
            outprefix: Prefix of the output files; made absolute first.

        Raises:
            Error: If, for the ``'old'`` version, the temporary directory
                cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'
        tmpdir = outprefix + '.tmp.download'
        current_dir = os.getcwd()

        try:
            if self.version == 'old':
                try:
                    os.mkdir(tmpdir)
                    os.chdir(tmpdir)
                except OSError as err:
                    raise Error('Error mkdir/chdir ' + tmpdir) from err

                zip_name = 'virulencefinder.zip'
                cmd = 'curl -X POST --data "folder=virulencefinder&filename=virulencefinder.zip" -o ' + zip_name + ' https://cge.cbs.dtu.dk/cge/download_data.php'
                print('Downloading data with:', cmd, sep='\n')
                common.syscall(cmd)
                common.syscall('unzip ' + zip_name)
            else:
                RefGenesGetter._get_genetic_epi_database_from_bitbucket('virulencefinder', tmpdir, git_commit=self.version)
                os.chdir(tmpdir)

            print('Combining downloaded fasta files...')
            fout_fa = None
            fout_tsv = None
            name_count = {}
            try:
                fout_fa = pyfastaq.utils.open_file_write(final_fasta)
                fout_tsv = pyfastaq.utils.open_file_write(final_tsv)

                # listdir() returns a snapshot, so the *.fix.fsa files written
                # during the loop are not themselves picked up.
                for filename in os.listdir(tmpdir):
                    if not filename.endswith('.fsa'):
                        continue

                    print('   ', filename)
                    fix_file = os.path.join(tmpdir, filename + '.fix.fsa')
                    RefGenesGetter._fix_virulencefinder_fasta_file(os.path.join(tmpdir, filename), fix_file)
                    for seq in pyfastaq.sequences.file_reader(fix_file):
                        original_id = seq.id
                        seq.id = seq.id.replace('_', '.', 1)
                        seq.id = seq.id.replace(' ', '_')
                        if seq.id in name_count:
                            name_count[seq.id] += 1
                            seq.id = seq.id + '.' + str(name_count[seq.id])
                        else:
                            name_count[seq.id] = 1
                        print(seq, file=fout_fa)
                        print(seq.id, '0', '0', '.', '.', 'Original name was ' + original_id, sep='\t', file=fout_tsv)
            finally:
                # Close whatever was opened, even if a file fails to parse.
                for handle in (fout_fa, fout_tsv):
                    if handle is not None:
                        pyfastaq.utils.close(handle)

            print('\nFinished combining files\n')
        finally:
            # Return to the caller's directory on success and on failure.
            os.chdir(current_dir)

        if not self.debug:
            common.rmtree(tmpdir)
        print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print('"Real-time whole-genome sequencing for routine typing, surveillance, and outbreak detection of verotoxigenic Escherichia coli", Joensen al 2014, PMID: 24574290\n')
Ejemplo n.º 11
0
 def run(self):
     """Run cd-hit in a temporary directory and return the parsed clusters.

     The command comes from ``self.get_run_cmd``; the clusters are read from
     the ``.bak.clstr`` file next to the cd-hit output prefix, using
     ``self.min_cluster_number``. The temporary directory, created under the
     current working directory, is removed before returning.
     """
     work_dir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
     cdhit_prefix = os.path.join(work_dir, 'cdhit')
     common.syscall(self.get_run_cmd(cdhit_prefix), verbose=self.verbose)
     clusters = self._get_clusters_from_bak_file(
         cdhit_prefix + '.bak.clstr',
         self.min_cluster_number,
     )
     common.rmtree(work_dir)
     return clusters
Ejemplo n.º 12
0
    def _assemble_with_velvet(self):
        """Assemble the reads with velveth/velvetg, guided by the reference gene.

        Maps the reads to ``self.gene_fa`` with bowtie2, then runs velveth and
        velvetg from inside ``self.assembly_dir``. On success the velvet
        contigs are symlinked to the basename of ``self.assembly_contigs``.
        If either program fails, its error output is written to
        ``velveth_errors`` / ``velvetg_errors`` in ``self.assembly_dir`` and
        ``'assembly_fail'`` is added to ``self.status_flag``.

        Sets ``self.velveth_ok`` and, if velveth succeeded,
        ``self.assembled_ok``.
        """
        # map reads to reference gene to make BAM input to velvet columbus
        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.gene_fa,
            self.gene_bam[:-4],
            threads=self.threads,
            sort=True,
            samtools=self.samtools_exe,
            bowtie2=self.bowtie2_exe,
            bowtie2_preset=self.bowtie2_preset,
            verbose=self.verbose,
        )

        velveth_cmd = ' '.join([
            self.velveth,
            self.assembler_dir,
            str(self.assembly_kmer),
            '-reference', self.gene_fa,
            '-shortPaired -bam', self.gene_bam[:-4] + '.unsorted.bam'
        ])

        cwd = os.getcwd()
        os.chdir(self.assembly_dir)
        try:
            velvet_contigs = os.path.join(os.path.split(self.assembler_dir)[1], 'contigs.fa')

            self.velveth_ok, err = common.syscall(velveth_cmd, verbose=self.verbose, allow_fail=True)
            if not self.velveth_ok:
                # The with-block closes the file; no explicit close() needed.
                with open('velveth_errors', 'w') as f:
                    print(err, file=f)
                self.status_flag.add('assembly_fail')
                return

            velvetg_cmd = ' '.join([
                self.velvetg,
                self.assembler_dir,
                '-ins_length', str(int(self.reads_insert)),
                '-scaffolding no',
                '-exp_cov auto',
                '-very_clean yes',
                '-cov_cutoff auto',
            ])

            self.assembled_ok, err = common.syscall(velvetg_cmd, verbose=self.verbose, allow_fail=True)
            if self.assembled_ok:
                os.symlink(velvet_contigs, os.path.basename(self.assembly_contigs))
            else:
                with open('velvetg_errors', 'w') as f:
                    print(err, file=f)
                self.status_flag.add('assembly_fail')
        finally:
            # Restore the caller's working directory on every exit path,
            # including unexpected exceptions.
            os.chdir(cwd)
Ejemplo n.º 13
0
    def _get_from_plasmidfinder(self, outprefix):
        """Download the PlasmidFinder database and combine it for ARIBA.

        Downloads ``plasmidfinder.zip`` with ``curl`` into a temporary
        directory ``<outprefix>.tmp.download``, unzips it, and merges every
        ``*.fsa`` file into ``<outprefix>.fa`` plus a metadata file
        ``<outprefix>.tsv``. In each sequence ID the first ``_`` becomes
        ``.``; a name that was already used gets ``.<count>`` appended. The
        temporary directory is removed after a successful run.

        Args:
            outprefix: Prefix of the output files; made absolute first.

        Raises:
            Error: If the temporary directory cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        final_fasta = outprefix + ".fa"
        final_tsv = outprefix + ".tsv"
        tmpdir = outprefix + ".tmp.download"
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except OSError as err:
            raise Error("Error mkdir/chdir " + tmpdir) from err

        try:
            zip_name = "plasmidfinder.zip"
            cmd = (
                'curl -X POST --data "folder=plasmidfinder&filename=plasmidfinder.zip" -o '
                + zip_name
                + " https://cge.cbs.dtu.dk/cge/download_data.php"
            )
            print("Downloading data with:", cmd, sep="\n")
            common.syscall(cmd)
            common.syscall("unzip " + zip_name)

            print("Combining downloaded fasta files...")
            fout_fa = None
            fout_tsv = None
            name_count = {}
            try:
                fout_fa = pyfastaq.utils.open_file_write(final_fasta)
                fout_tsv = pyfastaq.utils.open_file_write(final_tsv)

                for filename in os.listdir(tmpdir):
                    if not filename.endswith(".fsa"):
                        continue

                    print("   ", filename)
                    for seq in pyfastaq.sequences.file_reader(os.path.join(tmpdir, filename)):
                        original_id = seq.id
                        seq.id = seq.id.replace("_", ".", 1)
                        if seq.id in name_count:
                            name_count[seq.id] += 1
                            seq.id = seq.id + "." + str(name_count[seq.id])
                        else:
                            name_count[seq.id] = 1

                        print(seq, file=fout_fa)
                        print(seq.id, "0", "0", ".", ".", "Original name was " + original_id, sep="\t", file=fout_tsv)
            finally:
                # Close whatever was opened, even if a file fails to parse.
                for handle in (fout_fa, fout_tsv):
                    if handle is not None:
                        pyfastaq.utils.close(handle)

            print("\nFinished combining files\n")
        finally:
            # Return to the caller's directory on success and on failure.
            os.chdir(current_dir)

        shutil.rmtree(tmpdir)
        print("Finished. Final files are:", final_fasta, final_tsv, sep="\n\t", end="\n\n")
        print("You can use them with ARIBA like this:")
        print("ariba prepareref -f", final_fasta, "-m", final_tsv, "output_directory\n")
        print("If you use this downloaded data, please cite:")
        print(
            '"PlasmidFinder and pMLST: in silico detection and typing of plasmids", Carattoli et al 2014, PMID: 24777092\n'
        )
Ejemplo n.º 14
0
    def _get_genetic_epi_database_from_bitbucket(cls, db_name, outdir, git_commit=None):
        assert db_name in {'plasmidfinder', 'resfinder', 'virulencefinder'}
        cmd = 'git clone ' + 'https://bitbucket.org/genomicepidemiology/' + db_name + '_db.git ' + outdir
        common.syscall(cmd)

        if git_commit is not None:
            common.syscall('cd ' + outdir + ' && git checkout ' + git_commit)

        print('Using this git commit for ' + db_name + ' database:')
        subprocess.check_call('cd ' + outdir + ' && git log -n 1', shell=True)
Ejemplo n.º 15
0
 def _run_cdhit_est_2d(reference, reads, outfile, cdhitest2d, verbose=False, verbose_fh=None):
     """Run the ``cdhitest2d`` executable on ``reference`` against ``reads``.

     The command uses the fixed options ``-G 0 -M 0 -d 0 -aS 0.95`` and
     writes to ``outfile``, which is deleted again as soon as the command
     has finished.
     """
     # NOTE(review): presumably callers use a companion file written next to
     # outfile rather than outfile itself -- confirm against the caller.
     parts = [cdhitest2d]
     parts.extend(('-i', reference))
     parts.extend(('-i2', reads))
     parts.append('-G 0 -M 0 -d 0 -aS 0.95')
     parts.extend(('-o', outfile))
     common.syscall(' '.join(parts), verbose=verbose, verbose_filehandle=verbose_fh)
     os.remove(outfile)
Ejemplo n.º 16
0
    def _get_from_plasmidfinder(self, outprefix):
        """Download the PlasmidFinder database and combine it for ARIBA.

        When ``self.version`` is ``'old'`` the zip archive is downloaded with
        ``curl`` and unzipped inside ``<outprefix>.tmp.download``; otherwise
        the database is fetched into that directory with
        ``RefGenesGetter._get_genetic_epi_database_from_bitbucket`` using
        ``self.version`` as the git commit. Every ``*.fsa`` file is merged
        into ``<outprefix>.fa`` and ``<outprefix>.tsv``. In each sequence ID
        the first ``_`` becomes ``.``; a name that was already used gets
        ``.<count>`` appended. The temporary directory is removed unless
        ``self.debug`` is set.

        Args:
            outprefix: Prefix of the output files; made absolute first.

        Raises:
            Error: If, for the ``'old'`` version, the temporary directory
                cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'
        tmpdir = outprefix + '.tmp.download'
        current_dir = os.getcwd()

        try:
            if self.version == 'old':
                try:
                    os.mkdir(tmpdir)
                    os.chdir(tmpdir)
                except OSError as err:
                    raise Error('Error mkdir/chdir ' + tmpdir) from err

                zip_name = 'plasmidfinder.zip'
                cmd = 'curl -X POST --data "folder=plasmidfinder&filename=plasmidfinder.zip" -o ' + zip_name + ' https://cge.cbs.dtu.dk/cge/download_data.php'
                print('Downloading data with:', cmd, sep='\n')
                common.syscall(cmd)
                common.syscall('unzip ' + zip_name)
            else:
                RefGenesGetter._get_genetic_epi_database_from_bitbucket('plasmidfinder', tmpdir, git_commit=self.version)
                os.chdir(tmpdir)

            print('Combining downloaded fasta files...')
            fout_fa = None
            fout_tsv = None
            name_count = {}
            try:
                fout_fa = pyfastaq.utils.open_file_write(final_fasta)
                fout_tsv = pyfastaq.utils.open_file_write(final_tsv)

                for filename in os.listdir(tmpdir):
                    if not filename.endswith('.fsa'):
                        continue

                    print('   ', filename)
                    for seq in pyfastaq.sequences.file_reader(os.path.join(tmpdir, filename)):
                        original_id = seq.id
                        seq.id = seq.id.replace('_', '.', 1)
                        if seq.id in name_count:
                            name_count[seq.id] += 1
                            seq.id = seq.id + '.' + str(name_count[seq.id])
                        else:
                            name_count[seq.id] = 1

                        print(seq, file=fout_fa)
                        print(seq.id, '0', '0', '.', '.', 'Original name was ' + original_id, sep='\t', file=fout_tsv)
            finally:
                # Close whatever was opened, even if a file fails to parse.
                for handle in (fout_fa, fout_tsv):
                    if handle is not None:
                        pyfastaq.utils.close(handle)

            print('\nFinished combining files\n')
        finally:
            # Return to the caller's directory on success and on failure.
            os.chdir(current_dir)

        if not self.debug:
            shutil.rmtree(tmpdir)
        print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print('"PlasmidFinder and pMLST: in silico detection and typing of plasmids", Carattoli et al 2014, PMID: 24777092\n')
Ejemplo n.º 17
0
 def _run_cdhit_est_2d(reference,
                       reads,
                       outfile,
                       cdhitest2d,
                       verbose=False,
                       verbose_fh=None):
     """Run the ``cdhitest2d`` executable on ``reference`` against ``reads``.

     The command uses the fixed options ``-G 0 -M 0 -d 0 -aS 0.95`` and
     writes to ``outfile``, which is deleted again as soon as the command
     has finished.
     """
     input_args = [cdhitest2d, '-i', reference, '-i2', reads]
     fixed_args = ['-G 0 -M 0 -d 0 -aS 0.95']
     output_args = ['-o', outfile]
     full_command = ' '.join(input_args + fixed_args + output_args)
     common.syscall(full_command, verbose=verbose, verbose_filehandle=verbose_fh)
     os.remove(outfile)
Ejemplo n.º 18
0
Archivo: mash.py Proyecto: satta/ariba
 def _dist(self, outfile):
     """Run ``mash dist`` on the reference and query sketches.

     The sketches are expected at ``self.reference_fa + ".msh"`` and
     ``self.query_fa + ".msh"``. The output is piped through
     ``sort -k3n`` and redirected to ``outfile``; the command is echoed to
     ``self.log_fh``.
     """
     mash_exe = self.extern_progs.exe("mash")
     reference_sketch = self.reference_fa + ".msh"
     query_sketch = self.query_fa + ".msh"
     pieces = (mash_exe, "dist", reference_sketch, query_sketch, "| sort -k3n >", outfile)
     common.syscall(" ".join(pieces), verbose=True, verbose_filehandle=self.log_fh)
Ejemplo n.º 19
0
    def _get_from_srst2_argannot(self, outprefix):
        """Download the SRST2 ARG-ANNOT FASTA (r1 or r2) and convert it for ARIBA.

        ``self.version`` selects the release and is set to ``'r2'`` when it
        is None. The FASTA is fetched from the SRST2 master branch on GitHub
        with ``wget`` into ``<outprefix>.original.fa``, then
        ``<outprefix>.fa`` (sequences renamed to ``<cluster_name>.<name>``)
        and ``<outprefix>.tsv`` (one tab-separated metadata row per sequence)
        are written. The raw download is removed afterwards unless
        ``self.debug`` is set.

        Args:
            outprefix: Prefix of every file written by this method.

        Raises:
            Error: If ``self.version`` is neither ``'r1'`` nor ``'r2'``.
            ValueError: If a sequence ID is not exactly two whitespace-separated
                tokens, or its first token does not have exactly four
                ``__``-separated fields.
        """
        import shlex  # local import: only needed to quote the path for the shell

        if self.version is None:
            self.version = 'r2'
        if self.version not in {'r1', 'r2'}:
            raise Error('srst2_argannot version must be r1 or r2. Got this: ' +
                        self.version)

        # The two releases use different file name separators upstream.
        version_string = '.r1' if self.version == 'r1' else '_r2'
        srst2_url = 'https://raw.githubusercontent.com/katholt/srst2/master/data/ARGannot' + version_string + '.fasta'
        srst2_fa = outprefix + '.original.fa'
        # Quote the output path so prefixes containing spaces or shell
        # metacharacters do not break (or alter) the command.
        command = 'wget -O ' + shlex.quote(srst2_fa) + ' ' + srst2_url
        common.syscall(command, verbose=True)

        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'

        f_out_fa = None
        f_out_meta = None
        try:
            f_out_fa = pyfastaq.utils.open_file_write(final_fasta)
            f_out_meta = pyfastaq.utils.open_file_write(final_tsv)

            for seq in pyfastaq.sequences.file_reader(srst2_fa):
                original_id = seq.id
                # The unpacking doubles as a format check on the header.
                name, _extra = seq.id.split()
                _cluster_id, cluster_name, _allele_name, _allele_id = name.split('__')
                seq.id = cluster_name + '.' + name
                print(seq, file=f_out_fa)
                print(seq.id,
                      1,
                      0,
                      '.',
                      '.',
                      'Original name: ' + original_id,
                      sep='\t',
                      file=f_out_meta)
        finally:
            # Close whatever was opened, even when a malformed header aborts
            # the conversion part-way through.
            for handle in (f_out_fa, f_out_meta):
                if handle is not None:
                    pyfastaq.utils.close(handle)

        if not self.debug:
            os.unlink(srst2_fa)

        print('Finished downloading and converting data. Final files are:',
              final_fasta,
              final_tsv,
              sep='\n\t',
              end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv,
              'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print(
            '"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n'
        )
        print(argannot_ref)
Ejemplo n.º 20
0
    def _newick_from_dist_matrix(cls, distance_file, outfile):
        """Write a tree built from ``distance_file`` to ``outfile`` using R.

        Generates a temporary R script ``<outfile>.tmp.R`` that loads the
        ``ape`` library, reads the table, clusters it with
        ``hclust(dist(...))`` and writes the result with ``write.tree``. The
        script is run with ``Rscript --no-save`` and then deleted, together
        with ``<outfile>.tmp.Rout`` if that file exists.
        """
        r_script = outfile + '.tmp.R'
        r_lines = [
            'library(ape)',
            'a=read.table("{}", header=TRUE, row.names=1, comment.char="")'.format(distance_file),
            'h=hclust(dist(a))',
            'write.tree(as.phylo(h), file="{}")'.format(outfile),
        ]

        with open(r_script, 'w') as script_fh:
            script_fh.write('\n'.join(r_lines) + '\n')

        common.syscall('Rscript --no-save ' + r_script)

        r_output = r_script + 'out'
        if os.path.exists(r_output):
            os.remove(r_output)
        os.remove(r_script)
Ejemplo n.º 21
0
    def _get_from_virulencefinder(self, outprefix):
        """Download the VirulenceFinder database and combine it for ARIBA.

        Downloads ``virulencefinder.zip`` with ``curl`` into a temporary
        directory ``<outprefix>.tmp.download``, unzips it, and merges every
        ``*.fsa`` file into ``<outprefix>.fa`` plus a metadata file
        ``<outprefix>.tsv``. In each sequence ID the first ``_`` becomes
        ``.`` and spaces become ``_``; a name that was already used gets
        ``.<count>`` appended. The temporary directory is removed unless
        ``self.debug`` is set.

        Args:
            outprefix: Prefix of the output files; made absolute first.

        Raises:
            Error: If the temporary directory cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'
        tmpdir = outprefix + '.tmp.download'
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except OSError as err:
            raise Error('Error mkdir/chdir ' + tmpdir) from err

        try:
            # Was 'plasmidfinder.zip' (copy-paste slip): the archive requested
            # below is virulencefinder.zip, so save it under its own name.
            zip_name = 'virulencefinder.zip'
            cmd = 'curl -X POST --data "folder=virulencefinder&filename=virulencefinder.zip" -o ' + zip_name + ' https://cge.cbs.dtu.dk/cge/download_data.php'
            print('Downloading data with:', cmd, sep='\n')
            common.syscall(cmd)
            common.syscall('unzip ' + zip_name)

            print('Combining downloaded fasta files...')
            fout_fa = None
            fout_tsv = None
            name_count = {}
            try:
                fout_fa = pyfastaq.utils.open_file_write(final_fasta)
                fout_tsv = pyfastaq.utils.open_file_write(final_tsv)

                for filename in os.listdir(tmpdir):
                    if not filename.endswith('.fsa'):
                        continue

                    print('   ', filename)
                    for seq in pyfastaq.sequences.file_reader(os.path.join(tmpdir, filename)):
                        original_id = seq.id
                        seq.id = seq.id.replace('_', '.', 1)
                        seq.id = seq.id.replace(' ', '_')
                        if seq.id in name_count:
                            name_count[seq.id] += 1
                            seq.id = seq.id + '.' + str(name_count[seq.id])
                        else:
                            name_count[seq.id] = 1
                        print(seq, file=fout_fa)
                        print(seq.id, '0', '0', '.', '.', 'Original name was ' + original_id, sep='\t', file=fout_tsv)
            finally:
                # Close whatever was opened, even if a file fails to parse.
                for handle in (fout_fa, fout_tsv):
                    if handle is not None:
                        pyfastaq.utils.close(handle)

            print('\nFinished combining files\n')
        finally:
            # Return to the caller's directory on success and on failure.
            os.chdir(current_dir)

        if not self.debug:
            shutil.rmtree(tmpdir)
        print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print('"Real-time whole-genome sequencing for routine typing, surveillance, and outbreak detection of verotoxigenic Escherichia coli", Joensen al 2014, PMID: 24574290\n')
Ejemplo n.º 22
0
Archivo: faidx.py Proyecto: rleir/ariba
def write_fa_subset(seq_names, infile, outfile, samtools_exe='samtools', verbose=False, verbose_filehandle=sys.stdout):
    """Extract the named sequences from a FASTA file into ``outfile``.

    Runs ``samtools faidx`` once per name and appends each result to
    ``outfile`` using shell redirection. The FASTA index is built first when
    ``infile + '.fai'`` does not exist, and any pre-existing ``outfile`` is
    removed so that the appends start from an empty file. If ``seq_names``
    is empty, no output file is created.

    Args:
        seq_names: iterable of sequence names to extract.
        infile: path of the FASTA file to read.
        outfile: path of the file to (re)create.
        samtools_exe: samtools executable to invoke.
        verbose: forwarded to ``common.syscall`` for the indexing call.
        verbose_filehandle: forwarded to ``common.syscall`` for the indexing call.
    """
    if not os.path.exists(infile + '.fai'):
        common.syscall(samtools_exe + ' faidx ' + infile, verbose=verbose, verbose_filehandle=verbose_filehandle)

    if os.path.exists(outfile):
        # os.path has no unlink(); the previous os.path.unlink() call raised
        # AttributeError whenever the output file already existed.
        os.unlink(outfile)

    for name in seq_names:
        # Names are double-quoted because they may contain shell metacharacters.
        common.syscall(' '.join([
            samtools_exe + ' faidx',
            infile,
            '"' + name + '"',
            '>>', outfile
        ]))
Ejemplo n.º 23
0
    def _get_from_srst2_argannot(self, outprefix):
        """Fetch the ARG-ANNOT FASTA shipped with SRST2 and convert it for ARIBA.

        Downloads ``ARGannot.r1.fasta`` from the SRST2 v0.2.0 tag with wget,
        renames each sequence to ``<cluster_name>.<first header field>`` and
        writes ``outprefix + '.fa'`` together with a metadata file
        ``outprefix + '.tsv'``. The raw download is removed unless
        ``self.debug`` is set. Usage and citation notes are printed to stdout.
        """
        srst2_version = '0.2.0'
        raw_fasta = outprefix + '.original.fa'
        url = ''.join([
            'https://github.com/katholt/srst2/raw/v',
            srst2_version,
            '/data/ARGannot.r1.fasta',
        ])
        common.syscall(' '.join(['wget -O', raw_fasta, url]), verbose=True)

        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'
        fasta_out = pyfastaq.utils.open_file_write(final_fasta)
        tsv_out = pyfastaq.utils.open_file_write(final_tsv)

        for seq in pyfastaq.sequences.file_reader(raw_fasta):
            original_id = seq.id
            # The header must be exactly two whitespace-separated fields, the
            # first made of four '__'-separated parts; otherwise the unpacking
            # raises ValueError.
            name, _extra = original_id.split()
            _cluster_id, cluster_name, _allele_name, _allele_id = name.split('__')
            seq.id = '.'.join([cluster_name, name])
            print(seq, file=fasta_out)
            metadata = [seq.id, '1', '0', '.', '.', 'Original name: ' + original_id]
            print('\t'.join(metadata), file=tsv_out)

        for handle in (fasta_out, tsv_out):
            pyfastaq.utils.close(handle)

        if not self.debug:
            os.unlink(raw_fasta)

        summary = [
            'Finished downloading and converting data. Final files are:',
            final_fasta,
            final_tsv,
        ]
        print('\n\t'.join(summary), end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print('"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n')
        print(argannot_ref)
        print('and in your methods say that the ARG-ANNOT sequences were used from version', srst2_version, 'of SRST2.')
Ejemplo n.º 24
0
def write_fa_subset(seq_names,
                    infile,
                    outfile,
                    samtools_exe='samtools',
                    verbose=False):
    """Extract the named sequences from a FASTA file into ``outfile``.

    Builds the ``samtools faidx`` index when ``infile + '.fai'`` is missing,
    deletes any existing ``outfile``, then runs ``samtools faidx`` once per
    name, appending each result to ``outfile`` through shell redirection.
    If ``seq_names`` is empty, no output file is created.

    Args:
        seq_names: iterable of sequence names to extract.
        infile: path of the FASTA file to read.
        outfile: path of the file to (re)create.
        samtools_exe: samtools executable to invoke.
        verbose: forwarded to ``common.syscall`` for the indexing call.
    """
    if not os.path.exists(infile + '.fai'):
        common.syscall(samtools_exe + ' faidx ' + infile, verbose=verbose)

    if os.path.exists(outfile):
        # os.path has no unlink(); the previous os.path.unlink() call raised
        # AttributeError whenever the output file already existed.
        os.unlink(outfile)

    for name in seq_names:
        # Names are double-quoted because they may contain shell metacharacters.
        common.syscall(' '.join(
            [samtools_exe + ' faidx', infile, '"' + name + '"', '>>',
             outfile]))
Ejemplo n.º 25
0
    def _assemble_with_spades(self, unittest=False):
        cmd = ' '.join([
            self.spades_exe,
            '-1', self.reads1,
            '-2', self.reads2,
            '-o', self.assembler_dir,
            '-k', str(self.assembly_kmer),
            '--threads', str(self.threads),
            '--untrusted-contigs', self.gene_fa,
        ])
        if self.spades_other is not None:
            cmd += ' ' + self.spades_other

        cwd = os.getcwd()
        os.chdir(self.assembly_dir)
        spades_contigs = os.path.join(os.path.split(self.assembler_dir)[1], 'scaffolds.fasta')

        if unittest:
            os.mkdir(self.assembler_dir)
            open(spades_contigs, 'w').close()
            self.assembled_ok = True
        else:
            self.assembled_ok, err = common.syscall(cmd, verbose=self.verbose, allow_fail=True)
        if self.assembled_ok:
            os.symlink(spades_contigs, os.path.basename(self.assembly_contigs))
        else:
            with open('spades_errors', 'w') as f:
                print(err, file=f)
            f.close()
            self.status_flag.add('assembly_fail')

        os.chdir(cwd)
Ejemplo n.º 26
0
    def _get_from_resfinder(self, outprefix):
        """Download the ResFinder database and merge it into ARIBA input files.

        Works inside a temporary directory ``outprefix + ".tmp.download"``:
        fetches ``resfinder.zip`` with curl, unzips it, then concatenates every
        ``*.fsa`` file found in ``database`` into ``outprefix + ".fa"``,
        prefixing each sequence name with the file's name up to its first dot.
        A metadata line per sequence goes to ``outprefix + ".tsv"``. The
        temporary directory is removed and usage/citation notes are printed.

        Raises:
            Error: if the temporary directory cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        final_fasta, final_tsv = outprefix + ".fa", outprefix + ".tsv"
        tmpdir = outprefix + ".tmp.download"
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except:
            raise Error("Error mkdir/chdir " + tmpdir)

        zipfile = "resfinder.zip"
        cmd = " ".join([
            'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o',
            zipfile,
            "https://cge.cbs.dtu.dk/cge/download_data.php",
        ])
        print("Downloading data with:", cmd, sep="\n")
        common.syscall(cmd)
        common.syscall("unzip " + zipfile)

        print("Combining downloaded fasta files...")
        fout_fa = pyfastaq.utils.open_file_write(final_fasta)
        fout_tsv = pyfastaq.utils.open_file_write(final_tsv)

        # Paths are relative: the current directory is tmpdir at this point.
        for filename in os.listdir("database"):
            if not filename.endswith(".fsa"):
                continue
            print("   ", filename)
            prefix = filename.partition(".")[0]
            fsa_path = os.path.join("database", filename)
            for seq in pyfastaq.sequences.file_reader(fsa_path):
                seq.id = prefix + "." + seq.id
                print(seq, file=fout_fa)
                print("\t".join([seq.id, "1", "0", ".", ".", "."]), file=fout_tsv)

        for handle in (fout_fa, fout_tsv):
            pyfastaq.utils.close(handle)
        print("\nFinished combining files\n")
        os.chdir(current_dir)
        shutil.rmtree(tmpdir)
        print("\n\t".join(["Finished. Final files are:", final_fasta, final_tsv]), end="\n\n")
        print("You can use them with ARIBA like this:")
        print("ariba prepareref -f", final_fasta, "-m", final_tsv, "output_directory\n")
        print("If you use this downloaded data, please cite:")
        print('"Identification of acquired antimicrobial resistance genes", Zankari et al 2012, PMID: 22782487\n')
Ejemplo n.º 27
0
    def _scaffold_with_sspace(self):
        """Produce the scaffolds file, currently by symlinking the contigs.

        Creates ``self.scaffold_dir`` and then, because the SSPACE branch is
        disabled by the ``if True:`` below, symlinks ``self.assembly_contigs``
        to the basename of ``self.scaffolder_scaffolds`` inside
        ``self.working_dir`` and returns. The code after that early return is
        unreachable and kept only as a reference for re-enabling SSPACE.

        Raises:
            Error: if the contigs file does not exist or the scaffold
                directory cannot be created.
        """
        if not os.path.exists(self.assembly_contigs):
            raise Error('Cannot scaffold because contigs file not found: ' + self.assembly_contigs)

        try:
            os.mkdir(self.scaffold_dir)
        except:
            raise Error('Error mkdir '+  self.scaffold_dir)

        cwd = os.getcwd()

        # Original condition, kept for reference:
        #if self.extern_progs.exe('sspace') is None:
        if True:  # no longer use sspace, but leave the option here just in case
            # The link is created relative to working_dir, then the caller's
            # working directory is restored.
            os.chdir(self.working_dir)
            os.symlink(self.assembly_contigs, os.path.basename(self.scaffolder_scaffolds))
            os.chdir(cwd)
            return

        # ---- Unreachable while the branch above is unconditional ----
        os.chdir(self.scaffold_dir)
        # Library description file passed to SSPACE via -l.
        lib_file = 'lib'
        with open(lib_file, 'w') as f:
            print('LIB', self.reads1, self.reads2, int(self.reads_insert), self.sspace_sd, 'FR', file=f)

        cmd = ' '.join([
            'perl', self.extern_progs.exe('sspace'),
            '-k', str(self.sspace_k),
            '-l', lib_file,
            '-s', self.assembly_contigs
        ])

        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
        # Output names are resolved against the current directory (scaffold_dir).
        sspace_scaffolds = os.path.abspath('standard_output.final.scaffolds.fasta')
        sspace_log = os.path.abspath('standard_output.logfile.txt')
        # Copy the SSPACE log into this object's log file handle.
        with open(sspace_log) as f:
            print('\n_______________ SSPACE log __________________\n', file=self.log_fh)
            for line in f:
                print(line.rstrip(), file=self.log_fh)
            print('_______________ End of SSPACE log __________________\n', file=self.log_fh)

        os.rename(sspace_scaffolds, self.scaffolder_scaffolds)
        os.chdir(cwd)

        if self.clean:
            print('Deleting scaffolding directory', self.scaffold_dir, file=self.log_fh)
            shutil.rmtree(self.scaffold_dir)
Ejemplo n.º 28
0
    def _assemble_with_spades(self, unittest=False):
        """Assemble the reads with SPAdes and record its logs.

        Runs SPAdes from ``self.working_dir``. On success the scaffolds file
        is renamed to the basename of ``self.assembly_contigs``; on failure
        the captured errors are written to ``self.log_fh``. Any ``spades.log``
        and ``warnings.log`` found in ``self.assembler_dir`` are copied to the
        log, and the assembler directory is deleted when ``self.clean`` is set.

        Args:
            unittest: when true, SPAdes is not run; the output directory and
                an empty scaffolds file are created instead.

        Raises:
            Error: if ``self.working_dir`` cannot be entered.
        """
        spades_args = [
            self.extern_progs.exe('spades'),
            '-1', self.reads1,
            '-2', self.reads2,
            '-o', self.assembler_dir,
            '-k', str(self.assembly_kmer),
            '--threads 1', # otherwise defaults to 16!
            '--untrusted-contigs', self.ref_fasta,
        ]
        cmd = ' '.join(spades_args)
        if self.spades_other_options is not None:
            cmd = cmd + ' ' + self.spades_other_options

        def copy_to_log(path, header, footer):
            # Echo a text file into the log, framed by header/footer banners.
            with open(path) as handle:
                print(header, file=self.log_fh)
                for text in handle:
                    print(text.rstrip(), file=self.log_fh)
                print(footer, file=self.log_fh)

        cwd = os.getcwd()
        try:
            os.chdir(self.working_dir)
        except:
            raise Error('Error chdir ' + self.working_dir)
        spades_contigs = os.path.join(os.path.basename(self.assembler_dir), 'scaffolds.fasta')

        if not unittest:
            self.assembled_ok, err = common.syscall(cmd, verbose=True, allow_fail=True, verbose_filehandle=self.log_fh, print_errors=False)
        else:
            os.mkdir(self.assembler_dir)
            with open(spades_contigs, 'w'):
                pass
            self.assembled_ok = True

        if not self.assembled_ok:
            print('Assembly finished with errors. These are the errors:', file=self.log_fh)
            print(err, file=self.log_fh)
            print('\nEnd of spades errors\n', file=self.log_fh)
        else:
            os.rename(spades_contigs, os.path.basename(self.assembly_contigs))

        spades_log = os.path.join(self.assembler_dir, 'spades.log')
        if os.path.exists(spades_log):
            self._check_spades_log_file(spades_log)
            copy_to_log(
                spades_log,
                '\n______________ SPAdes log ___________________\n',
                '\n______________ End of SPAdes log _________________\n',
            )

        spades_warnings = os.path.join(self.assembler_dir, 'warnings.log')
        if os.path.exists(spades_warnings):
            copy_to_log(
                spades_warnings,
                '\n______________ SPAdes warnings ___________________\n',
                '\n______________ End of SPAdes warnings _________________\n',
            )

        os.chdir(cwd)

        if self.clean:
            print('Deleting assembly directory', self.assembler_dir, file=self.log_fh)
            shutil.rmtree(self.assembler_dir)
Ejemplo n.º 29
0
def run_bowtie2(
      reads_fwd,
      reads_rev,
      ref_fa,
      out_prefix,
      threads=1,
      max_insert=1000,
      sort=False,
      samtools='samtools',
      bowtie2='bowtie2',
      bowtie2_preset='very-sensitive-local',
      verbose=False
    ):
    """Map a read pair to ``ref_fa`` with Bowtie2 and write ``out_prefix + '.bam'``.

    A Bowtie2 index is built at ``out_prefix + '.map_index'``, the reads are
    mapped and piped through ``samtools view`` into a BAM file. With
    ``sort=True`` the mapping goes to ``out_prefix + '.unsorted.bam'`` first
    and is then sorted (at most 4 threads, 500M split between them) and
    indexed. The six Bowtie2 index files are deleted at the end.
    """
    map_index = out_prefix + '.map_index'
    index_suffixes = ('1', '2', '3', '4', 'rev.1', 'rev.2')
    clean_files = [map_index + '.' + suffix + '.bt2' for suffix in index_suffixes]

    final_bam = out_prefix + '.bam'
    intermediate_bam = out_prefix + '.unsorted.bam' if sort else final_bam

    build_args = [bowtie2 + '-build', '-q', ref_fa, map_index]
    map_args = [
        bowtie2,
        '--threads', str(threads),
        '--' + bowtie2_preset,
        '-X', str(max_insert),
        '-x', map_index,
        '-1', reads_fwd,
        '-2', reads_rev,
        '|', samtools, 'view',
        '-bS -T', ref_fa,
        '- >', intermediate_bam,
    ]

    for args in (build_args, map_args):
        common.syscall(' '.join(args), verbose=verbose)

    if sort:
        sort_threads = min(4, threads)
        thread_mem = int(500 / sort_threads)
        sort_cmd = ' '.join([
            samtools, 'sort',
            '-@' + str(sort_threads),
            '-m', str(thread_mem) + 'M',
            intermediate_bam,
            out_prefix,
        ])
        common.syscall(sort_cmd, verbose=verbose)
        common.syscall(samtools + ' index ' + final_bam, verbose=verbose)

    for index_file in clean_files:
        os.unlink(index_file)
Ejemplo n.º 30
0
def bowtie2_index(ref_fa, outprefix, bowtie2='bowtie2', verbose=False, verbose_filehandle=sys.stdout):
    """Build a Bowtie2 index for ``ref_fa`` unless it already exists.

    The index counts as present when all six ``outprefix.*.bt2`` files exist,
    in which case nothing is run. Otherwise ``<bowtie2>-build -q`` is invoked
    through ``common.syscall``.
    """
    suffixes = ('1', '2', '3', '4', 'rev.1', 'rev.2')
    expected_files = [outprefix + '.' + suffix + '.bt2' for suffix in suffixes]

    if all(os.path.exists(path) for path in expected_files):
        return

    build_cmd = ' '.join((bowtie2 + '-build', '-q', ref_fa, outprefix))
    common.syscall(build_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)
Ejemplo n.º 31
0
def bowtie2_index(ref_fa, outprefix, bowtie2='bowtie2', verbose=False, verbose_filehandle=sys.stdout):
    """Create the Bowtie2 index files for ``ref_fa`` if any of them is missing.

    Checks for the six ``outprefix.<n>.bt2`` files; when at least one is
    absent, runs ``<bowtie2>-build -q ref_fa outprefix`` via
    ``common.syscall``. Does nothing when every file is already there.
    """
    index_files = []
    for part in ['1', '2', '3', '4', 'rev.1', 'rev.2']:
        index_files.append(outprefix + '.' + part + '.bt2')

    needs_build = any(not os.path.exists(name) for name in index_files)

    if needs_build:
        cmd = ' '.join([bowtie2 + '-build', '-q', ref_fa, outprefix])
        common.syscall(cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)
Ejemplo n.º 32
0
    def _get_from_argannot(self, outprefix):
        """Download the ARG-ANNOT database and convert it into ARIBA input files.

        The zip archive is fetched and unpacked inside the temporary directory
        ``outprefix + ".tmp.download"``. Each sequence from
        ``Database Nt Sequences File.txt`` is renamed by replacing a
        parenthesised group ``(X)`` in its name with ``X.`` and written to
        ``outprefix + ".fa"``; one metadata line per sequence goes to
        ``outprefix + ".tsv"``. The temporary directory is then removed and
        usage/citation notes are printed.

        Raises:
            Error: if the temporary directory cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        tmpdir = outprefix + ".tmp.download"
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except:
            raise Error("Error mkdir/chdir " + tmpdir)

        zipfile = "arg-annot-database_doc.zip"
        archive_url = "http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip"
        self._download_file(archive_url, zipfile)
        common.syscall("unzip " + zipfile)
        os.chdir(current_dir)
        print("Extracted files.")

        genes_file = os.path.join(tmpdir, "Database Nt Sequences File.txt")
        final_fasta = outprefix + ".fa"
        final_tsv = outprefix + ".tsv"

        seq_reader = pyfastaq.sequences.file_reader(genes_file)
        f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
        f_out_fa = pyfastaq.utils.open_file_write(final_fasta)

        # Matches from the first '(' to the last ')' in a name (greedy).
        bracketed = re.compile(r"\((.*)\)")
        for seq in seq_reader:
            original_id = seq.id
            seq.id = bracketed.sub(r"\1.", original_id)
            print(seq, file=f_out_fa)
            fields = [seq.id, "1", "0", ".", ".", "Original name was " + original_id]
            print("\t".join(fields), file=f_out_tsv)

        pyfastaq.utils.close(f_out_tsv)
        pyfastaq.utils.close(f_out_fa)
        shutil.rmtree(tmpdir)

        print("\n\t".join(["Finished. Final files are:", final_fasta, final_tsv]), end="\n\n")
        print("You can use them with ARIBA like this:")
        print("ariba prepareref -f", final_fasta, "-m", final_tsv, "output_directory\n")
        print("If you use this downloaded data, please cite:")
        print(argannot_ref)
Ejemplo n.º 33
0
    def _scaffold_with_sspace(self):
        if not os.path.exists(self.assembly_contigs):
            raise Error('Cannot scaffold because contigs file not found: ' +
                        self.assembly_contigs)

        try:
            os.mkdir(self.scaffold_dir)
        except:
            raise Error('Error mkdir ' + self.scaffold_dir)

        cwd = os.getcwd()

        if self.sspace_exe is None:
            os.chdir(self.assembly_dir)
            os.symlink(os.path.basename(self.assembly_contigs),
                       os.path.basename(self.scaffolder_scaffolds))
            os.chdir(cwd)
            return

        os.chdir(self.scaffold_dir)
        lib_file = 'lib'
        with open(lib_file, 'w') as f:
            print('LIB',
                  self.reads1,
                  self.reads2,
                  int(self.reads_insert),
                  self.sspace_sd,
                  'FR',
                  file=f)

        cmd = ' '.join([
            'perl', self.sspace_exe, '-k',
            str(self.sspace_k), '-l', lib_file, '-s', self.assembly_contigs
        ])

        sspace_scaffolds = os.path.abspath(
            'standard_output.final.scaffolds.fasta')
        common.syscall(cmd, verbose=self.verbose)
        os.chdir(self.assembly_dir)
        os.symlink(os.path.relpath(sspace_scaffolds),
                   os.path.basename(self.scaffolder_scaffolds))
        os.chdir(cwd)
Ejemplo n.º 34
0
    def _get_from_argannot(self, outprefix):
        """Download the ARG-ANNOT database and convert it into ARIBA input files.

        The zip archive is downloaded (with this object's retry settings) and
        unpacked inside ``outprefix + '.tmp.download'``. For each sequence in
        ``Database Nt Sequences File.txt`` the first whitespace-separated
        field of the name is kept, with a parenthesised group ``(X)`` rewritten
        as ``X.``. Sequences go to ``outprefix + '.fa'`` and one metadata line
        each to ``outprefix + '.tsv'``. The temporary directory is removed
        unless ``self.debug`` is set.

        Raises:
            Error: if the temporary directory cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        tmpdir = outprefix + '.tmp.download'
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except:
            raise Error('Error mkdir/chdir ' + tmpdir)

        zipfile = 'arg-annot-database_doc.zip'
        archive_url = 'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip'
        common.download_file(
            archive_url,
            zipfile,
            max_attempts=self.max_download_attempts,
            sleep_time=self.sleep_time,
            verbose=True,
        )
        common.syscall('unzip ' + zipfile)
        os.chdir(current_dir)
        print('Extracted files.')

        genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'

        seq_reader = pyfastaq.sequences.file_reader(genes_file)
        f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
        f_out_fa = pyfastaq.utils.open_file_write(final_fasta)

        for seq in seq_reader:
            original_id = seq.id
            first_field = original_id.split()[0]
            seq.id = re.sub(r'\((.*)\)', r'\1.', first_field)
            print(seq, file=f_out_fa)
            fields = [seq.id, '1', '0', '.', '.', 'Original name: ' + original_id]
            print('\t'.join(fields), file=f_out_tsv)

        pyfastaq.utils.close(f_out_tsv)
        pyfastaq.utils.close(f_out_fa)
        if not self.debug:
            shutil.rmtree(tmpdir)

        print('\n\t'.join(['Finished. Final files are:', final_fasta, final_tsv]), end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print(argannot_ref)
Ejemplo n.º 35
0
    def run(self):
        """Cluster ``self.infile`` with cd-hit-est and return the clusters.

        cd-hit-est is run inside a fresh temporary directory created in the
        current working directory. The clusters are parsed from its
        ``.bak.clstr`` output by ``_get_clusters_from_bak_file`` and the
        temporary directory is deleted before returning.
        """
        tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
        cdhit_fasta = os.path.join(tmpdir, 'cdhit')
        cluster_info_outfile = cdhit_fasta + '.bak.clstr'

        cmd_parts = [self.cd_hit_est]
        cmd_parts += ['-i', self.infile]
        cmd_parts += ['-o', cdhit_fasta]
        cmd_parts += ['-c', str(self.seq_identity_threshold)]
        cmd_parts += ['-T', str(self.threads)]
        cmd_parts += ['-s', str(self.length_diff_cutoff)]
        cmd_parts += ['-d 0', '-bak 1']

        common.syscall(' '.join(cmd_parts), verbose=self.verbose)
        clusters = self._get_clusters_from_bak_file(cluster_info_outfile, self.min_cluster_number)
        shutil.rmtree(tmpdir)
        return clusters
Ejemplo n.º 36
0
    def run(self):
        """Run cd-hit-est on ``self.infile`` and return the parsed clusters.

        Output is written under a temporary directory made in the current
        working directory; the ``.bak.clstr`` file produced there is handed to
        ``_get_clusters_from_bak_file`` together with
        ``self.min_cluster_number``. The temporary directory is removed once
        the clusters have been read.
        """
        work_dir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
        out_fasta = os.path.join(work_dir, 'cdhit')
        bak_clstr = out_fasta + '.bak.clstr'

        valued_options = (
            ('-i', self.infile),
            ('-o', out_fasta),
            ('-c', str(self.seq_identity_threshold)),
            ('-T', str(self.threads)),
            ('-s', str(self.length_diff_cutoff)),
        )
        tokens = [self.cd_hit_est]
        for flag, value in valued_options:
            tokens.extend((flag, value))
        tokens.extend(('-d 0', '-bak 1'))
        cmd = ' '.join(tokens)

        common.syscall(cmd, verbose=self.verbose)
        result = self._get_clusters_from_bak_file(bak_clstr, self.min_cluster_number)
        shutil.rmtree(work_dir)
        return result
Ejemplo n.º 37
0
    def _gap_fill_with_gapfiller(self):
        """Fill gaps in the scaffolds with GapFiller when that is possible.

        If ``self.gapfiller_exe`` is None, or ``_has_gaps_to_fill`` reports
        nothing to fill, the scaffolds are passed straight to
        ``_rename_scaffolds``. Otherwise GapFiller is run from
        ``self.gapfill_dir`` and its final output is passed to
        ``_rename_scaffolds`` instead.

        Raises:
            Error: if the scaffolds file does not exist or the gap-fill
                directory cannot be created.
        """
        if not os.path.exists(self.scaffolder_scaffolds):
            raise Error('Cannot gap fill because scaffolds file not found: ' +
                        self.scaffolder_scaffolds)

        cwd = os.getcwd()

        skip_gapfiller = (
            self.gapfiller_exe is None
            or not self._has_gaps_to_fill(self.scaffolder_scaffolds)
        )
        if skip_gapfiller:
            self._rename_scaffolds(self.scaffolder_scaffolds,
                                   self.gapfilled_scaffolds)
            return

        try:
            os.mkdir(self.gapfill_dir)
        except:
            raise Error('Error mkdir ' + self.gapfill_dir)

        os.chdir(self.gapfill_dir)
        lib_file = 'lib'
        lib_fields = [
            'LIB',
            'bwa',
            self.reads1,
            self.reads2,
            self.reads_insert,
            self.sspace_sd,
            'FR',
        ]
        with open(lib_file, 'w') as lib_handle:
            print(*lib_fields, file=lib_handle)

        gapfiller_args = ['perl', self.gapfiller_exe]
        gapfiller_args += ['-l', lib_file]
        gapfiller_args += ['-s', self.scaffolder_scaffolds]
        cmd = ' '.join(gapfiller_args)

        gapfilled_scaffolds = os.path.join(
            self.gapfill_dir, 'standard_output',
            'standard_output.gapfilled.final.fa')
        common.syscall(cmd, verbose=self.verbose)
        self._rename_scaffolds(gapfilled_scaffolds, self.gapfilled_scaffolds)
        os.chdir(cwd)
Ejemplo n.º 38
0
    def _get_from_resfinder(self, outprefix):
        """Download the ResFinder database and merge it into one FASTA file.

        Works inside ``outprefix + '.tmp.download'``: fetches ``resfinder.zip``
        with curl, unzips it and concatenates every ``*.fsa`` file in
        ``database`` into ``outprefix + '.presence_absence.fa'``, prefixing
        each sequence name with the file's name up to its first dot. The
        temporary directory is removed and usage/citation notes are printed.

        Raises:
            Error: if the temporary directory cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        final_fasta = outprefix + '.presence_absence.fa'
        tmpdir = outprefix + '.tmp.download'
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except:
            raise Error('Error mkdir/chdir ' + tmpdir)

        zipfile = 'resfinder.zip'
        cmd = ' '.join([
            'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o',
            zipfile,
            'https://cge.cbs.dtu.dk/cge/download_data.php',
        ])
        print('Downloading data with:', cmd, sep='\n')
        common.syscall(cmd)
        common.syscall('unzip ' + zipfile)

        print('Combining downloaded fasta files...')
        combined = pyfastaq.utils.open_file_write(final_fasta)

        # Paths are relative: the current directory is tmpdir at this point.
        for filename in os.listdir('database'):
            if not filename.endswith('.fsa'):
                continue
            print('   ', filename)
            prefix = filename.partition('.')[0]
            fsa_path = os.path.join('database', filename)
            for seq in pyfastaq.sequences.file_reader(fsa_path):
                seq.id = '.'.join([prefix, seq.id])
                print(seq, file=combined)

        pyfastaq.utils.close(combined)

        print('\nCombined files. Final genes file is called', final_fasta, end='\n\n')
        os.chdir(current_dir)
        shutil.rmtree(tmpdir)

        print('You can use it with ARIBA like this:')
        print('ariba prepareref --ref_prefix', outprefix, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print('"Identification of acquired antimicrobial resistance genes", Zankari et al 2012, PMID: 22782487\n')
Ejemplo n.º 39
0
    def _make_assembly_vcf(self):
        """Call variants on the final assembly and write depth and VCF files.

        Three shell pipelines are run through ``common.syscall``:

        1. ``samtools mpileup`` of ``self.final_assembly_bam`` against
           ``self.final_assembly_fa`` into a temporary VCF.
        2. ``bcftools call -m | bcftools query`` into a tab-separated depths
           file, which is then compressed and indexed with pysam's tabix
           helpers as ``self.final_assembly_read_depths``.
        3. ``bcftools call -m -v | bcftools filter`` using the ``bcf_min_*``
           thresholds, written to ``self.final_assembly_vcf``.

        Both temporary files are deleted once they have been consumed.
        """
        tmp_vcf = self.final_assembly_vcf + '.tmp'
        # Step 1: pileup. '>' is shell redirection into the temporary VCF.
        cmd = ' '.join([
            self.samtools_exe, 'mpileup', '-t INFO/DPR,DV', '-A', '-f',
            self.final_assembly_fa, '-u', '-v', self.final_assembly_bam, '>',
            tmp_vcf
        ])

        common.syscall(cmd, verbose=self.verbose)

        # Step 2: one line per position with depth information.
        # NOTE(review): the query format ends in '%DPR]' with no matching '['
        # — confirm this is intended.
        cmd = ' '.join([
            self.bcftools_exe, 'call -m', tmp_vcf, '|', self.bcftools_exe,
            'query', r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%DPR]\n' ''', '>',
            self.final_assembly_read_depths + '.tmp'
        ])

        common.syscall(cmd, verbose=self.verbose)
        # Compress then index the depths table (sequence name in column 0,
        # position in column 1), and drop the uncompressed copy.
        pysam.tabix_compress(self.final_assembly_read_depths + '.tmp',
                             self.final_assembly_read_depths)
        pysam.tabix_index(self.final_assembly_read_depths,
                          seq_col=0,
                          start_col=1,
                          end_col=1)
        os.unlink(self.final_assembly_read_depths + '.tmp')

        # Step 3: variant sites only (-v), filtered on depth and quality.
        # The filter expression is one double-quoted shell argument assembled
        # from several list items, so the quote characters and the leading
        # spaces inside the fragments are significant.
        cmd = ' '.join([
            self.bcftools_exe, 'call -m -v', tmp_vcf, '|', self.bcftools_exe,
            'filter', '-i', '"MIN(DP)>=' + str(self.bcf_min_dp),
            ' & MIN(DV)>=' + str(self.bcf_min_dv),
            ' & MIN(DV/DP)>=' + str(self.bcf_min_dv_over_dp), ' & QUAL >=',
            str(self.bcf_min_qual), '"', '-o', self.final_assembly_vcf
        ])

        common.syscall(cmd, verbose=self.verbose)
        os.unlink(tmp_vcf)
Ejemplo n.º 40
0
    def _get_from_argannot(self, outprefix):
        """Download the ARG-ANNOT database and write a de-duplicated FASTA file.

        The zip archive is fetched and unpacked inside
        ``outprefix + '.tmp.download'``. Sequence names that occur more than
        once in ``Database Nt Sequences File.txt`` are reported on stderr, and
        ``pyfastaq.tasks.to_unique_by_id`` writes the final
        ``outprefix + '.presence_absence.fa'``. The temporary directory is
        then removed and usage/citation notes are printed.

        Raises:
            Error: if the temporary directory cannot be created or entered.
        """
        outprefix = os.path.abspath(outprefix)
        tmpdir = outprefix + '.tmp.download'
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except:
            raise Error('Error mkdir/chdir ' + tmpdir)

        zipfile = 'arg-annot-database_doc.zip'
        archive_url = 'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip'
        self._download_file(archive_url, zipfile)
        common.syscall('unzip ' + zipfile)
        os.chdir(current_dir)
        print('Extracted files.')

        genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
        final_fasta = outprefix + '.presence_absence.fa'

        # Tally how often each sequence name appears in the download.
        occurrences = {}
        for seq in pyfastaq.sequences.file_reader(genes_file):
            if seq.id in occurrences:
                occurrences[seq.id] += 1
            else:
                occurrences[seq.id] = 1

        for name in sorted(occurrences):
            count = occurrences[name]
            if count <= 1:
                continue
            print('Warning! Sequence name', name, 'found', count, 'times in download. Keeping longest sequence', file=sys.stderr)

        pyfastaq.tasks.to_unique_by_id(genes_file, final_fasta)
        shutil.rmtree(tmpdir)

        print('Finished. Final genes file is called', final_fasta, end='\n\n')
        print('You can use it with ARIBA like this:')
        print('ariba prepareref --ref_prefix', outprefix, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print('"ARG-ANNOT, a new bioinformatic tool to discover antibiotic resistance genes in bacterial genomes",\nGupta et al 2014, PMID: 24145532\n')
Ejemplo n.º 41
0
    def _get_from_srst2_argannot(self, outprefix):
        """Download SRST2's copy of ARG-ANNOT and convert it for ARIBA.

        ``ARGannot.r1.fasta`` from the SRST2 v0.2.0 tag is saved with wget as
        ``outprefix + ".original.fa"``. Every sequence is renamed to
        ``<cluster_name>.<first header field>`` and written to
        ``outprefix + ".fa"``, with one metadata line each in
        ``outprefix + ".tsv"``. Usage and citation notes are printed to stdout.
        """
        srst2_version = "0.2.0"
        download_path = outprefix + ".original.fa"
        url = "".join([
            "https://github.com/katholt/srst2/raw/v",
            srst2_version,
            "/data/ARGannot.r1.fasta",
        ])
        common.syscall(" ".join(["wget -O", download_path, url]), verbose=True)

        final_fasta, final_tsv = outprefix + ".fa", outprefix + ".tsv"

        fasta_handle = pyfastaq.utils.open_file_write(final_fasta)
        meta_handle = pyfastaq.utils.open_file_write(final_tsv)

        for seq in pyfastaq.sequences.file_reader(download_path):
            original_id = seq.id
            # Exactly two whitespace-separated header fields and four
            # "__"-separated parts are required; otherwise ValueError is raised.
            name, _rest = original_id.split()
            _cluster_id, cluster_name, _allele_name, _allele_id = name.split("__")
            seq.id = ".".join((cluster_name, name))
            print(seq, file=fasta_handle)
            row = (seq.id, "1", "0", ".", ".", "Original name: " + original_id)
            print("\t".join(row), file=meta_handle)

        pyfastaq.utils.close(fasta_handle)
        pyfastaq.utils.close(meta_handle)

        headline = "Finished downloading and converting data. Final files are:"
        print("\n\t".join((headline, final_fasta, final_tsv)), end="\n\n")
        print("You can use them with ARIBA like this:")
        print("ariba prepareref -f", final_fasta, "-m", final_tsv, "output_directory\n")
        print("If you use this downloaded data, please cite:")
        print(
            '"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n'
        )
        print(argannot_ref)
        print("and in your methods say that the ARG-ANNOT sequences were used from version", srst2_version, "of SRST2.")
Ejemplo n.º 42
0
def run_bowtie2(reads_fwd,
                reads_rev,
                ref_fa,
                out_prefix,
                threads=1,
                max_insert=1000,
                sort=False,
                samtools='samtools',
                bowtie2='bowtie2',
                bowtie2_preset='very-sensitive-local',
                verbose=False):
    """Map paired reads to ``ref_fa`` with Bowtie2, producing ``out_prefix + '.bam'``.

    Builds a Bowtie2 index at ``out_prefix + '.map_index'``, maps the reads
    and converts the output to BAM with ``samtools view``. When ``sort`` is
    true the mapping is written to ``out_prefix + '.unsorted.bam'`` and then
    sorted (using at most 4 threads with 500M shared between them) and
    indexed. The six Bowtie2 index files are removed before returning.
    """
    map_index = out_prefix + '.map_index'
    clean_files = []
    for suffix in ['1', '2', '3', '4', 'rev.1', 'rev.2']:
        clean_files.append(map_index + '.' + suffix + '.bt2')

    final_bam = out_prefix + '.bam'
    intermediate_bam = final_bam
    if sort:
        intermediate_bam = out_prefix + '.unsorted.bam'

    index_cmd = ' '.join((bowtie2 + '-build', '-q', ref_fa, map_index))

    mapper = [bowtie2, '--threads', str(threads), '--' + bowtie2_preset]
    mapper += ['-X', str(max_insert), '-x', map_index]
    mapper += ['-1', reads_fwd, '-2', reads_rev]
    to_bam = [samtools, 'view', '-bS -T', ref_fa, '- >', intermediate_bam]
    map_cmd = ' '.join(mapper + ['|'] + to_bam)

    common.syscall(index_cmd, verbose=verbose)
    common.syscall(map_cmd, verbose=verbose)

    if sort:
        sort_threads = min(4, threads)
        mem_per_thread = str(int(500 / sort_threads)) + 'M'
        sort_cmd = ' '.join([
            samtools, 'sort', '-@' + str(sort_threads), '-m', mem_per_thread,
            intermediate_bam, out_prefix
        ])
        bam_index_cmd = ' '.join([samtools, 'index', final_bam])
        common.syscall(sort_cmd, verbose=verbose)
        common.syscall(bam_index_cmd, verbose=verbose)

    for path in clean_files:
        os.unlink(path)
Ejemplo n.º 43
0
    def _assemble_with_spades(self, unittest=False):
        cmd = ' '.join([
            self.spades_exe,
            '-1',
            self.reads1,
            '-2',
            self.reads2,
            '-o',
            self.assembler_dir,
            '-k',
            str(self.assembly_kmer),
            '--threads',
            str(self.threads),
            '--untrusted-contigs',
            self.gene_fa,
        ])
        if self.spades_other is not None:
            cmd += ' ' + self.spades_other

        cwd = os.getcwd()
        os.chdir(self.assembly_dir)
        spades_contigs = os.path.join(
            os.path.split(self.assembler_dir)[1], 'scaffolds.fasta')

        if unittest:
            os.mkdir(self.assembler_dir)
            open(spades_contigs, 'w').close()
            self.assembled_ok = True
        else:
            self.assembled_ok, err = common.syscall(cmd,
                                                    verbose=self.verbose,
                                                    allow_fail=True)
        if self.assembled_ok:
            os.symlink(spades_contigs, os.path.basename(self.assembly_contigs))
        else:
            with open('spades_errors', 'w') as f:
                print(err, file=f)
            f.close()
            self.status_flag.add('assembly_fail')

        os.chdir(cwd)
Ejemplo n.º 44
0
    def _make_assembly_vcf(self):
        """Call variants on the final assembly and write depth and VCF files.

        Three shell pipelines are run through ``common.syscall``:

        1. ``samtools mpileup`` of ``self.final_assembly_bam`` against
           ``self.final_assembly_fa`` into a temporary VCF.
        2. ``bcftools call -m | bcftools query`` into a tab-separated depths
           file, which is then compressed and indexed with pysam's tabix
           helpers as ``self.final_assembly_read_depths``.
        3. ``bcftools call -m -v | bcftools filter`` using the ``bcf_min_*``
           thresholds, written to ``self.final_assembly_vcf``.

        Both temporary files are deleted once they have been consumed.
        """
        tmp_vcf = self.final_assembly_vcf + '.tmp'
        # Step 1: pileup. '>' is shell redirection into the temporary VCF.
        cmd = ' '.join([
            self.samtools_exe, 'mpileup',
            '-t INFO/DPR,DV',
            '-A',
            '-f', self.final_assembly_fa,
            '-u',
            '-v',
            self.final_assembly_bam,
            '>',
            tmp_vcf
        ])

        common.syscall(cmd, verbose=self.verbose)

        # Step 2: one line per position with depth information.
        # NOTE(review): the query format ends in '%DPR]' with no matching '['
        # — confirm this is intended.
        cmd = ' '.join([
            self.bcftools_exe, 'call -m',
            tmp_vcf,
            '|',
            self.bcftools_exe, 'query',
            r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%DPR]\n' ''',
            '>',
            self.final_assembly_read_depths + '.tmp'
        ])

        common.syscall(cmd, verbose=self.verbose)
        # Compress then index the depths table (sequence name in column 0,
        # position in column 1), and drop the uncompressed copy.
        pysam.tabix_compress(self.final_assembly_read_depths + '.tmp', self.final_assembly_read_depths)
        pysam.tabix_index(self.final_assembly_read_depths, seq_col=0, start_col=1, end_col=1)
        os.unlink(self.final_assembly_read_depths + '.tmp')

        # Step 3: variant sites only (-v), filtered on depth and quality.
        # The filter expression is one double-quoted shell argument assembled
        # from several list items, so the quote characters and the leading
        # spaces inside the fragments are significant.
        cmd = ' '.join([
            self.bcftools_exe, 'call -m -v',
            tmp_vcf,
            '|',
            self.bcftools_exe, 'filter',
            '-i', '"MIN(DP)>=' + str(self.bcf_min_dp),
                  ' & MIN(DV)>=' + str(self.bcf_min_dv),
                  ' & MIN(DV/DP)>=' + str(self.bcf_min_dv_over_dp),
                  ' & QUAL >=', str(self.bcf_min_qual), '"',
            '-o', self.final_assembly_vcf
        ])

        common.syscall(cmd, verbose=self.verbose)
        os.unlink(tmp_vcf)
Ejemplo n.º 45
0
    def _make_vcf_and_read_depths_files(self):
        """Create the tabix-indexed read depths file and the filtered VCF.

        Runs ``samtools mpileup`` on ``self.bam`` against ``self.ref_fa`` into
        a temporary VCF, derives a per-position depths table from it
        (compressed and indexed as ``self.read_depths_file``), then writes the
        variant calls passing ``SUM(AD)>=5 & MIN(AD)/DP>=0.1`` to
        ``self.vcf_file``. Every command is echoed to ``self.log_fh`` and both
        temporary files are deleted.
        """
        tmp_vcf = self.vcf_file + '.tmp'
        depths_tmp = self.read_depths_file + '.tmp'

        def run_logged(parts):
            # Join the pieces into one shell command and run it, echoing to the log.
            common.syscall(' '.join(parts), verbose=True, verbose_filehandle=self.log_fh)

        run_logged([
            self.samtools_exe, 'mpileup',
            '-t INFO/AD',
            '-A',
            '-f', self.ref_fa,
            '-u',
            '-v',
            self.bam,
            '>',
            tmp_vcf,
        ])

        call_all_sites = [self.bcftools_exe, 'call -m', tmp_vcf]
        query_depths = [
            self.bcftools_exe, 'query',
            r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%AD]\n' ''',
            '>',
            depths_tmp,
        ]
        run_logged(call_all_sites + ['|'] + query_depths)

        pysam.tabix_compress(depths_tmp, self.read_depths_file)
        pysam.tabix_index(self.read_depths_file, seq_col=0, start_col=1, end_col=1)
        os.unlink(depths_tmp)

        call_variants = [self.bcftools_exe, 'call -m -v', tmp_vcf]
        filter_variants = [
            self.bcftools_exe, 'filter',
            '-i', '"SUM(AD)>=5 & MIN(AD)/DP>=0.1"',
            '-o', self.vcf_file,
        ]
        run_logged(call_variants + ['|'] + filter_variants)
        os.unlink(tmp_vcf)
Ejemplo n.º 46
0
def run_bowtie2(
      reads_fwd,
      reads_rev,
      ref_fa,
      out_prefix,
      threads=1,
      max_insert=1000,
      sort=False,
      bowtie2='bowtie2',
      bowtie2_preset='very-sensitive-local',
      bowtie2_version=None,
      verbose=False,
      verbose_filehandle=sys.stdout,
      remove_both_unmapped=False,
      clean_index=True,
    ):
    """Map paired reads to ``ref_fa`` with bowtie2, producing ``out_prefix + '.bam'``.

    Args:
        reads_fwd: forward reads file (passed to bowtie2 as ``-1``).
        reads_rev: reverse reads file (passed to bowtie2 as ``-2``).
        ref_fa: reference FASTA. If all bowtie2 index files already exist next
            to it, they are reused; otherwise an index is built at
            ``out_prefix + '.map_index'``.
        out_prefix: prefix for every file this function writes.
        threads: value for bowtie2 ``--threads``.
        max_insert: value for bowtie2 ``-X``.
        sort: if true, also sort and index the BAM; the unsorted intermediate
            (``out_prefix + '.unsorted.bam'``) is deleted afterwards.
        bowtie2: bowtie2 executable name or path.
        bowtie2_preset: preset name, passed as ``--<preset>``.
        bowtie2_version: version string of the bowtie2 executable. When it is
            at least 2.3.1, ``--score-min G,1,10`` is added. ``None`` or an
            empty string means "unknown" and the option is not added.
        verbose: echo progress messages and commands.
        verbose_filehandle: where verbose output is written.
        remove_both_unmapped: pipe the SAM output through a gawk filter that
            drops records having both flag bits 4 and 8 set.
        clean_index: delete the index files if this call had to build them.
    """
    ref_is_indexed = all(
        os.path.exists(ref_fa + '.' + ext) for ext in bowtie2_index_extensions
    )

    clean_files = []

    if ref_is_indexed:
        if verbose:
            print('Bowtie2 index files found (', ref_fa, '.*.bt2) so no need to index', sep='', file=verbose_filehandle)
        map_index = ref_fa
    else:
        map_index = out_prefix + '.map_index'
        bowtie2_index(ref_fa, map_index, bowtie2=bowtie2, verbose=verbose, verbose_filehandle=verbose_filehandle)

        if clean_index:
            clean_files = [map_index + '.' + x + '.bt2' for x in ['1', '2', '3', '4', 'rev.1', 'rev.2']]

    final_bam = out_prefix + '.bam'
    intermediate_bam = out_prefix + '.unsorted.bam' if sort else final_bam

    map_cmd = [
        bowtie2,
        '--threads', str(threads),
        '--reorder',
        '--' + bowtie2_preset,
        '-X', str(max_insert),
        '-x', map_index,
        '-1', reads_fwd,
        '-2', reads_rev,
    ]

    # An unset version cannot be compared: LooseVersion(None) is never parsed
    # and the comparison raises AttributeError. Treat "unknown" as "do not add
    # the version-specific option" so the default call works.
    if bowtie2_version and LooseVersion(bowtie2_version) >= LooseVersion('2.3.1'):
        map_cmd.append('--score-min G,1,10')

    # We use gawk instead of awk here as we need bitwise comparisons
    # and these are not available via awk on Mac OSX.
    if remove_both_unmapped:
        map_cmd.append(r''' | gawk ' !(and($2,4)) || !(and($2,8)) ' ''')

    tmp_sam_file = out_prefix + '.unsorted.sam'
    map_cmd.append(' > ' + tmp_sam_file)
    map_cmd = ' '.join(map_cmd)

    common.syscall(map_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)

    if verbose:
        print('Converting', tmp_sam_file, '->', intermediate_bam, file=verbose_filehandle)
    # Context managers make sure both handles are closed even if a record
    # fails to convert part-way through.
    with pysam.AlignmentFile(tmp_sam_file, "r") as infile, \
            pysam.AlignmentFile(intermediate_bam, "wb", template=infile) as outfile:
        for record in infile:
            outfile.write(record)
    os.unlink(tmp_sam_file)

    if sort:
        if verbose:
            print('Sorting', intermediate_bam, '->', final_bam, file=verbose_filehandle)
        pysam.sort('-o', final_bam, '-O', 'BAM', intermediate_bam)
        if verbose:
            print('Indexing', final_bam, file=verbose_filehandle)
        pysam.index(final_bam)
        clean_files.append(intermediate_bam)

    for fname in clean_files:
        os.unlink(fname)
Ejemplo n.º 47
0
 def _sort_file(infile, outfile, log_fh=None):
     """Sort ``infile`` into ``outfile`` with the external ``sort`` command.

     Keys are column 1, then column 2 numerically. The command is echoed to
     ``log_fh`` only when a log handle is supplied.
     """
     sort_cmd = ' '.join(['sort -k1,1 -k 2,2n', infile, '>', outfile])
     common.syscall(sort_cmd, verbose=(log_fh is not None), verbose_filehandle=log_fh)
Ejemplo n.º 48
0
    def _assemble_with_spades(self):
        """Run SPAdes on the read pair and collect its contigs.

        The process changes into ``self.working_dir`` for the duration of the
        call and always returns to the starting directory. On a successful
        SPAdes run, the SPAdes log and warnings files (when present) are copied
        into ``self.log_fh`` and the output contigs are rewritten, with
        renamed headers, to ``self.all_assembly_contigs_fa``.
        ``self.assembled_ok`` becomes True only if at least one contig was
        written. When ``self.clean`` is set, ``self.assembler_dir`` is removed
        afterwards whether or not the assembly succeeded.

        Raises:
            Error: if ``self.working_dir`` cannot be entered or no SPAdes
                executable is configured.
            ValueError: if ``self.spades_mode`` is not "rna", "sc" or "wgs".
        """

        def copy_into_log(path, header, footer):
            # Echo a SPAdes text file into our log between two banner lines.
            with open(path) as f:
                print(header, file=self.log_fh)
                for line in f:
                    print(line.rstrip(), file=self.log_fh)
                print(footer, file=self.log_fh)

        cwd = os.getcwd()
        self.assembled_ok = False
        try:
            try:
                os.chdir(self.working_dir)
            except Exception as err:
                # Not a bare except: KeyboardInterrupt/SystemExit must not be
                # disguised as an assembly error. Keep the cause attached.
                raise Error('Error chdir ' + self.working_dir) from err
            spades_exe = self.extern_progs.exe('spades')
            if not spades_exe:
                raise Error("Spades executable has not been found")
            spades_options = self.spades_options
            if spades_options is not None:
                spades_options = shlex.split(self.spades_options)
            # User-supplied options replace the per-mode defaults; the mode
            # flag itself (--rna / --sc) is always kept.
            if self.spades_mode == "rna":
                spades_options = ["--rna"] + (["-k", "127"] if spades_options
                                              is None else spades_options)
                spades_out_seq_base = "transcripts.fasta"
            elif self.spades_mode == "sc":
                spades_options = ["--sc"] + ([
                    "-k", "33,55,77,99,127", "--careful"
                ] if spades_options is None else spades_options)
                spades_out_seq_base = "contigs.fasta"
            elif self.spades_mode == "wgs":
                spades_options = [
                    "-k", "33,55,77,99,127", "--careful"
                ] if spades_options is None else spades_options
                spades_out_seq_base = "contigs.fasta"
            else:
                raise ValueError("Unknown spades_mode value: {}".format(
                    self.spades_mode))
            asm_cmd = [spades_exe, "-t", str(self.threads), "--pe1-1", self.reads1, "--pe1-2", self.reads2, "-o", self.assembler_dir] + \
                spades_options
            # Argument list with shell=False; a failing run is reported via
            # the return value instead of raising.
            asm_ok, err = common.syscall(asm_cmd,
                                         verbose=True,
                                         verbose_filehandle=self.log_fh,
                                         shell=False,
                                         allow_fail=True)
            if not asm_ok:
                print('Assembly finished with errors. These are the errors:',
                      file=self.log_fh)
                print(err, file=self.log_fh)
                print('\nEnd of spades errors\n', file=self.log_fh)
            else:

                spades_log = os.path.join(self.assembler_dir, 'spades.log')
                if os.path.exists(spades_log):
                    self._check_spades_log_file(spades_log)
                    copy_into_log(
                        spades_log,
                        '\n______________ SPAdes log ___________________\n',
                        '\n______________ End of SPAdes log _________________\n')

                spades_warnings = os.path.join(self.assembler_dir,
                                               'warnings.log')
                if os.path.exists(spades_warnings):
                    copy_into_log(
                        spades_warnings,
                        '\n______________ SPAdes warnings ___________________\n',
                        '\n______________ End of SPAdes warnings _________________\n')

                ## The fermilight module generates contig names that look like `cluster_1.l15.c17.ctg.1` where 'cluster_1'==self.contig_name_prefix
                ## The whole structure of the contig name is expected in several places downstream, where it is parsed into individual components.
                ## For example, it is parsed into the l and c parts in ref_seq_chooser (although the parts are not actually used).
                ## This is the code from the fermilight module that generates the contig ID string:
                ## ofs << ">" << namePrefix << ".l" << overlap << ".c" << minCount << ".ctg." << i + 1 << '\n'
                ##
                ## We generate the same contig name structure here using dummy values for overlap and minCount, in order
                ## to avoid disrupting the downstream code.
                ## Note that the fermilight module generates multiple versions of the assembly on a grid of l and c values,
                ## and ref_seq_chooser then picks a single "best" (l,c) version based on coverage/identity of the nucmer
                ## alignment to the reference. Spades generates a single version of the assembly, so ref_seq_chooser
                ## can only pick that one version.

                spades_out_seq = os.path.join(self.assembler_dir,
                                              spades_out_seq_base)
                ## There is no real need for the general-purpose pyfastaq.sequences.file_reader here, with the performance cost
                ## of its multi-format line tests, since we are only replacing the IDs in a pre-defined format
                if os.path.exists(spades_out_seq):
                    with open(spades_out_seq,
                              "r") as inp, open(self.all_assembly_contigs_fa,
                                                "w") as out:
                        pref = self.contig_name_prefix
                        i_cont = 0
                        for line in inp:
                            if line.startswith(">"):
                                i_cont += 1
                                line = ">{}.l15.c17.ctg.{}\n".format(
                                    pref, i_cont)
                            out.write(line)
                        if i_cont > 0:
                            self.assembled_ok = True
            if self.clean:
                print('Deleting assembly directory',
                      self.assembler_dir,
                      file=self.log_fh)
                shutil.rmtree(self.assembler_dir, ignore_errors=True)
        finally:
            os.chdir(cwd)
Ejemplo n.º 49
0
def run_smalt(reads_fwd,
              reads_rev,
              ref_fa,
              out_prefix,
              index_k=9,
              index_s=2,
              threads=1,
              max_insert=1000,
              minid=0.9,
              sort=False,
              extra_smalt_map_ops='-x',
              samtools='samtools',
              smalt='smalt',
              verbose=False):
    """Index ``ref_fa`` with smalt, map reads to it and write a BAM file.

    The smalt index is built at ``out_prefix + '.map_index'`` and its
    ``.smi``/``.sma`` files are deleted at the end. Mapping output is piped
    through ``samtools view`` into ``out_prefix + '.bam'``, or into
    ``out_prefix + '.unsorted.bam'`` when ``sort`` is true, in which case it
    is then sorted and indexed. The unsorted BAM is left on disk.

    Args:
        reads_fwd: forward (or only) reads file.
        reads_rev: reverse reads file, or None for unpaired mapping.
        ref_fa: reference FASTA.
        out_prefix: prefix of all files written.
        index_k, index_s: values for ``smalt index -k`` and ``-s``.
        threads: ``smalt map -n`` value, used only when greater than 1.
        max_insert: ``smalt map -i`` value (paired mapping only).
        minid: ``smalt map -y`` value.
        sort: whether to sort and index the resulting BAM.
        extra_smalt_map_ops: extra text placed right after ``smalt map``;
            None is treated as an empty string.
        samtools, smalt: executable names or paths.
        verbose: passed through to ``common.syscall``.
    """
    smalt_map_flags = '' if extra_smalt_map_ops is None else extra_smalt_map_ops
    map_index = out_prefix + '.map_index'
    index_files = [map_index + '.' + suffix for suffix in ('smi', 'sma')]

    index_cmd = ' '.join(
        [smalt, 'index', '-k', str(index_k), '-s', str(index_s), map_index, ref_fa])

    # depending on OS, -n can break smalt, so only use -n if it's > 1.
    thread_flags = ('-n ' + str(threads) + ' -O ') if threads > 1 else ''

    # Unpaired mapping has no insert-size option and no second reads file.
    mapping_args = ['-y', str(minid), map_index, reads_fwd]
    if reads_rev is not None:
        mapping_args = ['-i', str(max_insert)] + mapping_args + [reads_rev]

    final_bam = out_prefix + '.bam'
    intermediate_bam = out_prefix + '.unsorted.bam' if sort else final_bam

    map_cmd = ''.join([
        smalt, ' map ', smalt_map_flags, ' ',
        thread_flags,
        ' '.join(mapping_args),
        ' | ', samtools, ' view',
        ' -bS -T ', ref_fa, '  - > ', intermediate_bam,
    ])

    common.syscall(index_cmd, verbose=verbose)
    common.syscall(map_cmd, verbose=verbose)

    if sort:
        # At most 4 sort threads; -m is 500 divided by the thread count, in M.
        threads = min(4, threads)
        thread_mem = int(500 / threads)
        # out_prefix is the last positional argument; final_bam is
        # presumably what samtools derives from it -- TODO confirm.
        sort_cmd = ' '.join([
            samtools, 'sort',
            '-@' + str(threads),
            '-m', str(thread_mem) + 'M',
            intermediate_bam,
            out_prefix,
        ])
        bam_index_cmd = samtools + ' index ' + final_bam
        common.syscall(sort_cmd, verbose=verbose)
        common.syscall(bam_index_cmd, verbose=verbose)

    for path in index_files:
        os.unlink(path)
Ejemplo n.º 50
0
 def _sort_file(infile, outfile, log_fh=None):
     """Write a sorted copy of ``infile`` to ``outfile`` via the shell ``sort``.

     Sorting is on column 1, then numerically on column 2. Passing a
     ``log_fh`` turns on verbose output, which is written to that handle.
     """
     shell_words = ['sort -k1,1 -k 2,2n', infile, '>', outfile]
     want_logging = log_fh is not None
     common.syscall(' '.join(shell_words), verbose=want_logging, verbose_filehandle=log_fh)
Ejemplo n.º 51
0
    def _get_from_card(self, outprefix):
        """Download a CARD release and convert it to ARIBA input files.

        Works inside a temporary directory ``<outprefix>.download``: picks the
        requested version (``self.version``) or the newest one available,
        downloads the tarball with wget, extracts ``card.json`` and writes:

        * ``<outprefix>.fa``  -- one FASTA record per usable DNA sequence,
        * ``<outprefix>.tsv`` -- metadata lines for those records,
        * ``<outprefix>.log`` -- sequences that were skipped, and why.

        The original working directory is always restored and the output
        files are always closed, even when an error interrupts the run. The
        temporary directory is deleted only after a successful run and only
        if ``self.debug`` is false.

        Args:
            outprefix: prefix of the output files (made absolute first).

        Raises:
            Error: if the temporary directory cannot be created or entered,
                the requested version is not available, or the download is
                not a valid tar archive.
        """
        outprefix = os.path.abspath(outprefix)
        tmpdir = outprefix + '.download'
        current_dir = os.getcwd()
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'
        log_file = outprefix + '.log'

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except Exception as err:
            # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
            raise Error('Error mkdir/chdir ' + tmpdir) from err

        open_handles = []
        try:
            versions = self._get_card_versions('download.html')
            if self.version is not None:
                key = tuple([int(x) for x in self.version.split('.')])
                if key not in versions:
                    raise Error('Error! Did not find requested version ' +
                                self.version)
            else:
                # No version requested: take the highest version tuple.
                key = sorted(versions.keys())[-1]
                self.version = '.'.join([str(x) for x in key])

            print('Getting version', self.version)
            card_tarball_url = versions[key]
            card_tarball = 'card.tar.bz2'
            print('Working in temporary directory', tmpdir)
            print('Downloading data from card:', card_tarball_url, flush=True)
            common.syscall('wget -O ' + card_tarball + ' ' + card_tarball_url,
                           verbose=True)
            print('...finished downloading', flush=True)
            if not tarfile.is_tarfile(card_tarball):
                raise Error(
                    'File ' + card_tarball + ' downloaded from ' +
                    card_tarball_url +
                    ' does not look like a valid tar archive. Cannot continue')

            json_file = './card.json'
            with tarfile.open(card_tarball, 'r') as tfile:
                tfile.extract(json_file)

            print('Extracted json data file ',
                  json_file,
                  '. Reading its contents...',
                  sep='')

            f_out_fa = pyfastaq.utils.open_file_write(final_fasta)
            open_handles.append(f_out_fa)
            f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
            open_handles.append(f_out_tsv)
            f_out_log = pyfastaq.utils.open_file_write(log_file)
            open_handles.append(f_out_log)

            with open(json_file) as f:
                json_data = json.load(f)

            # Keys starting with '_' are skipped; the rest become integer keys
            # so that records are processed in numeric order.
            json_data = {
                int(x): json_data[x]
                for x in json_data if not x.startswith('_')
            }
            print('Found',
                  len(json_data),
                  'records in the json file. Analysing...',
                  flush=True)

            for gene_key, gene_dict in sorted(json_data.items()):
                crecord = card_record.CardRecord(gene_dict)
                data = crecord.get_data()
                # NOTE(review): on Python 3 this makes the description a bytes
                # object, so it is written to the TSV in its repr form
                # (b'...') and the `!= ''` tests below are always true.
                # Left unchanged so the output format stays the same;
                # confirm whether that is intended.
                data['ARO_description'] = data['ARO_description'].encode('utf-8')
                fasta_name_prefix = '.'.join([
                    card_record.CardRecord._ARO_name_to_fasta_name(
                        data['ARO_name']),
                    data['ARO_accession'],
                ])

                for card_key, gi, genbank_id, start, end, dna_seq, protein_seq in data[
                        'dna_seqs_and_ids']:
                    if dna_seq == '':
                        print('Empty dna sequence',
                              gene_key,
                              data['ARO_id'],
                              data['ARO_accession'],
                              sep='\t',
                              file=f_out_log)
                        continue

                    fasta_id = '.'.join([
                        fasta_name_prefix, genbank_id, start + '-' + end, card_key
                    ])
                    fasta = pyfastaq.sequences.Fasta(fasta_id, dna_seq)

                    if gi != 'NA':
                        # Only keep it if a gene can be made from the DNA and
                        # its translation matches the given protein sequence.
                        gene_tuple = fasta.make_into_gene()
                        if gene_tuple is None:
                            print('Could not make gene from sequence',
                                  fasta.id,
                                  sep='\t',
                                  file=f_out_log)
                            continue

                        translated = gene_tuple[0].translate()
                        start_codons = pyfastaq.genetic_codes.starts[
                            self.genetic_code]
                        if gene_tuple[0][:3] in start_codons:
                            translated.seq = 'M' + translated.seq[1:]

                        # Last translated character is dropped before comparing.
                        if translated.seq[:-1] != protein_seq:
                            print(
                                'Translation of inferred gene dna sequence does not match protein sequence',
                                fasta.id,
                                sep='\t',
                                file=f_out_log)
                            continue

                    print(fasta, file=f_out_fa)

                    has_snps = len(data['snps']) > 0
                    gene_or_not = '0' if gi == 'NA' else '1'
                    variant_only = '1' if (gi != 'NA' and has_snps) else '0'

                    print(fasta.id,
                          gene_or_not,
                          variant_only,
                          '.',
                          '.',
                          data['ARO_name'],
                          sep='\t',
                          file=f_out_tsv)

                    # Description lines: one generic line when there are no
                    # SNPs, otherwise one line per SNP.
                    if data['ARO_description'] != '':
                        if not has_snps:
                            print(fasta.id,
                                  gene_or_not,
                                  variant_only,
                                  '.',
                                  '.',
                                  data['ARO_description'],
                                  sep='\t',
                                  file=f_out_tsv)
                        else:
                            for snp in data['snps']:
                                print(fasta.id,
                                      gene_or_not,
                                      variant_only,
                                      snp,
                                      '.',
                                      data['ARO_description'],
                                      sep='\t',
                                      file=f_out_tsv)
        finally:
            # Runs on both success and failure: never leave the process in the
            # temporary directory or with output handles open.
            try:
                for handle in open_handles:
                    pyfastaq.utils.close(handle)
            finally:
                os.chdir(current_dir)

        if not self.debug:
            common.rmtree(tmpdir)

        print('Extracted data and written ARIBA input files\n')
        print('Finished. Final files are:',
              final_fasta,
              final_tsv,
              sep='\n\t',
              end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv,
              'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print(
            '"CARD 2020: antibiotic resistome surveillance with the comprehensive antibiotic resistance database", Alcock et al 2020, PMID: 31665441'
        )
        print('and in your methods say that version', self.version,
              'of the database was used')
Ejemplo n.º 52
0
def run_smalt(
      reads_fwd,
      reads_rev,
      ref_fa,
      out_prefix,
      index_k=9,
      index_s=2,
      threads=1,
      max_insert=1000,
      minid=0.9,
      sort=False,
      extra_smalt_map_ops='-x',
      samtools='samtools',
      smalt='smalt',
      verbose=False
    ):
    """Map reads to ``ref_fa`` with smalt and produce ``out_prefix + '.bam'``.

    Steps: build a smalt index at ``out_prefix + '.map_index'``, run
    ``smalt map`` piped into ``samtools view`` to get a BAM, optionally sort
    and index that BAM, then delete the smalt index files. With ``sort``
    enabled the mapping output goes to ``out_prefix + '.unsorted.bam'``
    first; that file is not removed here.

    ``reads_rev=None`` selects unpaired mapping (no ``-i`` option, single
    reads file). ``extra_smalt_map_ops=None`` means no extra map options.
    ``verbose`` is forwarded to every ``common.syscall`` call.
    """
    if extra_smalt_map_ops is None:
        extra_smalt_map_ops = ''

    map_index = out_prefix + '.map_index'
    clean_files = [map_index + '.smi', map_index + '.sma']

    index_words = [smalt, 'index']
    index_words += ['-k', str(index_k)]
    index_words += ['-s', str(index_s)]
    index_words += [map_index, ref_fa]
    index_cmd = ' '.join(index_words)

    # The map command is assembled from raw text fragments (not
    # space-joined words) to keep its exact spacing.
    fragments = [smalt, ' map ', extra_smalt_map_ops, ' ']

    # depending on OS, -n can break smalt, so only use -n if it's > 1.
    if threads > 1:
        fragments.extend(['-n ', str(threads), ' -O '])

    identity_opts = ['-y', str(minid)]
    if reads_rev is None:
        map_words = identity_opts + [map_index, reads_fwd]
    else:
        map_words = ['-i', str(max_insert)] + identity_opts + [map_index, reads_fwd, reads_rev]
    fragments.append(' '.join(map_words))

    fragments.extend([' | ', samtools, ' view'])

    final_bam = out_prefix + '.bam'
    if sort:
        intermediate_bam = out_prefix + '.unsorted.bam'
    else:
        intermediate_bam = final_bam

    fragments.extend([' -bS -T ', ref_fa, '  - > ', intermediate_bam])
    map_cmd = ''.join(fragments)

    for shell_cmd in (index_cmd, map_cmd):
        common.syscall(shell_cmd, verbose=verbose)

    if sort:
        # Sorting uses at most 4 threads; each gets 500/threads as its -m value (in M).
        threads = min(4, threads)
        thread_mem = int(500 / threads)
        sort_cmd = ''.join([
            samtools, ' sort -@', str(threads),
            ' -m ', str(thread_mem), 'M ',
            intermediate_bam, ' ', out_prefix,
        ])
        bam_index_cmd = samtools + ' index ' + final_bam
        for shell_cmd in (sort_cmd, bam_index_cmd):
            common.syscall(shell_cmd, verbose=verbose)

    for fname in clean_files:
        os.unlink(fname)