Esempio n. 1
0
def run(options):
    '''Build a RefPreparer from the parsed options and run it.

    options is any object exposing the attributes read below (presumably
    an argparse namespace -- confirm against the caller).

    Exits through sys.exit() with an explanatory message when cd-hit is
    disabled (no_cdhit) and a clusters file (cdhit_clusters) is supplied
    at the same time. Otherwise the preparer writes to options.outdir.
    '''
    if options.no_cdhit:
        if options.cdhit_clusters is not None:
            sys.exit('Cannot use both --no_cdhit and --cdhit_clusters. Neither or exactly one of those options must be used')

    programs, report_lines = versions.get_all_versions(using_spades=False)
    if options.verbose:
        print(*report_lines, sep='\n')

    # Keyword settings forwarded unchanged to the RefPreparer constructor.
    preparer_settings = dict(
        metadata_tsv_files=options.tsv_files,
        all_coding=options.all_coding,
        version_report_lines=report_lines,
        min_gene_length=options.min_gene_length,
        max_gene_length=options.max_gene_length,
        genetic_code=options.genetic_code,
        cdhit_min_id=options.cdhit_min_id,
        cdhit_min_length=options.cdhit_min_length,
        cdhit_max_memory=options.cdhit_max_memory,
        run_cdhit=not options.no_cdhit,
        clusters_file=options.cdhit_clusters,
        threads=options.threads,
        verbose=options.verbose,
        force=options.force,
    )

    ref_preparer.RefPreparer(
        options.fasta_files,
        programs,
        **preparer_settings
    ).run(options.outdir)
Esempio n. 2
0
    def test_run_all_noncoding(self):
        '''test run with no metadata input, all sequences are noncoding'''
        fasta_in = [
            os.path.join(data_dir, 'ref_preparer_test_run.in.1.fa'),
            os.path.join(data_dir, 'ref_preparer_test_run.in.2.fa'),
            os.path.join(data_dir, 'ref_preparer_test_run.in.3.fa'),
        ]

        extern_progs = external_progs.ExternalProgs()
        refprep = ref_preparer.RefPreparer(fasta_in, extern_progs, all_coding='no', genetic_code=1)
        tmp_out = 'tmp.ref_preparer_test_run'
        expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_all_noncoding.out')

        # Files that must be byte-identical to the stored expected output.
        test_files = [
            '00.auto_metadata.tsv',
            '01.filter.check_metadata.tsv',
            '01.filter.check_genes.log',
            '01.filter.check_noncoding.log',
            '01.filter.check_metadata.log',
            '02.cdhit.all.fa',
            '02.cdhit.clusters.tsv',
            '02.cdhit.gene.fa',
            '02.cdhit.gene.varonly.fa',
            '02.cdhit.noncoding.fa',
            '02.cdhit.noncoding.varonly.fa',
        ]

        try:
            refprep.run(tmp_out)

            for filename in test_files:
                expected = os.path.join(expected_outdir, filename)
                got = os.path.join(tmp_out, filename)
                self.assertTrue(
                    filecmp.cmp(expected, got, shallow=False),
                    msg='Output file differs from expected: ' + filename,
                )
        finally:
            # Remove the output directory even when the run or a comparison
            # fails, so a failure does not leave a stale directory behind
            # for later test runs.
            if os.path.exists(tmp_out):
                common.rmtree(tmp_out)
Esempio n. 3
0
    def test_run(self):
        '''test run'''
        fasta_in = [
            os.path.join(data_dir, 'ref_preparer_test_run.in.1.fa'),
            os.path.join(data_dir, 'ref_preparer_test_run.in.2.fa'),
            os.path.join(data_dir, 'ref_preparer_test_run.in.3.fa'),
        ]
        tsv_in = [
            os.path.join(data_dir, 'ref_preparer_test_run.in.1.tsv'),
            os.path.join(data_dir, 'ref_preparer_test_run.in.2.tsv'),
        ]

        extern_progs = external_progs.ExternalProgs()
        refprep = ref_preparer.RefPreparer(fasta_in,
                                           extern_progs,
                                           metadata_tsv_files=tsv_in,
                                           genetic_code=1)
        tmp_out = 'tmp.ref_preparer_test_run'
        expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run.out')

        # Files that must be byte-identical to the stored expected output.
        test_files = [
            '01.filter.check_metadata.tsv',
            '01.filter.check_genes.log',
            '01.filter.check_metadata.log',
            '02.cdhit.all.fa',
            '02.cdhit.clusters.tsv',
            '02.cdhit.gene.fa',
            '02.cdhit.gene.varonly.fa',
            '02.cdhit.noncoding.fa',
            '02.cdhit.noncoding.varonly.fa',
        ]

        try:
            refprep.run(tmp_out)

            for filename in test_files:
                expected = os.path.join(expected_outdir, filename)
                got = os.path.join(tmp_out, filename)
                self.assertTrue(
                    filecmp.cmp(expected, got, shallow=False),
                    msg='Output file differs from expected: ' + filename,
                )
        finally:
            # Remove the output directory even when the run or a comparison
            # fails, so a failure does not leave a stale directory behind
            # for later test runs.
            if os.path.exists(tmp_out):
                shutil.rmtree(tmp_out)
Esempio n. 4
0
def make_prepareref_dir(outdir):
    '''Create a prepared reference directory at outdir.

    Input files are generated in a scratch directory by
    make_prepareref_files(), then processed by ref_preparer.RefPreparer
    with cd-hit disabled and a single thread. Afterwards a
    00.params.json file containing {'tb': True} is written into outdir.

    Args:
        outdir: path of the directory to produce; it must not exist yet.

    Raises:
        Exception: if outdir already exists.
    '''
    if os.path.exists(outdir):
        raise Exception('Output directory ' + outdir +
                        ' already exists. Cannot continue')

    tmpdir = tempfile.mkdtemp(prefix=outdir + '.tmp', dir=os.getcwd())
    try:
        tmp_prefix = os.path.join(tmpdir, 'out')
        make_prepareref_files(tmp_prefix)
        ref_prep = ref_preparer.RefPreparer(
            [tmp_prefix + '.fa'],
            None,
            metadata_tsv_files=[tmp_prefix + '.tsv'],
            run_cdhit=False,
            threads=1,
        )
        # Presumably this call is what creates outdir -- the JSON file
        # below is written into it without any mkdir here.
        ref_prep.run(outdir)
    finally:
        # Remove the scratch directory even if generating the input files
        # or running the preparer fails, so a failed call leaves no
        # temporary directory behind.
        common.rmtree(tmpdir)

    json_data = {'tb': True}
    json_file = os.path.join(outdir, '00.params.json')
    with open(json_file, 'w') as f:
        json.dump(json_data, f)
Esempio n. 5
0
    def run(self):
        '''Download pubmlst data for self.species and prepare a db directory.

        Steps, in order:
          1. create self.outdir;
          2. fetch the species files into self.mlst_download_dir;
          3. load profile.txt from that directory into self.profile;
          4. load the fasta files and write the clusters file;
          5. run ref_preparer.RefPreparer into self.prepareref_dir;
          6. copy the profile file there as pubmlst.profile.txt.

        Raises:
            Error: if self.outdir cannot be created.
        '''
        try:
            os.mkdir(self.outdir)
        except OSError as err:
            # Only filesystem failures are translated; the original error
            # is chained so the underlying reason (exists, permissions,
            # ...) stays visible in the traceback.
            raise Error('Error making output directory ' + self.outdir) from err

        pubmlst = pubmlst_getter.PubmlstGetter(debug=self.debug,
                                               verbose=self.verbose)
        pubmlst.get_species_files(self.species, self.mlst_download_dir)
        if self.verbose:
            print('Downloaded data from pubmlst')

        profile_file = os.path.join(self.mlst_download_dir, 'profile.txt')
        self.profile = mlst_profile.MlstProfile(profile_file,
                                                duplicate_warnings=True)
        if self.verbose:
            print('Loaded mlst profile file', profile_file)

        self._load_fasta_files_and_write_clusters_file(self.mlst_download_dir)
        if self.verbose:
            print('Loaded fasta files and wrote clusters file')
            print('Putting data in ariba db directory', self.prepareref_dir)

        refprep = ref_preparer.RefPreparer(
            self.fasta_files,
            self.extern_progs,
            all_coding='no',
            clusters_file=self.clusters_file,
            verbose=self.verbose,
        )
        refprep.run(self.prepareref_dir)
        shutil.copy(profile_file,
                    os.path.join(self.prepareref_dir, 'pubmlst.profile.txt'))

        # Usage hint is printed regardless of self.verbose.
        print('ariba db directory prepared. You can use it like this:')
        print('ariba run', self.prepareref_dir,
              'reads_1.fq reads_2.fq output_directory')
Esempio n. 6
0
    def test_run_noncoding_checks(self):
        '''test run with noncoding sequences that are outside of the allowed size range'''
        fasta_in = [
            os.path.join(data_dir, 'ref_preparer_test_run.in.4.fa')
        ]
        tsv_in = [
            os.path.join(data_dir, 'ref_preparer_test_run.in.4.tsv')
        ]

        extern_progs = external_progs.ExternalProgs()
        refprep = ref_preparer.RefPreparer(
            fasta_in, extern_progs, min_noncoding_length=6, max_noncoding_length=20,
            metadata_tsv_files=tsv_in, genetic_code=1)
        tmp_out = 'tmp.ref_preparer_test_run_noncoding_checks'
        expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_noncoding_checks.out')

        # Files that must be byte-identical to the stored expected output.
        test_files = [
            '01.filter.check_metadata.tsv',
            '01.filter.check_genes.log',
            '01.filter.check_noncoding.log',
            '01.filter.check_metadata.log',
            '02.cdhit.all.fa',
            '02.cdhit.clusters.tsv',
            '02.cdhit.gene.fa',
            '02.cdhit.gene.varonly.fa',
            '02.cdhit.noncoding.fa',
            '02.cdhit.noncoding.varonly.fa',
        ]

        try:
            refprep.run(tmp_out)

            for filename in test_files:
                expected = os.path.join(expected_outdir, filename)
                got = os.path.join(tmp_out, filename)
                self.assertTrue(
                    filecmp.cmp(expected, got, shallow=False),
                    msg='Output file differs from expected: ' + filename,
                )
        finally:
            # Remove the output directory even when the run or a comparison
            # fails, so a failure does not leave a stale directory behind
            # for later test runs.
            if os.path.exists(tmp_out):
                common.rmtree(tmp_out)