コード例 #1
0
ファイル: test_filterlfn.py プロジェクト: aitgon/vtam
    def setUpClass(cls):
        """Install the package and fetch the sorted-reads test dataset.

        Installs the current project with pip so the ``vtam`` command is
        available, recreates a clean ``outdir/data`` tree under the test
        directory, then downloads and extracts ``sorted.tar.gz`` from the
        first reachable mirror URL.
        """

        cmd = '{} -m pip install . -q --upgrade --use-feature=in-tree-build'.format(
            sys.executable)
        if sys.platform.startswith("win"):
            args = cmd  # Windows: subprocess takes the raw command string
        else:
            args = shlex.split(cmd)
        # check=True so a broken install aborts the tests instead of silently
        # running them against a stale version (consistent with
        # pip_install_vtam_for_tests)
        subprocess.run(args=args, check=True,
                       cwd=PathManager.get_project_path())

        cls.package_path = os.path.join(PathManager.get_package_path())
        cls.test_path = os.path.join(PathManager.get_test_path())
        cls.outdir_path = os.path.join(cls.test_path, 'outdir')
        cls.outdir_data_path = os.path.join(cls.outdir_path, 'data')
        # Start from a clean output directory
        shutil.rmtree(cls.outdir_path, ignore_errors=True)
        pathlib.Path(cls.outdir_data_path).mkdir(parents=True, exist_ok=True)

        ############################################################################################
        #
        # Download sorted reads dataset
        #
        ############################################################################################

        sorted_tar_path = os.path.join(cls.outdir_data_path, "sorted.tar.gz")
        pathlib.Path(os.path.dirname(sorted_tar_path)).mkdir(parents=True,
                                                             exist_ok=True)
        # Reuse a local archive if it looks complete (>1MB); otherwise try the
        # mirror URLs in order (replaces the previous nested try/excepts)
        if not os.path.isfile(sorted_tar_path) or pathlib.Path(
                sorted_tar_path).stat().st_size < 1000000:
            urls = (sorted_tar_gz_url1, sorted_tar_gz_url2, sorted_tar_gz_url3)
            for url_index, url in enumerate(urls):
                try:
                    # NOTE(review): tqdm(...) looks like elided arguments in
                    # this snippet — confirm against the upstream source
                    with tqdm(...) as t:
                        t.set_description(os.path.basename(sorted_tar_path))
                        urllib.request.urlretrieve(url, sorted_tar_path,
                                                   reporthook=tqdm_hook(t))
                    break  # download succeeded
                except Exception:
                    if url_index == len(urls) - 1:
                        raise  # all mirrors failed; surface the last error
        tar = tarfile.open(sorted_tar_path, "r:gz")
        tar.extractall(path=cls.outdir_data_path)
        tar.close()
コード例 #2
0
    def setUpClass(cls):
        """Prepare the test environment and fetch the sorted-reads dataset.

        Ensures the ``vtam`` command is installed, recreates a clean
        ``outdir/data`` tree under the test directory, then downloads and
        extracts ``sorted.tar.gz`` from the first reachable mirror URL.
        """

        ########################################################################
        #
        # These tests need the vtam command in the path
        #
        ########################################################################

        pip_install_vtam_for_tests()

        cls.package_path = os.path.join(PathManager.get_package_path())
        cls.test_path = os.path.join(PathManager.get_test_path())
        cls.outdir_path = os.path.join(cls.test_path, 'outdir')
        cls.outdir_data_path = os.path.join(cls.outdir_path, 'data')
        # Start from a clean output directory
        shutil.rmtree(cls.outdir_path, ignore_errors=True)
        pathlib.Path(cls.outdir_data_path).mkdir(parents=True, exist_ok=True)

        ############################################################################################
        #
        # Download sorted reads dataset (Updated Oct 10, 2020)
        #
        ############################################################################################

        sorted_tar_path = os.path.join(cls.outdir_data_path, "sorted.tar.gz")
        # Reuse a local archive if it looks complete (>1MB); otherwise try the
        # mirror URLs in order (replaces the previous nested try/excepts)
        if not os.path.isfile(sorted_tar_path) or pathlib.Path(
                sorted_tar_path).stat().st_size < 1000000:
            urls = (sorted_tar_gz_url1, sorted_tar_gz_url2, sorted_tar_gz_url3)
            for url_index, url in enumerate(urls):
                try:
                    # NOTE(review): tqdm(...) looks like elided arguments in
                    # this snippet — confirm against the upstream source
                    with tqdm(...) as t:
                        t.set_description(os.path.basename(sorted_tar_path))
                        urllib.request.urlretrieve(url, sorted_tar_path,
                                                   reporthook=tqdm_hook(t))
                    break  # download succeeded
                except Exception:
                    if url_index == len(urls) - 1:
                        raise  # all mirrors failed; surface the last error
        tar = tarfile.open(sorted_tar_path, "r:gz")
        tar.extractall(path=cls.outdir_data_path)
        tar.close()
コード例 #3
0
    def setUp(self):
        """Prepare path fixtures used by the merge-command tests.

        All paths are stored relative to the package root because the tests
        compose CLI commands from there.
        """
        self.package_path = PathManager.get_package_path()

        # The original also fetched PathManager.get_test_path() into an
        # unused local; dropped.
        self.foopaths = {
            'foofile': os.path.relpath(__file__, self.package_path),
            'foodir': os.path.relpath(os.path.dirname(__file__),
                                      self.package_path),
            'outdir': 'tests/output',
            'sortedinfo_tsv': "data/example/sortedinfo_mfzr.tsv",
            'tsv_path': "data/example/sortedinfo_mfzr.tsv",
            'known_occurrences': "data/example/known_occurrences.tsv",
        }
コード例 #4
0
    def setUp(self):
        """Build an ASV-table fixture, write it to fasta and cluster it with vsearch."""
        # Tab-separated ASV table: two markers (MFZR, ZFZR), three samples,
        # used as the raw fixture for the clustering test below
        asv_table_str = """variant_id	marker_name	run_name	sequence_length	read_count	sample1	sample2	sample3	chimera_borderline	sequence
3	MFZR	prerun	176	9713	9712	1	0	FALSE	TCTATATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCTCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
33	MFZR	prerun	174	9713	9703	10	0	FALSE	CTATATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCTCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
333	ZFZR	prerun	157	10000	9900	10	0	FALSE	TGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCTCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
836	MFZR	prerun	176	11588	123	56	0	FALSE	TCTATATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGGACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCCCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
8368	ZFZR	prerun	157	545	500	0	45	FALSE	TGCTTGGGCAGGTATGGTAGGGACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCCCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
83683	MFZR	prerun	175	484	0	28	456	FALSE	TCTAAATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGGACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCCCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
"""
        asv_table_df = pandas.read_csv(io.StringIO(asv_table_str),
                                       sep="\t",
                                       header=0)
        self.asv_table_df = asv_table_df
        # Create this_tempdir (per-module scratch space)
        this_tempdir = os.path.join(PathManager.instance().get_tempdir(),
                                    os.path.basename(__file__))
        pathlib.Path(this_tempdir).mkdir(exist_ok=True)
        # Define fasta_path tsv_path
        fasta_path = os.path.join(PathManager.instance().get_tempdir(),
                                  os.path.basename(__file__), 'variants.fa')
        # Create variant variant_read_count_input_df
        variant_df = asv_table_df[['variant_id', 'sequence', 'read_count'
                                   ]].drop_duplicates(inplace=False)
        # Rename to the (id, sequence, size) layout expected by DataframeVariant
        variant_df.columns = ['id', 'sequence', 'size']
        # Create fasta_path file from asv_table_df
        variant_df_utils = DataframeVariant(variant_df)
        variant_df_utils.to_fasta(fasta_path, add_column='size')
        # Define vsearch output tsv_path
        vsearch_output_path = os.path.join(
            PathManager.instance().get_tempdir(), os.path.basename(__file__),
            'centroid_out.fa')
        # Define cluster output tsv_path
        vsearch_cluster_output_path = os.path.join(
            PathManager.instance().get_tempdir(), os.path.basename(__file__),
            'cluster.fa')
        #
        # Create object and run_name vsearch
        # Pin vsearch to a single thread for this fixture
        os.environ["VTAM_THREADS"] = "1"
        vsearch_parameters = {
            '--cluster_size': fasta_path,
            '--clusters': vsearch_cluster_output_path,
            '--id': 1,
            '--sizein': None,  # flag-style option: present, takes no value
            '--centroids': vsearch_output_path,
            "--threads": int(os.getenv('VTAM_THREADS')),
        }
        vsearch_cluster = RunnerVSearch(parameters=vsearch_parameters)
        vsearch_cluster.run()
コード例 #5
0
ファイル: CommandExample.py プロジェクト: ulysse06/vtam
    def main(outdir):
        """Download the tutorial fastq dataset into *outdir* and lay out the
        example project tree with snakemake.

        :param outdir: output directory; created if missing
        """

        package_path = PathManager.get_package_path()
        pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

        #######################################################################
        #
        # Download fastq
        #
        #######################################################################

        fastq_tar_path = os.path.join(outdir, "fastq.tar.gz")
        # Reuse a local archive if it looks complete (>1MB); otherwise try the
        # mirror URLs in order (replaces the previous nested try/excepts)
        if not os.path.isfile(fastq_tar_path) or pathlib.Path(fastq_tar_path).stat().st_size < 1000000:
            urls = (fastq_tar_gz_url1, fastq_tar_gz_url2, fastq_tar_gz_url3)
            for url_index, url in enumerate(urls):
                try:
                    # NOTE(review): tqdm(...) looks like elided arguments in
                    # this snippet — confirm against the upstream source
                    with tqdm(...) as t:
                        t.set_description(os.path.basename(fastq_tar_path))
                        urllib.request.urlretrieve(url, fastq_tar_path, reporthook=tqdm_hook(t))
                    break  # download succeeded
                except Exception:
                    if url_index == len(urls) - 1:
                        raise  # all mirrors failed; surface the last error
        tar = tarfile.open(fastq_tar_path, "r:gz")
        tar.extractall(path=outdir)
        tar.close()

        os.remove(fastq_tar_path)

        #######################################################################
        #
        # Set command args
        #
        #######################################################################

        # Renamed from 'args': the original reused one name for this format
        # mapping and for the subprocess argv below
        format_args = {}
        format_args['package_path'] = package_path
        format_args['snake_tuto_data'] = os.path.join(package_path, "data/snake.tuto.data.yml")

        #######################################################################
        #
        # Copy data to directory tree
        #
        #######################################################################

        cmd = "snakemake --cores 1 -s {snake_tuto_data} --config MARKER=mfzr " \
              "PROJECT=asper1 PACKAGE_PATH={package_path} --until all_one_marker".format(**format_args)

        if sys.platform.startswith("win"):
            run_args = cmd  # Windows: subprocess takes the raw command string
        else:
            run_args = shlex.split(cmd)
        subprocess.run(args=run_args, check=True, cwd=outdir)
コード例 #6
0
ファイル: test_cmd_example.py プロジェクト: aitgon/vtam
    def setUp(self):
        """Ensure the vtam command is installed and the output dir exists."""
        pip_install_vtam_for_tests()

        self.test_path = PathManager.get_test_path()
        self.outdir_path = os.path.join(self.test_path, 'outdir')
        pathlib.Path(self.outdir_path).mkdir(parents=True, exist_ok=True)
コード例 #7
0
 def setUpClass(cls):
     """Resolve the FilesInputCutadapt fixture paths once for the class."""
     cls.test_path = PathManager.get_test_path()
     cls.tags_file_path = os.path.join(
         cls.test_path, "test_files", "FilesInputCutadapt")
     cls.fastainfo = os.path.join(cls.tags_file_path, "fastainfo.tsv")
     cls.fastainfoNoDuplicates = os.path.join(
         cls.tags_file_path, "fastainfoNoDuplicates.tsv")
     cls.mergedFasta1 = "14Ben01_1_fw_48.fasta"
コード例 #8
0
    def setUp(self):
        """>parent1;size=650
TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATAATTAGTTGG
>parent2;size=700
AACTATGTACACAAATTTTAGTATATTGGCAGGGATAGTAGGAACTTTACTATCGTTAGTTATCAGAATGGAATTATCAA
CAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG
>Chimera1;size=50
TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG
>Chimera2;size=300
TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTG
CAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG
>Chimera3;size=50
TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG"""
        # The bare string below records a reference vsearch --uchime_denovo
        # run on the fasta above; it is kept (unassigned) as documentation of
        # the expected chimera-detection output
        """(vtam_appli) gonzalez@milan:~/tmp/vsearch_uchime$ vsearch --uchime_denovo i.fa --borderline borderline.fa --nonchimeras nonchimeras.fa --chimeras chimeras.fa
vsearch v2.7.0_linux_x86_64, 15.5GB RAM, 8 cores
https://github.com/torognes/vsearch

Reading file i.fa 100%
1500 nt in 5 seqs, min 300, max 300, avg 300
Masking 100%
Sorting by abundance 100%
Counting k-mers 100%
Detecting chimeras 100%
Found 2 (40.0%) chimeras, 2 (40.0%) non-chimeras,
and 1 (20.0%) borderline sequences in 5 unique sequences.
Taking abundance information into account, this corresponds to
350 (20.0%) chimeras, 1350 (77.1%) non-chimeras,
and 50 (2.9%) borderline sequences in 1750 total sequences"""

        # Input from min_replicate_number
        # Variants 1 and 2 are ok but 3-5 are chimeras
        self.variant_df = pandas.DataFrame(
            data={
                'sequence': [
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATAATTAGTTGG',
                    'AACTATGTACACAAATTTTAGTATATTGGCAGGGATAGTAGGAACTTTACTATCGTTAGTTATCAGAATGGAATTATCAACAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                ],
            },
            # variant ids 1..5
            index=list(range(1, 6)),
        )
        #
        # One row per variant, all in the same run/marker/sample/replicate;
        # read counts mirror the ;size= annotations in the fasta above
        self.variant_read_count_df = pandas.DataFrame({
            'run_id': [1] * 5,
            'marker_id': [1] * 5,
            'sample_id': [1] * 5,
            'replicate': [1] * 5,
            'variant_id':
            list(range(1, 6)),
            'read_count': [650, 700, 50, 350, 50],
        })
        # Per-module scratch directory under the shared temp dir
        self.this_tempdir = os.path.join(PathManager.instance().get_tempdir(),
                                         os.path.basename(__file__))
        pathlib.Path(self.this_tempdir).mkdir(parents=True, exist_ok=True)
        # Let vsearch use all available cores for this test
        os.environ['VTAM_THREADS'] = str(multiprocessing.cpu_count())
コード例 #9
0
    def setUpClass(cls):
        """Set up shared path fixtures and cutoff constants for the class."""

        cls.package_path = PathManager.get_package_path()
        cls.test_path = PathManager.get_test_path()

        # Paths are relative to the package root
        cls.foopaths = {
            'foofile': os.path.relpath(__file__, cls.package_path),
            'foodir': os.path.relpath(os.path.dirname(__file__),
                                      cls.package_path),
            'sorteddir': 'output',
            'sortedinfo_tsv': "data/example/sortedinfo_mfzr.tsv",
            'optimize_lfn_variant_specific':
                "tests/test_files_dryad.f40v5_small/run1_mfzr_zfzr/optimize_lfn_variant_specific.tsv",
        }

        cls.minseqlength_value_32 = 32
        cls.minseqlength_value_40 = 40
        cls.lfn_variant_replicate_cutoff = 0.002
コード例 #10
0
def pip_install_vtam_for_tests():
    """This function is used in the tests when the vtam command is run"""

    cmd = '{} -m pip install . -q --upgrade'.format(sys.executable)
    # Windows expects the raw command string, POSIX an argv list
    args = cmd if sys.platform.startswith("win") else shlex.split(cmd)
    # check=True: abort the tests if the install fails
    subprocess.run(args=args, check=True,
                   cwd=PathManager.instance().get_project_path())
コード例 #11
0
ファイル: CommandTaxonomy.py プロジェクト: ulysse06/vtam
    def __init__(self, taxonomy_tsv=None):
        """

        :param taxonomy_tsv: Path to the taxonomy_tsv. Default None
        :type taxonomy_tsv: str

        :rtype: None
        """

        if taxonomy_tsv is None:  # If None, download to current wdir
            self.taxonomy_tsv_path = os.path.join(os.getcwd(), "taxonomy.tsv")
        else:  # Download to tsv_path
            self.taxonomy_tsv_path = taxonomy_tsv

        # Bug fix: use the resolved path. The original called
        # os.path.dirname(taxonomy_tsv), which raised TypeError whenever
        # taxonomy_tsv was None (the documented default).
        pathlib.Path(os.path.dirname(self.taxonomy_tsv_path)).mkdir(
            parents=True, exist_ok=True)

        self.tempdir = PathManager.instance().get_tempdir()

        # Precomputed taxonomy archive shipped next to the package
        package_path = os.path.join(PathManager.get_package_path())
        self.taxonomy_tsv_gz_path = os.path.join(package_path, "..", "data",
                                                 "taxonomy.tsv.gz")
コード例 #12
0
    def setUp(self):
        """Load LTG-selection fixtures and build the taxonomy dataframe."""

        test_path = os.path.join(PathManager.get_test_path())
        self.variantid_identity_lineage_df = pandas.read_csv(
            os.path.join(test_path, "test_runner_ltg_selection",
                         "variantid_identity_lineage.tsv"),
            sep="\t", header=0)
        self.ltg_bak_df = pandas.read_csv(
            os.path.join(test_path, "test_runner_ltg_selection",
                         "ltg_bak.tsv"), sep="\t")

        # Download the precomputed taxonomy into the test output directory
        self.outdir_path = os.path.join(test_path, "outdir")
        pathlib.Path(self.outdir_path).mkdir(exist_ok=True, parents=True)
        taxonomy_tsv_path = os.path.join(self.outdir_path, "taxonomy.tsv")
        CommandTaxonomy(
            taxonomy_tsv=taxonomy_tsv_path).download_precomputed_taxonomy()

        # The previous read_csv/drop_duplicates construction of taxonomy_df
        # was dead code: the attribute was immediately overwritten with
        # taxonomy.df. Build it once from the Taxonomy object.
        taxonomy = Taxonomy(taxonomy_tsv_path)
        self.taxonomy_df = taxonomy.df
コード例 #13
0
    def test_wopmars_runner_filter_with_cutoff_specific(self):
        """The generated Wopfile must propagate --cutoff_specific to the
        FilterLFN rule parameters."""

        cmd = 'filter --sortedinfo {sortedinfo_tsv} --sorteddir {foodir} --asvtable asvtableoutput.tsv' \
                   ' --cutoff_specific {optimize_lfn_variant_specific}'.format(**self.foopaths)

        # foopaths are relative to the package root, so parse the CLI from
        # there; try/finally guarantees the cwd is restored even if parsing
        # fails (the original would leak the chdir)
        cwd = os.getcwd()
        os.chdir(self.package_path)
        try:
            args = ArgParser.get_main_arg_parser().parse_args(cmd.split(" "))
        finally:
            os.chdir(cwd)

        wopmars_runner = RunnerWopmars(command='filter',
                                       cli_args_dic=vars(args))
        wopfile_path = os.path.relpath(
            os.path.join(PathManager.get_package_path(),
                         "tests/output/wopfile"),
            PathManager.get_package_path())
        wopfile_path, wopfile_content = wopmars_runner.create_wopfile(
            path=wopfile_path)

        # assertEqual reports both values on failure, unlike assertTrue(==)
        self.assertEqual(
            yaml.load(wopfile_content, Loader=yaml.SafeLoader)
            ['rule FilterLFN']['params']['lfn_variant_specific_cutoff'],
            self.foopaths['optimize_lfn_variant_specific'])
コード例 #14
0
    def __init__(self, variant_fasta, blast_db_dir, blast_db_name, num_threads,
                 qcov_hsp_perc):
        """Store the BLAST run settings and create this runner's temp dir."""

        self.variant_fasta = variant_fasta
        self.blast_db_dir = blast_db_dir
        self.blast_db_name = blast_db_name
        self.num_threads = num_threads
        self.qcov_hsp_perc = qcov_hsp_perc

        # Per-module scratch space under the shared temp directory
        tempdir = PathManager.instance().get_tempdir()
        self.this_temp_dir = os.path.join(tempdir, os.path.basename(__file__))
        pathlib.Path(self.this_temp_dir).mkdir(parents=True, exist_ok=True)
コード例 #15
0
    def __init__(self, variant_expected_df, variant_unexpected_df,
                 variant_read_count_df):
        """
        Initiates object for the PCR error filter

        :param variant_expected_df: DataFrame (id, sequence) with expected variants
        :param variant_unexpected_df: DataFrame (id, sequence) with unexpected variants
        :param variant_read_count_df: DataFrame (run_id, marker_id, sample_id, replicate, variant_id, read_count)
        """
        self.__variant_expected_df = variant_expected_df
        self.__variant_unexpected_df = variant_unexpected_df
        self.__variant_read_count_df = variant_read_count_df
        # Class-named scratch directory under the shared temp dir
        tmp_root = PathManager.instance().get_tempdir()
        self.__tmp_dir = os.path.join(tmp_root, self.__class__.__name__)
        pathlib.Path(self.__tmp_dir).mkdir(parents=True, exist_ok=True)
コード例 #16
0
ファイル: OptimizePCRerror.py プロジェクト: ulysse06/vtam
    def run(self):
        """Wopmars rule body: compute the PCR-error optimization.

        Reads the known occurrences and sorted-reads sample information from
        the wrapper inputs, builds the read-count dataframe from the database
        and writes the optimization result to the wrapper output TSV.
        """
        session = self.session
        # NOTE(review): reaches into the private SQLAlchemy session for the
        # engine — confirm this survives wopmars/SQLAlchemy upgrades
        engine = session._session().get_bind()

        # Per-module scratch directory
        # NOTE(review): this_temp_dir is created but not used below — confirm
        this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                     os.path.basename(__file__))
        pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

        ############################################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        ############################################################################################

        # Input file paths
        known_occurrences_tsv = self.input_file(
            OptimizePCRerror.__input_file_known_occurrences)
        fasta_info_tsv = self.input_file(
            OptimizePCRerror.__input_file_sortedinfo)
        #
        # Output file paths
        output_optimize_path = self.output_file(
            OptimizePCRerror.__output_file_optimize_pcr_error)

        ############################################################################################
        #
        # Get nijk_df, known_occurrences_df
        #
        ############################################################################################

        # nijk_df: variant read counts pulled from the VariantReadCount table
        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            VariantReadCount, engine=engine)

        # Map the known-occurrences TSV rows onto database identifiers
        known_occurrences_df = FileKnownOccurrences(
            known_occurrences_tsv).to_identifier_df(engine)

        ############################################################################################
        #
        # Run optimizer and Write
        #
        ############################################################################################

        optimize_pcr_error_runner = RunnerOptimizePCRerror(
            variant_read_count_df=variant_read_count_df,
            known_occurrences_df=known_occurrences_df)
        optimize_pcr_error_runner.to_tsv(optimize_path=output_optimize_path,
                                         engine=engine)
コード例 #17
0
ファイル: SequenceClusterer.py プロジェクト: aitgon/vtam
    def compute_clusters(self):
        """Cluster the variants with vsearch and return per-variant cluster info.

        Writes the variants as a size-annotated fasta, runs
        ``vsearch --cluster_size`` on it and reshapes the resulting OTU table.

        :return: DataFrame with columns clusterid, variant_id, clustersize
        """

        tempcluster_dir = PathManager.instance().get_tempdir()

        # One ">id;size=count" record per variant; vsearch reads the size
        # annotation for abundance-sorted clustering
        i_fas = os.path.join(tempcluster_dir, 'cluster_input.fas')
        with open(i_fas, 'w') as fout:
            for idx, row in self.variant_info_df.iterrows():
                fout.write(">{};size={}\n{}\n".format(
                    row.variant_id, row.read_count, row.sequence))

        cmd = "vsearch --cluster_size cluster_input.fas --id {} --otutabout otutabout.txt --clusters test".format(
            self.cluster_identity)
        if sys.platform.startswith("win"):
            args = cmd  # Windows: subprocess takes the raw command string
        else:
            args = shlex.split(cmd)
        # check=True: if vsearch fails, otutabout.txt is missing and the
        # read_csv below would raise a confusing FileNotFoundError instead
        subprocess.run(args=args, cwd=tempcluster_dir, check=True)

        otutabout_path = os.path.join(tempcluster_dir, "otutabout.txt")
        otutabout_df = pandas.read_csv(otutabout_path, sep="\t")
        otutabout_df.rename({'#OTU ID': 'centroid'}, axis=1, inplace=True)

        # Wide table (one column per variant) -> long (variant_id, read_count)
        otutabout_long_df = pandas.melt(otutabout_df,
                                        id_vars=['centroid'],
                                        var_name='variant_id',
                                        value_name='read_count')
        otutabout_long_df.rename({'centroid': 'clusterid'},
                                 axis=1,
                                 inplace=True)
        # Keep only variants actually present in each cluster
        otutabout_long_df = otutabout_long_df.loc[
            otutabout_long_df.read_count > 0]
        otutabout_long_df.variant_id = otutabout_long_df.variant_id.astype(
            'int')

        # clustersize = number of variants per cluster, merged back onto the
        # (clusterid, variant_id) pairs
        cluster_count_df = otutabout_long_df[['clusterid', 'variant_id'
                                              ]].groupby('clusterid').count()
        cluster_count_df.rename({'variant_id': 'clustersize'},
                                axis=1,
                                inplace=True)
        cluster_count_df = otutabout_long_df[['clusterid', 'variant_id'
                                              ]].merge(cluster_count_df,
                                                       on='clusterid')

        return cluster_count_df
コード例 #18
0
    def __init__(self, asv_table_df, readcounts, run_marker_df=None):
        """
        Constructor of the CommandPoolRunMarkers class

        Parameters
        ----------
        asv_table_df : pandas dataframe
            ASV table.
        readcounts : bool
            Default false.
            If false, boolean 0/1 is given for presence or absence of variant in pooled table.
            If true, read integer is given with sum or reads in the pooled runs or markers.
        run_marker_df: pandas dataframe
            Optional (run_name, marker_name) table; when given, only those
            run/marker combinations of the ASV table are pooled.
        """
        # Fixed docstring: the parameter is named ``readcounts`` (the old
        # docstring documented a non-existent ``readcount``), and
        # ``run_marker_df`` is an input filter, not an output table.

        # Sanity check: the ASV table must contain at least these columns
        header = {
            'run_name', 'marker_name', 'variant_id', 'sequence_length',
            'read_count'
        }
        if not set(asv_table_df.columns
                   ) >= header:  # contains at least the 'header_lower' columns
            Logger.instance().error(
                VTAMexception(
                    "The ASV table structure is wrong. It is expected to contain these columns: "
                    "run_name, marker_name, variant_id, sequence_length, read_count"
                ))
            sys.exit(1)

        # Sample columns sit between the 5 leading metadata columns and the
        # 2 trailing ones — assumes the (chimera_borderline, sequence) layout
        # shown in the fixtures; TODO confirm for all callers
        self.sample_names = asv_table_df.columns.tolist()[5:-2]

        if run_marker_df is None:  # Default: pool all marker_name
            self.asv_table_df = asv_table_df
        else:  # if run_marker_df: pool only markers in this variant_read_count_input_df
            self.asv_table_df = asv_table_df.merge(
                run_marker_df, on=['run_name', 'marker_name'])

        # Per-module scratch directory under the shared temp dir
        self.tmp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                    os.path.basename(__file__))
        pathlib.Path(self.tmp_dir).mkdir(exist_ok=True)

        self.cluster_path = None  # returned by run_vsearch_to_cluster_sequences

        self.cluster_df = None  # returned by get_vsearch_clusters_to_df
        self.readcounts = readcounts  # returned by get_vsearch_clusters_to_df
コード例 #19
0
ファイル: RunnerWopmars.py プロジェクト: ulysse06/vtam
    def __init__(self, command, cli_args_dic):
        """

        :param command: takes one of two values: filter or optimize
        :param cli_args_dic: dictionnary (CLIargumentDict.instance()) with command
        """

        self.command = command

        # CLI arguments first, then the user's params YAML merged on top
        self.cli_args_and_numerical_params = dict(cli_args_dic)
        params_dic = FileParams(cli_args_dic['params']).get_params_dic()
        self.cli_args_and_numerical_params.update(params_dic)

        self.wopfile_path = None
        self.tempdir = PathManager.instance().get_tempdir()
コード例 #20
0
ファイル: test_cmd_taxassign.py プロジェクト: aitgon/vtam
    def setUpClass(cls):
        """Prepare the taxonomy TSV and the COI blast DB used by the taxassign
        tests."""

        # vtam needs to be in the tsv_path
        pip_install_vtam_for_tests()

        def run_vtam(cmd):
            # Windows expects the raw command string, POSIX an argv list.
            # check=True: a silent failure here would only surface later as
            # confusing missing-file errors inside the tests themselves.
            args = cmd if sys.platform.startswith("win") else shlex.split(cmd)
            subprocess.run(args=args, check=True)

        cls.test_path = os.path.join(PathManager.get_test_path())
        cls.outdir_path = os.path.join(cls.test_path, 'outdir')

        cls.args = {}
        cls.args['taxonomy'] = os.path.join(cls.outdir_path, "taxonomy.tsv")
        cls.args['coi_blast_db_dir'] = os.path.join(cls.outdir_path, "coi_blast_db_dir")
        pathlib.Path(cls.args['coi_blast_db_dir']).mkdir(exist_ok=True, parents=True)

        ############################################################################################
        #
        # Run 'vtam taxonomy'
        #
        ############################################################################################

        run_vtam("vtam taxonomy --output {taxonomy} --precomputed".format(**cls.args))

        ############################################################################################
        #
        # Run 'vtam coi_blast_db'
        #
        ############################################################################################

        run_vtam("vtam coi_blast_db --blastdbdir {coi_blast_db_dir} --blastdbname coi_blast_db_20200420 ".format(**cls.args))
コード例 #21
0
    def setUp(self):
        """Build an in-memory fixture for the min_replicate_number filter.

        Creates four COI amplicon sequences that differ by only a few bases
        (variant ids 1-4) and a read-count table with one run, one marker,
        one sample and two replicates per variant.
        """
        os.environ['VTAM_THREADS'] = str(multiprocessing.cpu_count())

        # Input from min_replicate_number
        self.variant_df = pandas.DataFrame(
            {
                'sequence': [
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                    'TGTTCTTTATTTATTATTTGATGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTCGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTCGCTGTAACTTTATCATTATCAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                ],
            },
            index=list(range(1, 5)))
        #
        # Read counts per (variant, replicate): two replicates per variant,
        # chosen so that some variants fall below the LFN thresholds.
        self.variant_read_count_df = pandas.DataFrame({
            'run_id': [1] * 8,
            'marker_id': [1] * 8,
            'sample_id': [1] * 8,
            'replicate': [1, 2] * 4,
            'variant_id': [1] * 2 + [2] * 2 + [3] * 2 + [4] * 2,
            'read_count': [
                350,
                300,
                300,
                220,
                60,
                0,
                2,
                0,
            ],
        })

        # Per-test temporary directory named after this test module
        self.this_tempdir = os.path.join(PathManager.instance().get_tempdir(),
                                         os.path.basename(__file__))
        pathlib.Path(self.this_tempdir).mkdir(parents=True, exist_ok=True)
コード例 #22
0
ファイル: test_cmd_pool.py プロジェクト: ulysse06/vtam
    def setUp(self):
        """Install vtam, reset the output directory and load the SQLite
        fixture tables required by the pool command tests."""

        pip_install_vtam_for_tests()

        self.test_path = PathManager.get_test_path()
        self.package_path = PathManager.get_package_path()
        self.outdir_path = os.path.join(self.test_path, 'outdir')
        shutil.rmtree(self.outdir_path, ignore_errors=True)
        pathlib.Path(self.outdir_path).mkdir(parents=True, exist_ok=True)

        self.args = {
            'runmarker': os.path.join(self.package_path, "data",
                                      "example", "pool_run_marker.tsv"),
            'db': os.path.join(self.outdir_path, "db.sqlite"),
        }

        ############################################################################################
        #
        # Init DB from the small dryad test dataset
        #
        ############################################################################################

        small_run_dir = os.path.join(self.test_path,
                                     "test_files_dryad.f40v5_small",
                                     "run1_mfzr_zfzr")
        filter_codon_stop_path = os.path.join(small_run_dir,
                                              "filter_codon_stop.tsv")
        variant_path = os.path.join(small_run_dir,
                                    "variant_filter_codon_stop.tsv")
        sample_information_path = os.path.join(small_run_dir,
                                               "sample_information.tsv")

        self.engine = sqlalchemy.create_engine(
            'sqlite:///{}'.format(self.args['db']), echo=False)

        def load_table(df, table_name, **to_sql_kwargs):
            # One fresh connection per table, replacing any existing table
            df.to_sql(name=table_name, con=self.engine.connect(),
                      if_exists='replace', **to_sql_kwargs)

        load_table(
            pandas.read_csv(sample_information_path, sep="\t", header=0),
            SampleInformation.__tablename__)
        load_table(
            pandas.DataFrame({'name': ['run1']}, index=range(1, 2)),
            Run.__tablename__, index_label='id')
        load_table(
            pandas.DataFrame({'name': ['MFZR', 'ZFZR']}, index=range(1, 3)),
            Marker.__tablename__, index_label='id')
        load_table(
            pandas.DataFrame(
                {'name': ['tpos1_run1', 'tnegtag_run1', '14ben01', '14ben02']},
                index=range(1, 5)),
            Sample.__tablename__, index_label='id')
        load_table(
            pandas.read_csv(variant_path, sep="\t", header=0, index_col='id'),
            Variant.__tablename__, index_label='id')
        load_table(
            pandas.read_csv(filter_codon_stop_path, sep="\t", header=0),
            FilterCodonStop.__tablename__)

        filter_chimera_borderline_path = os.path.join(
            small_run_dir,
            "filter_chimera_borderline_and_filter_codon_stop.tsv")
        load_table(
            pandas.read_csv(filter_chimera_borderline_path, sep="\t", header=0),
            FilterChimeraBorderline.__tablename__)

        self.sample_list = ['tpos1_run1', 'tnegtag_run1', '14ben01', '14ben02']
コード例 #23
0
ファイル: CommandSortReads.py プロジェクト: ulysse06/vtam
    def main(fastainfo,
             fastadir,
             sorteddir,
             params=None,
             num_threads=multiprocessing.cpu_count()):
        """Demultiplex and primer-trim merged FASTA reads with cutadapt.

        For each row of the sample information TSV, reads are selected by
        tag pair in both the forward and the reverse-complement orientation,
        primers are trimmed, the reverse-complement hits are re-oriented and
        pooled with the forward hits, and a 'sortedinfo.tsv' describing the
        resulting files is written to *sorteddir*.

        :param fastainfo: TSV with run/marker/sample/replicate, tag, primer
            and merged FASTA file name columns
        :param fastadir: directory containing the merged FASTA files
        :param sorteddir: output directory for the sorted/trimmed FASTA files
        :param params: optional params YAML path with the cutadapt error
            rate and read length bounds
        :param num_threads: value for cutadapt --cores (forced to 1 on
            Windows)
        :raises subprocess.CalledProcessError: if any cutadapt call exits
            with a non-zero status
        """

        if sys.platform.startswith('win'):
            num_threads = 1

        def run_cutadapt(cmd_str):
            # Fix: previously only the first cutadapt invocation used
            # check=True; the other three silently ignored failures. Every
            # call now raises CalledProcessError on a non-zero exit status.
            Logger.instance().debug("Running: {}".format(cmd_str))
            if sys.platform.startswith("win"):
                cmd_args = cmd_str
            else:
                cmd_args = shlex.split(cmd_str)
            run_result = subprocess.run(args=cmd_args,
                                        capture_output=True,
                                        check=True)
            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

        def reverse_complement(nt_str):
            # Biopython <1.78 requires the generic_dna alphabet argument
            if generic_dna:
                return str(Seq(nt_str, generic_dna).reverse_complement())
            return str(Seq(nt_str).reverse_complement())  # Biopython >=1.78

        ############################################################################################
        #
        # params.yml parameters
        #
        ############################################################################################

        params_dic = FileParams(params).get_params_dic()

        cutadapt_error_rate = params_dic['cutadapt_error_rate']
        cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
        cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

        ############################################################################################
        #
        # Loop over tag and primer pairs to demultiplex and trim reads
        #
        ############################################################################################

        merged_fastainfo_df = FileSampleInformation(
            fastainfo).read_tsv_into_df()

        pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
        tempdir = PathManager.instance().get_tempdir()

        sorted_read_info_df = pandas.DataFrame()

        for i in range(0, merged_fastainfo_df.shape[0]):
            fasta_info_series = merged_fastainfo_df.iloc[i]

            tag_fwd = fasta_info_series.tagfwd
            tag_rev = fasta_info_series.tagrev
            primer_fwd = fasta_info_series.primerfwd
            primer_rev = fasta_info_series.primerrev
            in_fasta_basename = fasta_info_series.mergedfasta

            Logger.instance().debug(
                "Analysing FASTA file: {}".format(in_fasta_basename))

            fasta_info_df_i = fasta_info_series.to_frame().T
            in_raw_fasta_path = os.path.join(fastadir, in_fasta_basename)

            ########################################################################################
            #
            # 1. Cut adapt tag of forward reads (exact tag match, no indels)
            #
            ########################################################################################

            tag_rev_rc = reverse_complement(tag_rev)

            out_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
                '.fasta', '_sorted_%03d.fasta' % i)
            out_fasta_path = os.path.join(tempdir, out_fasta_basename)

            cmd_cutadapt_tag_dic = {
                'tag_fwd': tag_fwd,
                'tag_fwd_len': len(tag_fwd),
                'tag_rev_rc': tag_rev_rc,
                'tag_rev_rc_len': len(tag_rev_rc),
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_fasta_path,
                'num_threads': num_threads,
            }

            run_cutadapt(
                'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only '
                '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" '
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic))

            ########################################################################################
            #
            # 2. Trim primers from the tag-sorted forward reads
            #
            ########################################################################################

            primer_rev_rc = reverse_complement(primer_rev)

            in_fasta_path = out_fasta_path
            out_fasta_basename = os.path.basename(in_fasta_path).replace(
                '_sorted_%03d.fasta' % i, '_sorted_trimmed_%03d.fasta' % i)
            out_fasta_path = os.path.join(tempdir, out_fasta_basename)

            cmd_cutadapt_primer_dic = {
                'primer_fwd': primer_fwd,
                'primer_fwd_len': len(primer_fwd),
                'primer_rev_rc': primer_rev_rc,
                'primer_rev_rc_len': len(primer_rev_rc),
                'in_fasta_path': in_fasta_path,
                'out_fasta': out_fasta_path,
                'error_rate': cutadapt_error_rate,
                'read_min_length': cutadapt_minimum_length,
                'read_max_length': cutadapt_maximum_length,
                'num_threads': num_threads,
            }

            run_cutadapt(
                'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} '
                '--minimum-length {read_min_length} '
                '--maximum-length {read_max_length} --trimmed-only  '
                '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" '
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic))

            ########################################################################################
            #
            # 3. Cut adapt tag of reverse-complement reads: same template as
            # the forward pass with the tag roles swapped
            #
            ########################################################################################

            tag_fwd_rc = reverse_complement(tag_fwd)

            out_rc_fasta_basename = os.path.basename(
                in_raw_fasta_path).replace('.fasta',
                                           '_rc_sorted_%03d.fasta' % i)
            out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

            cmd_cutadapt_tag_dic = {
                'tag_fwd': tag_rev,
                'tag_fwd_len': len(tag_rev),
                'tag_rev_rc': tag_fwd_rc,
                'tag_rev_rc_len': len(tag_fwd_rc),
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_rc_fasta_path,
                'num_threads': num_threads,
            }

            run_cutadapt(
                'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only '
                '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" '
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic))

            ###################################################################
            #
            # 4. Trim primers from the tag-sorted reverse-complement reads
            #
            ###################################################################

            primer_fwd_rc = reverse_complement(primer_fwd)

            in_fasta_path = out_rc_fasta_path
            out_rc_fasta_basename = os.path.basename(in_fasta_path).replace(
                '_rc_sorted_%03d.fasta' % i,
                '_rc_sorted_trimmed_%03d.fasta' % i)
            out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

            cmd_cutadapt_primer_dic = {
                'primer_fwd': primer_rev,
                'primer_fwd_len': len(primer_rev),
                'primer_rev_rc': primer_fwd_rc,
                'primer_rev_rc_len': len(primer_fwd_rc),
                'in_fasta_path': in_fasta_path,
                'out_fasta': out_rc_fasta_path,
                'error_rate': cutadapt_error_rate,
                'read_min_length': cutadapt_minimum_length,
                'read_max_length': cutadapt_maximum_length,
                'num_threads': num_threads,
            }

            run_cutadapt(
                'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} '
                '--minimum-length {read_min_length} '
                '--maximum-length {read_max_length} --trimmed-only  '
                '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" '
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic))

            ###################################################################
            #
            # 5. Reverse complement back rc fasta and pool
            #
            ###################################################################

            out_final_fasta_basename = os.path.basename(
                in_raw_fasta_path).replace('.fasta', '_%03d.fasta' % i)
            out_final_fasta_path = os.path.join(sorteddir,
                                                out_final_fasta_basename)
            shutil.copy(out_fasta_path, out_final_fasta_path)

            Logger.instance().debug("Pooling fwd and rc reads...")
            with open(out_final_fasta_path, 'a') as fout:
                with open(out_rc_fasta_path, 'r') as fin:
                    for line in fin:
                        if line.startswith('>'):
                            fout.write(line)
                        else:
                            # rc reads are flipped back to fwd orientation
                            fout.write(
                                "%s\n" % reverse_complement(line.strip()))

            fasta_info_df_i = fasta_info_df_i[[
                'run', 'marker', 'sample', 'replicate'
            ]]
            fasta_info_df_i['sortedfasta'] = out_final_fasta_basename
            sorted_read_info_df = pandas.concat(
                [sorted_read_info_df, fasta_info_df_i], axis=0)

        fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
        sorted_read_info_df.to_csv(fasta_trimmed_info_tsv,
                                   sep="\t",
                                   header=True,
                                   index=False)
コード例 #24
0
    def setUp(self):
        """Install vtam, reset the output directory, fetch the sorted-read
        test dataset from one of three mirrors and define common paths."""

        # vtam needs to be in the tsv_path
        cmd = '{} -m pip install . -q --upgrade --use-feature=in-tree-build'.format(
            sys.executable)
        if sys.platform.startswith("win"):
            args = cmd
        else:
            args = shlex.split(cmd)
        subprocess.run(args=args, cwd=PathManager.get_project_path())

        self.package_path = os.path.join(PathManager.get_package_path())
        self.test_path = os.path.join(PathManager.get_test_path())
        self.outdir_path = os.path.join(self.test_path, 'outdir')
        self.outdir_data_path = os.path.join(self.outdir_path, 'data')
        # during development of the test, this prevents errors
        shutil.rmtree(self.outdir_path, ignore_errors=True)
        pathlib.Path(self.outdir_data_path).mkdir(parents=True, exist_ok=True)
        os.environ['VTAM_LOG_VERBOSITY'] = str(10)

        ############################################################################################
        #
        # Download sorted fasta test dataset
        #
        ############################################################################################

        sorted_tar_path = os.path.join(self.outdir_data_path, "sorted.tar.gz")
        # Reuse a valid local archive (>= 1 MB); otherwise try each mirror
        # URL in turn. The failure of the last mirror propagates, as before.
        if not os.path.isfile(sorted_tar_path) or pathlib.Path(
                sorted_tar_path).stat().st_size < 1000000:
            urls = [sorted_tar_gz_url1, sorted_tar_gz_url2, sorted_tar_gz_url3]
            for url_i, url in enumerate(urls):
                try:
                    with tqdm(...) as t:
                        t.set_description(os.path.basename(sorted_tar_path))
                        urllib.request.urlretrieve(url,
                                                   sorted_tar_path,
                                                   reporthook=tqdm_hook(t))
                    break
                except Exception:
                    if url_i == len(urls) - 1:  # no mirror left: fail loudly
                        raise

        # Fix: the tar handle is now closed even if extraction raises.
        # NOTE(review): extractall trusts the member paths of the downloaded
        # archive; the mirrors are project-controlled, but keep this in mind.
        with tarfile.open(sorted_tar_path, "r:gz") as tar:
            tar.extractall(path=self.outdir_path)

        ############################################################################################
        #
        # Paths
        #
        ############################################################################################

        self.asvtable_path = os.path.join(self.outdir_path,
                                          "asvtable_default.tsv")

        self.args = {}
        self.args['sortedinfo'] = os.path.join(os.path.dirname(__file__),
                                               "sortedinfo.tsv")
        self.args['params'] = os.path.join(os.path.dirname(__file__),
                                           "params_min_replicate_number1.yml")
        self.args['params_lfn_variant'] = os.path.join(
            os.path.dirname(__file__), "params_lfn_variant.yml")
        self.args['params_lfn_variant_replicate'] = os.path.join(
            os.path.dirname(__file__), "params_lfn_variant_replicate.yml")
コード例 #25
0
ファイル: test_asvtable_runner.py プロジェクト: ulysse06/vtam
    def setUp(self):
        """Reset the output directory and build the SQLite fixture used by
        the ASV-table runner tests."""

        self.test_path = os.path.join(PathManager.get_test_path())
        self.outdir_path = os.path.join(self.test_path, 'outdir')

        # during development of the test, this prevents errors
        shutil.rmtree(self.outdir_path, ignore_errors=True)
        pathlib.Path(self.outdir_path).mkdir(parents=True, exist_ok=True)

        db_path = os.path.join(self.outdir_path, "db.sqlite")
        small_run_dir = os.path.join(
            self.test_path, "test_files_dryad.f40v5_small/run1_mfzr_zfzr")
        filter_codon_stop_path = os.path.join(
            small_run_dir, "filter_codon_stop.tsv")
        variant_path = os.path.join(
            small_run_dir, "variant_filter_codon_stop.tsv")
        filter_chimera_borderline_path = os.path.join(
            small_run_dir,
            "filter_chimera_borderline_and_filter_codon_stop.tsv")

        self.engine = sqlalchemy.create_engine('sqlite:///{}'.format(db_path),
                                               echo=False)

        def store(df, table_name, **to_sql_kwargs):
            # One fresh connection per table (default if_exists='fail')
            df.to_sql(name=table_name, con=self.engine.connect(),
                      **to_sql_kwargs)

        store(pandas.DataFrame({'name': ['run1']}, index=range(1, 2)),
              Run.__tablename__, index_label='id')
        store(pandas.DataFrame({'name': ['MFZR', 'ZFZR']}, index=range(1, 3)),
              Marker.__tablename__, index_label='id')
        store(pandas.DataFrame(
                  {'name': ['tpos1_run1', 'tnegtag_run1', '14ben01', '14ben02']},
                  index=range(1, 5)),
              Sample.__tablename__, index_label='id')
        store(pandas.read_csv(variant_path, sep="\t", header=0,
                              index_col='id'),
              Variant.__tablename__, index_label='id')
        store(pandas.read_csv(filter_chimera_borderline_path, sep="\t",
                              header=0),
              FilterChimeraBorderline.__tablename__)

        self.filter_codon_stop_df = pandas.read_csv(filter_codon_stop_path,
                                                    sep="\t",
                                                    header=0)
        self.sample_list = ['tpos1_run1', 'tnegtag_run1', '14ben01', '14ben02']
コード例 #26
0
 def setUpClass(cls):
     """Define the shared test and output directory paths."""
     cls.test_path = PathManager.get_test_path()
     cls.outdir_path = os.path.join(cls.test_path, 'outdir')
コード例 #27
0
ファイル: test_cmd_merge_gz.py プロジェクト: aitgon/vtam
    def setUpClass(cls):
        """Define the shared paths for the gzipped merge tests."""

        cls.test_path = PathManager.get_test_path()  # path to the package 'tests' directory
        cls.outdir_path = os.path.join(cls.test_path, 'outdir_gz')
コード例 #28
0
ファイル: test_filter_lfn.py プロジェクト: ulysse06/vtam
    def setUp(self):
        """Build the variant / read-count fixture shared by the LFN filter
        tests and instantiate the filter runner."""

        self.__testdir_path = os.path.join(PathManager.get_test_path())

        #
        self.variant_df = pandas.DataFrame({
            'id': [1, 22],
            'sequence_': ["tata", "tgtg"],
        })

        # 25 variants x 2 samples x 3 replicates = 150 rows. Read counts are
        # listed per variant, six values each: sample 1 replicates 1-3, then
        # sample 2 replicates 1-3.
        read_counts = [
            10, 5, 0, 249, 58, 185,          # variant 1
            68, 54, 100, 0, 0, 0,            # variant 2
            0, 0, 0, 258, 126, 500,          # variant 3
            0, 0, 0, 0, 1, 0,                # variant 4
            0, 0, 1, 0, 0, 0,                # variant 5
            1524, 1815, 789, 118, 98, 50,    # variant 6
            1, 0, 0, 0, 0, 0,                # variant 7
            0, 1, 0, 0, 0, 0,                # variant 8
            125, 214, 20, 1284, 1789, 1913,  # variant 9
            0, 1, 0, 0, 1, 0,                # variant 10
            15, 0, 1, 0, 0, 25,              # variant 11
            0, 0, 2, 598, 50, 875,           # variant 12
            2, 60, 12, 1, 0, 0,              # variant 13
            1, 0, 0, 0, 0, 2,                # variant 14
            0, 3, 0, 0, 5, 0,                # variant 15
            65, 98, 152, 2, 0, 1,            # variant 16
            52, 74, 85, 0, 0, 0,             # variant 17
            1, 0, 0, 5, 0, 8,                # variant 18
            5, 0, 1, 0, 0, 21,               # variant 19
            0, 0, 0, 524, 658, 125,          # variant 20
            0, 0, 0, 2, 0, 10,               # variant 21
            25, 58, 23, 10980, 8999, 13814,  # variant 22
            0, 5, 0, 0, 2, 0,                # variant 23
            1, 0, 1, 1, 0, 284,              # variant 24
            0, 2, 0, 0, 5, 0,                # variant 25
        ]
        self.variant_read_count_df = pandas.DataFrame({
            'run_id': [1] * 150,
            'marker_id': [1] * 150,
            'sample_id': [1, 1, 1, 2, 2, 2] * 25,
            'replicate': [1, 2, 3] * 50,
            # [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, ...]
            'variant_id': [vid for vid in range(1, 26) for _ in range(6)],
            'read_count': read_counts,
        })
        self.marker_id = 1
        #
        self.filter_lfn_runner = RunnerFilterLFN(self.variant_read_count_df)
Code example #29
0
    def setUpClass(cls):
        """Prepare the test environment once for the class.

        Installs vtam into the current interpreter's path, builds a clean
        output directory tree, downloads the fastq test dataset (trying
        three mirror URLs in turn), extracts it, and runs the snakemake
        tutorial-data workflow to populate the directory tree.

        Raises whatever the last download mirror raised if all mirrors
        fail, and ``subprocess.CalledProcessError`` if snakemake fails.
        """

        ########################################################################
        #
        # These tests need the vtam command in the path
        #
        ########################################################################

        pip_install_vtam_for_tests()  # vtam needs to be in the path

        cls.package_path = PathManager.get_package_path()
        cls.test_path = PathManager.get_test_path()

        # Start from a clean output tree on every class setup.
        cls.outdir_path = os.path.join(cls.test_path, 'outdir')
        shutil.rmtree(cls.outdir_path, ignore_errors=True)
        cls.outdir_data_path = os.path.join(cls.outdir_path, 'data')
        pathlib.Path(cls.outdir_data_path).mkdir(parents=True, exist_ok=True)

        # Downloads are cached in a separate directory that is deliberately
        # NOT wiped, so repeated test runs reuse the archive.
        cls.outdir_download_path = os.path.join(cls.test_path,
                                                'outdir_download')
        pathlib.Path(cls.outdir_download_path).mkdir(parents=True,
                                                     exist_ok=True)

        cls.snakefile_tuto_data = os.path.join(
            cls.package_path, "data/snake.tuto.data_makeknownoccurrences.yml")

        ############################################################################################
        #
        # Set command args
        #
        ############################################################################################

        cls.args = {}
        cls.args['package_path'] = cls.package_path
        cls.args['snake_tuto_data'] = cls.snakefile_tuto_data

        ############################################################################################
        #
        # Download fastq test dataset
        #
        ############################################################################################

        fastq_tar_path = os.path.join(cls.outdir_download_path, "fastq.tar.gz")
        # Reuse a previously downloaded archive if it looks complete; a
        # truncated download would be well under ~1 MB. Otherwise try each
        # mirror in order — the last mirror's failure propagates, exactly
        # like the original nested try/except chain.
        if not os.path.isfile(fastq_tar_path) or pathlib.Path(
                fastq_tar_path).stat().st_size < 1000000:
            urls = (fastq_tar_gz_url1, fastq_tar_gz_url2, fastq_tar_gz_url3)
            for url_index, url in enumerate(urls):
                try:
                    with tqdm(...) as t:
                        t.set_description(os.path.basename(fastq_tar_path))
                        urllib.request.urlretrieve(url,
                                                   fastq_tar_path,
                                                   reporthook=tqdm_hook(t))
                    break
                except Exception:
                    if url_index == len(urls) - 1:  # all mirrors failed
                        raise
        # Context manager guarantees the archive handle is closed even if
        # extraction raises (the original leaked it on error).
        with tarfile.open(fastq_tar_path, "r:gz") as tar:
            tar.extractall(path=cls.outdir_path)

        ############################################################################################
        #
        # Copy data to directory tree
        #
        ############################################################################################

        cmd = "snakemake --cores 1 -s {snake_tuto_data} --config MARKER=mfzr " \
              "PROJECT=asper1 PACKAGE_PATH={package_path} --until all_one_marker_makeknownoccurrences".format(**cls.args)

        # On Windows subprocess expects the raw command string; elsewhere a
        # shlex-split argument list avoids invoking a shell.
        if sys.platform.startswith("win"):
            args = cmd
        else:
            args = shlex.split(cmd)
        subprocess.run(args=args, check=True, cwd=cls.outdir_path)
Code example #30
0
    def setUp(self):

        self.test_path = os.path.join(PathManager.get_test_path())
        self.outdir_path = os.path.join(self.test_path, 'outdir')
        shutil.rmtree(self.outdir_path, ignore_errors=True)
        pathlib.Path(self.outdir_path).mkdir(parents=True, exist_ok=True)