def setUpClass(cls):
    """Install the package and fetch the sorted-reads test dataset.

    Installs the current project with pip (so the ``vtam`` command is
    available), resets the test output directory, downloads
    ``sorted.tar.gz`` from one of three mirrors, and extracts it.
    """
    # NOTE: '--use-feature=in-tree-build' was removed (in-tree builds are the
    # default since pip 21.3) and makes modern pip error out, so it is dropped;
    # this also matches pip_install_vtam_for_tests elsewhere in the test suite.
    cmd = '{} -m pip install . -q --upgrade'.format(sys.executable)
    if sys.platform.startswith("win"):
        args = cmd  # Windows: pass the whole command line as one string
    else:
        args = shlex.split(cmd)
    # check=True: a failed install must abort the setup, not fail later tests
    subprocess.run(args=args, check=True, cwd=PathManager.get_project_path())

    cls.package_path = PathManager.get_package_path()
    cls.test_path = PathManager.get_test_path()
    cls.outdir_path = os.path.join(cls.test_path, 'outdir')
    cls.outdir_data_path = os.path.join(cls.outdir_path, 'data')
    # Start from a clean output directory
    shutil.rmtree(cls.outdir_path, ignore_errors=True)
    pathlib.Path(cls.outdir_data_path).mkdir(parents=True, exist_ok=True)

    ############################################################################################
    #
    # Download sorted reads dataset
    #
    ############################################################################################

    sorted_tar_path = os.path.join(cls.outdir_data_path, "sorted.tar.gz")
    pathlib.Path(os.path.dirname(sorted_tar_path)).mkdir(parents=True, exist_ok=True)

    # Reuse a plausible local archive (>= 1 MB); otherwise try each mirror in turn
    if not os.path.isfile(sorted_tar_path) or pathlib.Path(
            sorted_tar_path).stat().st_size < 1000000:
        try:
            # NOTE(review): the bare 'tqdm(...)' looks like truncated
            # constructor arguments — confirm against VCS history
            with tqdm(...) as t:
                t.set_description(os.path.basename(sorted_tar_path))
                urllib.request.urlretrieve(
                    sorted_tar_gz_url1, sorted_tar_path, reporthook=tqdm_hook(t))
        except Exception:
            try:
                with tqdm(...) as t:
                    t.set_description(os.path.basename(sorted_tar_path))
                    urllib.request.urlretrieve(
                        sorted_tar_gz_url2, sorted_tar_path, reporthook=tqdm_hook(t))
            except Exception:
                with tqdm(...) as t:
                    t.set_description(os.path.basename(sorted_tar_path))
                    urllib.request.urlretrieve(
                        sorted_tar_gz_url3, sorted_tar_path, reporthook=tqdm_hook(t))

    # Context manager guarantees the archive is closed even if extraction fails
    with tarfile.open(sorted_tar_path, "r:gz") as tar:
        tar.extractall(path=cls.outdir_data_path)
def setUpClass(cls):
    """Install vtam and fetch the sorted-reads dataset for the command tests."""
    ########################################################################
    #
    # These tests need the vtam command in the path
    #
    ########################################################################
    pip_install_vtam_for_tests()

    cls.package_path = PathManager.get_package_path()
    cls.test_path = PathManager.get_test_path()
    cls.outdir_path = os.path.join(cls.test_path, 'outdir')
    cls.outdir_data_path = os.path.join(cls.outdir_path, 'data')
    # Start from a clean output directory
    shutil.rmtree(cls.outdir_path, ignore_errors=True)
    pathlib.Path(cls.outdir_data_path).mkdir(parents=True, exist_ok=True)

    ############################################################################################
    #
    # Download sorted reads dataset (Updated Oct 10, 2020)
    #
    ############################################################################################

    sorted_tar_path = os.path.join(cls.outdir_data_path, "sorted.tar.gz")

    # Reuse a plausible local archive (>= 1 MB); otherwise try each mirror in turn
    if not os.path.isfile(sorted_tar_path) or pathlib.Path(
            sorted_tar_path).stat().st_size < 1000000:
        try:
            # NOTE(review): the bare 'tqdm(...)' looks like truncated
            # constructor arguments — confirm against VCS history
            with tqdm(...) as t:
                t.set_description(os.path.basename(sorted_tar_path))
                urllib.request.urlretrieve(
                    sorted_tar_gz_url1, sorted_tar_path, reporthook=tqdm_hook(t))
        except Exception:
            try:
                with tqdm(...) as t:
                    t.set_description(os.path.basename(sorted_tar_path))
                    urllib.request.urlretrieve(
                        sorted_tar_gz_url2, sorted_tar_path, reporthook=tqdm_hook(t))
            except Exception:
                with tqdm(...) as t:
                    t.set_description(os.path.basename(sorted_tar_path))
                    urllib.request.urlretrieve(
                        sorted_tar_gz_url3, sorted_tar_path, reporthook=tqdm_hook(t))

    # Context manager guarantees the archive is closed even if extraction fails
    with tarfile.open(sorted_tar_path, "r:gz") as tar:
        tar.extractall(path=cls.outdir_data_path)
def setUp(self):
    """Prepare path fixtures for the minimal merge-command tests."""
    self.package_path = PathManager.get_package_path()
    test_path = PathManager.get_test_path()
    # All fixture paths are expressed relative to the package directory
    self.foopaths = {
        'foofile': os.path.relpath(__file__, self.package_path),
        'foodir': os.path.relpath(os.path.dirname(__file__), self.package_path),
        'outdir': 'tests/output',
        'sortedinfo_tsv': "data/example/sortedinfo_mfzr.tsv",
        'tsv_path': "data/example/sortedinfo_mfzr.tsv",
        'known_occurrences': "data/example/known_occurrences.tsv",
    }
def setUp(self):
    """Build an in-memory ASV table, write its variants to FASTA and run a
    vsearch size-ordered clustering over them (fixtures for cluster tests)."""
    # Tab-separated ASV table: 5 fixed columns, 3 sample columns, then
    # chimera flag and sequence (row breaks reconstructed from the data)
    asv_table_str = """variant_id	marker_name	run_name	sequence_length	read_count	sample1	sample2	sample3	chimera_borderline	sequence
3	MFZR	prerun	176	9713	9712	1	0	FALSE	TCTATATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCTCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
33	MFZR	prerun	174	9713	9703	10	0	FALSE	CTATATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCTCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
333	ZFZR	prerun	157	10000	9900	10	0	FALSE	TGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCTCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
836	MFZR	prerun	176	11588	123	56	0	FALSE	TCTATATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGGACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCCCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
8368	ZFZR	prerun	157	545	500	0	45	FALSE	TGCTTGGGCAGGTATGGTAGGGACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCCCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
83683	MFZR	prerun	175	484	0	28	456	FALSE	TCTAAATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGGACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCCCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
"""
    asv_table_df = pandas.read_csv(io.StringIO(asv_table_str), sep="\t", header=0)
    self.asv_table_df = asv_table_df
    # Create this_tempdir (module-scoped scratch directory)
    this_tempdir = os.path.join(PathManager.instance().get_tempdir(),
                                os.path.basename(__file__))
    pathlib.Path(this_tempdir).mkdir(exist_ok=True)
    # Define fasta_path tsv_path
    fasta_path = os.path.join(PathManager.instance().get_tempdir(),
                              os.path.basename(__file__), 'variants.fa')
    # Create variant variant_read_count_input_df: one row per unique variant,
    # renamed to the (id, sequence, size) layout expected by DataframeVariant
    variant_df = asv_table_df[['variant_id', 'sequence', 'read_count'
                               ]].drop_duplicates(inplace=False)
    variant_df.columns = ['id', 'sequence', 'size']
    # Create fasta_path file from asv_table_df; 'size' becomes the
    # ';size=N' abundance annotation in the FASTA headers
    variant_df_utils = DataframeVariant(variant_df)
    variant_df_utils.to_fasta(fasta_path, add_column='size')
    # Define vsearch output tsv_path
    vsearch_output_path = os.path.join(
        PathManager.instance().get_tempdir(), os.path.basename(__file__),
        'centroid_out.fa')
    # Define cluster output tsv_path
    vsearch_cluster_output_path = os.path.join(
        PathManager.instance().get_tempdir(), os.path.basename(__file__),
        'cluster.fa')
    #
    # Create object and run_name vsearch; --id 1 clusters at 100% identity
    os.environ["VTAM_THREADS"] = "1"
    vsearch_parameters = {
        '--cluster_size': fasta_path,
        '--clusters': vsearch_cluster_output_path,
        '--id': 1,
        '--sizein': None,
        '--centroids': vsearch_output_path,
        "--threads": int(os.getenv('VTAM_THREADS')),
    }
    vsearch_cluster = RunnerVSearch(parameters=vsearch_parameters)
    vsearch_cluster.run()
def main(outdir):
    """Download the tutorial FASTQ dataset into *outdir* and lay out the
    project directory tree with snakemake.

    :param outdir: output directory; created (with parents) if missing
    """
    package_path = PathManager.get_package_path()
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    #######################################################################
    #
    # Download fastq
    #
    #######################################################################

    fastq_tar_path = os.path.join(outdir, "fastq.tar.gz")

    # Reuse a plausible local archive (>= 1 MB); otherwise try each mirror in turn
    if not os.path.isfile(fastq_tar_path) or pathlib.Path(fastq_tar_path).stat().st_size < 1000000:
        try:
            # NOTE(review): the bare 'tqdm(...)' looks like truncated
            # constructor arguments — confirm against VCS history
            with tqdm(...) as t:
                t.set_description(os.path.basename(fastq_tar_path))
                urllib.request.urlretrieve(
                    fastq_tar_gz_url1, fastq_tar_path, reporthook=tqdm_hook(t))
        except Exception:
            try:
                with tqdm(...) as t:
                    t.set_description(os.path.basename(fastq_tar_path))
                    urllib.request.urlretrieve(
                        fastq_tar_gz_url2, fastq_tar_path, reporthook=tqdm_hook(t))
            except Exception:
                with tqdm(...) as t:
                    t.set_description(os.path.basename(fastq_tar_path))
                    urllib.request.urlretrieve(
                        fastq_tar_gz_url3, fastq_tar_path, reporthook=tqdm_hook(t))

    # Context manager guarantees the archive is closed even if extraction fails
    with tarfile.open(fastq_tar_path, "r:gz") as tar:
        tar.extractall(path=outdir)
    os.remove(fastq_tar_path)

    #######################################################################
    #
    # Set command args
    #
    #######################################################################

    # Distinct name: the original reused 'args' for both the format mapping
    # and the subprocess argument list, which was confusing
    fmt_args = {
        'package_path': package_path,
        'snake_tuto_data': os.path.join(package_path, "data/snake.tuto.data.yml"),
    }

    #######################################################################
    #
    # Copy data to directory tree
    #
    #######################################################################

    cmd = "snakemake --cores 1 -s {snake_tuto_data} --config MARKER=mfzr " \
          "PROJECT=asper1 PACKAGE_PATH={package_path} --until all_one_marker".format(**fmt_args)
    if sys.platform.startswith("win"):
        args = cmd
    else:
        args = shlex.split(cmd)
    subprocess.run(args=args, check=True, cwd=outdir)
def setUp(self):
    """Ensure the vtam command is installed and the output dir exists."""
    pip_install_vtam_for_tests()  # tests below shell out to 'vtam'
    self.test_path = PathManager.get_test_path()
    self.outdir_path = os.path.join(self.test_path, 'outdir')
    os.makedirs(self.outdir_path, exist_ok=True)
def setUpClass(cls):
    """Resolve the FilesInputCutadapt test-file paths once for the class."""
    cls.test_path = PathManager.get_test_path()
    # Fixture files live under tests/test_files/FilesInputCutadapt
    test_files_dir = os.path.join(cls.test_path, "test_files")
    cls.tags_file_path = os.path.join(test_files_dir, "FilesInputCutadapt")
    cls.fastainfo = os.path.join(cls.tags_file_path, "fastainfo.tsv")
    cls.fastainfoNoDuplicates = os.path.join(
        cls.tags_file_path, "fastainfoNoDuplicates.tsv")
    cls.mergedFasta1 = "14Ben01_1_fw_48.fasta"
def setUp(self):
    """Fixtures for the chimera filter: two parent variants, two chimeras
    and one borderline sequence, mirroring a manual vsearch --uchime_denovo
    run whose log is reproduced below.

    Input FASTA used for the manual run:

    >parent1;size=650
    TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATAATTAGTTGG
    >parent2;size=700
    AACTATGTACACAAATTTTAGTATATTGGCAGGGATAGTAGGAACTTTACTATCGTTAGTTATCAGAATGGAATTATCAA
    CAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG
    >Chimera1;size=50
    TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG
    >Chimera2;size=300
    TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTG
    CAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG
    >Chimera3;size=50
    TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG
    """
    """(vtam_appli) gonzalez@milan:~/tmp/vsearch_uchime$ vsearch --uchime_denovo i.fa --borderline borderline.fa --nonchimeras nonchimeras.fa --chimeras chimeras.fa
    vsearch v2.7.0_linux_x86_64, 15.5GB RAM, 8 cores
    https://github.com/torognes/vsearch

    Reading file i.fa 100%
    1500 nt in 5 seqs, min 300, max 300, avg 300
    Masking 100%
    Sorting by abundance 100%
    Counting k-mers 100%
    Detecting chimeras 100%
    Found 2 (40.0%) chimeras, 2 (40.0%) non-chimeras,
    and 1 (20.0%) borderline sequences in 5 unique sequences.
    Taking abundance information into account, this corresponds to
    350 (20.0%) chimeras, 1350 (77.1%) non-chimeras,
    and 50 (2.9%) borderline sequences in 1750 total sequences"""

    # Input from min_replicate_number
    # Variants 1 and 2 are ok but 3-5 are chimeras
    self.variant_df = pandas.DataFrame(
        data={
            'sequence': [
                'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATAATTAGTTGG',
                'AACTATGTACACAAATTTTAGTATATTGGCAGGGATAGTAGGAACTTTACTATCGTTAGTTATCAGAATGGAATTATCAACAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
            ],
        },
        index=list(range(1, 6)),
    )
    #
    # Read counts match the ;size= abundances in the FASTA above
    self.variant_read_count_df = pandas.DataFrame({
        'run_id': [1] * 5,
        'marker_id': [1] * 5,
        'sample_id': [1] * 5,
        'replicate': [1] * 5,
        'variant_id': list(range(1, 6)),
        'read_count': [650, 700, 50, 350, 50],
    })
    # Module-scoped scratch directory
    self.this_tempdir = os.path.join(PathManager.instance().get_tempdir(),
                                     os.path.basename(__file__))
    pathlib.Path(self.this_tempdir).mkdir(parents=True, exist_ok=True)
    os.environ['VTAM_THREADS'] = str(multiprocessing.cpu_count())
def setUpClass(cls):
    """Shared path fixtures and cutoff constants for the wopmars-runner tests."""
    cls.package_path = PathManager.get_package_path()
    cls.test_path = PathManager.get_test_path()
    # Fixture paths, all relative to the package directory
    cls.foopaths = {
        'foofile': os.path.relpath(__file__, cls.package_path),
        'foodir': os.path.relpath(os.path.dirname(__file__), cls.package_path),
        'sorteddir': 'output',
        'sortedinfo_tsv': "data/example/sortedinfo_mfzr.tsv",
        'optimize_lfn_variant_specific':
            "tests/test_files_dryad.f40v5_small/run1_mfzr_zfzr/optimize_lfn_variant_specific.tsv",
    }
    cls.minseqlength_value_32 = 32
    cls.minseqlength_value_40 = 40
    cls.lfn_variant_replicate_cutoff = 0.002
def pip_install_vtam_for_tests():
    """Install the current project with pip so the 'vtam' command is available.

    This function is used in the tests when the vtam command is run.
    """
    cmd = '{} -m pip install . -q --upgrade'.format(sys.executable)
    # Windows expects the raw command string; POSIX wants an argv list
    args = cmd if sys.platform.startswith("win") else shlex.split(cmd)
    subprocess.run(args=args, check=True,
                   cwd=PathManager.instance().get_project_path())
def __init__(self, taxonomy_tsv=None):
    """
    :param taxonomy_tsv: Path to the taxonomy_tsv. Default None
    :type taxonomy_tsv: str

    :rtype: None
    """
    if taxonomy_tsv is None:
        # No explicit target: download into the current working directory
        self.taxonomy_tsv_path = os.path.join(os.getcwd(), "taxonomy.tsv")
    else:
        # Download to the given path, creating parent directories as needed
        self.taxonomy_tsv_path = taxonomy_tsv
        pathlib.Path(os.path.dirname(taxonomy_tsv)).mkdir(
            parents=True, exist_ok=True)
    self.tempdir = PathManager.instance().get_tempdir()
    # Precomputed gzipped taxonomy shipped next to the package
    package_path = PathManager.get_package_path()
    self.taxonomy_tsv_gz_path = os.path.join(
        package_path, "..", "data", "taxonomy.tsv.gz")
def setUp(self):
    """Load the LTG-selection fixtures and a downloaded taxonomy table."""
    test_path = os.path.join(PathManager.get_test_path())
    self.variantid_identity_lineage_df = pandas.read_csv(os.path.join(
        test_path, "test_runner_ltg_selection",
        "variantid_identity_lineage.tsv"), sep="\t", header=0)
    # Expected LTG results to compare against
    self.ltg_bak_df = pandas.read_csv(os.path.join(
        test_path, "test_runner_ltg_selection", "ltg_bak.tsv"), sep="\t")
    # create_vtam_data_dir()
    testdir_path = os.path.join(PathManager.get_test_path())
    self.outdir_path = os.path.join(testdir_path, "outdir")
    pathlib.Path(self.outdir_path).mkdir(exist_ok=True, parents=True)
    taxonomy_tsv_path = os.path.join(self.outdir_path, "taxonomy.tsv")
    # Fetches the precomputed taxonomy TSV (requires network access)
    CommandTaxonomy(
        taxonomy_tsv=taxonomy_tsv_path).download_precomputed_taxonomy()
    self.taxonomy_df = pandas.read_csv(
        taxonomy_tsv_path, sep="\t", header=0,
        dtype={'tax_id': 'int', 'parent_tax_id': 'int',
               'old_tax_id': 'float'}).drop_duplicates()
    self.taxonomy_df.set_index('tax_id', drop=True, inplace=True)
    self.taxonomy_df = self.taxonomy_df[[
        'parent_tax_id', 'rank', 'name_txt']].drop_duplicates()
    # NOTE(review): taxonomy_df is immediately replaced by the Taxonomy
    # object's dataframe below, discarding the frame assembled above —
    # confirm whether the manual read_csv pipeline is still needed
    taxonomy = Taxonomy(taxonomy_tsv_path)
    self.taxonomy_df = taxonomy.df
def test_wopmars_runner_filter_with_cutoff_specific(self):
    """The generated Wopfile must carry --cutoff_specific through to the
    FilterLFN rule's 'lfn_variant_specific_cutoff' parameter."""
    cmd = 'filter --sortedinfo {sortedinfo_tsv} --sorteddir {foodir} --asvtable asvtableoutput.tsv' \
          ' --cutoff_specific {optimize_lfn_variant_specific}'.format(**self.foopaths)
    cwd = os.getcwd()
    # Arg parsing resolves relative paths against the package dir; restore
    # the cwd in a finally block so a parse failure can't leak the chdir
    # into subsequent tests
    os.chdir(self.package_path)
    try:
        args = ArgParser.get_main_arg_parser().parse_args(cmd.split(" "))
    finally:
        os.chdir(cwd)
    wopmars_runner = RunnerWopmars(command='filter', cli_args_dic=vars(args))
    wopfile_path = os.path.relpath(
        os.path.join(PathManager.get_package_path(), "tests/output/wopfile"),
        PathManager.get_package_path())
    wopfile_path, wopfile_content = wopmars_runner.create_wopfile(
        path=wopfile_path)
    # assertEqual gives a useful diff on failure, unlike assertTrue(a == b)
    self.assertEqual(
        yaml.load(wopfile_content, Loader=yaml.SafeLoader)
        ['rule FilterLFN']['params']['lfn_variant_specific_cutoff'],
        self.foopaths['optimize_lfn_variant_specific'])
def __init__(self, variant_fasta, blast_db_dir, blast_db_name, num_threads,
             qcov_hsp_perc):
    """Store the BLAST run inputs and create a scratch directory.

    :param variant_fasta: FASTA file with the variants to BLAST
    :param blast_db_dir: directory holding the BLAST database
    :param blast_db_name: name of the BLAST database
    :param num_threads: number of threads passed to BLAST
    :param qcov_hsp_perc: query-coverage-per-HSP threshold (blastn option)
    """
    self.variant_fasta = variant_fasta
    self.blast_db_dir = blast_db_dir
    self.blast_db_name = blast_db_name
    self.num_threads = num_threads
    self.qcov_hsp_perc = qcov_hsp_perc
    # Module-scoped scratch directory under the shared temp dir
    work_dir = os.path.join(PathManager.instance().get_tempdir(),
                            os.path.basename(__file__))
    pathlib.Path(work_dir).mkdir(parents=True, exist_ok=True)
    self.this_temp_dir = work_dir
def __init__(self, variant_expected_df, variant_unexpected_df,
             variant_read_count_df):
    """Initiates object for the PCR error filter

    :param variant_expected_df: DataFrame (id, sequence) with expected variants
    :param variant_unexpected_df: DataFrame (id, sequence) with unexpected variants
    :param variant_read_count_df: DataFrame (run_id, marker_id, sample_id,
        replicate, variant_id, read_count)
    """
    self.__variant_expected_df = variant_expected_df
    self.__variant_unexpected_df = variant_unexpected_df
    self.__variant_read_count_df = variant_read_count_df
    # Class-named scratch directory under the shared temp dir
    scratch = os.path.join(PathManager.instance().get_tempdir(),
                           self.__class__.__name__)
    pathlib.Path(scratch).mkdir(parents=True, exist_ok=True)
    self.__tmp_dir = scratch
def run(self):
    """Wopmars wrapper entry point: optimize the PCR-error parameter.

    Reads the known-occurrences and sortedinfo inputs, builds the
    variant read-count (nijk) dataframe from the database, runs the
    optimizer and writes the result TSV.
    """
    session = self.session
    engine = session._session().get_bind()
    # Module-scoped scratch directory
    this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                 os.path.basename(__file__))
    pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

    ############################################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    ############################################################################################

    # Input file paths (private class attrs; name-mangled inside the class)
    known_occurrences_tsv = self.input_file(
        OptimizePCRerror.__input_file_known_occurrences)
    fasta_info_tsv = self.input_file(
        OptimizePCRerror.__input_file_sortedinfo)
    #
    # Output file paths
    output_optimize_path = self.output_file(
        OptimizePCRerror.__output_file_optimize_pcr_error)

    ############################################################################################
    #
    # Get nijk_df, known_occurrences_df
    #
    ############################################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        VariantReadCount, engine=engine)
    known_occurrences_df = FileKnownOccurrences(
        known_occurrences_tsv).to_identifier_df(engine)

    ############################################################################################
    #
    # Run optimizer and Write
    #
    ############################################################################################

    optimize_pcr_error_runner = RunnerOptimizePCRerror(
        variant_read_count_df=variant_read_count_df,
        known_occurrences_df=known_occurrences_df)
    optimize_pcr_error_runner.to_tsv(optimize_path=output_optimize_path,
                                     engine=engine)
def compute_clusters(self):
    """Cluster the variants with vsearch and return per-variant cluster info.

    Writes the variants as a size-annotated FASTA, runs
    ``vsearch --cluster_size``, and returns a DataFrame with columns
    (clusterid, variant_id, clustersize), one row per variant.
    """
    tempcluster_dir = PathManager.instance().get_tempdir()
    i_fas = os.path.join(tempcluster_dir, 'cluster_input.fas')
    with open(i_fas, 'w') as fout:
        # FASTA headers carry the abundance as ';size=N' for --cluster_size
        for idx, row in self.variant_info_df.iterrows():
            fout.write(">{};size={}\n{}\n".format(
                row.variant_id, row.read_count, row.sequence))
    cmd = "vsearch --cluster_size cluster_input.fas --id {} --otutabout otutabout.txt --clusters test".format(
        self.cluster_identity)
    if sys.platform.startswith("win"):
        args = cmd
    else:
        args = shlex.split(cmd)
    # check=True: without it a failed vsearch run surfaced later as a
    # confusing missing-file error when reading otutabout.txt
    subprocess.run(args=args, cwd=tempcluster_dir, check=True)
    otutabout_path = os.path.join(tempcluster_dir, "otutabout.txt")
    otutabout_df = pandas.read_csv(otutabout_path, sep="\t")
    otutabout_df.rename({'#OTU ID': 'centroid'}, axis=1, inplace=True)
    # Wide per-sample columns -> long (clusterid, variant_id, read_count)
    otutabout_long_df = pandas.melt(otutabout_df, id_vars=['centroid'],
                                    var_name='variant_id',
                                    value_name='read_count')
    otutabout_long_df.rename({'centroid': 'clusterid'}, axis=1, inplace=True)
    # Keep only variants actually present in a cluster
    otutabout_long_df = otutabout_long_df.loc[
        otutabout_long_df.read_count > 0]
    otutabout_long_df.variant_id = otutabout_long_df.variant_id.astype('int')
    # Number of variants per cluster
    cluster_count_df = otutabout_long_df[['clusterid', 'variant_id'
                                          ]].groupby('clusterid').count()
    cluster_count_df.rename({'variant_id': 'clustersize'}, axis=1,
                            inplace=True)
    # Attach clustersize back onto each (clusterid, variant_id) pair
    cluster_count_df = otutabout_long_df[['clusterid', 'variant_id'
                                          ]].merge(cluster_count_df,
                                                   on='clusterid')
    return cluster_count_df
def __init__(self, asv_table_df, readcounts, run_marker_df=None):
    """
    Constructor of the CommandPoolRunMarkers class

    Parameters
    ----------
    asv_table_df : pandas dataframe
        ASV table.
    readcounts : bool
        Default false. If false, boolean 0/1 is given for presence or
        absence of variant in pooled table. If true, read integer is given
        with sum of reads in the pooled runs or markers.
    run_marker_df : pandas dataframe
        Run/marker pairs to pool; if None, all runs and markers in the ASV
        table are pooled.
    """
    header = {
        'run_name', 'marker_name', 'variant_id', 'sequence_length',
        'read_count'
    }
    if not set(asv_table_df.columns
               ) >= header:  # contains at least the 'header_lower' columns
        Logger.instance().error(
            VTAMexception(
                "The ASV table structure is wrong. It is expected to contain these columns: "
                "run_name, marker_name, variant_id, sequence_length, read_count"
            ))
        sys.exit(1)
    # Sample columns sit between the 5 fixed leading columns and the last 2
    self.sample_names = asv_table_df.columns.tolist()[5:-2]
    if run_marker_df is None:  # Default: pool all marker_name
        self.asv_table_df = asv_table_df
    else:  # if run_marker_df: pool only markers in this variant_read_count_input_df
        self.asv_table_df = asv_table_df.merge(
            run_marker_df, on=['run_name', 'marker_name'])
    self.tmp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                os.path.basename(__file__))
    pathlib.Path(self.tmp_dir).mkdir(exist_ok=True)
    self.cluster_path = None  # returned by run_vsearch_to_cluster_sequences
    self.cluster_df = None  # returned by get_vsearch_clusters_to_df
    self.readcounts = readcounts  # pooled-table mode flag (see docstring)
def __init__(self, command, cli_args_dic):
    """
    :param command: takes one of two values: filter or optimize
    :param cli_args_dic: dictionary (CLIargumentDict.instance()) with command
    """
    self.command = command
    # CLI arguments overlaid with the user's params YAML values; the YAML
    # values win on key collisions, as in the original update() order
    user_params = FileParams(cli_args_dic['params']).get_params_dic()
    self.cli_args_and_numerical_params = {**cli_args_dic, **user_params}
    self.wopfile_path = None
    self.tempdir = PathManager.instance().get_tempdir()
def setUpClass(cls):
    """Build the taxonomy TSV and the COI BLAST database used by the tests."""
    # vtam needs to be in the tsv_path
    pip_install_vtam_for_tests()
    cls.test_path = PathManager.get_test_path()
    cls.outdir_path = os.path.join(cls.test_path, 'outdir')
    cls.args = {
        'taxonomy': os.path.join(cls.outdir_path, "taxonomy.tsv"),
        'coi_blast_db_dir': os.path.join(cls.outdir_path, "coi_blast_db_dir"),
    }
    # parents=True also creates outdir_path, used by the taxonomy output
    pathlib.Path(cls.args['coi_blast_db_dir']).mkdir(exist_ok=True,
                                                     parents=True)

    ############################################################################################
    #
    # Run 'vtam taxonomy'
    #
    ############################################################################################

    cmd = "vtam taxonomy --output {taxonomy} --precomputed".format(**cls.args)
    args = cmd if sys.platform.startswith("win") else shlex.split(cmd)
    # check=True: abort setup immediately if the command fails instead of
    # letting later tests fail on missing files
    subprocess.run(args=args, check=True)

    ############################################################################################
    #
    # Run 'vtam coi_blast_db'
    #
    ############################################################################################

    cmd = "vtam coi_blast_db --blastdbdir {coi_blast_db_dir} --blastdbname coi_blast_db_20200420 ".format(**cls.args)
    args = cmd if sys.platform.startswith("win") else shlex.split(cmd)
    subprocess.run(args=args, check=True)
def setUp(self): os.environ['VTAM_THREADS'] = str(multiprocessing.cpu_count()) # Input from min_replicate_number self.variant_df = pandas.DataFrame( { 'sequence': [ 'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG', 'TGTTCTTTATTTATTATTTGATGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG', 'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTCGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG', 'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTCGCTGTAACTTTATCATTATCAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG', ], }, index=list(range(1, 5))) # self.variant_read_count_df = pandas.DataFrame({ 'run_id': [1] * 8, 'marker_id': [1] * 8, 'sample_id': [1] * 8, 'replicate': [1, 2] * 4, 'variant_id': [1] * 2 + [2] * 2 + [3] * 2 + [4] * 2, 'read_count': [ 350, 300, 300, 220, 60, 0, 2, 0, ], }) self.this_tempdir = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__)) pathlib.Path(self.this_tempdir).mkdir(parents=True, exist_ok=True)
def setUp(self):
    """Create a throwaway sqlite DB populated with the run/marker/sample/
    variant and filter fixtures used by the pool tests."""
    pip_install_vtam_for_tests()
    self.test_path = PathManager.get_test_path()
    self.package_path = PathManager.get_package_path()
    self.outdir_path = os.path.join(self.test_path, 'outdir')
    # Start from a clean output directory
    shutil.rmtree(self.outdir_path, ignore_errors=True)
    pathlib.Path(self.outdir_path).mkdir(parents=True, exist_ok=True)
    self.args = {}
    self.args['runmarker'] = os.path.join(self.package_path, "data",
                                          "example", "pool_run_marker.tsv")
    self.args['db'] = os.path.join(self.outdir_path, "db.sqlite")

    ############################################################################################
    #
    # Init DB
    #
    ############################################################################################

    filter_codon_stop_path = os.path.join(
        self.test_path, "test_files_dryad.f40v5_small", "run1_mfzr_zfzr",
        "filter_codon_stop.tsv")
    variant_path = os.path.join(
        self.test_path, "test_files_dryad.f40v5_small", "run1_mfzr_zfzr",
        "variant_filter_codon_stop.tsv")
    sample_information_path = os.path.join(
        self.test_path, "test_files_dryad.f40v5_small", "run1_mfzr_zfzr",
        "sample_information.tsv")
    self.engine = sqlalchemy.create_engine('sqlite:///{}'.format(
        self.args['db']), echo=False)
    # Load each fixture TSV into its ORM-named table, replacing any
    # leftovers from a previous run
    sample_information_df = pandas.read_csv(sample_information_path,
                                            sep="\t", header=0)
    sample_information_df.to_sql(name=SampleInformation.__tablename__,
                                 con=self.engine.connect(),
                                 if_exists='replace')
    run_df = pandas.DataFrame({'name': ['run1']}, index=range(1, 2))
    run_df.to_sql(name=Run.__tablename__, con=self.engine.connect(),
                  index_label='id', if_exists='replace')
    marker_df = pandas.DataFrame({'name': ['MFZR', 'ZFZR']},
                                 index=range(1, 3))
    marker_df.to_sql(name=Marker.__tablename__, con=self.engine.connect(),
                     index_label='id', if_exists='replace')
    sample_df = pandas.DataFrame(
        {'name': ['tpos1_run1', 'tnegtag_run1', '14ben01', '14ben02']},
        index=range(1, 5))
    sample_df.to_sql(name=Sample.__tablename__, con=self.engine.connect(),
                     index_label='id', if_exists='replace')
    variant_df = pandas.read_csv(variant_path, sep="\t", header=0,
                                 index_col='id')
    variant_df.to_sql(name=Variant.__tablename__, con=self.engine.connect(),
                      index_label='id', if_exists='replace')
    filter_codon_stop_df = pandas.read_csv(filter_codon_stop_path, sep="\t",
                                           header=0)
    filter_codon_stop_df.to_sql(name=FilterCodonStop.__tablename__,
                                con=self.engine.connect(),
                                if_exists='replace')
    filter_chimera_borderline_path = os.path.join(
        self.test_path, "test_files_dryad.f40v5_small", "run1_mfzr_zfzr",
        "filter_chimera_borderline_and_filter_codon_stop.tsv")
    filter_chimera_borderline_db = pandas.read_csv(
        filter_chimera_borderline_path, sep="\t", header=0)
    filter_chimera_borderline_db.to_sql(
        name=FilterChimeraBorderline.__tablename__,
        con=self.engine.connect(), if_exists='replace')
    self.sample_list = ['tpos1_run1', 'tnegtag_run1', '14ben01', '14ben02']
def main(fastainfo, fastadir, sorteddir, params=None, num_threads=multiprocessing.cpu_count()):
    """Demultiplex and primer-trim merged FASTA reads with cutadapt.

    For each row of the sample information TSV, reads are selected twice:
    once with the forward tag/primer pair and once with the
    reverse-complemented pair (to catch reads sequenced in the opposite
    orientation). The reverse-complement hits are flipped back and pooled
    with the forward hits into one FASTA per row in *sorteddir*; a
    'sortedinfo.tsv' describing the outputs is written at the end.

    :param fastainfo: TSV with run/marker/sample/replicate, tagfwd/tagrev,
        primerfwd/primerrev and mergedfasta columns
    :param fastadir: directory holding the merged FASTA files
    :param sorteddir: output directory (created if missing)
    :param params: optional params.yml path overriding cutadapt settings
    :param num_threads: value for cutadapt --cores (forced to 1 on Windows)
    :raises subprocess.CalledProcessError: if any cutadapt invocation fails
    """
    if sys.platform.startswith('win'):
        num_threads = 1

    ############################################################################################
    #
    # params.yml parameters
    #
    ############################################################################################

    params_dic = FileParams(params).get_params_dic()
    cutadapt_error_rate = params_dic['cutadapt_error_rate']
    cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
    cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

    ############################################################################################
    #
    # Loop over tag and primer pairs to demultiplex and trim reads
    #
    ############################################################################################

    merged_fastainfo_df = FileSampleInformation(
        fastainfo).read_tsv_into_df()
    pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
    tempdir = PathManager.instance().get_tempdir()
    sorted_read_info_df = pandas.DataFrame()

    for i in range(0, merged_fastainfo_df.shape[0]):
        fasta_info_series = merged_fastainfo_df.iloc[i]

        tag_fwd = fasta_info_series.tagfwd
        tag_rev = fasta_info_series.tagrev
        primer_fwd = fasta_info_series.primerfwd
        primer_rev = fasta_info_series.primerrev
        in_fasta_basename = fasta_info_series.mergedfasta

        Logger.instance().debug(
            "Analysing FASTA file: {}".format(in_fasta_basename))

        fasta_info_df_i = fasta_info_series.to_frame().T
        in_raw_fasta_path = os.path.join(fastadir, in_fasta_basename)

        ########################################################################################
        #
        # Cut adapt tag of forward reads
        #
        ########################################################################################

        if generic_dna:  # Biopython <1.78
            tag_rev_rc = str(
                Seq(tag_rev, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            tag_rev_rc = str(Seq(tag_rev).reverse_complement())

        out_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
            '.fasta', '_sorted_%03d.fasta' % i)
        out_fasta_path = os.path.join(tempdir, out_fasta_basename)

        cmd_cutadapt_tag_dic = {
            'tag_fwd': tag_fwd,
            'tag_fwd_len': len(tag_fwd),
            'tag_rev_rc': tag_rev_rc,
            'tag_rev_rc_len': len(tag_rev_rc),
            'in_fasta_path': in_raw_fasta_path,
            'out_fasta': out_fasta_path,
            'num_threads': num_threads,
        }
        cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
            '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
            '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)

        Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))
        if sys.platform.startswith("win"):
            args = cmd_cutadapt_tag_str
        else:
            args = shlex.split(cmd_cutadapt_tag_str)
        run_result = subprocess.run(args=args, capture_output=True, check=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ########################################################################################
        #
        # Trim primers from the tag-demultiplexed forward reads
        #
        ########################################################################################

        if generic_dna:  # Biopython <1.78
            primer_rev_rc = str(
                Seq(primer_rev, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            primer_rev_rc = str(Seq(primer_rev).reverse_complement())

        in_fasta_path = out_fasta_path
        out_fasta_basename = os.path.basename(in_fasta_path).replace(
            '_sorted_%03d.fasta' % i, '_sorted_trimmed_%03d.fasta' % i)
        out_fasta_path = os.path.join(tempdir, out_fasta_basename)

        cmd_cutadapt_primer_dic = {
            'primer_fwd': primer_fwd,
            'primer_fwd_len': len(primer_fwd),
            'primer_rev_rc': primer_rev_rc,
            'primer_rev_rc_len': len(primer_rev_rc),
            'in_fasta_path': in_fasta_path,
            'out_fasta': out_fasta_path,
            'error_rate': cutadapt_error_rate,
            'read_min_length': cutadapt_minimum_length,
            'read_max_length': cutadapt_maximum_length,
            'num_threads': num_threads,
        }
        cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
            '--minimum-length {read_min_length} ' \
            '--maximum-length {read_max_length} --trimmed-only ' \
            '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \
            '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)

        Logger.instance().debug(
            "Running: {}".format(cmd_cutadapt_primer_str))
        if sys.platform.startswith("win"):
            args = cmd_cutadapt_primer_str
        else:
            args = shlex.split(cmd_cutadapt_primer_str)
        # check=True so a cutadapt failure surfaces here instead of as a
        # confusing missing-file error further down (was silently ignored)
        run_result = subprocess.run(args=args, capture_output=True, check=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ########################################################################################
        #
        # Cut adapt tag of reverse-complement reads
        #
        ########################################################################################

        if generic_dna:  # Biopython <1.78
            tag_fwd_rc = str(
                Seq(tag_fwd, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            tag_fwd_rc = str(Seq(tag_fwd).reverse_complement())

        out_rc_fasta_basename = os.path.basename(
            in_raw_fasta_path).replace('.fasta', '_rc_sorted_%03d.fasta' % i)
        out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

        # Same template as the forward pass, with the tag roles swapped
        cmd_cutadapt_tag_dic = {
            'tag_fwd': tag_rev,
            'tag_fwd_len': len(tag_rev),
            'tag_rev_rc': tag_fwd_rc,
            'tag_rev_rc_len': len(tag_fwd_rc),
            'in_fasta_path': in_raw_fasta_path,
            'out_fasta': out_rc_fasta_path,
            'num_threads': num_threads,
        }
        cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
            '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
            '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)

        Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))
        if sys.platform.startswith("win"):
            args = cmd_cutadapt_tag_str
        else:
            args = shlex.split(cmd_cutadapt_tag_str)
        run_result = subprocess.run(args=args, capture_output=True, check=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ###################################################################
        #
        # Trim primers from the reverse-complement reads
        #
        ###################################################################

        if generic_dna:  # Biopython <1.78
            primer_fwd_rc = str(
                Seq(primer_fwd, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            primer_fwd_rc = str(Seq(primer_fwd).reverse_complement())

        in_fasta_path = out_rc_fasta_path
        out_rc_fasta_basename = os.path.basename(in_fasta_path).replace(
            '_rc_sorted_%03d.fasta' % i, '_rc_sorted_trimmed_%03d.fasta' % i)
        out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

        cmd_cutadapt_primer_dic = {
            'primer_fwd': primer_rev,
            'primer_fwd_len': len(primer_rev),
            'primer_rev_rc': primer_fwd_rc,
            'primer_rev_rc_len': len(primer_fwd_rc),
            'in_fasta_path': in_fasta_path,
            'out_fasta': out_rc_fasta_path,
            'error_rate': cutadapt_error_rate,
            'read_min_length': cutadapt_minimum_length,
            'read_max_length': cutadapt_maximum_length,
            'num_threads': num_threads,
        }
        cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
            '--minimum-length {read_min_length} ' \
            '--maximum-length {read_max_length} --trimmed-only ' \
            '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \
            '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)

        Logger.instance().debug(
            "Running: {}".format(cmd_cutadapt_primer_str))
        if sys.platform.startswith("win"):
            args = cmd_cutadapt_primer_str
        else:
            args = shlex.split(cmd_cutadapt_primer_str)
        run_result = subprocess.run(args=args, capture_output=True, check=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ###################################################################
        #
        # Reverse complement back rc fasta and pool
        #
        ###################################################################

        out_final_fasta_basename = os.path.basename(
            in_raw_fasta_path).replace('.fasta', '_%03d.fasta' % i)
        out_final_fasta_path = os.path.join(sorteddir, out_final_fasta_basename)
        # Forward hits first, then rc hits flipped back to forward orientation
        shutil.copy(out_fasta_path, out_final_fasta_path)

        Logger.instance().debug("Pooling fwd and rc reads...")
        with open(out_final_fasta_path, 'a') as fout:
            with open(out_rc_fasta_path, 'r') as fin:
                for line in fin:
                    if not line.startswith('>'):
                        if generic_dna:  # Biopython <1.78
                            fout.write("%s\n" % str(
                                Seq(line.strip(), generic_dna).reverse_complement()))
                        else:  # Biopython >=1.78
                            fout.write("%s\n" % str(
                                Seq(line.strip()).reverse_complement()))
                    else:
                        fout.write(line)

        fasta_info_df_i = fasta_info_df_i[[
            'run', 'marker', 'sample', 'replicate'
        ]]
        fasta_info_df_i['sortedfasta'] = out_final_fasta_basename
        sorted_read_info_df = pandas.concat(
            [sorted_read_info_df, fasta_info_df_i], axis=0)

    # One TSV describing all sorted FASTA outputs
    fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
    sorted_read_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t", header=True,
                               index=False)
def setUp(self):
    """Install vtam, reset 'outdir', fetch (or reuse) the sorted-reads test
    archive and set the argument paths shared by the tests of this class."""
    # vtam needs to be in the tsv_path
    cmd = '{} -m pip install . -q --upgrade --use-feature=in-tree-build'.format(
        sys.executable)
    # On Windows subprocess does its own quoting; elsewhere split the string
    if sys.platform.startswith("win"):
        args = cmd
    else:
        args = shlex.split(cmd)
    subprocess.run(args=args, cwd=PathManager.get_project_path())
    self.package_path = os.path.join(PathManager.get_package_path())
    self.test_path = os.path.join(PathManager.get_test_path())
    self.outdir_path = os.path.join(self.test_path, 'outdir')
    self.outdir_data_path = os.path.join(self.outdir_path, 'data')
    # during development of the test, this prevents errors
    shutil.rmtree(self.outdir_path, ignore_errors=True)
    pathlib.Path(self.outdir_data_path).mkdir(parents=True, exist_ok=True)
    os.environ['VTAM_LOG_VERBOSITY'] = str(10)

    ############################################################################################
    #
    # Download sorted fasta test dataset
    #
    ############################################################################################

    sorted_tar_path = os.path.join(self.outdir_data_path, "sorted.tar.gz")
    # Test first in local dir, otherwise in the remote URLs; the size guard
    # re-downloads partial/corrupt archives (< ~1 MB)
    if not os.path.isfile(sorted_tar_path) or pathlib.Path(
            sorted_tar_path).stat().st_size < 1000000:
        try:
            # urllib.request.urlretrieve(sorted_tar_gz_url1, sorted_tar_path, MyProgressBar())
            # NOTE(review): tqdm(...) passes the Ellipsis literal; the bar is
            # driven by tqdm_hook via reporthook, not by iteration — confirm
            # this is intentional
            with tqdm(...) as t:
                t.set_description(os.path.basename(sorted_tar_path))
                urllib.request.urlretrieve(sorted_tar_gz_url1, sorted_tar_path,
                                           reporthook=tqdm_hook(t))
        except Exception:
            # Fall back to mirror URLs if the first download fails
            try:
                # urllib.request.urlretrieve(sorted_tar_gz_url2, sorted_tar_path, MyProgressBar())
                with tqdm(...) as t:
                    t.set_description(os.path.basename(sorted_tar_path))
                    urllib.request.urlretrieve(sorted_tar_gz_url2, sorted_tar_path,
                                               reporthook=tqdm_hook(t))
            except Exception:
                # urllib.request.urlretrieve(sorted_tar_gz_url3, sorted_tar_path, MyProgressBar())
                with tqdm(...) as t:
                    t.set_description(os.path.basename(sorted_tar_path))
                    urllib.request.urlretrieve(sorted_tar_gz_url3, sorted_tar_path,
                                               reporthook=tqdm_hook(t))
    # Unpack the archive into outdir
    tar = tarfile.open(sorted_tar_path, "r:gz")
    tar.extractall(path=self.outdir_path)
    tar.close()

    ############################################################################################
    #
    # Paths
    #
    ############################################################################################

    self.asvtable_path = os.path.join(self.outdir_path, "asvtable_default.tsv")
    self.args = {}
    # Fixture files living next to this test module
    self.args['sortedinfo'] = os.path.join(os.path.dirname(__file__), "sortedinfo.tsv")
    self.args['params'] = os.path.join(os.path.dirname(__file__), "params_min_replicate_number1.yml")
    self.args['params_lfn_variant'] = os.path.join(
        os.path.dirname(__file__), "params_lfn_variant.yml")
    self.args['params_lfn_variant_replicate'] = os.path.join(
        os.path.dirname(__file__), "params_lfn_variant_replicate.yml")
def setUp(self): self.test_path = os.path.join(PathManager.get_test_path()) self.outdir_path = os.path.join(self.test_path, 'outdir') # during development of the test, this prevents errors shutil.rmtree(self.outdir_path, ignore_errors=True) pathlib.Path(self.outdir_path).mkdir(parents=True, exist_ok=True) db_path = os.path.join(self.outdir_path, "db.sqlite") filter_codon_stop_path = os.path.join( self.test_path, "test_files_dryad.f40v5_small/run1_mfzr_zfzr/filter_codon_stop.tsv" ) variant_path = os.path.join( self.test_path, "test_files_dryad.f40v5_small/run1_mfzr_zfzr/variant_filter_codon_stop.tsv" ) filter_chimera_borderline_path = os.path.join( self.test_path, "test_files_dryad.f40v5_small/run1_mfzr_zfzr/filter_chimera_borderline_and_filter_codon_stop.tsv" ) self.engine = sqlalchemy.create_engine('sqlite:///{}'.format(db_path), echo=False) run_df = pandas.DataFrame({'name': ['run1']}, index=range(1, 2)) run_df.to_sql(name=Run.__tablename__, con=self.engine.connect(), index_label='id') marker_df = pandas.DataFrame({'name': ['MFZR', 'ZFZR']}, index=range(1, 3)) marker_df.to_sql(name=Marker.__tablename__, con=self.engine.connect(), index_label='id') sample_df = pandas.DataFrame( {'name': ['tpos1_run1', 'tnegtag_run1', '14ben01', '14ben02']}, index=range(1, 5)) sample_df.to_sql(name=Sample.__tablename__, con=self.engine.connect(), index_label='id') variant_df = pandas.read_csv(variant_path, sep="\t", header=0, index_col='id') variant_df.to_sql(name=Variant.__tablename__, con=self.engine.connect(), index_label='id') filter_chimera_borderline_db = pandas.read_csv( filter_chimera_borderline_path, sep="\t", header=0) filter_chimera_borderline_db.to_sql( name=FilterChimeraBorderline.__tablename__, con=self.engine.connect()) self.filter_codon_stop_df = pandas.read_csv(filter_codon_stop_path, sep="\t", header=0) self.sample_list = ['tpos1_run1', 'tnegtag_run1', '14ben01', '14ben02']
def setUpClass(cls): cls.test_path = PathManager.get_test_path() cls.outdir_path = os.path.join(cls.test_path, 'outdir')
def setUpClass(cls): cls.test_path = PathManager.get_test_path() # return the path vtam.test_path__path__[0]/tests cls.outdir_path = os.path.join(cls.test_path, 'outdir_gz')
def setUp(self): self.__testdir_path = os.path.join(PathManager.get_test_path()) # self.variant_df = pandas.DataFrame({ 'id': [1, 22], 'sequence_': ["tata", "tgtg"], }) self.variant_read_count_df = pandas.DataFrame({ 'run_id': [1] * 150, 'marker_id': 150 * [1], 'sample_id': [1, 1, 1, 2, 2, 2] * 25, 'replicate': [1, 2, 3] * 50, 'variant_id': [*itertools.chain(*[[l] * 6 for l in range(1, 26)])], # [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, .. 'read_count': [ 10, 5, 0, 249, 58, 185, 68, 54, 100, 0, 0, 0, 0, 0, 0, 258, 126, 500, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1524, 1815, 789, 118, 98, 50, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 125, 214, 20, 1284, 1789, 1913, 0, 1, 0, 0, 1, 0, 15, 0, 1, 0, 0, 25, 0, 0, 2, 598, 50, 875, 2, 60, 12, 1, 0, 0, 1, 0, 0, 0, 0, 2, 0, 3, 0, 0, 5, 0, 65, 98, 152, 2, 0, 1, 52, 74, 85, 0, 0, 0, 1, 0, 0, 5, 0, 8, 5, 0, 1, 0, 0, 21, 0, 0, 0, 524, 658, 125, 0, 0, 0, 2, 0, 10, 25, 58, 23, 10980, 8999, 13814, 0, 5, 0, 0, 2, 0, 1, 0, 1, 1, 0, 284, 0, 2, 0, 0, 5, 0, ], }) self.marker_id = 1 # self.filter_lfn_runner = RunnerFilterLFN(self.variant_read_count_df)
def setUpClass(cls):
    """Install vtam, reset the output directories, fetch (or reuse) the
    fastq test archive and lay out the tutorial directory tree via
    snakemake. Runs once for the whole test class."""
    ########################################################################
    #
    # These tests need the vtam command in the path
    #
    ########################################################################

    pip_install_vtam_for_tests()  # vtam needs to be in the path

    cls.package_path = PathManager.get_package_path()
    cls.test_path = PathManager.get_test_path()
    cls.outdir_path = os.path.join(cls.test_path, 'outdir')
    # Start from a clean outdir; the download cache below survives in
    # outdir_download so the archive is not re-fetched every run
    shutil.rmtree(cls.outdir_path, ignore_errors=True)
    cls.outdir_data_path = os.path.join(cls.outdir_path, 'data')
    pathlib.Path(cls.outdir_data_path).mkdir(parents=True, exist_ok=True)
    cls.outdir_download_path = os.path.join(cls.test_path, 'outdir_download')
    pathlib.Path(cls.outdir_download_path).mkdir(parents=True, exist_ok=True)
    cls.snakefile_tuto_data = os.path.join(
        cls.package_path, "data/snake.tuto.data_makeknownoccurrences.yml")

    ############################################################################################
    #
    # Set command args
    #
    ############################################################################################

    cls.args = {}
    cls.args['package_path'] = cls.package_path
    cls.args['snake_tuto_data'] = cls.snakefile_tuto_data

    ############################################################################################
    #
    # Download fastq test dataset
    #
    ############################################################################################

    fastq_tar_path = os.path.join(cls.outdir_download_path, "fastq.tar.gz")
    # Test first in local dir, otherwise in the remote URLs; the size guard
    # re-downloads partial/corrupt archives (< ~1 MB)
    if not os.path.isfile(fastq_tar_path) or pathlib.Path(
            fastq_tar_path).stat().st_size < 1000000:
        try:
            # urllib.request.urlretrieve(fastq_tar_gz_url1, fastq_tar_path, MyProgressBar())
            # NOTE(review): tqdm(...) passes the Ellipsis literal; the bar is
            # driven by tqdm_hook via reporthook — confirm intentional
            with tqdm(...) as t:
                t.set_description(os.path.basename(fastq_tar_path))
                urllib.request.urlretrieve(fastq_tar_gz_url1, fastq_tar_path,
                                           reporthook=tqdm_hook(t))
        except Exception:
            # Fall back to mirror URLs if the first download fails
            try:
                # urllib.request.urlretrieve(fastq_tar_gz_url2, fastq_tar_path, MyProgressBar())
                with tqdm(...) as t:
                    t.set_description(os.path.basename(fastq_tar_path))
                    urllib.request.urlretrieve(fastq_tar_gz_url2, fastq_tar_path,
                                               reporthook=tqdm_hook(t))
            except Exception:
                # urllib.request.urlretrieve(fastq_tar_gz_url3, fastq_tar_path, MyProgressBar())
                with tqdm(...) as t:
                    t.set_description(os.path.basename(fastq_tar_path))
                    urllib.request.urlretrieve(fastq_tar_gz_url3, fastq_tar_path,
                                               reporthook=tqdm_hook(t))
    # Unpack the archive into outdir
    tar = tarfile.open(fastq_tar_path, "r:gz")
    tar.extractall(path=cls.outdir_path)
    tar.close()

    ############################################################################################
    #
    # Copy data to directory tree
    #
    ############################################################################################

    cmd = "snakemake --cores 1 -s {snake_tuto_data} --config MARKER=mfzr " \
        "PROJECT=asper1 PACKAGE_PATH={package_path} --until all_one_marker_makeknownoccurrences".format(**cls.args)
    # On Windows subprocess does its own quoting; elsewhere split the string
    if sys.platform.startswith("win"):
        args = cmd
    else:
        args = shlex.split(cmd)
    # check=True: abort the whole class if the tutorial data layout fails
    subprocess.run(args=args, check=True, cwd=cls.outdir_path)
def setUp(self): self.test_path = os.path.join(PathManager.get_test_path()) self.outdir_path = os.path.join(self.test_path, 'outdir') shutil.rmtree(self.outdir_path, ignore_errors=True) pathlib.Path(self.outdir_path).mkdir(parents=True, exist_ok=True)