def main(cls, db, pooled_marker_tsv, run_marker_tsv, params, readcounts): """ Parameters ---------- db : str path to DB in sqlite format pooled_marker_tsv : str path to output pooled_marker.tsv file run_marker_tsv : str path to input run_marker.tsv file params readcounts: bool Output absence/presence (False) or sum of read counts (True) Returns ------- """ ####################################################################### # # Parameters # ####################################################################### # params_dic = constants.get_params_default_dic() params_dic = FileParams(params).get_params_dic() cluster_identity = params_dic['cluster_identity'] run_marker_file_obj = FileRunMarker(tsv_path=run_marker_tsv) # run_marker_tsv_reader = RunMarkerTSVreader(db=db, run_marker_tsv_path=run_marker_tsv) if not (run_marker_tsv is None): run_marker_df = run_marker_file_obj.read_tsv_into_df() else: run_marker_df = None engine = sqlalchemy.create_engine('sqlite:///{}'.format(db), echo=False) Base = automap_base() Base.prepare(engine, reflect=True) sample_list = run_marker_file_obj.get_sample_ids(engine) sample_list = NameIdConverter(id_name_or_sequence_list=sample_list, engine=engine).to_names(Sample) ############################################################################################ # # Compute all variant_read_count_input_df required for ASV table # ############################################################################################ variant_read_count_df = run_marker_file_obj.get_variant_read_count_df( engine=engine, variant_read_count_like_model=FilterCodonStop) asv_table_runner = RunnerAsvTable( variant_read_count_df=variant_read_count_df, engine=engine, sample_list=sample_list, cluster_identity=cluster_identity) asv_table_df = asv_table_runner.create_asvtable_df() asv_table_df.rename( { 'run': 'run_name', 'marker': 'marker_name', 'variant': 'variant_id' }, axis=1, inplace=True) ############################################################################################ # # Prefix biosample columns with run name for same biosample name in different runs # ############################################################################################ asv_table_2_df = asv_table_df.copy() for run_name_i, run_name in enumerate(asv_table_df.run_name.unique()): asv_table_runi_df = ( asv_table_df.loc[asv_table_df.run_name == run_name]).copy() for biosample in asv_table_runi_df.iloc[:, 5:-4].columns.tolist(): asv_table_runi_df.rename( {biosample: run_name + '-' + biosample}, axis=1, inplace=True) if run_name_i == 0: asv_table_2_df = asv_table_runi_df else: asv_table_2_df = pandas.concat([ asv_table_2_df, pandas.DataFrame(columns=asv_table_runi_df.columns) ]) asv_table_2_df = asv_table_2_df.fillna(0) asv_table_2_df = pandas.concat( [asv_table_2_df, asv_table_runi_df], axis=0, join='outer') del (asv_table_runi_df) ############################################################################################ # # Reorder columns # ############################################################################################ column_list = asv_table_2_df.columns.tolist() column_list.remove("run_name") column_list.insert(0, "run_name") column_list.remove("clusterid") column_list.remove("clustersize") column_list.remove("chimera_borderline") column_list.remove("sequence") column_list = column_list + [ 'clusterid', 'clustersize', 'chimera_borderline', 'sequence' ] column_list.remove("sequence_length") column_list.remove("read_count") column_list.insert(3, "sequence_length") 
column_list.insert(4, "read_count") asv_table_2_df = asv_table_2_df[column_list] ############################################################################################ # # Pool markers # ############################################################################################ pool_marker_runner = CommandPoolRunMarkers(asv_table_df=asv_table_2_df, run_marker_df=run_marker_df, readcounts=readcounts) pooled_marker_df = pool_marker_runner.get_pooled_marker_df() ####################################################################### # # Cluster sequences # ####################################################################### # reset asvtable-based clusterid and clustersize pooled_marker_df.drop(['clusterid', 'clustersize'], axis=1, inplace=True) pooled_marker_df.rename({'variant': 'variant_id'}, axis=1, inplace=True) # prepare pooled_marker_df['read_count'] = pooled_marker_df.iloc[:, 4:-2].sum( axis=1) # prepare seq_clusterer_obj = SequenceClusterer( pooled_marker_df, cluster_identity=cluster_identity) cluster_count_df = seq_clusterer_obj.compute_clusters() pooled_marker_df = pooled_marker_df.merge(cluster_count_df, on='variant_id') pooled_marker_df.drop(['read_count'], axis=1, inplace=True) ############################################################################################ # # Reorder columns # ############################################################################################ column_list = pooled_marker_df.columns.tolist() column_list.remove("pooled_sequences") column_list.remove("sequence") column_list = column_list + ['pooled_sequences', 'sequence'] pooled_marker_df = pooled_marker_df[column_list] # change dtypes for col in pooled_marker_df.columns[4:-4]: pooled_marker_df[col] = pooled_marker_df[col].astype(int) # verify here if the run-sample exists in the sampleinformation database # and drop if not run_biosample_cols = pooled_marker_df.columns[4:-4] # run_biosample_item = run_biosample_cols[0] from sqlalchemy.orm import sessionmaker Session = sessionmaker(bind=engine) session = Session() for run_biosample_item in run_biosample_cols: thisrun, thisbiosample = run_biosample_item.split('-') rowcount = session.query( SampleInformation, Sample, Run).filter(SampleInformation.sample_id == Sample.id).filter( SampleInformation.run_id == Run.id).filter( Run.name == thisrun).filter( Sample.name == thisbiosample).count() if rowcount <= 0: pooled_marker_df.drop([run_biosample_item], axis=1, inplace=True) ####################################################################### # # To tsv # ####################################################################### pooled_marker_df.to_csv(pooled_marker_tsv, sep="\t", index=False)
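# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of VTAM): how the "prefix sample columns with the run name"
# step above behaves on a toy ASV table, so identical sample names from different runs stay
# distinguishable after pooling. Column names and toy values are hypothetical; only pandas is
# assumed.
# --------------------------------------------------------------------------------------------
def _sketch_prefix_samples_with_run_name():
    import pandas

    asv_df = pandas.DataFrame({
        'run_name': ['run1', 'run2'],
        'variant_id': [1, 2],
        'tpos1': [100, 50],  # the same sample name occurs in both runs
    })
    prefixed_df = pandas.DataFrame()
    for run_name in asv_df.run_name.unique():
        run_df = asv_df.loc[asv_df.run_name == run_name].copy()
        run_df.rename(columns={'tpos1': '{}-tpos1'.format(run_name)}, inplace=True)
        prefixed_df = pandas.concat([prefixed_df, run_df], axis=0, join='outer')
    # Samples absent from a given run end up as NaN; the command above fills them with 0
    return prefixed_df.fillna(0)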
def main(fastainfo, fastadir, sorteddir, params=None, num_threads=multiprocessing.cpu_count()): if sys.platform.startswith('win'): num_threads = 1 ############################################################################################ # # params.yml parameters # ############################################################################################ params_dic = FileParams(params).get_params_dic() cutadapt_error_rate = params_dic['cutadapt_error_rate'] cutadapt_minimum_length = params_dic['cutadapt_minimum_length'] cutadapt_maximum_length = params_dic['cutadapt_maximum_length'] ############################################################################################ # # Loop over tag and primer pairs to demultiplex and trim reads # ############################################################################################ merged_fastainfo_df = FileSampleInformation( fastainfo).read_tsv_into_df() pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True) tempdir = PathManager.instance().get_tempdir() sorted_read_info_df = pandas.DataFrame() for i in range(0, merged_fastainfo_df.shape[0]): fasta_info_series = merged_fastainfo_df.iloc[i] tag_fwd = fasta_info_series.tagfwd tag_rev = fasta_info_series.tagrev primer_fwd = fasta_info_series.primerfwd primer_rev = fasta_info_series.primerrev in_fasta_basename = fasta_info_series.mergedfasta Logger.instance().debug( "Analysing FASTA file: {}".format(in_fasta_basename)) fasta_info_df_i = fasta_info_series.to_frame().T in_raw_fasta_path = os.path.join(fastadir, in_fasta_basename) ######################################################################################## # # Cut adapt tag of forward reads # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only # --front 'tcgatcacgatgt;min_overlap=13...gctgtagatcgaca;min_overlap=14' # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta # ######################################################################################## if generic_dna: # Biopython <1.78 tag_rev_rc = str( Seq(tag_rev, generic_dna).reverse_complement()) else: # Biopython =>1.78 tag_rev_rc = str(Seq(tag_rev).reverse_complement()) out_fasta_basename = os.path.basename(in_raw_fasta_path).replace( '.fasta', '_sorted_%03d.fasta' % i) out_fasta_path = os.path.join(tempdir, out_fasta_basename) cmd_cutadapt_tag_dic = { 'tag_fwd': tag_fwd, 'tag_fwd_len': len(tag_fwd), 'tag_rev_rc': tag_rev_rc, 'tag_rev_rc_len': len(tag_rev_rc), 'in_fasta_path': in_raw_fasta_path, 'out_fasta': out_fasta_path, 'num_threads': num_threads, } cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \ '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \ '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic) Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str)) if sys.platform.startswith("win"): args = cmd_cutadapt_tag_str else: args = shlex.split(cmd_cutadapt_tag_str) run_result = subprocess.run(args=args, capture_output=True, check=True) Logger.instance().info(run_result.stdout.decode()) Logger.instance().info(run_result.stderr.decode()) ######################################################################################## # # Trim primers from output # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only # --front 'TCCACTAATCACAARGATATTGGTAC;min_overlap=26...GGAGGATTTGGWAATTGATTAGTW;min_overlap=24' # 
--output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_trimmed_000.fasta # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta # ######################################################################################## if generic_dna: # Biopython <1.78 primer_rev_rc = str( Seq(primer_rev, generic_dna).reverse_complement()) else: # Biopython =>1.78 primer_rev_rc = str(Seq(primer_rev).reverse_complement()) in_fasta_path = out_fasta_path out_fasta_basename = os.path.basename(in_fasta_path).replace( '_sorted_%03d.fasta' % i, '_sorted_trimmed_%03d.fasta' % i) out_fasta_path = os.path.join(tempdir, out_fasta_basename) cmd_cutadapt_primer_dic = { 'primer_fwd': primer_fwd, 'primer_fwd_len': len(primer_fwd), 'primer_rev_rc': primer_rev_rc, 'primer_rev_rc_len': len(primer_rev_rc), 'in_fasta_path': in_fasta_path, 'out_fasta': out_fasta_path, 'error_rate': cutadapt_error_rate, 'read_min_length': cutadapt_minimum_length, 'read_max_length': cutadapt_maximum_length, 'num_threads': num_threads, } cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \ '--minimum-length {read_min_length} ' \ '--maximum-length {read_max_length} --trimmed-only ' \ '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \ '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic) Logger.instance().debug( "Running: {}".format(cmd_cutadapt_primer_str)) if sys.platform.startswith("win"): args = cmd_cutadapt_primer_str else: args = shlex.split(cmd_cutadapt_primer_str) run_result = subprocess.run(args=args, capture_output=True) Logger.instance().info(run_result.stdout.decode()) Logger.instance().info(run_result.stderr.decode()) ######################################################################################## # # Cut adapt tag of reverse-complement reads # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only # --front 'tgtcgatctacagc;min_overlap=14...acatcgtgatcga;min_overlap=13' # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta # ######################################################################################## if generic_dna: # Biopython <1.78 tag_fwd_rc = str( Seq(tag_fwd, generic_dna).reverse_complement()) else: # Biopython =>1.78 tag_fwd_rc = str(Seq(tag_fwd).reverse_complement()) out_rc_fasta_basename = os.path.basename( in_raw_fasta_path).replace('.fasta', '_rc_sorted_%03d.fasta' % i) out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename) cmd_cutadapt_tag_dic = { 'tag_fwd': tag_rev, 'tag_fwd_len': len(tag_rev), 'tag_rev_rc': tag_fwd_rc, 'tag_rev_rc_len': len(tag_fwd_rc), 'in_fasta_path': in_raw_fasta_path, 'out_fasta': out_rc_fasta_path, 'num_threads': num_threads, } cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \ '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \ '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic) Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str)) if sys.platform.startswith("win"): args = cmd_cutadapt_tag_str else: args = shlex.split(cmd_cutadapt_tag_str) run_result = subprocess.run(args=args, capture_output=True) Logger.instance().info(run_result.stdout.decode()) Logger.instance().info(run_result.stderr.decode()) ################################################################### # # Trim primers from output # cutadapt --cores=8 --no-indels 
--error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only # --front 'WACTAATCAATTWCCAAATCCTCC;min_overlap=24...GTACCAATATCYTTGTGATTAGTGGA;min_overlap=26' # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_trimmed_000.fasta # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta # ################################################################### if generic_dna: # Biopython <1.78 primer_fwd_rc = str( Seq(primer_fwd, generic_dna).reverse_complement()) else: # Biopython =>1.78 primer_fwd_rc = str(Seq(primer_fwd).reverse_complement()) in_fasta_path = out_rc_fasta_path out_rc_fasta_basename = os.path.basename(in_fasta_path).replace( '_rc_sorted_%03d.fasta' % i, '_rc_sorted_trimmed_%03d.fasta' % i) out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename) cmd_cutadapt_primer_dic = { 'primer_fwd': primer_rev, 'primer_fwd_len': len(primer_rev), 'primer_rev_rc': primer_fwd_rc, 'primer_rev_rc_len': len(primer_fwd_rc), 'in_fasta_path': in_fasta_path, 'out_fasta': out_rc_fasta_path, 'error_rate': cutadapt_error_rate, 'read_min_length': cutadapt_minimum_length, 'read_max_length': cutadapt_maximum_length, 'num_threads': num_threads, } cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \ '--minimum-length {read_min_length} ' \ '--maximum-length {read_max_length} --trimmed-only ' \ '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \ '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic) Logger.instance().debug( "Running: {}".format(cmd_cutadapt_primer_str)) if sys.platform.startswith("win"): args = cmd_cutadapt_primer_str else: args = shlex.split(cmd_cutadapt_primer_str) run_result = subprocess.run(args=args, capture_output=True) Logger.instance().info(run_result.stdout.decode()) Logger.instance().info(run_result.stderr.decode()) ################################################################### # # Reverse complement back rc fasta and pool # ################################################################### out_final_fasta_basename = os.path.basename( in_raw_fasta_path).replace('.fasta', '_%03d.fasta' % i) out_final_fasta_path = os.path.join(sorteddir, out_final_fasta_basename) shutil.copy(out_fasta_path, out_final_fasta_path) Logger.instance().debug("Pooling fwd and rc reads...") with open(out_final_fasta_path, 'a') as fout: with open(out_rc_fasta_path, 'r') as fin: for line in fin: if not line.startswith('>'): if generic_dna: # Biopython <1.78 fout.write("%s\n" % str( Seq(line.strip(), generic_dna).reverse_complement())) else: # Biopython =>1.78 fout.write("%s\n" % str( Seq(line.strip()).reverse_complement())) else: fout.write(line) fasta_info_df_i = fasta_info_df_i[[ 'run', 'marker', 'sample', 'replicate' ]] fasta_info_df_i['sortedfasta'] = out_final_fasta_basename sorted_read_info_df = pandas.concat( [sorted_read_info_df, fasta_info_df_i], axis=0) fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv') sorted_read_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t", header=True, index=False)
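# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of VTAM): the Biopython version fallback used repeatedly above.
# Bio.Seq.Seq.reverse_complement() dropped the 'generic_dna' alphabet argument in Biopython
# 1.78, so older code imports generic_dna when available and otherwise calls Seq() without it.
# --------------------------------------------------------------------------------------------
def _sketch_reverse_complement(sequence):
    from Bio.Seq import Seq
    try:
        from Bio.Alphabet import generic_dna  # Biopython < 1.78
    except ImportError:
        generic_dna = None                    # Biopython >= 1.78

    if generic_dna:
        return str(Seq(sequence, generic_dna).reverse_complement())
    return str(Seq(sequence).reverse_complement())

# Example: _sketch_reverse_complement("tcgatcacgatgt") returns "acatcgtgatcga",
# the reverse-complemented forward tag used in the cutadapt commands above.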
def __init__(self, sequence_list, taxonomy, blast_db_dir, blast_db_name, num_threads, params):
    """
    Parameters
    ----------
    sequence_list : list
        List of variant sequences to be assigned to taxa
    taxonomy :
        Taxonomy object exposing the taxonomy dataframe (df), the old/merged
        tax_id dataframe (old_tax_df) and get_several_tax_id_lineages()
    blast_db_dir : str
        Directory with the BLAST database files
    blast_db_name : str
        Name of the BLAST database
    num_threads : int
        Number of threads passed to BLAST
    params : str
        Path to the optional params.yml file
    """

    self.old_tax_id_df = taxonomy.old_tax_df
    self.taxonomy_df = taxonomy.df
    self.blast_db_dir = blast_db_dir
    self.this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                      os.path.basename(__file__))
    pathlib.Path(self.this_temp_dir).mkdir(exist_ok=True)
    self.num_threads = num_threads

    #######################################################################
    #
    # Parameters
    #
    #######################################################################

    params_dic = FileParams(params).get_params_dic()
    qcov_hsp_perc = params_dic['qcov_hsp_perc']

    #######################################################################
    #
    # 2 Create FASTA file with Variants
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Create SortedReadFile from Variants".format(
            __file__, inspect.currentframe().f_lineno))
    variant_fasta = os.path.join(self.this_temp_dir, 'variant.fasta')
    with open(variant_fasta, 'w') as fout:
        for seq in sequence_list:
            fout.write(">{}\n{}\n".format(seq, seq))

    #######################################################################
    #
    # 3 Run local blast
    #
    #######################################################################

    runner_blast = RunnerBlast(variant_fasta, blast_db_dir, blast_db_name,
                               num_threads, qcov_hsp_perc)
    # run blast
    blast_output_tsv = runner_blast.run_local_blast()
    # process blast results
    blast_output_df = RunnerBlast.process_blast_result(blast_output_tsv)

    #######################################################################
    #
    # Compute tax lineages for Blast target tax ids
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Open taxonomy.tsv DB".format(
            __file__, inspect.currentframe().f_lineno))
    blast_output_df.target_tax_id = pandas.to_numeric(
        blast_output_df.target_tax_id)
    #
    Logger.instance().debug(
        "file: {}; line: {}; Annotate each target_tax_id with its lineage as columns in wide format"
        .format(__file__, inspect.currentframe().f_lineno))
    tax_id_list = blast_output_df.target_tax_id.unique().tolist()
    tax_id_to_lineage_df = taxonomy.get_several_tax_id_lineages(tax_id_list)

    #######################################################################
    #
    # Merge tax lineages and the blast result
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Merge blast result including tax_id with their lineages"
        .format(__file__, inspect.currentframe().f_lineno))
    # Merge local blast output with tax_id_to_lineage_df
    # variant_identity_lineage_df = blast_output_df.merge(
    #     tax_id_to_lineage_df, left_on='target_tax_id', right_on='tax_id')
    variantid_identity_lineage_df = blast_output_df.merge(
        tax_id_to_lineage_df, left_on='target_tax_id', right_index=True)
    # variant_identity_lineage_df.drop('tax_id', axis=1, inplace=True)
    """(Pdb) variant_identity_lineage_df.columns
    Index(['variant_id', 'target_id', 'identity', 'evalue', 'coverage',
           'target_tax_id', 'no rank', 'species', 'genus', 'family', 'order',
           'class', 'subphylum', 'phylum', 'subkingdom', 'kingdom', 'superkingdom',
           'superfamily', 'infraorder', 'suborder', 'infraclass', 'subclass',
           'tribe', 'subfamily', 'cohort', 'subgenus', 'subspecies', 'parvorder',
           'superorder', 'subcohort', 'superclass', 'species group', 'subtribe',
           'section', 'varietas', 'species subgroup'],
          dtype='object')"""

    #######################################################################
    #
    # several_variants_to_ltg
    # this function returns a data frame containing the Ltg rank and Ltg Tax_id for each variant
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Main loop over variant and identity to "
        "compute the whole set of ltg_tax_id and ltg_rank for each variant_id "
        "to a dataframe".format(__file__, inspect.currentframe().f_lineno))
    runner_ltg_selection = RunnerLTGselection(
        variant_identity_lineage_df=variantid_identity_lineage_df,
        taxonomy_df=self.taxonomy_df,
        params=params)
    self.ltg_df = runner_ltg_selection.several_variants_to_ltg()
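# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of VTAM): why the merge above uses right_index=True. The
# lineage table returned by get_several_tax_id_lineages() is assumed to be indexed by tax_id,
# so BLAST hits are joined on 'target_tax_id' against that index. Toy data; pandas only.
# --------------------------------------------------------------------------------------------
def _sketch_merge_blast_with_lineages():
    import pandas

    blast_output_df = pandas.DataFrame({
        'variant_id': [1, 1, 2],
        'target_tax_id': [7227, 7227, 9606],
        'identity': [99.1, 98.7, 100.0],
    })
    # Lineage table in wide format, one row per tax_id, with tax_id as the index
    lineage_df = pandas.DataFrame(
        {'species': ['Drosophila melanogaster', 'Homo sapiens'],
         'family': ['Drosophilidae', 'Hominidae']},
        index=[7227, 9606])
    # Each BLAST hit gains the lineage columns of its target tax_id
    return blast_output_df.merge(lineage_df, left_on='target_tax_id', right_index=True)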
def main(fastqinfo, fastqdir, fastainfo, fastadir, params=None,
         num_threads=multiprocessing.cpu_count()):

    ############################################################################################
    #
    # params.yml parameters
    #
    ############################################################################################

    params_dic = FileParams(params).get_params_dic()

    ############################################################################################
    #
    # Read fastq information into stats_df
    #
    ############################################################################################

    fastqinfo_df = FileSampleInformation(fastqinfo).read_tsv_into_df()

    pathlib.Path(os.path.dirname(fastainfo)).mkdir(parents=True, exist_ok=True)
    pathlib.Path(fastadir).mkdir(parents=True, exist_ok=True)

    fastainfo_df = pandas.DataFrame()

    ############################################################################################
    #
    # Loop over fastq pairs to merge
    #
    ############################################################################################

    # File with analysis stats data
    stats_df = pandas.DataFrame({'FastqFwd': [], 'FastqRev': [], 'NbReadsFwd': [],
                                 'NbReadsRev': [], 'FastaMerged': [], 'NbMergedReads': []})

    for fastqfwd, fastqrev in fastqinfo_df[['fastqfwd', 'fastqrev']].drop_duplicates().values:

        fastq_info_df_i = fastqinfo_df.loc[(fastqinfo_df.fastqfwd == fastqfwd)
                                           & (fastqinfo_df.fastqrev == fastqrev)]

        fastq_fw_abspath = os.path.join(fastqdir, fastqfwd)
        # A FASTQ record spans four lines, so count lines (iterate the file handle), not bytes
        with open(fastq_fw_abspath, 'rb') as fin:
            fastq_fw_linecount = int(sum(1 for i in fin) / 4)
        fastq_rv_abspath = os.path.join(fastqdir, fastqrev)
        with open(fastq_rv_abspath, 'rb') as fin:
            fastq_rv_linecount = int(sum(1 for i in fin) / 4)

        Logger.instance().debug(
            "Analysing FASTQ files: {} and {}".format(fastqfwd, fastqrev))

        try:
            pathlib.Path(fastq_fw_abspath).resolve(strict=True)
        except FileNotFoundError:
            Logger.instance().error(
                VTAMexception(
                    "VTAMexception: This FASTQ file was not found: {}.".format(fastq_fw_abspath)))
            sys.exit(1)
        try:
            pathlib.Path(fastq_rv_abspath).resolve(strict=True)
        except FileNotFoundError:
            Logger.instance().error(
                VTAMexception(
                    "VTAMexception: This FASTQ file was not found: {}.".format(fastq_rv_abspath)))
            sys.exit(1)

        fasta_merged_basename = os.path.basename(
            fastq_fw_abspath).replace('.fastq', '.fasta')
        out_fasta_path = os.path.join(fastadir, fasta_merged_basename)

        ########################################################################################
        #
        # Run vsearch merge
        #
        ########################################################################################

        vsearch_args_dic = {}
        vsearch_args_dic['fastq_ascii'] = params_dic['fastq_ascii']
        vsearch_args_dic['fastq_maxee'] = params_dic['fastq_maxee']
        vsearch_args_dic['fastq_maxmergelen'] = params_dic['fastq_maxmergelen']
        vsearch_args_dic['fastq_maxns'] = params_dic['fastq_maxns']
        vsearch_args_dic['fastq_minlen'] = params_dic['fastq_minlen']
        vsearch_args_dic['fastq_minmergelen'] = params_dic['fastq_minmergelen']
        vsearch_args_dic['fastq_minovlen'] = params_dic['fastq_minovlen']
        vsearch_args_dic['fastq_truncqual'] = params_dic['fastq_truncqual']
        vsearch_args_dic['fastq_mergepairs'] = fastq_fw_abspath
        vsearch_args_dic['reverse'] = fastq_rv_abspath
        vsearch_args_dic['fastaout'] = out_fasta_path
        vsearch_args_dic['threads'] = num_threads

        vsearch_cluster = RunnerVSearch(parameters=vsearch_args_dic)
        vsearch_cluster.run()

        fastq_info_df_i = fastq_info_df_i[['run', 'marker', 'sample', 'replicate',
                                           'tagfwd', 'primerfwd', 'tagrev', 'primerrev']]
        fastq_info_df_i['mergedfasta'] = fasta_merged_basename
        fastainfo_df = pandas.concat([fastainfo_df, fastq_info_df_i], axis=0)

        with open(out_fasta_path, 'rb') as fin:
            fasta_merged_linecount = int(sum(1 for i in fin) / 4)

        ########################################################################################
        #
        # Summary file
        #
        ########################################################################################

        # Paths go to the Fastq* columns, read counts to the NbReads* columns
        stats_df = pandas.concat([stats_df, pandas.DataFrame({
            'FastqFwd': [fastq_fw_abspath],
            'FastqRev': [fastq_rv_abspath],
            'NbReadsFwd': [fastq_fw_linecount],
            'NbReadsRev': [fastq_rv_linecount],
            'FastaMerged': [out_fasta_path],
            'NbMergedReads': [fasta_merged_linecount]})])

    for mergedfasta in fastainfo_df[['mergedfasta']].drop_duplicates().values:
        mergedfasta = mergedfasta[0]

        if mergedfasta.endswith('.bz2') or mergedfasta.endswith('.gz'):
            fasta_merged_abspath = os.path.join(fastadir, mergedfasta)
            mergedfasta_compressor = FileCompression(fasta_merged_abspath)
            if mergedfasta.endswith('.gz'):
                mergedfasta_c = mergedfasta_compressor.pigz_compression()
                if mergedfasta_c is None:
                    mergedfasta_c = mergedfasta_compressor.gzip_compression()
            elif mergedfasta.endswith('.bz2'):
                mergedfasta_c = mergedfasta_compressor.bz2_compression()
            mergedfasta_compressor.delete_file()

            _, relPath = os.path.split(mergedfasta_c)
            fastainfo_df.loc[fastainfo_df['mergedfasta'] == mergedfasta, 'mergedfasta'] = relPath
        else:
            fastq_info_df_i['mergedfasta'] = fasta_merged_basename

    fastainfo_df.to_csv(fastainfo, sep="\t", header=True, index=False)
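# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of VTAM): the read counting used above. A FASTQ record is four
# lines (header, sequence, '+', qualities), so the read count is the line count divided by 4.
# The file path below is hypothetical.
# --------------------------------------------------------------------------------------------
def _sketch_count_fastq_reads(fastq_path):
    with open(fastq_path, 'rb') as fin:
        return sum(1 for _ in fin) // 4

# Example: an uncompressed FASTQ with 4000 lines yields 1000 reads.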
def main(fastainfo, fastadir, sorteddir, params=None, num_threads=multiprocessing.cpu_count(), no_reverse=False, tag_to_end=False, primer_to_end=False): Logger.instance().info(f"OPTIONS:\n no_reverse: {not no_reverse} \n tag_to_end {not tag_to_end} \n primer_to_end {not primer_to_end}") if sys.platform.startswith('win'): num_threads = 1 ############################################################################################ # # params.yml parameters # ############################################################################################ params_dic = FileParams(params).get_params_dic() cutadapt_error_rate = params_dic['cutadapt_error_rate'] cutadapt_minimum_length = params_dic['cutadapt_minimum_length'] cutadapt_maximum_length = params_dic['cutadapt_maximum_length'] ############################################################################################ # # Loop over tag and primer pairs to demultiplex and trim reads # ############################################################################################ merged_fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df() pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True) tempdir = PathManager.instance().get_tempdir() merged_fasta_list = [] results_list = [] sample_info = {} # make sure every file is analysed once. for i in range(merged_fastainfo_df.shape[0]): if merged_fastainfo_df.iloc[i].mergedfasta not in merged_fasta_list: merged_fasta_list.append(merged_fastainfo_df.iloc[i].mergedfasta) for mergedfasta in merged_fasta_list: inputFiles = FilesInputCutadapt(fastainfo, mergedfasta, no_reverse, tag_to_end) tagFile_path = inputFiles.tags_file() info = inputFiles.get_df_info() for key in info.keys(): if key in sample_info.keys(): sample_info[key] = sample_info[key] + info[key] else: sample_info[key] = info[key] Logger.instance().debug("Analysing FASTA file: {}".format(mergedfasta)) in_raw_fasta_path = os.path.join(fastadir, mergedfasta) ######################################################################################## # # cutadapt --cores=0 -e 0 --no-indels --trimmed-only -g tagFile:$tagfile # --overlap length -o "tagtrimmed.{name}.fasta" in_raw_fasta_path # ######################################################################################## base = os.path.basename(in_raw_fasta_path) base, base_suffix = base.split('.', 1) out_fasta_path = os.path.join(tempdir, "sorted") cmd_cutadapt_tag_dic = { 'in_fasta_path': in_raw_fasta_path, 'out_fasta': out_fasta_path, 'num_threads': num_threads, 'tagFile': tagFile_path, 'base_suffix': base_suffix, } cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \ '-g file:{tagFile} --output {out_fasta}_{{name}}.{base_suffix} {in_fasta_path}' \ .format(**cmd_cutadapt_tag_dic) Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str)) if sys.platform.startswith("win"): args = cmd_cutadapt_tag_str else: args = shlex.split(cmd_cutadapt_tag_str) run_result = subprocess.run(args=args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) Logger.instance().info(run_result.stdout.decode()) inputFiles.remove_tags_file() ######################################################################################## # # Trim primers from output # cutadapt --quiet --cores=0 -e trim_error --no-indels --trimmed-only # --minimum-length minimum_length --maximum-length maximum_length # --output input_path + {name} + suffix outputfile # ######################################################################################## primers = 
inputFiles.primers() try: tags_samples = inputFiles.get_sample_names() except Exception as e: Logger.instance().error(e) return for primer in primers: marker, primerfwd, primerrev, lenprimerfwd, lenprimerrev = primer for tag_sample in tags_samples: name, run, marker2, sample, replicate, _, _ = tag_sample if marker not in marker2: continue in_fasta_path = out_fasta_path + "_" + name + "." + base_suffix baseMerge = mergedfasta.split(".")[0] outname = run + "_" + marker + "_" + sample + "_" + replicate + "_" + baseMerge + "_trimmed" if name.endswith("_reversed"): outname = outname + "_reversed" out_fasta_path_new = os.path.join(tempdir, outname + "." + base_suffix) results_list.append(out_fasta_path_new) if not "_reversed" in name: if generic_dna: # Biopython <1.78 primerRev = str(Seq(primerrev, generic_dna).reverse_complement()) else: # Biopython =>1.78 primerRev = str(Seq(primerrev).reverse_complement()) primerFwd = primerfwd lenPrimerFwd = lenprimerfwd lenPrimerRev = lenprimerrev else: if generic_dna: # Biopython <1.78 primerRev = str(Seq(primerfwd, generic_dna).reverse_complement()) else: # Biopython =>1.78 primerRev = str(Seq(primerfwd).reverse_complement()) primerFwd = primerrev lenPrimerFwd = lenprimerrev lenPrimerRev = lenprimerfwd cmd_cutadapt_primer_dic = { 'in_fasta_path': in_fasta_path, 'out_fasta': out_fasta_path_new, 'error_rate': cutadapt_error_rate, 'num_threads': num_threads, 'primerFwd': primerFwd, 'primerRev': primerRev, 'lenPrimerFwd': lenPrimerFwd, 'lenPrimerRev': lenPrimerRev, 'read_min_length': cutadapt_minimum_length, 'read_max_length': cutadapt_maximum_length, } if not primer_to_end: #works if the command is selected cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \ '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \ '--trimmed-only -g "^{primerFwd}...{primerRev}$" --output {out_fasta} {in_fasta_path}'\ .format(**cmd_cutadapt_primer_dic) else: cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \ '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \ '--trimmed-only -g "{primerFwd};min_overlap={lenPrimerFwd}...{primerRev};min_overlap={lenPrimerRev}" '\ '--output {out_fasta} {in_fasta_path}'\ .format(**cmd_cutadapt_primer_dic) Logger.instance().debug("Running: {}".format(cmd_cutadapt_primer_str)) if sys.platform.startswith("win"): args = cmd_cutadapt_primer_str else: args = shlex.split(cmd_cutadapt_primer_str) run_result = subprocess.run(args=args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) Logger.instance().info(run_result.stdout.decode()) ################################################################### # # Reverse complement back rc fasta and pool # ################################################################### for file in results_list: if "_trimmed" in file: out_final_fasta_path = os.path.join(sorteddir, os.path.split(file)[-1]) in_fasta_path = os.path.join(tempdir, file) if out_final_fasta_path.endswith(".gz"): _open = partial(gzip.open) elif out_final_fasta_path.endswith(".bz2"): _open = partial(bz2.open) else: _open = open if in_fasta_path.endswith(".gz"): _open2 = partial(gzip.open) elif in_fasta_path.endswith(".bz2"): _open2 = partial(bz2.open) else: _open2 = open if "_reversed" in file: Logger.instance().debug("Pooling fwd and rc reads...") out_final_fasta_path = out_final_fasta_path.replace("_reversed", "") with _open(out_final_fasta_path, 'at') as fout: with _open2(in_fasta_path, 'rt') as fin: for line in 
fin.readlines(): if not line.startswith('>'): if generic_dna: # Biopython <1.78 fout.write("%s\n" % str( Seq(line.strip(), generic_dna).reverse_complement())) else: # Biopython =>1.78 fout.write("%s\n" % str( Seq(line.strip()).reverse_complement())) else: fout.write(line) else: with _open(out_final_fasta_path, 'at') as fout: with _open2(in_fasta_path, 'rt') as fin: for line in fin.readlines(): fout.write(line) results_list = [os.path.split(result)[-1] for result in results_list if "_reversed" not in result] del sample_info['mergedfasta'] del sample_info['primerrev'] del sample_info['primerfwd'] del sample_info['tagrev'] del sample_info['tagfwd'] sample_info['sortedfasta'] = results_list sample_info_df = pandas.DataFrame(sample_info) fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv') sample_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t", header=True, index=False)
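# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of VTAM): the extension-based choice of open function used in
# the pooling step above, so .gz, .bz2 and plain FASTA files are read through one code path.
# --------------------------------------------------------------------------------------------
def _sketch_open_by_extension(path, mode='rt'):
    import bz2
    import gzip
    from functools import partial

    if path.endswith('.gz'):
        _open = partial(gzip.open)
    elif path.endswith('.bz2'):
        _open = partial(bz2.open)
    else:
        _open = open
    return _open(path, mode)

# Usage: with _sketch_open_by_extension('reads.fasta.gz') as fin: ...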
class ArgParser: ############################################################################################ # # Specific parsers # ############################################################################################ parser_params = argparse.ArgumentParser(add_help=False) parser_params.add_argument('--params', action='store', default=None, help="YML file with parameter values", required=False, type=lambda x: FileParams(params_path=x). argparse_checker_params_file()) parser_log = argparse.ArgumentParser(add_help=False) parser_log.add_argument('--log', dest='log', action='store', help="write log to LOG file.", required=False) parser_threads = argparse.ArgumentParser(add_help=False) parser_threads.add_argument('--threads', dest='threads', action='store', help="number of threads", required=False, default=multiprocessing.cpu_count()) parser_verbosity = argparse.ArgumentParser(add_help=False) parser_verbosity.add_argument('-v', dest='log_verbosity', action='count', default=0, required=False, help="set verbosity level -v or -vv") parser_wopmars_db = argparse.ArgumentParser(add_help=False) parser_wopmars_db.add_argument('--db', dest='db', action='store', default='db.sqlite', required=False, help="database file in SQLITE format") parser_wopmars_dryrun = argparse.ArgumentParser(add_help=False) parser_wopmars_dryrun.add_argument( '--dry-run', '-n', dest='dryrun', action='store_true', required=False, help="displays only command out without running it") parser_wopmars_forceall = argparse.ArgumentParser(add_help=False) parser_wopmars_forceall.add_argument('-F', '--forceall', dest='forceall', action='store_true', help="force rerun all rules", required=False) parser_vtam_main = None @classmethod def get_main_arg_parser(cls): """ :return: """ ############################################################################################ # # Top-level parser # ############################################################################################ # config = RawConfigParser() # config.read(os.path.join(PathManager.get_package_path(), 'setup.cfg')) # version = config.get('metadata', 'version') parser_vtam_main = argparse.ArgumentParser( prog='vtam', description= '%(prog)s {} - VTAM - Validation and Taxonomic Assignation of Metabarcoding Data' .format(vtam.__version__)) parser_vtam_main.add_argument('--version', action='version', version='%(prog)s {}'.format( vtam.__version__)) subparsers = parser_vtam_main.add_subparsers(title='VTAM sub-commands') ############################################################################################ # # create the parsers # ############################################################################################ cls.add_parser_example(subparsers=subparsers) cls.add_parser_merge(subparsers=subparsers) cls.add_parser_random_seq(subparsers=subparsers) cls.add_parser_sortreads(subparsers=subparsers) cls.add_parser_filter(subparsers=subparsers) cls.add_parser_optimize(subparsers=subparsers) cls.add_parser_makeKnownOccurrences(subparsers=subparsers) cls.add_parser_pool(subparsers=subparsers) cls.add_parser_taxassign(subparsers=subparsers) cls.add_parser_taxonomy(subparsers=subparsers) cls.add_parser_coiblastdb(subparsers=subparsers) return parser_vtam_main @classmethod def add_parser_example(cls, subparsers): parser_vtam_merge = subparsers.add_parser( 'example', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity ], help="generates data for quick start") parser_vtam_merge.add_argument('--outdir', action='store', 
help="directory for quick start data", required=False, default='example', type=lambda x: pathlib.Path(x).mkdir( exist_ok=True, parents=True) or x) parser_vtam_merge.set_defaults(command='example') @classmethod def add_parser_merge(cls, subparsers): parser_vtam_merge = subparsers.add_parser( 'merge', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity ], help="merges paired-end reads") parser_vtam_merge.add_argument( '--fastqinfo', action='store', help="input TSV file with paired FASTQ file information", required=True, type=lambda x: FileSampleInformation(x).check_args( header=header_paired_fastq)) parser_vtam_merge.add_argument( '--fastainfo', action='store', help="output TSV file with merged FASTA file information", required=True) parser_vtam_merge.add_argument( '--fastqdir', action='store', help="input directory with paired FASTQ files", required=True, type=ArgParserChecker.check_dir_exists_and_is_nonempty) parser_vtam_merge.add_argument( '--fastadir', action='store', help="output directory with merged FASTA files", required=True) # This attribute will trigger the good command parser_vtam_merge.set_defaults(command='merge') @classmethod def add_parser_random_seq(cls, subparsers): parser_vtam_random_seq = subparsers.add_parser( 'random_seq', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity ], help= "make a folder with sample files containing 'size' number of sequences randomly selected from the files in input folder" ) parser_vtam_random_seq.add_argument( '--fastadir', action='store', help="input directory with FASTA files", required=True, type=ArgParserChecker.check_dir_exists_and_is_nonempty) parser_vtam_random_seq.add_argument( '--random_seqdir', action='store', help= "output directory with randomly selected sequences in FASTA format", required=True) parser_vtam_random_seq.add_argument( '--fastainfo', action='store', help="input TSV file with FASTA file information", required=True, type=lambda x: FileSampleInformation(x).check_args( header=header_merged_fasta)) parser_vtam_random_seq.add_argument( '--random_seqinfo', action='store', help="output TSV file with output FASTA file information", required=True) parser_vtam_random_seq.add_argument( '--samplesize', action='store', help="number of sequences to be selected from the input files", type=int, required=True) parser_vtam_random_seq.set_defaults(command='random_seq') @classmethod def add_parser_sortreads(cls, subparsers): parser_vtam_sortreads = subparsers.add_parser( 'sortreads', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity ], help= "sorts (Trims and demultiplexes) reads to biological samples and replicates according to the presence of sequence tags and primers" ) parser_vtam_sortreads.add_argument( '--fastainfo', action='store', help="input TSV file with FASTA file information", required=True, type=lambda x: FileSampleInformation(x).check_args( header=header_merged_fasta)) parser_vtam_sortreads.add_argument( '--fastadir', action='store', help="input directory with FASTA files", required=True, type=ArgParserChecker.check_dir_exists_and_is_nonempty) parser_vtam_sortreads.add_argument( '--sorteddir', action='store', help= "output directory with sorted reads (Trimmed and demultiplexed) in FASTA files and TSV file with corresponnding FASTA file information ('SORTEDDIR/sortedinfo.tsv')", default="out", required=True) # This attribute will trigger the good command 
parser_vtam_sortreads.add_argument( "--no_reverse", action="store_false", help="don't check reverse sequences", required=False) parser_vtam_sortreads.add_argument( "--tag_to_end", action="store_false", help="look for tags only at the edges of the sequence", required=False) parser_vtam_sortreads.add_argument( "--primer_to_end", action="store_false", help="look for primers only at the edges of the sequence", required=False) parser_vtam_sortreads.set_defaults(command='sortreads') @classmethod def add_parser_filter(cls, subparsers): parser_vtam_filter = subparsers.add_parser( 'filter', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity, cls.parser_wopmars_db, cls.parser_wopmars_dryrun, cls.parser_wopmars_forceall ], help= "filters out sequence artifacts and creates an amplicon sequence variant (ASV) table." ) parser_vtam_filter.add_argument( '--sortedinfo', action='store', help= "input TSV file with information about FASTA files containing sorted reads", required=True, type=lambda x: FileSampleInformation(x).check_args( header=header_sortedread_fasta)) parser_vtam_filter.add_argument( '--sorteddir', action='store', help= "input directory with sorted (Trimmed and demultiplexed) FASTA files", required=True, type=ArgParserChecker.check_dir_exists_and_is_nonempty) parser_vtam_filter.add_argument( '--asvtable', action='store', help= "output TSV file for the amplicon sequence variants (ASV) table", required=True) parser_vtam_filter.add_argument( '--cutoff_specific', dest='cutoff_specific', default=None, action='store', required=False, help= "TSV file with variant (col1: variant; col2: cutoff) or variant-replicate " "(col1: variant; col2: replicate; col3: cutoff)specific cutoffs", type=lambda x: FileCutoffSpecific(x).argparse_checker()) parser_vtam_filter.add_argument( '--lfn_variant_replicate', action='store_true', help= "if set, VTAM will run the algorithm for the low frequency noise over variant and replicates", required=False, default=False) parser_vtam_filter.add_argument( '--known_occurrences', action='store', help="TSV file with expected (keep) occurrences", required=False, type=lambda x: FileKnownOccurrences( x).argparse_checker_known_occurrences()) parser_vtam_filter.add_argument( '-U', '--until', dest='until', action='store', default=None, help= """execute '%(prog)s' UNTIL one rule, where the rule order looks like: 1. SampleInformation, 2. VariantReadCount, 3. FilterLFN, 4. FilterMinReplicateNumber, 5. FilterPCRerror, 6. FilterChimera, 7. FilterMinReplicateNumber2, 8. FilterRenkonen, 9. FilterMinReplicateNumber3, 10. FilterIndel, 11. FilterCodonStop, 12. ReadCountAverageOverReplicates, 13. MakeAsvTable""", required=False) parser_vtam_filter.add_argument( '-S', '--since', dest='since', action='store', default=None, help= """execute '%(prog)s' SINCE one rule, where the rule order looks like: 1. SampleInformation, 2. VariantReadCount, 3. FilterLFN, 4. FilterMinReplicateNumber, 5. FilterPCRerror, 6. FilterChimera, 7. FilterMinReplicateNumber2, 8. FilterRenkonen, 9. FilterMinReplicateNumber3, 10. FilterIndel, 11. FilterCodonStop, 12. ReadCountAverageOverReplicates, 13. 
MakeAsvTable""", required=False) # This attribute will trigger the good command parser_vtam_filter.set_defaults(command='filter') @classmethod def add_parser_taxassign(cls, subparsers): parser_vtam_taxassign = subparsers.add_parser( 'taxassign', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity, cls.parser_wopmars_db ], help="assigns amplicon sequence variants (ASVs) to taxonomic groups" ) parser_vtam_taxassign.add_argument( '--asvtable', action='store', help= "input TSV file with variant sequences and sequence header in the last column", required=True, type=lambda x: ArgParserChecker.check_taxassign_variants(x)) parser_vtam_taxassign.add_argument( '--output', action='store', help="output TSV file where the assigned taxa have been added", required=True) parser_vtam_taxassign.add_argument( '--mode', dest='mode', default="unassigned", action='store', required=False, choices=['unassigned', 'reset'], help= "the default 'unassigned' mode will only assign 'unassigned' variants" "The alternative 'reset' mode will erase the TaxAssign table and reassigned all " "input variants") parser_vtam_taxassign.add_argument( '--blastdbdir', action='store', help= "input directory with (Full or custom one) Blast database files", required=True, type=ArgParserChecker.check_dir_exists_and_is_nonempty) parser_vtam_taxassign.add_argument( '--blastdbname', action='store', help= "input Blast database name, which corresponds to the file name without suffix of the Blast database files", required=True) parser_vtam_taxassign.add_argument( '--taxonomy', dest='taxonomy', action='store', help="""input TSV file with taxonomy information. This file is created with the 'taxonomy' sub-command. For instance 'vtam taxonomy -o taxonomy.tsv' creates the 'taxonomy.tsv' file in the current directory""", required=True, type=ArgParserChecker.check_taxassign_taxonomy) # This attribute will trigger the good command parser_vtam_taxassign.set_defaults(command='taxassign') @classmethod def add_parser_optimize(cls, subparsers): parser_vtam_optimize = subparsers.add_parser( 'optimize', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity, cls.parser_wopmars_db, cls.parser_wopmars_dryrun, cls.parser_wopmars_forceall ], help="finds out optimal parameters for filtering") parser_vtam_optimize.add_argument( '--sortedinfo', action='store', help= "input TSV file with information about FASTA files containing sorted (trimmed and demultiplexed) reads", required=True, type=lambda x: FileSampleInformation(x).check_args( header=header_sortedread_fasta)) parser_vtam_optimize.add_argument( '--sorteddir', action='store', help= "input directory with sorted (Trimmed and demultiplexed) FASTA files", required=True, type=ArgParserChecker.check_dir_exists_and_is_nonempty) parser_vtam_optimize.add_argument('-o', '--outdir', action='store', help="output directory", default="out", required=True) parser_vtam_optimize.add_argument( '--known_occurrences', action='store', help="TSV file with known variants", required=True, type=lambda x: FileKnownOccurrences( x).argparse_checker_known_occurrences()) parser_vtam_optimize.add_argument( '--lfn_variant_replicate', action='store_true', help= "if set, VTAM will run the algorithm for the low frequency noise over variant and replicates", required=False, default=False) parser_vtam_optimize.add_argument( '-U', '--until', dest='until', action='store', default=None, help= """executes '%(prog)s' UNTIL one rule, where the rules follow 
this order: 1. SampleInformation, 2. VariantReadCount, 3. either OptimizeLFNsampleReplicate or OptimizePCRerror or OptimizeLFNreadCountAndLFNvariant""", required=False) parser_vtam_optimize.add_argument( '-S', '--since', dest='since', action='store', default=None, help= """executes '%(prog)s' SINCE one rule, where the rules follow this order: 1. SampleInformation, 2. VariantReadCount, 3. either OptimizeLFNsampleReplicate or OptimizePCRerror or OptimizeLFNreadCountAndLFNvariant""", required=False) # This attribute will trigger the good command parser_vtam_optimize.set_defaults(command='optimize') @classmethod def add_parser_makeKnownOccurrences(cls, subparsers): parser_vtam_makeKnownOccurrences = subparsers.add_parser( 'make_known_occurrences', add_help=True, parents=[cls.parser_threads, cls.parser_verbosity], help="create a file with know occurrences") parser_vtam_makeKnownOccurrences.add_argument( '--asvtable', action='store', help="input an ASV table file (tsv format)", required=True, ) # type=lambda x: FileSampleInformation(x).check_args( # header=header_paired_fastq)) parser_vtam_makeKnownOccurrences.add_argument( '--sample_types', action='store', help="input a tsv file with the sample types", required=True, ) # type=lambda x: FileSampleInformation(x).check_args( # header=header_paired_fastq)) parser_vtam_makeKnownOccurrences.add_argument( '--mock_composition', action='store', help="input a tsv file with the mock composition", required=True, ) # type=lambda x: FileSampleInformation(x).check_args( # header=header_paired_fastq)) parser_vtam_makeKnownOccurrences.add_argument( '--known_occurrences', action='store', help= "Default: ./known_occurrences.tsv. Output a .tsv file with the known occurences", required=False, default='./known_occurrences.tsv') parser_vtam_makeKnownOccurrences.add_argument( '--missing_occurrences', action='store', help= "Default: ./missing_occurrences.tsv. Output a .tsv file with the missing occurences", required=False, default='./missing_occurrences.tsv') parser_vtam_makeKnownOccurrences.add_argument( '--habitat_proportion', action='store', help="Default: 0.5. Input a threshold for habitat proportion", required=False, default=0.5) # This attribute will trigger the good command parser_vtam_makeKnownOccurrences.set_defaults( command='make_known_occurrences') @classmethod def add_parser_pool(cls, subparsers): parser_vtam_pool_markers = subparsers.add_parser( 'pool', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity ], help= "pools amplicon sequence variants (ASVs) from different but overlapping markers" ) parser_vtam_pool_markers.add_argument('--db', action='store', required=True, help="SQLITE file with DB") from vtam.utils.FileRunMarker import FileRunMarker parser_vtam_pool_markers.add_argument( '--runmarker', action='store', default=None, help=FileRunMarker.help(), required=True, type=lambda x: FileRunMarker(x).check_argument()) parser_vtam_pool_markers.add_argument( '--asvtable', action='store', help= "output TSV file with pooled markers and their occurrences in biological samples", required=True) parser_vtam_pool_markers.add_argument( '--readcounts', action='store_true', help= "Default: False. If False, presence/absence of reads in sample is given." 
"If True, sum of reads over pooled runs et/ou markers is given", required=False, default=False) # This attribute will trigger the good command parser_vtam_pool_markers.set_defaults(command='pool') @classmethod def add_parser_taxonomy(cls, subparsers): parser_vtam_taxonomy = subparsers.add_parser( 'taxonomy', add_help=True, parents=[], help="downloads a TSV file with the NCBI taxonomy information") parser_vtam_taxonomy.add_argument( '-o', '--output', dest='output', action='store', help="default: taxonomy.tsv. Path to TSV taxonomy file", required=False, default=os.path.join(os.getcwd(), 'taxonomy.tsv')) parser_vtam_taxonomy.add_argument( '--precomputed', dest='precomputed', action='store_true', default=False, help="default: False. Downloads precomputed taxonomy database, " "which is likely an older database", required=False) # This attribute will trigger the good command parser_vtam_taxonomy.set_defaults(command='taxonomy') @classmethod def add_parser_coiblastdb(cls, subparsers): parser_vtam_coi_blast_db = subparsers.add_parser( 'coi_blast_db', add_help=True, help= "downloads a precomputed BLAST database for the cytochrome C oxidase subunit I (COI) marker" ) parser_vtam_coi_blast_db.add_argument( '--blastdbdir', dest='blastdbdir', action='store', help= "output directory with custom Blast database files of the cytochrome C oxidase subunit I (COI) marker files", required=False, default='blastdb') parser_vtam_coi_blast_db.add_argument( '--blastdbname', dest='blastdbname', action='store', help= "cytochrome C oxidase subunit I (COI) Blast database name among these current possibilities: coi_blast_db, coi_blast_db_20200420. Other versions if available can be found here: {}" .format(os.path.dirname(coi_blast_db_gz_url1)), required=False, default='coi_blast_db', type=lambda x: CommandBlastCOI( x).argparse_checker_blast_coi_blastdbname(), ) # This attribute will trigger the good command parser_vtam_coi_blast_db.set_defaults(command='coi_blast_db')