def main(cls, db, pooled_marker_tsv, run_marker_tsv, params, readcounts): """ Parameters ---------- db : str path to DB in sqlite format pooled_marker_tsv : str path to output pooled_marker.tsv file run_marker_tsv : str path to input run_marker.tsv file params readcounts: bool Output absence/presence (False) or sum of read counts (True) Returns ------- """ ####################################################################### # # Parameters # ####################################################################### # params_dic = constants.get_params_default_dic() params_dic = FileParams(params).get_params_dic() cluster_identity = params_dic['cluster_identity'] run_marker_file_obj = FileRunMarker(tsv_path=run_marker_tsv) # run_marker_tsv_reader = RunMarkerTSVreader(db=db, run_marker_tsv_path=run_marker_tsv) if not (run_marker_tsv is None): run_marker_df = run_marker_file_obj.read_tsv_into_df() else: run_marker_df = None engine = sqlalchemy.create_engine('sqlite:///{}'.format(db), echo=False) Base = automap_base() Base.prepare(engine, reflect=True) sample_list = run_marker_file_obj.get_sample_ids(engine) sample_list = NameIdConverter(id_name_or_sequence_list=sample_list, engine=engine).to_names(Sample) ############################################################################################ # # Compute all variant_read_count_input_df required for ASV table # ############################################################################################ variant_read_count_df = run_marker_file_obj.get_variant_read_count_df( engine=engine, variant_read_count_like_model=FilterCodonStop) asv_table_runner = RunnerAsvTable( variant_read_count_df=variant_read_count_df, engine=engine, sample_list=sample_list, cluster_identity=cluster_identity) asv_table_df = asv_table_runner.create_asvtable_df() asv_table_df.rename( { 'run': 'run_name', 'marker': 'marker_name', 'variant': 'variant_id' }, axis=1, inplace=True) ############################################################################################ # # Prefix biosample columns with run name for same biosample name in different runs # ############################################################################################ asv_table_2_df = asv_table_df.copy() for run_name_i, run_name in enumerate(asv_table_df.run_name.unique()): asv_table_runi_df = ( asv_table_df.loc[asv_table_df.run_name == run_name]).copy() for biosample in asv_table_runi_df.iloc[:, 5:-4].columns.tolist(): asv_table_runi_df.rename( {biosample: run_name + '-' + biosample}, axis=1, inplace=True) if run_name_i == 0: asv_table_2_df = asv_table_runi_df else: asv_table_2_df = pandas.concat([ asv_table_2_df, pandas.DataFrame(columns=asv_table_runi_df.columns) ]) asv_table_2_df = asv_table_2_df.fillna(0) asv_table_2_df = pandas.concat( [asv_table_2_df, asv_table_runi_df], axis=0, join='outer') del (asv_table_runi_df) ############################################################################################ # # Reorder columns # ############################################################################################ column_list = asv_table_2_df.columns.tolist() column_list.remove("run_name") column_list.insert(0, "run_name") column_list.remove("clusterid") column_list.remove("clustersize") column_list.remove("chimera_borderline") column_list.remove("sequence") column_list = column_list + [ 'clusterid', 'clustersize', 'chimera_borderline', 'sequence' ] column_list.remove("sequence_length") column_list.remove("read_count") column_list.insert(3, "sequence_length") 
column_list.insert(4, "read_count") asv_table_2_df = asv_table_2_df[column_list] ############################################################################################ # # Pool markers # ############################################################################################ pool_marker_runner = CommandPoolRunMarkers(asv_table_df=asv_table_2_df, run_marker_df=run_marker_df, readcounts=readcounts) pooled_marker_df = pool_marker_runner.get_pooled_marker_df() ####################################################################### # # Cluster sequences # ####################################################################### # reset asvtable-based clusterid and clustersize pooled_marker_df.drop(['clusterid', 'clustersize'], axis=1, inplace=True) pooled_marker_df.rename({'variant': 'variant_id'}, axis=1, inplace=True) # prepare pooled_marker_df['read_count'] = pooled_marker_df.iloc[:, 4:-2].sum( axis=1) # prepare seq_clusterer_obj = SequenceClusterer( pooled_marker_df, cluster_identity=cluster_identity) cluster_count_df = seq_clusterer_obj.compute_clusters() pooled_marker_df = pooled_marker_df.merge(cluster_count_df, on='variant_id') pooled_marker_df.drop(['read_count'], axis=1, inplace=True) ############################################################################################ # # Reorder columns # ############################################################################################ column_list = pooled_marker_df.columns.tolist() column_list.remove("pooled_sequences") column_list.remove("sequence") column_list = column_list + ['pooled_sequences', 'sequence'] pooled_marker_df = pooled_marker_df[column_list] # change dtypes for col in pooled_marker_df.columns[4:-4]: pooled_marker_df[col] = pooled_marker_df[col].astype(int) # verify here if the run-sample exists in the sampleinformation database # and drop if not run_biosample_cols = pooled_marker_df.columns[4:-4] # run_biosample_item = run_biosample_cols[0] from sqlalchemy.orm import sessionmaker Session = sessionmaker(bind=engine) session = Session() for run_biosample_item in run_biosample_cols: thisrun, thisbiosample = run_biosample_item.split('-') rowcount = session.query( SampleInformation, Sample, Run).filter(SampleInformation.sample_id == Sample.id).filter( SampleInformation.run_id == Run.id).filter( Run.name == thisrun).filter( Sample.name == thisbiosample).count() if rowcount <= 0: pooled_marker_df.drop([run_biosample_item], axis=1, inplace=True) ####################################################################### # # To tsv # ####################################################################### pooled_marker_df.to_csv(pooled_marker_tsv, sep="\t", index=False)
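# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of VTAM): how the "prefix sample columns with the run name"
# step above behaves on a toy ASV table, so identical sample names from different runs stay
# distinguishable after pooling. Column names and toy values are hypothetical; only pandas is
# assumed.
# --------------------------------------------------------------------------------------------
def _sketch_prefix_samples_with_run_name():
    import pandas

    asv_df = pandas.DataFrame({
        'run_name': ['run1', 'run2'],
        'variant_id': [1, 2],
        'tpos1': [100, 50],  # the same sample name occurs in both runs
    })
    prefixed_df = pandas.DataFrame()
    for run_name in asv_df.run_name.unique():
        run_df = asv_df.loc[asv_df.run_name == run_name].copy()
        run_df.rename(columns={'tpos1': '{}-tpos1'.format(run_name)}, inplace=True)
        prefixed_df = pandas.concat([prefixed_df, run_df], axis=0, join='outer')
    # Samples absent from a given run end up as NaN; the command above fills them with 0
    return prefixed_df.fillna(0)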
def main(fastainfo, fastadir, sorteddir, params=None, num_threads=multiprocessing.cpu_count()): if sys.platform.startswith('win'): num_threads = 1 ############################################################################################ # # params.yml parameters # ############################################################################################ params_dic = FileParams(params).get_params_dic() cutadapt_error_rate = params_dic['cutadapt_error_rate'] cutadapt_minimum_length = params_dic['cutadapt_minimum_length'] cutadapt_maximum_length = params_dic['cutadapt_maximum_length'] ############################################################################################ # # Loop over tag and primer pairs to demultiplex and trim reads # ############################################################################################ merged_fastainfo_df = FileSampleInformation( fastainfo).read_tsv_into_df() pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True) tempdir = PathManager.instance().get_tempdir() sorted_read_info_df = pandas.DataFrame() for i in range(0, merged_fastainfo_df.shape[0]): fasta_info_series = merged_fastainfo_df.iloc[i] tag_fwd = fasta_info_series.tagfwd tag_rev = fasta_info_series.tagrev primer_fwd = fasta_info_series.primerfwd primer_rev = fasta_info_series.primerrev in_fasta_basename = fasta_info_series.mergedfasta Logger.instance().debug( "Analysing FASTA file: {}".format(in_fasta_basename)) fasta_info_df_i = fasta_info_series.to_frame().T in_raw_fasta_path = os.path.join(fastadir, in_fasta_basename) ######################################################################################## # # Cut adapt tag of forward reads # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only # --front 'tcgatcacgatgt;min_overlap=13...gctgtagatcgaca;min_overlap=14' # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta # ######################################################################################## if generic_dna: # Biopython <1.78 tag_rev_rc = str( Seq(tag_rev, generic_dna).reverse_complement()) else: # Biopython =>1.78 tag_rev_rc = str(Seq(tag_rev).reverse_complement()) out_fasta_basename = os.path.basename(in_raw_fasta_path).replace( '.fasta', '_sorted_%03d.fasta' % i) out_fasta_path = os.path.join(tempdir, out_fasta_basename) cmd_cutadapt_tag_dic = { 'tag_fwd': tag_fwd, 'tag_fwd_len': len(tag_fwd), 'tag_rev_rc': tag_rev_rc, 'tag_rev_rc_len': len(tag_rev_rc), 'in_fasta_path': in_raw_fasta_path, 'out_fasta': out_fasta_path, 'num_threads': num_threads, } cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \ '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \ '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic) Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str)) if sys.platform.startswith("win"): args = cmd_cutadapt_tag_str else: args = shlex.split(cmd_cutadapt_tag_str) run_result = subprocess.run(args=args, capture_output=True, check=True) Logger.instance().info(run_result.stdout.decode()) Logger.instance().info(run_result.stderr.decode()) ######################################################################################## # # Trim primers from output # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only # --front 'TCCACTAATCACAARGATATTGGTAC;min_overlap=26...GGAGGATTTGGWAATTGATTAGTW;min_overlap=24' # 
--output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_trimmed_000.fasta # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta # ######################################################################################## if generic_dna: # Biopython <1.78 primer_rev_rc = str( Seq(primer_rev, generic_dna).reverse_complement()) else: # Biopython =>1.78 primer_rev_rc = str(Seq(primer_rev).reverse_complement()) in_fasta_path = out_fasta_path out_fasta_basename = os.path.basename(in_fasta_path).replace( '_sorted_%03d.fasta' % i, '_sorted_trimmed_%03d.fasta' % i) out_fasta_path = os.path.join(tempdir, out_fasta_basename) cmd_cutadapt_primer_dic = { 'primer_fwd': primer_fwd, 'primer_fwd_len': len(primer_fwd), 'primer_rev_rc': primer_rev_rc, 'primer_rev_rc_len': len(primer_rev_rc), 'in_fasta_path': in_fasta_path, 'out_fasta': out_fasta_path, 'error_rate': cutadapt_error_rate, 'read_min_length': cutadapt_minimum_length, 'read_max_length': cutadapt_maximum_length, 'num_threads': num_threads, } cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \ '--minimum-length {read_min_length} ' \ '--maximum-length {read_max_length} --trimmed-only ' \ '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \ '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic) Logger.instance().debug( "Running: {}".format(cmd_cutadapt_primer_str)) if sys.platform.startswith("win"): args = cmd_cutadapt_primer_str else: args = shlex.split(cmd_cutadapt_primer_str) run_result = subprocess.run(args=args, capture_output=True) Logger.instance().info(run_result.stdout.decode()) Logger.instance().info(run_result.stderr.decode()) ######################################################################################## # # Cut adapt tag of reverse-complement reads # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only # --front 'tgtcgatctacagc;min_overlap=14...acatcgtgatcga;min_overlap=13' # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta # ######################################################################################## if generic_dna: # Biopython <1.78 tag_fwd_rc = str( Seq(tag_fwd, generic_dna).reverse_complement()) else: # Biopython =>1.78 tag_fwd_rc = str(Seq(tag_fwd).reverse_complement()) out_rc_fasta_basename = os.path.basename( in_raw_fasta_path).replace('.fasta', '_rc_sorted_%03d.fasta' % i) out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename) cmd_cutadapt_tag_dic = { 'tag_fwd': tag_rev, 'tag_fwd_len': len(tag_rev), 'tag_rev_rc': tag_fwd_rc, 'tag_rev_rc_len': len(tag_fwd_rc), 'in_fasta_path': in_raw_fasta_path, 'out_fasta': out_rc_fasta_path, 'num_threads': num_threads, } cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \ '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \ '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic) Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str)) if sys.platform.startswith("win"): args = cmd_cutadapt_tag_str else: args = shlex.split(cmd_cutadapt_tag_str) run_result = subprocess.run(args=args, capture_output=True) Logger.instance().info(run_result.stdout.decode()) Logger.instance().info(run_result.stderr.decode()) ################################################################### # # Trim primers from output # cutadapt --cores=8 --no-indels 
--error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only # --front 'WACTAATCAATTWCCAAATCCTCC;min_overlap=24...GTACCAATATCYTTGTGATTAGTGGA;min_overlap=26' # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_trimmed_000.fasta # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta # ################################################################### if generic_dna: # Biopython <1.78 primer_fwd_rc = str( Seq(primer_fwd, generic_dna).reverse_complement()) else: # Biopython =>1.78 primer_fwd_rc = str(Seq(primer_fwd).reverse_complement()) in_fasta_path = out_rc_fasta_path out_rc_fasta_basename = os.path.basename(in_fasta_path).replace( '_rc_sorted_%03d.fasta' % i, '_rc_sorted_trimmed_%03d.fasta' % i) out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename) cmd_cutadapt_primer_dic = { 'primer_fwd': primer_rev, 'primer_fwd_len': len(primer_rev), 'primer_rev_rc': primer_fwd_rc, 'primer_rev_rc_len': len(primer_fwd_rc), 'in_fasta_path': in_fasta_path, 'out_fasta': out_rc_fasta_path, 'error_rate': cutadapt_error_rate, 'read_min_length': cutadapt_minimum_length, 'read_max_length': cutadapt_maximum_length, 'num_threads': num_threads, } cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \ '--minimum-length {read_min_length} ' \ '--maximum-length {read_max_length} --trimmed-only ' \ '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \ '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic) Logger.instance().debug( "Running: {}".format(cmd_cutadapt_primer_str)) if sys.platform.startswith("win"): args = cmd_cutadapt_primer_str else: args = shlex.split(cmd_cutadapt_primer_str) run_result = subprocess.run(args=args, capture_output=True) Logger.instance().info(run_result.stdout.decode()) Logger.instance().info(run_result.stderr.decode()) ################################################################### # # Reverse complement back rc fasta and pool # ################################################################### out_final_fasta_basename = os.path.basename( in_raw_fasta_path).replace('.fasta', '_%03d.fasta' % i) out_final_fasta_path = os.path.join(sorteddir, out_final_fasta_basename) shutil.copy(out_fasta_path, out_final_fasta_path) Logger.instance().debug("Pooling fwd and rc reads...") with open(out_final_fasta_path, 'a') as fout: with open(out_rc_fasta_path, 'r') as fin: for line in fin: if not line.startswith('>'): if generic_dna: # Biopython <1.78 fout.write("%s\n" % str( Seq(line.strip(), generic_dna).reverse_complement())) else: # Biopython =>1.78 fout.write("%s\n" % str( Seq(line.strip()).reverse_complement())) else: fout.write(line) fasta_info_df_i = fasta_info_df_i[[ 'run', 'marker', 'sample', 'replicate' ]] fasta_info_df_i['sortedfasta'] = out_final_fasta_basename sorted_read_info_df = pandas.concat( [sorted_read_info_df, fasta_info_df_i], axis=0) fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv') sorted_read_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t", header=True, index=False)
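# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of VTAM): the Biopython version fallback used repeatedly above.
# Bio.Seq.Seq.reverse_complement() dropped the 'generic_dna' alphabet argument in Biopython
# 1.78, so older code imports generic_dna when available and otherwise calls Seq() without it.
# --------------------------------------------------------------------------------------------
def _sketch_reverse_complement(sequence):
    from Bio.Seq import Seq
    try:
        from Bio.Alphabet import generic_dna  # Biopython < 1.78
    except ImportError:
        generic_dna = None                    # Biopython >= 1.78

    if generic_dna:
        return str(Seq(sequence, generic_dna).reverse_complement())
    return str(Seq(sequence).reverse_complement())

# Example: _sketch_reverse_complement("tcgatcacgatgt") returns "acatcgtgatcga",
# the reverse-complemented forward tag used in the cutadapt commands above.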
def __init__(self, sequence_list, taxonomy, blast_db_dir, blast_db_name, num_threads, params):
    """
    Parameters
    ----------
    sequence_list : list
        List of variant sequences to be assigned to taxa
    taxonomy :
        Taxonomy object exposing the taxonomy dataframe (df), the old/merged
        tax_id dataframe (old_tax_df) and get_several_tax_id_lineages()
    blast_db_dir : str
        Directory with the BLAST database files
    blast_db_name : str
        Name of the BLAST database
    num_threads : int
        Number of threads passed to BLAST
    params : str
        Path to the optional params.yml file
    """

    self.old_tax_id_df = taxonomy.old_tax_df
    self.taxonomy_df = taxonomy.df
    self.blast_db_dir = blast_db_dir
    self.this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                      os.path.basename(__file__))
    pathlib.Path(self.this_temp_dir).mkdir(exist_ok=True)
    self.num_threads = num_threads

    #######################################################################
    #
    # Parameters
    #
    #######################################################################

    params_dic = FileParams(params).get_params_dic()
    qcov_hsp_perc = params_dic['qcov_hsp_perc']

    #######################################################################
    #
    # 2 Create FASTA file with Variants
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Create SortedReadFile from Variants".format(
            __file__, inspect.currentframe().f_lineno))
    variant_fasta = os.path.join(self.this_temp_dir, 'variant.fasta')
    with open(variant_fasta, 'w') as fout:
        for seq in sequence_list:
            fout.write(">{}\n{}\n".format(seq, seq))

    #######################################################################
    #
    # 3 Run local blast
    #
    #######################################################################

    runner_blast = RunnerBlast(variant_fasta, blast_db_dir, blast_db_name,
                               num_threads, qcov_hsp_perc)
    # run blast
    blast_output_tsv = runner_blast.run_local_blast()
    # process blast results
    blast_output_df = RunnerBlast.process_blast_result(blast_output_tsv)

    #######################################################################
    #
    # Compute tax lineages for Blast target tax ids
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Open taxonomy.tsv DB".format(
            __file__, inspect.currentframe().f_lineno))
    blast_output_df.target_tax_id = pandas.to_numeric(
        blast_output_df.target_tax_id)
    #
    Logger.instance().debug(
        "file: {}; line: {}; Annotate each target_tax_id with its lineage as columns in wide format"
        .format(__file__, inspect.currentframe().f_lineno))
    tax_id_list = blast_output_df.target_tax_id.unique().tolist()
    tax_id_to_lineage_df = taxonomy.get_several_tax_id_lineages(tax_id_list)

    #######################################################################
    #
    # Merge tax lineages and the blast result
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Merge blast result including tax_id with their lineages"
        .format(__file__, inspect.currentframe().f_lineno))
    # Merge local blast output with tax_id_to_lineage_df
    # variant_identity_lineage_df = blast_output_df.merge(
    #     tax_id_to_lineage_df, left_on='target_tax_id', right_on='tax_id')
    variantid_identity_lineage_df = blast_output_df.merge(
        tax_id_to_lineage_df, left_on='target_tax_id', right_index=True)
    # variant_identity_lineage_df.drop('tax_id', axis=1, inplace=True)
    """(Pdb) variant_identity_lineage_df.columns
    Index(['variant_id', 'target_id', 'identity', 'evalue', 'coverage',
           'target_tax_id', 'no rank', 'species', 'genus', 'family', 'order',
           'class', 'subphylum', 'phylum', 'subkingdom', 'kingdom', 'superkingdom',
           'superfamily', 'infraorder', 'suborder', 'infraclass', 'subclass',
           'tribe', 'subfamily', 'cohort', 'subgenus', 'subspecies', 'parvorder',
           'superorder', 'subcohort', 'superclass', 'species group', 'subtribe',
           'section', 'varietas', 'species subgroup'],
          dtype='object')"""

    #######################################################################
    #
    # several_variants_to_ltg
    # this function returns a data frame containing the Ltg rank and Ltg Tax_id for each variant
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Main loop over variant and identity to "
        "compute the whole set of ltg_tax_id and ltg_rank for each variant_id "
        "to a dataframe".format(__file__, inspect.currentframe().f_lineno))
    runner_ltg_selection = RunnerLTGselection(
        variant_identity_lineage_df=variantid_identity_lineage_df,
        taxonomy_df=self.taxonomy_df,
        params=params)
    self.ltg_df = runner_ltg_selection.several_variants_to_ltg()
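# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of VTAM): why the merge above uses right_index=True. The
# lineage table returned by get_several_tax_id_lineages() is assumed to be indexed by tax_id,
# so BLAST hits are joined on 'target_tax_id' against that index. Toy data; pandas only.
# --------------------------------------------------------------------------------------------
def _sketch_merge_blast_with_lineages():
    import pandas

    blast_output_df = pandas.DataFrame({
        'variant_id': [1, 1, 2],
        'target_tax_id': [7227, 7227, 9606],
        'identity': [99.1, 98.7, 100.0],
    })
    # Lineage table in wide format, one row per tax_id, with tax_id as the index
    lineage_df = pandas.DataFrame(
        {'species': ['Drosophila melanogaster', 'Homo sapiens'],
         'family': ['Drosophilidae', 'Hominidae']},
        index=[7227, 9606])
    # Each BLAST hit gains the lineage columns of its target tax_id
    return blast_output_df.merge(lineage_df, left_on='target_tax_id', right_index=True)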
def main(fastqinfo, fastqdir, fastainfo, fastadir, params=None,
         num_threads=multiprocessing.cpu_count()):

    ############################################################################################
    #
    # params.yml parameters
    #
    ############################################################################################

    params_dic = FileParams(params).get_params_dic()

    ############################################################################################
    #
    # Read fastq information into stats_df
    #
    ############################################################################################

    fastqinfo_df = FileSampleInformation(fastqinfo).read_tsv_into_df()

    pathlib.Path(os.path.dirname(fastainfo)).mkdir(parents=True, exist_ok=True)
    pathlib.Path(fastadir).mkdir(parents=True, exist_ok=True)

    fastainfo_df = pandas.DataFrame()

    ############################################################################################
    #
    # Loop over fastq pairs to merge
    #
    ############################################################################################

    # File with analysis stats data
    stats_df = pandas.DataFrame({'FastqFwd': [], 'FastqRev': [], 'NbReadsFwd': [],
                                 'NbReadsRev': [], 'FastaMerged': [], 'NbMergedReads': []})

    for fastqfwd, fastqrev in fastqinfo_df[['fastqfwd', 'fastqrev']].drop_duplicates().values:

        fastq_info_df_i = fastqinfo_df.loc[(fastqinfo_df.fastqfwd == fastqfwd)
                                           & (fastqinfo_df.fastqrev == fastqrev)]

        fastq_fw_abspath = os.path.join(fastqdir, fastqfwd)
        # A FASTQ record spans four lines, so count lines (iterate the file handle), not bytes
        with open(fastq_fw_abspath, 'rb') as fin:
            fastq_fw_linecount = int(sum(1 for i in fin) / 4)
        fastq_rv_abspath = os.path.join(fastqdir, fastqrev)
        with open(fastq_rv_abspath, 'rb') as fin:
            fastq_rv_linecount = int(sum(1 for i in fin) / 4)

        Logger.instance().debug(
            "Analysing FASTQ files: {} and {}".format(fastqfwd, fastqrev))

        try:
            pathlib.Path(fastq_fw_abspath).resolve(strict=True)
        except FileNotFoundError:
            Logger.instance().error(
                VTAMexception(
                    "VTAMexception: This FASTQ file was not found: {}.".format(fastq_fw_abspath)))
            sys.exit(1)
        try:
            pathlib.Path(fastq_rv_abspath).resolve(strict=True)
        except FileNotFoundError:
            Logger.instance().error(
                VTAMexception(
                    "VTAMexception: This FASTQ file was not found: {}.".format(fastq_rv_abspath)))
            sys.exit(1)

        fasta_merged_basename = os.path.basename(
            fastq_fw_abspath).replace('.fastq', '.fasta')
        out_fasta_path = os.path.join(fastadir, fasta_merged_basename)

        ########################################################################################
        #
        # Run vsearch merge
        #
        ########################################################################################

        vsearch_args_dic = {}
        vsearch_args_dic['fastq_ascii'] = params_dic['fastq_ascii']
        vsearch_args_dic['fastq_maxee'] = params_dic['fastq_maxee']
        vsearch_args_dic['fastq_maxmergelen'] = params_dic['fastq_maxmergelen']
        vsearch_args_dic['fastq_maxns'] = params_dic['fastq_maxns']
        vsearch_args_dic['fastq_minlen'] = params_dic['fastq_minlen']
        vsearch_args_dic['fastq_minmergelen'] = params_dic['fastq_minmergelen']
        vsearch_args_dic['fastq_minovlen'] = params_dic['fastq_minovlen']
        vsearch_args_dic['fastq_truncqual'] = params_dic['fastq_truncqual']
        vsearch_args_dic['fastq_mergepairs'] = fastq_fw_abspath
        vsearch_args_dic['reverse'] = fastq_rv_abspath
        vsearch_args_dic['fastaout'] = out_fasta_path
        vsearch_args_dic['threads'] = num_threads

        vsearch_cluster = RunnerVSearch(parameters=vsearch_args_dic)
        vsearch_cluster.run()

        fastq_info_df_i = fastq_info_df_i[['run', 'marker', 'sample', 'replicate',
                                           'tagfwd', 'primerfwd', 'tagrev', 'primerrev']]
        fastq_info_df_i['mergedfasta'] = fasta_merged_basename
        fastainfo_df = pandas.concat([fastainfo_df, fastq_info_df_i], axis=0)

        with open(out_fasta_path, 'rb') as fin:
            fasta_merged_linecount = int(sum(1 for i in fin) / 4)

        ########################################################################################
        #
        # Summary file
        #
        ########################################################################################

        # Paths go to the Fastq* columns, read counts to the NbReads* columns
        stats_df = pandas.concat([stats_df, pandas.DataFrame({
            'FastqFwd': [fastq_fw_abspath],
            'FastqRev': [fastq_rv_abspath],
            'NbReadsFwd': [fastq_fw_linecount],
            'NbReadsRev': [fastq_rv_linecount],
            'FastaMerged': [out_fasta_path],
            'NbMergedReads': [fasta_merged_linecount]})])

    for mergedfasta in fastainfo_df[['mergedfasta']].drop_duplicates().values:
        mergedfasta = mergedfasta[0]

        if mergedfasta.endswith('.bz2') or mergedfasta.endswith('.gz'):
            fasta_merged_abspath = os.path.join(fastadir, mergedfasta)
            mergedfasta_compressor = FileCompression(fasta_merged_abspath)
            if mergedfasta.endswith('.gz'):
                mergedfasta_c = mergedfasta_compressor.pigz_compression()
                if mergedfasta_c is None:
                    mergedfasta_c = mergedfasta_compressor.gzip_compression()
            elif mergedfasta.endswith('.bz2'):
                mergedfasta_c = mergedfasta_compressor.bz2_compression()
            mergedfasta_compressor.delete_file()

            _, relPath = os.path.split(mergedfasta_c)
            fastainfo_df.loc[fastainfo_df['mergedfasta'] == mergedfasta, 'mergedfasta'] = relPath
        else:
            fastq_info_df_i['mergedfasta'] = fasta_merged_basename

    fastainfo_df.to_csv(fastainfo, sep="\t", header=True, index=False)
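# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of VTAM): the read counting used above. A FASTQ record is four
# lines (header, sequence, '+', qualities), so the read count is the line count divided by 4.
# The file path below is hypothetical.
# --------------------------------------------------------------------------------------------
def _sketch_count_fastq_reads(fastq_path):
    with open(fastq_path, 'rb') as fin:
        return sum(1 for _ in fin) // 4

# Example: an uncompressed FASTQ with 4000 lines yields 1000 reads.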
def main(fastainfo, fastadir, sorteddir, params=None, num_threads=multiprocessing.cpu_count(), no_reverse=False, tag_to_end=False, primer_to_end=False): Logger.instance().info(f"OPTIONS:\n no_reverse: {not no_reverse} \n tag_to_end {not tag_to_end} \n primer_to_end {not primer_to_end}") if sys.platform.startswith('win'): num_threads = 1 ############################################################################################ # # params.yml parameters # ############################################################################################ params_dic = FileParams(params).get_params_dic() cutadapt_error_rate = params_dic['cutadapt_error_rate'] cutadapt_minimum_length = params_dic['cutadapt_minimum_length'] cutadapt_maximum_length = params_dic['cutadapt_maximum_length'] ############################################################################################ # # Loop over tag and primer pairs to demultiplex and trim reads # ############################################################################################ merged_fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df() pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True) tempdir = PathManager.instance().get_tempdir() merged_fasta_list = [] results_list = [] sample_info = {} # make sure every file is analysed once. for i in range(merged_fastainfo_df.shape[0]): if merged_fastainfo_df.iloc[i].mergedfasta not in merged_fasta_list: merged_fasta_list.append(merged_fastainfo_df.iloc[i].mergedfasta) for mergedfasta in merged_fasta_list: inputFiles = FilesInputCutadapt(fastainfo, mergedfasta, no_reverse, tag_to_end) tagFile_path = inputFiles.tags_file() info = inputFiles.get_df_info() for key in info.keys(): if key in sample_info.keys(): sample_info[key] = sample_info[key] + info[key] else: sample_info[key] = info[key] Logger.instance().debug("Analysing FASTA file: {}".format(mergedfasta)) in_raw_fasta_path = os.path.join(fastadir, mergedfasta) ######################################################################################## # # cutadapt --cores=0 -e 0 --no-indels --trimmed-only -g tagFile:$tagfile # --overlap length -o "tagtrimmed.{name}.fasta" in_raw_fasta_path # ######################################################################################## base = os.path.basename(in_raw_fasta_path) base, base_suffix = base.split('.', 1) out_fasta_path = os.path.join(tempdir, "sorted") cmd_cutadapt_tag_dic = { 'in_fasta_path': in_raw_fasta_path, 'out_fasta': out_fasta_path, 'num_threads': num_threads, 'tagFile': tagFile_path, 'base_suffix': base_suffix, } cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \ '-g file:{tagFile} --output {out_fasta}_{{name}}.{base_suffix} {in_fasta_path}' \ .format(**cmd_cutadapt_tag_dic) Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str)) if sys.platform.startswith("win"): args = cmd_cutadapt_tag_str else: args = shlex.split(cmd_cutadapt_tag_str) run_result = subprocess.run(args=args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) Logger.instance().info(run_result.stdout.decode()) inputFiles.remove_tags_file() ######################################################################################## # # Trim primers from output # cutadapt --quiet --cores=0 -e trim_error --no-indels --trimmed-only # --minimum-length minimum_length --maximum-length maximum_length # --output input_path + {name} + suffix outputfile # ######################################################################################## primers = 
inputFiles.primers() try: tags_samples = inputFiles.get_sample_names() except Exception as e: Logger.instance().error(e) return for primer in primers: marker, primerfwd, primerrev, lenprimerfwd, lenprimerrev = primer for tag_sample in tags_samples: name, run, marker2, sample, replicate, _, _ = tag_sample if marker not in marker2: continue in_fasta_path = out_fasta_path + "_" + name + "." + base_suffix baseMerge = mergedfasta.split(".")[0] outname = run + "_" + marker + "_" + sample + "_" + replicate + "_" + baseMerge + "_trimmed" if name.endswith("_reversed"): outname = outname + "_reversed" out_fasta_path_new = os.path.join(tempdir, outname + "." + base_suffix) results_list.append(out_fasta_path_new) if not "_reversed" in name: if generic_dna: # Biopython <1.78 primerRev = str(Seq(primerrev, generic_dna).reverse_complement()) else: # Biopython =>1.78 primerRev = str(Seq(primerrev).reverse_complement()) primerFwd = primerfwd lenPrimerFwd = lenprimerfwd lenPrimerRev = lenprimerrev else: if generic_dna: # Biopython <1.78 primerRev = str(Seq(primerfwd, generic_dna).reverse_complement()) else: # Biopython =>1.78 primerRev = str(Seq(primerfwd).reverse_complement()) primerFwd = primerrev lenPrimerFwd = lenprimerrev lenPrimerRev = lenprimerfwd cmd_cutadapt_primer_dic = { 'in_fasta_path': in_fasta_path, 'out_fasta': out_fasta_path_new, 'error_rate': cutadapt_error_rate, 'num_threads': num_threads, 'primerFwd': primerFwd, 'primerRev': primerRev, 'lenPrimerFwd': lenPrimerFwd, 'lenPrimerRev': lenPrimerRev, 'read_min_length': cutadapt_minimum_length, 'read_max_length': cutadapt_maximum_length, } if not primer_to_end: #works if the command is selected cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \ '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \ '--trimmed-only -g "^{primerFwd}...{primerRev}$" --output {out_fasta} {in_fasta_path}'\ .format(**cmd_cutadapt_primer_dic) else: cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \ '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \ '--trimmed-only -g "{primerFwd};min_overlap={lenPrimerFwd}...{primerRev};min_overlap={lenPrimerRev}" '\ '--output {out_fasta} {in_fasta_path}'\ .format(**cmd_cutadapt_primer_dic) Logger.instance().debug("Running: {}".format(cmd_cutadapt_primer_str)) if sys.platform.startswith("win"): args = cmd_cutadapt_primer_str else: args = shlex.split(cmd_cutadapt_primer_str) run_result = subprocess.run(args=args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) Logger.instance().info(run_result.stdout.decode()) ################################################################### # # Reverse complement back rc fasta and pool # ################################################################### for file in results_list: if "_trimmed" in file: out_final_fasta_path = os.path.join(sorteddir, os.path.split(file)[-1]) in_fasta_path = os.path.join(tempdir, file) if out_final_fasta_path.endswith(".gz"): _open = partial(gzip.open) elif out_final_fasta_path.endswith(".bz2"): _open = partial(bz2.open) else: _open = open if in_fasta_path.endswith(".gz"): _open2 = partial(gzip.open) elif in_fasta_path.endswith(".bz2"): _open2 = partial(bz2.open) else: _open2 = open if "_reversed" in file: Logger.instance().debug("Pooling fwd and rc reads...") out_final_fasta_path = out_final_fasta_path.replace("_reversed", "") with _open(out_final_fasta_path, 'at') as fout: with _open2(in_fasta_path, 'rt') as fin: for line in 
fin.readlines(): if not line.startswith('>'): if generic_dna: # Biopython <1.78 fout.write("%s\n" % str( Seq(line.strip(), generic_dna).reverse_complement())) else: # Biopython =>1.78 fout.write("%s\n" % str( Seq(line.strip()).reverse_complement())) else: fout.write(line) else: with _open(out_final_fasta_path, 'at') as fout: with _open2(in_fasta_path, 'rt') as fin: for line in fin.readlines(): fout.write(line) results_list = [os.path.split(result)[-1] for result in results_list if "_reversed" not in result] del sample_info['mergedfasta'] del sample_info['primerrev'] del sample_info['primerfwd'] del sample_info['tagrev'] del sample_info['tagfwd'] sample_info['sortedfasta'] = results_list sample_info_df = pandas.DataFrame(sample_info) fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv') sample_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t", header=True, index=False)
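# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of VTAM): the extension-based choice of open function used in
# the pooling step above, so .gz, .bz2 and plain FASTA files are read through one code path.
# --------------------------------------------------------------------------------------------
def _sketch_open_by_extension(path, mode='rt'):
    import bz2
    import gzip
    from functools import partial

    if path.endswith('.gz'):
        _open = partial(gzip.open)
    elif path.endswith('.bz2'):
        _open = partial(bz2.open)
    else:
        _open = open
    return _open(path, mode)

# Usage: with _sketch_open_by_extension('reads.fasta.gz') as fin: ...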
class ArgParser: ############################################################################################ # # Specific parsers # ############################################################################################ parser_params = argparse.ArgumentParser(add_help=False) parser_params.add_argument('--params', action='store', default=None, help="YML file with parameter values", required=False, type=lambda x: FileParams(params_path=x). argparse_checker_params_file()) parser_log = argparse.ArgumentParser(add_help=False) parser_log.add_argument('--log', dest='log', action='store', help="write log to LOG file.", required=False) parser_threads = argparse.ArgumentParser(add_help=False) parser_threads.add_argument('--threads', dest='threads', action='store', help="number of threads", required=False, default=multiprocessing.cpu_count()) parser_verbosity = argparse.ArgumentParser(add_help=False) parser_verbosity.add_argument('-v', dest='log_verbosity', action='count', default=0, required=False, help="set verbosity level -v or -vv") parser_wopmars_db = argparse.ArgumentParser(add_help=False) parser_wopmars_db.add_argument('--db', dest='db', action='store', default='db.sqlite', required=False, help="database file in SQLITE format") parser_wopmars_dryrun = argparse.ArgumentParser(add_help=False) parser_wopmars_dryrun.add_argument( '--dry-run', '-n', dest='dryrun', action='store_true', required=False, help="displays only command out without running it") parser_wopmars_forceall = argparse.ArgumentParser(add_help=False) parser_wopmars_forceall.add_argument('-F', '--forceall', dest='forceall', action='store_true', help="force rerun all rules", required=False) parser_vtam_main = None @classmethod def get_main_arg_parser(cls): """ :return: """ ############################################################################################ # # Top-level parser # ############################################################################################ # config = RawConfigParser() # config.read(os.path.join(PathManager.get_package_path(), 'setup.cfg')) # version = config.get('metadata', 'version') parser_vtam_main = argparse.ArgumentParser( prog='vtam', description= '%(prog)s {} - VTAM - Validation and Taxonomic Assignation of Metabarcoding Data' .format(vtam.__version__)) parser_vtam_main.add_argument('--version', action='version', version='%(prog)s {}'.format( vtam.__version__)) subparsers = parser_vtam_main.add_subparsers(title='VTAM sub-commands') ############################################################################################ # # create the parsers # ############################################################################################ cls.add_parser_example(subparsers=subparsers) cls.add_parser_merge(subparsers=subparsers) cls.add_parser_random_seq(subparsers=subparsers) cls.add_parser_sortreads(subparsers=subparsers) cls.add_parser_filter(subparsers=subparsers) cls.add_parser_optimize(subparsers=subparsers) cls.add_parser_makeKnownOccurrences(subparsers=subparsers) cls.add_parser_pool(subparsers=subparsers) cls.add_parser_taxassign(subparsers=subparsers) cls.add_parser_taxonomy(subparsers=subparsers) cls.add_parser_coiblastdb(subparsers=subparsers) return parser_vtam_main @classmethod def add_parser_example(cls, subparsers): parser_vtam_merge = subparsers.add_parser( 'example', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity ], help="generates data for quick start") parser_vtam_merge.add_argument('--outdir', action='store', 
help="directory for quick start data", required=False, default='example', type=lambda x: pathlib.Path(x).mkdir( exist_ok=True, parents=True) or x) parser_vtam_merge.set_defaults(command='example') @classmethod def add_parser_merge(cls, subparsers): parser_vtam_merge = subparsers.add_parser( 'merge', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity ], help="merges paired-end reads") parser_vtam_merge.add_argument( '--fastqinfo', action='store', help="input TSV file with paired FASTQ file information", required=True, type=lambda x: FileSampleInformation(x).check_args( header=header_paired_fastq)) parser_vtam_merge.add_argument( '--fastainfo', action='store', help="output TSV file with merged FASTA file information", required=True) parser_vtam_merge.add_argument( '--fastqdir', action='store', help="input directory with paired FASTQ files", required=True, type=ArgParserChecker.check_dir_exists_and_is_nonempty) parser_vtam_merge.add_argument( '--fastadir', action='store', help="output directory with merged FASTA files", required=True) # This attribute will trigger the good command parser_vtam_merge.set_defaults(command='merge') @classmethod def add_parser_random_seq(cls, subparsers): parser_vtam_random_seq = subparsers.add_parser( 'random_seq', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity ], help= "make a folder with sample files containing 'size' number of sequences randomly selected from the files in input folder" ) parser_vtam_random_seq.add_argument( '--fastadir', action='store', help="input directory with FASTA files", required=True, type=ArgParserChecker.check_dir_exists_and_is_nonempty) parser_vtam_random_seq.add_argument( '--random_seqdir', action='store', help= "output directory with randomly selected sequences in FASTA format", required=True) parser_vtam_random_seq.add_argument( '--fastainfo', action='store', help="input TSV file with FASTA file information", required=True, type=lambda x: FileSampleInformation(x).check_args( header=header_merged_fasta)) parser_vtam_random_seq.add_argument( '--random_seqinfo', action='store', help="output TSV file with output FASTA file information", required=True) parser_vtam_random_seq.add_argument( '--samplesize', action='store', help="number of sequences to be selected from the input files", type=int, required=True) parser_vtam_random_seq.set_defaults(command='random_seq') @classmethod def add_parser_sortreads(cls, subparsers): parser_vtam_sortreads = subparsers.add_parser( 'sortreads', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity ], help= "sorts (Trims and demultiplexes) reads to biological samples and replicates according to the presence of sequence tags and primers" ) parser_vtam_sortreads.add_argument( '--fastainfo', action='store', help="input TSV file with FASTA file information", required=True, type=lambda x: FileSampleInformation(x).check_args( header=header_merged_fasta)) parser_vtam_sortreads.add_argument( '--fastadir', action='store', help="input directory with FASTA files", required=True, type=ArgParserChecker.check_dir_exists_and_is_nonempty) parser_vtam_sortreads.add_argument( '--sorteddir', action='store', help= "output directory with sorted reads (Trimmed and demultiplexed) in FASTA files and TSV file with corresponnding FASTA file information ('SORTEDDIR/sortedinfo.tsv')", default="out", required=True) # This attribute will trigger the good command 
parser_vtam_sortreads.add_argument( "--no_reverse", action="store_false", help="don't check reverse sequences", required=False) parser_vtam_sortreads.add_argument( "--tag_to_end", action="store_false", help="look for tags only at the edges of the sequence", required=False) parser_vtam_sortreads.add_argument( "--primer_to_end", action="store_false", help="look for primers only at the edges of the sequence", required=False) parser_vtam_sortreads.set_defaults(command='sortreads') @classmethod def add_parser_filter(cls, subparsers): parser_vtam_filter = subparsers.add_parser( 'filter', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity, cls.parser_wopmars_db, cls.parser_wopmars_dryrun, cls.parser_wopmars_forceall ], help= "filters out sequence artifacts and creates an amplicon sequence variant (ASV) table." ) parser_vtam_filter.add_argument( '--sortedinfo', action='store', help= "input TSV file with information about FASTA files containing sorted reads", required=True, type=lambda x: FileSampleInformation(x).check_args( header=header_sortedread_fasta)) parser_vtam_filter.add_argument( '--sorteddir', action='store', help= "input directory with sorted (Trimmed and demultiplexed) FASTA files", required=True, type=ArgParserChecker.check_dir_exists_and_is_nonempty) parser_vtam_filter.add_argument( '--asvtable', action='store', help= "output TSV file for the amplicon sequence variants (ASV) table", required=True) parser_vtam_filter.add_argument( '--cutoff_specific', dest='cutoff_specific', default=None, action='store', required=False, help= "TSV file with variant (col1: variant; col2: cutoff) or variant-replicate " "(col1: variant; col2: replicate; col3: cutoff)specific cutoffs", type=lambda x: FileCutoffSpecific(x).argparse_checker()) parser_vtam_filter.add_argument( '--lfn_variant_replicate', action='store_true', help= "if set, VTAM will run the algorithm for the low frequency noise over variant and replicates", required=False, default=False) parser_vtam_filter.add_argument( '--known_occurrences', action='store', help="TSV file with expected (keep) occurrences", required=False, type=lambda x: FileKnownOccurrences( x).argparse_checker_known_occurrences()) parser_vtam_filter.add_argument( '-U', '--until', dest='until', action='store', default=None, help= """execute '%(prog)s' UNTIL one rule, where the rule order looks like: 1. SampleInformation, 2. VariantReadCount, 3. FilterLFN, 4. FilterMinReplicateNumber, 5. FilterPCRerror, 6. FilterChimera, 7. FilterMinReplicateNumber2, 8. FilterRenkonen, 9. FilterMinReplicateNumber3, 10. FilterIndel, 11. FilterCodonStop, 12. ReadCountAverageOverReplicates, 13. MakeAsvTable""", required=False) parser_vtam_filter.add_argument( '-S', '--since', dest='since', action='store', default=None, help= """execute '%(prog)s' SINCE one rule, where the rule order looks like: 1. SampleInformation, 2. VariantReadCount, 3. FilterLFN, 4. FilterMinReplicateNumber, 5. FilterPCRerror, 6. FilterChimera, 7. FilterMinReplicateNumber2, 8. FilterRenkonen, 9. FilterMinReplicateNumber3, 10. FilterIndel, 11. FilterCodonStop, 12. ReadCountAverageOverReplicates, 13. 
MakeAsvTable""", required=False) # This attribute will trigger the good command parser_vtam_filter.set_defaults(command='filter') @classmethod def add_parser_taxassign(cls, subparsers): parser_vtam_taxassign = subparsers.add_parser( 'taxassign', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity, cls.parser_wopmars_db ], help="assigns amplicon sequence variants (ASVs) to taxonomic groups" ) parser_vtam_taxassign.add_argument( '--asvtable', action='store', help= "input TSV file with variant sequences and sequence header in the last column", required=True, type=lambda x: ArgParserChecker.check_taxassign_variants(x)) parser_vtam_taxassign.add_argument( '--output', action='store', help="output TSV file where the assigned taxa have been added", required=True) parser_vtam_taxassign.add_argument( '--mode', dest='mode', default="unassigned", action='store', required=False, choices=['unassigned', 'reset'], help= "the default 'unassigned' mode will only assign 'unassigned' variants" "The alternative 'reset' mode will erase the TaxAssign table and reassigned all " "input variants") parser_vtam_taxassign.add_argument( '--blastdbdir', action='store', help= "input directory with (Full or custom one) Blast database files", required=True, type=ArgParserChecker.check_dir_exists_and_is_nonempty) parser_vtam_taxassign.add_argument( '--blastdbname', action='store', help= "input Blast database name, which corresponds to the file name without suffix of the Blast database files", required=True) parser_vtam_taxassign.add_argument( '--taxonomy', dest='taxonomy', action='store', help="""input TSV file with taxonomy information. This file is created with the 'taxonomy' sub-command. For instance 'vtam taxonomy -o taxonomy.tsv' creates the 'taxonomy.tsv' file in the current directory""", required=True, type=ArgParserChecker.check_taxassign_taxonomy) # This attribute will trigger the good command parser_vtam_taxassign.set_defaults(command='taxassign') @classmethod def add_parser_optimize(cls, subparsers): parser_vtam_optimize = subparsers.add_parser( 'optimize', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity, cls.parser_wopmars_db, cls.parser_wopmars_dryrun, cls.parser_wopmars_forceall ], help="finds out optimal parameters for filtering") parser_vtam_optimize.add_argument( '--sortedinfo', action='store', help= "input TSV file with information about FASTA files containing sorted (trimmed and demultiplexed) reads", required=True, type=lambda x: FileSampleInformation(x).check_args( header=header_sortedread_fasta)) parser_vtam_optimize.add_argument( '--sorteddir', action='store', help= "input directory with sorted (Trimmed and demultiplexed) FASTA files", required=True, type=ArgParserChecker.check_dir_exists_and_is_nonempty) parser_vtam_optimize.add_argument('-o', '--outdir', action='store', help="output directory", default="out", required=True) parser_vtam_optimize.add_argument( '--known_occurrences', action='store', help="TSV file with known variants", required=True, type=lambda x: FileKnownOccurrences( x).argparse_checker_known_occurrences()) parser_vtam_optimize.add_argument( '--lfn_variant_replicate', action='store_true', help= "if set, VTAM will run the algorithm for the low frequency noise over variant and replicates", required=False, default=False) parser_vtam_optimize.add_argument( '-U', '--until', dest='until', action='store', default=None, help= """executes '%(prog)s' UNTIL one rule, where the rules follow 
this order: 1. SampleInformation, 2. VariantReadCount, 3. either OptimizeLFNsampleReplicate or OptimizePCRerror or OptimizeLFNreadCountAndLFNvariant""", required=False) parser_vtam_optimize.add_argument( '-S', '--since', dest='since', action='store', default=None, help= """executes '%(prog)s' SINCE one rule, where the rules follow this order: 1. SampleInformation, 2. VariantReadCount, 3. either OptimizeLFNsampleReplicate or OptimizePCRerror or OptimizeLFNreadCountAndLFNvariant""", required=False) # This attribute will trigger the good command parser_vtam_optimize.set_defaults(command='optimize') @classmethod def add_parser_makeKnownOccurrences(cls, subparsers): parser_vtam_makeKnownOccurrences = subparsers.add_parser( 'make_known_occurrences', add_help=True, parents=[cls.parser_threads, cls.parser_verbosity], help="create a file with know occurrences") parser_vtam_makeKnownOccurrences.add_argument( '--asvtable', action='store', help="input an ASV table file (tsv format)", required=True, ) # type=lambda x: FileSampleInformation(x).check_args( # header=header_paired_fastq)) parser_vtam_makeKnownOccurrences.add_argument( '--sample_types', action='store', help="input a tsv file with the sample types", required=True, ) # type=lambda x: FileSampleInformation(x).check_args( # header=header_paired_fastq)) parser_vtam_makeKnownOccurrences.add_argument( '--mock_composition', action='store', help="input a tsv file with the mock composition", required=True, ) # type=lambda x: FileSampleInformation(x).check_args( # header=header_paired_fastq)) parser_vtam_makeKnownOccurrences.add_argument( '--known_occurrences', action='store', help= "Default: ./known_occurrences.tsv. Output a .tsv file with the known occurences", required=False, default='./known_occurrences.tsv') parser_vtam_makeKnownOccurrences.add_argument( '--missing_occurrences', action='store', help= "Default: ./missing_occurrences.tsv. Output a .tsv file with the missing occurences", required=False, default='./missing_occurrences.tsv') parser_vtam_makeKnownOccurrences.add_argument( '--habitat_proportion', action='store', help="Default: 0.5. Input a threshold for habitat proportion", required=False, default=0.5) # This attribute will trigger the good command parser_vtam_makeKnownOccurrences.set_defaults( command='make_known_occurrences') @classmethod def add_parser_pool(cls, subparsers): parser_vtam_pool_markers = subparsers.add_parser( 'pool', add_help=True, parents=[ cls.parser_params, cls.parser_log, cls.parser_threads, cls.parser_verbosity ], help= "pools amplicon sequence variants (ASVs) from different but overlapping markers" ) parser_vtam_pool_markers.add_argument('--db', action='store', required=True, help="SQLITE file with DB") from vtam.utils.FileRunMarker import FileRunMarker parser_vtam_pool_markers.add_argument( '--runmarker', action='store', default=None, help=FileRunMarker.help(), required=True, type=lambda x: FileRunMarker(x).check_argument()) parser_vtam_pool_markers.add_argument( '--asvtable', action='store', help= "output TSV file with pooled markers and their occurrences in biological samples", required=True) parser_vtam_pool_markers.add_argument( '--readcounts', action='store_true', help= "Default: False. If False, presence/absence of reads in sample is given." 
"If True, sum of reads over pooled runs et/ou markers is given", required=False, default=False) # This attribute will trigger the good command parser_vtam_pool_markers.set_defaults(command='pool') @classmethod def add_parser_taxonomy(cls, subparsers): parser_vtam_taxonomy = subparsers.add_parser( 'taxonomy', add_help=True, parents=[], help="downloads a TSV file with the NCBI taxonomy information") parser_vtam_taxonomy.add_argument( '-o', '--output', dest='output', action='store', help="default: taxonomy.tsv. Path to TSV taxonomy file", required=False, default=os.path.join(os.getcwd(), 'taxonomy.tsv')) parser_vtam_taxonomy.add_argument( '--precomputed', dest='precomputed', action='store_true', default=False, help="default: False. Downloads precomputed taxonomy database, " "which is likely an older database", required=False) # This attribute will trigger the good command parser_vtam_taxonomy.set_defaults(command='taxonomy') @classmethod def add_parser_coiblastdb(cls, subparsers): parser_vtam_coi_blast_db = subparsers.add_parser( 'coi_blast_db', add_help=True, help= "downloads a precomputed BLAST database for the cytochrome C oxidase subunit I (COI) marker" ) parser_vtam_coi_blast_db.add_argument( '--blastdbdir', dest='blastdbdir', action='store', help= "output directory with custom Blast database files of the cytochrome C oxidase subunit I (COI) marker files", required=False, default='blastdb') parser_vtam_coi_blast_db.add_argument( '--blastdbname', dest='blastdbname', action='store', help= "cytochrome C oxidase subunit I (COI) Blast database name among these current possibilities: coi_blast_db, coi_blast_db_20200420. Other versions if available can be found here: {}" .format(os.path.dirname(coi_blast_db_gz_url1)), required=False, default='coi_blast_db', type=lambda x: CommandBlastCOI( x).argparse_checker_blast_coi_blastdbname(), ) # This attribute will trigger the good command parser_vtam_coi_blast_db.set_defaults(command='coi_blast_db')