Example #1
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                     os.path.basename(__file__))
        pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

        ############################################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        ############################################################################################

        # Input file paths
        known_occurrences_tsv = self.input_file(
            OptimizePCRerror.__input_file_known_occurrences)
        fasta_info_tsv = self.input_file(
            OptimizePCRerror.__input_file_sortedinfo)
        #
        # Output file paths
        output_optimize_path = self.output_file(
            OptimizePCRerror.__output_file_optimize_pcr_error)

        ############################################################################################
        #
        # Get nijk_df, known_occurrences_df
        #
        ############################################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            VariantReadCount, engine=engine)

        known_occurrences_df = FileKnownOccurrences(
            known_occurrences_tsv).to_identifier_df(engine)

        ############################################################################################
        #
        # Run optimizer and Write
        #
        ############################################################################################

        optimize_pcr_error_runner = RunnerOptimizePCRerror(
            variant_read_count_df=variant_read_count_df,
            known_occurrences_df=known_occurrences_df)
        optimize_pcr_error_runner.to_tsv(optimize_path=output_optimize_path,
                                         engine=engine)
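
The first lines of run() above create a per-module scratch directory under the shared temp directory; mkdir(exist_ok=True) makes the call safe to repeat. A standard-library-only sketch of the same idiom (the module name below is illustrative, not taken from VTAM):

import os
import pathlib
import tempfile

tempdir = tempfile.gettempdir()  # stand-in for PathManager.instance().get_tempdir()
this_temp_dir = os.path.join(tempdir, "OptimizePCRerror.py")  # stand-in for os.path.basename(__file__)
pathlib.Path(this_temp_dir).mkdir(exist_ok=True)  # no error if the directory already exists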
Example #2
    def run(self):
        session = self.session

        #######################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        #######################################################################

        # input file paths
        csv_path = self.input_file(SampleInformation.__input_file_csv)

        FileSampleInformation(csv_path).to_sqlite(session=session)

        #######################################################################
        #
        # Touch output tables, to update modification date
        #
        #######################################################################

        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(id=obj.id).update(
                {'id': obj.id})
            session.commit()
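
The loop above "touches" each output table by rewriting the id of its last row with the same value; the data is unchanged, but an UPDATE statement is emitted, which appears to be how the surrounding workflow tooling registers the table as modified. A minimal standalone sketch of that idiom, assuming SQLAlchemy 1.4+ and a toy model that is not part of VTAM:

from sqlalchemy import Column, Integer, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class SampleTable(Base):  # toy stand-in for a wrapper output table
    __tablename__ = 'SampleTable'
    id = Column(Integer, primary_key=True)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
session.add(SampleTable())
session.commit()

# "Touch": update the last row's id to its own value, emitting a no-op UPDATE
obj = session.query(SampleTable).order_by(SampleTable.id.desc()).first()
session.query(SampleTable).filter_by(id=obj.id).update({'id': obj.id})
session.commit()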
Example #3
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        ############################################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        ############################################################################################

        # Input file paths
        known_occurrences_tsv = self.input_file(
            OptimizeLFNsampleReplicate.__input_file_known_occurrences)
        fasta_info_tsv = self.input_file(
            OptimizeLFNsampleReplicate.__input_file_sortedinfo)

        # Output file paths
        output_optimize_path = self.output_file(
            OptimizeLFNsampleReplicate.
            __output_file_optimize_lfn_sample_replicate)

        ############################################################################################
        #
        # Get nijk_df and known_occurrences_df (keep)
        #
        ############################################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            VariantReadCount, engine=engine)
        known_occurrences_df = FileKnownOccurrences(
            known_occurrences_tsv).to_identifier_df(engine)
        known_occurrences_df = known_occurrences_df.loc[
            (known_occurrences_df.mock == 1) &
            (known_occurrences_df.action == 'keep'), ]

        ############################################################################################
        #
        # Run optimizer and Write
        #
        ############################################################################################

        optimize_lfn_sample_replicate_runner = RunnerOptimizeLFNsampleReplicate(
            variant_read_count_df=variant_read_count_df,
            known_occurrences_df=known_occurrences_df)
        optimize_lfn_sample_replicate_runner.to_tsv(
            optimize_path=output_optimize_path, engine=engine)
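
The trailing comma in the .loc call above selects all columns for the masked rows; it is equivalent to .loc[mask, :]. A toy pandas illustration with invented data:

import pandas as pd

known_occurrences_df = pd.DataFrame({
    'mock': [1, 1, 0],
    'action': ['keep', 'delete', 'keep'],
})

# keep rows from mock samples whose action is 'keep', all columns
kept_df = known_occurrences_df.loc[
    (known_occurrences_df.mock == 1) &
    (known_occurrences_df.action == 'keep'), ]
print(kept_df)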
Example #4
    def add_parser_sortreads(cls, subparsers):
        parser_vtam_sortreads = subparsers.add_parser(
            'sortreads',
            add_help=True,
            parents=[
                cls.parser_params, cls.parser_log, cls.parser_threads,
                cls.parser_verbosity
            ],
            help=
            "sorts (Trims and demultiplexes) reads to biological samples and replicates according to the presence of sequence tags and primers"
        )

        parser_vtam_sortreads.add_argument(
            '--fastainfo',
            action='store',
            help="input TSV file with FASTA file information",
            required=True,
            type=lambda x: FileSampleInformation(x).check_args(
                header=header_merged_fasta))

        parser_vtam_sortreads.add_argument(
            '--fastadir',
            action='store',
            help="input directory with FASTA files",
            required=True,
            type=ArgParserChecker.check_dir_exists_and_is_nonempty)

        parser_vtam_sortreads.add_argument(
            '--sorteddir',
            action='store',
            help=
            "output directory with sorted reads (Trimmed and demultiplexed) in FASTA files and TSV file with corresponnding FASTA file information ('SORTEDDIR/sortedinfo.tsv')",
            default="out",
            required=True)
        # This attribute selects the command to run

        parser_vtam_sortreads.add_argument(
            "--no_reverse",
            action="store_false",
            help="don't check reverse sequences",
            required=False)

        parser_vtam_sortreads.add_argument(
            "--tag_to_end",
            action="store_false",
            help="look for tags only at the edges of the sequence",
            required=False)

        parser_vtam_sortreads.add_argument(
            "--primer_to_end",
            action="store_false",
            help="look for primers only at the edges of the sequence",
            required=False)

        parser_vtam_sortreads.set_defaults(command='sortreads')
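
Note that --no_reverse, --tag_to_end and --primer_to_end use action="store_false": each destination defaults to True and flips to False when the flag is passed. A standalone argparse illustration of that behaviour (unrelated to VTAM's own parser):

import argparse

parser = argparse.ArgumentParser()
# store_false: the attribute defaults to True and becomes False when the flag is given
parser.add_argument('--no_reverse', action='store_false')

print(parser.parse_args([]))                # Namespace(no_reverse=True)
print(parser.parse_args(['--no_reverse']))  # Namespace(no_reverse=False)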
Example #5
    def add_parser_random_seq(cls, subparsers):

        parser_vtam_random_seq = subparsers.add_parser(
            'random_seq',
            add_help=True,
            parents=[
                cls.parser_params, cls.parser_log, cls.parser_threads,
                cls.parser_verbosity
            ],
            help=
            "make a folder with sample files containing 'size' number of sequences randomly selected from the files in input folder"
        )

        parser_vtam_random_seq.add_argument(
            '--fastadir',
            action='store',
            help="input directory with FASTA files",
            required=True,
            type=ArgParserChecker.check_dir_exists_and_is_nonempty)

        parser_vtam_random_seq.add_argument(
            '--random_seqdir',
            action='store',
            help=
            "output directory with randomly selected sequences in FASTA format",
            required=True)

        parser_vtam_random_seq.add_argument(
            '--fastainfo',
            action='store',
            help="input TSV file with FASTA file information",
            required=True,
            type=lambda x: FileSampleInformation(x).check_args(
                header=header_merged_fasta))

        parser_vtam_random_seq.add_argument(
            '--random_seqinfo',
            action='store',
            help="output TSV file with output FASTA file information",
            required=True)

        parser_vtam_random_seq.add_argument(
            '--samplesize',
            action='store',
            help="number of sequences to be selected from the input files",
            type=int,
            required=True)

        parser_vtam_random_seq.set_defaults(command='random_seq')
Example #6
    def add_parser_merge(cls, subparsers):
        parser_vtam_merge = subparsers.add_parser(
            'merge',
            add_help=True,
            parents=[
                cls.parser_params, cls.parser_log, cls.parser_threads,
                cls.parser_verbosity
            ],
            help="merges paired-end reads")

        parser_vtam_merge.add_argument(
            '--fastqinfo',
            action='store',
            help="input TSV file with paired FASTQ file information",
            required=True,
            type=lambda x: FileSampleInformation(x).check_args(
                header=header_paired_fastq))

        parser_vtam_merge.add_argument(
            '--fastainfo',
            action='store',
            help="output TSV file with merged FASTA file information",
            required=True)

        parser_vtam_merge.add_argument(
            '--fastqdir',
            action='store',
            help="input directory with paired FASTQ files",
            required=True,
            type=ArgParserChecker.check_dir_exists_and_is_nonempty)

        parser_vtam_merge.add_argument(
            '--fastadir',
            action='store',
            help="output directory with merged FASTA files",
            required=True)
        # This attribute selects the command to run

        parser_vtam_merge.set_defaults(command='merge')
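
The type= callables above (FileSampleInformation(...).check_args(...) and ArgParserChecker.check_dir_exists_and_is_nonempty) act as argparse validators: they receive the raw string and either return the value or raise an error. A generic sketch of that pattern with a hypothetical checker, not the actual VTAM implementation:

import argparse
import os

def check_dir_exists_and_is_nonempty(path):
    """Hypothetical validator: return the path if it is an existing, non-empty directory."""
    if not os.path.isdir(path) or not os.listdir(path):
        raise argparse.ArgumentTypeError(
            "{} is not an existing, non-empty directory".format(path))
    return path

parser = argparse.ArgumentParser()
parser.add_argument('--fastqdir', action='store', required=True,
                    type=check_dir_exists_and_is_nonempty)
# parser.parse_args(['--fastqdir', 'out/fastq'])  # example invocation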
Example #7
    def run(self):
        """
        Algorithm (Updated Oct 13, 2019)

        1. Read the file with known variants (mock/tolerate, delete and real)
        2. Check that the user variants and sequences are consistent with the database
        3. Get the variant_read_count of this run-marker-sample-replicate experiment
        4. Compute the maximal lfn_nijk_cutoff that keeps all 'keep' variants with the 'run_lfn_read_count_and_lfn_variant' algorithm
        5. Compute the maximal lfn_variant_cutoff that keeps all 'keep' variants with the 'run_lfn_read_count_and_lfn_variant' algorithm (see below)
        6. Loop over the parameter values between the defaults and these maximal cutoffs and run 'run_lfn_read_count_and_lfn_variant'
            6.1 Compute the number of 'keep' variants; it should always stay maximal.
            6.2 Compute the number of 'delete' variants; it should decrease.
        7. Compute the variant(-replicate) specific cutoff for 'delete' variants
            7.1 For each variant i (or variant-replicate i-k),
                get N_ijk_max and use it to compute the variant-specific cutoff (see the pandas sketch at the end of this example)

        Description of the 'run_lfn_read_count_and_lfn_variant' algorithm

        1. Remove occurrences that do not pass these filters
            1.1 Filter lfn_variant (or lfn_variant_replicate)
            1.2 Filter lfn_sample_replicate
            1.3 Filter absolute read count
        2. Remove occurrences that do not reach the minimum replicate number

        """
        session = self.session
        engine = session._session().get_bind()

        ############################################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        ############################################################################################

        # Input file paths
        known_occurrences_tsv = self.input_file(
            OptimizeLFNreadCountAndLFNvariant.__input_file_known_occurrences)
        fasta_info_tsv = self.input_file(
            OptimizeLFNreadCountAndLFNvariant.__input_file_sortedinfo)

        # Output file paths
        output_file_optimize_lfn_tsv = self.output_file(
            OptimizeLFNreadCountAndLFNvariant.
            __output_file_optimize_lfn_read_count_and_lfn_variant)
        output_file_lfn_variant_specific_cutoff_tsv = self.output_file(
            OptimizeLFNreadCountAndLFNvariant.
            __output_file_optimize_lfn_variant_specific)

        # Options
        lfn_ni_cutoff = self.option("lfn_variant_cutoff")
        lfn_nik_cutoff = self.option("lfn_variant_replicate_cutoff")
        min_replicate_number = self.option("min_replicate_number")
        lfn_njk_cutoff = self.option("lfn_sample_replicate_cutoff")
        lfn_nijk_cutoff = int(self.option("lfn_read_count_cutoff"))

        filter_kwargs = {
            "lfn_ni_cutoff": lfn_ni_cutoff,
            "lfn_nik_cutoff": lfn_nik_cutoff,
            "lfn_njk_cutoff": lfn_njk_cutoff,
            "lfn_nijk_cutoff": lfn_nijk_cutoff,
            'min_replicate_number': min_replicate_number,
        }

        ############################################################################################
        #
        # Get nijk_df and known_occurrences_df (keep)
        #
        ############################################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
        nijk_df = sample_info_tsv_obj.get_nijk_df(VariantReadCount,
                                                  engine=engine)

        known_occurrences_df = FileKnownOccurrences(
            known_occurrences_tsv).to_identifier_df(engine)

        ############################################################################################
        #
        # Create cutoff values lists
        #
        ############################################################################################

        # # lfn_nijk_cutoff_list = range(lfn_nijk_cutoff, lfn_nijk_cutoff_global_max + 1, round(int((lfn_nijk_cutoff_global_max - lfn_nijk_cutoff + 1)/10), -1))
        # lfn_nijk_cutoff_list = range(lfn_nijk_cutoff, lfn_nijk_cutoff_global_max + 1, round(int((lfn_nijk_cutoff_global_max - lfn_nijk_cutoff + 1)/10), -1))
        # lfn_nijk_cutoff_list = RunnerOptimizeLFNreadCountAndVariantRunMarker.get_lfn_nijk_cutoff_lst(start=lfn_nijk_cutoff, stop=lfn_nijk_cutoff_global_max, nb_points=10)
        # lfn_nijk_cutoff_list = RunnerOptimizeLFNreadCountAndVariantRunMarker.get_lfn_nijk_cutoff_lst(start=lfn_nijk_cutoff, stop=lfn_nijk_cutoff_global_max, nb_points=10)
        # if lfn_nik_cutoff is None:  # lfn_variant optimization
        #     lfn_ni_nik_cutoff_list = [round(x, 3) for x in numpy.arange(lfn_ni_cutoff, lfn_ni_njk_cutoff_global_max + 0.001, (lfn_ni_njk_cutoff_global_max - lfn_ni_cutoff + 0.001)/10)]
        # else:  # lfn_variant_replicate optimization
        #     lfn_ni_nik_cutoff_list = [round(x, 3) for x in numpy.arange(lfn_ni_cutoff, lfn_ni_njk_cutoff_global_max + 0.001, (lfn_ni_njk_cutoff_global_max - lfn_ni_cutoff + 0.001)/10)]

        ############################################################################################
        #
        # Group by run/marker combination
        # Loop over each run/marker
        #
        ############################################################################################

        optim_lfn_readcount_variant_runner = RunnerOptimizeLFNreadCountAndVariant(
            nijk_df=nijk_df, known_occurrences_df=known_occurrences_df)
        out_optimize_df, out_optimize2_df = optim_lfn_readcount_variant_runner.get_optimize_df(
            lfn_ni_cutoff=lfn_ni_cutoff,
            lfn_nik_cutoff=lfn_nik_cutoff,
            lfn_njk_cutoff=lfn_njk_cutoff,
            lfn_nijk_cutoff=lfn_nijk_cutoff,
            min_replicate_number=min_replicate_number)

        ############################################################################################
        #
        # out_optimize_df: Format and write
        #
        ############################################################################################

        out_optimize_df.marker_id = NameIdConverter(
            out_optimize_df.marker_id, engine=engine).to_names(Marker)
        out_optimize_df.run_id = NameIdConverter(out_optimize_df.run_id,
                                                 engine=engine).to_names(Run)
        out_optimize_df.rename({
            'run_id': 'run',
            'marker_id': 'marker'
        },
                               axis=1,
                               inplace=True)
        out_optimize_df.to_csv(output_file_optimize_lfn_tsv,
                               header=True,
                               sep='\t',
                               index=False)

        ############################################################################################
        #
        # out_optimize2_df: Format and write
        #
        ############################################################################################

        out_optimize2_df.marker_id = NameIdConverter(
            out_optimize2_df.marker_id, engine=engine).to_names(Marker)
        out_optimize2_df.run_id = NameIdConverter(out_optimize2_df.run_id,
                                                  engine=engine).to_names(Run)
        out_optimize2_df['action'] = 'delete'
        out_optimize2_df['sequence'] = NameIdConverter(
            out_optimize2_df.variant_id,
            engine=engine).variant_id_to_sequence()
        out_optimize2_df.rename(
            {
                'run_id': 'run',
                'marker_id': 'marker',
                'variant_id': 'variant',
                'read_count': 'read_count_max'
            },
            axis=1,
            inplace=True)

        if self.option("lfn_variant_replicate_cutoff") is None:
            out_optimize2_df = out_optimize2_df[[
                'run', 'marker', 'variant', 'action', 'read_count_max', 'N_i',
                'lfn_variant_cutoff', 'sequence'
            ]]
        else:
            out_optimize2_df = out_optimize2_df[[
                'run', 'marker', 'variant', 'replicate', 'action',
                'read_count_max', 'N_ik', 'lfn_variant_replicate_cutoff',
                'sequence'
            ]]

        out_optimize2_df.to_csv(output_file_lfn_variant_specific_cutoff_tsv,
                                header=True,
                                sep='\t',
                                index=False)
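
The exact formula behind step 7 of the docstring is not shown in this excerpt. Judging from the output columns (read_count_max, N_i, lfn_variant_cutoff), the variant-specific cutoff appears to be the largest single occurrence of a variant divided by its total read count; the pandas sketch below assumes that relation and uses invented counts:

import pandas as pd

# Hypothetical nijk table: read count of variant i in sample j and replicate k
nijk_df = pd.DataFrame({
    'run_id': [1, 1, 1, 1],
    'marker_id': [1, 1, 1, 1],
    'variant_id': [7, 7, 7, 9],
    'sample_id': [1, 1, 2, 1],
    'replicate': [1, 2, 1, 1],
    'read_count': [5, 120, 380, 60],
})

grouped = nijk_df.groupby(['run_id', 'marker_id', 'variant_id'])['read_count']
n_i = grouped.sum().rename('N_i')                         # total read count of each variant
read_count_max = grouped.max().rename('read_count_max')   # largest single occurrence

cutoff_df = pd.concat([read_count_max, n_i], axis=1).reset_index()
# Assumed relation: the cutoff that removes even the largest occurrence of a
# 'delete' variant is read_count_max / N_i
cutoff_df['lfn_variant_cutoff'] = cutoff_df['read_count_max'] / cutoff_df['N_i']
print(cutoff_df)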
Example #8
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        #######################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        #######################################################################
        #
        # Input files
        fasta_info_tsv = self.input_file(
            FilterMinReplicateNumber.__input_file_sortedinfo)
        #
        # Input tables
        input_filter_lfn_model = self.input_table(
            FilterMinReplicateNumber.__input_table_variant_filter_lfn)
        #
        # Options
        min_replicate_number = self.option("min_replicate_number")
        # input_filter_lfn = self.option("input_filter_lfn")
        #
        # Output tables
        output_filter_min_replicate_model = self.output_table(
            FilterMinReplicateNumber.__output_table_filter_min_replicate_number)

        #######################################################################
        #
        # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        # 3. Get nijk_df input
        #
        #######################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)

        sample_info_tsv_obj.delete_from_db(
            engine=engine, variant_read_count_like_model=output_filter_min_replicate_model)
        filter_id = None
        if input_filter_lfn_model.__tablename__ == "FilterLFN":
            filter_id = 8  # Variant pass all filters LFN
        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            variant_read_count_like_model=input_filter_lfn_model, engine=engine, filter_id=filter_id)

        #######################################################################
        #
        # 4. Run Filter
        #
        #######################################################################

        variant_read_count_delete_df = RunnerFilterMinReplicateNumber(
            variant_read_count_df).get_variant_read_count_delete_df(min_replicate_number)

        #######################################################################
        #
        # 5. Write to DB
        # 6. Touch output tables, to update modification date
        # 7. Exit vtam if all variants delete
        #
        #######################################################################

        DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
            engine=engine, variant_read_count_like_model=output_filter_min_replicate_model)

        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(
                id=obj.id).update({'id': obj.id})
            session.commit()

        if variant_read_count_delete_df.filter_delete.sum(
        ) == variant_read_count_delete_df.shape[0]:
            Logger.instance().warning(
                VTAMexception(
                    "This filter has deleted all the variants: {}. "
                    "The analysis will stop here.".format(
                        self.__class__.__name__)))
            sys.exit(0)
Example #9
    def run(self):
        session = self.session
        engine = session._session().get_bind()
        #
        # Input file paths
        fasta_info_tsv = self.input_file(
            ReadCountAverageOverReplicates.__input_file_sortedinfo)
        #
        codon_stop_model = self.input_table(
            ReadCountAverageOverReplicates.__input_table_filter_codon_stop)

        #
        # Output table models
        consensus_model = self.output_table(
            ReadCountAverageOverReplicates.__output_table_filter_consensus)

        # #######################################################################
        # #
        # # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # #
        # #######################################################################
        #
        # # fasta_info_tsv = FastaInformationTSV(engine=engine, fasta_info_tsv=input_file_sortedinfo)
        # sample_info_tsv_obj = FileSampleInformation(tsv_path=input_file_sortedinfo)
        #
        # #######################################################################
        # #
        # # 2. Delete /run_name/markersamples/replicate from this filter table
        # #
        # #######################################################################
        # # with engine.connect() as conn:
        # #     # conn.execute(consensus_model.__table__.delete(), sample_instance_list)
        # #     conn.execute(consensus_model.__table__.delete(), sample_instance_list)
        # #
        # variant_read_count_like_utils = ModelVariantReadCountLike(
        #     variant_read_count_like_model=consensus_model, engine=engine)
        # sample_record_list = sample_info_tsv_obj.to_identifier_df(
        #     engine=engine).to_dict('records')
        # variant_read_count_like_utils.delete_from_db(
        #     sample_record_list=sample_record_list)
        #
        # #######################################################################
        # #
        # # 3. Select marker_name/run_name/sample/replicate from variant_read_count_model
        # #
        # #######################################################################
        #
        # nijk_df = sample_info_tsv_obj.get_nijk_df(
        #     variant_read_count_like_model=codon_stop_model, filter_id=None)
        #
        # # Exit if no variants for analysis
        # try:
        #     assert nijk_df.shape[0] > 0
        # except AssertionError:
        #     sys.stderr.write(
        #         "Error: No variants available for this filter: {}".format(
        #             os.path.basename(__file__)))
        #     sys.exit(1)

        #######################################################################
        #
        # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        # 3. Get nijk_df input
        #
        #######################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)

        sample_info_tsv_obj.delete_from_db(
            engine=engine, variant_read_count_like_model=consensus_model)

        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            variant_read_count_like_model=codon_stop_model,
            engine=engine,
            filter_id=None)

        #######################################################################
        #
        # 4. Run Filter
        #
        #######################################################################

        variant_read_count_delete_df = read_count_average_over_replicates(
            variant_read_count_df)

        #######################################################################
        #
        # Write to DB
        #
        #######################################################################

        record_list = ModelVariantReadCountLike.filter_delete_df_to_dict(
            variant_read_count_delete_df)
        with engine.connect() as conn:

            # Insert new instances
            conn.execute(consensus_model.__table__.insert(), record_list)

        #######################################################################
        #
        # Touch output tables, to update modification date
        #
        #######################################################################

        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(id=obj.id).update(
                {'id': obj.id})
            session.commit()
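
The write step above inserts a list of dicts through the Core table object of the output model, which executes one INSERT per dict (executemany). A self-contained sketch of that pattern with a toy table, assuming SQLAlchemy 1.x as the select([...]) style elsewhere in these examples suggests:

from sqlalchemy import Column, Integer, MetaData, Table, create_engine

metadata = MetaData()
filter_consensus_table = Table(
    'FilterConsensus', metadata,
    Column('id', Integer, primary_key=True),
    Column('variant_id', Integer),
    Column('read_count', Integer),
)

engine = create_engine('sqlite://')
metadata.create_all(engine)

record_list = [{'variant_id': 1, 'read_count': 10},
               {'variant_id': 2, 'read_count': 3}]

with engine.connect() as conn:
    # one INSERT is executed per dict in record_list
    conn.execute(filter_consensus_table.insert(), record_list)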
Example #10
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        ############################################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        ############################################################################################

        # Input file
        fasta_info_tsv = self.input_file(
            FilterRenkonen.__input_file_sortedinfo)
        #
        # Input table models
        input_filter_chimera_model = self.input_table(
            FilterRenkonen.__input_table_chimera)
        #
        # Options
        renkonen_distance_quantile = float(
            self.option("renkonen_distance_quantile"))
        #
        # Output table models
        output_filter_renkonen_model = self.output_table(
            FilterRenkonen.__output_table_filter_renkonen)

        ############################################################################################
        #
        # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        # 3. Get nijk_df input
        #
        ############################################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)

        sample_info_tsv_obj.delete_from_db(
            engine=engine,
            variant_read_count_like_model=output_filter_renkonen_model)

        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            variant_read_count_like_model=input_filter_chimera_model,
            engine=engine,
            filter_id=None)

        ############################################################################################
        #
        # Run per run_id, marker_id
        #
        ############################################################################################

        variant_read_count_delete_df = pandas.DataFrame()
        run_marker_df = variant_read_count_df[['run_id',
                                               'marker_id']].drop_duplicates()

        for row in run_marker_df.itertuples():
            run_id = row.run_id
            marker_id = row.marker_id

            variant_read_count_per_run_marker_df = variant_read_count_df.loc[
                (variant_read_count_df.run_id == run_id)
                & (variant_read_count_df.marker_id == marker_id)]

            if variant_read_count_per_run_marker_df.replicate.unique(
            ).shape[0] > 1:  # if more than one replicate
                filter_renkonen_runner_obj = RunnerFilterRenkonen(
                    variant_read_count_per_run_marker_df)
                filter_output_i_df = filter_renkonen_runner_obj.get_variant_read_count_delete_df(
                    renkonen_distance_quantile)
            else:  # Just one replicate
                filter_output_i_df = variant_read_count_df.copy()
                filter_output_i_df['filter_delete'] = False

            variant_read_count_delete_df = pandas.concat(
                [variant_read_count_delete_df, filter_output_i_df], axis=0)

        ############################################################################################
        #
        # 5. Write to DB
        # 6. Touch output tables, to update modification date
        # 7. Exit vtam if all variants delete
        #
        ############################################################################################

        DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
            engine=engine,
            variant_read_count_like_model=output_filter_renkonen_model)

        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(id=obj.id).update(
                {'id': obj.id})
            session.commit()

        if variant_read_count_delete_df.filter_delete.sum(
        ) == variant_read_count_delete_df.shape[0]:
            Logger.instance().warning(
                VTAMexception("This filter has deleted all the variants: {}. "
                              "The analysis will stop here.".format(
                                  self.__class__.__name__)))
            sys.exit(0)
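
The per run/marker loop above builds the pairs with drop_duplicates and then masks the frame with .loc; the same iteration can be written with a pandas groupby. A toy sketch with invented data, not VTAM code:

import pandas as pd

variant_read_count_df = pd.DataFrame({
    'run_id':     [1, 1, 1, 2],
    'marker_id':  [1, 1, 1, 1],
    'replicate':  [1, 2, 1, 1],
    'read_count': [10, 20, 5, 30],
})

parts = []
for (run_id, marker_id), per_run_marker_df in variant_read_count_df.groupby(['run_id', 'marker_id']):
    # apply the per run/marker filter here; the sketch just flags nothing for deletion
    per_run_marker_df = per_run_marker_df.copy()
    per_run_marker_df['filter_delete'] = False
    parts.append(per_run_marker_df)

variant_read_count_delete_df = pd.concat(parts, axis=0)
print(variant_read_count_delete_df)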
Example #11
    def main(fastainfo,
             fastadir,
             sorteddir,
             params=None,
             num_threads=multiprocessing.cpu_count()):

        if sys.platform.startswith('win'):
            num_threads = 1

        ############################################################################################
        #
        # params.yml parameters
        #
        ############################################################################################

        params_dic = FileParams(params).get_params_dic()

        cutadapt_error_rate = params_dic['cutadapt_error_rate']
        cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
        cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

        ############################################################################################
        #
        # Loop over tag and primer pairs to demultiplex and trim reads
        #
        ############################################################################################

        merged_fastainfo_df = FileSampleInformation(
            fastainfo).read_tsv_into_df()

        pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
        tempdir = PathManager.instance().get_tempdir()

        sorted_read_info_df = pandas.DataFrame()

        for i in range(0, merged_fastainfo_df.shape[0]):
            fasta_info_series = merged_fastainfo_df.iloc[i]

            tag_fwd = fasta_info_series.tagfwd
            tag_rev = fasta_info_series.tagrev
            primer_fwd = fasta_info_series.primerfwd
            primer_rev = fasta_info_series.primerrev
            in_fasta_basename = fasta_info_series.mergedfasta

            Logger.instance().debug(
                "Analysing FASTA file: {}".format(in_fasta_basename))

            fasta_info_df_i = fasta_info_series.to_frame().T
            in_raw_fasta_path = os.path.join(fastadir, in_fasta_basename)

            ########################################################################################
            #
            # Cut adapt tag of forward reads
            # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
            # --front 'tcgatcacgatgt;min_overlap=13...gctgtagatcgaca;min_overlap=14'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
            # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
            #
            ########################################################################################

            if generic_dna:  # Biopython <1.78
                tag_rev_rc = str(
                    Seq(tag_rev, generic_dna).reverse_complement())
            else:  # Biopython =>1.78
                tag_rev_rc = str(Seq(tag_rev).reverse_complement())

            out_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
                '.fasta', '_sorted_%03d.fasta' % i)
            out_fasta_path = os.path.join(tempdir, out_fasta_basename)

            cmd_cutadapt_tag_dic = {
                'tag_fwd': tag_fwd,
                'tag_fwd_len': len(tag_fwd),
                'tag_rev_rc': tag_rev_rc,
                'tag_rev_rc_len': len(tag_rev_rc),
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_fasta_path,
                'num_threads': num_threads,
            }

            cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
                '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)

            Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_tag_str
            else:
                args = shlex.split(cmd_cutadapt_tag_str)
            run_result = subprocess.run(args=args,
                                        capture_output=True,
                                        check=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ########################################################################################
            #
            # Trim primers from output
            # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only
            # --front 'TCCACTAATCACAARGATATTGGTAC;min_overlap=26...GGAGGATTTGGWAATTGATTAGTW;min_overlap=24'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_trimmed_000.fasta
            # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
            #
            ########################################################################################

            if generic_dna:  # Biopython <1.78
                primer_rev_rc = str(
                    Seq(primer_rev, generic_dna).reverse_complement())
            else:  # Biopython =>1.78
                primer_rev_rc = str(Seq(primer_rev).reverse_complement())

            in_fasta_path = out_fasta_path
            out_fasta_basename = os.path.basename(in_fasta_path).replace(
                '_sorted_%03d.fasta' % i, '_sorted_trimmed_%03d.fasta' % i)
            out_fasta_path = os.path.join(tempdir, out_fasta_basename)

            cmd_cutadapt_primer_dic = {
                'primer_fwd': primer_fwd,
                'primer_fwd_len': len(primer_fwd),
                'primer_rev_rc': primer_rev_rc,
                'primer_rev_rc_len': len(primer_rev_rc),
                'in_fasta_path': in_fasta_path,
                'out_fasta': out_fasta_path,
                'error_rate': cutadapt_error_rate,
                'read_min_length': cutadapt_minimum_length,
                'read_max_length': cutadapt_maximum_length,
                'num_threads': num_threads,
            }

            cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                                      '--minimum-length {read_min_length} ' \
                                      '--maximum-length {read_max_length} --trimmed-only  ' \
                                      '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" '  \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)

            Logger.instance().debug(
                "Running: {}".format(cmd_cutadapt_primer_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_primer_str
            else:
                args = shlex.split(cmd_cutadapt_primer_str)
            run_result = subprocess.run(args=args, capture_output=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ########################################################################################
            #
            # Cut adapt tag of reverse-complement reads
            # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
            # --front 'tgtcgatctacagc;min_overlap=14...acatcgtgatcga;min_overlap=13'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
            # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
            #
            ########################################################################################

            if generic_dna:  # Biopython <1.78
                tag_fwd_rc = str(
                    Seq(tag_fwd, generic_dna).reverse_complement())
            else:  # Biopython =>1.78
                tag_fwd_rc = str(Seq(tag_fwd).reverse_complement())

            out_rc_fasta_basename = os.path.basename(
                in_raw_fasta_path).replace('.fasta',
                                           '_rc_sorted_%03d.fasta' % i)
            out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

            cmd_cutadapt_tag_dic = {
                'tag_fwd': tag_rev,
                'tag_fwd_len': len(tag_rev),
                'tag_rev_rc': tag_fwd_rc,
                'tag_rev_rc_len': len(tag_fwd_rc),
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_rc_fasta_path,
                'num_threads': num_threads,
            }

            cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
                '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)

            Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_tag_str
            else:
                args = shlex.split(cmd_cutadapt_tag_str)
            run_result = subprocess.run(args=args, capture_output=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ###################################################################
            #
            # Trim primers from output
            # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only
            # --front 'WACTAATCAATTWCCAAATCCTCC;min_overlap=24...GTACCAATATCYTTGTGATTAGTGGA;min_overlap=26'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_trimmed_000.fasta
            # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
            #
            ###################################################################

            if generic_dna:  # Biopython <1.78
                primer_fwd_rc = str(
                    Seq(primer_fwd, generic_dna).reverse_complement())
            else:  # Biopython =>1.78
                primer_fwd_rc = str(Seq(primer_fwd).reverse_complement())

            in_fasta_path = out_rc_fasta_path
            out_rc_fasta_basename = os.path.basename(in_fasta_path).replace(
                '_rc_sorted_%03d.fasta' % i,
                '_rc_sorted_trimmed_%03d.fasta' % i)
            out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

            cmd_cutadapt_primer_dic = {
                'primer_fwd': primer_rev,
                'primer_fwd_len': len(primer_rev),
                'primer_rev_rc': primer_fwd_rc,
                'primer_rev_rc_len': len(primer_fwd_rc),
                'in_fasta_path': in_fasta_path,
                'out_fasta': out_rc_fasta_path,
                'error_rate': cutadapt_error_rate,
                'read_min_length': cutadapt_minimum_length,
                'read_max_length': cutadapt_maximum_length,
                'num_threads': num_threads,
            }
            cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                '--minimum-length {read_min_length} ' \
                '--maximum-length {read_max_length} --trimmed-only  ' \
                '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)

            Logger.instance().debug(
                "Running: {}".format(cmd_cutadapt_primer_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_primer_str
            else:
                args = shlex.split(cmd_cutadapt_primer_str)
            run_result = subprocess.run(args=args, capture_output=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ###################################################################
            #
            # Reverse complement back rc fasta and pool
            #
            ###################################################################

            out_final_fasta_basename = os.path.basename(
                in_raw_fasta_path).replace('.fasta', '_%03d.fasta' % i)
            out_final_fasta_path = os.path.join(sorteddir,
                                                out_final_fasta_basename)
            shutil.copy(out_fasta_path, out_final_fasta_path)

            Logger.instance().debug("Pooling fwd and rc reads...")
            with open(out_final_fasta_path, 'a') as fout:
                with open(out_rc_fasta_path, 'r') as fin:
                    for line in fin:
                        if not line.startswith('>'):

                            if generic_dna:  # Biopython <1.78
                                fout.write("%s\n" % str(
                                    Seq(line.strip(),
                                        generic_dna).reverse_complement()))
                            else:  # Biopython =>1.78
                                fout.write("%s\n" % str(
                                    Seq(line.strip()).reverse_complement()))

                        else:
                            fout.write(line)

            fasta_info_df_i = fasta_info_df_i[[
                'run', 'marker', 'sample', 'replicate'
            ]]
            fasta_info_df_i['sortedfasta'] = out_final_fasta_basename
            sorted_read_info_df = pandas.concat(
                [sorted_read_info_df, fasta_info_df_i], axis=0)

        fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
        sorted_read_info_df.to_csv(fasta_trimmed_info_tsv,
                                   sep="\t",
                                   header=True,
                                   index=False)
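
The Biopython version check above (generic_dna exists before 1.78 and is gone afterwards) is repeated for every tag and primer. A small helper of the same shape could centralise it; a sketch, assuming generic_dna is imported as in the excerpt and set to None when unavailable:

from Bio.Seq import Seq

try:  # Biopython <1.78
    from Bio.Alphabet import generic_dna
except ImportError:  # Bio.Alphabet was removed in Biopython 1.78
    generic_dna = None

def reverse_complement(sequence):
    """Return the reverse complement of a DNA string across Biopython versions."""
    if generic_dna:  # Biopython <1.78
        return str(Seq(sequence, generic_dna).reverse_complement())
    return str(Seq(sequence).reverse_complement())  # Biopython >=1.78

# e.g. tag_rev_rc = reverse_complement(tag_rev)
print(reverse_complement("ACCGT"))  # ACGGT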
Example #12
    def run(self):

        session = self.session
        engine = session._session().get_bind()

        ############################################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        ############################################################################################

        # Input file paths
        fasta_info_tsv = self.input_file(FilterLFN.__input_file_sortedinfo)

        #
        # Input table models
        input_variant_read_count_model = self.input_table(
            FilterLFN.__input_table_variant_read_count)
        #
        # Output table models
        output_filter_lfn_model = self.output_table(
            FilterLFN.__output_table_filter_lfn)
        #
        # Options
        lfn_variant_cutoff = self.option("lfn_variant_cutoff")
        lfn_variant_specific_cutoff = self.option(
            "lfn_variant_specific_cutoff")
        lfn_variant_replicate_cutoff = self.option(
            "lfn_variant_replicate_cutoff")
        lfn_variant_replicate_specific_cutoff = self.option(
            "lfn_variant_replicate_specific_cutoff")
        lfn_sample_replicate_cutoff = self.option(
            "lfn_sample_replicate_cutoff")
        lfn_read_count_cutoff = self.option("lfn_read_count_cutoff")

        ############################################################################################
        #
        # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        # 3. Get nijk_df input
        #
        ############################################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)

        sample_info_tsv_obj.delete_from_db(
            engine=engine,
            variant_read_count_like_model=output_filter_lfn_model)

        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            variant_read_count_like_model=input_variant_read_count_model,
            engine=engine,
            filter_id=None)

        lfn_variant_specific_cutoff_df = None
        if (not (lfn_variant_cutoff is None)
            ) and pathlib.Path(lfn_variant_specific_cutoff).stat().st_size > 0:
            lfn_variant_specific_cutoff_df = FileCutoffSpecific(
                lfn_variant_specific_cutoff).to_identifier_df(
                    engine=engine, is_lfn_variant_replicate=False)

        lfn_variant_replicate_specific_cutoff_df = None
        if (not (lfn_variant_replicate_cutoff is None)) and pathlib.Path(
                lfn_variant_replicate_specific_cutoff).stat().st_size > 0:
            lfn_variant_replicate_specific_cutoff_df = FileCutoffSpecific(
                lfn_variant_replicate_specific_cutoff).to_identifier_df(
                    engine=engine, is_lfn_variant_replicate=True)

        ############################################################################################
        #
        # Create filter object and run it
        #
        ############################################################################################

        variant_read_count_delete_df = RunnerFilterLFN(
            variant_read_count_df).get_variant_read_count_delete_df(
                lfn_variant_cutoff=lfn_variant_cutoff,
                lfn_variant_specific_cutoff=lfn_variant_specific_cutoff_df,
                lfn_variant_replicate_cutoff=lfn_variant_replicate_cutoff,
                lfn_variant_replicate_specific_cutoff=
                lfn_variant_replicate_specific_cutoff_df,
                lfn_sample_replicate_cutoff=lfn_sample_replicate_cutoff,
                lfn_read_count_cutoff=lfn_read_count_cutoff)

        DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
            engine=engine,
            variant_read_count_like_model=output_filter_lfn_model)

        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(id=obj.id).update(
                {'id': obj.id})
            session.commit()

        if variant_read_count_delete_df.filter_delete.sum(
        ) == variant_read_count_delete_df.shape[0]:
            Logger.instance().warning(
                VTAMexception("This filter has deleted all the variants: {}. "
                              "The analysis will stop here.".format(
                                  self.__class__.__name__)))
            sys.exit(0)
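
The guards above only parse a variant-specific cutoff TSV when the corresponding cutoff option is set and the file is non-empty (stat().st_size > 0). A standalone pathlib sketch of that check, with a hypothetical file name:

import pathlib

cutoff_tsv = pathlib.Path("lfn_variant_specific_cutoff.tsv")  # hypothetical path

# stat() raises FileNotFoundError for a missing file, so check existence first
if cutoff_tsv.exists() and cutoff_tsv.stat().st_size > 0:
    print("non-empty cutoff file: parse it into a cutoff dataframe")
else:
    print("missing or empty cutoff file: keep the specific cutoffs at None")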
Example #13
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        #######################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        #######################################################################

        # Input file
        # sort_reads_tsv = self.input_file(VariantReadCount.__input_file_sort_reads)
        input_file_sortedinfo = self.input_file(
            VariantReadCount.__input_file_sortedinfo)
        #
        # Input table models
        run_model = self.input_table(VariantReadCount.__input_table_run)
        marker_model = self.input_table(VariantReadCount.__input_table_marker)
        sample_model = self.input_table(VariantReadCount.__input_table_sample)
        #
        # Output
        # Output table
        variant_model = self.output_table(
            VariantReadCount.__output_table_variant)
        variant_read_count_model = self.output_table(
            VariantReadCount.__output_table_variant_read_count)
        # Options
        read_dir = self.option("read_dir")
        global_read_count_cutoff = self.option("global_read_count_cutoff")

        #######################################################################
        #
        # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        # 3. Read tsv file with sorted reads
        # 4. Group by read sequence
        # 5. Delete variants if below global_read_count_cutoff
        # 6. Insert into Variant and DataframeVariantReadCountLike tables
        #
        #######################################################################

        #######################################################################
        #
        # 1. Read sample information to get run_id, marker_id, sample_id, replicate for current analysis
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Read sample information".format(
                __file__,
                inspect.currentframe().f_lineno))
        sortedinfo_df = pandas.read_csv(input_file_sortedinfo,
                                        sep="\t",
                                        header=0)
        sample_instance_list = []
        sortedinfo_df.columns = sortedinfo_df.columns.str.lower()

        for row in sortedinfo_df.itertuples():
            Logger.instance().debug(row)
            marker_name = row.marker
            run_name = row.run
            sample_name = row.sample
            replicate = row.replicate
            with engine.connect() as conn:
                # get run_id ###########
                stmt_select_run_id = select([
                    run_model.__table__.c.id
                ]).where(run_model.__table__.c.name == run_name)
                run_id = conn.execute(stmt_select_run_id).first()[0]
                # get marker_id ###########
                stmt_select_marker_id = select([
                    marker_model.__table__.c.id
                ]).where(marker_model.__table__.c.name == marker_name)
                marker_id = conn.execute(stmt_select_marker_id).first()[0]
                # get sample_id ###########
                stmt_select_sample_id = select([
                    sample_model.__table__.c.id
                ]).where(sample_model.__table__.c.name == sample_name)
                sample_id = conn.execute(stmt_select_sample_id).first()[0]
                # add this sample_instance ###########
                sample_instance_list.append({
                    'run_id': run_id,
                    'marker_id': marker_id,
                    'sample_id': sample_id,
                    'replicate': replicate
                })

        #######################################################################
        #
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Delete marker_name/run_name/sample/replicate".
            format(__file__,
                   inspect.currentframe().f_lineno))

        # Executing the DELETE with a list of dicts runs it once per entry
        # (executemany), binding run_id, marker_id, sample_id and replicate from each dict.
        with engine.connect() as conn:
            stmt_del = variant_read_count_model.__table__.delete()
            stmt_del = stmt_del.where(variant_read_count_model.__table__.c.
                                      run_id == bindparam('run_id'))
            stmt_del = stmt_del.where(variant_read_count_model.__table__.c.
                                      marker_id == bindparam('marker_id'))
            stmt_del = stmt_del.where(variant_read_count_model.__table__.c.
                                      sample_id == bindparam('sample_id'))
            stmt_del = stmt_del.where(variant_read_count_model.__table__.c.
                                      replicate == bindparam('replicate'))
            conn.execute(stmt_del, sample_instance_list)

        #######################################################################
        #
        # 3. Read tsv file with sorted reads
        #
        #######################################################################

        # fasta_info_obj = FastaInformationTSV(input_file_sortedinfo, engine=engine)
        # sample_info_ids_df = fasta_info_obj.get_ids_df()
        sample_info_tsv_obj = FileSampleInformation(
            tsv_path=input_file_sortedinfo)
        sample_info_ids_df = sample_info_tsv_obj.to_identifier_df(
            engine=engine)

        Logger.instance().debug(
            "file: {}; line: {}; Read demultiplexed FASTA files".format(
                __file__,
                inspect.currentframe().f_lineno))

        variant_read_count_df = pandas.DataFrame()

        for row in sample_info_ids_df.itertuples():
            run_id = row.run_id
            marker_id = row.marker_id
            sample_id = row.sample_id
            replicate = row.replicate
            read_fasta = row.sortedfasta

            Logger.instance().debug(
                "file: {}; line: {}; Read FASTA: {}".format(
                    __file__,
                    inspect.currentframe().f_lineno, read_fasta))

            read_fasta_path = os.path.join(read_dir, read_fasta)

            if os.path.exists(read_fasta_path):

                ####################################################################################
                #
                # Read FASTA
                #
                ####################################################################################

                sorted_read_list = VariantReadCount.get_sorted_read_list(
                    read_fasta_path, generic_dna)

                variant_read_count_df_sorted_i = pandas.DataFrame({
                    'run_id': [run_id] * len(sorted_read_list),
                    'marker_id': [marker_id] * len(sorted_read_list),
                    'sample_id': [sample_id] * len(sorted_read_list),
                    'replicate': [replicate] * len(sorted_read_list),
                    'read_sequence': sorted_read_list,
                    'read_count': [1] * len(sorted_read_list)
                })
                #  Compute read count
                variant_read_count_df_sorted_i = variant_read_count_df_sorted_i.groupby(
                    [
                        'run_id', 'marker_id', 'sample_id', 'replicate',
                        'read_sequence'
                    ]).sum().reset_index()

                #variant_read_count_df = variant_read_count_df.append(
                #    variant_read_count_df_sorted_i)
                variant_read_count_df = pandas.concat(
                    [variant_read_count_df, variant_read_count_df_sorted_i],
                    axis=0)

            else:
                Logger.instance().warning(
                    'This file {} does not exist'.format(read_fasta_path))

        #######################################################################
        #
        # 4. Group by read sequence to get variant_read_count per run_id, marker_id, sample_id and replicate
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Group by read sequence".format(
                __file__,
                inspect.currentframe().f_lineno))
        variant_read_count_df = variant_read_count_df.groupby(
            ['run_id', 'marker_id', 'sample_id', 'replicate',
             'read_sequence']).sum().reset_index()
        variant_read_count_df.rename(columns={'read_sequence': 'variant_id'},
                                     inplace=True)
        variant_read_count_df = variant_read_count_df.sort_values(
            by=variant_read_count_df.columns.tolist())

        #######################################################################
        #
        # 5. Remove variants whose total read count across all runs, markers, samples and replicates
        # is lower than the global_read_count_cutoff parameter
        #
        #######################################################################

        variant_read_count_like_df_obj = DataframeVariantReadCountLike(
            variant_read_count_df)
        Logger.instance().debug(
            "file: {}; line: {}; Remove variants with global read count lower than parameter 'global_read_count_cutoff'"
            .format(__file__,
                    inspect.currentframe().f_lineno))
        variant_read_count_df = variant_read_count_like_df_obj.filter_out_below_global_read_count_cutoff(
            global_read_count_cutoff=global_read_count_cutoff)
        variant_read_count_df.rename(
            columns={'variant_id': 'variant_sequence'}, inplace=True)

        #######################################################################
        #
        # 6. Insert into Variant and VariantReadCount tables
        #
        #######################################################################

        Logger.instance().debug("file: {}; line: {}; Insert variants".format(
            __file__,
            inspect.currentframe().f_lineno))
        variant_read_count_instance_list = []
        variant_read_count_df.sort_values(by=[
            'variant_sequence', 'run_id', 'marker_id', 'sample_id', 'replicate'
        ],
                                          inplace=True)
        variant_new_set = set()
        variant_new_instance_list = []
        with engine.connect() as conn:
            # Retrieve maximal variant id if possible
            select_variant_id_max = conn.execute(
                sqlalchemy.select([func.max(variant_model.__table__.c.id)
                                   ])).first()[0]
            if select_variant_id_max is None:
                select_variant_id_max = 0  # If no variants, then maximal variant id is 0
            for row in variant_read_count_df.itertuples():
                run_id = row.run_id
                marker_id = row.marker_id
                sample_id = row.sample_id
                replicate = row.replicate
                variant_sequence = row.variant_sequence
                read_count = row.read_count
                select_row = conn.execute(
                    sqlalchemy.select([
                        variant_model.__table__.c.id
                    ]).where(variant_model.__table__.c.sequence ==
                             variant_sequence)).first()
                if select_row is None:  # variant_sequence IS NOT in the database, so will INSERT it
                    if variant_sequence not in variant_new_set:
                        variant_id = select_variant_id_max + \
                            len(variant_new_instance_list) + 1
                        variant_new_set.add(variant_sequence)
                        variant_new_instance_list.append({
                            'id': variant_id,
                            'sequence': variant_sequence
                        })
                else:  # variant_sequence IS in the database
                    variant_id = select_row[0]
                variant_read_count_instance_list.append({
                    'run_id': run_id,
                    'marker_id': marker_id,
                    'variant_id': variant_id,
                    'sample_id': sample_id,
                    'replicate': replicate,
                    'read_count': read_count
                })

        #######################################################################
        #
        # Exit if variant_read_count_instance_list empty
        #
        #######################################################################

        if not len(variant_read_count_instance_list):
            Logger.instance().warning(
                VTAMexception(
                    "No new variants in these samples: {}. Maybe singletons? "
                    "The analysis will stop here.".format(self.__class__.__name__)))
            sys.exit(0)

        #######################################################################
        #
        # Write variant_read_count table
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {};  Insert variant read count".format(
                __file__,
                inspect.currentframe().f_lineno))

        with engine.connect() as conn:

            # Insert if there are new variants
            if len(variant_new_instance_list) > 0:
                conn.execute(variant_model.__table__.insert(),
                             variant_new_instance_list)

            # Insert new variant_read_count_instances
            conn.execute(variant_read_count_model.__table__.insert(),
                         variant_read_count_instance_list)

        #######################################################################
        #
        # Touch output tables, to update modification date
        #
        #######################################################################

        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(id=obj.id).update(
                {'id': obj.id})
            session.commit()
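
In step 2 of the example above, passing a list of parameter dictionaries to conn.execute() together with bindparam() placeholders runs the DELETE once per dictionary (executemany style). Below is a minimal, self-contained sketch of that pattern, assuming SQLAlchemy 1.4+ and an in-memory SQLite database; the table and column names are illustrative only.

from sqlalchemy import Column, Integer, MetaData, Table, bindparam, create_engine

engine = create_engine("sqlite://")
metadata = MetaData()
nijk_table = Table(
    "variant_read_count", metadata,
    Column("run_id", Integer), Column("marker_id", Integer),
    Column("sample_id", Integer), Column("replicate", Integer),
    Column("read_count", Integer))
metadata.create_all(engine)

# one bound parameter per key of the dictionaries passed at execution time
stmt_del = nijk_table.delete() \
    .where(nijk_table.c.run_id == bindparam("run_id")) \
    .where(nijk_table.c.sample_id == bindparam("sample_id")) \
    .where(nijk_table.c.replicate == bindparam("replicate"))

with engine.begin() as conn:
    conn.execute(nijk_table.insert(), [
        {"run_id": 1, "marker_id": 1, "sample_id": 1, "replicate": 1, "read_count": 10},
        {"run_id": 1, "marker_id": 1, "sample_id": 2, "replicate": 1, "read_count": 7},
    ])
    # executemany: the DELETE is applied once per dictionary in the list
    conn.execute(stmt_del, [{"run_id": 1, "sample_id": 1, "replicate": 1}])
    print(conn.execute(nijk_table.select()).fetchall())  # only sample_id 2 remains
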
Beispiel #14
0
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        #######################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        #######################################################################

        # Input file paths
        fasta_info_tsv = self.input_file(FilterChimera.__input_file_sortedinfo)
        #
        # Input table models
        # Variant = self.input_table(FilterChimera.__input_table_Variant)
        input_filter_pcr_error_model = self.input_table(
            FilterChimera.__input_table_filter_pcr_error)
        #
        # Output table models
        output_filter_chimera_model = self.output_table(
            FilterChimera.__output_table_filter_chimera)
        output_filter_borderline_model = self.output_table(
            FilterChimera.__output_table_filter_chimera_borderline)
        #
        # Params
        uchime3_denovo_abskew = self.option("uchime3_denovo_abskew")

        #######################################################################
        #
        # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        # 3. Get nijk_df input
        #
        #######################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)

        sample_info_tsv_obj.delete_from_db(
            engine=engine,
            variant_read_count_like_model=output_filter_chimera_model)

        sample_info_tsv_obj.delete_from_db(
            engine=engine,
            variant_read_count_like_model=output_filter_borderline_model)

        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            variant_read_count_like_model=input_filter_pcr_error_model,
            engine=engine,
            filter_id=None)

        #######################################################################
        #
        # 4. Run Filter
        #
        #######################################################################

        variant_df = sample_info_tsv_obj.get_variant_df(
            variant_read_count_like_model=input_filter_pcr_error_model,
            engine=engine)
        filter_chimera_runner = RunnerFilterChimera(
            variant_read_count_df=variant_read_count_df)
        filter_output_chimera_df, filter_borderline_output_df = \
            filter_chimera_runner.get_variant_read_count_delete_df(
                variant_df=variant_df, uchime3_denovo_abskew=uchime3_denovo_abskew)

        #######################################################################
        #
        # 5. Write to DB
        # 6. Touch output tables, to update modification date
        # 7. Exit vtam if all variants are deleted
        #
        #######################################################################

        DataframeVariantReadCountLike(filter_output_chimera_df).to_sql(
            engine=engine,
            variant_read_count_like_model=output_filter_chimera_model)

        DataframeVariantReadCountLike(filter_borderline_output_df).to_sql(
            engine=engine,
            variant_read_count_like_model=output_filter_borderline_model)

        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(id=obj.id).update(
                {'id': obj.id})
            session.commit()

        if (filter_output_chimera_df.filter_delete.sum()
                == filter_output_chimera_df.shape[0]):
            Logger.instance().warning(
                VTAMexception("This filter has deleted all the variants: {}. "
                              "The analysis will stop here.".format(
                                  self.__class__.__name__)))
            sys.exit(0)
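
The exit guard at the end of Beispiel #14 works because summing a 0/1 (or boolean) filter_delete column equals the row count exactly when every occurrence is flagged. A tiny standalone illustration with made-up values:

import pandas

filter_output_df = pandas.DataFrame({
    "variant_id": [1, 2, 3],
    "read_count": [120, 4, 7],
    "filter_delete": [1, 1, 1],  # 1 means the occurrence is discarded by the filter
})

# True only when every single row is flagged for deletion
all_deleted = filter_output_df.filter_delete.sum() == filter_output_df.shape[0]
print(all_deleted)  # True
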
Beispiel #15
0
    def add_parser_filter(cls, subparsers):
        parser_vtam_filter = subparsers.add_parser(
            'filter',
            add_help=True,
            parents=[
                cls.parser_params, cls.parser_log, cls.parser_threads,
                cls.parser_verbosity, cls.parser_wopmars_db,
                cls.parser_wopmars_dryrun, cls.parser_wopmars_forceall
            ],
            help=
            "filters out sequence artifacts and creates an amplicon sequence variant (ASV) table."
        )

        parser_vtam_filter.add_argument(
            '--sortedinfo',
            action='store',
            help=
            "input TSV file with information about FASTA files containing sorted reads",
            required=True,
            type=lambda x: FileSampleInformation(x).check_args(
                header=header_sortedread_fasta))
        parser_vtam_filter.add_argument(
            '--sorteddir',
            action='store',
            help=
            "input directory with sorted (Trimmed and demultiplexed) FASTA files",
            required=True,
            type=ArgParserChecker.check_dir_exists_and_is_nonempty)
        parser_vtam_filter.add_argument(
            '--asvtable',
            action='store',
            help=
            "output TSV file for the amplicon sequence variants (ASV) table",
            required=True)

        parser_vtam_filter.add_argument(
            '--cutoff_specific',
            dest='cutoff_specific',
            default=None,
            action='store',
            required=False,
            help=
            "TSV file with variant (col1: variant; col2: cutoff) or variant-replicate "
            "(col1: variant; col2: replicate; col3: cutoff)specific cutoffs",
            type=lambda x: FileCutoffSpecific(x).argparse_checker())

        parser_vtam_filter.add_argument(
            '--lfn_variant_replicate',
            action='store_true',
            help=
            "if set, VTAM will run the algorithm for the low frequency noise over variant and replicates",
            required=False,
            default=False)

        parser_vtam_filter.add_argument(
            '--known_occurrences',
            action='store',
            help="TSV file with expected (keep) occurrences",
            required=False,
            type=lambda x: FileKnownOccurrences(
                x).argparse_checker_known_occurrences())

        parser_vtam_filter.add_argument(
            '-U',
            '--until',
            dest='until',
            action='store',
            default=None,
            help=
            """execute '%(prog)s' UNTIL one rule, where the rule order looks like:            
1. SampleInformation, 2. VariantReadCount, 3. FilterLFN, 4. FilterMinReplicateNumber, 5. FilterPCRerror, 6. FilterChimera, 7. FilterMinReplicateNumber2, 8. FilterRenkonen, 9. FilterMinReplicateNumber3, 10. FilterIndel, 11. FilterCodonStop, 12. ReadCountAverageOverReplicates, 13. MakeAsvTable""",
            required=False)

        parser_vtam_filter.add_argument(
            '-S',
            '--since',
            dest='since',
            action='store',
            default=None,
            help=
            """execute '%(prog)s' SINCE one rule, where the rule order looks like:
            1. SampleInformation, 2. VariantReadCount, 3. FilterLFN, 4. FilterMinReplicateNumber, 5. FilterPCRerror, 6. FilterChimera, 7. FilterMinReplicateNumber2, 8. FilterRenkonen, 9. FilterMinReplicateNumber3, 10. FilterIndel, 11. FilterCodonStop, 12. ReadCountAverageOverReplicates, 13. MakeAsvTable""",
            required=False)

        # This attribute will trigger the good command
        parser_vtam_filter.set_defaults(command='filter')
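
The type= arguments in Beispiel #15 are callables that argparse invokes with the raw string; each must return the validated value or raise, and argparse turns the error into a clean usage message. A minimal sketch of that pattern, with a hypothetical checker standing in for FileSampleInformation(x).check_args():

import argparse
import os


def check_tsv_exists(path):
    """Hypothetical checker: argparse passes the raw string and expects the
    validated value back, or an argparse.ArgumentTypeError."""
    if not os.path.isfile(path):
        raise argparse.ArgumentTypeError("{} is not an existing file".format(path))
    return path


parser = argparse.ArgumentParser()
parser.add_argument("--sortedinfo", required=True, type=check_tsv_exists)
# parser.parse_args(["--sortedinfo", "sortedinfo.tsv"])  # exits with an error if missing
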
Beispiel #16
0
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        ##########################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        ##########################################################
        #
        # Input file paths
        fasta_info_tsv = self.input_file(FilterCodonStop.__input_file_sortedinfo)
        #
        # Input table models
        input_filter_indel_model = self.input_table(
            FilterCodonStop.__input_table_filter_indel)
        #
        # Options
        genetic_code = int(self.option("genetic_code"))
        skip_filter_codon_stop = bool(int(self.option("skip_filter_codon_stop")))
        #
        # Output table models
        output_filter_codon_stop_model = self.output_table(
            FilterCodonStop.__output_table_filter_codon_stop)

        #######################################################################
        #
        # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        # 3. Get nijk_df input
        #
        #######################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)

        sample_info_tsv_obj.delete_from_db(
            engine=engine, variant_read_count_like_model=output_filter_codon_stop_model)

        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            variant_read_count_like_model=input_filter_indel_model, engine=engine, filter_id=None)

        #######################################################################
        #
        # 4. Run Filter
        #
        #######################################################################

        variant_df = sample_info_tsv_obj.get_variant_df(
            variant_read_count_like_model=input_filter_indel_model, engine=engine)
        variant_read_count_delete_df = RunnerFilterCodonStop(
            variant_read_count_df=variant_read_count_df).get_variant_read_count_delete_df(
            variant_df=variant_df,
            genetic_code=genetic_code,
            skip_filter_codon_stop=skip_filter_codon_stop)

        #######################################################################
        #
        # 5. Write to DB
        # 6. Touch output tables, to update modification date
        # 7. Exit vtam if all variants are deleted
        #
        #######################################################################

        DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
            engine=engine, variant_read_count_like_model=output_filter_codon_stop_model)

        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(
                id=obj.id).update({'id': obj.id})
            session.commit()

        if (variant_read_count_delete_df.filter_delete.sum()
                == variant_read_count_delete_df.shape[0]):
            Logger.instance().warning(
                VTAMexception(
                    "This filter has deleted all the variants: {}. "
                    "The analysis will stop here.".format(
                        self.__class__.__name__)))
            sys.exit(0)
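
For context on Beispiel #16: a codon-stop filter typically translates each variant in the forward reading frames and flags it when no frame is free of stop codons. The sketch below shows only that general idea and is not claimed to match VTAM's exact rule; the default genetic_code of 5 (invertebrate mitochondrial) is just an example value.

from Bio.Seq import Seq


def has_stop_in_all_frames(sequence, genetic_code=5):
    """Return True when every forward reading frame contains a stop codon."""
    for frame in range(3):
        sub = sequence[frame:]
        sub = sub[:len(sub) - len(sub) % 3]  # trim to a whole number of codons
        if "*" not in str(Seq(sub).translate(table=genetic_code)):
            return False  # at least one open reading frame: keep the variant
    return True


print(has_stop_in_all_frames("TAAATAAATAAA"))  # True: a stop codon in every frame
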
Beispiel #17
0
    def main(fastainfo, fastadir, sorteddir, params=None, num_threads=multiprocessing.cpu_count(), 
        no_reverse=False, tag_to_end=False, primer_to_end=False):
        
        Logger.instance().info(f"OPTIONS:\n no_reverse: {not no_reverse} \n tag_to_end {not tag_to_end} \n primer_to_end {not primer_to_end}")

        if sys.platform.startswith('win'):
            num_threads = 1

        ############################################################################################
        #
        # params.yml parameters
        #
        ############################################################################################

        params_dic = FileParams(params).get_params_dic()

        cutadapt_error_rate = params_dic['cutadapt_error_rate']
        cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
        cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

        ############################################################################################
        #
        # Loop over tag and primer pairs to demultiplex and trim reads
        #
        ############################################################################################

        merged_fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df()
        
        pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
        tempdir = PathManager.instance().get_tempdir()

        merged_fasta_list = []
        results_list = []
        sample_info = {}

        # make sure every file is analysed once.
        for i in range(merged_fastainfo_df.shape[0]):
            if merged_fastainfo_df.iloc[i].mergedfasta not in merged_fasta_list:
                merged_fasta_list.append(merged_fastainfo_df.iloc[i].mergedfasta)
            
        for mergedfasta in merged_fasta_list:

            inputFiles = FilesInputCutadapt(fastainfo, mergedfasta, no_reverse, tag_to_end)
            
            tagFile_path = inputFiles.tags_file()
            info = inputFiles.get_df_info()

            for key in info.keys():
                if key in sample_info.keys():
                    sample_info[key] = sample_info[key] + info[key]
                else:
                    sample_info[key] = info[key]

            Logger.instance().debug("Analysing FASTA file: {}".format(mergedfasta))

            in_raw_fasta_path = os.path.join(fastadir, mergedfasta)

            ########################################################################################
            #
            #   cutadapt --cores=0 -e 0 --no-indels --trimmed-only -g tagFile:$tagfile 
            #   --overlap length -o "tagtrimmed.{name}.fasta" in_raw_fasta_path
            #
            ########################################################################################

            base = os.path.basename(in_raw_fasta_path)
            base, base_suffix = base.split('.', 1)
            
            out_fasta_path = os.path.join(tempdir, "sorted") 

            cmd_cutadapt_tag_dic = {
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_fasta_path,
                'num_threads': num_threads,
                'tagFile': tagFile_path,
                'base_suffix': base_suffix,
            }

            cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
                '-g file:{tagFile} --output {out_fasta}_{{name}}.{base_suffix} {in_fasta_path}' \
                .format(**cmd_cutadapt_tag_dic)

            Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_tag_str
            else:
                args = shlex.split(cmd_cutadapt_tag_str)
            run_result = subprocess.run(args=args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

            Logger.instance().info(run_result.stdout.decode())

            inputFiles.remove_tags_file()

            ########################################################################################
            #
            # Trim primers from output
            # cutadapt --quiet --cores=0 -e trim_error --no-indels --trimmed-only 
            # --minimum-length minimum_length --maximum-length maximum_length 
            # --output input_path + {name} + suffix outputfile
            #
            ########################################################################################
            
            primers = inputFiles.primers()
            try:
                tags_samples = inputFiles.get_sample_names()
            except Exception as e:
                Logger.instance().error(e)
                return 
            
            for primer in primers:
                
                marker, primerfwd, primerrev, lenprimerfwd, lenprimerrev = primer

                for tag_sample in tags_samples:

                    name, run, marker2, sample, replicate, _, _ = tag_sample
                    
                    if marker not in marker2:
                        continue

                    in_fasta_path = out_fasta_path + "_" + name + "." + base_suffix

                    baseMerge =  mergedfasta.split(".")[0]
                                        
                    outname = run + "_" + marker + "_" + sample + "_" + replicate + "_" + baseMerge + "_trimmed"
                    if name.endswith("_reversed"):
                        outname = outname + "_reversed"
                    out_fasta_path_new = os.path.join(tempdir, outname + "." + base_suffix)

                    results_list.append(out_fasta_path_new)
                    
                    if not "_reversed" in name:
                        if generic_dna:  # Biopython <1.78
                            primerRev = str(Seq(primerrev, generic_dna).reverse_complement())
                        else:  # Biopython =>1.78
                            primerRev = str(Seq(primerrev).reverse_complement())
                        primerFwd = primerfwd
                        lenPrimerFwd = lenprimerfwd
                        lenPrimerRev = lenprimerrev
                    else:
                        if generic_dna:  # Biopython <1.78
                            primerRev = str(Seq(primerfwd, generic_dna).reverse_complement())
                        else:  # Biopython =>1.78
                            primerRev = str(Seq(primerfwd).reverse_complement())
                        primerFwd = primerrev
                        lenPrimerFwd = lenprimerrev
                        lenPrimerRev = lenprimerfwd


                    cmd_cutadapt_primer_dic = {
                        'in_fasta_path': in_fasta_path,
                        'out_fasta': out_fasta_path_new,
                        'error_rate': cutadapt_error_rate,
                        'num_threads': num_threads,
                        'primerFwd': primerFwd,
                        'primerRev': primerRev,
                        'lenPrimerFwd': lenPrimerFwd,
                        'lenPrimerRev': lenPrimerRev,
                        'read_min_length': cutadapt_minimum_length,
                        'read_max_length': cutadapt_maximum_length,
                    }

                    if not primer_to_end:  # default: require both primers anchored at the read ends
                        cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                            '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                            '--trimmed-only -g "^{primerFwd}...{primerRev}$" --output {out_fasta} {in_fasta_path}'\
                            .format(**cmd_cutadapt_primer_dic)
                    else:
                        cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                            '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                            '--trimmed-only -g "{primerFwd};min_overlap={lenPrimerFwd}...{primerRev};min_overlap={lenPrimerRev}" '\
                            '--output {out_fasta} {in_fasta_path}'\
                            .format(**cmd_cutadapt_primer_dic)

                    Logger.instance().debug("Running: {}".format(cmd_cutadapt_primer_str))

                    if sys.platform.startswith("win"):
                        args = cmd_cutadapt_primer_str
                    else:
                        args = shlex.split(cmd_cutadapt_primer_str)

                    run_result = subprocess.run(args=args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

                    Logger.instance().info(run_result.stdout.decode())

        ###################################################################
        #
        # Reverse complement back rc fasta and pool
        #
        ###################################################################   
     
        for file in results_list:
            if "_trimmed" in file:

                out_final_fasta_path = os.path.join(sorteddir, os.path.split(file)[-1])
                in_fasta_path = os.path.join(tempdir, file)

                if out_final_fasta_path.endswith(".gz"):      
                    _open = partial(gzip.open) 
                elif out_final_fasta_path.endswith(".bz2"):
                    _open = partial(bz2.open)
                else:
                    _open = open

                if in_fasta_path.endswith(".gz"):
                    _open2 = partial(gzip.open) 
                elif in_fasta_path.endswith(".bz2"):
                    _open2 = partial(bz2.open) 
                else: 
                    _open2 = open

                if "_reversed" in file:
                    Logger.instance().debug("Pooling fwd and rc reads...")

                    out_final_fasta_path = out_final_fasta_path.replace("_reversed", "")

                    with _open(out_final_fasta_path, 'at') as fout:
                        with _open2(in_fasta_path, 'rt') as fin:
                            for line in fin.readlines():
                                if not line.startswith('>'):
                                    if generic_dna:  # Biopython <1.78
                                        fout.write("%s\n" % str(
                                            Seq(line.strip(), generic_dna).reverse_complement()))
                                    else:  # Biopython =>1.78
                                        fout.write("%s\n" % str(
                                            Seq(line.strip()).reverse_complement()))

                                else:
                                    fout.write(line)
                else:
                    with _open(out_final_fasta_path, 'at') as fout:
                        with _open2(in_fasta_path, 'rt') as fin:
                            for line in fin.readlines():
                                fout.write(line)
        
        results_list = [os.path.split(result)[-1] for result in results_list if "_reversed" not in result]

        del sample_info['mergedfasta']
        del sample_info['primerrev']
        del sample_info['primerfwd']
        del sample_info['tagrev']
        del sample_info['tagfwd']

        sample_info['sortedfasta'] = results_list

        sample_info_df = pandas.DataFrame(sample_info)

        fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
        sample_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t", header=True, index=False)
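
One detail worth noting in Beispiel #17: the demultiplexing command keeps cutadapt's {name} placeholder by doubling the braces, so Python's str.format() emits it literally and cutadapt later expands it into one output file per tag. A quick sketch of that escaping (file names are made up):

# doubled braces survive str.format() as a literal {name} for cutadapt to expand
template = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
    '-g file:{tagFile} --output {out_fasta}_{{name}}.{base_suffix} {in_fasta_path}'

print(template.format(num_threads=4, tagFile='tags.fasta', out_fasta='/tmp/sorted',
                      base_suffix='fasta', in_fasta_path='run1.fasta'))
# cutadapt --cores=4 --no-indels --error-rate 0 --trimmed-only -g file:tags.fasta \
#     --output /tmp/sorted_{name}.fasta run1.fasta
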
Beispiel #18
0
    def main(fastadir, random_seqdir, fastainfo, random_seqinfo, samplesize):

        if not os.path.isdir(fastadir) or not os.listdir(fastadir):
            Logger.instance().error(f"{fastadir} is empty or does not exists!")
            return

        fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df()
        input_files = fastainfo_df.to_dict(orient='list')['mergedfasta']

        fastadir_path = os.path.abspath(fastadir)

        # check that samplesize does not exceed the size of any FASTA file in fastadir
        files_size = {}

        for input_file in input_files:

            file_path = os.path.join(fastadir_path, input_file)

            line_counter = LineCounter(file_path)
            file_size = line_counter.sequence_counter()

            files_size[input_file] = file_size

        smallest = min(files_size.values())

        if smallest < samplesize:
            Logger.instance().error(
                f"The smallest file in fastadir has {smallest} sequences.\nSamplesize cannot exceed this number of sequences"
            )
            return

        ###################################################################
        #
        # Make the random files
        #
        ###################################################################

        # create output folder
        pathlib.Path(random_seqdir).mkdir(parents=True, exist_ok=True)

        output_files = []
        input_files_no_repeat = []

        # create the list of output file names for the info file and the list of files to sample (no duplicates)
        for input_file in input_files:
            base, ext = input_file.split(".", 1)
            output_files.append(base + "_sampled." + ext)
            if input_file not in input_files_no_repeat:
                input_files_no_repeat.append(input_file)

        for input_file in input_files_no_repeat:

            # draw unique random sequence indexes (0-based, without replacement)
            lines = set()
            while len(lines) < samplesize:
                lines.add(randint(0, files_size[input_file] - 1))

            lines = sorted(lines)

            # make output file path
            base, ext = input_file.split(".", 1)
            output_file = os.path.join(random_seqdir, base + "_sampled." + ext)

            #if the file already exists delete it
            if os.path.exists(output_file):
                os.remove(output_file)

            # check extension
            if input_file.endswith(".gz"):
                _open = partial(gzip.open)
            elif input_file.endswith(".bz2"):
                _open = partial(bz2.open)
            else:
                _open = open

            input_file = os.path.join(fastadir_path, input_file)

            with _open(input_file, 'rb') as f_in:
                with _open(output_file, 'ab') as f_out:
                    countline = -1
                    countSelect = 0
                    for line in f_in:
                        if line.startswith(b">"):
                            countline += 1

                            if countSelect + 1 < len(
                                    lines) and countline == lines[countSelect +
                                                                  1]:
                                countSelect += 1
                        if countline == lines[countSelect]:
                            f_out.write(line)

                        if countline > lines[-1]:
                            break

        random_seqinfo_df = fastainfo_df.copy()
        random_seqinfo_df['mergedfasta'] = output_files
        random_seqinfo_df.to_csv(random_seqinfo,
                                 sep="\t",
                                 header=True,
                                 index=False)
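
An alternative to the randint loop in Beispiel #18: random.sample() draws unique indexes without replacement in one call, which keeps the selection logic short (assuming the sequence count fits in a range object, which it does in practice):

import random

n_sequences = 100_000   # e.g. the value stored in files_size[input_file]
samplesize = 5_000

picked = sorted(random.sample(range(n_sequences), samplesize))
print(len(picked), picked[:5])  # 5000 unique, sorted, 0-based indexes
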
Beispiel #19
0
    def add_parser_optimize(cls, subparsers):
        parser_vtam_optimize = subparsers.add_parser(
            'optimize',
            add_help=True,
            parents=[
                cls.parser_params, cls.parser_log, cls.parser_threads,
                cls.parser_verbosity, cls.parser_wopmars_db,
                cls.parser_wopmars_dryrun, cls.parser_wopmars_forceall
            ],
            help="finds out optimal parameters for filtering")

        parser_vtam_optimize.add_argument(
            '--sortedinfo',
            action='store',
            help=
            "input TSV file with information about FASTA files containing sorted (trimmed and demultiplexed) reads",
            required=True,
            type=lambda x: FileSampleInformation(x).check_args(
                header=header_sortedread_fasta))

        parser_vtam_optimize.add_argument(
            '--sorteddir',
            action='store',
            help=
            "input directory with sorted (Trimmed and demultiplexed) FASTA files",
            required=True,
            type=ArgParserChecker.check_dir_exists_and_is_nonempty)

        parser_vtam_optimize.add_argument('-o',
                                          '--outdir',
                                          action='store',
                                          help="output directory",
                                          default="out",
                                          required=True)

        parser_vtam_optimize.add_argument(
            '--known_occurrences',
            action='store',
            help="TSV file with known variants",
            required=True,
            type=lambda x: FileKnownOccurrences(
                x).argparse_checker_known_occurrences())

        parser_vtam_optimize.add_argument(
            '--lfn_variant_replicate',
            action='store_true',
            help=
            "if set, VTAM will run the algorithm for the low frequency noise over variant and replicates",
            required=False,
            default=False)

        parser_vtam_optimize.add_argument(
            '-U',
            '--until',
            dest='until',
            action='store',
            default=None,
            help=
            """executes '%(prog)s' UNTIL one rule, where the rules follow this order:
            1. SampleInformation, 2. VariantReadCount, 3. either OptimizeLFNsampleReplicate or OptimizePCRerror or OptimizeLFNreadCountAndLFNvariant""",
            required=False)
        parser_vtam_optimize.add_argument(
            '-S',
            '--since',
            dest='since',
            action='store',
            default=None,
            help=
            """executes '%(prog)s' SINCE one rule, where the rules follow this order: 
            1. SampleInformation, 2. VariantReadCount, 3. either OptimizeLFNsampleReplicate or OptimizePCRerror or OptimizeLFNreadCountAndLFNvariant""",
            required=False)

        # This attribute will trigger the good command
        parser_vtam_optimize.set_defaults(command='optimize')
Beispiel #20
0
    def main(fastqinfo, fastqdir, fastainfo, fastadir, params=None, num_threads=multiprocessing.cpu_count()):
        ############################################################################################
        #
        # params.yml parameters
        #
        ############################################################################################

        params_dic = FileParams(params).get_params_dic()

        ############################################################################################
        #
        # Read fastq information into stats_df
        #
        ############################################################################################

        fastqinfo_df = FileSampleInformation(fastqinfo).read_tsv_into_df()

        pathlib.Path(
            os.path.dirname(fastainfo)).mkdir(
            parents=True,
            exist_ok=True)
        pathlib.Path(fastadir).mkdir(parents=True, exist_ok=True)

        fastainfo_df = pandas.DataFrame()

        ############################################################################################
        #
        # Loop over fastq pairs to merge
        #
        ############################################################################################

        # File with analysis stats data
        stats_df = pandas.DataFrame({'FastqFwd': [], 'FastqRev': [], 'NbReadsFwd': [], 'NbReadsRev': [], 'FastaMerged': [], 'NbMergedReads': []})

        for fastqfwd, fastqrev in fastqinfo_df[[
                'fastqfwd', 'fastqrev']].drop_duplicates().values:

            fastq_info_df_i = fastqinfo_df.loc[(fastqinfo_df.fastqfwd == fastqfwd) & (
                fastqinfo_df.fastqrev == fastqrev)]

            fastq_fw_abspath = os.path.join(fastqdir, fastqfwd)
            with open(fastq_fw_abspath, 'rb') as fin:
                fastq_fw_linecount = int(sum(1 for i in fin) / 4)

            fastq_rv_abspath = os.path.join(fastqdir, fastqrev)
            with open(fastq_rv_abspath, 'rb') as fin:
                fastq_rv_linecount = int(sum(1 for i in fin) / 4)

            Logger.instance().debug(
                "Analysing FASTQ files: {} and {}".format(
                    fastqfwd, fastqrev))

            try:
                pathlib.Path(fastq_fw_abspath).resolve(strict=True)
            except FileNotFoundError:
                Logger.instance().error(
                    VTAMexception(
                        "VTAMexception: This FASTQ file was not found: {}.".format(fastq_fw_abspath)))
                sys.exit(1)
            try:
                pathlib.Path(fastq_rv_abspath).resolve(strict=True)
            except FileNotFoundError:
                Logger.instance().error(
                    VTAMexception(
                        "VTAMexception: This FASTQ file was not found: {}.".format(fastq_rv_abspath)))
                sys.exit(1)

            fasta_merged_basename = os.path.basename(
                fastq_fw_abspath).replace('.fastq', '.fasta')
            out_fasta_path = os.path.join(fastadir, fasta_merged_basename)

            ########################################################################################
            #
            # Run vsearch merge
            #
            ########################################################################################

            vsearch_args_dic = {}

            vsearch_args_dic['fastq_ascii'] = params_dic['fastq_ascii']
            vsearch_args_dic['fastq_maxee'] = params_dic['fastq_maxee']
            vsearch_args_dic['fastq_maxmergelen'] = params_dic['fastq_maxmergelen']
            vsearch_args_dic['fastq_maxns'] = params_dic['fastq_maxns']
            vsearch_args_dic['fastq_minlen'] = params_dic['fastq_minlen']
            vsearch_args_dic['fastq_minmergelen'] = params_dic['fastq_minmergelen']
            vsearch_args_dic['fastq_minovlen'] = params_dic['fastq_minovlen']
            vsearch_args_dic['fastq_truncqual'] = params_dic['fastq_truncqual']

            vsearch_args_dic['fastq_mergepairs'] = fastq_fw_abspath
            vsearch_args_dic['reverse'] = fastq_rv_abspath
            vsearch_args_dic['fastaout'] = out_fasta_path
            vsearch_args_dic['threads'] = num_threads

            vsearch_cluster = RunnerVSearch(parameters=vsearch_args_dic)
            vsearch_cluster.run()

            fastq_info_df_i = fastq_info_df_i[['run', 'marker', 'sample', 'replicate', 'tagfwd',
                                               'primerfwd', 'tagrev', 'primerrev']]
            fastq_info_df_i['mergedfasta'] = fasta_merged_basename
            fastainfo_df = pandas.concat(
                [fastainfo_df, fastq_info_df_i], axis=0)

            with open(out_fasta_path, 'rb') as fin:
                fasta_merged_linecount = int(sum(1 for i in fin) / 4)

            ########################################################################################
            #
            # Summary file
            #
            ########################################################################################

            stats_df = pandas.concat([stats_df, pandas.DataFrame({
                'FastqFwd': [fastq_fw_abspath], 'FastqRev': [fastq_rv_abspath],
                'NbReadsFwd': [fastq_fw_linecount], 'NbReadsRev': [fastq_rv_linecount],
                'FastaMerged': [out_fasta_path], 'NbMergedReads': [fasta_merged_linecount]})])
    
        for mergedfasta in fastainfo_df[['mergedfasta']].drop_duplicates().values:
            mergedfasta = mergedfasta[0]

            if mergedfasta.endswith('.bz2') or  mergedfasta.endswith('.gz'):
                fasta_merged_abspath = os.path.join(fastadir, mergedfasta)
                mergedfasta_compressor = FileCompression(fasta_merged_abspath)
            
                if mergedfasta.endswith('.gz'):
                    mergedfasta_c = mergedfasta_compressor.pigz_compression()
                    if mergedfasta_c is None:
                        mergedfasta_c = mergedfasta_compressor.gzip_compression()

                    
                elif mergedfasta.endswith('.bz2'):
                    mergedfasta_c = mergedfasta_compressor.bz2_compression()
                    
                mergedfasta_compressor.delete_file()
                _, relPath = os.path.split(mergedfasta_c)
                fastainfo_df.loc[fastainfo_df['mergedfasta'] == mergedfasta, 'mergedfasta'] = relPath
                
            else: 
                fastq_info_df_i['mergedfasta'] = fasta_merged_basename

        
        fastainfo_df.to_csv(fastainfo, sep="\t", header=True, index=False)
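
Beispiel #20 counts FASTQ reads as the number of lines divided by four. A small helper that makes the convention explicit; it assumes plain, uncompressed, strictly 4-line-per-record FASTQ (gzip or bz2 input would need the matching open function), and the path in the usage line is hypothetical:

def count_fastq_reads(path):
    """Count FASTQ records as line_count // 4 (plain, 4-line-per-record files only)."""
    with open(path, 'rb') as fin:
        return sum(1 for _ in fin) // 4


# usage sketch with a hypothetical path
# n_reads = count_fastq_reads('/path/to/sample_fw.fastq')
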
Beispiel #21
0
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        #######################################################################
        #
        # 1. Wrapper inputs, outputs and parameters
        #
        #######################################################################

        # Input file
        fasta_info_tsv = self.input_file(MakeAsvTable.__input_file_sortedinfo)

        # Output file
        asvtable_tsv_path = self.output_file(MakeAsvTable.__output_table_asv)
        #
        # Options
        cluster_identity = float(self.option("cluster_identity"))
        known_occurrences_tsv = str(self.option("known_occurrences"))

        #######################################################################
        #
        # Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # Compute variant_read_count_input_df and other dfs for the asv_table_runner
        #
        #######################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)

        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            FilterCodonStop, engine=engine)

        ############################################################################################
        #
        # FileKnownOccurrences
        #
        ############################################################################################

        if known_occurrences_tsv == 'None' or known_occurrences_tsv is None:
            known_occurrences_df = None
        else:
            known_occurrences_df = FileKnownOccurrences(
                known_occurrences_tsv).to_identifier_df(engine)
            known_occurrences_df = known_occurrences_df.loc[
                (known_occurrences_df.mock == 1) &
                (known_occurrences_df.action == 'keep'), ]

        #######################################################################
        #
        # Compute variant_to_chimera_borderline_df
        #
        #######################################################################

        sample_list = sample_info_tsv_obj.read_tsv_into_df()[
            'sample'].drop_duplicates(keep='first').tolist()
        asvtable_runner = RunnerAsvTable(
            variant_read_count_df=variant_read_count_df,
            engine=engine,
            sample_list=sample_list,
            cluster_identity=cluster_identity,
            known_occurrences_df=known_occurrences_df)
        asvtable_runner.to_tsv(asvtable_tsv_path)
Beispiel #22
0
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                     os.path.basename(__file__))
        pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

        ############################################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        ############################################################################################
        #
        # Input file paths
        fasta_info_tsv = self.input_file(
            FilterPCRerror.__input_file_sortedinfo)
        #
        # Input table models
        input_filter_min_replicate_model = self.input_table(
            FilterPCRerror.__input_table_filter_min_replicate_number)
        #
        # Options
        pcr_error_var_prop = self.option("pcr_error_var_prop")
        #
        # Output table models
        output_filter_pcr_error_model = self.output_table(
            FilterPCRerror.__output_table_filter_pcr_error)

        ############################################################################################
        #
        # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        # 3. Get nijk_df input
        #
        ############################################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)

        sample_info_tsv_obj.delete_from_db(
            engine=engine,
            variant_read_count_like_model=output_filter_pcr_error_model)

        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            variant_read_count_like_model=input_filter_min_replicate_model,
            engine=engine,
            filter_id=None)

        ############################################################################################
        #
        # Run per sample_id
        #
        ############################################################################################

        variant_df = sample_info_tsv_obj.get_variant_df(
            variant_read_count_like_model=input_filter_min_replicate_model,
            engine=engine)

        record_list = []

        run_marker_sample_df = variant_read_count_df[[
            'run_id', 'marker_id', 'sample_id'
        ]].drop_duplicates()
        for row in run_marker_sample_df.itertuples():
            run_id = row.run_id
            marker_id = row.marker_id
            sample_id = row.sample_id

            # Get variant read counts for the current run-marker-sample
            variant_read_count_per_sample_df = variant_read_count_df.loc[
                (variant_read_count_df.run_id == run_id)
                & (variant_read_count_df.marker_id == marker_id) &
                (variant_read_count_df.sample_id == sample_id)]

            variant_per_sample_df = variant_df.loc[variant_df.index.isin(
                variant_read_count_per_sample_df.variant_id.unique().tolist())]
            this_step_tmp_per_sample_dir = os.path.join(
                this_temp_dir,
                "run_{}_marker_{}_sample{}".format(run_id, marker_id,
                                                   sample_id))
            pathlib.Path(this_step_tmp_per_sample_dir).mkdir(exist_ok=True)

            ########################################################################################
            #
            # Run vsearch and get the alignment variant_read_count_input_df
            #
            ########################################################################################

            filter_pcr_error_runner = RunnerFilterPCRerror(
                variant_expected_df=variant_per_sample_df,
                variant_unexpected_df=variant_per_sample_df,
                variant_read_count_df=variant_read_count_per_sample_df)
            filter_output_per_sample_df = filter_pcr_error_runner.get_variant_read_count_delete_df(
                pcr_error_var_prop)

            ########################################################################################
            #
            # Per sample add to record list
            #
            ########################################################################################

            record_per_sample_list = ModelVariantReadCountLike.filter_delete_df_to_dict(
                filter_output_per_sample_df)
            record_list = record_list + record_per_sample_list

        variant_read_count_delete_df = pandas.DataFrame.from_records(
            data=record_list)

        ############################################################################################
        #
        # 5. Write to DB
        # 6. Touch output tables, to update modification date
        # 7. Exit vtam if all variants are deleted
        #
        #######################################################################

        DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
            engine=engine,
            variant_read_count_like_model=output_filter_pcr_error_model)

        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(id=obj.id).update(
                {'id': obj.id})
            session.commit()

        if (variant_read_count_delete_df.filter_delete.sum()
                == variant_read_count_delete_df.shape[0]):
            Logger.instance().warning(
                VTAMexception("This filter has deleted all the variants: {}. "
                              "The analysis will stop here.".format(
                                  self.__class__.__name__)))
            sys.exit(0)
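
The per-sample loop in Beispiel #22 follows a common pandas pattern: take the distinct run/marker/sample triples with drop_duplicates(), then mask the full read-count table for each triple. A standalone sketch with made-up numbers:

import pandas

nijk_df = pandas.DataFrame({
    'run_id':     [1, 1, 1, 1],
    'marker_id':  [1, 1, 1, 1],
    'sample_id':  [1, 1, 2, 2],
    'variant_id': [10, 11, 10, 12],
    'read_count': [50, 3, 40, 2],
})

# one iteration per distinct run/marker/sample combination
for row in nijk_df[['run_id', 'marker_id', 'sample_id']].drop_duplicates().itertuples():
    per_sample_df = nijk_df.loc[
        (nijk_df.run_id == row.run_id)
        & (nijk_df.marker_id == row.marker_id)
        & (nijk_df.sample_id == row.sample_id)]
    print(row.sample_id, per_sample_df.variant_id.tolist())
# 1 [10, 11]
# 2 [10, 12]
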