def run(self):
    session = self.session
    engine = session._session().get_bind()
    this_temp_dir = os.path.join(
        PathManager.instance().get_tempdir(), os.path.basename(__file__))
    pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

    ########################################################################
    # Wrapper inputs, outputs and parameters
    ########################################################################

    # Input file paths
    known_occurrences_tsv = self.input_file(
        OptimizePCRerror.__input_file_known_occurrences)
    fasta_info_tsv = self.input_file(
        OptimizePCRerror.__input_file_sortedinfo)

    # Output file paths
    output_optimize_path = self.output_file(
        OptimizePCRerror.__output_file_optimize_pcr_error)

    ########################################################################
    # Get nijk_df and known_occurrences_df
    ########################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        VariantReadCount, engine=engine)
    known_occurrences_df = FileKnownOccurrences(
        known_occurrences_tsv).to_identifier_df(engine)

    ########################################################################
    # Run optimizer and write
    ########################################################################

    optimize_pcr_error_runner = RunnerOptimizePCRerror(
        variant_read_count_df=variant_read_count_df,
        known_occurrences_df=known_occurrences_df)
    optimize_pcr_error_runner.to_tsv(
        optimize_path=output_optimize_path, engine=engine)
def run(self):
    session = self.session

    ########################################################################
    # Wrapper inputs, outputs and parameters
    ########################################################################

    # Input file paths
    csv_path = self.input_file(SampleInformation.__input_file_csv)
    FileSampleInformation(csv_path).to_sqlite(session=session)

    ########################################################################
    # Touch output tables, to update modification date
    ########################################################################

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()
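# The "touch output tables" loop above reappears verbatim in most wrappers
# below. A minimal sketch of how it could be factored into a shared helper;
# the name `touch_output_tables` is hypothetical, not part of the source:

def touch_output_tables(wrapper, session):
    """Re-save the last row of every output table so its modification date
    is updated and downstream wopmars rules are re-triggered."""
    for output_table_i in wrapper.specify_output_table():
        declarative_meta_i = wrapper.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        if obj is not None:  # guard: the table may still be empty
            session.query(declarative_meta_i).filter_by(id=obj.id).update(
                {'id': obj.id})
    session.commit()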
def run(self):
    session = self.session
    engine = session._session().get_bind()

    ########################################################################
    # Wrapper inputs, outputs and parameters
    ########################################################################

    # Input file paths
    known_occurrences_tsv = self.input_file(
        OptimizeLFNsampleReplicate.__input_file_known_occurrences)
    fasta_info_tsv = self.input_file(
        OptimizeLFNsampleReplicate.__input_file_sortedinfo)

    # Output file paths
    output_optimize_path = self.output_file(
        OptimizeLFNsampleReplicate.
        __output_file_optimize_lfn_sample_replicate)

    ########################################################################
    # Get nijk_df and known_occurrences_df (keep)
    ########################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        VariantReadCount, engine=engine)
    known_occurrences_df = FileKnownOccurrences(
        known_occurrences_tsv).to_identifier_df(engine)
    # Keep only the 'keep' occurrences of mock samples
    known_occurrences_df = known_occurrences_df.loc[
        (known_occurrences_df.mock == 1)
        & (known_occurrences_df.action == 'keep'), ]

    ########################################################################
    # Run optimizer and write
    ########################################################################

    optimize_lfn_sample_replicate_runner = RunnerOptimizeLFNsampleReplicate(
        variant_read_count_df=variant_read_count_df,
        known_occurrences_df=known_occurrences_df)
    optimize_lfn_sample_replicate_runner.to_tsv(
        optimize_path=output_optimize_path, engine=engine)
def add_parser_sortreads(cls, subparsers):
    parser_vtam_sortreads = subparsers.add_parser(
        'sortreads',
        add_help=True,
        parents=[
            cls.parser_params, cls.parser_log, cls.parser_threads,
            cls.parser_verbosity
        ],
        help="sorts (trims and demultiplexes) reads to biological samples "
             "and replicates according to the presence of sequence tags "
             "and primers")
    parser_vtam_sortreads.add_argument(
        '--fastainfo',
        action='store',
        help="input TSV file with FASTA file information",
        required=True,
        type=lambda x: FileSampleInformation(x).check_args(
            header=header_merged_fasta))
    parser_vtam_sortreads.add_argument(
        '--fastadir',
        action='store',
        help="input directory with FASTA files",
        required=True,
        type=ArgParserChecker.check_dir_exists_and_is_nonempty)
    parser_vtam_sortreads.add_argument(
        '--sorteddir',
        action='store',
        help="output directory with sorted (trimmed and demultiplexed) "
             "reads in FASTA files and a TSV file with the corresponding "
             "FASTA file information ('SORTEDDIR/sortedinfo.tsv')",
        required=True)
    parser_vtam_sortreads.add_argument(
        "--no_reverse",
        action="store_false",
        help="don't check reverse sequences",
        required=False)
    parser_vtam_sortreads.add_argument(
        "--tag_to_end",
        action="store_false",
        help="look for tags only at the edges of the sequence",
        required=False)
    parser_vtam_sortreads.add_argument(
        "--primer_to_end",
        action="store_false",
        help="look for primers only at the edges of the sequence",
        required=False)
    # This attribute selects the command to run
    parser_vtam_sortreads.set_defaults(command='sortreads')
def add_parser_random_seq(cls, subparsers):
    parser_vtam_random_seq = subparsers.add_parser(
        'random_seq',
        add_help=True,
        parents=[
            cls.parser_params, cls.parser_log, cls.parser_threads,
            cls.parser_verbosity
        ],
        help="makes a folder with sample files containing 'samplesize' "
             "sequences randomly selected from the files in the input "
             "folder")
    parser_vtam_random_seq.add_argument(
        '--fastadir',
        action='store',
        help="input directory with FASTA files",
        required=True,
        type=ArgParserChecker.check_dir_exists_and_is_nonempty)
    parser_vtam_random_seq.add_argument(
        '--random_seqdir',
        action='store',
        help="output directory with randomly selected sequences in FASTA "
             "format",
        required=True)
    parser_vtam_random_seq.add_argument(
        '--fastainfo',
        action='store',
        help="input TSV file with FASTA file information",
        required=True,
        type=lambda x: FileSampleInformation(x).check_args(
            header=header_merged_fasta))
    parser_vtam_random_seq.add_argument(
        '--random_seqinfo',
        action='store',
        help="output TSV file with output FASTA file information",
        required=True)
    parser_vtam_random_seq.add_argument(
        '--samplesize',
        action='store',
        help="number of sequences to be selected from the input files",
        type=int,
        required=True)
    parser_vtam_random_seq.set_defaults(command='random_seq')
def add_parser_merge(cls, subparsers):
    parser_vtam_merge = subparsers.add_parser(
        'merge',
        add_help=True,
        parents=[
            cls.parser_params, cls.parser_log, cls.parser_threads,
            cls.parser_verbosity
        ],
        help="merges paired-end reads")
    parser_vtam_merge.add_argument(
        '--fastqinfo',
        action='store',
        help="input TSV file with paired FASTQ file information",
        required=True,
        type=lambda x: FileSampleInformation(x).check_args(
            header=header_paired_fastq))
    parser_vtam_merge.add_argument(
        '--fastainfo',
        action='store',
        help="output TSV file with merged FASTA file information",
        required=True)
    parser_vtam_merge.add_argument(
        '--fastqdir',
        action='store',
        help="input directory with paired FASTQ files",
        required=True,
        type=ArgParserChecker.check_dir_exists_and_is_nonempty)
    parser_vtam_merge.add_argument(
        '--fastadir',
        action='store',
        help="output directory with merged FASTA files",
        required=True)
    # This attribute selects the command to run
    parser_vtam_merge.set_defaults(command='merge')
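# Example command lines for the sub-commands defined above; the file and
# directory names are illustrative placeholders, not fixed by the source:
#
#   vtam merge --fastqinfo fastqinfo.tsv --fastqdir fastq \
#       --fastainfo fastainfo.tsv --fastadir merged
#   vtam sortreads --fastainfo fastainfo.tsv --fastadir merged \
#       --sorteddir sorted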
def run(self): """ Algorithm (Updated Oct 13, 2019) 1. Read file with known variants (Mock/tolerate, delete and real) 2. Control if user variants and sequence are consistent in the database 3. Get variant_read_count of this run_name-marker_name-sample-replicate experiment 5. Compute maximal lfn_nijk_cutoff that keeps all 'keep' variants with the 'run_lfn_read_count_and_lfn_variant' algorithm 6. Compute maximal lfn_variant_cutoff that keeps all 'keep' variants with the 'run_lfn_read_count_and_lfn_variant' algorithm (See below) 7. Loop between default and lfn_nijk_cutoff and run_lfn_read_count_and_lfn_variant parameters 7.1 Compute number of keep variants. Should be always maximal. 7.2 Compute number of delete variants Should decrease. 8. Compute variant(-replicate) specific cutoff for delete variants 8.1 For each variant i (Or variant-replicate i-k ), get N_ijk_max and use it to computer variant specific cutoff Description of the 'run_lfn_read_count_and_lfn_variant' algorithm 1. Remove if does not pass these filter 1.1 Filter lfn_variant (Or lfn_variant_replicate) 1.2 Filter lfn_sample_replicate 1.3 Filter absolute read count 2. Filter if not min replicate number """ session = self.session engine = session._session().get_bind() ############################################################################################ # # Wrapper inputs, outputs and parameters # ############################################################################################ # Input file output known_occurrences_tsv = self.input_file( OptimizeLFNreadCountAndLFNvariant.__input_file_known_occurrences) fasta_info_tsv = self.input_file( OptimizeLFNreadCountAndLFNvariant.__input_file_sortedinfo) # Output file output output_file_optimize_lfn_tsv = self.output_file( OptimizeLFNreadCountAndLFNvariant. __output_file_optimize_lfn_read_count_and_lfn_variant) output_file_lfn_variant_specific_cutoff_tsv = self.output_file( OptimizeLFNreadCountAndLFNvariant. 
__output_file_optimize_lfn_variant_specific) # Options lfn_ni_cutoff = self.option("lfn_variant_cutoff") lfn_nik_cutoff = self.option("lfn_variant_replicate_cutoff") min_replicate_number = self.option("min_replicate_number") lfn_njk_cutoff = self.option("lfn_sample_replicate_cutoff") lfn_nijk_cutoff = int(self.option("lfn_read_count_cutoff")) filter_kwargs = { "lfn_ni_cutoff": lfn_ni_cutoff, "lfn_nik_cutoff": lfn_nik_cutoff, "lfn_njk_cutoff": lfn_njk_cutoff, "lfn_nijk_cutoff": lfn_nijk_cutoff, 'min_replicate_number': min_replicate_number, } ############################################################################################ # # Get nijk_df and known_occurrences_df (keep) # ############################################################################################ sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv) nijk_df = sample_info_tsv_obj.get_nijk_df(VariantReadCount, engine=engine) known_occurrences_df = FileKnownOccurrences( known_occurrences_tsv).to_identifier_df(engine) ############################################################################################ # # Create cutoff values lists # ############################################################################################ # # lfn_nijk_cutoff_list = range(lfn_nijk_cutoff, lfn_nijk_cutoff_global_max + 1, round(int((lfn_nijk_cutoff_global_max - lfn_nijk_cutoff + 1)/10), -1)) # lfn_nijk_cutoff_list = range(lfn_nijk_cutoff, lfn_nijk_cutoff_global_max + 1, round(int((lfn_nijk_cutoff_global_max - lfn_nijk_cutoff + 1)/10), -1)) # lfn_nijk_cutoff_list = RunnerOptimizeLFNreadCountAndVariantRunMarker.get_lfn_nijk_cutoff_lst(start=lfn_nijk_cutoff, stop=lfn_nijk_cutoff_global_max, nb_points=10) # lfn_nijk_cutoff_list = RunnerOptimizeLFNreadCountAndVariantRunMarker.get_lfn_nijk_cutoff_lst(start=lfn_nijk_cutoff, stop=lfn_nijk_cutoff_global_max, nb_points=10) # if lfn_nik_cutoff is None: # lfn_variant optimization # lfn_ni_nik_cutoff_list = [round(x, 3) for x in numpy.arange(lfn_ni_cutoff, lfn_ni_njk_cutoff_global_max + 0.001, (lfn_ni_njk_cutoff_global_max - lfn_ni_cutoff + 0.001)/10)] # else: # lfn_variant_replicate optimization # lfn_ni_nik_cutoff_list = [round(x, 3) for x in numpy.arange(lfn_ni_cutoff, lfn_ni_njk_cutoff_global_max + 0.001, (lfn_ni_njk_cutoff_global_max - lfn_ni_cutoff + 0.001)/10)] ############################################################################################ # # Group and run_name this genetic_code by run_name/marker_name combination # Loop by run_name/marker_name # ############################################################################################ optim_lfn_readcount_variant_runner = RunnerOptimizeLFNreadCountAndVariant( nijk_df=nijk_df, known_occurrences_df=known_occurrences_df) out_optimize_df, out_optimize2_df = optim_lfn_readcount_variant_runner.get_optimize_df( lfn_ni_cutoff=lfn_ni_cutoff, lfn_nik_cutoff=lfn_nik_cutoff, lfn_njk_cutoff=lfn_njk_cutoff, lfn_nijk_cutoff=lfn_nijk_cutoff, min_replicate_number=min_replicate_number) ############################################################################################ # # out_optimize_df: Format and write # ############################################################################################ out_optimize_df.marker_id = NameIdConverter( out_optimize_df.marker_id, engine=engine).to_names(Marker) out_optimize_df.run_id = NameIdConverter(out_optimize_df.run_id, engine=engine).to_names(Run) out_optimize_df.rename({ 'run_id': 'run', 'marker_id': 'marker' }, axis=1, inplace=True) 
out_optimize_df.to_csv(output_file_optimize_lfn_tsv, header=True, sep='\t', index=False) ############################################################################################ # # out_optimize_df: Format and write # ############################################################################################ out_optimize2_df.marker_id = NameIdConverter( out_optimize2_df.marker_id, engine=engine).to_names(Marker) out_optimize2_df.run_id = NameIdConverter(out_optimize2_df.run_id, engine=engine).to_names(Run) out_optimize2_df['action'] = 'delete' out_optimize2_df['sequence'] = NameIdConverter( out_optimize2_df.variant_id, engine=engine).variant_id_to_sequence() out_optimize2_df.rename( { 'run_id': 'run', 'marker_id': 'marker', 'variant_id': 'variant', 'read_count': 'read_count_max' }, axis=1, inplace=True) if self.option("lfn_variant_replicate_cutoff") is None: out_optimize2_df = out_optimize2_df[[ 'run', 'marker', 'variant', 'action', 'read_count_max', 'N_i', 'lfn_variant_cutoff', 'sequence' ]] else: out_optimize2_df = out_optimize2_df[[ 'run', 'marker', 'variant', 'replicate', 'action', 'read_count_max', 'N_ik', 'lfn_variant_replicate_cutoff', 'sequence' ]] out_optimize2_df.to_csv(output_file_lfn_variant_specific_cutoff_tsv, header=True, sep='\t', index=False)
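# A minimal sketch of the three LFN (low-frequency noise) ratio tests named
# in the docstring above, applied to an nijk_df with columns run_id,
# marker_id, variant_id, sample_id, replicate, read_count. The function and
# flag column names are illustrative assumptions, not the project's API:

def lfn_flags(nijk_df, lfn_ni_cutoff, lfn_njk_cutoff, lfn_nijk_cutoff):
    df = nijk_df.copy()
    # N_i: total read count of variant i over the whole run/marker
    N_i = df.groupby(['run_id', 'marker_id', 'variant_id'])[
        'read_count'].transform('sum')
    # N_jk: total read count of sample j, replicate k over the run/marker
    N_jk = df.groupby(['run_id', 'marker_id', 'sample_id', 'replicate'])[
        'read_count'].transform('sum')
    # 1.1 lfn_variant: occurrence is noise if N_ijk / N_i is too small
    df['delete_lfn_variant'] = df['read_count'] / N_i < lfn_ni_cutoff
    # 1.2 lfn_sample_replicate: noise if N_ijk / N_jk is too small
    df['delete_lfn_sample_replicate'] = (
        df['read_count'] / N_jk < lfn_njk_cutoff)
    # 1.3 absolute read count: noise if N_ijk itself is too small
    df['delete_lfn_read_count'] = df['read_count'] < lfn_nijk_cutoff
    return df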
def run(self):
    session = self.session
    engine = session._session().get_bind()

    ########################################################################
    # Wrapper inputs, outputs and parameters
    ########################################################################

    # Input files
    fasta_info_tsv = self.input_file(
        FilterMinReplicateNumber.__input_file_sortedinfo)

    # Input tables
    input_filter_lfn_model = self.input_table(
        FilterMinReplicateNumber.__input_table_variant_filter_lfn)

    # Options
    min_replicate_number = self.option("min_replicate_number")

    # Output tables
    output_filter_min_replicate_model = self.output_table(
        FilterMinReplicateNumber.__output_table_filter_min_replicate_number)

    ########################################################################
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate
    #    for the current analysis
    # 2. Delete run/marker/sample/replicate entries from the output model
    # 3. Get the nijk_df input
    ########################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_min_replicate_model)

    filter_id = None
    if input_filter_lfn_model.__tablename__ == "FilterLFN":
        filter_id = 8  # variant passed all LFN filters
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_filter_lfn_model,
        engine=engine, filter_id=filter_id)

    ########################################################################
    # 4. Run filter
    ########################################################################

    variant_read_count_delete_df = RunnerFilterMinReplicateNumber(
        variant_read_count_df).get_variant_read_count_delete_df(
            min_replicate_number)

    ########################################################################
    # 5. Write to DB
    # 6. Touch output tables, to update modification date
    # 7. Exit vtam if all variants are deleted
    ########################################################################

    DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_min_replicate_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()

    if variant_read_count_delete_df.filter_delete.sum() \
            == variant_read_count_delete_df.shape[0]:
        Logger.instance().warning(
            VTAMexception(
                "This filter has deleted all the variants: {}. "
                "The analysis will stop here.".format(
                    self.__class__.__name__)))
        sys.exit(0)
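# A minimal sketch of the min-replicate rule applied above, assumed from the
# filter's name rather than copied from RunnerFilterMinReplicateNumber: an
# occurrence is flagged for deletion when its variant is present in fewer
# than min_replicate_number replicates of the same sample.

def flag_min_replicate(df, min_replicate_number):
    # Count, for each variant in each sample, how many replicates contain it
    n_repl = df.groupby(['run_id', 'marker_id', 'variant_id', 'sample_id'])[
        'replicate'].transform('nunique')
    out = df.copy()
    out['filter_delete'] = n_repl < min_replicate_number
    return out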
def run(self):
    session = self.session
    engine = session._session().get_bind()

    # Input file paths
    fasta_info_tsv = self.input_file(
        ReadCountAverageOverReplicates.__input_file_sortedinfo)

    # Input table models
    codon_stop_model = self.input_table(
        ReadCountAverageOverReplicates.__input_table_filter_codon_stop)

    # Output table models
    consensus_model = self.output_table(
        ReadCountAverageOverReplicates.__output_table_filter_consensus)

    ########################################################################
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate
    #    for the current analysis
    # 2. Delete run/marker/sample/replicate entries from the output model
    # 3. Get the nijk_df input
    ########################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine, variant_read_count_like_model=consensus_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=codon_stop_model, engine=engine,
        filter_id=None)

    ########################################################################
    # 4. Run filter
    ########################################################################

    variant_read_count_delete_df = read_count_average_over_replicates(
        variant_read_count_df)

    ########################################################################
    # 5. Write to DB
    ########################################################################

    record_list = ModelVariantReadCountLike.filter_delete_df_to_dict(
        variant_read_count_delete_df)
    with engine.connect() as conn:
        # Insert new instances
        conn.execute(consensus_model.__table__.insert(), record_list)

    ########################################################################
    # 6. Touch output tables, to update modification date
    ########################################################################

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()
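# The averaging step above delegates to read_count_average_over_replicates.
# A minimal sketch of the idea, under the assumption that the read count of
# each variant-sample pair becomes its mean over that sample's replicates
# (the function name and exact semantics are illustrative):

def average_over_replicates(df):
    # Number of replicates per run/marker/sample
    nrep = df.groupby(['run_id', 'marker_id', 'sample_id'])['replicate'] \
             .nunique().rename('n_replicates').reset_index()
    # Total read count per run/marker/variant/sample
    out = df.groupby(['run_id', 'marker_id', 'variant_id', 'sample_id'],
                     as_index=False)['read_count'].sum()
    out = out.merge(nrep, on=['run_id', 'marker_id', 'sample_id'])
    out['read_count_average'] = out['read_count'] / out['n_replicates']
    return out.drop(columns=['read_count', 'n_replicates'])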
def run(self):
    session = self.session
    engine = session._session().get_bind()

    ########################################################################
    # Wrapper inputs, outputs and parameters
    ########################################################################

    # Input file
    fasta_info_tsv = self.input_file(FilterRenkonen.__input_file_sortedinfo)

    # Input table models
    input_filter_chimera_model = self.input_table(
        FilterRenkonen.__input_table_chimera)

    # Options
    renkonen_distance_quantile = float(
        self.option("renkonen_distance_quantile"))

    # Output table models
    output_filter_renkonen_model = self.output_table(
        FilterRenkonen.__output_table_filter_renkonen)

    ########################################################################
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate
    #    for the current analysis
    # 2. Delete run/marker/sample/replicate entries from the output model
    # 3. Get the nijk_df input
    ########################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_renkonen_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_filter_chimera_model,
        engine=engine, filter_id=None)

    ########################################################################
    # Run per run_id, marker_id
    ########################################################################

    variant_read_count_delete_df = pandas.DataFrame()
    run_marker_df = variant_read_count_df[
        ['run_id', 'marker_id']].drop_duplicates()

    for row in run_marker_df.itertuples():
        run_id = row.run_id
        marker_id = row.marker_id
        variant_read_count_per_run_marker_df = variant_read_count_df.loc[
            (variant_read_count_df.run_id == run_id)
            & (variant_read_count_df.marker_id == marker_id)]

        if variant_read_count_per_run_marker_df.replicate.unique(
                ).shape[0] > 1:  # more than one replicate
            filter_renkonen_runner_obj = RunnerFilterRenkonen(
                variant_read_count_per_run_marker_df)
            filter_output_i_df = \
                filter_renkonen_runner_obj.get_variant_read_count_delete_df(
                    renkonen_distance_quantile)
        else:  # just one replicate: nothing to compare, keep this group
            filter_output_i_df = variant_read_count_per_run_marker_df.copy()
            filter_output_i_df['filter_delete'] = False

        variant_read_count_delete_df = pandas.concat(
            [variant_read_count_delete_df, filter_output_i_df], axis=0)

    ########################################################################
    # 5. Write to DB
    # 6. Touch output tables, to update modification date
    # 7. Exit vtam if all variants are deleted
    ########################################################################

    DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_renkonen_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()

    if variant_read_count_delete_df.filter_delete.sum() \
            == variant_read_count_delete_df.shape[0]:
        Logger.instance().warning(
            VTAMexception(
                "This filter has deleted all the variants: {}. "
                "The analysis will stop here.".format(
                    self.__class__.__name__)))
        sys.exit(0)
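# For reference, the Renkonen distance used by the runner above is a
# standard ecological dissimilarity between two replicates: one minus the
# sum, over variants, of the minimum of the two relative read-count
# proportions. A minimal sketch; the function name and input layout are
# illustrative, not the project's API:

def renkonen_distance(counts_a, counts_b):
    # counts_a, counts_b: dict variant_id -> read_count for two replicates
    total_a, total_b = sum(counts_a.values()), sum(counts_b.values())
    shared = set(counts_a) | set(counts_b)
    similarity = sum(
        min(counts_a.get(v, 0) / total_a, counts_b.get(v, 0) / total_b)
        for v in shared)
    return 1.0 - similarity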
def main(fastainfo, fastadir, sorteddir, params=None,
         num_threads=multiprocessing.cpu_count()):

    if sys.platform.startswith('win'):
        num_threads = 1

    ########################################################################
    # params.yml parameters
    ########################################################################

    params_dic = FileParams(params).get_params_dic()
    cutadapt_error_rate = params_dic['cutadapt_error_rate']
    cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
    cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

    ########################################################################
    # Loop over tag and primer pairs to demultiplex and trim reads
    ########################################################################

    merged_fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df()
    pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
    tempdir = PathManager.instance().get_tempdir()

    sorted_read_info_df = pandas.DataFrame()

    for i in range(0, merged_fastainfo_df.shape[0]):
        fasta_info_series = merged_fastainfo_df.iloc[i]

        tag_fwd = fasta_info_series.tagfwd
        tag_rev = fasta_info_series.tagrev
        primer_fwd = fasta_info_series.primerfwd
        primer_rev = fasta_info_series.primerrev
        in_fasta_basename = fasta_info_series.mergedfasta

        Logger.instance().debug(
            "Analysing FASTA file: {}".format(in_fasta_basename))

        fasta_info_df_i = fasta_info_series.to_frame().T
        in_raw_fasta_path = os.path.join(fastadir, in_fasta_basename)

        ####################################################################
        # Cut adapt tag of forward reads, e.g.:
        # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
        #   --front 'tcgatcacgatgt;min_overlap=13...gctgtagatcgaca;min_overlap=14'
        #   --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
        #   out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
        ####################################################################

        if generic_dna:  # Biopython <1.78
            tag_rev_rc = str(Seq(tag_rev, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            tag_rev_rc = str(Seq(tag_rev).reverse_complement())

        out_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
            '.fasta', '_sorted_%03d.fasta' % i)
        out_fasta_path = os.path.join(tempdir, out_fasta_basename)

        cmd_cutadapt_tag_dic = {
            'tag_fwd': tag_fwd,
            'tag_fwd_len': len(tag_fwd),
            'tag_rev_rc': tag_rev_rc,
            'tag_rev_rc_len': len(tag_rev_rc),
            'in_fasta_path': in_raw_fasta_path,
            'out_fasta': out_fasta_path,
            'num_threads': num_threads,
        }
        cmd_cutadapt_tag_str = (
            'cutadapt --cores={num_threads} --no-indels --error-rate 0 '
            '--trimmed-only '
            '--front "{tag_fwd};min_overlap={tag_fwd_len}...'
            '{tag_rev_rc};min_overlap={tag_rev_rc_len}" '
            '--output {out_fasta} {in_fasta_path}'.format(
                **cmd_cutadapt_tag_dic))

        Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))
        if sys.platform.startswith("win"):
            args = cmd_cutadapt_tag_str
        else:
            args = shlex.split(cmd_cutadapt_tag_str)
        run_result = subprocess.run(
            args=args, capture_output=True, check=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ####################################################################
        # Trim primers from the output, e.g.:
        # cutadapt --cores=8 --no-indels --error-rate 0.1
        #   --minimum-length 50 --maximum-length 500 --trimmed-only
        #   --front 'TCCACTAATCACAARGATATTGGTAC;min_overlap=26...GGAGGATTTGGWAATTGATTAGTW;min_overlap=24'
        #   --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_trimmed_000.fasta
        #   /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
        ####################################################################

        if generic_dna:  # Biopython <1.78
            primer_rev_rc = str(
                Seq(primer_rev, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            primer_rev_rc = str(Seq(primer_rev).reverse_complement())

        in_fasta_path = out_fasta_path
        out_fasta_basename = os.path.basename(in_fasta_path).replace(
            '_sorted_%03d.fasta' % i, '_sorted_trimmed_%03d.fasta' % i)
        out_fasta_path = os.path.join(tempdir, out_fasta_basename)

        cmd_cutadapt_primer_dic = {
            'primer_fwd': primer_fwd,
            'primer_fwd_len': len(primer_fwd),
            'primer_rev_rc': primer_rev_rc,
            'primer_rev_rc_len': len(primer_rev_rc),
            'in_fasta_path': in_fasta_path,
            'out_fasta': out_fasta_path,
            'error_rate': cutadapt_error_rate,
            'read_min_length': cutadapt_minimum_length,
            'read_max_length': cutadapt_maximum_length,
            'num_threads': num_threads,
        }
        cmd_cutadapt_primer_str = (
            'cutadapt --cores={num_threads} --no-indels '
            '--error-rate {error_rate} '
            '--minimum-length {read_min_length} '
            '--maximum-length {read_max_length} --trimmed-only '
            '--front "{primer_fwd};min_overlap={primer_fwd_len}...'
            '{primer_rev_rc};min_overlap={primer_rev_rc_len}" '
            '--output {out_fasta} {in_fasta_path}'.format(
                **cmd_cutadapt_primer_dic))

        Logger.instance().debug(
            "Running: {}".format(cmd_cutadapt_primer_str))
        if sys.platform.startswith("win"):
            args = cmd_cutadapt_primer_str
        else:
            args = shlex.split(cmd_cutadapt_primer_str)
        run_result = subprocess.run(args=args, capture_output=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ####################################################################
        # Cut adapt tag of reverse-complement reads, e.g.:
        # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
        #   --front 'tgtcgatctacagc;min_overlap=14...acatcgtgatcga;min_overlap=13'
        #   --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
        #   out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
        ####################################################################

        if generic_dna:  # Biopython <1.78
            tag_fwd_rc = str(Seq(tag_fwd, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            tag_fwd_rc = str(Seq(tag_fwd).reverse_complement())

        out_rc_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
            '.fasta', '_rc_sorted_%03d.fasta' % i)
        out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

        cmd_cutadapt_tag_dic = {
            'tag_fwd': tag_rev,
            'tag_fwd_len': len(tag_rev),
            'tag_rev_rc': tag_fwd_rc,
            'tag_rev_rc_len': len(tag_fwd_rc),
            'in_fasta_path': in_raw_fasta_path,
            'out_fasta': out_rc_fasta_path,
            'num_threads': num_threads,
        }
        cmd_cutadapt_tag_str = (
            'cutadapt --cores={num_threads} --no-indels --error-rate 0 '
            '--trimmed-only '
            '--front "{tag_fwd};min_overlap={tag_fwd_len}...'
            '{tag_rev_rc};min_overlap={tag_rev_rc_len}" '
            '--output {out_fasta} {in_fasta_path}'.format(
                **cmd_cutadapt_tag_dic))

        Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))
        if sys.platform.startswith("win"):
            args = cmd_cutadapt_tag_str
        else:
            args = shlex.split(cmd_cutadapt_tag_str)
        run_result = subprocess.run(args=args, capture_output=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ####################################################################
        # Trim primers from the rc output, e.g.:
        # cutadapt --cores=8 --no-indels --error-rate 0.1
        #   --minimum-length 50 --maximum-length 500 --trimmed-only
        #   --front 'WACTAATCAATTWCCAAATCCTCC;min_overlap=24...GTACCAATATCYTTGTGATTAGTGGA;min_overlap=26'
        #   --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_trimmed_000.fasta
        #   /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
        ####################################################################

        if generic_dna:  # Biopython <1.78
            primer_fwd_rc = str(
                Seq(primer_fwd, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            primer_fwd_rc = str(Seq(primer_fwd).reverse_complement())

        in_fasta_path = out_rc_fasta_path
        out_rc_fasta_basename = os.path.basename(in_fasta_path).replace(
            '_rc_sorted_%03d.fasta' % i, '_rc_sorted_trimmed_%03d.fasta' % i)
        out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

        cmd_cutadapt_primer_dic = {
            'primer_fwd': primer_rev,
            'primer_fwd_len': len(primer_rev),
            'primer_rev_rc': primer_fwd_rc,
            'primer_rev_rc_len': len(primer_fwd_rc),
            'in_fasta_path': in_fasta_path,
            'out_fasta': out_rc_fasta_path,
            'error_rate': cutadapt_error_rate,
            'read_min_length': cutadapt_minimum_length,
            'read_max_length': cutadapt_maximum_length,
            'num_threads': num_threads,
        }
        cmd_cutadapt_primer_str = (
            'cutadapt --cores={num_threads} --no-indels '
            '--error-rate {error_rate} '
            '--minimum-length {read_min_length} '
            '--maximum-length {read_max_length} --trimmed-only '
            '--front "{primer_fwd};min_overlap={primer_fwd_len}...'
            '{primer_rev_rc};min_overlap={primer_rev_rc_len}" '
            '--output {out_fasta} {in_fasta_path}'.format(
                **cmd_cutadapt_primer_dic))

        Logger.instance().debug(
            "Running: {}".format(cmd_cutadapt_primer_str))
        if sys.platform.startswith("win"):
            args = cmd_cutadapt_primer_str
        else:
            args = shlex.split(cmd_cutadapt_primer_str)
        run_result = subprocess.run(args=args, capture_output=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ####################################################################
        # Reverse complement back the rc FASTA file and pool
        ####################################################################

        out_final_fasta_basename = os.path.basename(
            in_raw_fasta_path).replace('.fasta', '_%03d.fasta' % i)
        out_final_fasta_path = os.path.join(
            sorteddir, out_final_fasta_basename)
        shutil.copy(out_fasta_path, out_final_fasta_path)

        Logger.instance().debug("Pooling fwd and rc reads...")
        with open(out_final_fasta_path, 'a') as fout:
            with open(out_rc_fasta_path, 'r') as fin:
                for line in fin:
                    if not line.startswith('>'):
                        if generic_dna:  # Biopython <1.78
                            fout.write("%s\n" % str(Seq(
                                line.strip(),
                                generic_dna).reverse_complement()))
                        else:  # Biopython >=1.78
                            fout.write("%s\n" % str(Seq(
                                line.strip()).reverse_complement()))
                    else:
                        fout.write(line)

        fasta_info_df_i = fasta_info_df_i[[
            'run', 'marker', 'sample', 'replicate']]
        fasta_info_df_i['sortedfasta'] = out_final_fasta_basename
        sorted_read_info_df = pandas.concat(
            [sorted_read_info_df, fasta_info_df_i], axis=0)

    fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
    sorted_read_info_df.to_csv(
        fasta_trimmed_info_tsv, sep="\t", header=True, index=False)
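# Example invocation of the function above; the paths are illustrative:
# demultiplex and trim the merged FASTA files listed in fastainfo.tsv into
# 'sorted/', which also receives the 'sortedinfo.tsv' summary written at
# the end of main().
if __name__ == '__main__':
    main(fastainfo='fastainfo.tsv', fastadir='merged', sorteddir='sorted',
         params=None, num_threads=4)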
def run(self):
    session = self.session
    engine = session._session().get_bind()

    ########################################################################
    # Wrapper inputs, outputs and parameters
    ########################################################################

    # Input file paths
    fasta_info_tsv = self.input_file(FilterLFN.__input_file_sortedinfo)

    # Input table models
    input_variant_read_count_model = self.input_table(
        FilterLFN.__input_table_variant_read_count)

    # Output table models
    output_filter_lfn_model = self.output_table(
        FilterLFN.__output_table_filter_lfn)

    # Options
    lfn_variant_cutoff = self.option("lfn_variant_cutoff")
    lfn_variant_specific_cutoff = self.option(
        "lfn_variant_specific_cutoff")
    lfn_variant_replicate_cutoff = self.option(
        "lfn_variant_replicate_cutoff")
    lfn_variant_replicate_specific_cutoff = self.option(
        "lfn_variant_replicate_specific_cutoff")
    lfn_sample_replicate_cutoff = self.option(
        "lfn_sample_replicate_cutoff")
    lfn_read_count_cutoff = self.option("lfn_read_count_cutoff")

    ########################################################################
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate
    #    for the current analysis
    # 2. Delete run/marker/sample/replicate entries from the output model
    # 3. Get the nijk_df input
    ########################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_lfn_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_variant_read_count_model,
        engine=engine, filter_id=None)

    lfn_variant_specific_cutoff_df = None
    if lfn_variant_cutoff is not None and pathlib.Path(
            lfn_variant_specific_cutoff).stat().st_size > 0:
        lfn_variant_specific_cutoff_df = FileCutoffSpecific(
            lfn_variant_specific_cutoff).to_identifier_df(
                engine=engine, is_lfn_variant_replicate=False)

    lfn_variant_replicate_specific_cutoff_df = None
    if lfn_variant_replicate_cutoff is not None and pathlib.Path(
            lfn_variant_replicate_specific_cutoff).stat().st_size > 0:
        lfn_variant_replicate_specific_cutoff_df = FileCutoffSpecific(
            lfn_variant_replicate_specific_cutoff).to_identifier_df(
                engine=engine, is_lfn_variant_replicate=True)

    ########################################################################
    # Create the filter object and run it
    ########################################################################

    variant_read_count_delete_df = RunnerFilterLFN(
        variant_read_count_df).get_variant_read_count_delete_df(
            lfn_variant_cutoff=lfn_variant_cutoff,
            lfn_variant_specific_cutoff=lfn_variant_specific_cutoff_df,
            lfn_variant_replicate_cutoff=lfn_variant_replicate_cutoff,
            lfn_variant_replicate_specific_cutoff=
            lfn_variant_replicate_specific_cutoff_df,
            lfn_sample_replicate_cutoff=lfn_sample_replicate_cutoff,
            lfn_read_count_cutoff=lfn_read_count_cutoff)

    DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_lfn_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()

    if variant_read_count_delete_df.filter_delete.sum() \
            == variant_read_count_delete_df.shape[0]:
        Logger.instance().warning(
            VTAMexception(
                "This filter has deleted all the variants: {}. "
                "The analysis will stop here.".format(
                    self.__class__.__name__)))
        sys.exit(0)
def run(self):
    session = self.session
    engine = session._session().get_bind()

    ########################################################################
    # Wrapper inputs, outputs and parameters
    ########################################################################

    # Input file
    input_file_sortedinfo = self.input_file(
        VariantReadCount.__input_file_sortedinfo)

    # Input table models
    run_model = self.input_table(VariantReadCount.__input_table_run)
    marker_model = self.input_table(VariantReadCount.__input_table_marker)
    sample_model = self.input_table(VariantReadCount.__input_table_sample)

    # Output table models
    variant_model = self.output_table(
        VariantReadCount.__output_table_variant)
    variant_read_count_model = self.output_table(
        VariantReadCount.__output_table_variant_read_count)

    # Options
    read_dir = self.option("read_dir")
    global_read_count_cutoff = self.option("global_read_count_cutoff")

    ########################################################################
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate
    #    for the current analysis
    # 2. Delete run/marker/sample/replicate entries from
    #    variant_read_count_model
    # 3. Read the TSV file with the sorted reads
    # 4. Group by read sequence
    # 5. Delete variants below global_read_count_cutoff
    # 6. Insert into the Variant and VariantReadCount tables
    ########################################################################

    ########################################################################
    # 1. Read sample information to get run_id, marker_id, sample_id,
    #    replicate for the current analysis
    ########################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Read sample information".format(
            __file__, inspect.currentframe().f_lineno))
    sortedinfo_df = pandas.read_csv(
        input_file_sortedinfo, sep="\t", header=0)
    sample_instance_list = []
    sortedinfo_df.columns = sortedinfo_df.columns.str.lower()

    for row in sortedinfo_df.itertuples():
        Logger.instance().debug(row)
        marker_name = row.marker
        run_name = row.run
        sample_name = row.sample
        replicate = row.replicate
        with engine.connect() as conn:
            # Get run_id
            stmt_select_run_id = select([
                run_model.__table__.c.id
            ]).where(run_model.__table__.c.name == run_name)
            run_id = conn.execute(stmt_select_run_id).first()[0]
            # Get marker_id
            stmt_select_marker_id = select([
                marker_model.__table__.c.id
            ]).where(marker_model.__table__.c.name == marker_name)
            marker_id = conn.execute(stmt_select_marker_id).first()[0]
            # Get sample_id
            stmt_select_sample_id = select([
                sample_model.__table__.c.id
            ]).where(sample_model.__table__.c.name == sample_name)
            sample_id = conn.execute(stmt_select_sample_id).first()[0]
            # Add this sample instance
            sample_instance_list.append({
                'run_id': run_id,
                'marker_id': marker_id,
                'sample_id': sample_id,
                'replicate': replicate})

    ########################################################################
    # 2. Delete run/marker/sample/replicate entries from
    #    variant_read_count_model
    ########################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Delete run/marker/sample/replicate".format(
            __file__, inspect.currentframe().f_lineno))
    with engine.connect() as conn:
        stmt_del = variant_read_count_model.__table__.delete()
        stmt_del = stmt_del.where(
            variant_read_count_model.__table__.c.run_id
            == bindparam('run_id'))
        stmt_del = stmt_del.where(
            variant_read_count_model.__table__.c.marker_id
            == bindparam('marker_id'))
        stmt_del = stmt_del.where(
            variant_read_count_model.__table__.c.sample_id
            == bindparam('sample_id'))
        stmt_del = stmt_del.where(
            variant_read_count_model.__table__.c.replicate
            == bindparam('replicate'))
        conn.execute(stmt_del, sample_instance_list)

    ########################################################################
    # 3. Read the TSV file with the sorted reads
    ########################################################################

    sample_info_tsv_obj = FileSampleInformation(
        tsv_path=input_file_sortedinfo)
    sample_info_ids_df = sample_info_tsv_obj.to_identifier_df(engine=engine)

    Logger.instance().debug(
        "file: {}; line: {}; Read demultiplexed FASTA files".format(
            __file__, inspect.currentframe().f_lineno))
    variant_read_count_df = pandas.DataFrame()

    for row in sample_info_ids_df.itertuples():
        run_id = row.run_id
        marker_id = row.marker_id
        sample_id = row.sample_id
        replicate = row.replicate
        read_fasta = row.sortedfasta
        Logger.instance().debug(
            "file: {}; line: {}; Read FASTA: {}".format(
                __file__, inspect.currentframe().f_lineno, read_fasta))
        read_fasta_path = os.path.join(read_dir, read_fasta)

        if os.path.exists(read_fasta_path):
            # Read FASTA: one row per read, then aggregate to counts
            sorted_read_list = VariantReadCount.get_sorted_read_list(
                read_fasta_path, generic_dna)
            variant_read_count_df_sorted_i = pandas.DataFrame({
                'run_id': [run_id] * len(sorted_read_list),
                'marker_id': [marker_id] * len(sorted_read_list),
                'sample_id': [sample_id] * len(sorted_read_list),
                'replicate': [replicate] * len(sorted_read_list),
                'read_sequence': sorted_read_list,
                'read_count': [1] * len(sorted_read_list)})
            # Compute read count
            variant_read_count_df_sorted_i = \
                variant_read_count_df_sorted_i.groupby([
                    'run_id', 'marker_id', 'sample_id', 'replicate',
                    'read_sequence']).sum().reset_index()
            variant_read_count_df = pandas.concat(
                [variant_read_count_df, variant_read_count_df_sorted_i],
                axis=0)
        else:
            Logger.instance().warning(
                'This file {} does not exist'.format(read_fasta_path))

    ########################################################################
    # 4. Group by read sequence to variant_read_count with run_id,
    #    marker_id, ...
    ########################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Group by read sequence".format(
            __file__, inspect.currentframe().f_lineno))
    variant_read_count_df = variant_read_count_df.groupby([
        'run_id', 'marker_id', 'sample_id', 'replicate',
        'read_sequence']).sum().reset_index()
    variant_read_count_df.rename(
        columns={'read_sequence': 'variant_id'}, inplace=True)
    variant_read_count_df = variant_read_count_df.sort_values(
        by=variant_read_count_df.columns.tolist())

    ########################################################################
    # 5. Remove variants whose read count across all runs, markers, samples
    #    and replicates is lower than the global_read_count_cutoff parameter
    ########################################################################

    variant_read_count_like_df_obj = DataframeVariantReadCountLike(
        variant_read_count_df)
    Logger.instance().debug(
        "file: {}; line: {}; Remove variants with global read count lower "
        "than parameter 'global_read_count_cutoff'".format(
            __file__, inspect.currentframe().f_lineno))
    variant_read_count_df = variant_read_count_like_df_obj \
        .filter_out_below_global_read_count_cutoff(
            global_read_count_cutoff=global_read_count_cutoff)
    variant_read_count_df.rename(
        columns={'variant_id': 'variant_sequence'}, inplace=True)

    ########################################################################
    # 6. Insert into the Variant and VariantReadCount tables
    ########################################################################

    Logger.instance().debug("file: {}; line: {}; Insert variants".format(
        __file__, inspect.currentframe().f_lineno))
    variant_read_count_instance_list = []
    variant_read_count_df.sort_values(
        by=['variant_sequence', 'run_id', 'marker_id', 'sample_id',
            'replicate'], inplace=True)

    variant_new_set = set()
    variant_new_instance_list = []

    with engine.connect() as conn:
        # Retrieve the maximal variant id, if any
        select_variant_id_max = conn.execute(sqlalchemy.select(
            [func.max(variant_model.__table__.c.id)])).first()[0]
        if select_variant_id_max is None:
            select_variant_id_max = 0  # no variants yet, maximal id is 0

        for row in variant_read_count_df.itertuples():
            run_id = row.run_id
            marker_id = row.marker_id
            sample_id = row.sample_id
            replicate = row.replicate
            variant_sequence = row.variant_sequence
            read_count = row.read_count
            select_row = conn.execute(sqlalchemy.select([
                variant_model.__table__.c.id
            ]).where(variant_model.__table__.c.sequence
                     == variant_sequence)).first()
            if select_row is None:
                # variant_sequence is NOT in the database: INSERT it
                if variant_sequence not in variant_new_set:
                    variant_id = select_variant_id_max + \
                        len(variant_new_instance_list) + 1
                    variant_new_set.add(variant_sequence)
                    variant_new_instance_list.append({
                        'id': variant_id, 'sequence': variant_sequence})
            else:
                # variant_sequence IS in the database
                variant_id = select_row[0]
            variant_read_count_instance_list.append({
                'run_id': run_id,
                'marker_id': marker_id,
                'variant_id': variant_id,
                'sample_id': sample_id,
                'replicate': replicate,
                'read_count': read_count})

    ########################################################################
    # Exit if variant_read_count_instance_list is empty
    ########################################################################

    if not len(variant_read_count_instance_list):
        Logger.instance().warning(VTAMexception(
            "No new variants in these samples. Maybe singletons? "
            "The analysis will stop here."))
        sys.exit(0)

    ########################################################################
    # Write the variant_read_count table
    ########################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Insert variant read count".format(
            __file__, inspect.currentframe().f_lineno))
    with engine.connect() as conn:
        # Insert new variants, if any
        if len(variant_new_instance_list) > 0:
            conn.execute(variant_model.__table__.insert(),
                         variant_new_instance_list)
        # Insert new variant_read_count instances
        conn.execute(variant_read_count_model.__table__.insert(),
                     variant_read_count_instance_list)

    ########################################################################
    # Touch output tables, to update modification date
    ########################################################################

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()
def run(self):
    session = self.session
    engine = session._session().get_bind()

    ########################################################################
    # Wrapper inputs, outputs and parameters
    ########################################################################

    # Input file paths
    fasta_info_tsv = self.input_file(FilterChimera.__input_file_sortedinfo)

    # Input table models
    input_filter_pcr_error_model = self.input_table(
        FilterChimera.__input_table_filter_pcr_error)

    # Output table models
    output_filter_chimera_model = self.output_table(
        FilterChimera.__output_table_filter_chimera)
    output_filter_borderline_model = self.output_table(
        FilterChimera.__output_table_filter_chimera_borderline)

    # Options
    uchime3_denovo_abskew = self.option("uchime3_denovo_abskew")

    ########################################################################
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate
    #    for the current analysis
    # 2. Delete run/marker/sample/replicate entries from the output models
    # 3. Get the nijk_df input
    ########################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_chimera_model)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_borderline_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_filter_pcr_error_model,
        engine=engine, filter_id=None)

    ########################################################################
    # 4. Run filter
    ########################################################################

    variant_df = sample_info_tsv_obj.get_variant_df(
        variant_read_count_like_model=input_filter_pcr_error_model,
        engine=engine)
    filter_chimera_runner = RunnerFilterChimera(
        variant_read_count_df=variant_read_count_df)
    filter_output_chimera_df, filter_borderline_output_df = \
        filter_chimera_runner.get_variant_read_count_delete_df(
            variant_df=variant_df,
            uchime3_denovo_abskew=uchime3_denovo_abskew)

    ########################################################################
    # 5. Write to DB
    # 6. Touch output tables, to update modification date
    # 7. Exit vtam if all variants are deleted
    ########################################################################

    DataframeVariantReadCountLike(filter_output_chimera_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_chimera_model)
    DataframeVariantReadCountLike(filter_borderline_output_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_borderline_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()

    if filter_output_chimera_df.filter_delete.sum() \
            == filter_output_chimera_df.shape[0]:
        Logger.instance().warning(
            VTAMexception(
                "This filter has deleted all the variants: {}. "
                "The analysis will stop here.".format(
                    self.__class__.__name__)))
        sys.exit(0)
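# RunnerFilterChimera wraps de novo chimera detection; with vsearch this is
# the --uchime3_denovo command, whose abundance-skew threshold corresponds
# to the uchime3_denovo_abskew option passed above. An illustrative command
# line (the file names are placeholders, not from the source):
#
#   vsearch --uchime3_denovo variants.fasta --abskew 16.0 \
#       --borderline borderline.fasta --nonchimeras nonchimeras.fasta \
#       --chimeras chimeras.fasta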
def add_parser_filter(cls, subparsers):
    parser_vtam_filter = subparsers.add_parser(
        'filter',
        add_help=True,
        parents=[
            cls.parser_params, cls.parser_log, cls.parser_threads,
            cls.parser_verbosity, cls.parser_wopmars_db,
            cls.parser_wopmars_dryrun, cls.parser_wopmars_forceall
        ],
        help="filters out sequence artifacts and creates an amplicon "
             "sequence variant (ASV) table")
    parser_vtam_filter.add_argument(
        '--sortedinfo',
        action='store',
        help="input TSV file with information about FASTA files containing "
             "sorted reads",
        required=True,
        type=lambda x: FileSampleInformation(x).check_args(
            header=header_sortedread_fasta))
    parser_vtam_filter.add_argument(
        '--sorteddir',
        action='store',
        help="input directory with sorted (trimmed and demultiplexed) "
             "FASTA files",
        required=True,
        type=ArgParserChecker.check_dir_exists_and_is_nonempty)
    parser_vtam_filter.add_argument(
        '--asvtable',
        action='store',
        help="output TSV file for the amplicon sequence variant (ASV) "
             "table",
        required=True)
    parser_vtam_filter.add_argument(
        '--cutoff_specific',
        dest='cutoff_specific',
        default=None,
        action='store',
        required=False,
        help="TSV file with variant (col1: variant; col2: cutoff) or "
             "variant-replicate (col1: variant; col2: replicate; "
             "col3: cutoff) specific cutoffs",
        type=lambda x: FileCutoffSpecific(x).argparse_checker())
    parser_vtam_filter.add_argument(
        '--lfn_variant_replicate',
        action='store_true',
        help="if set, VTAM runs the low-frequency-noise algorithm over "
             "variants and replicates",
        required=False,
        default=False)
    parser_vtam_filter.add_argument(
        '--known_occurrences',
        action='store',
        help="TSV file with expected (keep) occurrences",
        required=False,
        type=lambda x: FileKnownOccurrences(
            x).argparse_checker_known_occurrences())
    parser_vtam_filter.add_argument(
        '-U', '--until',
        dest='until',
        action='store',
        default=None,
        help="""execute '%(prog)s' UNTIL one rule, where the rule order is:
1. SampleInformation, 2. VariantReadCount, 3. FilterLFN,
4. FilterMinReplicateNumber, 5. FilterPCRerror, 6. FilterChimera,
7. FilterMinReplicateNumber2, 8. FilterRenkonen,
9. FilterMinReplicateNumber3, 10. FilterIndel, 11. FilterCodonStop,
12. ReadCountAverageOverReplicates, 13. MakeAsvTable""",
        required=False)
    parser_vtam_filter.add_argument(
        '-S', '--since',
        dest='since',
        action='store',
        default=None,
        help="""execute '%(prog)s' SINCE one rule, where the rule order is:
1. SampleInformation, 2. VariantReadCount, 3. FilterLFN,
4. FilterMinReplicateNumber, 5. FilterPCRerror, 6. FilterChimera,
7. FilterMinReplicateNumber2, 8. FilterRenkonen,
9. FilterMinReplicateNumber3, 10. FilterIndel, 11. FilterCodonStop,
12. ReadCountAverageOverReplicates, 13. MakeAsvTable""",
        required=False)
    # This attribute selects the command to run
    parser_vtam_filter.set_defaults(command='filter')
def run(self):
    session = self.session
    engine = session._session().get_bind()

    ########################################################################
    # Wrapper inputs, outputs and parameters
    ########################################################################

    # Input file paths
    fasta_info_tsv = self.input_file(
        FilterCodonStop.__input_file_sortedinfo)

    # Input table models
    input_filter_indel_model = self.input_table(
        FilterCodonStop.__input_table_filter_indel)

    # Options
    genetic_code = int(self.option("genetic_code"))
    skip_filter_codon_stop = bool(
        int(self.option("skip_filter_codon_stop")))

    # Output table models
    output_filter_codon_stop_model = self.output_table(
        FilterCodonStop.__output_table_filter_codon_stop)

    ########################################################################
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate
    #    for the current analysis
    # 2. Delete run/marker/sample/replicate entries from the output model
    # 3. Get the nijk_df input
    ########################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_codon_stop_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_filter_indel_model,
        engine=engine, filter_id=None)

    ########################################################################
    # 4. Run filter
    ########################################################################

    variant_df = sample_info_tsv_obj.get_variant_df(
        variant_read_count_like_model=input_filter_indel_model,
        engine=engine)
    variant_read_count_delete_df = RunnerFilterCodonStop(
        variant_read_count_df=variant_read_count_df
    ).get_variant_read_count_delete_df(
        variant_df=variant_df,
        genetic_code=genetic_code,
        skip_filter_codon_stop=skip_filter_codon_stop)

    ########################################################################
    # 5. Write to DB
    # 6. Touch output tables, to update modification date
    # 7. Exit vtam if all variants are deleted
    ########################################################################

    DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_codon_stop_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()

    if variant_read_count_delete_df.filter_delete.sum() \
            == variant_read_count_delete_df.shape[0]:
        Logger.instance().warning(
            VTAMexception(
                "This filter has deleted all the variants: {}. "
                "The analysis will stop here.".format(
                    self.__class__.__name__)))
        sys.exit(0)
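# A minimal sketch of a codon-stop test in the spirit of the filter above,
# under the assumption that a variant is flagged when no reading frame is
# free of stop codons; the exact rule lives in RunnerFilterCodonStop.
# Bio.Seq.translate and the NCBI genetic-code table id are real Biopython
# features, while the function itself is illustrative:

from Bio.Seq import Seq

def has_stop_in_all_frames(sequence, genetic_code=5):
    for frame in range(3):
        sub = sequence[frame:]
        sub = sub[:len(sub) - len(sub) % 3]  # trim to a codon multiple
        if '*' not in str(Seq(sub).translate(table=genetic_code)):
            return False  # at least one open reading frame without a stop
    return True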
def main(fastainfo, fastadir, sorteddir, params=None,
         num_threads=multiprocessing.cpu_count(),
         no_reverse=False, tag_to_end=False, primer_to_end=False):

    Logger.instance().info(
        f"OPTIONS:\n no_reverse: {not no_reverse}\n tag_to_end: {not tag_to_end}"
        f"\n primer_to_end: {not primer_to_end}")

    if sys.platform.startswith('win'):
        num_threads = 1

    ############################################################################################
    #
    # params.yml parameters
    #
    ############################################################################################

    params_dic = FileParams(params).get_params_dic()
    cutadapt_error_rate = params_dic['cutadapt_error_rate']
    cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
    cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

    ############################################################################################
    #
    # Loop over tag and primer pairs to demultiplex and trim reads
    #
    ############################################################################################

    merged_fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df()
    pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
    tempdir = PathManager.instance().get_tempdir()

    merged_fasta_list = []
    results_list = []
    sample_info = {}

    # make sure every file is analysed once
    for i in range(merged_fastainfo_df.shape[0]):
        if merged_fastainfo_df.iloc[i].mergedfasta not in merged_fasta_list:
            merged_fasta_list.append(merged_fastainfo_df.iloc[i].mergedfasta)

    for mergedfasta in merged_fasta_list:

        inputFiles = FilesInputCutadapt(
            fastainfo, mergedfasta, no_reverse, tag_to_end)

        tagFile_path = inputFiles.tags_file()
        info = inputFiles.get_df_info()

        for key in info.keys():
            if key in sample_info.keys():
                sample_info[key] = sample_info[key] + info[key]
            else:
                sample_info[key] = info[key]

        Logger.instance().debug("Analysing FASTA file: {}".format(mergedfasta))

        in_raw_fasta_path = os.path.join(fastadir, mergedfasta)

        ########################################################################################
        #
        # cutadapt --cores=0 -e 0 --no-indels --trimmed-only -g tagFile:$tagfile
        # --overlap length -o "tagtrimmed.{name}.fasta" in_raw_fasta_path
        #
        ########################################################################################

        base = os.path.basename(in_raw_fasta_path)
        base, base_suffix = base.split('.', 1)

        out_fasta_path = os.path.join(tempdir, "sorted")

        cmd_cutadapt_tag_dic = {
            'in_fasta_path': in_raw_fasta_path,
            'out_fasta': out_fasta_path,
            'num_threads': num_threads,
            'tagFile': tagFile_path,
            'base_suffix': base_suffix,
        }

        cmd_cutadapt_tag_str = \
            'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
            '-g file:{tagFile} --output {out_fasta}_{{name}}.{base_suffix} {in_fasta_path}' \
            .format(**cmd_cutadapt_tag_dic)

        Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

        if sys.platform.startswith("win"):
            args = cmd_cutadapt_tag_str
        else:
            args = shlex.split(cmd_cutadapt_tag_str)
        run_result = subprocess.run(args=args, stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT)

        Logger.instance().info(run_result.stdout.decode())

        inputFiles.remove_tags_file()

        ########################################################################################
        #
        # Trim primers from output
        # cutadapt --quiet --cores=0 -e trim_error --no-indels --trimmed-only
        # --minimum-length minimum_length --maximum-length maximum_length
        # --output input_path + {name} + suffix outputfile
        #
        ########################################################################################

        primers = inputFiles.primers()
        try:
            tags_samples = inputFiles.get_sample_names()
        except Exception as e:
            Logger.instance().error(e)
            return

        for primer in primers:
            marker, primerfwd, primerrev, lenprimerfwd, lenprimerrev = primer

            for tag_sample in tags_samples:
                name, run, marker2, sample, replicate, _, _ = tag_sample

                if marker not in marker2:
                    continue

                in_fasta_path = out_fasta_path + "_" + name + "." + base_suffix
                baseMerge = mergedfasta.split(".")[0]

                outname = run + "_" + marker + "_" + sample + "_" + replicate \
                    + "_" + baseMerge + "_trimmed"
                if name.endswith("_reversed"):
                    outname = outname + "_reversed"
                out_fasta_path_new = os.path.join(
                    tempdir, outname + "." + base_suffix)
                results_list.append(out_fasta_path_new)

                if "_reversed" not in name:
                    if generic_dna:  # Biopython <1.78
                        primerRev = str(
                            Seq(primerrev, generic_dna).reverse_complement())
                    else:  # Biopython >=1.78
                        primerRev = str(Seq(primerrev).reverse_complement())
                    primerFwd = primerfwd
                    lenPrimerFwd = lenprimerfwd
                    lenPrimerRev = lenprimerrev
                else:
                    if generic_dna:  # Biopython <1.78
                        primerRev = str(
                            Seq(primerfwd, generic_dna).reverse_complement())
                    else:  # Biopython >=1.78
                        primerRev = str(Seq(primerfwd).reverse_complement())
                    primerFwd = primerrev
                    lenPrimerFwd = lenprimerrev
                    lenPrimerRev = lenprimerfwd

                cmd_cutadapt_primer_dic = {
                    'in_fasta_path': in_fasta_path,
                    'out_fasta': out_fasta_path_new,
                    'error_rate': cutadapt_error_rate,
                    'num_threads': num_threads,
                    'primerFwd': primerFwd,
                    'primerRev': primerRev,
                    'lenPrimerFwd': lenPrimerFwd,
                    'lenPrimerRev': lenPrimerRev,
                    'read_min_length': cutadapt_minimum_length,
                    'read_max_length': cutadapt_maximum_length,
                }

                if not primer_to_end:  # anchored primers
                    cmd_cutadapt_primer_str = \
                        'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                        '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                        '--trimmed-only -g "^{primerFwd}...{primerRev}$" ' \
                        '--output {out_fasta} {in_fasta_path}' \
                        .format(**cmd_cutadapt_primer_dic)
                else:
                    cmd_cutadapt_primer_str = \
                        'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                        '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                        '--trimmed-only ' \
                        '-g "{primerFwd};min_overlap={lenPrimerFwd}...{primerRev};min_overlap={lenPrimerRev}" ' \
                        '--output {out_fasta} {in_fasta_path}' \
                        .format(**cmd_cutadapt_primer_dic)

                Logger.instance().debug(
                    "Running: {}".format(cmd_cutadapt_primer_str))

                if sys.platform.startswith("win"):
                    args = cmd_cutadapt_primer_str
                else:
                    args = shlex.split(cmd_cutadapt_primer_str)
                run_result = subprocess.run(args=args, stdout=subprocess.PIPE,
                                            stderr=subprocess.STDOUT)

                Logger.instance().info(run_result.stdout.decode())

    ###################################################################
    #
    # Reverse-complement the reversed reads back and pool
    #
    ###################################################################

    for file in results_list:
        if "_trimmed" in file:
            out_final_fasta_path = os.path.join(
                sorteddir, os.path.split(file)[-1])
            in_fasta_path = os.path.join(tempdir, file)

            if out_final_fasta_path.endswith(".gz"):
                _open = partial(gzip.open)
            elif out_final_fasta_path.endswith(".bz2"):
                _open = partial(bz2.open)
            else:
                _open = open

            if in_fasta_path.endswith(".gz"):
                _open2 = partial(gzip.open)
            elif in_fasta_path.endswith(".bz2"):
                _open2 = partial(bz2.open)
            else:
                _open2 = open

            if "_reversed" in file:
                Logger.instance().debug("Pooling fwd and rc reads...")

                out_final_fasta_path = out_final_fasta_path.replace(
                    "_reversed", "")

                with _open(out_final_fasta_path, 'at') as fout:
                    with _open2(in_fasta_path, 'rt') as fin:
                        for line in fin:
                            if not line.startswith('>'):
                                if generic_dna:  # Biopython <1.78
                                    fout.write("%s\n" % str(
                                        Seq(line.strip(),
                                            generic_dna).reverse_complement()))
                                else:  # Biopython >=1.78
                                    fout.write("%s\n" % str(
                                        Seq(line.strip()).reverse_complement()))
                            else:
                                fout.write(line)
            else:
                with _open(out_final_fasta_path, 'at') as fout:
                    with _open2(in_fasta_path, 'rt') as fin:
                        for line in fin:
                            fout.write(line)

    results_list = [os.path.split(result)[-1]
                    for result in results_list if "_reversed" not in result]

    del sample_info['mergedfasta']
    del sample_info['primerrev']
    del sample_info['primerfwd']
    del sample_info['tagrev']
    del sample_info['tagfwd']
    sample_info['sortedfasta'] = results_list

    sample_info_df = pandas.DataFrame(sample_info)

    fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
    sample_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t",
                          header=True, index=False)
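# Illustrative only (the file names and primer sequences below are invented,
# not taken from the code above): the anchored-primer branch renders a cutadapt
# "linked adapter" call, where '^fwd...rev$' requires both primers so that
# --trimmed-only can discard any read lacking the full fwd...rev layout.
example_cutadapt_primer_cmd = (
    'cutadapt --cores=4 --no-indels --error-rate 0.1 '
    '--minimum-length 50 --maximum-length 500 --trimmed-only '
    '-g "^TCCACTAATCACAAAGATATTGGAAC...TGATTTTTTGGTCACCCTGAAGTTTA$" '
    '--output sorted_run1_marker1_sample1_1_merged_trimmed.fasta '
    'sorted_tag1.fasta')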
def main(fastadir, random_seqdir, fastainfo, random_seqinfo, samplesize):

    if not os.path.isdir(fastadir) or not os.listdir(fastadir):
        Logger.instance().error(f"{fastadir} is empty or does not exist!")
        return

    fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df()
    input_files = fastainfo_df.to_dict(orient='list')['mergedfasta']
    fastadir_path = os.path.abspath(fastadir)

    # check that samplesize does not exceed the number of sequences
    # in any FASTA file in fastadir
    files_size = {}
    for input_file in input_files:
        file_path = os.path.join(fastadir_path, input_file)
        line_counter = LineCounter(file_path)
        file_size = line_counter.sequence_counter()
        files_size[input_file] = file_size

    smallest = min(files_size.values())
    if smallest < samplesize:
        Logger.instance().error(
            f"The smallest file in fastadir has {smallest} sequences.\n"
            f"Samplesize cannot exceed this number of sequences")
        return

    ###################################################################
    #
    # Make the random files
    #
    ###################################################################

    # create output folder
    pathlib.Path(random_seqdir).mkdir(parents=True, exist_ok=True)

    output_files = []
    input_files_no_repeat = []

    # create the list to put in the output csv file
    # and the list of files to sample (no duplicates)
    for input_file in input_files:
        base, ext = input_file.split(".", 1)
        output_files.append(base + "_sampled." + ext)
        if input_file not in input_files_no_repeat:
            input_files_no_repeat.append(input_file)

    for input_file in input_files_no_repeat:

        # draw samplesize distinct 0-based sequence indexes,
        # sorted so the file can be read in a single pass
        lines = []
        for _ in range(samplesize):
            num = randint(0, files_size[input_file] - 1)
            while num in lines:
                num = randint(0, files_size[input_file] - 1)
            lines.append(num)
        lines = sorted(lines)

        # make output file path
        base, ext = input_file.split(".", 1)
        output_file = os.path.join(random_seqdir, base + "_sampled." + ext)

        # if the file already exists, delete it
        if os.path.exists(output_file):
            os.remove(output_file)

        # check extension
        if input_file.endswith(".gz"):
            _open = partial(gzip.open)
        elif input_file.endswith(".bz2"):
            _open = partial(bz2.open)
        else:
            _open = open

        input_file = os.path.join(fastadir_path, input_file)

        with _open(input_file, 'rb') as f_in:
            with _open(output_file, 'ab') as f_out:
                countline = -1
                countSelect = 0
                for line in f_in:
                    if line.startswith(b">"):
                        countline += 1
                        if countSelect + 1 < len(lines) and \
                                countline == lines[countSelect + 1]:
                            countSelect += 1
                    if countline == lines[countSelect]:
                        f_out.write(line)
                    if countline > lines[-1]:
                        break

    random_seqinfo_df = fastainfo_df.copy()
    random_seqinfo_df['mergedfasta'] = output_files
    random_seqinfo_df.to_csv(random_seqinfo, sep="\t",
                             header=True, index=False)
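# A simpler equivalent of the index-drawing loop above (a sketch, not taken from
# the code): random.sample guarantees samplesize distinct indexes in one call.
import random

def draw_sequence_indexes(n_sequences, samplesize):
    """Return sorted, distinct 0-based indexes of the sequences to keep."""
    return sorted(random.sample(range(n_sequences), samplesize))

# draw_sequence_indexes(1000, 5) -> e.g. [12, 340, 511, 702, 988]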
def add_parser_optimize(cls, subparsers):
    parser_vtam_optimize = subparsers.add_parser(
        'optimize',
        add_help=True,
        parents=[
            cls.parser_params, cls.parser_log, cls.parser_threads,
            cls.parser_verbosity, cls.parser_wopmars_db,
            cls.parser_wopmars_dryrun, cls.parser_wopmars_forceall
        ],
        help="finds optimal parameters for filtering")

    parser_vtam_optimize.add_argument(
        '--sortedinfo',
        action='store',
        help="input TSV file with information about FASTA files containing "
             "sorted (trimmed and demultiplexed) reads",
        required=True,
        type=lambda x: FileSampleInformation(x).check_args(
            header=header_sortedread_fasta))
    parser_vtam_optimize.add_argument(
        '--sorteddir',
        action='store',
        help="input directory with sorted (trimmed and demultiplexed) "
             "FASTA files",
        required=True,
        type=ArgParserChecker.check_dir_exists_and_is_nonempty)
    parser_vtam_optimize.add_argument(
        '-o', '--outdir',
        action='store',
        help="output directory",
        default="out",
        required=True)
    parser_vtam_optimize.add_argument(
        '--known_occurrences',
        action='store',
        help="TSV file with known variants",
        required=True,
        type=lambda x: FileKnownOccurrences(
            x).argparse_checker_known_occurrences())
    parser_vtam_optimize.add_argument(
        '--lfn_variant_replicate',
        action='store_true',
        help="if set, VTAM runs the low frequency noise algorithm over "
             "variants and replicates",
        required=False,
        default=False)
    parser_vtam_optimize.add_argument(
        '-U', '--until',
        dest='until',
        action='store',
        default=None,
        help="""executes '%(prog)s' UNTIL one rule, where the rules follow this order:
1. SampleInformation, 2. VariantReadCount, 3. either OptimizeLFNsampleReplicate
or OptimizePCRerror or OptimizeLFNreadCountAndLFNvariant""",
        required=False)
    parser_vtam_optimize.add_argument(
        '-S', '--since',
        dest='since',
        action='store',
        default=None,
        help="""executes '%(prog)s' SINCE one rule, where the rules follow this order:
1. SampleInformation, 2. VariantReadCount, 3. either OptimizeLFNsampleReplicate
or OptimizePCRerror or OptimizeLFNreadCountAndLFNvariant""",
        required=False)

    # This attribute will select the right command
    parser_vtam_optimize.set_defaults(command='optimize')
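# A minimal sketch (an assumption; CommandFilter and CommandOptimize are
# hypothetical names) of how the 'command' default set by set_defaults()
# is typically used to dispatch after parsing.
def dispatch(args):
    if args.command == 'filter':
        CommandFilter.main(args)     # run the filter workflow
    elif args.command == 'optimize':
        CommandOptimize.main(args)   # run the optimize workflow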
def main(fastqinfo, fastqdir, fastainfo, fastadir, params=None,
         num_threads=multiprocessing.cpu_count()):

    ############################################################################################
    #
    # params.yml parameters
    #
    ############################################################################################

    params_dic = FileParams(params).get_params_dic()

    ############################################################################################
    #
    # Read fastq information into stats_df
    #
    ############################################################################################

    fastqinfo_df = FileSampleInformation(fastqinfo).read_tsv_into_df()
    pathlib.Path(os.path.dirname(fastainfo)).mkdir(parents=True, exist_ok=True)
    pathlib.Path(fastadir).mkdir(parents=True, exist_ok=True)

    fastainfo_df = pandas.DataFrame()

    ############################################################################################
    #
    # Loop over fastq pairs to merge
    #
    ############################################################################################

    # File with analysis stats data
    stats_df = pandas.DataFrame({'FastqFwd': [], 'FastqRev': [],
                                 'NbReadsFwd': [], 'NbReadsRev': [],
                                 'FastaMerged': [], 'NbMergedReads': []})

    for fastqfwd, fastqrev in fastqinfo_df[[
            'fastqfwd', 'fastqrev']].drop_duplicates().values:

        fastq_info_df_i = fastqinfo_df.loc[
            (fastqinfo_df.fastqfwd == fastqfwd) &
            (fastqinfo_df.fastqrev == fastqrev)]

        # A FASTQ record spans 4 lines, so the read count is line count / 4
        fastq_fw_abspath = os.path.join(fastqdir, fastqfwd)
        with open(fastq_fw_abspath, 'rb') as fin:
            fastq_fw_readcount = int(sum(1 for _ in fin) / 4)

        fastq_rv_abspath = os.path.join(fastqdir, fastqrev)
        with open(fastq_rv_abspath, 'rb') as fin:
            fastq_rv_readcount = int(sum(1 for _ in fin) / 4)

        Logger.instance().debug(
            "Analysing FASTQ files: {} and {}".format(fastqfwd, fastqrev))

        try:
            pathlib.Path(fastq_fw_abspath).resolve(strict=True)
        except FileNotFoundError:
            Logger.instance().error(
                VTAMexception(
                    "VTAMexception: This FASTQ file was not found: {}.".format(
                        fastq_fw_abspath)))
            sys.exit(1)
        try:
            pathlib.Path(fastq_rv_abspath).resolve(strict=True)
        except FileNotFoundError:
            Logger.instance().error(
                VTAMexception(
                    "VTAMexception: This FASTQ file was not found: {}.".format(
                        fastq_rv_abspath)))
            sys.exit(1)

        fasta_merged_basename = os.path.basename(
            fastq_fw_abspath).replace('.fastq', '.fasta')
        out_fasta_path = os.path.join(fastadir, fasta_merged_basename)

        ########################################################################################
        #
        # Run vsearch merge
        #
        ########################################################################################

        vsearch_args_dic = {}
        vsearch_args_dic['fastq_ascii'] = params_dic['fastq_ascii']
        vsearch_args_dic['fastq_maxee'] = params_dic['fastq_maxee']
        vsearch_args_dic['fastq_maxmergelen'] = params_dic['fastq_maxmergelen']
        vsearch_args_dic['fastq_maxns'] = params_dic['fastq_maxns']
        vsearch_args_dic['fastq_minlen'] = params_dic['fastq_minlen']
        vsearch_args_dic['fastq_minmergelen'] = params_dic['fastq_minmergelen']
        vsearch_args_dic['fastq_minovlen'] = params_dic['fastq_minovlen']
        vsearch_args_dic['fastq_truncqual'] = params_dic['fastq_truncqual']
        vsearch_args_dic['fastq_mergepairs'] = fastq_fw_abspath
        vsearch_args_dic['reverse'] = fastq_rv_abspath
        vsearch_args_dic['fastaout'] = out_fasta_path
        vsearch_args_dic['threads'] = num_threads

        vsearch_cluster = RunnerVSearch(parameters=vsearch_args_dic)
        vsearch_cluster.run()

        fastq_info_df_i = fastq_info_df_i[[
            'run', 'marker', 'sample', 'replicate',
            'tagfwd', 'primerfwd', 'tagrev', 'primerrev']]
        fastq_info_df_i['mergedfasta'] = fasta_merged_basename
        fastainfo_df = pandas.concat([fastainfo_df, fastq_info_df_i], axis=0)

        # Count merged reads in the FASTA output (one '>' header per read)
        with open(out_fasta_path, 'rb') as fin:
            fasta_merged_readcount = sum(
                1 for line in fin if line.startswith(b'>'))

        ########################################################################################
        #
        # Summary file
        #
        ########################################################################################

        stats_df = pandas.concat([stats_df, pandas.DataFrame({
            'FastqFwd': [fastq_fw_abspath],
            'FastqRev': [fastq_rv_abspath],
            'NbReadsFwd': [fastq_fw_readcount],
            'NbReadsRev': [fastq_rv_readcount],
            'FastaMerged': [out_fasta_path],
            'NbMergedReads': [fasta_merged_readcount]})])

    for mergedfasta in fastainfo_df[['mergedfasta']].drop_duplicates().values:

        mergedfasta = mergedfasta[0]

        if mergedfasta.endswith('.bz2') or mergedfasta.endswith('.gz'):

            fasta_merged_abspath = os.path.join(fastadir, mergedfasta)
            mergedfasta_compressor = FileCompression(fasta_merged_abspath)

            if mergedfasta.endswith('.gz'):
                mergedfasta_c = mergedfasta_compressor.pigz_compression()
                if mergedfasta_c is None:
                    mergedfasta_c = mergedfasta_compressor.gzip_compression()
            elif mergedfasta.endswith('.bz2'):
                mergedfasta_c = mergedfasta_compressor.bz2_compression()

            mergedfasta_compressor.delete_file()

            _, relPath = os.path.split(mergedfasta_c)
            fastainfo_df.loc[
                fastainfo_df['mergedfasta'] == mergedfasta,
                'mergedfasta'] = relPath
        # uncompressed files keep their original mergedfasta name

    fastainfo_df.to_csv(fastainfo, sep="\t", header=True, index=False)
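# Presumably (an assumption about RunnerVSearch, whose code is not shown here),
# each key of vsearch_args_dic becomes one '--key value' option of a vsearch
# call; with invented file names and parameter values it would look like:
example_vsearch_merge_cmd = (
    'vsearch --fastq_mergepairs fwd.fastq --reverse rev.fastq '
    '--fastaout merged.fasta --threads 4 --fastq_ascii 33 --fastq_maxee 1 '
    '--fastq_maxns 0 --fastq_minovlen 50 --fastq_minmergelen 100 '
    '--fastq_maxmergelen 500 --fastq_minlen 50 --fastq_truncqual 10')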
def run(self):
    session = self.session
    engine = session._session().get_bind()

    #######################################################################
    #
    # 1. Wrapper inputs, outputs and parameters
    #
    #######################################################################

    # Input file
    fasta_info_tsv = self.input_file(MakeAsvTable.__input_file_sortedinfo)
    # Output file
    asvtable_tsv_path = self.output_file(MakeAsvTable.__output_table_asv)
    #
    # Options
    cluster_identity = float(self.option("cluster_identity"))
    known_occurrences_tsv = str(self.option("known_occurrences"))

    #######################################################################
    #
    # Read sortedinfo to get run_id, marker_id, sample_id, replicate
    # for the current analysis
    # Compute variant_read_count_input_df and the other dfs for the
    # asv_table_runner
    #
    #######################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        FilterCodonStop, engine=engine)

    ############################################################################################
    #
    # FileKnownOccurrences
    #
    ############################################################################################

    if known_occurrences_tsv == 'None' or known_occurrences_tsv is None:
        known_occurrences_df = None
    else:
        known_occurrences_df = FileKnownOccurrences(
            known_occurrences_tsv).to_identifier_df(engine)
        known_occurrences_df = known_occurrences_df.loc[
            (known_occurrences_df.mock == 1) &
            (known_occurrences_df.action == 'keep'), ]

    #######################################################################
    #
    # Compute variant_to_chimera_borderline_df
    #
    #######################################################################

    sample_list = sample_info_tsv_obj.read_tsv_into_df()[
        'sample'].drop_duplicates(keep='first').tolist()

    asvtable_runner = RunnerAsvTable(
        variant_read_count_df=variant_read_count_df,
        engine=engine,
        sample_list=sample_list,
        cluster_identity=cluster_identity,
        known_occurrences_df=known_occurrences_df)
    asvtable_runner.to_tsv(asvtable_tsv_path)
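# A toy sketch (not RunnerAsvTable's code) of the core reshaping an ASV table
# needs: sum read counts over replicates, then pivot samples into columns.
# The column names below are assumptions about the nijk dataframe layout.
import pandas

def to_asv_table(variant_read_count_df):
    """Columns assumed: variant_id, sample_id, replicate, read_count."""
    per_sample = (variant_read_count_df
                  .groupby(['variant_id', 'sample_id'])['read_count']
                  .sum()
                  .reset_index())
    return per_sample.pivot_table(index='variant_id', columns='sample_id',
                                  values='read_count', fill_value=0)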
def run(self):
    session = self.session
    engine = session._session().get_bind()

    this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                 os.path.basename(__file__))
    pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

    ############################################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    ############################################################################################

    # Input file
    fasta_info_tsv = self.input_file(
        FilterPCRerror.__input_file_sortedinfo)
    #
    # Input table models
    input_filter_min_replicate_model = self.input_table(
        FilterPCRerror.__input_table_filter_min_replicate_number)
    #
    # Options
    pcr_error_var_prop = self.option("pcr_error_var_prop")
    #
    # Output table models
    output_filter_pcr_error_model = self.output_table(
        FilterPCRerror.__output_table_filter_pcr_error)

    ############################################################################################
    #
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate
    #    for the current analysis
    # 2. Delete marker_name/run_name/sample/replicate from
    #    variant_read_count_model
    # 3. Get nijk_df input
    #
    ############################################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_pcr_error_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_filter_min_replicate_model,
        engine=engine,
        filter_id=None)

    ############################################################################################
    #
    # Run per sample_id
    #
    ############################################################################################

    variant_df = sample_info_tsv_obj.get_variant_df(
        variant_read_count_like_model=input_filter_min_replicate_model,
        engine=engine)

    record_list = []
    run_marker_sample_df = variant_read_count_df[[
        'run_id', 'marker_id', 'sample_id']].drop_duplicates()
    for row in run_marker_sample_df.itertuples():
        run_id = row.run_id
        marker_id = row.marker_id
        sample_id = row.sample_id

        # Get variant read counts for the current run-marker-sample
        variant_read_count_per_sample_df = variant_read_count_df.loc[
            (variant_read_count_df.run_id == run_id) &
            (variant_read_count_df.marker_id == marker_id) &
            (variant_read_count_df.sample_id == sample_id)]

        variant_per_sample_df = variant_df.loc[variant_df.index.isin(
            variant_read_count_per_sample_df.variant_id.unique().tolist())]

        this_step_tmp_per_sample_dir = os.path.join(
            this_temp_dir,
            "run_{}_marker_{}_sample{}".format(run_id, marker_id, sample_id))
        pathlib.Path(this_step_tmp_per_sample_dir).mkdir(exist_ok=True)

        ########################################################################################
        #
        # Run vsearch and get the alignment variant_read_count_input_df
        #
        ########################################################################################

        filter_pcr_error_runner = RunnerFilterPCRerror(
            variant_expected_df=variant_per_sample_df,
            variant_unexpected_df=variant_per_sample_df,
            variant_read_count_df=variant_read_count_per_sample_df)
        filter_output_per_sample_df = \
            filter_pcr_error_runner.get_variant_read_count_delete_df(
                pcr_error_var_prop)

        ########################################################################################
        #
        # Per sample, add to the record list
        #
        ########################################################################################

        record_per_sample_list = \
            ModelVariantReadCountLike.filter_delete_df_to_dict(
                filter_output_per_sample_df)
        record_list = record_list + record_per_sample_list

    variant_read_count_delete_df = pandas.DataFrame.from_records(
        data=record_list)

    ############################################################################################
    #
    # 5. Write to DB
    # 6. Touch output tables, to update the modification date
    # 7. Exit vtam if all variants are deleted
    #
    ############################################################################################

    DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_pcr_error_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()

    if variant_read_count_delete_df.filter_delete.sum() == \
            variant_read_count_delete_df.shape[0]:
        Logger.instance().warning(
            VTAMexception("This filter has deleted all the variants: {}. "
                          "The analysis will stop here.".format(
                              self.__class__.__name__)))
        sys.exit(0)
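# A toy sketch (an assumption, not RunnerFilterPCRerror's implementation) of the
# PCR-error criterion: an unexpected variant is flagged for deletion when it sits
# one mismatch away from a more abundant variant and the ratio of their read
# counts falls below pcr_error_var_prop.
def is_pcr_error(count_unexpected, count_expected, n_mismatches,
                 pcr_error_var_prop=0.1):
    """Return True if the less abundant variant looks like a PCR error."""
    if n_mismatches != 1 or count_expected == 0:
        return False
    return (count_unexpected / count_expected) < pcr_error_var_prop

# is_pcr_error(count_unexpected=5, count_expected=1000, n_mismatches=1) -> True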