def to_identifier_df(self, engine, is_lfn_variant_replicate):
    """Returns a pandas.DataFrame with run_id, marker_id, sample_id entries (see return)

    :return: pandas.DataFrame with columns: run_id, marker_id, ...
    """

    df = self.read_tsv_into_df(is_lfn_variant_replicate)
    df.run_name = NameIdConverter(df.run_name.tolist(), engine).to_ids(Run)
    df.marker_name = NameIdConverter(df.marker_name.tolist(), engine).to_ids(Marker)
    variant_id_user_lst = df.variant_id.tolist()
    df['variant_id'] = NameIdConverter(
        df.variant_sequence.tolist(), engine).variant_sequence_to_id()

    if not df.variant_id.tolist() == variant_id_user_lst:
        Logger.instance().warning(VTAMexception(
            "Some variant IDs and sequences do not agree in the "
            "--cutoff_specific file and in the database."))

    df.rename({'run_name': 'run_id', 'marker_name': 'marker_id'},
              axis=1, inplace=True)
    return df
def is_valid(self):
    """Checks that the user parameter set is contained in the default parameter set"""

    for k in self.params_file_dic:
        if k not in self.params_default_dic:
            Logger.instance().error(VTAMexception(
                'Invalid parameter "{}" in the file "{}"'.format(
                    k, self.params_path)))
            sys.exit(1)
    return True
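# Illustrative sketch (not from the original module): how is_valid() might be
# called after loading a params.yml file. That is_valid() belongs to FileParams
# is an assumption drawn from the surrounding code.
def _example_validate_params(params_path):
    params = FileParams(params_path)  # hypothetical call site
    params.is_valid()  # exits via sys.exit(1) if an unknown parameter key is present
    return params.get_params_dic()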
def download_precomputed_taxonomy(self):
    """Copies the online TSV taxonomy DB to the taxonomy_tsv_path output"""

    Logger.instance().debug(
        "file: {}; line: {}; Downloading taxonomy tsv".format(
            __file__, inspect.currentframe().f_lineno))

    ############################################################################################
    #
    # Download the taxonomy TSV
    #
    ############################################################################################

    taxonomy_tsv_gz_path = '{}.gz'.format(self.taxonomy_tsv_path)

    # Download only if the local file is absent or truncated (< 1 MB);
    # try the remote URLs in order
    if not os.path.isfile(self.taxonomy_tsv_path) or pathlib.Path(
            self.taxonomy_tsv_path).stat().st_size < 1000000:
        try:
            with tqdm(...) as t:
                t.set_description(os.path.basename(taxonomy_tsv_gz_url1))
                urllib.request.urlretrieve(
                    taxonomy_tsv_gz_url1, taxonomy_tsv_gz_path,
                    reporthook=tqdm_hook(t))
        except Exception:
            try:
                with tqdm(...) as t:
                    t.set_description(os.path.basename(taxonomy_tsv_gz_url2))
                    urllib.request.urlretrieve(
                        taxonomy_tsv_gz_url2, taxonomy_tsv_gz_path,
                        reporthook=tqdm_hook(t))
            except Exception:
                with tqdm(...) as t:
                    t.set_description(os.path.basename(taxonomy_tsv_gz_url3))
                    urllib.request.urlretrieve(
                        taxonomy_tsv_gz_url3, taxonomy_tsv_gz_path,
                        reporthook=tqdm_hook(t))

        with gzip.open(taxonomy_tsv_gz_path, 'rb') as fin:
            with open(self.taxonomy_tsv_path, 'wb') as fout:
                shutil.copyfileobj(fin, fout)
        try:
            pathlib.Path(taxonomy_tsv_gz_path).unlink()
        except FileNotFoundError:
            pass
def read_tsv_into_df(self, is_lfn_variant_replicate):
    """Reads the cutoff-specific TSV into a pandas.DataFrame

    Updated: June 3, 2020

    Parameters
    ----------
    is_lfn_variant_replicate : bool
        True if the algorithm is lfn_variant_replicate, False otherwise

    Returns
    -------
    pandas.DataFrame
    """

    df = pandas.read_csv(self.cutoff_specific_tsv, sep="\t", header=0)
    df.columns = df.columns.str.lower()
    df.rename({'lfn_variant_cutoff': 'cutoff',
               'lfn_variant_replicate_cutoff': 'cutoff'},
              inplace=True, axis=1)

    if is_lfn_variant_replicate and set(df.columns.tolist()) >= {
            'run', 'marker', 'variant', 'replicate', 'cutoff', 'sequence'}:
        df = df[['run', 'marker', 'variant', 'replicate', 'cutoff', 'sequence']]
    elif not is_lfn_variant_replicate and set(df.columns.tolist()) >= {
            'run', 'marker', 'variant', 'cutoff', 'sequence'}:
        df = df[['run', 'marker', 'variant', 'cutoff', 'sequence']]
    else:
        Logger.instance().critical(VTAMexception(
            "The format of file '{}' is wrong. Columns 'lfn_variant_cutoff' "
            "or 'lfn_variant_replicate_cutoff' are required.".format(
                self.cutoff_specific_tsv)))
        sys.exit(1)

    df.rename({'run': 'run_name', 'marker': 'marker_name',
               'variant': 'variant_id', 'sequence': 'variant_sequence'},
              axis=1, inplace=True)
    return df
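# Illustrative sketch (assumed layout, values made up): the cutoff-specific TSV
# parsed above is tab-separated with case-insensitive headers; in lfn_variant
# mode the expected columns are run, marker, variant, lfn_variant_cutoff and
# sequence.
CUTOFF_SPECIFIC_TSV_EXAMPLE = (
    "run\tmarker\tvariant\tlfn_variant_cutoff\tsequence\n"
    "run1\tMFZR\t25\t0.001\tACGTACGTACGT\n"
)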
def main(arg_parser_dic):

    ###################################################################
    #
    # Create the FilterLFNreference table and fill it
    #
    ###################################################################

    engine = sqlalchemy.create_engine(
        'sqlite:///{}'.format(str(arg_parser_dic['db'])), echo=False)
    meta = sqlalchemy.MetaData()
    filter_lfn_reference = sqlalchemy.Table(
        'FilterLFNreference', meta,
        sqlalchemy.Column('filter_id', sqlalchemy.Integer, primary_key=True),
        sqlalchemy.Column('filter_name', sqlalchemy.String),
    )
    meta.create_all(engine)

    with engine.connect() as conn:
        for filter_rec in FilterLFNreference_records:
            filter_name = filter_rec['filter_name']
            select_row = conn.execute(sqlalchemy.select(
                [filter_lfn_reference.c.filter_id]).where(
                filter_lfn_reference.c.filter_name == filter_name)).first()
            if select_row is None:  # filter_name is NOT in the database, so INSERT it
                conn.execute(filter_lfn_reference.insert().values(**filter_rec))

    wopmars_runner = RunnerWopmars(command=arg_parser_dic['command'],
                                   cli_args_dic=arg_parser_dic)
    wopmars_command = wopmars_runner.get_wopmars_command()

    ########################################################################################
    #
    # Run wopmars
    #
    ########################################################################################

    # Some arguments are passed through environment variables
    if 'threads' in arg_parser_dic:
        os.environ['VTAM_THREADS'] = str(arg_parser_dic['threads'])
    Logger.instance().info(wopmars_command)
    run_result = subprocess.run(wopmars_command, shell=True)
    sys.exit(run_result.returncode)
def process_blast_result(blast_output_tsv):
    """Reads blast_output_tsv into a DataFrame compatible with the subsequent
    tax assignation. If this DataFrame is empty, vtam exits with a warning.
    """

    Logger.instance().debug(
        "file: {}; line: {}; Reading Blast output from: {}".format(
            __file__, inspect.currentframe().f_lineno, blast_output_tsv))
    blast_output_df = pandas.read_csv(
        blast_output_tsv, sep='\t', header=None,
        names=['variant_id', 'target_id', 'identity', 'evalue',
               'coverage', 'target_tax_id'])

    # Remove null target tax ids
    blast_output_df = blast_output_df.loc[
        ~blast_output_df.target_tax_id.isnull()]

    # Expand multiple target_tax_ids: first convert to string, then split on
    # ';' to keep just one target_tax_id, then convert back to numeric/int
    blast_output_df.target_tax_id = blast_output_df.target_tax_id.astype('str')
    blast_output_df.target_tax_id = blast_output_df.target_tax_id.str.split(
        pat=';', n=1, expand=True)
    blast_output_df.target_tax_id = blast_output_df.target_tax_id.astype(
        'float').astype('int')

    # Blast output extract:
    """
       variant_id  target_id  identity        evalue  coverage  target_tax_id
    0           2  MF7836761    99.429  1.620000e-86       100        1469487
    1           2  MF7836761    99.429  1.620000e-86       100         189839
    2           2  KY2618191    98.857  7.520000e-85       100         189839
    3           2  MF7834791    98.857  7.520000e-85       100         189839
    4           2  KU9559321    98.857  7.520000e-85       100         189839
    """

    if blast_output_df.shape[0] == 0:
        Logger.instance().warning(VTAMexception(
            "Blast did not find any target. VTAM will stop here."))
        sys.exit(0)
    return blast_output_df
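# Minimal self-contained sketch of the target_tax_id handling above: when Blast
# reports several taxa for one hit ("189839;1469487"), only the first one is
# kept. The column name mirrors process_blast_result; the data is made up.
def _example_keep_first_target_tax_id():
    import pandas
    df = pandas.DataFrame({'target_tax_id': ['189839;1469487', '10190']})
    df.target_tax_id = df.target_tax_id.astype('str')
    first = df.target_tax_id.str.split(pat=';', n=1, expand=True)[0]
    return first.astype('float').astype('int')  # 189839, 10190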
def variant_sequence_to_id(self):
    variant_id_lst = []
    with self.engine.connect() as conn:
        for sequence in self.id_name_or_sequence_list:
            result = conn.execute(sqlalchemy.select(
                [Variant.__table__.c.id]).where(
                Variant.__table__.c.sequence == sequence)).first()
            if result is None:
                Logger.instance().error(
                    "Sequence {} not found in table {}".format(
                        sequence, str(Variant.__table__)))
                sys.exit(1)
            variant_id_lst.append(result[0])
    return variant_id_lst
def run(self):
    """Runs the vsearch command

    :return: None
    """
    cmd = self.create_command()

    if sys.platform.startswith("win"):
        args = cmd
    else:
        args = shlex.split(cmd)
    run_result = subprocess.run(args=args, capture_output=True)
    Logger.instance().info(run_result.stdout.decode())
    Logger.instance().info(run_result.stderr.decode())
def create_command(self):
    """Creates the vsearch command that will be run

    :return: str, the command line
    """
    command = 'vsearch'
    for param in self.parameters:
        if self.parameters[param] is not None:
            command += ' --{} {}'.format(param, self.parameters[param])
        else:
            command += ' --{}'.format(param)
    Logger.instance().debug(command)
    return command
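# Sketch of how a parameters dict maps onto the command built by
# create_command() above: keys become long options; None values become bare
# flags. That RunnerVSearch is the owning class is taken from its use
# elsewhere in this code.
def _example_vsearch_command():
    runner = RunnerVSearch(parameters={
        'uchime3_denovo': 'in.fasta',   # --uchime3_denovo in.fasta
        'abskew': 16,                   # --abskew 16
        'quiet': None,                  # --quiet (flag without value)
    })
    return runner.create_command()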
def to_names(self, declarative_model):
    nameid_lst = []
    with self.engine.connect() as conn:
        for idx in self.id_name_or_sequence_list:
            result = conn.execute(sqlalchemy.select(
                [declarative_model.__table__.c.name]).where(
                declarative_model.__table__.c.id == idx)).first()
            if result is None:
                Logger.instance().error(
                    "Id {} not found in table {}".format(
                        idx, str(declarative_model.__table__)))
                sys.exit(1)
            nameid_lst.append(result[0])
    return nameid_lst
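# Sketch of the NameIdConverter round trip used throughout this module:
# to_ids() (used in to_identifier_df above) and to_names() are assumed to be
# symmetric lookups against the same table; 'run1' is a made-up run name.
def _example_name_id_roundtrip(engine):
    run_ids = NameIdConverter(['run1'], engine).to_ids(Run)
    return NameIdConverter(run_ids, engine).to_names(Run)  # ['run1']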
def variant_id_is_chimera_borderline(self):
    chimera_borderline_lst = []
    with self.engine.connect() as conn:
        for variant_id in self.id_name_or_sequence_list:
            result = conn.execute(sqlalchemy.select(
                [FilterChimeraBorderline.__table__.c.filter_delete]).where(
                FilterChimeraBorderline.__table__.c.variant_id == variant_id)
                .distinct()).first()
            if result is None:
                Logger.instance().error(
                    "Variant ID {} not found in table "
                    "FilterChimeraBorderline".format(variant_id))
                sys.exit(1)
            chimera_borderline_lst.append(result[0])
    return chimera_borderline_lst
def __init__(self, asv_table_df, readcounts, run_marker_df=None):
    """Constructor of the CommandPoolRunMarkers class

    Parameters
    ----------
    asv_table_df : pandas.DataFrame
        ASV table.
    readcounts : bool
        Default False. If False, a 0/1 boolean is given for the presence or
        absence of a variant in the pooled table. If True, an integer is
        given with the sum of reads in the pooled runs or markers.
    run_marker_df : pandas.DataFrame
        Output ASV table with pooled variants
    """

    header = {'run_name', 'marker_name', 'variant_id', 'sequence_length',
              'read_count'}
    # The ASV table must contain at least the columns in 'header'
    if not set(asv_table_df.columns) >= header:
        Logger.instance().error(VTAMexception(
            "The ASV table structure is wrong. It is expected to contain "
            "these columns: run_name, marker_name, variant_id, "
            "sequence_length, read_count"))
        sys.exit(1)

    self.sample_names = asv_table_df.columns.tolist()[5:-2]

    if run_marker_df is None:  # Default: pool all markers
        self.asv_table_df = asv_table_df
    else:  # If run_marker_df is given: pool only the markers in this variant_read_count_input_df
        self.asv_table_df = asv_table_df.merge(
            run_marker_df, on=['run_name', 'marker_name'])

    self.tmp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                os.path.basename(__file__))
    pathlib.Path(self.tmp_dir).mkdir(exist_ok=True)

    self.cluster_path = None  # returned by run_vsearch_to_cluster_sequences
    self.cluster_df = None  # returned by get_vsearch_clusters_to_df
    self.readcounts = readcounts
def __download_ncbi_taxonomy_dump(self):
    # Download files
    remotefile = ("ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/"
                  "new_taxdump.tar.gz")
    new_taxdump_path = os.path.join(self.tempdir,
                                    os.path.basename(remotefile))
    Logger.instance().debug(
        "file: {}; line: {}; Downloading NCBI taxonomy dump".format(
            __file__, inspect.currentframe().f_lineno))
    if not os.path.isfile(new_taxdump_path):
        Logger.instance().info("Downloading NCBI taxonomy dump")
        with tqdm(...) as t:
            t.set_description(os.path.basename(new_taxdump_path))
            urllib.request.urlretrieve(remotefile, new_taxdump_path,
                                       reporthook=tqdm_hook(t))
    return new_taxdump_path
def get_one_tax_id_lineage(self, tax_id):
    """Takes a tax_id and creates a dictionary with the taxonomy lineage in
    this form:

    {'species': 183142, 'genus': 10194, 'family': 10193, 'order': 84394,
     'superorder': 1709201, 'class': 10191, 'phylum': 10190,
     'no rank': 131567, 'kingdom': 33208, 'superkingdom': 2759}

    Parameters
    ----------
    tax_id : int
        NCBI taxon id

    Returns
    -------
    dict
        Dictionary with the taxonomy lineage for the given tax_id
    """

    lineage_dic = {}
    while tax_id != 1:
        if tax_id in self.df.index:
            # tax_id is found as a normal tax id in the taxonomy file
            tax_id_row = self.df.loc[tax_id, ]
        elif tax_id in self.old_tax_df.index.tolist():
            # tax_id is found in the old_tax_id column of the taxonomy file
            tax_id_new = self.old_tax_df.loc[tax_id, 'tax_id']
            tax_id_row = self.df.loc[tax_id_new, ]
        else:
            # tax_id is not found in the taxonomy file:
            # return the current lineage dic and exit the function
            Logger.instance().warning(
                "The taxon ID {} in the Blast database is missing in the "
                "taxonomy.tsv. Consider updating this file with the "
                "following command: "
                "vtam taxonomy --output taxonomy.tsv.".format(tax_id))
            return lineage_dic
        rank = tax_id_row['rank']
        parent_tax_id = tax_id_row['parent_tax_id']
        lineage_dic[rank] = tax_id
        tax_id = parent_tax_id
    return lineage_dic
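# Toy illustration of the parent-pointer walk in get_one_tax_id_lineage, on a
# two-row taxonomy table with made-up tax ids (the old_tax_id fallback is
# omitted here).
def _example_lineage_walk():
    import pandas
    df = pandas.DataFrame(
        {'parent_tax_id': [1, 10], 'rank': ['genus', 'species']},
        index=[10, 20])
    lineage_dic, tax_id = {}, 20
    while tax_id != 1:
        tax_id_row = df.loc[tax_id, ]
        lineage_dic[tax_id_row['rank']] = tax_id
        tax_id = tax_id_row['parent_tax_id']
    return lineage_dic  # {'species': 20, 'genus': 10}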
def get_several_tax_id_lineages(self, tax_id_list):
    """Takes a list of tax ids and creates a DataFrame with the taxonomy
    lineages in columns and the tax ids as index:

    tax_id (index)  no rank  species    genus   family     order     class
    1246992         131567   741276.0   5533.0  1799696.0  231213.0  162481.0  29000.0
    1112827         131567   1112827.0  6220.0  941271.0   6219.0    6218.0

    Parameters
    ----------
    tax_id_list : list
        List of NCBI taxon ids

    Returns
    -------
    pandas.DataFrame
        DataFrame with lineages in columns and tax_id as index
    """

    lineage_list = []
    for target_tax_id_i, target_tax_id in enumerate(tax_id_list):
        if target_tax_id_i % 100 == 0:
            Logger.instance().debug(
                "Get lineage of {}-th tax id {} (Total {} tax ids)".format(
                    target_tax_id_i, target_tax_id, len(tax_id_list)))
        lineage_list.append({
            **{'tax_id': target_tax_id},
            **self.get_one_tax_id_lineage(tax_id=target_tax_id)})
    tax_id_lineage_df = pandas.DataFrame(lineage_list)
    tax_id_lineage_df.set_index('tax_id', drop=True, inplace=True,
                                verify_integrity=True)
    return tax_id_lineage_df
def run_local_blast(self):
    """Runs a local blast and returns the path to the output TSV file"""

    #######################################################################
    #
    # Run local blast
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Running local blast with FASTA input {}".format(
            __file__, inspect.currentframe().f_lineno, self.variant_fasta))

    # Run and read local blast result
    blast_output_tsv = os.path.join(self.this_temp_dir, 'blast_output.tsv')
    # blast_output_tsv = "/home/gonzalez/tmp/blast/blast_output.tsv"  # uncomment for testing

    # Get the blast db dir and filename prefix from the NHR file
    os.environ['BLASTDB'] = self.blast_db_dir

    blastn_cline = NcbiblastnCommandline(
        query=self.variant_fasta,
        db=self.blast_db_name,
        evalue=1e-5,
        outfmt='"6 qseqid sacc pident evalue qcovhsp staxids"',
        dust='yes',
        qcov_hsp_perc=self.qcov_hsp_perc,
        num_threads=self.num_threads,
        out=blast_output_tsv)
    Logger.instance().debug("file: {}; line: {}; {}".format(
        __file__, inspect.currentframe().f_lineno, str(blastn_cline)))

    # Run blast
    stdout, stderr = blastn_cline()
    return blast_output_tsv
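# Reference note (derived from the two functions above, not in the original):
# the outfmt 6 fields requested by run_local_blast map positionally onto the
# column names that process_blast_result assigns when reading the TSV back.
BLAST_OUTFMT6_TO_DF_COLUMNS = {
    'qseqid': 'variant_id',      # query (variant) id
    'sacc': 'target_id',         # subject accession
    'pident': 'identity',        # percent identity
    'evalue': 'evalue',
    'qcovhsp': 'coverage',       # query coverage per HSP
    'staxids': 'target_tax_id',  # subject taxonomy id(s), ';'-separated
}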
def main(fastainfo, fastadir, sorteddir, params=None,
         num_threads=multiprocessing.cpu_count()):

    if sys.platform.startswith('win'):
        num_threads = 1

    ############################################################################################
    #
    # params.yml parameters
    #
    ############################################################################################

    params_dic = FileParams(params).get_params_dic()
    cutadapt_error_rate = params_dic['cutadapt_error_rate']
    cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
    cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

    ############################################################################################
    #
    # Loop over tag and primer pairs to demultiplex and trim reads
    #
    ############################################################################################

    merged_fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df()

    pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
    tempdir = PathManager.instance().get_tempdir()

    sorted_read_info_df = pandas.DataFrame()

    for i in range(0, merged_fastainfo_df.shape[0]):
        fasta_info_series = merged_fastainfo_df.iloc[i]

        tag_fwd = fasta_info_series.tagfwd
        tag_rev = fasta_info_series.tagrev
        primer_fwd = fasta_info_series.primerfwd
        primer_rev = fasta_info_series.primerrev

        in_fasta_basename = fasta_info_series.mergedfasta
        Logger.instance().debug(
            "Analysing FASTA file: {}".format(in_fasta_basename))
        fasta_info_df_i = fasta_info_series.to_frame().T
        in_raw_fasta_path = os.path.join(fastadir, in_fasta_basename)

        ########################################################################################
        #
        # Cut adapt tag of forward reads
        # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
        # --front 'tcgatcacgatgt;min_overlap=13...gctgtagatcgaca;min_overlap=14'
        # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
        # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
        #
        ########################################################################################

        if generic_dna:  # Biopython <1.78
            tag_rev_rc = str(Seq(tag_rev, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            tag_rev_rc = str(Seq(tag_rev).reverse_complement())

        out_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
            '.fasta', '_sorted_%03d.fasta' % i)
        out_fasta_path = os.path.join(tempdir, out_fasta_basename)

        cmd_cutadapt_tag_dic = {
            'tag_fwd': tag_fwd,
            'tag_fwd_len': len(tag_fwd),
            'tag_rev_rc': tag_rev_rc,
            'tag_rev_rc_len': len(tag_rev_rc),
            'in_fasta_path': in_raw_fasta_path,
            'out_fasta': out_fasta_path,
            'num_threads': num_threads,
        }

        cmd_cutadapt_tag_str = \
            'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
            '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
            '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)

        Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

        if sys.platform.startswith("win"):
            args = cmd_cutadapt_tag_str
        else:
            args = shlex.split(cmd_cutadapt_tag_str)
        run_result = subprocess.run(args=args, capture_output=True, check=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ########################################################################################
        #
        # Trim primers from output
        # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only
        # --front 'TCCACTAATCACAARGATATTGGTAC;min_overlap=26...GGAGGATTTGGWAATTGATTAGTW;min_overlap=24'
        # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_trimmed_000.fasta
        # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
        #
        ########################################################################################

        if generic_dna:  # Biopython <1.78
            primer_rev_rc = str(Seq(primer_rev, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            primer_rev_rc = str(Seq(primer_rev).reverse_complement())

        in_fasta_path = out_fasta_path
        out_fasta_basename = os.path.basename(in_fasta_path).replace(
            '_sorted_%03d.fasta' % i, '_sorted_trimmed_%03d.fasta' % i)
        out_fasta_path = os.path.join(tempdir, out_fasta_basename)

        cmd_cutadapt_primer_dic = {
            'primer_fwd': primer_fwd,
            'primer_fwd_len': len(primer_fwd),
            'primer_rev_rc': primer_rev_rc,
            'primer_rev_rc_len': len(primer_rev_rc),
            'in_fasta_path': in_fasta_path,
            'out_fasta': out_fasta_path,
            'error_rate': cutadapt_error_rate,
            'read_min_length': cutadapt_minimum_length,
            'read_max_length': cutadapt_maximum_length,
            'num_threads': num_threads,
        }

        cmd_cutadapt_primer_str = \
            'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
            '--minimum-length {read_min_length} ' \
            '--maximum-length {read_max_length} --trimmed-only ' \
            '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \
            '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)

        Logger.instance().debug("Running: {}".format(cmd_cutadapt_primer_str))

        if sys.platform.startswith("win"):
            args = cmd_cutadapt_primer_str
        else:
            args = shlex.split(cmd_cutadapt_primer_str)
        run_result = subprocess.run(args=args, capture_output=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ########################################################################################
        #
        # Cut adapt tag of reverse-complement reads
        # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
        # --front 'tgtcgatctacagc;min_overlap=14...acatcgtgatcga;min_overlap=13'
        # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
        # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
        #
        ########################################################################################

        if generic_dna:  # Biopython <1.78
            tag_fwd_rc = str(Seq(tag_fwd, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            tag_fwd_rc = str(Seq(tag_fwd).reverse_complement())

        out_rc_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
            '.fasta', '_rc_sorted_%03d.fasta' % i)
        out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

        cmd_cutadapt_tag_dic = {
            'tag_fwd': tag_rev,
            'tag_fwd_len': len(tag_rev),
            'tag_rev_rc': tag_fwd_rc,
            'tag_rev_rc_len': len(tag_fwd_rc),
            'in_fasta_path': in_raw_fasta_path,
            'out_fasta': out_rc_fasta_path,
            'num_threads': num_threads,
        }

        cmd_cutadapt_tag_str = \
            'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
            '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
            '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)

        Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

        if sys.platform.startswith("win"):
            args = cmd_cutadapt_tag_str
        else:
            args = shlex.split(cmd_cutadapt_tag_str)
        run_result = subprocess.run(args=args, capture_output=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ###################################################################
        #
        # Trim primers from output
        # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only
        # --front 'WACTAATCAATTWCCAAATCCTCC;min_overlap=24...GTACCAATATCYTTGTGATTAGTGGA;min_overlap=26'
        # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_trimmed_000.fasta
        # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
        #
        ###################################################################

        if generic_dna:  # Biopython <1.78
            primer_fwd_rc = str(Seq(primer_fwd, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            primer_fwd_rc = str(Seq(primer_fwd).reverse_complement())

        in_fasta_path = out_rc_fasta_path
        out_rc_fasta_basename = os.path.basename(in_fasta_path).replace(
            '_rc_sorted_%03d.fasta' % i, '_rc_sorted_trimmed_%03d.fasta' % i)
        out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

        cmd_cutadapt_primer_dic = {
            'primer_fwd': primer_rev,
            'primer_fwd_len': len(primer_rev),
            'primer_rev_rc': primer_fwd_rc,
            'primer_rev_rc_len': len(primer_fwd_rc),
            'in_fasta_path': in_fasta_path,
            'out_fasta': out_rc_fasta_path,
            'error_rate': cutadapt_error_rate,
            'read_min_length': cutadapt_minimum_length,
            'read_max_length': cutadapt_maximum_length,
            'num_threads': num_threads,
        }

        cmd_cutadapt_primer_str = \
            'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
            '--minimum-length {read_min_length} ' \
            '--maximum-length {read_max_length} --trimmed-only ' \
            '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \
            '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)

        Logger.instance().debug("Running: {}".format(cmd_cutadapt_primer_str))

        if sys.platform.startswith("win"):
            args = cmd_cutadapt_primer_str
        else:
            args = shlex.split(cmd_cutadapt_primer_str)
        run_result = subprocess.run(args=args, capture_output=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ###################################################################
        #
        # Reverse complement back rc fasta and pool
        #
        ###################################################################

        out_final_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
            '.fasta', '_%03d.fasta' % i)
        out_final_fasta_path = os.path.join(sorteddir, out_final_fasta_basename)
        shutil.copy(out_fasta_path, out_final_fasta_path)

        Logger.instance().debug("Pooling fwd and rc reads...")
        with open(out_final_fasta_path, 'a') as fout:
            with open(out_rc_fasta_path, 'r') as fin:
                for line in fin:
                    if not line.startswith('>'):
                        if generic_dna:  # Biopython <1.78
                            fout.write("%s\n" % str(
                                Seq(line.strip(), generic_dna).reverse_complement()))
                        else:  # Biopython >=1.78
                            fout.write("%s\n" % str(
                                Seq(line.strip()).reverse_complement()))
                    else:
                        fout.write(line)

        fasta_info_df_i = fasta_info_df_i[['run', 'marker', 'sample', 'replicate']]
        fasta_info_df_i['sortedfasta'] = out_final_fasta_basename
        sorted_read_info_df = pandas.concat(
            [sorted_read_info_df, fasta_info_df_i], axis=0)

    fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
    sorted_read_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t", header=True,
                               index=False)
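# Minimal sketch of the Biopython version shim repeated above: before 1.78 a
# Seq needed an alphabet (generic_dna); from 1.78 on the alphabet module was
# removed, so generic_dna is expected to be falsy under newer Biopython
# (assumed import convention of this codebase).
def _example_reverse_complement(sequence):
    if generic_dna:  # Biopython <1.78
        return str(Seq(sequence, generic_dna).reverse_complement())
    return str(Seq(sequence).reverse_complement())  # Biopython >=1.78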
def run(self):
    session = self.session
    engine = session._session().get_bind()

    ############################################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    ############################################################################################

    # Input file
    fasta_info_tsv = self.input_file(FilterRenkonen.__input_file_sortedinfo)
    #
    # Input table models
    input_filter_chimera_model = self.input_table(FilterRenkonen.__input_table_chimera)
    #
    # Options
    renkonen_distance_quantile = float(self.option("renkonen_distance_quantile"))
    #
    # Output table models
    output_filter_renkonen_model = self.output_table(
        FilterRenkonen.__output_table_filter_renkonen)

    ############################################################################################
    #
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
    # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
    # 3. Get nijk_df input
    #
    ############################################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_renkonen_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_filter_chimera_model,
        engine=engine, filter_id=None)

    ############################################################################################
    #
    # Run per run_id, marker_id
    #
    ############################################################################################

    variant_read_count_delete_df = pandas.DataFrame()
    run_marker_df = variant_read_count_df[['run_id', 'marker_id']].drop_duplicates()

    for row in run_marker_df.itertuples():
        run_id = row.run_id
        marker_id = row.marker_id

        variant_read_count_per_run_marker_df = variant_read_count_df.loc[
            (variant_read_count_df.run_id == run_id)
            & (variant_read_count_df.marker_id == marker_id)]

        if variant_read_count_per_run_marker_df.replicate.unique().shape[0] > 1:
            # More than one replicate
            filter_renkonen_runner_obj = RunnerFilterRenkonen(
                variant_read_count_per_run_marker_df)
            filter_output_i_df = filter_renkonen_runner_obj \
                .get_variant_read_count_delete_df(renkonen_distance_quantile)
        else:
            # Just one replicate
            filter_output_i_df = variant_read_count_df.copy()
            filter_output_i_df['filter_delete'] = False

        variant_read_count_delete_df = pandas.concat(
            [variant_read_count_delete_df, filter_output_i_df], axis=0)

    ############################################################################################
    #
    # 5. Write to DB
    # 6. Touch output tables, to update the modification date
    # 7. Exit vtam if all variants are deleted
    #
    ############################################################################################

    DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_renkonen_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()

    if variant_read_count_delete_df.filter_delete.sum() \
            == variant_read_count_delete_df.shape[0]:
        Logger.instance().warning(VTAMexception(
            "This filter has deleted all the variants: {}. "
            "The analysis will stop here.".format(self.__class__.__name__)))
        sys.exit(0)
def get_nijk_df(self, variant_read_count_like_model, engine, filter_id=None):
    """Based on the SortedReadFile samples and the variant_read_count_model,
    returns the variant_read_count_input_df

    :param variant_read_count_like_model: SQLAlchemy model with columns:
        run_id, marker_id, sample_id, replicate, variant_id, read_count
    :param filter_id: optional filter id to restrict the query
    :return: DataFrame with columns: run_id, marker_id, sample_id, replicate,
        variant_id, read_count
    """

    variant_read_count_like_table = variant_read_count_like_model.__table__

    variant_read_count_list = []
    for sample_instance_row in self.to_identifier_df(engine=engine).itertuples():
        run_id = sample_instance_row.run_id
        marker_id = sample_instance_row.marker_id
        sample_id = sample_instance_row.sample_id
        replicate = sample_instance_row.replicate
        stmt_select = sqlalchemy.select([
            variant_read_count_like_table.c.run_id,
            variant_read_count_like_table.c.marker_id,
            variant_read_count_like_table.c.sample_id,
            variant_read_count_like_table.c.replicate,
            variant_read_count_like_table.c.variant_id,
            variant_read_count_like_table.c.read_count]).distinct() \
            .where(variant_read_count_like_table.c.run_id == run_id) \
            .where(variant_read_count_like_table.c.marker_id == marker_id) \
            .where(variant_read_count_like_table.c.sample_id == sample_id) \
            .where(variant_read_count_like_table.c.replicate == replicate)
        # Used for filter tables where the filter_delete attribute exists
        if 'filter_delete' in [
                column.key for column in variant_read_count_like_table.columns]:
            stmt_select = stmt_select.where(
                variant_read_count_like_table.c.filter_delete == 0)
        # Used for filter lfn where filter_id = 8 is necessary
        # (do not pass all filters)
        if filter_id is not None:
            stmt_select = stmt_select.where(
                variant_read_count_like_table.c.filter_id == filter_id)
        with engine.connect() as conn2:
            for row in conn2.execute(stmt_select).fetchall():
                variant_read_count_list.append(row)

    variant_read_count_df = pandas.DataFrame.from_records(
        variant_read_count_list,
        columns=['run_id', 'marker_id', 'sample_id', 'replicate',
                 'variant_id', 'read_count'])

    # Exit if there are no variants for the analysis
    if variant_read_count_df.shape[0] == 0:
        Logger.instance().warning(VTAMexception(
            "No variants available after this filter. "
            "The pipeline will stop here."))
        sys.exit(0)
    return variant_read_count_df
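# Sketch of the conditional WHERE chaining used in get_nijk_df: each .where()
# returns a new select, so optional clauses can be appended under a condition.
# The table argument stands in for any variant_read_count-like table
# (assumption); shown with the legacy select([...]) API used in this codebase.
def _example_conditional_where(table, filter_id=None):
    import sqlalchemy
    stmt = sqlalchemy.select([table.c.variant_id, table.c.read_count])
    if 'filter_delete' in [column.key for column in table.columns]:
        stmt = stmt.where(table.c.filter_delete == 0)
    if filter_id is not None:
        stmt = stmt.where(table.c.filter_id == filter_id)
    return stmt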
def run(self):
    session = self.session
    engine = session._session().get_bind()

    #######################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    #######################################################################

    # Input file
    input_file_sortedinfo = self.input_file(
        VariantReadCount.__input_file_sortedinfo)
    #
    # Input table models
    run_model = self.input_table(VariantReadCount.__input_table_run)
    marker_model = self.input_table(VariantReadCount.__input_table_marker)
    sample_model = self.input_table(VariantReadCount.__input_table_sample)
    #
    # Output table models
    variant_model = self.output_table(VariantReadCount.__output_table_variant)
    variant_read_count_model = self.output_table(
        VariantReadCount.__output_table_variant_read_count)
    # Options
    read_dir = self.option("read_dir")
    global_read_count_cutoff = self.option("global_read_count_cutoff")

    #######################################################################
    #
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
    # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
    # 3. Read tsv file with sorted reads
    # 4. Group by read sequence
    # 5. Delete variants if below global_read_count_cutoff
    # 6. Insert into Variant and DataframeVariantReadCountLike tables
    #
    #######################################################################

    #######################################################################
    #
    # 1. Read sample information to get run_id, marker_id, sample_id,
    #    replicate for the current analysis
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Read sample information".format(
            __file__, inspect.currentframe().f_lineno))
    sortedinfo_df = pandas.read_csv(input_file_sortedinfo, sep="\t", header=0)
    sample_instance_list = []
    sortedinfo_df.columns = sortedinfo_df.columns.str.lower()

    for row in sortedinfo_df.itertuples():
        Logger.instance().debug(row)
        marker_name = row.marker
        run_name = row.run
        sample_name = row.sample
        replicate = row.replicate
        with engine.connect() as conn:
            # Get run_id
            stmt_select_run_id = select([run_model.__table__.c.id]).where(
                run_model.__table__.c.name == run_name)
            run_id = conn.execute(stmt_select_run_id).first()[0]
            # Get marker_id
            stmt_select_marker_id = select([marker_model.__table__.c.id]).where(
                marker_model.__table__.c.name == marker_name)
            marker_id = conn.execute(stmt_select_marker_id).first()[0]
            # Get sample_id
            stmt_select_sample_id = select([sample_model.__table__.c.id]).where(
                sample_model.__table__.c.name == sample_name)
            sample_id = conn.execute(stmt_select_sample_id).first()[0]
            # Add this sample instance
            sample_instance_list.append({
                'run_id': run_id,
                'marker_id': marker_id,
                'sample_id': sample_id,
                'replicate': replicate})

    #######################################################################
    #
    # 2. Delete marker_name/run_name/sample/replicate from
    #    variant_read_count_model
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Delete marker_name/run_name/sample/replicate".format(
            __file__, inspect.currentframe().f_lineno))
    with engine.connect() as conn:
        stmt_del = variant_read_count_model.__table__.delete()
        stmt_del = stmt_del.where(
            variant_read_count_model.__table__.c.run_id == bindparam('run_id'))
        stmt_del = stmt_del.where(
            variant_read_count_model.__table__.c.marker_id == bindparam('marker_id'))
        stmt_del = stmt_del.where(
            variant_read_count_model.__table__.c.sample_id == bindparam('sample_id'))
        stmt_del = stmt_del.where(
            variant_read_count_model.__table__.c.replicate == bindparam('replicate'))
        conn.execute(stmt_del, sample_instance_list)

    #######################################################################
    #
    # 3. Read tsv file with sorted reads
    #
    #######################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=input_file_sortedinfo)
    sample_info_ids_df = sample_info_tsv_obj.to_identifier_df(engine=engine)

    Logger.instance().debug(
        "file: {}; line: {}; Read demultiplexed FASTA files".format(
            __file__, inspect.currentframe().f_lineno))
    variant_read_count_df = pandas.DataFrame()

    for row in sample_info_ids_df.itertuples():
        run_id = row.run_id
        marker_id = row.marker_id
        sample_id = row.sample_id
        replicate = row.replicate
        read_fasta = row.sortedfasta
        Logger.instance().debug(
            "file: {}; line: {}; Read FASTA: {}".format(
                __file__, inspect.currentframe().f_lineno, read_fasta))
        read_fasta_path = os.path.join(read_dir, read_fasta)

        if os.path.exists(read_fasta_path):

            ####################################################################################
            #
            # Read FASTA
            #
            ####################################################################################

            sorted_read_list = VariantReadCount.get_sorted_read_list(
                read_fasta_path, generic_dna)
            variant_read_count_df_sorted_i = pandas.DataFrame({
                'run_id': [run_id] * len(sorted_read_list),
                'marker_id': [marker_id] * len(sorted_read_list),
                'sample_id': [sample_id] * len(sorted_read_list),
                'replicate': [replicate] * len(sorted_read_list),
                'read_sequence': sorted_read_list,
                'read_count': [1] * len(sorted_read_list)})
            # Compute read count
            variant_read_count_df_sorted_i = variant_read_count_df_sorted_i.groupby(
                ['run_id', 'marker_id', 'sample_id', 'replicate',
                 'read_sequence']).sum().reset_index()
            variant_read_count_df = pandas.concat(
                [variant_read_count_df, variant_read_count_df_sorted_i], axis=0)
        else:
            Logger.instance().warning(
                'This file {} does not exist'.format(read_fasta_path))

    #######################################################################
    #
    # 4. Group by read sequence to variant_read_count with run_id, marker_name, ...
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Group by read sequence".format(
            __file__, inspect.currentframe().f_lineno))
    variant_read_count_df = variant_read_count_df.groupby(
        ['run_id', 'marker_id', 'sample_id', 'replicate',
         'read_sequence']).sum().reset_index()
    variant_read_count_df.rename(columns={'read_sequence': 'variant_id'},
                                 inplace=True)
    variant_read_count_df.sort_values(by=variant_read_count_df.columns.tolist())

    #######################################################################
    #
    # 5. Remove variants with a read count across all runs, markers, samples
    #    and replicates lower than the global_read_count_cutoff parameter
    #
    #######################################################################

    variant_read_count_like_df_obj = DataframeVariantReadCountLike(
        variant_read_count_df)
    Logger.instance().debug(
        "file: {}; line: {}; Remove variants with global read count lower "
        "than parameter 'global_read_count_cutoff'".format(
            __file__, inspect.currentframe().f_lineno))
    variant_read_count_df = variant_read_count_like_df_obj \
        .filter_out_below_global_read_count_cutoff(
            global_read_count_cutoff=global_read_count_cutoff)
    variant_read_count_df.rename(columns={'variant_id': 'variant_sequence'},
                                 inplace=True)

    #######################################################################
    #
    # 6. Insert into Variant and VariantReadCount tables
    #
    #######################################################################

    Logger.instance().debug("file: {}; line: {}; Insert variants".format(
        __file__, inspect.currentframe().f_lineno))
    variant_read_count_instance_list = []
    variant_read_count_df.sort_values(
        by=['variant_sequence', 'run_id', 'marker_id', 'sample_id',
            'replicate'], inplace=True)
    variant_new_set = set()
    variant_new_instance_list = []

    with engine.connect() as conn:
        # Retrieve the maximal variant id if possible
        select_variant_id_max = conn.execute(sqlalchemy.select(
            [func.max(variant_model.__table__.c.id)])).first()[0]
        if select_variant_id_max is None:
            select_variant_id_max = 0  # If no variants, the maximal variant id is 0
        for row in variant_read_count_df.itertuples():
            run_id = row.run_id
            marker_id = row.marker_id
            sample_id = row.sample_id
            replicate = row.replicate
            variant_sequence = row.variant_sequence
            read_count = row.read_count
            select_row = conn.execute(sqlalchemy.select(
                [variant_model.__table__.c.id]).where(
                variant_model.__table__.c.sequence == variant_sequence)).first()
            if select_row is None:
                # variant_sequence IS NOT in the database, so it will be INSERTed
                if variant_sequence not in variant_new_set:
                    variant_id = select_variant_id_max + \
                        len(variant_new_instance_list) + 1
                    variant_new_set.add(variant_sequence)
                    variant_new_instance_list.append({
                        'id': variant_id, 'sequence': variant_sequence})
            else:
                # variant_sequence IS in the database
                variant_id = select_row[0]
            variant_read_count_instance_list.append({
                'run_id': run_id,
                'marker_id': marker_id,
                'variant_id': variant_id,
                'sample_id': sample_id,
                'replicate': replicate,
                'read_count': read_count})

    #######################################################################
    #
    # Exit if variant_read_count_instance_list is empty
    #
    #######################################################################

    if not len(variant_read_count_instance_list):
        Logger.instance().warning(VTAMexception(
            "No new variants in these samples. Maybe singletons? "
            "The analysis will stop here."))
        sys.exit(0)

    #######################################################################
    #
    # Write variant_read_count table
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Insert variant read count".format(
            __file__, inspect.currentframe().f_lineno))
    with engine.connect() as conn:
        # Insert if there are some new variants
        if len(variant_new_instance_list) > 0:
            conn.execute(variant_model.__table__.insert(),
                         variant_new_instance_list)
        # Insert new variant_read_count instances
        conn.execute(variant_read_count_model.__table__.insert(),
                     variant_read_count_instance_list)

    #######################################################################
    #
    # Touch output tables, to update the modification date
    #
    #######################################################################

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()
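# Sketch of the "touch" idiom that closes run() above: rewriting the last
# row's id with its own value updates the table's modification date tracked by
# wopmars without changing any data (an interpretation of the code, not a
# documented API contract).
def _example_touch_table(session, declarative_meta):
    obj = session.query(declarative_meta).order_by(
        declarative_meta.id.desc()).first()
    if obj is not None:  # guard added here; the original assumes a row exists
        session.query(declarative_meta).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()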
def get_variant_read_count_delete_df(self, variant_df, uchime3_denovo_abskew):
    temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                            os.path.basename(__file__))
    pathlib.Path(temp_dir).mkdir(exist_ok=True)

    filter_output_chimera_df = self.variant_read_count_df.copy()
    filter_output_chimera_df['filter_delete'] = False
    filter_output_borderline_df = self.variant_read_count_df.copy()
    filter_output_borderline_df['filter_delete'] = False

    run_marker_sample_df = self.variant_read_count_df[
        ['run_id', 'marker_id', 'sample_id']].drop_duplicates(inplace=False)

    for row in run_marker_sample_df.itertuples():
        run_id = row.run_id
        marker_id = row.marker_id
        sample_id = row.sample_id

        variant_read_count_df = self.variant_read_count_df.loc[
            (self.variant_read_count_df.run_id == run_id)
            & (self.variant_read_count_df.marker_id == marker_id)
            & (self.variant_read_count_df.sample_id == sample_id)]
        variant_read_count_df_obj = DataframeVariantReadCountLike(
            variant_read_count_df=variant_read_count_df)

        N_i_df = variant_read_count_df_obj.get_N_i_df()

        variant_size_df = variant_df.merge(N_i_df, left_index=True,
                                           right_on='variant_id')
        variant_size_df = variant_size_df[['variant_id', 'sequence', 'N_i']]
        variant_size_df.rename(columns={'N_i': 'size'}, inplace=True)
        variant_size_df.set_index('variant_id', inplace=True)

        ###################################################################
        #
        # Sort variants by abundance and write to fasta_path
        #
        ###################################################################

        variant_size_df.sort_values(by='size', ascending=False, inplace=True)
        variant_df_utils_obj = DataframeVariant(variant_size_df)
        uchime_fasta_path = os.path.join(
            temp_dir, 'run_{}_marker_{}_sample_{}.fasta'.format(
                run_id, marker_id, sample_id))
        variant_df_utils_obj.to_fasta(fasta_path=uchime_fasta_path,
                                      add_column="size")

        ###################################################################
        #
        # Run uchime_denovo
        #
        ###################################################################

        uchime_borderline_fasta_path = os.path.join(
            temp_dir, 'run_{}_marker_{}_sample_{}_borderline.fasta'.format(
                run_id, marker_id, sample_id))
        uchime_nonchimeras_fasta_path = os.path.join(
            temp_dir, 'run_{}_marker_{}_sample_id_{}_nonchimeras.fasta'.format(
                run_id, marker_id, sample_id))
        uchime_chimeras_fasta_path = os.path.join(
            temp_dir, 'run_{}_marker_{}_sample_{}_chimeras.fasta'.format(
                run_id, marker_id, sample_id))

        # Create the object and run vsearch
        vsearch_parameters = {
            'uchime3_denovo': uchime_fasta_path,
            'borderline': uchime_borderline_fasta_path,
            'nonchimeras': uchime_nonchimeras_fasta_path,
            'chimeras': uchime_chimeras_fasta_path,
            'abskew': uchime3_denovo_abskew,
        }
        vsearch_cluster = RunnerVSearch(parameters=vsearch_parameters)
        vsearch_cluster.run()

        ###################################################################
        #
        # 4. Delete variant from replicate/sample if chimera
        #
        ###################################################################

        Logger.instance().debug(
            "Vsearch uchime chimera tsv_path: {}".format(
                uchime_chimeras_fasta_path))
        with open(uchime_chimeras_fasta_path, "r") as handle:
            for chimera_seqrecord in SeqIO.parse(handle, "fasta"):
                variant_id = int(chimera_seqrecord.id.split(';')[0])
                filter_output_chimera_df.loc[
                    (filter_output_chimera_df['run_id'] == run_id)
                    & (filter_output_chimera_df['marker_id'] == marker_id)
                    & (filter_output_chimera_df['sample_id'] == sample_id)
                    & (filter_output_chimera_df['variant_id'] == variant_id),
                    'filter_delete'] = True

        Logger.instance().debug(
            "Vsearch uchime chimera borderline tsv_path: {}".format(
                uchime_borderline_fasta_path))
        with open(uchime_borderline_fasta_path, "r") as handle:
            for chimera_seqrecord in SeqIO.parse(handle, "fasta"):
                variant_id = int(chimera_seqrecord.id.split(';')[0])
                filter_output_borderline_df.loc[
                    (filter_output_borderline_df['run_id'] == run_id)
                    & (filter_output_borderline_df['marker_id'] == marker_id)
                    & (filter_output_borderline_df['sample_id'] == sample_id)
                    & (filter_output_borderline_df['variant_id'] == variant_id),
                    'filter_delete'] = True

    return filter_output_chimera_df, filter_output_borderline_df
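# Sketch of the FASTA header convention handled above: to_fasta(...,
# add_column="size") is assumed to write vsearch-style abundance annotations,
# so uchime output records look like '>42;size=1337' and the variant id is the
# first ';'-separated field. The example id is made up.
def _example_parse_uchime_header(record_id='42;size=1337'):
    variant_id = int(record_id.split(';')[0])
    size = int(record_id.split(';')[1].replace('size=', ''))
    return variant_id, size  # (42, 1337)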
def main(fastainfo, fastadir, sorteddir, params=None,
         num_threads=multiprocessing.cpu_count(),
         no_reverse=False, tag_to_end=False, primer_to_end=False):

    Logger.instance().info(
        f"OPTIONS:\n no_reverse: {not no_reverse} \n "
        f"tag_to_end {not tag_to_end} \n primer_to_end {not primer_to_end}")

    if sys.platform.startswith('win'):
        num_threads = 1

    ############################################################################################
    #
    # params.yml parameters
    #
    ############################################################################################

    params_dic = FileParams(params).get_params_dic()

    cutadapt_error_rate = params_dic['cutadapt_error_rate']
    cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
    cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

    ############################################################################################
    #
    # Loop over tag and primer pairs to demultiplex and trim reads
    #
    ############################################################################################

    merged_fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df()
    pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
    tempdir = PathManager.instance().get_tempdir()

    merged_fasta_list = []
    results_list = []
    sample_info = {}

    # Make sure every file is analysed only once
    for i in range(merged_fastainfo_df.shape[0]):
        if merged_fastainfo_df.iloc[i].mergedfasta not in merged_fasta_list:
            merged_fasta_list.append(merged_fastainfo_df.iloc[i].mergedfasta)

    for mergedfasta in merged_fasta_list:

        inputFiles = FilesInputCutadapt(fastainfo, mergedfasta, no_reverse,
                                        tag_to_end)

        tagFile_path = inputFiles.tags_file()
        info = inputFiles.get_df_info()

        for key in info.keys():
            if key in sample_info.keys():
                sample_info[key] = sample_info[key] + info[key]
            else:
                sample_info[key] = info[key]

        Logger.instance().debug("Analysing FASTA file: {}".format(mergedfasta))

        in_raw_fasta_path = os.path.join(fastadir, mergedfasta)

        ########################################################################################
        #
        # cutadapt --cores=0 -e 0 --no-indels --trimmed-only -g tagFile:$tagfile
        # --overlap length -o "tagtrimmed.{name}.fasta" in_raw_fasta_path
        #
        ########################################################################################

        base = os.path.basename(in_raw_fasta_path)
        base, base_suffix = base.split('.', 1)

        out_fasta_path = os.path.join(tempdir, "sorted")

        cmd_cutadapt_tag_dic = {
            'in_fasta_path': in_raw_fasta_path,
            'out_fasta': out_fasta_path,
            'num_threads': num_threads,
            'tagFile': tagFile_path,
            'base_suffix': base_suffix,
        }

        cmd_cutadapt_tag_str = \
            'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
            '-g file:{tagFile} --output {out_fasta}_{{name}}.{base_suffix} ' \
            '{in_fasta_path}'.format(**cmd_cutadapt_tag_dic)

        Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

        if sys.platform.startswith("win"):
            args = cmd_cutadapt_tag_str
        else:
            args = shlex.split(cmd_cutadapt_tag_str)

        run_result = subprocess.run(args=args, stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT)
        Logger.instance().info(run_result.stdout.decode())

        inputFiles.remove_tags_file()

        ########################################################################################
        #
        # Trim primers from output
        # cutadapt --quiet --cores=0 -e trim_error --no-indels --trimmed-only
        # --minimum-length minimum_length --maximum-length maximum_length
        # --output input_path + {name} + suffix outputfile
        #
        ########################################################################################

        primers = inputFiles.primers()
        try:
            tags_samples = inputFiles.get_sample_names()
        except Exception as e:
            Logger.instance().error(e)
            return

        for primer in primers:
            marker, primerfwd, primerrev, lenprimerfwd, lenprimerrev = primer

            for tag_sample in tags_samples:
                name, run, marker2, sample, replicate, _, _ = tag_sample

                if marker not in marker2:
                    continue

                in_fasta_path = out_fasta_path + "_" + name + "." + base_suffix
                baseMerge = mergedfasta.split(".")[0]

                outname = run + "_" + marker + "_" + sample + "_" \
                    + replicate + "_" + baseMerge + "_trimmed"
                if name.endswith("_reversed"):
                    outname = outname + "_reversed"
                out_fasta_path_new = os.path.join(tempdir,
                                                  outname + "." + base_suffix)
                results_list.append(out_fasta_path_new)

                if "_reversed" not in name:
                    if generic_dna:  # Biopython <1.78
                        primerRev = str(
                            Seq(primerrev, generic_dna).reverse_complement())
                    else:  # Biopython >=1.78
                        primerRev = str(Seq(primerrev).reverse_complement())
                    primerFwd = primerfwd
                    lenPrimerFwd = lenprimerfwd
                    lenPrimerRev = lenprimerrev
                else:
                    if generic_dna:  # Biopython <1.78
                        primerRev = str(
                            Seq(primerfwd, generic_dna).reverse_complement())
                    else:  # Biopython >=1.78
                        primerRev = str(Seq(primerfwd).reverse_complement())
                    primerFwd = primerrev
                    lenPrimerFwd = lenprimerrev
                    lenPrimerRev = lenprimerfwd

                cmd_cutadapt_primer_dic = {
                    'in_fasta_path': in_fasta_path,
                    'out_fasta': out_fasta_path_new,
                    'error_rate': cutadapt_error_rate,
                    'num_threads': num_threads,
                    'primerFwd': primerFwd,
                    'primerRev': primerRev,
                    'lenPrimerFwd': lenPrimerFwd,
                    'lenPrimerRev': lenPrimerRev,
                    'read_min_length': cutadapt_minimum_length,
                    'read_max_length': cutadapt_maximum_length,
                }

                if not primer_to_end:  # Anchor the primers at the read ends
                    cmd_cutadapt_primer_str = \
                        'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                        '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                        '--trimmed-only -g "^{primerFwd}...{primerRev}$" ' \
                        '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)
                else:
                    cmd_cutadapt_primer_str = \
                        'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                        '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                        '--trimmed-only -g "{primerFwd};min_overlap={lenPrimerFwd}...' \
                        '{primerRev};min_overlap={lenPrimerRev}" ' \
                        '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)

                Logger.instance().debug(
                    "Running: {}".format(cmd_cutadapt_primer_str))

                if sys.platform.startswith("win"):
                    args = cmd_cutadapt_primer_str
                else:
                    args = shlex.split(cmd_cutadapt_primer_str)

                run_result = subprocess.run(args=args, stdout=subprocess.PIPE,
                                            stderr=subprocess.STDOUT)
                Logger.instance().info(run_result.stdout.decode())

    ###################################################################
    #
    # Reverse complement back rc fasta and pool
    #
    ###################################################################

    for file in results_list:
        if "_trimmed" in file:
            out_final_fasta_path = os.path.join(sorteddir,
                                                os.path.split(file)[-1])
            in_fasta_path = os.path.join(tempdir, file)

            if out_final_fasta_path.endswith(".gz"):
                _open = partial(gzip.open)
            elif out_final_fasta_path.endswith(".bz2"):
                _open = partial(bz2.open)
            else:
                _open = open

            if in_fasta_path.endswith(".gz"):
                _open2 = partial(gzip.open)
            elif in_fasta_path.endswith(".bz2"):
                _open2 = partial(bz2.open)
            else:
                _open2 = open

            if "_reversed" in file:
                Logger.instance().debug("Pooling fwd and rc reads...")
                out_final_fasta_path = out_final_fasta_path.replace(
                    "_reversed", "")
                with _open(out_final_fasta_path, 'at') as fout:
                    with _open2(in_fasta_path, 'rt') as fin:
                        for line in fin:
                            if not line.startswith('>'):
                                if generic_dna:  # Biopython <1.78
                                    fout.write("%s\n" % str(
                                        Seq(line.strip(), generic_dna).reverse_complement()))
                                else:  # Biopython >=1.78
                                    fout.write("%s\n" % str(
                                        Seq(line.strip()).reverse_complement()))
                            else:
                                fout.write(line)
            else:
                with _open(out_final_fasta_path, 'at') as fout:
                    with _open2(in_fasta_path, 'rt') as fin:
                        for line in fin:
                            fout.write(line)

    results_list = [os.path.split(result)[-1] for result in results_list
                    if "_reversed" not in result]

    del sample_info['mergedfasta']
    del sample_info['primerrev']
    del sample_info['primerfwd']
    del sample_info['tagrev']
    del sample_info['tagfwd']
    sample_info['sortedfasta'] = results_list

    sample_info_df = pandas.DataFrame(sample_info)

    fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
    sample_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t", header=True,
                          index=False)
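# Sketch of the suffix-based opener selection used above, factored into one
# helper: plain, gzip and bz2 FASTA files then share a single read/write path.
def _example_opener(path):
    import bz2
    import gzip
    from functools import partial
    if path.endswith('.gz'):
        return partial(gzip.open)
    if path.endswith('.bz2'):
        return partial(bz2.open)
    return open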
def main(cls, db, mode, asvtable_tsv, output, taxonomy_tsv, blastdb_dir_path, blastdbname_str, num_threads=multiprocessing.cpu_count(), params=None): """ Parameters ---------- db: str Path to SQLITE database with Variant and Taxassign tables mode asvtable_tsv output taxonomy_tsv blastdb_dir_path blastdbname_str num_threads params Returns ------- """ this_temp_dir = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__)) pathlib.Path(this_temp_dir).mkdir(exist_ok=True) ####################################################################### # # Parameters # ####################################################################### # params_dic = constants.get_params_default_dic() # params_dic = FileParams(params).get_params_dic() # ltg_rule_threshold = params_dic['ltg_rule_threshold'] # include_prop = params_dic['include_prop'] # min_number_of_taxa = params_dic['min_number_of_taxa'] # qcov_hsp_perc = params_dic['qcov_hsp_perc'] ####################################################################### # # Load db and tables as classes and delete taxassign in reset mode # ####################################################################### engine = sqlalchemy.create_engine('sqlite:///{}'.format(db), echo=False) variant_declarative_table = Variant.__table__ variant_declarative_table.create(bind=engine, checkfirst=True) tax_assign_declarative_table = TaxAssign.__table__ tax_assign_declarative_table.create(bind=engine, checkfirst=True) if mode == 'reset': with engine.connect() as conn: conn.execute(tax_assign_declarative_table.delete()) ####################################################################### # # Use variants that are not already already assigned in TaxAssign # ####################################################################### variant_input_df = pandas.read_csv(asvtable_tsv, sep="\t", header=0) # get list of variant sequences variant_sequence_list = variant_input_df.sequence.tolist() # Add variant to DB if not already there for variant_sequence in variant_sequence_list: with engine.connect() as conn: row_variant = conn.execute( sqlalchemy.select([ variant_declarative_table.c.id ]).where(variant_declarative_table.c.sequence == variant_sequence)).first() if row_variant is None: # variant_sequence IS NOT in the database, so INSERT it conn.execute(variant_declarative_table.insert().values( sequence=variant_sequence)) ####################################################################### # # Get already tax-assigned variants with all informations including sequence # ####################################################################### stmt_variant_tax_assign = sqlalchemy.select([ tax_assign_declarative_table.c.variant_id, tax_assign_declarative_table.c.identity, tax_assign_declarative_table.c.ltg_rank, tax_assign_declarative_table.c.ltg_tax_id, tax_assign_declarative_table.c.ltg_tax_name, tax_assign_declarative_table.c.blast_db, variant_declarative_table.c.sequence, ])\ .where(tax_assign_declarative_table.c.ltg_tax_id.isnot(None))\ .where(tax_assign_declarative_table.c.variant_id == variant_declarative_table.c.id)\ .where(variant_declarative_table.c.sequence.in_(variant_sequence_list))\ .distinct() # These are the variants that are already in taxassign and do not need # recalculate ltg_from_db_list = [] with engine.connect() as conn: for row in conn.execute(stmt_variant_tax_assign).fetchall(): ltg_from_db_list.append(dict(zip(row.keys(), row.values()))) """(Pdb) pandas.DataFrame.from_records(ltg_from_db_list) identity ltg_rank ltg_tax_id ltg_tax_name 
sequence variant_id 0 100 species 2028017 Orthocladiinae sp. BAP34 AGCATGATCTGGAATAGTAGGTACTTCCCTTAGTATCTTAATTCGA... 325 1 99 species 2028029 Rheocricotopus sp. DH90 GGCTTGATCCGGAATAGTAGGAACTTCTTTAAGAATTCTAATTCGA... 1203 2 100 species 1592914 Caenis pusilla GGCTTGATCCGGAATGCTGGGCACCTCTCTAAGCCTTCTAATTCGT... 1443 3 100 species 2028029 Rheocricotopus sp. DH90 TGCTTGATCAGGAATAGTAGGAACTTCTTTAAGAATTCTAATTCGA... 2298 4 90 family 7149 Chironomidae TGCTTGATCAGGGATAGTGGGAACTTCTTTAAGAATTCTTATTCGA... 2498 5 100 species 189839 Baetis rhodani TGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGA... 2610""" ltg_db_df = pandas.DataFrame.from_records(ltg_from_db_list) ltg_db_df = ltg_db_df.reindex(sorted(ltg_db_df.columns), axis=1) # sort columns ####################################################################### # # Get list of variants (id and sequence) that need blast for tax assignment # ####################################################################### stmt_variant = sqlalchemy.select([variant_declarative_table.c.id, variant_declarative_table.c.sequence]) \ .where(variant_declarative_table.c.sequence.in_(variant_sequence_list)) if ltg_db_df.shape[0] > 0: stmt_variant = stmt_variant.where( variant_declarative_table.c.id.notin_( ltg_db_df.variant_id.tolist())) stmt_variant = stmt_variant.distinct().order_by("id") variant_not_tax_assigned = [] with engine.connect() as conn: for row in conn.execute(stmt_variant).fetchall(): variant_not_tax_assigned.append( dict(zip(row.keys(), row.values()))) ####################################################################### # # Run RunnerTaxAssign for variant_not_tax_assigned # ####################################################################### blast_variant_df = pandas.DataFrame() ltg_blast_df = pandas.DataFrame() if len(variant_not_tax_assigned) > 0: # Run blast for variants that need tax assignment blast_variant_df = pandas.DataFrame.from_records( variant_not_tax_assigned, index='id') taxonomy = Taxonomy(tsv=taxonomy_tsv) sequence_list = blast_variant_df.sequence.tolist() tax_assign_runner = RunnerTaxAssign(sequence_list=sequence_list, taxonomy=taxonomy, blast_db_dir=blastdb_dir_path, blast_db_name=blastdbname_str, num_threads=num_threads, params=params) ltg_blast_df = tax_assign_runner.ltg_df ###################################################### # Uncomment to debug because blast is slow # pandas.to_pickle(ltg_df, "ltg_df.pkl") # ltg_df = pandas.read_pickle("ltg_df.pkl") ###################################################### ltg_blast_df.rename({'variant_id': 'sequence'}, inplace=True, axis=1) ltg_blast_df = blast_variant_df.merge(ltg_blast_df, on='sequence', how='outer') ltg_blast_df['blast_db'] = blastdbname_str ltg_blast_df = ltg_blast_df.reindex(sorted(ltg_blast_df.columns), axis=1) # sort columns del blast_variant_df ####################################################################### # # Concatenate tax-assigned variants from DB and from Blast # Merge variant_df and ltg_df and write to DB # ####################################################################### if ltg_db_df.shape[0] > 0 and ltg_blast_df.shape[0] > 0: ltg_df = pandas.concat([ ltg_db_df[[ "blast_db", "identity", "ltg_rank", "ltg_tax_id", "ltg_tax_name", "sequence" ]], ltg_blast_df ], axis=0) elif ltg_db_df.shape[0] > 0: ltg_df = ltg_db_df.copy() elif ltg_blast_df.shape[0] > 0: ltg_df = ltg_blast_df.copy() else: # No assignment from the DB nor from blast ltg_df = pandas.DataFrame(columns=[ "blast_db", "identity", "ltg_rank", "ltg_tax_id", "ltg_tax_name", "sequence" ]) del ltg_blast_df ####################################################################### # # Insert or update variant and taxassign tables # 
####################################################################### Logger.instance().debug( "file: {}; line: {}; Insert variant_id, ltg_tax_id, ltg_rank to DB" .format(__file__, inspect.currentframe().f_lineno)) for ltg_row in ltg_df.itertuples(): variant_sequence = ltg_row.sequence with engine.connect() as conn: variant_id = conn.execute( sqlalchemy.select([ variant_declarative_table.c.id ]).where(variant_declarative_table.c.sequence == variant_sequence)).first()[0] select_row = conn.execute( sqlalchemy.select([ TaxAssign ]).where(tax_assign_declarative_table.c.variant_id == variant_id)).first() if select_row is None: # variant_id IS NOT in the database, so INSERT it ltg_row_dic = ltg_row._asdict() ltg_row_dic['variant_id'] = variant_id conn.execute(tax_assign_declarative_table.insert(), dict(ltg_row_dic)) else: # variant_sequence IS in the database, so UPDATE the row ltg_row_dic = ltg_row._asdict() update_dic = {k: ltg_row_dic[k] for k in ['identity', 'ltg_rank', 'ltg_tax_id', 'ltg_tax_name', 'blast_db'] if k in ltg_row_dic} conn.execute( tax_assign_declarative_table.update().where( tax_assign_declarative_table.c.variant_id == variant_id).values(**update_dic)) ####################################################################### # # Update LTGs for variant output file # ####################################################################### Logger.instance().debug( "file: {}; line: {}; Update LTGs for variant output file".format( __file__, inspect.currentframe().f_lineno)) variant_output_df = variant_input_df.copy() del variant_input_df # Add LTG columns to variant_output_df if they do not exist for ltg_df_col in [ 'ltg_tax_id', 'ltg_tax_name', 'ltg_rank', 'identity', 'blast_db' ]: if ltg_df_col not in variant_output_df.columns: variant_output_df[ltg_df_col] = None # Move sequence column to end variant_df_columns = variant_output_df.columns.tolist() variant_df_columns.append( variant_df_columns.pop(variant_df_columns.index('sequence'))) variant_output_df = variant_output_df[variant_df_columns] for variant_row in variant_output_df.itertuples(): # variant_id = variant_row.variant_id variant_sequence = variant_row.sequence with engine.connect() as conn: variant_id = conn.execute( sqlalchemy.select([ variant_declarative_table.c.id ]).where(variant_declarative_table.c.sequence == variant_sequence)).first()[0] select_row = conn.execute( sqlalchemy.select([ TaxAssign.ltg_tax_id, TaxAssign.ltg_tax_name, TaxAssign.ltg_rank, TaxAssign.identity, TaxAssign.blast_db, ]).where(tax_assign_declarative_table.c.variant_id == variant_id)).first() if select_row is None: # variant was never tax-assigned; keep None placeholders continue tax_assign_dict = dict( zip([ 'ltg_tax_id', 'ltg_tax_name', 'ltg_rank', 'identity', 'blast_db' ], select_row)) for k in tax_assign_dict: variant_output_df.loc[variant_output_df.sequence == variant_sequence, k] = tax_assign_dict[k] # Do not move: required because sometimes tax_id is None variant_output_df = variant_output_df.astype({'ltg_tax_id': 'object'}) ####################################################################### # # Update tax lineages for variant output file # ####################################################################### Logger.instance().debug( "file: {}; line: {}; Update tax lineages for variant output file". 
format(__file__, inspect.currentframe().f_lineno)) tax_id_list = variant_output_df.ltg_tax_id.unique().tolist() # unique list of tax ids tax_lineage = TaxLineage(taxonomic_tsv_path=taxonomy_tsv) tax_lineage_df = tax_lineage.create_lineage_from_tax_id_list( tax_id_list=tax_id_list, tax_name=True) # Merge variant_output_df = variant_output_df.merge(tax_lineage_df, left_on='ltg_tax_id', right_on='tax_id', how='left') variant_output_df.drop('tax_id', axis=1, inplace=True) Logger.instance().debug("file: {}; line: {}; Reorder columns".format( __file__, inspect.currentframe().f_lineno)) # Move sequence column to end variant_df_columns = variant_output_df.columns.tolist() variant_df_columns.append( variant_df_columns.pop(variant_df_columns.index('sequence'))) variant_output_df = variant_output_df[variant_df_columns] Logger.instance().debug("file: {}; line: {}; Write to TSV".format( __file__, inspect.currentframe().f_lineno)) variant_output_df.to_csv(output, sep='\t', index=False, header=True)
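#######################################################################
# A minimal, standalone sketch of the select-then-insert pattern used
# above for the Variant table. The connection and table objects are
# illustrative stand-ins; VTAM's real models live in the Variant and
# TaxAssign declarative classes.
#######################################################################
import sqlalchemy

def get_or_create_variant(conn, variant_table, sequence):
    """Return the id of 'sequence', inserting the row first if absent."""
    row = conn.execute(
        sqlalchemy.select([variant_table.c.id])
        .where(variant_table.c.sequence == sequence)).first()
    if row is not None:
        return row[0]
    result = conn.execute(variant_table.insert().values(sequence=sequence))
    return result.inserted_primary_key[0]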
def create_denovo_from_ncbi(self): new_taxdump_path = self.__download_ncbi_taxonomy_dump() # Logger.instance().debug( "file: {}; line: {}; Extracting NCBI taxonomy dump".format( __file__, inspect.currentframe().f_lineno, )) if not (os.path.isfile( os.path.join(os.path.dirname(new_taxdump_path), "nodes.dmp")) and os.path.isfile( os.path.join(os.path.dirname(new_taxdump_path), "names.dmp")) and os.path.isfile( os.path.join(os.path.dirname(new_taxdump_path), "merged.dmp"))): tar = tarfile.open(new_taxdump_path, "r:gz") tar.extractall(path=self.tempdir) tar.close() Logger.instance().debug( "file: {}; line: {}; Reading and processing NCBI taxonomy dump". format(__file__, inspect.currentframe().f_lineno)) # nodes_dmp = os.path.join(self.tempdir, "nodes.dmp") nodes_dmp_df = pandas.read_table( nodes_dmp, header=None, sep='\t', engine='python', usecols=[0, 2, 4], names=['tax_id', 'parent_tax_id', 'rank']) # names_dmp = os.path.join(self.tempdir, "names.dmp") names_dmp_df = pandas.read_table( names_dmp, header=None, sep='\t', engine='python', usecols=[0, 2, 6], names=['tax_id', 'name_txt', 'name_class']) names_dmp_df = names_dmp_df.loc[names_dmp_df.name_class == 'scientific name'] names_dmp_df = names_dmp_df[['tax_id', 'name_txt']] # taxonomy_df = nodes_dmp_df.merge(names_dmp_df, on='tax_id') # merged_dmp = os.path.join(self.tempdir, "merged.dmp") merged_dmp_df = pandas.read_table(merged_dmp, header=None, sep='\t', engine='python', usecols=[0, 2], names=['old_tax_id', 'tax_id']) # taxonomy_df = taxonomy_df.merge(merged_dmp_df, on='tax_id', how='left') # Logger.instance().debug("file: {}; line: {}; Write to TSV DB".format( __file__, inspect.currentframe().f_lineno)) try: taxonomy_df.to_csv(self.taxonomy_tsv_path, sep="\t", header=True, float_format='%.0f', index=False) except ValueError as valerr: Logger.instance().error( VTAMexception( "{}. Error during the creation of the taxonomy DB".format( valerr))) except OSError as opererr: Logger.instance().error( VTAMexception( "{}. Please, verify the output argument: {}".format( opererr, self.taxonomy_tsv_path)))
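#######################################################################
# Note on the *.dmp layout parsed above: NCBI taxdump rows separate
# fields with "\t|\t" and end with "\t|". Splitting on single tabs
# therefore leaves '|' tokens in the odd columns, which is why usecols
# picks the even indices 0, 2, 4 (nodes.dmp) and 0, 2, 6 (names.dmp).
# Two one-row illustrations with made-up values:
#
#   nodes.dmp:  "2028017\t|\t7149\t|\tspecies\t|..."  -> tax_id, parent_tax_id, rank
#   merged.dmp: "12\t|\t74109\t|"                     -> old_tax_id, tax_id
#######################################################################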
def __init__(self, sequence_list, taxonomy, blast_db_dir, blast_db_name, num_threads, params): """ Parameters ---------- sequence_list : list List of variant sequences to be assigned to taxa taxonomy : Taxonomy Taxonomy object wrapping the taxonomy TSV database blast_db_dir : str Directory containing the BLAST database blast_db_name : str Name of the BLAST database num_threads : int Number of threads passed to BLAST params : str or None Path to an optional params YAML file """ self.old_tax_id_df = taxonomy.old_tax_df self.taxonomy_df = taxonomy.df self.blast_db_dir = blast_db_dir self.this_temp_dir = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__)) pathlib.Path(self.this_temp_dir).mkdir(exist_ok=True) self.num_threads = num_threads ####################################################################### # # Parameters # ####################################################################### params_dic = FileParams(params).get_params_dic() qcov_hsp_perc = params_dic['qcov_hsp_perc'] ####################################################################### # # 2 Create FASTA file with Variants # ####################################################################### Logger.instance().debug( "file: {}; line: {}; Create FASTA file from variants".format( __file__, inspect.currentframe().f_lineno)) variant_fasta = os.path.join(self.this_temp_dir, 'variant.fasta') with open(variant_fasta, 'w') as fout: for seq in sequence_list: fout.write(">{}\n{}\n".format(seq, seq)) ####################################################################### # # 3 Run local blast # ####################################################################### runner_blast = RunnerBlast(variant_fasta, blast_db_dir, blast_db_name, num_threads, qcov_hsp_perc) # run blast blast_output_tsv = runner_blast.run_local_blast() # process blast results blast_output_df = RunnerBlast.process_blast_result(blast_output_tsv) ####################################################################### # # Compute tax lineages for Blast target tax ids # ####################################################################### Logger.instance().debug( "file: {}; line: {}; Open taxonomy.tsv DB".format( __file__, inspect.currentframe().f_lineno)) blast_output_df.target_tax_id = pandas.to_numeric( blast_output_df.target_tax_id) # Logger.instance().debug( "file: {}; line: {}; Annotate each target_tax_id with its lineage as columns in wide format" .format(__file__, inspect.currentframe().f_lineno)) tax_id_list = blast_output_df.target_tax_id.unique().tolist() tax_id_to_lineage_df = taxonomy.get_several_tax_id_lineages( tax_id_list) ####################################################################### # # Merge tax lineages and the blast result # ####################################################################### Logger.instance().debug( "file: {}; line: {}; Merge blast result including tax_id with their lineages" .format(__file__, inspect.currentframe().f_lineno)) # Merge local blast output with tax_id_to_lineage_df # variant_identity_lineage_df = blast_output_df.merge( # tax_id_to_lineage_df, left_on='target_tax_id', right_on='tax_id') variantid_identity_lineage_df = blast_output_df.merge( tax_id_to_lineage_df, left_on='target_tax_id', right_index=True) # variant_identity_lineage_df.drop('tax_id', axis=1, inplace=True) """(Pdb) variant_identity_lineage_df.columns Index(['variant_id', 'target_id', 'identity', 'evalue', 'coverage', 'target_tax_id', 'no rank', 'species', 'genus', 'family', 'order', 'class', 'subphylum', 'phylum', 'subkingdom', 'kingdom', 'superkingdom', 'superfamily', 'infraorder', 'suborder', 'infraclass', 'subclass', 'tribe', 'subfamily', 'cohort', 'subgenus', 'subspecies', 'parvorder', 'superorder', 'subcohort', 'superclass', 'species group', 'subtribe', 'section', 'varietas', 
'species subgroup'], dtype='object')""" ####################################################################### # # several_variants_to_ltg # This function returns a dataframe with the LTG rank and LTG tax_id for each variant # ####################################################################### Logger.instance().debug( "file: {}; line: {}; Main loop over variant and identity to " "compute the whole set of ltg_tax_id and ltg_rank for each variant_id " "to a dataframe".format(__file__, inspect.currentframe().f_lineno)) runner_ltg_selection = RunnerLTGselection( variant_identity_lineage_df=variantid_identity_lineage_df, taxonomy_df=self.taxonomy_df, params=params) self.ltg_df = runner_ltg_selection.several_variants_to_ltg()
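#######################################################################
# Toy illustration (made-up values) of the merge above:
# get_several_tax_id_lineages is assumed to return a frame indexed by
# tax_id, so blast rows join their 'target_tax_id' column against the
# index via right_index=True.
#######################################################################
import pandas

blast_toy_df = pandas.DataFrame({'variant_id': [1], 'target_tax_id': [7149]})
lineage_toy_df = pandas.DataFrame({'family': ['Chironomidae']}, index=[7149])
merged_toy_df = blast_toy_df.merge(
    lineage_toy_df, left_on='target_tax_id', right_index=True)
# merged_toy_df now carries the lineage columns next to each blast hit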
def mark_delete_lfn_per_Ni_or_Nik_or_Njk( self, lfn_denominator, cutoff, cutoff_specific_df=None, ): """ :param lfn_denominator: string taking one of the values 'N_i', 'N_ik' or 'N_jk' :param cutoff: float with the general cutoff :param cutoff_specific_df: DataFrame with either variant-specific (N_i) or variant-replicate-specific (N_ik) deletion cutoffs :return: None: The output of this filter is appended to 'self.variant_read_count_filter_delete_df' with 'filter_delete'=1 or 0 and filter_id=2, 3 or 6 for the general cutoff (N_i, N_ik or N_jk denominator) and filter_id=4 or 5 for the variant- or variant-replicate-specific cutoffs """ if cutoff_specific_df is not None: cutoff_specific_df.drop(['variant_sequence'], axis=1, inplace=True) if lfn_denominator == 'N_i': # variant this_filter_id = 2 N_df = self.variant_read_count_lfn_df.get_N_i_df() # Compute N_i_df filter_df = self.variant_read_count_df.merge( N_df, on=['run_id', 'marker_id', 'variant_id']) filter_df['filter_id'] = this_filter_id filter_df['cutoff'] = cutoff filter_cutoff_specific_df = None if cutoff_specific_df is not None: this_filter_id = 4 filter_cutoff_specific_df = filter_df.copy() filter_cutoff_specific_df.drop('cutoff', axis=1, inplace=True) filter_cutoff_specific_df = filter_cutoff_specific_df.merge( cutoff_specific_df, on=['run_id', 'marker_id', 'variant_id']) filter_cutoff_specific_df['filter_id'] = this_filter_id filter_df = pandas.concat([filter_df, filter_cutoff_specific_df], axis=0) filter_df['lfn_ratio'] = filter_df.read_count / filter_df.N_i elif lfn_denominator == 'N_ik': # variant_replicate this_filter_id = 3 N_df = self.variant_read_count_lfn_df.get_N_ik_df() # Compute N_ik_df filter_df = self.variant_read_count_df.merge( N_df, on=['run_id', 'marker_id', 'variant_id', 'replicate']) filter_df['filter_id'] = this_filter_id filter_df['cutoff'] = cutoff filter_cutoff_specific_df = None if cutoff_specific_df is not None: this_filter_id = 5 filter_cutoff_specific_df = filter_df.copy() filter_cutoff_specific_df.drop('cutoff', axis=1, inplace=True) filter_cutoff_specific_df = filter_cutoff_specific_df.merge( cutoff_specific_df, on=['run_id', 'marker_id', 'variant_id', 'replicate']) filter_cutoff_specific_df['filter_id'] = this_filter_id filter_df = pandas.concat([filter_df, filter_cutoff_specific_df], axis=0) filter_df['lfn_ratio'] = filter_df.read_count / filter_df.N_ik elif lfn_denominator == 'N_jk': # sample_replicate this_filter_id = 6 N_df = self.variant_read_count_lfn_df.get_N_jk_df() # Compute N_jk_df filter_df = self.variant_read_count_df.merge( N_df, on=['run_id', 'marker_id', 'sample_id', 'replicate']) filter_df['lfn_ratio'] = filter_df.read_count / filter_df.N_jk filter_df['filter_id'] = this_filter_id filter_df['cutoff'] = cutoff else: Logger.instance().critical( VTAMexception("Internal error. 
VTAM will exit.")) sys.exit(1) # Initialize filter: keep everything filter_df['filter_delete'] = False # Mark for deletion all variants with read_count=0 filter_df.loc[filter_df.read_count == 0, 'filter_delete'] = True # Mark for deletion all rows with lfn_ratio <= cutoff filter_df.loc[filter_df['lfn_ratio'] <= filter_df['cutoff'], 'filter_delete'] = True # Keep important columns filter_df = filter_df[[ 'run_id', 'marker_id', 'sample_id', 'replicate', 'variant_id', 'read_count', 'filter_id', 'filter_delete' ]] # Concatenate the output filter_df vertically # to self.variant_read_count_filter_delete_df self.variant_read_count_filter_delete_df = pandas.concat( [self.variant_read_count_filter_delete_df, filter_df], sort=False, axis=0)
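#######################################################################
# Worked toy example of the marking logic above (made-up numbers):
# with a general cutoff of 0.001, a variant with 2 reads out of
# N_i = 10000 total reads for that variant has lfn_ratio = 0.0002
# <= 0.001 and is marked for deletion, while a variant with 500 reads
# (ratio 0.05) is kept.
#######################################################################
import pandas

toy_df = pandas.DataFrame({'read_count': [2, 500], 'N_i': [10000, 10000]})
toy_df['lfn_ratio'] = toy_df.read_count / toy_df.N_i
toy_df['filter_delete'] = toy_df['lfn_ratio'] <= 0.001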
def run(self): session = self.session engine = session._session().get_bind() ####################################################################### # # Wrapper inputs, outputs and parameters # ####################################################################### # # Input files fasta_info_tsv = self.input_file( FilterMinReplicateNumber.__input_file_sortedinfo) # # Input tables input_filter_lfn_model = self.input_table( FilterMinReplicateNumber.__input_table_variant_filter_lfn) # # Options min_replicate_number = self.option("min_replicate_number") # input_filter_lfn = self.option("input_filter_lfn") # # Output tables output_filter_min_replicate_model = self.output_table( FilterMinReplicateNumber.__output_table_filter_min_replicate_number) ####################################################################### # # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model # 3. Get nijk_df input # ####################################################################### sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv) sample_info_tsv_obj.delete_from_db( engine=engine, variant_read_count_like_model=output_filter_min_replicate_model) filter_id = None if input_filter_lfn_model.__tablename__ == "FilterLFN": filter_id = 8 # Variant passed all LFN filters variant_read_count_df = sample_info_tsv_obj.get_nijk_df( variant_read_count_like_model=input_filter_lfn_model, engine=engine, filter_id=filter_id) ####################################################################### # # 4. Run Filter # ####################################################################### variant_read_count_delete_df = RunnerFilterMinReplicateNumber( variant_read_count_df).get_variant_read_count_delete_df(min_replicate_number) ####################################################################### # # 5. Write to DB # 6. Touch output tables, to update modification date # 7. Exit vtam if all variants are deleted # ####################################################################### DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql( engine=engine, variant_read_count_like_model=output_filter_min_replicate_model) for output_table_i in self.specify_output_table(): declarative_meta_i = self.output_table(output_table_i) obj = session.query(declarative_meta_i).order_by( declarative_meta_i.id.desc()).first() if obj is not None: # the table may be empty session.query(declarative_meta_i).filter_by( id=obj.id).update({'id': obj.id}) session.commit() if variant_read_count_delete_df.filter_delete.sum() == variant_read_count_delete_df.shape[0]: Logger.instance().warning( VTAMexception( "This filter has deleted all the variants: {}. " "The analysis will stop here.".format( self.__class__.__name__))) sys.exit(0)
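#######################################################################
# Hedged sketch of the rule delegated to RunnerFilterMinReplicateNumber
# above: a variant is kept in a sample only if it occurs in at least
# min_replicate_number replicates of that sample. Column names follow
# the nijk_df convention used in the wrapper; the real runner may
# differ in detail.
#######################################################################
import pandas

def mark_min_replicate(nijk_df, min_replicate_number):
    keys = ['run_id', 'marker_id', 'sample_id', 'variant_id']
    counts = (nijk_df.groupby(keys).replicate.nunique()
              .rename('n_replicates').reset_index())
    out_df = nijk_df.merge(counts, on=keys)
    out_df['filter_delete'] = out_df.n_replicates < min_replicate_number
    return out_df.drop('n_replicates', axis=1)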
def main(fastqinfo, fastqdir, fastainfo, fastadir, params=None, num_threads=multiprocessing.cpu_count()): ############################################################################################ # # params.yml parameters # ############################################################################################ params_dic = FileParams(params).get_params_dic() ############################################################################################ # # Read fastq information into a dataframe # ############################################################################################ fastqinfo_df = FileSampleInformation(fastqinfo).read_tsv_into_df() pathlib.Path( os.path.dirname(fastainfo)).mkdir( parents=True, exist_ok=True) pathlib.Path(fastadir).mkdir(parents=True, exist_ok=True) fastainfo_df = pandas.DataFrame() ############################################################################################ # # Loop over fastq pairs to merge # ############################################################################################ # File with analysis stats data stats_df = pandas.DataFrame({'FastqFwd': [], 'FastqRev': [], 'NbReadsFwd': [], 'NbReadsRev': [], 'FastaMerged': [], 'NbMergedReads': []}) for fastqfwd, fastqrev in fastqinfo_df[[ 'fastqfwd', 'fastqrev']].drop_duplicates().values: fastq_info_df_i = fastqinfo_df.loc[(fastqinfo_df.fastqfwd == fastqfwd) & ( fastqinfo_df.fastqrev == fastqrev)] fastq_fw_abspath = os.path.join(fastqdir, fastqfwd) with open(fastq_fw_abspath, 'rb') as fin: fastq_fw_linecount = int(sum(1 for _ in fin) / 4) # 4 FASTQ lines per read fastq_rv_abspath = os.path.join(fastqdir, fastqrev) with open(fastq_rv_abspath, 'rb') as fin: fastq_rv_linecount = int(sum(1 for _ in fin) / 4) # 4 FASTQ lines per read Logger.instance().debug( "Analysing FASTQ files: {} and {}".format( fastqfwd, fastqrev)) try: pathlib.Path(fastq_fw_abspath).resolve(strict=True) except FileNotFoundError: Logger.instance().error( VTAMexception( "VTAMexception: This FASTQ file was not found: {}.".format(fastq_fw_abspath))) sys.exit(1) try: pathlib.Path(fastq_rv_abspath).resolve(strict=True) except FileNotFoundError: Logger.instance().error( VTAMexception( "VTAMexception: This FASTQ file was not found: {}.".format(fastq_rv_abspath))) sys.exit(1) fasta_merged_basename = os.path.basename( fastq_fw_abspath).replace('.fastq', '.fasta') out_fasta_path = os.path.join(fastadir, fasta_merged_basename) ######################################################################################## # # Run vsearch merge # ######################################################################################## vsearch_args_dic = {} vsearch_args_dic['fastq_ascii'] = params_dic['fastq_ascii'] vsearch_args_dic['fastq_maxee'] = params_dic['fastq_maxee'] vsearch_args_dic['fastq_maxmergelen'] = params_dic['fastq_maxmergelen'] vsearch_args_dic['fastq_maxns'] = params_dic['fastq_maxns'] vsearch_args_dic['fastq_minlen'] = params_dic['fastq_minlen'] vsearch_args_dic['fastq_minmergelen'] = params_dic['fastq_minmergelen'] vsearch_args_dic['fastq_minovlen'] = params_dic['fastq_minovlen'] vsearch_args_dic['fastq_truncqual'] = params_dic['fastq_truncqual'] vsearch_args_dic['fastq_mergepairs'] = fastq_fw_abspath vsearch_args_dic['reverse'] = fastq_rv_abspath vsearch_args_dic['fastaout'] = out_fasta_path vsearch_args_dic['threads'] = num_threads vsearch_cluster = RunnerVSearch(parameters=vsearch_args_dic) vsearch_cluster.run() fastq_info_df_i = fastq_info_df_i[['run', 'marker', 'sample', 'replicate', 'tagfwd', 'primerfwd', 'tagrev', 'primerrev']] 
fastq_info_df_i['mergedfasta'] = fasta_merged_basename fastainfo_df = pandas.concat( [fastainfo_df, fastq_info_df_i], axis=0) with open(out_fasta_path, 'rb') as fin: fasta_merged_linecount = sum(1 for line in fin if line.startswith(b'>')) # one '>' header per merged read ######################################################################################## # # Summary file # ######################################################################################## stats_df = pandas.concat([stats_df, pandas.DataFrame({ 'FastqFwd': [fastq_fw_abspath], 'FastqRev': [fastq_rv_abspath], 'NbReadsFwd': [fastq_fw_linecount], 'NbReadsRev': [fastq_rv_linecount], 'FastaMerged': [out_fasta_path], 'NbMergedReads': [fasta_merged_linecount]})]) for mergedfasta in fastainfo_df[['mergedfasta']].drop_duplicates().values: mergedfasta = mergedfasta[0] if mergedfasta.endswith('.bz2') or mergedfasta.endswith('.gz'): fasta_merged_abspath = os.path.join(fastadir, mergedfasta) mergedfasta_compressor = FileCompression(fasta_merged_abspath) if mergedfasta.endswith('.gz'): mergedfasta_c = mergedfasta_compressor.pigz_compression() if mergedfasta_c is None: mergedfasta_c = mergedfasta_compressor.gzip_compression() elif mergedfasta.endswith('.bz2'): mergedfasta_c = mergedfasta_compressor.bz2_compression() mergedfasta_compressor.delete_file() _, relPath = os.path.split(mergedfasta_c) fastainfo_df.loc[fastainfo_df['mergedfasta'] == mergedfasta, 'mergedfasta'] = relPath else: pass # uncompressed files keep their original 'mergedfasta' name fastainfo_df.to_csv(fastainfo, sep="\t", header=True, index=False)
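#######################################################################
# Standalone versions of the read-count helpers used above: a FASTQ
# record spans four lines, while a FASTA record is announced by one
# '>' header line. Both helpers assume uncompressed files; the names
# are illustrative.
#######################################################################
def count_fastq_reads(path):
    with open(path, 'rb') as fin:
        return sum(1 for _ in fin) // 4

def count_fasta_reads(path):
    with open(path, 'rb') as fin:
        return sum(1 for line in fin if line.startswith(b'>'))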
def run(self): session = self.session engine = session._session().get_bind() ####################################################################### # # Wrapper inputs, outputs and parameters # ####################################################################### # Input file fasta_info_tsv = self.input_file(FilterChimera.__input_file_sortedinfo) # # Input table models # Variant = self.input_table(FilterChimera.__input_table_Variant) input_filter_pcr_error_model = self.input_table( FilterChimera.__input_table_filter_pcr_error) # # Output table models output_filter_chimera_model = self.output_table( FilterChimera.__output_table_filter_chimera) output_filter_borderline_model = self.output_table( FilterChimera.__output_table_filter_chimera_borderline) # # Params uchime3_denovo_abskew = self.option("uchime3_denovo_abskew") ####################################################################### # # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model # 3. Get nijk_df input # ####################################################################### sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv) sample_info_tsv_obj.delete_from_db( engine=engine, variant_read_count_like_model=output_filter_chimera_model) sample_info_tsv_obj.delete_from_db( engine=engine, variant_read_count_like_model=output_filter_borderline_model) variant_read_count_df = sample_info_tsv_obj.get_nijk_df( variant_read_count_like_model=input_filter_pcr_error_model, engine=engine, filter_id=None) ####################################################################### # # 4. Run Filter # ####################################################################### variant_df = sample_info_tsv_obj.get_variant_df( variant_read_count_like_model=input_filter_pcr_error_model, engine=engine) filter_chimera_runner = RunnerFilterChimera( variant_read_count_df=variant_read_count_df) filter_output_chimera_df, filter_borderline_output_df = \ filter_chimera_runner.get_variant_read_count_delete_df( variant_df=variant_df, uchime3_denovo_abskew=uchime3_denovo_abskew) ####################################################################### # # 5. Write to DB # 6. Touch output tables, to update modification date # 7. Exit vtam if all variants are deleted # ####################################################################### DataframeVariantReadCountLike(filter_output_chimera_df).to_sql( engine=engine, variant_read_count_like_model=output_filter_chimera_model) DataframeVariantReadCountLike(filter_borderline_output_df).to_sql( engine=engine, variant_read_count_like_model=output_filter_borderline_model) for output_table_i in self.specify_output_table(): declarative_meta_i = self.output_table(output_table_i) obj = session.query(declarative_meta_i).order_by( declarative_meta_i.id.desc()).first() if obj is not None: # the table may be empty session.query(declarative_meta_i).filter_by(id=obj.id).update( {'id': obj.id}) session.commit() if filter_output_chimera_df.filter_delete.sum() == filter_output_chimera_df.shape[0]: Logger.instance().warning( VTAMexception("This filter has deleted all the variants: {}. " "The analysis will stop here.".format( self.__class__.__name__))) sys.exit(0)
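#######################################################################
# The 'touch output tables' loop above refreshes each table's
# modification date by rewriting the id of its last row. The same idea
# as a guarded helper (a sketch; the wrapper operates on its
# declarative output models):
#######################################################################
def touch_table(session, model):
    last = session.query(model).order_by(model.id.desc()).first()
    if last is not None:  # the table may be empty
        session.query(model).filter_by(id=last.id).update({'id': last.id})
        session.commit()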
def __init__(self, sys_argv): ############################################################################################ # # Parse arguments # ############################################################################################ self.sys_argv = sys_argv # AG: do not use abspath for the moment. Maybe later it can be used as # an option parser = ArgParser.get_main_arg_parser() self.args = parser.parse_args(sys_argv) arg_parser_dic = vars(self.args) ############################################################################################ # # If not specified, initialize params.yml # ############################################################################################ if 'params' in arg_parser_dic and arg_parser_dic['params'] is None: params_yml = os.path.join(PathManager.instance().get_configdir(), "params.yml") if not os.path.isfile(params_yml): pathlib.Path(params_yml).touch(exist_ok=False) arg_parser_dic['params'] = params_yml ############################################################################################ # # Parse log arguments # ############################################################################################ if 'log_verbosity' in arg_parser_dic: (LoggerArguments.instance()).update({'log_verbosity': arg_parser_dic['log_verbosity']}) os.environ['VTAM_LOG_VERBOSITY'] = str( arg_parser_dic['log_verbosity']) if 'log' in arg_parser_dic: (LoggerArguments.instance()).update({'log': arg_parser_dic['log']}) os.environ['VTAM_LOG_FILE'] = str(arg_parser_dic['log']) ####################################################################### # # Set arguments, logger # ####################################################################### # Some arguments will be passed through environment variables if 'threads' in arg_parser_dic: os.environ['VTAM_THREADS'] = str(arg_parser_dic['threads']) ############################################################################################ # # Subcommands: wopfile-dependent, filter, optimize # ############################################################################################ if arg_parser_dic['command'] in ['filter', 'optimize']: if arg_parser_dic['command'] in ['filter']: #################################################################################### # # Verify coherence of --lfn_variant_replicate and params arguments # #################################################################################### with open(arg_parser_dic['params']) as fin: # The SafeLoader handles the conversion from YAML # scalar values to the Python dictionary format params_dic = yaml.load(fin, Loader=yaml.SafeLoader) or {} if arg_parser_dic['lfn_variant_replicate']: if 'lfn_variant_cutoff' in params_dic: Logger.instance().error(VTAMexception( 'The parameter "lfn_variant_cutoff" in the parameter file "{}" is incompatible with' ' the --lfn_variant_replicate argument.'.format(arg_parser_dic['params']))) sys.exit(1) else: if 'lfn_variant_replicate_cutoff' in params_dic: Logger.instance().error(VTAMexception( 'The parameter "lfn_variant_replicate_cutoff" in the parameter file "{}" needs' ' the --lfn_variant_replicate argument.'.format(arg_parser_dic['params']))) sys.exit(1) #################################################################################### # # Verify coherence of --lfn_variant_replicate and cutoff_specific argument # #################################################################################### if arg_parser_dic['cutoff_specific'] is not None: # cutoff specific argument if 
arg_parser_dic['lfn_variant_replicate']: # lfn_variant_replicate mode # error if the cutoff_specific file is formatted for lfn_variant if not FileCutoffSpecific(arg_parser_dic['cutoff_specific']).is_compatible_lfn_variant_replicate(): Logger.instance().error('The --lfn_variant_replicate argument is incompatible with the cutoff_specific file {}.'.format( arg_parser_dic['cutoff_specific'])) sys.exit(1) else: # lfn_variant mode # error if the cutoff_specific file is formatted for lfn_variant_replicate if FileCutoffSpecific(arg_parser_dic['cutoff_specific']).is_compatible_lfn_variant_replicate(): Logger.instance().error('The cutoff_specific file {} requires the --lfn_variant_replicate argument.'.format( arg_parser_dic['cutoff_specific'])) sys.exit(1) ############################################################################################ # # If not specified, initialize cutoff_specific.tsv # ############################################################################################ if arg_parser_dic['cutoff_specific'] is None: cutoff_specific_tsv = os.path.join(PathManager.instance().get_configdir(), "cutoff_specific.tsv") if not os.path.isfile(cutoff_specific_tsv): pathlib.Path(cutoff_specific_tsv).touch(exist_ok=False) arg_parser_dic['cutoff_specific'] = cutoff_specific_tsv CommandFilterOptimize.main(arg_parser_dic=arg_parser_dic) ############################################################################################ # # Subcommand: example # ############################################################################################ elif arg_parser_dic['command'] == 'example': outdir = arg_parser_dic['outdir'] CommandExample.main(outdir=outdir) ############################################################################################ # # Subcommand: merge # ############################################################################################ elif arg_parser_dic['command'] == 'merge': fastqinfo = arg_parser_dic['fastqinfo'] fastqdir = arg_parser_dic['fastqdir'] fastainfo = arg_parser_dic['fastainfo'] fastadir = arg_parser_dic['fastadir'] num_threads = arg_parser_dic['threads'] params = arg_parser_dic['params'] CommandMerge.main(fastqinfo=fastqinfo, fastqdir=fastqdir, fastainfo=fastainfo, fastadir=fastadir, params=params, num_threads=num_threads) ############################################################################################ # # Subcommand: sortreads # ############################################################################################ elif arg_parser_dic['command'] == 'sortreads': fastadir = arg_parser_dic['fastadir'] fastainfo = arg_parser_dic['fastainfo'] sorteddir = arg_parser_dic['sorteddir'] num_threads = arg_parser_dic['threads'] params = arg_parser_dic['params'] CommandSortReads.main(fastainfo=fastainfo, fastadir=fastadir, params=params, num_threads=num_threads, sorteddir=sorteddir) ############################################################################################ # # Subcommand: taxassign # ############################################################################################ elif arg_parser_dic['command'] == 'taxassign': db = arg_parser_dic['db'] asvtable_tsv = arg_parser_dic['asvtable'] output = arg_parser_dic['output'] mode = arg_parser_dic['mode'] taxonomy_tsv = arg_parser_dic['taxonomy'] blastdb_dir_path = arg_parser_dic['blastdbdir'] blastdbname_str = arg_parser_dic['blastdbname'] num_threads = arg_parser_dic['threads'] params = arg_parser_dic['params'] CommandTaxAssign.main(db=db, mode=mode, asvtable_tsv=asvtable_tsv, output=output, taxonomy_tsv=taxonomy_tsv, blastdb_dir_path=blastdb_dir_path, 
blastdbname_str=blastdbname_str, params=params, num_threads=num_threads) ############################################################################################ # # Subcommand: pool # ############################################################################################ elif arg_parser_dic['command'] == 'pool': db = arg_parser_dic['db'] readcounts = arg_parser_dic['readcounts'] run_marker_tsv = arg_parser_dic['runmarker'] pooled_marker_tsv = arg_parser_dic['asvtable'] params = arg_parser_dic['params'] CommandPoolRunMarkers.main(db=db, pooled_marker_tsv=pooled_marker_tsv, run_marker_tsv=run_marker_tsv, params=params, readcounts=readcounts) ############################################################################################ # # Subcommand: taxonomy # ############################################################################################ elif arg_parser_dic['command'] == 'taxonomy': taxonomy_tsv = arg_parser_dic['output'] precomputed = arg_parser_dic['precomputed'] taxonomy = CommandTaxonomy(taxonomy_tsv=taxonomy_tsv) taxonomy.main(precomputed=precomputed) ############################################################################################ # # Subcommand: coi_blast_db # ############################################################################################ elif arg_parser_dic['command'] == 'coi_blast_db': blastdbdir = arg_parser_dic['blastdbdir'] blastdbname = arg_parser_dic['blastdbname'] coi_blast_db = CommandBlastCOI(blastdbname=blastdbname) coi_blast_db.download(blastdbdir=blastdbdir) ############################################################################################ # # Else: print usage message # ############################################################################################ else: self.args = parser.parse_args(['--help']) # if command unknown print help
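############################################################################################
# Illustrative command lines matching the dispatch above. Argument names are taken from
# arg_parser_dic; the exact flag spellings are defined in ArgParser and may differ from
# this sketch.
#
#   vtam merge --fastqinfo fastqinfo.tsv --fastqdir fastq --fastainfo fastainfo.tsv \
#              --fastadir merged
#   vtam taxassign --db db.sqlite --asvtable asvtable.tsv --output asvtable_taxa.tsv \
#                  --taxonomy taxonomy.tsv --blastdbdir blastdb --blastdbname coi_blast_db
############################################################################################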