def __call__(self):
    """
    Launch the complete analysis pipeline:
    * Reference importation/parsing
    * Optional reference masking to remove homologies between reference sequences
    * Optional fastq quality filtering / adapter trimming
    * Optional reference indexing for bwa from merged references
    * Short read alignment with bwa mem
    * Splitting of the sam file to attribute reads to each original reference (or unmapped)
    * Output per reference bam, sam, bedgraph, bed, covgraph, variant call
    * Output distribution table and graph

    Results are written below self.outdir; the elapsed time is printed at the end.
    """
    stime = time()
    self.outdir = mkdir(path.abspath(self.outdir))

    print("\n##### PARSE REFERENCES #####\n")
    # Create CV_Reference.Reference object for each reference easily accessible through
    # Reference class methods
    if self.ref_masking or not self.bwa_index:
        # References will be rewritten and/or indexed, so working directories are needed
        self.ref_dir = mkdir(path.join(self.outdir, "references/"))
        self.index_dir = mkdir(path.join(self.outdir, "bwa_index/"))
        self._extract_ref(expand=True)
    else:
        self.ref_dir = ""
        self.index_dir = ""
        self._extract_ref(expand=False)

    # Reference Masking
    if self.ref_masking:
        print("\n##### REFERENCE HOMOLOGIES MASKING #####\n")
        self.db_dir = mkdir(path.join(self.outdir, "blast_db/"))
        self._iterative_masker()
        # Erase the existing index value if ref masking was performed. This has to be
        # set on the instance: Mem.align below reads self.bwa_index, and a plain local
        # assignment would leave the user-provided index in use.
        self.bwa_index = None

    # Fastq Filtering
    if self.quality_filtering or self.adapter_trimming:
        print("\n##### FASTQ FILTERING #####\n")
        self.fastq_dir = mkdir(path.join(self.outdir, "fastq/"))
        self.R1, self.R2 = self._fastq_filter()

    # BWA alignment
    print("\n##### READ REFERENCES AND ALIGN WITH BWA #####\n")
    # An index will be generated if no index was provided
    self.result_dir = mkdir(path.join(self.outdir, "results/"))
    self.sam = Mem.align(
        self.R1,
        self.R2,
        index=self.bwa_index,
        ref=Reference.allFasta(),
        align_opt=self.bwa_mem_opt,
        index_opt=self.bwa_index_opt,
        aligner=self.bwa_aligner,
        align_threads=self.bwa_threads,
        indexer=self.bwa_indexer,
        align_outdir=self.result_dir,
        index_outdir=self.index_dir,
        align_outname=self.outprefix + ".sam",
        index_outname=self.outprefix + ".idx")

    print("\n##### FILTER ALIGNED READS AND ASSIGN A REFERENCE #####\n")
    # Split the output sam file according to each reference
    self._sam_spliter()

    print("\n##### GENERATE OUTPUT FOR EACH REFERENCE #####\n")
    # Deal with garbage read dictionary
    self._garbage_output()
    # Ask references to generate the output they were configured to
    Reference.mk_output_global(self.result_dir + self.outprefix)
    # Create a distribution table
    self._distribution_output()
    self._make_report()

    print("\n##### DONE #####\n")
    print("Total execution time = {}s".format(round(time() - stime, 2)))
def align(R1, R2='', index='', ref='', aligner="bwa mem", align_opt="", align_threads=1,
          align_outdir="./bwa_align/", align_outname="out.sam", indexer="bwa index",
          index_opt="", index_outdir="./bwa_index/", index_outname="out"):
    """
    Main function of the package allowing to validate an existing index or to create a new
    one, then perform an alignment of single or paired fastq sequences against the index.
    Finally a sam file is returned for further analysis. If a valid existing index was given
    all index options and ref are not required.
    @param R1 Path to the file containing fastq sequences (can be gzipped)
    @param R2 Optional path to the file containing paired fastq sequences (can be gzipped)
    @param index Index files basename if available
    @param ref Path of the fasta file containing the reference sequence (can be gzipped)
    This parameter can also be a list of fasta files (gzipped or not), in this case all
    references will be merged into a single fasta reference
    @param aligner Path to the bwa mem executable. Not required if bwa is added to your path
    @param align_opt Bwa mem command line options as a string
    @param align_threads Thread count forwarded to the Aligner object
    @param align_outdir Directory where to store the sam file
    @param align_outname Name of the output sam file
    @param indexer Path to the bwa index executable. Not required if bwa is added to your path
    @param index_opt Bwa index command line options as a string
    @param index_outdir Directory where to store the index files
    @param index_outname Basename of the index file
    @return Path of the output sam file
    @exception Exception Raised if no usable index is available and no ref was provided
    """
    idx = None

    # Try to import an existing index
    if index:
        print("Existing index provided")
        try:
            idx = ExistingIndex(index)
        except Exception as E:
            # Deliberate best effort: an unusable index is reported, then a new one is built
            print(E)
    else:
        print("No index provided")

    # No index or invalid existing index = create a new one. This is done outside of any
    # except handler so that errors raised here are not chained to the validation error.
    if idx is None:
        # Verify the presence of the reference fasta file
        if not ref:
            raise Exception("Invalid or no fasta file provided. Cannot create an index")

        print("Generating index...")
        mkdir(index_outdir)
        index_path = path.join(index_outdir, index_outname)
        idx = NewIndex(ref, index_path, index_opt, indexer)

    # Create an Aligner object
    mem = Aligner(idx, align_opt, aligner, align_threads)

    # Align the reference index with R1 fastq (and R2)
    mkdir(align_outdir)
    align_path = path.join(align_outdir, align_outname)
    return mem.align(R1, R2, align_path)
def __call__(self):
    """
    Launch the complete analysis pipeline:
    * Reference importation/parsing
    * Optional reference masking to remove homologies between reference sequences
    * Optional fastq quality filtering / adapter trimming
    * Optional reference indexing for bwa from merged references
    * Short read alignment with bwa mem
    * Splitting of the sam file to attribute reads to each original reference (or unmapped)
    * Output per reference bam, sam, bedgraph, bed, covgraph, variant call
    * Output distribution table and graph

    Results are written below self.outdir; the elapsed time is printed at the end.
    """
    stime = time()
    self.outdir = mkdir(path.abspath(self.outdir))

    print("\n##### PARSE REFERENCES #####\n")
    # Create CV_Reference.Reference object for each reference easily accessible through
    # Reference class methods
    needs_workdirs = self.ref_masking or not self.bwa_index
    if needs_workdirs:
        # References will be rewritten and/or indexed, so working directories are needed
        self.ref_dir = mkdir(path.join(self.outdir, "references/"))
        self.index_dir = mkdir(path.join(self.outdir, "bwa_index/"))
        self._extract_ref(expand=True)
    else:
        self.ref_dir = ""
        self.index_dir = ""
        self._extract_ref(expand=False)

    # Reference Masking
    if self.ref_masking:
        print("\n##### REFERENCE HOMOLOGIES MASKING #####\n")
        self.db_dir = mkdir(path.join(self.outdir, "blast_db/"))
        self._iterative_masker()
        # Erase the existing index value if ref masking was performed. The attribute
        # (not a local variable) must be reset because Mem.align below is given
        # self.bwa_index; otherwise the pre-masking index would still be used.
        self.bwa_index = None

    # Fastq Filtering
    if self.quality_filtering or self.adapter_trimming:
        print("\n##### FASTQ FILTERING #####\n")
        self.fastq_dir = mkdir(path.join(self.outdir, "fastq/"))
        self.R1, self.R2 = self._fastq_filter()

    # BWA alignment
    print("\n##### READ REFERENCES AND ALIGN WITH BWA #####\n")
    # An index will be generated if no index was provided
    self.result_dir = mkdir(path.join(self.outdir, "results/"))
    self.sam = Mem.align(
        self.R1,
        self.R2,
        index=self.bwa_index,
        ref=Reference.allFasta(),
        align_opt=self.bwa_mem_opt,
        index_opt=self.bwa_index_opt,
        aligner=self.bwa_aligner,
        align_threads=self.bwa_threads,
        indexer=self.bwa_indexer,
        align_outdir=self.result_dir,
        index_outdir=self.index_dir,
        align_outname=self.outprefix + ".sam",
        index_outname=self.outprefix + ".idx")

    print("\n##### FILTER ALIGNED READS AND ASSIGN A REFERENCE #####\n")
    # Split the output sam file according to each reference
    self._sam_spliter()

    print("\n##### GENERATE OUTPUT FOR EACH REFERENCE #####\n")
    # Deal with garbage read dictionary
    self._garbage_output()
    # Ask references to generate the output they were configured to
    Reference.mk_output_global(self.result_dir + self.outprefix)
    # Create a distribution table
    self._distribution_output()
    self._make_report()

    print("\n##### DONE #####\n")
    print("Total execution time = {}s".format(round(time() - stime, 2)))
def align(query_list, subject_db=None, subject_fasta=None, aligner="blastn", align_opt="",
          num_threads=1, db_maker="makeblastdb", db_opt="", db_outdir="./blast_db/",
          db_outname="out"):
    """
    Main function of RefMasker that integrates database creation and blast
    * Instantiate Blast database and blastn object
    * Perform iterative blasts of query sequences against the subject database and create a
    list of hits.
    @param query_list List of paths indicating fasta files containing query sequences (can be
    gzipped). Fasta can contain multiple sequences.
    @param subject_db Basename of file from a blast database created by "makeblastdb" if
    available
    @param subject_fasta Reference fasta file. Required if no subject_db is given (can be
    gzipped)
    @param aligner Path to the blastn executable. Not required if blast+ is added to your path
    @param align_opt Blastn command line options as a string
    @param num_threads Thread count forwarded to the Aligner object
    @param db_maker Path to the makeblastdb executable. Not required if blast+ is added to
    your path
    @param db_opt makeblastdb command line options as a string
    @param db_outdir Directory where to store the database files
    @param db_outname Basename of the database files
    @return A list of hits gathered from every query (BlastHit objects per the original
    documentation)
    @exception Exception Raised if no usable database is available and subject_fasta is
    missing or is not an existing file
    """
    db = None

    # Try to import an existing database
    if subject_db:
        print("Existing database provided")
        try:
            db = ExistingDB(subject_db)
        except Exception as E:
            # Deliberate best effort: an unusable database is reported, then rebuilt
            print(E)
    else:
        print("No Blast database was provided")

    # No DB or invalid existing DB = create a new one. This is done outside of any except
    # handler so that errors raised here are not chained to the validation error.
    if db is None:
        # Verify the presence of the reference fasta file
        if not subject_fasta or not path.isfile(subject_fasta):
            raise Exception("Invalid or no fasta file provided. Cannot create a database")

        print("Generate a database...")
        mkdir(db_outdir)
        db_path = path.join(db_outdir, db_outname)
        # Create the new database
        db = NewDB(ref_path=subject_fasta, db_path=db_path, makeblastdb_opt=db_opt,
                   makeblastdb=db_maker)

    # Initialise a Blastn object
    blast = Aligner(db, align_opt, aligner, num_threads)

    # Gather the hits of every query file against the subject in a single flat list
    hit_list = []
    for query in query_list:
        hit_list.extend(blast.align(query))

    return hit_list
def mask(subject_fasta, hit_list, ref_outdir="./references/", ref_outname="masked_ref.fa",
         compress_ouput=True):
    """
    Import a reference fasta sequence, mask positions indicated by hits from a hit_list and
    write the modified fasta sequence in a new file.
    @param subject_fasta Fasta sequence of the subject to edit (can be gzipped)
    @param hit_list List of hit objects. Hits need at least 3 fields named s_id, s_start and
    s_end corresponding to the name of the sequence matched, and the hit start/end (0 based).
    @param ref_outdir Directory where the masked reference will be created
    @param ref_outname Name of the masked reference
    @param compress_ouput If true the output will be gzipped (".gz" is appended to the name)
    @return A path to the modified sequence if the hit list was valid, else subject_fasta
    unchanged.
    """
    # Test if the first object of hit_list has the required s_id, s_start and s_end fields
    try:
        first_hit = hit_list[0]
        for field in ("s_id", "s_start", "s_end"):
            getattr(first_hit, field)
    except IndexError:
        print("No hit found, The subject fasta file will not be edited")
        return subject_fasta
    except AttributeError:
        print("The list provided does not contain suitable hit object, The subject fasta file will not be edited")
        return subject_fasta

    # Initialize output folder
    mkdir(ref_outdir)

    # Group hits per subject sequence once, so each record only visits its own hits
    hits_by_id = {}
    for hit in hit_list:
        hits_by_id.setdefault(hit.s_id, []).append(hit)

    # Select the openers; text mode is requested explicitly because gzip.open defaults to
    # binary while fasta records are read and written as text
    open_in = gzip.open if subject_fasta[-2:].lower() == "gz" else open
    if compress_ouput:
        ref_path = path.join(ref_outdir, ref_outname + ".gz")
        open_out = gzip.open
    else:
        ref_path = path.join(ref_outdir, ref_outname)
        open_out = open

    print("Masking hit positions and writting a new reference for {} ".format(ref_outname))
    n_modified = 0
    start_time = time()

    # Both handles are closed even if parsing or masking fails midway
    with open_in(subject_fasta, "rt") as in_handle, open_out(ref_path, "wt") as out_handle:
        # Iterate over records in the subject fasta file
        for record in SeqIO.parse(in_handle, "fasta"):
            # Progress marker
            stdout.write("*")
            stdout.flush()

            record_hits = hits_by_id.get(record.id)
            if record_hits:
                n_modified += 1
                # Casting Seq type to MutableSeq type to allow in-place editing
                record.seq = record.seq.tomutable()
                for hit in record_hits:
                    # Replace every base between start and end coordinates by n
                    for position in range(hit.s_start, hit.s_end):
                        record.seq[position] = 'n'

            # Finally write the sequence, modified or not
            out_handle.write(record.format("fasta"))
    print("")

    # Report information (elapsed time rounded to 2 decimals)
    print("{} sequence(s) from {} modified in {}s".format(
        n_modified, ref_outname, round(time() - start_time, 2)))

    return ref_path
def mask(subject_fasta, hit_list, ref_outdir="./references/", ref_outname="masked_ref.fa",
         compress_ouput=True):
    """
    Import a reference fasta sequence, mask positions indicated by hits from a hit_list and
    write the modified fasta sequence in a new file.
    @param subject_fasta Fasta sequence of the subject to edit (can be gzipped)
    @param hit_list List of hit objects. Hits need at least 3 fields named s_id, s_start and
    s_end corresponding to the name of the sequence matched, and the hit start/end (0 based).
    @param ref_outdir Directory where the masked reference will be created
    @param ref_outname Name of the masked reference
    @param compress_ouput If true the output will be gzipped (".gz" is appended to the name)
    @return A path to the modified sequence if the hit list was valid, else subject_fasta
    unchanged.
    """
    # Test if the first object of hit_list has the required s_id, s_start and s_end fields
    try:
        probe = hit_list[0]
        for required in ("s_id", "s_start", "s_end"):
            getattr(probe, required)
    except IndexError:
        print("No hit found, The subject fasta file will not be edited")
        return subject_fasta
    except AttributeError:
        print("The list provided does not contain suitable hit object, The subject fasta file will not be edited")
        return subject_fasta

    # Initialize output folder
    mkdir(ref_outdir)

    # Index the hits by subject id once instead of rescanning the full list per record
    hits_by_id = {}
    for hit in hit_list:
        hits_by_id.setdefault(hit.s_id, []).append(hit)

    # Pick input/output openers. gzip.open is binary by default, hence the explicit
    # text modes used below since fasta content is handled as text.
    if subject_fasta[-2:].lower() == "gz":
        open_in = gzip.open
    else:
        open_in = open
    if compress_ouput:
        ref_path = path.join(ref_outdir, ref_outname + ".gz")
        open_out = gzip.open
    else:
        ref_path = path.join(ref_outdir, ref_outname)
        open_out = open

    print("Masking hit positions and writting a new reference for {} ".format(ref_outname))
    n_modified = 0
    start_time = time()

    # Context managers guarantee that both files are closed on error as well
    with open_in(subject_fasta, "rt") as in_handle, open_out(ref_path, "wt") as out_handle:
        for record in SeqIO.parse(in_handle, "fasta"):
            # Progress marker
            stdout.write("*")
            stdout.flush()

            record_hits = hits_by_id.get(record.id)
            if record_hits:
                n_modified += 1
                # Casting Seq type to MutableSeq type to allow in-place editing
                record.seq = record.seq.tomutable()
                for hit in record_hits:
                    # Replace every base between start and end coordinates by n
                    for position in range(hit.s_start, hit.s_end):
                        record.seq[position] = 'n'

            # Write the sequence whether it was modified or not
            out_handle.write(record.format("fasta"))
    print("")

    # Report information (elapsed time rounded to 2 decimals)
    elapsed = round(time() - start_time, 2)
    print("{} sequence(s) from {} modified in {}s".format(n_modified, ref_outname, elapsed))

    return ref_path
def align(query_list, subject_db=None, subject_fasta=None, aligner="blastn", align_opt="",
          num_threads=1, db_maker="makeblastdb", db_opt="", db_outdir="./blast_db/",
          db_outname="out"):
    """
    Main function of RefMasker that integrates database creation and blast
    * Instantiate Blast database and blastn object
    * Perform iterative blasts of query sequences against the subject database and create a
    list of hits.
    @param query_list List of paths indicating fasta files containing query sequences (can be
    gzipped). Fasta can contain multiple sequences.
    @param subject_db Basename of file from a blast database created by "makeblastdb" if
    available
    @param subject_fasta Reference fasta file. Required if no subject_db is given (can be
    gzipped)
    @param aligner Path to the blastn executable. Not required if blast+ is added to your path
    @param align_opt Blastn command line options as a string
    @param num_threads Thread count forwarded to the Aligner object
    @param db_maker Path to the makeblastdb executable. Not required if blast+ is added to
    your path
    @param db_opt makeblastdb command line options as a string
    @param db_outdir Directory where to store the database files
    @param db_outname Basename of the database files
    @return A list of hits gathered from every query (BlastHit objects per the original
    documentation)
    @exception Exception Raised if no usable database is available and subject_fasta is
    missing or is not an existing file
    """
    db = None

    # Try to import an existing database
    if not subject_db:
        print("No Blast database was provided")
    else:
        print("Existing database provided")
        try:
            db = ExistingDB(subject_db)
        except Exception as E:
            # Deliberate best effort: an unusable database is reported, then rebuilt
            print(E)

    # No DB or invalid existing DB = create a new one. Building it outside of the except
    # handler keeps later errors free of a misleading chained validation error.
    if db is None:
        # Verify the presence of the reference fasta file
        fasta_ok = bool(subject_fasta) and path.isfile(subject_fasta)
        if not fasta_ok:
            raise Exception("Invalid or no fasta file provided. Cannot create a database")

        print("Generate a database...")
        mkdir(db_outdir)
        db_path = path.join(db_outdir, db_outname)
        # Create the new database
        db = NewDB(ref_path=subject_fasta, db_path=db_path, makeblastdb_opt=db_opt,
                   makeblastdb=db_maker)

    # Initialise a Blastn object
    blast = Aligner(db, align_opt, aligner, num_threads)

    # Gather the hits of every query file against the subject in a single flat list
    hit_list = []
    for query in query_list:
        hit_list.extend(blast.align(query))

    return hit_list
def align(R1, R2='', index='', ref='', aligner="bwa mem", align_opt="", align_threads=1,
          align_outdir="./bwa_align/", align_outname="out.sam", indexer="bwa index",
          index_opt="", index_outdir="./bwa_index/", index_outname="out"):
    """
    Main function of the package allowing to validate an existing index or to create a new
    one, then perform an alignment of single or paired fastq sequences against the index.
    Finally a sam file is returned for further analysis. If a valid existing index was given
    all index options and ref are not required.
    @param R1 Path to the file containing fastq sequences (can be gzipped)
    @param R2 Optional path to the file containing paired fastq sequences (can be gzipped)
    @param index Index files basename if available
    @param ref Path of the fasta file containing the reference sequence (can be gzipped)
    This parameter can also be a list of fasta files (gzipped or not), in this case all
    references will be merged into a single fasta reference
    @param aligner Path to the bwa mem executable. Not required if bwa is added to your path
    @param align_opt Bwa mem command line options as a string
    @param align_threads Thread count forwarded to the Aligner object
    @param align_outdir Directory where to store the sam file
    @param align_outname Name of the output sam file
    @param indexer Path to the bwa index executable. Not required if bwa is added to your path
    @param index_opt Bwa index command line options as a string
    @param index_outdir Directory where to store the index files
    @param index_outname Basename of the index file
    @return Path of the output sam file
    @exception Exception Raised if no usable index is available and no ref was provided
    """
    idx = None

    # Try to import an existing index
    if not index:
        print("No index provided")
    else:
        print("Existing index provided")
        try:
            idx = ExistingIndex(index)
        except Exception as E:
            # Deliberate best effort: an unusable index is reported, then a new one is built
            print(E)

    # No index or invalid existing index = create a new one. Building it outside of the
    # except handler keeps later errors free of a misleading chained validation error.
    if idx is None:
        # Verify the presence of the reference fasta file
        if not ref:
            raise Exception("Invalid or no fasta file provided. Cannot create an index")

        print("Generating index...")
        mkdir(index_outdir)
        index_path = path.join(index_outdir, index_outname)
        idx = NewIndex(ref, index_path, index_opt, indexer)

    # Create an Aligner object
    mem = Aligner(idx, align_opt, aligner, align_threads)

    # Align the reference index with R1 fastq (and R2)
    mkdir(align_outdir)
    align_path = path.join(align_outdir, align_outname)
    return mem.align(R1, R2, align_path)