def printArgs(parsedArgs):
    """Log (at debug level) a human-readable summary of the parsed arguments.

    Builds one multi-line string covering the simulation type, the input
    genome files, the species prefixes/ids and, for RNA simulations, the
    annotation and generated transcriptome files, then emits it through the
    module logger.

    parsedArgs : parsed CLI arguments object (argparse Namespace-like).
    """
    allArgsStr = "All Args \n"
    allArgsStr += "#" * 25 + "\n"
    allArgsStr += f"Simulation Type {parsedArgs.simulation_type}" + "\n"
    allArgsStr += "Input Genomes Files :" + "\n"
    indLevel = "\t"
    for inFile in parsedArgs.genomes:
        allArgsStr += indLevel + "* " + os.path.abspath(inFile) + "\n"
    allArgsStr += "Species Prefix/Name and Id:" + "\n"
    for speciesPrefix in parsedArgs.speciesPrefix:
        allArgsStr += f"{indLevel} * {speciesPrefix}:{parsedArgs.speciesIds[speciesPrefix]}" + "\n"
    if parsedArgs.simulation_type == "RNA":
        allArgsStr += "Input Annotations Files :" + "\n"
        for inFile in parsedArgs.annotations:
            allArgsStr += indLevel + "* " + os.path.abspath(inFile) + "\n"
        # typo fixed in output text: "Transciptome" -> "Transcriptome"
        allArgsStr += "Generated Transcriptome Files :" + "\n"
        for inFile in parsedArgs.fasta_names:
            allArgsStr += indLevel + "* " + os.path.abspath(inFile) + "\n"
    allArgsStr += f"Read Length : {parsedArgs.input_rlen}" + "\n"
    allArgsStr += f"Read Layout : {parsedArgs.read_layout}" + "\n"
    allArgsStr += "#" * 25 + "\n"
    getLogger().debug(allArgsStr)
    ## TODO :: complete the rest here
    return
def bwaIndex(parsedArgs):
    """Build the BWA index for the concatenated genome.

    Selects the `bwtsw` indexing algorithm when the concatenated genome is
    larger than 3 Gbp, creates `<out_dir>/BWA_index` if needed and runs
    `bwa index`, aborting the whole program on failure.

    parsedArgs : parsed CLI arguments; must provide genomeConcatFasta and
                 out_dir.
    """
    logger = getLogger()
    algo = ""
    # Compute the concatenated genome size; it decides the index algorithm.
    genome_len = 0
    for rec in SeqIO.parse(parsedArgs.genomeConcatFasta, 'fasta'):
        genome_len += len(rec.seq)
    if genome_len > 3000000000:
        # message typo fixed ("larged" -> "larger")
        logger.info("Concatenated genome size is larger than 3GB. Using bwtsw algorithm for index generation")
        algo = "-a bwtsw"
    logger.info("Starting genome indexing with BWA.")
    # `== True` dropped in favor of the direct truth test.
    if os.path.isdir(f"{parsedArgs.out_dir}/BWA_index"):
        logger.info("BWA_index directory exists. Generating index files.")
    else:
        logger.info(f"Creating {parsedArgs.out_dir}/BWA_index directory. Writing index files to BWA_index.")
        os.makedirs(f"{parsedArgs.out_dir}/BWA_index")
    cmd_bwa_index = "bwa index " \
        f"-p {parsedArgs.out_dir}/BWA_index/concat_BWA " \
        f"{algo} " \
        f"{parsedArgs.genomeConcatFasta}"
    res = crossmapper.externalExec.execute(cmd_bwa_index, "BWA_index", outDir=f"{parsedArgs.out_dir}")
    if not res.resCheck(stdoutRemove=True):
        sys.exit("Execution Failed")
    logger.info("Genome index for BWA is generated.")
def __init__(self, parsedArgs, mapperName="mapper"):
    """Store the shared mapper configuration.

    parsedArgs : parsed CLI arguments object; out_dir is read here.
    mapperName : short mapper name used to derive the per-mapper
                 index/output directory names.
    """
    self.parsedArgs = parsedArgs
    self.mapperName = mapperName
    self.logger = getLogger()
    # No sorted BAM produced yet for this mapper instance.
    self.sorted = False
    # Per-mapper working directories live under the main output directory.
    self.indexFolder = f"{parsedArgs.out_dir}/{mapperName}_index"
    self.mappingDir = f"{parsedArgs.out_dir}/{mapperName}_output"
def starIndex(parsedArgs):
    """Build the STAR genome index for the concatenated genome.

    Computes `--genomeSAindexNbases` from the concatenated genome length
    (capped at 14, matching the STAR manual's small-genome recommendation),
    creates `<out_dir>/STAR_index` if needed and runs STAR in
    `genomeGenerate` mode, aborting the program on failure. On success the
    index path is recorded in `parsedArgs.starIndex`.
    """
    logger = getLogger()
    # Compute the concatenated genome size; it drives the SA-index size and
    # the RAM warning below.
    genome_len = 0
    for rec in SeqIO.parse(parsedArgs.genomeConcatFasta, 'fasta'):
        genome_len += len(rec.seq)
    if genome_len > 3000000000:
        # message typo fixed ("larged" -> "larger")
        logger.warning("Concatenated genome size is larger than 3 Gb! More than 30 GB of RAM will be required for STAR mapping.")
    SA_index_size = min(14, round(math.log(genome_len, 2) / 2) - 1)
    logger.debug("genomeSAindexNbases = %s" % (SA_index_size))
    logger.info("Starting genome indexing with STAR.")
    star_index = f"{parsedArgs.out_dir}/STAR_index"
    # `== True` dropped in favor of the direct truth test.
    if os.path.isdir(star_index):
        logger.info("STAR_index directory exists. Generating index files.")
    else:
        logger.info(f"Creating {star_index} directory. Writing index files to STAR_index.")
        os.makedirs(star_index)
    cmd_star_index = "STAR --runMode genomeGenerate " \
        f"--runThreadN {parsedArgs.threads} " \
        f"--genomeDir {star_index} " \
        f"--genomeFastaFiles {parsedArgs.genomeConcatFasta} " \
        f"--genomeSAindexNbases {SA_index_size}"
    res = crossmapper.externalExec.execute(cmd_star_index, "STAR_index", outDir=f"{parsedArgs.out_dir}")
    if not res.resCheck(stdoutRemove=True):
        sys.exit("Execution Failed")
    logger.info("Genome index for STAR is generated.")
    parsedArgs.starIndex = star_index
def execute(cmd, softName="extr_cmd", stdOutFile=None, stdErrFile=None, outDir="", overwrite=True):
    """Run an external command, capturing its stdout/stderr into files.

    cmd        : full command line as one string; it is split on whitespace
                 and executed WITHOUT a shell.
    softName   : short tool name used in log messages and default file names.
    stdOutFile : path for captured stdout
                 (default: <outDir>/<softName>_stdout.txt).
    stdErrFile : path for captured stderr
                 (default: <outDir>/<softName>_stderr.txt).
    outDir     : directory used for the default capture files.
    overwrite  : if False, append to existing capture files (with a '#'
                 separator header) instead of truncating them.

    Returns an ExecRes wrapping the finished process — even on a non-zero
    exit code; callers are expected to check via ExecRes.resCheck().
    Raises Exception when the command cannot be started at all (missing
    binary, permission problem, ...).
    """
    mode = "w"
    if not overwrite:
        mode = 'a'
    logger = getLogger()
    logger.debug(f"Start Running {softName} CMD : {cmd}")
    # str.split() with no argument already discards empty fields, replacing
    # the previous split(" ") + filter(None) pair.
    cmd_list = cmd.strip().split()
    # `is None` instead of `== None` (identity test for the sentinel).
    if stdOutFile is None:
        stdOutFile = os.path.join(outDir, softName + "_stdout.txt")
    if stdErrFile is None:
        stdErrFile = os.path.join(outDir, softName + "_stderr.txt")
    with open(stdOutFile, mode) as outfile, open(stdErrFile, mode) as errorfile:
        try:
            if not overwrite:
                outfile.write("#" * 25 + "\n")
                errorfile.write("#" * 25 + "\n")
            # Write the command header before the process output, and flush
            # so the header precedes the child's own writes.
            outfile.write(f"{cmd}\n" + "#" * 25 + "\n")
            errorfile.write(f"{cmd}\n" + "#" * 25 + "\n")
            outfile.flush()
            errorfile.flush()
            process = subprocess.run(cmd_list, shell=False, stdout=outfile, stderr=errorfile, check=False)
            logger.debug("Running CMD Return Code : " + str(process.returncode))
            if process.returncode != 0:
                logger.error(f"Can not excecute {softName} CMD : \"{cmd}\".")
            return ExecRes(cmd, process, softName, stdOutFile, stdErrFile)
        except FileNotFoundError as no_file:
            logger.error("Error in execute CMD : NO SUCH FILE OR DIRECTORY. " + str(no_file), exc_info=True)
            # Chain the cause so the original traceback is preserved.
            raise Exception("Can not execute CMD, " + str(no_file)) from no_file
        except PermissionError as perm_denied:
            logger.error("PERMISSION DENIED, " + str(perm_denied), exc_info=True)
            raise Exception("Error in execute CMD, " + str(perm_denied)) from perm_denied
        except Exception as ex:
            logger.error("Error in execute CMD: " + str(ex), exc_info=True)
            raise Exception("Error in execute CMD, " + str(ex)) from ex
    # Unreachable in practice (the try block either returns or raises);
    # kept as a defensive fallback.
    return ExecRes(cmd, None, softName, stdOutFile, stdErrFile)
def resCheck(self, clean=True, stdoutRemove=False, stdErrRemove=True):
    """Check the wrapped process result; optionally clean capture files.

    Returns True on a zero exit code (after optional cleanup), False
    otherwise (logging where the error output was captured).
    """
    logger = getLogger()
    if self.returnCode == 0:
        # Success path: optionally delete the capture files, then report OK.
        if clean:
            self.clean(stdoutRemove=stdoutRemove, stdErrRemove=stdErrRemove)
        return True
    logger.error(f"Error running {self.cmd}")
    logger.error(f"See error log in {self.stdErrFile}")
    return False
def clean(self, stdoutRemove=False, stdErrRemove=True):
    """Delete the captured stdout/stderr files as requested by the flags."""
    logger = getLogger()
    # Walk the (flag, path) pairs instead of two copy-pasted if blocks.
    for wanted, capturePath in ((stdoutRemove, self.stdOutFile),
                                (stdErrRemove, self.stdErrFile)):
        if wanted:
            logger.debug(f"Deleteing {capturePath}")
            os.remove(capturePath)
    return
def bwaMapping(parsedArgs, reads, rlen, read_layout):
    """Map simulated reads with BWA-MEM and produce a sorted, indexed BAM.

    reads       : space-separated fastq path(s) (one file for SE, two for PE).
    rlen        : read length, used only in output file names.
    read_layout : "SE" or "PE", used only in output file names.

    Writes <out_dir>/bwa_output/concat_<rlen>_<layout>_sorted.bam (+ .bai)
    and records the BAM path in parsedArgs.mappingOutputFiles[rlen][layout].
    Exits the whole program if any external command fails.
    """
    logger = getLogger()
    logger.info("Starting mapping with BWA.")
    bwa_dir = f"{parsedArgs.out_dir}/bwa_output"
    parsedArgs.mappingDir = bwa_dir
    # `== False` dropped in favor of `not`.
    if not os.path.isdir(bwa_dir):
        logger.info(f"Creating {bwa_dir} directory.")
        os.makedirs(bwa_dir)
    tmpSamFile = f"{bwa_dir}/concat_{rlen}_{read_layout}.sam"
    finalBamFile = f"{bwa_dir}/concat_{rlen}_{read_layout}_sorted.bam"
    cmd_bwa_mapping = f"bwa mem -a -t {parsedArgs.threads} -A {parsedArgs.match_score} -B {parsedArgs.mismatch_penalty} {parsedArgs.out_dir}/BWA_index/concat_BWA {reads}"
    # bwa's SAM output goes to tmpSamFile via the executor's stdout capture.
    res = crossmapper.externalExec.execute(cmd_bwa_mapping, "BWA_mapping", tmpSamFile, None, outDir=f"{bwa_dir}")
    if not res.resCheck():
        sys.exit("Execution Failed")
    ## TODO :: samtools view -bS
    res = crossmapper.externalExec.execute(f"samtools sort -@{parsedArgs.threads} -o {finalBamFile} {tmpSamFile}", "samtools", outDir=f"{bwa_dir}")
    if not res.resCheck(stdoutRemove=True, stdErrRemove=True):
        sys.exit("Execution Failed")
    logger.info("Mapping is finished. " + f"Final bam file writen to {finalBamFile}")
    try:
        logger.debug(f"Deleteing tmp sam file {tmpSamFile}")
        os.remove(tmpSamFile)
    except OSError:
        # Narrowed from a bare `except:` — only file-removal errors are
        # expected and tolerable here; everything else should propagate.
        logger.warning(f"Can not delete temporary sam files {tmpSamFile}")
    logger.info("Starting Bam indexing.")
    cmd_samtools_index = f"samtools index {finalBamFile}"
    res = crossmapper.externalExec.execute(cmd_samtools_index, "samtools_index", outDir=f"{bwa_dir}")
    if not res.resCheck(stdoutRemove=True, stdErrRemove=True):
        sys.exit("Execution Failed")
    parsedArgs.mappingOutputFiles[rlen][read_layout] = finalBamFile
    logger.info("Bam Indexing is finished.")
def printOutputFileInfo(parsedArgs, step='All'):
    """Debug-log the output files produced by the given pipeline step.

    step : "wgsim", "mapping" or "All" (default) — selects which of the
           recorded output-file dictionaries to dump.
    """
    logger = getLogger()
    if step in ("wgsim", "All"):
        summary = "\n"
        for rlen, files in parsedArgs.simulationOutputFiles.items():
            summary += f"\t\t * {rlen} : {files}\n"
        logger.debug("wgsim output files : " + summary)
    if step in ("mapping", "All"):
        summary = "\n"
        for rlen, layout_files in parsedArgs.mappingOutputFiles.items():
            for layout, files in layout_files.items():
                summary += f"\t\t * {rlen} ({layout}) : {files}\n"
        logger.debug("mapping output files : " + summary)
def concatGeneomes(parsedArgs):
    """Concatenate the renamed per-species fasta files into one genome file.

    Writes <out_dir>/concat.fasta (via `cat`, using the executor's stdout
    redirection) and stores the path in parsedArgs.genomeConcatFasta.
    Exits the program if the command fails.
    """
    logger = getLogger()
    # A plain copy replaces the original index-based append loop.
    genome_list = list(parsedArgs.chr_rename_fasta)
    genome_concat = ' '.join(genome_list)
    parsedArgs.genomeConcatFasta = f"{parsedArgs.out_dir}/concat.fasta"
    res = crossmapper.externalExec.execute(f"cat {genome_concat}", "cat", f"{parsedArgs.genomeConcatFasta}", None, f"{parsedArgs.out_dir}")
    if not res.resCheck():
        sys.exit("Execution fail")
def createHTMLReport(resCounters, args):
    """Render the HTML report from the counter results and write it to disk.

    resCounters : nested counters per read length / layout.
    args        : parsed CLI arguments; out_dir is where report.html lands.
    """
    logger = getLogger()
    reportFilePath = os.path.join(args.out_dir, "report.html")
    logger.info(f"Creating Report File : {reportFilePath}")
    # Gather the sub-templates the main template expects, then render once.
    templateContext = dict(
        headTemplate=headTemplate,
        contentTemplate=contentTemplate,
        barGroupChartTemplate=barGroupChartTemplate,
        lineChartTemplate=lineChartTemplate,
        seriesTemplate=seriesTemplate,
        drilldownTemplate=drilldownTemplate,
        barchart2DivtableTemplate=barchart2DivtableTemplate,
        barchart2Template=barchart2Template,
        counterRes=resCounters,
        args=args,
    )
    reportHTML = reportTemplete.render(**templateContext)
    with open(reportFilePath, "w+") as fh:
        fh.write(reportHTML)
    return
def concatAnnotations(parsedArgs):
    """Concatenate the per-species annotation files into one GTF.

    Only acts for RNA simulations: collects one gtf path per input genome
    (using the gffread-converted copy in out_dir for gff inputs), `cat`s
    them into <out_dir>/concat.gtf and records the path in
    parsedArgs.annotationsGTFConcat. Exits the program on failure.
    """
    logger = getLogger()
    # Guard clause instead of wrapping the whole body in the RNA check.
    if parsedArgs.simulation_type != "RNA":
        return
    ### concatenate gtf files
    gtf_list = []
    for i in range(0, len(parsedArgs.genomes)):
        annotation = parsedArgs.annotations[i]
        if annotation.split(".")[-1] == "gtf":
            # Already a gtf: use the input file as-is.
            gtf_list.append(annotation)
        else:
            # gff input: use the converted copy written earlier to out_dir.
            gtf_name = getBaseName(annotation) + ".gtf"
            gtf_list.append(f"{parsedArgs.out_dir}/{gtf_name}")
    gtf_concat = ' '.join(gtf_list)
    res = crossmapper.externalExec.execute(
        f"cat {gtf_concat}", "cat", f"{parsedArgs.out_dir}/concat.gtf", None,
        f"{parsedArgs.out_dir}")
    if not res.resCheck():
        sys.exit("Execution fail")
    parsedArgs.annotationsGTFConcat = f"{parsedArgs.out_dir}/concat.gtf"
def readSimulation(parsedArgs, fasta_name, fasta_basename, file_number, read_len):
    """Simulate reads for one fasta file with wgsim.

    fasta_name     : fasta file to simulate reads from.
    fasta_basename : base name used for the two output fastq files.
    file_number    : index of this fasta among the inputs; selects the
                     per-file --coverage / --N_read option.
    read_len       : read length used for both mates.

    Returns the wgsim command line that was executed. Output fastq files are
    written to <out_dir>/wgsim_output/.
    """
    logger = getLogger()
    # Length of the *concatenated* genome is used for the coverage-based
    # read-count computation below.
    fasta_len = 0
    for rec in SeqIO.parse(f"{parsedArgs.genomeConcatFasta}", 'fasta'):
        fasta_len += len(rec.seq)
    parsedArgs.simDir = os.path.join(parsedArgs.out_dir, "wgsim_output")
    # `== False` dropped in favor of `not`.
    if not os.path.isdir(f"{parsedArgs.simDir}"):
        logger.info(f"Creating {parsedArgs.simDir} directory.")
        os.makedirs(f"{parsedArgs.simDir}")
    ## if possible to assign, calculate N_reads, based on C, else use input value
    try:
        N_reads = round(parsedArgs.coverage[file_number] * fasta_len / read_len)
    except (TypeError, IndexError):
        # coverage is None (option not given) or too short: fall back to the
        # explicit read count. Narrowed from `except Exception` to the two
        # expected failure modes so real bugs are not swallowed.
        N_reads = parsedArgs.N_read[file_number]
    # Per-call seed: anchored to the user seed but different on every call.
    random_seed = int(parsedArgs.random_seed) + random.randint(1, 100000)
    cmd_wgsim = f"wgsim " \
        f"-e {parsedArgs.error} " \
        f"-d {parsedArgs.outer_dist} " \
        f"-s {parsedArgs.s_dev} " \
        f"-N {N_reads} " \
        f"-1 {read_len} " \
        f"-2 {read_len} " \
        f"-r {parsedArgs.mut_rate} " \
        f"-R {parsedArgs.indel_fraction} " \
        f"-X {parsedArgs.indel_extend} " \
        f"-S {random_seed} " \
        f"-A {parsedArgs.discard_ambig} " \
        f"{fasta_name} {parsedArgs.simDir}/{fasta_basename}_{read_len}_read1.fastq {parsedArgs.simDir}/{fasta_basename}_{read_len}_read2.fastq "
    # NOTE(review): the executor result is not resCheck'ed here, unlike every
    # other external call — confirm whether wgsim failures should abort.
    crossmapper.externalExec.execute(cmd_wgsim, "cmd_wgsim", outDir=f"{parsedArgs.simDir}", overwrite=False)
    return cmd_wgsim
def parseArgument(argumentParser):
    """Parse and validate the CLI arguments; return the enriched namespace.

    Validates input files and option combinations (exiting via sys.exit on
    any problem), derives all the per-run file names and directories, sets
    up logging, and attaches the chosen mapper object to the namespace.
    """
    parsedArgs = argumentParser.parse_args()
    ## setup absole path for dir
    parsedArgs.out_dir = os.path.abspath(parsedArgs.out_dir)
    if os.path.isdir(parsedArgs.out_dir) != True:
        ## TODO :: create the folder here
        # cmd_mkdir = "mkdir ./%s"%(parsedArgs.out_dir)
        ## try and handle execption here
        os.makedirs(parsedArgs.out_dir)
    # Every input genome must exist and be non-empty.
    for i in range(0, len(parsedArgs.genomes)):
        if os.path.exists(parsedArgs.genomes[i]):
            if not os.path.getsize(parsedArgs.genomes[i]) > 0:
                sys.exit(
                    f"Error: {parsedArgs.genomes[i]} file is empty! Please provide a valid file."
                )
        else:
            sys.exit(
                f"Error: {parsedArgs.genomes[i]} file does not exist! Please provide a valid file."
            )
    ############### checking input
    if len(parsedArgs.genomes) <= 1:
        sys.exit(
            f"Error: Number of provided input genomes must be at least 2.")
    if parsedArgs.simulation_type == "RNA":
        if len(parsedArgs.genomes) != len(parsedArgs.annotations):
            sys.exit(
                f"Error: Number of provided input genomes files does not match number of input annotations files."
            )
    # --coverage takes precedence over --N_read; a single value is broadcast
    # to every input genome.
    if parsedArgs.coverage is not None:
        if len(parsedArgs.coverage) == 1:
            for ic in range(1, len(parsedArgs.genomes)):
                parsedArgs.coverage.append(parsedArgs.coverage[0])
        if len(parsedArgs.genomes) > len(parsedArgs.coverage):
            sys.exit(
                f"Error: Provided Coverage (--coverage) options do not match the input genomes files. You should provide coverage for each input fasta file or just one coverage for all of them."
            )
    elif parsedArgs.N_read is not None:
        if len(parsedArgs.N_read) == 1:
            for ic in range(1, len(parsedArgs.genomes)):
                parsedArgs.N_read.append(parsedArgs.N_read[0])
        elif len(parsedArgs.genomes) > len(parsedArgs.N_read):
            sys.exit(
                f"Error: Provided number of reads/read pairs to generate (--N_read) options do not match the input genomes files. You should provide one for each input fasta file or just one for all of them."
            )
    ### for renaming chr names
    # Derived paths for the chromosome-renamed copies of each genome.
    parsedArgs.chr_rename_fasta = []
    for i in range(0, len(parsedArgs.genomes)):
        fasta_chr_rename = getBaseName(
            parsedArgs.genomes[i]) + "_rename" + ".fasta"
        parsedArgs.chr_rename_fasta.append(
            os.path.abspath(parsedArgs.out_dir) + "/" + fasta_chr_rename)
    #print(parsedArgs.chr_rename_fasta)
    ### for renaming chr names in gff
    if parsedArgs.simulation_type == "RNA":
        parsedArgs.chr_rename_gff = []
        for i in range(0, len(parsedArgs.annotations)):
            # Keep the original extension (gtf vs gff) for the renamed copy.
            if parsedArgs.annotations[i][-3:] == "gtf":
                gff_chr_rename = getBaseName(
                    parsedArgs.annotations[i]) + "_rename" + ".gtf"
                parsedArgs.chr_rename_gff.append(
                    os.path.abspath(parsedArgs.out_dir) + "/" + gff_chr_rename)
            elif parsedArgs.annotations[i][-3:] == "gff":
                gff_chr_rename = getBaseName(
                    parsedArgs.annotations[i]) + "_rename" + ".gff"
                parsedArgs.chr_rename_gff.append(
                    os.path.abspath(parsedArgs.out_dir) + "/" + gff_chr_rename)
        #print(parsedArgs.chr_rename_gff)
    # fasta_names = the fasta files that wgsim will simulate from:
    # per-genome transcriptomes for RNA, the renamed genomes for DNA.
    parsedArgs.fasta_names = []
    if parsedArgs.simulation_type == "RNA":
        for i in range(0, len(parsedArgs.chr_rename_fasta)):
            transcriptome_name = getBaseName(
                parsedArgs.chr_rename_fasta[i]) + "_transcriptome%s" % (
                    i + 1) + ".fasta"
            # parsedArgs.fasta_names.append(os.path.abspath(transcriptome_name))
            parsedArgs.fasta_names.append(
                os.path.join(parsedArgs.out_dir, transcriptome_name))
        # Annotation files must also exist and be non-empty.
        if len(parsedArgs.annotations) > 0:
            for i in range(0, len(parsedArgs.annotations)):
                if os.path.exists(parsedArgs.annotations[i]):
                    if not os.path.getsize(parsedArgs.annotations[i]) > 0:
                        sys.exit(
                            f"Error: {parsedArgs.annotations[i]} file is empty! Please provide a valid file."
                        )
                else:
                    sys.exit(
                        f"Error: {parsedArgs.annotations[i]} file does not exist! Please provide a valid file."
                    )
    else:
        for i in range(0, len(parsedArgs.chr_rename_fasta)):
            parsedArgs.fasta_names.append(
                os.path.abspath(parsedArgs.chr_rename_fasta[i]))
    #print(parsedArgs.fasta_names)
    ## check if not all values can be converted to int
    try:
        list(map(int, parsedArgs.read_length.split(",")))
    except Exception:
        sys.exit(
            "There are strings or floats in read length values. Please use only standard read lengths!"
        )
    ## convert list of strings to list of integers
    input_rlen = list(map(int, parsedArgs.read_length.split(",")))
    #print(input_rlen)
    ## check if there are duplicated lengths
    if not len(set(input_rlen)) == len(input_rlen):
        sys.exit("Error: read lengths shoud not be duplicated!")
    ## check if any length is not standard
    for length in input_rlen:
        #print(length)
        if not length in standard_rlen:
            sys.exit(
                "Error: input read length %s is not a standard Illumina read length."
                % (length) +
                "\nPlease refer to our help page (crossmap -h) to find standard read lengths."
            )
    parsedArgs.input_rlen = input_rlen
    ## other initilization
    parsedArgs.simulationOutputFiles = {}
    parsedArgs.mappingOutputFiles = {}
    ## default concat geneome fasta file name
    parsedArgs.simDir = f"{parsedArgs.out_dir}"
    parsedArgs.mappingDir = f"{parsedArgs.out_dir}"
    parsedArgs.genomeConcatFasta = f"{parsedArgs.out_dir}/concat.fasta"
    parsedArgs.annotationsGTFConcat = f"{parsedArgs.out_dir}/concat.gtf"
    ## setting internal variable to parsedArgs object
    parsedArgs.isDebug = __DEBUG__
    parsedArgs.logPrefix = "crossmap.log"
    parsedArgs.logFile = os.path.join(parsedArgs.out_dir, parsedArgs.logPrefix)
    if parsedArgs.verbose == "Debug":
        parsedArgs.isDebug = True
        parsedArgs.verbose = VerboseLevel.All
    ## option to report crossmapp reads info files
    # parsedArgs.reportCrossmapped = True
    # Species prefixes default to the genome-file basenames when not given.
    if parsedArgs.genome_names is not None:
        parsedArgs.speciesPrefix = parsedArgs.genome_names
    else:
        parsedArgs.speciesPrefix = None
    if parsedArgs.speciesPrefix == None:
        ## get basename from the genome file
        parsedArgs.speciesPrefix = []
        for i in range(0, len(parsedArgs.genomes)):
            genomePrefix = getBaseName(parsedArgs.genomes[i])
            parsedArgs.speciesPrefix.append(genomePrefix)
    ## create specied Ids dict
    # Maps each species prefix to its numeric id (input order).
    parsedArgs.speciesIds = {}
    for i in range(0, len(parsedArgs.speciesPrefix)):
        parsedArgs.speciesIds[parsedArgs.speciesPrefix[i]] = i
    if __DEBUG__:
        printArgs(parsedArgs)
    cmdLine = " ".join(sys.argv)
    setupLogger(parsedArgs)
    getLogger().info("Starting the program with \"" + cmdLine + "\"")
    ## FIXE :: change this option to be global
    # NOTE(review): star_temp_dir is only set for DNA simulations here, yet
    # starMapping() (RNA path) reads parsedArgs.star_temp_dir — confirm it
    # is set elsewhere for RNA runs.
    if parsedArgs.simulation_type == "DNA":
        parsedArgs.star_temp_dir = "./TMPs"
    # Mapper selection: built-in STAR/BWA, or a user-supplied YAML template.
    if parsedArgs.mapper_template is None:
        if parsedArgs.simulation_type == "RNA":
            parsedArgs.mapper = STARMapper(parsedArgs)
        else:
            parsedArgs.mapper = BWAMapper(parsedArgs)
    else:
        if os.path.dirname(parsedArgs.mapper_template) == "":
            # if no path look at the current dir if not look to config folder of the module
            if not os.path.exists(parsedArgs.mapper_template):
                mappersConfigFolder = os.path.abspath(
                    os.path.dirname(os.path.realpath(__file__)) +
                    "/mappers_config/")
                mapperTemplatePath = os.path.join(mappersConfigFolder,
                                                  parsedArgs.mapper_template)
                if not os.path.exists(mapperTemplatePath):
                    ## see if we have an ext
                    if os.path.splitext(parsedArgs.mapper_template)[1] == "":
                        ## add yaml ext and try again
                        mapperTemplatePath = os.path.join(
                            mappersConfigFolder,
                            parsedArgs.mapper_template + ".yaml")
                        if not os.path.exists(mapperTemplatePath):
                            sys.exit(
                                f"Can not Find the mapper template {parsedArgs.mapper_template}."
                            )
                        else:
                            parsedArgs.mapper_template = mapperTemplatePath
                else:
                    parsedArgs.mapper_template = mapperTemplatePath
        with open(parsedArgs.mapper_template, 'r') as inputTemplate:
            try:
                configTemplate = yaml.safe_load(inputTemplate)
                parsedArgs.mapper = TemplateMapper(configTemplate, parsedArgs)
                getLogger().info(
                    f"Custom Mapper Tempalte {parsedArgs.mapper.mapperName} Will be used."
                )
                parsedArgs.mapper.checkDep()
            except yaml.YAMLError as exc:
                sys.exit("Can not Parse config Tempalte, {0}".format(exc))
            except Exception as ex:
                getLogger().error(
                    "Error Can not use Custom Mapper, {0}".format(ex))
                #raise ex
                sys.exit("Error Can not use Custom Mapper.")
    return parsedArgs
def extractTranscriptome(parsedArgs):
    """Extract a per-genome transcriptome fasta from each annotation file.

    For gtf inputs, runs gffread directly; for gff inputs, first converts
    the annotation to gtf (written to out_dir), then extracts. Any other
    extension aborts the program. Output transcriptomes are written to
    <out_dir>/<genome-basename>_transcriptome<i>.fasta.
    """
    logger = getLogger()
    for i in range(0, len(parsedArgs.annotations)):
        # Dispatch on the annotation file extension.
        if parsedArgs.annotations[i].split(".")[-1] == "gtf":
            logger.info(
                "Annotation file %s detected as gtf. Proceeding to transriptome extraction."
                % (os.path.basename(parsedArgs.annotations[i])))
            #get the transcriptome name
            transcriptome_name = getBaseName(
                parsedArgs.genomes[i]) + "_transcriptome%s" % (i +
                                                               1) + ".fasta"
            # extract the transcript
            cmd_gffread_extract = f"gffread " \
                f"-w {parsedArgs.out_dir}/{transcriptome_name} " \
                f"-g {parsedArgs.genomes[i]} " \
                f"{parsedArgs.annotations[i]}"
            #print(cmd_gffread_extract)
            res = crossmapper.externalExec.execute(
                cmd_gffread_extract,
                "gffreadExtract",
                outDir=f"{parsedArgs.out_dir}")
            if not res.resCheck():
                sys.exit("Execution fail")
            #gffread -w transcriptome_name -g parsedArgs.genomes[i] parsedArgs.annotations[i]
            logger.info("Transcriptome extracted for %s" %
                        (os.path.basename(parsedArgs.genomes[i])))
        elif parsedArgs.annotations[i].split(".")[-1] == "gff":
            logger.info(
                "Annotation file %s detected as gff. Converting to gtf using gffread."
                % (os.path.basename(parsedArgs.annotations[i])))
            #converting to gtf
            gtf_name = getBaseName(parsedArgs.annotations[i]) + ".gtf"
            cmd_gffread_convert = f"gffread " \
                f"{parsedArgs.annotations[i]} " \
                f"-T -o {parsedArgs.out_dir}/{gtf_name}"
            #print(cmd_gffread_convert)
            res = crossmapper.externalExec.execute(
                cmd_gffread_convert,
                "gffreadConvert",
                outDir=f"{parsedArgs.out_dir}")
            if not res.resCheck(stdoutRemove=True, stdErrRemove=True):
                sys.exit("Execution fail")
            #gffread parsedArgs.annotations[i] -T -o gtf_name
            logger.info(
                "GFF --> GTF conversion is done. Proceeding to transriptome extraction."
            )
            #get the transcriptome name
            transcriptome_name = getBaseName(
                parsedArgs.genomes[i]) + "_transcriptome%s" % (i +
                                                               1) + ".fasta"
            # Extraction uses the freshly converted gtf in out_dir, not the
            # original gff.
            cmd_gffread_extract = f"gffread " \
                f"-w {parsedArgs.out_dir}/{transcriptome_name} " \
                f"-g {parsedArgs.genomes[i]} " \
                f"{parsedArgs.out_dir}/{gtf_name}"
            #print(cmd_gffread_extract)
            res = crossmapper.externalExec.execute(
                cmd_gffread_extract,
                "gffreadExtract",
                outDir=f"{parsedArgs.out_dir}")
            if not res.resCheck(stdoutRemove=True, stdErrRemove=True):
                sys.exit("Execution fail")
            # extract the transcript
            #gffread -w transcriptome_name -g parsedArgs.genomes[i] gtf_name
            logger.info("Transcriptome extracted for %s" %
                        (os.path.basename(parsedArgs.genomes[i])))
        else:
            logger.error(
                "Error: annotation file %s is neither gtf nor in gff. Please check the annotation file."
                % (os.path.basename(parsedArgs.annotations[i])))
            sys.exit("Execution Failed")
def concateFastqFiles(parsedArgs, rlen): logger = getLogger() #for rlen in parsedArgs.input_rlen: genome_list_r1 = [] genome_list_r2 = [] for i in range(0, len(parsedArgs.genomes)): if parsedArgs.simulation_type == "RNA": read_1 = parsedArgs.simDir + "/" + getBaseName( parsedArgs.genomes[i]) + "_transcriptome" + str( i + 1) + "_" + str(rlen) + "_read1.fastq" genome_list_r1.append(read_1) read_2 = parsedArgs.simDir + "/" + getBaseName( parsedArgs.genomes[i]) + "_transcriptome" + str( i + 1) + "_" + str(rlen) + "_read2.fastq" genome_list_r2.append(read_2) #print(genome_list_r2) else: read_1 = parsedArgs.simDir + "/" + getBaseName( parsedArgs.genomes[i]) + "_" + str(rlen) + "_read1.fastq" genome_list_r1.append(read_1) read_2 = parsedArgs.simDir + "/" + getBaseName( parsedArgs.genomes[i]) + "_" + str(rlen) + "_read2.fastq" genome_list_r2.append(read_2) genome_concat1 = ' '.join(genome_list_r1) #cmd_read1_concat = f"cat {genome_concat1} > {parsedArgs.out_dir}/concat_{rlen}_read1.fastq" res = crossmapper.externalExec.execute( f"cat {genome_concat1}", "cat", f"{parsedArgs.simDir}/concat_{rlen}_read1.fastq", None, f"{parsedArgs.out_dir}") if not res.resCheck(): sys.exit("Execution fail") genome_concat2 = ' '.join(genome_list_r2) # cmd_read2_concat = f"cat {genome_concat2} > {parsedArgs.simDir}/concat_{rlen}_read2.fastq" if parsedArgs.read_layout != "SE": res = crossmapper.externalExec.execute( f"cat {genome_concat2}", "cat", f"{parsedArgs.simDir}/concat_{rlen}_read2.fastq", None, f"{parsedArgs.out_dir}") if not res.resCheck(): sys.exit("Execution fail") # else: ## no need ?? 
# ## remove right reads files # try : # logger.debug(f"Removeing simulated reads 2 from wgsim {parsedArgs.out_dir}/concat_{rlen}_read2.fastq") # os.remove(f"{parsedArgs.out_dir}/concat_{rlen}_read2.fastq") # except: # logger.warning(f"Can not remove unwanted reads file {parsedArgs.out_dir}/concat_{rlen}_read2.fastq") parsedArgs.simulationOutputFiles[rlen].append( f"{parsedArgs.simDir}/concat_{rlen}_read1.fastq") if parsedArgs.read_layout != "SE": parsedArgs.simulationOutputFiles[rlen].append( f"{parsedArgs.simDir}/concat_{rlen}_read2.fastq") ## cleanning temp files tmpFiles = [] tmpFiles.extend(genome_list_r1) tmpFiles.extend(genome_list_r2) for tmpFile in tmpFiles: try: logger.debug(f"Deleteing tmp file {tmpFile}") os.remove(tmpFile) logger.debug(f"tmp file {tmpFile} delete") except Exception: logger.error(f"Can not delete tmp file {tmpFile}", exc_info=True)
def starMapping(parsedArgs, reads, rlen, read_layout):
    """Map simulated reads with STAR and produce a sorted, indexed BAM.

    reads       : space-separated fastq path(s) passed to --readFilesIn.
    rlen        : read length; drives --sjdbOverhang and output file names.
    read_layout : "SE" or "PE", used in output file names.

    Writes <out_dir>/star_output/concat_<rlen>_<layout>_sorted.bam (+ .bai)
    and records the BAM path in parsedArgs.mappingOutputFiles[rlen][layout].
    Exits the whole program if any external command fails.
    """
    logger = getLogger()
    # sjdbOverhang is read length - 1, per the STAR manual.
    overhang = rlen - 1
    star_dir = f"{parsedArgs.out_dir}/star_output"
    parsedArgs.mappingDir = star_dir
    # `== False` dropped in favor of `not`.
    if not os.path.isdir(star_dir):
        logger.info(f"Creating {star_dir} directory.")
        os.makedirs(star_dir)
    logger.info("Starting STAR mapping.")
    if parsedArgs.bacterial_mode is True:
        # --alignIntronMax 1 effectively disables spliced alignment.
        intron_len_max = 1
    else:
        # 0 lets STAR use its default maximum intron length.
        intron_len_max = 0
    cmd_star_mapping = "STAR " \
        f"--runThreadN {parsedArgs.threads} " \
        f"--genomeDir {parsedArgs.starIndex} " \
        f"--sjdbGTFfile {parsedArgs.out_dir}/concat.gtf " \
        f"--sjdbOverhang {overhang} " \
        f"--readFilesIn {reads} " \
        "--readFilesCommand cat --outSAMtype BAM Unsorted " \
        f"--outFileNamePrefix {star_dir}/concat_{rlen}_{read_layout}_ " \
        f"--outFilterMismatchNmax {parsedArgs.outFilterMismatchNmax} " \
        f"--outFilterMultimapNmax 10000 " \
        f"--outFilterMismatchNoverReadLmax {parsedArgs.outFilterMismatchNoverReadLmax} " \
        f"--alignIntronMax {intron_len_max} " \
        f"--outTmpDir {parsedArgs.star_temp_dir}"
    res = crossmapper.externalExec.execute(cmd_star_mapping, "STAR_mapping", outDir=f"{star_dir}", overwrite=False)
    if not res.resCheck(clean=False):
        sys.exit("Execution Failed")
    logger.info("Mapping is finished. Started bam file sorting and indexing.")
    finalBamFile = f"{star_dir}/concat_{rlen}_{read_layout}_sorted.bam"
    tmpBamFile = f"{star_dir}/concat_{rlen}_{read_layout}_Aligned.out.bam"
    cmd_samtools_sort = "samtools sort " \
        f"-@{parsedArgs.threads} " \
        f"-o {finalBamFile} {tmpBamFile}"
    res = crossmapper.externalExec.execute(cmd_samtools_sort, "samtools_sort", outDir=f"{star_dir}")
    if not res.resCheck(stdoutRemove=True, stdErrRemove=True):
        sys.exit("Execution Failed")
    logger.info("Sorting is finished. " + f"Final bam file writen to {finalBamFile}")
    try:
        logger.debug(f"Deleteing tmp sam file {tmpBamFile}")
        os.remove(tmpBamFile)
    except OSError:
        # Narrowed from a bare `except:` — only file-removal errors are
        # expected and tolerable here.
        logger.warning(f"Can not delete temporary bam file {tmpBamFile}")
    logger.info("Starting bam indexing.")
    cmd_samtools_index = f"samtools index {finalBamFile}"
    res = crossmapper.externalExec.execute(cmd_samtools_index, "samtools_index", outDir=f"{star_dir}")
    if not res.resCheck(stdoutRemove=True, stdErrRemove=True):
        sys.exit("Execution Failed")
    logger.info("Bam Indexing is finished.")
    parsedArgs.mappingOutputFiles[rlen][read_layout] = finalBamFile
def getReadCounters(args):
    """Count (cross-)mapped reads per read length / layout and build reports.

    Reads the input fastas to index sequences per organism, optionally
    loads transcript info from the concatenated GTF (RNA mode), then walks
    every recorded mapping BAM, counting reads per species. Writes a text
    summary to <out_dir>/report.txt, optionally per-species crossmapped
    read lists to <out_dir>/crossmapped_reads/, and finally the HTML
    report. Returns the nested {rlen: {layout: counter}} dict.
    """
    logger = getLogger()
    speciesIds = args.speciesIds  # { org1Name : 0, org2Name :1}
    logger.info("Reading Sequence Directory from Fasta files")
    spInputFastaFiles = args.genomes
    allSeqs = getSequencesPerOrganisms(
        spInputFastaFiles)  ## testing was [sp1InputFasta,sp2InputFasta]
    # Forward and reverse lookup between sequence name and numeric index.
    seqsIndex = createSequenceIndex(allSeqs)
    seqsIndexToSeq = dict((v, k) for k, v in seqsIndex.items())
    seqToOrg = sequenceToOrganism(allSeqs)
    transcriptMap = None
    if args.simulation_type == "RNA":
        logger.info("Reading GTF/GFF files for transcripts info ... ")
        transcriptMap = mapTranscriptToSequence(
            [args.annotationsGTFConcat],
            seqsIndex)  ## test was [sp1InputGTF, sp2InputGTF]
    counters = {}
    reportCorssmappedReadFiles = None
    if args.reportCrossmapped:
        reportCorssmappedReadFiles = {}
        ## create file here and store them in dic
        # One open output file per species for the crossmapped read ids.
        crossmapReadsDirName = args.out_dir + "/crossmapped_reads"
        if os.path.isdir(crossmapReadsDirName) != True:
            os.makedirs(crossmapReadsDirName)
        for spName in args.speciesPrefix:
            outfilename = crossmapReadsDirName + "/" + spName
            reportReadFile = open(outfilename, "w+")
            reportCorssmappedReadFiles[spName] = reportReadFile
    ## new code here
    outputFile = open(os.path.join(args.out_dir, "report.txt"), "w")
    for rlen, layout_files in args.mappingOutputFiles.items():
        counters[rlen] = {}
        for layout, inBamFileName in layout_files.items():
            logger.info(
                f"Start counting reads for read lenghth {rlen} and ({layout}) layout:"
            )
            bamFile = pysam.AlignmentFile(inBamFileName, "rb")
            ## chech if bamFile has NH tag or not , if not calc it in advance and pass it to the count method
            NHTags = checkNHTag(bamFile)
            allCounter, reads = countReads(
                bamFile,
                speciesIds,
                seqsIndexToSeq,
                seqToOrg,
                rlen=rlen,
                layout=layout,
                transcriptMap=transcriptMap,
                nhTag=NHTags,
                reportReadFiles=reportCorssmappedReadFiles)
            # NH tag map can be large; release it before the next BAM.
            del NHTags
            bamFile.close()
            counters[rlen][layout] = allCounter
            outputFile.write(
                f"Summary Counter for lenghth {rlen} and ({layout}) layout : {inBamFileName}\n"
            )
            allCounter.summary(outputFile)
            outputFile.write("*" * 50 + "\n\n")
    outputFile.close()
    if args.reportCrossmapped:
        ## close files
        for spName in args.speciesPrefix:
            reportCorssmappedReadFiles[spName].close()
    createHTMLReport(counters, args)
    return counters