def run_gapclosing(outdir, libraries, nogapsFname, scaffoldsFname, threads, limit, iters, resume,
                   verbose, log, basename="_gapcloser", overlap=25, minReadLen=40):
    """Execute gapclosing step."""
    pout = scaffoldsFname
    # raised once gaps drop below 0.1% of the assembly
    stop = 0
    for i, (libnames, libFs, libRs, orientations, libIS, libISStDev, libreadlen) in enumerate(libraries, 1):
        # prepare config file and filter reads
        configFn = os.path.join(outdir, "%s.%s.conf"%(basename, i))
        # skip libraries not suitable for gap closing
        maxReadLen = max(libreadlen)
        if not prepare_gapcloser(outdir, configFn, libFs, libRs, orientations, libIS, libISStDev,
                                 minReadLen, maxReadLen, limit, verbose, log):
            continue
        # run iterations
        for j in range(1, iters+1):
            out = os.path.join(outdir, "%s.%s.%s.fa"%(basename, i, j))
            # recompute only if an earlier step was redone or the output is missing/corrupted
            if resume > 1 or _corrupted_file(out):
                resume += 1
                # run GapCloser
                cmd = ["GapCloser", "-t %s"%threads, "-p %s"%overlap, "-l %s"%maxReadLen,
                       "-a", pout, "-b", configFn, "-o", out]
                if verbose:
                    log.write(" iteration %s.%s: %s"%(i, j, FastaIndex(pout).stats()))
                # run GapCloser and save stdout/err to log file
                with open(out+".log", "w") as gapcloselog:
                    GapCloser = subprocess.Popen(cmd, stdout=gapcloselog, stderr=gapcloselog)
                    GapCloser.wait()
            # store out info
            pout = out
            # stop iterating once gaps make up less than 0.1% of the assembly
            stats = FastaIndex(pout).stats()
            fastaSize = int(stats.split('\t')[2])
            gapSize = int(stats.split('\t')[-2])
            if 1.0 * gapSize / fastaSize < 0.001:
                stop = 1
                break
        if stop:
            break
    # create symlink to final scaffolds or pout
    symlink(os.path.basename(pout), nogapsFname)
    symlink(os.path.basename(pout+".fai"), nogapsFname+".fai")
    return resume
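
# The resume logic above depends on a module-level helper, _corrupted_file(), defined outside
# this excerpt. The hypothetical sketch below (deliberately given a different name so it does
# not shadow the real helper) shows one plausible check: treat the output as unusable when it
# is missing or empty, so the step gets (re)computed. The actual redundans helper may do more,
# e.g. validate the FASTA / .fai contents.
def _is_missing_or_empty(fname):
    """Hypothetical stand-in for _corrupted_file(): True if fname is absent or has zero size."""
    return not os.path.isfile(fname) or os.path.getsize(fname) == 0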

def run_scaffolding(outdir, scaffoldsFname, fastq, libraries, reducedFname, mapq, threads,
                    joins, linkratio, limit, iters, sspacebin, gapclosing, verbose, usebwa, log,
                    identity, overlap, minLength, resume, lib=""):
    """Execute scaffolding step using libraries with increasing insert size
    in multiple iterations.
    """
    pout = reducedFname
    i = 0
    while i < len(libraries):
        libnames, libFs, libRs, orients, libIS, libISStDev, libreadlen = libraries[i]
        i += 1
        for j in range(1, iters+1):
            out = os.path.join(outdir, "_sspace.%s.%s"%(i, j))
            # recompute only if an earlier step was redone or the output is missing/corrupted
            if resume > 1 or _corrupted_file(out+".fa"):
                resume += 1
                if verbose:
                    log.write(" iteration %s.%s: %s"%(i, j, FastaIndex(pout).stats()))
                lib = ""
                # run fastq scaffolding
                fastq2sspace(out, open(pout), lib, libnames, libFs, libRs, orients,
                             libIS, libISStDev, libreadlen, threads, mapq, limit, linkratio, joins,
                             sspacebin, verbose=0, usebwa=usebwa, log=log)
            # store out info
            pout = out+".fa"
            # link output, e.g. _sspace.1.1.fa -> _sspace.1.1/_sspace.1.1.final.scaffolds.fasta
            targetout = os.path.join(os.path.basename(out), os.path.basename(out+".final.scaffolds.fasta"))
            symlink(targetout, pout)
        # if gaps make up more than 2% of the assembly, run gap closing & reduction
        stats = FastaIndex(pout).stats()
        fastaSize = int(stats.split('\t')[2])
        gapSize = int(stats.split('\t')[-2])
        if i < len(libraries) and gapclosing and 1.0 * gapSize / fastaSize > 0.02:
            nogapsFname = ".".join(pout.split(".")[:-1]) + ".filled.fa"
            if resume > 1 or _corrupted_file(nogapsFname):
                resume += 1
                # close gaps & reduce
                if verbose:
                    log.write(" closing gaps ...\n")
                basename = "_sspace.%s.%s._gapcloser"%(i, j)
                run_gapclosing(outdir, [libraries[i-1], ], nogapsFname, pout, threads, limit,
                               iters=1, resume=resume, verbose=0, log=log, basename=basename)
            reducedFname = ".".join(pout.split(".")[:-1]) + ".reduced.fa"
            if resume > 1 or _corrupted_file(reducedFname):
                if verbose:
                    log.write(" reducing ...\n")
                with open(reducedFname, "w") as out:
                    info = fasta2homozygous(out, open(nogapsFname), identity, overlap, minLength,
                                            threads, verbose=0, log=log)
            pout = reducedFname
            # update library insert size estimation, especially for mate-pairs
            libraries = get_libraries(fastq, pout, mapq, threads, verbose=0, log=log,
                                      libraries=libraries, usebwa=usebwa)
    # create symlink to final scaffolds or pout
    symlink(os.path.basename(pout), scaffoldsFname)
    symlink(os.path.basename(pout+".fai"), scaffoldsFname+".fai")
    return libraries, resume
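
# Both scaffolding and gap closing publish their final output through a symlink() helper defined
# elsewhere in the module. The hypothetical sketch below (named _make_symlink so it does not
# shadow the real helper) assumes it simply refreshes a symbolic link pointing at the given,
# possibly relative, target; the actual redundans implementation may handle more corner cases.
def _make_symlink(target, link):
    """Hypothetical stand-in for symlink(): (re)create `link` pointing at `target`."""
    if os.path.islink(link) or os.path.isfile(link):
        os.unlink(link)
    os.symlink(target, link)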

def redundans(fastq, longreads, fasta, reference, outdir, mapq, threads, mem, resume, identity, overlap, minLength,
              joins, linkratio, readLimit, iters, sspacebin,
              reduction=1, scaffolding=1, gapclosing=1, cleaning=1,
              norearrangements=0, verbose=1, usebwa=0, log=sys.stderr, tmp="/tmp"):
    """Launch redundans pipeline."""
    # check resume
    orgresume = resume
    if resume:
        log.write("%sResuming previous run from %s...\n"%(timestamp(), outdir))
        if not os.path.isdir(outdir):
            log.write("No such directory: %s!\n"%outdir)
            sys.exit(1)
    # prepare outdir or quit if it exists
    elif os.path.isdir(outdir):
        log.write("Directory %s exists!\n"%outdir)
        sys.exit(1)
    else:
        os.makedirs(outdir)

    # DE NOVO CONTIGS
    lastOutFn = os.path.join(outdir, "contigs.fa")
    if not fasta and _corrupted_file(lastOutFn):
        resume += 1
        if verbose:
            log.write("%sDe novo assembly...\n"%timestamp())
        fasta = denovo(os.path.join(outdir, "denovo"), fastq, threads, mem, verbose, log, tmp)
    elif not fasta:
        # reuse contigs from a previous run
        fasta = lastOutFn

    # REDUCTION
    fastas = [fasta, ]
    _check_fasta(fasta)
    symlink(fasta, lastOutFn)
    # update fasta list
    fastas.append(lastOutFn)
    _check_fasta(lastOutFn)
    outfn = os.path.join(outdir, "contigs.reduced.fa")
    if reduction and _corrupted_file(outfn):
        resume += 1
        if verbose:
            log.write("%sReduction...\n"%timestamp())
        log.write("#file name\tgenome size\tcontigs\theterozygous size\t[%]\theterozygous contigs\t[%]\tidentity [%]\tpossible joins\thomozygous size\t[%]\thomozygous contigs\t[%]\n")
        with open(outfn, "w") as out:
            info = fasta2homozygous(out, open(fastas[-1]), identity, overlap, minLength,
                                    threads, verbose=0, log=log)
        # update fasta list
        lastOutFn = outfn
        fastas.append(lastOutFn)
        _check_fasta(lastOutFn)

    # get read limit & libraries
    if fastq:
        if verbose:
            log.write("%sEstimating parameters of libraries...\n"%timestamp())
        limit = get_read_limit(lastOutFn, readLimit, verbose, log)
        libraries = get_libraries(fastq, lastOutFn, mapq, threads, verbose, log, usebwa=usebwa)

    # SCAFFOLDING
    outfn = os.path.join(outdir, "scaffolds.fa")
    if fastq and scaffolding:
        if verbose:
            log.write("%sScaffolding...\n"%timestamp())
        libraries, resume = run_scaffolding(outdir, outfn, fastq, libraries, lastOutFn, mapq, threads, joins,
                                            linkratio, limit, iters, sspacebin, gapclosing, verbose, usebwa, log,
                                            identity, overlap, minLength, resume)
        # update fasta list
        fastas += filter(lambda x: "_gapcloser" not in x,
                         sorted(glob.glob(os.path.join(outdir, "_sspace.*.fa"))))
        lastOutFn = outfn
        fastas.append(lastOutFn)
        _check_fasta(lastOutFn)

    # SCAFFOLDING WITH LONG READS
    outfn = os.path.join(outdir, "scaffolds.longreads.fa")
    if longreads and _corrupted_file(outfn):
        # here maybe sort reads by increasing median read length
        resume += 1
        if verbose:
            log.write("%sScaffolding with long reads...\n"%timestamp())
        poutfn = lastOutFn
        for i, fname in enumerate(longreads, 1):
            if verbose:
                log.write(" iteration %s...\n"%i)
            s = LongReadGraph(lastOutFn, fname, identity, overlap, maxgap=0, threads=threads,
                              dotplot="", norearrangements=norearrangements, log=0)
            # save output
            _outfn = os.path.join(outdir, "scaffolds.longreads.%s.fa"%i)
            with open(_outfn, "w") as out:
                s.save(out)
            # store fname
            fastas.append(_outfn)
            poutfn = _outfn
        # symlink last iteration
        symlink(poutfn, outfn)
        # update fasta list
        lastOutFn = outfn
        fastas.append(lastOutFn)
        _check_fasta(lastOutFn)

    # REFERENCE-BASED SCAFFOLDING
    outfn = os.path.join(outdir, "scaffolds.ref.fa")
    if reference and _corrupted_file(outfn):
        resume += 1
        if verbose:
            log.write("%sScaffolding based on reference...\n"%timestamp())
        s = SyntenyGraph(lastOutFn, reference, identity=0.51, overlap=0.66, maxgap=0, threads=threads,
                         dotplot="", norearrangements=norearrangements, log=0)
        # save output
        with open(outfn, "w") as out:
            s.save(out)
        # update fasta list
        lastOutFn = outfn
        fastas.append(lastOutFn)
        _check_fasta(lastOutFn)

    # GAP CLOSING
    outfn = os.path.join(outdir, "scaffolds.filled.fa")
    if fastq and gapclosing:
        if verbose:
            log.write("%sGap closing...\n"%timestamp())
        resume = run_gapclosing(outdir, libraries, outfn, lastOutFn, threads, limit, iters,
                                resume, verbose, log)
        # update fasta list
        fastas += sorted(glob.glob(os.path.join(outdir, "_gap*.fa")))
        lastOutFn = outfn
        fastas.append(lastOutFn)
        _check_fasta(lastOutFn)

    # FINAL REDUCTION
    outfn = os.path.join(outdir, "scaffolds.reduced.fa")
    if reduction and _corrupted_file(outfn):
        resume += 1
        if verbose:
            log.write("%sFinal reduction...\n"%timestamp())
        log.write("#file name\tgenome size\tcontigs\theterozygous size\t[%]\theterozygous contigs\t[%]\tidentity [%]\tpossible joins\thomozygous size\t[%]\thomozygous contigs\t[%]\n")
        # reduce
        with open(outfn, "w") as out:
            info = fasta2homozygous(out, open(lastOutFn), identity, overlap, minLength,
                                    threads, verbose=0, log=log)
        # update fasta list
        lastOutFn = outfn
        fastas.append(lastOutFn)
        _check_fasta(lastOutFn)

    # FASTA STATS
    if verbose:
        log.write("%sReporting statistics...\n"%timestamp())
    # report stats
    log.write('#fname\tcontigs\tbases\tGC [%]\tcontigs >1kb\tbases in contigs >1kb\tN50\tN90\tNs\tlongest\n')
    for fn in fastas:
        log.write(FastaIndex(fn).stats())

    # Clean-up
    if cleaning:
        if verbose:
            log.write("%sCleaning-up...\n"%timestamp())
        for root, dirs, fnames in os.walk(outdir):
            endings = ('.fa', '.fasta', '.fai', '.tsv', '.png', '.log')
            # remove everything but FASTA files, indexes, tables, plots and logs
            i = 0
            for i, fn in enumerate(filter(lambda x: not x.endswith(endings), fnames), 1):
                os.unlink(os.path.join(root, fn))
            # rmdir of snap index once all of its files have been removed
            if root.endswith('.snap') and i == len(fnames):
                os.rmdir(root)

    if orgresume:
        log.write("%sResume report: %s step(s) have been recalculated.\n"%(timestamp(), resume-1))
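
# Illustrative only: a direct call to redundans() with hypothetical inputs and parameter values.
# File names, thresholds and the SSPACE binary path below are made up for the example; in
# practice the pipeline is normally launched through its command-line wrapper, which fills
# these in from user options.
def _example_run():
    redundans(fastq=["reads_1.fq.gz", "reads_2.fq.gz"], longreads=[], fasta="contigs.fa",
              reference="", outdir="redundans_out", mapq=10, threads=4, mem=2, resume=0,
              identity=0.51, overlap=0.66, minLength=200, joins=5, linkratio=0.7,
              readLimit=0.2, iters=2, sspacebin="SSPACE_Standard_v3.0.pl")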