def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept verbatim.
    #
    # args: command-line tokens; the first positional is the FASTA file, the
    #       remaining ones (at least one) are BAM files.
    # Exits through sys.exit() after printing help when fewer than two
    # positionals are supplied.
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]

    # Create a DUMMY gff file for cuffdiff: one "transcript" record per
    # sequence, spanning 1..size.
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        f = Fasta(fastafile, lazy=True)
        # The context manager closes the handle even if iteration fails.
        with open(gffile, "w") as fw:
            for key, size in f.itersizes_ordered():
                row = (key, "dummy", "transcript", 1, size,
                       ".", ".", ".", "ID=" + key)
                # write() behaves the same on Python 2 and 3, unlike the
                # Python 2-only `print >> fw` statement.
                fw.write("\t".join(str(x) for x in row) + "\n")
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
def __init__(self, filename, select=None):
    """
    Load contig sizes from a `.sizes` file, or from a FASTA file.

    Args:
        filename: path to an existing file. If it does not end with
            `.sizes`, a companion `<filename>.sizes` is (re)generated when
            `need_update` asks for it, and that file is loaded instead.
        select: optional positive minimum size; contigs shorter than this
            are dropped.

    Raises:
        AssertionError: if `filename` does not exist, or `select` is truthy
            but not positive.
    """
    assert op.exists(filename), "File `{0}` not found".format(filename)

    # filename can be both .sizes file or FASTA formatted file
    sizesname = filename

    if not filename.endswith(".sizes"):
        sizesname = filename + ".sizes"
        filename = get_abs_path(filename)
        if need_update(filename, sizesname):
            cmd = "faSize"
            if which(cmd):
                # Prefer the external tool when it is on the PATH
                cmd += " -detailed {0}".format(filename)
                sh(cmd, outfile=sizesname)
            else:
                from jcvi.formats.fasta import Fasta

                f = Fasta(filename)
                # Context manager closes the handle even on failure;
                # write() replaces the Python 2-only `print >> fw`.
                with open(sizesname, "w") as fw:
                    for k, size in f.itersizes_ordered():
                        fw.write("\t".join((k, str(size))) + "\n")

        filename = sizesname

    assert filename.endswith(".sizes")

    super(Sizes, self).__init__(filename)
    self.fp = open(filename)
    self.filename = filename

    # get sizes for individual contigs, both in list and dict
    # this is to preserve the input order in the sizes file
    sizes = list(self.iter_sizes())
    if select:
        assert select > 0
        sizes = [x for x in sizes if x[1] >= select]
    self.sizes_mapping = dict(sizes)

    # get cumulative sizes, both in list and dict
    if sizes:
        ctgs, sizes = zip(*sizes)
    else:
        # zip(*[]) yields nothing to unpack; fall back to empty tuples so an
        # empty (or fully filtered) sizes file does not raise ValueError.
        ctgs, sizes = (), ()
    self.sizes = sizes
    cumsizes = np.cumsum([0] + list(sizes))
    self.ctgs = ctgs
    self.cumsizes = cumsizes
    self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept verbatim.
    #
    # args: command-line tokens; the first positional is the FASTA file, the
    #       remaining ones (at least one) are BAM files.
    # Exits through sys.exit() after printing help when fewer than two
    # positionals are supplied.
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]

    # Create a DUMMY gff file for cuffdiff: one "transcript" record per
    # sequence, spanning 1..size.
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        f = Fasta(fastafile, lazy=True)
        # `with` guarantees the handle is closed even if a record fails.
        with open(gffile, "w") as fw:
            for key, size in f.itersizes_ordered():
                fields = (
                    key,
                    "dummy",
                    "transcript",
                    1,
                    size,
                    ".",
                    ".",
                    ".",
                    "ID=" + key,
                )
                print("\t".join(str(x) for x in fields), file=fw)
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept verbatim.
    #
    # args: command-line tokens; exactly one positional (the FASTA file) is
    #       required, otherwise help is printed and sys.exit() is called.
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("--clean", default=False, action="store_true",
                 help="Clean up irregular chars in seq")
    p.add_option("--matefile", help="Matepairs file")
    p.add_option("--maxreadlen", default=262143, type="int",
                 help="Maximum read length allowed")
    p.add_option("--minreadlen", default=1000, type="int",
                 help="Minimum read length allowed")
    p.add_option("--sequential", default=False, action="store_true",
                 help="Overwrite read name (e.g. long Pacbio name)")
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen

    if maxreadlen > 0:
        # Look for the first record exceeding the limit; one is enough to
        # trigger splitting of the whole file.
        split = False
        f = Fasta(fastafile, lazy=True)
        for seqid, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug("Sequence {0} (size={1}) longer than max read len {2}".format(seqid, size, maxreadlen))
                split = True
                break

        if split:
            # NOTE(review): only --maxreadlen=0 is forwarded, so the pieces
            # are processed with default values for every other option --
            # confirm this is intended.
            for piece in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([piece, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        # Floor division keeps the label integral ("Sanger3Kb-") on Python 3
        # as well; `/` would render "Sanger3.0Kb-" there.
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = plate

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        # Mated libraries are delegated to the external converter script.
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    # Unmated reads: emit the frg records directly.
    fw = must_open(frgfile, "w")
    sequential = opts.sequential
    i = j = 0  # i: fragments written, j: fragments discarded as too short
    try:
        # write() replaces the Python 2-only `print >> fw` statement.
        fw.write(headerTemplate.format(libID=libname) + "\n")
        for fragID, seq in parse_fasta(fastafile):
            if len(seq) < minreadlen:
                j += 1
                continue
            i += 1
            if sequential:
                fragID = libname + str(100000000 + i)
            emitFragment(fw, fragID, libname, seq)
    finally:
        # Close the output even when a record fails to convert.
        fw.close()

    logging.debug("A total of {0} fragments written to `{1}` ({2} discarded).".format(i, frgfile, j))
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept verbatim.
    #
    # args: command-line tokens; exactly one positional (the FASTA file) is
    #       required, otherwise help is printed and sys.exit() is called.
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("-m", dest="matefile", default=None,
                 help="matepairs file")
    p.add_option("--maxreadlen", default=32000, type="int",
                 help="Maximum read length allowed [default: %default]")
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    maxreadlen = opts.maxreadlen
    f = Fasta(fastafile, lazy=True)
    if maxreadlen > 0:
        # One over-long record is enough to trigger splitting of the file.
        split = False
        for seqid, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug("Sequence {0} (size={1}) longer than max read len {2}".format(seqid, size, maxreadlen))
                split = True
                break

        if split:
            # NOTE(review): only --maxreadlen=0 is forwarded, so the pieces
            # are processed with default values for every other option --
            # confirm this is intended.
            for piece in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([piece, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        # Floor division keeps the label integral ("Sanger3Kb-") regardless
        # of interpreter; `/` renders "Sanger3.0Kb-" on Python 3.
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = "SangerFrags-" + plate

    frgfile = libname + ".frg"

    # Always work from a cleaned copy of the input.
    cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
    if need_update(fastafile, cleanfasta):
        clean([fastafile, "--canonical", "-o", cleanfasta])
    fastafile = cleanfasta

    qualfile = make_qual(fastafile, score=21)
    if mated:
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

    cmd = "convert-fasta-to-v2.pl"
    cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
    if mated:
        # Insert-size statistics and mate file only apply to mated libraries.
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

    sh(cmd, outfile=frgfile)
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is kept verbatim.
    #
    # args: command-line tokens; exactly one positional (the FASTA file) is
    #       required, otherwise help is printed and sys.exit() is called.
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option(
        "--clean",
        default=False,
        action="store_true",
        help="Clean up irregular chars in seq",
    )
    p.add_option("--matefile", help="Matepairs file")
    p.add_option("--maxreadlen",
                 default=262143,
                 type="int",
                 help="Maximum read length allowed")
    p.add_option("--minreadlen",
                 default=1000,
                 type="int",
                 help="Minimum read length allowed")
    p.add_option(
        "--sequential",
        default=False,
        action="store_true",
        help="Overwrite read name (e.g. long Pacbio name)",
    )
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen

    if maxreadlen > 0:
        # One over-long record is enough to trigger splitting of the file.
        split = False
        f = Fasta(fastafile, lazy=True)
        for seqid, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug(
                    "Sequence {0} (size={1}) longer than max read len {2}".
                    format(seqid, size, maxreadlen))
                split = True
                break

        if split:
            # NOTE(review): only --maxreadlen=0 is forwarded, so the pieces
            # are processed with default values for every other option --
            # confirm this is intended.
            for piece in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([piece, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        # True division would render the label as e.g. "Sanger3.0Kb-" on
        # Python 3; floor division keeps it integral.
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = plate

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        # Mated libraries are delegated to the external converter script.
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    # Unmated reads: emit the frg records directly.
    fw = must_open(frgfile, "w")
    sequential = opts.sequential
    i = j = 0  # i: fragments written, j: fragments discarded as too short
    try:
        print(headerTemplate.format(libID=libname), file=fw)
        for fragID, seq in parse_fasta(fastafile):
            if len(seq) < minreadlen:
                j += 1
                continue
            i += 1
            if sequential:
                fragID = libname + str(100000000 + i)
            emitFragment(fw, fragID, libname, seq)
    finally:
        # Close the output even when a record fails to convert.
        fw.close()

    logging.debug(
        "A total of {0} fragments written to `{1}` ({2} discarded).".format(
            i, frgfile, j))