コード例 #1
0
ファイル: sam.py プロジェクト: arvin580/jcvi
def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]
    # Create a DUMMY gff file for cuffdiff
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        fw = open(gffile, "w")
        f = Fasta(fastafile, lazy=True)
        for key, size in f.itersizes_ordered():
            print >> fw, "\t".join(str(x) for x in (key, "dummy", "transcript",\
                1, size, ".", ".", ".", "ID=" + key))
        fw.close()
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
コード例 #2
0
ファイル: sizes.py プロジェクト: xuanblo/jcvi
    def __init__(self, filename, select=None):
        assert op.exists(filename), "File `{0}` not found".format(filename)

        # filename can be both .sizes file or FASTA formatted file
        sizesname = filename

        if not filename.endswith(".sizes"):
            sizesname = filename + ".sizes"
            filename = get_abs_path(filename)
            if need_update(filename, sizesname):
                cmd = "faSize"
                if which(cmd):
                    cmd += " -detailed {0}".format(filename)
                    sh(cmd, outfile=sizesname)
                else:
                    from jcvi.formats.fasta import Fasta

                    f = Fasta(filename)
                    fw = open(sizesname, "w")
                    for k, size in f.itersizes_ordered():
                        print >> fw, "\t".join((k, str(size)))
                    fw.close()

            filename = sizesname

        assert filename.endswith(".sizes")

        super(Sizes, self).__init__(filename)
        self.fp = open(filename)
        self.filename = filename

        # get sizes for individual contigs, both in list and dict
        # this is to preserve the input order in the sizes file
        sizes = list(self.iter_sizes())
        if select:
            assert select > 0
            sizes = [x for x in sizes if x[1] >= select]
        self.sizes_mapping = dict(sizes)

        # get cumulative sizes, both in list and dict
        ctgs, sizes = zip(*sizes)
        self.sizes = sizes
        cumsizes = np.cumsum([0] + list(sizes))
        self.ctgs = ctgs
        self.cumsizes = cumsizes
        self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
コード例 #3
0
ファイル: sam.py プロジェクト: wroldwiedbwe/jcvi
def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]
    # Create a DUMMY gff file for cuffdiff
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        fw = open(gffile, "w")
        f = Fasta(fastafile, lazy=True)
        for key, size in f.itersizes_ordered():
            print(
                "\t".join(
                    str(x) for x in (
                        key,
                        "dummy",
                        "transcript",
                        1,
                        size,
                        ".",
                        ".",
                        ".",
                        "ID=" + key,
                    )),
                file=fw,
            )
        fw.close()
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
コード例 #4
0
ファイル: ca.py プロジェクト: arvin580/jcvi
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("--clean", default=False, action="store_true", help="Clean up irregular chars in seq")
    p.add_option("--matefile", help="Matepairs file")
    p.add_option("--maxreadlen", default=262143, type="int", help="Maximum read length allowed")
    p.add_option("--minreadlen", default=1000, type="int", help="Minimum read length allowed")
    p.add_option("--sequential", default=False, action="store_true", help="Overwrite read name (e.g. long Pacbio name)")
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen
    if maxreadlen > 0:
        split = False
        f = Fasta(fastafile, lazy=True)
        for id, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug("Sequence {0} (size={1}) longer than max read len {2}".format(id, size, maxreadlen))
                split = True
                break

        if split:
            for f in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([f, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        libname = "Sanger{0}Kb-".format(opts.size / 1000) + plate
    else:
        libname = plate

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        if mated:
            cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    fw = must_open(frgfile, "w")
    print >> fw, headerTemplate.format(libID=libname)

    sequential = opts.sequential
    i = j = 0
    for fragID, seq in parse_fasta(fastafile):
        if len(seq) < minreadlen:
            j += 1
            continue
        i += 1
        if sequential:
            fragID = libname + str(100000000 + i)
        emitFragment(fw, fragID, libname, seq)
    fw.close()

    logging.debug("A total of {0} fragments written to `{1}` ({2} discarded).".format(i, frgfile, j))
コード例 #5
0
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("-m", dest="matefile", default=None, help="matepairs file")
    p.add_option("--maxreadlen",
                 default=32000,
                 type="int",
                 help="Maximum read length allowed [default: %default]")
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    maxreadlen = opts.maxreadlen
    f = Fasta(fastafile, lazy=True)
    if maxreadlen > 0:
        split = False
        for id, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug("Sequence {0} (size={1}) longer than max read len {2}".\
                                format(id, size, maxreadlen))
                split = True
                break

        if split:
            for f in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([f, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if mated:
        libname = "Sanger{0}Kb-".format(opts.size / 1000) + plate
    else:
        libname = "SangerFrags-" + plate

    frgfile = libname + ".frg"

    cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
    if need_update(fastafile, cleanfasta):
        clean([fastafile, "--canonical", "-o", cleanfasta])
    fastafile = cleanfasta

    qualfile = make_qual(fastafile, score=21)
    if mated:
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

    cmd = "convert-fasta-to-v2.pl"
    cmd += " -l {0} -s {1} -q {2} ".\
            format(libname, fastafile, qualfile)
    if mated:
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

    sh(cmd, outfile=frgfile)
コード例 #6
0
ファイル: ca.py プロジェクト: zjwang6/jcvi
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option(
        "--clean",
        default=False,
        action="store_true",
        help="Clean up irregular chars in seq",
    )
    p.add_option("--matefile", help="Matepairs file")
    p.add_option("--maxreadlen",
                 default=262143,
                 type="int",
                 help="Maximum read length allowed")
    p.add_option("--minreadlen",
                 default=1000,
                 type="int",
                 help="Minimum read length allowed")
    p.add_option(
        "--sequential",
        default=False,
        action="store_true",
        help="Overwrite read name (e.g. long Pacbio name)",
    )
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen
    if maxreadlen > 0:
        split = False
        f = Fasta(fastafile, lazy=True)
        for id, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug(
                    "Sequence {0} (size={1}) longer than max read len {2}".
                    format(id, size, maxreadlen))
                split = True
                break

        if split:
            for f in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([f, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        libname = "Sanger{0}Kb-".format(opts.size / 1000) + plate
    else:
        libname = plate

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        if mated:
            cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    fw = must_open(frgfile, "w")
    print(headerTemplate.format(libID=libname), file=fw)

    sequential = opts.sequential
    i = j = 0
    for fragID, seq in parse_fasta(fastafile):
        if len(seq) < minreadlen:
            j += 1
            continue
        i += 1
        if sequential:
            fragID = libname + str(100000000 + i)
        emitFragment(fw, fragID, libname, seq)
    fw.close()

    logging.debug(
        "A total of {0} fragments written to `{1}` ({2} discarded).".format(
            i, frgfile, j))
コード例 #7
0
ファイル: ca.py プロジェクト: rrane/jcvi
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("-m", dest="matefile", default=None,
            help="matepairs file")
    p.add_option("--maxreadlen", default=32000, type="int",
            help="Maximum read length allowed [default: %default]")
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    maxreadlen = opts.maxreadlen
    f = Fasta(fastafile, lazy=True)
    if maxreadlen > 0:
        split = False
        for id, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug("Sequence {0} (size={1}) longer than max read len {2}".\
                                format(id, size, maxreadlen))
                split  = True
                break

        if split:
            for f in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([f, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if mated:
        libname = "Sanger{0}Kb-".format(opts.size / 1000) + plate
    else:
        libname = "SangerFrags-" + plate

    frgfile = libname + ".frg"

    cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
    if need_update(fastafile, cleanfasta):
        clean([fastafile, "--canonical", "-o", cleanfasta])
    fastafile = cleanfasta

    qualfile = make_qual(fastafile, score=21)
    if mated:
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

    cmd = "convert-fasta-to-v2.pl"
    cmd += " -l {0} -s {1} -q {2} ".\
            format(libname, fastafile, qualfile)
    if mated:
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

    sh(cmd, outfile=frgfile)