Example #1
0
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    # NOTE: the docstring above is handed to the option parser as usage text.
    parser = OptionParser(blat.__doc__)
    parser.add_option(
        "--minscore", default=100, type="int",
        help="Matches minus mismatches gap penalty [default: %default]",
    )
    parser.add_option(
        "--minid", default=98, type="int",
        help="Minimum sequence identity [default: %default]",
    )
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    oldfasta, newfasta = args

    # Both inputs go through faToTwoBit, but only the old assembly's result
    # is placed on the command line; the new assembly is passed as FASTA.
    oldtwobit, _ = [faToTwoBit(fastafile) for fastafile in args]

    # Use the threaded pblat when it is on the PATH, plain blat otherwise.
    if which("pblat"):
        aligner = "pblat -threads={0}".format(opts.cpus)
    else:
        aligner = "blat"

    # Output is named <new>.<old>.psl, using each basename up to its first dot.
    new_stem, old_stem = [
        op.basename(path).split('.')[0] for path in (newfasta, oldfasta)
    ]
    pslfile = "{0}.{1}.psl".format(new_stem, old_stem)

    cmd = "{0} {1} {2} -tileSize=12 -minScore={3} -minIdentity={4} {5}".format(
        aligner, oldtwobit, newfasta, opts.minscore, opts.minid, pslfile
    )
    sh(cmd)
Example #2
0
def check_cfg_fqtrim(c, njob = 1, noutdir = 3):
    """Check a read-trimming configuration and normalise it in place.

    `c` is read and written both as attributes (`c.outdir`) and as items
    (`c['fastqc']`). Comma-separated settings are split into lists, missing
    directories are created, tool names are replaced by the paths returned
    by `which`, and `c.paired` is converted with `str2bool`.

    Raises AssertionError when the number of output directories differs from
    `noutdir`, an input file is missing, a tool cannot be located, or the
    PBS settings do not each contain exactly `njob` entries.

    Returns the same object `c`.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)

    # Create any working/temp/output directory that is not there yet.
    for folder in [c.dirw, c.temp_dir] + c.outdirs:
        if op.isdir(folder):
            continue
        mkdir(folder)

    # All input files must already exist.
    for required in (c.ilist, c.adapter, c.trimmomatic):
        assert op.isfile(required), "cannot read %s" % required

    # Swap each configured tool name for the path reported by `which`.
    for tool in ('fastqc', 'parallel'):
        located = which(c[tool])
        assert located is not None, "not executable: %s" % c[tool]
        c[tool] = located

    c.paired = str2bool(c.paired)

    # Per-job PBS settings are comma-separated, one entry per job.
    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    per_job = (c.pbs_queues, c.pbs_walltimes, c.pbs_ppns)
    assert all(len(values) == njob for values in per_job), "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c
Example #3
0
def check_cfg_mapping(c):
    """Check a read-mapping configuration and normalise it in place.

    `c` is read and written both as attributes (`c.outdir`) and as items
    (`c['samtools']`). Comma-separated settings are split into lists,
    missing directories are created, tool names are replaced by the paths
    returned by `which`, and `c.paired` is converted with `str2bool`.

    Supported values of `c.mapper` are 'bwa', 'hisat2' and 'bowtie2'; the
    attribute of the same name holds the mapper command to locate.

    Raises AssertionError when there are not exactly 2 output directories,
    an input file is missing, a tool cannot be located, or the PBS settings
    do not each contain exactly 3 entries. Logs an error and exits with
    status 1 (SystemExit) when the mapper is unsupported.

    Returns the same object `c`.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == 2, "not 2 outdirs: %s" % c.outdir

    # Create any working/temp/output directory that is not there yet.
    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if not op.isdir(subdir):
            mkdir(subdir)

    for fn in [c.ilist, c.genome, c.gff]:
        assert op.isfile(fn), "cannot read %s" % fn

    for key in ('samtools', 'parallel', 'sambamba', 'bcftools', 'bedtools'):
        fp = which(c[key])
        assert fp is not None, "not executable: %s" % c[key]
        c[key] = fp

    c.paired = str2bool(c.paired)

    if c.mapper not in ('bwa', 'hisat2', 'bowtie2'):
        logging.error("unsupported mapper: %s" % c.mapper)
        sys.exit(1)

    # Resolve into a local before storing, so that a failed lookup reports
    # the configured command instead of "None".
    mapper_cmd = getattr(c, c.mapper)
    mapper_path = which(mapper_cmd)
    assert mapper_path is not None, "not executable: %s" % mapper_cmd
    setattr(c, c.mapper, mapper_path)

    # Per-job PBS settings are comma-separated, one entry per job.
    njob = 3
    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    assert njob == len(c.pbs_queues) == len(c.pbs_walltimes) == len(
        c.pbs_ppns), "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c
Example #4
0
def run_blat(infile=None, outfile=None, db="UniVec_Core",
             pctid=95, hitlen=50, cpus=16, overwrite=True):
    """Align `infile` against `db` with (p)blat and filter the hits.

    The aligner writes blast8-format output to `outfile`. The hits are then
    passed through `run_blast_filter` with `pctid` and `hitlen`, producing
    `<outfile>.P<pctid>L<hitlen>`. When `overwrite` is true, the filtered
    file replaces `outfile`.
    """
    # Use the threaded pblat when it is on the PATH, plain blat otherwise.
    if which("pblat"):
        aligner = "pblat -threads={0}".format(cpus)
    else:
        aligner = "blat"
    sh("{0} {1} {2} -out=blast8 {3}".format(aligner, db, infile, outfile))

    filtered = outfile + ".P{0}L{1}".format(pctid, hitlen)
    run_blast_filter(infile=outfile, outfile=filtered,
                     pctid=pctid, hitlen=hitlen)

    if not overwrite:
        return
    shutil.move(filtered, outfile)
Example #5
0
def check_cfg_mapping(c):
    """Check a read-mapping configuration and normalise it in place.

    `c` is read and written both as attributes (`c.outdir`) and as items
    (`c['samtools']`). Comma-separated settings are split into lists,
    missing directories are created, tool names are replaced by the paths
    returned by `which`, and `c.paired` is converted with `str2bool`.

    Supported values of `c.mapper` are 'bwa', 'hisat2' and 'bowtie2'; the
    attribute of the same name holds the mapper command to locate.

    Raises AssertionError when there are not exactly 2 output directories,
    an input file is missing, a tool cannot be located, or the PBS settings
    do not each contain exactly 3 entries. Logs an error and exits with
    status 1 (SystemExit) when the mapper is unsupported.

    Returns the same object `c`.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == 2, "not 2 outdirs: %s" % c.outdir

    # Create any working/temp/output directory that is not there yet.
    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if not op.isdir(subdir):
            mkdir(subdir)

    for fn in [c.ilist, c.genome, c.gff]:
        assert op.isfile(fn), "cannot read %s" % fn

    for key in ('samtools', 'parallel', 'sambamba', 'bcftools', 'bedtools'):
        fp = which(c[key])
        assert fp is not None, "not executable: %s" % c[key]
        c[key] = fp

    c.paired = str2bool(c.paired)

    if c.mapper not in ('bwa', 'hisat2', 'bowtie2'):
        logging.error("unsupported mapper: %s" % c.mapper)
        sys.exit(1)

    # Resolve into a local before storing, so that a failed lookup reports
    # the configured command instead of "None".
    mapper_cmd = getattr(c, c.mapper)
    mapper_path = which(mapper_cmd)
    assert mapper_path is not None, "not executable: %s" % mapper_cmd
    setattr(c, c.mapper, mapper_path)

    # Per-job PBS settings are comma-separated, one entry per job.
    njob = 3
    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    assert njob == len(c.pbs_queues) == len(c.pbs_walltimes) == len(c.pbs_ppns), "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c
Example #6
0
    def __init__(self, filename, select=None):
        """Load contig sizes from a `.sizes` file or from a FASTA file.

        When `filename` does not end in `.sizes`, a sibling
        `<filename>.sizes` file is (re)generated if `need_update` says so:
        with the external `faSize -detailed` tool when it is on the PATH,
        otherwise by iterating the FASTA records in Python.

        `select`, when truthy, must be positive; only entries whose second
        field (the size) is at least `select` are kept.

        Sets `sizes_mapping`, `ctgs`, `sizes`, `cumsizes` and
        `cumsizes_mapping`. All of them are empty (with `cumsizes` holding a
        single 0) when no entry remains.

        Raises AssertionError if `filename` does not exist or `select` is
        not positive.
        """
        assert op.exists(filename), "File `{0}` not found".format(filename)

        # filename can be both .sizes file or FASTA formatted file
        sizesname = filename

        if not filename.endswith(".sizes"):
            sizesname = filename + ".sizes"
            filename = get_abs_path(filename)
            if need_update(filename, sizesname):
                cmd = "faSize"
                if which(cmd):
                    cmd += " -detailed {0}".format(filename)
                    sh(cmd, outfile=sizesname)
                else:
                    from jcvi.formats.fasta import Fasta

                    f = Fasta(filename)
                    # `with` closes the output even if writing fails midway.
                    with open(sizesname, "w") as fw:
                        for k, size in f.itersizes_ordered():
                            fw.write("\t".join((k, str(size))) + "\n")

            filename = sizesname

        assert filename.endswith(".sizes")

        super(Sizes, self).__init__(filename)
        # Stored on the instance and not closed by this method.
        self.fp = open(filename)
        self.filename = filename

        # get sizes for individual contigs, both in list and dict
        # this is to preserve the input order in the sizes file
        sizes = list(self.iter_sizes())
        if select:
            assert select > 0
            sizes = [x for x in sizes if x[1] >= select]
        self.sizes_mapping = dict(sizes)

        # get cumulative sizes, both in list and dict
        # zip(*[]) yields nothing to unpack, so handle "no contigs" explicitly
        ctgs, sizes = zip(*sizes) if sizes else ((), ())
        self.sizes = sizes
        cumsizes = np.cumsum([0] + list(sizes))
        self.ctgs = ctgs
        self.cumsizes = cumsizes
        self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
Example #7
0
def fromsra(args):
    """
    %prog fromsra srafile

    Convert sra file to fastq using the sratoolkit `fastq-dump`
    """
    # NOTE: the docstring above is handed to the option parser as usage text.
    p = OptionParser(fromsra.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Specify if library layout is paired-end " +
                      "[default: %default]")
    p.add_option("--compress",
                 default=None,
                 choices=["gzip", "bzip2"],
                 help="Compress output fastq files [default: %default]")
    p.set_outdir()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    # After parsing, `args` holds only positionals; option values are on `opts`.
    srafile, = args
    paired = opts.paired
    compress = opts.compress
    outdir = opts.outdir

    script_path = which("fastq-dump")
    if not script_path:
        logging.error("Cannot find `fastq-dump` in the PATH")
        # Non-zero status so callers and shells can detect the failure.
        sys.exit(1)

    # Assemble the fastq-dump command line from the selected options.
    cmd = [script_path]
    if compress:
        cmd.append("--{0}".format(compress))
    if paired:
        cmd.append("--split-files")
    if outdir:
        cmd.append("--outdir {0}".format(outdir))
    cmd.append(srafile)

    outcmd = " ".join(cmd)
    sh(outcmd, grid=opts.grid)
Example #8
0
def check_cfg_index(c, noutdir = 1, njob = 3):
    """Check an indexing configuration and normalise it in place.

    `c` is read and written both as attributes (`c.outdir`) and as items
    (`c['bcftools']`). Comma-separated settings are split into lists,
    missing directories are created, and the `bcftools` entry is replaced
    by the path returned by `which`.

    Raises AssertionError when the number of output directories differs from
    `noutdir`, an input file is missing, the tool cannot be located, or the
    PBS settings do not each contain exactly `njob` entries.

    Returns the same object `c`.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)

    # Create any working/temp/output directory that is not there yet.
    for folder in [c.dirw, c.temp_dir] + c.outdirs:
        if op.isdir(folder):
            continue
        mkdir(folder)

    # All input files must already exist.
    for required in (c.ilist, c.vcf, c.genome, c.gff):
        assert op.isfile(required), "cannot read %s" % required

    # Swap the configured tool name for the path reported by `which`.
    for tool in ('bcftools',):
        located = which(c[tool])
        assert located is not None, "not executable: %s" % c[tool]
        c[tool] = located

    # Per-job PBS settings are comma-separated, one entry per job.
    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    c.pbs_mems = c.pbs_mem.split(",")
    per_job = (c.pbs_queues, c.pbs_walltimes, c.pbs_ppns, c.pbs_mems)
    assert all(len(values) == njob for values in per_job), "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c
Example #9
0
def check_cfg_mapping(c, noutdir = 4, njob = 2):
    """Check an RNA-seq mapping configuration and normalise it in place.

    `c` is read and written both as attributes (`c.outdir`) and as items
    (`c['samtools']`). Comma-separated settings are split into lists,
    missing directories are created, tool names are replaced by the paths
    returned by `which`, and `c.paired` is converted with `str2bool`.

    Supported values of `c.mapper` are 'tophat2', 'hisat2' and 'star'; the
    attribute of the same name holds the mapper command to locate.

    The sample table read from `c.ilist` is stored as `c.t`. For every
    genome named in its 'genome' column, `c.genomes[name]` receives the
    mapper index prefix ('db') and annotation path ('gff') found under
    `c.genomedir/<name>`.

    Raises AssertionError when a count, file, directory, tool or index check
    fails. Logs an error and exits with status 1 (SystemExit) when the
    mapper is unsupported.

    Returns the same object `c`.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)

    # Create any working/temp/output directory that is not there yet.
    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if not op.isdir(subdir):
            mkdir(subdir)

    for fn in [c.ilist, c.vcf, c.gene_bed]:
        assert op.isfile(fn), "cannot read %s" % fn

    for key in ('samtools', 'parallel', 'sambamba', 'htseq', 'bcftools', 'bedtools'):
        fp = which(c[key])
        assert fp is not None, "not executable: %s" % c[key]
        c[key] = fp

    c.paired = str2bool(c.paired)
    assert c.stranded in ['yes', 'no', 'reverse'], "unknown stranded option: %s" % c.stranded

    # Per mapper: (index prefix under the genome directory, file that must exist).
    index_layout = {
        'tophat2': ("%s/21.bowtie2/db", "%s.4.bt2"),
        'hisat2': ("%s/21.hisat2/db", "%s.8.ht2"),
        'star': ("%s/21.star", "%s/SA"),
    }
    if c.mapper not in index_layout:
        logging.error("unsupported mapper: %s" % c.mapper)
        sys.exit(1)

    # Resolve into a local before storing, so that a failed lookup reports
    # the configured command instead of "None".
    mapper_cmd = getattr(c, c.mapper)
    mapper_path = which(mapper_cmd)
    assert mapper_path is not None, "not executable: %s" % mapper_cmd
    setattr(c, c.mapper, mapper_path)

    assert op.isdir(c.genomedir), "cannot access %s" % c.genomedir
    t = Table.read(c.ilist, format = 'ascii.tab')
    # NOTE(review): `t[0]` is the first row, so this may be testing row
    # values rather than column names, and it fails on an empty table.
    # Confirm whether a column-name check was intended.
    if 'genome' not in t[0]:
        genomeb = 'B73c'
        logging.debug("no 'genome' column detected: use %s" % genomeb)
        t.add_column(Column([genomeb] * len(t)), name = 'genome')
    c.t = t

    # A sample may list several comma-separated genomes; collect the unique set.
    genomes = set()
    for entry in t['genome']:
        genomes.update(entry.split(","))
    genomes = sorted(genomes)
    logging.debug("checking %d genomes" % len(genomes))

    prefix_fmt, probe_fmt = index_layout[c.mapper]
    c.genomes = dict()
    for gt in genomes:
        c.genomes[gt] = dict()
        dirg = "%s/%s" % (c.genomedir, gt)
        dbpre = prefix_fmt % dirg
        assert op.isfile(probe_fmt % dbpre), "no %s db-index: %s" % (c.mapper, dbpre)
        c.genomes[gt]['db'] = dbpre
        gff = "%s/51.gff" % dirg
        # Genome name first, then the path that was looked for.
        assert op.isfile(gff), "no gff for %s: %s" % (gt, gff)
        c.genomes[gt]['gff'] = gff

    # Per-job PBS settings are comma-separated, one entry per job.
    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    assert njob == len(c.pbs_queues) == len(c.pbs_walltimes) == len(c.pbs_ppns), "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c