Beispiel #1
0
def clean_fasta(args):
    """Link the renamed genome FASTA into place and derive interval files.

    Works inside the directory returned by ``make_genomedir(args.species)``:
    removes stale index files and any existing ``10_genome.fna`` symlink,
    links ``08_seq_map/renamed.fna`` (and ``renamed.sizes`` when present) as
    ``10_genome.*``, then fills ``15_intervals`` with the chromosome BED,
    chromosome sizes and gap BED files.  Existing outputs are kept unless
    ``args.overwrite`` is set.  Exits with status 1 when the renamed FASTA
    is missing.
    """
    os.chdir(make_genomedir(args.species))

    # drop leftover index files and a previous symlink
    for stale in ("raw.fix.fas.index", "11_genome.fas.index"):
        if op.isfile(stale):
            os.remove(stale)
    if op.islink("10_genome.fna"):
        os.unlink("10_genome.fna")

    keep_genome = op.isfile("10_genome.fna") and not args.overwrite
    if keep_genome:
        logging.debug("10_genome.fna already exits: skipped")
    elif not op.isfile("08_seq_map/renamed.fna"):
        logging.error("08_seq_map/renamed.fna not there")
        sys.exit(1)
    else:
        sh("ln -sf 08_seq_map/renamed.fna 10_genome.fna")
        if op.isfile("08_seq_map/renamed.sizes"):
            sh("ln -sf 08_seq_map/renamed.sizes 10_genome.sizes")

    if not op.isdir("15_intervals"):
        mkdir("15_intervals")

    # (output file name, command whose stdout is redirected into it)
    targets = (
        ("01.chrom.bed", "fasta size --bed 10_genome.fna"),
        ("01.chrom.sizes", "faSize -detailed 10_genome.fna"),
        ("11.gap.bed", "fasta gaps 10_genome.fna"),
    )
    for name, cmd in targets:
        if op.isfile("15_intervals/" + name) and not args.overwrite:
            logging.debug("%s already exits - skipped" % name)
        else:
            sh("%s > 15_intervals/%s" % (cmd, name))
Beispiel #2
0
def clean_fasta(args):
    """Link the renamed genome FASTA into place and derive interval files.

    Works inside the directory returned by ``make_genomedir(args.species)``:
    removes stale index files and any existing ``10_genome.fna`` symlink,
    links ``08_seq_map/renamed.fna`` (and ``renamed.sizes`` when present) as
    ``10_genome.*``, then fills ``15_intervals`` with the chromosome BED,
    chromosome sizes and gap BED files.  Existing outputs are kept unless
    ``args.overwrite`` is set.  Exits with status 1 when the renamed FASTA
    is missing.
    """
    os.chdir(make_genomedir(args.species))

    # drop leftover index files and a previous symlink
    for stale in ("raw.fix.fas.index", "11_genome.fas.index"):
        if op.isfile(stale):
            os.remove(stale)
    if op.islink("10_genome.fna"):
        os.unlink("10_genome.fna")

    keep_genome = op.isfile("10_genome.fna") and not args.overwrite
    if keep_genome:
        logging.debug("10_genome.fna already exits: skipped")
    elif not op.isfile("08_seq_map/renamed.fna"):
        logging.error("08_seq_map/renamed.fna not there")
        sys.exit(1)
    else:
        sh("ln -sf 08_seq_map/renamed.fna 10_genome.fna")
        if op.isfile("08_seq_map/renamed.sizes"):
            sh("ln -sf 08_seq_map/renamed.sizes 10_genome.sizes")

    if not op.isdir("15_intervals"):
        mkdir("15_intervals")

    # (output file name, command whose stdout is redirected into it)
    targets = (
        ("01.chrom.bed", "fasta size --bed 10_genome.fna"),
        ("01.chrom.sizes", "faSize -detailed 10_genome.fna"),
        ("11.gap.bed", "fasta gaps 10_genome.fna"),
    )
    for name, cmd in targets:
        if op.isfile("15_intervals/" + name) and not args.overwrite:
            logging.debug("%s already exits - skipped" % name)
        else:
            sh("%s > 15_intervals/%s" % (cmd, name))
Beispiel #3
0
def check_cfg_fqtrim(c, njob = 1, noutdir = 3):
    """Validate and normalise the fastq-trimming configuration ``c`` in place.

    Splits the comma-separated ``outdir`` and ``pbs_*`` settings into lists,
    creates missing work/temp/output directories, asserts that the input
    files exist and that the configured ``fastqc`` and ``parallel`` programs
    resolve via ``which`` (storing the resolved paths), converts ``paired``
    with ``str2bool`` and records ``njob``.  Returns ``c``.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)

    needed_dirs = [c.dirw, c.temp_dir] + c.outdirs
    for path in needed_dirs:
        if op.isdir(path):
            continue
        mkdir(path)

    for path in (c.ilist, c.adapter, c.trimmomatic):
        assert op.isfile(path), "cannot read %s" % path

    for tool in ('fastqc', 'parallel'):
        resolved = which(c[tool])
        assert resolved is not None, "not executable: %s" % c[tool]
        c[tool] = resolved

    c.paired = str2bool(c.paired)

    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    # every per-job list must hold exactly njob entries
    counts = (len(c.pbs_queues), len(c.pbs_walltimes), len(c.pbs_ppns))
    assert all(n == njob for n in counts), "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c
Beispiel #4
0
def sra(args):
    """
    %prog sra [term|term.ids]

    Given an SRA run ID, fetch the corresponding .sra file from the sra-instant FTP.
    The term can also be a file containing list of SRR ids, one per line.

    Once downloaded, the SRA file is processed through `fastq-dump` to produce
    FASTQ formatted sequence files, which are gzipped by default.
    """
    # The docstring above doubles as the parser's usage text.
    p = OptionParser(sra.__doc__)

    # Register the flag on the parser built above; the previous receiver
    # (`sp1`) is not defined anywhere in this function.
    # NOTE(review): assumes this OptionParser exposes `add_argument` - confirm.
    p.add_argument("--nogzip", dest="nogzip",
            default=False, action="store_true",
            help="Do not gzip the FASTQ generated by fastq-dump")
    # parse_args returns (options, positionals); flags live on `opts`.
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    term, = args
    if op.isfile(term):
        # one run ID per line; close the file promptly and ignore blank lines
        with open(term) as fh:
            terms = [line.strip() for line in fh if line.strip()]
    else:
        terms = [term]

    for term in terms:
        srafile = download_srr_term(term)
        # output directory is named after the part of the file name before the first dot
        pf = srafile.split(".")[0]
        mkdir(pf)
        _opts = [srafile, "--paired", "--outdir={0}".format(pf)]
        if not opts.nogzip:
            _opts.append("--compress=gzip")
        fromsra(_opts)
Beispiel #5
0
def index(cfg, args):
    """Write the PBS job scripts that build per-genotype pseudo-references.

    Reads the ``index`` section of *cfg* and validates it with
    ``check_cfg_index``.  When ``args.check`` is set, 0 is returned right
    after validation.  Otherwise three jobs are configured; for every
    distinct genotype whose ``type`` column equals ``Inbred`` in the sample
    list, a ``consensus`` command (run through ``c.bcftools``) is queued on
    the first job and ``genome bowtie`` / ``genome hisat`` commands on the
    second and third.  Finally the job files and a chaining script named
    ``<job_prefix>.sh`` are written.
    """
    c = AttrDict(cfg['index'])
    c = check_cfg_index(c)
    if args.check:
        # validation-only run: stop once the configuration has been checked
        return 0
    os.chdir(c.dirw)

    # one command list per job; each job starts by entering the work dir
    jcmds = [[
        "cd %s" % c.dirw
        ],[
        "cd %s" % c.dirw
        ],[
        "cd %s" % c.dirw
    ]]
    # one list of sub-job settings per job (a single sub-job each)
    bcfgs = [
        [dict(opt = 'bash')],
        [dict(opt = 'parallel', thread = c.pbs_ppns[1])],
        [dict(opt = 'bash')]
    ]
    
    assert c.njob == len(bcfgs) == len(jcmds), "not %d jobs" % c.njob
    jobs = []
    for i in range(c.njob):
        # job files are numbered from 1
        prefix = "%s.%d" % (c.job_prefix, i+1)
        jcfg = {
            'queue': c.pbs_queues[i],
            'ppn': c.pbs_ppns[i], 
            'walltime': c.pbs_walltimes[i],
            'mem': c.pbs_mems[i], 
            'email': c.pbs_email,
        }
        job = PbsJob.from_cfg(jcfg = jcfg, jcmds = jcmds[i], bcfgs = bcfgs[i],
                prefix = prefix, njob = len(bcfgs[i]), 
                bash = c.bash, parallel = c.parallel)
        jobs.append(job)
 
    # collect the distinct genotypes of rows marked as 'Inbred'
    t = Table.read(c.ilist, format = 'ascii.tab')
    nrow = len(t)
    gts = [t['genotype'][x] for x in range(nrow) if t['type'][x] == 'Inbred']
    gts = set(gts)
    logging.debug("creating pseudo-refs for %d genomes" % len(gts))
    print(" ".join(gts))
    for gt in gts:
        # each genotype gets its own sub-directory of the first output dir
        diro = "%s/%s" % (c.outdirs[0], gt)
        mkdir(diro)
        jobs[0].subjobs[0].add_cmd("%s consensus -f %s %s -s %s \
                -c %s/25.chain -o %s/11_genome.fas" % \
                (c.bcftools, c.genome, c.vcf, gt, diro, diro))
        #jobs[1].subjobs[0].add_cmd("genome fasta %s" % diro)
        #jobs[0].subjobs[0].add_cmd("genome blat %s" % diro)
        #jobs[0].subjobs[0].add_cmd("genome bwa %s" % diro)
        jobs[1].subjobs[0].add_cmd("genome bowtie %s" % diro)
        jobs[2].subjobs[0].add_cmd("genome hisat %s" % diro)
   
    for job in jobs:
        job.write()
    fj = "%s.sh" % c.job_prefix
    create_job_chain([job.fname for job in jobs], fj)
    logging.debug("job chain with %s jobs was created: %s" % (c.njob, fj))
Beispiel #6
0
def make_genomedir(species):
    """Return the working directory for *species*, creating it if absent.

    A purely alphanumeric *species* is treated as a name and mapped to a
    sub-directory of the fixed genome data root; any other value is used as
    a directory path unchanged.
    """
    if species.isalnum():
        target = op.join("/home/springer/zhoux379/data/genome", species)
        logging.debug("converting species to directory: %s" % target)
    else:
        target = species

    if op.isdir(target):
        return target
    logging.debug("creating diretory: %s" % target)
    mkdir(target)
    return target
Beispiel #7
0
def make_genomedir(species):
    """Return the working directory for *species*, creating it if absent.

    A purely alphanumeric *species* is treated as a name and mapped to a
    sub-directory of the fixed genome data root; any other value is used as
    a directory path unchanged.
    """
    if species.isalnum():
        target = op.join("/home/springer/zhoux379/data/genome", species)
        logging.debug("converting species to directory: %s" % target)
    else:
        target = species

    if op.isdir(target):
        return target
    logging.debug("creating diretory: %s" % target)
    mkdir(target)
    return target
Beispiel #8
0
def build_bwa(args):
    """Run ``bwa index`` into ``21_dbs/bwa`` of the species' genome directory.

    The genome directory and FASTA path come from
    ``get_genomedir(args.species)``.  Nothing is rebuilt when ``db.bwt``
    already exists, unless ``args.overwrite`` is set.
    """
    genome_dir, genome_fas = get_genomedir(args.species)
    db_dir = op.join(genome_dir, "21_dbs/bwa")
    if not op.isdir(db_dir):
        mkdir(db_dir)
    os.chdir(db_dir)

    if not op.isfile("db.bwt") or args.overwrite:
        sh("bwa index -a bwtsw -p %s/db %s" % (db_dir, genome_fas))
    else:
        logging.debug("db.bwt already exists - skipped")
Beispiel #9
0
def build_bwa(args):
    """Run ``bwa index`` into ``21_dbs/bwa`` of the species' genome directory.

    The genome directory and FASTA path come from
    ``get_genomedir(args.species)``.  Nothing is rebuilt when ``db.bwt``
    already exists, unless ``args.overwrite`` is set.
    """
    genome_dir, genome_fas = get_genomedir(args.species)
    db_dir = op.join(genome_dir, "21_dbs/bwa")
    if not op.isdir(db_dir):
        mkdir(db_dir)
    os.chdir(db_dir)

    if not op.isfile("db.bwt") or args.overwrite:
        sh("bwa index -a bwtsw -p %s/db %s" % (db_dir, genome_fas))
    else:
        logging.debug("db.bwt already exists - skipped")
Beispiel #10
0
def build_blat(args):
    """Create the BLAT 2bit database and ooc file in ``21_dbs/blat``.

    The genome directory and FASTA path come from
    ``get_genomedir(args.species)``.  The build is skipped when ``db.2bit``
    exists and ``args.overwrite`` is false.  A ``tmp.out`` file found in the
    directory afterwards is removed in either case.
    """
    genome_dir, genome_fas = get_genomedir(args.species)
    db_dir = op.join(genome_dir, "21_dbs/blat")
    if not op.isdir(db_dir):
        mkdir(db_dir)
    os.chdir(db_dir)

    if args.overwrite or not op.isfile('db.2bit'):
        build_cmds = (
            "faToTwoBit %s db.2bit" % genome_fas,
            "blat db.2bit tmp.fas tmp.out -makeOoc=db.2bit.tile11.ooc",
        )
        for cmd in build_cmds:
            sh(cmd)
    else:
        logging.debug("db.2bit already exists - skipped")

    # discard the output file named on the blat command line, if present
    if op.isfile("tmp.out"):
        os.remove("tmp.out")
Beispiel #11
0
def build_blat(args):
    """Create the BLAT 2bit database and ooc file in ``21_dbs/blat``.

    The genome directory and FASTA path come from
    ``get_genomedir(args.species)``.  The build is skipped when ``db.2bit``
    exists and ``args.overwrite`` is false.  A ``tmp.out`` file found in the
    directory afterwards is removed in either case.
    """
    genome_dir, genome_fas = get_genomedir(args.species)
    db_dir = op.join(genome_dir, "21_dbs/blat")
    if not op.isdir(db_dir):
        mkdir(db_dir)
    os.chdir(db_dir)

    if args.overwrite or not op.isfile('db.2bit'):
        build_cmds = (
            "faToTwoBit %s db.2bit" % genome_fas,
            "blat db.2bit tmp.fas tmp.out -makeOoc=db.2bit.tile11.ooc",
        )
        for cmd in build_cmds:
            sh(cmd)
    else:
        logging.debug("db.2bit already exists - skipped")

    # discard the output file named on the blat command line, if present
    if op.isfile("tmp.out"):
        os.remove("tmp.out")
Beispiel #12
0
def fasta(args):
    """
    %prog fasta fastqfiles

    Convert fastq to fasta and qual file.
    """
    # The docstring above doubles as the parser's usage text.
    p = OptionParser(fasta.__doc__)
    # Register the flag on the parser built above; the previous receiver
    # (`sp1`) is not defined anywhere in this function.
    # NOTE(review): assumes this OptionParser exposes `add_argument` - confirm.
    p.add_argument("--seqtk",
                   default=False,
                   action="store_true",
                   help="Use seqtk to convert")
    p.set_outdir()
    p.set_outfile(outfile=None)
    # parse_args returns (options, positionals); option values live on `opts`.
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    outdir = opts.outdir
    if outdir and outdir != ".":
        mkdir(outdir)

    # output names are derived from the first input file only
    fastqfile = fastqfiles[0]
    pf = op.basename(fastqfile)
    gzinput = pf.endswith(".gz")
    if gzinput:
        pf = pf.rsplit(".", 1)[0]

    # rpartition copes with names that contain no dot (suffix check then fails)
    pf, _, sf = pf.rpartition(".")
    if sf not in ("fq", "fastq"):
        logging.debug("Assumed FASTA: suffix not `fq` or `fastq`")
        return fastqfile, None

    fastafile, qualfile = pf + ".fasta", pf + ".qual"
    outfile = opts.outfile or fastafile
    outfile = op.join(outdir, outfile)
    if opts.seqtk:
        if need_update(fastqfiles, outfile):
            for i, fastqfile in enumerate(fastqfiles):
                cmd = "seqtk seq -A {0} -L 30 -l 70".format(fastqfile)
                # First one creates file, following ones append to it
                sh(cmd, outfile=outfile, append=i)
        else:
            logging.debug("Outfile `{0}` already exists.".format(outfile))
        return outfile, None

    # NOTE(review): every input is converted into the same two output files,
    # so with several inputs only the last conversion survives - confirm intent.
    for fastqfile in fastqfiles:
        SeqIO.convert(fastqfile, "fastq", fastafile, "fasta")
        SeqIO.convert(fastqfile, "fastq", qualfile, "qual")

    return fastafile, qualfile
Beispiel #13
0
def build_bowtie(args):
    """Build a bowtie2 index in ``21_dbs/bowtie2`` of the genome directory.

    The genome directory and FASTA path come from
    ``get_genomedir(args.species)``.  Skipped when ``db.rev.1.bt2`` exists
    and ``args.overwrite`` is false; otherwise the directory is emptied, the
    genome FASTA is linked as ``db.fa`` and ``bowtie2-build`` is run.
    """
    genome_dir, genome_fas = get_genomedir(args.species)
    db_dir = op.join(genome_dir, "21_dbs/bowtie2")
    if not op.isdir(db_dir):
        mkdir(db_dir)
    os.chdir(db_dir)

    if op.isfile("db.rev.1.bt2") and not args.overwrite:
        logging.debug("db.*.bt2 already exists - skipped")
        return

    # need to "module load bowtie2" before the last command can run
    for cmd in (
        "rm -rf *",
        "ln -sf %s db.fa" % genome_fas,
        "bowtie2-build db.fa db",
    ):
        sh(cmd)
Beispiel #14
0
def build_bowtie(args):
    """Build a bowtie2 index in ``21_dbs/bowtie2`` of the genome directory.

    The genome directory and FASTA path come from
    ``get_genomedir(args.species)``.  Skipped when ``db.rev.1.bt2`` exists
    and ``args.overwrite`` is false; otherwise the directory is emptied, the
    genome FASTA is linked as ``db.fa`` and ``bowtie2-build`` is run.
    """
    genome_dir, genome_fas = get_genomedir(args.species)
    db_dir = op.join(genome_dir, "21_dbs/bowtie2")
    if not op.isdir(db_dir):
        mkdir(db_dir)
    os.chdir(db_dir)

    if op.isfile("db.rev.1.bt2") and not args.overwrite:
        logging.debug("db.*.bt2 already exists - skipped")
        return

    # need to "module load bowtie2" before the last command can run
    for cmd in (
        "rm -rf *",
        "ln -sf %s db.fa" % genome_fas,
        "bowtie2-build db.fa db",
    ):
        sh(cmd)
Beispiel #15
0
def build_gatk(args):
    """Prepare the GATK reference files in ``21_dbs/gatk``.

    Skipped when ``db.dict`` exists and ``args.overwrite`` is false.
    Otherwise any existing ``db.fasta`` / ``db.dict`` is removed,
    ``../../10_genome.fna`` is copied to ``db.fasta`` and the sequence
    dictionary and ``samtools faidx`` index are generated for it.
    """
    genome_dir, _genome_fas = get_genomedir(args.species)
    db_dir = op.join(genome_dir, "21_dbs/gatk")
    if not op.isdir(db_dir):
        mkdir(db_dir)
    os.chdir(db_dir)

    if op.isfile("db.dict") and not args.overwrite:
        logging.debug("db.dict already exists - skipped")
        return

    # clear previous outputs before regenerating them
    for old in ("db.fasta", "db.dict"):
        if op.exists(old):
            sh("rm %s" % old)
    for cmd in (
        "cp ../../10_genome.fna db.fasta",
        "gatk CreateSequenceDictionary -R db.fasta",
        "samtools faidx db.fasta",
    ):
        sh(cmd)
Beispiel #16
0
def build_gatk(args):
    """Prepare the GATK reference files in ``21_dbs/gatk``.

    Skipped when ``db.dict`` exists and ``args.overwrite`` is false.
    Otherwise any existing ``db.fasta`` / ``db.dict`` is removed,
    ``../../10_genome.fna`` is copied to ``db.fasta`` and the sequence
    dictionary and ``samtools faidx`` index are generated for it.
    """
    genome_dir, _genome_fas = get_genomedir(args.species)
    db_dir = op.join(genome_dir, "21_dbs/gatk")
    if not op.isdir(db_dir):
        mkdir(db_dir)
    os.chdir(db_dir)

    if op.isfile("db.dict") and not args.overwrite:
        logging.debug("db.dict already exists - skipped")
        return

    # clear previous outputs before regenerating them
    for old in ("db.fasta", "db.dict"):
        if op.exists(old):
            sh("rm %s" % old)
    for cmd in (
        "cp ../../10_genome.fna db.fasta",
        "gatk CreateSequenceDictionary -R db.fasta",
        "samtools faidx db.fasta",
    ):
        sh(cmd)
Beispiel #17
0
def build_hisat(args):
    """Build a HISAT2 index in ``21_dbs/hisat2`` of the genome directory.

    Exon and splice-site tables are extracted from ``50_annotation/10.gtf``
    of the genome directory and passed to ``hisat2-build`` together with the
    genome FASTA, using ``args.p`` threads.  Skipped when ``db.1.ht2``
    exists and ``args.overwrite`` is false.  Exits with status 1 when the
    GTF file is missing.
    """
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/hisat2")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)

    # relative to dirw, i.e. <genome dir>/50_annotation/10.gtf
    f_gtf = "../../50_annotation/10.gtf"
    if op.isfile("db.1.ht2") and not args.overwrite:
        logging.debug("db.1.ht2 already exists - skipped")
    elif not op.isfile(f_gtf):
        # log the actual path (previously the literal text "f_gtf") and
        # signal the failure through a non-zero exit status
        logging.error("no gtf file: %s" % f_gtf)
        sys.exit(1)
    else:
        sh("hisat2_extract_exons.py %s > db.exon" % f_gtf)
        sh("hisat2_extract_splice_sites.py %s > db.ss" % f_gtf)
        sh("hisat2-build -p %d --ss db.ss --exon db.exon %s db" % (args.p, fg))
Beispiel #18
0
def build_star(args):
    """Generate a STAR genome index in ``21_dbs/star`` of the genome directory.

    Uses the genome FASTA from ``get_genomedir(args.species)`` and the
    annotation ``50_annotation/10.gtf`` of the genome directory, with
    ``args.p`` threads.  Skipped when the ``SA`` file exists and
    ``args.overwrite`` is false.  Exits with status 1 when the GTF file is
    missing.
    """
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/star")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)

    # relative to dirw, i.e. <genome dir>/50_annotation/10.gtf
    f_gtf = "../../50_annotation/10.gtf"
    if op.isfile("SA") and not args.overwrite:
        logging.debug("SA already exists - skipped")
    elif not op.isfile(f_gtf):
        logging.error("no gtf file: %s" % f_gtf)
        # an error was logged, so report failure to the calling process
        sys.exit(1)
    else:
        # the index is written into the current directory (dirw)
        sh("STAR --runThreadN %d --runMode genomeGenerate --genomeDir %s "
           "--genomeFastaFiles %s --sjdbGTFfile %s" %
           (args.p, ".", fg, f_gtf))
Beispiel #19
0
def build_star(args):
    """Generate a STAR genome index in ``21_dbs/star`` of the genome directory.

    Uses the genome FASTA from ``get_genomedir(args.species)`` and the
    annotation ``50_annotation/10.gtf`` of the genome directory, with
    ``args.p`` threads.  Skipped when the ``SA`` file exists and
    ``args.overwrite`` is false.  Exits with status 1 when the GTF file is
    missing.
    """
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/star")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)

    # relative to dirw, i.e. <genome dir>/50_annotation/10.gtf
    f_gtf = "../../50_annotation/10.gtf"
    if op.isfile("SA") and not args.overwrite:
        logging.debug("SA already exists - skipped")
    elif not op.isfile(f_gtf):
        logging.error("no gtf file: %s" % f_gtf)
        # an error was logged, so report failure to the calling process
        sys.exit(1)
    else:
        # the index is written into the current directory (dirw)
        sh("STAR --runThreadN %d --runMode genomeGenerate --genomeDir %s "
           "--genomeFastaFiles %s --sjdbGTFfile %s" %
           (args.p, ".", fg, f_gtf))
Beispiel #20
0
def build_hisat(args):
    """Build a HISAT2 index in ``21_dbs/hisat2`` of the genome directory.

    Exon and splice-site tables are extracted from ``50_annotation/10.gtf``
    of the genome directory and passed to ``hisat2-build`` together with the
    genome FASTA, using ``args.p`` threads.  Skipped when ``db.1.ht2``
    exists and ``args.overwrite`` is false.  Exits with status 1 when the
    GTF file is missing.
    """
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/hisat2")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)

    # relative to dirw, i.e. <genome dir>/50_annotation/10.gtf
    f_gtf = "../../50_annotation/10.gtf"
    if op.isfile("db.1.ht2") and not args.overwrite:
        logging.debug("db.1.ht2 already exists - skipped")
    elif not op.isfile(f_gtf):
        # log the actual path (previously the literal text "f_gtf") and
        # signal the failure through a non-zero exit status
        logging.error("no gtf file: %s" % f_gtf)
        sys.exit(1)
    else:
        sh("hisat2_extract_exons.py %s > db.exon" % f_gtf)
        sh("hisat2_extract_splice_sites.py %s > db.ss" % f_gtf)
        sh("hisat2-build -p %d --ss db.ss --exon db.exon %s db" % (args.p, fg))
Beispiel #21
0
    def __init__(self, filename, outputdir=None, format="fasta", mode="cycle"):
        """Store the split settings, pick the reader class and make the output dir.

        When *format* is falsy it is obtained from
        ``self._guess_format(filename)``.  ``self.klass`` becomes ``"seqio"``
        for fasta/fastq, ``"clust"`` for clust and ``"txt"`` for anything
        else.  ``mkdir`` is called on *outputdir* as given.
        """
        self.filename, self.outputdir, self.mode = filename, outputdir, mode

        if not format:
            format = self._guess_format(filename)
        logging.debug("format is %s" % format)

        if format == "clust":
            klass = "clust"
        elif format == "fasta" or format == "fastq":
            klass = "seqio"
        else:
            klass = "txt"
        self.klass = klass

        self.format = format
        mkdir(outputdir)
Beispiel #22
0
def merge_dirs(args):
    """Copy the regular files of every ``args.diri`` directory into ``args.diro``.

    The destination is created with ``mkdir(..., overwrite=True)``.  A file
    missing from the destination is copied.  When a same-named file is
    already there and ``cmp`` reports the two as different, it is replaced
    if ``args.replace`` is set and otherwise reported and left alone.
    Non-file directory entries are ignored.
    """
    dest = args.diro
    mkdir(dest, overwrite=True)
    for src_dir in args.diri:
        for name in os.listdir(src_dir):
            src = op.join(src_dir, name)
            if not op.isfile(src):
                continue
            dst = op.join(dest, name)
            if not op.isfile(dst):
                copy(src, dst)
            elif cmp(src, dst):
                # cmp() is truthy here: nothing to do for this file
                continue
            elif args.replace:
                copy(src, dst)
            else:
                print("%s/%s diff from %s/%s - skipped" %
                      (src_dir, name, dest, name))
Beispiel #23
0
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    # The docstring above doubles as the parser's usage text.
    from maize.apps.grid import MakeManager

    p = OptionParser(merge.__doc__)
    p.set_sep(sep="_", help="Separator to group per prefix")
    # parse_args returns (options, positionals)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    merged_bams = args[0]
    bamdirs = args[1:]

    mkdir(merged_bams)
    bams = []
    for x in bamdirs:
        bams += glob(op.join(x, "*.bam"))
    # leave out files whose path contains "nsorted"
    bams = [x for x in bams if "nsorted" not in x]

    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    # the separator is an option value, so it lives on `opts`; `args` is the
    # positional list at this point and has no `sep` attribute
    sep = opts.sep
    key = lambda x: op.basename(x).split(sep)[0]
    # groupby only merges adjacent items, hence the sort on the same key
    bams.sort(key=key)
    mm = MakeManager()
    for prefix, files in groupby(bams, key=key):
        files = sorted(files)
        nfiles = len(files)
        source = " ".join(files)
        # the merged file is named after the first file of its group
        target = op.join(merged_bams, op.basename(files[0]))
        if nfiles == 1:
            # a single BAM needs no merging: link to it instead
            source = get_abs_path(source)
            cmd = "ln -s {0} {1}".format(source, target)
            mm.add("", target, cmd)
        else:
            cmd = "samtools merge -@ 8 {0} {1}".format(target, source)
            mm.add(files, target, cmd, remove=True)
    mm.write()
Beispiel #24
0
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    # The docstring above doubles as the parser's usage text.
    from maize.apps.grid import MakeManager

    p = OptionParser(merge.__doc__)
    p.set_sep(sep="_", help="Separator to group per prefix")
    # parse_args returns (options, positionals)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    merged_bams = args[0]
    bamdirs = args[1:]

    mkdir(merged_bams)
    bams = []
    for x in bamdirs:
        bams += glob(op.join(x, "*.bam"))
    # leave out files whose path contains "nsorted"
    bams = [x for x in bams if "nsorted" not in x]

    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    # the separator is an option value, so it lives on `opts`; `args` is the
    # positional list at this point and has no `sep` attribute
    sep = opts.sep
    key = lambda x: op.basename(x).split(sep)[0]
    # groupby only merges adjacent items, hence the sort on the same key
    bams.sort(key=key)
    mm = MakeManager()
    for prefix, files in groupby(bams, key=key):
        files = sorted(files)
        nfiles = len(files)
        source = " ".join(files)
        # the merged file is named after the first file of its group
        target = op.join(merged_bams, op.basename(files[0]))
        if nfiles == 1:
            # a single BAM needs no merging: link to it instead
            source = get_abs_path(source)
            cmd = "ln -s {0} {1}".format(source, target)
            mm.add("", target, cmd)
        else:
            cmd = "samtools merge -@ 8 {0} {1}".format(target, source)
            mm.add(files, target, cmd, remove=True)
    mm.write()
Beispiel #25
0
def prepare(cfg):
    """Set up the target/query genome directories and the query pieces.

    Reads the ``prepare`` section of *cfg*, creates the work directory with
    its ``01_tgt_genome`` / ``02_qry_genome`` sub-directories, links the two
    input FASTA files into them as ``raw.fa`` (or ``raw.fa.gz`` for
    ``.fa.gz`` / ``.fas.gz`` inputs), runs ``genome fasta`` and
    ``genome blat`` on both, and finally derives the cleaned query windows
    and splits the extracted query sequence into ``npieces`` parts.
    A missing input FASTA is logged as an error but does not stop the run.
    """
    cfg = cfg['prepare']
    dirw, qry, tgt = cfg['dirw'], cfg['qry'], cfg['tgt']
    qry_fas, tgt_fas = cfg['qry_fas'], cfg['tgt_fas']
    tmpdir = cfg['temp_dir']
    npieces = int(cfg['npieces'])

    if not op.isdir(dirw):
        logging.debug("making directory: %s" % dirw)
        mkdir(dirw)
    os.chdir(dirw)
    subdirs = ['01_tgt_genome', '02_qry_genome']
    for subdir in subdirs:
        if not op.isdir(subdir):
            logging.debug("making directory: %s" % subdir)
            mkdir(subdir)

    # link each input FASTA into its genome directory; the error message now
    # names the file that is actually missing (the target branch used to
    # report the query path)
    for fas, subdir in ((tgt_fas, '01_tgt_genome'), (qry_fas, '02_qry_genome')):
        if op.isfile(fas):
            gzipped = fas.endswith((".fa.gz", ".fas.gz"))
            fo = "raw.fa.gz" if gzipped else "raw.fa"
            sh("ln -sf %s %s/%s" % (fas, subdir, fo))
        else:
            logging.error("%s not exist" % fas)

    sh("genome fasta %s/%s" % (dirw, "01_tgt_genome"))
    sh("genome blat %s/%s" % (dirw, "01_tgt_genome"))
    sh("genome fasta %s/%s" % (dirw, "02_qry_genome"))
    sh("genome blat %s/%s" % (dirw, "02_qry_genome"))

    sh("bed filter --minsize 1000 02_qry_genome/16.gap.bed > 04.qry.gap.bed")
    sh("subtractBed -nonamecheck -a 02_qry_genome/15.bed -b 04.qry.gap.bed | bed filter -min 100 - | bed makewindow -w 100000 -s 95000 - > 05.qry.clean.bed")
    sh("bed size 05.qry.clean.bed")
    sh("fasta extract 02_qry_genome/11_genome.fas 05.qry.clean.bed > 06.qry.fas")
    # NOTE(review): `diro1` is not assigned anywhere in this function; unless
    # it is a module-level name this line raises NameError - confirm which
    # output directory was intended.
    sh("fasta split --N %d %s %s" % (npieces, '06.qry.fas', diro1))
Beispiel #26
0
def tsvs(args):
    """
    %prog tsvs excelfile

    Convert all worksheets in EXCEL to tsv files.
    """
    excelfile, odir, sep = args.excel, args.outdir, args.sep

    workbook = pd.ExcelFile(excelfile)
    sheets = workbook.sheet_names
    print("will convert %d sheets under %s" % (len(sheets), odir))
    mkdir(odir)

    # a tab separator yields .tsv files, any other separator .csv
    suf = '.csv'
    if sep == '\t':
        suf = '.tsv'

    for sheet in sheets:
        fo = "%s/%s%s" % (odir, sheet, suf)
        print("    writing %s" % fo)
        frame = pd.read_excel(excelfile, sheet_name=sheet, header=0)
        frame.to_csv(fo, sep=sep, header=True, index=False)
Beispiel #27
0
def check_cfg_mapping(c):
    """Validate and normalise the mapping configuration ``c`` in place.

    Splits ``outdir`` (exactly two entries) and the ``pbs_*`` settings
    (exactly three entries each) into lists, creates missing
    work/temp/output directories, asserts that the input files exist,
    replaces each configured tool - including the selected mapper - with
    the path found by ``which``, and converts ``paired`` with ``str2bool``.
    Exits with status 1 for a mapper other than bwa, hisat2 or bowtie2.
    Returns ``c``.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == 2, "not 2 outdirs: %s" % c.outdir

    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if not op.isdir(subdir):
            mkdir(subdir)

    for fn in [c.ilist, c.genome, c.gff]:
        assert op.isfile(fn), "cannot read %s" % fn

    for key in 'samtools parallel sambamba bcftools bedtools'.split():
        fp = which(c[key])
        assert fp is not None, "not executable: %s" % c[key]
        c[key] = fp

    c.paired = str2bool(c.paired)

    if c.mapper not in ('bwa', 'hisat2', 'bowtie2'):
        logging.error("unsupported mapper: %s" % c.mapper)
        sys.exit(1)
    # Resolve first and store afterwards, so that a failed lookup reports the
    # configured value instead of the None returned by which().
    configured = getattr(c, c.mapper)
    fp = which(configured)
    assert fp is not None, "not executable: %s" % configured
    setattr(c, c.mapper, fp)

    njob = 3
    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    assert njob == len(c.pbs_queues) == len(c.pbs_walltimes) == len(
        c.pbs_ppns), "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c
Beispiel #28
0
def check_cfg_mapping(c):
    """Validate and normalise the mapping configuration ``c`` in place.

    Splits ``outdir`` (exactly two entries) and the ``pbs_*`` settings
    (exactly three entries each) into lists, creates missing
    work/temp/output directories, asserts that the input files exist,
    replaces each configured tool - including the selected mapper - with
    the path found by ``which``, and converts ``paired`` with ``str2bool``.
    Exits with status 1 for a mapper other than bwa, hisat2 or bowtie2.
    Returns ``c``.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == 2, "not 2 outdirs: %s" % c.outdir

    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if not op.isdir(subdir):
            mkdir(subdir)

    for fn in [c.ilist, c.genome, c.gff]:
        assert op.isfile(fn), "cannot read %s" % fn

    for key in 'samtools parallel sambamba bcftools bedtools'.split():
        fp = which(c[key])
        assert fp is not None, "not executable: %s" % c[key]
        c[key] = fp

    c.paired = str2bool(c.paired)

    if c.mapper not in ('bwa', 'hisat2', 'bowtie2'):
        logging.error("unsupported mapper: %s" % c.mapper)
        sys.exit(1)
    # Resolve first and store afterwards, so that a failed lookup reports the
    # configured value instead of the None returned by which().
    configured = getattr(c, c.mapper)
    fp = which(configured)
    assert fp is not None, "not executable: %s" % configured
    setattr(c, c.mapper, fp)

    njob = 3
    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    assert njob == len(c.pbs_queues) == len(c.pbs_walltimes) == len(
        c.pbs_ppns), "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c
Beispiel #29
0
def split_old(args):
    """Split ``args.fi`` into ``args.N`` parts inside ``args.outdir``.

    The output directory is created, or emptied when it already exists.
    The input is linked there as ``part.fas``, split with ``pyfasta split``
    and the link plus its ``part.fas.*`` companions are removed again.  The
    smallest and largest resulting part sizes are printed.
    """
    fi, dirw = op.realpath(args.fi), op.realpath(args.outdir)
    n = args.N
    if not op.exists(dirw):
        mkdir(dirw)
    else:
        sh("rm -rf %s/*" % dirw)

    os.chdir(dirw)

    sh("ln -sf %s part.fas" % fi)
    sh("pyfasta split -n %d part.fas" % n)
    sh("rm part.fas part.fas.*")

    # part files are numbered 0..n-1, zero-padded to ndigit(n) digits
    fmt = "part.%%0%dd.fas" % ndigit(n)
    sizes = sorted(os.stat(fmt % i).st_size for i in range(n))
    print("size range: %s - %s" % (prettysize(sizes[0]), prettysize(sizes[-1])))
Beispiel #30
0
def check_cfg_index(c, noutdir = 1, njob = 3):
    """Validate and normalise the index-building configuration ``c`` in place.

    Splits the comma-separated ``outdir`` and ``pbs_*`` settings into lists,
    creates missing work/temp/output directories, asserts that the input
    files exist and that ``bcftools`` resolves via ``which`` (storing the
    resolved path), and records ``njob``.  Returns ``c``.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)

    for path in [c.dirw, c.temp_dir] + c.outdirs:
        if op.isdir(path):
            continue
        mkdir(path)

    for path in (c.ilist, c.vcf, c.genome, c.gff):
        assert op.isfile(path), "cannot read %s" % path

    resolved = which(c['bcftools'])
    assert resolved is not None, "not executable: %s" % c['bcftools']
    c['bcftools'] = resolved

    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    c.pbs_mems = c.pbs_mem.split(",")
    # every per-job list must hold exactly njob entries
    counts = (len(c.pbs_queues), len(c.pbs_walltimes),
              len(c.pbs_ppns), len(c.pbs_mems))
    assert all(k == njob for k in counts), "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c
Beispiel #31
0
        description='rename fastq files')
    parser.add_argument('diri', help='FROM directory path')
    parser.add_argument('diro', help='TO directory path')
    parser.add_argument('list', help='read list (sid Readfile1[ Readfile2])')
    args = parser.parse_args()

    #dirw = '/home/springer/zhoux379/projects/3rnaseq/cache'
    #diri = '/scratch.global/tenders/3RNA_0418/fastq_gz_files'
    #diro = '10.fastq'
    diri, diro, ilist = args.diri, args.diro, args.list

    if not op.isfile(ilist):
        logging.error("cannot read %s" % ilist)
        sys.exit()
    if not op.isdir(diro):
        mkdir(diro)

    t = Table.read(ilist, format='ascii.tab')
    paired = False
    if 'Readfile2' in t.colnames:
        paired = True
    tag = "paired" if paired else "single"
    logging.debug("proceed as %s-end reads" % tag)

    for i in range(len(t)):
        if paired:
            sid, fq1, fq2 = t['sid'][i], t['Readfile1'][i], t['Readfile2'][i]
            fq1, fq2 = op.join(diri, fq1), op.join(diri, fq2)
            assert op.isfile(fq1), "cannot access fq1: %s" % fq1
            assert op.isfile(fq2), "cannot access fq2: %s" % fq2
            cmd = "cp %s %s/%s_1.fq.gz" % (fq1, diro, sid)
Beispiel #32
0
def run_blat(cfg):
    """Write the bash and PBS scripts of the blat comparison and print qsub hints.

    Reads the ``blat`` section of *cfg*, creates the work directory and the
    two output directories, and exits when any of the required genome files
    under ``01_tgt_genome`` / ``02_qry_genome`` is missing.  Five steps are
    then assembled: for each one a list of bash commands (``bcmds``) and a
    list of PBS job commands (``jcmds``).  A ``<job_prefix>b.<i>.sh`` file is
    written for every step with bash commands and a ``<job_prefix>j.<i>.pbs``
    file for every step.  The ``qsub`` command lines are printed with
    ``eprint``.

    The comma-separated ``pbs_queue`` / ``pbs_walltime`` / ``pbs_ppn``
    settings must each hold one entry per step; the assertions below require
    their count to equal the five steps built here.
    """
    cfg = cfg['blat']
    dirw, jobpre, diro1, diro2 = \
            cfg['dirw'], cfg['job_prefix'], cfg['outdir1'], cfg['outdir2']
    qry, tgt = cfg['qry'], cfg['tgt']
    parallel = cfg['parallel']
    npieces, npieces2 = int(cfg['npieces']), int(cfg['npieces2'])
    pbs_template, pbs_queue, pbs_walltime, pbs_ppn, pbs_email = \
            cfg['pbs_template'], cfg['pbs_queue'], cfg['pbs_walltime'], \
            cfg['pbs_ppn'], cfg['pbs_email']
   
    if not op.isdir(dirw): mkdir(dirw)
    os.chdir(dirw)
    for subdir in [diro1, diro2]:
        if not op.isdir(subdir):
            mkdir(subdir)
    
    # input files, all relative to dirw
    dirt = "01_tgt_genome"
    dirq = "02_qry_genome"
    tgt_fas = "%s/11_genome.fas" % dirt
    qry_fas = "%s/11_genome.fas" % dirq
    tgt_2bit = "%s/21.blat/db.2bit" % dirt
    qry_2bit = "%s/21.blat/db.2bit" % dirq
    tgt_ooc = "%s/21.blat/db.2bit.tile11.ooc" % dirt
    tgt_size = "%s/15.sizes" % dirt
    qry_size = "%s/15.sizes" % dirq
    tgt_size_bed = "%s/15.bed" % dirt
    qry_size_bed = "%s/15.bed" % dirq
    tgt_gap = "%s/16.gap.bed" % dirt
    qry_gap = "%s/16.gap.bed" % dirq
    # stop at the first required input that is missing
    for fn in [tgt_fas, qry_fas, tgt_2bit, qry_2bit, tgt_ooc,
            tgt_size, qry_size, tgt_size_bed, qry_size_bed, tgt_gap, qry_gap]:
        if not op.isfile(fn):
            logging.error("%s is not there" % fn)
            sys.exit()
    
    # per-step PBS settings; the number of queues defines the step count
    pbs_queues = pbs_queue.split(",")
    pbs_ppns = pbs_ppn.split(",")
    pbs_walltimes = pbs_walltime.split(",")
    njob = len(pbs_queues)
    assert len(pbs_walltimes) == njob, "not %d jobs" % njob
    assert len(pbs_ppns) == njob, "not %d jobs" % njob
    # script names are numbered from 1
    fbs = ["%sb.%d.sh" % (jobpre, i+1) for i in range(njob)]
    fjs = ["%sj.%d.pbs" % (jobpre, i+1) for i in range(njob)]
    bcmds, jcmds = [], []

    #1 blat
    # no bash script for this step: the PBS job runs pblat on the piece
    # selected by ${PBS_ARRAYID}
    cmds = []
    bcmds.append(cmds)
    prepre = "part.%%0%dd" % ndigit(npieces-1)
    jcmds.append([
        "let i=${PBS_ARRAYID}",
        "cd %s" % dirw,
        "printf -v pre %s \"$i\"" % prepre,
        "pblat %s %s/${pre}.fas -threads=%s -ooc=%s %s/${pre}.psl" % \
            (tgt_2bit, diro1, pbs_ppns[0], tgt_ooc, diro2)
    ])

    #2 process blat 
    # NOTE(review): the glob below reads from the literal directory
    # "12.blat" while step 1 writes into diro2 - confirm they coincide.
    bcmds.append([
        "pslCat -nohead 12.blat/part.*.psl > 12.psl",
        "psl qcoord 12.psl %s > 13.psl" % qry_size,
        "pslCheck -querySizes=%s -targetSizes=%s -pass=14.check.psl 13.psl" %
            (qry_size, tgt_size),
        "axtChain -linearGap=medium -psl 14.check.psl %s %s 21.chain" %
            (tgt_2bit, qry_2bit),
        "chainPreNet 21.chain %s %s 23.chain" % (tgt_size, qry_size),
        "chain 2bed --qry 23.chain | sortBed -i stdin | \
                mergeBed -i stdin > 23.bed",
        "subtractBed -a %s -b %s -nonamecheck | \
                subtractBed -a stdin -b 23.bed -nonamecheck | \
                bed filter --minsize 50 - > 24.nov.bed" %
                (qry_size_bed, qry_gap),
        "seqret.pl -d %s -b 24.nov.bed -o 24.nov.fas" % qry_fas,
        "rm 23.chain 23.bed",
        "fasta split --N %d %s %s" % (npieces2, '24.nov.fas', '25.nov'),
    ])
    jcmds.append([
        "cd %s" % dirw,
        "bash %s" % fbs[1],
    ])
   
    #3 blat nov
    # same array-job pattern as step 1, run on the pieces in 25.nov
    cmds = []
    bcmds.append(cmds)
    prepre = "part.%%0%dd" % ndigit(npieces2-1)
    jcmds.append([
        "let i=${PBS_ARRAYID}",
        "cd %s" % dirw,
        "printf -v pre %s \"$i\"" % prepre,
        "pblat %s %s/${pre}.fas -threads=%s -ooc=%s %s/${pre}.psl" % \
            (tgt_2bit, '25.nov', pbs_ppns[2], tgt_ooc, '25.nov')
    ])

    #4 process blat
    # the 31.* commands use target-then-query argument order, the 41.*
    # commands repeat them on the swapped psl with query-then-target order
    bcmds.append([
        "pslCat -nohead 25.nov/part.*.psl > 25.nov.psl",
        "psl qcoord 25.nov.psl %s > 26.psl" % qry_size,
        "pslCheck -querySizes=%s -targetSizes=%s -pass=27.check.psl 26.psl" %
            (qry_size, tgt_size),
        "pslCat 14.check.psl 27.check.psl > 31.1.psl",
        "pslSwap 31.1.psl 41.1.psl",
        "rm 25.nov.psl 26.psl",

        "axtChain -linearGap=medium -psl 31.1.psl %s %s 31.2.chain" % (tgt_2bit, qry_2bit),
        "chainPreNet 31.2.chain %s %s 31.3.chain" % (tgt_size, qry_size),
        "chainSwap 31.3.chain 31.3.q.chain",
        "chainNet 31.3.chain %s %s 31.5.net 31.5.q.net" % (tgt_size, qry_size),
        "netChainSubset 31.5.net 31.3.chain stdout | chainSort stdin 31.5.chain",
        "netChainSubset 31.5.q.net 31.3.q.chain stdout | chainSort stdin 31.5.q.chain",
        "chainNet 31.5.q.chain %s %s /dev/null 31.8.net" % (qry_size, tgt_size),
        "netChainSubset 31.8.net 31.3.chain 31.8.chain",
        
        "axtChain -linearGap=medium -psl 41.1.psl %s %s 41.2.chain" % (qry_2bit, tgt_2bit),
        "chainPreNet 41.2.chain %s %s 41.3.chain" % (qry_size, tgt_size),
        "chainSwap 41.3.chain 41.3.q.chain",
        "chainNet 41.3.chain %s %s 41.5.net 41.5.q.net" % (qry_size, tgt_size),
        "netChainSubset 41.5.net 41.3.chain stdout | chainSort stdin 41.5.chain",
        "netChainSubset 41.5.q.net 41.3.q.chain stdout | chainSort stdin 41.5.q.chain",
        "chainNet 41.5.q.chain %s %s /dev/null 41.8.net" % (tgt_size, qry_size),
        "netChainSubset 41.8.net 41.3.chain 41.8.chain",
    ])
    jcmds.append([
        "cd %s" % dirw,
        "bash %s" % fbs[3],
    ])

    #5 process vnt
    # NOTE(review): this job changes into <dirw>/31.9 before running the
    # script by its relative name fbs[4], while the script files are written
    # in dirw - confirm the script is reachable from there.
    bcmds.append([
        "snp2vcf.pl -i snp -o snp.vcf -s %s" % qry,
    ])
    jcmds.append([
        "cd %s/31.9" % dirw,
        "bash %s" % fbs[4],
    ])
    
    assert len(bcmds) == njob, "not %d jobs" % njob
    assert len(jcmds) == njob, "not %d jobs" % njob
    for i in range(njob):
        fb, fj = fbs[i], fjs[i]
        # remove a stale bash script; steps without bash commands get none
        if op.isfile(fb):
            os.remove(fb)
        if len(bcmds[i]) > 0:
            fhb = open(fb, "w")
            fhb.write("\n".join(bcmds[i]) + "\n")
            fhb.close()
        pbsjob = PbsJob(queue = pbs_queues[i],
                ppn = pbs_ppns[i],
                walltime = pbs_walltimes[i],
                email = pbs_email,
                cmds = "\n".join(jcmds[i])
        )
        pbsjob.write(fj)
        
    logging.debug("%s job scripts were created" % njob)
    # the printed "afterok:" dependencies carry no job id after the colon
    eprint("qsub -t 0-%d %s" % (npieces-1, fjs[0]))
    eprint("qsub -W depend=afterok: %s" % fjs[1])
    eprint("qsub -t 0-%d %s" % (npieces2-1, fjs[2]))
    eprint("qsub -W depend=afterok: %s" % fjs[3])
    eprint("qsub -W depend=afterok: %s" % fjs[4])
Beispiel #33
0
def check_cfg_mapping(c, noutdir = 4, njob = 2):
    """Validate a mapping configuration and normalize it in place.

    The configuration object ``c`` is accessed both by attribute (``c.outdir``)
    and by key (``c[key]``).  This function checks that the required
    directories, input files, executables and per-genome resources exist,
    then stores derived values back on ``c``:

    * ``c.outdirs``       - ``c.outdir`` split on commas (must have ``noutdir`` items)
    * ``c[tool]``         - each tool name replaced by the path ``which`` returns
    * ``c.paired``        - converted with ``str2bool``
    * ``c.<mapper>``      - mapper executable replaced by the path ``which`` returns
    * ``c.t``             - the sample table read from ``c.ilist``; a ``genome``
                            column filled with ``B73c`` is added when absent
    * ``c.genomes``       - ``{genome: {'db': index_prefix, 'gff': gff_path}}``
    * ``c.pbs_walltimes``, ``c.pbs_ppns``, ``c.pbs_queues`` - comma-split lists
    * ``c.njob``          - copy of ``njob``

    Args:
        c: configuration object (mutated and returned).
        noutdir: expected number of comma-separated entries in ``c.outdir``.
        njob: expected number of entries in each ``c.pbs_*`` setting.

    Returns:
        The same object ``c``.

    Raises:
        AssertionError: when any validation fails.
        SystemExit: when ``c.mapper`` is not tophat2, hisat2 or star.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)

    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if not op.isdir(subdir):
            mkdir(subdir)

    for fn in [c.ilist, c.vcf, c.gene_bed]:
        assert op.isfile(fn), "cannot read %s" % fn

    for key in 'samtools parallel sambamba htseq bcftools bedtools'.split():
        fp = which(c[key])
        assert fp is not None, "not executable: %s" % c[key]
        c[key] = fp

    c.paired = str2bool(c.paired)
    assert c.stranded in ['yes', 'no', 'reverse'], "unknown stranded option: %s" % c.stranded

    # mapper -> (index prefix relative to the genome directory,
    #            pattern of a file that must exist for the index to be usable)
    db_layout = {
        'tophat2': ("21.bowtie2/db", "%s.4.bt2"),
        'hisat2': ("21.hisat2/db", "%s.8.ht2"),
        'star': ("21.star", "%s/SA"),
    }
    if c.mapper not in db_layout:
        logging.error("unsupported mapper: %s" % c.mapper)
        sys.exit(1)

    # Resolve the mapper executable.  Keep the configured name in a local so
    # the failure message reports it instead of the None returned by which().
    mapper_name = getattr(c, c.mapper)
    mapper_path = which(mapper_name)
    assert mapper_path is not None, "not executable: %s" % mapper_name
    setattr(c, c.mapper, mapper_path)

    assert op.isdir(c.genomedir), "cannot access %s" % c.genomedir
    t = Table.read(c.ilist, format = 'ascii.tab')
    # Test the column names, not the values of the first row (which would
    # also fail outright on an empty table).
    if 'genome' not in t.colnames:
        genomeb = 'B73c'
        logging.debug("no 'genome' column detected: use %s" % genomeb)
        t.add_column(Column([genomeb] * len(t)), name = 'genome')
    c.t = t

    # A cell may list several comma-separated genomes; collect the unique set.
    genomes = sorted({gt for cell in t['genome'] for gt in cell.split(",")})
    logging.debug("checking %d genomes" % len(genomes))

    db_subpath, db_probe = db_layout[c.mapper]
    c.genomes = dict()
    for gt in genomes:
        dirg = "%s/%s" % (c.genomedir, gt)
        dbpre = "%s/%s" % (dirg, db_subpath)
        assert op.isfile(db_probe % dbpre), "no %s db-index: %s" % (c.mapper, dbpre)
        gff = "%s/51.gff" % dirg
        assert op.isfile(gff), "no gff for %s: %s" % (gt, gff)
        c.genomes[gt] = {'db': dbpre, 'gff': gff}

    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    assert njob == len(c.pbs_queues) == len(c.pbs_walltimes) == len(c.pbs_ppns), "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c
Beispiel #34
0
    )
    parser.add_argument('diri', help = 'FROM directory path')
    parser.add_argument('diro', help = 'TO directory path')
    parser.add_argument('list', help = 'read list (sid Readfile1[ Readfile2])')
    args = parser.parse_args()
    
    #dirw = '/home/springer/zhoux379/projects/3rnaseq/cache'
    #diri = '/scratch.global/tenders/3RNA_0418/fastq_gz_files'
    #diro = '10.fastq'
    diri, diro, ilist = args.diri, args.diro, args.list
    
    if not op.isfile(ilist):
        logging.error("cannot read %s" % ilist)
        sys.exit()
    if not op.isdir(diro):
        mkdir(diro)
    
    t = Table.read(ilist, format = 'ascii.tab')
    paired = False
    if 'Readfile2' in t.colnames:
        paired = True
    tag = "paired" if paired else "single"
    logging.debug("proceed as %s-end reads" % tag)

    for i in range(len(t)):
        if paired:
            sid, fq1, fq2 = t['sid'][i], t['Readfile1'][i], t['Readfile2'][i]
            fq1, fq2 = op.join(diri, fq1), op.join(diri, fq2)
            assert op.isfile(fq1), "cannot access fq1: %s" % fq1
            assert op.isfile(fq2), "cannot access fq2: %s" % fq2
            cmd = "cp %s %s/%s_1.fq.gz" % (fq1, diro, sid)
Beispiel #35
0
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to speed
    the download.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so developer documentation lives in comments.
    #
    # args: list of command-line tokens (options plus exactly one positional,
    #       either a file of search terms or a single term).
    # Returns the path of the last output file name computed, or None when
    # the single output file could not be opened.
    # Exits through sys.exit() when the positional count is not exactly one.
    p = OptionParser(entrez.__doc__)

    allowed_databases = {"fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
                         "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
                         "xml"  : ["genome", "nuccore", "nucgss", "nucest", "gene"],
                         "gb"   : ["genome", "nuccore", "nucgss"],
                         "est"  : ["nucest"],
                         "gss"  : ["nucgss"],
                         "acc"  : ["nuccore"],
                        }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein", "gene")

    # Options are registered on the parser created above; the keyword style
    # used here (type="int", %default) is the optparse one.
    p.add_option("--noversion", dest="noversion",
            default=False, action="store_true",
            help="Remove trailing accession versions")
    p.add_option("--format", default="fasta", choices=valid_formats,
            help="download format [default: %default]")
    p.add_option("--database", default="nuccore", choices=valid_databases,
            help="search database [default: %default]")
    p.add_option("--retmax", default=1000000, type="int",
            help="how many results to return [default: %default]")
    p.add_option("--skipcheck", default=False, action="store_true",
            help="turn off prompt to check file existence [default: %default]")
    p.add_option("--batchsize", default=500, type="int",
            help="download the results in batch for speed-up [default: %default]")
    p.set_outdir(outdir=None)
    p.add_option("--outprefix", default="out",
            help="output file name prefix [default: %default]")
    p.set_email()
    # opts carries the option values; args is the remaining positional list.
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    filename, = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        with open(filename) as fh:
            list_of_terms = [row.strip() for row in fh]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert database in allowed_databases[fmt], \
        "For output format '{0}', allowed databases are: {1}".\
        format(fmt, allowed_databases[fmt])
    assert batchsize >= 1, "batchsize must >= 1"

    # A search term containing spaces makes a poor file name.
    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # Without an output directory everything goes into one file, opened once.
    # With --skipcheck the file-existence check is turned off.
    fw = None
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True,
                skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    try:
        for rec_id, size, term, handle in batch_entrez(list_of_terms,
                retmax=opts.retmax, rettype=fmt, db=database,
                batchsize=batchsize, email=opts.email):
            if outdir:
                # One file per term inside outdir.  op.join keeps the
                # directory even when outdir has no trailing slash.
                outfile = op.join(outdir, "{0}.{1}".format(term, fmt))
                fw = must_open(outfile, "w", checkexists=True,
                        skipcheck=opts.skipcheck)
                if fw is None:
                    continue

            try:
                rec = handle.read()
                if rec_id in seen:
                    logging.error("Duplicate key ({0}) found".format(rec_id))
                    continue

                totalsize += size
                # Record followed by one blank line; write() works the same
                # on Python 2 and Python 3.
                fw.write("{0}\n\n".format(rec))

                seen.add(rec_id)
            finally:
                # Per-term files are finished after a single record.
                if outdir:
                    fw.close()
    finally:
        if not outdir and fw is not None:
            fw.close()

    if seen:
        sys.stderr.write("A total of {0} {1} records downloaded.\n".
                format(totalsize, fmt.upper()))

    return outfile