Ejemplo n.º 1
0
def batchoverlap(args):
    """
    %prog batchoverlap pairs.txt outdir

    Check overlaps between pairs of sequences.
    """
    # The docstring above is handed to OptionParser as the usage text, so it
    # is kept verbatim; implementation notes live in comments instead.
    #
    # args: command-line arguments forwarded to the option parser. Exactly two
    # positionals are required: the pairs file and the directory holding the
    # `<name>.fa` files. One shell command per pair is printed to stdout.
    p = OptionParser(batchoverlap.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, outdir = args
    cmds = []
    # Context manager guarantees the pairs file is closed even when a row
    # turns out to be malformed.
    with open(pairsfile) as fp:
        mkdir("overlaps")
        for row in fp:
            atoms = row.split()
            if not atoms:
                # A blank line carries no pair; skip it rather than crash on
                # the two-name unpack below.
                continue
            a, b = atoms[:2]
            oa = op.join(outdir, a + ".fa")
            ob = op.join(outdir, b + ".fa")
            cmd = "python -m jcvi.assembly.goldenpath overlap {0} {1}".format(oa, ob)
            cmd += " -o overlaps/{0}_{1}.ov".format(a, b)
            cmds.append(cmd)

    # Single-argument call form prints the same text on Python 2 and Python 3.
    print("\n".join(cmds))
Ejemplo n.º 2
0
def mconsensus(args):
    """
    %prog mconsensus *.consensus

    Call consensus along the stacks from cross-sample clustering.
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; at least one consensus file is required.
    # Outputs: `<prefix>.P<pctid>.allele_counts` with every site, plus one
    # `<taxon>.allele_counts` file per taxon inside the --allele_counts dir.
    p = OptionParser(mconsensus.__doc__)
    p.add_option("--allele_counts", default="allele_counts",
                 help="Directory to generate allele counts")
    add_consensus_options(p)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    consensusfiles = args
    prefix = opts.prefix
    acdir = opts.allele_counts
    store = ClustStores(consensusfiles)
    pctid = find_pctid(consensusfiles)
    pf = prefix + ".P{0}".format(pctid)

    clustSfile = pf + ".clustS"
    AC = makeloci(clustSfile, store, prefix, minsamp=opts.minsamp)

    mkdir(acdir)
    acfile = pf + ".allele_counts"
    seen = DefaultOrderedDict(list)        # (chr, pos) => allele counts seen there
    # `with` closes the combined file even if a record fails to serialise;
    # explicit write() calls behave the same on Python 2 and Python 3.
    with open(acfile, "w") as fw:
        fw.write("# " + "\t".join(ACHEADER) + "\n")
        # Group allele counts by site while dumping the combined table
        for ac in AC:
            seen[(ac.chr, ac.pos)].append(ac)
            fw.write("{0}\n".format(ac.tostring(taxon=True)))

    logging.debug("Populate all taxa and instantiate empty vector if missing")
    # Taxon name = basename of the consensus file up to the first dot
    all_taxa = set(op.basename(x).split(".")[0] for x in consensusfiles)
    taxon_to_ac = defaultdict(list)
    for chrpos, aclist in seen.items():
        included_taxa = set(x.taxon for x in aclist)
        missing_taxa = all_taxa - included_taxa
        # A cleared copy of the first record stands in for every taxon that
        # has no record at this site, so all per-taxon files stay aligned.
        template = deepcopy(aclist[0])
        template.clear()
        for ac in aclist:
            taxon_to_ac[ac.taxon].append(ac)
        for tx in missing_taxa:
            taxon_to_ac[tx].append(template)

    logging.debug("Write allele counts for all taxa")
    for tx, aclist in sorted(taxon_to_ac.items()):
        tx_acfile = op.join(acdir, tx + ".allele_counts")
        with open(tx_acfile, "w") as fw:
            fw.write("# " + "\t".join(ACHEADER_NO_TAXON) + "\n")
            for ac in aclist:
                fw.write("{0}\n".format(ac.tostring()))
        logging.debug("Written {0} sites in `{1}`".\
                format(len(aclist), tx_acfile))
Ejemplo n.º 3
0
def cib(args):
    """
    %prog cib bamfile samplekey

    Convert BAM to CIB (a binary storage of int8 per base).
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly two positionals (BAM file and
    # sample key). One `bam_to_cib` task is dispatched per reference sequence
    # listed in the BAM header, optionally restricted by --prefix.
    p = OptionParser(cib.__doc__)
    p.add_option("--prefix", help="Report seqids with this prefix only")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, samplekey = args
    mkdir(samplekey)
    bam = pysam.AlignmentFile(bamfile, "rb")
    refs = [x for x in bam.header["SQ"]]
    prefix = opts.prefix
    if prefix:
        refs = [x for x in refs if x["SN"].startswith(prefix)]

    task_args = [(bamfile, r, samplekey) for r in refs]
    if not task_args:
        # Pool(processes=0) raises ValueError, so bail out early when the
        # header (or the --prefix filter) leaves nothing to convert.
        logging.debug("No reference sequences to convert")
        return

    cpus = min(opts.cpus, len(task_args))
    logging.debug("Use {} cpus".format(cpus))

    # Separate name so the option parser `p` is not rebound to the pool.
    pool = Pool(processes=cpus)
    try:
        # Results are not needed; draining the iterator waits for every task.
        for _ in pool.imap(bam_to_cib, task_args):
            pass
    finally:
        # Release worker processes on success and on failure alike.
        pool.terminate()
        pool.join()
Ejemplo n.º 4
0
def batch(args):
    """
    %prog batch splits output

    The arguments are two folders.
    Input FASTA sequences are in splits/.
    Output csv files are in output/.

    Must have folders swissprot/, tair/, trembl/ that contains the respective
    BLAST output. Once finished, you can run, for example:

    $ parallel java -Xmx2g -jar ~/code/AHRD/dist/ahrd.jar {} ::: output/*.yml
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly two positionals (input folder with
    # *.fasta files, output folder). One `<basename>.yml` file is written per
    # FASTA file by filling the module-level `Template`.
    p = OptionParser(batch.__doc__)
    p.add_option("--path",
                 default="~/code/AHRD/",
                 help="Path where AHRD is installed [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    splits, output = args
    mkdir(output)

    # Loop-invariant: resolve the AHRD resources folder once.
    path = op.expanduser(opts.path)
    resources = op.join(path, "test/resources")

    for f in glob("{0}/*.fasta".format(splits)):
        fb = op.basename(f).split(".")[0]
        outfile = op.join(output, fb + ".csv")
        # `with` closes every config file; the original leaked one handle per
        # FASTA file. write() behaves the same on Python 2 and Python 3.
        with open(op.join(output, fb + ".yml"), "w") as fw:
            fw.write("{0}\n".format(Template.format(resources, fb, f, outfile)))
Ejemplo n.º 5
0
Archivo: ahrd.py Proyecto: bennyyu/jcvi
def batch(args):
    """
    %prog batch splits output

    The arguments are two folders.
    Input FASTA sequences are in splits/.
    Output csv files are in output/.

    Must have folders swissprot/, tair/, trembl/ that contains the respective
    BLAST output. Once finished, you can run, for example:

    $ parallel java -Xmx2g -jar ~/code/AHRD/dist/ahrd.jar {} ::: output/*.yml
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly two positionals (input folder with
    # *.fasta files, output folder). One `<basename>.yml` file is written per
    # FASTA file by filling the module-level `Template`.
    p = OptionParser(batch.__doc__)
    p.add_option("--path", default="~/code/AHRD/",
                 help="Path where AHRD is installed [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    splits, output = args
    mkdir(output)

    # Loop-invariant: resolve the AHRD resources folder once.
    path = op.expanduser(opts.path)
    resources = op.join(path, "test/resources")

    for f in glob("{0}/*.fasta".format(splits)):
        fb = op.basename(f).split(".")[0]
        outfile = op.join(output, fb + ".csv")
        # `with` closes every config file; the original leaked one handle per
        # FASTA file. write() behaves the same on Python 2 and Python 3.
        with open(op.join(output, fb + ".yml"), "w") as fw:
            fw.write("{0}\n".format(Template.format(resources, fb, f, outfile)))
Ejemplo n.º 6
0
def batchccn(args):
    """
    %prog batchccn test.csv

    Run CCN script in batch. Write makefile.
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly one positional, a two-column CSV
    # of (sample key, BAM). One make rule is registered per row.
    p = OptionParser(batchccn.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (csvfile, ) = args
    mm = MakeManager()
    pf = op.basename(csvfile).split(".")[0]
    mkdir(pf)

    # Peek at the first line through a context manager so the handle is
    # closed; the original `next(open(...))` leaked it.
    with open(csvfile) as fp:
        first_line = fp.readline()
    # A first line ending in ".bam" is already data, i.e. there is no header.
    header = None if first_line.strip().endswith(".bam") else "infer"
    logging.debug("Header={}".format(header))
    df = pd.read_csv(csvfile, header=header)
    cmd = "perl /mnt/software/ccn_gcn_hg38_script/ccn_gcn_hg38.pl"
    cmd += " -n {} -b {}"
    # Only the -o placeholder is filled here; -n/-b stay open for each row.
    cmd += " -o {} -r hg38".format(pf)
    for _, (sample_key, bam) in df.iterrows():
        cmdi = cmd.format(sample_key, bam)
        outfile = "{}/{}/{}.ccn".format(pf, sample_key, sample_key)
        mm.add(csvfile, outfile, cmdi)
    mm.write()
Ejemplo n.º 7
0
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including BOWTIE and BWA.
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly two positionals (project folder,
    # reference FASTA). For every library yielded by iter_project() the first
    # N reads are aligned, the median insert size is rounded to two
    # significant digits, and a "source<TAB>link-name" line is collected into
    # `f.meta`.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    from jcvi.formats.sam import pairs as ps

    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align
    else:
        # Without this guard `align` stays undefined and the failure shows up
        # later as an unrelated-looking NameError.
        raise ValueError(
            "Unsupported aligner `{0}` (expected bowtie or bwa)".format(aligner))

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    # `fastqs` (not `p`) so the option parser is not shadowed by loop data.
    for fastqs, prefix in iter_project(folder):
        samplefq = []
        for i in range(2):
            samplefq.append(
                op.join(work, prefix + "_{0}.first.fastq".format(i + 1)))
            first([str(opts.firstN)] + [fastqs[i]] + ["-o", samplefq[i]])

        os.chdir(work)
        try:
            align_args = [ref] + [op.basename(fq) for fq in samplefq]
            outfile, logfile = align(align_args)
            bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        finally:
            # Always return to the starting directory, even if alignment fails.
            os.chdir(cwd)

        median = stats.median
        tag = "MP" if median > 1000 else "PE"
        median = str(median)
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))
        for i, xp in enumerate(fastqs):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(lib, prefix.replace("-", ""),
                                            i + 1, suffix)
            m = "\t".join(str(x) for x in (xp, link))
            messages.append(m)

    messages = "\n".join(messages)
    write_file("f.meta", messages, tee=True)
Ejemplo n.º 8
0
def batch(args):
    """
    %prog batch all.cds *.anchors

    Compute Ks values for a set of anchors file. This will generate a bunch of
    work directories for each comparisons. The anchorsfile should be in the form
    of specie1.species2.anchors.
    """
    # The docstring above is the usage text given to OptionParser, hence it is
    # reproduced verbatim.
    from jcvi.apps.grid import MakeManager

    p = OptionParser(batch.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    cdsfile, anchors = args[0], args[1:]

    # Work directory name = first two dot-separated fields of the basename.
    workdirs = []
    for anchorfile in anchors:
        fields = op.basename(anchorfile).split(".")
        workdirs.append(".".join(fields[:2]))
    for workdir in workdirs:
        mkdir(workdir)

    prepare_cmd = "python -m jcvi.apps.ks prepare {} {} -o {}"
    calc_cmd = "python -m jcvi.apps.ks calc {} -o {} --workdir {}"

    mm = MakeManager()
    for workdir, anchorfile in zip(workdirs, anchors):
        pairscds = workdir + ".cds.fasta"
        ksout = workdir + ".ks"
        # Rule 1: extract the CDS pairs for this anchors file.
        mm.add((anchorfile, cdsfile), pairscds,
               prepare_cmd.format(anchorfile, cdsfile, pairscds))
        # Rule 2: compute Ks from the extracted pairs.
        mm.add(pairscds, ksout, calc_cmd.format(pairscds, ksout, workdir))
    mm.write()
Ejemplo n.º 9
0
Archivo: hic.py Proyecto: xuanblo/jcvi
def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Args:
        tourfile: not used by this function.
        lastfile: LAST/BLAST tabular file; its basename (first two
            dot-separated fields) names the anchors file.
        odir: output directory; the process changes into it and stays there.
        p, opts: option parser and parsed options, forwarded to
            get_bed_filenames().

    Returns:
        (anchorsfile, qbedfile, contig_to_beds), where anchorsfile is relative
        to `odir` and qbedfile is an absolute path.
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)
    # Resolve before chdir: a relative `lastfile` would no longer be
    # reachable from inside `odir`.
    lastpath = op.abspath(lastfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) \
                  + ".anchors"
    # `with` closes the file on error too; write() works on Python 2 and 3.
    with open(anchorsfile, "w") as fw:
        for b in Blast(lastpath):
            fw.write("\t".join((gene_name(b.query), gene_name(b.subject),
                                str(int(b.score)))) + "\n")

    # Symlink sbed
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
Ejemplo n.º 10
0
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    p = OptionParser(calc.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        sys.stderr.write("Incorrect arguments\n")
        sys.exit(not p.print_help())

    # NOTE(review): must_open() may hand back a shared stream (the usage text
    # mentions stdout), so the handle is flushed per row but not closed here.
    output_h = must_open(opts.outfile, "w")
    output_h.write("name,dS-yn,dN-yn,dS-ng,dN-ng\n")
    work_dir = op.join(os.getcwd(), "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        protein_file = translate_dna(dna_file)

    # Context managers close both FASTA inputs; the original leaked them.
    with open(protein_file) as prot_fh, open(dna_file) as dna_fh:
        prot_iterator = SeqIO.parse(prot_fh, "fasta")
        dna_iterator = SeqIO.parse(dna_fh, "fasta")
        # Passing the same iterator twice makes zip() consume adjacent
        # records, yielding (protein pair, nucleotide pair) per step.
        for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in \
                zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator):

            sys.stderr.write("-------- {0} {1}\n".format(p_rec_1.name,
                                                         p_rec_2.name))
            align_fasta = clustal_align_protein(p_rec_1, p_rec_2, work_dir)
            mrtrans_fasta = run_mrtrans(align_fasta, n_rec_1, n_rec_2, work_dir)
            if mrtrans_fasta:
                ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = \
                        find_synonymous(mrtrans_fasta, work_dir)
                if ds_subs_yn is not None:
                    pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                    output_h.write("%s\n" % (",".join(
                        str(x) for x in (pair_name, ds_subs_yn, dn_subs_yn,
                                         ds_subs_ng, dn_subs_ng))))
                    output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
0
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files. This
    program merges them.
    """
    # The docstring above is the usage text given to OptionParser, hence it is
    # reproduced verbatim.
    p = OptionParser(merge.__doc__)
    p.add_option("--outdir",
                 default="outdir",
                 help="Output final reads in [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    folders = args
    outdir = opts.outdir
    mkdir(outdir)

    def sample_of(path):
        # Sample id = basename up to the first dot.
        return op.basename(path).split(".")[0]

    # groupby() only merges adjacent items, so sort by the same key first.
    fastqs = sorted(
        flatten(glob("{0}/*.*.fastq".format(x)) for x in folders),
        key=sample_of)
    for sample, group in groupby(fastqs, key=sample_of):
        members = list(group)
        merged = op.join(outdir, "{0}.fastq".format(sample))
        FileMerger(members, outfile=merged).merge(checkexists=True)
Ejemplo n.º 12
0
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly two positionals (TRF bed file,
    # FASTA file). Three make rules are written: the lobSTR reference index,
    # `index.tab` and `index.info`, all under a folder named after the FASTA
    # prefix.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option(
        "--notreds",
        default=False,
        action="store_true",
        help="Remove TREDs from the bed file",
    )
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    pf = fastafile.split(".")[0]
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        newbedfile = trfbed + ".new"
        retained = total = 0
        seen = set()
        # Both handles are closed by the context manager; the original never
        # closed the input bed file.
        with open(newbedfile, "w") as newbed, open(trfbed) as fp:
            for row in fp:
                r = STRLine(row)
                total += 1
                name = r.longname
                # Keep only the first record for each long name.
                if name in seen:
                    continue
                seen.add(name)
                print(r, file=newbed)
                retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
Ejemplo n.º 13
0
def sra(args):
    """
    %prog sra [term|term.ids]

    Given an SRA run ID, fetch the corresponding .sra file from the sra-instant FTP.
    The term can also be a file containing list of SRR ids, one per line.
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly one positional, either a run id or
    # the path of a file listing run ids. Each id is downloaded and dumped to
    # FASTQ in a folder named after the downloaded file's prefix.
    p = OptionParser(sra.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    term, = args
    if op.isfile(term):
        # Close the ids file promptly and ignore blank lines, which would
        # otherwise be submitted as empty download terms.
        with open(term) as fp:
            terms = [x.strip() for x in fp if x.strip()]
    else:
        terms = [term]

    for term in terms:
        srafile = download_srr_term(term)
        pf = srafile.split(".")[0]
        mkdir(pf)
        cmd = "fastq-dump --outdir {} --split-files {}".format(pf, srafile)
        sh(cmd)
Ejemplo n.º 14
0
Archivo: train.py Proyecto: rrane/jcvi
def augustus(args):
    """
    %prog augustus species gffile fastafile

    Train AUGUSTUS model given gffile and fastafile. Whole procedure taken from:
    <http://www.molecularevolution.org/molevolfiles/exercises/augustus/training.html>
    """
    # args: command-line arguments; exactly three positionals (species name,
    # GFF file, FASTA file). The input files are referenced as `../<file>`
    # from inside the work directory, so they must be given relative to the
    # starting directory.
    #
    # Use this command's own docstring as usage text (the original passed
    # snap.__doc__, which shows the help of a different command).
    p = OptionParser(augustus.__doc__)
    p.set_home("augustus")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    mhome = opts.augustus_home
    augdir = "augustus"

    cwd = os.getcwd()
    mkdir(augdir)
    os.chdir(augdir)
    try:
        sh("{0}/scripts/new_species.pl --species={1}".format(mhome, species))
        sh("{0}/scripts/gff2gbSmallDNA.pl ../{1} ../{2} 1000 raw.gb".format(mhome, gffile, fastafile))
        sh("{0}/bin/etraining --species={1} raw.gb 2> train.err".format(mhome, species))
        # Raw string: `\S` is meant for perl, not as a Python escape sequence.
        sh(r"cat train.err | perl -pe 's/.*in sequence (\S+): .*/$1/' > badgenes.lst")
        sh("{0}/scripts/filterGenes.pl badgenes.lst raw.gb > training.gb".format(mhome))
        sh("grep -c LOCUS raw.gb training.gb")
        sh("{0}/scripts/autoAugTrain.pl --trainingset=training.gb --species={1}".format(mhome, species))
    finally:
        # Return to the starting directory even if a training step raises.
        os.chdir(cwd)

    sh("cp -r {0}/species/{1} augustus/".format(mhome, species))
Ejemplo n.º 15
0
Archivo: cnv.py Proyecto: xuanblo/jcvi
def gcn(args):
    """
    %prog gcn gencode.v26.exonunion.bed data/*.vcf.gz

    Compile gene copy njumber based on CANVAS results.
    """
    # The docstring above is the usage text given to OptionParser, hence it is
    # reproduced verbatim.
    p = OptionParser(gcn.__doc__)
    p.set_cpus()
    p.set_tmpdir(tmpdir="tmp")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    # First positional is the exon bed; everything after it is a CANVAS VCF.
    exonbed, canvasvcfs = args[0], args[1:]
    outfile = opts.outfile
    scratch = opts.tmpdir

    mkdir(scratch)
    set_tempdir(scratch)

    table = vcf_to_df(canvasvcfs, exonbed, opts.cpus)
    # One TSV per summary: average first, then median.
    df_to_tsv(table, outfile, ".avgcn")
    df_to_tsv(table, outfile, ".medcn")
Ejemplo n.º 16
0
    def __init__(
        self,
        data: TSPDataModel,
        work_dir=Work_dir,
        clean=True,
        verbose=False,
        precision=0,
        seed=666,
    ):
        """Solve a TSP instance with concorde and keep the resulting tour.

        The instance is written to ``data.tsp`` inside ``work_dir``, concorde
        is run on it, and the parsed result is stored in ``self.tour``.

        Args:
            data (TSPDataModel): TSP instance with edge weights
            work_dir (optional): Path to the work dir. Defaults to Work_dir.
            clean (bool, optional): Remove the work dir and residual output
                files afterwards. Defaults to True.
            verbose (bool, optional): Show verbose messages. Defaults to False.
            precision (int, optional): Float precision of distance. Defaults to 0.
            seed (int, optional): Random seed. Defaults to 666.
        """
        self.data, self.work_dir = data, work_dir
        self.clean, self.verbose = clean, verbose

        mkdir(work_dir)
        tsp_path = op.join(work_dir, "data.tsp")
        self.print_to_tsplib(tsp_path, precision=precision)
        # Only the second element returned by run_concorde() is needed.
        _unused, solution_path = self.run_concorde(tsp_path, seed=seed)
        self.tour = self.parse_output(solution_path)

        if not clean:
            return

        # Drop the work dir, then the residual files named below.
        shutil.rmtree(work_dir)
        FileShredder(["data.sol", "data.res", "Odata.res"], verbose=False)
Ejemplo n.º 17
0
def split_old(args):
    """Split a FASTA file into N parts with `pyfasta` and report part sizes.

    Args:
        args: parsed options providing `fi` (input FASTA), `outdir` (output
            directory, emptied if it already exists) and `N` (number of
            parts).

    The parts are written as `part.<i>.fas` inside the output directory and
    the smallest and largest part sizes are printed.
    """
    fi, dirw = op.realpath(args.fi), op.realpath(args.outdir)
    n = args.N
    if not op.exists(dirw):
        mkdir(dirw)
    else:
        sh("rm -rf %s/*" % dirw)

    cwd = os.getcwd()
    os.chdir(dirw)
    try:
        # pyfasta names its outputs after the input, so split via a symlink
        # called part.fas and remove the link and its side files afterwards.
        sh("ln -sf %s part.fas" % fi)
        sh("pyfasta split -n %d part.fas" % n)
        sh("rm part.fas part.fas.*")

        # Zero-padded part names; the pattern does not change per iteration.
        fmt = "part.%%0%dd.fas" % ndigit(n)
        sizes = sorted(os.stat(fmt % i).st_size for i in range(n))
    finally:
        # `cwd` was saved but never restored in the original.
        os.chdir(cwd)

    if sizes:
        print("size range: %s - %s" %
              (prettysize(sizes[0]), prettysize(sizes[-1])))
Ejemplo n.º 18
0
Archivo: cnv.py Proyecto: xuanblo/jcvi
def batchccn(args):
    """
    %prog batchccn test.csv

    Run CCN script in batch. Write makefile.
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly one positional, a two-column CSV
    # of (sample key, BAM). One make rule is registered per row.
    p = OptionParser(batchccn.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    csvfile, = args
    mm = MakeManager()
    pf = op.basename(csvfile).split(".")[0]
    mkdir(pf)

    # readline() inside a context manager replaces the Python 2-only
    # `open(...).next()` and closes the handle it opens.
    with open(csvfile) as fp:
        first_line = fp.readline()
    # A first line ending in ".bam" is already data, i.e. there is no header.
    header = None if first_line.strip().endswith(".bam") else "infer"
    logging.debug("Header={}".format(header))
    df = pd.read_csv(csvfile, header=header)
    cmd = "perl /mnt/software/ccn_gcn_hg38_script/ccn_gcn_hg38.pl"
    cmd += " -n {} -b {}"
    # Only the -o placeholder is filled here; -n/-b stay open for each row.
    cmd += " -o {} -r hg38".format(pf)
    for _, (sample_key, bam) in df.iterrows():
        cmdi = cmd.format(sample_key, bam)
        outfile = "{}/{}/{}.ccn".format(pf, sample_key, sample_key)
        mm.add(csvfile, outfile, cmdi)
    mm.write()
Ejemplo n.º 19
0
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no trainig
    model gff file is needed.
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly two positionals (species name,
    # FASTA file). The FASTA file is symlinked as `../<fastafile>` from inside
    # the work directory, so it must be given relative to the starting
    # directory.
    p = OptionParser(genemark.__doc__)
    p.set_home("gmes")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species, fastafile = args
    mhome = opts.gmes_home
    gmdir = "genemark"
    mkdir(gmdir)

    cwd = os.getcwd()
    os.chdir(gmdir)
    try:
        cmd = "ln -sf ../{0}".format(fastafile)
        sh(cmd)

        license_key = op.expanduser("~/.gm_key")
        # Explicit raise (same exception type as before) so the check is not
        # stripped when Python runs with -O.
        if not op.exists(license_key):
            raise AssertionError(
                "License key ({0}) not found!".format(license_key))
        cmd = "{0}/gm_es.pl {1}".format(mhome, fastafile)
        sh(cmd)
    finally:
        # Return to the starting directory even if a step above fails.
        os.chdir(cwd)

    logging.debug("GENEMARK matrix written to `{0}/mod/{1}.mod`".format(
        gmdir, species))
Ejemplo n.º 20
0
def SH_raxml(reftree, querytree, phy_file, shout="SH_out.txt"):
    """
    SH test using RAxML

    querytree can be a single tree or a bunch of trees (eg. from bootstrapping)

    Args:
        reftree: path of the reference tree; must be an existing file.
        querytree: path of the tree(s) tested against the reference.
        phy_file: alignment file; RAxML runs in a `raxml_work` folder next
            to it.
        shout: file the result line is appended to.

    Returns:
        The name of the output file. A line "<querytree basename>\\t<result>"
        is appended only when RAxML's output contains a "Significantly..."
        line; otherwise a failure note goes to stderr.
    """
    assert op.isfile(reftree)
    shout = must_open(shout, "a")
    try:
        raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
        mkdir(raxml_work)
        raxml_cl = RaxmlCommandline(cmd=RAXML_BIN("raxmlHPC"), \
        sequences=phy_file, algorithm="h", model="GTRGAMMA", \
        name="SH", starting_tree=reftree, bipartition_filename=querytree, \
        working_dir=raxml_work)

        logging.debug("Running SH test in RAxML: %s" % raxml_cl)
        o, stderr = raxml_cl()
        # hard coded: the verdict is the line starting with "Significantly".
        # An explicit match test replaces the former bare `except:`, which
        # also hid unrelated errors.
        match = re.search('(Significantly.*:.*)', o) if o else None
        if match is None:
            sys.stderr.write("SH test failed.\n")
        else:
            # "%" is written out as backslash + "%".
            pval = match.group(0).strip().replace("\t", " ").replace("%", "\\%")
            shout.write("{0}\t{1}\n".format(op.basename(querytree), pval))
            logging.debug("SH p-value appended to %s" % shout.name)
    finally:
        # Close the output handle even if RAxML itself raises.
        shout.close()
    return shout.name
Ejemplo n.º 21
0
def build_ml_raxml(alignment, outfile, work_dir=".", **kwargs):
    """
    build maximum likelihood tree of DNA seqs with RAxML

    Args:
        alignment: alignment object written out in relaxed PHYLIP format.
        outfile: destination path for the resulting tree.
        work_dir: parent folder; a `work` subfolder is created inside it.
        **kwargs: extra options forwarded to RaxmlCommandline.

    Returns:
        (outfile, phy_file) on success, or None when RAxML did not produce
        `RAxML_bipartitions.aln`.
    """
    work_dir = op.join(work_dir, "work")
    mkdir(work_dir)
    phy_file = op.join(work_dir, "aln.phy")
    # open() + `with` replaces the Python 2-only file() builtin and makes
    # sure the alignment is flushed to disk before RAxML reads it.
    with open(phy_file, "w") as fw:
        AlignIO.write(alignment, fw, "phylip-relaxed")

    raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
    mkdir(raxml_work)
    raxml_cl = RaxmlCommandline(cmd=RAXML_BIN("raxmlHPC"), \
        sequences=phy_file, algorithm="a", model="GTRGAMMA", \
        parsimony_seed=12345, rapid_bootstrap_seed=12345, \
        num_replicates=100, name="aln", \
        working_dir=raxml_work, **kwargs)

    logging.debug("Building ML tree using RAxML: %s" % raxml_cl)
    stdout, stderr = raxml_cl()

    tree_file = "{0}/RAxML_bipartitions.aln".format(raxml_work)
    if not op.exists(tree_file):
        print("***RAxML failed.", file=sys.stderr)
        sh("rm -rf %s" % raxml_work, log=False)
        return None
    sh("cp {0} {1}".format(tree_file, outfile), log=False)

    logging.debug("ML tree printed to %s" % outfile)
    sh("rm -rf %s" % raxml_work)

    return outfile, phy_file
Ejemplo n.º 22
0
Archivo: train.py Proyecto: rrane/jcvi
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no trainig
    model gff file is needed.
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly two positionals (species name,
    # FASTA file). The FASTA file is symlinked as `../<fastafile>` from inside
    # the work directory, so it must be given relative to the starting
    # directory.
    p = OptionParser(genemark.__doc__)
    p.set_home("gmes")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species, fastafile = args
    mhome = opts.gmes_home
    gmdir = "genemark"
    mkdir(gmdir)

    cwd = os.getcwd()
    os.chdir(gmdir)
    try:
        cmd = "ln -sf ../{0}".format(fastafile)
        sh(cmd)

        license_key = op.expanduser("~/.gm_key")
        # Explicit raise (same exception type as before) so the check is not
        # stripped when Python runs with -O.
        if not op.exists(license_key):
            raise AssertionError(
                "License key ({0}) not found!".format(license_key))
        cmd = "{0}/gm_es.pl {1}".format(mhome, fastafile)
        sh(cmd)
    finally:
        # Return to the starting directory even if a step above fails.
        os.chdir(cwd)

    logging.debug("GENEMARK matrix written to `{0}/mod/{1}.mod`".format(gmdir, species))
Ejemplo n.º 23
0
def batchoverlap(args):
    """
    %prog batchoverlap pairs.txt outdir

    Check overlaps between pairs of sequences.
    """
    # The docstring above is handed to OptionParser as the usage text, so it
    # is kept verbatim; implementation notes live in comments instead.
    #
    # args: command-line arguments forwarded to the option parser. Exactly two
    # positionals are required: the pairs file and the directory holding the
    # `<name>.fa` files. One shell command per pair is printed to stdout.
    p = OptionParser(batchoverlap.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, outdir = args
    cmds = []
    # Context manager guarantees the pairs file is closed even when a row
    # turns out to be malformed.
    with open(pairsfile) as fp:
        mkdir("overlaps")
        for row in fp:
            atoms = row.split()
            if not atoms:
                # A blank line carries no pair; skip it rather than crash on
                # the two-name unpack below.
                continue
            a, b = atoms[:2]
            oa = op.join(outdir, a + ".fa")
            ob = op.join(outdir, b + ".fa")
            cmd = "python -m jcvi.assembly.goldenpath overlap {0} {1}".format(
                oa, ob)
            cmd += " -o overlaps/{0}_{1}.ov".format(a, b)
            cmds.append(cmd)

    # Single-argument call form prints the same text on Python 2 and Python 3.
    print("\n".join(cmds))
Ejemplo n.º 24
0
Archivo: ks.py Proyecto: ascendo/jcvi
def batch(args):
    """
    %prog batch all.cds *.anchors

    Compute Ks values for a set of anchors file. This will generate a bunch of
    work directories for each comparisons. The anchorsfile should be in the form
    of specie1.species2.anchors.
    """
    # The docstring above is the usage text given to OptionParser, hence it is
    # reproduced verbatim.
    from jcvi.apps.grid import MakeManager

    p = OptionParser(batch.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    cdsfile, anchors = args[0], args[1:]

    # Work directory name = first two dot-separated fields of the basename.
    workdirs = []
    for anchorfile in anchors:
        fields = op.basename(anchorfile).split(".")
        workdirs.append(".".join(fields[:2]))
    for workdir in workdirs:
        mkdir(workdir)

    prepare_cmd = "python -m jcvi.apps.ks prepare {} {} -o {}"
    calc_cmd = "python -m jcvi.apps.ks calc {} -o {} --workdir {}"

    mm = MakeManager()
    for workdir, anchorfile in zip(workdirs, anchors):
        pairscds = workdir + ".cds.fasta"
        ksout = workdir + ".ks"
        # Rule 1: extract the CDS pairs for this anchors file.
        mm.add((anchorfile, cdsfile), pairscds,
               prepare_cmd.format(anchorfile, cdsfile, pairscds))
        # Rule 2: compute Ks from the extracted pairs.
        mm.add(pairscds, ksout, calc_cmd.format(pairscds, ksout, workdir))
    mm.write()
Ejemplo n.º 25
0
Archivo: cnv.py Proyecto: xuanblo/jcvi
def cib(args):
    """
    %prog cib bamfile samplekey

    Convert BAM to CIB (a binary storage of int8 per base).
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly two positionals (BAM file and
    # sample key). One `bam_to_cib` task is dispatched per reference sequence
    # listed in the BAM header, optionally restricted by --prefix.
    p = OptionParser(cib.__doc__)
    p.add_option("--prefix", help="Report seqids with this prefix only")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, samplekey = args
    mkdir(samplekey)
    bam = pysam.AlignmentFile(bamfile, "rb")
    refs = [x for x in bam.header["SQ"]]
    prefix = opts.prefix
    if prefix:
        refs = [x for x in refs if x["SN"].startswith(prefix)]

    task_args = [(bamfile, r, samplekey) for r in refs]
    if not task_args:
        # Pool(processes=0) raises ValueError, so bail out early when the
        # header (or the --prefix filter) leaves nothing to convert.
        logging.debug("No reference sequences to convert")
        return

    cpus = min(opts.cpus, len(task_args))
    logging.debug("Use {} cpus".format(cpus))

    # Separate name so the option parser `p` is not rebound to the pool.
    pool = Pool(processes=cpus)
    try:
        # Results are not needed; draining the iterator waits for every task.
        for _ in pool.imap(bam_to_cib, task_args):
            pass
    finally:
        # Release worker processes on success and on failure alike.
        pool.terminate()
        pool.join()
Ejemplo n.º 26
0
def sra(args):
    """
    %prog sra [term|term.ids]

    Given an SRA run ID, fetch the corresponding .sra file from the sra-instant FTP.
    The term can also be a file containing list of SRR ids, one per line.

    Once downloaded, the SRA file is processed through `fastq-dump` to produce
    FASTQ formatted sequence files, which are gzipped by default.
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly one positional, either a run id or
    # the path of a file listing run ids.
    p = OptionParser(sra.__doc__)

    p.add_option("--nogzip", dest="nogzip",
                 default=False, action="store_true",
                 help="Do not gzip the FASTQ generated by fastq-dump")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    term, = args
    if op.isfile(term):
        # Close the ids file promptly and ignore blank lines, which would
        # otherwise be submitted as empty download terms.
        with open(term) as fp:
            terms = [x.strip() for x in fp if x.strip()]
    else:
        terms = [term]

    for term in terms:
        srafile = download_srr_term(term)
        pf = srafile.split(".")[0]
        mkdir(pf)
        # Argument list for fromsra(): input, paired mode, output folder.
        _opts = [srafile, "--paired", "--outdir={0}".format(pf)]
        if not opts.nogzip:
            _opts.append("--compress=gzip")
        fromsra(_opts)
Ejemplo n.º 27
0
def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Args:
        tourfile: not used by this function.
        lastfile: LAST/BLAST tabular file; its basename (first two
            dot-separated fields) names the anchors file.
        odir: output directory; the process changes into it and stays there.
        p, opts: option parser and parsed options, forwarded to
            get_bed_filenames().

    Returns:
        (anchorsfile, qbedfile, contig_to_beds), where anchorsfile is relative
        to `odir` and qbedfile is an absolute path.
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)
    # Resolve before chdir: a relative `lastfile` would no longer be
    # reachable from inside `odir`.
    lastpath = op.abspath(lastfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile
    anchorsfile = ".".join(op.basename(lastfile).split(".",
                                                       2)[:2]) + ".anchors"
    # `with` closes the file on error too; write() works on Python 2 and 3.
    with open(anchorsfile, "w") as fw:
        for b in Blast(lastpath):
            fw.write("\t".join(
                (gene_name(b.query), gene_name(b.subject),
                 str(int(b.score)))) + "\n")

    # Symlink sbed
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
Ejemplo n.º 28
0
def gcn(args):
    """
    %prog gcn gencode.v26.exonunion.bed data/*.vcf.gz

    Compile gene copy njumber based on CANVAS results.
    """
    # The docstring above is the usage text given to OptionParser, hence it is
    # reproduced verbatim.
    p = OptionParser(gcn.__doc__)
    p.set_cpus()
    p.set_tmpdir(tmpdir="tmp")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    # First positional is the exon bed; everything after it is a CANVAS VCF.
    exonbed, canvasvcfs = args[0], args[1:]
    outfile = opts.outfile
    scratch = opts.tmpdir

    mkdir(scratch)
    set_tempdir(scratch)

    table = vcf_to_df(canvasvcfs, exonbed, opts.cpus)
    # One TSV per summary: average first, then median.
    df_to_tsv(table, outfile, ".avgcn")
    df_to_tsv(table, outfile, ".medcn")
Ejemplo n.º 29
0
def sra(args):
    """
    %prog sra [term|term.ids]

    Given an SRA run ID, fetch the corresponding .sra file from the sra-instant FTP.
    The term can also be a file containing list of SRR ids, one per line.

    Once downloaded, the SRA file is processed through `fastq-dump` to produce
    FASTQ formatted sequence files, which are gzipped by default.
    """
    # The docstring above doubles as the OptionParser usage text and is kept
    # verbatim.
    #
    # args: command-line arguments; exactly one positional, either a run id or
    # the path of a file listing run ids.
    p = OptionParser(sra.__doc__)

    # Register the flag on this function's own parser; the original called
    # add_argument() on an undefined `sp1`.
    p.add_option("--nogzip", dest="nogzip",
                 default=False, action="store_true",
                 help="Do not gzip the FASTQ generated by fastq-dump")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    term, = args
    if op.isfile(term):
        # Close the ids file promptly and ignore blank lines, which would
        # otherwise be submitted as empty download terms.
        with open(term) as fp:
            terms = [x.strip() for x in fp if x.strip()]
    else:
        terms = [term]

    for term in terms:
        srafile = download_srr_term(term)
        pf = srafile.split(".")[0]
        mkdir(pf)
        # Argument list for fromsra(): input, paired mode, output folder.
        _opts = [srafile, "--paired", "--outdir={0}".format(pf)]
        # The flag lives on `opts` (parsed options), not on the positional
        # `args` list, and the list being extended is `_opts`.
        if not opts.nogzip:
            _opts.append("--compress=gzip")
        fromsra(_opts)
Ejemplo n.º 30
0
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files. This
    program merges them.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(merge.__doc__)
    p.set_outdir(outdir="outdir")
    opts, args = p.parse_args(args)

    # Every positional argument is a folder; at least one is needed
    if not args:
        sys.exit(not p.print_help())

    outdir = opts.outdir
    mkdir(outdir)

    def sample_of(path):
        # Grouping key: file name up to (not including) its first dot
        return op.basename(path).split(".")[0]

    pattern = "{0}/*.*.fastq"
    fastqs = list(flatten(glob(pattern.format(folder)) for folder in args))
    # groupby() only groups adjacent items, so sort by the same key first
    fastqs.sort(key=sample_of)
    for sample, members in groupby(fastqs, key=sample_of):
        merged = op.join(outdir, "{0}.fastq".format(sample))
        FileMerger(list(members), outfile=merged).merge(checkexists=True)
Ejemplo n.º 31
0
def link(args):
    """
    %prog link metafile

    Link source to target based on a tabular file.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # Each non-blank row of the metafile must hold exactly two
    # whitespace-separated columns: source and target.
    from jcvi.apps.base import mkdir

    p = OptionParser(link.__doc__)
    p.add_option("--dir", help="Place links in a subdirectory")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (meta, ) = args
    d = opts.dir
    if d:
        mkdir(d)

    # Sources are resolved relative to the folder containing the metafile
    cwd = op.dirname(get_abs_path(meta))
    with open(meta) as fp:
        for row in fp:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not row.strip():
                continue
            source, target = row.split()
            source = op.join(cwd, source)
            if d:
                target = op.join(d, target)
            lnsf(source, target, log=True)
Ejemplo n.º 32
0
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including CLC, BOWTIE and BWA.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # Side effects: creates a `pairs-<aligner>` working folder and writes the
    # collected source/link-name table to `f.meta`.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    # Pick the pair-statistics parser that matches the aligner output
    if aligner == "clc":
        from jcvi.apps.clc import align
        from jcvi.formats.cas import pairs as ps
    else:
        from jcvi.formats.sam import pairs as ps

    # NOTE(review): `align` is only bound for clc/bowtie/bwa; presumably
    # set_aligner() restricts the choices accordingly -- confirm.
    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    # `fastqs` is the list of read files of one library (kept distinct from
    # the option parser `p`)
    for fastqs, prefix in iter_project(folder, 2):
        samplefq = op.join(work, prefix + ".first.fastq")
        first([str(opts.firstN)] + fastqs + ["-o", samplefq])

        # Alignment runs inside the working folder; always switch back, even
        # when the aligner or the parser raises
        os.chdir(work)
        try:
            align_args = [ref, op.basename(samplefq)]
            outfile, logfile = align(align_args)
            bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        finally:
            os.chdir(cwd)

        median = stats.median
        tag = "MP" if median > 1000 else "PE"
        median = str(median)
        # Keep the two leading characters; bump them by one when any of the
        # remaining digits is non-zero, then pad with zeros to the same width
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))
        for i, xp in enumerate(fastqs):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(lib, prefix.replace("-", ""), i + 1, suffix)
            messages.append("\t".join(str(x) for x in (xp, link)))

    messages = "\n".join(messages)
    write_file("f.meta", messages, tee=True)
Ejemplo n.º 33
0
def build_ml_raxml(alignment, outfile, work_dir=".", **kwargs):
    """
    build maximum likelihood tree of DNA seqs with RAxML

    The alignment is written to `<work_dir>/work/aln.phy` in relaxed PHYLIP
    format, RAxML is run in a `raxml_work` folder next to it, and the
    resulting `RAxML_bipartitions.aln` file is copied to `outfile`.

    Extra keyword arguments are forwarded to RaxmlCommandline.

    Returns the tuple (outfile, phy_file) on success, or None when the
    expected RAxML output file is missing. The `raxml_work` folder is removed
    in both cases.
    """
    work_dir = op.join(work_dir, "work")
    mkdir(work_dir)
    phy_file = op.join(work_dir, "aln.phy")
    # Close the handle explicitly so the file is fully written before RAxML
    # reads it
    with open(phy_file, "w") as handle:
        AlignIO.write(alignment, handle, "phylip-relaxed")

    raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
    mkdir(raxml_work)
    raxml_cl = RaxmlCommandline(cmd=RAXML_BIN("raxmlHPC"),
        sequences=phy_file, algorithm="a", model="GTRGAMMA",
        parsimony_seed=12345, rapid_bootstrap_seed=12345,
        num_replicates=100, name="aln",
        working_dir=raxml_work, **kwargs)

    logging.debug("Building ML tree using RAxML: %s" % raxml_cl)
    raxml_cl()

    tree_file = "{0}/RAxML_bipartitions.aln".format(raxml_work)
    if not op.exists(tree_file):
        # Same text as before, written in a way that works on Python 2 and 3
        sys.stderr.write("***RAxML failed.\n")
        sh("rm -rf %s" % raxml_work, log=False)
        return None
    sh("cp {0} {1}".format(tree_file, outfile), log=False)

    logging.debug("ML tree printed to %s" % outfile)
    sh("rm -rf %s" % raxml_work)

    return outfile, phy_file
Ejemplo n.º 34
0
def SH_raxml(reftree, querytree, phy_file, shout="SH_out.txt"):
    """
    SH test using RAxML

    querytree can be a single tree or a bunch of trees (eg. from bootstrapping)

    RAxML is run in a `raxml_work` folder next to `phy_file`. When the
    "Significantly..." line is found in its standard output, a row made of
    the query tree base name and that line is appended to `shout`; otherwise
    a failure message is printed to stderr.

    Returns the name of the output file. Raises AssertionError when
    `reftree` is not an existing file.
    """
    assert op.isfile(reftree)
    shout = must_open(shout, "a")

    try:
        raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
        mkdir(raxml_work)
        raxml_cl = RaxmlCommandline(cmd=RAXML_BIN("raxmlHPC"),
            sequences=phy_file, algorithm="h", model="GTRGAMMA",
            name="SH", starting_tree=reftree, bipartition_filename=querytree,
            working_dir=raxml_work)

        logging.debug("Running SH test in RAxML: %s" % raxml_cl)
        o, stderr = raxml_cl()
        # hard coded
        try:
            pval = re.search(r'(Significantly.*:.*)', o).group(0)
        except (AttributeError, TypeError):
            # AttributeError: pattern not found (search returned None);
            # TypeError: the captured output is not text
            print("SH test failed.", file=sys.stderr)
        else:
            # Tabs become spaces and percent signs are backslash-escaped
            pval = pval.strip().replace("\t", " ").replace("%", "\\%")
            print("{0}\t{1}".format(op.basename(querytree), pval), file=shout)
            logging.debug("SH p-value appended to %s" % shout.name)
    finally:
        # Release the output handle even when RAxML itself raises
        shout.close()

    return shout.name
Ejemplo n.º 35
0
def link(args):
    """
    %prog link metafile

    Link source to target based on a tabular file.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # Each non-blank row of the metafile must hold exactly two
    # whitespace-separated columns: source and target.
    from jcvi.apps.base import mkdir

    p = OptionParser(link.__doc__)
    p.add_option("--dir",
                 help="Place links in a subdirectory [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    meta, = args
    d = opts.dir
    if d:
        mkdir(d)

    with open(meta) as fp:
        for row in fp:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not row.strip():
                continue
            source, target = row.split()
            source = get_abs_path(source)
            if d:
                target = op.join(d, target)
            lnsf(source, target, log=True)
Ejemplo n.º 36
0
Archivo: ahrd.py Proyecto: zjwang6/jcvi
def batch(args):
    """
    %prog batch splits output

    The arguments are two folders.
    Input FASTA sequences are in splits/.
    Output csv files are in output/.

    Must have folders swissprot/, tair/, trembl/ that contains the respective
    BLAST output. Once finished, you can run, for example:

    $ parallel java -Xmx2g -jar ~/code/AHRD/dist/ahrd.jar {} ::: output/*.yml
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(batch.__doc__)

    # Three score weights per blast program; unpacked below as
    # (bit_score, db_score, ovl_score)
    ahrd_weights = {"blastp": [0.5, 0.3, 0.2], "blastx": [0.6, 0.4, 0.0]}
    blast_progs = tuple(ahrd_weights.keys())

    p.add_option("--path",
                 default="~/code/AHRD/",
                 help="Path where AHRD is installed [default: %default]")
    p.add_option("--blastprog", default="blastp", choices=blast_progs,
                help="Specify the blast program being run. Based on this option," \
                   + " the AHRD parameters (score_weights) will be modified." \
                   + " [default: %default]")
    p.add_option("--iprscan", default=None,
                help="Specify path to InterProScan results file if available." \
                   + " If specified, the yml conf file will be modified" \
                   + " appropriately. [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    splits, output = args
    mkdir(output)

    bit_score, db_score, ovl_score = ahrd_weights[opts.blastprog]

    # These values do not depend on the input file, so compute them once
    resources = op.join(op.expanduser(opts.path), "test/resources")
    interpro = iprscanTemplate.format(opts.iprscan) if opts.iprscan else ""

    for f in glob("{0}/*.fa*".format(splits)):
        fb = op.basename(f).rsplit(".", 1)[0]
        outfile = op.join(output, fb + ".csv")
        # One yml configuration per input file; the handle is closed as soon
        # as the configuration is written
        with open(op.join(output, fb + ".yml"), "w") as fw:
            print(Template.format(resources, fb, f, outfile, bit_score,
                                  db_score, ovl_score, interpro),
                  file=fw)

    if opts.iprscan:
        # Link the InterPro support files into the current folder unless a
        # file or link with that name is already present
        if not op.lexists("interpro.xml"):
            symlink(op.join(iprscan_datadir, "interpro.xml"), "interpro.xml")

        if not op.lexists("interpro.dtd"):
            symlink(op.join(iprscan_datadir, "interpro.dtd"), "interpro.dtd")
Ejemplo n.º 37
0
Archivo: ks.py Proyecto: bennyyu/jcvi
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(calc.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs not in (1, 2):
        print >>sys.stderr, "Incorrect arguments"
        sys.exit(not p.print_help())

    # The CDS file always comes last; the protein file, when given, is first
    dna_file = args[-1]
    protein_file = args[0] if nargs == 2 else None

    output_h = must_open(opts.outfile, "w")
    output_h.write("name,dS-yn,dN-yn,dS-ng,dN-ng\n")
    work_dir = op.join(os.getcwd(), "syn_analysis")
    mkdir(work_dir)

    # Without a protein file, derive one from the CDS sequences
    if not protein_file:
        protein_file = translate_dna(dna_file)

    prot_iterator = SeqIO.parse(open(protein_file), "fasta")
    dna_iterator = SeqIO.parse(open(dna_file), "fasta")
    # Passing each iterator twice makes zip() consume two consecutive records
    # per step, i.e. adjacent records form one pair
    quads = zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator)
    for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in quads:

        print >>sys.stderr, "--------", p_rec_1.name, p_rec_2.name
        align_fasta = clustal_align_protein(p_rec_1, p_rec_2, work_dir)
        mrtrans_fasta = run_mrtrans(align_fasta, n_rec_1, n_rec_2, work_dir)
        if not mrtrans_fasta:
            continue

        rates = find_synonymous(mrtrans_fasta, work_dir)
        ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = rates
        if ds_subs_yn is None:
            continue

        pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
        fields = (pair_name, ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng)
        output_h.write("%s\n" % ",".join(str(x) for x in fields))
        # Flush after every pair so partial results are visible right away
        output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
Ejemplo n.º 38
0
def correct(args):
    """
    %prog correct *.fastq

    Correct the fastqfile and generated corrected fastqfiles. This calls
    assembly.allpaths.prepare() to generate input files for ALLPATHS-LG. The
    naming convention for your fastqfiles are important, and are listed below.

    By default, this will correct all PE reads, and remove duplicates of all MP
    reads, and results will be placed in `frag_reads.corr.{pairs,frags}.fastq`
    and `jump_reads.corr.{pairs,frags}.fastq`.
    """
    # NOTE: the docstring above (plus FastqNamings) is the usage text.
    from jcvi.assembly.allpaths import prepare
    from jcvi.assembly.base import FastqNamings

    p = OptionParser(correct.__doc__ + FastqNamings)
    p.add_option("--nofragsdedup", default=False, action="store_true",
                 help="Don't deduplicate the fragment reads [default: %default]")
    p.add_option("--cpus", default=32, type="int",
                 help="Number of threads to run [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(not p.print_help())

    fastq = args
    frag_tag = "frag_reads"
    jump_tag = "jump_reads"

    prepare(["Unknown"] + fastq + ["--norun"])

    datadir = "data"
    mkdir(datadir)
    fullpath = op.join(os.getcwd(), datadir)
    nthreads = " NUM_THREADS={0}".format(opts.cpus)
    # The quality offset is guessed from the first fastq file only
    phred64 = guessoffset([args[0]]) == 64

    frag_fastb = "{0}/{1}_orig.fastb".format(datadir, frag_tag)
    jump_fastb = "{0}/{1}_orig.fastb".format(datadir, jump_tag)

    # Only regenerate the ALLPATHS inputs when need_update() says so
    if need_update(fastq, frag_fastb):
        pieces = ["PrepareAllPathsInputs.pl DATA_DIR={0} HOSTS='{1}'".
                  format(fullpath, opts.cpus)]
        if phred64:
            pieces.append(" PHRED_64=True")
        sh("".join(pieces))

    if op.exists(frag_fastb):
        correct_frag(datadir, frag_tag, frag_fastb, nthreads,
                     dedup=not opts.nofragsdedup)

    if op.exists(jump_fastb):
        correct_jump(datadir, jump_tag, jump_fastb, nthreads)
Ejemplo n.º 39
0
def stats(args):
    """
    %prog stats infile.gff

    Collect gene statistics based on gff file. There are some terminology issues
    here and so normally we call "gene" are actually mRNA, and sometimes "exon"
    are actually CDS, but they are configurable.

    The numbers are written to text file in four separate folders,
    corresponding to the four metrics:

    Exon length, Intron length, Gene length, Exon count

    With data written to disk then you can run %prog histogram
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(stats.__doc__)
    p.add_option("--gene", default="mRNA",
                 help="The gene type [default: %default]")
    p.add_option("--exon", default="CDS",
                 help="The exon type [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gff_file, = args
    g = make_index(gff_file)
    exon_lengths = []
    intron_lengths = []
    gene_lengths = []
    exon_counts = []
    for feat in g.features_of_type(opts.gene):
        # Children of the requested exon type only
        exons = [(c.chrom, c.start, c.stop)
                 for c in g.children(feat.id, 1)
                 if c.featuretype == opts.exon]
        introns = range_interleave(exons)
        # Coordinates are treated as inclusive, hence the +1
        feat_exon_lengths = [stop - start + 1 for _, start, stop in exons]
        feat_intron_lengths = [stop - start + 1 for _, start, stop in introns]
        exon_lengths += feat_exon_lengths
        intron_lengths += feat_intron_lengths
        # "Gene length" here is the summed exon length (introns excluded)
        gene_lengths.append(sum(feat_exon_lengths))
        exon_counts.append(len(feat_exon_lengths))

    # Order matters: summaries are paired positionally with `metrics`
    summaries = tuple(SummaryStats(values) for values in
                      (exon_lengths, intron_lengths, gene_lengths, exon_counts))
    for x, title in zip(summaries, metrics):
        x.title = title
        print(x, file=sys.stderr)

    # One folder per metric, one text file per input gff prefix
    prefix = gff_file.split(".")[0]
    for x in summaries:
        dirname = x.title
        mkdir(dirname)
        txtfile = op.join(dirname, prefix + ".txt")
        x.tofile(txtfile)
Ejemplo n.º 40
0
def batch(args):
    """
    %prog batch splits output

    The arguments are two folders.
    Input FASTA sequences are in splits/.
    Output csv files are in output/.

    Must have folders swissprot/, tair/, trembl/ that contains the respective
    BLAST output. Once finished, you can run, for example:

    $ parallel java -Xmx2g -jar ~/code/AHRD/dist/ahrd.jar {} ::: output/*.yml
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(batch.__doc__)

    # Three score weights per blast program; unpacked below as
    # (bit_score, db_score, ovl_score)
    ahrd_weights = { "blastp": [0.5, 0.3, 0.2],
                     "blastx": [0.6, 0.4, 0.0]
                   }
    blast_progs = tuple(ahrd_weights.keys())

    p.add_option("--path", default="~/code/AHRD/",
                 help="Path where AHRD is installed [default: %default]")
    p.add_option("--blastprog", default="blastp", choices=blast_progs,
                help="Specify the blast program being run. Based on this option," \
                   + " the AHRD parameters (score_weights) will be modified." \
                   + " [default: %default]")
    p.add_option("--iprscan", default=None,
                help="Specify path to InterProScan results file if available." \
                   + " If specified, the yml conf file will be modified" \
                   + " appropriately. [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    splits, output = args
    mkdir(output)

    bit_score, db_score, ovl_score = ahrd_weights[opts.blastprog]

    # These values do not depend on the input file, so compute them once
    resources = op.join(op.expanduser(opts.path), "test/resources")
    interpro = iprscanTemplate.format(opts.iprscan) if opts.iprscan else ""

    for f in glob("{0}/*.fasta".format(splits)):
        fb = op.basename(f).rsplit(".", 1)[0]
        outfile = op.join(output, fb + ".csv")
        conf = Template.format(resources, fb, f, outfile, bit_score, db_score,
                               ovl_score, interpro)
        # One yml configuration per input file; write() plus an explicit
        # newline emits the same bytes as the print statement did and also
        # works on Python 3
        with open(op.join(output, fb + ".yml"), "w") as fw:
            fw.write(conf + "\n")

    if opts.iprscan:
        # Link the InterPro support files into the current folder unless a
        # file or link with that name is already present
        if not op.lexists("interpro.xml"):
            symlink(op.join(iprscan_datadir, "interpro.xml"), "interpro.xml")

        if not op.lexists("interpro.dtd"):
            symlink(op.join(iprscan_datadir, "interpro.dtd"), "interpro.dtd")
Ejemplo n.º 41
0
    def make_link(self, firstN=0):
        """
        Make `self.link` available inside the `self.genome` folder.

        With a positive `firstN`, `first()` is asked to write its output for
        `self.fastq` to `self.link`; otherwise `self.link` becomes a symlink
        to the absolute path of `self.fastq`.
        """
        mkdir(self.genome)
        if firstN > 0:
            outopt = "--outfile={0}".format(self.link)
            first([str(firstN), self.fastq, outopt])
        else:
            # Replace a symlink left over from an earlier run
            if op.islink(self.link):
                os.unlink(self.link)
            source = get_abs_path(self.fastq)
            os.symlink(source, self.link)
Ejemplo n.º 42
0
    def make_link(self, firstN=0):
        """
        Create the `self.genome` folder and populate `self.link` in it:
        through `first()` when `firstN` is positive, through a symlink to
        `self.fastq` otherwise.
        """
        mkdir(self.genome)

        subsample = firstN > 0
        if subsample:
            first_args = [str(firstN), self.fastq]
            first_args.append("--outfile={0}".format(self.link))
            first(first_args)
            return

        # Drop a pre-existing symlink so os.symlink() does not collide with it
        stale = op.islink(self.link)
        if stale:
            os.unlink(self.link)
        os.symlink(get_abs_path(self.fastq), self.link)
Ejemplo n.º 43
0
def mergecn(args):
    """
    %prog mergecn FACE.csv

    Compile matrix of GC-corrected copy numbers. Place a bunch of folders in
    csv file. Each folder will be scanned, one chromosomes after another.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # Outputs, per sequence id in `allsomes`: `beta/<seqid>.beta`,
    # `beta/<seqid>.std` and `<seqid>.bin` in the current folder.
    p = OptionParser(mergecn.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (csvfile, ) = args
    # Normalize each listed folder to the name without "-cn" / trailing slash
    with open(csvfile) as fp:
        samples = [x.replace("-cn", "").strip().strip("/") for x in fp]
    betadir = "beta"
    mkdir(betadir)
    for seqid in allsomes:
        names = [
            op.join(s + "-cn", "{}.{}.cn".format(op.basename(s), seqid))
            for s in samples
        ]
        # `np.float` was a plain alias of the builtin float (i.e. float64) and
        # has been removed from NumPy; spell the dtype out explicitly
        arrays = [np.fromfile(name, dtype=np.float64) for name in names]
        shapes = [x.shape[0] for x in arrays]
        med_shape = np.median(shapes)
        # Keep only the samples whose array length equals the median length
        arrays = [x for x in arrays if x.shape[0] == med_shape]
        ploidy = 2 if seqid not in ("chrY", "chrM") else 1
        if seqid in sexsomes:
            # Median of the positive values, one per sample
            chr_med = [np.median([x for x in a if x > 0]) for a in arrays]
            chr_med = np.array(chr_med)
            idx = get_kmeans(chr_med, k=2)
            zero_med = np.median(chr_med[idx == 0])
            one_med = np.median(chr_med[idx == 1])
            logging.debug("K-means with {} c0:{} c1:{}".format(
                seqid, zero_med, one_med))
            higher_idx = 1 if one_med > zero_med else 0
            # Use the higher mean coverage component
            arrays = np.array(arrays)[idx == higher_idx]
        # Stack the samples as rows of a 2-D matrix
        arrays = [[x] for x in arrays]
        ar = np.concatenate(arrays)
        print(seqid, ar.shape)
        rows, columns = ar.shape
        beta = []
        std = []
        # Per column: median across samples and std/mean across samples
        for j in range(columns):
            a = ar[:, j]
            beta.append(np.median(a))
            std.append(np.std(a) / np.mean(a))
        beta = np.array(beta) / ploidy
        betafile = op.join(betadir, "{}.beta".format(seqid))
        beta.tofile(betafile)
        stdfile = op.join(betadir, "{}.std".format(seqid))
        std = np.array(std)
        std.tofile(stdfile)
        logging.debug("Written to `{}`".format(betafile))
        ar.tofile("{}.bin".format(seqid))
Ejemplo n.º 44
0
def minimap(args):
    """
    %prog minimap ref.fasta query.fasta

    Wrap minimap2 aligner using query against sequences. When query and ref
    is the same, we are in "self-scan" mode (e.g. useful for finding internal
    duplications resulted from mis-assemblies).
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.grid import MakeManager
    from jcvi.formats.fasta import Fasta

    p = OptionParser(minimap.__doc__)
    p.add_option(
        "--chunks",
        type="int",
        default=2000000,
        help="Split ref.fasta into chunks of size in self-scan mode",
    )
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    chunks = opts.chunks
    outdir = opts.outdir
    # Only the self-scan mode (ref == query) is implemented
    if ref != query:
        raise NotImplementedError

    # "self-scan" mode
    # build faidx (otherwise, parallel make may complain)
    sh("samtools faidx {}".format(ref))
    f = Fasta(ref)
    mkdir(outdir)
    mm = MakeManager()
    for name, size in f.itersizes():
        # `end` walks the chunk boundaries below `size`; each window starts
        # one chunk before its end
        for end in range(chunks, size, chunks):
            begin = end - chunks + 1
            stem = op.join(outdir, "{}_{}_{}".format(name, begin, end))
            fafile = stem + ".fa"
            paffile = stem + ".paf"
            epsfile = stem + ".eps"

            # Extract the window, align it against itself, then plot it
            extract = "samtools faidx {} {}:{}-{} -o {}".format(
                ref, name, begin, end, fafile)
            mm.add(ref, fafile, extract)

            selfalign = "minimap2 -P {} {} > {}".format(fafile, fafile, paffile)
            mm.add(fafile, paffile, selfalign)

            plot = "minidot {} > {}".format(paffile, epsfile)
            mm.add(paffile, epsfile, plot)

    mm.write()
Ejemplo n.º 45
0
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--notreds", default=False, action="store_true",
                 help="Remove TREDs from the bed file")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    # Index folder is named after the FASTA file name up to its first dot
    pf = fastafile.split(".")[0]
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        # Write a filtered copy of the bed file that keeps only the first row
        # seen for every `longname`.
        # NOTE(review): the option help says "Remove TREDs" while the code
        # de-duplicates by name -- confirm this is the intended filter.
        newbedfile = trfbed + ".new"
        retained = total = 0
        seen = set()
        # Both handles are closed even if a row fails to parse
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                r = STRLine(row)
                total += 1
                name = r.longname
                if name in seen:
                    continue
                seen.add(name)
                # Same bytes as the former print statement, but portable
                newbed.write(str(r) + "\n")
                retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # The actual index building is delegated to a makefile with three rules
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    # NOTE(review): the rule copies `newbedfile` but declares `trfbed` as its
    # prerequisite; kept as is -- confirm which file is intended.
    mm.add(trfbed, infofile, cmd)
    mm.write()
Ejemplo n.º 46
0
def augustus(args):
    """
    %prog augustus species gffile fastafile

    Train AUGUSTUS model given gffile and fastafile. Whole procedure taken from:
    <http://www.molecularevolution.org/molevolfiles/exercises/augustus/training.html>
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # Side effects: removes and recreates `<augustus_home>/config/species/
    # <species>`, works inside an `augustus` folder under the current
    # directory and finally copies the species folder into it.
    p = OptionParser(augustus.__doc__)
    p.add_option(
        "--autotrain",
        default=False,
        action="store_true",
        help="Run autoAugTrain.pl to iteratively train AUGUSTUS",
    )
    p.set_home("augustus")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    # Resolve the inputs before changing directory below
    gffile = os.path.abspath(gffile)
    fastafile = os.path.abspath(fastafile)
    mhome = opts.augustus_home
    augdir = "augustus"
    target = "{0}/config/species/{1}".format(mhome, species)
    config_path = "{0}/config".format(mhome)

    cwd = os.getcwd()
    mkdir(augdir)
    os.chdir(augdir)
    try:
        if op.exists(target):
            logging.debug("Removing existing target `{0}`".format(target))
            sh("rm -rf {0}".format(target))

        sh("{0}/scripts/new_species.pl --species={1} --AUGUSTUS_CONFIG_PATH={2}".
           format(mhome, species, config_path))
        sh("{0}/scripts/gff2gbSmallDNA.pl {1} {2} 1000 raw.gb".format(
            mhome, gffile, fastafile))
        sh("{0}/bin/etraining --species={1} raw.gb 2> train.err".format(
            mhome, species))
        # Sequence names reported in the etraining errors become the list of
        # genes to filter out
        sh(r"cat train.err | perl -pe 's/.*in sequence (\S+): .*/$1/' > badgenes.lst"
           )
        sh("{0}/scripts/filterGenes.pl badgenes.lst raw.gb > training.gb".format(
            mhome))
        sh("grep -c LOCUS raw.gb training.gb")

        # autoAugTrain failed to execute, disable for now
        if opts.autotrain:
            sh("rm -rf {0}".format(target))
            sh("{0}/scripts/autoAugTrain.pl --trainingset=training.gb --species={1}"
               .format(mhome, species))
    finally:
        # Always return to the starting directory, even when a step raises
        os.chdir(cwd)

    sh("cp -r {0} augustus/".format(target))
Ejemplo n.º 47
0
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq",
                 action="store_true",
                 default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Third positional argument is the output folder / index prefix
    trfbed, fastafile, pf = args
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        # Rewrite the bed file with the valid entries yielded by
        # iter_exact_str() for every input row
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        retained = total = 0
        # Both handles are closed even if a row fails to parse
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                s = STRLine(row)
                total += 1
                for ns in s.iter_exact_str(genome):
                    if not ns.is_valid():
                        continue
                    # Same bytes as the former print statement, but portable
                    newbed.write(str(ns) + "\n")
                    retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # The actual index building is delegated to a makefile with three rules
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    # The info file is a copy of the original (unfiltered) bed file
    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
Ejemplo n.º 48
0
def correct(args):
    """
    %prog correct *.fastq

    Correct the fastqfile and generated corrected fastqfiles. This calls
    assembly.allpaths.prepare() to generate input files for ALLPATHS-LG. The
    naming convention for your fastqfiles are important, and are listed below.

    By default, this will correct all PE reads, and remove duplicates of all MP
    reads, and results will be placed in `frag_reads.corr.{pairs,frags}.fastq`
    and `jump_reads.corr.{pairs,frags}.fastq`.
    """
    # NOTE: the docstring above (plus FastqNamings) is the usage text.
    from jcvi.assembly.allpaths import prepare
    from jcvi.assembly.base import FastqNamings

    p = OptionParser(correct.__doc__ + FastqNamings)
    p.add_option("--nofragsdedup", default=False, action="store_true",
                 help="Don't deduplicate the fragment reads [default: %default]")
    p.add_option("--cpus", default=32, type="int",
                 help="Number of threads to run [default: %default]")
    p.add_option("--phred64", default=False, action="store_true",
                 help="Reads are all phred 64 offset [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(not p.print_help())

    fastq = args
    frag_tag = "frag_reads"
    jump_tag = "jump_reads"

    prepare(["Unknown"] + fastq + ["--norun"])

    datadir = "data"
    mkdir(datadir)
    fullpath = op.join(os.getcwd(), datadir)
    nthreads = " NUM_THREADS={0}".format(opts.cpus)

    frag_fastb = "{0}/{1}_orig.fastb".format(datadir, frag_tag)
    jump_fastb = "{0}/{1}_orig.fastb".format(datadir, jump_tag)

    # Only regenerate the ALLPATHS inputs when need_update() says so
    if need_update(fastq, frag_fastb):
        # Unlike a guessed offset, phred64 is taken from the command line here
        extra = " PHRED_64=True" if opts.phred64 else ""
        sh("PrepareAllPathsInputs.pl DATA_DIR={0} HOSTS='{1}'".
           format(fullpath, opts.cpus) + extra)

    if op.exists(frag_fastb):
        correct_frag(datadir, frag_tag, frag_fastb, nthreads,
                     dedup=not opts.nofragsdedup)

    if op.exists(jump_fastb):
        correct_jump(datadir, jump_tag, jump_fastb, nthreads)
Ejemplo n.º 49
0
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # Side effects: changes the current directory to the work directory (and
    # stays there) and writes `STR.ids` in it when that file is missing.
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option(
        "--nofilter",
        default=False,
        action="store_true",
        help="Do not filter the variants",
    )
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples, ) = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    # Resolve the samples path against the directory we started from
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    if samples.endswith((".vcf", ".vcf.gz")):
        # A single VCF was given directly instead of a list file
        vcffiles = [samples]
    else:
        fp = must_open(samples)
        try:
            # One VCF per line; blank lines are ignored
            vcffiles = [x.strip() for x in fp if x.strip()]
        finally:
            fp.close()
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        with open(stridsfile, "w") as fw:
            print("\n".join(uids), file=fw)

    run_args = [(x, filtered, cleanup, store) for x in vcffiles]
    if not run_args:
        # Pool(processes=0) would raise ValueError; nothing to do anyway
        logging.error("No VCF files found in `{}`".format(samples))
        return

    cpus = min(opts.cpus, len(run_args))
    pool = Pool(processes=cpus)
    try:
        # get() blocks until all jobs are done and re-raises worker errors
        pool.map_async(run_compile, run_args).get()
    finally:
        # Release the worker processes whether or not a job failed
        pool.close()
        pool.join()
Ejemplo n.º 50
0
Archivo: cnv.py Proyecto: xuanblo/jcvi
def mergecn(args):
    """
    %prog mergecn FACE.csv

    Compile matrix of GC-corrected copy numbers. Place a bunch of folders in
    csv file. Each folder will be scanned, one chromosomes after another.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    #
    # For every seqid in `allsomes` this writes:
    #   beta/<seqid>.beta  - per-column median across samples, divided by ploidy
    #   beta/<seqid>.std   - per-column std / mean across samples
    #   <seqid>.bin        - the stacked sample-by-column matrix
    p = OptionParser(mergecn.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (csvfile,) = args
    with open(csvfile) as fp:
        samples = [x.replace("-cn", "").strip().strip("/") for x in fp]
    betadir = "beta"
    mkdir(betadir)
    for seqid in allsomes:
        names = [
            op.join(s + "-cn", "{}.{}.cn".format(op.basename(s), seqid))
            for s in samples
        ]
        # `np.float` was only an alias of the builtin `float` and has been
        # removed from recent NumPy releases; the builtin is equivalent.
        arrays = [np.fromfile(name, dtype=float) for name in names]

        # Keep only the samples whose vector length equals the median length.
        # NOTE(review): with an even number of samples the median can fall
        # between two lengths, leaving nothing to concatenate - confirm
        # whether that case can occur in practice.
        shapes = [x.shape[0] for x in arrays]
        med_shape = np.median(shapes)
        arrays = [x for x in arrays if x.shape[0] == med_shape]

        ploidy = 2 if seqid not in ("chrY", "chrM") else 1
        if seqid in sexsomes:
            # Split samples into two clusters by their median positive value
            # and keep the cluster with the higher median.
            chr_med = np.array([np.median(a[a > 0]) for a in arrays])
            idx = get_kmeans(chr_med, k=2)
            zero_med = np.median(chr_med[idx == 0])
            one_med = np.median(chr_med[idx == 1])
            logging.debug("K-means with {} c0:{} c1:{}"
                          .format(seqid, zero_med, one_med))
            higher_idx = 1 if one_med > zero_med else 0
            # Use the higher mean coverage component
            arrays = np.array(arrays)[idx == higher_idx]

        # Stack the 1-D sample vectors into a (samples x columns) matrix.
        ar = np.concatenate([[x] for x in arrays])
        print("{} {}".format(seqid, ar.shape))

        # Column-wise statistics computed with axis reductions instead of a
        # Python loop over every column.
        beta = np.median(ar, axis=0) / ploidy
        std = np.std(ar, axis=0) / np.mean(ar, axis=0)

        betafile = op.join(betadir, "{}.beta".format(seqid))
        beta.tofile(betafile)
        stdfile = op.join(betadir, "{}.std".format(seqid))
        std.tofile(stdfile)
        logging.debug("Written to `{}`".format(betafile))
        ar.tofile("{}.bin".format(seqid))
Ejemplo n.º 51
0
def prepare(args):
    """
    %prog prepare countfolder families

    Parse list of count files and group per family into families folder.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    p = OptionParser(prepare.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    counts, families = args
    countfiles = glob(op.join(counts, "*.count"))

    # Index every sample by (tissue, individual).
    countsdb = defaultdict(list)
    for c in countfiles:
        rs = RiceSample(c)
        countsdb[(rs.tissue, rs.ind)].append(rs)

    # Merge duplicates - data sequenced in different batches
    key = lambda x: (x.label, x.rep)
    for (tissue, ind), rs in sorted(countsdb.items()):
        rs.sort(key=key)
        for i, ri in enumerate(rs):
            if not ri.working:
                continue
            # Fold every later sample with the same (label, rep) into `ri`
            # and retire it.
            for rj in rs[i + 1:]:
                if key(ri) != key(rj):
                    continue
                ri.merge(rj)
                rj.working = False
        countsdb[(tissue, ind)] = [x for x in rs if x.working]

    # Group into families
    # Create the folder that is actually written to below, not a hard-coded
    # "families" directory.
    mkdir(families)
    for (tissue, ind), r in sorted(countsdb.items()):
        r = list(r)
        if not r or r[0].label != "F1":
            continue
        P1, P2 = r[0].P1, r[0].P2
        # .get() avoids inserting empty entries for parents with no samples.
        P1 = countsdb.get((tissue, P1), [])
        P2 = countsdb.get((tissue, P2), [])
        rs = P1 + P2 + r
        # Group codes: 1 = first parent, 2 = second parent, 3 = F1 samples.
        groups = [1] * len(P1) + [2] * len(P2) + [3] * len(r)

        outfile = "-".join((tissue, ind))
        merge_counts(rs, op.join(families, outfile))
        groupsfile = outfile + ".groups"
        with open(op.join(families, groupsfile), "w") as fw:
            fw.write(",".join(str(x) for x in groups) + "\n")
Ejemplo n.º 52
0
def prepare(args):
    """
    %prog prepare countfolder families

    Parse list of count files and group per family into families folder.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    p = OptionParser(prepare.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    counts, families = args
    countfiles = glob(op.join(counts, "*.count"))

    # Index every sample by (tissue, individual).
    countsdb = defaultdict(list)
    for c in countfiles:
        rs = RiceSample(c)
        countsdb[(rs.tissue, rs.ind)].append(rs)

    # Merge duplicates - data sequenced in different batches
    key = lambda x: (x.label, x.rep)
    for (tissue, ind), rs in sorted(countsdb.items()):
        rs.sort(key=key)
        for i, ri in enumerate(rs):
            if not ri.working:
                continue
            # Fold every later sample with the same (label, rep) into `ri`
            # and retire it.
            for rj in rs[i + 1:]:
                if key(ri) != key(rj):
                    continue
                ri.merge(rj)
                rj.working = False
        countsdb[(tissue, ind)] = [x for x in rs if x.working]

    # Group into families
    # Create the folder that is actually written to below, not a hard-coded
    # "families" directory.
    mkdir(families)
    for (tissue, ind), r in sorted(countsdb.items()):
        r = list(r)
        if not r or r[0].label != "F1":
            continue
        P1, P2 = r[0].P1, r[0].P2
        # .get() avoids inserting empty entries for parents with no samples.
        P1 = countsdb.get((tissue, P1), [])
        P2 = countsdb.get((tissue, P2), [])
        rs = P1 + P2 + r
        # Group codes: 1 = first parent, 2 = second parent, 3 = F1 samples.
        groups = [1] * len(P1) + [2] * len(P2) + [3] * len(r)

        outfile = "-".join((tissue, ind))
        merge_counts(rs, op.join(families, outfile))
        groupsfile = outfile + ".groups"
        with open(op.join(families, groupsfile), "w") as fw:
            fw.write(",".join(str(x) for x in groups) + "\n")
Ejemplo n.º 53
0
Archivo: str.py Proyecto: Hensonmw/jcvi
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq", action="store_true", default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    trfbed, fastafile, pf = args
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        # Rewrite the bed file, keeping only the valid exact STRs found by
        # scanning the genome sequence.
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        retained = total = 0
        # Both handles are closed on exit, including when a row fails.
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                s = STRLine(row)
                total += 1
                for ns in s.iter_exact_str(genome):
                    if not ns.is_valid():
                        continue
                    newbed.write(str(ns) + "\n")
                    retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # Register the three build steps with the MakeManager and write them out.
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
Ejemplo n.º 54
0
def omg(args):
    """
    %prog omg weightsfile

    Run Sankoff's OMG algorithm to get orthologs. Download OMG code at:
    <http://137.122.149.195/IsbraSoftware/OMGMec.html>

    This script only writes the partitions, but not launch OMGMec. You may need to:

    $ parallel "java -cp ~/code/OMGMec TestOMGMec {} 4 > {}.out" ::: work/gf?????

    Then followed by omgparse() to get the gene lists.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    p = OptionParser(omg.__doc__)

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    weightsfiles = args
    groupfile = group(weightsfiles + ["--outfile=groups"])

    weights = get_weights(weightsfiles)
    info = get_info()

    work = "work"
    mkdir(work)
    # One output file work/gfNNNNN per line (gene group) of the group file.
    with open(groupfile) as fp:
        for i, row in enumerate(fp):
            gf = op.join(work, "gf{0:05d}".format(i))
            genes = row.rstrip().split(",")
            # Set for O(1) membership tests inside the pair loop.
            members = set(genes)

            # Collect one text stanza per weighted pair whose partner is also
            # in this group; joined once at the end.
            stanzas = []
            for gene in genes:
                for a, b, c in weights[gene]:
                    if b not in members:
                        continue
                    stanzas.append(
                        "weight {0}".format(c) + "\n"
                        + info[a] + "\n"
                        + info[b] + "\n\n"
                    )

            header = "a group of genes  :length ={0}".format(len(stanzas))
            # Open only once the content is ready, so a lookup failure does
            # not leave a dangling handle behind.
            with open(gf, "w") as fw:
                fw.write(header + "\n")
                fw.write("".join(stanzas) + "\n")
Ejemplo n.º 55
0
def omg(args):
    """
    %prog omg weightsfile

    Run Sankoff's OMG algorithm to get orthologs. Download OMG code at:
    <http://137.122.149.195/IsbraSoftware/OMGMec.html>

    This script only writes the partitions, but not launch OMGMec. You may need to:

    $ parallel "java -cp ~/code/OMGMec TestOMGMec {} 4 > {}.out" ::: work/gf?????

    Then followed by omgparse() to get the gene lists.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    p = OptionParser(omg.__doc__)

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    weightsfiles = args
    groupfile = group(weightsfiles + ["--outfile=groups"])

    weights = get_weights(weightsfiles)
    info = get_info()

    work = "work"
    mkdir(work)
    # One output file work/gfNNNNN per line (gene group) of the group file.
    with open(groupfile) as fp:
        for i, row in enumerate(fp):
            gf = op.join(work, "gf{0:05d}".format(i))
            genes = row.rstrip().split(",")
            # Set for O(1) membership tests inside the pair loop.
            members = set(genes)

            # Collect one text stanza per weighted pair whose partner is also
            # in this group; joined once at the end.
            stanzas = []
            for gene in genes:
                for a, b, c in weights[gene]:
                    if b not in members:
                        continue
                    stanzas.append(
                        "weight {0}".format(c) + "\n"
                        + info[a] + "\n"
                        + info[b] + "\n\n"
                    )

            header = "a group of genes  :length ={0}".format(len(stanzas))
            # Open only once the content is ready, so a lookup failure does
            # not leave a dangling handle behind.
            with open(gf, "w") as fw:
                fw.write(header + "\n")
                fw.write("".join(stanzas) + "\n")
Ejemplo n.º 56
0
    def _get_records(self):
        """
        Download GenBank records for the ids in `self.idfile` into `gb/`.

        If `mkdir` reports that it did not create the folder, the current
        `gb` is moved to `gb_old` (replacing any earlier `gb_old`) and the
        folder is created again. Returns the folder name.
        """
        gbdir = "gb"
        if not mkdir(gbdir):
            # presumably a false return means the folder already exists -
            # confirm against mkdir(); either way the old content is set aside
            sh("rm -rf {0}_old; mv -f {0} {0}_old".format(gbdir))
            # Call mkdir() outside of an `assert` statement: asserts are
            # stripped under `python -O`, which would silently skip creating
            # the folder. AssertionError is kept for backward compatibility.
            if not mkdir(gbdir):
                raise AssertionError(
                    "Failed to re-create folder `{0}`".format(gbdir))

        entrez([
            self.idfile,
            "--format=gb",
            "--database=nuccore",
            "--outdir={0}".format(gbdir),
        ])

        logging.debug('GenBank records written to {0}.'.format(gbdir))
        return gbdir
Ejemplo n.º 57
0
def write_lst(bedfile):
    """
    Write one `<seqid>.lst` file per seqid of `bedfile`.

    The files go into a folder named after the part of the bed file's
    basename before the first dot. Each line is a feature's `accn` (spaces
    removed) immediately followed by its `strand`.

    Returns a tuple `(pf, stanza)`: the folder name and a list of
    `(seqid, filename)` pairs in the order yielded by `Bed.sub_beds()`.
    """
    pf = op.basename(bedfile).split(".")[0]
    mkdir(pf)
    bed = Bed(bedfile)
    stanza = []
    for seqid, bs in bed.sub_beds():
        fname = op.join(pf, "{0}.lst".format(seqid))
        # `with` closes the handle even if a record fails to format.
        with open(fname, "w") as fw:
            fw.writelines(
                "{0}{1}\n".format(b.accn.replace(" ", ""), b.strand)
                for b in bs
            )
        stanza.append((seqid, fname))
    return pf, stanza
Ejemplo n.º 58
0
def error(args):
    """
    %prog error backup_folder

    Find all errors in ../5-consensus/*.err and pull the error unitigs into
    backup/ folder.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    p = OptionParser(error.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (backup_folder,) = args
    mkdir(backup_folder)

    # `working` and `failed` are module-level marker strings; the failure
    # marker is matched case-insensitively.
    failed_marker = failed.upper()

    seen = set()
    # errors.log collects one "partID<TAB>unitigID" line per failed unitig.
    with open("errors.log", "w") as fw:
        for g in sorted(glob("../5-consensus/*.err")):
            if "partitioned" in g:
                continue

            # Partition id is the integer after the last "_" of the file stem.
            partID = op.basename(g).rsplit(".err", 1)[0]
            partID = int(partID.split("_")[-1])

            # Reset per file so a failure line can never be attributed to a
            # unitig from a previous file (or hit an unbound name).
            unitigID = None
            with open(g) as fp:
                for row in fp:
                    if row.startswith(working):
                        unitigID = row.split("(")[0].split()[-1]
                        continue

                    if failed_marker not in row.upper():
                        continue

                    if unitigID is None:
                        logging.debug("Failure before any unitig in `{0}`, "
                                      "skipped".format(g))
                        continue

                    uu = (partID, unitigID)
                    if uu in seen:
                        continue
                    seen.add(uu)

                    fw.write("{0}\t{1}\n".format(partID, unitigID))

                    # Extract the unitig and move it to the backup folder.
                    unitigfile = pull([str(partID), unitigID])
                    sh("mv {0} {1}".format(unitigfile, backup_folder))

    logging.debug("A total of {0} unitigs saved to {1}."
                  .format(len(seen), backup_folder))
Ejemplo n.º 59
0
Archivo: str.py Proyecto: ascendo/jcvi
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    from multiprocessing import Pool

    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38,hg38-named",
                 help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples,) = args
    workdir = opts.workdir
    dbs = opts.db.split(",")

    # Pin a local samples file to an absolute path before changing directory;
    # otherwise a relative path would be looked up inside `workdir`.
    # Anything that is not an existing local file is passed through untouched.
    if op.exists(samples):
        samples = op.abspath(samples)
    mkdir(workdir)
    os.chdir(workdir)

    stridsfile = "STR.ids"
    vcffiles = [x.strip() for x in must_open(samples)]

    # The id files are only generated once per working directory.
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        with open(stridsfile, "w") as fw:
            fw.write("\n".join(uids) + "\n")

        # Generate two alleles
        dipuids = []
        for uid in uids:
            dipuids.extend([uid + ".1", uid + ".2"])
        with open("header.ids", "w") as fw:
            fw.write(",".join(dipuids) + "\n")

    run_args = [(x, opts.store, opts.cleanup) for x in vcffiles]
    pool = Pool(processes=opts.cpus)
    try:
        # .get() blocks until every job finished and re-raises worker errors.
        pool.map_async(run, run_args).get()
    finally:
        # Always release the worker processes, even when a job failed.
        pool.terminate()
        pool.join()