Exemple #1
0
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output; developer notes live in comments instead.
    #
    # For every file returned by iglob(outdir, "*.fa,*.fasta") two rules are
    # registered with MakeManager: fasta -> .dat (trf run) and .dat -> .bed
    # (filter + reformat).  A final rule concatenates all BED files.
    from jcvi.apps.base import iglob

    parser = OptionParser(trf.__doc__)
    parser.add_option("--mismatch", default=31, type="int",
                      help="Mismatch and gap penalty")
    parser.add_option("--minscore", default=MINSCORE, type="int",
                      help="Minimum score to report")
    parser.add_option("--period", default=6, type="int",
                      help="Maximum period to report")
    parser.add_option("--telomeres", default=False, action="store_true",
                      help="Run telomere search: minscore=140 period=7")
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (outdir,) = args
    mm = MakeManager()
    if opts.telomeres:
        # Telomere preset overrides whatever was given on the command line.
        opts.minscore = 140
        opts.period = 7

    trf_settings = "2 {0} {0} 80 10 {1} {2}".format(
        opts.mismatch, opts.minscore, opts.period)
    params = trf_settings.split()
    # Loop-invariant pieces: CLI arguments for trf and the suffix that is
    # appended to the FASTA basename to form the .dat file name.
    cli_params = " ".join(params)
    dat_suffix = "." + ".".join(params) + ".dat"

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        fasta_name = op.basename(fastafile)
        # Prefix is everything before the first dot of the basename.
        pf = fasta_name.partition(".")[0]
        datfile = fasta_name + dat_suffix
        per_file_bed = pf + ".trf.bed"

        run_trf = "trf {0} {1} -d -h".format(fastafile, cli_params)
        # Shell pipeline: filter rows on columns 8 and 9, turn spaces into
        # tabs, then prepend the prefix as the first column.
        stages = [
            "cat {}".format(datfile),
            "awk '($8 <= {} && $9 >= 0)'".format(READLEN),
            "sed 's/ /\\t/g'",
            "awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, per_file_bed),
        ]
        to_bed = " | ".join(stages)

        mm.add(fastafile, datfile, run_trf)
        mm.add(datfile, per_file_bed, to_bed)
        bedfiles.append(per_file_bed)

    merged_bed = "trf.bed"
    merge_cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)),
                                       merged_bed)
    mm.add(bedfiles, merged_bed, merge_cmd)

    mm.write()
Exemple #2
0
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output; developer notes live in comments instead.
    #
    # For every file returned by iglob(outdir, "*.fa,*.fasta") two rules are
    # registered with MakeManager: fasta -> .dat (trf run) and .dat -> .bed
    # (filter + reformat).  A final rule concatenates all BED files.
    from jcvi.apps.base import iglob

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch", default=31, type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore", default=MINSCORE, type="int",
                 help="Minimum score to report")
    p.add_option("--period", default=6, type="int",
                 help="Maximum period to report")
    # Floor division keeps the default an int on Python 3 as well; optparse
    # does not run non-string defaults through the type="int" conversion, so
    # a true-division default would leak a float into the awk filter below.
    p.add_option("--minlength", default=MINSCORE // 2, type="int",
                 help="Minimum length of repeat tract")
    p.add_option("--telomeres", default=False, action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir, = args
    minlength = opts.minlength
    mm = MakeManager()
    if opts.telomeres:
        # NOTE(review): the telomere preset overrides minscore and period but
        # leaves minlength untouched -- confirm that this is intended.
        opts.minscore, opts.period = 140, 7

    params = "2 {0} {0} 80 10 {1} {2}".\
            format(opts.mismatch, opts.minscore, opts.period).split()
    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        # Prefix is everything before the first dot of the basename.
        pf = op.basename(fastafile).split(".")[0]
        cmd1 = "trf {0} {1} -d -h".format(fastafile, " ".join(params))
        # The .dat name is the FASTA basename plus the dot-joined parameters.
        datfile = op.basename(fastafile) + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        # Keep rows whose column 8 lies in [minlength, READLEN - minlength],
        # convert spaces to tabs, and prepend the prefix as first column.
        cmd2 = "cat {} | awk '($8 >= {} && $8 <= {})'".\
                    format(datfile, minlength, READLEN - minlength)
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    # Final rule: merge the per-file BED outputs into a single trf.bed.
    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()
Exemple #3
0
def liftover(args):
    """
    %prog liftover lobstr_v3.0.2_hg38_ref.bed hg38.upper.fa

    LiftOver CODIS/Y-STR markers.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output; developer notes live in comments instead.
    #
    # Reads one STRLine per row of the reference BED, refreshes its motif and
    # counts from the FASTA sequence, optionally drops invalid records, and
    # prints the surviving records sorted by (seqid, start) to stdout.
    p = OptionParser(liftover.__doc__)
    p.add_option("--checkvalid",
                 default=False,
                 action="store_true",
                 help="Check minscore, period and length")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    refbed, fastafile = args
    genome = pyfasta.Fasta(fastafile)
    edits = []
    # Context manager guarantees the BED handle is closed, including when a
    # malformed row makes STRLine or the sequence lookup raise.
    with open(refbed) as fp:
        for i, row in enumerate(fp):
            s = STRLine(row)
            # presumably start/end are 1-based inclusive, hence the "- 1"
            # when slicing -- TODO confirm against STRLine
            seq = genome[s.seqid][s.start - 1:s.end].upper()
            s.motif = get_motif(seq, len(s.motif))
            s.fix_counts(seq)
            if opts.checkvalid and not s.is_valid():
                # Skipped rows also skip the progress message below.
                continue
            edits.append(s)
            if i % 10000 == 0:
                print(i, "lines read", file=sys.stderr)

    edits = natsorted(edits, key=lambda x: (x.seqid, x.start))
    for e in edits:
        print(str(e))
Exemple #4
0
def liftover(args):
    """
    %prog liftover lobstr_v3.0.2_hg38_ref.bed hg38.upper.fa

    LiftOver CODIS/Y-STR markers.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output; developer notes live in comments instead.
    #
    # Reads one STRLine per row of the reference BED, refreshes its motif and
    # counts from the FASTA sequence, optionally drops invalid records, and
    # prints the surviving records sorted by (seqid, start) to stdout.
    p = OptionParser(liftover.__doc__)
    p.add_option("--checkvalid", default=False, action="store_true",
                 help="Check minscore, period and length")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    refbed, fastafile = args
    genome = pyfasta.Fasta(fastafile)
    edits = []
    # Context manager guarantees the BED handle is closed, including when a
    # malformed row makes STRLine or the sequence lookup raise.
    with open(refbed) as fp:
        for i, row in enumerate(fp):
            s = STRLine(row)
            # presumably start/end are 1-based inclusive, hence the "- 1"
            # when slicing -- TODO confirm against STRLine
            seq = genome[s.seqid][s.start - 1: s.end].upper()
            s.motif = get_motif(seq, len(s.motif))
            s.fix_counts(seq)
            if opts.checkvalid and not s.is_valid():
                # Skipped rows also skip the progress message below.
                continue
            edits.append(s)
            if i % 10000 == 0:
                # Explicit write() emits the same bytes as the former
                # `print >> sys.stderr` and parses on Python 2 and 3 alike.
                sys.stderr.write("{0} lines read\n".format(i))

    edits = natsorted(edits, key=lambda x: (x.seqid, x.start))
    for e in edits:
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(str(e))
Exemple #5
0
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing output; developer notes live in comments instead.
    #
    # For every file returned by iglob(outdir, "*.fa,*.fasta") two rules are
    # registered with MakeManager: fasta -> .dat (trf run) and .dat -> .bed
    # (filter + reformat).  A final rule concatenates all BED files.
    from jcvi.apps.base import iglob
    # Fixed trf parameter set used verbatim when --centromeres is given.
    cparams = "1 1 2 80 5 200 2000"

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch",
                 default=31,
                 type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore",
                 default=MINSCORE,
                 type="int",
                 help="Minimum score to report")
    p.add_option("--period",
                 default=6,
                 type="int",
                 help="Maximum period to report")
    p.add_option("--lobstr",
                 default=False,
                 action="store_true",
                 help="Generate output for lobSTR")
    p.add_option("--telomeres",
                 default=False,
                 action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    p.add_option("--centromeres",
                 default=False,
                 action="store_true",
                 help="Run centromere search: {}".format(cparams))
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir, = args
    # Floor division keeps minlength an int on Python 3 too, so the awk
    # filter below receives e.g. "15" rather than "15.0".  It is computed
    # before the --telomeres override, i.e. from the user-supplied minscore.
    minlength = opts.minscore // 2
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

    params = "2 {0} {0} 80 10 {1} {2}".\
            format(opts.mismatch, opts.minscore, opts.period).split()
    if opts.centromeres:
        # Centromere preset replaces the whole parameter list.
        params = cparams.split()

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        # Strip only the last extension so dotted names keep their prefix.
        pf = op.basename(fastafile).rsplit(".", 1)[0]
        # The leading "-" is there so that errors from trf are ignored (the
        # make recipe-prefix convention; assumes MakeManager emits a
        # makefile -- TODO confirm).
        cmd1 = "-trf {0} {1} -d -h".format(fastafile, " ".join(params))
        # The .dat name is the FASTA basename plus the dot-joined parameters.
        datfile = op.basename(fastafile) + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        # Drop the "Parameters" header line(s) before filtering.
        cmd2 = "cat {} | grep -v ^Parameters".format(datfile)
        if opts.lobstr:
            # lobSTR mode: keep rows whose column 8 lies in
            # [minlength, READLEN - minlength].
            cmd2 += " | awk '($8 >= {} && $8 <= {})'".\
                    format(minlength, READLEN - minlength)
        else:
            cmd2 += " | awk '($8 >= 0)'"
        # Convert spaces to tabs and prepend the prefix as first column.
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    # Final rule: merge the per-file BED outputs into a single trf.bed.
    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()