def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    # NOTE: the docstring doubles as the usage text handed to OptionParser,
    # so it is kept terse; the details are documented in these comments.
    #
    # args: list of command-line tokens. Exactly one positional argument is
    # required: the folder that is globbed for *.fa / *.fasta files.
    #
    # For every FASTA file two rules are registered with MakeManager
    # (FASTA -> TRF .dat, then .dat -> per-file BED), followed by one rule
    # that concatenates all per-file BEDs into "trf.bed". The rules are
    # emitted by mm.write(). Prints help and exits on a wrong argument count.
    from jcvi.apps.base import iglob

    p = OptionParser(trf.__doc__)
    p.add_option(
        "--mismatch", default=31, type="int", help="Mismatch and gap penalty"
    )
    p.add_option(
        "--minscore", default=MINSCORE, type="int", help="Minimum score to report"
    )
    p.add_option(
        "--period", default=6, type="int", help="Maximum period to report"
    )
    p.add_option(
        "--telomeres",
        default=False,
        action="store_true",
        help="Run telomere search: minscore=140 period=7",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (outdir,) = args
    mm = MakeManager()
    if opts.telomeres:
        # Telomere preset overrides whatever was given on the command line
        opts.minscore, opts.period = 140, 7

    # Positional TRF parameters; --mismatch is used for both the mismatch and
    # the gap (indel) penalty, hence {0} appearing twice.
    params = "2 {0} {0} 80 10 {1} {2}".format(
        opts.mismatch, opts.minscore, opts.period
    ).split()

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        basename = op.basename(fastafile)
        # Strip only the final extension. Splitting on the first dot would
        # map "chr1.part1.fa" and "chr1.part2.fa" to the same prefix and
        # therefore to the same BED target.
        pf = basename.rsplit(".", 1)[0]
        cmd1 = "trf {0} {1} -d -h".format(fastafile, " ".join(params))
        # Name under which the .dat output of cmd1 is expected:
        # <fasta basename>.<params joined by dots>.dat
        datfile = basename + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        # Keep rows by their 8th/9th fields, turn spaces into tabs, then
        # prepend the file prefix as the first column.
        cmd2 = "cat {} | awk '($8 <= {} && $9 >= 0)'".format(datfile, READLEN)
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    # Final rule: merge every per-file BED into a single file
    merged = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), merged)
    mm.add(bedfiles, merged, cmd)

    mm.write()
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    # NOTE: the docstring doubles as the usage text handed to OptionParser,
    # so it is kept terse; the details are documented in these comments.
    #
    # args: list of command-line tokens. Exactly one positional argument is
    # required: the folder that is globbed for *.fa / *.fasta files.
    #
    # For every FASTA file two rules are registered with MakeManager
    # (FASTA -> TRF .dat, then .dat -> per-file BED), followed by one rule
    # that concatenates all per-file BEDs into "trf.bed". The rules are
    # emitted by mm.write(). Prints help and exits on a wrong argument count.
    from jcvi.apps.base import iglob

    p = OptionParser(trf.__doc__)
    p.add_option(
        "--mismatch", default=31, type="int", help="Mismatch and gap penalty"
    )
    p.add_option(
        "--minscore", default=MINSCORE, type="int", help="Minimum score to report"
    )
    p.add_option(
        "--period", default=6, type="int", help="Maximum period to report"
    )
    # Floor division: optparse does not coerce non-string defaults, so a
    # true-division default would leak a float into an option typed "int".
    p.add_option(
        "--minlength",
        default=MINSCORE // 2,
        type="int",
        help="Minimum length of repeat tract",
    )
    p.add_option(
        "--telomeres",
        default=False,
        action="store_true",
        help="Run telomere search: minscore=140 period=7",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (outdir,) = args
    minlength = opts.minlength
    mm = MakeManager()
    if opts.telomeres:
        # Telomere preset overrides whatever was given on the command line
        opts.minscore, opts.period = 140, 7

    # Positional TRF parameters; --mismatch is used for both the mismatch and
    # the gap (indel) penalty, hence {0} appearing twice.
    params = "2 {0} {0} 80 10 {1} {2}".format(
        opts.mismatch, opts.minscore, opts.period
    ).split()

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        basename = op.basename(fastafile)
        # Strip only the final extension. Splitting on the first dot would
        # map "chr1.part1.fa" and "chr1.part2.fa" to the same prefix and
        # therefore to the same BED target.
        pf = basename.rsplit(".", 1)[0]
        cmd1 = "trf {0} {1} -d -h".format(fastafile, " ".join(params))
        # Name under which the .dat output of cmd1 is expected:
        # <fasta basename>.<params joined by dots>.dat
        datfile = basename + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        # Keep rows whose 8th field lies in [minlength, READLEN - minlength],
        # turn spaces into tabs, then prepend the file prefix as first column.
        cmd2 = "cat {} | awk '($8 >= {} && $8 <= {})'".format(
            datfile, minlength, READLEN - minlength
        )
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    # Final rule: merge every per-file BED into a single file
    merged = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), merged)
    mm.add(bedfiles, merged, cmd)

    mm.write()
def liftover(args):
    """
    %prog liftover lobstr_v3.0.2_hg38_ref.bed hg38.upper.fa

    LiftOver CODIS/Y-STR markers.
    """
    # NOTE: the docstring doubles as the usage text handed to OptionParser,
    # so it is kept terse; the details are documented in these comments.
    #
    # args: list of command-line tokens. Two positionals are required: the
    # reference BED (one STR record per line) and the genome FASTA.
    #
    # For each record the motif and the counts are recomputed from the genome
    # sequence at the record's coordinates. Retained records are printed to
    # stdout sorted by (seqid, start). With --checkvalid, records for which
    # is_valid() is false are dropped. Prints help and exits on a wrong
    # argument count.
    p = OptionParser(liftover.__doc__)
    p.add_option(
        "--checkvalid",
        default=False,
        action="store_true",
        help="Check minscore, period and length",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    refbed, fastafile = args
    genome = pyfasta.Fasta(fastafile)
    edits = []
    # Context manager so the BED handle is released even if a record raises
    with open(refbed) as fp:
        for i, row in enumerate(fp):
            s = STRLine(row)
            # Slice [start - 1, end): coordinates are handled as 1-based,
            # end-inclusive when indexing the 0-based sequence.
            seq = genome[s.seqid][s.start - 1 : s.end].upper()
            s.motif = get_motif(seq, len(s.motif))
            s.fix_counts(seq)
            if opts.checkvalid and not s.is_valid():
                continue
            edits.append(s)
            # Progress goes to stderr so stdout stays a clean record stream;
            # it is only emitted when a retained record lands on the boundary.
            if i % 10000 == 0:
                print(i, "lines read", file=sys.stderr)

    edits = natsorted(edits, key=lambda x: (x.seqid, x.start))
    for e in edits:
        print(e)
def liftover(args):
    """
    %prog liftover lobstr_v3.0.2_hg38_ref.bed hg38.upper.fa

    LiftOver CODIS/Y-STR markers.
    """
    # NOTE: the docstring doubles as the usage text handed to OptionParser,
    # so it is kept terse; the details are documented in these comments.
    #
    # args: list of command-line tokens. Two positionals are required: the
    # reference BED (one STR record per line) and the genome FASTA.
    #
    # For each record the motif and the counts are recomputed from the genome
    # sequence at the record's coordinates. Retained records are printed to
    # stdout sorted by (seqid, start). With --checkvalid, records for which
    # is_valid() is false are dropped. Prints help and exits on a wrong
    # argument count.
    p = OptionParser(liftover.__doc__)
    p.add_option(
        "--checkvalid",
        default=False,
        action="store_true",
        help="Check minscore, period and length",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    refbed, fastafile = args
    genome = pyfasta.Fasta(fastafile)
    edits = []
    # Context manager so the BED handle is released even if a record raises
    with open(refbed) as fp:
        for i, row in enumerate(fp):
            s = STRLine(row)
            # Slice [start - 1, end): coordinates are handled as 1-based,
            # end-inclusive when indexing the 0-based sequence.
            seq = genome[s.seqid][s.start - 1 : s.end].upper()
            s.motif = get_motif(seq, len(s.motif))
            s.fix_counts(seq)
            if opts.checkvalid and not s.is_valid():
                continue
            edits.append(s)
            # Progress goes to stderr so stdout stays a clean record stream.
            # Written with sys.stderr.write rather than a print statement so
            # the module parses under both Python 2 and Python 3.
            if i % 10000 == 0:
                sys.stderr.write("{0} lines read\n".format(i))

    edits = natsorted(edits, key=lambda x: (x.seqid, x.start))
    for e in edits:
        # Single parenthesised argument: same output on Python 2 and 3
        print(e)
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    # NOTE: the docstring doubles as the usage text handed to OptionParser,
    # so it is kept terse; the details are documented in these comments.
    #
    # args: list of command-line tokens. Exactly one positional argument is
    # required: the folder that is globbed for *.fa / *.fasta files.
    #
    # For every FASTA file two rules are registered with MakeManager
    # (FASTA -> TRF .dat, then .dat -> per-file BED), followed by one rule
    # that concatenates all per-file BEDs into "trf.bed". The rules are
    # emitted by mm.write(). Prints help and exits on a wrong argument count.
    from jcvi.apps.base import iglob

    # Fixed TRF parameter set used by --centromeres
    cparams = "1 1 2 80 5 200 2000"

    p = OptionParser(trf.__doc__)
    p.add_option(
        "--mismatch", default=31, type="int", help="Mismatch and gap penalty"
    )
    p.add_option(
        "--minscore", default=MINSCORE, type="int", help="Minimum score to report"
    )
    p.add_option(
        "--period", default=6, type="int", help="Maximum period to report"
    )
    p.add_option(
        "--lobstr",
        default=False,
        action="store_true",
        help="Generate output for lobSTR",
    )
    p.add_option(
        "--telomeres",
        default=False,
        action="store_true",
        help="Run telomere search: minscore=140 period=7",
    )
    p.add_option(
        "--centromeres",
        default=False,
        action="store_true",
        help="Run centromere search: {}".format(cparams),
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (outdir,) = args
    # Floor division keeps the threshold an int on Python 3 as well (true
    # division would yield e.g. 15.5 for an odd score). It is derived from
    # the --minscore value as parsed, before any --telomeres override below.
    minlength = opts.minscore // 2
    mm = MakeManager()
    if opts.telomeres:
        # Telomere preset overrides whatever was given on the command line
        opts.minscore, opts.period = 140, 7

    # Positional TRF parameters; --mismatch is used for both the mismatch and
    # the gap (indel) penalty, hence {0} appearing twice.
    params = "2 {0} {0} 80 10 {1} {2}".format(
        opts.mismatch, opts.minscore, opts.period
    ).split()
    if opts.centromeres:
        # Centromere preset replaces the whole parameter list
        params = cparams.split()

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        basename = op.basename(fastafile)
        # Strip only the final extension so dotted names stay distinct
        pf = basename.rsplit(".", 1)[0]
        # The leading "-" (not the program name) is what asks for a failing
        # command to be ignored -- make's recipe prefix, assuming MakeManager
        # emits Makefile rules.
        cmd1 = "-trf {0} {1} -d -h".format(fastafile, " ".join(params))
        # Name under which the .dat output of cmd1 is expected:
        # <fasta basename>.<params joined by dots>.dat
        datfile = basename + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        # Drop lines starting with "Parameters", filter on the 8th field,
        # turn spaces into tabs, then prepend the file prefix as first column.
        cmd2 = "cat {} | grep -v ^Parameters".format(datfile)
        if opts.lobstr:
            cmd2 += " | awk '($8 >= {} && $8 <= {})'".format(
                minlength, READLEN - minlength
            )
        else:
            cmd2 += " | awk '($8 >= 0)'"
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    # Final rule: merge every per-file BED into a single file
    merged = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), merged)
    mm.add(bedfiles, merged, cmd)

    mm.write()