Ejemplo n.º 1
0
def location(args):
    """
    Queue a motif localization job for every selected motif and wait for
    all of them to finish.

    Parameters
    ----------
    args : argparse object
        Command line arguments; the attributes ``fastafile``, ``pwmfile``,
        ``width``, ``ids`` and ``cutoff`` are read.
    """
    fasta_path = args.fastafile
    pwm_path = args.pwmfile

    # Without an explicit width, use the length of the first sequence.
    width = args.width
    if not width:
        width = len(Fasta(fasta_path).items()[0][1])

    all_motifs = pwmfile_to_motifs(pwm_path)
    if args.ids:
        wanted = args.ids.split(",")
    else:
        wanted = [m.id for m in all_motifs]

    # ``pool`` is not created here; it is looked up in the enclosing scope.
    pending = [
        pool.apply_async(
            motif_localization,
            (fasta_path, m, width,
             os.path.join("%s_histogram" % m.id), args.cutoff))
        for m in all_motifs
        if m.id in wanted
    ]

    for task in pending:
        task.get()
Ejemplo n.º 2
0
def location(args):
    """
    Submit motif localization jobs and block until every job is done.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes ``fastafile``, ``pwmfile``,
        ``width``, ``ids`` and ``cutoff`` are read.
    """
    sequences = args.fastafile
    pwm_path = args.pwmfile

    # Fall back to the length of the first FASTA sequence.
    region_width = args.width or len(Fasta(sequences).items()[0][1])

    candidates = pwmfile_to_motifs(pwm_path)
    if args.ids:
        selected_ids = args.ids.split(",")
    else:
        selected_ids = [m.id for m in candidates]

    # ``pool`` is resolved from the enclosing scope; it is not created here.
    submitted = []
    for candidate in candidates:
        if candidate.id not in selected_ids:
            continue
        histogram_name = os.path.join("%s_histogram" % candidate.id)
        job_args = (sequences, candidate, region_width, histogram_name,
                    args.cutoff)
        submitted.append(pool.apply_async(motif_localization, job_args))

    for result in submitted:
        result.get()
Ejemplo n.º 3
0
class TestMotifPwm(unittest.TestCase):
    """ A test class to test Motif pwmscan functionality and related things """

    def setUp(self):
        """ Load the TATA motif and set up the fixture paths """
        self.data_dir = "test/data/pwmscan"

        self.motif = pwmfile_to_motifs(os.path.join(self.data_dir,
                                                    "TATA.pwm"))[0]
        self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa"))
        self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff")
        self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa"))
        self.random_gff = os.path.join(self.data_dir, "random_result.gff")
        self.enrichment = os.path.join(self.data_dir, "enrichment.txt")
        # Only the path is kept; the tests use it as their output file.
        self.tmp = NamedTemporaryFile().name

    @staticmethod
    def _read(path):
        """ Return the full contents of a file and close it again """
        with open(path) as handle:
            return handle.read()

    def test1_pwm_scan(self):
        """ Scan a FASTA file with PWM of motif """
        result = self.motif.pwm_scan(self.prom, nreport=1)

        # Every sequence should have a TATA match
        self.assertEqual(len(result.keys()), len(self.prom.items()))

    def test2_pwm_scan_to_gff(self):
        """ Scan a FASTA file with PWM of motif, and produce GFF """
        self.motif.pwm_scan_to_gff(self.prom, self.tmp)
        self.assertEqual(self._read(self.prom_gff), self._read(self.tmp))

    def test3_gff_enrichment(self):
        """ Test gff_enrichment """
        # NOTE(review): this writes random_result.gff into the test data
        # directory.
        self.motif.pwm_scan_to_gff(self.random, self.random_gff)
        gff_enrichment(self.prom_gff, self.random_gff, 316, 3160, self.tmp)
        self.assertEqual(self._read(self.enrichment), self._read(self.tmp))

    def tearDown(self):
        """ Remove the output file a test may have written """
        if os.path.exists(self.tmp):
            os.unlink(self.tmp)
Ejemplo n.º 4
0
    def _run_program(self, bin, fastafile, savedir, params=None):
        """
        Run the external program on a re-headered copy of a FASTA file.

        Every sequence is copied to ``munk_in.fa`` in ``self.tmpdir``. Its
        header is replaced by space-separated numbers that count up from 0
        to half the sequence length and back down to 1.

        Parameters
        ----------
        bin : str
            Command used to start the program.
        fastafile : str
            FASTA file with the input sequences.
        savedir : str
            Not used by this method.
        params : dict
            Must contain the key "width"; its value is passed twice on the
            command line.

        Returns
        -------
        tuple
            ``(motifs, stdout, stderr)``; ``motifs`` is an empty list when
            the program did not produce an output file.
        """
        fastafile = os.path.abspath(fastafile)

        new_file = os.path.join(self.tmpdir, "munk_in.fa")
        with open(new_file, "w") as out:
            for _, seq in Fasta(fastafile).items():
                # Floor division and explicit lists keep this working on
                # both Python 2 and Python 3.
                half = len(seq) // 2
                profile = list(range(half)) + list(range(half, 0, -1))
                header = " ".join("%0.1f" % x for x in profile)
                out.write(">%s\n" % header)
                out.write("%s\n" % seq)

        fastafile = new_file
        outfile = fastafile + ".out"
        cmd = "%s %s %s yes 1.0 p:%s > %s" % (
            bin, params["width"], params["width"], fastafile, outfile)

        current_path = os.getcwd()
        os.chdir(self.dir())
        try:
            p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
            stdout, stderr = p.communicate()

            motifs = []
            if os.path.exists(outfile):
                motifs = self.parse(open(outfile))
        finally:
            # Always return to the caller's working directory.
            os.chdir(current_path)

        return motifs, stdout, stderr
Ejemplo n.º 5
0
class TestMotifPwm(unittest.TestCase):
    """ A test class to test Motif pwmscan functionality and related things """

    def setUp(self):
        """ Load the TATA motif and set up the fixture paths """
        self.data_dir = "test/data/pwmscan"

        self.motif = pwmfile_to_motifs(os.path.join(self.data_dir, "TATA.pwm"))[0]
        self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa"))
        self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff")
        self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa"))
        self.random_gff = os.path.join(self.data_dir, "random_result.gff")
        self.enrichment = os.path.join(self.data_dir, "enrichment.txt")
        # Only the path is kept; the tests use it as their output file.
        self.tmp = NamedTemporaryFile().name

    @staticmethod
    def _read(path):
        """ Return the full contents of a file and close it again """
        with open(path) as handle:
            return handle.read()

    def test1_pwm_scan(self):
        """ Scan a FASTA file with PWM of motif """
        result = self.motif.pwm_scan(self.prom, nreport=1)

        # Every sequence should have a TATA match
        self.assertEqual(len(result.keys()), len(self.prom.items()))

    def test2_pwm_scan_to_gff(self):
        """ Scan a FASTA file with PWM of motif, and produce GFF """
        self.motif.pwm_scan_to_gff(self.prom, self.tmp)
        self.assertEqual(self._read(self.prom_gff), self._read(self.tmp))

    def test3_gff_enrichment(self):
        """ Test gff_enrichment """
        # NOTE(review): this writes random_result.gff into the test data
        # directory.
        self.motif.pwm_scan_to_gff(self.random, self.random_gff)
        gff_enrichment(self.prom_gff, self.random_gff, 316, 3160, self.tmp)
        self.assertEqual(self._read(self.enrichment), self._read(self.tmp))

    def tearDown(self):
        """ Remove the output file a test may have written """
        if os.path.exists(self.tmp):
            os.unlink(self.tmp)
Ejemplo n.º 6
0
def download_genome(genomebuild, genome_dir):
    """
    Download a genome from the first working URL and unpack it.

    Each URL template in UCSC_GENOME_URLS is tried in order until a file
    is retrieved that passes ``check_genome_file``. The archive is then
    unpacked inside ``genome_dir``. When unpacking leaves exactly one
    ``.fa`` file, it is split into one FASTA file per sequence.

    Parameters
    ----------
    genomebuild : str
        Genome build name, substituted into the URL templates.
    genome_dir : str
        Directory that receives the downloaded and unpacked files.

    Exits the process with status 1 when no URL yields a valid file.
    """
    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))
    genome_fa = None
    downloaded = False
    for genome_url in UCSC_GENOME_URLS:
        remote = genome_url.format(genomebuild)
        genome_fa = os.path.join(genome_dir, os.path.split(remote)[-1])

        sys.stderr.write("Trying to download {}\n".format(remote))
        try:
            urlretrieve(remote, genome_fa)
            if check_genome_file(genome_fa):
                downloaded = True
                break
            os.unlink(genome_fa)
        except Exception as err:
            # Best effort: report the problem and try the next URL.
            sys.stderr.write("Download failed: {}\n".format(err))

    if not downloaded:
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    # The command runs with genome_dir as working directory, so only the
    # bare archive name is used; this also works for a relative genome_dir.
    archive = os.path.basename(genome_fa)
    if archive.endswith("tar.gz"):
        cmd = "tar -xvzf {0} && rm {0}".format(archive)
    elif archive.endswith(".zip"):
        cmd = "unzip {0}".format(archive)
    else:
        cmd = "gunzip {0}".format(archive)

    sp.call(cmd, shell=True, cwd=genome_dir)

    # A single FASTA file is split into one file per sequence.
    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        for n, s in Fasta(fa_files[0]).items():
            with open("{}/{}.fa".format(genome_dir, n), "w") as out:
                out.write(">{}\n{}\n".format(n, s))

        os.unlink(fa_files[0])

    # Remove the archive if it is still there after unpacking.
    if os.path.exists(genome_fa):
        os.unlink(genome_fa)
Ejemplo n.º 7
0
class TestMotifPwm(unittest.TestCase):
    """ A test class to test Motif pwmscan functionality and related things """

    def setUp(self):
        """ Load the TATA motif and set up the fixture paths """
        self.data_dir = "test/data/pwmscan"

        with open(os.path.join(self.data_dir, "TATA.pwm")) as handle:
            self.motif = read_motifs(handle, fmt="pwm")[0]
        self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa"))
        self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff")
        self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa"))
        self.random_gff = os.path.join(self.data_dir, "random_result.gff")
        self.enrichment = os.path.join(self.data_dir, "enrichment.txt")
        # Only the path is kept; the tests use it as their output file.
        self.tmp = NamedTemporaryFile().name

    def test1_pwm_scan(self):
        """ Scan a FASTA file with PWM of motif """
        result = self.motif.pwm_scan(self.prom, nreport=1)

        # Every sequence should have a TATA match
        self.assertEqual(len(result.keys()), len(self.prom.items()))

    def test2_pwm_scan_to_gff(self):
        """ Scan a FASTA file with PWM of motif, and produce GFF """
        self.motif.pwm_scan_to_gff(self.prom, self.tmp)
        with open(self.tmp) as gff:
            for line in gff:
                vals = line.strip().split("\t")
                self.assertEqual(9, len(vals))
                self.assertGreater(int(vals[3]), 0)
                self.assertGreater(int(vals[4]), 0)
                self.assertGreater(float(vals[5]), 5.25)
                self.assertLess(float(vals[5]), 9.06)
                self.assertIn(vals[6], ["+", "-"])

    def test3_gff_enrichment(self):
        """ Test gff_enrichment """
        # NOTE(review): this writes random_result.gff into the test data
        # directory.
        self.motif.pwm_scan_to_gff(self.random, self.random_gff)
        gff_enrichment(self.prom_gff, self.random_gff, 316, 3160, self.tmp)
        with open(self.tmp) as f:
            f.readline()  # Header
            vals = f.readline().strip().split("\t")
        self.assertEqual(vals[0], "TATA-box")
        self.assertLess(float(vals[2]), 1e-60)
        self.assertGreater(float(vals[5]), 1.5)

    def tearDown(self):
        """ Remove the output file a test may have written """
        if os.path.exists(self.tmp):
            os.unlink(self.tmp)
Ejemplo n.º 8
0
class TestMotifPwm(unittest.TestCase):
    """ A test class to test Motif pwmscan functionality and related things """

    def setUp(self):
        """ Load the TATA motif and set up the fixture paths """
        self.data_dir = "test/data/pwmscan"

        self.motif = pwmfile_to_motifs(os.path.join(self.data_dir, "TATA.pwm"))[0]
        self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa"))
        self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff")
        self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa"))
        self.random_gff = os.path.join(self.data_dir, "random_result.gff")
        self.enrichment = os.path.join(self.data_dir, "enrichment.txt")
        # Only the path is kept; the tests use it as their output file.
        self.tmp = NamedTemporaryFile().name

    def test1_pwm_scan(self):
        """ Scan a FASTA file with PWM of motif """
        result = self.motif.pwm_scan(self.prom, nreport=1)

        # Every sequence should have a TATA match
        self.assertEqual(len(result.keys()), len(self.prom.items()))

    def test2_pwm_scan_to_gff(self):
        """ Scan a FASTA file with PWM of motif, and produce GFF """
        self.motif.pwm_scan_to_gff(self.prom, self.tmp)
        with open(self.tmp) as gff:
            for line in gff:
                vals = line.strip().split("\t")
                self.assertEqual(9, len(vals))
                self.assertGreater(int(vals[3]), 0)
                self.assertGreater(int(vals[4]), 0)
                self.assertGreater(float(vals[5]), 5.25)
                self.assertLess(float(vals[5]), 9.06)
                self.assertIn(vals[6], ["+", "-"])

    def test3_gff_enrichment(self):
        """ Test gff_enrichment """
        # NOTE(review): this writes random_result.gff into the test data
        # directory.
        self.motif.pwm_scan_to_gff(self.random, self.random_gff)
        gff_enrichment(self.prom_gff, self.random_gff, 316, 3160, self.tmp)
        with open(self.tmp) as f:
            f.readline()  # Header
            vals = f.readline().strip().split("\t")
        self.assertEqual(vals[0], "TATA-box")
        self.assertLess(float(vals[2]), 1e-60)
        self.assertGreater(float(vals[5]), 1.5)

    def tearDown(self):
        """ Remove the output file a test may have written """
        if os.path.exists(self.tmp):
            os.unlink(self.tmp)
Ejemplo n.º 9
0
def divide_fa_file(fname, sample, rest, fraction, abs_max):
    """
    Split a FASTA file into a random sample and the remaining sequences.

    Parameters
    ----------
    fname : str
        FASTA file to divide.
    sample : str
        Output file for the randomly selected sequences.
    rest : str
        Output file for all other sequences.
    fraction : float
        Fraction of the sequences to put in the sample.
    abs_max : int
        Upper limit on the number of sampled sequences.

    Returns
    -------
    tuple
        ``(number of sampled sequences, number of remaining sequences)``.
    """
    fa = Fasta(fname)
    ids = fa.ids[:]

    x = int(fraction * len(ids))
    if x > abs_max:
        x = abs_max

    # A set gives constant-time membership tests in the loop below.
    sample_seqs = set(random.sample(ids, x))

    with open(sample, "w") as f_sample, open(rest, "w") as f_rest:
        for name, seq in fa.items():
            target = f_sample if name in sample_seqs else f_rest
            target.write(">%s\n%s\n" % (name, seq))

    return x, len(ids) - x
Ejemplo n.º 10
0
def divide_fa_file(fname, sample, rest, fraction, abs_max):
    """
    Split a FASTA file into a random sample and the remaining sequences.

    Parameters
    ----------
    fname : str
        FASTA file to divide.
    sample : str
        Output file for the randomly selected sequences.
    rest : str
        Output file for all other sequences.
    fraction : float
        Fraction of the sequences to put in the sample.
    abs_max : int
        Upper limit on the number of sampled sequences.

    Returns
    -------
    tuple
        ``(number of sampled sequences, number of remaining sequences)``.
    """
    fa = Fasta(fname)
    ids = fa.ids[:]

    x = int(fraction * len(ids))
    if x > abs_max:
        x = abs_max

    # A set gives constant-time membership tests in the loop below.
    sample_seqs = set(random.sample(ids, x))

    with open(sample, "w") as f_sample, open(rest, "w") as f_rest:
        for name, seq in fa.items():
            target = f_sample if name in sample_seqs else f_rest
            target.write(">%s\n%s\n" % (name, seq))

    return x, len(ids) - x
Ejemplo n.º 11
0
def location(args):
    """
    Create histograms of motif location.

    One ``motif_localization`` job per selected motif is run in a worker
    pool; the function returns once all jobs have finished.

    Parameters
    ----------
    args : argparse object
        Command line arguments; the attributes ``fastafile``, ``pwmfile``,
        ``width``, ``ids`` and ``cutoff`` are read.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        # Default to the length of the first sequence in the FASTA file.
        lwidth = len(Fasta(fastafile).items()[0][1])

    motifs = pwmfile_to_motifs(pwmfile)
    ids = [motif.id for motif in motifs]
    if args.ids:
        ids = args.ids.split(",")

    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000)
    try:
        jobs = []
        for motif in motifs:
            if motif.id in ids:
                outfile = "%s_histogram" % motif.id
                jobs.append(
                    pool.apply_async(
                        motif_localization,
                        (fastafile, motif, lwidth, outfile, args.cutoff)))

        for job in jobs:
            job.get()
    finally:
        # Release the worker processes instead of leaving that to garbage
        # collection.
        pool.close()
        pool.join()
Ejemplo n.º 12
0
    def _run_program(self, bin, fastafile, savedir, params=None):
        """
        Run the external program on a re-headered copy of a FASTA file.

        Every sequence is copied to ``munk_in.fa`` in ``self.tmpdir``. Its
        header is replaced by space-separated numbers that count up from 0
        to half the sequence length and back down to 1.

        Parameters
        ----------
        bin : str
            Command used to start the program.
        fastafile : str
            FASTA file with the input sequences.
        savedir : str
            Not used by this method.
        params : dict
            Must contain the key "width"; its value is passed twice on the
            command line.

        Returns
        -------
        tuple
            ``(motifs, stdout, stderr)``; ``motifs`` is an empty list when
            the program did not produce an output file.
        """
        fastafile = os.path.abspath(fastafile)

        new_file = os.path.join(self.tmpdir, "munk_in.fa")
        with open(new_file, "w") as out:
            for _, seq in Fasta(fastafile).items():
                # Floor division and explicit lists keep this working on
                # both Python 2 and Python 3.
                half = len(seq) // 2
                profile = list(range(half)) + list(range(half, 0, -1))
                header = " ".join("%0.1f" % x for x in profile)
                out.write(">%s\n" % header)
                out.write("%s\n" % seq)

        fastafile = new_file
        outfile = fastafile + ".out"
        cmd = "%s %s %s yes 1.0 p:%s > %s" % (
            bin, params["width"], params["width"], fastafile, outfile)

        current_path = os.getcwd()
        os.chdir(self.dir())
        try:
            p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
            stdout, stderr = p.communicate()

            motifs = []
            if os.path.exists(outfile):
                motifs = self.parse(open(outfile))
        finally:
            # Always return to the caller's working directory.
            os.chdir(current_path)

        return motifs, stdout, stderr
Ejemplo n.º 13
0
def location(args):
    """
    Create histograms of motif location.

    One ``motif_localization`` job per selected motif is run in a worker
    pool; the function returns once all jobs have finished.

    Parameters
    ----------
    args : argparse object
        Command line arguments; the attributes ``fastafile``, ``pfmfile``,
        ``size``, ``ids`` and ``cutoff`` are read.
    """
    fastafile = args.fastafile
    pfmfile = args.pfmfile

    lsize = args.size
    if not lsize:
        # Default to the length of the first sequence in the FASTA file.
        lsize = len(Fasta(fastafile).items()[0][1])

    motifs = read_motifs(pfmfile)
    ids = [motif.id for motif in motifs]
    if args.ids:
        ids = args.ids.split(",")

    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000)
    try:
        jobs = []
        for motif in motifs:
            if motif.id in ids:
                outfile = "%s_histogram" % motif.id
                jobs.append(
                    pool.apply_async(
                        motif_localization,
                        (fastafile, motif, lsize, outfile, args.cutoff)))

        for job in jobs:
            job.get()
    finally:
        # Release the worker processes instead of leaving that to garbage
        # collection.
        pool.close()
        pool.join()
Ejemplo n.º 14
0
def nmer_predict(fastafile):
    """
    Predict motifs from frequent n-mers whose positions pile up in the
    middle bins of a position histogram.

    All 6-, 8-, 10- and 12-mers of the input sequences are collected with
    their start positions. Selected n-mers are written out as count
    matrices, clustered, refined by scanning the input sequences, clustered
    again and refined a second time.

    NOTE(review): relies on Python 2 behaviour (``string.maketrans``,
    writing ``str`` to a default-mode ``NamedTemporaryFile``).

    Parameters
    ----------
    fastafile : str
        Name of the FASTA file with the input sequences.

    Returns
    -------
    tuple
        ``(refined_motifs, "", "")``: the list of refined motifs followed
        by two empty strings.
    """
    from tempfile import NamedTemporaryFile, mkdtemp
    from gimmemotifs.fasta import Fasta
    from numpy import sum, histogram
    from subprocess import Popen, PIPE
    from gimmemotifs.motif import Motif, motif_from_align
    from gimmemotifs.cluster import cluster_motifs
    from string import maketrans

    def rc(seq):
        # Reverse complement of an upper-case DNA string.
        t = maketrans("ATCG", "TAGC")
        return seq[::-1].translate(t)

    f = Fasta(fastafile)
    nmer = {}
    # n-mer length -> factor used in the centre-versus-flank test below.
    N = {6: 4, 8: 3, 10: 2, 12: 1}
    tmp = NamedTemporaryFile()
    # An n-mer must occur more often than 2% of the number of sequences.
    abs_cutoff = len(f.items()) / 100.0 * 2
    # Only the keys of N are used here; ``cutoff`` is not read in the loop.
    for check_n, cutoff in N.items():
        for id, seq in f.items():
            # NOTE(review): this range stops one window short of the end of
            # the sequence - confirm whether that is intended.
            for i in range(len(seq) - check_n):
                n = seq[i:i + check_n]
                nmer.setdefault(n.upper(), []).append(i)

    for n, pos in nmer.items():
        if len(pos) > abs_cutoff:
            # 9 bins over positions 0-200.
            hist = histogram(pos, bins=9, range=(0, 200))[0]
            # Keep the n-mer when the three middle bins outweigh both
            # flanks, each scaled by the length-specific factor.
            # NOTE(review): the right flank is hist[7:], so bin 6 belongs to
            # neither side - confirm.
            if sum(hist[3:6]) > sum(hist[0:3] * N[len(n)]) and sum(
                    hist[3:6]) > sum(hist[7:]) * N[len(n)]:
                # One matrix row per position: the occurrence count in the
                # column of the observed base, zero elsewhere.
                tmp.write(">%s\n" % n)
                for char in n:
                    w = []
                    for x in ["A", "C", "G", "T"]:
                        if x == char:
                            w.append(len(pos))
                        else:
                            w.append(0)

                    tmp.write("\t".join([str(x) for x in w]) + "\n")

    # Make sure everything is on disk before cluster_motifs reads the file.
    tmp.flush()
    tmpname = tmp.name

    tree = cluster_motifs(tmpname,
                          "subtotal",
                          "ed",
                          "mean",
                          False,
                          threshold=-0.1,
                          include_bg=False)
    clusters = tree.getResult()

    def refine_by_scanning(motifs, fastafile):
        # Scan the sequences with the given motifs and rebuild every motif
        # from the instances that were found.

        tmp_gff = NamedTemporaryFile()
        file_in = NamedTemporaryFile()
        for m in motifs:
            file_in.write("%s\n" % m.to_pfm())
        file_in.flush()

        cmd = "pwmscan.py -i %s -p %s -c 0.8 > %s" % (fastafile, file_in.name,
                                                      tmp_gff.name)
        p = Popen(cmd, shell=True)
        # Waits for the scan to finish; no pipes are attached, so both
        # values are None.
        stdout, stderr = p.communicate()

        aligns = {}
        for line in open(tmp_gff.name):
            vals = line.strip().split("\t")
            # Column 9 must hold exactly two ' ; '-separated attributes; the
            # second word of each, with quotes removed, is used as motif
            # name and motif instance respectively.
            motif, instance = [
                x.split(" ")[1].replace('"', "") for x in vals[8].split(" ; ")
            ]

            # Instances not on the "+" strand are reverse-complemented.
            if vals[6] == "+":
                aligns.setdefault(motif, []).append(instance.upper())
            else:
                aligns.setdefault(motif, []).append(rc(instance.upper()))

        # NOTE(review): tmp_out is never used.
        tmp_out = NamedTemporaryFile()

        refined_motifs = []
        for id, align in aligns.items():
            # Only motifs with more than 10 instances are rebuilt.
            if len(align) > 10:
                motif = motif_from_align(align)
                refined_motifs.append(motif)

        return refined_motifs

    # First refinement: the first element of every cluster result is used.
    motifs = refine_by_scanning([x[0] for x in clusters], fastafile)
    tmp4 = NamedTemporaryFile()
    for m in motifs:
        tmp4.write("%s\n" % m.to_pfm())
    tmp4.flush()

    # Cluster the refined motifs again and number the resulting clusters.
    motifs = []
    tree = cluster_motifs(tmp4.name,
                          "total",
                          "wic",
                          "mean",
                          True,
                          threshold=0.95,
                          include_bg=True)
    clusters = tree.getResult()
    for i, (cluster, members) in enumerate(clusters):
        cluster.id = "Nmer_%s" % (i + 1)
        motifs.append(cluster)

    # Second refinement; the final motifs get their public ids here.
    refined_motifs = refine_by_scanning(motifs, fastafile)
    for i, m in enumerate(refined_motifs):
        m.id = "WannaMotif_%s" % (i + 1)

    return refined_motifs, "", ""
Ejemplo n.º 15
0
	sys.exit(0)

# Collect the scan settings from the parsed command line options.
inputfile = options.inputfile

# nreport is only assigned here when the option was supplied.
if options.nreport:
	nreport = int(options.nreport)

cutoff = float(options.cutoff)

motifs = pwmfile_to_motifs(options.pwmfile)

bed = options.bed

f = Fasta(inputfile)
# Maps a numeric strand value (-1 / 1) to its strand symbol.
strandmap = {-1:"-",1:"+"}
for (id,seq) in f.items():
	for motif in motifs:
		pwm = motif.pwm
		c =  motif.pwm_min_score() + (motif.pwm_max_score() - motif.pwm_min_score()) * cutoff 
		result = pwmscan(seq.upper(), pwm, c, nreport, options.scan_rc)
		for (score, pos, strand) in result:
			if bed:
				first = id.split(" ")[0]	
				(chr,loc) = first.split(":")
				if loc:
					(start, end) = map(int, loc.split("-"))
					print "%s\t%s\t%s\t%s" % (chr, start + pos, start + pos + len(pwm) , score)
				else:
					print "%s\t%s\t%s\t%s" % (id, pos, pos +  len(pwm), score)
			else:
				print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tmotif_name \"%s\" ; motif_instance \"%s\"" % (
Ejemplo n.º 16
0
                  dest="fpr",
                  help="Desired fpr",
                  type="float",
                  metavar="FLOAT")

(options, args) = parser.parse_args()

# An FPR of 0 is a valid request, so test for a missing value explicitly
# instead of relying on truthiness.
if not options.pwmfile or not options.inputfile or options.fpr is None:
    parser.print_help()
    sys.exit()

if options.fpr < 0 or options.fpr > 1:
    print("Please specify a FPR between 0 and 1")
    sys.exit()

f = Fasta(options.inputfile)
motifs = pwmfile_to_motifs(options.pwmfile)

# For every motif, report the score at the (100 - 100 * fpr) percentile of
# the per-sequence scores, together with that score expressed as a
# fraction of the motif's score range.
print("Motif\tScore\tCutoff")
for motif in motifs:
    pwm = motif.pwm
    min_score = motif.pwm_min_score()
    # Score of the first reported hit in every input sequence.
    scores = [
        pwmscan(seq.upper(), pwm, min_score, 1, True)[0][0]
        for _, seq in f.items()
    ]
    opt_score = scoreatpercentile(scores, 100 - (100 * options.fpr))
    cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score)
    print("%s\t%s\t%s" % (motif.id, opt_score, cutoff))
        return motif.id, p
    else:
        return motif.id, 1.0


# NOTE(review): help is only shown when BOTH options are missing; with just
# one of them missing the script continues - confirm whether `or` was meant.
if not options.fastafile and not options.pwmfile:
    parser.print_help()
    sys.exit()

fastafile = options.fastafile
pwmfile = options.pwmfile

# Without an explicit width, use the length of the first FASTA sequence.
lwidth = options.width
if not lwidth:
    f = Fasta(fastafile)
    lwidth = len(f.items()[0][1])
    f = None

# NOTE(review): the pp server secret is hard-coded here.
job_server = pp.Server(secret="pumpkinrisotto")
jobs = []
motifs = pwmfile_to_motifs(pwmfile)
# All motifs are selected unless a comma-separated id list was given.
ids = [motif.id for motif in motifs]
if options.ids:
    ids = options.ids.split(",")

# Submit one motif_localization job per selected motif.
for motif in motifs:
    if motif.id in ids:
        outfile = os.path.join("%s_histogram" % motif.id)
        jobs.append(job_server.submit(motif_localization, (fastafile, motif, lwidth, outfile, options.cutoff), (), ()))

for job in jobs:
Ejemplo n.º 18
0
def nmer_predict(fastafile):
	"""
	Predict motifs from frequent n-mers whose positions pile up in the
	middle bins of a position histogram.

	All 6-, 8-, 10- and 12-mers of the input sequences are collected with
	their start positions. Selected n-mers are written out as count
	matrices, clustered, refined by scanning the input sequences, clustered
	again and refined a second time.

	NOTE(review): relies on Python 2 behaviour (``string.maketrans``,
	writing ``str`` to a default-mode ``NamedTemporaryFile``).

	Parameters
	----------
	fastafile : str
		Name of the FASTA file with the input sequences.

	Returns
	-------
	tuple
		``(refined_motifs, "", "")``: the list of refined motifs followed
		by two empty strings.
	"""
	from tempfile import NamedTemporaryFile,mkdtemp
	from gimmemotifs.fasta import Fasta
	from numpy import sum,histogram
	from subprocess import Popen,PIPE
	from gimmemotifs.motif import Motif,motif_from_align
	from gimmemotifs.cluster import cluster_motifs 
	from string import maketrans

	def rc(seq):
		# Reverse complement of an upper-case DNA string.
		t = maketrans("ATCG", "TAGC")
		return seq[::-1].translate(t)
	
	f = Fasta(fastafile)
	nmer = {}
	# n-mer length -> factor used in the centre-versus-flank test below.
	N = {6:4, 8:3,10:2,12:1}
	tmp = NamedTemporaryFile()
	# An n-mer must occur more often than 2% of the number of sequences.
	abs_cutoff = len(f.items()) / 100.0 * 2 
	# Only the keys of N are used here; ``cutoff`` is not read in the loop.
	for check_n,cutoff in N.items():
		for id,seq in f.items():
			# NOTE(review): this range stops one window short of the end of
			# the sequence - confirm whether that is intended.
			for i in range(len(seq) - check_n):
				n = seq[i: i + check_n]
				nmer.setdefault(n.upper(), []).append(i)

	for n,pos in nmer.items():
		if len(pos) > abs_cutoff:
			# 9 bins over positions 0-200.
			hist = histogram(pos, bins=9, range=(0,200))[0]	
			# Keep the n-mer when the three middle bins outweigh both
			# flanks, each scaled by the length-specific factor.
			# NOTE(review): the right flank is hist[7:], so bin 6 belongs to
			# neither side - confirm.
			if sum(hist[3:6]) > sum(hist[0:3] * N[len(n)]) and  sum(hist[3:6]) > sum(hist[7:]) *  N[len(n)]:
				# One matrix row per position: the occurrence count in the
				# column of the observed base, zero elsewhere.
				tmp.write(">%s\n" % n)
				for char in n:
					w = []
					for x in  ["A", "C", "G", "T"]:
						if x == char:
							w.append(len(pos))
						else:
							w.append(0)

					tmp.write("\t".join([str(x) for x in w]) + "\n")
	
	
	# Make sure everything is on disk before cluster_motifs reads the file.
	tmp.flush()
	tmpname = tmp.name
	
	tree = cluster_motifs(tmpname, "subtotal", "ed", "mean", False, threshold=-0.1, include_bg=False)	
	clusters = tree.getResult()

	def refine_by_scanning(motifs, fastafile):
		# Scan the sequences with the given motifs and rebuild every motif
		# from the instances that were found.
		
		tmp_gff = NamedTemporaryFile()
		file_in = NamedTemporaryFile()
		for m in motifs:
			file_in.write("%s\n" % m.to_pfm())
		file_in.flush()
		
		cmd = "pwmscan.py -i %s -p %s -c 0.8 > %s" % (fastafile, file_in.name, tmp_gff.name)
		p = Popen(cmd, shell=True)
		# Waits for the scan to finish; no pipes are attached, so both
		# values are None.
		stdout,stderr = p.communicate()

		aligns = {}
		for line in open(tmp_gff.name):	
			vals = line.strip().split("\t")
			# Column 9 must hold exactly two ' ; '-separated attributes; the
			# second word of each, with quotes removed, is used as motif
			# name and motif instance respectively.
			motif,instance = [x.split(" ")[1].replace('"', "") for x in vals[8].split(" ; ")]
		
			# Instances not on the "+" strand are reverse-complemented.
			if vals[6] == "+":
				aligns.setdefault(motif,[]).append(instance.upper())
			else:
				aligns.setdefault(motif,[]).append(rc(instance.upper()))

		# NOTE(review): tmp_out is never used.
		tmp_out = NamedTemporaryFile()
		
		refined_motifs = []
		for id,align in aligns.items():
			# Only motifs with more than 10 instances are rebuilt.
			if len(align) > 10:
				motif = motif_from_align(align)
				refined_motifs.append(motif)
		
		return refined_motifs
	
	# First refinement: the first element of every cluster result is used.
	motifs = refine_by_scanning([x[0] for x in clusters], fastafile)
	tmp4 = NamedTemporaryFile()
	for m in motifs:
		tmp4.write("%s\n" % m.to_pfm())
	tmp4.flush()


	# Cluster the refined motifs again and number the resulting clusters.
	motifs = []
	tree = cluster_motifs(tmp4.name, "total", "wic", "mean", True, threshold=0.95, include_bg=True)	
	clusters = tree.getResult()
	for i, (cluster,members) in enumerate(clusters):
		cluster.id = "Nmer_%s" % (i + 1)
		motifs.append(cluster)
	
	# Second refinement; the final motifs get their public ids here.
	refined_motifs = refine_by_scanning(motifs, fastafile)
	for i,m in enumerate(refined_motifs):
		m.id = "WannaMotif_%s" % (i + 1)
	
	return refined_motifs, "", ""	
Ejemplo n.º 19
0
def get_genome(genomebuild, fastadir, indexdir=None):
    """
    Download a UCSC genome with its gene annotation and index it.

    Parameters
    ----------
    genomebuild : str
        Genome build name; it is substituted into the download URLs and
        used as the name of the genome and index sub-directories.
    fastadir : str
        Directory in which the genome directory is created.
    indexdir : str, optional
        Directory in which the index directory is created. Defaults to the
        index directory of the MotifConfig configuration.

    Exits the process with status 1 when genePredToBed is not found, the
    genome directory cannot be created, or no valid genome file could be
    downloaded.
    """
    config = MotifConfig()
    if not indexdir:
        indexdir = config.get_index_dir()

    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(indexdir, genomebuild)

    pred_bin = "genePredToBed"
    pred = find_executable(pred_bin)
    if not pred:
        sys.stderr.write("{} not found in path!\n".format(pred_bin))
        sys.exit(1)

    # Check for rights to write to directory
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except OSError:
            sys.stderr.write(
                "Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Download gene file based on URL + genomebuild
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)

    # Collect the annotation file names listed at the gene URL.
    anno = []
    listing = urllib2.urlopen(UCSC_GENE_URL.format(genomebuild))
    try:
        p = re.compile(r'\w+.Gene.txt.gz')
        for line in listing.readlines():
            m = p.search(line)
            if m:
                anno.append(m.group(0))
    finally:
        listing.close()

    sys.stderr.write("Retrieving gene annotation for {}\n".format(genomebuild))
    # The first entry of ANNOS that is available wins.
    url = ""
    for a in ANNOS:
        if a in anno:
            url = UCSC_GENE_URL.format(genomebuild) + a
            break
    if url:
        # Only the name of the temporary file is needed; it is removed again
        # once the annotation has been converted.
        tmp = NamedTemporaryFile(delete=False, suffix=".gz")
        tmp.close()
        try:
            urllib.urlretrieve(url, tmp.name)

            sp.call("zcat {} | cut -f2-11 | {} /dev/stdin {}".format(
                tmp.name, pred, gene_file),
                    shell=True)
        finally:
            os.unlink(tmp.name)
    else:
        sys.stderr.write("No annotation found!")

    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))
    downloaded = False
    for genome_url in [UCSC_GENOME_URL, ALT_UCSC_GENOME_URL]:
        remote = genome_url.format(genomebuild)
        genome_fa = os.path.join(genome_dir, os.path.split(remote)[-1])

        sys.stderr.write("Trying to download {}\n".format(remote))
        urllib.urlretrieve(remote, genome_fa)

        if check_genome_file(genome_fa):
            downloaded = True
            break
        os.unlink(genome_fa)

    if not downloaded:
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    # The command runs with genome_dir as working directory, so only the
    # bare archive name is used; this also works for a relative genome_dir.
    archive = os.path.basename(genome_fa)
    if archive.endswith("tar.gz"):
        cmd = "tar -xvzf {0} && rm {0}".format(archive)
    else:
        cmd = "gunzip {0}".format(archive)

    sp.call(cmd, shell=True, cwd=genome_dir)

    # A single FASTA file is split into one file per sequence.
    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        for n, s in Fasta(fa_files[0]).items():
            with open("{}/{}.fa".format(genome_dir, n), "w") as out:
                out.write(">{}\n{}\n".format(n, s))

        os.unlink(fa_files[0])

    sys.stderr.write("Creating index\n")
    g = GenomeIndex()
    g = g.create_index(genome_dir, index_dir)

    create_bedtools_fa(index_dir, genome_dir)
# Command line tool: for every motif, report the score at the
# (100 - 100 * fpr) percentile of the per-sequence scores, together with
# that score expressed as a fraction of the motif's score range.
parser = OptionParser()
parser.add_option("-p", "--pwmfile", dest="pwmfile", help="File with pwms", metavar="FILE")
parser.add_option("-i", "--inputfile", dest="inputfile", help="FASTA file with background sequences", metavar="FILE")
parser.add_option("-f", "--fpr", dest="fpr", help="Desired fpr", type="float", metavar="FLOAT")

(options, args) = parser.parse_args()

# An FPR of 0 is a valid request, so test for a missing value explicitly
# instead of relying on truthiness.
if not options.pwmfile or not options.inputfile or options.fpr is None:
    parser.print_help()
    sys.exit()

if options.fpr < 0 or options.fpr > 1:
    print("Please specify a FPR between 0 and 1")
    sys.exit()

f = Fasta(options.inputfile)
motifs = pwmfile_to_motifs(options.pwmfile)

print("Motif\tScore\tCutoff")
for motif in motifs:
    pwm = motif.pwm
    min_score = motif.pwm_min_score()
    # Score of the first reported hit in every input sequence.
    scores = [
        pwmscan(seq.upper(), pwm, min_score, 1, True)[0][0]
        for _, seq in f.items()
    ]
    opt_score = scoreatpercentile(scores, 100 - (100 * options.fpr))
    cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score)
    print("%s\t%s\t%s" % (motif.id, opt_score, cutoff))
Ejemplo n.º 21
0
	sys.exit(0)

# Collect the scan settings from the parsed command line options.
inputfile = options.inputfile

# nreport is only assigned here when the option was supplied.
if options.nreport:
	nreport = int(options.nreport)

cutoff = float(options.cutoff)

motifs = pwmfile_to_motifs(options.pwmfile)

bed = options.bed

f = Fasta(inputfile)
# Maps a numeric strand value (-1 / 1) to its strand symbol.
strandmap = {-1:"-",1:"+"}
for (id,seq) in f.items():
	for motif in motifs:
		pwm = motif.pwm
		c =  motif.pwm_min_score() + (motif.pwm_max_score() - motif.pwm_min_score()) * cutoff 
		result = pwmscan(seq.upper(), pwm, c, nreport)
		for (score, pos, strand) in result:
			if bed:
				first = id.split(" ")[0]	
				(chr,loc) = first.split(":")
				if loc:
					(start, end) = map(int, loc.split("-"))
					print "%s\t%s\t%s\t%s" % (chr, start + pos, start + pos + len(pwm) , score)
				else:
					print "%s\t%s\t%s\t%s" % (id, pos, pos +  len(pwm), score)
			else:
				print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tmotif_name \"%s\" ; motif_instance \"%s\"" % (
Ejemplo n.º 22
0
def genome(args):
    """
    Download a UCSC genome with its gene annotation and index it.

    Parameters
    ----------
    args : argparse object
        Command line arguments; the attributes ``indexdir``, ``fastadir``
        and ``genomebuild`` are read.

    Exits the process with status 1 when one of the directories does not
    exist, genePredToBed is not found, the genome directory cannot be
    created, or no valid genome file could be downloaded.
    """
    config = MotifConfig()

    if not os.path.exists(args.indexdir):
        print("Index_dir %s does not exist!" % (args.indexdir))
        sys.exit(1)

    if not os.path.exists(args.fastadir):
        print("FASTA dir %s does not exist!" % (args.fastadir))
        sys.exit(1)

    pred_bin = "genePredToBed"
    pred = find_executable(pred_bin)
    if not pred:
        sys.stderr.write("{} not found in path!\n".format(pred_bin))
        sys.exit(1)

    fastadir = args.fastadir
    genomebuild = args.genomebuild
    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(args.indexdir, args.genomebuild)

    # Check for rights to write to directory
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except OSError:
            sys.stderr.write("Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Download gene file based on URL + genomebuild
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)

    # Collect the annotation file names listed at the gene URL.
    anno = []
    listing = urllib2.urlopen(UCSC_GENE_URL.format(genomebuild))
    try:
        p = re.compile(r'\w+.Gene.txt.gz')
        for line in listing.readlines():
            m = p.search(line)
            if m:
                anno.append(m.group(0))
    finally:
        listing.close()

    sys.stderr.write("Retrieving gene annotation for {}\n".format(genomebuild))
    # The first entry of ANNOS that is available wins.
    url = ""
    for a in ANNOS:
        if a in anno:
            url = UCSC_GENE_URL.format(genomebuild) + a
            break
    if url:
        # Only the name of the temporary file is needed; it is removed again
        # once the annotation has been converted.
        tmp = NamedTemporaryFile(delete=False, suffix=".gz")
        tmp.close()
        try:
            urllib.urlretrieve(url, tmp.name)

            sp.call("zcat {} | cut -f2-11 | {} /dev/stdin {}".format(tmp.name, pred, gene_file), shell=True)
        finally:
            os.unlink(tmp.name)
    else:
        sys.stderr.write("No annotation found!")

    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))
    downloaded = False
    for genome_url in [UCSC_GENOME_URL, ALT_UCSC_GENOME_URL]:
        remote = genome_url.format(genomebuild)
        genome_fa = os.path.join(genome_dir, os.path.split(remote)[-1])

        sys.stderr.write("Trying to download {}\n".format(remote))
        urllib.urlretrieve(remote, genome_fa)

        if check_genome_file(genome_fa):
            downloaded = True
            break
        # Do not leave an invalid download behind.
        os.unlink(genome_fa)

    if not downloaded:
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    # The command runs with genome_dir as working directory, so only the
    # bare archive name is used; this also works for a relative genome_dir.
    archive = os.path.basename(genome_fa)
    if archive.endswith("tar.gz"):
        cmd = "tar -xvzf {0} && rm {0}".format(archive)
    else:
        # gunzip removes the compressed file itself; no extra rm is needed.
        cmd = "gunzip {0}".format(archive)

    sp.call(cmd, shell=True, cwd=genome_dir)

    # A single FASTA file is split into one file per sequence.
    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        for n, s in Fasta(fa_files[0]).items():
            with open("{}/{}.fa".format(genome_dir, n), "w") as out:
                out.write(">{}\n{}\n".format(n, s))

        os.unlink(fa_files[0])

    sys.stderr.write("Creating index\n")
    g = GenomeIndex()
    g = g.create_index(genome_dir, index_dir)
		plot_histogram(matches - width / 2 + len(motif) / 2, outfile, xrange=(-width / 2, width / 2), breaks=21, title="%s (p=%0.2e)" % (motif.id, p), xlabel="Position")
		return motif.id, p
	else:
		return motif.id, 1.0

# NOTE(review): help is only shown when BOTH options are missing; with just
# one of them missing the script continues - confirm whether `or` was meant.
if not options.fastafile and not options.pwmfile:
	parser.print_help()
	sys.exit()

fastafile = options.fastafile
pwmfile = options.pwmfile

# Without an explicit width, use the length of the first FASTA sequence.
lwidth = options.width
if not lwidth:
	f = Fasta(fastafile)
	lwidth = len(f.items()[0][1])
	f = None

# NOTE(review): the pp server secret is hard-coded here.
job_server = pp.Server(secret="pumpkinrisotto")
jobs = []
motifs = pwmfile_to_motifs(pwmfile)
# All motifs are selected unless a comma-separated id list was given.
ids = [motif.id for motif in motifs]
if options.ids:
	ids = options.ids.split(",")

# Submit one motif_localization job per selected motif.
for motif in motifs:
	if motif.id in ids:
		outfile = os.path.join("%s_histogram" % motif.id)
		jobs.append(job_server.submit(motif_localization, (fastafile,motif,lwidth,outfile, options.cutoff), (),()))

for job in jobs: