Ejemplo n.º 1
0
def index_dir():
    """Build the genome index used by the tests and return its directory.

    The FASTA files under ``tests/data/genome/`` are indexed into
    ``tests/data/index/``; that output path is returned to the caller.
    """
    from gimmemotifs.genome_index import GenomeIndex

    target = 'tests/data/index/'
    GenomeIndex().create_index('tests/data/genome/', target)
    return target
Ejemplo n.º 2
0
def index(args):
    """Create a genome index plus a bedtools-ready ``genome.fa``.

    Reads ``args.indexdir`` (must already exist), ``args.indexname`` and
    ``args.fastadir``.  The index is written to
    ``<indexdir>/<indexname>``; all FASTA files found in ``fastadir`` are
    then concatenated into ``genome.fa`` inside that directory, and bedtools
    is run once against it with a one-base test interval.

    Exits with status 1 when ``args.indexdir`` does not exist.  Re-raises
    any ``BEDToolsError`` whose message does not contain "generating".
    """
    if not os.path.exists(args.indexdir):
        print("Index_dir %s does not exist!" % (args.indexdir))
        sys.exit(1)

    fasta_dir = args.fastadir
    index_dir = os.path.join(args.indexdir, args.indexname)

    g = GenomeIndex()
    g.create_index(fasta_dir, index_dir)

    # Create genome FASTA file for use with bedtools
    genome_fa = os.path.join(index_dir, "genome.fa")
    with open(genome_fa, 'w') as out:
        for fname in find_by_ext(fasta_dir, FASTA_EXT):
            # Close every input file as soon as it has been copied.
            with open(fname) as handle:
                for line in handle:
                    out.write(line)

    test_chr = g.get_chromosomes()[0]

    # Text mode so that writing a str works; the context manager guarantees
    # the temporary BED file is closed (and thereby removed) afterwards.
    with NamedTemporaryFile(mode="w") as tmp:
        tmp.write("{}\t1\t2\n".format(test_chr))
        tmp.flush()

        b = pybedtools.BedTool(tmp.name)
        try:
            b.nucleotide_content(fi=genome_fa)
        except pybedtools.helpers.BEDToolsError as e:
            # Errors mentioning "generating" are tolerated (presumably the
            # notice bedtools prints while building the FASTA index --
            # TODO confirm); everything else is a real failure.
            if "generating" not in str(e):
                raise
Ejemplo n.º 3
0
def index_dir():
    """Create the test genome index and hand back the index directory.

    Indexes ``tests/data/genome/`` into ``tests/data/index/`` and returns
    the latter path.
    """
    from gimmemotifs.genome_index import GenomeIndex

    genome_source = 'tests/data/genome/'
    index_target = 'tests/data/index/'

    indexer = GenomeIndex()
    indexer.create_index(genome_source, index_target)
    return index_target
Ejemplo n.º 4
0
def index(args):
    """Index the FASTA files in ``args.fastadir``.

    The index is written to ``<args.indexdir>/<args.indexname>`` and
    ``create_bedtools_fa`` is then called with the index and FASTA
    directories.  If ``args.indexdir`` does not exist, a message is printed
    and the process exits with status 1.
    """
    if os.path.exists(args.indexdir):
        source_dir = args.fastadir
        target_dir = os.path.join(args.indexdir, args.indexname)

        GenomeIndex().create_index(source_dir, target_dir)
        create_bedtools_fa(target_dir, source_dir)
    else:
        print("Index_dir %s does not exist!" % (args.indexdir))
        sys.exit(1)
Ejemplo n.º 5
0
    def test2_as_fasta(self):
        """ convert bed, regions, etc to Fasta

        Checks that ``as_fasta`` returns a ``Fasta`` instance for a Fasta
        object and a FASTA file name (no index needed), and for a BED file,
        a region file and a list of regions when an index directory is
        supplied.  A BED file without an index must raise ``ValueError``.
        """
        tmpdir = mkdtemp()
        # try/finally so the temporary index is removed even when an
        # assertion (or the indexing itself) fails.
        try:
            g = GenomeIndex()
            g.create_index(self.genome_dir, tmpdir)

            fafile = os.path.join(self.datadir, "test.fa")
            fa = Fasta(fafile)
            bedfile = os.path.join(self.datadir, "test.bed")
            regionfile = os.path.join(self.datadir, "test.txt")
            with open(regionfile) as handle:
                regions = [line.strip() for line in handle]

            self.assertIsInstance(as_fasta(fa), Fasta)
            self.assertIsInstance(as_fasta(fafile), Fasta)

            self.assertIsInstance(as_fasta(bedfile, tmpdir), Fasta)
            self.assertIsInstance(as_fasta(regionfile, tmpdir), Fasta)
            self.assertIsInstance(as_fasta(regions, tmpdir), Fasta)

            with self.assertRaises(ValueError):
                as_fasta(bedfile)
        finally:
            rmtree(tmpdir)
Ejemplo n.º 6
0
    def test2_as_fasta(self):
        """ convert bed, regions, etc to Fasta

        ``as_fasta`` must return a ``Fasta`` instance for every supported
        input: a Fasta object, a FASTA file name, and (given an index
        directory) a BED file, a region file and a list of region strings.
        Passing a BED file without an index must raise ``ValueError``.
        """
        tmpdir = mkdtemp()
        # Clean up the temporary index no matter how the test ends.
        try:
            GenomeIndex().create_index(self.genome_dir, tmpdir)

            fafile = os.path.join(self.datadir, "test.fa")
            fa = Fasta(fafile)
            bedfile = os.path.join(self.datadir, "test.bed")
            regionfile = os.path.join(self.datadir, "test.txt")
            # Read the regions through a context manager so the handle is
            # closed deterministically.
            with open(regionfile) as handle:
                regions = [line.strip() for line in handle]

            # Inputs that need no genome index.
            self.assertIsInstance(as_fasta(fa), Fasta)
            self.assertIsInstance(as_fasta(fafile), Fasta)

            # Inputs that are resolved against the genome index.
            self.assertIsInstance(as_fasta(bedfile, tmpdir), Fasta)
            self.assertIsInstance(as_fasta(regionfile, tmpdir), Fasta)
            self.assertIsInstance(as_fasta(regions, tmpdir), Fasta)

            with self.assertRaises(ValueError):
                as_fasta(bedfile)
        finally:
            rmtree(tmpdir)
Ejemplo n.º 7
0
def genome(args):
    """Download a genome and its gene annotation from UCSC and index it.

    Reads ``args.indexdir``, ``args.fastadir`` and ``args.genomebuild``.

    Steps:

    1. Look up the annotation tables listed at ``UCSC_GENE_URL`` for the
       build, download the first one from ``ANNOS`` that is available and
       convert it with ``genePredToBed`` into ``<gene_dir>/<build>.bed``.
       A missing annotation is reported on stderr but is not fatal.
    2. Download the genome from ``UCSC_GENOME_URL`` or, if that file does
       not pass ``check_genome_file``, from ``ALT_UCSC_GENOME_URL``.
    3. Unpack the archive into ``<fastadir>/<build>``.  If this leaves a
       single ``.fa`` file, split it into one FASTA file per sequence.
    4. Create the index in ``<indexdir>/<build>``.

    Exits with status 1 when ``indexdir`` or ``fastadir`` does not exist,
    when ``genePredToBed`` is not on the path, when the genome directory
    cannot be created, or when no valid genome file could be downloaded.
    """
    config = MotifConfig()

    if not os.path.exists(args.indexdir):
        print("Index_dir %s does not exist!" % (args.indexdir))
        sys.exit(1)

    if not os.path.exists(args.fastadir):
        print("FASTA dir %s does not exist!" % (args.fastadir))
        sys.exit(1)

    pred_bin = "genePredToBed"
    pred = find_executable(pred_bin)
    if not pred:
        sys.stderr.write("{} not found in path!\n".format(pred_bin))
        sys.exit(1)

    genomebuild = args.genomebuild
    genome_dir = os.path.join(args.fastadir, genomebuild)
    index_dir = os.path.join(args.indexdir, genomebuild)

    # Check for rights to write to directory
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except OSError:
            sys.stderr.write("Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Download gene file based on URL + genomebuild
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)
    gene_url = UCSC_GENE_URL.format(genomebuild)

    anno = []
    pattern = re.compile(r'\w+.Gene.txt.gz')
    listing = urllib2.urlopen(gene_url)
    try:
        for line in listing.readlines():
            m = pattern.search(line)
            if m:
                anno.append(m.group(0))
    finally:
        listing.close()

    sys.stderr.write("Retrieving gene annotation for {}\n".format(genomebuild))
    url = ""
    # ANNOS is scanned in order; the first available entry wins.
    for a in ANNOS:
        if a in anno:
            url = gene_url + a
            break

    if url:
        # Only the file name is needed: urlretrieve writes to the path
        # itself, so close our handle right away and remove the file once
        # the conversion is done.
        tmp = NamedTemporaryFile(delete=False, suffix=".gz")
        tmp.close()
        try:
            urllib.urlretrieve(url, tmp.name)
            sp.call(
                "zcat {} | cut -f2-11 | {} /dev/stdin {}".format(
                    tmp.name, pred, gene_file),
                shell=True)
        finally:
            os.unlink(tmp.name)
    else:
        sys.stderr.write("No annotation found!\n")

    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))
    for genome_url in [UCSC_GENOME_URL, ALT_UCSC_GENOME_URL]:
        remote = genome_url.format(genomebuild)
        genome_fa = os.path.join(genome_dir, os.path.split(remote)[-1])

        sys.stderr.write("Trying to download {}\n".format(remote))
        urllib.urlretrieve(remote, genome_fa)

        if check_genome_file(genome_fa):
            break
    else:
        # Neither location yielded a valid genome file.
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    # The command runs with cwd=genome_dir, so hand it absolute paths;
    # relative ones would be resolved against the wrong directory.
    abs_genome_dir = os.path.abspath(genome_dir)
    abs_genome_fa = os.path.abspath(genome_fa)
    if genome_fa.endswith("tar.gz"):
        cmd = "tar -C {0} -xvzf {1} && rm {1}".format(abs_genome_dir, abs_genome_fa)
    else:
        # gunzip removes the compressed file itself on success.
        cmd = "gunzip {0}".format(abs_genome_fa)

    sp.call(cmd, shell=True, cwd=genome_dir)

    # A single FASTA file is split into one file per sequence.
    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        source = fa_files[0]
        written = set()
        for name, seq in Fasta(source).items():
            target = os.path.join(genome_dir, "{}.fa".format(name))
            with open(target, "w") as out:
                # NOTE(review): assumes the sequence names come without the
                # leading ">" of the FASTA header -- confirm against Fasta.
                out.write(">{}\n{}\n".format(name, seq))
            written.add(os.path.abspath(target))

        # Do not delete a file that was just (re)written under its own name.
        if os.path.abspath(source) not in written:
            os.unlink(source)

    sys.stderr.write("Creating index\n")
    GenomeIndex().create_index(genome_dir, index_dir)