Ejemplo n.º 1
0
def test_genome_index(index_dir, loc_and_seq):
    """Check that a genome index returns the expected sequence.

    ``index_dir`` is handed to ``GenomeIndex``; ``loc_and_seq`` is a
    two-item pair whose first item is unpacked as the positional arguments
    of ``get_sequence`` and whose second item is the expected result.
    """
    from gimmemotifs.genome_index import GenomeIndex

    genome = GenomeIndex(index_dir)
    location, expected = loc_and_seq

    observed = genome.get_sequence(*location)
    assert expected == observed
Ejemplo n.º 2
0
def index_dir():
    """Create an index for the test genome and return the index directory.

    The index is built from ``tests/data/genome/`` into
    ``tests/data/index/``; the latter path is returned.
    """
    from gimmemotifs.genome_index import GenomeIndex

    target = 'tests/data/index/'
    GenomeIndex().create_index('tests/data/genome/', target)
    return target
Ejemplo n.º 3
0
def index(args):
    """Create a genome index below ``args.indexdir``.

    Reads ``indexdir``, ``indexname`` and ``fastadir`` from ``args``. The
    index is created from ``fastadir`` into ``<indexdir>/<indexname>``, after
    which ``create_bedtools_fa`` is called with the index and FASTA
    directories.

    Prints a message and exits with status 1 when ``args.indexdir`` does not
    exist.
    """
    if not os.path.exists(args.indexdir):
        # Parenthesised single-argument form: behaves the same as the
        # Python 2 print statement and is also valid Python 3.
        print("Index_dir %s does not exist!" % (args.indexdir))
        sys.exit(1)

    fasta_dir = args.fastadir
    index_dir = os.path.join(args.indexdir, args.indexname)

    g = GenomeIndex()
    g.create_index(fasta_dir, index_dir)

    create_bedtools_fa(index_dir, fasta_dir)
Ejemplo n.º 4
0
    def __init__(self,
                 session=None,
                 conn='mysql://pita:@localhost/pita',
                 new=False,
                 index=None):
        """Set up the logger, the database session and the optional index.

        session -- existing session to reuse; when falsy,
                   ``_init_session(conn, new)`` is called instead
        conn    -- connection string handed to ``_init_session``
        new     -- handed through to ``_init_session``
        index   -- when truthy, passed to ``GenomeIndex`` and stored as
                   ``self.index``; otherwise ``self.index`` stays ``None``
        """
        self.logger = logging.getLogger("pita")

        # reuse the caller's session if there is one, else initialize one
        if not session:
            self._init_session(conn, new)
        else:
            self.session = session

        # optional genome index, used to retrieve sequence
        self.index = None
        if index:
            self.index = GenomeIndex(index)

        # statistics caches start out empty
        self.cache_feature_stats = {}
        self.cache_splice_stats = {}
Ejemplo n.º 5
0
    def test2_as_fasta(self):
        """ convert bed, regions, etc to Fasta

        Builds a temporary genome index from ``self.genome_dir`` and checks
        that ``as_fasta`` returns a ``Fasta`` for a Fasta object, a FASTA
        file, a BED file, a region file and a list of regions. Also checks
        that a BED file without an index directory raises ``ValueError``.
        """
        tmpdir = mkdtemp()

        # Remove the temporary index even when an assertion below fails.
        try:
            g = GenomeIndex()
            g.create_index(self.genome_dir, tmpdir)

            fafile = os.path.join(self.datadir, "test.fa")
            fa = Fasta(fafile)
            bedfile = os.path.join(self.datadir, "test.bed")
            regionfile = os.path.join(self.datadir, "test.txt")
            # Close the region file deterministically instead of leaking
            # the handle.
            with open(regionfile) as handle:
                regions = [line.strip() for line in handle]

            self.assertIsInstance(as_fasta(fa), Fasta)
            self.assertIsInstance(as_fasta(fafile), Fasta)

            self.assertIsInstance(as_fasta(bedfile, tmpdir), Fasta)
            self.assertIsInstance(as_fasta(regionfile, tmpdir), Fasta)
            self.assertIsInstance(as_fasta(regions, tmpdir), Fasta)

            with self.assertRaises(ValueError):
                as_fasta(bedfile)
        finally:
            rmtree(tmpdir)
Ejemplo n.º 6
0
    def _scan_regions(self, regions, nreport, scan_rc, cutoff=0.95):
        """Scan regions for motifs and yield the results (generator).

        Without the cache (``self.use_cache`` false) all regions are scanned
        and each result is yielded as the jobs are collected, in submission
        order. With the cache, only regions that have no cached entry are
        scanned and their results are stored; afterwards the cached result of
        every entry in ``regions`` is yielded, in order.

        regions -- regions to scan; must support ``len()`` and slicing
        nreport -- passed through to ``scan_region_mult``
        scan_rc -- passed through to ``scan_region_mult``
        cutoff  -- passed to ``load_motifs``; also part of the cache key

        Raises ``Exception`` when, after scanning, a region's result cannot
        be read back from the cache.

        NOTE(review): storing results assumes each job returns exactly one
        result per region of its chunk, in order -- confirm against
        ``scan_region_mult``.
        """
        index_dir = self.index_dir
        motif_file = self.motifs
        motif_digest = self.checksum.get(motif_file, None)

        def cache_key(region):
            # Built from the region plus the scan parameters used here.
            return str((region, index_dir, motif_digest, nreport, scan_rc,
                        cutoff))

        # determine which regions are not in the cache
        scan_regions = regions
        if self.use_cache:
            scan_regions = [region for region in regions
                            if self.cache.get(cache_key(region)) == NO_VALUE]

        # scan the regions that are not in the cache
        if len(scan_regions) > 0:
            n = 12

            genome_index = GenomeIndex(index_dir)

            motifs = load_motifs(motif_file, cutoff)

            scan_func = partial(scan_region_mult,
                                genome_index=genome_index,
                                motifs=motifs,
                                nreport=nreport,
                                scan_rc=scan_rc)

            # Floor division keeps the chunk size an integer: with true
            # division the slice bounds below would be floats, which is a
            # TypeError.
            chunksize = len(scan_regions) // n + 1
            jobs = []
            for i in range(n):
                job = pool.apply_async(
                    scan_func,
                    (scan_regions[i * chunksize:(i + 1) * chunksize], ))
                jobs.append(job)

            # return values or store values in cache
            i = 0
            for job in jobs:
                for ret in job.get():
                    if self.use_cache:
                        # store values in cache
                        self.cache.set(cache_key(scan_regions[i]), ret)
                    else:
                        # return values
                        yield ret
                    i += 1

        if self.use_cache:
            # return results from cache
            for region in regions:
                ret = self.cache.get(cache_key(region))
                if ret == NO_VALUE or ret is None:
                    raise Exception("cache is not big enough to hold all "
                                    "results, try increasing the cache size "
                                    "or disable cache")
                yield ret