Example #1
0
    def __init__(self,
                 load_refseq=True,
                 load_canonical=True,
                 load_transcript=False):
        """Load the requested annotation maps, trying their pickle caches first.

        For every enabled flag the annotation path is built as
        ``<snapconf.TABIX_DB_PATH>/<annotation name>``, the map is read from
        ``<annotation path>.pkl`` with ``snaputil.load_cpickle_file`` and,
        when that yields a falsy value, the matching loader method is called
        on the annotation file (the loader is expected to fill the attribute;
        its body is not visible here). ``snaputil.store_cpickle_file`` is
        then called with the map whether or not the cache was hit.

        Args:
            load_refseq: handle ``self.gene_map``
                (``snapconf.REFSEQ_ANNOTATION``, ``load_gene_coords``).
            load_canonical: handle ``self.canonical_gene_map``
                (``snapconf.CANONICAL_ANNOTATION``,
                ``load_canonical_gene_coords``).
            load_transcript: handle ``self.transcript_map``
                (``snapconf.TABIX_GENE_INTERVAL_DB``, ``load_transcripts``).

        The attribute of a disabled flag is not assigned by this method.
        """
        # Raw literal: "\d" inside a plain string is an invalid escape
        # sequence; the compiled pattern is unchanged.
        self.ensembl_id_patt = re.compile(r'(ENST\d+)')

        if load_refseq:
            refseq_path = "%s/%s" % (snapconf.TABIX_DB_PATH,
                                     snapconf.REFSEQ_ANNOTATION)
            refseq_cache = "%s.pkl" % (refseq_path)
            self.gene_map = snaputil.load_cpickle_file(refseq_cache)
            if not self.gene_map:
                # no usable cache: parse the annotation file itself
                self.load_gene_coords(refseq_path)
            snaputil.store_cpickle_file(refseq_cache, self.gene_map)

        if load_canonical:
            canonical_path = "%s/%s" % (snapconf.TABIX_DB_PATH,
                                        snapconf.CANONICAL_ANNOTATION)
            canonical_cache = "%s.pkl" % (canonical_path)
            self.canonical_gene_map = snaputil.load_cpickle_file(
                canonical_cache)
            if not self.canonical_gene_map:
                self.load_canonical_gene_coords(canonical_path)
            snaputil.store_cpickle_file(canonical_cache,
                                        self.canonical_gene_map)

        # per transcript exons
        if load_transcript:
            transcript_path = "%s/%s" % (snapconf.TABIX_DB_PATH,
                                         snapconf.TABIX_GENE_INTERVAL_DB)
            transcript_cache = "%s.pkl" % (transcript_path)
            self.transcript_map = snaputil.load_cpickle_file(
                transcript_cache)
            if not self.transcript_map:
                self.load_transcripts(transcript_path)
            snaputil.store_cpickle_file(transcript_cache,
                                        self.transcript_map)
 def __init__(self, load_refseq=True, load_canonical=True, load_transcript=False):
     """Load the requested annotation maps, trying their pickle caches first.

     For every enabled flag the annotation path is built as
     ``<snapconf.TABIX_DB_PATH>/<annotation name>``, the map is read from
     ``<annotation path>.pkl`` with ``snaputil.load_cpickle_file`` and,
     when that yields a falsy value, the matching loader method is called
     on the annotation file (the loader is expected to fill the attribute;
     its body is not visible here). ``snaputil.store_cpickle_file`` is
     then called with the map whether or not the cache was hit.

     Args:
         load_refseq: handle ``self.gene_map``
             (``snapconf.REFSEQ_ANNOTATION``, ``load_gene_coords``).
         load_canonical: handle ``self.canonical_gene_map``
             (``snapconf.CANONICAL_ANNOTATION``,
             ``load_canonical_gene_coords``).
         load_transcript: handle ``self.transcript_map``
             (``snapconf.TABIX_GENE_INTERVAL_DB``, ``load_transcripts``).

     The attribute of a disabled flag is not assigned by this method.
     """
     # Raw literal: "\d" inside a plain string is an invalid escape
     # sequence; the compiled pattern is unchanged.
     self.ensembl_id_patt = re.compile(r'(ENST\d+)')

     if load_refseq:
         refseq_path = "%s/%s" % (snapconf.TABIX_DB_PATH, snapconf.REFSEQ_ANNOTATION)
         refseq_cache = "%s.pkl" % (refseq_path)
         self.gene_map = snaputil.load_cpickle_file(refseq_cache)
         if not self.gene_map:
             # no usable cache: parse the annotation file itself
             self.load_gene_coords(refseq_path)
         snaputil.store_cpickle_file(refseq_cache, self.gene_map)

     if load_canonical:
         canonical_path = "%s/%s" % (snapconf.TABIX_DB_PATH, snapconf.CANONICAL_ANNOTATION)
         canonical_cache = "%s.pkl" % (canonical_path)
         self.canonical_gene_map = snaputil.load_cpickle_file(canonical_cache)
         if not self.canonical_gene_map:
             self.load_canonical_gene_coords(canonical_path)
         snaputil.store_cpickle_file(canonical_cache, self.canonical_gene_map)

     # per transcript exons
     if load_transcript:
         transcript_path = "%s/%s" % (snapconf.TABIX_DB_PATH, snapconf.TABIX_GENE_INTERVAL_DB)
         transcript_cache = "%s.pkl" % (transcript_path)
         self.transcript_map = snaputil.load_cpickle_file(transcript_cache)
         if not self.transcript_map:
             self.load_transcripts(transcript_path)
         snaputil.store_cpickle_file(transcript_cache, self.transcript_map)
Example #3
0
def sample_ids2intron_ids_from_bit_vector(sample_ids):
    """Return the ids flagged in the OR of the given samples' stored vectors.

    For each sample id the object pickled at
    ``<snapconf.PACKED_SAMPLE_IDS_PATH>/<sample_id>.pkl`` is loaded
    (uncompressed) and all loaded objects are combined with ``|``.
    Presumably these are bitarray-like vectors indexed by snaptron id;
    the stored type is not visible here.

    Args:
        sample_ids: iterable of sample identifiers (each is passed to str()).

    Returns:
        A set holding, as strings, the positions whose value in the
        combined vector is truthy. An empty set is returned when
        ``sample_ids`` is empty or none of the samples has a mapping.
    """
    combined = None
    for sample_id in sample_ids:
        vector = snaputil.load_cpickle_file(
            "%s/%s.pkl" % (snapconf.PACKED_SAMPLE_IDS_PATH, str(sample_id)),
            compressed=False)
        # in a few cases we may not have a mapping for a specific sample_id
        if vector is None:
            continue
        combined = vector if combined is None else combined | vector

    # Nothing was loaded: enumerate(None) would raise TypeError.
    if combined is None:
        return set()
    return set(str(pos) for (pos, flag) in enumerate(combined) if flag)
Example #4
0
def sample_ids2intron_ids_from_bit_vector(sample_ids):
    """Return the ids flagged in the OR of the given samples' stored vectors.

    For each sample id the object pickled at
    ``<snapconf.PACKED_SAMPLE_IDS_PATH>/<sample_id>.pkl`` is loaded
    (uncompressed) and all loaded objects are combined with ``|``.
    Presumably these are bitarray-like vectors indexed by snaptron id;
    the stored type is not visible here.

    Args:
        sample_ids: iterable of sample identifiers (each is passed to str()).

    Returns:
        A set holding, as strings, the positions whose value in the
        combined vector is truthy. An empty set is returned when
        ``sample_ids`` is empty or none of the samples has a mapping.
    """
    combined = None
    for sample_id in sample_ids:
        vector = snaputil.load_cpickle_file(
            "%s/%s.pkl" % (snapconf.PACKED_SAMPLE_IDS_PATH, str(sample_id)),
            compressed=False)
        # in a few cases we may not have a mapping for a specific sample_id
        if vector is None:
            continue
        combined = vector if combined is None else combined | vector

    # Nothing was loaded: enumerate(None) would raise TypeError.
    if combined is None:
        return set()
    return set(str(pos) for (pos, flag) in enumerate(combined) if flag)
Example #5
0
def load_sample_metadata(file_):
    """Return a dict mapping each line's first tab-separated field to the line.

    ``<file_>.pkl`` is tried first through ``snaputil.load_cpickle_file``;
    a truthy result is returned as is. Otherwise ``file_`` is read line by
    line, the resulting dict is passed to ``snaputil.store_cpickle_file``
    under the same ``.pkl`` path, and the dict is returned.

    Args:
        file_: path of the tab-separated sample metadata file.

    Returns:
        dict of first field -> whole line (trailing whitespace removed).
        A later line with the same first field replaces an earlier one.
    """
    pickle_path = "%s.pkl" % file_
    fmd = snaputil.load_cpickle_file(pickle_path)
    if fmd:
        return fmd

    fmd = {}
    # dont need the hash-on-column headers just yet
    with open(file_, "r") as f:
        for line in f:
            # rstrip() removes all trailing whitespace, not only the newline.
            # NOTE(review): this also drops trailing tabs (empty last
            # columns) -- confirm that is intended.
            line = line.rstrip()
            fmd[line.split("\t")[0]] = line
    snaputil.store_cpickle_file(pickle_path, fmd)
    return fmd
Example #6
0
def load_sample_metadata(file_):
    """Return a dict mapping each line's first tab-separated field to the line.

    ``<file_>.pkl`` is tried first through ``snaputil.load_cpickle_file``;
    a truthy result is returned as is. Otherwise ``file_`` is read line by
    line, the resulting dict is passed to ``snaputil.store_cpickle_file``
    under the same ``.pkl`` path, and the dict is returned.

    Args:
        file_: path of the tab-separated sample metadata file.

    Returns:
        dict of first field -> whole line (trailing whitespace removed).
        A later line with the same first field replaces an earlier one.
    """
    pickle_path = "%s.pkl" % file_
    fmd = snaputil.load_cpickle_file(pickle_path)
    if fmd:
        return fmd

    fmd = {}
    # dont need the hash-on-column headers just yet
    with open(file_, "r") as f:
        for line in f:
            # rstrip() removes all trailing whitespace, not only the newline.
            # NOTE(review): this also drops trailing tabs (empty last
            # columns) -- confirm that is intended.
            line = line.rstrip()
            fmd[line.split("\t")[0]] = line
    snaputil.store_cpickle_file(pickle_path, fmd)
    return fmd
Example #7
0
# Passed as `compressed=` to every load_cpickle_file call in this script.
COMPRESSED = False

# Directory containing the numbered pickle files (0.pkl, 1.pkl, ...).
path = '/data3/snaptron/sample_ids_full'
# alternative location: path = '/data3/snaptron/sample_ids'

# Text appended after ".pkl" when building each file name.
# alternative: suffix = '.gz'
suffix = ''

def orthem(ba1,ba2):
    """Return ``ba1 | ba2`` (the operands are not modified by this function)."""
    return ba1 | ba2

def setthem(ba_final):
    """Return the set of positions in ``ba_final`` whose element is truthy.

    Args:
        ba_final: any iterable; elements are tested for truthiness.

    Returns:
        set of int: zero-based indexes of the truthy elements.
    """
    # Build the set directly rather than through a throwaway list of
    # None values produced by calling set.add() in a list comprehension.
    return set(pos for (pos, flag) in enumerate(ba_final) if flag)

# OR together every pickle found under `path` for file numbers 0..4999.
# A load that returns None (presumably a missing file) is skipped; this now
# includes 0.pkl, which previously had to exist for the OR-ing to work.
ba_final = None
for i in range(5000):
    ba2 = su.load_cpickle_file("%s/%s.pkl%s" % (path,str(i),suffix), compressed=COMPRESSED)
    # identity test: "!= None" would go through the loaded object's __ne__
    if ba2 is None:
        continue
    ba_final = ba2 if ba_final is None else orthem(ba2,ba_final)

# No file loaded at all -> report an empty set instead of failing.
s1 = setthem(ba_final) if ba_final is not None else set()
# Single-argument parenthesised form prints the same on Python 2 and 3.
print(len(s1))
#sys.stdout.write(",".join([str(x) for x in s1]))