Exemple #1
0
    def get_basic_batch(self, idx):
        """Return the one-hot encoded sequences for batch number ``idx``.

        Every selected row of ``self.data`` contributes one sequence: the
        reference bases fetched to the left of the row's position, the value
        stored in column ``self.allele_col``, and the reference bases fetched
        to the right of the position. No labels are returned.

        Args:
            idx: batch number; selects the slice ``idx * batch_size`` to
                ``(idx + 1) * batch_size`` of ``self.indices``.

        Returns:
            The array of encoded sequences with a new axis inserted at
            position 1. Characters missing from ``ltrdict`` are encoded as
            ``[0, 0, 0, 0]``.
        """
        first = idx * self.batch_size
        last = (idx + 1) * self.batch_size
        batch_rows = self.data.iloc[self.indices[first:last]]

        def flanked_allele(key, record):
            # The first two elements of the row index are used as the
            # chromosome and the position handed to self.ref.fetch.
            variant = record[self.allele_col]
            chrom, pos = key[0], key[1]
            anchor = pos - 1
            # end - start is flank_size for the left window and
            # flank_size - 1 for the right window.
            # NOTE(review): presumably pos is 1-based and fetch is 0-based
            # half-open -- confirm against the reference reader in use.
            upstream = self.ref.fetch(chrom, anchor - self.flank_size, anchor)
            downstream = self.ref.fetch(chrom, pos, pos + self.flank_size - 1)
            return upstream + variant + downstream

        sequences = [flanked_allele(key, record)
                     for key, record in batch_rows.iterrows()]

        # Fallback encoding for any character that ltrdict does not know.
        unknown_base = [0, 0, 0, 0]
        encoded = np.array([[ltrdict.get(base, unknown_base) for base in sequence]
                            for sequence in sequences])
        return np.expand_dims(encoded, axis=1)
Exemple #2
0
    def get_upsampled_positives_batch(self, idx):
        """Return batch ``idx`` drawn from separate positive and negative pools.

        Positive rows are taken from ``self.ones`` (sliced through
        ``self.pos_indices`` with ``self.pos_batch_size``) and negative rows
        from ``self.zeros`` (sliced through ``self.neg_indices`` with
        ``self.neg_batch_size``). Positive sequences come first, negative
        sequences second. When ``self.add_revcomp == True`` the reverse
        complement of every sequence is appended and the labels are repeated
        so both outputs stay aligned.

        Args:
            idx: batch number used to slice both index pools.

        Returns:
            ``(x_batch, y_batch)``: the encoded sequences with a new axis
            inserted at position 1, and the concatenated label rows.
            Characters missing from ``ltrdict`` are encoded as
            ``[0, 0, 0, 0]``.
        """
        pos_rows = self.pos_indices[idx * self.pos_batch_size:
                                    (idx + 1) * self.pos_batch_size]
        pos_regions = self.ones.index[pos_rows]
        neg_rows = self.neg_indices[idx * self.neg_batch_size:
                                    (idx + 1) * self.neg_batch_size]
        neg_regions = self.zeros.index[neg_rows]

        # The first three elements of each index entry are passed to fetch.
        sequences = []
        for regions in (pos_regions, neg_regions):
            sequences.extend(self.ref.fetch(region[0], region[1], region[2])
                             for region in regions)

        # Explicit comparison kept on purpose: only values equal to True
        # switch the reverse-complement augmentation on.
        augment = self.add_revcomp == True
        if augment:
            sequences += [revcomp(sequence) for sequence in sequences]

        # Fallback encoding for any character that ltrdict does not know.
        unknown_base = [0, 0, 0, 0]
        encoded = np.array([[ltrdict.get(base, unknown_base) for base in sequence]
                            for sequence in sequences])
        x_batch = np.expand_dims(encoded, axis=1)

        # Labels follow the same order as the sequences: positives, negatives.
        labels = np.concatenate(
            (self.ones.iloc[pos_rows], self.zeros.iloc[neg_rows]), axis=0)
        if augment:
            # Reverse-complemented sequences reuse the labels of their source.
            labels = np.concatenate((labels, labels), axis=0)
        return (x_batch, labels)
Exemple #3
0
    def get_upsampled_positives_batch(self, idx):
        """Assemble batch ``idx`` from the positive and the negative pool.

        Rows of ``self.ones`` are picked through ``self.pos_indices`` and
        ``self.pos_batch_size``; rows of ``self.zeros`` through
        ``self.neg_indices`` and ``self.neg_batch_size``. Sequences and
        labels are ordered positives first, negatives second. When
        ``self.add_revcomp == True`` every sequence is followed (at the end
        of the batch) by its reverse complement, and the label block is
        duplicated accordingly.

        Args:
            idx: batch number used to slice both index pools.

        Returns:
            ``(x_batch, y_batch)``: encoded sequences with a new axis
            inserted at position 1, and the matching label rows. Characters
            missing from ``ltrdict`` are encoded as ``[0, 0, 0, 0]``.
        """
        pos_lo = idx * self.pos_batch_size
        pos_hi = (idx + 1) * self.pos_batch_size
        pos_rows = self.pos_indices[pos_lo:pos_hi]
        pos_regions = self.ones.index[pos_rows]

        neg_lo = idx * self.neg_batch_size
        neg_hi = (idx + 1) * self.neg_batch_size
        neg_rows = self.neg_indices[neg_lo:neg_hi]
        neg_regions = self.zeros.index[neg_rows]

        def fetch_region(region):
            # The first three elements of the index entry are passed to fetch.
            return self.ref.fetch(region[0], region[1], region[2])

        sequences = [fetch_region(region) for region in pos_regions]
        sequences += [fetch_region(region) for region in neg_regions]

        # Explicit comparison kept on purpose: only values equal to True
        # switch the reverse-complement augmentation on.
        augment = self.add_revcomp == True
        if augment:
            sequences += [revcomp(sequence) for sequence in sequences]

        # Fallback encoding for any character that ltrdict does not know.
        unknown_base = [0, 0, 0, 0]
        encoded = np.array([[ltrdict.get(base, unknown_base) for base in sequence]
                            for sequence in sequences])
        x_batch = np.expand_dims(encoded, axis=1)

        # Labels follow the same order as the sequences: positives, negatives.
        labels = np.concatenate(
            (self.ones.iloc[pos_rows], self.zeros.iloc[neg_rows]), axis=0)
        if augment:
            # Reverse-complemented sequences reuse the labels of their source.
            labels = np.concatenate((labels, labels), axis=0)
        return (x_batch, labels)
Exemple #4
0
 def get_basic_batch(self,idx):
     """Return the ``(x_batch, y_batch)`` pair for batch number ``idx``.

     The index entries of the selected ``self.data`` rows are passed to
     ``self.ref.fetch`` to obtain the sequences; the row values themselves
     become the labels. When ``self.add_revcomp == True`` the reverse
     complements of all sequences are appended and the labels are repeated
     to match.

     Args:
         idx: batch number; selects the slice ``idx * batch_size`` to
             ``(idx + 1) * batch_size`` of ``self.indices``.

     Returns:
         ``(x_batch, y_batch)``: encoded sequences with a new axis inserted
         at position 1, and the label array. Characters missing from
         ``ltrdict`` are encoded as ``[0, 0, 0, 0]``.
     """
     first = idx * self.batch_size
     last = (idx + 1) * self.batch_size
     rows = self.indices[first:last]

     # The first three elements of each index entry are passed to fetch.
     sequences = []
     for region in self.data.index[rows]:
         sequences.append(self.ref.fetch(region[0], region[1], region[2]))

     # Explicit comparison kept on purpose: only values equal to True
     # switch the reverse-complement augmentation on.
     augment = self.add_revcomp == True
     if augment:
         sequences += [revcomp(sequence) for sequence in sequences]

     # Fallback encoding for any character that ltrdict does not know.
     unknown_base = [0, 0, 0, 0]
     encoded = np.array([[ltrdict.get(base, unknown_base) for base in sequence]
                         for sequence in sequences])
     x_batch = np.expand_dims(encoded, axis=1)

     labels = np.asarray(self.data.iloc[rows])
     if augment:
         # Reverse-complemented sequences reuse the labels of their source.
         labels = np.concatenate((labels, labels), axis=0)
     return (x_batch, labels)
Exemple #5
0
 def get_basic_batch(self, idx):
     """Build inputs and labels for batch number ``idx``.

     Sequences are fetched from ``self.ref`` using the index entries of the
     selected ``self.data`` rows, and the values of those rows are used as
     labels. When ``self.add_revcomp == True`` the batch is extended with
     the reverse complement of every sequence and the labels are
     duplicated so both outputs stay aligned.

     Args:
         idx: batch number; selects the slice ``idx * batch_size`` to
             ``(idx + 1) * batch_size`` of ``self.indices``.

     Returns:
         ``(x_batch, y_batch)``: encoded sequences with a new axis inserted
         at position 1, and the label array. Characters missing from
         ``ltrdict`` are encoded as ``[0, 0, 0, 0]``.
     """
     rows = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
     regions = self.data.index[rows]

     def fetch_region(region):
         # The first three elements of the index entry are passed to fetch.
         return self.ref.fetch(region[0], region[1], region[2])

     sequences = [fetch_region(region) for region in regions]

     # Explicit comparison kept on purpose: only values equal to True
     # switch the reverse-complement augmentation on.
     augment = self.add_revcomp == True
     if augment:
         sequences = sequences + [revcomp(sequence) for sequence in sequences]

     # Fallback encoding for any character that ltrdict does not know.
     unknown_base = [0, 0, 0, 0]
     x_batch = np.expand_dims(
         np.array([[ltrdict.get(base, unknown_base) for base in sequence]
                   for sequence in sequences]),
         axis=1)

     labels = np.asarray(self.data.iloc[rows])
     if augment:
         # Reverse-complemented sequences reuse the labels of their source.
         labels = np.concatenate((labels, labels), axis=0)
     return (x_batch, labels)
Exemple #6
0
 def get_basic_batch(self,idx):
     """Encode the allele-centred sequences of batch number ``idx``.

     For each selected row of ``self.data`` the sequence is the left
     reference flank, then the value in column ``self.allele_col``, then
     the right reference flank. Only the encoded inputs are returned.

     Args:
         idx: batch number; selects the slice ``idx * batch_size`` to
             ``(idx + 1) * batch_size`` of ``self.indices``.

     Returns:
         The array of encoded sequences with a new axis inserted at
         position 1. Characters missing from ``ltrdict`` are encoded as
         ``[0, 0, 0, 0]``.
     """
     rows = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
     selected = self.data.iloc[rows]

     sequences = []
     for key, record in selected.iterrows():
         variant = record[self.allele_col]
         # The first two elements of the row index are used as the
         # chromosome and the position handed to self.ref.fetch.
         chrom, pos = key[0], key[1]
         anchor = pos - 1
         # end - start is flank_size for the left window and
         # flank_size - 1 for the right window.
         # NOTE(review): presumably pos is 1-based and fetch is 0-based
         # half-open -- confirm against the reference reader in use.
         upstream = self.ref.fetch(chrom, anchor - self.flank_size, anchor)
         downstream = self.ref.fetch(chrom, pos, pos + self.flank_size - 1)
         sequences.append(upstream + variant + downstream)

     # Fallback encoding for any character that ltrdict does not know.
     unknown_base = [0, 0, 0, 0]
     encoded = np.array([[ltrdict.get(base, unknown_base) for base in sequence]
                         for sequence in sequences])
     return np.expand_dims(encoded, axis=1)
Exemple #7
0
 def get_shuffled_ref_negatives_batch(self,idx):
     """Return batch ``idx`` with dinucleotide-shuffled negatives appended.

     Sequences are fetched for the selected ``self.data`` rows (optionally
     followed by their reverse complements when
     ``self.add_revcomp == True``). Each of those sequences is then passed
     through ``dinuc_shuffle`` and the shuffled copies are appended as the
     negative set, labelled with zeros.

     Args:
         idx: batch number; selects the slice ``idx * batch_size`` to
             ``(idx + 1) * batch_size`` of ``self.indices``.

     Returns:
         ``(x_batch, y_batch)``: encoded sequences with a new axis inserted
         at position 1, and the labels of the real sequences followed by an
         equally shaped block of zeros. Characters missing from ``ltrdict``
         are encoded as ``[0, 0, 0, 0]``.
     """
     first = idx * self.batch_size
     last = (idx + 1) * self.batch_size
     rows = self.indices[first:last]

     # The first three elements of each index entry are passed to fetch.
     sequences = []
     for region in self.data.index[rows]:
         sequences.append(self.ref.fetch(region[0], region[1], region[2]))

     # Explicit comparison kept on purpose: only values equal to True
     # switch the reverse-complement augmentation on.
     augment = self.add_revcomp == True
     if augment:
         sequences += [revcomp(sequence) for sequence in sequences]

     # Shuffled copies are made after the optional augmentation, so there is
     # one shuffled negative per real sequence.
     sequences += [dinuc_shuffle(sequence) for sequence in sequences]

     # Fallback encoding for any character that ltrdict does not know.
     unknown_base = [0, 0, 0, 0]
     encoded = np.array([[ltrdict.get(base, unknown_base) for base in sequence]
                         for sequence in sequences])
     x_batch = np.expand_dims(encoded, axis=1)

     real_labels = np.asarray(self.data.iloc[rows])
     if augment:
         # Reverse-complemented sequences reuse the labels of their source.
         real_labels = np.concatenate((real_labels, real_labels), axis=0)
     # One zero label row per shuffled sequence, same shape as the real block.
     shuffled_labels = np.zeros(real_labels.shape)
     labels = np.concatenate((real_labels, shuffled_labels))
     return (x_batch, labels)
Exemple #8
0
    def get_shuffled_ref_negatives_batch(self, idx):
        """Build batch ``idx`` and pair it with shuffled negative sequences.

        The selected ``self.data`` rows provide the real sequences (via
        ``self.ref.fetch`` on their index entries) and their labels. When
        ``self.add_revcomp == True`` reverse complements are appended first.
        Afterwards every sequence in the batch is run through
        ``dinuc_shuffle``; the shuffled copies are appended and receive
        all-zero labels.

        Args:
            idx: batch number; selects the slice ``idx * batch_size`` to
                ``(idx + 1) * batch_size`` of ``self.indices``.

        Returns:
            ``(x_batch, y_batch)``: encoded sequences with a new axis
            inserted at position 1, and the labels of the real sequences
            followed by an equally shaped block of zeros. Characters missing
            from ``ltrdict`` are encoded as ``[0, 0, 0, 0]``.
        """
        rows = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        regions = self.data.index[rows]

        def fetch_region(region):
            # The first three elements of the index entry are passed to fetch.
            return self.ref.fetch(region[0], region[1], region[2])

        real = [fetch_region(region) for region in regions]

        # Explicit comparison kept on purpose: only values equal to True
        # switch the reverse-complement augmentation on.
        augment = self.add_revcomp == True
        if augment:
            real = real + [revcomp(sequence) for sequence in real]

        # One shuffled negative per real (and reverse-complemented) sequence.
        shuffled = [dinuc_shuffle(sequence) for sequence in real]
        sequences = real + shuffled

        # Fallback encoding for any character that ltrdict does not know.
        unknown_base = [0, 0, 0, 0]
        x_batch = np.expand_dims(
            np.array([[ltrdict.get(base, unknown_base) for base in sequence]
                      for sequence in sequences]),
            axis=1)

        real_labels = np.asarray(self.data.iloc[rows])
        if augment:
            # Reverse-complemented sequences reuse the labels of their source.
            real_labels = np.concatenate((real_labels, real_labels), axis=0)
        # Zero labels for the shuffled block, same shape as the real block.
        labels = np.concatenate((real_labels, np.zeros(real_labels.shape)))
        return (x_batch, labels)