def get_basic_batch(self, idx):
    """Build the one-hot input batch for the variant rows of batch ``idx``.

    Takes the slice of ``self.indices`` belonging to batch ``idx``
    (``self.batch_size`` positions), looks those rows up in ``self.data``
    and, for every row, splices the value of column ``self.allele_col``
    between two reference flanks fetched from ``self.ref``.

    The row index is read as ``(chrom, pos, ...)``.  The coordinates handed
    to ``self.ref.fetch`` are ``(pos - 1 - flank_size, pos - 1)`` for the
    left flank and ``(pos, pos + flank_size - 1)`` for the right flank.
    NOTE(review): this looks like a 1-based ``pos`` combined with 0-based,
    half-open fetch coordinates (so a single-base allele yields
    ``2 * flank_size`` bases in total) -- confirm against the reference
    reader actually used.

    Returns only the inputs (no labels): the one-hot array with a new axis
    inserted at position 1.  Characters missing from ``ltrdict`` are
    encoded as ``[0, 0, 0, 0]``.
    """
    batch_positions = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
    batch_rows = self.data.iloc[batch_positions]

    # assemble left flank + allele + right flank for every variant row
    sequences = []
    for key, record in batch_rows.iterrows():
        alt = record[self.allele_col]
        chrom, pos = key[0], key[1]
        flank = self.flank_size
        variant_start = pos - 1
        upstream = self.ref.fetch(chrom, variant_start - flank, variant_start)
        # the right flank is one base shorter than the left flank
        downstream = self.ref.fetch(chrom, pos, pos + flank - 1)
        sequences.append(upstream + alt + downstream)

    # one-hot-encode the assembled sequences
    encoded = []
    for sequence in sequences:
        encoded.append([ltrdict.get(base, [0, 0, 0, 0]) for base in sequence])

    return np.expand_dims(np.array(encoded), 1)
def get_upsampled_positives_batch(self, idx):
    """Return ``(x_batch, y_batch)`` mixing positive and negative regions.

    Batch ``idx`` takes ``self.pos_batch_size`` positions from
    ``self.pos_indices`` (rows of ``self.ones``) and ``self.neg_batch_size``
    positions from ``self.neg_indices`` (rows of ``self.zeros``).  Each
    selected index entry is read as ``(chrom, start, end, ...)`` and its
    sequence is fetched from ``self.ref``.

    Ordering of the batch is positives followed by negatives; when
    ``self.add_revcomp == True`` the reverse complements of all of those
    sequences are appended in the same order and the label block is
    repeated once so that inputs and labels stay aligned.

    ``x_batch`` is the one-hot array with a new axis inserted at position 1
    (characters missing from ``ltrdict`` become ``[0, 0, 0, 0]``);
    ``y_batch`` is the concatenation of the selected ``self.ones`` and
    ``self.zeros`` rows.
    """
    # positions of this batch inside the positive and negative tables
    pos_window = slice(idx * self.pos_batch_size, (idx + 1) * self.pos_batch_size)
    pos_rows = self.pos_indices[pos_window]
    pos_regions = self.ones.index[pos_rows]

    neg_window = slice(idx * self.neg_batch_size, (idx + 1) * self.neg_batch_size)
    neg_rows = self.neg_indices[neg_window]
    neg_regions = self.zeros.index[neg_rows]

    # fetch sequences: all positives first, then all negatives
    sequences = []
    for regions in (pos_regions, neg_regions):
        for region in regions:
            sequences.append(self.ref.fetch(region[0], region[1], region[2]))

    # explicit comparison with True is kept on purpose: a merely truthy
    # value that does not compare equal to True must not enable augmentation
    augment = self.add_revcomp == True
    if augment:
        # add in the reverse-complemented sequences for training
        sequences += [revcomp(s) for s in sequences]

    # one-hot-encode the fasta sequences
    one_hot = np.array(
        [[ltrdict.get(base, [0, 0, 0, 0]) for base in sequence] for sequence in sequences]
    )
    x_batch = np.expand_dims(one_hot, 1)

    # labels in the same order as the sequences: positives, then negatives
    labels = np.concatenate(
        (self.ones.iloc[pos_rows], self.zeros.iloc[neg_rows]), axis=0
    )
    if augment:
        # the reverse complements share the labels of their source sequences
        labels = np.concatenate((labels, labels), axis=0)

    return (x_batch, labels)
def get_upsampled_positives_batch(self, idx):
    """Return ``(x_batch, y_batch)`` mixing positive and negative regions.

    Batch ``idx`` takes ``self.pos_batch_size`` positions from
    ``self.pos_indices`` (rows of ``self.ones``) and ``self.neg_batch_size``
    positions from ``self.neg_indices`` (rows of ``self.zeros``).  Each
    selected index entry is read as ``(chrom, start, end, ...)`` and its
    sequence is fetched from ``self.ref``.

    Ordering of the batch is positives followed by negatives; when
    ``self.add_revcomp == True`` the reverse complements of all of those
    sequences are appended in the same order and the label block is
    repeated once so that inputs and labels stay aligned.

    ``x_batch`` is the one-hot array with a new axis inserted at position 1
    (characters missing from ``ltrdict`` become ``[0, 0, 0, 0]``);
    ``y_batch`` is the concatenation of the selected ``self.ones`` and
    ``self.zeros`` rows.
    """
    # positions of this batch inside the positive and negative tables
    pos_window = slice(idx * self.pos_batch_size, (idx + 1) * self.pos_batch_size)
    pos_rows = self.pos_indices[pos_window]
    pos_regions = self.ones.index[pos_rows]

    neg_window = slice(idx * self.neg_batch_size, (idx + 1) * self.neg_batch_size)
    neg_rows = self.neg_indices[neg_window]
    neg_regions = self.zeros.index[neg_rows]

    # fetch sequences: all positives first, then all negatives
    sequences = []
    for regions in (pos_regions, neg_regions):
        for region in regions:
            sequences.append(self.ref.fetch(region[0], region[1], region[2]))

    # explicit comparison with True is kept on purpose: a merely truthy
    # value that does not compare equal to True must not enable augmentation
    augment = self.add_revcomp == True
    if augment:
        # add in the reverse-complemented sequences for training
        sequences += [revcomp(s) for s in sequences]

    # one-hot-encode the fasta sequences
    one_hot = np.array(
        [[ltrdict.get(base, [0, 0, 0, 0]) for base in sequence] for sequence in sequences]
    )
    x_batch = np.expand_dims(one_hot, 1)

    # labels in the same order as the sequences: positives, then negatives
    labels = np.concatenate(
        (self.ones.iloc[pos_rows], self.zeros.iloc[neg_rows]), axis=0
    )
    if augment:
        # the reverse complements share the labels of their source sequences
        labels = np.concatenate((labels, labels), axis=0)

    return (x_batch, labels)
def get_basic_batch(self, idx):
    """Return ``(x_batch, y_batch)`` for batch number ``idx``.

    The batch covers ``self.batch_size`` positions taken from
    ``self.indices``.  The index entry of each selected ``self.data`` row is
    read as ``(chrom, start, end, ...)`` and its sequence is fetched from
    ``self.ref``; the row values themselves are used as labels.

    When ``self.add_revcomp == True`` the reverse complements of the fetched
    sequences are appended after the originals and the labels are repeated
    once to match.

    ``x_batch`` is the one-hot array with a new axis inserted at position 1
    (characters missing from ``ltrdict`` become ``[0, 0, 0, 0]``);
    ``y_batch`` is ``self.data.iloc[...]`` converted with ``np.asarray``.
    """
    window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
    row_positions = self.indices[window]

    # get sequences for the selected regions
    sequences = []
    for region in self.data.index[row_positions]:
        sequences.append(self.ref.fetch(region[0], region[1], region[2]))

    # explicit comparison with True is kept on purpose: a merely truthy
    # value that does not compare equal to True must not enable augmentation
    augment = self.add_revcomp == True
    if augment:
        # add in the reverse-complemented sequences for training
        sequences += [revcomp(s) for s in sequences]

    # one-hot-encode the fasta sequences
    encoded = []
    for sequence in sequences:
        encoded.append([ltrdict.get(base, [0, 0, 0, 0]) for base in sequence])
    x_batch = np.expand_dims(np.array(encoded), 1)

    # extract the labels at the current batch of indices
    labels = np.asarray(self.data.iloc[row_positions])
    if augment:
        # the reverse complements share the labels of their source sequences
        labels = np.concatenate((labels, labels), axis=0)

    return (x_batch, labels)
def get_basic_batch(self, idx):
    """Return ``(x_batch, y_batch)`` for batch number ``idx``.

    The batch covers ``self.batch_size`` positions taken from
    ``self.indices``.  The index entry of each selected ``self.data`` row is
    read as ``(chrom, start, end, ...)`` and its sequence is fetched from
    ``self.ref``; the row values themselves are used as labels.

    When ``self.add_revcomp == True`` the reverse complements of the fetched
    sequences are appended after the originals and the labels are repeated
    once to match.

    ``x_batch`` is the one-hot array with a new axis inserted at position 1
    (characters missing from ``ltrdict`` become ``[0, 0, 0, 0]``);
    ``y_batch`` is ``self.data.iloc[...]`` converted with ``np.asarray``.
    """
    window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
    row_positions = self.indices[window]

    # get sequences for the selected regions
    sequences = []
    for region in self.data.index[row_positions]:
        sequences.append(self.ref.fetch(region[0], region[1], region[2]))

    # explicit comparison with True is kept on purpose: a merely truthy
    # value that does not compare equal to True must not enable augmentation
    augment = self.add_revcomp == True
    if augment:
        # add in the reverse-complemented sequences for training
        sequences += [revcomp(s) for s in sequences]

    # one-hot-encode the fasta sequences
    encoded = []
    for sequence in sequences:
        encoded.append([ltrdict.get(base, [0, 0, 0, 0]) for base in sequence])
    x_batch = np.expand_dims(np.array(encoded), 1)

    # extract the labels at the current batch of indices
    labels = np.asarray(self.data.iloc[row_positions])
    if augment:
        # the reverse complements share the labels of their source sequences
        labels = np.concatenate((labels, labels), axis=0)

    return (x_batch, labels)
def get_basic_batch(self, idx):
    """Build the one-hot input batch for the variant rows of batch ``idx``.

    Takes the slice of ``self.indices`` belonging to batch ``idx``
    (``self.batch_size`` positions), looks those rows up in ``self.data``
    and, for every row, splices the value of column ``self.allele_col``
    between two reference flanks fetched from ``self.ref``.

    The row index is read as ``(chrom, pos, ...)``.  The coordinates handed
    to ``self.ref.fetch`` are ``(pos - 1 - flank_size, pos - 1)`` for the
    left flank and ``(pos, pos + flank_size - 1)`` for the right flank.
    NOTE(review): this looks like a 1-based ``pos`` combined with 0-based,
    half-open fetch coordinates (so a single-base allele yields
    ``2 * flank_size`` bases in total) -- confirm against the reference
    reader actually used.

    Returns only the inputs (no labels): the one-hot array with a new axis
    inserted at position 1.  Characters missing from ``ltrdict`` are
    encoded as ``[0, 0, 0, 0]``.
    """
    batch_positions = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
    batch_rows = self.data.iloc[batch_positions]

    # assemble left flank + allele + right flank for every variant row
    sequences = []
    for key, record in batch_rows.iterrows():
        alt = record[self.allele_col]
        chrom, pos = key[0], key[1]
        flank = self.flank_size
        variant_start = pos - 1
        upstream = self.ref.fetch(chrom, variant_start - flank, variant_start)
        # the right flank is one base shorter than the left flank
        downstream = self.ref.fetch(chrom, pos, pos + flank - 1)
        sequences.append(upstream + alt + downstream)

    # one-hot-encode the assembled sequences
    encoded = []
    for sequence in sequences:
        encoded.append([ltrdict.get(base, [0, 0, 0, 0]) for base in sequence])

    return np.expand_dims(np.array(encoded), 1)
def get_shuffled_ref_negatives_batch(self, idx):
    """Return ``(x_batch, y_batch)`` with shuffled sequences as negatives.

    The batch covers ``self.batch_size`` positions taken from
    ``self.indices``.  The index entry of each selected ``self.data`` row is
    read as ``(chrom, start, end, ...)`` and its sequence is fetched from
    ``self.ref``.  When ``self.add_revcomp == True`` the reverse complements
    are appended after the originals.  Every sequence collected so far
    (reverse complements included) is then passed through ``dinuc_shuffle``
    and the shuffled copies are appended as the negative set.

    Labels follow the same layout: the selected ``self.data`` rows, repeated
    once when reverse complements are used, followed by an equally shaped
    block of zeros (``np.zeros``) for the shuffled sequences.

    ``x_batch`` is the one-hot array with a new axis inserted at position 1;
    characters missing from ``ltrdict`` become ``[0, 0, 0, 0]``.
    """
    window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
    row_positions = self.indices[window]

    # get sequences for the selected regions
    sequences = []
    for region in self.data.index[row_positions]:
        sequences.append(self.ref.fetch(region[0], region[1], region[2]))

    # explicit comparison with True is kept on purpose: a merely truthy
    # value that does not compare equal to True must not enable augmentation
    augment = self.add_revcomp == True
    if augment:
        # add in the reverse-complemented sequences for training
        sequences += [revcomp(s) for s in sequences]

    # generate the matching negative set by dinucleotide-shuffling everything
    # gathered so far; the comprehension is fully built before being appended
    sequences += [dinuc_shuffle(s) for s in sequences]

    # one-hot-encode the fasta sequences
    encoded = []
    for sequence in sequences:
        encoded.append([ltrdict.get(base, [0, 0, 0, 0]) for base in sequence])
    x_batch = np.expand_dims(np.array(encoded), 1)

    # labels for the real sequences ...
    labels = np.asarray(self.data.iloc[row_positions])
    if augment:
        labels = np.concatenate((labels, labels), axis=0)
    # ... followed by all-zero labels for the shuffled negatives
    labels = np.concatenate((labels, np.zeros(labels.shape)))

    return (x_batch, labels)
def get_shuffled_ref_negatives_batch(self, idx):
    """Return ``(x_batch, y_batch)`` with shuffled sequences as negatives.

    The batch covers ``self.batch_size`` positions taken from
    ``self.indices``.  The index entry of each selected ``self.data`` row is
    read as ``(chrom, start, end, ...)`` and its sequence is fetched from
    ``self.ref``.  When ``self.add_revcomp == True`` the reverse complements
    are appended after the originals.  Every sequence collected so far
    (reverse complements included) is then passed through ``dinuc_shuffle``
    and the shuffled copies are appended as the negative set.

    Labels follow the same layout: the selected ``self.data`` rows, repeated
    once when reverse complements are used, followed by an equally shaped
    block of zeros (``np.zeros``) for the shuffled sequences.

    ``x_batch`` is the one-hot array with a new axis inserted at position 1;
    characters missing from ``ltrdict`` become ``[0, 0, 0, 0]``.
    """
    window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
    row_positions = self.indices[window]

    # get sequences for the selected regions
    sequences = []
    for region in self.data.index[row_positions]:
        sequences.append(self.ref.fetch(region[0], region[1], region[2]))

    # explicit comparison with True is kept on purpose: a merely truthy
    # value that does not compare equal to True must not enable augmentation
    augment = self.add_revcomp == True
    if augment:
        # add in the reverse-complemented sequences for training
        sequences += [revcomp(s) for s in sequences]

    # generate the matching negative set by dinucleotide-shuffling everything
    # gathered so far; the comprehension is fully built before being appended
    sequences += [dinuc_shuffle(s) for s in sequences]

    # one-hot-encode the fasta sequences
    encoded = []
    for sequence in sequences:
        encoded.append([ltrdict.get(base, [0, 0, 0, 0]) for base in sequence])
    x_batch = np.expand_dims(np.array(encoded), 1)

    # labels for the real sequences ...
    labels = np.asarray(self.data.iloc[row_positions])
    if augment:
        labels = np.concatenate((labels, labels), axis=0)
    # ... followed by all-zero labels for the shuffled negatives
    labels = np.concatenate((labels, np.zeros(labels.shape)))

    return (x_batch, labels)