def get_pwm(fasta, regions, window_size):
    """Count nucleotides per position in a window centred on each region.

    For every region the interval ``[middle - window_size // 2,
    middle + window_size // 2)`` around the region midpoint is fetched from
    ``fasta``.  Regions with orientation "+" contribute the fetched sequence
    as is; regions with orientation "-" contribute the reverse complement of
    the interval (shifted one base to the right when ``window_size`` is
    odd).  Regions whose window would start at or before position 0, and
    regions with any other orientation, contribute nothing.

    Keyword arguments:
    fasta -- Object with a ``fetch(chrom, start, end)`` method.
    regions -- Iterable of objects exposing ``chrom``, ``initial``, ``final``
               and ``orientation``.
    window_size -- Number of columns of the count matrix.

    Return:
    pwm -- Dictionary mapping "A", "C", "G", "T" and "N" to a list of
           ``window_size`` float counts.
    """
    pwm = {base: [0.0] * window_size for base in ("A", "C", "G", "T", "N")}
    unknown_row = pwm["N"]
    half = window_size // 2
    # Offset applied to the reverse-strand fetch: none for even windows.
    aux_plus = 0 if window_size % 2 == 0 else 1

    for region in regions:
        middle = (region.initial + region.final) // 2
        p1 = middle - half
        p2 = middle + half
        if p1 <= 0:
            continue

        # Only the strand that is actually counted is fetched.
        if region.orientation == "+":
            dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
        elif region.orientation == "-":
            dna_seq = AuxiliaryFunctions.revcomp(
                str(fasta.fetch(region.chrom, p1 + aux_plus, p2 + aux_plus)).upper())
        else:
            continue

        for position, base in enumerate(dna_seq):
            # Symbols without a row of their own (e.g. IUPAC ambiguity codes)
            # are counted as "N" instead of raising KeyError.
            row = pwm.get(base, unknown_row)
            row[position] += 1

    return pwm
def update_pwm(pwm, fasta, region, p1, p2):
    """Add the sequence of one region to a per-position nucleotide count matrix.

    For a "+" region the interval ``[p1, p2)`` is fetched and counted as is.
    For a "-" region the reverse complement is counted; the fetched interval
    is shifted one base to the right when the region length
    (``region.final - region.initial``) is odd.  Any other orientation leaves
    ``pwm`` untouched and performs no fetch.

    Keyword arguments:
    pwm -- Dictionary mapping each nucleotide symbol to a list of counts;
           updated in place.
    fasta -- Object with a ``fetch(chrom, start, end)`` method.
    region -- Object exposing ``chrom``, ``initial``, ``final`` and
              ``orientation``.
    p1, p2 -- Start and end of the interval to fetch.

    Return:
    None.
    """
    # Only the strand that is actually counted is fetched.
    if region.orientation == "+":
        dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
    elif region.orientation == "-":
        aux_plus = 0 if (region.final - region.initial) % 2 == 0 else 1
        dna_seq = AuxiliaryFunctions.revcomp(
            str(fasta.fetch(region.chrom, p1 + aux_plus, p2 + aux_plus)).upper())
    else:
        return

    for position, base in enumerate(dna_seq):
        pwm[base][position] += 1
def update_pwm(pwm, fasta, region, p1, p2):
    """Add the sequence of one region to a per-position nucleotide count matrix.

    A "+" region contributes the sequence of ``[p1, p2)`` unchanged.  A "-"
    region contributes the reverse complement of the interval, which is
    shifted by one base to the right when ``region.final - region.initial``
    is odd.  Regions with any other orientation are ignored without
    touching ``fasta``.

    Keyword arguments:
    pwm -- Dictionary mapping each nucleotide symbol to a list of counts;
           updated in place.
    fasta -- Object with a ``fetch(chrom, start, end)`` method.
    region -- Object exposing ``chrom``, ``initial``, ``final`` and
              ``orientation``.
    p1, p2 -- Start and end of the interval to fetch.

    Return:
    None.
    """
    orientation = region.orientation
    if orientation not in ("+", "-"):
        return

    if orientation == "+":
        dna_seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
    else:
        # Odd-length regions read the reverse strand one base further right.
        aux_plus = 0 if (region.final - region.initial) % 2 == 0 else 1
        dna_seq = AuxiliaryFunctions.revcomp(
            str(fasta.fetch(region.chrom, p1 + aux_plus, p2 + aux_plus)).upper())

    for position, base in enumerate(dna_seq):
        pwm[base][position] += 1
def get_bc_signal_by_fragment_length(self, ref, start, end, bam, fasta, bias_table, forward_shift,
                                     reverse_shift, min_length=None, max_length=None, strand=True):
    """Compute the bias-corrected cut-site signal of ``ref:start-end``.

    Cut sites are ``read.pos + forward_shift`` for forward reads and
    ``read.aend + reverse_shift - 1`` for reverse reads.  Raw counts are
    smoothed with a 50 bp sliding sum and rescaled by the k-mer bias of
    each position relative to the 50 bp sliding sum of the bias values.

    Keyword arguments:
    ref, start, end -- Region to process.
    bam -- Object with a ``fetch(ref, start, end)`` method yielding reads.
    fasta -- Object with a ``fetch(ref, start, end)`` method.
    bias_table -- Pair (forward table, reverse table) mapping k-mers to
                  bias values; k-mers missing from a table count as 1.0.
    forward_shift, reverse_shift -- Offsets applied to the read ends.
    min_length, max_length -- Optional fragment length filter; a read is
                  kept when ``min_length < abs(template_length) <=
                  max_length`` (either bound may be None).
    strand -- If True return the forward and reverse signals separately,
              otherwise their sum.

    Return:
    ``(forward, reverse)`` numpy arrays when ``strand`` is True, otherwise
    a single numpy array.  When the extended window would start before the
    beginning of the chromosome, the raw (uncorrected, not length-filtered)
    counts of ``[start, end)`` are returned in the same shape.
    """
    # Parameters
    window = 50
    half_window = window // 2
    default_kmer_value = 1.0

    # Initialization
    f_bias_dict = bias_table[0]
    r_bias_dict = bias_table[1]
    k_nb = len(next(iter(f_bias_dict)))
    half_k = k_nb // 2
    p1 = start
    p2 = end
    p1_w = p1 - half_window
    p2_w = p2 + half_window
    p1_wk = p1_w - half_k
    p2_wk = p2_w + half_k

    filter_by_length = min_length is not None or max_length is not None

    def length_ok(read):
        # min_length is an exclusive bound, max_length an inclusive one.
        length = abs(read.template_length)
        if min_length is not None and length <= min_length:
            return False
        if max_length is not None and length > max_length:
            return False
        return True

    def count_cuts(lo, hi, use_length_filter):
        # Per-strand cut-site counts inside [lo, hi).
        forward = [0.0] * (hi - lo)
        reverse = [0.0] * (hi - lo)
        for read in bam.fetch(ref, lo, hi):
            # Unmapped reads have no usable alignment end.
            if read.is_unmapped:
                continue
            if use_length_filter and not length_ok(read):
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                target = forward
            else:
                cut_site = read.aend + reverse_shift - 1
                target = reverse
            if lo <= cut_site < hi:
                target[cut_site - lo] += 1.0
        return forward, reverse

    def window_sums(values):
        # Sliding sum over `window` consecutive values, one per centre position.
        if not values:
            return []
        sums = []
        total = sum(values[:window])
        last = values[0]
        for i in range(half_window, len(values) - half_window):
            sums.append(total)
            total -= last
            total += values[i + half_window]
            last = values[i - half_window + 1]
        return sums

    if p1 <= 0 or p1_w <= 0 or p1_wk < 0 or p2_wk <= 0:
        # Too close to the chromosome start for the extended window:
        # return raw counts, shaped like the regular result.
        raw_f, raw_r = count_cuts(p1, p2, False)
        if strand:
            return np.array(raw_f), np.array(raw_r)
        return np.add(np.array(raw_f), np.array(raw_r))

    curr_str = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
    curr_rev_comp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())
    seq_len = len(curr_str)

    # Iterating on sequence to create the bias signal
    signal_bias_f = []
    signal_bias_r = []
    for i in range(half_k, seq_len - half_k + 1):
        fseq = curr_str[i - half_k:i + half_k]
        rseq = curr_rev_comp[seq_len - half_k - i:seq_len + half_k - i]
        signal_bias_f.append(f_bias_dict.get(fseq, default_kmer_value))
        signal_bias_r.append(r_bias_dict.get(rseq, default_kmer_value))

    # Raw and smoothed counts
    raw_f, raw_r = count_cuts(p1_w, p2_w, filter_by_length)
    smoothed_f = window_sums(raw_f)
    smoothed_r = window_sums(raw_r)

    # Bias correction: smoothed count times the share of the local bias
    bias_sums_f = window_sums(signal_bias_f)
    bias_sums_r = window_sums(signal_bias_r)
    bc_f = [count * (signal_bias_f[j + half_window] / total)
            for j, (count, total) in enumerate(zip(smoothed_f, bias_sums_f))]
    bc_r = [count * (signal_bias_r[j + half_window] / total)
            for j, (count, total) in enumerate(zip(smoothed_r, bias_sums_r))]

    if strand:
        return np.array(bc_f), np.array(bc_r)
    return np.add(np.array(bc_f), np.array(bc_r))
def print_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                 initial_clip=1000, per_norm=98, per_slope=98, bias_table=None, genome_file_name=None,
                 raw_signal_file=None, bc_signal_file=None, norm_signal_file=None, strand_specific=False):
    """Append the raw, bias-corrected and/or normalised signal of a region to wig files.

    Each requested output is appended as a ``fixedStep`` track starting at
    ``start + 1``.  Outputs whose file name is None are skipped; when no
    file name is given at all the method does nothing.

    Keyword arguments:
    ref, start, end -- Region to process.
    downstream_ext, upstream_ext -- Extensions handed to PileupRegion for
                  the raw signal.
    forward_shift, reverse_shift -- Offsets applied to the read ends.
    initial_clip -- Upper bound applied to every raw signal value.
    per_norm, per_slope -- Currently unused; the normalisation percentile
                  is fixed at 98.
    bias_table -- Pair (forward table, reverse table) mapping k-mers to
                  bias values; needed for the corrected/normalised output.
    genome_file_name -- Fasta file to read sequences from; needed for the
                  corrected/normalised output.
    raw_signal_file, bc_signal_file, norm_signal_file -- Output file names.
    strand_specific -- Also write ``<prefix>_Forward`` / ``<prefix>_Reverse``
                  tracks, where ``<prefix>`` is the text before the first
                  "." of ``bc_signal_file`` (of ``norm_signal_file`` when
                  no ``bc_signal_file`` was given).

    Return:
    None.
    """

    def append_track(path, values):
        # One fixedStep block per call; NaNs are written as 0.
        with open(path, "a") as handle:
            handle.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                [str(e) for e in nan_to_num(values)]) + "\n")

    def normalise(values):
        # Boyle normalisation followed by Hon normalisation.
        boyle = self.boyle_norm(values)
        perc = scoreatpercentile(boyle, 98)
        std = np.std(boyle)
        return self.hon_norm_atac(boyle, perc, std)

    if raw_signal_file:
        pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
        if ps_version == "0.7.5":
            self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
        else:
            for alignment in self.bam.fetch(reference=ref, start=start, end=end):
                pileup_region(alignment)
        raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])
        append_track(raw_signal_file, raw_signal)

    if not (bc_signal_file or norm_signal_file):
        return

    # Parameters
    window = 50
    half_window = window // 2
    default_kmer_value = 1.0

    def window_sums(values):
        # Sliding sum over `window` consecutive values, one per centre position.
        if not values:
            return []
        sums = []
        total = sum(values[:window])
        last = values[0]
        for i in range(half_window, len(values) - half_window):
            sums.append(total)
            total -= last
            total += values[i + half_window]
            last = values[i - half_window + 1]
        return sums

    # Initialization
    f_bias_dict = bias_table[0]
    r_bias_dict = bias_table[1]
    k_nb = len(next(iter(f_bias_dict)))
    half_k = k_nb // 2
    p1_w = start - half_window
    p2_w = end + half_window
    p1_wk = p1_w - half_k
    p2_wk = p2_w + half_k

    fasta = Fastafile(genome_file_name)
    try:
        curr_str = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
        curr_rev_comp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())
    finally:
        fasta.close()
    seq_len = len(curr_str)

    # Iterating on sequence to create the bias signal
    signal_bias_f = []
    signal_bias_r = []
    for i in range(half_k, seq_len - half_k + 1):
        fseq = curr_str[i - half_k:i + half_k]
        rseq = curr_rev_comp[seq_len - half_k - i:seq_len + half_k - i]
        signal_bias_f.append(f_bias_dict.get(fseq, default_kmer_value))
        signal_bias_r.append(r_bias_dict.get(rseq, default_kmer_value))

    # Raw counts
    signal_raw_f = [0.0] * (p2_w - p1_w)
    signal_raw_r = [0.0] * (p2_w - p1_w)
    for read in self.bam.fetch(ref, p1_w, p2_w):
        # Unmapped reads have no usable alignment end.
        if read.is_unmapped:
            continue
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            target = signal_raw_f
        else:
            cut_site = read.aend + reverse_shift - 1
            target = signal_raw_r
        if p1_w <= cut_site < p2_w:
            target[cut_site - p1_w] += 1.0

    # Smoothed counts
    smoothed_f = window_sums(signal_raw_f)
    smoothed_r = window_sums(signal_raw_r)

    # Bias correction: smoothed count times the share of the local bias
    bias_sums_f = window_sums(signal_bias_f)
    bias_sums_r = window_sums(signal_bias_r)
    signal_bc_f = [count * (signal_bias_f[j + half_window] / total)
                   for j, (count, total) in enumerate(zip(smoothed_f, bias_sums_f))]
    signal_bc_r = [count * (signal_bias_r[j + half_window] / total)
                   for j, (count, total) in enumerate(zip(smoothed_r, bias_sums_r))]
    signal_bc = [nhatf + nhatr for nhatf, nhatr in zip(signal_bc_f, signal_bc_r)]

    if bc_signal_file:
        append_track(bc_signal_file, signal_bc)
        if strand_specific:
            prefix = bc_signal_file.split(".")[0]
            append_track(prefix + "_Forward" + ".bc.wig", signal_bc_f)
            append_track(prefix + "_Reverse" + ".bc.wig", signal_bc_r)

    if norm_signal_file:
        append_track(norm_signal_file, normalise(signal_bc))
        if strand_specific:
            # The stranded file names follow the bias-corrected output when
            # there is one; otherwise fall back to the normalised file name
            # instead of failing on None.
            prefix = (bc_signal_file or norm_signal_file).split(".")[0]
            signal_norm_f = normalise(signal_bc_f)
            signal_norm_r = normalise(signal_bc_r)
            append_track(prefix + "_Forward" + ".norm.wig", signal_norm_f)
            append_track(prefix + "_Reverse" + ".norm.wig", signal_norm_r)
def bias_correction_atac(self, bias_table, genome_file_name, chrName, start, end, forward_shift, reverse_shift):
    """Compute the strand-specific bias-corrected cut-site signal of a region.

    Cut sites are ``read.pos + forward_shift`` for forward reads and
    ``read.aend + reverse_shift - 1`` for reverse reads, read from
    ``self.bam``.  Raw counts are smoothed with a 50 bp sliding sum and
    rescaled by the k-mer bias of each position relative to the 50 bp
    sliding sum of the bias values.

    Keyword arguments:
    bias_table -- Pair (forward table, reverse table) mapping k-mers to
                  bias values; k-mers missing from a table count as 1.0.
    genome_file_name -- Fasta file to read sequences from.
    chrName, start, end -- Region to process.
    forward_shift, reverse_shift -- Offsets applied to the read ends.

    Return:
    ``(forward, reverse)`` lists of floats.  When the extended window would
    start before the beginning of the chromosome, the raw counts of
    ``[start, end)`` are returned instead and the genome file is not opened.
    """
    # Parameters
    window = 50
    half_window = window // 2
    default_kmer_value = 1.0

    # Initialization
    f_bias_dict = bias_table[0]
    r_bias_dict = bias_table[1]
    k_nb = len(next(iter(f_bias_dict)))
    k_floor = k_nb // 2
    k_ceil = k_nb - k_floor
    p1 = start
    p2 = end
    p1_w = p1 - half_window
    p2_w = p2 + half_window
    p1_wk = p1_w - k_floor
    p2_wk = p2_w + k_ceil

    def count_cuts(lo, hi):
        # Per-strand cut-site counts inside [lo, hi).
        forward = [0.0] * (hi - lo)
        reverse = [0.0] * (hi - lo)
        for read in self.bam.fetch(chrName, lo, hi):
            # Unmapped reads have no usable alignment end.
            if read.is_unmapped:
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                target = forward
            else:
                cut_site = read.aend + reverse_shift - 1
                target = reverse
            if lo <= cut_site < hi:
                target[cut_site - lo] += 1.0
        return forward, reverse

    def window_sums(values):
        # Sliding sum over `window` consecutive values, one per centre position.
        if not values:
            return []
        sums = []
        total = sum(values[:window])
        last = values[0]
        for i in range(half_window, len(values) - half_window):
            sums.append(total)
            total -= last
            total += values[i + half_window]
            last = values[i - half_window + 1]
        return sums

    if p1 <= 0 or p1_w <= 0 or p1_wk < 0 or p2_wk <= 0:
        # Return raw counts
        return count_cuts(p1, p2)

    # Raw and smoothed counts
    nf, nr = count_cuts(p1_w, p2_w)
    smoothed_f = window_sums(nf)
    smoothed_r = window_sums(nr)

    # Fetching sequence; the handle is released even if a fetch fails
    fasta_file = Fastafile(genome_file_name)
    try:
        curr_str = str(fasta_file.fetch(chrName, p1_wk, p2_wk - 1)).upper()
        curr_rev_comp = AuxiliaryFunctions.revcomp(str(fasta_file.fetch(chrName, p1_wk + 1, p2_wk)).upper())
    finally:
        fasta_file.close()
    seq_len = len(curr_str)

    # Iterating on sequence to create the bias signal
    af = []
    ar = []
    for i in range(k_ceil, seq_len - k_floor + 1):
        fseq = curr_str[i - k_floor:i + k_ceil]
        rseq = curr_rev_comp[seq_len - k_ceil - i:seq_len + k_floor - i]
        af.append(f_bias_dict.get(fseq, default_kmer_value))
        ar.append(r_bias_dict.get(rseq, default_kmer_value))

    # Bias correction: smoothed count times the share of the local bias
    bias_sums_f = window_sums(af)
    bias_sums_r = window_sums(ar)
    bias_corrected_signal_forward = [count * (af[j + half_window] / total)
                                     for j, (count, total) in enumerate(zip(smoothed_f, bias_sums_f))]
    bias_corrected_signal_reverse = [count * (ar[j + half_window] / total)
                                     for j, (count, total) in enumerate(zip(smoothed_r, bias_sums_r))]

    return bias_corrected_signal_forward, bias_corrected_signal_reverse
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name, forward_shift, reverse_shift):
    """Compute the bias-corrected cut-site signal of ``chrom:start-end``.

    Cut sites are ``read.pos + forward_shift`` for forward reads and
    ``read.aend + reverse_shift - 1`` for reverse reads; unmapped reads are
    ignored.  Raw counts are smoothed with a 50 bp sliding sum and rescaled
    by the k-mer bias of each position relative to the 50 bp sliding sum of
    the bias values; the forward and reverse results are added.

    Keyword arguments:
    chrom, start, end -- Region to process.
    bam -- Object with a ``fetch(chrom, start, end)`` method yielding reads.
    bias_table -- Pair (forward table, reverse table) mapping k-mers to
                  bias values; k-mers missing from a table count as 1.0.
    genome_file_name -- Fasta file to read sequences from.
    forward_shift, reverse_shift -- Offsets applied to the read ends.

    Return:
    List of floats, one per position.  When the extended window would start
    at or before the beginning of the chromosome, the raw counts of
    ``[start, end)`` are returned instead and the genome file is not opened.
    """
    # Parameters
    window = 50
    half_window = window // 2
    default_kmer_value = 1.0

    # Initialization
    f_bias_dict = bias_table[0]
    r_bias_dict = bias_table[1]
    k_nb = len(next(iter(f_bias_dict)))
    k_floor = k_nb // 2
    k_ceil = k_nb - k_floor
    p1 = start
    p2 = end
    p1_w = p1 - half_window
    p2_w = p2 + half_window
    p1_wk = p1_w - k_floor
    p2_wk = p2_w + k_ceil

    def count_cuts(lo, hi):
        # Per-strand cut-site counts inside [lo, hi).
        forward = [0.0] * (hi - lo)
        reverse = [0.0] * (hi - lo)
        for read in bam.fetch(chrom, lo, hi):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                target = forward
            else:
                cut_site = read.aend + reverse_shift - 1
                target = reverse
            if lo <= cut_site < hi:
                target[cut_site - lo] += 1.0
        return forward, reverse

    def window_sums(values):
        # Sliding sum over `window` consecutive values, one per centre position.
        if not values:
            return []
        sums = []
        total = sum(values[:window])
        last = values[0]
        for i in range(half_window, len(values) - half_window):
            sums.append(total)
            total -= last
            total += values[i + half_window]
            last = values[i - half_window + 1]
        return sums

    if p1 <= 0 or p1_w <= 0 or p1_wk <= 0 or p2_wk <= 0:
        # Return raw counts (both strands combined)
        raw_f, raw_r = count_cuts(p1, p2)
        return [f + r for f, r in zip(raw_f, raw_r)]

    # Raw and smoothed counts
    nf, nr = count_cuts(p1_w, p2_w)
    smoothed_f = window_sums(nf)
    smoothed_r = window_sums(nr)

    # Fetching sequence; the genome file is opened only when it is needed
    # and released even if a fetch fails
    fasta_file = Fastafile(genome_file_name)
    try:
        curr_str = str(fasta_file.fetch(chrom, p1_wk, p2_wk - 1)).upper()
        curr_rev_comp = AuxiliaryFunctions.revcomp(
            str(fasta_file.fetch(chrom, p1_wk + 1, p2_wk)).upper())
    finally:
        fasta_file.close()
    seq_len = len(curr_str)

    # Iterating on sequence to create the bias signal
    af = []
    ar = []
    for i in range(k_ceil, seq_len - k_floor + 1):
        fseq = curr_str[i - k_floor:i + k_ceil]
        rseq = curr_rev_comp[seq_len - k_ceil - i:seq_len + k_floor - i]
        af.append(f_bias_dict.get(fseq, default_kmer_value))
        ar.append(r_bias_dict.get(rseq, default_kmer_value))

    # Bias correction: smoothed count times the share of the local bias
    bias_sums_f = window_sums(af)
    bias_sums_r = window_sums(ar)
    bc_signal = []
    for j in range(min(len(bias_sums_f), len(bias_sums_r))):
        nhatf = smoothed_f[j] * (af[j + half_window] / bias_sums_f[j])
        nhatr = smoothed_r[j] * (ar[j + half_window] / bias_sums_r[j])
        bc_signal.append(nhatf + nhatr)

    return bc_signal
def load_gene_list(self, file_name, filter_havana=True, protein_coding=False, known_only=False):
    """Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

    *Keyword arguments:*

        - file_name -- The gencode .gtf file name.
        - filter_havana -- Skip records whose source column is "HAVANA".
        - protein_coding -- Keep only records whose gene_type (and
          transcript_type, when present) is "protein_coding".
        - known_only -- Keep only records whose gene_status (and
          transcript_status, when present) is "KNOWN".

    Each appended entry is a list: the GenomicRegion, the source, feature
    and frame columns, the nine attribute values listed in ``addt_keys``
    (None when missing) and two empty lists.  If the file cannot be opened
    an error is printed and the program exits with status 1.
    """
    addt_keys = ("gene_id", "transcript_id", "gene_type", "gene_status", "gene_name",
                 "transcript_type", "transcript_status", "transcript_name", "level")

    # Opening GTF file
    try:
        gtf_file = open(file_name, "r")
    except (IOError, OSError):
        print("Error: Cannot find the annotation file: "+file_name)
        print("Please check the path in ~/rgtdata/data.config")
        sys.exit(1)

    # Reading GTF file; the handle is released even if parsing fails
    with gtf_file:
        for line in gtf_file:

            # Processing line (blank lines and comments are skipped)
            line = line.strip()
            if not line or line[0] == "#":
                continue
            line_list = line.split("\t")
            if filter_havana and len(line_list) > 1 and line_list[1] == "HAVANA":
                continue

            # Processing additional list of options
            addt_dict = dict()
            for addt_element in line_list[8].split(";"):
                addt_element_list = [token for token in addt_element.split(" ") if token]
                if len(addt_element_list) < 2:
                    # Empty or value-less attribute
                    continue
                # Removing " symbol from string options
                addt_dict[addt_element_list[0]] = addt_element_list[1].replace("\"", "")

            # filter non-protein-coding sequences, if required
            if protein_coding:
                if addt_dict.get("gene_type") != "protein_coding":
                    continue
                if "transcript_type" in addt_dict and addt_dict["transcript_type"] != "protein_coding":
                    continue

            # filter unknown sequences, if required
            if known_only:
                if addt_dict.get("gene_status") != "KNOWN":
                    continue
                if "transcript_status" in addt_dict and addt_dict["transcript_status"] != "KNOWN":
                    continue

            # Removing dot from IDs
            addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
            if "transcript_id" in addt_dict:
                addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

            # Creating final version of additional arguments
            final_addt_list = [addt_dict.get(addt_key) for addt_key in addt_keys]

            # Handling score
            current_score = 0
            if AuxiliaryFunctions.string_is_int(line_list[5]):
                current_score = AuxiliaryFunctions.correct_standard_bed_score(line_list[5])

            # Creating GenomicRegion (GTF starts are 1-based, hence the -1)
            genomic_region = GenomicRegion(chrom=line_list[0], initial=int(line_list[3])-1,
                                           final=int(line_list[4]), orientation=line_list[6],
                                           data=current_score)

            # Creating final vector
            extra_index_elements = [[], []]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
            final_vector = [genomic_region, line_list[1], line_list[2], line_list[7]] \
                + final_addt_list + extra_index_elements
            self.gene_list.append(final_vector)
def load_gene_list(self, file_name, filter_havana=True):
    """
    Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

    Keyword arguments:
    file_name -- The gencode .gtf file name.
    filter_havana -- Skip records whose source column is "HAVANA".

    Each appended entry is a list: the GenomicRegion, the source, feature
    and frame columns, the nine attribute values listed in ``addt_keys``
    (None when missing) and two empty lists.  An error raised while opening
    the file is propagated to the caller.

    Return:
    void.
    """
    addt_keys = ("gene_id", "transcript_id", "gene_type", "gene_status", "gene_name",
                 "transcript_type", "transcript_status", "transcript_name", "level")

    # Opening and reading GTF file; the handle is released even if parsing fails
    with open(file_name, "r") as gtf_file:
        for line in gtf_file:

            # Processing line (blank lines and comments are skipped)
            line = line.strip()
            if not line or line[0] == "#":
                continue
            line_list = line.split("\t")
            if filter_havana and line_list[1] == "HAVANA":
                continue

            # Processing additional list of options
            addt_dict = dict()
            for addt_element in line_list[8].split(";"):
                addt_element_list = [token for token in addt_element.split(" ") if token]
                if len(addt_element_list) < 2:
                    # Empty or value-less attribute
                    continue
                # Removing " symbol from string options
                addt_dict[addt_element_list[0]] = addt_element_list[1].replace("\"", "")

            # Removing dot from IDs (not every record carries a transcript_id)
            addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
            if "transcript_id" in addt_dict:
                addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

            # Creating final version of additional arguments
            final_addt_list = [addt_dict.get(addt_key) for addt_key in addt_keys]

            # Handling score
            current_score = 0
            if AuxiliaryFunctions.string_is_int(line_list[5]):
                current_score = AuxiliaryFunctions.correct_standard_bed_score(line_list[5])

            # Creating GenomicRegion (GTF starts are 1-based, hence the -1)
            genomic_region = GenomicRegion(chrom=line_list[0],
                                           initial=int(line_list[3]) - 1,
                                           final=int(line_list[4]),
                                           orientation=line_list[6],
                                           data=current_score)

            # Creating final vector
            extra_index_elements = [[], []]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
            final_vector = [genomic_region, line_list[1], line_list[2], line_list[7]] \
                + final_addt_list + extra_index_elements
            self.gene_list.append(final_vector)
def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, forward_shift, reverse_shift):
    """
    Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

    For each k-mer the bias is the ratio between its observed frequency
    around read cut sites and its expected frequency in the region
    sequences (both with a pseudocount of 1), rounded to 6 decimals.  The
    bias of a strand is 1 for every k-mer when no read of that strand, or
    no expected k-mer at all, was counted.

    Keyword arguments:
    regions -- DNase-seq HS regions.
    dnase_file_name -- DNase-seq file name (must end in ".bam").
    genome_file_name -- Genome to fetch genomic sequences from.
    k_nb -- k-mer length.
    forward_shift -- Offset applied to the start of forward reads.
    reverse_shift -- Offset applied to the end of reverse reads.

    Return:
    [bias_table_F, bias_table_R] -- Bias tables, or None when
    dnase_file_name does not have a BAM extension.
    """
    # Parameters
    max_duplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    if dnase_file_name.split(".")[-1].upper() != "BAM":
        return None  # TODO ERROR

    # Initializing dictionaries
    obs_dict_f = dict()
    obs_dict_r = dict()
    exp_dict_f = dict()
    exp_dict_r = dict()
    ct_reads_f = 0
    ct_reads_r = 0
    ct_kmers = 0
    half_k = k_nb // 2

    bam_file = Samfile(dnase_file_name, "rb")
    fasta_file = None
    try:
        fasta_file = Fastafile(genome_file_name)

        # Iterating on HS regions
        for region in regions:

            # Initialization
            prev_pos = -1
            duplicate_run = 0

            # Evaluating observed frequencies
            for r in bam_file.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if not r.is_reverse:
                    cut_site = r.pos + forward_shift - 1
                else:
                    cut_site = r.aend + reverse_shift + 1
                p1 = cut_site - half_k
                p2 = p1 + k_nb

                # Verifying PCR artifacts: at most max_duplicates + 1
                # consecutive reads may share the same k-mer start
                if p1 == prev_pos:
                    duplicate_run += 1
                else:
                    prev_pos = p1
                    duplicate_run = 0
                if duplicate_run > max_duplicates:
                    continue

                # Fetching k-mer (reads whose k-mer cannot be fetched are skipped)
                try:
                    kmer = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in dictionary
                if r.is_reverse:
                    kmer = AuxiliaryFunctions.revcomp(kmer)
                    ct_reads_r += 1
                    obs_dict_r[kmer] = obs_dict_r.get(kmer, 0) + 1
                else:
                    ct_reads_f += 1
                    obs_dict_f[kmer] = obs_dict_f.get(kmer, 0) + 1

            # Evaluating expected frequencies
            try:
                region_seq = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue
            region_rev_comp = AuxiliaryFunctions.revcomp(region_seq)

            # Iterating on each sequence position
            # NOTE(review): this range leaves out the k-mer starting at
            # len(region_seq) - k_nb; kept as is so existing tables stay
            # reproducible.
            for i in range(len(region_seq) - k_nb):
                ct_kmers += 1
                s = region_seq[i:i + k_nb]
                exp_dict_f[s] = exp_dict_f.get(s, 0) + 1
                s = region_rev_comp[i:i + k_nb]
                exp_dict_r[s] = exp_dict_r.get(s, 0) + 1
    finally:
        # Closing files
        bam_file.close()
        if fasta_file is not None:
            fasta_file.close()

    def bias_of(kmer, obs_dict, exp_dict, ct_reads):
        # Neutral bias when a frequency cannot be computed.
        if ct_reads == 0 or ct_kmers == 0:
            return 1
        obs = obs_dict.get(kmer, 0) + pseudocount
        exp = exp_dict.get(kmer, 0) + pseudocount
        return round(float(obs / ct_reads) / float(exp / ct_kmers), 6)

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = {kmer: bias_of(kmer, obs_dict_f, exp_dict_f, ct_reads_f) for kmer in kmer_comb}
    bias_table_R = {kmer: bias_of(kmer, obs_dict_r, exp_dict_r, ct_reads_r) for kmer in kmer_comb}

    # Return
    return [bias_table_F, bias_table_R]
def estimate_table_pwm(self, regions, dnase_file_name, genome_file_name, k_nb, forward_shift, reverse_shift):
    """
    Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

    Observed k-mers (around read cut sites) and expected k-mers (every
    position of the region sequences) are collected per strand, turned
    into motifs, written as logos and pwm files below
    ``self.output_loc/Bias`` and used to score every possible k-mer.  The
    bias of a k-mer is its observed score divided by its expected score,
    rounded to 6 decimals.  Observed k-mers containing "N", and regions
    whose sequence contains "N", are left out.

    Keyword arguments:
    regions -- DNase-seq HS regions.
    dnase_file_name -- DNase-seq file name (must end in ".bam").
    genome_file_name -- Genome to fetch genomic sequences from.
    k_nb -- k-mer length.
    forward_shift -- Offset applied to the start of forward reads; also
                     used in every output file name.
    reverse_shift -- Offset applied to the end of reverse reads.

    Return:
    [bias_table_F, bias_table_R] -- Bias tables, or None when
    dnase_file_name does not have a BAM extension.
    """
    # Initializing bam and fasta
    if dnase_file_name.split(".")[-1].upper() != "BAM":
        return None  # TODO ERROR

    half_k = k_nb // 2
    obs_seqs_f = []
    obs_seqs_r = []
    exp_seqs_f = []
    exp_seqs_r = []

    bam_file = Samfile(dnase_file_name, "rb")
    fasta_file = None
    try:
        fasta_file = Fastafile(genome_file_name)

        # Iterating on HS regions
        for region in regions:

            # Evaluating observed frequencies
            for r in bam_file.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if not r.is_reverse:
                    cut_site = r.pos + forward_shift - 1
                else:
                    cut_site = r.aend + reverse_shift + 1
                p1 = cut_site - half_k
                p2 = p1 + k_nb

                # Fetching k-mer (reads whose k-mer cannot be fetched are skipped)
                try:
                    kmer = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue
                if r.is_reverse:
                    kmer = AuxiliaryFunctions.revcomp(kmer)

                # Collecting k-mer
                if 'N' not in kmer:
                    if not r.is_reverse:
                        obs_seqs_f.append(Seq(kmer))
                    else:
                        obs_seqs_r.append(Seq(kmer))

            # Evaluating expected frequencies
            try:
                region_seq = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue

            # The N test concerns the whole region sequence, so it is
            # evaluated once per region instead of once per k-mer.
            if 'N' in region_seq:
                continue
            region_rev_comp = AuxiliaryFunctions.revcomp(region_seq)

            # Iterating on each sequence position
            # NOTE(review): this range leaves out the k-mer starting at
            # len(region_seq) - k_nb; kept as is.
            for i in range(len(region_seq) - k_nb):
                exp_seqs_f.append(Seq(region_seq[i:i + k_nb]))
                exp_seqs_r.append(Seq(region_rev_comp[i:i + k_nb]))
    finally:
        # Closing files
        bam_file.close()
        if fasta_file is not None:
            fasta_file.close()

    obs_motifs_f = motifs.create(obs_seqs_f)
    obs_motifs_r = motifs.create(obs_seqs_r)
    exp_motifs_f = motifs.create(exp_seqs_f)
    exp_motifs_r = motifs.create(exp_seqs_r)

    obs_pwm_f = obs_motifs_f.pwm
    obs_pwm_r = obs_motifs_r.pwm
    exp_pwm_f = exp_motifs_f.pwm
    exp_pwm_r = exp_motifs_r.pwm

    def output_path(folder, kind, strand_tag, extension):
        # e.g. <output_loc>/Bias/logo/obs_<k_nb>_<forward_shift>_f.pdf
        file_name = "{}_{}_{}_{}.{}".format(kind, str(k_nb), str(forward_shift), strand_tag, extension)
        return os.path.join(self.output_loc, "Bias", folder, file_name)

    # Output logos (observed logos use a ten times larger y scale)
    for motif, kind, strand_tag, scale, tic in (
            (obs_motifs_f, "obs", "f", 0.2, 0.1),
            (obs_motifs_r, "obs", "r", 0.2, 0.1),
            (exp_motifs_f, "exp", "f", 0.02, 0.01),
            (exp_motifs_r, "exp", "r", 0.02, 0.01)):
        motif.weblogo(output_path("logo", kind, strand_tag, "pdf"), format="pdf", stack_width="large",
                      color_scheme="color_classic", yaxis_scale=scale, yaxis_tic_interval=tic)

    # Output pwms
    for pwm_data, kind, strand_tag in (
            (obs_pwm_f, "obs", "f"),
            (obs_pwm_r, "obs", "r"),
            (exp_pwm_f, "exp", "f"),
            (exp_pwm_r, "exp", "r")):
        with open(output_path("pwm", kind, strand_tag, "pwm"), "w") as f:
            f.write(str(pwm_data))

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = self.get_pwm_score(k_mer, obs_pwm_f, k_nb)
        exp_f = self.get_pwm_score(k_mer, exp_pwm_f, k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = self.get_pwm_score(k_mer, obs_pwm_r, k_nb)
        exp_r = self.get_pwm_score(k_mer, exp_pwm_r, k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    # Return
    return [bias_table_F, bias_table_R]
def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
    """
    Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

    For each k-mer the bias is the ratio between its observed frequency
    around read ends and its expected frequency in the region sequences
    (both with a pseudocount of 1), rounded to 6 decimals.  The bias of a
    strand is 1 for every k-mer when no read of that strand, or no
    expected k-mer at all, was counted.

    Keyword arguments:
    regions -- DNase-seq HS regions.
    dnase_file_name -- DNase-seq file name (must end in ".bam").
    genome_file_name -- Genome to fetch genomic sequences from.
    k_nb -- k-mer length.
    shift -- Offset added to the start of forward reads and subtracted
             from the end of reverse reads.

    Return:
    [bias_table_F, bias_table_R] -- Bias tables, or None when
    dnase_file_name does not have a BAM extension.
    """
    # Parameters
    max_duplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    if dnase_file_name.split(".")[-1].upper() != "BAM":
        return None  # TODO ERROR

    # Initializing dictionaries
    obs_dict_f = dict()
    obs_dict_r = dict()
    exp_dict_f = dict()
    exp_dict_r = dict()
    ct_reads_f = 0
    ct_reads_r = 0
    ct_kmers = 0
    # Integer half of the k-mer length (plain "/" yields a float on Python 3)
    half_k = k_nb // 2

    bam_file = Samfile(dnase_file_name, "rb")
    fasta_file = None
    try:
        fasta_file = Fastafile(genome_file_name)

        # Iterating on HS regions
        for region in regions:

            # Initialization
            prev_pos = -1
            duplicate_run = 0

            # Evaluating observed frequencies
            for r in bam_file.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if not r.is_reverse:
                    p1 = r.pos - half_k - 1 + shift
                else:
                    p1 = r.aend - half_k + 1 - shift
                p2 = p1 + k_nb

                # Verifying PCR artifacts: at most max_duplicates + 1
                # consecutive reads may share the same k-mer start
                if p1 == prev_pos:
                    duplicate_run += 1
                else:
                    prev_pos = p1
                    duplicate_run = 0
                if duplicate_run > max_duplicates:
                    continue

                # Fetching k-mer (reads whose k-mer cannot be fetched are skipped)
                try:
                    kmer = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in dictionary; each strand has its own
                # read counter, used below to normalise that strand
                if r.is_reverse:
                    kmer = AuxiliaryFunctions.revcomp(kmer)
                    ct_reads_r += 1
                    obs_dict_r[kmer] = obs_dict_r.get(kmer, 0) + 1
                else:
                    ct_reads_f += 1
                    obs_dict_f[kmer] = obs_dict_f.get(kmer, 0) + 1

            # Evaluating expected frequencies
            try:
                region_seq = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue
            region_rev_comp = AuxiliaryFunctions.revcomp(region_seq)

            # Iterating on each sequence position
            # NOTE(review): this range leaves out the k-mer starting at
            # len(region_seq) - k_nb; kept as is so existing tables stay
            # reproducible.
            for i in range(len(region_seq) - k_nb):
                ct_kmers += 1
                s = region_seq[i:i + k_nb]
                exp_dict_f[s] = exp_dict_f.get(s, 0) + 1
                s = region_rev_comp[i:i + k_nb]
                exp_dict_r[s] = exp_dict_r.get(s, 0) + 1
    finally:
        # Closing files
        bam_file.close()
        if fasta_file is not None:
            fasta_file.close()

    def bias_of(kmer, obs_dict, exp_dict, ct_reads):
        # Neutral bias when a frequency cannot be computed.
        if ct_reads == 0 or ct_kmers == 0:
            return 1
        obs = obs_dict.get(kmer, 0) + pseudocount
        exp = exp_dict.get(kmer, 0) + pseudocount
        return round(float(obs / ct_reads) / float(exp / ct_kmers), 6)

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = {kmer: bias_of(kmer, obs_dict_f, exp_dict_f, ct_reads_f) for kmer in kmer_comb}
    bias_table_R = {kmer: bias_of(kmer, obs_dict_r, exp_dict_r, ct_reads_r) for kmer in kmer_comb}

    # Return
    return [bias_table_F, bias_table_R]
def estimate_bias_kmer(args):
    """Estimate forward/reverse k-mer bias tables and write them with write_table.

    For every region of args.regions_file the k-mer around each read cut site
    is counted (observed) and every k-mer of the region sequence is counted
    (expected). The table value of a k-mer is the ratio of its observed and
    expected relative frequencies, rounded to 6 decimals.

    Keyword arguments:
    args -- Object providing reads_file, organism, regions_file, k_nb,
            forward_shift, reverse_shift, output_location and output_prefix.

    Return:
    None. The tables are passed to write_table(args.output_location,
    args.output_prefix, [bias_table_F, bias_table_R]).
    """
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0
    half_k = int(floor(args.k_nb / 2))

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()
    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Initializing bam and fasta; both are closed even if an error occurs
    bamFile = Samfile(args.reads_file, "rb")
    fastaFile = None
    try:
        genome_data = GenomeData(args.organism)
        fastaFile = Fastafile(genome_data.get_genome())
        regions = GenomicRegionSet("regions")
        regions.read(args.regions_file)

        # Iterating on HS regions
        for region in regions:

            # Initialization
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ####################################

            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if not r.is_reverse:
                    cut_site = r.pos + args.forward_shift - 1
                else:
                    cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - half_k
                p2 = p1 + args.k_nb

                # Verifying PCR artifacts
                if p1 == prevPos:
                    trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if trueCounter > maxDuplicates:
                    continue

                # Fetching k-mer (reads whose k-mer cannot be fetched are skipped)
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in dictionary
                if r.is_reverse:
                    currStr = AuxiliaryFunctions.revcomp(currStr)
                    ct_reads_r += 1
                    obsDictR[currStr] = obsDictR.get(currStr, 0) + 1
                else:
                    ct_reads_f += 1
                    obsDictF[currStr] = obsDictF.get(currStr, 0) + 1

            # Evaluating expected frequencies ####################################

            # Fetching whole sequence
            try:
                currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            # NOTE(review): the k-mer starting at len(currStr) - k_nb is not
            # visited by this range; kept as in the original implementation.
            for i in range(0, len(currStr) - args.k_nb):
                ct_kmers += 1
                s = currStr[i:i + args.k_nb]
                expDictF[s] = expDictF.get(s, 0) + 1
                s = currRevComp[i:i + args.k_nb]
                expDictR[s] = expDictR.get(s, 0) + 1
    finally:
        # Closing files
        bamFile.close()
        if fastaFile is not None:
            fastaFile.close()

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict()
    bias_table_R = dict()
    for kmer in kmerComb:
        obsF = obsDictF.get(kmer, 0) + pseudocount
        expF = expDictF.get(kmer, 0) + pseudocount
        obsR = obsDictR.get(kmer, 0) + pseudocount
        expR = expDictR.get(kmer, 0) + pseudocount
        # Without reads on a strand, or without any expected k-mer, the ratio is
        # undefined: use the neutral value instead of dividing by zero
        if ct_reads_f == 0 or ct_kmers == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        if ct_reads_r == 0 or ct_kmers == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
def estimate_bias_pwm(args):
    """Estimate k-mer bias tables from position frequency matrices.

    Observed matrices are built from the k-mers around read cut sites and
    expected matrices from every k-mer of the region sequences, separately for
    forward and reverse strand. The four matrices are written to
    <output_location>/pfm, their logos to <output_location>/logo, and the bias
    of each k-mer is the ratio of its get_ppm_score under the observed and the
    expected matrix (rounded to 6 decimals).

    Keyword arguments:
    args -- Object providing reads_file, organism, regions_file, k_nb,
            forward_shift, reverse_shift, output_location and output_prefix.

    Return:
    None. The tables are passed to write_table(args.output_location,
    args.output_prefix, [bias_table_F, bias_table_R]).
    """
    # Parameters
    max_duplicates = 100
    k_nb = args.k_nb
    half_k = int(floor(k_nb / 2))

    def new_counts():
        # One count row per base, one column per k-mer position
        return dict((base, [0.0] * k_nb) for base in ["A", "C", "G", "T", "N"])

    def add_sequence(counts, sequence):
        # Add one occurrence of each base of sequence at its position
        for pos, base in enumerate(sequence):
            counts[base][pos] += 1

    def ensure_dir(path):
        # Equivalent of "mkdir -p" without passing the path through a shell
        if not os.path.isdir(path):
            os.makedirs(path)
        return path

    def read_pfm(path):
        # Parse a pfm file and close the handle afterwards
        with open(path) as handle:
            return motifs.read(handle, "pfm")

    obs_f_pwm_dict = new_counts()
    exp_f_pwm_dict = new_counts()
    obs_r_pwm_dict = new_counts()
    exp_r_pwm_dict = new_counts()

    # Initializing bam and fasta; both are closed even if an error occurs
    bamFile = Samfile(args.reads_file, "rb")
    fastaFile = None
    try:
        genome_data = GenomeData(args.organism)
        fastaFile = Fastafile(genome_data.get_genome())
        regions = GenomicRegionSet("regions")
        regions.read(args.regions_file)

        # Iterating on HS regions
        for region in regions:
            # Initialization
            prev_pos = -1
            true_counter = 0

            # Evaluating observed frequencies
            for r in bamFile.fetch(region.chrom, region.initial, region.final):
                # Calculating positions
                if not r.is_reverse:
                    cut_site = r.pos + args.forward_shift - 1
                else:
                    cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - half_k
                p2 = p1 + k_nb

                # Verifying PCR artifacts
                if p1 == prev_pos:
                    true_counter += 1
                else:
                    prev_pos = p1
                    true_counter = 0
                if true_counter > max_duplicates:
                    continue

                # Fetching k-mer (reads whose k-mer cannot be fetched are skipped)
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in the matrix of the read strand
                if r.is_reverse:
                    add_sequence(obs_r_pwm_dict, AuxiliaryFunctions.revcomp(currStr))
                else:
                    add_sequence(obs_f_pwm_dict, currStr)

            # Evaluating expected frequencies
            try:
                currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue

            # Iterating on each sequence position
            for start in range(0, len(currStr) - k_nb):
                s = currStr[start:start + k_nb]
                add_sequence(exp_f_pwm_dict, s)
                add_sequence(exp_r_pwm_dict, AuxiliaryFunctions.revcomp(s))
    finally:
        # Closing files
        bamFile.close()
        if fastaFile is not None:
            fastaFile.close()

    # Output pwms
    pfm_dir = ensure_dir(os.path.join(args.output_location, "pfm"))
    pwm_obs_f = os.path.join(pfm_dir, "obs_{}_f.pfm".format(str(k_nb)))
    pwm_obs_r = os.path.join(pfm_dir, "obs_{}_r.pfm".format(str(k_nb)))
    pwm_exp_f = os.path.join(pfm_dir, "exp_{}_f.pfm".format(str(k_nb)))
    pwm_exp_r = os.path.join(pfm_dir, "exp_{}_r.pfm".format(str(k_nb)))
    pwm_outputs = [(pwm_obs_f, obs_f_pwm_dict), (pwm_obs_r, obs_r_pwm_dict),
                   (pwm_exp_f, exp_f_pwm_dict), (pwm_exp_r, exp_r_pwm_dict)]
    for pwm_path, pwm_dict in pwm_outputs:
        with open(pwm_path, "w") as pwm_file:
            # Only the four nucleotide rows are written; "N" counts are dropped
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")

    motif_obs_f = read_pfm(pwm_obs_f)
    motif_obs_r = read_pfm(pwm_obs_r)
    motif_exp_f = read_pfm(pwm_exp_f)
    motif_exp_r = read_pfm(pwm_exp_r)

    # Output logos
    logo_dir = ensure_dir(os.path.join(args.output_location, "logo"))
    logo_obs_f = os.path.join(logo_dir, "obs_{}_f.pdf".format(str(k_nb)))
    logo_obs_r = os.path.join(logo_dir, "obs_{}_r.pdf".format(str(k_nb)))
    logo_exp_f = os.path.join(logo_dir, "exp_{}_f.pdf".format(str(k_nb)))
    logo_exp_r = os.path.join(logo_dir, "exp_{}_r.pdf".format(str(k_nb)))
    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary. The matrices do not change inside the loop, so
    # they are read from the motifs once instead of once per k-mer.
    ppm_obs_f = motif_obs_f.pwm
    ppm_exp_f = motif_exp_f.pwm
    ppm_obs_r = motif_obs_r.pwm
    ppm_exp_r = motif_exp_r.pwm
    alphabet = ["A", "C", "G", "T"]
    bias_table_F = dict()
    bias_table_R = dict()
    for k_mer in ["".join(e) for e in product(alphabet, repeat=k_nb)]:
        obs_f = get_ppm_score(k_mer, ppm_obs_f, k_nb)
        exp_f = get_ppm_score(k_mer, ppm_exp_f, k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, ppm_obs_r, k_nb)
        exp_r = get_ppm_score(k_mer, ppm_exp_r, k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
def get_bias_raw_bc_signal(self, ref, start, end, bam, fasta, bias_table, forward_shift, reverse_shift,
                           strand=False):
    """Return bias, raw and bias-corrected cut signals for ref:[start, end).

    Keyword arguments:
    ref -- Reference (chromosome) name passed to bam.fetch and fasta.fetch.
    start, end -- Interval of interest.
    bam -- Object whose fetch(ref, start, end) yields reads with is_unmapped,
           is_reverse, pos and aend. If None, self.bam is used on the
           raw-count fallback path.
    fasta -- Object whose fetch(ref, start, end) returns the sequence.
    bias_table -- Pair [forward, reverse] of dictionaries k-mer -> bias.
    forward_shift, reverse_shift -- Offsets applied to forward read starts and
           reverse read ends to obtain the cut site.
    strand -- If true, the per-strand raw and corrected signals are returned too.

    Return:
    (bias_f, bias_r, raw, bc) or, if strand is true,
    (bias_f, bias_r, raw, raw_f, raw_r, bc, bc_f, bc_r).
    When the extended window would touch position 0 (p1 <= 0, p1_w <= 0 or
    p2_wk <= 0) a single list with the raw cut counts of [start, end) is
    returned instead.
    """
    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    # next(iter(...)) works on Python 2 and 3, unlike dict.keys()[0]
    k_nb = len(next(iter(fBiasDict)))
    # Floor division keeps every coordinate integral on Python 3
    half_w = window // 2
    half_k = k_nb // 2
    p1 = start
    p2 = end
    p1_w = p1 - half_w
    p2_w = p2 + half_w
    p1_wk = p1_w - half_k
    p2_wk = p2_w + half_k

    def count_cuts(reads, lo, hi):
        # Per-position forward/reverse cut counts of the mapped reads in [lo, hi)
        fwd = [0.0] * (hi - lo)
        rev = [0.0] * (hi - lo)
        for read in reads:
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if lo <= cut_site < hi:
                    fwd[cut_site - lo] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if lo <= cut_site < hi:
                    rev[cut_site - lo] += 1.0
        return fwd, rev

    def kmer_bias(fwd_seq, rev_seq):
        # Bias value of the k-mer around every position of the sequence.
        # NOTE(review): the slices are 2 * (k_nb // 2) long, so for odd k_nb the
        # lookup never matches a table key and the default value is used.
        size = len(fwd_seq)
        f_vals = []
        r_vals = []
        for i in range(half_k, size - half_k + 1):
            fseq = fwd_seq[i - half_k:i + half_k]
            rseq = rev_seq[size - half_k - i:size + half_k - i]
            f_vals.append(fBiasDict.get(fseq, defaultKmerValue))
            r_vals.append(rBiasDict.get(rseq, defaultKmerValue))
        return f_vals, r_vals

    def rolling_sums(values):
        # Sum of values[j:j + window] for every j, updated incrementally
        total = sum(values[:window])
        last = values[0]
        sums = []
        for i in range(half_w, len(values) - half_w):
            sums.append(total)
            total -= last
            total += values[i + half_w]
            last = values[i - half_w + 1]
        return sums

    if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
        # Return raw counts
        reads_source = bam if bam is not None else self.bam
        fwd, rev = count_cuts(reads_source.fetch(ref, p1, p2), p1, p2)
        return [f + r for f, r in zip(fwd, rev)]

    # Bias signal used for the correction (sequence shifted like the cut sites)
    currStr = str(fasta.fetch(ref, p1_wk - 1 + forward_shift, p2_wk - 2 + forward_shift)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + reverse_shift + 2,
                                                             p2_wk + reverse_shift + 1)).upper())
    signal_bias_f, signal_bias_r = kmer_bias(currStr, currRevComp)

    # Raw counts
    signal_raw_f, signal_raw_r = count_cuts(bam.fetch(ref, p1_w, p2_w), p1_w, p2_w)

    # Smoothed counts and smoothed bias
    Nf = rolling_sums(signal_raw_f)
    Nr = rolling_sums(signal_raw_r)
    bias_sum_f = rolling_sums(signal_bias_f)
    bias_sum_r = rolling_sums(signal_bias_r)

    # Calculating the corrected signal
    raw = []
    raw_f = []
    raw_r = []
    bc = []
    bc_f = []
    bc_r = []
    for i in range(half_w, len(signal_bias_f) - half_w):
        nhatf = Nf[i - half_w] * (signal_bias_f[i] / bias_sum_f[i - half_w])
        nhatr = Nr[i - half_w] * (signal_bias_r[i] / bias_sum_r[i - half_w])
        raw.append(signal_raw_f[i] + signal_raw_r[i])
        raw_f.append(signal_raw_f[i])
        raw_r.append(signal_raw_r[i])
        bc.append(nhatf + nhatr)
        bc_f.append(nhatf)
        bc_r.append(nhatr)

    # The reported bias signal is computed on the unshifted sequence
    currStr = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())
    signal_bias_f, signal_bias_r = kmer_bias(currStr, currRevComp)
    bias_f = signal_bias_f[half_w:len(signal_bias_f) - half_w]
    bias_r = signal_bias_r[half_w:len(signal_bias_r) - half_w]

    if strand:
        return bias_f, bias_r, raw, raw_f, raw_r, bc, bc_f, bc_r
    else:
        return bias_f, bias_r, raw, bc
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name, forward_shift, reverse_shift):
    """Return the bias-corrected cut signal of chrom:[start, end).

    Keyword arguments:
    chrom -- Reference (chromosome) name.
    start, end -- Interval of interest.
    bam -- Object whose fetch(chrom, start, end) yields reads with is_unmapped,
           is_reverse, pos and aend.
    bias_table -- Pair [forward, reverse] of dictionaries k-mer -> bias.
    genome_file_name -- FASTA file the sequence is fetched from.
    forward_shift, reverse_shift -- Offsets applied to forward read starts and
           reverse read ends to obtain the cut site.

    Return:
    A list with one corrected value per position. When the extended window
    would touch position 0 (p1 <= 0, p1_w <= 0 or p2_wk <= 0) the raw cut
    counts of [start, end) are returned instead and the genome file is not opened.
    """
    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    # next(iter(...)) works on Python 2 and 3, unlike dict.keys()[0]
    k_nb = len(next(iter(fBiasDict)))
    # Floor division keeps every coordinate integral on Python 3
    half_w = window // 2
    k_lo = int(floor(k_nb / 2.))
    k_hi = int(ceil(k_nb / 2.))
    p1 = start
    p2 = end
    p1_w = p1 - half_w
    p2_w = p2 + half_w
    p1_wk = p1_w - k_lo
    p2_wk = p2_w + k_hi

    def count_cuts(lo, hi):
        # Per-position forward/reverse cut counts of the mapped reads in [lo, hi)
        fwd = [0.0] * (hi - lo)
        rev = [0.0] * (hi - lo)
        for read in bam.fetch(chrom, lo, hi):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if lo <= cut_site < hi:
                    fwd[cut_site - lo] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if lo <= cut_site < hi:
                    rev[cut_site - lo] += 1.0
        return fwd, rev

    def rolling_sums(values):
        # Sum of values[j:j + window] for every j, updated incrementally
        total = sum(values[:window])
        last = values[0]
        sums = []
        for i in range(half_w, len(values) - half_w):
            sums.append(total)
            total -= last
            total += values[i + half_w]
            last = values[i - half_w + 1]
        return sums

    if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
        # Return raw counts
        fwd, rev = count_cuts(p1, p2)
        return [f + r for f, r in zip(fwd, rev)]

    # Raw counts
    nf, nr = count_cuts(p1_w, p2_w)

    # Smoothed counts
    Nf = rolling_sums(nf)
    Nr = rolling_sums(nr)

    # Fetching sequence; the file is opened only here and always closed
    fastaFile = Fastafile(genome_file_name)
    try:
        currStr = str(fastaFile.fetch(chrom, p1_wk, p2_wk - 1)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrom, p1_wk + 1, p2_wk)).upper())
    finally:
        fastaFile.close()

    # Iterating on sequence to create signal
    size = len(currStr)
    af = []
    ar = []
    for i in range(k_hi, size - k_lo + 1):
        fseq = currStr[i - k_lo:i + k_hi]
        rseq = currRevComp[size - k_hi - i:size + k_lo - i]
        af.append(fBiasDict.get(fseq, defaultKmerValue))
        ar.append(rBiasDict.get(rseq, defaultKmerValue))

    # Calculating the corrected signal
    af_sums = rolling_sums(af)
    ar_sums = rolling_sums(ar)
    bc_signal = []
    for i in range(half_w, len(af) - half_w):
        nhatf = Nf[i - half_w] * (af[i] / af_sums[i - half_w])
        nhatr = Nr[i - half_w] * (ar[i] / ar_sums[i - half_w])
        bc_signal.append(nhatf + nhatr)

    return bc_signal
def load_gene_list(self, file_name, filter_havana=True, protein_coding=False, known_only=False):
    """Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

    *Keyword arguments:*

        - file_name -- The gencode .gtf file name.
        - filter_havana -- If true, lines whose second column is "HAVANA" are skipped.
        - protein_coding -- If true, only entries whose gene_type is "protein_coding" and whose
          transcript_type (when present) is "protein_coding" are kept.
        - known_only -- If true, only entries whose gene_status is "KNOWN" and whose
          transcript_status (when present) is "KNOWN" are kept.

    Each appended entry is a list: the GenomicRegion, columns 2, 3 and 8 of the line, the values of
    gene_id, transcript_id, gene_type, gene_status, gene_name, transcript_type, transcript_status,
    transcript_name and level (None when absent), and two empty lists.

    The process exits with status 1 if the file cannot be opened.
    """
    # Opening GTF file
    try:
        gtf_file = open(file_name, "r")
    except (IOError, OSError):
        print("Error: Cannot find the annotation file: " + file_name)
        print("Please check the path in ~/rgtdata/data.config")
        sys.exit(1)

    addt_keys = ["gene_id", "transcript_id", "gene_type", "gene_status", "gene_name",
                 "transcript_type", "transcript_status", "transcript_name", "level"]

    # Reading GTF file; the with-block closes it even if a line fails to parse
    with gtf_file:
        for line in gtf_file:

            # Processing line (blank lines and comments are ignored)
            line = line.strip()
            if not line or line[0] == "#":
                continue
            line_list = line.split("\t")
            if filter_havana and len(line_list) > 1 and line_list[1] == "HAVANA":
                continue

            # Processing additional list of options
            addt_dict = dict()
            for addt_element in line_list[8].split(";"):
                addt_element_list = [_f for _f in addt_element.split(" ") if _f]
                # Fragments without both a key and a value carry no information
                if len(addt_element_list) < 2:
                    continue
                # Removing " symbol from string options
                addt_dict[addt_element_list[0]] = addt_element_list[1].replace("\"", "")

            # filter non-protein-coding sequences, if required
            if protein_coding:
                if addt_dict.get("gene_type") != "protein_coding":
                    continue
                if "transcript_type" in addt_dict and addt_dict["transcript_type"] != "protein_coding":
                    continue

            # filter unknown sequences, if required
            if known_only:
                if addt_dict.get("gene_status") != "KNOWN":
                    continue
                if "transcript_status" in addt_dict and addt_dict["transcript_status"] != "KNOWN":
                    continue

            # Removing dot from IDs
            addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
            if "transcript_id" in addt_dict:
                addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

            # Creating final version of additional arguments
            final_addt_list = [addt_dict.get(addt_key) for addt_key in addt_keys]

            # Handling score
            current_score = 0
            if AuxiliaryFunctions.string_is_int(line_list[5]):
                current_score = AuxiliaryFunctions.correct_standard_bed_score(line_list[5])

            # Creating GenomicRegion (column 4 is shifted by -1, column 5 is used as is)
            genomic_region = GenomicRegion(chrom=line_list[0], initial=int(line_list[3]) - 1,
                                           final=int(line_list[4]), orientation=line_list[6],
                                           data=current_score)

            # Creating final vector
            extra_index_elements = [[], []]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
            final_vector = [genomic_region, line_list[1], line_list[2], line_list[7]] \
                + final_addt_list + extra_index_elements
            self.gene_list.append(final_vector)
def get_bc_signal_by_fragment_length(self, ref, start, end, bam, fasta, bias_table, forward_shift,
                                     reverse_shift, min_length=None, max_length=None, strand=True):
    """Return the bias-corrected cut signal of ref:[start, end), optionally restricted by fragment length.

    Keyword arguments:
    ref -- Reference (chromosome) name passed to bam.fetch and fasta.fetch.
    start, end -- Interval of interest.
    bam -- Object whose fetch(ref, start, end) yields reads with is_unmapped,
           is_reverse, pos, aend and template_length. If None, self.bam is
           used on the raw-count fallback path.
    fasta -- Object whose fetch(ref, start, end) returns the sequence.
    bias_table -- Pair [forward, reverse] of dictionaries k-mer -> bias.
    forward_shift, reverse_shift -- Offsets applied to forward read starts and
           reverse read ends to obtain the cut site.
    min_length -- If given, only reads with abs(template_length) > min_length count.
    max_length -- If given, only reads with abs(template_length) <= max_length count.
    strand -- If true, forward and reverse signals are returned separately.

    Return:
    (forward, reverse) numpy arrays if strand is true, otherwise their sum.
    When the extended window would touch position 0 (p1 <= 0, p1_w <= 0 or
    p2_wk <= 0) a single list with the raw cut counts of [start, end) is
    returned instead; the length limits are not applied on that path.
    """
    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    # next(iter(...)) works on Python 2 and 3, unlike dict.keys()[0]
    k_nb = len(next(iter(fBiasDict)))
    # Floor division keeps every coordinate integral on Python 3
    half_w = window // 2
    half_k = k_nb // 2
    p1 = start
    p2 = end
    p1_w = p1 - half_w
    p2_w = p2 + half_w
    p1_wk = p1_w - half_k
    p2_wk = p2_w + half_k

    def count_cuts(reads, lo, hi, filter_length):
        # Per-position forward/reverse cut counts of the mapped reads in [lo, hi)
        fwd = [0.0] * (hi - lo)
        rev = [0.0] * (hi - lo)
        for read in reads:
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if filter_length:
                length = abs(read.template_length)
                if min_length is not None and length <= min_length:
                    continue
                if max_length is not None and length > max_length:
                    continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if lo <= cut_site < hi:
                    fwd[cut_site - lo] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if lo <= cut_site < hi:
                    rev[cut_site - lo] += 1.0
        return fwd, rev

    def rolling_sums(values):
        # Sum of values[j:j + window] for every j, updated incrementally
        total = sum(values[:window])
        last = values[0]
        sums = []
        for i in range(half_w, len(values) - half_w):
            sums.append(total)
            total -= last
            total += values[i + half_w]
            last = values[i - half_w + 1]
        return sums

    if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
        # Return raw counts
        reads_source = bam if bam is not None else self.bam
        fwd, rev = count_cuts(reads_source.fetch(ref, p1, p2), p1, p2, False)
        return [f + r for f, r in zip(fwd, rev)]

    currStr = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())

    # Iterating on sequence to create the bias signal
    # NOTE(review): the slices are 2 * (k_nb // 2) long, so for odd k_nb the
    # lookup never matches a table key and the default value is used.
    size = len(currStr)
    signal_bias_f = []
    signal_bias_r = []
    for i in range(half_k, size - half_k + 1):
        fseq = currStr[i - half_k:i + half_k]
        rseq = currRevComp[size - half_k - i:size + half_k - i]
        signal_bias_f.append(fBiasDict.get(fseq, defaultKmerValue))
        signal_bias_r.append(rBiasDict.get(rseq, defaultKmerValue))

    # Raw counts; one loop covers every min_length/max_length combination
    filter_length = min_length is not None or max_length is not None
    raw_f, raw_r = count_cuts(bam.fetch(ref, p1_w, p2_w), p1_w, p2_w, filter_length)

    # Smoothed counts and smoothed bias
    Nf = rolling_sums(raw_f)
    Nr = rolling_sums(raw_r)
    bias_sum_f = rolling_sums(signal_bias_f)
    bias_sum_r = rolling_sums(signal_bias_r)

    # Calculating the corrected signal
    bc_f = []
    bc_r = []
    for i in range(half_w, len(signal_bias_f) - half_w):
        bc_f.append(Nf[i - half_w] * (signal_bias_f[i] / bias_sum_f[i - half_w]))
        bc_r.append(Nr[i - half_w] * (signal_bias_r[i] / bias_sum_r[i - half_w]))

    if strand:
        return np.array(bc_f), np.array(bc_r)
    else:
        return np.add(np.array(bc_f), np.array(bc_r))
def get_bias_raw_bc_signal(self, ref, start, end, bam, fasta, bias_table, forward_shift, reverse_shift,
                           strand=False):
    """Return bias, raw and bias-corrected cut signals for ref:[start, end).

    Unmapped reads are ignored: they have no alignment end, so using them
    would fail when the reverse cut site is computed.

    Keyword arguments:
    ref -- Reference (chromosome) name passed to bam.fetch and fasta.fetch.
    start, end -- Interval of interest.
    bam -- Object whose fetch(ref, start, end) yields reads with is_unmapped,
           is_reverse, pos and aend. If None, self.bam is used on the
           raw-count fallback path.
    fasta -- Object whose fetch(ref, start, end) returns the sequence.
    bias_table -- Pair [forward, reverse] of dictionaries k-mer -> bias.
    forward_shift, reverse_shift -- Offsets applied to forward read starts and
           reverse read ends to obtain the cut site.
    strand -- If true, the per-strand raw and corrected signals are returned too.

    Return:
    (bias_f, bias_r, raw, bc) or, if strand is true,
    (bias_f, bias_r, raw, raw_f, raw_r, bc, bc_f, bc_r).
    When the extended window would touch position 0 (p1 <= 0, p1_w <= 0 or
    p2_wk <= 0) a single list with the raw cut counts of [start, end) is
    returned instead.
    """
    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    # next(iter(...)) works on Python 2 and 3, unlike dict.keys()[0]
    k_nb = len(next(iter(fBiasDict)))
    # Floor division keeps every coordinate integral on Python 3
    half_w = window // 2
    half_k = k_nb // 2
    p1 = start
    p2 = end
    p1_w = p1 - half_w
    p2_w = p2 + half_w
    p1_wk = p1_w - half_k
    p2_wk = p2_w + half_k

    def tally_cut_sites(reads, lo, hi):
        # Per-position forward/reverse cut counts of the mapped reads in [lo, hi)
        forward = [0.0] * (hi - lo)
        reverse = [0.0] * (hi - lo)
        for read in reads:
            if read.is_unmapped:
                continue
            if read.is_reverse:
                cut_site = read.aend + reverse_shift - 1
                target = reverse
            else:
                cut_site = read.pos + forward_shift
                target = forward
            if lo <= cut_site < hi:
                target[cut_site - lo] += 1.0
        return forward, reverse

    def bias_along(fwd_seq, rev_seq):
        # Bias value of the k-mer around every position of the sequence.
        # NOTE(review): the slices are 2 * (k_nb // 2) long, so for odd k_nb the
        # lookup never matches a table key and the default value is used.
        size = len(fwd_seq)
        f_vals = []
        r_vals = []
        for i in range(half_k, size - half_k + 1):
            fseq = fwd_seq[i - half_k:i + half_k]
            rseq = rev_seq[size - half_k - i:size + half_k - i]
            f_vals.append(fBiasDict.get(fseq, defaultKmerValue))
            r_vals.append(rBiasDict.get(rseq, defaultKmerValue))
        return f_vals, r_vals

    def window_sums(values):
        # Sum of values[j:j + window] for every j, updated incrementally
        total = sum(values[:window])
        last = values[0]
        sums = []
        for i in range(half_w, len(values) - half_w):
            sums.append(total)
            total -= last
            total += values[i + half_w]
            last = values[i - half_w + 1]
        return sums

    if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
        # Return raw counts
        reads_source = bam if bam is not None else self.bam
        forward, reverse = tally_cut_sites(reads_source.fetch(ref, p1, p2), p1, p2)
        return [f + r for f, r in zip(forward, reverse)]

    # Bias signal used for the correction (sequence shifted like the cut sites)
    currStr = str(fasta.fetch(ref, p1_wk - 1 + forward_shift, p2_wk - 2 + forward_shift)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + reverse_shift + 2,
                                                             p2_wk + reverse_shift + 1)).upper())
    signal_bias_f, signal_bias_r = bias_along(currStr, currRevComp)

    # Raw counts
    signal_raw_f, signal_raw_r = tally_cut_sites(bam.fetch(ref, p1_w, p2_w), p1_w, p2_w)

    # Smoothed counts and smoothed bias
    Nf = window_sums(signal_raw_f)
    Nr = window_sums(signal_raw_r)
    bias_sum_f = window_sums(signal_bias_f)
    bias_sum_r = window_sums(signal_bias_r)

    # Calculating the corrected signal
    raw = []
    raw_f = []
    raw_r = []
    bc = []
    bc_f = []
    bc_r = []
    for i in range(half_w, len(signal_bias_f) - half_w):
        nhatf = Nf[i - half_w] * (signal_bias_f[i] / bias_sum_f[i - half_w])
        nhatr = Nr[i - half_w] * (signal_bias_r[i] / bias_sum_r[i - half_w])
        raw.append(signal_raw_f[i] + signal_raw_r[i])
        raw_f.append(signal_raw_f[i])
        raw_r.append(signal_raw_r[i])
        bc.append(nhatf + nhatr)
        bc_f.append(nhatf)
        bc_r.append(nhatr)

    # The reported bias signal is computed on the unshifted sequence
    currStr = str(fasta.fetch(ref, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + 1, p2_wk)).upper())
    signal_bias_f, signal_bias_r = bias_along(currStr, currRevComp)
    bias_f = signal_bias_f[half_w:len(signal_bias_f) - half_w]
    bias_r = signal_bias_r[half_w:len(signal_bias_r) - half_w]

    if strand:
        return bias_f, bias_r, raw, raw_f, raw_r, bc, bc_f, bc_r
    else:
        return bias_f, bias_r, raw, bc
def load_gene_list(self, file_name, filter_havana=True, protein_coding=False, known_only=False):
    """
    Reads gene annotation in gtf (gencode) format. It populates self.gene_list with such entries.

    Keyword arguments:
    file_name -- The gencode .gtf file name.
    filter_havana -- If true, lines whose second column is "HAVANA" are skipped.
    protein_coding -- If true, only entries whose gene_type and transcript_type
                      contain "protein_coding" are kept.
    known_only -- If true, only entries whose gene_status and transcript_status
                  contain "KNOWN" are kept.

    Each appended entry is a list: the GenomicRegion, columns 2, 3 and 8 of the line,
    the values of gene_id, transcript_id, gene_type, gene_status, gene_name,
    transcript_type, transcript_status, transcript_name and level (None when absent),
    and two empty lists.

    Return:
    void. The error raised by open() propagates if the file cannot be opened.
    """

    addt_keys = ["gene_id", "transcript_id", "gene_type", "gene_status", "gene_name",
                 "transcript_type", "transcript_status", "transcript_name", "level"]

    # Opening GTF file; the with-block closes it even if a line fails to parse
    with open(file_name, "r") as gtf_file:

        # Reading GTF file
        for line in gtf_file:

            # Processing line (blank lines and comments are ignored)
            line = line.strip()
            if not line or line[0] == "#":
                continue
            line_list = line.split("\t")
            if filter_havana and line_list[1] == "HAVANA":
                continue

            # Processing additional list of options
            addt_dict = dict()
            for addt_element in line_list[8].split(";"):
                # A list (not a lazy filter object) is needed for indexing on Python 3
                addt_element_list = [token for token in addt_element.split(" ") if token]
                # Fragments without both a key and a value carry no information
                if len(addt_element_list) < 2:
                    continue
                # Removing " symbol from string options
                addt_dict[addt_element_list[0]] = addt_element_list[1].replace("\"", "")

            # Filtering by attribute name instead of by attribute position, so
            # that the attribute order of the file does not matter. A missing
            # attribute does not satisfy the filter.
            if protein_coding and ("protein_coding" not in addt_dict.get("gene_type", "") or
                                   "protein_coding" not in addt_dict.get("transcript_type", "")):
                continue
            if known_only and ("KNOWN" not in addt_dict.get("gene_status", "") or
                               "KNOWN" not in addt_dict.get("transcript_status", "")):
                continue

            # Removing dot from IDs
            addt_dict["gene_id"] = addt_dict["gene_id"].split(".")[0]
            if "transcript_id" in addt_dict:
                addt_dict["transcript_id"] = addt_dict["transcript_id"].split(".")[0]

            # Creating final version of additional arguments
            final_addt_list = [addt_dict.get(addt_key) for addt_key in addt_keys]

            # Handling score
            current_score = 0
            if AuxiliaryFunctions.string_is_int(line_list[5]):
                current_score = AuxiliaryFunctions.correct_standard_bed_score(line_list[5])

            # Creating GenomicRegion (column 4 is shifted by -1, column 5 is used as is)
            genomic_region = GenomicRegion(chrom=line_list[0], initial=int(line_list[3]) - 1,
                                           final=int(line_list[4]), orientation=line_list[6],
                                           data=current_score)

            # Creating final vector
            extra_index_elements = [[], []]  # One list for each: EXACT_GENE_MATCHES, INEXACT_GENE_MATCHES
            final_vector = [genomic_region, line_list[1], line_list[2], line_list[7]] \
                + final_addt_list + extra_index_elements
            self.gene_list.append(final_vector)