def getFeatureCount(f, todir="./data", mode="cN"):
    """Count how many fragment midpoints of a BEDPE file hit each annotation.

    Fragments whose two ends lie on different chromosomes, or on ``chrM``,
    are ignored.  Each remaining fragment whose length matches ``mode`` is
    reduced to its 1-bp midpoint, and every non-empty value found at that
    position in the module-level ``model`` has its counter incremented.
    The counters are written as a tab-separated pandas Series to
    ``<todir>/<basename without .bedpe.gz>_<mode>.txt``.  Nothing is done
    when that file already exists.

    Args:
        f: Path to a gzip-compressed, tab-separated BEDPE file.
        todir: Directory the result file is written to.
        mode: ``"cN"`` keeps fragments with 140 <= length <= 180; ``"sP"``
            keeps fragments with length <= 80.  Any other value keeps
            nothing.

    Returns:
        None.
    """
    n = f.split("/")[-1].replace(".bedpe.gz", "")
    fout = todir + "/" + n + '_%s.txt' % mode
    if os.path.isfile(fout):
        return
    ss = {}
    # Text mode: the lines are split with str separators below, which fails
    # on the bytes objects a binary gzip handle yields on Python 3.
    with gzip.open(f, "rt") as fin:
        for i, line in enumerate(fin):
            if i % 10000 == 0:
                cFlush("%s read from %s" % (i, f))
            line = line.split("\n")[0].split("\t")
            if line[0] != line[3] or line[0] == "chrM":
                continue
            s = min(int(line[1]), int(line[4]))
            e = max(int(line[2]), int(line[5]))
            d = e - s
            if mode == "cN":
                keep = 140 <= d <= 180
            elif mode == "sP":
                keep = d <= 80
            else:
                keep = False
            if not keep:
                continue
            # Floor division keeps the midpoint an integer coordinate.
            m = (s + e) // 2
            iv = HTSeq.GenomicInterval(line[0], m, m + 1)
            for niv, nv in model[iv].steps():
                # NOTE(review): nv is used as a dict key, so the values held
                # by `model` must be hashable - confirm against how `model`
                # is built.
                if nv != set([]):
                    ss[nv] = ss.get(nv, 0) + 1
    print()
    ss = pd.Series(ss)
    ss.to_csv(fout, sep="\t")
    print(f, "finished")
    logger.info("file:%s,mode:%s,features:%s" % (f, mode, len(ss)))
def find_introns(self):
    """Fill ``self.elements['intron']`` with the gaps between sorted exons.

    Returns False when no error was encountered and True when two
    neighbouring exons (sorted by start) touch or overlap, in which case a
    message is printed and the introns collected so far are left in place.
    If a non-empty intron list already exists it is kept untouched.
    """
    elements = self.elements

    # Introns were already computed earlier: nothing to do.
    if 'intron' in elements and len(elements['intron']):
        return False

    elements['intron'] = []

    if 'exon' not in elements:
        return False
    if len(elements['exon']) == 1:
        return False

    ordered = sorted(elements['exon'], key=lambda x: x.start)

    # Walk over consecutive exon pairs; each pair delimits one intron.
    for upstream, downstream in zip(ordered, ordered[1:]):
        if upstream.end >= downstream.start:
            print(f"Exons not intrepretable:", upstream, downstream, '\n-\n')
            return True
        intron = HTSeq.GenomicInterval(
            upstream.chrom, upstream.end, downstream.start, upstream.strand)
        elements['intron'].append(intron)

    return False
def VariantCallTabReader(filepath, chrom_size):
    """Read a tab-separated variant-call table and group calls by type.

    The table must provide the columns ``var_type`` and ``chr``; the first
    column is taken as the call's accession.  Spaces in the variant type
    are replaced with underscores.  A call is only recorded when
    ``'chr' + <chr>`` is a key of ``chrom_size``; its start and end come
    from ``inner_outer_pref(row, 'start')`` and
    ``inner_outer_pref(row, 'stop')``.

    Args:
        filepath: Path of the tab-separated input file.
        chrom_size: Mapping whose keys are the accepted chromosome names.

    Returns:
        A tuple ``(var_types_ga, var_types_id)`` of two dicts keyed by
        variant type.  The first holds lists of unstranded
        ``HTSeq.GenomicInterval`` objects, the second the matching
        accessions in the same order.  A variant type seen only on rejected
        chromosomes still appears, with empty lists.
    """
    infile = pd.read_csv(filepath, sep="\t")
    var_types_ga = {}
    var_types_id = {}
    for _, line in infile.iterrows():
        var_type = str(line['var_type']).replace(" ", "_")
        intervals = var_types_ga.setdefault(var_type, [])
        accessions = var_types_id.setdefault(var_type, [])
        chrom = 'chr' + str(line['chr'])
        if chrom not in chrom_size:
            continue
        # Explicit positional access: `line[0]` on a label-indexed Series
        # relies on a deprecated positional fallback.
        accession = line.iloc[0]
        start = inner_outer_pref(line, 'start')
        end = inner_outer_pref(line, 'stop')
        # Create a genomic interval from this variant call.
        intervals.append(HTSeq.GenomicInterval(chrom, start, end, "."))
        accessions.append(accession)
    return (var_types_ga, var_types_id)
def Get_IPAsite_IPUI(input_tuple):
    """Compute abundance ratios for one IPA event.

    ``input_tuple`` is ``(IPAevent, curr_label_all_ga, gas)``:

    * ``IPAevent`` is ``"label;intron_rank;IPA_inf;IPAtype"`` where
      ``intron_rank`` looks like ``"<text>_<int>"`` and ``IPA_inf`` like
      ``"<text>:<pos>-<pos>"``;
    * ``curr_label_all_ga`` is a sequence of per-sample arrays indexable by
      an ``HTSeq.GenomicInterval``;
    * ``gas`` exposes ``steps()`` yielding ``(interval, value)`` pairs.

    The annotation rows come from the module-level ``annot[label]`` and the
    sample count from the module-level ``all_bamfiles``.

    Returns:
        ``[]`` when no annotated intron passes the filters, otherwise
        ``[SYMBOL, intron_rank, IPA_inf, IPAtype, *ratios, diff, P_val]``
        for the last intron that passed.
    """
    IPAevent, sample_arrays, gas = input_tuple
    label, intron_rank, IPA_inf, IPAtype = IPAevent.split(";")
    wanted_rank = int(intron_rank.split("_")[1])
    positions = list(map(int, IPA_inf.split(":")[1].split('-')))
    SYMBOL = label.split(":")[1].split("|")[0]
    n_samples = len(all_bamfiles)

    result = []
    for (feature, rank, chrom, start, end, strand, length,
         exon_rank_left, exon_rank_right) in annot[label]:
        if feature != "intron" or int(rank) != wanted_rank:
            continue

        iv = HTSeq.GenomicInterval(chrom, start, end, strand)
        on_minus = strand == "-"

        # Offsets of the IPA segment inside the intron, in transcript
        # orientation (the coverage is reversed for the minus strand).
        if on_minus and len(sample_arrays) > 0:
            seg_from = int(end) - positions[1]
            seg_to = int(end) - positions[0]
        else:
            seg_from = positions[0] - int(start)
            seg_to = positions[1] - int(start)

        coverages = []
        for ga in sample_arrays:
            per_base = list(ga[iv])
            coverages.append(per_base[::-1] if on_minus else per_base)

        seg_len = int((seg_to - seg_from) / 1.5)
        IPA_isoform_abundance = [
            np.mean(cov[seg_from:(seg_from + seg_len)]) for cov in coverages
        ]

        flank_rank = exon_rank_left if strand == "+" else exon_rank_right
        exon_iv = tuple(
            step_iv for step_iv, step_val in gas.steps()
            if step_val == {('exon', int(flank_rank))}
        )
        if len(exon_iv) != 1:
            continue

        # Exon abundance: mean of the 30 highest per-base values.
        exon_abundance = [
            np.mean(sorted(list(ga[exon_iv[0]]), reverse=True)[:30])
            for ga in sample_arrays
        ]

        n_expressed = sum(np.array(exon_abundance) > 10)
        if n_expressed != n_samples:
            continue
        margins = np.array(
            [x - y for x, y in zip(exon_abundance, IPA_isoform_abundance)])
        if not sum(margins > 0) > n_samples * 0.5:
            continue

        IPARatio_list = [
            round(x / y, 3)
            for x, y in zip(IPA_isoform_abundance, exon_abundance)
        ]
        IPUI_condition_diff = round(IPARatio_list[1] - IPARatio_list[0], 3)
        table = [
            [IPA_isoform_abundance[0],
             exon_abundance[0] - IPA_isoform_abundance[0]],
            [IPA_isoform_abundance[1],
             exon_abundance[1] - IPA_isoform_abundance[1]],
        ]
        ratio_val, P_val = sp.stats.fisher_exact(table)
        result = ([SYMBOL, intron_rank, IPA_inf, IPAtype] + IPARatio_list
                  + [IPUI_condition_diff, P_val])
    return result
def getCov(f, paired=True):
    """Build a set-valued coverage model from a gzipped BED/BEDPE file.

    Every distinct ``(chrom, start, end)`` region is added once to an
    unstranded ``HTSeq.GenomicArrayOfSets``, tagged with the (stringified)
    index of the first line that produced it.

    Args:
        f: Path to a gzip-compressed, tab-separated file.
        paired: When true, lines are read as BEDPE: pairs whose two ends
            are on different chromosomes are skipped and the region spans
            both ends.  Otherwise columns 2 and 3 are used directly.

    Returns:
        ``(number_of_unique_regions, model)``, or ``(0, None)`` when the
        file contains no line at all.
    """
    logger.info("Building coverage model for %s, paired=%s" % (f, paired))
    model = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    i = None
    uniqs = set()
    # Text mode so that the str-based splitting below also works on
    # Python 3; the context manager closes the handle.
    with gzip.open(f, "rt") as fin:
        for i, line in enumerate(fin):
            if i % 10000 == 0:
                cFlush("%s read from %s" % (i, f))
            line = line.split("\n")[0].split("\t")
            if paired and line[0] != line[3]:
                continue
            if paired:
                s = min(int(line[1]), int(line[4]))
                e = max(int(line[2]), int(line[5]))
            else:
                s = int(line[1])
                e = int(line[2])
            r = (line[0], s, e)
            if r not in uniqs:
                iv = HTSeq.GenomicInterval(line[0], s, e)
                model[iv] += str(i)
                uniqs.add(r)
    # `i` keeps its initial None only if the loop body never ran.
    if i is None:
        logger.error("ERROR! No read in %s." % f)
        return 0, None
    logger.info("%s read from %s, unique %s" % (i, f, len(uniqs)))
    return len(uniqs), model
def get_annotations(self, chromosome, start, end):
    """Return the union of all annotation sets overlapping a region.

    Args:
        chromosome: Chromosome name.
        start: Start coordinate passed to ``HTSeq.GenomicInterval``.
        end: End coordinate passed to ``HTSeq.GenomicInterval``.

    Returns:
        A new ``set`` holding every element stored in ``self.gas`` anywhere
        inside the unstranded interval; empty if nothing is annotated.
    """
    region = HTSeq.GenomicInterval(chromosome, start, end, '.')
    entries = set()
    # Walk the runs of constant value instead of every single base; the
    # union is the same but needs far fewer iterations on long regions.
    for _, annotation in self.gas[region].steps():
        entries.update(annotation)
    return entries
def __init__(self, bam_filename, chrom, start, stop, strand, log_base,
             color, bad_cigar=INSERTION_DELETIONS,
             coverage_cigar=COVERAGE_CIGAR, junction_cigar=JUNCTION_CIGAR,
             warn_skipped=True):
    """Store the region settings, open the BAM file and count reads.

    All arguments are kept as attributes of the same name.  In addition the
    constructor sets ``length`` (``stop - start + 1``), ``coordinates``
    (the ``(chrom, start, stop, strand)`` tuple), ``interval`` (the
    matching ``HTSeq.GenomicInterval``), ``bam`` (an ``HTSeq.BAM_Reader``
    on ``bam_filename``), and finally ``coverage`` and ``junctions`` from
    ``count_coverage()`` and ``count_junctions()``.
    """
    # Region definition.
    self.chrom = chrom
    self.start = start
    self.stop = stop
    self.strand = strand
    self.coordinates = (chrom, start, stop, strand)
    self.length = stop - start + 1

    # Display settings.
    self.log_base = log_base
    self.color = color

    # CIGAR handling settings.
    self.bad_cigar = bad_cigar
    self.coverage_cigar = coverage_cigar
    self.junction_cigar = junction_cigar
    self.warn_skipped = warn_skipped

    # Data sources; the counting methods run last so that every attribute
    # above is already available to them.
    self.bam_filename = bam_filename
    self.interval = HTSeq.GenomicInterval(chrom, start, stop, strand)
    self.bam = HTSeq.BAM_Reader(bam_filename)
    self.coverage = self.count_coverage()
    self.junctions = self.count_junctions()
def extract_splice_sites(file, bin):
    """Mark windows around exon boundaries in a genomic array.

    For every ``exon`` feature of the GFF/GTF file, the positions within
    ``bin`` bases of the exon's start and of its end are set to 1 in an
    unstranded integer ``HTSeq.GenomicArray``.

    Args:
        file: Path of the GFF/GTF annotation.
        bin: Number of bases added on each side of a boundary.

    Returns:
        The filled ``HTSeq.GenomicArray``.
    """
    gtf_file = HTSeq.GFF_Reader(file)
    cvg = HTSeq.GenomicArray("auto", stranded=False, typecode='i')
    for feature in gtf_file:
        if feature.type != 'exon':
            continue
        chrom = feature.iv.chrom
        windows = [
            HTSeq.GenomicInterval(chrom, boundary - bin, boundary + bin, '.')
            for boundary in (feature.iv.start, feature.iv.end)
        ]
        for window in windows:
            # Each window is assigned on its own: a window the array
            # rejects must not prevent the exon's other boundary from
            # being marked.
            try:
                cvg[window] = 1
            except IndexError:
                continue
    return cvg
def bg2GModel(bg):
    """Load a bedGraph file (plain or gzip) into an ``HTSeq.GenomicArray``.

    Each line with at least four tab-separated columns
    (``chrom start end value``) assigns ``float(value)`` to the interval
    ``[start, end)``.  Shorter lines are skipped.

    Args:
        bg: Path of the bedGraph file; a ``.gz`` suffix selects gzip.

    Returns:
        The unstranded ``HTSeq.GenomicArray`` holding the signal.
    """
    if bg.endswith(".gz"):
        f = gzip.open(bg, "rb")
    else:
        f = open(bg)
    # Single-argument print calls produce the same text on Python 2 and 3.
    print("%s Start building model for %s" % (datetime.now(), bg))
    model = HTSeq.GenomicArray("auto", stranded=False)
    try:
        for i, line in enumerate(f):
            if i % 10000 == 0:
                report = "%s lines genome signal read." % i
                commandFlush(report)
            # The gzip handle yields bytes on Python 3; normalise to str.
            if not isinstance(line, str):
                line = line.decode()
            line = line.split("\n")[0].split("\t")
            # The value is read from column 4, so four columns are needed.
            if len(line) < 4:
                continue
            chrom = line[0]
            s = int(line[1])
            e = int(line[2])
            iv = HTSeq.GenomicInterval(chrom, s, e)
            model[iv] = float(line[3])
    finally:
        f.close()
    print("")
    print("%s Model built for %s" % (datetime.now(), bg))
    return model
def cluster_genes(genes, chrom_list):
    """Merge runs of overlapping genes into single clustered regions.

    ``genes.steps()`` is walked in order.  Consecutive steps with a
    non-empty gene set are merged into one interval, and the union of their
    gene sets is stored on that interval in a new array.

    Args:
        genes: Array whose ``steps()`` yields ``(interval, set)`` pairs.
        chrom_list: Chromosome specification handed to
            ``HTSeq.GenomicArrayOfSets`` for the result.

    Returns:
        A new unstranded ``HTSeq.GenomicArrayOfSets`` with one entry per
        cluster.
    """
    genes2 = HTSeq.GenomicArrayOfSets(chrom_list, stranded=False)
    cluster_iv = None   # interval of the cluster being built, if any
    members = set()     # genes collected for that cluster

    for iv, gene in genes.steps():
        # A cluster ends at an empty step, or when the walk moves on to
        # another chromosome or strand without an empty step in between.
        if cluster_iv is not None and (
                len(gene) == 0
                or iv.chrom != cluster_iv.chrom
                or iv.strand != cluster_iv.strand):
            genes2[cluster_iv] = members
            # Forget the written cluster so that a later empty step cannot
            # overwrite it with an empty set.
            cluster_iv = None
            members = set()
        if len(gene) == 0:
            continue
        if cluster_iv is None:
            # Work on a copy: the step's own interval is never mutated.
            cluster_iv = iv.copy()
        else:
            cluster_iv.extend_to_include(iv)
        members = members | gene

    # Write a cluster that was still open when the steps ran out.
    if cluster_iv is not None:
        genes2[cluster_iv] = members
    return genes2
def bedToolsInterval2GenomicInterval(bedtool):
    """Convert the intervals of a BedTool into ``HTSeq.GenomicInterval``s.

    Strand values ``"+"``, ``0`` and ``"0"`` map to ``"+"``; ``"-"``, ``1``
    and ``"1"`` map to ``"-"``; anything else gives an interval built
    without a strand argument.

    Args:
        bedtool: Iterable of intervals exposing ``name``, ``chrom``,
            ``start``, ``end`` and ``strand``.

    Returns:
        An ``OrderedDict`` mapping each interval's ``name`` to its
        ``HTSeq.GenomicInterval``, in iteration order.  A repeated name
        keeps the last interval.
    """
    intervals = OrderedDict()
    for iv in bedtool:
        if iv.strand in ("+", 0, "0"):
            intervals[iv.name] = HTSeq.GenomicInterval(
                iv.chrom, iv.start, iv.end, "+")
        # The numeric code of the minus strand is 1; the former test
        # repeated `== 0`, which the branch above had already consumed.
        elif iv.strand in ("-", 1, "1"):
            intervals[iv.name] = HTSeq.GenomicInterval(
                iv.chrom, iv.start, iv.end, "-")
        else:
            intervals[iv.name] = HTSeq.GenomicInterval(
                iv.chrom, iv.start, iv.end)
    return intervals
def get_window_counts_and_normalize(window_dict, tags_dict, genome_data,
                                    scaling_factor, total_reads, window_size):
    """Count reads per window and build a normalized signal array.

    Args:
        window_dict: Maps each chromosome to an iterable of window starts.
        tags_dict: Read positions, handed to ``get_read_count_in_window``.
        genome_data: Chromosomes of the genome; iterated here and also used
            to create the ``HTSeq.GenomicArray``.
        scaling_factor: Numerator factor of the normalization.
        total_reads: Denominator of the normalization.
        window_size: Width of every window.

    Returns:
        ``(window_counts_dict, normalized_window_array)``.  The dict maps a
        chromosome to a list of ``[window_start, read_count, 0]`` entries;
        the array holds ``read_count * scaling_factor / total_reads`` on
        ``[window_start, window_start + window_size)``.
    """
    # Array of doubles used later to produce the bedgraph track.
    normalized_window_array = HTSeq.GenomicArray(
        genome_data, stranded=False, typecode='d')

    # Every chromosome gets an (initially empty) list of window records.
    window_counts_dict = {chrom: [] for chrom in genome_data}

    for chrom in genome_data:
        records = window_counts_dict[chrom]
        for window_start in window_dict[chrom]:
            read_count = get_read_count_in_window(
                chrom, window_start, window_size, tags_dict)
            records.append([window_start, read_count, 0])

            window = HTSeq.GenomicInterval(
                chrom, window_start, window_start + window_size)
            normalized_window_array[window] = (
                float(read_count) * float(scaling_factor) / float(total_reads))

    return window_counts_dict, normalized_window_array
def reads_profile(regions, bam_file, size):
    """Add strand-aware 5'-end read counts to every region.

    For each region the reads overlapping
    ``[ext_start - 1, ext_end + 1)`` are fetched from the BAM file.  The
    directional start of each read is expressed relative to the region's
    ``center`` (mirrored for regions on the minus strand) and shifted by
    ``size // 2``.  Positions outside ``[0, size)`` are ignored.  For a
    ``'+'`` region, reads on ``'+'`` go to ``up_counts`` and reads on
    ``'-'`` to ``down_counts``; for a ``'-'`` region the two are swapped.

    Args:
        regions: List of dicts with the keys ``chr``, ``center``,
            ``ext_start``, ``ext_end`` and ``strand``.
        bam_file: Path of the BAM file (opened with ``HTSeq.BAM_Reader``).
        size: Length of the two count vectors.

    Returns:
        ``regions``, each dict extended in place with ``up_counts`` and
        ``down_counts`` (all zeros when the chromosome is not in the BAM
        header).
    """
    print("INFO: Begin to parse reads from BAM file for n={0} regions.".format(
        len(regions)))
    bamHandle = HTSeq.BAM_Reader(bam_file)
    # Names of the reference sequences listed in the BAM header.
    chromosomes = set(sq['SN'] for sq in bamHandle.get_header_dict()['SQ'])
    # Floor division: the offset is used as a list index and must stay an
    # integer on Python 3 as well.
    half = size // 2

    for reg in regions:
        center = reg["center"]
        up_counts = size * [0]
        down_counts = size * [0]

        if reg["chr"] in chromosomes:
            # Extended by one base on each side so that reads on the
            # negative strand inside the interval are included.
            iv = HTSeq.GenomicInterval(
                reg["chr"], max(0, reg["ext_start"] - 1),
                reg["ext_end"] + 1, reg["strand"])

            if reg["strand"] == '+':
                sign, plus_counts, minus_counts = 1, up_counts, down_counts
            elif reg["strand"] == '-':
                sign, plus_counts, minus_counts = -1, down_counts, up_counts
            else:
                # Regions without a +/- strand never received counts.
                sign = None

            if sign is not None:
                for aln in bamHandle[iv]:
                    pos = sign * (aln.iv.start_d - center) + half
                    if 0 <= pos < size:
                        if aln.iv.strand == '+':
                            plus_counts[pos] += 1
                        if aln.iv.strand == '-':
                            minus_counts[pos] += 1

        reg["up_counts"] = up_counts
        reg["down_counts"] = down_counts

    print("INFO: Finished parsing of BAM file.")
    return regions
def getPeakProfiles(sites, bamfile, halfwinwidth=3000):
    """Build per-site read coverage profiles around genomic sites.

    Only sites whose ``chrom`` matches a BAM reference name are used
    (purely numeric BAM names are compared with a ``"chr"`` prefix).  For
    each site a window of ``2 * halfwinwidth`` bases centred on ``site`` is
    fetched and every alignment adds 1 over the part of the window it
    covers.  Profiles of sites on the ``"-"`` strand are mirrored; any
    other strand value is treated as forward.

    Args:
        sites: Record array with the fields ``chrom``, ``site`` and
            ``strand``.
        bamfile: Path of the BAM file.
        halfwinwidth: Half of the window width in bases.

    Returns:
        A ``numpy`` array with one row per site that received at least one
        read.
    """
    bam = ht.BAM_Reader(bamfile)
    # Restrict the sites to chromosomes that have an entry in the BAM file.
    # NOTE(review): the original author flagged a probable issue with X/Y
    # naming here; only purely numeric names receive the "chr" prefix.
    bamChroms = [x["SN"] for x in bam.get_header_dict()["SQ"]]
    bamChroms = ["chr" + c if c.isdigit() else c for c in bamChroms]
    sites = sites[np.in1d(sites["chrom"], bamChroms)]

    width = 2 * halfwinwidth
    peakProfs = []
    for i, pos in enumerate(sites):
        # Progress report once per 1000 sites (it used to be printed for
        # every site and a second time on every 1000th).
        if i % 1000 == 0:
            print("%d of %d" % (i, len(sites)))
        peakProfile = np.zeros(width)
        # The site's chromosome notation is deliberately left unchanged.
        window = ht.GenomicInterval(
            str(pos["chrom"]), pos["site"] - halfwinwidth,
            pos["site"] + halfwinwidth, str(pos["strand"]))
        # An empty fetch simply runs zero iterations, so no separate
        # "is there any read" query is needed.
        for almnt in bam[window]:
            if pos["strand"] == "-":
                a = pos["site"] + halfwinwidth - almnt.iv.end
                b = pos["site"] + halfwinwidth - almnt.iv.start
            else:
                a = almnt.iv.start - pos["site"] + halfwinwidth
                b = almnt.iv.end - pos["site"] + halfwinwidth
            # Clip to the window: a negative slice start would wrap around
            # and silently drop reads overhanging the window edge.
            a = max(a, 0)
            b = min(b, width)
            if a < b:
                peakProfile[a:b] += 1
        if np.sum(peakProfile) > 0:
            peakProfs.append(peakProfile)
    return np.array(peakProfs)
def bedTools_interval_to_genomic_interval(bedtool):
    """Convert the intervals of a BedTool into ``HTSeq.GenomicInterval``s.

    Strand values ``"+"``, ``0`` and ``"0"`` map to ``"+"``; ``"-"``, ``1``
    and ``"1"`` map to ``"-"``; anything else gives an interval built
    without a strand argument.

    Args:
        bedtool: Iterable of intervals exposing ``chrom``, ``start``,
            ``end`` and ``strand``.

    Returns:
        An ``OrderedDict`` mapping ``"chrom:start-end"`` to the matching
        ``HTSeq.GenomicInterval``, in iteration order.  Intervals with the
        same coordinates share one key and the last one wins.
    """
    intervals = OrderedDict()
    for iv in bedtool:
        name = "{}:{}-{}".format(iv.chrom, iv.start, iv.end)
        if iv.strand in ("+", 0, "0"):
            intervals[name] = HTSeq.GenomicInterval(
                iv.chrom, iv.start, iv.end, "+")
        # The numeric code of the minus strand is 1; the former test
        # repeated `== 0`, which the branch above had already consumed.
        elif iv.strand in ("-", 1, "1"):
            intervals[name] = HTSeq.GenomicInterval(
                iv.chrom, iv.start, iv.end, "-")
        else:
            intervals[name] = HTSeq.GenomicInterval(
                iv.chrom, iv.start, iv.end)
    return intervals
def HTSeq(self, bamlist):
    """Export normalized per-base read-start counts over a fixed window.

    For every BAM file the 1-bp start of each aligned read is counted in a
    stranded array.  The per-base counts over the hard-coded window are
    divided by ``p.H3K27ac_bam[sample]`` and collected as one column per
    sample.  Five fixed sample columns are then written, tab-separated and
    without index, to a hard-coded output path.

    Args:
        bamlist: Paths of the BAM files.  The sample name is the part of
            the file's base name before the first ``"_"``.

    Returns:
        None.
    """
    # Elf3 window (an Axin2 window, chr11:108914532-108954079 "+", was the
    # earlier choice).
    window = HTSeq.GenomicInterval("chr1", 135253574, 135258472, "-")
    a = []
    samplelist = []
    for bamfile in bamlist:
        sample = os.path.basename(bamfile).split("_")[0]
        samplelist.append(sample)
        # A fresh array per sample: with one shared array every later
        # sample would also contain the reads of all earlier ones.
        coverage = HTSeq.GenomicArray("auto", stranded=True, typecode="i")
        for almnt in HTSeq.BAM_Reader(bamfile):
            if almnt.aligned:
                # Reduce the alignment to a single base before counting.
                almnt.iv.length = 1
                coverage[almnt.iv] += 1
        normalization = (np.fromiter(coverage[window], dtype=float)
                         / p.H3K27ac_bam[sample])
        a.append(normalization)
    b = np.array(a)
    df = pd.DataFrame(b.T)
    df.columns = samplelist
    data = df[[
        "ctrl-H3K27ac", "2weeks-H3K27ac", "4weeks-H3K27ac",
        "7weeks-H3K27ac", "10weeks-H3K27ac"
    ]]
    data.to_csv(
        "/data3/zhaochen/project/colon_cancer/colon_chip/peakUCSCplot/H3K27ac_Elf3.txt",
        sep="\t", index=False)
def loops2degreesSharp(fin, fout):
    """Write, for each genomic segment, the loop anchors covering it.

    The input is tab-separated with one header line (skipped).  Columns
    1-3 and 4-6 describe the two anchors of a loop and column 7 its id.
    Each anchor is registered as ``"<id>-left"`` or ``"<id>-right"``.
    Every segment covered by at least one anchor is written as
    ``chrom, start, end, number_of_anchors, comma-joined anchor names``.

    Args:
        fin: Path of the loop file.
        fout: Path of the tab-separated output file.
    """
    model = HTSeq.GenomicArrayOfSets("auto", stranded=0)
    # The context manager closes the input handle.
    with open(fin) as fi:
        for i, line in enumerate(fi):
            if i == 0:
                continue
            line = line.split("\n")[0].split("\t")
            iva = HTSeq.GenomicInterval(line[0], int(line[1]), int(line[2]))
            ivb = HTSeq.GenomicInterval(line[3], int(line[4]), int(line[5]))
            model[iva] += line[6] + "-left"
            model[ivb] += line[6] + "-right"
    with open(fout, "w") as fo:
        for iv, value in model.steps():
            if value == set([]):
                continue
            # Sorted so that repeated runs produce identical files; set
            # iteration order is not stable across interpreter runs.
            line = [iv.chrom, iv.start, iv.end, len(value),
                    ",".join(sorted(value))]
            fo.write("\t".join(map(str, line)) + "\n")
def __init__(self, sample_name, genome, cvg, mis, feature_type, start, end,
             strand, locus, name, extend_utr):
    '''
    Constructs Feature object.

    ``start`` is stored as ``int(start) - 1`` and ``end`` as ``int(end)``
    (presumably converting 1-based inclusive input to 0-based half-open
    coordinates - confirm against the caller).  A ``five_prime_UTR``
    feature is extended by ``extend_utr`` bases at its downstream end:
    ``end`` grows on the ``+`` strand, ``start`` shrinks otherwise.

    ``coord``, ``seq``, ``cvg`` and ``mis`` are stored in transcript
    orientation, i.e. reversed (and ``seq`` reverse-complemented) when
    ``strand`` is ``'-'``.  ``dms`` starts as a list of ``None`` with one
    entry per coverage position.
    '''
    self.sample_name = sample_name
    self.genome_id = genome.id
    self.type = feature_type
    self.start = int(start) - 1
    self.end = int(end)
    self.strand = strand
    if self.type == 'five_prime_UTR':
        # Add nucleotides to the 5'UTR to include the first codons.
        if self.strand == '+':
            self.end += extend_utr
        else:
            self.start -= extend_utr
    self.locus = locus
    self.name = name
    self.iv = HTSeq.GenomicInterval(self.genome_id, self.start, self.end,
                                    self.strand)
    # A real list: a Python 3 range object has no reverse() method.
    self.coord = list(range(self.start, self.end))
    self.seq = genome.seq[self.start:self.end]
    self.cvg = list(cvg[self.iv])
    self.mis = list(mis[self.iv])
    self.dms = [None] * len(self.cvg)
    if strand == '-':
        self.coord.reverse()
        self.seq = self.seq.reverse_complement()
        self.cvg.reverse()
        self.mis.reverse()
def find_sgRNA_in_polyc_regoin(fasta, db):
    '''
    Search polyC region that can be targeted by spCas9 (PAM is NGG)

    Every sequence of the FASTA file is scanned for non-overlapping matches
    of six ``C`` followed by 14 bases, one more base and ``GG`` (23 bases
    in total).  A match is reported only when ``filter_homopolymer``
    accepts its seed (the 14 bases after the C-run).

    Args:
        fasta: Path of the FASTA file (read with ``pysam.FastxFile``).
        db: Exon database handed to ``find_exon``.

    Yields:
        ``PolycGuideRnaResult(chr, start, end, guide, PAM, score, is_exon)``
        named tuples.  ``score`` is ``calc_doench_score`` of the guide plus
        up to 4 bases before and 3 bases after it.
    '''
    p = re.compile(r'C{6}[ATGC]{14}[ATGC][G]{2}')
    result = collections.namedtuple(
        'PolycGuideRnaResult',
        ['chr', 'start', 'end', 'guide', 'PAM', 'score', 'is_exon'])
    with pysam.FastxFile(fasta) as fh:
        for entry in fh:
            sequence = entry.sequence
            for m in p.finditer(sequence):
                start = m.start()
                end = m.end()
                # Clamp at the sequence start: a negative slice start
                # would wrap around to the end of the sequence.  The
                # context is then truncated, as it already is at the
                # sequence end.
                score_seq = sequence[max(start - 4, 0):end + 3]
                score = calc_doench_score(score_seq)
                seed_seq = sequence[start + 6:end - 3]
                sgRNA = sequence[start:end]
                pam = sgRNA[-3:]
                if filter_homopolymer(seed_seq):
                    query_iv = HTSeq.GenomicInterval(entry.name, start, end,
                                                     '+')
                    is_exon_overlapped = find_exon(query_iv, db)
                    yield result(entry.name, start, end, sgRNA, pam, score,
                                 is_exon_overlapped)
def create_peak_gtf(path, exp_design_name, technique, bed_name):
    """Convert a tab-separated peak table into a GTF file.

    The peak table needs the columns ``chromo_peak``, ``begin_peak``,
    ``end_peak`` and ``WindowId``.  Each row becomes one unstranded
    ``exon`` feature named after ``WindowId``, written with an extra
    ``gene_id "<WindowId>"`` attribute.

    Files are looked up in ``<path>/PeakDetection/Peaks`` when
    ``technique`` is ``''`` or ``'All'`` and in
    ``<path>/PeakDetection/<technique>/`` otherwise; in the second case the
    technique is also part of the file names.

    :param path: project root directory
    :param exp_design_name: prefix of the peak and GTF file names
    :param technique: technique name, or ``''`` / ``'All'``
    :param bed_name: middle part of the peak and GTF file names
    :return: None
    """
    if technique == '' or technique == 'All':
        PATH_PEAKS = path + '/PeakDetection/Peaks'
        stem = PATH_PEAKS + '/' + exp_design_name + '_' + bed_name
    else:
        PATH_PEAKS = path + '/PeakDetection/' + technique + '/'
        stem = (PATH_PEAKS + '/' + exp_design_name + '_' + technique + '_'
                + bed_name)
    peak_filename = stem + '_Peaks.txt'
    gtf_filename = stem + '.gtf'

    # Plain 'r': the 'U' flag is no longer accepted by open() on recent
    # Python 3 versions, and text mode already uses universal newlines.
    with open(gtf_filename, 'w') as gtf_file, \
            open(peak_filename, 'r') as peak_file:
        csv_peaks = csv.DictReader(peak_file, delimiter='\t')
        for row in csv_peaks:
            peak = HTSeq.GenomicInterval(row['chromo_peak'],
                                         int(row['begin_peak']),
                                         int(row['end_peak']), ".")
            peak_id = row['WindowId']
            feature = HTSeq.GenomicFeature(peak_id, 'exon', peak)
            gtf_file.write(feature.get_gff_line().strip()
                           + '; gene_id \"' + peak_id + '\"' + '\n')
def intersectcirc(self, circ_file, modified_gtf_file, strand=True):
    """Map circle start/end intervals to the exon ids they intersect.

    Runs a left-outer-join intersect (``wa``, ``wb``, ``loj``,
    ``nonamecheck``; plus ``s`` when ``strand`` is true) of ``circ_file``
    against ``modified_gtf_file``, the equivalent of
    ``intersectBed -a start.bed -b <gtf> -wa -wb -loj``.

    Args:
        circ_file: Interval file with the circle starts/ends (the output
            of ``print_start_end_file``).
        modified_gtf_file: GTF whose attributes carry ``custom_exon_id``.
        strand: Require same-strand overlaps when true.

    Returns:
        A dict mapping ``HTSeq.GenomicInterval`` (columns 1-3 and 6 of the
        result line) to the set of ``custom_exon_id`` values found in
        column 15.  Lines whose column 15 is ``"."`` are skipped.
    """
    circ = pybedtools.BedTool(circ_file)
    gtf = pybedtools.BedTool(modified_gtf_file)

    options = dict(wa=True, wb=True, loj=True)
    if strand:
        options['s'] = True
    options['nonamecheck'] = True
    intersectfile = circ.intersect(gtf, **options)

    # Key: circle start or end interval; value: set of custom exon ids.
    circExons = {}
    for record in intersectfile:
        fields = str(record).split('\t')
        attributes = fields[14]
        if attributes.strip('\n') == '.':
            continue
        anchor = HTSeq.GenomicInterval(
            fields[0], int(fields[1]), int(fields[2]), fields[5])
        exon_ids = circExons.setdefault(anchor, set())
        exon_ids.add(
            HTSeq.parse_GFF_attribute_string(attributes)['custom_exon_id'])
    return circExons
def __iter__(self):
    """Yield every element stored in ``self.gas``, chromosome by chromosome.

    For each chromosome the sets of all steps between position 0 and the
    end of the chromosome's ``'.'`` vector are united, and each element of
    that union is yielded once.  An element present on several chromosomes
    is yielded once per chromosome.
    """
    for chromosome_name, chromosome_obj in self.gas.chrom_vectors.items():
        whole = HTSeq.GenomicInterval(
            chromosome_name, 0, chromosome_obj['.'].iv.end)
        # set().union(*...) needs no `reduce` (not a builtin on Python 3)
        # and also copes with an empty list of step sets.
        step_sets = [step[1] for step in self.gas[whole].steps()]
        for gene in list(set().union(*step_sets)):
            yield gene
def quantify(readF, peakF, fnOut):
    """Quantify reads in peaks and write count/RPKM/TPM per peak.

    The coverage model and the total ``t`` come from
    ``buildCovModel(readF)``.  For every peak (tab-separated
    ``chrom start end ...``) the distinct values found on the model's steps
    inside the peak are counted.  The table, indexed by
    ``"chrom|start|end"``, holds ``count``, ``RPKM``
    (``count / length / t * 10**9``), ``TPM`` (``count / length * 10**3``,
    then rescaled so that the column sums to ``10**6``) and ``length``.

    Args:
        readF: Reads input handed to ``buildCovModel``.
        peakF: Path of the peak file.
        fnOut: Path of the tab-separated output table.
    """
    print("builidng coverage model for counting")
    covModel, t = buildCovModel(readF)
    ds = {}
    print("counting reads in peaks")
    # Read the lines up front (tqdm then knows the total) and close the
    # handle right away.
    with open(peakF) as fin:
        lines = list(fin)
    for line in tqdm(lines):
        line = line.split("\n")[0].split("\t")
        # Skip blank or truncated lines instead of failing on them.
        if len(line) < 3:
            continue
        iv = HTSeq.GenomicInterval(line[0], int(line[1]), int(line[2]))
        c = set()
        for ivb, vb in covModel[iv].steps():
            # NOTE(review): unhashable step values (e.g. sets) are skipped
            # here, exactly as the former bare `except` did - confirm this
            # matches what buildCovModel stores.
            try:
                c.add(vb)
            except TypeError:
                continue
        n = len(c)
        ds["|".join(line[:3])] = {
            "count": n,
            "RPKM": n / 1.0 / iv.length / t * 10**9,
            "TPM": n / 1.0 / iv.length * 10**3,
            "length": iv.length
        }
    ds = pd.DataFrame(ds).T
    ds["TPM"] = ds["TPM"] / ds["TPM"].sum() * 10**6
    ds.to_csv(fnOut, sep="\t", index_label="peakId")
def binData(tmodel, cmodel, tr, cr, chroms, binsize):
    """
    Get bins for the genome and get the reads count.

    Each chromosome range ``[s, e)`` taken from ``chroms`` is cut into
    consecutive bins of ``binsize`` bases; treatment and control counts
    come from ``getCount(model, interval, read_length)``.  Bins in which
    both counts add up to 0 are dropped.

    tr is read length for treatment
    cr is read length for control
    chroms maps a chromosome name to a dict with the keys "s" and "e".

    Returns two numpy arrays (treatment counts, control counts) of equal
    length.
    """
    ts = []
    cs = []
    for chrom, span in chroms.items():
        s = span["s"]
        e = span["e"]
        # Floor division keeps the bin count an integer on Python 3.
        bins = (e - s) // binsize
        # NOTE(review): the loop stops at bins - 1, so the last full bin
        # is never counted - confirm this is intended.
        for i in range(1, bins):
            iv = HTSeq.GenomicInterval(chrom, s + binsize * (i - 1),
                                       s + binsize * i)
            countT = getCount(tmodel, iv, tr)
            countC = getCount(cmodel, iv, cr)
            if countT + countC == 0:
                continue
            ts.append(countT)
            cs.append(countC)
    ts, cs = np.array(ts), np.array(cs)
    return ts, cs
def countFeatures(f, featuref, paired=True):
    """
    Count reads enrichment at the TSS regions.

    The coverage model of ``f`` comes from ``getCov``.  For every region of
    ``featuref`` (tab-separated ``chrom start end ...``; shorter lines are
    skipped) the count returned by ``getCount(t, model, iv)`` is summed.

    Args:
        f: Reads file handed to ``getCov``.
        featuref: Path of the region file.
        paired: Forwarded to ``getCov``.

    Returns:
        ``None`` when ``getCov`` reports 0 reads, otherwise
        ``(sample_name, total, in_regions, in_regions / total)`` where
        ``sample_name`` is the base name of ``f`` up to ``".bedpe"``.
    """
    t, model = getCov(f, paired=paired)
    if t == 0:
        return None
    logger.info("%s reads from %s" % (t, f))
    logger.info("Caculating enriched reads at target regions of %s" % featuref)
    # Read everything up front (tqdm then knows the total) and close the
    # handle right away.
    with open(featuref) as fin:
        lines = fin.read().split("\n")
    r = 0
    for line in tqdm(lines):
        line = line.split("\t")
        if len(line) < 3:
            continue
        s = int(line[1])
        e = int(line[2])
        iv = HTSeq.GenomicInterval(line[0], s, e)
        c, rpkm = getCount(t, model, iv)
        r += c
    n = f.split("/")[-1].split(".bedpe")[0]
    logger.info("FLAG!\t %s:: total:%s,inPeaks:%s,inPeaksRatio:%s" %
                (n, t, r, r / 1.0 / t))
    return n, t, r, r / 1.0 / t
def load_bedfile_to_ga(bed_file):
    """Load the scores of a BED file into a stranded genomic array.

    Each line contributes ``float(column 5)`` on the 1-bp interval
    ``[int(column 2) - 1, int(column 2))`` of ``column 1``, on the strand
    given in column 6.  Rows that cannot be parsed or stored are printed
    and skipped.

    Args:
        bed_file: Path of the tab-separated BED file (the field layout
            follows the BED files from DBTSS).

    Returns:
        The filled ``HTSeq.GenomicArrayOfSets``.
    """
    ga = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    with open(bed_file, "r") as fh:
        for line in fh:
            row = line.strip().split("\t")
            # NOTE(review): the original comment says BED is 0-based and
            # needs no adjusting, yet the interval below is shifted by one
            # base - confirm which is intended.
            try:
                chrom = row[0]
                pos = int(row[1])
                strand = row[5]
                score = float(row[4])
            # Non-numeric fields are reported and skipped like short rows,
            # instead of aborting the whole load.
            except (IndexError, ValueError) as e:
                print(e)
                print(row)
                continue
            try:
                ga[HTSeq.GenomicInterval(chrom, pos - 1, pos, strand)] = score
            except ValueError:
                print("Error loading GA:")
                print(row)
                continue
    return ga
def add_raw_reads_to_utr(self, ga, chr_len):
    """Fill ``self.utr_arr`` with per-base scores between CDS and transcript end.

    When ``txpt_right - cds_right`` is below 2 the method returns early,
    leaving ``self.utr_arr`` as ``[0]`` (or ``[]`` if that difference is
    negative).  Otherwise, for the ``'-'`` strand the right-hand
    coordinates are recomputed as ``chr_len[chrom] - <left coordinate>``.
    The steps of ``ga`` over ``[cds_right, txpt_right)`` are then copied
    into a new stranded ``self.utr_ga`` and expanded base by base into
    ``self.utr_arr``.

    Args:
        ga: Array indexable by ``HTSeq.GenomicInterval`` with ``steps()``.
        chr_len: Maps chromosome names to lengths (used for ``'-'`` only).
    """
    self.utr_arr = []

    span = self.txpt_right - self.cds_right
    if span < 2:
        # One zero as a placeholder, unless the span is negative.
        if span >= 0:
            self.utr_arr = [0]
        return

    if self.strand == '-':
        chrom_length = chr_len[self.chrom]
        utr_end = chrom_length - self.txpt_left
        utr_begin = chrom_length - self.cds_left
    else:
        utr_end = self.txpt_right
        utr_begin = self.cds_right

    iv = HTSeq.GenomicInterval(self.chrom, utr_begin, utr_end, self.strand)
    self.utr_ga = HTSeq.GenomicArray(chroms='auto', stranded=True)

    # The span has to be re-checked after the strand conversion.
    if utr_end - utr_begin < 2:
        self.utr_arr = [0]
        return

    for step_iv, score in ga[iv].steps():
        self.utr_ga[step_iv] = score
        # One entry per base of the step.
        self.utr_arr.extend([score] * (step_iv.end - step_iv.start))
def get_gene_features(gtfFile, id_type, feature_type):
    '''
    get exon features and gene interval features

    Every GFF line of type ``feature_type`` is added to ``features`` under
    the id found in its ``id_type`` attribute.  For each id the smallest
    start and the largest end seen are tracked, and that overall range is
    added to ``geneFeatures``.  Progress is reported on stderr every
    100000 lines.

    Args:
        gtfFile: Path of the GFF/GTF file.
        id_type: Name of the attribute that holds the feature id.
        feature_type: Feature type to keep (third GFF column).

    Returns:
        ``(features, geneFeatures)``, two stranded
        ``HTSeq.GenomicArrayOfSets``.
    '''
    features = HTSeq.GenomicArrayOfSets("auto", stranded="yes")
    geneFeatures = HTSeq.GenomicArrayOfSets("auto", stranded="yes")
    geneRange = {}
    gtf = HTSeq.GFF_Reader(gtfFile)
    for i, line in enumerate(gtf, start=1):
        if line.type == feature_type:
            feature_id = line.attr[id_type]
            features[line.iv] += feature_id
            if feature_id not in geneRange:
                # Seed the range with the first feature itself.  Using 0
                # as an "unset" marker breaks for features that really
                # start at position 0.
                geneRange[feature_id] = [line.iv.chrom, line.iv.start,
                                         line.iv.end, line.iv.strand]
            else:
                known = geneRange[feature_id]
                known[1] = min(known[1], line.iv.start)
                known[2] = max(known[2], line.iv.end)
        if i % 100000 == 0:
            print("%d GFF lines processed.\n" % i, file=sys.stderr)
    for g, v in geneRange.items():
        chrom, start, end, strand = v
        tmp_iv = HTSeq.GenomicInterval(chrom, start, end, strand)
        geneFeatures[tmp_iv] += g
    return features, geneFeatures
def _transform_pe_observed_data(self, aln_stat: AlignStat) -> np.ndarray:
    """Transform a paired-end alignment pair into observed data.

    The fragment spans from the smaller start to the larger end of the two
    mates.  For every isoform number found on the steps of ``self.region``
    inside the fragment, the step lengths are summed into an inferred
    insert size.  Each isoform then gets
    ``pdf(insert_size) / (isoform_len - insert_size + 1)``, using a normal
    density with the BAM insert-size mean and standard deviation; isoforms
    with insert size 0 or a non-positive denominator get 0.  The vector is
    tiled ``self.ploidy`` times and scaled by the compatibility factor.
    """
    cf = self._calc_compatible_factor(aln_stat)

    mate_ivs = (aln_stat.aln.iv, aln_stat.aln2.iv)
    frag_start = min(mate_ivs[0].start, mate_ivs[1].start)
    frag_end = max(mate_ivs[0].end, mate_ivs[1].end)
    fragment_iv = HTSeq.GenomicInterval(
        self.segment.iv.chrom, frag_start, frag_end, '.')

    # Inferred insert size per isoform.
    insert_sizes = np.zeros(self.isoforms_count, dtype=int)
    for step_iv, members in self.region[fragment_iv].steps():
        step_len = step_iv.end - step_iv.start
        for member in members:
            # Only integer members are isoform numbers.
            if isinstance(member, int):
                insert_sizes[member] += step_len

    iis = insert_sizes.astype(float)
    # Isoforms not covered at all are pushed to -inf.
    iis[iis == 0] = -np.inf

    # Number of positions the fragment could have come from.
    mappable = self.isoform_lens.astype(float) - iis + 1
    mappable[mappable <= 0] = np.inf

    density = scipy.stats.norm.pdf(
        iis, self.bam_param.insert_size_mean, self.bam_param.insert_size_std)
    per_isoform = 1 / mappable * density
    return cf * np.tile(per_isoform, self.ploidy)
def fetch(self, interval, strand=None):
    """
    Retrieve all reads within a given window

    Parameters
    ----------
    interval : list, tuple or str
        If interval is a list or tuple, it should contain chromosome (str),
        start (int), end (int). If it is a string, it should be of the
        format chrom:start-end

    strand : str, optional
        Either '+' or '-'. By default all reads are returned.

    Yields
    ------
    GenomicInterval
        One HTSeq GenomicInterval per intersecting read, built from the
        queried chromosome and the read's own start, end and strand.
    """
    feature = self._interval_bedtool(interval, strand=strand)
    chrom, _start, _end = self._get_interval(interval)

    # Same-strand matching is only requested for an explicit '+' or '-'.
    same_strand = strand in ["+", "-"]
    hits = self.track.intersect(feature, u=True, stream=True, s=same_strand)
    for read in hits:
        yield HTSeq.GenomicInterval(
            chrom, read.start, read.end, str(read.strand))