def buildCovModel(readF, dfilter=(80, 80, 180), mapq=1):
    """
    Build genome coverage profiles for MNase-seq PETs from a BEDPE file.

    Parameters
    ---
    readF: str, path to a gzip-compressed BEDPE file (.bedpe.gz)
    dfilter: sequence of three ints. A PET of length d is counted as a
        particle when d <= dfilter[0] and as a canonical nucleosome when
        dfilter[1] < d <= dfilter[2].
    mapq: int, PETs whose 8th column is below this value are removed.

    Returns
    ---
    Canonical nucleosome PETs number, int
    Particle PETs number, int
    Canonical nucleosome PETs coverage, HTSeq.GenomicArray
    Particle PETs coverage, HTSeq.GenomicArray
    """
    print("building models for %s" % readF)
    modelCn = HTSeq.GenomicArray("auto", stranded=False)
    modelSp = HTSeq.GenomicArray("auto", stranded=False)
    cn, sp = 0, 0
    reds = set()
    with gzip.open(readF, 'rt') as fh:
        for i, line in enumerate(fh):
            if i % 10000 == 0:
                report = "%s lines genome signal read." % i
                cFlush(report)
            line = line.split("\n")[0].split("\t")
            # Column index 7 is read below, so at least 8 columns are needed.
            if len(line) < 8:
                continue
            # Only PETs with both ends on the same chromosome are used.
            if line[0] != line[3]:
                continue
            if int(line[7]) < mapq:
                continue
            s = min(int(line[1]), int(line[4]))
            e = max(int(line[2]), int(line[5]))
            d = e - s
            # Keep only the first PET seen for each (chrom, start, end).
            r = (line[0], s, e)
            if r in reds:
                continue
            reds.add(r)
            iv = HTSeq.GenomicInterval(line[0], s, e)
            if d <= dfilter[0]:
                sp += 1
                modelSp[iv] += 1
            if d > dfilter[1] and d <= dfilter[2]:
                cn += 1
                modelCn[iv] += 1
    return cn, sp, modelCn, modelSp
def makeIslandFilteredGraphFile(chroms, chrom_lengths, window_size, bamfile,
                                islandbedfile, outfile):
    """Write windowed read counts restricted to island regions.

    Paired alignments from `bamfile` are kept when both mates are aligned
    with NH == 1, lie on the same chromosome (which must be in `chroms`)
    and on opposite strands. Each kept pair adds 1 / read_length to every
    interval returned by combine_pair_iv_seq, where read_length is the
    value reported by get_read_length. Coverage under the intervals of
    `islandbedfile` is then copied into a second array and summed per
    window of `window_size` bases.

    Output lines are "chrom<TAB>start<TAB>end<TAB>count"; the count is
    truncated to int and windows with a zero count are not written. A
    trailing partial window at the end of a chromosome is ignored.

    Parameters:
        chroms: chromosome names used to size the genomic arrays.
        chrom_lengths: mapping of chromosome name to length.
        window_size: window width in bases.
        bamfile: path to the paired-end BAM file.
        islandbedfile: path to the BED file of islands.
        outfile: path of the graph file to write.
    """
    ga = HTSeq.GenomicArray(chroms, stranded=False, typecode='d')
    bam_reader = HTSeq.BAM_Reader(bamfile)
    for alt_first, alt_second in HTSeq.pair_SAM_alignments(bam_reader):
        if alt_first is None or alt_second is None:
            continue
        if not (alt_first.aligned and alt_first.optional_field("NH") == 1
                and alt_second.aligned
                and alt_second.optional_field("NH") == 1):
            continue
        if (alt_first.iv.chrom != alt_second.iv.chrom
                or alt_first.iv.strand == alt_second.iv.strand
                or alt_first.iv.chrom not in chroms):
            continue
        alt_first_iv_seq = [
            co.ref_iv for co in alt_first.cigar
            if co.type == "M" and co.size > 0
        ]
        alt_second_iv_seq = [
            reverse_strand(co.ref_iv) for co in alt_second.cigar
            if co.type == "M" and co.size > 0
        ]
        alt_iv_seq = combine_pair_iv_seq(alt_first_iv_seq, alt_second_iv_seq)
        read_length = get_read_length(alt_iv_seq)
        for alt_iv in alt_iv_seq:
            ga[alt_iv] += 1.0 / read_length

    ga_island = HTSeq.GenomicArray(chroms, stranded=False, typecode='d')
    bedfile = HTSeq.BED_Reader(islandbedfile)
    for alt in bedfile:
        for iv, value in ga[alt.iv].steps():
            ga_island[iv] += value

    with open(outfile, 'w') as f:
        for chrom in chroms:
            chrom_length = chrom_lengths[chrom]
            # Floor division: range() needs an int, and "/" yields a float
            # on Python 3.
            num_windows = chrom_length // window_size
            for i in range(num_windows):
                count_in_window = 0
                window_start = i * window_size
                window_end = (i + 1) * window_size
                window_iv = HTSeq.GenomicInterval(chrom, window_start,
                                                  window_end)
                for iv, value in ga_island[window_iv].steps():
                    count_in_window += value * iv.length
                count_in_window = int(count_in_window)
                if count_in_window != 0:
                    outline = chrom + '\t' + str(window_start) + '\t' + str(
                        window_end) + '\t' + str(count_in_window) + '\n'
                    f.write(outline)
def run(self):
    """Compute and write the alpha cutoffs for every sample in io_tuples.

    For each (forward tsv, reverse tsv, cutoff path, dump path) tuple the
    sample is loaded with single_sample_load_fsrtsv, unknown_fdr_cut is
    run at self.target_fdr, and the two cutoffs are written one per line
    to the cutoff path. When self.debug_dump is set, the dump lines are
    written to the dump path. Samples whose cutoff file already exists
    are skipped unless self.overwrite is set.
    """
    logging.info('[{0}] Target cutoff at FDR={1}'.format(
        time.ctime(), self.target_fdr))
    genome = apriori_rts_analysis.FaidxGenome(self.genome_fasta)
    whitelist = HTSeq.GenomicArray(chroms='auto',
                                   stranded=True,
                                   storage='step',
                                   typecode='b')
    for whitelist_bed in self.whitelist_bed_list:
        apriori_rts_analysis.load_whitelist(whitelist, whitelist_bed)
    motif_info, _motif_array, threeprime_array = \
        apriori_rts_analysis.load_rg4_motif(self.motif_bed_list,
                                            _all_rts_array=None)
    class_names = [
        'canonical/G3L1-7', 'longloop', 'bulges', 'two-quartet'
    ]

    for fwd_tsv, rev_tsv, cutoff_path, dump_path in self.io_tuples:
        already_done = (os.path.exists(cutoff_path)
                        and os.path.isfile(cutoff_path))
        if not self.overwrite and already_done:
            continue

        u_treatment, treatment, u_decoy, decoy = single_sample_load_fsrtsv(
            fwd_tsv, rev_tsv, whitelist, genome)

        # Mark every interval of the first returned store on both strands.
        # The array is not read again in this method.
        rts_array = HTSeq.GenomicArray(chroms='auto',
                                       stranded=True,
                                       typecode='b')
        for strand_symbol in ('+', '-'):
            for record in u_treatment[strand_symbol]:
                rts_array[record[0]] = True

        result, result_dump = unknown_fdr_cut(
            treatment, motif_info, threeprime_array,
            frozenset(class_names), genome, self.target_fdr)
        assert result

        with open(cutoff_path, 'w') as cutoff_file:
            cutoff_file.write(str(result.first_alpha_cutoff) + '\n')
            cutoff_file.write(str(result.second_alpha_cutoff) + '\n')

        if self.debug_dump:
            with open(dump_path, 'w') as dump_file:
                for entry in result_dump:
                    dump_file.write(str(entry) + '\n')

        # Drop the per-sample data before the next sample is loaded.
        del u_treatment, treatment, u_decoy, decoy
        del rts_array
        del result, result_dump
def get_window_counts_and_normalize(window_dict, tags_dict, genome_data,
                                    scaling_factor, total_reads, window_size):
    """Count reads per window and store a normalized score for each one.

    Returns a tuple (window_counts_dict, normalized_window_array):
      * window_counts_dict maps each chromosome in genome_data to a list
        of [window_start, read_count, 0] entries, one per window start in
        window_dict[chrom];
      * normalized_window_array is an unstranded HTSeq.GenomicArray
        holding read_count * scaling_factor / total_reads over each
        window (used to generate the bedgraph file).
    """
    # One empty list per chromosome, created before any window is visited.
    counts_by_chrom = {chrom: [] for chrom in genome_data}
    score_array = HTSeq.GenomicArray(genome_data,
                                     stranded=False,
                                     typecode='d')

    for chrom in genome_data:
        chrom_counts = counts_by_chrom[chrom]
        for start in window_dict[chrom]:
            n_reads = get_read_count_in_window(chrom, start, window_size,
                                               tags_dict)
            chrom_counts.append([start, n_reads, 0])

            score = float(n_reads) * float(scaling_factor) / float(
                total_reads)
            interval = HTSeq.GenomicInterval(chrom, start,
                                             start + window_size)
            score_array[interval] = score

    return counts_by_chrom, score_array
def find_clusters(self):
    """Find runs of covered positions and keep those with a maximum above 1.

    Sets:
      * self.clusters_as_indices_in_exon_coverage_array: the index arrays
        returned by self.consecutive() for the nonzero positions of
        self.exon_coverage;
      * self.clusters_as_ga: stranded HTSeq.GenomicArray incremented by 1
        at self.to_iv(pos) for every position in every run;
      * self.clusters: the coverage values of each non-empty run whose
        maximum is greater than 1.

    Returns the number of clusters kept (0 when there is no coverage).
    """
    self.clusters = []
    self.clusters_as_ga = HTSeq.GenomicArray('auto', stranded=True)
    clusters = self.consecutive(np.nonzero(self.exon_coverage)[0])
    self.clusters_as_indices_in_exon_coverage_array = clusters
    if (len(self.exon_coverage) == 0) or (len(clusters) == 0):
        return 0

    for _index_array in clusters:
        for pos in _index_array:
            self.clusters_as_ga[self.to_iv(pos)] += 1

    # Kept as a plain list: runs usually differ in length, and building an
    # ndarray from ragged sequences raises ValueError on recent NumPy.
    runs = [self.exon_coverage[i] for i in clusters]
    # NOTE(review): empty runs are skipped here; presumably consecutive()
    # can return an empty index array - confirm against its implementation.
    self.clusters = [x for x in runs if len(x) > 0 and np.max(x) > 1]
    n_clusters = len(self.clusters)
    if n_clusters == 0:
        print("...Failure")
    return n_clusters
def createMTrack(dirName):
    '''Merge all mapped tracks in a directory into one wig file per strand.

    Every file found by cg.recurseDir(dirName, end='.out') is read with
    HTSeq.BowtieReader and its aligned reads are added to a single stranded
    coverage array over cg.humanChromosomes. The coverage is written to
    <dirName>/Merge.hg19.1.wig (+ strand) and <dirName>/Merge.hg19.-1.wig
    (- strand); both files are then passed to updateWigLength and
    cgSort.wigSort.
    '''
    fileList = cg.recurseDir(dirName, end='.out')

    chroms = cg.humanChromosomes
    print('Making Bed File vectors')
    cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
    for fName in fileList:
        print(fName)
        alignment_file = HTSeq.BowtieReader(fName)
        for alngt in alignment_file:
            if alngt.aligned:
                try:
                    cvg.add_value(1, alngt.iv)  # iv is the genomic interval
                except KeyError:
                    # Best effort: presumably the read maps to a chromosome
                    # that is not in `chroms`; such reads are skipped.
                    pass

    bedNamePos = dirName + '/Merge.' + 'hg19' + '.1.wig'
    bedNameNeg = dirName + '/Merge.' + 'hg19' + '.-1.wig'

    print('Writing Bed File')
    cvg.write_bedgraph_file(bedNamePos, "+")
    cvg.write_bedgraph_file(bedNameNeg, "-")

    # Now extend it
    updateWigLength(bedNamePos, 'hg19')
    updateWigLength(bedNameNeg, 'hg19')

    # Now sort it.
    cgSort.wigSort(bedNamePos)
    cgSort.wigSort(bedNameNeg)
def call_peaks_from_bam(clip_bam_filename, config):
    """Call peaks on a CLIP BAM file, per chromosome and strand.

    Builds a stranded coverage array from the aligned reads of
    `clip_bam_filename`, then, for every chromosome listed in column '0'
    of the GTF dataframe and for each strand, runs the peak pipeline:
    find_peaks, find_borders, merge_overlapping_on_chrm_and_strand,
    assign_to_gene, add_local_signal, add_gene_signal, do_statistics and
    any_have_na.

    Parameters:
        clip_bam_filename: path to the CLIP BAM file.
        config: mapping providing 'gtf_filename_noheader',
            'rna_seq_filename' and 'neg_ip_filename'.

    Returns:
        dict mapping chromosome -> strand -> peak objects.
    """
    gtf_noheader_filename = config['gtf_filename_noheader']
    bamfiles = {
        'clip': clip_bam_filename,
        'rna_seq': config['rna_seq_filename'],
        'neg_ip': config['neg_ip_filename'],
    }
    clip_bamfile = HTSeq.BAM_Reader(clip_bam_filename)
    coverage = HTSeq.GenomicArray("auto", stranded=True, typecode='i')
    print("Reading alignments from bamfile...")
    for aln in clip_bamfile:  # Very slow.
        if aln.aligned:
            coverage[aln.iv] += 1
    print("Creating gtf file and dataframe...")
    gtf_df = create_gtf_with_names_file(gtf_noheader_filename)
    print("Calling peaks...")
    peaks_by_chrm = {}
    peak_objs_by_chrm = {}
    for chrm in dict(gtf_df['0'].value_counts()):
        peaks_by_chrm[chrm] = {}
        peak_objs_by_chrm[chrm] = {}
        for strand in ['+', '-']:
            peaks_by_chrm[chrm][strand] = find_peaks(coverage,
                                                     chrm=chrm,
                                                     strand=strand)
            peak_objs_by_chrm[chrm][strand] = find_borders(
                peaks_by_chrm[chrm][strand], coverage, chrm, strand)
            peak_objs_by_chrm[chrm][strand] = \
                merge_overlapping_on_chrm_and_strand(
                    peak_objs_by_chrm[chrm][strand], coverage)
            assign_to_gene(peak_objs_by_chrm, chrm, strand, gtf_df)
            add_local_signal(peak_objs_by_chrm[chrm][strand], bamfiles)
            add_gene_signal(peak_objs_by_chrm[chrm][strand], gtf_df,
                            bamfiles)
            do_statistics(peak_objs_by_chrm[chrm][strand], bamfiles)
            any_have_na(peak_objs_by_chrm[chrm][strand])
    return peak_objs_by_chrm
def createTrack(fName, organism):
    """Create one sorted wig file per strand from a Bowtie alignment file.

    Parameters:
        fName: path to the Bowtie output file; the wig files are written
            next to it as <fName>.1.wig (+ strand) and <fName>.-1.wig
            (- strand).
        organism: 'human', 'mouse' or 'zebrafish'; selects the chromosome
            list and the assembly name passed to updateWigLength.

    Raises:
        ValueError: if `organism` is not one of the supported names.
    """
    if organism == 'human':
        chroms = cg.humanChromosomes
        assembly = 'hg19'
    elif organism == 'mouse':
        chroms = cg.mouseChromosomes
        assembly = 'mm9'
    elif organism == 'zebrafish':
        chroms = cg.zebrafishChromosomes
        assembly = 'danRer6'
    else:
        # Without this branch `chroms` would be unbound further down.
        raise ValueError('unsupported organism: %r' % (organism,))

    alignment_file = HTSeq.BowtieReader(fName)
    cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
    for alngt in alignment_file:
        if alngt.aligned:
            cvg.add_value(1, alngt.iv)  # iv is the genomic interval

    bedNamePos = fName + '.1.' + 'wig'
    bedNameNeg = fName + '.-1.' + 'wig'

    cvg.write_bedgraph_file(bedNamePos, "+")
    cvg.write_bedgraph_file(bedNameNeg, "-")

    # Now extend it.
    updateWigLength(bedNamePos, assembly)
    updateWigLength(bedNameNeg, assembly)

    # Now sort it.
    cgSort.wigSort(bedNamePos)
    cgSort.wigSort(bedNameNeg)
def collapse_unannotated_exons(stranded, unannot_exon_dict):
    """Collapse the unannotated exon genomic intervals of each gene.

    Similar to what dexseq_prepare_annotation.py does: all overlapping
    exons of a gene are merged into a single interval that contains them.

    Parameters:
        stranded: passed to HTSeq.GenomicArray(stranded=...).
        unannot_exon_dict: mapping of gene name to a list of exon
            HTSeq.GenomicInterval objects.

    Returns:
        defaultdict(set) mapping gene name to the set of collapsed
        genomic intervals. Genes without any exon get no entry.
    """
    collapsed_unannot_dict = defaultdict(set)
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for gene_name, exon_list in unannot_exon_dict.items():
        # One array per gene: exon positions are set to 1, and the gaps
        # between them keep the array default.
        ga = HTSeq.GenomicArray('auto', stranded=stranded)
        for exon_iv in exon_list:
            ga[exon_iv] = 1
        # Each step with value 1 is one collapsed exon.
        for exon_iv, num in ga.steps():
            if num == 1:
                collapsed_unannot_dict[gene_name].add(exon_iv)
    return collapsed_unannot_dict
def write_motif_track(cls, genomic_fasta, motif, motif2=None):
    """Write per-strand bedgraph tracks of motif locations.

    For every (chrom, seq) in genomic_fasta, cls.add_motif_locations is
    called for `motif` on the '+' strand and for cls.rc(motif) on the '-'
    strand. When `motif2` is given it is added the same way with value=2.
    The tracks are written to beds/motif_bedgraphs/<motif>_+.wig and
    beds/motif_bedgraphs/<motif>_-.wig.
    """
    track = HTSeq.GenomicArray('auto', stranded=True)

    def _as_given(pattern):
        return pattern

    # '+' uses the motifs as given, '-' uses cls.rc() of each motif.
    strand_plan = (('+', _as_given), ('-', cls.rc))
    for chrom, seq in genomic_fasta.items():
        print(chrom, )
        for strand, orient in strand_plan:
            cls.add_motif_locations(chrom, seq, strand, orient(motif), track)
            if motif2 is not None:
                cls.add_motif_locations(chrom, seq, strand, orient(motif2),
                                        track, value=2)

    cls._mk('beds/')
    cls._mk('beds/motif_bedgraphs/')
    plus_path = 'beds/motif_bedgraphs/{}_+.wig'.format(motif)
    track.write_bedgraph_file(plus_path, '+')
    minus_path = 'beds/motif_bedgraphs/{}_-.wig'.format(motif)
    track.write_bedgraph_file(minus_path, '-')
def bg2GModel(bg):
    """
    Load a BedGraph file (plain or gzip-compressed) into an unstranded
    HTSeq.GenomicArray.

    Parameters
    ---
    bg: str, path to the BedGraph file; a ".gz" suffix selects gzip.

    Returns
    ---
    HTSeq.GenomicArray holding column 4 of each line over the interval
    [column 2, column 3) of the chromosome in column 1. Lines with fewer
    than four tab-separated columns are skipped.
    """
    opener = gzip.open if bg.endswith(".gz") else open
    # Text mode, so that gzip input yields str lines on Python 3.
    with opener(bg, "rt") as f:
        print("%s Start building model for %s" % (datetime.now(), bg))
        model = HTSeq.GenomicArray("auto", stranded=False)
        for i, line in enumerate(f):
            if i % 10000 == 0:
                report = "%s lines genome signal read." % i
                commandFlush(report)
            line = line.split("\n")[0].split("\t")
            # Column index 3 is read below, so four columns are required.
            if len(line) < 4:
                continue
            chrom = line[0]
            s = int(line[1])
            e = int(line[2])
            iv = HTSeq.GenomicInterval(chrom, s, e)
            model[iv] = float(line[3])
    print("")
    print("%s Model built for %s" % (datetime.now(), bg))
    return model
def get_bedgraph(do_combine_bedgraphs=False,
                 bedgraphs_folder='data/wigs/',
                 lib=None):
    """Load the plus and minus strand bedgraph files into one array.

    When `lib` is given, the folder and both file paths come from
    lib['coverage_wigs'], lib['bedgraph_exp_plus'] and
    lib['bedgraph_exp_minus']; otherwise the files both_fbfs_plus.bed and
    both_fbfs_minus.bed under `bedgraphs_folder` are used. With
    do_combine_bedgraphs set, combine_bedgraphs() is run on the folder
    first. The first line of each file is skipped.

    Returns a stranded HTSeq.GenomicArray.
    """
    if lib is None:
        plus_path = bedgraphs_folder + 'both_fbfs_plus.bed'
        minus_path = bedgraphs_folder + 'both_fbfs_minus.bed'
    else:
        bedgraphs_folder = lib['coverage_wigs']
        plus_path = lib['bedgraph_exp_plus']
        minus_path = lib['bedgraph_exp_minus']

    # Normalize to exactly one trailing slash.
    bedgraphs_folder = bedgraphs_folder.rstrip('/') + '/'
    if do_combine_bedgraphs:
        combine_bedgraphs(bedgraphs_folder=bedgraphs_folder)

    ga = HTSeq.GenomicArray(chroms='auto', stranded=True)
    for path, strand in ((plus_path, '+'), (minus_path, '-')):
        with open(path, 'r') as handle:
            next(handle)  # skip the first line
            for row in handle:
                fields = row.rstrip('\n').split('\t')
                value = float(fields[3])
                interval = HTSeq.GenomicInterval(fields[0], int(fields[1]),
                                                 int(fields[2]), strand)
                ga[interval] = value
    return ga
def ave_ga(file_list):
    """Return the array from build_ga(file_list) divided by len(file_list).

    Every step of the built array is copied into a new stranded
    HTSeq.GenomicArray with its score divided by the number of files.
    """
    summed = build_ga(file_list)
    averaged = HTSeq.GenomicArray('auto', stranded=True)
    n_files = len(file_list)
    for interval, total in summed.steps():
        averaged[interval] = total / n_files
    return averaged
def HTSeq(self, bamlist):
    """Write normalized per-base coverage over the Elf3 window to a table.

    For each BAM file the sample name is the part of the file name before
    the first "_". Every aligned read is reduced to an interval of
    length 1 and added to a stranded coverage array; the values over the
    window are divided by p.H3K27ac_bam[sample] and become that sample's
    column. The selected columns are written tab-separated to a fixed
    output path.
    """
    # Elf3. Alternative window used previously (Axin2):
    # chr11:108914532-108954079 (+)
    window = HTSeq.GenomicInterval("chr1", 135253574, 135258472, "-")
    # NOTE(review): this array is created once and shared by all BAM
    # files, so each sample's profile also contains the reads of the
    # samples before it - confirm that this is intended.
    coverage = HTSeq.GenomicArray("auto", stranded=True, typecode="i")
    profiles = []
    sample_names = []
    for bam_path in bamlist:
        sample = os.path.basename(bam_path).split("_")[0]
        sample_names.append(sample)
        for almnt in HTSeq.BAM_Reader(bam_path):
            if not almnt.aligned:
                continue
            almnt.iv.length = 1
            coverage[almnt.iv] += 1
        window_values = np.fromiter(coverage[window], dtype=float)
        profiles.append(window_values / p.H3K27ac_bam[sample])

    df = pd.DataFrame(np.array(profiles).T)
    df.columns = sample_names
    wanted = [
        "ctrl-H3K27ac", "2weeks-H3K27ac", "4weeks-H3K27ac",
        "7weeks-H3K27ac", "10weeks-H3K27ac"
    ]
    df[wanted].to_csv(
        "/data3/zhaochen/project/colon_cancer/colon_chip/peakUCSCplot/H3K27ac_Elf3.txt",
        sep="\t",
        index=False)
def main(cl=None):
    '''
    Compute genome-wide coverage from a SAM file and write it as a wig.

    Implements the Usage exception handler that can be raised from anywhere
    in process.

    cl: when None, a default CommandLine() is built; any other value
        results in CommandLine(['-r']).
    '''
    if cl is None:
        cl = CommandLine()
    else:
        cl = CommandLine(['-r'])

    try:
        print(cl.args)  # print the parsed argument string

        alignment_file = HTSeq.SAM_Reader(cl.args["sam_file"])

        # Get coverage for the whole genome
        cvg = HTSeq.GenomicArray("auto", stranded=False, typecode='i')
        for alngt in alignment_file:
            if alngt.aligned:
                cvg[alngt.iv] += 1

        # Write a "Wiggle" file for genome browser viewing
        cvg.write_bedgraph_file(cl.args["output_prefix"] + ".wig")

        # Now need to iterate over every gene/transcript and get the
        # per-transcript coverage

    except Usage as err:
        cl.do_usage_and_die(err.msg)
def add_raw_reads_to_utr(self, ga, chr_len):
    """Fill self.utr_arr and self.utr_ga with the coverage between the
    CDS right end and the transcript right end.

    ga: genomic array indexed with an HTSeq.GenomicInterval; its steps()
        supply (interval, score) pairs.
    chr_len: mapping of chromosome name to length, used only on the '-'
        strand to flip coordinates.

    Sets self.utr_arr to a list with one score per position, or to [0]
    (or leaves it empty) when the region is shorter than 2. self.utr_ga
    is only set when the first length check passes.
    """
    self.utr_arr = []
    # Region too short on the object's own coordinates: utr_arr becomes
    # [0] if the range below is non-empty, otherwise it stays [].
    # NOTE(review): the `return` is assumed to belong to the `if`, not to
    # the `for` - confirm against the original file.
    if self.txpt_right - self.cds_right < 2:
        for pos in range(0, self.txpt_right - self.cds_right + 1, 1):
            self.utr_arr = [0]
        return
    if self.strand == '-':
        # Mirror the coordinates around the chromosome length.
        # NOTE(review): presumably `ga` is stored in reverse-complement
        # coordinates for '-' strand features - confirm with the caller.
        txpt_left = chr_len[self.chrom] - self.txpt_right + 1
        txpt_right = chr_len[self.chrom] - self.txpt_left
        cds_left = chr_len[self.chrom] - self.cds_right + 1
        cds_right = chr_len[self.chrom] - self.cds_left
    else:
        txpt_left = self.txpt_left
        txpt_right = self.txpt_right
        cds_left = self.cds_left
        cds_right = self.cds_right
    iv = HTSeq.GenomicInterval(self.chrom, cds_right, txpt_right, self.strand)
    self.utr_ga = HTSeq.GenomicArray(chroms='auto', stranded=True)
    # Second length check, on the (possibly mirrored) local coordinates.
    if txpt_right - cds_right < 2:
        self.utr_arr = [0]
        return
    for _iv, score in ga[iv].steps():
        self.utr_ga[_iv] = score
        # Offsets of this step relative to the start of the region.
        left_in_utr = _iv.start - cds_right
        right_in_utr = _iv.end - cds_right
        # One entry per position covered by the step.
        for pos in range(left_in_utr, right_in_utr, 1):
            self.utr_arr.append(score)
def load_bedgraph(filename_list, ga, use_key=False):
    """Load a plus/minus pair of bedgraph files into ga[<key>].

    filename_list[0] is read as the '+' strand and filename_list[1] as the
    '-' strand; the first line of each file is skipped. The key is
    `use_key` when it is truthy, otherwise filename_list[0]. When the key
    is 'combined_fbf2.txt' every value is multiplied by the fixed
    fbf1-to-fbf2 ratio.
    """
    exp = use_key if use_key else filename_list[0]
    ga[exp] = HTSeq.GenomicArray(chroms='auto', stranded=True)
    ratio_fbf1_to_2 = float(9792191 + 3166675 + 10408265) / float(
        7680463 + 884888 + 5584323)
    rescale = exp == 'combined_fbf2.txt'

    target = ga[exp]
    for index, strand in ((0, '+'), (1, '-')):
        with open(filename_list[index], 'r') as handle:
            next(handle)  # skip the first line
            for row in handle:
                fields = row.rstrip('\n').split('\t')
                if rescale:
                    value = ratio_fbf1_to_2 * float(fields[3])
                else:
                    value = float(fields[3])
                interval = HTSeq.GenomicInterval(fields[0], int(fields[1]),
                                                 int(fields[2]), strand)
                target[interval] = value
def create_tasks(self, ploidy: int, snps):
    """Create tasks.

    Each SNP in `snps` is placed at its `iv` in a genomic array. For every
    segment, the SNPs found on the exons of its isoforms are collected:
    one task is created for all phased SNP ids together (if any), and one
    task per unphased SNP id. Every SNP seen on an exon is finally passed
    to self.store_snps, once, in the order first seen.
    """
    vc_sites = HTSeq.GenomicArray('auto', stranded=False, typecode='O')
    for snp in snps:
        vc_sites[snp.iv] = snp

    self._mc.handle_progress('Creating tasks...')

    seen_ids = set()
    selected = []
    task_no = -1
    for segment_id in self.get_all_segment_ids():
        segment = self.get_segment(segment_id)
        phased_ids, unphased_ids = [], []
        for isoform in segment.isoforms:
            for exon in isoform.exons:
                for _iv, hit in vc_sites[exon].steps():
                    if hit is None:
                        continue
                    if hit.id not in seen_ids:
                        seen_ids.add(hit.id)
                        selected.append(hit)
                    # Lists rather than sets: first-seen order is kept.
                    bucket = phased_ids if hit.phased else unphased_ids
                    if hit.id not in bucket:
                        bucket.append(hit.id)

        if len(phased_ids) > 0:
            task_no = self._create_task(task_no, segment.id, True, ploidy,
                                        phased_ids)
        for snp_id in unphased_ids:
            task_no = self._create_task(task_no, segment.id, False, ploidy,
                                        [snp_id])

    self.store_snps(selected)
def _extract_long_continuous_regions(gff_path: str, min_region_len: int,
                                     out_path: str, mc: MessageCenter):
    """Extract long continuous regions with length of at least min_region_len.

    Exon features of the GFF file are accumulated into an unstranded
    coverage array. Every step with a non-zero value whose length is at
    least `min_region_len` is written to `out_path` as
    "chrom<TAB>start<TAB>end<TAB>length".

    Raises PEUtilPathError when `gff_path` does not exist.
    """
    mc.log_debug('gff_path: {}'.format(gff_path))
    mc.log_debug('min_region_len: {}'.format(min_region_len))
    mc.log_debug('out_path: {}'.format(out_path))

    mc.handle_progress('Calculating long continuous regions...')

    exon_cover = HTSeq.GenomicArray('auto', stranded=False, typecode='i')
    if not os.path.exists(gff_path):
        raise PEUtilPathError(gff_path, 'File not exists.')

    for line_no, feature in enumerate(HTSeq.GFF_Reader(gff_path)):
        # Progress message every 100000 features, but not at the start.
        if line_no != 0 and line_no % 100000 == 0:
            mc.handle_progress(
                '{} lines read from GFF file...'.format(line_no))
        if feature.type == 'exon':
            exon_cover[feature.iv] += 1

    with open(out_path, 'w') as out:
        for interval, depth in exon_cover.steps():
            if depth == 0:
                continue
            span = interval.end - interval.start
            if span < min_region_len:
                continue
            out.write('{0}\t{1}\t{2}\t{3}\n'.format(
                interval.chrom, interval.start, interval.end, span))
def offset_read_alignment_positions(bam=None, offsets=None):
    """
    Adjust the reported position of reads based on the offsets.

    The offset of a read is looked up by its read length in `offsets`;
    when the length is not present (or `offsets` is None) the offset is
    the midpoint of the read. Based on these offsets, the reported
    reference position of the read is adjusted to the requisite A- or
    P-site position of the ribosome, and a count of 1 is added there.

    Parameters:
        bam: iterable of alignments, or None.
        offsets: mapping of read length to offset, or None.

    Returns:
        A stranded genomic array of counts per offset position, or None
        when `bam` is None, when no chromosome received any read, or when
        a NameError or ValueError is raised while processing.
    """
    if bam is None:
        # Missing BAM input.
        return None
    if offsets is None:
        # No table given: every read falls back to its midpoint.
        offsets = {}
    try:
        coverage = hts.GenomicArray(chroms='auto', stranded=True,
                                    typecode='i', storage='step')
        for alignment in bam:
            read_len = len(alignment.read.seq)
            offset = (offsets[read_len]
                      if read_len in offsets else read_len // 2)
            ref_positions = convert_cigar_to_reference_coordinates(
                alignment.cigar)
            # '+' reads count the offset from the start of the reference
            # coordinates, other reads count it from the end.
            if alignment.iv.strand == '+':
                offset_pos = ref_positions[offset - 1]
            else:
                offset_pos = ref_positions[-offset]
            coverage[HTSeq.GenomicPosition(alignment.iv.chrom, offset_pos,
                                           alignment.iv.strand)] += 1
        if len(coverage.chrom_vectors) == 0:
            # Alignment position offset failure: nothing was counted.
            return None
    except (NameError, ValueError):
        # NOTE(review): kept from the original contract; this also hides
        # a NameError from a missing `hts` / `HTSeq` import - confirm both
        # names are bound at file level.
        return None
    return coverage
def load_bedgraph(fname):
    """Load the '+' and '-' strand bedgraph files that belong to `fname`.

    The part of `fname` before the first '.wig' is used as the stem; the
    files <stem>_+.wig and <stem>_-.wig are added to one stranded
    HTSeq.GenomicArray, which is returned.
    """
    stem = fname.partition('.wig')[0]
    ga = HTSeq.GenomicArray(chroms='auto', stranded=True)
    for suffix, strand in (('_+.wig', '+'), ('_-.wig', '-')):
        add_strand_to_ga_from_bedgraph_file(stem + suffix, ga, strand)
    return ga
def get_total_tag_counts(chroms, bamfile):
    """Count usable read pairs and build their per-base coverage.

    A pair is used when both mates are present and aligned with NH == 1,
    lie on the same chromosome (which must be in `chroms`) and on opposite
    strands. Each used pair adds 1 / read_length to every interval
    returned by combine_pair_iv_seq, where read_length is the value
    reported by get_read_length.

    Returns:
        (tag_count, ga): the number of used pairs and the unstranded
        HTSeq.GenomicArray of coverage.
    """
    ga = HTSeq.GenomicArray(chroms, stranded=False, typecode='d')
    tag_count = 0
    bam_reader = HTSeq.BAM_Reader(bamfile)
    for alt_first, alt_second in HTSeq.pair_SAM_alignments(bam_reader):
        if alt_first is None or alt_second is None:
            continue
        if not (alt_first.aligned and alt_first.optional_field("NH") == 1
                and alt_second.aligned
                and alt_second.optional_field("NH") == 1):
            continue
        if (alt_first.iv.chrom != alt_second.iv.chrom
                or alt_first.iv.strand == alt_second.iv.strand
                or alt_first.iv.chrom not in chroms):
            continue
        tag_count += 1
        alt_first_iv_seq = [
            co.ref_iv for co in alt_first.cigar
            if co.type == "M" and co.size > 0
        ]
        alt_second_iv_seq = [
            reverse_strand(co.ref_iv) for co in alt_second.cigar
            if co.type == "M" and co.size > 0
        ]
        alt_iv_seq = combine_pair_iv_seq(alt_first_iv_seq, alt_second_iv_seq)
        read_length = get_read_length(alt_iv_seq)
        for alt_iv in alt_iv_seq:
            ga[alt_iv] += 1.0 / read_length
    return tag_count, ga
def combine_bedgraphs(bedgraphs_folder='data/wigs_five_prime/'):
    """Sum the fbf1 and fbf2 bedgraphs and write the combined tracks.

    Both plus/minus pairs are loaded through
    scatterplot_correlation_by_wig.load_bedgraph under the keys
    'combined_fbf1.txt' and 'combined_fbf2.txt'. Their steps are added
    into ga['combined'], which is written to both_fbfs_plus.bed and
    both_fbfs_minus.bed in `bedgraphs_folder`.

    Returns the dict holding all three arrays.
    """
    ga = {}
    # Normalize to exactly one trailing slash.
    bedgraphs_folder = bedgraphs_folder.rstrip('/') + '/'

    inputs = [
        (bedgraphs_folder + 'fbf1_reads_plus.bed',
         bedgraphs_folder + 'fbf1_reads_minus.bed',
         'combined_fbf1.txt'),
        (bedgraphs_folder + 'fbf2_reads_plus.bed',
         bedgraphs_folder + 'fbf2_reads_minus.bed',
         'combined_fbf2.txt'),
    ]
    for filename_list in inputs:
        scatterplot_correlation_by_wig.load_bedgraph(
            filename_list, ga, use_key=filename_list[2])

    combined = HTSeq.GenomicArray(chroms='auto', stranded=True)
    ga['combined'] = combined
    for key in ('combined_fbf1.txt', 'combined_fbf2.txt'):
        for iv, score in ga[key].steps():
            combined[iv] += score

    combined.write_bedgraph_file(bedgraphs_folder + 'both_fbfs_plus.bed',
                                 '+')
    combined.write_bedgraph_file(bedgraphs_folder + 'both_fbfs_minus.bed',
                                 '-')
    return ga
def empty_array_from_file(bam_file, stranded=True, typecode="i"):
    """Return an empty genomic array with the chromosomes of a BAM header.

    One chromosome is added per 'SQ' entry of the header, using its 'SN'
    value as the name and its 'LN' value as the length.
    """
    empty = HTSeq.GenomicArray("auto", stranded=stranded, typecode=typecode)
    header = HTSeq.BAM_Reader(bam_file).get_header_dict()
    for sq_entry in header['SQ']:
        name, length = sq_entry['SN'], sq_entry['LN']
        empty.add_chrom(name, length)
    return empty
def run(input_bed, output_bedgraph_unnorm, output_bedgraph_norm):
    """Create per-file and pooled bedgraphs from a folder of .bed files.

    Every <input_bed>/*.bed file is added, through add_to_ga, to one of
    three pooled arrays chosen by its file name: names containing 'fog',
    'exp' or 'fbf' go to the experiment pool, names containing 'control'
    or 'n2' to the control pool, and all others to a third pool. The
    array returned by add_to_ga is written as <name>_+.wig and
    <name>_-.wig in `output_bedgraph_unnorm`. The experiment and control
    pools are written there too, and normalize_bedgraph.normalize_wig is
    run last.

    Both output folders are created when they do not exist.
    """
    for out_dir in (output_bedgraph_unnorm, output_bedgraph_norm):
        if not os.path.exists(out_dir):
            # os.makedirs instead of shelling out to `mkdir`: handles
            # spaces and nested paths, and nothing is passed to a shell.
            os.makedirs(out_dir)

    ga_all_exp = HTSeq.GenomicArray('auto', stranded=True)
    ga_all_control = HTSeq.GenomicArray('auto', stranded=True)
    ga_other = HTSeq.GenomicArray('auto', stranded=True)

    exp_patterns = ('.*fog.*', '.*exp.*', '.*fbf.*')
    control_patterns = ('.*control.*', '.*n2.*')

    for infile in glob.glob(input_bed + '/*.bed'):
        base = os.path.basename(infile)
        if any(re.match(pat, base) is not None for pat in exp_patterns):
            print(infile)
            ga = add_to_ga(infile, ga_all_exp)
        elif any(re.match(pat, base) is not None
                 for pat in control_patterns):
            ga = add_to_ga(infile, ga_all_control)
        else:
            ga = add_to_ga(infile, ga_other)

        outname = "{d}/{b}".format(d=output_bedgraph_unnorm,
                                   b=base.partition('.bed')[0])
        print("Creating a bedgraph {c} from {a}...".format(c=outname,
                                                           a=infile))
        outname_plus = outname + '_+.wig'
        ga.write_bedgraph_file(outname_plus, strand='+')
        outname_minus = outname + '_-.wig'
        ga.write_bedgraph_file(outname_minus, strand='-')

    ga_all_exp.write_bedgraph_file(output_bedgraph_unnorm + '/all_exp_+.wig',
                                   strand='+')
    ga_all_exp.write_bedgraph_file(output_bedgraph_unnorm + '/all_exp_-.wig',
                                   strand='-')
    ga_all_control.write_bedgraph_file(output_bedgraph_unnorm +
                                       '/all_control_+.wig',
                                       strand='+')
    ga_all_control.write_bedgraph_file(output_bedgraph_unnorm +
                                       '/all_control_-.wig',
                                       strand='-')
    normalize_bedgraph.normalize_wig(input_bed, output_bedgraph_unnorm,
                                     output_bedgraph_norm)
def bed2model(bg, mapq=1, noRedu=True, ext=150):
    """
    Convert a BED file into an HTSeq.GenomicArray of genomic coverage.

    Parameters
    ----
    bg: str, .bed or .bed.gz file
    mapq: int, reads whose 5th column is below this value are dropped.
    noRedu: bool, when True only the first read of each
        (chrom, start, end) is counted, and each counted read is extended
        to `ext` bases from its 5' end according to the strand column.
        When False every read is counted with its original coordinates.
    ext: int, extension length used when noRedu is True.

    Returns
    ----
    int, number of non-redundant reads when noRedu is True, otherwise the
        number of reads that passed the MAPQ filter
    HTSeq.GenomicArray, the coverage model
    """
    rs = set()
    logger.info("Start building model for %s, with MAPQ cutoff >=%s" %
                (bg, mapq))
    model = HTSeq.GenomicArray("auto", stranded=False)
    t = 0
    opener = gzip.open if bg.endswith(".gz") else open
    # Text mode, so that gzip input yields str lines on Python 3.
    with opener(bg, "rt") as f:
        for i, line in enumerate(f):
            if i % 10000 == 0:
                report = "%s lines genome signal read." % i
                cFlush(report)
            line = line.split("\n")[0].split("\t")
            # Columns 5 (score) and 6 (strand) are read below.
            if len(line) < 6:
                continue
            try:
                chrom = line[0]
                s = int(line[1])
                e = int(line[2])
            except ValueError:
                # Not a record line (non-numeric coordinates): skip it.
                continue
            if int(line[4]) < mapq:
                continue
            t += 1
            if noRedu:
                r = (chrom, s, e)
                if r in rs:
                    continue
                rs.add(r)
                if line[5] == "+":
                    e = s + ext
                else:
                    s = max(0, e - ext)
            model[HTSeq.GenomicInterval(chrom, s, e)] += 1
    summary = "%s:totalReads:%s;nonRedudant:%s" % (bg, t, len(rs))
    print(summary)
    logger.info(summary)
    if noRedu:
        return len(rs), model
    return t, model
def Get_label_information(label, annot, bam_reader):
    """Count reads per annotated feature of one label and build coverage.

    label: key into `annot`.
    annot: mapping of label to a list of 9-field records
        (feature, rank, chrom, start, end, strand, length,
        exon_rank_left, exon_rank_right).
    bam_reader: reader offering fetch(), with and without a region.

    Returns (gas, ga, gene_count): the feature array, the integer coverage
    array, and a dict of read counts keyed by (feature, rank).

    Reads the names `minaqual`, `cigar_char` and `invert_strand`, which are
    not defined in this function.
    """
    # Silences every warning from this point on.
    warnings.simplefilter("ignore")
    gas = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    ga = HTSeq.GenomicArray("auto", stranded=False, typecode="i")
    gene_count = {}
    # Register every feature interval and start its counter at zero.
    for feature, rank, chrom, start, end, strand, length, exon_rank_left, exon_rank_right in annot[
            label]:
        iv = HTSeq.GenomicInterval(chrom, start, end, strand)
        gas[iv] += (feature, rank)
        gene_count[(feature, rank)] = 0
    # Smallest start and largest end over all records of the label.
    boundary_left, boundary_right = min([i[3] for i in annot[label]
                                         ]), max([i[4] for i in annot[label]])
    # Region string "chrom:left-500-right+500", using the chromosome of
    # the first record.
    region_fetch = annot[label][0][2] + ":" + str(
        int(boundary_left) - 500) + "-" + str(int(boundary_right) + 500)
    read_seq = bam_reader.fetch(region=region_fetch)
    # Paired-end mode is decided from the first read of an unrestricted
    # fetch; next() raises StopIteration when that fetch yields nothing.
    read_seq_iter = iter(bam_reader.fetch())
    one_read = next(read_seq_iter)
    pe_mode = one_read.paired_end
    if pe_mode:
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    for a in read_seq:
        if not pe_mode:
            # Single-end: skip unaligned reads and reads with NH > 1.
            if not a.aligned:
                continue
            if a.optional_field('NH') > 1:
                continue
            iv_seq = (cigop.ref_iv for cigop in a.cigar
                      if cigop.type == "M" and cigop.size > 0)
        else:
            # Paired-end: `a` is a pair; a mate may be None.
            if ((a[0] and a[0].aQual < minaqual)
                    or (a[1] and a[1].aQual < minaqual)):
                continue
            if ((a[0] and a[0].optional_field('NH') > 1)
                    or (a[1] and a[1].optional_field('NH') > 1)):
                continue
            if a[0] is not None and a[0].aligned:
                iv_seq = (cigop.ref_iv for cigop in a[0].cigar
                          if cigop.type in cigar_char and cigop.size > 0)
            else:
                iv_seq = tuple()
            # Intervals of the second mate go through invert_strand.
            if a[1] is not None and a[1].aligned:
                iv_seq = itertools.chain(
                    iv_seq, (invert_strand(cigop.ref_iv)
                             for cigop in a[1].cigar
                             if cigop.type in cigar_char and cigop.size > 0))
        # Union of all features overlapped by any interval of the read.
        feature_aligned = set()
        for iv in iv_seq:
            for iv2, val2 in gas[iv].steps():
                feature_aligned |= val2
            # NOTE(review): assumed to run once per interval, not once per
            # step - confirm against the original indentation.
            ga[iv] += 1  # for calculating coverage
        if len(feature_aligned) == 0:
            continue
        # A read touching any 'intron' feature is counted for its introns
        # only; otherwise it is counted for every overlapped feature.
        for f in [item for item in feature_aligned if item[0] == 'intron']:
            gene_count[f] += 1
        if 'intron' not in [x for x, y in feature_aligned]:
            for f in feature_aligned:
                gene_count[f] += 1
    return gas, ga, gene_count
def zero(self):
    """Reset the stored signal containers, keeping their keys.

    Every entry of self.bedgraphs (when the attribute exists) is replaced
    by a fresh stranded HTSeq.GenomicArray, and every entry of
    self.raw_signal_by_type (when the attribute exists) by an empty list.
    """
    if hasattr(self, 'bedgraphs'):
        bedgraphs = self.bedgraphs
        for track_name in bedgraphs:
            bedgraphs[track_name] = HTSeq.GenomicArray('auto', stranded=True)
    if hasattr(self, 'raw_signal_by_type'):
        raw_signal = self.raw_signal_by_type
        for signal_type in raw_signal:
            raw_signal[signal_type] = []
def read_coverage(bam_filename):
    """Return the stranded read coverage of a BAM file.

    Every aligned read adds 1 over its genomic interval in an integer
    HTSeq.GenomicArray, which is returned.
    """
    _bamfile = HTSeq.BAM_Reader(bam_filename)
    coverage = HTSeq.GenomicArray("auto", stranded=True, typecode='i')
    print("Reading alignments from bamfile...")
    for aln in _bamfile:  # Very slow.
        if aln.aligned:
            coverage[aln.iv] += 1
    return coverage
def bedpe2model(bg, mapq=10, noRedu=True):
    """
    Convert a BEDPE file into an HTSeq.GenomicArray of genomic coverage.

    Parameters
    ----
    bg: str, .bedpe or .bedpe.gz file
    mapq: int, PETs whose mapq is below this value are dropped.
    noRedu: bool, when True only the first PET of each (chrom, mid) is
        counted; when False every PET is counted.

    Only cis PETs whose chromosome name contains no "_" are used. Both
    ends of a counted PET add 1 to the model. Lines that cannot be parsed
    as a PET are logged and skipped.

    Returns
    ----
    int, number of non-redundant PETs when noRedu is True, otherwise the
        number of PETs that passed the filters
    HTSeq.GenomicArray, the coverage model
    """
    rs = set()
    logger.info("Start building model for %s, with MAPQ cutoff >=%s" %
                (bg, mapq))
    model = HTSeq.GenomicArray("auto", stranded=False)
    t = 0
    opener = gzip.open if bg.endswith(".gz") else open
    # Text mode, so that gzip input yields str lines on Python 3.
    with opener(bg, "rt") as fh:
        for i, line in enumerate(fh):
            if i % 10000 == 0:
                report = "%s lines genome signal read." % i
                cFlush(report)
            line = line.split("\n")[0].split("\t")
            try:
                pet = PET(line)
            except Exception:
                logger.error("%s from %s is not a BEDPE record" % (line, bg))
                # Skip the line: otherwise `pet` would be undefined or
                # still hold the previous record.
                continue
            if not pet.cis or "_" in pet.chromA:
                continue
            if pet.mapq < mapq:
                continue
            t += 1
            if noRedu:
                r = (pet.chromA, pet.mid, pet.mid + 1)
                if r in rs:
                    continue
                rs.add(r)
            iva = HTSeq.GenomicInterval(pet.chromA, pet.startA, pet.endA)
            ivb = HTSeq.GenomicInterval(pet.chromB, pet.startB, pet.endB)
            model[iva] += 1
            model[ivb] += 1
    logger.info("%s:totalReads:%s;nonRedudant:%s" % (bg, t, len(rs)))
    if noRedu:
        return len(rs), model
    return t, model