def __iter__(self):
    """Yield the genomic interval of every record in the underlying file,
    widened by ``window_length`` bases on each side.

    NOTE(review): the original referenced bare names (``filetype``,
    ``filepath``, ``window_length``, ``func``) in four branches but
    ``self.filetype``/``self.filepath`` in the last one, which would raise
    NameError unless module-level globals existed.  All lookups are unified
    on instance attributes here -- confirm against ``__init__``.
    """
    # Dispatch table: file type -> matching HTSeq reader class.
    readers = {
        "BED": HTSeq.BED_Reader,
        "GFF": HTSeq.GFF_Reader,
        "GTF": HTSeq.GFF_Reader,
        "SAM": HTSeq.SAM_Reader,
        "BAM": HTSeq.BAM_Reader,
    }
    ftype = self.filetype.upper()
    if ftype in readers:
        records = readers[ftype](self.filepath)
    elif ftype == "OTHER":
        # Caller-supplied parser; assumed stored on the instance -- TODO confirm.
        records = self.func(self.filepath)
    else:
        # Unknown file type yields nothing.
        return
    for line in records:
        line.iv.start -= self.window_length
        line.iv.end += self.window_length
        yield line.iv
def annotate(self, circfile, annotation_tree, output):
    """Annotate each circRNA interval in ``circfile`` and write a bed6-like
    result file.

    The circRNA file should be in bed format with columns
    chr\\tstart\\tend\\t'.'\\tjunctiontype\\tstrand.  ``annotation_tree``
    should be an IntervalTree object.

    :param circfile: path to the circRNA bed6 file
    :param annotation_tree: IntervalTree used for the gene lookup
    :param output: path of the annotated output file
    """
    # Check the input: warn (but continue) if it is not bed6.
    with open(circfile, 'r') as tmpcirc:
        tmpsplit = tmpcirc.readline().split('\t')
        if len(tmpsplit) != 6:
            warnings.warn(
                'Input circRNA file is not the desired bed6 format!')
            logging.warning(
                'Input circRNA file is not the desired bed6 format!')
    # Annotate with the interval-tree algorithm, one output line per circRNA.
    circ_regions = HTSeq.BED_Reader(circfile)
    with open(output, 'w') as out:
        for circ in circ_regions:
            annotation = self.annotate_one_interval(circ.iv,
                                                    annotation_tree,
                                                    what='gene')
            out.write('\t'.join([
                circ.iv.chrom,
                str(circ.iv.start),
                str(circ.iv.end),
                annotation,
                str(int(circ.score)),
                circ.iv.strand
            ]) + '\n')
def map_genome_features(files, ref, gtf_file, outpath='', aligner='bowtie',
                        overwrite=True, aligner_params=''):
    """Convenience method that maps multiple files to a genome with features
    and return/process hits. Can be used for miRNA discovery.

    Args:
        files: input read files to collapse and align
        ref: genome bowtie index name
        gtf_file: gtf or bed file with features
        outpath: output path
        aligner: short read aligner to use
        overwrite: remove old temp files before mapping
        aligner_params: aligner parameters
    Returns:
        pandas DataFrame of per-feature counts merged with the feature table.
    """
    if aligner_params != '':
        aligners.set_params(aligner, aligner_params)
    if overwrite:
        print('removing old temp files')
        utils.remove_files(outpath, '*_mapped.sam')
        utils.remove_files(outpath, '*_r.fa')
    ext = os.path.splitext(gtf_file)[1]
    if ext in ('.gtf', '.gff', '.gz'):
        features = HTSeq.GFF_Reader(gtf_file)
    elif ext == '.bed':
        features = HTSeq.BED_Reader(gtf_file)
    else:
        # Previously an unknown extension fell through and crashed later
        # with a NameError on `features`; fail early and clearly instead.
        raise ValueError('unsupported feature file extension: %s' % ext)
    exons = get_exons(features)
    cfiles = collapse_files(files, outpath)
    print(cfiles)
    result = []
    for cfile in cfiles:
        label = os.path.splitext(os.path.basename(cfile))[0]
        samfile = os.path.join(outpath, '%s_%s.sam' % (label, ref))
        if aligner == 'bowtie':
            aligners.bowtie_align(cfile, ref, outfile=samfile)
        elif aligner == 'subread':
            aligners.subread_align(cfile, ref, samfile)
        # get true read counts for collapsed file
        readcounts = utils.read_collapsed_file(cfile)
        # count features
        counts = count_features(samfile, features=exons, readcounts=readcounts)
        counts['label'] = label
        counts['genome'] = ref
        total = readcounts.reads.sum()
        counts['fraction'] = counts.reads / total
        result.append(counts)
    result = pd.concat(result)
    result = merge_features(result, gtf_file)
    return result
def get_gene(bgmodel=None, bed=None):
    """Collect the gene names from ``bgmodel`` overlapped by intervals in a
    BED file.

    :param bgmodel: HTSeq GenomicArrayOfSets-like background model
    :param bed: path to a BED file of intervals
    :returns: set of gene identifiers overlapped by at least one interval
    """
    gs = set()
    for item in HTSeq.BED_Reader(bed):
        iv = item.iv
        # The bed may contain chromosomes not present in the background
        # model; HTSeq raises KeyError for unknown chromosomes, so skip
        # those (previously a bare except hid *all* errors).
        try:
            for ivb, valueb in bgmodel[iv].steps():
                gs.update(valueb)
        except KeyError:
            continue
    return gs
def get_overlap_rep(rep_model, peak_f):
    """Return the cell type parsed from the peak file name and the set of
    repeat annotations its peaks overlap.

    :param rep_model: HTSeq GenomicArrayOfSets-like repeat model
    :param peak_f: path to a BED file of peaks; the cell type is the first
        "_"-separated token of its basename
    :returns: (cell_type, set of repeat annotations)
    """
    reps = set()
    c_t = os.path.split(peak_f)[1].split("_")[0]
    for g in HTSeq.BED_Reader(peak_f):
        iv = g.iv
        # Peaks on chromosomes missing from rep_model raise KeyError;
        # skip them (previously a bare except hid all errors).  The
        # needless list() around steps() is also dropped.
        try:
            for niv, value in rep_model[iv].steps():
                reps.update(value)
        except KeyError:
            continue
    return c_t, reps
def __init__(self, l_fp):  #, min_peak_score=None, min_core_score=None, iv=None):
    """Read one peak list per BED file path in ``l_fp`` and store them.

    :param l_fp: list of BED file paths

    Side effect: reports the peak count of every file on stderr.
    """
    # Read peaks
    l_peak_fp = []
    for (i, fp) in enumerate(l_fp):
        fh = hts.BED_Reader(fp)
        l_peak = list(fh)
        # `print >> sys.stderr` is Python-2-only syntax; write() works in
        # both Python 2 and 3.
        sys.stderr.write("%d peaks found in %s\n" % (len(l_peak), fp))
        l_peak_fp.append(l_peak)
    self.l_peak_fp = l_peak_fp
def makeIslandFilteredGraphFile(chroms, chrom_lengths, window_size, bamfile,
                                islandbedfile, outfile):
    """Write a window-summary ("graph") file of island-filtered paired-end
    coverage.

    :param chroms: list of chromosome names to process
    :param chrom_lengths: dict of chromosome name -> length
    :param window_size: genomic window size in bp
    :param bamfile: paired-end BAM file of reads
    :param islandbedfile: BED file of islands used as the coverage filter
    :param outfile: output path (chrom, window start/end, rounded count)
    """
    ga = HTSeq.GenomicArray(chroms, stranded=False, typecode='d')
    bam_reader = HTSeq.BAM_Reader(bamfile)
    for alt_first, alt_second in HTSeq.pair_SAM_alignments(bam_reader):
        if alt_first is None or alt_second is None:
            continue
        # Keep only pairs where both mates aligned uniquely (NH == 1).
        if alt_first.aligned and alt_first.optional_field(
                "NH") == 1 and alt_second.aligned and alt_second.optional_field("NH") == 1:
            # Mates must share a chromosome of interest and lie on
            # opposite strands.
            if alt_first.iv.chrom != alt_second.iv.chrom or alt_first.iv.strand == alt_second.iv.strand or alt_first.iv.chrom not in chroms:
                continue
            alt_first_iv_seq = [
                co.ref_iv for co in alt_first.cigar
                if co.type == "M" and co.size > 0
            ]
            alt_second_iv_seq = [
                reverse_strand(co.ref_iv) for co in alt_second.cigar
                if co.type == "M" and co.size > 0
            ]
            alt_iv_seq = combine_pair_iv_seq(alt_first_iv_seq,
                                             alt_second_iv_seq)
            read_length = get_read_length(alt_iv_seq)
            # Spread one unit of coverage across the aligned fragment.
            for alt_iv in alt_iv_seq:
                ga[alt_iv] += 1.0 / read_length
    # Restrict coverage to the islands.
    ga_island = HTSeq.GenomicArray(chroms, stranded=False, typecode='d')
    bedfile = HTSeq.BED_Reader(islandbedfile)
    for alt in bedfile:
        for iv, value in ga[alt.iv].steps():
            ga_island[iv] += value
    with open(outfile, 'w') as f:
        for chrom in chroms:
            chrom_length = chrom_lengths[chrom]
            # Floor division: `/` yields a float under Python 3 and would
            # break range() below.
            num_windows = chrom_length // window_size
            for i in range(num_windows):
                count_in_window = 0
                window_start = i * window_size
                window_end = (i + 1) * window_size
                window_iv = HTSeq.GenomicInterval(chrom, window_start,
                                                  window_end)
                for iv, value in ga_island[window_iv].steps():
                    count_in_window += value * iv.length
                count_in_window = int(count_in_window)
                if count_in_window != 0:
                    outline = chrom + '\t' + str(window_start) + '\t' + str(
                        window_end) + '\t' + str(count_in_window) + '\n'
                    f.write(outline)
def __create_genomic_signals(self, stranded=True, func=None,
                             use_wrappers=True):
    """Prepares coverage as a HTSeq.GenomicArray

    :param stranded: build a strand-aware coverage array
    :param func: custom reader callable, used when ``self.filetype`` is "OTHER"
    :param use_wrappers: use file-backed wrapper objects (BED/BigWig)
        instead of loading per-interval coverage into memory
    """
    stderr.write("Creating %s signal. It may take few minutes...\n" %
                 self.name)
    self.coverage = HTSeq.GenomicArray("auto", stranded=stranded,
                                       typecode="d")
    self.library_size = 0
    # Hoist the repeated self.filetype.upper() lookups.
    ftype = self.filetype.upper()
    if ftype == "BED":
        if use_wrappers:
            self.coverage = BedWrapper(self.filepath)
        else:
            for line in HTSeq.BED_Reader(self.filepath):
                self.coverage[line.iv] += 1
                self.library_size += 1
    elif ftype == "GFF" or ftype == "GTF":
        for line in HTSeq.GFF_Reader(self.filepath):
            self.coverage[line.iv] += 1
            self.library_size += 1
    elif ftype == "SAM":
        for line in HTSeq.SAM_Reader(self.filepath):
            self.coverage[line.iv] += 1
            self.library_size += 1
    elif ftype == "BAM":
        if use_wrappers:
            raise NotImplementedError(
                "Bam wrapper is not yet implemented!")
            # NOTE: unreachable until the wrapper exists.  Fixed to pass
            # the file *path*; the original passed self.filetype by mistake.
            self.coverage = BamWrapper(self.filepath)
        for line in HTSeq.BAM_Reader(self.filepath):
            self.coverage[line.iv] += 1
            self.library_size += 1
    elif ftype == "BG" or ftype == "BEDGRAPH":
        raise NotImplementedError("BedGraph is not yet implemented!")
    elif ftype == "BW" or ftype == "BIGWIG":
        self.coverage = BigWigWrapper(self.filepath)
    elif ftype == "OTHER":
        # Custom parser supplied by the caller.
        for line in func(self.filepath):
            self.coverage[line.iv] += 1
            self.library_size += 1
    else:
        assert False, "I should not be here!"
def getGenomicarrayOfSetsAndNames(bed_file):
    """ Returns a GenomicArrayOfSets of all regions and a list of region names

    :param bed_file: path to a BED file of regions
    :returns: (GenomicArrayOfSets keyed by region name,
               list of region names in file order)
    """
    # build parser for regions
    regionParser = HTSeq.BED_Reader(bed_file)
    # build GenomicArrayOfSets for all regions
    regions = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    region_names = []
    # Single pass: the original iterated the reader twice, re-reading the
    # file from disk just to collect the names.
    for feature in regionParser:
        regions[feature.iv] += feature.name
        region_names.append(feature.name)
    return regions, region_names
def test_output_bed_loss_resolution_equal_stepsize(tmpdir):
    """Loss export to BED with resolution equal to the indexer stepsize."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    # generate loss
    #
    # resolution < stepsize
    in_data = Array("x", numpy.random.random((7, 1, 1, 10)))
    out_data = Array('y', numpy.random.random((7, 1, 1, 4)),
                     conditions=['c1', 'c2', 'c3', 'c4'])
    model = get_janggu_conv(in_data, out_data)
    bed_path = pkg_resources.resource_filename('janggu',
                                               'resources/10regions.bed')
    indexer = GenomicIndexer.create_from_file(bed_path, binsize=200,
                                              stepsize=200)
    scorer = Scorer('loss', lambda t, p: [0.1] * len(t),
                    exporter=export_bed)
    model.evaluate(in_data, out_data, callbacks=[scorer],
                   exporter_kwargs={'gindexer': indexer, 'resolution': 200})
    template = os.path.join(tmpdir.strpath, 'evaluation', model.name,
                            'loss.nptest.y.{}.bed')
    for cond in ['c1', 'c2', 'c3', 'c4']:
        assert os.path.exists(template.format(cond))
    # Every exported region carries the constant score 0.1.
    region_count = 0
    for region in HTSeq.BED_Reader(template.format('c1')):
        numpy.testing.assert_equal(region.score, 0.1)
        region_count += 1
    assert region_count == 7, 'There should be 7 regions in the bed file.'
def test_output_bed_predict_resolution_unequal_stepsize(tmpdir):
    """Prediction export to BED with resolution finer than the stepsize."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    # generate loss
    #
    # resolution < stepsize
    in_data = Array("x", numpy.random.random((7, 4, 1, 10)))
    out_data = Array('y', numpy.random.random((7, 4, 1, 4)),
                     conditions=['c1', 'c2', 'c3', 'c4'])
    model = get_janggu(in_data, out_data)
    bed_path = pkg_resources.resource_filename('janggu',
                                               'resources/10regions.bed')
    indexer = GenomicIndexer.create_from_file(bed_path, binsize=200,
                                              stepsize=200)
    scorer = Scorer('pred', lambda p: [0.1] * len(p),
                    exporter=ExportBed(gindexer=indexer, resolution=50),
                    conditions=['c1', 'c2', 'c3', 'c4'])
    model.predict(in_data, callbacks=[scorer])
    template = os.path.join(tmpdir.strpath, 'evaluation', model.name,
                            'pred.nptest.y.{}.bed')
    for cond in ['c1', 'c2', 'c3', 'c4']:
        assert os.path.exists(template.format(cond))
    # 200bp bins at 50bp resolution -> 4 sub-regions per bin.
    region_count = 0
    for region in HTSeq.BED_Reader(template.format('c1')):
        numpy.testing.assert_equal(region.score, 0.1)
        region_count += 1
    assert region_count == 28, 'There should be 28 regions in the bed file.'
def produce_sequences(bed_file, fasta_file, gtf_file, min_length, max_length,
                      width, padding, graphprot_compatible=False):
    """Build positive and matched background sequence sets from peaks.

    Two passes over the BED file: the first registers all (re-centered,
    padded) peaks, the second keeps only peaks that are unique, length-valid
    and lie in exactly one gene, and samples for each a random
    non-overlapping background interval from that gene.

    :param bed_file: path to BED file of peaks
    :param fasta_file: path to genome FASTA
    :param gtf_file: path to GTF/GFF annotation
    :param min_length: minimum allowed peak length
    :param max_length: maximum allowed peak length
    :param width: if > 0, re-center every peak on its midpoint to this width
    :param padding: if > 0, extend both ends by this many bases
    :param graphprot_compatible: lower-case the padded flanks
        (GraphProt viewpoint format)
    :returns: (pos_seqs, neg_seqs, seq_ids)
    """
    print(" Reading primary peaks from BED file")
    bed_file = HTSeq.BED_Reader(bed_file)
    input_peaks = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    total_peaks = 0
    # First pass: register every (adjusted) peak for the uniqueness test.
    for peak in bed_file:
        total_peaks += 1
        if width > 0:
            # re-center the peak on its midpoint with the fixed width
            mid = int((peak.iv.start + peak.iv.end) / 2)
            peak.iv.start = mid - int(width / 2)
            peak.iv.end = peak.iv.start + width
        if padding > 0:
            peak.iv.start -= padding
            peak.iv.end += padding
        input_peaks[peak.iv] += peak.name
    print(" Reading GTF file from " + str(gtf_file))
    genes = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    gene_dict = dict()
    gtf_file = HTSeq.GFF_Reader(gtf_file)
    total_genes = 0
    for feature in gtf_file:
        if feature.type == "gene":
            total_genes += 1
            genes[feature.iv] += feature.name
            gene_dict[feature.name] = feature
    if total_genes == 0:
        # this GTF file doesn't have 'gene' features, we need to build the
        # gene intervals from the exon intervals instead
        print(
            " No 'gene' features in GTF, building gene intervals from exons instead."
        )
        for feature in gtf_file:
            if feature.type == "exon":
                gene = gene_dict.get(feature.attr["gene_id"], False)
                if not gene:
                    # first exon seen for this gene becomes the seed interval
                    feature.type = 'gene'
                    gene_dict[feature.attr["gene_id"]] = feature
                    total_genes += 1
                else:
                    # grow the seed interval to span this exon
                    if gene.iv.start > feature.iv.start:
                        gene.iv.start = feature.iv.start
                    if gene.iv.end < feature.iv.end:
                        gene.iv.end = feature.iv.end
                    gene_dict[feature.attr["gene_id"]] = gene
        for gene in gene_dict.values():
            genes[gene.iv] += gene.attr["gene_id"]
    print(" Loaded {} total genes.".format(total_genes))
    print(" Reading genome from file " + str(fasta_file) + " ...", )
    sys.stdout.flush()
    genome = read_genome(fasta_file)
    print("done")
    print(" Filtering and constructing background...")
    pos_peaks = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    neg_peaks = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    pos_seqs = []
    neg_seqs = []
    seq_ids = []
    not_in_gene = 0
    multiple_genes = 0
    redundant = 0
    invalid = 0
    # Second pass: the HTSeq reader re-reads the file from disk, and the
    # width/padding adjustment below reproduces the first pass exactly.
    for peak in bed_file:
        valid = True
        iset = None
        if peak.iv.length < min_length or peak.iv.length > max_length:
            valid = False
            invalid += 1
        if valid:
            if width > 0:
                mid = int((peak.iv.start + peak.iv.end) / 2)
                peak.iv.start = mid - int(width / 2)
                peak.iv.end = peak.iv.start + width
            if padding > 0:
                peak.iv.start -= padding
                peak.iv.end += padding
        # Intersect the name sets of all steps covering the peak: a result
        # of exactly one name means the peak overlaps no other peak.
        for iv2, step_set in input_peaks[peak.iv].steps():
            if iset is None:
                iset = step_set.copy()
            else:
                iset.intersection_update(step_set)
        try:
            overlaps = len(iset)
        except TypeError:
            # iset stayed None -> no steps -> treat as no overlap info
            overlaps = 0
        if overlaps == 1 and valid:
            # this peak does not overlap other peaks after padding, so we
            # can assume it's reasonably unique
            pos_peaks[peak.iv] += peak.name
            # now find the gene that it overlaps
            gset = None
            #print " Looking for overlapping gene in list of {} total genes on chromosome {}.".format(len(genes[peak.iv]), peak.iv)
            for iv2, step_set in genes[peak.iv].steps():
                if gset is None:
                    gset = step_set.copy()
                else:
                    gset.intersection_update(step_set)
            if len(gset) == 1:
                # this peak overlaps exactly one gene so we know where to
                # randomly choose a background sequence
                gene = gene_dict[list(gset)[0]]
                overlap = True
                overlap_counter = 0
                # Rejection-sample a background interval of the same length
                # from within the gene, avoiding all positive and already
                # chosen negative intervals.
                while overlap:
                    overlap_counter += 1
                    start = random.randint(gene.iv.start,
                                           gene.iv.end - peak.iv.length)
                    end = start + peak.iv.length
                    neg_peak = HTSeq.GenomicInterval(gene.iv.chrom, start,
                                                     end, gene.iv.strand)
                    overlap_peak = None
                    overlap_neg_peak = None
                    for iv2, step_set in pos_peaks[neg_peak].steps():
                        if overlap_peak is None:
                            overlap_peak = step_set.copy()
                        else:
                            overlap_peak.intersection_update(step_set)
                    for iv2, step_set in neg_peaks[neg_peak].steps():
                        if overlap_neg_peak is None:
                            overlap_neg_peak = step_set.copy()
                        else:
                            overlap_neg_peak.intersection_update(step_set)
                    if not overlap_peak and not overlap_neg_peak:
                        # yes! found a non-overlapping region suitable as
                        # background sequence
                        overlap = False
                    if overlap_counter > 1000:
                        # accept that a non-overlap can't be found but don't
                        # use this peak
                        print(
                            "Warning: failed to find non-overlapping background for "
                            + str(peak.name))
                        valid = False
                        overlap = False
                        invalid += 1
                # Reject backgrounds containing unknown bases.
                if 'n' in str(genome[neg_peak.chrom]
                              [neg_peak.start:neg_peak.end].seq).lower():
                    print("Warning: 'n' in background sequence for " +
                          str(peak.name))
                    valid = False
                    invalid += 1
                if valid:
                    neg_peaks[neg_peak] += 1
                    pos_seq = Seq(
                        str(genome[peak.iv.chrom]
                            [peak.iv.start:peak.iv.end].seq), generic_dna)
                    if peak.iv.strand == "-":
                        pos_seq = pos_seq.reverse_complement()
                    neg_seq = Seq(
                        str(genome[neg_peak.chrom]
                            [neg_peak.start:neg_peak.end].seq), generic_dna)
                    if neg_peak.strand == "-":
                        neg_seq = neg_seq.reverse_complement()
                    pos_seq = str(pos_seq)
                    neg_seq = str(neg_seq)
                    if graphprot_compatible:
                        # GraphProt viewpoint format: flanks lower-case,
                        # core upper-case.
                        pos_seq = pos_seq[:padding].lower(
                        ) + pos_seq[padding:-padding].upper(
                        ) + pos_seq[-padding:].lower()
                        neg_seq = neg_seq[:padding].lower(
                        ) + neg_seq[padding:-padding].upper(
                        ) + neg_seq[-padding:].lower()
                    pos_seqs.append(pos_seq)
                    neg_seqs.append(neg_seq)
                    seq_ids.append(peak.name)
            elif len(gset) == 0:
                not_in_gene += 1
            elif len(gset) > 1:
                multiple_genes += 1
        elif overlaps > 1 and valid:
            redundant += 1
    print(" Found {} invalid peaks (too short or too long).".format(invalid))
    print(" Found {} valid but redundant peaks.".format(redundant))
    print(
        " Found {} non-redundant peaks that did not overlap any genes, and {} that overlapped multiple genes."
        .format(not_in_gene, multiple_genes))
    print(" Found {} valid non-redundant peaks overlapping genes.".format(
        len(pos_seqs)))
    return pos_seqs, neg_seqs, seq_ids
def main(argv):
    """Run the single-library SICER pipeline (Python 2 script).

    Sorts and de-duplicates the input BED, partitions the genome into
    windows, finds candidate islands by E-value, filters reads by the
    islands, and writes scoreisland/graph/bedgraph/filtered-bed outputs.
    """
    parser = OptionParser()
    parser.add_option("-b", "--file", action="store", type="string",
                      dest="file_name", metavar="<file>",
                      help="name of bed file (not including .bed extension)")
    parser.add_option("-g", "--genome", action="store", type="string",
                      dest="genome_data", metavar="<file>",
                      help="name of reference genome (mm9 for mouse)")
    parser.add_option("-r", "--redundancy", action="store", type="int",
                      dest="redundancy", metavar="<file>",
                      help="redundancy threshold")
    parser.add_option(
        "-w", "--window_size", action="store", type="int",
        dest="window_size", metavar="<int>",
        help=
        "size of windows used to partition genome (200 for histones, 50 for TFs"
    )
    parser.add_option(
        "-f", "--fragment_size", action="store", type="int",
        dest="fragment_size", metavar="<int>",
        help=
        "fragment size determines the shift (half of fragment_size of ChIP-seq read position, in bps)"
    )
    parser.add_option("-p", "--genome_fraction", action="store", type="float",
                      dest="genome_fraction", metavar="<int>",
                      help="effective genome fraction: 0.8 in most cases")
    parser.add_option(
        "-s", "--gap_size", action="store", type="int",
        dest="gap_size", metavar="<int>",
        help=
        "maximum number of base pairs between windows in the same island (usually same as window size)"
    )
    parser.add_option("-e", "--e-value", action="store", type="string",
                      dest="e_value", metavar="<string>",
                      help="e-value used to determine significance")
    parser.add_option("-i", "--input_dir", action="store", type="string",
                      dest="input_dir", metavar="<string>",
                      help="path to input directory")
    parser.add_option("-o", "--output_dir", action="store", type="string",
                      dest="output_dir", metavar="<string>",
                      help="path to output directory")
    parser.add_option("-a", "--SICER_dir", action="store", type="string",
                      dest="sicer_dir", metavar="<string>",
                      help="path to directory containing SICER files")
    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)
    #create string names for files
    #remove .bed extension
    file_name = opt.file_name[:-4]
    bed_file_name = opt.input_dir + "/" + opt.file_name
    sorted_bed_file_name = opt.output_dir + "/" + file_name + "_sorted_temp.bed"
    # This file stores the preprocessed raw bed file.
    red_rem_bed_file_name = opt.output_dir + "/" + file_name + "-" + str(
        opt.redundancy) + "-removed.bed"
    # This file stores the candidate islands.
    score_island_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + ".scoreisland"
    # This file stores the summary graph.
    graph_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + ".graph"
    # This file stores the island-filtered non-redundant raw reads
    island_filtered_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + "-E" + str(
            opt.e_value) + "-islandfiltered.bed"
    # This file stores the sample summary graph in bedgraph format
    normalized_bedgraph_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-normalized.bedgraph"
    # This file stores normalized summary graph made by the island-filtered non-redundant raw reads in bedgraph format
    islandfiltered_normalized_bedgraph_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + "-E" + str(
            opt.e_value) + "-islandfiltered-normalized.bedgraph"
    genome_file = opt.sicer_dir + "/genomes/" + opt.genome_data
    # read genome data from file containing genome data
    # store genome data in the dictionary genome
    genome = SICER_MS.get_genome_data(genome_file)
    # convert E_value to float
    e_value = float(opt.e_value)
    # sort bed file by chromosome, then by coordinate, then by strand
    print "\nSorting BED file..."
    SICER_MS.sort_bed_file(bed_file_name, sorted_bed_file_name)
    # remove redundant reads in bed file and count number of total reads and number of retained reads
    print "\nPreprocess the sorted BED file to remove redundancy with threshold " + str(
        opt.redundancy) + "..."
    total, retained = SICER_MS.remove_redundant_reads_bed(
        sorted_bed_file_name, red_rem_bed_file_name, opt.redundancy, genome)
    print "Total reads: " + str(total) + "\nTotal retained reads: " + str(
        retained) + "\n\n"
    # remove sorted bed file
    os.system('rm %s' % (sorted_bed_file_name))
    # create HTSeq bed reader that can iterate through all of the reads
    bed_iterator = HTSeq.BED_Reader(red_rem_bed_file_name)
    print "Partition the genome in windows... \n"
    # make dictionary of reads and windows and count total reads
    # read_dict: keys are chromosomes and values are a list of read positions
    # window_dict: keys are chromosomes and values are a list of window start coordinates for windows containing reads
    read_dict, window_dict, total_reads = SICER_MS.make_dict_of_reads_and_windows(
        bed_iterator, genome, opt.fragment_size, opt.window_size)
    print "Count reads in windows... \n"
    # calculate the number of island reads in all the windows comprising the islands
    # calculate normalized read count for each window
    # add the window's normalized read count to a genomic array (island_normalized_window_array)
    # the island_normalized_window_array will be used to write a bedgraph file
    window_counts_dict, normalized_window_array = SICER_MS.get_window_counts_and_normalize(
        window_dict, read_dict, genome, 1000000, total_reads,
        opt.window_size)
    # write bedgraph file of normalized islands
    normalized_window_array.write_bedgraph_file(normalized_bedgraph_file_name)
    print "Find candidate islands exhibiting clustering... \n"
    # finds all islands using the dictionary of window counts and generates .scoreisland file
    # returns a genomic array island_array of all island tag counts and a list of islands (in dictionary format)
    # the dictionary keys of each island are 'island', 'score', and 'chip' (the read count)
    # also writes graph file
    island_array, islands_list = SICER_MS.find_islands(
        window_counts_dict, total_reads, opt.gap_size, opt.window_size,
        genome, opt.genome_fraction, e_value, score_island_file_name,
        graph_file_name, 2)
    print "\nFilter reads with identified significant islands...\n"
    # given HTSeq bed_iterator and HTSeq Genomic Array that has chip read count assigned to all islands
    # finds all reads in the bed_iterator that are located in islands
    # if a read is located in an island, it is written to a bed file
    # creates a genomic array of all windows that have reads located in islands
    # returns a dictionary containing all reads located in islands and a dictionary containing all windows in islands
    # dictionary format: keys are chromosomes, values are sorted lists of all read/window positions
    islandfiltered_reads_dict, islandfiltered_windows_dict, total_reads_in_islands = SICER_MS.filter_raw_tags_by_islands(
        bed_iterator, island_array, island_filtered_file_name,
        opt.fragment_size, opt.window_size, genome)
    # calculate the number of island filtered reads in all the windows comprising the islands
    # calculate normalized read count for each window
    # add the window's normalized read count to a genomic array (islandfilt_normalized_window_array)
    # the islandfilt_normalized_window_array will be used to write a bedgraph file
    islandfiltered_window_counts_dict, islandfiltered_normalized_window_array = SICER_MS.get_window_counts_and_normalize(
        islandfiltered_windows_dict, islandfiltered_reads_dict, genome,
        1000000, total_reads_in_islands, opt.window_size)
    # write bedgraph file of normalized filtered islands
    islandfiltered_normalized_window_array.write_bedgraph_file(
        islandfiltered_normalized_bedgraph_file_name)
def main(argv):
    """Run the SICER pipeline with a control library (Python 2 script).

    Like the single-library pipeline, but additionally sorts/de-duplicates a
    control BED file, computes per-island p-values, fold changes and alpha
    values against the control, and keeps only islands passing the FDR
    threshold before read filtering.
    """
    parser = OptionParser()
    parser.add_option("-b", "--file", action="store", type="string",
                      dest="file_name", metavar="<file>",
                      help="name of bed file")
    parser.add_option("-c", "--control", action="store", type="string",
                      dest="control_file_name", metavar="<file>",
                      help="name of control bed file")
    parser.add_option("-g", "--genome", action="store", type="string",
                      dest="genome_data", metavar="<file>",
                      help="name of reference genome (mm9 for mouse)")
    parser.add_option("-r", "--redundancy", action="store", type="int",
                      dest="redundancy", metavar="<int>",
                      help="redundancy threshold")
    parser.add_option(
        "-w", "--window_size", action="store", type="int",
        dest="window_size", metavar="<int>",
        help=
        "size of windows used to partition genome (200 for histones, 50 for TFs"
    )
    parser.add_option(
        "-f", "--fragment_size", action="store", type="int",
        dest="fragment_size", metavar="<int>",
        help=
        "fragment size determines the shift (half of fragment_size of ChIP-seq read position, in bps)"
    )
    parser.add_option("-p", "--genome_fraction", action="store", type="float",
                      dest="genome_fraction", metavar="<int>",
                      help="effective genome fraction: 0.8 in most cases")
    parser.add_option(
        "-s", "--gap_size", action="store", type="int",
        dest="gap_size", metavar="<int>",
        help=
        "maximum number of base pairs between windows in the same island (usually same as window size)"
    )
    parser.add_option("-d", "--FDR", action="store", type="string",
                      dest="FDR", metavar="<string>",
                      help="false discovery rate controlling significance")
    parser.add_option("-i", "--input_dir", action="store", type="string",
                      dest="input_dir", metavar="<string>",
                      help="path to input directory")
    parser.add_option("-o", "--output_dir", action="store", type="string",
                      dest="output_dir", metavar="<string>",
                      help="path to output directory")
    parser.add_option("-a", "--SICER_dir", action="store", type="string",
                      dest="sicer_dir", metavar="<string>",
                      help="path to directory containing SICER files")
    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)
    # strip the ".bed" extension from both input names
    file_name = opt.file_name[:-4]
    control_file_name = opt.control_file_name[:-4]
    # create string names for files
    bed_file_name = opt.input_dir + "/" + opt.file_name
    control_bed_file_name = opt.input_dir + "/" + opt.control_file_name
    sorted_bed_file_name = opt.output_dir + "/" + file_name + "_sorted_temp.bed"
    # This file stores the preprocessed raw bed file.
    red_rem_bed_file_name = opt.output_dir + "/" + file_name + "-" + str(
        opt.redundancy) + "-removed.bed"
    island_bed_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + "-FDR" + str(
            opt.FDR) + "-island.bed"
    sorted_control_file_name = opt.output_dir + "/" + control_file_name + "_sorted_temp.bed"
    # This file stores the preprocessed raw bed control file.
    red_rem_control_file_name = opt.output_dir + "/" + control_file_name + "-" + str(
        opt.redundancy) + "-removed.bed"
    # This file stores the sample summary graph in bedgraph format
    normalized_bedgraph_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-normalized.bedgraph"
    # This file stores the control summary graph in bedgraph format
    control_normalized_bedgraph_file_name = opt.output_dir + "/" + control_file_name + "-W" + str(
        opt.window_size) + "-normalized.bedgraph"
    # This file stores the candidate islands.
    score_island_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + ".scoreisland"
    # These files store the summary graphs.
    graph_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + ".graph"
    control_graph_file_name = opt.output_dir + "/" + control_file_name + "-W" + str(
        opt.window_size) + ".graph"
    # This file stores the island-filtered non-redundant raw reads
    island_filtered_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + "-FDR" + str(
            opt.FDR) + "-islandfiltered.bed"
    # This file stores normalized summary graph made by the island-filtered non-redundant raw reads in bedgraph format
    islandfiltered_normalized_bedgraph_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + "-FDR" + str(
            opt.FDR) + "-islandfiltered-normalized.bedgraph"
    # This file stores the summary of candidate islands, including chrom start end read-count_sample read-count-control pvalue, fold change and qvalue
    islandsummary_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(opt.gap_size) + "-islands-summary"
    # This file stores the summary of significant islands identified with FDR criterion.
    filtered_island_file_name = opt.output_dir + "/" + file_name + "-W" + str(
        opt.window_size) + "-G" + str(
            opt.gap_size) + "-islands-summary-FDR" + str(opt.FDR)
    # convert FDR to float
    FDR = float(opt.FDR)
    genome_file = opt.sicer_dir + "/genomes/" + opt.genome_data
    # read genome data from file containing genome data
    # store genome data in the dictionary genome
    genome = SICER_MS.get_genome_data(genome_file)
    # number of islands expected in random background.
    # the E value is used for identification of candidate islands that exhibit clustering.
    e_value = 1000
    # sort bed file by chromosome, then by coordinate, then by strand
    print "\nSorting BED file..."
    SICER_MS.sort_bed_file(bed_file_name, sorted_bed_file_name)
    # sort control file by chromosome, then by coordinate, then by strand
    print "Sorting control BED file..."
    SICER_MS.sort_bed_file(control_bed_file_name, sorted_control_file_name)
    # remove redundant reads in bed file and count number of total reads and number of retained reads
    print "\nPreprocess the sorted BED file to remove redundancy with threshold " + str(
        opt.redundancy) + "..."
    total, retained = SICER_MS.remove_redundant_reads_bed(
        sorted_bed_file_name, red_rem_bed_file_name, opt.redundancy, genome)
    print "Total reads: " + str(total) + "\nTotal retained reads: " + str(
        retained)
    # remove redundant reads in control file and count number of total reads and number of retained reads
    print "\nPreprocess the sorted control file to remove redundancy with threshold " + str(
        opt.redundancy) + "..."
    control_total, control_retained = SICER_MS.remove_redundant_reads_bed(
        sorted_control_file_name, red_rem_control_file_name, opt.redundancy,
        genome)
    print "Control file total reads: " + str(
        control_total) + "\nControl file total retained reads: " + str(
            control_retained) + "\n \n"
    os.system('rm %s %s' % (sorted_bed_file_name, sorted_control_file_name))
    # create HTSeq bed readers that can iterate through all of the reads
    bed_iterator = HTSeq.BED_Reader(red_rem_bed_file_name)
    control_bed_iterator = HTSeq.BED_Reader(red_rem_control_file_name)
    print "Partition the genome in windows... \n"
    # make dictionary of reads and windows and count total reads
    # read_dict: keys are chromosomes and values are a list of read positions
    # window_dict: keys are chromosomes and values are a list of window start coordinates for windows containing reads
    read_dict, window_dict, total_reads = SICER_MS.make_dict_of_reads_and_windows(
        bed_iterator, genome, opt.fragment_size, opt.window_size)
    # make dictionary of reads and windows and count total reads for control file
    control_read_dict, control_window_dict, control_total_reads = SICER_MS.make_dict_of_reads_and_windows(
        control_bed_iterator, genome, opt.fragment_size, opt.window_size)
    print "Count reads in windows... \n"
    # get the read count and normalized read count of all windows in the bed file
    # create window counts dictionary window_counts_dict
    # add the window's score to the genomic array normalized_window_array
    # window_counts_dict: keys are chromosomes and values are a list of smaller
    # lists of the format [window_start, read_count, score] (the score will be calculated later)
    window_counts_dict, normalized_window_array = SICER_MS.get_window_counts_and_normalize(
        window_dict, read_dict, genome, 1000000, total_reads,
        opt.window_size)
    # get the read count and score of all windows in the control file file
    control_window_counts_dict, control_normalized_window_array = SICER_MS.get_window_counts_and_normalize(
        control_window_dict, control_read_dict, genome, 1000000,
        control_total_reads, opt.window_size)
    # write bedgraph file of normalized windows
    normalized_window_array.write_bedgraph_file(normalized_bedgraph_file_name)
    # write bedgraph file of normalized windows for control
    control_normalized_window_array.write_bedgraph_file(
        control_normalized_bedgraph_file_name)
    # write graph file for control reads
    SICER_MS.write_graph_file(control_window_counts_dict, opt.window_size,
                              control_graph_file_name, genome)
    print "Find candidate islands exhibiting clustering... \n"
    # finds all islands using the dictionary of window counts and generates .scoreisland file
    # returns a genomic array island_array of all island tag counts and a list of islands (in dictionary format)
    # the dictionary keys of each island are 'island', 'score', and 'chip' (the read count)
    # also writes graph file
    island_array, islands_list = SICER_MS.find_islands(
        window_counts_dict, total_reads, opt.gap_size, opt.window_size,
        genome, opt.genome_fraction, e_value, score_island_file_name,
        graph_file_name, 2)
    # count the number of reads in the islands for both chip and control
    # returns updated list of islands including chip and control read counts and the total reads located in islands for
    # both island dictionaries
    islands_list, total_chip_reads_in_islands, total_control_reads_in_islands = SICER_MS.count_reads_in_islands(
        islands_list, read_dict, control_read_dict)
    print "Total chip reads in islands: " + str(total_chip_reads_in_islands)
    print "Total control reads in islands: " + str(
        total_control_reads_in_islands)
    # calculate the p-value and fold change (number of chip reads versus number of expected chip reads) for all islands
    # calculate alpha value for all islands
    # write island summary file
    # return list of islands islands_list; each island is a dictionary with keys 'island' (HTSeq genomic interval),
    # 'chip' (number of chip reads), 'control' (number of control reads), 'pvalue', 'fc' (fold change), and 'alpha'
    # also return HTSeq Genomic Array of all islands with their chip read count
    islands_list, island_array = SICER_MS.get_pvalue_fc_write_islandsummary(
        islands_list, total_reads, control_total_reads, opt.genome_fraction,
        genome, islandsummary_file_name)
    print "\nIdentify significant islands using FDR criterion..."
    # given list of islands as dictionaries, filter all islands with alpha values meeting the significance threshold to write two files
    # write filtered island file (format: chr start end chip_reads control_reads pvalue fc alpha)
    # write island bed file (format: chr start end chip_reads)
    filtered_islands_list, filtered_island_array = SICER_MS.filter_islands_by_significance(
        islands_list, filtered_island_file_name, island_bed_file_name, FDR,
        genome)
    print "\nFilter reads with identified significant islands...\n"
    # given HTSeq bed_iterator and HTSeq Genomic Array that has chip read count assigned to all islands
    # finds all reads in the bed_iterator that are located in islands
    # if a read is located in an island, it is written to a bed file
    # creates a genomic array of all windows that have reads located in islands
    # returns a dictionary containing all reads located in islands and a dictionary containing all windows in islands
    # dictionary format: keys are chromosomes, values are sorted lists of all read/window positions
    islandfiltered_reads_dict, islandfiltered_windows_dict, total_chip_reads_in_islands = SICER_MS.filter_raw_tags_by_islands(
        bed_iterator, filtered_island_array, island_filtered_file_name,
        opt.fragment_size, opt.window_size, genome)
    # calculate the number of island filtered reads in all the windows comprising the islands
    # calculate normalized read count for each window
    # add the window's normalized read count to a genomic array (islandfilt_normalized_window_array)
    # the islandfilt_normalized_window_array will be used to write a bedgraph file
    islandfilt_window_counts_dict, islandfilt_normalized_window_array = SICER_MS.get_window_counts_and_normalize(
        islandfiltered_windows_dict, islandfiltered_reads_dict, genome,
        1000000, total_chip_reads_in_islands, opt.window_size)
    # write bedgraph file of normalized filtered islands
    islandfilt_normalized_window_array.write_bedgraph_file(
        islandfiltered_normalized_bedgraph_file_name)
def read_bed(ant_file, stranded=True):
    """Load a BED annotation file into an HTSeq GenomicArrayOfSets.

    Each feature's name is added to the set covering its interval, so
    overlapping features accumulate rather than overwrite each other.
    """
    annotations = HTSeq.GenomicArrayOfSets("auto", stranded=stranded)
    for feat in HTSeq.BED_Reader(ant_file):
        annotations[feat.iv] += feat.name
    return annotations
def main(argv):
    # Island Union Program: merge two island BED files into their union.
    # Concatenates both inputs, sorts them, then coalesces overlapping
    # intervals and writes one interval per union island to the output file.
    #establish options for running program
    parser = OptionParser()
    parser.add_option("-a", "--islandfile1", action="store", type="string",
                      dest="file1", metavar="<file>",
                      help="name of first islands file")
    parser.add_option("-b", "--islandfile2", action="store", type="string",
                      dest="file2", metavar="<file>",
                      help="name of second islands file")
    parser.add_option("-o", "--outputfile", action="store", type="string",
                      dest="outfile", metavar="<file>",
                      help="output file name")
    (opt, args) = parser.parse_args(argv)
    print "########## Island Union Program ##########"
    #add file1 islands to tempfile
    os.system('cat %s > %s' % (opt.file1, "tempfile.bed"))
    #add file2 islands to tempfile
    os.system('cat %s >> %s' % (opt.file2, "tempfile.bed"))
    #sort tempfile and store in sortedfile
    # NOTE(review): '-k2,3n' sorts on the span of fields 2-3 as one numeric
    # key; the conventional per-field BED sort is '-k2,2n' -- confirm intent.
    os.system('sort -k1,1 -k2,3n %s > %s' % ("tempfile.bed", "sortedfile.bed"))
    #instantiate HTSeq bediterator
    bed_iterator = HTSeq.BED_Reader("sortedfile.bed")
    outfile = open(opt.outfile, 'w')
    total_islands = 0
    # tempGI trails currentGI by one interval; an island is written only when
    # the next interval does not overlap it.
    tempGI = None
    currentGI = None
    #iterate through GenomicInterval objects
    for read in bed_iterator:
        if tempGI is None:
            currentGI = read.iv
            tempGI = read.iv
        else:
            tempGI = currentGI
            currentGI = read.iv
        #use genomicInterval overlaps method
        if tempGI.overlaps(currentGI):
            # grow the current island to cover the previous interval
            currentGI.extend_to_include(tempGI)
        else:
            newLine = str(tempGI.chrom) + "\t" + str(
                tempGI.start) + "\t" + str(tempGI.end) + "\n"
            outfile.write(newLine)
            total_islands += 1
    #add last entry to union file
    # NOTE(review): if both input files are empty, currentGI is still None
    # here and this raises AttributeError -- confirm inputs are never empty.
    newLine = str(currentGI.chrom) + "\t" + str(currentGI.start) + "\t" + str(
        currentGI.end) + "\n"
    outfile.write(newLine)
    total_islands += 1
    outfile.close()
    #remove tempfile.bed and sortedfile.bed
    os.system('rm %s %s' % ("tempfile.bed", "sortedfile.bed"))
    print "Total number of islands in islands_union_file: " + str(
        total_islands)
def main(argv):
    # Condense a paired-end BAM file into a single-ended BED file: each mate
    # pair on the same chromosome is collapsed to a 1-bp read placed at the
    # midpoint of the fragment the pair spans.
    parser = OptionParser()
    parser.add_option(
        "-b",
        "--bam_file",
        action="store",
        type="string",
        dest="bamfile",
        help="paired-end bam file to be condensed into a single-ended bam file"
    )
    (opt, args) = parser.parse_args(argv)
    #Sortng inputted paired-end BAM file by name, so paired reads are adjacent
    # NOTE(review): bam_reader is created but never used afterwards.
    bam_reader = HTSeq.BAM_Reader(opt.bamfile)
    os.system('samtools sort -O BAM -n %s > %s' %
              (opt.bamfile, opt.bamfile[:-4] + "_sorted.bam"))
    #BAM file is converted to BED format
    os.system(
        'bamToBed -i %s > %s' %
        (opt.bamfile[:-4] + "_sorted.bam", opt.bamfile[:-4] + "_sorted.bed"))
    #BED iterator created to traverse BED file
    bed_iterator = HTSeq.BED_Reader(opt.bamfile[:-4] + "_sorted.bed")
    # NOTE(review): outfile is never closed in this function.
    outfile = open(opt.bamfile[:-4] + "_sorted_condensed.bed", 'w')
    #algorithmic logic -> combine paired reads into a single read
    pair1 = None
    pair2 = None
    oddRead = None
    new_start = 0
    new_end = 0
    pos = 0
    new_strand = ''
    singleRead_iv = None
    line_count = 0
    error_count = 0
    finalcount = 0
    for read in bed_iterator:
        # '+line_count + 1' uses a unary plus; equivalent to line_count + 1
        line_count = +line_count + 1
        if line_count % 2 != 0:
            # odd line: stash the first mate of the pair
            oddRead = read.iv
        elif line_count % 2 == 0:
            # even line: we now have both mates
            pair1 = oddRead
            pair2 = read.iv
            print "Read 1 chr: " + str(
                pair1.chrom) + " and Read 2 chr: " + str(pair2.chrom)
            if str(pair1.chrom) == str(pair2.chrom):
                #determines start and end of single read
                new_start = min([pair1.start, pair2.start])
                new_end = max([pair1.end, pair2.end])
                #position calculation
                # NOTE(review): under Python 2 this is integer division, so
                # pos is always a whole int and the fractional-adjustment
                # branch below can never fire -- confirm intended semantics.
                pos = (new_start + new_end) / 2
                if pos % 1 != 0:
                    pos = pos - 0.5
                #decides proper strand for new single read (strand is that of leftmost read in pair)
                if pair1.start < pair2.start:
                    new_strand = pair1.strand
                else:
                    new_strand = pair2.strand
                #creates new single read with length 1 bp
                singleRead_iv = HTSeq.GenomicInterval(pair1.chrom, pos,
                                                      pos + 1, new_strand)
                #writes read to output BED file
                write_to_outfile(outfile, singleRead_iv)
                finalcount = finalcount + 1
            else:
                print "Error: paired reads not on same chromosome."
                error_count = error_count + 1
    print "pairedRead to singleRead conversion is done running.\nThere were " + str(
        error_count) + " pairs on different chromosomes"
    print "Reads written to outfile: " + str(finalcount)
def main(argv): parser = OptionParser() parser.add_option("-g", "--genome", action="store", type="string", dest="genome_data", help="species, mm9, hg18, etc", metavar="<str>") parser.add_option("-a", "--rawreadfileA", action="store", type="string", dest="readfileA", metavar="<file>", help="raw read file A in bed format") parser.add_option("-b", "--rawreadfileB", action="store", type="string", dest="readfileB", metavar="<file>", help="raw read file B in bed format") parser.add_option("-f", "--fragment_size", action="store", type="int", dest="fragment_size", metavar="<int>", help="average size of a fragment after A experiment") parser.add_option("-d", "--islandfile", action="store", type="string", dest="islandfile", metavar="<file>", help="island file in BED format") parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="island read count summary file") (opt, args) = parser.parse_args(argv) if len(argv) < 12: parser.print_help() sys.exit(1) # create HTSeq BED_Readers for BED files file_A_iterator = HTSeq.BED_Reader(opt.readfileA) file_B_iterator = HTSeq.BED_Reader(opt.readfileB) island_file_iterator = HTSeq.BED_Reader(opt.islandfile) genome_file = opt.sicer_dir + "/genomes/" + opt.genome_data genome = get_genome_data(genome_file) read_dict_A, A_library_size = make_dict_of_reads(file_A_iterator, genome, opt.fragment_size) read_dict_B, B_library_size = make_dict_of_reads(file_B_iterator, genome, opt.fragment_size) print "Library size of " + opt.readfileA + ": " + str(A_library_size) print "Library size of " + opt.readfileB + ": " + str(B_library_size) A_reads_in_islands = 0 B_reads_in_islands = 0 islands_list = [] island_A_readcount_list = [] island_B_readcount_list = [] # Find read counts on the islands for region in island_file_iterator: read_count_A = get_read_count_in_region(region.iv, read_dict_A) A_reads_in_islands += read_count_A island_A_readcount_list.append(read_count_A) read_count_B = 
get_read_count_in_region(region.iv, read_dict_B) B_reads_in_islands += read_count_B island_B_readcount_list.append(read_count_B) island = {'region': region.iv, 'A_count': read_count_A, 'B_count': read_count_B} islands_list.append(island) pvalue_A_vs_B_list = [] pvalue_B_vs_A_list = [] print "Total number of A reads on islands is: " + str(A_reads_in_islands) print "Total number of B reads on islands is: " + str(B_reads_in_islands) library_scaling_factor = A_library_size * 1.0 / B_library_size pseudo_count = 1 pvalue_A_vs_B_list = [] pvalue_B_vs_A_list = [] # Calculate the p value. for island in islands_list: A_count = island['A_count'] B_count = island['B_count'] pvalue_A_vs_B = pvalue(A_count, B_count, library_scaling_factor, pseudo_count) pvalue_A_vs_B_list.append(pvalue_A_vs_B) pvalue_B_vs_A = pvalue(B_count, A_count, 1 / library_scaling_factor, pseudo_count) pvalue_B_vs_A_list.append(pvalue_B_vs_A) # Calculate the FDR fdr_A_vs_B_list = fdr(pvalue_A_vs_B_list) fdr_B_vs_A_list = fdr(pvalue_B_vs_A_list) # Output the islands read counts, normalized read counts, fc, pvalue both ways scaling_factor = 1000000 outfile = open(opt.out_file, 'w') outline = '#chrom' + "\t" + 'start' + "\t" + 'end' + "\t" + "Readcount_A" + "\t" + 'Normalized_Readcount_A' + "\t" \ + 'ReadcountB' + "\t" + 'Normalized_Readcount_B' + "\t" + "Fc_A_vs_B" + "\t" + "pvalue_A_vs_B" + "\t" + \ "FDR_A_vs_B" + "\t" + "Fc_B_vs_A" + "\t" + "pvalue_B_vs_A" + "\t" + "FDR_B_vs_A" + "\n" outfile.write(outline) ii = 0 for island in islands_list: A_count = island['A_count'] B_count = island['B_count'] normalized_A = A_count / float(A_library_size) * scaling_factor normalized_B = B_count / float(B_library_size) * scaling_factor fc_A_vs_B = ((A_count + pseudo_count) * 1.0 / (B_count + pseudo_count)) / library_scaling_factor fc_B_vs_A = ((B_count + pseudo_count) * 1.0 / (A_count + pseudo_count)) * library_scaling_factor outline = island['region'].chrom + "\t" + str(island['region'].start) + "\t" + 
str(island['region'].end) + "\t" + str( A_count) + "\t" + str(normalized_A) + "\t" + str(B_count) + "\t" + str(normalized_B) + "\t" + str( fc_A_vs_B) + "\t" + str(pvalue_A_vs_B_list[ii]) + "\t" + str(fdr_A_vs_B_list[ii]) + "\t" + str( fc_B_vs_A) + "\t" + str(pvalue_B_vs_A_list[ii]) + "\t" + str(fdr_B_vs_A_list[ii]) + "\n" outfile.write(outline) ii += 1 # Calculate the correlations using normalized read counts A_array = () B_array = () A_array = scipy.array(island_A_readcount_list) B_array = scipy.array(island_B_readcount_list) # Normalization to reads per million A_array = A_array / float(A_library_size) * scaling_factor B_array = B_array / float(B_library_size) * scaling_factor pearson = scipy.stats.pearsonr(A_array, B_array) print "Pearson's correlation is: " + str(pearson[0]) + " with p-value " + str(pearson[1]) spearman = scipy.stats.spearmanr(A_array, B_array) print "Spearman's correlation is: " + str(spearman[0]) + " with p-value " + str(spearman[1])
# get genomic arrays
# BUG FIX: the original derived sample names with samp.strip('noM_peaks.bed'),
# but str.strip removes any leading/trailing characters from that SET (e.g. a
# leading 's' or 'a'), mangling sample names.  Remove the literal suffix
# instead.
def _sample_name(peak_file):
    # strip the trailing 'noM_peaks.bed' file suffix, if present
    suffix = 'noM_peaks.bed'
    if peak_file.endswith(suffix):
        return peak_file[:-len(suffix)]
    return peak_file

peakregions = dict()
peakcutoffs = dict()
samplenames = dict()
maxpeaklength = dict()
for samp in bedfiles:
    # get sample name
    samplenames[samp] = _sample_name(samp)
    output.write('\t' + _sample_name(samp))
    # get peak locations as a genomic array of per-base peak scores
    peakregions[samp] = HTSeq.GenomicArray("auto", stranded=False,
                                           typecode='d')
    maxpeaklength[samp] = 0
    scoreslist = list()
    peakfile = HTSeq.BED_Reader(beddirectory + samp)
    totbases = 0
    for peak in peakfile:
        peakregions[samp][peak.iv] = peak.score
        scoreslist.append(peak.score)
        # NOTE(review): BED intervals are half-open, so length is usually
        # end - start; the +1 here may overcount by one base -- confirm.
        peaklength = peak.iv.end - peak.iv.start + 1
        totbases += peaklength
        maxpeaklength[samp] = max(maxpeaklength[samp], peaklength)
    # find score cutoff for this particular library: the score of the
    # (maxpeaks)-th highest peak
    sortedscores = sorted(scoreslist)
    scorecutind = len(sortedscores) - maxpeaks - 1  # -1 for python indexing 0
    peakcutoffs[samp] = sortedscores[scorecutind]
    covoutput.write(_sample_name(samp) + '\t' + str(totbases) + '\n')
covoutput.close()
output.write('\n')
def load_bed(bed_path):
    """Check that *bed_path* exists, then return an HTSeq BED reader over it.

    The returned object is an iterable of BED features, used like the SAM
    reader elsewhere in the pipeline.
    """
    check_file_exist(bed_path)
    return HTSeq.BED_Reader(bed_path)