def get_bc_signal(arguments):
    """Accumulate bias-corrected signal around one motif's sites for two conditions.

    ``arguments`` is one tuple holding, in order: ``mpbs_name``, ``mpbs_file1``,
    ``mpbs_file2``, ``reads_file1``, ``reads_file2``, ``organism``,
    ``window_size``, ``forward_shift``, ``reverse_shift``, ``bias_table1`` and
    ``bias_table2``.

    Returns ``(signal_1, signal_2, motif_len, pwm, num_motif)`` where the two
    signals are numpy arrays of length ``window_size`` summed over all accepted
    regions, ``motif_len`` is the length of the first region named
    ``mpbs_name`` (``None`` if there is none), ``pwm`` is the dict filled by
    ``update_pwm`` and ``num_motif`` is the number of regions with that name.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift, bias_table1, bias_table2) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)
    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)
    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")
    genome_data = GenomeData(organism)
    genome_file_name = genome_data.get_genome()
    fasta = Fastafile(genome_file_name)

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = {base: [0.0] * window_size for base in ("A", "C", "G", "T", "N")}

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    try:
        # Fetch bias corrected signal
        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            # Floor division keeps the window coordinates integral; "/" would
            # produce floats under Python 3.
            mid = (region.final + region.initial) // 2
            p1 = mid - window_size // 2
            p2 = mid + window_size // 2

            if p1 <= 0:
                continue

            signal1 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam1,
                                      bias_table=bias_table1,
                                      genome_file_name=genome_file_name,
                                      forward_shift=forward_shift,
                                      reverse_shift=reverse_shift)
            signal2 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam2,
                                      bias_table=bias_table2,
                                      genome_file_name=genome_file_name,
                                      forward_shift=forward_shift,
                                      reverse_shift=reverse_shift)

            # Windows whose corrected signal has an unexpected length are
            # dropped from both the signal sums and the PWM.
            if len(signal1) != len(signal_1) or len(signal2) != len(signal_2):
                continue

            signal_1 = np.add(signal_1, np.array(signal1))
            signal_2 = np.add(signal_2, np.array(signal2))

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        # Release the file handles even if a region fails.
        bam1.close()
        bam2.close()
        fasta.close()

    return signal_1, signal_2, motif_len, pwm, num_motif
def read_states_signals(args):
    """Read the annotated state string and the matching signals for one region.

    Returns ``(states, norm_signal, slope_signal)``: ``states`` is the
    concatenation of the annotation tokens, the two signals are whatever
    ``GenomicSignal.get_signal`` returns for ``args.chrom``/``args.start``/
    ``args.end``.
    """
    # Read states from the annotation file. Lines that are too short or that
    # contain "#" or "=" are ignored; of the remaining lines the first and the
    # last space-separated token are dropped.
    state_tokens = []
    with open(args.annotate_file) as annotation:
        for raw_line in annotation:
            if len(raw_line) < 2 or "#" in raw_line or "=" in raw_line:
                continue
            fields = raw_line.strip().split(" ")
            state_tokens.extend(fields[1:-1])
    states = "".join(state_tokens)

    genome_data = GenomeData(args.organism)

    # Load the bias table only when one was provided ("forward,reverse").
    table = None
    if args.bias_table:
        bias_table = BiasTable()
        bias_table_list = args.bias_table.split(",")
        table = bias_table.load_table(table_file_name_F=bias_table_list[0],
                                      table_file_name_R=bias_table_list[1])

    # Get the normalization and slope signal from the raw bam file
    raw_signal = GenomicSignal(args.reads_file)
    raw_signal.load_sg_coefs(slope_window_size=9)
    signals = raw_signal.get_signal(args.chrom, args.start, args.end,
                                    args.downstream_ext, args.upstream_ext,
                                    args.forward_shift, args.reverse_shift,
                                    bias_table=table,
                                    genome_file_name=genome_data.get_genome())
    norm_signal, slope_signal = signals

    if args.print_bed_file:
        args.output_bed_file(states)

    return states, norm_signal, slope_signal
def __init__(self, rna_fasta, rna_name, dna_region, organism, showdbs=False):
    """Set up the RNA sequence, the target DNA regions and the statistics dict.

    :param rna_fasta: path to the FASTA file holding the RNA
    :param rna_name: name for the RNA; when falsy, the name of the first
        sequence in ``rna_fasta`` is used for ``self.rna_name``
    :param dna_region: BED file with the target regions
    :param organism: organism name handed to ``GenomeData``
    :param showdbs: stored as ``self.showdbs``
    """
    self.organism = organism
    self.genome_path = GenomeData(organism).get_genome()

    # RNA: Path to the FASTA file
    self.rna_fasta = rna_fasta
    self.showdbs = showdbs

    rna_sequences = SequenceSet(name="rna", seq_type=SequenceType.RNA)
    rna_sequences.read_fasta(self.rna_fasta)
    self.rna_name = rna_name if rna_name else rna_sequences[0].name

    # DNA: GenomicRegionSet, replaced by its gene-associated version
    self.dna_region = GenomicRegionSet(name="target")
    self.dna_region.read_bed(dna_region)
    associated = self.dna_region.gene_association(organism=self.organism,
                                                  show_dis=True)
    self.dna_region = associated

    self.topDBD = []
    # NOTE(review): "name" records the rna_name argument as passed in, not the
    # fallback stored in self.rna_name -- confirm this is intended.
    self.stat = OrderedDict(name=rna_name, genome=organism)
    self.stat["target_regions"] = str(len(self.dna_region))
def get_bc_signal(arguments):
    """Sum the bias-corrected signal over windows centred on a set of regions.

    ``arguments`` is one tuple holding, in order: ``mpbs_region`` (iterable of
    regions exposing ``chrom``, ``initial`` and ``final``), ``reads_file``
    (path of the BAM file), ``organism``, ``window_size``, ``forward_shift``,
    ``reverse_shift`` and ``bias_table``.

    Returns a numpy array of length ``window_size``. Regions whose window
    would start at or before position 0, or whose corrected signal does not
    have exactly ``window_size`` entries, contribute nothing.
    """
    (mpbs_region, reads_file, organism, window_size, forward_shift,
     reverse_shift, bias_table) = arguments

    bam = Samfile(reads_file, "rb")
    genome_data = GenomeData(organism)
    # The genome path is the same for every region, so look it up once.
    genome_file_name = genome_data.get_genome()
    signal = np.zeros(window_size)
    half_window = window_size // 2

    try:
        # Fetch bias corrected signal
        for region in mpbs_region:
            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            if p1 <= 0:
                continue

            _signal = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam,
                                      bias_table=bias_table,
                                      genome_file_name=genome_file_name,
                                      forward_shift=forward_shift,
                                      reverse_shift=reverse_shift)

            if len(_signal) != window_size:
                continue

            signal = np.add(signal, np.array(_signal))
    finally:
        # Close the BAM handle even when bias correction raises.
        bam.close()

    return signal
def main():
    """Command-line entry point: compute footprint signal and emit it.

    Returns 1 when the genome data is missing and the setup script fails,
    0 otherwise. Output goes to ``cArgs.output_file`` when given, else stdout,
    as JSON or (when both TSV output and aggregation are requested) as
    tab-separated rows.
    """
    cArgs = args()
    assembly = cArgs.assembly

    # Make sure the genome FASTA is available, downloading it if necessary.
    root = os.environ.get("RGTDATA", "/rgtdata")
    genome_fasta = root + "/{assembly}/genome_{assembly}.fa".format(assembly=assembly)
    if not os.path.exists(genome_fasta):
        print(
            "WARNING: genomic data is not present for {assembly}. We will attempt to download it.".format(assembly=assembly),
            file=sys.stderr
        )
        print(
            "If you are running many jobs, they might run faster if you mount the appropriate data at {root}/{assembly}.".format(root=root, assembly=assembly),
            file=sys.stderr
        )
        result = os.system("python3 /reg-gen/data/setupGenomicData.py --{assembly}".format(assembly=assembly))
        if result != 0:
            print("FATAL: Unable to load genome data for {assembly}.".format(assembly=assembly), file=sys.stderr)
            return 1

    # Compute the per-region signal, optionally on occurrence-filtered regions.
    unfiltered = cArgs.occurrence_threshold is None
    if unfiltered:
        signal = footprint(cArgs.bam, cArgs.bed, assembly, cArgs.ext_size,
                           cArgs.dnase, cArgs.bias_type)
    else:
        g = GenomeData(organism=assembly)
        with FilteredRegions(cArgs.bed, cArgs.occurrence_threshold,
                             g.get_genome(), g.get_chromosome_sizes()) as b:
            signal = footprint(cArgs.bam, b.name, assembly, cArgs.ext_size,
                               cArgs.dnase, cArgs.bias_type)

    # Aggregation is required for plotting as well as when asked for directly.
    wants_plot = cArgs.plot_output is not None
    if cArgs.aggregate or wants_plot:
        if unfiltered:
            group_key = lambda x: "all"
        else:
            group_key = lambda x: x["name"]
        signal = aggregate(signal, group_key, cArgs.ext_size)
    if wants_plot:
        plot(signal["all"]["forward"], signal["all"]["reverse"],
             cArgs.font, cArgs.plot_output)

    def output_rows():
        # Lazily yields the output records, without trailing newlines.
        if not cArgs.output_as_tsv or not cArgs.aggregate:
            yield json.dumps(signal)
            return
        for k, v in signal.items():
            if k != "all":
                yield "%s\t%s\t%s" % (
                    k,
                    ','.join([str(x) for x in v["forward"]]),
                    ','.join([str(x) for x in v["reverse"]])
                )

    if cArgs.output_file is None:
        for row in output_rows():
            print(row)
    else:
        with open(cArgs.output_file, 'w') as o:
            for row in output_rows():
                o.write(row + '\n')

    return 0
class MatchTest(unittest.TestCase):
    """Checks ``match_multiple`` against known CTCF hits on hg19 chr1."""

    def setUp(self):
        # the genome must be available
        # TODO: we could make this test pure by manually using the sequence
        # corresponding to the input region
        self.genome_data = GenomeData("hg19")
        self.genome_file = Fastafile(self.genome_data.get_genome())

    def test_match_multiple(self):
        """Scanning chr1:710000-715000 yields one hit on each strand."""
        here = os.path.dirname(__file__)
        jaspar_dir = "../../data/motifs/jaspar_vertebrates/"

        scanner = scan.Scanner(7)

        ctcf = Motif(os.path.join(here, jaspar_dir, "MA0139.1.CTCF.pwm"),
                     1, 0.0001, None)

        # Forward matrix first, reverse-complement matrix second; thresholds
        # follow the same order.
        score_thresholds = [ctcf.threshold, ctcf.threshold_rc]
        matrices = [ctcf.pssm, ctcf.pssm_rc]

        background = tools.flat_bg(4)
        scanner.set_motifs(matrices, background, score_thresholds)

        window = GenomicRegion("chr1", 710000, 715000)

        # Reading sequence associated to the window
        fetched = self.genome_file.fetch(window.chrom, window.initial, window.final)
        sequence = str(fetched)

        hits = match_multiple(scanner, [ctcf], sequence, window)

        expected = [
            GenomicRegion("chr1", 714270, 714289,
                          name="MA0139.1.CTCF", orientation="+"),
            GenomicRegion("chr1", 714180, 714199,
                          name="MA0139.1.CTCF", orientation="-"),
        ]
        self.assertSequenceEqual(hits.sequences, expected)
class MatchTest(unittest.TestCase):
    """Checks ``match_multiple`` against known CTCF hits in a local FASTA fixture."""

    def setUp(self):
        # the genome must be available
        # TODO: we could make this test pure by manually using the sequence
        # corresponding to the input region
        self.genome_data = GenomeData("hg19")
        # Point the genome at the FASTA file shipped next to this test.
        fixture = os.path.join(os.path.dirname(__file__),
                               "hg19_chr1_710000_715000.fa")
        self.genome_data.genome = fixture
        self.genome_file = Fastafile(self.genome_data.get_genome())

    def test_match_multiple(self):
        """Scanning the fixture yields one hit on each strand."""
        motif_set = MotifSet(preload_motifs="default")
        motif_set = motif_set.filter(
            {'database': ["jaspar_vertebrates"], 'name': ["MA0139.1.CTCF"]},
            search="inexact")
        self.assertEqual(len(motif_set), 1)

        ctcf = motif_set.get_motif_list(1, 0.0001)[0]

        scanner = scan.Scanner(7)

        # Forward matrix first, reverse-complement matrix second; the same
        # threshold is used for both.
        score_thresholds = [ctcf.threshold, ctcf.threshold]
        matrices = [ctcf.pssm, ctcf.pssm_rc]

        background = tools.flat_bg(4)
        scanner.set_motifs(matrices, background, score_thresholds)

        window = GenomicRegion("chr1", 0, 5022)

        # Reading sequence associated to the window
        fetched = self.genome_file.fetch(window.chrom, window.initial, window.final)
        sequence = str(fetched)

        hits = match_multiple(scanner, [ctcf], sequence, window)

        expected = [
            GenomicRegion("chr1", 4270, 4289, name="MA0139.1.CTCF", orientation="+"),
            GenomicRegion("chr1", 4180, 4199, name="MA0139.1.CTCF", orientation="-"),
        ]
        self.assertSequenceEqual(hits.sequences, expected)
def get_dbss(input_BED,output_BED,rna_fasta,output_rbss,organism,l,e,c,fr,fm,of,mf,rm,temp):
    """Find triplexes between an RNA and target regions and write the results.

    Writes the binding events to ``output_BED`` and to a sibling file with the
    ``.bed`` part of the name replaced by ``.txp``, then passes the RNA
    binding sites to ``dbd_regions`` with ``output_rbss`` as output.
    ``rm`` is accepted but not used by this function.
    """
    # Target regions
    target_regions = GenomicRegionSet("Target")
    target_regions.read_bed(input_BED)
    target_regions.gene_association(organism=organism, show_dis=True)

    # RNA: build the temporary FASTA and load it
    connect_rna(rna_fasta, temp=temp, rna_name="RNA")
    rna_set = SequenceSet(name="rna", seq_type=SequenceType.RNA)
    rna_set.read_fasta(os.path.join(temp, "rna_temp.fa"))
    rna_regions = get_rna_region_str(os.path.join(temp, rna_fasta))

    genome_path = GenomeData(organism).get_genome()

    binding_events = find_triplex(
        rna_fasta=rna_fasta,
        dna_region=target_regions,
        temp=temp,
        organism=organism,
        remove_temp=False,
        l=l, e=e, c=c, fr=fr, fm=fm, of=of, mf=mf,
        genome_path=genome_path,
        prefix="targeted_region",
        dna_fine_posi=True,
    )

    print("Total binding events:\t", str(len(binding_events)))
    binding_events.write_bed(output_BED)
    txp_filename = output_BED.replace(".bed", ".txp")
    binding_events.write_txp(filename=txp_filename)

    rna_binding_sites = binding_events.get_rbs()
    dbd_regions(exons=rna_regions, sig_region=rna_binding_sites, rna_name="rna",
                output=output_rbss, out_file=True, temp=temp, fasta=False)
def __init__(self, rna_fasta, rna_name, dna_region, organism, showdbs=False):
    """Set up the RNA sequence, the target DNA regions and the statistics dict.

    :param rna_fasta: path to the FASTA file holding the RNA
    :param rna_name: name for the RNA; when falsy, the name of the first
        sequence in ``rna_fasta`` is used for ``self.rna_name``
    :param dna_region: BED file with the target regions
    :param organism: organism name handed to ``GenomeData``
    :param showdbs: stored as ``self.showdbs``
    """
    self.organism = organism
    self.genome_path = GenomeData(organism).get_genome()

    # RNA: Path to the FASTA file
    self.rna_fasta = rna_fasta
    self.showdbs = showdbs

    rna_sequences = SequenceSet(name="rna", seq_type=SequenceType.RNA)
    rna_sequences.read_fasta(self.rna_fasta)
    self.rna_name = rna_name if rna_name else rna_sequences[0].name

    # DNA: GenomicRegionSet, replaced by its gene-associated version
    self.dna_region = GenomicRegionSet(name="target")
    self.dna_region.read_bed(dna_region)
    associated = self.dna_region.gene_association(organism=self.organism,
                                                  show_dis=True)
    self.dna_region = associated

    self.topDBD = []
    # NOTE(review): "name" records the rna_name argument as passed in, not the
    # fallback stored in self.rna_name -- confirm this is intended.
    self.stat = OrderedDict(name=rna_name, genome=organism)
    self.stat["target_regions"] = str(len(self.dna_region))
def get_bc_signal(arguments):
    """Accumulate bias-corrected signal around one motif's sites for two conditions.

    ``arguments`` is one tuple holding, in order: ``mpbs_name``, ``mpbs_file1``,
    ``mpbs_file2``, ``reads_file1``, ``reads_file2``, ``organism``,
    ``window_size``, ``forward_shift``, ``reverse_shift``, ``bias_table1`` and
    ``bias_table2``.

    Returns ``(signal_1, signal_2, motif_len, pwm, num_motif)`` where the two
    signals are numpy arrays of length ``window_size`` summed over all accepted
    regions, ``motif_len`` is the length of the first region named
    ``mpbs_name`` (``None`` if there is none), ``pwm`` is the dict filled by
    ``update_pwm`` and ``num_motif`` is the number of regions with that name.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift, bias_table1, bias_table2) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)
    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)
    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")
    genome_data = GenomeData(organism)
    genome_file_name = genome_data.get_genome()
    fasta = Fastafile(genome_file_name)

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = {base: [0.0] * window_size for base in ("A", "C", "G", "T", "N")}

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    try:
        # Fetch bias corrected signal
        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            # Floor division keeps the window coordinates integral; "/" would
            # produce floats under Python 3.
            mid = (region.final + region.initial) // 2
            p1 = mid - window_size // 2
            p2 = mid + window_size // 2

            if p1 <= 0:
                continue

            signal1 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam1,
                                      bias_table=bias_table1,
                                      genome_file_name=genome_file_name,
                                      forward_shift=forward_shift,
                                      reverse_shift=reverse_shift)
            signal2 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam2,
                                      bias_table=bias_table2,
                                      genome_file_name=genome_file_name,
                                      forward_shift=forward_shift,
                                      reverse_shift=reverse_shift)

            # Windows whose corrected signal has an unexpected length are
            # dropped from both the signal sums and the PWM.
            if len(signal1) != len(signal_1) or len(signal2) != len(signal_2):
                continue

            signal_1 = np.add(signal_1, np.array(signal1))
            signal_2 = np.add(signal_2, np.array(signal2))

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        # Release the file handles even if a region fails.
        bam1.close()
        bam2.close()
        fasta.close()

    return signal_1, signal_2, motif_len, pwm, num_motif
def get_bc_tracks(args):
    """Write bias-corrected signal tracks (wig, optionally bigWig) for a set of regions.

    ``args.input_files`` must hold exactly two entries: the reads (BAM) file
    and the regions file. Depending on ``args.strand_specific`` either one
    ``<prefix>.wig`` or a ``<prefix>_forward.wig``/``<prefix>_reverse.wig``
    pair is appended to in ``args.output_location``; reverse-strand values are
    written negated. With ``args.bigWig`` each wig file is converted with
    ``wigToBigWig`` and removed once the conversion succeeded.
    """
    import subprocess

    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")

    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()

    reads_file = GenomicSignal()

    bam = Samfile(args.input_files[0], "rb")
    genome_data = GenomeData(args.organism)
    fasta = Fastafile(genome_data.get_genome())

    hmm_data = HmmData()
    if args.bias_table:
        bias_table_list = args.bias_table.split(",")
        bias_table = BiasTable().load_table(table_file_name_F=bias_table_list[0],
                                            table_file_name_R=bias_table_list[1])
    else:
        table_F = hmm_data.get_default_bias_table_F_ATAC()
        table_R = hmm_data.get_default_bias_table_R_ATAC()
        bias_table = BiasTable().load_table(table_file_name_F=table_F,
                                            table_file_name_R=table_R)

    def corrected_signal(region, strand):
        # Bias-corrected signal of one region; two arrays when strand is True.
        return reads_file.get_bc_signal_by_fragment_length(
            ref=region.chrom, start=region.initial, end=region.final,
            bam=bam, fasta=fasta, bias_table=bias_table,
            forward_shift=args.forward_shift, reverse_shift=args.reverse_shift,
            min_length=None, max_length=None, strand=strand)

    def normalize(signal):
        # boyle_norm followed by hon_norm_atac, using the 98th percentile and
        # the standard deviation of the boyle-normalized signal.
        signal = reads_file.boyle_norm(signal)
        perc = scoreatpercentile(signal, 98)
        std = np.std(signal)
        return reads_file.hon_norm_atac(signal, perc, std)

    def wig_block(region, values):
        # One fixedStep wig section (1-based start) for a region.
        return ("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n"
                + "\n".join([str(e) for e in values]) + "\n")

    def wig_to_bigwig(wig_fname, bw_fname):
        # Argument list instead of a shell string, so paths containing spaces
        # or shell metacharacters are passed through intact.
        command = ["wigToBigWig", wig_fname, genome_data.get_chromosome_sizes(),
                   bw_fname, "-verbose=0"]
        try:
            status = subprocess.call(command)
        except OSError:
            status = -1
        # Keep the wig when the conversion failed; otherwise the computed
        # signal would be lost entirely.
        if status == 0:
            os.remove(wig_fname)

    if args.strand_specific:
        fname_forward = os.path.join(args.output_location, "{}_forward.wig".format(args.output_prefix))
        fname_reverse = os.path.join(args.output_location, "{}_reverse.wig".format(args.output_prefix))

        # "with" guarantees both files are closed even if a region fails.
        with open(fname_forward, "a") as f_forward, open(fname_reverse, "a") as f_reverse:
            for region in regions:
                signal_f, signal_r = corrected_signal(region, True)

                if args.norm:
                    signal_f = normalize(signal_f)
                    signal_r = normalize(signal_r)

                f_forward.write(wig_block(region, np.nan_to_num(signal_f)))
                f_reverse.write(wig_block(region, [-e for e in np.nan_to_num(signal_r)]))

        if args.bigWig:
            wig_to_bigwig(fname_forward,
                          os.path.join(args.output_location, "{}_forward.bw".format(args.output_prefix)))
            wig_to_bigwig(fname_reverse,
                          os.path.join(args.output_location, "{}_reverse.bw".format(args.output_prefix)))
    else:
        output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix))

        with open(output_fname, "a") as output_f:
            for region in regions:
                signal = corrected_signal(region, False)

                if args.norm:
                    signal = normalize(signal)

                output_f.write(wig_block(region, np.nan_to_num(signal)))

        if args.bigWig:
            wig_to_bigwig(output_fname,
                          os.path.join(args.output_location, "{}.bw".format(args.output_prefix)))
################################################################################## parser = argparse.ArgumentParser(description='Convert BED files into FASTAs', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-bed', type=str, help="BED file or a directory containing BED files") parser.add_argument('-output', type=str, help="Define the output directory") parser.add_argument('-organism', type=str, help="Define the organism") args = parser.parse_args() if not os.path.exists(args.output): os.makedirs(args.output) genome = GenomeData(args.organism) if os.path.isfile(args.bed): load_exon_sequence(bed=args.bed, directory=args.output, genome_path=genome.get_genome()) elif os.path.isdir(args.bed): for root, dirnames, filenames in os.walk(args.bed): for filename in filenames: if ".bed" in filename: print(filename) fn = os.path.basename(filename) fn = fn.partition(".bed")[0] try: load_exon_sequence(bed=os.path.join(args.bed,filename), directory=os.path.join(args.output,fn), genome_path=genome.get_genome()) except:
def footprint(bam: str, bed: str, assembly: str = "hg38", w: int = 500, dnase: bool = False, bias_type="SH"):
    """Compute strand-oriented, bias-corrected signal profiles for BED regions.

    :param bam: path to the reads (BAM) file
    :param bed: path to a whitespace-separated regions file; columns 1-3 are
        passed to ``expandRegion``, column 4 (optional) as the name and
        column 5 (optional, default ``'.'``) as the strand
    :param assembly: organism/assembly name handed to ``GenomeData``
    :param w: extension size handed to ``expandRegion``
    :param dnase: use the DNase settings instead of the ATAC ones
    :param bias_type: ``'SH'`` or ``'DH'``; only consulted when ``dnase`` is true
    :return: list of ``regionDict(region, forward, reverse)`` for every region
        whose signal could be computed
    :raises ValueError: if ``dnase`` is true and ``bias_type`` is unsupported
    """
    # Fail fast: an unknown bias type used to leave ``bias_table`` unbound, and
    # the resulting NameError was swallowed once per region further down.
    if dnase and bias_type not in ('SH', 'DH'):
        raise ValueError("bias_type must be 'SH' or 'DH' when dnase is set, got %r" % (bias_type,))

    # load HMM and bias parameters
    g = GenomeData(organism=assembly)
    hmm_data = HmmData()
    if dnase:
        # NOTE(review): hmm_file is never used below -- confirm it can go.
        hmm_file = hmm_data.get_default_hmm_dnase_bc()
        if bias_type == 'SH':
            table_F = hmm_data.get_default_bias_table_F_SH()
            table_R = hmm_data.get_default_bias_table_R_SH()
        else:
            table_F = hmm_data.get_default_bias_table_F_DH()
            table_R = hmm_data.get_default_bias_table_R_DH()
    else:
        hmm_file = hmm_data.get_default_hmm_atac_paired()
        table_F = hmm_data.get_default_bias_table_F_ATAC()
        table_R = hmm_data.get_default_bias_table_R_ATAC()
    bias_table = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R)

    # load reads from BAM
    reads_file = GenomicSignal(bam)
    reads_file.load_sg_coefs(SG_WINDOW_SIZE)

    # open data and sequence
    # NOTE(review): neither handle is used afterwards in this function.
    bam = Samfile(bam, "rb")
    fasta = Fastafile(g.get_genome())

    # load and expand regions, splitting each line only once
    regions = []
    with open(bed, 'r') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # blank lines carry no coordinates to expand
                continue
            name = fields[3] if len(fields) >= 4 else None
            # NOTE(review): strand is read from column 5 -- confirm against
            # the expected input format.
            strand = fields[4] if len(fields) >= 5 else '.'
            regions.append(expandRegion(*fields[:3], name, w, strand))

    # load signal
    forward = []
    reverse = []
    failed = 0
    get_reads = reads_file.get_signal_atac if not dnase else reads_file.get_signal
    for i, region in enumerate(regions):
        try:
            chromosome, start, end, _, strand = region
            atac_norm_f, atac_slope_f, atac_norm_r, atac_slope_r = get_reads(
                chromosome, start, end, 0, 0,
                FORWARD_SHIFT if not dnase else 0,
                REVERSE_SHIFT if not dnase else 0,
                1000 if dnase else 150, 98, 98, bias_table, g.get_genome())
            atac_norm_f = [float(v) for v in atac_norm_f]
            atac_norm_r = [float(v) for v in atac_norm_r]
            # Minus-strand regions are flipped and their strands swapped so
            # every profile reads in the region's own orientation.
            if strand == '-':
                atac_norm_f.reverse()
                atac_norm_r.reverse()
            forward.append(atac_norm_f if strand != '-' else atac_norm_r)
            reverse.append(atac_norm_r if strand != '-' else atac_norm_f)
            if i % 500 == 0:
                print("INFO: aggregating region %d of %d" % (i, len(regions)), file=sys.stderr)
        except Exception:
            # Best effort: record the failure but keep both lists aligned
            # with ``regions``.
            if len(forward) <= i:
                forward.append(None)
            if len(reverse) <= i:
                reverse.append(None)
            failed += 1

    if failed > 0:
        print(
            "WARNING: failed to generate bias-corrected signal profiles for %d regions" % failed,
            file=sys.stderr)

    return [
        regionDict(regions[i], forward[i], reverse[i])
        for i in range(len(regions))
        if forward[i] is not None and reverse[i] is not None
    ]
def estimate_bias_kmer(args):
    """Estimate forward/reverse k-mer bias tables from reads and write them out.

    For every region in ``args.regions_file`` the k-mers at read cut sites
    (observed) and all k-mers of the region sequence (expected) are counted;
    the bias of a k-mer is the ratio of its observed to its expected relative
    frequency, rounded to 6 decimals. The two tables are passed to
    ``write_table`` together with ``args.output_location`` and
    ``args.output_prefix``.
    """
    # Parameters
    max_duplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    bam_file = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fasta_file = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Observed / expected k-mer counts per strand
    observed_f, observed_r = {}, {}
    expected_f, expected_r = {}, {}

    n_reads_f = 0
    n_reads_r = 0
    n_kmers = 0

    # Iterating on HS regions
    for region in regions:
        last_start = -1
        repeats = 0

        # Observed frequencies: one k-mer per read
        for read in bam_file.fetch(region.chrom, region.initial, region.final):
            if read.is_reverse:
                cut_site = read.aend + args.reverse_shift + 1
            else:
                cut_site = read.pos + args.forward_shift - 1
            kmer_start = cut_site - int(floor(args.k_nb / 2))
            kmer_end = kmer_start + args.k_nb

            # Reads repeating the previous start position more than
            # max_duplicates times in a row are skipped (PCR artifacts).
            if kmer_start == last_start:
                repeats += 1
            else:
                last_start = kmer_start
                repeats = 0
            if repeats > max_duplicates:
                continue

            try:
                kmer = str(fasta_file.fetch(region.chrom, kmer_start, kmer_end)).upper()
            except Exception:
                continue

            if read.is_reverse:
                kmer = AuxiliaryFunctions.revcomp(kmer)
                n_reads_r += 1
                observed_r[kmer] = observed_r.get(kmer, 0) + 1
            else:
                n_reads_f += 1
                observed_f[kmer] = observed_f.get(kmer, 0) + 1

        # Expected frequencies: every k-mer of the region, both strands
        try:
            sequence = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        sequence_rc = AuxiliaryFunctions.revcomp(sequence)

        for offset in range(0, len(sequence) - args.k_nb):
            n_kmers += 1
            window = sequence[offset:offset + args.k_nb]
            expected_f[window] = expected_f.get(window, 0) + 1
            window = sequence_rc[offset:offset + args.k_nb]
            expected_r[window] = expected_r.get(window, 0) + 1

    # Closing files
    bam_file.close()
    fasta_file.close()

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    all_kmers = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = {}
    bias_table_R = {}
    for kmer in all_kmers:
        obs = observed_f.get(kmer, 0) + pseudocount
        exp = expected_f.get(kmer, 0) + pseudocount
        if n_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obs / n_reads_f) / float(exp / n_kmers), 6)

        obs = observed_r.get(kmer, 0) + pseudocount
        exp = expected_r.get(kmer, 0) + pseudocount
        if n_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obs / n_reads_r) / float(exp / n_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
def create_signal(args, regions):
    """Count observed and expected k-mers per strand and write them to four files.

    Observed k-mers are taken at the cut site of every read overlapping a
    region, expected k-mers from the region sequence itself and its reverse
    complement. Only k-mers made purely of A/C/G/T and of length
    ``args.k_nb`` are counted. The counts are written as ``kmer<TAB>count``
    lines (non-zero counts only) to ``<k>_f_obs.fa``, ``<k>_f_exp.fa``,
    ``<k>_r_obs.fa`` and ``<k>_r_exp.fa`` inside ``args.output_location``.
    """
    complement = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}

    def revcomp(s):
        # Symbols outside A/C/G/T/N (e.g. IUPAC ambiguity codes) become "N",
        # so k-mers covering them are skipped instead of raising KeyError.
        return "".join([complement.get(e, "N") for e in s[::-1]])

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    f_obs_dict = {e: 0.0 for e in kmer_comb}
    r_obs_dict = {e: 0.0 for e in kmer_comb}
    f_exp_dict = {e: 0.0 for e in kmer_comb}
    r_exp_dict = {e: 0.0 for e in kmer_comb}

    bam_file = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fasta_file = Fastafile(genome_data.get_genome())

    half_k = int(floor(args.k_nb / 2))

    try:
        for region in regions:
            # Fetching observed reads
            reads = bam_file.fetch(reference=region.chrom, start=region.initial, end=region.final)
            for read in reads:
                if not read.is_reverse:
                    p1 = read.pos - half_k + args.forward_shift - 1
                else:
                    p1 = read.aend - half_k + args.reverse_shift + 1
                p2 = p1 + args.k_nb
                try:
                    dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                if read.is_reverse:
                    dna_sequence_obs = revcomp(dna_sequence_obs)
                    counts = r_obs_dict
                else:
                    counts = f_obs_dict
                # The membership test rejects k-mers containing N as well as
                # truncated or ambiguous ones, which previously raised KeyError.
                if dna_sequence_obs in counts:
                    counts[dna_sequence_obs] += 1

            # Fetching whole sequence
            try:
                dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue
            dna_sequence_exp_rev = revcomp(dna_sequence_exp)
            # NOTE(review): this range stops one window short of the last
            # full k-mer; kept as is -- confirm whether that is intended.
            for i in range(0, len(dna_sequence_exp) - args.k_nb):
                s = dna_sequence_exp[i:i + args.k_nb]
                if s in f_exp_dict:
                    f_exp_dict[s] += 1
                s = dna_sequence_exp_rev[i:i + args.k_nb]
                if s in r_exp_dict:
                    r_exp_dict[s] += 1
    finally:
        bam_file.close()
        fasta_file.close()

    outputs = [
        ("{}_f_obs.fa", f_obs_dict),
        ("{}_f_exp.fa", f_exp_dict),
        ("{}_r_obs.fa", r_obs_dict),
        ("{}_r_exp.fa", r_exp_dict),
    ]
    for name_template, counts in outputs:
        output_fname = os.path.join(args.output_location, name_template.format(str(args.k_nb)))
        # "with" closes each file even if a write fails.
        with open(output_fname, "w") as output_file:
            for kmer in kmer_comb:
                if counts[kmer] > 0:
                    output_file.write(kmer + "\t" + str(counts[kmer]) + "\n")
def diff_analysis_run(args):
    """Run the differential footprint analysis over several conditions.

    Reads the motif-predicted binding sites of every condition, builds one
    signal profile per (condition, motif) from either bias-corrected
    (``args.bc``) or raw reads, normalizes the profiles, writes line plots to
    ``<args.output_location>/Lineplots`` and finally writes the statistics.
    ``args.nc`` selects between in-process execution (``1``) and a process
    pool.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    output_location = os.path.join(args.output_location, "Lineplots")
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    # check if they have same length
    mpbs_files = args.mpbs_files.strip().split(",")
    reads_files = args.reads_files.strip().split(",")
    conditions = args.conditions.strip().split(",")

    if args.colors is not None:
        colors = args.colors.strip().split(",")
    else:
        colors = [
            "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33",
            "#a65628", "#f781bf", "#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3",
            "#a6d854", "#ffd92f", "#e5c494", "#b3b3b3", "#8dd3c7", "#ffffb3",
            "#bebada", "#fb8072", "#80b1d3", "#fdb462", "#b3de69", "#fccde5",
            "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", "#e6ab02",
            "#a6761d", "#666666", "#7fc97f", "#beaed4", "#fdc086", "#ffff99",
            "#386cb0", "#f0027f", "#bf5b17", "#666666"
        ]

    assert len(mpbs_files) == len(reads_files) == len(conditions), \
        "Number of motif, read and condition names are not same: {}, {}, {}".format(
            len(mpbs_files), len(reads_files), len(conditions))

    # Check if the index file exists
    for reads_file in reads_files:
        base_name = "{}.bai".format(reads_file)
        if not os.path.exists(base_name):
            pysam.index(reads_file)

    mpbs = GenomicRegionSet("Motif Predicted Binding Sites of All Conditions")
    for mpbs_file in mpbs_files:
        mpbs.read(mpbs_file)

    mpbs.sort()
    mpbs.remove_duplicates()
    mpbs_name_list = list(set(mpbs.get_names()))

    signals = np.zeros(shape=(len(conditions), len(mpbs_name_list), args.window_size),
                       dtype=np.float32)
    motif_len = list()
    motif_num = list()
    motif_pwm = list()

    print((" {} cpus are detected and {} of them will be used...\n".format(
        cpu_count(), args.nc)))

    genome_data = GenomeData(args.organism)
    fasta = Fastafile(genome_data.get_genome())

    print("generating signal for each motif and condition...\n")

    # Motif length, number and pwm matrix depend only on the motif, so they
    # are computed once here instead of once per condition; every list ends
    # up index-aligned with mpbs_name_list.
    for mpbs_name in mpbs_name_list:
        mpbs_regions = mpbs.by_names([mpbs_name])
        motif_len.append(mpbs_regions[0].final - mpbs_regions[0].initial)
        motif_num.append(len(mpbs_regions))
        motif_pwm.append(get_pwm(fasta, mpbs_regions, args.window_size))

    # The bias-corrected and the raw analysis differ only in the worker
    # function and in the trailing bias table argument.
    if args.bc:
        hmm_data = HmmData()
        table_forward = hmm_data.get_default_bias_table_F_ATAC()
        table_reverse = hmm_data.get_default_bias_table_R_ATAC()
        bias_table = BiasTable().load_table(table_file_name_F=table_forward,
                                            table_file_name_R=table_reverse)
        worker = get_bc_signal
        extra_arguments = (bias_table,)
    else:
        worker = get_raw_signal
        extra_arguments = ()

    def worker_arguments(condition_index, mpbs_name):
        # Argument tuple of one (condition, motif) job.
        return (mpbs.by_names([mpbs_name]), reads_files[condition_index],
                args.organism, args.window_size, args.forward_shift,
                args.reverse_shift) + extra_arguments

    for i, condition in enumerate(conditions):
        # do not use multi-processing
        if args.nc == 1:
            for j, mpbs_name in enumerate(mpbs_name_list):
                arguments = worker_arguments(i, mpbs_name)
                if args.bc:
                    # A failing motif is logged and left as zeros rather than
                    # aborting the whole analysis.
                    try:
                        signals[i, j, :] = get_bc_signal(arguments)
                    except Exception:
                        logging.exception("get bias corrected signal failed")
                else:
                    signals[i, j, :] = get_raw_signal(arguments)
        # use multi-processing
        else:
            print(("generating signal for condition {} \n".format(condition)))
            arguments_list = [worker_arguments(i, mpbs_name)
                              for mpbs_name in mpbs_name_list]
            with Pool(processes=args.nc) as pool:
                res = pool.map(worker, arguments_list)
            signals[i] = np.array(res)

    print("signal generation is done!\n")

    # compute normalization factor for each condition
    factors = compute_factors(signals)
    output_factor(args, factors, conditions)

    # normalize signals by factor and number of motifs
    for i in range(len(conditions)):
        for j in range(len(mpbs_name_list)):
            signals[i, j, :] = signals[i, j, :] / (factors[i] * motif_num[j])

    if args.output_profiles:
        output_profiles(mpbs_name_list, signals, conditions, args.output_location)

    print("generating line plot for each motif...\n")
    plot_arguments = [
        (mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i],
         output_location, args.window_size, colors)
        for i, mpbs_name in enumerate(mpbs_name_list)
    ]
    if args.nc == 1:
        for arguments in plot_arguments:
            output_line_plot(arguments)
    else:
        with Pool(processes=args.nc) as pool:
            pool.map(output_line_plot, plot_arguments)

    ps_tc_results = list()
    for i, mpbs_name in enumerate(mpbs_name_list):
        ps_tc_results.append(
            get_ps_tc_results(signals[:, i, :], motif_len[i], args.window_size))

    # find the significant motifs and generate a scatter plot if two conditions are given
    if len(conditions) == 2:
        ps_tc_results = scatter_plot(args, ps_tc_results, mpbs_name_list, conditions)

    output_stat_results(ps_tc_results, conditions, mpbs_name_list, motif_num, args)
parser.add_argument('-bed', type=str, help="BED file or a directory containing BED files") parser.add_argument('-output', type=str, help="Define the output directory") parser.add_argument('-organism', type=str, help="Define the organism") args = parser.parse_args() if not os.path.exists(args.output): os.makedirs(args.output) genome = GenomeData(args.organism) if os.path.isfile(args.bed): load_exon_sequence(bed=args.bed, directory=args.output, genome_path=genome.get_genome()) elif os.path.isdir(args.bed): for root, dirnames, filenames in os.walk(args.bed): for filename in filenames: if ".bed" in filename: print(filename) fn = os.path.basename(filename) fn = fn.partition(".bed")[0] try: load_exon_sequence(bed=os.path.join(args.bed, filename), directory=os.path.join(args.output, fn), genome_path=genome.get_genome()) except:
def get_raw_signal(arguments):
    """Accumulate raw cut-site counts around the binding sites of one motif.

    The binding sites of both conditions are read, combined and sorted, and
    the sites named ``mpbs_name`` are selected.  For every selected site a
    window centred on the site is scanned in both BAM files and each read's
    shifted cut site that falls inside the window is counted.

    Args:
        arguments: a single tuple holding, in order, ``mpbs_name``,
            ``mpbs_file1``, ``mpbs_file2``, ``reads_file1``, ``reads_file2``,
            ``organism``, ``window_size``, ``forward_shift`` and
            ``reverse_shift``.

    Returns:
        Tuple ``(signal_1, signal_2, motif_len, pwm, num_motif)``:
        the two per-position count arrays of length ``window_size``, the
        length of the first selected site (``None`` when there is none), the
        per-base position table handed to ``update_pwm`` for every retained
        site, and the number of selected sites.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift) = arguments

    def count_cut_sites(bam, chrom, start, end, signal):
        # Add one count to `signal` for every cut site inside [start, end).
        for read in bam.fetch(chrom, start, end):
            # Unmapped reads have no aligned end (aend is None), so they
            # cannot provide a cut site; skip them.
            if read.is_unmapped:
                continue
            if read.is_reverse:
                cut_site = read.aend + reverse_shift - 1
            else:
                cut_site = read.pos + forward_shift
            if start <= cut_site < end:
                signal[cut_site - start] += 1.0

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = {base: [0.0] * window_size for base in ("A", "C", "G", "T", "N")}

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    # Integer division keeps the window bounds usable as array indices and
    # as fetch coordinates ("/" would yield floats on Python 3).
    half_window = window_size // 2

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")
    genome_data = GenomeData(organism)
    fasta = Fastafile(genome_data.get_genome())
    try:
        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            # Windows that would start at or before the chromosome start
            # are ignored entirely.
            if p1 <= 0:
                continue

            # Fetch raw signal
            count_cut_sites(bam1, region.chrom, p1, p2, signal_1)
            count_cut_sites(bam2, region.chrom, p1, p2, signal_2)

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        # Release the file handles even if a region fails.
        bam1.close()
        bam2.close()
        fasta.close()

    return signal_1, signal_2, motif_len, pwm, num_motif
def estimate_bias_pwm(args):
    """Estimate forward/reverse k-mer bias tables from position frequencies.

    For every region in ``args.regions_file`` the k-mers centred on the
    shifted cut sites of the reads ("observed") and the k-mers of the region
    sequence itself ("expected") are accumulated into per-position base
    counts, separately for the forward and reverse strand.  The four count
    matrices are written to ``<output_location>/pfm``, rendered as logos in
    ``<output_location>/logo``, and the observed/expected score ratio of
    every possible k-mer is written out with ``write_table``.

    Args:
        args: namespace providing ``reads_file``, ``organism``,
            ``regions_file``, ``k_nb``, ``forward_shift``, ``reverse_shift``,
            ``output_location`` and ``output_prefix``.
    """
    # Parameters
    max_duplicates = 100
    k_nb = args.k_nb
    half_k = int(floor(k_nb / 2))

    def empty_counts():
        # One row of k_nb position counters per base.
        return {base: [0.0] * k_nb for base in ("A", "C", "G", "T", "N")}

    def add_kmer(counts, kmer):
        # Count each base of `kmer` at its position.
        for pos, base in enumerate(kmer):
            counts[base][pos] += 1

    obs_f_pwm_dict = empty_counts()
    exp_f_pwm_dict = empty_counts()
    obs_r_pwm_dict = empty_counts()
    exp_r_pwm_dict = empty_counts()

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    try:
        regions = GenomicRegionSet("regions")
        regions.read(args.regions_file)

        # Iterating on HS regions
        for region in regions:
            # Initialization
            prev_pos = -1
            true_counter = 0

            # Evaluating observed frequencies
            for r in bamFile.fetch(region.chrom, region.initial, region.final):
                # Calculating positions
                if not r.is_reverse:
                    cut_site = r.pos + args.forward_shift - 1
                else:
                    cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - half_k
                p2 = p1 + k_nb

                # Verifying PCR artifacts: consecutive reads with the same
                # k-mer start are only counted up to max_duplicates times.
                if p1 == prev_pos:
                    true_counter += 1
                else:
                    prev_pos = p1
                    true_counter = 0
                if true_counter > max_duplicates:
                    continue

                # Fetching k-mer
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in dictionary
                if r.is_reverse:
                    add_kmer(obs_r_pwm_dict, AuxiliaryFunctions.revcomp(currStr))
                else:
                    add_kmer(obs_f_pwm_dict, currStr)

            # Evaluating expected frequencies
            # Fetching whole sequence
            try:
                currStr = str(fastaFile.fetch(region.chrom, region.initial,
                                              region.final)).upper()
            except Exception:
                continue

            # Iterating on each sequence position
            for start in range(0, len(currStr) - k_nb):
                s = currStr[start:start + k_nb]
                add_kmer(exp_f_pwm_dict, s)
                # Same k-mer seen from the reverse strand
                add_kmer(exp_r_pwm_dict, AuxiliaryFunctions.revcomp(s))
    finally:
        # Closing files, also when a region fails
        bamFile.close()
        fastaFile.close()

    # Output pwms
    pfm_dir = os.path.join(args.output_location, "pfm")
    # makedirs avoids passing a user-supplied path through a shell.
    os.makedirs(pfm_dir, exist_ok=True)
    pwm_obs_f = os.path.join(pfm_dir, "obs_{}_f.pfm".format(str(k_nb)))
    pwm_obs_r = os.path.join(pfm_dir, "obs_{}_r.pfm".format(str(k_nb)))
    pwm_exp_f = os.path.join(pfm_dir, "exp_{}_f.pfm".format(str(k_nb)))
    pwm_exp_r = os.path.join(pfm_dir, "exp_{}_r.pfm".format(str(k_nb)))

    outputs = [(pwm_obs_f, obs_f_pwm_dict), (pwm_obs_r, obs_r_pwm_dict),
               (pwm_exp_f, exp_f_pwm_dict), (pwm_exp_r, exp_r_pwm_dict)]
    for fname, counts in outputs:
        with open(fname, "w") as pwm_file:
            # Only the four real bases are written; the "N" row is dropped.
            for base in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in counts[base]]) + "\n")

    def load_motif(fname):
        # Read a count matrix back, closing the handle afterwards.
        with open(fname) as handle:
            return motifs.read(handle, "pfm")

    motif_obs_f = load_motif(pwm_obs_f)
    motif_obs_r = load_motif(pwm_obs_r)
    motif_exp_f = load_motif(pwm_exp_f)
    motif_exp_r = load_motif(pwm_exp_r)

    # Output logos
    logo_dir = os.path.join(args.output_location, "logo")
    os.makedirs(logo_dir, exist_ok=True)
    logo_obs_f = os.path.join(logo_dir, "obs_{}_f.pdf".format(str(k_nb)))
    logo_obs_r = os.path.join(logo_dir, "obs_{}_r.pdf".format(str(k_nb)))
    logo_exp_f = os.path.join(logo_dir, "exp_{}_f.pdf".format(str(k_nb)))
    logo_exp_r = os.path.join(logo_dir, "exp_{}_r.pdf".format(str(k_nb)))

    # Observed logos use a ten times larger y-axis scale than expected ones.
    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large",
                        color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large",
                        color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large",
                        color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large",
                        color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]

    # Look the matrices up once instead of once per k-mer.
    ppm_obs_f = motif_obs_f.pwm
    ppm_exp_f = motif_exp_f.pwm
    ppm_obs_r = motif_obs_r.pwm
    ppm_exp_r = motif_exp_r.pwm

    bias_table_F = {}
    bias_table_R = {}
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, ppm_obs_f, k_nb)
        exp_f = get_ppm_score(k_mer, ppm_exp_f, k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, ppm_obs_r, k_nb)
        exp_r = get_ppm_score(k_mer, ppm_exp_r, k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix,
                [bias_table_F, bias_table_R])
def get_raw_signal(arguments):
    """Count raw cut sites around the binding sites of one motif.

    Binding sites from both conditions are read, merged into one sorted set
    and filtered by ``mpbs_name``.  Around the centre of each selected site
    a window is scanned in both BAM files; every mapped read contributes one
    count at its shifted cut site when that site lies inside the window.

    Args:
        arguments: a single tuple holding, in order, ``mpbs_name``,
            ``mpbs_file1``, ``mpbs_file2``, ``reads_file1``, ``reads_file2``,
            ``organism``, ``window_size``, ``forward_shift`` and
            ``reverse_shift``.

    Returns:
        Tuple ``(signal_1, signal_2, motif_len, pwm, num_motif)``:
        one count array of length ``window_size`` per condition, the length
        of the first selected site (``None`` if no site was selected), the
        per-base position table passed to ``update_pwm`` for each retained
        site, and the number of selected sites.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = {base: [0.0] * window_size for base in ("A", "C", "G", "T", "N")}

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    # Floor division: the bounds are used as array offsets and fetch
    # coordinates, so they must stay integers on Python 3.
    half_window = window_size // 2

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")
    genome_data = GenomeData(organism)
    fasta = Fastafile(genome_data.get_genome())
    try:
        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            # Skip windows that would start at or before position 0.
            if p1 <= 0:
                continue

            # Fetch raw signal; both conditions are processed identically.
            for bam, signal in ((bam1, signal_1), (bam2, signal_2)):
                for read in bam.fetch(region.chrom, p1, p2):
                    # check if the read is unmapped, according to issue #112
                    if read.is_unmapped:
                        continue

                    if read.is_reverse:
                        cut_site = read.aend + reverse_shift - 1
                    else:
                        cut_site = read.pos + forward_shift

                    if p1 <= cut_site < p2:
                        signal[cut_site - p1] += 1.0

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        # Always release the file handles.
        bam1.close()
        bam2.close()
        fasta.close()

    return signal_1, signal_2, motif_len, pwm, num_motif
def create_signal(args, regions):
    """Count observed and expected k-mers per strand and write them to disk.

    "Observed" k-mers are those centred on the shifted cut site of each read
    fetched from ``args.reads_file`` inside ``regions``; "expected" k-mers
    are those of the region sequence and of its reverse complement.  Four
    tab-separated files (``<k>_f_obs.fa``, ``<k>_f_exp.fa``, ``<k>_r_obs.fa``,
    ``<k>_r_exp.fa``) are written to ``args.output_location``; each holds one
    ``kmer<TAB>count`` line per k-mer whose count is positive.

    Args:
        args: namespace providing ``k_nb``, ``reads_file``, ``organism``,
            ``forward_shift``, ``reverse_shift`` and ``output_location``.
        regions: iterable of regions with ``chrom``, ``initial`` and
            ``final`` attributes.
    """
    complement = str.maketrans("ACGT", "TGCA")

    def revcomp(s):
        # Reverse complement; characters other than A/C/G/T are kept as-is.
        return s.translate(complement)[::-1]

    k_nb = args.k_nb
    half_k = int(floor(k_nb / 2))

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    f_obs_dict = {kmer: 0.0 for kmer in kmer_comb}
    r_obs_dict = {kmer: 0.0 for kmer in kmer_comb}
    f_exp_dict = {kmer: 0.0 for kmer in kmer_comb}
    r_exp_dict = {kmer: 0.0 for kmer in kmer_comb}

    bam_file = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fasta_file = Fastafile(genome_data.get_genome())
    try:
        for region in regions:
            # Fetching observed reads
            reads = bam_file.fetch(reference=region.chrom,
                                   start=region.initial, end=region.final)
            for read in reads:
                if not read.is_reverse:
                    p1 = read.pos - half_k + args.forward_shift - 1
                else:
                    p1 = read.aend - half_k + args.reverse_shift + 1
                p2 = p1 + k_nb

                try:
                    dna_sequence_obs = str(
                        fasta_file.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Only full-length, pure A/C/G/T k-mers are keys of the
                # tables.  This skips k-mers containing "N" and also ones
                # that are shorter than k or contain other symbols, which
                # would otherwise raise KeyError.
                if dna_sequence_obs not in f_obs_dict:
                    continue

                if read.is_reverse:
                    r_obs_dict[revcomp(dna_sequence_obs)] += 1
                else:
                    f_obs_dict[dna_sequence_obs] += 1

            # Fetching whole sequence
            try:
                dna_sequence_exp = str(
                    fasta_file.fetch(region.chrom, region.initial,
                                     region.final)).upper()
            except Exception:
                continue
            dna_sequence_exp_rev = revcomp(dna_sequence_exp)

            for i in range(0, len(dna_sequence_exp) - k_nb):
                s = dna_sequence_exp[i:i + k_nb]
                if s in f_exp_dict:
                    f_exp_dict[s] += 1

                s = dna_sequence_exp_rev[i:i + k_nb]
                if s in r_exp_dict:
                    r_exp_dict[s] += 1
    finally:
        # Release the handles even if counting fails part-way.
        bam_file.close()
        fasta_file.close()

    outputs = [("{}_f_obs.fa", f_obs_dict), ("{}_f_exp.fa", f_exp_dict),
               ("{}_r_obs.fa", r_obs_dict), ("{}_r_exp.fa", r_exp_dict)]
    for name_template, counts in outputs:
        output_fname = os.path.join(args.output_location,
                                    name_template.format(str(k_nb)))
        with open(output_fname, "w") as output_file:
            # All four tables share the k-mer order of kmer_comb.
            for kmer in kmer_comb:
                if counts[kmer] > 0:
                    output_file.write(kmer + "\t" + str(counts[kmer]) + "\n")
def get_bc_tracks(args):
    """Write bias-corrected signal tracks for a set of regions.

    ``args.input_files`` must hold exactly two entries: the reads (BAM) file
    and the regions file.  The regions are merged and, for each of them, the
    bias-corrected signal is obtained from
    ``GenomicSignal.get_bc_signal_by_fragment_length`` and appended to a
    fixedStep WIG file in ``args.output_location``.  With
    ``args.strand_specific`` two files (``<prefix>_forward.wig`` and
    ``<prefix>_reverse.wig``, the latter with negated values) are written,
    otherwise a single ``<prefix>.wig``.  With ``args.bigWig`` each WIG file
    is converted with the external ``wigToBigWig`` program and removed once
    the conversion has succeeded.

    Args:
        args: namespace providing ``input_files``, ``organism``,
            ``bias_table`` (comma-separated forward,reverse table files, or
            empty to use the default ATAC tables), ``strand_specific``,
            ``norm``, ``bigWig``, ``forward_shift``, ``reverse_shift``,
            ``output_location`` and ``output_prefix``.
    """
    # Local import: only this function needs it.
    import subprocess

    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG",
                        add_msg="You must specify reads and regions file.")

    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()

    reads_file = GenomicSignal()

    bam = Samfile(args.input_files[0], "rb")
    genome_data = GenomeData(args.organism)
    fasta = Fastafile(genome_data.get_genome())

    hmm_data = HmmData()
    if args.bias_table:
        bias_table_list = args.bias_table.split(",")
        bias_table = BiasTable().load_table(
            table_file_name_F=bias_table_list[0],
            table_file_name_R=bias_table_list[1])
    else:
        table_F = hmm_data.get_default_bias_table_F_ATAC()
        table_R = hmm_data.get_default_bias_table_R_ATAC()
        bias_table = BiasTable().load_table(table_file_name_F=table_F,
                                            table_file_name_R=table_R)

    def fetch_signal(region, strand):
        # Bias-corrected signal of one region (two arrays when strand=True).
        return reads_file.get_bc_signal_by_fragment_length(
            ref=region.chrom, start=region.initial, end=region.final,
            bam=bam, fasta=fasta, bias_table=bias_table,
            forward_shift=args.forward_shift,
            reverse_shift=args.reverse_shift,
            min_length=None, max_length=None, strand=strand)

    def normalize(signal):
        # Apply boyle_norm followed by hon_norm_atac, as requested by --norm.
        signal = reads_file.boyle_norm(signal)
        perc = scoreatpercentile(signal, 98)
        std = np.std(signal)
        return reads_file.hon_norm_atac(signal, perc, std)

    def wig_block(region, values):
        # Format one fixedStep block; WIG coordinates are 1-based.
        return ("fixedStep chrom=" + region.chrom + " start=" +
                str(region.initial + 1) + " step=1\n" +
                "\n".join(values) + "\n")

    def convert_to_bigwig(wig_fname, bw_fname):
        # Convert a WIG file and delete it only if the conversion worked, so
        # a missing or failing wigToBigWig does not destroy the output.
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        command = ["wigToBigWig", wig_fname, chrom_sizes_file, bw_fname,
                   "-verbose=0"]
        try:
            status = subprocess.call(command)
        except OSError:
            status = -1
        if status == 0:
            os.remove(wig_fname)

    try:
        if args.strand_specific:
            fname_forward = os.path.join(
                args.output_location,
                "{}_forward.wig".format(args.output_prefix))
            fname_reverse = os.path.join(
                args.output_location,
                "{}_reverse.wig".format(args.output_prefix))

            with open(fname_forward, "a") as f_forward, \
                    open(fname_reverse, "a") as f_reverse:
                for region in regions:
                    signal_f, signal_r = fetch_signal(region, True)

                    if args.norm:
                        signal_f = normalize(signal_f)
                        signal_r = normalize(signal_r)

                    f_forward.write(wig_block(
                        region, [str(e) for e in np.nan_to_num(signal_f)]))
                    # The reverse strand is written with negated values.
                    f_reverse.write(wig_block(
                        region, [str(-e) for e in np.nan_to_num(signal_r)]))
            wig_files = [
                (fname_forward, "{}_forward.bw".format(args.output_prefix)),
                (fname_reverse, "{}_reverse.bw".format(args.output_prefix)),
            ]
        else:
            output_fname = os.path.join(
                args.output_location, "{}.wig".format(args.output_prefix))

            with open(output_fname, "a") as output_f:
                for region in regions:
                    signal = fetch_signal(region, False)

                    if args.norm:
                        signal = normalize(signal)

                    output_f.write(wig_block(
                        region, [str(e) for e in np.nan_to_num(signal)]))
            wig_files = [(output_fname, "{}.bw".format(args.output_prefix))]
    finally:
        # The inputs are no longer needed once the WIG files are written.
        bam.close()
        fasta.close()

    if args.bigWig:
        for wig_fname, bw_name in wig_files:
            convert_to_bigwig(wig_fname,
                              os.path.join(args.output_location, bw_name))