def scan_sequence(seq, seq_gc_bin, motifs, nreport, scan_rc, motifs_meanstd=None, zscore=False):
    """Scan one sequence with every motif and collect the hits per motif.

    Parameters
    ----------
    seq
        Sequence handed to ``pwmscan`` unchanged.
    seq_gc_bin
        Key into ``motifs_meanstd``; only read when ``zscore`` is true.
    motifs
        Iterable of ``(motif, cutoff)`` pairs. A cutoff of ``None`` means
        "do not scan this motif" and yields an empty hit list.
    nreport
        Maximum number of hits requested from ``pwmscan`` per motif.
    scan_rc
        Forwarded to ``pwmscan``.
    motifs_meanstd
        Nested mapping ``motifs_meanstd[seq_gc_bin][motif.id] -> (mean, std)``.
        Must be supplied when ``zscore`` is true.
    zscore
        When true, the sequence is scanned at the motif's minimum score, the
        first element of every hit is standardised as ``(value - mean) / std``
        and only hits whose standardised value is ``>= cutoff`` are kept.

    Returns
    -------
    list
        One entry per motif, in input order. Each entry is a list of
        three-element rows (the placeholder row below is
        ``[min_score, 0, 1]``).
    """
    ret = []
    for motif, cutoff in motifs:
        if cutoff is None:
            ret.append([])
            continue

        min_score = motif.pwm_min_score()
        if zscore:
            m_mean, m_std = motifs_meanstd[seq_gc_bin][motif.id]
            # Scan with the lowest possible threshold; the real cutoff is
            # applied afterwards on the standardised scores.
            raw = pwmscan(seq, motif.logodds, min_score, nreport, scan_rc)
            result = [[(row[0] - m_mean) / m_std, row[1], row[2]] for row in raw]
            result = [row for row in result if row[0] >= cutoff]
        else:
            result = pwmscan(seq, motif.logodds, cutoff, nreport, scan_rc)

        if cutoff <= min_score and len(result) == 0:
            # The cutoff admits everything, yet nothing was reported: emit
            # placeholder rows. Each row is built separately so that callers
            # mutating one row do not silently change the others (the former
            # ``[[...]] * nreport`` repeated a single shared list).
            result = [[min_score, 0, 1] for _ in range(nreport)]

        ret.append(result)
    return ret
def pwm_scan_to_gff(self, fa, gfffile, cutoff=0.9, nreport=50, scan_rc=True, append=False):
    """Scan every sequence in *fa* with this motif and write the hits to a file.

    Parameters
    ----------
    fa
        Mapping-like object whose ``items()`` yields ``(name, sequence)``.
    gfffile
        Path of the output file.
    cutoff
        Fraction of the motif's score range; the absolute threshold is
        ``min_score + (max_score - min_score) * cutoff``.
    nreport
        Maximum number of hits requested from ``pwmscan`` per sequence.
    scan_rc
        Forwarded to ``pwmscan``.
    append
        Append to *gfffile* instead of overwriting it.

    Each hit becomes one tab-separated line with the sequence name, the hit
    start, ``start + len(pwm)``, the score, the strand symbol, the motif id
    and the matching slice of the (non-uppercased) input sequence.
    """
    strandmap = {-1: "-", "-1": "-", "-": "-", "1": "+", 1: "+", "+": "+"}
    gff_line = ("{}\tpwmscan\tmisc_feature\t{}\t{}\t{}\t{}\t.\t"
                "motif_name \"{}\" ; motif_instance \"{}\"\n")

    # The context manager guarantees the handle is closed even when scanning
    # or formatting raises (e.g. an unexpected strand value).
    with open(gfffile, "a" if append else "w") as out:
        min_score = self.pwm_min_score()
        c = min_score + (self.pwm_max_score() - min_score) * cutoff
        pwm = self.pwm
        motif_len = len(pwm)
        for name, seq in fa.items():
            # Scanning is done on the uppercased sequence; the reported
            # instance keeps the original case.
            for score, pos, strand in pwmscan(seq.upper(), pwm, c, nreport, scan_rc):
                out.write(gff_line.format(
                    name, pos, pos + motif_len, score, strandmap[strand],
                    self.id, seq[pos:pos + motif_len]))
def pwm_scan_to_gff(self, fa, gfffile, cutoff=0.9, nreport=50, scan_rc=True, append=False):
    """Write all motif hits found in the sequences of *fa* to *gfffile*.

    Parameters
    ----------
    fa
        Mapping-like object whose ``items()`` yields ``(name, sequence)``.
    gfffile
        Path of the output file.
    cutoff
        Fraction of the motif's score range; the absolute threshold is
        ``min_score + (max_score - min_score) * cutoff``.
    nreport
        Maximum number of hits requested from ``pwmscan`` per sequence.
    scan_rc
        Forwarded to ``pwmscan``.
    append
        Append to *gfffile* instead of overwriting it.

    One tab-separated line is written per hit: sequence name, hit start,
    ``start + len(pwm)``, score, strand symbol, motif id and the matching
    slice of the input sequence in its original case.
    """
    strandmap = {-1: "-", "-1": "-", "-": "-", "1": "+", 1: "+", "+": "+"}
    gff_line = ("{}\tpwmscan\tmisc_feature\t{}\t{}\t{}\t{}\t.\t"
                "motif_name \"{}\" ; motif_instance \"{}\"\n")

    # ``with`` closes the file on every exit path; the previous explicit
    # ``out.close()`` was skipped whenever the loop raised.
    with open(gfffile, "a" if append else "w") as out:
        min_score = self.pwm_min_score()
        c = min_score + (self.pwm_max_score() - min_score) * cutoff
        pwm = self.pwm
        width = len(pwm)
        for name, seq in fa.items():
            result = pwmscan(seq.upper(), pwm, c, nreport, scan_rc)
            for score, pos, strand in result:
                out.write(gff_line.format(
                    name,
                    pos,
                    pos + width,
                    score,
                    strandmap[strand],
                    self.id,
                    seq[pos:pos + width],
                ))
def pwm_scan(self, fa, cutoff=0.9, nreport=50, scan_rc=True):
    """Return the hit positions of this motif for every sequence in *fa*.

    Parameters
    ----------
    fa
        Mapping-like object whose ``items()`` yields ``(name, sequence)``.
    cutoff
        Fraction of the motif's score range; the absolute threshold is
        ``min_score + (max_score - min_score) * cutoff``.
    nreport
        Maximum number of hits requested from ``pwmscan`` per sequence.
    scan_rc
        Forwarded to ``pwmscan``.

    Returns
    -------
    dict
        ``{name: [pos, ...]}`` with one (possibly empty) list per sequence.
    """
    min_score = self.pwm_min_score()
    c = min_score + (self.pwm_max_score() - min_score) * cutoff
    pwm = self.pwm
    matches = {}
    # ``name`` instead of ``id`` so the builtin is not shadowed.
    for name, seq in fa.items():
        hits = pwmscan(seq.upper(), pwm, c, nreport, scan_rc)
        # Only the position (second field of each hit) is reported.
        matches[name] = [pos for _score, pos, _strand in hits]
    return matches
def pwm_scan_score(self, fa, cutoff=0, nreport=1, scan_rc=True):
    """Collect the scores of this motif's hits for each sequence in *fa*.

    ``cutoff`` is a fraction of the motif's score range and is converted to
    an absolute threshold before scanning. The result maps every sequence
    name to the list of hit scores reported by ``pwmscan`` (at most
    ``nreport`` were requested per sequence).
    """
    threshold = self.pwm_min_score() + (self.pwm_max_score() - self.pwm_min_score()) * cutoff
    matrix = self.pwm

    def _scores(sequence):
        # First field of every reported hit is its score.
        return [hit_score for hit_score, _, _ in
                pwmscan(sequence.upper(), matrix, threshold, nreport, scan_rc)]

    return {name: _scores(seq) for name, seq in fa.items()}
def scan_sequence(seq, motifs, nreport, scan_rc):
    """Scan *seq* with each ``(motif, cutoff)`` pair in *motifs*.

    Returns a list holding the raw ``pwmscan`` result for every motif, in
    the order the motifs were given.
    """
    return [
        pwmscan(seq, motif.pwm, cutoff, nreport, scan_rc)
        for motif, cutoff in motifs
    ]
def pwm_scan_all(self, fa, cutoff=0.9, nreport=50, scan_rc=True):
    """Scan all sequences in *fa* and return full hit details per sequence.

    ``cutoff`` is a fraction of the motif's score range and is turned into
    an absolute threshold first. The returned dict maps each sequence name
    to a list of ``(pos, score, strand)`` tuples; note the field order
    differs from the ``(score, pos, strand)`` order delivered by ``pwmscan``.
    """
    threshold = self.pwm_min_score() + (self.pwm_max_score() - self.pwm_min_score()) * cutoff
    matrix = self.pwm

    found = {}
    for name, seq in fa.items():
        hits = pwmscan(seq.upper(), matrix, threshold, nreport, scan_rc)
        found[name] = [(pos, score, strand) for score, pos, strand in hits]
    return found
def pwm_scan_score(self, fa, cutoff=0, nreport=1, scan_rc=True):
    """Return the hit scores of this motif for every sequence in *fa*.

    Parameters
    ----------
    fa
        Mapping-like object whose ``items()`` yields ``(name, sequence)``.
    cutoff
        Fraction of the motif's score range; the absolute threshold is
        ``min_score + (max_score - min_score) * cutoff``.
    nreport
        Maximum number of hits requested from ``pwmscan`` per sequence.
    scan_rc
        Forwarded to ``pwmscan``.

    Returns
    -------
    dict
        ``{name: [score, ...]}`` with one (possibly empty) list per sequence.
    """
    min_score = self.pwm_min_score()
    c = min_score + (self.pwm_max_score() - min_score) * cutoff
    pwm = self.pwm
    matches = {}
    for name, seq in fa.items():
        hits = pwmscan(seq.upper(), pwm, c, nreport, scan_rc)
        # Position and strand are not needed here; the strand lookup table
        # that used to be built in this function was never read.
        matches[name] = [score for score, _pos, _strand in hits]
    return matches
def pwm_scan_score(self, fa, cutoff=0, nreport=1, scan_rc=True):
    """Return the hit scores of this motif for every sequence in *fa*.

    Parameters
    ----------
    fa
        Mapping-like object whose ``items()`` yields ``(name, sequence)``.
    cutoff
        Fraction of the motif's score range; the absolute threshold is
        ``min_score + (max_score - min_score) * cutoff``.
    nreport
        Maximum number of hits requested from ``pwmscan`` per sequence.
    scan_rc
        Forwarded to ``pwmscan``.

    Returns
    -------
    dict
        ``{name: [score, ...]}`` with one (possibly empty) list per sequence.
    """
    from gimmemotifs.c_metrics import pwmscan

    min_score = self.pwm_min_score()
    c = min_score + (self.pwm_max_score() - min_score) * cutoff
    pwm = self.pwm
    matches = {}
    # ``name`` rather than ``id`` keeps the builtin usable; the unused
    # strand lookup table has been dropped.
    for name, seq in fa.items():
        hits = pwmscan(seq.upper(), pwm, c, nreport, scan_rc)
        matches[name] = [score for score, _pos, _strand in hits]
    return matches
def pwm_scan_score(self, fa, cutoff=0, nreport=1, scan_rc=True):
    """Return, per sequence in *fa*, the scores of this motif's hits.

    Parameters
    ----------
    fa
        Mapping-like object whose ``items()`` yields ``(name, sequence)``.
    cutoff
        Fraction of the motif's score range; the absolute threshold is
        ``min_score + (max_score - min_score) * cutoff``.
    nreport
        Maximum number of hits requested from ``pwmscan`` per sequence.
    scan_rc
        Forwarded to ``pwmscan``.

    Returns
    -------
    dict
        ``{name: [score, ...]}`` with one (possibly empty) list per sequence.
    """
    min_score = self.pwm_min_score()
    c = min_score + (self.pwm_max_score() - min_score) * cutoff
    pwm = self.pwm
    matches = {}
    for name, seq in fa.items():
        # Only the score (first field of each hit) is kept; the strand map
        # previously defined here was dead code and has been removed.
        matches[name] = [
            score
            for score, _pos, _strand in pwmscan(seq.upper(), pwm, c, nreport, scan_rc)
        ]
    return matches
def pwm_scan(self, fa, cutoff=0.9, scan_strand=None, nreport=50):
    """Return the hit positions of this motif, optionally for one strand only.

    Parameters
    ----------
    fa
        Mapping-like object whose ``items()`` yields ``(name, sequence)``.
    cutoff
        Fraction of the motif's score range; the absolute threshold is
        ``min_score + (max_score - min_score) * cutoff``.
    scan_strand
        Optional strand filter. Any falsy value disables filtering;
        otherwise it must be one of ``"+"``, ``"-"``, ``1``, ``-1``,
        ``"1"``, ``"-1"``.
    nreport
        Maximum number of hits requested from ``pwmscan`` per sequence.

    Returns
    -------
    dict
        ``{name: [pos, ...]}`` with one (possibly empty) list per sequence.

    Raises
    ------
    KeyError
        If a hit is found and ``scan_strand`` (or the hit's strand) is not a
        recognised strand value.
    """
    from gimmemotifs.c_metrics import pwmscan

    min_score = self.pwm_min_score()
    c = min_score + (self.pwm_max_score() - min_score) * cutoff
    pwm = self.pwm
    # Normalises the different spellings of a strand to "+" / "-".
    strandmap = {"+": "+", 1: "+", "1": "+", "-": "-", -1: "-", "-1": "-"}
    matches = {}
    for name, seq in fa.items():
        hits = pwmscan(seq.upper(), pwm, c, nreport)
        # The strand lookup stays inside the condition so that an invalid
        # ``scan_strand`` only raises when there is a hit to filter, exactly
        # as before.
        matches[name] = [
            pos
            for _score, pos, strand in hits
            if not scan_strand or strandmap[scan_strand] == strandmap[strand]
        ]
    return matches
def pwm_scan_all(self, fa, cutoff=0.9, nreport=50, scan_rc=True):
    """Return position, score and strand of every hit for each sequence in *fa*.

    Parameters
    ----------
    fa
        Mapping-like object whose ``items()`` yields ``(name, sequence)``.
    cutoff
        Fraction of the motif's score range; the absolute threshold is
        ``min_score + (max_score - min_score) * cutoff``.
    nreport
        Maximum number of hits requested from ``pwmscan`` per sequence.
    scan_rc
        Forwarded to ``pwmscan``.

    Returns
    -------
    dict
        ``{name: [(pos, score, strand), ...]}``. The strand is passed through
        exactly as ``pwmscan`` reports it.
    """
    from gimmemotifs.c_metrics import pwmscan

    min_score = self.pwm_min_score()
    c = min_score + (self.pwm_max_score() - min_score) * cutoff
    pwm = self.pwm
    matches = {}
    for name, seq in fa.items():
        hits = pwmscan(seq.upper(), pwm, c, nreport, scan_rc)
        # pwmscan yields (score, pos, strand); callers get (pos, score, strand).
        matches[name] = [(pos, score, strand) for score, pos, strand in hits]
    return matches
def scan_sequence(seq, motifs, nreport, scan_rc):
    """Scan one sequence with every motif and collect the hits per motif.

    Parameters
    ----------
    seq
        Sequence handed to ``pwmscan`` unchanged.
    motifs
        Iterable of ``(motif, cutoff)`` pairs. A cutoff of ``None`` means
        "do not scan this motif" and yields an empty hit list.
    nreport
        Maximum number of hits requested from ``pwmscan`` per motif.
    scan_rc
        Forwarded to ``pwmscan``.

    Returns
    -------
    list
        One entry per motif, in input order: the ``pwmscan`` result, an
        empty list for skipped motifs, or ``nreport`` placeholder rows
        ``[min_score, 0, 1]`` when the cutoff is at or below the motif's
        minimum score and nothing was reported.
    """
    ret = []
    for motif, cutoff in motifs:
        if cutoff is None:
            ret.append([])
            continue

        result = pwmscan(seq, motif.logodds, cutoff, nreport, scan_rc)
        min_score = motif.pwm_min_score()
        if cutoff <= min_score and len(result) == 0:
            # Build each placeholder row separately: ``[[...]] * nreport``
            # would repeat one shared list, so editing a single row would
            # change all of them.
            result = [[min_score, 0, 1] for _ in range(nreport)]
        ret.append(result)
    return ret
def seqcor(m1, m2, seq=None):
    """Return the maximum correlation between the scan profiles of two motifs.

    Both motifs are scanned over the same sequence and the two resulting
    score arrays are compared at every relative offset up to the motif
    lengths; the best ``1 - distance.correlation(...)`` value is returned.

    Parameters
    ----------
    m1, m2
        Motif objects providing ``len()``, ``.pwm`` and ``.pwm_min_score()``.
    seq
        Sequence to scan. Defaults to the module-level ``RANDOM_SEQ``.

    Raises
    ------
    ValueError
        From ``max`` if both motifs have length zero.
    """
    l1 = len(m1)
    l2 = len(m2)
    longest = max(l1, l2)

    if seq is None:
        seq = RANDOM_SEQ

    # The slice bounds must follow the sequence that is actually scanned.
    # ``L`` was not bound in this function before, so a caller-supplied
    # ``seq`` of another length could not be handled consistently.
    # NOTE(review): assumes any module-level ``L`` equalled len(RANDOM_SEQ).
    L = len(seq)

    # Sixth positional argument ``True`` - presumably makes pwmscan return a
    # score for every position; TODO confirm against the pwmscan signature.
    result1 = np.array(pwmscan(seq, m1.pwm, m1.pwm_min_score(), len(seq), False, True))
    result2 = np.array(pwmscan(seq, m2.pwm, m2.pwm_min_score(), len(seq), False, True))

    end = L - longest
    c = []
    # Shift the second profile to the right relative to the first ...
    for i in range(l1):
        c.append(1 - distance.correlation(result1[:end - i], result2[i:end]))
    # ... and then the first relative to the second.
    for i in range(l2):
        c.append(1 - distance.correlation(result1[i:end], result2[:end - i]))

    return max(c)
def seqcor(m1,m2):
    """Return the best correlation between two motifs' scan profiles.

    A fresh random sequence of 10**4 nucleotides is generated on every call,
    both motifs are scanned over it, and the two profiles are correlated at
    every relative shift up to each motif's length. The largest
    ``1 - distance.correlation(...)`` value wins.
    """
    size1, size2 = len(m1), len(m2)
    span = max(size1, size2)

    # Random background to scan.
    L = 10 ** 4
    random_seq = "".join([random.choice(['A', 'C', 'T', 'G']) for _ in range(L)])

    profile1 = pwmscan(random_seq.upper(), m1.pwm, m1.pwm_min_score(), len(random_seq), False, True)
    profile2 = pwmscan(random_seq.upper(), m2.pwm, m2.pwm_min_score(), len(random_seq), False, True)

    stop = L - span
    # Shifts of the second profile against the first, then the reverse.
    correlations = [
        1 - distance.correlation(profile1[:stop - shift], profile2[shift:stop])
        for shift in range(size1)
    ]
    correlations.extend(
        1 - distance.correlation(profile1[shift:stop], profile2[:stop - shift])
        for shift in range(size2)
    )
    return max(correlations)
def pwm_scan_to_gff(self, fa, gfffile, cutoff=0.9, rc=True, nreport=50, append=False):
    """Scan every sequence in *fa* with this motif and write the hits to a file.

    Parameters
    ----------
    fa
        Mapping-like object whose ``items()`` yields ``(name, sequence)``.
    gfffile
        Path of the output file.
    cutoff
        Fraction of the motif's score range; the absolute threshold is
        ``min_score + (max_score - min_score) * cutoff``.
    rc
        Accepted for interface compatibility; this implementation does not
        read it (``pwmscan`` is called without a strand argument).
    nreport
        Maximum number of hits requested from ``pwmscan`` per sequence.
    append
        Append to *gfffile* instead of overwriting it.

    One tab-separated line is written per hit: sequence name, hit start,
    ``start + len(pwm)``, score, strand symbol, motif id and the matching
    slice of the input sequence in its original case.
    """
    from gimmemotifs.c_metrics import pwmscan

    strandmap = {-1: "-", "-1": "-", "-": "-", "1": "+", 1: "+", "+": "+"}

    # ``with`` closes the handle even if scanning or formatting raises.
    with open(gfffile, "a" if append else "w") as out:
        min_score = self.pwm_min_score()
        c = min_score + (self.pwm_max_score() - min_score) * cutoff
        pwm = self.pwm
        width = len(pwm)
        # ``name`` rather than ``id`` so the builtin is not shadowed.
        for name, seq in fa.items():
            result = pwmscan(seq.upper(), pwm, c, nreport)
            for score, pos, strand in result:
                out.write("%s\tpwmscan\tmisc_feature\t%s\t%s\t%s\t%s\t.\tmotif_name \"%s\" ; motif_instance \"%s\"\n" % (
                    name, pos, pos + width, score, strandmap[strand],
                    self.id, seq[pos:pos + width]))
def pwm_scan_to_gff(self, fa, gfffile, cutoff=0.9, nreport=50, scan_rc=True, append=False):
    """Scan every sequence in *fa* with this motif and write the hits to a file.

    Parameters
    ----------
    fa
        Mapping-like object whose ``items()`` yields ``(name, sequence)``.
    gfffile
        Path of the output file.
    cutoff
        Fraction of the motif's score range; the absolute threshold is
        ``min_score + (max_score - min_score) * cutoff``.
    nreport
        Maximum number of hits requested from ``pwmscan`` per sequence.
    scan_rc
        Forwarded to ``pwmscan``.
    append
        Append to *gfffile* instead of overwriting it.

    One tab-separated line is written per hit: sequence name, hit start,
    ``start + len(pwm)``, score, strand symbol, motif id and the matching
    slice of the input sequence in its original case.
    """
    from gimmemotifs.c_metrics import pwmscan

    strandmap = {-1: "-", "-1": "-", "-": "-", "1": "+", 1: "+", "+": "+"}

    # ``with`` closes the handle even if scanning or formatting raises.
    with open(gfffile, "a" if append else "w") as out:
        min_score = self.pwm_min_score()
        c = min_score + (self.pwm_max_score() - min_score) * cutoff
        pwm = self.pwm
        width = len(pwm)
        # ``name`` rather than ``id`` so the builtin is not shadowed.
        for name, seq in fa.items():
            result = pwmscan(seq.upper(), pwm, c, nreport, scan_rc)
            for score, pos, strand in result:
                out.write("%s\tpwmscan\tmisc_feature\t%s\t%s\t%s\t%s\t.\tmotif_name \"%s\" ; motif_instance \"%s\"\n" % (
                    name, pos, pos + width, score, strandmap[strand],
                    self.id, seq[pos:pos + width]))
# Command-line script: for every motif in a PWM file, report the score (and
# the corresponding relative cutoff) that yields the requested false positive
# rate on a set of background sequences.
parser = OptionParser()
parser.add_option("-p", "--pwmfile", dest="pwmfile", help="File with pwms", metavar="FILE")
parser.add_option("-i", "--inputfile", dest="inputfile", help="FASTA file with background sequences", metavar="FILE")
parser.add_option("-f", "--fpr", dest="fpr", help="Desired fpr", type="float", metavar="FLOAT")

(options, args) = parser.parse_args()

# ``is None`` instead of a truthiness test: ``-f 0`` is inside the accepted
# 0..1 range and must not be mistaken for a missing option.
if not options.pwmfile or not options.inputfile or options.fpr is None:
    parser.print_help()
    sys.exit()

if options.fpr < 0 or options.fpr > 1:
    print("Please specify a FPR between 0 and 1")
    sys.exit()

f = Fasta(options.inputfile)
motifs = pwmfile_to_motifs(options.pwmfile)

print("Motif\tScore\tCutoff")
for motif in motifs:
    pwm = motif.pwm
    min_score = motif.pwm_min_score()

    # Best hit per background sequence: scanning at the minimum score with
    # nreport=1 asks pwmscan for a single hit per sequence.
    scores = []
    for name, seq in f.items():
        result = pwmscan(seq.upper(), pwm, min_score, 1, True)
        scores.append(result[0][0])

    # The score exceeded by a fraction ``fpr`` of the background sequences.
    opt_score = scoreatpercentile(scores, 100 - (100 * options.fpr))
    # Express that score as a fraction of the motif's score range.
    cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score)
    print("%s\t%s\t%s" % (motif.id, opt_score, cutoff))
if options.nreport: nreport = int(options.nreport) cutoff = float(options.cutoff) motifs = pwmfile_to_motifs(options.pwmfile) bed = options.bed f = Fasta(inputfile) strandmap = {-1:"-",1:"+"} for (id,seq) in f.items(): for motif in motifs: pwm = motif.pwm c = motif.pwm_min_score() + (motif.pwm_max_score() - motif.pwm_min_score()) * cutoff result = pwmscan(seq.upper(), pwm, c, nreport, options.scan_rc) for (score, pos, strand) in result: if bed: first = id.split(" ")[0] (chr,loc) = first.split(":") if loc: (start, end) = map(int, loc.split("-")) print "%s\t%s\t%s\t%s" % (chr, start + pos, start + pos + len(pwm) , score) else: print "%s\t%s\t%s\t%s" % (id, pos, pos + len(pwm), score) else: print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tmotif_name \"%s\" ; motif_instance \"%s\"" % ( id, "pwmscan", "misc_feature", pos, pos + len(pwm) ,
if options.nreport: nreport = int(options.nreport) cutoff = float(options.cutoff) motifs = pwmfile_to_motifs(options.pwmfile) bed = options.bed f = Fasta(inputfile) strandmap = {-1:"-",1:"+"} for (id,seq) in f.items(): for motif in motifs: pwm = motif.pwm c = motif.pwm_min_score() + (motif.pwm_max_score() - motif.pwm_min_score()) * cutoff result = pwmscan(seq.upper(), pwm, c, nreport) for (score, pos, strand) in result: if bed: first = id.split(" ")[0] (chr,loc) = first.split(":") if loc: (start, end) = map(int, loc.split("-")) print "%s\t%s\t%s\t%s" % (chr, start + pos, start + pos + len(pwm) , score) else: print "%s\t%s\t%s\t%s" % (id, pos, pos + len(pwm), score) else: print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tmotif_name \"%s\" ; motif_instance \"%s\"" % ( id, "pwmscan", "misc_feature", pos, pos + len(pwm) ,
dest="fpr", help="Desired fpr", type="float", metavar="FLOAT") (options, args) = parser.parse_args() if not options.pwmfile or not options.inputfile or not options.fpr: parser.print_help() exit() if options.fpr < 0 or options.fpr > 1: print "Please specify a FPR between 0 and 1" sys.exit() f = Fasta(options.inputfile) motifs = pwmfile_to_motifs(options.pwmfile) print "Motif\tScore\tCutoff" for motif in motifs: pwm = motif.pwm scores = [] min_score = motif.pwm_min_score() for name, seq in f.items(): result = pwmscan(seq.upper(), pwm, min_score, 1, True) score = result[0][0] scores.append(score) opt_score = scoreatpercentile(scores, 100 - (100 * options.fpr)) cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score) print "%s\t%s\t%s" % (motif.id, opt_score, cutoff)