def load_yaml(self, fname):
    """Bulk-load features, read counts and evidence from a YAML dump."""
    data = yaml.load(open(fname))
    source_map = {}
    if not data['feature']:
        return

    # Re-create the read sources and remember how their old ids map to
    # the ids assigned by this database
    for old_id, name, source_fname, nreads in data['read_source']:
        r = get_or_create(self.session, ReadSource,
                          name=name, source=source_fname, nreads=nreads)
        self.session.commit()
        source_map[old_id] = r.id

    # Bulk-insert all features, then reconstruct the old-id -> new-id
    # mapping from the (consecutive) ids of the first and last feature
    t = ["chrom", "start", "end", "strand", "ftype", "seq"]
    self.engine.execute(Feature.__table__.insert(),
                        [dict(zip(t, row[1:])) for row in data['feature']])
    self.session.commit()
    first = self.fetch_feature(data['feature'][0][1:])
    last = self.fetch_feature(data['feature'][-1][1:])
    f_map = dict(zip([x[0] for x in data['feature']],
                     range(first.id, last.id + 1)))

    # Remap the read counts to the new read source and feature ids
    data['read_count'] = [[source_map[row[0]], f_map[row[1]]] + row[2:]
                          for row in data['read_count']]
    t = ["read_source_id", "feature_id", "count",
         "span", "extend_up", "extend_down"]
    self.engine.execute(FeatureReadCount.__table__.insert(),
                        [dict(zip(t, row)) for row in data['read_count']])

    if data['evidence']:
        # Same procedure for the evidence and the feature <-> evidence links
        t = ["name", "source"]
        self.engine.execute(Evidence.__table__.insert(),
                            [dict(zip(t, row[1:])) for row in data['evidence']])
        self.session.commit()
        first = self.fetch_evidence(data['evidence'][0][1:])
        last = self.fetch_evidence(data['evidence'][-1][1:])
        ev_map = dict(zip([x[0] for x in data['evidence']],
                          range(first.id, last.id + 1)))
        data['feature_evidence'] = [[f_map[row[0]], ev_map[row[1]]]
                                    for row in data['feature_evidence']]
        t = ["feature_id", "evidence_id"]
        self.engine.execute(FeatureEvidence.__table__.insert(),
                            [dict(zip(t, row))
                             for row in data['feature_evidence']])
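# Illustrative only: a minimal document of the shape load_yaml() expects,
# inferred from the zip() calls above. The leading element of every row is
# the id the record had in the database it was exported from; load_yaml()
# remaps those ids to the ids assigned here. File name and values are made
# up for the example.
import yaml

example = {
    "read_source": [
        # [old_id, name, source file, number of reads]
        [1, "sample1", "sample1.bam", 1000000],
    ],
    "feature": [
        # [old_id, chrom, start, end, strand, ftype, seq]
        [10, "chr1", 100, 200, "+", "exon", "ACGTACGT"],
        [11, "chr1", 300, 400, "+", "exon", "ACGTACGT"],
    ],
    "read_count": [
        # [read_source old_id, feature old_id, count, span, extend_up, extend_down]
        [1, 10, 42, "all", 0, 0],
    ],
    "evidence": [
        # [old_id, name, source]
        [5, "transcript_1", "annotation"],
    ],
    "feature_evidence": [
        # [feature old_id, evidence old_id]
        [10, 5],
    ],
}

with open("example_features.yaml", "w") as f:
    yaml.dump(example, f)
# db.load_yaml("example_features.yaml")  # 'db' is a hypothetical collection instance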
def get_splice_statistics(self, chrom, fnames, name):
    """Load splice junction read counts from one or more junction files."""
    # Accept either a single filename or a list of filenames
    if isinstance(fnames, str):
        fnames = [fnames]

    for fname in fnames:
        self.logger.debug("Getting splicing data from %s", fname)
        read_source = get_or_create(self.session, ReadSource,
                                    name=name, source=fname)
        self.session.commit()

        for line in open(fname):
            vals = line.strip().split("\t")
            if vals[0] == chrom:
                # Columns: chrom, start, end, count, score (ignored), strand
                start, end, c = [int(x) for x in vals[1:4]]
                strand = vals[5]

                splice = get_or_create(self.session, Feature,
                                       chrom=chrom,
                                       start=start,
                                       end=end,
                                       strand=strand,
                                       ftype="splice_junction")
                self.session.commit()

                count = get_or_create(self.session, FeatureReadCount,
                                      feature_id=splice.id,
                                      read_source_id=read_source.id)
                if not count.count:
                    count.count = c
                else:
                    count.count += c
                self.session.commit()
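# Illustrative only: get_splice_statistics() reads a tab-separated, BED-like
# junction file in which column 4 holds the number of supporting reads and
# column 6 the strand (column 5 is ignored). The file name, contents and the
# 'db' instance below are made up for the example.
with open("junctions.bed", "w") as f:
    f.write("chr1\t1000\t2000\t15\t0\t+\n")
    f.write("chr1\t2500\t4000\t7\t0\t-\n")
# db.get_splice_statistics("chr1", "junctions.bed", name="sample1")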
def get_read_statistics(self, chrom, fnames, name, span="all",
                        extend=(0, 0), nreads=None):
    """Count reads over all exons of a chromosome for one or more read files."""
    if span not in ["all", "start", "end"]:
        raise Exception("Incorrect span: {}".format(span))

    # Write the exon intervals to a temporary BED file
    tmp = NamedTemporaryFile(mode="w", delete=False)
    estore = {}
    self.logger.debug("Writing exons to file %s", tmp.name)
    exons = self.get_exons(chrom)
    if len(exons) == 0:
        return

    for exon in exons:
        start = exon.start
        end = exon.end
        # Optionally collapse the interval to the 5' or 3' coordinate
        if span == "start":
            if exon.strand == "+":
                end = start
            elif exon.strand == "-":
                start = end
        if span == "end":
            if exon.strand == "+":
                start = end
            elif exon.strand == "-":
                end = start
        # Strand-aware up- and downstream extension
        if exon.strand == "-":
            start -= extend[1]
            end += extend[0]
        else:
            start -= extend[0]
            end += extend[1]
        if start < 0:
            start = 0

        # Several exons may collapse onto the same interval; remember all of them
        estr = "{}:{}-{}".format(exon.chrom, start, end)
        if estr in estore:
            estore[estr].append(exon)
        else:
            estore[estr] = [exon]
        tmp.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
            exon.chrom, start, end, str(exon), 0, exon.strand))
    tmp.flush()

    # Accept either a single filename or a list of filenames
    if isinstance(fnames, str):
        fnames = [fnames]

    for i, fname in enumerate(fnames):
        self.logger.debug("Creating read_source for %s %s", name, fname)
        read_source = get_or_create(self.session, ReadSource,
                                    name=name, source=fname)
        self.session.commit()

        if fname.endswith("bam") and (not nreads or not nreads[i]):
            self.logger.debug("Counting reads in %s", fname)
            read_source.nreads = read_statistics(fname)

        self.logger.debug("Getting overlap from %s", fname)
        result = get_binned_stats(tmp.name, fname, 1,
                                  rpkm=False, rmdup=False, rmrepeats=False)

        self.logger.debug("Reading results, save to exon stats")
        insert_vals = []
        for row in result:
            try:
                vals = row.strip().split("\t")
                e = "%s:%s-%s" % (vals[0], vals[1], vals[2])
                c = float(vals[3])
                for exon in estore[e]:
                    insert_vals.append([read_source.id, exon.id, c,
                                        span, extend[0], extend[1]])
            except Exception:
                self.logger.info("binned_stat line skipped: {}".format(row))

        t = ["read_source_id", "feature_id", "count",
             "span", "extend_up", "extend_down"]
        result = self.engine.execute(
            FeatureReadCount.__table__.insert(),
            [dict(zip(t, row)) for row in insert_vals])
    tmp.close()
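# A minimal standalone sketch of the interval arithmetic used above: 'span'
# first collapses an exon to its 5' or 3' coordinate and 'extend' then widens
# the interval in a strand-aware way. The function name is illustrative and
# not part of the module.
def example_stat_interval(start, end, strand, span="all", extend=(0, 0)):
    """Return the (start, end) interval that get_read_statistics() counts over."""
    if span == "start":
        if strand == "+":
            end = start
        elif strand == "-":
            start = end
    if span == "end":
        if strand == "+":
            start = end
        elif strand == "-":
            end = start
    if strand == "-":
        start -= extend[1]
        end += extend[0]
    else:
        start -= extend[0]
        end += extend[1]
    return max(start, 0), end

# A 100 bp window around the 5' end of a minus-strand exon:
# example_stat_interval(1000, 2000, "-", span="start", extend=(100, 100))
# returns (1900, 2100)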
def get_read_statistics(self, chrom, fnames, name, span="all",
                        extend=(0, 0), nreads=None):
    """Count reads over all exons of a chromosome for one or more read files."""
    if span not in ["all", "start", "end"]:
        raise Exception("Incorrect span: {}".format(span))

    # Write the exon intervals to a temporary BED file
    tmp = NamedTemporaryFile(mode="w", delete=False, suffix=".bed")
    estore = {}
    self.logger.debug("Writing exons to file %s", tmp.name)
    exons = self.get_exons(chrom)
    if len(exons) == 0:
        return

    for exon in exons:
        start = exon.start
        end = exon.end
        # Optionally collapse the interval to the 5' or 3' coordinate
        if span == "start":
            if exon.strand == "+":
                end = start
            elif exon.strand == "-":
                start = end
        if span == "end":
            if exon.strand == "+":
                start = end
            elif exon.strand == "-":
                end = start
        # Strand-aware up- and downstream extension
        if exon.strand == "-":
            start -= extend[1]
            end += extend[0]
        else:
            start -= extend[0]
            end += extend[1]
        if start < 0:
            start = 0

        # Several exons may collapse onto the same interval; remember all of them
        estr = "{}:{}-{}".format(exon.chrom, start, end)
        if estr in estore:
            estore[estr].append(exon)
        else:
            estore[estr] = [exon]
        tmp.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
            exon.chrom, start, end, str(exon), 0, exon.strand))
    tmp.flush()

    # Accept either a single filename or a list of filenames
    if isinstance(fnames, str):
        fnames = [fnames]

    for i, fname in enumerate(fnames):
        self.logger.debug("Creating read_source for %s %s", name, fname)
        read_source = get_or_create(self.session, ReadSource,
                                    name=name, source=fname)
        self.session.commit()

        if fname.endswith("bam") and (not nreads or not nreads[i]):
            self.logger.debug("Counting reads in %s", fname)
            read_source.nreads = read_statistics(fname)

        self.logger.debug("Getting overlap from %s", fname)
        track = BamTrack(fname)
        result = track.binned_stats(tmp.name, 1,
                                    rpkm=False, rmdup=False, rmrepeats=False)

        self.logger.debug("Reading results, save to exon stats")
        insert_vals = []
        for row in result:
            try:
                vals = row.strip().split("\t")
                e = "%s:%s-%s" % (vals[0], vals[1], vals[2])
                c = float(vals[3])
                for exon in estore[e]:
                    insert_vals.append([read_source.id, exon.id, c,
                                        span, extend[0], extend[1]])
            except Exception:
                self.logger.info("binned_stat line skipped: {}".format(row))

        t = ["read_source_id", "feature_id", "count",
             "span", "extend_up", "extend_down"]
        result = self.engine.execute(
            FeatureReadCount.__table__.insert(),
            [dict(zip(t, row)) for row in insert_vals])
    tmp.close()
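# Illustrative usage only; 'db' and the file names are placeholders. The
# first call counts reads over the full exon bodies, the second over a
# 100 bp window around each exon's 5' end (e.g. for CAGE-style data).
# db.get_read_statistics("chr1", ["sample1.bam", "sample2.bam"],
#                        name="RNA-seq", span="all")
# db.get_read_statistics("chr1", "cage.bam", name="CAGE",
#                        span="start", extend=(100, 100))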
def add_transcript(self, name, source, exons):
    """
    Add a transcript to the database
    """
    # Sanity checks: all exons on the same chromosome and strand,
    # sorted by position and non-overlapping
    for e1, e2 in zip(exons[:-1], exons[1:]):
        if e1[0] != e2[0]:
            sys.stderr.write("{0} - {1}\n".format(e1, e2))
            raise ValueError("Different chromosomes!")
        if e2[1] <= e1[2]:
            sys.stderr.write("{0} - {1}\n".format(e1, e2))
            raise ValueError("exons overlap, or in wrong order")
        if e1[3] != e2[3]:
            sys.stderr.write("{0} - {1}\n".format(e1, e2))
            raise ValueError("strands don't match")

    chrom = exons[0][0]
    strand = exons[0][-1]

    evidence = get_or_create(self.session, Evidence, name=name, source=source)

    seqs = []
    for exon in exons:
        seq = ""
        real_seq = ""
        if self.index:
            # Fetch the exon sequence with 20 bp of flanking genomic
            # sequence, used for the splice site scoring below
            seq = ""
            try:
                seq = self.index.get_sequence(chrom, exon[1] - 20,
                                              exon[2] + 20, strand)
                real_seq = seq[20:-20]
            except Exception:
                real_seq = self.index.get_sequence(chrom, exon[1], exon[2],
                                                   strand)
        seqs.append(seq)

        exon = get_or_create(self.session, Feature,
                             chrom=chrom,
                             start=exon[1],
                             end=exon[2],
                             strand=strand,
                             ftype="exon",
                             seq=real_seq)
        exon.evidences.append(evidence)

    splice_donors = []
    splice_acceptors = []
    # Splice junctions are the gaps between consecutive exons
    for i, (start, end) in enumerate([(e1[2], e2[1])
                                      for e1, e2 in zip(exons[:-1], exons[1:])]):
        self.logger.debug("%s %s %s %s", chrom, start, end, strand)
        sj = get_or_create(self.session, Feature,
                           chrom=chrom,
                           start=start,
                           end=end,
                           strand=strand,
                           ftype="splice_junction")
        sj.evidences.append(evidence)

        # Collect donor and acceptor sequences, but only where the flanked
        # exon sequence could be retrieved and is long enough
        if strand == "+":
            if len(seqs) > (i + 1) and len(seqs[i]) > 46:
                splice_donors.append(
                    ["{}_{}".format(name, i + 1), seqs[i][-23:-14]])
            if len(seqs) > (i + 2) and len(seqs[i + 1]) > 46:
                f = ["{}_{}".format(name, i + 1), seqs[i + 1][:23]]
                splice_acceptors.append(f)
        else:
            if len(seqs) > (i + 2) and len(seqs[i + 1]) > 46:
                f = ["{}_{}".format(name, i + 1), seqs[i + 1][-23:-14]]
                splice_donors.append(f)
            if len(seqs) > (i + 1) and len(seqs[i]) > 46:
                f = ["{}_{}".format(name, i + 1), seqs[i][:23]]
                splice_acceptors.append(f)

    # Score the splice sites; reject the transcript if the combined score
    # is negative
    donor_score = get_splice_score(splice_donors, 5)
    acceptor_score = get_splice_score(splice_acceptors, 3)
    if donor_score + acceptor_score < 0:
        self.logger.warning("Skipping %s, splicing not OK!", name)
        self.session.rollback()
    else:
        self.session.commit()
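# Illustrative only: exons are (chrom, start, end, strand) tuples, sorted by
# position, all on the same chromosome and strand, which is exactly what the
# sanity checks above enforce. 'db' is a hypothetical collection instance and
# the transcript name and coordinates are made up.
# db.add_transcript(
#     "GENE1.t1",
#     "reference_annotation",
#     [("chr1", 1000, 1200, "+"),
#      ("chr1", 1500, 1700, "+"),
#      ("chr1", 2000, 2300, "+")],
# )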