def run(self):
    """Classify each transcript by whether its CDS length is a multiple of 3.

    For every entry of ``self.transcriptDict`` the matching annotation is
    looked up in ``self.annotationDict`` under the alignment ID with its
    alignment number removed. Entries where either CDS length is <= 75 are
    skipped entirely (no classification is recorded for them).

    A transcript whose CDS length is not a multiple of 3 is classified 1 and
    gets a BED record spanning ``thickStart``..``thickStop``; otherwise it is
    classified 0. The record uses ``self.colors["input"]`` when the
    annotation's CDS length is also not a multiple of 3, else ``self.rgb``.
    Results are handed to ``self.dumpValueDicts``.
    """
    self.getAlignmentDict()
    self.getTranscriptDict()
    self.getAnnotationDict()

    details = {}
    classifications = {}
    for aln_id, transcript in self.transcriptDict.iteritems():
        annotation = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]

        # Skip pairs whose CDS is too short on either side.
        annotation_cds = annotation.getCdsLength()
        if annotation_cds <= 75:
            continue
        transcript_cds = transcript.getCdsLength()
        if transcript_cds <= 75:
            continue

        if transcript_cds % 3 == 0:
            classifications[aln_id] = 0
            continue

        # The transcript is out of frame; the colour records whether the
        # source annotation was already out of frame as well.
        if annotation_cds % 3 != 0:
            colour = self.colors["input"]
        else:
            colour = self.rgb
        details[aln_id] = seq_lib.chromosome_coordinate_to_bed(
            transcript, transcript.thickStart, transcript.thickStop, colour, self.column)
        classifications[aln_id] = 1

    self.dumpValueDicts(classifications, details)
def run(self):
    """Report the windows of each aligned transcript that are out of frame.

    Walks ``self.alignmentDict``; alignments with no entry in
    ``self.transcriptDict`` are skipped, as are pairs where the annotation's
    or the transcript's CDS length is <= 75. For the rest, the triples
    produced by ``frameShiftIterator`` (unpacked here as start, stop, span)
    are folded into a running frame (cumulative span modulo 3). A window
    opens at the first shift and at every shift that follows a return to
    frame 0, and closes at every shift that brings the frame back to 0.

    Transcripts with no shifts are classified 0. Otherwise the transcript is
    classified 1 and one BED record per window is appended to its details
    list. Results are handed to ``self.dumpValueDicts``.
    """
    self.getAlignmentDict()
    self.getTranscriptDict()
    self.getAnnotationDict()

    details = defaultdict(list)
    classifications = {}
    for aln_id, aln in self.alignmentDict.iteritems():
        if aln_id not in self.transcriptDict:
            continue
        t = self.transcriptDict[aln_id]
        a = self.annotationDict[psl_lib.remove_alignment_number(aln_id)]
        # Skip pairs whose CDS is too short on either side.
        if a.getCdsLength() <= 75 or t.getCdsLength() <= 75:
            continue

        shifts = list(frameShiftIterator(a, t, aln))
        if len(shifts) == 0:
            classifications[aln_id] = 0
            continue
        indel_starts, indel_stops, spans = zip(*shifts)

        # frames[i] is the frame *before* shift i and frames[i + 1] the frame
        # *after* it; the leading 0 provides that one-position offset.
        running_total = 0
        frames = [0]
        for span in spans:
            running_total += span
            frames.append(running_total % 3)

        first_start = indel_starts[0]
        window_starts = []
        window_stops = []
        for i, (shift_start, shift_stop) in enumerate(zip(indel_starts, indel_stops)):
            # A window opens when we were in frame before this shift.
            if frames[i] == 0 or shift_start == first_start:
                window_starts.append(shift_start)
            # A window closes when this shift restores the frame.
            if frames[i + 1] == 0:
                window_stops.append(shift_stop)

        # Sanity check: at most the final window may still be open.
        assert len(window_starts) - len(window_stops) in (0, 1), \
            (self.genome, self.column, aln_id)

        negative_strand = t.strand is False
        # A shift that never regains frame runs to the end of the CDS, which
        # is thickStart when the transcript is on the negative strand.
        if len(window_stops) < len(window_starts):
            window_stops.append(t.thickStart if negative_strand else t.thickStop)
        # Swap the lists on the negative strand so that start < stop.
        if negative_strand:
            window_starts, window_stops = window_stops, window_starts

        for start, stop in zip(window_starts, window_stops):
            record = seq_lib.chromosome_coordinate_to_bed(t, start, stop, self.rgb, self.column)
            details[aln_id].append(record)
        classifications[aln_id] = 1

    self.dumpValueDicts(classifications, details)