def mapStartRangeCheckFunction(val, line):
    '''Report whether the range in val overlaps the range described by line.

    line is a tab-separated record: column 3 holds an integer start and the
    length of column 4 is added to it to get the end (presumably column 4 is
    a sequence string -- confirm against the file format).
    val is split with cg.tccSplit into four parts, of which only the last
    two (start, end) are used.

    Returns 0 when cg.simpleOverlap reports an overlap, otherwise -1.
    '''
    fields = line.strip().split('\t')
    lineStart = int(fields[3])
    lineEnd = lineStart + len(fields[4])

    #only the coordinates of val matter here
    _, _, start, end = cg.tccSplit(val)

    if cg.simpleOverlap(int(start), int(end), lineStart, lineEnd):
        return 0
    return -1
def _classifyKeepCoord(coord_type, coord, oLeft, oRight, clearType):
    '''Update coord_type for one rangeKeep endpoint against one rangeOther range.'''
    if bioLibCG.simpleOverlap(coord, coord, oLeft, oRight):
        if coord == oLeft:
            #sits exactly on the start of an other range
            if coord_type.get(coord, 0) < 2:
                coord_type[coord] = 2
        elif coord == oRight:
            #sits exactly on the end of an other range
            if coord_type.get(coord, 0) < 2:
                coord_type[coord] = 3
        else:
            #overlaps the other range without matching either of its ends
            coord_type[coord] = 4
    elif coord_type.get(coord, 0) < 1:
        #only tag as clear when the coordinate has no tag yet
        coord_type[coord] = clearType


def subtractTwoRanges(rangeKeep, rangeOther):
    '''Cut the ranges in rangeOther out of the ranges in rangeKeep.

    rangeKeep is a list of (left, right) ranges, rangeOther is a list of
    (left, right) ranges.
    Original author note: not sure if rangeOther can overlap for this to
    work...should union them first...

    Every endpoint is tagged in coord_type:
        1 - keep left end that overlaps no other range
        5 - keep right end that overlaps no other range
        2 - coordinate where an other range starts
        3 - coordinate where an other range ends
        4 - keep endpoint overlapping an other range away from its ends
    Neighbouring sorted coordinates tagged (1, 2), (3, 5), (1, 5) or (3, 2)
    become output ranges; an end tagged 2 or 3 is moved by one so the
    coordinate of the other range itself is left out.

    Returns a list of (left, right) tuples. A single empty range in
    rangeKeep gives an empty list; a single empty range in rangeOther
    returns rangeKeep itself (same object, not a copy).
    '''
    #take care of empty ranges
    if len(rangeKeep) == 1 and not rangeKeep[0]:
        return list()
    if len(rangeOther) == 1 and not rangeOther[0]:
        return rangeKeep

    coord_type = {}

    #annotate keep
    for left, right in rangeKeep:
        if not rangeOther:
            #nothing to subtract, both ends are clear
            coord_type[left] = 1
            coord_type[right] = 5
        for oLeft, oRight in rangeOther:
            _classifyKeepCoord(coord_type, left, oLeft, oRight, 1)
            _classifyKeepCoord(coord_type, right, oLeft, oRight, 5)

    #annotate other (ends shared with a keep range were tagged above)
    for left, right in rangeOther:
        for kLeft, kRight in rangeKeep:
            if left != kLeft and left != kRight:
                if bioLibCG.simpleOverlap(left, left, kLeft, kRight):
                    coord_type[left] = 2
            if right != kLeft and right != kRight:
                if bioLibCG.simpleOverlap(right, right, kLeft, kRight):
                    coord_type[right] = 3

    #get cut coords
    keptPairs = [(1, 2), (3, 5), (1, 5), (3, 2)]
    returnList = []
    sortedKeys = sorted(coord_type)
    for prevKey, key in zip(sortedKeys, sortedKeys[1:]):
        a = coord_type[prevKey]
        b = coord_type[key]
        if (a, b) in keptPairs:
            #move by one; inside a kept pair a is 1 or 3 and b is 2 or 5
            left = prevKey + 1 if a == 3 else prevKey
            right = key - 1 if b == 2 else key
            returnList.append((left, right))

    return returnList
def _classifyKeepCoord(coord_type, coord, oLeft, oRight, clearType):
    '''Update coord_type for one rangeKeep endpoint against one rangeOther range.'''
    if bioLibCG.simpleOverlap(coord, coord, oLeft, oRight):
        if coord == oLeft:
            #sits exactly on the start of an other range
            if coord_type.get(coord, 0) < 2:
                coord_type[coord] = 2
        elif coord == oRight:
            #sits exactly on the end of an other range
            if coord_type.get(coord, 0) < 2:
                coord_type[coord] = 3
        else:
            #overlaps the other range without matching either of its ends
            coord_type[coord] = 4
    elif coord_type.get(coord, 0) < 1:
        #only tag as clear when the coordinate has no tag yet
        coord_type[coord] = clearType


def subtractTwoRanges(rangeKeep, rangeOther):
    '''Cut the ranges in rangeOther out of the ranges in rangeKeep.

    rangeKeep is a list of (left, right) ranges, rangeOther is a list of
    (left, right) ranges.
    Original author note: not sure if rangeOther can overlap for this to
    work...should union them first...

    Every endpoint is tagged in coord_type:
        1 - keep left end that overlaps no other range
        5 - keep right end that overlaps no other range
        2 - coordinate where an other range starts
        3 - coordinate where an other range ends
        4 - keep endpoint overlapping an other range away from its ends
    Neighbouring sorted coordinates tagged (1, 2), (3, 5), (1, 5) or (3, 2)
    become output ranges; an end tagged 2 or 3 is moved by one so the
    coordinate of the other range itself is left out.

    Returns a list of (left, right) tuples. A single empty range in
    rangeKeep gives an empty list; a single empty range in rangeOther
    returns rangeKeep itself (same object, not a copy).
    '''
    #take care of empty ranges
    if len(rangeKeep) == 1 and not rangeKeep[0]:
        return list()
    if len(rangeOther) == 1 and not rangeOther[0]:
        return rangeKeep

    coord_type = {}

    #annotate keep
    for left, right in rangeKeep:
        if not rangeOther:
            #nothing to subtract, both ends are clear
            coord_type[left] = 1
            coord_type[right] = 5
        for oLeft, oRight in rangeOther:
            _classifyKeepCoord(coord_type, left, oLeft, oRight, 1)
            _classifyKeepCoord(coord_type, right, oLeft, oRight, 5)

    #annotate other (ends shared with a keep range were tagged above)
    for left, right in rangeOther:
        for kLeft, kRight in rangeKeep:
            if left != kLeft and left != kRight:
                if bioLibCG.simpleOverlap(left, left, kLeft, kRight):
                    coord_type[left] = 2
            if right != kLeft and right != kRight:
                if bioLibCG.simpleOverlap(right, right, kLeft, kRight):
                    coord_type[right] = 3

    #get cut coords
    keptPairs = [(1, 2), (3, 5), (1, 5), (3, 2)]
    returnList = []
    sortedKeys = sorted(coord_type)
    for prevKey, key in zip(sortedKeys, sortedKeys[1:]):
        a = coord_type[prevKey]
        b = coord_type[key]
        if (a, b) in keptPairs:
            #move by one; inside a kept pair a is 1 or 3 and b is 2 or 5
            left = prevKey + 1 if a == 3 else prevKey
            right = key - 1 if b == 2 else key
            returnList.append((left, right))

    return returnList
def multiLoopCheck(xMerSeq, hairpinSeq, structure):
    '''Check the loops of a hairpin structure around the first xMerSeq match.

    xMerSeq is upper-cased and searched for (as a regular expression) in
    hairpinSeq; structure is the bracket notation string scanned for loops,
    where a loop is an opening paren, 1-100 dots and a closing paren.

    Returns True when the structure has at most one loop. Otherwise returns
    False when the 8 structure symbols at the match, or the 8 symbols at the
    position found by counting parens back from the end, contain both '('
    and ')'. Otherwise counts the loops lying inside or overlapping the
    span between those two positions (widened to 60 when it is not already
    longer than 60) and returns True only when fewer than 2 are found.

    Raises AttributeError when xMerSeq is not found in hairpinSeq, because
    re.search then returns None.
    '''
    fixedseq = xMerSeq.upper()
    symbols = str(structure)

    #find position of conserved Kmer (0-based start of the FIRST match)
    matchObject = re.search(fixedseq, hairpinSeq)
    startingpt = matchObject.start(0)

    #get the (start, end) position of each loop
    loops = [(match.start(), match.end())
             for match in re.finditer('[(][.]{1,100}[)]', symbols)]

    #a single loop (or none) always passes
    if len(loops) <= 1:
        return True

    #count number of unclosed parens before the starting point...
    prefix = symbols[:startingpt]
    countmatch = prefix.count('(') - prefix.count(')')

    #finding position of 8mer on the - strand (count backwards)
    revpos = len(symbols) - 1
    revmatch = 0
    while revmatch < countmatch:
        char = symbols[revpos]
        if char == ')':
            revmatch += 1
        if char == '(':
            revmatch -= 1
        revpos -= 1

    #checking if the 8mer contains loop
    forward8mer = symbols[startingpt:startingpt + 8]
    if '(' in forward8mer and ')' in forward8mer:
        return False

    #checking if there is a loop directly across from 8mer
    reverse8mer = symbols[revpos - 7:revpos + 1]
    if '(' in reverse8mer and ')' in reverse8mer:
        return False

    #if 8mer is on (-) strand, switch the coordinates
    if startingpt > revpos:
        startingpt, revpos = revpos, startingpt

    if (revpos - startingpt) <= 60:
        #redo range to be 60; floor division keeps the positions integral
        #on Python 3 exactly as "/" did on Python 2
        spread = (60 - (revpos - startingpt)) // 2
        startingpt -= spread
        revpos += spread

    loopspast8mer = 0
    for loopstart, loopend in loops:
        if ((loopstart > startingpt) and (loopend < revpos)) or cg.simpleOverlap(startingpt, revpos, loopstart, loopend):
            loopspast8mer += 1

    return loopspast8mer < 2
def createGeneSetEditing(fN):
    '''Read transcript info into a gene set from our editing file...

    fN is the path of a tab-separated file whose first line is a header.
    Columns read from each following line (0-based index):
        0 transcript id, 1 chromosome, 2 strand ('+' is stored as '1',
        anything else as '-1'), 3/4 transcript start/end, 5/6 coding
        start/end, 8/9 exon starts/ends as comma-terminated lists,
        10 gene name, 13 value stored as tType, 15 value stored as gType.
    Every end coordinate is reduced by one (0 based).

    Returns geneSet(...) built from one gene(...) per distinct gene name.
    '''
    #collect all transcripts
    allTranscripts = []
    with open(fN, 'r') as inFile: #closed even if a line fails to parse
        inFile.readline() #header

        for line in inFile:
            #parse info
            ls = line.strip().split('\t')
            strand = '1' if ls[2] == '+' else '-1'
            chrom = ls[1]
            tStart = int(ls[3])
            tEnd = int(ls[4]) - 1 #0 based
            cStart = int(ls[5])
            cEnd = int(ls[6]) - 1 #0 based
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')] #0 based

            #make transcript
            t = transcript()
            t.id = ls[0]
            t.parent = ls[10]
            t.strand = strand
            t.chromosome = chrom
            t.tcc = '%s:%s:%s:%s' % (chrom, strand, tStart, tEnd)
            t.tType = ls[13]
            t.gType = ls[15]

            #Exons (indexed so mismatched start/end counts still raise)
            for i, eStart in enumerate(exonStarts):
                t.exonList.append([eStart, exonEnds[i]])

            #Introns: the gap between one exon's end and the next exon's start
            for prevEnd, eStart in zip(exonEnds, exonStarts[1:]):
                t.intronList.append([prevEnd + 1, eStart - 1])

            #Set UTR stats. INC when the coding boundary sits on a transcript boundary
            if cStart == tStart or cStart == tEnd:
                startStat = 'INC'
            else:
                startStat = 'COMP'
            if cEnd == tEnd or cEnd == tStart:
                endStat = 'INC'
            else:
                endStat = 'COMP'
            if strand == '1':
                t.cds5Stat = startStat
                t.cds3Stat = endStat
            else:
                t.cds3Stat = startStat
                t.cds5Stat = endStat

            #only do UTR if complete!!! Will be empty list if not complete
            if cStart != tStart:
                #update 3' or 5' depending on which strand...
                utr = t.utr5 if strand == '1' else t.utr3
                for exon in t.exonList:
                    if bioLibCG.simpleOverlap(exon[0], exon[1], cStart, cStart + 1):
                        #-1 because utr ends right before coding begins...
                        utr.append([exon[0], cStart - 1])
                        break
                    utr.append(exon)

            if cEnd != tEnd:
                #again, based on strand
                utr = t.utr3 if strand == '1' else t.utr5
                for exon in reversed(t.exonList): #don't permanently reverse the list...
                    if bioLibCG.simpleOverlap(exon[0], exon[1], cEnd, cEnd + 1):
                        utr.append([cEnd + 1, exon[1]])
                        break
                    utr.append(exon)

            allTranscripts.append(t)

    #put transcripts into genes
    genes = {}
    for t in allTranscripts:
        genes.setdefault(t.parent, []).append(t)

    allGenes = [gene(gID, genes[gID]) for gID in genes]

    #return a gene set
    return geneSet(allGenes)
def createGeneSetEditing(fN):
    """Read transcript info into a gene set from our editing file...

    fN is the path of a tab-separated file whose first line is a header.
    Columns read from each following line (0-based index):
        0 transcript id, 1 chromosome, 2 strand ("+" is stored as "1",
        anything else as "-1"), 3/4 transcript start/end, 5/6 coding
        start/end, 8/9 exon starts/ends as comma-terminated lists,
        10 gene name, 13 value stored as tType, 15 value stored as gType.
    Every end coordinate is reduced by one (0 based).

    Returns geneSet(...) built from one gene(...) per distinct gene name.
    """
    # collect all transcripts
    allTranscripts = []
    with open(fN, "r") as inFile:  # closed even if a line fails to parse
        inFile.readline()  # header

        for line in inFile:
            # parse info
            ls = line.strip().split("\t")
            strand = "1" if ls[2] == "+" else "-1"
            chrom = ls[1]
            tStart = int(ls[3])
            tEnd = int(ls[4]) - 1  # 0 based
            cStart = int(ls[5])
            cEnd = int(ls[6]) - 1  # 0 based
            exonStarts = [int(x) for x in ls[8][:-1].split(",")]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(",")]  # 0 based

            # make transcript
            t = transcript()
            t.id = ls[0]
            t.parent = ls[10]
            t.strand = strand
            t.chromosome = chrom
            t.tcc = "%s:%s:%s:%s" % (chrom, strand, tStart, tEnd)
            t.tType = ls[13]
            t.gType = ls[15]

            # Exons (indexed so mismatched start/end counts still raise)
            for i, eStart in enumerate(exonStarts):
                t.exonList.append([eStart, exonEnds[i]])

            # Introns: the gap between one exon's end and the next exon's start
            for prevEnd, eStart in zip(exonEnds, exonStarts[1:]):
                t.intronList.append([prevEnd + 1, eStart - 1])

            # Set UTR stats. INC when the coding boundary sits on a transcript boundary
            if cStart == tStart or cStart == tEnd:
                startStat = "INC"
            else:
                startStat = "COMP"
            if cEnd == tEnd or cEnd == tStart:
                endStat = "INC"
            else:
                endStat = "COMP"
            if strand == "1":
                t.cds5Stat = startStat
                t.cds3Stat = endStat
            else:
                t.cds3Stat = startStat
                t.cds5Stat = endStat

            # only do UTR if complete!!! Will be empty list if not complete
            if cStart != tStart:
                # update 3' or 5' depending on which strand...
                utr = t.utr5 if strand == "1" else t.utr3
                for exon in t.exonList:
                    if bioLibCG.simpleOverlap(exon[0], exon[1], cStart, cStart + 1):
                        # -1 because utr ends right before coding begins...
                        utr.append([exon[0], cStart - 1])
                        break
                    utr.append(exon)

            if cEnd != tEnd:
                # again, based on strand
                utr = t.utr3 if strand == "1" else t.utr5
                for exon in reversed(t.exonList):  # don't permanently reverse the list...
                    if bioLibCG.simpleOverlap(exon[0], exon[1], cEnd, cEnd + 1):
                        utr.append([cEnd + 1, exon[1]])
                        break
                    utr.append(exon)

            allTranscripts.append(t)

    # put transcripts into genes
    genes = {}
    for t in allTranscripts:
        genes.setdefault(t.parent, []).append(t)

    allGenes = [gene(gID, genes[gID]) for gID in genes]

    # return a gene set
    return geneSet(allGenes)
def multiLoopCheck(xMerSeq, hairpinSeq, structure):
    '''Check the loops of a hairpin structure around the first xMerSeq match.

    xMerSeq is upper-cased and searched for (as a regular expression) in
    hairpinSeq; structure is the bracket notation string scanned for loops,
    where a loop is an opening paren, 1-100 dots and a closing paren.

    Returns True when the structure has at most one loop. Otherwise returns
    False when the 8 structure symbols at the match, or the 8 symbols at the
    position found by counting parens back from the end, contain both '('
    and ')'. Otherwise counts the loops lying inside or overlapping the
    span between those two positions (widened to 60 when it is not already
    longer than 60) and returns True only when fewer than 2 are found.

    Raises AttributeError when xMerSeq is not found in hairpinSeq, because
    re.search then returns None.
    '''
    fixedseq = xMerSeq.upper()
    symbols = str(structure)

    #find position of conserved Kmer (0-based start of the FIRST match)
    matchObject = re.search(fixedseq, hairpinSeq)
    startingpt = matchObject.start(0)

    #get the (start, end) position of each loop
    loops = [(match.start(), match.end())
             for match in re.finditer('[(][.]{1,100}[)]', symbols)]

    #a single loop (or none) always passes
    if len(loops) <= 1:
        return True

    #count number of unclosed parens before the starting point...
    prefix = symbols[:startingpt]
    countmatch = prefix.count('(') - prefix.count(')')

    #finding position of 8mer on the - strand (count backwards)
    revpos = len(symbols) - 1
    revmatch = 0
    while revmatch < countmatch:
        char = symbols[revpos]
        if char == ')':
            revmatch += 1
        if char == '(':
            revmatch -= 1
        revpos -= 1

    #checking if the 8mer contains loop
    forward8mer = symbols[startingpt:startingpt + 8]
    if '(' in forward8mer and ')' in forward8mer:
        return False

    #checking if there is a loop directly across from 8mer
    reverse8mer = symbols[revpos - 7:revpos + 1]
    if '(' in reverse8mer and ')' in reverse8mer:
        return False

    #if 8mer is on (-) strand, switch the coordinates
    if startingpt > revpos:
        startingpt, revpos = revpos, startingpt

    if (revpos - startingpt) <= 60:
        #redo range to be 60; floor division keeps the positions integral
        #on Python 3 exactly as "/" did on Python 2
        spread = (60 - (revpos - startingpt)) // 2
        startingpt -= spread
        revpos += spread

    loopspast8mer = 0
    for loopstart, loopend in loops:
        if ((loopstart > startingpt) and (loopend < revpos)) or cg.simpleOverlap(startingpt, revpos, loopstart, loopend):
            loopspast8mer += 1

    return loopspast8mer < 2