예제 #1
0
def mapStartRangeCheckFunction(val, line):
    """Report whether the region in a tab-separated line overlaps ``val``.

    The line's region starts at the integer in column 3 (0-based) and its
    length is the number of characters in column 4.  ``val`` is handed to
    ``cg.tccSplit`` and must yield four items, the last two being the
    start and end of the query region.
    NOTE(review): ``val`` presumably looks like "chrom:strand:start:end"
    and column 4 is presumably a sequence -- confirm against callers.

    Returns 0 when ``cg.simpleOverlap`` reports an overlap, -1 otherwise.
    """
    columns = line.strip().split('\t')
    hitStart = int(columns[3])
    hitEnd = hitStart + len(columns[4])

    # Chromosome and strand are returned by tccSplit but not used here.
    _, _, regionStart, regionEnd = cg.tccSplit(val)
    overlaps = cg.simpleOverlap(int(regionStart), int(regionEnd),
                                hitStart, hitEnd)
    return 0 if overlaps else -1
예제 #2
0
def mapStartRangeCheckFunction(val, line):
    """Return 0 if the region described by ``line`` overlaps ``val``, else -1.

    ``line`` is a tab-separated record: column 3 (0-based) is parsed as the
    integer start of the region and the length of column 4 gives its extent.
    ``val`` is split with ``cg.tccSplit`` into four items, of which the last
    two are converted to integers and used as the query start and end.
    NOTE(review): ``val`` presumably has the form "chrom:strand:start:end"
    and column 4 is presumably a read/sequence string -- confirm.

    Raises:
        IndexError: if the line has fewer than five tab-separated columns.
        ValueError: if column 3 (or the start/end from ``val``) is not an
            integer.
    """
    # Split once instead of re-splitting the line for every column.
    fields = line.strip().split('\t')
    lineStart = int(fields[3])
    lineEnd = lineStart + len(fields[4])

    # Chromosome and strand are part of the split result but are not needed.
    _chrom, _strand, start, end = cg.tccSplit(val)

    if cg.simpleOverlap(int(start), int(end), lineStart, lineEnd):
        return 0
    return -1
예제 #3
0
def subtractTwoRanges(rangeKeep, rangeOther):
    """Return the pieces of ``rangeKeep`` that remain after removing ``rangeOther``.

    Both arguments are lists of ``(left, right)`` coordinate pairs.  A list
    holding a single empty element (e.g. ``[[]]``) means "no ranges".

    Every endpoint is tagged in ``coord_type`` with one of these codes:

        1 -- left end of a keep range lying outside the other ranges
        5 -- right end of a keep range lying outside the other ranges
        2 -- coordinate acting as the left boundary of an other range
             (a kept piece ending here stops one position earlier)
        3 -- coordinate acting as the right boundary of an other range
             (a kept piece starting here begins one position later)
        4 -- keep endpoint inside an other range but not on its boundary

    The sorted coordinates are then scanned pairwise and a piece is emitted
    for the transitions (1, 2), (3, 5), (1, 5) and (3, 2).

    NOTE(review): the original author was unsure whether this works when the
    ranges in ``rangeOther`` overlap each other and suggested taking their
    union first.  ``bioLibCG.simpleOverlap`` is assumed to test whether two
    inclusive intervals share a position -- confirm.

    Returns:
        A list of ``(left, right)`` tuples ordered by coordinate.  When
        ``rangeOther`` is ``[[]]`` the ``rangeKeep`` object itself is
        returned unchanged.
    """
    # Take care of empty ranges.
    if len(rangeKeep) == 1 and not rangeKeep[0]:
        return list()

    if len(rangeOther) == 1 and not rangeOther[0]:
        return rangeKeep

    coord_type = {}

    def annotateKeepEnd(coord, oLeft, oRight, outsideCode):
        # Tag one keep endpoint against one other range.  Boundary codes
        # (2/3) and the outside code never overwrite an equal or higher
        # code; the "inside" code (4) always wins.
        if bioLibCG.simpleOverlap(coord, coord, oLeft, oRight):
            if coord == oLeft:
                if coord_type.get(coord, 0) < 2:
                    coord_type[coord] = 2
            elif coord == oRight:
                if coord_type.get(coord, 0) < 2:
                    coord_type[coord] = 3
            else:
                coord_type[coord] = 4
        elif coord_type.get(coord, 0) < 1:
            coord_type[coord] = outsideCode

    # Annotate the keep endpoints.
    for left, right in rangeKeep:
        if not rangeOther:
            # Nothing to subtract: the whole range is kept.
            coord_type[left] = 1
            coord_type[right] = 5

        for oLeft, oRight in rangeOther:
            annotateKeepEnd(left, oLeft, oRight, 1)
            annotateKeepEnd(right, oLeft, oRight, 5)

    # Annotate the other endpoints that fall inside a keep range.  Endpoints
    # coinciding with a keep endpoint were already tagged above.
    for left, right in rangeOther:
        for kLeft, kRight in rangeKeep:
            if not (left == kLeft or left == kRight):
                if bioLibCG.simpleOverlap(left, left, kLeft, kRight):
                    coord_type[left] = 2

            if not (right == kLeft or right == kRight):
                if bioLibCG.simpleOverlap(right, right, kLeft, kRight):
                    coord_type[right] = 3

    # Walk neighbouring coordinates and emit the kept pieces.
    keptTransitions = ((1, 2), (3, 5), (1, 5), (3, 2))
    returnList = []
    sortedKeys = sorted(coord_type)
    for prevKey, key in zip(sortedKeys, sortedKeys[1:]):
        a = coord_type[prevKey]
        b = coord_type[key]
        if (a, b) not in keptTransitions:
            continue

        # Only a == 3 and b == 2 can require a shift here: step off the
        # boundary coordinate that belongs to the subtracted range.
        left = prevKey + 1 if a == 3 else prevKey
        right = key - 1 if b == 2 else key
        returnList.append((left, right))

    return returnList
예제 #4
0
def subtractTwoRanges(rangeKeep, rangeOther):
    """Return the pieces of ``rangeKeep`` that remain after removing ``rangeOther``.

    Both arguments are lists of ``(left, right)`` coordinate pairs.  A list
    holding a single empty element (e.g. ``[[]]``) means "no ranges".

    Every endpoint is tagged in ``coord_type`` with one of these codes:

        1 -- left end of a keep range lying outside the other ranges
        5 -- right end of a keep range lying outside the other ranges
        2 -- coordinate acting as the left boundary of an other range
             (a kept piece ending here stops one position earlier)
        3 -- coordinate acting as the right boundary of an other range
             (a kept piece starting here begins one position later)
        4 -- keep endpoint inside an other range but not on its boundary

    The sorted coordinates are then scanned pairwise and a piece is emitted
    for the transitions (1, 2), (3, 5), (1, 5) and (3, 2).

    NOTE(review): the original author was unsure whether this works when the
    ranges in ``rangeOther`` overlap each other and suggested taking their
    union first.  ``bioLibCG.simpleOverlap`` is assumed to test whether two
    inclusive intervals share a position -- confirm.

    Returns:
        A list of ``(left, right)`` tuples ordered by coordinate.  When
        ``rangeOther`` is ``[[]]`` the ``rangeKeep`` object itself is
        returned unchanged.
    """
    # Take care of empty ranges.
    if len(rangeKeep) == 1 and not rangeKeep[0]:
        return list()

    if len(rangeOther) == 1 and not rangeOther[0]:
        return rangeKeep

    coord_type = {}

    def annotateKeepEnd(coord, oLeft, oRight, outsideCode):
        # Tag one keep endpoint against one other range.  Boundary codes
        # (2/3) and the outside code never overwrite an equal or higher
        # code; the "inside" code (4) always wins.
        if bioLibCG.simpleOverlap(coord, coord, oLeft, oRight):
            if coord == oLeft:
                if coord_type.get(coord, 0) < 2:
                    coord_type[coord] = 2
            elif coord == oRight:
                if coord_type.get(coord, 0) < 2:
                    coord_type[coord] = 3
            else:
                coord_type[coord] = 4
        elif coord_type.get(coord, 0) < 1:
            coord_type[coord] = outsideCode

    # Annotate the keep endpoints.
    for left, right in rangeKeep:
        if not rangeOther:
            # Nothing to subtract: the whole range is kept.
            coord_type[left] = 1
            coord_type[right] = 5

        for oLeft, oRight in rangeOther:
            annotateKeepEnd(left, oLeft, oRight, 1)
            annotateKeepEnd(right, oLeft, oRight, 5)

    # Annotate the other endpoints that fall inside a keep range.  Endpoints
    # coinciding with a keep endpoint were already tagged above.
    for left, right in rangeOther:
        for kLeft, kRight in rangeKeep:
            if not (left == kLeft or left == kRight):
                if bioLibCG.simpleOverlap(left, left, kLeft, kRight):
                    coord_type[left] = 2

            if not (right == kLeft or right == kRight):
                if bioLibCG.simpleOverlap(right, right, kLeft, kRight):
                    coord_type[right] = 3

    # Walk neighbouring coordinates and emit the kept pieces.
    keptTransitions = ((1, 2), (3, 5), (1, 5), (3, 2))
    returnList = []
    sortedKeys = sorted(coord_type)
    for prevKey, key in zip(sortedKeys, sortedKeys[1:]):
        a = coord_type[prevKey]
        b = coord_type[key]
        if (a, b) not in keptTransitions:
            continue

        # Only a == 3 and b == 2 can require a shift here: step off the
        # boundary coordinate that belongs to the subtracted range.
        left = prevKey + 1 if a == 3 else prevKey
        right = key - 1 if b == 2 else key
        returnList.append((left, right))

    return returnList
예제 #5
0
def multiLoopCheck(xMerSeq, hairpinSeq, structure):
    """Check that the k-mer's neighbourhood in a hairpin has fewer than two loops.

    Args:
        xMerSeq: the k-mer; it is upper-cased and searched for (as a regular
            expression) in ``hairpinSeq``.  Only the first match is used.
            NOTE(review): the original carried a "check U to T" remark, but
            no U/T conversion is performed.
        hairpinSeq: nucleotide sequence of the hairpin.
        structure: structure string made of '(', ')' and '.', converted with
            ``str()``; a "loop" is a '(' followed by 1-100 '.' and a ')'.
            NOTE(review): presumably dot-bracket notation aligned
            position-by-position with ``hairpinSeq`` -- confirm.

    Returns:
        True if the structure has at most one loop.  Otherwise False if the
        8-character window at the k-mer, or the 8-character window located
        opposite it, contains both '(' and ')'.  Otherwise True only when
        fewer than two loops lie inside or overlap the span between the two
        positions (widened symmetrically to 60 when it is shorter).
        NOTE(review): the window size 8 and the span 60 are fixed and do not
        follow ``len(xMerSeq)``.

    Raises:
        AttributeError: if ``xMerSeq`` does not occur in ``hairpinSeq``
            (``re.search`` returns None).
    """
    fixedseq = xMerSeq.upper()
    symbols = str(structure)

    # Position (0-based) of the first occurrence of the k-mer.
    matchObject = re.search(fixedseq, hairpinSeq)
    startingpt = matchObject.start(0)

    # (start, end) of every loop in the structure.
    loops = [(match.start(), match.end())
             for match in re.finditer('[(][.]{1,100}[)]', symbols)]
    if len(loops) <= 1:
        return True

    # Net number of unclosed '(' before the k-mer.
    countmatch = 0
    for a in range(startingpt):
        char = symbols[a]
        if char == '(':
            countmatch += 1
        elif char == ')':
            countmatch -= 1

    # Walk back from the end until the same net number of ')' has been seen;
    # this locates the position across from the k-mer.
    revpos = len(symbols) - 1
    revmatch = 0
    while revmatch < countmatch:
        char = symbols[revpos]
        if char == ')':
            revmatch += 1
        elif char == '(':
            revmatch -= 1
        revpos -= 1

    # A window holding both bracket types contains a loop.
    forward8mer = symbols[startingpt:startingpt + 8]
    if '(' in forward8mer and ')' in forward8mer:
        return False

    reverse8mer = symbols[revpos - 7:revpos + 1]
    if '(' in reverse8mer and ')' in reverse8mer:
        return False

    # Make startingpt the smaller coordinate.
    if startingpt > revpos:
        startingpt, revpos = revpos, startingpt

    # Widen a short span to 60.  Floor division keeps the coordinates
    # integral on both Python 2 and Python 3.
    if (revpos - startingpt) <= 60:
        spread = (60 - (revpos - startingpt)) // 2
        startingpt -= spread
        revpos += spread

    loopspast8mer = 0
    for loopstart, loopend in loops:
        if ((loopstart > startingpt and loopend < revpos)
                or cg.simpleOverlap(startingpt, revpos, loopstart, loopend)):
            loopspast8mer += 1

    return loopspast8mer < 2
예제 #6
0
def createGeneSetEditing(fN):
    """Read transcript info into a gene set from our editing file.

    The file is tab-separated with one header line (skipped).  Columns used
    (0-based): 0 transcript id, 1 chromosome, 2 strand ('+' maps to '1',
    anything else to '-1'), 3 transcript start, 4 transcript end, 5 coding
    start, 6 coding end, 8 exon starts, 9 exon ends, 10 gene name,
    13 transcript type, 15 gene type.  The exon columns are comma-separated
    lists whose last character is dropped before splitting.  All end
    coordinates are decremented by one ("0 based" in the original notes).
    NOTE(review): the layout resembles a genePred-style table with a
    trailing comma on the exon lists -- confirm against the real input.

    Args:
        fN: path of the file to read.

    Returns:
        ``geneSet`` built from one ``gene`` per distinct gene name, each
        holding the transcripts that name it as parent.
    """
    # Collect all transcripts.  The context manager closes the file even if
    # a malformed line raises.
    allTranscripts = []
    with open(fN, 'r') as inFile:
        inFile.readline()  # header

        for line in inFile:
            # Parse info.
            ls = line.strip().split('\t')
            tID = ls[0]
            chrom = ls[1]
            strand = '1' if ls[2] == '+' else '-1'
            tStart = int(ls[3])
            tEnd = int(ls[4]) - 1  # 0 based
            cStart = int(ls[5])
            cEnd = int(ls[6]) - 1  # 0 based
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]  # 0 based
            geneName = ls[10]
            tType = ls[13]
            gType = ls[15]

            # Make transcript.
            t = transcript()
            t.id = tID
            t.parent = geneName
            t.strand = strand
            t.chromosome = chrom
            t.tcc = '%s:%s:%s:%s' % (chrom, strand, tStart, tEnd)
            t.tType = tType
            t.gType = gType

            # Exons, and the introns between consecutive exons.
            for i, eStart in enumerate(exonStarts):
                t.exonList.append([eStart, exonEnds[i]])
                if i > 0:
                    t.intronList.append([exonEnds[i - 1] + 1, eStart - 1])

            # Set UTR stats: a coding boundary sitting on a transcript
            # boundary is flagged 'INC', otherwise 'COMP'.  Which of the
            # 5'/3' ends it describes depends on the strand.
            startStat = 'INC' if (cStart == tStart or cStart == tEnd) else 'COMP'
            endStat = 'INC' if (cEnd == tEnd or cEnd == tStart) else 'COMP'
            if strand == '1':
                t.cds5Stat = startStat
                t.cds3Stat = endStat
            else:
                t.cds3Stat = startStat
                t.cds5Stat = endStat

            # Only build a UTR when the coding boundary differs from the
            # transcript boundary; otherwise the UTR list stays as it was.
            if cStart != tStart:
                # 5' UTR on the '1' strand, 3' UTR otherwise.
                utr = t.utr5 if strand == '1' else t.utr3
                for exon in t.exonList:
                    if bioLibCG.simpleOverlap(exon[0], exon[1], cStart,
                                              cStart + 1):
                        # -1 because the UTR ends right before coding begins.
                        utr.append([exon[0], cStart - 1])
                        break
                    utr.append(exon)

            if cEnd != tEnd:
                # 3' UTR on the '1' strand, 5' UTR otherwise.
                utr = t.utr3 if strand == '1' else t.utr5
                # reversed() leaves t.exonList itself untouched.
                for exon in reversed(t.exonList):
                    if bioLibCG.simpleOverlap(exon[0], exon[1], cEnd,
                                              cEnd + 1):
                        utr.append([cEnd + 1, exon[1]])
                        break
                    utr.append(exon)

            allTranscripts.append(t)

    # Put transcripts into genes, keyed by the parent gene name.
    genes = {}
    for t in allTranscripts:
        genes.setdefault(t.parent, []).append(t)

    allGenes = [gene(gID, genes[gID]) for gID in genes]

    # Return a gene set.
    return geneSet(allGenes)
예제 #7
0
def createGeneSetEditing(fN):
    """Read transcript info into a gene set from our editing file.

    The file is tab-separated with one header line (skipped).  Columns used
    (0-based): 0 transcript id, 1 chromosome, 2 strand ("+" maps to "1",
    anything else to "-1"), 3 transcript start, 4 transcript end, 5 coding
    start, 6 coding end, 8 exon starts, 9 exon ends, 10 gene name,
    13 transcript type, 15 gene type.  The exon columns are comma-separated
    lists whose last character is dropped before splitting.  All end
    coordinates are decremented by one ("0 based" in the original notes).
    NOTE(review): the layout resembles a genePred-style table with a
    trailing comma on the exon lists -- confirm against the real input.

    Args:
        fN: path of the file to read.

    Returns:
        ``geneSet`` built from one ``gene`` per distinct gene name, each
        holding the transcripts that name it as parent.
    """
    # Collect all transcripts.  The context manager closes the file even if
    # a malformed line raises.
    allTranscripts = []
    with open(fN, "r") as inFile:
        inFile.readline()  # header

        for line in inFile:
            # Parse info.
            ls = line.strip().split("\t")
            tID = ls[0]
            chrom = ls[1]
            strand = "1" if ls[2] == "+" else "-1"
            tStart = int(ls[3])
            tEnd = int(ls[4]) - 1  # 0 based
            cStart = int(ls[5])
            cEnd = int(ls[6]) - 1  # 0 based
            exonStarts = [int(x) for x in ls[8][:-1].split(",")]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(",")]  # 0 based
            geneName = ls[10]
            tType = ls[13]
            gType = ls[15]

            # Make transcript.
            t = transcript()
            t.id = tID
            t.parent = geneName
            t.strand = strand
            t.chromosome = chrom
            t.tcc = "%s:%s:%s:%s" % (chrom, strand, tStart, tEnd)
            t.tType = tType
            t.gType = gType

            # Exons, and the introns between consecutive exons.
            for i, eStart in enumerate(exonStarts):
                t.exonList.append([eStart, exonEnds[i]])
                if i > 0:
                    t.intronList.append([exonEnds[i - 1] + 1, eStart - 1])

            # Set UTR stats: a coding boundary sitting on a transcript
            # boundary is flagged "INC", otherwise "COMP".  Which of the
            # 5'/3' ends it describes depends on the strand.
            startStat = "INC" if (cStart == tStart or cStart == tEnd) else "COMP"
            endStat = "INC" if (cEnd == tEnd or cEnd == tStart) else "COMP"
            if strand == "1":
                t.cds5Stat = startStat
                t.cds3Stat = endStat
            else:
                t.cds3Stat = startStat
                t.cds5Stat = endStat

            # Only build a UTR when the coding boundary differs from the
            # transcript boundary; otherwise the UTR list stays as it was.
            if cStart != tStart:
                # 5' UTR on the "1" strand, 3' UTR otherwise.
                utr = t.utr5 if strand == "1" else t.utr3
                for exon in t.exonList:
                    if bioLibCG.simpleOverlap(exon[0], exon[1], cStart, cStart + 1):
                        # -1 because the UTR ends right before coding begins.
                        utr.append([exon[0], cStart - 1])
                        break
                    utr.append(exon)

            if cEnd != tEnd:
                # 3' UTR on the "1" strand, 5' UTR otherwise.
                utr = t.utr3 if strand == "1" else t.utr5
                # reversed() leaves t.exonList itself untouched.
                for exon in reversed(t.exonList):
                    if bioLibCG.simpleOverlap(exon[0], exon[1], cEnd, cEnd + 1):
                        utr.append([cEnd + 1, exon[1]])
                        break
                    utr.append(exon)

            allTranscripts.append(t)

    # Put transcripts into genes, keyed by the parent gene name.
    genes = {}
    for t in allTranscripts:
        genes.setdefault(t.parent, []).append(t)

    allGenes = [gene(gID, genes[gID]) for gID in genes]

    # Return a gene set.
    return geneSet(allGenes)
예제 #8
0
def multiLoopCheck(xMerSeq, hairpinSeq, structure):
    """Check that the k-mer's neighbourhood in a hairpin has fewer than two loops.

    Args:
        xMerSeq: the k-mer; it is upper-cased and searched for (as a regular
            expression) in ``hairpinSeq``.  Only the first match is used.
            NOTE(review): the original carried a "check U to T" remark, but
            no U/T conversion is performed.
        hairpinSeq: nucleotide sequence of the hairpin.
        structure: structure string made of '(', ')' and '.', converted with
            ``str()``; a "loop" is a '(' followed by 1-100 '.' and a ')'.
            NOTE(review): presumably dot-bracket notation aligned
            position-by-position with ``hairpinSeq`` -- confirm.

    Returns:
        True if the structure has at most one loop.  Otherwise False if the
        8-character window at the k-mer, or the 8-character window located
        opposite it, contains both '(' and ')'.  Otherwise True only when
        fewer than two loops lie inside or overlap the span between the two
        positions (widened symmetrically to 60 when it is shorter).
        NOTE(review): the window size 8 and the span 60 are fixed and do not
        follow ``len(xMerSeq)``.

    Raises:
        AttributeError: if ``xMerSeq`` does not occur in ``hairpinSeq``
            (``re.search`` returns None).
    """
    fixedseq = xMerSeq.upper()
    symbols = str(structure)

    # Position (0-based) of the first occurrence of the k-mer.
    matchObject = re.search(fixedseq, hairpinSeq)
    startingpt = matchObject.start(0)

    # (start, end) of every loop in the structure.
    loops = [(match.start(), match.end())
             for match in re.finditer('[(][.]{1,100}[)]', symbols)]
    if len(loops) <= 1:
        return True

    # Net number of unclosed '(' before the k-mer.
    countmatch = 0
    for a in range(startingpt):
        char = symbols[a]
        if char == '(':
            countmatch += 1
        elif char == ')':
            countmatch -= 1

    # Walk back from the end until the same net number of ')' has been seen;
    # this locates the position across from the k-mer.
    revpos = len(symbols) - 1
    revmatch = 0
    while revmatch < countmatch:
        char = symbols[revpos]
        if char == ')':
            revmatch += 1
        elif char == '(':
            revmatch -= 1
        revpos -= 1

    # A window holding both bracket types contains a loop.
    forward8mer = symbols[startingpt:startingpt + 8]
    if '(' in forward8mer and ')' in forward8mer:
        return False

    reverse8mer = symbols[revpos - 7:revpos + 1]
    if '(' in reverse8mer and ')' in reverse8mer:
        return False

    # Make startingpt the smaller coordinate.
    if startingpt > revpos:
        startingpt, revpos = revpos, startingpt

    # Widen a short span to 60.  Floor division keeps the coordinates
    # integral on both Python 2 and Python 3.
    if (revpos - startingpt) <= 60:
        spread = (60 - (revpos - startingpt)) // 2
        startingpt -= spread
        revpos += spread

    loopspast8mer = 0
    for loopstart, loopend in loops:
        if ((loopstart > startingpt and loopend < revpos)
                or cg.simpleOverlap(startingpt, revpos, loopstart, loopend)):
            loopspast8mer += 1

    return loopspast8mer < 2