def process_bp(inFileName,outFileName,coordH,regionL):
    """Write remapped loci of read pairs that overlap any region in ``regionL``.

    Each item yielded by ``mygsnap.gsnapFile(inFileName, True)`` is indexed as
    a two-element pair.  The locus of the first segment of each end is
    shifted through ``coordH`` and, when either shifted locus overlaps a
    region, a regex-like line built from both read sequences is printed and
    both loci are written to ``outFileName`` as tab-separated
    ``chrom/start/end`` lines under a UCSC browser/track header.

    Args:
        inFileName: gsnap result file; also used verbatim as the track name.
        outFileName: path of the track file to (over)write.
        coordH: mapping keyed by the original ``loc.chrom``; element 0 of the
            value becomes the new chromosome name and element 1 minus one is
            added to ``chrSta`` and ``chrEnd`` (presumably a 1-based start of
            the sub-sequence the reads were mapped to -- confirm).
        regionL: objects accepted by ``locus.overlap``.

    Raises:
        Exception: if a pair is not ``nLoci == 1`` on both ends with
            ``pairRel == 'unpaired'``.
    """
    result = mygsnap.gsnapFile(inFileName, True)

    # The context manager guarantees the track file is flushed and closed,
    # including when the sanity check below raises.
    with open(outFileName, 'w') as outFile:

        outFile.write('browser full knownGene\n')
        outFile.write('track name="%s" visibility=2\n' % inFileName)

        for rL in result:

            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1 and rL[0].pairRel == 'unpaired'):
                raise Exception(
                    "expected nLoci == 1 for both ends and pairRel == 'unpaired'")

            locL = [mygenome.locus(rL[i].matchL()[0].segL[0][2]) for i in (0, 1)]

            for loc in locL:
                # Look the entry up once, before loc.chrom is overwritten.
                entry = coordH[loc.chrom]
                loc.chrSta += entry[1] - 1
                loc.chrEnd += entry[1] - 1
                loc.chrom = entry[0]

            # Stop scanning at the first overlapping (locus, region) pair.
            if not any(loc.overlap(region) > 0 for loc in locL for region in regionL):
                continue

            # print() appends the newline the original embedded in the string.
            print('^%s.*%s$' % (rL[0].seq(), mybasic.rc(rL[1].seq())))

            for loc in locL:
                outFile.write('%s\t%s\t%s\n' % (loc.chrom, loc.chrSta, loc.chrEnd))
Esempio n. 2
0
def process_bed(inFileName, outFileName, coordH):
    """Write the remapped loci of every read pair as a browser track.

    Each item yielded by ``mygsnap.gsnapFile(inFileName, True)`` is indexed as
    a two-element pair.  The locus of the first segment of each end is
    shifted through ``coordH`` and written as a tab-separated
    ``chrom/start/end`` line.

    Args:
        inFileName: gsnap result file.
        outFileName: path of the track file to (over)write.
        coordH: mapping keyed by the original ``loc.chrom``; element 0 of the
            value becomes the new chromosome name and element 1 minus one is
            added to ``chrSta`` and ``chrEnd``.

    Raises:
        Exception: if a pair is not ``nLoci == 1`` on both ends with
            ``pairRel == 'unpaired'``.
    """
    result = mygsnap.gsnapFile(inFileName, True)

    # Closing the file deterministically instead of relying on garbage
    # collection; the unused counters of the original were dropped.
    with open(outFileName, 'w') as outFile:

        outFile.write('browser full knownGene\n')
        outFile.write('track name="targeted" visibility=2\n')

        for rL in result:

            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1
                    and rL[0].pairRel == 'unpaired'):
                raise Exception(
                    "expected nLoci == 1 for both ends and pairRel == 'unpaired'")

            for i in (0, 1):
                loc = mygenome.locus(rL[i].matchL()[0].segL[0][2])

                # Read the mapping entry before loc.chrom is replaced.
                entry = coordH[loc.chrom]
                loc.chrSta += entry[1] - 1
                loc.chrEnd += entry[1] - 1
                loc.chrom = entry[0]

                outFile.write('%s\t%s\t%s\n' % (loc.chrom, loc.chrSta, loc.chrEnd))
Esempio n. 3
0
def filter_transloc(inFileName, outFileName):
    """Copy well-aligned ``(transloc)`` records to ``outFileName``.

    A record is kept when ``'(transloc)'`` occurs in its ``pairRel`` and every
    segment of its first match has at most 2 non-matching positions
    (``span - numMatch``), ``percMatch`` of at least 90 and ``span`` of at
    least 5.  Finally prints ``Results: <kept> <total>``.

    Args:
        inFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inFileName, False)``.
        outFileName: path the raw text of the kept records is written to.
    """
    result = mygsnap.gsnapFile(inFileName, False)

    count_all = 0
    count_transloc = 0

    # ``with`` closes (and flushes) the output even if a record fails to parse.
    with open(outFileName, 'w') as outFile:

        for r in result:

            count_all += 1

            if '(transloc)' not in r.pairRel:
                continue

            segObjL = r.matchL()[0].getSegInfo()

            # Reject the record as soon as one segment is too poor or too short.
            if any(segObj.span - segObj.numMatch > 2
                   or segObj.percMatch < 90
                   or segObj.span < 5
                   for segObj in segObjL):
                continue

            outFile.write(r.rawText() + '\n')
            count_transloc += 1

    print('Results: %s %s' % (count_transloc, count_all))
Esempio n. 4
0
def filter_transloc(inFileName,outFileName):
    """Keep only ``(transloc)`` records whose segments all align cleanly.

    Records without ``'(transloc)'`` in ``pairRel`` are dropped.  Of the rest,
    a record is dropped when any segment of its first match has
    ``span - numMatch > 2``, ``percMatch < 90`` or ``span < 5``.  The raw
    text of surviving records is written to ``outFileName`` and
    ``Results: <kept> <total>`` is printed at the end.

    Args:
        inFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inFileName, False)``.
        outFileName: destination for the kept records.
    """

    def is_poor(segObj):
        # Same three rejection criteria as the original inline condition.
        return (segObj.span - segObj.numMatch > 2
                or segObj.percMatch < 90
                or segObj.span < 5)

    result = mygsnap.gsnapFile(inFileName, False)

    count_all = 0
    count_transloc = 0

    # The original never closed this handle; ``with`` does so on every path.
    with open(outFileName, 'w') as outFile:

        for r in result:

            count_all += 1

            if '(transloc)' not in r.pairRel:
                continue

            if any(is_poor(segObj) for segObj in r.matchL()[0].getSegInfo()):
                continue

            outFile.write(r.rawText() + '\n')
            count_transloc += 1

    print('Results: %s %s' % (count_transloc, count_all))
Esempio n. 5
0
def filter_strand(inFileName,outFileName):
    """Split read pairs by a first/last-character test on their chromosome names.

    For each pair the text before ``':'`` of the first segment's locus string
    is taken for both ends.  A pair is written to ``outFileName`` when exactly
    one of the two names has identical first and last characters and the
    names differ once those two characters are stripped; otherwise both
    records are printed to stdout.  Finally prints ``<kept> <total>``.

    NOTE(review): the first/last characters look like orientation markers of
    a custom reference naming scheme -- confirm against how the reference was
    built.

    Args:
        inFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inFileName, True)``.
        outFileName: destination for the raw text of accepted pairs.

    Raises:
        Exception: if a pair is not ``nLoci == 1`` on both ends with
            ``pairRel == 'unpaired'``.
    """
    result = mygsnap.gsnapFile(inFileName, True)

    count_all = 0
    count_strand = 0

    # ``with`` replaces the never-closed handle of the original.
    with open(outFileName, 'w') as outFile:

        for rL in result:

            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1 and rL[0].pairRel == 'unpaired'):
                raise Exception(
                    "expected nLoci == 1 for both ends and pairRel == 'unpaired'")

            chrom0 = rL[0].matchL()[0].segL[0][2].split(':')[0]
            chrom1 = rL[1].matchL()[0].segL[0][2].split(':')[0]

            sameEnds0 = chrom0[0] == chrom0[-1]
            sameEnds1 = chrom1[0] == chrom1[-1]

            # "Exactly one of the two" is an XOR, i.e. the flags differ.
            if sameEnds0 != sameEnds1 and chrom0[1:-1] != chrom1[1:-1]:

                for i in (0, 1):
                    outFile.write(rL[i].rawText() + '\n')

                count_strand += 1

            else:

                for i in (0, 1):
                    print(rL[i].rawText())

            count_all += 1

    print('%s %s' % (count_strand, count_all))
Esempio n. 6
0
def process_bp(inFileName,outFileName,regionL):
    """Write loci of concordant read pairs that overlap any region in ``regionL``.

    Each item yielded by ``mygsnap.gsnapFile(inFileName, True)`` is indexed as
    a two-element pair.  When the first-segment locus of either end overlaps
    a region, a regex-like line built from both read sequences is printed and
    both loci are written to ``outFileName`` as tab-separated
    ``chrom/start/end`` lines under a UCSC browser/track header.

    Args:
        inFileName: gsnap result file; also used verbatim as the track name.
        outFileName: path of the track file to (over)write.
        regionL: objects accepted by ``locus.overlap``.

    Raises:
        Exception: if a pair is not ``nLoci == 1`` on both ends with
            ``pairRel == 'concordant'``.
    """
    result = mygsnap.gsnapFile(inFileName, True)

    # Guarantees the track file is closed, also when the check below raises.
    with open(outFileName, 'w') as outFile:

        outFile.write('browser full knownGene\n')
        outFile.write('track name="%s" visibility=2\n' % inFileName)

        for rL in result:

            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1 and rL[0].pairRel == 'concordant'):
                raise Exception(
                    "expected nLoci == 1 for both ends and pairRel == 'concordant'")

            locL = [mygenome.locus(rL[i].matchL()[0].segL[0][2]) for i in (0, 1)]

            # One overlapping (locus, region) combination is enough.
            hit = any(loc.overlap(region) > 0 for loc in locL for region in regionL)

            if hit:

                # print() appends the newline the original embedded in the string.
                print('^%s.*%s$' % (rL[0].seq(), mybasic.rc(rL[1].seq())))

                for loc in locL:
                    outFile.write('%s\t%s\t%s\n' % (loc.chrom, loc.chrSta, loc.chrEnd))
Esempio n. 7
0
def filter_strand(inFileName, outFileName):
    """Write pairs whose two chromosome names pass a first/last-character test.

    The chromosome name of each end is the part before ``':'`` in the locus
    string of the first segment of its first match.  A pair is accepted when
    exactly one of the two names starts and ends with the same character and
    the names with those two characters removed are different.  Accepted
    pairs go to ``outFileName``; rejected pairs are printed.  The final line
    printed is ``<accepted> <total>``.

    NOTE(review): what the first and last characters of the names encode is
    not visible here -- confirm with the reference naming convention.

    Args:
        inFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inFileName, True)``.
        outFileName: destination for the raw text of accepted pairs.

    Raises:
        Exception: if a pair is not ``nLoci == 1`` on both ends with
            ``pairRel == 'unpaired'``.
    """

    def chrom_of(rec):
        return rec.matchL()[0].segL[0][2].split(':')[0]

    result = mygsnap.gsnapFile(inFileName, True)

    count_all = 0
    count_strand = 0

    # The handle was never closed before; ``with`` closes it on all paths.
    with open(outFileName, 'w') as outFile:

        for rL in result:

            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1
                    and rL[0].pairRel == 'unpaired'):
                raise Exception(
                    "expected nLoci == 1 for both ends and pairRel == 'unpaired'")

            chrom0 = chrom_of(rL[0])
            chrom1 = chrom_of(rL[1])

            # Exactly one name may have matching first and last characters.
            oneSided = (chrom0[0] == chrom0[-1]) != (chrom1[0] == chrom1[-1])

            if oneSided and chrom0[1:-1] != chrom1[1:-1]:
                for i in (0, 1):
                    outFile.write(rL[i].rawText() + '\n')
                count_strand += 1
            else:
                for i in (0, 1):
                    print(rL[i].rawText())

            count_all += 1

    print('%s %s' % (count_strand, count_all))
Esempio n. 8
0
def process_bed(inFileName,outFileName,coordH):
    """Convert every read pair into two remapped ``chrom/start/end`` lines.

    Items yielded by ``mygsnap.gsnapFile(inFileName, True)`` are indexed as
    two-element pairs.  For each end, the locus of the first segment of the
    first match is translated through ``coordH`` and written to
    ``outFileName`` below a fixed UCSC browser/track header.

    Args:
        inFileName: gsnap result file.
        outFileName: path of the track file to (over)write.
        coordH: mapping from the original ``loc.chrom`` to a sequence whose
            element 0 is the new chromosome name and whose element 1, minus
            one, is added to both ``chrSta`` and ``chrEnd``.

    Raises:
        Exception: if a pair is not ``nLoci == 1`` on both ends with
            ``pairRel == 'unpaired'``.
    """
    result = mygsnap.gsnapFile(inFileName, True)

    # ``with`` closes the file; count_all/count_strand were never used and
    # have been removed.
    with open(outFileName, 'w') as outFile:

        outFile.write('browser full knownGene\n')
        outFile.write('track name="targeted" visibility=2\n')

        for rL in result:

            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1 and rL[0].pairRel == 'unpaired'):
                raise Exception(
                    "expected nLoci == 1 for both ends and pairRel == 'unpaired'")

            locL = [mygenome.locus(rL[0].matchL()[0].segL[0][2]),
                    mygenome.locus(rL[1].matchL()[0].segL[0][2])]

            for loc in locL:
                # Fetch the entry first: loc.chrom is the lookup key and is
                # overwritten on the last line.
                newChrom = coordH[loc.chrom][0]
                shift = coordH[loc.chrom][1] - 1
                loc.chrSta += shift
                loc.chrEnd += shift
                loc.chrom = newChrom

            for loc in locL:
                outFile.write('%s\t%s\t%s\n' % (loc.chrom, loc.chrSta, loc.chrEnd))
def process_bp(inGsnapFileName, outBpFileName):
    """Group translocation reads by junction and write one line per junction.

    For every record the two segment locus strings of the first match are
    parsed into ``(chrom, position)`` tuples: the end coordinate of segment 1
    and the start coordinate of segment 2.  Reads on the ``-`` strand are
    reverse-complemented and their junction tuple order is swapped.  Reads are
    collected per junction through ``mybasic.addHash`` (presumably appending
    to a list under the key -- confirm) and written as
    ``chrom:pos-blockSize-pos,chrom:pos-pos+blockSize,offset:seq|...``.

    ``blockSize`` is not defined in this function; it is expected to exist at
    module level.

    Args:
        inGsnapFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inGsnapFileName, False)``.
        outBpFileName: path of the breakpoint file to (over)write.

    Raises:
        Exception: if a record lacks ``'(transloc)'`` in ``pairRel``, does not
            have exactly two segments, or its segments are on different
            strands.
    """
    result = mygsnap.gsnapFile(inGsnapFileName, False)

    # Opened before parsing (as in the original) but now reliably closed.
    with open(outBpFileName, 'w') as outBpFile:

        seqH = {}

        for r in result:

            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception("record is not a '(transloc)' alignment")

            if len(match.segL) != 2:
                raise Exception('expected exactly two segments, got %s' % len(match.segL))

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]

            if s1[0] != s2[0]:
                raise Exception('segments are on different strands: %s / %s' % (s1, s2))

            strand = s1[0]

            s1T = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            if strand == '+':
                seq = r.seq()
                offset = int(match.segL[0][1].split('..')[1])
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(), 'DNA')
                offset = len(seq) - int(match.segL[0][1].split('..')[1])
                junction = (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        # Only the first junction end is used for the output coordinates.
        for ((j1, _j2), vL) in seqH.items():

            # Stable ascending sort by offset; same order as the cmp form.
            vL.sort(key=lambda v: v[0])

            vL_mod = ['%s:%s' % (blockSize - offset + 1, seq) for (offset, seq) in vL]

            chrom = j1[0].split('_')[0]

            outBpFile.write('%s:%s-%s,%s:%s-%s,%s\n' %
                            (chrom, int(j1[1]) - blockSize, j1[1],
                             chrom, j1[1], int(j1[1]) + blockSize,
                             '|'.join(vL_mod)))
Esempio n. 10
0
def process_bp(inGsnapFileName):
    """Print translocation reads grouped by junction, aligned on the breakpoint.

    Every record must carry ``'(transloc)'`` in ``pairRel`` and have exactly
    two segments in its first match, otherwise ``Exception`` is raised.  The
    junction key is built from the end coordinate of segment 1 and the start
    coordinate of segment 2; for a ``dir:`` value other than ``sense`` the
    read is reverse-complemented and the key order swapped.  Junctions are
    printed by decreasing read count, and within a junction reads are printed
    by decreasing offset, left-padded so the split points line up.
    """
    readsByJunction = {}

    for rec in mygsnap.gsnapFile(inGsnapFileName, False):

        firstMatch = rec.matchL()[0]

        if '(transloc)' not in rec.pairRel:
            raise Exception

        if len(firstMatch.segL) != 2:
            raise Exception

        locStr1 = firstMatch.segL[0][2]
        locStr2 = firstMatch.segL[1][2]

        direction = re.search('dir:([^,\t]*)', firstMatch.segL[0][3]).group(1)

        end1 = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', locStr1).groups()
        end2 = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', locStr2).groups()

        if direction == 'sense':
            readSeq = rec.seq()
            splitAt = int(firstMatch.segL[0][1].split('..')[1])
            junctionKey = (end1, end2)
        else:
            readSeq = mybasic.rc(rec.seq(), 'DNA')
            splitAt = len(readSeq) - int(firstMatch.segL[0][1].split('..')[1])
            junctionKey = (end2, end1)

        mybasic.addHash(readsByJunction, junctionKey, (splitAt, readSeq))

    # Most-supported junctions first; ties keep their dictionary order.
    ranked = sorted(readsByJunction.items(),
                    key=lambda item: len(item[1]), reverse=True)

    for ((left, right), readL) in ranked:

        # Largest offset first; it defines the padding of all other reads.
        readL.sort(key=lambda entry: entry[0], reverse=True)

        widest = readL[0][0]

        print('\n%s %s %s \n' % (left, right, len(readL)))

        for (splitAt, readSeq) in readL:
            padding = ' ' * (widest - splitAt)
            print('%s%s %s' % (padding, readSeq[:splitAt], readSeq[splitAt:]))
Esempio n. 11
0
def main(inGsnapFileName, outReportFileName, sampN, geneNL=[], overlap=10):
    """Count reads spanning annotated positions and write a per-position report.

    ``loadAnnot(geneNL)`` supplies three per-chromosome structures: counters
    (``eiH``), iterable positions (``ei_keyH``) and label lists
    (``juncInfoH``).  For every uniquely mapped record (``nLoci == 1``), each
    segment of the first match increments the counter of every position that
    lies at least ``overlap`` bases inside the segment.

    Args:
        inGsnapFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inGsnapFileName, False)``.
        outReportFileName: path of the tab-separated report to (over)write.
        sampN: sample name written in the first column.
        geneNL: passed through to ``loadAnnot``; never modified here.
        overlap: minimum number of bases required on each side of a position.
    """
    eiH, ei_keyH, juncInfoH = loadAnnot(geneNL)

    print('Finished loading refFlat')

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    for r in result:

        if r.nLoci != 1:
            continue

        match = r.matchL()[0]

        for seg in match.segL:

            rm = re.match('([+-])([^:]+):([0-9,]+)..([0-9,]+)', seg[2])

            chrom = rm.group(2)
            # The pattern accepts thousands separators, which int() rejects;
            # strip them before converting.
            chrPosL = [int(rm.group(3).replace(',', '')),
                       int(rm.group(4).replace(',', ''))]
            chrSta = min(chrPosL) - 1
            chrEnd = max(chrPosL)

            for pos in ei_keyH[chrom]:

                if chrSta + overlap <= pos <= chrEnd - overlap:
                    eiH[chrom][pos] += 1
                elif chrEnd - overlap < pos:
                    # Early exit; relies on ei_keyH[chrom] being in ascending
                    # order (presumably guaranteed by loadAnnot -- confirm).
                    break

    # ``with`` closes the report, which the original left open.
    with open(outReportFileName, 'w') as outReportFile:

        for chrom in ei_keyH:

            for e in ei_keyH[chrom]:

                # NOTE(review): counters are incremented with ``+= 1`` above,
                # so a comparison with [] looks like it can never be true --
                # confirm against loadAnnot before changing it.
                if eiH[chrom][e] == []:
                    continue

                outReportFile.write(
                    '%s\t%s\t%s\t%s\n' %
                    (sampN, '%s:%s' % (chrom, e),
                     ','.join(juncInfoH[chrom][e]), eiH[chrom][e]))
Esempio n. 12
0
def process_bp(inGsnapFileName,outBpFileName):
    """Group translocation reads by junction and write one line per junction.

    The two segment locus strings of each record's first match are parsed
    into ``(chrom, position)`` tuples (end of segment 1, start of segment 2).
    Reads on the ``-`` strand are reverse-complemented and their junction
    tuple order is swapped.  Reads are collected per junction through
    ``mybasic.addHash`` (presumably appending to a list under the key --
    confirm) and written as ``chrom:pos,chrom:pos,offset:seq|offset:seq...``
    with the reads ordered by decreasing offset.

    The original body contained unfinished placeholder statements
    (``re.match()`` without arguments and assignments without a right-hand
    side) that made the whole file unparseable.  Their results were never
    used by the output line, so they have been removed.

    Args:
        inGsnapFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inGsnapFileName, False)``.
        outBpFileName: path of the breakpoint file to (over)write.

    Raises:
        Exception: if a record lacks ``'(transloc)'`` in ``pairRel``, does not
            have exactly two segments, or its segments are on different
            strands.
    """
    result = mygsnap.gsnapFile(inGsnapFileName, False)

    # Opened before parsing (as in the original) but now reliably closed.
    with open(outBpFileName, 'w') as outBpFile:

        seqH = {}

        for r in result:

            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception("record is not a '(transloc)' alignment")

            if len(match.segL) != 2:
                raise Exception('expected exactly two segments, got %s' % len(match.segL))

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]

            if s1[0] != s2[0]:
                raise Exception('segments are on different strands: %s / %s' % (s1, s2))

            strand = s1[0]

            s1T = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            if strand == '+':
                seq = r.seq()
                offset = int(match.segL[0][1].split('..')[1])
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(), 'DNA')
                offset = len(seq) - int(match.segL[0][1].split('..')[1])
                junction = (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        for ((k1, k2), v) in seqH.items():

            # Stable descending sort by offset; same order as the cmp form.
            v.sort(key=lambda entry: entry[0], reverse=True)

            readField = '|'.join(['%s:%s' % (offset, seq) for (offset, seq) in v])

            outBpFile.write('%s,%s,%s\n' % (':'.join(k1), ':'.join(k2), readField))
Esempio n. 13
0
def process_bp(inGsnapFileName):
    """Print a breakpoint-aligned pile-up of translocation reads per junction.

    Raises ``Exception`` for any record without ``'(transloc)'`` in
    ``pairRel`` or whose first match does not have exactly two segments.
    Junction keys pair the end coordinate of segment 1 with the start
    coordinate of segment 2; unless the ``dir:`` field is ``sense`` the read
    is reverse-complemented and the pair is swapped.  Output is ordered by
    decreasing number of reads per junction, then by decreasing offset.
    """

    def junction_and_read(rec):
        """Return ``(key, (offset, sequence))`` for one gsnap record."""
        m = rec.matchL()[0]

        if '(transloc)' not in rec.pairRel:
            raise Exception

        if len(m.segL) != 2:
            raise Exception

        s1 = m.segL[0][2]
        s2 = m.segL[1][2]

        direction = re.search('dir:([^,\t]*)', m.segL[0][3]).group(1)

        bp1 = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
        bp2 = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

        if direction == 'sense':
            seq = rec.seq()
            return (bp1, bp2), (int(m.segL[0][1].split('..')[1]), seq)

        seq = mybasic.rc(rec.seq(), 'DNA')
        return (bp2, bp1), (len(seq) - int(m.segL[0][1].split('..')[1]), seq)

    seqH = {}

    for r in mygsnap.gsnapFile(inGsnapFileName, False):
        bp12, value = junction_and_read(r)
        mybasic.addHash(seqH, bp12, value)

    # Junctions with the most reads come first.
    seqL = sorted(seqH.items(), key=lambda kv: len(kv[1]), reverse=True)

    for (bpPair, vL) in seqL:

        # Reads with the longest left part first.
        vL.sort(key=lambda v: v[0], reverse=True)

        maxOffset = vL[0][0]

        print('\n%s %s %s \n' % (bpPair[0], bpPair[1], len(vL)))

        for (offset, seq) in vL:
            indent = ' ' * (maxOffset - offset)
            print('%s%s %s' % (indent, seq[:offset], seq[offset:]))
Esempio n. 14
0
def process_bp(inGsnapFileName,outBpFileName):
    """Write one breakpoint line per junction with the reads that support it.

    Each record's first match must have exactly two segments on the same
    strand and ``'(transloc)'`` in ``pairRel``.  The junction is the pair
    (end of segment 1, start of segment 2), each as ``(chrom, position)``;
    for ``-`` strand reads the sequence is reverse-complemented and the pair
    is swapped.  Reads are collected per junction through ``mybasic.addHash``
    (presumably appending to a list under the key -- confirm).  Output lines
    are ``chrom:pos-blockSize-pos,chrom:pos-pos+blockSize,offset:seq|...``
    with reads in ascending offset order.

    ``blockSize`` is not defined in this function; it is expected to exist at
    module level.

    Args:
        inGsnapFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inGsnapFileName, False)``.
        outBpFileName: path of the breakpoint file to (over)write.

    Raises:
        Exception: on a record that violates one of the conditions above.
    """
    result = mygsnap.gsnapFile(inGsnapFileName, False)

    # Same open-before-parse order as the original, but the handle is now
    # closed on every exit path.
    with open(outBpFileName, 'w') as outBpFile:

        seqH = {}

        for r in result:

            match = r.matchL()[0]

            if '(transloc)' not in r.pairRel:
                raise Exception("record is not a '(transloc)' alignment")

            if len(match.segL) != 2:
                raise Exception('expected exactly two segments, got %s' % len(match.segL))

            s1 = match.segL[0][2]
            s2 = match.segL[1][2]

            if s1[0] != s2[0]:
                raise Exception('segments are on different strands: %s / %s' % (s1, s2))

            s1T = re.match('[+-]([^:]+):[0-9]+..([0-9]+)', s1).groups()
            s2T = re.match('[+-]([^:]+):([0-9]+)..[0-9]+', s2).groups()

            if s1[0] == '+':
                seq = r.seq()
                offset = int(match.segL[0][1].split('..')[1])
                junction = (s1T, s2T)
            else:
                seq = mybasic.rc(r.seq(), 'DNA')
                offset = len(seq) - int(match.segL[0][1].split('..')[1])
                junction = (s2T, s1T)

            mybasic.addHash(seqH, junction, (offset, seq))

        # The second junction end is part of the key but not of the output.
        for ((j1, _j2), vL) in seqH.items():

            # Stable ascending sort by offset; same order as the cmp form.
            vL.sort(key=lambda v: v[0])

            vL_mod = []

            for (offset, seq) in vL:
                vL_mod.append('%s:%s' % (blockSize - offset + 1, seq))

            chrom = j1[0].split('_')[0]

            outBpFile.write('%s:%s-%s,%s:%s-%s,%s\n' % (
                chrom, int(j1[1]) - blockSize, j1[1],
                chrom, j1[1], int(j1[1]) + blockSize,
                '|'.join(vL_mod)))
Esempio n. 15
0
def main(inGsnapFileName,outReportFileName,sampN,geneNL=[],overlap=10):
    """Count, per annotated position, the reads that span it with margin.

    ``loadAnnot(geneNL)`` returns per-chromosome counters (``eiH``), iterable
    positions (``ei_keyH``) and label lists (``juncInfoH``).  Each segment of
    the first match of every ``nLoci == 1`` record increments the counter of
    each position ``p`` with ``start + overlap <= p <= end - overlap``, where
    ``start`` is the smaller coordinate minus one and ``end`` the larger one.
    The report has one tab-separated line per position:
    sample, ``chrom:pos``, comma-joined labels, count.

    Args:
        inGsnapFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inGsnapFileName, False)``.
        outReportFileName: path of the report to (over)write.
        sampN: sample name for the first column.
        geneNL: passed through to ``loadAnnot``; not modified here.
        overlap: required margin on both sides of a position.
    """

    def to_int(text):
        # The locus pattern tolerates commas in the coordinates; int() does not.
        return int(text.replace(',', ''))

    eiH, ei_keyH, juncInfoH = loadAnnot(geneNL)

    print('Finished loading refFlat')

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    for r in result:

        if r.nLoci != 1:
            continue

        for seg in r.matchL()[0].segL:

            rm = re.match('([+-])([^:]+):([0-9,]+)..([0-9,]+)', seg[2])

            chrom = rm.group(2)
            posA = to_int(rm.group(3))
            posB = to_int(rm.group(4))
            loBound = min(posA, posB) - 1 + overlap
            hiBound = max(posA, posB) - overlap

            for pos in ei_keyH[chrom]:

                if loBound <= pos <= hiBound:
                    eiH[chrom][pos] += 1
                elif hiBound < pos:
                    # Stops at the first position beyond the window; this
                    # presumes ei_keyH[chrom] is sorted ascending -- confirm.
                    break

    # The report handle was never closed before.
    with open(outReportFileName, 'w') as outReportFile:

        for chrom in ei_keyH:

            for e in ei_keyH[chrom]:

                # NOTE(review): the counters are numeric (``+= 1`` above), so
                # this [] comparison is probably never true -- confirm with
                # loadAnnot before removing.
                if eiH[chrom][e] == []:
                    continue

                outReportFile.write('%s\t%s\t%s\t%s\n' % (
                    sampN, '%s:%s' % (chrom, e),
                    ','.join(juncInfoH[chrom][e]), eiH[chrom][e]))
Esempio n. 16
0
def main(inGsnapFileName,outReportFileName,sampN,geneNL=[],overlap=10):
    """Count reads spanning annotated positions, using bisection on sorted keys.

    ``loadAnnot(geneNL)`` returns per-chromosome counters (``eiH``), position
    lists (``ei_keyH``, searched with ``bisect`` and therefore required to be
    sorted), label lists (``juncInfoH``) and ``ei_cntH``, which is handed to
    the sibling helper ``findCut``.  For each segment of the first match of
    every ``nLoci == 1`` record, the counter of each position inside
    ``[chrSta + overlap, chrEnd - overlap]`` is incremented.

    Fix over the original: the lower scan index was
    ``bisect_right(...) - 1``, which is ``-1`` when no key lies below the
    window.  ``range(-1, n)`` then visited index ``-1`` (the last key) in
    addition to its real index, counting that key twice whenever it was
    inside the window.  The index is now clamped at 0.

    Args:
        inGsnapFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inGsnapFileName, False)``.
        outReportFileName: path of the tab-separated report to (over)write.
        sampN: sample name for the first column.
        geneNL: passed through to ``loadAnnot``; not modified here.
        overlap: required margin on both sides of a position.
    """
    eiH, ei_keyH, juncInfoH, ei_cntH = loadAnnot(geneNL)

    print('Finished loading refFlat')

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    for r in result:

        if r.nLoci != 1:
            continue

        match = r.matchL()[0]

        for seg in match.segL:

            loc = mygenome.locus(seg[2])

            winSta = loc.chrSta + overlap
            winEnd = loc.chrEnd - overlap

            # Segment too short to leave any room between the margins.
            if winSta > winEnd:
                continue

            keyL = ei_keyH[loc.chrom]

            cnt_s = findCut(ei_cntH[loc.chrom], keyL, winSta - 1)
            cnt_e = findCut(ei_cntH[loc.chrom], keyL, winEnd)

            # Same outcome as the original if/elif: nothing to do when
            # cnt_e < 1 or when both cut values are equal.
            if cnt_e < 1 or cnt_s == cnt_e:
                continue

            idxSta = max(0, bisect.bisect_right(keyL, winSta - 1) - 1)
            idxEnd = bisect.bisect_right(keyL, winEnd)

            for idx in range(idxSta, idxEnd):
                if winSta <= keyL[idx] <= winEnd:
                    eiH[loc.chrom][keyL[idx]] += 1

    # The report handle was never closed before.
    with open(outReportFileName, 'w') as outReportFile:

        for chrom in ei_keyH:

            for e in ei_keyH[chrom]:

                # NOTE(review): counters are incremented with ``+= 1`` above,
                # so this [] comparison is probably never true -- confirm
                # with loadAnnot.
                if eiH[chrom][e] == []:
                    continue

                outReportFile.write('%s\t%s\t%s\t%s\n' % (
                    sampN, '%s:%s' % (chrom, e),
                    ','.join(juncInfoH[chrom][e]), eiH[chrom][e]))
Esempio n. 17
0
def filter_annot1(inFileName, outFileName):
    """Keep translocation records whose segments share no annotated name.

    For each segment of a record's first match, the ``label_1:``/``label_2:``
    field (if present) is split on ``'|'`` and every entry is cut at
    ``'.exon'``; a segment without such a field contributes an empty set.
    The record is written to ``outFileName`` when the intersection of all
    segment sets is empty.  Finally prints ``Results: <kept> <total>``.

    Args:
        inFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inFileName, False)``.
        outFileName: destination for the raw text of kept records.

    Raises:
        Exception: if a record lacks ``'(transloc)'`` in ``pairRel``.
    """
    result = mygsnap.gsnapFile(inFileName, False)

    count_all = 0
    count_include = 0

    # ``with`` closes the output file, which the original left open.
    with open(outFileName, 'w') as outFile:

        for r in result:

            if '(transloc)' not in r.pairRel:
                raise Exception("record is not a '(transloc)' alignment")

            match = r.matchL()[0]

            geneSetL = []

            for seg in match.segL:

                rm = re.search('label_[12]:([^,\t]*)', seg[3])

                if rm:
                    geneSetL.append(
                        set(x.split('.exon')[0] for x in rm.group(1).split('|')))
                else:
                    geneSetL.append(set())

            geneSetCommon = geneSetL[0].intersection(*geneSetL[1:])

            if not geneSetCommon:
                outFile.write(r.rawText() + '\n')
                count_include += 1

            count_all += 1

    print('Results: %s %s' % (count_include, count_all))
Esempio n. 18
0
def filter_annot2(inFileName, outFileName):
    """Keep translocation records whose segments are all labelled and disjoint.

    For each segment of a record's first match, the ``label_1:``/``label_2:``
    field is split on ``'|'`` and every entry is cut at ``'.exon'``.  A record
    is written to ``outFileName`` only when every segment has such a field
    and the intersection of all segment sets is empty; every other record is
    printed to stdout.  Finally prints ``<kept> <total>``.

    Args:
        inFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inFileName, False)``.
        outFileName: destination for the raw text of kept records.

    Raises:
        Exception: if a record lacks ``'(transloc)'`` in ``pairRel``.
    """
    result = mygsnap.gsnapFile(inFileName, False)

    count_all = 0
    count_include = 0

    # ``with`` closes the output file, which the original left open.
    with open(outFileName, "w") as outFile:

        for r in result:

            if "(transloc)" not in r.pairRel:
                raise Exception("record is not a '(transloc)' alignment")

            match = r.matchL()[0]

            geneSetL = []
            isThereEmptySet = False

            for seg in match.segL:

                rm = re.search("label_[12]:([^,\t]*)", seg[3])

                if rm:
                    geneSetL.append(set(x.split(".exon")[0] for x in rm.group(1).split("|")))
                else:
                    # An unlabelled segment disqualifies the record below.
                    geneSetL.append(set())
                    isThereEmptySet = True

            geneSetCommon = geneSetL[0].intersection(*geneSetL[1:])

            if not geneSetCommon and not isThereEmptySet:
                outFile.write(r.rawText() + "\n")
                count_include += 1
            else:
                print(r.rawText())

            count_all += 1

    print('%s %s' % (count_include, count_all))
Esempio n. 19
0
def filter_crossMap(inFileName,outFileName):
    """Copy read pairs that map uniquely on both ends but are ``unpaired``.

    Items yielded by ``mygsnap.gsnapFile(inFileName, True)`` are indexed as
    two-element pairs.  A pair is written (both records, raw text) when both
    ends have ``nLoci == 1`` and the first end's ``pairRel`` equals
    ``'unpaired'``.  Finally prints ``<kept> <total>``.

    Args:
        inFileName: gsnap result file.
        outFileName: destination for the kept pairs.
    """
    result = mygsnap.gsnapFile(inFileName, True)

    count_all = 0
    count_crossMap = 0

    # ``with`` closes the output file, which the original left open.
    with open(outFileName, 'w') as outFile:

        for rL in result:

            count_all += 1

            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1 and rL[0].pairRel == 'unpaired'):
                continue

            for i in (0, 1):
                outFile.write(rL[i].rawText() + '\n')

            count_crossMap += 1

    print('%s %s' % (count_crossMap, count_all))
Esempio n. 20
0
def main(inGsnapFileName,outReportFileName,sampN,geneNL=[],overlap=10):
    """Count reads spanning annotated positions via a database lookup.

    ``loadAnnot(geneNL)`` returns per-chromosome counters (``eiH``), iterable
    positions (``ei_keyH``) and label lists (``juncInfoH``).  For each segment
    of the first match of every ``nLoci == 1`` record, ``temp_table`` is
    queried through the module-level ``cursor`` for positions inside
    ``[chrSta + overlap, chrEnd - overlap]`` and the counter of each returned
    position is incremented.

    Fix over the original: the query selected the constant ``1`` and the
    increment used an undefined name ``pos``, which raises ``NameError`` on
    the first hit.  The query now selects ``pos`` and every returned row is
    counted, matching the windowed counting the other ``main`` variants do.

    NOTE(review): the positions returned by the database must compare equal
    to the keys of ``eiH[chrom]`` (presumably ints) -- confirm the column
    type of ``temp_table.pos``.

    Args:
        inGsnapFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inGsnapFileName, False)``.
        outReportFileName: path of the tab-separated report to (over)write.
        sampN: sample name for the first column.
        geneNL: passed through to ``loadAnnot``; not modified here.
        overlap: required margin on both sides of a position.
    """
    eiH, ei_keyH, juncInfoH = loadAnnot(geneNL)

    print('Finished loading refFlat')

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    count = 0

    for r in result:

        if r.nLoci != 1:
            continue

        match = r.matchL()[0]

        for seg in match.segL:

            loc = mygenome.locus(seg[2])

            # NOTE(review): the statement is built by string formatting
            # because the driver's parameter style is not visible here;
            # switch to bound parameters once it is known.
            cursor.execute('select pos from temp_table where chrom="%s" and pos>=%s and pos<=%s' % (loc.chrom, loc.chrSta + overlap, loc.chrEnd - overlap))

            for row in cursor.fetchall():
                eiH[loc.chrom][row[0]] += 1

        count += 1

        # Progress indicator every 10000 uniquely mapped records.
        if count % 10000 == 0:
            print(count)

    # The report handle was never closed before.
    with open(outReportFileName, 'w') as outReportFile:

        for chrom in ei_keyH:

            for e in ei_keyH[chrom]:

                # NOTE(review): counters are incremented with ``+= 1`` above,
                # so this [] comparison is probably never true -- confirm
                # with loadAnnot.
                if eiH[chrom][e] == []:
                    continue

                outReportFile.write('%s\t%s\t%s\t%s\n' % (
                    sampN, '%s:%s' % (chrom, e),
                    ','.join(juncInfoH[chrom][e]), eiH[chrom][e]))
Esempio n. 21
0
def filter_crossMap(inFileName, outFileName):
    """Write uniquely mapped, ``unpaired`` read pairs to ``outFileName``.

    A pair (an item of ``mygsnap.gsnapFile(inFileName, True)``, indexed at 0
    and 1) qualifies when ``nLoci == 1`` holds for both ends and the first
    end's ``pairRel`` is ``'unpaired'``.  The raw text of both records is
    written for each qualifying pair, and ``<kept> <total>`` is printed at
    the end.

    Args:
        inFileName: gsnap result file.
        outFileName: destination for the qualifying pairs.
    """
    result = mygsnap.gsnapFile(inFileName, True)

    count_all = 0
    count_crossMap = 0

    # Context manager instead of a handle that is never closed.
    with open(outFileName, 'w') as outFile:

        for rL in result:

            uniqueBoth = rL[0].nLoci == 1 and rL[1].nLoci == 1

            if uniqueBoth and rL[0].pairRel == 'unpaired':

                outFile.write(rL[0].rawText() + '\n')
                outFile.write(rL[1].rawText() + '\n')

                count_crossMap += 1

            count_all += 1

    print('%s %s' % (count_crossMap, count_all))
def bp_filter(inFileName,outFileName,expSize):
    '''
    Keep read pairs that map uniquely, unspliced and without non-matching
    positions, with ``pairInfo()[0]`` between 1000 and 10000.

    A pair is dropped when either end has ``nLoci != 1``, more than one
    segment in its first match, ``pairInfo()[0]`` below 1000 or above 10000,
    or ``len - numMatch > 0`` for its first segment.  Kept pairs are echoed
    to stdout (raw text followed by ``len numMatch``) and written to
    ``outFileName``.  Finally prints ``<kept> <total>``.

    NOTE(review): ``expSize`` is accepted but not used; the window is the
    hard-coded 1000..10000 -- confirm whether it should be derived from
    ``expSize``.

    Args:
        inFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inFileName, True)``.
        outFileName: destination for the raw text of kept pairs.
        expSize: unused (kept for interface compatibility).
    '''
    result = mygsnap.gsnapFile(inFileName, True)

    count_all = 0
    count_include = 0

    # ``with`` closes the output file, which the original left open.
    with open(outFileName, 'w') as outFile:

        for rL in result:

            count_all += 1

            if rL[0].nLoci != 1 or rL[1].nLoci != 1:
                continue

            # First-segment info of both ends, reused for the debug output.
            segL = []

            for i in (0, 1):

                match = rL[i].matchL()[0]
                seg = match.getSegInfo()[0]

                if len(match.segL) > 1 or match.pairInfo()[0] < 1000 or match.pairInfo()[0] > 10000 or (seg.len - seg.numMatch) > 0:
                    break

                segL.append(seg)

            # Fewer than two entries means one end failed a criterion.
            if len(segL) < 2:
                continue

            for i in (0, 1):
                print(rL[i].rawText())
                print('%s %s' % (segL[i].len, segL[i].numMatch))
                outFile.write(rL[i].rawText() + '\n')

            count_include += 1

    print('%s %s' % (count_include, count_all))
Esempio n. 23
0
def filter_annot1(inFileName,outFileName):
    """Write translocation records whose segments have no annotation in common.

    Each segment of the first match yields a set of names taken from its
    ``label_1:``/``label_2:`` field (entries separated by ``'|'``, each cut
    at ``'.exon'``); a segment without the field yields an empty set.  When
    the sets of all segments have an empty intersection the record's raw
    text is written to ``outFileName``.  ``Results: <kept> <total>`` is
    printed at the end.

    Args:
        inFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inFileName, False)``.
        outFileName: destination for the kept records.

    Raises:
        Exception: if a record lacks ``'(transloc)'`` in ``pairRel``.
    """

    def names_of(seg):
        # Set of label entries (before '.exon') for one segment, or empty.
        rm = re.search('label_[12]:([^,\t]*)', seg[3])
        if not rm:
            return set()
        return set(x.split('.exon')[0] for x in rm.group(1).split('|'))

    result = mygsnap.gsnapFile(inFileName, False)

    count_all = 0
    count_include = 0

    # Context manager instead of a handle that is never closed.
    with open(outFileName, 'w') as outFile:

        for r in result:

            if '(transloc)' not in r.pairRel:
                raise Exception("record is not a '(transloc)' alignment")

            geneSetL = [names_of(seg) for seg in r.matchL()[0].segL]

            geneSetCommon = geneSetL[0]

            for s in geneSetL[1:]:
                geneSetCommon = geneSetCommon.intersection(s)

            if not geneSetCommon:
                outFile.write(r.rawText() + '\n')
                count_include += 1

            count_all += 1

    print('Results: %s %s' % (count_include, count_all))
Esempio n. 24
0
def bp_filter(inFileName, outFileName, expSize):
    '''
    Keep uniquely mapped, properly paired, unspliced, mismatch-free read
    pairs whose ``pairInfo()[0]`` differs from ``expSize``.

    A pair is dropped when either end has ``nLoci != 1``, when the first
    end's ``pairRel`` is neither ``'concordant'`` nor ``'paired'``, or when
    either end has more than one segment in its first match,
    ``pairInfo()[0] == int(expSize)`` or ``numMismatch > 0`` for its first
    segment.  Kept pairs are written to ``outFileName`` and
    ``<kept> <total>`` is printed at the end.

    NOTE(review): the previous docstring spoke of "insert_length > N-nt",
    but the code tests for equality with ``expSize`` -- confirm which is
    intended.

    Args:
        inFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inFileName, True)``.
        outFileName: destination for the raw text of kept pairs.
        expSize: value (convertible with ``int``) that ``pairInfo()[0]``
            must not equal.
    '''
    result = mygsnap.gsnapFile(inFileName, True)

    count_all = 0
    count_include = 0

    # ``with`` closes the output file, which the original left open.
    with open(outFileName, 'w') as outFile:

        for rL in result:

            count_all += 1

            if rL[0].nLoci != 1 or rL[1].nLoci != 1 or rL[0].pairRel not in (
                    'concordant', 'paired'):
                continue

            skip = False

            for i in (0, 1):

                match = rL[i].matchL()[0]

                if len(match.segL) > 1 or match.pairInfo()[0] == int(
                        expSize) or match.getSegInfo()[0].numMismatch > 0:
                    skip = True
                    break

            if skip:
                continue

            for i in (0, 1):
                outFile.write(rL[i].rawText() + '\n')

            count_include += 1

    print('%s %s' % (count_include, count_all))
Esempio n. 25
0
def bp_filter(inFileName,outFileName,expSize):
    '''
    Write read pairs that are uniquely mapped, ``concordant``/``paired``,
    unspliced and mismatch-free, and whose ``pairInfo()[0]`` is not
    ``expSize``.

    Rejection criteria, checked in this order: ``nLoci != 1`` on either end;
    first end's ``pairRel`` outside ``('concordant', 'paired')``; for either
    end more than one segment in the first match,
    ``pairInfo()[0] == int(expSize)``, or ``numMismatch > 0`` on the first
    segment.  Prints ``<kept> <total>`` at the end.

    NOTE(review): the earlier docstring described an "insert_length > N-nt"
    filter, which is not what the equality test implements -- confirm.

    Args:
        inFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inFileName, True)``.
        outFileName: destination for the raw text of kept pairs.
        expSize: value (convertible with ``int``) that ``pairInfo()[0]``
            must not equal.
    '''

    def end_fails(rec):
        # True when one end violates a per-end criterion; the checks
        # short-circuit in the same order as the original condition.
        match = rec.matchL()[0]
        return (len(match.segL) > 1
                or match.pairInfo()[0] == int(expSize)
                or match.getSegInfo()[0].numMismatch > 0)

    result = mygsnap.gsnapFile(inFileName, True)

    count_all = 0
    count_include = 0

    # Context manager instead of a handle that is never closed.
    with open(outFileName, 'w') as outFile:

        for rL in result:

            count_all += 1

            if rL[0].nLoci != 1 or rL[1].nLoci != 1 or rL[0].pairRel not in ('concordant', 'paired'):
                continue

            # any() stops at the first failing end, like the original break.
            if any(end_fails(rL[i]) for i in (0, 1)):
                continue

            for i in (0, 1):
                outFile.write(rL[i].rawText() + '\n')

            count_include += 1

    print('%s %s' % (count_include, count_all))
Esempio n. 26
0
def process_bp(inFileName,outFileName,regionL):
    """Write the gap between the two ends of concordant pairs near a region.

    Items yielded by ``mygsnap.gsnapFile(inFileName, True)`` are indexed as
    two-element pairs.  Pairs whose ``int(rL[0].pairInfo()[0])`` is below 76
    are skipped.  When the first-segment locus of either end overlaps a
    region in ``regionL``, one tab-separated line ``chrom/start/end`` is
    written.  It spans from the end of the upstream locus to the start of the
    downstream one.

    Fix over the original: the chromosome name was read from ``loc``, the
    variable leaked from the preceding ``for`` loop (always the second
    locus).  It is now taken from ``locL[1]`` explicitly, which writes the
    same value without depending on loop-variable leakage.

    NOTE(review): ``pairInfo`` is called on the record here, whereas the
    ``bp_filter`` functions call it on the match object -- confirm both
    exist.

    Args:
        inFileName: gsnap result file; also used verbatim as the track name.
        outFileName: path of the track file to (over)write.
        regionL: objects accepted by ``locus.overlap``.

    Raises:
        Exception: if a pair is not ``nLoci == 1`` on both ends with
            ``pairRel == 'concordant'``.
    """
    result = mygsnap.gsnapFile(inFileName, True)

    # ``with`` closes the track file, which the original left open.
    with open(outFileName, 'w') as outFile:

        outFile.write('browser full knownGene\n')
        outFile.write('track name="%s" visibility=2\n' % inFileName)

        for rL in result:

            if not (rL[0].nLoci == 1 and rL[1].nLoci == 1 and rL[0].pairRel == 'concordant'):
                raise Exception(
                    "expected nLoci == 1 for both ends and pairRel == 'concordant'")

            if int(rL[0].pairInfo()[0]) < 76:
                continue

            locL = [mygenome.locus(rL[i].matchL()[0].segL[0][2]) for i in (0, 1)]

            # One overlapping (locus, region) combination is enough.
            if not any(loc.overlap(region) > 0 for loc in locL for region in regionL):
                continue

            chrom = locL[1].chrom

            if locL[0].chrEnd < locL[1].chrSta:
                outFile.write('%s\t%s\t%s\n' % (chrom, locL[0].chrEnd, locL[1].chrSta))
            else:
                outFile.write('%s\t%s\t%s\n' % (chrom, locL[1].chrEnd, locL[0].chrSta))
Esempio n. 27
0
def fusion_proc_sort(inGsnapFileName,outGsnapFileName,outReportFileName,sampN):
    """Group translocation reads by junction, rank the junctions and report them.

    For each record the first match must have exactly two segments.  The
    junction key is a pair of ``(transcript_strand, chrom, position)`` tuples
    built from the end coordinate of segment 1 and the start coordinate of
    segment 2.  For ``antisense`` reads the pair and the transcript labels
    are reversed.  Junctions are ordered by decreasing number of distinct
    ``(direction, offset)`` values.  Every junction gets one tab-separated
    report line (splice type, sample, both ends, both label strings, read
    count, distinct sequences, distinct positions), and the raw text of its
    reads is written to ``outGsnapFileName`` in the same order.

    Args:
        inGsnapFileName: gsnap result file, read via
            ``mygsnap.gsnapFile(inGsnapFileName, False)``.
        outGsnapFileName: destination for the re-ordered raw records.
        outReportFileName: destination for the junction report.
        sampN: sample name written in the second report column.

    Raises:
        Exception: if a record lacks ``'(transloc)'`` in ``pairRel``, does not
            have exactly two segments, or has a ``dir:`` value other than
            ``sense``/``antisense``.
    """

    def transcript_strand(genomeStrand, direction):
        # Same strand for '+'/sense and '-'/antisense, opposite otherwise.
        if (genomeStrand, direction) in (('+', 'sense'), ('-', 'antisense')):
            return '+'
        if (genomeStrand, direction) in (('+', 'antisense'), ('-', 'sense')):
            return '-'
        raise Exception('unexpected strand/direction: %s/%s' % (genomeStrand, direction))

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    juncHH = {}

    for r in result:

        match = r.matchL()[0]

        if '(transloc)' not in r.pairRel:
            raise Exception("record is not a '(transloc)' alignment")

        if len(match.segL) != 2:
            raise Exception('expected exactly two segments, got %s' % len(match.segL))

        splice_type = re.search('splice_type:([^,\t]*)', match.segL[0][3]).group(1)
        direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)
        # Raw string: same pattern, without relying on unknown-escape passthrough.
        offset = int(re.search(r'\.\.([0-9]*)', match.segL[0][1]).group(1))

        transcriptL = []

        for seg in match.segL:
            rm = re.search('label_[12]:([^,\t]*)', seg[3])
            transcriptL.append(rm.group(1).replace('|', ',') if rm else '')

        bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)', match.segL[0][2])
        bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+', match.segL[1][2])

        end1 = (transcript_strand(bp1.group(1), direction),) + bp1.groups()[1:]
        end2 = (transcript_strand(bp2.group(1), direction),) + bp2.groups()[1:]

        # transcript_strand() has already rejected any other direction.
        if direction == 'sense':
            key = (end1, end2)
        else:
            key = (end2, end1)
            transcriptL = transcriptL[::-1]

        if key in juncHH:
            # splice_type and transcript stay those of the first read seen.
            juncHH[key]['match'].append(r)
            juncHH[key]['seq'].append(r.seq())
            juncHH[key]['pos'].append((direction, offset))
        else:
            juncHH[key] = {'match': [r], 'splice_type': splice_type, 'seq': [r.seq()],
                           'pos': [(direction, offset)], 'transcript': transcriptL}

    # Most distinct read positions first; stable, same order as the cmp form.
    juncKH = sorted(juncHH.items(), key=lambda kv: len(set(kv[1]['pos'])), reverse=True)

    # Nested ``with`` blocks close both outputs, which the original left open.
    with open(outGsnapFileName, 'w') as outGsnapFile:
        with open(outReportFileName, 'w') as outReportFile:

            for (key, juncH) in juncKH:

                outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    juncH['splice_type'], sampN,
                    key[0][0] + ':'.join(key[0][1:]),
                    key[1][0] + ':'.join(key[1][1:]),
                    juncH['transcript'][0], juncH['transcript'][1],
                    len(juncH['match']), len(set(juncH['seq'])), len(set(juncH['pos']))))

                for m in juncH['match']:
                    outGsnapFile.write(m.rawText() + '\n')
Esempio n. 28
0
def gsnap_process_junction(inGsnapFileName,outGsnapFileName,outReportFileName,sampN):
	"""Group translocation reads by junction and write a ranked report.

	Every record read from ``mygsnap.gsnapFile(inGsnapFileName, False)`` must
	contain ``'(transloc)'`` in ``pairRel`` and have exactly two segments in
	its first match, otherwise ``Exception`` is raised.  Reads are grouped
	under a key made of two ``(chrom, position)`` tuples; junctions are
	sorted by decreasing number of distinct ``(direction, offset)`` values.
	One tab-separated report line per junction goes to ``outReportFileName``
	and the raw text of the grouped reads goes to ``outGsnapFileName``.

	NOTE(review): neither output file is explicitly closed.
	"""

	result = mygsnap.gsnapFile(inGsnapFileName,False)

	# junction key -> dict with keys 'match', 'splice_type', 'seq', 'reg', 'transcript'
	juncHH = {}

	for r in result:

		match = r.matchL()[0]

		if not '(transloc)' in r.pairRel:
			raise Exception

		if len(match.segL) != 2:
			raise Exception

		# Fields parsed from the first segment's annotation text and read-range text.
		splice_type = re.search('splice_type:([^,\t]*)', match.segL[0][3]).group(1)
		direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)
		offset = int(re.search('\.\.([0-9]*)', match.segL[0][1]).group(1))

		# transcript1/2 first hold the regex match, then a tuple of the
		# label entries cut at '.exon' (empty tuple when no label field).
		transcript1 = re.search('label_[12]:([^,\t]*)', match.segL[0][3])

		if transcript1:

			transcript1 = tuple([x.split('.exon')[0] for x in transcript1.group(1).split('|')])

		else:

			transcript1 = ()

		transcript2 = re.search('label_[12]:([^,\t]*)', match.segL[1][3])

		if transcript2:

			transcript2 = tuple([x.split('.exon')[0] for x in transcript2.group(1).split('|')])

		else:

			transcript2 = ()


		s1 = match.segL[0][2]
		s2 = match.segL[1][2]

		# bp1 captures strand, chrom and END coordinate of segment 1;
		# bp2 captures strand, chrom and START coordinate of segment 2.
		bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)',s1)
		bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+',s2)
			

		
		# NOTE(review): trans_strand1/trans_strand2 are computed but not used
		# below (the key omits the strand); these branches still raise for an
		# unexpected direction value.
		if (bp1.group(1),direction) in (('+','sense'),('-','antisense')):
			trans_strand1 = '+'
		elif (bp1.group(1),direction) in (('+','antisense'),('-','sense')):
			trans_strand1 = '-'
		else:
			raise Exception

		if (bp2.group(1),direction) in (('+','sense'),('-','antisense')):
			trans_strand2 = '+'
		elif (bp2.group(1),direction) in (('+','antisense'),('-','sense')):
			trans_strand2 = '-'
		else:
			raise Exception


		# Antisense reads swap both the junction ends and the transcript tuples.
		if direction=='sense':
			key = (bp1.groups()[1:],bp2.groups()[1:])
			transcript = (transcript1,transcript2)

		elif direction=='antisense':
			key = (bp2.groups()[1:],bp1.groups()[1:])
			transcript = (transcript2,transcript1)

		else:
			raise Exception

		if key in juncHH:

			# 'splice_type' and 'transcript' keep the values of the first read seen.
			juncHH[key]['match'].append(r)
			juncHH[key]['seq'].append(r.seq())
			juncHH[key]['reg'].append((direction,offset))

		else:

			juncHH[key] = {'match':[r], 'splice_type':splice_type, 'seq':[r.seq()], 'reg':[(direction,offset)], 'transcript':transcript}

	# Descending by number of distinct (direction, offset) pairs per junction.
	juncKH = juncHH.items()
	juncKH.sort(lambda x,y: cmp(len(set(y[1]['reg'])),len(set(x[1]['reg']))))

	outGsnapFile = open(outGsnapFileName,'w')
	outReportFile = open(outReportFileName,'w')

	
	for (key, juncH) in juncKH:
		
		# Columns: splice type, sample, end 1, end 2, transcripts 1, transcripts 2,
		# number of reads, distinct sequences, distinct (direction, offset) pairs.
		outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \
			(juncH['splice_type'], sampN,':'.join(key[0]), ':'.join(key[1]),\
			';'.join(juncH['transcript'][0]), ';'.join(juncH['transcript'][1]),\
			len(juncH['match']) ,len(set(juncH['seq'])), len(set(juncH['reg']))))

		for m in juncH['match']:
			outGsnapFile.write(m.rawText()+'\n')
Esempio n. 29
0
def make_samse(ifileN, ofileN):
    '''
    Print SAM-style records for the reads of a single-end gsnap result file.

    The header lines returned by make_header() are printed first, followed
    by one tab-separated 12-column line per read (one line per segment for
    translocation reads): qname, flag, rname, pos, mapq, cigar, rnext,
    pnext, tlen, seq, qual and the optional tags.

    NOTE(review): neither ifileN nor ofileN is used by the body below; the
    input path is hard-coded and everything is written to stdout with
    print.  Confirm whether that is intentional before relying on the
    arguments.
    '''

    headerL = make_header('/data1/Sequence/ucsc_hg19/hg19.chrom.sizes')
    #	ofile = open(ofileN, 'w')
    for header in headerL:
        print header
#		ofile.write('%s\n' % header)

    # Hard-coded test input; the second argument is False here and reads are
    # iterated one at a time (presumably the single-end mode -- TODO confirm).
    result = mygsnap.gsnapFile(
        '/pipeline/test_ini_gsnap2sam/S022_single.gsnap', False)
    #result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/test.gsnap',False)
    #result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/S022_pair.gsnap',True)

    ## for unpaired
    for r in result:

        # Per-read defaults for the 11 mandatory columns plus the tag column.
        qname = r.rid()
        flag = 0x0
        rname = '*'
        pos = 0
        mapq = 0
        cigar = ''
        rnext = '*'  ## assume --npath=1 (maximum 1 alignment per read)
        pnext = 0  ## assume --npath=1 (maximum 1 alignment per read)
        tlen = 0  ## assume --npath=1 (maximum 1 alignment per read)
        seq = r.seq()
        qual = r.qual()
        extra = 'NH:i:1\tHI:i:1'  ## assume --npath=1 (maximum 1 alignment per read)

        if r.nLoci > 1:
            # Reads with more than one locus are emitted with flag bit 0x4
            # set, '*' as CIGAR and no tags.
            flag = flag | 0x4
            cigar = '*'
            new_cigar = '*'
            extra = ''
            print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' %
                  (qname, flag, rname, pos, mapq, new_cigar, rnext, pnext,
                   tlen, seq, qual, extra))
        else:
            # NOTE(review): reads with nLoci == 0 also reach this branch and
            # index matchL()[0] -- confirm matchL() is never empty for them.
            if r.pairRel == '(transloc)':
                # Translocation read: every segment becomes its own record.
                match = r.matchL()[0]
                segL = match.getSegInfo()
                mapq = segL[0].mapq
                seq = r.seq()
                qual = r.qual()
                for seg in segL:
                    flag = 0x0
                    # Segment coordinates look like '<strand><chrom>:<pos1>..<pos2>'.
                    (strand, rname, pos1,
                     pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)',
                                       seg.seg[2]).groups()
                    pos = min(int(pos1), int(pos2))
                    (cigar, clip) = seg.toCIGAR_trans()
                    # The sign of clip selects which part of the read this
                    # segment covers.
                    if clip < 0:  ## first half
                        seq2 = seq[:clip]
                        qual2 = qual[:clip]
                    else:  ## second half
                        seq2 = seq[clip:]
                        qual2 = qual[clip:]
                    if strand == '-':
                        # Minus strand: set flag bit 0x10 and emit the
                        # reverse-complemented sequence with reversed qualities.
                        flag = flag | 0x10
                        seq2 = mybasic.rc(seq2)
                        qual2 = mybasic.rev(qual2)
#					print qname,seg.toCIGAR_trans()
                    print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' %
                          (qname, flag, rname, pos, mapq, cigar, rnext, pnext,
                           tlen, seq2, qual2, extra))
            else:
                match = r.matchL()[
                    0]  ## assume --npath=1 (maximum 1 alignment per read)

                segL = match.getSegInfo()
                (strand, rname, pos1,
                 pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)',
                                   segL[0].seg[2]).groups()
                pos = min(int(pos1), int(pos2))
                mapq = segL[0].mapq
                seg_nm = segL[0].numSub
                # cigar2 is the CIGAR that is actually printed for this read.
                # The 'cigar' and 'new_cigar' strings assembled below are only
                # referenced by the commented-out print at the end.
                cigar2 = match.toCIGAR()
                #				print qname, match.toCIGAR()

                # Leading soft clip of the first segment.
                if segL[0].start != '' and segL[0].start != '0':
                    cigar = str(segL[0].start) + 'S'

                # For minus-strand reads operations are prepended, for
                # plus-strand reads they are appended.
                if strand == '-':
                    cigar = str(segL[0].numMatch +
                                segL[0].numSub) + 'M' + cigar
                    if segL[0].ins != '' and segL[0].ins != '0':
                        cigar = str(segL[0].ins) + 'I' + cigar
                else:
                    cigar = cigar + str(segL[0].numMatch +
                                        segL[0].numSub) + 'M'
                    if segL[0].ins != '' and segL[0].ins != '0':
                        cigar = cigar + str(segL[0].ins) + 'I'

                # toCIGAR(True) is used for the last segment of the read.
                if len(segL) == 1:
                    new_cigar = segL[0].toCIGAR(True)
                else:
                    new_cigar = segL[0].toCIGAR()
                prev_cigar = new_cigar
                index = 0
                for seg in segL[1:]:
                    index = index + 1
                    if index == (len(segL) - 1):
                        final = True
                    else:
                        final = False
                    rm = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)',
                                   seg.seg[2]).groups()

                    # 'match' is rebound from the alignment object to this
                    # segment's 'M' operation string from here on.
                    match = str(seg.numMatch + seg.numSub) + 'M'

                    if seg.ins != '' and seg.ins != '0':
                        ins = str(seg.ins) + 'I'
                    else:
                        ins = ''

                    # Keep the smallest coordinate seen so far as pos.
                    if pos == 0 or pos > min(int(rm[2]), int(rm[3])):
                        pos = min(int(rm[2]), int(rm[3]))
                    # dist is the gap between the previous segment's end
                    # (pos2) and this segment's start; a positive gap becomes
                    # an 'N' operation.
                    if strand == '-':
                        dist = int(pos2) - int(rm[2]) - 1
                        if dist > 0:
                            cigar = match + ins + str(dist) + 'N' + cigar
                        else:
                            cigar = match + ins + cigar
                    else:
                        dist = int(rm[2]) - int(pos2) - 1
                        if dist > 0:
                            cigar = cigar + str(dist) + 'N' + match + ins
                        else:
                            cigar = cigar + match + ins
                    seg_nm = seg_nm + seg.numSub
                    # Remember this segment's coordinates for the next gap.
                    pos1 = rm[2]
                    pos2 = rm[3]
                    cur_cigar = seg.toCIGAR(final)
                    # The 'N' gap is only inserted when the previous segment's
                    # CIGAR holds neither a deletion nor an insertion.
                    if strand == '-':
                        if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
                            new_cigar = cur_cigar + str(dist) + 'N' + new_cigar
                        else:
                            new_cigar = cur_cigar + new_cigar
                    else:
                        if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
                            new_cigar = new_cigar + str(dist) + 'N' + cur_cigar
                        else:
                            new_cigar = new_cigar + cur_cigar
                    prev_cigar = cur_cigar

                # Trailing soft clip of the last segment.
                if segL[-1].end != '' and segL[-1].end != '0':  ## last segment
                    if strand == '-':
                        cigar = str(segL[-1].end) + 'S' + cigar
                    else:
                        cigar = cigar + str(segL[-1].end) + 'S'

                # NM tag: substitutions summed over all segments.
                extra = extra + ('\tNM:i:%s' % seg_nm)

                if strand == '-':
                    flag = flag | 0x10
                    seq = mybasic.rc(seq)
                    qual = mybasic.rev(qual)

##			print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual, extra))
                print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' %
                      (qname, flag, rname, pos, mapq, cigar2, rnext, pnext,
                       tlen, seq, qual, extra))
Esempio n. 30
0
def exonSkip_filter(inFileName,outFileName):
	'''
	filters-in exon-skipping candidates in splice-mapped gsnap
	''' 

	result = mygsnap.gsnapFile(inFileName, False)
	outFile = open(outFileName, 'w')

	count_all = 0
	count_include = 0

	for r in result:

		if r.nLoci != 1:
			continue
		
		match = r.matchL()[0]

		if len(match.segL) != 2:
			continue

		segObjL = match.getSegInfo()

		jncH = {}

		skip = False

		for segObj in segObjL:

			if segObj.span - segObj.numMatch > 2 or segObj.percMatch < 90 or segObj.span < 5:
				skip = True
				break

			if segObj.label == '':
				break

			for b in segObj.label.split('|'):

				rm2 = re.match('(.*)\.exon([0-9]+)\/[0-9]+',b)

				transId = rm2.group(1)
				exonNum = int(rm2.group(2))

				mybasic.addHash(jncH,transId,exonNum)

		if skip:
			continue

		jncL = jncH.items()
		
		if len(jncL)>0 and max([len(j[1]) for j in jncL])>1:

			minDist = 100

			for i in range(len(jncL)):

				if len(jncL[i][1]) == 2 and abs(jncL[i][1][0]-jncL[i][1][1]) < minDist:
					minDist = abs(jncL[i][1][0]-jncL[i][1][1])

			if minDist > 1:

				outFile.write(r.rawText()+'\n')
				count_include += 1

		count_all += 1

	print 'Results:',count_include, count_all
Esempio n. 31
0
def exonSkip_proc(inGsnapFileName, outGsnapFileName, outReportFileName, sampN):
    '''
    Group two-segment '(transloc)' reads by junction and write an annotated report.

    Every read of inGsnapFileName must carry '(transloc)' in pairRel and
    have exactly two segments in its first match.  Reads are grouped by the
    pair of breakpoints (chromosome, position) taken from the end of the
    first segment and the start of the second; for antisense reads the two
    sides are swapped.  Junctions are then sorted by decreasing number of
    distinct (direction, offset) pairs.

    inGsnapFileName   -- gsnap result file, read through mygsnap.gsnapFile
    outGsnapFileName  -- receives the raw text of all reads, junction by junction
    outReportFileName -- receives one 18-column tab-separated line per junction:
                         intra/inter, splice type, sampN, the two breakpoints,
                         transcripts, genes, gene info, census info and
                         breakpoint-overlapping genes of both sides, number of
                         reads, of distinct sequences and of distinct
                         (direction, offset) pairs
    sampN             -- sample name copied into the report

    Raises Exception for reads that violate the assumptions above or whose
    direction is neither 'sense' nor 'antisense'.
    '''

    geneNameH = mygenome.geneNameH()
    geneSetH = mygenome.geneSetH()
    geneInfoH = mygenome.geneInfoH(geneNameH, geneSetH)
    refFlatH = mygenome.loadRefFlatByChr()

    flipH = {'+': '-', '-': '+'}

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    juncHH = {}

    for r in result:

        match = r.matchL()[0]

        if '(transloc)' not in r.pairRel:
            raise Exception('read is not a translocation: pairRel=%s' % (r.pairRel,))

        if len(match.segL) != 2:
            raise Exception('expected 2 segments, found %s' % len(match.segL))

        splice_type = re.search('splice_type:([^,\t]*)',
                                match.segL[0][3]).group(1)
        direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)
        offset = int(re.search(r'\.\.([0-9]*)', match.segL[0][1]).group(1))

        # Transcript ids and gene names labelled on each of the two segments.
        transcriptL = []
        geneL = []

        for seg in match.segL:

            rm = re.search('label_[12]:([^,\t]*)', seg[3])
            geneS = set()

            if rm:

                # 'NM_x.exon3/10|NM_y.exon2/7' -> ('NM_x', 'NM_y')
                transT = tuple(
                    [x.split('.exon')[0] for x in rm.group(1).split('|')])

                for t in transT:

                    g = mygenome.gene(t, geneNameH, geneSetH, geneInfoH)

                    if g.geneName:
                        geneS.add(g.geneName)

            else:

                transT = ()

            transcriptL.append(transT)
            geneL.append(tuple(geneS))

        # Breakpoints: end of the first segment, start of the second.
        bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)', match.segL[0][2])
        bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+', match.segL[1][2])

        # The transcript strand equals the read strand for sense reads and
        # is flipped for antisense reads.
        if direction == 'sense':
            strandL = [bp1.group(1), bp2.group(1)]
        elif direction == 'antisense':
            strandL = [flipH[bp1.group(1)], flipH[bp2.group(1)]]
        else:
            raise Exception('unexpected direction: %s' % (direction,))

        # Genes overlapping the base pair ending at each breakpoint.
        bpGeneL = []

        for (bp, strand) in zip((bp1, bp2), strandL):
            bpLocus = mygenome.locus(
                '%s:%s-%s%s' % (bp.group(2), int(bp.group(3)) - 1,
                                bp.group(3), strand))
            bpGeneL.append(bpLocus.overlappingGeneL(
                refFlatH=refFlatH, strand_sensitive=True))

        key = (bp1.groups()[1:], bp2.groups()[1:])
        transcript = tuple(transcriptL)
        gene = tuple(geneL)
        bp_gene = tuple(bpGeneL)

        if direction == 'antisense':
            # Report antisense reads in transcript order: swap both sides.
            key = key[::-1]
            transcript = transcript[::-1]
            gene = gene[::-1]
            bp_gene = bp_gene[::-1]

        if key in juncHH:

            juncHH[key]['match'].append(r)
            juncHH[key]['seq'].append(r.seq())
            juncHH[key]['reg'].append((direction, offset))

        else:

            # Annotation of the first read seen represents the junction.
            juncHH[key] = {
                'match': [r],
                'splice_type': splice_type,
                'seq': [r.seq()],
                'reg': [(direction, offset)],
                'transcript': transcript,
                'gene': gene,
                'bp_gene': bp_gene
            }

    # Most distinct (direction, offset) pairs first; the sort is stable, so
    # ties keep the dictionary's iteration order.
    juncKH = sorted(juncHH.items(),
                    key=lambda kv: len(set(kv[1]['reg'])),
                    reverse=True)

    reportFmt = '\t'.join(['%s'] * 18) + '\n'

    # Both outputs are closed even if writing a junction fails.
    with open(outGsnapFileName, 'w') as outGsnapFile:
        with open(outReportFileName, 'w') as outReportFile:

            for (key, juncH) in juncKH:

                if key[0][0] == key[1][0]:
                    juncType = 'intra'
                else:
                    juncType = 'inter'

                geneInfoLL = ([], [])
                censusInfoLL = ([], [])

                for side in (0, 1):

                    for geneName in juncH['gene'][side]:

                        geneObj = mygenome.gene(geneName, geneNameH, geneSetH,
                                                geneInfoH)

                        geneInfoLL[side].append(
                            '%s:%s:%s' % (geneName, geneObj.getAttr('desc'),
                                          geneObj.getAttr('summary')))

                        censusInfoLL[side].append(
                            '%s:%s:%s:%s' %
                            (geneObj.getAttr('census_somatic'),
                             geneObj.getAttr('census_germline'),
                             geneObj.getAttr('census_mutType'),
                             geneObj.getAttr('census_translocPartners')))

                outReportFile.write(reportFmt % (
                    juncType, juncH['splice_type'], sampN,
                    ':'.join(key[0]), ':'.join(key[1]),
                    ';'.join(juncH['transcript'][0]),
                    ';'.join(juncH['transcript'][1]),
                    ';'.join(juncH['gene'][0]), ';'.join(juncH['gene'][1]),
                    ';'.join(geneInfoLL[0]), ';'.join(geneInfoLL[1]),
                    ';'.join(censusInfoLL[0]), ';'.join(censusInfoLL[1]),
                    ','.join(juncH['bp_gene'][0]),
                    ','.join(juncH['bp_gene'][1]),
                    len(juncH['match']), len(set(juncH['seq'])),
                    len(set(juncH['reg']))))

                for m in juncH['match']:
                    outGsnapFile.write(m.rawText() + '\n')
Esempio n. 32
0
def exonSkip_proc_sort(inGsnapFileName,outGsnapFileName,outReportFileName,sampN):
	'''
	Group uniquely mapped two-segment reads by junction and write a sorted report.

	Every read of inGsnapFileName must have nLoci == 1, exactly two segments
	in its first match and a 'label_1:'/'label_2:' annotation on both
	segments.  Reads are grouped by the two breakpoints (transcript strand,
	chromosome, position); for antisense reads the two sides and their exon
	labels are swapped.  Junctions are sorted by decreasing number of
	distinct (direction, offset) pairs.

	inGsnapFileName   -- gsnap result file, read through mygsnap.gsnapFile
	outGsnapFileName  -- receives the raw text of all reads, junction by junction
	outReportFileName -- receives one 8-column tab-separated line per junction:
	                     sampN, both breakpoints, both exon labels, number of
	                     reads, of distinct sequences and of distinct
	                     (direction, offset) pairs
	sampN             -- sample name copied into the report

	Raises Exception for reads that violate the assumptions above or whose
	direction is neither 'sense' nor 'antisense'.
	'''

	flipH = {'+':'-', '-':'+'}

	result = mygsnap.gsnapFile(inGsnapFileName,False)

	juncHH = {}

	for r in result:

		if r.nLoci != 1:
			raise Exception('expected a uniquely mapped read, nLoci=%s' % (r.nLoci,))

		match = r.matchL()[0]

		if len(match.segL) != 2:
			raise Exception('expected 2 segments, found %s' % len(match.segL))

		direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)
		offset = int(re.search(r'\.\.([0-9]*)', match.segL[0][1]).group(1))

		# Exon labels of both segments, '|' separators turned into ','.
		exonLP = []

		for seg in match.segL:

			rm = re.search('label_[12]:([^,\t]*)',seg[3])

			if not rm:
				raise Exception('segment without exon label')

			exonLP.append(rm.group(1).replace('|',','))

		# Breakpoints: end of the first segment, start of the second.
		bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)',match.segL[0][2])
		bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+',match.segL[1][2])

		# The transcript strand equals the read strand for sense reads and
		# is flipped for antisense reads, whose two sides are also swapped.
		if direction=='sense':
			key = ((bp1.group(1),)+bp1.groups()[1:],(bp2.group(1),)+bp2.groups()[1:])

		elif direction=='antisense':
			key = ((flipH[bp2.group(1)],)+bp2.groups()[1:],(flipH[bp1.group(1)],)+bp1.groups()[1:])
			exonLP = exonLP[::-1]

		else:
			raise Exception('unexpected direction: %s' % (direction,))

		if key in juncHH:

			juncHH[key]['match'].append(r)
			juncHH[key]['seq'].append(r.seq())
			juncHH[key]['reg'].append((direction,offset))

		else:

			# Exon labels of the first read seen represent the junction.
			juncHH[key] = {'match':[r], 'seq':[r.seq()], 'reg':[(direction,offset)], 'exonLP':exonLP}

	# Most distinct (direction, offset) pairs first; the sort is stable, so
	# ties keep the dictionary's iteration order.
	juncKH = sorted(juncHH.items(), key=lambda kv: len(set(kv[1]['reg'])), reverse=True)

	# Both outputs are closed even if writing a junction fails.
	with open(outGsnapFileName,'w') as outGsnapFile:
		with open(outReportFileName,'w') as outReportFile:

			for (key, juncH) in juncKH:

				outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \
					(sampN, key[0][0]+':'.join(key[0][1:]), key[1][0]+':'.join(key[1][1:]),\
					juncH['exonLP'][0], juncH['exonLP'][1],\
					len(juncH['match']) ,len(set(juncH['seq'])), len(set(juncH['reg']))))

				for m in juncH['match']:
					outGsnapFile.write(m.rawText()+'\n')
Esempio n. 33
0
#!/usr/bin/python

import sys, re, numpy
import mygsnap

# Command line: <gsnap input> <matches output> <mismatch-position output>.
# With fewer than three arguments the built-in default names are used.
if len(sys.argv) >= 4:
    inFileName = sys.argv[1]
    outFileName_matches = sys.argv[2]
    outFileName_mmPos = sys.argv[3]
else:
    inFileName = 'GH.txt'
    outFileName_matches = 'GH_matches.dst'
    outFileName_mmPos = 'GH_mmPos.txt'

# gsnapFile is called without a second argument here and the loop below
# indexes each item as a pair (rL[0], rL[1]).
result = mygsnap.gsnapFile(inFileName)
out_matches = open(outFileName_matches, 'w')
out_mmPos = open(outFileName_mmPos, 'w')

# Per pair-relation accumulators, keyed by 'unpaired' / 'concordant'.
matches_count = {'unpaired': [], 'concordant': []}
matches_score = {'unpaired': [], 'concordant': []}

totalPairs = {'unpaired': 0, 'concordant': 0}
# One slot per read of the pair (0 and 1), initially unset.
mmPos = {'unpaired': {0: None, 1: None}, 'concordant': {0: None, 1: None}}

for rL in result:

    # Keep pairs where both reads have exactly one locus and the first
    # read is not flagged as a translocation.
    if rL[0].nLoci == 1 and rL[1].nLoci == 1 and not '(transloc)' in rL[
            0].pairRel:  # unique, no-within-read-splicing

        # First match of each read of the pair.
        mL = [rL[0].matchL()[0], rL[1].matchL()[0]]
Esempio n. 34
0
def gsnap_process_junction(inGsnapFileName,outGsnapFileName,outReportFileName,sampN):
	'''
	Group two-segment '(transloc)' reads by junction and write an annotated report.

	Every read of inGsnapFileName must carry '(transloc)' in pairRel and
	have exactly two segments in its first match.  Reads are grouped by the
	pair of breakpoints (chromosome, position) taken from the end of the
	first segment and the start of the second; for antisense reads the two
	sides are swapped.  Junctions are sorted by decreasing number of
	distinct (direction, offset) pairs.

	inGsnapFileName   -- gsnap result file, read through mygsnap.gsnapFile
	outGsnapFileName  -- receives the raw text of all reads, junction by junction
	outReportFileName -- receives one tab-separated line per junction:
	                     intra/inter, splice type, sampN, the two breakpoints,
	                     transcript/exon labels of both sides, 'labelled
	                     genes;breakpoint genes' of both sides, gene info and
	                     census info of both sides, number of reads, of
	                     distinct sequences and of distinct (direction,
	                     offset) pairs
	sampN             -- sample name copied into the report

	Raises Exception for reads that violate the assumptions above or whose
	direction is neither 'sense' nor 'antisense'.
	'''

	geneNameH = mygenome.geneNameH()
	geneSetH = mygenome.geneSetH()
	geneInfoH = mygenome.geneInfoH(geneNameH,geneSetH)
	refFlatH = mygenome.loadRefFlatByChr()

	flipH = {'+':'-', '-':'+'}

	result = mygsnap.gsnapFile(inGsnapFileName,False)

	juncHH = {}

	for r in result:

		match = r.matchL()[0]

		if '(transloc)' not in r.pairRel:
			raise Exception('read is not a translocation: pairRel=%s' % (r.pairRel,))

		if len(match.segL) != 2:
			raise Exception('expected 2 segments, found %s' % len(match.segL))

		splice_type = re.search('splice_type:([^,\t]*)', match.segL[0][3]).group(1)
		direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)
		offset = int(re.search(r'\.\.([0-9]*)', match.segL[0][1]).group(1))

		# Transcript/exon labels and gene names of EACH segment.  The label
		# of the second side is read from the second segment (segL[1]); it
		# used to be read from segL[0] again, which made both sides of every
		# junction report the same transcripts and genes.
		transExonL = []
		geneSL = []

		for seg in match.segL:

			rm = re.search('label_[12]:([^,\t]*)', seg[3])
			geneS = set()

			if rm:

				transExon = rm.group(1).split('|')

				for t in transExon:

					g = mygenome.gene(t.split('.exon')[0],geneNameH,geneSetH,geneInfoH)

					if g.geneName:
						geneS.add(g.geneName)

			else:

				transExon = ()

			transExonL.append(transExon)
			geneSL.append(geneS)

		(trans_exon1, trans_exon2) = transExonL
		(gene1, gene2) = geneSL

		# Breakpoints: end of the first segment, start of the second.
		bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)',match.segL[0][2])
		bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+',match.segL[1][2])

		# The transcript strand equals the read strand for sense reads and
		# is flipped for antisense reads.
		if direction=='sense':
			trans_strand1 = bp1.group(1)
			trans_strand2 = bp2.group(1)
		elif direction=='antisense':
			trans_strand1 = flipH[bp1.group(1)]
			trans_strand2 = flipH[bp2.group(1)]
		else:
			raise Exception('unexpected direction: %s' % (direction,))

		# Genes overlapping each breakpoint that are not already named by
		# the segment's own label.
		locus1 = mygenome.locus('%s:%s-%s%s' % (bp1.group(2),int(bp1.group(3))-1,bp1.group(3),trans_strand1))
		bp_gene1 = list(set(locus1.overlappingGeneL(refFlatH=refFlatH,strand_sensitive=True)).difference(gene1))

		# NOTE(review): this window starts 2 bases before the breakpoint
		# while locus1 starts 1 base before -- kept as in the original;
		# confirm whether the asymmetry is intended.
		locus2 = mygenome.locus('%s:%s-%s%s' % (bp2.group(2),int(bp2.group(3))-2,bp2.group(3),trans_strand2))
		bp_gene2 = list(set(locus2.overlappingGeneL(refFlatH=refFlatH,strand_sensitive=True)).difference(gene2))

		if direction=='sense':
			key = (bp1.groups()[1:],bp2.groups()[1:])
			trans_exon = (trans_exon1,trans_exon2)
			gene = (list(gene1),list(gene2))
			bp_gene = (bp_gene1,bp_gene2)
		else:
			# antisense: report the sides in transcript order
			key = (bp2.groups()[1:],bp1.groups()[1:])
			trans_exon = (trans_exon2,trans_exon1)
			gene = (list(gene2),list(gene1))
			bp_gene = (bp_gene2,bp_gene1)

		if key in juncHH:

			juncHH[key]['match'].append(r)
			juncHH[key]['seq'].append(r.seq())
			juncHH[key]['reg'].append((direction,offset))

		else:

			# Annotation of the first read seen represents the junction.
			juncHH[key] = {'match':[r], 'splice_type':splice_type, 'seq':[r.seq()], 'reg':[(direction,offset)], 'trans_exon':trans_exon, 'gene':gene, 'bp_gene':bp_gene}

	# Most distinct (direction, offset) pairs first; the sort is stable, so
	# ties keep the dictionary's iteration order.
	juncKH = sorted(juncHH.items(), key=lambda kv: len(set(kv[1]['reg'])), reverse=True)

	# Both outputs are closed even if writing a junction fails.
	with open(outGsnapFileName,'w') as outGsnapFile:
		with open(outReportFileName,'w') as outReportFile:

			for (key, juncH) in juncKH:

				if key[0][0] == key[1][0]:
					juncType = 'intra'
				else:
					juncType = 'inter'

				geneInfoLL = ([], [])
				censusInfoLL = ([], [])

				for side in (0,1):

					# Labelled genes first, then the breakpoint-only genes.
					for geneName in juncH['gene'][side]+juncH['bp_gene'][side]:
						geneObj = mygenome.gene(geneName,geneNameH,geneSetH,geneInfoH)
						geneInfoLL[side].append('%s:%s:%s' % (geneName,geneObj.getAttr('desc'),geneObj.getAttr('summary')))
						censusInfoLL[side].append('%s:%s:%s:%s' % (geneObj.getAttr('census_somatic'),geneObj.getAttr('census_germline'),geneObj.getAttr('census_mutType'),geneObj.getAttr('census_translocPartners')))

				outReportFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s;%s\t%s;%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \
					(juncType, juncH['splice_type'], sampN, ':'.join(key[0]), ':'.join(key[1]), \
					','.join(juncH['trans_exon'][0]), ','.join(juncH['trans_exon'][1]), \
					','.join(juncH['gene'][0]), ','.join(juncH['bp_gene'][0]), ','.join(juncH['gene'][1]), ','.join(juncH['bp_gene'][1]), \
					';'.join(geneInfoLL[0]), ';'.join(geneInfoLL[1]), ';'.join(censusInfoLL[0]), ';'.join(censusInfoLL[1]), \
					len(juncH['match']) ,len(set(juncH['seq'])), len(set(juncH['reg']))))

				for m in juncH['match']:
					outGsnapFile.write(m.rawText()+'\n')
Esempio n. 35
0
def fusion_proc_sort(inGsnapFileName, outGsnapFileName, outReportFileName,
                     sampN):
    '''
    Group two-segment '(transloc)' reads by junction and write a sorted report.

    Every read of inGsnapFileName must carry '(transloc)' in pairRel and
    have exactly two segments in its first match.  Reads are grouped by the
    two breakpoints (transcript strand, chromosome, position); for antisense
    reads the two sides and their transcript labels are swapped.  Junctions
    are sorted by decreasing number of distinct (direction, offset) pairs.

    inGsnapFileName   -- gsnap result file, read through mygsnap.gsnapFile
    outGsnapFileName  -- receives the raw text of all reads, junction by junction
    outReportFileName -- receives one 9-column tab-separated line per junction:
                         splice type, sampN, both breakpoints, both
                         transcript labels, number of reads, of distinct
                         sequences and of distinct (direction, offset) pairs
    sampN             -- sample name copied into the report

    Raises Exception for reads that violate the assumptions above or whose
    direction is neither 'sense' nor 'antisense'.
    '''

    flipH = {'+': '-', '-': '+'}

    result = mygsnap.gsnapFile(inGsnapFileName, False)

    juncHH = {}

    for r in result:

        match = r.matchL()[0]

        if '(transloc)' not in r.pairRel:
            raise Exception('read is not a translocation: pairRel=%s' % (r.pairRel,))

        if len(match.segL) != 2:
            raise Exception('expected 2 segments, found %s' % len(match.segL))

        splice_type = re.search('splice_type:([^,\t]*)',
                                match.segL[0][3]).group(1)
        direction = re.search('dir:([^,\t]*)', match.segL[0][3]).group(1)
        offset = int(re.search(r'\.\.([0-9]*)', match.segL[0][1]).group(1))

        # Transcript label of each segment ('|' turned into ','); an
        # unlabelled segment contributes an empty string.
        transcriptL = []

        for seg in match.segL:

            rm = re.search('label_[12]:([^,\t]*)', seg[3])

            if rm:
                transcriptL.append(rm.group(1).replace('|', ','))
            else:
                transcriptL.append('')

        # Breakpoints: end of the first segment, start of the second.
        bp1 = re.match('([+-])([^:]+):[0-9]+..([0-9]+)', match.segL[0][2])
        bp2 = re.match('([+-])([^:]+):([0-9]+)..[0-9]+', match.segL[1][2])

        # The transcript strand equals the read strand for sense reads and
        # is flipped for antisense reads, whose two sides are also swapped.
        if direction == 'sense':
            key = ((bp1.group(1), ) + bp1.groups()[1:],
                   (bp2.group(1), ) + bp2.groups()[1:])
        elif direction == 'antisense':
            key = ((flipH[bp2.group(1)], ) + bp2.groups()[1:],
                   (flipH[bp1.group(1)], ) + bp1.groups()[1:])
            transcriptL = transcriptL[::-1]
        else:
            raise Exception('unexpected direction: %s' % (direction,))

        if key in juncHH:

            juncHH[key]['match'].append(r)
            juncHH[key]['seq'].append(r.seq())
            juncHH[key]['pos'].append((direction, offset))

        else:

            # Annotation of the first read seen represents the junction.
            juncHH[key] = {
                'match': [r],
                'splice_type': splice_type,
                'seq': [r.seq()],
                'pos': [(direction, offset)],
                'transcript': transcriptL
            }

    # Most distinct (direction, offset) pairs first; the sort is stable, so
    # ties keep the dictionary's iteration order.
    juncKH = sorted(juncHH.items(),
                    key=lambda kv: len(set(kv[1]['pos'])),
                    reverse=True)

    reportFmt = '\t'.join(['%s'] * 9) + '\n'

    # Both outputs are closed even if writing a junction fails.
    with open(outGsnapFileName, 'w') as outGsnapFile:
        with open(outReportFileName, 'w') as outReportFile:

            for (key, juncH) in juncKH:

                outReportFile.write(reportFmt % (
                    juncH['splice_type'], sampN,
                    key[0][0] + ':'.join(key[0][1:]),
                    key[1][0] + ':'.join(key[1][1:]),
                    juncH['transcript'][0], juncH['transcript'][1],
                    len(juncH['match']), len(set(juncH['seq'])),
                    len(set(juncH['pos']))))

                for m in juncH['match']:
                    outGsnapFile.write(m.rawText() + '\n')
Esempio n. 36
0
def make_samse(ifileN, ofileN):
	'''
	Print SAM-style records for the reads of a single-end gsnap result file.

	The header lines returned by make_header() are printed first, followed
	by one tab-separated 12-column line per read (one line per segment for
	translocation reads): qname, flag, rname, pos, mapq, cigar, rnext,
	pnext, tlen, seq, qual and the optional tags.

	NOTE(review): neither ifileN nor ofileN is used by the body below; the
	input path is hard-coded and everything is written to stdout with
	print.  Confirm whether that is intentional before relying on the
	arguments.
	'''

	headerL = make_header('/data1/Sequence/ucsc_hg19/hg19.chrom.sizes')
#	ofile = open(ofileN, 'w')
	for header in headerL:
		print header
#		ofile.write('%s\n' % header)

	# Hard-coded test input; the second argument is False here and reads are
	# iterated one at a time (presumably the single-end mode -- TODO confirm).
	result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/S022_single.gsnap',False)
	#result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/test.gsnap',False)
	#result = mygsnap.gsnapFile('/pipeline/test_ini_gsnap2sam/S022_pair.gsnap',True)

	## for unpaired
	for r in result:

		# Per-read defaults for the 11 mandatory columns plus the tag column.
		qname = r.rid()
		flag = 0x0
		rname = '*'
		pos = 0
		mapq = 0
		cigar = ''
		rnext = '*' ## assume --npath=1 (maximum 1 alignment per read)
		pnext = 0 ## assume --npath=1 (maximum 1 alignment per read)
		tlen = 0  ## assume --npath=1 (maximum 1 alignment per read)
		seq = r.seq()
		qual = r.qual()
		extra = 'NH:i:1\tHI:i:1' ## assume --npath=1 (maximum 1 alignment per read)

		if r.nLoci > 1:
			# Reads with more than one locus are emitted with flag bit 0x4
			# set, '*' as CIGAR and no tags.
			flag = flag | 0x4
			cigar = '*'
			new_cigar = '*'
			extra = ''
			print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, new_cigar, rnext, pnext, tlen, seq, qual, extra))
		else:
			# NOTE(review): reads with nLoci == 0 also reach this branch and
			# index matchL()[0] -- confirm matchL() is never empty for them.
			if r.pairRel == '(transloc)':
				# Translocation read: every segment becomes its own record.
				match = r.matchL()[0]
				segL = match.getSegInfo()
				mapq = segL[0].mapq
				seq = r.seq()
				qual = r.qual()
				for seg in segL:
					flag = 0x0
					# Segment coordinates look like '<strand><chrom>:<pos1>..<pos2>'.
					(strand, rname, pos1, pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', seg.seg[2]).groups()
					pos = min(int(pos1), int(pos2))
					(cigar,clip) = seg.toCIGAR_trans()
					# The sign of clip selects which part of the read this
					# segment covers.
					if clip < 0: ## first half
						seq2 = seq[:clip]
						qual2 = qual[:clip]
					else: ## second half
						seq2 = seq[clip:]
						qual2 = qual[clip:]
					if strand == '-':
						# Minus strand: set flag bit 0x10 and emit the
						# reverse-complemented sequence with reversed qualities.
						flag = flag | 0x10
						seq2 = mybasic.rc(seq2)
						qual2 = mybasic.rev(qual2)
#					print qname,seg.toCIGAR_trans()
					print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq2, qual2, extra))
			else:
				match = r.matchL()[0] ## assume --npath=1 (maximum 1 alignment per read)

				segL = match.getSegInfo()
				(strand, rname, pos1, pos2) = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', segL[0].seg[2]).groups()
				pos = min(int(pos1), int(pos2))
				mapq = segL[0].mapq
				seg_nm = segL[0].numSub
				# cigar2 is the CIGAR that is actually printed for this read.
				# The 'cigar' and 'new_cigar' strings assembled below are only
				# referenced by the commented-out print at the end.
				cigar2 = match.toCIGAR()
#				print qname, match.toCIGAR()

				# Leading soft clip of the first segment.
				if segL[0].start != '' and segL[0].start != '0':
					cigar = str(segL[0].start) + 'S'

				# For minus-strand reads operations are prepended, for
				# plus-strand reads they are appended.
				if strand == '-':
					cigar = str(segL[0].numMatch + segL[0].numSub) + 'M' + cigar
					if segL[0].ins != '' and segL[0].ins != '0':
						cigar = str(segL[0].ins) + 'I' + cigar
				else:
					cigar = cigar + str(segL[0].numMatch + segL[0].numSub) + 'M'
					if segL[0].ins != '' and segL[0].ins != '0':
						cigar = cigar + str(segL[0].ins) + 'I'

				# toCIGAR(True) is used for the last segment of the read.
				if len(segL) == 1:
					new_cigar = segL[0].toCIGAR(True)
				else:
					new_cigar = segL[0].toCIGAR()
				prev_cigar = new_cigar
				index = 0
				for seg in segL[1:]:
					index = index + 1
					if index == (len(segL) - 1):
						final = True
					else:
						final = False
					rm = re.search('([\+\-])(.*):([0-9]+)\.\.([0-9]+)', seg.seg[2]).groups()

					# 'match' is rebound from the alignment object to this
					# segment's 'M' operation string from here on.
					match = str(seg.numMatch + seg.numSub) + 'M'

					if seg.ins != '' and seg.ins != '0':
						ins = str(seg.ins) + 'I'
					else:
						ins = ''

					# Keep the smallest coordinate seen so far as pos.
					if pos == 0 or pos > min(int(rm[2]), int(rm[3])):
						pos = min(int(rm[2]), int(rm[3]))
					# dist is the gap between the previous segment's end
					# (pos2) and this segment's start; a positive gap becomes
					# an 'N' operation.
					if strand == '-':
						dist = int(pos2) - int(rm[2]) - 1
						if dist > 0:
							cigar = match + ins + str(dist) + 'N' + cigar
						else:
							cigar = match + ins + cigar
					else:
						dist = int(rm[2]) - int(pos2) - 1
						if dist > 0:
							cigar = cigar + str(dist) + 'N' + match + ins
						else:
							cigar = cigar + match + ins
					seg_nm = seg_nm + seg.numSub
					# Remember this segment's coordinates for the next gap.
					pos1 = rm[2]
					pos2 = rm[3]
					cur_cigar = seg.toCIGAR(final)
					# The 'N' gap is only inserted when the previous segment's
					# CIGAR holds neither a deletion nor an insertion.
					if strand == '-':
						if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
							new_cigar = cur_cigar + str(dist) + 'N' + new_cigar
						else:
							new_cigar = cur_cigar + new_cigar
					else:
						if dist > 0 and 'D' not in prev_cigar and 'I' not in prev_cigar:
							new_cigar = new_cigar + str(dist) + 'N' + cur_cigar
						else:
							new_cigar = new_cigar + cur_cigar
					prev_cigar = cur_cigar

				# Trailing soft clip of the last segment.
				if segL[-1].end != '' and segL[-1].end != '0': ## last segment
					if strand == '-':
						cigar = str(segL[-1].end) + 'S' + cigar
					else:
						cigar = cigar + str(segL[-1].end) + 'S'

				# NM tag: substitutions summed over all segments.
				extra = extra + ('\tNM:i:%s' % seg_nm)

				if strand == '-':
					flag = flag | 0x10
					seq = mybasic.rc(seq)
					qual = mybasic.rev(qual)

##			print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual, extra))
				print ('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (qname, flag, rname, pos, mapq, cigar2, rnext, pnext, tlen, seq, qual, extra))
Esempio n. 37
0
def exonSkip_filter(inFileName, outFileName):
    '''
	filters-in exon-skipping candidates in splice-mapped gsnap
	'''

    result = mygsnap.gsnapFile(inFileName, False)
    if outFileName[-3:] == '.gz':
        outFile = gzip.open(outFileName, 'wb')
    else:
        outFile = open(outFileName, 'w')

    count_all = 0
    count_include = 0

    for r in result:

        if r.nLoci != 1:
            continue

        match = r.matchL()[0]

        if len(match.segL) != 2:
            continue

        segObjL = match.getSegInfo()

        jncH = {}

        skip = False

        for segObj in segObjL:

            if segObj.span - segObj.numMatch > 2 or segObj.percMatch < 90 or segObj.span < 5:
                skip = True
                break

            if segObj.label == '':
                break

            for b in segObj.label.split('|'):

                rm2 = re.match('(.*)\.exon([0-9]+)\/[0-9]+', b)

                transId = rm2.group(1)
                exonNum = int(rm2.group(2))

                mybasic.addHash(jncH, transId, exonNum)

        if skip:
            continue

        jncL = jncH.items()

        if len(jncL) > 0 and max([len(j[1]) for j in jncL]) > 1:

            minDist = 100

            for i in range(len(jncL)):

                if len(jncL[i][1]) == 2 and abs(jncL[i][1][0] -
                                                jncL[i][1][1]) < minDist:
                    minDist = abs(jncL[i][1][0] - jncL[i][1][1])

            if minDist == 1:  # only difference

                outFile.write(r.rawText() + '\n')
                count_include += 1

        count_all += 1

    print 'Results:', count_include, count_all
Esempio n. 38
0
import sys
import mygsnap



if len(sys.argv) >= 3:
	inFileName = sys.argv[1]
	outFileName = sys.argv[2]
else:
	inFileName = '/Data2/RNASeq_SMC1_S02_result.txt'
	outFileName = 'GH_S02_matchfilter2.txt'

matchCutOff = 90

result = mygsnap.gsnapFile(inFileName)
outFile = open(outFileName, 'w')


for rL in result:

	if not (rL[0].nLoci==1 and rL[1].nLoci==1 and rL[0].pairRel=='unpaired') or '(transloc)' in rL[0].pairRel:
		continue

	if not (len(rL[0].matchL()[0].mergedLocusL())==1 and len(rL[1].matchL()[0].mergedLocusL())==1):
		continue

	if not (rL[0].matchL()[0].numMatch()>=matchCutOff and rL[1].matchL()[0].numMatch()>=matchCutOff):
		continue

	for i in (0,1):