Esempio n. 1
0
def conv_align2GRmat(aliDfile, pScoreCutoff, aliFormat):
    """Parse an alignment file into unique / non-unique read-to-genome maps.

    Every line that does not start with '@' or '#' is split on tabs. The
    first column is the read id. The reference id is taken from the third
    column for ``aliFormat`` 0 or 1 (after skipping entries whose FLAG has
    bit 0x4 set) and from the second column for ``aliFormat`` 2. Entries
    whose reference id is '*' are ignored. Reference ids containing
    ``ti|<digits>|org|<name>|`` are shortened to ``ti|<digits>|org|<name>``;
    otherwise ids containing ``ti|<digits>|`` are shortened to
    ``ti|<digits>``.

    Args:
        aliDfile: Path of the alignment file to read.
        pScoreCutoff: Forwarded unchanged to ``find_entry_score``.
        aliFormat: 0 or 1 (gnu-sam / sam column layout) or 2 (bl8 column
            layout).

    Returns:
        A tuple ``(U, NU, genomes, read)``:

        * ``U`` maps a read index to ``[genome index, score]`` for reads
          seen with exactly one distinct reference.
        * ``NU`` maps a read index to ``[genome indices, scores,
          normalized scores, max score]`` for reads seen with several
          distinct references; the normalized scores are each score
          divided by the sum of that read's scores.
        * ``genomes`` lists reference ids; position == genome index.
        * ``read`` lists read ids; position == read index.

        For ``aliFormat == 1`` both maps first pass through
        ``samUtils.rescale_samscore`` (defined elsewhere; presumably it
        keeps the same layout -- confirm against that helper).

    Raises:
        ValueError: If a data line is reached and ``aliFormat`` is not
            0, 1 or 2.
    """
    # Compiled once up front instead of on every alignment line.
    tiOrgPat = re.compile(r'ti\|(\d+)\|org\|([^|]+)\|')
    tiPat = re.compile(r'ti\|(\d+)\|')

    U = {}
    NU = {}
    h_readId = {}  # read id -> index into `read`
    h_refId = {}   # reference id -> index into `genomes`
    genomes = []
    read = []
    maxScore = None
    minScore = None

    # `with` guarantees the file is closed even if a malformed line raises.
    with open(aliDfile, 'r') as in1:
        for ln in in1:
            if ln[0] == '@' or ln[0] == '#':
                continue
            if not ln.strip():
                # Blank lines carry no alignment; skip them instead of
                # failing on the missing columns.
                continue

            l = ln.split('\t')
            readId = l[0]
            if aliFormat == 0 or aliFormat == 1:  # gnu-sam or sam
                if (int(l[1]) & 0x4) == 4:  # FLAG bit 0x4: segment unmapped
                    continue
                refId = l[2]
            elif aliFormat == 2:  # bl8
                refId = l[1]
            else:
                raise ValueError(
                    "unsupported aliFormat: %r (expected 0, 1 or 2)"
                    % (aliFormat,))

            if refId == '*':
                continue

            mObj = tiOrgPat.search(refId)
            if mObj:
                refId = "ti|" + mObj.group(1) + "|org|" + mObj.group(2)
            else:
                # (\d+) can only capture digits, so a "-1" taxon id never
                # matches here and needs no separate exclusion.
                mObj = tiPat.search(refId)
                if mObj:
                    refId = "ti|" + mObj.group(1)

            (pScore, skipFlag) = find_entry_score(
                ln, l, aliFormat, pScoreCutoff)
            if skipFlag:
                continue
            if maxScore is None or pScore > maxScore:
                maxScore = pScore
            if minScore is None or pScore < minScore:
                minScore = pScore

            gIdx = h_refId.get(refId)
            if gIdx is None:
                gIdx = len(genomes)
                h_refId[refId] = gIdx
                genomes.append(refId)

            rIdx = h_readId.get(readId)
            if rIdx is None:
                # First sighting of this read: it starts out as unique.
                rIdx = len(read)
                h_readId[readId] = rIdx
                read.append(readId)
                U[rIdx] = [[gIdx], [pScore], [float(pScore)], pScore]
                continue

            if rIdx in U:
                if gIdx in U[rIdx][0]:
                    continue  # same read/reference pair seen again
                # A second distinct reference makes the read non-unique.
                NU[rIdx] = U.pop(rIdx)
            elif gIdx in NU[rIdx][0]:
                continue  # same read/reference pair seen again

            entry = NU[rIdx]
            entry[0].append(gIdx)
            entry[1].append(pScore)
            if pScore > entry[3]:
                entry[3] = pScore

    if aliFormat == 1:  # sam
        (U, NU) = samUtils.rescale_samscore(U, NU, maxScore, minScore)

    # The id lookup tables are no longer needed; release them early.
    del h_refId, h_readId
    for rIdx in U:
        U[rIdx] = [U[rIdx][0][0], U[rIdx][1][0]]  # keep gIdx and score only
    for rIdx in NU:
        pScoreSum = sum(NU[rIdx][1])
        NU[rIdx][2] = [k / pScoreSum for k in NU[rIdx][1]]  # normalize

    return U, NU, genomes, read
Esempio n. 2
0
def conv_align2GRmat(aliDfile, pScoreCutoff, aliFormat):
    """Parse an alignment file into unique / non-unique read-to-genome maps.

    Every line that does not start with '@' or '#' is split on tabs. The
    first column is the read id. The reference id is taken from the third
    column for ``aliFormat`` 0 or 1 (after skipping entries whose FLAG has
    bit 0x4 set) and from the second column for ``aliFormat`` 2. Entries
    whose reference id is '*' are ignored. Reference ids containing
    ``ti|<digits>|org|<name>|gi`` are shortened to
    ``ti|<digits>|org|<name>``; otherwise ids containing
    ``ti|<digits>|gi`` are shortened to ``ti|<digits>``.

    Args:
        aliDfile: Path of the alignment file to read.
        pScoreCutoff: Forwarded unchanged to ``find_entry_score``.
        aliFormat: 0 or 1 (gnu-sam / sam column layout) or 2 (bl8 column
            layout).

    Returns:
        A tuple ``(U, NU, genomes, read)``:

        * ``U`` maps a read index to ``[genome index, score]`` for reads
          seen with exactly one distinct reference.
        * ``NU`` maps a read index to ``[genome indices, scores,
          normalized scores, max score]`` for reads seen with several
          distinct references; the normalized scores are each score
          divided by the sum of that read's scores.
        * ``genomes`` lists reference ids; position == genome index.
        * ``read`` lists read ids; position == read index.

        For ``aliFormat == 1`` both maps first pass through
        ``samUtils.rescale_samscore`` (defined elsewhere; presumably it
        keeps the same layout -- confirm against that helper).

    Raises:
        ValueError: If a data line is reached and ``aliFormat`` is not
            0, 1 or 2.
    """
    # Compiled once up front instead of on every alignment line.
    tiOrgGiPat = re.compile(r'ti\|(\d+)\|org\|([^|]+)\|gi')
    tiGiPat = re.compile(r'ti\|(\d+)\|gi')

    U = {}
    NU = {}
    h_readId = {}  # read id -> index into `read`
    h_refId = {}   # reference id -> index into `genomes`
    genomes = []
    read = []
    maxScore = None
    minScore = None

    # `with` guarantees the file is closed even if a malformed line raises.
    with open(aliDfile, 'r') as in1:
        for ln in in1:
            if ln[0] == '@' or ln[0] == '#':
                continue
            if not ln.strip():
                # Blank lines carry no alignment; skip them instead of
                # failing on the missing columns.
                continue

            l = ln.split('\t')
            readId = l[0]
            if aliFormat == 0 or aliFormat == 1:  # gnu-sam or sam
                if (int(l[1]) & 0x4) == 4:  # FLAG bit 0x4: segment unmapped
                    continue
                refId = l[2]
            elif aliFormat == 2:  # bl8
                refId = l[1]
            else:
                raise ValueError(
                    "unsupported aliFormat: %r (expected 0, 1 or 2)"
                    % (aliFormat,))

            if refId == '*':
                continue

            mObj = tiOrgGiPat.search(refId)
            if mObj:
                refId = "ti|" + mObj.group(1) + "|org|" + mObj.group(2)
            else:
                # (\d+) can only capture digits, so a "-1" taxon id never
                # matches here and needs no separate exclusion.
                mObj = tiGiPat.search(refId)
                if mObj:
                    refId = "ti|" + mObj.group(1)

            (pScore, skipFlag) = find_entry_score(
                ln, l, aliFormat, pScoreCutoff)
            if skipFlag:
                continue
            if maxScore is None or pScore > maxScore:
                maxScore = pScore
            if minScore is None or pScore < minScore:
                minScore = pScore

            gIdx = h_refId.get(refId)
            if gIdx is None:
                gIdx = len(genomes)
                h_refId[refId] = gIdx
                genomes.append(refId)

            rIdx = h_readId.get(readId)
            if rIdx is None:
                # First sighting of this read: it starts out as unique.
                rIdx = len(read)
                h_readId[readId] = rIdx
                read.append(readId)
                U[rIdx] = [[gIdx], [pScore], [float(pScore)], pScore]
                continue

            if rIdx in U:
                if gIdx in U[rIdx][0]:
                    continue  # same read/reference pair seen again
                # A second distinct reference makes the read non-unique.
                NU[rIdx] = U.pop(rIdx)
            elif gIdx in NU[rIdx][0]:
                continue  # same read/reference pair seen again

            entry = NU[rIdx]
            entry[0].append(gIdx)
            entry[1].append(pScore)
            if pScore > entry[3]:
                entry[3] = pScore

    if aliFormat == 1:  # sam
        (U, NU) = samUtils.rescale_samscore(U, NU, maxScore, minScore)

    # The id lookup tables are no longer needed; release them early.
    del h_refId, h_readId
    for rIdx in U:
        U[rIdx] = [U[rIdx][0][0], U[rIdx][1][0]]  # keep gIdx and score only
    for rIdx in NU:
        pScoreSum = sum(NU[rIdx][1])
        NU[rIdx][2] = [k / pScoreSum for k in NU[rIdx][1]]  # normalize

    return U, NU, genomes, read