Beispiel #1
0
def detectRelationWithLeastFDR(
        statsData, FDRLimit=0.01, modeUsed=mode.onlyOne):
    """Select the relations that should be removed because of their FDR.

    statsData -- rows exposing the attributes id1, id2, FDRij and Zij.
    FDRLimit  -- a relation is only selected when its FDRij is below this.
    modeUsed  -- mode.onlyOne: select the single relation with the least
                 FDRij; mode.onePerHigher: select at most one relation for
                 each higher level element (id2).

    Returns a list of relations. In mode.onlyOne it holds at most one
    [id2, id1] pair; in mode.onePerHigher it is whatever
    stats.extractColumns(..., 0, 1) returns (presumably the same two
    columns -- confirm against the stats module).
    """
    relationsToRemove = []

    rowsByFDR = stats.sortByInstance(
        statsData, "FDRij", isDescendent=False)[::-1]

    if modeUsed == mode.onlyOne:
        bestRow = []  # an empty list means "nothing found yet"
        bestFDR = sys.float_info.max
        for row in rowsByFDR:
            if row.FDRij < bestFDR:
                bestFDR, bestRow = row.FDRij, row
        if bestRow != [] and bestRow.FDRij < FDRLimit:
            relationsToRemove = [[bestRow.id2, bestRow.id1]]

    if modeUsed == mode.onePerHigher:
        for row in rowsByFDR:
            # jmrc: rows whose FDRij is not a float are skipped
            if not (isinstance(row.FDRij, float) and row.FDRij < FDRLimit):
                continue

            sameHigher = stats.filterByElement(relationsToRemove, row.id2)

            if len(sameHigher) == 0:
                # first outlier relation seen for this higher level element
                relationsToRemove.append(
                    [row.id2, row.id1, row.FDRij, abs(row.Zij)])
                continue

            # warning!! none should have len(sameHigher) > 1
            if len(sameHigher) != 1:
                continue

            current = sameHigher[0]
            # the new row wins with a lower FDRij, or with a bigger |Zij|
            # when its FDRij is 0
            if (row.FDRij < current[2]
                    or (row.FDRij == 0 and abs(row.Zij) > current[3])):
                replacement = [row.id2, row.id1, row.FDRij, abs(row.Zij)]
                relationsToRemove.remove(current)
                relationsToRemove.append(replacement)

        # drop the helper columns (FDRij, |Zij|) used only for comparing
        relationsToRemove = stats.extractColumns(relationsToRemove, 0, 1)

    return relationsToRemove
Beispiel #2
0
def getInverseOfFit(mergedData, k, variance, alpha):
	"""Return the sorted getW_klibrate values of the rows with a shared sequence.

	For every row of mergedData, stats.filterByElement is asked for the rows
	matching the row's sequence (column 0); when more than one row comes
	back, getW_klibrate(weight, k, variance, alpha) is computed from the
	row's weight (column 3). The collected values are passed through sort(),
	a function provided by the enclosing module.
	"""
	# sort = False for speeding (mergedData is not re-sorted on every lookup)
	fitValues = [
		getW_klibrate(row[3], k, variance, alpha)
		for row in mergedData
		if len(stats.filterByElement(mergedData, row[0], sort = False)) > 1
	]

	return sort(fitValues) # sort by weight
def detectRelationWithLeastFDR(statsData, FDRLimit = 0.01, modeUsed = mode.onlyOne):
	"""Select the relations that should be removed because of their FDR.

	statsData -- rows exposing the attributes id1, id2, FDRij and Zij.
	FDRLimit  -- a relation is only selected when its FDRij is below this.
	modeUsed  -- mode.onlyOne: select the single relation with the least
	             FDRij; mode.onePerHigher: select at most one relation for
	             each higher level element (id2).

	Returns a list of relations. In mode.onlyOne it holds at most one
	[id2, id1] pair; in mode.onePerHigher it is whatever
	stats.extractColumns(..., 0, 1) returns (presumably the same two
	columns -- confirm against the stats module).
	"""
	relationsToRemove = []

	rowsByFDR = stats.sortByInstance(statsData, "FDRij", isDescendent = False)[::-1]

	if modeUsed == mode.onlyOne:
		bestRow = [] # an empty list means "nothing found yet"
		bestFDR = sys.float_info.max
		for row in rowsByFDR:
			if row.FDRij < bestFDR:
				bestFDR, bestRow = row.FDRij, row
		if bestRow != [] and bestRow.FDRij < FDRLimit:
			relationsToRemove = [[bestRow.id2, bestRow.id1]]

	if modeUsed == mode.onePerHigher:
		for row in rowsByFDR:
			# jmrc: rows whose FDRij is not a float are skipped
			if not (isinstance(row.FDRij, float) and row.FDRij < FDRLimit):
				continue

			sameHigher = stats.filterByElement(relationsToRemove, row.id2)

			if len(sameHigher) == 0:
				# first outlier relation seen for this higher level element
				relationsToRemove.append([row.id2, row.id1, row.FDRij, abs(row.Zij)])
				continue

			# warning!! none should have len(sameHigher) > 1
			if len(sameHigher) != 1:
				continue

			current = sameHigher[0]
			# the new row wins with a lower FDRij, or with a bigger |Zij| when its FDRij is 0
			if row.FDRij < current[2] or (row.FDRij == 0 and abs(row.Zij) > current[3]):
				replacement = [row.id2, row.id1, row.FDRij, abs(row.Zij)]
				relationsToRemove.remove(current)
				relationsToRemove.append(replacement)

		# drop the helper columns (FDRij, |Zij|) used only for comparing
		relationsToRemove = stats.extractColumns(relationsToRemove, 0, 1)

	return relationsToRemove
Beispiel #4
0
def replaceRelations(relations, FASTAHeaders = []):
	"""Swap the second element of each relation for its FASTA header entry.

	relations    -- rows whose element 1 is looked up in FASTAHeaders.
	FASTAHeaders -- rows searched by their element 0; element 1 of the first
	                match replaces the relation's element 1.

	When FASTAHeaders is empty the relations are returned sorted by index 1
	and otherwise untouched. Relations without a match are dropped from the
	result.
	"""
	orderedRelations = stats.sortByIndex(relations, 1)
	orderedHeaders = stats.sortByIndex(FASTAHeaders, 0)

	# nothing to translate with: hand back the sorted relations as they are
	if len(orderedHeaders) == 0:
		return orderedRelations

	replaced = []
	for rel in orderedRelations:
		# sort = False because orderedHeaders has just been sorted by index 0
		matches = stats.filterByElement(orderedHeaders, rel[1], index = 0, sort = False)
		if len(matches) > 0:
			replaced.append([rel[0], matches[0][1]])

	return replaced
Beispiel #5
0
def getInverseOfFit(mergedData, k, variance, alpha):
    """Return the sorted getW_klibrate values of the rows with a shared sequence.

    For every row of mergedData, stats.filterByElement is asked for the rows
    matching the row's sequence (column 0); when more than one row comes
    back, getW_klibrate(weight, k, variance, alpha) is computed from the
    row's weight (column 3). The collected values are passed through sort(),
    a function provided by the enclosing module.
    """
    # sort=False for speeding (mergedData is not re-sorted on every lookup)
    fitValues = [
        getW_klibrate(row[3], k, variance, alpha)
        for row in mergedData
        if len(stats.filterByElement(mergedData, row[0], sort=False)) > 1
    ]

    return sort(fitValues)  # sort by weight
Beispiel #6
0
def getMADDistribution(nextIdX,
                       mergedData,
                       k,
                       variance,
                       alpha,
                       medianSide=100,
                       showGraph=False,
                       verbose=False):
    """Estimate 1 / (MADconstant * running median)**2 for every scan, by weight.

    nextIdX    -- rows with the sequence in column 0 and the value that is
                  subtracted from each scan value in column 1.
    mergedData -- scan rows (sequence = 0, scan number = 1, value = 2,
                  weight = 3); it should arrive already sorted, because it
                  is searched with sort=False.
    k, variance -- only reported in the verbose output.
    alpha, showGraph -- not used by this function.
    medianSide -- number of neighbours taken on each side of a row to
                  compute its running median (window of 2 * medianSide + 1).
    verbose    -- print diagnostics.

    Returns (MADDistribution, distrWeight): two parallel lists following
    the order given by stats.sortByIndex(rows, 3), i.e. sorted by weight.

    Calls sys.exit() when there are not enough rows to fill one complete
    window. Sequences with a single scan are ignored. A running median of
    zero raises ZeroDivisionError.
    """
    MADconstant = 1.48260221850560  # *** 1 / DISTR.NORM.ESTAND.INV(3 / 4) get exact number
    MADDistribution = []
    distrWeight = []

    newlist = []
    for orow in nextIdX:
        sequence = orow[0]
        # it is important to avoid sorting to keep it fast, so do not
        # forget sort=False; mergedData should arrive here already sorted
        scanListWithSequence = stats.filterByElement(mergedData,
                                                     sequence,
                                                     sort=False)
        degreesOfFreedom = len(scanListWithSequence)

        # with a single scan Xi = Xj --> Xi - Xj = 0 --> does not work
        if degreesOfFreedom <= 1:
            continue

        # same factor for every scan of the sequence, so compute it once
        correction = sqrt(
            float(degreesOfFreedom) / float(degreesOfFreedom - 1))

        for scanRow in scanListWithSequence:
            XiXj = scanRow[2] - orow[1]
            newlist.append([
                sequence,  # sequence = 0
                scanRow[1],  # scan number = 1
                XiXj,  # Xi - Xj = 2
                scanRow[3],  # weight (the V) = 3
                degreesOfFreedom,  # degrees of freedom = 4
                fabs(XiXj) * correction,  # corrected |Xi - Xj| = 5
                0,  # space to save the median = 6
                0,  # space to save the MAD formula = 7
            ])

    newlist = stats.sortByIndex(newlist, 3)  # sort by weight
    rowCount = len(newlist)

    # At least 2 * medianSide + 1 rows are needed to get one full window;
    # with fewer rows the borders would be filled from a slot that never
    # received a median (still 0) and the MAD formula would divide by zero.
    if medianSide > 0 and rowCount <= 2 * medianSide:
        if verbose:
            print('Not enough data to perform statistics,')
            print('len(newlist) = %s, while medianSide = %s' %
                  (str(rowCount), str(medianSide)))
        sys.exit()

    # running median for every row that has a complete window around it
    for i in range(medianSide, rowCount - medianSide):
        window = newlist[i - medianSide:i + medianSide + 1]
        newlist[i][6] = stats.medianByIndex(window, 5)

    # fill the borders with the nearest computed median: the first one
    # (index medianSide) on the left, the last one on the right
    for i in range(medianSide):
        newlist[i][6] = newlist[medianSide][6]

    for i in range(rowCount - medianSide, rowCount):
        newlist[i][6] = newlist[rowCount - medianSide - 1][6]

    # fill MAD formula
    for row in newlist:
        row[7] = 1 / (MADconstant * row[6])**2
        MADDistribution.append(row[7])
        distrWeight.append(row[3])

    if verbose:
        print('k = %f, var = %f' % (k, variance))

    return MADDistribution, distrWeight
Beispiel #7
0
def getMADDistribution(nextIdX,
						mergedData,
						k,
						variance,
						alpha,
						medianSide = 100,
						showGraph = False,
						verbose = False):
	"""Estimate 1 / (MADconstant * running median)**2 for every scan, by weight.

	nextIdX    -- rows with the sequence in column 0 and the value that is
	              subtracted from each scan value in column 1.
	mergedData -- scan rows (sequence = 0, scan number = 1, value = 2,
	              weight = 3); it should arrive already sorted, because it
	              is searched with sort = False.
	k, variance -- only reported in the verbose output.
	alpha, showGraph -- not used by this function.
	medianSide -- number of neighbours taken on each side of a row to
	              compute its running median (window of 2 * medianSide + 1).
	verbose    -- print diagnostics.

	Returns (MADDistribution, distrWeight): two parallel lists following
	the order given by stats.sortByIndex(rows, 3), i.e. sorted by weight.

	Calls sys.exit() when there are not enough rows to fill one complete
	window. Sequences with a single scan are ignored. A running median of
	zero raises ZeroDivisionError.
	"""
	MADconstant = 1.48260221850560 # *** 1 / DISTR.NORM.ESTAND.INV(3 / 4) get exact number
	MADDistribution = []
	distrWeight = []

	newlist = []
	for orow in nextIdX:
		sequence = orow[0]
		# it is important to avoid sorting to keep it fast, so do not
		# forget sort = False; mergedData should arrive here already sorted
		scanListWithSequence = stats.filterByElement(mergedData, sequence, sort = False)
		degreesOfFreedom = len(scanListWithSequence)

		# with a single scan Xi = Xj --> Xi - Xj = 0 --> does not work
		if degreesOfFreedom <= 1:
			continue

		# same factor for every scan of the sequence, so compute it once
		correction = sqrt(float(degreesOfFreedom) / float(degreesOfFreedom - 1))

		for scanRow in scanListWithSequence:
			XiXj = scanRow[2] - orow[1]
			newlist.append([
				sequence, # sequence = 0
				scanRow[1], # scan number = 1
				XiXj, # Xi - Xj = 2
				scanRow[3], # weight (the V) = 3
				degreesOfFreedom, # degrees of freedom = 4
				fabs(XiXj) * correction, # corrected |Xi - Xj| = 5
				0, # space to save the median = 6
				0, # space to save the MAD formula = 7
			])

	newlist = stats.sortByIndex(newlist, 3) # sort by weight
	rowCount = len(newlist)

	# At least 2 * medianSide + 1 rows are needed to get one full window;
	# with fewer rows the borders would be filled from a slot that never
	# received a median (still 0) and the MAD formula would divide by zero.
	if medianSide > 0 and rowCount <= 2 * medianSide:
		if verbose:
			print('Not enough data to perform statistics,')
			print('len(newlist) = %s, while medianSide = %s' % (str(rowCount), str(medianSide)))
		sys.exit()

	# running median for every row that has a complete window around it
	for i in range(medianSide, rowCount - medianSide):
		window = newlist[i - medianSide:i + medianSide + 1]
		newlist[i][6] = stats.medianByIndex(window, 5)

	# fill the borders with the nearest computed median: the first one
	# (index medianSide) on the left, the last one on the right
	for i in range(medianSide):
		newlist[i][6] = newlist[medianSide][6]

	for i in range(rowCount - medianSide, rowCount):
		newlist[i][6] = newlist[rowCount - medianSide - 1][6]

	# fill MAD formula
	for row in newlist:
		row[7] = 1 / (MADconstant * row[6]) ** 2
		MADDistribution.append(row[7])
		distrWeight.append(row[3])

	if verbose:
		print('k = %f, var = %f' % (k, variance))

	return MADDistribution, distrWeight
Beispiel #8
0
def getRels(qcInputFile = "", listChangingCats = [], qcInputNoOutsFile = "", modeSanXoTSieve = "newWay", caseSensitive = True, outlierTag = "out"):
	"""Count relations and outliers pointing to changing / non-changing categories.

	qcInputFile       -- stats data file with all the relations; columns 0
	                     and 3 of each loaded row identify the relation
	                     (column 0 is matched against listChangingCats) and
	                     column 9 holds its tags.
	listChangingCats  -- flat list of category names considered as changing.
	qcInputNoOutsFile -- stats data file without the outlier relations; only
	                     used in "oldWay" mode.
	modeSanXoTSieve   -- "newWay": a relation is an outlier when outlierTag
	                     is present in its tags; "oldWay": a relation is an
	                     outlier when it is missing from qcInputNoOutsFile.
	                     "newWay" is forced when qcInputNoOutsFile is empty.
	                     Any other value leaves all the counters at zero.
	caseSensitive     -- when False, categories and relation identifiers are
	                     lowercased before being compared.
	outlierTag        -- tag looked for in "newWay" mode.

	Returns the tuple (numRelsChangingCats, numOutliersChangingCats,
	numOutliersNonChangingCats).
	"""
	numRelsChangingCats = 0
	numOutliersChangingCats = 0
	numOutliersNonChangingCats = 0

	# when no qcInputNoOutsFile file is present, the newWay option is used
	# this has already been sorted previously, but just in case...
	if len(qcInputNoOutsFile) == 0: modeSanXoTSieve = "newWay"

	qcInputRawList = stats.loadStatsDataFile(qcInputFile, FDRasText = True, ZasText = True, includeTags = True)

	# each category is nested in its own list to make it work with the
	# filterByElement method, which operates on lists of lists
	listChangingCatsList = []
	for cat in listChangingCats:
		if caseSensitive:
			listChangingCatsList.append([cat])
		else:
			listChangingCatsList.append([cat.lower()])

	# important NOT to sort listChangingCats itself, as this is not a list of
	# lists and sorting would only affect the first character instead of the
	# first string
	listChangingCatsSortedList = stats.sortByIndex(listChangingCatsList, 0)

	# get list of rels, nested in the same way: [[category, element, tags]]
	qcInput = []
	for qc in qcInputRawList:
		if caseSensitive:
			qcInput.append([[qc[0], qc[3], qc[9]]])
		else:
			qcInput.append([[qc[0].lower(), qc[3].lower(), qc[9]]])

	qcInputSortedList = stats.sortByIndex(qcInput, 0)

	if modeSanXoTSieve == "newWay":

		for qc in qcInputSortedList:
			relation = qc[0]
			isOutlier = stats.tagIsPresent(relation[2], outlierTag)

			if len(stats.filterByElement(listChangingCatsSortedList, relation[0], sort = False)) > 0:
				# relation pointing to a changing category
				numRelsChangingCats += 1
				if isOutlier:
					numOutliersChangingCats += 1
			elif isOutlier:
				# outlier pointing to a non-changing category
				numOutliersNonChangingCats += 1

	if modeSanXoTSieve == "oldWay":

		qcInputNoOutsSortedList = []
		if len(qcInputNoOutsFile) > 0:
			qcInputNoOutsRawList = stats.loadStatsDataFile(qcInputNoOutsFile, FDRasText = True, ZasText = True, includeTags = False)

			# nested as [[category, element]] to make it work with the filterByElement method
			qcInputNoOuts = []
			for qcno in qcInputNoOutsRawList:
				if caseSensitive:
					qcInputNoOuts.append([[qcno[0], qcno[3]]])
				else:
					qcInputNoOuts.append([[qcno[0].lower(), qcno[3].lower()]])

			qcInputNoOutsSortedList = stats.sortByIndex(qcInputNoOuts, 0)

		# print calls written so that the output is the same under Python 2 and 3
		print("")
		print("calculating with %i relations and %i changing categories..." % (len(qcInputSortedList), len(listChangingCats)))

		for qc in qcInputSortedList:
			relation = qc[0]
			# is the relation in qcInputNoOuts? If no --> 0 matches --> it is an outlier
			# the [0:2] part is to remove the space for tags
			isOutlier = len(stats.filterByElement(qcInputNoOutsSortedList, relation[0:2], sort = False)) == 0

			# better do not use something like "if x in list..." because that is quite slow
			if len(stats.filterByElement(listChangingCatsSortedList, relation[0], sort = False)) > 0:
				# this relation points to a changing category
				numRelsChangingCats += 1
				if isOutlier:
					numOutliersChangingCats += 1
			elif isOutlier:
				# this relation is an outlier in a non-changing category
				numOutliersNonChangingCats += 1

	return numRelsChangingCats, numOutliersChangingCats, numOutliersNonChangingCats