def detectRelationWithLeastFDR(statsData, FDRLimit=0.01, modeUsed=mode.onlyOne):
    """Pick the relations that should be removed as outliers.

    Modes:
        mode.onlyOne      -> returns the single relation with the least FDR,
                             provided that FDR is below FDRLimit.
        mode.onePerHigher -> returns at most one relation per higher level
                             element (id2), each with FDR below FDRLimit.

    Returns a list of [id2, id1] pairs (possibly empty).
    """
    relationsToRemove = []
    sortedStats = stats.sortByInstance(statsData, "FDRij", isDescendent=False)[::-1]

    if modeUsed == mode.onlyOne:
        # NOTE(review): unlike the onePerHigher branch, this branch does not
        # check that FDRij is a float before comparing -- confirm whether the
        # same guard is needed here.
        bestRow = None
        bestFDR = sys.float_info.max
        for candidate in sortedStats:
            # strict comparison: on ties the first row seen is kept
            if candidate.FDRij < bestFDR:
                bestFDR = candidate.FDRij
                bestRow = candidate

        if bestRow is not None and bestRow.FDRij < FDRLimit:
            relationsToRemove = [[bestRow.id2, bestRow.id1]]

    if modeUsed == mode.onePerHigher:

        def asRemovalRow(row):
            # layout: higher id, lower id, FDR, |Z|
            return [row.id2, row.id1, row.FDRij, abs(row.Zij)]

        for candidate in sortedStats:
            # jmrc: only float FDR values are considered
            if not (isinstance(candidate.FDRij, float) and candidate.FDRij < FDRLimit):
                continue

            sameHigher = stats.filterByElement(relationsToRemove, candidate.id2)
            howMany = len(sameHigher)

            if howMany == 0:
                # first outlier relation seen for this higher level element
                relationsToRemove.append(asRemovalRow(candidate))
                continue

            # warning!! no element should ever have more than one entry;
            # if that happens the candidate is ignored
            if howMany != 1:
                continue

            current = sameHigher[0]
            # keep the one with the least FDRij, or the biggest |Zij| if FDRij == 0
            candidateWins = (
                candidate.FDRij < current[2]
                or (candidate.FDRij == 0 and abs(candidate.Zij) > current[3])
            )
            if candidateWins:
                replacement = asRemovalRow(candidate)
                relationsToRemove.remove(current)
                relationsToRemove.append(replacement)

        # drop the helper FDR and |Z| columns, keeping only [id2, id1]
        # NOTE(review): the source indentation was lost; this call is assumed
        # to belong to the onePerHigher branch -- confirm.
        relationsToRemove = stats.extractColumns(relationsToRemove, 0, 1)

    return relationsToRemove
def getInverseOfFit(mergedData, k, variance, alpha):
    """Return the sorted getW_klibrate values of the rows in mergedData whose
    sequence (column 0) appears more than once.

    For every such row, getW_klibrate is called with the row's weight
    (column 3) together with k, variance and alpha.
    """
    # sort=False is passed to filterByElement to keep the lookup fast
    fitValues = [
        getW_klibrate(row[3], k, variance, alpha)
        for row in mergedData
        if len(stats.filterByElement(mergedData, row[0], sort=False)) > 1
    ]

    # `sort` is not the builtin `sorted`; it is whatever `sort` is in scope in
    # this module (presumably numpy's, via a star import -- TODO confirm)
    return sort(fitValues)
def detectRelationWithLeastFDR(statsData, FDRLimit = 0.01, modeUsed = mode.onlyOne):
    """Pick the relations that should be removed as outliers.

    Modes:
        mode.onlyOne      -> returns the single relation with the least FDR,
                             provided that FDR is below FDRLimit.
        mode.onePerHigher -> returns at most one relation per higher level
                             element (id2), each with FDR below FDRLimit.

    Returns a list of [id2, id1] pairs (possibly empty).
    """
    relationsToRemove = []
    sortedStats = stats.sortByInstance(statsData, "FDRij", isDescendent = False)[::-1]

    if modeUsed == mode.onlyOne:
        # NOTE(review): unlike the onePerHigher branch, this branch does not
        # check that FDRij is a float before comparing -- confirm whether the
        # same guard is needed here.
        bestRow = None
        bestFDR = sys.float_info.max
        for candidate in sortedStats:
            # strict comparison: on ties the first row seen is kept
            if candidate.FDRij < bestFDR:
                bestFDR = candidate.FDRij
                bestRow = candidate

        if bestRow is not None and bestRow.FDRij < FDRLimit:
            relationsToRemove = [[bestRow.id2, bestRow.id1]]

    if modeUsed == mode.onePerHigher:

        def asRemovalRow(row):
            # layout: higher id, lower id, FDR, |Z|
            return [row.id2, row.id1, row.FDRij, abs(row.Zij)]

        for candidate in sortedStats:
            # jmrc: only float FDR values are considered
            if not (isinstance(candidate.FDRij, float) and candidate.FDRij < FDRLimit):
                continue

            sameHigher = stats.filterByElement(relationsToRemove, candidate.id2)
            howMany = len(sameHigher)

            if howMany == 0:
                # first outlier relation seen for this higher level element
                relationsToRemove.append(asRemovalRow(candidate))
                continue

            # warning!! no element should ever have more than one entry;
            # if that happens the candidate is ignored
            if howMany != 1:
                continue

            current = sameHigher[0]
            # keep the one with the least FDRij, or the biggest |Zij| if FDRij == 0
            candidateWins = (
                candidate.FDRij < current[2]
                or (candidate.FDRij == 0 and abs(candidate.Zij) > current[3])
            )
            if candidateWins:
                replacement = asRemovalRow(candidate)
                relationsToRemove.remove(current)
                relationsToRemove.append(replacement)

        # drop the helper FDR and |Z| columns, keeping only [id2, id1]
        # NOTE(review): the source indentation was lost; this call is assumed
        # to belong to the onePerHigher branch -- confirm.
        relationsToRemove = stats.extractColumns(relationsToRemove, 0, 1)

    return relationsToRemove
def replaceRelations(relations, FASTAHeaders=None):
    """Replace the second column of each relation using a lookup table.

    Parameters:
        relations    -- rows whose column 1 is the element to look up.
        FASTAHeaders -- rows of (key, replacement); column 0 is matched against
                        the relation's column 1. Defaults to no headers.

    Returns:
        If no headers are given, the relations sorted by column 1, otherwise a
        new list of [relation[0], replacement] rows. Relations whose element
        has no match in FASTAHeaders are left out of the result.
    """
    # a None sentinel avoids sharing one mutable default list between calls
    if FASTAHeaders is None:
        FASTAHeaders = []

    relationsSorted = stats.sortByIndex(relations, 1)
    FASTAHeadersSorted = stats.sortByIndex(FASTAHeaders, 0)

    # NOTE(review): the source indentation was lost; the `else` branch is read
    # as belonging to this emptiness check -- confirm.
    if len(FASTAHeadersSorted) == 0:
        return relationsSorted

    newRelations = []
    for relation in relationsSorted:
        # headers were sorted above, so sort=False skips re-sorting per lookup
        searchResult = stats.filterByElement(FASTAHeadersSorted, relation[1], index=0, sort=False)
        if len(searchResult) > 0:
            # only the first match is used
            newRelations.append([relation[0], searchResult[0][1]])

    return newRelations
def getInverseOfFit(mergedData, k, variance, alpha):
    """Return the sorted getW_klibrate values of the rows in mergedData whose
    sequence (column 0) appears more than once.

    For every such row, getW_klibrate is called with the row's weight
    (column 3) together with k, variance and alpha.
    """
    # sort=False is passed to filterByElement to keep the lookup fast
    fitValues = [
        getW_klibrate(row[3], k, variance, alpha)
        for row in mergedData
        if len(stats.filterByElement(mergedData, row[0], sort=False)) > 1
    ]

    # `sort` is not the builtin `sorted`; it is whatever `sort` is in scope in
    # this module (presumably numpy's, via a star import -- TODO confirm)
    return sort(fitValues)
def getMADDistribution(nextIdX, mergedData, k, variance, alpha, medianSide=100, showGraph=False, verbose=False):
    """Build a running-median (MAD based) distribution ordered by weight.

    For every row of nextIdX (column 0 = sequence, column 1 = the value
    subtracted from each scan), the rows of mergedData with the same sequence
    are collected. Sequences with a single scan are skipped. Each remaining
    scan yields a working row:

        0 sequence, 1 scan number, 2 Xi - Xj, 3 weight, 4 degrees of freedom,
        5 |Xi - Xj| * sqrt(n / (n - 1)), 6 running median, 7 MAD formula

    The working rows are sorted by weight, a median of column 5 is taken over
    a window of medianSide rows on each side, and column 7 is set to
    1 / (MADconstant * median) ** 2.

    Parameters:
        k, variance -- only used in the verbose report.
        alpha, showGraph -- not used by this function.
        medianSide -- number of rows on each side of the median window.
        verbose -- print diagnostics.

    Returns:
        (MADDistribution, distrWeight): column 7 and column 3 of the working
        rows, in weight order.

    Exits through sys.exit() when there are too few rows to fill the window.
    """
    # *** 1 / (inverse standard normal CDF at 3 / 4); get exact number
    MADconstant = 1.48260221850560

    newlist = []
    for orow in nextIdX:
        sequence = orow[0]
        # it is important to avoid sorting to keep it fast, hence sort=False;
        # mergedData should arrive here already sorted
        scanListWithSequence = stats.filterByElement(mergedData, sequence, sort=False)
        degreesOfFreedom = len(scanListWithSequence)

        # with a single scan Xi = Xj --> Xi - Xj = 0 --> does not work
        if degreesOfFreedom <= 1:
            continue

        correction = sqrt(float(degreesOfFreedom) / (float(degreesOfFreedom - 1)))
        for scanRow in scanListWithSequence:
            XiXj = scanRow[2] - orow[1]
            newlist.append([
                sequence,                  # sequence = 0
                scanRow[1],                # scan number = 1
                XiXj,                      # Xi - Xj = 2
                scanRow[3],                # weight (the V) = 3
                degreesOfFreedom,          # degrees of freedom = 4
                fabs(XiXj) * correction,   # = 5
                0,                         # space to save the median = 6
                0,                         # space to save the MAD formula = 7
            ])

    newlist = stats.sortByIndex(newlist, 3)  # sort by weight
    total = len(newlist)

    # Medians are computed for rows medianSide .. total - medianSide - 1, and
    # the left border is filled from row medianSide + 1. That row only holds a
    # computed median when total >= 2 * medianSide + 2; with fewer rows the
    # border would be filled with the placeholder 0 and the MAD formula below
    # would divide by zero, so those sizes are rejected here as well.
    if medianSide > 0:
        minimumRows = medianSide * 2 + 2
    else:
        minimumRows = medianSide * 2

    if total < minimumRows:
        if verbose:
            print('Not enough data to perform statistics,')
            print('len(newlist) = %s, while medianSide = %s' % (str(total), str(medianSide)))
        sys.exit()

    # running median over the rows that have a full window
    for i in range(total)[medianSide:total - medianSide]:
        window = newlist[i - medianSide:i + medianSide + 1]
        newlist[i][6] = stats.medianByIndex(window, 5)

    # fill the borders
    # NOTE(review): the left border copies row medianSide + 1 while the right
    # border copies the last computed row (total - medianSide - 1); the first
    # computed row is medianSide, so this may be an off-by-one. Left unchanged
    # to keep results identical -- confirm the intent.
    for i in range(total)[:medianSide]:
        newlist[i][6] = newlist[medianSide + 1][6]

    for i in range(total)[total - medianSide:]:
        newlist[i][6] = newlist[total - medianSide - 1][6]

    # fill MAD formula
    MADDistribution = []
    distrWeight = []
    for row in newlist:
        row[7] = 1 / (MADconstant * row[6]) ** 2
        MADDistribution.append(row[7])
        distrWeight.append(row[3])

    if verbose:
        print('k = %f, var = %f' % (k, variance))

    return MADDistribution, distrWeight
def getMADDistribution(nextIdX, mergedData, k, variance, alpha, medianSide = 100, showGraph = False, verbose = False):
    """Build a running-median (MAD based) distribution ordered by weight.

    For every row of nextIdX (column 0 = sequence, column 1 = the value
    subtracted from each scan), the rows of mergedData with the same sequence
    are collected. Sequences with a single scan are skipped. Each remaining
    scan yields a working row:

        0 sequence, 1 scan number, 2 Xi - Xj, 3 weight, 4 degrees of freedom,
        5 |Xi - Xj| * sqrt(n / (n - 1)), 6 running median, 7 MAD formula

    The working rows are sorted by weight, a median of column 5 is taken over
    a window of medianSide rows on each side, and column 7 is set to
    1 / (MADconstant * median) ** 2.

    Parameters:
        k, variance -- only used in the verbose report.
        alpha, showGraph -- not used by this function.
        medianSide -- number of rows on each side of the median window.
        verbose -- print diagnostics.

    Returns:
        (MADDistribution, distrWeight): column 7 and column 3 of the working
        rows, in weight order.

    Exits through sys.exit() when there are too few rows to fill the window.
    """
    # *** 1 / (inverse standard normal CDF at 3 / 4); get exact number
    MADconstant = 1.48260221850560

    newlist = []
    for orow in nextIdX:
        sequence = orow[0]
        # it is important to avoid sorting to keep it fast, hence sort = False;
        # mergedData should arrive here already sorted
        scanListWithSequence = stats.filterByElement(mergedData, sequence, sort = False)
        degreesOfFreedom = len(scanListWithSequence)

        # with a single scan Xi = Xj --> Xi - Xj = 0 --> does not work
        if degreesOfFreedom <= 1:
            continue

        correction = sqrt(float(degreesOfFreedom) / (float(degreesOfFreedom - 1)))
        for scanRow in scanListWithSequence:
            XiXj = scanRow[2] - orow[1]
            newlist.append([
                sequence,                  # sequence = 0
                scanRow[1],                # scan number = 1
                XiXj,                      # Xi - Xj = 2
                scanRow[3],                # weight (the V) = 3
                degreesOfFreedom,          # degrees of freedom = 4
                fabs(XiXj) * correction,   # = 5
                0,                         # space to save the median = 6
                0,                         # space to save the MAD formula = 7
            ])

    newlist = stats.sortByIndex(newlist, 3)  # sort by weight
    total = len(newlist)

    # Medians are computed for rows medianSide .. total - medianSide - 1, and
    # the left border is filled from row medianSide + 1. That row only holds a
    # computed median when total >= 2 * medianSide + 2; with fewer rows the
    # border would be filled with the placeholder 0 and the MAD formula below
    # would divide by zero, so those sizes are rejected here as well.
    if medianSide > 0:
        minimumRows = medianSide * 2 + 2
    else:
        minimumRows = medianSide * 2

    if total < minimumRows:
        if verbose:
            print('Not enough data to perform statistics,')
            print('len(newlist) = %s, while medianSide = %s' % (str(total), str(medianSide)))
        sys.exit()

    # running median over the rows that have a full window
    for i in range(total)[medianSide:total - medianSide]:
        window = newlist[i - medianSide:i + medianSide + 1]
        newlist[i][6] = stats.medianByIndex(window, 5)

    # fill the borders
    # NOTE(review): the left border copies row medianSide + 1 while the right
    # border copies the last computed row (total - medianSide - 1); the first
    # computed row is medianSide, so this may be an off-by-one. Left unchanged
    # to keep results identical -- confirm the intent.
    for i in range(total)[:medianSide]:
        newlist[i][6] = newlist[medianSide + 1][6]

    for i in range(total)[total - medianSide:]:
        newlist[i][6] = newlist[total - medianSide - 1][6]

    # fill MAD formula
    MADDistribution = []
    distrWeight = []
    for row in newlist:
        row[7] = 1 / (MADconstant * row[6]) ** 2
        MADDistribution.append(row[7])
        distrWeight.append(row[3])

    if verbose:
        print('k = %f, var = %f' % (k, variance))

    return MADDistribution, distrWeight
def getRels(qcInputFile = "", listChangingCats = None, qcInputNoOutsFile = "", modeSanXoTSieve = "newWay", caseSensitive = True, outlierTag = "out"):
    """Count relations and outliers pointing to changing / non-changing categories.

    Parameters:
        qcInputFile        -- stats data file with all the relations.
        listChangingCats   -- category names considered "changing"
                              (defaults to none).
        qcInputNoOutsFile  -- stats data file without outliers; only used by
                              the "oldWay" mode. When empty, "newWay" is forced.
        modeSanXoTSieve    -- "newWay": a relation is an outlier when its tag
                              field contains outlierTag.
                              "oldWay": a relation is an outlier when it is
                              missing from qcInputNoOutsFile.
        caseSensitive      -- when False, names are lower-cased before matching.
        outlierTag         -- tag that marks an outlier in "newWay" mode.

    Returns:
        (numRelsChangingCats, numOutliersChangingCats,
         numOutliersNonChangingCats)
    """
    # a None sentinel avoids sharing one mutable default list between calls
    if listChangingCats is None:
        listChangingCats = []

    numRelsChangingCats = 0
    numOutliersChangingCats = 0
    numOutliersNonChangingCats = 0

    # when no qcInputNoOutsFile is present, the newWay option is used
    # this has already been sorted out previously, but just in case...
    if len(qcInputNoOutsFile) == 0:
        modeSanXoTSieve = "newWay"

    qcInputRawList = stats.loadStatsDataFile(qcInputFile, FDRasText = True, ZasText = True, includeTags = True)

    # every item is nested in its own list so that it works with
    # filterByElement, which expects a list of lists
    if caseSensitive:
        listChangingCatsList = [[cat] for cat in listChangingCats]
    else:
        listChangingCatsList = [[cat.lower()] for cat in listChangingCats]

    # important NOT to sort listChangingCats itself, as that is not a list of
    # lists and sorting would only look at the first character instead of the
    # first string
    listChangingCatsSortedList = stats.sortByIndex(listChangingCatsList, 0)

    # get list of rels: each entry is [[column 0, column 3, column 9]]
    # (column 9 is what is later handed to tagIsPresent)
    if caseSensitive:
        qcInput = [[[qc[0], qc[3], qc[9]]] for qc in qcInputRawList]
    else:
        qcInput = [[[qc[0].lower(), qc[3].lower(), qc[9]]] for qc in qcInputRawList]

    qcInputSortedList = stats.sortByIndex(qcInput, 0)

    if modeSanXoTSieve == "newWay":
        for qc in qcInputSortedList:
            if len(stats.filterByElement(listChangingCatsSortedList, qc[0][0], sort = False)) > 0:
                # relation pointing to a changing category
                numRelsChangingCats += 1
                if stats.tagIsPresent(qc[0][2], outlierTag):
                    numOutliersChangingCats += 1
            else:
                # relation pointing to a non changing category
                if stats.tagIsPresent(qc[0][2], outlierTag):
                    numOutliersNonChangingCats += 1

    if modeSanXoTSieve == "oldWay":
        # (the original author left a note to remove an old commented-out copy
        # of the qcInput loading code here once moving it outside had proven
        # to work; that loading now happens above for both modes)
        qcInputNoOutsRawList = []
        if len(qcInputNoOutsFile) > 0:
            qcInputNoOutsRawList = stats.loadStatsDataFile(qcInputNoOutsFile, FDRasText = True, ZasText = True, includeTags = False)

        # nested list within list, again for filterByElement
        if caseSensitive:
            qcInputNoOuts = [[[qcno[0], qcno[3]]] for qcno in qcInputNoOutsRawList]
        else:
            qcInputNoOuts = [[[qcno[0].lower(), qcno[3].lower()]] for qcno in qcInputNoOutsRawList]

        qcInputNoOutsSortedList = stats.sortByIndex(qcInputNoOuts, 0)

        # call-style print with a single argument gives the same output on
        # Python 2 and also parses on Python 3
        print("")
        print("calculating with %i relations and %i changing categories..." % (len(qcInputSortedList), len(listChangingCats)))

        for qc in qcInputSortedList:
            # better not to use something like "if x in list..." because that
            # is quite slow
            # the [0:2] part removes the space for tags before the lookup
            missingFromNoOuts = len(stats.filterByElement(qcInputNoOutsSortedList, qc[0][0:2], sort = False)) == 0

            if len(stats.filterByElement(listChangingCatsSortedList, qc[0][0], sort = False)) > 0:
                # this relation points to a changing category
                numRelsChangingCats += 1
                if missingFromNoOuts:
                    # this relation is an outlier in a changing category
                    numOutliersChangingCats += 1
            else:
                # this relation points to a non-changing category
                if missingFromNoOuts:
                    # this relation is an outlier in a non-changing category
                    numOutliersNonChangingCats += 1

    return numRelsChangingCats, numOutliersChangingCats, numOutliersNonChangingCats