def model_on_params(true_grades, true_seats, true_value, model, params, summary, n_trials, tail_type=TailType.UNKNOWN): """ Run the given model on the given parameters. Inputs: true_grades: ExamGrades the actual grade distribution for the course true_seats: SeatingChart the actual seating chart for the course true_value: the result of calling `summary(true_grades, true_seats)` (for the purpose of efficiency over repeated calls) model: Class extending Model which is of interest params: paramtype(model) the parameters to plug into the model summary: (ExamGrades, SeatingChart) -> Float which we are testing granularity: Integer the number of parameter values to try Output: (parameter, probability, report). see plausible_parameters for more info """ current_model = model(true_grades, *params) model_values = [ summary(current_model.create_grades(true_seats), true_seats) for _ in range(n_trials) ] p_val = p_value(true_value, model_values, tail_type) return params, p_val, PermutationReport(true_value, model_values, tail_type)
def evaluateStateRegulatoryStructure(expressionData, all_indexes, group_index, MarkerFinder, SignatureGenes, state, query=None): """Predict multi-lineage cells and their associated coincident lineage-defining TFs""" useProbablityOfExpression = False ICGS_State_as_Row = False matrix, column_header, row_header, dataset_name, group_db = expressionData def importGeneLists(fn): genes = {} for line in open(fn, 'rU').xreadlines(): data = clustering.cleanUpLine(line) gene, cluster = string.split(data, '\t')[0:2] genes[gene] = cluster return genes def importMarkerFinderHits(fn): genes = {} ICGS_State_ranked = {} skip = True for line in open(fn, 'rU').xreadlines(): data = clustering.cleanUpLine(line) if skip: skip = False else: try: gene, symbol, rho, ICGS_State = string.split(data, '\t') except Exception: gene, symbol, rho, rho_p, ICGS_State = string.split( data, '\t') #if ICGS_State!=state and float(rho)>0.0: if float(rho) > 0.15: try: ICGS_State_ranked[ICGS_State].append( [float(rho), gene, symbol]) except Exception: ICGS_State_ranked[ICGS_State] = [[ float(rho), gene, symbol ]] for ICGS_State in ICGS_State_ranked: ICGS_State_ranked[ICGS_State].sort() ICGS_State_ranked[ICGS_State].reverse() #print ICGS_State, ICGS_State_ranked[ICGS_State][:50] for (rho, gene, symbol) in ICGS_State_ranked[ICGS_State][:50]: genes[ gene] = rho, ICGS_State ### Retain all population specific genes (lax) genes[symbol] = rho, ICGS_State return genes def importQueryDataset(fn): matrix, column_header, row_header, dataset_name, group_db = clustering.importData( fn) return matrix, column_header, row_header, dataset_name, group_db signatureGenes = importGeneLists(SignatureGenes) markerFinderGenes = importMarkerFinderHits(MarkerFinder) #print len(signatureGenes),len(markerFinderGenes) ### Determine for each gene, its population frequency per cell state index = 0 expressedGenesPerState = {} stateAssociatedMarkers = {} def freqCutoff(x, cutoff): if x > cutoff: return 1 ### minimum expression cutoff else: return 0 for row in matrix: ICGS_state_gene_frq = {} gene = row_header[index] for ICGS_state in group_index: state_values = map(lambda i: row[i], group_index[ICGS_state]) def freqCheck(x): if x > 1: return 1 ### minimum expression cutoff else: return 0 expStateCells = sum(map(lambda x: freqCheck(x), state_values)) statePercentage = (float(expStateCells) / len(group_index[ICGS_state])) ICGS_state_gene_frq[ICGS_state] = statePercentage datasets_values = map(lambda i: row[i], all_indexes) all_cells_frq = sum(map(lambda x: freqCheck(x), datasets_values)) / ( len(datasets_values) * 1.0) all_states_frq = map(lambda x: ICGS_state_gene_frq[x], ICGS_state_gene_frq) all_states_frq.sort() ### frequencies of all non-multilin states states_expressed = sum( map(lambda x: freqCutoff(x, 0.5), all_states_frq)) / (len(all_states_frq) * 1.0) for State in ICGS_state_gene_frq: state_frq = ICGS_state_gene_frq[State] rank = all_states_frq.index(state_frq) if state_frq > 0.25 and rank > 0: #and states_expressed<0.75 #and all_cells_frq>0.75 if 'Rik' not in gene and 'Gm' not in gene and '-' not in gene: if gene in markerFinderGenes: # and gene in markerFinderGenes: if ICGS_State_as_Row: ICGS_State = signatureGenes[gene] if gene in markerFinderGenes: if ICGS_State_as_Row == False: rho, ICGS_State = markerFinderGenes[gene] else: rho, ICGS_Cell_State = markerFinderGenes[ gene] #ICGS_Cell_State score = int(rho * 100 * state_frq) * ( float(rank) / len(all_states_frq)) try: expressedGenesPerState[ICGS_State].append( (score, gene)) except Exception: expressedGenesPerState[ICGS_State] = [ (score, gene) ] #(rank*multilin_frq) try: stateAssociatedMarkers[ gene, ICGS_State].append(State) except Exception: stateAssociatedMarkers[gene, ICGS_State] = [State] index += 1 if query != None: matrix, column_header, row_header, dataset_name, group_db = importQueryDataset( query) markers_to_exclude = [] expressedGenesPerState2 = {} for (gene, ICGS_State) in stateAssociatedMarkers: if len( stateAssociatedMarkers[(gene, ICGS_State)] ) < 2: # or len(stateAssociatedMarkers[(gene,ICGS_State)])>len(ICGS_state_gene_frq)/2.0: markers_to_exclude.append(gene) else: print ICGS_State, gene, stateAssociatedMarkers[(gene, ICGS_State)] for ICGS_State in expressedGenesPerState: for (score, gene) in expressedGenesPerState[ICGS_State]: if gene not in markers_to_exclude: try: expressedGenesPerState2[ICGS_State].append((score, gene)) except Exception: expressedGenesPerState2[ICGS_State] = [(score, gene)] expressedGenesPerState = expressedGenesPerState2 createPseudoCell = True ### The expressedGenesPerState defines genes and modules co-expressed in the multi-Lin ### Next, find the cells that are most frequent in mulitple states representativeMarkers = {} for ICGS_State in expressedGenesPerState: expressedGenesPerState[ICGS_State].sort() expressedGenesPerState[ICGS_State].reverse() if '1Multi' not in ICGS_State: markers = expressedGenesPerState[ICGS_State] #[:5] markers_unique = list(set(map(lambda x: x[1], list(markers)))) print ICGS_State, ":", string.join(markers_unique, ', ') if createPseudoCell: for gene in markers: def getBinary(x): if x > 1: return 1 else: return 0 if gene[1] in row_header: ### Only for query datasets row_index = row_header.index(gene[1]) if useProbablityOfExpression: pvalues = calculateGeneExpressProbilities( matrix[row_index] ) ### probability of expression values = pvalues else: binaryValues = map(lambda x: getBinary(x), matrix[row_index]) values = binaryValues #if gene[1]=='S100a8': print binaryValues;sys.exit() try: representativeMarkers[ICGS_State].append(values) except Exception: representativeMarkers[ICGS_State] = [values] else: representativeMarkers[ICGS_State] = markers[0][-1] #int(len(markers)*.25)>5: #print ICGS_State, markers #sys.exit() for ICGS_State in representativeMarkers: if createPseudoCell: signature_values = representativeMarkers[ICGS_State] if useProbablityOfExpression: signature_values = [ numpy.median(value) for value in zip(*signature_values) ] else: signature_values = [ int(numpy.median(value)) for value in zip(*signature_values) ] representativeMarkers[ICGS_State] = signature_values else: gene = representativeMarkers[ICGS_State] row_index = row_header.index(gene) gene_values = matrix[row_index] representativeMarkers[ICGS_State] = gene_values ### Determine for each gene, its population frequency per cell state expressedStatesPerCell = {} multilin_probability = {} import export print 'Writing results matrix to:', MarkerFinder[:-4] + '-cellStateScores.txt' eo = export.ExportFile(MarkerFinder[:-4] + '-cellStateScores.txt') eo.write(string.join(['UID'] + column_header, '\t') + '\n') for ICGS_State in representativeMarkers: gene_values = representativeMarkers[ICGS_State] index = 0 scoreMatrix = [] HitsCount = 0 for cell in column_header: value = gene_values[index] expressedLiklihood = '0' if (value < 0.05 and useProbablityOfExpression == True) or ( value == 1 and useProbablityOfExpression == False): try: expressedStatesPerCell[cell].append(ICGS_State) except Exception: expressedStatesPerCell[cell] = [ICGS_State] expressedLiklihood = '1' HitsCount += 1 if useProbablityOfExpression: try: multilin_probability[cell].append(value) except Exception: multilin_probability[cell] = [value] index += 1 scoreMatrix.append(expressedLiklihood) if HitsCount > 1: #print ICGS_State,HitsCount eo.write(string.join([ICGS_State] + scoreMatrix, '\t') + '\n') eo.close() def multiply(values): p = 1 for i in values: if i > 0: p = p * i else: p = p * 1.e-16 return p ### Compute a matrix of Cell-State to LineagePotential eo = export.ExportFile(MarkerFinder[:-4] + '-primingMatrix.txt') cell_state_matrix = {} lineages = [] cell_population_count = {} for cell in expressedStatesPerCell: cell_state = string.split(cell, ':')[0] #print cell, cell_state; sys.exit() try: cell_population_count[cell_state] += 1 except: cell_population_count[cell_state] = 1 if cell_state not in lineages: lineages.append(cell_state) primed_lineages = expressedStatesPerCell[cell] for primed_lineage in primed_lineages: if cell_state not in cell_state_matrix: priming_count = {} priming_count[primed_lineage] = 1 cell_state_matrix[cell_state] = priming_count else: priming_count = cell_state_matrix[cell_state] try: priming_count[primed_lineage] += 1 except: priming_count[primed_lineage] = 1 eo.write(string.join(['Cell Populations'] + lineages, '\t') + '\n') for cell_state in lineages: priming_count_db = cell_state_matrix[cell_state] sum_scores = [] for cell_state2 in lineages: if cell_state2 in priming_count_db: sum_score = priming_count_db[cell_state2] sum_scores.append( (sum_score * 1.000 / cell_population_count[cell_state2])) else: sum_scores.append(0) eo.write(string.join([cell_state] + map(str, sum_scores), '\t') + '\n') eo.close() cell_mutlilin_ranking = [] for cell in expressedStatesPerCell: #if 'Multi-Lin:Gmp.R3.10' in cell: sys.exit() if useProbablityOfExpression: p = numpy.mean( multilin_probability[cell]) ### mean state probability lineageCount = expressedStatesPerCell[cell] if useProbablityOfExpression: cell_mutlilin_ranking.append((p, len(lineageCount), cell)) else: cell_mutlilin_ranking.append((len(lineageCount), cell)) cell_mutlilin_ranking.sort() if useProbablityOfExpression == False: cell_mutlilin_ranking.reverse() scores = [] state_scores = {} cellsPerState = {} ### Denominator for z-score analysis for cell in cell_mutlilin_ranking: score = cell[0] scores.append(score) cell_state = string.split(cell[-1], ':')[0] try: cellsPerState[cell_state] += 1 except Exception: cellsPerState[cell_state] = 1 try: state_scores[cell_state].append(float(score)) except Exception: state_scores[cell_state] = [float(score)] scoreMean = numpy.mean(scores) scoreSD = numpy.std(scores) oneSD = scoreMean + scoreSD twoSD = scoreMean + scoreSD + scoreSD oneStandDeviationAway = {} twoStandDeviationsAway = {} oneStandDeviationAwayTotal = 0 twoStandDeviationsAwayTotal = 0 print 'Mean:', scoreMean print 'STDev:', scoreSD state_scores2 = [] for cell_state in state_scores: state_scores2.append( (numpy.mean(state_scores[cell_state]), cell_state)) i = 0 print 'Writing results matrix to:', MarkerFinder[:-4] + '-cell-combined-scores.txt' eo = export.ExportFile(MarkerFinder[:-4] + '-cell-combined-score.txt') for cell in cell_mutlilin_ranking: score, cellName = cell CellState, CellName = string.split(cellName, ':') if score >= oneSD: try: oneStandDeviationAway[CellState] += 1 except Exception: oneStandDeviationAway[CellState] = 1 oneStandDeviationAwayTotal += 1 if score >= twoSD: try: twoStandDeviationsAway[CellState] += 1 except Exception: twoStandDeviationsAway[CellState] = 1 twoStandDeviationsAwayTotal += 1 #print cell, string.join(expressedStatesPerCell[cell[-1]],'|') a = expressedStatesPerCell[cell[-1]] eo.write(str(cell[1]) + '\t' + str(cell[0]) + '\t' + '\n') i += 1 state_scores2 state_scores2.sort() state_scores2.reverse() eo.close() twoStandDeviationsAway = oneStandDeviationAway twoStandDeviationsAwayTotal = oneStandDeviationAwayTotal print '\n\n' import statistics zscores = [] for CellState in twoStandDeviationsAway: #print CellState highMetaScoreCells = twoStandDeviationsAway[CellState] totalCellsPerState = cellsPerState[CellState] r = highMetaScoreCells n = twoStandDeviationsAwayTotal R = totalCellsPerState N = len(column_header) z = statistics.zscore(r, n, N, R) scores = [z, CellState, statistics.p_value(z)] zscores.append(scores) zscores.sort() zscores.reverse() for scores in zscores: scores = string.join(map(str, scores), '\t') print scores """ for i in state_scores2: print str(i[0])+'\t'+str(i[1])""" sys.exit() return numpy.mean(state_scores)