# Script fragment, collapsed onto a single line and cut off right after the
# final `for key in keys:` header (the loop body is not visible here).
#
# Reads six positional arguments from sys.argv: input directory, input file
# pattern, variables file path (the literal string "None" disables the
# variable filter), data value index (converted to int), a prefix for the
# patient IDs in the header, and the output file path.
#
# Patient IDs and the per-patient key/value dict both come from the
# project-local `utilities` module. The header row written to the output
# file is "Key" followed by every patient ID with the prefix prepended,
# joined by tabs.
#
# Row keys: when no variables file is given, the sorted keys of the first
# patient's dict; otherwise the intersection of the requested variables with
# the first patient's keys. The intersection is built from sets, so its
# order is not defined.
#
# NOTE(review): patientIDs[0] raises IndexError when no patient IDs are
# found -- confirm the caller guarantees at least one.
# NOTE(review): no outFile.close() is visible in this fragment; presumably it
# follows the truncated loop -- verify.
import os, sys, glob import utilities inDirPath = sys.argv[1] inFilePattern = sys.argv[2] variablesFilePath = sys.argv[3] dataValueIndex = int(sys.argv[4]) outPatientPrefix = sys.argv[5] outFilePath = sys.argv[6] patientIDs = utilities.getPatientIDs(inDirPath, inFilePattern) variables = None if variablesFilePath != "None": variables = utilities.readVectorFromFile(variablesFilePath) patientsKeyValuesDict = utilities.getPatientsKeyValuesDict( inDirPath, patientIDs, inFilePattern, dataValueIndex, variables) outFile = open(outFilePath, 'w') outFile.write("\t".join( ["Key"] + [outPatientPrefix + patientID for patientID in patientIDs]) + "\n") if variables == None: keys = sorted(patientsKeyValuesDict[patientIDs[0]].keys()) else: keys = list( set(variables) & set(patientsKeyValuesDict[patientIDs[0]].keys())) for key in keys:
# Script fragment, collapsed onto a single line and cut off after
# `output = ""` (the rest of the per-sample loop is not visible here).
#
# Reads four positional arguments from sys.argv: matrix file path, samples
# file path, output directory (passed through utilities.checkDirPath) and an
# output file pattern.
#
# The first row of the matrix is popped off and treated as the patient IDs.
# Sample IDs are read from the samples file with every "." replaced by "-".
# When the header row is as long as the first data row, its first cell is
# dropped (per the inline comment, a description in the first column).
# variableNames is the first cell of every remaining row.
#
# For each sample ID that appears in the header, the matching column is
# collected from every row; the `+ 1` skips the variable-name column.
#
# NOTE(review): dataPatientIDs.index(samplePatientID) is re-evaluated for
# every row inside the comprehension although it is constant per sample.
# NOTE(review): patientValuesDict is assigned but not used in the visible
# part of the script.
# NOTE(review): in this collapsed form the inline `#check ...` comment
# swallows the rest of the line; presumably the original had a line break
# right after "first column" -- verify against the original file.
import os, sys, glob import utilities matrixFilePath = sys.argv[1] samplesFilePath = sys.argv[2] outDirPath = utilities.checkDirPath(sys.argv[3]) outDirFilePattern = sys.argv[4] data = utilities.readMatrixFromFile(matrixFilePath) dataPatientIDs = data.pop(0) samplePatientIDs = [ x.replace(".", "-") for x in utilities.readVectorFromFile(samplesFilePath) ] if len(dataPatientIDs) == len(data[0]): #check for description in first column dataPatientIDs.pop(0) variableNames = [x[0] for x in data] patientValuesDict = {} for samplePatientID in samplePatientIDs: if not samplePatientID in dataPatientIDs: continue patientValues = [ x[dataPatientIDs.index(samplePatientID) + 1] for x in data ] output = ""
# Build a tab-delimited matrix (one row per key, one column per patient)
# from the per-patient key/value data returned by the `utilities` helpers.
#
# Command-line arguments:
#   1. inDirPath         - directory handed to the utilities helpers
#   2. inFilePattern     - file pattern handed to the utilities helpers
#   3. variablesFilePath - file listing the keys to keep, or the literal
#                          string "None" to keep every key
#   4. dataValueIndex    - integer index forwarded to
#                          utilities.getPatientsKeyValuesDict
#   5. outPatientPrefix  - text prepended to each patient ID in the header
#   6. outFilePath       - path of the matrix file to write
import os, sys, glob
import utilities

inDirPath = sys.argv[1]
inFilePattern = sys.argv[2]
variablesFilePath = sys.argv[3]
dataValueIndex = int(sys.argv[4])
outPatientPrefix = sys.argv[5]
outFilePath = sys.argv[6]

patientIDs = utilities.getPatientIDs(inDirPath, inFilePattern)

# The literal string "None" on the command line means "no variable filter".
variables = None
if variablesFilePath != "None":
    variables = utilities.readVectorFromFile(variablesFilePath)

patientsKeyValuesDict = utilities.getPatientsKeyValuesDict(
    inDirPath, patientIDs, inFilePattern, dataValueIndex, variables)

# The first patient's keys define which rows are written. They are resolved
# before the output file is opened so that a failure here (for example, no
# patients found) does not leave a header-only file behind.
firstPatientKeys = patientsKeyValuesDict[patientIDs[0]].keys()
if variables is None:
    keys = sorted(firstPatientKeys)
else:
    # Set intersection: the resulting row order is not defined.
    keys = list(set(variables) & set(firstPatientKeys))

# `with` guarantees the file is flushed and closed even if a row fails to
# format (e.g. a patient is missing one of the keys).
with open(outFilePath, 'w') as outFile:
    headerCells = ["Key"] + [outPatientPrefix + patientID for patientID in patientIDs]
    outFile.write("\t".join(headerCells) + "\n")

    for key in keys:
        rowCells = [key] + [patientsKeyValuesDict[patientID][key] for patientID in patientIDs]
        outFile.write("\t".join(rowCells) + "\n")
# Script fragment, collapsed onto a single line. It starts mid-script:
# MyCEL, Normalize, getMetadata and every *FilePath / *ColIndices name are
# defined outside the visible text. Uses Python 2 print statements and the
# Python 2 `file()` builtin.
#
# Flow as written:
#   * If outFilePath already exists, print "Already processed ..." and do
#     no work.
#   * Otherwise read the probe annotations via getMetadata, read the CEL
#     file via cel.read_cel, and optionally read a model-probes vector when
#     modelProbesFilePath exists. A missing file is reported only when the
#     path is not the literal string "None".
#   * norm.normalize(...) returns a mapping indexed by probe; element [0] is
#     stored as exprValues and element [1] as probValues.
#   * Each output line is the probe followed by the two values formatted
#     with nine decimal places, tab-separated.
#
# NOTE(review): probes are converted with str() before sorting and are then
# used to index normValues -- this assumes normValues is keyed by the string
# form of the probe IDs; confirm against Normalize.normalize.
# NOTE(review): presumably everything after the first `else:` belongs to
# that branch; indentation is lost in this collapsed form -- verify.
# NOTE(review): no outFile.close() is visible in this fragment.
cel = MyCEL() norm = Normalize() if os.path.exists(outFilePath): print "Already processed %s" % outFilePath else: print "Reading annotations" probe2seq, coord2probe = getMetadata(probeSequenceTabFilePath, probeSequenceTabColIndices) print "Reading " + celFilePath probeIntensity = cel.read_cel(celFilePath, coord2probe) modelProbes = None if os.path.exists(modelProbesFilePath): print "Reading model probes file" modelProbes = utilities.readVectorFromFile(modelProbesFilePath) else: if modelProbesFilePath != "None": print "No model probes file exists at " + modelProbesFilePath print "Normalizing to " + outFilePath normValues = norm.normalize(probeIntensity, probe2seq, modelProbes) probes = sorted([str(x) for x in probe2seq.keys()]) exprValues = [normValues[probe][0] for probe in probes] probValues = [normValues[probe][1] for probe in probes] outFile = file(outFilePath, 'w') for i in range(len(probes)): outFile.write("%s\t%.9f\t%.9f\n" % (probes[i], exprValues[i], probValues[i]))
# Script fragment, collapsed onto a single line. It opens with the `return`
# of a function whose `def` line is not visible, and it is cut off after the
# "Removing keys with few probes" message. Uses Python 2 print statements.
#
# Reads nine positional arguments from sys.argv: patient ID, input file
# path, data column index (int), key/probe file path, probe file path,
# minimum number of probes per key (int), the NAME of a summarize function,
# an outlier-samples file path, and the output file path.
#
# summarizeFunction is resolved by looking that name up as an attribute of
# the current module, so it must be a function defined in this same file.
#
# If the outlier-samples file exists and lists this patient ID, the script
# prints a notice and exits with status 0 without summarizing.
#
# keepProbes is the set of probes read from probeFilePath; each key's probe
# list in keyProbeDict is replaced by its intersection with keepProbes (the
# order inside each list is therefore not defined).
#
# NOTE(review): getattr raises AttributeError when sys.argv[7] does not name
# a module-level attribute -- confirm callers only pass valid names.
return utilities.calculateTrimmedMean(values) patientID = sys.argv[1] inFilePath = sys.argv[2] dataColumnIndex = int(sys.argv[3]) keyProbeFilePath = sys.argv[4] probeFilePath = sys.argv[5] minNumProbesPer = int(sys.argv[6]) summarizeFunction = getattr(sys.modules[__name__], sys.argv[7]) outlierSamplesFilePath = sys.argv[8] outFilePath = sys.argv[9] outlierSamples = [] if os.path.exists(outlierSamplesFilePath): outlierSamples = utilities.readVectorFromFile(outlierSamplesFilePath) if patientID in outlierSamples: print "%s is listed as an outlier, so it won't be summarized" % patientID sys.exit(0) keepProbes = set(utilities.readVectorFromFile(probeFilePath)) print "Getting key/probe dict from %s" % keyProbeFilePath keyProbeDict = utilities.getKeyProbeDict(keyProbeFilePath) print "Identifying probes to keep" for key in keyProbeDict.keys(): keyProbeDict[key] = list(set(keyProbeDict[key]) & keepProbes) print "Removing keys with few probes"
# Script fragment, collapsed onto a single line and cut off inside the
# synonym-lookup branch (after `entrezIDs = entrezGenesSynonymsDict[symbol]`).
# Uses dict.has_key, which exists only in Python 2.
#
# Reads four positional arguments from sys.argv: symbols file path, symbols
# matrix path, synonyms matrix path, and output file path.
#
# entrezGenesSymbolsDict maps row[1] -> row[0] for every row of the symbols
# matrix; when two rows share the same row[1], the later row wins.
# entrezGenesSynonymsDict maps each "|"-separated entry of row[1] in the
# synonyms matrix to the list of every row[0] it appeared with.
#
# For each input symbol, the symbols dict is consulted first and the
# synonyms dict only as a fallback.
#
# NOTE(review): presumably row[0] is the Entrez gene ID and row[1] the
# symbol / synonym list (inferred from the variable names) -- confirm
# against the input files.
# NOTE(review): the handling of ambiguous or unmatched symbols, the write to
# outFile and its close are all outside the visible text.
import os, sys, glob import utilities symbolsFilePath = sys.argv[1] entrezGenesSymbolsFilePath = sys.argv[2] entrezGenesSynonymsFilePath = sys.argv[3] outFilePath = sys.argv[4] symbols = utilities.readVectorFromFile(symbolsFilePath) entrezGenesSymbolsData = utilities.readMatrixFromFile( entrezGenesSymbolsFilePath) entrezGenesSymbolsDict = {} for row in entrezGenesSymbolsData: entrezGenesSymbolsDict[row[1]] = row[0] entrezGenesSynonymsData = utilities.readMatrixFromFile( entrezGenesSynonymsFilePath) entrezGenesSynonymsDict = {} for row in entrezGenesSynonymsData: for synonym in row[1].split("|"): entrezGenesSynonymsDict[synonym] = entrezGenesSynonymsDict.setdefault( synonym, []) + [row[0]] outFile = open(outFilePath, 'w') for symbol in symbols: if entrezGenesSymbolsDict.has_key(symbol): entrezID = entrezGenesSymbolsDict[symbol] else: if entrezGenesSynonymsDict.has_key(symbol): entrezIDs = entrezGenesSynonymsDict[symbol]
# Script fragment, collapsed onto a single line and cut off inside the
# synonym-lookup branch (after `entrezID = entrezIDs[0]`). Uses
# dict.has_key, which exists only in Python 2.
#
# Reads four positional arguments from sys.argv: symbols file path, symbols
# matrix path, synonyms matrix path, and output file path.
#
# entrezGenesSymbolsDict maps row[1] -> row[0] for every row of the symbols
# matrix; when two rows share the same row[1], the later row wins.
# entrezGenesSynonymsDict maps each "|"-separated entry of row[1] in the
# synonyms matrix to the list of every row[0] it appeared with.
#
# For each input symbol: a direct hit in the symbols dict is used as-is.
# Otherwise a synonym hit is accepted only when it resolves to exactly one
# ID; what happens for several candidates or for no match is outside the
# visible text.
#
# NOTE(review): presumably row[0] is the Entrez gene ID and row[1] the
# symbol / synonym list (inferred from the variable names) -- confirm
# against the input files.
# NOTE(review): the write to outFile and its close are not visible here.
import os, sys, glob import utilities symbolsFilePath = sys.argv[1] entrezGenesSymbolsFilePath = sys.argv[2] entrezGenesSynonymsFilePath = sys.argv[3] outFilePath = sys.argv[4] symbols = utilities.readVectorFromFile(symbolsFilePath) entrezGenesSymbolsData = utilities.readMatrixFromFile(entrezGenesSymbolsFilePath) entrezGenesSymbolsDict = {} for row in entrezGenesSymbolsData: entrezGenesSymbolsDict[row[1]] = row[0] entrezGenesSynonymsData = utilities.readMatrixFromFile(entrezGenesSynonymsFilePath) entrezGenesSynonymsDict = {} for row in entrezGenesSynonymsData: for synonym in row[1].split("|"): entrezGenesSynonymsDict[synonym] = entrezGenesSynonymsDict.setdefault(synonym, []) + [row[0]] outFile = open(outFilePath, 'w') for symbol in symbols: if entrezGenesSymbolsDict.has_key(symbol): entrezID = entrezGenesSymbolsDict[symbol] else: if entrezGenesSynonymsDict.has_key(symbol): entrezIDs = entrezGenesSynonymsDict[symbol] if len(entrezIDs) == 1: entrezID = entrezIDs[0]
# Script fragment, collapsed onto a single line. It starts mid-script and is
# cut off in the middle of the final outFile.write(...) call, right after
# the `%` operator. Uses Python 2 print statements and the Python 2 `file()`
# builtin.
#
# Normalize, getMetadata, `cel` and every *FilePath / *ColIndices name are
# defined outside the visible text.
#
# Flow as written:
#   * If outFilePath already exists, print "Already processed ..." and do
#     no work.
#   * Otherwise read the probe annotations via getMetadata, read the CEL
#     file via cel.read_cel, and optionally read a model-probes vector when
#     modelProbesFilePath exists. A missing file is reported only when the
#     path is not the literal string "None".
#   * norm.normalize(...) returns a mapping indexed by probe; element [0] is
#     stored as exprValues and element [1] as probValues.
#   * The write loop formats one string and two floats (nine decimals) per
#     probe, tab-separated; its arguments are not visible here.
#
# NOTE(review): probes are converted with str() before sorting and are then
# used to index normValues -- this assumes normValues is keyed by the string
# form of the probe IDs; confirm against Normalize.normalize.
# NOTE(review): presumably everything after the first `else:` belongs to
# that branch; indentation is lost in this collapsed form -- verify.
norm = Normalize() if os.path.exists(outFilePath): print "Already processed %s" % outFilePath else: print "Reading annotations" probe2seq, coord2probe = getMetadata(probeSequenceTabFilePath, probeSequenceTabColIndices) print "Reading " + celFilePath probeIntensity = cel.read_cel(celFilePath, coord2probe) modelProbes = None if os.path.exists(modelProbesFilePath): print "Reading model probes file" modelProbes = utilities.readVectorFromFile(modelProbesFilePath) else: if modelProbesFilePath != "None": print "No model probes file exists at " + modelProbesFilePath print "Normalizing to " + outFilePath normValues = norm.normalize(probeIntensity, probe2seq, modelProbes) probes = sorted([str(x) for x in probe2seq.keys()]) exprValues = [normValues[probe][0] for probe in probes] probValues = [normValues[probe][1] for probe in probes] outFile = file(outFilePath, 'w') for i in range(len(probes)): outFile.write("%s\t%.9f\t%.9f\n" %
# Combine a set of ID-list files into a single tab-delimited file with one
# line per input file.
#
# Command-line arguments:
#   1. idFilePattern - glob pattern selecting the ID files to combine
#   2. outFilePath   - path of the combined file to write
#
# Each output line is: <name> TAB <name> TAB <id> TAB <id> ...
# where <name> is the input file's base name with ".gene.ids" and ".txt"
# removed, and the IDs are the entries read from that file.
import os, sys, glob
import utilities

idFilePattern = sys.argv[1]
outFilePath = sys.argv[2]

# `with` guarantees the output is flushed and closed even when reading one
# of the input files raises part-way through the loop.
with open(outFilePath, 'w') as outFile:
    # Files are processed in the order glob.glob returns them.
    for idFilePath in glob.glob(idFilePattern):
        name = os.path.basename(idFilePath).replace(".gene.ids", "")
        name = name.replace(".txt", "")

        ids = utilities.readVectorFromFile(idFilePath)

        # The name is written twice on purpose: it fills the first two
        # columns of every line, exactly as the original script did.
        outFile.write("%s\t%s\t%s\n" % (name, name, "\t".join(ids)))
# Script fragment, collapsed onto a single line and cut off after
# outFilePath is computed (the code that writes `output` is not visible).
#
# Reads four positional arguments from sys.argv: matrix file path, samples
# file path, output directory (passed through utilities.checkDirPath) and an
# output file pattern.
#
# The first row of the matrix is popped off and treated as the patient IDs.
# Sample IDs are read from the samples file with every "." replaced by "-".
# When the header row is as long as the first data row, its first cell is
# dropped (per the inline comment, a description in the first column).
# variableNames is the first cell of every remaining row.
#
# For each sample ID that appears in the header, the matching column is
# collected from every row (the `+ 1` skips the variable-name column) and
# rendered as one "<variable>TAB<value>" line per variable. The output path
# is the plain concatenation outDirPath + sample ID + outDirFilePattern.
#
# NOTE(review): the path is built without a separator, so this presumably
# relies on utilities.checkDirPath returning a path that ends with one --
# verify.
# NOTE(review): dataPatientIDs.index(samplePatientID) is re-evaluated for
# every row inside the comprehension although it is constant per sample.
# NOTE(review): patientValuesDict is assigned but not used in the visible
# part of the script.
# NOTE(review): in this collapsed form the inline `#check ...` comment
# swallows the rest of the line; presumably the original had a line break
# right after "first column" -- verify against the original file.
import os, sys, glob import utilities matrixFilePath = sys.argv[1] samplesFilePath = sys.argv[2] outDirPath = utilities.checkDirPath(sys.argv[3]) outDirFilePattern = sys.argv[4] data = utilities.readMatrixFromFile(matrixFilePath) dataPatientIDs = data.pop(0) samplePatientIDs = [x.replace(".", "-") for x in utilities.readVectorFromFile(samplesFilePath)] if len(dataPatientIDs) == len(data[0]): #check for description in first column dataPatientIDs.pop(0) variableNames = [x[0] for x in data] patientValuesDict = {} for samplePatientID in samplePatientIDs: if not samplePatientID in dataPatientIDs: continue patientValues = [x[dataPatientIDs.index(samplePatientID) + 1] for x in data] output = "" for i in range(len(variableNames)): output += "\t".join([variableNames[i], patientValues[i]]) + "\n" outFilePath = outDirPath + samplePatientID + outDirFilePattern