import os, sys, glob, math
from operator import itemgetter, attrgetter


def sortRowsByExpression(rows, expression, reverse=False, namespace=None):
    """Return a sorted copy of ``rows`` keyed on a per-row Python expression.

    For every row the string ``"x + [" + expression + "]"`` is evaluated with
    ``x`` bound to that row, so the value of the expression is appended as a
    temporary trailing column.  The rows are ordered on that column and the
    column is stripped again before returning.

    Args:
        rows: List of rows; each row must be a list (it is concatenated
            with a list).
        expression: Python source for the sort key; it may refer to the
            current row as ``x``.
        reverse: Sort in descending order when true.
        namespace: Names visible to the expression.  Defaults to this
            module's globals.

    Returns:
        A new list holding the rows in sorted order; ``[]`` when ``rows`` is
        empty.  The input rows themselves are not modified.

    SECURITY: ``expression`` is handed to ``eval`` and can therefore run
    arbitrary code.  Only pass expressions that come from a trusted source.
    """
    scope = dict(globals() if namespace is None else namespace)
    code = "x + [" + expression + "]"

    keyedRows = []
    for row in rows:
        # Bind the row in the globals given to eval so that nested scopes
        # inside the expression (e.g. comprehensions) can see it as well.
        scope["x"] = row
        keyedRows.append(eval(code, scope))

    # Nothing to sort; also avoids indexing the first row of an empty list.
    if not keyedRows:
        return []

    # The key column index is taken from the first row, as before.
    keyIndex = len(keyedRows[0]) - 1
    keyedRows.sort(key=itemgetter(keyIndex), reverse=reverse)
    return [row[:-1] for row in keyedRows]


def main():
    """Sort the data rows of a matrix file and write the result.

    Command-line arguments, in order: input path, sort expression, the
    literal ``reverse`` for descending order (anything else sorts ascending),
    number of leading header rows to leave in place, output path.
    """
    # Project-local module; imported here so that this file can be imported
    # (e.g. to reuse sortRowsByExpression) without it being available.
    import utilities

    inFilePath = sys.argv[1]
    expression = sys.argv[2]
    reverse = sys.argv[3] == "reverse"
    numHeaderRows = int(sys.argv[4])
    outFilePath = sys.argv[5]

    data = utilities.readMatrixFromFile(inFilePath)

    # Header rows are set aside so they are neither evaluated nor sorted.
    headerRows = []
    for _ in range(numHeaderRows):
        headerRows.append(data.pop(0))

    # Keep `utilities` reachable from the expression, as it was when the
    # script body lived at module level.
    namespace = dict(globals(), utilities=utilities)
    data = sortRowsByExpression(data, expression, reverse, namespace)

    utilities.writeMatrixToFile(headerRows + data, outFilePath)


if __name__ == "__main__":
    main()
import os, sys, glob, shutil
import utilities

# Copies CCLE .CEL files into the "selected" directory, renaming each copy
# from fields of the decoder file.
decoderFilePath = sys.argv[1]
ccleRawAllDirPath = sys.argv[2]
gskRawAllDirPath = sys.argv[3]
ccleRawSelectedDirPath = sys.argv[4]
gskRawSelectedDirPath = sys.argv[5]

decoderData = utilities.readMatrixFromFile(decoderFilePath)
# The first row of the decoder file is discarded.
del decoderData[0]

# Both dictionaries are keyed on the CCLE file name (column 4) and collect,
# in decoder order, the GSK file names (column 1) and the output names.
decoderFileDict = {}
decoderNameDict = {}
for row in decoderData:
    gskFileName = row[1]
    ccleFileName = row[4]
    # Output name: columns 0, 8, 9 and 10 joined by "_", spaces removed.
    name = "_".join([row[0], row[8], row[9], row[10]]).replace(" ", "")
    decoderFileDict.setdefault(ccleFileName, []).append(gskFileName)
    decoderNameDict.setdefault(ccleFileName, []).append(name)

for ccleFileName in decoderFileDict:
    # Only the first name / GSK file recorded for a CCLE file is used.
    name = decoderNameDict[ccleFileName][0]
    print(name)
    sourcePath = ccleRawAllDirPath + "/" + ccleFileName + ".CEL"
    targetPath = ccleRawSelectedDirPath + "/" + name + ".CEL"
    shutil.copy(sourcePath, targetPath)
    gskFileName = decoderFileDict[ccleFileName][0]
# Reformats the values found in one column of a matrix file.
#
# Arguments: input path, number of header rows to skip, index of the column,
# number of places (kept as a string because it is spliced into a %-format),
# output path and, optionally, the format type character (default "g").
#
# isNumber(x) reports whether float(x) succeeds.  Numeric cells are rendered
# with the format "%.<numPlaces><numberFormatOption>"; other cells are taken
# over unchanged.
#
# NOTE(review): the visible code stops at an `if` statement that has no body,
# so what happens to long non-numeric values cannot be told from here.
import os, sys, glob import utilities inFilePath = sys.argv[1] numHeaderRows = int(sys.argv[2]) colIndex = int(sys.argv[3]) numPlaces = sys.argv[4] outFilePath = sys.argv[5] numberFormatOption = "g" if len(sys.argv) > 6: numberFormatOption = sys.argv[6] data = utilities.readMatrixFromFile(inFilePath) def isNumber(x): try: float(x) return True except: return False for i in range(numHeaderRows, len(data)): if isNumber(data[i][colIndex]): modValue = ("%." + numPlaces + numberFormatOption) % float( data[i][colIndex]) else: modValue = data[i][colIndex] if len(data[i][colIndex]) > int(numPlaces):
import os, sys, glob
import utilities

# Looks up an Entrez gene ID for each symbol: the symbols table is consulted
# first and the synonyms table is the fallback.
symbolsFilePath = sys.argv[1]
entrezGenesSymbolsFilePath = sys.argv[2]
entrezGenesSynonymsFilePath = sys.argv[3]
outFilePath = sys.argv[4]

symbols = utilities.readVectorFromFile(symbolsFilePath)

# symbol (column 1) -> Entrez ID (column 0); a later row replaces an earlier
# row that has the same symbol.
entrezGenesSymbolsData = utilities.readMatrixFromFile(
    entrezGenesSymbolsFilePath)
entrezGenesSymbolsDict = {}
for row in entrezGenesSymbolsData:
    entrezGenesSymbolsDict[row[1]] = row[0]

# synonym -> list of Entrez IDs; column 1 holds "|"-separated synonyms and a
# synonym may be shared by several genes.
entrezGenesSynonymsData = utilities.readMatrixFromFile(
    entrezGenesSynonymsFilePath)
entrezGenesSynonymsDict = {}
for row in entrezGenesSynonymsData:
    entrezIDForRow = row[0]
    for synonym in row[1].split("|"):
        entrezGenesSynonymsDict.setdefault(synonym, []).append(entrezIDForRow)

outFile = open(outFilePath, 'w')

for symbol in symbols:
    if symbol in entrezGenesSymbolsDict:
        entrezID = entrezGenesSymbolsDict[symbol]
    else:
        if symbol in entrezGenesSynonymsDict:
            entrezIDs = entrezGenesSynonymsDict[symbol]
import os, sys, glob
import utilities

# Reads two matrix files and groups the rows of each by the value in their
# first column.
inFilePath1 = sys.argv[1]
inFilePath2 = sys.argv[2]
outFilePath = sys.argv[3]

# Headers are expected unless a fourth argument other than "True" is given.
hasHeader = len(sys.argv) <= 4 or sys.argv[4] == "True"

data1 = utilities.readMatrixFromFile(inFilePath1)
data2 = utilities.readMatrixFromFile(inFilePath2)

if hasHeader:
    header1 = data1.pop(0)
    header2 = data2.pop(0)
    # When the second header is as wide as its data rows, its first entry
    # labels the key column; drop it so the key is not repeated.
    if len(header2) == len(data2[0]):
        header2.pop(0)
    headerCombined = header1 + header2

# key -> every full row of the first file that starts with that key
data1Dict = {}
for row in data1:
    data1Dict.setdefault(row[0], []).append(row)

# key -> every row of the second file with that key, minus the key column
data2Dict = {}
for row in data2:
    data2Dict.setdefault(row[0], []).append(row[1:])
import os, sys, glob
import utilities

# Looks up an Entrez gene ID for each symbol.  A direct match in the symbols
# table wins; otherwise a synonym is accepted when it maps to exactly one ID.
symbolsFilePath = sys.argv[1]
entrezGenesSymbolsFilePath = sys.argv[2]
entrezGenesSynonymsFilePath = sys.argv[3]
outFilePath = sys.argv[4]

symbols = utilities.readVectorFromFile(symbolsFilePath)

# symbol (column 1) -> Entrez ID (column 0); the last row seen for a symbol
# is the one that is kept.
entrezGenesSymbolsData = utilities.readMatrixFromFile(entrezGenesSymbolsFilePath)
entrezGenesSymbolsDict = {}
for row in entrezGenesSymbolsData:
    entrezGenesSymbolsDict[row[1]] = row[0]

# synonym -> Entrez IDs of every row listing it ("|"-separated in column 1)
entrezGenesSynonymsData = utilities.readMatrixFromFile(entrezGenesSynonymsFilePath)
entrezGenesSynonymsDict = {}
for row in entrezGenesSynonymsData:
    for synonym in row[1].split("|"):
        idsForSynonym = entrezGenesSynonymsDict.setdefault(synonym, [])
        idsForSynonym.append(row[0])

outFile = open(outFilePath, 'w')

for symbol in symbols:
    if symbol in entrezGenesSymbolsDict:
        entrezID = entrezGenesSymbolsDict[symbol]
    else:
        if symbol in entrezGenesSynonymsDict:
            entrezIDs = entrezGenesSynonymsDict[symbol]
            if len(entrezIDs) == 1:
                entrezID = entrezIDs[0]
import os, sys, glob
import utilities

pathwaysFilePath = sys.argv[1]
genePathwayFilePath = sys.argv[2]
geneEntrezFilePath = sys.argv[3]
outFilePathTemplate = sys.argv[4]  # KEGG_{PATHWAY_NAME}.gene.ids.txt

# Substitutions applied, strictly in this order, to an upper-cased pathway
# description.  The multi-character patterns come first so that e.g. " - "
# collapses to a single "_" rather than three.
NAME_SUBSTITUTIONS = [
    (" - ", "_"),
    (", ", "_"),
    (" / ", "_"),
    ("-", "_"),
    (" ", "_"),
    ("/", "_"),
    ("(", ""),
    (")", ""),
    (",", "_"),
    ("'", ""),
]

# Rows whose first field starts with "#" are ignored.
pathwaysData = utilities.readMatrixFromFile(pathwaysFilePath)
pathwaysData = [row for row in pathwaysData if not row[0].startswith("#")]

# pathway ID (column 0) -> cleaned-up pathway name derived from column 1
pathwaysDict = {}
for row in pathwaysData:
    pathwayName = row[1].upper()
    for old, new in NAME_SUBSTITUTIONS:
        pathwayName = pathwayName.replace(old, new)
    pathwaysDict[row[0]] = pathwayName

# pathway ID -> genes; column 1 lists a gene's pathways separated by spaces
genePathwayData = utilities.readMatrixFromFile(genePathwayFilePath)
genePathwayDict = {}
for row in genePathwayData:
    gene = row[0]
    for pathway in row[1].split(" "):
        genePathwayDict.setdefault(pathway, []).append(gene)

# gene ID with "hsa:" removed -> Entrez ID with "ncbi-geneid:" removed
geneEntrezData = utilities.readMatrixFromFile(geneEntrezFilePath)
geneEntrezDict = {}
for row in geneEntrezData:
    keggGeneID = row[0].replace("hsa:", "")
    geneEntrezDict[keggGeneID] = row[1].replace("ncbi-geneid:", "")

for keggPathwayID in pathwaysDict.keys():
    # Pathways without any recorded gene are skipped.
    if keggPathwayID not in genePathwayDict:
        continue
    keggGeneIDs = genePathwayDict[keggPathwayID]
# Starts an ARFF-style output file ("@relation" / "@attribute" lines) from a
# feature matrix and a tab-separated patient/class file.
inFilePath = sys.argv[1]
classFilePath = sys.argv[2]
ignorePatientID = sys.argv[3]
includeIDs = sys.argv[4] == "True"
outFilePath = sys.argv[5]

# patient ID (first field) -> class (second field)
patientClassDict = {}
# The class file is opened in a `with` block so that its handle is closed as
# soon as it has been read.
with open(classFilePath) as classFile:
    for line in classFile:
        if line.startswith("#"):
            continue
        # A blank line has no second field; skip it instead of failing.
        if not line.strip():
            continue
        lineItems = line.rstrip().split("\t")
        patientClassDict[lineItems[0]] = lineItems[1]

if os.path.exists(inFilePath):
    # After transposing, the first row is taken as the feature names; its
    # leading entry is dropped.
    data = utilities.transposeMatrix(utilities.readMatrixFromFile(inFilePath))
    features = data.pop(0)
    features.pop(0)
else:
    # Without an input matrix each data row holds just a patient ID.
    data = [[patientID] for patientID in patientClassDict.keys()]
    features = []

outFile = open(outFilePath, "w")
outFile.write("@relation data\n\n")

if includeIDs:
    outFile.write("@attribute id string\n")

for feature in features:
    # Single quotes in a feature name are spelled out as "prime".
    outFile.write("@attribute %s numeric\n" % feature.replace("'", "prime"))
import os, sys, glob, math
import utilities
from TransformFunctions import *

# See http://fluxcapacitor.wikidot.com/forum/t-333476
# Computes, for every mapped-reads row that has a metadata entry,
#   rpkm = reads * 10^9 / (length * total reads)
# formatted with nine decimal places.
mappedReadsFilePath = sys.argv[1]
metaFilePath = sys.argv[2]
outFilePath = sys.argv[3]

mappedReads = utilities.readMatrixFromFile(mappedReadsFilePath)

# Total of the read counts (column 1) over all rows, including rows that are
# later skipped for lack of metadata.
READ_NR = math.fsum(float(row[1]) for row in mappedReads)

# ID (column 0) -> length (column 4) from the metadata file
metaDict = dict(
    (metaRow[0], metaRow[4]) for metaRow in utilities.readMatrixFromFile(metaFilePath)
)

outData = []
for mappedRead in mappedReads:
    id = mappedRead[0]
    if id in metaDict:
        reads = float(mappedRead[1])
        length = float(metaDict[id])
        rpkm = (reads * 1000000000) / (length * READ_NR)
        outData.append((id, "%.9f" % rpkm))
import utilities
from VariantUtilities import *

inMetaFilePath = sys.argv[1]
genomeDirPath = sys.argv[2]
outFilePath = sys.argv[3]


def getChromosomeSequence(filePath):
    """Return the sequence stored in a FASTA-style file as one string.

    Lines starting with ">" are skipped.  Every other line has its trailing
    whitespace removed, is upper-cased, and is concatenated in file order.

    Args:
        filePath: Path of the file to read.

    Returns:
        The concatenated, upper-cased sequence ("" if no sequence lines).
    """
    # The file is closed on exit from the `with` block, and the pieces are
    # joined once instead of growing a string line by line.
    with open(filePath) as faFile:
        return "".join(
            line.rstrip().upper() for line in faFile if not line.startswith(">")
        )


outData = []
inMeta = utilities.readMatrixFromFile(inMetaFilePath)

# Distinct chromosomes named in column 1 of the metadata, in sorted order.
chromosomes = sorted(set([parseChromosome(x[1]) for x in inMeta]))

for chromosome in chromosomes:
    print("Processing %s" % chromosome)

    # Try "<chromosome>.fa" first, then the same name without "chr".
    faFilePath = genomeDirPath + "/" + chromosome + ".fa"
    if not os.path.exists(faFilePath):
        faFilePath = genomeDirPath + "/" + chromosome.replace("chr", "") + ".fa"

    if not os.path.exists(faFilePath):
        print("Ignoring %s because no chromosome file exists" % chromosome)
        continue

    # NOTE(review): `chromosomes` holds parseChromosome(x[1]) values, yet this
    # filter compares the raw x[1] with parseChromosome(chromosome).  Confirm
    # that this is the intended comparison.
    inMetaChromosome = [x for x in inMeta if x[1] == parseChromosome(chromosome)]
# Starts an ARFF-style output file ("@relation" / "@attribute" lines) from a
# feature matrix and a tab-separated patient/class file.
inFilePath = sys.argv[1]
classFilePath = sys.argv[2]
ignorePatientID = sys.argv[3]
includeIDs = sys.argv[4] == "True"
outFilePath = sys.argv[5]

# patient ID (first field) -> class (second field)
patientClassDict = {}
# Reading inside a `with` block guarantees the class file handle is closed
# once the loop is done.
with open(classFilePath) as classFile:
    for line in classFile:
        if line.startswith("#"):
            continue
        # A blank line has no second field; skip it instead of failing.
        if not line.strip():
            continue
        lineItems = line.rstrip().split("\t")
        patientClassDict[lineItems[0]] = lineItems[1]

if os.path.exists(inFilePath):
    # After transposing, the first row is taken as the feature names; its
    # leading entry is dropped.
    data = utilities.transposeMatrix(utilities.readMatrixFromFile(inFilePath))
    features = data.pop(0)
    features.pop(0)
else:
    # Without an input matrix each data row holds just a patient ID.
    data = [[patientID] for patientID in patientClassDict.keys()]
    features = []

outFile = open(outFilePath, 'w')
outFile.write("@relation data\n\n")

if includeIDs:
    outFile.write("@attribute id string\n")

for feature in features:
    # Single quotes in a feature name are spelled out as "prime".
    outFile.write("@attribute %s numeric\n" % feature.replace("'", "prime"))
import os, sys, glob
import utilities


def pearson(x, y):
    """Delegate to utilities.calculatePearsonCoefficient for x and y."""
    return utilities.calculatePearsonCoefficient(x, y)


def spearman(x, y):
    """Delegate to utilities.calculateSpearmanCoefficient for x and y."""
    return utilities.calculateSpearmanCoefficient(x, y)


matrix1FilePath = sys.argv[1]
matrix2FilePath = sys.argv[2]
byRow = sys.argv[3] == "True"
# The fourth argument names a module-level callable (such as pearson or
# spearman) that is looked up on this module by name.
compareFunction = getattr(sys.modules[__name__], sys.argv[4])
outFilePath = sys.argv[5]

matrix1 = utilities.readMatrixFromFile(matrix1FilePath)
matrix2 = utilities.readMatrixFromFile(matrix2FilePath)

# The first row of each matrix is removed; only the first matrix's is kept.
colNames = matrix1.pop(0)
matrix2.pop(0)

if byRow:
    print("byRow Not yet implemented")
    exit()

# Column-wise comparison; column 0 is skipped.
results = []
for colIndex in range(1, len(matrix1[0])):
    values1 = [float(row[colIndex]) for row in matrix1]
    values2 = [float(row[colIndex]) for row in matrix2]
# The seventh argument names a module-level callable looked up by name.
summarizeFunction = getattr(sys.modules[__name__], sys.argv[7])
outlierSamplesFilePath = sys.argv[8]
outFilePath = sys.argv[9]

# Samples listed in the outlier file (if it exists) are not summarized.
outlierSamples = []
if os.path.exists(outlierSamplesFilePath):
    outlierSamples = utilities.readVectorFromFile(outlierSamplesFilePath)

if patientID in outlierSamples:
    print("%s is listed as an outlier, so it won't be summarized" % patientID)
    sys.exit(0)

print("Get data probes")
# First tab-separated field of every input line.  The `with` block closes the
# input file once the set has been built.
with open(inFilePath) as inFile:
    dataProbes = set(line.rstrip().split("\t")[0] for line in inFile)

# Keep only key rows that share at least one probe (column 1,
# comma-separated) with the data.
keyProbes = utilities.readMatrixFromFile(keyProbeFilePath)
keyProbes = [x for x in keyProbes if set(x[1].split(",")) & dataProbes]

# key (column 0) -> all probes listed for it, across rows
keyProbeDict = {}
for keyProbesRow in keyProbes:
    keyProbeDict.setdefault(keyProbesRow[0], []).extend(keyProbesRow[1].split(","))

if os.path.exists(probeFilePath):
    print("Keeping only specified probes")
    keepProbes = set(utilities.readVectorFromFile(probeFilePath))
    # Iterate over a snapshot of the keys because values are replaced.
    for key in list(keyProbeDict):
        keyProbeDict[key] = list(set(keyProbeDict[key]) & keepProbes)
else:
    keepProbes = list(dataProbes)
import os, sys, glob, shutil
import utilities

## The input file should contain samples as columns and variables as rows
inFilePath = sys.argv[1]
classesFilePath = sys.argv[2]
outGctFilePath = sys.argv[3]
outClsFilePath = sys.argv[4]

data = utilities.readMatrixFromFile(inFilePath)

samples = data.pop(0)
## A header as wide as the data rows starts with a label for the variable
## column; that label is not a sample, so it is removed
if len(samples) == len(data[0]):
    samples.pop(0)

## sample -> class, limited to samples that occur in the input header
classesDict = {}
for row in utilities.readMatrixFromFile(classesFilePath):
    sampleName = row[0]
    if sampleName in samples:
        classesDict[sampleName] = row[1]

uniqueClasses = sorted(set(classesDict.values()))
variables = [x[0] for x in data]

outGctFile = open(outGctFilePath, 'w')
outGctFile.write("#1.2\n")
outGctFile.write("%i\t%i\n" % (len(variables), len(classesDict)))
outGctFile.write("NAME\tDescription\t")

## Only samples that have a class are listed, in header order
classifiedSamples = [sample for sample in samples if sample in classesDict]
outGctFile.write("\t".join(classifiedSamples) + "\n")

for row in data:
    ## The variable name is written as both NAME and Description
    outGctFile.write("%s\t%s\t" % (row[0], row[0]))