Exemple #1
0
import os, sys, glob
import utilities

# Positional command-line arguments.
inDirPath = sys.argv[1]
inFilePattern = sys.argv[2]
variablesFilePath = sys.argv[3]  # the literal string "None" disables the variables file
dataValueIndex = int(sys.argv[4])
outPatientPrefix = sys.argv[5]
outFilePath = sys.argv[6]

patientIDs = utilities.getPatientIDs(inDirPath, inFilePattern)

# The variables list is optional; it stays None unless a path was supplied.
variables = None
if variablesFilePath != "None":
    variables = utilities.readVectorFromFile(variablesFilePath)

patientsKeyValuesDict = utilities.getPatientsKeyValuesDict(
    inDirPath, patientIDs, inFilePattern, dataValueIndex, variables)

outFile = open(outFilePath, 'w')

# Header row: "Key" followed by one prefixed column name per patient.
outFile.write("\t".join(
    ["Key"] + [outPatientPrefix + patientID
               for patientID in patientIDs]) + "\n")

# The keys of the first patient define which rows can be written.
if variables is None:
    keys = sorted(patientsKeyValuesDict[patientIDs[0]].keys())
else:
    # Set intersection: the order of the resulting list is not defined.
    keys = list(
        set(variables) & set(patientsKeyValuesDict[patientIDs[0]].keys()))

for key in keys:
import os, sys, glob
import utilities

# Positional command-line arguments.
matrixFilePath = sys.argv[1]
samplesFilePath = sys.argv[2]
outDirPath = utilities.checkDirPath(sys.argv[3])
outDirFilePattern = sys.argv[4]

data = utilities.readMatrixFromFile(matrixFilePath)

# The first row of the matrix holds the patient IDs; the rest are data rows.
dataPatientIDs = data.pop(0)
# Sample IDs are matched against the header after replacing "." with "-".
samplePatientIDs = [
    x.replace(".", "-") for x in utilities.readVectorFromFile(samplesFilePath)
]

if len(dataPatientIDs) == len(data[0]):  #check for description in first column
    dataPatientIDs.pop(0)

# The first cell of every data row is the variable name.
variableNames = [x[0] for x in data]

patientValuesDict = {}

for samplePatientID in samplePatientIDs:
    if samplePatientID not in dataPatientIDs:
        continue

    # Look the column up once per patient instead of once per data row;
    # the +1 skips the variable-name column at the start of each row.
    patientColumn = dataPatientIDs.index(samplePatientID) + 1
    patientValues = [x[patientColumn] for x in data]

    output = ""
Exemple #3
0
import os, sys, glob
import utilities

# Positional command-line arguments.
inDirPath = sys.argv[1]
inFilePattern = sys.argv[2]
variablesFilePath = sys.argv[3]  # the literal string "None" disables the variables file
dataValueIndex = int(sys.argv[4])
outPatientPrefix = sys.argv[5]
outFilePath = sys.argv[6]

patientIDs = utilities.getPatientIDs(inDirPath, inFilePattern)

# The variables list is optional; it stays None unless a path was supplied.
variables = None
if variablesFilePath != "None":
    variables = utilities.readVectorFromFile(variablesFilePath)

patientsKeyValuesDict = utilities.getPatientsKeyValuesDict(inDirPath, patientIDs, inFilePattern, dataValueIndex, variables)

# The context manager closes the output file even if a lookup below raises.
with open(outFilePath, 'w') as outFile:
    # Header row: "Key" followed by one prefixed column name per patient.
    outFile.write("\t".join(["Key"] + [outPatientPrefix + patientID for patientID in patientIDs]) + "\n")

    # The keys of the first patient define which rows can be written.
    if variables is None:
        keys = sorted(patientsKeyValuesDict[patientIDs[0]].keys())
    else:
        # Set intersection: the order of the resulting list is not defined.
        keys = list(set(variables) & set(patientsKeyValuesDict[patientIDs[0]].keys()))

    # One row per key: the key, then that key's value for every patient.
    for key in keys:
        outFile.write("\t".join([key] + [patientsKeyValuesDict[patientID][key] for patientID in patientIDs]) + "\n")
cel = MyCEL()
norm = Normalize()

if os.path.exists(outFilePath):
    print "Already processed %s" % outFilePath
else:
    print "Reading annotations"
    probe2seq, coord2probe = getMetadata(probeSequenceTabFilePath, probeSequenceTabColIndices)

    print "Reading " + celFilePath
    probeIntensity = cel.read_cel(celFilePath, coord2probe)

    modelProbes = None
    if os.path.exists(modelProbesFilePath):
        print "Reading model probes file"
        modelProbes = utilities.readVectorFromFile(modelProbesFilePath)
    else:
        if modelProbesFilePath != "None":
            print "No model probes file exists at " + modelProbesFilePath

    print "Normalizing to " + outFilePath
    normValues = norm.normalize(probeIntensity, probe2seq, modelProbes)

    probes = sorted([str(x) for x in probe2seq.keys()])

    exprValues = [normValues[probe][0] for probe in probes]
    probValues = [normValues[probe][1] for probe in probes]

    outFile = file(outFilePath, 'w')
    for i in range(len(probes)):
        outFile.write("%s\t%.9f\t%.9f\n" % (probes[i], exprValues[i], probValues[i]))
Exemple #5
0
    return utilities.calculateTrimmedMean(values)


patientID = sys.argv[1]
inFilePath = sys.argv[2]
dataColumnIndex = int(sys.argv[3])
keyProbeFilePath = sys.argv[4]
probeFilePath = sys.argv[5]
minNumProbesPer = int(sys.argv[6])
summarizeFunction = getattr(sys.modules[__name__], sys.argv[7])
outlierSamplesFilePath = sys.argv[8]
outFilePath = sys.argv[9]

outlierSamples = []
if os.path.exists(outlierSamplesFilePath):
    outlierSamples = utilities.readVectorFromFile(outlierSamplesFilePath)

if patientID in outlierSamples:
    print "%s is listed as an outlier, so it won't be summarized" % patientID
    sys.exit(0)

keepProbes = set(utilities.readVectorFromFile(probeFilePath))

print "Getting key/probe dict from %s" % keyProbeFilePath
keyProbeDict = utilities.getKeyProbeDict(keyProbeFilePath)

print "Identifying probes to keep"
for key in keyProbeDict.keys():
    keyProbeDict[key] = list(set(keyProbeDict[key]) & keepProbes)

print "Removing keys with few probes"
Exemple #6
0
import os, sys, glob
import utilities

# Positional command-line arguments.
symbolsFilePath = sys.argv[1]
entrezGenesSymbolsFilePath = sys.argv[2]
entrezGenesSynonymsFilePath = sys.argv[3]
outFilePath = sys.argv[4]

symbols = utilities.readVectorFromFile(symbolsFilePath)

# Symbol lookup: second column is the key, first column is the stored ID.
entrezGenesSymbolsData = utilities.readMatrixFromFile(
    entrezGenesSymbolsFilePath)
entrezGenesSymbolsDict = {}
for row in entrezGenesSymbolsData:
    entrezGenesSymbolsDict[row[1]] = row[0]

# Synonym lookup: the second column is a "|"-separated list of synonyms, and
# each synonym collects the first-column ID of every row that mentions it.
entrezGenesSynonymsData = utilities.readMatrixFromFile(
    entrezGenesSynonymsFilePath)
entrezGenesSynonymsDict = {}
for row in entrezGenesSynonymsData:
    for synonym in row[1].split("|"):
        # Append in place rather than rebuilding the list on every hit.
        entrezGenesSynonymsDict.setdefault(synonym, []).append(row[0])

outFile = open(outFilePath, 'w')
for symbol in symbols:
    # Prefer a direct symbol match; fall back to the synonym table.
    if symbol in entrezGenesSymbolsDict:
        entrezID = entrezGenesSymbolsDict[symbol]
    else:
        if symbol in entrezGenesSynonymsDict:
            entrezIDs = entrezGenesSynonymsDict[symbol]
import os, sys, glob
import utilities

# Positional command-line arguments.
symbolsFilePath = sys.argv[1]
entrezGenesSymbolsFilePath = sys.argv[2]
entrezGenesSynonymsFilePath = sys.argv[3]
outFilePath = sys.argv[4]

symbols = utilities.readVectorFromFile(symbolsFilePath)

# Symbol lookup: second column is the key, first column is the stored ID.
entrezGenesSymbolsData = utilities.readMatrixFromFile(entrezGenesSymbolsFilePath)
entrezGenesSymbolsDict = {}
for row in entrezGenesSymbolsData:
    entrezGenesSymbolsDict[row[1]] = row[0]

# Synonym lookup: the second column is a "|"-separated list of synonyms, and
# each synonym collects the first-column ID of every row that mentions it.
entrezGenesSynonymsData = utilities.readMatrixFromFile(entrezGenesSynonymsFilePath)
entrezGenesSynonymsDict = {}
for row in entrezGenesSynonymsData:
    for synonym in row[1].split("|"):
        # Append in place rather than rebuilding the list on every hit.
        entrezGenesSynonymsDict.setdefault(synonym, []).append(row[0])

outFile = open(outFilePath, 'w')
for symbol in symbols:
    # Prefer a direct symbol match; fall back to the synonym table.
    if symbol in entrezGenesSymbolsDict:
        entrezID = entrezGenesSymbolsDict[symbol]
    else:
        if symbol in entrezGenesSynonymsDict:
            entrezIDs = entrezGenesSynonymsDict[symbol]

            # A synonym that maps to exactly one ID is unambiguous.
            if len(entrezIDs) == 1:
                entrezID = entrezIDs[0]
Exemple #8
0
norm = Normalize()

if os.path.exists(outFilePath):
    print "Already processed %s" % outFilePath
else:
    print "Reading annotations"
    probe2seq, coord2probe = getMetadata(probeSequenceTabFilePath,
                                         probeSequenceTabColIndices)

    print "Reading " + celFilePath
    probeIntensity = cel.read_cel(celFilePath, coord2probe)

    modelProbes = None
    if os.path.exists(modelProbesFilePath):
        print "Reading model probes file"
        modelProbes = utilities.readVectorFromFile(modelProbesFilePath)
    else:
        if modelProbesFilePath != "None":
            print "No model probes file exists at " + modelProbesFilePath

    print "Normalizing to " + outFilePath
    normValues = norm.normalize(probeIntensity, probe2seq, modelProbes)

    probes = sorted([str(x) for x in probe2seq.keys()])

    exprValues = [normValues[probe][0] for probe in probes]
    probValues = [normValues[probe][1] for probe in probes]

    outFile = file(outFilePath, 'w')
    for i in range(len(probes)):
        outFile.write("%s\t%.9f\t%.9f\n" %
Exemple #9
0
import os, sys, glob
import utilities

# Positional command-line arguments: a glob pattern selecting the ID files,
# and the path of the combined output file.
idFilePattern = sys.argv[1]
outFilePath = sys.argv[2]

# The context manager closes the output file even if reading an ID file fails.
with open(outFilePath, 'w') as outFile:
    for idFilePath in glob.glob(idFilePattern):
        # The row name is the file's base name with ".gene.ids" and ".txt" removed.
        name = os.path.basename(idFilePath).replace(".gene.ids", "")
        name = name.replace(".txt", "")
        ids = utilities.readVectorFromFile(idFilePath)

        # One tab-separated row per file: the name twice, then every ID.
        outFile.write("%s\t%s\t%s\n" % (name, name, "\t".join(ids)))
Exemple #10
0
    return utilities.calculateTrimmedMean(values)


patientID = sys.argv[1]
inFilePath = sys.argv[2]
dataColumnIndex = int(sys.argv[3])
keyProbeFilePath = sys.argv[4]
probeFilePath = sys.argv[5]
minNumProbesPer = int(sys.argv[6])
summarizeFunction = getattr(sys.modules[__name__], sys.argv[7])
outlierSamplesFilePath = sys.argv[8]
outFilePath = sys.argv[9]

outlierSamples = []
if os.path.exists(outlierSamplesFilePath):
    outlierSamples = utilities.readVectorFromFile(outlierSamplesFilePath)

if patientID in outlierSamples:
    print "%s is listed as an outlier, so it won't be summarized" % patientID
    sys.exit(0)

keepProbes = set(utilities.readVectorFromFile(probeFilePath))

print "Getting key/probe dict from %s" % keyProbeFilePath
keyProbeDict = utilities.getKeyProbeDict(keyProbeFilePath)

print "Identifying probes to keep"
for key in keyProbeDict.keys():
    keyProbeDict[key] = list(set(keyProbeDict[key]) & keepProbes)

print "Removing keys with few probes"
import os, sys, glob
import utilities

# Positional command-line arguments.
matrixFilePath = sys.argv[1]
samplesFilePath = sys.argv[2]
outDirPath = utilities.checkDirPath(sys.argv[3])
outDirFilePattern = sys.argv[4]

data = utilities.readMatrixFromFile(matrixFilePath)

# The first row of the matrix holds the patient IDs; the rest are data rows.
dataPatientIDs = data.pop(0)
# Sample IDs are matched against the header after replacing "." with "-".
samplePatientIDs = [x.replace(".", "-") for x in utilities.readVectorFromFile(samplesFilePath)]

if len(dataPatientIDs) == len(data[0]): #check for description in first column
    dataPatientIDs.pop(0)

# The first cell of every data row is the variable name.
variableNames = [x[0] for x in data]

patientValuesDict = {}

for samplePatientID in samplePatientIDs:
    if samplePatientID not in dataPatientIDs:
        continue

    # Look the column up once per patient instead of once per data row;
    # the +1 skips the variable-name column at the start of each row.
    patientColumn = dataPatientIDs.index(samplePatientID) + 1
    patientValues = [x[patientColumn] for x in data]

    # One "name<TAB>value" line per variable, joined in a single pass.
    output = "".join(
        "\t".join([name, value]) + "\n"
        for name, value in zip(variableNames, patientValues))

    outFilePath = outDirPath + samplePatientID + outDirFilePattern