Ejemplo n.º 1
0
import os, sys, glob, math
import utilities
from operator import itemgetter, attrgetter

# Command-line arguments:
#   1: input matrix file
#   2: Python expression evaluated for every data row (the row is bound to `x`)
#   3: the literal string "reverse" to sort in descending order
#   4: number of leading header rows to leave unsorted
#   5: output file
inFilePath = sys.argv[1]
expression = sys.argv[2]
reverse = sys.argv[3] == "reverse"
numHeaderRows = int(sys.argv[4])
outFilePath = sys.argv[5]

data = utilities.readMatrixFromFile(inFilePath)

# Peel the header rows off the front so they are not sorted with the data.
headerRows = []
for i in range(numHeaderRows):
    headerRows.append(data.pop(0))

# SECURITY NOTE: `expression` comes straight from the command line and is
# passed to eval(), so it can execute arbitrary code. Use trusted input only.
# Each row gets the evaluated value appended as a temporary sort key; list()
# guarantees a sortable list even where map() returns an iterator.
data = list(map(lambda x: eval("x + [" + expression + "]"), data))
# Sort on the last element of each row (the key just appended). Indexing with
# -1 works per row and does not touch data[0], so an empty table is fine.
data.sort(key=itemgetter(-1), reverse=reverse)
# Drop the temporary key again.
data = [x[:-1] for x in data]

utilities.writeMatrixToFile(headerRows + data, outFilePath)
import os, sys, glob, shutil
import utilities

# Command-line arguments: the decoder table followed by four directory paths.
decoderFilePath = sys.argv[1]
ccleRawAllDirPath = sys.argv[2]
gskRawAllDirPath = sys.argv[3]
ccleRawSelectedDirPath = sys.argv[4]
gskRawSelectedDirPath = sys.argv[5]

decoderData = utilities.readMatrixFromFile(decoderFilePath)
decoderData.pop(0)  # discard the first row (presumably a header -- confirm)

# Both dictionaries are keyed by the CCLE file name (column 4).
decoderFileDict = {}  # CCLE file name -> list of GSK file names (column 1)
decoderNameDict = {}  # CCLE file name -> list of composed sample names

for row in decoderData:
    gskFileName = row[1]
    ccleFileName = row[4]
    # Sample name is built from columns 0, 8, 9 and 10 with spaces removed.
    name = row[0] + "_" + row[8] + "_" + row[9] + "_" + row[10]
    name = name.replace(" ", "")

    # Append in place instead of rebuilding the list for every row.
    decoderFileDict.setdefault(ccleFileName, []).append(gskFileName)
    decoderNameDict.setdefault(ccleFileName, []).append(name)

for ccleFileName in decoderFileDict:
    # Only the first name / GSK file recorded for each CCLE file is used.
    name = decoderNameDict[ccleFileName][0]
    print(name)

    shutil.copy(ccleRawAllDirPath + "/" + ccleFileName + ".CEL", ccleRawSelectedDirPath + "/" + name + ".CEL")

    gskFileName = decoderFileDict[ccleFileName][0]
Ejemplo n.º 3
0
import os, sys, glob
import utilities

# Required positional arguments.
inFilePath = sys.argv[1]
numHeaderRows = int(sys.argv[2])
colIndex = int(sys.argv[3])
# Deliberately left as a string: it is spliced into a "%." format spec later.
numPlaces = sys.argv[4]
outFilePath = sys.argv[5]

# Optional sixth argument selects the format conversion character.
numberFormatOption = sys.argv[6] if len(sys.argv) > 6 else "g"

data = utilities.readMatrixFromFile(inFilePath)


def isNumber(x):
    """Return True if ``float(x)`` succeeds, otherwise False.

    Only conversion failures are treated as "not a number"; unrelated
    exceptions such as KeyboardInterrupt are no longer swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


# Walk every non-header row and compute a replacement value (modValue) for
# the cell in column colIndex.
for i in range(numHeaderRows, len(data)):
    if isNumber(data[i][colIndex]):
        # Numeric cell: build a format spec such as "%.3g" from the string
        # numPlaces and the conversion character, then apply it.
        modValue = ("%." + numPlaces + numberFormatOption) % float(
            data[i][colIndex])
    else:
        # Non-numeric cell: start from the text unchanged.
        modValue = data[i][colIndex]
        # NOTE(review): the body of this check is not visible here; presumably
        # text longer than numPlaces characters gets special handling -- confirm.
        if len(data[i][colIndex]) > int(numPlaces):
Ejemplo n.º 4
0
import os, sys, glob
import utilities

# Command-line arguments: symbols list, symbols table, synonyms table, output.
symbolsFilePath = sys.argv[1]
entrezGenesSymbolsFilePath = sys.argv[2]
entrezGenesSynonymsFilePath = sys.argv[3]
outFilePath = sys.argv[4]

symbols = utilities.readVectorFromFile(symbolsFilePath)

# Symbols table: column 1 is the lookup key, column 0 the stored ID.
entrezGenesSymbolsData = utilities.readMatrixFromFile(
    entrezGenesSymbolsFilePath)
entrezGenesSymbolsDict = {}
for row in entrezGenesSymbolsData:
    entrezGenesSymbolsDict[row[1]] = row[0]

# Synonyms table: column 1 holds "|"-separated synonyms; one synonym may be
# shared by several rows, so every synonym maps to a list of column-0 IDs.
entrezGenesSynonymsData = utilities.readMatrixFromFile(
    entrezGenesSynonymsFilePath)
entrezGenesSynonymsDict = {}
for row in entrezGenesSynonymsData:
    for synonym in row[1].split("|"):
        # Append in place rather than copying the list for every synonym.
        entrezGenesSynonymsDict.setdefault(synonym, []).append(row[0])

outFile = open(outFilePath, 'w')
for symbol in symbols:
    # A direct symbol match wins; synonyms are only consulted as a fallback.
    if symbol in entrezGenesSymbolsDict:
        entrezID = entrezGenesSymbolsDict[symbol]
    else:
        if symbol in entrezGenesSynonymsDict:
            entrezIDs = entrezGenesSynonymsDict[symbol]
Ejemplo n.º 5
0
import os, sys, glob
import utilities

# Command-line arguments: two input matrix files and the output file, plus an
# optional fourth argument ("True" means the files start with a header row).
inFilePath1 = sys.argv[1]
inFilePath2 = sys.argv[2]
outFilePath = sys.argv[3]

hasHeader = True
if len(sys.argv) > 4:
    hasHeader = sys.argv[4] == "True"

data1 = utilities.readMatrixFromFile(inFilePath1)
data2 = utilities.readMatrixFromFile(inFilePath2)

if hasHeader:
    header1 = data1.pop(0)
    header2 = data2.pop(0)

    # When the second header is as wide as its data rows it includes a label
    # for the key column; drop that label so it is not repeated in the
    # combined header. With no data rows the width cannot be compared, so the
    # header is left as-is instead of crashing on data2[0].
    if data2 and len(header2) == len(data2[0]):
        header2.pop(0)

    headerCombined = header1 + header2

# Group the rows of each file by the value in column 0. Rows from file 1 are
# kept whole; rows from file 2 are stored without their key column.
data1Dict = {}
for row in data1:
    data1Dict.setdefault(row[0], []).append(row)

data2Dict = {}
for row in data2:
    data2Dict.setdefault(row[0], []).append(row[1:])
import os, sys, glob
import utilities

# Command-line arguments: symbols list, symbols table, synonyms table, output.
symbolsFilePath = sys.argv[1]
entrezGenesSymbolsFilePath = sys.argv[2]
entrezGenesSynonymsFilePath = sys.argv[3]
outFilePath = sys.argv[4]

symbols = utilities.readVectorFromFile(symbolsFilePath)

# Symbols table: column 1 is the lookup key, column 0 the stored ID.
entrezGenesSymbolsData = utilities.readMatrixFromFile(entrezGenesSymbolsFilePath)
entrezGenesSymbolsDict = {}
for row in entrezGenesSymbolsData:
    entrezGenesSymbolsDict[row[1]] = row[0]

# Synonyms table: column 1 holds "|"-separated synonyms; one synonym may be
# shared by several rows, so every synonym maps to a list of column-0 IDs.
entrezGenesSynonymsData = utilities.readMatrixFromFile(entrezGenesSynonymsFilePath)
entrezGenesSynonymsDict = {}
for row in entrezGenesSynonymsData:
    for synonym in row[1].split("|"):
        # Append in place rather than copying the list for every synonym.
        entrezGenesSynonymsDict.setdefault(synonym, []).append(row[0])

outFile = open(outFilePath, 'w')
for symbol in symbols:
    # A direct symbol match wins; synonyms are only consulted as a fallback.
    if symbol in entrezGenesSymbolsDict:
        entrezID = entrezGenesSymbolsDict[symbol]
    else:
        if symbol in entrezGenesSynonymsDict:
            entrezIDs = entrezGenesSynonymsDict[symbol]

            # A synonym is only usable directly when it maps to a single ID.
            if len(entrezIDs) == 1:
                entrezID = entrezIDs[0]
import os, sys, glob
import utilities

pathwaysFilePath = sys.argv[1]
genePathwayFilePath = sys.argv[2]
geneEntrezFilePath = sys.argv[3]
outFilePathTemplate = sys.argv[4] #KEGG_{PATHWAY_NAME}.gene.ids.txt

pathwaysData = utilities.readMatrixFromFile(pathwaysFilePath)
# Rows whose first column starts with "#" are treated as comments.
pathwaysData = [row for row in pathwaysData if not row[0].startswith("#")]

# Substitutions applied, in this exact order, to the upper-cased pathway name.
# Order matters: the multi-character separators must be collapsed before the
# single-character rules run, otherwise they would yield repeated underscores.
pathwayNameReplacements = (
    (" - ", "_"),
    (", ", "_"),
    (" / ", "_"),
    ("-", "_"),
    (" ", "_"),
    ("/", "_"),
    ("(", ""),
    (")", ""),
    (",", "_"),
    ("'", ""),
)

# Pathway ID (column 0) -> normalized pathway name (from column 1).
pathwaysDict = {}
for row in pathwaysData:
    pathwayName = row[1].upper()
    for oldText, newText in pathwayNameReplacements:
        pathwayName = pathwayName.replace(oldText, newText)
    pathwaysDict[row[0]] = pathwayName

# Column 1 holds space-separated pathway IDs for the gene in column 0;
# invert that into pathway ID -> list of genes.
genePathwayData = utilities.readMatrixFromFile(genePathwayFilePath)
genePathwayDict = {}
for row in genePathwayData:
    gene = row[0]
    for pathway in row[1].split(" "):
        # Append in place rather than copying the list for every gene.
        genePathwayDict.setdefault(pathway, []).append(gene)

# Gene ID with the "hsa:" prefix removed -> ID with "ncbi-geneid:" removed.
geneEntrezData = utilities.readMatrixFromFile(geneEntrezFilePath)
geneEntrezDict = {}
for row in geneEntrezData:
    geneEntrezDict[row[0].replace("hsa:", "")] = row[1].replace("ncbi-geneid:", "")

for keggPathwayID in pathwaysDict.keys():
    # Pathways without any gene assignment are skipped.
    if keggPathwayID not in genePathwayDict:
        continue

    keggGeneIDs = genePathwayDict[keggPathwayID]
Ejemplo n.º 8
0
# Command-line arguments.
inFilePath = sys.argv[1]
classFilePath = sys.argv[2]
ignorePatientID = sys.argv[3]
includeIDs = sys.argv[4] == "True"
outFilePath = sys.argv[5]

# Tab-separated class file: column 0 -> column 1. Lines starting with "#" are
# comments. The file is opened with a context manager so the handle is closed
# as soon as it has been read.
patientClassDict = {}
with open(classFilePath) as classFile:
    for line in classFile:
        if line.startswith("#"):
            continue

        # Blank lines carry no ID/class pair; skipping them avoids an
        # IndexError on lineItems[1].
        if not line.strip():
            continue

        lineItems = line.rstrip().split("\t")
        patientClassDict[lineItems[0]] = lineItems[1]

if os.path.exists(inFilePath):
    # The input is transposed after reading; the first transposed row then
    # holds the feature names, minus its leading corner cell.
    data = utilities.transposeMatrix(utilities.readMatrixFromFile(inFilePath))

    features = data.pop(0)
    features.pop(0)
else:
    # No data file: one row per patient holding only the ID, and no features.
    data = [[patientID] for patientID in patientClassDict]
    features = []

outFile = open(outFilePath, "w")
outFile.write("@relation data\n\n")

if includeIDs:
    outFile.write("@attribute id string\n")

for feature in features:
    # Apostrophes in feature names are replaced with the word "prime".
    outFile.write("@attribute %s numeric\n" % feature.replace("'", "prime"))
Ejemplo n.º 9
0
import os, sys, glob, math
import utilities
from TransformFunctions import *

# See http://fluxcapacitor.wikidot.com/forum/t-333476

# Command-line arguments: mapped-reads table, meta table, output file.
mappedReadsFilePath = sys.argv[1]
metaFilePath = sys.argv[2]
outFilePath = sys.argv[3]

mappedReads = utilities.readMatrixFromFile(mappedReadsFilePath)
# Sum of column 1 over every row of the mapped-reads table.
READ_NR = math.fsum(float(readRow[1]) for readRow in mappedReads)

# Meta table lookup: column 0 -> column 4 (used as the length below).
metaDict = dict(
    (metaRow[0], metaRow[4])
    for metaRow in utilities.readMatrixFromFile(metaFilePath)
)

outData = []
for mappedRead in mappedReads:
    id = mappedRead[0]

    # Only rows that have a meta entry produce an output value.
    if id in metaDict:
        reads = float(mappedRead[1])
        length = float(metaDict[id])

        rpkm = (reads * 1000000000) / (length * READ_NR)

        outData.append((id, "%.9f" % rpkm))
Ejemplo n.º 10
0
import utilities
from VariantUtilities import *

# Command-line arguments.
# Matrix file; column 1 of each row is passed to parseChromosome later on.
inMetaFilePath = sys.argv[1]
# Directory that is searched for "<chromosome>.fa" files.
genomeDirPath = sys.argv[2]
# Output path (its use is not visible in this part of the script).
outFilePath = sys.argv[3]

def getChromosomeSequence(filePath):
    """Return the sequence stored in a FASTA-style file as one string.

    Lines starting with ">" are skipped. Every other line has its trailing
    whitespace removed, is upper-cased, and is concatenated in file order.

    The file is opened with ``open`` inside a ``with`` block so the handle is
    always closed, and the pieces are joined once instead of growing a string
    line by line.
    """
    with open(filePath) as sequenceFile:
        return "".join(
            line.rstrip().upper()
            for line in sequenceFile
            if not line.startswith(">")
        )

outData = []
inMeta = utilities.readMatrixFromFile(inMetaFilePath)

# Distinct parsed chromosome values from column 1 of the meta file, sorted.
chromosomes = sorted(set(parseChromosome(x[1]) for x in inMeta))

for chromosome in chromosomes:
    print("Processing %s" % chromosome)
    faFilePath = genomeDirPath + "/" + chromosome + ".fa"

    # Fall back to a file name without the "chr" text if the first is missing.
    if not os.path.exists(faFilePath):
        faFilePath = genomeDirPath + "/" + chromosome.replace("chr", "") + ".fa"

    if not os.path.exists(faFilePath):
        print("Ignoring %s because no chromosome file exists" % chromosome)
        continue

    # NOTE(review): `chromosome` is already a parseChromosome() result, yet it
    # is parsed again here and compared with the raw x[1]. Confirm that this
    # is intended rather than parseChromosome(x[1]) == chromosome.
    inMetaChromosome = [x for x in inMeta if x[1] == parseChromosome(chromosome)]
Ejemplo n.º 11
0
# Command-line arguments.
inFilePath = sys.argv[1]
classFilePath = sys.argv[2]
ignorePatientID = sys.argv[3]
includeIDs = sys.argv[4] == "True"
outFilePath = sys.argv[5]

# Tab-separated class file: column 0 -> column 1. Lines starting with "#" are
# comments. The file is opened with a context manager so the handle is closed
# as soon as it has been read.
patientClassDict = {}
with open(classFilePath) as classFile:
    for line in classFile:
        if line.startswith("#"):
            continue

        # Blank lines carry no ID/class pair; skipping them avoids an
        # IndexError on lineItems[1].
        if not line.strip():
            continue

        lineItems = line.rstrip().split("\t")
        patientClassDict[lineItems[0]] = lineItems[1]

if os.path.exists(inFilePath):
    # The input is transposed after reading; the first transposed row then
    # holds the feature names, minus its leading corner cell.
    data = utilities.transposeMatrix(utilities.readMatrixFromFile(inFilePath))

    features = data.pop(0)
    features.pop(0)
else:
    # No data file: one row per patient holding only the ID, and no features.
    data = [[patientID] for patientID in patientClassDict]
    features = []

outFile = open(outFilePath, 'w')
outFile.write("@relation data\n\n")

if includeIDs:
    outFile.write("@attribute id string\n")

for feature in features:
    # Apostrophes in feature names are replaced with the word "prime".
    outFile.write("@attribute %s numeric\n" % feature.replace("'", "prime"))
Ejemplo n.º 12
0
import os, sys, glob
import utilities

def pearson(x, y):
    """Return ``utilities.calculatePearsonCoefficient(x, y)``.

    Kept as a module-level function so it can be chosen by name at run time.
    """
    coefficient = utilities.calculatePearsonCoefficient(x, y)
    return coefficient

def spearman(x, y):
    """Return ``utilities.calculateSpearmanCoefficient(x, y)``.

    Kept as a module-level function so it can be chosen by name at run time.
    """
    coefficient = utilities.calculateSpearmanCoefficient(x, y)
    return coefficient

# Command-line arguments: two matrix files, a "True"/other flag for row-wise
# comparison, the name of the comparison function, and the output file.
matrix1FilePath = sys.argv[1]
matrix2FilePath = sys.argv[2]
byRow = sys.argv[3] == "True"
# The comparison function is looked up by name among this module's attributes.
compareFunction = getattr(sys.modules[__name__], sys.argv[4])
outFilePath = sys.argv[5]

matrix1 = utilities.readMatrixFromFile(matrix1FilePath)
matrix2 = utilities.readMatrixFromFile(matrix2FilePath)

# The first row of matrix 1 supplies the column names; matrix 2's first row
# is discarded.
colNames = matrix1.pop(0)
matrix2.pop(0)

if byRow:
    print("byRow Not yet implemented")
    # sys.exit() rather than the site-provided exit() helper, which is not
    # guaranteed to exist in every interpreter configuration.
    sys.exit()
else:
    results = []

    # An empty matrix has no columns to compare; avoid indexing matrix1[0].
    numCols = len(matrix1[0]) if matrix1 else 0

    # Column 0 is skipped; every other column is compared between the files.
    for colIndex in range(1, numCols):
        values1 = [float(row[colIndex]) for row in matrix1]
        values2 = [float(row[colIndex]) for row in matrix2]
Ejemplo n.º 13
0
# The summarize function is looked up by name among this module's attributes.
summarizeFunction = getattr(sys.modules[__name__], sys.argv[7])
outlierSamplesFilePath = sys.argv[8]
outFilePath = sys.argv[9]

# The outlier list is optional: a missing file means no outliers.
outlierSamples = []
if os.path.exists(outlierSamplesFilePath):
    outlierSamples = utilities.readVectorFromFile(outlierSamplesFilePath)

if patientID in outlierSamples:
    print("%s is listed as an outlier, so it won't be summarized" % patientID)
    sys.exit(0)

print("Get data probes")
# First tab-separated field of every line in the input file. The context
# manager closes the handle once the set has been built.
with open(inFilePath) as inFile:
    dataProbes = set(line.rstrip().split("\t")[0] for line in inFile)

# Keep only key rows whose comma-separated probe list (column 1) shares at
# least one probe with the data.
keyProbes = utilities.readMatrixFromFile(keyProbeFilePath)
keyProbes = [x for x in keyProbes if set(x[1].split(",")) & dataProbes]

# Key (column 0) -> all probes listed for it, accumulated across rows.
keyProbeDict = {}
for keyProbesRow in keyProbes:
    keyProbeDict.setdefault(keyProbesRow[0], []).extend(keyProbesRow[1].split(","))

if os.path.exists(probeFilePath):
    print("Keeping only specified probes")
    keepProbes = set(utilities.readVectorFromFile(probeFilePath))

    # Restrict each key's probes to the ones listed in the probe file.
    for key in list(keyProbeDict):
        keyProbeDict[key] = list(set(keyProbeDict[key]) & keepProbes)
else:
    keepProbes = list(dataProbes)
Ejemplo n.º 14
0
import os, sys, glob, shutil
import utilities

## The input file should contain samples as columns and variables as rows
inFilePath = sys.argv[1]
classesFilePath = sys.argv[2]
outGctFilePath = sys.argv[3]
outClsFilePath = sys.argv[4]

data = utilities.readMatrixFromFile(inFilePath)

samples = data.pop(0)
# A header as wide as the data rows starts with a corner label that is not a
# sample name, so it is dropped. With no data rows the width cannot be
# compared; the header is then left as-is instead of crashing on data[0].
if data and len(samples) == len(data[0]):
    samples.pop(0)

# Set copy of the sample names for constant-time membership tests.
sampleSet = set(samples)

# Sample -> class, restricted to samples that occur in the input header.
classesDict = {}
for row in utilities.readMatrixFromFile(classesFilePath):
    if row[0] in sampleSet:
        classesDict[row[0]] = row[1]

uniqueClasses = sorted(set(classesDict.values()))
variables = [x[0] for x in data]

outGctFile = open(outGctFilePath, 'w')
outGctFile.write("#1.2\n")
outGctFile.write("%i\t%i\n" % (len(variables), len(classesDict)))
outGctFile.write("NAME\tDescription\t")
# Only samples that have a class assignment are written to the header.
outGctFile.write("\t".join([sample for sample in samples if sample in classesDict]) + "\n")

for row in data:
    # The variable name is written twice: as NAME and as Description.
    outGctFile.write("%s\t%s\t" % (row[0], row[0]))