Esempio n. 1
0
    ])
    header.extend(["shoresl", "shoresr", "shelvesl", "shelvesr"])
    header.extend([
        "lineContentGene", "sineContentGene", "lineContentPromoter",
        "sineContentPromoter"
    ])

    for genelist in genelists:
        header.append(genelist.getFriendlyName())

    outputcsv.writerow(header)

    # if expression file then use those genes for iterator
    # else use all ensembl genes
    if exprfile != None:
        exprCSV = IndexedCSV(exprfile, keyPos=1)
        iter = exprCSV
    else:
        iter = genedata

    for testid in iter:
        # look up meth value

        try:

            if exprfile != None:
                ensembl = exprCSV[testid][ensemblidcol]

                if ensembl not in genedata:
                    # RNA-Seq is not a single unique gene, skip
                    continue
        opts, args = getopt.getopt(sys.argv[1:], "", ["gene-expr-transcriptionfactor-file="])
    except getopt.GetoptError, err:
        # print help information and exit:
        print str(err) # will print something like "option -a not recognized"
        sys.exit(2)
    
    infile = None
    for o, a in opts:
        if o=="--gene-expr-transcriptionfactor-file":
            infile = a
            print "Matrix Dir:",infile            
    assert infile!=None
    
    ensemblidcol = "ensemblid"
    
    inCSV = IndexedCSV(infile,keyPos=1)

    keys = inCSV.keys
    
    transcriptionFactors = keys[keys.index("significant")+1:]
    
    print transcriptionFactors
    
    assert len(set(transcriptionFactors))==len(transcriptionFactors)
    
    transcriptionFactorUp = collections.defaultdict(int)
    transcriptionFactorDown = collections.defaultdict(int)
    transcriptionFactorNC = collections.defaultdict(int)
    
    totalUp = 0
    totalDown = 0
            sys.argv[1:],
            "",
            [
                # command args go here
                "gene-expression-difference=",
                "read-group-tracking=",
                "output="
            ])
    except getopt.GetoptError, err:
        # print help information and exit:
        print str(err)  # will print something like "option -a not recognized"
        sys.exit(2)

    for o, a in opts:
        if o == "--gene-expression-difference":
            differences = IndexedCSV(a, key="test_id")
        elif o == "--read-group-tracking":
            readgrouptracking = IndexedCSV(a, useRowNumAsKey=True)
            xlocindex = ColumnIndex(readgrouptracking, "tracking_id")
        elif o == "--output":
            outputFile = a
        else:
            print "Unknown parameter: " + o + " " + a
            sys.exit(2)

    # get a suitable header
    replicatenames = []
    for readtrackingrow in xlocindex["XLOC_000001"]:
        condition = readgrouptracking[readtrackingrow]["condition"]
        replicate = readgrouptracking[readtrackingrow]["replicate"]
        name = condition + "." + replicate
Created on 18 Jan 2011

@author: mcbryan
'''

import sys
import getopt
from sequence.genome import Genome
import subprocess
import math
import csv
from filesystem.mkdir import makeDirectory
from csvfile.indexedcsv import IndexedCSV

# Genome build identifier.
build = "hg18"
# UCSC gene cross-reference table, constructed when the module is loaded.
# NOTE(review): the path is absolute and hard-codes both a home directory and
# "hg18"; it does not follow `build` if that value is changed.
ucscNames = IndexedCSV(
    "/home/mcbryan/mount/publicdata/hg18/ucsc/ucsc-genes-xref.csv")

###################
# isPcr needs:
# ~/bin/x86_64/gfServer -stepSize=5 start localhost 20000 ~/mount/publicdata/hg18/assembly/twoBit/hg18.2bit
###################

def splitString(v, l):
    """Split the sequence ``v`` into consecutive pieces of length ``l``.

    The last piece is shorter when ``len(v)`` is not a multiple of ``l``, and
    an empty ``v`` yields an empty list.  Any object supporting ``len()`` and
    slicing (str, list, ...) is accepted; the pieces have the type that
    slicing ``v`` produces.

    Raises:
        ZeroDivisionError: if ``l`` is 0.
    """
    # Ceiling division carried out on integers, so no float round-trip.
    pieces = -(-len(v) // l)
    return [v[i * l:(i + 1) * l] for i in range(pieces)]

if __name__ == '__main__':

    try:
        opts, args = getopt.getopt(sys.argv[1:], "p:o:",
                                   ["primers=", "outputFolder="])
Esempio n. 5
0
# Apply the parsed command-line options.  `opts`, `genelists` and `regions`
# are created before this excerpt (presumably via getopt -- TODO confirm).
# Options that match no branch are silently ignored: there is no final else.
for o, a in opts:
    if o == "-i":
        # input file path
        infile = a
    elif o == "-o":
        # output file path
        outfile = a
    elif o == "-g":
        # each -g adds one more gene list
        genelists.append(GeneList(a))
    elif o == "-r":
        # each -r adds one region file, given either as "path" or as
        # "path,padding".  The split is unpacked into exactly two names, so a
        # value containing more than one comma raises ValueError.
        if (a.count(",") > 0):
            a, padding = a.split(",")
            regions.append(BedIntervalTree(a, padding=int(padding)))
        else:
            regions.append(BedIntervalTree(a))
    elif o == "-c":
        # control Affy expression data, loaded with IndexedCSV's default key
        controlAffyExpressionData = IndexedCSV(a)
    # Annotated difference file input, keyed on its "test_id" column
    elif o == "-e":
        rnaSeqExpressionData = IndexedCSV(a, key="test_id")
    elif o == "-a":
        # genome assembly name, handed to Genome() below
        assembly = a

# Promoter extents on either side of a gene start.
# NOTE(review): units are presumably base pairs -- confirm where these are used.
UPSTREAM_PROMOTOR_DIST = 2000
DOWNSTREAM_PROMOTOR_DIST = 2000

# Tab-delimited output writer.  The file object is not bound to a name, so
# nothing in this script can close it explicitly.
writer = csv.writer(open(outfile, "w"), delimiter="\t")

# Raises NameError if -a was not supplied, unless `assembly` is given a
# default before this excerpt.
genome = Genome(assembly)

###
Esempio n. 6
0
    assert bSample != None

    assert affyfile != None
    assert affyfccol != None
    assert affyexprcol != None
    assert outputfile != None

    print affyexprcol

    genedata = EnsemblGenes(assembly=build)

    genome = Genome(genomeBuild=build)

    affyannotation = NetAffxAnnotation(genome=build, cdfname="HG-U133_Plus_2")

    affyCSV = IndexedCSV(affyfile)
    affyEnsemblLogFCs = collections.defaultdict(list)
    affyEnsemblExprs = collections.defaultdict(list)

    # make a list of genenames in each genelist and store in the genelist
    for genelist in genelists:
        genelist.genenames = set()
        for e in genelist:
            if e in genedata:
                genelist.genenames.add(genedata[e].name)

    for affy in affyCSV:
        ensembls = affyannotation.getValues(affy, "Ensembl")
        if len(ensembls) == 1:
            affyFC = float(affyCSV[affy][affyfccol])
            affylogFC = math.log(affyFC) if affyFC > 0.0 else math.log(
Esempio n. 7
0
                                                   "outputfolder="
                                                   ])
 except getopt.GetoptError, err:
     # print help information and exit:
     print str(err) # will print something like "option -a not recognized"
     print "Usage: xyz"
     sys.exit(2)
     
 isoformFile = None
 geneFile = None    
 
 for o, a in opts:
     if (o=="-G") or (o=="--gtf"):
         gtfFileLoc = a
     elif (o=="-i") or (o=="--isoformexp"):
         isoformFile = IndexedCSV(a,keyPos=0)
     elif (o=="-g") or (o=="--geneexp"):
         geneFile = IndexedCSV(a,keyPos=0)
     elif (o=="-o") or (o=="--outputfolder"):
         outputFilePrefix = a
     else:
         print "Unknown parameter: "+o+" "+a
         sys.exit(2)
 
 makeDirectory(outputFilePrefix)
 
 gtfReader = csv.reader(open(gtfFileLoc,"r"),delimiter="\t")
 
 class Transcript(list):
     def __init__(self):
         self.chr = None
Esempio n. 8
0
        elif o == "--affyfc":
            affyfc = a
        elif o == "--rnalogfc":
            rnalogfc = a
        elif o == "--rnasignificant":
            rnasig = a
        elif o == "--rnakey":
            rnakeypos = int(a)

    # read in gene_exp.diff as indexedcsv

    affyannotation = NetAffxAnnotation(genome="hg18",
                                       array="HG-U133_Plus_2",
                                       version="29")

    affyCSV = IndexedCSV(affyfile)
    rnaseqCSV = IndexedCSV(rnaseqfile, keyPos=rnakeypos)

    fig = plt.figure(figsize=(12, 12), dpi=100)
    ax = fig.add_subplot(111)

    affyEnsemblLogFCs = collections.defaultdict(list)
    affyEnsemblSig = collections.defaultdict(bool)

    for affy in affyCSV:
        ensembls = affyannotation.getValues(affy, "Ensembl")
        if len(ensembls) == 1:
            affyFC = float(affyCSV[affy][affyfc])
            affylogFC = math.log(affyFC) if affyFC > 0.0 else math.log(
                abs(affyFC)) * -1.0
            affyEnsemblLogFCs[ensembls[0]].append(affylogFC)
Esempio n. 9
0
    def __init__(self, assembly="hg18"):
        """Load the UCSC known-gene tables for ``assembly`` and index them.

        Four CSV files are read from ``~/mount/publicdata/<assembly>/ucsc/``:
        the gene cross-references, the known-isoform clusters (loaded with
        ``keyPos=1``), the UCSC-to-Ensembl mapping and the known genes.

        Every transcript located on chr1-chr22, chrX or chrY is:

        * stored in ``self`` under its UCSC id as a ``UCSCTranscript``;
        * added to its ``UCSCCluster`` in ``self.clusters`` (created on first
          use, keyed by cluster id);
        * appended to ``self.reverseEnsemblMappings`` under its Ensembl
          transcript id (transcripts without a mapping go under ``None``).

        Transcripts on any other chromosome are skipped.

        Raises:
            AssertionError: if a strand is neither "+" nor "-", if the exon
                start/end lists disagree with each other or with
                ``exonCount``, or if a transcript has no cluster entry.
        """
        self.clusters = {}

        baseLocation = os.path.expanduser("~/mount/publicdata/" + assembly +
                                          "/ucsc/")

        xrefs = IndexedCSV(baseLocation + "ucsc-genes-xref.csv")
        clusters = IndexedCSV(baseLocation + "ucsc-genes-knownIsoforms.csv",
                              keyPos=1)
        ensemblMappings = IndexedCSV(baseLocation + "ucsc-knownToEnsembl.csv")

        self.reverseEnsemblMappings = collections.defaultdict(list)

        transcripts = IndexedCSV(baseLocation + "ucsc-knowngenes.csv")

        # Built once, outside the loop; a set gives constant-time membership.
        standardChromosomes = frozenset(
            ["chr%d" % number for number in range(1, 23)] + ["chrX", "chrY"])

        for transcriptId in transcripts:
            # Fetch the row once instead of re-indexing for every column.
            row = transcripts[transcriptId]

            chrom = row["chrom"]
            if chrom not in standardChromosomes:
                # anything outside chr1-22, chrX and chrY is ignored
                continue

            strand = row["strand"]
            assert strand == "+" or strand == "-"

            if transcriptId in ensemblMappings:
                ensemblTranscript = ensemblMappings[transcriptId]["value"]
            else:
                ensemblTranscript = None

            start = int(row["txStart"])
            end = int(row["txEnd"])

            exonCount = int(row["exonCount"])
            # Both exon columns are comma separated.  UCSC terminates them
            # with a comma, which leaves an empty string after the split;
            # filter empties out rather than calling remove(''), which raises
            # ValueError when the trailing comma is absent.
            exonStarts = [
                piece for piece in row["exonStarts"].split(",") if piece
            ]
            exonEnds = [
                piece for piece in row["exonEnds"].split(",") if piece
            ]

            # make sure it's consistent
            assert len(exonStarts) == len(exonEnds) and exonCount == len(
                exonStarts), str(row)

            # The proteinID (not always present) and alignID columns are not
            # used at the moment.

            if transcriptId in xrefs:
                genesymbol = xrefs[transcriptId]["geneSymbol"]
            else:
                genesymbol = None

            assert transcriptId in clusters
            clusterid = clusters[transcriptId]["clusterId"]
            if clusterid not in self.clusters:
                self.clusters[clusterid] = UCSCCluster(clusterid)

            self[transcriptId] = UCSCTranscript(transcriptId, chrom, start,
                                                end, strand, clusterid,
                                                exonStarts, exonEnds,
                                                genesymbol, ensemblTranscript)

            self.reverseEnsemblMappings[ensemblTranscript].append(
                transcriptId)

            self.clusters[clusterid].addToCluster(self[transcriptId])
Esempio n. 10
0

def isDownstream(distance, strand):
    """Return 'Y' or 'N' for whether ``distance`` counts as downstream.

    On the "+" strand a distance less than or equal to zero gives 'Y'; on the
    "-" strand a distance greater than or equal to zero gives 'Y'.  A distance
    of exactly 0 is therefore 'Y' on either strand.

    Any other ``strand`` value is treated as fatal: a diagnostic is written
    to stderr and ``SystemExit`` is raised with exit code -1.
    """
    if strand == "+":
        return 'Y' if distance <= 0 else 'N'
    if strand == "-":
        return 'Y' if distance >= 0 else 'N'

    # Raise SystemExit directly instead of calling the `exit` helper: that
    # name is injected by the `site` module and is missing under `python -S`
    # or in frozen applications.
    import sys
    sys.stderr.write("isDownstream: unexpected strand %r "
                     "(expected '+' or '-')\n" % (strand,))
    raise SystemExit(-1)


# Load the Affy comparison table only when a file was supplied;
# `affyComparison` stays unbound otherwise.
if not affyComparisonFile == None:
    # Disabled position-to-affy mapping code, kept for reference:
    #affyMapping = ExtendedBed(os.path.expanduser("~/mount/publicdata/positions2affy/HG-U133Plus2.csv"), chrPos=0, startPos = 2, stopPos=3,  defaultkeys=["chr", "strand", "start", "stop", "affy"])
    #print affyMapping.getValuesOfOverlappingIntervals("chr16", 72982016, 72983513)
    affyComparison = IndexedCSV(affyComparisonFile)

# Column titles, assembled group by group.
# identification / position columns
headerRow = ['Index', 'ColumnID', 'Symbol', 'Chr', 'Mapinfo', 'Coord']
# methylation measurement columns
headerRow.extend([
    'PD30.Avg', 'PD56.Avg', 'Fold change', 'Log2MethFC',
    'Bonferroni(p-value (PD56 vs. PD30))', 'Meth'
])
# columns about genes reported as containing the position
headerRow.extend(['In Gene', 'Genes', 'Names', 'Gene Bounds'])
# The same four columns again, each prefixed with TTS_TTS_Distance_Human, a
# string defined outside this excerpt.
headerRow.extend([
    TTS_TTS_Distance_Human + ' up or Gene Body',
    TTS_TTS_Distance_Human + ' up or Gene Body Genes',
    TTS_TTS_Distance_Human + ' up or Gene Body Names',
    TTS_TTS_Distance_Human + ' up or Gene Body Gene Bounds'
])
headerRow.extend([
    'Gene TSS Distance',
Esempio n. 11
0
 def addCounts(counts, arg, suffix):
     """Copy the counts stored in the CSV at ``arg + suffix`` into ``counts``.

     For every key of that file, ``counts[key][arg]`` is set to a ``Count``
     built from the row's "AssignedCount", "MinCount" and "MaxCount" fields.
     ``counts[key]`` must already support item assignment (or create itself
     on first access).
     """
     table = IndexedCSV(arg + suffix)
     for key in table:
         assigned = table[key]["AssignedCount"]
         lowest = table[key]["MinCount"]
         highest = table[key]["MaxCount"]
         counts[key][arg] = Count(assigned, lowest, highest)
Esempio n. 12
0
    extend = 150

    exprCSV = None
    outputfile = None

    upstreamPromotor = 5000
    downstreamPromotor = 1000

    genelists = []
    exactmatch = False

    genedata = EnsemblGenes(assembly="hg18")

    for opt, value in opts:
        if opt == "--gene-expression-file":
            exprCSV = IndexedCSV(value, keyPos=1)
            print "Expr File:", value
        elif opt == "--fccol":
            fccol = value
        elif opt == "--exprcols":
            exprcols = value.split(",")
        elif opt == "--outputfile":
            outputfile = value
        elif opt == "--promotorsize":
            upstreamPromotor = int(value)
            downstreamPromotor = int(value)
        elif opt == "--aSample":
            aSample = Bed(value, extend=extend)
            print "A Sample:", value
        elif opt == "--bSample":
            bSample = Bed(value, extend=extend)
Esempio n. 13
0
            affyfilelocation = a
        elif (o == "--rnaseq"):
            rnaseqfilelocation = a
        elif (o == "--output"):
            outputFileLoc = a

    assert affyfilelocation != None and rnaseqfilelocation != None
    assert outputFileLoc != None

    ucscgenedata = UCSC.UCSCTranscripts(assembly="hg18")
    ensemblgenedata = Ensembl.EnsemblGenes(assembly="hg18",
                                           annotation="ncbi36.1")

    affyannotation = NetAffxAnnotation()

    affyfile = IndexedCSV(affyfilelocation)
    rnaseqfile = IndexedCSV(rnaseqfilelocation, keyPos=1)

    print "Read Files"

    plotx = []
    ploty = []

    allaffys = 0
    allrnaclusters = 0
    finalentries = 0

    affySymbols = collections.defaultdict(list)
    rnaSymbols = collections.defaultdict(list)

    for affy in affyfile:
Esempio n. 14
0
    elif o in ["-a", "--affyarray"]:
        affyComparisonFile = a
    elif o in ["--bed"]:
        bedTrackLoc = a
    elif o in ["--bedgraphfc"]:
        bedGraphFcTrackLoc = a
    elif o in ["--bedgraphbeta"]:
        bedGraphBetaTrackLoc = a
    elif o in ["--bedgraphp"]:
        bedGraphPTrackLoc = a
    elif o in ["--genebygene"]:
        geneByGeneOutputLoc = a
    elif o in ["--affyvalues"]:
        affyValueColumns = a.split(",")

# Input table, opened from the path held in `infile`.
reader = IndexedCSV(infile)

# Tab-delimited output writer.  The file object is not bound to a name, so
# nothing in this script can close it explicitly.
writer = csv.writer(open(outfile, "w"), delimiter="\t")

###


def distanceHumanReadable(dist):
    """Format ``dist`` as a kilo-unit label such as ``"1kb"``.

    Integer distances are floored to whole thousands (``2500`` -> ``"2kb"``);
    float distances keep their fractional part (``1500.0`` -> ``"1.5kb"``).
    """
    # `/` floors integers on Python 2 but not on Python 3, where 1000 would be
    # rendered as "1.0kb".  Using `//` for non-floats gives the same label on
    # either interpreter while floats keep true division.
    if isinstance(dist, float):
        thousands = dist / 1000
    else:
        thousands = dist // 1000
    return str(thousands) + "kb"


# Tuning constants read by code outside this excerpt.
# NOTE(review): the names suggest genomic distances and window sizes, with
# units presumably in base pairs -- confirm against the code that uses them.
TSS_TTS_Distance = 1000
SURROUNDING_SEQUENCE_Distance = 250  # each side
WINDOW_SIZE = 500
WINDOW_OFFSET = 5
Esempio n. 15
0
    outlocation = None

    for o, a in opts:
        if o == "-1":
            location1 = a
        elif o == "-2":
            location2 = a
        elif o == "-o":
            outlocation = a
        elif o == "--key1":
            key1 = int(a)
        elif o == "--key2":
            key2 = int(a)

    one = IndexedCSV(location1, keyPos=key1)

    two = IndexedCSV(location2, keyPos=key2)

    with open(outlocation, "w") as outfile:
        csvout = csv.writer(outfile, delimiter="\t")

        header = list(one.keys)
        header.extend(two.keys)
        header.insert(0, "Key")

        csvout.writerow(header)

        for item in one:

            row = [item]