예제 #1
0
# Takes in a tab-delimited file of intervals and reports statistics about them.
#
# Pick the input ("-i") and output ("-o") file names out of the parsed
# command-line option pairs.
# NOTE(review): `opts` is built outside this view (presumably by getopt) --
# confirm. Unless defaults are assigned earlier, omitting -i or -o leaves
# `infile`/`outfile` unbound and the open() calls below raise NameError.
for o, a in opts:
    if o == "-i":
        infile = a
    elif o == "-o":
        outfile = a

# Promoter window sizes.
# NOTE(review): presumably base pairs upstream/downstream of the TSS --
# confirm against where these are used.
promotorUp = 2000
promotorDown = 2000

# Tab-delimited reader over the input file. The file object is handed
# straight to csv.reader, so it is never closed explicitly here.
intervals = csv.reader(open(infile, "r"), delimiter="\t")

# Tab-delimited writer for the output file; same note about the unclosed
# handle applies.
writer = csv.writer(open(outfile, "w"), delimiter="\t")

# CpG island annotation read from a BED file under the user's home directory
# ("~" is expanded by os.path.expanduser).
cpgIslands = ExtendedBed(
    os.path.expanduser(
        "~/mount/publicdata/hg18/cpgislands/cpgislands-0-index.bed"))

# Genome object built with the constructor's defaults.
genome = Genome()

# Load the Ensembl gene data for assembly hg18 / annotation ncbi36.1.
genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1")

# Leading column titles of the output table; more titles are appended to this
# list right after it is created.
headerRow = [
    'Ensembl', 'Name', 'chr', 'start', 'stop', 'strand', 'No. Transcripts',
    'Avg. Exons per Transcript', "Unique Exons per Gene", "Start positions",
    "Start positions / No. Transcripts"
]

#"Promotor G-Count",  "Promotor C-Count",  "Promotor A-Count", "Promotor T-Count" ,
headerRow.extend([
예제 #2
0
    # if we have an expression file we need fc and expression columns
    if exprfile != None:
        assert fccol != None
        assert exprcols != None
    else:
        exprcols = []

    assert outputfile != None

    genedata = EnsemblGenes(assembly=assembly)

    genome = Genome(genomeBuild=assembly)

    if assembly == "hg18":
        cpgIslands = ExtendedBed(
            os.path.expanduser(
                "~/mount/publicdata/hg18/cpgislands/cpgislands.bed"))
        lINEs = ExtendedBed(
            os.path.expanduser("~/mount/publicdata/hg18/repeats/LINEs-0.bed"))
        sINEs = ExtendedBed(
            os.path.expanduser("~/mount/publicdata/hg18/repeats/SINEs-0.bed"))
    elif assembly == "hg19":
        cpgIslands = ExtendedBed(
            os.path.expanduser(
                "~/mount/publicdata/hg19/CpGIslands/cpgislands.bed"))
        lINEs = ExtendedBed(
            os.path.expanduser(
                "~/mount/publicdata/hg19/Repeats/UCSC_HG19_LINEs.bed"),
            defaultkeys=["chrom", "chromStart", "chromEnd", "name", "strand"])
        sINEs = ExtendedBed(
            os.path.expanduser(
예제 #3
0
# Gene mapping built from `genedata` with tssPadding set to the upstream
# promoter distance.
# NOTE(review): presumably a position -> gene lookup covering gene body plus
# upstream promoter -- confirm against Ensembl.ReverseGeneMapping.
genespluspromotor = Ensembl.ReverseGeneMapping(
    genedata, tssPadding=UPSTREAM_PROMOTOR_DIST)

# Promoter mapping built from the same gene data, padded by the upstream and
# downstream promoter distances.
genepromotors = Ensembl.ReversePromotorMapping(
    genedata,
    upstreamPadding=UPSTREAM_PROMOTOR_DIST,
    downstreamPadding=DOWNSTREAM_PROMOTOR_DIST)

# Exon mapping and transcription-site collection, both derived from `genedata`.
exons = Ensembl.ReverseExonMapping(genedata)

transcriptionSites = Ensembl.TranscriptionSites(genedata)

# UCSC table browser - Expression & Regulation - CpG Islands
# Download all columns with exception of "bin"
# The path is selected by `assembly`. It is absolute and has no leading "~",
# so os.path.expanduser returns it unchanged.
cpgIslands = ExtendedBed(
    os.path.expanduser("/mnt/50tb/publicdata/" + assembly +
                       "/CpGIslands/cpgislands.bed"))

# UCSC table browser - Mapping and Sequencing - Chromosome Bands
# Download all columns with exception of "gieStain"
# The four remaining columns are named explicitly through `defaultkeys`;
# `forcekeys=True` is passed alongside.
# NOTE(review): presumably forcekeys makes the loader use these names even if
# the file carries its own header -- confirm against ExtendedBed.
gBanding = ExtendedBed(os.path.expanduser("/mnt/50tb/publicdata/" + assembly +
                                          "/G-Banding/cytogenetic.map.bed"),
                       defaultkeys=["chrm", "start", "stop", "band"],
                       forcekeys=True)

# Chromosome end information for the selected assembly.
chromosomeEnds = ChromosomeEnds(assembly)

###
###
###
예제 #4
0
from bed.treatment import ExtendedBed
from sam.SamFormat import SAMFile
import os
import sys

# LAD regions read from a BED file under the user's home directory
# ("~" is expanded by os.path.expanduser).
lads = ExtendedBed(
    os.path.expanduser("~/mount/privatedata/non-Adams/donahue.greg/LADs.bed"))

# Alignments to process; the SAM file path is the first command-line
# argument, so running the script without one raises IndexError here.
alignments = SAMFile(os.path.expanduser(sys.argv[1]))

# Running tallies for the key currently being processed. The main loop
# reports them through previousKey() and resets both to 0 whenever the key
# changes.
numbInLAD = 0
numbNotInLAD = 0

# Key whose alignments are currently being tallied; None until one is set.
currentSeq = None


def previousKey(key, isin, isnotin):
    """Print the tallies collected for one key on a single line.

    The line holds, separated by single spaces: the key, the ``isin`` count,
    the ``isnotin`` count and the fraction ``isin / (isin + isnotin)``.

    Args:
        key: identifier the two counts belong to.
        isin: number of entries counted as "in".
        isnotin: number of entries counted as "not in".

    When both counts are zero the fraction is undefined; ``nan`` is printed
    instead of raising ZeroDivisionError.
    """
    total = isin + isnotin
    fraction = isin / float(total) if total else float("nan")
    # One pre-formatted string prints identically under the Python 2 print
    # statement and the Python 3 print function.
    print("%s %s %s %s" % (key, isin, isnotin, fraction))


for samEntry in alignments:

    if samEntry.chrm == "*":
        continue

    if currentSeq != None and samEntry.key != currentSeq:
        # print the stats on the previous key
        previousKey(currentSeq, numbInLAD, numbNotInLAD)
        numbInLAD = 0
        numbNotInLAD = 0
예제 #5
0
            downstreamPromotor = int(a)

    assert methdatafile != None
    assert affyfile != None
    assert affyfccol != None
    assert affyexprcol != None
    assert outputfile != None

    genedata = EnsemblGenes(assembly="hg18")

    genome = Genome(genomeBuild="hg18")

    affyannotation = NetAffxAnnotation(genome="hg18", cdfname="HG-U133_Plus_2")

    cpgIslands = ExtendedBed(
        os.path.expanduser(
            "~/mount/publicdata/hg18/cpgislands/cpgislands-0-index.bed"))

    affyCSV = IndexedCSV(affyfile)
    affyEnsemblLogFCs = collections.defaultdict(list)
    affyEnsemblExprs = collections.defaultdict(list)
    affyEnsemblPvalues = collections.defaultdict(list)

    for affy in affyCSV:
        ensembls = affyannotation.getValues(affy, "Ensembl")
        if len(ensembls) == 1:
            affyFC = float(affyCSV[affy][affyfccol])
            affylogFC = math.log(affyFC) if affyFC > 0.0 else math.log(
                abs(affyFC)) * -1.0
            affyEnsemblLogFCs[ensembls[0]].append(affylogFC)
예제 #6
0
# Distance threshold and the human-readable label derived from it
# (1000 -> "1kb").
Small_TSS_TTS_Distance = 1000
# Floor division keeps the label an integer count ("1kb") under true division
# as well, where "/" would produce "1.0kb"; for ints on Python 2 "//" and "/"
# give the same result.
# NOTE(review): the name reads TTS_TTS while the constant above reads TSS_TTS;
# it is kept as-is because code outside this view may reference it.
Small_TTS_TTS_Distance_Human = str(Small_TSS_TTS_Distance // 1000) + "kb"

# Load the reference data used by the rest of the script.

# Ensembl gene data for assembly hg18 / annotation ncbi36.1; every mapping
# below is derived from it.
genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1")

# Unpadded gene mapping.
genes = Ensembl.ReverseGeneMapping(genedata)

exons = Ensembl.ReverseExonMapping(genedata)

transcriptionSites = Ensembl.TranscriptionSites(genedata)

# CpG island annotation read from a BED file under the user's home directory
# ("~" is expanded by os.path.expanduser).
cpgIslands = ExtendedBed(
    os.path.expanduser(
        "~/mount/publicdata/hg18/cpgislands/cpgislands-0-index.bed"))

# Affymetrix annotation built with the constructor's defaults.
affyannotation = NetAffxAnnotation()

# Second gene mapping over the same data, this time with tssPadding set.
# NOTE(review): TSS_TTS_Distance is not defined in the visible code -- confirm
# it is assigned earlier in the file and that Small_TSS_TTS_Distance was not
# the intended value.
paddedGenes = Ensembl.ReverseGeneMapping(genedata, tssPadding=TSS_TTS_Distance)


def isUpstream(distance, strand):
    """Return the 'Y'/'N' upstream flag for a signed distance on a strand.

    Args:
        distance: signed offset; zero counts as upstream on either strand.
        strand: "+" or "-".

    Returns:
        'Y' when ``distance >= 0`` on "+" or ``distance <= 0`` on "-",
        otherwise 'N'.

    Raises:
        SystemExit: with exit code -1 when ``strand`` is neither "+" nor "-".
    """
    if strand == "+":
        return 'Y' if distance >= 0 else 'N'
    if strand == "-":
        return 'Y' if distance <= 0 else 'N'

    # Unknown strand: abort as before, but say why. sys.exit is used instead of
    # the site-provided exit() helper, which is meant for interactive sessions
    # and is absent when the interpreter runs without the site module.
    import sys
    sys.stderr.write("isUpstream: unexpected strand %r\n" % (strand,))
    sys.exit(-1)
예제 #7
0
# Load the reference data used by the rest of the script.

genome = Genome(genomeBuild="hg18")

# Chromosome end information for hg18.
chromosomeEnds = ChromosomeEnds("hg18")

# Ensembl gene data for assembly hg18 / annotation ncbi36.1; every mapping
# below is derived from it.
genedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1")

# Unpadded gene mapping.
genes = Ensembl.ReverseGeneMapping(genedata)

exons = Ensembl.ReverseExonMapping(genedata)

transcriptionSites = Ensembl.TranscriptionSites(genedata)

# CpG island annotation read from a BED file under the user's home directory
# ("~" is expanded by os.path.expanduser).
cpgIslands = ExtendedBed(
    os.path.expanduser(
        "~/mount/publicdata/hg18/cpgislands/cpgislands-0-index.bed"))

# Affymetrix annotation built with the constructor's defaults.
affyannotation = NetAffxAnnotation()

# Second gene mapping over the same data, this time with tssPadding set.
# NOTE(review): TSS_TTS_Distance is not defined in the visible code -- confirm
# it is assigned earlier in the file.
paddedGenes = Ensembl.ReverseGeneMapping(genedata, tssPadding=TSS_TTS_Distance)

# store full mapping here for the end
# Both dictionaries create an empty list on first access to a key, so entries
# can be appended without an existence check.
# NOTE(review): presumably keyed by gene identifier -- confirm where they are
# filled.
geneToMethProbeMapping = collections.defaultdict(list)
geneToAffyProbeMapping = collections.defaultdict(list)


def isUpstream(distance, strand):
    assert strand in ['+', '-']
    if strand == "+":
        return distance >= 0