コード例 #1
0
    def loadGFF_transcript(self,fields,line,transcriptBeginEnd,GFF,
                           transcripts,readOrder,genes):
        """Record one transcript-level GFF/GTF line.

        The line is ignored unless a transcript_id attribute is found in it.
        Otherwise the transcript's [begin,end] is stored in
        transcriptBeginEnd, the Transcript is looked up in (or created and
        added to) `transcripts`, and it is linked to its Gene, which is
        likewise looked up in or added to `genes`.

        Parameters:
          fields -- columns of the record (0=substrate, 1=source, 3=begin,
                    4=end, 6=strand, 8 and up=extra fields)
          line -- raw text of the record, searched for the ID attributes
          transcriptBeginEnd -- dict, transcript ID -> [begin,end]; updated
          GFF -- not used by this method
          transcripts -- dict, transcript ID -> Transcript; updated
          readOrder -- value stored as readOrder on a newly created transcript
          genes -- dict, gene ID -> Gene; updated

        Raises:
          Exception -- when neither genegrp= nor gene_id yields a gene ID
        """
        # One is subtracted from the begin column; the end column is used
        # as given (presumably 1-based inclusive -> 0-based half-open).
        begin=int(fields[3])-1
        end=int(fields[4])
        rex=Rex()
        # Raw strings: the patterns contain \s, \S and \; which are not
        # valid string escapes and trigger a SyntaxWarning otherwise.
        if(not rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?',line)):
            return
        transcriptId=rex[1]
        transcriptBeginEnd[transcriptId]=[begin,end]
        strand=fields[6]
        # Columns 8 and up are kept verbatim, each followed by one space
        transcriptExtraFields="".join(field+" " for field in fields[8:])
        transcript=transcripts.get(transcriptId,None)
        if(transcript is None):
            transcript=Transcript(transcriptId,strand)
            transcripts[transcriptId]=transcript
            transcript.setStopCodons(self.stopCodons)
            transcript.readOrder=readOrder
            # NOTE(review): this only rebinds the local parameter; if
            # readOrder is a plain int the caller never sees it -- confirm
            # how the caller advances the counter.
            readOrder+=1
            transcript.substrate=fields[0]
            transcript.source=fields[1]
            transcript.setBegin(begin)
            transcript.setEnd(end)
        # genegrp= takes precedence over gene_id
        geneId=None
        if(rex.find(r"genegrp=(\S+)",line)):
            geneId=rex[1]
        elif(rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?',line)):
            geneId=rex[1]
        if(not geneId): raise Exception("can't parse GTF: "+line)
        transcript.geneId=geneId
        gene=genes.get(geneId,None)
        if(not gene):
            gene=Gene()
            genes[geneId]=gene
            gene.setId(geneId)
        transcript.setGene(gene)
        gene.addTranscript(transcript)
        transcript.extraFields=transcriptExtraFields
コード例 #2
0
ファイル: GFF3Parser.py プロジェクト: bmajoros/python
 def parseRecord(self, fields):
     """Convert the columns of one GFF3 record into a dict.

     Parameters:
       fields -- the nine record columns, in order: substrate, source,
                 type, begin, end, score, strand, frame, attributes

     Returns:
       dict with keys "substrate", "source", "type", "begin" (begin column
       as int, minus one), "end" (end column as int), "score", "strand",
       "frame" (as given) and "extra" (dict of the key=value attributes).

     Raises:
       Exception -- more than nine columns, or a non-empty attribute that
                    is not of the form key=value
       ValueError -- fewer than nine columns (from the unpacking), or a
                     non-integer begin/end
     """
     if (len(fields) > 9):
         raise Exception("too many fields in GFF3 record: " +
                         "\t".join(fields))
     (substrate, source, featureType, begin, end, score, strand, frame,
      extra) = fields
     extraHash = {}
     rex = Rex()
     for field in extra.rstrip().split(";"):
         # A trailing ';' (or an empty attribute column) leaves an empty
         # segment behind; skip it instead of rejecting the whole record.
         if (not field.strip()): continue
         if (not rex.find("(.+)=(.+)", field)):
             raise Exception("Can't parse GFF3 field: " + field)
         extraHash[rex[1]] = rex[2]
     return {
         "substrate": substrate,
         "source": source,
         "type": featureType,
         "begin": int(begin) - 1,
         "end": int(end),
         "score": score,
         "strand": strand,
         "frame": frame,
         "extra": extraHash,
     }
コード例 #3
0
 def loadGFF_UTR(self,fields,line,transcriptBeginEnd,GFF,
                        transcripts,readOrder,genes):
     """Add the exon described by one GFF/GTF record to its transcript.

     The transcript ID is taken from the first of transgrp, transcript_id
     or Parent= found in `line`, the gene ID from genegrp= or gene_id; if
     only one of the two IDs is present it is used for both.  The
     Transcript and Gene are looked up in, or created and added to,
     `transcripts` and `genes`.  The new Exon is appended to
     transcript.rawExons when that list is not None; otherwise it is
     appended to transcript.UTR unless transcript.exonOverlapsExon(exon)
     is true.

     Parameters:
       fields -- columns of the record (0=substrate, 1=source, 2=type,
                 3=begin, 4=end, 5=score, 6=strand, 7=frame, 8 and up=extra)
       line -- raw text of the record, searched for the ID attributes
       transcriptBeginEnd -- dict, transcript ID -> (begin,end); read only
       GFF -- not used by this method
       transcripts -- dict, transcript ID -> Transcript; updated
       readOrder -- value stored as readOrder on a newly created transcript
       genes -- dict, gene ID -> Gene; updated

     Raises:
       Exception -- when neither a transcript ID nor a gene ID is found
     """
     # One is subtracted from the begin column; the end column is used
     # as given (presumably 1-based inclusive -> 0-based half-open).
     exonBegin=int(fields[3])-1
     exonEnd=int(fields[4])
     exonScore=fields[5]
     strand=fields[6]
     frame=fields[7]
     rex=Rex()
     # Raw strings: the patterns contain \s, \S and \; which are not
     # valid string escapes and trigger a SyntaxWarning otherwise.
     transcriptId=None
     if(rex.find(r'transgrp[:=]\s*(\S+)',line)):
         transcriptId=rex[1]
     elif(rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?',line)):
         transcriptId=rex[1]
     elif(rex.find(r'Parent=([^;,\s]+)',line)):
         transcriptId=rex[1]
     geneId=None
     if(rex.find(r'genegrp=(\S+)',line)):
         geneId=rex[1]
     elif(rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?',line)):
         geneId=rex[1]
     # Each ID stands in for the other when only one was found
     if(transcriptId is None): transcriptId=geneId
     if(geneId is None): geneId=transcriptId
     if(transcriptId is None):
         raise Exception(line+" : no transcript ID found")
     # Drop a trailing ';' that the \S+ captures may have swallowed
     if(rex.find(r"(\S+);$",transcriptId)): transcriptId=rex[1]
     if(rex.find(r"(\S+);$",geneId)): geneId=rex[1]
     # Columns 8 and up are kept verbatim, each followed by one space
     extra="".join(field+" " for field in fields[8:])
     if(exonBegin>exonEnd):
         exonBegin,exonEnd=exonEnd,exonBegin
     transcript=transcripts.get(transcriptId,None)
     if(not transcript):
         transcript=Transcript(transcriptId,strand)
         transcripts[transcriptId]=transcript
         transcript.setStopCodons(self.stopCodons)
         transcript.readOrder=readOrder
         # NOTE(review): this only rebinds the local parameter; if
         # readOrder is a plain int the caller never sees it -- confirm.
         readOrder+=1
         transcript.substrate=fields[0]
         transcript.source=fields[1]
         # Prefer the extent recorded for this transcript ID; fall back
         # to the extent of this exon.
         if(transcriptBeginEnd.get(transcriptId,None) is not None):
             (begin,end)=transcriptBeginEnd[transcriptId]
             transcript.setBegin(begin)
             transcript.setEnd(end)
         else:
             transcript.setBegin(exonBegin)
             transcript.setEnd(exonEnd)
     transcript.geneId=geneId
     gene=genes.get(geneId,None)
     if(gene is None):
         gene=Gene()
         genes[geneId]=gene
         gene.setId(geneId)
     transcript.setGene(gene)
     exon=Exon(exonBegin,exonEnd,transcript)
     exon.extraFields=extra
     if(transcript.rawExons is not None):
         exon.frame=frame
         exon.score=exonScore
         exon.type=fields[2]
         transcript.rawExons.append(exon)
     elif(not transcript.exonOverlapsExon(exon)):
         exon.frame=frame
         exon.score=exonScore
         exon.type=fields[2]
         transcript.UTR.append(exon) # OK -- we sort later
     gene.addTranscript(transcript)
コード例 #4
0
 def loadGFF_UTR(self,fields,line,transcriptBeginEnd,GFF,
                        transcripts,readOrder,genes):
     """Add the exon described by one GFF/GTF record to its transcript.

     The transcript ID is taken from the first of transgrp, transcript_id
     or Parent= found in `line`, the gene ID from genegrp= or gene_id; if
     only one of the two IDs is present it is used for both.  The
     Transcript and Gene are looked up in, or created and added to,
     `transcripts` and `genes`.  The new Exon is appended to
     transcript.rawExons when that list is not None; otherwise it is
     appended to transcript.UTR unless transcript.exonOverlapsExon(exon)
     is true.

     Parameters:
       fields -- columns of the record (0=substrate, 1=source, 2=type,
                 3=begin, 4=end, 5=score, 6=strand, 7=frame, 8 and up=extra)
       line -- raw text of the record, searched for the ID attributes
       transcriptBeginEnd -- dict, transcript ID -> (begin,end); read only
       GFF -- not used by this method
       transcripts -- dict, transcript ID -> Transcript; updated
       readOrder -- value stored as readOrder on a newly created transcript
       genes -- dict, gene ID -> Gene; updated

     Raises:
       Exception -- when neither a transcript ID nor a gene ID is found
     """
     # One is subtracted from the begin column; the end column is used
     # as given (presumably 1-based inclusive -> 0-based half-open).
     exonBegin=int(fields[3])-1
     exonEnd=int(fields[4])
     exonScore=fields[5]
     strand=fields[6]
     frame=fields[7]
     rex=Rex()
     # Raw strings: the patterns contain \s, \S and \; which are not
     # valid string escapes and trigger a SyntaxWarning otherwise.
     transcriptId=None
     if(rex.find(r'transgrp[:=]\s*(\S+)',line)):
         transcriptId=rex[1]
     elif(rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?',line)):
         transcriptId=rex[1]
     elif(rex.find(r'Parent=([^;,\s]+)',line)):
         transcriptId=rex[1]
     geneId=None
     if(rex.find(r'genegrp=(\S+)',line)):
         geneId=rex[1]
     elif(rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?',line)):
         geneId=rex[1]
     # Each ID stands in for the other when only one was found
     if(transcriptId is None): transcriptId=geneId
     if(geneId is None): geneId=transcriptId
     if(transcriptId is None):
         raise Exception(line+" : no transcript ID found")
     # Drop a trailing ';' that the \S+ captures may have swallowed
     if(rex.find(r"(\S+);$",transcriptId)): transcriptId=rex[1]
     if(rex.find(r"(\S+);$",geneId)): geneId=rex[1]
     # Columns 8 and up are kept verbatim, each followed by one space
     extra="".join(field+" " for field in fields[8:])
     if(exonBegin>exonEnd):
         exonBegin,exonEnd=exonEnd,exonBegin
     transcript=transcripts.get(transcriptId,None)
     if(not transcript):
         transcript=Transcript(transcriptId,strand)
         transcripts[transcriptId]=transcript
         transcript.setStopCodons(self.stopCodons)
         transcript.readOrder=readOrder
         # NOTE(review): this only rebinds the local parameter; if
         # readOrder is a plain int the caller never sees it -- confirm.
         readOrder+=1
         transcript.substrate=fields[0]
         transcript.source=fields[1]
         # transcriptBeginEnd is indexed by transcript ID two lines below,
         # so it must be queried with .get(); the former .find() call is
         # not a mapping method and raised AttributeError on a dict.
         if(transcriptBeginEnd.get(transcriptId,None) is not None):
             (begin,end)=transcriptBeginEnd[transcriptId]
             transcript.setBegin(begin)
             transcript.setEnd(end)
     transcript.geneId=geneId
     gene=genes.get(geneId,None)
     if(gene is None):
         gene=Gene()
         genes[geneId]=gene
         gene.setId(geneId)
     transcript.setGene(gene)
     exon=Exon(exonBegin,exonEnd,transcript)
     exon.extraFields=extra
     if(transcript.rawExons is not None):
         exon.frame=frame
         exon.score=exonScore
         exon.type=fields[2]
         transcript.rawExons.append(exon)
     elif(not transcript.exonOverlapsExon(exon)):
         exon.frame=frame
         exon.score=exonScore
         exon.type=fields[2]
         transcript.UTR.append(exon) # OK -- we sort later
     gene.addTranscript(transcript)
コード例 #5
0
    def loadGFF_transcript(self,fields,line,transcriptBeginEnd,GFF,
                           transcripts,readOrder,genes):
        """Record one transcript-level GFF/GTF line, including its score.

        The line is ignored unless a transcript_id attribute is found in it.
        Otherwise the transcript's [begin,end] is stored in
        transcriptBeginEnd, the Transcript is looked up in (or created and
        added to) `transcripts`, its score is filled in from the score
        column if it has none yet, and it is linked to its Gene, which is
        likewise looked up in or added to `genes`.

        Parameters:
          fields -- columns of the record (0=substrate, 1=source, 3=begin,
                    4=end, 5=score, 6=strand, 8 and up=extra fields)
          line -- raw text of the record, searched for the ID attributes
          transcriptBeginEnd -- dict, transcript ID -> [begin,end]; updated
          GFF -- not used by this method
          transcripts -- dict, transcript ID -> Transcript; updated
          readOrder -- value stored as readOrder on a newly created transcript
          genes -- dict, gene ID -> Gene; updated

        Raises:
          Exception -- when neither genegrp= nor gene_id yields a gene ID
          ValueError -- when the score column is neither "." nor a number
        """
        # One is subtracted from the begin column; the end column is used
        # as given (presumably 1-based inclusive -> 0-based half-open).
        begin=int(fields[3])-1
        end=int(fields[4])
        rex=Rex()
        # Raw strings: the patterns contain \s, \S and \; which are not
        # valid string escapes and trigger a SyntaxWarning otherwise.
        if(not rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?',line)):
            return
        transcriptId=rex[1]
        transcriptBeginEnd[transcriptId]=[begin,end]
        strand=fields[6]
        score=fields[5]
        # Columns 8 and up are kept verbatim, each followed by one space
        transcriptExtraFields="".join(field+" " for field in fields[8:])
        transcript=transcripts.get(transcriptId,None)
        if(transcript is None):
            transcript=Transcript(transcriptId,strand)
            transcripts[transcriptId]=transcript
            transcript.setStopCodons(self.stopCodons)
            transcript.readOrder=readOrder
            # NOTE(review): this only rebinds the local parameter; if
            # readOrder is a plain int the caller never sees it -- confirm
            # how the caller advances the counter.
            readOrder+=1
            transcript.substrate=fields[0]
            transcript.source=fields[1]
            transcript.setBegin(begin)
            transcript.setEnd(end)
        # "." in the score column means no score; an existing score is kept
        if(transcript.score is None and score!="."):
            transcript.score=float(score)
        # genegrp= takes precedence over gene_id
        geneId=None
        if(rex.find(r"genegrp=(\S+)",line)):
            geneId=rex[1]
        elif(rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?',line)):
            geneId=rex[1]
        if(not geneId): raise Exception("can't parse GTF: "+line)
        transcript.geneId=geneId
        gene=genes.get(geneId,None)
        if(not gene):
            gene=Gene()
            genes[geneId]=gene
            gene.setId(geneId)
        transcript.setGene(gene)
        gene.addTranscript(transcript)
        transcript.extraFields=transcriptExtraFields
コード例 #6
0
ファイル: GFF3Parser.py プロジェクト: bmajoros/python
 def parseRecord(self,fields):
     """Convert the columns of one GFF3 record into a dict.

     Parameters:
       fields -- the nine record columns, in order: substrate, source,
                 type, begin, end, score, strand, frame, attributes

     Returns:
       dict with keys "substrate", "source", "type", "begin", "end",
       "score", "strand", "frame" (all exactly as given in `fields`; begin
       and end are not converted) and "extra" (dict of the key=value
       attributes).

     Raises:
       Exception -- more than nine columns, or a non-empty attribute that
                    is not of the form key=value
       ValueError -- fewer than nine columns (from the unpacking)
     """
     if(len(fields)>9):
         raise Exception("too many fields in GFF3 record: "+
                         "\t".join(fields))
     (substrate,source,featureType,begin,end,score,strand,frame,
      extra)=fields
     extraHash={}
     rex=Rex()
     for field in extra.rstrip().split(";"):
         # A trailing ';' (or an empty attribute column) leaves an empty
         # segment behind; skip it instead of rejecting the whole record.
         if(not field.strip()): continue
         if(not rex.find("(.+)=(.+)",field)):
             raise Exception("Can't parse GFF3 field: "+field)
         extraHash[rex[1]]=rex[2]
     return {"substrate":substrate,
             "source":source,
             "type":featureType,
             "begin":begin,
             "end":end,
             "score":score,
             "strand":strand,
             "frame":frame,
             "extra":extraHash}
コード例 #7
0
# Exactly seven command-line arguments are required; otherwise print the
# usage string and exit.
if(len(sys.argv)!=8):
    exit(sys.argv[0]+
         " <indiv> <hap> <in.broken-sites> <junctions.bed> <in.gff> <in.readcounts> <all-broken-sites.txt>")
(indiv,hap,infile,junctionsFile,gffFile,readCountsFile,masterFile)=sys.argv[1:]

#============================= main() =================================

# Read the readcounts file
totalMappedReads=None
readCounts={}
with open(readCountsFile,"rt") as IN:
    while(True):
        line=IN.readline()
        if(line==""): break # readline() returns "" only at end of file
        if(rex.find("TOTAL MAPPED READS:\s*(\d+)",line)):
            # Kept as the captured text; it is not converted to int here
            totalMappedReads=rex[1]
        else:
            # Every other line must split into exactly two tokens, else
            # the unpacking raises ValueError.  Counts are kept as text.
            fields=line.split()
            (gene,count)=fields
            readCounts[gene]=count

# Read GFF file to find annotated sites to exclude
gff={}
exclude={}
reader=GffTranscriptReader()
transcripts=reader.loadGFF(gffFile)
for transcript in transcripts:
    # Transcripts whose ID begins with "ALT" are skipped
    if(transcript.getID()[0:3]=="ALT"): continue
    # Index the transcript under the part of its ID captured before an
    # underscore followed by a digit
    if(rex.find("(\S+)_\d",transcript.getID())): gff[rex[1]]=transcript
    substrate=transcript.getSubstrate()
コード例 #8
0
ファイル: fdr.py プロジェクト: ReddyLab/1000Genomes
from builtins import (bytes, dict, int, list, object, range, str, ascii,
   chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
# The above imports should allow this program to run in both Python 2 and
# Python 3.  You might need to update your version of module "future".
import sys
from Rex import Rex
rex=Rex()

if(len(sys.argv)!=2):
    exit(sys.argv[0]+" <p-values.txt>")
infile=sys.argv[1]

# Collect one value per line: the first token matched by the pattern.
# NOTE(review): the pattern needs a digit with a non-space character on
# both sides, so tokens such as "0.5", "1" or "1e-5" never match and those
# lines are silently skipped -- confirm this is intended.
values=[]
with open(infile,"rt") as fh:
    for line in fh:
        if(rex.find(r"(\S+\d\S+)",line)):
            values.append(float(rex[1]))

# For each q, walk the sorted p-values and report the last one seen before
# the first p-value that exceeds its rank-scaled threshold (rank/L*q).
# NOTE(review): this stops at the first failure rather than taking the
# largest passing rank -- confirm which procedure is wanted.
values.sort()
L=len(values)
qValues=(0.1,0.05,0.01,0.005,0.001)
for q in qValues:
    bestP=None
    for i in range(L):
        P=values[i]
        threshold=float(i+1)/float(L)*q
        if(P>threshold):
            if(i>0): bestP=values[i-1]
            break
    else:
        # No p-value exceeded its threshold, so every one of them passes:
        # report the largest instead of leaving bestP at None, which would
        # be indistinguishable from "nothing passes".
        if(L>0): bestP=values[L-1]
    print("q="+str(q)+" p="+str(bestP))
コード例 #9
0
    print(ID,median,CI_left,CI_right,Preg,sep="\t")

#=========================================================================
# main()
#=========================================================================
# Options -s and -t each take a value; six positional arguments must remain.
(options,args)=getopt.getopt(sys.argv[1:],"s:t:")
if(len(args)!=6):
    exit(ProgramName.get()+" [-s stanfile] [-t thetafile] <model> <min-effect> <input.txt> <output.txt> <#MCMC-samples> <firstVariant-lastVariant>\n   -s = save raw STAN file\n   -t = save theta samples\n   variant range is zero-based and inclusive\n   min-effect (lambda) must be >= 1\n")
(model,minEffect,inFile,outfile,numSamples,numVariants)=args
stanFile=None
thetaFile=None
for pair in options:
    (key,value)=pair
    if(key=="-s"): stanFile=value
    if(key=="-t"): thetaFile=value
# The last positional argument must contain "<digits>-<digits>"
if(not rex.find("(\d+)-(\d+)",numVariants)):
    exit(numVariants+": specify range of variants: first-last")
firstIndex=int(rex[1])
lastIndex=int(rex[2])
minEffect=float(minEffect)
if(minEffect<1): raise Exception("Min-effect must be >= 1")
# THETA stays None unless -t was given; no close() is visible in this part
THETA=None
if(thetaFile is not None): THETA=open(thetaFile,"wt")

# Process all input lines, each line = one variant (one MCMC run)
thetaIndex=None
variantIndex=0

with open(inFile,"rt") as IN:
    for line in IN:
        # Check whether this variant is in the range to be processed
コード例 #10
0
rex=Rex()

# Process command line
if(len(sys.argv)!=4): exit(sys.argv[0]+" <in.fasta> <in.gff> <out.fasta>")
(fastaFile,gffFile,outFile)=sys.argv[1:]

# Read GFF: transcripts are looked up below by the ID parsed from each
# FASTA defline
reader=GffTranscriptReader()
hash=reader.hashBySubstrate(gffFile)

# Open output file; the with-block closes it on every path, including the
# exit() taken when a defline cannot be parsed
with open(outFile,"wt") as OUT:
    writer=FastaWriter()

    # Process each substrate in the FASTA file
    reader=FastaReader(fastaFile)
    while(True):
        [defline,seq]=reader.nextSequence()
        if(not defline): break
        # Raw string: \s and \S are not valid string escapes
        if(not rex.find(r"^\s*>\s*(\S+)",defline)):
            exit("Can't parse defline: "+defline)
        id=rex[1]
        transcripts=hash.get(id,None)
        if(not transcripts): continue # nothing annotated on this substrate
        for transcript in transcripts:
            transSeq=transcript.loadTranscriptSeq(seq)
            writer.addToFasta(">"+transcript.getID(),transSeq,OUT)
    reader.close()

コード例 #11
0
                        with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii, chr,
                      hex, input, next, oct, open, pow, round, super, filter,
                      map, zip)
# The above imports should allow this program to run in both Python 2 and
# Python 3.  You might need to update your version of module "future".
import sys
import ProgramName
from FastaReader import FastaReader
from FastaWriter import FastaWriter
from Rex import Rex

rex = Rex()

#=========================================================================
# main()
#=========================================================================
if len(sys.argv) != 3:
    exit(ProgramName.get() + " <in.fasta> <out.fasta>\n")
infile, outfile = sys.argv[1:]

# Copy to the output only those sequences whose defline contains ">chr"
OUT = open(outfile, "wt")
writer = FastaWriter()
reader = FastaReader(infile)
(defline, seq) = reader.nextSequence()
while defline:
    if rex.find(">chr", defline):
        writer.addToFasta(defline, seq, OUT)
    (defline, seq) = reader.nextSequence()
OUT.close()
コード例 #12
0
ファイル: pool-counts.py プロジェクト: ReddyLab/POPSTARR2
                      hex, input, next, oct, open, pow, round, super, filter,
                      map, zip)
# The above imports should allow this program to run in both Python 2 and
# Python 3.  You might need to update your version of module "future".
import sys
from Rex import Rex
rex = Rex()

#=========================================================================
# main()
#=========================================================================
# counts: variant ID -> {allele: summed count}.  Each input line is a
# variant ID followed by allele=count fields; counts for the same variant
# and allele are added up across lines.
counts = {}
for line in sys.stdin:
    fields = line.rstrip().split()
    # A blank line has no ID field; skip it rather than fail on fields[0]
    if (not fields): continue
    id = fields[0]
    alleles = counts.setdefault(id, {})
    for field in fields[1:]:
        # Raw string: \S and \d are not valid string escapes
        if (not rex.find(r"(\S+)=(\d+)", field)):
            raise Exception("can't parse field: " + field)
        allele = rex[1]
        count = int(rex[2])
        alleles[allele] = alleles.get(allele, 0) + count
# One output line per variant: ID, then tab-separated allele=total fields
for variant in counts:
    print(variant, end="")
    alleles = counts[variant]
    for allele in alleles:
        count = alleles[allele]
        print("\t" + allele + "=" + str(count), end="")
    print()
コード例 #13
0
# Exactly one command-line argument is expected (the unpacking fails
# otherwise); it is converted to a zero-based index.
(phenotypeID, ) = sys.argv[1:]
phenotypeID = int(phenotypeID) - 1
# Only the lower bound is checked here
if (phenotypeID < 0): exit("phenotype must be 1-5")

phenotypes = loadPhenotypes(PHENOTYPES)
effects = loadEffects(EFFECTS)
IDs = None
points = []
initialized = False
predictorNum = 1
for chrom in CHROMS:
    vcf = VCF + "/" + chrom + "/" + chrom + ".vcf.gz"
    #print("processing",vcf,flush=True)
    # NOTE(review): no close() for this handle is visible in this part
    IN = gzip.open(vcf, "rt")
    for line in IN:
        # Header line naming the columns: columns 9 and up are kept as IDs
        if (rex.find("^#CHROM", line)):
            fields = line.rstrip().split()
            IDs = fields[9:]
            # initializePoints runs once, on the first such header seen
            if (not initialized):
                initializePoints(IDs, phenotypes, effects, phenotypeID, points)
                initialized = True
            continue
        # Any other line starting with '#' is skipped
        if (rex.find("^\s*#", line)): continue
        fields = line.rstrip().split()
        # Note: this rebinds `chrom`, the outer loop variable; the vcf
        # path above was already built before the rebinding
        (chrom, pos, variant, ref, alt) = fields[:5]
        genotypes = fields[9:]
        kept = processVariant(variant, ref, alt, IDs, genotypes, phenotypes,
                              effects, phenotypeID, points)
        # Variants for which processVariant returns a true value are
        # numbered consecutively and printed
        if (kept):
            print(predictorNum, variant, sep="\t")
            predictorNum += 1
コード例 #14
0
# License (GPL) version 3, as described at www.opensource.org.
# Copyright (C)2016 William H. Majoros ([email protected]).
#=========================================================================
from __future__ import (absolute_import, division, print_function, 
   unicode_literals, generators, nested_scopes, with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii,
   chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
# The above imports should allow this program to run in both Python 2 and
# Python 3.  You might need to update your version of module "future".
import sys
import os
import glob
from Rex import Rex
rex=Rex()

import shutil

if(len(sys.argv)!=2):
    exit(sys.argv[0]+" <dir>")
directory=sys.argv[1]

# Rewrite every *.fastb file in the directory, replacing each line that
# contains '>' followed by a non-space character with ">dna"; all other
# lines are copied unchanged.
files=glob.glob(directory+"/*.fastb")
for file in files:
    with open("tmp.fastb","wt") as OUT:
        with open(file,"rt") as IN:
            for line in IN:
                # Raw string: \S is not a valid string escape
                if(rex.find(r">\S+",line)):
                    OUT.write(">dna\n")
                else: OUT.write(line)
    # Move the rewritten copy over the original without going through a
    # shell, so names with spaces or shell metacharacters are handled
    shutil.move("tmp.fastb",file)


コード例 #15
0
EXPRESSED=ASSEMBLY+"/expressed.txt"

# expressed: set of transcript IDs (second column) from the four-column
# lines of the EXPRESSED file; lines with any other column count are skipped
expressed={}
with open(EXPRESSED,"rt") as fh:
    for line in fh:
        fields=line.split()
        if(len(fields)!=4): continue
        (gene,trans,meanFPKM,SS)=fields
        expressed[trans]=True

# hasAlts / supportedAlts: keys "indiv hap baseTrans" for expressed base
# transcripts seen in READS, and the subset seen with >= MIN_READS reads
hasAlts={}
supportedAlts={}
with open(READS,"rt") as fh:
    for line in fh:
        fields=line.split()
        if(len(fields)!=4): continue
        (indiv,hap,trans,reads)=fields
        # Raw string: \S and \d are not valid string escapes
        if(not rex.find(r"(\S+)_(\S+)_(\d+)",trans)): raise Exception(trans)
        alt=rex[1]
        baseTrans=rex[2]
        happ=rex[3]
        if(not expressed.get(baseTrans,False)): continue
        key=indiv+" "+str(hap)+" "+baseTrans
        hasAlts[key]=True
        if(int(reads)>=MIN_READS): supportedAlts[key]=True

# NOTE(review): raises ZeroDivisionError when hasAlts is empty
numHasAlts=len(hasAlts)
numSupportedAlts=len(supportedAlts)
proportion=float(numSupportedAlts)/float(numHasAlts)
print(proportion,"=",numSupportedAlts,"/",numHasAlts)


コード例 #16
0
jobName="TRIM"
maxParallel=1000
THREADS=31
TRIMMOMATIC="java -jar /data/reddylab/software/Trimmomatic-0.33/Trimmomatic-0.33/trimmomatic-0.33.jar PE"

#=========================================================================
# main()
#=========================================================================
if(len(sys.argv)!=5):
    exit(ProgramName.get()+" <adapters.fasta> <fastq-in> <fastq-out> <full-path-to-slurms>\n")
(adaptersFasta,fastqIn,fastqOut,slurmDir)=sys.argv[1:]

# Queue one Trimmomatic command per R1 file found in the input directory;
# the R2 mate's name is derived by swapping "R1" for "R2".
files=os.listdir(fastqIn)
writer=SlurmWriter()
for file in files:
    # Both dots of the suffix are escaped and the match is anchored at the
    # end of the name, because file2 below is built on the assumption
    # that the name ends in ".fastq.gz".
    if(not rex.find(r"(.*[_-])R1([_-].*)\.fastq\.gz$",file)): continue
    file1=file
    file2=rex[1]+"R2"+rex[2]+".fastq.gz"
    # Output names start with the part of the name before "R1"
    # (including its trailing '_' or '-')
    outPrefix=fastqOut+"/"+rex[1]
    cmd=" ".join([
        TRIMMOMATIC,
        "-threads",str(THREADS),
        "-phred33",
        fastqIn+"/"+file1,
        fastqIn+"/"+file2,
        outPrefix+"_FWD_paired.fq.gz",
        outPrefix+"_FWD_unpaired.fq.gz",
        outPrefix+"_REV_paired.fq.gz",
        outPrefix+"_REV_unpaired.fq.gz",
        "ILLUMINACLIP:"+adaptersFasta+":2:30:10:8:TRUE",
        "HEADCROP:1","LEADING:30","TRAILING:30",
        "SLIDINGWINDOW:4:15","MINLEN:36"])
    writer.addCommand("cd "+ROOT+"\n"+cmd)
writer.nice(NICE) # turns on "nice" (sets it to 100 by default)
writer.mem(MEM)
writer.threads(THREADS)
コード例 #17
0
        rec=hash[transcript]
        if(rec.get("supported",None) is None): continue
        numCryptic=rec["numSites"]
        supported=rec["supported"]
        array=tabulated.get(numCryptic,None)
        if(array is None): array=tabulated[numCryptic]=[]
        array.append(rec)
    return tabulated

#=============================== main() =================================
expressed=loadExpressed(EXPRESSED)
counts={}
# Visit every entry of COMBINED named HG<digits> or NA<digits> that has an
# RNA/stringtie.gff file, accumulating into `counts`
dirs=os.listdir(COMBINED)
for indiv in dirs:
    indiv=indiv.rstrip()
    # Raw strings: \d is not a valid string escape
    if(not rex.find(r"^HG\d+$",indiv) and not rex.find(r"^NA\d+$",indiv)):
        continue
    if(not os.path.exists(COMBINED+"/"+indiv+"/RNA/stringtie.gff")):
        continue
    dir=COMBINED+"/"+indiv
    changes=loadStructureChanges(dir)
    loadCrypskipCounts(indiv,dir,changes,counts)
# Write one file per key of the tabulation, OUTDIR/<key>.txt, holding the
# "supported" value of each record on its own line
tabulated=tabulate(counts)
keys=tabulated.keys()
for key in keys:
    array=tabulated[key]
    filename=OUTDIR+"/"+str(key)+".txt"
    with open(filename,"wt") as fh:
        for rec in array:
            supported=rec["supported"]
            fh.write(str(supported)+"\n")
コード例 #18
0
# Exactly four command-line arguments are required; otherwise print the
# usage string and exit.
if(len(sys.argv)!=5):
    exit(sys.argv[0]+
         " <in.broken-sites> <junctions.bed> <in.gff> <in.readcounts>")
(infile,junctionsFile,gffFile,readCountsFile)=sys.argv[1:]

#============================= main() =================================

# Read the readcounts file
totalMappedReads=None
readCounts={}
with open(readCountsFile,"rt") as IN:
    while(True):
        line=IN.readline()
        if(line==""): break # readline() returns "" only at end of file
        if(rex.find("TOTAL MAPPED READS:\s*(\d+)",line)):
            # Kept as the captured text; it is not converted to int here
            totalMappedReads=rex[1]
        else:
            # Every other line must split into exactly two tokens, else
            # the unpacking raises ValueError.  Counts are kept as text.
            fields=line.split()
            (gene,count)=fields
            readCounts[gene]=count

# Read GFF file to find annotated sites to exclude
exclude={}
reader=GffTranscriptReader()
transcripts=reader.loadGFF(gffFile)
for transcript in transcripts:
    # Transcripts whose ID begins with "ALT" are skipped
    if(transcript.getID()[0:3]=="ALT"): continue
    substrate=transcript.getSubstrate()
    # exclude holds one dict per substrate, created on first use
    exclusions=exclude.get(substrate,None)
    if(exclusions is None): exclusions=exclude[substrate]={}