def loadGFF_transcript(self, fields, line, transcriptBeginEnd, GFF,
                       transcripts, readOrder, genes):
    """Register the transcript described by one "transcript" feature line.

    The line is ignored unless it carries a ``transcript_id`` attribute.

    Args:
        fields: columns of the line; fields[0] and fields[1] become the
            substrate and source, fields[3] - 1 and fields[4] the begin and
            end, fields[6] the strand, and fields[8:] the extra fields.
        line: raw text of the line, searched for the ID attributes.
        transcriptBeginEnd: dict updated with transcriptId -> [begin, end].
        GFF: not used by this method.
        transcripts: dict of transcriptId -> Transcript; a new Transcript is
            created and stored when the ID is not present yet.
        readOrder: stored as ``readOrder`` on a newly created Transcript.
        genes: dict of geneId -> Gene; a new Gene is created when needed.

    Raises:
        Exception: if neither a ``genegrp=`` nor a ``gene_id`` attribute
            can be found on the line.
    """
    begin = int(fields[3]) - 1
    end = int(fields[4])
    rex = Rex()
    # NOTE(review): everything below is assumed to be conditional on the
    # transcript_id match (the original nesting was not recoverable) --
    # confirm against the upstream file.
    if not rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?', line):
        return
    transcriptId = rex[1]
    transcriptBeginEnd[transcriptId] = [begin, end]
    strand = fields[6]
    # Every column from the ninth onwards, each followed by one space.
    transcriptExtraFields = "".join(field + " " for field in fields[8:])

    transcript = transcripts.get(transcriptId, None)
    if transcript is None:
        transcript = Transcript(transcriptId, strand)
        transcripts[transcriptId] = transcript
        transcript.setStopCodons(self.stopCodons)
        transcript.readOrder = readOrder
        # NOTE(review): if readOrder is a plain int this rebinding is not
        # visible to the caller -- confirm the caller advances its counter.
        readOrder += 1
        transcript.substrate = fields[0]
        transcript.source = fields[1]
        transcript.setBegin(begin)
        transcript.setEnd(end)

    # "genegrp=" takes precedence over "gene_id".
    geneId = None
    if rex.find(r"genegrp=(\S+)", line):
        geneId = rex[1]
    elif rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?', line):
        geneId = rex[1]
    if not geneId:
        raise Exception("can't parse GTF: " + line)
    transcript.geneId = geneId

    gene = genes.get(geneId, None)
    if not gene:
        gene = Gene()
        genes[geneId] = gene
        gene.setId(geneId)
    transcript.setGene(gene)
    gene.addTranscript(transcript)
    transcript.extraFields = transcriptExtraFields
def parseRecord(self, fields):
    """Convert the nine columns of one GFF3 line into a record dict.

    Args:
        fields: the nine column values, in file order; the last one is the
            ';'-separated list of ``key=value`` attributes.

    Returns:
        A dict with keys "substrate", "source", "type", "begin", "end",
        "score", "strand", "frame" and "extra".  "begin" is the integer
        value of the start column minus one, "end" is the integer value of
        the end column, and "extra" maps attribute keys to values.

    Raises:
        Exception: if there are more than nine fields, or an attribute is
            not of the form ``key=value``.
        ValueError: if there are fewer than nine fields, or a coordinate is
            not an integer.
    """
    numFields = len(fields)
    if numFields > 9:
        raise Exception("too many fields in GFF3 record: " +
                        "\t".join(fields))
    if numFields < 9:
        # The tuple unpacking below would raise ValueError anyway; raise the
        # same type with a message that names the offending record.
        raise ValueError("too few fields in GFF3 record: " +
                         "\t".join(fields))
    (substrate, source, featureType, begin, end, score, strand, frame,
     extra) = fields

    extraHash = {}
    for field in extra.rstrip().split(";"):
        if not field.strip():
            # A trailing ';' (or an empty attribute column) leaves an empty
            # fragment behind; that is not a malformed attribute.
            continue
        # Split on the FIRST '=' so that a value containing '=' stays whole.
        (key, separator, value) = field.partition("=")
        if not (separator and key and value):
            raise Exception("Can't parse GFF3 field: " + field)
        extraHash[key] = value

    return {
        "substrate": substrate,
        "source": source,
        "type": featureType,
        "begin": int(begin) - 1,
        "end": int(end),
        "score": score,
        "strand": strand,
        "frame": frame,
        "extra": extraHash,
    }
def loadGFF_UTR(self, fields, line, transcriptBeginEnd, GFF,
                transcripts, readOrder, genes):
    """Attach the feature on one GFF line to its transcript as an Exon.

    The transcript ID is taken from ``transgrp``, ``transcript_id`` or
    ``Parent=`` (tried in that order) and the gene ID from ``genegrp=`` or
    ``gene_id``; when only one of the two is found it is used for both.
    The Exon is appended to ``transcript.rawExons`` when that attribute is
    not None; otherwise it is appended to ``transcript.UTR`` unless
    ``transcript.exonOverlapsExon`` reports an overlap.

    Args:
        fields: columns of the line (substrate, source, type, start, end,
            score, strand, frame, then any extra columns).
        line: raw text of the line, searched for the ID attributes.
        transcriptBeginEnd: dict of transcriptId -> (begin, end); when it has
            an entry for this transcript, a newly created Transcript takes
            its begin/end from there instead of from this feature.
        GFF: not used by this method.
        transcripts: dict of transcriptId -> Transcript, extended as needed.
        readOrder: stored as ``readOrder`` on a newly created Transcript.
        genes: dict of geneId -> Gene, extended as needed.

    Raises:
        Exception: if the line carries neither a transcript nor a gene ID.
    """
    exonBegin = int(fields[3]) - 1
    exonEnd = int(fields[4])
    exonScore = fields[5]
    strand = fields[6]
    frame = fields[7]
    rex = Rex()

    transcriptId = None
    if rex.find(r'transgrp[:=]\s*(\S+)', line):
        transcriptId = rex[1]
    elif rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?', line):
        transcriptId = rex[1]
    elif rex.find(r'Parent=([^;,\s]+)', line):
        transcriptId = rex[1]

    geneId = None
    if rex.find(r'genegrp=(\S+)', line):
        geneId = rex[1]
    elif rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?', line):
        geneId = rex[1]

    # Let whichever ID was found stand in for the missing one.
    if transcriptId is None:
        transcriptId = geneId
    if geneId is None:
        geneId = transcriptId
    if transcriptId is None:
        raise Exception(line + " : no transcript ID found")

    # Drop a semicolon captured at the very end of either ID.
    if rex.find(r"(\S+);$", transcriptId):
        transcriptId = rex[1]
    if rex.find(r"(\S+);$", geneId):
        geneId = rex[1]

    # Every column from the ninth onwards, each followed by one space.
    extra = "".join(field + " " for field in fields[8:])

    if exonBegin > exonEnd:
        (exonBegin, exonEnd) = (exonEnd, exonBegin)

    transcript = transcripts.get(transcriptId, None)
    if not transcript:
        transcript = Transcript(transcriptId, strand)
        transcripts[transcriptId] = transcript
        transcript.setStopCodons(self.stopCodons)
        transcript.readOrder = readOrder
        # NOTE(review): if readOrder is a plain int this rebinding is not
        # visible to the caller -- confirm the caller advances its counter.
        readOrder += 1
        transcript.substrate = fields[0]
        transcript.source = fields[1]
        if transcriptBeginEnd.get(transcriptId, None) is not None:
            (begin, end) = transcriptBeginEnd[transcriptId]
            transcript.setBegin(begin)
            transcript.setEnd(end)
        else:
            transcript.setBegin(exonBegin)
            transcript.setEnd(exonEnd)
    transcript.geneId = geneId

    gene = genes.get(geneId, None)
    if gene is None:
        gene = Gene()
        genes[geneId] = gene
        gene.setId(geneId)
    transcript.setGene(gene)

    exon = Exon(exonBegin, exonEnd, transcript)
    exon.extraFields = extra
    if transcript.rawExons is not None:
        exon.frame = frame
        exon.score = exonScore
        exon.type = fields[2]
        transcript.rawExons.append(exon)
    elif not transcript.exonOverlapsExon(exon):
        exon.frame = frame
        exon.score = exonScore
        exon.type = fields[2]
        transcript.UTR.append(exon)  # OK -- we sort later
    gene.addTranscript(transcript)
def loadGFF_UTR(self, fields, line, transcriptBeginEnd, GFF,
                transcripts, readOrder, genes):
    """Attach the feature on one GFF line to its transcript as an Exon.

    The transcript ID is taken from ``transgrp``, ``transcript_id`` or
    ``Parent=`` (tried in that order) and the gene ID from ``genegrp=`` or
    ``gene_id``; when only one of the two is found it is used for both.
    The Exon is appended to ``transcript.rawExons`` when that attribute is
    not None; otherwise it is appended to ``transcript.UTR`` unless
    ``transcript.exonOverlapsExon`` reports an overlap.

    Args:
        fields: columns of the line (substrate, source, type, start, end,
            score, strand, frame, then any extra columns).
        line: raw text of the line, searched for the ID attributes.
        transcriptBeginEnd: dict of transcriptId -> (begin, end); when it has
            an entry for this transcript, a newly created Transcript takes
            its begin/end from it.
        GFF: not used by this method.
        transcripts: dict of transcriptId -> Transcript, extended as needed.
        readOrder: stored as ``readOrder`` on a newly created Transcript.
        genes: dict of geneId -> Gene, extended as needed.

    Raises:
        Exception: if the line carries neither a transcript nor a gene ID.
    """
    exonBegin = int(fields[3]) - 1
    exonEnd = int(fields[4])
    exonScore = fields[5]
    strand = fields[6]
    frame = fields[7]
    rex = Rex()

    transcriptId = None
    if rex.find(r'transgrp[:=]\s*(\S+)', line):
        transcriptId = rex[1]
    elif rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?', line):
        transcriptId = rex[1]
    elif rex.find(r'Parent=([^;,\s]+)', line):
        transcriptId = rex[1]

    geneId = None
    if rex.find(r'genegrp=(\S+)', line):
        geneId = rex[1]
    elif rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?', line):
        geneId = rex[1]

    # Let whichever ID was found stand in for the missing one.
    if transcriptId is None:
        transcriptId = geneId
    if geneId is None:
        geneId = transcriptId
    if transcriptId is None:
        raise Exception(line + " : no transcript ID found")

    # Drop a semicolon captured at the very end of either ID.
    if rex.find(r"(\S+);$", transcriptId):
        transcriptId = rex[1]
    if rex.find(r"(\S+);$", geneId):
        geneId = rex[1]

    # Every column from the ninth onwards, each followed by one space.
    extra = "".join(field + " " for field in fields[8:])

    if exonBegin > exonEnd:
        (exonBegin, exonEnd) = (exonEnd, exonBegin)

    transcript = transcripts.get(transcriptId, None)
    if not transcript:
        transcript = Transcript(transcriptId, strand)
        transcripts[transcriptId] = transcript
        transcript.setStopCodons(self.stopCodons)
        transcript.readOrder = readOrder
        # NOTE(review): if readOrder is a plain int this rebinding is not
        # visible to the caller -- confirm the caller advances its counter.
        readOrder += 1
        transcript.substrate = fields[0]
        transcript.source = fields[1]
        # Mappings offer .get(), not .find(); the lookup used to fail with
        # AttributeError for a plain dict.
        if transcriptBeginEnd.get(transcriptId, None) is not None:
            (begin, end) = transcriptBeginEnd[transcriptId]
            transcript.setBegin(begin)
            transcript.setEnd(end)
        # NOTE(review): without an entry in transcriptBeginEnd the new
        # transcript's begin/end are not set here -- confirm they are
        # established elsewhere.
    transcript.geneId = geneId

    gene = genes.get(geneId, None)
    if gene is None:
        gene = Gene()
        genes[geneId] = gene
        gene.setId(geneId)
    transcript.setGene(gene)

    exon = Exon(exonBegin, exonEnd, transcript)
    exon.extraFields = extra
    if transcript.rawExons is not None:
        exon.frame = frame
        exon.score = exonScore
        exon.type = fields[2]
        transcript.rawExons.append(exon)
    elif not transcript.exonOverlapsExon(exon):
        exon.frame = frame
        exon.score = exonScore
        exon.type = fields[2]
        transcript.UTR.append(exon)  # OK -- we sort later
    gene.addTranscript(transcript)
def loadGFF_transcript(self, fields, line, transcriptBeginEnd, GFF,
                       transcripts, readOrder, genes):
    """Register the transcript described by one "transcript" feature line.

    The line is ignored unless it carries a ``transcript_id`` attribute.

    Args:
        fields: columns of the line; fields[0] and fields[1] become the
            substrate and source, fields[3] - 1 and fields[4] the begin and
            end, fields[5] the score, fields[6] the strand, and fields[8:]
            the extra fields.
        line: raw text of the line, searched for the ID attributes.
        transcriptBeginEnd: dict updated with transcriptId -> [begin, end].
        GFF: not used by this method.
        transcripts: dict of transcriptId -> Transcript; a new Transcript is
            created and stored when the ID is not present yet.
        readOrder: stored as ``readOrder`` on a newly created Transcript.
        genes: dict of geneId -> Gene; a new Gene is created when needed.

    Raises:
        Exception: if neither a ``genegrp=`` nor a ``gene_id`` attribute
            can be found on the line.
        ValueError: if the score column is neither "." nor a number and the
            transcript has no score yet.
    """
    begin = int(fields[3]) - 1
    end = int(fields[4])
    rex = Rex()
    # NOTE(review): everything below is assumed to be conditional on the
    # transcript_id match (the original nesting was not recoverable) --
    # confirm against the upstream file.
    if not rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?', line):
        return
    transcriptId = rex[1]
    transcriptBeginEnd[transcriptId] = [begin, end]
    strand = fields[6]
    score = fields[5]
    # Every column from the ninth onwards, each followed by one space.
    transcriptExtraFields = "".join(field + " " for field in fields[8:])

    transcript = transcripts.get(transcriptId, None)
    if transcript is None:
        transcript = Transcript(transcriptId, strand)
        transcripts[transcriptId] = transcript
        transcript.setStopCodons(self.stopCodons)
        transcript.readOrder = readOrder
        # NOTE(review): if readOrder is a plain int this rebinding is not
        # visible to the caller -- confirm the caller advances its counter.
        readOrder += 1
        transcript.substrate = fields[0]
        transcript.source = fields[1]
        transcript.setBegin(begin)
        transcript.setEnd(end)
    # A score of "." is treated as absent; an existing score is kept.
    if transcript.score is None and score != ".":
        transcript.score = float(score)

    # "genegrp=" takes precedence over "gene_id".
    geneId = None
    if rex.find(r"genegrp=(\S+)", line):
        geneId = rex[1]
    elif rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?', line):
        geneId = rex[1]
    if not geneId:
        raise Exception("can't parse GTF: " + line)
    transcript.geneId = geneId

    gene = genes.get(geneId, None)
    if not gene:
        gene = Gene()
        genes[geneId] = gene
        gene.setId(geneId)
    transcript.setGene(gene)
    gene.addTranscript(transcript)
    transcript.extraFields = transcriptExtraFields
def parseRecord(self, fields):
    """Convert the nine columns of one GFF3 line into a record dict.

    Args:
        fields: the nine column values, in file order; the last one is the
            ';'-separated list of ``key=value`` attributes.

    Returns:
        A dict with keys "substrate", "source", "type", "begin", "end",
        "score", "strand", "frame" and "extra".  All values are passed
        through exactly as given (begin/end are not converted), except
        "extra", which maps attribute keys to values.

    Raises:
        Exception: if there are more than nine fields, or an attribute is
            not of the form ``key=value``.
        ValueError: if there are fewer than nine fields.
    """
    numFields = len(fields)
    if numFields > 9:
        raise Exception("too many fields in GFF3 record: " +
                        "\t".join(fields))
    if numFields < 9:
        # The tuple unpacking below would raise ValueError anyway; raise the
        # same type with a message that names the offending record.
        raise ValueError("too few fields in GFF3 record: " +
                         "\t".join(fields))
    (substrate, source, featureType, begin, end, score, strand, frame,
     extra) = fields

    extraHash = {}
    for field in extra.rstrip().split(";"):
        if not field.strip():
            # A trailing ';' (or an empty attribute column) leaves an empty
            # fragment behind; that is not a malformed attribute.
            continue
        # Split on the FIRST '=' so that a value containing '=' stays whole.
        (key, separator, value) = field.partition("=")
        if not (separator and key and value):
            raise Exception("Can't parse GFF3 field: " + field)
        extraHash[key] = value

    return {
        "substrate": substrate,
        "source": source,
        "type": featureType,
        "begin": begin,
        "end": end,
        "score": score,
        "strand": strand,
        "frame": frame,
        "extra": extraHash,
    }
# Exactly seven command-line arguments are required.
if len(sys.argv[1:]) != 7:
    exit(sys.argv[0] +
         " <indiv> <hap> <in.broken-sites> <junctions.bed> <in.gff> <in.readcounts> <all-broken-sites.txt>")
indiv, hap, infile, junctionsFile, gffFile, readCountsFile, masterFile = \
    sys.argv[1:]

#============================= main() =================================

# Read the readcounts file: one "TOTAL MAPPED READS" line plus two-column
# gene/count lines (the values are kept as strings)
totalMappedReads = None
readCounts = {}
with open(readCountsFile, "rt") as IN:
    for line in IN:
        if rex.find("TOTAL MAPPED READS:\s*(\d+)", line):
            totalMappedReads = rex[1]
            continue
        fields = line.split()
        gene, count = fields
        readCounts[gene] = count

# Read GFF file to find annotated sites to exclude
gff = {}
exclude = {}
reader = GffTranscriptReader()
transcripts = reader.loadGFF(gffFile)
for transcript in transcripts:
    # Transcripts whose ID begins with "ALT" are skipped
    if transcript.getID()[:3] == "ALT":
        continue
    # Index the transcript by the part of its ID before "_<digit>"
    if rex.find("(\S+)_\d", transcript.getID()):
        gff[rex[1]] = transcript
    substrate = transcript.getSubstrate()
from builtins import (bytes, dict, int, list, object, range, str, ascii, chr,
                      hex, input, next, oct, open, pow, round, super, filter,
                      map, zip)
# The above imports should allow this program to run in both Python 2 and
# Python 3. You might need to update your version of module "future".
import sys
from Rex import Rex
rex = Rex()

# For each q in a fixed list, report the p-value just before the first sorted
# p-value that exceeds its threshold (rank/N)*q.

if len(sys.argv) != 2:
    exit(sys.argv[0] + " <p-values.txt>")
infile = sys.argv[1]

# Collect the first token of each line that matches the pattern below.
# NOTE(review): the pattern needs a digit with at least one non-space
# character on each side, so tokens such as "1" or "5e-8" are skipped --
# confirm this is intended for the input format.
values = []
with open(infile, "rt") as fh:
    for line in fh:
        if rex.find(r"(\S+\d\S+)", line):
            values.append(float(rex[1]))
values.sort()
L = len(values)

qValues = (0.1, 0.05, 0.01, 0.005, 0.001)
for q in qValues:
    bestP = None
    for i, P in enumerate(values):
        threshold = float(i + 1) / float(L) * q
        if P > threshold:
            # First failure: the previous p-value (if any) is the cutoff.
            if i > 0:
                bestP = values[i - 1]
            break
    else:
        # No p-value exceeded its threshold, so the largest one is the
        # cutoff.  This case used to print "None", exactly like the case
        # where nothing passes at all.
        if L > 0:
            bestP = values[-1]
    print("q=" + str(q) + " p=" + str(bestP))
print(ID,median,CI_left,CI_right,Preg,sep="\t") #========================================================================= # main() #========================================================================= (options,args)=getopt.getopt(sys.argv[1:],"s:t:") if(len(args)!=6): exit(ProgramName.get()+" [-s stanfile] [-t thetafile] <model> <min-effect> <input.txt> <output.txt> <#MCMC-samples> <firstVariant-lastVariant>\n -s = save raw STAN file\n -t = save theta samples\n variant range is zero-based and inclusive\n min-effect (lambda) must be >= 1\n") (model,minEffect,inFile,outfile,numSamples,numVariants)=args stanFile=None thetaFile=None for pair in options: (key,value)=pair if(key=="-s"): stanFile=value if(key=="-t"): thetaFile=value if(not rex.find("(\d+)-(\d+)",numVariants)): exit(numVariants+": specify range of variants: first-last") firstIndex=int(rex[1]) lastIndex=int(rex[2]) minEffect=float(minEffect) if(minEffect<1): raise Exception("Min-effect must be >= 1") THETA=None if(thetaFile is not None): THETA=open(thetaFile,"wt") # Process all input lines, each line = one variant (one MCMC run) thetaIndex=None variantIndex=0 with open(inFile,"rt") as IN: for line in IN: # Check whether this variant is in the range to be processed
rex = Rex()

# Write the sequence of every annotated transcript to a FASTA file.

# Process command line
if len(sys.argv) != 4:
    exit(sys.argv[0] + " <in.fasta> <in.gff> <out.fasta>")
(fastaFile, gffFile, outFile) = sys.argv[1:]

# Read GFF
gffReader = GffTranscriptReader()
transcriptsBySubstrate = gffReader.hashBySubstrate(gffFile)

# Process each substrate in the FASTA file.  The output file and the FASTA
# reader are released even when parsing fails part-way through.
writer = FastaWriter()
with open(outFile, "wt") as OUT:
    reader = FastaReader(fastaFile)
    try:
        while True:
            [defline, seq] = reader.nextSequence()
            if not defline:
                break
            if not rex.find(r"^\s*>\s*(\S+)", defline):
                exit("Can't parse defline: " + defline)
            substrateId = rex[1]
            transcripts = transcriptsBySubstrate.get(substrateId, None)
            if not transcripts:
                continue  # nothing annotated on this substrate
            for transcript in transcripts:
                transSeq = transcript.loadTranscriptSeq(seq)
                writer.addToFasta(">" + transcript.getID(), transSeq, OUT)
    finally:
        reader.close()
with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys import ProgramName from FastaReader import FastaReader from FastaWriter import FastaWriter from Rex import Rex rex = Rex() #========================================================================= # main() #========================================================================= if (len(sys.argv) != 3): exit(ProgramName.get() + " <in.fasta> <out.fasta>\n") (infile, outfile) = sys.argv[1:] OUT = open(outfile, "wt") writer = FastaWriter() reader = FastaReader(infile) while (True): (defline, seq) = reader.nextSequence() if (not defline): break if (not rex.find(">chr", defline)): continue writer.addToFasta(defline, seq, OUT) OUT.close()
hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys from Rex import Rex rex = Rex() #========================================================================= # main() #========================================================================= counts = {} for line in sys.stdin: fields = line.rstrip().split() id = fields[0] alleles = counts.get(id, None) if (alleles is None): alleles = counts[id] = {} for field in fields[1:]: if (not rex.find("(\S+)=(\d+)", field)): raise Exception("can't parse field: " + field) allele = rex[1] count = int(rex[2]) alleles[allele] = alleles.get(allele, 0) + count for variant in counts.keys(): print(variant, end="") alleles = counts[variant] for allele in alleles.keys(): count = alleles[allele] print("\t" + allele + "=" + str(count), end="") print()
# The single command-line argument is the phenotype number; it is converted
# to a zero-based index.
(phenotypeID, ) = sys.argv[1:]
phenotypeID = int(phenotypeID) - 1
if phenotypeID < 0:
    exit("phenotype must be 1-5")
phenotypes = loadPhenotypes(PHENOTYPES)
effects = loadEffects(EFFECTS)
IDs = None
points = []
initialized = False
predictorNum = 1
for chrom in CHROMS:
    vcf = VCF + "/" + chrom + "/" + chrom + ".vcf.gz"
    #print("processing",vcf,flush=True)
    # "with" closes each chromosome's file before the next one is opened.
    with gzip.open(vcf, "rt") as IN:
        for line in IN:
            if rex.find(r"^#CHROM", line):
                # Header line: columns from the tenth onwards are the IDs.
                fields = line.rstrip().split()
                IDs = fields[9:]
                if not initialized:
                    initializePoints(IDs, phenotypes, effects, phenotypeID,
                                     points)
                    initialized = True
                continue
            if rex.find(r"^\s*#", line):
                continue  # other comment / metadata lines
            fields = line.rstrip().split()
            # Only columns 3-5 are used; the loop variable "chrom" is no
            # longer overwritten by the line's first column.
            (variant, ref, alt) = fields[2:5]
            genotypes = fields[9:]
            kept = processVariant(variant, ref, alt, IDs, genotypes,
                                  phenotypes, effects, phenotypeID, points)
            # NOTE(review): the increment is assumed to belong to the "kept"
            # branch (the original nesting was not recoverable) -- confirm.
            if kept:
                print(predictorNum, variant, sep="\t")
                predictorNum += 1
# License (GPL) version 3, as described at www.opensource.org. # Copyright (C)2016 William H. Majoros ([email protected]). #========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys import os import glob from Rex import Rex rex=Rex() if(len(sys.argv)!=2): exit(sys.argv[0]+" <dir>") directory=sys.argv[1] files=glob.glob(directory+"/*.fastb") for file in files: with open("tmp.fastb","wt") as OUT: with open(file,"rt") as IN: for line in IN: if(rex.find(">\S+",line)): OUT.write(">dna\n") else: OUT.write(line) os.system("mv tmp.fastb "+file)
EXPRESSED = ASSEMBLY + "/expressed.txt"

# Transcripts listed in the expressed file (second of four columns)
expressed = {}
with open(EXPRESSED, "rt") as fh:
    for line in fh:
        fields = line.split()
        if len(fields) == 4:
            gene, trans, meanFPKM, SS = fields
            expressed[trans] = True

# Scan the read-count file; keys are "<indiv> <hap> <baseTrans>"
hasAlts = {}
supportedAlts = {}
with open(READS, "rt") as fh:
    for line in fh:
        fields = line.split()
        if len(fields) != 4:
            continue
        indiv, hap, trans, reads = fields
        if not rex.find("(\S+)_(\S+)_(\d+)", trans):
            raise Exception(trans)
        alt = rex[1]
        baseTrans = rex[2]
        happ = rex[3]
        # Only transcripts present in the expressed file are counted
        if not expressed.get(baseTrans, False):
            continue
        key = " ".join([indiv, str(hap), baseTrans])
        hasAlts[key] = True
        if MIN_READS <= int(reads):
            supportedAlts[key] = True

# Report the supported fraction together with its numerator and denominator
numHasAlts = len(hasAlts)
numSupportedAlts = len(supportedAlts)
proportion = float(numSupportedAlts) / float(numHasAlts)
print(proportion, "=", numSupportedAlts, "/", numHasAlts)
jobName = "TRIM"
maxParallel = 1000
THREADS = 31
TRIMMOMATIC = "java -jar /data/reddylab/software/Trimmomatic-0.33/Trimmomatic-0.33/trimmomatic-0.33.jar PE"

#=========================================================================
# main()
#=========================================================================
if len(sys.argv) != 5:
    exit(ProgramName.get() + " <adapters.fasta> <fastq-in> <fastq-out> <full-path-to-slurms>\n")
(adaptersFasta, fastqIn, fastqOut, slurmDir) = sys.argv[1:]

# Queue one paired-end Trimmomatic command per R1 file in the input directory.
files = os.listdir(fastqIn)
writer = SlurmWriter()
for file in files:
    # Both dots are now literal and the name must END in ".fastq.gz", so
    # side files such as "x_R1_1.fastq.gz.md5" are no longer picked up.
    if not rex.find(r"(.*[_-])R1([_-].*)\.fastq\.gz$", file):
        continue
    prefix = rex[1]
    suffix = rex[2]
    file1 = file
    file2 = prefix + "R2" + suffix + ".fastq.gz"  # mate of the R1 file
    outPrefix = fastqOut + "/" + prefix
    cmd = " ".join([
        TRIMMOMATIC,
        "-threads", str(THREADS),
        "-phred33",
        fastqIn + "/" + file1,
        fastqIn + "/" + file2,
        outPrefix + "_FWD_paired.fq.gz",
        outPrefix + "_FWD_unpaired.fq.gz",
        outPrefix + "_REV_paired.fq.gz",
        outPrefix + "_REV_unpaired.fq.gz",
        "ILLUMINACLIP:" + adaptersFasta + ":2:30:10:8:TRUE",
        "HEADCROP:1 LEADING:30 TRAILING:30",
        "SLIDINGWINDOW:4:15 MINLEN:36",
    ])
    writer.addCommand("cd " + ROOT + "\n" + cmd)
writer.nice(NICE)  # turns on "nice" (sets it to 100 by default)
writer.mem(MEM)
writer.threads(THREADS)
# NOTE(review): the statements down to "return tabulated" are the tail of a
# function (presumably tabulate(), which is called below) whose header and
# enclosing loop lie outside this view; the indentation is a best guess.
        rec=hash[transcript]
        # Records without a "supported" value are skipped.
        if(rec.get("supported",None) is None): continue
        numCryptic=rec["numSites"]
        supported=rec["supported"]
        # Group the records by their "numSites" value.
        array=tabulated.get(numCryptic,None)
        if(array is None): array=tabulated[numCryptic]=[]
        array.append(rec)
    return tabulated

#=============================== main() =================================
expressed=loadExpressed(EXPRESSED)
counts={}
dirs=os.listdir(COMBINED)
for indiv in dirs:
    indiv=indiv.rstrip()
    # Only directories named HG<digits> or NA<digits> are processed.
    if(not rex.find("^HG\d+$",indiv) and not rex.find("^NA\d+$",indiv)):
        continue
    # ...and only those that contain RNA/stringtie.gff.
    if(not os.path.exists(COMBINED+"/"+indiv+"/RNA/stringtie.gff")):
        continue
    dir=COMBINED+"/"+indiv
    changes=loadStructureChanges(dir)
    loadCrypskipCounts(indiv,dir,changes,counts)
tabulated=tabulate(counts)
# Write one file per key, holding one "supported" value per line.
keys=tabulated.keys()
for key in keys:
    array=tabulated[key]
    filename=OUTDIR+"/"+str(key)+".txt"
    with open(filename,"wt") as fh:
        for rec in array:
            supported=rec["supported"]
            fh.write(str(supported)+"\n")
# Exactly four command-line arguments are required.
if len(sys.argv[1:]) != 4:
    exit(sys.argv[0] +
         " <in.broken-sites> <junctions.bed> <in.gff> <in.readcounts>")
infile, junctionsFile, gffFile, readCountsFile = sys.argv[1:]

#============================= main() =================================

# Read the readcounts file: one "TOTAL MAPPED READS" line plus two-column
# gene/count lines (the values are kept as strings)
totalMappedReads = None
readCounts = {}
with open(readCountsFile, "rt") as IN:
    for line in IN:
        if rex.find("TOTAL MAPPED READS:\s*(\d+)", line):
            totalMappedReads = rex[1]
            continue
        fields = line.split()
        gene, count = fields
        readCounts[gene] = count

# Read GFF file to find annotated sites to exclude
exclude = {}
reader = GffTranscriptReader()
transcripts = reader.loadGFF(gffFile)
for transcript in transcripts:
    # Transcripts whose ID begins with "ALT" are skipped
    if transcript.getID()[:3] == "ALT":
        continue
    substrate = transcript.getSubstrate()
    # One exclusion table per substrate, created on first use
    exclusions = exclude.get(substrate, None)
    if exclusions is None:
        exclusions = {}
        exclude[substrate] = exclusions