def loadGFF_transcript(self,fields,line,transcriptBeginEnd,GFF,
                       transcripts,readOrder,genes):
    """Register the transcript described by one "transcript" GFF/GTF record.

    Nothing happens unless a ``transcript_id`` attribute is found in *line*.
    Otherwise the record's span is stored in *transcriptBeginEnd*, the
    Transcript is fetched from (or created in) *transcripts*, and it is
    attached to its Gene, which is fetched from (or created in) *genes*.

    Parameters:
        fields: the record's columns; columns 0, 1, 3, 4 and 6 are read as
            substrate, source, begin, end and strand, and every column from
            index 8 on is kept as free-text extra fields.
        line: the raw record text, searched for the ID attributes.
        transcriptBeginEnd: mapping of transcript ID -> [begin, end],
            updated in place.
        GFF: not used by this method.
        transcripts: mapping of transcript ID -> Transcript, updated in place.
        readOrder: stored as ``readOrder`` on a newly created transcript.
        genes: mapping of gene ID -> Gene, updated in place.

    Raises:
        Exception: if the record has a transcript ID but no gene ID.
    """
    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source; confirm which statements belong to the "new transcript" branch.
    begin=int(fields[3])-1  # begin is shifted down by one; end is kept as is
    end=int(fields[4])
    rex=Rex()
    if(not rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?',line)):
        return
    transcriptId=rex[1]
    transcriptBeginEnd[transcriptId]=[begin,end]
    strand=fields[6]
    # Columns 8.. are concatenated, each followed by a single space.
    transcriptExtraFields="".join(field+" " for field in fields[8:])
    transcript=transcripts.get(transcriptId,None)
    if(transcript is None):
        transcript=Transcript(transcriptId,strand)
        transcripts[transcriptId]=transcript
        transcript.setStopCodons(self.stopCodons)
        transcript.readOrder=readOrder
        # Only rebinds the local name; nothing is returned to the caller.
        readOrder+=1
        transcript.substrate=fields[0]
        transcript.source=fields[1]
        transcript.setBegin(begin)
        transcript.setEnd(end)
    # "genegrp=" takes precedence over "gene_id" when both are present.
    geneId=None
    if(rex.find(r"genegrp=(\S+)",line)): geneId=rex[1]
    elif(rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?',line)): geneId=rex[1]
    if(not geneId): raise Exception("can't parse GTF: "+line)
    transcript.geneId=geneId
    gene=genes.get(geneId,None)
    if(not gene):
        genes[geneId]=gene=Gene()
        gene.setId(geneId)
    transcript.setGene(gene)
    gene.addTranscript(transcript)
    transcript.extraFields=transcriptExtraFields
def parseRecord(self, fields):
    """Turn the nine columns of one GFF3 line into a record dictionary.

    Parameters:
        fields: sequence of exactly nine strings (substrate, source, type,
            begin, end, score, strand, frame, attributes).

    Returns:
        dict with keys "substrate", "source", "type", "begin", "end",
        "score", "strand", "frame" and "extra".  "begin" is ``int(begin)-1``
        and "end" is ``int(end)``; "extra" maps each attribute key to its
        value; the other entries are the column strings unchanged.

    Raises:
        Exception: if there are more than nine columns, or an attribute is
            not of the form ``key=value``.
        ValueError: if there are fewer than nine columns, or begin/end are
            not integers.
    """
    if (len(fields) > 9):
        raise Exception("too many fields in GFF3 record" + "\t".join(fields))
    if (len(fields) < 9):
        # Unpacking below would raise ValueError anyway; say why.
        raise ValueError("too few fields in GFF3 record: " +
                         "\t".join(fields))
    (substrate, source, featureType, begin, end, score, strand, frame,
     extra) = fields
    extraHash = {}
    rex = Rex()
    for field in extra.rstrip().split(";"):
        if (not field.strip()):
            # A trailing ";" (or ";;") leaves an empty piece; it carries no
            # attribute, so skip it rather than reject the whole record.
            continue
        if (not rex.find("(.+)=(.+)", field)):
            raise Exception("Can't parse GFF3 field: " + field)
        extraHash[rex[1]] = rex[2]
    return {
        "substrate": substrate,
        "source": source,
        "type": featureType,
        "begin": int(begin) - 1,
        "end": int(end),
        "score": score,
        "strand": strand,
        "frame": frame,
        "extra": extraHash
    }
def loadGFF_UTR(self,fields,line,transcriptBeginEnd,GFF,
                transcripts,readOrder,genes):
    """Attach the feature on one GFF/GTF record to its transcript.

    The transcript ID is taken from the first of ``transgrp``,
    ``transcript_id`` or ``Parent=`` found in *line*; the gene ID from
    ``genegrp=`` or ``gene_id``.  If only one of the two IDs is present it
    is used for both.  The Transcript and Gene are fetched from, or created
    in, *transcripts* and *genes*.

    Parameters:
        fields: the record's columns; 0-7 are read as substrate, source,
            type, begin, end, score, strand and frame, and every column
            from index 8 on is kept as free-text extra fields.
        line: the raw record text, searched for the ID attributes.
        transcriptBeginEnd: mapping of transcript ID -> (begin, end); when
            it has an entry, a newly created transcript takes its span from
            there, otherwise from this feature's coordinates.
        GFF: not used by this method.
        transcripts: mapping of transcript ID -> Transcript, updated in place.
        readOrder: stored as ``readOrder`` on a newly created transcript.
        genes: mapping of gene ID -> Gene, updated in place.

    Raises:
        Exception: if neither a transcript ID nor a gene ID can be found.
    """
    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source; confirm that the begin/end assignment belongs to the
    # "new transcript" branch.
    exonBegin=int(fields[3])-1  # begin is shifted down by one
    exonEnd=int(fields[4])
    exonScore=fields[5]
    strand=fields[6]
    frame=fields[7]
    rex=Rex()

    transcriptId=None
    if(rex.find(r'transgrp[:=]\s*(\S+)',line)): transcriptId=rex[1]
    elif(rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?',line)):
        transcriptId=rex[1]
    elif(rex.find(r'Parent=([^;,\s]+)',line)): transcriptId=rex[1]
    geneId=None
    if(rex.find(r'genegrp=(\S+)',line)): geneId=rex[1]
    elif(rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?',line)): geneId=rex[1]
    # Either ID stands in for the other when only one was found.
    if(transcriptId is None): transcriptId=geneId
    if(geneId is None): geneId=transcriptId
    if(transcriptId is None):
        raise Exception(line+" : no transcript ID found")
    # Drop a trailing ";" captured by the \S+ patterns above.
    if(rex.find(r"(\S+);$",transcriptId)): transcriptId=rex[1]
    if(rex.find(r"(\S+);$",geneId)): geneId=rex[1]

    # Columns 8.. are concatenated, each followed by a single space.
    extra="".join(field+" " for field in fields[8:])
    if(exonBegin>exonEnd): (exonBegin,exonEnd)=(exonEnd,exonBegin)

    transcript=transcripts.get(transcriptId,None)
    if(not transcript):
        transcript=Transcript(transcriptId,strand)
        transcripts[transcriptId]=transcript
        transcript.setStopCodons(self.stopCodons)
        transcript.readOrder=readOrder
        # Only rebinds the local name; nothing is returned to the caller.
        readOrder+=1
        transcript.substrate=fields[0]
        transcript.source=fields[1]
        if(transcriptBeginEnd.get(transcriptId,None) is not None):
            (begin,end)=transcriptBeginEnd[transcriptId]
            transcript.setBegin(begin)
            transcript.setEnd(end)
        else:
            transcript.setBegin(exonBegin)
            transcript.setEnd(exonEnd)
    transcript.geneId=geneId
    gene=genes.get(geneId,None)
    if(gene is None):
        genes[geneId]=gene=Gene()
        gene.setId(geneId)
    transcript.setGene(gene)

    exon=Exon(exonBegin,exonEnd,transcript)
    exon.extraFields=extra
    if(transcript.rawExons is not None):
        exon.frame=frame
        exon.score=exonScore
        exon.type=fields[2]
        transcript.rawExons.append(exon)
    elif(not transcript.exonOverlapsExon(exon)):
        exon.frame=frame
        exon.score=exonScore
        exon.type=fields[2]
        transcript.UTR.append(exon) # OK -- we sort later
    gene.addTranscript(transcript)
def loadGFF_UTR(self,fields,line,transcriptBeginEnd,GFF,
                transcripts,readOrder,genes):
    """Attach the feature on one GFF/GTF record to its transcript.

    The transcript ID is taken from the first of ``transgrp``,
    ``transcript_id`` or ``Parent=`` found in *line*; the gene ID from
    ``genegrp=`` or ``gene_id``.  If only one of the two IDs is present it
    is used for both.  The Transcript and Gene are fetched from, or created
    in, *transcripts* and *genes*.

    Parameters:
        fields: the record's columns; 0-7 are read as substrate, source,
            type, begin, end, score, strand and frame, and every column
            from index 8 on is kept as free-text extra fields.
        line: the raw record text, searched for the ID attributes.
        transcriptBeginEnd: mapping of transcript ID -> (begin, end); when
            it has an entry, a newly created transcript takes its span from
            there, otherwise from this feature's coordinates.
        GFF: not used by this method.
        transcripts: mapping of transcript ID -> Transcript, updated in place.
        readOrder: stored as ``readOrder`` on a newly created transcript.
        genes: mapping of gene ID -> Gene, updated in place.

    Raises:
        Exception: if neither a transcript ID nor a gene ID can be found.
    """
    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source; confirm that the begin/end assignment belongs to the
    # "new transcript" branch.
    exonBegin=int(fields[3])-1  # begin is shifted down by one
    exonEnd=int(fields[4])
    exonScore=fields[5]
    strand=fields[6]
    frame=fields[7]
    rex=Rex()

    transcriptId=None
    if(rex.find(r'transgrp[:=]\s*(\S+)',line)): transcriptId=rex[1]
    elif(rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?',line)):
        transcriptId=rex[1]
    elif(rex.find(r'Parent=([^;,\s]+)',line)): transcriptId=rex[1]
    geneId=None
    if(rex.find(r'genegrp=(\S+)',line)): geneId=rex[1]
    elif(rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?',line)): geneId=rex[1]
    # Either ID stands in for the other when only one was found.
    if(transcriptId is None): transcriptId=geneId
    if(geneId is None): geneId=transcriptId
    if(transcriptId is None):
        raise Exception(line+" : no transcript ID found")
    # Drop a trailing ";" captured by the \S+ patterns above.
    if(rex.find(r"(\S+);$",transcriptId)): transcriptId=rex[1]
    if(rex.find(r"(\S+);$",geneId)): geneId=rex[1]

    # Columns 8.. are concatenated, each followed by a single space.
    extra="".join(field+" " for field in fields[8:])
    if(exonBegin>exonEnd): (exonBegin,exonEnd)=(exonEnd,exonBegin)

    transcript=transcripts.get(transcriptId,None)
    if(not transcript):
        transcript=Transcript(transcriptId,strand)
        transcripts[transcriptId]=transcript
        transcript.setStopCodons(self.stopCodons)
        transcript.readOrder=readOrder
        # Only rebinds the local name; nothing is returned to the caller.
        readOrder+=1
        transcript.substrate=fields[0]
        transcript.source=fields[1]
        # transcriptBeginEnd is a mapping (it is indexed just below), so the
        # lookup must be .get(); mappings have no .find() method.
        span=transcriptBeginEnd.get(transcriptId,None)
        if(span is not None):
            (begin,end)=span
            transcript.setBegin(begin)
            transcript.setEnd(end)
        else:
            # No span recorded for this transcript: fall back to this
            # feature's own coordinates so begin/end are never left unset.
            transcript.setBegin(exonBegin)
            transcript.setEnd(exonEnd)
    transcript.geneId=geneId
    gene=genes.get(geneId,None)
    if(gene is None):
        genes[geneId]=gene=Gene()
        gene.setId(geneId)
    transcript.setGene(gene)

    exon=Exon(exonBegin,exonEnd,transcript)
    exon.extraFields=extra
    if(transcript.rawExons is not None):
        exon.frame=frame
        exon.score=exonScore
        exon.type=fields[2]
        transcript.rawExons.append(exon)
    elif(not transcript.exonOverlapsExon(exon)):
        exon.frame=frame
        exon.score=exonScore
        exon.type=fields[2]
        transcript.UTR.append(exon) # OK -- we sort later
    gene.addTranscript(transcript)
def parseRecord(self,fields):
    """Turn the nine columns of one GFF3 line into a record dictionary.

    Parameters:
        fields: sequence of exactly nine strings (substrate, source, type,
            begin, end, score, strand, frame, attributes).

    Returns:
        dict with keys "substrate", "source", "type", "begin", "end",
        "score", "strand", "frame" and "extra".  Every entry is the column
        string unchanged (begin and end are NOT converted to integers),
        except "extra", which maps each attribute key to its value.

    Raises:
        Exception: if there are more than nine columns, or an attribute is
            not of the form ``key=value``.
        ValueError: if there are fewer than nine columns.
    """
    if(len(fields)>9):
        raise Exception("too many fields in GFF3 record"+"\t".join(fields))
    if(len(fields)<9):
        # Unpacking below would raise ValueError anyway; say why.
        raise ValueError("too few fields in GFF3 record: "+"\t".join(fields))
    (substrate,source,featureType,begin,end,score,strand,frame,extra)=fields
    extraHash={}
    rex=Rex()
    for field in extra.rstrip().split(";"):
        if(not field.strip()):
            # A trailing ";" (or ";;") leaves an empty piece; it carries no
            # attribute, so skip it rather than reject the whole record.
            continue
        if(not rex.find("(.+)=(.+)",field)):
            raise Exception("Can't parse GFF3 field: "+field)
        extraHash[rex[1]]=rex[2]
    rec={"substrate":substrate,
         "source":source,
         "type":featureType,
         "begin":begin,
         "end":end,
         "score":score,
         "strand":strand,
         "frame":frame,
         "extra":extraHash}
    return rec
def crear_aldea(nombre, num_rex, num_spinosaurus, num_triceraptors):
    """Create an ``Aldea`` named *nombre* and fill it with dinosaurs.

    Adds, in this order, *num_rex* ``Rex``, *num_spinosaurus*
    ``Spinosaurus`` and *num_triceraptors* ``Triceraptors`` instances.  Each
    one is built with a name made of a species letter ("r", "s" or "t") plus
    its index, the constant 1000, a fresh ``random.randrange(-200, 200)``
    value, and the village itself.

    Returns:
        The populated ``Aldea``.
    """
    aldea = Aldea(nombre)

    def poblar(especie, prefijo, cantidad):
        # One dinosaur per index; the random draw happens once per dinosaur.
        for indice in range(cantidad):
            etiqueta = prefijo + str(indice)
            valor = random.randrange(-200, 200)
            aldea.add_dinosaurio(especie(etiqueta, 1000, valor, aldea))

    poblar(Rex, "r", num_rex)
    poblar(Spinosaurus, "s", num_spinosaurus)
    poblar(Triceraptors, "t", num_triceraptors)
    return aldea
def loadGFF_transcript(self,fields,line,transcriptBeginEnd,GFF,
                       transcripts,readOrder,genes):
    """Register the transcript described by one "transcript" GFF/GTF record.

    Nothing happens unless a ``transcript_id`` attribute is found in *line*.
    Otherwise the record's span is stored in *transcriptBeginEnd*, the
    Transcript is fetched from (or created in) *transcripts*, its score is
    filled in if still unset, and it is attached to its Gene, which is
    fetched from (or created in) *genes*.

    Parameters:
        fields: the record's columns; columns 0, 1, 3, 4, 5 and 6 are read
            as substrate, source, begin, end, score and strand, and every
            column from index 8 on is kept as free-text extra fields.
        line: the raw record text, searched for the ID attributes.
        transcriptBeginEnd: mapping of transcript ID -> [begin, end],
            updated in place.
        GFF: not used by this method.
        transcripts: mapping of transcript ID -> Transcript, updated in place.
        readOrder: stored as ``readOrder`` on a newly created transcript.
        genes: mapping of gene ID -> Gene, updated in place.

    Raises:
        Exception: if the record has a transcript ID but no gene ID.
        ValueError: if the score column is neither "." nor a number.
    """
    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source; confirm which statements belong to the "new transcript" branch.
    begin=int(fields[3])-1  # begin is shifted down by one; end is kept as is
    end=int(fields[4])
    rex=Rex()
    if(not rex.find(r'transcript_id[:=]?\s*"?([^\s";]+)"?',line)):
        return
    transcriptId=rex[1]
    transcriptBeginEnd[transcriptId]=[begin,end]
    strand=fields[6]
    score=fields[5]
    # Columns 8.. are concatenated, each followed by a single space.
    transcriptExtraFields="".join(field+" " for field in fields[8:])
    transcript=transcripts.get(transcriptId,None)
    if(transcript is None):
        transcript=Transcript(transcriptId,strand)
        transcripts[transcriptId]=transcript
        transcript.setStopCodons(self.stopCodons)
        transcript.readOrder=readOrder
        # Only rebinds the local name; nothing is returned to the caller.
        readOrder+=1
        transcript.substrate=fields[0]
        transcript.source=fields[1]
        transcript.setBegin(begin)
        transcript.setEnd(end)
    # "." in the score column means no score; never overwrite one already set.
    if(transcript.score is None and score!="."):
        transcript.score=float(score)
    # "genegrp=" takes precedence over "gene_id" when both are present.
    geneId=None
    if(rex.find(r"genegrp=(\S+)",line)): geneId=rex[1]
    elif(rex.find(r'gene_id[:=]?\s*"?([^\s\;"]+)"?',line)): geneId=rex[1]
    if(not geneId): raise Exception("can't parse GTF: "+line)
    transcript.geneId=geneId
    gene=genes.get(geneId,None)
    if(not gene):
        genes[geneId]=gene=Gene()
        gene.setId(geneId)
    transcript.setGene(gene)
    gene.addTranscript(transcript)
    transcript.extraFields=transcriptExtraFields
#!/usr/bin/env python #========================================================================= # This is OPEN SOURCE SOFTWARE governed by the Gnu General Public # License (GPL) version 3, as described at www.opensource.org. # Copyright (C)2016 William H. Majoros ([email protected]). #========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import os from Rex import Rex rex=Rex() COMBINED="/home/bmajoros/1000G/assembly/combined" OUTDIR="/home/bmajoros/1000G/assembly/cryptic" MIN_COUNT=3 EXPRESSED="/home/bmajoros/1000G/assembly/expressed.txt" def loadExpressed(filename): hash={} with open(filename,"rt") as fh: for line in fh: fields=line.split() (gene,transcript,fpkm,SS)=fields if(rex.find("ALT\d+_(\S+)",transcript)): ### transcript=rex[1] ### hash[transcript]=True return hash
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public # License (GPL) version 3, as described at www.opensource.org. #========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys import ProgramName import gzip from Rex import Rex rex = Rex() from scipy import stats from statsmodels.stats.multitest import multipletests def getCounts(filename, variants, MIN_COUNT): counts = {} with open(filename, "rt") as IN: for line in IN: fields = line.rstrip().split() if (len(fields) != 7): continue (id, chr, pos, ref, alt, refCount, altCount) = fields refCount = int(refCount) altCount = int(altCount) if (refCount + altCount < MIN_COUNT): continue counts[id] = [refCount, altCount]
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public # License (GPL) version 3, as described at www.opensource.org. # Copyright (C)2017 William H. Majoros ([email protected]). #========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys import os import ProgramName from SlurmWriter import SlurmWriter from Rex import Rex rex=Rex() ROOT="/home/bmajoros/PopSTARR/graham" MEM=50000 NICE=500 jobName="TRIM" maxParallel=1000 THREADS=31 TRIMMOMATIC="java -jar /data/reddylab/software/Trimmomatic-0.33/Trimmomatic-0.33/trimmomatic-0.33.jar PE" #========================================================================= # main() #========================================================================= if(len(sys.argv)!=5): exit(ProgramName.get()+" <adapters.fasta> <fastq-in> <fastq-out> <full-path-to-slurms>\n") (adaptersFasta,fastqIn,fastqOut,slurmDir)=sys.argv[1:]
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public # License (GPL) version 3, as described at www.opensource.org. # Copyright (C)2016 William H. Majoros ([email protected]). #========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys from Transcript import Transcript from Interval import Interval from GffTranscriptReader import GffTranscriptReader from Rex import Rex rex=Rex() if(len(sys.argv)!=8): exit(sys.argv[0]+ " <indiv> <hap> <in.broken-sites> <junctions.bed> <in.gff> <in.readcounts> <all-broken-sites.txt>") (indiv,hap,infile,junctionsFile,gffFile,readCountsFile,masterFile)=sys.argv[1:] #============================= main() ================================= # Read the readcounts file totalMappedReads=None readCounts={} with open(readCountsFile,"rt") as IN: while(True): line=IN.readline() if(line==""): break
#!/usr/bin/env python #========================================================================= # This is OPEN SOURCE SOFTWARE governed by the Gnu General Public # License (GPL) version 3, as described at www.opensource.org. # Copyright (C)2016 William H. Majoros ([email protected]). #========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys from Rex import Rex rex=Rex() if(len(sys.argv)!=2): exit(sys.argv[0]+" <p-values.txt>") infile=sys.argv[1] values=[] with open(infile,"rt") as fh: for line in fh: if(rex.find("(\S+\d\S+)",line)): values.append(float(rex[1])) values.sort() L=len(values) qValues=(0.1,0.05,0.01,0.005,0.001) for q in qValues: bestP=None
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public # License (GPL) version 3, as described at www.opensource.org. # Copyright (C)2018 William H. Majoros ([email protected]) #========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys import os import math import ProgramName from Rex import Rex rex=Rex() import TempFilename import getopt WARMUP=1000 ALPHA=0.05 STDERR=TempFilename.generate(".stderr") INPUT_FILE=TempFilename.generate(".staninputs") INIT_FILE=TempFilename.generate(".staninit") OUTPUT_TEMP=TempFilename.generate(".stanoutputs") def printFields(fields,hFile): numFields=len(fields) for i in range(7,numFields): print(i-6,"=",fields[i],sep="",end="",file=hFile) if(i<numFields-1): print("\t",end="",file=hFile)
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public # License (GPL) version 3, as described at www.opensource.org. # Copyright (C)2016 William H. Majoros ([email protected]). #========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys from FastaReader import FastaReader from FastaWriter import FastaWriter from GffTranscriptReader import GffTranscriptReader from Rex import Rex rex=Rex() # Process command line if(len(sys.argv)!=4): exit(sys.argv[0]+" <in.fasta> <in.gff> <out.fasta>") (fastaFile,gffFile,outFile)=sys.argv[1:] # Read GFF reader=GffTranscriptReader() hash=reader.hashBySubstrate(gffFile) # Open output file OUT=open(outFile,"wt") writer=FastaWriter() # Process each substrate in the FASTA file reader=FastaReader(fastaFile)
#========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys import ProgramName from FastaReader import FastaReader from FastaWriter import FastaWriter from Rex import Rex rex = Rex() #========================================================================= # main() #========================================================================= if (len(sys.argv) != 3): exit(ProgramName.get() + " <in.fasta> <out.fasta>\n") (infile, outfile) = sys.argv[1:] OUT = open(outfile, "wt") writer = FastaWriter() reader = FastaReader(infile) while (True): (defline, seq) = reader.nextSequence() if (not defline): break if (not rex.find(">chr", defline)): continue
#!/usr/bin/env python #========================================================================= # This is OPEN SOURCE SOFTWARE governed by the Gnu General Public # License (GPL) version 3, as described at www.opensource.org. # Copyright (C)2016 William H. Majoros ([email protected]). #========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". from Rex import Rex rex=Rex() MIN_READS=3 ASSEMBLY="/home/bmajoros/1000G/assembly" READS=ASSEMBLY+"/reads.txt-rev3" EXPRESSED=ASSEMBLY+"/expressed.txt" expressed={} with open(EXPRESSED,"rt") as fh: for line in fh: fields=line.split() if(len(fields)!=4): continue (gene,trans,meanFPKM,SS)=fields expressed[trans]=True hasAlts={} supportedAlts={} with open(READS,"rt") as fh:
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public # License (GPL) version 3, as described at www.opensource.org. # Copyright (C)2016 William H. Majoros ([email protected]). #========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys from Transcript import Transcript from Interval import Interval from GffTranscriptReader import GffTranscriptReader from Rex import Rex rex=Rex() if(len(sys.argv)!=5): exit(sys.argv[0]+ " <in.broken-sites> <junctions.bed> <in.gff> <in.readcounts>") (infile,junctionsFile,gffFile,readCountsFile)=sys.argv[1:] #============================= main() ================================= # Read the readcounts file totalMappedReads=None readCounts={} with open(readCountsFile,"rt") as IN: while(True): line=IN.readline() if(line==""): break
#========================================================================= # This is OPEN SOURCE SOFTWARE governed by the Gnu General Public # License (GPL) version 3, as described at www.opensource.org. # Copyright (C)2016 William H. Majoros ([email protected]). #========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys import os import glob from Rex import Rex rex=Rex() if(len(sys.argv)!=2): exit(sys.argv[0]+" <dir>") directory=sys.argv[1] files=glob.glob(directory+"/*.fastb") for file in files: with open("tmp.fastb","wt") as OUT: with open(file,"rt") as IN: for line in IN: if(rex.find(">\S+",line)): OUT.write(">dna\n") else: OUT.write(line) os.system("mv tmp.fastb "+file)
#!/usr/bin/env python #========================================================================= # This is OPEN SOURCE SOFTWARE governed by the Gnu General Public # License (GPL) version 3, as described at www.opensource.org. #========================================================================= from __future__ import (absolute_import, division, print_function, unicode_literals, generators, nested_scopes, with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys from Rex import Rex rex = Rex() #========================================================================= # main() #========================================================================= counts = {} for line in sys.stdin: fields = line.rstrip().split() id = fields[0] alleles = counts.get(id, None) if (alleles is None): alleles = counts[id] = {} for field in fields[1:]: if (not rex.find("(\S+)=(\d+)", field)): raise Exception("can't parse field: " + field) allele = rex[1] count = int(rex[2])
with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii, chr,
                      hex, input, next, oct, open, pow, round, super, filter,
                      map, zip)
# The above imports should allow this program to run in both Python 2 and
# Python 3. You might need to update your version of module "future".
import sys
import ProgramName
import gzip
import math
import numpy as np
from sklearn import linear_model
import statsmodels.api as sm
from scipy import stats
from Rex import Rex

rex = Rex()

# Analysis settings.  NOTE(review): the code that uses these is not visible
# in this fragment; the names suggest p-value cut-offs, a covariate count
# and an NA-handling switch -- confirm against their uses.
MAX_P_ADJ = 1.0
MAX_P = 0.05
NUM_COVARIATES = 11
SKIP_NA = True
# Input locations, all under BASE.
BASE = "/home/bmajoros/PopSTARR/sarah"
#EFFECTS=BASE+"/test-lucif-aug28-chr10.txt"
EFFECTS = BASE + "/test-sarah-beta-2sided.txt"
VCF = BASE + "/vcf"
PHENOTYPES = BASE + "/phenotypes.txt"
CHROMS = ("chr10", "chr11", "chr21", "chr5", "chr8")
CENTERS = ("A", "E", "F", "K", "N", "P", "Q")
# Starts empty with a counter at 0.  NOTE(review): presumably filled with
# one integer code per entry of CENTERS by code that follows this
# fragment -- confirm.
CENTER_CODES = {}
nextCode = 0