def main():
    """
    ARGS:
        None. Reads sys.argv; expects exactly one argument, the directory
        to search.
    RETURN:
        None
    DESCRIPTION:
        Walks the directory tree rooted at the given path and prints the
        full path of every file found. Only macOS ("Darwin") is supported;
        any other platform is reported through exit_with_error().
    DEBUG:
    FUTURE:
    """
    if sys.version_info[0] < 3:
        # Report the interpreter version (not the script name) in the error.
        exit_with_error("ERROR!!! Runs with python3, NOT python-{}\n".format(
            sys.version_info[0]))
    nArg = len(sys.argv)
    # Match the help flag as a prefix. A substring test would treat any path
    # that merely contains "-h" (e.g. "data-hub") as a request for help.
    if nArg == 2 and sys.argv[1].startswith(("-h", "--h")):
        print_help(ExitCode=0)
    elif nArg != 2:
        print_help(ExitCode=1)
    path = sys.argv[1]  # Path to search

    # The platform cannot change while walking, so check it once up front.
    if platform.system() != "Darwin":
        exit_with_error(
            "ERROR!!! {} is an unsupported platform".format(
                platform.system()))

    # Master loop
    for root, dirL, fileL in os.walk(path):
        for f in fileL:
            print(os.path.join(root, f))
def displacement(Agent1=None, Agent2=None):
    """
    ARGS:
        Agent1 : AGENT, or an [x, y] list / tuple / numpy array
        Agent2 : AGENT, or an [x, y] list / tuple / numpy array
    RETURN:
        Euclidean distance between the two positions.
    DESCRIPTION:
        Gives displacement between two agents. For an AGENT the position is
        read from its posL attribute; for a plain sequence the first two
        elements are used as x and y.
    DEBUG:
    FUTURE:
    """

    def _xy(obj):
        # Plain coordinate pairs are checked first so that callers passing
        # raw positions never depend on the AGENT class.
        if isinstance(obj, (list, tuple, np.ndarray)):
            return obj[0], obj[1]
        if isinstance(obj, AGENT):
            return obj.posL[0], obj.posL[1]
        exit_with_error("ERROR!!! {} not a supported type\n".format(
            type(obj)))

    x1, y1 = _xy(Agent1)
    x2, y2 = _xy(Agent2)
    return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)
def __init__(self, Chromosome = None, Sequence = None):
    """
    ARGS:
        Chromosome = chromosome string
        Sequence   = chromosome sequence
    RETURN:
        NONE : Initializes CHROMOSOME
    DESCRIPTION:
        Stores a chromosome name together with its sequence so the sequence
        can later be sliced to obtain exon sequences. Both arguments are
        required; a missing one is reported through exit_with_error().
    DEBUG:
        For a shortened whole genome fasta file, it correctly reads it in.
        See read_genome() for debugging code and further details.
    FUTURE:
    """
    # Start from the 'unset' state, then fill in whatever was supplied.
    self.chrm = Chromosome                              # str, Chromosome
    self.seq = [] if Sequence is None else Sequence     # str, sequence

    if Chromosome is None:
        exit_with_error("ERROR! Chromosome is not specified!\n")
    if Sequence is None:
        exit_with_error("ERROR! Sequence is not specified!\n")
def get_trans_seq(transList, exonList):
    """
    ARGS:
        transList : list of transcripts (each has .transID and .seq)
        exonList  : list of exons (each has .transID, .exonNum and .seq)
    RETURN:
        None
    DESCRIPTION:
        Builds every transcript's sequence by appending, in list order, the
        sequences of the exons that carry the same transID. The exon numbers
        of a transcript must appear as 1, 2, 3, ... ; any gap or reordering
        is reported through exit_with_error(). Prints the elapsed run time.
    DEBUG:
        Tested on 2 transcripts, more testing required. Getting a transcript
        file with the transcripts and sequences is challenging though.
    FUTURE:
    """
    startedAt = datetime.datetime.now()
    for transcript in transList:
        lastSeen = 0  # exon number of the previously appended exon
        for exon in exonList:
            if exon.transID != transcript.transID:
                continue
            current = int(exon.exonNum)
            # Exons must arrive strictly in order : 1, 2, 3, ...
            if current - lastSeen != 1:
                exit_with_error(
                    "ERROR! exon numbers for %s are loaded out of order!\n" %
                    (transcript.transID))
            if transcript.seq is None:
                transcript.seq = exon.seq
            else:
                transcript.seq += exon.seq
            lastSeen = current
    elapsed = datetime.datetime.now() - startedAt
    print("get_trans_seq() run time = %s" % (elapsed))
def main():
    """
    ARGS:
        None. Reads six command line arguments from sys.argv :
            1. path to the GTF file
            2. path to the genome sequence file
            3. path to the config file
            4. number of reads (integer)
            5. path to the fastq file
            6. read type : 'single', 'paired-fr-first' or 'paired-fr-second'
    RETURN:
        None. Calls sys.exit(0) on success.
    DESCRIPTION:
        Parses the GTF and genome, links exons / transcripts / genes, reads
        the config file and hands everything to create_fastq_file(). Finally
        prints the unique GTF features and the run time.
        The read type and read count are validated before any file is
        parsed, so a bad command line fails immediately.
        The number of reads comes from the command line; the value returned
        by read_config() is ignored.
    DEBUG:
    FUTURE:
        Paired end reads are not working yet.
    """
    timeBegin = time.time()
    if len(sys.argv) != 7:
        if len(sys.argv) > 1 and sys.argv[1] in ("--help", "-h"):
            print_help(0)
        else:
            print_help(1)
    random.seed(42)
    pathToGtf = sys.argv[1]
    pathToSeq = sys.argv[2]
    pathToConfig = sys.argv[3]
    pathToFastq = sys.argv[5]
    readType = sys.argv[6]

    # Validate the cheap arguments first; parsing a genome can take minutes.
    try:
        numOfReads = int(sys.argv[4])
    except ValueError:
        exit_with_error(
            "ERROR!!! Number of reads must be an integer, not {}\n".format(
                sys.argv[4]))
    if readType not in ('single', 'paired-fr-first', 'paired-fr-second'):
        exit_with_error("ERROR!!! Incorrect value for {}".format(readType))
    ### Paired end reads are not working yet ###
    if readType != 'single':
        exit_with_error(
            "ERROR!!! paired-fr-first and paired-fr-second \n"
            "not yet implemented. \n\n"
            "NOTE:: Both reads are tentatively found in the \n"
            " INSERT class. The second read is not used.\n"
            " The second read should definitely needs checked.\n")

    gtfList = read_gtf(pathToGtf)
    exonList = get_exon_list(gtfList)
    transList = get_transcript_list(gtfList, exonList)
    geneList = get_gene_list(gtfList, transList)
    chrmList = read_genome(pathToSeq)
    uniqueFeatureList = get_list_of_unique_gtf_features(gtfList)
    get_exon_seq(exonList, chrmList)
    link_exons_trans_and_genes(gtfList, exonList, transList, geneList)
    geneDict, transDict = create_gene_and_trans_lookup_dict(
        geneList, transList)
    print_gtf_statistics(exonList, transList, geneList)

    # The read count in the config file is superseded by the command line.
    readLength, desiredTransList, abundanceList, _ = read_config(
        pathToConfig)

    create_fastq_file(pathToFastq, desiredTransList, abundanceList,
                      numOfReads, readLength, transDict, transList, exonList,
                      readType)

    print("Unique features in Gtf : ")
    for feature in uniqueFeatureList:
        print("\t%s" % (feature))

    timeEnd = time.time()
    print("Run time : %s" % (timeEnd - timeBegin))
    sys.exit(0)
def read_gtf(PathToGtf=None):
    """
    ARGS:
        PathToGtf = path to gene transfer file; must end in "gtf"
    RETURN:
        A list of GTF_ENTRYs, one per non-comment line, in file order
    DESCRIPTION:
        Reads in gtf file. Lines starting with '#' are skipped. Every other
        line must have exactly 9 tab separated columns, otherwise the
        problem is reported through exit_with_error(). A missing path is
        reported the same way as a wrong extension. Prints the run time.
    DEBUG:
        Can reproduce input gtf file. Works as expected.
    FUTURE:
    """
    if PathToGtf is None or PathToGtf[-3:] != "gtf":
        exit_with_error(
            "ERROR! You did not pass a file with the .gtf extention\n")

    gtfList = []
    timeBegin = datetime.datetime.now()
    # 'with' guarantees the file is closed even if a format error aborts us.
    with open(PathToGtf, 'r') as gtfFile:
        for line in gtfFile:
            if line[0] == "#":
                continue
            columns = line.split("\t")
            # Format check
            if len(columns) != 9:
                exit_with_error(
                    "ERROR! There should be 9 tab separated columns in a"
                    " GTF file\nYou only have %i\n" % (len(columns)))
            gtfList.append(
                GTF_ENTRY(Chromosome=columns[0], Source=columns[1],
                          EntryType=columns[2], Start=columns[3],
                          Stop=columns[4], Score=columns[5],
                          Strand=columns[6], Frame=columns[7],
                          Attribute=columns[8]))
    timeEnd = datetime.datetime.now()
    print("read_gtf() run time = %s" % (timeEnd - timeBegin))
    return gtfList
def parse_argv(argv):
    """Parse the command line and return (equation, flag_1, flag_2).

    Accepted forms (argv[0] is the program name):
        prog EQUATION        -> (EQUATION, 0, 0)
        prog -p EQUATION     -> (EQUATION, 1, 0)
        prog -P EQUATION     -> (EQUATION, 0, 1)
        prog -h              -> prints usage and exits with status 2

    Anything else is reported through exit_with_error(): -1 when the
    equation is missing, -2 when there are too many arguments.

    The values are read from the `argv` parameter (not from sys.argv), so
    the function can be driven with any argument list.
    """
    flag_1 = 0
    flag_2 = 0
    equation = ""
    argc = len(argv)
    # Error out when no arguments were given
    if argc == 1:
        exit_with_error(-1)
    elif argv[1] == "-h":
        printUsage()
        sys.exit(2)
    elif argv[1] == "-p" or argv[1] == "-P":
        if argc == 2:
            exit_with_error(-1)
        elif argc > 3:
            exit_with_error(-2)
        # Bonus options
        if argv[1] == "-p":
            flag_1 = 1
        else:
            flag_2 = 1
        equation = argv[2]
    elif argc == 2:
        equation = argv[1]
    else:
        exit_with_error(-2)
    return equation, flag_1, flag_2
def check(eq):
    """Validate the raw equation string; return None when it is acceptable.

    Failures are reported through exit_with_error() with these codes:
        -2 : a character other than digits, 'X', '.', space, '-', '+', '*',
             '^' or '=' is present
        -3 : more than one '='
        -4 : no '=' at all
        -5 : fewer than 3 characters once spaces are removed, or the last
             character is not a digit
    """
    # Raw string: '\-' is not a valid escape in a normal string literal.
    if re.search(r'[^0-9X. \-+*^=]', eq):
        exit_with_error(-2)
    eq = eq.replace(' ', '')
    n_equals = eq.count('=')
    if n_equals > 1:
        exit_with_error(-3)
    elif n_equals == 0:
        exit_with_error(-4)
    if len(eq) < 3:
        exit_with_error(-5)
    if not eq[-1].isdigit():
        exit_with_error(-5)
def get_exon_seq(exonList, chrmList):
    """
    ARGS:
        exonList : list of exons (each has .chrm, .start, .stop, .strand)
        chrmList : list of chromosomes (each has .chrm and .seq)
    RETURN:
        None
    DESCRIPTION:
        Gets sequences for exons from chromosome list and stores them in
        exon.seq. gtf coordinates are 1-indexed and inclusive, so the slice
        is seq[start - 1 : stop]; an exon may therefore end exactly on the
        last base of its chromosome.
        For '-' strand exons the reverse complement is stored and
        exon.start / exon.stop are swapped.
        Exons whose chromosome is not in chrmList are left untouched. If
        several chromosomes share a name, the last one in chrmList is used.
        Prints the elapsed run time.
    DEBUG:
        Spot checked 3 exons, are all ok. More testing needed, however it is
        challenging to get a list of all the exons (incl. seqs) in a single
        file.
    FUTURE:
    """
    timeBegin = datetime.datetime.now()
    # One dict lookup per exon instead of scanning every chromosome.
    chrmByName = {chrm.chrm: chrm for chrm in chrmList}
    for exon in exonList:
        chrm = chrmByName.get(exon.chrm)
        if chrm is None:
            continue
        chrmLen = len(chrm.seq)
        start = exon.start - 1  # -1 b/c python is 0 indexed, gtf file is not
        end = exon.stop         # exclusive slice bound, so end == chrmLen ok
        if start >= chrmLen or end > chrmLen:
            exit_with_error(
                "ERROR!! start (%i) or stop (%i) Position > "
                "chromosome length (%i)\n" % (start, end, chrmLen))
        if exon.strand == '+':
            exon.seq = chrm.seq[start:end]
        elif exon.strand == '-':
            exon.seq = reverse_complement(chrm.seq[start:end])
            exon.start, exon.stop = exon.stop, exon.start
        else:
            exit_with_error(
                "ERROR! strand char = %s is invalid" % (exon.strand))
    timeEnd = datetime.datetime.now()
    print("get_exon_seq() run time = %s" % (timeEnd - timeBegin))
def reverse_complement(seq):
    """
    ARGS:
        seq : sequence with _only_ A, T, C, G or N (case sensitive)
    RETURN:
        rcSeq : reverse complement of the sequence passed to it.
    DESCRIPTION:
        Complements every base (A<->T, G<->C, N stays N) and reverses the
        result. The first character that is not one of "ATCGN" is reported
        through exit_with_error().
        Uses str.translate so the work is a single linear pass instead of
        repeated string concatenation.
    DEBUG:
        Compared several sequences. Is working.
    FUTURE:
    """
    if not isinstance(seq, str):
        seq = "".join(seq)
    invalid = set(seq) - set("ATCGN")
    if invalid:
        firstBad = next(char for char in seq if char in invalid)
        exit_with_error(
            "ERROR! char %s is not a valid sequencing character!\n" %
            (firstBad))
    # Complement, then reverse
    return seq.translate(str.maketrans("ATGCN", "TACGN"))[::-1]
def main():
    """
    ARGS:
        None. Takes no command line arguments (other than a help flag).
    RETURN:
        None
    DESCRIPTION:
        Times repeated numpy matrix multiplication. Builds two random
        matrices of shape [5000 x 10000] and [10000 x 5000], multiplies them
        50 times with np.dot() and prints the total wall-clock time.
    NOTES:
        Both the `random` and `numpy.random` generators are seeded with 42.
    DEBUG:
    FUTURE:
    """
    ### Check Python version and CL args ###
    major = sys.version_info[0]
    if major != 3:
        exit_with_error(
            "ERROR!!! Runs with python3, NOT python-{}\n\n".format(major))
    nArg = len(sys.argv)
    if nArg == 2 and sys.argv[1].startswith(("--h", "-h")):
        print_help(0)
    elif nArg != 1:
        print_help(1)

    ### Variables ###
    outerDim = 5000     # outer dim of mats to run matrix_multiply on
    innerDim = 10000    # shared (inner) dimension
    nTrials = 50        # shortened num of trials to test, get stdev and mean
    random.seed(42)
    np.random.seed(42)
    leftShape = (outerDim, innerDim)
    rightShape = (innerDim, outerDim)
    left = np.random.rand(leftShape[0], leftShape[1])
    right = np.random.rand(rightShape[0], rightShape[1])
    print("Matrix Size = [{} {}] x [{} {}] = multiplied {} times ".format(
        leftShape[0], leftShape[1], rightShape[0], rightShape[1], nTrials))

    began = time.time()
    for _ in range(nTrials):
        np.dot(left, right)
    elapsed = time.time() - began
    print("Run time : {:.4f} s".format(elapsed))
def main():
    """
    ARGS:
        None. Reads sys.argv : no argument runs without quarantine, the
        single argument "quarantine" enables quarantining, an argument
        containing "-h" prints the help. Anything else prints the help and
        exits with an error.
    RETURN:
        1. Creates images in tmp/. Turn into a movie using ffmpeg, e.g.
           ffmpeg -framerate 4 -pattern_type glob -i 'tmp/*.png' -c:v libx264 out.mp4
    DESCRIPTION:
        Agent based Susceptible-Infected-Removed simulation. Every step each
        agent is moved, recovered agents are marked immune, and every
        infectious agent tries to infect each susceptible agent within
        infectDist with probability prob. One scatter plot is saved per step
        plus a final SIR-vs-time plot.
    DEBUG:
    FUTURE:
        1. Add option to fit only a specific section of data.
        2. Make main loop NOT O(N^2). Maybe organize by position on a grid.
    """
    import os  # local import : only needed to create the output directory

    # Check Python version
    nArg = len(sys.argv)
    # Use python 3
    if (sys.version_info[0] != 3):
        exit_with_error("ERROR!!! Use Python 3\n")
    # Get options
    quarantine = False
    if (nArg > 1 and "-h" in sys.argv[1]):
        print_help(0)
    elif (nArg != 1 and nArg != 2):
        print_help(1)
    elif (nArg == 2):
        if (sys.argv[1] == "quarantine"):
            quarantine = True
        else:
            # Unknown option. Previously fell through with `quarantine`
            # unbound and crashed later with a NameError.
            print_help(1)

    startTime = time.time()
    print("{} \n".format(sys.argv), flush=True)
    print(" Start Time : {}".format(
        time.strftime("%a, %d %b %Y %H:%M:%S ", time.localtime())),
        flush=True)

    ### Parameters to Change
    N = 200                     # Number of Agents
    nDays = 100                 # number of days in simulation
    dt = 0.25                   # length of a step in days, total steps = nDays / dt
    nStep = int(nDays / dt)
    infectTime = 14 / dt        # Infection time in units of steps
    asymptomaticTime = 5 / dt   # Asymptomatic time in units of steps
    prob = 0.125                # Probability of infecting agent within infectDist
    infectDist = 0.05           # Distance person must be within to get infected
    critMass = 20               # Number of people before instituting a quarantine
    nDayAsymptAndInfec = 2      # Number days asymptomatic AND infectious
    agentL = []
    nSuscL = []                 # Number of susceptible per step
    nInfL = []                  # Number of infected per step
    nRmL = []                   # Number of removed per step
    startQuarantine = False
    print("Parameters : \n"
          " N = {}\n"
          " prob = {}\n"
          " nDays = {}\n"
          " nStep= {}\n"
          " infectDist= {}\n"
          " critMass= {}\n"
          " nDayAsymptAndInfec = {}\n"
          " quarantine= {}\n".format(N, prob, nDays, nStep, infectDist,
                                     critMass, nDayAsymptAndInfec,
                                     quarantine))

    # Every figure is written below tmp/; make sure it exists before the
    # (long) simulation starts.
    os.makedirs("tmp", exist_ok=True)

    # Initialize agents
    for n in range(N):
        agent = AGENT(n)
        agentL.append(agent)
    # Infect 1 agent
    agentL[0].infected = True
    agentL[0].start = 0

    # Simulation - O(N**2)
    for step in range(nStep):
        # Use plotting
        sxL = []    # Susceptible xL
        syL = []
        ixL = []    # Infected xL
        iyL = []
        rxL = []    # Removed xL
        ryL = []
        ### Only if quarantining infected individuals
        if (quarantine == True):
            qxL = []    # quarantine
            qyL = []

        for i in range(len(agentL)):
            agent = agentL[i]
            # Susceptible
            if (agent.infected == False and agent.immune == False):
                sxL.append(agent.posL[0])
                syL.append(agent.posL[1])
            # Infected
            if (agent.infected == True):
                ixL.append(agent.posL[0])
                iyL.append(agent.posL[1])
            # Removed
            if (agent.immune == True):
                rxL.append(agent.posL[0])
                ryL.append(agent.posL[1])

            ### Only if quarantining infected individuals
            if (quarantine == True):
                if (len(ixL) == critMass):  # Critical mass to quarantine
                    startQuarantine = True
                # Quarantine after 2 days of infectiousness
                if ((agent.infected == True and
                     step - agent.start - asymptomaticTime >=
                     nDayAsymptAndInfec / dt) and startQuarantine == True):
                    agent.quarantine = True
                    agent.vL[0] = 0
                    agent.vL[1] = 0
                if (agent.quarantine == True):
                    qxL.append(agent.posL[0])
                    qyL.append(agent.posL[1])

            # Move
            move_agent(agent, agentL, infectDist, quarantine, dt)

            # 'Removed' Group. If survived, adjust time.
            if (step - agent.start > infectTime and agent.infected == True):
                agent.infected = False
                agent.immune = True
                if (quarantine == True):
                    agent.quarantine = False

            # Susceptible Group - Check if infected
            if (agent.infected == False or
                    (step - agent.start < asymptomaticTime) or
                    agent.quarantine == True):
                continue

            # Infectious Group - Try to infect someone
            for j in range(len(agentL)):
                # Skip self, and anyone who cannot be infected
                if (i == j or agentL[j].immune == True or
                        agentL[j].infected == True or
                        agentL[j].quarantine == True):
                    continue
                d = displacement(agent, agentL[j])
                if (d <= infectDist):
                    rng = random.uniform(0, 1)
                    if (rng < prob):
                        agentL[j].infected = True
                        agentL[j].start = step
                        agent.nInfect += 1

        # Plot
        fig, ax = plt.subplots()
        ax.scatter(sxL, syL, c="black", marker=".", label="Susceptible")
        ax.scatter(ixL, iyL, c="red", marker="^", label="Infected")
        # Add infection radius
        for i in range(len(ixL)):
            circle = Circle((ixL[i], iyL[i]), radius=infectDist)
            circle.set_edgecolor("red")
            circle.set_facecolor("none")
            ax.add_artist(circle)
        ax.scatter(rxL, ryL, c="blue", marker="s", label="Removed")
        ax.grid(True)
        ax.legend(loc=1)
        ax.set_xlim((0, 1))
        ax.set_ylim((0, 1))
        # State R0 : mean number of infections caused by removed agents
        RL = np.asarray([a.nInfect for a in agentL if (a.immune == True)])
        if (np.sum(RL) > 0):
            R = np.mean(RL)
        else:
            R = 0
        ax.set_title(
            "Critical Fraction : {:<.1f}%\n{:<.2f} days, R = {:<.2f}".format(
                critMass / N * 100, step * dt, R))
        plt.savefig("tmp/{:04d}.png".format(step))
        plt.close('all')

        # Record number susceptible, infected and removed
        nSuscL.append(len(sxL))
        nInfL.append(len(ixL))
        nRmL.append(len(rxL))

    # Generate plot of SIR vs. Time
    fig, ax = plt.subplots()
    ax.plot(range(nStep), nSuscL, c="black", label="Susceptible")
    ax.plot(range(nStep), nInfL, c="red", label="Infected")
    ax.plot(range(nStep), nRmL, c="blue", label="Removed")
    ax.legend(loc=1)
    ax.xaxis.set_ticks([d * 1.0 / dt for d in range(nDays) if (d % 10 == 0)])
    ax.set_xticklabels([d for d in range(nDays) if (d % 10 == 0)])
    ax.set_title(
        "Critical Fraction : {:<.1f}%\nSusceptible-Infected-Removed vs. Time".
        format(critMass / N * 100))
    ax.set_xlabel("Time (days)")
    ax.set_ylabel("Number of Agents")
    plt.savefig("tmp/SIR_vs_time.png")
    plt.close('all')

    print("Ended : %s" % (time.strftime("%D:%H:%M:%S")))
    print("Run Time : {:.4f} h".format((time.time() - startTime) / 3600.0))
    sys.exit(0)
def link_exons_trans_and_genes(gtfList, exonList, transList, geneList):
    """
    ARGS:
        gtfList  : list of all GTF_ENTRYs
        exonList : list of all EXONS
        transList: list of all TRANSCRIPTS
        geneList : list of all GENES
    RETURN:
        None. geneList, transList are modified in place.
    DESCRIPTION:
        Loops through gtfList and captures the indices of exons in exonList
        and passes it to the transcripts in transList. Also captures indices
        of transcripts in transList and passes it to genes in geneList.
        The three lists must be in the same order as their entries appear in
        gtfList; any mismatch is reported through exit_with_error().
        Finally every transcript's seq is rebuilt by joining the sequences
        of its exons. Prints the elapsed run time.
        A gene or transcript entry may be the very last entry of gtfList.
    DEBUG:
        1. I validated by using print_transcripts_with_seqs() and comparing
           against the biomart download for chromosome 1. My data file was
           _identical_ to biomart's. For how this was done, see the debug
           comment in print_transcripts_with_seqs()
        2. Checked Transcript.seq for reverse strand ('-') transcript. Used
           ENST00000488147 it is correct.
    FUTURE:
    """
    gIdx = 0    # Gene index, for geneList
    tIdx = 0    # Transcript index, for transList
    eIdx = 0    # Exon index, for exonList
    gtfListLen = len(gtfList)
    timeBegin = datetime.datetime.now()

    for i, geneEntry in enumerate(gtfList):
        if geneEntry.etype != "gene":
            continue
        # Check that genes in geneList are same order as gtfList
        if geneEntry.geneID != geneList[gIdx].geneID:
            exit_with_error(
                "ERROR! gtfList[%i].geneID = %s and geneList[%i].geneID = %s"
                % (i, geneEntry.geneID, gIdx, geneList[gIdx].geneID))

        # Get constituent transcripts between gene entries. The bound is
        # tested first so a trailing gene entry does not index past the end.
        j = i + 1
        while j < gtfListLen and gtfList[j].etype != "gene":
            if gtfList[j].etype == "transcript":
                # Check that transcripts in transList are same order as
                # gtfList and _actually_ are members of the current gene.
                if (gtfList[j].transID == transList[tIdx].transID
                        and geneEntry.geneID == transList[tIdx].geneID
                        and geneEntry.geneID == geneList[gIdx].geneID):
                    geneList[gIdx].transList.append(transList[tIdx].transID)
                    geneList[gIdx].transIdxList.append(tIdx)

                    # Get constituent exons between transcript entries
                    k = j + 1
                    while (k < gtfListLen
                           and gtfList[k].etype != "transcript"):
                        if gtfList[k].etype == "exon":
                            # Check exons in exonList are same order as
                            # gtfList and belong to this transcript / gene.
                            if (gtfList[k].transID == exonList[eIdx].transID
                                    and geneEntry.geneID ==
                                    exonList[eIdx].geneID
                                    and geneEntry.geneID ==
                                    geneList[gIdx].geneID):
                                transList[tIdx].exonList.append(
                                    exonList[eIdx].exonID)
                                transList[tIdx].exonIdxList.append(eIdx)
                                eIdx += 1
                            else:
                                exit_with_error(
                                    "ERROR! gtfList[%i].transID = %s and exonList[%i]."
                                    "transID = %s\n\tgtfList[%i].geneID = %s and "
                                    "transList[%i].geneID = "
                                    "%s\n\tand geneList[%i].geneID = %s\n" %
                                    (k, gtfList[k].transID, eIdx,
                                     exonList[eIdx].transID, k,
                                     gtfList[k].geneID, tIdx,
                                     transList[tIdx].geneID, gIdx,
                                     geneList[gIdx].geneID))
                        k += 1
                    tIdx += 1
                else:
                    exit_with_error(
                        "ERROR! gtfList[%i].transID= %s and transList[%i].transID = "
                        "%s\n\tgtfList[%i].geneID = %s and transList[%i].geneID = "
                        "%s\n\tand geneList[%i].geneID = %s\n" %
                        (j, gtfList[j].transID, tIdx,
                         transList[tIdx].transID, j, gtfList[j].geneID, tIdx,
                         transList[tIdx].geneID, gIdx,
                         geneList[gIdx].geneID))
            j += 1
        gIdx += 1

    # Now get transcript sequences.
    for trans in transList:
        trans.seq = "".join(exonList[idx].seq for idx in trans.exonIdxList)

    timeEnd = datetime.datetime.now()
    print("link_exons_trans_and_genes() run time = %s" %
          (timeEnd - timeBegin))
def main():
    """
    ARGS:
        None. Takes no command line arguments (other than a help flag).
    RETURN:
        None. Calls sys.exit(0) when done.
    DESCRIPTION:
        Reads the JHU global deaths time series, sums the rows belonging to
        each country of interest, shifts every series so that it starts at
        the first day with a death, and plots ln(deaths) versus days since
        first death. Reference lines for doubling times of 1, 3 and 10 days
        are drawn on the same axes.
    DEBUG:
        1. Checked adding countries' data, hopefully don't double count if
           france gets more points in the regions. Checked summing of
           China's regions (since there isn't a global value), e.g.
           grep -i china data/jhu/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv | awk 'BEGIN{FS=","; SUM=0}{SUM+=$66}END{print SUM}'
           3274
    FUTURE:
        1. Add option to fit only a specific section of data.
    """
    # Check Python version and options
    nArg = len(sys.argv)
    if sys.version_info[0] != 3:
        exit_with_error("ERROR!!! Use Python 3\n")
    if nArg > 1 and "-h" in sys.argv[1]:
        print_help(0)
    elif nArg != 1:
        print_help(1)

    startTime = time.time()
    print("{} \n".format(sys.argv), flush=True)
    print(" Start Time : {}".format(
        time.strftime("%a, %d %b %Y %H:%M:%S ", time.localtime())),
        flush=True)

    dataPath = "data/jhu/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
    countryL = ["us", "spain", "italy", "china", "korea, south", "germany",
                "france", "canada", "united kingdom"]
    df = pd.read_csv(dataPath)
    lastDate = df.columns[-1]

    # keys = country, values = np array of cumulative deaths
    dataD = dict()
    for country in countryL:
        wanted = country.lower()
        for _, row in df.iterrows():
            # Column 1 holds the country name; columns 4.. hold the counts
            if row.values[1].lower() != wanted:
                continue
            vector = np.asarray(row.values[4:], dtype=np.float32)
            # Convert nan's to 0's, maybe wrong?
            vector[np.isnan(vector)] = 0
            if country not in dataD.keys():
                dataD[country] = vector
                initLen = len(dataD[country])
                continue
            # Sum b/c some countries, i.e. china, are split over several rows
            dataD[country] += vector
            # Debugging
            if country == 'us' and np.isnan(dataD[country][-1]):
                print("{} {}".format(country, len(dataD[country])))
            if initLen != len(dataD[country]):
                exit_with_error("ERROR!!! {} != {}\n".format(
                    initLen, len(dataD[country])))

    # Not every country got deaths at the same time. Shift each series so
    # that it starts at its first death(s).
    for country in dataD.keys():
        series = dataD[country]
        positive = np.flatnonzero(series > 0)
        if positive.size > 0:
            dataD[country] = series[positive[0]:]

    usTotal = dataD['us']
    usDailyDeathA = [later - earlier
                     for earlier, later in zip(usTotal[:-1], usTotal[1:])]
    print("\nDaily deaths (last 10 days) in US\n\t{}\n".format(
        usDailyDeathA[-10:]))
    print("Total deaths (last 10 days) in US\n\t{}\n".format(
        dataD['us'][-10:]))

    fig, ax = plt.subplots(1, 1)
    ax.set_title("Covid-19 Deaths per country (ending {})".format(lastDate))

    # Loop through keys and plot, cycling through the line styles
    lineStyleL = ["-", "--", "-.", ":", "solid"]
    for idx, country in enumerate(dataD.keys()):
        xV = range(len(dataD[country]))
        yV = np.log(dataD[country])
        ax.plot(xV, yV, label="{}".format(country),
                ls=lineStyleL[idx % len(lineStyleL)])
        tag = "UK" if country == "united kingdom" else country
        ax.annotate(tag, xy=(xV[-1], yV[-1]), ha="center", va="center",
                    rotation=45)

    # Reference curves : (number of points, doubling time in days,
    #                     legend label, annotation)
    doublingL = ((15, 1.0, "2bl time=1 day", "1 day"),
                 (35, 3.0, "2bl time= 3 day", "3 day"),
                 (45, 10.0, "2bl time= 10 day", "10 day"))
    for nPts, period, legend, note in doublingL:
        xV = range(nPts)
        yV = np.log(np.asarray([1 * 2**(x / period) for x in xV]))
        ax.plot(xV, yV, label=legend, ls="solid", color="black")
        ax.annotate(note, xy=(xV[-1], yV[-1]), ha="center", va="center",
                    rotation=45)

    # Generate Plot
    ax.set_xlabel("Time spanning days since first death")
    ax.set_ylabel("{}".format("ln(deaths)"))
    ax.legend()
    plt.show()

    print("Ended : %s" % (time.strftime("%D:%H:%M:%S")))
    print("Run Time : {:.4f} h".format((time.time() - startTime) / 3600.0))
    sys.exit(0)
def move_agent(Agent=None, AgentL=None, InfectDist=None, Quarantine=None,
               DeltaT=None):
    """
    ARGS:
        Agent      : AGENT, The AGENT whose trajectory we are computing
        AgentL     : List of AGENTs, Used for avoiding quarantined AGENTs
        InfectDist : Float, Infectiousness distance. Used as radius around
                     quarantined, infected AGENTs
        Quarantine : Boolean, do we quarantine infected agents?
        DeltaT     : Time interval.
    RETURN:
        None. Agent.posL and Agent.vL are modified in place.
    DESCRIPTION:
        Moves agent by vL * DeltaT, keeps it inside the unit square and then
        perturbs its velocity by a small random amount.
        If quarantine option used, I do an approximation of reality.
        Reasons :
            1. Trying to avoid multiple infected individuals gets logically
               complicated to code.
            2. Avoiding the bounds and trying to avoid an infected
               individual adds yet another level of logical complexity.
        Solution :
            1. I only avoid the first quarantined individual that I
               encounter and then I test the bounds.
    DEBUG:
    FUTURE:
    """
    xi = Agent.posL[0]
    yi = Agent.posL[1]
    vx = Agent.vL[0]
    vy = Agent.vL[1]
    v = np.sqrt(vx**2 + vy**2)
    xf = Agent.vL[0] * DeltaT + Agent.posL[0]
    yf = Agent.vL[1] * DeltaT + Agent.posL[1]
    r = InfectDist  # Radius about quarantined individual

    # Quarantined agents can't move.
    # NOTE(review): this also requires `immune`; confirm against the caller
    # whether `infected` was intended.
    if (Agent.quarantine == True and Agent.immune == True):
        return

    # Check if quarantined agent nearby. Only the 1st quarantined agent
    # encountered is considered b/c avoiding _all_ of them is hard to solve.
    if (Quarantine == True and Agent.infected == False):
        # Get line function of the trajectory, y = mx + b
        # NOTE(review): divides by zero when xf == xi (no motion in x).
        m = (yf - yi) / (xf - xi)   # Slope of line
        b = yf - m * xf             # Intercept, from a point on the line
        for agent in AgentL:
            # Must be quarantined to avoid
            if (agent.quarantine == False):
                continue
            xc = agent.posL[0]
            yc = agent.posL[1]
            # displ, quarantined - Agent final
            dfq = np.sqrt((xc - xf)**2 + (yc - yf)**2)
            # displ, quarantined - Agent initial
            diq = np.sqrt((xc - xi)**2 + (yc - yi)**2)
            # There might be a collision. It is possible that both dfq and
            # diq are outside of radius, yet the trajectory passes through
            # it; that case is not handled.
            if (dfq <= r or diq <= r):
                # This is gross and imprecise.
                # Circle of exclusion : 0 = (x - xc)^2 + (y - yc)^2 - r^2
                # with xc,yc the center of the circle.
                def f(x):
                    y = m * x + b
                    return ((x - xc)**2 + (y - yc)**2 - r**2)

                # fsolve only needs starting guesses, not a sign change.
                xroots = optimize.fsolve(f, [xc - r, xc + r])
                # If there are two roots, pick the one closest to Agent
                if (len(xroots) == 2):
                    x1 = xroots[0]
                    y1 = m * x1 + b
                    d1 = displacement(Agent, [x1, y1])
                    x2 = xroots[1]
                    y2 = m * x2 + b
                    d2 = displacement(Agent, [x2, y2])
                    if (d1 < d2):
                        x = x1
                        y = y1
                    else:
                        x = x2
                        y = y2
                elif (len(xroots) == 1):
                    # Was `xroot`, an undefined name.
                    x = xroots[0]
                    y = m * x + b
                else:
                    exit_with_error(
                        "ERROR!!! I don't understand how there can "
                        "be more than 2 roots!\n")
                rx = x - xc
                ry = y - yc
                rvect = [rx, ry]
                # Find line perpendicular to rvect, i.e. tangent to the
                # circle, call it 't' = a \hat(i) + b \hat(j) :
                #   t \dot rvect = 0  ->  a * rx + b * ry = 0
                #                     ->  a = -(b * ry) / rx
                # (`b` is reused here; the line intercept is no longer
                #  needed once x and y are known.)
                ### Vertical line
                if (rx == 0):
                    a = 0
                    b = 1
                    alpha = np.pi / 2   # 90deg, tangent vs horizontal
                ### Horizontal line
                elif (ry == 0):
                    a = 1
                    b = 0
                    alpha = 0           # 0deg, tangent vs horizontal
                ### Everything else
                else:
                    b = 1
                    a = -b * ry / rx
                    alpha = np.arctan(b / a)    # Angle, tangent vs horizontal
                if (np.isnan(alpha)):
                    # Two placeholders need two arguments.
                    exit_with_error(
                        "ERROR!!! np.arctan({}/{}) == nan\n".format(b, a))
                # Now get angle between rvector and velocity vector
                theta = np.arccos((vx * rvect[0] + vy * rvect[1]) / np.sqrt(
                    (vx**2 + vy**2) * (rvect[0]**2 + rvect[1]**2)))
                # Angle of reflection w/r/t to the tangent line on circle
                phi = theta - np.pi / 2.0
                vx = v * np.sin(phi) * np.cos(alpha)
                # NOTE(review): possibly meant np.cos(phi) * np.sin(alpha);
                # left unchanged because it alters simulation results.
                vy = v * np.cos(phi * np.sin(alpha))
                xf = vx * DeltaT + xi
                yf = vy * DeltaT + yi
                Agent.vL[0] = vx
                Agent.vL[1] = vy
                break

    # Check bounds
    if (xf < 0):
        xf = -1.0 * xf
        Agent.vL[0] = -1.0 * Agent.vL[0]
    if (yf < 0):
        yf = -1.0 * yf
        Agent.vL[1] = -1.0 * Agent.vL[1]
    # NOTE(review): the upper walls place the agent on the wall (xf - d is
    # 1.0) while the lower walls mirror it back inside; confirm intent.
    if (xf > 1.0):
        d = xf - 1.0
        xf = xf - d
        Agent.vL[0] = -1.0 * Agent.vL[0]
    if (yf > 1.0):
        d = yf - 1.0
        yf = yf - d
        Agent.vL[1] = -1.0 * Agent.vL[1]

    # Adjust Position
    Agent.posL[0] = xf
    Agent.posL[1] = yf

    # Adjust velocity. Want crossing time to be about 25 steps
    dvx = random.uniform(-1, 1) / 100.0
    dvy = random.uniform(-1, 1) / 100.0
    Agent.vL[0] += dvx
    Agent.vL[1] += dvy
def solve(fac):
    """Solve a polynomial equation of degree <= 2 and print the result.

    `fac` supplies the coefficients of a*X^2 + b*X + c = 0 (fac.a, fac.b,
    fac.c), the highest degree present (fac.max_degree) and two option
    flags: fac.if_i prints the explanatory comment, fac.if_p plots the
    curve through show_plot() when real solutions exist.

    Degrees above 2 are reported through exit_with_error(-6).

    The result is stored on the shared `Sol` object (n, x, comment). Its
    fields are reset on entry so that roots from an earlier call do not
    accumulate in Sol.x.
    """
    sl = Sol
    sl.n = 0
    sl.x = []
    sl.comment = ''

    disc = fac.b * fac.b - 4 * fac.a * fac.c
    if fac.max_degree > 2:
        exit_with_error(-6)
    elif fac.a == 0 and fac.b == 0 and fac.c != 0:
        sl.comment = '\nThis equation has no solutions :C \n'
    elif fac.a == 0 and fac.b == 0 and fac.c == 0:
        sl.comment = '\nThe solution to this equation is any value of X *o* \n'
    elif fac.a == 0 and fac.b != 0:
        sl.comment = '\nThe graph of your equation is a straight line, so there is only one solution \n'
        sl.n = 1
        sl.x.append(round(-fac.c / fac.b, 3))
    else:
        # Genuine quadratic: the sign of `a` gives the parabola's direction.
        direction = 'up' if fac.a > 0 else 'down'
        parabola = '\nYour equation graph is a parabola with branches {}. '.format(direction)
        if disc == 0:
            sl.comment = parabola + \
                'It touches the OX axis at the vertex, so there is only one solution \n'
            sl.n = 1
            sl.x.append(round(-fac.b / (2 * fac.a), 3))
        elif disc > 0:
            sl.comment = parabola + \
                'It intersects the OX axis at two points, so there are two solutions \n'
            sl.n = 2
            sl.x.append(round((-fac.b + math.sqrt(disc)) / (2 * fac.a), 3))
            sl.x.append(round((-fac.b - math.sqrt(disc)) / (2 * fac.a), 3))
        else:
            sl.comment = parabola + 'However, ' \
                'it does not cross the OX axis! Wow! This means you have two complex roots *0* \n'
            sl.n = 2
            # Complex roots are stored as preformatted strings "re +/- im * i"
            real = str(round(-fac.b / (2 * fac.a), 3))
            imag = str(round(math.sqrt(-disc) / (2 * fac.a), 3))
            sl.x.append(real + ' + ' + imag + ' * i')
            sl.x.append(real + ' - ' + imag + ' * i')

    if sl.n == 0 or fac.if_i == 1:
        print(sl.comment)
    if sl.n > 0:
        print('The solution is:')
        if sl.n == 1:
            print('X = ', sl.x[0])
        elif sl.n == 2:
            print('X1 =', sl.x[0])
            print('X2 =', sl.x[1], '\n')
    # Only real solutions can be plotted.
    if fac.if_p and sl.n >= 1 and disc >= 0:
        show_plot(sl.x, fac.a, fac.b, fac.c)
def __init__(self, Insert = None, ReadLength = None, MetaData = None,
             ExonList = None, Direction = None):
    """
    ARGS:
        Insert     = an INSERT instance
        ReadLength = length of desired read.
        MetaData   = Read number
        ExonList   = (not used by this constructor)
        Direction  = either 'forward' or 'reverse'
                     'forward' : matches mRNA starting from 5' -> 3'
                     'reverse' : matches complement mRNA starting from
                                 3' -> 5'
                     E.g.
                     5' =>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=>=> 3'
                        -----> 'forward'          'reverse' <--------
    RETURN:
        NONE : Initializes the read
    DESCRIPTION:
        Takes ReadLength bases from the start ('forward') or the reverse
        complement of the last ReadLength bases ('reverse') of Insert.seq,
        copies the matching start / stop coordinates and spanned exons from
        the insert, and builds the metadata string.
        Currently, synthetic reads are forced to be completely contained
        within the transcript, i.e. they may not read into the adapters.
    NOTES :
        1. http://onetipperday.sterding.com/2012/07/how-to-tell-which-library-type-to-use.html
    DEBUG:
        1. See debugging comments from INSERT class
    FUTURE:
        1. Test both types of paired end reads
        2. Test reads/transcripts on the +/- strands
    """
    self.seq = None
    self.qual = None
    self.readlen = None
    self.metadata = None
    self.readDirection = Direction  # forward / reverse comp fastq read
    self.start = 0  # coord wrt chromosome start (gtf start def this way)
    self.stop = 0   # coord wrt chromosome stop (gtf stop def this way)

    # type check
    if not isinstance(Insert, INSERT):
        exit_with_error("ERROR! Insert is not of class type INSERT\n")
    if ReadLength is None:
        exit_with_error("ERROR! ReadLength not specified!\n")
    else:
        self.readlen = ReadLength
    self.get_qual()

    # Recall that Transcript.seq is always in the direction that
    # transcripts are transcribed. start / stop are _inclusive_.
    direction = self.readDirection
    if direction == "forward":
        self.seq = Insert.seq[0:ReadLength]
        self.start, self.stop = Insert.r1Start, Insert.r1Stop
        exonsSpannedL = Insert.r1ExonL
    elif direction == "reverse":
        insertLen = len(Insert.seq)
        tail = Insert.seq[insertLen - ReadLength:insertLen]
        self.seq = reverse_complement(tail)
        self.start, self.stop = Insert.r2Start, Insert.r2Stop
        exonsSpannedL = Insert.r2ExonL
    else:
        exit_with_error("ERROR!!! {} is invalid".format(direction))

    # Add useful information to reads
    if MetaData is None:
        exit_with_error("ERROR! MetaData not specified!\n")
    header = "%s:trans:%s:start:%i:exons" % (
        MetaData, Insert.transcript.transID, self.start)
    # One ":exonID:start:stop" group per spanned exon, in order
    exonPart = "".join(
        ":{}:{}:{}".format(exon.exonID, exon.start, exon.stop)
        for exon in exonsSpannedL)
    self.metadata = header + exonPart
def __init__(self, Chromosome = None, Source = None, EntryType= None, Start = None,
             Stop = None, Score = None, Strand = None, Frame = None,
             Attribute = None):
    """
    ARGS:
        Chromosome : Chromosome, can only be 1,2,...,X,Y
        Source     : Database or Project name for entry
        EntryType  : exon, transcript, CDS, gene, etc
        Start      : start position on chromosome (converted with int())
        Stop       : stop position on chromosome (converted with int())
        Score      : must be the empty-field marker '.'
        Strand     : '+' (forward) or '-' (reverse)
        Frame      : 0 indexed position of first base of codon
        Attribute  : semicolon sep list of tag-value pairs
    RETURN:
        NONE : Initializes GTF_ENTRY
    DESCRIPTION:
        Stores the nine GTF columns and parses the attribute column into
        geneID, geneName, transID, transName, exonID, exonNum and biotype.
        Attributes absent from the entry are left as None. Any missing
        or malformed required column is reported via exit_with_error().

        Attribute values are stored exactly as they appear in the file,
        i.e. string values keep their surrounding double quotes. Only
        exon_number is unquoted, because it is converted to int.
    DEBUG:
        Tested by reading in gtf file and printing out list of GTF_ENTRYs.
        compared using full Homo_sapiens.GRCh38.83.gtf and the output was
        _identical_.
    FUTURE:
    """
    self.chrm      = None  # str, Chromosome
    self.src       = None  # str, Source of data
    self.etype     = None  # str, exon, transcript, CDS, gene
    self.start     = None  # int, Start position on chrm
    self.stop      = None  # int, End position on chrm
    self.score     = None  # str, expects empty field, ie : '.'
    self.strand    = None  # str, '+' (forward) or '-' (reverse)
    self.frame     = None  # str, 0 indexed position of first base of codon
    self.attribute = None  # str, semicolon sep list of tag-value pairs
    # below vars are parsed from attribute list
    self.geneID    = None  # str, Gene ID ... starts with ENSG..
    self.geneName  = None  # str, common gene name
    self.transID   = None  # str, transcript ID ... starts with ENST...
    self.transName = None  # str, transcript Name.. = gene common name + num
    self.exonID    = None  # str, exon ID ... starts with ENSE
    self.exonNum   = None  # int, exon number
    self.biotype   = None  # str, gene_biotype

    if(Chromosome is not None):
        self.chrm = Chromosome
    else:
        exit_with_error("ERROR! Chromosome not specified!\n")

    if(Source is not None):
        self.src = Source
    else:
        exit_with_error("ERROR! Source not specified!\n")

    if(EntryType is not None):
        self.etype = EntryType
    else:
        exit_with_error("ERROR! EntryType not specified!\n")

    if(Start is not None):
        self.start = int(Start)
    else:
        exit_with_error("ERROR! Start not specified!\n")

    if(Stop is not None):
        self.stop = int(Stop)
    else:
        exit_with_error("ERROR! Stop not specified!\n")

    if(Score is not None):
        self.score = Score
        # Check for empty field
        if(self.score != '.'):
            exit_with_error("ERROR! Score = %s. Expected empty field\n"%(self.score))
    else:
        exit_with_error("ERROR! Score not specified!\n")

    if(Strand is not None):
        self.strand = Strand
        if(self.strand != '+' and self.strand != '-'):
            exit_with_error("ERROR! Strand = %s, wrong format\n"%(self.strand))
    else:
        exit_with_error("ERROR! Strand not specified!\n")

    if(Frame is not None):
        self.frame = Frame
    else:
        exit_with_error("ERROR! Frame not specified!\n")

    if(Attribute is None):
        exit_with_error("ERROR! Attribute not specified!\n")
    else:
        self.attribute = Attribute.split("\n")[0]

        # Parse attribute list into a dictionary for easy lookup.
        # Empty pieces (from the trailing ';' or a stray ';;') are skipped
        # rather than blindly dropping the last piece, so the final
        # attribute survives even when the line has no trailing ';'.
        attributeDict = {}
        for attrEntry in self.attribute.split(";"):
            attrEntry = attrEntry.strip()    # Eliminate white space on ends
            if(not attrEntry):
                continue
            # Split on the FIRST blank only : the value may itself contain
            # blanks, and a tag without a value yields '' instead of
            # raising IndexError.
            key, _, value = attrEntry.partition(" ")
            attributeDict[key] = value.strip()

        # Every key is not necessarily in the attributeDict; .get() leaves
        # the absent ones as None.
        self.geneID    = attributeDict.get("gene_id")
        self.geneName  = attributeDict.get("gene_name")
        self.transID   = attributeDict.get("transcript_id")
        self.transName = attributeDict.get("transcript_name")
        self.exonID    = attributeDict.get("exon_id")
        self.biotype   = attributeDict.get("gene_biotype")

        exonNum = attributeDict.get("exon_number")
        if(exonNum is not None):
            # Accept both the quoted ("3") and the unquoted (3) spelling
            try:
                self.exonNum = int(exonNum.strip('"'))
            except ValueError:
                exit_with_error("ERROR! exon_number = %s is not an integer\n"%(exonNum))
def __init__(self, GtfEntry):
    """
    ARGS:
        GtfEntry = a single GTF_ENTRY class element
    RETURN:
        NONE : Initializes GENE
    DESCRIPTION:
        Copies chrm, start, stop, strand, geneID and geneName from
        GtfEntry (start / stop are passed through int()). Each of these
        must be non-None, otherwise exit_with_error() is called. The
        transcript lists start out empty.
    DEBUG:
    FUTURE:
    """
    self.chrm     = None   # str, Chromosome
    self.start    = None   # int, Start position on chrm
    self.stop     = None   # int, End position on chrm
    self.strand   = None   # str, '+' (forward) or '-' (reverse)
    self.geneID   = None   # str, nominal geneID, but may belong to multiple genes
    self.geneName = None   # str, common gene name
    self.transList    = [] # transcript names that are part of this gene
    self.transIdxList = [] # list, use to quickly map to AllTransList

    # type check
    if not isinstance(GtfEntry, GTF_ENTRY):
        exit_with_error("ERROR! GtfEntry is not of class type GTF_ENTRY\n")

    # (attribute copied from GtfEntry, converter or None, message if missing)
    requiredL = (
        ("chrm",     None, "ERROR! GtfEntry.chrm is None\n"),
        ("start",    int,  "ERROR! GtfEntry.start is None\n"),
        ("stop",     int,  "ERROR! GtfEntry.stop is None\n"),
        ("strand",   None, "ERROR! GtfEntry.strand is None\n"),
        ("geneID",   None, "ERROR! GtfEntry.geneID is None\n"),
        ("geneName", None, "ERROR! GtfEntry.geneName is None\n"),
    )
    for attrName, convert, errMsg in requiredL:
        value = getattr(GtfEntry, attrName)
        if value is None:
            exit_with_error(errMsg)
        elif convert is None:
            setattr(self, attrName, value)
        else:
            setattr(self, attrName, convert(value))
def __init__(self, GtfEntry):
    """
    ARGS:
        GtfEntry = a single GTF_ENTRY class element
    RETURN:
        NONE : Initializes TRANSCRIPT
    DESCRIPTION:
        Copies chrm, start, stop, strand, geneID and transID from GtfEntry
        (start / stop are passed through int()). Each of these must be
        non-None, otherwise exit_with_error() is called. transNum is
        derived from transID by dropping its first four characters. The
        sequence and exon lists are left empty here.
    FUTURE:
    """
    self.seq      = None   # str, sequence
    self.chrm     = None   # str, Chromosome
    self.start    = None   # int, Start position on chrm
    self.stop     = None   # int, End position on chrm
    self.strand   = None   # str, '+' (forward) or '-' (reverse)
    self.geneID   = None   # str, nominal geneID, but may belong to multiple genes
    self.transID  = None   # str, nominal transcript ID, may belong to mult. trans.
    self.transNum = None   # str, only number portion of transID for sorting by transcript num
    self.exonList    = []  # exon names that are part of this transcript
    self.exonIdxList = []  # list, use to quickly map to AllExonList
    self.copy = 1          # copy number of this particular transcript in the
                           # transcriptome. It is set to 1 at the moment.

    # type check
    if not isinstance(GtfEntry, GTF_ENTRY):
        exit_with_error("ERROR! GtfEntry is not of class type GTF_ENTRY\n")

    # (attribute copied from GtfEntry, converter or None, message if missing)
    requiredL = (
        ("chrm",   None, "ERROR! GtfEntry.chrm is None\n"),
        ("start",  int,  "ERROR! GtfEntry.start is None\n"),
        ("stop",   int,  "ERROR! GtfEntry.stop is None\n"),
        ("strand", None, "ERROR! GtfEntry.strand is None\n"),
        ("geneID", None, "ERROR! GtfEntry.geneID is None\n"),
    )
    for attrName, convert, errMsg in requiredL:
        value = getattr(GtfEntry, attrName)
        if value is None:
            exit_with_error(errMsg)
        elif convert is None:
            setattr(self, attrName, value)
        else:
            setattr(self, attrName, convert(value))

    transID = GtfEntry.transID
    if transID is None:
        exit_with_error("ERROR! GtfEntry.transcriptID is None\n")
    else:
        self.transID = transID
        # NOTE(review): [4:] yields the digits only if transID looks like
        # ENST<digits>. If the ID still carries the GTF double quotes the
        # slice keeps the 'T' and the closing quote -- confirm against
        # how GTF_ENTRY stores transcript_id.
        self.transNum = transID[4:]
def _count_cpus(cpuList):
    """
    ARGS:
        cpuList : str, CPU list as printed by lscpu, e.g. '0-7', '0,2,4,6'
                  or '0-7,16-23'
    RETURN:
        int, number of CPUs named by cpuList, or None when it can't be parsed
    DESCRIPTION:
        Counts the CPUs in a comma separated list of single ids and
        inclusive 'first-last' ranges.
    DEBUG:
    FUTURE:
    """
    nCpu = 0
    try:
        for chunk in cpuList.strip().split(','):
            if '-' in chunk:
                first, last = chunk.split('-')
                nCpu += int(last) - int(first) + 1
            else:
                int(chunk)          # raises ValueError on '' or junk
                nCpu += 1
    except ValueError:
        return None
    return nCpu


def _taskset_prefix(curOS, ompCoresIdD, nThread):
    """
    ARGS:
        curOS       : str, 'linux' or 'osx'
        ompCoresIdD : dict, nThread -> core ids as returned by get_core_ids()
        nThread     : int, number of threads for the upcoming command
    RETURN:
        str, 'taskset -c <cores> ' on linux, '' otherwise
    DESCRIPTION:
        Builds the CPU pinning prefix placed in front of benchmark commands.
    DEBUG:
    FUTURE:
    """
    if curOS == 'linux':
        return "taskset -c {} ".format(ompCoresIdD[nThread])
    return ""


def _timed_run(cmd, workPath):
    """
    ARGS:
        cmd      : str, shell command to run
        workPath : str, passed through to parse_run_time()
    RETURN:
        Whatever parse_run_time() extracts from the command echo + output
    DESCRIPTION:
        Runs cmd through subprocess.getoutput() and hands the command line
        followed by its output to parse_run_time().
    DEBUG:
    FUTURE:
    """
    output = "{}\n".format(cmd)
    output = output + subprocess.getoutput(cmd)
    return parse_run_time(output, workPath)


def _print_rule():
    """Print the horizontal rule used by the result tables."""
    print("--------------------------------------------------------")


def _print_header(title, firstCol="Size"):
    """
    ARGS:
        title    : str, line printed above the table
        firstCol : str, heading of the first column
    RETURN:
        None
    DESCRIPTION:
        Prints title, a rule, the column headings and another rule.
    DEBUG:
    FUTURE:
    """
    print(title)
    _print_rule()
    print(" {:<10} | {:<12} | {:<15} | {:<15}".format(firstCol, "OMP_Threads",
          "mean", "stdev"))
    _print_rule()


def _print_row(label, nThread, runTimeV):
    """
    ARGS:
        label    : value for the first column (size or 'Short')
        nThread  : int, number of threads used
        runTimeV : numpy array of run times
    RETURN:
        None
    DESCRIPTION:
        Prints one table row with the mean and stdev of runTimeV.
    DEBUG:
    FUTURE:
    """
    print(" {:<10} | {:<12} | {:<15.4f} | {:<15.4f}".format(label, nThread,
          np.mean(runTimeV), np.std(runTimeV)))


def _split_replicates(cxbFileL):
    """
    ARGS:
        cxbFileL : list of paths .../<sample dir>/abundances.cxb
    RETURN:
        (treatCxbL, wtCxbL) : paths whose sample dir contains 'treat_' and
                              'wt_' respectively, input order preserved
    DESCRIPTION:
        Splits the cuffquant outputs into the treated and wild type groups.
        A path that belongs to neither group is reported via
        exit_with_error().
    DEBUG:
    FUTURE:
    """
    treatCxbL = []
    wtCxbL = []
    for cxbFile in cxbFileL:
        sampName = cxbFile.split('/')[-2]
        isTreat = 'treat_' in sampName
        isWt = 'wt_' in sampName
        if isTreat:
            treatCxbL.append(cxbFile)
        elif isWt:
            wtCxbL.append(cxbFile)
        else:
            exit_with_error("ERROR!!! neither treatIdxL[idx] {} nor wtIdxL[idx] "
                            "{} are" "True".format(isTreat, isWt))
    return treatCxbL, wtCxbL


def _bench_matrix_multiply(title, variant, workPath, matrixSizeL, threadL,
                           nTrials, curOS, ompCoresIdD):
    """
    ARGS:
        title       : str, table title
        variant     : str, 'cache_opt' or 'non_cache_opt'
        workPath    : str, work / output directory
        matrixSizeL : list of matrix sizes to run
        threadL     : list of OMP thread counts to run
        nTrials     : int, trials per (size, thread count)
        curOS       : str, 'linux' or 'osx'
        ompCoresIdD : dict, see _taskset_prefix()
    RETURN:
        None
    DESCRIPTION:
        Times ./src/matrix/matrix_multiply_<variant> on the A.txt / B.txt
        found in <workPath>/data/matrix/<size>/ and prints one row per
        (size, thread count).
    DEBUG:
    FUTURE:
    """
    _print_header(title)
    ### Create directory structure in output
    outDirPrefix = "{}/output/matrix_{}".format(workPath, variant)
    os.makedirs(outDirPrefix, exist_ok=True)
    for size in matrixSizeL:
        outDir = "{}/{}".format(outDirPrefix, size)
        os.makedirs(outDir, exist_ok=True)
        for nThread in threadL:
            runTimeV = np.zeros([nTrials])
            for tIdx in range(nTrials):
                taskset = _taskset_prefix(curOS, ompCoresIdD, nThread)
                cmd = ("export OMP_NUM_THREADS={}; {} "
                       "./src/matrix/matrix_multiply_{} "
                       "{}/data/matrix/{}/A.txt {}/data/matrix/{}/B.txt "
                       "{}".format(nThread, taskset, variant, workPath, size,
                                   workPath, size, outDir))
                runTimeV[tIdx] = _timed_run(cmd, workPath)
            _print_row(size, nThread, runTimeV)
    _print_rule()


def _bench_cuff_groups(tool, threadL, workPath, rnaSeqSizeL, curOS, ompCoresIdD):
    """
    ARGS:
        tool        : str, 'cuffnorm' or 'cuffdiff'
        threadL     : list of thread counts to run
        workPath    : str, work / output directory
        rnaSeqSizeL : list of RNA-Seq data set sizes
        curOS       : str, 'linux' or 'osx'
        ompCoresIdD : dict, see _taskset_prefix()
    RETURN:
        None
    DESCRIPTION:
        Times <tool> on the treat vs wt cuffquant abundances, using the
        cuffmerge merged.gtf of the same size, and prints one row per
        (size, thread count).
    DEBUG:
    FUTURE:
    """
    _print_header("Quantifying gene expression using {}".format(tool))
    outDirPref = os.path.abspath("{}/output/rnaseq/{}".format(workPath, tool))
    os.makedirs(outDirPref, exist_ok=True)
    inGtfDirPref = os.path.abspath("{}/output/rnaseq/cuffmerge".format(workPath))
    inCxbDirPref = os.path.abspath("{}/output/rnaseq/cuffquant".format(workPath))
    for size in rnaSeqSizeL:
        cxbFileL = sorted(glob.glob("{}/{}/*/abundances.cxb".format(inCxbDirPref, size)))
        # Break up into replicates : treat and wt groups
        treatCxbL, wtCxbL = _split_replicates(cxbFileL)
        outDir = "{}/{}".format(outDirPref, size)
        gtf = "{}/{}/merged.gtf".format(inGtfDirPref, size)
        os.makedirs(outDir, exist_ok=True)
        for nThread in threadL:
            ## Consider adding nTrials here.
            runTimeV = np.zeros([1])
            taskset = _taskset_prefix(curOS, ompCoresIdD, nThread)
            cmd = ("time {} {} --num-threads {} --output-dir {} -L {} "
                   " {} {} {}"
                   "".format(taskset, tool, nThread, outDir, "treat,wt", gtf,
                             ",".join(treatCxbL), ",".join(wtCxbL)))
            runTimeV[0] = _timed_run(cmd, workPath)
            _print_row(size, nThread, runTimeV)
    _print_rule()


def main():
    """
    ARGS:
        None. Reads the command line :
            sys.argv[1] = option, 'all' or the name of a single benchmark
            sys.argv[2] = workPath, where all the output/work will be saved
            sys.argv[3] = refPath, where the ref data and indices are located
    RETURN:
        None. Ends with sys.exit(0).
    DESCRIPTION:
        Benchmark driver. For the requested option (or every option when
        'all') it builds a shell command, runs it, extracts the run time
        with parse_run_time() and prints a table of mean / stdev run time
        per data size and thread count.
    NOTES:
        1. Directories are created with os.makedirs(..., exist_ok=True) so
           that a fresh workPath without data/ and output/ works.
        2. On linux, commands are pinned with taskset to the cores returned
           by get_core_ids(); on osx no pinning is done.
    DEBUG:
    FUTURE:
    """
    ### Check Python version and CL args ###
    if sys.version_info[0] != 3:
        exit_with_error("ERROR!!! Runs with python3, NOT python-{}\n\n".format(
                        sys.version_info[0]))
    nArg = len(sys.argv)
    if nArg == 2 and (sys.argv[1][0:3] == "--h" or sys.argv[1][0:2] == "-h"):
        print_help(0)
    elif nArg != 4:
        print_help(1)

    startTime = time.time()
    print("Start Time : {}".format(time.strftime("%a, %d %b %Y %H:%M:%S ",
          time.localtime())))
    print("Logging run output to driver.log\n\n")

    ### Variables ###
    options        = sys.argv[1]
    workPath       = sys.argv[2]    # Path where all the output/work will be saved.
    refPath        = sys.argv[3]    # Path where all the ref data and indices are located
    ompNumThreadsL = [1,2,5,20]     # Cores used in OMP tasks
    matrixSizeL    = [5000]         # outer dim of mats to run matrix_multiply on
    rnaSeqSizeL    = [10**5]
    nTrials        = 3              # number of trials to test,get stdev and mean
    shortNTrials   = 1              # shortened num of trials to test,get stdev and mean
    validOptionL   = ('all', 'build_mat_mult_data', 'mat_mult_non_cache_opt',
                      'local_memory_access', 'mat_mult_cache_opt',
                      'build_rnaseq_data', 'align_rnaseq_tophat',
                      'align_rnaseq_hisat', 'cufflinks_assemble', 'cuffmerge',
                      'cuffcompare', 'cuffquant', 'cuffnorm', 'cuffdiff',
                      'kelvin')
    # My OSX configuration b/c I use virtualenv. No trailing ';' here : the
    # command templates add the separator, and ';;' is a shell syntax error.
    osxPython2 = "source ~/.local/virtualenvs/python2.7/bin/activate"

    # Create work path dir if doesn't exist
    os.makedirs(workPath, exist_ok=True)

    ## In Linux singularity container add cores per socket and total cores
    ## to ompNumThreadsL
    if shutil.which('lscpu') is not None:
        # Record raw lscpu, lscpu -e and numactl --hardware
        with open("{}/lscpu.log".format(workPath), "a") as lscpuLog:
            for cmd in ("lscpu", "lscpu -e", "numactl --hardware"):
                lscpuLog.write("\n{}:\n{}\n".format(cmd, subprocess.getoutput(cmd)))
        # other details
        cmd = "lscpu | grep 'Core(s) per socket:' | awk '{print $4}'"
        coresPerSocket = int(subprocess.getoutput(cmd))
        cmd = "lscpu | grep '^CPU(s):' | awk '{print $2}'"
        totalCores = int(subprocess.getoutput(cmd))
        ## Numa - node. Handles 'a-b', 'a,b,c' (interleave off) and mixes
        ## of both such as '0-7,16-23'.
        cmd = "lscpu | grep 'NUMA node0 CPU' | awk '{print $4}'"
        numaCpuList = subprocess.getoutput(cmd)
        coresPerNuma = _count_cpus(numaCpuList)
        if coresPerNuma is None:
            exit_with_error("ERROR!!! Format for coresPerNuma is not handled"
                            ": {}".format(numaCpuList))
        ## Insert
        ompNumThreadsL = sorted(set(ompNumThreadsL +
                                    [coresPerNuma, coresPerSocket, totalCores]))
        print("Cores per NUMA : {}".format(coresPerNuma))
        print("Cores per socket : {}".format(coresPerSocket))
        print("Total Cores : {}".format(totalCores))
        print("Cores tested : {}".format(ompNumThreadsL))

    # Get operating system and list of cores (linux only) to take advantage of NUMA
    ompCoresIdD = dict()    # cores to use, keyed by entries of ompNumThreadsL
    curOS = sys.platform
    if curOS == 'darwin':
        curOS = 'osx'       # Rename for my own selfish readability
    elif curOS == 'linux':
        for nThread in ompNumThreadsL:
            ompCoresIdD[nThread] = get_core_ids(NumThreads = nThread)
    else:
        exit_with_error("ERROR!! {} is an unsupported operating system".format(curOS))

    if options not in validOptionL:
        exit_with_error("ERROR!!! {} is invalid option\n".format(options))

    def wanted(name):
        """True when benchmark `name` was requested, directly or via 'all'."""
        return options == 'all' or options == name

    ######## Run Tests ########
    if wanted('build_mat_mult_data'):
        nThread = 1
        _print_header("Building data for matrix_multiply (time to run is for numpy's matrix mult.: ")
        ### Create directory structure in data
        outDirPrefix = "{}/data/matrix".format(workPath)
        os.makedirs(outDirPrefix, exist_ok=True)
        for size in matrixSizeL:
            outDir = "{}/{}".format(outDirPrefix, size)
            os.makedirs(outDir, exist_ok=True)
            runTimeV = np.zeros([shortNTrials])
            for tIdx in range(shortNTrials):
                taskset = _taskset_prefix(curOS, ompCoresIdD, nThread)
                cmd = ("{} python3 src/matrix/matrix_generator.py {} 10000 "
                       "10000 {} {}".format(taskset, size, size, outDir))
                runTimeV[tIdx] = _timed_run(cmd, workPath)
            _print_row(size, nThread, runTimeV)
        _print_rule()

    if wanted('mat_mult_cache_opt'):
        _bench_matrix_multiply("matrix_multiply (cache optimized using OpenMP) : ",
                               "cache_opt", workPath, matrixSizeL,
                               ompNumThreadsL, nTrials, curOS, ompCoresIdD)

    if wanted('mat_mult_non_cache_opt'):
        _bench_matrix_multiply("matrix_multiply (non-cache optimized using OpenMP) : ",
                               "non_cache_opt", workPath, matrixSizeL,
                               ompNumThreadsL, nTrials, curOS, ompCoresIdD)

    if wanted('build_rnaseq_data'):
        _print_header("Building RNA-Seq Data sets : ")
        nThread = 1
        nSamp   = 3
        gtf     = "{}/chr1_short.gtf".format(refPath)
        genome  = "{}/chr1_short.fa".format(refPath)
        configL = ["config/config_wt_chr1.txt", "config/config_treat_chr1.txt"]
        # Create output directory structure
        outDir = "{}/data/rnaseq/fastq/".format(workPath)
        os.makedirs(outDir, exist_ok=True)
        ## Loop
        for size in rnaSeqSizeL:
            runTimeV = np.zeros([nSamp*len(configL)])
            tIdx = 0
            for config in configL:
                for samp in range(nSamp):
                    ## Set output files
                    if "treat" in config:
                        group = "treat"
                    elif "wt" in config:
                        group = "wt"
                    else:
                        exit_with_error("ERROR!!! No correct config file found!\n")
                    os.makedirs("{}/{}".format(outDir, size), exist_ok=True)
                    outFile = "{}/{}/{}_{}".format(outDir, size, group, samp)
                    taskset = _taskset_prefix(curOS, ompCoresIdD, nThread)
                    cmd = ("export OMP_NUM_THREADS={}; "
                           "{} python3 src/simulate_fastq_data/simulate_fastq.py "
                           "{} {} {} {} {} single"
                           "".format(nThread, taskset, gtf, genome, config,
                                     size, outFile))
                    runTimeV[tIdx] = _timed_run(cmd, workPath)
                    tIdx = tIdx + 1
            _print_row(size, nThread, runTimeV)
        _print_rule()

    if wanted('align_rnaseq_tophat'):
        _print_header("Aligning RNA-Seq Data sets with tophat : ")
        outDirPref = os.path.abspath("{}/output/rnaseq/tophat".format(workPath))
        os.makedirs(outDirPref, exist_ok=True)
        inDirPref = os.path.abspath("{}/data/rnaseq/fastq".format(workPath))
        if not os.path.isdir(inDirPref):
            exit_with_error("ERROR!!! fastq data does not exits. Run build_rnaseq_data option")
        bowtieIdxPath = "{}/Bowtie2Index/Homo_sapiens.GRC38".format(refPath)
        ## Loop
        for size in rnaSeqSizeL:
            sampFileL = glob.glob("{}/{}/*.fq".format(inDirPref, size))
            os.makedirs("{}/{}".format(outDirPref, size), exist_ok=True)
            for nThread in [1]:         # Tophat is poorly parallelizable
                runTimeV = np.zeros([len(sampFileL)])
                for tIdx, samp in enumerate(sampFileL):
                    sampDir = samp.split("/")[-1].split(".")[0]
                    ## Set output directory
                    outDir = "{}/{}/{}".format(outDirPref, size, sampDir)
                    # taskset is '' on osx; it used to be referenced there
                    # without ever being assigned.
                    taskset = _taskset_prefix(curOS, ompCoresIdD, nThread)
                    if curOS == "osx":
                        cmd = ("{}; time {} tophat2 -p {} -o {} {} {}"
                               "".format(osxPython2, taskset, nThread, outDir,
                                         bowtieIdxPath, samp))
                    elif curOS == 'linux':
                        # On CentOS, default python is 2.6.6
                        cmd = ("time {} tophat2 -p {} -o {} {} {}"
                               "".format(taskset, nThread, outDir,
                                         bowtieIdxPath, samp))
                    else:
                        exit_with_error("ERROR!!! OS not supported")
                    runTimeV[tIdx] = _timed_run(cmd, workPath)
                _print_row(size, nThread, runTimeV)
        _print_rule()

    if wanted('align_rnaseq_hisat'):
        _print_header("Aligning RNA-Seq Data sets with hisat : ")
        # Get directory structure
        outDirPref = os.path.abspath("{}/output/rnaseq/hisat".format(workPath))
        os.makedirs(outDirPref, exist_ok=True)
        inDirPref = os.path.abspath("{}/data/rnaseq/fastq".format(workPath))
        if not os.path.isdir(inDirPref):
            exit_with_error("ERROR!!! fastq data does not exits. Run build_rnaseq_data option")
        hisatIdxPath = "{}/HisatIndex/genome".format(refPath)
        ## Loop
        for size in rnaSeqSizeL:
            sampFileL = glob.glob("{}/{}/*.fq".format(inDirPref, size))
            os.makedirs("{}/{}".format(outDirPref, size), exist_ok=True)
            for nThread in ompNumThreadsL:
                runTimeV = np.zeros([len(sampFileL)])
                for tIdx, samp in enumerate(sampFileL):
                    sampDir = samp.split("/")[-1].split(".")[0]
                    ## Set output directory
                    outDir = "{}/{}/{}".format(outDirPref, size, sampDir)
                    taskset = _taskset_prefix(curOS, ompCoresIdD, nThread)
                    os.makedirs(outDir, exist_ok=True)
                    cmd = ("time {} hisat2 -p {} --phred33 -x {} -U {} -S {}/output.sam"
                           "".format(taskset, nThread, hisatIdxPath, samp, outDir))
                    runTimeV[tIdx] = _timed_run(cmd, workPath)
                _print_row(size, nThread, runTimeV)
        _print_rule()

    if wanted('cufflinks_assemble'):
        _print_header("Assembling transcriptome using cufflinks: ")
        outDirPref = os.path.abspath("{}/output/rnaseq/cufflinks".format(workPath))
        os.makedirs(outDirPref, exist_ok=True)
        inDirPref = os.path.abspath("{}/output/rnaseq/tophat".format(workPath))
        gtf = "{}/Homo_sapiens.GRCh38.83.gtf".format(refPath)
        ## Loop
        for size in rnaSeqSizeL:
            sampFileL = glob.glob("{}/{}/*/accepted_hits.bam".format(inDirPref, size))
            os.makedirs("{}/{}".format(outDirPref, size), exist_ok=True)
            for nThread in ompNumThreadsL:
                runTimeV = np.zeros([len(sampFileL)])
                for tIdx, samp in enumerate(sampFileL):
                    sampDir = samp.split("/")[-2].split(".")[0]
                    ## Set output directory
                    outDir = "{}/{}/{}".format(outDirPref, size, sampDir)
                    os.makedirs(outDir, exist_ok=True)
                    taskset = _taskset_prefix(curOS, ompCoresIdD, nThread)
                    cmd = ("time {} cufflinks --num-threads {} -g {} --output-dir {} {}"
                           "".format(taskset, nThread, gtf, outDir, samp))
                    runTimeV[tIdx] = _timed_run(cmd, workPath)
                _print_row(size, nThread, runTimeV)
        _print_rule()

    if wanted('cuffmerge'):
        _print_header("Merging assembled transcriptomes using cuffmerge")
        outDirPref = os.path.abspath("{}/output/rnaseq/cuffmerge".format(workPath))
        os.makedirs(outDirPref, exist_ok=True)
        inDirPref = os.path.abspath("{}/output/rnaseq/cufflinks".format(workPath))
        gtf    = "{}/Homo_sapiens.GRCh38.83.gtf".format(refPath)
        genome = "{}/Homo_sapiens.GRCh38.dna.primary_assembly.fa".format(refPath)
        curDir = os.path.dirname(os.path.realpath(__file__))
        ## Loop
        for size in rnaSeqSizeL:
            sampFileL = glob.glob("{}/{}/*/transcripts.gtf".format(inDirPref, size))
            outDir = "{}/{}".format(outDirPref, size)
            os.makedirs(outDir, exist_ok=True)
            # List of assemblies handed to cuffmerge; an existing list is reused
            assemblyPath = "{}/assemblies.txt".format(outDir)
            if not os.path.isfile(assemblyPath):
                with open(assemblyPath, "w") as assemblyFile:
                    for samp in sampFileL:
                        assemblyFile.write("{}\n".format(samp))
            for nThread in ompNumThreadsL:
                ## Consider adding nTrials here.
                runTimeV = np.zeros([1])
                if curOS == "osx":
                    cmd = ("{};"
                           "time cuffmerge --num-threads {} -o {} "
                           "--ref-gtf {} --ref-sequence {} {}"
                           "".format(osxPython2, nThread, outDir, gtf, genome,
                                     assemblyPath))
                elif curOS == "linux":
                    # On CentOS, default python is 2.6.6
                    taskset = _taskset_prefix(curOS, ompCoresIdD, nThread)
                    cmd = ("pwd; cd /tmp/; alias python='/usr/bin/python';"
                           "time {} cuffmerge --num-threads {} -o {} "
                           "--ref-gtf {} --ref-sequence {} {}; cd {}/../"
                           "".format(taskset, nThread, outDir, gtf, genome,
                                     assemblyPath, curDir))
                else:
                    exit_with_error("ERROR!!! Unsupported OS.")
                runTimeV[0] = _timed_run(cmd, workPath)
                _print_row(size, nThread, runTimeV)
        _print_rule()

    if wanted('cuffcompare'):
        _print_header("Comparing cufflinks gtf using cuffcompare")
        # Check and make directory structure
        outDirPref = os.path.abspath("{}/output/rnaseq/".format(workPath))
        if not os.path.isdir(outDirPref):
            exit_with_error("ERROR!!! Expecting {}/output/rnaseq. Must have run tophat "
                            "and cufflinks prior\n".format(workPath))
        outDirPref = os.path.abspath("{}/output/rnaseq/cuffcompare".format(workPath))
        os.makedirs(outDirPref, exist_ok=True)
        inDirPref = os.path.abspath("{}/output/rnaseq/cufflinks".format(workPath))
        gtf = "{}/Homo_sapiens.GRCh38.83.gtf".format(refPath)
        nThread = 1
        ## Loop
        for size in rnaSeqSizeL:
            sampFileL = glob.glob("{}/{}/*/transcripts.gtf".format(inDirPref, size))
            outPref = "{}/{}".format(outDirPref, size)
            ## Consider adding nTrials here.
            runTimeV = np.zeros([1])
            taskset = _taskset_prefix(curOS, ompCoresIdD, nThread)
            cmd = ("time {} cuffcompare -o {} -r {} -R -C -V {}"
                   "".format(taskset, outPref, gtf, " ".join(sampFileL)))
            runTimeV[0] = _timed_run(cmd, workPath)
            _print_row(size, nThread, runTimeV)
        _print_rule()

    if wanted('cuffquant'):
        _print_header("Quantifying gene expression using cuffquant")
        outDirPref = os.path.abspath("{}/output/rnaseq/cuffquant".format(workPath))
        os.makedirs(outDirPref, exist_ok=True)
        inGtfDirPref = os.path.abspath("{}/output/rnaseq/cuffmerge".format(workPath))
        inBamDirPref = os.path.abspath("{}/output/rnaseq/tophat".format(workPath))
        ## Loop
        for size in rnaSeqSizeL:
            bamFileL = glob.glob("{}/{}/*/accepted_hits.bam".format(inBamDirPref, size))
            outDir = "{}/{}".format(outDirPref, size)
            gtf = "{}/{}/merged.gtf".format(inGtfDirPref, size)
            os.makedirs(outDir, exist_ok=True)
            for nThread in ompNumThreadsL:
                ## Consider adding nTrials here.
                runTimeV = np.zeros([len(bamFileL)])
                for tIdx, bamFile in enumerate(bamFileL):
                    outDirSamp = "{}/{}".format(outDir,
                                 bamFile.split("/")[-2].split(".")[0])
                    os.makedirs(outDirSamp, exist_ok=True)
                    taskset = _taskset_prefix(curOS, ompCoresIdD, nThread)
                    cmd = ("time {} cuffquant --num-threads {} --output-dir {} "
                           "{} {}"
                           "".format(taskset, nThread, outDirSamp, gtf, bamFile))
                    runTimeV[tIdx] = _timed_run(cmd, workPath)
                _print_row(size, nThread, runTimeV)
        _print_rule()

    if wanted('cuffnorm'):
        _bench_cuff_groups("cuffnorm", ompNumThreadsL, workPath, rnaSeqSizeL,
                           curOS, ompCoresIdD)

    if wanted('cuffdiff'):
        # Cuffdiff is too time intensive to go over all threads, so only the
        # smallest thread count is run.
        _bench_cuff_groups("cuffdiff", [ompNumThreadsL[0]], workPath,
                           rnaSeqSizeL, curOS, ompCoresIdD)

    # Note :
    #   1. This will only run on Linux, not OSX
    #   2. Per John, it is near pointless to run multiple threads here.
    #      Just run it via his run_kelvin.sh, and leave my machinery out of it
    #   3. His script computes only the mean, but I'll shoe horn it into my
    #      reporting scheme
    if wanted('kelvin'):
        _print_header("Runnning Kelving...", firstCol="Short")
        # Create output directory structure
        outDir = "{}/output/kelvin".format(workPath)
        curDir = os.path.dirname(os.path.realpath(__file__))
        os.makedirs(outDir, exist_ok=True)
        nThread = 1
        runTimeV = np.zeros([1])
        # run_kelvin.sh : arg1 = outputdir, arg2 = /path/to/kelvin.conf
        cmd = ("export LD_LIBRARY_PATH={}/kelvin/:$LD_LIBRARY_PATH;"
               "export PATH={}/kelvin/:$PATH;"
               "bash {}/kelvin/run_kelvin.sh {} {}/kelvin"
               "".format(curDir, curDir, curDir, outDir, curDir))
        runTimeV[0] = _timed_run(cmd, workPath)
        _print_row("Short", nThread, runTimeV)
        _print_rule()

    print("Run Time for {} option : {:.4f} h\n\n".format(options,
          (time.time() - startTime)/3600.0))
    sys.exit(0)
def create_insert(Transcript=None, ReadLength=None, Mu=None, Sigma=None,
                  ExonList=None):
    """
    ARGS:
        Transcript : a TRANSCRIPT instance
        ReadLength : length of reads. Different from insert length
        Mu         : the mean of fragment length distribution
        Sigma      : the standard deviation of fragment length distribution
        ExonList   : list of exons, passed through unchanged to INSERT()
    RETURN:
        An INSERT whose length is drawn from a normal distribution
        (Mu, Sigma), clamped to the end of the transcript and never
        shorter than ReadLength.
    DESCRIPTION:
        Picks a uniformly random start position on the transcript, draws a
        fragment length from numpy.random.normal(Mu, Sigma) and retries until
        the resulting [start, stop) window is at least ReadLength long.
    DEBUG:
    FUTURE:
        1. Implement Proper solution where insert going into the Illumina
           adapter when stop - start < ReadLength
    """
    # Type check first : len(Transcript.seq) below would otherwise raise an
    # AttributeError on a bad argument before this error could be reported.
    if not isinstance(Transcript, TRANSCRIPT):
        exit_with_error(
            "ERROR! Transcript is not of class type TRANSCRIPT 1\n")

    transLength = len(Transcript.seq)
    lastIdx = transLength - 1

    # The largest window the loop below can ever produce is [0, lastIdx].
    # If even that is shorter than a read, the loop would spin forever.
    if lastIdx < ReadLength:
        exit_with_error(
            "ERROR! Transcript of length {} is too short to hold an insert "
            "of at least ReadLength ({})\n".format(transLength, ReadLength))

    insertLength = 0
    # Ensure inserts are at least as long as the readlength
    while insertLength < ReadLength:
        start = random.randint(0, lastIdx)
        stop = start + int(numpy.random.normal(Mu, Sigma))
        # Avoid inserts that run past the end of the transcript.
        # Proper solution here would have insert going into the Illumina
        # adapter
        stop = min(stop, lastIdx)
        # Avoid unrealistically short inserts : insert must be at least as
        # large as a read (covers both the raw draw and the clamped window)
        if stop - start < ReadLength:
            continue
        insert = INSERT(Transcript=Transcript, StartWrtTrans=start,
                        StopWrtTrans=stop, ReadLength=ReadLength,
                        ExonList=ExonList)
        insertLength = len(insert.seq)
    return insert
def __init__(self, Transcript = None, StartWrtTrans = 0, StopWrtTrans = 0,
             ReadLength = 0, ExonList = None):
    """
    ARGS:
        Transcript    = a TRANSCRIPT instance
        StartWrtTrans = insert start, as an index into Transcript.seq
        StopWrtTrans  = insert stop, as an index into Transcript.seq
                        (used as the exclusive end of the sequence slice)
        ReadLength    = length of the reads taken from the insert ends
        ExonList      = list indexed by Transcript.exonIdxList to obtain the
                        transcript's exons
    RETURN:
        NONE : Initializes INSERT
    DESCRIPTION:
        Copies chrm / strand / geneID / transID from the transcript, slices
        the insert sequence out of Transcript.seq, then walks the
        transcript's exons (sorted by exon.start; descending for the '-'
        strand) to work out which exons the insert, read 1 and read 2 touch
        and where each one starts / stops in chromosome coordinates.
        Read 1 covers [StartWrtTrans, StartWrtTrans + ReadLength - 1].
        Read 2 covers [StopWrtTrans - ReadLength + 1, StopWrtTrans].
    DEBUG:
        1. Created a crappy shell script, extract_read.sh
           Looking at the reads in the file generated by
           create_fastq_file() -> create_insert(), you'll see metadata look
           something like :
           @Read_num:0:trans:"ENST00000618181":start:935894:exons:"ENSE00002703998":935772:935896:"ENSE00002686739":939040:939129
           To check to see if the metadata correctly describes the read (and
           by extension the read itself is likely correct), run (one line) :
           grep `bash src/simulate_fastq_data/extract_read.sh 935894 935772 935896 939040 939129` tmp.fq
           CONCLUSION : single end reads of transcripts/inserts on the '+'
           strand in the sense direction work.
        2. Tested ENST00000488147 for the reverse strand of DNA
           --> Spot checked on ensembl reads with 1 or 2 exons.
           Appears to work, but there is a bug (see below)
    FUTURE:
        1. Check that the Read 2 is correctly handled.
        2. BUG!!! The start and stop values put in the metadata when using
           a transcript that is on the reverse strand is wrong. This needs
           fixed!
        3. Possible BUG : Added conditional to exclude duplicate exons when
           exons are completely spanned. If I thought about the equalities
           harder, the additional conditional would likely be unnecessary
    """
    self.seq  = None        # str, sequence
    self.chrm = None        # str, Chromosome
    self.start = -1         # int, start position w/r/t the chromosome
    self.stop = -1          # int, stop position w/r/t the chromosome
    self.strand = None      # str, '+' (forward) or '-' (reverse)
    self.geneID = None      # str, nominal geneID, but may belong to multiple genes
    self.transID = None     # str, nominal transID, may belong to mult. trans.
    self.transNum= None     # str, only number portion of transID for sorting by transcript num

    # The requested window must be able to hold at least one full read
    if(StopWrtTrans - StartWrtTrans < ReadLength):
        exit_with_error("ERROR!! StopWrtTrans - StartWrtTrans ({}) < "
                        "ReadLength ({})\n".format(StopWrtTrans - StartWrtTrans,
                        ReadLength))
    # type check
    if(not isinstance(Transcript, TRANSCRIPT)):
        exit_with_error("ERROR! Transcript is not of class type TRANSCRIPT 2\n")
    else:
        self.transcript = Transcript

    # Copy identifying fields from the transcript; every one is mandatory
    if(Transcript.chrm is not None):
        self.chrm = Transcript.chrm
    else:
        exit_with_error("ERROR! Transcript.chrm is None\n")
    if(Transcript.strand is not None):
        self.strand = Transcript.strand
    else:
        exit_with_error("ERROR! Transcript.strand is None\n")
    if(Transcript.geneID is not None):
        self.geneID = Transcript.geneID
    else:
        exit_with_error("ERROR! Transcript.geneID is None\n")
    if(Transcript.transID is not None):
        self.transID = Transcript.transID
        # Drops the first four characters of the ID
        # (presumably the 'ENST' prefix -- confirm against the GTF parser)
        self.transNum = self.transID[4:]
    else:
        exit_with_error("ERROR! Transcript.transcriptID is None\n")

    # get the sequence of the insert
    self.seq = Transcript.seq[StartWrtTrans:StopWrtTrans]

    # get the start and stop position of insert and associated reads (1 and 2)
    # relative to the chromosome. May discard read 2 if single end
    transPos= 0             # running offset of the current exon within the transcript
    insertStart = 0
    insertStop = 0
    exonL = [ExonList[idx] for idx in self.transcript.exonIdxList]
    # Order the exons the way they are laid out along the transcript
    if(Transcript.strand == '+'):           # Forward DNA strand
        exonL = sorted(exonL, key=operator.attrgetter('start'))
    elif(Transcript.strand == '-'):         # Reverse DNA strand
        exonL = sorted(exonL, key=operator.attrgetter('start'), reverse=True)
    else:
        exit_with_error("ERROR!! invalid value for Transcript.strand : {}".format(
                        Transcript.strand))
    exonSpanL= []           # List of exons spanned by insert
    # Read 1 : first ReadLength positions of the insert
    r1StartWrtTrans = StartWrtTrans
    r1StopWrtTrans = StartWrtTrans + ReadLength - 1
    r1ExonSpanL = []
    r1Start = None
    r1Stop = None
    # Read 2 : last ReadLength positions of the insert. Note that its
    # "start" is the higher transcript coordinate and its "stop" the lower.
    r2StartWrtTrans = StopWrtTrans
    r2StopWrtTrans = StopWrtTrans - ReadLength + 1
    r2ExonSpanL = []
    r2Start = None
    r2Stop = None

    # Get all exons spanned by insert and reads 1 & 2.
    for exon in exonL:
        exonStart = transPos                    ## In transcript coords
        exonStop  = transPos + len(exon.seq)    ## In transcript coords

        ##### Insert #####
        # NOTE(review): the three tests below are independent and none of
        # them guards against duplicates, so one exon can be appended to
        # exonSpanL more than once (unlike the read 1 / read 2 lists).
        ## Insert starts in exon
        if(StartWrtTrans >= exonStart and StartWrtTrans <= exonStop):
            exonSpanL.append(exon)
            # NOTE(review): offset is always added to exon.start, also for
            # the '-' strand -- see FUTURE item 2 in the docstring.
            insertStart = exon.start + (StartWrtTrans - exonStart)
        ## Insert spans exon
        if(StartWrtTrans <= exonStart and StopWrtTrans >= exonStop):
            exonSpanL.append(exon)
        ## Insert ends in exon
        if(StopWrtTrans >= exonStart and StopWrtTrans <= exonStop):
            exonSpanL.append(exon)
            insertStop = exon.start + (StopWrtTrans - exonStart)

        ##### Read 1 #####
        ## Read 1 starts in exon
        if(r1StartWrtTrans >= exonStart and r1StartWrtTrans <= exonStop):
            r1ExonSpanL.append(exon)
            r1Start = exon.start + (r1StartWrtTrans - exonStart)
        ## Read 1 spans exon
        if(r1StartWrtTrans <= exonStart and r1StopWrtTrans >= exonStop):
            if(exon not in r1ExonSpanL):
                r1ExonSpanL.append(exon)
        ## Read 1 ends in exon
        if(r1StopWrtTrans >= exonStart and r1StopWrtTrans <= exonStop):
            # Prevent duplicates
            if(exon not in r1ExonSpanL):
                r1ExonSpanL.append(exon)
            r1Stop = exon.start + (r1StopWrtTrans - exonStart)

        ##### Read 2 #####
        ## Read 2 stop (its lower transcript coordinate) falls in exon
        if(r2StopWrtTrans >= exonStart and r2StopWrtTrans <= exonStop):
            r2ExonSpanL.append(exon)
            r2Stop = exon.start + (r2StopWrtTrans - exonStart)
        ## Read 2 spans exon
        if(r2StopWrtTrans <= exonStart and r2StartWrtTrans >= exonStop):
            if(exon not in r2ExonSpanL):
                r2ExonSpanL.append(exon)
        ## Read 2 start (its higher transcript coordinate) falls in exon
        if(r2StartWrtTrans >= exonStart and r2StartWrtTrans <= exonStop):
            if(exon not in r2ExonSpanL):
                r2ExonSpanL.append(exon)
            r2Start = exon.start + (r2StartWrtTrans - exonStart)
        # Advance to where the next exon begins within the transcript
        transPos = transPos + len(exon.seq)

    ## Error Check ##
    if(len(r1ExonSpanL) == 0):
        exit_with_error("ERROR! Read 1 does _not_ span any exons!\n")
    if(len(r2ExonSpanL) == 0):
        exit_with_error("ERROR! Read 2 does _not_ span any exons!\n")
    if(r1Start is None or r1Stop is None or r2Start is None or r2Stop is None):
        exit_with_error("ERROR!! Invalid trans={}, r1Start,r1Stop,r2Start,r2Stop = "
                        "{},{},{},{}\n".format(Transcript.transID,r1Start,r1Stop,
                        r2Start,r2Stop))
    # Check for duplicate exons (bad!)
    if(len(r1ExonSpanL) != len(set(r1ExonSpanL))):
        exit_with_error("ERROR!! {} : len(r1ExonSpanL) {} != "
                        "len(set(r1ExonSpanL)) {}\n".format(Transcript.transID,
                        len(r1ExonSpanL),len(set(r1ExonSpanL))))
    if(len(r2ExonSpanL) != len(set(r2ExonSpanL))):
        exit_with_error("ERROR!! {} : len(r2ExonSpanL) {} != "
                        "len(set(r2ExonSpanL)) {}\n".format(Transcript.transID,
                        len(r2ExonSpanL),len(set(r2ExonSpanL))))

    self.start = insertStart    ## In chromosome coords
    self.stop  = insertStop     ## In chromosome coords
    self.exonL = exonSpanL      ## Exons spanned
    self.r1Start = r1Start      ## In chromosome coords
    self.r1Stop  = r1Stop       ## In chromosome coords
    self.r1ExonL = r1ExonSpanL  ## Exons spanned
    self.r2Start = r2Start      ## In chromosome coords
    self.r2Stop  = r2Stop       ## In chromosome coords
    self.r2ExonL = r2ExonSpanL  ## Exons spanned
def __init__(self, GtfEntry):
    """
    ARGS:
        GtfEntry = a single GTF_ENTRY class element
    RETURN:
        NONE : Initializes EXON
    DESCRIPTION:
        Copies chrm, start, stop, strand, exonNum, exonID, geneID and
        transID from the GTF entry. start and stop are converted with
        int(); every other field is stored exactly as the GTF entry holds
        it. A field that is None on the GTF entry is reported through
        exit_with_error(). self.seq is left as None here.
    DEBUG:
    FUTURE:
    """
    # Every attribute exists (as None) before any validation happens
    for attrName in ("seq", "chrm", "start", "stop", "strand", "exonNum",
                     "exonID", "geneID", "transID"):
        setattr(self, attrName, None)

    # type check
    if not isinstance(GtfEntry, GTF_ENTRY):
        exit_with_error("ERROR! GtfEntry is not of class type GTF_ENTRY\n")

    # (attribute, converter or None, message used when the field is None),
    # listed in the order the fields are validated
    fieldSpecL = (
        ("chrm",    None, "ERROR! GtfEntry.chrm is None\n"),
        ("start",   int,  "ERROR! GtfEntry.start is None\n"),
        ("stop",    int,  "ERROR! GtfEntry.stop is None\n"),
        ("strand",  None, "ERROR! GtfEntry.strand is None\n"),
        ("exonNum", None, "ERROR! GtfEntry.exonNum is None\n"),
        ("exonID",  None, "ERROR! GtfEntry.exonID is None\n"),
        ("geneID",  None, "ERROR! GtfEntry.geneID is None\n"),
        ("transID", None, "ERROR! GtfEntry.transcriptID is None\n"),
    )
    for attrName, convert, missingMsg in fieldSpecL:
        rawValue = getattr(GtfEntry, attrName)
        if rawValue is None:
            exit_with_error(missingMsg)
        elif convert is None:
            setattr(self, attrName, rawValue)
        else:
            setattr(self, attrName, convert(rawValue))
def read_config(pathToConfig):
    """
    ARGS:
        pathToConfig : str, path to configuration file
    RETURN:
        readLength       : int, read length desired
        desiredTransList : list of str, transcripts to use
        abundanceList    : list of int, relative abundances of transcripts,
                           in the same order as desiredTransList
        numOfReads       : int, number of reads desired
    DESCRIPTION:
        Config file format :
        1. Comment lines begin with '#'
        2. One line 'ReadLength <int>' and one line 'NumberOfReads <int>'
        3. All other lines must be transcripts with relative abundance
           The relative abundance can be any integer. Scaling is done
           automatically. E.g.
                ENST00000488147 10
                ENST00000473358 5
        4. Fields are separated by spaces; tabs are rejected.
        5. Blank lines are ignored.
    DEBUG:
        For small config files it reads in all the fields correctly.
    FUTURE:
    """
    desiredTransList = []
    abundanceList = []      # integers used to get relative abundance of transcripts
    readLength = 0
    numOfReads = 0

    # 'with' closes the handle on every path, including an early exit
    with open(pathToConfig, 'r') as configFile:
        for rawLine in configFile:
            if rawLine.startswith("#"):
                continue
            text = rawLine.rstrip("\n")     # remove trailing \n
            # Check for tabs, only spaces permitted
            if '\t' in text:
                exit_with_error("ERROR! Tabs not permitted in config file!\n")
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no information; skip them instead of failing the parse
            if not text.strip():
                continue
            # split() without an argument tolerates repeated / leading spaces
            line = text.split()

            # ReadLength
            if line[0] == "ReadLength":
                if readLength == 0:
                    readLength = int(line[1])
                    continue
                exit_with_error(
                    "ERROR! multiple instances of ReadLength in config "
                    "file\n")
            # NumberOfReads
            if line[0] == "NumberOfReads":
                if numOfReads == 0:
                    numOfReads = int(line[1])
                    continue
                exit_with_error(
                    "ERROR! multiple instances of NumberOfReads in config "
                    "file\n")
            # Transcripts
            if 'ENST' in line[0]:
                desiredTransList.append(line[0])
                abundanceList.append(int(line[1]))
            else:
                exit_with_error("ERROR! Incorrect transcript entry : %s\n"
                                " All entries should begin with 'ENST'\n" %
                                (line))

    if readLength == 0 or numOfReads == 0:
        exit_with_error("ERROR! ReadLength or NumberOfReads not specified in "
                        "config.txt\n")

    print("Config File Parameters : \nReadLength : %i\nNumberOfReads : %i" %
          (readLength, numOfReads))
    for trans, abundance in zip(desiredTransList, abundanceList):
        print("%s %i" % (trans, abundance))
    print("\n")
    return readLength, desiredTransList, abundanceList, numOfReads
def main():
    """
    ARGS:
        None. Read from the command line (sys.argv) :
            argv[1] : R0, average number of people infected
            argv[2] : incubTime, time before symptoms present
            argv[3] : infectTime, time before infectious
    RETURN:
        None. Calls sys.exit(0) when finished.
    DESCRIPTION:
        Agent based toy simulation of an infection spreading through N
        agents over nDays. Each day every infected agent past incubTime
        picks one random agent and infects it with probability
        R0 / (healthyTime - infectTime). Agents infected for longer than
        healthyTime become immune. Plots the number of infections and the
        mean number of secondary infections per immune agent vs. time.
    DEBUG:
    FUTURE:
        1. Add option to fit only a specific section of data.
    """
    # Check Python version
    nArg = len(sys.argv)
    # Use python 3
    if sys.version_info[0] != 3:
        exit_with_error("ERROR!!! Use Python 3\n")

    # Get options. Guard the index : with no arguments sys.argv[1] does not
    # exist and the help text should be shown instead of an IndexError.
    if nArg > 1 and "-h" in sys.argv[1]:
        print_help(0)
    elif nArg != 4:
        print_help(1)

    startTime = time.time()
    print("{} \n".format(sys.argv), flush=True)
    print(" Start Time : {}".format(
        time.strftime("%a, %d %b %Y %H:%M:%S ", time.localtime())),
        flush=True)

    # Get args
    R0 = float(sys.argv[1])             # Average number of people infected
    incubTime = float(sys.argv[2])      # Time before symptoms present
    infectTime = float(sys.argv[3])     # Time before infectious
    deathRate = 0.01    # fraction of infected that die (currently unused)
    hostRate = 0.20     # Fraction that get hospitalized (currently unused)
    healthyTime = 8     # Time after infection that person is healthy and no longer infectious
    nDays = 100
    #sd = 5             # Standard deviation for gaussian distribution for infection
    N = 10**3           # Number of agents

    # Initialization
    agentL = [AGENT(ID=i) for i in range(N)]
    # Make one person infected
    agentL[0].infected = True
    agentL[0].start = 0
    yV = np.zeros([nDays])
    R0V = np.zeros([nDays])             # Rate of infection as function of time
    xV = np.asarray(range(nDays))

    # Do simulation. Run over 100 days. I'm sure I'm screwing up my
    # monte-carlo methods
    # NOTE : the explicit '== True' / '== False' comparisons are kept on
    # purpose; AGENT is defined elsewhere and its flags may not be plain
    # bools, in which case truthiness tests would behave differently.
    for day in range(nDays):
        nInfect = 0
        nTotal = 0      # number infected + number immune
        for agent in agentL:
            if agent.infected == True:
                nInfect += 1    # Track number infected at this time step
                diff = day - agent.start
                if diff > incubTime:
                    #idx = int(np.random.normal(loc=i, scale=sd)) #index of possible infection
                    idx = random.randint(0, N - 1)  #index of possible infection
                    # If picking self to infect
                    if agentL[idx] == agent:
                        continue
                    # prob of infection = R0 / (healthyTime - infectTime)
                    prob = R0 / (healthyTime - infectTime)
                    rng = random.random()
                    #print(day,rng,prob)
                    if (rng < prob and agentL[idx].immune == False
                            and agentL[idx].infected == False):
                        agentL[idx].infected = True
                        agentL[idx].start = day
                        agent.nInfect += 1
                        nInfect += 1
                if diff > healthyTime:
                    agent.immune = True
                    agent.infected = False
            # Get R0 as function of time
            if agent.immune == True:
                R0V[day] += agent.nInfect
                nTotal += 1
        yV[day] = nInfect
        if nTotal > 0:
            R0V[day] = R0V[day] / nTotal

    # Get number immune
    nImmune = 0
    meanR0 = 0
    # Visualize spatial distribution of infection
    immuneV = np.zeros([N])
    for i, agent in enumerate(agentL):
        if agent.immune == True or agent.infected == True:
            nImmune += 1
            meanR0 += agent.nInfect
            immuneV[i] = 1
    meanR0 = meanR0 / nImmune
    print("Number immune == {}, R0 = {:.4f} ".format(nImmune, meanR0))

    fig, ax = plt.subplots(1, 1)
    ylabel = "Number of infections"
    ax.plot(xV, yV, label=ylabel)
    ax.legend()
    plt.show()

    fig, ax = plt.subplots(1, 1)
    ylabel = "R0"
    ax.plot(xV, R0V, label=ylabel)
    ax.legend()
    plt.show()

    print("Ended : %s" % (time.strftime("%D:%H:%M:%S")))
    print("Run Time : {:.4f} h".format((time.time() - startTime) / 3600.0))
    sys.exit(0)
def create_fastq_file(pathToFastq, desiredTransList, abundanceList, nReads,
                      readLength, transDict, transList, exonList, readType):
    r"""
    ARGS:
        pathToFastq      : Path prefix of the output fastq file(s). The
                           suffix ".fq" (single) or "-R1.fq" / "-R2.fq"
                           (paired) is appended.
        desiredTransList : transcripts read from config file
        abundanceList    : list of integers that sum to a value used to
                           normalize the number of reads.
                           E.g. trans1 has 5 and trans2 has 10,
                                the ratio of trans1 : trans2 = 1 : 2
        nReads           : Rough number of desired reads, the ratios from
                           abundanceList is maintained at the expense of
                           nReads.
                           E.g from the above example if nReads = 10,
                               the actual number of reads would be
                               3 for trans1, 6 for trans2
        readLength       : length of reads
        transDict        : Dictionary used map transID quickly to the
                           correct transcript in transList
        transList        : List of TRANSCRIPTs. Contains sequence to pass
                           to instance of FASTQ_READ()
        exonList         : List of EXONs. Passed to FASTQ_READ() to get
                           metadata for each fastq read. E.g. the start
                           position and exons a read spans.
        readType         : one of 'single', 'paired-fr-first',
                           'paired-fr-second'
                             single           : R1 forward
                             paired-fr-first  : R1 reverse, R2 forward
                             paired-fr-second : R1 forward, R2 reverse
    RETURN:
        None. File(s) written
    DESCRIPTION:
        Writes fastq file(s). For every desired transcript,
        int(abundance / sum(abundances) * nReads) inserts are created with
        create_insert(trans, readLength, 150, 15, exonList) and one
        FASTQ_READ per output file is written for each insert.
    DEBUG:
        1. Blasted against ensembl database, spot checked a couple of
           transcripts. Need further debugging.
           Took synthetic_sample.fastq and operated in vim on transcript
           ENST00000473358:
                Exons are ordered as : ENSE00001947070 ENSE00001922571
                                       ENSE00001827679
           Copied synthetic_sample.fastq to poop.fq
           ****** in vim ******
           %s/^@Read.\{1,1000\}:start:\([0-9]\{1,100\}\):exons:\(.\{1,100\}\)\n\(^[A-Z]\{50\}\)\n^+\n\(^.\{50\}\)/\1\t\3\t\2/gc
           %s/:/\t/g    # remove colon sep exon names
           %s/"//g      # remove " around exon names
           ****** in shell, want exon reads start at (see order above) ******
           ****** Avoid grepping enties with start positions on the exon prior ******
           grep ENSE00001947070 poop.fq &> ENSE00001947070.txt
           grep ENSE00001922571 poop.fq | grep -v ENSE00001947070 &> ENSE00001922571.txt
           grep ENSE00001827679 poop.fq | grep -v ENSE00001922571 &> ENSE00001827679.txt
           awk '{print $1 "\t" $2}' ENSE00001947070.txt &> ENSE00001947070_1and2.txt
           awk '{print $1 "\t" $2}' ENSE00001922571.txt &> ENSE00001922571_1and2.txt
           awk '{print $1 "\t" $2}' ENSE00001827679.txt &> ENSE00001827679_1and2.txt
           awk '{print $2}' ENSE00001947070.txt | xargs -I{} grep -aob {} ENST00000473358.txt | awk 'BEGIN{FS=":"}{start = $1 + 29554; print start "\t" $2}' &> awk_out.txt
           awk '{print $2}' ENSE00001922571.txt | xargs -I{} grep -aob {} ENST00000473358.txt | awk 'BEGIN{FS=":"}{start = $1 + 30564 - 486; print start "\t" $2}' &> awk_out.txt
           awk '{print $2}' ENSE00001827679.txt | xargs -I{} grep -aob {} ENST00000473358.txt | awk 'BEGIN{FS=":"}{start = $1 + 30976 - 486 - 104; print start "\t" $2}' &> awk_out.txt
           Used diff to compare all the awk_out.txt to ENSE*_1and2.txt files.
           CONCLUSION : they are identical. Therefor I get the correct start
                        position from the correct sequences.
           THEREFOR : I believe that create_fastq_file and FASTQ_READ() are
                      working as expected.
        2. See debug comments of INSERT class.
           CONCLUSION : single end reads of transcripts/inserts on the '+'
                        strand in the sense direction work.
    FUTURE:
        Include more error checking for goofy parameters, e.g. not enough
        reads for the ratios, etc.
    """
    abundanceSum = sum(abundanceList)
    if abundanceSum < 1:
        # Report the value that was actually tested (the previous code
        # referenced an undefined name here and died with a NameError)
        exit_with_error("ERROR! abundanceSum = {}\nPlease enter abundance "
                        "values > 1\n".format(abundanceSum))

    # readType -> one (file suffix, read direction) pair per output file
    layoutByReadTypeD = {
        'single': ((".fq", "forward"),),
        'paired-fr-first': (("-R1.fq", "reverse"), ("-R2.fq", "forward")),
        'paired-fr-second': (("-R1.fq", "forward"), ("-R2.fq", "reverse")),
    }
    if readType not in layoutByReadTypeD:
        exit_with_error("ERROR!!! Incorrect value for {}".format(readType))
    layout = layoutByReadTypeD[readType]

    fastqFileL = []
    readIdx = 0
    try:
        for suffix, _ in layout:
            fastqFileL.append(open(pathToFastq + suffix, "w"))

        for transIdx, transName in enumerate(desiredTransList):
            try:
                trans = transList[transDict[transName]]
            except KeyError:
                exit_with_error("ERROR! {} is not a transcript annotated in "
                                "your gtf file\n".format(transName))
            # Reads for this transcript, scaled by its relative abundance
            nTransReads = int(float(abundanceList[transIdx]) /
                              float(abundanceSum) * nReads)
            for _ in range(nTransReads):
                insert = create_insert(trans, readLength, 150, 15, exonList)
                # Both mates of a pair come from the same insert and share
                # the same read number
                for fastqFile, (_, direction) in zip(fastqFileL, layout):
                    fastqEntry = FASTQ_READ(Insert=insert,
                                            ReadLength=readLength,
                                            MetaData="@Read_num:%i" % (readIdx),
                                            ExonList=exonList,
                                            Direction=direction)
                    fastqFile.write("%s\n%s\n+\n%s\n" % (fastqEntry.metadata,
                                                         fastqEntry.seq,
                                                         fastqEntry.qual))
                readIdx += 1
    finally:
        # Close whatever was opened, even when a read could not be created
        for fastqFile in fastqFileL:
            fastqFile.close()
def main():
    """
    ARGS:
        None. Read from the command line (sys.argv) :
            argv[1] : country, compared case-insensitively against the
                      second column of the JHU confirmed-cases csv
            argv[2] : plotType, "log-lin" ("lin-lin" is recognised but not
                      implemented)
            argv[3] : optional int slice index selecting the data used for
                      the linear fit :
                        < 0  : fit only the last |idx| days
                        > 0  : fit only the first idx days
                        0 or omitted : fit all days
    RETURN:
        None. Calls sys.exit(0) when finished.
    DESCRIPTION:
        Plots ln(cases + 1) vs. day for one country together with a straight
        line fit (numpy.polyfit, degree 1). A straight line on this plot
        corresponds to exponential growth of the case count.
    DEBUG:
    FUTURE:
        1. Add option to fit only a specific section of data.
    """
    # Check Python version
    nArg = len(sys.argv)
    # Use python 3
    if sys.version_info[0] != 3:
        exit_with_error("ERROR!!! Use Python 3\n")

    # Get options. Guard the index : with no arguments sys.argv[1] does not
    # exist and the help text should be shown instead of an IndexError.
    if nArg > 1 and "-h" in sys.argv[1]:
        print_help(0)
    elif nArg != 4 and nArg != 3:
        print_help(1)
    slcIdx = int(sys.argv[3]) if nArg == 4 else None

    startTime = time.time()
    print("{} \n".format(sys.argv),flush=True)
    print(" Start Time : {}".format(time.strftime("%a, %d %b %Y %H:%M:%S ",
          time.localtime())),flush=True)

    # Get args
    country = sys.argv[1]
    plotType = sys.argv[2]      # Straight line equals linear growth
    dataPath = "data/jhu/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
    countryFound = False
    df = pd.read_csv(dataPath)
    lastDate = df.columns[-1]
    for index, row in df.iterrows():
        # Select country specified
        if row.values[1].lower() == country.lower():
            if countryFound == True:
                exit_with_error("ERROR!! {} should only occur "
                                "once".format(country.lower()))
            yV = np.asarray(row.values[4:],dtype=np.float32)    # y vector -cases
            xV = np.asarray(range(len(yV)))                     # x vector - days
            n = len(xV)                                         # Number of days
            countryFound = True
    # Without this check the plotting code below fails on unbound xV / yV
    if not countryFound:
        exit_with_error("ERROR!! {} not found in {}\n".format(country,
                        dataPath))

    fig, ax = plt.subplots(1,1)
    # Generate Plot
    if plotType == "log-lin":
        ylabel = "ln(cases + 1)"
        print(yV)
        yV = yV + 1
        yV = np.log(yV)
        # Fit everything unless a non-zero slice index narrows the window
        xfit = xV
        yfit = yV
        if slcIdx is not None:
            if slcIdx < 0:
                xfit = xV[slcIdx:]
                yfit = yV[slcIdx:]
            elif slcIdx > 0:
                xfit = xV[:slcIdx]
                yfit = yV[:slcIdx]
        fit = np.polyfit(xfit,yfit,deg=1)
        # Reuse xfit, and yfit : evaluate the fitted line on 100 points
        # spanning all n days
        xfit= np.asarray([x for x in np.arange(0,n,n/100.0)])
        yfit= fit[0]*xfit + fit[1]
        ax.plot(xfit, yfit, label="Fit - y={:.3f}x+{:.3f}".format(fit[0],fit[1]))
        ax.set_title("Covid-19 in {} (ending {})".format(country, lastDate))
    elif plotType == "lin-lin":
        ylabel = "Covid-19_Cases"
        exit_with_error("ERROR!! I haven't handled this option yet\n")
    else:
        exit_with_error("ERROR!! Invalid plotType option\n")

    ax.plot(xV, yV, label=ylabel)
    ax.set_xlabel("Time spanning 0-{} days".format(n-1))
    ax.set_ylabel("{}".format(ylabel))
    ax.legend()
    plt.show()

    print("Ended : %s"%(time.strftime("%D:%H:%M:%S")))
    print("Run Time : {:.4f} h".format((time.time() - startTime)/3600.0))
    sys.exit(0)
def main():
    """
    ARGS:
        None. Takes no command line arguments (besides -h).
    RETURN:
        None. Calls sys.exit(0) when finished.
    DESCRIPTION:
        Advances the three quantities xD, xS and xI over nDays steps of
        size dt. Each step combines four stage values per quantity as
        x + 1/6*(k1 + 2*k2 + 2*k3 + k4), i.e. the classical 4th order
        Runge-Kutta weighting. The stage values come from k1()..k4() and
        the rate functions f_D, f_S, f_I, all defined elsewhere in this
        file. The state is printed before every step.
    DEBUG:
    FUTURE:
        1. Add option to fit only a specific section of data.
    """
    # Check Python version
    nArg = len(sys.argv)
    # Use python 3
    if sys.version_info[0] != 3:
        exit_with_error("ERROR!!! Use Python 3\n")

    # Get options
    if len(sys.argv) > 1 and "-h" in sys.argv[1]:
        print_help(0)
    elif nArg != 1:
        print_help(1)

    startTime = time.time()
    print("{} \n".format(sys.argv),flush=True)
    print(" Start Time : {}".format(time.strftime("%a, %d %b %Y %H:%M:%S ",
          time.localtime())),flush=True)

    ## Model parameters
    kappa = 1       # =1 is Poisson, >1 is negative binomial. Average contact network density
    beta = 0.1      # constant rate of infection
    gamma = 0.1     # average recovery rate
    mu = 3          # average number of nodes contacts
    delta = 0.1     # quarantine
    rho = 1
    R0 = kappa * beta / gamma
    nDays = 100
    N = 10**3       # Number of agents

    # Initial Conditions
    xS = 1
    xI = rho
    xD = mu * rho

    # One dict of stage values per Runge-Kutta stage, keyed by quantity name
    k1D = dict()
    k2D = dict()
    k3D = dict()
    k4D = dict()
    dt = 1

    # Keyword arguments that are identical for every stage call
    modelArgD = dict(H=dt, Beta=beta, Delta=delta, Kappa=kappa, Gamma=gamma,
                     Mu=mu)
    # (key in the stage dicts, rate function), in evaluation order
    rateFuncL = (('xD', f_D), ('xS', f_S), ('xI', f_I))

    print("{:<6} : {:<10} {:<10} {:<10}".format("Time","xD","xS","xI"))
    for t in range(0,nDays,dt):
        print("{:<6} : {:<10.4e} {:<10.4e} {:<10.4e}".format(t,xD,xS,xI))
        # Every stage of this step is evaluated at the state from the start
        # of the step
        callArgD = dict(modelArgD, X_D=xD, X_S=xS, X_I=xI)

        ## k1
        for key, rateFunc in rateFuncL:
            k1D[key] = k1(Funct=rateFunc, **callArgD)
        ## k2
        for key, rateFunc in rateFuncL:
            k2D[key] = k2(K1D=k1D, Funct=rateFunc, **callArgD)
        ## k3
        for key, rateFunc in rateFuncL:
            k3D[key] = k3(K2D=k2D, Funct=rateFunc, **callArgD)
        ## k4
        for key, rateFunc in rateFuncL:
            k4D[key] = k4(K3D=k3D, Funct=rateFunc, **callArgD)

        # Get next step value
        xD = xD + 1/6*(k1D['xD'] + 2*k2D['xD'] + 2*k3D['xD'] + k4D['xD'])
        xS = xS + 1/6*(k1D['xS'] + 2*k2D['xS'] + 2*k3D['xS'] + k4D['xS'])
        xI = xI + 1/6*(k1D['xI'] + 2*k2D['xI'] + 2*k3D['xI'] + k4D['xI'])

    print("Ended : %s"%(time.strftime("%D:%H:%M:%S")))
    print("Run Time : {:.4f} h".format((time.time() - startTime)/3600.0))
    sys.exit(0)
def check_correct_part(eq):
    """Validate the ordering and spelling of the tokens of an equation.

    ``eq`` is a sequence of string tokens such as
    ``['5', '*', 'X^0', '+', '4', '*', 'X^1', '=', '0']``. Every token
    except the last is checked together with the token that follows it:

    * ``+`` / ``-`` must be followed by a number or an ``X`` term
    * ``*`` must be followed by an ``X`` term
    * a number may contain only digits and at most one ``.``
    * an ``X`` term is ``X`` or ``X^<digits>`` and must be followed by
      ``+``, ``-`` or ``=``
    * ``=`` must be followed by a number, an ``X`` term or ``-``

    In addition the first token must start with a digit, ``X`` or ``-``
    and the last token must end with a digit or ``X``.

    Returns None when the tokens are acceptable. Problems are reported
    through exit_with_error(-5), or exit_with_error(-4) for a bad final
    character (the meaning of the codes is defined by exit_with_error,
    which is not part of this block).
    """
    # An empty token list or an empty token would raise IndexError below;
    # report it through the same path as any other malformed input.
    if not eq or any(len(token) == 0 for token in eq):
        exit_with_error(-5)
        return  # only reached if exit_with_error() does not terminate

    for i in range(len(eq) - 1):
        token = eq[i]
        following = eq[i + 1]
        if token in ('-', '+'):
            if not (following[0].isdigit() or following[0] == 'X'):
                exit_with_error(-5)
        elif token == '*':
            if following[0] != 'X':
                exit_with_error(-5)
        elif token[0].isdigit():
            if token.count('.') > 1:
                exit_with_error(-5)
            if not all(ch.isdigit() or ch == '.' for ch in token):
                exit_with_error(-5)
        elif token[0] == 'X':
            if token.count('X') > 1 or token.count('^') > 1:
                exit_with_error(-5)
            if len(token) > 1:
                if token[1] != '^':
                    exit_with_error(-5)
                else:
                    # Whatever is left after removing 'X' and '^' is the
                    # exponent. Plain str.replace is enough here and avoids
                    # the invalid '\^' escape of the former regex pattern.
                    exponent = token.replace('X', '').replace('^', '')
                    if not all(ch.isdigit() for ch in exponent):
                        exit_with_error(-5)
            if following not in ('+', '-', '='):
                exit_with_error(-5)
        elif token == '=':
            if not (following[0].isdigit() or following[0] == 'X'
                    or following == '-'):
                exit_with_error(-5)
        else:
            exit_with_error(-5)

    firstChar = eq[0][0]
    if not (firstChar.isdigit() or firstChar == 'X' or firstChar == '-'):
        exit_with_error(-5)
    lastChar = eq[-1][-1]
    if not (lastChar.isdigit() or lastChar == 'X'):
        # NOTE(review): these prints look like leftover debugging output;
        # kept so the error path prints exactly what it did before.
        print(eq[0][0])
        print(eq)
        exit_with_error(-4)