def fragment2read(allparameterlist):
    """
    Cut simulated reads out of a transcript, one read (or read pair) per fragment.

    input: a 4-item sequence (txstr, txcigar, fragmentlist, parameterlist)
        txstr         transcript string
        txcigar       transcript cigar, handed to common.cigarsubstr
        fragmentlist  fragments; fragment[0] and fragment[1] are used as the
                      start and end offsets of the fragment inside txstr
        parameterlist read_extract_parameters, one of
                      SE,100,300,400 ; single: read size and fragment range
                      PE,100,300,400 ; paired: read size and fragment range
                      PB,500,1000    ; pacbio: fragment range
    output: sorted list of [readstart, readstr, readcigar].
        SE: one read per fragment, taken from either end of the fragment
            (which end is decided by common.toss()).
        PE: two reads per fragment (fragment start, fragment end); pairs are
            sorted as a unit and then flattened, so mates stay adjacent.
        PB: one read per fragment spanning the whole fragment.
        Any other read type yields an empty list.
    """
    txstr, txcigar, fragmentlist, parameterlist = allparameterlist
    readtype = parameterlist[0]

    def cutread(start, size):
        # One read record: offset in the transcript, bases, matching cigar.
        return [start, txstr[start:start + size],
                common.cigarsubstr(txcigar, start, size)]

    def endreadstart(fragment, size):
        # Start of the read anchored at the fragment end. Clamped so that a
        # fragment shorter than the read size cannot push the start before
        # the fragment (a negative start would make the slice wrap around).
        return max(fragment[0], fragment[1] - size)

    readlist = []
    if readtype == 'SE':
        readsize = int(parameterlist[1])
        for fragment in fragmentlist:
            if common.toss():
                start = fragment[0]
            else:
                start = endreadstart(fragment, readsize)
            readlist.append(cutread(start, readsize))
    elif readtype == 'PE':
        readsize = int(parameterlist[1])
        for fragment in fragmentlist:
            readlist.append([cutread(fragment[0], readsize),
                             cutread(endreadstart(fragment, readsize), readsize)])
    elif readtype == 'PB':
        for fragment in fragmentlist:
            readlist.append(cutread(fragment[0], fragment[1] - fragment[0]))

    readlist.sort()
    if readtype == 'PE':
        # Flatten [[read1, read2], ...] into [read1, read2, ...] after sorting.
        readlist = [read for pair in readlist for read in pair]
    return readlist
def fragment2read(allparameterlist):
    """
    Cut simulated reads out of a transcript, one read (or read pair) per fragment.

    input: a 4-item sequence (txstr, txcigar, fragmentlist, parameterlist)
        txstr         transcript string
        txcigar       transcript cigar, handed to common.cigarsubstr
        fragmentlist  fragments; fragment[0] and fragment[1] are used as the
                      start and end offsets of the fragment inside txstr
        parameterlist read_extract_parameters, one of
                      SE,100,300,400 ; single: read size and fragment range
                      PE,100,300,400 ; paired: read size and fragment range
                      PB,500,1000    ; pacbio: fragment range
    output: sorted list of [readstart, readstr, readcigar].
        SE: one read per fragment, taken from either end of the fragment
            (which end is decided by common.toss()).
        PE: two reads per fragment (fragment start, fragment end); pairs are
            sorted as a unit and then flattened, so mates stay adjacent.
        PB: one read per fragment spanning the whole fragment.
        Any other read type yields an empty list.
    """
    txstr, txcigar, fragmentlist, parameterlist = allparameterlist
    readtype = parameterlist[0]

    def cutread(start, size):
        # One read record: offset in the transcript, bases, matching cigar.
        return [start, txstr[start:start + size],
                common.cigarsubstr(txcigar, start, size)]

    def endreadstart(fragment, size):
        # Start of the read anchored at the fragment end. Clamped so that a
        # fragment shorter than the read size cannot push the start before
        # the fragment (a negative start would make the slice wrap around).
        return max(fragment[0], fragment[1] - size)

    readlist = []
    if readtype == 'SE':
        readsize = int(parameterlist[1])
        for fragment in fragmentlist:
            if common.toss():
                start = fragment[0]
            else:
                start = endreadstart(fragment, readsize)
            readlist.append(cutread(start, readsize))
    elif readtype == 'PE':
        readsize = int(parameterlist[1])
        for fragment in fragmentlist:
            readlist.append([cutread(fragment[0], readsize),
                             cutread(endreadstart(fragment, readsize), readsize)])
    elif readtype == 'PB':
        for fragment in fragmentlist:
            readlist.append(cutread(fragment[0], fragment[1] - fragment[0]))

    readlist.sort()
    if readtype == 'PE':
        # Flatten [[read1, read2], ...] into [read1, read2, ...] after sorting.
        readlist = [read for pair in readlist for read in pair]
    return readlist
def adderrortoread(readstr, readcigar, readqualitydeteriorationrate, indelrate):
    """
    Inject simulated sequencing errors (substitutions and 1-base indels) into a read.

    input:
        readstr   read bases
        readcigar cigar of the error-free read, sliced with common.cigarsubstr
        readqualitydeteriorationrate
                  amount added to the substitution rate after every base, so
                  the rate starts at 0 and grows along the read
        indelrate rate passed to common.toss() to decide whether an indel is
                  placed at a base (never at base 0, never at two adjacent bases)
    output: [errreadstr, errcigarstr]
        A substituted A/C/G/T is written as its lowercase complement; any other
        character (e.g. N or a lowercase base) is copied unchanged.
        An insertion duplicates the previously emitted base and adds '1I' to
        the cigar; a deletion drops the base and adds '1D'.

    NOTE(review): common.toss(p) is assumed to return True with probability p
    and common.toss() to be a fair coin - confirm in common.
    """
    switchdict = {'A': 't', 'T': 'a', 'C': 'g', 'G': 'c'}
    outbases = []
    errorrate = 0
    indellist = [0]        # segment boundaries: 0, every indel position, len(readstr)
    indelflaglist = [-1]   # per boundary: 1 insertion, 0 deletion, -1 none
    for i, base in enumerate(readstr):
        # toss() is evaluated first on purpose: it is called for every base.
        if common.toss(indelrate) and i != 0 and indellist[-1] != (i - 1):
            indellist.append(i)
            if common.toss():
                outbases.append(outbases[-1])
                outbases.append(base)
                indelflaglist.append(1)
            else:
                indelflaglist.append(0)
        elif common.toss(errorrate):
            # .get(): bases outside A/C/G/T have no substitution partner.
            outbases.append(switchdict.get(base, base))
        else:
            outbases.append(base)
        errorrate += readqualitydeteriorationrate
    indellist.append(len(readstr))
    indelflaglist.append(-1)

    cigarparts = []
    for i in range(1, len(indellist)):
        # A deleted base is already accounted for by the '1D' that closed the
        # previous segment; it is absent from the read, so the following
        # match segment must start one base later. Otherwise the cigar
        # describes more bases than the emitted sequence contains.
        deleted = indelflaglist[i - 1] == 0
        segstart = indellist[i - 1] + (1 if deleted else 0)
        seglen = indellist[i] - segstart
        if seglen > 0 or not deleted:
            cigarparts.append(common.cigarsubstr(readcigar, segstart, seglen))
        if indelflaglist[i] == 1:
            cigarparts.append('1I')
        elif indelflaglist[i] == 0:
            cigarparts.append('1D')
    return [''.join(outbases), ''.join(cigarparts)]
def adderrortoread(readstr, readcigar, readqualitydeteriorationrate, indelrate):
    """
    Inject simulated sequencing errors (substitutions and 1-base indels) into a read.

    input:
        readstr   read bases
        readcigar cigar of the error-free read, sliced with common.cigarsubstr
        readqualitydeteriorationrate
                  amount added to the substitution rate after every base, so
                  the rate starts at 0 and grows along the read
        indelrate rate passed to common.toss() to decide whether an indel is
                  placed at a base (never at base 0, never at two adjacent bases)
    output: [errreadstr, errcigarstr]
        A substituted A/C/G/T is written as its lowercase complement; any other
        character (e.g. N or a lowercase base) is copied unchanged.
        An insertion duplicates the previously emitted base and adds '1I' to
        the cigar; a deletion drops the base and adds '1D'.

    NOTE(review): common.toss(p) is assumed to return True with probability p
    and common.toss() to be a fair coin - confirm in common.
    """
    switchdict = {'A': 't', 'T': 'a', 'C': 'g', 'G': 'c'}
    outbases = []
    errorrate = 0
    indellist = [0]        # segment boundaries: 0, every indel position, len(readstr)
    indelflaglist = [-1]   # per boundary: 1 insertion, 0 deletion, -1 none
    for i, base in enumerate(readstr):
        # toss() is evaluated first on purpose: it is called for every base.
        if common.toss(indelrate) and i != 0 and indellist[-1] != (i - 1):
            indellist.append(i)
            if common.toss():
                outbases.append(outbases[-1])
                outbases.append(base)
                indelflaglist.append(1)
            else:
                indelflaglist.append(0)
        elif common.toss(errorrate):
            # .get(): bases outside A/C/G/T have no substitution partner.
            outbases.append(switchdict.get(base, base))
        else:
            outbases.append(base)
        errorrate += readqualitydeteriorationrate
    indellist.append(len(readstr))
    indelflaglist.append(-1)

    cigarparts = []
    for i in range(1, len(indellist)):
        # A deleted base is already accounted for by the '1D' that closed the
        # previous segment; it is absent from the read, so the following
        # match segment must start one base later. Otherwise the cigar
        # describes more bases than the emitted sequence contains.
        deleted = indelflaglist[i - 1] == 0
        segstart = indellist[i - 1] + (1 if deleted else 0)
        seglen = indellist[i] - segstart
        if seglen > 0 or not deleted:
            cigarparts.append(common.cigarsubstr(readcigar, segstart, seglen))
        if indelflaglist[i] == 1:
            cigarparts.append('1I')
        elif indelflaglist[i] == 0:
            cigarparts.append('1D')
    return [''.join(outbases), ''.join(cigarparts)]
def _writesimulatedreads(readlist, tx, chrname, startbase, cigarstr,
                         alignfout, alignerrfout, config_object, adderrors):
    """Write one SAM line per read, plus an error-injected line when adderrors is true."""
    samline = '%s:%d:%d\t1\t%s\t%d\t0\t%s\t*\t0\t0\t%s\t*\n'
    inum = 0
    for rstartbase, readstr, readcigar in readlist:
        # Offset of the read start relative to startbase: total length of all
        # M and N runs in the transcript cigar up to and including the read's
        # first base, minus one.
        readstartlocation = sum(
            [int(x) for x in common.cigarsubstr(cigarstr, 0, rstartbase + 1)
             .replace('M', 'N').split('N')[:-1]]) - 1
        inum += 1
        alignfout.write(samline % (tx, inum, rstartbase, chrname,
                                   startbase + readstartlocation,
                                   readcigar, readstr))
        if adderrors:
            errreadstr, errcigarstr = adderrortoread(
                readstr, readcigar,
                config_object.readqualitydeteriorationrate,
                config_object.indelrate)
            alignerrfout.write(samline % (tx, inum, rstartbase, chrname,
                                          startbase + readstartlocation,
                                          errcigarstr, errreadstr))


def gensimulatedreadsdata(sample_id, replicate_id, folderdict, genetxreaddict,
                          genetxcigardict, config_object):
    """
    Generate simulated read alignments for one sample/replicate.

    Writes, under '<folderdict['data'][2]>/T<sample_id>S<replicate_id>/':
        alignments.sam / .bam              error-free reads
        alignments_with_errors.sam / .bam  reads passed through adderrortoread
            (lines and BAM are produced only if
            readqualitydeteriorationrate or indelrate is > 0; the SAM file
            itself is always created)
    and '<folderdict['metadata']>/expression_T..S...txt' with one line per
    transcript of txgenereaddict.

    For every chromosome that has a '<reference_genome_dir>/<chr>.fa' file,
    each transcript string is fetched with getRNAtranscriptstring, fragmented
    with getfragments, turned into reads by config_object.read_extract_method
    (via runfunc) and written as SAM. Transcripts shorter than the minimum
    fragment size are skipped. SAM files are converted with
    '<pathsamtools>/samtools view -bt'.

    When the module-level debug_flg is 2, per-phase timings are collected and
    reported through common.printstatus.

    output: always 1.
    """
    debug = debug_flg == 2
    if debug:
        funcstarttime = time.time()
    fragmentsizerange = [int(config_object.read_extract_parameters[2]),
                         int(config_object.read_extract_parameters[3])]
    txstartbias = config_object.txstartbias
    # Tab-separated file: column 0 is the key, column 1 a float score.
    with open(config_object.cutpreferencefile) as cutfile:
        cutscoredict = dict([(ln.split('\t')[0],
                              float(ln.rstrip('\n').split('\t')[1]))
                             for ln in cutfile])

    sampledir = '%s/T%02dS%02d' % (folderdict['data'][2], sample_id, replicate_id)
    alignsam = '%s/alignments.sam' % sampledir
    alignbam = '%s/alignments.bam' % sampledir
    alignerrsam = '%s/alignments_with_errors.sam' % sampledir
    alignerrbam = '%s/alignments_with_errors.bam' % sampledir

    txgenereaddict = genetxreads2txgenereads(genetxreaddict, genetxcigardict)
    metadatafile = '%s/expression_T%02dS%02d.txt' % (folderdict['metadata'],
                                                     sample_id, replicate_id)
    with open(metadatafile, 'w') as fout:
        for gene in txgenereaddict:
            for tx in txgenereaddict[gene]:
                fout.write('%s:%s:%s\t%s\tTX\t%d\t%6.4f\t%d\t%d\t%s\n'
                           % tuple(txgenereaddict[gene][tx]))

    chrgenedict = genetxdict2chrgenedict(genetxcigardict)
    # sorted() instead of keys()+sort(): identical order, works on any dict view.
    chrlist = sorted(chrgenedict)
    adderrors = max(config_object.readqualitydeteriorationrate,
                    config_object.indelrate) > 0.0

    if debug:
        tottxstrgettime = 0
        totfraglistgettime = 0
        totreadlistgettime = 0
        toterrorreadgettime = 0

    # 'with' guarantees the SAM and FASTA handles are closed even on error.
    with open(alignsam, 'w') as alignfout:
        with open(alignerrsam, 'w') as alignerrfout:
            for chrname in chrlist:
                chrfafile = '%s/%s.fa' % (config_object.reference_genome_dir,
                                          chrname)
                if not os.path.isfile(chrfafile):
                    message = '%s: not found' % chrfafile
                    common.printstatus(message, 'S', common.func_name())
                    continue
                with open(chrfafile) as chrfafileptr:
                    for gene in chrgenedict[chrname]:
                        for tx in genetxcigardict[gene]:
                            if debug:
                                loopstarttime = time.time()
                            numreads = txgenereaddict[gene][tx][6]
                            startbase = genetxcigardict[gene][tx][2]
                            cigarstr = genetxcigardict[gene][tx][3]
                            txstr = getRNAtranscriptstring(chrfafileptr,
                                                           startbase, cigarstr)
                            if debug:
                                txstrgettime = time.time()
                                tottxstrgettime += txstrgettime - loopstarttime
                                # NOTE(review): per-transcript status is emitted
                                # in debug mode only - confirm this matches the
                                # intended nesting.
                                message = 'Gene:%s, Transcript:%s; Transcript Size:%d, Num reads=%d' % (
                                    gene, tx, len(txstr), numreads)
                                common.printstatus(message, 'S',
                                                   common.func_name())
                            if len(txstr) < fragmentsizerange[0]:
                                continue
                            # empirical number of cuts
                            numcutmu = max(1, int(len(txstr) * 4.0 /
                                                  (fragmentsizerange[0] +
                                                   fragmentsizerange[1])))
                            numcutsig = numcutmu / 2.0
                            fragmentlist = getfragments(
                                txstr, numreads, txgenereaddict[gene][tx][2],
                                cutscoredict, fragmentsizerange, txstartbias,
                                numcutmu, numcutsig)
                            if debug:
                                fraglistgettime = time.time()
                                totfraglistgettime += fraglistgettime - txstrgettime
                            allparameterlist = [txstr, cigarstr, fragmentlist,
                                                config_object.read_extract_parameters]
                            readlist = runfunc(config_object.read_extract_method,
                                               allparameterlist)
                            if debug:
                                readlistgettime = time.time()
                                totreadlistgettime += readlistgettime - fraglistgettime
                            _writesimulatedreads(readlist, tx, chrname, startbase,
                                                 cigarstr, alignfout, alignerrfout,
                                                 config_object, adderrors)
                            if debug:
                                errorreadgettime = time.time()
                                toterrorreadgettime += errorreadgettime - readlistgettime

    if debug:
        functime = time.time() - funcstarttime
        othertime = functime - (tottxstrgettime + totfraglistgettime +
                                totreadlistgettime + toterrorreadgettime)
        message = '\n\nFunc Time=%s,Txstr=%4.2f,Fragtime=%4.2f,Readtime=%4.2f,Errtime=%4.2f,Oth=%4.2f\n\n' % \
            (functime, tottxstrgettime * 100 / functime,
             totfraglistgettime * 100 / functime,
             totreadlistgettime * 100 / functime,
             toterrorreadgettime * 100 / functime,
             othertime * 100 / functime)
        common.printstatus(message, 'S', common.func_name())

    cmd = '%s/samtools view -bt %s %s >%s' % (config_object.pathsamtools,
                                              config_object.chrfaifile,
                                              alignsam, alignbam)
    message = 'Running %s' % cmd
    common.printstatus(message, 'S', common.func_name())
    os.system(cmd)
    if adderrors:
        cmd = '%s/samtools view -bt %s %s >%s' % (config_object.pathsamtools,
                                                  config_object.chrfaifile,
                                                  alignerrsam, alignerrbam)
        message = 'Running %s' % cmd
        common.printstatus(message, 'S', common.func_name())
        os.system(cmd)
    return 1
def _writesimulatedreads(readlist, tx, chrname, startbase, cigarstr,
                         alignfout, alignerrfout, config_object, adderrors):
    """Write one SAM line per read, plus an error-injected line when adderrors is true."""
    samline = '%s:%d:%d\t1\t%s\t%d\t0\t%s\t*\t0\t0\t%s\t*\n'
    inum = 0
    for rstartbase, readstr, readcigar in readlist:
        # Offset of the read start relative to startbase: total length of all
        # M and N runs in the transcript cigar up to and including the read's
        # first base, minus one.
        readstartlocation = sum(
            [int(x) for x in common.cigarsubstr(cigarstr, 0, rstartbase + 1)
             .replace('M', 'N').split('N')[:-1]]) - 1
        inum += 1
        alignfout.write(samline % (tx, inum, rstartbase, chrname,
                                   startbase + readstartlocation,
                                   readcigar, readstr))
        if adderrors:
            errreadstr, errcigarstr = adderrortoread(
                readstr, readcigar,
                config_object.readqualitydeteriorationrate,
                config_object.indelrate)
            alignerrfout.write(samline % (tx, inum, rstartbase, chrname,
                                          startbase + readstartlocation,
                                          errcigarstr, errreadstr))


def gensimulatedreadsdata(sample_id, replicate_id, folderdict, genetxreaddict,
                          genetxcigardict, config_object):
    """
    Generate simulated read alignments for one sample/replicate.

    Writes, under '<folderdict['data'][2]>/T<sample_id>S<replicate_id>/':
        alignments.sam / .bam              error-free reads
        alignments_with_errors.sam / .bam  reads passed through adderrortoread
            (lines and BAM are produced only if
            readqualitydeteriorationrate or indelrate is > 0; the SAM file
            itself is always created)
    and '<folderdict['metadata']>/expression_T..S...txt' with one line per
    transcript of txgenereaddict.

    For every chromosome that has a '<reference_genome_dir>/<chr>.fa' file,
    each transcript string is fetched with getRNAtranscriptstring, fragmented
    with getfragments, turned into reads by config_object.read_extract_method
    (via runfunc) and written as SAM. Transcripts shorter than the minimum
    fragment size are skipped. SAM files are converted with
    '<pathsamtools>/samtools view -bt'.

    When the module-level debug_flg is 2, per-phase timings are collected and
    reported through common.printstatus.

    output: always 1.
    """
    debug = debug_flg == 2
    if debug:
        funcstarttime = time.time()
    fragmentsizerange = [int(config_object.read_extract_parameters[2]),
                         int(config_object.read_extract_parameters[3])]
    txstartbias = config_object.txstartbias
    # Tab-separated file: column 0 is the key, column 1 a float score.
    with open(config_object.cutpreferencefile) as cutfile:
        cutscoredict = dict([(ln.split('\t')[0],
                              float(ln.rstrip('\n').split('\t')[1]))
                             for ln in cutfile])

    sampledir = '%s/T%02dS%02d' % (folderdict['data'][2], sample_id, replicate_id)
    alignsam = '%s/alignments.sam' % sampledir
    alignbam = '%s/alignments.bam' % sampledir
    alignerrsam = '%s/alignments_with_errors.sam' % sampledir
    alignerrbam = '%s/alignments_with_errors.bam' % sampledir

    txgenereaddict = genetxreads2txgenereads(genetxreaddict, genetxcigardict)
    metadatafile = '%s/expression_T%02dS%02d.txt' % (folderdict['metadata'],
                                                     sample_id, replicate_id)
    with open(metadatafile, 'w') as fout:
        for gene in txgenereaddict:
            for tx in txgenereaddict[gene]:
                fout.write('%s:%s:%s\t%s\tTX\t%d\t%6.4f\t%d\t%d\t%s\n'
                           % tuple(txgenereaddict[gene][tx]))

    chrgenedict = genetxdict2chrgenedict(genetxcigardict)
    # sorted() instead of keys()+sort(): identical order, works on any dict view.
    chrlist = sorted(chrgenedict)
    adderrors = max(config_object.readqualitydeteriorationrate,
                    config_object.indelrate) > 0.0

    if debug:
        tottxstrgettime = 0
        totfraglistgettime = 0
        totreadlistgettime = 0
        toterrorreadgettime = 0

    # 'with' guarantees the SAM and FASTA handles are closed even on error.
    with open(alignsam, 'w') as alignfout:
        with open(alignerrsam, 'w') as alignerrfout:
            for chrname in chrlist:
                chrfafile = '%s/%s.fa' % (config_object.reference_genome_dir,
                                          chrname)
                if not os.path.isfile(chrfafile):
                    message = '%s: not found' % chrfafile
                    common.printstatus(message, 'S', common.func_name())
                    continue
                with open(chrfafile) as chrfafileptr:
                    for gene in chrgenedict[chrname]:
                        for tx in genetxcigardict[gene]:
                            if debug:
                                loopstarttime = time.time()
                            numreads = txgenereaddict[gene][tx][6]
                            startbase = genetxcigardict[gene][tx][2]
                            cigarstr = genetxcigardict[gene][tx][3]
                            txstr = getRNAtranscriptstring(chrfafileptr,
                                                           startbase, cigarstr)
                            if debug:
                                txstrgettime = time.time()
                                tottxstrgettime += txstrgettime - loopstarttime
                                # NOTE(review): per-transcript status is emitted
                                # in debug mode only - confirm this matches the
                                # intended nesting.
                                message = 'Gene:%s, Transcript:%s; Transcript Size:%d, Num reads=%d' % (
                                    gene, tx, len(txstr), numreads)
                                common.printstatus(message, 'S',
                                                   common.func_name())
                            if len(txstr) < fragmentsizerange[0]:
                                continue
                            # empirical number of cuts
                            numcutmu = max(1, int(len(txstr) * 4.0 /
                                                  (fragmentsizerange[0] +
                                                   fragmentsizerange[1])))
                            numcutsig = numcutmu / 2.0
                            fragmentlist = getfragments(
                                txstr, numreads, txgenereaddict[gene][tx][2],
                                cutscoredict, fragmentsizerange, txstartbias,
                                numcutmu, numcutsig)
                            if debug:
                                fraglistgettime = time.time()
                                totfraglistgettime += fraglistgettime - txstrgettime
                            allparameterlist = [txstr, cigarstr, fragmentlist,
                                                config_object.read_extract_parameters]
                            readlist = runfunc(config_object.read_extract_method,
                                               allparameterlist)
                            if debug:
                                readlistgettime = time.time()
                                totreadlistgettime += readlistgettime - fraglistgettime
                            _writesimulatedreads(readlist, tx, chrname, startbase,
                                                 cigarstr, alignfout, alignerrfout,
                                                 config_object, adderrors)
                            if debug:
                                errorreadgettime = time.time()
                                toterrorreadgettime += errorreadgettime - readlistgettime

    if debug:
        functime = time.time() - funcstarttime
        othertime = functime - (tottxstrgettime + totfraglistgettime +
                                totreadlistgettime + toterrorreadgettime)
        message = '\n\nFunc Time=%s,Txstr=%4.2f,Fragtime=%4.2f,Readtime=%4.2f,Errtime=%4.2f,Oth=%4.2f\n\n' % \
            (functime, tottxstrgettime * 100 / functime,
             totfraglistgettime * 100 / functime,
             totreadlistgettime * 100 / functime,
             toterrorreadgettime * 100 / functime,
             othertime * 100 / functime)
        common.printstatus(message, 'S', common.func_name())

    cmd = '%s/samtools view -bt %s %s >%s' % (config_object.pathsamtools,
                                              config_object.chrfaifile,
                                              alignsam, alignbam)
    message = 'Running %s' % cmd
    common.printstatus(message, 'S', common.func_name())
    os.system(cmd)
    if adderrors:
        cmd = '%s/samtools view -bt %s %s >%s' % (config_object.pathsamtools,
                                                  config_object.chrfaifile,
                                                  alignerrsam, alignerrbam)
        message = 'Running %s' % cmd
        common.printstatus(message, 'S', common.func_name())
        os.system(cmd)
    return 1