Beispiel #1
0
def fragment2read(allparameterlist):
    """
    Cut simulated reads out of a transcript, one (or one pair) per fragment.

    input:
        allparameterlist = [txstr, txcigar, fragmentlist, parameterlist]
          txstr         transcript sequence string
          txcigar       transcript cigar, handed to common.cigarsubstr
          fragmentlist  fragments indexed as fragment[0] (start) and
                        fragment[1] (end) in transcript coordinates
          parameterlist read_extract_parameters, e.g.
                        SE,100,300,400  ; single: size and fragment range
                        PE,100,300,400  ; paired: size and fragment range
                        PB,500,1000     ; pacbio fragment range
    output:
        list of [start, readstr, readcigar]
          SE: one read per fragment, taken from the fragment start or from
              the fragment end depending on common.toss(); sorted
          PE: two reads per fragment (start-anchored, then end-anchored);
              the pairs are sorted as pairs and then flattened, so mates
              stay adjacent
          PB: one read spanning the whole fragment; sorted
        Any other mode yields an empty list.
    """
    txstr, txcigar, fragmentlist, parameterlist = allparameterlist

    def cut(start, size):
        # Cigar and sequence are taken for the same transcript window.
        cigar = common.cigarsubstr(txcigar, start, size)
        return [start, txstr[start:start + size], cigar]

    mode = parameterlist[0]
    reads = []
    if mode == 'SE':
        readlen = int(parameterlist[1])
        for frag in fragmentlist:
            start = frag[0] if common.toss() else frag[1] - readlen
            reads.append(cut(start, readlen))
    elif mode == 'PE':
        readlen = int(parameterlist[1])
        for frag in fragmentlist:
            left = cut(frag[0], readlen)
            right = cut(frag[1] - readlen, readlen)
            reads.append([left, right])
    elif mode == 'PB':
        for frag in fragmentlist:
            reads.append(cut(frag[0], frag[1] - frag[0]))

    # Lists compare element-wise, so this orders by start position first.
    reads.sort()
    if mode == 'PE':
        reads = [mate for pair in reads for mate in pair]
    return reads
Beispiel #2
0
def fragment2read(allparameterlist):
    """
    Extract reads from a transcript according to the configured layout.

    input:
        [transcript string, transcript cigar, fragmentlist, parameterlist]
        where parameterlist is read_extract_parameters:
            SE,100,300,400       ; single: size and fragment range
            PE,100,300,400       ; paired: size and fragment range
            PB,500,1000          ; pacbio fragment range
        and each fragment is indexed as (start, end) on the transcript.
    output:
        sorted list of [readstart,readstr,readcigar]; for PE the two mates
        of a fragment are kept next to each other (pairs are sorted, then
        flattened). An unrecognised layout gives an empty list.
    """
    txstr,txcigar,fragmentlist,parameterlist=allparameterlist
    layout=parameterlist[0]

    def window(begin,length):
        # one read record: start offset, sequence slice, matching cigar slice
        cig=common.cigarsubstr(txcigar,begin,length)
        seq=txstr[begin:begin+length]
        return [begin,seq,cig]

    if layout=='SE':
        size=int(parameterlist[1])
        # common.toss() decides per fragment which end the read is taken from
        return sorted(window(frag[0] if common.toss() else frag[1]-size,size)
                      for frag in fragmentlist)
    if layout=='PE':
        size=int(parameterlist[1])
        matepairs=sorted([window(frag[0],size),window(frag[1]-size,size)]
                         for frag in fragmentlist)
        flattened=[]
        for first,second in matepairs:
            flattened.extend((first,second))
        return flattened
    if layout=='PB':
        # the read covers the whole fragment
        return sorted(window(frag[0],frag[1]-frag[0]) for frag in fragmentlist)
    return []
Beispiel #3
0
def adderrortoread(readstr,readcigar,readqualitydeteriorationrate,indelrate):
    """
    Return a copy of a read with simulated substitutions and indels.

    input:
        readstr                       read sequence
        readcigar                     cigar of the error-free read
        readqualitydeteriorationrate  amount added to the substitution rate
                                      after every base (rate starts at 0)
        indelrate                     rate passed to common.toss for indels
    output:
        [errreadstr,errcigarstr]

    Substituted bases are replaced by the lowercase base listed in
    switchdict. Bases that are not in switchdict (e.g. 'N' or lowercase
    input) are copied unchanged instead of raising KeyError.
    common.toss(p) is presumably a Bernoulli draw with probability p --
    confirm against the common module.
    """
    switchdict={'A':'t','T':'a','C':'g','G':'c'}
    outchars=[]
    errorrate=0
    # indellist holds the read offsets where the cigar is split; the flag at
    # the same index says what follows that split: 1=insertion, 0=deletion,
    # -1=nothing (sentinels at both ends).
    indellist=[0]
    indelflaglist=[-1]
    for i,base in enumerate(readstr):
        # toss is evaluated first on every base; indels are never placed on
        # the first base or directly after the previous indel.
        if common.toss(indelrate) and i!=0 and indellist[-1]!=(i-1):
            indellist.append(i)
            if common.toss():
                # insertion: repeat the previously emitted character
                outchars.append(outchars[-1])
                outchars.append(base)
                indelflaglist.append(1)
            else:
                # deletion: the base is not emitted
                indelflaglist.append(0)
        elif common.toss(errorrate):
            outchars.append(switchdict.get(base,base))
        else:
            outchars.append(base)
        errorrate+=readqualitydeteriorationrate
    indellist.append(len(readstr))
    indelflaglist.append(-1)
    # NOTE(review): after a deletion at offset i the next cigar segment still
    # starts at i, so the deleted base appears to be counted in the following
    # segment and the cigar's query length would exceed len(outstr) by one per
    # deletion -- confirm against common.cigarsubstr before changing.
    cigarparts=[]
    for k in range(1,len(indellist)):
        segstart=indellist[k-1]
        cigarparts.append(common.cigarsubstr(readcigar,segstart,indellist[k]-segstart))
        if indelflaglist[k]==1:
            cigarparts.append('1I')
        elif indelflaglist[k]==0:
            cigarparts.append('1D')

    return [''.join(outchars),''.join(cigarparts)]
Beispiel #4
0
def adderrortoread(readstr, readcigar, readqualitydeteriorationrate,
                   indelrate):
    """
    Return a copy of a read with simulated substitutions and indels.

    input:
        readstr                       read sequence
        readcigar                     cigar of the error-free read
        readqualitydeteriorationrate  amount added to the substitution rate
                                      after every base (rate starts at 0)
        indelrate                     rate passed to common.toss for indels
    output:
        [errreadstr, errcigarstr]

    Substituted bases are replaced by the lowercase base listed in
    switchdict. Bases that are not in switchdict (e.g. 'N' or lowercase
    input) are copied unchanged instead of raising KeyError.
    common.toss(p) is presumably a Bernoulli draw with probability p --
    confirm against the common module.
    """
    switchdict = {'A': 't', 'T': 'a', 'C': 'g', 'G': 'c'}
    outchars = []
    errorrate = 0
    # indellist holds the read offsets where the cigar is split; the flag at
    # the same index says what follows that split: 1=insertion, 0=deletion,
    # -1=nothing (sentinels at both ends).
    indellist = [0]
    indelflaglist = [-1]
    for i, base in enumerate(readstr):
        # toss is evaluated first on every base; indels are never placed on
        # the first base or directly after the previous indel.
        if common.toss(indelrate) and i != 0 and indellist[-1] != (i - 1):
            indellist.append(i)
            if common.toss():
                # insertion: repeat the previously emitted character
                outchars.append(outchars[-1])
                outchars.append(base)
                indelflaglist.append(1)
            else:
                # deletion: the base is not emitted
                indelflaglist.append(0)
        elif common.toss(errorrate):
            outchars.append(switchdict.get(base, base))
        else:
            outchars.append(base)
        errorrate += readqualitydeteriorationrate
    indellist.append(len(readstr))
    indelflaglist.append(-1)
    # NOTE(review): after a deletion at offset i the next cigar segment still
    # starts at i, so the deleted base appears to be counted in the following
    # segment and the cigar's query length would exceed len(outstr) by one per
    # deletion -- confirm against common.cigarsubstr before changing.
    cigarparts = []
    for k in range(1, len(indellist)):
        segstart = indellist[k - 1]
        cigarparts.append(
            common.cigarsubstr(readcigar, segstart, indellist[k] - segstart))
        if indelflaglist[k] == 1:
            cigarparts.append('1I')
        elif indelflaglist[k] == 0:
            cigarparts.append('1D')

    return [''.join(outchars), ''.join(cigarparts)]
Beispiel #5
0
def gensimulatedreadsdata(sample_id,replicate_id,folderdict,genetxreaddict,genetxcigardict,config_object):
    """
    Simulate reads for one sample/replicate and write them as SAM and BAM.

    Files written:
        <folderdict['data'][2]>/TxxSyy/alignments.sam (+ .bam)
        <folderdict['data'][2]>/TxxSyy/alignments_with_errors.sam
            (always created; filled and converted to .bam only when
            readqualitydeteriorationrate or indelrate is > 0)
        <folderdict['metadata']>/expression_TxxSyy.txt

    For every chromosome that has <reference_genome_dir>/<chr>.fa, each
    transcript is fetched with getRNAtranscriptstring, fragmented with
    getfragments, turned into reads by config_object.read_extract_method
    (via runfunc) and written out; adderrortoread supplies the error copy.
    Chromosomes without a FASTA file are reported and skipped. SAM files
    are converted with "samtools view -bt"; its exit status is not checked.

    Returns 1.
    """
    if debug_flg==2:
        funcstarttime=time.time()
    extractparams=config_object.read_extract_parameters
    # SE/PE lists are (mode,size,min,max). The PB layout is documented as
    # (PB,min,max), which has no index 3, so shorter lists take the fragment
    # range from indexes 1 and 2 instead of raising IndexError.
    if len(extractparams)>=4:
        fragmentsizerange=[int(extractparams[2]),int(extractparams[3])]
    else:
        fragmentsizerange=[int(extractparams[1]),int(extractparams[2])]
    txstartbias=config_object.txstartbias
    # tab-separated file: key in column 0, float score in column 1
    cutscoredict={}
    with open(config_object.cutpreferencefile) as cutfile:
        for ln in cutfile:
            fields=ln.rstrip('\n').split('\t')
            cutscoredict[fields[0]]=float(fields[1])

    alignsam='%s/T%02dS%02d/alignments.sam'%(folderdict['data'][2],sample_id,replicate_id)
    alignbam='%s/T%02dS%02d/alignments.bam'%(folderdict['data'][2],sample_id,replicate_id)
    alignerrsam='%s/T%02dS%02d/alignments_with_errors.sam'%(folderdict['data'][2],sample_id,replicate_id)
    alignerrbam='%s/T%02dS%02d/alignments_with_errors.bam'%(folderdict['data'][2],sample_id,replicate_id)
    samrecord='%s:%d:%d\t1\t%s\t%d\t0\t%s\t*\t0\t0\t%s\t*\n'
    # the error model is on when either rate is positive
    adderrors=max(config_object.readqualitydeteriorationrate,config_object.indelrate)>0.0
    with open(alignsam,'w') as alignfout, open(alignerrsam,'w') as alignerrfout:
        txgenereaddict=genetxreads2txgenereads(genetxreaddict,genetxcigardict)
        metadatafile='%s/expression_T%02dS%02d.txt'%(folderdict['metadata'],sample_id,replicate_id)
        with open(metadatafile,'w') as fout:
            for gene in txgenereaddict:
                for tx in txgenereaddict[gene]:
                    fout.write('%s:%s:%s\t%s\tTX\t%d\t%6.4f\t%d\t%d\t%s\n'%tuple(txgenereaddict[gene][tx]))
        chrgenedict=genetxdict2chrgenedict(genetxcigardict)
        # sorted() works on Python 2 and 3; dict.keys().sort() is Python 2 only
        chrlist=sorted(chrgenedict)
        if debug_flg==2:
            tottxstrgettime=0
            totfraglistgettime=0
            totreadlistgettime=0
            toterrorreadgettime=0
        for chrname in chrlist:
            chrfafile='%s/%s.fa'%(config_object.reference_genome_dir,chrname)
            if not os.path.isfile(chrfafile):
                message='%s: not found'%chrfafile
                common.printstatus(message,'S',common.func_name())
                continue
            # one FASTA handle per chromosome, closed before the next one
            with open(chrfafile) as chrfafileptr:
                for gene in chrgenedict[chrname]:
                    for tx in genetxcigardict[gene]:
                        if debug_flg==2:
                            loopstarttime=time.time()
                        numreads=txgenereaddict[gene][tx][6]
                        startbase,cigarstr=genetxcigardict[gene][tx][2],genetxcigardict[gene][tx][3]
                        txstr=getRNAtranscriptstring(chrfafileptr,startbase,cigarstr)
                        if debug_flg==2:
                            txstrgettime=time.time()
                            tottxstrgettime+=txstrgettime-loopstarttime
                            message='Gene:%s, Transcript:%s; Transcript Size:%d, Num reads=%d'%(gene,tx,len(txstr),numreads)
                            common.printstatus(message,'S',common.func_name())
                        # transcripts shorter than the minimum fragment size yield no reads
                        if len(txstr)<fragmentsizerange[0]:
                            continue
                        #empirical number of cuts
                        numcutmu=max(1,int(len(txstr)*4.0/(fragmentsizerange[0]+fragmentsizerange[1])))
                        numcutsig=numcutmu/2.0
                        fragmentlist=getfragments(txstr,numreads,txgenereaddict[gene][tx][2],cutscoredict,fragmentsizerange,txstartbias,numcutmu,numcutsig)
                        if debug_flg==2:
                            fraglistgettime=time.time()
                            totfraglistgettime+=fraglistgettime-txstrgettime
                        allparameterlist=[txstr,cigarstr,fragmentlist,config_object.read_extract_parameters]
                        readlist=runfunc(config_object.read_extract_method,allparameterlist)
                        if debug_flg==2:
                            readlistgettime=time.time()
                            totreadlistgettime+=readlistgettime-fraglistgettime
                        inum=0
                        for read in readlist:
                            rstartbase,readstr,readcigar=read
                            # Sum every M and N length of the transcript cigar up to
                            # and including the read's first base, minus 1, to get
                            # the read's offset from startbase.
                            readstartlocation=sum([int(x) for x in common.cigarsubstr(cigarstr,0,rstartbase+1).replace('M','N').split('N')[:-1]])-1
                            inum+=1
                            alignfout.write(samrecord%
                                            (tx,inum,rstartbase,chrname,startbase+readstartlocation,readcigar,readstr))
                            if adderrors:
                                errreadstr,errcigarstr=adderrortoread(readstr,readcigar,config_object.readqualitydeteriorationrate,config_object.indelrate)
                                alignerrfout.write(samrecord%
                                            (tx,inum,rstartbase,chrname,startbase+readstartlocation,errcigarstr,errreadstr))
                        if debug_flg==2:
                            errorreadgettime=time.time()
                            toterrorreadgettime+=errorreadgettime-readlistgettime
    if debug_flg==2:
        functime=time.time()-funcstarttime
        othertime=functime-(tottxstrgettime+totfraglistgettime+totreadlistgettime+toterrorreadgettime)
        message='\n\nFunc Time=%s,Txstr=%4.2f,Fragtime=%4.2f,Readtime=%4.2f,Errtime=%4.2f,Oth=%4.2f\n\n'% \
                 (functime,tottxstrgettime*100/functime,totfraglistgettime*100/functime,
                  totreadlistgettime*100/functime,toterrorreadgettime*100/functime,othertime*100/functime)
        common.printstatus(message,'S',common.func_name())
    # SAM -> BAM; the error file is converted only when errors were simulated
    conversions=[(alignsam,alignbam)]
    if adderrors:
        conversions.append((alignerrsam,alignerrbam))
    for samfile,bamfile in conversions:
        cmd='%s/samtools view -bt %s %s >%s'%(config_object.pathsamtools,config_object.chrfaifile,samfile,bamfile)
        message='Running %s'%cmd
        common.printstatus(message,'S',common.func_name())
        os.system(cmd)
    return 1
Beispiel #6
0
def gensimulatedreadsdata(sample_id, replicate_id, folderdict, genetxreaddict,
                          genetxcigardict, config_object):
    """
    Simulate reads for one sample/replicate and write them as SAM and BAM.

    Files written:
        <folderdict['data'][2]>/TxxSyy/alignments.sam (+ .bam)
        <folderdict['data'][2]>/TxxSyy/alignments_with_errors.sam
            (always created; filled and converted to .bam only when
            readqualitydeteriorationrate or indelrate is > 0)
        <folderdict['metadata']>/expression_TxxSyy.txt

    For every chromosome that has <reference_genome_dir>/<chr>.fa, each
    transcript is fetched with getRNAtranscriptstring, fragmented with
    getfragments, turned into reads by config_object.read_extract_method
    (via runfunc) and written out; adderrortoread supplies the error copy.
    Chromosomes without a FASTA file are reported and skipped. SAM files
    are converted with "samtools view -bt"; its exit status is not checked.

    Returns 1.
    """
    if debug_flg == 2:
        funcstarttime = time.time()
    extractparams = config_object.read_extract_parameters
    # SE/PE lists are (mode, size, min, max). The PB layout is documented as
    # (PB, min, max), which has no index 3, so shorter lists take the fragment
    # range from indexes 1 and 2 instead of raising IndexError.
    if len(extractparams) >= 4:
        fragmentsizerange = [int(extractparams[2]), int(extractparams[3])]
    else:
        fragmentsizerange = [int(extractparams[1]), int(extractparams[2])]
    txstartbias = config_object.txstartbias
    # tab-separated file: key in column 0, float score in column 1
    cutscoredict = {}
    with open(config_object.cutpreferencefile) as cutfile:
        for ln in cutfile:
            fields = ln.rstrip('\n').split('\t')
            cutscoredict[fields[0]] = float(fields[1])

    alignsam = '%s/T%02dS%02d/alignments.sam' % (folderdict['data'][2],
                                                 sample_id, replicate_id)
    alignbam = '%s/T%02dS%02d/alignments.bam' % (folderdict['data'][2],
                                                 sample_id, replicate_id)
    alignerrsam = '%s/T%02dS%02d/alignments_with_errors.sam' % (
        folderdict['data'][2], sample_id, replicate_id)
    alignerrbam = '%s/T%02dS%02d/alignments_with_errors.bam' % (
        folderdict['data'][2], sample_id, replicate_id)
    samrecord = '%s:%d:%d\t1\t%s\t%d\t0\t%s\t*\t0\t0\t%s\t*\n'
    # the error model is on when either rate is positive
    adderrors = max(config_object.readqualitydeteriorationrate,
                    config_object.indelrate) > 0.0
    with open(alignsam, 'w') as alignfout, \
            open(alignerrsam, 'w') as alignerrfout:
        txgenereaddict = genetxreads2txgenereads(genetxreaddict,
                                                 genetxcigardict)
        metadatafile = '%s/expression_T%02dS%02d.txt' % (
            folderdict['metadata'], sample_id, replicate_id)
        with open(metadatafile, 'w') as fout:
            for gene in txgenereaddict:
                for tx in txgenereaddict[gene]:
                    fout.write('%s:%s:%s\t%s\tTX\t%d\t%6.4f\t%d\t%d\t%s\n' %
                               tuple(txgenereaddict[gene][tx]))
        chrgenedict = genetxdict2chrgenedict(genetxcigardict)
        # sorted() works on Python 2 and 3; dict.keys().sort() is Python 2 only
        chrlist = sorted(chrgenedict)
        if debug_flg == 2:
            tottxstrgettime = 0
            totfraglistgettime = 0
            totreadlistgettime = 0
            toterrorreadgettime = 0
        for chrname in chrlist:
            chrfafile = '%s/%s.fa' % (config_object.reference_genome_dir,
                                      chrname)
            if not os.path.isfile(chrfafile):
                message = '%s: not found' % chrfafile
                common.printstatus(message, 'S', common.func_name())
                continue
            # one FASTA handle per chromosome, closed before the next one
            with open(chrfafile) as chrfafileptr:
                for gene in chrgenedict[chrname]:
                    for tx in genetxcigardict[gene]:
                        if debug_flg == 2:
                            loopstarttime = time.time()
                        numreads = txgenereaddict[gene][tx][6]
                        startbase = genetxcigardict[gene][tx][2]
                        cigarstr = genetxcigardict[gene][tx][3]
                        txstr = getRNAtranscriptstring(chrfafileptr,
                                                       startbase, cigarstr)
                        if debug_flg == 2:
                            txstrgettime = time.time()
                            tottxstrgettime += txstrgettime - loopstarttime
                            message = 'Gene:%s, Transcript:%s; Transcript Size:%d, Num reads=%d' % (
                                gene, tx, len(txstr), numreads)
                            common.printstatus(message, 'S',
                                               common.func_name())
                        # transcripts shorter than the minimum fragment size
                        # yield no reads
                        if len(txstr) < fragmentsizerange[0]:
                            continue
                        #empirical number of cuts
                        numcutmu = max(
                            1,
                            int(
                                len(txstr) * 4.0 /
                                (fragmentsizerange[0] + fragmentsizerange[1])))
                        numcutsig = numcutmu / 2.0
                        fragmentlist = getfragments(
                            txstr, numreads, txgenereaddict[gene][tx][2],
                            cutscoredict, fragmentsizerange, txstartbias,
                            numcutmu, numcutsig)
                        if debug_flg == 2:
                            fraglistgettime = time.time()
                            totfraglistgettime += fraglistgettime - txstrgettime
                        allparameterlist = [
                            txstr, cigarstr, fragmentlist,
                            config_object.read_extract_parameters
                        ]
                        readlist = runfunc(config_object.read_extract_method,
                                           allparameterlist)
                        if debug_flg == 2:
                            readlistgettime = time.time()
                            totreadlistgettime += readlistgettime - fraglistgettime
                        inum = 0
                        for read in readlist:
                            rstartbase, readstr, readcigar = read
                            # Sum every M and N length of the transcript cigar
                            # up to and including the read's first base, minus
                            # 1, to get the read's offset from startbase.
                            readstartlocation = sum([
                                int(x) for x in common.cigarsubstr(
                                    cigarstr, 0, rstartbase +
                                    1).replace('M', 'N').split('N')[:-1]
                            ]) - 1
                            inum += 1
                            alignfout.write(
                                samrecord %
                                (tx, inum, rstartbase, chrname,
                                 startbase + readstartlocation, readcigar,
                                 readstr))
                            if adderrors:
                                errreadstr, errcigarstr = adderrortoread(
                                    readstr, readcigar,
                                    config_object.readqualitydeteriorationrate,
                                    config_object.indelrate)
                                alignerrfout.write(
                                    samrecord %
                                    (tx, inum, rstartbase, chrname,
                                     startbase + readstartlocation,
                                     errcigarstr, errreadstr))
                        if debug_flg == 2:
                            errorreadgettime = time.time()
                            toterrorreadgettime += errorreadgettime - readlistgettime
    if debug_flg == 2:
        functime = time.time() - funcstarttime
        othertime = functime - (tottxstrgettime + totfraglistgettime +
                                totreadlistgettime + toterrorreadgettime)
        message='\n\nFunc Time=%s,Txstr=%4.2f,Fragtime=%4.2f,Readtime=%4.2f,Errtime=%4.2f,Oth=%4.2f\n\n'% \
                 (functime,tottxstrgettime*100/functime,totfraglistgettime*100/functime,
                  totreadlistgettime*100/functime,toterrorreadgettime*100/functime,othertime*100/functime)
        common.printstatus(message, 'S', common.func_name())
    # SAM -> BAM; the error file is converted only when errors were simulated
    conversions = [(alignsam, alignbam)]
    if adderrors:
        conversions.append((alignerrsam, alignerrbam))
    for samfile, bamfile in conversions:
        cmd = '%s/samtools view -bt %s %s >%s' % (config_object.pathsamtools,
                                                  config_object.chrfaifile,
                                                  samfile, bamfile)
        message = 'Running %s' % cmd
        common.printstatus(message, 'S', common.func_name())
        os.system(cmd)
    return 1