Esempio n. 1
0
def test_run(capsys=None):
    """Run ``wgsim.core`` on the test reference and check that reads were written.

    ``capsys`` is accepted for compatibility with the existing signature but
    is not used by the test body.
    """

    r1 = os.path.join(TEST_DIR, "read1.fq")
    r2 = os.path.join(TEST_DIR, "read2.fq")

    ref = os.path.join(TEST_DIR, "lambda.fa")

    wgsim.core(r1=r1, r2=r2, ref=ref, N=1000, indel_frac=0.5, seed=1)

    # The previous ``assert 1`` could never fail; verify instead that both
    # FASTQ outputs requested from wgsim.core exist and contain data.
    for path in (r1, r2):
        assert os.path.isfile(path), "wgsim.core did not create %s" % path
        assert os.path.getsize(path) > 0, "%s is empty" % path
Esempio n. 2
0
def cmd(genome,
        read1="read1.fq",
        read2="read2.fq",
        err=0.02,
        dist=500,
        stdev=50,
        num=1000,
        L1=70,
        L2=70,
        mut=0.001,
        frac=0.15,
        ext=0.25,
        seed=0,
        amb=0.05,
        fixed=False,
        version=False):
    """
    Simulate paired-end short reads from `genome` by delegating to wgsim.

    `err`, `mut` and `frac` are checked to lie in [0, 1]; a value outside
    that range is reported through `error`. Every other argument is
    forwarded to `wgsim.core` under the keyword it expects. `version` is
    accepted but not used here.
    """

    # Normalise the flag-like arguments to the integers passed to wgsim.core.
    seed = int(seed) if seed else 0
    fixed = int(fixed)

    # These three parameters are fractions; each one is validated in turn.
    fractions = (("err", err), ("mut", mut), ("frac", frac))
    for label, value in fractions:
        if not (0 <= value <= 1):
            error("%s=%s, it must be in the range [0, 1]" % (label, value))

    # Map this function's argument names onto the wgsim.core keywords.
    options = {
        "ref": genome,
        "r1": read1,
        "r2": read2,
        "err_rate": err,
        "dist": dist,
        "stdev": stdev,
        "N": num,
        "mut_rate": mut,
        "indel_frac": frac,
        "indel_ext": ext,
        "size_l": L1,
        "size_r": L2,
        "max_n": amb,
        "seed": seed,
        "is_fixed": fixed,
    }
    wgsim.core(**options)
Esempio n. 3
0
def _strandsim_log(level, text):
    '''
    Print text prefixed with the current timestamp and a [level] tag
    '''

    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
    print('[' + now + '][' + level + '] ' + text)


def _strandsim_rename_pairs(src1, src2, dst1, dst2, prefix):
    '''
    Copy paired FASTQ records from src1/src2 to dst1/dst2, prepending prefix to each read name
    '''

    with open(dst1, 'w') as out1, open(dst2, 'w') as out2:

        for (name1, seq1, qual1), (name2, seq2,
                                   qual2) in zip(mp.fastx_read(src1),
                                                 mp.fastx_read(src2)):

            read1 = ['@' + prefix + name1, seq1, '+', qual1]
            read2 = ['@' + prefix + name2, seq2, '+', qual2]

            out1.write('\n'.join(read1) + '\n')
            out2.write('\n'.join(read2) + '\n')


def _strandsim_map(c, reference, fq1, fq2, bam, read_group=None):
    '''
    Align fq1/fq2 to reference with minimap2 and pipe the output into samtools sort (-o bam)
    '''

    sam_cmd = [
        'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
        '-t',
        str(c.threads)
    ]

    if read_group is not None:

        sam_cmd += ['-R', read_group]

    sam_cmd += [reference, fq1, fq2]

    bam_cmd = [
        'samtools', 'sort', '-@',
        str(round(c.threads / 2)), '-o', bam
    ]

    #subprocess.DEVNULL replaces the os.devnull handles that were opened and never closed
    aligner = subprocess.Popen(sam_cmd,
                               stderr=subprocess.DEVNULL,
                               stdout=subprocess.PIPE)

    try:

        #stdout still points at the BAM path, as before; samtools is given the same path through -o
        with open(bam, 'wb') as bout:

            subprocess.run(bam_cmd,
                           stdin=aligner.stdout,
                           stderr=subprocess.DEVNULL,
                           stdout=bout)

    finally:

        #close our end of the pipe and reap minimap2 so the child is not left behind
        aligner.stdout.close()
        aligner.wait()


def _strandsim_extract(extractor, bam, ivf):
    '''
    Open bam with pysam (no index required) and return list(extractor(handle, ivf))
    '''

    save = pysam.set_verbosity(0)
    bamstrand = pysam.AlignmentFile(bam, 'rb', require_index=False)
    pysam.set_verbosity(save)

    try:

        return list(extractor(bamstrand, ivf))

    finally:

        bamstrand.close()


def _strandsim_write_strand(path1, path2, pairs, strand, length):
    '''
    Write read pairs to the FASTQ pair of one strand ('W' or 'C')

    Pairs whose OS tag equals strand are written in that strand's layout, the
    others (noise) in the opposite one. Watson layout is R1 as stored and R2
    reverse-complemented; Crick layout is the mirror image.
    '''

    qual = '2' * length

    with open(path1, 'w') as out1, open(path2, 'w') as out2:

        for r1, r2 in pairs:

            native = r1.get_tag('OS') == strand

            if native == (strand == 'W'):  #Watson layout

                seq1 = r1.query_sequence
                seq2 = mp.revcomp(r2.query_sequence)

            else:  #Crick layout

                seq1 = mp.revcomp(r1.query_sequence)
                seq2 = r2.query_sequence

            read1 = ['@' + r1.query_name, seq1, '+', qual]
            read2 = ['@' + r2.query_name, seq2, '+', qual]

            out1.write('\n'.join(read1) + '\n')
            out2.write('\n'.join(read2) + '\n')


def StrandSim(w, c):
    '''
    Perform first part of strand-seq simulations and re-align to the original haplotype

    w is the region to simulate (w.chrom, w.start, w.end are read). c is the
    run configuration; the attributes read here are ffile, REF, haplodir,
    hapnumber, singlecellnum, r_number, regioncoverage, length, error,
    mutation, indels, extindels, distance, stdev, threads, noise,
    sce_bedregion, cellid and hapid.

    Steps: sample paired reads from the haplotype region with wgsim, map
    them back to the haplotype, split the alignments with WR/CR into Watson
    and Crick pairs, optionally mix in noise, then map each strand to c.REF
    writing <r_number>.W.srt.bam and <r_number>.C.srt.bam in c.haplodir.
    Intermediate FASTA/FASTQ/BAM files are removed. Returns None; a
    chromosome missing from c.ffile only prints a warning.
    '''

    hfa = pyfaidx.Fasta(c.ffile)

    if w.chrom not in hfa.keys():

        _strandsim_log(
            'Warning', 'Chromosome ' + w.chrom + ' not found in ' + c.ffile +
            '. Skipped simulation')
        return

    _strandsim_log(
        'Message', 'Preparing simulation from ' + c.ffile + '. Haplotype ' +
        str(c.hapnumber))

    chr_ = hfa[w.chrom]
    seq_ = chr_[w.start - 1:w.end].seq
    tmpfa = os.path.abspath(c.haplodir + '/' + 'htmp.fa')
    region = w.chrom + '_' + str(w.start) + '_' + str(w.end)

    with open(tmpfa, 'w') as tmpfout:  #write temporary fa for sampling reads

        tmpfout.write('>' + region + '\n' +
                      '\n'.join(re.findall('.{1,60}', seq_)) + '\n')

    Ns = seq_.count('N')  #normalize coverage on Ns
    Nreads = round(((c.regioncoverage * (len(seq_) - Ns)) / c.length) /
                   2)  #for paired-end sequencing

    mate1h = os.path.abspath(c.haplodir + '/hr1.tmp.fq')
    mate2h = os.path.abspath(c.haplodir + '/hr2.tmp.fq')

    hapcov = Nreads * c.length * 2 / ((w.end - w.start) - Ns)
    _strandsim_log(
        'Message',
        'Simulated coverage for this region will be ' + str(hapcov))
    _strandsim_log('Message', 'Simulating')

    wgsim.core(r1=mate1h,
               r2=mate2h,
               ref=tmpfa,
               err_rate=c.error,
               mut_rate=c.mutation,
               indel_frac=c.indels,
               indel_ext=c.extindels,
               N=Nreads,
               dist=c.distance,
               stdev=c.stdev,
               size_l=c.length,
               size_r=c.length,
               max_n=0.05,
               is_hap=0,
               is_fixed=0,
               seed=0)

    os.remove(tmpfa)

    mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
    mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')

    #tag read names with single cell and haplotype number
    _strandsim_rename_pairs(
        mate1h, mate2h, mate1hnew, mate2hnew,
        'c' + str(c.singlecellnum) + 'h' + str(c.hapnumber) + 'fh_')

    os.remove(mate1h)
    os.remove(mate2h)

    _strandsim_log(
        'Message',
        'Mapping simulated reads to the corresponding haplotype')

    BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.srt.bam')
    _strandsim_map(c, c.ffile, mate1hnew, mate2hnew, BAM)

    os.remove(mate1hnew)
    os.remove(mate2hnew)

    #now re-parse BAM file to keep only Watson/Crick reads
    #Watson reads: read1 forward, read2 reverse
    #Crick reads: read2 forward, read1 reverse

    ivf = None

    if len(c.sce_bedregion) != 0:

        #keep only the SCE regions assigned to the current cell/haplotype
        sce_lines = [
            s.chrom + '\t' + str(s.start) + '\t' + str(s.end)
            for s in c.sce_bedregion
            if s[3] == c.cellid and s[4] == c.hapid
        ]

        if sce_lines:

            sce_fromscratch = pybedtools.BedTool('\n'.join(sce_lines),
                                                 from_string=True)
            ivf = sce_fromscratch.as_intervalfile(
            )  #intervals where to perform SCE events

            _strandsim_log(
                'Message',
                'Detected one ore more SCE event for current cell/haplotype')

    _strandsim_log(
        'Message',
        'Extracting Watson (R1F,R2R) and Crick (R1R,R2F) reads')

    #each pass consumes the BAM handle, so the file is opened once per strand
    Wreads = _strandsim_extract(WR, BAM, ivf)
    Creads = _strandsim_extract(CR, BAM, ivf)

    os.remove(BAM)

    if c.noise > 0:

        _strandsim_log('Message', 'Adding noise to strands')

        #draw both samples before extending either list: sampling W->C from a
        #Watson list that already holds the C->W pairs could copy Crick pairs
        #back into Creads, where they already are, duplicating them.
        #Counts are capped at the population size, otherwise random.sample
        #raises ValueError
        nCtoW = min(len(Creads), round(len(Wreads) / 100 * c.noise))
        nWtoC = min(len(Wreads), round(len(Creads) / 100 * c.noise))

        CtoW = random.sample(Creads, nCtoW)
        WtoC = random.sample(Wreads, nWtoC)

        Wreads += CtoW
        Creads += WtoC

    _strandsim_log('Message', 'Writing Watson and Crick FASTQ')

    w1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w1.fq')
    w2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w2.fq')

    c1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c1.fq')
    c2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c2.fq')

    _strandsim_write_strand(w1, w2, Wreads, 'W', c.length)
    _strandsim_write_strand(c1, c2, Creads, 'C', c.length)

    _strandsim_log(
        'Message',
        'Mapping Watson and Crick reads to the original reference')

    BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.W.srt.bam')
    _strandsim_map(c,
                   c.REF,
                   w1,
                   w2,
                   BAM,
                   read_group='@RG\\tID:illumina\\tSM:strand')

    os.remove(w1)
    os.remove(w2)

    BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.C.srt.bam')
    _strandsim_map(c,
                   c.REF,
                   c1,
                   c2,
                   BAM,
                   read_group='@RG\\tID:illumina\\tSM:strand')

    os.remove(c1)
    os.remove(c2)
Esempio n. 4
0
def _bulksim_log(level, text):
    '''
    Print text prefixed with the current timestamp and a [level] tag
    '''

    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
    print('[' + now + '][' + level + '] ' + text)


def _bulksim_simulate(c, ref, r1, r2, n):
    '''
    Sample n read pairs from ref into r1/r2 with wgsim, using the settings stored in c
    '''

    wgsim.core(r1=r1,
               r2=r2,
               ref=ref,
               err_rate=c.error,
               mut_rate=c.mutation,
               indel_frac=c.indels,
               indel_ext=c.extindels,
               N=n,
               dist=c.distance,
               stdev=c.stdev,
               size_l=c.length,
               size_r=c.length,
               max_n=0.05,
               is_hap=0,
               is_fixed=0,
               seed=0)


def _bulksim_rename_pairs(src1, src2, dst1, dst2, prefix, mode):
    '''
    Copy paired FASTQ records from src1/src2 to dst1/dst2, prepending prefix to each read name

    mode is the open() mode of the destinations: 'w' to create, 'a' to append.
    '''

    with open(dst1, mode) as out1, open(dst2, mode) as out2:

        for (name1, seq1, qual1), (name2, seq2,
                                   qual2) in zip(mp.fastx_read(src1),
                                                 mp.fastx_read(src2)):

            read1 = ['@' + prefix + name1, seq1, '+', qual1]
            read2 = ['@' + prefix + name2, seq2, '+', qual2]

            out1.write('\n'.join(read1) + '\n')
            out2.write('\n'.join(read2) + '\n')


def _bulksim_map(sam_cmd, bam_cmd, bam):
    '''
    Run sam_cmd (minimap2) piped into bam_cmd (samtools sort, which writes bam through -o)
    '''

    #subprocess.DEVNULL replaces the os.devnull handles that were opened and never closed
    aligner = subprocess.Popen(sam_cmd,
                               stderr=subprocess.DEVNULL,
                               stdout=subprocess.PIPE)

    try:

        #stdout still points at the BAM path, as before; samtools is given the same path through -o
        with open(bam, 'wb') as bout:

            subprocess.run(bam_cmd,
                           stdin=aligner.stdout,
                           stderr=subprocess.DEVNULL,
                           stdout=bout)

    finally:

        #close our end of the pipe and reap minimap2 so the child is not left behind
        aligner.stdout.close()
        aligner.wait()


def BulkSim(w, c):
    '''
    Perform bulk simulations and re-align to the un-modified reference

    w is the region to simulate: w.chrom, w.start and w.end are read, and
    w[4] is used as the percentage of reads to draw from the haplotype.
    When it is below 100 the remaining reads are sampled from the same
    region of c.refall. c is the run configuration; the attributes read
    here are ffile, REF, refall, haplodir, clonenumber, hapnumber, r_number,
    regioncoverage, length, error, mutation, indels, extindels, distance,
    stdev and threads.

    The simulated pairs are mapped to c.REF and written, sorted, to
    <r_number>.srt.bam in c.haplodir; intermediate FASTA/FASTQ files are
    removed. Returns None; a chromosome missing from c.ffile only prints a
    warning.
    '''

    hfa = pyfaidx.Fasta(c.ffile)

    if w.chrom not in hfa.keys():

        _bulksim_log(
            'Warning', 'Chromosome ' + w.chrom + ' not found in ' + c.ffile +
            '. Skipped simulation')
        return

    _bulksim_log(
        'Message', 'Preparing simulation from ' + c.ffile + '. Clone ' +
        str(c.clonenumber) + '. Haplotype ' + str(c.hapnumber))

    chr_ = hfa[w.chrom]
    seq_ = chr_[w.start - 1:w.end].seq
    tmpfa = os.path.abspath(c.haplodir + '/' + 'htmp.fa')
    region = w.chrom + '_' + str(w.start) + '_' + str(w.end)

    with open(tmpfa, 'w') as tmpfout:  #write temporary fa for sampling reads

        tmpfout.write('>' + region + '\n' +
                      '\n'.join(re.findall('.{1,60}', seq_)) + '\n')

    Ns = seq_.count('N')  #normalize coverage on Ns
    Nreads = round(((c.regioncoverage * (len(seq_) - Ns)) / c.length) /
                   2)  #for paired-end sequencing

    mate1h = os.path.abspath(c.haplodir + '/hr1.tmp.fq')
    mate2h = os.path.abspath(c.haplodir + '/hr2.tmp.fq')

    mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
    mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')

    #read names are tagged with clone and haplotype number
    nametag = 'c' + str(c.clonenumber) + 'h' + str(c.hapnumber)

    if float(w[4]) < 100.0:

        tmpref = os.path.abspath(c.haplodir + '/' + 'rtmp.fa')
        seq__ = c.refall[w.chrom][w.start - 1:w.end].seq

        with open(tmpref,
                  'w') as tmpfout:  #write temporary fa for sampling reads

            tmpfout.write('>' + region + '\n' +
                          '\n'.join(re.findall('.{1,60}', seq__)) + '\n')

        #simulate part from reference and part from haplotype

        haploreadsN = round(Nreads / 100 * float(w[4]))

        hapcov = haploreadsN * c.length * 2 / ((w.end - w.start) - Ns)
        _bulksim_log(
            'Message',
            'Simulated coverage for this region will be ' + str(hapcov))

        refreadsN = Nreads - haploreadsN
        refcov = refreadsN * c.length * 2 / ((w.end - w.start) - Ns)
        _bulksim_log(
            'Message',
            'Simulated coverage for the corresponding reference region will be '
            + str(refcov))

        mate1r = os.path.abspath(c.haplodir + '/rr1.tmp.fq')
        mate2r = os.path.abspath(c.haplodir + '/rr2.tmp.fq')

        _bulksim_log('Message', 'Simulating')

        _bulksim_simulate(c, tmpfa, mate1h, mate2h, haploreadsN)
        _bulksim_simulate(c, tmpref, mate1r, mate2r, refreadsN)

        os.remove(tmpfa)
        os.remove(tmpref)

        #haplotype reads first ('fh_'), reference reads appended after ('fr_')
        _bulksim_rename_pairs(mate1h, mate2h, mate1hnew, mate2hnew,
                              nametag + 'fh_', 'w')

        os.remove(mate1h)
        os.remove(mate2h)

        _bulksim_rename_pairs(mate1r, mate2r, mate1hnew, mate2hnew,
                              nametag + 'fr_', 'a')

        os.remove(mate1r)
        os.remove(mate2r)

    else:

        hapcov = Nreads * c.length * 2 / ((w.end - w.start) - Ns)
        _bulksim_log(
            'Message',
            'Simulated coverage for this region will be ' + str(hapcov))

        _bulksim_log('Message', 'Simulating')

        _bulksim_simulate(c, tmpfa, mate1h, mate2h, Nreads)

        os.remove(tmpfa)

        _bulksim_rename_pairs(mate1h, mate2h, mate1hnew, mate2hnew,
                              nametag + 'fh_', 'w')

        os.remove(mate1h)
        os.remove(mate2h)

    _bulksim_log('Message', 'Mapping simulated reads to the reference genome')

    BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.srt.bam')

    sam_cmd = [
        'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
        '-t',
        str(c.threads), '-R', '@RG\\tID:illumina\\tSM:bulk', c.REF,
        mate1hnew, mate2hnew
    ]
    bam_cmd = [
        'samtools', 'sort', '-@',
        str(round(c.threads / 2)), '-o', BAM
    ]

    _bulksim_map(sam_cmd, bam_cmd, BAM)

    os.remove(mate1hnew)
    os.remove(mate2hnew)
Esempio n. 5
0
def MolSim(processor, molecule, hfa, w, c):
    '''
    Parallelize 10X linked reads simulation

    For every entry of molecule, the matching stretch of hfa[w.chrom] is
    written to a temporary FASTA and read pairs are sampled from it with
    wgsim. R1 reads get the molecule barcode plus a random 6-mer prepended
    and are appended to SIM_S1_L<hapnumber>_R1_001.fastq in c.OUT; R2 reads
    are appended unchanged to the matching _R2_ file. processor is a string
    used to name this worker's temporary files. Molecules that yield no
    read pair are skipped. Returns None.
    '''

    #the output FASTQ paths do not depend on the molecule: build them once
    lane = str(c.hapnumber).zfill(3)
    R1A = os.path.abspath(c.OUT + '/SIM_S1_L' + lane + '_R1_001.fastq')
    R2A = os.path.abspath(c.OUT + '/SIM_S1_L' + lane + '_R2_001.fastq')

    for mol in molecule:

        moleculenumber = str(mol.seqidx + 1)
        seq__ = hfa[w.chrom][w.start + mol.start - 1:w.start + mol.end].seq

        truedim = mol.length - seq__.count('N')

        #a read-pair count has to be a whole number. The true division used
        #to leave a float here, so values below one pair still passed the
        #"N != 0" check and went through a full (empty) simulation
        N = int(int(truedim * c.molcov) / (c.length * 2))

        if N == 0:

            continue

        moleculedroplet = str(mol.index_droplet + 1)
        barcodestring = str(mol.barcode)
        chromstart = str(w.start + mol.start)
        chromend = str(w.start + mol.end)

        header = 'MOL:' + moleculenumber + '_GEM:' + moleculedroplet + '_BAR:' + barcodestring + '_CHROM:' + w.chrom + '_START:' + chromstart + '_END:' + chromend

        molfa = os.path.abspath(c.OUT + '/' + processor + '_' +
                                moleculenumber + '.fa')

        with open(molfa, 'w') as faout:

            faout.write('>' + header + '\n' +
                        '\n'.join(re.findall('.{1,60}', seq__)) + '\n')

        R1tmp = os.path.abspath(c.OUT + '/' + processor + '.R1.tmp.fq')
        R2 = os.path.abspath(c.OUT + '/' + processor + '.R2.fq')

        #R1 is simulated 22 bases shorter: the barcode and 6-mer are prepended below
        wgsim.core(r1=R1tmp,
                   r2=R2,
                   ref=molfa,
                   err_rate=c.error,
                   mut_rate=c.mutation,
                   indel_frac=c.indels,
                   indel_ext=c.extindels,
                   N=N,
                   dist=c.distance,
                   stdev=c.stdev,
                   size_l=c.length - 22,
                   size_r=c.length,
                   max_n=0.05,
                   is_hap=0,
                   is_fixed=0,
                   seed=0)

        os.remove(molfa)
        RANDOM6MER = ''.join(
            np.random.choice(['A', 'T', 'G', 'C', 'N'], 6, replace=True))

        if os.stat(R1tmp).st_size == 0:

            #nothing was simulated for this molecule: just clean up
            os.remove(R1tmp)
            os.remove(R2)
            continue

        prefix = barcodestring + RANDOM6MER

        with open(R1tmp, 'r') as infile, open(R1A, 'a') as outfile:

            for name, seq, qual in readfq(infile):

                #pad the quality string by exactly the number of bases
                #prepended (22 when the barcode is 16 bases long, which the
                #size_l above presumes), so sequence and quality lengths match
                read = [
                    '@' + name, prefix + seq, '+',
                    str(qual[0]) * len(prefix) + qual
                ]

                outfile.write('\n'.join(read) + '\n')

        os.remove(R1tmp)

        with open(R2, 'r') as infile, open(R2A, 'a') as outfile:

            for name, seq, qual in readfq(infile):

                read = ['@' + name, seq, '+', qual]

                outfile.write('\n'.join(read) + '\n')

        os.remove(R2)