def simulate_instance(args):
    """Insert gap-size errors into ``sequence_0`` and map the reads to the result.

    The sequence whose accession is ``'sequence_0'`` is cut into 20000 bp
    chunks.  For each of (at most) the first 100 chunks, the last
    ``gap + max(0, -error)`` bases are dropped and ``max(0, gap + error)``
    ``N`` characters are appended.  One ``to_GFF`` call is made per modified
    chunk with the coordinates of the introduced error.  All other sequences
    are copied unchanged.  The resulting sequences are written to
    ``<output_path>/ctgs.fa`` and passed to ``align.bwa_mem`` together with
    the read files.

    Args:
        args: Namespace-like object providing ``output_path``, ``genome``,
            ``read1``, ``read2``, ``gapsize`` and ``error``.  It is also
            forwarded unchanged to ``align.bwa_mem``.
    """
    print('Started modyfiyng genome')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = args.read1
    read2_path = args.read2
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    # contigs/scaffolds
    gap = args.gapsize
    error = args.error
    chunk_size = 20000
    max_errors = 100

    # These depend only on gap/error, so compute them once.
    n_run = 'N' * max(0, gap + error)
    cut_size = gap + max(0, -error)
    if error < 0:
        note = 'Note=Error:Contraction {0}bp'.format(abs(error))
    else:
        note = 'Note=Error:Expansion {0}bp'.format(abs(error))

    modified_genome = {}
    with open(gff_path, 'w') as gff_file:
        with open(args.genome, 'r') as genome_file:
            genome_seqs = ReadInContigseqs(genome_file, 10)

        for acc, seq in genome_seqs.iteritems():
            if acc != 'sequence_0':
                modified_genome[acc] = seq
                continue

            chunks = [seq[start:start + chunk_size]
                      for start in range(0, len(seq), chunk_size)]
            modified_chunks = []
            pos = 0
            # Slicing (instead of indexing chunks[0..99]) avoids an IndexError
            # when the sequence yields fewer than max_errors chunks.
            for chunk in chunks[:max_errors]:
                modified_chunk = chunk[:len(chunk) - cut_size] + n_run
                modified_chunks.append(modified_chunk)
                pos += len(modified_chunk)

                if (gap + error) > 0:
                    # error is anywhere in the introduced gap
                    # (either contraction or expansion)
                    error_start = pos - len(n_run)
                    error_stop = pos
                else:
                    # error is at the specific position where a contraction
                    # has occurred
                    error_start = pos
                    error_stop = pos + 1
                to_GFF(gff_file, '{0}'.format(acc), 'TRUTH', 'FCD',
                       error_start, error_stop, 1, '+', '.', note)

            mod_seq = ''.join(modified_chunks)
            # NOTE(review): for error < 0 the name contains the signed value
            # (e.g. "minus-500"), and the GFF records above use the original
            # accession rather than this name -- confirm both are intended.
            if error < 0:
                modified_genome['scf_gap{0}_errorsize_minus{1}'.format(gap, error)] = mod_seq
            else:
                modified_genome['scf_gap{0}_errorsize{1}'.format(gap, error)] = mod_seq

    # print and map -- done once, after every sequence has been processed,
    # instead of rewriting the file and re-mapping for each input sequence.
    with open(contig_path, 'w') as ctgs:
        for name, sequence in modified_genome.iteritems():
            ctgs.write('>{0}\n{1}\n'.format(name, sequence))

    print('Started mapping')
    align.bwa_mem(read1_path, read2_path, contig_path, bam_path, args)
def simulate_instance(args):
    """Modify ``sequence_0`` with gap-size errors, then map reads against it.

    Only the sequence with accession ``'sequence_0'`` is altered: it is split
    into 20000 bp chunks and, for up to the first 100 chunks, the trailing
    ``gap + max(0, -error)`` bases are removed and ``max(0, gap + error)``
    ``N`` characters appended.  ``to_GFF`` is called once per altered chunk
    with the error interval.  Every other sequence is kept as is.  The
    sequences are then written to ``<output_path>/ctgs.fa`` and handed to
    ``align.bwa_mem`` along with ``args.read1`` / ``args.read2``.

    Args:
        args: Namespace-like object exposing ``output_path``, ``genome``,
            ``read1``, ``read2``, ``gapsize`` and ``error``; it is passed on
            to ``align.bwa_mem`` as well.
    """
    print('Started modyfiyng genome')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = args.read1
    read2_path = args.read2
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    #contigs/scaffolds
    gap = args.gapsize
    error = args.error
    chunk_size = 20000
    nr_samples = 100

    # Loop invariants: the N-run appended to, and the number of bases cut
    # from, every modified chunk.
    N_s = 'N' * max(0, (gap + error))
    cut_size = gap + max(0, -error)
    if error < 0:
        gff_note = 'Note=Error:Contraction {0}bp'.format(abs(error))
    else:
        gff_note = 'Note=Error:Expansion {0}bp'.format(abs(error))

    modified_genome = {}
    with open(gff_path, 'w') as gff_file:
        with open(args.genome, 'r') as genome_handle:
            genome_seqs = ReadInContigseqs(genome_handle, 10)

        for acc, seq in genome_seqs.iteritems():
            if acc == 'sequence_0':
                pos = 0
                modified_chunks = []
                chunks = [
                    seq[i:i + chunk_size]
                    for i in range(0, len(seq), chunk_size)
                ]
                # Bounded by the number of available chunks so that short
                # sequences no longer raise IndexError.
                for chunk in chunks[:nr_samples]:
                    modified_chunk = chunk[:len(chunk) - cut_size] + N_s
                    modified_chunks.append(modified_chunk)
                    pos += len(modified_chunk)

                    if (gap + error) > 0:
                        error_start = pos - len(N_s)
                        error_stop = pos  # error is anywhere in the introduced gap (either contraction or expansion)
                    else:
                        error_start = pos
                        error_stop = pos + 1  # error is at a specific position where a contraction has occurred
                    to_GFF(gff_file, '{0}'.format(acc), 'TRUTH', 'FCD',
                           error_start, error_stop, 1, '+', '.', gff_note)

                mod_seq = ''.join(modified_chunks)
                # NOTE(review): with error < 0 the key embeds the signed
                # number ("minus-<n>"); the GFF seqid above is the original
                # accession, not this key -- verify against consumers.
                if error < 0:
                    modified_genome['scf_gap{0}_errorsize_minus{1}'.format(
                        gap, error)] = mod_seq
                else:
                    modified_genome['scf_gap{0}_errorsize{1}'.format(
                        gap, error)] = mod_seq
            else:
                modified_genome[acc] = seq

    #print and map
    # Performed a single time after the loop; previously the contig file was
    # rewritten and the mapping re-run for every input sequence.
    with open(contig_path, 'w') as ctgs:
        for ctg_acc, ctg_seq in modified_genome.iteritems():
            ctgs.write('>{0}\n{1}\n'.format(ctg_acc, ctg_seq))

    print('Started mapping')
    align.bwa_mem(read1_path, read2_path, contig_path, bam_path, args)
def simulate_instance(args):
    """Simulate a genome, contigs/scaffolds with gap errors, and reads; then map.

    Steps, all writing below ``args.output_path``:

    1. Build a ``genome.Genome`` of the computed length and write it to
       ``genome.fa``.
    2. Write ``ctgs.fa``.  With ``args.scaffolds`` set, one scaffold per value
       in ``args.errorsize`` is emitted (plus a burn-in scaffold), and
       ``to_GFF`` is called for every gap whose error is non-zero, writing to
       ``true_error_pos.gff``.  Otherwise plain contigs of ``args.contiglen``
       are cut every ``args.contiglen + args.gaplen`` bases.
    3. Simulate paired-end reads with ``reads.DNAseq`` and split the records
       from ``lib.fasta_format()`` alternately into ``reads1.fa`` (even
       index) and ``reads2.fa`` (odd index).
    4. Call ``align.bwa_mem`` on the read files and ``ctgs.fa``.

    Args:
        args: Namespace-like object providing ``output_path``, ``burnin``,
            ``contiglen``, ``gaplen``, ``nrgaps``, ``errorsize``,
            ``scaffolds``, ``distr``, ``read_length``, ``coverage`` and,
            depending on ``distr``, ``mean``/``sd`` or
            ``min_size``/``max_size``.  Also forwarded to ``align.bwa_mem``.

    Raises:
        ValueError: If ``args.distr`` is neither ``'normal'`` nor
            ``'uniform'`` (previously this surfaced as a ``NameError``).
    """
    print('Started simulating')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)
    genome_path = os.path.join(args.output_path, 'genome.fa')
    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = os.path.join(args.output_path, 'reads1.fa')
    read2_path = os.path.join(args.output_path, 'reads2.fa')
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    # genome
    step = args.contiglen + args.gaplen
    genomelen = args.burnin + (step * (args.nrgaps + 1) + args.contiglen) * (len(args.errorsize) + 1)
    print(genomelen)
    g = genome.Genome([0.25] * 4, genomelen, 'genome1')
    g.genome()
    with open(genome_path, 'w') as genome_file:
        genome_file.write('{0}\n'.format(g.genome_fasta_format()))

    # contigs/scaffolds
    # Both files are closed (and therefore flushed) before mapping starts.
    with open(gff_path, 'w') as gff_file:
        with open(contig_path, 'w') as ctg_file:
            if args.scaffolds:
                ctg_file.write('>scf_burnin{0}\n{1}\n'.format(args.gaplen, g.sequence[0:args.burnin]))
                pos = args.burnin

                for error in args.errorsize:
                    gap_len = args.gaplen + error
                    if error < 0:
                        scf_name = 'scf_gap{0}_errorsize_minus{1}'.format(args.gaplen, abs(error))
                        note = 'Note=Error:Contraction {0}bp'.format(abs(error))
                    else:
                        scf_name = 'scf_gap{0}_errorsize{1}'.format(args.gaplen, error)
                        note = 'Note=Error:Expansion {0}bp'.format(abs(error))

                    scaffold = ''
                    for i, x in enumerate(range(pos, pos + (args.nrgaps + 1) * step, step)):
                        if gap_len > 0:
                            if i < args.nrgaps:
                                scaffold += g.sequence[x:x + args.contiglen] + 'N' * gap_len
                                # error is anywhere in the introduced gap
                                # (either contraction or expansion)
                                error_stop = len(scaffold)
                                error_start = error_stop - gap_len
                            else:
                                # last contig of the scaffold: no gap follows
                                scaffold += g.sequence[x:x + args.contiglen]
                        else:
                            scaffold += g.sequence[x:x + args.contiglen + gap_len]
                            # error is at the specific position where a
                            # contraction has occurred
                            error_start = len(scaffold)
                            error_stop = error_start + 1

                        # Only real gaps with a non-zero error are recorded.
                        if error != 0 and i < args.nrgaps:
                            to_GFF(gff_file, scf_name, 'TRUTH', 'FCD', error_start, error_stop, 1, '+', '.', note)

                    ctg_file.write('>{0}\n{1}\n'.format(scf_name, scaffold))
                    # x is the start of the last contig used; skip it plus one
                    # more contig length before the next scaffold begins.
                    pos = x + 2 * args.contiglen
            else:
                ctg_file.write('>ctg0\n{0}\n'.format(g.sequence[0:args.burnin]))
                for i, x in enumerate(range(args.burnin, genomelen, step)):
                    ctg_file.write('>ctg{0}\n{1}\n'.format(i + 1, g.sequence[x:x + args.contiglen]))

    # reads
    if args.distr == 'normal':
        lib = reads.DNAseq(args.read_length, args.coverage, distribution=args.distr, mean=args.mean, stddev=args.sd)
    elif args.distr == 'uniform':
        lib = reads.DNAseq(args.read_length, args.coverage, distribution=args.distr, min_size=args.min_size, max_size=args.max_size)
    else:
        raise ValueError('Unsupported insert size distribution: {0}'.format(args.distr))
    lib.simulate_pe_reads(g)

    # Records alternate between the two mates; close both files before the
    # aligner reads them.
    with open(read1_path, 'w') as reads1:
        with open(read2_path, 'w') as reads2:
            for i, read in enumerate(lib.fasta_format()):
                if i % 2 == 0:
                    reads1.write(read)
                else:
                    reads2.write(read)

    print('Started mapping')
    # mapping
    align.bwa_mem(read1_path, read2_path, contig_path, bam_path, args)
Beispiel #4
0
def simulate_instance(args):
    """Generate a random genome, derive contigs or scaffolds, simulate reads, map.

    Output files are created inside ``args.output_path``:

    * ``genome.fa`` -- the FASTA text returned by ``g.genome_fasta_format()``.
    * ``ctgs.fa`` -- with ``args.scaffolds`` set: a burn-in scaffold followed
      by one scaffold per entry of ``args.errorsize``, each built from
      ``args.nrgaps + 1`` contigs; otherwise plain contigs taken every
      ``args.contiglen + args.gaplen`` bases.
    * ``true_error_pos.gff`` -- receives one ``to_GFF`` record per gap with a
      non-zero error (scaffold mode only; the file is created in either mode).
    * ``reads1.fa`` / ``reads2.fa`` -- records of ``lib.fasta_format()`` with
      even / odd index respectively.

    Finally ``align.bwa_mem`` is called with the read files and ``ctgs.fa``.

    Args:
        args: Namespace-like object with ``output_path``, ``burnin``,
            ``contiglen``, ``gaplen``, ``nrgaps``, ``errorsize``,
            ``scaffolds``, ``distr``, ``read_length``, ``coverage`` and the
            distribution parameters (``mean``/``sd`` for ``'normal'``,
            ``min_size``/``max_size`` for ``'uniform'``).  Passed through to
            ``align.bwa_mem``.

    Raises:
        ValueError: If ``args.distr`` is not ``'normal'`` or ``'uniform'``
            (this used to fail later with a ``NameError``).
    """
    print('Started simulating')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)
    genome_path = os.path.join(args.output_path, 'genome.fa')
    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = os.path.join(args.output_path, 'reads1.fa')
    read2_path = os.path.join(args.output_path, 'reads2.fa')
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    #genome
    contig_step = args.contiglen + args.gaplen
    genomelen = args.burnin + (
        contig_step * (args.nrgaps + 1) + args.contiglen) * (
            len(args.errorsize) + 1)
    print(genomelen)
    g = genome.Genome([0.25] * 4, genomelen, 'genome1')
    g.genome()
    with open(genome_path, 'w') as genome_out:
        genome_out.write('{0}\n'.format(g.genome_fasta_format()))

    #contigs/scaffolds
    # Context managers guarantee both files are flushed and closed before the
    # aligner is started.
    with open(gff_path, 'w') as gff_file:
        with open(contig_path, 'w') as contig_out:
            if args.scaffolds:
                contig_out.write('>scf_burnin{0}\n{1}\n'.format(
                    args.gaplen, g.sequence[0:args.burnin]))
                pos = args.burnin

                for error in args.errorsize:
                    observed_gap = args.gaplen + error
                    if error < 0:
                        scaffold_name = 'scf_gap{0}_errorsize_minus{1}'.format(
                            args.gaplen, abs(error))
                        gff_note = 'Note=Error:Contraction {0}bp'.format(
                            abs(error))
                    else:
                        scaffold_name = 'scf_gap{0}_errorsize{1}'.format(
                            args.gaplen, error)
                        gff_note = 'Note=Error:Expansion {0}bp'.format(
                            abs(error))

                    scaffold = ''
                    contig_starts = range(
                        pos, pos + (args.nrgaps + 1) * contig_step,
                        contig_step)
                    for i, x in enumerate(contig_starts):
                        if observed_gap > 0:
                            if i < args.nrgaps:
                                scaffold += (g.sequence[x:x + args.contiglen]
                                             + 'N' * observed_gap)
                                scaffold_coord = len(scaffold)
                                error_start = scaffold_coord - observed_gap
                                error_stop = scaffold_coord  # error is anywhere in the introduced gap (either contraction or expansion)
                            else:
                                # final contig, not followed by a gap
                                scaffold += g.sequence[x:x + args.contiglen]
                        else:
                            scaffold += g.sequence[x:x + args.contiglen +
                                                   observed_gap]
                            scaffold_coord = len(scaffold)
                            error_start = scaffold_coord
                            error_stop = scaffold_coord + 1  # error is at a specific position where a contraction has occurred

                        # Record the truth only for actual gaps that carry a
                        # non-zero error.
                        if error != 0 and i < args.nrgaps:
                            to_GFF(gff_file, scaffold_name, 'TRUTH', 'FCD',
                                   error_start, error_stop, 1, '+', '.',
                                   gff_note)

                    contig_out.write('>{0}\n{1}\n'.format(
                        scaffold_name, scaffold))
                    # Continue after the last contig used (x is its start),
                    # leaving one extra contig length in between.
                    pos = x + 2 * args.contiglen
            else:
                contig_out.write('>ctg0\n{0}\n'.format(
                    g.sequence[0:args.burnin]))
                for i, x in enumerate(
                        range(args.burnin, genomelen, contig_step)):
                    contig_out.write('>ctg{0}\n{1}\n'.format(
                        i + 1, g.sequence[x:x + args.contiglen]))

    #reads
    if args.distr == 'normal':
        lib = reads.DNAseq(args.read_length,
                           args.coverage,
                           distribution=args.distr,
                           mean=args.mean,
                           stddev=args.sd)
    elif args.distr == 'uniform':
        lib = reads.DNAseq(args.read_length,
                           args.coverage,
                           distribution=args.distr,
                           min_size=args.min_size,
                           max_size=args.max_size)
    else:
        raise ValueError(
            'Unsupported insert size distribution: {0}'.format(args.distr))
    lib.simulate_pe_reads(g)

    # Even-indexed records go to reads1, odd-indexed to reads2; the files are
    # closed before mapping so the aligner sees complete input.
    with open(read1_path, 'w') as reads1:
        with open(read2_path, 'w') as reads2:
            for i, read in enumerate(lib.fasta_format()):
                if i % 2 == 0:
                    reads1.write(read)
                else:
                    reads2.write(read)

    print('Started mapping')
    #mapping
    align.bwa_mem(read1_path, read2_path, contig_path, bam_path, args)
Beispiel #5
0
def simulate_instance(args):
    """Simulate paired-end reads from a genome and map them back to it.

    Without ``args.contigs`` a random genome of ``args.genomelen`` bases is
    generated and written to ``<output_path>/genome.fa``, and shuffled contigs
    from ``contigs.generate_contigs`` are written to ``<output_path>/ctgs.fa``.
    With ``args.contigs`` set, the longest record of the FASTA file
    ``args.genome`` is used as the genome sequence instead.

    Reads are simulated with ``reads.DNAseq`` according to ``args.distr``
    (``'normal'``, ``'uniform'`` or ``'mix'``, the latter combining a normal
    and a uniform library at half coverage each).  Records from
    ``lib.fasta_format()`` are split alternately into ``reads1.fa`` (even
    index) and ``reads2.fa`` (odd index), and ``align.bwa_mem`` is then called
    with the read files and the genome path.

    Args:
        args: Namespace-like object providing ``output_path``, ``contigs``,
            ``genome``, ``genomelen``, ``min_contig``, ``max_contig``,
            ``distr``, ``read_length``, ``coverage`` and the distribution
            parameters (``mean``, ``sd``, ``min_size``, ``max_size``).  Also
            forwarded to ``align.bwa_mem``.

    Raises:
        ValueError: If ``args.distr`` is not one of the supported values
            (previously a ``NameError`` on the unbound ``lib``).
    """
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    if not args.contigs:
        genome_path = os.path.join(args.output_path, 'genome.fa')
        contig_path = os.path.join(args.output_path, 'ctgs.fa')
    else:
        genome_path = args.genome
        contig_path = args.contigs  # only written to in the branch above

    read1_path = os.path.join(args.output_path, 'reads1.fa')
    read2_path = os.path.join(args.output_path, 'reads2.fa')
    bam_path = os.path.join(args.output_path, 'mapped')

    g = genome.Genome([0.25] * 4, args.genomelen, 'genome1')
    if not args.contigs:
        # genome
        g.genome()
        with open(genome_path, 'w') as genome_out:
            genome_out.write('{0}\n'.format(g.genome_fasta_format()))

        # contigs
        ctg_list = list(contigs.generate_contigs(g.sequence, args.min_contig, args.max_contig, 0, 3000))
        random.shuffle(ctg_list)
        with open(contig_path, 'w') as ctgs:
            for ctg in ctg_list:
                ctgs.write(ctg)
    else:
        # Use the longest sequence of the supplied FASTA file as the genome.
        longest_seq = 0
        with open(genome_path, 'r') as genome_in:
            for acc, seq in fasta.fasta_iter(genome_in):
                print('{0} {1}'.format(acc, len(seq)))
                if len(seq) > longest_seq:
                    g.sequence = seq
                    g.accession = acc
                    longest_seq = len(seq)
        print('chosen: {0}'.format(g.accession))

    # reads
    if args.distr == 'normal':
        lib = reads.DNAseq(args.read_length, args.coverage, distribution=args.distr, mean=args.mean, stddev=args.sd)
        lib.simulate_pe_reads(g)
    elif args.distr == 'uniform':
        lib = reads.DNAseq(args.read_length, args.coverage, distribution=args.distr, min_size=args.min_size, max_size=args.max_size)
        lib.simulate_pe_reads(g)
    elif args.distr == 'mix':
        lib_part1 = reads.DNAseq(args.read_length, args.coverage/2, distribution='normal', mean=args.mean, stddev=args.sd)
        lib_part1.simulate_pe_reads(g)
        lib_part2 = reads.DNAseq(args.read_length, args.coverage/2, distribution='uniform', min_size=(args.mean - 4*args.sd), max_size=(args.mean + 4*args.sd))
        lib_part2.simulate_pe_reads(g)
        # concatenate the reads from each distribution
        lib = reads.DNAseq(args.read_length, args.coverage, distribution=args.distr, mean=args.mean, stddev=args.sd)
        lib.reads = lib_part1.reads + lib_part2.reads
    else:
        raise ValueError('Unsupported insert size distribution: {0}'.format(args.distr))

    # Both read files are closed (flushed) before the aligner reads them.
    with open(read1_path, 'w') as reads1:
        with open(read2_path, 'w') as reads2:
            for i, read in enumerate(lib.fasta_format()):
                if i % 2 == 0:
                    reads1.write(read)
                else:
                    reads2.write(read)

    # mapping
    # NOTE(review): reads are mapped against genome_path, not contig_path --
    # confirm this is intended.
    align.bwa_mem(read1_path, read2_path, genome_path, bam_path, args)