Exemple #1
0
    def __init__(self, transcriptome, seed=None, rand=None):
        """Set up an emitter that draws from *transcriptome*.

        Args:
            transcriptome: object exposing ``get_transcripts()``; the number
                of transcripts it returns fixes how many weights are built.
            seed: optional seed passed to ``RandomSource``.  Only consulted
                when *rand* is not supplied.  A seed of ``0`` is honoured.
            rand: optional ready-made random source.  Takes precedence over
                *seed* when given.
        """
        # Compare against None instead of relying on truthiness so that a
        # legitimate seed of 0 (or a falsy random source) is not discarded.
        if rand is not None:
            self.random = rand
        elif seed is not None:
            self.random = RandomSource(seed)
        else:
            self.random = RandomSource()

        self._transcriptome = transcriptome
        tcnt = len(self._transcriptome.get_transcripts())
        # Evenly spaced values 1/tcnt, 2/tcnt, ..., 1.0 -- one per transcript.
        # NOTE(review): presumably used as cumulative thresholds for a
        # uniform draw; confirm against the code that consumes _weights.
        self._weights = [float(i + 1) / float(tcnt) for i in range(tcnt)]
        ## _log stores what we are emitting ##
        self._log = []
Exemple #2
0
def main(args):
    """Emit simulated reads from a serialized transcriptome emitter.

    The emitter description is read from ``args.emitter`` (``'-'`` means
    stdin) as base64 text wrapping zlib-compressed pickle data.  Transcripts
    are drawn repeatedly; each sequence is optionally trimmed, flipped and
    fragmented, given a quality string, permuted by the selected error
    model, and written as FASTQ to ``args.output``.  The loop stops once
    ``args.count`` reads have been written or ``100 * args.count`` draws
    have been attempted, whichever comes first.

    Args:
        args: parsed command-line namespace.  The attributes read here
            include ``output``, ``sr``, ``lr``, ``emitter``, ``seed``,
            ``count``, the trimming, quality and error-model options, and
            ``tempdir`` / ``specific_tempdir`` for the final cleanup.
            ``args.minimum_read_length`` is lowered in place to
            ``args.sr_length`` when it exceeds it.

    Raises:
        SystemExit: with status 1 when the output/read-type combination is
            invalid, the requested weight type is not implemented, the
            fixed quality is not a single character, or a profile-based
            quality is requested without an error profile.
    """
    # check outputs
    if len(args.output) > 1 and not args.sr:
        sys.stderr.write(
            "Error: Long reads don't support multiple output files\n")
        sys.exit(1)
    elif len(args.output) > 2:
        sys.stderr.write(
            "Error: Short reads support at most two output files (paired end)\n"
        )
        sys.exit(1)
    if args.sr_length < args.minimum_read_length:
        args.minimum_read_length = args.sr_length
    inf = sys.stdin
    if args.emitter != '-':
        inf = open(args.emitter)
    sys.stderr.write("reading in transcriptome emitter\n")
    try:
        serialized = inf.read()
    finally:
        # Release the emitter file as soon as it is read; stdin is left open.
        if inf is not sys.stdin:
            inf.close()
    # NOTE(security): pickle.loads can execute arbitrary code while
    # deserializing -- only feed this emitter files from a trusted source.
    indata = pickle.loads(
        zlib.decompress(base64.b64decode(serialized.rstrip())))
    txome = Transcriptome()
    txome.load_serialized(indata['txome'])
    rnum = RandomSource()
    rnum_tx = RandomSource()  # for drawing transcripts
    if args.seed:
        rnum = RandomSource(args.seed)
        rnum_tx = RandomSource(args.seed)
    # Load in error profile data
    ep = None
    if args.error_profile:
        sys.stderr.write("read in error profile\n")
        ep = ErrorProfilePermuter(args.error_profile, rnum,
                                  args.skew_profile_error_rate)
    txemitter = TranscriptomeEmitter(txome, rand=rnum_tx)
    if indata['weight_type'] == 'expression_table':
        sys.stderr.write(
            "Using expression table defined transcript expression\n")
        txweight = indata['weights']
        txemitter.set_weights_by_dict(txweight)
    elif indata['weight_type'] == 'exponential_distribution':
        sys.stderr.write(
            "ERROR not yet implemented exponential distribution\n")
        sys.exit(1)
    elif indata['weight_type'] == 'uniform_distribution':
        sys.stderr.write(
            "Using uniform distribution of transcript expression\n")
    cutter = MakeCuts(rand=rnum_tx)
    if args.sr:
        cutter.set_custom(args.sr_gauss_min, args.sr_gauss_mu,
                          args.sr_gauss_sigma)
    elif args.lr:
        cutter.set_custom(args.lr_gauss_min, args.lr_gauss_mu,
                          args.lr_gauss_sigma)
    # Prepare outputs
    # NOTE(review): gzip.open(..., 'w') yields a binary stream on Python 3;
    # confirm the string writes below work on the interpreter in use.
    of1 = sys.stdout
    if args.output[0][-3:] == '.gz':
        of1 = gzip.open(args.output[0], 'w')
    elif args.output[0] != '-':
        of1 = open(args.output[0], 'w')
    of2 = None
    if len(args.output) > 1:
        # The second (right mate) output is decided by its own name.
        if args.output[1][-3:] == '.gz':
            of2 = gzip.open(args.output[1], 'w')
        elif args.output[1] != '-':
            of2 = open(args.output[1], 'w')
    of_origin = None
    if args.output_original_source:
        if args.output_original_source[-3:] == '.gz':
            of_origin = gzip.open(args.output_original_source, 'w')
        else:
            of_origin = open(args.output_original_source, 'w')
    of_sc = None
    if args.output_sequence_change:
        if args.output_sequence_change[-3:] == '.gz':
            of_sc = gzip.open(args.output_sequence_change, 'w')
        else:
            of_sc = open(args.output_sequence_change, 'w')

    # Hard cap on draws so that a stream of too-short fragments cannot
    # keep the loop running forever.
    absmax = args.count * 100
    finished_count = 0
    z = 0
    while finished_count < args.count:
        z += 1
        if z > absmax:
            break
        tx = txemitter.emit_transcript()
        seq = tx.get_sequence()
        stage1seq = seq
        if args.trim_5prime or args.trim_3prime:
            fivestart = 0
            threeend = len(seq)
            # Each trim option is a (low, high) pair of fractions of the
            # sequence length; the cut point is drawn between them.
            if args.trim_5prime:
                lcut = int(args.trim_5prime[0] * len(seq))
                rcut = int(args.trim_5prime[1] * len(seq))
                fivestart = rnum_tx.randint(lcut, rcut)
            if args.trim_3prime:
                lcut = int(args.trim_3prime[0] * len(seq))
                rcut = int(args.trim_3prime[1] * len(seq))
                threeend = rnum_tx.randint(lcut, rcut)
            # set sequence to its new trimmed bounds
            seq = seq[fivestart:threeend]

        # flip sequence if necessary
        if not args.no_flip:
            seq = random_flip(seq, rnum_tx)

        l_read = create_name(rnum)
        r_read = None
        if args.sr or args.lr:
            cutseq = cutter.get_cut(seq)
        else:
            cutseq = seq  # case for no_fragmentation
        ############# if we pass this we will really start with this one
        if len(cutseq) < args.minimum_read_length:
            continue
        # can now log our read name
        if of_origin:
            of_origin.write(l_read + "\t" + tx.get_gene_name() + "\t" +
                            tx.get_transcript_name() + "\n")
        stage2seq = cutseq
        r = None
        if args.sr:
            # Paired end: left mate is the fragment start, right mate is the
            # reverse complement of the fragment end; both share one name.
            r_read = l_read
            l = cutseq[0:args.sr_length]
            r = rc(cutseq[-1 * args.sr_length:])
        elif args.lr:
            l = cutseq
        else:
            l = cutseq
        stage3left = l
        stage3right = r
        if not stage3right:
            stage3right = ''
        #################
        #  l (or l and r) contains the sequence prior to errors being added
        l_qual = 'I' * len(l)
        r_qual = None
        if r:
            r_qual = 'I' * len(r)
        if args.fixed_quality:
            if len(args.fixed_quality) != 1:
                sys.stderr.write(
                    "ERROR fixed quaility should be 1 character\n")
                sys.exit(1)
            l_qual = args.fixed_quality * len(l)
            if r:
                r_qual = args.fixed_quality * len(r)
        elif args.quality_from_error_rate:
            # Phred score -10*log10(p), offset by 33 to get the character.
            qchar = chr(
                int(-10 * math.log10(args.quality_from_error_rate)) + 33)
            l_qual = qchar * len(l)
            if r:
                r_qual = qchar * len(r)
        else:  # default is generate quality from profile
            if not ep:
                sys.stderr.write(
                    "ERROR: cannot generate quality from a profile.  Set error profile or chooce quaility from error rate or fixed quality\n"
                )
                sys.exit(1)
            l_qual = ep.emit_qual(len(l))
            if r:
                r_qual = ep.emit_qual(len(r))
        # Now prior to errors l_qual and r_qual contain our qualities

        l_fastq = Fastq([l_read, l, '+', l_qual])
        r_fastq = None
        if r:
            r_fastq = Fastq([r_read, r, '+', r_qual])
        # Permute sequences by a specific error rate
        if args.specific_errors:
            rate = args.specific_errors
            me = MakeErrors(rand=rnum)
            if args.specific_before_context:
                me.set_before_context(args.specific_before_context)
            if args.specific_after_context:
                me.set_after_context(args.specific_after_context)
            if args.specific_reference_base:
                if args.specific_reference_base != '-':
                    me.set_observed_base(args.specific_reference_base)
            if args.specific_modified_base:
                if args.specific_modified_base != '-':
                    me.set_modified_base(args.specific_modified_base)
            # Both mates receive the same kind of error as the left one.
            if args.specific_reference_base == '-':  # doing insertions
                l_fastq = me.random_insertion(l_fastq, rate)
                if r_fastq:
                    r_fastq = me.random_insertion(r_fastq, rate)
            elif args.specific_modified_base == '-':  # doing deletions
                l_fastq = me.random_deletion(l_fastq, rate)
                if r_fastq:
                    r_fastq = me.random_deletion(r_fastq, rate)
            else:
                l_fastq = me.random_substitution(l_fastq, rate)
                if r_fastq:
                    r_fastq = me.random_substitution(r_fastq, rate)
        elif args.uniform_any_error:
            l_fastq = do_uniform_any(l_fastq, rnum, args.uniform_any_error)
            if r_fastq:
                r_fastq = do_uniform_any(r_fastq, rnum, args.uniform_any_error)
        elif args.uniform_mismatch_error:
            l_fastq = do_uniform_mismatch(l_fastq, rnum,
                                          args.uniform_mismatch_error)
            if r_fastq:
                r_fastq = do_uniform_mismatch(r_fastq, rnum,
                                              args.uniform_mismatch_error)
        elif args.any_error_by_quality:
            l_fastq = do_quality_any(l_fastq, rnum)
            if r_fastq:
                r_fastq = do_quality_any(r_fastq, rnum)
        elif args.mismatch_error_by_quality:
            l_fastq = do_quality_mismatch(l_fastq, rnum)
            if r_fastq:
                r_fastq = do_quality_mismatch(r_fastq, rnum)
        elif args.profile_context_error:
            l_fastq = ep.permute_context(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_context(r_fastq)
        elif args.profile_general_error:
            l_fastq = ep.permute_general(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_general(r_fastq)

        # if SR grown/shrink to appropriate length
        if args.sr and len(l_fastq) != args.sr_length:
            l_fastq = fit_length(l_fastq, args.sr_length, rnum)
        if r:
            if args.sr and len(r_fastq) != args.sr_length:
                r_fastq = fit_length(r_fastq, args.sr_length, rnum)

        of1.write(l_fastq.fastq())
        if of2:
            of2.write(r_fastq.fastq())

        # Capture both final sequences BEFORE writing the change log so the
        # right-mate column is actually populated.
        stage4left = l_fastq.seq
        stage4right = r_fastq.seq if r_fastq else ''
        if of_sc:
            of_sc.write("\t".join([
                l_fastq.name,
                tx.get_gene_name(),
                tx.get_transcript_name(),
                stage1seq,
                stage2seq,
                stage3left,
                stage3right,
                stage4left,
                stage4right,
            ]) + "\n")
        finished_count += 1
        if finished_count % 1000 == 0:
            sys.stderr.write(
                str(finished_count) + '/' + str(args.count) + "   \r")
    sys.stderr.write("\n")
    of1.close()
    if of2:
        of2.close()
    if of_origin:
        of_origin.close()
    if of_sc:
        of_sc.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)