예제 #1
0
파일: align_pairs.py 프로젝트: yangjl/cgat
def iterate_double_fasta ( fn1, fn2 ):
    iterator = FastaIterator.iterate_together( fn1, fn2 )
    for seq1, seq2 in iterator:
        yield AlignedPairs.UnalignedPair( 
            token1 = seq1.title,
            sequence1 = seq1.sequence,
            token2 = seq2.title,
            sequence2 = seq2.sequence )
예제 #2
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--output-quality-format", dest="q_format", type="int",
        help="sequence quality format, e.g 33 = +33/Sanger"
        "[default=%default].")

    parser.add_option(
        "--output-paired-end", dest="paired", action="store_true",
        help="generate paired end reads [default = %default].")

    parser.add_option(
        "--insert-length-mean", dest="insert_mean", type="float",
        help="mean insert length [default = %default].")

    parser.add_option(
        "--insert-length-sd", dest="insert_sd", type="float",
        help="insert length standard deviation [default = %default].")

    parser.add_option(
        "--counts-method", dest="counts_method", type="choice",
        choices=("reads", "copies"),
        help="simulate a ground truth number of reads per entry or"
        "copies per entry [default = %default].")

    parser.add_option(
        "--counts-min", dest="counts_min", type="float",
        help="minimum number of reads/read pairs per fasta entry"
        "or copies per entry [default = %default].")

    parser.add_option(
        "--counts-max", dest="counts_max", type="float",
        help="maximum number of reads/read pairs per fasta entry "
        "or copies per entry [default = %default].")

    parser.add_option(
        "--output-read-length", dest="read_length", type="int",
        help="read length [default = %default].")

    parser.add_option(
        "--sequence-error-phred", dest="phred", type="int",
        help="phred quality score [default = %default].")

    parser.add_option(
        "--output-counts", dest="output_counts", type="string",
        help="name for counts outfile [default=%default].")

    parser.add_option(
        "--output-fastq2", dest="fastq2_out", type="string",
        help="filename for second fastq outfile [default=%default].")

    parser.add_option(
        "--premrna-fraction", dest="premrna_fraction", type="float",
        help="the fraction of reads to simulate from pre-mRNA"
        "[default= % default].")

    parser.add_option(
        "--infile-premrna-fasta", dest="premrna_fasta", type="string",
        help="filename for pre-mRNA fasta[default=%default].")

    parser.set_defaults(
        q_format=33,
        paired=False,
        insert_mean=0,
        insert_sd=1,
        counts_method="reads",
        counts_min=1,
        counts_max=1,
        read_length=50,
        fastq2_out=None,
        output_counts=None,
        phred=30,
        premrna_fraction=0,
        premrna_fasta=None
    )

    (options, args) = E.Start(parser)

    if options.paired:
        assert options.fastq2_out, ("must specify a second fastq outfile for "
                                    "paired end (--output-fastq2)")
        outf2 = IOTools.openFile(options.fastq2_out, "w")

    if options.premrna_fraction:
        assert options.premrna_fasta, ("must specfify the location of the"
                                       "fasta file for the pre-mRNA")

    # the sequence quality string will always be the same so define here
    sequence_quality = chr(options.q_format + options.phred)
    qual = "".join([sequence_quality] * options.read_length)

    if options.premrna_fraction:
        iterator = FastaIterator.iterate_together(
            options.stdin, IOTools.openFile(options.premrna_fasta))
    else:
        iterator = FastaIterator.FastaIterator(options.stdin)

    # set a cut off of twice the read/pair length for short entries
    if options.paired:
        minimum_entry_length = (
            2 * ((options.read_length * 2) + options.insert_mean))
    else:
        minimum_entry_length = 2 * options.read_length

    c = collections.Counter()
    counts = collections.Counter()
    copies = collections.Counter()

    for f_entry in iterator:

        if options.premrna_fraction:

            assert getTitle(f_entry[0]) == getTitle(f_entry[1]), (
                "entry ids do not match: %s != %s" % (
                    f_entry[0].title, f_entry[1].title))
            entry = f_entry[0]
            pre_entry = f_entry[1]

        else:
            entry = f_entry

        # reject short fasta entries
        if len(entry.sequence) < minimum_entry_length:
            E.info("skipping short transcript: %s length=%i"
                   % (entry.title, len(entry.sequence)))
            c['skipped'] += 1
            continue

        else:
            c['not_skipped'] += 1

        if options.paired:
            fragment_length = (
                (2 * options.read_length) + options.insert_mean)
        else:
            fragment_length = options.read_length

        reads_per_entry = float(len(entry.sequence)) / fragment_length

        if options.counts_method == "reads":
            n_reads = random.randint(options.counts_min,
                                     options.counts_max + 1)

            n_copies = float(n_reads) / reads_per_entry

            if options.premrna_fraction:
                n_reads_pre = int(round(n_reads * options.premrna_fraction))

        elif options.counts_method == "copies":

            # random float [0-1]
            rand = np.random.random_sample()
            n_copies = (options.counts_min +
                        (rand * (options.counts_max - options.counts_min)))

            n_reads = int(round(n_copies * reads_per_entry, 0))

            # as n_reads must be rounded to int, need to redefine n_copies
            n_copies = float(n_reads) / reads_per_entry

            if options.premrna_fraction:
                reads_per_pre_entry = (float(len(pre_entry.sequence)) /
                                       fragment_length)
                n_copies_pre = n_copies * options.premrna_fraction
                n_reads_pre = int(round(n_copies_pre * reads_per_pre_entry, 0))
                # as n_reads_pre must be rounded to int, need to
                # redefine n_copies_pre
                n_copies_pre = float(n_reads_pre) / reads_per_pre_entry

        entry_id = getTitle(entry)

        counts[entry_id] = n_reads
        copies[entry_id] = n_copies

        if "N" in entry.sequence.upper():
            E.warn("fasta entry %s contains unknown bases ('N')" % entry_id)

        for i in range(0, n_reads):

            read = generateRead(entry=entry.sequence.upper(),
                                read_length=options.read_length,
                                error_rate=options.phred,
                                paired=options.paired,
                                insert_mean=options.insert_mean,
                                insert_sd=options.insert_sd)

            if options.paired:
                r1, r2 = read
                h1 = "@%s_%i/1" % (entry_id, i)
                h2 = "@%s_%i/2" % (entry_id, i)

                options.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

            else:
                h = "@%s_%i/1" % (entry_id, i)

                options.stdout.write("\n".join((h, read, "+", qual)) + "\n")

        if options.premrna_fraction:
            c['pre_counts'] += n_reads_pre
            c['pre_copies'] += n_copies_pre

            for i in range(0, n_reads_pre):

                read = generateRead(entry=pre_entry.sequence.upper(),
                                    read_length=options.read_length,
                                    error_rate=options.phred,
                                    paired=options.paired,
                                    insert_mean=options.insert_mean,
                                    insert_sd=options.insert_sd)

                if options.paired:
                    r1, r2 = read
                    h1 = "@%s_pre-mRNA_%i/1" % (entry_id, i)
                    h2 = "@%s_pre-mRNA_%i/2" % (entry_id, i)

                    options.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                    outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

                else:
                    h = "@%s_pre-mRNA_%i/1" % (entry_id, i)

                    options.stdout.write("\n".join((h, read, "+", qual)) + "\n")

    if options.paired:
        outf2.close()

    with IOTools.openFile(options.output_counts, "w") as counts_out:

        counts_out.write("%s\n" % "\t".join(("id", "read_count", "tpm")))

        sum_copies = sum(copies.values())
        sum_counts = sum(counts.values())

        for entry_id, count in counts.items():
            tpm = 1000000 * (float(copies[entry_id]) / sum_copies)
            counts_out.write(
                "%s\n" % "\t".join(map(str, (entry_id, count, tpm))))

    E.info("Reads simulated for %i fasta entries, %i entries skipped"
           % (c['not_skipped'], c['skipped']))

    E.info("Simulated: %i reads (%i mRNA, %i pre-mRNA), "
           "%f transcripts (%f mRNA, %f pre-mRNA)" % (
               sum_counts + c['pre_counts'], sum_counts, c['pre_counts'],
               sum_copies + c['pre_copies'], sum_copies, c['pre_copies']))

    E.Stop()
예제 #3
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option("--output-quality-format",
                      dest="q_format",
                      type="int",
                      help="sequence quality format, e.g 33 = +33/Sanger"
                      "[default=%default].")

    parser.add_option("--output-paired-end",
                      dest="paired",
                      action="store_true",
                      help="generate paired end reads [default = %default].")

    parser.add_option("--insert-length-mean",
                      dest="insert_mean",
                      type="float",
                      help="mean insert length [default = %default].")

    parser.add_option(
        "--insert-length-sd",
        dest="insert_sd",
        type="float",
        help="insert length standard deviation [default = %default].")

    parser.add_option(
        "--counts-method",
        dest="counts_method",
        type="choice",
        choices=("reads", "copies"),
        help="simulate a ground truth number of reads per entry or"
        "copies per entry [default = %default].")

    parser.add_option("--counts-min",
                      dest="counts_min",
                      type="float",
                      help="minimum number of reads/read pairs per fasta entry"
                      "or copies per entry [default = %default].")

    parser.add_option(
        "--counts-max",
        dest="counts_max",
        type="float",
        help="maximum number of reads/read pairs per fasta entry "
        "or copies per entry [default = %default].")

    parser.add_option("--output-read-length",
                      dest="read_length",
                      type="int",
                      help="read length [default = %default].")

    parser.add_option("--sequence-error-phred",
                      dest="phred",
                      type="int",
                      help="phred quality score [default = %default].")

    parser.add_option("--output-counts",
                      dest="output_counts",
                      type="string",
                      help="name for counts outfile [default=%default].")

    parser.add_option(
        "--output-fastq2",
        dest="fastq2_out",
        type="string",
        help="filename for second fastq outfile [default=%default].")

    parser.add_option("--premrna-fraction",
                      dest="premrna_fraction",
                      type="float",
                      help="the fraction of reads to simulate from pre-mRNA"
                      "[default= % default].")

    parser.add_option("--infile-premrna-fasta",
                      dest="premrna_fasta",
                      type="string",
                      help="filename for pre-mRNA fasta[default=%default].")

    parser.set_defaults(q_format=33,
                        paired=False,
                        insert_mean=0,
                        insert_sd=1,
                        counts_method="reads",
                        counts_min=1,
                        counts_max=1,
                        read_length=50,
                        fastq2_out=None,
                        output_counts=None,
                        phred=30,
                        premrna_fraction=0,
                        premrna_fasta=None)

    (options, args) = E.Start(parser)

    if options.paired:
        assert options.fastq2_out, ("must specify a second fastq outfile for "
                                    "paired end (--output-fastq2)")
        outf2 = IOTools.openFile(options.fastq2_out, "w")

    if options.premrna_fraction:
        assert options.premrna_fasta, ("must specfify the location of the"
                                       "fasta file for the pre-mRNA")

    # the sequence quality string will always be the same so define here
    sequence_quality = chr(options.q_format + options.phred)
    qual = "".join([sequence_quality] * options.read_length)

    if options.premrna_fraction:
        iterator = FastaIterator.iterate_together(
            options.stdin, IOTools.openFile(options.premrna_fasta))
    else:
        iterator = FastaIterator.FastaIterator(options.stdin)

    # set a cut off of twice the read/pair length for short entries
    if options.paired:
        minimum_entry_length = (
            2 * ((options.read_length * 2) + options.insert_mean))
    else:
        minimum_entry_length = 2 * options.read_length

    c = collections.Counter()
    counts = collections.Counter()
    copies = collections.Counter()

    for f_entry in iterator:

        if options.premrna_fraction:

            assert getTitle(f_entry[0]) == getTitle(
                f_entry[1]), ("entry ids do not match: %s != %s" %
                              (f_entry[0].title, f_entry[1].title))
            entry = f_entry[0]
            pre_entry = f_entry[1]

        else:
            entry = f_entry

        # reject short fasta entries
        if len(entry.sequence) < minimum_entry_length:
            E.info("skipping short transcript: %s length=%i" %
                   (entry.title, len(entry.sequence)))
            c['skipped'] += 1
            continue

        else:
            c['not_skipped'] += 1

        if options.paired:
            fragment_length = ((2 * options.read_length) + options.insert_mean)
        else:
            fragment_length = options.read_length

        reads_per_entry = float(len(entry.sequence)) / fragment_length

        if options.counts_method == "reads":
            n_reads = random.randint(options.counts_min,
                                     options.counts_max + 1)

            n_copies = float(n_reads) / reads_per_entry

            if options.premrna_fraction:
                n_reads_pre = int(round(n_reads * options.premrna_fraction))

        elif options.counts_method == "copies":

            # random float [0-1]
            rand = np.random.random_sample()
            n_copies = (options.counts_min +
                        (rand * (options.counts_max - options.counts_min)))

            n_reads = int(round(n_copies * reads_per_entry, 0))

            # as n_reads must be rounded to int, need to redefine n_copies
            n_copies = float(n_reads) / reads_per_entry

            if options.premrna_fraction:
                reads_per_pre_entry = (float(len(pre_entry.sequence)) /
                                       fragment_length)
                n_copies_pre = n_copies * options.premrna_fraction
                n_reads_pre = int(round(n_copies_pre * reads_per_pre_entry, 0))
                # as n_reads_pre must be rounded to int, need to
                # redefine n_copies_pre
                n_copies_pre = float(n_reads_pre) / reads_per_pre_entry

        entry_id = getTitle(entry)

        counts[entry_id] = n_reads
        copies[entry_id] = n_copies

        if "N" in entry.sequence.upper():
            E.warn("fasta entry %s contains unknown bases ('N')" % entry_id)

        for i in range(0, n_reads):

            read = generateRead(entry=entry.sequence.upper(),
                                read_length=options.read_length,
                                error_rate=options.phred,
                                paired=options.paired,
                                insert_mean=options.insert_mean,
                                insert_sd=options.insert_sd)

            if options.paired:
                r1, r2 = read
                h1 = "@%s_%i/1" % (entry_id, i)
                h2 = "@%s_%i/2" % (entry_id, i)

                options.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

            else:
                h = "@%s_%i/1" % (entry_id, i)

                options.stdout.write("\n".join((h, read, "+", qual)) + "\n")

        if options.premrna_fraction:
            c['pre_counts'] += n_reads_pre
            c['pre_copies'] += n_copies_pre

            for i in range(0, n_reads_pre):

                read = generateRead(entry=pre_entry.sequence.upper(),
                                    read_length=options.read_length,
                                    error_rate=options.phred,
                                    paired=options.paired,
                                    insert_mean=options.insert_mean,
                                    insert_sd=options.insert_sd)

                if options.paired:
                    r1, r2 = read
                    h1 = "@%s_pre-mRNA_%i/1" % (entry_id, i)
                    h2 = "@%s_pre-mRNA_%i/2" % (entry_id, i)

                    options.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                    outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

                else:
                    h = "@%s_pre-mRNA_%i/1" % (entry_id, i)

                    options.stdout.write("\n".join((h, read, "+", qual)) +
                                         "\n")

    if options.paired:
        outf2.close()

    with IOTools.openFile(options.output_counts, "w") as counts_out:

        counts_out.write("%s\n" % "\t".join(("id", "read_count", "tpm")))

        sum_copies = sum(copies.values())
        sum_counts = sum(counts.values())

        for entry_id, count in counts.items():
            tpm = 1000000 * (float(copies[entry_id]) / sum_copies)
            counts_out.write("%s\n" %
                             "\t".join(map(str, (entry_id, count, tpm))))

    E.info("Reads simulated for %i fasta entries, %i entries skipped" %
           (c['not_skipped'], c['skipped']))

    E.info("Simulated: %i reads (%i mRNA, %i pre-mRNA), "
           "%f transcripts (%f mRNA, %f pre-mRNA)" %
           (sum_counts + c['pre_counts'], sum_counts, c['pre_counts'],
            sum_copies + c['pre_copies'], sum_copies, c['pre_copies']))

    E.Stop()