コード例 #1
0
def makeRead(genome, meanLength, sdLength, minReadLength, maxReadLength, id_,
             rate, circularGenome):
    """
    Make a read, according to various parameters and constraints regarding its
    length.

    Note that when circularGenome is False, reads generated using this method
    will not in fact have a mean length of C{meanLength}. This is because they
    are sometimes truncated at the start and end of the genome.

    @param genome: The C{str} genome to base the read on.
    @param meanLength: The C{float} mean read length.
    @param sdLength: The C{float} standard deviation of the read lengths.
    @param minReadLength: The C{int} minimum read length.
    @param maxReadLength: The C{int} maximum read length.
    @param id_: The C{str} read id.
    @param rate: The per-base C{float} mutation rate.
    @param circularGenome: If C{True}, the genome will be treated as circular.
        Reads that would otherwise be truncated by running into the end of the
        genome will continue with bases from the start of the genome.
    @raise ValueError: If no read length can satisfy the constraints, i.e.,
        there is no positive length that is at least C{minReadLength} and
        no greater than both C{maxReadLength} and the genome length.
    @return: A 3-C{tuple} containing the C{Read}, the C{int} offset at which
        the (untruncated) read starts in the genome (this can be negative
        when C{circularGenome} is C{False} and the read overlaps the start
        of the genome), and the mutation offsets (an empty C{tuple} if
        C{rate} is 0.0, otherwise whatever C{mutateRead} returns).
    """
    genomeLen = len(genome)

    # A usable length must be positive, must respect the caller's limits,
    # and must not exceed the genome length.
    shortest = max(1, minReadLength)
    longest = min(maxReadLength, genomeLen)

    if shortest > longest:
        # Without this check the sampling loop below could never terminate.
        raise ValueError(
            'No read length is possible: minReadLength=%d, '
            'maxReadLength=%d, genome length=%d' %
            (minReadLength, maxReadLength, genomeLen))

    length = -1

    # Rejection sampling: draw normally distributed lengths (rounded to the
    # nearest integer) until one falls in the acceptable range.
    while not shortest <= length <= longest:
        length = int(normalvariate(meanLength, sdLength) + 0.5)

    if circularGenome:
        offset = int(uniform(0.0, genomeLen))

        sequence = genome[offset:offset + length]

        # If we didn't get enough from the end of the genome, take whatever
        # else we need from its start.
        if len(sequence) < length:
            sequence += genome[0:length - len(sequence)]

        assert len(sequence) == length
    else:
        # For symmetry, we calculate an offset that allows the read to
        # overlap (by at least minReadLength bases) with the start or end
        # of the genome. If that happens, we truncate the read.
        offset = int(
            uniform(-(length - 1) + minReadLength, genomeLen - minReadLength))

        if offset < 0:
            # The read starts before the genome: keep only the part that
            # overlaps the start of the genome.
            sequence = genome[:offset + length]
        else:
            # Slicing silently truncates a read that runs off the end.
            sequence = genome[offset:offset + length]

    assert maxReadLength >= len(sequence) >= minReadLength, (
        'maxReadLength=%d, len(sequence)=%d, minReadLength=%d '
        'readLength=%d offset=%d' %
        (maxReadLength, len(sequence), minReadLength, length, offset))

    read = Read(id_, sequence)
    mutationOffsets = () if rate == 0.0 else mutateRead(read, rate)
    return read, offset, mutationOffsets
コード例 #2
0
 def testRateZero(self):
     """
     A mutation rate of 0.0 must leave every base of the read untouched
     and report no mutated offsets.
     """
     unmutated = Read('id', 'ACGTACGT')
     mutatedOffsets = mutateRead(unmutated, 0.0, 'Z')
     sequenceAfter = unmutated.sequence
     self.assertEqual('ACGTACGT', sequenceAfter)
     self.assertEqual(0, len(mutatedOffsets))
コード例 #3
0
 def testRateOne(self):
     """
     A mutation rate of 1.0 must change every base of the read and report
     one mutated offset per base.
     """
     fullyMutated = Read('id', 'ACGTACGT')
     mutatedOffsets = mutateRead(fullyMutated, 1.0, 'Z')
     sequenceAfter = fullyMutated.sequence
     self.assertEqual('ZZZZZZZZ', sequenceAfter)
     self.assertEqual(8, len(mutatedOffsets))
コード例 #4
0
def main(args, logfp):
    """
    Create genomes and reads for a multiple infection detection experiment.

    Two genome files are produced in C{args.outputDir} (each either a
    symbolic link to a user-supplied file or a newly generated file), then
    reads are made from each via C{create-reads.py}, and finally the two
    sets of reads are concatenated. If a random first genome is requested
    with a length less than one, an error is printed to standard error and
    the process exits with status 3.

    @param args: A namespace instance, as returned by parse_args
    @param logfp: A file object to write log information to.
    """
    print('Invocation arguments', args, file=logfp)

    outputDir = args.outputDir

    # Unquoted paths, for files that are opened directly from Python.
    genome1 = join(outputDir, 'genome-1.fasta')
    genome2 = join(outputDir, 'genome-2.fasta')
    genome2locations = join(outputDir, 'genome-2.locations')

    # Quoted paths, for interpolation into command strings given to the
    # executor. These must not be used for Python file access: quoting can
    # add characters that are not part of the real file name.
    qGenome1 = quote(genome1)
    qGenome2 = quote(genome2)
    qReads1 = quote(join(outputDir, 'reads-1.fastq'))
    qReads2 = quote(join(outputDir, 'reads-2.fastq'))
    qReads12 = quote(join(outputDir, 'reads-12.fastq'))

    executor = Executor(args.dryRun)

    if args.genome1Filename:
        executor.execute('ln -s %s %s' %
                         (quote(args.genome1Filename), qGenome1))
    else:
        if args.genomeLength < 1:
            print('Random initial genome length must be > 0.', file=sys.stderr)
            sys.exit(3)
        print('Writing random starting genome of length %d to %s' %
              (args.genomeLength, genome1),
              file=logfp)
        if not args.dryRun:
            sequence = ''.join(
                [choice('ACGT') for _ in range(args.genomeLength)])
            with open(genome1, 'w') as fp:
                print('>genome-1\n%s' % sequence, file=fp)

    if args.genome2Filename:
        executor.execute('ln -s %s %s' %
                         (quote(args.genome2Filename), qGenome2))
    elif not args.dryRun:
        # Make a second genome using the given mutation rate. Print its
        # mutated locations to a file. This is skipped on a dry run: the
        # first genome file is not written in that case (and a dry run
        # should not create files in any case).
        (genome1read, ) = list(FastaReads(genome1))
        offsets = mutateRead(genome1read, args.genome2MutationRate)
        with open(genome2locations, 'w') as fp:
            # Offsets are written as 1-based locations, one per line.
            print('\n'.join(str(offset + 1) for offset in sorted(offsets)),
                  file=fp)
        genome1read.id = 'genome-2'
        Reads([genome1read]).save(genome2)

    cmdPrefix = ('create-reads.py --maxReadLength %d --minReadLength %d '
                 '--meanLength %d --sdLength %d --rate %f ' %
                 (args.maxReadLength, args.minReadLength, args.meanReadLength,
                  args.sdReadLength, args.readMutationRate))

    # Make reads from each genome. A genome-specific read count takes
    # precedence over the general one.
    for info in [{
            'reads': qReads1,
            'fasta': qGenome1,
            'number': 1,
            'count': args.genome1ReadCount or args.readCount,
    }, {
            'reads': qReads2,
            'fasta': qGenome2,
            'number': 2,
            'count': args.genome2ReadCount or args.readCount,
    }]:
        executor.execute(cmdPrefix +
                         ('--idPrefix genome-%(number)d-read- '
                          '--count %(count)d < %(fasta)s > %(reads)s' % info))

    executor.execute('cat %s %s > %s' % (qReads1, qReads2, qReads12))

    print('\n'.join(executor.log), file=logfp)
コード例 #5
0
ファイル: mutate-reads.py プロジェクト: acorg/midtools
        '--idSuffix',
        default='',
        help=('Add this string to the end of the read ids. This is added '
              'after the string added by --editIds (if also used).'))

    # Optionally record the number of mutations in each read's id.
    parser.add_argument(
        '--editIds', default=False, action='store_true',
        help=('Add "-mutations:N" to the end of each read id, where N '
              'is the number of mutations introduced to the read.'))

    addFASTACommandLineOptions(parser)
    args = parser.parse_args()
    reads = parseFASTACommandLineOptions(args)

    # Pull the options used in the loop below out of the namespace once.
    format_ = 'fastq' if args.fastq else 'fasta'
    idSuffix = args.idSuffix
    editIds = args.editIds
    verbose = args.verbose
    rate = args.rate

    for read in reads:
        nMutations = len(mutateRead(read, rate))

        if verbose:
            print('%d mutation%s made in read (len %d) %s' %
                  (nMutations, s(nMutations), len(read), read.id),
                  file=sys.stderr)

        # The mutation count (if requested) goes before the fixed suffix.
        newId = read.id
        if editIds:
            newId += '-mutations:%d' % nMutations
        read.id = newId + idSuffix

        print(read.toString(format_), end='')