Esempio n. 1
0
def test_introduce_errors():
    """Check the phred qualities attached to a forward read.

    The numpy random generator is seeded so that the first ten quality
    values returned by `introduce_error_scores` can be compared with a fixed
    expected list.
    """
    np.random.seed(42)
    error_model = basic.BasicErrorModel()

    template = Seq(str('AATGC' * 25), IUPAC.unambiguous_dna)
    record = SeqRecord(template, id='read_1', description='test read')
    scored = error_model.introduce_error_scores(record, 'forward')

    expected_head = [40, 26, 40, 40, 25, 25, 40, 40, 22, 40]
    observed_head = scored.letter_annotations["phred_quality"][:10]
    assert observed_head == expected_head
Esempio n. 2
0
def test_simulate_and_save_short():
    """Run `generator.reads` on a 500-letter reference sequence.

    Nothing is asserted: the test only requires that the call completes
    without raising.
    """
    error_model = basic.BasicErrorModel()
    reference_seq = Seq(str('AACCC' * 100), IUPAC.unambiguous_dna)
    reference = SeqRecord(reference_seq,
                          id='my_genome',
                          description='test genome')
    generator.reads(reference, error_model, 1000, 0, 'data/.test', 0, True)
Esempio n. 3
0
def test_basic():
    """Simulate one read pair and check the tail of the joined sequences.

    The body only runs when `sys.version_info > (3, )`; on older
    interpreters the test returns without asserting anything.
    """
    if not sys.version_info > (3, ):
        return

    random.seed(42)
    np.random.seed(42)
    error_model = basic.BasicErrorModel()
    reference = SeqRecord(
        Seq(str('AAAAACCCCC' * 100), IUPAC.unambiguous_dna),
        id='my_genome',
        description='test genome')
    pair = generator.simulate_read(reference, error_model, 1)

    # both mates back to back, first element of the pair first
    first_mate = str(pair[0].seq)
    second_mate = str(pair[1].seq)
    joined = first_mate + second_mate
    assert joined[-15:] == 'TTTTGGGGGTTTTTG'
Esempio n. 4
0
def test_mut_sequence():
    """Check the substitutions `mut_sequence` makes in a poly-A read.

    Both random generators are seeded and every base is given a phred
    quality of 5; the first ten bases of the result are then compared with
    a fixed expected string.
    """
    random.seed(42)
    np.random.seed(42)

    error_model = basic.BasicErrorModel()

    poly_a = Seq(str('AAAAA' * 25), IUPAC.unambiguous_dna)
    record = SeqRecord(poly_a, id='read_1', description='test read')
    record.letter_annotations["phred_quality"] = [5] * 125
    record.seq = error_model.mut_sequence(record, 'forward')

    head = str(record.seq[:10])
    assert head == 'AAAACAGAAA'
Esempio n. 5
0
def test_introduce_indels():
    """Check that `introduce_indels` edits a read but keeps its length.

    Both random generators are seeded. The model's `ins_for[1]['G']` and
    `del_for[0]['A']` entries are set to 1.0 before the call (presumably
    forcing a G insertion and an A deletion at those positions -- confirm
    against BasicErrorModel).
    """
    random.seed(42)
    np.random.seed(42)

    err_mod = basic.BasicErrorModel()
    err_mod.ins_for[1]['G'] = 1.0
    err_mod.del_for[0]['A'] = 1.0
    bounds = (5, 130)
    read = SeqRecord(Seq(str('ATATA' * 25), IUPAC.unambiguous_dna),
                     id='read_1',
                     description='test read')
    ref_genome = SeqRecord(Seq(str('ATATA' * 100), IUPAC.unambiguous_dna),
                           id='ref_genome',
                           description='test reference')
    read.seq = err_mod.introduce_indels(read, 'forward', ref_genome, bounds)
    assert len(read.seq) == 125
    # compare as plain strings, like test_mut_sequence does, so the check
    # does not rely on how Seq objects implement equality with str
    assert str(read.seq[:10]) == 'ATGATAATAT'
Esempio n. 6
0
def generate_reads(args):
    """Main function for the `iss generate` submodule

    This submodule generates reads from an ErrorModel and write them to
        args.output + _R(1|2).fastq

    The function runs in four stages:
        1. load the error model selected by args.mode ('kde', 'basic' or
           'perfect'), seeding the random generators if args.seed is set
        2. gather the input genomes (--genomes, --draft, --ncbi) into the
           temporary file args.output + '.iss.tmp.genomes.fasta'
        3. get an abundance (or coverage) value for each record
        4. generate the reads of each record in parallel, concatenate the
           temporary fastq files and remove the temporary files

    Args:
        args (object): the command-line arguments from argparse

    Raises:
        SystemExit: with status 1 on invalid arguments, unreadable or empty
            genome file, missing abundance or keyboard interruption
    """
    logger = logging.getLogger(__name__)
    logger.debug('iss version %s' % __version__)
    logger.debug('Using verbose logger')

    try:  # try to import and load the correct error model
        logger.info('Starting iss generate')
        logger.info('Using %s ErrorModel' % args.mode)
        if args.seed:
            logger.info('Setting random seed to %i' % args.seed)
            random.seed(args.seed)
            np.random.seed(args.seed)
        if args.mode == 'kde':
            from iss.error_models import kde
            # profiles shipped with the package, keyed by lower-cased name
            builtin_profiles = {
                'hiseq': 'profiles/HiSeq',
                'novaseq': 'profiles/NovaSeq',
                'miseq': 'profiles/MiSeq'
            }
            if args.model is None:
                logger.error('--model is required in --mode kde')
                sys.exit(1)
            profile = builtin_profiles.get(args.model.lower())
            if profile is not None:
                npz = os.path.join(os.path.dirname(__file__), profile)
            else:  # anything else is passed on unchanged
                npz = args.model
            err_mod = kde.KDErrorModel(npz)
        elif args.mode == 'basic':
            if args.model is not None:
                logger.warning('--model %s will be ignored in --mode %s' %
                               (args.model, args.mode))
            from iss.error_models import basic
            err_mod = basic.BasicErrorModel()
        elif args.mode == 'perfect':
            if args.model is not None:
                logger.warning('--model %s will be ignored in --mode %s' %
                               (args.model, args.mode))
            from iss.error_models import perfect
            err_mod = perfect.PerfectErrorModel()
        else:
            # without this branch err_mod would stay unbound and the
            # function would fail later with a NameError
            logger.error('Unknown --mode: %s' % args.mode)
            sys.exit(1)
    except ImportError as e:
        logger.error('Failed to import ErrorModel module: %s' % e)
        sys.exit(1)

    try:  # try to read genomes and concatenate --genomes and --ncbi genomes
        if args.genomes or args.draft or args.ncbi:
            genome_files = []
            if args.genomes:
                genome_files.extend(args.genomes)
            if args.draft:
                logger.warning('--draft is in early experimental stage.')
                logger.warning(
                    'disabling --abundance_file, --coverage and --n_genomes')
                logger.warning('Defaulting to --abundance.')
                genome_files.extend(args.draft)
            if args.ncbi and args.n_genomes_ncbi:
                ncbi_fasta = args.output + '_ncbi_genomes.fasta'
                util.genome_file_exists(ncbi_fasta)
                # both options are lists of lists: flatten them before
                # comparing, so that repeated flags are handled too
                # (len(*nested) only works with a single inner list)
                args.ncbi = [x for y in args.ncbi for x in y]
                args.n_genomes_ncbi = [
                    x for y in args.n_genomes_ncbi for x in y
                ]
                # explicit check rather than assert, which -O strips
                if len(args.ncbi) != len(args.n_genomes_ncbi):
                    logger.error(
                        '--ncbi and --n_genomes_ncbi of unequal lengths. '
                        'Aborting')
                    sys.exit(1)
                # every call is given the same output path, so the file
                # name is added to genome_files only once after the loop
                for g, n in zip(args.ncbi, args.n_genomes_ncbi):
                    genomes_ncbi = download.ncbi(g, n, ncbi_fasta)
                genome_files.append(genomes_ncbi)
            if args.ncbi and not args.n_genomes_ncbi:
                logger.error(
                    '--ncbi/-k requires --n_genomes_ncbi/-U. Aborting.')
                sys.exit(1)

        else:
            logger.error("One of --genomes/-g, --draft, --ncbi/-k is required")
            sys.exit(1)

        genome_file = args.output + '.iss.tmp.genomes.fasta'
        util.concatenate(genome_files, output=genome_file)

        # for n_genomes we use reservoir sampling to draw random genomes
        # from the concatenated genome file. We then override the file.
        if args.n_genomes and not args.draft and not args.ncbi:
            genome_count = util.count_records(genome_file)
            # materialise the sample before the file is overwritten
            genome_files = list(
                util.reservoir(SeqIO.parse(genome_file, 'fasta'),
                               genome_count, args.n_genomes))
            SeqIO.write(genome_files, genome_file, 'fasta')

        # explicit check rather than assert, which -O strips
        if os.stat(genome_file).st_size == 0:
            logger.error('Genome(s) file seems empty: %s' % genome_file)
            sys.exit(1)
        with open(genome_file, 'r') as f:  # count the number of records
            genome_list = util.count_records(f)
    except IOError as e:
        logger.error('Failed to open genome(s) file:%s' % e)
        sys.exit(1)
    except AssertionError as e:
        # kept for assertions raised by the helpers called above
        logger.error('Genome(s) file seems empty: %s' % genome_file)
        sys.exit(1)
    except KeyboardInterrupt as e:
        logger.error('iss generate interrupted: %s' % e)
        sys.exit(1)
    else:
        abundance_dispatch = {
            'uniform': abundance.uniform,
            'halfnormal': abundance.halfnormal,
            'exponential': abundance.exponential,
            'lognormal': abundance.lognormal,
            'zero_inflated_lognormal': abundance.zero_inflated_lognormal
        }
        # read the abundance file
        if args.abundance_file and not args.draft:
            logger.info('Using abundance file:%s' % args.abundance_file)
            abundance_dic = abundance.parse_abundance_file(args.abundance_file)
        elif args.coverage and not args.draft:
            logger.warning('--coverage is an experimental feature')
            logger.info('Using coverage file:%s' % args.coverage)
            abundance_dic = abundance.parse_abundance_file(args.coverage)
        elif args.abundance in abundance_dispatch:
            logger.info('Using %s abundance distribution' % args.abundance)
            if args.draft:
                abundance_dic = abundance.draft(
                    genome_list, args.draft,
                    abundance_dispatch[args.abundance], args.output)
            else:
                abundance_dic = abundance_dispatch[args.abundance](genome_list)
                abundance.to_file(abundance_dic, args.output)
        else:
            logger.error('Could not get abundance')
            sys.exit(1)

        cpus = args.cpus
        logger.info('Using %s cpus for read generation' % cpus)

        # with --coverage the per-record value is used directly as the
        # coverage, so n_reads is neither computed nor needed
        if not args.coverage:
            n_reads = util.convert_n_reads(args.n_reads)
            logger.info('Generating %s reads' % n_reads)

        memmap_path = "%s.memmap" % args.output
        # bound before the try so the interrupt handler can always read it
        temp_file_list = []  # list holding the prefix of all temp files
        try:
            with open(genome_file, 'r') as f:  # re-opens the file
                fasta_file = SeqIO.parse(f, 'fasta')

                for record in fasta_file:
                    # generate reads for records
                    try:
                        species_abundance = abundance_dic[record.id]
                    except KeyError as e:
                        logger.error(
                            'Fasta record not found in abundance file: %s' % e)
                        sys.exit(1)

                    logger.info('Generating reads for record: %s' % record.id)
                    genome_size = len(record.seq)

                    if args.coverage:
                        coverage = species_abundance
                    else:
                        coverage = abundance.to_coverage(
                            n_reads, species_abundance,
                            err_mod.read_length, genome_size)
                    n_pairs = int(
                        round((coverage * genome_size) /
                              err_mod.read_length) / 2)
                    # skip record if n_reads == 0
                    if n_pairs == 0:
                        continue

                    # exact n_reads for each cpus: an even share for each
                    # job, the remainder (if any) going to the last one
                    n_pairs_per_cpu = [n_pairs // cpus] * cpus
                    n_pairs_per_cpu[-1] += n_pairs % cpus

                    # due to a bug in multiprocessing
                    # https://bugs.python.org/issue17560
                    # we can't send records taking more than 2**31 bytes
                    # through serialisation.
                    # In those cases we use memmapping
                    if sys.getsizeof(str(record.seq)) >= 2**31 - 1:
                        logger.warning("record %s unusually big." % record.id)
                        logger.warning("Using a memory map.")
                        mode = "memmap"

                        record_mmap = memmap_path
                        if os.path.exists(record_mmap):
                            os.unlink(record_mmap)
                        util.dump(record, record_mmap)
                        # drop the in-memory record: the jobs are handed
                        # the path of the dump instead
                        del record
                        record = record_mmap
                        gc.collect()
                    else:
                        mode = "default"

                    record_file_name_list = Parallel(n_jobs=cpus)(
                        delayed(generator.reads)(
                            record, err_mod, n_pairs_per_cpu[i], i,
                            args.output, args.seed, args.gc_bias, mode)
                        for i in range(cpus))
                    temp_file_list.extend(record_file_name_list)
        except KeyboardInterrupt as e:
            logger.error('iss generate interrupted: %s' % e)
            temp_file_unique = _unique_in_order(temp_file_list)
            temp_R1 = [prefix + '_R1.fastq' for prefix in temp_file_unique]
            temp_R2 = [prefix + '_R2.fastq' for prefix in temp_file_unique]
            full_tmp_list = temp_R1 + temp_R2
            full_tmp_list.append(genome_file)
            if os.path.exists(memmap_path):
                full_tmp_list.append(memmap_path)
            util.cleanup(full_tmp_list)
            sys.exit(1)
        else:
            # remove the duplicates in file list and cleanup
            # we remove the duplicates in case two records had the same header
            # and reads were appended to the same temp file.
            temp_file_unique = _unique_in_order(temp_file_list)
            temp_R1 = [prefix + '_R1.fastq' for prefix in temp_file_unique]
            temp_R2 = [prefix + '_R2.fastq' for prefix in temp_file_unique]
            util.concatenate(temp_R1, args.output + '_R1.fastq')
            util.concatenate(temp_R2, args.output + '_R2.fastq')
            full_tmp_list = temp_R1 + temp_R2
            full_tmp_list.append(genome_file)
            if os.path.exists(memmap_path):
                full_tmp_list.append(memmap_path)
            util.cleanup(full_tmp_list)
            if args.compress:
                util.compress(args.output + '_R1.fastq')
                util.compress(args.output + '_R2.fastq')
            logger.info('Read generation complete')


def _unique_in_order(items):
    """Return `items` as a list without duplicates, first occurrences kept.

    Unlike list(set(items)) the original order is preserved, so the order in
    which temporary files are concatenated does not vary between runs.
    """
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
Esempio n. 7
0
def test_basic_phred():
    """Check the first ten phred scores generated for the forward read.

    The numpy random generator is seeded so the output of
    `gen_phred_scores(20, 'forward')` can be compared with fixed values.
    """
    np.random.seed(42)
    error_model = basic.BasicErrorModel()

    expected_head = [23, 19, 25, 40, 19, 19, 40, 26, 18, 23]
    scores = error_model.gen_phred_scores(20, 'forward')
    assert scores[:10] == expected_head
Esempio n. 8
0
def generate_reads(args):
    """Main function for the `iss generate` submodule

    This submodule generates reads from an ErrorModel and write them to
        args.output + _R(1|2).fastq

    The function loads the error model selected by args.mode ('kde' or
    'basic'), takes the genomes from args.genomes or downloads them
    (--ncbi with --n_genomes), gets an abundance for each record, then
    generates the reads of each record in parallel before concatenating
    and removing the temporary files.

    Args:
        args (object): the command-line arguments from argparse

    Raises:
        SystemExit: with status 1 on invalid arguments, unreadable or empty
            genome file, missing abundance or keyboard interruption
    """
    logger = logging.getLogger(__name__)
    logger.debug('iss version %s' % __version__)
    logger.debug('Using verbose logger')

    try:  # try to import and load the correct error model
        logger.info('Starting iss generate')
        logger.info('Using %s ErrorModel' % args.mode)
        if args.mode == 'kde':
            from iss.error_models import kde
            # a missing --model has to be detected before .lower() is
            # called on it, otherwise None raises an AttributeError
            if args.model is None:
                logger.error('--model is required in --mode kde')
                sys.exit(1)
            elif args.model.lower() == 'hiseq':
                npz = os.path.join(os.path.dirname(__file__), 'profiles/HiSeq')
            elif args.model.lower() == 'novaseq':
                npz = os.path.join(os.path.dirname(__file__),
                                   'profiles/NovaSeq')
            elif args.model.lower() == 'miseq':
                npz = os.path.join(os.path.dirname(__file__), 'profiles/MiSeq')
            else:
                npz = args.model
            err_mod = kde.KDErrorModel(npz)
        elif args.mode == 'basic':
            if args.model is not None:
                logger.warning('--model %s will be ignored in --mode %s' %
                               (args.model, args.mode))
            from iss.error_models import basic
            err_mod = basic.BasicErrorModel()
        else:
            # without this branch err_mod would stay unbound and the
            # function would fail later with a NameError
            logger.error('Unknown --mode: %s' % args.mode)
            sys.exit(1)
    except ImportError as e:
        logger.error('Failed to import ErrorModel module: %s' % e)
        sys.exit(1)

    try:  # try to read genomes and generate reads
        if args.genomes:
            genome_file = args.genomes
        elif args.ncbi and args.n_genomes:
            util.genome_file_exists(args.output + '_genomes.fasta')
            total_genomes = []
            # both options are lists of lists: flatten them before
            # comparing, so that repeated flags are handled too
            # (len(*nested) only works with a single inner list)
            args.ncbi = [x for y in args.ncbi for x in y]
            args.n_genomes = [x for y in args.n_genomes for x in y]
            # explicit check rather than assert, which -O strips
            if len(args.ncbi) != len(args.n_genomes):
                logger.error(
                    '--ncbi and --n_genomes of unequal lengths. Aborting')
                sys.exit(1)
            for g, n in zip(args.ncbi, args.n_genomes):
                genomes = download.ncbi(g, n)
                total_genomes.extend(genomes)
            genome_file = download.to_fasta(total_genomes, args.output)
        else:
            logger.error('Invalid input')  # TODO better error handling here
            sys.exit(1)

        # explicit check rather than assert, which -O strips
        if os.stat(genome_file).st_size == 0:
            logger.error('Genome(s) file seems empty: %s' % genome_file)
            sys.exit(1)
        with open(genome_file, 'r') as f:  # count the number of records
            record_list = util.count_records(f)
    except IOError as e:
        logger.error('Failed to open genome(s) file:%s' % e)
        sys.exit(1)
    except AssertionError as e:
        # kept for assertions raised by the helpers called above
        logger.error('Genome(s) file seems empty: %s' % genome_file)
        sys.exit(1)
    except KeyboardInterrupt as e:
        logger.error('iss generate interrupted: %s' % e)
        sys.exit(1)
    else:
        abundance_dispatch = {
            'uniform': abundance.uniform,
            'halfnormal': abundance.halfnormal,
            'exponential': abundance.exponential,
            'lognormal': abundance.lognormal,
            'zero_inflated_lognormal': abundance.zero_inflated_lognormal
        }
        # read the abundance file
        if args.abundance_file:
            logger.info('Using abundance file:%s' % args.abundance_file)
            abundance_dic = abundance.parse_abundance_file(args.abundance_file)
        elif args.abundance in abundance_dispatch:
            logger.info('Using %s abundance distribution' % args.abundance)
            abundance_dic = abundance_dispatch[args.abundance](record_list)
            abundance.to_file(abundance_dic, args.output)
        else:
            logger.error('Could not get abundance')
            sys.exit(1)

        cpus = args.cpus
        logger.info('Using %s cpus for read generation' % cpus)

        n_reads = util.convert_n_reads(args.n_reads)
        logger.info('Generating %s reads' % n_reads)

        # bound before the try so the interrupt handler can always read it
        temp_file_list = []  # list holding the prefix of all temp files
        try:
            with open(genome_file, 'r') as f:  # re-opens the file
                fasta_file = SeqIO.parse(f, 'fasta')
                for record in fasta_file:  # generate reads for each record
                    try:
                        species_abundance = abundance_dic[record.id]
                    except KeyError as e:
                        logger.error(
                            'Fasta record not found in abundance file: %s' % e)
                        sys.exit(1)

                    logger.info('Generating reads for record: %s' % record.id)
                    genome_size = len(record.seq)

                    coverage = abundance.to_coverage(
                        n_reads, species_abundance, err_mod.read_length,
                        genome_size)
                    n_pairs = int(
                        round((coverage * genome_size) /
                              err_mod.read_length) / 2)

                    # good enough approximation
                    n_pairs_per_cpu = int(round(n_pairs / cpus))

                    record_file_name_list = Parallel(n_jobs=cpus)(
                        delayed(generator.reads)
                        (record, err_mod, n_pairs_per_cpu, i, args.output,
                         args.gc_bias) for i in range(cpus))
                    temp_file_list.extend(record_file_name_list)
        except KeyboardInterrupt as e:
            logger.error('iss generate interrupted: %s' % e)
            generator.cleanup(temp_file_list)
            sys.exit(1)
        else:
            # remove the duplicates in file list and cleanup
            # we remove the duplicates in case two records had the same header
            # and reads were appended to the same temp file.
            temp_file_unique = list(set(temp_file_list))
            generator.concatenate(temp_file_unique, args.output)
            generator.cleanup(temp_file_unique)
            logger.info('Read generation complete')