Example #1
0
def test_count_records():
    """Check that util.count_records returns the expected record ids

    The fixture `data/genomes.fasta` is opened as a text handle and passed
    to `util.count_records`; the returned list is compared, order included,
    against the five expected ids.
    """
    expected_ids = [
        'genome_A', 'genome_T', 'genome_GC', 'genome_ATCG', 'genome_TA'
    ]
    with open('data/genomes.fasta', 'r') as handle:
        observed_ids = util.count_records(handle)
    assert observed_ids == expected_ids
Example #2
0
def test_reservoir():
    """Check that util.reservoir yields exactly the requested sample size

    The record list is first obtained with `util.count_records`, then the
    fasta file is re-opened and parsed so that `util.reservoir` can draw
    two records from it.
    """
    fasta_path = 'data/genomes.fasta'
    sample_size = 2

    with open(fasta_path, 'r') as handle:
        record_list = util.count_records(handle)

    with open(fasta_path, 'r') as handle:
        parsed_records = SeqIO.parse(handle, 'fasta')
        sampled_ids = [
            picked.id
            for picked in util.reservoir(parsed_records, record_list,
                                         sample_size)
        ]

    assert len(sampled_ids) == 2
Example #3
0
def test_distributions():
    """Check the abundance distributions against fixed-seed expectations

    The numpy seed is set to 42 before drawing the uniform, halfnormal,
    exponential and lognormal abundances (in that order), and set to 42
    again before drawing the zero-inflated lognormal abundances.
    """
    np.random.seed(42)
    with open('data/genomes.fasta', 'r') as handle:
        genome_ids = util.count_records(handle)

    # the call order matters: each distribution consumes the seeded stream
    uniform = abundance.uniform(genome_ids)
    halfnormal = abundance.halfnormal(genome_ids)
    exponential = abundance.exponential(genome_ids)
    lognormal = abundance.lognormal(genome_ids)

    # reset the seed to get 0s in zero_inflated_lognormal
    np.random.seed(42)
    zero_inflated = abundance.zero_inflated_lognormal(genome_ids)

    assert list(uniform.values()) == [0.2] * 5
    assert round(halfnormal['genome_A'], 2) == 0.16
    assert round(exponential['genome_A'], 2) == 0.01
    assert round(lognormal['genome_T'], 2) == 0.19
    assert zero_inflated['genome_T'] == 0.0
    assert round(zero_inflated['genome_A'], 2) == 0.44
Example #4
0
def test_abunance_draft():
    """Check abundance.draft on complete genomes plus one draft genome

    The numpy seed is set to 42, the complete genome ids are read from
    `data/genomes.fasta`, and `abundance.draft` is called with the
    lognormal distribution and the `data/test` output prefix. Expected and
    observed values are compared pairwise in dict iteration order.
    """
    expected = {
        'genome_A': 0.15511887441170918,
        'genome_T': 0.08220476760848751,
        'genome_GC': 0.18039811160555874,
        'genome_ATCG': 0.4329003045949206,
        'genome_TA': 0.07468835777633397,
        'contig_1': 0.02776920430880394,
        'contig_2': 0.011490705231229217,
        'contig_3': 0.03542967446295675
    }
    np.random.seed(42)
    with open('data/genomes.fasta', 'r') as handle:
        complete_genomes = util.count_records(handle)

    observed = abundance.draft(complete_genomes, ['data/draft.fasta'],
                               abundance.lognormal, 'data/test')

    # NOTE(review): round() without ndigits rounds to the nearest integer
    # and every expected value is below 0.5, so this only checks that each
    # observed value also rounds to 0. Consider comparing with a number of
    # digits (and by key rather than by position) once the fixture values
    # have been re-validated.
    for wanted, got in zip(expected.values(), observed.values()):
        assert round(wanted) == round(got)
Example #5
0
def _unique_temp_fastqs(temp_prefixes):
    """Build the R1/R2 temp fastq paths for a list of temp file prefixes

    Repeated prefixes are dropped (two records sharing a header have their
    reads appended to the same temp file) and the first-seen order is kept
    so that the concatenated output is ordered deterministically.

    Args:
        temp_prefixes (list): temp file prefixes, possibly with repeats

    Returns:
        tuple: the list of `_R1.fastq` paths and the list of `_R2.fastq`
            paths, one entry per distinct prefix
    """
    seen = set()
    unique_prefixes = []
    for prefix in temp_prefixes:
        if prefix not in seen:
            seen.add(prefix)
            unique_prefixes.append(prefix)
    temp_R1 = [prefix + '_R1.fastq' for prefix in unique_prefixes]
    temp_R2 = [prefix + '_R2.fastq' for prefix in unique_prefixes]
    return temp_R1, temp_R2


def generate_reads(args):
    """Main function for the `iss generate` submodule

    This submodule generates reads from an ErrorModel and write them to
        args.output + _R(1|2).fastq

    The function loads the error model selected by `args.mode`, gathers the
    input genomes (--genomes, --draft and/or --ncbi) into one temporary
    fasta file, obtains an abundance (or coverage) value per record, then
    generates reads for each record in parallel and concatenates the
    per-record temporary files into the final output.

    Any failure is logged and terminates the program with `sys.exit(1)`.

    Args:
        args (object): the command-line arguments from argparse
    """
    logger = logging.getLogger(__name__)
    logger.debug('iss version %s' % __version__)
    logger.debug('Using verbose logger')

    try:  # try to import and load the correct error model
        logger.info('Starting iss generate')
        logger.info('Using %s ErrorModel' % args.mode)
        # NOTE(review): a seed of 0 is falsy and therefore ignored here;
        # confirm the argparse default before switching to `is not None`.
        if args.seed:
            logger.info('Setting random seed to %i' % args.seed)
            random.seed(args.seed)
            np.random.seed(args.seed)
        if args.mode == 'kde':
            from iss.error_models import kde
            if args.model is None:
                logger.error('--model is required in --mode kde')
                sys.exit(1)
            builtin_profiles = {
                'hiseq': 'profiles/HiSeq',
                'novaseq': 'profiles/NovaSeq',
                'miseq': 'profiles/MiSeq'
            }
            profile = builtin_profiles.get(args.model.lower())
            if profile is not None:
                npz = os.path.join(os.path.dirname(__file__), profile)
            else:  # anything else is used as given
                npz = args.model
            err_mod = kde.KDErrorModel(npz)
        elif args.mode == 'basic':
            if args.model is not None:
                logger.warning('--model %s will be ignored in --mode %s' %
                               (args.model, args.mode))
            from iss.error_models import basic
            err_mod = basic.BasicErrorModel()
        elif args.mode == 'perfect':
            if args.model is not None:
                logger.warning('--model %s will be ignored in --mode %s' %
                               (args.model, args.mode))
            from iss.error_models import perfect
            err_mod = perfect.PerfectErrorModel()
        else:
            # without this branch err_mod would be unbound further down
            logger.error('Unknown --mode: %s' % args.mode)
            sys.exit(1)
    except ImportError as e:
        logger.error('Failed to import ErrorModel module: %s' % e)
        sys.exit(1)

    try:  # try to read genomes and concatenate --genomes and --ncbi genomes
        if not (args.genomes or args.draft or args.ncbi):
            logger.error("One of --genomes/-g, --draft, --ncbi/-k is required")
            sys.exit(1)

        genome_files = []
        if args.genomes:
            genome_files.extend(args.genomes)
        if args.draft:
            logger.warning('--draft is in early experimental stage.')
            logger.warning(
                'disabling --abundance_file, --coverage and --n_genomes')
            logger.warning('Defaulting to --abundance.')
            genome_files.extend(args.draft)
        if args.ncbi:
            if not args.n_genomes_ncbi:
                logger.error(
                    '--ncbi/-k requires --n_genomes_ncbi/-U. Aborting.')
                sys.exit(1)
            ncbi_fasta = args.output + '_ncbi_genomes.fasta'
            util.genome_file_exists(ncbi_fasta)
            # both options are lists of lists: flatten them first so that
            # the length check also works when a flag is given several times
            args.ncbi = [x for y in args.ncbi for x in y]
            args.n_genomes_ncbi = [
                x for y in args.n_genomes_ncbi for x in y
            ]
            if len(args.ncbi) != len(args.n_genomes_ncbi):
                logger.error(
                    '--ncbi and --n_genomes_ncbi of unequal lengths. '
                    'Aborting')
                sys.exit(1)
            genomes_ncbi = None
            for g, n in zip(args.ncbi, args.n_genomes_ncbi):
                genomes_ncbi = download.ncbi(g, n, ncbi_fasta)
            # every download targets the same file: register it only once
            if genomes_ncbi is not None:
                genome_files.append(genomes_ncbi)

        genome_file = args.output + '.iss.tmp.genomes.fasta'
        util.concatenate(genome_files, output=genome_file)

        # for n_genomes we use reservoir sampling to draw random genomes
        # from the concatenated genome file. We then override the file.
        if args.n_genomes and not args.draft and not args.ncbi:
            genome_count = util.count_records(genome_file)
            # close the handle before the same file gets overwritten
            with open(genome_file, 'r') as handle:
                sampled_genomes = list(
                    util.reservoir(SeqIO.parse(handle, 'fasta'),
                                   genome_count, args.n_genomes))
            SeqIO.write(sampled_genomes, genome_file, 'fasta')

        # explicit check: an assert would be stripped when running with -O
        if os.stat(genome_file).st_size == 0:
            logger.error('Genome(s) file seems empty: %s' % genome_file)
            sys.exit(1)
        with open(genome_file, 'r') as f:  # count the number of records
            genome_list = util.count_records(f)
    except IOError as e:
        logger.error('Failed to open genome(s) file:%s' % e)
        sys.exit(1)
    except AssertionError as e:
        logger.error('Genome(s) file seems empty: %s' % genome_file)
        sys.exit(1)
    except KeyboardInterrupt as e:
        logger.error('iss generate interrupted: %s' % e)
        sys.exit(1)

    # every handler above exits, so from here on the genome file is ready
    abundance_dispatch = {
        'uniform': abundance.uniform,
        'halfnormal': abundance.halfnormal,
        'exponential': abundance.exponential,
        'lognormal': abundance.lognormal,
        'zero_inflated_lognormal': abundance.zero_inflated_lognormal
    }
    # True only when abundance_dic was read from the --coverage file, i.e.
    # when its values are coverages rather than relative abundances
    use_coverage = False
    # read the abundance file
    if args.abundance_file and not args.draft:
        logger.info('Using abundance file:%s' % args.abundance_file)
        abundance_dic = abundance.parse_abundance_file(args.abundance_file)
    elif args.coverage and not args.draft:
        logger.warning('--coverage is an experimental feature')
        logger.info('Using coverage file:%s' % args.coverage)
        abundance_dic = abundance.parse_abundance_file(args.coverage)
        use_coverage = True
    elif args.abundance in abundance_dispatch:
        logger.info('Using %s abundance distribution' % args.abundance)
        if args.draft:
            abundance_dic = abundance.draft(
                genome_list, args.draft,
                abundance_dispatch[args.abundance], args.output)
        else:
            abundance_dic = abundance_dispatch[args.abundance](genome_list)
            abundance.to_file(abundance_dic, args.output)
    else:
        logger.error('Could not get abundance')
        sys.exit(1)

    cpus = args.cpus
    logger.info('Using %s cpus for read generation' % cpus)

    if not use_coverage:
        n_reads = util.convert_n_reads(args.n_reads)
        logger.info('Generating %s reads' % n_reads)

    memmap_path = "%s.memmap" % args.output
    temp_file_list = []  # list holding the prefix of all temp files
    try:
        with open(genome_file, 'r') as f:  # re-opens the file
            for record in SeqIO.parse(f, 'fasta'):
                # generate reads for records
                try:
                    species_abundance = abundance_dic[record.id]
                except KeyError as e:
                    logger.error(
                        'Fasta record not found in abundance file: %s' % e)
                    sys.exit(1)

                logger.info('Generating reads for record: %s' % record.id)
                genome_size = len(record.seq)

                if use_coverage:
                    coverage = species_abundance
                else:
                    coverage = abundance.to_coverage(
                        n_reads, species_abundance,
                        err_mod.read_length, genome_size)
                n_pairs = int(
                    round((coverage * genome_size) /
                          err_mod.read_length) / 2)
                # skip record if n_reads == 0
                if n_pairs == 0:
                    continue

                # exact n_reads for each cpus: the last cpu takes the
                # remainder of the integer division
                n_pairs_per_cpu = [n_pairs // cpus for _ in range(cpus)]
                n_pairs_per_cpu[-1] += n_pairs % cpus

                # due to a bug in multiprocessing
                # https://bugs.python.org/issue17560
                # we can't send records taking more than 2**31 bytes
                # through serialisation.
                # In those cases we use memmapping
                if sys.getsizeof(str(record.seq)) >= 2**31 - 1:
                    logger.warning("record %s unusually big." % record.id)
                    logger.warning("Using a memory map.")
                    mode = "memmap"

                    if os.path.exists(memmap_path):
                        os.unlink(memmap_path)
                    util.dump(record, memmap_path)
                    # hand the path over instead of the record and drop
                    # the in-memory copy before spawning the workers
                    record = memmap_path
                    gc.collect()
                else:
                    mode = "default"

                record_file_name_list = Parallel(n_jobs=cpus)(
                    delayed(generator.reads)(
                        record, err_mod, n_pairs_per_cpu[i], i,
                        args.output, args.seed, args.gc_bias, mode)
                    for i in range(cpus))
                temp_file_list.extend(record_file_name_list)
    except KeyboardInterrupt as e:
        logger.error('iss generate interrupted: %s' % e)
        temp_R1, temp_R2 = _unique_temp_fastqs(temp_file_list)
        full_tmp_list = temp_R1 + temp_R2
        full_tmp_list.append(genome_file)
        if os.path.exists(memmap_path):
            full_tmp_list.append(memmap_path)
        util.cleanup(full_tmp_list)
        sys.exit(1)
    else:
        # remove the duplicates in file list and cleanup
        # we remove the duplicates in case two records had the same header
        # and reads were appended to the same temp file.
        temp_R1, temp_R2 = _unique_temp_fastqs(temp_file_list)
        util.concatenate(temp_R1, args.output + '_R1.fastq')
        util.concatenate(temp_R2, args.output + '_R2.fastq')
        full_tmp_list = temp_R1 + temp_R2
        full_tmp_list.append(genome_file)
        if os.path.exists(memmap_path):
            full_tmp_list.append(memmap_path)
        util.cleanup(full_tmp_list)
        if args.compress:
            util.compress(args.output + '_R1.fastq')
            util.compress(args.output + '_R2.fastq')
        logger.info('Read generation complete')
Example #6
0
def test_count_records_empty():
    """Call util.count_records on the empty fixture file

    The path is passed directly (not an open handle) and the return value
    is not inspected.
    """
    empty_fasta_path = 'data/empty_file'
    util.count_records(empty_fasta_path)
Example #7
0
def generate_reads(args):
    """Main function for the `iss generate` submodule

    This submodule generates reads from an ErrorModel and write them to
        args.output + _R(1|2).fastq

    The function loads the error model selected by `args.mode`, takes the
    genomes either from `args.genomes` or from an NCBI download, obtains an
    abundance value per record, then generates reads for each record in
    parallel and concatenates the temporary files into the final output.

    Any failure is logged and terminates the program with `sys.exit(1)`.

    Args:
        args (object): the command-line arguments from argparse
    """
    logger = logging.getLogger(__name__)
    logger.debug('iss version %s' % __version__)
    logger.debug('Using verbose logger')

    try:  # try to import and load the correct error model
        logger.info('Starting iss generate')
        logger.info('Using %s ErrorModel' % args.mode)
        if args.mode == 'kde':
            from iss.error_models import kde
            # the None check must come first: calling .lower() on a missing
            # --model would raise AttributeError before the error is logged
            if args.model is None:
                logger.error('--model is required in --mode kde')
                sys.exit(1)
            elif args.model.lower() == 'hiseq':
                npz = os.path.join(os.path.dirname(__file__), 'profiles/HiSeq')
            elif args.model.lower() == 'novaseq':
                npz = os.path.join(os.path.dirname(__file__),
                                   'profiles/NovaSeq')
            elif args.model.lower() == 'miseq':
                npz = os.path.join(os.path.dirname(__file__), 'profiles/MiSeq')
            else:
                npz = args.model
            err_mod = kde.KDErrorModel(npz)
        elif args.mode == 'basic':
            if args.model is not None:
                logger.warning('--model %s will be ignored in --mode %s' %
                               (args.model, args.mode))
            from iss.error_models import basic
            err_mod = basic.BasicErrorModel()
        else:
            # without this branch err_mod would be unbound further down
            logger.error('Unknown --mode: %s' % args.mode)
            sys.exit(1)
    except ImportError as e:
        logger.error('Failed to import ErrorModel module: %s' % e)
        sys.exit(1)

    try:  # try to read genomes and generate reads
        if args.genomes:
            genome_file = args.genomes
        elif args.ncbi and args.n_genomes:
            util.genome_file_exists(args.output + '_genomes.fasta')
            # both options are lists of lists: flatten them first so that
            # the length check also works when a flag is given several times
            args.ncbi = [x for y in args.ncbi for x in y]
            args.n_genomes = [x for y in args.n_genomes for x in y]
            if len(args.ncbi) != len(args.n_genomes):
                logger.error(
                    '--ncbi and --n_genomes of unequal lengths. Aborting')
                sys.exit(1)
            total_genomes = []
            for g, n in zip(args.ncbi, args.n_genomes):
                genomes = download.ncbi(g, n)
                total_genomes.extend(genomes)
            genome_file = download.to_fasta(total_genomes, args.output)
        else:
            logger.error('Invalid input')  # TODO better error handling here
            sys.exit(1)

        # explicit check: an assert would be stripped when running with -O
        if os.stat(genome_file).st_size == 0:
            logger.error('Genome(s) file seems empty: %s' % genome_file)
            sys.exit(1)
        with open(genome_file, 'r') as f:  # count the number of records
            record_list = util.count_records(f)
    except IOError as e:
        logger.error('Failed to open genome(s) file:%s' % e)
        sys.exit(1)
    except AssertionError as e:
        logger.error('Genome(s) file seems empty: %s' % genome_file)
        sys.exit(1)
    except KeyboardInterrupt as e:
        logger.error('iss generate interrupted: %s' % e)
        sys.exit(1)

    # every handler above exits, so from here on the genome file is ready
    abundance_dispatch = {
        'uniform': abundance.uniform,
        'halfnormal': abundance.halfnormal,
        'exponential': abundance.exponential,
        'lognormal': abundance.lognormal,
        'zero_inflated_lognormal': abundance.zero_inflated_lognormal
    }
    # read the abundance file
    if args.abundance_file:
        logger.info('Using abundance file:%s' % args.abundance_file)
        abundance_dic = abundance.parse_abundance_file(args.abundance_file)
    elif args.abundance in abundance_dispatch:
        logger.info('Using %s abundance distribution' % args.abundance)
        abundance_dic = abundance_dispatch[args.abundance](record_list)
        abundance.to_file(abundance_dic, args.output)
    else:
        logger.error('Could not get abundance')
        sys.exit(1)

    cpus = args.cpus
    logger.info('Using %s cpus for read generation' % cpus)

    n_reads = util.convert_n_reads(args.n_reads)
    logger.info('Generating %s reads' % n_reads)

    temp_file_list = []  # list holding the prefix of all temp files
    try:
        with open(genome_file, 'r') as f:  # re-opens the file
            for record in SeqIO.parse(f, 'fasta'):
                # generate reads for each record
                try:
                    species_abundance = abundance_dic[record.id]
                except KeyError as e:
                    logger.error(
                        'Fasta record not found in abundance file: %s' % e)
                    sys.exit(1)

                logger.info('Generating reads for record: %s' % record.id)
                genome_size = len(record.seq)

                coverage = abundance.to_coverage(
                    n_reads, species_abundance, err_mod.read_length,
                    genome_size)
                n_pairs = int(
                    round((coverage * genome_size) /
                          err_mod.read_length) / 2)

                # exact split: every cpu gets the integer share and the
                # last one also takes the remainder, so that the per-cpu
                # counts always add up to n_pairs
                n_pairs_per_cpu = [n_pairs // cpus for _ in range(cpus)]
                n_pairs_per_cpu[-1] += n_pairs % cpus

                record_file_name_list = Parallel(n_jobs=cpus)(
                    delayed(generator.reads)
                    (record, err_mod, n_pairs_per_cpu[i], i, args.output,
                     args.gc_bias) for i in range(cpus))
                temp_file_list.extend(record_file_name_list)
    except KeyboardInterrupt as e:
        logger.error('iss generate interrupted: %s' % e)
        generator.cleanup(temp_file_list)
        sys.exit(1)
    else:
        # remove the duplicates in file list and cleanup
        # we remove the duplicates in case two records had the same header
        # and reads were appended to the same temp file.
        # The first-seen order is kept so that the concatenation order does
        # not depend on set iteration order.
        seen = set()
        temp_file_unique = []
        for temp_file in temp_file_list:
            if temp_file not in seen:
                seen.add(temp_file)
                temp_file_unique.append(temp_file)
        generator.concatenate(temp_file_unique, args.output)
        generator.cleanup(temp_file_unique)
        logger.info('Read generation complete')