Python draft Examples

Programming Language: Python

Namespace/Package Name: iss.abundance

Method/Function: draft

Examples at hotexamples.com: 2

Python draft - 2 examples found. These are the top-rated real-world Python examples of iss.abundance.draft, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

def test_abunance_draft():
    """Compare ``abundance.draft`` output with stored reference abundances.

    The random generator is seeded, the record list of the complete genomes
    is read from ``data/genomes.fasta``, and ``abundance.draft`` is called
    with one draft genome file and the lognormal distribution.
    """
    expected = {
        'genome_A': 0.15511887441170918,
        'genome_T': 0.08220476760848751,
        'genome_GC': 0.18039811160555874,
        'genome_ATCG': 0.4329003045949206,
        'genome_TA': 0.07468835777633397,
        'contig_1': 0.02776920430880394,
        'contig_2': 0.011490705231229217,
        'contig_3': 0.03542967446295675
    }
    np.random.seed(42)
    with open('data/genomes.fasta', 'r') as fasta_handle:
        # count the number of records
        complete_genomes = util.count_records(fasta_handle)
    observed = abundance.draft(
        complete_genomes,
        ['data/draft.fasta'],
        abundance.lognormal,
        'data/test')
    # NOTE(review): values are paired by position, not by key, and round()
    # without ndigits rounds to the nearest integer, so this only checks
    # that paired values round to the same integer - confirm whether a
    # tighter, key-based comparison is reproducible before changing it.
    value_pairs = zip(expected.values(), observed.values())
    for reference, value in value_pairs:
        assert round(reference) == round(value)

Example #2

Show file

def _unique_in_order(items):
    """Return `items` without duplicates, keeping first-seen order."""
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique


def _temp_fastq_files(temp_file_list):
    """Return the (R1, R2) fastq path lists for each distinct temp prefix."""
    prefixes = _unique_in_order(temp_file_list)
    temp_R1 = [prefix + '_R1.fastq' for prefix in prefixes]
    temp_R2 = [prefix + '_R2.fastq' for prefix in prefixes]
    return temp_R1, temp_R2


def _remove_temp_files(temp_R1, temp_R2, genome_file, output):
    """Hand every temporary file of a run to ``util.cleanup``."""
    full_tmp_list = temp_R1 + temp_R2
    full_tmp_list.append(genome_file)
    record_mmap = "%s.memmap" % output
    if os.path.exists(record_mmap):
        full_tmp_list.append(record_mmap)
    util.cleanup(full_tmp_list)


def _load_error_model(args, logger):
    """Instantiate the error model selected by ``args.mode``.

    Exits with status 1 when ``--mode kde`` is used without ``--model`` or
    when the mode is not one of 'kde', 'basic' or 'perfect'.
    """
    if args.mode == 'kde':
        from iss.error_models import kde
        if args.model is None:
            logger.error('--model is required in --mode kde')
            sys.exit(1)
        # names of the bundled profiles; anything else is used as given
        bundled_profiles = {
            'hiseq': 'profiles/HiSeq',
            'novaseq': 'profiles/NovaSeq',
            'miseq': 'profiles/MiSeq',
        }
        profile = bundled_profiles.get(args.model.lower())
        if profile is None:
            npz = args.model
        else:
            npz = os.path.join(os.path.dirname(__file__), profile)
        return kde.KDErrorModel(npz)

    if args.mode in ('basic', 'perfect'):
        if args.model is not None:
            logger.warning('--model %s will be ignored in --mode %s',
                           args.model, args.mode)
        if args.mode == 'basic':
            from iss.error_models import basic
            return basic.BasicErrorModel()
        from iss.error_models import perfect
        return perfect.PerfectErrorModel()

    # without this guard an unknown mode only surfaced later as a NameError
    logger.error('Unknown --mode %s', args.mode)
    sys.exit(1)


def generate_reads(args):
    """Main function for the `iss generate` submodule

    This submodule generates reads from an ErrorModel and write them to
        args.output + _R(1|2).fastq

    The function loads the error model, gathers the input genomes
    (--genomes, --draft and/or --ncbi) into one temporary fasta file,
    obtains an abundance (or coverage) value for each record, generates the
    reads record by record with `args.cpus` parallel jobs, concatenates the
    per-job temporary fastq files, removes the temporary files and
    optionally compresses the two output files.

    Args:
        args (object): the command-line arguments from argparse

    Raises:
        SystemExit: with status 1 on invalid arguments, unreadable or empty
            genome files, records missing from the abundance data, or a
            keyboard interrupt.
    """
    logger = logging.getLogger(__name__)
    logger.debug('iss version %s', __version__)
    logger.debug('Using verbose logger')

    try:  # try to import and load the correct error model
        logger.info('Starting iss generate')
        logger.info('Using %s ErrorModel', args.mode)
        # compare against None so that an explicit seed of 0 is honoured
        if args.seed is not None:
            logger.info('Setting random seed to %i', args.seed)
            random.seed(args.seed)
            np.random.seed(args.seed)
        err_mod = _load_error_model(args, logger)
    except ImportError as e:
        logger.error('Failed to import ErrorModel module: %s', e)
        sys.exit(1)

    # bound before the try block so every handler below can refer to it
    genome_file = args.output + '.iss.tmp.genomes.fasta'

    try:  # try to read genomes and concatenate --genomes and --ncbi genomes
        if not (args.genomes or args.draft or args.ncbi):
            logger.error("One of --genomes/-g, --draft, --ncbi/-k is required")
            sys.exit(1)

        genome_files = []
        if args.genomes:
            genome_files.extend(args.genomes)
        if args.draft:
            logger.warning('--draft is in early experimental stage.')
            logger.warning(
                'disabling --abundance_file, --coverage and --n_genomes')
            logger.warning('Defaulting to --abundance.')
            genome_files.extend(args.draft)
        if args.ncbi and not args.n_genomes_ncbi:
            logger.error(
                '--ncbi/-k requires --n_genomes_ncbi/-U. Aborting.')
            sys.exit(1)
        if args.ncbi and args.n_genomes_ncbi:
            ncbi_output = args.output + '_ncbi_genomes.fasta'
            util.genome_file_exists(ncbi_output)
            # Both options arrive as lists of lists: flatten them first so
            # the length check also works when an option is repeated.
            # py2 compatibilty workaround
            # TODO switch to zip(*args.ncbi, ...) when we drop python2
            args.ncbi = [x for y in args.ncbi for x in y]
            args.n_genomes_ncbi = [
                x for y in args.n_genomes_ncbi for x in y
            ]
            if len(args.ncbi) != len(args.n_genomes_ncbi):
                logger.error(
                    '--ncbi and --n_genomes_ncbi of unequal lengths. '
                    'Aborting')
                sys.exit(1)
            genomes_ncbi = None
            for g, n in zip(args.ncbi, args.n_genomes_ncbi):
                genomes_ncbi = download.ncbi(g, n, ncbi_output)
            # every download targets the same output file, so the returned
            # value is appended once; skipped when nothing was requested
            if genomes_ncbi is not None:
                genome_files.append(genomes_ncbi)

        util.concatenate(genome_files, output=genome_file)

        # for n_genomes we use reservoir sampling to draw random genomes
        # from the concatenated genome file. We then override the file.
        if args.n_genomes and not args.draft and not args.ncbi:
            genome_count = util.count_records(genome_file)
            sampled_records = list(
                util.reservoir(SeqIO.parse(genome_file, 'fasta'),
                               genome_count, args.n_genomes))
            SeqIO.write(sampled_records, genome_file, 'fasta')

        # explicit check instead of `assert`, which is removed under -O
        if os.stat(genome_file).st_size == 0:
            logger.error('Genome(s) file seems empty: %s', genome_file)
            sys.exit(1)
        with open(genome_file, 'r') as f:  # count the number of records
            genome_list = util.count_records(f)
    except IOError as e:
        logger.error('Failed to open genome(s) file:%s', e)
        sys.exit(1)
    except AssertionError as e:
        # kept for assertions raised by the helpers called above
        logger.error('Genome(s) file seems empty: %s', genome_file)
        sys.exit(1)
    except KeyboardInterrupt as e:
        logger.error('iss generate interrupted: %s', e)
        sys.exit(1)

    abundance_dispatch = {
        'uniform': abundance.uniform,
        'halfnormal': abundance.halfnormal,
        'exponential': abundance.exponential,
        'lognormal': abundance.lognormal,
        'zero_inflated_lognormal': abundance.zero_inflated_lognormal
    }
    # read the abundance file
    if args.abundance_file and not args.draft:
        logger.info('Using abundance file:%s', args.abundance_file)
        abundance_dic = abundance.parse_abundance_file(args.abundance_file)
    elif args.coverage and not args.draft:
        logger.warning('--coverage is an experimental feature')
        logger.info('Using coverage file:%s', args.coverage)
        abundance_dic = abundance.parse_abundance_file(args.coverage)
    elif args.abundance in abundance_dispatch:
        logger.info('Using %s abundance distribution', args.abundance)
        if args.draft:
            abundance_dic = abundance.draft(
                genome_list, args.draft,
                abundance_dispatch[args.abundance], args.output)
        else:
            abundance_dic = abundance_dispatch[args.abundance](genome_list)
            abundance.to_file(abundance_dic, args.output)
    else:
        logger.error('Could not get abundance')
        sys.exit(1)

    cpus = args.cpus
    logger.info('Using %s cpus for read generation', cpus)

    # NOTE(review): with --draft and --coverage both set, the abundance
    # values above come from the distribution but are still used as
    # coverage below - confirm whether that combination is intended.
    if not args.coverage:
        n_reads = util.convert_n_reads(args.n_reads)
        logger.info('Generating %s reads', n_reads)

    temp_file_list = []  # list holding the prefix of all temp files
    try:
        with open(genome_file, 'r') as f:  # re-opens the file
            for record in SeqIO.parse(f, 'fasta'):
                # generate reads for records
                try:
                    species_abundance = abundance_dic[record.id]
                except KeyError as e:
                    logger.error(
                        'Fasta record not found in abundance file: %s', e)
                    sys.exit(1)

                logger.info('Generating reads for record: %s', record.id)
                genome_size = len(record.seq)

                if args.coverage:
                    coverage = species_abundance
                else:
                    coverage = abundance.to_coverage(
                        n_reads, species_abundance,
                        err_mod.read_length, genome_size)
                n_pairs = int(
                    round((coverage * genome_size) /
                          err_mod.read_length) / 2)
                # skip record if n_reads == 0
                if n_pairs == 0:
                    continue

                # exact n_reads for each cpus: equal shares, with the
                # remainder (possibly 0) given to the last job
                n_pairs_per_cpu = [n_pairs // cpus] * cpus
                n_pairs_per_cpu[-1] += n_pairs % cpus

                # due to a bug in multiprocessing
                # https://bugs.python.org/issue17560
                # we can't send records taking more than 2**31 bytes
                # through serialisation.
                # In those cases we use memmapping
                if sys.getsizeof(str(record.seq)) >= 2**31 - 1:
                    logger.warning("record %s unusually big.", record.id)
                    logger.warning("Using a memory map.")
                    mode = "memmap"

                    record_mmap = "%s.memmap" % args.output
                    if os.path.exists(record_mmap):
                        os.unlink(record_mmap)
                    util.dump(record, record_mmap)
                    # drop the in-memory record; jobs get the path instead
                    del record
                    record = record_mmap
                    gc.collect()
                else:
                    mode = "default"

                record_file_name_list = Parallel(n_jobs=cpus)(
                    delayed(generator.reads)(
                        record, err_mod, n_pairs_per_cpu[i], i,
                        args.output, args.seed, args.gc_bias, mode)
                    for i in range(cpus))
                temp_file_list.extend(record_file_name_list)
    except KeyboardInterrupt as e:
        logger.error('iss generate interrupted: %s', e)
        temp_R1, temp_R2 = _temp_fastq_files(temp_file_list)
        _remove_temp_files(temp_R1, temp_R2, genome_file, args.output)
        sys.exit(1)
    else:
        # remove the duplicates in file list and cleanup
        # we remove the duplicates in case two records had the same header
        # and reads were appended to the same temp file.
        temp_R1, temp_R2 = _temp_fastq_files(temp_file_list)
        util.concatenate(temp_R1, args.output + '_R1.fastq')
        util.concatenate(temp_R2, args.output + '_R2.fastq')
        _remove_temp_files(temp_R1, temp_R2, genome_file, args.output)
        if args.compress:
            util.compress(args.output + '_R1.fastq')
            util.compress(args.output + '_R2.fastq')
        logger.info('Read generation complete')