Esempio n. 1
0
def generate_reads(chromosomes, dist: ReadLengthDistribution, num_reads: int,
                   min_length: int):
    """Yield randomly positioned reads sampled from `chromosomes`.

    Read lengths are obtained from
    ``dist.get_read_lengths(num_reads, min_length)`` and each one is capped at
    the length of the longest chromosome. For every read a chromosome is drawn
    with probability proportional to its length (redrawing until the
    chromosome can hold the read), a start position is chosen uniformly, and
    the corresponding slice of ``chromosome.sequence`` is taken. About half of
    the reads are passed through ``reverse_complement`` before being yielded.

    :param chromosomes: Indexable collection of objects exposing ``length``,
                        ``sequence`` and ``name`` attributes.
    :param dist: Object providing ``get_read_lengths``.
    :param num_reads: Forwarded to ``dist.get_read_lengths``.
    :param min_length: Forwarded to ``dist.get_read_lengths``.
    :return: Generator of ``(read, chromosome.name, start_pos)`` tuples.
    """
    genome_size = sum(c.length for c in chromosomes)
    longest = max(c.length for c in chromosomes)
    weights = [c.length / genome_size for c in chromosomes]

    unknown_bases = {b'n', b'N'}
    replacement_bases = [b'A', b'C', b'T', b'G']

    for requested_length in dist.get_read_lengths(num_reads, min_length):
        # A read can never be longer than the longest chromosome.
        read_length = int(min(longest, requested_length))

        # Length-weighted sampling gives equal expected coverage across the
        # whole genome; keep drawing until the chromosome fits the read.
        while True:
            picked = numpy.random.choice(len(chromosomes), p=weights)
            chromosome = chromosomes[picked]
            if chromosome and chromosome.length >= read_length:
                break

        # Uniform start position such that the read ends inside the
        # chromosome (randint includes both end points).
        last_start = chromosome.length - read_length
        start_pos = random.randint(0, last_start)
        read = chromosome.sequence[start_pos:start_pos + read_length]

        # Swap unknown bases for random ones.
        # NOTE(review): this assigns into `read`, so it presumably expects a
        # mutable sequence whose items compare equal to one-byte `bytes`
        # objects -- confirm against the actual type of `chromosome.sequence`.
        for i, base in enumerate(read):
            if base in unknown_bases:
                read[i] = random.choice(replacement_bases)

        # Coin flip: emit the reverse complement for half of the reads.
        flip = random.choice([0, 1]) == 1
        if flip:
            read = reverse_complement(read)

        yield read, chromosome.name, start_pos
Esempio n. 2
0
def make_bigwig(genomic_fasta, kmer_score_h5, motif_id, score_column, nlargest,
                output_file):
    """Write one tab-separated line per known k-mer found in a FASTA file.

    The k-mer table is built with
    ``get_kmer_dict(kmer_score_h5, motif_id, score_column, nlargest)``. Every
    k-mer produced by ``iter_kmers`` for a sequence is looked up in that table
    as-is first and, failing that, as its reverse complement; k-mers found
    under neither key are skipped. For each hit a line
    ``chromosome<TAB>start<TAB>end<TAB>name`` is written, where ``start`` is
    the enumeration index of the k-mer and ``end`` is
    ``start + K_MER_LENGTH``.

    Note that, despite the function name, the output is plain text written
    through a text-mode file handle.

    :param genomic_fasta: Passed to ``dinopy.FastaReader``.
    :param kmer_score_h5: Forwarded to ``get_kmer_dict``.
    :param motif_id: Forwarded to ``get_kmer_dict``.
    :param score_column: Forwarded to ``get_kmer_dict``.
    :param nlargest: Forwarded to ``get_kmer_dict``.
    :param output_file: Path of the file to (over)write.
    """
    kmer_dict = get_kmer_dict(kmer_score_h5, motif_id, score_column, nlargest)
    far = dinopy.FastaReader(genomic_fasta)

    def bed_lines():
        """Lazily produce the output lines for all FASTA entries."""
        for raw_sequence, raw_name, _length, _interval in far.entries():
            chromosome = raw_name.decode()
            sequence = raw_sequence.decode()

            kmers = iter_kmers(sequence, k=K_MER_LENGTH)
            for start, kmer in enumerate(kmers):
                try:
                    name = kmer_dict[kmer]
                except KeyError:
                    # Not present in forward orientation: fall back to the
                    # reverse complement before giving up on this k-mer.
                    try:
                        name = kmer_dict[dinopy.reverse_complement(kmer)]
                    except KeyError:
                        continue

                end = start + K_MER_LENGTH
                yield f'{chromosome}\t{start}\t{end}\t{name}\n'

    with open(output_file, 'w') as bed_file:
        for line in bed_lines():
            bed_file.write(line)
Esempio n. 3
0
def _split_suffix(sequence: bytes, length: int) -> Tuple[bytes, bytes]:
    """Split `sequence` into (everything before, the last `length` items)."""
    # ``sequence[:-0]`` is empty and ``sequence[-0:]`` is the whole sequence,
    # the exact opposite of "cut nothing off", so zero needs its own path.
    if length == 0:
        return sequence, sequence[:0]
    return sequence[:-length], sequence[-length:]


def simulate_translocate(chromosome1: bytes,
                         chromosome2: bytes,
                         length1: int,
                         length2: int,
                         mode: int = 0) -> Tuple[bytes, bytes]:
    """
    Simulate chromosomal translocation. This is done by cutting some prefix or
    suffix from one chromosome and adding this piece to another chromosome,
    and vice versa.

    .. seealso::
       https://en.wikipedia.org/wiki/Chromosomal_translocation

    :param chromosome1: DNA sequence of the first chromosome
    :param chromosome2: DNA sequence of the second chromosome
    :param length1: Number of bases to cut from the first chromosome and add
                    to the second chromosome. Expected to be non-negative;
                    0 means nothing is cut.
    :param length2: Number of bases to cut from the second chromosome and add
                    to the first chromosome. Expected to be non-negative;
                    0 means nothing is cut.
    :param mode: Mode=0 means exchanging the suffixes of the chromosomes,
                 mode=1 means exchanging the prefixes. Mode=2 builds the
                 first result from the reverse complement of the
                 `length1`-base suffix of chromosome 1 followed by
                 chromosome 2 without its `length2`-base prefix, and the
                 second result from the rest of chromosome 1 followed by the
                 reverse complement of that prefix of chromosome 2.
    :return: Tuple with the two new chromosome sequences.
    :raises ValueError: If `mode` is not 0, 1 or 2.

    """

    if mode == 0:
        head1, tail1 = _split_suffix(chromosome1, length1)
        head2, tail2 = _split_suffix(chromosome2, length2)
        new_chromosome1 = head1 + tail2
        new_chromosome2 = head2 + tail1
    elif mode == 1:
        new_chromosome1 = chromosome2[:length2] + chromosome1[length1:]
        new_chromosome2 = chromosome1[:length1] + chromosome2[length2:]
    elif mode == 2:
        head1, tail1 = _split_suffix(chromosome1, length1)

        # Get a suffix from chromosome 1, reverse complement it, and
        # concatenate it with a suffix of chromosome 2
        new_chromosome1 = reverse_complement(tail1) + chromosome2[length2:]

        # Get a prefix of chromosome 1, and concatenate it with a reverse
        # complemented prefix of chromosome 2
        new_chromosome2 = head1 + reverse_complement(chromosome2[:length2])
    else:
        raise ValueError("Invalid mode '{}' specified. Possible values are 0, "
                         "1 or 2".format(mode))

    return new_chromosome1, new_chromosome2
Esempio n. 4
0
    def _get_sequence(self, read: OrientedRead) -> bytes:
        """Return the upper-cased sequence of `read` in its own orientation.

        The entries stored under ``read.id`` are fetched from ``self.reader``
        and the first one is used. When ``read.orientation`` is ``"-"`` the
        sequence is passed through ``dinopy.reverse_complement``; otherwise it
        is returned as is.
        """
        entries = list(self.reader[read.id])
        forward = entries[0].sequence.upper()

        if read.orientation != "-":
            return forward

        return dinopy.reverse_complement(forward)
Esempio n. 5
0
def overlap(args):
    """Find exact pairwise overlaps between FASTA entries and emit them as GFA.

    A header is written to ``args.output`` first. Every entry of the FASTA
    file ``args.fasta_input`` then gets an ``S`` line and is registered with
    an ``ExactOverlapper`` twice: as ``<name>+`` with its sequence and as
    ``<name>-`` with the reverse complement. Finally each overlap reported by
    ``overlapper.overlaps(args.min_length)`` is written as an ``E`` line.

    :param args: Object exposing ``output`` (writable), ``fasta_input`` and
                 ``min_length`` attributes.
    """
    args.output.write(gfa.gfa_header())

    finder = ExactOverlapper()
    reader = dinopy.FastaReader(args.fasta_input)

    logger.info("Building suffix tree and searching for pairwise overlaps...")
    for record in reader.entries():
        read_name = record.name.decode('utf-8')
        bases = record.sequence.decode('utf-8')

        segment_line = gfa.gfa_line("S", read_name, record.length, "*")
        args.output.write(segment_line)

        # Register both strands so overlaps are found in either orientation.
        finder.add_sequence(read_name + "+", bases)
        finder.add_sequence(read_name + "-", dinopy.reverse_complement(bases))

    found = finder.overlaps(args.min_length)

    logger.info("Writing to GFA2...")

    for a_id, b_id, a_begin, a_end, b_begin, b_end in found:
        edge_line = gfa.gfa_line(
            "E", "*", a_id, b_id, a_begin, a_end, b_begin, b_end, "*")
        args.output.write(edge_line)

    logger.info("Done.")