def generate_reads(chromosomes, dist: ReadLengthDistribution, num_reads: int,
                   min_length: int):
    """Yield randomly sampled reads from a set of chromosomes.

    :param chromosomes: Indexable collection of objects exposing ``length``,
        ``sequence`` and ``name`` attributes.
    :param dist: Object whose ``get_read_lengths(num_reads, min_length)``
        supplies the requested length of every read.
    :param num_reads: Forwarded to ``dist.get_read_lengths``.
    :param min_length: Forwarded to ``dist.get_read_lengths``.
    :return: Generator of ``(read, chromosome_name, start_pos)`` tuples.
    """
    lengths = [c.length for c in chromosomes]
    total_length = sum(lengths)
    longest = max(lengths)
    # Selection probability proportional to chromosome length, so that the
    # expected coverage is the same everywhere in the genome.
    weights = [length / total_length for length in lengths]

    unknown_bases = {b'n', b'N'}
    replacement_bases = [b'A', b'C', b'T', b'G']

    for requested_length in dist.get_read_lengths(num_reads, min_length):
        # A read can never be longer than the longest chromosome.
        read_length = int(min(longest, requested_length))

        # Keep drawing until the drawn chromosome can hold a read of this
        # size. This always terminates because the longest chromosome
        # qualifies.
        while True:
            index = numpy.random.choice(len(chromosomes), p=weights)
            chromosome = chromosomes[index]
            if chromosome and chromosome.length >= read_length:
                break

        # Uniformly random start such that the read fits in the chromosome.
        start_pos = random.randint(0, chromosome.length - read_length)
        stop_pos = start_pos + read_length
        read = chromosome.sequence[start_pos:stop_pos]

        # Substitute a random base for every unknown ('N') position.
        # NOTE(review): the membership test compares each element with
        # one-byte ``bytes`` objects; iterating ``bytes``/``bytearray`` yields
        # ints, which never match. Confirm the element type of ``sequence``.
        # NOTE(review): the assignment writes into the slice in place; if
        # slicing ``sequence`` returns a view, this presumably also alters
        # the chromosome itself -- verify.
        for offset, base in enumerate(read):
            if base in unknown_bases:
                read[offset] = random.choice(replacement_bases)

        # Roughly half of the reads are emitted as reverse complement.
        if random.choice([0, 1]) == 1:
            read = reverse_complement(read)

        yield read, chromosome.name, start_pos
def make_bigwig(genomic_fasta, kmer_score_h5, motif_id, score_column,
                nlargest, output_file):
    """Write one tab-separated line per known k-mer found in a FASTA file.

    Every k-mer of every FASTA entry is looked up in the dictionary returned
    by ``get_kmer_dict``; when it is missing, its reverse complement is tried
    instead. For each hit a line ``chromosome, start, end, value`` is written
    to ``output_file``; k-mers without a hit are skipped.

    :param genomic_fasta: Passed to ``dinopy.FastaReader``.
    :param kmer_score_h5: Forwarded to ``get_kmer_dict``.
    :param motif_id: Forwarded to ``get_kmer_dict``.
    :param score_column: Forwarded to ``get_kmer_dict``.
    :param nlargest: Forwarded to ``get_kmer_dict``.
    :param output_file: Path of the text file to (over)write.
    """
    lookup = get_kmer_dict(kmer_score_h5, motif_id, score_column, nlargest)
    reader = dinopy.FastaReader(genomic_fasta)

    with open(output_file, 'w') as bed_file:
        for raw_sequence, raw_name, _length, _interval in reader.entries():
            chromosome = raw_name.decode()
            text = raw_sequence.decode()

            # ``start`` is the running index of the k-mer as produced by
            # ``iter_kmers`` (presumably its 0-based offset -- TODO confirm).
            for start, kmer in enumerate(iter_kmers(text, k=K_MER_LENGTH)):
                try:
                    name = lookup[kmer]
                except KeyError:
                    # Fall back to the opposite strand before giving up.
                    try:
                        name = lookup[dinopy.reverse_complement(kmer)]
                    except KeyError:
                        continue

                end = start + K_MER_LENGTH
                bed_file.write(f'{chromosome}\t{start}\t{end}\t{name}\n')
def _split_suffix(sequence: bytes, length: int) -> Tuple[bytes, bytes]:
    """Split ``sequence`` into (part before the suffix, last ``length`` items)."""
    if length == 0:
        # ``sequence[:-0]`` is empty and ``sequence[-0:]`` is the whole
        # sequence -- the exact opposite of cutting off zero bases.
        return sequence, sequence[:0]
    return sequence[:-length], sequence[-length:]


def simulate_translocate(chromosome1: bytes, chromosome2: bytes,
                         length1: int, length2: int,
                         mode: int = 0) -> Tuple[bytes, bytes]:
    """
    Simulate chromosomal translocation.

    This is done by cutting some prefix or suffix from one chromosome and
    adding this piece to another chromosome, and vice versa.

    .. seealso::
        https://en.wikipedia.org/wiki/Chromosomal_translocation

    :param chromosome1: DNA sequence of the first chromosome
    :param chromosome2: DNA sequence of the second chromosome
    :param length1: Number of bases to cut from the first chromosome and add
                    to the second chromosome. Zero cuts nothing.
    :param length2: Number of bases to cut from the second chromosome and add
                    to the first chromosome. Zero cuts nothing.
    :param mode: Mode=0 exchanges the suffixes of both chromosomes, mode=1
                 exchanges the prefixes. Mode=2 joins the reverse complement
                 of the suffix of chromosome 1 with chromosome 2 minus its
                 prefix, and the remainder of chromosome 1 with the reverse
                 complement of the prefix of chromosome 2.
    :return: Tuple ``(new_chromosome1, new_chromosome2)``.
    :raises ValueError: If ``mode`` is not 0, 1 or 2.
    """

    if mode == 0:
        head1, tail1 = _split_suffix(chromosome1, length1)
        head2, tail2 = _split_suffix(chromosome2, length2)
        new_chromosome1 = head1 + tail2
        new_chromosome2 = head2 + tail1
    elif mode == 1:
        new_chromosome1 = chromosome2[:length2] + chromosome1[length1:]
        new_chromosome2 = chromosome1[:length1] + chromosome2[length2:]
    elif mode == 2:
        head1, tail1 = _split_suffix(chromosome1, length1)

        # Reverse complement of the suffix of chromosome 1, followed by
        # chromosome 2 without its first `length2` bases
        new_chromosome1 = reverse_complement(tail1) + chromosome2[length2:]

        # Chromosome 1 without its suffix, followed by the reverse
        # complement of the first `length2` bases of chromosome 2
        new_chromosome2 = head1 + reverse_complement(chromosome2[:length2])
    else:
        raise ValueError("Invalid mode '{}' specified. Possible values are 0, "
                         "1 or 2".format(mode))

    return new_chromosome1, new_chromosome2
def _get_sequence(self, read: OrientedRead) -> bytes:
    """Look up the upper-cased sequence for ``read`` in ``self.reader``.

    The first item obtained from ``self.reader[read.id]`` is used. When the
    read's orientation is ``"-"`` the reverse complement is returned instead
    of the sequence itself.
    """
    entries = list(self.reader[read.id])
    sequence = entries[0].sequence.upper()

    return (dinopy.reverse_complement(sequence)
            if read.orientation == "-" else sequence)
def overlap(args):
    """Find pairwise overlaps between FASTA sequences and write them as GFA.

    Reads every entry from ``args.fasta_input``, writes a segment ("S") line
    for it to ``args.output`` and registers both the sequence and its reverse
    complement with an ``ExactOverlapper``. Afterwards one edge ("E") line is
    written for each overlap reported for ``args.min_length``.

    :param args: Object providing ``output`` (with a ``write`` method),
        ``fasta_input`` and ``min_length``.
    """
    out = args.output
    out.write(gfa.gfa_header())

    overlapper = ExactOverlapper()
    reader = dinopy.FastaReader(args.fasta_input)

    logger.info("Building suffix tree and searching for pairwise overlaps...")
    for record in reader.entries():
        read_name = record.name.decode('utf-8')
        forward = record.sequence.decode('utf-8')

        out.write(gfa.gfa_line("S", read_name, record.length, "*"))

        # Register both strands; the suffix marks the orientation.
        overlapper.add_sequence(read_name + "+", forward)
        overlapper.add_sequence(read_name + "-",
                                dinopy.reverse_complement(forward))

    found = overlapper.overlaps(args.min_length)

    logger.info("Writing to GFA2...")
    for a_id, b_id, a_begin, a_end, b_begin, b_end in found:
        edge = gfa.gfa_line("E", "*", a_id, b_id, a_begin, a_end,
                            b_begin, b_end, "*")
        out.write(edge)

    logger.info("Done.")