def get_sample_kmers(self, sample):
    """Get all kmers from a sample: either from the reference sequence,
    all possible kmers from an alphabet, or all kmers that cover a
    modified nucleotide.

    :param sample: AbstractSamples object
    :return: set of desired kmers
    """
    kmers = set()
    # if motifs are present, add all kmers covering each modified base
    if sample.motifs:
        for motif in sample.motifs:
            kmers |= get_motif_kmers(motif, self.k, alphabet=self.canonical)
    # optionally limit kmers to those seen in the reference sequence
    if sample.kmers_from_reference:
        for _, _, sequence in read_fasta(sample.bwa_reference):
            kmers |= get_sequence_kmers(sequence, k=self.k, rev_comp=True)
    else:
        kmers |= set(all_string_permutations(self.canonical, length=self.k))
    return kmers
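# get_sequence_kmers is called above but its body is not shown here. A minimal
# sketch of its likely behavior, assuming it collects every k-length window of
# the sequence and, when rev_comp=True, of its reverse complement as well; the
# "_sketch" name and the simple complement table are assumptions, not the
# source implementation:
def get_sequence_kmers_sketch(sequence, k=5, rev_comp=False):
    complement = str.maketrans("ACGT", "TGCA")
    kmers = {sequence[i:i + k] for i in range(len(sequence) - k + 1)}
    if rev_comp:
        rc_seq = sequence.translate(complement)[::-1]
        kmers |= {rc_seq[i:i + k] for i in range(len(rc_seq) - k + 1)}
    return kmers

# get_sequence_kmers_sketch("ATGCA", k=2, rev_comp=True) == {'AT', 'TG', 'GC', 'CA'}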
def __init__(self, alphabet="ACGT", length=6):
    self.alphabet = "".join(sorted(alphabet))
    self.length = length
    self.kmers = list(all_string_permutations(self.alphabet, self.length))
    self.reads = []
    self.kmer_classes = {x: Kmer(x) for x in self.kmers}
    self.kmer_counts = {x: 0 for x in self.kmers}
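# all_string_permutations is the workhorse for every snippet in this file but
# its body is not shown. A minimal sketch consistent with how it is called and
# tested below (lexicographic order over the unique characters,
# len(set(alphabet)) ** length kmers, AssertionError raised at call time for an
# empty alphabet or a non-positive length); the "_sketch" suffix marks it as a
# reconstruction, not the source implementation:
from itertools import product

def all_string_permutations_sketch(alphabet, length=1):
    assert len(alphabet) > 0, "alphabet cannot be empty"
    assert length > 0, "length must be at least 1"
    chars = sorted(set(alphabet))
    # return a generator expression so the asserts fire on call, not on iteration
    return ("".join(x) for x in product(chars, repeat=length))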
def get_motif_kmers(motif_pair, k, alphabet="ATGC"):
    """Given a (canonical, modified) motif pair, create the set of all kmers
    which contain the modified base."""
    assert len(motif_pair) == 2, \
        "Motif pair must be a list of length 2. len(motif_pair) = {}".format(len(motif_pair))
    canonical = motif_pair[0]
    modified = motif_pair[1]
    motif_len = len(canonical)
    # get index and characters of the modified position
    mod_index, old_char, new_char = find_modification_index_and_character(
        canonical, modified)
    bases_after = motif_len - mod_index - 1
    # get overlaps for front and back of kmer
    front_overlap, back_overlap = get_front_back_kmer_overlap(
        k, motif_len, mod_index)
    # pre-compute flanking kmers of every needed length
    kmer_set_dict = {0: ['']}
    for i in range(1, max(front_overlap, back_overlap) + 1):
        kmer_set_dict[i] = list(all_string_permutations(alphabet, i))

    motif_kmers = []
    for i in range(k):
        # get prepend kmers and index for front of motif
        if i >= front_overlap:
            front_index = i - front_overlap
            prepend_kmers = ['']
        else:
            prepend_kmers = kmer_set_dict[front_overlap - i]
            front_index = 0
        # get append kmers and index for back of motif
        if i > bases_after:
            append_kmers = kmer_set_dict[i - bases_after]
            back_index = motif_len
        else:
            back_index = mod_index + i + 1
            append_kmers = ['']
        kmer = modified[front_index:back_index]
        motif_kmers.extend([front + kmer + back
                            for front in prepend_kmers
                            for back in append_kmers
                            if front + kmer + back != ''])
    return set(motif_kmers)
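# A brute-force cross-check for get_motif_kmers, assuming its intended
# contract: every k-mer window that covers the modified base when the modified
# motif is embedded in arbitrary flanking sequence. This is exponential in k,
# so it is only useful for validating small cases; the function name is a
# hypothetical, not part of the source:
from itertools import product

def naive_motif_kmers(motif_pair, k, alphabet="ATGC"):
    canonical, modified = motif_pair
    # locate the single position where the two motifs differ
    mod_index = next(i for i, (a, b) in enumerate(zip(canonical, modified)) if a != b)
    kmers = set()
    # embed the modified motif in every possible flank of length k - 1
    for flank in product(alphabet, repeat=2 * (k - 1)):
        left, right = "".join(flank[:k - 1]), "".join(flank[k - 1:])
        seq = left + modified + right
        mod_pos = len(left) + mod_index
        # keep every k-length window that includes the modified base
        for start in range(mod_pos - k + 1, mod_pos + 1):
            kmers.add(seq[start:start + k])
    return kmers

# e.g. naive_motif_kmers(["CG", "MG"], 2) == {"AM", "CM", "GM", "TM", "MG"}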
def test_all_string_permutations(self):
    """Test all_string_permutations"""
    with captured_output() as (_, _):
        for x in range(1, 10):
            test_string = ''.join(
                random.choice(string.ascii_uppercase) for _ in range(5))
            generator = all_string_permutations(test_string, length=x)
            all_kmers = list(generator)
            # output must be in lexicographic order
            self.assertEqual(all_kmers, sorted(all_kmers))
            # one kmer per combination of unique characters
            num_chars = len(set(test_string))
            self.assertEqual(num_chars ** x, len(all_kmers))
        self.assertRaises(AssertionError, all_string_permutations, "")
        self.assertRaises(AssertionError, all_string_permutations, "AT", 0)
def convert_csv_to_sa_model(csv_file, output_dir, transition_probs,
                            state_number=3, rna=False):
    output_path = os.path.join(
        output_dir, os.path.splitext(os.path.basename(csv_file))[0] + ".model")
    data = pd.read_csv(csv_file, names=["kmer", "mean", "sd"])
    alphabet = "".join(sorted(set("".join(data["kmer"]))))
    kmer_length = len(data["kmer"].iloc[0])
    alphabet_size = len(alphabet)
    new_kmers = all_string_permutations(alphabet, length=kmer_length)
    with open(output_path, 'w') as f:
        # line 0: header
        f.write("{stateNumber}\t{alphabetSize}\t{alphabet}\t{kmerLength}\n"
                "".format(stateNumber=state_number,
                          alphabetSize=alphabet_size,
                          alphabet=alphabet,
                          kmerLength=kmer_length))
        # line 1: transitions followed by the likelihood
        for i in range(state_number * state_number):
            f.write("{transition}\t".format(transition=str(transition_probs[i])))
        f.write("{}\n".format(str(0)))
        # line 2: event model
        for kmer in new_kmers:
            # k_index = HmmModel._get_kmer_index(kmer, alphabet, kmer_length, alphabet_size)
            if rna:
                kmer = kmer[::-1]
            kmer_data = data[data["kmer"] == kmer]
            assert kmer == kmer_data["kmer"].iloc[0], \
                "The input csv model is not sorted or something else is very wrong. Check inputs"
            f.write("{level_mean}\t{level_sd}\t{noise_mean}\t{noise_sd}\t{noise_lambda}\t"
                    "".format(level_mean=kmer_data["mean"].iloc[0],
                              level_sd=kmer_data["sd"].iloc[0],
                              noise_mean=0,
                              noise_sd=0,
                              noise_lambda=0))
        f.write("\n")
    return output_path
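# Minimal usage sketch for convert_csv_to_sa_model. The kmer,mean,sd CSV
# layout with rows sorted lexicographically and the flat
# state_number * state_number transition vector are inferred from the code
# above; all paths and values here are toy examples:
import os
import tempfile
from itertools import product

def write_toy_model():
    with tempfile.TemporaryDirectory() as out_dir:
        csv_path = os.path.join(out_dir, "toy_model.csv")
        with open(csv_path, "w") as fh:
            # one sorted row per 2-mer over ACGT, with dummy mean and sd
            for p in product("ACGT", repeat=2):
                fh.write("{},80.0,2.0\n".format("".join(p)))
        uniform = [1.0 / 3] * 9  # 3 states -> 9 transition probabilities
        return convert_csv_to_sa_model(csv_path, out_dir, uniform)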
def make_kmer_directories(dir_path, alphabet, kmer_length, complement=False):
    """Make the kmer directories where all the kmers will be written

    :param dir_path: path to directory
    :param alphabet: kmer alphabet
    :param kmer_length: length of kmer
    :param complement: boolean option to create complement kmer directories
    """
    assert os.path.isdir(dir_path), "dir_path is not a directory. {}".format(dir_path)
    template_dirs = []
    complement_dirs = []
    for kmer in all_string_permutations(alphabet, length=kmer_length):
        template_path = os.path.join(dir_path, kmer)
        os.mkdir(template_path)
        template_dirs.append(template_path)
        if complement:
            complement_path = os.path.join(dir_path, kmer + "_c")
            os.mkdir(complement_path)
            complement_dirs.append(complement_path)
    template_dirs.extend(complement_dirs)
    return template_dirs
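# Usage sketch for make_kmer_directories: build per-kmer directories under a
# temporary root and check that complement directories carry the "_c" suffix
# (the tiny "AT" alphabet keeps the example to eight directories):
import tempfile

with tempfile.TemporaryDirectory() as tmp_dir:
    dirs = make_kmer_directories(tmp_dir, "AT", 2, complement=True)
    # four template dirs (AA, AT, TA, TT) followed by their "_c" complements
    assert len(dirs) == 8 and dirs[-1].endswith("_c")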
def test_get_kmer_index(self):
    all_kmers = list(all_string_permutations("ATGC", 5))
    for _ in range(10):
        kmer = get_random_string(5, chars="ATGC")
        self.assertEqual(all_kmers.index(kmer), self.model.get_kmer_index(kmer))
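# The lexicographic index checked above is just the kmer's value in base
# len(alphabet). A hedged sketch of what get_kmer_index likely computes; the
# function body is not shown here, so this is an inference from the test:
def kmer_index_sketch(kmer, alphabet="ATGC"):
    chars = sorted(set(alphabet))  # "ATGC" sorts to A, C, G, T
    index = 0
    for base in kmer:
        index = index * len(chars) + chars.index(base)
    return index

# kmer_index_sketch("AAAAC") == 1 and kmer_index_sketch("TTTTT") == 4 ** 5 - 1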
def main():
    OUTPUT_DIR = "/home/ubuntu/ecoli_methylation_analysis/kmer_analysis"
    positions_data = False
    keys = ["contig", "reference_index", "strand"]

    # RNA canonical
    # REFERENCE = "/home/ubuntu/mount/download/RNA_rel2/reference/gencode.v27.transcripts.fa"
    # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/OICR_runs/all_runs/", "/home/ubuntu/mount/UBC_runs/all_runs/"]
    # # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/OICR_runs/test/", "/home/ubuntu/mount/OICR_runs/test/"]
    # VARIANT_NAMES = ["/variant_calls/na12878_OICR_RNA_canonical.csv", "/variant_calls/na12878_UBC_RNA_canonical.csv"]
    # ALPHABET = "ATGC"
    # KMER_LENGTH = 5
    # NAMES = ["OICR", "UBC"]

    # DNA canonical
    # REFERENCE = "/home/ubuntu/bisulfite_methylation_analysis/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"
    # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/FAB39088_runs/canonical_calling/all_runs/",
    #                      "/home/ubuntu/mount/FAF01169_runs/canonical_calling/all_runs/"]
    # VARIANT_NAMES = ["/variant_calls/variant_calls.csv", "/variant_calls/variant_calls.csv"]
    # ALPHABET = "ATGC"
    # KMER_LENGTH = 6
    # NAMES = ["FAB39088_canonical", "FAF01169_canonical"]

    # DNA mod
    # REFERENCE = "/home/ubuntu/bisulfite_methylation_analysis/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"
    # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/FAB39088_runs/cpg_calling/all_runs/",
    #                      "/home/ubuntu/mount/FAF01169_runs/cpg_calling/all_runs/"]
    # NAMES = ["FAB39088_methyl", "FAF01169_methyl"]
    # VARIANT_NAMES = ["/variant_calls/variant_calls.csv", "/variant_calls/variant_calls.csv"]
    # ALPHABET = "ATGCM"
    # KMER_LENGTH = 6
    # POSITIONS_FILE = "/home/ubuntu/bisulfite_methylation_analysis/positions/all_mC.positions"
    # positions_data = pd.read_csv(POSITIONS_FILE, names=["contig", "reference_index", "strand", "find", "replace"],
    #                              sep="\t")

    # ECOLI MOD
    REFERENCE = "/home/ubuntu/ecoli_methylation_analysis/reference/ecoli.fa"
    VARIANT_HOME_DIRS = ["/home/ubuntu/ecoli_methylation_analysis/signalalign_output/"]
    NAMES = ["variant_calls"]
    VARIANT_NAMES = ["ecoli_dna_baseline_ATCGMQ_sa.model.csv"]
    ALPHABET = "ATGCM"
    KMER_LENGTH = 6
    # POSITIONS_FILE = "/home/ubuntu/ecoli_methylation_analysis/kmer_analysis/all.positions"
    # positions_data = pd.read_csv(POSITIONS_FILE, names=["contig", "reference_index", "strand", "find", "replace"],
    #                              sep="\t")

    if positions_data is not False:
        i2 = positions_data.set_index(keys).index
    assert os.path.exists(REFERENCE), "{} does not exist".format(REFERENCE)
    assert os.path.isdir(OUTPUT_DIR), "{} is not a directory".format(OUTPUT_DIR)

    rh = ReferenceHandler(REFERENCE)
    rc = ReverseComplement()
    kmers = {k: 0 for k in all_string_permutations(ALPHABET, KMER_LENGTH)}
    paths = []
    for home_dir, variant_name in zip(VARIANT_HOME_DIRS, VARIANT_NAMES):
        assert os.path.isdir(home_dir), "{} is not a directory".format(home_dir)
        home_dir_paths = os.listdir(home_dir)
        tmp_paths = [os.path.join(home_dir, x, variant_name)
                     for x in home_dir_paths
                     if os.path.exists(os.path.join(home_dir, x, variant_name))]
        assert len(tmp_paths) > 0, \
            "Check inputs, there are no paths which exist: {}".format(home_dir)
        paths.append(tmp_paths)

    def get_kmer(chromosome, pos, strand):
        seq = None
        try:
            # pull the sequence window covering pos (pos sits at index KMER_LENGTH - 1)
            seq = rh.get_sequence(chromosome, (pos - KMER_LENGTH) + 1,
                                  pos + KMER_LENGTH)
            if strand == "-":
                seq = rc.reverse_complement(seq)
            if positions_data is not False:
                replace = read_pos_data[
                    (read_pos_data["contig"] == chromosome) &
                    (read_pos_data["reference_index"] == pos) &
                    (read_pos_data["strand"] == strand)]
                if not replace.empty:
                    seq = (seq[:KMER_LENGTH - 1] +
                           replace.iloc[0]["replace"] + seq[KMER_LENGTH:])
        except Exception as e:
            print(e, chromosome, pos, strand)
        return seq

    void = '-'
    fill = '#'
    n_spaces = 100
    n_files = 0
    for variant_set, name in zip(paths, NAMES):
        n_paths = len(variant_set)
        count = n_spaces / n_paths
        increaseCount = 0
        print("Starting on {}".format(name))
        local_kmers = {k: 0 for k in all_string_permutations(ALPHABET, KMER_LENGTH)}
        for variant_path in variant_set:
            # simple text progress bar
            print('[' + (fill * int(increaseCount)) +
                  (void * int(n_spaces - increaseCount)) + '] ' +
                  str(int(increaseCount)) + '%', end='\r')
            increaseCount += count
            variant_data = pd.read_csv(variant_path)
            if positions_data is not False:
                i1 = variant_data.set_index(keys).index
                read_pos_data = positions_data[i2.isin(i1)]
            # expected variant call columns, e.g.:
            # read_id          028a34d4-2a7a-44e7-ab23-305915996ec8
            # contig           RDN18-1
            # reference_index  973
            # strand           +
            # variants         Aa
            # prob1            0.986967
            # prob2            0.013033
            # prob3            NaN
            variant_data['next_base'] = np.vectorize(get_kmer)(
                variant_data['contig'],
                variant_data['reference_index'],
                variant_data['strand'])
            large_kmers = set(variant_data['next_base'])
            for l_kmer in large_kmers:
                for i in range(KMER_LENGTH):
                    k = l_kmer[i:KMER_LENGTH + i]
                    if len(k) == KMER_LENGTH:
                        kmers[k] += 1
                        local_kmers[k] += 1
        print('[' + (fill * int(increaseCount)) +
              (void * int(n_spaces - increaseCount)) + '] ' +
              str(int(increaseCount)) + '%', end='\n')
        total_zeros = 0
        for x, y in local_kmers.items():
            if y == 0:
                total_zeros += 1
        n_files += n_paths
        print("{} Kmers Covered: {}/{}".format(
            name, len(local_kmers) - total_zeros, len(local_kmers)))
        print("{} Average coverage: {:.4}".format(
            name, np.sum(list(local_kmers.values())) / (len(local_kmers) - total_zeros)))
        with open(os.path.join(OUTPUT_DIR, name + ".tsv"), 'w') as fh:
            print("\n".join(["\t".join([x, str(y / n_paths)])
                             for x, y in local_kmers.items()]), file=fh)

    total_zeros = 0
    for x, y in kmers.items():
        if y == 0:
            total_zeros += 1
    print("TOTAL Kmers Covered: {}/{}".format(len(kmers) - total_zeros, len(kmers)))
    print("TOTAL Average coverage: {}".format(
        np.average(list(kmers.values())) / (n_files / 2)))
    with open(os.path.join(OUTPUT_DIR, "total_" + "_".join(NAMES) + ".tsv"), 'w') as fh:
        print("\n".join(["\t".join([x, str(y / n_files)])
                         for x, y in kmers.items()]), file=fh)
def main():
    args = parse_args()
    assert os.path.isdir(args.output_dir), "{} is not a directory".format(args.output_dir)
    assert os.path.exists(args.reference), "{} does not exist".format(args.reference)
    assert os.path.exists(args.positions_file), "{} does not exist".format(args.positions_file)

    positions_data = pd.read_csv(args.positions_file,
                                 names=["chr", "start", "strand", "find", "replace"],
                                 sep="\t")
    positions_data["kmer"] = np.nan
    # reference handler and reverse complement handler
    rh = ReferenceHandler(args.reference)
    rc = ReverseComplement()
    chromosome_data = {
        chromosome: rh.get_sequence(chromosome, 0, rh.get_chr_sequence_length(chromosome))
        for chromosome in rh.fasta.references
    }
    alphabet = "ACGMT"
    kmer_length = 6

    def get_kmer(chromosome, pos, strand, replace=None):
        seq = None
        try:
            seq = chromosome_data[chromosome][(pos - kmer_length) + 1:pos + kmer_length]
            if strand == "-":
                seq = rc.reverse_complement(seq)
            if replace is not None:
                seq = seq[:kmer_length - 1] + replace + seq[kmer_length:]
        except Exception as e:
            print(e, chromosome, pos, strand)
        return seq

    mod_pos_data = positions_data.loc[positions_data['replace'] == "M"].copy()
    mod_pos_data.loc[:, "kmer"] = np.vectorize(get_kmer)(mod_pos_data['chr'],
                                                         mod_pos_data['start'],
                                                         mod_pos_data['strand'],
                                                         "M")
    kmers = {k: 0 for k in all_string_permutations(alphabet, kmer_length)}
    large_kmers = set(mod_pos_data['kmer'])
    for l_kmer in large_kmers:
        for i in range(kmer_length):
            k = l_kmer[i:kmer_length + i]
            if len(k) == kmer_length:
                kmers[k] += 1
    m_kmers = [x for x, y in kmers.items() if x.count("M") == 1]
    found_m_only_kmers = {x: y for x, y in kmers.items()
                          if y > 0 and x.count("M") == 1}
    print(f"Number of M kmers: {len(m_kmers)}")
    print(f"Number of found M kmers: {len(found_m_only_kmers)}")

    c_pos_data = positions_data.loc[positions_data['replace'] == "C"].copy()
    c_pos_data.loc[:, 'kmer'] = np.vectorize(get_kmer)(c_pos_data['chr'],
                                                       c_pos_data['start'],
                                                       c_pos_data['strand'],
                                                       "C")
    # drop kmers containing ambiguity codes pulled from the reference
    filter_c_pos_data = c_pos_data[~c_pos_data["kmer"].str.contains(
        '|'.join(["N", "W", "Y"]), regex=True)]
    kmers = {k: 0 for k in all_string_permutations(alphabet, kmer_length)}
    large_kmers = set(filter_c_pos_data['kmer'])
    for l_kmer in large_kmers:
        for i in range(kmer_length):
            k = l_kmer[i:kmer_length + i]
            if len(k) == kmer_length:
                kmers[k] += 1
    no_m_kmers = [x for x, y in kmers.items()
                  if x.count("M") == 0 and x.count("C") > 0]
    found_no_m_kmers = {x: y for x, y in kmers.items()
                        if y > 0 and x.count("M") == 0 and x.count("C") > 0}
    print(f"Number of Canonical kmers: {len(no_m_kmers)}")
    print(f"Number of found Canonical kmers: {len(found_no_m_kmers)}")
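# Both main() scripts above repeat the same windowed counting step over the
# (2 * k - 1)-mers returned by get_kmer. A hedged refactor of that loop into a
# reusable helper; the name and the guard against off-alphabet windows are
# mine, since the originals assume every window is a valid alphabet kmer:
def count_covered_kmers(large_kmers, alphabet, k):
    counts = {kmer: 0 for kmer in all_string_permutations(alphabet, k)}
    for l_kmer in large_kmers:
        for i in range(k):
            window = l_kmer[i:i + k]
            # skip short trailing windows and ambiguity codes such as N
            if len(window) == k and window in counts:
                counts[window] += 1
    return counts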