Example #1
    def get_sample_kmers(self, sample):
        """Get all kmers for a sample.

        Kmers are collected from, in combination:
          * all kmers that cover a modified nucleotide (if ``sample.motifs``)
          * kmers observed in the reference sequence, both strands
            (if ``sample.kmers_from_reference``)
          * otherwise, every possible kmer over the canonical alphabet

        :param sample: AbstractSamples object
        :return: set of desired kmers
        """
        kmers = set()
        # if motifs is present, process for all motifs with modified base
        if sample.motifs:
            for motif in sample.motifs:
                kmers |= get_motif_kmers(motif,
                                         self.k,
                                         alphabet=self.canonical)
        # if we want to limit kmers which were seen in reference sequence
        if sample.kmers_from_reference:
            for _, _, sequence in read_fasta(sample.bwa_reference):
                kmers |= get_sequence_kmers(sequence, k=self.k, rev_comp=True)
        else:
            # set(...) replaces the redundant identity comprehension (C401)
            kmers |= set(all_string_permutations(self.canonical, length=self.k))

        return kmers
Example #2
 def __init__(self, alphabet="ACGT", length=6):
     """Initialize kmer bookkeeping over a sorted alphabet.

     :param alphabet: characters allowed in kmers (stored sorted)
     :param length: kmer length
     """
     self.alphabet = "".join(sorted(alphabet))
     self.length = length
     # list(...) replaces the copying comprehension (ruff PERF402/C400)
     self.kmers = list(all_string_permutations(self.alphabet, self.length))
     self.reads = []
     # one Kmer wrapper and one zeroed counter per possible kmer
     self.kmer_classes = {x: Kmer(x) for x in self.kmers}
     self.kmer_counts = {x: 0 for x in self.kmers}
Example #3
def get_motif_kmers(motif_pair, k, alphabet="ATGC"):
    """Given a motif pair, create the set of all kmers which contain the
    modification.

    :param motif_pair: pair (canonical motif, modified motif); the two
        strings are assumed to differ at exactly one position
    :param k: kmer length
    :param alphabet: characters used to pad kmers beyond the motif edges
    :return: set of kmers of length <= k covering the modified base
    """
    assert len(motif_pair) == 2, \
        "Motif pair must be a list of length 2. len(motif_pair) = {}".format(
            len(motif_pair))
    canonical = motif_pair[0]
    modified = motif_pair[1]
    motif_len = len(canonical)
    # position of the modified base and the swapped characters
    mod_index, old_char, new_char = find_modification_index_and_character(
        canonical, modified)
    bases_after = motif_len - mod_index - 1

    # how many padding characters are needed beyond the motif at the front
    # and back so that every k-window still covers the modification
    front_overlap, back_overlap = get_front_back_kmer_overlap(
        k, motif_len, mod_index)
    # pre-compute all padding strings of each required length
    kmer_set_dict = dict()
    for i in range(1, max(front_overlap, back_overlap) + 1):
        kmer_set_dict[i] = list(all_string_permutations(alphabet, i))
    kmer_set_dict[0] = ['']

    motif_kmers = []
    for i in range(k):
        # choose prepend padding and the slice start at the motif front
        if i >= front_overlap:
            front_index = i - front_overlap
            prepend_kmers = ['']
        else:
            prepend_kmers = kmer_set_dict[front_overlap - i]
            front_index = 0
        # choose append padding and the slice end at the motif back
        if i > bases_after:
            append_kmers = kmer_set_dict[i - bases_after]
            back_index = motif_len
        else:
            back_index = mod_index + i + 1
            append_kmers = ['']

        kmer = modified[front_index:back_index]
        # Bugfix: the original filtered with `front + kmer + back is not ''`,
        # an identity comparison against a literal (SyntaxWarning since
        # Python 3.8, dependent on string interning). Use truthiness to
        # drop empty concatenations instead.
        motif_kmers.extend([
            front + kmer + back for front in prepend_kmers
            for back in append_kmers if front + kmer + back
        ])

    return set(motif_kmers)
Example #4
 def test_all_string_permutations(self):
     """Test allLexicographic"""
     with captured_output() as (_, _):
         for length in range(1, 10):
             # random 5-char uppercase alphabet (may contain duplicates)
             test_string = ''.join(
                 random.choice(string.ascii_uppercase) for _ in range(5))
             # list(...) replaces the manual append loop (ruff PERF401)
             all_kmers = list(
                 all_string_permutations(test_string, length=length))
             # output must already be in lexicographic order
             self.assertEqual(all_kmers, sorted(all_kmers))
             # one kmer per combination of the unique characters
             num_chars = len(set(test_string))
             self.assertEqual(num_chars**length, len(all_kmers))
         # empty alphabet and zero length must be rejected
         self.assertRaises(AssertionError, all_string_permutations, "")
         self.assertRaises(AssertionError, all_string_permutations, "AT", 0)
def convert_csv_to_sa_model(csv_file,
                            output_dir,
                            transition_probs,
                            state_number=3,
                            rna=False):
    """Convert a kmer/mean/sd csv table into a signalAlign ".model" file.

    :param csv_file: path to a header-less csv with columns kmer, mean, sd
    :param output_dir: directory the ".model" file is written into
    :param transition_probs: indexable with state_number**2 transition values
    :param state_number: number of HMM states
    :param rna: if True, each generated kmer is reversed before lookup
    :return: path of the written model file
    """
    model_name = os.path.splitext(os.path.basename(csv_file))[0] + ".model"
    output_path = os.path.join(output_dir, model_name)
    table = pd.read_csv(csv_file, names=["kmer", "mean", "sd"])
    # alphabet is inferred from every character appearing in the kmer column
    alphabet = "".join(sorted(set("".join(table["kmer"]))))
    kmer_length = len(table["kmer"].iloc[0])
    alphabet_size = len(alphabet)
    sorted_kmers = all_string_permutations(alphabet, length=kmer_length)
    with open(output_path, 'w') as out:

        # line 0: model dimensions header
        out.write("{stateNumber}\t{alphabetSize}\t{alphabet}\t{kmerLength}\n"
                  "".format(stateNumber=state_number,
                            alphabetSize=alphabet_size,
                            alphabet=alphabet,
                            kmerLength=kmer_length))
        # line 1: transition probabilities
        for idx in range(state_number * state_number):
            out.write(
                "{transition}\t".format(transition=str(transition_probs[idx])))
        # likelihood
        out.write("{}\n".format(str(0)))

        # line 2: event model, one entry per kmer in lexicographic order
        for kmer in sorted_kmers:
            # k_index = HmmModel._get_kmer_index(kmer, alphabet, kmer_length, alphabet_size)
            if rna:
                kmer = kmer[::-1]
            row = table[table["kmer"] == kmer]
            assert kmer == row["kmer"].iloc[
                0], "The input csv model is not sorted or something else is very wrong. Check inputs"
            out.write(
                "{level_mean}\t{level_sd}\t{noise_mean}\t{noise_sd}\t{noise_lambda}\t"
                "".format(level_mean=row["mean"].iloc[0],
                          level_sd=row["sd"].iloc[0],
                          noise_mean=0,
                          noise_sd=0,
                          noise_lambda=0))
        out.write("\n")
    return output_path
Example #6
def make_kmer_directories(dir_path, alphabet, kmer_length, complement=False):
    """Make the kmer directories where all the kmers will be written

    :param dir_path: path to directory
    :param alphabet: kmer alphabet
    :param kmer_length: length of kmer
    :param complement: boolean option to create complement kmers
    :return: list of created directory paths, template dirs first and then
        (when requested) complement dirs suffixed with "_c"
    """
    assert os.path.isdir(dir_path), "dir_path is not a directory. {}".format(
        dir_path)
    template_dirs = []
    complement_dirs = []  # fixed misspelled local name 'complment_dirs'

    for kmer in all_string_permutations(alphabet, length=kmer_length):
        template_path = os.path.join(dir_path, kmer)
        os.mkdir(template_path)
        template_dirs.append(template_path)
        if complement:
            complement_path = os.path.join(dir_path, kmer + "_c")
            os.mkdir(complement_path)
            complement_dirs.append(complement_path)
    template_dirs.extend(complement_dirs)
    return template_dirs
 def test_get_kmer_index(self):
     """get_kmer_index must equal the kmer's lexicographic rank."""
     # list(...) replaces the copying comprehension (ruff PERF402)
     all_kmers = list(all_string_permutations("ATGC", 5))
     # loop variable is unused, so name it _
     for _ in range(10):
         kmer = get_random_string(5, chars="ATGC")
         self.assertEqual(all_kmers.index(kmer),
                          self.model.get_kmer_index(kmer))
Example #8
def main():
    """Tally kmer coverage from signalAlign variant-call csv files.

    For each configured run directory, every per-read variant-call file is
    read, the reference window around each called position is extracted,
    and all KMER_LENGTH-wide windows inside it are counted. Per-run counts
    are written to <name>.tsv and aggregate counts to
    total_<names>.tsv in OUTPUT_DIR.
    """
    # Hard-coded analysis configuration; earlier RNA/DNA configurations
    # are kept below as commented-out alternatives.
    OUTPUT_DIR = "/home/ubuntu/ecoli_methylation_analysis/kmer_analysis"
    # False doubles as the sentinel for "no positions file loaded"
    positions_data = False
    keys = ["contig", "reference_index", "strand"]
    # RNA canonical
    # REFERENCE = "/home/ubuntu/mount/download/RNA_rel2/reference/gencode.v27.transcripts.fa"
    # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/OICR_runs/all_runs/", "/home/ubuntu/mount/UBC_runs/all_runs/"]
    # # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/OICR_runs/test/", "/home/ubuntu/mount/OICR_runs/test/"]
    # VARIANT_NAMES = ["/variant_calls/na12878_OICR_RNA_canonical.csv", "/variant_calls/na12878_UBC_RNA_canonical.csv"]
    # ALPHABET = "ATGC"
    # KMER_LENGTH = 5
    # NAMES = ["OICR", "UBC"]
    #
    # # DNA canonical
    # REFERENCE = "/home/ubuntu/bisulfite_methylation_analysis/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"
    # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/FAB39088_runs/canonical_calling/all_runs/",
    #                      "/home/ubuntu/mount/FAF01169_runs/canonical_calling/all_runs/"]
    # VARIANT_NAMES = ["/variant_calls/variant_calls.csv", "/variant_calls/variant_calls.csv"]
    # ALPHABET = "ATGC"
    # KMER_LENGTH = 6
    # NAMES = ["FAB39088_canonical", "FAF01169_canonical"]

    # DNA mod
    # REFERENCE = "/home/ubuntu/bisulfite_methylation_analysis/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"
    # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/FAB39088_runs/cpg_calling/all_runs/",
    #                      "/home/ubuntu/mount/FAF01169_runs/cpg_calling/all_runs/"]
    # NAMES = ["FAB39088_methyl", "FAF01169_methyl"]
    # VARIANT_NAMES = ["/variant_calls/variant_calls.csv", "/variant_calls/variant_calls.csv"]
    # ALPHABET = "ATGCM"
    # KMER_LENGTH = 6
    # POSITIONS_FILE = "/home/ubuntu/bisulfite_methylation_analysis/positions/all_mC.positions"
    # positions_data = pd.read_csv(POSITIONS_FILE, names=["contig", "reference_index", "strand", "find", "replace"],
    #                              sep="\t")

    # ECOLI MOD
    REFERENCE = "/home/ubuntu/ecoli_methylation_analysis/reference/ecoli.fa"
    VARIANT_HOME_DIRS = [
        "/home/ubuntu/ecoli_methylation_analysis/signalalign_output/"
    ]
    NAMES = ["variant_calls"]
    VARIANT_NAMES = ["ecoli_dna_baseline_ATCGMQ_sa.model.csv"]
    ALPHABET = "ATGCM"
    KMER_LENGTH = 6
    # POSITIONS_FILE = "/home/ubuntu/ecoli_methylation_analysis/kmer_analysis/all.positions"
    # positions_data = pd.read_csv(POSITIONS_FILE, names=["contig", "reference_index", "strand", "find", "replace"],
    #                              sep="\t")

    if positions_data is not False:
        # MultiIndex of (contig, reference_index, strand), used later to
        # intersect variant calls with the loaded positions
        i2 = positions_data.set_index(keys).index

    assert os.path.exists(REFERENCE), "{} does not exist".format(REFERENCE)
    assert os.path.isdir(OUTPUT_DIR), "{} is not a directory".format(
        OUTPUT_DIR)
    rh = ReferenceHandler(REFERENCE)
    rc = ReverseComplement()
    # global kmer -> count, accumulated over every run of every name
    kmers = {k: 0 for k in all_string_permutations(ALPHABET, KMER_LENGTH)}
    paths = []
    # collect, per home dir, the variant-call csv paths that actually exist
    for home_dir, variant_name in zip(VARIANT_HOME_DIRS, VARIANT_NAMES):
        assert os.path.isdir(home_dir), "{} is not a directory".format(
            home_dir)
        home_dir_paths = os.listdir(home_dir)
        tmp_paths = [
            os.path.join(home_dir, x, variant_name) for x in home_dir_paths
            if os.path.exists(os.path.join(home_dir, x, variant_name))
        ]

        assert len(
            tmp_paths
        ) > 0, "Check inputs, there are no paths which exist: {}".format(
            home_dir)
        paths.append(tmp_paths)

    def get_kmer(chromosome, pos, strand):
        """Return the (2*KMER_LENGTH - 1)-wide reference window around pos,
        reverse complemented for '-' strand, with the center base swapped
        per the positions file when one is loaded.

        NOTE(review): `read_pos_data` is a closure over a variable that is
        only assigned later inside the per-run loop (and only when
        positions_data is loaded) — confirm the call order is intended.
        NOTE(review): if rh.get_sequence raises, `seq` is unbound and the
        final `return seq` raises NameError.
        """
        try:
            seq = rh.get_sequence(chromosome, (pos - KMER_LENGTH) + 1,
                                  pos + KMER_LENGTH)
            if strand == "-":
                seq = rc.reverse_complement(seq)
            if positions_data is not False:
                # look up this exact (contig, position, strand) in the
                # filtered positions table
                replace = read_pos_data[
                    (read_pos_data["contig"] == chromosome)
                    & (read_pos_data["reference_index"] == pos) &
                    (read_pos_data["strand"] == strand)]
                if not replace.empty:
                    # swap the center base of the window for its replacement
                    seq = seq[:KMER_LENGTH - 1] + replace.iloc[0][
                        "replace"] + seq[KMER_LENGTH:]

        except Exception as e:
            print(e, chromosome, pos, strand)
        return seq

    # text progress-bar characters and width
    void = '-'
    fill = '#'
    n_spaces = 100
    n_files = 0
    for variant_set, name in zip(paths, NAMES):
        n_paths = len(variant_set)
        # progress-bar increment per processed file
        count = n_spaces / n_paths
        increaseCount = 0
        print("Starting on {}".format(name))
        # per-run kmer -> count, reset for each name
        local_kmers = {
            k: 0
            for k in all_string_permutations(ALPHABET, KMER_LENGTH)
        }
        for variant_path in variant_set:
            # redraw the progress bar in place
            print('[' + (fill * int(increaseCount)) +
                  (void * int(n_spaces - increaseCount)) + '] ' +
                  str(int(increaseCount)) + '%',
                  end='\r')
            increaseCount += count
            variant_data = pd.read_csv(variant_path)
            if positions_data is not False:
                # keep only positions that appear in this run's calls
                i1 = variant_data.set_index(keys).index
                read_pos_data = positions_data[i2.isin(i1)]
            #         read_id            028a34d4-2a7a-44e7-ab23-305915996ec8
            #         contig                                          RDN18-1
            #         reference_index                                     973
            #         strand                                                +
            #         variants                                             Aa
            #         prob1                                          0.986967
            #         prob2                                          0.013033
            #         prob3                                               NaN
            variant_data['next_base'] = np.vectorize(get_kmer)(
                variant_data['contig'], variant_data['reference_index'],
                variant_data['strand'])
            large_kmers = set(variant_data['next_base'])
            # slide a KMER_LENGTH window over each extracted sequence and
            # count every full-length kmer in both tallies
            for l_kmer in large_kmers:
                for i in range(KMER_LENGTH):
                    k = l_kmer[i:KMER_LENGTH + i]
                    if len(k) == KMER_LENGTH:
                        kmers[k] += 1
                        local_kmers[k] += 1

        # final progress-bar state for this run
        print('[' + (fill * int(increaseCount)) +
              (void * int(n_spaces - increaseCount)) + '] ' +
              str(int(increaseCount)) + '%',
              end='\n')

        # count kmers never observed in this run
        total_zeros = 0
        for x, y in local_kmers.items():
            if y == 0:
                total_zeros += 1
        n_files += n_paths
        print("{} Kmers Covered: {}/{}".format(name,
                                               len(local_kmers) - total_zeros,
                                               len(local_kmers)))
        # average counts over the covered kmers only
        print("{} Average coverage: {:.4}".format(
            name,
            np.sum(list(local_kmers.values())) /
            (len(local_kmers) - total_zeros)))
        # per-run tsv: kmer <tab> count normalized by number of files
        with open(os.path.join(OUTPUT_DIR, name + ".tsv"), 'w') as fh:
            print("\n".join([
                "\t".join([x, str(y / n_paths)])
                for x, y in local_kmers.items()
            ]),
                  file=fh)

    # aggregate coverage over all runs
    total_zeros = 0
    for x, y in kmers.items():
        if y == 0:
            total_zeros += 1
    print("TOTAL Kmers Covered: {}/{}".format(
        len(kmers) - total_zeros, len(kmers)))
    # NOTE(review): average is divided by half the file count — confirm
    # this is intentional (e.g. template/complement pairs) and not a bug
    print("TOTAL Average coverage: {}".format(
        np.average(list(kmers.values())) / (n_files / 2)))
    with open(os.path.join(OUTPUT_DIR, "total_" + "_".join(NAMES) + ".tsv"),
              'w') as fh:
        print("\n".join(
            ["\t".join([x, str(y / n_files)]) for x, y in kmers.items()]),
              file=fh)
def main():
    """Report how many modified (M) and canonical (C) kmers are observed
    around the reference positions listed in a positions file.
    """
    args = parse_args()
    assert os.path.isdir(args.output_dir), "{} is not a directory".format(
        args.output_dir)
    assert os.path.exists(args.reference), "{} does not exist".format(
        args.reference)
    assert os.path.exists(args.positions_file), "{} does not exist".format(
        args.positions_file)

    # tab-separated, header-less positions file:
    # chr, start, strand, base to find, base to replace it with
    positions_data = pd.read_csv(
        args.positions_file,
        names=["chr", "start", "strand", "find", "replace"],
        sep="\t")
    positions_data["kmer"] = np.nan
    # reference handler and reverse complement handler
    rh = ReferenceHandler(args.reference)
    rc = ReverseComplement()
    # preload every chromosome's full sequence for fast in-memory slicing
    chromosome_data = {
        chromosome: rh.get_sequence(chromosome, 0,
                                    rh.get_chr_sequence_length(chromosome))
        for chromosome in rh.fasta.references
    }

    alphabet = "ACGMT"
    kmer_length = 6

    def get_kmer(chromosome, pos, strand, replace=None):
        """Return the (2*kmer_length - 1)-wide sequence window centered on
        pos, reverse complemented for '-' strand, optionally with the
        center base replaced by `replace`.

        NOTE(review): if the lookup/slice raises, `seq` is unbound and the
        final `return seq` raises NameError — confirm intended handling.
        """
        try:
            seq = chromosome_data[chromosome][(pos - kmer_length) + 1:pos +
                                              kmer_length]
            if strand == "-":
                seq = rc.reverse_complement(seq)
            if replace is not None:
                # swap the center base (index kmer_length - 1) of the window
                seq = seq[:kmer_length - 1] + replace + seq[kmer_length:]
        except Exception as e:
            print(e, chromosome, pos, strand)
        return seq

    # windows around positions whose base is replaced by modified "M"
    mod_pos_data = positions_data.loc[positions_data['replace'] == "M"].copy()
    mod_pos_data.loc[:,
                     "kmer"] = np.vectorize(get_kmer)(mod_pos_data['chr'],
                                                      mod_pos_data['start'],
                                                      mod_pos_data['strand'],
                                                      "M")

    # slide a kmer_length window over each extracted sequence and count
    # every full-length kmer
    kmers = {k: 0 for k in all_string_permutations(alphabet, kmer_length)}
    large_kmers = set(mod_pos_data['kmer'])
    for l_kmer in large_kmers:
        for i in range(kmer_length):
            k = l_kmer[i:kmer_length + i]
            if len(k) == kmer_length:
                kmers[k] += 1
    # kmers containing exactly one M, and the subset actually observed
    m_kmers = [x for x, y in kmers.items() if x.count("M") == 1]
    found_m_only_kmers = {
        x: y
        for x, y in kmers.items() if y > 0 and x.count("M") == 1
    }
    print(f"Number of M kmers: {len(m_kmers)}")
    print(f"Number of found M kmers: {len(found_m_only_kmers)}")

    # same procedure for canonical "C" replacements
    c_pos_data = positions_data.loc[positions_data['replace'] == "C"].copy()
    c_pos_data.loc[:,
                   'kmer'] = np.vectorize(get_kmer)(c_pos_data['chr'],
                                                    c_pos_data['start'],
                                                    c_pos_data['strand'], "C")
    # drop windows containing ambiguity codes N/W/Y
    filter_c_pos_data = c_pos_data[~c_pos_data["kmer"].str.contains(
        '|'.join(["N", "W", "Y"]), regex=True)]

    # fresh counts over the filtered canonical windows
    kmers = {k: 0 for k in all_string_permutations(alphabet, kmer_length)}
    large_kmers = set(filter_c_pos_data['kmer'])
    for l_kmer in large_kmers:
        for i in range(kmer_length):
            k = l_kmer[i:kmer_length + i]
            if len(k) == kmer_length:
                kmers[k] += 1
    # C-containing kmers with no M, and the subset actually observed
    no_m_kmers = [
        x for x, y in kmers.items() if x.count("M") == 0 and x.count("C") > 0
    ]
    found_no_m_kmers = {
        x: y
        for x, y in kmers.items()
        if y > 0 and x.count("M") == 0 and x.count("C") > 0
    }
    print(f"Number of Canonical kmers: {len(no_m_kmers)}")
    print(f"Number of found Canonical kmers: {len(found_no_m_kmers)}")