Example #1
0
 def getBackwardSequence(self, contig, raw_sequence):
     """Build the edited backward-strand sequence from a forward-strand sequence.

     The input is complemented (it is not reversed) and then handed to
     ``_get_substituted_sequence`` together with the "-" strand label.

     :param contig: which contig the sequence belongs to (aka header)
     :param raw_sequence: raw nucleotide sequence, assumed to be forward direction (5'-3')
     :return: edited nucleotide sequence
     """
     complemented = ReverseComplement().complement(raw_sequence)
     return self._get_substituted_sequence(contig, complemented, "-")
Example #2
0
 def test_iupac_complement(self):
     """Each base behind an IUPAC code must complement into the base set of the complement code."""
     with captured_output() as (_, _):
         complementer = ReverseComplement()
         for iupac_char in IUPAC_BASES:
             encoded_bases = iupac_base_to_bases(iupac_char)
             allowed_complements = iupac_base_to_bases(iupac_complement(iupac_char))
             for base in encoded_bases:
                 self.assertTrue(complementer.complement(base) in allowed_complements)
Example #3
0
 def test_reverse_complement(self):
     """Check reverse_complement with the default tables and with a custom find/replace pair."""
     with captured_output() as (_, _):
         self.assertEqual(self.base.reverse_complement("ATGC"), "GCAT")
         # a non-string argument is expected to fail with AttributeError
         with self.assertRaises(AttributeError):
             self.base.reverse_complement(1)
         custom = ReverseComplement(find="a1", replace="c2")
         # both the upper and lower case query must give the same result
         for query in ("A1", "a1"):
             self.assertEqual(custom.reverse_complement(query), "2C")
Example #4
0
def match_events_with_eventalign(events=None, event_detections=None, minus=False, rna=False):
    """Match event index with event detection data to label segments of signal for each kmer

    # RNA is sequenced 3'-5'
    # reversed for fasta/q sequence
    # if mapped to reverse strand
    # reverse reverse complement = complement

    # DNA is sequenced 5'-3'
    # if mapped to reverse strand
    # reverse complement

    :param events: signal alignment events table with the fields 'position', 'event_index' and 'reference_kmer'
    :param event_detections: event detection table with the fields 'start' and 'length'
    :param minus: boolean option for minus strand mapping
    :param rna: boolean for RNA read
    :return: numpy structured array with one row per entry of 'events' and the fields 'raw_start',
             'raw_length', 'reference_index', 'posterior_probability' (always 1) and 'kmer'
    :raises AssertionError: if 'events' or 'event_detections' is None
    """
    assert events is not None, "Must pass signal alignment events"
    assert event_detections is not None, "Must pass event_detections events"

    check_numpy_table(events, req_fields=('position', 'event_index',
                                          'reference_kmer'))

    check_numpy_table(event_detections, req_fields=('start', 'length'))

    label = np.zeros(len(events), dtype=[('raw_start', int), ('raw_length', int), ('reference_index', int),
                                         ('posterior_probability', float), ('kmer', 'S6')])

    # each aligned event points (via event_index) at the detected event holding its raw signal span
    label['raw_start'] = [event_detections[x]["start"] for x in events["event_index"]]
    label['raw_length'] = [event_detections[x]["length"] for x in events["event_index"]]
    label['reference_index'] = events["position"]

    def convert_to_str(string):
        """Helper function to catch bytes as strings"""
        # isinstance instead of an exact type comparison: str subclasses such as numpy.str_
        # (what a unicode numpy field yields) must not be handed to bytes.decode
        if isinstance(string, str):
            return string
        return bytes.decode(string)

    flip = ReverseComplement()
    if minus:
        if rna:
            kmers = [flip.complement(convert_to_str(x)) for x in events["reference_kmer"]]
        else:
            kmers = [flip.reverse_complement(convert_to_str(x)) for x in events["reference_kmer"]]
    else:
        if rna:
            kmers = [flip.reverse(convert_to_str(x)) for x in events["reference_kmer"]]
        else:
            # forward DNA: the reference kmers are already in the orientation that gets stored
            kmers = events["reference_kmer"]
    label['kmer'] = kmers
    label['posterior_probability'] = np.ones(len(events))
    # np.sort(label, order='raw_start', kind='mergesort')

    return label
Example #5
0
    def test_reverse_complement(self):
        """Compare the reverse_complement function with the ReverseComplement class on random IUPAC DNA."""
        iupac_rc = ReverseComplement(find="ACGTMKRYBVDHNacgtmkrybvdhn", replace="TGCAKMYRVBHDNtgcakmyrvbhdn")
        # (reverse flag, complement flag, class-based implementation producing the expected value)
        cases = (
            (True, True, iupac_rc.reverse_complement),
            (False, True, iupac_rc.complement),
            (True, False, iupac_rc.reverse),
            (False, False, lambda seq: seq),
        )
        for _ in range(10):
            length = np.random.randint(0, 1000)
            sequence = get_random_string(length, chars=list(set("ACGTMKRYBVDHN")))
            for do_reverse, do_complement, expected_of in cases:
                self.assertEqual(reverse_complement(sequence, reverse=do_reverse, complement=do_complement),
                                 expected_of(sequence))
    def test_rna_reads(self):
        """Run SignalAlignment on two RNA fast5 reads and compare the stored event kmers with the reference.

        For the first read both 'path_kmer' and 'reference_kmer' must equal the reversed 5-mer fetched
        from the reference; for the second read 'path_kmer' must equal the reverse complement of that
        reversed 5-mer while 'reference_kmer' still equals the reversed 5-mer itself.
        """
        with tempfile.TemporaryDirectory() as tempdir:
            model_path = os.path.join(
                self.HOME, "models/testModelR9p4_5mer_acgt_RNA.model")
            shared_args = create_signalAlignment_args(
                alignment_file=self.rna_bam,
                bwa_reference=self.rna_reference,
                forward_reference=self.rna_reference,
                in_templateHmm=model_path,
                path_to_bin=self.path_to_bin,
                destination=tempdir,
                embed=True,
                delete_tmp=False)

            # first read
            first_read = os.path.join(
                self.test_dir_rna,
                "DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_36_ch_218_strand.fast5"
            )
            aligner = SignalAlignment(**merge_dicts([shared_args, dict(in_fast5=first_read)]))
            aligner.run()
            reference_fasta = pysam.FastaFile(self.rna_reference)
            first_fast5 = Fast5(first_read)
            for event in first_fast5.get_signalalign_events():
                ref_index = event["reference_index"]
                expected = reference_fasta.fetch(reference="rna_fake",
                                                 start=ref_index,
                                                 end=ref_index + 5)[::-1]
                self.assertEqual(event["path_kmer"].decode(), expected)
                self.assertEqual(event["reference_kmer"].decode(), expected)

            # second read
            second_read = os.path.join(
                self.test_dir_rna,
                "DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_61_ch_151_strand.fast5"
            )
            aligner = SignalAlignment(**merge_dicts([shared_args, dict(in_fast5=second_read)]))
            aligner.run()
            flipper = ReverseComplement()
            second_fast5 = Fast5(second_read)
            for event in second_fast5.get_signalalign_events():
                ref_index = event["reference_index"]
                expected = reference_fasta.fetch(reference="rna_fake",
                                                 start=ref_index,
                                                 end=ref_index + 5)[::-1]
                self.assertEqual(event["path_kmer"].decode(), flipper.reverse_complement(expected))
                self.assertEqual(event["reference_kmer"].decode(), expected)
def resegment_reads(fast5_path, params, speedy=False, overwrite=False):
    """Re-segment a previously base-called fast5 file and write an anchor alignment into it.

    :param fast5_path: path to fast5 file
    :param params: keyword arguments forwarded to the selected event detection function
    :param speedy: use create_speedy_event_table when True, create_minknow_event_table otherwise
    :param overwrite: forwarded to set_new_event_table; overwrite a previous re-segmented event table
    :return: the Fast5 handle (opened with read='r+') after the event table and fastq have been set
    """
    assert os.path.isfile(fast5_path), "File does not exist: {}".format(fast5_path)
    name = "ReSegmentBasecall_00{}"
    f5fh = Fast5(fast5_path, read='r+')
    # event table of the earlier basecall, used below to anchor the new events
    old_event_table = f5fh.get_basecall_data()
    read_id = bytes.decode(f5fh.raw_attributes['read_id'])
    sampling_freq = f5fh.sample_rate
    start_time = f5fh.raw_attributes['start_time']
    signal = f5fh.get_read(raw=True, scale=True)

    # choose the detection routine and the label recorded alongside the parameters
    if speedy:
        detect_events, detector_name = create_speedy_event_table, "speedy_stat_split"
    else:
        detect_events, detector_name = create_minknow_event_table, "minknow_event_detect"
    event_table = detect_events(signal, sampling_freq, start_time, **params)
    params = merge_dicts([params, {"event_detection": detector_name}])

    run_info = {"nanotensor version": "0.2.0", "time_stamp": TimeStamp().posix_date()}
    attributes = merge_dicts([params, run_info, f5fh.raw_attributes])
    if f5fh.is_read_rna():
        old_event_table = index_to_time(old_event_table, sampling_freq=sampling_freq, start_time=start_time)
    new_event_table = create_anchor_kmers(new_events=event_table, old_events=old_event_table)
    f5fh.set_new_event_table(name, new_event_table, attributes, overwrite=overwrite)

    # rebuild the read sequence from the anchored events; RNA reads are reversed and written with U
    sequence = sequence_from_events(new_event_table)
    if f5fh.is_read_rna():
        sequence = ReverseComplement().reverse(sequence).replace("T", "U")
    # placeholder qualities: one '!' per base
    fastq = create_fastq_line(read_id+" :", sequence, '!'*len(sequence))
    f5fh.set_fastq(name, fastq)
    return f5fh
Example #8
0
    def setUpClass(cls):
        """Resolve the fixture paths next to this test file and create a default ReverseComplement."""
        super(ReverseComplementTest, cls).setUpClass()
        # directory holding this file: the absolute path without its last '/'-separated component
        here = '/'.join(os.path.abspath(__file__).split("/")[:-1])
        cls.HOME = here
        cls.fasta = os.path.join(here, "test_files/test.fa")
        cls.fastq = os.path.join(here, "test_files/test.fastq")
        cls.reference = os.path.join(here, "test_files/ecoli_k12_mg1655.fa")
        cls.base = ReverseComplement()
Example #9
0
    def test_instantiation(self):
        """Check the default find/replace tables and that invalid find/replace arguments are rejected."""
        with captured_output() as (_, _):
            self.assertEqual(ReverseComplement().find, "ATGC")
            self.assertEqual(ReverseComplement().replace, "TACG")

            # Every invalid call gets its own assertRaises context: inside a shared context
            # the first raising statement ends the block and the remaining calls never run.
            invalid_arguments = (
                dict(find="asdfe"),
                dict(replace="asdfe"),
                dict(find="asdfe", replace="poiuyq"),
                dict(find="aa", replace="at"),
            )
            for kwargs in invalid_arguments:
                with self.assertRaises(AssertionError):
                    ReverseComplement(**kwargs)
def get_kmer_counts_from_reference_given_bed(
        reference,
        bed_file,
        k=5,
        param_filter=FilterBed.return_true,
        check_base=None):
    """Generate kmer counts covering positions in a bed file

    :param reference: reference passed to ReferenceHandler
    :param bed_file: bed file read with parse_methyl_bed
    :param k: kmer length; each accepted position is extended by k - 1 on both sides
    :param param_filter: callable(chromosome, start, stop, strand, coverage, percentage); records for
                         which it returns a false value are not counted
    :param check_base: optional base compared with the reference at [start, stop); it is complemented
                       first for "-" strand records
    :return: Counter accumulated from count_all_sequence_kmers over all accepted records
    :raises AssertionError: if 'check_base' is given and does not match the reference
    """
    ref_handler = ReferenceHandler(reference)
    kmers = Counter()
    n_records = 0
    flank = k - 1
    for record in parse_methyl_bed(bed_file):
        chromosome, start, stop, _, _, strand, _, _, _, coverage, percentage = record
        if param_filter(chromosome, start, stop, strand, coverage, percentage):
            minus_strand = strand == "-"
            # clamp the extended window to the chromosome boundaries
            block_start = max(0, start - flank)
            block_end = min(ref_handler.get_chr_sequence_length(chromosome), stop + flank)
            seq = ref_handler.get_sequence(chromosome, block_start, block_end)
            # Check if base in bed file matches the reference sequence
            if check_base is not None:
                base = ref_handler.get_sequence(chromosome, start, stop)
                this_base = ReverseComplement().complement(check_base) if minus_strand else check_base
                assert this_base == base, \
                    "Check base is not the same as the one from the reference. " \
                    "{} != {}. {}".format(this_base, base, [chromosome, start, stop, strand, coverage, percentage])
            kmers += count_all_sequence_kmers(seq, k=k, rev_comp_only=minus_strand)

        # Print some updates because this takes a long time
        n_records += 1
        if n_records % 10000 == 0:
            print(".", end="")
            sys.stdout.flush()
            if n_records % 1000000 == 0:
                print(n_records)

    return kmers
Example #11
0
 def test_convert_write_fastq(self):
     """Write a reverse-complemented fastq, check its records, and check that bad calls are rejected.

     The rejected calls are an output path not ending in fastq, and a call with both
     'complement' and 'reverse' switched off.
     """
     with captured_output() as (_, _):
         with tempfile.TemporaryDirectory() as tempdir:
             path = os.path.join(tempdir, "test.fastq")
             bad_path = os.path.join(tempdir, "test.txt")
             ReverseComplement(find="AUGC", replace="ATGC").convert_write_fastq(self.fastq, path, complement=True,
                                                                                reverse=True)
             for new_record, record in zip(SeqIO.parse(path, "fastq"), SeqIO.parse(self.fastq, "fastq")):
                 self.assertTrue(new_record.id.endswith("reverse_complement"))
                 self.assertNotIn("U", str(new_record.seq))
             # The remaining record checks deliberately use the last pair yielded by the loop above.
             self.assertEqual(str(new_record.seq),
                              "AACCTAACGACACCACTATCCCTACACCCTATCCAACTACTATTACTCTATTCTACTTATCACCCTACTACTACCTCATCCT"
                              "CCTCCCTAAAATTTCGAGTAAGTAAAATCAATTTCGTGTCAAAATTCATTAAGGGCATCCTAATAGAGGTTGGTCGGCGATT"
                              "TTAATAAGTGTATGTTTCGGACGTTCATAAGTTTAAAGTGTTTGTGTTAACGTTTTCGTCTTTGATTTTGGAAGTATCAGTC"
                              "ACTCTAATTTTGTTACGAAGTAGTAAGAATTTCATGGACAATTATTTACGACGATTTATGATTCACGATTTTTTTTTTTCGA"
                              "TCACGACCGTCGACCGACGACCACCGACCGTCGACCACCGAACCTACCATTATTTTTCCTTTTGAAAATATTACTGTTCGTG"
                              "AGAATATAAGTAAAAAATAGAGTATTGACCTATTGTGTCCCGTCCTAC")
             self.assertEqual(new_record.letter_annotations["phred_quality"],
                              record.letter_annotations["phred_quality"][::-1])
             os.remove(path)
             # One call per assertRaises context (a shared context stops at the first raise and
             # never runs the second call), and inside the temporary directory's lifetime so
             # both output paths still point at an existing directory.
             with self.assertRaises(AssertionError):
                 self.base.convert_write_fastq(self.fastq, bad_path, complement=True, reverse=True)
             with self.assertRaises(AssertionError):
                 self.base.convert_write_fastq(self.fastq, path, complement=False, reverse=False)
Example #12
0
def main():
    """Pick reads from an alignment file so that kmers around the listed positions get covered.

    Steps, in order:
    1. read the command line arguments and check that the output directory, the bam
       and the positions file exist;
    2. load the tab separated positions file (columns chr, start, strand, find, replace);
    3. collect, for every alignment, the read name, reference sequence, reference name,
       strand and reference start/end;
    4. hand those records to worker processes which return, per read, the kmers
       overlapping the positions the read covers;
    5. write the kmer counts over all reads to "all_reads_kmer_counts.txt";
    6. repeatedly move reads into a second KmerMap, raising the coverage threshold by one
       per round (first round uses 2), and write the chosen read ids and their kmer counts
       for every round.
    """
    args = parse_args()
    assert os.path.isdir(args.output_dir), "{} is not a directory".format(
        args.output_dir)
    assert os.path.exists(args.bam), "{} does not exist".format(args.bam)
    assert os.path.exists(args.positions_file), "{} does not exist".format(
        args.positions_file)

    output_dir = args.output_dir
    bam = args.bam
    positions_file = args.positions_file
    reference = args.reference
    alphabet = args.alphabet
    kmer_length = args.kmer_length
    n_processes = args.threads

    # The reference is optional; it is only required for alignments without an MD tag.
    fasta_handle = None
    if reference is not None:
        assert os.path.exists(reference), "{} does not exist".format(reference)
        fasta_handle = ReferenceHandler(reference)

    rc = ReverseComplement()
    positions_data = pd.read_csv(
        positions_file,
        names=["chr", "start", "strand", "find", "replace"],
        sep="\t")
    # NOTE(review): this KmerMap is replaced by a fresh one before it is used (see below).
    km = KmerMap(alphabet, kmer_length)
    # NOTE(review): counter is incremented per alignment but not read again in this function.
    counter = 0

    def get_kmer(sequence, pos, start_pos, strand, replace):
        """Return the window of up to 2 * kmer_length - 1 characters centred on 'pos'.

        'sequence' starts at reference coordinate 'start_pos'. The character at the centre
        (index kmer_length - 1 of the window) is swapped for 'replace'; for the "-" strand
        the window is complemented (not reversed). Any exception is printed and the
        function then returns None implicitly.
        """
        try:
            base = sequence[(pos - (kmer_length - 1)) -
                            start_pos:(pos + kmer_length) - start_pos]
            base = base[:(kmer_length - 1)] + replace + base[kmer_length:]
            if strand == "-":
                return rc.complement(base)
            return base
        except Exception as e:
            print(e, sequence, pos, start_pos)

    def get_covered_kmers(positions_data1, read_name1, ref_sequence1,
                          ref_name1, strand1, ref_start1, ref_end1):
        """Return (read name, kmers) for the positions one alignment covers, or None.

        Positions are kept when contig and strand match and 'start' lies within
        [ref_start1, ref_end1]. Each window from get_kmer is cut into its sub-strings of
        length kmer_length; only full-length sub-strings made solely of alphabet
        characters are kept.
        """
        this_positions_data = positions_data1.loc[
            (positions_data1["chr"] == ref_name1)
            & (positions_data1["strand"] == strand1) &
            (positions_data1["start"] >= ref_start1) &
            (positions_data1["start"] <= ref_end1)]
        if this_positions_data.empty:
            return None
        kmer_lists = np.vectorize(get_kmer)(ref_sequence1,
                                            this_positions_data['start'],
                                            ref_start1, strand1,
                                            this_positions_data["replace"])
        kmer_subset_lists1 = merge_lists([[
            kmer[i:i + kmer_length] for i in range(kmer_length)
            if len(kmer[i:i + kmer_length]) == kmer_length
            and set(kmer[i:i + kmer_length]) <= set(alphabet)
        ] for kmer in kmer_lists])
        return read_name1, kmer_subset_lists1

    def meta_get_covered_kmers(positions, all_args1):
        """Run get_covered_kmers over a batch of argument lists, dropping the None results."""
        data_to_return = []
        for args1 in all_args1:
            data = get_covered_kmers(positions, *args1)
            if data is not None:
                data_to_return.append(data)
        return data_to_return

    # Gather one argument list per alignment; a failing alignment is reported on stderr and skipped.
    all_args = []
    with closing(pysam.AlignmentFile(
            bam, 'rb' if bam.endswith("bam") else 'r')) as aln:
        for aligned_segment in aln.fetch(until_eof=True):
            try:
                if not aligned_segment.has_tag('MD'):
                    # without an MD tag the reference sequence has to come from the fasta
                    if fasta_handle is None:
                        raise Exception(
                            "Need to specify --reference if MD flag is not set"
                        )
                    else:
                        ref_sequence = fasta_handle.get_sequence(
                            chromosome_name=aligned_segment.reference_name,
                            start=aligned_segment.reference_start,
                            stop=aligned_segment.reference_end)
                else:
                    ref_sequence = aligned_segment.get_reference_sequence(
                    ).upper()
                # only the part of the query name before the first "_" is kept as the read name
                read_name = aligned_segment.qname.split("_")[0]
                ref_name = aligned_segment.reference_name
                ref_start = aligned_segment.reference_start
                ref_end = aligned_segment.reference_end
                reversed_read = aligned_segment.is_reverse
                if reversed_read:
                    strand = "-"
                else:
                    strand = "+"
                all_args.append([
                    read_name, ref_sequence, ref_name, strand, ref_start,
                    ref_end
                ])
                counter += 1
            except Exception as e:
                print(e, file=sys.stderr)

    print("starting on {} reads".format(len(all_args)))
    # split the work into n_processes interleaved batches
    list_of_args = [all_args[x::n_processes] for x in range(n_processes)]
    service = BasicService2(meta_get_covered_kmers,
                            positions_data,
                            service_name="multiprocess_meta_get_covered_kmers")
    total, failure, messages, output = run_service(service.run, list_of_args,
                                                   {}, ["all_args1"],
                                                   n_processes)
    km = KmerMap(alphabet, kmer_length)

    # km holds every read together with the kmers it covers
    all_data = merge_lists(output)
    print("number of reads: ", len(all_data))
    for read_name, kmer_subset_lists in all_data:
        r = Read(read_name)
        for kmer in kmer_subset_lists:
            r.add_kmer(kmer)
        km.add_read(r)

    kmer_counts_file_path = os.path.join(output_dir,
                                         "all_reads_kmer_counts.txt")
    with open(kmer_counts_file_path, "w") as fh:
        print("\n".join([
            "\t".join([kmer, str(count)])
            for kmer, count in km.kmer_counts.items()
        ]),
              file=fh)

    # keep_kmer_map receives the reads that get selected; they are removed from km as they move over
    keep_kmer_map = KmerMap(alphabet, kmer_length)

    print("number of zero covered kmers: ", len(km.get_zero_kmers()))
    curr_threshold = 1
    iteration = 0
    increase_threshold = True
    # One round per threshold. The loop only ends once a round runs out of usable reads
    # (next_kmer is None) at a threshold of 10 or more.
    while increase_threshold:
        curr_threshold += 1
        find_kmers = keep_kmer_map.get_threshold_uncovered_kmers(
            threshold=curr_threshold)
        while len(find_kmers) > 0:
            print(iteration, len(find_kmers))
            next_kmer = km.get_non_zero_min_kmer_in_kmers(find_kmers)
            if next_kmer is None:
                print(
                    "No more reads to cover found kmers: threshold {}".format(
                        curr_threshold))
                increase_threshold = True
                if curr_threshold >= 10:
                    increase_threshold = False
                break
            next_read_index, next_read = km.get_read(next_kmer)
            if next_read is None:
                # NOTE(review): this break leaves increase_threshold True, so the outer loop
                # goes on with the next threshold -- confirm it cannot repeat forever.
                print("Whoops, something is wrong")
                break
            keep_kmer_map.add_read(next_read)
            km.remove_read(next_read_index)
            find_kmers = keep_kmer_map.get_threshold_uncovered_kmers(
                threshold=curr_threshold)
            iteration += 1
        print("Exited first while")
        if len(find_kmers) == 0:
            print("Found reads covering all kmers at threshold {}".format(
                curr_threshold))
        # The file prefix is "some" only in the round that ends the outer loop, "all" otherwise.
        file_path = os.path.join(
            output_dir, "{}_reads_covering_kmers_with_threshold_{}.txt".format(
                "all" if increase_threshold else "some", curr_threshold))
        with open(file_path, "w") as fh:
            print("\n".join([read.read_id for read in keep_kmer_map.reads]),
                  file=fh)
        kmer_counts_file_path = os.path.join(
            output_dir, "{}_kmer_counts_with_threshold_{}.txt".format(
                "all" if increase_threshold else "some", curr_threshold))
        with open(kmer_counts_file_path, "w") as fh:
            print("\n".join([
                "\t".join([kmer, str(count)])
                for kmer, count in keep_kmer_map.kmer_counts.items()
            ]),
                  file=fh)
from py3helpers.seq_tools import ReferenceHandler, ReverseComplement
from scipy.stats import norm, invgauss, entropy
# needs pyranges "conda install -c bioconda pyranges"
import pyranges as pr

# Hard-coded input and output locations of this script.
OUTPUT_DIR = "/home/ubuntu/mount/download/RNA_rel2/reference"
REFERENCE = "/home/ubuntu/mount/download/RNA_rel2/reference/gencode.v27.transcripts.fa"
# Numeric settings; presumably consumed further down in the script.
p_lambda = 50
delta = 6

# Stop at import time when either location is missing.
assert os.path.isdir(OUTPUT_DIR), "{} is not a directory".format(OUTPUT_DIR)
assert os.path.exists(REFERENCE), "{} does not exist".format(REFERENCE)

# reference handler and reverse complement handler
rh = ReferenceHandler(REFERENCE)
rc = ReverseComplement()
transcript_strings = rh.fasta.references
# Full sequence (index 0 up to its length) of every reference entry, keyed by entry name.
transcript_data = {
    name: rh.get_sequence(name, 0, rh.get_chr_sequence_length(name))
    for name in rh.fasta.references
}


def get_base(transcript, pos):
    """Return the element at index 'pos' of the cached sequence for 'transcript'.

    :param transcript: key into the module-level 'transcript_data' mapping
    :param pos: index into that transcript's sequence
    :return: the element stored at that index
    :raises KeyError: 'transcript' is not a key of 'transcript_data'
    :raises IndexError: 'pos' is outside the sequence
    """
    try:
        return transcript_data[transcript][pos]
    except (KeyError, IndexError):
        # Report which lookup failed, then let the caller see the real error. Previously the
        # handler printed an undefined name and execution fell through to an unbound 'base'.
        print(transcript, pos)
        raise
Example #14
0
def main():
    """Count how often each kmer is covered by the positions listed in variant call files.

    All inputs are hard-coded below. For every variant call csv found under the configured
    directories, the reference window around each (contig, reference_index, strand) row is
    fetched and cut into kmers of KMER_LENGTH; per-name and overall coverage is printed and
    written as tsv files into OUTPUT_DIR.
    """
    OUTPUT_DIR = "/home/ubuntu/ecoli_methylation_analysis/kmer_analysis"
    # False means "no positions file": the base replacement step in get_kmer is skipped
    positions_data = False
    keys = ["contig", "reference_index", "strand"]
    # Alternative configurations, kept for reference:
    # RNA canonical
    # REFERENCE = "/home/ubuntu/mount/download/RNA_rel2/reference/gencode.v27.transcripts.fa"
    # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/OICR_runs/all_runs/", "/home/ubuntu/mount/UBC_runs/all_runs/"]
    # # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/OICR_runs/test/", "/home/ubuntu/mount/OICR_runs/test/"]
    # VARIANT_NAMES = ["/variant_calls/na12878_OICR_RNA_canonical.csv", "/variant_calls/na12878_UBC_RNA_canonical.csv"]
    # ALPHABET = "ATGC"
    # KMER_LENGTH = 5
    # NAMES = ["OICR", "UBC"]
    #
    # # DNA canonical
    # REFERENCE = "/home/ubuntu/bisulfite_methylation_analysis/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"
    # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/FAB39088_runs/canonical_calling/all_runs/",
    #                      "/home/ubuntu/mount/FAF01169_runs/canonical_calling/all_runs/"]
    # VARIANT_NAMES = ["/variant_calls/variant_calls.csv", "/variant_calls/variant_calls.csv"]
    # ALPHABET = "ATGC"
    # KMER_LENGTH = 6
    # NAMES = ["FAB39088_canonical", "FAF01169_canonical"]

    # DNA mod
    # REFERENCE = "/home/ubuntu/bisulfite_methylation_analysis/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"
    # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/FAB39088_runs/cpg_calling/all_runs/",
    #                      "/home/ubuntu/mount/FAF01169_runs/cpg_calling/all_runs/"]
    # NAMES = ["FAB39088_methyl", "FAF01169_methyl"]
    # VARIANT_NAMES = ["/variant_calls/variant_calls.csv", "/variant_calls/variant_calls.csv"]
    # ALPHABET = "ATGCM"
    # KMER_LENGTH = 6
    # POSITIONS_FILE = "/home/ubuntu/bisulfite_methylation_analysis/positions/all_mC.positions"
    # positions_data = pd.read_csv(POSITIONS_FILE, names=["contig", "reference_index", "strand", "find", "replace"],
    #                              sep="\t")

    # ECOLI MOD (the active configuration)
    REFERENCE = "/home/ubuntu/ecoli_methylation_analysis/reference/ecoli.fa"
    VARIANT_HOME_DIRS = [
        "/home/ubuntu/ecoli_methylation_analysis/signalalign_output/"
    ]
    NAMES = ["variant_calls"]
    VARIANT_NAMES = ["ecoli_dna_baseline_ATCGMQ_sa.model.csv"]
    ALPHABET = "ATGCM"
    KMER_LENGTH = 6
    # POSITIONS_FILE = "/home/ubuntu/ecoli_methylation_analysis/kmer_analysis/all.positions"
    # positions_data = pd.read_csv(POSITIONS_FILE, names=["contig", "reference_index", "strand", "find", "replace"],
    #                              sep="\t")

    if positions_data is not False:
        # index over (contig, reference_index, strand), used to select the rows matching a variant file
        i2 = positions_data.set_index(keys).index

    assert os.path.exists(REFERENCE), "{} does not exist".format(REFERENCE)
    assert os.path.isdir(OUTPUT_DIR), "{} is not a directory".format(
        OUTPUT_DIR)
    rh = ReferenceHandler(REFERENCE)
    rc = ReverseComplement()
    # overall counter with one entry per string produced by all_string_permutations
    kmers = {k: 0 for k in all_string_permutations(ALPHABET, KMER_LENGTH)}
    # paths[i] lists the variant files found for VARIANT_HOME_DIRS[i]
    paths = []
    for home_dir, variant_name in zip(VARIANT_HOME_DIRS, VARIANT_NAMES):
        assert os.path.isdir(home_dir), "{} is not a directory".format(
            home_dir)
        home_dir_paths = os.listdir(home_dir)
        # keep only the sub-directories that actually contain the variant file
        tmp_paths = [
            os.path.join(home_dir, x, variant_name) for x in home_dir_paths
            if os.path.exists(os.path.join(home_dir, x, variant_name))
        ]

        assert len(
            tmp_paths
        ) > 0, "Check inputs, there are no paths which exist: {}".format(
            home_dir)
        paths.append(tmp_paths)

    def get_kmer(chromosome, pos, strand):
        """Return the reference window [pos - KMER_LENGTH + 1, pos + KMER_LENGTH) for one row.

        The window is reverse complemented for the "-" strand. When positions data is
        loaded and has a row for this contig/position/strand, the character at index
        KMER_LENGTH - 1 is swapped for that row's 'replace' value. 'read_pos_data' is
        assigned by the processing loop further down, before this function is called.
        """
        try:
            seq = rh.get_sequence(chromosome, (pos - KMER_LENGTH) + 1,
                                  pos + KMER_LENGTH)
            if strand == "-":
                seq = rc.reverse_complement(seq)
            if positions_data is not False:
                replace = read_pos_data[
                    (read_pos_data["contig"] == chromosome)
                    & (read_pos_data["reference_index"] == pos) &
                    (read_pos_data["strand"] == strand)]
                if not replace.empty:
                    seq = seq[:KMER_LENGTH - 1] + replace.iloc[0][
                        "replace"] + seq[KMER_LENGTH:]

        except Exception as e:
            print(e, chromosome, pos, strand)
        # NOTE(review): if rh.get_sequence itself raised, 'seq' is unbound at this point.
        return seq

    # characters and width of the text progress bar
    void = '-'
    fill = '#'
    n_spaces = 100
    n_files = 0
    for variant_set, name in zip(paths, NAMES):
        n_paths = len(variant_set)
        # progress bar advance per processed file
        count = n_spaces / n_paths
        increaseCount = 0
        print("Starting on {}".format(name))
        # counter restricted to the files of this name
        local_kmers = {
            k: 0
            for k in all_string_permutations(ALPHABET, KMER_LENGTH)
        }
        for variant_path in variant_set:
            print('[' + (fill * int(increaseCount)) +
                  (void * int(n_spaces - increaseCount)) + '] ' +
                  str(int(increaseCount)) + '%',
                  end='\r')
            increaseCount += count
            variant_data = pd.read_csv(variant_path)
            if positions_data is not False:
                i1 = variant_data.set_index(keys).index
                read_pos_data = positions_data[i2.isin(i1)]
            # Example row of a variant call file:
            #         read_id            028a34d4-2a7a-44e7-ab23-305915996ec8
            #         contig                                          RDN18-1
            #         reference_index                                     973
            #         strand                                                +
            #         variants                                             Aa
            #         prob1                                          0.986967
            #         prob2                                          0.013033
            #         prob3                                               NaN
            variant_data['next_base'] = np.vectorize(get_kmer)(
                variant_data['contig'], variant_data['reference_index'],
                variant_data['strand'])
            # every distinct window is counted once per file
            large_kmers = set(variant_data['next_base'])
            for l_kmer in large_kmers:
                for i in range(KMER_LENGTH):
                    k = l_kmer[i:KMER_LENGTH + i]
                    if len(k) == KMER_LENGTH:
                        # NOTE(review): a kmer that is not a key of these dicts raises KeyError here.
                        kmers[k] += 1
                        local_kmers[k] += 1

        print('[' + (fill * int(increaseCount)) +
              (void * int(n_spaces - increaseCount)) + '] ' +
              str(int(increaseCount)) + '%',
              end='\n')

        total_zeros = 0
        for x, y in local_kmers.items():
            if y == 0:
                total_zeros += 1
        n_files += n_paths
        print("{} Kmers Covered: {}/{}".format(name,
                                               len(local_kmers) - total_zeros,
                                               len(local_kmers)))
        # NOTE(review): divides by the number of covered kmers, which is 0 when nothing was covered.
        print("{} Average coverage: {:.4}".format(
            name,
            np.sum(list(local_kmers.values())) /
            (len(local_kmers) - total_zeros)))
        # per-name output: each count divided by the number of files of this name
        with open(os.path.join(OUTPUT_DIR, name + ".tsv"), 'w') as fh:
            print("\n".join([
                "\t".join([x, str(y / n_paths)])
                for x, y in local_kmers.items()
            ]),
                  file=fh)

    total_zeros = 0
    for x, y in kmers.items():
        if y == 0:
            total_zeros += 1
    print("TOTAL Kmers Covered: {}/{}".format(
        len(kmers) - total_zeros, len(kmers)))
    # NOTE(review): the printed average is divided by n_files / 2 while the file below divides by n_files.
    print("TOTAL Average coverage: {}".format(
        np.average(list(kmers.values())) / (n_files / 2)))
    with open(os.path.join(OUTPUT_DIR, "total_" + "_".join(NAMES) + ".tsv"),
              'w') as fh:
        print("\n".join(
            ["\t".join([x, str(y / n_files)]) for x, y in kmers.items()]),
              file=fh)
Example #15
0
def resegment_reads(fast5_path,
                    params=None,
                    speedy=False,
                    overwrite=True,
                    analysis_path="ReSegmentBasecall_000"):
    """Re-segment and create anchor alignment from previously base-called fast5 file

    The new event table and a fastq generated from it are both written into the fast5
    file at the analysis location built from 'analysis_path'.

    :param fast5_path: path to fast5 file
    :param params: event detection parameters (defaults for the chosen algorithm are used when None)
    :param speedy: boolean option for speedyStatSplit or minknow
    :param overwrite: overwrite a previous event re-segmented event table
    :param analysis_path: name of key where events table will be placed (Analyses/'name'/Events)
    :return: the still-open Fast5 handle when completed (the caller is responsible for closing it),
             or None if the file has no basecall data
    :raises AssertionError: if fast5_path is not an existing file
    """
    assert os.path.isfile(fast5_path), "File does not exist: {}".format(
        fast5_path)
    # create Fast5 object and sanity check
    f5fh = Fast5(fast5_path, read='r+')
    if not f5fh.has_basecall_data():
        f5fh.close()
        return None

    # The handle is opened read/write; make sure it is released if any step
    # below fails instead of leaking an open file to the caller.
    try:
        # gather previous event detection
        old_event_table = f5fh.get_basecall_data()

        read_id = bytes.decode(f5fh.raw_attributes['read_id'])
        sampling_freq = f5fh.sample_rate
        start_time = f5fh.raw_attributes['start_time']

        # get params
        if params is None:
            params = get_default_event_detection_params(
                EVENT_DETECT_SPEEDY if speedy else EVENT_DETECT_MINKNOW)

        # pick event detection algorithm
        if speedy:
            detect_events = create_speedy_event_table
            algorithm_name = "speedy_stat_split"
        else:
            detect_events = create_minknow_event_table
            algorithm_name = "minknow_event_detect"

        signal = f5fh.get_read(raw=True, scale=True)
        event_table = detect_events(signal, sampling_freq, start_time,
                                    **params)
        # record which algorithm produced the events alongside its parameters
        params = merge_dicts([params, {"event_detection": algorithm_name}])

        # metadata
        keys = ["nanotensor version", "time_stamp"]
        values = ["0.2.0", TimeStamp().posix_date()]
        attributes = merge_dicts(
            [params, dict(zip(keys, values)), f5fh.raw_attributes])

        # do resegmentation
        if f5fh.is_read_rna():
            old_event_table = index_to_time(old_event_table,
                                            sampling_freq=sampling_freq,
                                            start_time=start_time)
        new_event_table = create_anchor_kmers(new_events=event_table,
                                              old_events=old_event_table)

        # get destination in fast5
        # todo find latest location? ie: save_event_table_and_fastq(..)
        destination = f5fh._join_path(f5fh.__base_analysis__, analysis_path)

        f5fh.set_event_table(destination,
                             new_event_table,
                             attributes,
                             overwrite=overwrite)

        # gather new sequence
        sequence = sequence_from_events(new_event_table)
        if f5fh.is_read_rna():
            # RNA sequence is reversed and written with U instead of T
            sequence = ReverseComplement().reverse(sequence)
            sequence = sequence.replace("T", "U")
        # no real base qualities are available: use the lowest score everywhere
        quality_scores = '!' * len(sequence)
        fastq = create_fastq_line(read_id + " :", sequence, quality_scores)

        # set fastq
        f5fh.set_fastq(destination, fastq, overwrite=overwrite)
    except Exception:
        f5fh.close()
        raise
    return f5fh
Example #16
0
    def setUpClass(cls):
        """Prepare shared fixtures: copy test reads to a temp dir, run signalAlign on them and build label handles."""
        super(CreateLabelsTest, cls).setUpClass()
        # repository root is four path components above this test file
        cls.HOME = '/'.join(os.path.abspath(__file__).split("/")[:-4])

        def in_home(relative_path):
            """Resolve a path relative to cls.HOME."""
            return os.path.join(cls.HOME, relative_path)

        cls.fasta = in_home("tests/test_sequences/E.coli_K12.fasta")

        # original read files shipped with the repository
        source_dna = in_home(
            "tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch112_read108_strand.fast5")
        source_rev_dna = in_home(
            "tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch6_read347_strand.fast5")
        source_rev_rna = in_home(
            "tests/minion_test_reads/RNA_no_events/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_61_ch_151_strand.fast5")
        source_forward_rna = in_home(
            "tests/minion_test_reads/RNA_no_events/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_36_ch_218_strand.fast5")

        # reference sequences
        rna_ref_path = in_home("tests/test_sequences/fake_rna_ref.fa")
        ecoli_ref_path = in_home("tests/test_sequences/E.coli_K12.fasta")
        cls.dna_reference_handle = pysam.FastaFile(ecoli_ref_path)
        cls.rna_reference_handle = pysam.FastaFile(rna_ref_path)

        # scratch copies of the reads live in a fresh temporary directory
        cls.tmp_directory = tempfile.mkdtemp()
        scratch_dir = str(cls.tmp_directory)
        cls.tmp_dna_file = os.path.join(scratch_dir, 'test_dna.fast5')
        cls.tmp_dna_file2 = os.path.join(scratch_dir, 'test_dna2.fast5')
        cls.tmp_rna_file1 = os.path.join(scratch_dir, 'test_rna.fast5')
        cls.tmp_rna_file2 = os.path.join(scratch_dir, 'test_rna2.fast5')

        # models, alignments and binaries needed to run signalAlign
        cls.rna_model_file = in_home("models/testModelR9p4_5mer_acgt_RNA.model")
        cls.dna_model_file_94 = in_home("models/testModelR9p4_5mer_acegt_template.model")
        cls.rna_sam = in_home("tests/minion_test_reads/RNA_edge_cases/rna_reads.bam")
        cls.dna_sam = in_home("tests/minion_test_reads/oneD.bam")
        cls.bin_path = in_home("bin")
        # kmer index
        cls.kmer_index = 2

        # copy every source read to its scratch location
        copy_plan = (
            (source_dna, cls.tmp_dna_file),
            (source_rev_dna, cls.tmp_dna_file2),
            (source_forward_rna, cls.tmp_rna_file1),
            (source_rev_rna, cls.tmp_rna_file2),
        )
        for origin, target in copy_plan:
            shutil.copy(origin, target)

        # align both RNA reads
        rna_args = create_signalAlignment_args(destination=cls.tmp_directory,
                                               in_templateHmm=cls.rna_model_file,
                                               alignment_file=cls.rna_sam,
                                               forward_reference=rna_ref_path,
                                               embed=True,
                                               path_to_bin=cls.bin_path,
                                               diagonal_expansion=5,
                                               delete_tmp=False)
        for rna_read in (cls.tmp_rna_file1, cls.tmp_rna_file2):
            rna_aligner = SignalAlignment(**merge_dicts([rna_args, {'in_fast5': rna_read}]))
            rna_aligner.run()

        # align both DNA reads
        dna_args = create_signalAlignment_args(destination=cls.tmp_directory,
                                               in_templateHmm=cls.dna_model_file_94,
                                               alignment_file=cls.dna_sam,
                                               forward_reference=ecoli_ref_path,
                                               embed=True,
                                               path_to_bin=cls.bin_path,
                                               diagonal_expansion=10,
                                               traceBackDiagonals=100,
                                               constraint_trim=3)
        for dna_read in (cls.tmp_dna_file, cls.tmp_dna_file2):
            dna_aligner = SignalAlignment(**merge_dicts([dna_args, {'in_fast5': dna_read}]))
            dna_aligner.run()

        # label handles for every aligned read
        cls.dna_handle = CreateLabels(cls.tmp_dna_file, kmer_index=cls.kmer_index)
        cls.dna_handle2 = CreateLabels(cls.tmp_dna_file2, kmer_index=cls.kmer_index)
        cls.rna1_handle = CreateLabels(cls.tmp_rna_file1, kmer_index=cls.kmer_index)
        cls.rna2_handle = CreateLabels(cls.tmp_rna_file2, kmer_index=cls.kmer_index)
        cls.rev_comp = ReverseComplement()

        # this read is used in place from the repository (not copied to the temp dir)
        cls.tmp_dna_file3 = in_home(
            "tests/minion_test_reads/embedded_files/miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch100_read2324_strand.fast5")
        cls.dna3_handle = CreateLabels(cls.tmp_dna_file3, kmer_index=cls.kmer_index)
Example #17
0
def create_labels_from_guide_alignment(events,
                                       sam_string,
                                       rna=False,
                                       reference_path=None,
                                       kmer_index=2,
                                       one_ref_indexing=False):
    """Create labeled signal from a guide alignment with only matches being reported

    Aligned positions are grouped into blocks of consecutive reference positions; a new
    block is started whenever the reference index does not directly follow the previous one.

    :param events: structured numpy array of events with the fields 'raw_start', 'model_state',
                   'p_model_state', 'raw_length' and 'move'
    :param sam_string: sam alignment string
    :param rna: if read is rna, reverse again
    :param reference_path: if sam_string has MDZ field the reference sequence can be inferred, otherwise, it is needed
    :param kmer_index: index of the kmer to select for reference to event mapping
    :param one_ref_indexing: boolean zero or 1 based indexing for reference
    :return: list of structured numpy arrays (one per block) with the fields 'raw_start',
             'raw_length', 'reference_index', 'posterior_probability' and 'kmer'
    """
    # test if the required fields are in structured numpy array
    check_numpy_table(events,
                      req_fields=('raw_start', 'model_state', 'p_model_state',
                                  'raw_length', 'move'))
    assert type(one_ref_indexing) is bool, "one_ref_indexing must be a boolean"

    psam_h = initialize_pysam_wrapper(sam_string,
                                      reference_path=reference_path)
    # create an indexed map of the events and their corresponding bases
    bases, base_raw_starts, base_raw_lengths, probs = index_bases_from_events(
        events, kmer_index=kmer_index)

    # check if string mapped to reverse strand
    if psam_h.alignment_segment.is_reverse:
        # starts, lengths and probabilities are all looked up with the same
        # query index below, so they have to be flipped together; leaving the
        # lengths un-reversed paired each raw_start with another base's length
        probs = probs[::-1]
        base_raw_starts = base_raw_starts[::-1]
        base_raw_lengths = base_raw_lengths[::-1]
        # rna reads go 3' to 5' so we dont need to reverse if it mapped to reverse strand
        if not rna:
            bases = ReverseComplement().reverse(''.join(bases))
    # reverse if it mapped to forward strand and RNA
    elif rna:
        bases = ReverseComplement().reverse(''.join(bases))
    # NOTE(review): 'bases' is not read again below (kmers come from the
    # reference base of each match), so 'rna' currently has no effect on the
    # returned labels -- confirm whether that is intended.

    # all 'matches' and 'mismatches'
    matches_map = psam_h.seq_alignment.matches_map
    # zero indexed reference start
    ref_start = psam_h.alignment_segment.reference_start + one_ref_indexing
    # set labels
    raw_start = []
    raw_length = []
    reference_index = []
    kmer = []
    posterior_probability = []
    cigar_labels = []
    prev = matches_map[0].reference_index
    for i, alignment in enumerate(matches_map):
        if i != 0 and alignment.reference_index != prev + 1:
            # gap in the reference: close the current block and start a new one
            cigar_labels.append(
                _build_cigar_label(raw_start, raw_length, reference_index,
                                   kmer, posterior_probability))
            raw_start = []
            raw_length = []
            reference_index = []
            kmer = []
            posterior_probability = []
        raw_start.append(base_raw_starts[alignment.query_index])
        raw_length.append(base_raw_lengths[alignment.query_index])
        reference_index.append(alignment.reference_index + ref_start)
        kmer.append(alignment.reference_base)
        posterior_probability.append(probs[alignment.query_index])
        # keep track of reference positions
        prev = alignment.reference_index

    # catch the last label
    cigar_labels.append(
        _build_cigar_label(raw_start, raw_length, reference_index, kmer,
                           posterior_probability))

    return cigar_labels


def _build_cigar_label(raw_start, raw_length, reference_index, kmer,
                       posterior_probability):
    """Pack the parallel label lists of one block into a structured numpy array."""
    cigar_label = np.zeros(len(raw_start),
                           dtype=[('raw_start', int), ('raw_length', int),
                                  ('reference_index', int),
                                  ('posterior_probability', float),
                                  ('kmer', 'S5')])
    cigar_label['raw_start'] = raw_start
    cigar_label['raw_length'] = raw_length
    cigar_label['reference_index'] = reference_index
    cigar_label['kmer'] = kmer
    cigar_label['posterior_probability'] = posterior_probability
    return cigar_label
def main():
    """Report how many 6-mers are covered by the windows around the positions in a positions file.

    Reads the tab separated positions file (columns: chr, start, strand, find, replace),
    extracts the sequence window around every position from the reference and prints
    coverage counts, first for positions whose replacement is "M" and then for
    positions whose replacement is "C".
    """
    args = parse_args()
    # fail early when any of the required paths is missing
    assert os.path.isdir(args.output_dir), "{} is not a directory".format(
        args.output_dir)
    assert os.path.exists(args.reference), "{} does not exist".format(
        args.reference)
    assert os.path.exists(args.positions_file), "{} does not exist".format(
        args.positions_file)

    # the positions file has no header row; column names are supplied here
    positions_data = pd.read_csv(
        args.positions_file,
        names=["chr", "start", "strand", "find", "replace"],
        sep="\t")
    # placeholder column, filled per subset below
    positions_data["kmer"] = np.nan
    # reference handler and reverse complement handler
    rh = ReferenceHandler(args.reference)
    rc = ReverseComplement()
    # sequence of every reference in the fasta, requested from 0 up to the
    # length reported by the reference handler
    chromosome_data = {
        chromosome: rh.get_sequence(chromosome, 0,
                                    rh.get_chr_sequence_length(chromosome))
        for chromosome in rh.fasta.references
    }

    alphabet = "ACGMT"
    kmer_length = 6

    def get_kmer(chromosome, pos, strand, replace=None):
        """Return the window of up to 2 * kmer_length - 1 bases centered on 'pos'.

        The window is reverse complemented for the "-" strand and, when 'replace'
        is given, its center character (index kmer_length - 1) is swapped for it.
        """
        try:
            seq = chromosome_data[chromosome][(pos - kmer_length) + 1:pos +
                                              kmer_length]
            if strand == "-":
                seq = rc.reverse_complement(seq)
            if replace is not None:
                seq = seq[:kmer_length - 1] + replace + seq[kmer_length:]
        except Exception as e:
            print(e, chromosome, pos, strand)
        # NOTE(review): if the slice above raised (e.g. unknown chromosome),
        # 'seq' was never bound and this return raises UnboundLocalError.
        return seq

    # positions that are replaced by "M"
    mod_pos_data = positions_data.loc[positions_data['replace'] == "M"].copy()
    mod_pos_data.loc[:,
                     "kmer"] = np.vectorize(get_kmer)(mod_pos_data['chr'],
                                                      mod_pos_data['start'],
                                                      mod_pos_data['strand'],
                                                      "M")

    # presumably one zero-initialized counter per length-6 string over the
    # alphabet -- confirm against all_string_permutations
    kmers = {k: 0 for k in all_string_permutations(alphabet, kmer_length)}
    # each distinct window is counted once
    large_kmers = set(mod_pos_data['kmer'])
    for l_kmer in large_kmers:
        # slide a kmer_length wide window over the extracted sequence
        for i in range(kmer_length):
            k = l_kmer[i:kmer_length + i]
            # windows cut short at a sequence edge are skipped
            if len(k) == kmer_length:
                # NOTE(review): a kmer that is not a key of 'kmers' raises
                # KeyError here; unlike the "C" subset below, these windows
                # are not filtered for N/W/Y first.
                kmers[k] += 1
    # kmers containing exactly one "M": all possible ones vs. the ones seen
    m_kmers = [x for x, y in kmers.items() if x.count("M") == 1]
    found_m_only_kmers = {
        x: y
        for x, y in kmers.items() if y > 0 and x.count("M") == 1
    }
    print(f"Number of M kmers: {len(m_kmers)}")
    print(f"Number of found M kmers: {len(found_m_only_kmers)}")

    # positions that are replaced by "C"
    c_pos_data = positions_data.loc[positions_data['replace'] == "C"].copy()
    c_pos_data.loc[:,
                   'kmer'] = np.vectorize(get_kmer)(c_pos_data['chr'],
                                                    c_pos_data['start'],
                                                    c_pos_data['strand'], "C")
    # drop windows that contain N, W or Y
    filter_c_pos_data = c_pos_data[~c_pos_data["kmer"].str.contains(
        '|'.join(["N", "W", "Y"]), regex=True)]

    # fresh counters for the "C" subset
    kmers = {k: 0 for k in all_string_permutations(alphabet, kmer_length)}
    large_kmers = set(filter_c_pos_data['kmer'])
    for l_kmer in large_kmers:
        for i in range(kmer_length):
            k = l_kmer[i:kmer_length + i]
            if len(k) == kmer_length:
                kmers[k] += 1
    # kmers with at least one "C" and no "M": all possible ones vs. the ones seen
    no_m_kmers = [
        x for x, y in kmers.items() if x.count("M") == 0 and x.count("C") > 0
    ]
    found_no_m_kmers = {
        x: y
        for x, y in kmers.items()
        if y > 0 and x.count("M") == 0 and x.count("C") > 0
    }
    print(f"Number of Canonical kmers: {len(no_m_kmers)}")
    print(f"Number of found Canonical kmers: {len(found_no_m_kmers)}")