Ejemplo n.º 1
0
    def test_init(self):
        '''Cigar() rejects malformed strings and round-trips a valid one via str()'''
        # Each of these must make the constructor raise cigar.Error.
        malformed = ("5M2", "H5M2X")
        for bad_cigar in malformed:
            with self.assertRaises(cigar.Error):
                cigar.Cigar(bad_cigar)

        # A well-formed string must come back unchanged from str().
        valid_cigar = '5S2M3I1M1D4H'
        round_tripped = str(cigar.Cigar(valid_cigar))
        self.assertEqual(valid_cigar, round_tripped)
Ejemplo n.º 2
0
def acquire_clip_pos(deal_cigar):
    """Summarise the soft clips and reference span of a CIGAR string.

    Returns a list ``[first_pos, last_pos, bias]`` where ``first_pos`` is the
    length of the first operation if it is 'S' (else 0), ``last_pos`` is the
    length of the last operation if it is 'S' (else 0), and ``bias`` is the
    summed length of all 'M' and 'D' operations.
    """
    ops = list(cigar.Cigar(deal_cigar).items())

    # Each item is (length, operation); only the two ends can be soft clips.
    first_pos = ops[0][0] if ops[0][1] == 'S' else 0
    last_pos = ops[-1][0] if ops[-1][1] == 'S' else 0

    bias = sum(op[0] for op in ops if op[1] in ('M', 'D'))

    return [first_pos, last_pos, bias]
Ejemplo n.º 3
0
def acquire_clip_pos(deal_cigar):
    '''
    Break a CIGAR string down into [leading soft clip, trailing soft clip,
    total length of the M and D operations].
    '''
    ops = list(cigar.Cigar(deal_cigar).items())

    head_len, head_op = ops[0]
    tail_len, tail_op = ops[-1]

    first_pos = 0
    if head_op == 'S':
        first_pos = head_len

    last_pos = 0
    if tail_op == 'S':
        last_pos = tail_len

    # Only 'M' and 'D' operations contribute to the bias.
    bias = sum(length for length, op in ops if op == 'M' or op == 'D')
    return [first_pos, last_pos, bias]
Ejemplo n.º 4
0
def analysis_cigar(deal_cigar, ins_l):
    """Return 1 when the end soft clips are small relative to ``ins_l``, else 0.

    The lengths of a leading and a trailing 'S' operation are added up; the
    result is 0 when four times that total exceeds ``ins_l`` and 1 otherwise.
    """
    ops = list(cigar.Cigar(deal_cigar).items())

    # Look at the first and the last operation only (index 0 and -1).
    clipped = sum(ops[end][0] for end in (0, -1) if ops[end][1] == 'S')

    return 0 if clipped * 4 > ins_l else 1
Ejemplo n.º 5
0
def acquire_clip_pos(deal_cigar):
	"""Return [leading soft clip, trailing soft clip, summed M/D length].

	A clip length is 0 when the corresponding end operation is not 'S'.
	"""
	ops = list(cigar.Cigar(deal_cigar).items())

	# Default to "no clip" and overwrite only when the end operation is 'S'.
	first_pos = last_pos = 0
	if ops[0][1] == 'S':
		first_pos = ops[0][0]
	if ops[-1][1] == 'S':
		last_pos = ops[-1][0]

	counted_ops = ('M', 'D')
	bias = sum(entry[0] for entry in ops if entry[1] in counted_ops)
	return [first_pos, last_pos, bias]
Ejemplo n.º 6
0
    def test_get_differences_from_ref(self):
        '''check get_differences_from_ref finds the correct differences'''
        ref = fastn.Fasta('ID', 'ACGTACGTACGT')

        # One row per case: (cigar string, read sequence, expected differences).
        # Keeping input and expectation together prevents the two from
        # drifting out of step when a case is added or removed.
        cases = [
            ("12M", 'ACGTACGTACGT', []),
            ("12M", 'AGGTACGTACGT', [(1, 'S', 'C/G', 1)]),
            ("1S12M", 'AAGGTACGTACGT', [(1, 'S', 'C/G', 1)]),
            ("1S12M1S", 'AAGGTACGTACGTA', [(1, 'S', 'C/G', 1)]),
            ("1M1I10M", 'AiCGTACGTACGT', [(1, 'I', 'i', 1)]),
            ("3M1I3M1D3M", 'AGGiTACTACGT',
             [(1, 'S', 'C/G', 1), (3, 'I', 'i', 1), (6, 'D', 'G', 1)]),
            ("2S3M1I3M1D3M5S", 'ssAGGiTACTACGTsssss',
             [(1, 'S', 'C/G', 1), (3, 'I', 'i', 1), (6, 'D', 'G', 1)]),
        ]

        for cigar_string, read_seq, expected in cases:
            found = cigar.Cigar(cigar_string).get_differences_from_ref(read_seq, ref)
            self.assertListEqual(found, expected)
Ejemplo n.º 7
0
    def __init__(self, line):
        '''Parse one tab-separated SAM alignment line into attributes.

        The first eleven fields are stored as id, flag, rname, pos, mapq,
        cigar, mrname, mpos, isize, seq and qual; any remaining fields are
        kept verbatim in tags_list and parsed into the tags dict as
        {tag: (type, value)}, with the value converted to int for type 'i'.
        flag, mapq and isize are converted to int, pos and mpos are converted
        to int and decremented by one, and cigar is wrapped in cigar.Cigar.

        Raises Error (chained to the underlying exception) when the line
        cannot be parsed.
        '''
        # example line:
        # HS4_6280:2:1104:12102:124607  99  PyYM_01_v1  1   47  2S73M   =   362 438 TGTTAAAAATATCATTTATATAATATAATTAAAATTATTTATTTTTAGATATTATAATATTATGAATAATAGTAT HHHHHHHHHHHHHHHHHHGHHHHHHHHHHHGFHHHHHHHEHHHHHHHFCHFHHHHGFHCHHFHFAFFEFECF@BF AS:i:73
        try:
            (self.id, self.flag, self.rname, self.pos, self.mapq, self.cigar,
             self.mrname, self.mpos, self.isize, self.seq, self.qual, *self.tags_list) = line.rstrip().split('\t')

            self.pos = int(self.pos) - 1
            self.flag = int(self.flag)
            self.mapq = int(self.mapq)
            self.cigar = cigar.Cigar(self.cigar)
            self.mpos = int(self.mpos) - 1
            self.isize = int(self.isize)
            self.tags = {}
            for tag_field in self.tags_list:
                # maxsplit=2 so that a value containing ':' stays in one piece
                (tag_name, tag_type, value) = tag_field.split(':', 2)
                if tag_type == 'i':
                    value = int(value)
                self.tags[tag_name] = (tag_type, value)
        except Exception as err:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit still propagate, and keep the root cause attached.
            raise Error('Error reading this sam line:\n' + line) from err
Ejemplo n.º 8
0
def clip_analysis(deal_cigar, clipping_threshold):
    """Return 1 when the clipped fraction of a CIGAR is below the threshold.

    The clipped length is the first operation's length if it is 'S' plus the
    last operation's length if it is 'S'. It is divided by the summed length
    of every operation; the result is 0 when that ratio is at least
    ``clipping_threshold`` (or when the summed length is 0) and 1 otherwise.
    """
    ops = list(cigar.Cigar(deal_cigar).items())

    head_clip = ops[0][0] if ops[0][1] == 'S' else 0
    tail_clip = ops[-1][0] if ops[-1][1] == 'S' else 0

    signal_len = sum(op[0] for op in ops)
    if signal_len == 0:
        # Nothing to divide by.
        return 0

    clipped_fraction = (head_clip + tail_clip) * 1.0 / signal_len
    return 0 if clipped_fraction >= clipping_threshold else 1
Ejemplo n.º 9
0
def pysam_cigar(x):
    """Convert a CIGAR string into a pysam-style list of (op_code, length).

    Each (length, op) item produced by ``cr.Cigar(x).items()`` is turned into
    a tuple whose first element is the numeric operation code and whose
    second element is the length. The code table follows the BAM operation
    order "MIDNSHP=XB" (M=0 ... B=9), so strings using 'P', '=', 'X' or 'B'
    are converted as well instead of raising KeyError.

    Raises KeyError for an operation character outside that table.
    """
    op_codes = {op: code for code, op in enumerate('MIDNSHP=XB')}
    return [(op_codes[op], length) for length, op in cr.Cigar(x).items()]
Ejemplo n.º 10
0
def extract_vcf_records(
        sample_name,
        # input paths
        alignments_path,
        contigs_path,
        ref_fasta_path,
        vcf_template_path,
        # output paths
        vcf_out_path,
        selected_contigs_path,
        flanked_contigs_path,
        flank_length,
        min_insert_size):
    """Write a VCF record for every large insertion in the contig alignments.

    Reads the space-separated alignment table at ``alignments_path``, skips
    rows whose ``Hit`` is greater than 0, walks the CIGAR of every remaining
    row and, for each 'I' operation longer than ``min_insert_size``:

    * writes a record to ``vcf_out_path`` (header taken from
      ``vcf_template_path`` with ``sample_name`` as the only sample);
    * writes the whole contig to ``selected_contigs_path`` (once per contig);
    * writes the inserted bases, surrounded by ``flank_length`` reference
      bases on each side, to ``flanked_contigs_path``.

    ``QName`` values must split on "_" into exactly six fields
    (chrom, start, end, phase set, phase, one further field); the first two
    characters of the phase-set and phase fields are dropped.

    All output handles are closed on exit, including on error.

    Returns the number of VCF records written.
    """
    n_records = 0
    ref_fasta = pysam.FastaFile(ref_fasta_path)
    contig_fasta = pysam.FastaFile(contigs_path)

    with open(selected_contigs_path, "w") as selected_contig_fasta, \
            open(flanked_contigs_path, "w") as flanked_contig_fasta:

        alns = pandas.read_csv(alignments_path, sep=" ")

        reader = vcfpy.Reader.from_path(vcf_template_path)
        reader.header.samples = vcfpy.SamplesInfos([sample_name])

        writer = vcfpy.Writer.from_path(vcf_out_path, reader.header)
        try:
            contig_loci = set()

            # parse each alignment and look for insertions above min_insert_size
            for _index, row in alns.iterrows():
                # skip secondary alignments
                if row["Hit"] > 0:
                    continue

                query_name = row["QName"]

                # local alignment window in the reference
                ref_chrom, ref_start, ref_end, phase_set, phase, _n = \
                    query_name.split("_")

                phase_set = phase_set[2:]
                phase = phase[2:]

                # convert to ints
                ref_start, ref_end = (int(ref_start), int(ref_end))

                # alignment start for reference and query sequence
                target_start = row["TStart"]
                query_start = row["QStart"]

                # strand-ness of the query sequence
                strand = row["Strand"]

                # parse cigar for variant extraction
                cig = cigar.Cigar(row["CIGAR"])
                ops = list(cig.items())

                # convert sequences to the positive strand
                query_seq = contig_fasta.fetch(query_name)
                if strand == "-":
                    query_seq = str(Bio.Seq.Seq(query_seq).reverse_complement())

                ref_seq = ref_fasta.fetch(ref_chrom, ref_start, ref_end)

                # phased genotype for haplotype 1 or 2, unphased otherwise
                gt = {"1": "1|0", "2": "0|1"}.get(phase, "0/1")

                # initialize iterators for the cigar string
                query_pos = query_start
                target_pos = target_start

                for op_len, op_type in ops:
                    # skip matches
                    if op_type == 'M':
                        query_pos += op_len
                        target_pos += op_len

                    # skip deletions in the query sequence
                    elif op_type == 'D':
                        target_pos += op_len

                    # insertions in the query sequence
                    elif op_type == 'I':
                        # only interested in insertions above min_insert_size
                        if op_len > min_insert_size:
                            # need to check conversion from 0-based coordinates to 1-based
                            ref_allele = ref_seq[target_pos]
                            alt_allele = ref_allele + query_seq[
                                query_pos:query_pos + op_len]

                            break_point = ref_start + target_pos
                            # output VCF record corresponding to the insertion
                            rec = vcfpy.Record(
                                CHROM=ref_chrom,
                                POS=break_point + 1,
                                ID=[query_name],
                                REF=ref_allele,
                                ALT=[vcfpy.Substitution("INS", alt_allele)],
                                QUAL=999,
                                FILTER=["PASS"],
                                INFO={},
                                FORMAT=[
                                    "GT", "SVLEN", "PS", "HP", "CIGAR",
                                    "STRAND", "CONTIG_START"
                                ],
                                calls=[
                                    vcfpy.Call(sample=sample_name,
                                               data=vcfpy.OrderedDict(
                                                   GT=gt,
                                                   SVLEN=op_len,
                                                   PS=phase_set,
                                                   HP=phase,
                                                   CIGAR=str(cig),
                                                   STRAND=strand,
                                                   CONTIG_START=str(
                                                       query_start)))
                                ])

                            n_records += 1
                            writer.write_record(rec)

                            # FASTA header: contig, sample, hash of the
                            # (chrom, window start, inserted bases), length
                            contig_locus = ">" + query_name + "_" + sample_name
                            contig_hash = sha1("_{chrom}_{pos}_{alt}".format(
                                chrom=ref_chrom, pos=ref_start,
                                alt=alt_allele[1:]).encode()).hexdigest()

                            contig_name = (contig_locus + "_" + contig_hash +
                                           "_" + str(op_len))

                            # output contig that contains this insertion,
                            # only once per contig
                            if contig_locus not in contig_loci:
                                selected_contig_fasta.writelines(
                                    [contig_name + "\n", query_seq + "\n"])
                                contig_loci.add(contig_locus)

                            # output same insertion, but with flanking sequences
                            # note, the interval is [start, end[
                            if flank_length > 0:
                                left_flank = ref_fasta.fetch(
                                    ref_chrom, break_point - flank_length,
                                    break_point)
                                right_flank = ref_fasta.fetch(
                                    ref_chrom, break_point,
                                    break_point + flank_length)
                            else:
                                left_flank = ""
                                right_flank = ""
                            # alt_allele[1:] drops the leading reference base
                            flanked_contig_fasta.writelines([
                                contig_name + "\n",
                                left_flank + alt_allele[1:] + right_flank + "\n"
                            ])

                        # every insertion consumes query bases, reported or not
                        query_pos += op_len
        finally:
            # make sure the VCF is flushed and closed even if a row fails
            writer.close()

    return n_records
Ejemplo n.º 11
0
def get_seq(sam,
            ref,
            ret_pos=False,
            from_line=False,
            correct=False,
            find_ref=True):
    """Return the reference sequence for a primary alignment in SAM text.

    Args:
        sam: path of a SAM file, or the SAM text itself when ``from_line``.
        ref: passed through unchanged to ``SeqInRef``.
        ret_pos: when true return the full 4-item result, else the first two.
        from_line: treat ``sam`` as file content instead of a path.
        correct: widen the looked-up window by offsets parsed from the CIGAR.
        find_ref: when false skip ``SeqInRef`` and use "NotLookedFor".

    Lines with fewer than five whitespace-separated fields and lines
    starting with ``@SQ``/``@PG`` are skipped. Reading stops at the first
    other line whose flag is not "0" or "16". An alignment only replaces the
    current result when its reference name contains "chr" and is not "*".

    Returns:
        ``[seq, found, chrom, pos]`` (or ``[seq, found]``), where ``found``
        is 1 once an alignment was accepted; the default is
        ``["", 0, None, None]``. ``chrom`` is the reference name without its
        first three characters, as an int when it parses as one.
    """
    ret = ["", 0, None, None]
    maxlen = 0

    if from_line:
        op = io.StringIO(sam)
    else:
        op = open(sam, 'r')

    with op as f:
        # Iterating the handle advances on every path, including `continue`.
        # The previous readline()/while loop never advanced when an alignment
        # was skipped for being shorter than the best one, and so hung.
        for line in f:
            fields = line.split()
            if len(fields) < 5:
                continue
            if fields[0] in ('@SQ', '@PG'):
                continue
            # take only main alignment (not chimeric)
            if fields[1] not in ('0', '16'):
                break

            flag = fields[1]
            ref_name = fields[2]
            pos = max(int(fields[3]) - 1, 0)
            cigar_string = fields[5]

            aligned_len = len(cigar.Cigar(cigar_string))
            offset_start = 0
            offset_end = 0

            if correct:
                # Text before the first 'S' is numeric only when the CIGAR
                # starts with a soft clip.
                try:
                    offset_start = int(cigar_string.split("S")[0])
                except ValueError:
                    offset_start = 0
                # NOTE(review): this takes the length of the LAST operation
                # whatever its type (e.g. 90 for "10S90M"), not only a
                # trailing soft clip - confirm that is intended.
                try:
                    offset_end = int(re.split('[a-zA-Z]', cigar_string)[-2])
                except ValueError:
                    offset_end = 0

            if aligned_len < maxlen:
                continue

            if find_ref:
                seq = SeqInRef(ref_name, pos - offset_start, flag,
                               LenghtOnRef(cigar_string) + offset_end, ref)
            else:
                seq = "NotLookedFor"

            if ref_name == '*' or "chr" not in ref_name:
                continue

            # strip the first three characters of the reference name
            try:
                chrom = int(ref_name[3:])
            except ValueError:
                chrom = ref_name[3:]
            ret = [seq + "", 1, chrom, pos + 0]
            maxlen = max(len(seq), maxlen)

    if ret_pos:
        return ret
    return ret[:2]
Ejemplo n.º 12
0
def extract_consensus_insertions(contig_path, cons_path, ref_fasta_path, vcf_out_path, vcf_template_path, min_insertion_size, flank_length, flanked_contigs_path):
    """Align consensus contigs to their reference window and report insertions.

    Every sequence in ``cons_path`` must be named ``<chrom>_<start>_<end>``.
    It is aligned with mappy against that reference window, the alignment
    with the largest ``blen`` is kept, and each 'I' operation longer than
    ``min_insertion_size`` becomes:

    * a record in ``vcf_out_path`` (header from ``vcf_template_path``, one
      call per sample returned by ``collect_genotypes(contig_path)``);
    * a FASTA entry in ``flanked_contigs_path`` holding the inserted bases
      surrounded by ``flank_length`` reference bases on each side.

    Contigs without any alignment are reported on stdout and skipped.
    All output handles are closed on exit, including on error.

    Returns the number of VCF records written.
    """
    n_records = 0
    # open input sequences
    cons_fasta = pysam.FastaFile(cons_path)
    ref_fasta = pysam.FastaFile(ref_fasta_path)

    with open(flanked_contigs_path, "w") as flanked_contig_fasta:
        (samples, loci) = collect_genotypes(contig_path)
        print("Found", len(samples), "samples for", len(loci), "phased loci")

        reader = vcfpy.Reader.from_path(vcf_template_path)
        reader.header.samples = vcfpy.SamplesInfos(list(samples))
        writer = vcfpy.Writer.from_path(vcf_out_path, reader.header)

        try:
            for contig in cons_fasta.references:
                # parse coordinates
                (chrom, start, end) = contig.split("_")
                (start, end) = int(start), int(end)

                cons_seq = cons_fasta.fetch(contig)
                ref_seq = ref_fasta.fetch(chrom, start, end)

                # the aligner indexes this contig's own reference window,
                # so it has to be rebuilt for every contig
                aligner = mappy.Aligner(seq = ref_seq, preset = None , k = 15, w = 10, n_threads = 1,
                                        max_join_long = 20000, max_join_short = 10000, min_join_flank_sc = 10,
                                        min_join_flank_ratio = 0.1, max_gap = 10000, bw = 2000, end_bonus = 10,
                                        zdrop = 10000, zdrop_inv = 1000,
                                        scoring = (2, 4, 4, 10, 300, 0, 1),
                                        extra_flags = 0x1)
                alignments = list(aligner.map(cons_seq, seq2 = None, cs = True, MD = False))

                if len(alignments) == 0:
                    print("No hits in", contig)
                    continue

                aln = max(alignments, key = lambda x: x.blen)

                cig = cigar.Cigar(aln.cigar_str)
                ops = list(cig.items())

                cons_pos = aln.q_st
                target_pos = aln.r_st

                strand = "+"
                if aln.strand == -1:
                    cons_seq = str(Bio.Seq.Seq(cons_seq).reverse_complement())
                    strand = "-"

                for op_len, op_type in ops:
                    # skip matches
                    if op_type == 'M':
                        cons_pos += op_len
                        target_pos += op_len

                    # skip deletions in the query sequence
                    elif op_type == 'D':
                        target_pos += op_len

                    # insertions in the query sequence
                    elif op_type == 'I':
                        # only interested in large insertions
                        if op_len > min_insertion_size:
                            # need to check conversion from 0-based coordinates to 1-based
                            ref_allele = ref_seq[target_pos-1]
                            # inserted bases only, without the reference base
                            alt_allele = cons_seq[cons_pos:cons_pos + op_len]

                            break_point = start + target_pos

                            # build calls data structure
                            calls = []
                            for sample in samples:
                                sample_gt = "0/0"
                                ps = 0
                                if sample in loci[contig]:
                                    sample_gt = loci[contig][sample]["1"] + "|" + loci[contig][sample]["2"]
                                    ps = loci[contig][sample]["ps"]
                                sample_call = vcfpy.Call(sample = sample,
                                                         data = vcfpy.OrderedDict(GT = sample_gt, PS = ps))
                                calls.append(sample_call)

                            # output VCF record corresponding to the insertion
                            rec = vcfpy.Record(CHROM = chrom, POS = break_point, ID = [contig + "_" + str(cons_pos)],
                                               REF = ref_allele, ALT = [vcfpy.Substitution("INS", ref_allele + alt_allele)],
                                               QUAL = 999, FILTER = ["PASS"],
                                               INFO = vcfpy.OrderedDict(SVLEN = op_len,
                                                                        CIGAR = [str(cig)],
                                                                        STRAND = strand,
                                                                        CONTIG_START = str(aln.q_st)),
                                               FORMAT = ["GT", "PS"],
                                               calls = calls)

                            writer.write_record(rec)

                            # output same insertion, but with flanking sequences
                            # note, the interval is [start, end[
                            if flank_length > 0:
                                left_flank = ref_fasta.fetch(chrom, break_point - flank_length, break_point)
                                right_flank = ref_fasta.fetch(chrom, break_point, break_point + flank_length)
                            else:
                                left_flank = ""
                                right_flank = ""

                            # alt_allele carries no leading reference base
                            # here, so it is written in full; slicing it with
                            # [1:] dropped the first inserted base.
                            flanked_contig_fasta.writelines([ ">" + contig + "_" + str(cons_pos) + "\n",
                                                             left_flank + alt_allele + right_flank + "\n"])

                            n_records += 1

                        # every insertion consumes consensus bases, reported or not
                        cons_pos += op_len
        finally:
            # make sure the VCF is flushed and closed even if a contig fails
            writer.close()

    return n_records
Ejemplo n.º 13
0
 def test_soft_clipped_bases(self):
     '''Check that soft_clipped_bases() counts the right number of bases'''
     # (cigar string, expected number of soft-clipped bases)
     expectations = (
         ("10M", 0),
         ("1S10M", 1),
         ("10M2S", 2),
         ("2S10M3S", 5),
     )
     for cigar_string, clipped in expectations:
         self.assertEqual(cigar.Cigar(cigar_string).soft_clipped_bases(), clipped)
Ejemplo n.º 14
0
 def test_reverse(self):
     '''reverse() flips the order of the CIGAR operations in place'''
     forward, backward = '1S10M1I5M2S', '2S5M1I10M1S'
     cig = cigar.Cigar(forward)
     cig.reverse()
     self.assertEqual(str(cig), backward)
Ejemplo n.º 15
0
 def test_read_hit_length(self):
     '''Check that read_hit_length() returns the right number'''
     # (cigar string, expected hit length on the read)
     expectations = (
         ("10M", 10),
         ("1S9M", 9),
         ("1S7M1I1M", 9),
         ("1S7M1D1M", 8),
     )
     for cigar_string, hit_length in expectations:
         self.assertEqual(cigar.Cigar(cigar_string).read_hit_length(), hit_length)