Ejemplos de Cigar en Python, ejemplos de cigar.Cigar en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: fast5class.py Proyecto: utrori/Nanopore_cas9

 def get_position_in_ref(self, pos, ref):
     """Map read position ``pos`` onto ``ref`` by aligning a local window with bwa.

     The window ``self.seq[pos - 50:pos + 50]`` (with the matching slice of
     ``self.quality``) is written to a temporary FASTQ file, aligned with
     ``bwa mem``, and the CIGAR of the reported alignment is walked to
     translate the 50th window base into a reference coordinate.

     Args:
         pos: Index into ``self.seq`` / ``self.quality``.
         ref: Reference argument handed to ``bwa mem``.

     Returns:
         The reference coordinate, or ``None`` when the window is unmapped
         or the CIGAR covers fewer than 50 read bases.
     """
     # NOTE(review): for pos < 50 the slice start is negative and Python
     # slices from the end of the read -- confirm callers never pass that.
     around_seq = self.seq[pos - 50:pos + 50]
     around_qual = self.quality[pos - 50:pos + 50]
     with open('temp_files/temp_fastq.fastq', 'w') as fw:
         fw.write('@temp\n' + around_seq + '\n+\n' + around_qual)
     # NOTE(review): `ref` is interpolated into a shell command line; this is
     # only safe while `ref` comes from a trusted source.
     subprocess.run('bwa mem -M -x ont2d -t 7 ' + ref +
                    ' temp_files/temp_fastq.fastq > temp_files/'
                    'temp.sam',
                    shell=True,
                    stdout=FNULL,
                    stderr=subprocess.STDOUT)
     with open('temp_files/temp.sam') as f:
         # The first alignment record is expected on the third line, i.e.
         # after two header lines.
         row = f.readlines()[2].strip().split()
     if row[2] == '*':
         return None
     cigar = row[5]
     ref_pos = int(row[3])
     # Expand e.g. 3M1I into 'MMMI' so the alignment can be walked per base.
     split_cigar = ''.join(op * length for length, op in Cigar(cigar).items())
     shift = 0
     current = 0
     # Stays None when the CIGAR ends before the 50th read base; previously
     # this case raised UnboundLocalError at the return statement.
     ref_coordinate = None
     for op in split_cigar:
         current += 1
         if op == 'I':
             shift -= 1
         elif op == 'D':
             shift += 1
             current -= 1
         if current == 50:
             ref_coordinate = ref_pos + 49 + shift
             break
     return ref_coordinate

Ejemplo n.º 2

0

Mostrar archivo

Archivo: tasean.py Proyecto: leducquangpm/check-gene

def getGeneLocation(hit, gene):
    """Summarise where a gene was found from a list of alignment hits.

    Each element of ``hit`` is a mapping with the keys ``'cigar'``, ``'seq'``
    and ``'ss'``. The result dict always has ``'note'``, ``'consensus'`` and
    ``'hit'``; when at least one hit exists it also has ``'pos'`` and
    ``'coverage'``.
    """
    loc = {'note': ''}

    # Parse the CIGAR string of every hit.
    parsed_hits = []
    for record in hit:
        ops = list(Cigar(record['cigar']).items())
        first_len, first_op = ops[0]
        last_len, last_op = ops[-1]

        # Only reads soft-clipped on both ends get trimmed.
        if first_op == 'S' and last_op == 'S':
            seq = record['seq'][first_len:-last_len]
        else:
            seq = record['seq']

        start = int(record['ss'])
        end = start
        for length, op in ops:
            if op in ('M', 'D'):
                end += int(length)
            elif op == 'I':
                end -= int(length)

        parsed_hits.append({
            'seq': seq,
            'pos': int(first_len),
            'ss': start,
            'send': end,
        })

    list_hit_sorted = sorted(parsed_hits, key=lambda entry: entry['ss'])
    if not list_hit_sorted:
        loc['consensus'] = ''
        loc['hit'] = []
        loc['note'] = 'Not found'
        return loc

    # Greedily chain hits that start beyond the end of the previous one.
    scafold = [list_hit_sorted[0]]
    for candidate in list_hit_sorted:
        previous = scafold[-1]
        if candidate['ss'] > previous['ss'] + len(previous['seq']):
            scafold.append(candidate)

    if len(scafold) < len(list_hit_sorted):
        loc['note'] = "Multiple sequences found"
    loc['hit'] = list_hit_sorted
    loc['pos'] = list_hit_sorted[0]['pos']
    cover_len = sum(len(piece['seq']) for piece in scafold)

    # Make the consensus sequence from the scaffold.
    loc['consensus'] = makeConsensus(scafold, gene)
    loc['coverage'] = cover_len / len(gene)
    return loc

Ejemplo n.º 3

0

Mostrar archivo

    def __init__(self, ss, alignmentDataset, sample=1.0, bin_size=10000000):
        """
        Initializes an IndelDistribution class.
        Computes the insertion and deletion distribution of alignmentDataset.

        Args:
            :param ss: the global SparkSession
            :param alignmentDataset: A bdgenomics.adam.dataset.AlignmentDataset object
            :param sample: Fraction passed to ``sample(False, sample)`` before counting
            :param bin_size: Division size per bin
        """
        bin_size = int(bin_size)
        self.bin_size = bin_size
        self.sc = ss.sparkContext
        self.sample = sample

        def count_ops(cigar_string):
            # Sum run lengths per CIGAR operation. Building a dict from
            # (op, length) pairs would keep only the LAST run of a repeated
            # operation (e.g. 5M2I5M would count M as 5, not 10).
            counts = Counter()
            for length, op in Cigar(cigar_string).items():
                counts[op] += length
            return counts

        # filter alignments without a position
        filteredAlignments = alignmentDataset.transform(lambda x: x.sample(False, sample)) \
            .transform(lambda x: x.filter(x["start"] >= 0))

        # Assign alignments with counter for contigs. Reduce and collect.
        # Key is (reference name, start of the bin containing the alignment start).
        mappedDistributions = filteredAlignments.toDF().rdd \
            .map(lambda r: ((r["referenceName"], r["start"] - r["start"] % bin_size),
                            count_ops(r["cigar"]))) \
            .reduceByKey(lambda x, y: x + y)

        self.alignments = mappedDistributions.collect()

Ejemplo n.º 4

0

Mostrar archivo

def get_max_clip_len(read):
    """Return the length of the longer clip at either end of a read (0 if neither end is clipped).

    An end counts as clipped when its CIGAR operation is not one of
    ``Cigar.ref_consuming_ops``. Raises ValueError when the read has no
    CIGAR string.
    """
    if not read.cigarstring:
        raise ValueError("Missing Cigar string")

    ops = list(Cigar(read.cigarstring).items())
    candidates = []
    for length, code in (ops[0], ops[-1]):
        if code not in Cigar.ref_consuming_ops:
            candidates.append(length)

    if not candidates:
        return 0
    return max(candidates)

Ejemplo n.º 5

0

Mostrar archivo

def get_sa_attributes(sa_tag):
    '''
    Parse one SA tag entry (rname, POS, strand, CIGAR, mapQ, NM).

    Returns (start, end, strand), where start is POS as an int and end is
    start plus the reference length of the CIGAR.
    '''
    fields = sa_tag.split(',')
    start = int(fields[1])
    aligned_span = Cigar(fields[3]).reference_length()
    return start, start + aligned_span, fields[2]

Ejemplo n.º 6

0

Mostrar archivo

Archivo: chimeras2.py Proyecto: xflicsu/splash

def cigar_to_list(cigar):
    """Convenience function that converts a list, string or Cigar object to a
    list of CIGAR items.

    A list is returned unchanged (same object). A ``Cigar`` instance is
    expanded through ``items()``; the class defines no iteration protocol, so
    ``list(cigar)`` on it would raise TypeError. Anything else is treated as
    a CIGAR string and parsed with ``Cigar``.
    """
    if isinstance(cigar, list):
        return cigar
    if isinstance(cigar, Cigar):
        return list(cigar.items())
    return list(Cigar(cigar).items())

Ejemplo n.º 7

0

Mostrar archivo

def get_splices(lines):
    """Yield (chrom, start, end, strand) for every N operation in SAM-style lines.

    Lines whose sixth whitespace-separated field contains no "N" are skipped.
    Positions start at field 4 minus one and advance for every operation
    listed in ``consuming``.
    """
    for line in lines:
        fields = line.split()
        if "N" not in fields[5]:
            continue

        chrom = fields[2]
        strand = "-" if int(fields[1]) & 16 else "+"
        ops = list(Cigar(fields[5]).items())
        position = int(fields[3]) - 1

        for length, op in ops:
            if op == "N":
                yield chrom, position, position + length, strand
            if op in consuming:
                position += length

Ejemplo n.º 8

0

Mostrar archivo

def make_split_read(read, breakpoint, clip_left, hard_clip_threshold=1.0, sequence=None):
    """
    Create a split read (a continuous soft-clip from one end of a read until a breakpoint).
    Works on a deep copy: the CIGAR string and the sequence of the copy are modified, the input read is only read from.

    For example, if the read sequence is `ACACACAC` with a CIGAR of 8M, the breakpoint in position 3, and the sequence provided is `GTGTGT`,
    then if the clipping is to the left of the breakpoint the modified read will have a CIGAR of 3S5M and its sequence will be `TGTCACAC`
    (the LAST `breakpoint` bases of `sequence` are used), and if the clipping is to the right of the breakpoint, the modified read will
    have a CIGAR of 3M5S and its sequence will be `ACAGTGTG` (the FIRST `rlen - breakpoint` bases of `sequence` are used).

    Args:
        read: The read to modify.
        breakpoint: The breakpoint of the read.
        clip_left: Whether to clip every base to the left of the breakpoint or to the right of it.
        hard_clip_threshold: By default bases are soft-clipped. If more than `hard_clip_threshold` of the read is clipped, hard-clip instead
        sequence: An optional sequence to use for overriding bases in the clipped region.

    Returns:
        The split read (a copy named `<qname>-split`).

    Raises:
        ZeroDivisionError: if `read.rlen` is 0 (the clipped fraction divides by it).

    """
    split_read = copy.deepcopy(read)
    # Both attribute spellings are assigned, presumably to support old and new pysam naming -- TODO confirm.
    split_read.qname = split_read.query_name = read.qname + '-' + 'split'

    # CIGAR clipping.
    if read.cigarstring:
        cigar = Cigar(read.cigarstring)
        split_read.cigarstring = str(cigar.mask_left(breakpoint) if clip_left else cigar.mask_right(read.rlen - breakpoint))

    # Convert to hard-clipping, if needed.
    # NOTE(review): with the default threshold of 1.0 the ratio below cannot exceed it while
    # 0 <= breakpoint <= read.rlen, so this branch only runs when a caller lowers the threshold.
    # NOTE(review): if read.cigarstring was empty, split_read.cigarstring is whatever the copy holds
    # and the startswith/endswith calls below assume it is a string -- confirm.
    clip_len = breakpoint if clip_left else read.rlen - breakpoint
    if float(clip_len) / read.rlen > hard_clip_threshold:
        soft_clipped_cigar = '{}S'.format(clip_len)
        hard_clipped_cigar = '{}H'.format(clip_len)
        cigar = split_read.cigarstring
        if clip_left and cigar.startswith(soft_clipped_cigar):
            cigar = cigar.replace(soft_clipped_cigar, hard_clipped_cigar, 1)
        elif not clip_left and split_read.cigarstring.endswith(soft_clipped_cigar):
            # Soft and hard clip strings have the same length, so slicing by either removes the suffix.
            cigar = cigar[:-len(hard_clipped_cigar)] + hard_clipped_cigar
        split_read.cigarstring = cigar

    if clip_left:
        # adjust reference match.
        split_read.reference_start += breakpoint

    # Sequence replacement.
    if sequence:
        split_seq = list(split_read.seq)
        if clip_left:
            split_seq[:breakpoint] = sequence[-breakpoint:]
        else:
            split_seq[breakpoint:] = sequence[:read.rlen - breakpoint]
        # qual is captured before seq is assigned; presumably assigning seq resets qual on the
        # underlying read object -- TODO confirm. Do not reorder these statements.
        qual = split_read.qual
        split_read.seq = ''.join(split_seq)
        # Pad with DEFAULT_QUAL if the new sequence is longer, then trim to the sequence length.
        qual += DEFAULT_QUAL * (len(split_seq) - len(qual))
        split_read.qual = qual[:len(split_seq)]

    return split_read

Ejemplo n.º 9

0

Mostrar archivo

def alignment_length_cigar(CIGAR):
    '''
    Compute the length an alignment spans on the reference from its CIGAR string

    Input:
        1. CIGAR: CIGAR string

    Output:
        1. alignmentLen: alignment length on the reference
    '''
    # Operations that advance along the reference:
    # - M (alignment match), = (sequence match), X (sequence mismatch)
    #   consume both query and reference
    # - D (deletion from the reference), N (skipped region) consume the
    #   reference only
    refConsumingOps = ('M', '=', 'X', 'D', 'N')

    return sum(
        int(length)
        for length, operation in Cigar(CIGAR).items()
        if operation in refConsumingOps
    )

Ejemplo n.º 10

0

Mostrar archivo

def rna_bam_to_bed(lines):
    """Yield (chrom, start, end, strand, flag) blocks for SAM-style lines.

    Every N operation closes the current block with flag 1; the block left
    open at the end of the CIGAR is yielded with flag 0. Positions advance
    for every operation listed in ``consuming``.
    """
    for record in lines:
        fields = record.split()
        strand = "-" if int(fields[1]) & 16 else "+"
        ops = list(Cigar(fields[5]).items())
        block_start = int(fields[3]) - 1
        chrom = fields[2]

        position = block_start
        for length, op in ops:
            if op == "N":
                yield (chrom, block_start, position, strand, 1)
                block_start = position + length
            if op in consuming:
                position += length

        yield (chrom, block_start, position, strand, 0)

Ejemplo n.º 11

0

Mostrar archivo

def parseCIGARForIntrons(cigar):
    """
    Parse a CIGAR string and return the values needed to locate an intron's
    5' and 3' splice sites.

    Only the part of the CIGAR up to and including the first intron (N) is
    considered; soft and hard clipping are ignored.

    Args:
        cigar: a CIGAR string with an intron in it, e.g. cigar='3M1D40M20N'

    Returns:
        offset: deleted bases minus inserted bases before the intron, used to
            adjust an alignment's positions back to the reference genome
        matchedExon: total length of the M operations before the intron; added
            to the alignment start it gives the 5' end of the splice site
        intronLength: length of the intron as reported by the CIGAR; added to
            matchedExon and the alignment start it gives the 3' end

    Raises:
        Exception: if the CIGAR string contains no N operation
    """
    if 'N' not in cigar:
        raise Exception('No intron detected')

    # Drop all information after the first intron.
    truncated = cigar.partition('N')[0] + 'N'

    totals = {'N': 0, 'D': 0, 'I': 0, 'M': 0}
    for length, op in Cigar(truncated).items():  # items are (length, op) tuples
        if op in totals:
            totals[op] += int(length)

    offset = totals['D'] - totals['I']
    return offset, totals['M'], totals['N']

Ejemplo n.º 12

0

Mostrar archivo

Archivo: chop_regions.py Proyecto: 0xTCG/biser

def modify_sequences(main_dict, bed_file):
    """Rewrite the second region of every pair in ``bed_file`` from its alignment.

    For each tab-separated line the first region (columns 0-2) and the second
    region (columns 3-5) are taken from ``main_dict``. The CIGAR in column 12
    is walked: M copies bases from the first region, I copies bases from the
    second region (reverse-complemented first when column 9 is '-'), D skips
    bases of the first region. The rebuilt sequence replaces the second region
    in ``main_dict`` in place.

    Lines whose first column is '#chr1' and pairs involving 'chrM' are skipped.

    Raises:
        AssertionError: if the rebuilt sequence length differs from the region
            it replaces (checked before ``main_dict`` is modified).
    """
    from cigar import Cigar
    from Bio.Seq import Seq

    with open(bed_file, 'r') as handle:
        for raw_line in handle:
            # Strip the newline so it cannot end up inside the last column
            # (which would corrupt the CIGAR if column 12 is the last one).
            line = raw_line.rstrip('\n').split('\t')
            chr1 = line[0]
            if chr1 == '#chr1':
                continue
            chr2 = line[3]
            if chr1 == 'chrM' or chr2 == 'chrM':
                continue

            s1, e1 = int(line[1]), int(line[2])
            s2, e2 = int(line[4]), int(line[5])
            on_minus = line[9] == '-'
            cigar = line[12]

            seq1 = main_dict[chr1][s1:e1]
            seq2 = main_dict[chr2][s2:e2]
            if on_minus:
                seq2 = str(Seq(seq2).reverse_complement())

            pieces = []
            counter_1 = 0
            counter_2 = 0
            for num, let in Cigar(cigar).items():
                if let == 'M':
                    pieces.append(seq1[counter_1:counter_1 + num])
                    counter_1 += num
                    counter_2 += num
                elif let == 'D':
                    counter_1 += num
                elif let == 'I':
                    pieces.append(seq2[counter_2:counter_2 + num])
                    counter_2 += num
            seq_new = ''.join(pieces)

            if on_minus:
                # Put the rebuilt sequence back in the region's own orientation.
                seq_new = str(Seq(seq_new).reverse_complement())

            # Validate before touching main_dict so a bad record cannot leave
            # a chromosome with shifted coordinates.
            assert len(seq_new) == len(seq2)
            main_dict[chr2] = main_dict[chr2][:s2] + seq_new + main_dict[chr2][e2:]

Ejemplo n.º 13

0

Mostrar archivo

    def __init__(self, ss, alignmentRDD, sample=1.0, bin_size=10000000):
        """
        Initializes an AlignmentDistribution class.
        Computes, per contig and bin, the summed length of every CIGAR operation.
        :param ss: Spark Object
        :param alignmentRDD: A bdgenomics.adam.rdd.AlignmentRDD object
        :param sample: Fraction passed to ``sample(False, sample)`` before counting
        :param int bin_size: Division size per bin
        """
        # Convert once and use the same integer for the attribute and the binning.
        bin_size = int(bin_size)
        self.bin_size = bin_size
        self.sc = ss.sparkContext

        def count_ops(cigar_string):
            # Sum run lengths per CIGAR operation. Building a dict from
            # (op, length) pairs would keep only the LAST run of a repeated
            # operation (e.g. 5M2I5M would count M as 5, not 10).
            counts = Counter()
            for length, op in Cigar(cigar_string).items():
                counts[op] += length
            return counts

        # filter alignments without a position
        filteredAlignments = alignmentRDD.transform(lambda x: x.sample(False, sample)) \
            .toDF().rdd.filter(lambda r: r["start"] is not None)

        # Assign alignments with counter for contigs. Reduce and collect.
        # Key is (contig name, start of the bin containing the alignment start).
        mappedDistributions = filteredAlignments \
            .map(lambda r: ((r["contigName"], r["start"] - r["start"] % bin_size),
                            count_ops(r["cigar"]))) \
            .reduceByKey(lambda x, y: x + y)

        self.alignments = mappedDistributions.collect()

Ejemplo n.º 14

0

Mostrar archivo

    def test_make_split_read_bam_file(self):
        """Split every read of sorted.bam at several breakpoints and check the clipped end."""
        bam_path = path.join(TEST_DATA_DIR, 'sorted.bam')
        with pysam.Samfile(bam_path, 'rb') as samfile:
            for read in samfile:
                if not read.cigarstring:
                    continue

                for split_at in (10, 50, 100):
                    if split_at >= read.rlen:
                        continue

                    for is_left_split in (True, False):
                        split_read = make_split_read(read, split_at, is_left_split)
                        ops = list(Cigar(split_read.cigarstring).items())

                        # The clip can be longer than the minimum if it is
                        # adjacent to another clip.
                        if is_left_split:
                            clipped_len, clipped_op = ops[0]
                            min_clip_len = split_at
                        else:
                            clipped_len, clipped_op = ops[-1]
                            min_clip_len = read.rlen - split_at

                        self.assertGreaterEqual(clipped_len, min_clip_len)
                        # Soft-clipped unless the read was already hard-clipped.
                        self.assertIn(clipped_op, ('S', 'H'))

Ejemplo n.º 15

0

Mostrar archivo

def parseCIGARForIntrons(cigar):
    """Return (offset, matchedExon, intronLength) for the first intron of a CIGAR string.

    offset is the deleted minus the inserted bases, matchedExon the total M
    length and intronLength the N length, all taken from the part of the
    CIGAR up to and including the first N. Clipping is ignored. Raises
    Exception when the CIGAR has no N operation.
    """
    if 'N' not in cigar:
        raise Exception('no intron detected')

    # Remove all information after the first intron.
    up_to_intron = cigar.split('N', 1)[0] + 'N'
    ops = list(Cigar(up_to_intron).items())  # list of tuples, e.g. [(20, 'N')]

    def total(code):
        return sum(int(length) for length, op in ops if op == code)

    intronLength = total('N')
    matchedExon = total('M')
    offset = total('D') - total('I')

    return offset, matchedExon, intronLength

Ejemplo n.º 16

0

Mostrar archivo

        l = line.split("\t")
        readID = l[0]
        chrName = l[2]
        filterDict[readID] = []
        filterDict[readID].append(chrName)
        filterDict[readID] = set(filterDict[readID])

samFile.close()

# Second pass over the SAM file: echo header lines, and echo alignment lines
# whose read maps to at most two entries in filterDict and whose sequence is
# covered by M operations except for at most one base.
with open("QH046cDNA.sam", "r") as samFile2:
    for line2 in samFile2:
        if line2.startswith("@"):
            print(line2)
            continue

        l2 = line2.split("\t")
        # Use the read ID of THIS line; the previous code read `l[0]`, a
        # stale variable left over from the first loop.
        readID2 = l2[0]
        if len(filterDict[readID2]) > 2:
            continue

        seqLength = len(l2[9])
        cigar = Cigar(l2[5])
        cigarList = list(cigar.items())
        # Total length of the M (alignment match) operations.
        mapp = sum(length for length, op in cigarList if op == "M")
        if seqLength - mapp <= 1:
            print(line2)

Ejemplo n.º 17

0

Mostrar archivo

def alignment_interval_query(CIGAR, orientation):
    '''
    Compute the interval of the raw query (read) covered by an alignment

    Input:
        1. CIGAR: CIGAR string
        2. orientation: alignment orientation (+ or -)

    Output:
        1. beg: begin position in query
        2. end: end position in query
    '''
    ## 1. Read CIGAR string using proper module
    operations = list(Cigar(CIGAR).items())

    ## 2. Set start position in query based on first operation
    # Leading soft or hard clipping shifts the start; otherwise (including a
    # CIGAR with no operations, which used to leave startPos unbound) it is 0.
    startPos = 0
    if operations and operations[0][1] in ('S', 'H'):
        startPos = int(operations[0][0])

    ## 3. Iterate over the operations and compute query alignment length and raw read length
    alignedOps = ('M', '=', 'X', 'I')
    # Hard-clipped bases are part of the raw read too. They are counted in
    # startPos, so they must be counted here as well; len(Cigar) leaves them
    # out, which made minus-strand intervals of hard-clipped alignments wrong.
    rawReadOps = alignedOps + ('S', 'H')

    alignmentLen = 0
    readLen = 0
    for length, operation in operations:
        length = int(length)

        # - Op M, alignment match (can be a sequence match or mismatch)
        # - Op =, sequence match
        # - Op X, sequence mismatch
        # - Op I, insertion to the reference
        if operation in alignedOps:
            alignmentLen += length

        if operation in rawReadOps:
            readLen += length

    ## 4. Compute alignment interval in raw query
    # a) Query aligned in +
    if orientation == '+':
        beg = startPos
        end = startPos + alignmentLen

    # b) Query aligned in - (reversed complemented to align)
    else:
        beg = readLen - startPos - alignmentLen
        end = readLen - startPos

    return beg, end

Ejemplo n.º 18

0

Mostrar archivo

Archivo: delfind.py Proyecto: tdfy/CRISPRAmpSeq

    filter_df['CIGAR_str'] = CIGAR_list
    filter_df['CIGAR_list'] = C_list
    filter_df['read_len'] = len_list
    filter_df['start_coo'] = ref_st_list
    filter_df['loc'] = filter_df.index

    filter_df = filter_df.sort_values(['Score'], ascending=[False])

    filter_df = filter_df.drop_duplicates(subset=['Read_Name'], keep='first')

    CIG_dict = dict(zip(filter_df['CIGAR_list'], filter_df['Read_Name']))
    COO_dict = dict(zip(filter_df['CIGAR_list'], filter_df['start_coo']))
    read_len_dict = dict(zip(filter_df['CIGAR_list'], filter_df['read_len']))

    # Walk every CIGAR in the de-duplicated frame and inspect its operations.
    for c_str in filter_df['CIGAR_list']:
        CIGAR_edit = Cigar(c_str)
        # NOTE(review): `l` is not used in the visible part of this loop.
        l = len(CIGAR_edit)
        CIG_list = list(CIGAR_edit.items())

        for index, tup in enumerate(CIG_list):
            # NOTE(review): `is 'S'` compares identity with a string literal, not
            # equality; `== 'S'` is presumably what is meant -- confirm.
            # NOTE(review): removing the first item while enumerating the same list
            # shifts the remaining items, so CIG_list[index] below may skip an
            # element or run past the end on the last iteration -- confirm intent.
            if CIG_list[0][1] is 'S':
                CIG_list.remove(CIG_list[0])
            else:
                pass

            ch = CIG_list[index][1]

            if ch in String_list:

                # Operations up to and including / excluding the current one.
                slice_l = CIG_list[:index + 1]
                slice_m = CIG_list[:index]

Ejemplo n.º 19

0

Mostrar archivo

def invert_read(read, start, end, sequence, snp_rate, indel_rate, max_clip_len=None):
    """
    Invert (a portion of) a read.

    Args:
        read: The read to modify.
        start: The start of the inversion.
        end: The end of the inversion.
        sequence: The full sequence that is inverted in the sample that the read belong to. This sequence should be provided in its
                  reverse-complement form (e.g. as returned by `get_inverse_sequence`).
        snp_rate: The fraction of bases that will be randomly modified in reads that were modified.
        indel_rate: The fraction of bases that will be randomly inserted or deleted in reads that were modified.
                    Half of it is passed to `modify_read` as each of its last two arguments.
        max_clip_len: If more than "max_clip_len" of the read would be clipped on either end, return None,
                      since this read would not have been captured.

    Returns:
        `(read, 0)` with the untouched input read when the inversion does not overlap the read or is shorter than 2 bases;
        `(None, 0)` when the clip exceeds `max_clip_len`;
        otherwise whatever `modify_read` returns for the duplicate read in which any position covered by the inverted
        region is replaced with the inversion (presumably a `(read, count)` pair like the other returns -- TODO confirm).
    """
    inv_len = end - start
    if start >= read.reference_end or end <= read.reference_start or inv_len < 2:
        return read, 0

    read_with_inversion = copy.deepcopy(read)
    read_with_inversion.qname = read_with_inversion.query_name = read.qname + '-' + 'inv'

    if read.reference_start <= start < end <= read.reference_end:
        # Read spans the entire inversion.
        left_breakpoint = start - read.reference_start
        right_breakpoint = left_breakpoint + inv_len
        # NOTE(review): the read's own bases are reversed here but not complemented, while `sequence`
        # is documented as reverse-complemented -- confirm this is intended.
        read_with_inversion.seq = "{left}{inv}{right}".format(
            left=read.seq[:left_breakpoint],
            inv="".join(reversed(read.seq[left_breakpoint:right_breakpoint])),
            right=read.seq[right_breakpoint:])

        # Clipped bases in reads must start at a read boundary; choose the closest one.
        # TODO: add a supplemental/secondary read where the shorter region is matched, and the longer one clipped.
        # unpack_cigar is defined elsewhere; the loop below assumes it returns one entry per read position -- TODO confirm.
        cigar_tuples = unpack_cigar(read.cigarstring)
        if left_breakpoint < read.rlen - right_breakpoint:
            start_clip, end_clip = 0, right_breakpoint
        else:
            start_clip, end_clip = left_breakpoint, read.rlen
        for i in range(start_clip, end_clip):
            cigar_tuples[i] = '1S'

        # Re-join the per-position entries and collapse runs of identical operations.
        read_with_inversion.cigarstring = str(Cigar("".join(cigar_tuples)).merge_like_ops())

    elif start <= read.reference_start < read.reference_end <= end:
        # Inversion spans the entire read.
        pos_in_inversion = read.reference_start - start
        inv_seq = sequence[pos_in_inversion:pos_in_inversion + read.rlen]
        # Breakpoint 0 with clip_left=False clips the whole read.
        read_with_inversion = make_split_read(read_with_inversion, 0, clip_left=False, sequence=inv_seq)

        # If a read was reversed, modify its strand.
        read_with_inversion.is_reverse = not read.is_reverse

    elif start > read.reference_start:
        # Inversion starts mid-read, continuing to the end of it (or past it).
        breakpoint = start - read.reference_start
        read_with_inversion = make_split_read(read_with_inversion, breakpoint, clip_left=False, sequence=sequence)

    elif end < read.reference_end:
        # Inversion starts before the read, continuing into it.
        breakpoint = end - read.reference_start
        read_with_inversion = make_split_read(read_with_inversion, breakpoint, clip_left=True, sequence=sequence)

    # A max_clip_len of None or 0 disables this check.
    if max_clip_len and int(max_clip_len) < get_max_clip_len(read_with_inversion):
        return None, 0

    # Add noise.
    return modify_read(read_with_inversion, snp_rate, indel_rate / 2, indel_rate / 2)

Ejemplo n.º 20

0

Mostrar archivo

Archivo: utils.py Proyecto: nnoll/pangraph

def parse_cigar(aln, qryseq, refseq, cutoff=500):
    """Walk a CIGAR alignment of ``qryseq`` against ``refseq`` and cut it into blocks.

    Args:
        aln: CIGAR string; it is wrapped in ``cigar.Cigar``.
        qryseq: Query sequence, sliced by the running query offset.
        refseq: Reference sequence, sliced by the running reference offset.
        cutoff: Minimum run length at which a clip (S/H), deletion (D) or
            insertion (I) is emitted as its own block instead of being folded
            into the current block.

    Returns:
        ``(qrys, refs, blks)``, three lists of equal length. ``qrys`` and
        ``refs`` hold ``(left, right)`` offset pairs or ``None``; each entry
        of ``blks`` is ``(array of block characters, (Q, transposed qrymap
        array), (R, transposed refmap array))``.

    Only the operations S, H, M, D and I are handled; any other operation is
    skipped without advancing either offset.
    """
    from cigar import Cigar

    aln = Cigar(aln)

    # lq/rq and lr/rr are the left/right offsets of the current block in the
    # query and the reference respectively.
    lq, rq = 0, 0
    lr, rr = 0, 0
    refs = []
    qrys = []
    blks = []

    # Per-block state: Q and R map block positions to characters ('-' or the
    # differing query base), blkseq is the block's sequence, and the maps
    # record (offset, blkpos - offset) pairs at every recorded breakpoint.
    R, Q = {}, {}
    blkseq = ""
    blkpos = 0
    refmap = [(rr, blkpos - rr)]
    qrymap = [(rq, blkpos - rq)]

    def push(qval=None, rval=None):
        # Emit the current block (if it has a non-empty interval) and reset
        # the per-block state.
        nonlocal R, Q, blkseq, blkpos, refmap, qrymap
        assert not (qval is None and rval is None)

        def f(xs, x):
            # Append x to xs when it is None or a non-empty interval; report
            # whether something was appended.
            if x is None:
                xs.append(None)
                return True
            else:
                # zip over the single pair yields two 1-tuples, so the
                # comparison below is between (left,) and (right,).
                l, r = zip(x)
                if l < r:
                    xs.append(x)
                    return True
                return False

        hasq = f(qrys, qval)
        hasr = f(refs, rval)

        if hasq or hasr:
            # Fails if only one of the two intervals was appended.
            assert len(qrys) == len(refs)
            assert len(blkseq) > 0, "empty seq"
            blks.append((np.array(list(blkseq)), (Q, np.array(qrymap).T),
                         (R, np.array(refmap).T)))

        R, Q = {}, {}
        blkseq = ""
        blkpos = 0
        refmap = [(rr, blkpos - rr)]
        qrymap = [(rq, blkpos - rq)]

    def recordbp():
        # Record a breakpoint: the block position becomes the current block
        # length and both maps get an entry for the current offsets.
        nonlocal blkpos, refmap, qrymap

        blkpos = len(blkseq)
        refmap.append((rr, blkpos - rr))
        qrymap.append((rq, blkpos - rq))

    for l, t in aln.items():
        if t in ['S', 'H']:
            if l >= cutoff:
                # NOTE(review): leftover debugging -- this prints the CIGAR and
                # drops into an interactive ipdb session (and needs ipdb
                # installed) for every clip of at least `cutoff` bases.
                print(aln)
                import ipdb
                ipdb.set_trace()

                push((lq, rq), (lr, rr))

                blkseq = qryseq[rq:rq + l]
                # TODO: Think through soft/hard clips
                # if t == 'S':
                # NOTE(review): the query offset advances for H as well as S.
                rq += l
                recordbp()

                push((rq - l, rq), None)
                lq = rq
                lr = rr
            else:
                rq += l
                recordbp()

        elif t == 'M':
            rs = np.array(list(refseq[rr:rr + l]))
            qs = np.array(list(qryseq[rq:rq + l]))
            # Positions where query and reference differ are stored in Q; the
            # block sequence itself takes the reference bases.
            diff = np.where(np.array(rs != qs))[0]
            for i in diff:
                Q[i + blkpos] = qs[i]
            blkseq += refseq[rr:rr + l]

            rq += l
            rr += l

            recordbp()

        elif t == 'D':
            if l >= cutoff:
                # Long deletion: close the current block and emit the deleted
                # reference bases as a block with no query interval.
                push((lq, rq), (lr, rr))

                blkseq = refseq[rr:rr + l]
                rr += l
                recordbp()

                push(None, (rr - l, rr))
                lr = rr
                lq = rq
            else:
                # Short deletion: keep the reference bases and mark them as
                # gaps in the query.
                for i in range(l):
                    Q[i + blkpos] = '-'
                blkseq += refseq[rr:rr + l]

                rr += l
                recordbp()

        elif t == 'I':
            if l >= cutoff:
                # Long insertion: close the current block and emit the
                # inserted query bases as a block with no reference interval.
                push((lq, rq), (lr, rr))

                blkseq = qryseq[rq:rq + l]
                rq += l
                recordbp()

                push((rq - l, rq), None)
                lq = rq
                lr = rr
            else:
                # Short insertion: keep the query bases and mark them as gaps
                # in the reference.
                for i in range(l):
                    R[i + blkpos] = '-'
                blkseq += qryseq[rq:rq + l]

                rq += l
                recordbp()

    # Flush whatever remains as the final block.
    push((lq, rq), (lr, rr))
    assert len(qrys) == len(refs) and len(qrys) == len(blks)

    return qrys, refs, blks

Ejemplo n.º 21

0

Mostrar archivo

Archivo: parse_bam.py Proyecto: ifishlin/KBH_thesis_work

def main():
    """Command-line entry point: extract heavily soft-clipped split reads from a BAM file.

    Steps:
        1. Collect the CIGAR string of every read whose flag is one of the
           requested flags.
        2. Measure the left/right soft clip of each collected read.
        3. Keep the reads whose left or right clip reaches the threshold.
        4. Write every alignment of those reads to ``<output>_clipped.bam``.
        5. Write ``<output>_split_reads.bedpe`` for reads with exactly
           ``--splits`` alignments.
        6. Optionally filter a methylation TSV down to the kept reads.
    """

    def str_to_bool(value):
        # argparse's type=bool turns every non-empty string, including
        # "False", into True; parse the text explicitly instead.
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got %r' % value)

    parser = argparse.ArgumentParser(
        description=
        'Parse BAM file for multi-alignment and soft-clipped reads. Will return a BAM file containing split reads and a BEDPE file containing the coordinates of the split reads (useful for circos plots, etc.). Can additionally return a filtered methylation TSV file if one is provided (optional)'
    )

    required = parser.add_argument_group(
        'Required',
        'Bam, clip size to filter on, output location, flags to filter on, splits to filter on, and True/False if alternative chromosomes were used'
    )

    required.add_argument('-b',
                          '--bam',
                          type=str,
                          help='bam file - must be created with NGMLR')

    required.add_argument('-c',
                          '--clip_size_thresh',
                          type=int,
                          help='soft clip size threshold to filter on [1000]',
                          default=1000)

    required.add_argument('-o',
                          '--output',
                          type=str,
                          help='output location and prefix')

    # The help text used to advertise 2046 while the actual default is 2048.
    required.add_argument(
        '-f',
        '--flag',
        type=str,
        help=
        'flag(s) to filter bam file on. delimited list, default 256,2048,2304',
        default='256,2048,2304')

    required.add_argument(
        '-s',
        '--splits',
        type=int,
        help=
        'Number of splits read aligns to filter on for bedpe file (2 only option right now, hope to change in the future)[2]',
        default=2)

    required.add_argument(
        '-a',
        '--alt_chroms',
        type=str_to_bool,
        help=
        'Does BAM file use alternative chromosome names? (i.e. NC_000001.11, etc.) [False]',
        default=False)

    optional = parser.add_argument_group(
        'Optional', 'methylation call tsv file (from f5c)')

    optional.add_argument(
        '-m',
        '--meth',
        type=str,
        help='Methylation calls tsv file to filter (from f5c)')

    args = parser.parse_args()

    # 1. Read in BAM file & extract read ID and CIGAR string, filtered on flags
    codes = {int(item) for item in args.flag.split(',')}

    reads = dict()
    with pysam.AlignmentFile(args.bam, "rb") as inbam:
        for read in inbam:
            if read.flag in codes:
                reads[read.query_name] = read.cigarstring

    # 2. Parsing CIGAR string for left and right soft-clipping
    clips = defaultdict(dict)
    for key, value in reads.items():
        items = list(Cigar(value).items())
        first_len, first_op = items[0]
        last_len, last_op = items[-1]
        clips[key]["LC"] = int(first_len) if first_op == "S" else 0
        clips[key]["RC"] = int(last_len) if last_op == "S" else 0

    # 3. Converting clips nested dict into pd dataframe, filtering on clipping criteria
    if clips:
        clips_df = pd.DataFrame.from_dict(clips, orient='index')
        clips_df = clips_df[(clips_df['LC'] >= args.clip_size_thresh) |
                            (clips_df['RC'] >= args.clip_size_thresh)]
        big_clip = list(clips_df.index)
    else:
        # No read matched the flags: nothing to filter (an empty frame has
        # no LC/RC columns to index).
        big_clip = []
    # Set for O(1) membership tests while streaming the BAM file.
    big_clip_names = set(big_clip)

    # 4. Write every alignment of the kept reads to a new file, counting the
    #    alignments per read on the same pass (needed for step 5).
    # NOTE(review): mode 'w' makes pysam write SAM text although the file is
    # named *.bam; 'wb' would write BAM -- confirm which is wanted.
    counts = Counter()
    with pysam.AlignmentFile(args.bam, "rb") as inbam, \
            pysam.AlignmentFile(args.output + '_clipped.bam',
                                'w',
                                template=inbam) as outfile:
        for read in inbam:
            if read.query_name in big_clip_names:
                outfile.write(read)
                counts[read.query_name] += 1

    # 5. Creating bedpe file
    # BEDPE format:
    # chrom1, start1, end1, chrom2, start2, end2
    # Right now this will only return reads that map to 2 places in the
    # genome, in the future having it enabled for multi-mapping reads would
    # be preferable
    N = args.splits  # the column layout below only supports 2

    unique_reads = {key for key, value in counts.items() if value == N}

    splits = defaultdict(dict)
    with pysam.AlignmentFile(args.bam, "rb") as inbam:
        for read in inbam:
            name = read.query_name
            if name not in unique_reads:
                continue
            if name not in splits:
                splits[name]["chromosome"] = read.reference_name
                splits[name]["start"] = str(read.reference_start)
                splits[name]["end"] = str(read.reference_end)
            else:
                # Further alignments are appended as comma-separated values.
                splits[name]["chromosome"] += "," + read.reference_name
                splits[name]["start"] += "," + str(read.reference_start)
                splits[name]["end"] += "," + str(read.reference_end)

    bedpe_columns = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
    if splits:
        bedpe = pd.DataFrame.from_dict(splits, orient='index')
        bedpe[['chrom1', 'chrom2']] = bedpe['chromosome'].str.split(
            ',',
            expand=True,
        )
        bedpe[['start1', 'start2']] = bedpe['start'].str.split(
            ',',
            expand=True,
        )
        bedpe[['end1', 'end2']] = bedpe['end'].str.split(
            ',',
            expand=True,
        )
        bedpe = bedpe[bedpe_columns]
    else:
        # No split reads: still write a header-only BEDPE file.
        bedpe = pd.DataFrame(columns=bedpe_columns)

    chr_dict = {
        "NC_000001.11": "chr1",
        "NC_000002.12": "chr2",
        "NC_000003.12": "chr3",
        "NC_000004.12": "chr4",
        "NC_000005.10": "chr5",
        "NC_000006.12": "chr6",
        "NC_000007.14": "chr7",
        "NC_000008.11": "chr8",
        "NC_000009.12": "chr9",
        "NC_000010.11": "chr10",
        "NC_000011.10": "chr11",
        "NC_000012.12": "chr12",
        "NC_000013.11": "chr13",
        "NC_000014.9": "chr14",
        "NC_000015.10": "chr15",
        "NC_000016.10": "chr16",
        "NC_000017.11": "chr17",
        "NC_000018.10": "chr18",
        "NC_000019.10": "chr19",
        "NC_000020.11": "chr20",
        "NC_000021.9": "chr21",
        "NC_000022.11": "chr22",
        "NC_000023.11": "chrX",
        "NC_000024.10": "chrY",
    }

    if args.alt_chroms:
        bedpe['chrom1'] = bedpe['chrom1'].map(chr_dict)
        bedpe['chrom2'] = bedpe['chrom2'].map(chr_dict)

    bedpe.to_csv(args.output + "_split_reads.bedpe", index=False, sep='\t')

    # 6. Optional Methylation filtering
    if args.meth is not None:
        meth = pd.read_csv(args.meth, sep='\t')
        meth = meth[meth.read_name.isin(big_clip)]
        meth.to_csv(args.output + '_clipped_meth.tsv', index=False, sep='\t')

Ejemplo n.º 22

0

Mostrar archivo

def find_sa_chimeras(
        bam,
        fh_out,
        sa_singletons_only=False,  # anything with dist below min_chim_dist but >0
        min_chim_dist=MIN_CHIM_DIST,  # anything below min_chim_dist is called sa_singleton (split alignment singleton)
        max_nm_perc=100,
        ign_string=None,
        ign_max_err=0,
        add_nm=False):
    """Find chimeras from BWA-MEM split mappings (defined as chimeras here).

    Iterates over the primary alignments in ``bam`` and, for every entry of
    their SA tag, builds a ``chimeras2.Chimera`` from the primary alignment
    and that split alignment. Each accepted chimera is written to ``fh_out``
    as one line.

    Args:
        bam: alignment file, passed straight to ``pysam.Samfile``.
        fh_out: object with a ``write`` method receiving one line per chimera.
        sa_singletons_only: if true, keep only pairs whose ``dist2d()`` is
            below ``min_chim_dist``; otherwise keep only pairs at or above it.
            Pairs with a negative ``dist2d()`` are always dropped.
        min_chim_dist: distance threshold separating sa_singletons from
            chimeras.
        max_nm_perc: maximum allowed NM value as a percentage of the aligned
            query length; applied to the primary and to each split alignment.
        ign_string: optional pattern; reads whose sequence fuzzy-matches it
            are skipped.
        ign_max_err: number of errors tolerated in the ``ign_string`` match.
        add_nm: if true, copy the NM values onto both chimera halves.

    Raises:
        AssertionError: if a read with an SA tag has no NM tag, if an SA
            strand is neither '+' nor '-', or if the reference length derived
            from the CIGAR disagrees with ``read.aend``.
    """

    debug = False
    num_chimeras = 0  # only consulted when debug is switched on

    fh_in = pysam.Samfile(bam)
    # try/finally so the alignment file is closed even when an assertion or
    # sanity check below raises
    try:
        for (num_reads, read) in enumerate(fh_in):

            if debug and (num_chimeras > 100 or num_reads > 10000):
                sys.stderr.write("DEBUG break\n")
                break
            if read.is_unmapped or read.is_qcfail or read.is_duplicate:
                continue
            if not read_is_primary(read):
                continue

            qname = read.qname
            qseq = read.seq
            # pysam 0.7.7 uses seq and not query_sequence (clipped anyway?)
            # pysam 0.7.7 uses qname not query_name

            if ign_string:
                if regex.search(
                        "({}){{e<={}}}".format(ign_string, ign_max_err), qseq,
                        regex.BESTMATCH):
                    #sys.stderr.write("DEBUG: ignoring {}\n".format(qseq))
                    continue

            # previously ignored MQ0. Should be decided downstream.
            # Still valuable in large genomes.
            #
            # if not read.mapping_quality > 0:
            # pysam 0.7.7 has mapq and not mapping_quality
            #if not read.mapq > 0:
            #    continue

            # finding split alignments via SA tag in primary alignment
            tags = dict(read.tags)
            # `in` instead of dict.has_key(), which no longer exists on
            # Python 3
            if 'SA' not in tags:
                continue

            assert 'NM' in tags
            perc_nm = tags['NM'] * 100.0 / float(
                len(query_aln_seq(read.seq, read.cigarstring)))
            if perc_nm > max_nm_perc:
                continue

            ori_cigar = list(Cigar(read.cigarstring).items())

            # no need to determine clip site here and later. overlap/proximity
            # of mapping determines chimera already
            #
            # skip if clip site can't be determined
            #if not clip_site(ori_cigar):
            #    sys.stderr.write(
            #        "WARN: can't determine clip site (or clip too small) for {} in {}\n".format(
            #            ori_cigar, qname))
            #    continue

            # SA == supplementary alignment
            # for definition of SA tag see:
            # https://sourceforge.net/p/samtools/mailman/message/30853577/
            # chr,strandPos,CIGAR,mapQ,NM;
            num_valid_sa = 0
            for sa_entry in tags['SA'].rstrip(";").split(';'):
                sa_tag = dict(
                    zip(['chrom', 'pos', 'strand', 'cigar', 'mq', 'nm'],
                        sa_entry.split(",")))
                assert sa_tag['strand'] in ['+', '-']
                for k in ['pos', 'mq', 'nm']:
                    sa_tag[k] = int(sa_tag[k])

                # previously ignored MQ0. Should be decided downstream.
                # Still valuable in large genomes.
                #if int(sa_tag['mq']) == 0:
                #    continue

                sa_cigar = list(Cigar(sa_tag['cigar']).items())
                #if not clip_site(sa_cigar):
                #    sys.stderr.write(
                #        "WARN: can't determine clip site (or clip too small) for {} in {} (SA)\n".format(
                #            sa_cigar, qname))
                #    continue

                # clips have to be on opposite sites
                #if clip_site(ori_cigar) == clip_site(sa_cigar):
                #    sys.stderr.write(
                #        "WARN: clip on identical sites for {}: {} and {}\n".format(
                #            qname, ori_cigar, sa_cigar))
                #    continue

                perc_nm = sa_tag['nm'] * 100.0 / float(
                    len(query_aln_seq(read.seq, sa_tag['cigar'])))
                if perc_nm > max_nm_perc:
                    continue

                # indirect testing of cigar2rlen()
                # pysam 0.7.7 has aend and not reference_end
                #assert read.pos + cigar2rlen(ori_cigar) == read.reference_end
                assert read.pos + cigar2reflen(ori_cigar) == read.aend

                chim = chimeras2.Chimera()
                chim.qname = qname

                # default is to assign primary to left and SA to right and
                # then sort

                chim.left.flag = read.flag
                chim.left.rname = fh_in.getrname(read.tid)
                # pysam 0.7.7 uses tid not reference_id
                chim.left.pos = read.pos
                chim.left.aend = read.aend
                chim.left.mapq = read.mapq
                chim.left.cigar = read.cigarstring
                chim.left.seq = query_aln_seq(qseq, chim.left.cigar)
                if add_nm:
                    chim.left.nm = tags['NM']

                chim.right.flag = 0 if sa_tag['strand'] == '+' else 16
                chim.right.rname = sa_tag['chrom']
                chim.right.pos = sa_tag['pos']
                chim.right.aend = sa_tag['pos'] + cigar2reflen(sa_cigar)
                chim.right.mapq = sa_tag['mq']
                chim.right.cigar = sa_tag['cigar']
                chim.right.seq = query_aln_seq(qseq, chim.right.cigar)
                if add_nm:
                    chim.right.nm = sa_tag['nm']

                chim.sort_halves()
                if chim.order == "invalid":
                    # likely revcomp vs non-revcomp
                    continue

                # no overlap allowed ever
                if chim.dist2d() < 0:
                    continue
                # sa_singleton and wanted?
                if chim.dist2d() < min_chim_dist:
                    if not sa_singletons_only:
                        continue
                else:
                    if sa_singletons_only:
                        continue

                #sys.stderr.write(
                #    "DEBUG chim before sanity_check = {} (dist2d={})\n".format(
                #        chim, chim.dist2d()))
                chim.sanity_check()

                num_valid_sa += 1
                if num_valid_sa > 1:
                    sys.stderr.write(
                        "WARN: More than one valid SA found for {}\n".format(
                            qname))

                fh_out.write("{}\n".format(chim))
                if debug:
                    num_chimeras += 1
    finally:
        fh_in.close()

Ejemplo n.º 23

0

Mostrar archivo

Archivo: writeMappedReadPairs_pub.py Proyecto: Zhong-Lab-UCSD/PROPERseqTools

# De-duplicate the read ids before reporting.
idList = list(set(idList))

# Both report files are opened in append mode; their paths are assembled from
# the command-line arguments.
targetFile1 = open(
    '%s/%sintermediateFiles/mappedReadPairs_all_bwa.csv_%s' %
    (sys.argv[1], sys.argv[5], sys.argv[2]), 'a')
targetFile2 = open(
    '%s/%sintermediateFiles/chimericReadPairs_all_bwa.csv_%s' %
    (sys.argv[1], sys.argv[5], sys.argv[2]), 'a')
targetFile2.write(
    'readId,R1Tx,R1start,R1end,R1Gene,R1Cigar,R2Tx,R2start,R2end,R2Gene,R2Cigar\n'
)
for readId in idList:
    # Keep the gene collections themselves: the ';'-joined strings are only
    # for output and must not be used for the overlap test below.
    genes1 = list(dicReadIdGene1[readId])
    genes2 = list(dicReadIdGene2[readId])
    geneList1 = ';'.join(genes1)
    geneList2 = ';'.join(genes2)
    cigar1, cigar2 = Cigar(dicIdtoCigar1[readId][0]), Cigar(
        dicIdtoCigar2[readId][0])
    [txId1, start1, end1] = dicReadIdPos1[readId][0]
    [txId2, start2, end2] = dicReadIdPos2[readId][0]
    gene1, gene2 = dicIdGeneName[txId1], dicIdGeneName[txId2]
    type1, type2 = dicIdGeneType[txId1], dicIdGeneType[txId2]
    targetFile1.write(
        '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' %
        (readId, gene1, gene2, txId1, start1, end1, str(cigar1), type1,
         geneList1, txId2, start2, end2, str(cigar2), type2, geneList2))

    # The two mates share no gene. Intersect the gene names, not the
    # characters of the joined strings (which almost always overlap, e.g.
    # on the ';' separator).
    if len(set(genes1) & set(genes2)) == 0:
        if type1 == 'mRNA' and type2 == 'mRNA':
            #check cigar string
            cigar1, cigar2 = Cigar(dicIdtoCigar1[readId][0]), Cigar(
                dicIdtoCigar2[readId][0])

Ejemplo n.º 24

0

Mostrar archivo

Archivo: apply_dipcall_bed_filter.py Proyecto: yangchou0916/cactus

def adjust_mapping(mapping, overlap_region):
    """Return a copy of ``mapping`` trimmed to lie within ``overlap_region``.

    The ``cg:Z:`` CIGAR tag (searched from field 12 onwards) is trimmed from
    both ends until the target interval fits the region, and the coordinate
    and length fields are updated to match. Leading and trailing
    insertions/deletions left over after trimming are removed as well.

    Fields used (per the notes in this function): ``mapping[2]``/``[3]`` are
    query start/end, ``mapping[7]``/``[8]`` are target start/end,
    ``mapping[9]`` is the number of M bases and ``mapping[10]`` the number of
    M+I+D bases. NOTE(review): this looks like a PAF record; confirm against
    the caller.

    Args:
        mapping: indexable record with the fields described above; it is
            deep-copied, the caller's object is not modified.
        overlap_region: indexable whose items 0 and 1 are the target start
            and end to trim to.

    Returns:
        The adjusted copy, after passing through ``drop_unadjusted_fields``.

    Raises:
        ValueError: if no ``cg:Z:`` tag is present, or if an operation other
            than M/D/I/H/S is met while trimming.
        IndexError: if trimming consumes the whole CIGAR (see the todo below).
    """
    # don't adjust the original mapping, adjust a copy.
    mapping = copy.deepcopy(mapping)

    # find the cigar & its field from the SAM-like typed key-value pairs at the end.
    cig = None
    cig_field = None
    for field, tag in enumerate(mapping[12:], start=12):
        if tag[:5] == "cg:Z:":
            cig = col.deque(Cigar(tag[5:]).items())
            cig_field = field
            break
    if cig is None:
        # fail with a clear message instead of a TypeError on None further down
        raise ValueError('No cg:Z: cigar tag found in mapping')
    # print ("adjusting mapping", cig, "overlap region", overlap_region)

    ## Note: mapping[X], X=2,3 is query start & end; X=7,8 is target start and end;
    #        9 is # of M bases; 10 is # of M+I+D bases
    # mapping[9] is not maintained during trimming; it is recomputed from the
    # final cigar at the end.

    if mapping[7] < overlap_region[0]:
        #adjust start of the mapping
        adjust_amount = overlap_region[0] - mapping[7]
        while adjust_amount > 0:
            if cig[0][1] == "M":
                if cig[0][0] <= adjust_amount:
                    adjust_amount -= cig[0][0]
                    mapping[2] += cig[0][0]
                    mapping[7] += cig[0][0]
                    mapping[10] -= cig[0][0]
                    cig.popleft()
                else:
                    # we only need to slightly adjust this cig field, because the adjust amount is less than the cig field.
                    cig[0] = (cig[0][0] - adjust_amount, cig[0][1])
                    mapping[2] += adjust_amount
                    mapping[7] += adjust_amount
                    mapping[10] -= adjust_amount
                    adjust_amount = 0
            elif cig[0][1] == "D":
                if cig[0][0] <= adjust_amount:
                    adjust_amount -= cig[0][0]
                    mapping[7] += cig[0][0]
                    mapping[10] -= cig[0][0]
                    cig.popleft()
                else:
                    # we only need to slightly adjust this cig field, because the adjust amount is less than the cig field.
                    cig[0] = (cig[0][0] - adjust_amount, cig[0][1])
                    mapping[7] += adjust_amount
                    mapping[10] -= adjust_amount
                    adjust_amount = 0
            elif cig[0][1] == "I":
                # an insertion consumes query only, so it never reduces adjust_amount
                mapping[2] += cig[0][0]
                mapping[10] -= cig[0][0]
                cig.popleft()
            elif cig[0][1] in "HS":
                cig.popleft()
            else:
                raise ValueError('Inappropiate Cigar value ' + cig[0][1] +
                                 ' in cigar ' + mapping[cig_field])

    # print ("mapping with adj start", cig, "overlap region", overlap_region)

    if mapping[8] > overlap_region[1]:
        #adjust end of the mapping
        adjust_amount = mapping[8] - overlap_region[1]
        while adjust_amount > 0:
            if cig[-1][1] == "M":
                if cig[-1][0] <= adjust_amount:
                    adjust_amount -= cig[-1][0]
                    mapping[3] -= cig[-1][0]
                    mapping[8] -= cig[-1][0]
                    mapping[10] -= cig[-1][0]
                    cig.pop()
                else:
                    # we only need to slightly adjust this cig field, because the adjust amount is less than the cig field.
                    cig[-1] = (cig[-1][0] - adjust_amount, cig[-1][1])
                    mapping[3] -= adjust_amount
                    mapping[8] -= adjust_amount
                    mapping[10] -= adjust_amount
                    adjust_amount = 0
            elif cig[-1][1] == "D":
                if cig[-1][0] <= adjust_amount:
                    adjust_amount -= cig[-1][0]
                    mapping[8] -= cig[-1][0]
                    mapping[10] -= cig[-1][0]
                    cig.pop()
                else:
                    # we only need to slightly adjust this cig field, because the adjust amount is less than the cig field.
                    cig[-1] = (cig[-1][0] - adjust_amount, cig[-1][1])
                    mapping[8] -= adjust_amount
                    mapping[10] -= adjust_amount
                    adjust_amount = 0
            elif cig[-1][1] == "I":
                # an insertion consumes query only, so it never reduces adjust_amount
                mapping[3] -= cig[-1][0]
                mapping[10] -= cig[-1][0]
                cig.pop()
            elif cig[-1][1] in "HS":
                cig.pop()
            else:
                raise ValueError('Inappropiate Cigar value ' + cig[-1][1] +
                                 ' in cigar ' + mapping[cig_field])

    # print ("mapping with adj end", cig, "overlap region", overlap_region)

    #todo: consider if there may be no mapping left - just a deletion field. Is it possible to accidentally find an "overlap region" that just shows unmapped reference, i.e. a deletion? This would mean below while-loop would crash eventually. Need to catch that eventuality and have it return a None mapping, instead.
    while cig[-1][1] in "ID":
        #while the cigar ends on an insertion/deletion, (which doesn't make biological sense), remove it.
        if cig[-1][1] == "D":
            mapping[8] -= cig[-1][0]
            mapping[10] -= cig[-1][0]
            cig.pop()
        elif cig[-1][1] == "I":
            mapping[3] -= cig[-1][0]
            mapping[10] -= cig[-1][0]
            cig.pop()

    # print ("dropped all in 'ID' in end.", cig, "overlap region", overlap_region)

    while cig[0][1] in "ID":
        #while the cigar starts on an insertion/deletion, (which doesn't make biological sense), remove it.
        if cig[0][1] == "D":
            mapping[7] += cig[0][0]
            mapping[10] -= cig[0][0]
            cig.popleft()
        elif cig[0][1] == "I":
            mapping[2] += cig[0][0]
            mapping[10] -= cig[0][0]
            cig.popleft()

    # adjust the "match count" field from the trimmed cigar.
    mapping[9] = sum(length for length, op in cig if op == "M")

    # write the trimmed cigar back into its tag field.
    adjusted_cigar = "".join(str(length) + op for length, op in cig)
    mapping[cig_field] = "cg:Z:" + adjusted_cigar

    mapping = drop_unadjusted_fields(mapping)

    return mapping

Ejemplo n.º 25

0

Mostrar archivo

Archivo: CRISPRAmpSeq.py Proyecto: tdfy/CRISPRAmpSeq

                              set(ctrl_df_dict[samp].iloc[:, 1]))
        u2 = set.intersection(set(ctrl_df_dict[samp].iloc[:, 0]),
                              set(ctrl_df_dict[samp].iloc[:, 2]))
        u3 = set.intersection(set(ctrl_df_dict[samp].iloc[:, 1]),
                              set(ctrl_df_dict[samp].iloc[:, 2]))
        print("Number of Controls:", len(ctrl_df_dict[samp].columns))

        u = u1 | u2 | u3
        print("Length of Artifacts:", len(u1), len(u2), len(u3))

    elif len(ctrl_df_dict[samp].columns) == 1:
        pos_list = []

        changes = ctrl_df_dict[samp].iloc[:, 0].tolist()
        for cig in changes:
            CIGAR_edit = Cigar(cig)
            CIG_list = list(CIGAR_edit.items())

            for index, tup in enumerate(CIG_list):
                ch = CIG_list[index][1]
                if ch == 'X':

                    slice_l = CIG_list[:index + 1]
                    pos = str(sum([t[0] for t in slice_l]))
                    variant = slice_l[-1][1]
                    pos_list.append(pos)

                else:
                    pass

        artifact_pos = [

Ejemplo n.º 26

0

Mostrar archivo

def map_pos(dna_pos, cigar_val, rna_query):
    """
    Return genomic position of a transcript position.

    Walks the CIGAR string and builds two parallel lists, one of transcript
    (query) positions starting at 0 and one of genomic positions starting at
    dna_pos, then looks up rna_query in that table.

    Args:
        dna_pos: read mapping start position on a chromosome
        cigar_val: cigar string
        rna_query: transcript position

    Returns:
        Genomic position paired with rna_query. For a position inside an
        insertion this is a string of the form '<genomic position>I'.

    Raises:
        IndexError: if rna_query does not occur in the mapping table.
    """
    #Split cigar using cigar module
    c_split = list(Cigar(cigar_val).items())

    #Next transcript position to assign
    rna_pos = 0

    #Parallel lists: rna_map[k] pairs with dna_map[k]
    rna_map = []
    dna_map = []

    #Using cigar string, build transcript to genomic position mapping table
    for c_len, c_type in c_split:

        #Cigar type: match, mismatch (consumes transcript and genome)
        if c_type in ("M", "=", "X"):
            rna_map.extend(range(rna_pos, rna_pos + c_len))
            dna_map.extend(range(dna_pos, dna_pos + c_len))
            rna_pos += c_len
            dna_pos += c_len

        #Cigar type: Soft clip (bases are present in the read but unaligned)
        elif c_type == "S":
            #Only a leading clip lies before the mapping start and has to be
            #shifted back; a trailing clip continues after the last aligned
            #base and must not be moved onto already assigned coordinates
            if not dna_map:
                dna_pos -= c_len
            rna_map.extend(range(rna_pos, rna_pos + c_len))
            dna_map.extend(range(dna_pos, dna_pos + c_len))
            rna_pos += c_len
            dna_pos += c_len

        #Cigar type: deletion / skipped region (consumes genome only)
        elif c_type in ("D", "N"):
            rna_map.extend([str(rna_pos) + c_type] * c_len)
            dna_map.extend(range(dna_pos, dna_pos + c_len))
            dna_pos += c_len

        #Cigar type: insertion in the read (consumes transcript only)
        elif c_type == "I":
            rna_map.extend(range(rna_pos, rna_pos + c_len))
            dna_map.extend([str(dna_pos) + 'I'] * c_len)
            rna_pos += c_len

        #Cigar type: hard clip and padding consume neither the transcript
        #nor the genome (SAM specification), so they add no positions

    #Convert list to data frame
    pos_map_df = pd.DataFrame(list(zip(rna_map, dna_map)),
                              columns=['rna', 'dna'])

    #Get genomic position for transcript position query
    dna_val = pos_map_df[pos_map_df['rna'] == rna_query]
    #Return genomic position only
    return (dna_val['dna'].values[0])