Beispiel #1
0
 def get_position_in_ref(self, pos, ref):
     """Map a read position onto the reference through a local bwa alignment.

     A window ``[pos - 50, pos + 50)`` of ``self.seq`` / ``self.quality`` is
     written to ``temp_files/temp_fastq.fastq`` and aligned with
     ``bwa mem -x ont2d`` against ``ref``; the result is written to
     ``temp_files/temp.sam``. The CIGAR of the first alignment record is
     then walked until 50 query bases have been counted.

     Args:
         pos: index into ``self.seq`` around which the window is cut.
         ref: reference argument handed to ``bwa mem``. It is pasted into
             a shell command line unquoted.

     Returns:
         ``POS + 49 + shift`` where ``POS`` is the SAM start column and
         ``shift`` is (deleted bases - inserted bases) seen before the 50th
         counted base. ``None`` when the window is unmapped, when the SAM
         file holds no alignment record, or when the alignment ends before
         50 bases are counted.
     """
     # NOTE(review): for pos < 50 the slice start is negative and Python
     # slices from the end of the string instead - confirm callers never
     # pass such positions.
     window_seq = self.seq[pos - 50:pos + 50]
     window_qual = self.quality[pos - 50:pos + 50]
     with open('temp_files/temp_fastq.fastq', 'w') as fw:
         fw.write('@temp\n' + window_seq + '\n+\n' + window_qual)
     subprocess.run('bwa mem -M -x ont2d -t 7 ' + ref +
                    ' temp_files/temp_fastq.fastq > temp_files/'
                    'temp.sam',
                    shell=True,
                    stdout=FNULL,
                    stderr=subprocess.STDOUT)

     # Take the first non-header record rather than a fixed line index:
     # the number of '@' header lines depends on how many sequences the
     # reference contains.
     row = None
     with open('temp_files/temp.sam') as f:
         for line in f:
             if line.startswith('@') or not line.strip():
                 continue
             row = line.strip().split()
             break
     if row is None or row[2] == '*':
         return None

     ref_pos = int(row[3])
     # Expand e.g. "3M1I" into "MMMI" so the walk handles one base per step.
     split_cigar = ''.join(op * length for length, op in Cigar(row[5]).items())

     shift = 0
     counted = 0
     # NOTE(review): every operation except 'D' is counted as a query base
     # here (including 'S', 'H' and 'N') - confirm that is intended.
     for op in split_cigar:
         if op == 'D':
             # A deletion moves the reference forward without using a base.
             shift += 1
             continue
         counted += 1
         if op == 'I':
             shift -= 1
         if counted == 50:
             return ref_pos + 49 + shift
     # The alignment covers fewer than 50 query bases.
     return None
Beispiel #2
0
def getGeneLocation(hit, gene):
    """Summarise the alignments of a gene and build a consensus from them.

    Args:
        hit: iterable of alignment records; each record is indexed with the
            keys 'cigar' (CIGAR string), 'seq' (aligned read sequence) and
            'ss' (start position, converted with ``int``).
        gene: the gene sequence; its length normalises the coverage and it
            is passed on to ``makeConsensus``.

    Returns:
        A dict with the keys 'note', 'hit', 'pos', 'consensus' and
        'coverage'. When ``hit`` yields no records the dict only holds
        'note' ('Not found'), an empty 'consensus' and an empty 'hit' list.
    """
    loc = {'note': ''}
    list_hit = []
    for h in hit:
        items = list(Cigar(h['cigar']).items())
        seq = h['seq']
        # Soft-clipped bases are still present in the stored sequence, so
        # strip them on whichever side is clipped (not only when both ends
        # are). Hard-clipped bases are not stored and need no trimming.
        lead = items[0][0] if items[0][1] == 'S' else 0
        trail = items[-1][0] if items[-1][1] == 'S' else 0

        # NOTE(review): insertions are subtracted from the end coordinate
        # here although they do not consume reference bases - confirm this
        # is what the consumers of 'send' expect.
        send = int(h['ss'])
        for length, op in items:
            if op in ('M', 'D'):
                send += int(length)
            elif op == 'I':
                send -= int(length)

        list_hit.append({
            'seq': seq[lead:len(seq) - trail],
            # Length of the first CIGAR operation, whatever its type.
            'pos': int(items[0][0]),
            'ss': int(h['ss']),
            'send': send,
        })

    list_hit_sorted = sorted(list_hit, key=lambda k: k['ss'])
    if not list_hit_sorted:
        loc['consensus'] = ''
        loc['hit'] = []
        loc['note'] = 'Not found'
        return loc

    # Greedy scaffold: keep a hit only when it starts beyond the end of the
    # previously kept one (start + trimmed sequence length).
    scafold = [list_hit_sorted[0]]
    for candidate in list_hit_sorted[1:]:
        if candidate['ss'] > scafold[-1]['ss'] + len(scafold[-1]['seq']):
            scafold.append(candidate)

    if len(scafold) < len(list_hit_sorted):
        loc['note'] = "Multiple sequences found"
    loc['hit'] = list_hit_sorted
    loc['pos'] = list_hit_sorted[0]['pos']
    cover_len = sum(len(piece['seq']) for piece in scafold)
    loc['consensus'] = makeConsensus(scafold, gene)
    loc['coverage'] = cover_len / len(gene)
    return loc
Beispiel #3
0
def alignment_length_cigar(CIGAR):
    '''
    Return the number of reference bases spanned by an alignment

    Input:
        1. CIGAR: CIGAR string

    Output:
        1. alignmentLen: sum of the lengths of all reference-consuming operations
    '''
    # Operations that advance along the reference:
    #   M / = / X  consume both query and reference
    #   D / N      consume the reference only
    refConsuming = ('M', '=', 'X', 'D', 'N')

    return sum(
        int(opLength)
        for opLength, opCode in Cigar(CIGAR).items()
        if opCode in refConsuming
    )
Beispiel #4
0
def parse_cigar(aln, qryseq, refseq, cutoff=500):
    """Split a pairwise alignment into blocks at large clips and indels.

    The CIGAR string is walked once. Matches, short deletions and short
    insertions are accumulated into the current block; a clip, deletion or
    insertion of at least ``cutoff`` bases closes the current block and is
    emitted as a block of its own.

    Args:
        aln: CIGAR string of the alignment.
        qryseq: query sequence, sliced with the running query offset.
        refseq: reference sequence, sliced with the running reference offset.
        cutoff: minimum operation length that triggers a block split.

    Returns:
        A tuple ``(qrys, refs, blks)`` of equally long lists (asserted at the
        end). ``qrys`` / ``refs`` hold a ``(start, end)`` interval or ``None``
        per block; each entry of ``blks`` is
        ``(block characters, (Q, query breakpoints), (R, reference breakpoints))``
        where ``Q`` and ``R`` map block positions to the differing character.
    """
    from cigar import Cigar

    aln = Cigar(aln)

    # lq/rq and lr/rr: start and running end of the pending interval on the
    # query and on the reference respectively.
    lq, rq = 0, 0
    lr, rr = 0, 0
    refs = []
    qrys = []
    blks = []

    # Per-block state: R/Q record positions where reference/query deviate
    # from the block sequence, refmap/qrymap record (position, offset)
    # breakpoints after each operation.
    R, Q = {}, {}
    blkseq = ""
    blkpos = 0
    refmap = [(rr, blkpos - rr)]
    qrymap = [(rq, blkpos - rq)]

    def push(qval=None, rval=None):
        """Close the current block and reset the per-block state.

        An interval is recorded only when it is non-empty; ``None`` is
        recorded as-is. A block is appended when at least one of the two
        sides was recorded.
        """
        nonlocal R, Q, blkseq, blkpos, refmap, qrymap
        assert not (qval is None and rval is None)

        def f(xs, x):
            """Append ``x`` to ``xs`` if it is None or a non-empty interval."""
            if x is None:
                xs.append(None)
                return True
            else:
                # zip(x) wraps each bound in a 1-tuple, so the comparison
                # below is between 1-tuples; it orders like the bare bounds.
                l, r = zip(x)
                if l < r:
                    xs.append(x)
                    return True
                return False

        hasq = f(qrys, qval)
        hasr = f(refs, rval)

        if hasq or hasr:
            # NOTE(review): if only one of two non-None intervals is
            # non-empty the lists differ in length and this assert fires.
            assert len(qrys) == len(refs)
            assert len(blkseq) > 0, "empty seq"
            blks.append((np.array(list(blkseq)), (Q, np.array(qrymap).T),
                         (R, np.array(refmap).T)))

        # Start a fresh block at the current offsets.
        R, Q = {}, {}
        blkseq = ""
        blkpos = 0
        refmap = [(rr, blkpos - rr)]
        qrymap = [(rq, blkpos - rq)]

    def recordbp():
        """Record a breakpoint at the current end of the block sequence."""
        nonlocal blkpos, refmap, qrymap

        blkpos = len(blkseq)
        refmap.append((rr, blkpos - rr))
        qrymap.append((rq, blkpos - rq))

    for l, t in aln.items():
        if t in ['S', 'H']:
            if l >= cutoff:
                # NOTE(review): leftover debugging aid - this prints the
                # CIGAR and drops into the ipdb debugger on every large clip.
                print(aln)
                import ipdb
                ipdb.set_trace()

                push((lq, rq), (lr, rr))

                blkseq = qryseq[rq:rq + l]
                # TODO: Think through soft/hard clips
                # NOTE(review): hard clips advance the query offset exactly
                # like soft clips here - confirm qryseq contains those bases.
                # if t == 'S':
                rq += l
                recordbp()

                # The clip becomes a query-only block.
                push((rq - l, rq), None)
                lq = rq
                lr = rr
            else:
                # Short clip: only the query offset moves, nothing is added
                # to the block sequence.
                rq += l
                recordbp()

        elif t == 'M':
            rs = np.array(list(refseq[rr:rr + l]))
            qs = np.array(list(qryseq[rq:rq + l]))
            # Positions inside this match where query and reference differ.
            diff = np.where(np.array(rs != qs))[0]
            for i in diff:
                Q[i + blkpos] = qs[i]
            blkseq += refseq[rr:rr + l]

            rq += l
            rr += l

            recordbp()

        elif t == 'D':
            if l >= cutoff:
                push((lq, rq), (lr, rr))

                blkseq = refseq[rr:rr + l]
                rr += l
                recordbp()

                # The deletion becomes a reference-only block.
                push(None, (rr - l, rr))
                lr = rr
                lq = rq
            else:
                # Short deletion: keep the reference bases in the block and
                # mark them as gaps on the query side.
                for i in range(l):
                    Q[i + blkpos] = '-'
                blkseq += refseq[rr:rr + l]

                rr += l
                recordbp()

        elif t == 'I':
            if l >= cutoff:
                push((lq, rq), (lr, rr))

                blkseq = qryseq[rq:rq + l]
                rq += l
                recordbp()

                # The insertion becomes a query-only block.
                push((rq - l, rq), None)
                lq = rq
                lr = rr
            else:
                # Short insertion: keep the query bases in the block and
                # mark them as gaps on the reference side.
                for i in range(l):
                    R[i + blkpos] = '-'
                blkseq += qryseq[rq:rq + l]

                rq += l
                recordbp()

    # Flush whatever is pending after the last operation.
    push((lq, rq), (lr, rr))
    assert len(qrys) == len(refs) and len(qrys) == len(blks)

    return qrys, refs, blks
        dicIdtoCigar2[readId][0])
    [txId1, start1, end1] = dicReadIdPos1[readId][0]
    [txId2, start2, end2] = dicReadIdPos2[readId][0]
    gene1, gene2 = dicIdGeneName[txId1], dicIdGeneName[txId2]
    type1, type2 = dicIdGeneType[txId1], dicIdGeneType[txId2]
    targetFile1.write(
        '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' %
        (readId, gene1, gene2, txId1, start1, end1, str(cigar1), type1,
         geneList1, txId2, start2, end2, str(cigar2), type2, geneList2))

    if len(set(geneList1) & set(geneList2)) == 0:
        if type1 == 'mRNA' and type2 == 'mRNA':
            #check cigar string
            cigar1, cigar2 = Cigar(dicIdtoCigar1[readId][0]), Cigar(
                dicIdtoCigar2[readId][0])
            cigar1List = list(cigar1.items())
            cigar2List = list(cigar2.items())
            flag1 = False
            flag2 = False
            totalLength1 = float(sum([x[0] for x in cigar1List]))
            totalLength2 = float(sum([x[0] for x in cigar2List]))
            if cigar1List[0][
                    1] == 'M' and cigar1List[0][0] / totalLength1 >= 0.5:
                flag1 = True
            if (cigar1List[0][1] == 'S' or cigar1List[0][1]
                    == 'H') and cigar1List[0][0] / totalLength1 <= 0.2:
                if cigar1List[1][
                        1] == 'M' and cigar1List[1][0] / totalLength1 >= 0.5:
                    flag1 = True
            if cigar2List[0][
                    1] == 'M' and cigar2List[0][0] / totalLength2 >= 0.5:
Beispiel #6
0
def alignment_interval_query(CIGAR, orientation):
    '''
    Compute the interval of the raw query (read) covered by an alignment

    Input:
        1. CIGAR: CIGAR string
        2. orientation: alignment orientation (+ or -)

    Output:
        1. beg: begin position in query
        2. end: end position in query
    '''
    ## 1. Read CIGAR string using proper module
    operations = list(Cigar(CIGAR).items())

    ## 2. Start position in query: length of a leading soft or hard clip,
    ##    0 when the alignment starts unclipped (or the CIGAR is empty)
    startPos = 0
    if operations and operations[0][1] in ('S', 'H'):
        startPos = int(operations[0][0])

    ## 3. Query alignment length: operations consuming aligned query bases
    # - Op M, alignment match (can be a sequence match or mismatch)
    # - Op =, sequence match
    # - Op X, sequence mismatch
    # - Op I, insertion to the reference
    alignmentLen = sum(
        int(length) for length, operation in operations
        if operation in ('M', '=', 'X', 'I'))

    ## 4. Raw read length.
    # Hard clips are included on purpose: startPos counts a leading hard
    # clip, so the read length must count hard-clipped bases as well.
    # len(Cigar) only sums M/I/S/=/X and would make the '-' interval wrong
    # (possibly negative) for hard-clipped alignments.
    readLen = sum(
        int(length) for length, operation in operations
        if operation in ('M', 'I', 'S', '=', 'X', 'H'))

    ## 5. Compute alignment interval in raw query
    # a) Query aligned in +
    if orientation == '+':
        beg = startPos
        end = startPos + alignmentLen

    # b) Query aligned in - (reversed complemented to align)
    else:
        beg = readLen - startPos - alignmentLen
        end = readLen - startPos

    return beg, end
Beispiel #7
0
def map_pos(dna_pos, cigar_val, rna_query):
    """
    Return the genomic position of a transcript position.

    Two parallel lists are built from the CIGAR string, one entry per
    base: the transcript side and the genomic side. Bases missing on one
    side are stored as a string made of the current position followed by
    the operation letter (for example '12D'), so they never compare equal
    to an integer query.

    Args:
        dna_pos: read mapping start position on a chromosome
        cigar_val: cigar string
        rna_query: transcript position

    Returns:
        Genomic position paired with the first matching transcript position
    """
    rna_pos = 0
    rna_map = []
    dna_map = []

    for c_len, c_type in Cigar(cigar_val).items():

        # Match / mismatch / soft clip: both coordinates advance together.
        if c_type in ("M", "=", "X", "S"):
            if c_type == "S":
                # Soft-clipped bases are laid out ending at the current
                # genomic position.
                dna_pos -= c_len
            rna_map.extend(range(rna_pos, rna_pos + c_len))
            dna_map.extend(range(dna_pos, dna_pos + c_len))
            rna_pos = rna_map[-1] + 1
            dna_pos = dna_map[-1] + 1

        # Hard clip: neither coordinate moves.
        elif c_type == "H":
            pass

        # Deletion / skipped region: only the genomic coordinate advances.
        elif c_type in ("D", "N"):
            rna_map.extend([str(rna_pos) + c_type] * c_len)
            dna_map.extend(range(dna_pos, dna_pos + c_len))
            dna_pos = dna_map[-1] + 1

        # Insertion / padding: only the transcript coordinate advances.
        elif c_type in ("I", "P"):
            rna_map.extend(range(rna_pos, rna_pos + c_len))
            dna_map.extend([str(dna_pos) + c_type] * c_len)
            rna_pos = rna_map[-1] + 1

    # Look the query up in a two-column table of paired positions.
    position_table = pd.DataFrame(list(zip(rna_map, dna_map)),
                                  columns=['rna', 'dna'])
    matches = position_table[position_table['rna'] == rna_query]
    return matches['dna'].values[0]
Beispiel #8
0
    filter_df['read_len'] = len_list
    filter_df['start_coo'] = ref_st_list
    filter_df['loc'] = filter_df.index

    filter_df = filter_df.sort_values(['Score'], ascending=[False])

    filter_df = filter_df.drop_duplicates(subset=['Read_Name'], keep='first')

    CIG_dict = dict(zip(filter_df['CIGAR_list'], filter_df['Read_Name']))
    COO_dict = dict(zip(filter_df['CIGAR_list'], filter_df['start_coo']))
    read_len_dict = dict(zip(filter_df['CIGAR_list'], filter_df['read_len']))

    for c_str in filter_df['CIGAR_list']:
        CIGAR_edit = Cigar(c_str)
        l = len(CIGAR_edit)
        CIG_list = list(CIGAR_edit.items())

        for index, tup in enumerate(CIG_list):
            if CIG_list[0][1] is 'S':
                CIG_list.remove(CIG_list[0])
            else:
                pass

            ch = CIG_list[index][1]

            if ch in String_list:

                slice_l = CIG_list[:index + 1]
                slice_m = CIG_list[:index]

                pos2 = str(sum([t[0] for t in slice_m if t[1] != 'I']) + 1)
Beispiel #9
0
        l = line.split("\t")
        readID = l[0]
        chrName = l[2]
        filterDict[readID] = []
        filterDict[readID].append(chrName)
        filterDict[readID] = set(filterDict[readID])

samFile.close()

# Second pass: echo the header and every alignment whose read passes the
# chromosome filter and is matched over (almost) its full length.
# print(x) with a single argument behaves the same under Python 2 and 3.
with open("QH046cDNA.sam", "r") as samFile2:
    for line2 in samFile2:
        if line2.startswith("@"):
            print(line2)
            continue

        l2 = line2.split("\t")
        # Use the fields of the current line; the previous code read the
        # stale `l` left over from the first loop, so every record was
        # filtered by the last read of the first file.
        readID2 = l2[0]
        # Direct lookup: a read missing from filterDict raises KeyError.
        if len(filterDict[readID2]) > 2:
            continue

        seqLength = len(l2[9])
        cigar = Cigar(l2[5])
        cigarList = list(cigar.items())
        # Total number of bases covered by 'M' operations.
        mapp = sum(length for length, op in cigarList if op == "M")
        # Keep reads with at most one base outside 'M' operations.
        if seqLength - mapp <= 1:
            print(line2)
Beispiel #10
0
# Accession-style chromosome names and the 'chrN' names they are replaced
# with when --alt_chroms is set.
_ALT_CHROM_NAMES = {
    "NC_000001.11": "chr1",
    "NC_000002.12": "chr2",
    "NC_000003.12": "chr3",
    "NC_000004.12": "chr4",
    "NC_000005.10": "chr5",
    "NC_000006.12": "chr6",
    "NC_000007.14": "chr7",
    "NC_000008.11": "chr8",
    "NC_000009.12": "chr9",
    "NC_000010.11": "chr10",
    "NC_000011.10": "chr11",
    "NC_000012.12": "chr12",
    "NC_000013.11": "chr13",
    "NC_000014.9": "chr14",
    "NC_000015.10": "chr15",
    "NC_000016.10": "chr16",
    "NC_000017.11": "chr17",
    "NC_000018.10": "chr18",
    "NC_000019.10": "chr19",
    "NC_000020.11": "chr20",
    "NC_000021.9": "chr21",
    "NC_000022.11": "chr22",
    "NC_000023.11": "chrX",
    "NC_000024.10": "chrY",
}

_BEDPE_COLUMNS = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]


def _str_to_bool(value):
    """Convert a command-line token to bool.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, which is
    True for every non-empty value, including "False".
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', 'off', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value (True/False), got %r' % value)


def _parse_args():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(
        description=
        'Parse BAM file for multi-alignment and soft-clipped reads. Will return a BAM file containing split reads and a BEDPE file containing the coordinates of the split reads (useful for circos plots, etc.). Can additionally return a filtered methylation TSV file if one is provided (optional)'
    )

    required = parser.add_argument_group(
        'Required',
        'Bam, clip size to filter on, output location, flags to filter on, splits to filter on, and True/False if alternative chromosomes were used'
    )

    # --bam and --output have no usable default, so argparse reports their
    # absence instead of a later failure when they are used.
    required.add_argument('-b',
                          '--bam',
                          type=str,
                          required=True,
                          help='bam file - must be created with NGMLR')

    required.add_argument('-c',
                          '--clip_size_thresh',
                          type=int,
                          help='soft clip size threshold to filter on [1000]',
                          default=1000)

    required.add_argument('-o',
                          '--output',
                          type=str,
                          required=True,
                          help='output location and prefix')

    required.add_argument(
        '-f',
        '--flag',
        type=str,
        help=
        'flag(s) to filter bam file on. delimited list, default 256,2048,2304',
        default='256,2048,2304')

    required.add_argument(
        '-s',
        '--splits',
        type=int,
        help=
        'Number of splits read aligns to filter on for bedpe file (2 only option right now, hope to change in the future)[2]',
        default=2)

    required.add_argument(
        '-a',
        '--alt_chroms',
        type=_str_to_bool,
        help=
        'Does BAM file use alternative chromosome names? (i.e. NC_000001.11, etc.) [False]',
        default=False)

    optional = parser.add_argument_group(
        'Optional', 'methylation call tsv file (from f5c)')

    optional.add_argument(
        '-m',
        '--meth',
        type=str,
        help='Methylation calls tsv file to filter (from f5c)')

    return parser.parse_args()


def _soft_clip_lengths(cigarstring):
    """Return (left, right) soft-clip lengths of a CIGAR string.

    Only the first and the last operation are inspected; a side without an
    'S' operation at the very end counts as 0.
    """
    items = list(Cigar(cigarstring).items())
    left = int(items[0][0]) if items[0][1] == "S" else 0
    right = int(items[-1][0]) if items[-1][1] == "S" else 0
    return left, right


def main():
    """Extract heavily soft-clipped reads from a BAM file.

    Outputs, all prefixed with ``--output``:
        * ``_clipped.bam``: every alignment of each read that has an
          alignment with a selected flag and a left or right soft clip of
          at least ``--clip_size_thresh`` bases.
        * ``_split_reads.bedpe``: tab-separated coordinates of those reads
          that have exactly ``--splits`` alignments in the BAM file.
        * ``_clipped_meth.tsv``: the rows of ``--meth`` belonging to the
          clipped reads (only when ``--meth`` is given).

    Raises:
        ValueError: when ``--splits`` is not 2; raised after the clipped
            BAM file has been written.
    """
    args = _parse_args()
    codes = {int(item) for item in args.flag.split(',')}

    # 1. Collect the CIGAR string of the alignments carrying a selected
    #    flag. Records without a CIGAR cannot be clipped and are skipped.
    #    When a read has several such alignments the last one wins.
    reads = {}
    with pysam.AlignmentFile(args.bam, "rb") as inbam:
        for read in inbam:
            if read.flag in codes and read.cigarstring is not None:
                reads[read.query_name] = read.cigarstring

    # 2./3. Keep reads whose left or right soft clip reaches the threshold.
    big_clip = []
    for name, cigarstring in reads.items():
        left_clip, right_clip = _soft_clip_lengths(cigarstring)
        if (left_clip >= args.clip_size_thresh
                or right_clip >= args.clip_size_thresh):
            big_clip.append(name)
    # Set for O(1) membership tests while streaming the BAM file.
    big_clip_names = set(big_clip)

    # 4. Write all alignments of the selected reads and, in the same pass,
    #    remember their coordinates for the BEDPE table. The output is
    #    opened with 'wb' so that the '.bam' file really is BAM ('w' writes
    #    SAM text), and both handles are closed by the context managers.
    alignments = defaultdict(list)
    with pysam.AlignmentFile(args.bam, "rb") as inbam:
        with pysam.AlignmentFile(args.output + '_clipped.bam',
                                 'wb',
                                 template=inbam) as outfile:
            for read in inbam:
                if read.query_name not in big_clip_names:
                    continue
                outfile.write(read)
                alignments[read.query_name].append(
                    (read.reference_name, str(read.reference_start),
                     str(read.reference_end)))

    # 5. BEDPE table: chrom1, start1, end1, chrom2, start2, end2.
    #    Only reads aligned to exactly two places can be represented.
    if args.splits != 2:
        raise ValueError(
            'only --splits 2 is supported for the bedpe output, got %d' %
            args.splits)

    rows = [
        placed[0] + placed[1] for placed in alignments.values()
        if len(placed) == args.splits
    ]
    # Passing the column names explicitly yields a header-only file when no
    # read qualifies.
    bedpe = pd.DataFrame(rows, columns=_BEDPE_COLUMNS)

    if args.alt_chroms:
        # Names missing from the table become NaN, as with Series.map.
        bedpe['chrom1'] = bedpe['chrom1'].map(_ALT_CHROM_NAMES)
        bedpe['chrom2'] = bedpe['chrom2'].map(_ALT_CHROM_NAMES)

    bedpe.to_csv(args.output + "_split_reads.bedpe", index=False, sep='\t')

    # 6. Optional methylation filtering
    if args.meth is not None:
        meth = pd.read_csv(args.meth, sep='\t')
        meth = meth[meth.read_name.isin(big_clip)]
        meth.to_csv(args.output + '_clipped_meth.tsv', index=False, sep='\t')