コード例 #1
ファイル: annotation.py プロジェクト: MEIGA-tk/MEIGA-PAV
def repeats_annotation(events, repeatsDb, buffer):
    """
    For each input event assess if it overlaps with an annotated repeat in the reference genome.

    Input:
        1. events: list containing input events to be annotated. Events should be objects containing ref, beg and end attributes.
        2. repeatsDb: dictionary containing annotated repeats organized per chromosome (keys) into genomic bins (values)
        3. buffer: number of base pairs to extend begin and end coordinates for each event prior to assessing overlap

    Output:
        None. A new 'repeatAnnot' attribute is set on each input event.
        'repeatAnnot' is a list of dictionaries. Each dictionary contains the 'family', 'subfamily' and
        'distance' of one overlapping repeat. The list is empty if the event reference has no annotated repeats.
    """
    ## Assess for each input event if it overlaps with an annotated repeat
    for event in events:

        # Default covers case B): no repeat annotated in the same ref as the event
        annotatedRepeats = []

        # A) Annotated repeats in the same ref where the event is located
        if event.ref in repeatsDb:

            ### Select repeats bin database for the corresponding reference
            repeatsBinDb = repeatsDb[event.ref]

            ### Retrieve all the annotated repeats overlapping with the buffer-extended event interval
            overlaps = repeatsBinDb.collect_interval(event.beg - buffer, event.end + buffer, 'ALL')

            ## For each intersection
            for overlap in overlaps:

                # overlap[0] is the annotated repeat; overlap[1] is used as the overlap length
                # NOTE(review): presumably the length is relative to the buffer-extended interval - confirm against collect_interval
                annotated = overlap[0]
                overlapLen = overlap[1]

                repeat = {}
                repeat['family'] = annotated.optional['family']
                repeat['subfamily'] = annotated.optional['subfamily']

                ### Compute distance between the annotated repeat and the raw (non-extended) interval
                rawOverlap = gRanges.overlap(event.beg, event.end, annotated.beg, annotated.end)[0]

                # a) Overlapping raw intervals, distance == 0
                if rawOverlap:
                    distance = 0

                # b) Not overlapping raw intervals. Compute distance
                else:
                    distance = abs(overlapLen - buffer)

                repeat['distance'] = distance

                # Keep the annotation; without this the result list would always be empty
                annotatedRepeats.append(repeat)

        ## Add repeat annotation as attribute
        event.repeatAnnot = annotatedRepeats
コード例 #2
ファイル: MEIGA-PAV.py プロジェクト: MEIGA-tk/MEIGA-PAV
def search4partnered_5prime(structures, fasta, reference, outDir):
    """
    Realign the unresolved 5' piece of each insertion against the reference and flag
    insertions with a confident hit elsewhere in the genome as 5' partnered transductions.

    Input:
        1. structures: dictionary keyed by insertion id. Each value is a dictionary that must
           contain 'STRAND' and 'CHAIN' entries. Insertion ids are parsed as 'ref:coord'.
        2. fasta: object with a seqDict attribute holding the sequence for each insertion id
        3. reference: reference genome handed over to the BWA alignment wrapper
        4. outDir: output directory for the intermediate fasta and alignment files

    Output:
        1. structures: same dictionary, with 'ITYPE', '5PRIME', 'TDCOORD_5PRIME' and
           'TDLEN_5PRIME' set for the insertions with a hit passing the filters
    """
    ## 1. Create Fasta with sequences to realign
    seq2realign = formats.FASTA()

    for insId in structures:

        # Discard if strand not determined
        if structures[insId]['STRAND'] is None:
            continue

        ## Extract unresolved 5' sequence if any
        qBeg, qEnd = structures[insId]['CHAIN'].interval()

        # Plus strand: unresolved piece precedes the chain
        if structures[insId]['STRAND'] == '+':
            seq2realign.seqDict[insId] = fasta.seqDict[insId][:qBeg]

        # Other strand: unresolved piece follows the chain
        else:
            seq2realign.seqDict[insId] = fasta.seqDict[insId][qEnd:]

    fastaPath = outDir + '/seq2realign.5prime.fasta'

    # The aligner reads the fasta from disk, so it has to be written first
    # NOTE(review): assumes formats.FASTA exposes write(path) - confirm against the formats module
    seq2realign.write(fastaPath)

    ## 2. Realign sequences on the reference with BWA-mem
    SAM_path = alignment.alignment_bwa(fastaPath, reference,
                                       'hits2genome.5prime', 1, outDir)
    PAF_path = alignment.sam2paf(SAM_path, 'hits2genome.5prime', outDir)

    PAF = formats.PAF()

    # Load the hits; a freshly created PAF object would otherwise be iterated empty
    # NOTE(review): assumes formats.PAF exposes read(path) - confirm against the formats module
    PAF.read(PAF_path)

    ## 3. Make 5' transduction calls
    # For each hit
    for hit in PAF.alignments:

        hit.tName = 'chr' + hit.tName

        # Query names are insertion ids, parsed as 'ref:coord'
        iRef, coord = hit.qName.split(':')
        iBeg = int(coord) - 500
        iEnd = int(coord) + 500

        ## Filter out hits: low alignment percentage, low mapping quality, or
        ## hits falling back within 500 bp of the insertion site itself
        lowQuality = (hit.alignmentPerc() < 75) or (hit.MAPQ < 30)

        if lowQuality or (iRef == hit.tName and gRanges.overlap(
                iBeg, iEnd, hit.tBeg, hit.tEnd)[0]):
            continue

        ## Make call
        call = structures[hit.qName]
        call['ITYPE'] = 'partnered'
        call['5PRIME'] = True
        call['TDCOORD_5PRIME'] = hit.tName + ':' + str(hit.tBeg) + '-' + str(hit.tEnd)
        call['TDLEN_5PRIME'] = hit.tEnd - hit.tBeg

    return structures
コード例 #3
def collectINDELS(alignmentObj, targetEvents, minINDELlen, targetInterval,
                  overhang, sample):
    """
    Collect insertions and deletions longer than a threshold that are completely spanned within an input read alignment.

    Input:
        1. alignmentObj: pysam read alignment object instance
        2. targetEvents: list with target events (INS: insertion; DEL: deletion)
        3. minINDELlen: minimum INS and DEL length
        4. targetInterval: tuple containing begin and end position of the target genomic interval to extract events from. If None all the events spanned by the read alignment will be reported
        5. overhang: number of flanking base pairs around the SV event to be collected from the supporting read. If None the complete read sequence will be collected
        6. sample: type of sample (TUMOUR, NORMAL or None)

    Output:
        1. INDEL_events: dictionary containing lists of SV events grouped according to the type of INDEL (only those types included in targetEvents):
            * INS -> list of INS objects
            * DEL -> list of DEL objects
    """
    ## Initialize dict
    INDEL_events = {}

    collectINS = 'INS' in targetEvents
    collectDEL = 'DEL' in targetEvents

    if collectINS:
        INDEL_events['INS'] = []

    if collectDEL:
        INDEL_events['DEL'] = []

    ## Initialize positions at query and ref
    posQuery = 0
    posRef = alignmentObj.reference_start

    # Iterate over the CIGAR. A missing CIGAR (None) is treated as having no operations
    for cigarTuple in alignmentObj.cigartuples or ():

        operation = int(cigarTuple[0])
        length = int(cigarTuple[1])

        ## a) INSERTION to the reference >= Xbp
        if collectINS and (operation == 1) and (length >= minINDELlen):

            ## Create INS if:
            # a) No interval specified OR
            # b) Insertion within target interval
            if (targetInterval is None) or gRanges.overlap(
                    posRef, posRef, targetInterval[0], targetInterval[1])[0]:

                # Collect piece of sequence flanking the INS event
                if overhang is None:
                    flankingSeq, bkpPos = alignmentObj.query_sequence, posQuery
                else:
                    flankingSeq, bkpPos = events.pick_flanking_seq_INS(
                        alignmentObj.query_sequence, posQuery, length, overhang)

                # Create INS object and store it
                INS = events.INS(alignmentObj.reference_name, posRef, posRef,
                                 length, alignmentObj.query_name, flankingSeq,
                                 bkpPos, alignmentObj, sample)
                INDEL_events['INS'].append(INS)

        ## b) DELETION to the reference >= Xbp
        if collectDEL and (operation == 2) and (length >= minINDELlen):

            ## Create DEL if:
            # a) No interval specified OR
            # b) Deletion within target interval
            if (targetInterval is None) or gRanges.overlap(
                    posRef, posRef + length, targetInterval[0],
                    targetInterval[1])[0]:

                # Collect piece of sequence flanking the DEL event
                if overhang is None:
                    flankingSeq, bkpPos = alignmentObj.query_sequence, posQuery
                else:
                    flankingSeq, bkpPos = events.pick_flanking_seq_DEL(
                        alignmentObj.query_sequence, posQuery, overhang)

                # Create DEL object and store it
                DEL = events.DEL(alignmentObj.reference_name, posRef,
                                 posRef + length, length,
                                 alignmentObj.query_name, flankingSeq, bkpPos,
                                 alignmentObj, sample)
                INDEL_events['DEL'].append(DEL)

        #### Update position over reference and read sequence
        ### a) Operations consuming query and reference
        # - Op M, tag 0, alignment match (can be a sequence match or mismatch)
        # - Op =, tag 7, sequence match
        # - Op X, tag 8, sequence mismatch
        if operation in (0, 7, 8):
            posQuery += length
            posRef += length

        ### b) Operations only consuming query
        # - Op I, tag 1, insertion to the reference
        # - Op S, tag 4, soft clipping (clipped sequences present in SEQ)
        elif operation in (1, 4):
            posQuery += length

        ### c) Operations only consuming reference
        # - Op D, tag 2, deletion from the reference
        # - Op N, tag 3, skipped region from the reference
        elif operation in (2, 3):
            posRef += length

        ### d) Operations not consuming query nor reference
        # - Op H, tag 5, hard clipping (clipped sequences NOT present in SEQ)
        # - Op P, tag 6, padding (silent deletion from padded reference)
        # Do not do anything

    return INDEL_events
コード例 #4
def collectCLIPPING(alignmentObj, minCLIPPINGlen, targetInterval, sample):
    """
    For a read alignment check if the read is clipped on each side and return the corresponding clipping objects.

    Input:
        1. alignmentObj: pysam read alignment object
        2. minCLIPPINGlen: minimum clipping length
        3. targetInterval: tuple containing begin and end position of the target genomic interval to extract events from. If None all clippings will be reported
        4. sample: type of sample (TUMOUR, NORMAL or None)

    Output:
        1. left_CLIPPING: left CLIPPING object (None if no clipping found)
        2. right_CLIPPING: right CLIPPING object (None if no clipping found)
    """
    # Initialize as None
    left_CLIPPING, right_CLIPPING = (None, None)

    # Without CIGAR operations there is nothing to inspect (also avoids indexing None)
    cigar = alignmentObj.cigartuples

    if not cigar:
        return left_CLIPPING, right_CLIPPING

    # Determine if the read is mate 1 or 2
    pair = '1' if alignmentObj.is_read1 else '2'

    # Select first and last operation from cigar to search for clipping
    firstOperation, firstOperationLen = cigar[0]
    lastOperation, lastOperationLen = cigar[-1]

    # Soft (Operation=4) or hard clipped (Operation=5)
    clippingOps = (4, 5)

    ## Clipping >= X bp at the LEFT
    if (firstOperation in clippingOps) and (firstOperationLen >= minCLIPPINGlen):

        leftPos = alignmentObj.reference_start

        ## Create CLIPPING object if:
        # a) No interval specified OR
        # b) Clipping within target interval
        if (targetInterval is None) or gRanges.overlap(
                leftPos, leftPos, targetInterval[0], targetInterval[1])[0]:

            # Create CLIPPING object
            left_CLIPPING = events.CLIPPING(
                alignmentObj.reference_name, leftPos, leftPos,
                firstOperationLen, 'left', pair, alignmentObj.query_name,
                alignmentObj.query_sequence,
                alignmentObj.query_alignment_start, alignmentObj, sample)

    ## Clipping >= X bp at the RIGHT
    if (lastOperation in clippingOps) and (lastOperationLen >= minCLIPPINGlen):

        rightPos = alignmentObj.reference_end

        ## Create CLIPPING object if:
        # a) No interval specified OR
        # b) Clipping within target interval
        if (targetInterval is None) or gRanges.overlap(
                rightPos, rightPos, targetInterval[0], targetInterval[1])[0]:

            # Create CLIPPING object
            right_CLIPPING = events.CLIPPING(
                alignmentObj.reference_name, rightPos, rightPos,
                lastOperationLen, 'right', pair, alignmentObj.query_name,
                alignmentObj.query_sequence,
                alignmentObj.query_alignment_end, alignmentObj, sample)

    return left_CLIPPING, right_CLIPPING
コード例 #5
ファイル: annotation.py プロジェクト: MEIGA-tk/MEIGA-PAV
def repeats_annotation_lighter(metaclustersList, repeatsDb, buffer):
    """
    For each input metacluster assess if it overlaps with an annotated repeat in the reference genome.

    Input:
        1. metaclustersList: list of metacluster field lists. For each one, the first item is the
           reference, the second item is the begin position and the last item is the VCF INFO
           dictionary, which must contain an 'END' entry.
        2. repeatsDb: dictionary containing annotated repeats organized per chromosome (keys) into genomic bins (values)
        3. buffer: number of base pairs to extend begin and end coordinates for each metacluster prior to assessing overlap

    Output:
        None. The INFO dictionary of each metacluster is updated in place with 'REP' (families),
        'REPSUB' (subfamilies) and 'DIST' (distances) entries. Each one is a comma separated string
        with one value per overlapping repeat, or None if no repeat overlaps.
    """
    ## Assess for each input metacluster if it overlaps with an annotated repeat
    # NOTE SR: Perform repeats annotation for both, MEIs and VIRUSES.
    for metaclusterFields in metaclustersList:

        ## metaclusterFields[-1] == VCF INFO dictionary
        info = metaclusterFields[-1]

        ref = metaclusterFields[0]
        beg = metaclusterFields[1]
        end = info['END']

        # Default covers case B): no repeat annotated in the same ref as the metacluster
        repeatAnnot = []

        # A) Annotated repeats in the same ref where the metacluster is located
        if ref in repeatsDb:

            ### Select repeats bin database for the corresponding reference
            repeatsBinDb = repeatsDb[ref]

            ### Retrieve all the annotated repeats overlapping with the buffer-extended interval
            overlaps = repeatsBinDb.collect_interval(beg - buffer, end + buffer, 'ALL')

            ## For each intersection
            for overlap in overlaps:

                # overlap[0] is the annotated repeat; overlap[1] is used as the overlap length
                # NOTE(review): presumably the length is relative to the buffer-extended interval - confirm against collect_interval
                annotated = overlap[0]
                overlapLen = overlap[1]

                repeat = {}
                repeat['family'] = annotated.optional['family']
                repeat['subfamily'] = annotated.optional['subfamily']

                ### Compute distance between the annotated repeat and the raw (non-extended) interval
                rawOverlap = gRanges.overlap(beg, end, annotated.beg, annotated.end)[0]

                # a) Overlapping raw intervals, distance == 0
                if rawOverlap:
                    distance = 0

                # b) Not overlapping raw intervals. Compute distance
                else:
                    distance = abs(overlapLen - buffer)

                repeat['distance'] = distance

                # Keep the annotation; without this the result list would always be empty
                repeatAnnot.append(repeat)

        ## Add fields to INFO VCF field (None when there is no overlapping repeat)
        for infoKey, repeatKey in (('REP', 'family'), ('REPSUB', 'subfamily'), ('DIST', 'distance')):
            if repeatAnnot:
                # str() is needed for the numeric distances and mirrors the values of the other fields
                info[infoKey] = ','.join([str(repeat[repeatKey]) for repeat in repeatAnnot])
            else:
                info[infoKey] = None