def repeats_annotation(events, repeatsDb, buffer):
    '''
    For each input event assess if it overlaps with an annotated repeat in the reference genome

    Input:
        1. events: list containing input events to be annotated. Events should be objects
                   containing ref, beg and end attributes.
        2. repeatsDb: dictionary containing annotated repeats organized per chromosome (keys)
                      into genomic bins (values). Each value must expose a
                      collect_interval(beg, end, 'ALL') method.
        3. buffer: number of base pairs to extend begin and end coordinates for each event
                   prior assessing overlap

    Output:
        Nothing is returned. A new 'repeatAnnot' attribute is set on each input event.
        'repeatAnnot' is a list of dictionaries, one per repeat retrieved, with the keys
        'family', 'subfamily' and 'distance'. The list is empty if the event reference is
        not in repeatsDb or no repeat is retrieved.
    '''
    ## Assess for each input event if it overlaps with an annotated repeat
    for event in events:

        # A fresh list per event, so annotations are never shared between events
        annotatedRepeats = []

        # Only references with annotated repeats can produce annotations
        if event.ref in repeatsDb:

            ### Select repeats bin database for the corresponding reference
            repeatsBinDb = repeatsDb[event.ref]

            ### Retrieve all the annotated repeats overlapping with the buffer-extended interval
            hits = repeatsBinDb.collect_interval(event.beg - buffer, event.end + buffer, 'ALL')

            ### Compute distance between each annotated repeat and the raw interval
            for hit in hits:

                # Each hit is indexed as (repeat object, overlap length with the extended interval)
                annotRepeat = hit[0]

                repeat = {}
                repeat['family'] = annotRepeat.optional['family']
                repeat['subfamily'] = annotRepeat.optional['subfamily']
                overlapLen = hit[1]

                overlapsRaw = gRanges.overlap(event.beg, event.end, annotRepeat.beg, annotRepeat.end)[0]

                # a) Overlapping raw intervals, distance == 0
                if overlapsRaw:
                    distance = 0

                # b) Not overlapping raw intervals. Distance derived from how much of the
                #    buffer is covered by the repeat
                # NOTE(review): for a repeat lying entirely inside the buffer flank, overlapLen
                # is presumably bounded by the repeat length, so this may overestimate the
                # real gap - confirm against collect_interval semantics
                else:
                    distance = abs(overlapLen - buffer)

                repeat['distance'] = distance
                annotatedRepeats.append(repeat)

        ## Add repeat annotation as attribute
        event.repeatAnnot = annotatedRepeats
def search4partnered_5prime(structures, fasta, reference, outDir):
    '''
    Search for 5' partnered transductions by realigning the unresolved 5' end of each
    insertion sequence against the reference

    Input:
        1. structures: dictionary with insertion identifiers as keys. Each value is a dictionary
                       that must contain 'STRAND' (None if undetermined) and 'CHAIN' (object
                       with an interval() method returning query begin and end). Identifiers
                       are parsed as 'ref:coord'.
        2. fasta: object with a seqDict attribute holding the sequence for each insertion identifier
        3. reference: reference handed over to the BWA-mem alignment step
        4. outDir: output directory where the fasta and alignment files are written

    Output:
        1. structures: same input dictionary. For every insertion with a hit passing the
                       filters, 'ITYPE' is set to 'partnered', '5PRIME' to True and
                       'TDCOORD_5PRIME' / 'TDLEN_5PRIME' to the hit coordinates and length.
    '''
    ## 1. Create Fasta with sequences to realign
    seq2realign = formats.FASTA()

    for insId in structures:

        structure = structures[insId]

        # Discard if strand not determined
        if structure['STRAND'] is None:
            continue

        ## Extract unresolved 5' sequence if any. The unresolved piece is located before the
        # chain for '+' insertions and after the chain otherwise
        qBeg, qEnd = structure['CHAIN'].interval()

        if structure['STRAND'] == '+':
            seq2realign.seqDict[insId] = fasta.seqDict[insId][:qBeg]

        else:
            seq2realign.seqDict[insId] = fasta.seqDict[insId][qEnd:]

    fastaPath = outDir + '/seq2realign.5prime.fasta'
    seq2realign.write(fastaPath)

    ## 2. Realign sequences on the reference with BWA-mem
    # NOTE(review): the literal 1 is presumably the number of processes - confirm against alignment_bwa
    SAM_path = alignment.alignment_bwa(fastaPath, reference, 'hits2genome.5prime', 1, outDir)
    PAF_path = alignment.sam2paf(SAM_path, 'hits2genome.5prime', outDir)
    PAF = formats.PAF()
    PAF.read(PAF_path)

    ## 3. Make 5' transduction calls
    # For each hit
    for hit in PAF.alignments:

        # NOTE(review): assumes target names in the alignment lack the 'chr' prefix while
        # insertion identifiers carry it - confirm for the reference in use
        hit.tName = 'chr' + hit.tName

        # Window of +-500 bp around the insertion coordinate encoded in the query name
        iRef, coord = hit.qName.split(':')
        iBeg = int(coord) - 500
        iEnd = int(coord) + 500

        ## Filter out hits with poor alignment, low mapping quality or mapping back
        # onto the insertion site itself
        lowQuality = (hit.alignmentPerc() < 75) or (hit.MAPQ < 30)

        if lowQuality or (iRef == hit.tName and gRanges.overlap(iBeg, iEnd, hit.tBeg, hit.tEnd)[0]):
            continue

        ## Make call
        call = structures[hit.qName]
        call['ITYPE'] = 'partnered'
        call['5PRIME'] = True
        call['TDCOORD_5PRIME'] = hit.tName + ':' + str(hit.tBeg) + '-' + str(hit.tEnd)
        call['TDLEN_5PRIME'] = hit.tEnd - hit.tBeg

    return structures
def collectINDELS(alignmentObj, targetEvents, minINDELlen, targetInterval, overhang, sample):
    '''
    Collect insertions and deletions longer than a threshold that are completely spanned within an input read alignment

    Input:
        1. alignmentObj: pysam read alignment object instance
        2. targetEvents: list with target events (INS: insertion; DEL: deletion)
        3. minINDELlen: minimum INS and DEL length
        4. targetInterval: tuple containing begin and end position of the target genomic interval to extract
                           events from. If None all the events spanned by the read alignment will be reported
        5. overhang: number of flanking base pairs around the SV event to be collected from the supporting
                     read. If None the complete read sequence will be collected
        6. sample: type of sample (TUMOUR, NORMAL or None).

    Output:
        1. INDEL_events: dictionary containing list of SV events grouped according to the type of INDEL
                         (only those types included in targetEvents):
                         * INS -> list of INS objects
                         * DEL -> list of DEL objects
                         Lists are empty if the alignment carries no CIGAR.
    '''
    ## Initialize dict
    searchINS = 'INS' in targetEvents
    searchDEL = 'DEL' in targetEvents

    INDEL_events = {}

    if searchINS:
        INDEL_events['INS'] = []

    if searchDEL:
        INDEL_events['DEL'] = []

    ## Alignments without CIGAR (cigartuples is None, e.g. unmapped reads) cannot span any INDEL
    cigarTuples = alignmentObj.cigartuples

    if not cigarTuples:
        return INDEL_events

    ## Initialize positions at query and ref
    posQuery = 0
    posRef = alignmentObj.reference_start

    # Iterate over the CIGAR
    for cigarTuple in cigarTuples:

        operation = int(cigarTuple[0])
        length = int(cigarTuple[1])

        ## a) INSERTION to the reference >= Xbp
        if searchINS and (operation == 1) and (length >= minINDELlen):

            ## Create INS if:
            # a) No interval specified OR
            # b) Insertion within target interval
            if (targetInterval is None) or gRanges.overlap(posRef, posRef, targetInterval[0], targetInterval[1])[0]:

                # Collect piece of sequence flanking the INS event
                if overhang is None:
                    flankingSeq, bkpPos = alignmentObj.query_sequence, posQuery

                else:
                    flankingSeq, bkpPos = events.pick_flanking_seq_INS(alignmentObj.query_sequence, posQuery, length, overhang)

                # Create INS object
                INS = events.INS(alignmentObj.reference_name, posRef, posRef, length, alignmentObj.query_name, flankingSeq, bkpPos, alignmentObj, sample)
                INDEL_events['INS'].append(INS)

        ## b) DELETION to the reference >= Xbp
        if searchDEL and (operation == 2) and (length >= minINDELlen):

            ## Create DEL if:
            # a) No interval specified OR
            # b) Deletion within target interval
            if (targetInterval is None) or gRanges.overlap(posRef, posRef + length, targetInterval[0], targetInterval[1])[0]:

                # Collect piece of sequence flanking the DEL event
                if overhang is None:
                    flankingSeq, bkpPos = alignmentObj.query_sequence, posQuery

                else:
                    flankingSeq, bkpPos = events.pick_flanking_seq_DEL(alignmentObj.query_sequence, posQuery, overhang)

                # Create DEL object
                DEL = events.DEL(alignmentObj.reference_name, posRef, posRef + length, length, alignmentObj.query_name, flankingSeq, bkpPos, alignmentObj, sample)
                INDEL_events['DEL'].append(DEL)

        #### Update position over reference and read sequence
        ### a) Operations consuming query and reference
        # - Op M, tag 0, alignment match (can be a sequence match or mismatch)
        # - Op =, tag 7, sequence match
        # - Op X, tag 8, sequence mismatch
        if operation in (0, 7, 8):
            posQuery += length
            posRef += length

        ### b) Operations only consuming query
        # - Op I, tag 1, insertion to the reference
        # - Op S, tag 4, soft clipping (clipped sequences present in SEQ)
        elif operation in (1, 4):
            posQuery += length

        ### c) Operations only consuming reference
        # - Op D, tag 2, deletion from the reference
        # - Op N, tag 3, skipped region from the reference
        elif operation in (2, 3):
            posRef += length

        ### d) Operations not consuming query nor reference
        # - Op H, tag 5, hard clipping (clipped sequences NOT present in SEQ)
        # - Op P, tag 6, padding (silent deletion from padded reference)
        # Do not do anything

    return INDEL_events
def collectCLIPPING(alignmentObj, minCLIPPINGlen, targetInterval, sample):
    '''
    For a read alignment check if the read is clipped on each side and return the corresponding clipping objects

    Input:
        1. alignmentObj: pysam read alignment object
        2. minCLIPPINGlen: minimum clipping length
        3. targetInterval: tuple containing begin and end position of the target genomic interval to extract
                           events from. If None all clippings will be reported
        4. sample: type of sample (TUMOUR, NORMAL or None).

    Output:
        1. left_CLIPPING: left CLIPPING object (None if no clipping found)
        2. right_CLIPPING: right CLIPPING object (None if no clipping found)
    '''
    # Initialize as None
    left_CLIPPING, right_CLIPPING = (None, None)

    # Alignments without CIGAR (cigartuples is None or empty, e.g. unmapped reads) have no clipping to report
    cigarTuples = alignmentObj.cigartuples

    if not cigarTuples:
        return left_CLIPPING, right_CLIPPING

    # Determine if the read is mate 1 or 2 (anything not flagged as read1 is labelled '2')
    pair = '1' if alignmentObj.is_read1 else '2'

    # Select first and last operation from cigar to search for clipping
    firstOperation, firstOperationLen = cigarTuples[0]
    lastOperation, lastOperationLen = cigarTuples[-1]

    # Soft (Operation=4) and hard (Operation=5) clipping are both considered
    clippingOperations = (4, 5)

    ## Clipping >= X bp at the LEFT
    if (firstOperation in clippingOperations) and (firstOperationLen >= minCLIPPINGlen):

        clippingPos = alignmentObj.reference_start

        ## Create CLIPPING object if:
        # a) No interval specified OR
        # b) Clipping within target interval
        if (targetInterval is None) or gRanges.overlap(clippingPos, clippingPos, targetInterval[0], targetInterval[1])[0]:

            # Create CLIPPING object
            left_CLIPPING = events.CLIPPING(alignmentObj.reference_name, clippingPos, clippingPos, firstOperationLen, 'left', pair, alignmentObj.query_name, alignmentObj.query_sequence, alignmentObj.query_alignment_start, alignmentObj, sample)

    ## Clipping >= X bp at the RIGHT
    if (lastOperation in clippingOperations) and (lastOperationLen >= minCLIPPINGlen):

        clippingPos = alignmentObj.reference_end

        ## Create CLIPPING object if:
        # a) No interval specified OR
        # b) Clipping within target interval
        if (targetInterval is None) or gRanges.overlap(clippingPos, clippingPos, targetInterval[0], targetInterval[1])[0]:

            # Create CLIPPING object
            right_CLIPPING = events.CLIPPING(alignmentObj.reference_name, clippingPos, clippingPos, lastOperationLen, 'right', pair, alignmentObj.query_name, alignmentObj.query_sequence, alignmentObj.query_alignment_end, alignmentObj, sample)

    return left_CLIPPING, right_CLIPPING
def repeats_annotation_lighter(metaclustersList, repeatsDb, buffer):
    '''
    For each input metacluster assess if it overlaps with an annotated repeat in the reference genome

    Input:
        1. metaclustersList: list of metaclusters. Each metacluster is a list of fields where the
                             first field is the reference, the second the begin position and the
                             last one the VCF INFO dictionary, which must contain the 'END' key.
        2. repeatsDb: dictionary containing annotated repeats organized per chromosome (keys)
                      into genomic bins (values). Each value must expose a
                      collect_interval(beg, end, 'ALL') method.
        3. buffer: number of base pairs to extend begin and end coordinates for each metacluster
                   prior assessing overlap

    Output:
        Nothing is returned. The INFO dictionary of each metacluster is updated in place with:
            * REP: comma separated repeat families
            * REPSUB: comma separated repeat subfamilies
            * DIST: comma separated distances between each repeat and the raw interval
        The three fields are set to None when no repeat is retrieved.
    '''
    ## Assess for each input metacluster if it overlaps with an annotated repeat
    # NOTE SR: annotation is performed for every metacluster (both MEIs and VIRUSES),
    # there is no filtering by internal element type
    for metaclusterFields in metaclustersList:

        ## metaclusterFields[-1] == VCF INFO dictionary
        info = metaclusterFields[-1]

        ref = metaclusterFields[0]
        beg = metaclusterFields[1]
        end = info['END']

        repeatAnnot = []

        # Only references with annotated repeats can produce annotations
        if ref in repeatsDb:

            ### Select repeats bin database for the corresponding reference
            repeatsBinDb = repeatsDb[ref]

            ### Retrieve all the annotated repeats overlapping with the buffer-extended interval
            hits = repeatsBinDb.collect_interval(beg - buffer, end + buffer, 'ALL')

            ### Compute distance between each annotated repeat and the raw interval
            for hit in hits:

                # Each hit is indexed as (repeat object, overlap length with the extended interval)
                annotRepeat = hit[0]

                repeat = {}
                repeat['family'] = annotRepeat.optional['family']
                repeat['subfamily'] = annotRepeat.optional['subfamily']
                overlapLen = hit[1]

                overlapsRaw = gRanges.overlap(beg, end, annotRepeat.beg, annotRepeat.end)[0]

                # a) Overlapping raw intervals, distance == 0
                if overlapsRaw:
                    distance = 0

                # b) Not overlapping raw intervals. Distance derived from how much of the
                #    buffer is covered by the repeat
                # NOTE(review): for a repeat lying entirely inside the buffer flank, overlapLen
                # is presumably bounded by the repeat length, so this may overestimate the
                # real gap - confirm against collect_interval semantics
                else:
                    distance = abs(overlapLen - buffer)

                repeat['distance'] = distance
                repeatAnnot.append(repeat)

        ## Add fields to INFO VCF field
        if repeatAnnot:
            info['REP'] = ','.join([repeat['family'] for repeat in repeatAnnot])
            info['REPSUB'] = ','.join([repeat['subfamily'] for repeat in repeatAnnot])
            info['DIST'] = ','.join([str(repeat['distance']) for repeat in repeatAnnot])

        else:
            info['REP'] = None
            info['REPSUB'] = None
            info['DIST'] = None